1- 问题提出
2- 线性回归
3- 理论推导
4- Python/Spark实现
# -*- coding: utf-8 -*-
"""Batch gradient descent for linear regression on a Spark RDD.

Each input line holds tab-separated floats.  The last value of a row is the
target; the values before it are the features (presumably the first column
is a constant 1 acting as the intercept term -- confirm against the data).

Intended to be run as a script; the training loop lives in ``main()``.
"""

# Regression coefficients, one per feature column.  Updated in place by the
# training loop and read by the helper functions below.
theta = [0, 0]
# Step size applied to the summed gradient.
alpha = 0.001
# Maximum number of gradient steps before giving up.
maxiter = 400
# Training stops once the summed absolute gradient is at most this value.
TOLERANCE = 0.01
# Tab-separated training data.
DATA_PATH = '/home/freyr/linearRegression.txt'


def func_theta_x(x):
    """Return the prediction for one row using the current ``theta``.

    ``x`` is a parsed row: feature values followed by the target.  The
    target (last element) is sliced off so it can never be multiplied by a
    coefficient, even if the row has fewer features than ``theta``.
    """
    return sum(coef * value for coef, value in zip(theta, x[:-1]))


def cost(x):
    """Return the residual of one row: prediction minus target."""
    return func_theta_x(x) - x[-1]


def partial_theta(x):
    """Return one row's gradient contribution, one entry per feature.

    Each entry is ``residual * feature``, i.e. the derivative of half the
    squared residual with respect to the matching coefficient.
    """
    residual = cost(x)
    return [residual * value for value in x[:-1]]


def _parse_line(line):
    """Turn one tab-separated text line into a list of floats."""
    # A real list (not a lazy ``map`` object) is required because the rows
    # are indexed and sliced later; this also keeps Python 2 and 3 in sync.
    return [float(field) for field in line.strip().split('\t')]


def _add_vectors(left, right):
    """Element-wise sum of two equally long gradient vectors."""
    return [a + b for a, b in zip(left, right)]


def main():
    """Fit ``theta`` by gradient descent and print the outcome."""
    # Imported here so the pure-Python helpers above stay importable (and
    # testable) on a machine without Spark installed.
    from pyspark import SparkContext

    sc = SparkContext('local')
    try:
        # The parsed rows are reused by every iteration, so cache them
        # instead of re-reading and re-parsing the file each time.
        rdd = sc.textFile(DATA_PATH).map(_parse_line).cache()

        for iteration in range(1, maxiter + 1):
            gradient = rdd.map(partial_theta).reduce(_add_vectors)

            # Step every coefficient that has a matching gradient entry.
            for i, grad in enumerate(gradient[:len(theta)]):
                theta[i] -= alpha * grad

            if sum(abs(grad) for grad in gradient) <= TOLERANCE:
                print('I get it!!!')
                print('Iter = %s' % iteration)
                print('Theta = %s' % theta)
                break
        else:
            # Loop ran out of iterations without meeting the tolerance.
            print('Failed...')
    finally:
        # Release the Spark context even if a job raised.
        sc.stop()


if __name__ == '__main__':
    main()
PS: 1.