from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint
from pyspark import SparkContext
from pyspark import SparkConf
import os
import sys
import logging

# Path for spark source folder
os.environ['SPARK_HOME'] = "D:\javaPackages\spark-1.6.0-bin-hadoop2.6"
# Append pyspark to Python Path
sys.path.append("D:\javaPackages\spark-1.6.0-bin-hadoop2.6\python")
sys.path.append("D:\javaPackages\spark-1.6.0-bin-hadoop2.6\python\lib\py4j-0.9-src.zip")
conf = SparkConf()
conf.set("YARN_CONF_DIR", "D:\javaPackages\hadoop_conf_dir\yarn-conf")
conf.set("spark.driver.memory", "2g")
conf.setMaster("yarn-client")
conf.setAppName("TestSVM")
logger = logging.getLogger('pyspark')
sc = SparkContext(conf=conf)
mylog = []

# Load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.split(" ")]
    return LabeledPoint(values[0], values[1:])

data = sc.textFile("/home/xiatao/machine_learing/SVM/sample_svm_data.txt")
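The SVM listing breaks off right after loading the data. A minimal sketch of the remaining steps, following the standard MLlib workflow (the iteration count and the parseData/labelsAndPreds/trainErr names are illustrative, not from the original):

# Parse the text file into LabeledPoint records and train a linear SVM with SGD
parseData = data.map(parsePoint)
model = SVMWithSGD.train(parseData, iterations=100)

# Estimate the training error as the fraction of misclassified points
labelsAndPreds = parseData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(parseData.count())
mylog.append("Training Error = " + str(trainErr))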
from pyspark.mllib.classification import LogisticRegressionWithSGD, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint
from pyspark import SparkContext
from pyspark import SparkConf
import os
import sys
import logging
# Path for spark source folder
os.environ['SPARK_HOME'] = "D:\javaPackages\spark-1.6.0-bin-hadoop2.6"
# Append pyspark to Python Path
sys.path.append("D:\javaPackages\spark-1.6.0-bin-hadoop2.6\python")
sys.path.append("D:\javaPackages\spark-1.6.0-bin-hadoop2.6\python\lib\py4j-0.9-src.zip")
# Load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.split(" ")]
    return LabeledPoint(values[0], values[1:])
data = sc.textFile("/home/xiatao/machine_learing/logistic_regression/sample_svm_data.txt")
parseData = data.map(parsePoint)

# Build the model
model = LogisticRegressionWithSGD.train(parseData)
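LogisticRegressionModel is imported above but never used in the listing; presumably it is meant for persisting and reloading the fitted model. A hedged sketch of that step (the save path is illustrative):

# Persist the fitted model and load it back for later prediction
model.save(sc, "/home/xiatao/machine_learing/logistic_regression/model")  # illustrative path
sameModel = LogisticRegressionModel.load(sc, "/home/xiatao/machine_learing/logistic_regression/model")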
Linear least squares, Lasso, and ridge regression

Linear least squares is the most common formulation for regression problems; its loss function is $$L(w;x,y):=\frac{1}{2}(w^Tx-y)^{2}$$ Applying different regularization terms yields the related regression methods: ordinary least squares (linear least squares) uses no regularization, ridge regression uses L2 regularization, and Lasso uses L1 regularization. For all of these models, the average loss, i.e. the training error $$\frac{1}{n}\sum_{i=1}^n(w^Tx_i-y_i)^2$$ is the mean squared error.

The code below shows how to load data and convert it into an RDD of LabeledPoint, then use LinearRegressionWithSGD to build a simple linear model that predicts the label values, and finally compute the mean squared error to evaluate the fit.
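The LinearRegressionWithSGD listing itself is not included here, so the following is a minimal sketch of that workflow; the data path, iteration count, and variable names are illustrative assumptions rather than the original example.

from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD

def parsePoint(line):
    values = [float(x) for x in line.split(" ")]
    return LabeledPoint(values[0], values[1:])

# Load and parse the data, then fit a linear model with SGD (path is illustrative)
data = sc.textFile("/home/xiatao/machine_learing/linear_regression/lpsa.data")
parsedData = data.map(parsePoint)
model = LinearRegressionWithSGD.train(parsedData, iterations=100)

# Evaluate the fit by computing the mean squared error on the training data
valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
print("Mean Squared Error = " + str(MSE))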
# -*- coding:utf-8 -*-
# Streaming linear regression model test
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import StreamingLinearRegressionWithSGD
from pyspark.streaming import StreamingContext
from pyspark.mllib.regression import LabeledPoint
from pyspark import SparkContext
from pyspark import SparkConf
import os
import sys
import logging

# Path for spark source folder
os.environ['SPARK_HOME'] = "D:\javaPackages\spark-1.6.0-bin-hadoop2.6"
# Append pyspark to Python Path
sys.path.append("D:\javaPackages\spark-1.6.0-bin-hadoop2.6\python")
sys.path.append("D:\javaPackages\spark-1.6.0-bin-hadoop2.6\python\lib\py4j-0.9-src.zip")
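The streaming listing is cut off after the environment setup. A minimal sketch of how StreamingLinearRegressionWithSGD is typically wired together, assuming a fresh SparkContext and whitespace-delimited LabeledPoint text files arriving in training and test directories; the app name, directories, batch interval, and feature dimension below are all assumptions.

sc = SparkContext(conf=SparkConf().setAppName("TestStreamingLinearRegression"))
ssc = StreamingContext(sc, 1)  # 1-second batch interval (illustrative)

def parsePoint(line):
    values = [float(x) for x in line.split(" ")]
    return LabeledPoint(values[0], values[1:])

# Each new file dropped into these directories becomes one batch (paths are illustrative)
trainingStream = ssc.textFileStream("/home/xiatao/machine_learing/streaming/train").map(parsePoint)
testStream = ssc.textFileStream("/home/xiatao/machine_learing/streaming/test").map(parsePoint)

# Initialize the model with zero weights (a 2-dimensional feature vector is assumed here)
model = StreamingLinearRegressionWithSGD()
model.setInitialWeights(Vectors.dense([0.0, 0.0]))

# Continuously update the model on the training stream and predict on the test stream
model.trainOn(trainingStream)
model.predictOnValues(testStream.map(lambda lp: (lp.label, lp.features))).pprint()

ssc.start()
ssc.awaitTermination()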