Fra*_*ank 7 scikit-learn apache-spark apache-spark-mllib
我在scikit-learn和spark.ml中设置了一个非常简单的逻辑回归问题,但两边的结果不同:它们学到的模型不一样,而我想不出原因(数据相同,模型类型相同,正则化也相同......).
毫无疑问,我在其中一方漏掉了某项设置.是哪一项设置?我应该如何配置scikit或spark.ml,才能让它得到与另一方相同的模型?
我在下面给出了sklearn代码和spark.ml代码.两段代码都可以直接复制粘贴并运行.
# Fits an L2-regularised logistic regression and a ridge regression with
# scikit-learn on a 10-row toy data set and prints the learned coefficients.
import numpy as np
from sklearn.linear_model import LogisticRegression, Ridge

# Feature matrix: one row per training example, two features per row.
X = np.array([
    [-0.7306653538519616, 0.0],
    [0.6750417712898752, -0.4232874171873786],
    [0.1863463229359709, -0.8163423997075965],
    [-0.6719842051493347, 0.0],
    [0.9699938346531928, 0.0],
    [0.22759406190283604, 0.0],
    [0.9688721028330911, 0.0],
    [0.5993795346650845, 0.0],
    [0.9219423508390701, -0.8972778242305388],
    [0.7006904841584055, -0.5607635619919824]
])
# 0/1 labels, aligned row-for-row with X.
y = np.array([
    0.0,
    1.0,
    1.0,
    0.0,
    1.0,
    1.0,
    1.0,
    0.0,
    0.0,
    0.0
])
m, n = X.shape
# Add intercept term to simulate inputs to GameEstimator
# (a constant column of ones is appended, so fit_intercept stays False below).
X_with_intercept = np.hstack((X, np.ones(m)[:, np.newaxis]))
# Regularisation strength shared by both models.
l = 0.3
e = LogisticRegression(
    fit_intercept=False,
    penalty='l2',
    C=1/l,  # passed as the reciprocal of l
    max_iter=100,
    tol=1e-11)
e.fit(X_with_intercept, y)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(e.coef_)
# Output reported in the original post:
# => [[ 0.98662189 0.45571052 -0.23467255]]
# Linear regression is called Ridge in sklearn
e = Ridge(
    fit_intercept=False,
    alpha=l,
    max_iter=100,
    tol=1e-11)
e.fit(X_with_intercept, y)
print(e.coef_)
# Output reported in the original post:
# =>[ 0.32155545 0.17904355 0.41222418]
Run Code Online (Sandbox Code Playgroud)
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.SQLContext
/** Trains a Spark ML logistic regression and a linear regression on a
  * hard-coded 10-row data set and prints each model's coefficients and
  * intercept.
  */
object TestSparkRegression {
  /** Entry point; `args` is not used. */
  def main(args: Array[String]): Unit = {
    import org.apache.log4j.{Level, Logger}
    // Turn off log4j output for the "org" and "akka" logger hierarchies.
    Logger.getLogger("org").setLevel(Level.OFF)
    Logger.getLogger("akka").setLevel(Level.OFF)
    val conf = new SparkConf().setAppName("test").setMaster("local")
    val sc = new SparkContext(conf)
    // try/finally guarantees the context is stopped even if building the
    // DataFrame or fitting a model throws.
    try {
      // One LabeledPoint per training row: (label, two-element feature vector).
      val sparkTrainingData = new SQLContext(sc)
        .createDataFrame(Seq(
          LabeledPoint(0.0, Vectors.dense(-0.7306653538519616, 0.0)),
          LabeledPoint(1.0, Vectors.dense(0.6750417712898752, -0.4232874171873786)),
          LabeledPoint(1.0, Vectors.dense(0.1863463229359709, -0.8163423997075965)),
          LabeledPoint(0.0, Vectors.dense(-0.6719842051493347, 0.0)),
          LabeledPoint(1.0, Vectors.dense(0.9699938346531928, 0.0)),
          LabeledPoint(1.0, Vectors.dense(0.22759406190283604, 0.0)),
          LabeledPoint(1.0, Vectors.dense(0.9688721028330911, 0.0)),
          LabeledPoint(0.0, Vectors.dense(0.5993795346650845, 0.0)),
          LabeledPoint(0.0, Vectors.dense(0.9219423508390701, -0.8972778242305388)),
          LabeledPoint(0.0, Vectors.dense(0.7006904841584055, -0.5607635619919824))))
        .toDF("label", "features")
      val logisticModel = new LogisticRegression()
        .setRegParam(0.3)
        .setLabelCol("label")
        .setFeaturesCol("features")
        .fit(sparkTrainingData)
      println(s"Spark logistic model coefficients: ${logisticModel.coefficients} Intercept: ${logisticModel.intercept}")
      // Output reported in the original post:
      // Spark logistic model coefficients: [0.5451588538376263,0.26740606573584713] Intercept: -0.13897955358689987
      val linearModel = new LinearRegression()
        .setRegParam(0.3)
        .setLabelCol("label")
        .setFeaturesCol("features")
        .setSolver("l-bfgs")
        .fit(sparkTrainingData)
      println(s"Spark linear model coefficients: ${linearModel.coefficients} Intercept: ${linearModel.intercept}")
      // Output reported in the original post:
      // Spark linear model coefficients: [0.19852664861346023,0.11501200541407802] Intercept: 0.45464906876832323
    } finally {
      sc.stop()
    }
  }
}
Run Code Online (Sandbox Code Playgroud)
Dha*_*esh 13
您需要执行以下操作:
首先对Python和Spark两边的数据做标准化,因为Spark内部默认会对特征进行标准化.注意两个包的StandardScaler所用的标准差公式不同(sklearn除以n,Spark除以n-1),需要对这一差异加以修正.
对于逻辑回归,Spark使用对数损失的平均值(分母是权重之和;当所有权重为1时,即训练样本的数量),而sklearn使用对数损失的总和.对于线性回归,Spark与sklearn不同,会给平方误差之和乘以1/(2n)的因子.因此需要相应地缩小Spark的正则化参数:在本例中,逻辑回归缩小为原来的1/10,线性回归缩小为原来的1/20.
Scikit-learn代码
import numpy as np
from sklearn.linear_model import LogisticRegression, Ridge
X = np.array([
[-0.7306653538519616, 0.0],
[0.6750417712898752, -0.4232874171873786],
[0.1863463229359709, -0.8163423997075965],
[-0.6719842051493347, 0.0],
[0.9699938346531928, 0.0],
[0.22759406190283604, 0.0],
[0.9688721028330911, 0.0],
[0.5993795346650845, 0.0],
[0.9219423508390701, -0.8972778242305388],
[0.7006904841584055, -0.5607635619919824]
])
y = np.array([
0.0,
1.0,
1.0,
0.0,
1.0,
1.0,
1.0,
0.0,
0.0,
0.0
])
m, n = X.shape
from sklearn.preprocessing import StandardScaler
## sqrt(n-1)/sqrt(n) factor for getting the same standardization as spark
Xsc=StandardScaler().fit_transform(X)*3.0/np.sqrt(10.0)
l = 0.3
e = LogisticRegression(
fit_intercept=True,
penalty='l2',
C=1/l,
max_iter=100,
tol=1e-11,
solver='lbfgs',verbose=1)
e.fit(Xsc, y)
print e.coef_, e.intercept_
# => [[ 0.82122437 0.32615256]] [-0.01181534]
#e.get_params(deep=True)
# Linear regression is called Ridge in sklearn
e = Ridge(
fit_intercept=True,
alpha=l,
max_iter=100,
tol=1e-11)
e.fit(Xsc, y)
print e.coef_,e.intercept_
# =>[ 0.21310109 0.09203616] 0.5
Run Code Online (Sandbox Code Playgroud)
Spark代码(重构为使用ML API)
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SQLContext
import org.apache.spark.ml.feature.StandardScaler
// spark-shell script: standardises a hard-coded 10-row data set, fits a
// logistic and a linear regression on it, and prints both models.
// `sc` is not defined here; it must already be in scope.
// Trailing dots keep each chained call attached to the previous line.

// Training rows as (label, two-element feature vector) pairs.
val labelledRows = Seq(
  (0.0, Vectors.dense(-0.7306653538519616, 0.0)),
  (1.0, Vectors.dense(0.6750417712898752, -0.4232874171873786)),
  (1.0, Vectors.dense(0.1863463229359709, -0.8163423997075965)),
  (0.0, Vectors.dense(-0.6719842051493347, 0.0)),
  (1.0, Vectors.dense(0.9699938346531928, 0.0)),
  (1.0, Vectors.dense(0.22759406190283604, 0.0)),
  (1.0, Vectors.dense(0.9688721028330911, 0.0)),
  (0.0, Vectors.dense(0.5993795346650845, 0.0)),
  (0.0, Vectors.dense(0.9219423508390701, -0.8972778242305388)),
  (0.0, Vectors.dense(0.7006904841584055, -0.5607635619919824)))
val rawTrainingData = new SQLContext(sc).
  createDataFrame(labelledRows).
  toDF("label", "features_orig")

// Scale "features_orig" into "features", with mean-centering switched on.
val featureScaler = new StandardScaler().
  setInputCol("features_orig").
  setOutputCol("features").
  setWithMean(true)
val sparkTrainingData = featureScaler.
  fit(rawTrainingData).
  transform(rawTrainingData)

// Logistic regression: regParam is 0.3 / 10 = 0.03.
val logisticEstimator = new LogisticRegression().
  setLabelCol("label").
  setFeaturesCol("features").
  setRegParam(0.03).
  setMaxIter(100).
  setTol(1e-12)
val logisticModel = logisticEstimator.fit(sparkTrainingData)
println(s"Spark logistic model coefficients: ${logisticModel.coefficients} Intercept: ${logisticModel.intercept}")
// Output reported in the original post:
// Spark logistic model coefficients: [0.8212244419577079,0.32615245441495727] Intercept: -0.011815325216668142

// Linear regression: regParam is 0.3 / 20 = 0.015.
val linearEstimator = new LinearRegression().
  setLabelCol("label").
  setFeaturesCol("features").
  setRegParam(0.015).
  setMaxIter(100).
  setTol(1e-12)
val linearModel = linearEstimator.fit(sparkTrainingData)
println(s"Spark linear model coefficients: ${linearModel.coefficients} Intercept: ${linearModel.intercept}")
// Output reported in the original post:
// Spark linear model coefficients: [0.21394341729353747,0.09257340293212045] Intercept: 0.5
Run Code Online (Sandbox Code Playgroud)
| 归档时间: |
|
| 查看次数: |
941 次 |
| 最近记录: |