Jua*_*vid 10 python apache-spark pyspark
我在 PySpark 2.2 中按如下方式计算了一个相关矩阵:
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
# Load the source table that holds the income variables.
datos = sql("""select * from proceso_riesgos.jdgc_bd_train_mn_ingresos""")

# Columns that take part in the correlation matrix.
Variables_corr = [
    'ingreso_final_mix', 'ingreso_final_promedio',
    'ingreso_final_mediana', 'ingreso_final_trimedia', 'ingresos_serv_q1',
    'ingresos_serv_q2', 'ingresos_serv_q3', 'prom_ingresos_serv', 'y_correc',
]

assembler = VectorAssembler(
    inputCols=Variables_corr,
    outputCol="features")

# Keep only the correlated columns and drop rows whose y_correc is null.
datos1 = datos.select(Variables_corr).filter("y_correc is not null")

# Assemble from the filtered frame so the null filter above actually takes effect.
output = assembler.transform(datos1)
r1 = Correlation.corr(output, "features")
Run Code Online (Sandbox Code Playgroud)
结果是一个数据框,其中只有一个变量(列),名为 "pearson(features)",类型为 matrix:
[Row(pearson(features)=DenseMatrix(20, 20, [1.0, 0.9428, 0.8908, 0.913,
0.567, 0.5832, 0.6148, 0.6488, ..., -0.589, -0.6145, -0.5906, -0.5534,
-0.5346, -0.0797, -0.617, 1.0], False))]
Run Code Online (Sandbox Code Playgroud)
我需要取出这些值并导出到 Excel,或者至少能够对结果做进一步处理。如果能得到一个列表就更好了。
感谢帮助!!
pis*_*all 21
请试试这段代码。把我的 read() 调用替换成你自己的数据即可。
请注意,在应用 lambda 映射函数之前,我已将 SQL DataFrame 转换为 RDD。
from pyspark.mllib.stat import Statistics
import pandas as pd
# Alternative input, kept for reference: load a CSV instead of reusing `datos`.
# df = sqlCtx.read.format('com.databricks.spark.csv').option('header', 'true').option('inferschema', 'true').load('corr_test.csv')
df = datos
col_names = df.columns

# Statistics.corr takes an RDD, so convert the DataFrame and map each Row
# to a slice holding all of its values.
features = df.rdd.map(lambda row: row[0:])
corr_mat = Statistics.corr(features, method="pearson")

# Label both axes with the source column names in a single construction step.
corr_df = pd.DataFrame(corr_mat, index=col_names, columns=col_names)
Run Code Online (Sandbox Code Playgroud)
示例输出:
# Print the whole correlation DataFrame as plain text.
print(corr_df.to_string())
p1m p2m p3m p6m p9m p1m_ya p2m_ya p3m_ya p6m_ya p9m_ya p3m_q_ty 1ya_sales 2ya_sales seasonal_sales
p1m 1.000000 0.755679 0.755452 0.506780 0.557281 0.299348 0.182835 -0.001173 0.332484 0.308060 0.354096 0.029385 0.871112 0.292136
p2m 0.755679 1.000000 0.987618 0.896422 0.863010 0.103545 0.431919 0.318233 0.660824 0.588278 0.533427 0.082632 0.766487 0.521879
p3m 0.755452 0.987618 1.000000 0.866792 0.822750 0.056984 0.386290 0.274494 0.606200 0.523938 0.464158 0.020544 0.749018 0.451629
p6m 0.506780 0.896422 0.866792 1.000000 0.979228 0.210658 0.690670 0.623754 0.851390 0.790276 0.738892 0.362444 0.502335 0.754078
p9m 0.557281 0.863010 0.822750 0.979228 1.000000 0.388865 0.779092 0.695114 0.912167 0.872120 0.843273 0.499578 0.548269 0.849284
p1m_ya 0.299348 0.103545 0.056984 0.210658 0.388865 1.000000 0.614836 0.547236 0.564361 0.682653 0.771472 0.874493 0.313053 0.735593
p2m_ya 0.182835 0.431919 0.386290 0.690670 0.779092 0.614836 1.000000 0.976696 0.943147 0.933545 0.887659 0.775088 0.315853 0.899157
p3m_ya -0.001173 0.318233 0.274494 0.623754 0.695114 0.547236 0.976696 1.000000 0.894490 0.891665 0.824135 0.778251 0.162183 0.848247
p6m_ya 0.332484 0.660824 0.606200 0.851390 0.912167 0.564361 0.943147 0.894490 1.000000 0.982057 0.928130 0.692184 0.466502 0.940549
p9m_ya 0.308060 0.588278 0.523938 0.790276 0.872120 0.682653 0.933545 0.891665 0.982057 1.000000 0.970826 0.800886 0.431627 0.977719
p3m_q_ty 0.354096 0.533427 0.464158 0.738892 0.843273 0.771472 0.887659 0.824135 0.928130 0.970826 1.000000 0.864894 0.402324 0.995414
1ya_sales 0.029385 0.082632 0.020544 0.362444 0.499578 0.874493 0.775088 0.778251 0.692184 0.800886 0.864894 1.000000 0.065062 0.858691
2ya_sales 0.871112 0.766487 0.749018 0.502335 0.548269 0.313053 0.315853 0.162183 0.466502 0.431627 0.402324 0.065062 1.000000 0.343994
seasonal_sales 0.292136 0.521879 0.451629 0.754078 0.849284 0.735593 0.899157 0.848247 0.940549 0.977719 0.995414 0.858691 0.343994 1.000000
Run Code Online (Sandbox Code Playgroud)
你已经非常接近了!没有必要使用旧的基于 RDD 的 mllib API。
下面是我生成 pandas 数据框的方法,之后可以将其导出为 Excel、CSV 或其他格式。
def correlation_matrix(df, corr_columns, method='pearson'):
    """Compute the correlation matrix of the given columns as a pandas DataFrame.

    Args:
        df: Spark DataFrame that contains the columns to correlate.
        corr_columns: List of column names to assemble into a vector and
            correlate with each other.
        method: Correlation method passed through to ``Correlation.corr``
            (defaults to ``'pearson'``).

    Returns:
        A pandas DataFrame whose index and columns are both ``corr_columns``.
    """
    vector_col = "corr_features"
    assembler = VectorAssembler(inputCols=corr_columns, outputCol=vector_col)
    df_vector = assembler.transform(df).select(vector_col)
    matrix = Correlation.corr(df_vector, vector_col, method)
    # The result has a single column named "<method>(<vector_col>)"; read it
    # by position so that methods other than Pearson work as well.
    result = matrix.collect()[0][0].toArray()
    return pd.DataFrame(result, columns=corr_columns, index=corr_columns)
Run Code Online (Sandbox Code Playgroud)
归档时间: |
|
查看次数: |
7432 次 |
最近记录: |