jax*_*jax 23 python numpy scipy pandas
I have a huge dataset, and before machine-learning modelling it is always suggested to first remove highly correlated descriptors (columns). How can I calculate the column-wise correlation and drop the columns above a threshold, say drop all columns or descriptors with a correlation > 0.8? It should also retain the headers in the reduced data.
Example dataset
GA PN PC MBP GR AP
0.033 6.652 6.681 0.194 0.874 3.177
0.034 9.039 6.224 0.194 1.137 3.4
0.035 10.936 10.304 1.015 0.911 4.9
0.022 10.11 9.603 1.374 0.848 4.566
0.035 2.963 17.156 0.599 0.823 9.406
0.033 10.872 10.244 1.015 0.574 4.871
0.035 21.694 22.389 1.015 0.859 9.259
0.035 10.936 10.304 1.015 0.911 4.5
Please help....
NIS*_*AGA 24
Here is the approach I have used -
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of deleted columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if (corr_matrix.iloc[i, j] >= threshold) and (corr_matrix.columns[j] not in col_corr):
                colname = corr_matrix.columns[i]  # getting the name of the column
                col_corr.add(colname)
                if colname in dataset.columns:
                    del dataset[colname]  # deleting the column from the dataset

    print(dataset)
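For illustration, here is a minimal sketch of how this function could be called on the question's sample data; the dataframe name df and the 0.8 cutoff are taken from the question, and note that the function modifies df in place:

import pandas as pd

df = pd.DataFrame({
    'GA':  [0.033, 0.034, 0.035, 0.022, 0.035, 0.033, 0.035, 0.035],
    'PN':  [6.652, 9.039, 10.936, 10.11, 2.963, 10.872, 21.694, 10.936],
    'PC':  [6.681, 6.224, 10.304, 9.603, 17.156, 10.244, 22.389, 10.304],
    'MBP': [0.194, 0.194, 1.015, 1.374, 0.599, 1.015, 1.015, 1.015],
    'GR':  [0.874, 1.137, 0.911, 0.848, 0.823, 0.574, 0.859, 0.911],
    'AP':  [3.177, 3.4, 4.9, 4.566, 9.406, 4.871, 9.259, 4.5],
})

correlation(df, 0.8)  # drops one column of every pair correlated above 0.8 and prints the reduced dataframe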
Hope this helps!
Che*_* Wu 13
The approach here worked well for me, with only a few lines of code: https://chrisalbon.com/machine_learning/feature_selection/drop_highly_correlated_features/
import numpy as np
# Create correlation matrix
corr_matrix = df.corr().abs()
# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# Find features with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
# Drop features
df.drop(to_drop, axis=1, inplace=True)
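If this recipe needs to be reused, the same three steps can be wrapped in a small helper function; the name drop_high_corr and the 0.8 default (matching the question's threshold) are my own additions, not part of the linked post:

import numpy as np

def drop_high_corr(df, threshold=0.8):
    # Absolute correlation matrix, keeping only the upper triangle (k=1 excludes the diagonal)
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    # Columns correlated above the threshold with any earlier column
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    return df.drop(columns=to_drop)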
小智 9
Here is an Auto-ML class I created to eliminate multicollinearity between features.
What makes my code different is that, of two features with high correlation, I eliminate the one that is least correlated with the target! I got this idea from this seminar by Vishal Patel Sir - https://www.youtube.com/watch?v=ioXKxulmwVQ&feature=youtu.be
import pandas as pd

#Feature selection class to eliminate multicollinearity
class MultiCollinearityEliminator():

    #Class Constructor
    def __init__(self, df, target, threshold):
        self.df = df
        self.target = target
        self.threshold = threshold

    #Method to create and return the feature correlation matrix dataframe
    def createCorrMatrix(self, include_target = False):
        #Checking whether we should include the target in the correlation matrix
        if (include_target == False):
            df_temp = self.df.drop([self.target], axis=1)
            #Setting method to Pearson to prevent issues in case the default method for df.corr() gets changed
            #Setting min_periods to 30 for the sample size to be statistically significant (normal) according to
            #central limit theorem
            corrMatrix = df_temp.corr(method='pearson', min_periods=30).abs()
        #Target is included for creating the series of feature to target correlation - Please refer the notes under the
        #print statement to understand why we create the series of feature to target correlation
        elif (include_target == True):
            corrMatrix = self.df.corr(method='pearson', min_periods=30).abs()
        return corrMatrix

    #Method to create and return the feature to target correlation matrix dataframe
    def createCorrMatrixWithTarget(self):
        #After obtaining the list of correlated features, this method will help to view which variables
        #(in the list of correlated features) are least correlated with the target
        #This way, out of the list of correlated features, we can ensure to eliminate the feature that is
        #least correlated with the target
        #This not only helps to sustain the predictive power of the model but also helps in reducing model complexity

        #Obtaining the correlation matrix of the dataframe (along with the target)
        corrMatrix = self.createCorrMatrix(include_target = True)
        #Creating the required dataframe, then dropping the target row
        #and sorting by the value of correlation with target (in ascending order)
        corrWithTarget = pd.DataFrame(corrMatrix.loc[:, self.target]).drop([self.target], axis = 0).sort_values(by = self.target)
        print(corrWithTarget, '\n')
        return corrWithTarget

    #Method to create and return the list of correlated features
    def createCorrelatedFeaturesList(self):
        #Obtaining the correlation matrix of the dataframe (without the target)
        corrMatrix = self.createCorrMatrix(include_target = False)
        colCorr = []
        #Iterating through the columns of the correlation matrix dataframe
        for column in corrMatrix.columns:
            #Iterating through the values (row wise) of the correlation matrix dataframe
            for idx, row in corrMatrix.iterrows():
                if (row[column] > self.threshold) and (row[column] < 1):
                    #Adding the features that are not already in the list of correlated features
                    if (idx not in colCorr):
                        colCorr.append(idx)
                    if (column not in colCorr):
                        colCorr.append(column)
        print(colCorr, '\n')
        return colCorr

    #Method to eliminate the least important features from the list of correlated features
    def deleteFeatures(self, colCorr):
        #Obtaining the feature to target correlation matrix dataframe
        corrWithTarget = self.createCorrMatrixWithTarget()
        for idx, row in corrWithTarget.iterrows():
            print(idx, '\n')
            if (idx in colCorr):
                self.df = self.df.drop(idx, axis=1)
                break
        return self.df

    #Method to run automatically eliminate multicollinearity
    def autoEliminateMulticollinearity(self):
        #Obtaining the list of correlated features
        colCorr = self.createCorrelatedFeaturesList()
        while colCorr != []:
            #Obtaining the dataframe after deleting the feature (from the list of correlated features)
            #that is least correlated with the target
            self.df = self.deleteFeatures(colCorr)
            #Obtaining the list of correlated features
            colCorr = self.createCorrelatedFeaturesList()
        return self.df
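A minimal usage sketch (the dataframe df, the target column name 'target' and the 0.8 threshold are assumptions for illustration; the dataframe passed in must contain the target column):

eliminator = MultiCollinearityEliminator(df, target='target', threshold=0.8)
df_reduced = eliminator.autoEliminateMulticollinearity()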
You can use the following for a given dataframe df:

import numpy as np

corr_matrix = df.corr().abs()
high_corr_var = np.where(corr_matrix > 0.8)
high_corr_var = [(corr_matrix.columns[x], corr_matrix.columns[y]) for x, y in zip(*high_corr_var) if x != y and x < y]
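This only lists the highly correlated pairs. If you then want to drop one column from each pair (here, arbitrarily the second one of each pair), a possible follow-up under the same df assumption is:

cols_to_drop = {y for _, y in high_corr_var}
df_reduced = df.drop(columns=cols_to_drop)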
Can you test the code below?
import pandas as pd
import numpy as np

# Create feature matrix with two highly correlated features
X = np.array([[1, 1, 1],
              [2, 2, 0],
              [3, 3, 1],
              [4, 4, 0],
              [5, 5, 1],
              [6, 6, 0],
              [7, 7, 1],
              [8, 7, 0],
              [9, 7, 1]])

# Convert feature matrix into DataFrame
df = pd.DataFrame(X)

# View the data frame
df

# Create correlation matrix
corr_matrix = df.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

# Drop features
df.drop(to_drop, axis=1)
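As a sanity check (my addition, not part of the original answer): in this toy matrix the first two columns track each other almost exactly, with a Pearson correlation of roughly 0.98, so to_drop should come out as [1] and only column 1 is removed:

print(corr_matrix.loc[0, 1])  # about 0.98, above the 0.95 cutoff
print(to_drop)                # expected: [1]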
I found the answer provided by TomDobbs very useful, but it didn't work as intended; it has a couple of problems, which my revised version below corrects:
def remove_collinear_features(x, threshold):
    '''
    Objective:
        Remove collinear features in a dataframe with a correlation coefficient
        greater than the threshold. Removing collinear features can help a model
        to generalize and improves the interpretability of the model.

    Inputs:
        x: features dataframe
        threshold: features with correlations greater than this value are removed

    Output:
        dataframe that contains only the non-highly-collinear features
    '''

    # Calculate the correlation matrix
    corr_matrix = x.corr()
    iters = range(len(corr_matrix.columns) - 1)
    drop_cols = []

    # Iterate through the correlation matrix and compare correlations
    for i in iters:
        for j in range(i + 1):
            item = corr_matrix.iloc[j:(j+1), (i+1):(i+2)]
            col = item.columns
            row = item.index
            val = abs(item.values)

            # If correlation exceeds the threshold
            if val >= threshold:
                # Print the correlated features and the correlation value
                print(col.values[0], "|", row.values[0], "|", round(val[0][0], 2))
                drop_cols.append(col.values[0])

    # Drop one of each pair of correlated columns
    drops = set(drop_cols)
    x = x.drop(columns=drops)

    return x
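As with the earlier answers, a short usage sketch on the question's data (df as built in the first example above and the 0.8 cutoff are both assumptions):

reduced_df = remove_collinear_features(df, 0.8)
print(reduced_df.columns.tolist())  # headers of the remaining descriptors are preserved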
I took the liberty of modifying TomDobbs' answer. The bugs reported in the comments are now removed. Additionally, the new function also filters out negative correlations.
def corr_df(x, corr_val):
    '''
    Obj: Drops features that are strongly correlated to other features.
         This lowers model complexity, and aids in generalizing the model.
    Inputs:
        df: features df (x)
        corr_val: Columns are dropped relative to the corr_val input (e.g. 0.8)
    Output: df that only includes uncorrelated features
    '''

    # Creates Correlation Matrix and Instantiates
    corr_matrix = x.corr()
    iters = range(len(corr_matrix.columns) - 1)
    drop_cols = []

    # Iterates through Correlation Matrix Table to find correlated columns
    for i in iters:
        for j in range(i):
            item = corr_matrix.iloc[j:(j+1), (i+1):(i+2)]
            col = item.columns
            row = item.index
            val = item.values
            if abs(val) >= corr_val:
                # Prints the correlated feature set and the corr val
                print(col.values[0], "|", row.values[0], "|", round(val[0][0], 2))
                drop_cols.append(i)

    drops = sorted(set(drop_cols))[::-1]

    # Drops the correlated columns
    for i in drops:
        col = x.iloc[:, (i+1):(i+2)].columns.values
        x = x.drop(col, axis=1)

    return x
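Again a short usage sketch under the same assumptions (df holding the question's descriptors, 0.8 as the cutoff); this function returns a new dataframe rather than modifying df in place:

uncorrelated_df = corr_df(df, 0.8)
print(uncorrelated_df.head())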