SVR超参数选择和可视化

0 data-visualization svm data-analysis scikit-learn grid-search

我只是数据分析的初学者。我想使用“交叉验证网格搜索方法”来确定径向基函数 (RBF) 内核 SVM 的参数 gamma 和 C。我不知道应该将数据放在这段代码的哪里,也不知道我的数据类型是什么应该使用(训练或目标数据)?

对于SVR

import numpy as np
import pandas as pd
from math import sqrt
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error,explained_variance_score
from TwoStageTrAdaBoostR2 import TwoStageTrAdaBoostR2 # import the two-stage algorithm
from sklearn import preprocessing
from sklearn import svm
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from matplotlib.colors import Normalize
from sklearn.svm import SVC

# Data import (source)
source= pd.read_csv(sourcedata)

# Data import (target)
data= pd.read_csv(targetdata)

# Sample Size
datatrain = data.sample(n=60, random_state=1)
datatest = data[~dataL.index.isin(data.index)]

# Merge training set data (source and target)
train = pd.concat([source, datatrain], sort=False)
train.reset_index(inplace=True, drop=True)
datatest.reset_index(inplace=True, drop=True)

# Variable input
X_train, y_train = train[['x1', 'x2']].values, train['y'].values
X_test, y_test = FL[['x1', 'x2']].values, FL['y'].values

# Parameter setting
#sample_size = [n_source1+n_source2+n_source3+n_source4+n_source5, n_target_train]
n_estimators = 100
steps = 8
fold = 5
random_state = np.random.RandomState(1)
sample_size = [350, 60]

#1  twostage tradaboost.r2
regr_1 = TwoStageTrAdaBoostR2(SVR(C=50, gamma='auto'),
                      n_estimators = n_estimators, sample_size = sample_size,
                      steps = steps, fold = fold,
                      random_state = random_state)
regr_1.fit(X_train, y_train)
y_pred1 = regr_1.predict(X_test)
print("MSE of regular two stage trAdaboostR2--model1:",sqrt(mean_squared_error(y_test, y_pred1)))


#Plot the results
plt.figure()
plt.scatter(y_test, y_test-y_pred1, c="black", label="TwoStageTrAdaBoostR2_model1", s=10)
plt.xlabel("CAR")
plt.ylabel("Err")
plt.title("Two-stage Transfer Learning Boosted Decision Tree Regression", loc='left', fontsize=12, fontweight=0, color="orange")
plt.legend()
plt.show()
Run Code Online (Sandbox Code Playgroud)

对于交叉验证网格搜索方法(最佳参数):

# Cross validation grid search (best parameters) 
parameter_candidates = [
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]
svr = svm.SVC()
clf = grid_search.GridSearchCV(svr, parameters, c=5 ,n_jobs=-1)
clf.fit(X_train, y_train)
print('Best score for data:', clf.best_score_)
print('Best C:',clf.best_estimator_.C) 
print('Best Kernel:',clf.best_estimator_.kernel)
print('Best Gamma:',clf.best_estimator_.gamma)
Run Code Online (Sandbox Code Playgroud)

用于参数效果的可视化

c_range = np.logspace(-2, 2, 4)
gamma_range = np.logspace(-2, 2, 5)
tuned_parameters = [{'kernel': ['rbf'],'C': c_range,'gamma':gamma_range},
                    {'kernel': ['linear'], 'C': c_range,'gamma':gamma_range}]

svr = svm.SVR()
clf = GridSearchCV(svr,param_grid=tuned_parameters,verbose=2,n_jobs=-1,
                   scoring='explained_variance')
clf.fit(X_train, y_train)

print('Best score for data:', clf.best_score_)
print('Best C:',clf.best_estimator_.C) 
print('Best Kernel:',clf.best_estimator_.kernel)
print('Best Gamma:',clf.best_estimator_.gamma)

# scores for rbf kernel
n = len(gamma_range)*len(c_range)
scores_rbf = clf.cv_results_['mean_test_score'][:n].reshape(len(gamma_range),
                                                            len(c_range))

# scores for rbf kernel
scores_linear = clf.cv_results_['mean_test_score'][n:].reshape(len(gamma_range),
                                                               len(c_range))

class MidpointNormalize(Normalize):

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        Normalize.__init__(self, vmin, vmax, clip)

    def __call__(self, value, clip=None):
        x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
        return np.ma.masked_array(np.interp(value, x, y))



plt.figure(figsize=(8, 6))
plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
plt.imshow(scores_rbf, interpolation='nearest', cmap=plt.cm.hot,
           norm=MidpointNormalize(vmin=0.2, midpoint=0.92))
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
plt.yticks(np.arange(len(c_range)), c_range)
plt.title('Validation accuracy')
plt.show()
Run Code Online (Sandbox Code Playgroud)

当我使用这段代码时,我发现了以下输出热图! 在此输入图像描述

但我正在尝试获得这样的热图 在此输入图像描述

bal*_*day 5

以下带有一些典型回归数据的代码应该可以正常工作:

import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV,train_test_split
from matplotlib.colors import Normalize

class MidpointNormalize(Normalize):

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        Normalize.__init__(self, vmin, vmax, clip)

    def __call__(self, value, clip=None):
        x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
        return np.ma.masked_array(np.interp(value, x, y))
    

X, y = datasets.load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X,y)

# Cross validation grid search (best parameters) 
c_range = np.logspace(-0, 4, 8)
gamma_range = np.logspace(-4, 0, 8)
tuned_parameters = [{'kernel': ['rbf'],'C': c_range,'gamma':gamma_range},
                    {'kernel': ['linear'], 'C': c_range,'gamma':gamma_range}]

svr = svm.SVR()
clf = GridSearchCV(svr,param_grid=tuned_parameters,verbose=20,n_jobs=-4,cv=4,
                   scoring='explained_variance')
clf.fit(X_train, y_train)

print('Best score for data:', clf.best_score_)
print('Best C:',clf.best_estimator_.C) 
print('Best Kernel:',clf.best_estimator_.kernel)
print('Best Gamma:',clf.best_estimator_.gamma)

# scores for rbf kernel
n = len(gamma_range)*len(c_range)
scores_rbf = clf.cv_results_['mean_test_score'][:n].reshape(len(gamma_range),
                                                            len(c_range))

# scores for rbf kernel
scores_linear = clf.cv_results_['mean_test_score'][n:].reshape(len(gamma_range),
                                                               len(c_range))


plt.figure(figsize=(8, 6))
plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
plt.imshow(scores_rbf, interpolation='nearest', cmap=plt.cm.hot,
           norm=MidpointNormalize(vmin=-.2, midpoint=0.5))
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(len(gamma_range)),
           [np.format_float_scientific(i,1) for i in gamma_range],rotation=45)
plt.yticks(np.arange(len(c_range)), 
           [np.format_float_scientific(i,) for i in c_range])
plt.title('Validation accuracy')
plt.show()
Run Code Online (Sandbox Code Playgroud)

网格的粒度非常低,但否则需要一些时间运行。此外,网格的限制需要比我选择的网格进行更多的教育。

我不确定您为什么会收到错误,但我让事情变得简单,并在我的代码片段中启动了一次 SVR,以便您可以看到它是如何工作的。我还对Cgamma数组使用了不同的长度,这只是为了显示这些参数是如何进行的。有时我发现,如果所有内容都具有相同的长度,则很难看出哪个参数负责什么。

最终的图看起来像这样,但这在很大程度上取决于网格的范围、其粒度以及您正在使用的数据集。另请注意,我更改了您提供的类的参数MidpointNormalize

在此输入图像描述