以下代码:
require(caret)
require(plyr)
# Load the student performance data (semicolon-separated with a header row);
# text columns are kept as character so they can be recoded explicitly below.
portuguese_scores <- read.table(
  "https://raw.githubusercontent.com/JimGorman17/Datasets/master/student-por.csv",
  sep = ";",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Drop the columns that are not used as predictors.
dropped_cols <- c("school", "age", "G1", "G2")
portuguese_scores <- portuguese_scores[, !names(portuguese_scores) %in% dropped_cols]

# Binary target: TRUE when the final grade G3 is at or above the median grade.
# median() is called directly instead of indexing summary(): summary() returns
# a named element and, in R versions before 4.0, rounds it to a few
# significant digits. na.rm = TRUE mirrors summary(), which ignores NAs.
median_score <- median(portuguese_scores$G3, na.rm = TRUE)
portuguese_scores$score_gte_than_median <- as.factor(median_score <= portuguese_scores$G3)
portuguese_scores <- portuguese_scores[, !names(portuguese_scores) %in% c("G3")]

# Recode every categorical column as a numeric code. For each column the
# allowed values are listed in order; the first value becomes 0, the next 1,
# and so on.
yes_no_cols <- c(
  "schoolsup", "famsup", "paid", "activities",
  "nursery", "higher", "internet", "romantic"
)
category_codes <- c(
  list(
    sex = c("M", "F"),
    address = c("U", "R"),
    famsize = c("LE3", "GT3"),
    Pstatus = c("T", "A"),
    Mjob = c("at_home", "health", "other", "services", "teacher"),
    Fjob = c("at_home", "health", "other", "services", "teacher"),
    reason = c("course", "home", "other", "reputation"),
    guardian = c("father", "mother", "other")
  ),
  setNames(rep(list(c("no", "yes")), length(yes_no_cols)), yes_no_cols)
)

for (col in names(category_codes)) {
  raw_values <- portuguese_scores[[col]]
  # match() gives the 1-based position in the allowed values; shift to 0-based.
  codes <- as.numeric(match(raw_values, category_codes[[col]]) - 1L)
  # Values outside the mapping become NA; report them instead of hiding them.
  unmapped <- is.na(codes) & !is.na(raw_values)
  if (any(unmapped)) {
    warning(
      "Column '", col, "' contains unmapped values: ",
      paste(unique(raw_values[unmapped]), collapse = ", "),
      call. = FALSE
    )
  }
  portuguese_scores[[col]] <- codes
}
# Min-max rescale a numeric vector to the [0, 1] interval.
#
# x: numeric vector.
# Returns a numeric vector of the same length as x. When every element of x
# has the same value the range is zero; zeros are returned in that case
# instead of the NaN values that 0 / 0 would produce. If x contains NA the
# result is all NA, because min() and max() propagate NA.
normalize <- function(x) {
  lowest <- min(x)
  span <- max(x) - lowest
  if (!is.na(span) && span == 0) {
    return(numeric(length(x)))
  }
  (x - lowest) / span
}
# Rescale the 28 predictor columns with normalize() and re-attach column 29,
# which holds the class label and is left unscaled.
predictor_idx <- 1:28
label_idx <- 29
scaled_predictors <- lapply(portuguese_scores[predictor_idx], normalize)
port_n <- data.frame(scaled_predictors, portuguese_scores[label_idx])

# Reproducible split: roughly 90% of the rows for training, the rest held out.
set.seed(123)
n_rows <- nrow(port_n)
train_sample <- sample(n_rows, .9 * n_rows)
port_train <- port_n[train_sample, ]
port_test <- port_n[-train_sample, ]

# Fit a radial-kernel SVM with caret's default tuning, then print the result.
out1 <- train(
  x = port_train[, predictor_idx],
  y = port_train[, label_idx],
  method = "svmRadial"
)
out1
Run Code Online (Sandbox Code Playgroud)
生成以下输出:
Support Vector Machines with Radial Basis Function Kernel
584 samples
28 predictor
2 classes: 'FALSE', 'TRUE'
No pre-processing
Resampling: Bootstrapped (25 reps)
Summary of sample sizes: 584, 584, 584, 584, 584, 584, ...
Resampling results across tuning parameters:
C Accuracy Kappa Accuracy SD Kappa SD
0.25 0.7383930 0.4633478 0.02782725 0.05484469
0.50 0.7382364 0.4637857 0.02883617 0.05763094
1.00 0.7290191 0.4456935 0.02570423 0.05180727
Tuning parameter 'sigma' was held constant at a value of 0.02166535
Accuracy was used to select the optimal model using the largest value.
The final values used for the model were sigma = 0.02166535 and C = 0.25.
Run Code Online (Sandbox Code Playgroud)
我的问题:
更新(致所有投票关闭此问题的用户):
为此,您需要使用 tuneGrid 参数。您需要自己创建参数组合(即 C 与 sigma 的配对),然后逐一进行测试。
例如,由于您想在所有情况下测试 C=0.25,您需要创建一个如下所示的 data.frame:
# Tuning grid: C fixed at 0.25 (recycled over all rows) paired with ten
# candidate sigma values, 0.01 through 0.10.
svmGrid <- data.frame(
  C = 0.25,
  sigma = seq_len(10) / 100
)
Run Code Online (Sandbox Code Playgroud)
这具有相同的 C (0.25) 值和不同的 sigma 值以进行优化。您需要自己为 sigma 提供这些值(这只是一个示例 - 使用任意多个)。
换句话说,根据上面的data.frame,你的svm模型会被测试10次。每次 C 将是常数并等于 0.25,而 sigma 将采用 0.01 到 0.1 的值,步长为 0.01。将进行 10 次测试并选择最佳组合。
然后你像这样运行模型:
# Same model as before, but the candidate (C, sigma) pairs now come from
# svmGrid via the tuneGrid argument instead of caret's default grid.
out1 <- train(
  x = port_train[, 1:28],
  y = port_train[, 29],
  method = "svmRadial",
  tuneGrid = svmGrid
)
Run Code Online (Sandbox Code Playgroud)
输出:
> out1
Support Vector Machines with Radial Basis Function Kernel
584 samples
28 predictor
2 classes: 'FALSE', 'TRUE'
No pre-processing
Resampling: Bootstrapped (25 reps)
Summary of sample sizes: 584, 584, 584, 584, 584, 584, ...
Resampling results across tuning parameters:
sigma Accuracy Kappa Accuracy SD Kappa SD
0.01 0.7297315 0.4417768 0.03082764 0.06044173
0.02 0.7312643 0.4474754 0.03289345 0.06567919
0.03 0.7301472 0.4468033 0.03618417 0.07187019
0.04 0.7288286 0.4463212 0.03609275 0.07200966
0.05 0.7281374 0.4466735 0.03569426 0.07055105
0.06 0.7238098 0.4400315 0.03348371 0.06666725
0.07 0.7213752 0.4364012 0.03467845 0.06849882
0.08 0.7175949 0.4286502 0.04013475 0.08014780
0.09 0.7042396 0.3981745 0.04346037 0.08864786
0.10 0.6651296 0.3061489 0.06450228 0.14079631
Tuning parameter 'C' was held constant at a value of 0.25
Accuracy was used to select the optimal model using the largest value.
The final values used for the model were sigma = 0.02 and C = 0.25.
Run Code Online (Sandbox Code Playgroud)
这样就得到了优化后的 sigma 值!