Tags: r, machine-learning, svm, feature-selection
I have a dataset that looks like this:
ID 885038 885039 885040 885041 885042 885043 885044 Class
1267359 2 0 0 0 0 1 0 0
1295720 0 0 0 0 0 1 0 0
1295721 0 0 0 0 0 1 0 0
1295723 0 0 0 0 0 1 0 0
1295724 0 0 0 1 0 1 0 0
1295725 0 0 0 1 0 1 0 0
1295726 2 0 0 0 0 1 0 1
1295727 2 0 0 0 0 1 0 1
1295740 0 0 0 0 0 1 0 1
1295742 0 0 0 0 0 1 0 1
1295744 0 0 0 0 0 1 0 1
1295745 0 0 0 0 0 1 0 1
1295746 0 0 0 0 0 1 0 1
To perform recursive feature elimination I followed the usual steps (train an SVM, rank the features by their squared weights, drop the lowest-ranked feature, and repeat).
Below is the R code I wrote for this. It runs without any errors, and the loop continues for the length of the data set (one pass per remaining feature).
data <- read.csv("dummy - Copy.csv", header = TRUE)
rownames(data) <- data[,1]
data<-data[,-1]
for (k in 1:length(data)){
inTraining <- createDataPartition(data$Class, p = .70, list = FALSE)
training <- data[ inTraining,]
testing <- data[-inTraining,]
## Building the model ####
svm.model <- svm(Class ~ ., data = training, cross=10,metric="ROC",type="eps-regression",kernel="linear",na.action=na.omit,probability = TRUE)
###### auc measure #######
#prediction and ROC
svm.model$index
svm.pred <- predict(svm.model, testing, probability = TRUE)
#calculating auc
c <- as.numeric(svm.pred)
c = c - 1
pred <- prediction(c, testing$Class)
perf <- performance(pred,"tpr","fpr")
plot(perf,fpr.stop=0.1)
auc <- performance(pred, measure = "auc")
auc <- auc@y.values[[1]]
#compute the weight vector
w = t(svm.model$coefs)%*%svm.model$SV
#compute ranking criteria
weight_matrix = w * w
#rank the features
w_transpose <- t(weight_matrix)
w2 <- as.matrix(w_transpose[order(w_transpose[,1], decreasing = FALSE),])
a <- as.matrix(w2[which(w2 == min(w2)),]) #to get the rows with minimum values
row.names(a) -> remove
data<- data[,setdiff(colnames(data),remove)]
print(length(data))
length <- (length(data))
cols_names <- colnames(data)
print(auc)
output <- paste(length,auc,sep=";")
write(output, file = "output.txt",append = TRUE)
write(cols_names, file = paste(length,"cols_selected", ".txt", sep=""))
}
The printed output looks like this:
[1] 3
[1] 0.5
[1] 2
[1] 0.5
[1] 2
[1] 0.5
[1] 2
[1] 0.75
[1] 2
[1] 1
[1] 2
[1] 0.75
[1] 2
[1] 0.5
[1] 2
[1] 0.75
However, when I take one of the selected feature subsets, e.g. the 3 remaining features, and build the SVM model with the same code (just without the loop), I do not get the same AUC value (I get 0.75 instead of 0.5):
data <- read.csv("3.csv", header = TRUE)
rownames(data) <- data[,1]
data<-data[,-1]
inTraining <- createDataPartition(data$Class, p = .70, list = FALSE)
training <- data[ inTraining,]
testing <- data[-inTraining,]
## Building the model ####
svm.model <- svm(Class ~ ., data = training, cross=10,metric="ROC",type="eps-regression",kernel="linear",na.action=na.omit,probability = TRUE)
###### auc measure #######
#prediction and ROC
svm.model$index
svm.pred <- predict(svm.model, testing, probability = TRUE)
#calculating auc
c <- as.numeric(svm.pred)
c = c - 1
pred <- prediction(c, testing$Class)
perf <- performance(pred,"tpr","fpr")
plot(perf,fpr.stop=0.1)
auc <- performance(pred, measure = "auc")
auc <- auc@y.values[[1]]
print(auc)
This prints:
[1] 3
[1] 0.75 (instead of 0.5)
Both pieces of code are identical (one with the recursive loop, the other without it), yet the AUC differs for the same feature subset. The 3 columns used in both runs (885041, 885043 and Class) are the same, but the AUC values are different.
I think relying on cross-validation alone is fine here. In your code you already use 10-fold CV to estimate the test error, so additionally splitting the data into training and test sets seems unnecessary. Since you do not mention tuning the parameters, cost and gamma are left at their default values.
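One thing worth checking first: createDataPartition() and the folds used by svm()'s internal cross-validation are random, so two runs on the same feature subset will usually give different AUC values unless the random seed is fixed, which is likely why your loop and the stand-alone run disagree. Also, with a linear kernel only cost really matters. Below is a minimal sketch of fixing the seed and grid-searching cost with e1071::tune(), reusing the training data frame from your code; the seed value and the cost grid are only placeholders, not a recommendation.

set.seed(123)                           # fix the RNG so the partition / CV folds are reproducible

# grid-search the cost parameter with 10-fold CV (example grid only)
tuned <- tune(
  svm, Class ~ ., data = training,
  kernel = "linear",
  ranges = list(cost = 2^(-2:4)),
  tunecontrol = tune.control(cross = 10)
)
tuned$best.parameters                   # cost with the lowest cross-validated error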
library(tidyverse)
library(e1071)
library(caret)
library(ROCR)
library(foreach)
The feature names are numeric, and svm() seems to change them during the fitting process, so to keep them matched I would rename the columns first. Second, the folds can be assigned with caret::createFolds() instead of createDataPartition().
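Note that the snippets below start from a data frame called mydf, which is not shown being created; presumably it is just the posted table read in with its numeric column names kept (the file name is taken from the question):

# assumption: mydf holds the posted data with ID, the numeric feature columns and Class
mydf <- read.csv("dummy - Copy.csv", header = TRUE, check.names = FALSE)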
set.seed(1)
k <- 5 # 5-fold CV
mydf3 <-
mydf %>%
rename_at(.vars = vars(-ID, -Class), .funs = function(x) str_c("X.", x, ".")) %>%
mutate(fold = createFolds(1:n(), k = k, list = FALSE)) # fold id column
# the number of features-------------------------------
x_num <-
mydf3 %>%
select(-ID, -Class, -fold) %>%
ncol()
For the iteration over features, foreach() can be another option.
cl <- parallel::makeCluster(2)
doParallel::registerDoParallel(cl, cores = 2)
parallel::clusterExport(cl, c("mydf3", "x_num"))
parallel::clusterEvalQ(cl, c(library(tidyverse), library(ROCR)))
#---------------------------------------------------------------
svm_rank <-
foreach(j = seq_len(x_num), .combine = rbind) %do% {
mod <-
foreach(cv = 1:k, .combine = bind_rows, .inorder = FALSE) %dopar% { # parallelization
tr <-
mydf3 %>%
filter(fold != cv) %>% # train
select(-fold, -ID) %>%
e1071::svm( # fitting svm
Class ~ .,
data = .,
kernel = "linear",
type = "eps-regression",
probability = TRUE,
na.action = na.omit
)
# auc
te <-
mydf3 %>%
filter(fold == cv) %>%
predict(tr, newdata = ., probability = TRUE)
predob <- prediction(te, mydf3 %>% filter(fold == cv) %>% select(Class))
auc <- performance(predob, measure = "auc")@y.values[[1]]
# ranking - your formula
w <- t(tr$coefs) %*% tr$SV
if (is.null(names(w))) colnames(w) <- attr(tr$terms, "term.labels") # when only one feature left
(w * w) %>%
tbl_df() %>%
mutate(auc = auc)
}
auc <- mean(mod %>% select(auc) %>% pull()) # aggregate cv auc
w_mat <- colMeans(mod %>% select(-auc)) # aggregate cv ranking
remove <- names(which.min(w_mat)) # minimum rank
used <-
mydf3 %>%
select(-ID, -Class, -fold) %>%
names() %>%
str_c(collapse = " & ")
mydf3 <-
mydf3 %>%
select(-remove) # remove feature for next step
tibble(used = used, delete = remove, auc = auc)
}
#---------------------------------------------------
parallel::stopCluster(cl)
For each elimination step you then get:
svm_rank
#> # A tibble: 7 x 3
#> used delete auc
#> <chr> <chr> <dbl>
#> 1 X.885038. & X.885039. & X.885040. & X.885041. & X.885042… X.88503… 0.7
#> 2 X.885038. & X.885040. & X.885041. & X.885042. & X.885043… X.88504… 0.7
#> 3 X.885038. & X.885041. & X.885042. & X.885043. & X.885044. X.88504… 0.7
#> 4 X.885038. & X.885041. & X.885043. & X.885044. X.88504… 0.7
#> 5 X.885038. & X.885041. & X.885043. X.88504… 0.7
#> 6 X.885038. & X.885041. X.88503… 0.7
#> 7 X.885041. X.88504… 0.7
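If the goal is to pick the feature subset with the best cross-validated AUC rather than just inspect the trace, one simple way (a sketch based on the svm_rank tibble above) is:

# keep the elimination step(s) that reached the highest mean CV AUC
svm_rank %>% filter(auc == max(auc))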