Different results: "xgboost" vs. "caret" in R

Tags: r, classification, machine-learning, r-caret, xgboost

I am trying to set up an xgboost classification model in R

  1. using the xgboost library
  2. using caret

But although I use the same settings (?), I get different results (e.g., different predictions for the test dataset). I use the Ionosphere dataset from the mlbench library and try to keep this example as simple as possible (no cross-validation, no parameter tuning, etc.).

Does anyone know why I get different results (see below)? Since caret is just a wrapper around xgboost ("it just calls the same xgboost package"), the results should be exactly the same.
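One way to check the "wrapper" claim directly is to look at the code caret dispatches to for method="xgbTree". A minimal sketch using caret's getModelInfo() helper; the printed function body should contain the underlying xgboost::xgb.train() call:

library(caret)

# Look up the model module that train() uses for method = "xgbTree"
info <- getModelInfo(model = "xgbTree", regex = FALSE)[[1]]

# The $fit element is the function caret calls to build the final model;
# its body shows the call into the xgboost package
print(body(info$fit))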

library(caret)
library(xgboost)
library(mlbench)

#####
###Load & Prepare Data
#####
data(Ionosphere)
dataset <- Ionosphere
dataset <- dataset[,-c(2)] # remove V2 (constant everywhere)
dataset$V1 <- as.numeric(as.character(dataset$V1)) #factor to numeric
dataset$Class <- ifelse(dataset$Class == "good", 1, 0) # convert good -> 1, bad -> 0
dataset$Class <- as.factor(dataset$Class)              # convert to factor

#####
###Create Train & Test Dataset
#####
set.seed(1992)
validation_index<-createDataPartition(dataset$Class, p=0.8, list=FALSE)
testSet<-dataset[-validation_index,]
trainSet<-dataset[validation_index,]

# xgb.DMatrix for xgb.train() (column 34 of the data is Class)
# labels must be numeric 0/1, so convert the factor explicitly
# rather than relying on data.matrix()'s coercion of a factor
xgb.trainData <- xgb.DMatrix(data = data.matrix(trainSet[,-c(34)]),
                             label = as.numeric(as.character(trainSet$Class)))
xgb.testData  <- xgb.DMatrix(data = data.matrix(testSet[,-c(34)]),
                             label = as.numeric(as.character(testSet$Class)))

#####
###Set parameters & create models
#####
#params
param <- data.frame(nrounds = 100, max_depth = 2, eta = 0.3, gamma = 0,
                    colsample_bytree = 0.8, min_child_weight = 1, subsample = 1)

#xgboost
set.seed(1992)
fit.xgb <- xgb.train(
        params = list(eta = param$eta, max_depth = param$max_depth, 
            gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
            min_child_weight = param$min_child_weight, subsample = param$subsample),    
        data = xgb.trainData, nrounds = param$nrounds, objective = "binary:logistic")

#caret
set.seed(1992)
fit.xgbTree <- train(Class~., data=trainSet, method="xgbTree",
                    metric="Accuracy", trControl=trainControl(method="none"),tuneGrid=param)

#####
###Print results (predictions)
#####
print("xgboost")
predictionxgb <- as.numeric(predict(fit.xgb, xgb.testData) >= 0.5)
# confusionMatrix() expects both arguments to be factors with the same levels
confusionMatrix(factor(predictionxgb, levels = c(0, 1)), testSet$Class)
#Confusion Matrix and Statistics
#
#          Reference
#Prediction  0  1
#         0 18  0
#         1  7 45
# ...


print("caret")
predictionsxgbTree <- predict(fit.xgbTree, testSet)
confusionMatrix(predictionsxgbTree, testSet$Class)
#Confusion Matrix and Statistics
#
#          Reference
#Prediction  0  1
#         0 17  0
#         1  8 45
# ...
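To narrow down where the two fits diverge, it may also help to compare the predicted probabilities rather than the thresholded classes. A minimal sketch, assuming the objects from the code above; fit.xgbTree$finalModel is itself an xgb.Booster, so it accepts the same xgb.DMatrix (I am assuming the column order and encoding caret used internally match the hand-built matrix, which is worth verifying):

# Probabilities from the plain xgboost model
prob.xgb <- predict(fit.xgb, xgb.testData)

# Probabilities from the booster caret fitted internally
prob.caret <- predict(fit.xgbTree$finalModel, xgb.testData)

# How different are the raw probabilities?
summary(prob.xgb - prob.caret)

# Which test rows flip class at the 0.5 threshold?
which((prob.xgb >= 0.5) != (prob.caret >= 0.5))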

This information about the two models might also help (I don't see any important difference):

#xgboost:
fit.xgb
##### xgb.Booster
#raw: 35.9 Kb 
#call:
#  xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
#    gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
#    min_child_weight = param$min_child_weight, subsample = param$subsample), 
#    data = xgb.trainData, nrounds = param$nrounds, objective = "binary:logistic")
#params (as set within xgb.train):
#  eta = "0.3", max_depth = "2", gamma = "0", colsample_bytree = "0.8", min_child_weight = "1", subsample = "1", objective = "binary:logistic", silent = "1"
#xgb.attributes:
#  niter
#callbacks:
#  cb.print.evaluation(period = print_every_n)
#niter: 100


#caret:
fit.xgbTree$finalModel
##### xgb.Booster
#raw: 36 Kb 
#call:
#  xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
#   gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
#   min_child_weight = param$min_child_weight, subsample = param$subsample), 
#    data = x, nrounds = param$nrounds, objective = "binary:logistic")
#params (as set within xgb.train):
#  eta = "0.3", max_depth = "2", gamma = "0", colsample_bytree = "0.8", min_child_weight = "1", subsample = "1", objective = "binary:logistic", silent = "1"
#xgb.attributes:
#  niter
#callbacks:
#  cb.print.evaluation(period = print_every_n)
#niter: 100
#xNames: V1 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 V29 V30 V31 V32 V33 V34
#problemType: Classification
#tuneValue:
#  nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
#1     100         2 0.3     0              0.8                1         1
#obsLevels: 0 1
#param:
#        list()
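For completeness, the learned trees themselves can be compared: xgboost's xgb.dump() returns a booster as a character vector of split rules when no file name is given, so identical dumps would mean both models learned exactly the same trees and any difference lies elsewhere. A minimal sketch, again assuming the objects from above:

# Dump both boosters as text, one line per node/split
dump.xgb   <- xgb.dump(fit.xgb)
dump.caret <- xgb.dump(fit.xgbTree$finalModel)

# TRUE only if every tree and split threshold matches exactly
identical(dump.xgb, dump.caret)

# If the dumps have equal length, locate the first diverging line
if (length(dump.xgb) == length(dump.caret)) {
  head(which(dump.xgb != dump.caret), 1)
}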