我正在使用 tidymodels 创建随机森林(random forest)预测。我的测试数据包含训练数据中不存在的新因子水平,这会导致以下警告和错误:
1: Novel levels found in column 'Siblings': '4'. The levels have been removed, and values have been coerced to 'NA'.
2: There are new levels in a factor: NA
> test_predict
Fehler: Objekt 'test_predict' nicht gefunden
Run Code Online (Sandbox Code Playgroud)
我尝试在配方(recipe)中对 “Siblings” 列使用 step_novel() 和 step_dummy(),但这并不能解决错误。我应该如何处理训练数据中不存在的新因子水平?
library(tidyverse)
library(tidymodels)
# Training data: 16 rows, every column stored as a factor.
# The numeric columns repeat the same 8-value pattern twice.
data <- data.frame(
  Survived = factor(rep(c(0, 1, 1, 1, 0, 0, 0, 0), times = 2)),
  Siblings = factor(rep(c(1, 1, 0, 1, 0, 0, 0, 3), times = 2)),
  Class    = factor(rep(c(0, 1, 0, 1, 0, 1, 0, 0), times = 2)),
  Embarked = factor(
    c("s", "c", "m", "m", "s", "c", "s", "m", "m", rep("s", times = 7))
  )
)
# Data to predict on: same predictor columns as the training data, no outcome.
test <- data.frame(
  # The final value (4) is a factor level that never occurs in the training data.
  Siblings = factor(c(1, 1, 0, 1, 0, 0, 0, 3, 1, 1, 0, 1, 0, 0, 0, 4)),
  Class    = factor(rep(c(0, 1, 0, 1, 0, 1, 0, 0), times = 2)),
  Embarked = factor(
    c("s", "c", "m", "m", "s", "c", "s", "m", "m", rep("s", times = 7))
  )
)
# Model ----
# Random forest classification spec fitted with the ranger engine;
# impurity-based variable importance is requested from the engine.
rf_model <-
  rand_forest(mtry = 3, trees = 1000, min_n = 15) %>%
  set_mode("classification") %>%
  set_engine("ranger", importance = "impurity")
# Recipe ----
# Survived is the outcome; Siblings, Class and Embarked are the predictors.
data_recipe <- recipe(Survived ~ Siblings + Class + Embarked, data = data)
# Add a step for previously unseen Siblings levels, then dummy-encode Siblings.
data_recipe <- step_novel(data_recipe, Siblings)
data_recipe <- step_dummy(data_recipe, Siblings)
# Workflow ----
# Bundle the preprocessing recipe and the model spec into one workflow.
rf_workflow <- workflow()
rf_workflow <- add_recipe(rf_workflow, data_recipe)
rf_workflow <- add_model(rf_workflow, rf_model)

# Fit on the training data and show the fitted workflow.
final_model <- fit(rf_workflow, data = data)
final_model

# Predict on the test data, which contains a Siblings level absent from the
# training data, then show the predictions.
test_predict <- predict(final_model, new_data = test)
test_predict
Run Code Online (Sandbox Code Playgroud)
如果您查看 step_novel() 的文档,会看到其中写道:
当拟合可以处理新因子水平的模型时,请考虑使用 workflows::add_recipe(),并在 hardhat::default_recipe_blueprint() 中设置 allow_novel_levels = TRUE。这将使您的模型能够在预测时处理新的水平,而不是抛出警告或错误。
所以你想这样做:
library(tidyverse)
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#>   method                   from
#>   required_pkgs.model_spec parsnip

# Training data: every column is a factor.
data <-
  data.frame(
    Survived = as.factor(c(0,1,1,1,0,0,0,0,0,1,1,1,0,0,0,0)),
    Siblings = as.factor(c(1,1,0,1,0,0,0,3,1,1,0,1,0,0,0,3)),
    Class = as.factor(c(0,1,0,1,0,1,0,0,0,1,0,1,0,1,0,0)),
    Embarked = as.factor(c("s","c","m","m","s","c","s","m","m","s","s","s","s","s","s","s"))
  )

# Data to predict on; the last Siblings value is not present in `data`.
test <-
  data.frame(
    Siblings = as.factor(c(1,1,0,1,0,0,0,3,1,1,0,1,0,0,0,4)), # New factor level
    Class = as.factor(c(0,1,0,1,0,1,0,0,0,1,0,1,0,1,0,0)),
    Embarked = as.factor(c("s","c","m","m","s","c","s","m","m","s","s","s","s","s","s","s"))
  )

# Model
rf_model <-
  rand_forest() %>%
  set_args(
    mtry = 3,
    trees = 1000,
    min_n = 15
  ) %>%
  set_engine("ranger",
             importance = "impurity") %>%
  set_mode("classification")

# Recipe
data_recipe <-
  recipe(Survived ~ Siblings + Class + Embarked, data = data) %>%
  step_novel(Siblings) %>%
  step_dummy(Siblings)

# Workflow
# The only change from the question's code: pass a recipe blueprint with
# allow_novel_levels = TRUE so that unseen factor levels in new data are
# allowed through to the recipe at prediction time.
rf_workflow <-
  workflow() %>%
  add_recipe(data_recipe,
             blueprint = hardhat::default_recipe_blueprint(allow_novel_levels = TRUE)) %>%
  add_model(rf_model)

final_model <- fit(rf_workflow, data)
final_model
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Recipe
#> Model: rand_forest()
#>
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> 2 Recipe Steps
#>
#> • step_novel()
#> • step_dummy()
#>
#> ── Model ───────────────────────────────────────────────────────────────────────
#> Ranger result
#>
#> Call:
#>  ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~3, x), num.trees = ~1000, min.node.size = min_rows(~15, x), importance = ~"impurity", num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE)
#>
#> Type:                             Probability estimation
#> Number of trees:                  1000
#> Sample size:                      16
#> Number of independent variables:  5
#> Mtry:                             3
#> Target node size:                 15
#> Variable importance mode:         impurity
#> Splitrule:                        gini
#> OOB prediction error (Brier s.):  0.254242

test_predict <- predict(final_model, test)
test_predict
#> # A tibble: 16 x 1
#>    .pred_class
#>    <fct>
#>  1 0
#>  2 1
#>  3 0
#>  4 1
#>  5 0
#>  6 0
#>  7 0
#>  8 0
#>  9 0
#> 10 1
#> 11 0
#> 12 1
#> 13 0
#> 14 0
#> 15 0
#> 16 0
Run Code Online (Sandbox Code Playgroud)
由 reprex 包(v2.0.0)创建于 2021-07-09
workflows 包中的函数对新数据的因子水平及其他方面的检查非常严格,以确保它们与训练数据一致;因此需要像上面那样通过 blueprint 显式允许新的因子水平。
\n| 归档时间: |
|
| 查看次数: |
1005 次 |
| 最近记录: |