D K*_*aid 4 r r-recipes tidymodels r-parsnip
我想评估同一数据集上多个(主要是)线性回归模型的性能。我想也许使用tidymodels包和workflowsets::workflow_set()可能会起作用。我按照此处的示例进行操作,但我无法弄清楚如何从代码中实际获得拟合结果。
# Load packages
library("tidyverse")
library('workflowsets')
library("parsnip")
library("recipes")
# Data ----
# Example data set: 33 observations in a tibble-classed data frame with three
# numeric columns (`q`, `c`, `c_less_c_nought`). Every `c_less_c_nought` value
# is exactly 17 less than the `c` value in the same row.
dat <- list(
  q = c(
    66.65, 75.58, 83.06, 91.28, 119.26, 133.14, 146.32, 153.39, 168.57,
    182.36, 210.09, 188.19, 213.42, 296.95, 326.33, 358.63, 475.99, 475.99,
    683.44, 683.44, 838.49, 1282.1, 1648.97, 1572.97, 2055.14, 2521.39,
    2685.11, 2859.46, 3242.87, 6899.19, 6377.42, 7581.96, 9599.32
  ),
  c = c(
    317.06, 283.99, 279.56, 283.99, 227.84, 227.84, 262.5, 242.64, 270.9,
    266.67, 210.6, 235.12, 235.12, 210.6, 207.31, 227.84, 220.78, 194.67,
    177.13, 207.31, 179.94, 177.13, 182.79, 139.89, 148.98, 144.36, 137.71,
    158.66, 142.11, 142.11, 119.52, 110.48, 158.66
  ),
  c_less_c_nought = c(
    300.06, 266.99, 262.56, 266.99, 210.84, 210.84, 245.5, 225.64, 253.9,
    249.67, 193.6, 218.12, 218.12, 193.6, 190.31, 210.84, 203.78, 177.67,
    160.13, 190.31, 162.94, 160.13, 165.79, 122.89, 131.98, 127.36, 120.71,
    141.66, 125.11, 125.11, 102.52, 93.48, 141.66
  )
)
# Attach the compact row names (c(NA, -33L) means "33 automatic rows") and the
# tibble class vector, exactly as dput() recorded them.
dat <- structure(
  dat,
  row.names = c(NA, -33L),
  class = c("tbl_df", "tbl", "data.frame")
)
# Recipes for models ----
# Each recipe declares an outcome ~ predictor formula on `dat` and then adds a
# step that takes the base-10 logarithm of both the outcome and `q`.
# NOTE(review): the outcome is log-transformed inside the recipe; confirm that
# is intended if these recipes are ever applied to data without the outcome.

# Outcome `c`, predictor `q`.
eq1_mod1_recipe <- recipes::recipe(c ~ q, data = dat)
eq1_mod1_recipe <- recipes::step_log(eq1_mod1_recipe, c, q, base = 10)

# Outcome `c_less_c_nought`, predictor `q`.
eq2_mod2_a_recipe <- recipes::recipe(c_less_c_nought ~ q, data = dat)
eq2_mod2_a_recipe <- recipes::step_log(
  eq2_mod2_a_recipe,
  c_less_c_nought, q,
  base = 10
)
# Define model types ----
# Linear regression specification: start from the generic parsnip spec, then
# set the computational engine to "lm" and the mode to "regression". This only
# describes the model; no data is involved yet.
lm_model <- parsnip::linear_reg()
lm_model <- parsnip::set_engine(lm_model, "lm")
lm_model <- parsnip::set_mode(lm_model, "regression")
# Run the models?
# workflow_set() only assembles one workflow per recipe/model combination
# (here: the two recipes, each paired with the single linear model). Nothing is
# fitted by this call.
cq_models <- workflowsets::workflow_set(
  preproc = list(
    eq1m1 = eq1_mod1_recipe,
    e2m2a = eq2_mod2_a_recipe
  ),
  models = list(lm = lm_model)
)
Run Code Online (Sandbox Code Playgroud)
看来这实际上并没有对模型本身进行拟合。我需要添加什么代码、添加在哪里,才能拟合这些线性模型?
或者,是否有更好但仍然“整洁”的方法来做到这一点?接受建议。
工作流集(workflow set)仅适用于重采样数据,例如交叉验证折或自助法(bootstrap)重采样。这是有意的设计,这样人们就不会拿只在整个数据集上拟合一次所得到的性能指标来相互比较。

library("tidymodels")

# Data
dat <-
  structure(list(q = c(66.65, 75.58, 83.06, 91.28, 119.26, 133.14,
  146.32, 153.39, 168.57, 182.36, 210.09, 188.19, 213.42, 296.95,
  326.33, 358.63, 475.99, 475.99, 683.44, 683.44, 838.49, 1282.1,
  1648.97, 1572.97, 2055.14, 2521.39, 2685.11, 2859.46, 3242.87,
  6899.19, 6377.42, 7581.96, 9599.32), c = c(317.06, 283.99, 279.56,
  283.99, 227.84, 227.84, 262.5, 242.64, 270.9, 266.67, 210.6,
  235.12, 235.12, 210.6, 207.31, 227.84, 220.78, 194.67, 177.13,
  207.31, 179.94, 177.13, 182.79, 139.89, 148.98, 144.36, 137.71,
  158.66, 142.11, 142.11, 119.52, 110.48, 158.66), c_less_c_nought = c(300.06,
  266.99, 262.56, 266.99, 210.84, 210.84, 245.5, 225.64, 253.9,
  249.67, 193.6, 218.12, 218.12, 193.6, 190.31, 210.84, 203.78,
  177.67, 160.13, 190.31, 162.94, 160.13, 165.79, 122.89, 131.98,
  127.36, 120.71, 141.66, 125.11, 125.11, 102.52, 93.48, 141.66
  )), row.names = c(NA, -33L), class = c("tbl_df", "tbl", "data.frame"
  ))


folds <- bootstraps(dat, times = 10)

eq1_mod1_recipe <-
  recipe(c ~ q, data = dat) %>%
  step_log(c, q, base = 10)

eq2_mod2_a_recipe <-
  recipe(c_less_c_nought ~ q, data = dat) %>%
  step_log(c_less_c_nought, q, base = 10)

lm_model <- linear_reg()

res <-
  workflow_set(
    preproc = list(eq1m1 = eq1_mod1_recipe, e2m2a = eq2_mod2_a_recipe),
    models = list(lm = lm_model)
  ) %>%
  workflow_map("fit_resamples", resamples = folds)


collect_metrics(res)
#> # A tibble: 4 × 9
#>   wflow_id .config         preproc model .metric .estimator   mean     n std_err
#>   <chr>    <chr>           <chr>   <chr> <chr>   <chr>       <dbl> <int>   <dbl>
#> 1 eq1m1_lm Preprocessor1_… recipe  line… rmse    standard   0.0454    10 0.00214
#> 2 eq1m1_lm Preprocessor1_… recipe  line… rsq     standard   0.857     10 0.0220
#> 3 e2m2a_lm Preprocessor1_… recipe  line… rmse    standard   0.0502    10 0.00245
#> 4 e2m2a_lm Preprocessor1_… recipe  line… rsq     standard   0.856     10 0.0221
Run Code Online (Sandbox Code Playgroud)
由 reprex 包于 2022 年 2 月 23 日创建(v2.0.1)