I am using the R programming language. I am trying to fit a random forest model to a very large dataset (over 100 million rows) with imbalanced classes (i.e., the binary response variable has roughly a 95% to 5% ratio). To work around the imbalance, my code repeatedly draws "balanced" random subsets of the training data, trains a model on each subset, and then combines the models' predictions. Below, I include the R code for each of these steps.
Step 1: Create the data for the problem
# Step 1: Randomly create data and make initial training/test set:
library(dplyr)
library(ranger)
original_data = rbind(
  data_1 = data.frame(class = 1, height = rnorm(10000, 180, 10), weight = rnorm(10000, 90, 10), salary = rnorm(10000, 50000, 10000)),
  data_2 = data.frame(class = 0, height = rnorm(100, 160, 10), weight = rnorm(100, 100, 10), salary = rnorm(100, 40000, 10000))
)
original_data$class = as.factor(original_data$class)
original_data$id = 1:nrow(original_data)
test_set = rbind(
  original_data[sample(which(original_data$class == "0"), replace = FALSE, 30), ],
  original_data[sample(which(original_data$class == "1"), replace = FALSE, 2000), ]
)
train_set = anti_join(original_data, test_set, by = "id")
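As a quick sanity check (illustrative only, not part of the original steps), the class imbalance in the simulated training set can be inspected before building the balanced subsets:
# Inspect the class imbalance in the training set
table(train_set$class)
prop.table(table(train_set$class))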
Step 2: Create "balanced" random subsets
# Step 2: Create "Balanced" Random Subsets:
results <- list()
for (i in 1:100) {
  iteration_i = i
  # Draw a "balanced" bootstrap sample: 50 rows of class "0" and 60 rows of class "1"
  sample_i = rbind(
    train_set[sample(which(train_set$class == "0"), replace = TRUE, 50), ],
    train_set[sample(which(train_set$class == "1"), replace = TRUE, 60), ]
  )
  results_tmp = data.frame(iteration_i, sample_i)
  results_tmp$iteration_i = as.factor(results_tmp$iteration_i)
  results[[i]] <- results_tmp
}
results_df <- do.call(rbind.data.frame, results)
# Split the stacked subsets back into a list, one element per iteration
X <- split(results_df, results_df$iteration_i)
# Also assign each subset to its own object (train_set_1, ..., train_set_100)
invisible(lapply(seq_along(results),
                 function(i, x) {assign(paste0("train_set_", i), x[[i]], envir = .GlobalEnv)},
                 x = results))
Step 3: Train a model on each subset
# Step 3: Train Models on Each Subset:
# training
wd = getwd()
results_1 <- list()
for (i in 1:100) {
  # Fit a probability forest on the i-th balanced subset
  model_i <- ranger(class ~ height + weight + salary, data = X[[i]], probability = TRUE)
  # Save each fitted model into the working directory
  saveRDS(model_i, file.path(wd, paste0("model_", i, ".RDS")))
  results_1[[i]] <- model_i
}
Step 4: Combine all models and use the combined model to make predictions on the test set
# Step 4: Combine All Models and Use Combined Model to Make Predictions on the Test Set:
results_2 <- list()
for (i in 1:100) {
  # Predicted class probabilities from the i-th model for every test-set row
  predict_i <- data.frame(predict(results_1[[i]], data = test_set)$predictions)
  predict_i$id = 1:nrow(predict_i)
  results_2[[i]] <- predict_i
}
# Average the predicted probabilities across all 100 models, by test-set row
final_predictions = aggregate(. ~ id, do.call(rbind, results_2), mean)
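Incidentally, the aggregate() call above just computes the element-wise average of the 100 probability matrices for each test-set row. A minimal equivalent sketch (assuming results_1 holds the fitted ranger models as above; avg_probs and final_predictions_alt are only illustrative names) that avoids stacking 100 data frames:
# Average the per-model probability matrices element-wise, then attach the row id
# (avg_probs / final_predictions_alt are illustrative names)
avg_probs <- Reduce(`+`, lapply(results_1, function(m) predict(m, data = test_set)$predictions)) / length(results_1)
final_predictions_alt <- data.frame(id = 1:nrow(test_set), avg_probs)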
My question: I would like to see whether I can incorporate "parallel computing" into Step 2, Step 3, and Step 4 so that the code I have written can potentially run faster. I have looked at other posts (e.g., /sf/ask/987420731/, https://stats.stackexchange.com/questions/519640/parallelizing-random-forest-learning-in-r-changes-the-class-of-the-rf-object), and I would like to see whether I can reformat the code I have written and incorporate similar "parallel computing" functions to improve it:
library(parallel)
library(doParallel)
library(foreach)
#Try to parallelize
cl <- makeCluster(detectCores()-1)
registerDoParallel(cl)
# Insert Reformatted Step 2 - Step 4 Here:
stopImplicitCluster()
stopCluster(cl)
rm(cl)
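A sketch of how the "Insert Reformatted Step 2 - Step 4 Here" placeholder could be filled in, assuming train_set and test_set exist as created in Step 1: each iteration's work (balanced sampling, ranger training, test-set prediction) is folded into one foreach loop run with %dopar%, and each worker returns only its prediction matrix so that little data is shipped back to the main session. The object name per_model_preds is an illustrative choice, not part of the original code:
library(parallel)
library(doParallel)
library(foreach)
library(ranger)

cl <- makeCluster(detectCores() - 1)
registerDoParallel(cl)

# Steps 2-4 in one parallel loop: each worker draws a balanced bootstrap
# sample, trains a ranger probability forest on it, and returns that model's
# predicted class probabilities for the test set.
per_model_preds <- foreach(i = 1:100, .packages = "ranger") %dopar% {
  sample_i <- rbind(
    train_set[sample(which(train_set$class == "0"), replace = TRUE, 50), ],
    train_set[sample(which(train_set$class == "1"), replace = TRUE, 60), ]
  )
  model_i <- ranger(class ~ height + weight + salary,
                    data = sample_i, probability = TRUE)
  # Optionally persist each model from inside the worker, e.g.:
  # saveRDS(model_i, file.path(getwd(), paste0("model_", i, ".RDS")))
  predict(model_i, data = test_set)$predictions
}

# Step 4: the ensemble prediction is the element-wise average of the
# per-model probability matrices.
final_predictions <- data.frame(
  id = 1:nrow(test_set),
  Reduce(`+`, per_model_preds) / length(per_model_preds)
)

stopCluster(cl)
If the fitted models themselves are needed later, the loop body can instead return something like list(model = model_i, preds = predict(model_i, data = test_set)$predictions), at the cost of sending all 100 models back to the main session.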
However, I am still fairly new to the world of parallel computing and am still trying to figure out how to reformat my code so that it will work.
Can someone please show me how to do this?
Notes:
In the questions I consulted earlier (e.g., Parallel Execution of Random Forest in R, https://stats.stackexchange.com/questions/519640/parallelizing-random-forest-learning-in-r-changes-the-class-of-the-rf-object), the "randomForest" package is used instead of "ranger". I am also willing to use the "randomForest" package if that would make parallelization easier (see the sketch after these notes).
I acknowledge that the overall structure of my code may not be written in the best way; I am open to suggestions for rewriting my code if that would make it easier to parallelize.
I realize that there are several popular packages in R for parallelizing code (e.g., https://cran.r-project.org/web/packages/doSNOW/index.html), and I am also willing to use any of these packages to parallelize my code.
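For reference, the pattern used in the posts linked above can be sketched with the randomForest package as follows: grow the trees in chunks across workers and merge the chunks into one forest through foreach's .combine argument and randomForest::combine. This sketch assumes one of the balanced subsets from Step 2 (train_set_1), and the chunking (10 chunks of 50 trees) is an arbitrary illustrative choice:
library(randomForest)
library(doParallel)
library(foreach)

cl <- makeCluster(detectCores() - 1)
registerDoParallel(cl)

# Grow 500 trees as 10 chunks of 50 trees each, in parallel, and merge the
# chunks into a single forest with randomForest::combine.
rf_combined <- foreach(ntree = rep(50, 10),
                       .combine = randomForest::combine,
                       .packages = "randomForest") %dopar% {
  randomForest(class ~ height + weight + salary,
               data = train_set_1,  # one balanced subset from Step 2
               ntree = ntree)
}

stopCluster(cl)
Note that combine() sets some components of the merged forest to NULL (for example err.rate and confusion), so the result is not identical to a forest grown in a single call. Swapping the backend to doSNOW (makeCluster() followed by registerDoSNOW()) would leave the foreach code itself unchanged.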
Noting your openness to a tidymodels approach, you could try the following, using your original_data and including parallel processing:
library(tidyverse)
library(tidymodels)
library(vip)
library(doParallel)
library(tictoc)
library(themis)

registerDoParallel(cores = 6)

# Supplied data
set.seed(2022)

original_data <- rbind(
  data_1 = data.frame(
    class = 1,
    height = rnorm(10000, 180, 10),
    weight = rnorm(10000, 90, 10),
    salary = rnorm(10000, 50000, 10000)
  ),
  data_2 = data.frame(
    class = 0,
    height = rnorm(100, 160, 10),
    weight = rnorm(100, 100, 10),
    salary = rnorm(100, 40000, 10000)
  )
)

original_data$class <- as.factor(original_data$class)
original_data$id <- 1:nrow(original_data)

tic()

# Train / test data
set.seed(2022)

data_split <-
  original_data |>
  initial_split(strata = class) # stratify by class

train_df <- data_split |> training()
test_df <- data_split |> testing()

# Create a pre-processing recipe
class_recipe <-
  train_df |>
  recipe() |>
  update_role(class, new_role = "outcome") |>
  update_role(id, new_role = "id") |>
  update_role(-has_role("outcome"), -has_role("id"), new_role = "predictor") |>
  step_rose(class)

# Check class balance
class_recipe |> prep() |> bake(new_data = NULL) |> count(class)
#> # A tibble: 2 × 2
#>   class     n
#>   <fct> <int>
#> 1 0      7407
#> 2 1      7589

summary(class_recipe)
#> # A tibble: 5 × 4
#>   variable type    role      source
#>   <chr>    <chr>   <chr>     <chr>
#> 1 class    nominal outcome   original
#> 2 height   numeric predictor original
#> 3 weight   numeric predictor original
#> 4 salary   numeric predictor original
#> 5 id       numeric id        original

# Create model & workflow
ranger_model <-
  rand_forest(mtry = tune()) |>
  set_engine("ranger", importance = "impurity") |>
  set_mode("classification")

ranger_wflow <- workflow() |>
  add_recipe(class_recipe) |>
  add_model(ranger_model)

# Tune model with 10-fold Cross Validation
set.seed(2022)

folds <- vfold_cv(train_df, v = 10)

set.seed(2022)

ranger_res <- ranger_wflow |>
  tune_grid(
    resamples = folds,
    grid = crossing(
      mtry = seq(1, 3, 1),
    ),
    control = control_grid(verbose = TRUE),
    metrics = metric_set(accuracy) # choose a metric, e.g. accuracy
  )

# Fit model
best_tune <- ranger_res |> select_best()

set.seed(2022)

ranger_fit <- ranger_wflow |>
  finalize_workflow(best_tune) %>%
  fit(train_df)

# Test
class_results <- ranger_fit |> augment(new_data = test_df)

class_results |> accuracy(class, .pred_class)
#> # A tibble: 1 × 3
#>   .metric  .estimator .estimate
#>   <chr>    <chr>          <dbl>
#> 1 accuracy binary         0.912

# Visualise feature importance
ranger_fit |>
  extract_fit_parsnip() |>
  vip() +
  labs(title = "Feature Importance -- Ranger")
toc()
#> 62.393 sec elapsed
Created on 2022-06-21 by the reprex package (v2.0.1)