1 parallel-processing for-loop r sapply
我对尝试在 R 中运行并行进程还很陌生,因为我使用的大部分数据都不是那么大。但是,我现在正在处理一个更大的数据集,我试图从 9000 条调查评论中“查找和替换”一组大约 2000 个名称。我已经使用 gsub 创建了一个 for 循环来完成工作,但这需要很长时间:
# Replace every name, one name at a time, in all survey comments and
# print a progress message after each name.
completed <- 0
for (name in names) {
  # Case-insensitive match of the name only where it is delimited by
  # non-word characters or by the start/end of the comment.
  name_pattern <- paste0("(?<=\\W|^)", name, "(?=\\W|$)")
  text_df$text <- sapply(
    text_df$text,
    function(comment) {
      gsub(name_pattern, "RemovedLeader", comment,
           ignore.case = TRUE, perl = TRUE)
    }
  )
  completed <- completed + 1
  print(paste0("Completed ", completed, " out of ", length(names)))
}
Run Code Online (Sandbox Code Playgroud)
据我了解,这应该是一个相当简单的并行运行过程,但我遇到了一些麻烦。我已经尝试使用 parSapply 运行它,但是我很难重新编写 gsub(它本身目前在 for 循环中的一个 sapply 中)以在 for 循环之外工作。谢谢您的帮助。
这还不是需要并行化的任务,因为您必须一次又一次地重复应用名称替换。
apply-Family 函数使用列表,列表在 R 中非常慢。
通过使用矢量化避免列表。
在这种特殊情况下,我建议:把所有繁重的工作都交给正则表达式引擎来提高速度。例如,将 200 个名称放入一个正则表达式搜索模式中。
names <- c("a", "b", "c", "d", "e")
texts <- c("this is a thing", "a b and an a", "this is a c cat",
           "B c d acd", "e A e ead")
# Escape regex metacharacters so that every name is matched literally
# (e.g. the "." in "J. Smith" must not match an arbitrary character).
escaped_names <- gsub("([\\\\.|()\\[\\]{}^$*+?])", "\\\\\\1", names,
                      perl = TRUE)
# One regex pattern to replace all names at once: the alternation of all
# names is wrapped in look-arounds so that a name only matches when it is
# delimited by non-word characters or by the start/end of the string.
pattern <- paste0("(?<=\\W|^)(", paste(escaped_names, collapse = "|"),
                  ")(?=\\W|$)")
# Use the regex engine's speed: a single vectorised call over all texts.
gsub(pattern = pattern, replacement = "RemovedLeader", x = texts,
     ignore.case = TRUE, perl = TRUE)
# [1] "this is RemovedLeader thing"
# [2] "RemovedLeader RemovedLeader and an RemovedLeader"
# [3] "this is RemovedLeader RemovedLeader cat"
# [4] "RemovedLeader RemovedLeader RemovedLeader acd"
# [5] "RemovedLeader RemovedLeader RemovedLeader ead"
Run Code Online (Sandbox Code Playgroud)
使用parallel包并行化
只是为了向您展示如何通过并行化解决这个问题:
names <- c("a", "b", "c", "d", "e")
# Same sample texts as in the single-regex example; they include tokens such
# as "cat", "acd" and "ead" that contain a name but must stay untouched.
texts <- c("this is a thing", "a b and an a", "this is a c cat",
           "B c d acd", "e A e ead")
# Before parallelization, think about over which sequence
# you can parallelize this procedure - which components in a sequence
# are independent?
# In this case the single strings of the texts are independent of each other,
# so write a function that fully processes one string as desired.
# Replace, one name after the other, every occurrence of each name in
# `string` by "RemovedLeader".
#   names  - the names to remove; each one is inserted into a regex pattern
#   string - the text piece(s) to clean
# A name only matches (case-insensitively) when it is delimited by non-word
# characters or by the start/end of the string. Returns the cleaned `string`.
replace_names <- function(names, string) {
  redact_one <- function(text, name) {
    gsub(paste0("(?<=\\W|^)", name, "(?=\\W|$)"), "RemovedLeader", text,
         ignore.case = TRUE, perl = TRUE)
  }
  # Fold over the names, feeding the result of one replacement into the next.
  Reduce(redact_one, names, string)
}
# Let us then apply this single-string replacement over the entire texts
# vector - for the survey data this would be `texts <- text_df$text`.
# 0. load the `parallel` package (it ships with R); library() stops with an
#    error if the package is missing, whereas require() only returns FALSE
library(parallel)
# 1. prepare a cluster: leave one core free, but never request fewer than one
#    worker (detectCores() can return 1 or NA)
n_workers <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
cl <- parallel::makeCluster(n_workers)
# 2. export the variables and functions the workers look up by name;
#    `varlist` is a character vector of object names. `texts` itself is
#    shipped to the workers by parSapply(), so it needs no export.
parallel::clusterExport(cl = cl, varlist = c("replace_names", "names"))
# 3. run and collect the results. Tip: type `parallel::` in the console and
#    press TAB to see all available functions; `?<functionname>` shows the
#    details of a function.
new_texts <- parallel::parSapply(
  cl, texts, function(txt) replace_names(names, txt)
)
# 4. don't forget to stop the cluster to free its resources
parallel::stopCluster(cl = cl)
Run Code Online (Sandbox Code Playgroud)
输出:
this is a thing
"this is RemovedLeader thing"
a b and an a
"RemovedLeader RemovedLeader and an RemovedLeader"
this is a c cat
"this is RemovedLeader RemovedLeader cat"
B c d acd
"RemovedLeader RemovedLeader RemovedLeader acd"
e A e ead
"RemovedLeader RemovedLeader RemovedLeader ead"
Run Code Online (Sandbox Code Playgroud)
因此,您基本上可以对文本向量中的单个字符串进行并行化。
结合使用 foreach、doParallel 和 parallel 包进行并行化(不需要导出变量和函数,更适合自动化)
# Sample data: the names to remove and the texts to clean.
names <- c("a", "b", "c", "d", "e")
texts <- c("this is a thing", "a b and an a", "this is a c", "B c d", "e A e")
# Replace every occurrence of each name in `string` by "RemovedLeader".
# Matching ignores case and requires the name to be delimited by non-word
# characters or by the start/end of the string.
replace_names <- function(names, string) {
  # Build one pattern per name up front ...
  name_patterns <- lapply(
    names,
    function(name) paste0("(?<=\\W|^)", name, "(?=\\W|$)")
  )
  # ... then apply them one after the other to the (partly cleaned) text.
  for (name_pattern in name_patterns) {
    string <- gsub(name_pattern, "RemovedLeader", string,
                   ignore.case = TRUE, perl = TRUE)
  }
  string
}
# 0. load the packages; library() stops with an error when a package is
#    missing, whereas require() would only return FALSE and fail later
library(parallel)
library(doParallel)
library(foreach)
# 1. prepare a cluster and register it for doParallel; leave one core free
#    but never request fewer than one worker (detectCores() can return 1 or NA)
n_workers <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
cl <- parallel::makeCluster(n_workers)
doParallel::registerDoParallel(cl) # no need for exporting variables!
# 2. run foreach and %dopar% - without `.combine` the result is a list
new_texts <- foreach::foreach(txt = texts) %dopar%
  replace_names(names, txt)
# a rule for combining the results can be given -> instead of a list,
# bind the results with `c` into a vector:
new_texts_vec <- foreach::foreach(txt = texts, .combine = c) %dopar%
  replace_names(names, txt)
# 3. don't forget to stop the cluster to free its resources
parallel::stopCluster(cl = cl)
Run Code Online (Sandbox Code Playgroud)
使用foreach和doParallel包对并行化进行抽象
####################
# define papply
####################
# Apply a one-argument function to every element of a sequence in parallel.
#
# Arguments:
#   sequential    - sequence whose elements are passed, one per task, to
#                   `monadic_func`
#   monadic_func  - function called with a single element as its only argument
#   exclude_cores - number of detected cores to leave unused; only used when
#                   `cores` is NULL
#   cores         - explicit number of workers; NULL means
#                   detectCores() - exclude_cores (but at least 1)
#   ...           - forwarded to foreach::foreach(), e.g. `.combine`, `.export`
# Returns the value of the foreach loop: a list by default, or whatever the
# supplied `.combine` builds.
papply <- function(sequential,
                   monadic_func,
                   exclude_cores = 1,
                   cores = NULL,
                   ...) {
  # derive the worker count; never fewer than one worker, even when
  # detectCores() returns NA or no more than `exclude_cores`
  if (is.null(cores)) {
    cores <- max(1L, parallel::detectCores() - exclude_cores, na.rm = TRUE)
  }
  # prepare cluster
  cl <- parallel::makeCluster(cores)
  # always release the workers - also when the job fails - and put the
  # sequential backend back so that no stopped cluster stays registered
  on.exit({
    parallel::stopCluster(cl = cl)
    foreach::registerDoSEQ()
  }, add = TRUE)
  # register
  doParallel::registerDoParallel(cl)
  # run job; calling `%dopar%` in prefix form avoids attaching foreach
  foreach::`%dopar%`(
    foreach::foreach(x = sequential, ...),
    do.call(monadic_func, list(x))
  )
}
Run Code Online (Sandbox Code Playgroud)
###################
# define p_star_apply
###################
# Call a multi-argument function in parallel, once per argument list.
#
# Arguments:
#   list_of_args_list - list whose elements are argument lists; each element
#                       is handed to do.call() for one task
#   multiadic_func    - function called with the arguments of one element
#   exclude_cores     - number of detected cores to leave unused; only used
#                       when `cores` is NULL
#   cores             - explicit number of workers; NULL means
#                       detectCores() - exclude_cores (but at least 1)
#   ...               - forwarded to foreach::foreach(), e.g. `.combine`
# Returns the value of the foreach loop: a list by default, or whatever the
# supplied `.combine` builds.
p_star_apply <- function(list_of_args_list,
                         multiadic_func,
                         exclude_cores = 1,
                         cores = NULL,
                         ...) {
  # derive the worker count; never fewer than one worker, even when
  # detectCores() returns NA or no more than `exclude_cores`
  if (is.null(cores)) {
    cores <- max(1L, parallel::detectCores() - exclude_cores, na.rm = TRUE)
  }
  # prepare cluster
  cl <- parallel::makeCluster(cores)
  # always release the workers - also when the job fails - and put the
  # sequential backend back so that no stopped cluster stays registered
  on.exit({
    parallel::stopCluster(cl = cl)
    foreach::registerDoSEQ()
  }, add = TRUE)
  # register
  doParallel::registerDoParallel(cl)
  # run job; calling `%dopar%` in prefix form avoids attaching foreach
  foreach::`%dopar%`(
    foreach::foreach(argsl = list_of_args_list, ...),
    do.call(multiadic_func, argsl)
  )
}
Run Code Online (Sandbox Code Playgroud)
你可以这样使用它:
###################
# usage papply
# - parallelizes a monadic (one-argument) function
# - the arguments can be any type of sequence
# - choose with .combine=`list` or .combine=`c` whether the output is
#   bound to a list or to a vector
###################
# Monadic function (takes exactly one required argument): replaces every name
# in `string` by "RemovedLeader". `.names` defaults to the variable `names`
# as seen from this function's environment.
replace_names <- function(string, .names = names) {
  for (idx in seq_along(.names)) {
    # case-insensitive match of the name, delimited by non-word characters
    # or by the start/end of the string
    name_pattern <- paste0("(?<=\\W|^)", .names[[idx]], "(?=\\W|$)")
    string <- gsub(name_pattern, "RemovedLeader", string,
                   ignore.case = TRUE, perl = TRUE)
  }
  string
}
# Call papply: `sequential` receives the sequence of arguments and
# `monadic_func` the matching one-argument function.
papply(
  sequential = texts,
  monadic_func = replace_names,
  # needed because replace_names' default argument refers to `names`
  .export = "names",
  # bind the results into a vector
  .combine = c
)
# the arguments accepted through `...` are described in ?foreach::foreach
Run Code Online (Sandbox Code Playgroud)
还有 p_star_apply,它是一种更通用的形式:可以把多个参数依次传给一个函数,从而提供了很大的自由度。只需将按正确顺序排列的参数打包成一个由参数列表组成的列表即可。
###################
# usage p_star_apply
# - parallelizes multiadic (multi-argument) functions
# - the arguments must be given as a list of argument lists, named or
#   unnamed (if unnamed, the argument order must be correct)
# - choose with .combine=`list` or .combine=`c` whether the output is
#   bound to a list or to a vector
###################
# Multiadic function, here dyadic (takes two required arguments): replaces
# every name of `.names` found in `string` by "RemovedLeader".
dyadic_replace_names <- function(string, .names) {
  n_names <- length(.names)
  pos <- 0L
  while (pos < n_names) {
    pos <- pos + 1L
    # case-insensitive match of the name, delimited by non-word characters
    # or by the start/end of the string
    string <- gsub(
      pattern = paste0("(?<=\\W|^)", .names[[pos]], "(?=\\W|$)"),
      replacement = "RemovedLeader",
      x = string,
      ignore.case = TRUE,
      perl = TRUE
    )
  }
  string
}
# Build the list of argument lists: one unnamed list(<text>, <names>) per
# text, in the positional order of dyadic_replace_names' parameters.
argsl_lists <- lapply(texts, function(txt) list(txt, names))
# without `.combine` the results are returned as a list
p_star_apply(
  list_of_args_list = argsl_lists,
  multiadic_func = dyadic_replace_names
)
# with `.combine = c` the results are returned as a vector; this shows that
# the `...` argument forwards arguments to the `foreach()` part
p_star_apply(
  list_of_args_list = argsl_lists,
  multiadic_func = dyadic_replace_names,
  .combine = c
)
Run Code Online (Sandbox Code Playgroud)