JD *_*ong 36 performance r plyr
我有一个模拟,有一个巨大的聚合,并在中间组合步骤.我使用plyr的ddply()函数对这个过程进行了原型设计,这对我的大部分需求非常有用.但是我需要这个聚合步骤更快,因为我必须运行10K模拟.我已经在并行缩放模拟,但如果这一步更快,我可以大大减少我需要的节点数量.
这是对我要做的事情的合理简化:
library(Hmisc)
# Set up some example data
year <- sample(1970:2008, 1e6, rep=T)
state <- sample(1:50, 1e6, rep=T)
group1 <- sample(1:6, 1e6, rep=T)
group2 <- sample(1:3, 1e6, rep=T)
myFact <- rnorm(100, 15, 1e6)
weights <- rnorm(1e6)
myDF <- data.frame(year, state, group1, group2, myFact, weights)
# this is the step I want to make faster
system.time(aggregateDF <- ddply(myDF, c("year", "state", "group1", "group2"),
function(df) wtd.mean(df$myFact, weights=df$weights)
)
)
Run Code Online (Sandbox Code Playgroud)
所有提示或建议表示赞赏!
had*_*ley 37
您可以使用不可变数据框而不是正常的R数据框,该数据框在您进行子集时返回指向原始数据的指针,并且可以更快:
idf <- idata.frame(myDF)
system.time(aggregateDF <- ddply(idf, c("year", "state", "group1", "group2"),
function(df) wtd.mean(df$myFact, weights=df$weights)))
# user system elapsed
# 18.032 0.416 19.250
Run Code Online (Sandbox Code Playgroud)
如果我要编写一个完全针对这种情况定制的plyr函数,我会做这样的事情:
system.time({
ids <- id(myDF[c("year", "state", "group1", "group2")], drop = TRUE)
data <- as.matrix(myDF[c("myFact", "weights")])
indices <- plyr:::split_indices(seq_len(nrow(data)), ids, n = attr(ids, "n"))
fun <- function(rows) {
weighted.mean(data[rows, 1], data[rows, 2])
}
values <- vapply(indices, fun, numeric(1))
labels <- myDF[match(seq_len(attr(ids, "n")), ids),
c("year", "state", "group1", "group2")]
aggregateDF <- cbind(labels, values)
})
# user system elapsed
# 2.04 0.29 2.33
Run Code Online (Sandbox Code Playgroud)
It's so much faster because it avoids copying the data, only extracting the subset needed for each computation when it's computed. Switching the data to matrix form gives another speed boost because matrix subsetting is much faster than data frame subsetting.
dat*_*urf 25
进一步的2倍加速和更简洁的代码:
library(data.table)
dtb <- data.table(myDF, key="year,state,group1,group2")
system.time(
res <- dtb[, weighted.mean(myFact, weights), by=list(year, state, group1, group2)]
)
# user system elapsed
# 0.950 0.050 1.007
Run Code Online (Sandbox Code Playgroud)
我的第一篇文章,所以请你好;)
从data.tablev1.9.2的,setDT功能导出的转换会data.frame以data.table 供参考(与保留data.table的说法-所有set*功能通过引用修改对象).这意味着,没有不必要的复制,因此很快.你可以计时,但它会疏忽.
require(data.table)
system.time({
setDT(myDF)
res <- myDF[, weighted.mean(myFact, weights),
by=list(year, state, group1, group2)]
})
# user system elapsed
# 0.970 0.024 1.015
Run Code Online (Sandbox Code Playgroud)
这与data.table(.)使用上述OP解决方案的1.264秒相反,用于创建dtb.
我会用基础R来描述
g <- with(myDF, paste(year, state, group1, group2))
x <- with(myDF, c(tapply(weights * myFact, g, sum) / tapply(weights, g, sum)))
aggregateDF <- myDF[match(names(x), g), c("year", "state", "group1", "group2")]
aggregateDF$V1 <- x
Run Code Online (Sandbox Code Playgroud)
在我的机器上,与原始代码的67秒相比需要5秒.
编辑
刚刚发现另一个加速rowsum功能:
g <- with(myDF, paste(year, state, group1, group2))
X <- with(myDF, rowsum(data.frame(a=weights*myFact, b=weights), g))
x <- X$a/X$b
aggregateDF2 <- myDF[match(rownames(X), g), c("year", "state", "group1", "group2")]
aggregateDF2$V1 <- x
Run Code Online (Sandbox Code Playgroud)
需要3秒!
您使用的是最新版本的plyr(请注意:尚未将其用于所有CRAN镜像)?如果是这样,你可以并行运行它.
这是llply示例,但同样适用于ddply:
x <- seq_len(20)
wait <- function(i) Sys.sleep(0.1)
system.time(llply(x, wait))
# user system elapsed
# 0.007 0.005 2.005
library(doMC)
registerDoMC(2)
system.time(llply(x, wait, .parallel = TRUE))
# user system elapsed
# 0.020 0.011 1.038
Run Code Online (Sandbox Code Playgroud)
编辑:
好吧,其他循环方法更糟糕,所以这可能需要(a)C/C++代码或(b)更根本地重新思考你是如何做的.我甚至没有尝试使用,by()因为我的经验非常慢.
groups <- unique(myDF[,c("year", "state", "group1", "group2")])
system.time(
aggregateDF <- do.call("rbind", lapply(1:nrow(groups), function(i) {
df.tmp <- myDF[myDF$year==groups[i,"year"] & myDF$state==groups[i,"state"] & myDF$group1==groups[i,"group1"] & myDF$group2==groups[i,"group2"],]
cbind(groups[i,], wtd.mean(df.tmp$myFact, weights=df.tmp$weights))
}))
)
aggregateDF <- data.frame()
system.time(
for(i in 1:nrow(groups)) {
df.tmp <- myDF[myDF$year==groups[i,"year"] & myDF$state==groups[i,"state"] & myDF$group1==groups[i,"group1"] & myDF$group2==groups[i,"group2"],]
aggregateDF <- rbind(aggregateDF, data.frame(cbind(groups[i,], wtd.mean(df.tmp$myFact, weights=df.tmp$weights))))
}
)
Run Code Online (Sandbox Code Playgroud)
当应用的函数有多个向量args时,我通常使用带tapply的索引向量:
system.time(tapply(1:nrow(myDF), myDF[c('year', 'state', 'group1', 'group2')], function(s) weighted.mean(myDF$myFact[s], myDF$weights[s])))
# user system elapsed
# 1.36 0.08 1.44
Run Code Online (Sandbox Code Playgroud)
我使用一个简单的包装器,它相当于隐藏了一切:
tmapply(list(myDF$myFact, myDF$weights), myDF[c('year', 'state', 'group1', 'group2')], weighted.mean)
Run Code Online (Sandbox Code Playgroud)
编辑包括tmapply评论如下:
tmapply = function(XS, INDEX, FUN, ..., simplify=T) {
FUN = match.fun(FUN)
if (!is.list(XS))
XS = list(XS)
tapply(1:length(XS[[1L]]), INDEX, function(s, ...)
do.call(FUN, c(lapply(XS, `[`, s), list(...))), ..., simplify=simplify)
}
Run Code Online (Sandbox Code Playgroud)