Phi*_*hil 6 performance r dplyr
我在dplyr中总结一个数据框时,试图在一个组中找到几个因子变量中最常见的值.我需要一个执行以下操作的公式:
有几个公式可行.但是,我能想到的那些都很慢.快速的那些不方便一次应用于数据帧中的几个变量.我想知道是否有人知道一种与dplyr很好地集成的快速方法.
我尝试了以下方法:
生成样本数据(50000组,100个随机字母)
z <- data.frame(a = rep(1:50000,100), b = sample(LETTERS, 5000000, replace = TRUE))
str(z)
'data.frame': 5000000 obs. of 2 variables:
$ a: int 1 2 3 4 5 6 7 8 9 10 ...
$ b: Factor w/ 26 levels "A","B","C","D",..: 6 4 14 12 3 19 17 19 15 20 ...
Run Code Online (Sandbox Code Playgroud)
"清洁" - 但缓慢的方法1
y <- z %>%
group_by(a) %>%
summarise(c = names(table(b))[which.max(table(b))])
user system elapsed
26.772 2.011 29.568
Run Code Online (Sandbox Code Playgroud)
"清洁" - 但缓慢的方法2
y <- z %>%
group_by(a) %>%
summarise(c = names(which(table(b) == max(table(b)))[1]))
user system elapsed
29.329 2.029 32.361
Run Code Online (Sandbox Code Playgroud)
"清洁" - 但缓慢的方法3
y <- z %>%
group_by(a) %>%
summarise(c = names(sort(table(b),decreasing = TRUE)[1]))
user system elapsed
35.086 6.905 42.485
Run Code Online (Sandbox Code Playgroud)
"凌乱"但快速的方法
y <- z %>%
group_by(a,b) %>%
summarise(counter = n()) %>%
group_by(a) %>%
filter(counter == max(counter))
y <- y[!duplicated(y$a),]
y <- y$counter <- NULL
user system elapsed
7.061 0.330 7.664
Run Code Online (Sandbox Code Playgroud)
这是另一种选择dplyr:
set.seed(123)
z <- data.frame(a = rep(1:50000,100),
b = sample(LETTERS, 5000000, replace = TRUE),
stringsAsFactors = FALSE)
a <- z %>% group_by(a, b) %>% summarise(c=n()) %>% filter(row_number(desc(c))==1) %>% .$b
b <- z %>% group_by(a) %>% summarise(c=names(which(table(b) == max(table(b)))[1])) %>% .$c
Run Code Online (Sandbox Code Playgroud)
我们确保这些是等效的方法:
> identical(a, b)
#[1] TRUE
Run Code Online (Sandbox Code Playgroud)
更新
正如@docendodiscimus所提到的,你也可以这样做:
count(z, a, b) %>% slice(which.max(n))
Run Code Online (Sandbox Code Playgroud)
以下是基准测试的结果:
library(microbenchmark)
mbm <- microbenchmark(
steven = z %>% group_by(a, b) %>% summarise(c = n()) %>% filter(row_number(desc(c))==1),
phil = z %>% group_by(a) %>% summarise(c = names(which(table(b) == max(table(b)))[1])),
docendo = count(z, a, b) %>% slice(which.max(n)),
times = 10
)
Run Code Online (Sandbox Code Playgroud)
#Unit: seconds
# expr min lq mean median uq max neval cld
# steven 4.752168 4.789564 4.815986 4.813686 4.847964 4.875109 10 b
# phil 15.356051 15.378914 15.467534 15.458844 15.533385 15.606690 10 c
# docendo 4.586096 4.611401 4.669375 4.688420 4.702352 4.753583 10 a
Run Code Online (Sandbox Code Playgroud)
为什么选择dplyr?
#dummy data
set.seed(123)
z <- data.frame(a = rep(1:50000,100),
b = sample(LETTERS, 5000000, replace = TRUE))
#result
names(sort(table(z$b),decreasing = TRUE)[1])
# [1] "S"
#time it
system.time(
names(sort(table(z$b),decreasing = TRUE)[1])
)
# user system elapsed
# 0.36 0.00 0.36
Run Code Online (Sandbox Code Playgroud)
编辑:多列
#dummy data
set.seed(123)
z <- data.frame(a = rep(1:50000,100),
b = sample(LETTERS, 5000000, replace = TRUE),
c = sample(LETTERS, 5000000, replace = TRUE),
d = sample(LETTERS, 5000000, replace = TRUE))
# check for multiple columns
sapply(c("b","c","d"), function(i)
names(sort(table(z[,i]),decreasing = TRUE)[1])
)
# b c d
#"S" "N" "G"
#time it
system.time(
sapply(c("b","c","d"), function(i)
names(sort(table(z[,i]),decreasing = TRUE)[1]))
)
# user system elapsed
# 0.61 0.17 0.78
Run Code Online (Sandbox Code Playgroud)
data.table 仍然是最快的选择:
z <- data.frame(a = rep(1:50000,100), b = sample(LETTERS, 5000000, replace = TRUE))
Run Code Online (Sandbox Code Playgroud)
标杆:
library(data.table)
library(dplyr)
#dplyr
system.time({
y <- z %>%
group_by(a) %>%
summarise(c = names(which(table(b) == max(table(b)))[1]))
})
user system elapsed
14.52 0.01 14.70
#data.table
system.time(
setDT(z)[, .N, by=b][order(N),][.N,]
)
user system elapsed
0.05 0.02 0.06
#@zx8754 's way - base R
system.time(
names(sort(table(z$b),decreasing = TRUE)[1])
)
user system elapsed
0.73 0.06 0.81
Run Code Online (Sandbox Code Playgroud)
正如使用data.table可以看到的那样:
setDT(z)[, .N, by=b][order(N),][.N,]
Run Code Online (Sandbox Code Playgroud)
要么
#just to get the name
setDT(z)[, .N, by=b][order(N),][.N, b]
Run Code Online (Sandbox Code Playgroud)
似乎是最快的
所有列的更新:
使用@ zx8754的数据
set.seed(123)
z2 <- data.frame(a = rep(1:50000,100),
b = sample(LETTERS, 5000000, replace = TRUE),
c = sample(LETTERS, 5000000, replace = TRUE),
d = sample(LETTERS, 5000000, replace = TRUE))
Run Code Online (Sandbox Code Playgroud)
你可以这样做:
#with data.table
system.time(
sapply(c('b','c','d'), function(x) {
data.table(x = z2[[x]])[, .N, by=x][order(N),][.N, x]
}))
user system elapsed
0.34 0.00 0.34
#with base-R
system.time(
sapply(c("b","c","d"), function(i)
names(sort(table(z2[,i]),decreasing = TRUE)[1]))
)
user system elapsed
4.14 0.11 4.26
Run Code Online (Sandbox Code Playgroud)
只是为了确认结果是一样的:
sapply(c('b','c','d'), function(x) {
data.table(x = z2[[x]])[, .N, by=x][order(N),][.N, x]
})
b c d
S N G
sapply(c("b","c","d"), function(i)
names(sort(table(z2[,i]),decreasing = TRUE)[1]))
b c d
"S" "N" "G"
Run Code Online (Sandbox Code Playgroud)