drm*_*iod 5 performance r dplyr
我试图p.values
用长数据格式的非常庞大的数据框中的学生t检验来计算.由于我的原始数据框有关于数据框内的行,因此p.values的计算需要很长时间(大约需要100分钟).
我正在尝试加快这个过程,但我不确定数据框是否是提高速度的最佳格式,或者我是否应该重塑数据并使用a matrix
.
这是一个可重复的例子,最后有一个小数据框和一个基准.
library(dplyr)
my.t.test <- function (x, y = NULL) {
nx <- length(x)
mx <- mean(x)
vx <- var(x)
ny <- length(y)
my <- mean(y)
vy <- var(y)
stderrx <- sqrt(vx/nx)
stderry <- sqrt(vy/ny)
stderr <- sqrt(stderrx^2 + stderry^2)
df <- stderr^4/(stderrx^4/(nx - 1) + stderry^4/(ny - 1))
tstat <- (mx - my - 0)/stderr
pval <- 2 * pt(-abs(tstat), df)
return(pval)
}
cont <- c("A", "B")
set.seed(1)
df1 <- data.frame(id=rep(1:1000, each=8),
replicate=1:4,
A=rnorm(8000, mean=26, sd=5),
B=rnorm(8000, mean=25, sd=7))
completeDF <- function() {
df1 %>%
group_by(id) %>%
summarise(Comparison=paste(cont, collapse=' - '),
p.value=t.test(get(cont[1]), get(cont[2]))$p.value,
log10.p.value=-log10(p.value),
log2.foldchange=mean(get(cont[1]), na.rm=TRUE) - mean(get(cont[2]), na.rm=TRUE)
)}
noPvalue <- function() {
df1 %>%
group_by(id) %>%
summarise(Comparison=paste(cont, collapse=' - '),
log2.foldchange=mean(get(cont[1]), na.rm=TRUE) - mean(get(cont[2]), na.rm=TRUE)
)}
myPvalue <- function() {
df1 %>%
group_by(id) %>%
summarise(Comparison=paste(cont, collapse=' - '),
p.value=my.t.test(get(cont[1]), get(cont[2])),
log10.p.value=-log10(p.value),
log2.foldchange=mean(get(cont[1]), na.rm=TRUE) - mean(get(cont[2]), na.rm=TRUE)
)}
microbenchmark::microbenchmark(
completeDF(), noPvalue(), myPvalue()
)
Run Code Online (Sandbox Code Playgroud)
我的基准:
Unit: milliseconds
expr min lq mean median uq max neval
completeDF() 358.38330 365.09423 424.60255 369.20453 377.40354 655.2009 100
noPvalue() 57.42996 58.89978 81.86222 59.66851 60.96582 337.2346 100
myPvalue() 216.04812 220.98277 318.09568 224.19516 493.74908 609.4516 100
Run Code Online (Sandbox Code Playgroud)
因此,通过我非常简化(无测试等)的t.test功能,我已经节省了一些时间.但我想知道是否可以通过某种方式进行矢量化来进一步改进.
首先,替换mean(x)
为sum(x) / length(x)
,因为mean
速度慢.
然后,当我描述更新后my.t.test
,我发现80%的执行时间花在了var
.所以我var
用Rcpp实现替换.
library(Rcpp)
cppFunction("double var_cpp (NumericVector x, double xc) {
size_t n = (size_t)x.size();
double z1 = 0.0, z2 = 0.0, *p = &x[0], *q = &x[n];
if (n & 2) {z1 = (*p - xc) * (*p - xc); p++;}
for (; p < q; p += 2) {
z1 += (p[0] - xc) * (p[0] - xc);
z2 += (p[1] - xc) * (p[1] - xc);
}
z1 = (z1 + z2) / (double)(n - 1);
return z1;
}")
library(microbenchmark)
x <- runif(1e+7)
xc <- sum(x) / length(x)
microbenchmark(var_cpp(x, xc), var(x))
#Unit: milliseconds
# expr min lq mean median uq max
# var_cpp(x, xc) 20.71985 20.76298 21.00832 20.80576 20.87323 25.85723
# var(x) 109.61120 109.78513 111.92657 109.89077 114.21301 121.98907
Run Code Online (Sandbox Code Playgroud)
sum
也可以提升.
cppFunction("double sum_cpp (NumericVector x) {
size_t n = (size_t)x.size();
double z1 = 0.0, z2 = 0.0, *p = &x[0], *q = &x[n];
if (n & 2) z1 = *p++;
for (; p < q; p += 2) {z1 += p[0]; z2 += p[1];}
z1 = (z1 + z2);
return z1;
}")
microbenchmark(sum_cpp(x), sum(x))
#Unit: milliseconds
# expr min lq mean median uq max neval
# sum_cpp(x) 15.58856 15.63613 15.70195 15.67847 15.69998 18.14852 100
# sum(x) 30.13504 30.20687 30.23993 30.23877 30.26721 30.40525 100
Run Code Online (Sandbox Code Playgroud)
所以这些给出:
my.t.test.cpp <- function (x, y = NULL) {
nx <- length(x)
mx <- sum_cpp(x) / nx
vx <- var_cpp(x, mx)
ny <- length(y)
my <- sum_cpp(y) / ny
vy <- var_cpp(y, my)
stderrx <- sqrt(vx/nx)
stderry <- sqrt(vy/ny)
stderr <- sqrt(stderrx^2 + stderry^2)
df <- stderr^4/(stderrx^4/(nx - 1) + stderry^4/(ny - 1))
tstat <- (mx - my - 0)/stderr
pval <- 2 * pt(-abs(tstat), df)
return(pval)
}
Run Code Online (Sandbox Code Playgroud)
感谢Martin将dplyr
代码转换为R基本代码.现在我可以更好地了解OP正在做什么.
还要感谢Martin加入fcpp
他的修订版.我也写下了fcpp
自己(靠近他).我对不同大小的数据集进行基准测试表明,fcpp
并且f2.2
具有相同的性能(如他的基准测试显示).
但是,我们factor
在开始时都受到该功能的瓶颈.对于OP的数据df1
,其中分组变量id
是1:1000
,我们能做到class(id) <- "factor"; levels(id) <- 1:1000
.通常as.factor
,如果分组变量已经是数据框中的一个因素,我们可能会使用哪个有用.参见R:为什么使用as.factor()而不仅仅是因子().
均值和方差计算需要按组完成,但t检验和p值计算可以进行矢量化.
my.t.test.2 <- function(grp, x, y) {
grp <- factor(grp)
x_g <- split(x, grp)
x_n <- lengths(x_g)
x_mean <- vapply(x_g, mean, numeric(1))
x_var <- vapply(x_g, var, numeric(1))
y_g <- split(y, grp)
y_n <- lengths(y_g)
y_mean <- vapply(y_g, mean, numeric(1))
y_var <- vapply(y_g, var, numeric(1))
x_se2 <- x_var / x_n
y_se2 <- y_var / y_n
se <- sqrt(x_se2 + y_se2)
tstat <- (x_mean - y_mean) / se
df <- se^4 / (x_se2^2 / (x_n - 1L) + (y_se2^2) / (y_n - 1L))
2 * pt(-abs(tstat), df)
}
Run Code Online (Sandbox Code Playgroud)
人们可以通过避免调度("原因"给予缓慢mean()
)和最小化冗余计算(例如,每组的长度)来尝试并且超级聪明.
my.t.test.2.1 <- compiler::cmpfun(function(grp, x, y) {
grp <- factor(grp)
x_g <- split.default(x, grp)
n <- lengths(x_g)
n1 <- n - 1L
x_mean <- vapply(x_g, mean.default, numeric(1), USE.NAMES = FALSE)
x_var <- vapply(x_g, var, numeric(1), USE.NAMES = FALSE)
y_g <- split.default(y, grp)
y_mean <- vapply(y_g, mean.default, numeric(1), USE.NAMES = FALSE)
y_var <- vapply(y_g, var, numeric(1), USE.NAMES = FALSE)
x_se2 <- x_var / n
y_se2 <- y_var / n
se <- sqrt(x_se2 + y_se2)
tstat <- (x_mean - y_mean) / se
df <- se^4 / ((x_se2^2 + y_se2^2) / n1)
2 * pt(-abs(tstat), df)
})
Run Code Online (Sandbox Code Playgroud)
可以包装规范和其他解决方案以提供相同的输出
f0 <- function(df)
df %>% group_by(id) %>% summarize(p.value = t.test(A, B)$p.value)
f1 <- function(df)
df %>% group_by(id) %>% summarize(p.value = my.t.test(A, B))
f2 <- function(df)
tibble(id = unique(df$id), p.value = my.t.test.2(df$id, df$A, df$B))
f2.1 <- function(df)
tibble(id = unique(df$id), p.value = my.t.test.2.1(df$id, df$A, df$B))
Run Code Online (Sandbox Code Playgroud)
f2.1()
产生与规范实现相同的结果,速度大约是其两倍; 担心mean()
等等(f2()
对比f2.1()
)的速度似乎主要是被误导了
> all.equal.default(f0(df1), f2.1(df1))
[1] TRUE
> microbenchmark(f0(df1), f1(df1), f2(df1), f2.1(df1), times = 5)
Unit: milliseconds
expr min lq mean median uq max neval
f0(df1) 374.2819 379.7749 380.8365 380.0094 381.2368 388.8794 5
f1(df1) 249.6502 250.2525 251.8813 252.1965 253.3444 253.9630 5
f2(df1) 154.1152 158.3243 159.8277 159.1076 162.7602 164.8311 5
f2.1(df1) 151.0032 151.0149 152.3900 152.8105 153.2840 153.8373 5
Run Code Online (Sandbox Code Playgroud)
对我来说是C++实现
my.t.test.cpp <- function (x, y = NULL) {
nx <- length(x)
mx <- sum_cpp(x) / nx
vx <- var_cpp(x, mx)
ny <- length(y)
my <- sum_cpp(y) / ny
vy <- var_cpp(y, my)
stderrx <- sqrt(vx/nx)
stderry <- sqrt(vy/ny)
stderr <- sqrt(stderrx^2 + stderry^2)
df <- stderr^4/(stderrx^4/(nx - 1) + stderry^4/(ny - 1))
tstat <- (mx - my - 0)/stderr
pval <- 2 * pt(-abs(tstat), df)
return(pval)
}
fcpp <- function(df)
df %>% group_by(id) %>% summarize(p.value = my.t.test.cpp(A, B))
Run Code Online (Sandbox Code Playgroud)
产生的结果等于规范和时钟约100毫秒.
对2.1解决方案进行概要分析表明,大部分时间都花在内部var()
,其中有一个调用stopifnot()
以及一个匹配调用的参数
> var
function (x, y = NULL, na.rm = FALSE, use)
{
...
na.method <- pmatch(use, c("all.obs", "complete.obs", "pairwise.complete.obs",
"everything", "na.or.complete"))
...
if (is.data.frame(x))
x <- as.matrix(x)
else stopifnot(is.atomic(x))
...
.Call(C_cov, x, y, na.method, FALSE)
}
<bytecode: 0x5e1a440>
<environment: namespace:stats>
> Rprof(); x <- my.t.test.2.1(df1$id, df1$A, df1$B); Rprof(NULL); summaryRprof()
$by.self
self.time self.pct total.time total.pct
"withCallingHandlers" 0.04 28.57 0.08 57.14
"tryCatchList" 0.04 28.57 0.04 28.57
"vapply" 0.02 14.29 0.14 100.00
"stopifnot" 0.02 14.29 0.12 85.71
"match.call" 0.02 14.29 0.02 14.29
$by.total
total.time total.pct self.time self.pct
"vapply" 0.14 100.00 0.02 14.29
"my.t.test.2.1" 0.14 100.00 0.00 0.00
"stopifnot" 0.12 85.71 0.02 14.29
"FUN" 0.12 85.71 0.00 0.00
"withCallingHandlers" 0.08 57.14 0.04 28.57
"tryCatchList" 0.04 28.57 0.04 28.57
"tryCatch" 0.04 28.57 0.00 0.00
"match.call" 0.02 14.29 0.02 14.29
$sample.interval
[1] 0.02
$sampling.time
[1] 0.14
Run Code Online (Sandbox Code Playgroud)
因此,在追求速度时,可以避免参数检查并直接调用C函数
my.t.test.2.2 <- compiler::cmpfun(function(grp, x, y) {
var <- function(x)
.Call(stats:::C_cov, x, NULL, 4L, FALSE)
grp <- factor(grp)
x_g <- split.default(x, grp)
n <- lengths(x_g)
n1 <- n - 1L
x_mean <- vapply(x_g, mean.default, numeric(1), USE.NAMES = FALSE)
x_var <- vapply(x_g, var, numeric(1), USE.NAMES = FALSE)
y_g <- split.default(y, grp)
y_mean <- vapply(y_g, mean.default, numeric(1), USE.NAMES = FALSE)
y_var <- vapply(y_g, var, numeric(1), USE.NAMES = FALSE)
x_se2 <- x_var / n
y_se2 <- y_var / n
se <- sqrt(x_se2 + y_se2)
tstat <- (x_mean - y_mean) / se
df <- se^4 / ((x_se2^2 + y_se2^2) / n1)
2 * pt(-abs(tstat), df)
})
f2.2 <- function(df)
tibble(id = unique(df$id), p.value = my.t.test.2.2(df$id, df$A, df$B))
Run Code Online (Sandbox Code Playgroud)
事实证明这是非常有效的.
> all.equal.default(f0(df1), f2.2(df1))
[1] TRUE
> microbenchmark(
+ f0(df1), f1(df1), f2(df1), f2.1(df1), f2.2(df1), fcpp(df1),
+ times = 5
+ )
Unit: milliseconds
expr min lq mean median uq max neval
f0(df1) 378.61985 379.25525 393.38371 379.56797 386.2806 443.19488 5
f1(df1) 250.99802 252.45281 253.55140 253.34249 255.2801 255.68362 5
f2(df1) 156.76073 158.63126 159.63693 160.33446 161.2260 161.23216 5
f2.1(df1) 146.64555 148.28773 151.17250 151.38536 153.9363 155.60751 5
f2.2(df1) 25.24441 25.62982 27.50898 26.11755 30.0836 30.46951 5
fcpp(df1) 104.20851 104.50396 105.19383 104.62905 104.7876 107.84006 5
Run Code Online (Sandbox Code Playgroud)
我们可以使用方差计算的C++实现,而不是调用R的计算
my.t.test.2.2.cpp <- compiler::cmpfun(function(grp, x, y) {
grp <- factor(grp)
x_g <- split.default(x, grp)
n <- lengths(x_g)
n1 <- n - 1L
x_mean <- vapply(x_g, mean.default, numeric(1), USE.NAMES = FALSE)
x_var <- unlist(Map(var_cpp, x_g, x_mean))
y_g <- split.default(y, grp)
y_mean <- vapply(y_g, mean.default, numeric(1), USE.NAMES = FALSE)
y_var <- unlist(Map(var_cpp, y_g, y_mean))
x_se2 <- x_var / n
y_se2 <- y_var / n
se <- sqrt(x_se2 + y_se2)
tstat <- (x_mean - y_mean) / se
df <- se^4 / ((x_se2^2 + y_se2^2) / n1)
2 * pt(-abs(tstat), df)
})
f2.2.cpp <- function(df)
tibble(id = unique(df$id), p.value = my.t.test.2.2.cpp(df$id, df$A, df$B))
Run Code Online (Sandbox Code Playgroud)
表现相当
> microbenchmark(f2.2(df1), f2.2.cpp(df1), times = 20)
Unit: milliseconds
expr min lq mean median uq max neval
f2.2(df1) 25.11237 25.69622 30.27956 26.35570 29.81884 87.34955 20
f2.2.cpp(df1) 24.88787 25.25171 26.80836 25.43498 29.06338 30.80012 20
Run Code Online (Sandbox Code Playgroud)
我不确定哪个更糟糕 - 为方差编写自己的C++代码,或者直接调用R的C代码.
更快的C++解决方案可以计算单个调用中的组均值和方差
cppFunction('List doit(IntegerVector group, NumericVector x) {
int n_grp = 0;
for (int i = 0; i < group.size(); ++i)
n_grp = group[i] > n_grp ? group[i] : n_grp;
std::vector<int> n(n_grp);
std::vector<double> sum(n_grp), sumsq(n_grp);
for (int i = 0; i < group.size(); ++i) {
n[ group[i] - 1 ] += 1;
sum[ group[i] - 1 ] += x[i];
sumsq[ group[i] - 1 ] += x[i] * x[i];
}
NumericVector mean(n_grp), var(n_grp);
for (size_t i = 0; i < n.size(); ++i) {
mean[i] = sum[i] / n[i];
var[i] = (sumsq[i] - sum[i] * mean[i]) / (n[i] - 1);
}
return List::create(_["n"]=n[0], _["mean"]=mean, _["var"]=var);
}')
my.t.test.2.3.cpp <- compiler::cmpfun(function(grp, x, y) {
x <- doit(grp, x)
y <- doit(grp, y)
x_se2 <- x$var / x$n
y_se2 <- y$var / y$n
se <- sqrt(x_se2 + y_se2)
tstat <- (x$mean - y$mean) / se
df <- se^4 / ((x_se2^2 + y_se2^2) / (x$n - 1L))
2 * pt(-abs(tstat), df)
})
f2.3.cpp <- function(df)
tibble(
id = unique(df$id),
p.value = my.t.test.2.3.cpp(df$id, df$A, df$B)
)
Run Code Online (Sandbox Code Playgroud)
这很快
> all.equal.default(f0(df1), f2.3.cpp(df1))
[1] TRUE
> microbenchmark(f2.2(df1), f2.2.cpp(df1), f2.3.cpp(df1), times = 50)
Unit: milliseconds
expr min lq mean median uq max
f2.2(df1) 24.743364 25.445833 28.032135 25.873117 29.191020 88.642771
f2.2.cpp(df1) 24.122380 24.867212 26.012985 25.369963 25.897866 30.783544
f2.3.cpp(df1) 2.831635 2.946094 3.101408 2.992049 3.073788 7.191572
neval
50
50
50
>
Run Code Online (Sandbox Code Playgroud)
另一种替代方案是Bioconductor包genefilter::rowttests()
,它需要一个矩阵
set.seed(1)
m1 <- cbind(
matrix(rnorm(8000, mean = 26, sd = 5), ncol=8, byrow = TRUE),
matrix(rnorm(8000, mean = 25, sd = 7), ncol=8, byrow = TRUE)
)
f4 <- function(m1)
genefilter::rowttests(m1, factor(rep(1:2, each=8)))
Run Code Online (Sandbox Code Playgroud)
并且也很快
> microbenchmark(f2.3.cpp(df1), f4(m1), times=50)
Unit: milliseconds
expr min lq mean median uq max neval
f2.3.cpp(df1) 2.760877 2.796542 2.877030 2.845795 2.895441 3.286143 50
f4(m1) 1.335288 1.359007 1.397601 1.377544 1.412606 1.693340 50
Run Code Online (Sandbox Code Playgroud)
(一些不同之处在于创建tibble).