krl*_*mlr 48 algorithm performance r
这个问题导致了一个新的R包:
wrswoR
没有替换使用的R的默认采样sample.int似乎需要二次运行时间,例如,当使用从均匀分布中提取的权重时.这对于大样本量来说很慢.有人知道从R中可以使用的更快的实现吗?两个选项是"替换拒绝采样"(参见stats.sx上的这个问题)和Wong和Easton(1980)的算法(在StackOverflow答案中使用Python实现).
感谢Ben Bolker暗示C函数,该函数在被调用时sample.int具有内部调用replace=F和非均匀权重:ProbSampleNoReplace.实际上,代码显示了两个嵌套for循环(第420行random.c).
以下是根据经验分析运行时间的代码:
library(plyr)
sample.int.test <- function(n, p) {
sample.int(2 * n, n, replace=F, prob=p); NULL }
times <- ldply(
1:7,
function(i) {
n <- 1024 * (2 ** i)
p <- runif(2 * n)
data.frame(
n=n,
user=system.time(sample.int.test(n, p), gcFirst=T)['user.self'])
},
.progress='text'
)
times
library(ggplot2)
ggplot(times, aes(x=n, y=user/n)) + geom_point() + scale_x_log10() +
ylab('Time per unit (s)')
# Output:
n user
1 2048 0.008
2 4096 0.028
3 8192 0.100
4 16384 0.408
5 32768 1.645
6 65536 6.604
7 131072 26.558
Run Code Online (Sandbox Code Playgroud)

编辑:感谢Arun指出未加权的采样似乎没有这种性能损失.
Jul*_*ora 23
更新:
一个Rcpp实施Efraimidis&Spirakis算法(感谢@Hemmo,@Dinrem,@krlmlr和@rtlgrmpf):
library(inline)
library(Rcpp)
src <-
'
int num = as<int>(size), x = as<int>(n);
Rcpp::NumericVector vx = Rcpp::clone<Rcpp::NumericVector>(x);
Rcpp::NumericVector pr = Rcpp::clone<Rcpp::NumericVector>(prob);
Rcpp::NumericVector rnd = rexp(x) / pr;
for(int i= 0; i<vx.size(); ++i) vx[i] = i;
std::partial_sort(vx.begin(), vx.begin() + num, vx.end(), Comp(rnd));
vx = vx[seq(0, num - 1)] + 1;
return vx;
'
incl <-
'
struct Comp{
Comp(const Rcpp::NumericVector& v ) : _v(v) {}
bool operator ()(int a, int b) { return _v[a] < _v[b]; }
const Rcpp::NumericVector& _v;
};
'
funFast <- cxxfunction(signature(n = "Numeric", size = "integer", prob = "numeric"),
src, plugin = "Rcpp", include = incl)
# See the bottom of the answer for comparison
p <- c(995/1000, rep(1/1000, 5))
n <- 100000
system.time(print(table(replicate(funFast(6, 3, p), n = n)) / n))
1 2 3 4 5 6
1.00000 0.39996 0.39969 0.39973 0.40180 0.39882
user system elapsed
3.93 0.00 3.96
# In case of:
# Rcpp::IntegerVector vx = Rcpp::clone<Rcpp::IntegerVector>(x);
# i.e. instead of NumericVector
1 2 3 4 5 6
1.00000 0.40150 0.39888 0.39925 0.40057 0.39980
user system elapsed
1.93 0.00 2.03
Run Code Online (Sandbox Code Playgroud)
旧版:
让我们尝试一些可能的方法:
简单的拒绝取样与更换.这比sample.int.rej@krlmlr提供的功能要简单得多,即样本大小总是等于n.正如我们将要看到的,假设权重的均匀分布仍然很快,但在另一种情况下则非常慢.
fastSampleReject <- function(all, n, w){
out <- numeric(0)
while(length(out) < n)
out <- unique(c(out, sample(all, n, replace = TRUE, prob = w)))
out[1:n]
}
Run Code Online (Sandbox Code Playgroud)
Wong和Easton的算法(1980).以下是此 Python版本的实现.它很稳定,我可能会遗漏一些东西,但与其他功能相比要慢得多.
fastSample1980 <- function(all, n, w){
tws <- w
for(i in (length(tws) - 1):0)
tws[1 + i] <- sum(tws[1 + i], tws[1 + 2 * i + 1],
tws[1 + 2 * i + 2], na.rm = TRUE)
out <- numeric(n)
for(i in 1:n){
gas <- tws[1] * runif(1)
k <- 0
while(gas > w[1 + k]){
gas <- gas - w[1 + k]
k <- 2 * k + 1
if(gas > tws[1 + k]){
gas <- gas - tws[1 + k]
k <- k + 1
}
}
wgh <- w[1 + k]
out[i] <- all[1 + k]
w[1 + k] <- 0
while(1 + k >= 1){
tws[1 + k] <- tws[1 + k] - wgh
k <- floor((k - 1) / 2)
}
}
out
}
Run Code Online (Sandbox Code Playgroud)
Rcpp实现了Wong和Easton的算法.可能它可以进一步优化,因为这是我的第一个可用Rcpp功能,但无论如何它运作良好.
library(inline)
library(Rcpp)
src <-
'
Rcpp::NumericVector weights = Rcpp::clone<Rcpp::NumericVector>(w);
Rcpp::NumericVector tws = Rcpp::clone<Rcpp::NumericVector>(w);
Rcpp::NumericVector x = Rcpp::NumericVector(all);
int k, num = as<int>(n);
Rcpp::NumericVector out(num);
double gas, wgh;
if((weights.size() - 1) % 2 == 0){
tws[((weights.size()-1)/2)] += tws[weights.size()-1] + tws[weights.size()-2];
}
else
{
tws[floor((weights.size() - 1)/2)] += tws[weights.size() - 1];
}
for (int i = (floor((weights.size() - 1)/2) - 1); i >= 0; i--){
tws[i] += (tws[2 * i + 1]) + (tws[2 * i + 2]);
}
for(int i = 0; i < num; i++){
gas = as<double>(runif(1)) * tws[0];
k = 0;
while(gas > weights[k]){
gas -= weights[k];
k = 2 * k + 1;
if(gas > tws[k]){
gas -= tws[k];
k += 1;
}
}
wgh = weights[k];
out[i] = x[k];
weights[k] = 0;
while(k > 0){
tws[k] -= wgh;
k = floor((k - 1) / 2);
}
tws[0] -= wgh;
}
return out;
'
fun <- cxxfunction(signature(all = "numeric", n = "integer", w = "numeric"),
src, plugin = "Rcpp")
Run Code Online (Sandbox Code Playgroud)
现在一些结果:
times1 <- ldply(
1:6,
function(i) {
n <- 1024 * (2 ** i)
p <- runif(2 * n) # Uniform distribution
p <- p/sum(p)
data.frame(
n=n,
user=c(system.time(sample.int.test(n, p), gcFirst=T)['user.self'],
system.time(weighted_Random_Sample(1:(2*n), p, n), gcFirst=T)['user.self'],
system.time(fun(1:(2*n), n, p), gcFirst=T)['user.self'],
system.time(sample.int.rej(2*n, n, p), gcFirst=T)['user.self'],
system.time(fastSampleReject(1:(2*n), n, p), gcFirst=T)['user.self'],
system.time(fastSample1980(1:(2*n), n, p), gcFirst=T)['user.self']),
id=c("Base", "Reservoir", "Rcpp", "Rejection", "Rejection simple", "1980"))
},
.progress='text'
)
times2 <- ldply(
1:6,
function(i) {
n <- 1024 * (2 ** i)
p <- runif(2 * n - 1)
p <- p/sum(p)
p <- c(0.999, 0.001 * p) # Special case
data.frame(
n=n,
user=c(system.time(sample.int.test(n, p), gcFirst=T)['user.self'],
system.time(weighted_Random_Sample(1:(2*n), p, n), gcFirst=T)['user.self'],
system.time(fun(1:(2*n), n, p), gcFirst=T)['user.self'],
system.time(sample.int.rej(2*n, n, p), gcFirst=T)['user.self'],
system.time(fastSampleReject(1:(2*n), n, p), gcFirst=T)['user.self'],
system.time(fastSample1980(1:(2*n), n, p), gcFirst=T)['user.self']),
id=c("Base", "Reservoir", "Rcpp", "Rejection", "Rejection simple", "1980"))
},
.progress='text'
)
Run Code Online (Sandbox Code Playgroud)


arrange(times1, id)
n user id
1 2048 0.53 1980
2 4096 0.94 1980
3 8192 2.00 1980
4 16384 4.32 1980
5 32768 9.10 1980
6 65536 21.32 1980
7 2048 0.02 Base
8 4096 0.05 Base
9 8192 0.18 Base
10 16384 0.75 Base
11 32768 2.99 Base
12 65536 12.23 Base
13 2048 0.00 Rcpp
14 4096 0.01 Rcpp
15 8192 0.03 Rcpp
16 16384 0.07 Rcpp
17 32768 0.14 Rcpp
18 65536 0.31 Rcpp
19 2048 0.00 Rejection
20 4096 0.00 Rejection
21 8192 0.00 Rejection
22 16384 0.02 Rejection
23 32768 0.02 Rejection
24 65536 0.03 Rejection
25 2048 0.00 Rejection simple
26 4096 0.01 Rejection simple
27 8192 0.00 Rejection simple
28 16384 0.01 Rejection simple
29 32768 0.00 Rejection simple
30 65536 0.05 Rejection simple
31 2048 0.00 Reservoir
32 4096 0.00 Reservoir
33 8192 0.00 Reservoir
34 16384 0.02 Reservoir
35 32768 0.03 Reservoir
36 65536 0.05 Reservoir
arrange(times2, id)
n user id
1 2048 0.43 1980
2 4096 0.93 1980
3 8192 2.00 1980
4 16384 4.36 1980
5 32768 9.08 1980
6 65536 19.34 1980
7 2048 0.01 Base
8 4096 0.04 Base
9 8192 0.18 Base
10 16384 0.75 Base
11 32768 3.11 Base
12 65536 12.04 Base
13 2048 0.01 Rcpp
14 4096 0.02 Rcpp
15 8192 0.03 Rcpp
16 16384 0.08 Rcpp
17 32768 0.15 Rcpp
18 65536 0.33 Rcpp
19 2048 0.00 Rejection
20 4096 0.00 Rejection
21 8192 0.02 Rejection
22 16384 0.02 Rejection
23 32768 0.05 Rejection
24 65536 0.08 Rejection
25 2048 1.43 Rejection simple
26 4096 2.87 Rejection simple
27 8192 6.17 Rejection simple
28 16384 13.68 Rejection simple
29 32768 29.74 Rejection simple
30 65536 73.32 Rejection simple
31 2048 0.00 Reservoir
32 4096 0.00 Reservoir
33 8192 0.02 Reservoir
34 16384 0.02 Reservoir
35 32768 0.02 Reservoir
36 65536 0.04 Reservoir
Run Code Online (Sandbox Code Playgroud)
显然我们可以拒绝函数,1980因为它比Base两种情况都慢.Rejection simple当在第二种情况下存在单个概率0.999时也遇到麻烦.
所以仍然存在Rejection,Rcpp,Reservoir.最后一步是检查值本身是否正确.为了确定它们,我们将使用它sample作为基准(也是为了消除p由于没有替换的采样而不必重合的概率的混淆).
p <- c(995/1000, rep(1/1000, 5))
n <- 100000
system.time(print(table(replicate(sample(1:6, 3, repl = FALSE, prob = p), n = n))/n))
1 2 3 4 5 6
1.00000 0.39992 0.39886 0.40088 0.39711 0.40323 # Benchmark
user system elapsed
1.90 0.00 2.03
system.time(print(table(replicate(sample.int.rej(2*3, 3, p), n = n))/n))
1 2 3 4 5 6
1.00000 0.40007 0.40099 0.39962 0.40153 0.39779
user system elapsed
76.02 0.03 77.49 # Slow
system.time(print(table(replicate(weighted_Random_Sample(1:6, p, 3), n = n))/n))
1 2 3 4 5 6
1.00000 0.49535 0.41484 0.36432 0.36338 0.36211 # Incorrect
user system elapsed
3.64 0.01 3.67
system.time(print(table(replicate(fun(1:6, 3, p), n = n))/n))
1 2 3 4 5 6
1.00000 0.39876 0.40031 0.40219 0.40039 0.39835
user system elapsed
4.41 0.02 4.47
Run Code Online (Sandbox Code Playgroud)
请注意这里的一些事情.由于某种原因weighted_Random_Sample返回不正确的值(我根本没有查看它,但它假设均匀分布它是正确的).sample.int.rej重复采样非常慢.
总之,Rcpp在重复采样的情况下,它似乎是最佳选择,而在sample.int.rej其他方面更快,也更容易使用.
Din*_*nre 20
我决定深入研究一些评论,发现Efraimidis和Spirakis的论文非常吸引人(感谢@Hemmo寻找参考文献).本文的总体思路是:通过生成一个随机统一数字并将其增加到每个项目的权重超过1的幂来创建一个密钥.然后,您只需将最高键值作为样本.这非常出色!
weighted_Random_Sample <- function(
.data,
.weights,
.n
){
key <- runif(length(.data)) ^ (1 / .weights)
return(.data[order(key, decreasing=TRUE)][1:.n])
}
Run Code Online (Sandbox Code Playgroud)
如果将'.n'设置为'.data'的长度(应始终为'.weights'的长度),这实际上是加权的储层置换,但该方法适用于采样和置换.
更新:我应该提到上面的函数要求权重大于零.否则key <- runif(length(.data)) ^ (1 / .weights)将无法正确订购.
仅仅为了踢,我还使用了OP中的测试场景来比较两个函数.
set.seed(1)
times_WRS <- ldply(
1:7,
function(i) {
n <- 1024 * (2 ** i)
p <- runif(2 * n)
n_Set <- 1:(2 * n)
data.frame(
n=n,
user=system.time(weighted_Random_Sample(n_Set, p, n), gcFirst=T)['user.self'])
},
.progress='text'
)
sample.int.test <- function(n, p) {
sample.int(2 * n, n, replace=F, prob=p); NULL }
times_sample.int <- ldply(
1:7,
function(i) {
n <- 1024 * (2 ** i)
p <- runif(2 * n)
data.frame(
n=n,
user=system.time(sample.int.test(n, p), gcFirst=T)['user.self'])
},
.progress='text'
)
times_WRS$group <- "WRS"
times_sample.int$group <- "sample.int"
library(ggplot2)
ggplot(rbind(times_WRS, times_sample.int) , aes(x=n, y=user/n, col=group)) + geom_point() + scale_x_log10() + ylab('Time per unit (s)')
Run Code Online (Sandbox Code Playgroud)
以下是时代:
times_WRS
# n user
# 1 2048 0.00
# 2 4096 0.01
# 3 8192 0.00
# 4 16384 0.01
# 5 32768 0.03
# 6 65536 0.06
# 7 131072 0.16
times_sample.int
# n user
# 1 2048 0.02
# 2 4096 0.05
# 3 8192 0.14
# 4 16384 0.58
# 5 32768 2.33
# 6 65536 9.23
# 7 131072 37.79
Run Code Online (Sandbox Code Playgroud)
