tmf*_*mnk 5 performance r sampling dataframe
作为我上一个问题的后续问题,我对提高现有递归采样函数的性能感兴趣。
递归采样是指为给定的暴露 ID 随机选择最多 n 个唯一的未暴露 ID,然后从剩余的未暴露 ID 中随机选择最多 n 个唯一的未暴露 ID 为另一个暴露 ID。如果给定的公开 ID 没有剩余的未公开 ID,则该公开 ID 将被忽略。
原函数如下:
recursive_sample <- function(data, n) {
groups <- unique(data[["exposed"]])
out <- data.frame(exposed = character(), unexposed = character())
for (group in groups) {
chosen <- data %>%
filter(exposed == group,
!unexposed %in% out$unexposed) %>%
group_by(unexposed) %>%
slice(1) %>%
ungroup() %>%
sample_n(size = min(n, nrow(.)))
out <- rbind(out, chosen)
}
out
}
Run Code Online (Sandbox Code Playgroud)
我能够创建一个更有效的,如下所示:
recursive_sample2 <- function(data, n) {
groups <- unique(data[["exposed"]])
out <- tibble(exposed = integer(), unexposed = integer())
for (group in groups) {
chosen <- data %>%
filter(exposed == group,
!unexposed %in% out$unexposed) %>%
filter(!duplicated(unexposed)) %>%
sample_n(size = min(n, nrow(.)))
out <- bind_rows(out, chosen)
}
out
}
Run Code Online (Sandbox Code Playgroud)
样本数据和基准测试:
set.seed(123)
df <- tibble(exposed = rep(1:100, each = 100),
unexposed = sample(1:7000, 10000, replace = TRUE))
microbenchmark(f1 = recursive_sample(df, 5),
f2 = recursive_sample2(df, 5),
times = 10)
Unit: milliseconds
expr min lq mean median uq max neval cld
f1 1307.7198 1316.5276 1379.0533 1371.3952 1416.6360 1540.955 10 b
f2 839.0086 865.2547 914.8327 901.2288 970.9518 1036.170 10 a
Run Code Online (Sandbox Code Playgroud)
然而,对于我的实际数据集,我需要一个更高效(即更快)的函数。任何关于更有效版本的想法,无论是data.table涉及并行化还是其他方法,都是受欢迎的。
2023年12月编辑
@minem 和 @ThomasIsCoding 的版本要快得多;但是,它们仅返回部分正确的结果。考虑样本数据,例如:
df <- structure(list(exposed = c(4L, 4L, 4L, 1L, 1L, 1L, 3L, 2L, 2L,
2L, 2L, 2L), unexposed = c(1L, 2L, 2L, 1L, 2L, 3L, 10L, 4L, 5L,
7L, 8L, 9L), rowid = 1:12), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"))
exposed unexposed rowid
1 4 1 1
2 4 2 2
3 4 2 3
4 1 1 4
5 1 2 5
6 1 3 6
7 3 10 7
8 2 4 8
9 2 5 9
10 2 7 10
11 2 8 11
12 2 9 12
Run Code Online (Sandbox Code Playgroud)
我希望 Exposure == 4 被采样两次, Exposure == 1 被采样一次, Exposure == 3 被采样一次, Exposure == 2 被采样两次。换句话说,抽样程序应反映所提供的数据顺序。期望的输出:
exposed unexposed rowid
1 4 1 1
2 4 2 2
6 1 3 6
7 3 10 7
8 2 4 8
11 2 8 11
Run Code Online (Sandbox Code Playgroud)
一种data.table保留采样值的运行列表并使用%!in%fromcollapse来防止再次采样的解决方案:
library(data.table)\nlibrary(collapse) # for %!in%\n\nfjblood1 <- function(data, n) {\n setDT(data)\n # randomly select unique exposed/unexposed rows\n u <- 1:nrow(data)\n if (anyDuplicated(data, by = c("exposed", "unexposed"))) {\n u <- u[\n data[,i := .I][\n sample(.N), -i[duplicated(.SD)], .SDcols = c("exposed", "unexposed")\n ]\n ]\n data[,i := NULL]\n } \n sampled <- vector(mode(data$unexposed), length(u))\n k <- 0L\n data[\n u, .(\n unexposed = {\n x <- unexposed[unexposed %!in% sampled[seq_len(k)]]\n x <- x[sample.int(length(x), min(length(x), n))]\n sampled[seq.int(k + 1L, along.with = x)] <- x\n k <- k + length(x)\n x\n }\n ), exposed\n ]\n}\nRun Code Online (Sandbox Code Playgroud)\n或者,如果df您想要保留其他列:
fjblood2 <- function(data, n) {\n setDT(data)\n u <- 1:nrow(data)\n if (anyDuplicated(data, by = c("exposed", "unexposed"))) {\n u <- u[\n data[,i := .I][\n sample(.N), -i[duplicated(.SD)], .SDcols = c("exposed", "unexposed")\n ]\n ]\n data[,i := NULL]\n } \n sampled <- vector(mode(data$unexposed), length(u))\n k <- 0L\n data[\n u, {\n i <- which(unexposed %!in% sampled[seq_len(k)])\n i <- i[sample.int(length(i), min(length(i), n))]\n sampled[seq.int(k + 1L, along.with = i)] <- unexposed[i]\n k <- k + length(i)\n .SD[i]\n }, exposed\n ]\n}\nRun Code Online (Sandbox Code Playgroud)\n其他候选函数:
\nftmfmnk <- function(data, n) {\n groups <- unique(data[["exposed"]])\n out <- tibble(exposed = integer(), unexposed = integer())\n \n for (group in groups) {\n chosen <- data %>%\n filter(\n exposed == group,\n !unexposed %in% out$unexposed\n ) %>%\n filter(!duplicated(unexposed)) %>%\n sample_n(size = min(n, nrow(.)))\n \n out <- bind_rows(out, chosen)\n }\n \n out\n}\n\nfminem <- function(data, n) {\n groups <- unique(data[["exposed"]])\n # working on vectors is faster\n id <- 1:nrow(data)\n i <- vector("integer")\n unexposed2 <- vector(class(data$unexposed))\n ex <- data$exposed\n ux <- data$unexposed\n \n for (group in groups) {\n f1 <- ex == group # first filter\n f2 <- !ux[f1] %in% unexposed2 # 2nd filter (only on those that match 1st)\n id3 <- id[f1][f2][!duplicated(ux[f1][f2])] # check duplicates only on needed\n # and select necesary row ids\n is <- sample(id3, size = min(length(id3), n)) # sample row ids\n i <- c(i, is) # add to list\n unexposed2 <- ux[i] # resave unexposed2\n }\n out <- data[i, ] # only one data.frame subset\n out$id <- NULL\n out\n}\n\nftic <- function(df, n) {\n lst <- split(df, ~exposed)[as.character(unique(df$exposed))]\n pool <- c()\n for (k in seq_along(lst)) {\n d <- lst[[k]]\n dd <- subset(d[sample.int(nrow(d)), ], !duplicated(unexposed) & !unexposed %in% pool)\n lst[[k]] <- head(dd, n)\n pool <- c(pool, dd$unexposed)\n }\n `rownames<-`(do.call(rbind, lst), NULL)\n}\n\n`%nin%` <- function (x, table) {\n match(x, table, nomatch = 0L) == 0L\n}\n\nfmnist_fast <- function(dat, n) {\n # make a named vector\n vec <- setNames(dat$unexposed, dat$exposed)\n \n # Randomly shuffle the groups\n ord <- unique(names(vec))\n \n # random order the vector\n vec <- sample(vec, size = length(vec))\n \n # keep the names for sampling and later\n nms <- names(vec)\n \n \n # note that there is no need to sample again since both the order \n # of the groups as well as the order within the groups (vec) was sampled.\n \n # by setting an initial value, we avoid an exception in the first iteration\n init <- head(unique(vec[nms == ord[1]]), n)\n \n # `unique()` drops the names \n # yet names are needed to retain the initial data structure\n names(init) <- rep(ord[1], length(init))\n \n res <- Reduce(\n \\(x, y) {\n # current subgroup\n vec_sub <- vec[nms == y]\n \n # without unique, duplicated values inside of the group \n # can be selected multiple times\n res_vec <- head(unique(vec_sub[vec_sub %nin% x]), n)\n \n # in case all values of the current group are already sampled\n len_vec <- length(res_vec)\n if (len_vec == 0) {\n return(x)\n }\n \n # `unique()` drops the names \n # yet names are needed to retain the initial data structure\n names(res_vec) <- rep(y, len_vec)\n \n # concate\n return(c(x, res_vec))\n },\n # first group is already in init\n ord[-1], \n # first group, randomly sampled (see above)\n init = init\n )\n \n # back to a data.frame\n res_dat <- data.frame(exposed = names(res),\n unexposed = res)\n return(res_dat)\n}\nRun Code Online (Sandbox Code Playgroud)\n比较问题中数据集的结果(用另外两列来演示fjblood2):
df <- data.frame(exposed = c(4L, 4L, 4L, 1L, 1L, 1L, 3L, 2L, 2L, 2L, 2L, 2L),\n unexposed = c(1L, 2L, 2L, 1L, 2L, 3L, 10L, 4L, 5L, 7L, 8L, 9L),\n rowid = 1:12)\n\ndt <- as.data.table(df)\nfjblood1(dt, 2L)\n#> exposed unexposed\n#> 1: 4 1\n#> 2: 4 2\n#> 3: 1 3\n#> 4: 3 10\n#> 5: 2 4\n#> 6: 2 7\nfjblood2(dt, 2L)\n#> exposed unexposed rowid\n#> 1: 4 2 3\n#> 2: 4 1 1\n#> 3: 1 3 6\n#> 4: 3 10 7\n#> 5: 2 8 11\n#> 6: 2 9 12\nftmfmnk(df, 2L)\n#> # A tibble: 6 \xc3\x97 3\n#> exposed unexposed rowid\n#> <int> <int> <int>\n#> 1 4 1 1\n#> 2 4 2 2\n#> 3 1 3 6\n#> 4 3 10 7\n#> 5 2 4 8\n#> 6 2 9 12\nfminem(df, 2L)\n#> exposed unexposed rowid\n#> 1 4 1 1\n#> 2 4 2 2\n#> 3 4 2 3\n#> 5 1 2 5\n#> 10 2 7 10\n#> 9 2 5 9\nfmnist_fast(df, 2L)\n#> exposed unexposed\n#> 1 4 2\n#> 2 4 1\n#> 3 1 3\n#> 4 3 10\n#> 5 2 5\n#> 6 2 8\nftic(df, 2L)\n#> exposed unexposed rowid\n#> 1 4 2 3\n#> 2 4 1 1\n#> 3 1 3 6\n#> 4 3 10 7\n#> 5 2 7 10\n#> 6 2 5 9\nRun Code Online (Sandbox Code Playgroud)\n正如更新的问题中所指出的,它的fminem行为不符合预期。
使用更大的数据集进行基准测试:
\nset.seed(123)\ndf <- tibble(\n exposed = rep(1:1000, each = 1000),\n unexposed = sample(7e4, 1e6, 1),\n data1 = runif(1e6),\n data2 = sample(LETTERS, 1e6, 1)\n)\ndt <- as.data.table(df)\n\nmicrobenchmark::microbenchmark(\n ftmfmnk = ftmfmnk(df, 100L),\n fminem = fminem(df, 100L),\n ftic = ftic(df, 100L),\n fmnist_fast(df, 100L),\n fjblood1 = fjblood1(dt, 100L),\n fjblood2 = fjblood2(dt, 100L),\n times = 1\n)\n#> Unit: milliseconds\n#> expr min lq mean median uq max neval\n#> ftmfmnk 74362.9330 74362.9330 74362.9330 74362.9330 74362.9330 74362.9330 1\n#> fminem 5675.6071 5675.6071 5675.6071 5675.6071 5675.6071 5675.6071 1\n#> ftic 4624.8753 4624.8753 4624.8753 4624.8753 4624.8753 4624.8753 1\n#> fmnist_fast(df, 100L) 16919.1535 16919.1535 16919.1535 16919.1535 16919.1535 16919.1535 1\n#> fjblood1 943.9751 943.9751 943.9751 943.9751 943.9751 943.9751 1\n#> fjblood2 909.6813 909.6813 909.6813 909.6813 909.6813 909.6813 1\nRun Code Online (Sandbox Code Playgroud)\n另一个具有更多复制和排除的基准ftmfmnk:
microbenchmark::microbenchmark(\n # ftmfmnk = ftmfmnk(df, 100L),\n fminem = fminem(df, 100L),\n ftic = ftic(df, 100L),\n fmnist_fast(df, 100L),\n fjblood1 = fjblood1(dt, 100L),\n fjblood2 = fjblood2(dt, 100L),\n times = 10\n)\n#> Unit: milliseconds\n#> expr min lq mean median uq max neval\n#> fminem 5518.5713 5550.8040 5614.1822 5600.498 5615.9687 5788.3764 10\n#> ftic 4370.4639 4412.8366 4467.5108 4434.879 4558.6254 4607.8332 10\n#> fmnist_fast(df, 100L) 16483.5461 16524.7059 16639.4094 16602.269 16708.1953 17001.8668 10\n#> fjblood1 618.8821 628.6334 644.4724 636.097 646.6322 693.7889 10\n#> fjblood2 707.1110 730.6251 756.7017 754.079 788.2097 814.0123 10\nRun Code Online (Sandbox Code Playgroud)\n
下面的代码适用于之前的问题和旧数据集。有关问题和数据集最新更新的最新解决方案,请参阅此。
我(个人)更喜欢使用基本 R 进行编码,这里有一些选项,因为我相信基本 R 有助于理解底层机制,并且即使没有额外的包也可以实现相当高的效率。
当然,如果你只追求纯粹的速度,你也应该参考其他高级包和相应的方法,就像@jblood所做的那样。
Reduceftic1 <- function(data, n) {
Reduce(
\(x, y) {
rbind(x, head(subset(y, !unexposed %in% x$unexposed), n))
},
split(data[sample(1:nrow(data)), ], ~exposed)[as.character(unique(data$exposed))]
)
}
Run Code Online (Sandbox Code Playgroud)
for循环:版本 1ftic2 <- function(data, n) {
with(
data,
{
v <- unique(exposed)
lst <- vector("list", length(v))
q <- c()
for (k in seq_along(v)) {
p <- unexposed[(!unexposed %in% q) & exposed == v[k]]
if (length(p) == 0) {
lst[[k]] <- NULL
} else {
u <- head(p[sample.int(length(p))], n)
lst[[k]] <- cbind(exposed = v[k], unexposed = u)
q <- c(q, u)
}
}
as.data.frame(do.call(rbind, lst))
}
)
}
Run Code Online (Sandbox Code Playgroud)
for循环:版本 2ftic3 <- function(data, n) {
lst <- with(data, split(unexposed, exposed)[as.character(unique(exposed))])
pool <- c()
for (k in seq_along(lst)) {
v <- lst[[k]]
u <- v[!v %in% pool]
lst[k] <- ifelse(length(u) > n, list(sample(u, n)), list(u))
pool <- c(pool, lst[[k]])
}
type.convert(setNames(rev(stack(lst)), names(data)), as.is = TRUE)
}
Run Code Online (Sandbox Code Playgroud)
根据dfOP的问题给出
> ftic1(df, 2)
exposed unexposed
1 4 1
2 4 2
5 1 3
6 3 10
10 2 8
8 2 5
> ftic2(df, 2)
exposed unexposed
1 4 1
2 4 2
3 1 3
4 3 10
5 2 7
6 2 4
> ftic3(df, 2)
exposed unexposed
1 4 1
2 4 2
3 1 3
4 3 10
5 2 9
6 2 8
Run Code Online (Sandbox Code Playgroud)
方法列举如下
fjblood1 <- function(data, n) {
setDT(data)
sampled <- vector(mode(data$unexposed), nrow(data))
k <- 0L
data[
, .(
unexposed = {
x <- unexposed[unexposed %!in% sampled[seq_len(k)]]
x <- x[sample.int(length(x), min(length(x), n))]
sampled[seq.int(k + 1L, along.with = x)] <- x
k <- k + length(x)
x
}
), exposed
]
}
fjblood2 <- function(data, n) {
setDT(data)
sampled <- vector(mode(data$unexposed), nrow(data))
k <- 0L
data[
,
{
i <- which(unexposed %!in% sampled[seq_len(k)])
i <- i[sample.int(length(i), min(length(i), n))]
sampled[seq.int(k + 1L, along.with = i)] <- unexposed[i]
k <- k + length(i)
.SD[i]
},
exposed
]
}
ftmfmnk <- function(data, n) {
groups <- unique(data[["exposed"]])
out <- tibble(exposed = integer(), unexposed = integer())
for (group in groups) {
chosen <- data %>%
filter(
exposed == group,
!unexposed %in% out$unexposed
) %>%
filter(!duplicated(unexposed)) %>%
sample_n(size = min(n, nrow(.)))
out <- bind_rows(out, chosen)
}
out
}
fminem <- function(data, n) {
groups <- unique(data[["exposed"]])
# working on vectors is faster
id <- 1:nrow(data)
i <- vector("integer")
unexposed2 <- vector(class(data$unexposed))
ex <- data$exposed
ux <- data$unexposed
for (group in groups) {
f1 <- ex == group # first filter
f2 <- !ux[f1] %in% unexposed2 # 2nd filter (only on those that match 1st)
id3 <- id[f1][f2][!duplicated(ux[f1][f2])] # check duplicates only on needed
# and select necesary row ids
is <- sample(id3, size = min(length(id3), n)) # sample row ids
i <- c(i, is) # add to list
unexposed2 <- ux[i] # resave unexposed2
}
out <- data[i, ] # only one data.frame subset
out$id <- NULL
out
}
ftic1 <- function(data, n) {
Reduce(
\(x, y) {
rbind(x, head(subset(y, !unexposed %in% x$unexposed), n))
},
split(data[sample(1:nrow(data)), ], ~exposed)[as.character(unique(data$exposed))]
)
}
ftic2 <- function(data, n) {
with(
data,
{
v <- unique(exposed)
lst <- vector("list", length(v))
q <- c()
for (k in seq_along(v)) {
p <- unexposed[(!unexposed %in% q) & exposed == v[k]]
if (length(p) == 0) {
lst[[k]] <- NULL
} else {
u <- head(p[sample.int(length(p))], n)
lst[[k]] <- cbind(exposed = v[k], unexposed = u)
q <- c(q, u)
}
}
as.data.frame(do.call(rbind, lst))
}
)
}
ftic3 <- function(data, n) {
lst <- with(data, split(unexposed, exposed)[as.character(unique(exposed))])
pool <- c()
for (k in seq_along(lst)) {
v <- lst[[k]]
u <- v[!v %in% pool]
lst[k] <- ifelse(length(u) > n, list(sample(u, n)), list(u))
pool <- c(pool, lst[[k]])
}
type.convert(setNames(rev(stack(lst)), names(data)), as.is = TRUE)
}
Run Code Online (Sandbox Code Playgroud)
使用更新问题中给出的数据集,我们运行以下基准测试脚本
df <- structure(
list(exposed = c(
4L, 4L, 1L, 1L, 1L, 3L, 2L, 2L, 2L,
2L, 2L
), unexposed = c(
1L, 2L, 1L, 2L, 3L, 10L, 4L, 5L, 7L, 8L,
9L
)),
class = "data.frame", row.names = c(
"1", "2", "3", "4",
"5", "6", "7", "8", "9", "10", "11"
)
)
n <- 2
dt <- as.data.table(df)
microbenchmark(
fjblood1 = fjblood1(dt, n),
fjblood2 = fjblood2(dt, n),
fminem = fminem(df, n),
ftmfmnk = ftmfmnk(df, n),
ftic1 = ftic1(df, n),
ftic2 = ftic2(df, n),
ftic3 = ftic3(df,n),
unit = "relative"
)
Run Code Online (Sandbox Code Playgroud)
这使
Unit: relative
expr min lq mean median uq max neval
fjblood1 6.4168151 5.142746 5.714648 5.137915 5.182090 21.810226 100
fjblood2 10.2215142 8.260134 9.204300 8.501112 8.128708 22.865106 100
fminem 0.9815306 1.075249 1.996025 1.091704 1.052034 31.366451 100
ftmfmnk 116.7095553 98.740732 92.170153 95.355659 85.750936 73.738831 100
ftic1 8.7822192 7.385250 7.638289 7.132669 6.762140 18.621511 100
ftic2 1.0000000 1.000000 1.000000 1.000000 1.000000 1.000000 100
ftic3 4.2110394 4.059356 4.015434 4.008809 3.705119 8.273886 100
Run Code Online (Sandbox Code Playgroud)
set.seed(123)
df <- tibble(
exposed = rep(1:1000, each = 1000),
unexposed = c(replicate(1e3, sample(7e4, 1e3)))
)
dt <- as.data.table(df)
n <- 2L
microbenchmark(
fjblood1 = fjblood1(dt, n),
fjblood2 = fjblood2(dt, n),
ftic1 = ftic1(df, n),
ftic2 = ftic2(df, n),
ftic3 = ftic3(df, n),
times = 10L,
unit = "relative"
)
Run Code Online (Sandbox Code Playgroud)
节目
Unit: relative
expr min lq mean median uq max
fjblood1 1.000000 1.000000 1.000000 1.000000 1.000000 1.0000000
fjblood2 2.771467 2.472605 2.360890 2.563041 2.723441 1.3771934
ftic1 10.580318 12.251636 9.301784 10.176871 10.005448 5.2545817
ftic2 557.732618 556.806101 417.405326 462.315480 417.127467 206.5138370
ftic3 1.628257 1.514933 1.423689 1.720916 1.607287 0.7996517
neval
10
10
10
10
10
Run Code Online (Sandbox Code Playgroud)
set.seed(123)
df <- tibble(
exposed = rep(1:1000, each = 1000),
unexposed = c(replicate(1e3, sample(7e4, 1e3)))
)
dt <- as.data.table(df)
n <- 100L
microbenchmark(
fjblood1 = fjblood1(dt, n),
ftic3 = ftic3(df, n),
times = 20L,
unit = "relative"
)
Run Code Online (Sandbox Code Playgroud)
节目
Unit: relative
expr min lq mean median uq max neval
fjblood1 1.000000 1.000000 1.000000 1.000000 1.00000 1.000000 20
ftic3 5.955011 5.107471 4.364985 4.545183 4.01909 3.064281 20
Run Code Online (Sandbox Code Playgroud)