提高递归采样函数的性能

tmf*_*mnk 5 performance r sampling dataframe

作为我上一个问题的后续问题,我对提高现有递归采样函数的性能感兴趣。

递归采样是指为给定的暴露 ID 随机选择最多 n 个唯一的未暴露 ID,然后从剩余的未暴露 ID 中随机选择最多 n 个唯一的未暴露 ID 为另一个暴露 ID。如果给定的公开 ID 没有剩余的未公开 ID,则该公开 ID 将被忽略。

原函数如下:

recursive_sample <- function(data, n) {
 
 groups <- unique(data[["exposed"]])
 out <- data.frame(exposed = character(), unexposed = character())
 
 for (group in groups) {
  
  chosen <- data %>%
   filter(exposed == group,
          !unexposed %in% out$unexposed) %>%
   group_by(unexposed) %>%
   slice(1) %>%
   ungroup() %>%
   sample_n(size = min(n, nrow(.))) 
  
  out <- rbind(out, chosen)
  
 }
 
 out
 
}
Run Code Online (Sandbox Code Playgroud)

我能够创建一个更有效的,如下所示:

recursive_sample2 <- function(data, n) {
 
 groups <- unique(data[["exposed"]])
 out <- tibble(exposed = integer(), unexposed = integer())
 
 for (group in groups) {
  
  chosen <- data %>%
   filter(exposed == group,
          !unexposed %in% out$unexposed) %>%
   filter(!duplicated(unexposed)) %>%
   sample_n(size = min(n, nrow(.))) 
  
  out <- bind_rows(out, chosen)
  
 }
 
 out
 
}
Run Code Online (Sandbox Code Playgroud)

样本数据和基准测试:

set.seed(123)
df <- tibble(exposed = rep(1:100, each = 100),
             unexposed = sample(1:7000, 10000, replace = TRUE))

microbenchmark(f1 = recursive_sample(df, 5),
               f2 = recursive_sample2(df, 5),
               times = 10)

Unit: milliseconds
 expr       min        lq      mean    median        uq      max neval cld
   f1 1307.7198 1316.5276 1379.0533 1371.3952 1416.6360 1540.955    10   b
   f2  839.0086  865.2547  914.8327  901.2288  970.9518 1036.170    10  a 
Run Code Online (Sandbox Code Playgroud)

然而,对于我的实际数据集,我需要一个更高效(即更快)的函数。任何关于更有效版本的想法,无论是data.table涉及并行化还是其他方法,都是受欢迎的。

2023年12月编辑

@minem 和 @ThomasIsCoding 的版本要快得多;但是,它们仅返回部分正确的结果。考虑样本数据,例如:

df <- structure(list(exposed = c(4L, 4L, 4L, 1L, 1L, 1L, 3L, 2L, 2L, 
                                 2L, 2L, 2L), unexposed = c(1L, 2L, 2L, 1L, 2L, 3L, 10L, 4L, 5L, 
                                                            7L, 8L, 9L), rowid = 1:12), class = "data.frame", row.names = c("1", 
                                                                                                                            "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"))

   exposed unexposed rowid
1        4         1     1
2        4         2     2
3        4         2     3
4        1         1     4
5        1         2     5
6        1         3     6
7        3        10     7
8        2         4     8
9        2         5     9
10       2         7    10
11       2         8    11
12       2         9    12
Run Code Online (Sandbox Code Playgroud)

我希望 Exposure == 4 被采样两次, Exposure == 1 被采样一次, Exposure == 3 被采样一次, Exposure == 2 被采样两次。换句话说,抽样程序应反映所提供的数据顺序。期望的输出:

exposed unexposed rowid
1        4         1     1
2        4         2     2
6        1         3     6
7        3        10     7
8        2         4     8
11       2         8    11
Run Code Online (Sandbox Code Playgroud)

jbl*_*d94 6

更新于 12/13/23

\n

一种data.table保留采样值的运行列表并使用%!in%fromcollapse来防止再次采样的解决方案:

\n
library(data.table)\nlibrary(collapse) # for %!in%\n\nfjblood1 <- function(data, n) {\n  setDT(data)\n  # randomly select unique exposed/unexposed rows\n  u <- 1:nrow(data)\n  if (anyDuplicated(data, by = c("exposed", "unexposed"))) {\n    u <- u[\n      data[,i := .I][\n        sample(.N), -i[duplicated(.SD)], .SDcols = c("exposed", "unexposed")\n      ]\n    ]\n    data[,i := NULL]\n  } \n  sampled <- vector(mode(data$unexposed), length(u))\n  k <- 0L\n  data[\n    u, .(\n      unexposed = {\n        x <- unexposed[unexposed %!in% sampled[seq_len(k)]]\n        x <- x[sample.int(length(x), min(length(x), n))]\n        sampled[seq.int(k + 1L, along.with = x)] <- x\n        k <- k + length(x)\n        x\n      }\n    ), exposed\n  ]\n}\n
Run Code Online (Sandbox Code Playgroud)\n

或者,如果df您想要保留其他列:

\n
fjblood2 <- function(data, n) {\n  setDT(data)\n  u <- 1:nrow(data)\n  if (anyDuplicated(data, by = c("exposed", "unexposed"))) {\n    u <- u[\n      data[,i := .I][\n        sample(.N), -i[duplicated(.SD)], .SDcols = c("exposed", "unexposed")\n      ]\n    ]\n    data[,i := NULL]\n  } \n  sampled <- vector(mode(data$unexposed), length(u))\n  k <- 0L\n  data[\n    u, {\n      i <- which(unexposed %!in% sampled[seq_len(k)])\n      i <- i[sample.int(length(i), min(length(i), n))]\n      sampled[seq.int(k + 1L, along.with = i)] <- unexposed[i]\n      k <- k + length(i)\n      .SD[i]\n    }, exposed\n  ]\n}\n
Run Code Online (Sandbox Code Playgroud)\n

其他候选函数:

\n
ftmfmnk <- function(data, n) {\n  groups <- unique(data[["exposed"]])\n  out <- tibble(exposed = integer(), unexposed = integer())\n  \n  for (group in groups) {\n    chosen <- data %>%\n      filter(\n        exposed == group,\n        !unexposed %in% out$unexposed\n      ) %>%\n      filter(!duplicated(unexposed)) %>%\n      sample_n(size = min(n, nrow(.)))\n    \n    out <- bind_rows(out, chosen)\n  }\n  \n  out\n}\n\nfminem <- function(data, n) {\n  groups <- unique(data[["exposed"]])\n  # working on vectors is faster\n  id <- 1:nrow(data)\n  i <- vector("integer")\n  unexposed2 <- vector(class(data$unexposed))\n  ex <- data$exposed\n  ux <- data$unexposed\n  \n  for (group in groups) {\n    f1 <- ex == group # first filter\n    f2 <- !ux[f1] %in% unexposed2 # 2nd filter (only on those that match 1st)\n    id3 <- id[f1][f2][!duplicated(ux[f1][f2])] # check duplicates only on needed\n    # and select necesary row ids\n    is <- sample(id3, size = min(length(id3), n)) # sample row ids\n    i <- c(i, is) # add to list\n    unexposed2 <- ux[i] # resave unexposed2\n  }\n  out <- data[i, ] # only one data.frame subset\n  out$id <- NULL\n  out\n}\n\nftic <- function(df, n) {\n  lst <- split(df, ~exposed)[as.character(unique(df$exposed))]\n  pool <- c()\n  for (k in seq_along(lst)) {\n    d <- lst[[k]]\n    dd <- subset(d[sample.int(nrow(d)), ], !duplicated(unexposed) & !unexposed %in% pool)\n    lst[[k]] <- head(dd, n)\n    pool <- c(pool, dd$unexposed)\n  }\n  `rownames<-`(do.call(rbind, lst), NULL)\n}\n\n`%nin%` <- function (x, table) {\n  match(x, table, nomatch = 0L) == 0L\n}\n\nfmnist_fast <- function(dat, n) {\n  # make a named vector\n  vec <- setNames(dat$unexposed, dat$exposed)\n  \n  # Randomly shuffle the groups\n  ord <- unique(names(vec))\n  \n  # random order the vector\n  vec <- sample(vec, size = length(vec))\n  \n  # keep the names for sampling and later\n  nms <- names(vec)\n  \n  \n  # note that there is no need to sample again since both the order \n  # of the groups as well as the order within the groups (vec) was sampled.\n  \n  # by setting an initial value, we avoid an exception in the first iteration\n  init <- head(unique(vec[nms == ord[1]]), n)\n  \n  # `unique()` drops the names \n  # yet names are needed to retain the initial data structure\n  names(init) <- rep(ord[1], length(init))\n  \n  res <- Reduce(\n    \\(x, y) {\n      # current subgroup\n      vec_sub <- vec[nms == y]\n      \n      # without unique, duplicated values inside of the group \n      # can be selected multiple times\n      res_vec <- head(unique(vec_sub[vec_sub %nin% x]), n)\n      \n      # in case all values of the current group are already sampled\n      len_vec <- length(res_vec)\n      if (len_vec == 0) {\n        return(x)\n      }\n      \n      # `unique()` drops the names \n      # yet names are needed to retain the initial data structure\n      names(res_vec) <- rep(y, len_vec)\n      \n      # concate\n      return(c(x, res_vec))\n    },\n    # first group is already in init\n    ord[-1], \n    # first group, randomly sampled (see above)\n    init = init\n  )\n  \n  # back to a data.frame\n  res_dat <- data.frame(exposed   = names(res),\n                        unexposed = res)\n  return(res_dat)\n}\n
Run Code Online (Sandbox Code Playgroud)\n

比较问题中数据集的结果(用另外两列来演示fjblood2):

\n
df <- data.frame(exposed = c(4L, 4L, 4L, 1L, 1L, 1L, 3L, 2L, 2L, 2L, 2L, 2L),\n                 unexposed = c(1L, 2L, 2L, 1L, 2L, 3L, 10L, 4L, 5L, 7L, 8L, 9L),\n                 rowid = 1:12)\n\ndt <- as.data.table(df)\nfjblood1(dt, 2L)\n#>    exposed unexposed\n#> 1:       4         1\n#> 2:       4         2\n#> 3:       1         3\n#> 4:       3        10\n#> 5:       2         4\n#> 6:       2         7\nfjblood2(dt, 2L)\n#>    exposed unexposed rowid\n#> 1:       4         2     3\n#> 2:       4         1     1\n#> 3:       1         3     6\n#> 4:       3        10     7\n#> 5:       2         8    11\n#> 6:       2         9    12\nftmfmnk(df, 2L)\n#> # A tibble: 6 \xc3\x97 3\n#>   exposed unexposed rowid\n#>     <int>     <int> <int>\n#> 1       4         1     1\n#> 2       4         2     2\n#> 3       1         3     6\n#> 4       3        10     7\n#> 5       2         4     8\n#> 6       2         9    12\nfminem(df, 2L)\n#>    exposed unexposed rowid\n#> 1        4         1     1\n#> 2        4         2     2\n#> 3        4         2     3\n#> 5        1         2     5\n#> 10       2         7    10\n#> 9        2         5     9\nfmnist_fast(df, 2L)\n#>   exposed unexposed\n#> 1       4         2\n#> 2       4         1\n#> 3       1         3\n#> 4       3        10\n#> 5       2         5\n#> 6       2         8\nftic(df, 2L)\n#>   exposed unexposed rowid\n#> 1       4         2     3\n#> 2       4         1     1\n#> 3       1         3     6\n#> 4       3        10     7\n#> 5       2         7    10\n#> 6       2         5     9\n
Run Code Online (Sandbox Code Playgroud)\n

正如更新的问题中所指出的,它的fminem行为不符合预期。

\n

使用更大的数据集进行基准测试:

\n
set.seed(123)\ndf <- tibble(\n  exposed = rep(1:1000, each = 1000),\n  unexposed = sample(7e4, 1e6, 1),\n  data1 = runif(1e6),\n  data2 = sample(LETTERS, 1e6, 1)\n)\ndt <- as.data.table(df)\n\nmicrobenchmark::microbenchmark(\n  ftmfmnk = ftmfmnk(df, 100L),\n  fminem = fminem(df, 100L),\n  ftic = ftic(df, 100L),\n  fmnist_fast(df, 100L),\n  fjblood1 = fjblood1(dt, 100L),\n  fjblood2 = fjblood2(dt, 100L),\n  times = 1\n)\n#> Unit: milliseconds\n#>                   expr        min         lq       mean     median         uq        max neval\n#>                ftmfmnk 74362.9330 74362.9330 74362.9330 74362.9330 74362.9330 74362.9330     1\n#>                 fminem  5675.6071  5675.6071  5675.6071  5675.6071  5675.6071  5675.6071     1\n#>                   ftic  4624.8753  4624.8753  4624.8753  4624.8753  4624.8753  4624.8753     1\n#>  fmnist_fast(df, 100L) 16919.1535 16919.1535 16919.1535 16919.1535 16919.1535 16919.1535     1\n#>               fjblood1   943.9751   943.9751   943.9751   943.9751   943.9751   943.9751     1\n#>               fjblood2   909.6813   909.6813   909.6813   909.6813   909.6813   909.6813     1\n
Run Code Online (Sandbox Code Playgroud)\n

另一个具有更多复制和排除的基准ftmfmnk

\n
microbenchmark::microbenchmark(\n  # ftmfmnk = ftmfmnk(df, 100L),\n  fminem = fminem(df, 100L),\n  ftic = ftic(df, 100L),\n  fmnist_fast(df, 100L),\n  fjblood1 = fjblood1(dt, 100L),\n  fjblood2 = fjblood2(dt, 100L),\n  times = 10\n)\n#> Unit: milliseconds\n#>                   expr        min         lq       mean    median         uq        max neval\n#>                 fminem  5518.5713  5550.8040  5614.1822  5600.498  5615.9687  5788.3764    10\n#>                   ftic  4370.4639  4412.8366  4467.5108  4434.879  4558.6254  4607.8332    10\n#>  fmnist_fast(df, 100L) 16483.5461 16524.7059 16639.4094 16602.269 16708.1953 17001.8668    10\n#>               fjblood1   618.8821   628.6334   644.4724   636.097   646.6322   693.7889    10\n#>               fjblood2   707.1110   730.6251   756.7017   754.079   788.2097   814.0123    10\n
Run Code Online (Sandbox Code Playgroud)\n


Tho*_*ing 5

下面的代码适用于之前的问题和旧数据集。有关问题和数据集最新更新的最新解决方案,请参阅


免责声明

我(个人)更喜欢使用基本 R 进行编码,这里有一些选项,因为我相信基本 R 有助于理解底层机制,并且即使没有额外的包也可以实现相当高的效率。

当然,如果你只追求纯粹的速度,你也应该参考其他高级包和相应的方法,就像@jblood所做的那样


基础 R 选项

  • Reduce
ftic1 <- function(data, n) {
    Reduce(
        \(x, y) {
            rbind(x, head(subset(y, !unexposed %in% x$unexposed), n))
        },
        split(data[sample(1:nrow(data)), ], ~exposed)[as.character(unique(data$exposed))]
    )
}
Run Code Online (Sandbox Code Playgroud)
  • for循环:版本 1
ftic2 <- function(data, n) {
    with(
        data,
        {
            v <- unique(exposed)
            lst <- vector("list", length(v))
            q <- c()
            for (k in seq_along(v)) {
                p <- unexposed[(!unexposed %in% q) & exposed == v[k]]
                if (length(p) == 0) {
                    lst[[k]] <- NULL
                } else {
                    u <- head(p[sample.int(length(p))], n)
                    lst[[k]] <- cbind(exposed = v[k], unexposed = u)
                    q <- c(q, u)
                }
            }
            as.data.frame(do.call(rbind, lst))
        }
    )
}
Run Code Online (Sandbox Code Playgroud)
  • for循环:版本 2
ftic3 <- function(data, n) {
    lst <- with(data, split(unexposed, exposed)[as.character(unique(exposed))])
    pool <- c()
    for (k in seq_along(lst)) {
        v <- lst[[k]]
        u <- v[!v %in% pool]
        lst[k] <- ifelse(length(u) > n, list(sample(u, n)), list(u))
        pool <- c(pool, lst[[k]])
    }
    type.convert(setNames(rev(stack(lst)), names(data)), as.is = TRUE)
}
Run Code Online (Sandbox Code Playgroud)

输出示例

根据dfOP的问题给出

> ftic1(df, 2)
   exposed unexposed
1        4         1
2        4         2
5        1         3
6        3        10
10       2         8
8        2         5

> ftic2(df, 2)
  exposed unexposed
1       4         1
2       4         2
3       1         3
4       3        10
5       2         7
6       2         4

> ftic3(df, 2)
  exposed unexposed
1       4         1
2       4         2
3       1         3
4       3        10
5       2         9
6       2         8
Run Code Online (Sandbox Code Playgroud)

基准测试(具有不同的数据集大小)

方法列举如下

fjblood1 <- function(data, n) {
    setDT(data)
    sampled <- vector(mode(data$unexposed), nrow(data))
    k <- 0L
    data[
        , .(
            unexposed = {
                x <- unexposed[unexposed %!in% sampled[seq_len(k)]]
                x <- x[sample.int(length(x), min(length(x), n))]
                sampled[seq.int(k + 1L, along.with = x)] <- x
                k <- k + length(x)
                x
            }
        ), exposed
    ]
}

fjblood2 <- function(data, n) {
    setDT(data)
    sampled <- vector(mode(data$unexposed), nrow(data))
    k <- 0L
    data[
        ,
        {
            i <- which(unexposed %!in% sampled[seq_len(k)])
            i <- i[sample.int(length(i), min(length(i), n))]
            sampled[seq.int(k + 1L, along.with = i)] <- unexposed[i]
            k <- k + length(i)
            .SD[i]
        },
        exposed
    ]
}

ftmfmnk <- function(data, n) {
    groups <- unique(data[["exposed"]])
    out <- tibble(exposed = integer(), unexposed = integer())

    for (group in groups) {
        chosen <- data %>%
            filter(
                exposed == group,
                !unexposed %in% out$unexposed
            ) %>%
            filter(!duplicated(unexposed)) %>%
            sample_n(size = min(n, nrow(.)))

        out <- bind_rows(out, chosen)
    }

    out
}

fminem <- function(data, n) {
    groups <- unique(data[["exposed"]])
    # working on vectors is faster
    id <- 1:nrow(data)
    i <- vector("integer")
    unexposed2 <- vector(class(data$unexposed))
    ex <- data$exposed
    ux <- data$unexposed

    for (group in groups) {
        f1 <- ex == group # first filter
        f2 <- !ux[f1] %in% unexposed2 # 2nd filter (only on those that match 1st)
        id3 <- id[f1][f2][!duplicated(ux[f1][f2])] # check duplicates only on needed
        # and select necesary row ids
        is <- sample(id3, size = min(length(id3), n)) # sample row ids
        i <- c(i, is) # add to list
        unexposed2 <- ux[i] # resave unexposed2
    }
    out <- data[i, ] # only one data.frame subset
    out$id <- NULL
    out
}


ftic1 <- function(data, n) {
    Reduce(
        \(x, y) {
            rbind(x, head(subset(y, !unexposed %in% x$unexposed), n))
        },
        split(data[sample(1:nrow(data)), ], ~exposed)[as.character(unique(data$exposed))]
    )
}

ftic2 <- function(data, n) {
    with(
        data,
        {
            v <- unique(exposed)
            lst <- vector("list", length(v))
            q <- c()
            for (k in seq_along(v)) {
                p <- unexposed[(!unexposed %in% q) & exposed == v[k]]
                if (length(p) == 0) {
                    lst[[k]] <- NULL
                } else {
                    u <- head(p[sample.int(length(p))], n)
                    lst[[k]] <- cbind(exposed = v[k], unexposed = u)
                    q <- c(q, u)
                }
            }
            as.data.frame(do.call(rbind, lst))
        }
    )
}

ftic3 <- function(data, n) {
    lst <- with(data, split(unexposed, exposed)[as.character(unique(exposed))])
    pool <- c()
    for (k in seq_along(lst)) {
        v <- lst[[k]]
        u <- v[!v %in% pool]
        lst[k] <- ifelse(length(u) > n, list(sample(u, n)), list(u))
        pool <- c(pool, lst[[k]])
    }
    type.convert(setNames(rev(stack(lst)), names(data)), as.is = TRUE)
}
Run Code Online (Sandbox Code Playgroud)
  • 小尺寸

使用更新问题中给出的数据集,我们运行以下基准测试脚本

df <- structure(
    list(exposed = c(
        4L, 4L, 1L, 1L, 1L, 3L, 2L, 2L, 2L,
        2L, 2L
    ), unexposed = c(
        1L, 2L, 1L, 2L, 3L, 10L, 4L, 5L, 7L, 8L,
        9L
    )),
    class = "data.frame", row.names = c(
        "1", "2", "3", "4",
        "5", "6", "7", "8", "9", "10", "11"
    )
)
n <- 2
dt <- as.data.table(df)
microbenchmark(
    fjblood1 = fjblood1(dt, n),
    fjblood2 = fjblood2(dt, n),
    fminem = fminem(df, n),
    ftmfmnk = ftmfmnk(df, n),
    ftic1 = ftic1(df, n),
    ftic2 = ftic2(df, n),
    ftic3 = ftic3(df,n),
    unit = "relative"
)
Run Code Online (Sandbox Code Playgroud)

这使

Unit: relative
     expr         min        lq      mean    median        uq       max neval
 fjblood1   6.4168151  5.142746  5.714648  5.137915  5.182090 21.810226   100
 fjblood2  10.2215142  8.260134  9.204300  8.501112  8.128708 22.865106   100
   fminem   0.9815306  1.075249  1.996025  1.091704  1.052034 31.366451   100
  ftmfmnk 116.7095553 98.740732 92.170153 95.355659 85.750936 73.738831   100
    ftic1   8.7822192  7.385250  7.638289  7.132669  6.762140 18.621511   100
    ftic2   1.0000000  1.000000  1.000000  1.000000  1.000000  1.000000   100
    ftic3   4.2110394  4.059356  4.015434  4.008809  3.705119  8.273886   100
Run Code Online (Sandbox Code Playgroud)
  • 大尺寸
set.seed(123)
df <- tibble(
    exposed = rep(1:1000, each = 1000),
    unexposed = c(replicate(1e3, sample(7e4, 1e3)))
)
dt <- as.data.table(df)
n <- 2L
microbenchmark(
    fjblood1 = fjblood1(dt, n),
    fjblood2 = fjblood2(dt, n),
    ftic1 = ftic1(df, n),
    ftic2 = ftic2(df, n),
    ftic3 = ftic3(df, n),
    times = 10L,
    unit = "relative"
)
Run Code Online (Sandbox Code Playgroud)

节目

Unit: relative
     expr        min         lq       mean     median         uq         max
 fjblood1   1.000000   1.000000   1.000000   1.000000   1.000000   1.0000000
 fjblood2   2.771467   2.472605   2.360890   2.563041   2.723441   1.3771934
    ftic1  10.580318  12.251636   9.301784  10.176871  10.005448   5.2545817
    ftic2 557.732618 556.806101 417.405326 462.315480 417.127467 206.5138370
    ftic3   1.628257   1.514933   1.423689   1.720916   1.607287   0.7996517
 neval
    10
    10
    10
    10
    10
Run Code Online (Sandbox Code Playgroud)
  • 尺寸更大
set.seed(123)
df <- tibble(
    exposed = rep(1:1000, each = 1000),
    unexposed = c(replicate(1e3, sample(7e4, 1e3)))
)
dt <- as.data.table(df)
n <- 100L
microbenchmark(
    fjblood1 = fjblood1(dt, n),
    ftic3 = ftic3(df, n),
    times = 20L,
    unit = "relative"
)
Run Code Online (Sandbox Code Playgroud)

节目

Unit: relative
     expr      min       lq     mean   median      uq      max neval
 fjblood1 1.000000 1.000000 1.000000 1.000000 1.00000 1.000000    20
    ftic3 5.955011 5.107471 4.364985 4.545183 4.01909 3.064281    20
Run Code Online (Sandbox Code Playgroud)