更快的组合版本

maj*_*jom 12 combinations r combn data.table

有没有办法加快combn命令,以获得从矢量中取出的2个元素的所有独特组合?

通常这将设置如下:

# Get latest version of data.table
library(devtools)
install_github("Rdatatable/data.table",  build_vignettes = FALSE)  
library(data.table)

# Toy data
d <- data.table(id=as.character(paste0("A", 10001:15000))) 

# Transform data 
system.time({
d.1 <- as.data.table(t(combn(d$id, 2)))
})
Run Code Online (Sandbox Code Playgroud)

但是,combn使用data.table计算所有可能的组合要慢10倍(23秒对比我的计算机3秒).

system.time({
d.2 <- d[, list(neighbor=d$id[-which(d$id==id)]), by=c("id")]
})
Run Code Online (Sandbox Code Playgroud)

处理非常大的向量,我正在寻找一种通过仅计算唯一组合(如combn)来节省内存的方法,但是使用data.table的速度(参见第二个代码片段).

我感谢任何帮助.

akr*_*run 20

你可以使用combnPrimgRbase

source("http://bioconductor.org/biocLite.R")
biocLite("gRbase") # will install dependent packages automatically.
system.time({
 d.1 <- as.data.table(t(combn(d$id, 2)))
 })
#   user  system elapsed 
# 27.322   0.585  27.674 

system.time({
d.2 <- as.data.table(t(combnPrim(d$id,2)))
 })
#   user  system elapsed 
#  2.317   0.110   2.425 

identical(d.1[order(V1, V2),], d.2[order(V1,V2),])
#[1] TRUE
Run Code Online (Sandbox Code Playgroud)

  • @Aurèle这个包不是基于CRAN repository.at我写的答案.现在,我认为它与[this]相同(https://cran.r-project.org/web/packages/gRbase/index.html) (2认同)

Aru*_*run 17

这是一种使用data.table功能的方式foverlaps(),结果也很快!

require(data.table) ## 1.9.4+
d[, `:=`(id1 = 1L, id2 = .I)] ## add interval columns for overlaps
setkey(d, id1, id2)

system.time(olaps <- foverlaps(d, d, type="within", which=TRUE)[xid != yid])
#  0.603   0.062   0.717
Run Code Online (Sandbox Code Playgroud)

请注意,foverlaps() 不会计算所有排列.xid != yid需要子集来移除自身重叠.通过实现ignoreSelf参数可以更有效地内部处理子集- 类似于IRanges::findOverlaps.

现在只需使用获得的ID执行子集:

system.time(ans <- setDT(list(d$id[olaps$xid], d$id[olaps$yid])))
#   0.576   0.047   0.662 
Run Code Online (Sandbox Code Playgroud)

总共,~1.4秒.


优点是你可以采用相同的方式,即使你的data.table d有超过1列你要获得组合,并使用相同数量的内存(因为我们返回索引).在这种情况下,你只需:

cbind(d[olaps$xid, your_cols, with=FALSE], d[olaps$yid, your_cols, with=FALSE])
Run Code Online (Sandbox Code Playgroud)

但它仅限于替换combn(., 2L).不超过2L.

  • Fwiw,这也相当快:`system.time(res < - d [,r:= .I] [d,on =.(r> r),nomatch = 0])`来自http:// stackoverflow. COM/A/43315317 / (2认同)

Ani*_*jee 9

这是使用Rcpp的解决方案.

library(Rcpp)
library(data.table)
cppFunction('
Rcpp::DataFrame combi2(Rcpp::CharacterVector inputVector){
    int len = inputVector.size();
    int retLen = len * (len-1) / 2;
    Rcpp::CharacterVector outputVector1(retLen);
    Rcpp::CharacterVector outputVector2(retLen);
    int start = 0;
    for (int i = 0; i < len; ++i){
        for (int j = i+1; j < len; ++j){
            outputVector1(start) = inputVector(i);
            outputVector2(start) = inputVector(j);
            ++start;
            }
        }
    return(Rcpp::DataFrame::create(Rcpp::Named("id") = outputVector1,
                              Rcpp::Named("neighbor") = outputVector2));
};
')

# Toy data
d <- data.table(id=as.character(paste0("A", 10001:15000))) 

system.time({
    d.2 <- d[, list(neighbor=d$id[-which(d$id==id)]), by=c("id")]
    })
#  1.908   0.397   2.389

system.time({
    d[, `:=`(id1 = 1L, id2 = .I)] ## add interval columns for overlaps
    setkey(d, id1, id2)
    olaps <- foverlaps(d, d, type="within", which=TRUE)[xid != yid]
    ans <- setDT(list(d$id[olaps$xid], d$id[olaps$yid]))
    })
#  0.653   0.038   0.705

system.time(ans2 <- combi2(d$id))
#  1.377   0.108   1.495 
Run Code Online (Sandbox Code Playgroud)

使用Rcpp函数获取索引然后形成data.table,效果更好.

cppFunction('
Rcpp::DataFrame combi2inds(const Rcpp::CharacterVector inputVector){
const int len = inputVector.size();
const int retLen = len * (len-1) / 2;
Rcpp::IntegerVector outputVector1(retLen);
Rcpp::IntegerVector outputVector2(retLen);
int indexSkip;
for (int i = 0; i < len; ++i){
    indexSkip = len * i - ((i+1) * i)/2;
    for (int j = 0; j < len-1-i; ++j){
        outputVector1(indexSkip+j) = i+1;
        outputVector2(indexSkip+j) = i+j+1+1;
        }
    }
return(Rcpp::DataFrame::create(Rcpp::Named("xid") = outputVector1,
                          Rcpp::Named("yid") = outputVector2));
};
')

system.time({
        indices <- combi2inds(d$id)
        ans2 <- setDT(list(d$id[indices$xid], d$id[indices$yid]))
        })      
#  0.389   0.027   0.425 
Run Code Online (Sandbox Code Playgroud)


Jos*_*ood 8

如果没有基准测试,标题中包含快速单词的任何变体的帖子都是不完整的.在我们发布任何基准测试之前,我想提一下,自从发布此问题以来,已经发布了两个高度优化的软件包arrangementsRcppAlgos(我是作者)用于生成组合R.

为了让你过一个自己的速度的想法2.3.0RcppAlgos,这里是一个基本标杆:

## We test generating just over 3 million combinations
choose(25, 10)
[1] 3268760

microbenchmark(arrngmnt = arrangements::combinations(25, 10),
               combn = combn(25, 10),
               gRBase = gRbase::combnPrim(25, 10),
               serAlgos = RcppAlgos::comboGeneral(25, 10),
               parAlgos = RcppAlgos::comboGeneral(25, 10, nThreads = 4),
               unit = "relative", times = 20)
Unit: relative
    expr        min         lq       mean     median         uq        max neval
arrngmnt   2.979378   3.072319   1.898390   3.756307   2.139258  0.4842967    20
   combn 226.470755 230.410716 118.157110 232.905393 125.718512 17.7778585    20
  gRBase  34.219914  34.209820  18.789954  34.218320  19.934485  3.6455493    20
serAlgos   2.836651   3.078791   2.458645   3.703929   2.231475  1.1652445    20
parAlgos   1.000000   1.000000   1.000000   1.000000   1.000000  1.0000000    20
Run Code Online (Sandbox Code Playgroud)

现在,我们基于生成组合的特定情况发布的其他函数进行基准测试,选择2并生成combn对象.

功能如下:

funAkraf <- function(d) {
    a <- comb2.int(length(d$id))      ## comb2.int from the answer given by @akraf
    setDT(list(V1 = d$id[a[,1]], V2 = d$id[a[,2]]))
}

funAnirban <- function(d) {
    indices <- combi2inds(d$id)
    ans2 <- setDT(list(d$id[indices$xid], d$id[indices$yid]))
    ans2
}

funArun <- function(d) {
    d[, `:=`(id1 = 1L, id2 = .I)] ## add interval columns for overlaps
    setkey(d, id1, id2)
    olaps <- foverlaps(d, d, type="within", which=TRUE)[xid != yid]
    ans <- setDT(list(d$id[olaps$xid], d$id[olaps$yid]))
    ans
}

funArrangements <- function(d) {
  a <- arrangements::combinations(x = d$id, k = 2)
  setDT(list(a[, 1], a[, 2]))
}

funGRbase <- function(d) {
  a <- gRbase::combnPrim(d$id,2)
  setDT(list(a[1, ], a[2, ]))
}

funOPCombn <- function(d) {
  a <- combn(d$id, 2)
  setDT(list(a[1, ], a[2, ]))
}

funRcppAlgos <- function(d) {
  a <- RcppAlgos::comboGeneral(d$id, 2, nThreads = 4)
  setDT(list(a[, 1], a[, 2]))
}
Run Code Online (Sandbox Code Playgroud)

以下是OP给出的示例的基准:

d <- data.table(id=as.character(paste0("A", 10001:15000))) 

microbenchmark(funAkraf(d),
               funAnirban(d),
               funArrangements(d),
               funArun(d),
               funGRbase(d),
               funOPCombn(d),
               funRcppAlgos(d),
               times = 10, unit = "relative")
    Unit: relative
              expr       min        lq      mean    median        uq       max neval
       funAkraf(d)  3.220550  2.971264  2.815023  2.665616  2.344018  3.383673    10
     funAnirban(d)  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000    10
funArrangements(d)  1.464730  1.689231  1.834650  1.960233  1.932361  1.693305    10
        funArun(d)  3.256889  2.908075  2.634831  2.729180  2.432277  2.193849    10
      funGRbase(d)  3.513847  3.340637  3.327845  3.196399  3.291480  3.129362    10
     funOPCombn(d) 30.310469 26.255374 21.656376 22.386270 18.527904 15.626261    10
   funRcppAlgos(d)  1.676808  1.956696  1.943773  2.085968  1.949133  1.804180    10
Run Code Online (Sandbox Code Playgroud)

我们看到@AnirbanMukherjee提供的功能是这项任务最快的,其次是gRbase::combnPrim/ data.table(非常接近的时间).

他们都给出了相同的结果:

dFac <- d
dFac$id <- as.factor(dFac$id)

library(microbenchmark)
microbenchmark(funAkraf(dFac),
               funAnirban(dFac),
               funArrangements(dFac),
               funArun(dFac),
               funGRbase(dFac),
               funOPCombn(dFac),
               funRcppAlgos(dFac),
               times = 10, unit = "relative")
Unit: relative
                 expr        min         lq      mean   median        uq       max   neval
       funAkraf(dFac)  10.898202  10.949896  7.589814 10.01369  8.050005  5.557014      10
     funAnirban(dFac)   3.104212   3.337344  2.317024  3.00254  2.471887  1.530978      10
funArrangements(dFac)   2.054116   2.058768  1.858268  1.94507  2.797956  1.691875      10
        funArun(dFac)  10.646680  12.905119  7.703085 11.50311  8.410893  3.802155      10
      funGRbase(dFac)  16.523356  21.609917 12.991400 19.73776 13.599870  6.498135      10
     funOPCombn(dFac) 108.301876 108.753085 64.338478 95.56197 65.494335 28.183104      10
   funRcppAlgos(dFac)   1.000000   1.000000  1.000000  1.00000  1.000000  1.000000      10 
Run Code Online (Sandbox Code Playgroud)

感谢@Frank指出如何比较两个,RcppAlgos而不是经历创建新的麻烦arrangements,然后安排它们:

identical(funAkraf(d), funOPCombn(d))
#[1] TRUE
identical(funAkraf(d), funArrangements(d))
#[1] TRUE
identical(funRcppAlgos(d), funArrangements(d))
#[1] TRUE
identical(funRcppAlgos(d), funAnirban(d))
#[1] TRUE
identical(funRcppAlgos(d), funArun(d))
#[1] TRUE

## different order... we must sort
identical(funRcppAlgos(d), funGRbase(d))
[1] FALSE
d1 <- funGRbase(d)
d2 <- funRcppAlgos(d)

## now it's the same
identical(d1[order(V1, V2),], d2[order(V1,V2),])
#[1] TRUE
Run Code Online (Sandbox Code Playgroud)

  • 仅供参考,用于与订单无关的比较:`DT = data.table(a = 1:2); fsetequal(DT, DT[2:1])` (2认同)

akr*_*ica 5


如果您不想使用额外的依赖项,这里有两个基本 R 解决方案:

  • comb2.int使用rep和其他序列生成函数来生成所需的输出。

  • comb2.mat创建一个矩阵,用于upper.tri()获取上三角形并which(..., arr.ind = TRUE)获取列索引和行索引=>所有组合。

可能性一:comb2.int

comb2.int <- function(n, rep = FALSE){
  if(!rep){
    # e.g. n=3 => (1,2), (1,3), (2,3)
    x <- rep(1:n,(n:1)-1)
    i <- seq_along(x)+1
    o <- c(0,cumsum((n-2):1))
    y <- i-o[x]
  }else{
    # e.g. n=3 => (1,1), (1,2), (1,3), (2,2), (2,3), (3,3)
    x <- rep(1:n,n:1)
    i <- seq_along(x)
    o <- c(0,cumsum(n:2))
    y <- i-o[x]+x-1
  }
  return(cbind(x,y))
}
Run Code Online (Sandbox Code Playgroud)

可能性2:comb2.mat

comb2.mat <- function(n, rep = FALSE){
  # Use which(..., arr.ind = TRUE) to get coordinates.
  m <- matrix(FALSE, nrow = n, ncol = n)
  idxs <- which(upper.tri(m, diag = rep), arr.ind = TRUE)
  return(idxs)
}
Run Code Online (Sandbox Code Playgroud)

这些函数给出的结果与以下相同combn(.)

for(i in 2:8){
  # --- comb2.int ------------------
  stopifnot(comb2.int(i) == t(combn(i,2)))
  # => Equal

  # --- comb2.mat ------------------
  m <- comb2.mat(i)
  colnames(m) <- NULL   # difference 1: colnames
  m <- m[order(m[,1]),] # difference 2: output order
  stopifnot(m == t(combn(i,2)))
  # => Equal up to above differences
}
Run Code Online (Sandbox Code Playgroud)

但我的向量中还有除顺序整数之外的其他元素!

使用返回值作为索引:

v <- LETTERS[1:5]                                     
c <- comb2.int(length(v))                             
cbind(v[c[,1]], v[c[,2]])                             
#>       [,1] [,2]
#>  [1,] "A"  "B" 
#>  [2,] "A"  "C" 
#>  [3,] "A"  "D" 
#>  [4,] "A"  "E" 
#>  [5,] "B"  "C" 
#>  [6,] "B"  "D" 
#>  [7,] "B"  "E" 
#>  [8,] "C"  "D" 
#>  [9,] "C"  "E" 
#> [10,] "D"  "E"
Run Code Online (Sandbox Code Playgroud)

基准:

时间( combn) = ~5x 时间( comb2.mat) = ~80x 时间( comb2.int):

library(microbenchmark)

n <- 800
microbenchmark({
  comb2.int(n)
},{
  comb2.mat(n)
},{
  t(combn(n, 2))
})
#>   Unit: milliseconds
#>                    expr        min         lq       mean     median        uq       max neval
#>    {     comb2.int(n) }   4.394051   4.731737   6.350406   5.334463   7.22677  14.68808   100
#>    {     comb2.mat(n) }  20.131455  22.901534  31.648521  24.411782  26.95821 297.70684   100
#>  {     t(combn(n, 2)) } 363.687284 374.826268 391.038755 380.012274 389.59960 532.30305   100
Run Code Online (Sandbox Code Playgroud)