获取列表中共享元素的所有组合

den*_*ble 4 optimization r list

列表:

terms <- list(Item1 = c("a", "b", "c", "d"),
              Item2 = c("a", "e", "f", "g"),
              Item3 = c("b", "e", "h", "i"),
              Item4 = c("j", "k"))
Run Code Online (Sandbox Code Playgroud)

我想获得列表中每对项目之间共享字母的数量.因此预期的输出是:

     [,1] [,2] [,3] [,4]
[1,]    4    1    1    0
[2,]    1    4    1    0
[3,]    1    1    4    0
[4,]    0    0    0    2
Run Code Online (Sandbox Code Playgroud)

从之前的StackOverflow答案中,我找到了一个可能的解决方案:

overlapLength <- function(x, y) mapply(function(x, y) 
  length(intersect(x, y)), terms[x], terms[y])
s <- seq_along(terms)
outer(s, s, overlapLength)
Run Code Online (Sandbox Code Playgroud)

但这对我的名单来说非常慢,这个名单非常大(约9,000件).

有更快的方法吗?


谢谢各位的意见.我用我列表中的前100个项目计算了所有答案.

> system.time(f_crossprod(go))
   user  system elapsed 
  0.024   0.001   0.025 
> system.time(f_crossprod2(go))
   user  system elapsed 
  0.007   0.000   0.008 
> system.time(f_mapply(go))
   user  system elapsed 
  2.018   0.032   2.059 
> system.time(f_outer(go))
   user  system elapsed 
  1.950   0.016   1.979 
> system.time(f_combn(go))
   user  system elapsed 
  1.056   0.005   1.062 
> system.time(f_Rcpp(go))
   user  system elapsed 
163.236  84.226 249.240 
Run Code Online (Sandbox Code Playgroud)

然后我用~9,000个元素的整个列表计算outerMatrix::crossprod解决方案.该outer解决方案在大约55分钟内完成.该Matrix::crossprod解决方案在跑了约0.1秒!

我可能在实现Rcpp函数时出错了.但是,@ alexis_laz如果你发表评论我将接受它.

顺便说一句,抱歉我不清楚,我对对角线上的值不感兴趣.

akr*_*run 8

我们可以用 outer

outer(names(terms), names(terms), FUN = function(x,y) 
              lengths(Map(intersect, terms[x], terms[y])))
#     [,1] [,2] [,3] [,4]
#[1,]    4    1    1    0
#[2,]    1    4    1    0
#[3,]    1    1    4    0
#[4,]    0    0    0    2
Run Code Online (Sandbox Code Playgroud)

或者更紧凑

outer(terms, terms, FUN = function(...) lengths(Map(intersect, ...)))
#      Item1 Item2 Item3 Item4
#Item1     4     1     1     0
#Item2     1     4     1     0
#Item3     1     1     4     0
#Item4     0     0     0     2
Run Code Online (Sandbox Code Playgroud)

我们也可以实现这一点Rcpp.以下是test1.cpp文件

#include <Rcpp.h>
#include <math.h>

using namespace Rcpp;
//[[Rcpp::export]]

List foo(List xs) {
    List x(xs);
    List x1 = Rcpp::clone(xs);
    List y1 = Rcpp::clone(xs);
    int n = x1.size();



    NumericVector res;


    for( int i=0; i<n; i++){
        for(int j=0; j<n; j++){
         CharacterVector xd = x1[i];
         CharacterVector yd = y1[j];

        res.push_back(intersect(xd, yd).length());
        }
    }
    return wrap(res) ;
Run Code Online (Sandbox Code Playgroud)

我们称之为R使用

library(Rcpp)
sourceCpp("test1.cpp")
`dim<-`(unlist(foo(terms)), c(4, 4))
#     [,1] [,2] [,3] [,4]
#[1,]    4    1    1    0
#[2,]    1    4    1    0
#[3,]    1    1    4    0
#[4,]    0    0    0    2
Run Code Online (Sandbox Code Playgroud)

基准

除了上面的函数之外,我们还包含了另一个版本,其中包含RcppEigen此处发布的实现

n <- 100
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10), 
         replace = TRUE)), paste0("Item", seq_len(n)))

library(Matrix)
library(inline)
library(Rcpp)

alexis1 <- function() {crossprod(table(stack(terms1)))}
alexis2 <-  function() {Matrix::crossprod(xtabs( ~ values + ind, 
            stack(terms1), sparse = TRUE)) }

akrun1 <- function(){outer(terms1, terms1, FUN = function(...) lengths(Map(intersect, ...)))}
akrun2 <- function() {`dim<-`(unlist(foo(terms1)), c(n, n))}
akrun3 <- function() {tbl <- table(stack(terms1))
                      funCPr(tbl, tbl)[[1]]}

db <- function() {do.call(rbind, lapply(1:length(terms1), function(i)
    sapply(terms1, function(a)
        sum(unlist(terms1[i]) %in% unlist(a)))))} 
lmo <- function() { setNames(data.frame(t(combn(names(terms1), 2)),
                      combn(seq_along(terms1), 2,
                            function(x) length(intersect(terms1[[x[1]]], terms1[[x[2]]])))),
         c("col1", "col2", "counts"))}
Run Code Online (Sandbox Code Playgroud)

和为基准输出Ñ100

library(microbenchmark)
microbenchmark(alexis1(), alexis2(),   akrun1(), akrun2(),akrun3(), db(), lmo(),
           unit = "relative", times = 10L)
#Unit: relative
#      expr        min         lq       mean     median         uq        max neval  cld
# alexis1()   1.035975   1.032101   1.031239   1.010472   1.044217   1.129092    10 a   
# alexis2()   3.896928   3.656585   3.461980   3.386301   3.335469   3.288161    10 a   
#  akrun1() 218.456708 207.099841 198.391784 189.356065 188.542712 214.415661    10    d
#  akrun2()  84.239272  79.073087  88.594414  75.719853  78.277769 129.731990    10  b  
#  akrun3()   1.000000   1.000000   1.000000   1.000000   1.000000   1.000000    10 a   
#      db()  86.921164  82.201117  80.358097  75.113471  73.311414 105.761977    10  b  
#     lmo() 125.128109 123.203318 118.732911 113.271352 113.164333 138.075212    10   c 
Run Code Online (Sandbox Code Playgroud)

随着略高ñ200

n <- 200
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10),
      replace = TRUE)), paste0("Item", seq_len(n)))

microbenchmark(alexis1(), alexis2(), akrun3(), db(), unit = "relative", times = 10L)
#Unit: relative
#      expr        min         lq       mean     median         uq        max neval cld
# alexis1()   1.117234   1.164198   1.181280   1.166070   1.230077   1.229899    10 a  
# alexis2()   3.428904   3.425942   3.337112   3.379675   3.280729   3.164852    10  b 
#  akrun3()   1.000000   1.000000   1.000000   1.000000   1.000000   1.000000    10 a  
#      db() 219.971285 219.577403 207.793630 213.232359 196.122420 187.433635    10   c
Run Code Online (Sandbox Code Playgroud)

随着ñ定在9000

n <- 9000
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10), 
                replace = TRUE)), paste0("Item", seq_len(n)))
microbenchmark(alexis1(),alexis2(),  akrun3(), unit = "relative", times = 10L)
#Unit: relative
#     expr      min       lq     mean   median       uq      max neval cld
# alexis1() 2.048708 2.021709 2.009396 2.085750 2.141060 1.767329    10  b 
# alexis2() 3.520220 3.518339 3.419368 3.616512 3.515993 2.952927    10   c
#  akrun3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    10 a  
Run Code Online (Sandbox Code Playgroud)

检查输出

res1 <- alexis1()
res2 <- akrun3()
res3 <- alexis2()
all.equal(res1, res2, check.attributes = FALSE)
#[1] TRUE
all.equal(res1, as.matrix(res3), check.attributes = FALSE)
#[1] TRUE
Run Code Online (Sandbox Code Playgroud)

基于来自@alexis_laz的评论,我们还包含了3个函数来替换该table/stack部分以比较n at 的效率9000

alexis3 <- function() {
    unlt = unlist(terms1, use.names = FALSE)
    u = unique(unlt)
    tab = matrix(0L, length(u), length(terms1), dimnames = list(u, names(terms1)))
    tab[cbind(match(unlt, u), rep(seq_along(terms1), lengths(terms1)))] = 1L
    crossprod(tab, tab)
    }

alexis4 <- function() {
        unlt = unlist(terms1, use.names = FALSE)
        u = unique(unlt)

       tab = sparseMatrix(x = 1L, i = match(unlt, u),
           j = rep(seq_along(terms1), lengths(terms1)), dimnames = list(u, names(terms1)))

       Matrix::crossprod(tab, tab, sparse = TRUE)
       }

akrun4 <- function() {
        unlt = unlist(terms1, use.names = FALSE)
        u = unique(unlt)
        tab = matrix(0L, length(u), length(terms1), dimnames = list(u, names(terms1)))
        tab[cbind(match(unlt, u), rep(seq_along(terms1), lengths(terms1)))] = 1L
        funCPr(tab, tab)[[1]]
      }
Run Code Online (Sandbox Code Playgroud)

而且基准是

microbenchmark(alexis1(),alexis2(), alexis3(), alexis4(),
         akrun3(), akrun4(),  unit = "relative", times = 10L)
#Unit: relative
#      expr       min        lq     mean   median       uq      max neval  cld
# alexis1() 2.1888254 2.2897883 2.204237 2.169618 2.162955 2.122552    10  b  
# alexis2() 3.7651292 3.9178071 3.672550 3.616577 3.587886 3.426039    10   c 
# alexis3() 2.1776887 2.2410663 2.197293 2.137106 2.192834 2.241645    10  b  
# alexis4() 4.1640895 4.3431379 4.262192 4.187449 4.388335 4.172607    10    d
#  akrun3() 1.0000000 1.0000000 1.000000 1.000000 1.000000 1.000000    10 a   
#  akrun4() 0.9364288 0.9692772 1.043292 1.063931 1.090301 1.171245    10 a   
Run Code Online (Sandbox Code Playgroud)


lmo*_*lmo 5

这用于combn产生术语组合的数据框,其中术语的值不同.setNames添加变量名称.

result <- setNames(data.frame(t(combn(names(terms), 2)),
                      combn(seq_along(terms), 2,
                            function(x) length(intersect(terms[[x[1]]], terms[[x[2]]])))),
         c("col1", "col2", "counts"))
Run Code Online (Sandbox Code Playgroud)

返回

result
   col1  col2 counts
1 Item1 Item2      1
2 Item1 Item3      1
3 Item1 Item4      0
4 Item2 Item3      1
5 Item2 Item4      0
6 Item3 Item4      0
Run Code Online (Sandbox Code Playgroud)

如果需要,您可以使用lengths获得自己的长度,然后rbind结果如

temp <- lengths(terms)
rbind(result, data.frame(col1=names(temp), col2=names(temp), counts=temp, row.names=NULL))
    col1  col2 counts
1  Item1 Item2      1
2  Item1 Item3      1
3  Item1 Item4      0
4  Item2 Item3      1
5  Item2 Item4      0
6  Item3 Item4      0
7  Item1 Item1      4
8  Item2 Item2      4
9  Item3 Item3      4
10 Item4 Item4      2
Run Code Online (Sandbox Code Playgroud)