以列表形式返回匹配元素的快速方法

use*_*817 8 performance r vector

我有两个向量:

set.seed(1)

a <- sample(1:100,200, replace=T)

b <- sample(1:100,40, replace=F)
Run Code Online (Sandbox Code Playgroud)

a我想找到该匹配中元素的位置b

sapply(b, function(x) which(a %in% x))
Run Code Online (Sandbox Code Playgroud)

这可以完成工作,但需要很长时间

有没有一种方法可以将结果存储在实际上很快的列表中?

所需的输出如下所示:

sapply(b, function(x) which(a %in% x))
[[1]]
integer(0)

[[2]]
integer(0)

[[3]]
[1] 107 142 199

[[4]]
[1] 109 126

[[5]]
[1] 136 167

[[6]]
integer(0)

[[7]]
integer(0)

[[8]]
[1]  73  91 176

[[9]]
[1]  51 146 181
Run Code Online (Sandbox Code Playgroud)

GKi*_*GKi 5

您可以将a沿a 的split序列按b分类 ( ) 。factor

\n
split(seq_along(a), factor(a, b))\n
Run Code Online (Sandbox Code Playgroud)\n

如果需要,结果可以是未命名的。

\n
unname(split(seq_along(a), factor(a, b)))\n
Run Code Online (Sandbox Code Playgroud)\n

或者减少到b中的那些。

\n
. <- which(a %in% b)\nsplit(., factor(a[.], b))\n
Run Code Online (Sandbox Code Playgroud)\n

或者更接近 @Ritchie Sacramento 和 @ThomasIsCoding 的方法并使用fastmatch.

\n
library(fastmatch)\n. <- which(a %fin% b)\nsplit(., factor(fmatch(a[.], b), 1:length(b)))\n
Run Code Online (Sandbox Code Playgroud)\n

或者使用collapse::rsplit.

\n
library(collapse)\nrsplit(NULL, a)[as.character(b)]  #Returns NA where others return numeric(0)\n
Run Code Online (Sandbox Code Playgroud)\n

或者collapse和的一些组合fastmatch

\n
library(fastmatch)\nLibrary(collapse)\n\n. <- fmatch(a, b)\nattr(., "levels") <- b\nclass(.) <- "factor"  #WARNING Please don't use that\ngsplit(NULL, .)[-length(b)-1]\n\ni <- which(a %fin% b)\n. <- fmatch(a[i], b)\nattr(., "levels") <- b\nclass(.) <- "factor"  #WARNING Please don't use that\ngsplit(i, .)\n\n. <- fmatch(a, b)\ni <- which(!is.na(.))\nattr(., "levels") <- b\nclass(.) <- "factor"  #WARNING Please don't use that\ngsplit(i, .[i])\n
Run Code Online (Sandbox Code Playgroud)\n

或者使用 rcpp 用 C++ 编写。

\n
Rcpp::cppFunction("\nRcpp::List getIdx(const Rcpp::IntegerVector& a,\n                  const Rcpp::IntegerVector& b) {\n  std::unordered_map<int, std::vector<int> > m;\n  for(auto const& i : b) m[i].clear();\n  for(int i=0; i<=a.size(); ++i) {\n    auto j = m.find(a[i]);\n    if(j != m.end()) j->second.push_back(i+1);\n  }\n  std::vector< std::vector<int> > res;\n  for(auto const& i : b) res.push_back(std::move(m[i]));\n  return wrap( res );\n}")\ngetIdx(a, b)\n
Run Code Online (Sandbox Code Playgroud)\n

基准

\n
library(collapse) #For Mael and rsplit, gsplit\nlibrary(fastmatch) #For fmatch\nlibrary(data.table) #For jblood94\n\nmethods = alist(original = sapply(b, function(x) which(a %in% x)),\n Thomas = unname(by(seq_along(a), a, list)[as.character(b)]), #Not equal the original -> check = FALSE\n "==" = lapply(b, function(x) which(a == x)),\n Christian = {res  = rep_len(list(integer(0)), length(b))\n                comm = intersect(b, a)\n                res[match(comm, b)] = lapply(comm, function(x) which(a==x))\n                res},\n Mael1 = lapply(b, function(x) a %==% x),\n Mael2 = lapply(b, function(x) whichv(a, x)),\n Ritchie = { m <- b[match(a, b)]\n    idx <- !is.na(m)\n    unname(split(which(idx), factor(match(m[idx], b), levels = seq.int(length(b)))))},\n jblood94 = {out <- rep(list(integer(0)), length(b))\n   dt <- data.table(i = (i <- which(a %fin% b)), a = a[i])[, .(i = .(i)), a]\n   out[fmatch(dt[[1]], b)] <- dt[[2]]\n   out},\n split = split(seq_along(a), factor(a, b)), #Returns named list\n splitu = unname(split(seq_along(a), factor(a, b))),\n splitIn = {. <- which(a %in% b); split(., factor(a[.], b))},  #Returns named list\n fmatch = {. <- which(a %fin% b)   #Returns named list\n   split(., factor(fmatch(a[.], b), 1:length(b))) },\n rcpp = getIdx(a, b),\n rsplit = rsplit(NULL, a)[as.character(b)],\n fmCol1 = {. <- fmatch(a, b); attr(., "levels") <- b\n    class(.) <- "factor"; gsplit(NULL, .)[-length(b)-1]},\n fmCol2 = {i <- which(a %fin% b); . <- fmatch(a[i], b); attr(., "levels") <- b\n    class(.) <- "factor"; gsplit(i, .)},\n fmCol3 = {. <- fmatch(a, b); i <- which(!is.na(.)); attr(., "levels") <- b\n    class(.) <- "factor"; gsplit(i, .[i])}\n)\n
Run Code Online (Sandbox Code Playgroud)\n

结果

\n
set.seed(1)\na <- sample(1:100, 200, replace=TRUE)\nb <- sample(1:100, 40)\nbench::mark(check = FALSE, exprs = methods)\n\n   expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total\xe2\x80\xa6\xc2\xb9\n   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl> <bch:t>\n 1 original   110.13\xc2\xb5s 123.87\xc2\xb5s     7619.    3.97MB     22.4  3405    10   447ms\n 2 Thomas     940.95\xc2\xb5s 993.35\xc2\xb5s     1000.  256.16KB     36.2   442    16   442ms\n 3 ==          66.82\xc2\xb5s  74.11\xc2\xb5s    12243.  117.27KB     29.0  5482    13   448ms\n 4 Christian   64.22\xc2\xb5s  70.38\xc2\xb5s    13075.   83.47KB     30.2  5636    13   431ms\n 5 Mael1        39.7\xc2\xb5s  42.81\xc2\xb5s    22750.   41.07KB     32.4  9130    13   401ms\n 6 Mael2       42.09\xc2\xb5s  45.83\xc2\xb5s    21051.   41.12KB     31.4  9384    14   446ms\n 7 Ritchie     18.22\xc2\xb5s  20.33\xc2\xb5s    47043.   49.07KB     28.2  9994     6   212ms\n 8 jblood94   348.23\xc2\xb5s 372.25\xc2\xb5s     2613.    1.96MB     23.5  1221    11   467ms\n 9 split       14.74\xc2\xb5s  16.53\xc2\xb5s    58574.    8.68KB     17.6  9997     3   171ms\n10 splitu       15.2\xc2\xb5s  17.08\xc2\xb5s    56286.    8.68KB     22.5  9996     4   178ms\n11 splitIn     15.17\xc2\xb5s  16.95\xc2\xb5s    56171.   10.23KB     28.1  9995     5   178ms\n12 fmatch      15.49\xc2\xb5s  17.33\xc2\xb5s    54217.       9KB     21.7  9996     4   184ms\n13 rcpp         6.14\xc2\xb5s   6.75\xc2\xb5s   140790.    2.85KB     14.1  9999     1    71ms\n14 rsplit      22.46\xc2\xb5s  25.09\xc2\xb5s    37062.    9.84KB     29.7  9992     8   270ms\n15 fmCol1      13.62\xc2\xb5s  15.44\xc2\xb5s    61478.   42.27KB     30.8  9995     5   163ms\n16 fmCol2      12.64\xc2\xb5s   14.2\xc2\xb5s    68176.    4.46KB     27.3  9996     4   147ms\n17 fmCol3      15.31\xc2\xb5s  17.02\xc2\xb5s    55035.   15.62KB     38.6  9993     7   182ms\n
Run Code Online (Sandbox Code Playgroud)\n
set.seed(42)\na <- sample(1:10000, 20000, replace=TRUE)\nb <- sample(1:10000, 4000)\nbench::mark(check = FALSE, exprs = methods)\n\n   expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total\xe2\x80\xa6\xc2\xb9\n   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl> <bch:t>\n 1 original   698.85ms 698.85ms      1.43    1.19GB    35.8      1    25   699ms\n 2 Thomas      115.5ms 118.76ms      8.45     2.6MB    42.2      5    25   592ms\n 3 ==         137.85ms 165.14ms      5.73  610.78MB    72.6      3    38   524ms\n 4 Christian  118.31ms 123.18ms      8.13   527.1MB    89.5      5    55   615ms\n 5 Mael1       45.27ms  47.62ms     18.4   305.42MB   116.      10    63   544ms\n 6 Mael2       43.84ms  45.76ms     21.9   305.42MB   137.      11    69   503ms\n 7 Ritchie      2.71ms   2.77ms    352.      1.21MB     7.96   177     4   503ms\n 8 jblood94      4.7ms   6.02ms    163.    649.87KB    29.8     82    15   503ms\n 9 split        4.01ms    4.1ms    238.     752.3KB     3.99   119     2   501ms\n10 splitu       4.02ms   4.18ms    229.     752.3KB     3.98   115     2   503ms\n11 splitIn      2.68ms    2.8ms    346.    863.79KB     5.97   174     3   502ms\n12 fmatch       2.36ms   2.51ms    383.    769.34KB     5.98   192     3   501ms\n13 rcpp       875.88\xc2\xb5s 896.47\xc2\xb5s    930.     33.79KB     8.00   465     4   500ms\n14 rsplit       1.97ms   2.06ms    464.    902.81KB    12.0    232     6   500ms\n15 fmCol1     854.57\xc2\xb5s 893.17\xc2\xb5s   1035.    672.34KB    16.0    518     8   500ms\n16 fmCol2     410.06\xc2\xb5s 425.88\xc2\xb5s   2156.    407.27KB    22.0   1078    11   500ms\n17 fmCol3     365.01\xc2\xb5s 381.88\xc2\xb5s   2285.       454KB    26.0   1143    13   500ms\n
Run Code Online (Sandbox Code Playgroud)\n
set.seed(1) #Taken from jblood94, Test only a selection of methods\na <- sample(1:1e6, 2e6, replace = TRUE)\nb <- sample(1:1e6, 4e5)\nbench::mark(check = FALSE, exprs = methods[-1:-6], min_iterations = 7)\n\n   expression      min   median `itr/sec` mem_alloc gc/se\xe2\x80\xa6\xc2\xb9 n_itr  n_gc total_\xe2\x80\xa6\xc2\xb2\n   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>   <dbl> <int> <dbl> <bch:tm>\n 1 Ritchie    593.69ms 789.89ms     1.29   127.36MB   2.20      7    12    5.45s\n 2 jblood94   500.65ms  616.8ms     1.47    57.11MB   4.84      7    23    4.75s\n 3 split         1.15s    1.28s     0.767   79.14MB   0.658     7     6    9.12s\n 4 splitu        1.25s    1.39s     0.724   75.14MB   0.517     7     5    9.67s\n 5 splitIn    650.54ms 884.24ms     1.15    86.79MB   0.987     7     6    6.08s\n 6 fmatch     598.77ms 626.69ms     1.55    76.69MB   1.11      7     5     4.5s\n 7 rcpp       382.63ms 389.04ms     2.48     3.05MB   0.355     7     1    2.82s\n 8 rsplit     618.36ms 723.24ms     1.39    84.64MB   1.59      7     8    5.05s\n 9 fmCol1     187.54ms 235.62ms     3.91    65.61MB   3.35      7     6    1.79s\n10 fmCol2      64.61ms  67.62ms    13.7     39.68MB   5.87      7     3 511.42ms\n11 fmCol3      58.71ms  64.26ms    14.4     44.26MB   5.41      8     3 554.08ms\n
Run Code Online (Sandbox Code Playgroud)\n


Tho*_*ing 0

我们可以玩一个技巧,并by通过索引索引列表as.character(b)但是,我想说这个选项很有趣,但不快),例如,

> unname(by(seq_along(a), a, list)[as.character(b)])
[[1]]
NULL

[[2]]
NULL

[[3]]
[1] 107 142 199

[[4]]
[1] 109 126

[[5]]
[1] 136 167

[[6]]
NULL

[[7]]
NULL

[[8]]
[1]  73  91 176

[[9]]
[1]  51 146 181

[[10]]
[1] 191

[[11]]
[1]  55 118

[[12]]
[1] 192

[[13]]
[1]  40  64 110 165

[[14]]
[1]  20  22 122 175

[[15]]
[1] 137 189

[[16]]
[1] 134

[[17]]
[1] 128

[[18]]
[1]  17  81 184

[[19]]
NULL

[[20]]
[1] 188 194

[[21]]
[1]  98 180

[[22]]
[1]  62 145

[[23]]
[1] 33

[[24]]
NULL

[[25]]
[1] 47

[[26]]
NULL

[[27]]
[1]  29 114 159

[[28]]
[1]  18  26 171

[[29]]
[1]  28  69 186 200

[[30]]
[1] 42

[[31]]
[1]  79 158 190

[[32]]
[1]  5 38 58 82

[[33]]
[1]  35  74 121

[[34]]
[1] 150

[[35]]
[1]  34  36 139

[[36]]
[1]  70 100 117 195

[[37]]
NULL

[[38]]
[1]  32  46 102

[[39]]
[1]  89 133

[[40]]
[1] 127 129 160
Run Code Online (Sandbox Code Playgroud)