use*_*817 8 performance r vector
我有两个向量:
set.seed(1)
a <- sample(1:100,200, replace=T)
b <- sample(1:100,40, replace=F)
Run Code Online (Sandbox Code Playgroud)
a我想找到该匹配中元素的位置b:
sapply(b, function(x) which(a %in% x))
Run Code Online (Sandbox Code Playgroud)
这可以完成工作,但需要很长时间
有没有一种方法可以将结果存储在实际上很快的列表中?
所需的输出如下所示:
sapply(b, function(x) which(a %in% x))
[[1]]
integer(0)
[[2]]
integer(0)
[[3]]
[1] 107 142 199
[[4]]
[1] 109 126
[[5]]
[1] 136 167
[[6]]
integer(0)
[[7]]
integer(0)
[[8]]
[1] 73 91 176
[[9]]
[1] 51 146 181
Run Code Online (Sandbox Code Playgroud)
您可以将a沿a 的split序列按b分类 ( ) 。factor
split(seq_along(a), factor(a, b))\nRun Code Online (Sandbox Code Playgroud)\n如果需要,结果可以是未命名的。
\nunname(split(seq_along(a), factor(a, b)))\nRun Code Online (Sandbox Code Playgroud)\n或者减少到b中的那些。
\n. <- which(a %in% b)\nsplit(., factor(a[.], b))\nRun Code Online (Sandbox Code Playgroud)\n或者更接近 @Ritchie Sacramento 和 @ThomasIsCoding 的方法并使用fastmatch.
library(fastmatch)\n. <- which(a %fin% b)\nsplit(., factor(fmatch(a[.], b), 1:length(b)))\nRun Code Online (Sandbox Code Playgroud)\n或者使用collapse::rsplit.
library(collapse)\nrsplit(NULL, a)[as.character(b)] #Returns NA where others return numeric(0)\nRun Code Online (Sandbox Code Playgroud)\n或者collapse和的一些组合fastmatch。
library(fastmatch)\nLibrary(collapse)\n\n. <- fmatch(a, b)\nattr(., "levels") <- b\nclass(.) <- "factor" #WARNING Please don't use that\ngsplit(NULL, .)[-length(b)-1]\n\ni <- which(a %fin% b)\n. <- fmatch(a[i], b)\nattr(., "levels") <- b\nclass(.) <- "factor" #WARNING Please don't use that\ngsplit(i, .)\n\n. <- fmatch(a, b)\ni <- which(!is.na(.))\nattr(., "levels") <- b\nclass(.) <- "factor" #WARNING Please don't use that\ngsplit(i, .[i])\nRun Code Online (Sandbox Code Playgroud)\n或者使用 rcpp 用 C++ 编写。
\nRcpp::cppFunction("\nRcpp::List getIdx(const Rcpp::IntegerVector& a,\n const Rcpp::IntegerVector& b) {\n std::unordered_map<int, std::vector<int> > m;\n for(auto const& i : b) m[i].clear();\n for(int i=0; i<=a.size(); ++i) {\n auto j = m.find(a[i]);\n if(j != m.end()) j->second.push_back(i+1);\n }\n std::vector< std::vector<int> > res;\n for(auto const& i : b) res.push_back(std::move(m[i]));\n return wrap( res );\n}")\ngetIdx(a, b)\nRun Code Online (Sandbox Code Playgroud)\n基准
\nlibrary(collapse) #For Mael and rsplit, gsplit\nlibrary(fastmatch) #For fmatch\nlibrary(data.table) #For jblood94\n\nmethods = alist(original = sapply(b, function(x) which(a %in% x)),\n Thomas = unname(by(seq_along(a), a, list)[as.character(b)]), #Not equal the original -> check = FALSE\n "==" = lapply(b, function(x) which(a == x)),\n Christian = {res = rep_len(list(integer(0)), length(b))\n comm = intersect(b, a)\n res[match(comm, b)] = lapply(comm, function(x) which(a==x))\n res},\n Mael1 = lapply(b, function(x) a %==% x),\n Mael2 = lapply(b, function(x) whichv(a, x)),\n Ritchie = { m <- b[match(a, b)]\n idx <- !is.na(m)\n unname(split(which(idx), factor(match(m[idx], b), levels = seq.int(length(b)))))},\n jblood94 = {out <- rep(list(integer(0)), length(b))\n dt <- data.table(i = (i <- which(a %fin% b)), a = a[i])[, .(i = .(i)), a]\n out[fmatch(dt[[1]], b)] <- dt[[2]]\n out},\n split = split(seq_along(a), factor(a, b)), #Returns named list\n splitu = unname(split(seq_along(a), factor(a, b))),\n splitIn = {. <- which(a %in% b); split(., factor(a[.], b))}, #Returns named list\n fmatch = {. <- which(a %fin% b) #Returns named list\n split(., factor(fmatch(a[.], b), 1:length(b))) },\n rcpp = getIdx(a, b),\n rsplit = rsplit(NULL, a)[as.character(b)],\n fmCol1 = {. <- fmatch(a, b); attr(., "levels") <- b\n class(.) <- "factor"; gsplit(NULL, .)[-length(b)-1]},\n fmCol2 = {i <- which(a %fin% b); . <- fmatch(a[i], b); attr(., "levels") <- b\n class(.) <- "factor"; gsplit(i, .)},\n fmCol3 = {. <- fmatch(a, b); i <- which(!is.na(.)); attr(., "levels") <- b\n class(.) <- "factor"; gsplit(i, .[i])}\n)\nRun Code Online (Sandbox Code Playgroud)\n结果
\nset.seed(1)\na <- sample(1:100, 200, replace=TRUE)\nb <- sample(1:100, 40)\nbench::mark(check = FALSE, exprs = methods)\n\n expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total\xe2\x80\xa6\xc2\xb9\n <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:t>\n 1 original 110.13\xc2\xb5s 123.87\xc2\xb5s 7619. 3.97MB 22.4 3405 10 447ms\n 2 Thomas 940.95\xc2\xb5s 993.35\xc2\xb5s 1000. 256.16KB 36.2 442 16 442ms\n 3 == 66.82\xc2\xb5s 74.11\xc2\xb5s 12243. 117.27KB 29.0 5482 13 448ms\n 4 Christian 64.22\xc2\xb5s 70.38\xc2\xb5s 13075. 83.47KB 30.2 5636 13 431ms\n 5 Mael1 39.7\xc2\xb5s 42.81\xc2\xb5s 22750. 41.07KB 32.4 9130 13 401ms\n 6 Mael2 42.09\xc2\xb5s 45.83\xc2\xb5s 21051. 41.12KB 31.4 9384 14 446ms\n 7 Ritchie 18.22\xc2\xb5s 20.33\xc2\xb5s 47043. 49.07KB 28.2 9994 6 212ms\n 8 jblood94 348.23\xc2\xb5s 372.25\xc2\xb5s 2613. 1.96MB 23.5 1221 11 467ms\n 9 split 14.74\xc2\xb5s 16.53\xc2\xb5s 58574. 8.68KB 17.6 9997 3 171ms\n10 splitu 15.2\xc2\xb5s 17.08\xc2\xb5s 56286. 8.68KB 22.5 9996 4 178ms\n11 splitIn 15.17\xc2\xb5s 16.95\xc2\xb5s 56171. 10.23KB 28.1 9995 5 178ms\n12 fmatch 15.49\xc2\xb5s 17.33\xc2\xb5s 54217. 9KB 21.7 9996 4 184ms\n13 rcpp 6.14\xc2\xb5s 6.75\xc2\xb5s 140790. 2.85KB 14.1 9999 1 71ms\n14 rsplit 22.46\xc2\xb5s 25.09\xc2\xb5s 37062. 9.84KB 29.7 9992 8 270ms\n15 fmCol1 13.62\xc2\xb5s 15.44\xc2\xb5s 61478. 42.27KB 30.8 9995 5 163ms\n16 fmCol2 12.64\xc2\xb5s 14.2\xc2\xb5s 68176. 4.46KB 27.3 9996 4 147ms\n17 fmCol3 15.31\xc2\xb5s 17.02\xc2\xb5s 55035. 15.62KB 38.6 9993 7 182ms\nRun Code Online (Sandbox Code Playgroud)\nset.seed(42)\na <- sample(1:10000, 20000, replace=TRUE)\nb <- sample(1:10000, 4000)\nbench::mark(check = FALSE, exprs = methods)\n\n expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total\xe2\x80\xa6\xc2\xb9\n <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:t>\n 1 original 698.85ms 698.85ms 1.43 1.19GB 35.8 1 25 699ms\n 2 Thomas 115.5ms 118.76ms 8.45 2.6MB 42.2 5 25 592ms\n 3 == 137.85ms 165.14ms 5.73 610.78MB 72.6 3 38 524ms\n 4 Christian 118.31ms 123.18ms 8.13 527.1MB 89.5 5 55 615ms\n 5 Mael1 45.27ms 47.62ms 18.4 305.42MB 116. 10 63 544ms\n 6 Mael2 43.84ms 45.76ms 21.9 305.42MB 137. 11 69 503ms\n 7 Ritchie 2.71ms 2.77ms 352. 1.21MB 7.96 177 4 503ms\n 8 jblood94 4.7ms 6.02ms 163. 649.87KB 29.8 82 15 503ms\n 9 split 4.01ms 4.1ms 238. 752.3KB 3.99 119 2 501ms\n10 splitu 4.02ms 4.18ms 229. 752.3KB 3.98 115 2 503ms\n11 splitIn 2.68ms 2.8ms 346. 863.79KB 5.97 174 3 502ms\n12 fmatch 2.36ms 2.51ms 383. 769.34KB 5.98 192 3 501ms\n13 rcpp 875.88\xc2\xb5s 896.47\xc2\xb5s 930. 33.79KB 8.00 465 4 500ms\n14 rsplit 1.97ms 2.06ms 464. 902.81KB 12.0 232 6 500ms\n15 fmCol1 854.57\xc2\xb5s 893.17\xc2\xb5s 1035. 672.34KB 16.0 518 8 500ms\n16 fmCol2 410.06\xc2\xb5s 425.88\xc2\xb5s 2156. 407.27KB 22.0 1078 11 500ms\n17 fmCol3 365.01\xc2\xb5s 381.88\xc2\xb5s 2285. 454KB 26.0 1143 13 500ms\nRun Code Online (Sandbox Code Playgroud)\nset.seed(1) #Taken from jblood94, Test only a selection of methods\na <- sample(1:1e6, 2e6, replace = TRUE)\nb <- sample(1:1e6, 4e5)\nbench::mark(check = FALSE, exprs = methods[-1:-6], min_iterations = 7)\n\n expression min median `itr/sec` mem_alloc gc/se\xe2\x80\xa6\xc2\xb9 n_itr n_gc total_\xe2\x80\xa6\xc2\xb2\n <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>\n 1 Ritchie 593.69ms 789.89ms 1.29 127.36MB 2.20 7 12 5.45s\n 2 jblood94 500.65ms 616.8ms 1.47 57.11MB 4.84 7 23 4.75s\n 3 split 1.15s 1.28s 0.767 79.14MB 0.658 7 6 9.12s\n 4 splitu 1.25s 1.39s 0.724 75.14MB 0.517 7 5 9.67s\n 5 splitIn 650.54ms 884.24ms 1.15 86.79MB 0.987 7 6 6.08s\n 6 fmatch 598.77ms 626.69ms 1.55 76.69MB 1.11 7 5 4.5s\n 7 rcpp 382.63ms 389.04ms 2.48 3.05MB 0.355 7 1 2.82s\n 8 rsplit 618.36ms 723.24ms 1.39 84.64MB 1.59 7 8 5.05s\n 9 fmCol1 187.54ms 235.62ms 3.91 65.61MB 3.35 7 6 1.79s\n10 fmCol2 64.61ms 67.62ms 13.7 39.68MB 5.87 7 3 511.42ms\n11 fmCol3 58.71ms 64.26ms 14.4 44.26MB 5.41 8 3 554.08ms\nRun Code Online (Sandbox Code Playgroud)\n
我们可以玩一个技巧,并by通过索引索引列表as.character(b)(但是,我想说这个选项很有趣,但不快),例如,
> unname(by(seq_along(a), a, list)[as.character(b)])
[[1]]
NULL
[[2]]
NULL
[[3]]
[1] 107 142 199
[[4]]
[1] 109 126
[[5]]
[1] 136 167
[[6]]
NULL
[[7]]
NULL
[[8]]
[1] 73 91 176
[[9]]
[1] 51 146 181
[[10]]
[1] 191
[[11]]
[1] 55 118
[[12]]
[1] 192
[[13]]
[1] 40 64 110 165
[[14]]
[1] 20 22 122 175
[[15]]
[1] 137 189
[[16]]
[1] 134
[[17]]
[1] 128
[[18]]
[1] 17 81 184
[[19]]
NULL
[[20]]
[1] 188 194
[[21]]
[1] 98 180
[[22]]
[1] 62 145
[[23]]
[1] 33
[[24]]
NULL
[[25]]
[1] 47
[[26]]
NULL
[[27]]
[1] 29 114 159
[[28]]
[1] 18 26 171
[[29]]
[1] 28 69 186 200
[[30]]
[1] 42
[[31]]
[1] 79 158 190
[[32]]
[1] 5 38 58 82
[[33]]
[1] 35 74 121
[[34]]
[1] 150
[[35]]
[1] 34 36 139
[[36]]
[1] 70 100 117 195
[[37]]
NULL
[[38]]
[1] 32 46 102
[[39]]
[1] 89 133
[[40]]
[1] 127 129 160
Run Code Online (Sandbox Code Playgroud)