Car*_*oft 6 r set gmp set-intersection
该包的当前版本gmp不支持集合操作,例如intersect、setdiff等。我正在使用数字序列进行一些工作(有关示例,请参阅OEIS)并且需要处理大整数的大集合。我目前一直坚持使用各种循环来生成所需的差异或交叉点;虽然我可能可以生成编译的(Rccp 等)代码,但我希望在现有的函数和包中找到一种方法R。
使用bignum而不是gmp返回一个character。而恢复需要时间。
library(bignum)\n\nt1 <- biginteger(1:4)\nt2 <- biginteger(3:6)\nintersect(t1, t2)\n#[1] "3" "4"\n\nbiginteger(intersect(t1, t2))\n#<biginteger[2]>\n#[1] 3 4\nRun Code Online (Sandbox Code Playgroud)\n将 a 存储bigz为listavector.
library(gmp)\n\nintersect(as.bigz(1:4), as.bigz(3:6))\n#Big Integer ('bigz') object of length 2:\n#[1] 1 2\n\nintersect(as.list(as.bigz(1:4)), as.list(as.bigz(3:6)))\n#[[1]]\n#Big Integer ('bigz') :\n#[1] 3\n#\n#[[2]]\n#Big Integer ('bigz') :\n#[1] 4\n\nsetdiff(as.list(as.bigz(1:4)), as.list(as.bigz(3:6)))\n#[[1]]\n#Big Integer ('bigz') :\n#[1] 1\n#\n#[[2]]\n#Big Integer ('bigz') :\n#[1] 2\n\nf3 <- as.list(as.bigz(c(3,5,9,6,4)))\nf4 <- as.list(as.bigz(c(6,7,8,10,9)))\nintersect(f3, f4)\n#[[1]]\n#Big Integer ('bigz') :\n#[1] 9\n#\n#[[2]]\n#Big Integer ('bigz') :\n#[1] 6\nRun Code Online (Sandbox Code Playgroud)\n不幸的是这要慢得多比将其转换为字符
\n对于相交duplicated可以使用但这也比转换为字符慢。
. <- c(unique(as.bigz(1:4)), unique(as.bigz(3:6)))\n.[duplicated(.)]\n#Big Integer ('bigz') object of length 2:\n#[1] 3 4\nRun Code Online (Sandbox Code Playgroud)\n并且将每个元素相互比较也更慢:
\nt1 <- unique(as.bigz(1:4))\nt2 <- unique(as.bigz(3:6))\nt1[sapply(t1, function(x) any(x == t2))]\n#Big Integer ('bigz') object of length 2:\n#[1] 3 4\nRun Code Online (Sandbox Code Playgroud)\n在这种情况下,边际更快的是:
\nt1 <- kit::funique(as.character(as.bigz(1:4)))\nt2 <- as.character(as.bigz(3:6));\nas.bigz(t1[fastmatch::fmatch(t1, t2, 0L) > 0L])\n#Big Integer ('bigz') object of length 2:\n#[1] 3 4\nRun Code Online (Sandbox Code Playgroud)\n以及一个简单的 Rcpp 版本。
\nlibrary(Rcpp)\nsourceCpp(code=r"(\n#include <Rcpp.h>\n#include <list>\n#include <cstring>\nusing namespace Rcpp;\n// [[Rcpp::export]]\nRawVector fun(const RawVector &x, const RawVector &y) {\n std::list<uint32_t const*> xAdr;\n std::list<uint32_t const*> yAdr;\n const uint32_t *px = (const uint32_t *) &x[0];\n const uint32_t *py = (const uint32_t *) &y[0];\n uint32_t nextNr = 1;\n for(uint_fast32_t j=0; j<px[0]; ++j) {\n uint32_t n = px[nextNr] + 2;\n auto i = xAdr.cbegin();\n for(; i != xAdr.cend(); ++i) {\n if(std::memcmp(&px[nextNr], *i, n * sizeof(uint32_t)) == 0) break;\n }\n if(i == xAdr.cend()) xAdr.push_back(&px[nextNr]);\n nextNr += n;\n }\n nextNr = 1;\n for(uint_fast32_t j=0; j<py[0]; ++j) {\n yAdr.push_back(&py[nextNr]);\n nextNr += py[nextNr] + 2;;\n }\n uint32_t l=1;\n for(auto j = xAdr.cbegin(); j != xAdr.cend(); ) {\n auto i = yAdr.cbegin();\n for(; i != yAdr.cend(); ++i) {\n if(std::memcmp(*j, *i, (**j + 2) * sizeof(uint32_t)) == 0) {\n yAdr.erase(i);\n break;\n }\n }\n if(i == yAdr.cend()) j = xAdr.erase(j);\n else {l += **j + 2; ++j;}\n }\n RawVector res(Rcpp::no_init(l * sizeof(uint32_t)));\n res.attr("class") = "bigz";\n uint32_t *p = (uint32_t *) &res[0];\n p[0] = xAdr.size();\n nextNr = 1;\n for(auto j = xAdr.cbegin(); j != xAdr.cend(); ++j) {\n std::memcpy(&p[nextNr], *j, (**j + 2) * sizeof(uint32_t));\n nextNr += p[nextNr] + 2;\n }\n return res;\n}\n)")\n\nfun(as.bigz(1:4), as.bigz(3:6))\n#Big Integer ('bigz') object of length 2:\n#[1] 3 4\nRun Code Online (Sandbox Code Playgroud)\n基准
\nlibrary(gmp)\nx <- as.bigz("10000000000000000000000000000000000000000")+1:4\ny <- as.bigz("10000000000000000000000000000000000000000")+3:6\n\nlibrary(bignum)\nxb <- biginteger("10000000000000000000000000000000000000000")+1:4\nyb <- biginteger("10000000000000000000000000000000000000000")+3:6\n\nbench::mark(check = FALSE,\n "list" = do.call(c, intersect(as.list(x), as.list(y))),\n "char" = as.bigz(intersect(as.character(x), as.character(y))),\n "dupli" = {. <- c(unique(x), unique(y)); .[duplicated(.)]},\n "loop" = {t1 <- unique(x); t2 <- unique(y); t1[sapply(t1, function(x) any(x == t2))]},\n "own" = {t1 <- as.character(x); t2 <- as.character(y);\n x[!duplicated(t1) & match(t1, t2, 0L) > 0L]},\n "own2" = {t1 <- unique(as.character(x)); t2 <- as.character(y);\n as.bigz(t1[match(t1, t2, 0L) > 0L])},\n "kitFmat" = {t1 <- kit::funique(as.character(x)); t2 <- as.character(y);\n as.bigz(t1[fastmatch::fmatch(t1, t2, 0L) > 0L])},\n "collFmat" = {t1 <- collapse::funique(as.character(x)); t2 <- as.character(y);\n as.bigz(t1[fastmatch::fmatch(t1, t2, 0L) > 0L])},\n "bignum" = biginteger(intersect(xb, yb)),\n "rcppIdx" = fun(x, y),\n )\nRun Code Online (Sandbox Code Playgroud)\n结果
\n expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc\n <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>\n 1 list 135.89\xc2\xb5s 247.15\xc2\xb5s 3147. 0B 2.03 1552 1\n 2 char 15.4\xc2\xb5s 20.79\xc2\xb5s 29933. 0B 12.0 9996 4\n 3 dupli 54.58\xc2\xb5s 86.69\xc2\xb5s 7428. 280B 6.18 3604 3\n 4 loop 80.11\xc2\xb5s 156.36\xc2\xb5s 4198. 6.36KB 8.26 2032 4\n 5 own 15.46\xc2\xb5s 27.09\xc2\xb5s 25254. 0B 5.05 9998 2\n 6 own2 13.69\xc2\xb5s 20.53\xc2\xb5s 28952. 0B 8.69 9997 3\n 7 kitFmat 13.23\xc2\xb5s 20.57\xc2\xb5s 30964. 0B 6.19 9998 2\n 8 collFmat 16.76\xc2\xb5s 21.76\xc2\xb5s 26667. 2.49KB 5.33 9998 2\n 9 bignum 32.12\xc2\xb5s 48.51\xc2\xb5s 10784. 0B 8.47 5090 4\n10 rcppIdx 2.35\xc2\xb5s 3.09\xc2\xb5s 212933. 2.49KB 0 10000 0\nRun Code Online (Sandbox Code Playgroud)\n看起来,转换character和返回是目前最快的方法。即使子集化bigz也比子集化character并将其转换为慢bigz。
\nRcpp 中的版本比最快的其他方法大约快 6-7 倍。