tl;dr \n许多功能请求因维护负担而被 R-core 拒绝,但不是hashtab
(R>4.2.0)。?hashtab
声称可以有效地将键与值关联起来。存在许多其他实现(hash、r2r、hashmap等),以及环境和用户友好的扩展(rlang、RC、R6等)。hashtab
除了对象混淆和任意键之外,我还没有找到比其他更有效的明显用例。
问题
\nhashtab
除了任意键之外,是否具有独特的功能,或者对于某些用例来说,在速度、内存或语法方面是否有明显的好处?
我试图研究环境和hashtab
.
set.seed(1)\nmake_hash <- function(n, keys, values) {\n h <- hashtab("identical", n)\n for(i in seq_along(keys)) sethash(h, keys[i], values[[i]])\n h\n}\nmake_env <- function(n, keys, values) setNames(values, keys) |> list2env(size = n)\n\nget_mem <- function(x) as.numeric(lobstr::obj_size(x)) * 0.001\ncompare <- function(n, keylen) {\n keys <- stringi::stri_rand_strings(n, keylen) \n values <- sample(list(mapply, iris, 1:1e6, "just a string", 1L), n, replace = TRUE)\n ind <- sample(keys, 1)\n h <- make_hash(n, keys, values)\n e <- make_env(n, keys, values)\n data.frame(\n n = n,\n method = c("environment", "hashtab"),\n make_speed = {\n bench::mark(\n make_env(n, keys, values),\n make_hash(n, keys, values),\n check = F\n )$median |> as.character()\n },\n memory = c(get_mem(e), get_mem(h)),\n access = bench::mark(\n e[[ind]],\n gethash(h, ind, NULL), # the natural h[[ind]] is 2x slower\n iterations = 1e4\n )$median |> as.character()\n )\n}\n
Run Code Online (Sandbox Code Playgroud)\n一些性能基准,
\npurrr::map_dfr(c(1e2, 1e3, 1e4, 1e5, 1e6), compare, 10)\n n method make_speed memory access\n1 100 environment 13.8\xc2\xb5s 45.11 200ns\n2 100 hashtab 139.9\xc2\xb5s 41.61 1\xc2\xb5s\n3 1000 environment 110.5\xc2\xb5s 227.56 200ns\n4 1000 hashtab 1.25ms 214.34 1\xc2\xb5s\n5 10000 environment 1.46ms 2041.96 200ns\n6 10000 hashtab 13.64ms 2044.10 1\xc2\xb5s\n7 100000 environment 53.3ms 20185.96 200ns\n8 100000 hashtab 394.9ms 19719.11 1\xc2\xb5s\n9 1000000 environment 2.2s 201625.96 300ns\n10 1000000 hashtab 4.1s 192799.18 1\xc2\xb5s\n
Run Code Online (Sandbox Code Playgroud)\n他们的参考语义,
\ne1 <- new.env()\ne1$hi <- 1\ne2 <- e1\ne2$hi <- 2\ne1$hi # autocompletion\n#> [1] 2\n\nh1 <- hashtab()\nsethash(h1, "hi", 1)\nh2 <- h1\nsethash(h2, "hi", 2)\ngethash(h1, "hi")\n#> [1] 2\n
Run Code Online (Sandbox Code Playgroud)\n批量访问,
\ne1$bye <- 3\nsethash(h1, "bye", 3)\n\neapply(e1, function(x) x)\n#> $hi\n#> [1] 2\n#> $bye\n#> [1] 3\n(function(h) {\n val <- list()\n maphash(h, function(k, v) val[[k]] <<- v)\n val\n})(h1)\n#> $bye\n#> [1] 3\n#> $hi\n#> [1] 2\n
Run Code Online (Sandbox Code Playgroud)\n关键名称,
\ne1[[iris]] <- 5 # error. arbitrary object as key... but why?\nh1[[iris]] <- 5 # works fine\n
Run Code Online (Sandbox Code Playgroud)\n他们的内部结构(解释),发现环境包含hashtab
,
e <- new.env(size = 2)\ne$x <- 5\n.Internal(inspect(e))\n#> @0x00000226b4083c48 04 ENVSXP g0c0 [REF(5)] <0x00000226b4083c48>\n#> ENCLOS:\n#> @0x00000226ac100778 04 ENVSXP g1c0 [MARK,REF(65535),GL,gp=0x8000] #><R_GlobalEnv>\n#> HASHTAB:\n#> @0x00000226b5f6b588 19 VECSXP g0c2 [REF(1)] (len=2, tl=1)\n#> @0x00000226b40b0a70 02 LISTSXP g0c0 [REF(1)] \n#> TAG: @0x00000226aed32ae0 01 SYMSXP g1c0 [MARK,REF(65535)] "x"\n#> @0x00000226b5f3d3a0 14 REALSXP g0c1 [REF(6)] (len=1, tl=0) 5\n#> @0x00000226ac0add90 00 NILSXP g1c0 [MARK,REF(65535)] \n\n# note the (len=8)\nh <- hashtab(size = 2)\nsethash(h, "x", 5)\n.Internal(inspect(h))\n#> @0x00000226b5f5e8e0 19 VECSXP g0c1 [OBJ,REF(9),ATT] (len=1, tl=0)\n#> @0x00000226b4361ab0 22 EXTPTRSXP g0c0 [REF(3)] <0x00000226b4361ab0>\n#> PROTECTED:\n#> @0x00000226b5f4e898 19 VECSXP g0c4 [REF(1)] (len=8, tl=0)\n#> @0x00000226ac0add90 00 NILSXP g1c0 [MARK,REF(65535)] \n#> @0x00000226ac0add90 00 NILSXP g1c0 [MARK,REF(65535)] \n#> @0x00000226ac0add90 00 NILSXP g1c0 [MARK,REF(65535)] \n#> @0x00000226ac0add90 00 NILSXP g1c0 [MARK,REF(65535)] \n#> @0x00000226ac0add90 00 NILSXP g1c0 [MARK,REF(65535)] \n#> ...\n#> TAG:\n#> @0x00000226b235b2e8 13 INTSXP g0c2 [REF(1)] (len=3, tl=0) 1,0,3\n#> ATTRIB:\n#> @0x00000226b4361a78 02 LISTSXP g0c0 [REF(1)] \n#> TAG: @0x00000226ac0ada80 01 SYMSXP g1c0 [MARK,REF(55126),LCK,gp=0x4000] #> "class" (has value)\n#> @0x00000226b5f5e8a8 16 STRSXP g0c1 [REF(65535)] (len=1, tl=0)\n#> @0x00000226b015bb00 09 CHARSXP g1c1 [MARK,REF(320),gp=0x61] [ASCII] #> [cached] "hashtab"\n
Run Code Online (Sandbox Code Playgroud)\n最后,他们在玩具包装中的行为。第一次访问 hashtab 失败,后续访问成功。
\nset.seed(1)\nmake_hash <- function(n, keys, values) {\n h <- hashtab("identical", n)\n for(i in seq_along(keys)) sethash(h, keys[i], values[[i]])\n h\n}\nmake_env <- function(n, keys, values) setNames(values, keys) |> list2env(size = n)\n\nget_mem <- function(x) as.numeric(lobstr::obj_size(x)) * 0.001\ncompare <- function(n, keylen) {\n keys <- stringi::stri_rand_strings(n, keylen) \n values <- sample(list(mapply, iris, 1:1e6, "just a string", 1L), n, replace = TRUE)\n ind <- sample(keys, 1)\n h <- make_hash(n, keys, values)\n e <- make_env(n, keys, values)\n data.frame(\n n = n,\n method = c("environment", "hashtab"),\n make_speed = {\n bench::mark(\n make_env(n, keys, values),\n make_hash(n, keys, values),\n check = F\n )$median |> as.character()\n },\n memory = c(get_mem(e), get_mem(h)),\n access = bench::mark(\n e[[ind]],\n gethash(h, ind, NULL), # the natural h[[ind]] is 2x slower\n iterations = 1e4\n )$median |> as.character()\n )\n}\n
Run Code Online (Sandbox Code Playgroud)\nhashtab
与环境相比,
new.env()
,1我不知道这意味着什么。
\n