使用 tidyverse 将列表嵌套到数据帧:比 tidyr unnest_wider 更快

Cla*_*eri 7 r list unnest tidyr

我通过读取存储视频游戏日志信息的 JSON 获得了一个嵌套列表。列表的时间元素是一个简单的向量,而 inputManagerStates 和syncedProperties 是可能包含 0 个或多个元素的列表。

这是这个问题的后续内容,在一些帮助下,我设法将数据转换为矩形格式。不幸的是,我有很多这样的 JSON 文件,并且unnest_wider运行速度似乎相当慢。

列表:

test_list <- 
  list(list(time = 9.92405605316162, inputManagerStates = list(), 
syncedProperties = list()), list(time = 9.9399995803833, 
inputManagerStates = list(list(inputId = "InputY", buttonState = FALSE, 
    axisValue = 0), list(inputId = "InputX", buttonState = FALSE, 
    axisValue = 0.0501395985484123), list(inputId = "xPos", 
    buttonState = FALSE, axisValue = 5), list(inputId = "yPos", 
    buttonState = FALSE, axisValue = 0.0799999982118607), 
    list(inputId = "zPos", buttonState = FALSE, axisValue = 0), 
    list(inputId = "xRot", buttonState = FALSE, axisValue = 0), 
    list(inputId = "yRot", buttonState = FALSE, axisValue = -0.70664256811142), 
    list(inputId = "zRot", buttonState = FALSE, axisValue = 0), 
    list(inputId = "wRot", buttonState = FALSE, axisValue = 0.707570731639862)), 
syncedProperties = list(list(name = "timeStamp", value = "97,2"))), 
list(time = 9.95659446716309, inputManagerStates = list(list(
    inputId = "InputY", buttonState = FALSE, axisValue = 0), 
    list(inputId = "InputX", buttonState = FALSE, axisValue = 0.0993990004062653), 
    list(inputId = "xPos", buttonState = FALSE, axisValue = 5), 
    list(inputId = "yPos", buttonState = FALSE, axisValue = 0.0799999982118607), 
    list(inputId = "zPos", buttonState = FALSE, axisValue = 0), 
    list(inputId = "xRot", buttonState = FALSE, axisValue = 0), 
    list(inputId = "yRot", buttonState = FALSE, axisValue = -0.705721318721771), 
    list(inputId = "zRot", buttonState = FALSE, axisValue = 0), 
    list(inputId = "wRot", buttonState = FALSE, axisValue = 0.708489596843719)), 
    syncedProperties = list(list(name = "timeStamp", value = "97,21667"))), 
list(time = 20.0626411437988, inputManagerStates = list(list(
    inputId = "InputY", buttonState = FALSE, axisValue = 0.601816594600677), 
    list(inputId = "InputX", buttonState = FALSE, axisValue = 0), 
    list(inputId = "xPos", buttonState = FALSE, axisValue = -1.31777036190033), 
    list(inputId = "yPos", buttonState = FALSE, axisValue = 0.0800001174211502), 
    list(inputId = "zPos", buttonState = FALSE, axisValue = 6.08214092254639), 
    list(inputId = "xRot", buttonState = FALSE, axisValue = 0), 
    list(inputId = "yRot", buttonState = FALSE, axisValue = -0.391442984342575), 
    list(inputId = "zRot", buttonState = FALSE, axisValue = 0), 
    list(inputId = "wRot", buttonState = FALSE, axisValue = 0.920202374458313)), 
    syncedProperties = list(list(name = "timeStamp", value = "107,3167"), 
        list(name = "previousGameState", value = "1"), list(
            name = "newGameState", value = "2"))))
Run Code Online (Sandbox Code Playgroud)

我用来矩形化列表的代码:

test_list <- 
  list(list(time = 9.92405605316162, inputManagerStates = list(), 
syncedProperties = list()), list(time = 9.9399995803833, 
inputManagerStates = list(list(inputId = "InputY", buttonState = FALSE, 
    axisValue = 0), list(inputId = "InputX", buttonState = FALSE, 
    axisValue = 0.0501395985484123), list(inputId = "xPos", 
    buttonState = FALSE, axisValue = 5), list(inputId = "yPos", 
    buttonState = FALSE, axisValue = 0.0799999982118607), 
    list(inputId = "zPos", buttonState = FALSE, axisValue = 0), 
    list(inputId = "xRot", buttonState = FALSE, axisValue = 0), 
    list(inputId = "yRot", buttonState = FALSE, axisValue = -0.70664256811142), 
    list(inputId = "zRot", buttonState = FALSE, axisValue = 0), 
    list(inputId = "wRot", buttonState = FALSE, axisValue = 0.707570731639862)), 
syncedProperties = list(list(name = "timeStamp", value = "97,2"))), 
list(time = 9.95659446716309, inputManagerStates = list(list(
    inputId = "InputY", buttonState = FALSE, axisValue = 0), 
    list(inputId = "InputX", buttonState = FALSE, axisValue = 0.0993990004062653), 
    list(inputId = "xPos", buttonState = FALSE, axisValue = 5), 
    list(inputId = "yPos", buttonState = FALSE, axisValue = 0.0799999982118607), 
    list(inputId = "zPos", buttonState = FALSE, axisValue = 0), 
    list(inputId = "xRot", buttonState = FALSE, axisValue = 0), 
    list(inputId = "yRot", buttonState = FALSE, axisValue = -0.705721318721771), 
    list(inputId = "zRot", buttonState = FALSE, axisValue = 0), 
    list(inputId = "wRot", buttonState = FALSE, axisValue = 0.708489596843719)), 
    syncedProperties = list(list(name = "timeStamp", value = "97,21667"))), 
list(time = 20.0626411437988, inputManagerStates = list(list(
    inputId = "InputY", buttonState = FALSE, axisValue = 0.601816594600677), 
    list(inputId = "InputX", buttonState = FALSE, axisValue = 0), 
    list(inputId = "xPos", buttonState = FALSE, axisValue = -1.31777036190033), 
    list(inputId = "yPos", buttonState = FALSE, axisValue = 0.0800001174211502), 
    list(inputId = "zPos", buttonState = FALSE, axisValue = 6.08214092254639), 
    list(inputId = "xRot", buttonState = FALSE, axisValue = 0), 
    list(inputId = "yRot", buttonState = FALSE, axisValue = -0.391442984342575), 
    list(inputId = "zRot", buttonState = FALSE, axisValue = 0), 
    list(inputId = "wRot", buttonState = FALSE, axisValue = 0.920202374458313)), 
    syncedProperties = list(list(name = "timeStamp", value = "107,3167"), 
        list(name = "previousGameState", value = "1"), list(
            name = "newGameState", value = "2"))))
Run Code Online (Sandbox Code Playgroud)

创建于 2022-08-24,使用reprex v2.0.2

因为我的数据unnest相当快,但unnest_wider也很慢。第一个unnest_wider(value)可以很容易地用基础 R 编写 - cbind(., do.call("rbind", .$value))- 并且速度要快得多:

library(tidyverse)  

output_df <- 
  test_list %>% 
  tibble::enframe(name = "frame", value = "value") %>% 
  tidyr::unnest_wider(value) %>%
  tidyr::unnest(inputManagerStates, keep_empty = TRUE) %>%
  tidyr::unnest(syncedProperties, keep_empty = TRUE) %>%
  tidyr::unnest_wider(syncedProperties) %>%
  tidyr::unnest_wider(inputManagerStates)

output_df
#> # A tibble: 46 x 7
#>    frame  time inputId buttonState axisValue name      value
#>    <int> <dbl> <chr>   <lgl>           <dbl> <chr>     <chr>
#>  1     1  9.92 <NA>    NA            NA      <NA>      <NA> 
#>  2     2  9.94 InputY  FALSE          0      timeStamp 97,2 
#>  3     2  9.94 InputX  FALSE          0.0501 timeStamp 97,2 
#>  4     2  9.94 xPos    FALSE          5      timeStamp 97,2 
#>  5     2  9.94 yPos    FALSE          0.0800 timeStamp 97,2 
#>  6     2  9.94 zPos    FALSE          0      timeStamp 97,2 
#>  7     2  9.94 xRot    FALSE          0      timeStamp 97,2 
#>  8     2  9.94 yRot    FALSE         -0.707  timeStamp 97,2 
#>  9     2  9.94 zRot    FALSE          0      timeStamp 97,2 
#> 10     2  9.94 wRot    FALSE          0.708  timeStamp 97,2 
#> # ... with 36 more rows
Run Code Online (Sandbox Code Playgroud)

创建于 2022-08-24,使用reprex v2.0.2

我正在寻找一种%>% tidyr::unnest_wider(syncedProperties) %>% tidyr::unnest_wider(inputManagerStates)用更快的代码替换的方法,但cbind由于行数不同,该解决方案不起作用。

编辑:认为这可能是可能的,unnest::unnest()但无法实现所需的结构(虽然tidytable::unnest_wider.目前仅支持向量)。

Jor*_*hau 2

您可以尝试在-package 中rrapply使用(重新访问 base )。当取消嵌套复制列表时,此选项非常有效,例如:how = "bind"rrapplyrapplyinputManagerStates

\n
library(rrapply)\n\ngetStates <- function(lst) {\n  rrapply(\n    lst, \n    condition = \\(x, .xparents) "inputManagerStates" %in% .xparents, \n    how = "bind", \n    options = list(coldepth = 4, namecols = TRUE)\n  )\n}\n\ngetStates(test_list) |>\n  head()\n#>   L1                 L2 L3 inputId buttonState axisValue\n#> 1  2 inputManagerStates  1  InputY       FALSE 0.0000000\n#> 2  2 inputManagerStates  2  InputX       FALSE 0.0501396\n#> 3  2 inputManagerStates  3    xPos       FALSE 5.0000000\n#> 4  2 inputManagerStates  4    yPos       FALSE 0.0800000\n#> 5  2 inputManagerStates  5    zPos       FALSE 0.0000000\n#> 6  2 inputManagerStates  6    xRot       FALSE 0.0000000\n\nmicrobenchmark::microbenchmark(\n  getStates(test_list)\n)\n#> Unit: microseconds\n#>                  expr     min       lq     mean  median      uq      max neval\n#>  getStates(test_list) 518.573 556.7075 839.9312 587.677 675.444 18465.56   100\n
Run Code Online (Sandbox Code Playgroud)\n

要获得问题中的矩形格式,一种可能的方法是首先单独取消嵌套time,inputManagerStatessyncedProperties列表,然后将它们合并回单个 data.frame 中。与问题中的方法相比,这已经提供了 \xc2\xb110 倍的加速:

\n
library(tidyr)\nlibrary(tibble)\nlibrary(rrapply)\n\nunnest_rrapply <- function() {\n  \n  ## bind individual data.frames\n  states <- rrapply(\n    test_list, \n    condition = \\(x, .xparents) "inputManagerStates" %in% .xparents, \n    how = "bind", \n    options = list(coldepth = 4, namecols = TRUE)\n  )\n  \n  properties <- rrapply(\n    test_list, \n    condition = \\(x, .xparents) "syncedProperties" %in% .xparents, \n    how = "bind", \n    options = list(coldepth = 4, namecols = TRUE)\n  )\n  \n  times <- rrapply(\n    test_list,\n    condition = \\(x, .xname) .xname == "time",\n    how = "bind",\n    options = list(namecols = TRUE)\n  )\n  \n  ## merge into single data.frame\n  out <- merge(times, properties[, -c(2, 3)], all = TRUE, by = "L1") |>\n    merge(states[, -c(2, 3)], all = TRUE, by = "L1")\n  \n  return(out)\n  \n}\n\nunnest_rrapply() |>\n  tibble::as_tibble()\n#> # A tibble: 46 \xc3\x97 7\n#>    L1     time name      value inputId buttonState axisValue\n#>    <chr> <dbl> <chr>     <chr> <chr>   <lgl>           <dbl>\n#>  1 1      9.92 <NA>      <NA>  <NA>    NA            NA     \n#>  2 2      9.94 timeStamp 97,2  InputY  FALSE          0     \n#>  3 2      9.94 timeStamp 97,2  InputX  FALSE          0.0501\n#>  4 2      9.94 timeStamp 97,2  xPos    FALSE          5     \n#>  5 2      9.94 timeStamp 97,2  yPos    FALSE          0.0800\n#>  6 2      9.94 timeStamp 97,2  zPos    FALSE          0     \n#>  7 2      9.94 timeStamp 97,2  xRot    FALSE          0     \n#>  8 2      9.94 timeStamp 97,2  yRot    FALSE         -0.707 \n#>  9 2      9.94 timeStamp 97,2  zRot    FALSE          0     \n#> 10 2      9.94 timeStamp 97,2  wRot    FALSE          0.708 \n#> # \xe2\x80\xa6 with 36 more rows\n#> # \xe2\x84\xb9 Use `print(n = ...)` to see more rows\n\nunnest_tidyr <- function() {\n  \n  test_list |>\n    enframe(name = "frame", value = "value") |>\n    unnest_wider(value) |>\n    unnest(inputManagerStates, keep_empty = TRUE) |>\n    unnest(syncedProperties, keep_empty = TRUE) |>\n    unnest_wider(syncedProperties) |>\n    unnest_wider(inputManagerStates)\n  \n}\n\nmicrobenchmark::microbenchmark(\n  unnest_tidyr(),\n  unnest_rrapply()\n)\n#> Unit: milliseconds\n#>              expr       min       lq     mean   median        uq      max neval\n#>    unnest_tidyr() 17.993309 20.33476 23.13348 22.14034 24.767693 47.35227   100\n#>  unnest_rrapply()  2.354847  2.60518  2.90372  2.74223  2.888071 13.38620   100\n
Run Code Online (Sandbox Code Playgroud)\n

注意:在末尾丢失合并unnest_rrapply()并返回例如 data.frames 列表(这可能足以满足 OP 的目的)可以进一步减少计算时间 \xc2\xb12 倍。

\n