将深度嵌套的列表转换为数据框

nob*_*ien 8 r nested-lists dataframe

我有一个深度嵌套的列表,我想转换为数据框。下面是结构的样子:

# Example fixture: a three-level nested list (time -> seed -> treatment) whose
# leaves each hold two scalar draws and two length-8 vectors.
# The object is called `ls` as in the question; it shadows base::ls only as a
# variable, so calling ls() as a function still works.
ls <- local({
  # One leaf. Elements are drawn in the order val, val, vec, vec.
  draw_leaf <- function() {
    list(
      Gmax.val = rnorm(1),
      G2.val = rnorm(1),
      Gmax.vec = rnorm(8),
      G2.vec = rnorm(8)
    )
  }

  # Build a named list with one freshly built child per key, in key order.
  nest_level <- function(keys, build_child) {
    setNames(lapply(keys, function(key) build_child()), keys)
  }

  nest_level(c("10", "20"), function() {
    nest_level(c("123", "456"), function() {
      nest_level(c("0.1", "0.2"), draw_leaf)
    })
  })
})


> ls[['10']][['123']][['0.1']]
$Gmax.val
[1] -0.1982298

$G2.val
[1] -0.2761515

$Gmax.vec
[1] -0.4732736 -0.5714809 -0.1600405 -0.7138532  0.3503852 -0.7367241  0.3024992 -0.4931045

$G2.vec
[1] -0.2374231 -0.7927135 -0.9554769  0.8733201 -0.4126742  1.8689940  0.1576750 -0.2184344
Run Code Online (Sandbox Code Playgroud)

每一层子列表的名称都是某个变量的取值。在本例中,各层分别对应:

ls[[]] = time; 10 or 20
ls[[]][[]] = seed; 123 or 456
ls[[]][[]][[]] = treatment; 0.1 or 0.2 
Run Code Online (Sandbox Code Playgroud)

理想情况下,我希望将子列表的名称用作它们自己列中的值。我希望数据框看起来像这样:

#  time seed treatment  Gmax.val     G2.val    Gmax.vec     G2.vec
#1   10  123       0.1 0.1972457 -0.1224265  0.06121407  1.5102516
#2   10  123       0.1 0.1972457 -0.1224265 -2.53026477 -0.1320042
#3   10  123       0.1 0.1972457 -0.1224265  0.06648820 -0.2477285
#4   10  123       0.1 0.1972457 -0.1224265 -0.45594701 -0.8577670
#5   10  123       0.1 0.1972457 -0.1224265  0.90828911 -1.0710828
#6   10  123       0.1 0.1972457 -0.1224265  0.56427976  1.5086222
Run Code Online (Sandbox Code Playgroud)

谢谢您的帮助。

Jor*_*hau 6

另一种方法是:

  1. 使用 rrapply 包中的 rrapply() 将嵌套列表融化(melt)为 data.frame(或类似地使用 reshape2::melt())。
  2. 使用 tidyr 的 pivot_wider() 和 unnest() 将 data.frame 重塑为所需的格式。
library(rrapply)
library(tidyverse)

rrapply(ls, how = "melt") %>%                            ## melt to long df
  pivot_wider(names_from = "L4") %>%                     ## reshape to wide df
  unnest(c(Gmax.val, G2.val, Gmax.vec, G2.vec)) %>%      ## unnest list columns
  rename(time = L1, seed = L2, treatment = L3)           ## rename columns

#> # A tibble: 64 x 7
#>    time  seed  treatment Gmax.val G2.val Gmax.vec  G2.vec
#>    <chr> <chr> <chr>        <dbl>  <dbl>    <dbl>   <dbl>
#>  1 10    123   0.1         -0.626  0.184   -0.836  1.51  
#>  2 10    123   0.1         -0.626  0.184    1.60   0.390 
#>  3 10    123   0.1         -0.626  0.184    0.330 -0.621 
#>  4 10    123   0.1         -0.626  0.184   -0.820 -2.21  
#>  5 10    123   0.1         -0.626  0.184    0.487  1.12  
#>  6 10    123   0.1         -0.626  0.184    0.738 -0.0449
#>  7 10    123   0.1         -0.626  0.184    0.576 -0.0162
#>  8 10    123   0.1         -0.626  0.184   -0.305  0.944 
#>  9 10    123   0.2          0.821  0.594    0.919 -0.478 
#> 10 10    123   0.2          0.821  0.594    0.782  0.418 
#> # … with 54 more rows
Run Code Online (Sandbox Code Playgroud)

或者使用 data.table 的 dcast() 将长表重塑为宽格式:

library(data.table)

long_dt <- as.data.table(rrapply(ls, how = "melt"))
wide_dt <- dcast(long_dt, L1 + L2 + L3 ~ L4)
wide_dt <- wide_dt[, lapply(.SD, unlist), by = list(L1, L2, L3), .SDcols = c("Gmax.val", "G2.val", "Gmax.vec", "G2.vec")]
setnames(wide_dt, old = c("L1", "L2", "L3"), new = c("time", "seed", "treatment"))
Run Code Online (Sandbox Code Playgroud)

一些基准

# Time the two reshaping approaches against each other on the same nested
# list `ls`. Each named argument is one expression to be timed.
microbenchmark::microbenchmark(
  # tidyr variant: melt -> pivot_wider -> unnest -> rename.
  tidyr = {
    rrapply(ls, how = "melt") %>%                            ## melt to long df
      pivot_wider(names_from = "L4") %>%                     ## reshape to wide df
      unnest(c(Gmax.val, G2.val, Gmax.vec, G2.vec)) %>%      ## unnest list columns
      rename(time = L1, seed = L2, treatment = L3)
  },
  # data.table variant: melt -> dcast -> unlist per group -> setnames.
  data.table = {
    wide_dt <- dcast(as.data.table(rrapply(ls, how = "melt")), L1 + L2 + L3 ~ L4)
    wide_dt <- wide_dt[, lapply(.SD, unlist), by = list(L1, L2, L3), .SDcols = c("Gmax.val", "G2.val", "Gmax.vec", "G2.vec")]
    setnames(wide_dt, old = c("L1", "L2", "L3"), new = c("time", "seed", "treatment"))
    # Return the table so this expression yields a result like the tidyr one.
    wide_dt
  },
  # Number of evaluations per expression.
  times = 25
)
#> Unit: milliseconds
#>        expr       min        lq      mean    median        uq       max neval
#>       tidyr 17.959197 20.072647 23.662698 21.278771 25.633581 40.593022    25
#>  data.table  2.061861  2.655782  2.966581  2.784425  2.988044  5.032524    25
Run Code Online (Sandbox Code Playgroud)