nob*_*ien 8 r nested-lists dataframe
我有一个深度嵌套的列表,我想转换为数据框。下面是结构的样子:
# Build the example input: a nested list keyed by
# time ("10", "20") -> seed ("123", "456") -> treatment ("0.1", "0.2"),
# where every leaf holds two scalar draws (*.val) and two length-8
# vectors (*.vec) from rnorm(). No seed is set, so the numbers differ
# between runs. lapply() over a self-named vector keeps the keys as the
# names of each level and visits them in the order written.
ls <- lapply(
  setNames(nm = c("10", "20")),
  function(time_key) {
    lapply(
      setNames(nm = c("123", "456")),
      function(seed_key) {
        lapply(
          setNames(nm = c("0.1", "0.2")),
          function(treatment_key) {
            list(
              Gmax.val = rnorm(1),
              G2.val = rnorm(1),
              Gmax.vec = rnorm(8),
              G2.vec = rnorm(8)
            )
          }
        )
      }
    )
  }
)
> ls[['10']][['123']][['0.1']]
$Gmax.val
[1] -0.1982298
$G2.val
[1] -0.2761515
$Gmax.vec
[1] -0.4732736 -0.5714809 -0.1600405 -0.7138532 0.3503852 -0.7367241 0.3024992 -0.4931045
$G2.vec
[1] -0.2374231 -0.7927135 -0.9554769 0.8733201 -0.4126742 1.8689940 0.1576750 -0.2184344
Run Code Online (Sandbox Code Playgroud)
每一层子列表的名称都是某个变量的取值。在本例中,各层级的含义如下:
ls[[]] = time; 10 or 20
ls[[]][[]] = seed; 123 or 456
ls[[]][[]][[]] = treatment; 0.1 or 0.2
Run Code Online (Sandbox Code Playgroud)
理想情况下,我希望将子列表的名称用作它们自己列中的值。我希望数据框看起来像这样:
# time seed treatment Gmax.val G2.val Gmax.vec G2.vec
#1 10 123 0.1 0.1972457 -0.1224265 0.06121407 1.5102516
#2 10 123 0.1 0.1972457 -0.1224265 -2.53026477 -0.1320042
#3 10 123 0.1 0.1972457 -0.1224265 0.06648820 -0.2477285
#4 10 123 0.1 0.1972457 -0.1224265 -0.45594701 -0.8577670
#5 10 123 0.1 0.1972457 -0.1224265 0.90828911 -1.0710828
#6 10 123 0.1 0.1972457 -0.1224265 0.56427976 1.5086222
Run Code Online (Sandbox Code Playgroud)
谢谢您的帮助。
另一种方法是:
1. 使用 rrapply 包中的 rrapply() 将嵌套列表融化(melt)为 data.frame(或类似地使用 reshape2::melt())。
2. 使用 pivot_wider() 和 unnest() 将 data.frame 重塑为所需的格式。
library(rrapply)
library(tidyverse)
# Nested list -> long table -> wide table -> one row per vector element.
# The melt step yields one row per leaf, with the level names in L1..L4
# and the leaf itself in the list column `value`.
rrapply(ls, how = "melt") %>%
  # Spread the measurement names (L4) into columns of their own.
  pivot_wider(names_from = L4, values_from = value) %>%
  # Expand the list columns; the length-1 *.val entries are repeated
  # alongside the length-8 *.vec entries.
  unnest(cols = c(Gmax.val, G2.val, Gmax.vec, G2.vec)) %>%
  # Give the level columns their meaningful names.
  rename(time = L1, seed = L2, treatment = L3)
#> # A tibble: 64 x 7
#> time seed treatment Gmax.val G2.val Gmax.vec G2.vec
#> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 10 123 0.1 -0.626 0.184 -0.836 1.51
#> 2 10 123 0.1 -0.626 0.184 1.60 0.390
#> 3 10 123 0.1 -0.626 0.184 0.330 -0.621
#> 4 10 123 0.1 -0.626 0.184 -0.820 -2.21
#> 5 10 123 0.1 -0.626 0.184 0.487 1.12
#> 6 10 123 0.1 -0.626 0.184 0.738 -0.0449
#> 7 10 123 0.1 -0.626 0.184 0.576 -0.0162
#> 8 10 123 0.1 -0.626 0.184 -0.305 0.944
#> 9 10 123 0.2 0.821 0.594 0.919 -0.478
#> 10 10 123 0.2 0.821 0.594 0.782 0.418
#> # … with 54 more rows
Run Code Online (Sandbox Code Playgroud)
或者使用 data.table 的 dcast() 将长表重塑为宽格式:
library(data.table)
# Long table: one row per leaf, level names in L1..L4, leaf in `value`.
long_dt <- as.data.table(rrapply(ls, how = "melt"))

# Cast the measurement names (L4) to columns, then flatten the resulting
# list columns within each (L1, L2, L3) group so that every vector
# element gets its own row.
wide_dt <- dcast(long_dt, L1 + L2 + L3 ~ L4, value.var = "value")[
  , lapply(.SD, unlist),
  by = .(L1, L2, L3),
  .SDcols = c("Gmax.val", "G2.val", "Gmax.vec", "G2.vec")
]

# Replace the generic level names with the variables they stand for.
setnames(wide_dt, c("L1", "L2", "L3"), c("time", "seed", "treatment"))
Run Code Online (Sandbox Code Playgroud)
一些基准测试结果:
# Time the two reshaping approaches on the same nested list `ls`.
# Each labelled expression is evaluated 25 times (times = 25); the labels
# `tidyr` and `data.table` are what appear in the `expr` column of the
# printed summary.
microbenchmark::microbenchmark(
# tidyr approach: melt, pivot to wide, unnest the list columns, rename.
tidyr = {
rrapply(ls, how = "melt") %>%
pivot_wider(names_from = "L4") %>%
unnest(c(Gmax.val, G2.val, Gmax.vec, G2.vec)) %>%
rename(time = L1, seed = L2, treatment = L3)
},
# data.table approach: melt, dcast to wide, unlist per group, rename.
data.table = {
wide_dt <- dcast(as.data.table(rrapply(ls, how = "melt")), L1 + L2 + L3 ~ L4)
wide_dt <- wide_dt[, lapply(.SD, unlist), by = list(L1, L2, L3), .SDcols = c("Gmax.val", "G2.val", "Gmax.vec", "G2.vec")]
setnames(wide_dt, old = c("L1", "L2", "L3"), new = c("time", "seed", "treatment"))
# Trailing bare name makes the finished table the value of this expression.
wide_dt
},
times = 25
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> tidyr 17.959197 20.072647 23.662698 21.278771 25.633581 40.593022 25
#> data.table 2.061861 2.655782 2.966581 2.784425 2.988044 5.032524 25
Run Code Online (Sandbox Code Playgroud)