如何对存储在多级嵌套列表中的数据帧进行行绑定,并为每个级别添加标识符列?

den*_*nis 7 r nested-lists dataframe dplyr purrr

作为 R 数据操作的初学者,我在处理多层嵌套列表方面遇到了困难。

问题:有没有办法将这个三级列表 dat0 转换为下面的全局数据框 dat1?

  • fulltext 列将每个 tibble 中的 text 变量连接起来。
  • nbsum 列对每个 tibble 中的 nb 变量求和。

注意:欢迎使用基于 purrr 并结合 dplyr 函数(mutate ...)的方法,以便更好地理解这些工具在此特定上下文中的用法。也欢迎其他方法!

感谢帮助

初始数据:

在此输入图像描述

# Reproducible input: a three-level nested list (pdf -> page -> tibble).
# Every leaf is a tibble with the columns x, y, text and nb.
#
# tibble() has to be attached for this snippet to run in a fresh R session;
# dplyr re-exports it, so loading dplyr is sufficient.
library(dplyr)

dat0 <- list(pdf1 =
               list(page1 =
                      list(tibble1 = tibble(x = c(1,2,3,4), y = c(1,1,1,1), text = c("ha","r","r","y"), nb = c(1,2,3,4)),
                           tibble2 = tibble(x = c(1,2,3,4), y = c(2,2,2,2), text = c("p","ot","t","er"), nb = c(1,2,3,4))),
                    page2 =
                      list(tibble1 = tibble(x = c(1,2,3), y = c(3,3,3), text = c("her","m","ione"), nb = c(1,2,3)),
                           tibble2 = tibble(x = c(1,2,3), y = c(4,4,4), text = c("gra","ng","er"), nb = c(1,2,3)))),
             pdf2 =
               list(page1 =
                      list(tibble1 = tibble(x = c(1,2), y = c(5,5), text = c("vol","de"), nb = c(1,2)),
                           tibble2 = tibble(x = c(1,2), y = c(6,6), text = c("m","ort"), nb = c(1,2))),
                    page2 =
                      list(tibble1 = tibble(x = c(1,2,3,4,5), y = c(7,7,7,7,7), text = c("a","l","b","u","s"), nb = c(1,2,3,4,5)),
                           tibble2 = tibble(x = c(1,2,3,4,5), y = c(8,8,8,8,8), text = c("du","m","ble","do","re"), nb = c(1,2,3,4,5))),
                    page3 =
                      list(tibble1 = tibble(x = c(1,2,3,4), y = c(9,9,9,9), text = c("dr","a","g","o"), nb = c(1,2,3,4)),
                           tibble2 = tibble(x = c(1,2,3,4), y = c(10,10,10,10), text = c("ma","lf","o","y"), nb = c(1,2,3,4)))),
             pdf3 =
               list(page1 =
                      list(tibble1 = tibble(x = c(1,2,3,4,5), y = c(11,11,11,11,11), text = c("s","ev","e","ru","s"), nb = c(1,2,3,4,5)),
                           tibble2 = tibble(x = c(1,2,3,4,5), y = c(12,12,12,12,12), text = c("r","o","g","u","e"), nb = c(1,2,3,4,5))),
                    page2 =
                      list(tibble1 = tibble(x = c(1,2,3), y = c(13,13,13), text = c("r","o","n"), nb = c(1,2,3)),
                           tibble2 = tibble(x = c(1,2,3), y = c(14,14,14), text = c("we","as","ley"), nb = c(1,2,3))),
                    page3 =
                      list(tibble1 = tibble(x = c(1,2,3,4,5,6), y = c(15,15,15,15,15,15), text = c("be","l","la","t","ri","x"), nb = c(1,2,3,4,5,6)),
                           tibble2 = tibble(x = c(1,2,3,4,5,6), y = c(16,16,16,16,16,16), text = c("l","est","r","a","ng","e"), nb = c(1,2,3,4,5,6))),
                    page4 =
                      list(tibble1 = tibble(x = c(1,2), y = c(17,17), text = c("sir","ius"), nb = c(1,2)),
                           tibble2 = tibble(x = c(1,2), y = c(18,18), text = c("bl","ack"), nb = c(1,2)))))
Run Code Online (Sandbox Code Playgroud)

所需的输出(费力构建;下面的 dput 脚本):

在此输入图像描述

# Expected output: a plain data.frame with 68 rows and 7 columns
# (pdf, page, x, y, text, nb, fulltext, nbsum minus none -- see names below).
# It is assembled from run lengths rather than spelled out value by value;
# the resulting object is the same as the literal dput() dump.
dat1 <- local({
  # Row count of each of the 18 consecutive groups (one group per y value).
  sizes <- c(4, 4, 3, 3, 2, 2, 5, 5, 4, 4, 5, 5, 3, 3, 6, 6, 2, 2)

  # 1, 2, ..., n restarted for every group; sequence() yields integers, the
  # columns are stored as doubles.
  within_idx <- as.numeric(sequence(sizes))

  structure(
    list(
      pdf = rep(c("pdf1", "pdf2", "pdf3"), times = c(14, 22, 32)),
      page = rep(
        c("page1", "page2",
          "page1", "page2", "page3",
          "page1", "page2", "page3", "page4"),
        times = c(8, 6, 4, 10, 8, 10, 6, 12, 4)
      ),
      x = within_idx,
      y = rep(as.numeric(1:18), times = sizes),
      text = c(
        "ha", "r", "r", "y",
        "p", "ot", "t", "er",
        "her", "m", "ione",
        "gra", "ng", "er",
        "vol", "de",
        "m", "ort",
        "a", "l", "b", "u", "s",
        "du", "m", "ble", "do", "re",
        "dr", "a", "g", "o",
        "ma", "lf", "o", "y",
        "s", "ev", "e", "ru", "s",
        "r", "o", "g", "u", "e",
        "r", "o", "n",
        "we", "as", "ley",
        "be", "l", "la", "t", "ri", "x",
        "l", "est", "r", "a", "ng", "e",
        "sir", "ius",
        "bl", "ack"
      ),
      nb = within_idx,
      fulltext = rep(
        c("harry", "potter", "hermione", "granger", "volde", "mort",
          "albus", "dumbledore", "drago", "malfoy", "severus", "rogue",
          "ron", "weasley", "bellatrix", "lestrange", "sirius", "black"),
        times = sizes
      ),
      nbsum = rep(
        c(10, 10, 6, 6, 3, 3, 15, 15, 10, 10, 15, 15, 6, 6, 21, 21, 3, 3),
        times = sizes
      )
    ),
    row.names = c(NA, -68L),
    class = "data.frame"
  )
})
Run Code Online (Sandbox Code Playgroud)

Maë*_*aël 8

在我看来(IMO),完成此任务最灵活的函数是 collapse::unlist2d:

library(dplyr)

# Flatten the nested list into one data frame; the names found at the three
# list levels are stored in the id columns `pdf`, `page` and `tibble`.
# The per-tibble summaries are then added with a per-call grouping (`.by`),
# and the helper id column `tibble` is dropped at the end.
dat2 <- dat0 |>
  collapse::unlist2d(idcols = c("pdf", "page", "tibble")) |>
  mutate(
    fulltext = paste0(text, collapse = ""),
    nbsum = sum(nb),
    .by = c(pdf, page, tibble)
  ) |>
  select(!tibble)

# Check the result against the expected data frame.
identical(dat2, dat1)
#[1] TRUE
Run Code Online (Sandbox Code Playgroud)