解析一列json并与另一列绑定以制作数据框

Eri*_*een 1 r jsonlite purrr

我有采用以下格式的数据:

have <- structure(list(V1 = c(4L, 28L, 2L), 
                       V2 = c("[{\"group\":1,\"topic\":\"A\"},{\"group\":1,\"topic\":\"B\"},{\"group\":2,\"topic\":\"C\"},{\"group\":2,\"topic\":\"T\"},{\"group\":2,\"topic\":\"U\"},{\"group\":3,\"topic\":\"V\"},{\"group\":3,\"topic\":\"D\"},{\"group\":3,\"topic\":\"R\"},{\"group\":4,\"topic\":\"A\"},{\"group\":4,\"topic\":\"Q\"},{\"group\":4,\"topic\":\"S\"},{\"group\":4,\"topic\":\"W\"},{\"group\":6,\"topic\":\"O\"},{\"group\":6,\"topic\":\"P\"},{\"group\":6,\"topic\":\"E\"},{\"group\":6,\"topic\":\"F\"},{\"group\":6,\"topic\":\"G\"},{\"group\":6,\"topic\":\"H\"},{\"group\":6,\"topic\":\"I\"},{\"group\":6,\"topic\":\"J\"},{\"group\":6,\"topic\":\"K\"},{\"group\":6,\"topic\":\"L\"},{\"group\":6,\"topic\":\"M\"},{\"group\":6,\"topic\":\"N\"}]",
                              "[]", 
                              "[{\"group\":2,\"topic\":\"C\"},{\"group\":3,\"topic\":\"D\"},{\"group\":6,\"topic\":\"O\"},{\"group\":6,\"topic\":\"P\"},{\"group\":6,\"topic\":\"E\"},{\"group\":6,\"topic\":\"G\"},{\"group\":6,\"topic\":\"M\"}]")
), 
row.names = c(NA, 3L), 
class = "data.frame")
Run Code Online (Sandbox Code Playgroud)

的内容V2是每一行的嵌套分组,如[{"group":1,"topic":"A"},{"group":1,"topic":"B"}...]

我想获得一个宽数据框,其中also_have每行的 group+topic 的每个组合都有一个指示符 (1/0)(请参阅 参考资料)。像这样的东西:

# A tibble: 3 x 4
     id topic_id_1 topic_id_2 topic_id_3 topic_id_4 ...
  <dbl>      <dbl>      <dbl>      <dbl>
1     4          1          1          0
2    28          0          0          0
3     2          0          0          0
Run Code Online (Sandbox Code Playgroud)

第一步是解析json。

我可以使用purrr::map(have$V2, jsonlite::fromJSON)取消嵌套到列表中,但我不确定如何将V1列(我们可能会重命名为id)绑定到结果列表的每个元素(请注意,列表元素二是空的,因为它V1==28是空的)。下面是添加id( V1)的第一个元素可能是什么样子的片段。

[[1]]
   group topic  id
1      1     A   4
2      1     B   4
3      2     C   4
4      2     T   4
...
Run Code Online (Sandbox Code Playgroud)

或者,我认为purrr::map_df(have$V2, jsonlite::fromJSON)会让我更接近我最终需要的东西,但在这里我也不确定如何添加行id( V1)。

df <- purrr::map_df(have$V2, jsonlite::fromJSON)
df

What I get:

   group topic
1      1     A
2      1     B
3      2     C
4      2     T
...

What I want (notice `V1==28` does not appear): 

   group topic id
1      1     A  4
2      1     B  4
3      2     C  4
4      2     T  4
5      2     U  4
6      3     V  4
7      3     D  4
8      3     R  4
9      4     A  4
10     4     Q  4
11     4     S  4
12     4     W  4
13     6     O  4
14     6     P  4
15     6     E  4
16     6     F  4
17     6     G  4
18     6     H  4
19     6     I  4
20     6     J  4
21     6     K  4
22     6     L  4
23     6     M  4
24     6     N  4
25     2     C  2
26     3     D  2
27     6     O  2
28     6     P  2
29     6     E  2
30     6     G  2
31     6     M  2
Run Code Online (Sandbox Code Playgroud)

停止。

我想如果我能得到上面的数据框,id我就可以完成剩下的工作。最终目标是加入这些信息,also_have然后广泛使用。

# join
also_have <- expand_grid(c(1:6), c(LETTERS)) %>%
  mutate(topic_id = 1:n()) %>%
  magrittr::set_colnames(c("group", "topic", "topic_id")) %>%
  select(topic_id, group, topic)

# pivot wide

# A tibble: 3 x 4
     id topic_id_1 topic_id_2 topic_id_3 topic_id_4 ...
  <dbl>      <dbl>      <dbl>      <dbl>
1     4          1          1          0
2    28          0          0          0
3     2          0          0          0
Run Code Online (Sandbox Code Playgroud)

更新:

应用@akrun 的解决方案:

purrr::map_dfr(setNames(have$V2, have$V1), 
                     jsonlite::fromJSON, 
                     .id = 'V1') %>%
  rename(id = V1) %>%
  left_join(also_have, by=c("group", "topic")) %>%
  select(-group, -topic) %>%
  mutate(value = 1) %>%
  pivot_wider(id_cols = id, 
              names_from = topic_id,
              names_prefix = "topic_id",
              values_from = value,
              values_fill = 0
  ) %>% 
  full_join(tibble(id = as.character(have$V1))) %>%
  replace(is.na(.), 0)

# A tibble: 3 x 25
  id    topic_id1 topic_id2 topic_id29 topic_id46 topic_id47 topic_id74 topic_id56
  <chr>     <dbl>     <dbl>      <dbl>      <dbl>      <dbl>      <dbl>      <dbl>
1 4             1         1          1          1          1          1          1
2 2             0         0          1          0          0          0          1
3 28            0         0          0          0          0          0          0
# … with 17 more variables: topic_id70 <dbl>, topic_id79 <dbl>, topic_id95 <dbl>,
#   topic_id97 <dbl>, topic_id101 <dbl>, topic_id145 <dbl>, topic_id146 <dbl>,
#   topic_id135 <dbl>, topic_id136 <dbl>, topic_id137 <dbl>, topic_id138 <dbl>,
#   topic_id139 <dbl>, topic_id140 <dbl>, topic_id141 <dbl>, topic_id142 <dbl>,
#   topic_id143 <dbl>, topic_id144 <dbl>
Run Code Online (Sandbox Code Playgroud)

akr*_*run 5

我们可以传递一个命名向量,然后.idmap_dfr

 purrr::map_dfr(setNames(have$V2, have$V1), jsonlite::fromJSON, .id = 'id')
Run Code Online (Sandbox Code Playgroud)

-输出

id group topic
1   4     1     A
2   4     1     B
3   4     2     C
4   4     2     T
5   4     2     U
6   4     3     V
7   4     3     D
8   4     3     R
9   4     4     A
10  4     4     Q
11  4     4     S
12  4     4     W
...
Run Code Online (Sandbox Code Playgroud)

或者这可以dplyr在使用后在框架本身内完成rowwise

library(tidyr)
have %>%
    rowwise %>%
    transmute(ID = V1, V2 = list(fromJSON(V2))) %>% 
    ungroup %>%
    unnest(c(V2), keep_empty = TRUE) %>% 
    select(-V2)
# A tibble: 32 x 3
      ID group topic
   <int> <int> <chr>
 1     4     1 A    
 2     4     1 B    
 3     4     2 C    
 4     4     2 T    
 5     4     2 U    
 6     4     3 V    
 7     4     3 D    
 8     4     3 R    
 9     4     4 A    
10     4     4 Q    
# … with 22 more rows
Run Code Online (Sandbox Code Playgroud)

第二步做一个连接

out <-  have %>%
     rowwise %>%
     transmute(ID = V1, V2 = list(fromJSON(V2))) %>% 
     ungroup %>%
     unnest(c(V2), keep_empty = TRUE) %>% 
     select(-V2) %>% right_join(also_have)
out
Joining, by = c("group", "topic")
# A tibble: 163 x 4
      ID group topic topic_id
   <int> <int> <chr>    <int>
 1     4     1 A            1
 2     4     1 B            2
 3     4     2 C           29
 4     4     2 T           46
 5     4     2 U           47
 6     4     3 V           74
 7     4     3 D           56
 8     4     3 R           70
 9     4     4 A           79
10     4     4 Q           95
# … with 153 more rows
Run Code Online (Sandbox Code Playgroud)