转换复杂的数据框

fug*_*ugu 6 r dataframe

我有一个数据框,其第一列(weights)包含一个列表(由数据框组成?):

> head(data$weights)

> data <- structure(list(A373R11 = structure(list(Signature.1A = 0, Signature.1B = 0, 
    Signature.2 = 0, Signature.3 = 0.151631702143023, Signature.4 = 0.149799882118262, 
    Signature.5 = 0, Signature.6 = 0, Signature.7 = 0.0634912587993959, 
    Signature.8 = 0, Signature.9 = 0.173189155080817, Signature.10 = 0, 
    Signature.11 = 0, Signature.12 = 0, Signature.13 = 0, Signature.14 = 0, 
    Signature.15 = 0, Signature.16 = 0, Signature.17 = 0, Signature.18 = 0, 
    Signature.19 = 0, Signature.20 = 0, Signature.21 = 0.0905517653558877, 
    Signature.R1 = 0, Signature.R2 = 0, Signature.R3 = 0, Signature.U1 = 0.155590748898003, 
    Signature.U2 = 0.145955461287919), .Names = c("Signature.1A", 
"Signature.1B", "Signature.2", "Signature.3", "Signature.4", 
"Signature.5", "Signature.6", "Signature.7", "Signature.8", "Signature.9", 
"Signature.10", "Signature.11", "Signature.12", "Signature.13", 
"Signature.14", "Signature.15", "Signature.16", "Signature.17", 
"Signature.18", "Signature.19", "Signature.20", "Signature.21", 
"Signature.R1", "Signature.R2", "Signature.R3", "Signature.U1", 
"Signature.U2"), row.names = "A373R11", class = "data.frame"), 
    A373R13 = structure(list(Signature.1A = 0, Signature.1B = 0, 
        Signature.2 = 0, Signature.3 = 0.221014874027829, Signature.4 = 0, 
        Signature.5 = 0, Signature.6 = 0, Signature.7 = 0, Signature.8 = 0.279252211893692, 
        Signature.9 = 0, Signature.10 = 0, Signature.11 = 0, 
        Signature.12 = 0, Signature.13 = 0, Signature.14 = 0, 
        Signature.15 = 0, Signature.16 = 0, Signature.17 = 0, 
        Signature.18 = 0, Signature.19 = 0.115216422668955, Signature.20 = 0, 
        Signature.21 = 0, Signature.R1 = 0, Signature.R2 = 0, 
        Signature.R3 = 0.0636987713225648, Signature.U1 = 0.108875099907467, 
        Signature.U2 = 0), .Names = c("Signature.1A", "Signature.1B", 
    "Signature.2", "Signature.3", "Signature.4", "Signature.5", 
    "Signature.6", "Signature.7", "Signature.8", "Signature.9", 
    "Signature.10", "Signature.11", "Signature.12", "Signature.13", 
    "Signature.14", "Signature.15", "Signature.16", "Signature.17", 
    "Signature.18", "Signature.19", "Signature.20", "Signature.21", 
    "Signature.R1", "Signature.R2", "Signature.R3", "Signature.U1", 
    "Signature.U2"), row.names = "A373R13", class = "data.frame"), 
    A373R3 = structure(list(Signature.1A = 0, Signature.1B = 0, 
        Signature.2 = 0, Signature.3 = 0.0795605471131758, Signature.4 = 0.0973130562439999, 
        Signature.5 = 0, Signature.6 = 0, Signature.7 = 0, Signature.8 = 0.249674548796242, 
        Signature.9 = 0.0725013504411567, Signature.10 = 0, Signature.11 = 0.064665155855146, 
        Signature.12 = 0, Signature.13 = 0, Signature.14 = 0, 
        Signature.15 = 0, Signature.16 = 0, Signature.17 = 0, 
        Signature.18 = 0, Signature.19 = 0, Signature.20 = 0, 
        Signature.21 = 0, Signature.R1 = 0, Signature.R2 = 0, 
        Signature.R3 = 0.0703546703126821, Signature.U1 = 0.21753544296676, 
        Signature.U2 = 0.0739201832004727), .Names = c("Signature.1A", 
    "Signature.1B", "Signature.2", "Signature.3", "Signature.4", 
    "Signature.5", "Signature.6", "Signature.7", "Signature.8", 
    "Signature.9", "Signature.10", "Signature.11", "Signature.12", 
    "Signature.13", "Signature.14", "Signature.15", "Signature.16", 
    "Signature.17", "Signature.18", "Signature.19", "Signature.20", 
    "Signature.21", "Signature.R1", "Signature.R2", "Signature.R3", 
    "Signature.U1", "Signature.U2"), row.names = "A373R3", class = "data.frame"), 
    A373R5 = structure(list(Signature.1A = 0, Signature.1B = 0, 
        Signature.2 = 0, Signature.3 = 0.113996509522102, Signature.4 = 0.114874220936966, 
        Signature.5 = 0.142056872670519, Signature.6 = 0, Signature.7 = 0, 
        Signature.8 = 0.208376707959741, Signature.9 = 0.0744527503782136, 
        Signature.10 = 0, Signature.11 = 0, Signature.12 = 0, 
        Signature.13 = 0, Signature.14 = 0, Signature.15 = 0.0771902641012979, 
        Signature.16 = 0, Signature.17 = 0, Signature.18 = 0, 
        Signature.19 = 0, Signature.20 = 0, Signature.21 = 0, 
        Signature.R1 = 0, Signature.R2 = 0, Signature.R3 = 0, 
        Signature.U1 = 0.0673567355607731, Signature.U2 = 0), .Names = c("Signature.1A", 
    "Signature.1B", "Signature.2", "Signature.3", "Signature.4", 
    "Signature.5", "Signature.6", "Signature.7", "Signature.8", 
    "Signature.9", "Signature.10", "Signature.11", "Signature.12", 
    "Signature.13", "Signature.14", "Signature.15", "Signature.16", 
    "Signature.17", "Signature.18", "Signature.19", "Signature.20", 
    "Signature.21", "Signature.R1", "Signature.R2", "Signature.R3", 
    "Signature.U1", "Signature.U2"), row.names = "A373R5", class = "data.frame"), 
    A373R9 = structure(list(Signature.1A = 0, Signature.1B = 0, 
        Signature.2 = 0, Signature.3 = 0.116847300193985, Signature.4 = 0, 
        Signature.5 = 0.21624751052703, Signature.6 = 0, Signature.7 = 0, 
        Signature.8 = 0.252498230882402, Signature.9 = 0, Signature.10 = 0, 
        Signature.11 = 0.119495912880994, Signature.12 = 0, Signature.13 = 0, 
        Signature.14 = 0, Signature.15 = 0, Signature.16 = 0, 
        Signature.17 = 0, Signature.18 = 0, Signature.19 = 0, 
        Signature.20 = 0, Signature.21 = 0, Signature.R1 = 0, 
        Signature.R2 = 0, Signature.R3 = 0.0725549911220892, 
        Signature.U1 = 0, Signature.U2 = 0), .Names = c("Signature.1A", 
    "Signature.1B", "Signature.2", "Signature.3", "Signature.4", 
    "Signature.5", "Signature.6", "Signature.7", "Signature.8", 
    "Signature.9", "Signature.10", "Signature.11", "Signature.12", 
    "Signature.13", "Signature.14", "Signature.15", "Signature.16", 
    "Signature.17", "Signature.18", "Signature.19", "Signature.20", 
    "Signature.21", "Signature.R1", "Signature.R2", "Signature.R3", 
    "Signature.U1", "Signature.U2"), row.names = "A373R9", class = "data.frame"), 
    A512R19 = structure(list(Signature.1A = 0.109490572493859, 
        Signature.1B = 0, Signature.2 = 0, Signature.3 = 0, Signature.4 = 0.22010156823306, 
        Signature.5 = 0, Signature.6 = 0, Signature.7 = 0, Signature.8 = 0, 
        Signature.9 = 0, Signature.10 = 0, Signature.11 = 0, 
        Signature.12 = 0, Signature.13 = 0, Signature.14 = 0, 
        Signature.15 = 0, Signature.16 = 0, Signature.17 = 0, 
        Signature.18 = 0, Signature.19 = 0, Signature.20 = 0, 
        Signature.21 = 0, Signature.R1 = 0, Signature.R2 = 0, 
        Signature.R3 = 0.150943894106973, Signature.U1 = 0.248556502648564, 
        Signature.U2 = 0.119306892617062), .Names = c("Signature.1A", 
    "Signature.1B", "Signature.2", "Signature.3", "Signature.4", 
    "Signature.5", "Signature.6", "Signature.7", "Signature.8", 
    "Signature.9", "Signature.10", "Signature.11", "Signature.12", 
    "Signature.13", "Signature.14", "Signature.15", "Signature.16", 
    "Signature.17", "Signature.18", "Signature.19", "Signature.20", 
    "Signature.21", "Signature.R1", "Signature.R2", "Signature.R3", 
    "Signature.U1", "Signature.U2"), row.names = "A512R19", class = "data.frame")), .Names = c("A373R11", 
"A373R13", "A373R3", "A373R5", "A373R9", "A512R19"))
Run Code Online (Sandbox Code Playgroud)

这里,每行包含一个样本,每列包含特定签名的分数:

> data[1]

$A373R11
        Signature.1A Signature.1B Signature.2 Signature.3 Signature.4 Signature.5 Signature.6 Signature.7 Signature.8 Signature.9 Signature.10 Signature.11
A373R11            0            0           0   0.1516317   0.1497999           0           0  0.06349126           0   0.1731892            0            0
        Signature.12 Signature.13 Signature.14 Signature.15 Signature.16 Signature.17 Signature.18 Signature.19 Signature.20 Signature.21 Signature.R1 Signature.R2
A373R11            0            0            0            0            0            0            0            0            0   0.09055177            0            0
        Signature.R3 Signature.U1 Signature.U2
A373R11            0    0.1555907    0.1459555
Run Code Online (Sandbox Code Playgroud)

我想将其转换为具有以下结构的数据帧:

sample  signature  score
A373R11  Signature.1A  0
A373R11  Signature.1B  0
[...]
A373R13  Signature.1A  0
A373R13  Signature.1B  0
[...]
Run Code Online (Sandbox Code Playgroud)

谁能指出我正确的方向?

Jaa*_*aap 9

两种方法:

1)使用data.table包

使用:

library(data.table)
# Stack the per-sample data frames into one table (the list names become the
# `sample` column), then melt the signature columns into long format.
# `id.vars` is spelled out in full instead of relying on partial matching.
melt(rbindlist(data, idcol = "sample"),
     id.vars = "sample", variable.name = "signature", value.name = "score")
Run Code Online (Sandbox Code Playgroud)

得到:

      sample    signature      score
  1: A373R11 Signature.1A 0.00000000
  2: A373R13 Signature.1A 0.00000000
  3:  A373R3 Signature.1A 0.00000000
  4:  A373R5 Signature.1A 0.00000000
  5:  A373R9 Signature.1A 0.00000000
 ---                                
158: A373R13 Signature.U2 0.00000000
159:  A373R3 Signature.U2 0.07392018
160:  A373R5 Signature.U2 0.00000000
161:  A373R9 Signature.U2 0.00000000
162: A512R19 Signature.U2 0.11930689
Run Code Online (Sandbox Code Playgroud)

2)基础R

使用:

# `dat` is the list of one-row data frames from the question (named `data`
# there). Stack them into one wide data frame whose row names are the samples.
dat2 <- do.call(rbind, dat)
# Reshape wide -> long: every column becomes a (signature, score) pair and
# the row names are carried over as the `sample` id column.
reshape(dat2, idvar = "sample", ids = row.names(dat2),
        varying = list(seq_len(ncol(dat2))), times = colnames(dat2),
        timevar = "signature", v.names = "score",
        new.row.names = NULL, direction = "long")
Run Code Online (Sandbox Code Playgroud)

得到:

                        signature      score  sample
A373R11.Signature.1A Signature.1A 0.00000000 A373R11
A373R13.Signature.1A Signature.1A 0.00000000 A373R13
A373R3.Signature.1A  Signature.1A 0.00000000  A373R3
A373R5.Signature.1A  Signature.1A 0.00000000  A373R5
A373R9.Signature.1A  Signature.1A 0.00000000  A373R9

.....

A373R13.Signature.U2 Signature.U2 0.00000000 A373R13
A373R3.Signature.U2  Signature.U2 0.07392018  A373R3
A373R5.Signature.U2  Signature.U2 0.00000000  A373R5
A373R9.Signature.U2  Signature.U2 0.00000000  A373R9
A512R19.Signature.U2 Signature.U2 0.11930689 A512R19
Run Code Online (Sandbox Code Playgroud)

注意:

最好不要让数据对象与函数同名.参见 ?data.


Axe*_*man 7

一个tidyverse解决方案:我们首先将所有data.frame合并在一起,然后根据需要使用gather对其进行重塑:

library(dplyr)
library(tidyr)

# Bind the list of data frames row-wise (list names go into `sample`), then
# collect every remaining column into signature/score pairs.
data %>%
  bind_rows(.id = "sample") %>%
  gather(key = "signature", value = "score", -sample)
Run Code Online (Sandbox Code Playgroud)

得到:

     sample    signature      score
1   A373R11 Signature.1A 0.00000000
2   A373R13 Signature.1A 0.00000000
3    A373R3 Signature.1A 0.00000000
4    A373R5 Signature.1A 0.00000000
5    A373R9 Signature.1A 0.00000000
6   A512R19 Signature.1A 0.10949057
7   A373R11 Signature.1B 0.00000000
8   A373R13 Signature.1B 0.00000000
9    A373R3 Signature.1B 0.00000000
10   A373R5 Signature.1B 0.00000000
....
Run Code Online (Sandbox Code Playgroud)

也可以写成不使用管道的单行代码:

gather(bind_rows(data, .id = "sample"), key = "signature", value = "score", -sample)
Run Code Online (Sandbox Code Playgroud)


lmo*_*lmo 7

这是一个使用rapply的基础R替代方案.请注意,我已将您的data.frame重命名为dat.

# Flatten every leaf value into a single named vector; the outer (sample)
# names are dropped first so each element keeps only its signature name.
myVec <- rapply(unname(dat), identity)
# even better:
# myVec <- unlist(unname(dat))

# Assemble the long-format data.frame: the sample name is repeated once per
# element of each list entry, alongside the signature names and scores.
mydf <- data.frame(
  sample = rep(names(dat), lengths(dat)),
  signature = names(myVec),
  score = myVec,
  row.names = seq_along(myVec),
  stringsAsFactors = FALSE
)
Run Code Online (Sandbox Code Playgroud)

返回结果如下:

 head(mydf)
   sample    signature     score
1 A373R11 Signature.1A 0.0000000
2 A373R11 Signature.1B 0.0000000
3 A373R11  Signature.2 0.0000000
4 A373R11  Signature.3 0.1516317
5 A373R11  Signature.4 0.1497999
6 A373R11  Signature.5 0.0000000
Run Code Online (Sandbox Code Playgroud)

默认情况下,rapply对列表中每个最底层的元素执行一个函数,并返回一个向量.我使用identity来原样返回这些元素.由于每个元素都有一个与之关联的名称,因此rapply返回一个命名向量.

unname用来去掉每个外部列表项的名称.这使得下一步的数据构建变得更加容易.否则,命名向量将具有类似"A373R11.Signature.5"的名称,这将需要更多工作来返回所需结果.