按列绑定(cbind)data.table 的列表,并通过引用完成

ana*_*ria 7 r data.table

假设我有以下列表:

# Example input: a list holding two small data.tables (three and two columns).
X <- list(
  data.table(1:2, 3:4, 5:6),
  data.table(letters[1:2], letters[3:4])
)
Run Code Online (Sandbox Code Playgroud)

我想按列将列表中的各个元素绑定成单个 data.table。输出应与下面这行代码生成的结果相同:

# Reference result: column-bind all elements of X with a single cbind() call.
do.call(cbind, X)
Run Code Online (Sandbox Code Playgroud)

但是,由于我的原始列表及其包含的 data.table 都非常大,如果能通过引用来完成此操作,而不是像 cbind() 那样复制整个对象,那就更好了。有办法做到吗?比如某种类似 rbindlist()、但按列绑定的函数——我看到它被标记为待办事项……

如果这个简单的问题已经在别处有人解答过而我没有看到,先说声抱歉。

Mic*_*ico 5

下面这种做法怎么样?

# unlist(recursive = FALSE) flattens X by one level, giving a single list of
# columns; setDT() then converts that list to a data.table by reference.
# check.names = TRUE forces unique names on the output.
# setDT() returns its result invisibly, so the trailing [] is there to print it.
setDT(unlist(X, recursive = FALSE), check.names = TRUE)[]
#    V1 V2 V3 V4 V5
# 1:  1  3  5  a  c
# 2:  2  4  6  b  d
Run Code Online (Sandbox Code Playgroud)


akr*_*run 5

dplyr 中的 bind_cols 看起来比 do.call(cbind, ...) 更高效,并且它返回的是一个 data.table

# bind_cols() comes from dplyr; it takes the whole list X in one call.
library(dplyr)
bind_cols(X)
#   V1 V2 V3 V11 V21
#1:  1  3  5   a   c
#2:  2  4  6   b   d
Run Code Online (Sandbox Code Playgroud)

基准测试

# Benchmark input: ten data.tables, each 1e5 rows by 1e3 columns of random
# draws from 1:9. The seed is fixed so the data are reproducible.
set.seed(24)
X1 <- lapply(1:10, function(i) {
  draws <- sample(1:9, 1e5 * 1e3, replace = TRUE)
  as.data.table(matrix(draws, nrow = 1e5, ncol = 1e3))
})

# dplyr::bind_cols() on the ten wide tables.
system.time(bind_cols(X1))
#user  system elapsed 
#   0.01    0.00    0.02 

# Baseline: a single cbind() call over all tables.
system.time(do.call(cbind, X1))
#user  system elapsed 
#   2.22   37.84   40.93 

# Flatten to one list of columns, then setDT() with unique names enforced.
system.time(setDT(unlist(X1, recursive = FALSE), check.names = TRUE))
#  user  system elapsed 
#   0.05    0.00    0.05 
Run Code Online (Sandbox Code Playgroud)

或搭配 check.names = FALSE

# Same flatten-and-setDT() approach, this time without the name check.
system.time(setDT(unlist(X1, recursive = FALSE), check.names = FALSE))
#  user  system elapsed 
#  0.01    0.00    0.02 
Run Code Online (Sandbox Code Playgroud)

同样基于@MichaelChirico的示例数据进行测试

# Second benchmark input: 20 data.tables of NN rows each, where every table
# gets a random number of columns (1 to 15) filled with standard-normal draws.
set.seed(24)
NN <- 1e6
L <- lapply(integer(20L), function(ii) {
  n_cols <- sample(15L, 1L)
  cols <- lapply(integer(n_cols), function(x) rnorm(NN))
  setDT(cols)
})

# bind_cols() on the list of tables L.
system.time(bind_cols(L))
# user  system elapsed 
#      0       0       0 

# Baseline: a single cbind() call over all tables in L.
system.time(do.call(cbind, L))
# user  system elapsed 
#   0.44    0.53    0.97 


# Column-by-column approach: append every column of L[[2]], L[[3]], ... to the
# first table with set(), naming the new columns V<k> in running order.
# NOTE(review): `base` is assigned from L[[1L]] without copy(); since set()
# adds columns by reference, L[[1L]] presumably gains these columns too —
# confirm, and use copy(L[[1L]]) if L must stay unchanged for later timings.
system.time({
  base <- L[[1L]]
  jj <- ncol(base) + 1L
  # seq_along(L)[-1L] is empty when L holds a single table, whereas
  # 2L:length(L) would count down (2, 1) and index past the end of L.
  for (ii in seq_along(L)[-1L]) {
    for (col_j in seq_len(ncol(L[[ii]]))) {
      set(base, j = sprintf("V%d", jj), value = L[[ii]][[col_j]])
      jj <- jj + 1L
    }
  }
})
#user  system elapsed 
#  0.12    0.33    0.46 
Run Code Online (Sandbox Code Playgroud)

并使用@MichaelChirico的更新方法

# Flatten L into one list of columns and convert it with setDT(),
# enforcing unique column names.
system.time(setDT(unlist(L, recursive = FALSE), check.names = TRUE))
#  user  system elapsed 
#     0       0       0 
Run Code Online (Sandbox Code Playgroud)