R data.table 按行填充行 NA

Question

R data.table 按行填充行 NA

我想在 data.table 中用 'locf'（用前一个非缺失值向后填充）来填充 NA，但要按行进行，即每一行单独处理。我用下面的代码似乎得不到想要的结果：

library(data.table)
set.seed(456)

# Dummy data: eleven integer columns (a..j plus xx), six rows each.
# The sample() calls are kept in their original order so that the seeded
# values are unchanged.
dt <- data.table(
  a = sample(1:4, 6, replace = TRUE),
  b = sample(1:4, 6, replace = TRUE),
  c = sample(1:4, 6, replace = TRUE),
  d = sample(1:4, 6, replace = TRUE),
  e = sample(1:4, 6, replace = TRUE),
  f = sample(1:4, 6, replace = TRUE),
  g = sample(1:4, 6, replace = TRUE),
  h = sample(1:4, 6, replace = TRUE),
  i = sample(1:4, 6, replace = TRUE),
  j = sample(1:4, 6, replace = TRUE),
  xx = sample(1:4, 6, replace = TRUE)
)

# Punch a few holes: one NA in row 4, two adjacent NAs in row 1.
dt[4, c := NA]
dt[1, g := NA]
dt[1, h := NA]

# Columns to fill: everything except xx.
cols <- setdiff(names(dt), "xx")

# Attempt: use nafill over rows by grouping on the row number.
# This leaves dt unchanged: nafill() fills within each column, and with one
# row per group every column has length 1, so there is no earlier value to
# carry forward.
dt[, (cols) := nafill(.SD, type = "locf"), by = seq_len(nrow(dt)), .SDcols = cols]

Run Code Online (Sandbox Code Playgroud)

结果和原来的表没有任何不同，我遗漏了什么？

        a      b      c      d      e      f     g       h      i      j xx
1:      1      3      3      2      3      1     NA     NA      4      3 1
2:      1      1      2      2      1      2      2      1      2      4 1
3:      3      2      3      1      1      4      3      3      2      1 2
4:      2      3     NA      1      2      2      1      4      3      4 2
5:      1      2      3      4      4      3      2      2      2      4 3
6:      4      1      4      2      1      4      4      3      3      4 3

Run Code Online (Sandbox Code Playgroud)

（注意实际数据为 1200 万行，如果这对性能有任何影响）

Answer 1

r2e*_*ans 9

一种不错的方法是使用 for 循环。它并不是逐行处理，而是对 cols 中的每一列，一次性处理“该列为 NA 的所有行”。

# Walk the columns of `cols` left to right, starting at the second one
# (the first column has no left neighbour to borrow from).
for (i in seq_along(cols)[-1]) {
  prevcol <- cols[i-1]
  thiscol <- cols[i]
  # Only rows where this column is NA are updated; they take the value of the
  # previous column, which was itself filled on the preceding iteration, so
  # runs of consecutive NAs in a row are carried forward as well.
  dt[is.na(get(thiscol)), (thiscol) := fcoalesce(get(thiscol), get(prevcol)) ]
}

# Printed result: the NAs in row 1 (g, h) and row 4 (c) are now filled.
dt
#        a     b     c     d     e     f     g     h     i     j    xx
#    <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1:     1     3     3     2     3     1     1     1     4     3     1
# 2:     1     1     2     2     1     2     2     1     2     4     1
# 3:     3     2     3     1     1     4     3     3     2     1     2
# 4:     2     3     3     1     2     2     1     4     3     4     2
# 5:     1     2     3     4     4     3     2     2     2     4     3
# 6:     4     1     4     2     1     4     4     3     3     4     3

Run Code Online (Sandbox Code Playgroud)

诚然，get(.) 的使用并不完美，但我认为通常没有问题。

另一种方法，速度差不多（取决于数据的大小）：

# Fold over the columns in .SD from left to right: each column is coalesced
# with the accumulated (already filled) column before it. accumulate = TRUE
# keeps every intermediate result, giving one filled column per entry in cols.
dt[, (cols) := Reduce(function(prev,this) fcoalesce(this, prev), .SD, accumulate = TRUE), .SDcols = cols]
# same results

Run Code Online (Sandbox Code Playgroud)

基准测试：既然你说实际数据有 1200 万行，性能就很重要。

我将使用 2M 行并更新随机化NAs的方法。

library(data.table)
set.seed(456)
n <- 2e6 # 6e5 for the smaller run

# Same eleven-column layout as the small example, with n rows.
# The sample() calls are kept in their original order so that the seeded
# values are unchanged.
dt <- data.table(
  a = sample(1:4, n, replace = TRUE),
  b = sample(1:4, n, replace = TRUE),
  c = sample(1:4, n, replace = TRUE),
  d = sample(1:4, n, replace = TRUE),
  e = sample(1:4, n, replace = TRUE),
  f = sample(1:4, n, replace = TRUE),
  g = sample(1:4, n, replace = TRUE),
  h = sample(1:4, n, replace = TRUE),
  i = sample(1:4, n, replace = TRUE),
  j = sample(1:4, n, replace = TRUE),
  xx = sample(1:4, n, replace = TRUE)
)

# Random (row, column) index pairs marking the cells to blank out; the
# column index is drawn from all columns of dt, so xx can receive NAs too.
mtx <- cbind(
  sample(nrow(dt), ceiling(n * 11 / 20), replace = TRUE),
  sample(ncol(dt), ceiling(n * 11 / 20), replace = TRUE)
)
# Drop repeated pairs so each cell is listed at most once.
mtx <- mtx[!duplicated(mtx), ]
dt[mtx] <- NA
head(dt)
#        a     b     c     d     e     f     g     h     i     j    xx
#    <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1:     1     2     2     3     2     1     2     3     3     2     2
# 2:     1     3     4     1     4     4     3     2     4     3     3
# 3:     3     4     2     2     3     4     2     2     1    NA     1
# 4:     2     1     4     1     2     3    NA     4     4     4     3
# 5:     1     2     3     3     4     3     3    NA     1     4     1
# 6:     4     3     4     2     2    NA     4     1     2     4     2

Run Code Online (Sandbox Code Playgroud)

不幸的是，该transpose方法失败了：

# Transpose approach: flip the table so every original row becomes a column,
# fill NAs down each column with locf, flip back, and restore the names.
# With n = 2e6 rows the transposed table has 2e6 columns; on the author's
# machine the run aborts with the allocation error shown below.
system.time({
  dt2 = transpose(dt)
  setnafill(dt2, type = 'locf')
  dt2 = transpose(dt2)
  setnames(dt2, names(dt))
})
# Error: cannot allocate vector of size 30.6 Gb

Run Code Online (Sandbox Code Playgroud)

但for循环（和Reduce，顺便说一句）工作正常：

# NOTE(review): dt has no column named "N", so this setdiff() removes nothing
# and cols is every column, xx included (the question excluded "xx").
# Presumably left as is so that the result matches the transpose approach,
# which also fills every column -- confirm the intent.
cols <- setdiff(names(dt),"N")
# Time the column-by-column fill on the 2M-row table.
system.time({
  for (i in seq_along(cols)[-1]) {
    prevcol <- cols[i-1]
    thiscol <- cols[i]
    # Rows where this column is NA take the previous column's value.
    dt[is.na(get(thiscol)), (thiscol) := fcoalesce(get(thiscol), get(prevcol)) ]
  }
})
#    user  system elapsed 
#    0.14    0.00    0.11 
# The NAs visible in the earlier head(dt) (rows 3-6) are now filled.
head(dt)
#        a     b     c     d     e     f     g     h     i     j    xx
#    <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1:     1     2     2     3     2     1     2     3     3     2     2
# 2:     1     3     4     1     4     4     3     2     4     3     3
# 3:     3     4     2     2     3     4     2     2     1     1     1
# 4:     2     1     4     1     2     3     3     4     4     4     3
# 5:     1     2     3     3     4     3     3     3     1     4     1
# 6:     4     3     4     2     2     2     4     1     2     4     2

Run Code Online (Sandbox Code Playgroud)

如果我把问题规模缩小到 600K 行，那么两种方法都能运行。（我不知道我的系统的临界点在哪里……可能是 1M，谁知道呢，我只是想把它们并排比较一下。）使用 n <- 6e5 重新生成 dt 后，我看到以下数据和简单的计时结果：

# dt regenerated with n <- 6e5; first rows and total NA count before filling.
head(dt)
#        a     b     c     d     e     f     g     h     i     j    xx
#    <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1:     1     2     3     1     3     4    NA     3     3     3     3
# 2:     1     3     2     2     4     3     1     2     2     4     1
# 3:     3     4     2     1     1     1     1     4     2     4     2
# 4:     2     4     1    NA     1     4     3     1     4     1     1
# 5:     1    NA     4     2    NA    NA     4     4     2     2    NA
# 6:     4     1     4     4     1     2     3     3     1     1     2

sum(is.na(dt))
# [1] 321782
# Transpose approach: the filled result lands in dt2; dt itself is not modified.
system.time({
  dt2 = transpose(dt)
  setnafill(dt2, type = 'locf')
  dt2 = transpose(dt2)
  setnames(dt2, names(dt))
})
#    user  system elapsed 
#    4.27    4.50    7.74 

sum(is.na(dt))  # 'dt' is unchanged, only important here to compare the 'for' loop
# [1] 321782
sum(is.na(dt2)) # rows with leading columns having 'NA', nothing to coalesce, not surprising
# [1] 30738

# NOTE(review): dt has no column named "N", so cols is every column, xx
# included. The transpose approach also fills every column, which is
# consistent with identical(dt, dt2) being TRUE below -- confirm the intent.
cols <- setdiff(names(dt),"N")
# for-loop approach: fills dt in place, column by column.
system.time({
  for (i in seq_along(cols)[-1]) {
    prevcol <- cols[i-1]
    thiscol <- cols[i]
    dt[is.na(get(thiscol)), (thiscol) := fcoalesce(get(thiscol), get(prevcol)) ]
  }
})
#    user  system elapsed 
#    0.10    0.03    0.06 

# Both approaches give the same table.
identical(dt, dt2)
# [1] TRUE

### regenerate `dt` so it has `NA`s again
# Reduce approach: one call that fills all columns of cols at once.
system.time({
  dt[, (cols) := Reduce(function(prev,this) fcoalesce(this,prev), .SD, accumulate = TRUE), .SDcols = cols]
})
#    user  system elapsed 
#    0.03    0.00    0.03 

identical(dt, dt2)
# [1] TRUE

Run Code Online (Sandbox Code Playgroud)

使用 bench::mark 可以做更可靠的基准测试，但由于每次迭代都需要先 copy(dt)，结果会受到一些影响。虽然这个开销并不大，

# Measure the cost of copy(dt) on its own (600,000 x 11 table).
bench::mark(copy(dt))
# # A tibble: 1 x 13
#   expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result                           memory                  time          gc               
#   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>                           <list>                  <list>        <list>           
# 1 copy(dt)     7.77ms   20.9ms      45.1    25.2MB        0    23     0      510ms <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [14 x 3]> <bch:tm [23]> <tibble [23 x 3]>

Run Code Online (Sandbox Code Playgroud)

但它仍然是额外的开销。因此，我会对 transpose 代码做两次测试，一次包含 copy(dt)，一次不包含，以便更公平地与 for 和 reduce 两种方案进行比较。（请注意，bench::mark 的默认行为是验证所有输出都 identical。这可以禁用，但我没有这样做，所以所有代码块都返回相同的结果。）

# Four-way comparison on the 600,000-row table. Every expression returns the
# filled table so that bench::mark can check the results against each other.
bench::mark(
  # Transpose approach working directly from dt (the result goes to dt2).
  transpose1 = {
    dt2 = transpose(dt)
    setnafill(dt2, type = 'locf')
    dt2 = transpose(dt2)
    setnames(dt2, names(dt))
    dt2
  },
  # Same, but with an extra copy(dt) so it carries the same copying overhead
  # as the in-place approaches below.
  transpose2 = {
    dt0 = copy(dt)
    dt2 = transpose(dt0)
    setnafill(dt2, type = 'locf')
    dt2 = transpose(dt2)
    setnames(dt2, names(dt0))
    dt2
  },
  # for-loop approach; it modifies its input, so it runs on a fresh copy.
  forloop = {
    dt0 <- copy(dt)
    for (i in seq_along(cols)[-1]) {
      prevcol <- cols[i-1]
      thiscol <- cols[i]
      dt0[is.na(get(thiscol)), (thiscol) := fcoalesce(get(thiscol), get(prevcol)) ]
    }
    dt0
  },
  # Reduce approach, also on a fresh copy.
  reduce = {
    dt0 <- copy(dt)
    dt0[, (cols) := Reduce(function(prev,this) fcoalesce(this,prev), .SD, accumulate = TRUE), .SDcols = cols]
  },
  min_iterations = 10)
# Warning: Some expressions had a GC in every iteration; so filtering is disabled.
# # A tibble: 4 x 13
#   expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result                           memory                      time          gc               
#   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>                           <list>                      <list>        <list>           
# 1 transpose1    4.94s    5.48s     0.154    1.28GB    0.201    10    13      1.08m <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [33,008 x 3]> <bch:tm [10]> <tibble [10 x 3]>
# 2 transpose2    5.85s    6.29s     0.130     1.3GB    0.259    10    20      1.29m <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [15,316 x 3]> <bch:tm [10]> <tibble [10 x 3]>
# 3 forloop     48.37ms 130.91ms     2.87    71.14MB    0        10     0      3.49s <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [191 x 3]>    <bch:tm [10]> <tibble [10 x 3]>
# 4 reduce      48.08ms  75.82ms     4.70       71MB    0.470    10     1      2.13s <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [38 x 3]>     <bch:tm [10]> <tibble [10 x 3]>

Run Code Online (Sandbox Code Playgroud)

由此：

时间：统一换算为毫秒后，transpose 的 4940ms 远逊于 for/reduce 的 48ms；
内存：transpose 的 1.28GB 远逊于 for/reduce 的 71MB。

（编辑以将基准测试的最小迭代次数增加到 10。）

Answer 2

B. *_*ang 7

解决此问题的另一种方法是使用 set 函数。该方案速度快，内存效率也很高。我还在一个 1200 万行的数据表上，将它与 @r2evans 的 forloop 和 Reduce 两种方案进行了比较。

我还考虑了 @r2evans 答案中 forloop 方案的一个修改版本（即下文的 forloop1）。新版本只是去掉了 data.table 的 i 参数中的表达式 is.na(get(thiscol))。与原始版本相比，这一改动有助于改善内存占用和性能。

library(data.table)

# Fill each column of `cols` (from the second one onward) with the value of
# the column to its left wherever it is NA, updating `dt` by reference.
# The values are read from `dt` itself, the same table that is written to,
# and columns are addressed by name, so `cols` does not have to be the
# leading columns of `dt`.
for (cl in seq_along(cols)[-1L]) {
  set(dt, j = cols[cl], value = fcoalesce(dt[[cols[cl]]], dt[[cols[cl - 1L]]]))
}

Run Code Online (Sandbox Code Playgroud)

基准

# 12M-row test table with seven columns. Each column is drawn from
# c(1:4, NA) with weights (5:1)/15, so NA has probability 1/15 per cell.
n <- 12e6
set.seed(0123456789)
d <- setDT(replicate(7, sample(c(1:4, NA), n, TRUE, (5:1)/15), simplify=FALSE))
# Name the columns a..f plus xx; only a..f are filled.
setnames(d, c(letters[1:6], "xx"))
cols <- setdiff(names(d),"xx")

# One private copy per benchmarked method, since each one updates its table
# by reference.
dt0 <- copy(d)
dt1 <- copy(d)
dt2 <- copy(d)
dt3 <- copy(d)

# Compare four in-place fill strategies on the 12M-row table.
# NOTE(review): dt0..dt3 are copied once, outside bench::mark, and := / set()
# update by reference, so every iteration after the first appears to run on
# an already-filled table -- confirm whether this affects the timings.
bench::mark(
  # modified version
  forloop1 = {
    for (i in seq_along(cols)[-1]) {
      prevcol <- cols[i-1]
      thiscol <- cols[i]
      # i not specified
      dt0[, (thiscol) := fcoalesce(get(thiscol), get(prevcol)) ]
    }
    dt0
  },
  # original version
  forloop2 = {
    for (i in seq_along(cols)[-1]) {
      prevcol <- cols[i-1]
      thiscol <- cols[i]
      dt1[is.na(get(thiscol)), (thiscol) := fcoalesce(get(thiscol), get(prevcol)) ]
    }
    dt1
  },
  reduce = {
    dt2[, (cols) := Reduce(function(prev,this) fcoalesce(this,prev), .SD, accumulate = TRUE), .SDcols = cols]
  },
  # set() addresses columns by position here, which matches cols because
  # a..f are the first six columns of the table.
  set = {
    for(cl in seq_along(cols)[-1L]) set(dt3, j=cl, value=fcoalesce(dt3[[cl]], dt3[[cl-1L]]))
    dt3
  },
  min_iterations = 5L)

# # A tibble: 4 x 13
#   expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result                        memory               time           gc              
#   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>                        <list>               <list>         <list>          
# 1 forloop1     77.1ms   87.9ms     10.9      229MB     2.74     4     1      366ms <data.table [12,000,000 x 7]> <Rprofmem [134 x 3]> <bench_tm [5]> <tibble [5 x 3]>
# 2 forloop2    192.8ms  201.3ms      5.01     460MB     3.34     3     2      599ms <data.table [12,000,000 x 7]> <Rprofmem [183 x 3]> <bench_tm [5]> <tibble [5 x 3]>
# 3 reduce      114.5ms  130.2ms      7.76     458MB     5.17     3     2      387ms <data.table [12,000,000 x 7]> <Rprofmem [21 x 3]>  <bench_tm [5]> <tibble [5 x 3]>
# 4 set          65.6ms   68.5ms     14.5      229MB     9.65     3     2      207ms <data.table [12,000,000 x 7]> <Rprofmem [76 x 3]>  <bench_tm [5]> <tibble [5 x 3]>

Run Code Online (Sandbox Code Playgroud)

使用 set 函数可以提升性能并改善内存占用。我个人更关注中位时间（median），而不是 total_time。

归档时间：	4 年，5 月前
查看次数：	143 次
最近记录：	4 年，5 月前