我想在 data.table 中用 'locf' 方式填充 NA,但要逐行(沿着每一行横向)分别处理。用下面的代码似乎得不到想要的结果:
library(data.table)
set.seed(456)
# Small dummy table: ten value columns (a..j) plus an extra column `xx`.
dt <- data.table(
  a = sample(1:4, 6, replace = TRUE),
  b = sample(1:4, 6, replace = TRUE),
  c = sample(1:4, 6, replace = TRUE),
  d = sample(1:4, 6, replace = TRUE),
  e = sample(1:4, 6, replace = TRUE),
  f = sample(1:4, 6, replace = TRUE),
  g = sample(1:4, 6, replace = TRUE),
  h = sample(1:4, 6, replace = TRUE),
  i = sample(1:4, 6, replace = TRUE),
  j = sample(1:4, 6, replace = TRUE),
  xx = sample(1:4, 6, replace = TRUE)
)
# Punch a few holes to fill.
dt[4, c := NA]
dt[1, g := NA]
dt[1, h := NA]
# Columns to fill: everything except `xx`.
cols <- setdiff(names(dt), "xx")
# Attempted row-wise fill. This leaves `dt` unchanged: nafill() fills each
# column of .SD on its own (down the column), and grouping by row number makes
# every group a single row, so there is never an earlier value to carry forward.
dt[, (cols) := nafill(.SD, type = "locf"), by = seq_len(nrow(dt)), .SDcols = cols]
Run Code Online (Sandbox Code Playgroud)
结果和原来的表没有任何不同,我漏掉了什么?
a b c d e f g h i j xx
1: 1 3 3 2 3 1 NA NA 4 3 1
2: 1 1 2 2 1 2 2 1 2 4 1
3: 3 2 3 1 1 4 3 3 2 1 2
4: 2 3 NA 1 2 2 1 4 3 4 2
5: 1 2 3 4 4 3 2 2 2 4 3
6: 4 1 4 2 1 4 4 3 3 4 3
Run Code Online (Sandbox Code Playgroud)
(注意:实际数据有 1200 万行,不知这是否会对性能有影响)
一种不错的方法是使用 for 循环。它并不是逐行处理的:对于 cols 中的每一列 'X',它一次性处理“列 'X' 中为 NA 的所有行”。
# Walk the columns left to right. For each column after the first, every row
# where that column is NA takes the value of the column to its left, so a
# value propagates across a run of consecutive NAs.
for (k in seq_along(cols)[-1]) {
  cur_col <- cols[k]
  left_col <- cols[k - 1]
  dt[is.na(get(cur_col)), (cur_col) := fcoalesce(get(cur_col), get(left_col))]
}
dt
# a b c d e f g h i j xx
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1: 1 3 3 2 3 1 1 1 4 3 1
# 2: 1 1 2 2 1 2 2 1 2 4 1
# 3: 3 2 3 1 1 4 3 3 2 1 2
# 4: 2 3 3 1 2 2 1 4 3 4 2
# 5: 1 2 3 4 4 3 2 2 2 4 3
# 6: 4 1 4 2 1 4 4 3 3 4 3
Run Code Online (Sandbox Code Playgroud)
诚然,使用 get(.) 并不完美,但我认为通常没有问题。
另一种方法,速度差不多(取决于数据的大小):
# Same fill written as a left-to-right fold over the columns: each column is
# coalesced with the already-filled column before it, and accumulate = TRUE
# keeps every intermediate result so all of `cols` are reassigned at once.
dt[, (cols) := Reduce(
  function(left, right) fcoalesce(right, left),
  .SD,
  accumulate = TRUE
), .SDcols = cols]
# same results
Run Code Online (Sandbox Code Playgroud)
基准测试:既然你说有 1200 万行,性能就很重要。
我将使用 200 万行,并改用随机位置插入 NA 的方法。
library(data.table)
set.seed(456)
n <- 2e6 # 6e5
# Eleven integer columns (a..j and xx), each drawn from 1:4.
dt <- data.table(
  a = sample(1:4, n, replace = TRUE),
  b = sample(1:4, n, replace = TRUE),
  c = sample(1:4, n, replace = TRUE),
  d = sample(1:4, n, replace = TRUE),
  e = sample(1:4, n, replace = TRUE),
  f = sample(1:4, n, replace = TRUE),
  g = sample(1:4, n, replace = TRUE),
  h = sample(1:4, n, replace = TRUE),
  i = sample(1:4, n, replace = TRUE),
  j = sample(1:4, n, replace = TRUE),
  xx = sample(1:4, n, replace = TRUE)
)
# Draw random (row, column) cell positions, drop repeated picks, and set
# those cells to NA.
mtx <- cbind(
  sample(nrow(dt), ceiling(n * 11 / 20), replace = TRUE),
  sample(ncol(dt), ceiling(n * 11 / 20), replace = TRUE)
)
mtx <- mtx[!duplicated(mtx), ]
dt[mtx] <- NA
head(dt)
# a b c d e f g h i j xx
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1: 1 2 2 3 2 1 2 3 3 2 2
# 2: 1 3 4 1 4 4 3 2 4 3 3
# 3: 3 4 2 2 3 4 2 2 1 NA 1
# 4: 2 1 4 1 2 3 NA 4 4 4 3
# 5: 1 2 3 3 4 3 3 NA 1 4 1
# 6: 4 3 4 2 2 NA 4 1 2 4 2
Run Code Online (Sandbox Code Playgroud)
不幸的是,transpose 方法失败了:
system.time({
  # Turn rows into columns, fill down each column, then turn them back and
  # restore the original column names.
  dt2 <- transpose(dt)
  setnafill(dt2, type = "locf")
  dt2 <- transpose(dt2)
  setnames(dt2, names(dt))
})
# Error: cannot allocate vector of size 30.6 Gb
Run Code Online (Sandbox Code Playgroud)
但 for 循环(顺便说一句,Reduce 也一样)工作正常:
# No column of `dt` is named "N", so this keeps every column (xx included).
cols <- setdiff(names(dt), "N")
system.time({
  # Left-to-right fill: rows where the current column is NA take the value
  # from the column immediately to its left.
  for (k in seq_along(cols)[-1]) {
    cur_col <- cols[k]
    left_col <- cols[k - 1]
    dt[is.na(get(cur_col)), (cur_col) := fcoalesce(get(cur_col), get(left_col))]
  }
})
# user system elapsed
# 0.14 0.00 0.11
head(dt)
# a b c d e f g h i j xx
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1: 1 2 2 3 2 1 2 3 3 2 2
# 2: 1 3 4 1 4 4 3 2 4 3 3
# 3: 3 4 2 2 3 4 2 2 1 1 1
# 4: 2 1 4 1 2 3 3 4 4 4 3
# 5: 1 2 3 3 4 3 3 3 1 4 1
# 6: 4 3 4 2 2 2 4 1 2 4 2
Run Code Online (Sandbox Code Playgroud)
如果我将问题规模缩小到 60 万行,两种方法就都能运行了。(我不知道我的系统的临界点在哪里……也许是 100 万行,谁知道呢,我只是想并排比较它们。)使用 n <- 6e5 重新生成 dt 后,我看到以下数据和简单的计时结果:
head(dt)
# a b c d e f g h i j xx
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1: 1 2 3 1 3 4 NA 3 3 3 3
# 2: 1 3 2 2 4 3 1 2 2 4 1
# 3: 3 4 2 1 1 1 1 4 2 4 2
# 4: 2 4 1 NA 1 4 3 1 4 1 1
# 5: 1 NA 4 2 NA NA 4 4 2 2 NA
# 6: 4 1 4 4 1 2 3 3 1 1 2
sum(is.na(dt))
# [1] 321782
system.time({
  # Transpose, fill down each column, transpose back, restore the names.
  dt2 <- transpose(dt)
  setnafill(dt2, type = "locf")
  dt2 <- transpose(dt2)
  setnames(dt2, names(dt))
})
# user system elapsed
# 4.27 4.50 7.74
sum(is.na(dt)) # 'dt' is unchanged, only important here to compare the 'for' loop
# [1] 321782
sum(is.na(dt2)) # rows with leading columns having 'NA', nothing to coalesce, not surprising
# [1] 30738
# No column of `dt` is named "N", so this keeps every column (xx included).
cols <- setdiff(names(dt), "N")
system.time({
  # Left-to-right fill, updating `dt` in place one column at a time.
  for (k in seq_along(cols)[-1]) {
    cur_col <- cols[k]
    left_col <- cols[k - 1]
    dt[is.na(get(cur_col)), (cur_col) := fcoalesce(get(cur_col), get(left_col))]
  }
})
# user system elapsed
# 0.10 0.03 0.06
identical(dt, dt2)
# [1] TRUE
### regenerate `dt` so it has `NA`s again
system.time({
  # Fold over the columns left to right, coalescing each one with the
  # already-filled column before it.
  dt[, (cols) := Reduce(
    function(left, right) fcoalesce(right, left),
    .SD,
    accumulate = TRUE
  ), .SDcols = cols]
})
# user system elapsed
# 0.03 0.00 0.03
identical(dt, dt2)
# [1] TRUE
Run Code Online (Sandbox Code Playgroud)
使用 bench::mark 进行更可靠的基准测试时,由于每次迭代都需要 copy(dt),结果会受到一些影响。虽然这个开销并不大,
# Time copy(dt) on its own to see how much the per-iteration copy costs.
bench::mark(copy(dt))
# # A tibble: 1 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 copy(dt) 7.77ms 20.9ms 45.1 25.2MB 0 23 0 510ms <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [14 x 3]> <bch:tm [23]> <tibble [23 x 3]>
Run Code Online (Sandbox Code Playgroud)
但它仍然是额外的开销。因此,我将对 transpose 代码做两次测试,一次包含 copy(dt)、一次不包含,以便更公平地与 for 和 reduce 方案进行比较。(请注意,bench::mark 默认会验证所有输出都是 identical 的。这项检查可以关闭,但我没有关闭,因此所有代码块返回的结果都相同。)
bench::mark(
  # transpose approach run directly on `dt` (no copy needed: `dt` is not
  # modified by it)
  transpose1 = {
    dt2 <- transpose(dt)
    setnafill(dt2, type = "locf")
    dt2 <- transpose(dt2)
    setnames(dt2, names(dt))
    dt2
  },
  # same approach, but paying for copy(dt) like the in-place methods below
  transpose2 = {
    dt0 <- copy(dt)
    dt2 <- transpose(dt0)
    setnafill(dt2, type = "locf")
    dt2 <- transpose(dt2)
    setnames(dt2, names(dt0))
    dt2
  },
  # column-by-column loop on a fresh copy, so every iteration starts with NAs
  forloop = {
    dt0 <- copy(dt)
    for (k in seq_along(cols)[-1]) {
      cur_col <- cols[k]
      left_col <- cols[k - 1]
      dt0[is.na(get(cur_col)), (cur_col) := fcoalesce(get(cur_col), get(left_col))]
    }
    dt0
  },
  # fold over the columns on a fresh copy
  reduce = {
    dt0 <- copy(dt)
    dt0[, (cols) := Reduce(
      function(left, right) fcoalesce(right, left),
      .SD,
      accumulate = TRUE
    ), .SDcols = cols]
  },
  min_iterations = 10
)
# Warning: Some expressions had a GC in every iteration; so filtering is disabled.
# # A tibble: 4 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 transpose1 4.94s 5.48s 0.154 1.28GB 0.201 10 13 1.08m <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [33,008 x 3]> <bch:tm [10]> <tibble [10 x 3]>
# 2 transpose2 5.85s 6.29s 0.130 1.3GB 0.259 10 20 1.29m <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [15,316 x 3]> <bch:tm [10]> <tibble [10 x 3]>
# 3 forloop 48.37ms 130.91ms 2.87 71.14MB 0 10 0 3.49s <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [191 x 3]> <bch:tm [10]> <tibble [10 x 3]>
# 4 reduce 48.08ms 75.82ms 4.70 71MB 0.470 10 1 2.13s <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [38 x 3]> <bch:tm [10]> <tibble [10 x 3]>
Run Code Online (Sandbox Code Playgroud)
由此:
(编辑以将基准测试的最小迭代次数增加到 10。)
解决此问题的另一种方法是使用 set 函数。该解决方案既快速又非常高效。我还在 1200 万行的数据表上将它与 @r2evans 的 forloop 和 Reduce 方案进行了比较。
我还考虑了 @r2evans 答案中 forloop 方案的一个修改版本(即下面的 forloop1)。新版本只是去掉了 data.table 的 i 参数中的表达式(is.na(get(thiscol)))。与原始版本相比,这一改动有助于改善内存占用和性能。
library(data.table)
# For every column after the first, overwrite it in place with its own value
# where present and with its left neighbour's value where NA. Values are read
# from `dt` itself (the table being updated), and columns are addressed by
# name so the fill stays correct even when `cols` are not the leading columns.
for (k in seq_along(cols)[-1L]) {
  set(
    dt,
    j = cols[k],
    value = fcoalesce(dt[[cols[k]]], dt[[cols[k - 1L]]])
  )
}
Run Code Online (Sandbox Code Playgroud)
n <- 12e6
set.seed(0123456789)
# Seven columns of n draws from {1, 2, 3, 4, NA} with weights 5:1 / 15.
d <- setDT(
  replicate(
    7,
    sample(c(1:4, NA), size = n, replace = TRUE, prob = (5:1) / 15),
    simplify = FALSE
  )
)
setnames(d, c(letters[1:6], "xx"))
# Columns to fill: everything except `xx`.
cols <- setdiff(names(d), "xx")
# One independent copy per benchmarked method, since each updates in place.
dt0 <- copy(d)
dt1 <- copy(d)
dt2 <- copy(d)
dt3 <- copy(d)
# NOTE(review): dt0..dt3 are updated by reference and are not re-copied inside
# the expressions, so iterations after the first run on already-filled data.
# Confirm this is intended when reading the timings.
bench::mark(
  # modified version: no `i` filter, the whole column is coalesced
  forloop1 = {
    for (k in seq_along(cols)[-1]) {
      cur_col <- cols[k]
      left_col <- cols[k - 1]
      dt0[, (cur_col) := fcoalesce(get(cur_col), get(left_col))]
    }
    dt0
  },
  # original version: only rows where the current column is NA are touched
  forloop2 = {
    for (k in seq_along(cols)[-1]) {
      cur_col <- cols[k]
      left_col <- cols[k - 1]
      dt1[is.na(get(cur_col)), (cur_col) := fcoalesce(get(cur_col), get(left_col))]
    }
    dt1
  },
  # fold over the columns, reassigning all of `cols` at once
  reduce = {
    dt2[, (cols) := Reduce(
      function(left, right) fcoalesce(right, left),
      .SD,
      accumulate = TRUE
    ), .SDcols = cols]
  },
  # set(): column-by-column update by position
  set = {
    for (pos in seq_along(cols)[-1L]) {
      set(dt3, j = pos, value = fcoalesce(dt3[[pos]], dt3[[pos - 1L]]))
    }
    dt3
  },
  min_iterations = 5L
)
# # A tibble: 4 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 forloop1 77.1ms 87.9ms 10.9 229MB 2.74 4 1 366ms <data.table [12,000,000 x 7]> <Rprofmem [134 x 3]> <bench_tm [5]> <tibble [5 x 3]>
# 2 forloop2 192.8ms 201.3ms 5.01 460MB 3.34 3 2 599ms <data.table [12,000,000 x 7]> <Rprofmem [183 x 3]> <bench_tm [5]> <tibble [5 x 3]>
# 3 reduce 114.5ms 130.2ms 7.76 458MB 5.17 3 2 387ms <data.table [12,000,000 x 7]> <Rprofmem [21 x 3]> <bench_tm [5]> <tibble [5 x 3]>
# 4 set 65.6ms 68.5ms 14.5 229MB 9.65 3 2 207ms <data.table [12,000,000 x 7]> <Rprofmem [76 x 3]> <bench_tm [5]> <tibble [5 x 3]>
Run Code Online (Sandbox Code Playgroud)
使用 set 函数可以改善性能和内存占用。我个人更关心中位时间(median),而不是 total_time。