出于某种原因，下面的基准测试似乎显示 data.table 添加新列的速度只有 base R 的一半左右。这是有原因的吗？
# Benchmark from the question: time adding an integer `keycol` column to a
# one-million-row data.table (via `:=`) versus a base data.frame (via `$<-`).
#
# library() is used rather than require(): require() only warns and returns
# FALSE when a package is missing, so the failure would surface later as a
# confusing "could not find function" error.
library(microbenchmark)
library(data.table)

# Two independent tables with the same shape: two numeric columns, 1e6 rows.
DT = data.table(a = runif(1000000), b = rnorm(1000000))
DF = data.frame(a = runif(1000000), b = rnorm(1000000))

# The timed expressions are kept exactly as posted so the printed results
# that follow still correspond to this code. Note that each expression is
# evaluated only twice (times = 2), so the summary statistics are very noisy.
microbenchmark(
  DT[,keycol := seq(1,nrow(DT))],
  DF$keycol <- seq(1,nrow(DF)),
  times = 2)
Unit: microseconds
expr min lq mean median uq max neval
DT[, `:=`(keycol, seq(1, nrow(DT)))] 901.109 901.109 921.1220 921.1220 941.135 941.135 2
DF$keycol <- seq(1, nrow(DF)) 487.844 487.844 527.1865 527.1865 566.529 566.529 2
Run Code Online (Sandbox Code Playgroud)
这是我的R版本,使用data.table版本1.10.4:
> version
_
platform x86_64-w64-mingw32
arch x86_64
os mingw32
system x86_64, mingw32
status
major 3
minor 3.3
year 2017
month 03
day 06
svn rev 72310
language R
version.string R version 3.3.3 (2017-03-06)
nickname Another Canoe
Run Code Online (Sandbox Code Playgroud)
我也对差异之大感到惊讶……我想这要归咎于 `[.data.table` 的调用开销。
顺便说一句，你的基准测试方法并不恰当——更公平的比较不应在某些重复中覆盖已存在的列，而应每次都从头创建表，如下所示：
# Fairer benchmark: both tables are rebuilt from scratch on every repetition,
# so each timed assignment creates `keycol` on a fresh table.
# Requires data.table (for `:=`) and a get_nanotime() timer to be in scope
# (microbenchmark exports a function of that name).
set.seed(102340)
n_reps = 500

# One row per repetition; columns hold the elapsed nanoseconds for each method.
times = matrix(nrow = n_reps, ncol = 2,
               dimnames = list(NULL, c('DT', 'DF')))

for (rep_idx in seq_len(n_reps)) {
  DT = data.table(a = runif(1000000), b = rnorm(1000000))
  DF = data.frame(a = runif(1000000), b = rnorm(1000000))

  # data.table: add the column with `:=`
  dt_start = get_nanotime()
  DT[ , keycol := seq(1, nrow(DT))]
  dt_stop = get_nanotime()

  # data.frame: add the column with `$<-`
  df_start = get_nanotime()
  DF$keycol <- seq(1,nrow(DF))
  df_stop = get_nanotime()

  times[rep_idx, ] = c(dt_stop - dt_start, df_stop - df_start)
}

# Per-method distribution of elapsed times (nanoseconds).
summary(times)
# DT DF
# Min. : 1617687 Min. : 420502
# 1st Qu.: 2205314 1st Qu.: 447691
# Median : 3297872 Median : 464019
# Mean : 5277059 Mean : 594214
# 3rd Qu.: 4291291 3rd Qu.: 578034
# Max. :75731819 Max. :2224713
Run Code Online (Sandbox Code Playgroud)
无论采用哪种方法，使用 seq_len(nrow(DT)) 都比使用 seq(1, nrow(DT)) 更快。
差异的很大一部分似乎来自 `[.data.table` 的调用开销：
# How does the data.table / data.frame assignment-time ratio change with the
# number of rows? For each size in `ns`, both tables are rebuilt 500 times,
# the column assignment is timed for each, and the ratio of median times is
# stored in `ratios`.
# Requires data.table (for `:=`) and a get_nanotime() timer to be in scope
# (microbenchmark exports a function of that name).
set.seed(102340)
ns = as.integer(10^(1:7))        # table sizes: 10, 100, ..., 10,000,000 rows
ratios = numeric(length(ns))     # median(DT time) / median(DF time) per size

for (nn in seq_along(ns)) {
  # column 1: data.table timings, column 2: data.frame timings (nanoseconds)
  times = matrix(nrow = 500L, ncol = 2L)
  for (ii in seq_len(nrow(times))) {
    DT = data.table(a = runif(ns[nn]),
                    b = rnorm(ns[nn]))
    DF = data.frame(a = runif(ns[nn]),
                    b = rnorm(ns[nn]))

    TT0.1 = get_nanotime()
    DT[ , keycol := seq_len(nrow(DT))]
    TT1.1 = get_nanotime()
    delDT = TT1.1 - TT0.1

    TT0.2 = get_nanotime()
    # Use seq_len() here as well: previously this side used seq(1, nrow(DF)),
    # so the two timed expressions did not generate the sequence the same way
    # and the ratio mixed that difference in with the assignment overhead.
    DF$keycol <- seq_len(nrow(DF))
    TT1.2 = get_nanotime()
    delDF = TT1.2 - TT0.2

    times[ii, ] = c(delDT, delDF)
  }
  ratios[nn] = median(times[ , 1L])/median(times[ , 2L])
  print(ratios)  # progress: ratios computed so far (later entries still 0)
}

# Ratio against table size on a log10 x-axis; the dashed red line marks
# parity (ratio of 1) between the two methods.
plot(log10(ns), ratios, type = 'b', lwd = 3L, xaxt = 'n',
     xlab = '# Rows', ylab = 'DT time / DF time',
     main = 'Ratio of DT assignment time\nvs. DF Assignment Time')
# Ticks are placed at the plotted x positions rather than a hard-coded 1:7,
# so they stay aligned if `ns` is changed.
axis(side = 1L, at = log10(ns), labels = ns)
abline(h = 1, lty = 2L, col = 'red')
Run Code Online (Sandbox Code Playgroud)