dplyr::across 性能和 dplyr::summarise 到 data.table 的效率

use*_*230 1 r out-of-memory dplyr data.table

dplyr不喜欢我的大数据集,所以我尝试将以下简单代码转换为最有效的data.table等效代码:

library(tidyverse)
data(iris)
iris$year <- rep(c(2000, 3000), each = 25) 
iris$color <- rep(c("red", "green","blue"), each = 50) 
iris$letter <- as.factor(rep(c("A", "B", "C"), each = 50)) 
head(iris, 3)

iris %>% 
  group_by(Species, year) %>% 
  summarise(across(c(-Sepal.Length, -Sepal.Width), dplyr::first), 
            across(c(Sepal.Length, Sepal.Width), dplyr::last)) %>% 
  ungroup
Run Code Online (Sandbox Code Playgroud)

然而,我的努力给了我错误的解决方案,也没有命名列:

library(data.table)
final <- setDT(iris)[, c(
  lapply(setdiff(names(iris), c("Sepal.Length", "Sepal.Width")), head, 1), 
  lapply(c("Sepal.Length", "Sepal.Width"), tail, 1)
), by = c("Species", "year")]
final
Run Code Online (Sandbox Code Playgroud)

也许有更快/更好的data.table方法?

谢谢

编辑

当我让上面的dplyr代码在我的真实数据(约 300 万行,80 列)上运行时,我遇到了内存问题。它rstudio在中止前运行了大约 15 小时。summariseacross已已知慢summarise_at(见这里),但我认为他们应该是相同的现在。使用下面的解决方案并获取20000我真实数据集的第一行,我microbenchmark times = 10得到:

#NOTE this is on my real dataset so not reproducible 
microbenchmark(datatable <- as.data.table(real_data)[, c(lapply(.SD[, nm1, with = FALSE], first),
                                                         lapply(.SD[, nm2, with = FALSE], last)), .( Species, year)],
               
               collapse_package <- collap(real_data, ~  Species + year, custom = list(ffirst = nm1, flast = nm2)),
               
               sqldf_df <- fn$sqldf("
 with first_data as (select min(rowid), $byVar, $firstVar from real_data group by $byVar),
      last_data as (select max(rowid), $byVar, $lastVar from real_data group by $byVar)
 select $byVar, $firstVar, $lastVar from first_data left join last_data using($byVar)
", dbname = tempfile()),
               sqldf_df_no_dbname <- fn$sqldf("
 with first_data as (select min(rowid), $byVar, $firstVar from real_data group by $byVar),
      last_data as (select max(rowid), $byVar, $lastVar from real_data group by $byVar)
 select $byVar, $firstVar, $lastVar from first_data left join last_data using($byVar)
"),
               dplyr_sum_across <- real_data %>% 
                 group_by(Species, year) %>%  
                 summarise(
                   across(c(-Sepal.Length, -Sepal.Width), dplyr::first), 
                   across(c(Sepal.Length, Sepal.Width), dplyr::last)), times = 10)

#                         min         lq        mean     median         uq        max neval cld
# datatable         9664.3822  9974.6145 10211.00909 10130.2571 10438.7439 10872.2079    10  b 
# collapse_package     4.9311     5.0039     5.10331     5.0677     5.1597     5.5432    10  a  
# sqldf_df           394.3706   395.7660   403.82425   399.2484   401.9162   450.3884    10  a  
# sqldf_df_no_dbname 374.9822   380.2022   385.52904   382.6653   387.7198   402.9556    10  a  
# dplyr_sum_across 23969.3657 25055.5517 25800.82757 25653.1470 26262.3583 27616.5212    10  c
Run Code Online (Sandbox Code Playgroud)

library(collapse)运行时间非常令人印象深刻!不错的概述在这里

akr*_*run 5

有了data.table,我们可以使用

nm1 <- c("Petal.Length", "Petal.Width", "color", "letter")
nm2 <- c("Sepal.Length", "Sepal.Width")
as.data.table(iris)[, c(lapply(.SD[, nm1, with = FALSE], first),
     lapply(.SD[, nm2, with = FALSE], last)), .(Species, year)]
Run Code Online (Sandbox Code Playgroud)

-输出

#      Species year Petal.Length Petal.Width color letter Sepal.Length Sepal.Width
#1:     setosa 2000          1.4         0.2   red      A          4.8         3.4
#2:     setosa 3000          1.6         0.2   red      A          5.0         3.3
#3: versicolor 2000          4.7         1.4 green      B          6.4         2.9
#4: versicolor 3000          4.4         1.4 green      B          5.7         2.8
#5:  virginica 2000          6.0         2.5  blue      C          6.7         3.3
#6:  virginica 3000          6.0         1.8  blue      C          5.9         3.0
Run Code Online (Sandbox Code Playgroud)

或者另一种选择是 collapse

library(collapse)
collap(iris, ~ Species + year, custom = list(ffirst = nm1, flast = nm2))
#   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species year color letter
#1          4.8         3.4          1.4         0.2     setosa 2000   red      A
#2          5.0         3.3          1.6         0.2     setosa 3000   red      A
#3          6.4         2.9          4.7         1.4 versicolor 2000 green      B
#4          5.7         2.8          4.4         1.4 versicolor 3000 green      B
#5          6.7         3.3          6.0         2.5  virginica 2000  blue      C
#6          5.9         3.0          6.0         1.8  virginica 3000  blue      C
 
Run Code Online (Sandbox Code Playgroud)