加速 group_by 操作 dplyr

bla*_*252 5 r dplyr data.table

我有一个包含很多组的 tibble,我想对其进行分组操作(下面高度简化的变异)。

z <- tibble(k1 = rep(seq(1, 600000, 1), 5),
            category = sample.int(2, 3000000, replace = TRUE)) %>%
  arrange(k1, category)
t1 <- z %>% 
  group_by(k1) %>%
  mutate(x = if_else(category == 1 & lead(category) == 2, "pie", "monkey")) %>% 
  ungroup()
Run Code Online (Sandbox Code Playgroud)

这个操作非常慢,但如果我“手动”进行分组,该过程很难阅读,写起来更烦人,但速度要快得多(20倍)。

z %>%
  mutate(x = if_else(category == 1 & lead(category) == 2 & k1 == lead(k1), "pie", "monkey"),
         x = if_else(category == 1 & k1 != lead(k1), NA_character_, x)) 
Run Code Online (Sandbox Code Playgroud)

很明显,有一些方法可以通过密钥来加速这个过程。有一个更好的方法吗?我尝试使用 data.table,但它仍然比手动技术慢得多。

zDT <- z %>% data.table::as.data.table()
zDT[, x := if_else(category == 1 & lead(category) == 2, "pie", "monkey"), by = "k1"]
Run Code Online (Sandbox Code Playgroud)

对于以自然、快速的方式进行此操作有什么建议吗?

jbl*_*d94 5

进行这些分组比较的成本相对较高。如果可能的话,最好对整个表进行矢量化。请注意,ifelse比 更快,if_else并且data.tablesshift比 更快lead

library(data.table)
library(dplyr)

z <- setorder(data.table(k1 = rep(seq(1, 600000, 1), 5),
                         category = sample.int(2, 3000000, replace = TRUE)))
t1 <- copy(z)
t2 <- copy(z)
t3 <- copy(z)
t4 <- copy(z)
t5 <- copy(z)
microbenchmark::microbenchmark(
  if_else = t1[, x := if_else(category == 1L & lead(category) == 2L, "pie", "monkey"), k1],
  ifelse = t2[, x := ifelse(category == 1L & lead(category) == 2L, "pie", "monkey"), k1],
  shift = t3[, x := ifelse(category == 1L & shift(category, -1) == 2L, "pie", "monkey"), k1],
  ifelse3 = t4[, x := ifelse(category == 1L, ifelse(k1 == shift(k1, -1), ifelse(shift(category, -1) == 2L, "pie", "monkey"), NA_character_), "monkey")],
  logic = t5[, x := c("monkey", NA_character_, "monkey", "pie")[((k1 == shift(k1, -1, 0L))*((shift(category, -1, 0L) == 2) + 1L) + 1L)*(category == 1) + 1L]],
  times = 1,
  check = "identical"
)
#> Unit: milliseconds
#>     expr        min         lq       mean     median         uq        max neval
#>  if_else 25162.7484 25162.7484 25162.7484 25162.7484 25162.7484 25162.7484     1
#>   ifelse 18150.7634 18150.7634 18150.7634 18150.7634 18150.7634 18150.7634     1
#>    shift  9057.7585  9057.7585  9057.7585  9057.7585  9057.7585  9057.7585     1
#>  ifelse3  1544.2912  1544.2912  1544.2912  1544.2912  1544.2912  1544.2912     1
#>    logic    81.9844    81.9844    81.9844    81.9844    81.9844    81.9844     1
Run Code Online (Sandbox Code Playgroud)

逻辑的复杂性主要是由于NA行为的复杂性。如果monkey可以代替NA,t5则可以是:

t5[, x := c("monkey", "pie")[((k1 == shift(k1, -1, 0L))*(shift(category, -1, 0L) == 2)*(k1 == shift(k1, -1, 0L))) + 1L]]
Run Code Online (Sandbox Code Playgroud)


akr*_*run 3

我们可以加快速度,而无需使用ifelse

library(data.table)
> system.time(setDT(z)[, x := c("monkey", "pie")[ 
     1 + (category == 1 & shift(category, type = "lead") %in% 2)], by = k1])
   user  system elapsed 
 18.203   0.146  16.635 
> system.time({t1 <- z %>% 
   group_by(k1) %>%
   mutate(x = if_else(category == 1 & lead(category) == 2, "pie", "monkey")) %>% 
   ungroup()
 })
   user  system elapsed 
 37.319   0.321  37.523 
Run Code Online (Sandbox Code Playgroud)