按组高效过滤多列

tmf*_*mnk 15 regex performance filtering r dataframe

假设数据集每个 ID 包含多行,多列包含一些存储为字符串的代码:

df <- data.frame(id = rep(1:3, each = 2),
                 var1 = c("X1", "Y1", "Y2", "Y3", "Z1", "Z2"),
                 var2 = c("Y1", "X2", "Y2", "Y3", "Z1", "Z2"),
                 var3 = c("Y1", "Y2", "X1", "Y3", "Z1", "Z2"),
                 stringsAsFactors = FALSE)

  id var1 var2 var3
1  1   X1   Y1   Y1
2  1   Y1   X2   Y2
3  2   Y2   Y2   X1
4  2   Y3   Y3   Y3
5  3   Z1   Z1   Z1
6  3   Z2   Z2   Z2
Run Code Online (Sandbox Code Playgroud)

现在,假设我想过滤掉X在任何相关列中具有特定代码(此处)的所有 ID 。使用dplyrand purrr,我可以这样做:

df %>%
 group_by(id) %>%
 filter(all(reduce(.x = across(var1:var3, ~ !grepl("^X", .)), .f = `&`)))

     id var1  var2  var3 
  <int> <chr> <chr> <chr>
1     3 Z1    Z1    Z1   
2     3 Z2    Z2    Z2 
Run Code Online (Sandbox Code Playgroud)

它工作正常,紧凑且易于理解,但是,对于大型数据集(数百万个 ID 和数千万个观察值),效率相当低。我欢迎任何使用任何库的计算更高效代码的想法。

Tho*_*ing 16

一些可能的速度点

  • 尽量不要使用 group by 之类的东西,即group_byindplyrby = in data.table,因为这会降低你的整体性能
  • 如果您有固定的目标模式,例如,以 开头X,则substr可能比grepl使用模式更有效^X

一些基本的 R 方法

根据@Waldi 的最快方法,我们似乎可以 通过以下方法进一步加快速度

TIC1 <- function() {
    subset(df, ave(rowSums(substr(as.matrix(df[, -1]), 1, 1) == "X") == 0, id, FUN = all))
}
Run Code Online (Sandbox Code Playgroud)

或者

TIC2 <- function() {
    subset(df, !id %in% id[rowSums(substr(as.matrix(df[, -1]), 1, 1) == "X") > 0])
}
Run Code Online (Sandbox Code Playgroud)

或者

TIC3 <- function() {
    subset(df, !id %in% id[do.call(pmax, lapply(df[-1], function(v) substr(v, 1, 1) == "X")) > 0])
}
Run Code Online (Sandbox Code Playgroud)

基准测试

@Waldi@EnricoSchumann 的回答相比:

microbenchmark(
    TIC1(),
    TIC2(),
    TIC3(),
    fun1(),
    fun2(),
    waldi_speed(),
    unit = "relative"
)

Unit: relative
          expr       min        lq      mean    median        uq       max
        TIC1()  3.385215  3.451424  3.488670  3.569668  3.684895  3.618991
        TIC2()  1.062116  1.084568  1.074789  1.090400  1.114443  1.027673
        TIC3()  1.077660  2.208734  2.185960  2.214180  2.293366  2.141994
        fun1()  1.166342  1.155096  1.169574  1.153223  1.207932  1.405530
        fun2()  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000
 waldi_speed() 26.218953 26.560429 26.373054 26.952997 27.396017 26.333575
 neval
   100
   100
   100
   100
   100
   100
Run Code Online (Sandbox Code Playgroud)

给予

n <- 5e4
df <- data.frame(
    id = rep(1:(n / 2), each = 2, length.out = n),
    var1 = mapply(paste0, LETTERS[23 + sample(1:3, n, replace = T)], sample(1:3, n, replace = T)),
    var2 = mapply(paste0, LETTERS[23 + sample(1:3, n, replace = T)], sample(1:3, n, replace = T)),
    var3 = mapply(paste0, LETTERS[23 + sample(1:3, n, replace = T)], sample(1:3, n, replace = T)),
    stringsAsFactors = FALSE
)

TIC1 <- function() {
    subset(df, ave(rowSums(substr(as.matrix(df[, -1]), 1, 1) == "X") == 0, id, FUN = all))
}

TIC2 <- function() {
    subset(df, !id %in% id[rowSums(substr(as.matrix(df[, -1]), 1, 1) == "X") > 0])
}

TIC3 <- function() {
    subset(df, !id %in% id[do.call(pmax, lapply(df[-1], function(v) substr(v, 1, 1) == "X")) > 0])
}


waldi_speed <- function() {
    setDT(df)
    df[df[, .(keep = .I[!any(grepl("X", .SD))]), by = id, .SDcols = patterns("var")]$keep]
}


repeated_or <- function(...) {
    L <- list(...)
    ans <- L[[1L]]
    if (...length() > 1L) {
          for (i in seq.int(2L, ...length())) {
                ans <- ans | L[[i]]
            }
      }
    ans
}

fun1 <- function() {
    ## using a pattern
    m <- lapply(df[, -1], grepl, pattern = "^X", perl = TRUE)
    df[!df$id %in% df$id[do.call(repeated_or, m)], ]
}

fun2 <- function() {
    ## using a fixed string
    m <- lapply(df[, -1], function(x) substr(x, 1, 1) == "X")
    df[!df$id %in% df$id[do.call(repeated_or, m)], ]
}
Run Code Online (Sandbox Code Playgroud)


Zaw*_*Zaw 6

这是一种替代tidyverse方法。

my_fun <- function(.data) {
  .data %>% 
    group_by(id) %>% 
    filter(!grepl("X", paste(var1, var2, var3, collapse = ""))) %>% 
    ungroup()
}

my_fun(df)

# # A tibble: 2 x 4
#      id var1  var2  var3 
#   <int> <chr> <chr> <chr>
# 1     3 Z1    Z1    Z1   
# 2     3 Z2    Z2    Z2   

df_fun <- function(.data) {
  .data %>%
    group_by(id) %>%
    filter(all(reduce(.x = across(var1:var3, ~ !grepl("^X", .)), .f = `&`))) %>% 
    ungroup()
}

performance <- bench::mark(
  my_fun(df),
  df_fun(df)
)

performance %>% select(1:4)

# # A tibble: 2 x 4
#   expression       min   median `itr/sec`
#   <bch:expr>  <bch:tm> <bch:tm>     <dbl>
# 1 my_fun(df)    2.6ms    2.7ms      364.
# 2 df_fun(df)    6.01ms   6.39ms      152.
Run Code Online (Sandbox Code Playgroud)

  • 我将赏金授予@ThomasIsCoding,因为他的解决方案是最有效的,但我接受了你的答案,因为它完全适合我的流程,并且对于我当前的工作来说,它相当高效。 (2认同)

Wal*_*ldi 6

另外两个data.table解决方案:

library(data.table)
setDT(df)
df[,.SD[!any(grepl("X", .SD))],by=id,.SDcols=patterns('var')]

   id var1 var2 var3
1:  3   Z1   Z1   Z1
2:  3   Z2   Z2   Z2
Run Code Online (Sandbox Code Playgroud)

可以以降低可读性为代价来提高速度

df[df[, .(keep=.I[!any(grepl("X", .SD))]), by=id,.SDcols=patterns('var')]$keep]
Run Code Online (Sandbox Code Playgroud)

基准测试:

n <- 1e4
df <- data.frame(id = rep(1:(n/2), each = 2,length.out=n),
                 var1 = mapply(paste0,LETTERS[23+sample(1:3,n,replace=T)],sample(1:3,n,replace=T)),
                 var2 = mapply(paste0,LETTERS[23+sample(1:3,n,replace=T)],sample(1:3,n,replace=T)),
                 var3 = mapply(paste0,LETTERS[23+sample(1:3,n,replace=T)],sample(1:3,n,replace=T)),
                 stringsAsFactors = FALSE) 


Unit: milliseconds
          expr       min         lq        mean    median         uq       max neval
         ref() 2131.5304 2285.54535 2401.612346 2367.8145 2480.10490 3294.9647   100
  TeamTeaFan() 1760.1280 1918.29075 1986.489995 1967.7518 2029.02090 2858.8118   100
       ronak()  289.1461  306.06050  324.418149  314.4888  333.44100  468.1077   100
        anil()  230.5183  244.04175  259.687656  255.4336  267.69550  370.5758   100
       waldi()  226.5081  238.23055  256.824345  251.8372  267.23395  384.6071   100
 waldi_speed()   41.0354   45.12365   51.428189   48.6736   55.20530  155.4654   100
         zaw()   25.9210   28.96225   33.508240   31.2333   37.77565   49.5777   100
         TIC()    3.9299    4.51920    5.295555    4.8717    5.43565   14.7225   100
Run Code Online (Sandbox Code Playgroud)

  • 出于好奇,如果您可以将我的方法“if_all”包含在基准测试中,那就太好了。看到 @Zaw 的基本 R 方法优于 {data.table} 真的很惊讶! (2认同)

Enr*_*ann 5

另一个基本的 R 解决方案,使用 ThomasIsCoding 提供的代码示例。首先,定义一个辅助函数:

repeated_or <- function(...) {
    L <- list(...)
    ans <- L[[1L]]
    if (...length() > 1L)
        for (i in seq.int(2L, ...length()))
            ans <- ans | L[[i]]
    ans
}
Run Code Online (Sandbox Code Playgroud)

它将采用任意数量的逻辑向量x1, x2, x3, ... 并产生x1 | x2 | x3 ...等等。

实际工作由以下函数完成,有两种变体。该函数假定要搜索除第一列之外的所有列。

fun1 <- function() {
    ## using a pattern
    m <- lapply(df[, -1], grepl, pattern = "^X", perl = TRUE)
    df[!df$id %in% df$id[do.call(repeated_or, m)], ]
}

fun2 <- function() {
    ## using a fixed string
    m <- lapply(df[, -1], function(x) substr(x, 1,1) == "X")
    df[!df$id %in% df$id[do.call(repeated_or, m)], ]
}
Run Code Online (Sandbox Code Playgroud)

现在,使用 ThomasIsCoding 提供的代码:

n <- 1e4
df <- data.frame(
    id = rep(1:(n / 2), each = 2, length.out = n),
    var1 = mapply(paste0, LETTERS[23 + sample(1:3, n, replace = T)], sample(1:3, n, replace = T)),
    var2 = mapply(paste0, LETTERS[23 + sample(1:3, n, replace = T)], sample(1:3, n, replace = T)),
    var3 = mapply(paste0, LETTERS[23 + sample(1:3, n, replace = T)], sample(1:3, n, replace = T)),
    stringsAsFactors = FALSE
)

library("microbenchmark")
microbenchmark(
    fun1(),
    fun2(),
    TIC1(),
    TIC2(),
    waldi_speed(),
    unit = "relative"
)
## Unit: relative
##           expr       min        lq      mean    median        uq       max neval
##         fun1()  1.180372  1.183109  1.205269  1.189091  1.187704  1.163667   100
##         fun2()  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000   100
##         TIC1()  3.487775  3.462417  3.549228  3.491580  3.494310  2.857216   100
##         TIC2()  1.140145  1.131872  1.141466  1.146900  1.142863  1.078746   100
##  waldi_speed() 31.440025 30.845971 30.556054 30.798701 30.338251 26.213920   100
Run Code Online (Sandbox Code Playgroud)


GKi*_*GKi 5

  • 专用功能:如果找到特定代码,您可能会进行许多操作。使用该类型的专用函数可能比通用函数更快。将比.startsWith(x, "X")grepl("^X", x)

  • 子集:如果查找特定代码的函数很慢(操作比子集慢),请仅对尚未找到代码的行中的其余列进行此操作。

  • 哈希查找:您需要比较所有没有直接命中的剩余 id,如果任何具有相同的行有id命中。所以在列表中查找,持有命中的 id,应该很快。此查找可能是使用快速的哈希表fastmatch::fmatch

  • 存储类型:如果 a 的列data.frame都具有相同的 type,当它存储在 amatrix而不是 a时,对其的操作可能会更快list

  • 避免重新排列数据:尽量按原样使用数据。避免像splitgroup这样会重新排列当前数据的操作。


你可以unlist df[-1],如果它的测试startsWith X,创建一个matrix具有nrowdf,走rowSums,在情况下,它是>0id具有一击。我将它们存储idi. 可选的unique id's可以计算。现在测试是否id%in% i并使用!. 一个可能更快的替代方法%in%%fin%from fastmatch

i <- df$id[unlist(df[-1], FALSE, FALSE) |>
             startsWith("X") |>
             matrix(nrow(df)) |>
             rowSums() > 0]
#i <- unique(i)       #Optional
#i <- kit::funique(i) #Optional faster unique
df[!df$id %in% i,]
#  id var1 var2 var3
#5  3   Z1   Z1   Z1
#6  3   Z2   Z2   Z2

library(fastmatch)
df[!df$id %fin% i,]
Run Code Online (Sandbox Code Playgroud)

另一种方式来以前使用ilappyl,并使用|Reduce或情况下Reduce被缓慢或许改用evalstr2langpaste

i <- lapply(df[,-1], startsWith, "X")
i <- df$id[Reduce(`|`, i)]
#i <- eval(str2lang(paste0("i[[", seq_along(i), "]]", collapse = "|"))) #Alternative to Reduce
df[!df$id %in% i,]
Run Code Online (Sandbox Code Playgroud)

也有可能测试它是否有开始X仅在没有命中已经和使用的情况下%in%只为没有打正着的那些行X,如果它开始,当子集是将意义不是测试更快X,如果子集比寻找匹配更快。

i <- Reduce(function(x, y) `[<-`(x,!x,startsWith(y[!x], "X")),
       df[,-1], logical(nrow(df)))
i[!i] <- df$id[!i] %in% df$id[i]
df[!i,]
Run Code Online (Sandbox Code Playgroud)

基于@Waldi 的基准测试以及TIC2()来自@thomasiscoding 和fun2()@enrico-schumann 的方法:

 getDf <- function(nr, nc) { #function to creat example dataset
    data.frame(id = sample(seq_len(nr/5), nr, TRUE),
      lapply(setNames(seq_len(nc), paste0("var", seq_len(nc))),
        function(i) paste0(sample(LETTERS, nr, TRUE), sample(0:9, nr, TRUE))))
}

library(fastmatch)
FGKi1 <- function() {
  df[!df$id %in% df$id[rowSums(matrix(startsWith(unlist(df[-1], FALSE, FALSE),
                                                 "X"), nrow(df))) > 0],]}
FGKi2 <- function() {
  df[!df$id %in% unique(df$id[rowSums(matrix(startsWith(unlist(df[-1],
                                 FALSE, FALSE), "X"), nrow(df))) > 0]),]}
FGKi3 <- function() {
  df[!df$id %fin% df$id[rowSums(matrix(startsWith(unlist(df[-1], FALSE, FALSE),
                                                  "X"), nrow(df))) > 0],]}
FGKi4 <- function() {
  df[!df$id %in% df$id[Reduce(`|`, lapply(df[, -1], startsWith, "X"))],]
}
FGKi5 <- function() {
  df[!df$id %fin% df$id[Reduce(`|`, lapply(df[, -1], startsWith, "X"))],]
}
FGKi6 <- function() {
  i <- Reduce(`|`, lapply(df[, -1], startsWith, "X"))
  i[!i] <- df$id[!i] %in% df$id[i]
  df[!i,]
}
FGKi7 <- function() {
  i <- lapply(df[, -1], startsWith, "X")
  i <- eval(str2lang(paste0("i[[", seq_along(i), "]]", collapse = "|")))
  df[!df$id %fin% df$id[i],]
}
repeated_or <- function(...) {
    L <- list(...)
    ans <- L[[1L]]
    if (...length() > 1L)
        for (i in seq.int(2L, ...length()))
            ans <- ans | L[[i]]
    ans
}
fun2 <- function() {
    ## using a fixed string
    m <- lapply(df[, -1], function(x) substr(x, 1,1) == "X")
    df[!df$id %in% df$id[do.call(repeated_or, m)], ]
}
TIC2 <- function() {
    subset(df, !id %in% id[rowSums(substr(as.matrix(df[, -1]), 1, 1) == "X") > 0])
}
Run Code Online (Sandbox Code Playgroud)
set.seed(42)
df <- getDf(1e5, 3) #3 col wide Table
bench::mark(TIC2(), fun2(), FGKi1(), FGKi2(), FGKi3(), FGKi4(),
   FGKi5(), FGKi6(), FGKi7())
#  expression     min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
#  <bch:expr> <bch:t> <bch:t>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm>
#1 TIC2()      24.7ms  24.9ms      40.2   15.07MB    112.      5    14      125ms
#2 fun2()      22.3ms  22.5ms      43.9   11.26MB     39.9    11    10      251ms
#3 FGKi1()     14.6ms    15ms      66.8   12.78MB     58.9    17    15      255ms
#4 FGKi2()     14.9ms  15.1ms      66.3   12.97MB     58.5    17    15      256ms
#5 FGKi3()     12.1ms  12.3ms      80.8   12.23MB     72.3    19    17      235ms
#6 FGKi4()     12.7ms  12.9ms      77.7    8.97MB     27.7    28    10      360ms
#7 FGKi5()     10.2ms  10.3ms      96.4    8.42MB     51.4    30    16      311ms
#8 FGKi6()     13.2ms  13.3ms      75.1   11.38MB     53.6    21    15      280ms
#9 FGKi7()     10.3ms  10.4ms      95.2    8.42MB     36.8    31    12      326ms

set.seed(42)
df <- getDf(1e4, 1e3) #1000 col wide Table
bench::mark(TIC2(), fun2(), FGKi1(), FGKi2(), FGKi3(), FGKi4(),
   FGKi5(), FGKi6(), FGKi7())
#  expression     min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
#  <bch:expr> <bch:t> <bch:t>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm>
#1 TIC2()     430.4ms 434.4ms      2.30     230MB     3.45     2     3      869ms
#2 fun2()     374.6ms 405.6ms      2.47     191MB     6.16     2     5      811ms
#3 FGKi1()    110.8ms 117.7ms      7.87     191MB    13.8      4     7      509ms
#4 FGKi2()    108.9ms 111.1ms      8.32     191MB    11.7      5     7      601ms
#5 FGKi3()    107.8ms 107.8ms      9.25     191MB     9.25     5     5      541ms
#6 FGKi4()     52.5ms  54.6ms     16.6      115MB    14.7      9     8      543ms
#7 FGKi5()     52.5ms  54.7ms     18.3      115MB    18.3     10    10      547ms
#8 FGKi6()     52.8ms  55.2ms     18.1      115MB    16.3     10     9      553ms
#9 FGKi7()     53.7ms  56.6ms     17.6      115MB    17.6      9     9      510ms
#Warning message:
#Some expressions had a GC in every iteration; so filtering is disabled. 
Run Code Online (Sandbox Code Playgroud)