tmf*_*mnk 15 regex performance filtering r dataframe
假设数据集每个 ID 包含多行,多列包含一些存储为字符串的代码:
df <- data.frame(id = rep(1:3, each = 2),
var1 = c("X1", "Y1", "Y2", "Y3", "Z1", "Z2"),
var2 = c("Y1", "X2", "Y2", "Y3", "Z1", "Z2"),
var3 = c("Y1", "Y2", "X1", "Y3", "Z1", "Z2"),
stringsAsFactors = FALSE)
id var1 var2 var3
1 1 X1 Y1 Y1
2 1 Y1 X2 Y2
3 2 Y2 Y2 X1
4 2 Y3 Y3 Y3
5 3 Z1 Z1 Z1
6 3 Z2 Z2 Z2
Run Code Online (Sandbox Code Playgroud)
现在,假设我想过滤掉X
在任何相关列中具有特定代码(此处)的所有 ID 。使用dplyr
and purrr
,我可以这样做:
df %>%
group_by(id) %>%
filter(all(reduce(.x = across(var1:var3, ~ !grepl("^X", .)), .f = `&`)))
id var1 var2 var3
<int> <chr> <chr> <chr>
1 3 Z1 Z1 Z1
2 3 Z2 Z2 Z2
Run Code Online (Sandbox Code Playgroud)
它工作正常,紧凑且易于理解,但是,对于大型数据集(数百万个 ID 和数千万个观察值),效率相当低。我欢迎任何使用任何库的计算更高效代码的想法。
Tho*_*ing 16
group_by
indplyr
或by =
in data.table
,因为这会降低你的整体性能X
,则substr
可能比grepl
使用模式更有效^X
根据@Waldi 的最快方法,我们似乎可以 通过以下方法进一步加快速度
TIC1 <- function() {
subset(df, ave(rowSums(substr(as.matrix(df[, -1]), 1, 1) == "X") == 0, id, FUN = all))
}
Run Code Online (Sandbox Code Playgroud)
或者
TIC2 <- function() {
subset(df, !id %in% id[rowSums(substr(as.matrix(df[, -1]), 1, 1) == "X") > 0])
}
Run Code Online (Sandbox Code Playgroud)
或者
TIC3 <- function() {
subset(df, !id %in% id[do.call(pmax, lapply(df[-1], function(v) substr(v, 1, 1) == "X")) > 0])
}
Run Code Online (Sandbox Code Playgroud)
与@Waldi和@EnricoSchumann 的回答相比:
microbenchmark(
TIC1(),
TIC2(),
TIC3(),
fun1(),
fun2(),
waldi_speed(),
unit = "relative"
)
Unit: relative
expr min lq mean median uq max
TIC1() 3.385215 3.451424 3.488670 3.569668 3.684895 3.618991
TIC2() 1.062116 1.084568 1.074789 1.090400 1.114443 1.027673
TIC3() 1.077660 2.208734 2.185960 2.214180 2.293366 2.141994
fun1() 1.166342 1.155096 1.169574 1.153223 1.207932 1.405530
fun2() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
waldi_speed() 26.218953 26.560429 26.373054 26.952997 27.396017 26.333575
neval
100
100
100
100
100
100
Run Code Online (Sandbox Code Playgroud)
给予
n <- 5e4
df <- data.frame(
id = rep(1:(n / 2), each = 2, length.out = n),
var1 = mapply(paste0, LETTERS[23 + sample(1:3, n, replace = T)], sample(1:3, n, replace = T)),
var2 = mapply(paste0, LETTERS[23 + sample(1:3, n, replace = T)], sample(1:3, n, replace = T)),
var3 = mapply(paste0, LETTERS[23 + sample(1:3, n, replace = T)], sample(1:3, n, replace = T)),
stringsAsFactors = FALSE
)
TIC1 <- function() {
subset(df, ave(rowSums(substr(as.matrix(df[, -1]), 1, 1) == "X") == 0, id, FUN = all))
}
TIC2 <- function() {
subset(df, !id %in% id[rowSums(substr(as.matrix(df[, -1]), 1, 1) == "X") > 0])
}
TIC3 <- function() {
subset(df, !id %in% id[do.call(pmax, lapply(df[-1], function(v) substr(v, 1, 1) == "X")) > 0])
}
waldi_speed <- function() {
setDT(df)
df[df[, .(keep = .I[!any(grepl("X", .SD))]), by = id, .SDcols = patterns("var")]$keep]
}
repeated_or <- function(...) {
L <- list(...)
ans <- L[[1L]]
if (...length() > 1L) {
for (i in seq.int(2L, ...length())) {
ans <- ans | L[[i]]
}
}
ans
}
fun1 <- function() {
## using a pattern
m <- lapply(df[, -1], grepl, pattern = "^X", perl = TRUE)
df[!df$id %in% df$id[do.call(repeated_or, m)], ]
}
fun2 <- function() {
## using a fixed string
m <- lapply(df[, -1], function(x) substr(x, 1, 1) == "X")
df[!df$id %in% df$id[do.call(repeated_or, m)], ]
}
Run Code Online (Sandbox Code Playgroud)
这是一种替代tidyverse
方法。
my_fun <- function(.data) {
.data %>%
group_by(id) %>%
filter(!grepl("X", paste(var1, var2, var3, collapse = ""))) %>%
ungroup()
}
my_fun(df)
# # A tibble: 2 x 4
# id var1 var2 var3
# <int> <chr> <chr> <chr>
# 1 3 Z1 Z1 Z1
# 2 3 Z2 Z2 Z2
df_fun <- function(.data) {
.data %>%
group_by(id) %>%
filter(all(reduce(.x = across(var1:var3, ~ !grepl("^X", .)), .f = `&`))) %>%
ungroup()
}
performance <- bench::mark(
my_fun(df),
df_fun(df)
)
performance %>% select(1:4)
# # A tibble: 2 x 4
# expression min median `itr/sec`
# <bch:expr> <bch:tm> <bch:tm> <dbl>
# 1 my_fun(df) 2.6ms 2.7ms 364.
# 2 df_fun(df) 6.01ms 6.39ms 152.
Run Code Online (Sandbox Code Playgroud)
另外两个data.table
解决方案:
library(data.table)
setDT(df)
df[,.SD[!any(grepl("X", .SD))],by=id,.SDcols=patterns('var')]
id var1 var2 var3
1: 3 Z1 Z1 Z1
2: 3 Z2 Z2 Z2
Run Code Online (Sandbox Code Playgroud)
可以以降低可读性为代价来提高速度:
df[df[, .(keep=.I[!any(grepl("X", .SD))]), by=id,.SDcols=patterns('var')]$keep]
Run Code Online (Sandbox Code Playgroud)
基准测试:
n <- 1e4
df <- data.frame(id = rep(1:(n/2), each = 2,length.out=n),
var1 = mapply(paste0,LETTERS[23+sample(1:3,n,replace=T)],sample(1:3,n,replace=T)),
var2 = mapply(paste0,LETTERS[23+sample(1:3,n,replace=T)],sample(1:3,n,replace=T)),
var3 = mapply(paste0,LETTERS[23+sample(1:3,n,replace=T)],sample(1:3,n,replace=T)),
stringsAsFactors = FALSE)
Unit: milliseconds
expr min lq mean median uq max neval
ref() 2131.5304 2285.54535 2401.612346 2367.8145 2480.10490 3294.9647 100
TeamTeaFan() 1760.1280 1918.29075 1986.489995 1967.7518 2029.02090 2858.8118 100
ronak() 289.1461 306.06050 324.418149 314.4888 333.44100 468.1077 100
anil() 230.5183 244.04175 259.687656 255.4336 267.69550 370.5758 100
waldi() 226.5081 238.23055 256.824345 251.8372 267.23395 384.6071 100
waldi_speed() 41.0354 45.12365 51.428189 48.6736 55.20530 155.4654 100
zaw() 25.9210 28.96225 33.508240 31.2333 37.77565 49.5777 100
TIC() 3.9299 4.51920 5.295555 4.8717 5.43565 14.7225 100
Run Code Online (Sandbox Code Playgroud)
另一个基本的 R 解决方案,使用 ThomasIsCoding 提供的代码示例。首先,定义一个辅助函数:
repeated_or <- function(...) {
L <- list(...)
ans <- L[[1L]]
if (...length() > 1L)
for (i in seq.int(2L, ...length()))
ans <- ans | L[[i]]
ans
}
Run Code Online (Sandbox Code Playgroud)
它将采用任意数量的逻辑向量x1
, x2
, x3
, ... 并产生x1 | x2 | x3 ...
等等。
实际工作由以下函数完成,有两种变体。该函数假定要搜索除第一列之外的所有列。
fun1 <- function() {
## using a pattern
m <- lapply(df[, -1], grepl, pattern = "^X", perl = TRUE)
df[!df$id %in% df$id[do.call(repeated_or, m)], ]
}
fun2 <- function() {
## using a fixed string
m <- lapply(df[, -1], function(x) substr(x, 1,1) == "X")
df[!df$id %in% df$id[do.call(repeated_or, m)], ]
}
Run Code Online (Sandbox Code Playgroud)
现在,使用 ThomasIsCoding 提供的代码:
n <- 1e4
df <- data.frame(
id = rep(1:(n / 2), each = 2, length.out = n),
var1 = mapply(paste0, LETTERS[23 + sample(1:3, n, replace = T)], sample(1:3, n, replace = T)),
var2 = mapply(paste0, LETTERS[23 + sample(1:3, n, replace = T)], sample(1:3, n, replace = T)),
var3 = mapply(paste0, LETTERS[23 + sample(1:3, n, replace = T)], sample(1:3, n, replace = T)),
stringsAsFactors = FALSE
)
library("microbenchmark")
microbenchmark(
fun1(),
fun2(),
TIC1(),
TIC2(),
waldi_speed(),
unit = "relative"
)
## Unit: relative
## expr min lq mean median uq max neval
## fun1() 1.180372 1.183109 1.205269 1.189091 1.187704 1.163667 100
## fun2() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 100
## TIC1() 3.487775 3.462417 3.549228 3.491580 3.494310 2.857216 100
## TIC2() 1.140145 1.131872 1.141466 1.146900 1.142863 1.078746 100
## waldi_speed() 31.440025 30.845971 30.556054 30.798701 30.338251 26.213920 100
Run Code Online (Sandbox Code Playgroud)
专用功能:如果找到特定代码,您可能会进行许多操作。使用该类型的专用函数可能比通用函数更快。将比.startsWith(x, "X")
grepl("^X", x)
子集:如果查找特定代码的函数很慢(操作比子集慢),请仅对尚未找到代码的行中的其余列进行此操作。
哈希查找:您需要比较所有没有直接命中的剩余 id,如果任何具有相同的行有id
命中。所以在列表中查找,持有命中的 id,应该很快。此查找可能是使用快速的哈希表像fastmatch::fmatch
。
存储类型:如果 a 的列data.frame
都具有相同的 type,当它存储在 amatrix
而不是 a时,对其的操作可能会更快list
。
避免重新排列数据:尽量按原样使用数据。避免像split或group这样会重新排列当前数据的操作。
你可以unlist
df[-1]
,如果它的测试startsWith
X
,创建一个matrix
具有nrow
的df
,走rowSums
,在情况下,它是>0
在id
具有一击。我将它们存储id
在i
. 可选的unique
id's
可以计算。现在测试是否id
是%in%
i
并使用!
. 一个可能更快的替代方法%in%
是%fin%
from fastmatch
。
i <- df$id[unlist(df[-1], FALSE, FALSE) |>
startsWith("X") |>
matrix(nrow(df)) |>
rowSums() > 0]
#i <- unique(i) #Optional
#i <- kit::funique(i) #Optional faster unique
df[!df$id %in% i,]
# id var1 var2 var3
#5 3 Z1 Z1 Z1
#6 3 Z2 Z2 Z2
library(fastmatch)
df[!df$id %fin% i,]
Run Code Online (Sandbox Code Playgroud)
另一种方式来以前使用i
过lappyl
,并使用|
在Reduce
或情况下Reduce
被缓慢或许改用eval
以str2lang
和paste
:
i <- lapply(df[,-1], startsWith, "X")
i <- df$id[Reduce(`|`, i)]
#i <- eval(str2lang(paste0("i[[", seq_along(i), "]]", collapse = "|"))) #Alternative to Reduce
df[!df$id %in% i,]
Run Code Online (Sandbox Code Playgroud)
也有可能测试它是否有开始X
仅在没有命中已经和使用的情况下%in%
只为没有打正着的那些行X
,如果它开始,当子集是将意义不是测试更快X
,如果子集比寻找匹配更快。
i <- Reduce(function(x, y) `[<-`(x,!x,startsWith(y[!x], "X")),
df[,-1], logical(nrow(df)))
i[!i] <- df$id[!i] %in% df$id[i]
df[!i,]
Run Code Online (Sandbox Code Playgroud)
基于@Waldi 的基准测试以及TIC2()
来自@thomasiscoding 和fun2()
@enrico-schumann 的方法:
getDf <- function(nr, nc) { #function to creat example dataset
data.frame(id = sample(seq_len(nr/5), nr, TRUE),
lapply(setNames(seq_len(nc), paste0("var", seq_len(nc))),
function(i) paste0(sample(LETTERS, nr, TRUE), sample(0:9, nr, TRUE))))
}
library(fastmatch)
FGKi1 <- function() {
df[!df$id %in% df$id[rowSums(matrix(startsWith(unlist(df[-1], FALSE, FALSE),
"X"), nrow(df))) > 0],]}
FGKi2 <- function() {
df[!df$id %in% unique(df$id[rowSums(matrix(startsWith(unlist(df[-1],
FALSE, FALSE), "X"), nrow(df))) > 0]),]}
FGKi3 <- function() {
df[!df$id %fin% df$id[rowSums(matrix(startsWith(unlist(df[-1], FALSE, FALSE),
"X"), nrow(df))) > 0],]}
FGKi4 <- function() {
df[!df$id %in% df$id[Reduce(`|`, lapply(df[, -1], startsWith, "X"))],]
}
FGKi5 <- function() {
df[!df$id %fin% df$id[Reduce(`|`, lapply(df[, -1], startsWith, "X"))],]
}
FGKi6 <- function() {
i <- Reduce(`|`, lapply(df[, -1], startsWith, "X"))
i[!i] <- df$id[!i] %in% df$id[i]
df[!i,]
}
FGKi7 <- function() {
i <- lapply(df[, -1], startsWith, "X")
i <- eval(str2lang(paste0("i[[", seq_along(i), "]]", collapse = "|")))
df[!df$id %fin% df$id[i],]
}
repeated_or <- function(...) {
L <- list(...)
ans <- L[[1L]]
if (...length() > 1L)
for (i in seq.int(2L, ...length()))
ans <- ans | L[[i]]
ans
}
fun2 <- function() {
## using a fixed string
m <- lapply(df[, -1], function(x) substr(x, 1,1) == "X")
df[!df$id %in% df$id[do.call(repeated_or, m)], ]
}
TIC2 <- function() {
subset(df, !id %in% id[rowSums(substr(as.matrix(df[, -1]), 1, 1) == "X") > 0])
}
Run Code Online (Sandbox Code Playgroud)
set.seed(42)
df <- getDf(1e5, 3) #3 col wide Table
bench::mark(TIC2(), fun2(), FGKi1(), FGKi2(), FGKi3(), FGKi4(),
FGKi5(), FGKi6(), FGKi7())
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
# <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
#1 TIC2() 24.7ms 24.9ms 40.2 15.07MB 112. 5 14 125ms
#2 fun2() 22.3ms 22.5ms 43.9 11.26MB 39.9 11 10 251ms
#3 FGKi1() 14.6ms 15ms 66.8 12.78MB 58.9 17 15 255ms
#4 FGKi2() 14.9ms 15.1ms 66.3 12.97MB 58.5 17 15 256ms
#5 FGKi3() 12.1ms 12.3ms 80.8 12.23MB 72.3 19 17 235ms
#6 FGKi4() 12.7ms 12.9ms 77.7 8.97MB 27.7 28 10 360ms
#7 FGKi5() 10.2ms 10.3ms 96.4 8.42MB 51.4 30 16 311ms
#8 FGKi6() 13.2ms 13.3ms 75.1 11.38MB 53.6 21 15 280ms
#9 FGKi7() 10.3ms 10.4ms 95.2 8.42MB 36.8 31 12 326ms
set.seed(42)
df <- getDf(1e4, 1e3) #1000 col wide Table
bench::mark(TIC2(), fun2(), FGKi1(), FGKi2(), FGKi3(), FGKi4(),
FGKi5(), FGKi6(), FGKi7())
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
# <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
#1 TIC2() 430.4ms 434.4ms 2.30 230MB 3.45 2 3 869ms
#2 fun2() 374.6ms 405.6ms 2.47 191MB 6.16 2 5 811ms
#3 FGKi1() 110.8ms 117.7ms 7.87 191MB 13.8 4 7 509ms
#4 FGKi2() 108.9ms 111.1ms 8.32 191MB 11.7 5 7 601ms
#5 FGKi3() 107.8ms 107.8ms 9.25 191MB 9.25 5 5 541ms
#6 FGKi4() 52.5ms 54.6ms 16.6 115MB 14.7 9 8 543ms
#7 FGKi5() 52.5ms 54.7ms 18.3 115MB 18.3 10 10 547ms
#8 FGKi6() 52.8ms 55.2ms 18.1 115MB 16.3 10 9 553ms
#9 FGKi7() 53.7ms 56.6ms 17.6 115MB 17.6 9 9 510ms
#Warning message:
#Some expressions had a GC in every iteration; so filtering is disabled.
Run Code Online (Sandbox Code Playgroud)
归档时间: |
|
查看次数: |
558 次 |
最近记录: |