将所有逻辑规则与数据帧匹配(需要超快功能)

mr.*_*r.T 5 r rules function

我有一个函数可以检查数据帧中是否存在逻辑序列

fu <- function(dat , rule , res.only=T){
debug.vec <- rep("no",nrow(dat)) # control of rule triggers
rule.id <- 1 # rule number in vector
for(i in 1:nrow(dat)){
  # check if the rule "rule[rule.id]" has worked on this "i" index in dat[i,]
  current_rule <- with(data = dat[i,] , expr = eval(parse(text = rule[rule.id]))  )
   if(current_rule){  # if the rule is triggered
          debug.vec[i] <- rule[rule.id]
          if(  rule.id==length(rule)  ) break   # stop if there are no more rules
           rule.id <- rule.id+1  # go to the next rule
           }}  
if(!res.only)  return(  cbind(dat,debug.vec)  )  
return(  sum(debug.vec!="no")==length(rule)   )
}
Run Code Online (Sandbox Code Playgroud)

例如我有一些数据

set.seed(123)
dat <- as.data.frame(matrix(data = sample(10,30,replace = T),ncol = 3))
colnames(dat) <- paste0("x" ,1:ncol(dat))
Run Code Online (Sandbox Code Playgroud)

..

dat
   x1 x2 x3
1   3  5  9
2   3  3  3
3  10  9  4
4   2  9  1
5   6  9  7
6   5  3  5
7   4  8 10
8   6 10  7
9   9  7  9
10 10 10  9
Run Code Online (Sandbox Code Playgroud)

还有一个带有规则的向量

rule <- c("x1>5 & x2>2" , "x1>x2" , "x3!=4" )
Run Code Online (Sandbox Code Playgroud)

该函数检查数据帧中是否存在这样的逻辑序列并给出逻辑答案

> fu(dat = dat, rule = rule, res.only = T)
[1] TRUE
Run Code Online (Sandbox Code Playgroud)

或者您可以更改标志res.only = F并查看序列在debug.vec列中的位置

> fu(dat = dat, rule = rule, res.only = F)
   x1 x2 x3   debug.vec
1   3  5  9          no
2   3  3  3          no
3  10  9  4 x1>5 & x2>2
4   2  9  1          no
5   6  9  7          no
6   5  3  5       x1>x2
7   4  8 10       x3!=4
8   6 10  7          no
9   9  7  9          no
10 10 10  9          no
Run Code Online (Sandbox Code Playgroud)

我需要这个函数的最快版本,也许使用Rccp 包或类似的东西..

UPD=======================

Waldi功能与我的功能不同,出现问题

UPD_2_====================================

# Is this correct?
Run Code Online (Sandbox Code Playgroud)

是的,这是正确的,如果触发了规则 [k] 然后搜索规则 [k+1] 从新的一行数据开始

在此处输入图片说明 原谅我的问题不够准确,这是我的错

我的函数返回FALSE是因为最后一条规则"x3!=4"不起作用,应该是

dat <- structure(list(x1 = c(2L, 5L, 1L, 3L, 9L, 2L, 6L, 3L, 3L, 9L), 
                      x2 = c(2L, 1L, 6L, 10L, 8L, 10L, 10L, 4L, 6L, 4L), 
                      x3 = c(4L, 9L, 8L, 7L, 10L, 1L, 2L, 8L, 3L, 10L)),
                   class = "data.frame", row.names = c(NA, -10L))
dat
rule <- c("x1>5 & x2>2" , "x1>x2" , "x3!=4" )

my_fu(dat = dat, rule = rule, res.only = F)
Run Code Online (Sandbox Code Playgroud)

只有两条规则有效

> my_fu(dat = dat, rule = rule, res.only = F)
   x1 x2 x3   debug.vec
1   2  2  4          no
2   5  1  9          no
3   1  6  8          no
4   3 10  7          no
5   9  8 10 x1>5 & x2>2
6   2 10  1          no
7   6 10  2          no
8   3  4  8          no
9   3  6  3          no
10  9  4 10       x1>x2
Run Code Online (Sandbox Code Playgroud)

它应该是

> my_fu(dat = dat, rule = rule, res.only = T)
[1] FALSE
Run Code Online (Sandbox Code Playgroud)

Tho*_*ing 5

更新

根据您的更新,我编写了一个新fu函数,即TIC_fu()

TIC_fu <- function(dat, rule, res.only = TRUE) {
  m <- with(dat, lapply(rule, function(r) eval(str2expression(r))))
  idx <- na.omit(
    Reduce(
      function(x, y) {
        k <- which(y)
        ifelse(all(k <= x), NA, min(k[k > x]))
      }, m,
      init = 0, accumulate = TRUE
    )
  )[-1]
  if (!res.only) {
    fidx <- head(idx, length(rule))
    debug.vec <- replace(rep("no", nrow(dat)), fidx, rule[seq_along(fidx)])
    return(cbind(dat, debug.vec))
  }
  length(idx) >= length(rule)
}
Run Code Online (Sandbox Code Playgroud)

你会看到

> TIC_fu(dat, rule, FALSE)
   x1 x2 x3   debug.vec
1   2  2  4          no
2   5  1  9          no
3   1  6  8          no
4   3 10  7          no
5   9  8 10 x1>5 & x2>2
6   2 10  1          no
7   6 10  2          no
8   3  4  8          no
9   3  6  3          no
10  9  4 10       x1>x2

> TIC_fu(dat,rule)
[1] FALSE
Run Code Online (Sandbox Code Playgroud)

用于基准测试

> microbenchmark(
+   TIC_fu(dat, rule, FALSE),
+   fu(dat, rule, FALSE),
+   unit = "relative"
+ )
Unit: relative
                     expr      min       lq     mean   median     uq      max
 TIC_fu(dat, rule, FALSE) 1.000000 1.000000 1.000000 1.000000 1.0000 1.000000
     fu(dat, rule, FALSE) 4.639093 4.555523 3.383911 4.450056 4.3993 1.007532
 neval
   100
   100
Run Code Online (Sandbox Code Playgroud)

上一个答案

这里有一些类似于@Waldi所做的选项,但唯一的区别是parse,str2langstr2expression

microbenchmark::microbenchmark(
  any(with(dat, rowSums(sapply(rule, function(rule) eval(parse(text = rule))))==length(rule))),
  any(with(dat, rowSums(sapply(rule, function(rule) eval(str2lang(rule))))==length(rule))),
  any(with(dat, rowSums(sapply(rule, function(rule) eval(str2expression(rule))))==length(rule))),
  any(with(dat, eval(str2expression(paste0(rule,collapse = " & ")))))
)
Run Code Online (Sandbox Code Playgroud)

你会看到

Unit: microseconds
                                                                                                  expr
   any(with(dat, rowSums(sapply(rule, function(rule) eval(parse(text = rule)))) ==      length(rule)))
       any(with(dat, rowSums(sapply(rule, function(rule) eval(str2lang(rule)))) ==      length(rule)))
 any(with(dat, rowSums(sapply(rule, function(rule) eval(str2expression(rule)))) ==      length(rule)))
                                  any(with(dat, eval(str2expression(paste0(rule, collapse = " & ")))))
  min   lq    mean median     uq   max neval
 94.0 98.6 131.431 107.35 121.90 632.7   100
 37.5 39.2  48.887  44.05  48.50 174.1   100
 36.8 39.6  51.627  46.20  48.45 241.4   100
 12.7 15.8  19.786  17.00  19.75  97.9   100
Run Code Online (Sandbox Code Playgroud)