更快地捕获正则表达式

Min*_*per 3 regex performance r apply

我想使用正则表达式来捕获子串 - 我已经有了一个可行的解决方案,但我想知道是否有更快的解决方案.我正在申请applyCaptureRegex一个有大约400.000个条目的向量.

 exampleData <- as.data.frame(c("[hg19:21:34809787-34809808:+]","[hg19:11:105851118-105851139:+]","[hg19:17:7482245-7482266:+]","[hg19:6:19839915-19839936:+]"))

captureRegex <- function(captRegEx,str){
  sapply(regmatches(str,gregexpr(captRegEx,str))[[1]], function(m) regmatches(m,regexec(captRegEx,m)))
}

applyCaptureRegex <- function(mir,r){
  mir <- unlist(apply(mir, 1, function(x) captureRegex(r,x[1])))
  mir <- matrix(mir ,ncol=5, byrow = TRUE)
  mir
}
Run Code Online (Sandbox Code Playgroud)

用法和结果:

> captureRegex("\\[[a-z0-9]+:([0-9]+):([0-9]+)-([0-9]+):([-+])\\]","[hg19:12:125627828-125627847:-]")
$`[hg19:12:125627828-125627847:-]`
[1] "[hg19:12:125627828-125627847:-]" "12" "125627828" "125627847" "-"   

> applyCaptureRegex(exampleData,"\\[[a-z0-9]+:([0-9]+):([0-9]+)-([0-9]+):([-+])\\]")
     [,1]                              [,2] [,3]        [,4]        [,5]
[1,] "[hg19:21:34809787-34809808:+]"   "21" "34809787"  "34809808"  "+" 
[2,] "[hg19:11:105851118-105851139:+]" "11" "105851118" "105851139" "+" 
[3,] "[hg19:17:7482245-7482266:+]"     "17" "7482245"   "7482266"   "+" 
[4,] "[hg19:6:19839915-19839936:+]"    "6"  "19839915"  "19839936"  "+" 
Run Code Online (Sandbox Code Playgroud)

谢谢!

hwn*_*wnd 8

为什么重新发明轮子?您有多个库包可供选择,其中的函数返回一个字符矩阵,其中一个列用于模式中的每个捕获组.

stri_match_all_regex - stringi

x <- c('[hg19:21:34809787-34809808:+]', '[hg19:11:105851118-105851139:+]', '[hg19:17:7482245-7482266:+]', '[hg19:6:19839915-19839936:+]')
do.call(rbind, stri_match_all_regex(x, '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]'))
#      [,1]                              [,2] [,3]        [,4]        [,5]
# [1,] "[hg19:21:34809787-34809808:+]"   "21" "34809787"  "34809808"  "+" 
# [2,] "[hg19:11:105851118-105851139:+]" "11" "105851118" "105851139" "+" 
# [3,] "[hg19:17:7482245-7482266:+]"     "17" "7482245"   "7482266"   "+" 
# [4,] "[hg19:6:19839915-19839936:+]"    "6"  "19839915"  "19839936"  "+" 
Run Code Online (Sandbox Code Playgroud)

str_match - stringr

str_match(x, '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]')
Run Code Online (Sandbox Code Playgroud)

strapplyc - gsubfn

strapplyc(x, "(\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])])", simplify = rbind)
Run Code Online (Sandbox Code Playgroud)

以下是所有组合解决方案的基准比较.

x <- rep(c('[hg19:21:34809787-34809808:+]', 
           '[hg19:11:105851118-105851139:+]', 
           '[hg19:17:7482245-7482266:+]', 
           '[hg19:6:19839915-19839936:+]'), 1000)

applyCaptureRegex <- function(mir, r) {
  do.call(rbind, lapply(mir, function(x) regmatches(x, regexec(r, x))[[1]]))
}

gsubfn <- function(x1) strapplyc(x1, '(\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])])', simplify = rbind)
regmtch <- function(x1) applyCaptureRegex(x1, '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]')
stringr <- function(x1) str_match(x1, '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]')
stringi <- function(x1) do.call(rbind, stri_match_all_regex(x1, '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]'))

require(microbenchmark)
microbenchmark(gsubfn(x), regmtch(x), stringr(x), stringi(x))
Run Code Online (Sandbox Code Playgroud)

结果

Unit: milliseconds
       expr       min        lq      mean    median        uq       max neval
  gsubfn(x) 372.27072 382.82179 391.21837 388.32396 396.27361 449.03091   100
 regmtch(x) 394.03164 409.87523 419.42936 417.76770 427.08208 456.92460   100
 stringr(x)  65.81644  70.28327  76.02298  75.43162  78.92567 116.18026   100
 stringi(x)  15.88171  16.53047  17.52434  16.96127  17.76007  23.94449   100
Run Code Online (Sandbox Code Playgroud)