如何计算字符串中某些字符的频率?

use*_*366 9 regex string r

如果我有一个字符串,例如"AABBABBBAAAABBAAAABBBAABBBBABABB".

有没有办法让R统计A的连续段(run),并给出每种长度的连续段各有多少个?

也就是说,我想知道连续3个A出现了多少次,单独1个A出现了多少次,连续2个A出现了多少次,等等.

akr*_*run 10

尝试

 # Collapse every stretch of non-A characters into a single comma, then let
 # scan() read the comma-separated pieces back as character fields.
 v1 <- scan(
   text = gsub("[^A]+", ",", str1),
   what = "", sep = ",", quiet = TRUE
 )
 # Separators at the edges can yield empty fields; tabulate only the real runs.
 table(Filter(nzchar, v1))
 # A   AA AAAA 
 # 3    2    2 
Run Code Online (Sandbox Code Playgroud)

或者

 library(stringi)
 # Extract every maximal run of "A" directly and tabulate the distinct runs.
 table(unlist(stri_extract_all_regex(str = str1, pattern = "[A]+")))
 # A   AA AAAA 
 # 3    2    2 
Run Code Online (Sandbox Code Playgroud)

基准

 # Benchmark input: one random string of 1e7 characters drawn from A-G,
 # seeded so the comparison is reproducible. Requires stringi to be attached.
 set.seed(42)
 x1 <- stri_rand_strings(1,1e7, pattern='[A-G]')

 # stringi: split on runs of non-A characters, dropping empty pieces.
 system.time(table(stri_split_regex(x1, "[^A]+", omit_empty = TRUE)))
 #   user  system elapsed 
 #  0.829   0.002   0.831 

 # stringi: extract the runs of A directly.
 system.time(table(stri_extract_all_regex(x1, '[A]+')[[1]]))
 #   user  system elapsed 
 #   0.790   0.002   0.791 

 # base R: split into single characters and run-length encode them; this
 # tabulates the runs of every letter, not only A.
 system.time(table(rle(strsplit(x1,"")[[1]])) )
 #   user  system elapsed 
 #  30.230   1.243  31.523 

 # base R: split on runs of non-A characters.
 # NOTE(review): unlike the omit_empty variant above, strsplit() can return a
 # leading "" piece if x1 starts with a non-A character -- confirm if the
 # tables are meant to match exactly.
 system.time(table(strsplit(x1, "[^A]+")))
 # user  system elapsed 
 # 4.253   0.006   4.258 


 # base R: tabulate the match lengths of every "A+" match instead of the
 # matched text itself.
 system.time(table(attr(gregexpr("A+",x1)[[1]], 'match.length')))
 #  user  system elapsed 
 #  1.994   0.004   1.999 


 # Repeat the same five approaches 20 times each and report timings relative
 # to the fastest; each label names the author of the corresponding answer.
 library(microbenchmark)
 microbenchmark(david=table(stri_split_regex(x1, "[^A]+", omit_empty = TRUE)),
    akrun=  table(stri_extract_all_regex(x1, '[A]+')[[1]]),
    david2 =  table(strsplit(x1, "[^A]+")),
    glen = table(rle(strsplit(x1,"")[[1]])),
    plannapus = table(attr(gregexpr("A+",x1)[[1]], 'match.length')),
         times=20L, unit='relative')

#Unit: relative
#     expr       min        lq      mean    median         uq       max    neval  cld
#   david  1.0000000  1.000000  1.000000  1.000000  1.0000000  1.000000    20       a  
#   akrun  0.7908313  1.023388  1.054670  1.336510  0.9903384  1.004711    20       a
#  david2  4.9325256  5.461389  5.613516  6.207990  5.6647301  5.374668    20       c 
#    glen 14.9064240 15.975846 16.672339 20.570874 15.8710402 15.465140    20       d
#plannapus 2.5077719  3.123360  2.836338  3.557242  2.5689176  2.452964    20       b 
Run Code Online (Sandbox Code Playgroud)

数据

 # Example input from the question: alternating runs of "A" and "B".
 str1 <- "AABBABBBAAAABBAAAABBBAABBBBABABB"
Run Code Online (Sandbox Code Playgroud)


Gle*_*n_b 10

# Break the string into single characters, run-length encode them, and
# cross-tabulate run length against the letter that forms the run.
table(rle(unlist(strsplit("AABBABBBAAAABBAAAABBBAABBBBABABB", split = ""))))
Run Code Online (Sandbox Code Playgroud)

       values
lengths A B
      1 3 1
      2 2 3
      3 0 2
      4 2 1
Run Code Online (Sandbox Code Playgroud)

其中(沿A列向下读)表示:长度为1的A连续段有3个,长度为2的有2个,长度为4的有2个.

  • 问题并没有把速度列为主要考虑因素,不过确实存在更快的方法. (2认同)

Dav*_*urg 8

这是另一种使用strsplit的方法

x <- "AABBABBBAAAABBAAAABBBAABBBBABABB"
# Splitting on runs of non-A characters leaves the runs of A as the pieces.
# strsplit() returns a leading "" when the input starts with a non-A
# character, so drop empty pieces before tabulating; otherwise table() would
# report a spurious "" category.
a_runs <- strsplit(x, "[^A]+")[[1]]
table(a_runs[nzchar(a_runs)])
# A   AA AAAA 
# 3    2    2 
Run Code Online (Sandbox Code Playgroud)

或者类似地使用stringi包

library(stringi)
# Same idea with stringi: split on runs of non-A characters and let
# omit_empty discard the empty pieces before tabulating.
table(stri_split_regex(str = x, pattern = "[^A]+", omit_empty = TRUE))
Run Code Online (Sandbox Code Playgroud)