基于R中的交替字符拆分字符串

Cod*_*man 74 r

我试图找出一种有效的方法来分割字符串

"111110000011110000111000"
Run Code Online (Sandbox Code Playgroud)

到一个矢量

[1] "11111" "00000" "1111" "0000" "111" "000"
Run Code Online (Sandbox Code Playgroud)

其中"0"和"1"可以是任何交替的字符.

akr*_*run 93

尝试

strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=TRUE)[[1]]
#[1] "11111" "00000" "1111"  "0000"  "111"   "000"  
Run Code Online (Sandbox Code Playgroud)

更新

修改@ rawr的解决方案 stri_extract_all_regex

library(stringi)
stri_extract_all_regex(str1, '(?:(\\w))\\1*')[[1]]
#[1] "11111" "00000" "1111"  "0000"  "111"   "000"  


stri_extract_all_regex(x1, '(?:(\\w))\\1*')[[1]]
#[1] "11111" "00000" "222"   "000"   "3333"  "000"   "1111"  "0000"  "111"  
#[10] "000"  

stri_extract_all_regex(x2, '(?:(\\w))\\1*')[[1]]
#[1] "aaaaa"   "bb"      "ccccccc" "bbb"     "a"       "d"       "11111"  
#[8] "00000"   "222"     "aaa"     "bb"      "cc"      "d"       "11"     
#[15] "D"       "aa"      "BB"     
Run Code Online (Sandbox Code Playgroud)

基准

library(stringi) 
set.seed(24)
x3 <- stri_rand_strings(1, 1e4)

akrun <- function() stri_extract_all_regex(x3, '(?:(\\w))\\1*')[[1]]
#modified @thelatemail's function to make it bit more general
thelate <- function() regmatches(x3,gregexpr("(?:(\\w))\\1*", x3, 
            perl=TRUE))[[1]]
rawr <- function() strsplit(x3, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
ananda <- function() unlist(read.fwf(textConnection(x3), 
                rle(strsplit(x3, "")[[1]])$lengths, 
                colClasses = "character"))
Colonel <- function() with(rle(strsplit(x3,'')[[1]]), 
   mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values))

Cryo <- function(){
   res_vector=rep(NA_character_,nchar(x3))
  res_vector[1]=substr(x3,1,1)
  counter=1
  old_tmp=''

   for (i in 2:nchar(x3)) {
    tmp=substr(x3,i,i)
    if (tmp==old_tmp) {
    res_vector[counter]=paste0(res_vector[counter],tmp)
    } else {
    res_vector[counter+1]=tmp
    counter=counter+1
    }
  old_tmp=tmp
   }

 res_vector[!is.na(res_vector)]
  }


 richard <- function(){
     cs <- cumsum(
     rle(stri_split_boundaries(x3, type = "character")[[1L]])$lengths
   )
   stri_sub(x3, c(1, head(cs + 1, -1)), cs)
  }

 nicola<-function(x) {
   indices<-c(0,which(diff(as.integer(charToRaw(x)))!=0),nchar(x))
   substring(x,indices[-length(indices)]+1,indices[-1])
 }

 richard2 <- function() {
  cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
  stri_sub(x3, c(1, head(cs + 1, -1)), cs)
 }

system.time(akrun())
# user  system elapsed 
# 0.003   0.000   0.003 

system.time(thelate())
#   user  system elapsed 
#  0.272   0.001   0.274 

system.time(rawr())
# user  system elapsed 
#  0.397   0.001   0.398 

system.time(ananda())
#  user  system elapsed 
# 3.744   0.204   3.949 

system.time(Colonel())
#   user  system elapsed 
#  0.154   0.001   0.154 

system.time(Cryo())
#  user  system elapsed 
# 0.220   0.005   0.226 

system.time(richard())
#  user  system elapsed 
# 0.007   0.000   0.006 

system.time(nicola(x3))
# user  system elapsed 
# 0.190   0.001   0.191 
Run Code Online (Sandbox Code Playgroud)

在略大的字符串上,

set.seed(24)
x3 <- stri_rand_strings(1, 1e6)

system.time(akrun())
#user  system elapsed 
#0.166   0.000   0.155 
system.time(richard())
#  user  system elapsed 
# 0.606   0.000   0.569 
system.time(richard2())
#  user  system elapsed 
# 0.518   0.000   0.487 

system.time(Colonel())
#  user  system elapsed 
# 9.631   0.000   9.358 


library(microbenchmark)
 microbenchmark(richard(), richard2(), akrun(), times=20L, unit='relative')
 #Unit: relative
 #     expr      min       lq     mean   median       uq      max neval cld
 # richard() 2.438570 2.633896 2.365686 2.315503 2.368917 2.124581    20   b
 #richard2() 2.389131 2.533301 2.223521 2.143112 2.153633 2.157861    20   b
 # akrun() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    20  a 
Run Code Online (Sandbox Code Playgroud)

注意: 尝试运行其他方法,但需要很长时间

数据

str1 <- "111110000011110000111000"
x1 <- "1111100000222000333300011110000111000"
x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
Run Code Online (Sandbox Code Playgroud)


the*_*ail 27

主题的变化:

x <- "111110000011110000111000"
regmatches(x,gregexpr("1+|0+",x))[[1]]
#[1] "11111" "00000" "1111"  "0000"  "111"   "000"
Run Code Online (Sandbox Code Playgroud)


A5C*_*2T1 22

您可以使用substrread.fwf与之一起使用rle(尽管它不可​​能像任何基于正则表达式的解决方案一样高效):

x <- "111110000011110000111000"
unlist(read.fwf(textConnection(x), 
                rle(strsplit(x, "")[[1]])$lengths, 
                colClasses = "character"))
#      V1      V2      V3      V4      V5      V6 
# "11111" "00000"  "1111"  "0000"   "111"   "000"
Run Code Online (Sandbox Code Playgroud)

这种方法的一个优点是它可以工作,甚至:

x <- paste(c(rep("a", 5), rep("b", 2), rep("c", 7),
             rep("b", 3), rep("a", 1), rep("d", 1)), collapse = "")
x
# [1] "aaaaabbcccccccbbbad"

unlist(read.fwf(textConnection(x), 
                rle(strsplit(x, "")[[1]])$lengths, 
                colClasses = "character"))
#        V1        V2        V3        V4        V5        V6 
#   "aaaaa"      "bb" "ccccccc"     "bbb"       "a"       "d" 
Run Code Online (Sandbox Code Playgroud)


raw*_*awr 20

另一种方法是在交替数字之间添加空格.这适用于任何两个,而不仅仅是1和0.然后strsplit在空白处使用:

x <- "111110000011110000111000"

(y <- gsub('(\\d)(?!\\1)', '\\1 \\2', x, perl = TRUE))
# [1] "11111 00000 1111 0000 111 000 "


strsplit(y, ' ')[[1]]
# [1] "11111" "00000" "1111"  "0000"  "111"   "000"  
Run Code Online (Sandbox Code Playgroud)

或者更为简洁,正如@akrun所指出的那样:

strsplit(x, '(?<=(\\d))(?!\\1)', perl=TRUE)[[1]]
# [1] "11111" "00000" "1111"  "0000"  "111"   "000"  
Run Code Online (Sandbox Code Playgroud)

\\d改为\\w工作

x  <- "aaaaabbcccccccbbbad"
strsplit(x, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
# [1] "aaaaa"   "bb"      "ccccccc" "bbb"     "a"       "d"      

x <- "111110000011110000111000"
strsplit(x, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
# [1] "11111" "00000" "1111"  "0000"  "111"   "000" 
Run Code Online (Sandbox Code Playgroud)

你也可以使用\K(而不是显式地使用捕获组,\\1并且\\2)我看不到很多用,也不知道如何解释它:}

AFAIK \\K重置报告的匹配的起点,并且不再包括任何先前消耗的字符,基本上丢弃了与该点匹配的所有内容.

x <- "1111100000222000333300011110000111000"
(z <- gsub('(\\d)\\K(?!\\1)', ' ', x, perl = TRUE))
# [1] "11111 00000 222 000 3333 000 1111 0000 111 000 "
Run Code Online (Sandbox Code Playgroud)


Ric*_*ven 14

原始方法:这是一个包含的stringi方法rle().

x <- "111110000011110000111000"
library(stringi)

cs <- cumsum(
    rle(stri_split_boundaries(x, type = "character")[[1L]])$lengths
)
stri_sub(x, c(1L, head(cs + 1L, -1L)), cs)
# [1] "11111" "00000" "1111"  "0000"  "111"   "000"  
Run Code Online (Sandbox Code Playgroud)

或者,您可以使用length参数stri_sub()

rl <- rle(stri_split_boundaries(x, type = "character")[[1L]])
with(rl, {
    stri_sub(x, c(1L, head(cumsum(lengths) + 1L, -1L)), length = lengths)
})
# [1] "11111" "00000" "1111"  "0000"  "111"   "000"  
Run Code Online (Sandbox Code Playgroud)

更新效率:在意识到base::strsplit()速度快之后stringi::stri_split_boundaries(),这里只使用基本功能,这是我之前答案的更高效版本.

set.seed(24)
x3 <- stri_rand_strings(1L, 1e6L)

system.time({
    cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
    substring(x3, c(1L, head(cs + 1L, -1L)), cs)
})
#   user  system elapsed 
#  0.686   0.012   0.697 
Run Code Online (Sandbox Code Playgroud)


Col*_*vel 11

另一种方法,使用mapply:

x="111110000011110000111000"

with(rle(strsplit(x,'')[[1]]), 
     mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values))
#[1] "11111" "00000" "1111"  "0000"  "111"   "000"  
Run Code Online (Sandbox Code Playgroud)


Tom*_*ell 8

它并不是OP正在寻找的(简洁的R代码),但我认为我试一试Rcpp,结果相对简单,比最快的基于R的答案快5倍.

library(Rcpp)

cppFunction(
  'std::vector<std::string> split_str_cpp(std::string x) {

  std::vector<std::string> parts;

  int start = 0;

  for(int i = 1; i <= x.length(); i++) {
      if(x[i] != x[i-1]) {
        parts.push_back(x.substr(start, i-start));
        start = i;
      } 
  }

  return parts;

  }')
Run Code Online (Sandbox Code Playgroud)

并测试这些

str1 <- "111110000011110000111000"
x1 <- "1111100000222000333300011110000111000"
x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
Run Code Online (Sandbox Code Playgroud)

给出以下输出

> split_str_cpp(str1)
[1] "11111" "00000" "1111"  "0000"  "111"   "000"  
> split_str_cpp(x1)
 [1] "11111" "00000" "222"   "000"   "3333"  "000"   "1111"  "0000"  "111"   "000"  
> split_str_cpp(x2)
 [1] "aaaaa"   "bb"      "ccccccc" "bbb"     "a"       "d"       "11111"   "00000"   "222"     "aaa"     "bb"      "cc"      "d"       "11"     
[15] "D"       "aa"      "BB"   
Run Code Online (Sandbox Code Playgroud)

基准测试显示它比R解决方案快5-10倍.

akrun <- function(str1) strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=TRUE)[[1]]

richard1 <- function(x3){
  cs <- cumsum(
    rle(stri_split_boundaries(x3, type = "character")[[1L]])$lengths
  )
  stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}

richard2 <- function(x3) {
  cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
  stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}

library(microbenchmark)
library(stringi)

set.seed(24)
x3 <- stri_rand_strings(1, 1e6)

microbenchmark(split_str_cpp(x3), akrun(x3), richard1(x3), richard2(x3), unit = 'relative', times=20L)
Run Code Online (Sandbox Code Playgroud)

比较:

Unit: relative
              expr      min       lq     mean   median       uq      max neval
 split_str_cpp(x3) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    20
         akrun(x3) 9.675613 8.952997 8.241750 8.689001 8.403634 4.423134    20
      richard1(x3) 5.355620 5.226103 5.483171 5.947053 5.982943 3.379446    20
      richard2(x3) 4.842398 4.756086 5.046077 5.389570 5.389193 3.669680    20
Run Code Online (Sandbox Code Playgroud)

  • 等等,所以我的回答比akrun更快?有趣 (2认同)