根据R中的交替字符分割字符串


75

我正在尝试找出一种有效的方法来分割像

"111110000011110000111000"

成向量

[1] "11111" "00000" "1111" "0000" "111" "000"

其中“ 0”和“ 1”可以是任何交替字符。


3
您是指性能有效还是代码(可读性)有效?
freekvd 2015年

1
@freekvd对不起,这意味着可读性高。
CodeShaman

Answers:


95

尝试

strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=TRUE)[[1]]
#[1] "11111" "00000" "1111"  "0000"  "111"   "000"  

更新资料

用@rawr解决方案的修改 stri_extract_all_regex

library(stringi)
stri_extract_all_regex(str1, '(?:(\\w))\\1*')[[1]]
#[1] "11111" "00000" "1111"  "0000"  "111"   "000"  


stri_extract_all_regex(x1, '(?:(\\w))\\1*')[[1]]
#[1] "11111" "00000" "222"   "000"   "3333"  "000"   "1111"  "0000"  "111"  
#[10] "000"  

stri_extract_all_regex(x2, '(?:(\\w))\\1*')[[1]]
#[1] "aaaaa"   "bb"      "ccccccc" "bbb"     "a"       "d"       "11111"  
#[8] "00000"   "222"     "aaa"     "bb"      "cc"      "d"       "11"     
#[15] "D"       "aa"      "BB"     

基准测试

library(stringi) 
set.seed(24)
x3 <- stri_rand_strings(1, 1e4)

akrun <- function() stri_extract_all_regex(x3, '(?:(\\w))\\1*')[[1]]
#modified @thelatemail's function to make it bit more general
thelate <- function() regmatches(x3,gregexpr("(?:(\\w))\\1*", x3, 
            perl=TRUE))[[1]]
rawr <- function() strsplit(x3, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
ananda <- function() unlist(read.fwf(textConnection(x3), 
                rle(strsplit(x3, "")[[1]])$lengths, 
                colClasses = "character"))
Colonel <- function() with(rle(strsplit(x3,'')[[1]]), 
   mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values))

Cryo <- function(){
   res_vector=rep(NA_character_,nchar(x3))
  res_vector[1]=substr(x3,1,1)
  counter=1
  old_tmp=''

   for (i in 2:nchar(x3)) {
    tmp=substr(x3,i,i)
    if (tmp==old_tmp) {
    res_vector[counter]=paste0(res_vector[counter],tmp)
    } else {
    res_vector[counter+1]=tmp
    counter=counter+1
    }
  old_tmp=tmp
   }

 res_vector[!is.na(res_vector)]
  }


 richard <- function(){
     cs <- cumsum(
     rle(stri_split_boundaries(x3, type = "character")[[1L]])$lengths
   )
   stri_sub(x3, c(1, head(cs + 1, -1)), cs)
  }

 nicola<-function(x) {
   indices<-c(0,which(diff(as.integer(charToRaw(x)))!=0),nchar(x))
   substring(x,indices[-length(indices)]+1,indices[-1])
 }

 richard2 <- function() {
  cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
  stri_sub(x3, c(1, head(cs + 1, -1)), cs)
 }

system.time(akrun())
# user  system elapsed 
# 0.003   0.000   0.003 

system.time(thelate())
#   user  system elapsed 
#  0.272   0.001   0.274 

system.time(rawr())
# user  system elapsed 
#  0.397   0.001   0.398 

system.time(ananda())
#  user  system elapsed 
# 3.744   0.204   3.949 

system.time(Colonel())
#   user  system elapsed 
#  0.154   0.001   0.154 

system.time(Cryo())
#  user  system elapsed 
# 0.220   0.005   0.226 

system.time(richard())
#  user  system elapsed 
# 0.007   0.000   0.006 

system.time(nicola(x3))
# user  system elapsed 
# 0.190   0.001   0.191 

在稍大的弦上,

set.seed(24)
x3 <- stri_rand_strings(1, 1e6)

system.time(akrun())
#user  system elapsed 
#0.166   0.000   0.155 
system.time(richard())
#  user  system elapsed 
# 0.606   0.000   0.569 
system.time(richard2())
#  user  system elapsed 
# 0.518   0.000   0.487 

system.time(Colonel())
#  user  system elapsed 
# 9.631   0.000   9.358 


library(microbenchmark)
 microbenchmark(richard(), richard2(), akrun(), times=20L, unit='relative')
 #Unit: relative
 #     expr      min       lq     mean   median       uq      max neval cld
 # richard() 2.438570 2.633896 2.365686 2.315503 2.368917 2.124581    20   b
 #richard2() 2.389131 2.533301 2.223521 2.143112 2.153633 2.157861    20   b
 # akrun() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    20  a 

注意: 试图运行其他方法,但是需要很长时间

数据

str1 <- "111110000011110000111000"
x1 <- "1111100000222000333300011110000111000"
x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"

27

主题变化:

x <- "111110000011110000111000"
regmatches(x,gregexpr("1+|0+",x))[[1]]
#[1] "11111" "00000" "1111"  "0000"  "111"   "000"

22

您可能会使用substrread.fwf一起使用rle(尽管它不可​​能像任何基于正则表达式的解决方案一样有效):

x <- "111110000011110000111000"
unlist(read.fwf(textConnection(x), 
                rle(strsplit(x, "")[[1]])$lengths, 
                colClasses = "character"))
#      V1      V2      V3      V4      V5      V6 
# "11111" "00000"  "1111"  "0000"   "111"   "000"

这种方法的一个优点是它甚至可以在以下情况下工作:

x <- paste(c(rep("a", 5), rep("b", 2), rep("c", 7),
             rep("b", 3), rep("a", 1), rep("d", 1)), collapse = "")
x
# [1] "aaaaabbcccccccbbbad"

unlist(read.fwf(textConnection(x), 
                rle(strsplit(x, "")[[1]])$lengths, 
                colClasses = "character"))
#        V1        V2        V3        V4        V5        V6 
#   "aaaaa"      "bb" "ccccccc"     "bbb"       "a"       "d" 

20

另一种方法是在交替的数字之间添加空格。这适用于任何两个,而不仅仅是1和0。然后strsplit在空白处使用:

x <- "111110000011110000111000"

(y <- gsub('(\\d)(?!\\1)', '\\1 \\2', x, perl = TRUE))
# [1] "11111 00000 1111 0000 111 000 "


strsplit(y, ' ')[[1]]
# [1] "11111" "00000" "1111"  "0000"  "111"   "000"  

或更简洁地@akrun指出:

strsplit(x, '(?<=(\\d))(?!\\1)', perl=TRUE)[[1]]
# [1] "11111" "00000" "1111"  "0000"  "111"   "000"  

也改变\\d\\w作品

x  <- "aaaaabbcccccccbbbad"
strsplit(x, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
# [1] "aaaaa"   "bb"      "ccccccc" "bbb"     "a"       "d"      

x <- "111110000011110000111000"
strsplit(x, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
# [1] "11111" "00000" "1111"  "0000"  "111"   "000" 

您还可以使用\K(而不是显式地使用捕获组\\1\\2),但我并没有使用太多,也不知道如何解释它:}

AFAIK会\\K重置报告的匹配项的起点,并且不再包含任何以前使用的字符,基本上会丢弃所有匹配的内容。

x <- "1111100000222000333300011110000111000"
(z <- gsub('(\\d)\\K(?!\\1)', ' ', x, perl = TRUE))
# [1] "11111 00000 222 000 3333 000 1111 0000 111 000 "

您可以将代码缩短为strsplit(x, '(?<=(\\d))(?!\\1)', perl=TRUE)[[1]] (尽管没有在很多情况下进行过测试:-)
akrun 2015年

@akrun我知道,我只是想知道您是否知道这个 \\K东西在正则表达式中正在做什么
rawr 2015年

我认为\\w方法在两种情况下都应该起作用。我使用的\\K不是很多,但是我想您已经在您的帖子中对此进行了解释。
akrun,2015年

1
@akrun,即将发表同样的评论。
A5C1D2H2I1M1N2O1R2T1

14

原始方法:这是一种结合了的严格方法rle()

x <- "111110000011110000111000"
library(stringi)

cs <- cumsum(
    rle(stri_split_boundaries(x, type = "character")[[1L]])$lengths
)
stri_sub(x, c(1L, head(cs + 1L, -1L)), cs)
# [1] "11111" "00000" "1111"  "0000"  "111"   "000"  

或者,您可以lengthstri_sub()

rl <- rle(stri_split_boundaries(x, type = "character")[[1L]])
with(rl, {
    stri_sub(x, c(1L, head(cumsum(lengths) + 1L, -1L)), length = lengths)
})
# [1] "11111" "00000" "1111"  "0000"  "111"   "000"  

为提高效率而更新:在意识到base::strsplit()比更快的速度之后stringi::stri_split_boundaries(),这是我以前的答案的更有效版本,仅使用基本函数。

set.seed(24)
x3 <- stri_rand_strings(1L, 1e6L)

system.time({
    cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
    substring(x3, c(1L, head(cs + 1L, -1L)), cs)
})
#   user  system elapsed 
#  0.686   0.012   0.697 

11

万一的另一种方法,使用mapply

x="111110000011110000111000"

with(rle(strsplit(x,'')[[1]]), 
     mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values))
#[1] "11111" "00000" "1111"  "0000"  "111"   "000"  

您的功能比大多数解决方案都要快,除了stringi
akrun

哦,太好了,如果不进行测试,那本来可以骗我的regmatches!由于我不知道在此功能下隐藏了什么而感到误解!
上校博维尔上校'15

regmatches通常更快,但也可能取决于regex使用情况。在这里,我正在测试一个更一般的情况。如果我们在@thelatemail的帖子中使用相同的正则表达式测试二进制字符串,则时间可能会有所不同
akrun 2015年

8

这不是OP真正想要的(简洁的R代码),但我想尝试一下Rcpp,结果相对简单,并且比基于R的最快答案要快5倍。

library(Rcpp)

cppFunction(
  'std::vector<std::string> split_str_cpp(std::string x) {

  std::vector<std::string> parts;

  int start = 0;

  for(int i = 1; i <= x.length(); i++) {
      if(x[i] != x[i-1]) {
        parts.push_back(x.substr(start, i-start));
        start = i;
      } 
  }

  return parts;

  }')

并在这些上进行测试

str1 <- "111110000011110000111000"
x1 <- "1111100000222000333300011110000111000"
x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"

提供以下输出

> split_str_cpp(str1)
[1] "11111" "00000" "1111"  "0000"  "111"   "000"  
> split_str_cpp(x1)
 [1] "11111" "00000" "222"   "000"   "3333"  "000"   "1111"  "0000"  "111"   "000"  
> split_str_cpp(x2)
 [1] "aaaaa"   "bb"      "ccccccc" "bbb"     "a"       "d"       "11111"   "00000"   "222"     "aaa"     "bb"      "cc"      "d"       "11"     
[15] "D"       "aa"      "BB"   

基准测试表明它比R解决方案快5-10倍。

akrun <- function(str1) strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=TRUE)[[1]]

richard1 <- function(x3){
  cs <- cumsum(
    rle(stri_split_boundaries(x3, type = "character")[[1L]])$lengths
  )
  stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}

richard2 <- function(x3) {
  cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
  stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}

library(microbenchmark)
library(stringi)

set.seed(24)
x3 <- stri_rand_strings(1, 1e6)

microbenchmark(split_str_cpp(x3), akrun(x3), richard1(x3), richard2(x3), unit = 'relative', times=20L)

比较:

Unit: relative
              expr      min       lq     mean   median       uq      max neval
 split_str_cpp(x3) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    20
         akrun(x3) 9.675613 8.952997 8.241750 8.689001 8.403634 4.423134    20
      richard1(x3) 5.355620 5.226103 5.483171 5.947053 5.982943 3.379446    20
      richard2(x3) 4.842398 4.756086 5.046077 5.389570 5.389193 3.669680    20

2
等待,所以我的答案比akrun的答案快?有趣
Rich Scriven

2

简单for循环解决方案

x="aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
res_vector=substr(x,1,1)

for (i in 2:nchar(x)) {
  tmp=substr(x,i,i)
  if (tmp==substr(x,i-1,i-1)) {
    res_vector[length(res_vector)]=paste0(res_vector[length(res_vector)],tmp)
  } else {
    res_vector[length(res_vector)+1]=tmp
  }
}

res_vector

#[1] "aaaaa"  "bb"  "ccccccc"  "bbb"  "a"  "d"  "11111"  "00000"  "222"  "aaa"  "bb"  "cc"  "d"  "11"  "D"  "aa"  "BB"

或使用预先分配的结果向量快一点

x="aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
res_vector=rep(NA_character_,nchar(x))
res_vector[1]=substr(x,1,1)
counter=1
old_tmp=''

for (i in 2:nchar(x)) {
  tmp=substr(x,i,i)
  if (tmp==old_tmp) {
    res_vector[counter]=paste0(res_vector[counter],tmp)
  } else {
    res_vector[counter+1]=tmp
    counter=counter+1
  }
  old_tmp=tmp
}

res_vector[!is.na(res_vector)]

您的第二个功能更快,但仍落后于上校的方法。+1
akrun

我将添加另一个修改,使其更快一点。:)
cryo111

我试图在1e6字符串上运行您的函数。需要很长时间。
2015年

是的,扩展性似乎不太好。
cryo111

对于1e4向量,速度仅稍快一点。但是对于1e6向量仍然很慢。
cryo111

1

这个怎么样:

s <- "111110000011110000111000"

spl <- strsplit(s,"10|01")[[1]]
l <- length(spl)
sapply(1:l, function(i) paste0(spl[i],i%%2,ifelse(i==1 | i==l, "",i%%2)))

# [1] "11111" "00000" "1111"  "0000"  "111"   "000"  

1
Fyi,通常只是做sapply(seq_along(spl), ...)而不是费力地将其长度提取为单独的var。
Frank Frank
By using our site, you acknowledge that you have read and understand our Cookie Policy and Privacy Policy.
Licensed under cc by-sa 3.0 with attribution required.