根据值是否高于R中的阈值来分段矢量

Hen*_*ugh 8 r vector

我有一个长矢量,我需要根据阈值将其分成段.段是超过阈值的连续值.当值低于阈值时,段结束,下一段开始,其中值再次超过阈值.我需要记录每个段的开始和结束索引.

下面是一个低效的实现.写这个最快最合适的方法是什么?这非常难看,我必须假设有一个更清洁的实现.

set.seed(10)
test.vec <- rnorm(100, 8, 10)
threshold <- 0
segments <- list()
in.segment <- FALSE
for(i in 1:length(test.vec)){

    # If we're in a segment
    if(in.segment){
        if(test.vec[i] > threshold){
            next
        }else{
            end.ind <- i - 1
            in.segment <- FALSE
            segments[[length(segments) + 1]] <- c(start.ind, end.ind)
        }
    }

    # if not in segment
    else{
        if(test.vec[i] > threshold){        
            start.ind <- i
            in.segment <- TRUE
        }
    }
}
Run Code Online (Sandbox Code Playgroud)

编辑:所有解决方案的运行时间

感谢所有的回复,这是有帮助的,非常有启发性的.下面是对所有五种解决方案的小测试(提供的四个加上原始示例).正如您所看到的,这四个都是对原始解决方案的巨大改进,但Khashaa的解决方案是目前最快的解决方案.

set.seed(1)
test.vec <- rnorm(1e6, 8, 10);threshold <- 0

originalFunction <- function(x, threshold){
    segments <- list()
    in.segment <- FALSE
    for(i in 1:length(test.vec)){

    # If we're in a segment
        if(in.segment){
            if(test.vec[i] > threshold){
                next
            }else{
                end.ind <- i - 1
                in.segment <- FALSE
                segments[[length(segments) + 1]] <- c(start.ind, end.ind)
            }
        }

    # if not in segment
        else{
            if(test.vec[i] > threshold){        
                start.ind <- i
                in.segment <- TRUE
            }
        }
    }
    segments
}

SimonG <- function(x, threshold){

  hit <- which(x > threshold)
  n <- length(hit)

  ind <- which(hit[-1] - hit[-n] > 1)

  starts <- c(hit[1], hit[ ind+1 ])
  ends <- c(hit[ ind ], hit[n])

  cbind(starts,ends)
}

Rcpp::cppFunction('DataFrame Khashaa(NumericVector x, double threshold) {
  x.push_back(-1);
  int n = x.size(), startind, endind; 
  std::vector<int> startinds, endinds;
  bool insegment = false;
  for(int i=0; i<n; i++){
    if(!insegment){
      if(x[i] > threshold){        
        startind = i + 1;
        insegment = true;          }
    }else{
      if(x[i] < threshold){
        endind = i;
        insegment = false;
        startinds.push_back(startind); 
        endinds.push_back(endind);
      }
    }
  }
  return DataFrame::create(_["start"]= startinds, _["end"]= endinds);
}')

bgoldst <- function(x, threshold){
    with(rle(x>threshold),
         t(matrix(c(0L,rep(cumsum(lengths),2L)[-length(lengths)]),2L,byrow=T)+1:0)[values,])   
}

ClausWilke <- function(x, threshold){
    suppressMessages(require(dplyr, quietly = TRUE))
    in.segment <- (x > threshold)
    start <- which(c(FALSE, in.segment) == TRUE & lag(c(FALSE, in.segment) == FALSE)) - 1
    end <- which(c(in.segment, FALSE) == TRUE & lead(c(in.segment, FALSE) == FALSE))
    data.frame(start, end)    
}

system.time({ originalFunction(test.vec, threshold); })
 ## user  system elapsed 
 ## 66.539   1.232  67.770 
system.time({ SimonG(test.vec, threshold); })
 ## user  system elapsed 
 ## 0.028   0.008   0.036 
system.time({ Khashaa(test.vec, threshold); })
 ## user  system elapsed 
 ## 0.008   0.000   0.008 
system.time({ bgoldst(test.vec, threshold); })
 ## user  system elapsed 
 ## 0.065   0.000   0.065 
system.time({ ClausWilke(test.vec, threshold); })
 ## user  system elapsed 
 ## 0.274   0.012   0.285 
Run Code Online (Sandbox Code Playgroud)

Sim*_*onG 6

这是另一种选择,主要是使用which.通过查找hit序列的非连续元素来确定起点和终点.

test.vec <- rnorm(100, 8, 10)
threshold <- 0


findSegments <- function(x, threshold){

  hit <- which(x > threshold)
  n <- length(hit)

  ind <- which(hit[-1] - hit[-n] > 1)

  starts <- c(hit[1], hit[ ind+1 ])
  ends <- c(hit[ ind ], hit[n])

  cbind(starts,ends)

}

findSegments(test.vec, threshold=0)
Run Code Online (Sandbox Code Playgroud)

这给出了类似的东西:

> findSegments(test.vec, threshold=0)
      starts ends
 [1,]      1    3
 [2,]      5    7
 [3,]      9   11
 [4,]     13   28
 [5,]     30   30
 [6,]     32   32
 [7,]     34   36
 [8,]     38   39
 [9,]     41   41
[10,]     43   43
[11,]     46   51
[12,]     54   54
[13,]     56   61
[14,]     63   67
[15,]     69   72
[16,]     76   77
[17,]     80   81
[18,]     83   84
[19,]     86   88
[20,]     90   92
[21,]     94   95
[22,]     97   97
[23,]    100  100
Run Code Online (Sandbox Code Playgroud)

将其与原始序列进行比较:

> round(test.vec,1)
  [1]  20.7  15.7   4.3 -15.1  24.6   9.4  23.2  -4.5  16.9  20.9  13.2  -1.2
 [13]  22.6   7.7   6.0   6.6   4.1  21.3   5.3  16.7  11.4  16.7  19.6  16.7
 [25]  11.6   7.3   3.7   8.4  -4.5  11.7  -7.1   8.4 -18.5  12.8  22.5  11.0
 [37]  -3.3  11.1   6.9  -7.9  22.9  -3.7   3.5  -7.1  -5.9   3.5  13.2  20.0
 [49]  13.2  23.4  15.9  -5.0  -6.3  10.0  -6.2   4.7   2.1  26.4   5.9  27.3
 [61]  14.3 -12.4  28.4  30.9  18.2  11.4   5.7  -4.5   6.2  12.0  10.9  11.1
 [73]  -2.0  -9.0  -1.4  15.4  19.1  -1.6  -5.4   5.4   7.8  -5.6  15.2  13.8
 [85] -18.8   7.1  17.1   9.3  -3.9  22.6   1.7  28.9 -21.3  21.2   8.2 -15.4
 [97]   3.2 -10.2  -6.2  14.1
Run Code Online (Sandbox Code Playgroud)


Kha*_*haa 5

我喜欢for loops翻译Rcpp是直截了当的.

Rcpp::cppFunction('DataFrame findSegment(NumericVector x, double threshold) {
  x.push_back(-1);
  int n = x.size(), startind, endind; 
  std::vector<int> startinds, endinds;
  bool insegment = false;
  for(int i=0; i<n; i++){
    if(!insegment){
      if(x[i] > threshold){        
        startind = i + 1;
        insegment = true;          }
    }else{
      if(x[i] < threshold){
        endind = i;
        insegment = false;
        startinds.push_back(startind); 
        endinds.push_back(endind);
      }
    }
  }
  return DataFrame::create(_["start"]= startinds, _["end"]= endinds);
}')
set.seed(1); test.vec <- rnorm(1e7,8,10); threshold <- 0;
system.time(findSegment(test.vec, threshold))

#   user  system elapsed 
#  0.045   0.000   0.045 

# @SimonG's solution
system.time(findSegments(test.vec, threshold))
#   user  system elapsed 
#  0.533   0.012   0.548 
Run Code Online (Sandbox Code Playgroud)