可以将文章内容翻译成中文,广告屏蔽插件可能会导致该功能失效(如失效，请关闭广告屏蔽插件后再试):

问题:

Suppose I have this data:

  # Sample data: three stretches of non-zero counts separated by long runs of
  # zeros (positions 6-17, 34-58 and 72-89).
  x <- c(14, 14, 6, 7, 14,
         rep(0, 12),
         9, 1, 3, 8, 9, 15, 9, 8, 13, 8, 4, 6, 7, 10, 13, 3,
         rep(0, 25),
         4, 7, 4, 5, 16, 5, 5, 9, 4, 4, 9, 8, 2,
         rep(0, 18))

x
 [1] 14 14  6  7 14  0  0  0  0  0  0  0  0  0  0  0  0  9  1  3  8  9 15  9  8
[26] 13  8  4  6  7 10 13  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
[51]  0  0  0  0  0  0  0  0  4  7  4  5 16  5  5  9  4  4  9  8  2  0  0  0  0
[76]  0  0  0  0  0  0  0  0  0  0  0  0  0  0

I want to recover the indices beginning where there are more than 3 zeroes in a row and terminating with the last 0 before a nonzero.

For example,

I would get

6, 17 for the first rash of zeroes, etc.

回答1:

Here are two base R approaches:

1) rle First run rle and then compute ok to pick out the sequences of zeros that are more than 3 long. We then compute the starts and ends of all repeated sequences subsetting to the ok ones at the end.

# Run-length encode x and report the first and last index of every run of
# zeros that is more than 3 long.
with(rle(x), {
  ok <- values == 0 & lengths > 3   # runs of zeros longer than 3
  ends <- cumsum(lengths)           # last index of every run
  starts <- ends - lengths + 1      # first index of every run
  # Subset the vectors before building the data frame so the rows are
  # numbered 1, 2, 3, ... rather than carrying the original run numbers.
  data.frame(starts = starts[ok], ends = ends[ok])
})

giving:

  starts ends
1      6   17
2     34   58
3     72   89

2) gregexpr Take the sign of each number -- for non-negative data such as this, that will be 0 or 1 -- and then concatenate those into one long string. Then use gregexpr to find the locations of runs of at least 4 zeros. The result gives the starts, and the ends can be computed from the starts plus the match.length attribute minus 1.

# Encode x as a string with one character per element: "0" for a zero and
# "1" otherwise. abs() keeps negative values from becoming "-1", which would
# occupy two characters and shift every later position.
s <- paste(sign(abs(x)), collapse = "")
# Match every run of at least 4 consecutive zeros.
g <- gregexpr("0{4,}", s)[[1]]
len <- attr(g, "match.length")
# gregexpr() returns -1 (with match.length -1) when nothing matches; drop that
# placeholder so an input without long zero runs yields zero rows.
found <- which(g > 0)
data.frame(starts = g[found], ends = g[found] + len[found] - 1)

giving:

  starts ends
1      6   17
2     34   58
3     72   89

回答2:

# Locate every run of zeros in x.
# The zero indicator is padded with FALSE on both sides so that a run touching
# the first or the last element of x still yields a matching start/end pair.
is_zero <- c(FALSE, x == 0, FALSE)
edges <- diff(is_zero)
Starts <- which(edges == 1)        # index of the first zero of each run
Ends   <- which(edges == -1) - 1   # index of the last zero of each run

Starts
[1]  6 34 72
Ends
[1] 17 58 89

This works for your test data, but it accepts any run of zeros, including short ones. To ensure that you keep only runs of more than n zeros, add:

n = 3
# A run spanning Starts[i]..Ends[i] contains Ends[i] - Starts[i] + 1 zeros;
# keep only the runs with more than n of them.
Long = which(Ends - Starts + 1 > n)
Starts = Starts[Long]
Ends = Ends[Long]

回答3:

Using dplyr: take the diff of the zero indicator (x == 0). Wherever the diff is not 0, the element starts a new group, so the cumsum of those change points gives a group id for every run.

library(dplyr)

df <- data.frame(x = x, rownumber = seq_along(x))
# A new group starts wherever x switches between zero and non-zero.
df$Groupid <- cumsum(c(0, diff(df$x == 0)) != 0)
df %>%
  group_by(Groupid) %>%
  summarize(start = first(rownumber), end = last(rownumber),
            number = first(x), size = n()) %>%
  # Keep only groups of zeros with more than 3 members, as the question asks
  # (size >= 3 would also admit runs of exactly three zeros).
  filter(number == 0 & size > 3)
# A tibble: 3 x 5
  Groupid start   end number  size
    <int> <int> <int>  <dbl> <int>
1       1     6    17      0    12
2       3    34    58      0    25
3       5    72    89      0    18

回答4:

If x happens to be a column of a data.table you can do

library(data.table)
dt <- data.table(x = x)

# For each run of equal values, return the first and last row number when the
# run consists of more than 3 zeros. Groups failing the test return NULL and
# are dropped from the result.
# `&&` is the scalar operator intended for `if` conditions; it also skips the
# all() scan for runs that are too short.
dt[, if (.N > 3 && all(x == 0)) .(starts = first(.I), ends = last(.I))
   , by = rleid(x)]

#    rleid starts ends
# 1:     5      6   17
# 2:    22     34   58
# 3:    34     72   89

Explanation:

rleid(x) gives an ID (integer) for each element in x indicating which "run" the element is a member of, where "run" means a sequence of adjacent equal values.
dt[, <code>, by = rleid(x)] partitions dt according to rleid(x) and computes <code> for each subset of dt's rows. The results are stacked together in a single data.table.
.N is the number of elements in the given subset
.I is the vector of row numbers corresponding to the subset
first and last give the first and last element of a vector
.(<stuff>) is the same as list(<stuff>)

The rleid function, the by grouping within the brackets, the .N and .I symbols, and the first and last functions are all part of the data.table package.