可以将文章内容翻译成中文,广告屏蔽插件可能会导致该功能失效(如失效,请关闭广告屏蔽插件后再试):
问题:
I find it hard to come up with a fast solution to the following problem:
I have a vector of observations, which indicates the time of observation of certain phenomena.
example <- c(0,0,0,1,0,1,1,0,0,0,-1,0,0,-1,-1,0,0,1,0,0);
Now I would like to eliminate zeros between particular observations, given that a certain phenomenon is assumed to continue until a contradictory observation is noted, i.e.,
if ''1'' was observed in third observation, I would like to have only ''1'' up to 11th element, when first ''-1'' is observed. So my desired output looks like:
desired.output <- c(0,0,0,1,1,1,1,1,1,1,-1,-1,-1,-1,-1,-1,-1,1,1,1);
> print(cbind(example, desired.output))
example desired.output
[1,] 0 0
[2,] 0 0
[3,] 0 0
[4,] 1 1
[5,] 0 1
[6,] 1 1
[7,] 1 1
[8,] 0 1
[9,] 0 1
[10,] 0 1
[11,] -1 -1
[12,] 0 -1
[13,] 0 -1
[14,] -1 -1
[15,] -1 -1
[16,] 0 -1
[17,] 0 -1
[18,] 1 1
[19,] 0 1
[20,] 0 1
My lame solution is
for (i in 1:length(example)){
if (example[i] != 0){
current <- example[i];
while ((example[i] != -current) & (i <= length(example))){
example[i] <- current;
i <- i+1;
}
}
}
I will appreciate any help with speeding this up.
回答1:
I'll try to be the one to offer a pure R solution:
example <- c(0,0,0,1,0,1,1,0,0,0,-1,0,0,-1,-1,0,0,1,0,0);
cs = cumsum(example!=0);
mch = match(cs, cs);
desired.output = example[mch];
print(cbind(example,desired.output))
UPD: It may be faster to calculate mch
above with
mch = findInterval(cs-1,cs)+1
UPD2: I like the answer by @Roland. It can be shortened to two lines:
NN = (example != 0);
desired.output = c(example[1], example[NN])[cumsum(NN) + 1L];
回答2:
I suspect that your 0
values are actually NA values. Here I make them NA
and than use na.locf
(Last Observation Carried Forward) from package zoo:
example <- c(0,0,0,1,0,1,1,0,0,0,-1,0,0,-1,-1,0,0,1,0,0)
res <- example
#res[res==0] <- NA
#the same but faster
res <- res/res*res
library(zoo)
res <- na.locf(res, na.rm = FALSE)
res[is.na(res)] <- 0
cbind(example, res)
# example res
# [1,] 0 0
# [2,] 0 0
# [3,] 0 0
# [4,] 1 1
# [5,] 0 1
# [6,] 1 1
# [7,] 1 1
# [8,] 0 1
# [9,] 0 1
# [10,] 0 1
# [11,] -1 -1
# [12,] 0 -1
# [13,] 0 -1
# [14,] -1 -1
# [15,] -1 -1
# [16,] 0 -1
# [17,] 0 -1
# [18,] 1 1
# [19,] 0 1
# [20,] 0 1
回答3:
I am quite sure somebody will approach a better pure-R solution, but my first try is to use only 1 loop as follows:
x <- c(0,0,0,1,0,1,1,0,0,0,-1,0,0,-1,-1,0,0,1,0,0)
last <- x[1]
for (i in seq_along(x)) {
if (x[i] == 0) x[i] <- last
else last <- x[i]
}
x
## [1] 0 0 0 1 1 1 1 1 1 1 -1 -1 -1 -1 -1 -1 -1 1 1 1
The above easily translates to an effective C++ code:
Rcpp::cppFunction('
NumericVector elimzeros(NumericVector x) {
int n = x.size();
NumericVector y(n);
double last = x[0];
for (int i=0; i<n; ++i) {
if (x[i] == 0)
y[i] = last;
else
y[i] = last = x[i];
}
return y;
}
')
elimzeros(x)
## [1] 0 0 0 1 1 1 1 1 1 1 -1 -1 -1 -1 -1 -1 -1 1 1 1
Some benchmarks:
set.seed(123L)
x <- sample(c(-1,0,1), replace=TRUE, 100000)
# ...
microbenchmark::microbenchmark(
gagolews(x),
gagolews_Rcpp(x),
Roland(x),
AndreyShabalin_match(x),
AndreyShabalin_findInterval(x),
AndreyShabalin_cumsum(x),
unit="relative"
)
## Unit: relative
## expr min lq median uq max neval
## gagolews(x) 167.264538 163.172532 162.703810 171.186482 110.604258 100
## gagolews_Rcpp(x) 1.000000 1.000000 1.000000 1.000000 1.000000 100
## Roland(x) 33.817744 34.374521 34.544877 35.633136 52.825091 100
## AndreyShabalin_match(x) 45.217805 43.819050 44.105279 44.800612 58.375625 100
## AndreyShabalin_findInterval(x) 45.191419 43.832256 44.283284 45.094304 23.819259 100
## AndreyShabalin_cumsum(x) 8.701682 8.367212 8.413992 9.938748 5.676467 100