I'm a R newbie. I've got a vector
vec <- c(105,29,41,70,77,0,56,49,63,0,105)
and i would like to sum values till "0" occurs and then create a vector with such values, such as:
vec2 <- c(322,168,105)
But i really don't know where to start! Any suggestion?
Starting with this vector...
> vec
[1] 105 29 41 70 77 0 56 49 63 0 105
We can compute a logical TRUE/FALSE vector of where the zeroes are:
> vec == 0
[1] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
When you add FALSE and TRUE, FALSE is zero and TRUE is one, so if we add that vector up every time we get to a TRUE the value increases. So using cumsum
for the cumulative sum, we get:
> cumsum(vec==0)
[1] 0 0 0 0 0 1 1 1 1 2 2
Now that result defines the groups that we want to add up within, so let's split
vec
by that result:
> split(vec, cumsum(vec==0))
$`0`
[1] 105 29 41 70 77
$`1`
[1] 0 56 49 63
$`2`
[1] 0 105
So apart from the zeroes in the second and subsequent parts of the list, that's the numbers we want to add up. Because we are adding we can add the zeroes and it doesn't make any difference (but if you wanted the mean you would have to drop the zeroes). Now we use sapply
to iterate over list elements and compute the sum:
> sapply(split(vec, cumsum(vec==0)),sum)
0 1 2
322 168 105
Job done. Ignore the 0 1 2
labels.
Another option is by
as.numeric(by(vec, cumsum(vec == 0), sum))
#[1] 322 168 105
Benchmark
Benchmark comparison of the methods for a larger vector based on microbenchmark
# Create sample vector with N entries
set.seed(2018)
N <- 10000
vec <- sample(100, N, replace = T)
vec[sample(length(vec), 100)] <- 0
library(microbenchmark)
res <- microbenchmark(
vapply = {
I <- which(vec == 0)
vapply(1:(length(I)+1),
function(k) sum(vec[max(I[k-1],1):min(I[k], length(vec), na.rm = TRUE)]),
numeric(1))
},
by = {
as.numeric(by(vec, cumsum(vec == 0), sum))
},
aggregate = {
aggregate(vec, by = list(cumsum(vec == 0)), FUN = sum)[[2]]
},
split = {
sapply(split(vec, cumsum(vec == 0)), sum)
},
Reduce = {
ans <- numeric(0)
s <- n <- 0
Reduce(f = function (y,x) {
if(x == 0) {
ans <<- c(ans,s)
s <<- 0
}
n <<- n+1
s <<- x+s
if (n == length(vec))
ans <<- c(ans,s)
s
}, vec, init = 0, accumulate = TRUE)
ans
},
for_loop = {
I <- which(vec == 0)
n <- length(vec)
N <- length(I) + 1
res <- numeric(N)
for(k in seq_along(res)) {
if (k == 1) {
res[k] <- sum(vec[1:I[1]])
next
}
if (k == N) {
res[k] <- sum(vec[I[N-1]:n])
next
}
res[k] <- sum(vec[I[k-1]:I[k]])
}
res
}
)
res
# Unit: microseconds
# expr min lq mean median uq max
# vapply 435.658 487.4230 621.6155 511.3625 607.2005 6175.039
# by 3897.401 4187.2825 4721.3168 4436.5850 4936.2900 12365.351
# aggregate 4817.032 5392.0620 6002.2579 5831.2905 6310.3665 9782.524
# split 611.175 758.4485 895.2201 838.7665 957.0085 1516.556
# Reduce 21372.054 22169.9110 25363.8684 23022.6920 25503.6145 49255.714
# for_loop 15172.255 15846.5735 17252.6895 16445.7900 17572.7535 34401.827
library(ggplot2)
autoplot(res)
The aggregate
function is useful for this kind of thing. You create a grouping variable with cumsum
(similar to how @Spacedman explained). Using the sum
function as the aggregating operation. The [[2]]
at the end just extracts what you want from what aggregate
returns:
aggregate(vec, by = list(cumsum(vec == 0)), FUN = sum)[[2]]
[1] 322 168 105
With vapply
Here is an option with vapply
I <- which(vec == 0)
vapply(1:(length(I)+1),
function(k) sum(vec[max(I[k-1],1):min(I[k], length(vec), na.rm = TRUE)]),
numeric(1))
# [1] 322 168 105
With Reduce
Here is a solution using Reduce
ans <- numeric(0)
s <- n <- 0
Reduce(f = function (y,x) {
if(x == 0) {
ans <<- c(ans,s)
s <<- 0
}
n <<- n+1
s <<- x+s
if(n == length(vec))
ans <<- c(ans,s)
s
}, vec, init = 0, accumulate = TRUE)
ans
# [1] 322 168 105
With A Loop
Or maybe an old fashioned loop
I <- which(vec == 0)
n <- length(vec)
N <- length(I) + 1
res <- numeric(N)
for(k in seq_along(res)) {
if (k == 1) {
res[k] <- sum(vec[1:I[1]])
next
}
if (k == N) {
res[k] <- sum(vec[I[N-1]:n])
next
}
res[k] <- sum(vec[I[k-1]:I[k]])
}
res
# [1] 322 168 105
Benchmarking
Data
Here is the data used for benchmarking
# c.f. @MauritsEvers
# Create sample vector with N entries
set.seed(2018)
N <- 10000
vec <- sample(100, N, replace = T)
vec[sample(length(vec), 100)] <- 0
Functions
Here are the functions for the second benchmarking figures:
reduce <- function(vec) {
ans <- numeric(0)
s <- n <- 0
Reduce(f = function (y,x) {
if(x == 0) {
ans <<- c(ans,s)
s <<- 0
}
n <<- n+1
s <<- x+s
if(n == length(vec))
ans <<- c(ans,s)
s
}, vec, init = 0, accumulate = TRUE)
ans
}
Vapply <- function (vec) {
I <- which(vec == 0)
vapply(1:(length(I)+1),
function(k) sum(vec[max(I[k-1],1):min(I[k], length(vec), na.rm = TRUE)]),
numeric(1))
}
By <- function (vec) as.numeric(by(vec, cumsum(vec == 0), sum))
Split <- function (vec) sapply(split(vec, cumsum(vec==0)),sum)
Aggregate <- function (vec) aggregate(vec, by = list(cumsum(vec == 0)), FUN = sum)[[2]]
for_loop <- function(vec) {
I <- which(vec == 0)
n <- length(vec)
N <- length(I)+1
res <- numeric(N)
for(k in seq_along(res)) {
if (k == 1) {
res[k] <- sum(vec[1:I[1]])
next
}
if (k == N) {
res[k] <- sum(vec[I[N-1]:n])
next
}
res[k] <- sum(vec[I[k-1]:I[k]])
}
res
}
Rowsum <- function (vec) rowsum(vec, cumsum(vec == 0))
Benchmarking
Here are the two benchmarking processes combined:
# c.f. @MauritsEvers
resBoth <- microbenchmark::microbenchmark(
Vapply = {
I <- which(vec == 0)
vapply(1:(length(I)+1),
function(k) sum(vec[max(I[k-1],1):min(I[k], length(vec), na.rm = TRUE)]),
numeric(1))
},
Vapply(vec),
By = {
as.numeric(by(vec, cumsum(vec == 0), sum))
},
By(vec),
Aggregate = {
aggregate(vec, by = list(cumsum(vec == 0)), FUN = sum)[[2]]
},
Aggregate(vec),
Split = {
sapply(split(vec, cumsum(vec == 0)), sum)
},
Split(vec),
reduce = {
ans <- numeric(0)
s <- n <- 0
Reduce(f = function (y,x) {
if(x == 0) {
ans <<- c(ans,s)
s <<- 0
}
n <<- n+1
s <<- x+s
if (n == length(vec))
ans <<- c(ans,s)
s
}, vec, init = 0, accumulate = TRUE)
ans
},
reduce(vec),
for_loop = {
I <- which(vec == 0)
n <- length(vec)
N <- length(I) + 1
res <- numeric(N)
for(k in seq_along(res)) {
if (k == 1) {
res[k] <- sum(vec[1:I[1]])
next
}
if (k == N) {
res[k] <- sum(vec[I[N-1]:n])
next
}
res[k] <- sum(vec[I[k-1]:I[k]])
}
res
},
for_loop(vec),
Rowsum = {rowsum(vec, cumsum(vec == 0))},
Rowsum(vec),
times = 10^3
)
Results
Here are the benchmarking results
resBoth
# Unit: microseconds
# expr min lq mean median uq max neval cld
# Vapply 234.121 281.5280 358.0708 311.7955 343.5215 4775.018 1000 ab
# Vapply(vec) 234.850 278.6100 376.3956 306.3260 334.4050 14564.278 1000 ab
# By 1866.029 2108.7175 2468.1208 2209.0025 2370.5520 23316.045 1000 c
# By(vec) 1870.769 2120.5695 2473.1643 2217.3900 2390.6090 21039.762 1000 c
# Aggregate 2738.324 3015.6570 3298.0863 3117.9480 3313.2295 13328.404 1000 d
# Aggregate(vec) 2733.583 2998.1530 3295.6874 3109.1955 3349.1500 8277.694 1000 d
# Split 359.202 412.0800 478.0553 444.1710 492.3080 4622.220 1000 b
# Split(vec) 366.131 410.4395 475.2633 444.1715 490.3025 4601.799 1000 b
# reduce 10862.491 13062.3755 15353.2826 14465.0870 16559.3990 76305.463 1000 g
# reduce(vec) 10403.004 12448.9965 14658.4035 13825.9995 15893.3255 67337.080 1000 f
# for_loop 6687.724 7429.4670 8518.0470 7818.0250 9023.9955 27541.136 1000 e
# for_loop(vec) 123.624 145.8690 187.2201 157.5390 177.4140 9928.200 1000 a
# Rowsum 235.579 264.3880 305.7516 282.2570 322.7360 792.068 1000 ab
# Rowsum(vec) 239.590 264.9350 307.2508 284.8100 322.0060 1778.143 1000 ab
rowsum()
is known to be quite fast. We can use cumsum(vec == 0)
for the grouping.
c(rowsum(vec, cumsum(vec == 0)))
# [1] 322 168 105