I am looking for a tidyverse-solution that can count occurrences of unique values of TF
within groups, id
in the data datatbl
. When TF
changes I want to count both forward and backwards from that point. This counting should be stored in a new variable PM##
, so that PM##
holds both plus and minus to each unique shift in TF
.
This question is similar to a question I previously asked, but here I am specifically looking for a solution using tidyverse
tools. Uwe provided an elegant answer to the inital question using data.table
here.
If this question violates any SO policies please let me know and I'll be happy to reopen my initial question or append this an bounty-issue.
To illustrate my question with a minimal working example. I have data like this,
# install.packages(c("tidyverse"), dependencies = TRUE)
library(tibble)
tbl <- tibble(id = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
1, 1, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7),
TF = c(NA, 0, NA, 0, 0, 1, 1, 1, NA, 0, 0, NA, 0, 0,
0, 1, 1, 1, NA, NA, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1))
tbl
#> # A tibble: 30 x 2
#> id TF
#> <dbl> <dbl>
#> 1 0 NA
#> 2 0 0
#> 3 0 NA
#> 4 0 0
#> 5 0 0
#> 6 0 1
#> 7 0 1
#> 8 0 1
#> 9 0 NA
#> 10 0 0
#> # ... with 20 more rows
and this is what I am trying to obtain,
dfa <- tibble(id = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
1, 1, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7),
TF = c(NA, 0, NA, 0, 0, 1, 1, 1, NA, 0, 0, NA, 0, 0,
0, 1, 1, 1, NA, NA, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1),
PM01 = c(NA, -3, NA, -2, -1, 1, 2, 3, NA, NA, NA, NA, -3, -2, -1,
1, 2, 3, NA, NA, -2, -1, 1, NA, NA, NA, NA, NA, NA, NA),
PM02 = c(NA, NA, NA, NA, NA, -3, -2, -1, NA, 1, 2, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, -1, 1, 2, NA, NA, NA, NA, NA),
PM03 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, -2, -1, 1, NA, NA, NA, NA),
PM04 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, 1, NA, NA, NA),
PM05 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, 1, 2, 3)
)
dfa
#> # A tibble: 30 x 7
#> id TF PM01 PM02 PM03 PM04 PM05
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0 NA NA NA NA NA NA
#> 2 0 0 -3 NA NA NA NA
#> 3 0 NA NA NA NA NA NA
#> 4 0 0 -2 NA NA NA NA
#> 5 0 0 -1 NA NA NA NA
#> 6 0 1 1 -3 NA NA NA
#> 7 0 1 2 -2 NA NA NA
#> 8 0 1 3 -1 NA NA NA
#> 9 0 NA NA NA NA NA NA
#> 10 0 0 NA 1 NA NA NA
#> # ... with 20 more rows
Here is another tidyverse approach that uses dplyr
, tidyr
and zoo
(used for its na.locf
function) package:
Firstly, instead of dropping NAs in the TF
column and then join back as all the other suggested approaches (including the data.table
approach), I wrote a helper method here, that counts forward by chunks ignoring NAs;
forward_count <- function(v) {
valid <- !is.na(v)
valid_v <- v[valid]
chunk_size = head(rle(valid_v)$lengths, -1)
idx <- cumsum(chunk_size) + 1
ones <- rep(1, length(valid_v))
ones[idx] <- 1 - chunk_size
v[valid] <- cumsum(ones)
v
}
And it works as is required by count after the change:
v <- sample(c(NA, 0, 1), 15, replace = T)
v
# [1] NA NA NA 0 1 NA 1 NA 1 1 0 1 0 0 0
forward_count(v)
# [1] NA NA NA 1 1 NA 2 NA 3 4 1 1 1 2 3
Count before the change can be implemented by reverse the vector twice with this exact same function:
-rev(forward_count(rev(v)))
# [1] NA NA NA -1 -4 NA -3 NA -2 -1 -1 -1 -3 -2 -1
Now define the headers, count forward column as fd
, count backward column as bd
using dplyr
package:
library(dplyr); library(tidyr); library(zoo);
tidy_method <- function(df) {
df %>%
group_by(id) %>%
mutate(
rle_id = cumsum(diff(na.locf(c(0, TF))) != 0), # chunk id for constant TF
PM_fd = if_else( # PM count after change headers
rle_id == head(rle_id, 1),
"head", sprintf('PM%02d', rle_id)
),
PM_bd = if_else( # shift the header up as before change headers
rle_id == tail(rle_id, 1),
"tail", sprintf('PM%02d', rle_id+1)
),
fd = forward_count(TF), # after change count
bd = -rev(forward_count(rev(TF))), # before change count
rn = seq_along(id)) %>% # row number
gather(key, value, PM_fd, PM_bd) %>% # align headers with the count
mutate(count_ = if_else(key == "PM_fd", fd, bd)) %>%
select(-key) %>% spread(value, count_) %>% # reshaper PM column as headers
select(id, TF, rn, matches('PM')) %>% # drop no longer needed columns
arrange(id, rn) %>% select(-rn)
}
Timing compared with the data.table
method:
Define the data.table
method as:
dt_method <- function(df) {
tmp_dt <- setDT(df)[, rn := .I][!is.na(TF)][, rl := rleid(TF), by = id][
, c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)][]
res_dt <- tmp_dt[tmp_dt[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE][
rl == V1, PM := dn][rl == V1 + 1L, PM := up][
, dcast(.SD, id + TF + rn ~ sprintf("PM%02d", V1), value.var = "PM")][
df, on = .(rn, id, TF)][, -"rn"]
res_dt
}
Data: A medium sized data by repeating the sample data frame 200 times:
df_test <- bind_rows(rep(list(df), 200))
microbenchmark::microbenchmark(dt_method(df_test), tidy_method(df_test), times = 10)
#Unit: milliseconds
# expr min lq mean median uq max neval
# dt_method(df_test) 2321.5852 2439.8393 2490.8583 2456.1118 2557.4423 2834.2399 10
# tidy_method(df_test) 402.3624 412.2838 437.0801 414.5655 418.6564 540.9667 10
Order the data.table method result by id
and convert all column data types to numeric; the results from data.table
approach and tidyverse
are identical:
identical(
as.data.frame(dt_method(df_test)[order(id), lapply(.SD, as.numeric)]),
as.data.frame(tidy_method(df_test))
)
# [1] TRUE
Update with a bit optimized data.table function:
Should probably go to the old question, but maybe this will trigger some further optimization.
To keep things flowing I have played a bit with the data.table
function and get down to about twice of the execution time of the tidyverse
version - the bottleneck is the dcast()
function, see the screenshot from profvis
below:
dt_method <- function(dt_test) {
tmp_dt <- dt_test[, rn := .I][!is.na(TF)][, rl := rleid(TF), by = id][
, c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)][, ':='(
rl_PM = sprintf("PM%02d", rl),
United = paste(id, TF, rn, sep = '_')
)]
res_dt <- tmp_dt[, .(sprintf("PM%02d", seq_len(max(rl) - 1L)), seq_len(max(rl) - 1L)), by = .(id)] %>%
tmp_dt[., on = .(id), allow.cartesian = TRUE] %>%
.[rl == V2, PM := dn] %>%
.[rl == V2 + 1L, PM := up] %>%
dcast(., United ~ V1, value.var = "PM") %>%
.[, c('id', 'TF', 'rn') := lapply(tstrsplit(United, '_'), as.numeric)] %>%
.[dt_test, on = .(rn, id, TF)] %>% .[, -c('rn', 'United')]
res_dt
}
Pipes were needed to deal with some odd errors, but I still consider them allowed even for data.table
.
Microbenchmark results:
Unit: milliseconds
expr min lq mean median uq max neval
dt_method(dt_test) 868.1491 932.8076 1048.5077 1029.9609 1078.0735 1518.0327 10
tidy_method(df_test) 478.6824 515.5639 557.9644 565.9422 585.3143 622.1093 10
And identical()
with fixed order of columns:
identical(
dt_method(dt_test)[order(id), lapply(.SD, as.numeric)] %>% setcolorder(c('id', 'TF', setdiff(names(.), c('id', 'TF')))) %>% as.data.frame(),
as.data.frame(tidy_method(df_test))
)
profvis
timings:
Old part:
Using Uwe's answer as a base:
(Disclaimer: I am not using dplyr
too much, treated this as an exercise for myself, so it is for sure not dplyr
-optimal, see e.g. dcast
.)
library(data.table)
library(magrittr)
library(dplyr)
library(tibble)
df <- tibble(id = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
1, 1, 1, 1,7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7),
TF = c(NA, 0, NA, 0, 0, 1, 1, 1, NA, 0, 0, NA, 0, 0, 0,
1, 1, 1, NA, NA, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1))
dfa <- tibble(id = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
1, 1, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7),
TF = c(NA, 0, NA, 0, 0, 1, 1, 1, NA, 0, 0, NA, 0, 0,
0, 1, 1, 1, NA, NA, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1),
PM01 = c(NA, -3, NA, -2, -1, 1, 2, 3, NA, NA, NA, NA, -3, -2, -1,
1, 2, 3, NA, NA, -2, -1, 1, NA, NA, NA, NA, NA, NA, NA),
PM02 = c(NA, NA, NA, NA, NA, -3, -2, -1, NA, 1, 2, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, -1, 1, 2, NA, NA, NA, NA, NA),
PM03 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, -2, -1, 1, NA, NA, NA, NA),
PM04 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, 1, NA, NA, NA),
PM05 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, 1, 2, 3))
tmp_dt <- setDT(df)[, rn := .I][!is.na(TF)][, rl := rleid(TF), by = id][
, c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)][]
res_dt <- tmp_dt[tmp_dt[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE][
rl == V1, PM := dn][rl == V1 + 1L, PM := up][
, dcast(.SD, id + TF + rn ~ sprintf("PM%02d", V1), value.var = "PM")][
df, on = .(rn, id, TF)][, -"rn"]
res_dt
all.equal(res_dt, as.data.table(dfa))
As much tidyverse-sque as possible:
tmp_dplyr <- df %>%
# create row id column (required for final join to get NA rows back in)
mutate(rn = row_number()) %>%
# ignore NA rows
filter(complete.cases(.)) %>%
# number streaks of unique values within each group
group_by(id) %>%
mutate(rl = rleid(TF)) %>%
# create ascending and descending counts for each streak
# this is done once to avoid repeatedly creation of counts for each PM
# (slight performance gain)
group_by(id, rl) %>%
mutate(
up = seq_len(n()),
dn = -rev(seq_len(n()))
)
res_dplyr <- tmp_dplyr %>%
## Replicating tmp[tmp[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE]
group_by(id) %>%
## Part below can for sure be optimized for code length, it's just too early now...
transmute(rl = max(rl)) %>% # Cannot transmute id directly
unique() %>%
ungroup() %>%
slice(rep(1:n(), times = rl - 1L)) %>%
group_by(id) %>%
transmute(V1 = seq_len(max(rl) - 1L)) %>%
ungroup() %>%
right_join(tmp_dplyr, by = 'id') %>%
## End or replicating tmp[tmp[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE]
## Copy descending counts to rows before the switch and ascending counts to rows after the switch
mutate(
PM = ifelse(rl == V1, dn, NA),
PM = ifelse(rl == V1 + 1L, up, PM)
) %>%
## This is very not tidyverse-sque, but I don't get the gather/spread ...
dcast(id + TF + rn ~ sprintf("PM%02d", V1), value.var = "PM") %>%
full_join(df, by = c('rn', 'id', 'TF')) %>%
select(-rn)
all.equal( ## Using data.table all.equal
res_dplyr[do.call(order, res_dplyr),] %>% as.data.table(),
res_dt[do.call(order, res_dt),]
)