using tidyverse; counting after and before change

2019-02-12 05:30发布

站内文章 / 后端开发

69 0

迷人小祖宗

女 | 书童

私信

可以将文章内容翻译成中文,广告屏蔽插件可能会导致该功能失效(如失效，请关闭广告屏蔽插件后再试):

问题:

I am looking for a tidyverse-solution that can count occurrences of unique values of TF within groups, id in the data datatbl. When TF changes I want to count both forward and backwards from that point. This counting should be stored in a new variable PM##, so that PM## holds both plus and minus to each unique shift in TF.

This question is similar to a question I previously asked, but here I am specifically looking for a solution using tidyverse tools. Uwe provided an elegant answer to the inital question using data.table here.

If this question violates any SO policies please let me know and I'll be happy to reopen my initial question or append this an bounty-issue.

To illustrate my question with a minimal working example. I have data like this,

# install.packages(c("tidyverse"), dependencies = TRUE)
library(tibble)

tbl <- tibble(id = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
                     1, 1, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7), 
              TF = c(NA, 0, NA, 0, 0, 1, 1, 1, NA, 0, 0, NA, 0, 0,
                     0, 1, 1, 1, NA, NA, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1))
tbl
#> # A tibble: 30 x 2
#>       id    TF
#>    <dbl> <dbl>
#>  1     0    NA
#>  2     0     0
#>  3     0    NA
#>  4     0     0
#>  5     0     0
#>  6     0     1
#>  7     0     1
#>  8     0     1
#>  9     0    NA
#> 10     0     0
#> # ... with 20 more rows

and this is what I am trying to obtain,

dfa <- tibble(id = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
                     1, 1, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7),
              TF = c(NA, 0, NA, 0, 0, 1, 1, 1, NA, 0, 0, NA, 0, 0,
                     0, 1, 1, 1, NA, NA, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1),
              PM01 = c(NA, -3, NA, -2, -1, 1, 2, 3, NA, NA, NA, NA, -3, -2, -1,
                       1, 2, 3, NA, NA, -2, -1, 1, NA, NA, NA, NA, NA, NA, NA),
              PM02 = c(NA, NA, NA, NA, NA, -3, -2, -1, NA, 1, 2, NA, NA, NA, NA,
                       NA, NA, NA, NA, NA, NA, NA, -1, 1, 2, NA, NA, NA, NA, NA),
              PM03 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
                       NA, NA, NA, NA, NA, NA, NA, NA, -2, -1, 1, NA, NA, NA, NA),
              PM04 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
                       NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, 1, NA, NA, NA),
              PM05 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
                       NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, 1, 2, 3)
               )

dfa
#> # A tibble: 30 x 7
#>       id    TF  PM01  PM02  PM03  PM04  PM05
#>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#>  1     0    NA    NA    NA    NA    NA    NA
#>  2     0     0    -3    NA    NA    NA    NA
#>  3     0    NA    NA    NA    NA    NA    NA
#>  4     0     0    -2    NA    NA    NA    NA
#>  5     0     0    -1    NA    NA    NA    NA
#>  6     0     1     1    -3    NA    NA    NA
#>  7     0     1     2    -2    NA    NA    NA
#>  8     0     1     3    -1    NA    NA    NA
#>  9     0    NA    NA    NA    NA    NA    NA
#> 10     0     0    NA     1    NA    NA    NA
#> # ... with 20 more rows

回答1:

Here is another tidyverse approach that uses dplyr, tidyr and zoo (used for its na.locf function) package:

Firstly, instead of dropping NAs in the TF column and then join back as all the other suggested approaches (including the data.table approach), I wrote a helper method here, that counts forward by chunks ignoring NAs;

forward_count <- function(v) {
    valid <- !is.na(v)
    valid_v <- v[valid]
    chunk_size = head(rle(valid_v)$lengths, -1)
    idx <- cumsum(chunk_size) + 1
    ones <- rep(1, length(valid_v))
    ones[idx] <- 1 - chunk_size
    v[valid] <- cumsum(ones)
    v
}

And it works as is required by count after the change:

v <- sample(c(NA, 0, 1), 15, replace = T)
v
# [1] NA NA NA  0  1 NA  1 NA  1  1  0  1  0  0  0
forward_count(v)
# [1] NA NA NA  1  1 NA  2 NA  3  4  1  1  1  2  3

Count before the change can be implemented by reverse the vector twice with this exact same function:

-rev(forward_count(rev(v)))
# [1] NA NA NA -1 -4 NA -3 NA -2 -1 -1 -1 -3 -2 -1

Now define the headers, count forward column as fd, count backward column as bd using dplyr package:

library(dplyr); library(tidyr); library(zoo);

tidy_method <- function(df) {
    df %>% 
        group_by(id) %>% 
        mutate(
            rle_id = cumsum(diff(na.locf(c(0, TF))) != 0),   # chunk id for constant TF
            PM_fd = if_else(                 # PM count after change headers
                rle_id == head(rle_id, 1), 
                "head", sprintf('PM%02d', rle_id)
            ), 
            PM_bd = if_else(                 # shift the header up as before change headers
                rle_id == tail(rle_id, 1), 
                "tail", sprintf('PM%02d', rle_id+1)
            ), 
            fd = forward_count(TF),             # after change count
            bd = -rev(forward_count(rev(TF))),  # before change count
            rn = seq_along(id)) %>%             # row number
        gather(key, value, PM_fd, PM_bd) %>%    # align headers with the count
        mutate(count_ = if_else(key == "PM_fd", fd, bd)) %>%
        select(-key) %>% spread(value, count_) %>%    # reshaper PM column as headers
        select(id, TF, rn, matches('PM')) %>%  # drop no longer needed columns
        arrange(id, rn) %>% select(-rn)
}

Timing compared with the data.table method:

Define the data.table method as:

dt_method <- function(df) {
    tmp_dt <- setDT(df)[, rn := .I][!is.na(TF)][, rl := rleid(TF), by = id][
        , c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)][]

    res_dt <- tmp_dt[tmp_dt[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE][
        rl == V1, PM := dn][rl == V1 + 1L, PM := up][
            , dcast(.SD, id + TF + rn ~ sprintf("PM%02d", V1), value.var = "PM")][
                df, on = .(rn, id, TF)][, -"rn"]
    res_dt
}

Data: A medium sized data by repeating the sample data frame 200 times:

df_test <- bind_rows(rep(list(df), 200))

microbenchmark::microbenchmark(dt_method(df_test), tidy_method(df_test), times = 10)
#Unit: milliseconds
#                 expr       min        lq      mean    median        uq       max neval
#   dt_method(df_test) 2321.5852 2439.8393 2490.8583 2456.1118 2557.4423 2834.2399    10
# tidy_method(df_test)  402.3624  412.2838  437.0801  414.5655  418.6564  540.9667    10

Order the data.table method result by id and convert all column data types to numeric; the results from data.table approach and tidyverse are identical:

identical(
    as.data.frame(dt_method(df_test)[order(id), lapply(.SD, as.numeric)]), 
    as.data.frame(tidy_method(df_test))
)
# [1] TRUE

回答2:

Update with a bit optimized data.table function:

Should probably go to the old question, but maybe this will trigger some further optimization.

To keep things flowing I have played a bit with the data.table function and get down to about twice of the execution time of the tidyverse version - the bottleneck is the dcast() function, see the screenshot from profvis below:

dt_method <- function(dt_test) {
  tmp_dt <- dt_test[, rn := .I][!is.na(TF)][, rl := rleid(TF), by = id][
    , c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)][, ':='(
      rl_PM = sprintf("PM%02d", rl),
      United = paste(id, TF, rn, sep = '_')
    )]

  res_dt <- tmp_dt[, .(sprintf("PM%02d", seq_len(max(rl) - 1L)), seq_len(max(rl) - 1L)), by = .(id)] %>% 
    tmp_dt[., on = .(id), allow.cartesian = TRUE] %>%  
    .[rl == V2, PM := dn] %>%
    .[rl == V2 + 1L, PM := up] %>%
    dcast(., United ~ V1, value.var = "PM") %>%
    .[, c('id', 'TF', 'rn') := lapply(tstrsplit(United, '_'), as.numeric)] %>%
    .[dt_test, on = .(rn, id, TF)] %>% .[, -c('rn', 'United')]
  res_dt
}

Pipes were needed to deal with some odd errors, but I still consider them allowed even for data.table.

Microbenchmark results:

Unit: milliseconds
                 expr      min       lq      mean    median        uq       max neval
   dt_method(dt_test) 868.1491 932.8076 1048.5077 1029.9609 1078.0735 1518.0327    10
 tidy_method(df_test) 478.6824 515.5639  557.9644  565.9422  585.3143  622.1093    10

And identical() with fixed order of columns:

identical(
  dt_method(dt_test)[order(id), lapply(.SD, as.numeric)] %>% setcolorder(c('id', 'TF', setdiff(names(.), c('id', 'TF')))) %>% as.data.frame(),
  as.data.frame(tidy_method(df_test))
)

profvis timings:

Old part:

Using Uwe's answer as a base:

(Disclaimer: I am not using dplyr too much, treated this as an exercise for myself, so it is for sure not dplyr-optimal, see e.g. dcast.)

library(data.table)
library(magrittr)
library(dplyr)
library(tibble)

df <- tibble(id = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 
                    1, 1, 1, 1,7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7),
             TF = c(NA, 0, NA, 0, 0, 1, 1, 1, NA, 0, 0, NA, 0, 0, 0,
                    1, 1, 1, NA, NA, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1))

dfa <- tibble(id = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
                     1, 1, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7),
              TF = c(NA, 0, NA, 0, 0, 1, 1, 1, NA, 0, 0, NA, 0, 0,
                     0, 1, 1, 1, NA, NA, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1),
              PM01 = c(NA, -3, NA, -2, -1, 1, 2, 3, NA, NA, NA, NA, -3, -2, -1,
                       1, 2, 3, NA, NA, -2, -1, 1, NA, NA, NA, NA, NA, NA, NA),
              PM02 = c(NA, NA, NA, NA, NA, -3, -2, -1, NA, 1, 2, NA, NA, NA, NA,
                       NA, NA, NA, NA, NA, NA, NA, -1, 1, 2, NA, NA, NA, NA, NA),
              PM03 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
                       NA, NA, NA, NA, NA, NA, NA, NA, -2, -1, 1, NA, NA, NA, NA),
              PM04 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
                       NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, 1, NA, NA, NA),
              PM05 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
                       NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, 1, 2, 3))

tmp_dt <- setDT(df)[, rn := .I][!is.na(TF)][, rl := rleid(TF), by = id][
  , c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)][]

res_dt <- tmp_dt[tmp_dt[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE][
  rl == V1, PM := dn][rl == V1 + 1L, PM := up][
    , dcast(.SD, id + TF + rn ~ sprintf("PM%02d", V1), value.var = "PM")][
      df, on = .(rn, id, TF)][, -"rn"]
res_dt

all.equal(res_dt, as.data.table(dfa))

As much tidyverse-sque as possible:

tmp_dplyr <- df %>%
  # create row id column (required for final join to get NA rows back in)
  mutate(rn = row_number()) %>%
  # ignore NA rows 
  filter(complete.cases(.)) %>%
  # number streaks of unique values within each group
  group_by(id) %>%
  mutate(rl = rleid(TF)) %>%
  # create ascending and descending counts for each streak
  # this is done once to avoid repeatedly creation of counts for each PM 
  # (slight performance gain)
  group_by(id, rl) %>%
  mutate(
    up = seq_len(n()),
    dn = -rev(seq_len(n()))
  )

res_dplyr <- tmp_dplyr %>%
  ## Replicating tmp[tmp[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE]
  group_by(id) %>%
  ## Part below can for sure be optimized for code length, it's just too early now...
  transmute(rl = max(rl)) %>% # Cannot transmute id directly
  unique() %>%
  ungroup() %>%
  slice(rep(1:n(), times = rl - 1L)) %>%
  group_by(id) %>%
  transmute(V1 = seq_len(max(rl) - 1L)) %>%
  ungroup() %>%
  right_join(tmp_dplyr, by = 'id') %>%
  ## End or replicating tmp[tmp[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE]
  ## Copy descending counts to rows before the switch and ascending counts to rows after the switch
  mutate(
    PM = ifelse(rl == V1, dn, NA),
    PM = ifelse(rl == V1 + 1L, up, PM)
  ) %>%
  ## This is very not tidyverse-sque, but I don't get the gather/spread ...
  dcast(id + TF + rn ~ sprintf("PM%02d", V1), value.var = "PM") %>%
  full_join(df, by = c('rn', 'id', 'TF')) %>%
  select(-rn)

all.equal( ## Using data.table all.equal
  res_dplyr[do.call(order, res_dplyr),] %>% as.data.table(),
  res_dt[do.call(order, res_dt),]
)

回答3:

I had a answer without data.table but it was not using dplyr. Here is my attempt using dplyr:

        #Remove the NAs 
dfr <-  df %>% filter(!is.na(TF)) %>% 
  # group by id
  group_by(id) %>% 
  # Calculate the rle on TF for each group
  do(., mrle = rle(.$TF)) %>% mutate(Total=sum(mrle$lengths)) %>%
  # Trasform the rle result in a data.frame counting the values after and before changes
  do( {
  t<- .$mrle
  #for each length generate the columns
  res <- as.data.frame(lapply(seq_along(t$lengths[-length(t$lengths)]), function(i) {

      #before change counts
      n1 <- t$lengths[i]
      #position  the counts
      if(i==1) {
        before <- 0
      } else {
        before <- sum(t$lengths[1:i-1])
      }

      #after change conts
      n2 <- t$lengths[i+1]

      if(i == (length(t$lengths)-1))
        after  <- 0
      else
        after <- .$Total - before - n1 - n2

      # assemble the column
      c(rep(NA,before),-n1:-1,1:n2, rep(NA,after))

    } ))

  colnames(res) <- paste0("PM", 1:ncol(res))
  #preserve the id
  cbind(id=.$id,res)

 })

#Join with the original data.frame
res <-  df %>% mutate(rn = row_number()) %>% filter(!is.na(TF)) %>% bind_cols(dfr) %>% right_join( df %>% mutate(rn = row_number()) ) %>% select(-rn, -id1)

#Verify
mapply(all.equal, dfa,res)
#  id   TF PM01 PM02 PM03 PM04 PM05 
#TRUE TRUE TRUE TRUE TRUE TRUE TRUE