dplyr::slice in data.table [duplicate]

Posted 2020-02-06 07:48

This question already has answers here:

How to extract the first n rows per group? (2 answers)

Subset by group with data.table (1 answer)

What is the idiomatic data.table way to take the first 10 rows of each group, as the dplyr code below does?

library(dplyr)
# Keep rows 1 to 10 of every `b` group.
slice(group_by(df, b), 1:10)

I can do

library(data.table)
# Per-group subset: rows 1 to 10 of .SD within each `b` group.
df[, .SD[1:10], by = "b"]

but that appears much slower. Is there a better way?

set.seed(0)
# 500 groups of 500-1000 rows each: `b` is the group id, `a` a uniform draw.
# `replace = TRUE` is spelled out because `T` is an ordinary, reassignable
# variable.
group_id <- rep(1:500, sample(500:1000, 500, replace = TRUE))
df <- data.table(a = runif(length(group_id)), b = group_id)

# dplyr version: rows 1 to 10 of each `b` group.
f1 <- function(df) {
  by_b <- group_by(df, b)
  slice(by_b, 1:10)
}
# data.table version: subset .SD to rows 1:10 within each `b` group.
f2 <- function(df) {
  df[, .SD[1:10], by = "b"]
}

library(microbenchmark)
# Time both implementations; 100 evaluations each (the package default).
microbenchmark(f1(df), f2(df), times = 100L)
#Unit: milliseconds
#   expr      min       lq      mean   median        uq      max neval
# f1(df) 17.67435 19.50381  22.06026 20.50166  21.42668  78.3318   100
# f2(df) 69.69554 79.43387 119.67845 88.25585 106.38661 581.3067   100

========== Benchmarks with suggested methods ==========

set.seed(0)
# 500 groups of 500-1000 rows each: `b` is the group id, `a` a uniform draw.
# `replace = TRUE` is spelled out because `T` is an ordinary, reassignable
# variable.
group_id <- rep(1:500, sample(500:1000, 500, replace = TRUE))
df <- data.table(a = runif(length(group_id)), b = group_id)

# dplyr candidate: group by `b`, then keep rows 1 to 10 of each group.
use.slice <- function(df) {
  by_b <- group_by(df, b)
  slice(by_b, 1:10)
}
# data.table candidate: index .SD with 1:10 inside each `b` group.
IndexSD <- function(df) {
  df[, .SD[1:10], by = "b"]
}
# data.table candidate: collect the row numbers (.I) of the first 10 rows of
# each `b` group, then subset `df` once with them.
Index.I <- function(df) {
  # Clamp to .N: `.I[seq_len(10)]` would yield NA indices -- and therefore
  # all-NA rows -- for any group with fewer than 10 rows, unlike slice(1:10).
  row_idx <- df[, .I[seq_len(min(10L, .N))], by = b]$V1
  df[row_idx]
}
# data.table candidate: head() of .SD within each `b` group.
use.head <- function(df) {
  df[, head(.SD, 10), by = "b"]
}

library(microbenchmark)
# Compare all four candidates; timings are reported relative to each other.
microbenchmark(
  use.slice(df),
  IndexSD(df),
  Index.I(df),
  use.head(df),
  times = 100L,
  unit = "relative"
)

#Unit: relative
#          expr       min        lq      mean    median        uq       max neval
# use.slice(df)  9.804549 10.269234  9.167413  8.900060  8.782862  6.520270   100
#   IndexSD(df) 38.881793 42.548555 39.044095 38.636523 39.942621 18.981748   100
#   Index.I(df)  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000   100
#  use.head(df)  3.666898  4.033038  3.728299  3.408249  3.545258  3.951565   100

# Row-index approach: gather the row numbers (.I) of the first 10 rows of each
# `b` group, then subset `df` once.
f3 <- function(df) {
  # Clamp to .N so groups with fewer than 10 rows do not produce NA indices
  # (which would come back as all-NA rows).
  row_idx <- df[, .I[seq_len(min(10L, .N))], by = b]$V1
  df[row_idx]
}

# f1 and f2 are the dplyr and .SD versions from the question.
microbenchmark(f1(df), f2(df), f3(df), unit = "relative", times = 10L)
# Timings below were recorded with the unclamped `.I[seq_len(10)]` form; the
# clamp does not change the result on this data (every group has >= 500 rows).
#Unit: relative
#   expr       min        lq      mean    median        uq      max neval cld
# f1(df)  5.727822  5.480741  4.945486  5.672206  4.317531  5.10003    10   b
# f2(df) 24.572633 23.774534 17.842622 23.070634 16.099822 11.58287    10   c
# f3(df)  1.000000  1.000000  1.000000  1.000000  1.000000  1.00000    10   a