tidyr; %>% group_by() mutate(foo = fill() )

2019-08-18 02:01发布

问题:

I'm struggling to create a new variable to indicate what letter, LET, some groups, grp, within id, id, begin with.

In the following I'll illustrate my question. I have data like this,

library(dplyr); library(tidyr)
df <- tibble(id = rep(0:1, c(7, 10)),
             grp = rep(c(0,1,0,1,2), c(3,4,2,5,3)),
             LET = rep(c('A', 'B', 'A', 'B', 'A', 'B'), c(1,4, 3, 3, 4, 2)))
#> # A tibble: 17 x 3
#>       id   grp   LET
#>    <int> <dbl> <chr>
#>  1     0     0     A
#>  2     0     0     B
#>  3     0     0     B
#>  4     0     1     B
#>  5     0     1     B
#>  6     0     1     A
#>  7     0     1     A
#>  8     1     0     A
#>  9     1     0     B
#> 10     1     1     B
#> 11     1     1     B
#> 12     1     1     A
#> 13     1     1     A
#> 14     1     1     A
#> 15     1     2     A
#> 16     1     2     B
#> 17     1     2     B

I now want to crate a new variable %>% group_by(id, grp) and I thought I could fill it with fill() and mutate(grp_LET = …, something like this;

df %>% group_by(id, grp) %>% fill(LET) %>% mutate(grp_LET = factor)

But I can't figure it out. What I am hoping to obtain is something like this. My desired outcome,

dfd <- tibble(id = rep(0:1, c(7, 10)),
             grp = rep(c(0,1,0,1,2), c(3,4,2,5,3)),
             LET = rep(c('A', 'B', 'A', 'B', 'A', 'B'), c(1,4, 3, 3, 4, 2)),
             grp_LET =  rep(c('A', 'B', 'A', 'B', 'A'), c(3, 4, 2, 5, 3)));dfd
#> # A tibble: 17 x 4
#>       id   grp   LET grp_LET
#>    <int> <dbl> <chr>   <chr>
#>  1     0     0     A       A
#>  2     0     0     B       A
#>  3     0     0     B       A
#>  4     0     1     B       B
#>  5     0     1     B       B
#>  6     0     1     A       B
#>  7     0     1     A       B
#>  8     1     0     A       A
#>  9     1     0     B       A
#> 10     1     1     B       B
#> 11     1     1     B       B
#> 12     1     1     A       B
#> 13     1     1     A       B
#> 14     1     1     A       B
#> 15     1     2     A       A
#> 16     1     2     B       A
#> 17     1     2     B       A

Any help on this would be appreciated.

update 2017-10-26 11:03:40Z, I made the microbenchmark comparison of the three answer your your enjoyment,

tbl <- df
dim(tbl)
#> [1] 17  3

# install.packages(c("dplyr"), dependencies = TRUE)
library(dplyr)

R_base_by_lmo <- function(x) {dat <- x
             dat$grp_LET <- ave(dat$LET, dat[c("id", "grp")],
             FUN=function(x) head(x, 1)); as_tibble(dat) 
          }
# mapply(all.equal, R_base_by_lmo(tbl), dfd)

# install.packages(c("data.table"), dependencies = TRUE)
library(data.table) 
dt_by_akrun  <- function(x) {foo <- copy(x)
                  setDT(foo)[, grp_LET := LET[1], .(id, grp)]
                   as_tibble(foo)
               }
# mapply(all.equal, dt_by_akrun(tbl), dfd)
tidyverse_by_Psidom <- function(x) x %>% group_by(id,grp) %>% mutate(grp_LET=first(LET))
# mapply(all.equal, tidyverse_by_Psidom(df), dfd)


# install.packages(c("microbenchmark"), dependencies = TRUE)
require(microbenchmark)

x <- tbl
res <- microbenchmark(R_base_by_lmo(x),
                      dt_by_akrun(x),
                      tidyverse_by_Psidom(x), times = 67)

## Print results:
print(res)
Unit: milliseconds                                                              
                   expr      min       lq     mean   median       uq       max neval cld
       R_base_by_lmo(x) 1.338758 1.419860 1.620292 1.547867 1.640043  4.098088    67   a 
         dt_by_akrun(x) 1.670019 1.776765 2.123219 1.859477 1.972842 11.922270    67   a 
 tidyverse_by_Psidom(x) 3.964432 4.065466 4.718041 4.128942 4.478950 15.939186    67   b

### Plot results:
boxplot(res)

update 2017-10-26 12:10:55Z, encourage by akrun’s comment I reran the the microbenchmark test on my production-data. Which more than revered the results.

dim('my production-data')
#> [1] 46104    11
x <- 'my production-data'
res2 <- microbenchmark(R_base_by_lmo(x),
                      dt_by_akrun(x),
                      tidyverse_by_Psidom(x), times = 8)
print(res2)
Unit: milliseconds
                   expr         min          lq        mean      median          uq         max neval cld
       R_base_by_lmo(x) 28976.46868 29236.19450 29468.63955 29464.51339 29591.25206 30188.72785     8   b
         dt_by_akrun(x)    74.18023    76.69274    85.75983    87.15791    91.62508   100.94692     8   a 
 tidyverse_by_Psidom(x)    38.38051    41.15552    42.83667    41.92207    44.53830    49.08109     8   a 

boxplot(res2)

回答1:

Seems you need the first LET for each group; You can extract the first element from vector LET for each group, mutate will broadcast/cycle the value within the group:

df %>% group_by(id, grp) %>% mutate(grp_LET = first(LET))

# A tibble: 17 x 4
# Groups:   id, grp [5]
#      id   grp   LET grp_LET
#   <int> <dbl> <chr>   <chr>
# 1     0     0     A       A
# 2     0     0     B       A
# 3     0     0     B       A
# 4     0     1     B       B
# 5     0     1     B       B
# 6     0     1     A       B
# 7     0     1     A       B
# 8     1     0     A       A
# 9     1     0     B       A
#10     1     1     B       B
#11     1     1     B       B
#12     1     1     A       B
#13     1     1     A       B
#14     1     1     A       B
#15     1     2     A       A
#16     1     2     B       A
#17     1     2     B       A


回答2:

In base R, just use ave and head:

dat$grp_let <- ave(dat$LET, dat[c("id", "grp")], FUN=function(x) head(x, 1))

This returns

dat
   id grp LET grp_let
1   0   0   A       A
2   0   0   B       A
3   0   0   B       A
4   0   1   B       B
5   0   1   B       B
6   0   1   A       B
7   0   1   A       B
8   1   0   A       A
9   1   0   B       A
10  1   1   B       B
11  1   1   B       B
12  1   1   A       B
13  1   1   A       B
14  1   1   A       B
15  1   2   A       A
16  1   2   B       A
17  1   2   B       A


回答3:

Or we can use data.table

library(data.table)
setDT(df)[, grp_LET := LET[1], .(id, grp)]
df
#    id grp LET grp_LET
# 1:  0   0   A       A
# 2:  0   0   B       A
# 3:  0   0   B       A
# 4:  0   1   B       B
# 5:  0   1   B       B
# 6:  0   1   A       B
# 7:  0   1   A       B
# 8:  1   0   A       A
# 9:  1   0   B       A
#10:  1   1   B       B
#11:  1   1   B       B
#12:  1   1   A       B
#13:  1   1   A       B
#14:  1   1   A       B
#15:  1   2   A       A
#16:  1   2   B       A
#17:  1   2   B       A