I'm struggling to create a new variable to indicate what letter, LET
, some groups, grp
, within id, id
, begin with.
In the following I'll illustrate my question. I have data like this,
library(dplyr); library(tidyr)
df <- tibble(id = rep(0:1, c(7, 10)),
grp = rep(c(0,1,0,1,2), c(3,4,2,5,3)),
LET = rep(c('A', 'B', 'A', 'B', 'A', 'B'), c(1,4, 3, 3, 4, 2)))
#> # A tibble: 17 x 3
#> id grp LET
#> <int> <dbl> <chr>
#> 1 0 0 A
#> 2 0 0 B
#> 3 0 0 B
#> 4 0 1 B
#> 5 0 1 B
#> 6 0 1 A
#> 7 0 1 A
#> 8 1 0 A
#> 9 1 0 B
#> 10 1 1 B
#> 11 1 1 B
#> 12 1 1 A
#> 13 1 1 A
#> 14 1 1 A
#> 15 1 2 A
#> 16 1 2 B
#> 17 1 2 B
I now want to crate a new variable %>% group_by(id, grp)
and I thought I could fill it with fill()
and mutate(grp_LET = …
, something like this;
df %>% group_by(id, grp) %>% fill(LET) %>% mutate(grp_LET = factor)
But I can't figure it out. What I am hoping to obtain is something like this. My desired outcome,
dfd <- tibble(id = rep(0:1, c(7, 10)),
grp = rep(c(0,1,0,1,2), c(3,4,2,5,3)),
LET = rep(c('A', 'B', 'A', 'B', 'A', 'B'), c(1,4, 3, 3, 4, 2)),
grp_LET = rep(c('A', 'B', 'A', 'B', 'A'), c(3, 4, 2, 5, 3)));dfd
#> # A tibble: 17 x 4
#> id grp LET grp_LET
#> <int> <dbl> <chr> <chr>
#> 1 0 0 A A
#> 2 0 0 B A
#> 3 0 0 B A
#> 4 0 1 B B
#> 5 0 1 B B
#> 6 0 1 A B
#> 7 0 1 A B
#> 8 1 0 A A
#> 9 1 0 B A
#> 10 1 1 B B
#> 11 1 1 B B
#> 12 1 1 A B
#> 13 1 1 A B
#> 14 1 1 A B
#> 15 1 2 A A
#> 16 1 2 B A
#> 17 1 2 B A
Any help on this would be appreciated.
update 2017-10-26 11:03:40Z, I made the microbenchmark comparison of the three answer your your enjoyment,
tbl <- df
dim(tbl)
#> [1] 17 3
# install.packages(c("dplyr"), dependencies = TRUE)
library(dplyr)
R_base_by_lmo <- function(x) {dat <- x
dat$grp_LET <- ave(dat$LET, dat[c("id", "grp")],
FUN=function(x) head(x, 1)); as_tibble(dat)
}
# mapply(all.equal, R_base_by_lmo(tbl), dfd)
# install.packages(c("data.table"), dependencies = TRUE)
library(data.table)
dt_by_akrun <- function(x) {foo <- copy(x)
setDT(foo)[, grp_LET := LET[1], .(id, grp)]
as_tibble(foo)
}
# mapply(all.equal, dt_by_akrun(tbl), dfd)
tidyverse_by_Psidom <- function(x) x %>% group_by(id,grp) %>% mutate(grp_LET=first(LET))
# mapply(all.equal, tidyverse_by_Psidom(df), dfd)
# install.packages(c("microbenchmark"), dependencies = TRUE)
require(microbenchmark)
x <- tbl
res <- microbenchmark(R_base_by_lmo(x),
dt_by_akrun(x),
tidyverse_by_Psidom(x), times = 67)
## Print results:
print(res)
Unit: milliseconds
expr min lq mean median uq max neval cld
R_base_by_lmo(x) 1.338758 1.419860 1.620292 1.547867 1.640043 4.098088 67 a
dt_by_akrun(x) 1.670019 1.776765 2.123219 1.859477 1.972842 11.922270 67 a
tidyverse_by_Psidom(x) 3.964432 4.065466 4.718041 4.128942 4.478950 15.939186 67 b
### Plot results:
boxplot(res)
update 2017-10-26 12:10:55Z, encourage by akrun’s comment I reran the the microbenchmark test on my production-data. Which more than revered the results.
dim('my production-data')
#> [1] 46104 11
x <- 'my production-data'
res2 <- microbenchmark(R_base_by_lmo(x),
dt_by_akrun(x),
tidyverse_by_Psidom(x), times = 8)
print(res2)
Unit: milliseconds
expr min lq mean median uq max neval cld
R_base_by_lmo(x) 28976.46868 29236.19450 29468.63955 29464.51339 29591.25206 30188.72785 8 b
dt_by_akrun(x) 74.18023 76.69274 85.75983 87.15791 91.62508 100.94692 8 a
tidyverse_by_Psidom(x) 38.38051 41.15552 42.83667 41.92207 44.53830 49.08109 8 a
boxplot(res2)