I have a data.table
first and I want to obtain a subset based on certain conditions, for example, I have
library(data.table)
dt <- data.table(rn=1:10, B=rep(1:2, 5))
dt
# rn B
# 1: 1 1
# 2: 2 2
# 3: 3 1
# 4: 4 2
# 5: 5 1
# 6: 6 2
# 7: 7 1
# 8: 8 2
# 9: 9 1
#10: 10 2`
I know the first column name but I don't know in advance the name of the second column, which is instead stored in a character vector:
nameAsVect <- "B"
Let's say that I want to obtain the following:
dt[rn>5 & B==2, ]
# rn B
#1: 6 2
#2: 8 2
#3: 10 2`
I thought I could do:
setkeyv(dt, c("rn", nameAsVect))
max.count <- max(dt[, nameAsVect, with=FALSE])
dt[J(5:max(rn), max.count), ]
# rn B
#1: 5 2
#2: 6 2
#3: 7 2
#4: 8 2
#5: 9 2
#6: 10 2
But I can't understand why the values 5, 7, and 9 in column rn
are included. I can obtain what I want with: dt[rn>=5 & get(nameAsVect) == max.count]
but I thought that the first approach, if it worked, would have been faster with large tables.
Any insight?
Thanks
There are alternatives to OP's approach which do not require to set keys beforehand
Vector scan & get()
dt[rn >= 5 & get(nameAsVect) == max(get(nameAsVect))]
rn B
1: 6 2
2: 8 2
3: 10 2
Vector scan & eval(parse())
Another approach suggested by Matt Dowle in his answer to Select / assign to data.table variables which names are stored in a character vector:
eval(parse(text = sprintf("dt[rn >= 5 & %s == max(%s)]", nameAsVect, nameAsVect)))
rn B
1: 6 2
2: 8 2
3: 10 2
Non-equi join
With version v1.9.8 (on CRAN 25 Nov 2016), data.table
has gained the ability to do non-equi joins.
max.count <- dt[, max(get(nameAsVect))]
dt[dt[.(5, max.count), on = c("rn>=V1", paste0(nameAsVect, "==V2")), which = TRUE]]
rn B
1: 6 2
2: 8 2
3: 10 2
or (my preferred way)
mdt <- dt[, c(.(rn = 5), lapply(.SD, max)), .SDcols = nameAsVect]
dt[dt[mdt, on = c("rn>=rn", nameAsVect), which = TRUE]]
rn B
1: 6 2
2: 8 2
3: 10 2
Benchmark
Create benchmark data:
n_row <- 1e6L
set.seed(123L)
DT <- data.table(
rn = sample(1:10, n_row, TRUE),
B = sample(1:2, n_row, TRUE)
)
Run the benchmark:
library(microbenchmark)
bm <- microbenchmark(
vec_scan_hard_coded = {
dt <- copy(DT)
dt[rn >= 5L & B == 2L]
},
OP_keyed = {
dt <- copy(DT)
setkeyv(dt, c("rn", nameAsVect))
max.count <- max(dt[, nameAsVect, with=FALSE])
dt[J(5:max(rn), max.count), nomatch = 0L]
},
vec_scan_get = {
dt <- copy(DT)
dt[rn >= 5 & get(nameAsVect) == max(get(nameAsVect))]
},
vec_scan_eval_parse = {
dt <- copy(DT)
eval(parse(text = sprintf("dt[rn >= 5 & %s == max(%s)]", nameAsVect, nameAsVect)))
},
nej1 = {
dt <- copy(DT)
max.count <- dt[, max(get(nameAsVect))]
dt[dt[.(5, max.count), on = c("rn>=V1", paste0(nameAsVect, "==V2")), which = TRUE]]
},
nej1_keyed = {
dt <- copy(DT)
setkeyv(dt, c("rn", nameAsVect))
max.count <- dt[, max(get(nameAsVect))]
dt[dt[.(5, max.count), on = c("rn>=V1", paste0(nameAsVect, "==V2")), which = TRUE]]
},
nej2 = {
dt <- copy(DT)
mdt <- dt[, c(.(rn = 5), lapply(.SD, max)), .SDcols = nameAsVect]
dt[dt[mdt, on = c("rn>=rn", nameAsVect), which = TRUE]]
},
nej2_keyed = {
dt <- copy(DT)
setkeyv(dt, c("rn", nameAsVect))
mdt <- dt[, c(.(rn = 5), lapply(.SD, max)), .SDcols = nameAsVect]
dt[dt[mdt, on = c("rn>=rn", nameAsVect), which = TRUE]]
},
times = 100L
)
print(bm)
For 1 M rows and a result set which is approximately 300 k rows, the vector scan approaches are the fastest:
Unit: milliseconds
expr min lq mean median uq max neval cld
vec_scan_hard_coded 19.03159 20.86890 42.70820 24.38040 27.57417 219.5682 100 a
OP_keyed 31.49025 34.50825 52.46168 37.74204 40.84953 194.7676 100 a
vec_scan_get 20.60384 25.75461 46.37579 27.29287 29.55892 185.5867 100 a
vec_scan_eval_parse 20.81188 23.92598 36.81940 26.69742 29.27687 183.5323 100 a
nej1 53.85361 59.32608 85.32623 62.12509 65.15083 227.1221 100 b
nej1_keyed 52.89946 58.37457 77.38969 61.03312 64.32072 221.3292 100 b
nej2 53.25590 59.69762 88.92513 61.98481 65.05738 285.2495 100 b
nej2_keyed 53.25061 58.61453 81.22925 61.14885 63.56159 274.0207 100 b
The clause nomatch=0
is missing in []
.
Change the line to
dt[J(5:max(rn), max.count), nomatch=0]
The result will be:
rn B
1: 6 2
2: 8 2
3: 10 2