Binary search DT with key on two columns using alt

2019-07-14 09:40发布

问题:

I'm trying to find a way to filter DT with two keys using alternative instead of conjunction. Solution in dplyr would look like this:

filter(DF, A == a | B == b)

I'm trying to do the same thing in data.table with key set on both A and B, but so far no luck.

I don't want to use DT[A == a | B == b] form, because of lower performance of vector search.

Let's use the data below as an example:

DF <- data.frame(A = c(1, NA, 1, 2), B = c(NA, 3, 3, 5))
DF
#    A  B
# 1  1 NA
# 2 NA  3
# 3  1  3
# 4  2  5
filter(DF, A == 1 | B == 3)
#    A  B
# 1  1 NA
# 2 NA  3
# 3  1  3
DT <- as.data.table(DF)
setkey(DT, "A", "B")

回答1:

Thanks to @Frank for an answer - it turned out to be the right way to do it. Frank proposed mya = DT[A==a,which=TRUE]; myb = DT[B==b,which=TRUE]; DT[union(mya,myb)], as it does two binary searches.

I did some benchmarks on larger dataset (97671 x 13) and this is how it looks like (some questionable attempts are also added; conjunction example added for comparison):

> microbenchmark(filter(ref.transactions, TalentID == talent.id | RecurringProfileID == recurring.profile.id), ref.transactions[TalentID == talent.id | RecurringProfileID == recurring.profile.id], unique(rbindlist(list(ref.transactions[.(talent.id)], ref.transactions[.(unique(c(talent.id, NA)), recurring.profile.id)]))), unique(rbind(ref.transactions[.(talent.id)], ref.transactions[.(unique(c(talent.id, NA)), recurring.profile.id)])), ref.transactions[.(talent.id, recurring.profile.id)], {mya = ref.transactions[TalentID==talent.id,which=TRUE]; myb = ref.transactions[RecurringProfileID==recurring.profile.id,which=TRUE]; ref.transactions[union(mya,myb)]})
Unit: milliseconds
                                                                                                                                                                                                    expr       min        lq      mean    median        uq       max neval
                                                                                                       filter(ref.transactions, TalentID == talent.id | RecurringProfileID ==      recurring.profile.id) 10.039814 11.874223 14.278728 12.560975 13.562596 45.023206   100
                                                                                                               ref.transactions[TalentID == talent.id | RecurringProfileID ==      recurring.profile.id]  6.934124  7.838649  9.323780  8.227186  8.822951 40.115687   100
                                                                       unique(rbindlist(list(ref.transactions[.(talent.id)], ref.transactions[.(unique(c(talent.id,      NA)), recurring.profile.id)])))  9.859269 10.826785 13.546877 11.663016 13.073455 47.173324   100
                                                                                 unique(rbind(ref.transactions[.(talent.id)], ref.transactions[.(unique(c(talent.id,      NA)), recurring.profile.id)]))  9.910144 11.027810 14.633140 11.663457 12.920559 57.256676   100
                                                                                                                                                    ref.transactions[.(talent.id, recurring.profile.id)]  1.196426  1.316740  1.513665  1.470091  1.574857  2.799963   100
 {     mya = ref.transactions[TalentID == talent.id, which = TRUE]     myb = ref.transactions[RecurringProfileID == recurring.profile.id,          which = TRUE]     ref.transactions[union(mya, myb)] }  1.710616  1.978395  3.085824  2.121029  2.370705 30.513052   100
> df.res <- filter(ref.transactions, TalentID == talent.id | RecurringProfileID ==      recurring.profile.id)
> mya = ref.transactions[TalentID==talent.id,which=TRUE]; myb = ref.transactions[RecurringProfileID==recurring.profile.id,which=TRUE]; dt.res <- ref.transactions[union(mya,myb)]
> identical(df.res, dt.res)
[1] TRUE