Faster version of combn

2019-01-03 10:11发布

Is there a way to speed up the combn command to get all unique combinations of 2 elements taken from a vector?

Usually this would be set up like this:

# Get latest version of data.table
library(devtools)
install_github("Rdatatable/data.table",  build_vignettes = FALSE)  
library(data.table)

# Toy data
d <- data.table(id=as.character(paste0("A", 10001:15000))) 

# Transform data 
system.time({
d.1 <- as.data.table(t(combn(d$id, 2)))
})

However, combn is 10 times slower (23sec versus 3 sec on my computer) than calculating all possible combinations using data.table.

system.time({
d.2 <- d[, list(neighbor=d$id[-which(d$id==id)]), by=c("id")]
})

Dealing with very large vectors, I am searching for a way to save memory by only calculating the unique combinations (like combn), but with the speed of data.table (see second code snippet).

I appreciate any help.

5条回答
仙女界的扛把子
2楼-- · 2019-01-03 10:46


Here are two base-R solutions if you don't want to use additional dependencies:

  • comb2.int uses rep and other sequence generating functions to generate the desired output.

  • comb2.mat creates a matrix, uses upper.tri() to get the upper triangle and which(..., arr.ind = TRUE) to obtain the column and row indices => all combinations.

Possibility 1: comb2.int

comb2.int <- function(n, rep = FALSE){
  if(!rep){
    # e.g. n=3 => (1,1), (1,2), (1,3), (2,2), (2,3), (3,3)
    x <- rep(1:n,(n:1)-1)
    i <- seq_along(x)+1
    o <- c(0,cumsum((n-2):1))
    y <- i-o[x]
  }else{
    # e.g. n=3 => (1,2), (1,3), (2,3)
    x <- rep(1:n,n:1)
    i <- seq_along(x)
    o <- c(0,cumsum(n:2))
    y <- i-o[x]+x-1
  }
  return(cbind(x,y))
}

Possibility 2: comb2.mat

comb2.mat <- function(n, rep = FALSE){
  # Use which(..., arr.ind = TRUE) to get coordinates.
  m <- matrix(FALSE, nrow = n, ncol = n)
  idxs <- which(upper.tri(m, diag = rep), arr.ind = TRUE)
  return(idxs)
}

The functions give the same result as combn(.):

for(i in 2:8){
  # --- comb2.int ------------------
  stopifnot(comb2.int(i) == t(combn(i,2)))
  # => Equal

  # --- comb2.mat ------------------
  m <- comb2.mat(i)
  colnames(m) <- NULL   # difference 1: colnames
  m <- m[order(m[,1]),] # difference 2: output order
  stopifnot(m == t(combn(i,2)))
  # => Equal up to above differences
}

But I have other elements in my vector than sequencial integers!

Use the return values as indices:

v <- LETTERS[1:5]                                     
c <- comb2.int(length(v))                             
cbind(v[c[,1]], v[c[,2]])                             
#>       [,1] [,2]
#>  [1,] "A"  "B" 
#>  [2,] "A"  "C" 
#>  [3,] "A"  "D" 
#>  [4,] "A"  "E" 
#>  [5,] "B"  "C" 
#>  [6,] "B"  "D" 
#>  [7,] "B"  "E" 
#>  [8,] "C"  "D" 
#>  [9,] "C"  "E" 
#> [10,] "D"  "E"

Benchmark:

time(combn) = ~5x time(comb2.mat) = ~80x time(comb2.int):

library(microbenchmark)

n <- 800
microbenchmark({
  comb2.int(n)
},{
  comb2.mat(n)
},{
  t(combn(n, 2))
})
#>   Unit: milliseconds
#>                    expr        min         lq       mean     median        uq       max neval
#>    {     comb2.int(n) }   4.394051   4.731737   6.350406   5.334463   7.22677  14.68808   100
#>    {     comb2.mat(n) }  20.131455  22.901534  31.648521  24.411782  26.95821 297.70684   100
#>  {     t(combn(n, 2)) } 363.687284 374.826268 391.038755 380.012274 389.59960 532.30305   100
查看更多
Bombasti
3楼-- · 2019-01-03 10:53

Here's a way using data.table function foverlaps(), that also turns out to be fast!

require(data.table) ## 1.9.4+
d[, `:=`(id1 = 1L, id2 = .I)] ## add interval columns for overlaps
setkey(d, id1, id2)

system.time(olaps <- foverlaps(d, d, type="within", which=TRUE)[xid != yid])
#  0.603   0.062   0.717

Note that foverlaps() does not calculate all permutations. The subset xid != yid is needed to remove self overlaps. The subset could be internally handled more efficiently by implementing ignoreSelf argument - similar to IRanges::findOverlaps.

Now it's just a matter of performing a subset using the ids obtained:

system.time(ans <- setDT(list(d$id[olaps$xid], d$id[olaps$yid])))
#   0.576   0.047   0.662 

So totally, ~1.4 seconds.


The advantage is that you can do the same way even if your data.table d has more than 1 column on which you've to get the combinations for, and using the same amount of memory (since we return the indices). In that case, you'd just do:

cbind(d[olaps$xid, your_cols, with=FALSE], d[olaps$yid, your_cols, with=FALSE])

But it's limited to replacing just combn(., 2L). Not more than 2L.

查看更多
虎瘦雄心在
4楼-- · 2019-01-03 10:56

You could use combnPrim from gRbase

source("http://bioconductor.org/biocLite.R")
biocLite("gRbase") # will install dependent packages automatically.
system.time({
 d.1 <- as.data.table(t(combn(d$id, 2)))
 })
#   user  system elapsed 
# 27.322   0.585  27.674 

system.time({
d.2 <- as.data.table(t(combnPrim(d$id,2)))
 })
#   user  system elapsed 
#  2.317   0.110   2.425 

identical(d.1[order(V1, V2),], d.2[order(V1,V2),])
#[1] TRUE
查看更多
爷的心禁止访问
5楼-- · 2019-01-03 10:56

Here is a solution using Rcpp.

library(Rcpp)
library(data.table)
cppFunction('
Rcpp::DataFrame combi2(Rcpp::CharacterVector inputVector){
    int len = inputVector.size();
    int retLen = len * (len-1) / 2;
    Rcpp::CharacterVector outputVector1(retLen);
    Rcpp::CharacterVector outputVector2(retLen);
    int start = 0;
    for (int i = 0; i < len; ++i){
        for (int j = i+1; j < len; ++j){
            outputVector1(start) = inputVector(i);
            outputVector2(start) = inputVector(j);
            ++start;
            }
        }
    return(Rcpp::DataFrame::create(Rcpp::Named("id") = outputVector1,
                              Rcpp::Named("neighbor") = outputVector2));
};
')

# Toy data
d <- data.table(id=as.character(paste0("A", 10001:15000))) 

system.time({
    d.2 <- d[, list(neighbor=d$id[-which(d$id==id)]), by=c("id")]
    })
#  1.908   0.397   2.389

system.time({
    d[, `:=`(id1 = 1L, id2 = .I)] ## add interval columns for overlaps
    setkey(d, id1, id2)
    olaps <- foverlaps(d, d, type="within", which=TRUE)[xid != yid]
    ans <- setDT(list(d$id[olaps$xid], d$id[olaps$yid]))
    })
#  0.653   0.038   0.705

system.time(ans2 <- combi2(d$id))
#  1.377   0.108   1.495 

Using the Rcpp function to get the indices and then form the data.table, works better.

cppFunction('
Rcpp::DataFrame combi2inds(const Rcpp::CharacterVector inputVector){
const int len = inputVector.size();
const int retLen = len * (len-1) / 2;
Rcpp::IntegerVector outputVector1(retLen);
Rcpp::IntegerVector outputVector2(retLen);
int indexSkip;
for (int i = 0; i < len; ++i){
    indexSkip = len * i - ((i+1) * i)/2;
    for (int j = 0; j < len-1-i; ++j){
        outputVector1(indexSkip+j) = i+1;
        outputVector2(indexSkip+j) = i+j+1+1;
        }
    }
return(Rcpp::DataFrame::create(Rcpp::Named("xid") = outputVector1,
                          Rcpp::Named("yid") = outputVector2));
};
')

system.time({
        indices <- combi2inds(d$id)
        ans2 <- setDT(list(d$id[indices$xid], d$id[indices$yid]))
        })      
#  0.389   0.027   0.425 
查看更多
小情绪 Triste *
6楼-- · 2019-01-03 11:07

A post with any variation of the word Fast in the title is incomplete without benchmarks. Before we post any benchmarks, I would just like to mention that since this question was posted, two highly optimized packages, arrangements and RcppAlgos (I am the author) for generating combinations have been released for R.

To give you an idea of their speed over combn and gRbase::combnPrim, here is a basic benchmark:

microbenchmark(arrangements::combinations(20, 10),
               combn(20, 10),
               gRbase::combnPrim(20, 10),
               RcppAlgos::comboGeneral(20, 10),
               unit = "relative")
Unit: relative
                              expr       min        lq      mean    median        uq       max neval
arrangements::combinations(20, 10)  1.364092  1.244705  1.198256  1.265019  1.192174  3.658389   100
                     combn(20, 10) 82.672684 61.589411 52.670841 59.976063 58.584740 67.596315   100
         gRbase::combnPrim(20, 10)  6.650843  5.290714  5.024889  5.303483  5.514129  4.540966   100
   RcppAlgos::comboGeneral(20, 10)  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000   100

Now, we benchmark the other functions posted for the very specific case of producing combinations choose 2 and producing a data.table object.

The functions are as follows:

funAkraf <- function(d) {
    a <- comb2.int(length(d$id))      ## comb2.int from the answer given by @akraf                        
    data.table(V1 = d$id[a[,1]], V2 = d$id[a[,2]])
}

funAnirban <- function(d) {
    indices <- combi2inds(d$id)
    ans2 <- setDT(list(d$id[indices$xid], d$id[indices$yid]))
    ans2
}

funArrangements <- function(d) {as.data.table(arrangements::combinations(x = d$id, k = 2))}

funArun <- function(d) {
    d[, `:=`(id1 = 1L, id2 = .I)] ## add interval columns for overlaps
    setkey(d, id1, id2)
    olaps <- foverlaps(d, d, type="within", which=TRUE)[xid != yid]
    ans <- setDT(list(d$id[olaps$xid], d$id[olaps$yid]))
    ans
}

funGRbase <- function(d) {as.data.table(t(gRbase::combnPrim(d$id,2)))}

funOPCombn <- function(d) {as.data.table(t(combn(d$id, 2)))}

funRcppAlgos <- function(d) {as.data.table(RcppAlgos::comboGeneral(d$id, 2))}

And here are the benchmarks on the example given by the OP:

d <- data.table(id=as.character(paste0("A", 10001:15000))) 

microbenchmark(funAkraf(d),
               funAnirban(d),
               funArrangements(d),
               funArun(d),
               funGRbase(d),
               funOPCombn(d),
               funRcppAlgos(d),
               times = 10, unit = "relative")
Unit: relative
              expr       min        lq      mean    median        uq       max neval
       funAkraf(d)  2.961790  2.869365  2.612028  2.948955  2.215608  2.352351    10
     funAnirban(d)  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000    10
funArrangements(d)  1.384152  1.427382  1.473522  1.854861  1.258471  1.233715    10
        funArun(d)  2.785375  2.543434  2.353724  2.793377  1.883702  2.013235    10
      funGRbase(d)  4.309175  3.909820  3.359260  3.921906  2.727707  2.465525    10
     funOPCombn(d) 22.810793 21.722210 17.989826 21.492045 14.079908 12.933432    10
   funRcppAlgos(d)  1.359991  1.551938  1.434623  1.727857  1.318949  1.176934    10

We see that the function provided by @AnirbanMukherjee is the fastest for this task, followed by RcppAlgos/arrangements (very close timings).

They all give the same result as well:

identical(funAkraf(d), funOPCombn(d))
#[1] TRUE
identical(funAkraf(d), funArrangements(d))
#[1] TRUE
identical(funRcppAlgos(d), funArrangements(d))
#[1] TRUE
identical(funRcppAlgos(d), funAnirban(d))
#[1] TRUE
identical(funRcppAlgos(d), funArun(d))
#[1] TRUE

## different order... we must sort
identical(funRcppAlgos(d), funGRbase(d))
[1] FALSE
d1 <- funGRbase(d)
d2 <- funRcppAlgos(d)

## now it's the same
identical(d1[order(V1, V2),], d2[order(V1,V2),])
#[1] TRUE

Thanks to @Frank for pointing out how to compare two data.tables without going through the pains of creating new data.tables and then arranging them:

fsetequal(funRcppAlgos(d), funGRbase(d))
[1] TRUE
查看更多
登录 后发表回答