Combination of N elements with q elements in R

2019-07-13 06:57发布

问题:

I have N=6 elements and q=3 elements symboled as 0,1,2.

I want to create all the vectors of N=6 elements with 2 elements equal to 0, 2 elements equal to 1 and 2 elements equal to 2 in all the possible positions.

The number of these vectors are equal to combn(6,2)*combn(4,2)*combn(2,2)=90.

Here is a code that construct these 90 vectors in the matrix F:

N=6
x<-c(1:N)
#########################################
A<-combn(x,2)
B<-matrix(0,ncol(A),length(x)) 
for( i in 1:ncol(A) ) 
{ 
y<-rep(0,N) 
y[A[1:nrow(A),i]]<-1 
B[i,]<-y 
}
######################################
E<-matrix(0,nrow(B),length(x)-nrow(A))  
for( i in 1:nrow(B) ) 
{ 
q=0 
for( j in 1:ncol(B) ) 
{ 
if( B[i,j]!=1 )  
{ 
q=q+1 
E[i,q]<-j 
}}}
########################################
ASD<-combn(E[i,],2) 
F<-matrix(0,nrow(B)*ncol(ASD),length(x)) 
q=0 
for( i in 1:nrow(B) ) 
{ 
ASD<-combn(E[i,],2) 
for( j in 1:ncol(ASD) ) 
{ 
B[i,ASD[1:nrow(ASD),j]]<-2 
q=q+1 
F[q,]<-B[i,] 
B[i,ASD[1:nrow(ASD),j]]<-0 
}}

Is there any other less complicated way to do it?

回答1:

You can use the iterpc package:

> library(iterpc)
> I <- iterpc(c(2,2,2), labels=c(0,1,2), ordered=TRUE)
> getall(I)
      [,1] [,2] [,3] [,4] [,5] [,6]
 [1,]    0    0    1    1    2    2
 [2,]    0    0    1    2    1    2
 [3,]    0    0    1    2    2    1
 [4,]    0    0    2    1    1    2
 [5,]    0    0    2    1    2    1
 [6,]    0    0    2    2    1    1
 [7,]    0    1    0    1    2    2
 [8,]    0    1    0    2    1    2
 [9,]    0    1    0    2    2    1
[10,]    0    1    1    0    2    2
[11,]    0    1    1    2    0    2
[12,]    0    1    1    2    2    0
[13,]    0    1    2    0    1    2
[14,]    0    1    2    0    2    1
[15,]    0    1    2    1    0    2
[16,]    0    1    2    1    2    0
[17,]    0    1    2    2    0    1
[18,]    0    1    2    2    1    0
[19,]    0    2    0    1    1    2
[20,]    0    2    0    1    2    1
[21,]    0    2    0    2    1    1
[22,]    0    2    1    0    1    2
[23,]    0    2    1    0    2    1
[24,]    0    2    1    1    0    2
[25,]    0    2    1    1    2    0
[26,]    0    2    1    2    0    1
[27,]    0    2    1    2    1    0
[28,]    0    2    2    0    1    1
[29,]    0    2    2    1    0    1
[30,]    0    2    2    1    1    0
[31,]    1    0    0    1    2    2
[32,]    1    0    0    2    1    2
[33,]    1    0    0    2    2    1
[34,]    1    0    1    0    2    2
[35,]    1    0    1    2    0    2
[36,]    1    0    1    2    2    0
[37,]    1    0    2    0    1    2
[38,]    1    0    2    0    2    1
[39,]    1    0    2    1    0    2
[40,]    1    0    2    1    2    0
[41,]    1    0    2    2    0    1
[42,]    1    0    2    2    1    0
[43,]    1    1    0    0    2    2
[44,]    1    1    0    2    0    2
[45,]    1    1    0    2    2    0
[46,]    1    1    2    0    0    2
[47,]    1    1    2    0    2    0
[48,]    1    1    2    2    0    0
[49,]    1    2    0    0    1    2
[50,]    1    2    0    0    2    1
[51,]    1    2    0    1    0    2
[52,]    1    2    0    1    2    0
[53,]    1    2    0    2    0    1
[54,]    1    2    0    2    1    0
[55,]    1    2    1    0    0    2
[56,]    1    2    1    0    2    0
[57,]    1    2    1    2    0    0
[58,]    1    2    2    0    0    1
[59,]    1    2    2    0    1    0
[60,]    1    2    2    1    0    0
[61,]    2    0    0    1    1    2
[62,]    2    0    0    1    2    1
[63,]    2    0    0    2    1    1
[64,]    2    0    1    0    1    2
[65,]    2    0    1    0    2    1
[66,]    2    0    1    1    0    2
[67,]    2    0    1    1    2    0
[68,]    2    0    1    2    0    1
[69,]    2    0    1    2    1    0
[70,]    2    0    2    0    1    1
[71,]    2    0    2    1    0    1
[72,]    2    0    2    1    1    0
[73,]    2    1    0    0    1    2
[74,]    2    1    0    0    2    1
[75,]    2    1    0    1    0    2
[76,]    2    1    0    1    2    0
[77,]    2    1    0    2    0    1
[78,]    2    1    0    2    1    0
[79,]    2    1    1    0    0    2
[80,]    2    1    1    0    2    0
[81,]    2    1    1    2    0    0
[82,]    2    1    2    0    0    1
[83,]    2    1    2    0    1    0
[84,]    2    1    2    1    0    0
[85,]    2    2    0    0    1    1
[86,]    2    2    0    1    0    1
[87,]    2    2    0    1    1    0
[88,]    2    2    1    0    0    1
[89,]    2    2    1    0    1    0
[90,]    2    2    1    1    0    0

EDIT 2018-04-28

iterpc is now deprecated in favor of arrangements.



回答2:

Here is a super fast one liner from the development version of the package RcppAlgos.

devtools::install_github("jwood000/RcppAlgos")
library(RcppAlgos)    

myPerms <– permuteGeneral(3,6,TRUE,"prod","==",36) - 1L

myPerms
     [,1] [,2] [,3] [,4] [,5] [,6]
[1,]    0    0    1    1    2    2
[2,]    0    0    1    2    1    2
[3,]    0    0    1    2    2    1
[4,]    0    0    2    1    1    2
[5,]    0    0    2    1    2    1
[6,]    0    0    2    2    1    1
.
.
.
      [,1] [,2] [,3] [,4] [,5] [,6]
[85,]    2    2    0    0    1    1
[86,]    2    2    0    1    0    1
[87,]    2    2    0    1    1    0
[88,]    2    2    1    0    0    1
[89,]    2    2    1    0    1    0
[90,]    2    2    1    1    0    0

Here are some benchmarks where rcppAlgo, r2eOne, r2eTwo, and OPFun are function wrappers of the code for each method.

microbenchmark(rcppAlgo(),r2eOne(),r2eTwo(),OPFun(N=6) unit = "relative")
Unit: relative
      expr       min        lq      mean   median        uq      max neval
rcppAlgo()   1.00000   1.00000   1.00000   1.0000   1.00000 1.000000   100
  r2eOne() 471.56007 473.26487 194.01669 267.9402 274.46604 8.373630   100
  r2eTwo()  50.71091  48.84173  24.01617  27.8441  34.02326 2.044374   100
OPFun(N=6)  37.35899  24.38966  22.38029  19.7059  19.51935 31.18059   100


Explanation

Since the OP is looking for a particular combination of numbers with specific frequencies, we can make use of the Fundamental theorem of arithmetic, which states that every number can be written as the product of a unique combination of prime numbers. We are given the set 0, 1, 2 and adding 1 gives the set 1, 2, 3. We do this to avoid getting many zeros when we take the product.

Now, we are tasked with finding all combinations such that each element occurs exactly twice. This means after we apply the product to our target combination we get 1*1*2*2*3*3 = 36 (N.B. 1 is not prime but can be ignored since 1*n = n for all n). Now the problem is simple.

We simply find all combinations such that the product is equal to 36 and subsequently subtract 1 to get back to our original set of numbers and Voila!


General Solution

Below, we have a general solution that can be used to find all permutations of a given vector with repetition of each element a specific number of times.

library(RcppAlgos) ## for primeSieve and permuteGeneral
MakePerms <- function(v, numReps, myCap = NULL) {
    m <- sum(numReps)
    n <- length(v)

    ## Generate some primes using prime
    ## number theorem; fudging a bit to
    ## ensure we get n-1 prime numbers
    myPs <- primeSieve(2*n*log(n))[1:(n-1)]

    ## Set up vector that will be tested
    myV <- c(1L, myPs)
    target <- prod(myV^numReps)
    ps <- permuteGeneral(myV, m, TRUE, "prod", "==", target, myCap)
    for (j in 1:n) {ps[ps == myV[j]] <- v[j]}

    ps
}

It relies heavily on the uniqueness of the prime factorization per the Fundamental theorem of arithmetic and a little indexing (not quite as simple as the trivial case above, but still only 7 lines and still super fast).

We first create a vector of the first n-1 primes and tack on a 1 to complete myV. We then raise each element of myV to the desired number of repeats per element given by numReps and take the product to obtain our target value. Here are some examples:

  1. v = c(10,13,267,1) and numReps = c(3,1,2,5) -->> myV = c(1,2,3,5) -->> target = 1^3 * 2^1 * 3^2 * 5^5 = 56250
  2. v = 0:5 and numReps = c(1,2,1,2,2,2) -->> myV = c(1,2,3,5,7,11) -->> target = 1^1 * 2^2 * 3^1 * 5^2 * 7^2 * 11^2 = 1778700
  3. OP Example: v = c(0,1,2) and numReps = c(2,2,2) -->> myV = c(1,2,3) -->> target = 1^2 * 2^2 * 3^2 = 36

After we find all permutations that have a product equal to the target value, we simply map the contents of our original vector v to the generated matrix using indexing.

For example, if you set N = 8 in the OP's example you get all permutations of c(0,1,2) with 0 repeated exactly 4 times, and 1 and 2 repeated twice.

   t1 <- OPFun(N=8)
   t2 <- MakePerms(0:2, c(4,2,2))

   all.equal(t1[do.call(order, as.data.frame(t1)), ],
             t2[do.call(order, as.data.frame(t2)), ])
   [1] TRUE

   microbenchmark(fun2(8), MakePerms(0:2, c(4,2,2)), unit = "relative")
   Unit: relative
                         expr      min       lq     mean   median       uq      max neval
                     OPFun(8) 23.25099 22.56178 18.64762 19.52436 18.37387 10.90934   100
   MakePerms(0:2, c(4, 2, 2))  1.00000  1.00000  1.00000  1.00000  1.00000  1.00000   100

It should be noted that the number of possible permutations grows rapidly, so attempts like so MakePerms(0:5, rep(2, 6)) will fail as the total number of permutations of of 0:5 12 times is 12^6 = 2,985,984 > 2^31 - 1 (i.e. the maximal number of rows for a matrix in Rcpp). However, we don't expect all of them to meet our criteria, so if we put in a cap, say 10^7, we will have success. Observe:

a <- MakePerms(0:5, rep(2, 6), 10^7)
nrow(a)
7484400

set.seed(17)
a[sample(nrow(a), 10), ]
      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
 [1,]    0    5    3    3    1    2    4    4    5     1     0     2
 [2,]    5    4    2    1    1    0    3    4    5     2     3     0
 [3,]    2    4    5    3    5    1    3    0    1     0     4     2
 [4,]    4    3    3    1    2    5    0    5    4     1     0     2
 [5,]    2    2    5    3    4    1    0    3    5     1     0     4
 [6,]    3    1    1    5    0    3    2    0    2     4     4     5
 [7,]    1    1    4    2    0    5    4    0    3     5     3     2
 [8,]    1    0    4    2    4    2    5    1    3     0     5     3
 [9,]    4    3    4    1    5    0    0    2    2     1     3     5
[10,]    1    0    5    3    2    0    1    4    3     4     2     5

The use of myCap can greatly increase efficiency as well.

microbenchmark(withOutCap = MakePerms(0:5, c(1,2,1,2,1,2)),
               withCap = MakePerms(0:5, c(1,2,1,2,1,2), 10^5),
               times = 15) 
Unit: milliseconds
      expr       min       lq      mean    median        uq      max neval
withOutCap 219.64847 246.4718 275.04672 282.52829 299.33816 311.2031    15
   withCap  22.56437  30.6904  33.30469  31.70443  37.50858  41.6095    15

identical(MakePerms(0:5, c(1,2,1,2,1,2)), MakePerms(0:5, c(1,2,1,2,1,2), 10^5)) 
[1] TRUE


iterpc Solution

It seems as though the answers provided to this point are strictly academic as the answer provided by @StéphaneLaurent is far superior. It is super general, one line, and super fast!!

          microbenchmark(iter = getall(iterpc(c(2,2,2), labels=c(0,1,2), ordered=TRUE)),
                  rcppAlg = MakePerms(0:2, c(2,2,2)))
   Unit: microseconds
      expr     min       lq      mean  median       uq      max neval
      iter 428.885 453.2975 592.53164 540.154 683.9585 1165.772   100
   rcppAlg  62.418  74.5205  93.44926  81.749 108.4660  216.454   100

The story changes as the number of permutations grows. Observe:

   microbenchmark(iter = getall(iterpc(c(2,2,2,2), labels=c(0,1,2,3), ordered=TRUE)),
                  rcppAlg = MakePerms(0:3, c(2,2,2,2)),
                  rcppAlgCap = MakePerms(0:3, c(2,2,2,2), 5000))
   Unit: microseconds
         expr     min        lq     mean    median       uq       max neval
         iter 877.246 1052.7060 1394.636 1150.0895 1265.088  8914.980   100
      rcppAlg 964.446 1449.7115 2084.944 1787.9350 1906.242 10921.156   100

If you utilize myCap, MakePerms is a little faster. This doesn't really matter, because with the iterpc solution, you don't even have to think about how many results you will get. Very nice!!


Update

The new version of RcppAlgos (which I am the author of) has just been released on CRAN. There is an additional argument now for permuteGeneral called freqs which allows permutations of multisets, which is exactly what the OP is looking for.

microbenchmark(iter = getall(iterpc(c(2,2,2,2), labels=0:3, ordered=TRUE)),
               newRcppAlgos = permuteGeneral(0:3, freqs = c(2,2,2,2)))
Unit: microseconds
        expr     min       lq      mean   median      uq      max neval
        iter 457.442 482.8365 609.98678 508.6150 572.581 4037.048   100
newRcppAlgos  33.159  43.3975  56.40026  48.5665  58.194  625.691   100

microbenchmark(iter = getall(iterpc(c(5,4,3,2), labels=0:3, ordered=TRUE)),
                 newRcppAlgos = permuteGeneral(0:3, freqs = c(5,4,3,2)))
Unit: milliseconds
        expr       min        lq     mean    median       uq      max neval
        iter 480.25976 552.54343 567.9155 565.23066 579.0258 751.8556   100
newRcppAlgos  83.41194  87.03957 104.6279  95.67596 107.3572 181.1119   100

identical(getall(iterpc(c(5,4,3,2), labels=0:3, ordered=TRUE)),
            permuteGeneral(0:3, freqs = c(5,4,3,2)))
[1] TRUE

nrow(permuteGeneral(0:3, freqs = c(5,4,3,2)))
[1] 2522520


Update 2

As @StéphaneLaurent points out, the package arrangements has been released as a replacement for iterpc (see comments by @RandyLai). It is much more efficient and is capable of handling a broader set of combinatorial problems (e.g. partitions). Here are the benchmarks for the larger example:

microbenchmark(arrangements = permutations(x = 0:3, freq = c(5,4,3,2)),
               RcppAlgos = permuteGeneral(0:3, freqs = c(5,4,3,2)))
Unit: milliseconds
        expr      min       lq     mean    median       uq      max neval
arrangements 97.10078 98.67154 113.5953 100.56261 131.3244 163.8912   100
   RcppAlgos 92.13122 93.84818 108.1845  95.72691 101.2647 165.7248   100

...Nearly identical results.

A huge benefit of arrangements is the ability to get permutations one at a time (or in chunks) via getnext. This allows the user to generate more than 2^31 - 1 results and offers more flexibility in general.

For more information regarding problems like this in R, I wrote an extensive overview to the question : R: Permutations and combinations with/without replacement and for distinct/non-distinct items/multiset.



回答3:

Two methods suggested: an inefficient one and a more efficient but laborious one. (In this context, I equate "efficiency" to scaling, not to the amount of code to do it or execution time. That is, as long as you are only creating 90 rows, then you're fine. If this is a simplified problem and you really need to expand to a much larger matrix, then permutations might exceed memory and/or R's capacity.)

Both solutions are a bit shorter than your code. The first is relatively clear to read and only 4 lines of code; the second is admittedly a bit obscure (and appears to go into index indirection "inception"), but is really still only 13 lines of needed code. It's possible the second could be reduced somewhat, but I ran out of "play" time :-)

Inefficient

One method is to create all permutations and filter out repeats. This works as long as your "N" doesn't get too big.

library(gtools)
v <- rep(0:2, 2)
p <- permutations(6, 6)
p[] <- v[p]
p <- p[!duplicated(p),]

head(p)
#      [,1] [,2] [,3] [,4] [,5] [,6]
# [1,]    0    1    2    0    1    2
# [2,]    0    1    2    0    2    1
# [3,]    0    1    2    1    0    2
# [4,]    0    1    2    1    2    0
# [5,]    0    1    2    2    0    1
# [6,]    0    1    2    2    1    0
tail(p)
#       [,1] [,2] [,3] [,4] [,5] [,6]
# [85,]    2    2    0    1    0    1
# [86,]    2    2    0    1    1    0
# [87,]    2    2    0    0    1    1
# [88,]    2    2    1    0    0    1
# [89,]    2    2    1    0    1    0
# [90,]    2    2    1    1    0    0

Verify each row has exactly two of each element:

all(apply(p, 1, table) == 2)
# [1] TRUE

Less-Inefficient

A less-inefficient (and therefore more effort) method: create a matrix of column indices, using the combn(6,2) and combn(4,2), and then assign the "factors" appropriately. (It'll make more sense in a moment.)

(NB: I often think better about these problems in terms of the transposed matrix; you can easily do this un-transposed, just adjust the code to swap columns/rows.)

What we need is something like expand.grid for two columns at a time. So we'll start with the smaller problems:

left2 <- t(combn(6, 2))
mid2 <- t(combn(4, 2))
left2
#       [,1] [,2]
#  [1,]    1    2
#  [2,]    1    3
#  [3,]    1    4
#  [4,]    1    5
#  [5,]    1    6
#  [6,]    2    3
#  [7,]    2    4
#  [8,]    2    5
#  [9,]    2    6
# [10,]    3    4
# [11,]    3    5
# [12,]    3    6
# [13,]    4    5
# [14,]    4    6
# [15,]    5    6
mid2
#      [,1] [,2]
# [1,]    1    2
# [2,]    1    3
# [3,]    1    4
# [4,]    2    3
# [5,]    2    4
# [6,]    3    4

Now, the grid will expand on the row indices of these two matrices.

eg <- expand.grid(a = 1:15, b = 1:6)
head(eg)
#   a b
# 1 1 1
# 2 2 1
# 3 3 1
# 4 4 1
# 5 5 1
# 6 6 1
inds <- cbind(left2[eg$a,], mid2[eg$b,])
head(inds)
#      [,1] [,2] [,3] [,4]
# [1,]    1    2    1    2
# [2,]    1    3    1    2
# [3,]    1    4    1    2
# [4,]    1    5    1    2
# [5,]    1    6    1    2
# [6,]    2    3    1    2
inds[25,,drop=FALSE]
#      [,1] [,2] [,3] [,4]
# [1,]    3    4    1    3

This means that, for row 25, we should replace columns 3 and 4 with the first factor (say 0). Then, of the remaining columns (1,2,5,6), we should replace the 1st and 3rd columns to the second factor (say 1). Said again, c(1,2,5,6)[c(1,3)] equates to columns 1 and 5 being replace with the second value (1). (The third value 2 will go in all remaining slots.)

So, to come up with the c(1,2,5,6) from above, we can use setdiff(1:6,...):

afterleft2 <- t(apply(left2[eg$a,], 1, function(a) setdiff(1:6, a)))
head( afterleft2 )
#      [,1] [,2] [,3] [,4]
# [1,]    3    4    5    6
# [2,]    2    4    5    6
# [3,]    2    3    5    6
# [4,]    2    3    4    6
# [5,]    2    3    4    5
# [6,]    1    4    5    6
afterleft2[25,,drop=FALSE]
#      [,1] [,2] [,3] [,4]
# [1,]    1    2    5    6

So let's get to fixing the inds third and fourth columns.

inds[,3] <- afterleft2[ cbind(1:90, mid2[eg$b,1]) ]
inds[,4] <- afterleft2[ cbind(1:90, mid2[eg$b,2]) ]
head(inds)
#      [,1] [,2] [,3] [,4]
# [1,]    1    2    3    4
# [2,]    1    3    2    4
# [3,]    1    4    2    3
# [4,]    1    5    2    3
# [5,]    1    6    2    3
# [6,]    2    3    1    4
inds[25,,drop=FALSE]
#      [,1] [,2] [,3] [,4]
# [1,]    3    4    1    5

We see from this that the 25th row has the "1" and "5" we expect.

Now the finish:

nr <- nrow(inds)
out <- matrix(nrow = nr, ncol = 6L)
out[cbind(1:nr,inds[,1])] <- 0L
out[cbind(1:nr,inds[,2])] <- 0L
out[cbind(1:nr,inds[,3])] <- 1L
out[cbind(1:nr,inds[,4])] <- 1L
head(out)
#      [,1] [,2] [,3] [,4] [,5] [,6]
# [1,]    0    0    1    1   NA   NA
# [2,]    0    1    0    1   NA   NA
# [3,]    0    1    1    0   NA   NA
# [4,]    0    1    1   NA    0   NA
# [5,]    0    1    1   NA   NA    0
# [6,]    1    0    0    1   NA   NA
out[25,,drop=FALSE]
#      [,1] [,2] [,3] [,4] [,5] [,6]
# [1,]    1   NA    0    0    1   NA

The "remaining slots" I mentioned above (for the third value) are all NA, by design.

out[is.na(out)] <- 2L
head(out)
#      [,1] [,2] [,3] [,4] [,5] [,6]
# [1,]    0    0    1    1    2    2
# [2,]    0    1    0    1    2    2
# [3,]    0    1    1    0    2    2
# [4,]    0    1    1    2    0    2
# [5,]    0    1    1    2    2    0
# [6,]    1    0    0    1    2    2
out[25,,drop=FALSE]
#      [,1] [,2] [,3] [,4] [,5] [,6]
# [1,]    1    2    0    0    1    2

Now a quick sanity check to ensure our out variable has exactly two of each element on each row.

all(apply(out, 1, table) == 2)
# [1] TRUE