Work on bigger combinations using for loop

2019-08-25 04:07发布

问题:

Backgroud: I am trying to recursively select rows and check for conditions based on combinations. I am able to dump the combinations using iterpc function. However, when I repeat the for loop on bigger row datasets (of input file) >200, I get the error - "Cannot allocate vector of size n GB".

CODE

DATA file with PARA to PARL (presented parameters, mostly numeric)
            data1 <- file.choose(read.csv(), stringasFactors =FALSE)



    #### Combination prediction using iterpc and a loop 
         to check condition success of subset rows #####

    require(iterpc)
                getComboChunks <- function(n, k, chunkSize, totalCombos, myFile, myTestFile) {
                 myIter <- iterpc(n, k)

                  ## initialized myFile
                  myCombs <- getnext(myIter, chunkSize)
                  write.table(myCombs, file = myFile, sep = ",", col.names = FALSE)

                  maxIteration <- (totalCombos - chunkSize) %/% chunkSize

                  for (i in 1:maxIteration) {
                    ## get the next "chunkSize" of combinations
                    myCombs <- getnext(myIter, chunkSize)

                    ## append the above combinations to your file
                    write.table(myCombs, file = myFile, sep = ",",
                                col.names = FALSE , append = TRUE)


                      o <- 1
                      namee <-  subset(data1, SNO %in% myCombs)

                      a <- sum(namee$Weight)
                      h <- (sum(namee$PARA*namee$Weight))/a
                      f <- (sum(namee$PARB*namee$Weight))/a
                      g <- (sum(namee$PARC*namee$Weight))/a
                      l <- (sum(namee$PARE*namee$Weight))/a
                      m <- (sum(namee$PARF*namee$Weight))/a
                      n <- (sum(namee$PARD*namee$Weight))/a
                      p <- (sum(namee$PARG*namee$Weight))/a
                      q <- (sum(namee$ParH*namee$Weight))/a
                      r <- (sum(namee$PARI)) 
                      pr <- (sum(namee$pr))
                      le <- (sum(namee$PARJ*namee$Weight))/a
                      PM[is.na(PM)] <- 0

                      k <- ifelse(aska <= a && askle <= le  
                           && askh <= h  && askf <= f  
                           &&  askg <= g  && askl <= l && askm <= m 
                           && askn <= n  && askp <= p  && askq <= q  
                           && askr <= r && pr >=askpr && a <aska2 
                           && le < askle2 && g <askg2 && f <askf2 
                           && h <askh2&& l <askl2 && m <askm2 
                           && n <askn2 && p <askp2 && q <askq2 
                           && r <askr2, "Success","Failure")

                      if (k == "Success")
                              {
                        PM$SNO <- as.character(PM$SNO)
                        Masterlist$SNO <- as.character(Masterlist$SNO)
                        PM[is.na(PM)] <- 0
                        List   <- rbind(List, as.vector(c(i,a,h,f,g,l,m,n,p,q,le,r,k)))
                        print("Success")
                        }
                      if (k == "Failure"){
                        print("Failure")
                        print(i)
                      }


                      rm(namee)
                      o <- o+1
                      gc() 
                        }


                    myTests <- List

                    ## append the above combinations to your file
                    write.table(myTests, file = myTestFile, sep = ",",
                                col.names = FALSE , append = TRUE)
                   }
A dataset with 200 rows
getComboChunks(200, 5, 1, 2535650040, "myCombos1.csv", "myTests.csv")

Solution To recursively check for the conditions by sequentially flushing memory to work on larger datasets (200 or more rows). I believe I am getting the error "cannot allocate vector of size n GB" because of the following line

 List   <- rbind(List, as.vector(c(i,a,h,f,g,l,m,n,p,q,le,r,k)))

Can this be eliminated by preallocating a huge vector and dumping the values instead of rbind function. Additionally, can the memory be flushed after every run?