avoid R loop and parallelize with snow

2019-07-30 18:02发布

问题:

I have a large loop that will take too long (~100 days). I'm hoping to speed it up with the snow library, but I'm not great with apply statements. This is only part of the loop, but if I can figure this part out, the rest should be straightforward. I'm ok with a bunch of apply statements or loops, but one apply statement using a function to get object 'p' would be ideal.

Original data

dim(m1)   == x x    # x >>> 0
dim(m2)   == y x    # y >>> 0, y > x, y > x-10
dim(mout) == x x    
thresh    == x-10   #specific to my data, actual number probably unimportant
len(v1)   == y      #each element is a random integer, min==1, max==thresh 
len(v2)   == y      #each element is a random integer, min==1, max==thresh 

Original loop

# Serial version: for every row k of m2, build a normalised x-by-x weight
# matrix and read one probability-like value out of it into p[k].
p <- rep(NA, y)
# seq_len() gives an empty sequence when y == 0 (1:y would give c(1, 0)).
for (k in seq_len(y)) {
    # Multiply m1 elementwise by a matrix whose columns are all m2[k, ],
    # then rescale so the entries sum to 1.
    mout <- m1 * matrix(m2[k, ], x, x)
    mout <- mout / sum(mout)

    # p[k] keeps its NA default when v1[k] is above the threshold.
    if (v1[k] < thresh + 1) {
        if (v2[k] < thresh + 1) {
            # Fixed: the original indexed `out`, which is not defined here;
            # the matrix computed above is `mout`.
            p[k] <- mout[v1[k], v2[k]]
        }
        if (v2[k] > thresh) {
            # Pool every column beyond the threshold for row v1[k].
            p[k] <- sum(mout[v1[k], (thresh + 1):x])
        }
    }

    #do stuff with object 'p'
}

回答1:

library(snow)
# Body of one loop iteration, packaged so it can be run on a cluster worker.
#
# Args:
#   k: row index into m2 and position in v1 / v2.
#
# Reads m1, m2, x, thresh, v1 and v2 from the enclosing (global) environment,
# so those objects must be available wherever this function runs.
#
# Returns:
#   A two-element list: p (NA when v1[k] is above thresh) and other_vars.
dostuff <- function(k) {
    # Multiply m1 elementwise by a matrix whose columns are all m2[k, ],
    # then rescale so the entries sum to 1.
    mout <- m1 * matrix(m2[k, ], x, x)
    mout <- mout / sum(mout)

    # Default for the case where v1[k] is above the threshold; without it the
    # return() below failed with "object 'p' not found".
    p <- NA_real_
    if (v1[k] < thresh + 1) {
        if (v2[k] < thresh + 1) {
            # Fixed: the original indexed `out`, which is not defined here;
            # the matrix computed above is `mout`.
            p <- mout[v1[k], v2[k]]
        }
        if (v2[k] > thresh) {
            # Pool every column beyond the threshold for row v1[k].
            p <- sum(mout[v1[k], (thresh + 1):x])
        }
    }

    #etc etc

    # NOTE(review): other_vars is not assigned anywhere in this function; it
    # appears to stand in for whatever the elided code above would compute.
    return(list(p,
                other_vars))
}

# Names of the master-side objects that dostuff() reads; clusterExport()
# copies each of them to every worker.
exports <- c("m1",
             "m2",
             "thresh",
             "v1",
             "x",
             "v2")

# Start four socket workers and ship the shared data to them.
cl <- makeSOCKcluster(4)
clusterExport(cl, exports)

# One task per value of k. seq_len() avoids the c(1, 0) sequence that 1:y
# produces when y == 0.
loop <- seq_len(y)

# Run dostuff(k) for every k across the workers; the result is a list with
# one element per k. The workers are shut down whether or not the call
# succeeds, so no R processes are left behind.
out <- tryCatch(
    parLapply(cl, loop, dostuff),
    finally = stopCluster(cl)
)

# Unpack the per-iteration results: out[[k]] is the list returned for index k,
# whose first element is p and whose second is the additional result.
p <- rep(NA, y)
# NOTE(review): other_vars is not created in the visible code; it must
# already exist before this loop assigns into it.
# seq_len() gives an empty sequence when y == 0 (1:y would give c(1, 0)).
for (k in seq_len(y)) {
    p[k]          <- out[[k]][[1]]
    other_vars[k] <- out[[k]][[2]]
}