可以将文章内容翻译成中文,广告屏蔽插件可能会导致该功能失效(如失效,请关闭广告屏蔽插件后再试):
问题:
As a matter of best practices, I'm trying to determine if it's better to create a function and apply()
it across a matrix, or if it's better to simply loop a matrix through the function. I tried it both ways and was surprised to find apply()
is slower. The task is to take a vector and evaluate it as either being positive or negative and then return a vector with 1 if it's positive and -1 if it's negative. The mash()
function loops and the squish()
function is passed to the apply()
function.
million <- as.matrix(rnorm(100000))
mash <- function(x){
for(i in 1:NROW(x))
if(x[i] > 0)
x[i] <- 1
else
x[i] <- -1
return(x)
}
squish <- function(x){
if(x >0)
return(1)
else
return(-1)
}
ptm <- proc.time()
loop_million <- mash(million)
proc.time() - ptm
ptm <- proc.time()
apply_million <- apply(million,1, squish)
proc.time() - ptm
loop_million
results:
user system elapsed
0.468 0.008 0.483
apply_million
results:
user system elapsed
1.401 0.021 1.423
What is the advantage to using apply()
over a for
loop if performance is degraded? Is there a flaw in my test? I compared the two resulting objects for a clue and found:
> class(apply_million)
[1] "numeric"
> class(loop_million)
[1] "matrix"
Which only deepens the mystery. The apply()
function cannot accept a simple numeric vector and that's why I cast it with as.matrix()
in the beginning. But then it returns a numeric. The for
loop is fine with a simple numeric vector. And it returns an object of same class as that one passed to it.
回答1:
As Chase said: Use the power of vectorization. You're comparing two bad solutions here.
To clarify why your apply solution is slower:
Within the for loop, you actually use the vectorized indices of the matrix, meaning there is no conversion of type going on. I'm going a bit rough over it here, but basically the internal calculation kind of ignores the dimensions. They're just kept as an attribute and returned with the vector representing the matrix. To illustrate :
> x <- 1:10
> attr(x,"dim") <- c(5,2)
> y <- matrix(1:10,ncol=2)
> all.equal(x,y)
[1] TRUE
Now, when you use the apply, the matrix is split up internally in 100,000 row vectors, every row vector (i.e. a single number) is put through the function, and in the end the result is combined into an appropriate form. The apply function reckons a vector is best in this case, and thus has to concatenate the results of all rows. This takes time.
Also the sapply function first uses as.vector(unlist(...))
to convert anything to a vector, and in the end tries to simplify the answer into a suitable form. Also this takes time, hence also the sapply might be slower here. Yet, it's not on my machine.
IF apply would be a solution here (and it isn't), you could compare :
> system.time(loop_million <- mash(million))
user system elapsed
0.75 0.00 0.75
> system.time(sapply_million <- matrix(unlist(sapply(million,squish,simplify=F))))
user system elapsed
0.25 0.00 0.25
> system.time(sapply2_million <- matrix(sapply(million,squish)))
user system elapsed
0.34 0.00 0.34
> all.equal(loop_million,sapply_million)
[1] TRUE
> all.equal(loop_million,sapply2_million)
[1] TRUE
回答2:
The point of the apply (and plyr) family of functions is not speed, but expressiveness. They also tend to prevent bugs because they eliminate the book keeping code needed with loops.
Lately, answers on stackoverflow have over-emphasised speed. Your code will get faster on its own as computers get faster and R-core optimises the internals of R. Your code will never get more elegant or easier to understand on its own.
In this case you can have the best of both worlds: an elegant answer using vectorisation that is also very fast, (million > 0) * 2 - 1
.
回答3:
You can use lapply
or sapply
on vectors if you want. However, why not use the appropriate tool for the job, in this case ifelse()
?
> ptm <- proc.time()
> ifelse_million <- ifelse(million > 0,1,-1)
> proc.time() - ptm
user system elapsed
0.077 0.007 0.093
> all.equal(ifelse_million, loop_million)
[1] TRUE
And for comparison's sake, here are the two comparable runs using the for loop and sapply:
> ptm <- proc.time()
> apply_million <- sapply(million, squish)
> proc.time() - ptm
user system elapsed
0.469 0.004 0.474
> ptm <- proc.time()
> loop_million <- mash(million)
> proc.time() - ptm
user system elapsed
0.408 0.001 0.417
回答4:
It is far faster in this case to do index-based replacement than either the ifelse()
, the *apply()
family, or the loop:
> million <- million2 <- as.matrix(rnorm(100000))
> system.time(million3 <- ifelse(million > 0, 1, -1))
user system elapsed
0.046 0.000 0.044
> system.time({million2[(want <- million2 > 0)] <- 1; million2[!want] <- -1})
user system elapsed
0.006 0.000 0.007
> all.equal(million2, million3)
[1] TRUE
It is well worth having all these tools at your finger tips. You can use the one that makes the most sense to you (as you need to understand the code months or years later) and then start to move to more optimised solutions if compute time becomes prohibitive.
回答5:
Better example for speed advantage of for loop.
for_loop <- function(x){
out <- vector(mode="numeric",length=NROW(x))
for(i in seq(length(out)))
out[i] <- max(x[i,])
return(out)
}
apply_loop <- function(x){
apply(x,1,max)
}
million <- matrix(rnorm(1000000),ncol=10)
> system.time(apply_loop(million))
user system elapsed
0.57 0.00 0.56
> system.time(for_loop(million))
user system elapsed
0.32 0.00 0.33
EDIT
Version suggested by Eduardo.
max_col <- function(x){
x[cbind(seq(NROW(x)),max.col(x))]
}
By row
> system.time(for_loop(million))
user system elapsed
0.99 0.00 1.11
> system.time(apply_loop(million))
user system elapsed
1.40 0.00 1.44
> system.time(max_col(million))
user system elapsed
0.06 0.00 0.06
By column
> system.time(for_loop(t(million)))
user system elapsed
0.05 0.00 0.05
> system.time(apply_loop(t(million)))
user system elapsed
0.07 0.00 0.07
> system.time(max_col(t(million)))
user system elapsed
0.04 0.00 0.06