Speed of vectorized operations depends on the number of columns of a data.frame

Posted 2019-09-01 03:17

Why does it take longer to operate on a data.frame with the same total number of elements when the vectorized operation is spread across more columns? Take this simple example, where we subtract 0.5 from each element and then check whether the result is < 0 (related question):

f.df <- function( df , x = 0.5 ){
  df <- df - x
  df[ df < 0 ] <- 0
  return( df )
}


df1 <- data.frame( matrix( runif(1e5) , nrow = 1e2 ) )
df2 <- data.frame( matrix( runif(1e5) , nrow = 1e3 ) )
df3 <- data.frame( matrix( runif(1e5) , nrow = 1e4 ) )

require( microbenchmark )
microbenchmark( f.df( df1 ) , f.df( df2 ) , f.df( df3 ) , times = 10L )


#Unit: milliseconds
#     expr        min         lq     median         uq        max neval
# f.df(df1) 1562.66827 1568.21097 1595.07005 1674.91726 1680.90092    10
# f.df(df2)   95.77452   98.12557  101.31215  190.46906  198.23927    10
# f.df(df3)   16.25295   16.42373   16.74989   17.95621   18.69218    10

Answer 1:

A bit of profiling shows that most of the time is spent in [<-.data.frame.
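One way to reproduce this yourself is a rough sketch with base R's Rprof; the temporary file name is arbitrary and the exact proportions will vary by machine:

tmp <- tempfile()
Rprof(tmp)
invisible(f.df(df1))
Rprof(NULL)
# [<-.data.frame should appear at or near the top of by.self
head(summaryRprof(tmp)$by.self)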

So the scaling issue comes from how Ops.data.frame and [<-.data.frame work, and from the fact that [<-.data.frame copies, and [[<- copies the named list.

The relevant code in Ops.data.frame (with my comments added):

 # cn is the names of your data.frame 
 for (j in seq_along(cn)) {
         left <- if (!lscalar) 
             e1[[j]]
         else e1
         right <- if (!rscalar) 
             e2[[j]]
         else e2
         value[[j]] <- eval(f)
     }
    # sometimes return a data.frame
     if (.Generic %in% c("+", "-", "*", "/", "%%", "%/%")) {
         names(value) <- cn
         data.frame(value, row.names = rn, check.names = FALSE, 
             check.rows = FALSE)
     } # sometimes return a matrix
     else matrix(unlist(value, recursive = FALSE, use.names = FALSE), 
         nrow = nr, dimnames = list(rn, cn))

When you use Ops.data.frame, it loops through the columns, with [[<- doing each replacement. This means that as the number of columns increases, the time required increases (there is also some internal copying for protection, because a data.frame is a named list) - hence it scales roughly linearly with the number of columns.

# for example  only this part will scale with the number of columns
f.df.1 <- function( df , x = 0.5 ){
     df <- df - x

     return( df )
 }
microbenchmark(f.df.1(df1),f.df.1(df2),f.df.1(df3), times = 10L)
# Unit: milliseconds
#        expr       min        lq   median         uq        max neval
# f.df.1(df1) 96.739646 97.143298 98.36253 172.937100 175.539239    10
# f.df.1(df2) 11.697373 11.955173 12.12206  12.304543 281.055865    10
# f.df.1(df3)  3.114089  3.149682  3.41174   3.575835   3.640467    10

[<-.data.frame has a similar loop through the columns when i is a logical matrix with the same dimensions as x:

 if(is.logical(i) && is.matrix(i) && all(dim(i) == dim(x))) {
            nreplace <- sum(i, na.rm=TRUE)
            if(!nreplace) return(x) # nothing to replace
            ## allow replication of length(value) > 1 in 1.8.0
            N <- length(value)
            if(N > 1L && N < nreplace && (nreplace %% N) == 0L)
                value <- rep(value, length.out = nreplace)
            if(N > 1L && (length(value) != nreplace))
                stop("'value' is the wrong length")
            n <- 0L
            nv <- nrow(x)
            for(v in seq_len(dim(i)[2L])) {
                thisvar <- i[, v, drop = TRUE]
                nv <- sum(thisvar, na.rm = TRUE)
                if(nv) {
                    if(is.matrix(x[[v]]))
                        x[[v]][thisvar, ] <- if(N > 1L) value[n+seq_len(nv)] else value
                    else
                        x[[v]][thisvar] <- if(N > 1L) value[n+seq_len(nv)] else value
                }
                n <- n+nv
            }
            return(x)


f.df.2 <- function( df , x = 0.5 ){
     df[df < 0 ] <- 0

     return( df )
 }
 microbenchmark(f.df.2(df1), f.df.2(df2), f.df.2(df3), times = 10L)
# Unit: milliseconds
#        expr       min        lq    median        uq       max neval
# f.df.2(df1) 20.500873 20.575801 20.699469 20.993723 84.825607    10
# f.df.2(df2)  3.143228  3.149111  3.173265  3.353779  3.409068    10
# f.df.2(df3)  1.581727  1.634463  1.707337  1.876240  1.887746    10

[<-.data.frame (and <-) will copy as well.
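A small sketch with tracemem (base R) makes the copies visible - each tracemem[...] line printed corresponds to a duplication of d (the addresses differ from run to run):

d <- data.frame(a = runif(5), b = runif(5))
tracemem(d)       # start tracking copies of d
d[d < 0.5] <- 0   # [<-.data.frame duplicates d before modifying it
untracemem(d)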


How can this be improved? You can use lapply, or set from data.table.

library(data.table)
sdf <- function(df, x = 0.5){
   # explicit copy so there are no changes to original
   dd <- copy(df)
  for(j in names(df)){
    set(dd, j = j, value = dd[[j]] - x)
    # this is slow when (necessarily) done repeatedly; perhaps it
    # could come out of the loop and into a `lapply` or `vapply`
    # statement (a sketch of this follows the benchmark below)
    whi <- which(dd[[j]] < 0 )
    if(length(whi)){
     set(dd, j= j, i = whi, value = 0.0)
    }
  }
  return(dd)
}

 microbenchmark(sdf(df1), sdf(df2), sdf(df3), times = 10L)
# Unit: milliseconds
# expr       min        lq    median        uq        max neval
# sdf(df1) 87.471560 88.323686 89.880685 92.659141 153.218536    10
# sdf(df2)  6.235951  6.531192  6.630981  6.786801   7.230825    10
# sdf(df3)  2.631641  2.729612  2.775762  2.884807   2.970556    10
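As the comment inside sdf suggests, the repeated which calls could be pulled out of the loop. A sketch of that idea (the name sdf2 is mine, and it assumes the same all-numeric input as above):

sdf2 <- function(df, x = 0.5){
  dd <- copy(df)
  # subtract first, column by column, on the copy
  for(j in names(dd)){
    set(dd, j = j, value = dd[[j]] - x)
  }
  # find all the negative positions in a single lapply pass
  neg <- lapply(dd, function(col) which(col < 0))
  for(j in names(dd)){
    if(length(neg[[j]])){
      set(dd, j = j, i = neg[[j]], value = 0)
    }
  }
  return(dd)
}
identical(sdf2(df1), sdf(df1))   # should be TRUE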

# a base R approach using lapply
ldf <- function(df, x = 0.5){

  as.data.frame(lapply(df, function(xx,x){ xxx <- xx-x;replace(xxx, xxx<0,0)}, x=x))

}

# pretty good. Does well with large data.frames
microbenchmark(ldf(df1), ldf(df2), ldf(df3), times = 10L)
# Unit: milliseconds
# expr       min        lq    median         uq        max neval
# ldf(df1) 84.380144 84.659572 85.987488 159.928249 161.720599    10
# ldf(df2) 11.507918 11.793418 11.948194  12.175975  86.186517    10
# ldf(df3)  4.237206  4.368717  4.449018   4.627336   5.081222    10

# they all produce the same result
dd <- sdf(df1)
ddf1 <- f.df(df1)
ldf1 <- ldf(df1)
identical(dd,ddf1)
## [1] TRUE
identical(ddf1, ldf1)
## [1] TRUE

# sdf and ldf comparable with lots of columns
# see benchmarking below.
microbenchmark(sdf(df1), ldf(df1), f.df(df1),  times = 10L)
# Unit: milliseconds
# expr        min         lq     median         uq       max neval
# sdf(df1)   85.75355   86.47659   86.76647   87.88829  172.0589    10
# ldf(df1)   84.73023   85.27622   85.61528  172.02897  356.4318    10
# f.df(df1) 3689.83135 3730.20084 3768.44067 3905.69565 3949.3532    10
# sdf ~ twice as fast with smaller data.frames
 microbenchmark(sdf(df2), ldf(df2), f.df(df2),  times = 10L)
# Unit: milliseconds
# expr       min         lq     median         uq        max neval
# sdf(df2)   6.46860   6.557955   6.603772   6.927785   7.019567    10
# ldf(df2)  12.26376  12.551905  12.576802  12.667775  12.982594    10
# f.df(df2) 268.42042 273.800762 278.435929 346.112355 503.551387    10
microbenchmark(sdf(df3), ldf(df3), f.df(df3),  times = 10L)
# Unit: milliseconds
# expr       min        lq    median        uq       max neval
# sdf(df3)  2.538830  2.911310  3.020998  3.120961 74.980466    10
# ldf(df3)  4.698771  5.202121  5.272721  5.407351  5.424124    10
# f.df(df3) 17.819254 18.039089 18.158069 19.692038 90.620645    10

# copying of larger objects is slower, repeated calls to which are slow.

microbenchmark(copy(df1), copy(df2), copy(df3), times = 10L)
# Unit: microseconds
# expr     min      lq   median      uq     max neval
# copy(df1) 369.926 407.218 480.5710 527.229 618.698    10
# copy(df2) 165.402 224.626 279.5445 296.215 519.773    10
# copy(df3) 150.148 180.625 214.9140 276.035 467.972    10


Answer 2:

data.frames are lists: each column can hold data of a different class. So you can imagine that, when you run your code, R has to treat each column separately. As a result, the "vectorization" only happens on a column-by-column basis. For the same total number of elements in your data.frame, the more columns there are, the longer the processing will take.

This is unlike matrices (and arrays more generally), which hold only a single class of data, so the vectorization can happen across the whole object at once. Here, for the same number of elements, the computation time is the same regardless of the number of columns. As you can see:

df1 <- matrix( runif(1e5) , nrow = 1e2 ) 
df2 <- matrix( runif(1e5) , nrow = 1e3 ) 
df3 <- matrix( runif(1e5) , nrow = 1e4 ) 

require( microbenchmark )
microbenchmark( f.df( df1 ) , f.df( df2 ) , f.df( df3 ) , times = 10L )

# Unit: milliseconds
#       expr      min       lq   median       uq      max neval
#  f.df(df1) 4.837330 5.218258 5.350093 5.587897 7.081086    10
#  f.df(df2) 5.158825 5.313685 5.510549 5.731780 5.880861    10
#  f.df(df3) 5.237361 5.344613 5.399209 5.481276 5.940132    10
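Building on this, one possible workaround for an all-numeric data.frame (a sketch only, and f.mat is a name I made up; as.matrix coerces everything to a common type, so this is not safe for mixed-class columns) is to round-trip through a matrix so the whole operation is vectorized at once:

f.mat <- function(df, x = 0.5){
  m <- as.matrix(df) - x     # single vectorized subtraction
  m[m < 0] <- 0              # single vectorized replacement
  as.data.frame(m)
}

d <- data.frame(matrix(runif(1e5), nrow = 1e2))
all.equal(f.mat(d), f.df(d))   # should be TRUE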


Answer 3:

When comparing df1, df2 and df3: by changing the number of rows while keeping the total number of elements constant, you are also changing the number of columns.

A data.frame is a list in which each column is one element. In your example, each data.frame has an order of magnitude more columns than the next, hence an order of magnitude more operations, and hence more time.
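Concretely, for the data.frames constructed in the question (1e5 elements each):

ncol(df1)   # 1000 columns, 100 rows   - slowest
ncol(df2)   # 100 columns,  1000 rows
ncol(df3)   # 10 columns,   10000 rows - fastest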



Source: Speed of vectorized operation dependent of number of columns of data.frame