Why is the time complexity of this loop non-linear, and why is it so slow? The loop takes ~38s for N=50k and ~570s for N=200k. Is there a faster way to do this? Rprof() seems to indicate that writing to memory is very slow.
df <- data.frame(replicate(5, runif(200000)))
df[, 1:3] <- round(df[, 1:3])

Rprof(line.profiling = TRUE); timer <- proc.time()
x <- df; N <- nrow(df); i <- 1
# rows whose X1:X3 values equal those of the following row
ind <- df[1:(N - 1), 1:3] == df[2:N, 1:3]
rind <- which(apply(ind, 1, all))
N <- length(rind)
while (i <= N) {
    # fold this row's X4 into the next row, recompute its X5, and mark this row for removal
    x$X4[rind[i] + 1] <- x$X4[rind[i] + 1] + x$X4[rind[i]]
    x$X5[rind[i] + 1] <- x$X4[rind[i] + 1] * x$X3[rind[i] + 1]
    x$X5[rind[i] + 1] <- trunc(x$X5[rind[i] + 1] * 10^8) / 10^8
    x$X1[rind[i]] <- NA
    i <- i + 1
}
x <- na.omit(x)
proc.time() - timer; Rprof(NULL)
summaryRprof(lines = "show")
The purpose of this algorithm is to iterate over the data frame and combine adjacent rows that match on certain elements. That is, it removes one of the rows and adds some of that row's values to the other row. The resulting data frame should have n fewer rows, where n is the number of matching adjacent rows in the original data frame. Every time a pair of rows is combined, the indices of the source data frame and the new data frame get out of sync by 1, since one row is removed/omitted from the new frame; in the original version of this code, i kept track of the position in the source data frame and q kept track of the position in the new data frame (in the updated code above, the matching rows are precomputed in rind, so q is no longer needed).
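As a small illustration of the intended behaviour (the values below are made up), if rows 1 and 2 match on X1:X3, row 2 absorbs row 1's X4, its X5 is recomputed, and row 1 is dropped:

# Illustrative toy input: rows 1 and 2 match on X1:X3
toy <- data.frame(X1 = c(1, 1, 0), X2 = c(0, 0, 1), X3 = c(1, 1, 1),
                  X4 = c(0.2, 0.3, 0.4), X5 = c(0.2, 0.3, 0.4))
# After combining: row 2 ends up with X4 = 0.3 + 0.2 = 0.5 and
# X5 = trunc(0.5 * 1 * 10^8) / 10^8 = 0.5; row 1 is removed, leaving 2 rows.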
The code above is updated thanks to @joran's comment. The performance is improved substantially, to ~5.5s for N=50k and ~88s for N=200k. However, the time complexity is still non-linear, which I can't fathom. I need to run this at N = 1 million or more, so it's still not great speed.
Following is just a rewrite of @Martin Morgan's answer, utilizing the fast subsetting of data.table. It is around 3x faster than the data.frame approach.
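A minimal sketch of such a rewrite, doing the sequential X4 update on a plain vector and then assigning columns by reference with set() (the function name f_dt and the details are illustrative assumptions, not the answer's exact code):

library(data.table)

f_dt <- function(df) {
    dt <- as.data.table(df)
    n  <- nrow(dt)
    # adjacent rows that agree on X1:X3
    same  <- with(dt, X1[-n] == X1[-1] & X2[-n] == X2[-1] & X3[-n] == X3[-1])
    rind  <- which(same)
    rind1 <- rind + 1L
    # sequential accumulation on a plain numeric vector
    X4 <- dt[["X4"]]
    for (i in seq_along(rind))
        X4[rind1[i]] <- X4[rind1[i]] + X4[rind[i]]
    # write the results back by reference
    set(dt, j = "X4", value = X4)
    set(dt, i = rind1, j = "X5",
        value = trunc(X4[rind1] * dt[["X3"]][rind1] * 10^8) / 10^8)
    dt[-rind]    # drop the rows that were merged into their successors
}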
Only the X4 column update depends on previous values, so the loop can be mostly 'vectorized' (with a little bit of optimization, avoiding the addition of 1 to rind in each iteration), as in the sketch below.
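A sketch of that partial vectorization, assuming x and rind as computed in the question (the helper name rind1 is an assumption):

rind1 <- rind + 1L                       # successor rows, computed once up front
for (i in seq_along(rind))               # only X4 needs the element-by-element update
    x$X4[rind1[i]] <- x$X4[rind1[i]] + x$X4[rind[i]]
x$X5[rind1] <- trunc(x$X4[rind1] * x$X3[rind1] * 10^8) / 10^8
x$X1[rind]  <- NA
x <- na.omit(x)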
X4 is a numeric value, and the update can be made more efficient by updating it as a plain vector rather than as a column of a data.frame. For comparison, we have the variants sketched below.
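The following sketch shows the vector-update idea and wraps each variant in a function for comparison (the names f0, f1 and f2 are illustrative, not the answer's exact code):

## f0: the original while-loop, updating data.frame columns cell by cell
f0 <- function(df) {
    x <- df; N <- nrow(df); i <- 1
    ind <- df[1:(N - 1), 1:3] == df[2:N, 1:3]
    rind <- which(apply(ind, 1, all))
    N <- length(rind)
    while (i <= N) {
        x$X4[rind[i] + 1] <- x$X4[rind[i] + 1] + x$X4[rind[i]]
        x$X5[rind[i] + 1] <- x$X4[rind[i] + 1] * x$X3[rind[i] + 1]
        x$X5[rind[i] + 1] <- trunc(x$X5[rind[i] + 1] * 10^8) / 10^8
        x$X1[rind[i]] <- NA
        i <- i + 1
    }
    na.omit(x)
}

## f1: loop only over the X4 update, vectorize the rest
##     (the data.frame column is still updated inside the loop)
f1 <- function(df) {
    x <- df
    ind <- df[-nrow(df), 1:3] == df[-1, 1:3]
    rind <- which(apply(ind, 1, all))
    rind1 <- rind + 1L
    for (i in seq_along(rind))
        x$X4[rind1[i]] <- x$X4[rind1[i]] + x$X4[rind[i]]
    x$X5[rind1] <- trunc(x$X4[rind1] * x$X3[rind1] * 10^8) / 10^8
    x$X1[rind] <- NA
    na.omit(x)
}

## f2: as f1, but update X4 as a plain vector and write it back once
f2 <- function(df) {
    x <- df
    ind <- df[-nrow(df), 1:3] == df[-1, 1:3]
    rind <- which(apply(ind, 1, all))
    rind1 <- rind + 1L
    X4 <- x$X4
    for (i in seq_along(rind))
        X4[rind1[i]] <- X4[rind1[i]] + X4[rind[i]]
    x$X4 <- X4
    x$X5[rind1] <- trunc(X4[rind1] * x$X3[rind1] * 10^8) / 10^8
    x$X1[rind] <- NA
    na.omit(x)
}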
The results are the same
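For example, with the sketch functions above (a modest N keeps the slow f0 tolerable):

df <- data.frame(replicate(5, runif(10000)))
df[, 1:3] <- round(df[, 1:3])

identical(f0(df), f1(df))    # expected TRUE
identical(f0(df), f2(df))    # expected TRUE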
The speedup is substantial (using library(microbenchmark)).
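A benchmark along those lines might look like this (a small N is used so the slow loop finishes quickly; timings are machine-dependent):

library(microbenchmark)

df <- data.frame(replicate(5, runif(10000)))
df[, 1:3] <- round(df[, 1:3])

microbenchmark(f0(df), f1(df), f2(df), times = 5)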
The reason for the difference can be seen when R has been compiled with memory profiling enabled, so that tracemem() reports every copy of a traced object.
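A sketch of the kind of experiment involved (tracemem() prints one line for each copy made of the traced object; the exact number of lines depends on the R version):

df <- data.frame(replicate(5, runif(10000)))
df[, 1:3] <- round(df[, 1:3])
x <- df

## Updating one cell of a data.frame column: each assignment prints copy messages
tracemem(x)
x$X4[1] <- x$X4[1] + 1
x$X4[2] <- x$X4[2] + 1
untracemem(x)

## Updating a plain numeric vector: the first assignment copies the column it still
## shares with x; after that the vector is modified in place and nothing is printed
X4 <- x$X4
tracemem(X4)
X4[1] <- X4[1] + 1
X4[2] <- X4[2] + 1
untracemem(X4)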
Each line of that tracemem output indicates a memory copy, so updating a cell in a data frame incurs 5 copies of the outer structure or the vector itself. In contrast, a vector can be updated without any copies.
(The first assignment is expensive because it represents the duplication of the data.frame column; subsequent updates are to X4, only X4 refers to the vector being updated, and the vector does not need to be duplicated.) The data.frame implementation does seem to scale non-linearly.
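This can be checked with a small scaling experiment, for example (illustrative, using the sketch functions above; absolute times are machine-dependent):

## Time the data.frame-updating and vector-updating variants at doubling N
sizes <- c(10000, 20000, 40000, 80000)
timings <- sapply(sizes, function(n) {
    df <- data.frame(replicate(5, runif(n)))
    df[, 1:3] <- round(df[, 1:3])
    c(f1 = system.time(f1(df))[["elapsed"]],
      f2 = system.time(f2(df))[["elapsed"]])
})
colnames(timings) <- sizes
timings    # expectation: f1 grows roughly quadratically, f2 roughly linearly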
The reason is apparent in the tracemem output above -- updating a row triggers a copy of the entire column. So the algorithm scales as the number of rows to update times the number of rows in a column, approximately quadratic.
f4a() appears to scale linearly. One could try to be clever about vectorizing the loop, but is it now necessary?
A tuned version of the data processing part of the function uses negative indexing (e.g., -nrow(df)) to remove rows from the data frame, rowSums() instead of apply(), and unname() so that subset operations don't carry around unused names; a sketch of such a tuned variant follows.
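A minimal sketch along those lines (the function name f5 and the exact details are assumptions, not the answer's code):

f5 <- function(df) {
    x <- df
    ## unname() drops dimnames so that subsetting doesn't drag row/column names along;
    ## negative indexing (-nrow(m), -1) drops the last / first row for the comparison
    m <- unname(as.matrix(df[, 1:3]))
    rind  <- which(rowSums(m[-nrow(m), ] == m[-1, ]) == 3L)
    rind1 <- rind + 1L
    X4 <- x$X4
    for (i in seq_along(rind))
        X4[rind1[i]] <- X4[rind1[i]] + X4[rind[i]]
    x$X4 <- X4
    x$X5[rind1] <- trunc(X4[rind1] * x$X3[rind1] * 10^8) / 10^8
    x[-rind, ]    # negative indexing removes the merged rows directly
}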
Compared to the data.table solution suggested by @Khashaa, the base R version performs favorably. (The pre-tuning version in f4a takes about 760ms, so more than twice as slow.)
The results from the data.table implementation are not correct, and I'm not enough of a data.table wizard (barely a data.table user) to know what the correct formulation is.
Compiling (does the benefit come exclusively from the for loop?) increases speed by about 20%.
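A sketch of that step with the base compiler package (again using the illustrative f5 from above; recent R versions byte-compile functions automatically via the JIT, so the explicit step matters less there):

library(compiler)
library(microbenchmark)

f5c <- cmpfun(f5)    # byte-compile; the explicit for loop is the main beneficiary

df <- data.frame(replicate(5, runif(200000)))
df[, 1:3] <- round(df[, 1:3])

microbenchmark(f5(df), f5c(df), times = 5)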