Using rollapply and lm over multiple columns of da

2019-07-10 16:57发布

问题:

I have a data frame similar to the following with a total of 500 columns:

Probes <- data.frame(Days=seq(0.01, 4.91, 0.01), B1=5:495,B2=-100:390, B3=10:500,B4=-200:290)

I would like to calculate a rolling window linear regression where my window size is 12 data points and each sequential regression is separated by 6 data points. For each regression, "Days" will always be the x component of the model, and the y's would be each of the other columns (B1, followed by B2, B3, etc). I would then like to save the co-efficients as a dataframe with the existing column titles (B1, B2, etc).

I think my code is close, but is not quite working. I used rollapply from the zoo library.

slopedata<-rollapply(zoo(Probes), width=12, function(Probes) { 
 coef(lm(formula=y~Probes$Days, data = Probes))[2]
 }, by = 6, by.column=TRUE, align="right")

If possible, I would also like to have the "xmins" saved to a vector to add to the dataframe. This would mean the smallest x value used in each regression (basically it would be every 6 numbers in the "Days" column.) Thanks for your help.

回答1:

1) Define a zoo object z whose data contains Probes and whose index is taken from the first column of Probes, i.e. Days. Noting that lm allows y to be a matrix define a coefs function which computes the regression coefficients. Finally rollapply over z. Note that the index of the returned object gives xmin.

library(zoo)

z <- zoo(Probes, Probes[[1]])

coefs <- function(z) c(unlist(as.data.frame(coef(lm(z[,-1] ~ z[,1])))))
rz <- rollapply(z, 12, by = 6, coefs, by.column = FALSE, align = "left")

giving:

> head(rz)
     B11 B12  B21 B22 B31 B32  B41 B42
0.01   4 100 -101 100   9 100 -201 100
0.07   4 100 -101 100   9 100 -201 100
0.13   4 100 -101 100   9 100 -201 100
0.19   4 100 -101 100   9 100 -201 100
0.25   4 100 -101 100   9 100 -201 100
0.31   4 100 -101 100   9 100 -201 100

Note that DF <- fortify.zoo(rz) could be used if you needed a data frame representation of rz.

2) An alternative somewhat similar approch would be to rollaplly over the row numbers:

library(zoo)
y <- as.matrix(Probes[-1])
Days <- Probes$Days
n <- nrow(Probes)
coefs <- function(ix) c(unlist(as.data.frame(coef(lm(y ~ Days, subset = ix)))), 
      xmins = Days[ix][1])
r <- rollapply(1:n, 12, by = 6, coefs)


回答2:

try this:

# here are the xmin values you wanted
xmins <- Probes$Days[seq(1,nrow(Probes),6)]

# here we build a function that will run regressions across the columns
# y1 vs x, y2 vs x, y3 vs x...
# you enter the window and by (12/6) in order to limit the interval being
# regressed. this is later called in do.call
runreg <- function(Probes,m,window=12,by=6){

  # beg,end are used to specify the interval
  beg <- seq(1,nrow(Probes),by)[m]
  end <- beg+window-1

  # this is used to go through all the columns
  N <- ncol(Probes)-1
  tmp <- numeric(N)
  # go through each column and store the coefficients in tmp
  for(i in 1:N){
     y <- Probes[[i+1]][beg:end]
     x <- Probes$Days[beg:end]
     tmp[i] <- coef(lm(y~x))[2][[1]]
  }
  # put all our column regressions into a dataframe
  res <- rbind('coeff'=tmp)
  colnames(res) <- colnames(Probes)[-1]

  return(res)
}

# now that we've built the function to do the column regressions
# we just need to go through all the window-ed regressions (row regressions)
res <- do.call(rbind,lapply(1:length(xmins),function(m) runreg(Probes,m)))

# these rownames are the index of the xmin values
rownames(res) <- seq(1,nrow(Probes),6)
res <- data.frame(res,xmins)


回答3:

You can also use the rollRegres package as follows

# setup data
Probes <- data.frame(
  # I changed the days to be intergers
  Days=seq(1L, 491L, 1L),
  B1=5:495, B2=-100:390, B3=10:500 , B4=-200:290)

# setup grp argument
grp_arg <- as.integer((Probes$Days - 1L) %/% 6)

# estimate coefs. width argument is realtive in grp units
library(rollRegres)
X <- cbind(1, Probes$Days / 100)
Ys <- as.matrix(Probes[, 2:5])
out <- lapply(1:ncol(Ys), function(i)
  roll_regres.fit(x = X, y = Ys[, i], width = 2L, grp = grp_arg)$coefs)
out <- do.call(cbind, out)

# only keep the complete.cases and the unique values
colnames(out) <- sapply(1:4, function(i) paste0("B", i, 0:1))
out <- out[c(T, grp_arg[-1] != head(grp_arg, -1)), ]
out <- out[complete.cases(out), ]
head(out)
#R      B10 B11  B20 B21 B30 B31  B40 B41
#R [1,]   4 100 -101 100   9 100 -201 100
#R [2,]   4 100 -101 100   9 100 -201 100
#R [3,]   4 100 -101 100   9 100 -201 100
#R [4,]   4 100 -101 100   9 100 -201 100
#R [5,]   4 100 -101 100   9 100 -201 100
#R [6,]   4 100 -101 100   9 100 -201 100

The solution is a lot faster than e.g., the zoo solution

library(zoo) coefs <- function(z) c(unlist(as.data.frame(coef(lm(z[,-1] ~ z[,1]))))) microbenchmark::microbenchmark(   rollapply = {
    z <- zoo(Probes, Probes[[1]])
    rz <- rollapply(z, 12, by = 6, coefs, by.column = FALSE, align = "left")   },   roll_regres = {
    grp_arg <- as.integer((Probes$Days - 1L) %/% 6)

    X <- cbind(1, Probes$Days / 100)
    Ys <- as.matrix(Probes[, 2:5])
    out <- lapply(1:ncol(Ys), function(i)
      roll_regres.fit(x = X, y = Ys[, i], width = 2L, grp = grp_arg)$coefs)
    out <- do.call(cbind, out)

    colnames(out) <- sapply(1:4, function(i) paste0("B", i, 0:1))
    out <- out[c(T, grp_arg[-1] != head(grp_arg, -1)), ]
    out <- out[complete.cases(out), ]
    head(out)   } )
#R Unit: microseconds
#R        expr       min        lq      mean     median        uq       max neval
#R   rollapply 53392.614 56330.492 59793.106 58363.2825 60902.938 119206.76   100
#R roll_regres   865.186   920.297  1074.161   983.9015  1047.705   5071.41   100

At present you though need to install the package from Github due to an error in the validation in version 0.1.0. Thus, run

devtools::install_github("boennecd/rollRegres", upgrade_dependencies = FALSE,
                         build_vignettes = TRUE)


标签: r zoo lm rollapply