Replicating ddply with the dplyr package? ddply is

2019-06-14 04:31发布

I am working with some big time series datasets, with about 2million rows in each file. So far I've been using ddply to aggregate the data like I want it to but unfortunately it has become too slow and I really need a faster way. Here is my code:

    DF <- read.csv(file = "NSE/20151221/AUROPHARMA15DECFUT_20151221_ob.csv",header = FALSE,sep = "", col.names = c("DateTime","Seq","BP1","BQ1","BO1","AP1","AQ1","AO1","BP2","BQ2","BO2","AP2","AQ2","AO2","BP3","BQ3","BO3","AP3","AQ3","AO3","BP4","BQ4","BO4","AP4","AQ4","AO4","BP5","BQ5","BO5","AP5","AQ5","AO5","BP6","BQ6","BO6","AP6","AQ6","AO6","BP7","BQ7","BO7","AP7","AQ7","AO7","BP8","BQ8","BO8","AP8","AQ8","AO8","BP9","BQ9","BO9","AP9","AQ9","AO9","BP10","BQ10","BO10","AP10","AQ10","AO10", "C", "Price", "Qty", "OldPrice", "OldQty"), colClasses = c(NA, rep("integer",31), rep("NULL", 35)))
    DF <- DF[which(DF$DateTime != 0),]
    options(digits.secs = 3)
    DF$DateTime = as.POSIXct(DF$DateTime/(10^9), origin="1970-01-01")     
    completecase <- DF[complete.cases(DF),]
    midpoint = data.frame(DateTime=completecase$DateTime, MP=(completecase$BP1+completecase$AP1)/2)
    **# creating 10 millisecond time intervals**
    cuts = seq.POSIXt(from = min(midpoint$DateTime), to = max(midpoint$DateTime), by = .01)
   **#creating new Time variable with the 10 millisecond breaks**
    midpoint$Time = cut(midpoint$DateTime, breaks = cuts)
   **#summarizing the MP variable every 10millisecond while keeping the empty time frames with the .drop = FALSE argument**
    mp = ddply(midpoint, .(Time), summarise, MP = mean(MP), .drop = FALSE)
    mp$Time = as.POSIXct(mp$Time)
    mp_xts = xts(mp$MP, mp$Time, tzone = 'Asia/Kolkata')
    mp_xts = mp_xts["2015-12-21 09:15:00.000/2015-12-21 15:30:00.000"]
    mp_xts = makeReturns(mp_xts)

ddply is way too slow. I know I can do this with the data.table package or the dplyr package but what I cant seem to figure out is replicating the .drop = FALSE argument. Both the other packages drop all the empty time intervals but it is important for me to retain them for my analysis.

What I am trying to do is basically, aggregate my time series for every 10milliseconds, and fill the NA time frames with either 0 or locf. So if it does it right I should have 6.25*60*60*100 = 2,250,000 rows as my output.

Any ideas how I can do this faster while doing exactly the same thing as ddply?

> dput(DF[1:20,])
structure(list(DateTime = structure(c(1450669500.804, 1450669500.806, 
1450669500.806, 1450669500.807, 1450669500.807, 1450669500.808, 
1450669500.812, 1450669500.813, 1450669500.813, 1450669500.813, 
1450669500.814, 1450669500.819, 1450669500.82, 1450669500.82, 
1450669500.827, 1450669500.85, 1450669500.85, 1450669500.85, 
1450669500.851, 1450669500.851), class = c("POSIXct", "POSIXt"
), tzone = ""), Seq = c(104L, 163L, 169L, 190L, 198L, 227L, 301L, 
315L, 319L, 320L, 326L, 404L, 429L, 435L, 583L, 928L, 931L, 932L, 
944L, 947L), BP1 = c(82055L, 82055L, 82055L, 82055L, 82055L, 
82630L, 82630L, 82630L, 82630L, 82630L, 82630L, 82630L, 82630L, 
82630L, 82630L, 82630L, 82630L, 82630L, 82830L, 82830L), BQ1 = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), BO1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), AP1 = c(0L, 87800L, 
83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 
83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83795L, 83795L, 
83795L, 83795L), AQ1 = c(0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L), AO1 = c(0L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L), BP2 = c(0L, 0L, 0L, 0L, 0L, 82055L, 82525L, 82525L, 
82525L, 82525L, 82525L, 82525L, 82525L, 82525L, 82525L, 82525L, 
82525L, 82525L, 82630L, 82630L), BQ2 = c(0L, 0L, 0L, 0L, 0L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
    BO2 = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L), AP2 = c(0L, 0L, 87800L, 84805L, 
    84230L, 84230L, 84230L, 84230L, 84230L, 84230L, 84230L, 84230L, 
    84230L, 84230L, 84230L, 84230L, 83800L, 83800L, 83800L, 83800L
    ), AQ2 = c(0L, 0L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), AO2 = c(0L, 0L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L), BP3 = c(0L, 0L, 0L, 0L, 0L, 0L, 82055L, 82055L, 82055L, 
    82055L, 82055L, 82320L, 82320L, 82320L, 82320L, 82320L, 82320L, 
    82320L, 82525L, 82525L), BQ3 = c(0L, 0L, 0L, 0L, 0L, 0L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
    BO3 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L), AP3 = c(0L, 0L, 0L, 87800L, 
    84805L, 84805L, 84805L, 84805L, 84805L, 84805L, 84805L, 84805L, 
    84805L, 84805L, 84805L, 84805L, 84230L, 84230L, 84230L, 84230L
    ), AQ3 = c(0L, 0L, 0L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L), AO3 = c(0L, 0L, 0L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L), BP4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 82035L, 82035L, 
    82035L, 82055L, 82055L, 82055L, 82055L, 82060L, 82060L, 82060L, 
    82320L, 82320L), BQ4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BO4 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L), AP4 = c(0L, 0L, 0L, 0L, 87800L, 87800L, 
    87800L, 85380L, 85380L, 85380L, 85365L, 85365L, 85365L, 85365L, 
    84980L, 84980L, 84805L, 84805L, 84805L, 84400L), AQ4 = c(0L, 
    0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    2L, 2L, 2L, 1L), AO4 = c(0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BP5 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 80035L, 80035L, 82035L, 82035L, 
    82035L, 82035L, 82055L, 82055L, 82055L, 82060L, 82060L), 
    BQ5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L), BO5 = c(0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
    ), AP5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 87800L, 87800L, 87800L, 
    85380L, 85380L, 85380L, 85380L, 85365L, 85365L, 84980L, 84980L, 
    84980L, 84805L), AQ5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L), AO5 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L)), .Names = c("DateTime", "Seq", "BP1", "BQ1", 
"BO1", "AP1", "AQ1", "AO1", "BP2", "BQ2", "BO2", "AP2", "AQ2", 
"AO2", "BP3", "BQ3", "BO3", "AP3", "AQ3", "AO3", "BP4", "BQ4", 
"BO4", "AP4", "AQ4", "AO4", "BP5", "BQ5", "BO5", "AP5", "AQ5", 
"AO5"), row.names = c(NA, 20L), class = "data.frame") 

Please, let me know if I should post any other information and really appreciate the help.

> sessionInfo()
R version 3.2.2 (2015-08-14)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu precise (12.04.5 LTS)

locale:
 [1] LC_CTYPE=en_IN.UTF-8       LC_NUMERIC=C               LC_TIME=en_IN.UTF-8        LC_COLLATE=en_IN.UTF-8     LC_MONETARY=en_IN.UTF-8    LC_MESSAGES=en_IN.UTF-8    LC_PAPER=en_IN.UTF-8      
 [8] LC_NAME=C                  LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_IN.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] highfrequency_0.4 xts_0.9-7         zoo_1.7-12        data.table_1.9.7 

loaded via a namespace (and not attached):
[1] tools_3.2.2     grid_3.2.2      lattice_0.20-33

2条回答
太酷不给撩
2楼-- · 2019-06-14 05:16

The latest development version of data.table adds non-equi joins which allow for very compact and fast solutions for these kinds of problems:

dt = as.data.table(df) # or convert in place using setDT
breaks = dt[, {tmp = seq(min(DateTime) - 0.01, max(DateTime) + 0.01, 0.01);
               .(Start = tmp, End = c(tail(tmp, -1), Inf))}];

dt[breaks, on = .(DateTime >= Start, DateTime < End),
   lapply(.SD, mean), by = .EACHI]
查看更多
戒情不戒烟
3楼-- · 2019-06-14 05:18

Can't do much better than data.table, except perhaps trying to write your own C++ implementation in Rcpp. Anyway, here's a data.table solution, using findInterval() instead of cut(), which should be slightly faster. Also, I've expanded the breaks outside the min and max to get some extra empty intervals to test.

library(data.table);
setDT(DF);
breaks <- seq.POSIXt(min(DF$DateTime)-0.01,max(DF$DateTime)+0.01,0.01);
breaks;
## [1] "2015-12-20 22:45:00 EST" "2015-12-20 22:45:00 EST" "2015-12-20 22:45:00 EST"
## [4] "2015-12-20 22:45:00 EST" "2015-12-20 22:45:00 EST" "2015-12-20 22:45:00 EST"
## [7] "2015-12-20 22:45:00 EST"
diff(breaks);
## Time differences in secs
## [1] 0.00999999 0.00999999 0.00999999 0.00999999 0.00999999 0.00999999
ints <- findInterval(DF$DateTime,breaks);
ints;
##  [1] 2 2 2 2 2 2 2 2 2 2 3 3 3 3 4 6 6 6 6 6
agg <- DF[,lapply(.SD,mean),breaks[ints],.SDcols=-1L];
agg;
##                 breaks   Seq     BP1 BQ1 BO1   AP1  AQ1 AO1     BP2 BQ2 BO2     AP2 AQ2 AO2      BP3 BQ3 BO3   AP3 AQ3 AO3   BP4 BQ4 BO4   AP4 AQ4 AO4     BP5 BQ5 BO5   AP5 AQ5 AO5
## 1: 2015-12-20 22:45:00 230.6 82342.5   1   1 75820 0.90 0.9 41215.5 0.5 0.5 67798.5 0.9 0.8 32822.00 0.4 0.4 59663 1.3 0.7 16407 0.2 0.2 51954 0.6 0.6  8003.5 0.1 0.1 26340 0.3 0.3
## 2: 2015-12-20 22:45:00 398.5 82630.0   1   1 83800 1.25 1.0 82525.0 1.0 1.0 84230.0 1.0 1.0 82253.75 1.0 1.0 84805 2.0 1.0 82050 1.0 1.0 85365 1.0 1.0 81535.0 1.0 1.0 85380 1.0 1.0
## 3: 2015-12-20 22:45:00 583.0 82630.0   1   1 83800 2.00 1.0 82525.0 1.0 1.0 84230.0 1.0 1.0 82320.00 1.0 1.0 84805 2.0 1.0 82055 1.0 1.0 84980 1.0 1.0 82035.0 1.0 1.0 85365 1.0 1.0
## 4: 2015-12-20 22:45:00 936.4 82710.0   1   1 83796 1.20 1.0 82567.0 1.0 1.0 83886.0 1.8 1.0 82402.00 1.0 1.0 84345 1.2 1.0 82164 1.0 1.0 84759 1.6 1.0 82057.0 1.0 1.0 85022 1.2 1.0
emp <- setdiff(seq_along(breaks),ints);
emp;
## [1] 1 5 7
agg <- rbind(fill=T,agg,data.table(breaks=breaks[emp]));
agg;
##                 breaks   Seq     BP1 BQ1 BO1   AP1  AQ1 AO1     BP2 BQ2 BO2     AP2 AQ2 AO2      BP3 BQ3 BO3   AP3 AQ3 AO3   BP4 BQ4 BO4   AP4 AQ4 AO4     BP5 BQ5 BO5   AP5 AQ5 AO5
## 1: 2015-12-20 22:45:00 230.6 82342.5   1   1 75820 0.90 0.9 41215.5 0.5 0.5 67798.5 0.9 0.8 32822.00 0.4 0.4 59663 1.3 0.7 16407 0.2 0.2 51954 0.6 0.6  8003.5 0.1 0.1 26340 0.3 0.3
## 2: 2015-12-20 22:45:00 398.5 82630.0   1   1 83800 1.25 1.0 82525.0 1.0 1.0 84230.0 1.0 1.0 82253.75 1.0 1.0 84805 2.0 1.0 82050 1.0 1.0 85365 1.0 1.0 81535.0 1.0 1.0 85380 1.0 1.0
## 3: 2015-12-20 22:45:00 583.0 82630.0   1   1 83800 2.00 1.0 82525.0 1.0 1.0 84230.0 1.0 1.0 82320.00 1.0 1.0 84805 2.0 1.0 82055 1.0 1.0 84980 1.0 1.0 82035.0 1.0 1.0 85365 1.0 1.0
## 4: 2015-12-20 22:45:00 936.4 82710.0   1   1 83796 1.20 1.0 82567.0 1.0 1.0 83886.0 1.8 1.0 82402.00 1.0 1.0 84345 1.2 1.0 82164 1.0 1.0 84759 1.6 1.0 82057.0 1.0 1.0 85022 1.2 1.0
## 5: 2015-12-20 22:45:00    NA      NA  NA  NA    NA   NA  NA      NA  NA  NA      NA  NA  NA       NA  NA  NA    NA  NA  NA    NA  NA  NA    NA  NA  NA      NA  NA  NA    NA  NA  NA
## 6: 2015-12-20 22:45:00    NA      NA  NA  NA    NA   NA  NA      NA  NA  NA      NA  NA  NA       NA  NA  NA    NA  NA  NA    NA  NA  NA    NA  NA  NA      NA  NA  NA    NA  NA  NA
## 7: 2015-12-20 22:45:00    NA      NA  NA  NA    NA   NA  NA      NA  NA  NA      NA  NA  NA       NA  NA  NA    NA  NA  NA    NA  NA  NA    NA  NA  NA      NA  NA  NA    NA  NA  NA
查看更多
登录 后发表回答