I am working with some big time series datasets, with about 2million rows in each file. So far I've been using ddply to aggregate the data like I want it to but unfortunately it has become too slow and I really need a faster way. Here is my code:
DF <- read.csv(file = "NSE/20151221/AUROPHARMA15DECFUT_20151221_ob.csv",header = FALSE,sep = "", col.names = c("DateTime","Seq","BP1","BQ1","BO1","AP1","AQ1","AO1","BP2","BQ2","BO2","AP2","AQ2","AO2","BP3","BQ3","BO3","AP3","AQ3","AO3","BP4","BQ4","BO4","AP4","AQ4","AO4","BP5","BQ5","BO5","AP5","AQ5","AO5","BP6","BQ6","BO6","AP6","AQ6","AO6","BP7","BQ7","BO7","AP7","AQ7","AO7","BP8","BQ8","BO8","AP8","AQ8","AO8","BP9","BQ9","BO9","AP9","AQ9","AO9","BP10","BQ10","BO10","AP10","AQ10","AO10", "C", "Price", "Qty", "OldPrice", "OldQty"), colClasses = c(NA, rep("integer",31), rep("NULL", 35)))
DF <- DF[which(DF$DateTime != 0),]
options(digits.secs = 3)
DF$DateTime = as.POSIXct(DF$DateTime/(10^9), origin="1970-01-01")
completecase <- DF[complete.cases(DF),]
midpoint = data.frame(DateTime=completecase$DateTime, MP=(completecase$BP1+completecase$AP1)/2)
**# creating 10 millisecond time intervals**
cuts = seq.POSIXt(from = min(midpoint$DateTime), to = max(midpoint$DateTime), by = .01)
**#creating new Time variable with the 10 millisecond breaks**
midpoint$Time = cut(midpoint$DateTime, breaks = cuts)
**#summarizing the MP variable every 10millisecond while keeping the empty time frames with the .drop = FALSE argument**
mp = ddply(midpoint, .(Time), summarise, MP = mean(MP), .drop = FALSE)
mp$Time = as.POSIXct(mp$Time)
mp_xts = xts(mp$MP, mp$Time, tzone = 'Asia/Kolkata')
mp_xts = mp_xts["2015-12-21 09:15:00.000/2015-12-21 15:30:00.000"]
mp_xts = makeReturns(mp_xts)
ddply is way too slow. I know I can do this with the data.table package or the dplyr package but what I cant seem to figure out is replicating the .drop = FALSE argument. Both the other packages drop all the empty time intervals but it is important for me to retain them for my analysis.
What I am trying to do is basically, aggregate my time series for every 10milliseconds, and fill the NA time frames with either 0 or locf. So if it does it right I should have 6.25*60*60*100 = 2,250,000 rows as my output.
Any ideas how I can do this faster while doing exactly the same thing as ddply?
> dput(DF[1:20,])
structure(list(DateTime = structure(c(1450669500.804, 1450669500.806,
1450669500.806, 1450669500.807, 1450669500.807, 1450669500.808,
1450669500.812, 1450669500.813, 1450669500.813, 1450669500.813,
1450669500.814, 1450669500.819, 1450669500.82, 1450669500.82,
1450669500.827, 1450669500.85, 1450669500.85, 1450669500.85,
1450669500.851, 1450669500.851), class = c("POSIXct", "POSIXt"
), tzone = ""), Seq = c(104L, 163L, 169L, 190L, 198L, 227L, 301L,
315L, 319L, 320L, 326L, 404L, 429L, 435L, 583L, 928L, 931L, 932L,
944L, 947L), BP1 = c(82055L, 82055L, 82055L, 82055L, 82055L,
82630L, 82630L, 82630L, 82630L, 82630L, 82630L, 82630L, 82630L,
82630L, 82630L, 82630L, 82630L, 82630L, 82830L, 82830L), BQ1 = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), BO1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), AP1 = c(0L, 87800L,
83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83800L,
83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83795L, 83795L,
83795L, 83795L), AQ1 = c(0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L), AO1 = c(0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), BP2 = c(0L, 0L, 0L, 0L, 0L, 82055L, 82525L, 82525L,
82525L, 82525L, 82525L, 82525L, 82525L, 82525L, 82525L, 82525L,
82525L, 82525L, 82630L, 82630L), BQ2 = c(0L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
BO2 = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), AP2 = c(0L, 0L, 87800L, 84805L,
84230L, 84230L, 84230L, 84230L, 84230L, 84230L, 84230L, 84230L,
84230L, 84230L, 84230L, 84230L, 83800L, 83800L, 83800L, 83800L
), AQ2 = c(0L, 0L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), AO2 = c(0L, 0L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), BP3 = c(0L, 0L, 0L, 0L, 0L, 0L, 82055L, 82055L, 82055L,
82055L, 82055L, 82320L, 82320L, 82320L, 82320L, 82320L, 82320L,
82320L, 82525L, 82525L), BQ3 = c(0L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
BO3 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), AP3 = c(0L, 0L, 0L, 87800L,
84805L, 84805L, 84805L, 84805L, 84805L, 84805L, 84805L, 84805L,
84805L, 84805L, 84805L, 84805L, 84230L, 84230L, 84230L, 84230L
), AQ3 = c(0L, 0L, 0L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L), AO3 = c(0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), BP4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 82035L, 82035L,
82035L, 82055L, 82055L, 82055L, 82055L, 82060L, 82060L, 82060L,
82320L, 82320L), BQ4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BO4 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), AP4 = c(0L, 0L, 0L, 0L, 87800L, 87800L,
87800L, 85380L, 85380L, 85380L, 85365L, 85365L, 85365L, 85365L,
84980L, 84980L, 84805L, 84805L, 84805L, 84400L), AQ4 = c(0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 1L), AO4 = c(0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BP5 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 80035L, 80035L, 82035L, 82035L,
82035L, 82035L, 82055L, 82055L, 82055L, 82060L, 82060L),
BQ5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), BO5 = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), AP5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 87800L, 87800L, 87800L,
85380L, 85380L, 85380L, 85380L, 85365L, 85365L, 84980L, 84980L,
84980L, 84805L), AQ5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L), AO5 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L)), .Names = c("DateTime", "Seq", "BP1", "BQ1",
"BO1", "AP1", "AQ1", "AO1", "BP2", "BQ2", "BO2", "AP2", "AQ2",
"AO2", "BP3", "BQ3", "BO3", "AP3", "AQ3", "AO3", "BP4", "BQ4",
"BO4", "AP4", "AQ4", "AO4", "BP5", "BQ5", "BO5", "AP5", "AQ5",
"AO5"), row.names = c(NA, 20L), class = "data.frame")
Please, let me know if I should post any other information and really appreciate the help.
> sessionInfo()
R version 3.2.2 (2015-08-14)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu precise (12.04.5 LTS)
locale:
[1] LC_CTYPE=en_IN.UTF-8 LC_NUMERIC=C LC_TIME=en_IN.UTF-8 LC_COLLATE=en_IN.UTF-8 LC_MONETARY=en_IN.UTF-8 LC_MESSAGES=en_IN.UTF-8 LC_PAPER=en_IN.UTF-8
[8] LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_IN.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] highfrequency_0.4 xts_0.9-7 zoo_1.7-12 data.table_1.9.7
loaded via a namespace (and not attached):
[1] tools_3.2.2 grid_3.2.2 lattice_0.20-33
The latest development version of
data.table
adds non-equi joins which allow for very compact and fast solutions for these kinds of problems:Can't do much better than data.table, except perhaps trying to write your own C++ implementation in Rcpp. Anyway, here's a data.table solution, using
findInterval()
instead ofcut()
, which should be slightly faster. Also, I've expanded the breaks outside the min and max to get some extra empty intervals to test.