I have a couple of large data frames (1 million+ rows x 6-10 columns) I need to subset repeatedly. The subsetting section is the slowest part of my code and I curious if there is way to do this faster.
load("https://dl.dropbox.com/u/4131944/Temp/DF_IOSTAT_ALL.rda")
start_in <- strptime("2012-08-20 13:00", "%Y-%m-%d %H:%M")
end_in<- strptime("2012-08-20 17:00", "%Y-%m-%d %H:%M")
system.time(DF_IOSTAT_INT <- DF_IOSTAT_ALL[DF_IOSTAT_ALL$date_stamp >= start_in & DF_IOSTAT_ALL$date_stamp <= end_in,])
> system.time(DF_IOSTAT_INT <- DF_IOSTAT_ALL[DF_IOSTAT_ALL$date_stamp >= start_in & DF_IOSTAT_ALL$date_stamp <= end_in,])
user system elapsed
16.59 0.00 16.60
dput(head(DF_IOSTAT_ALL))
structure(list(date_stamp = structure(list(sec = c(14, 24, 34,
44, 54, 4), min = c(0L, 0L, 0L, 0L, 0L, 1L), hour = c(0L, 0L,
0L, 0L, 0L, 0L), mday = c(20L, 20L, 20L, 20L, 20L, 20L), mon = c(7L,
7L, 7L, 7L, 7L, 7L), year = c(112L, 112L, 112L, 112L, 112L, 112L
), wday = c(1L, 1L, 1L, 1L, 1L, 1L), yday = c(232L, 232L, 232L,
232L, 232L, 232L), isdst = c(1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("sec",
"min", "hour", "mday", "mon", "year", "wday", "yday", "isdst"
), class = c("POSIXlt", "POSIXt")), cpu = c(0.9, 0.2, 0.2, 0.1,
0.2, 0.1), rsec_s = c(0, 0, 0, 0, 0, 0), wsec_s = c(0, 3.8, 0,
0.4, 0.2, 0.2), util_pct = c(0, 0.1, 0, 0, 0, 0), node = c("bda101",
"bda101", "bda101", "bda101", "bda101", "bda101")), .Names = c("date_stamp",
"cpu", "rsec_s", "wsec_s", "util_pct", "node"), row.names = c(NA,
6L), class = "data.frame")
I would use xts for this. The only potential hiccup is that xts is a matrix with an ordered index attribute, so you can't mix types like you can in a data.frame.
If the node column is invariant, you can just exclude it from your xts object:
library(xts)
x <- xts(DF_IOSTAT_ALL[,2:5], as.POSIXct(DF_IOSTAT_ALL$date_stamp))
x["2012-08-20 00:00:24/2012-08-20 00:00:54"]
Update using the OP's actual data:
Data <- DF_IOSTAT_ALL
# change node from character to numeric,
# so it can exist in the xts object too.
Data$node <- as.numeric(gsub("^bda","",Data$node)
# create the xts object
x <- xts(Data[,-1], as.POSIXct(Data$date_stamp))
# subset one day
system.time(x['2012-08-20 13:00/2012-08-20 17:00'])
# user system elapsed
# 0 0 0
# subset 13:00-17:00 for all days
system.time(x['T13:00/T17:00'])
# user system elapsed
# 2.64 0.00 2.66
Here are my experiments with data.table
. Interestingly, just the conversion to data.table
will make your lookups faster, possibly through more efficient lookup to the logical vectors. I compared four things: the original data frame lookup; a lookup with conversion from POSIXlt to POSIXct (thanks to Matthew Dowle); the data table lookup; and the data table lookup in addition to the setup of copy and conversion. Even with the additional setup, the data table lookup wins. With multiple lookups, you'll get even more savings in time.
library(data.table)
library(rbenchmark)
load("DF_IOSTAT_ALL.rda")
DF_IOSTAT_ALL.original <- DF_IOSTAT_ALL
start_in <- strptime("2012-08-20 13:00", "%Y-%m-%d %H:%M")
end_in<- strptime("2012-08-20 17:00", "%Y-%m-%d %H:%M")
#function to test: original
fun <- function() DF_IOSTAT_INT <<- DF_IOSTAT_ALL.original[DF_IOSTAT_ALL.original$date_stamp >= start_in & DF_IOSTAT_ALL.original$date_stamp <= end_in,]
#function to test: changing to POSIXct
DF_IOSTAT_ALL.ct <- within(DF_IOSTAT_ALL.original,date_stamp <- as.POSIXct(date_stamp))
fun.ct <- function() DF_IOSTAT_INT <<- DF_IOSTAT_ALL.ct[with(DF_IOSTAT_ALL.ct,date_stamp >= start_in & date_stamp <= end_in),]
#function to test: with data.table and POSIXct
DF_IOSTAT_ALL.dt <- as.data.table(DF_IOSTAT_ALL.ct);
fun.dt <- function() DF_IOSTAT_INT <<- DF_IOSTAT_ALL.dt[date_stamp >= start_in & date_stamp <= end_in,]
#function to test: with data table and POSIXct, with setup steps
newfun <- function() {
DF_IOSTAT_ALL <- DF_IOSTAT_ALL.original;
#data.table doesn't play well with POSIXlt, so convert to POSIXct
DF_IOSTAT_ALL$date_stamp <- as.POSIXct(DF_IOSTAT_ALL$date_stamp);
DF_IOSTAT_ALL <- data.table(DF_IOSTAT_ALL);
DF_IOSTAT_INT <<- DF_IOSTAT_ALL[date_stamp >= start_in & date_stamp <= end_in,];
}
benchmark(fun(), fun.ct(), fun.dt(), newfun(), replications=3,order="relative")
# test replications elapsed relative user.self sys.self user.child sys.child
#3 fun.dt() 3 0.18 1.000000 0.11 0.08 NA NA
#2 fun.ct() 3 0.52 2.888889 0.44 0.08 NA NA
#4 newfun() 3 35.49 197.166667 34.88 0.58 NA NA
#1 fun() 3 66.68 370.444444 66.42 0.15 NA NA
If you know what your time intervals are beforehand, you can probably make it even faster by splitting with findInterval
or cut
and keying/indexing the table.
DF_IOSTAT_ALL <- copy(DF_IOSTAT_ALL.new)
time.breaks <- strptime.d("2012-08-19 19:00:00") + 0:178 * 60 * 60 #by hour
DF_IOSTAT_ALL[,interval := findInterval(date_stamp,time.breaks)]
setkey(DF_IOSTAT_ALL,interval)
start_in <- time.breaks[60]
end_in <- time.breaks[61]
benchmark(a <- DF_IOSTAT_ALL[J(60)],b <- fun2(DF_IOSTAT_ALL))
# test replications elapsed relative user.self sys.self user.child sys.child
#1 DF_IOSTAT_ALL[J(60)] 100 0.78 1.000000 0.64 0.14 NA NA
#2 fun2(DF_IOSTAT_ALL) 100 6.69 8.576923 5.76 0.91 NA NA
all.equal(a,b[,.SD,.SDcols=c(12,1:11,13)]) #test for equality (rearranging columns to match)
#TRUE