我有几个大数据框(100万+行 × 6-10列),需要反复对它们取子集。取子集这一步是我代码中最慢的部分,我想知道有没有更快的方法。
# Fetch the sample data set. load() treats a character string as a local file
# name, so a remote .rda file has to be read through a url() connection.
con <- url("https://dl.dropbox.com/u/4131944/Temp/DF_IOSTAT_ALL.rda")
load(con)
close(con)
# Bounds of the time window to extract.
start_in <- strptime("2012-08-20 13:00", "%Y-%m-%d %H:%M")
end_in<- strptime("2012-08-20 17:00", "%Y-%m-%d %H:%M")
# Time the subsetting step that is the subject of the question.
system.time(DF_IOSTAT_INT <- DF_IOSTAT_ALL[DF_IOSTAT_ALL$date_stamp >= start_in & DF_IOSTAT_ALL$date_stamp <= end_in,])
> system.time(DF_IOSTAT_INT <- DF_IOSTAT_ALL[DF_IOSTAT_ALL$date_stamp >= start_in & DF_IOSTAT_ALL$date_stamp <= end_in,])
user system elapsed
16.59 0.00 16.60
# Reproducible sample of the first rows. Note that date_stamp is stored as
# POSIXlt (a list of sec/min/hour/... components), not as POSIXct.
dput(head(DF_IOSTAT_ALL))
structure(list(date_stamp = structure(list(sec = c(14, 24, 34,
44, 54, 4), min = c(0L, 0L, 0L, 0L, 0L, 1L), hour = c(0L, 0L,
0L, 0L, 0L, 0L), mday = c(20L, 20L, 20L, 20L, 20L, 20L), mon = c(7L,
7L, 7L, 7L, 7L, 7L), year = c(112L, 112L, 112L, 112L, 112L, 112L
), wday = c(1L, 1L, 1L, 1L, 1L, 1L), yday = c(232L, 232L, 232L,
232L, 232L, 232L), isdst = c(1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("sec",
"min", "hour", "mday", "mon", "year", "wday", "yday", "isdst"
), class = c("POSIXlt", "POSIXt")), cpu = c(0.9, 0.2, 0.2, 0.1,
0.2, 0.1), rsec_s = c(0, 0, 0, 0, 0, 0), wsec_s = c(0, 3.8, 0,
0.4, 0.2, 0.2), util_pct = c(0, 0.1, 0, 0, 0, 0), node = c("bda101",
"bda101", "bda101", "bda101", "bda101", "bda101")), .Names = c("date_stamp",
"cpu", "rsec_s", "wsec_s", "util_pct", "node"), row.names = c(NA,
6L), class = "data.frame")
我会用 xts 来做这件事。唯一潜在的问题是:xts 对象是带有有序索引属性的矩阵,因此不能像 data.frame 那样混用不同类型的列。
如果 node 列的值始终不变,可以把它从 xts 对象中排除:
library(xts)
# Build an xts object from the four numeric measurement columns (2-5),
# indexed by the timestamps converted to POSIXct.
x <- xts(
  x = DF_IOSTAT_ALL[, 2:5],
  order.by = as.POSIXct(DF_IOSTAT_ALL$date_stamp)
)
# Range subsetting with a "from/to" time string.
x["2012-08-20 00:00:24/2012-08-20 00:00:54"]
更新:使用提问者的实际数据:
Data <- DF_IOSTAT_ALL
# change node from character to numeric (e.g. "bda101" -> 101),
# so it can exist in the xts object too.
# (The original line was missing its closing parenthesis.)
Data$node <- as.numeric(gsub("^bda", "", Data$node))
# create the xts object: every column except date_stamp, indexed by time
x <- xts(Data[,-1], as.POSIXct(Data$date_stamp))
# subset one day
system.time(x['2012-08-20 13:00/2012-08-20 17:00'])
# user system elapsed
# 0 0 0
# subset 13:00-17:00 for all days
system.time(x['T13:00/T17:00'])
# user system elapsed
# 2.64 0.00 2.66
下面是我用 data.table 做的实验。有趣的是,仅仅转换为 data.table 就能加快查找速度,可能是因为对逻辑向量的求值更高效。我比较了四种做法:原始数据框查找;把 POSIXlt 转换为 POSIXct 之后的查找(感谢 Matthew Dowle);data.table 查找;以及把复制和转换等准备步骤也计入的 data.table 查找。即使算上额外的准备步骤,data.table 查找仍然胜过原始做法。如果需要多次查找,节省的时间会更多。
library(data.table)
library(rbenchmark)
# Load the data set from the working directory and bind it under the name
# used by the benchmark functions.
load("DF_IOSTAT_ALL.rda")
DF_IOSTAT_ALL.original <- DF_IOSTAT_ALL
# Bounds of the time window that every benchmarked function extracts.
start_in <- strptime("2012-08-20 13:00", format = "%Y-%m-%d %H:%M")
end_in <- strptime("2012-08-20 17:00", format = "%Y-%m-%d %H:%M")
# Function to test: the original approach, i.e. logical subsetting on the
# unmodified data frame. Reads the globals DF_IOSTAT_ALL.original, start_in
# and end_in, and stores the result in the global DF_IOSTAT_INT.
fun <- function() {
  stamps <- DF_IOSTAT_ALL.original$date_stamp
  in_window <- stamps >= start_in & stamps <= end_in
  DF_IOSTAT_INT <<- DF_IOSTAT_ALL.original[in_window, ]
}
# Function to test: same subsetting, but on a copy of the data frame whose
# date_stamp column has been converted to POSIXct.
DF_IOSTAT_ALL.ct <- DF_IOSTAT_ALL.original
DF_IOSTAT_ALL.ct$date_stamp <- as.POSIXct(DF_IOSTAT_ALL.ct$date_stamp)
fun.ct <- function() {
  in_window <- with(
    DF_IOSTAT_ALL.ct,
    date_stamp >= start_in & date_stamp <= end_in
  )
  DF_IOSTAT_INT <<- DF_IOSTAT_ALL.ct[in_window, ]
}
# Function to test: data.table built once (outside the timed function) from
# the POSIXct data frame, then subset inside the function.
DF_IOSTAT_ALL.dt <- as.data.table(DF_IOSTAT_ALL.ct)
fun.dt <- function() {
  DF_IOSTAT_INT <<- DF_IOSTAT_ALL.dt[
    date_stamp >= start_in & date_stamp <= end_in
  ]
}
# Function to test: data.table + POSIXct, with the setup steps (rebinding,
# POSIXct conversion, data.table construction) included in the timed call.
newfun <- function() {
  frame <- DF_IOSTAT_ALL.original
  # data.table doesn't play well with POSIXlt, so convert to POSIXct
  frame$date_stamp <- as.POSIXct(frame$date_stamp)
  iostat_dt <- data.table(frame)
  DF_IOSTAT_INT <<- iostat_dt[date_stamp >= start_in & date_stamp <= end_in]
}
# Run each approach three times and sort the results by relative elapsed time.
benchmark(
  fun(), fun.ct(), fun.dt(), newfun(),
  replications = 3,
  order = "relative"
)
# test replications elapsed relative user.self sys.self user.child sys.child
#3 fun.dt() 3 0.18 1.000000 0.11 0.08 NA NA
#2 fun.ct() 3 0.52 2.888889 0.44 0.08 NA NA
#4 newfun() 3 35.49 197.166667 34.88 0.58 NA NA
#1 fun() 3 66.68 370.444444 66.42 0.15 NA NA
如果事先知道时间区间,可以先用 findInterval 或 cut 对数据分段,再配合带键(索引)的 data.table,这样查找可能会更快。
# Pre-bin the rows by hour and key the table on the bin index, so that a
# whole hour can be fetched with a keyed lookup instead of a vector scan.
# NOTE(review): DF_IOSTAT_ALL.new, strptime.d and fun2 are not defined in the
# visible source; presumably they come from the author's session - confirm
# before running.
# Work on a copy so the := below does not alter DF_IOSTAT_ALL.new
# (presumably a data.table - TODO confirm).
DF_IOSTAT_ALL <- copy(DF_IOSTAT_ALL.new)
# 179 break points spaced 3600 seconds apart, starting at 2012-08-19 19:00:00
# (assumes strptime.d returns a date-time where + adds seconds - TODO confirm).
time.breaks <- strptime.d("2012-08-19 19:00:00") + 0:178 * 60 * 60 #by hour
# Tag every row with the index of the hourly bin its date_stamp falls into.
DF_IOSTAT_ALL[,interval := findInterval(date_stamp,time.breaks)]
# Key the table on the bin index so J(<bin>) lookups can use the key.
setkey(DF_IOSTAT_ALL,interval)
# Bounds of bin 60; presumably read by fun2 as its time window - verify.
start_in <- time.breaks[60]
end_in <- time.breaks[61]
# Compare the keyed lookup of bin 60 against fun2 on the same table.
benchmark(a <- DF_IOSTAT_ALL[J(60)],b <- fun2(DF_IOSTAT_ALL))
# test replications elapsed relative user.self sys.self user.child sys.child
#1 DF_IOSTAT_ALL[J(60)] 100 0.78 1.000000 0.64 0.14 NA NA
#2 fun2(DF_IOSTAT_ALL) 100 6.69 8.576923 5.76 0.91 NA NA
all.equal(a,b[,.SD,.SDcols=c(12,1:11,13)]) #test for equality (rearranging columns to match)
#TRUE