quick/elegant way to construct mean/variance summa

2019-01-22 03:06发布

I can achieve this task, but I feel like there must be a "best" (slickest, most compact, clearest-code, fastest?) way of doing it and have not figured it out so far ...

For a specified set of categorical factors I want to construct a table of means and variances by group.

generate data:

set.seed(1001)
d <- expand.grid(f1=LETTERS[1:3],f2=letters[1:3],
                 f3=factor(as.character(as.roman(1:3))),rep=1:4)
d$y <- runif(nrow(d))
d$z <- rnorm(nrow(d))

desired output:

  f1 f2  f3    y.mean      y.var
1  A  a   I 0.6502307 0.09537958
2  A  a  II 0.4876630 0.11079670
3  A  a III 0.3102926 0.20280568
4  A  b   I 0.3914084 0.05869310
5  A  b  II 0.5257355 0.21863126
6  A  b III 0.3356860 0.07943314
... etc. ...

using aggregate/merge:

library(reshape)
m1 <- aggregate(y~f1*f2*f3,data=d,FUN=mean)
m2 <- aggregate(y~f1*f2*f3,data=d,FUN=var)
mvtab <- merge(rename(m1,c(y="y.mean")),
      rename(m2,c(y="y.var")))

using ddply/summarise (possibly best but haven't been able to make it work):

mvtab2 <- ddply(subset(d,select=-c(z,rep)),
                .(f1,f2,f3),
                summarise,numcolwise(mean),numcolwise(var))

results in

Error in output[[var]][rng] <- df[[var]] : 
  incompatible types (from closure to logical) in subassignment type fix

using melt/cast (maybe best?)

mvtab3 <- cast(melt(subset(d,select=-c(z,rep)),
          id.vars=1:3),
     ...~.,fun.aggregate=c(mean,var))
## now have to drop "variable"
mvtab3 <- subset(mvtab3,select=-variable)
## also should rename response variables

Won't (?) work in reshape2. Explaining ...~. to someone could be tricky!

8条回答
看我几分像从前
2楼-- · 2019-01-22 03:33

I'm a bit puzzled. Does this not work:

mvtab2 <- ddply(d,.(f1,f2,f3),
            summarise,y.mean = mean(y),y.var = var(y))

This give me something like this:

   f1 f2  f3    y.mean       y.var
1   A  a   I 0.6502307 0.095379578
2   A  a  II 0.4876630 0.110796695
3   A  a III 0.3102926 0.202805677
4   A  b   I 0.3914084 0.058693103
5   A  b  II 0.5257355 0.218631264

Which is in the right form, but it looks like the values are different that what you specified.

Edit

Here's how to make your version with numcolwise work:

mvtab2 <- ddply(subset(d,select=-c(z,rep)),.(f1,f2,f3),summarise,
                y.mean = numcolwise(mean)(piece),
                y.var = numcolwise(var)(piece)) 

You forgot to pass the actual data to numcolwise. And then there's the little ddply trick that each piece is called piece internally. (Which Hadley points out in the comments shouldn't be relied upon as it may change in future versions of plyr.)

查看更多
Lonely孤独者°
3楼-- · 2019-01-22 03:44

Here is a solution using data.table

library(data.table)
d2 = data.table(d)
ans = d2[,list(avg_y = mean(y), var_y = var(y)), 'f1, f2, f3']
查看更多
可以哭但决不认输i
4楼-- · 2019-01-22 03:46

I'm slightly addicted to speed comparisons even though they're largely irrelevant for me in this situation ...

joran_ddply <- function(d) ddply(d,.(f1,f2,f3),
                                 summarise,y.mean = mean(y),y.var = var(y))
joshulrich_aggregate <- function(d) {
  aggregate(d$y, d[,c("f1","f2","f3")],
            FUN=function(x) c(mean=mean(x),var=var(x)))
}

formula_aggregate <- function(d) {
  aggregate(y~f1*f2*f3,data=d,
            FUN=function(x) c(mean=mean(x),var=var(x)))
}
library(data.table)
d2 <- data.table(d)
ramnath_datatable <- function(d) {
  d[,list(avg_y = mean(y), var_y = var(y)), 'f1, f2, f3']
}


library(Hmisc)
dwin_hmisc <- function(d) {summary(y ~ interaction(f3,f2,f1), 
                   data=d, method="response", 
                   fun=function(y) c(mean.y=mean(y) ,var.y=var(y) ))
                         }


library(rbenchmark)
benchmark(joran_ddply(d),
          joshulrich_aggregate(d),
          ramnath_datatable(d2),
          formula_aggregate(d),
          dwin_hmisc(d))

aggregate is fastest (even faster than data.table, which is a surprise to me, although things might be different with a bigger table to aggregate), even using the formula interface ...)

                     test replications elapsed relative user.self sys.self
5           dwin_hmisc(d)          100   1.235 2.125645     1.168    0.044
4    formula_aggregate(d)          100   0.703 1.209983     0.656    0.036
1          joran_ddply(d)          100   3.345 5.757315     3.152    0.144
2 joshulrich_aggregate(d)          100   0.581 1.000000     0.596    0.000
3   ramnath_datatable(d2)          100   0.750 1.290878     0.708    0.000

(Now I just need Dirk to step up and post an Rcpp solution that is 1000 times faster than anything else ...)

查看更多
干净又极端
5楼-- · 2019-01-22 03:47

(I voted for Joshua's.) Here's an Hmisc::summary.formula solution. The advantage of this for me is that it is well integrated with the Hmisc::latex output "channel".

summary(y ~ interaction(f3,f2,f1), data=d, method="response", 
                    fun=function(y) c(mean.y=mean(y) ,var.y=var(y) ))
#-----output----------
y    N=108

+-----------------------+-------+---+---------+-----------+
|                       |       |N  |mean.y   |var.y      |
+-----------------------+-------+---+---------+-----------+
|interaction(f3, f2, f1)|I.a.A  |  4|0.6502307|0.095379578|
|                       |II.a.A |  4|0.4876630|0.110796695|

snipped output to show the latex -> PDF -> png output:

enter image description here

查看更多
在下西门庆
6楼-- · 2019-01-22 03:48

I find the doBy package has some very convenient functions for things like this. For example, the function ?summaryBy is quite handy. Consider:

> summaryBy(y~f1+f2+f3, data=d, FUN=c(mean, var))
   f1 f2  f3    y.mean       y.var
1   A  a   I 0.6502307 0.095379578
2   A  a  II 0.4876630 0.110796695
3   A  a III 0.3102926 0.202805677
4   A  b   I 0.3914084 0.058693103
5   A  b  II 0.5257355 0.218631264
6   A  b III 0.3356860 0.079433136
7   A  c   I 0.3367841 0.079487973
8   A  c  II 0.6273320 0.041373836
9   A  c III 0.4532720 0.022779672
10  B  a   I 0.6688221 0.044184575
11  B  a  II 0.5514724 0.020359289
12  B  a III 0.6389354 0.104056229
13  B  b   I 0.5052346 0.138379070
14  B  b  II 0.3933283 0.050261804
15  B  b III 0.5953874 0.161943989
16  B  c   I 0.3490460 0.079286849
17  B  c  II 0.5534569 0.207381592
18  B  c III 0.4652424 0.187463143
19  C  a   I 0.3340988 0.004994589
20  C  a  II 0.3970315 0.126967554
21  C  a III 0.3580250 0.066769484
22  C  b   I 0.7676858 0.124945402
23  C  b  II 0.3613772 0.182689385
24  C  b III 0.4175562 0.095933470
25  C  c   I 0.3592491 0.039832864
26  C  c  II 0.7882591 0.084271963
27  C  c III 0.3936949 0.085758343

So the function call is simple, easy to use, and I would say, elegant.

Now, if your primary concern is speed, it seems that it would be reasonable--at least with smaller sized tasks (note that I couldn't get the ramnath_datatable function to work for whatever reason):

                     test replications elapsed relative user.self 
4           dwin_hmisc(d)          100    0.50    2.778      0.50 
3    formula_aggregate(d)          100    0.23    1.278      0.24 
5       gung_summaryBy(d)          100    0.34    1.889      0.35 
1          joran_ddply(d)          100    1.34    7.444      1.32 
2 joshulrich_aggregate(d)          100    0.18    1.000      0.19 
查看更多
姐就是有狂的资本
7楼-- · 2019-01-22 03:51

I've came accross with this question and found the benchmarks are done with small tables, so it's hard to tell which method is better with 100 rows.

I've also modified the data a bit also to make it "unsorted", this would be a more common case, for example as the data is in a DB. I've added a few more data.table trials to see if setting a key is faster beforehand. It seems here, setting the key beforehand doesn't improve much the performance, so ramnath solution seems to be the fastest.

set.seed(1001)
d <- data.frame(f1 = sample(LETTERS[1:3], 30e5, replace = T), f2 = sample(letters[1:3], 30e5, replace = T),
                f3 = sample(factor(as.character(as.roman(1:3))), 30e5, replace = T), rep = sample(1:4, replace = T))

d$y <- runif(nrow(d))
d$z <- rnorm(nrow(d))

str(d)

require(Hmisc)
require(plyr)
require(data.table)
d2 = data.table(d)
d3 = data.table(d)

# Set key of d3 to compare how fast it is if the DT is already keyded
setkey(d3,f1,f2,f3)

joran_ddply <- function(d) ddply(d,.(f1,f2,f3),
                                 summarise,y.mean = mean(y),y.var = var(y))

formula_aggregate <- function(d) {
  aggregate(y~f1*f2*f3,data=d,
            FUN=function(x) c(mean=mean(x),var=var(x)))
}

ramnath_datatable <- function(d) {
  d[,list(avg_y = mean(y), var_y = var(y)), 'f1,f2,f3']
}

key_agg_datatable <- function(d) {
  setkey(d2,f1,f2,f3)
  d[,list(avg_y = mean(y), var_y = var(y)), 'f1,f2,f3']
}

one_key_datatable <- function(d) {
  setkey(d2,f1)
  d[,list(avg_y = mean(y), var_y = var(y)), 'f1,f2,f3']
}

including_3key_datatable <- function(d) {
  d[,list(avg_y = mean(y), var_y = var(y)), 'f1,f2,f3']
}

dwin_hmisc <- function(d) {summary(y ~ interaction(f3,f2,f1), 
                                   data=d, method="response", 
                                   fun=function(y) c(mean.y=mean(y) ,var.y=var(y) ))
}

require(rbenchmark)
benchmark(joran_ddply(d),
          joshulrich_aggregate(d),
          ramnath_datatable(d2),
          including_3key_datatable(d3),
          one_key_datatable(d2),
          key_agg_datatable(d2),
          formula_aggregate(d),
          dwin_hmisc(d)
          )

#                         test replications elapsed relative user.self sys.self
#                dwin_hmisc(d)          100 1757.28  252.121   1590.89   165.65
#         formula_aggregate(d)          100  433.56   62.204    390.83    42.50
# including_3key_datatable(d3)          100    7.00    1.004      6.02     0.98
#               joran_ddply(d)          100  173.39   24.877    119.35    53.95
#      joshulrich_aggregate(d)          100  328.51   47.132    307.14    21.22
#        key_agg_datatable(d2)          100   24.62    3.532     19.13     5.50
#        one_key_datatable(d2)          100   29.66    4.255     22.28     7.34
#        ramnath_datatable(d2)          100    6.97    1.000      5.96     1.01
查看更多
登录 后发表回答