dplyr summarize with subtotals

2019-01-25 03:35发布

One of the great things about pivot tables in excel is that they provide subtotals automatically. First, I would like to know if there is anything already created within dplyr that can accomplish this. If not, what is the easiest way to achieve it?

In the example below, I show the mean displacement by number of cylinders and carburetors. For each group of cylinders (4,6,8), I'd like to see the mean displacement for the group (or total displacement, or any other summary statistic).

library(dplyr)
mtcars %>% group_by(cyl,carb) %>% summarize(mean(disp))

  cyl carb mean(disp)
1   4    1      91.38
2   4    2     116.60
3   6    1     241.50
4   6    4     163.80
5   6    6     145.00
6   8    2     345.50
7   8    3     275.80
8   8    4     405.50
9   8    8     301.00

6条回答
霸刀☆藐视天下
2楼-- · 2019-01-25 04:02

Here is a simple one-liner creating margins within a data_frame:

library(plyr)
library(dplyr)

# Margins without labels
mtcars %>% 
  group_by(cyl,carb) %>% 
  summarize(Mean_Disp=mean(disp)) %>% 
  do(plyr::rbind.fill(., data_frame(cyl=first(.$cyl), Mean_Disp=sum(.$Mean_Disp, na.rm=T))))

output:

Source: local data frame [12 x 3]
Groups: cyl [3]

     cyl  carb Mean_Disp
   <dbl> <dbl>     <dbl>
1      4     1     91.38
2      4     2    116.60
3      4    NA    207.98
4      6     1    241.50
5      6     4    163.80
6      6     6    145.00
7      6    NA    550.30
8      8     2    345.50
9      8     3    275.80
10     8     4    405.50
11     8     8    301.00
12     8    NA   1327.80

You may also add labels for the summary statistics like:

mtcars %>% 
  group_by(cyl,carb) %>% 
  summarize(Mean_Disp=mean(disp)) %>% 
  do(plyr::rbind.fill(., data_frame(cyl=first(.$cyl), carb=c("Total", "Mean"), Mean_Disp=c(sum(.$Mean_Disp, na.rm=T), mean(.$Mean_Disp, na.rm=T)))))

output:

Source: local data frame [15 x 3]
Groups: cyl [3]

     cyl  carb Mean_Disp
   <dbl> <chr>     <dbl>
1      4     1     91.38
2      4     2    116.60
3      4 Total    207.98
4      4  Mean    103.99
5      6     1    241.50
6      6     4    163.80
7      6     6    145.00
8      6 Total    550.30
9      6  Mean    183.43
10     8     2    345.50
11     8     3    275.80
12     8     4    405.50
13     8     8    301.00
14     8 Total   1327.80
15     8  Mean    331.95
查看更多
beautiful°
3楼-- · 2019-01-25 04:04

Something similar to table with addmargins (although actually a data.frame)

library(dplyr)
library(reshape2)
out <- bind_cols(
    mtcars %>% group_by(cyl, carb) %>%
      summarise(mu = mean(disp)) %>%
      dcast(cyl ~ carb),
    (mtcars %>% group_by(cyl) %>% summarise(Total=mean(disp)))[,2]
)

margin <- t((mtcars %>% group_by(carb) %>% summarise(Total=mean(disp)))[,2])
rbind(out, c(NA, margin, mean(mtcars$disp))) %>%
  `rownames<-`(c(paste("cyl", c(4,6,8)), "Total"))  # add some row names
#      cyl        1      2     3      4   6   8    Total
# cyl 4   4  91.3800 116.60    NA     NA  NA  NA 105.1364
# cyl 6   6 241.5000     NA    NA 163.80 145  NA 183.3143
# cyl 8   8       NA 345.50 275.8 405.50  NA 301 353.1000
# Total  NA 134.2714 208.16 275.8 308.82 145 301 230.7219

The bottom row is the column wise margins, columns named 1:8 are carbs, and Total is the rowwise margins.

查看更多
老娘就宠你
4楼-- · 2019-01-25 04:06

Also possible by simply joining the two group results:

cyl_carb <- mtcars %>% group_by(cyl,carb) %>% summarize(mean(disp))
cyl <- mtcars %>% group_by(cyl) %>% summarize(mean(disp))
joined <- full_join(cyl_carb, cyl)
result <- arrange(joined, cyl)
result

gives:

Source: local data frame [12 x 3]
Groups: cyl [3]

     cyl  carb mean(disp)
   (dbl) (dbl)      (dbl)
1      4     1    91.3800
2      4     2   116.6000
3      4    NA   105.1364
4      6     1   241.5000
5      6     4   163.8000
6      6     6   145.0000
7      6    NA   183.3143
8      8     2   345.5000
9      8     3   275.8000
10     8     4   405.5000
11     8     8   301.0000
12     8    NA   353.1000

or with an additional column:

cyl_carb <- mtcars %>% group_by(cyl,carb) %>% summarize(mean(disp))
cyl <- mtcars %>% group_by(cyl) %>% summarize(mean.cyl = mean(disp))
joined <- full_join(cyl_carb, cyl)
joined

gives:

Source: local data frame [9 x 4]
Groups: cyl [?]

    cyl  carb mean(disp) mean.cyl
  (dbl) (dbl)      (dbl)    (dbl)
1     4     1      91.38 105.1364
2     4     2     116.60 105.1364
3     6     1     241.50 183.3143
4     6     4     163.80 183.3143
5     6     6     145.00 183.3143
6     8     2     345.50 353.1000
7     8     3     275.80 353.1000
8     8     4     405.50 353.1000
9     8     8     301.00 353.1000
查看更多
Deceive 欺骗
5楼-- · 2019-01-25 04:07

data.table It's very clunky, but this is one way:

library(data.table)
DT <- data.table(mtcars)
rbind(
  DT[,.(mean(disp)),          by=.(cyl,carb)],
  DT[,.(mean(disp), carb=NA), by=.(cyl) ],
  DT[,.(mean(disp), cyl=NA),  by=.(carb)]
)[order(cyl,carb)]

This gives

    cyl carb       V1
 1:   4    1  91.3800
 2:   4    2 116.6000
 3:   4   NA 105.1364
 4:   6    1 241.5000
 5:   6    4 163.8000
 6:   6    6 145.0000
 7:   6   NA 183.3143
 8:   8    2 345.5000
 9:   8    3 275.8000
10:   8    4 405.5000
11:   8    8 301.0000
12:   8   NA 353.1000
13:  NA    1 134.2714
14:  NA    2 208.1600
15:  NA    3 275.8000
16:  NA    4 308.8200
17:  NA    6 145.0000
18:  NA    8 301.0000

I'd rather see results in something like an R table, but don't know of any functions for that.


dplyr @akrun found this analogous code

bind_rows(
  mtcars %>% 
    group_by(cyl, carb) %>% 
    summarise(Mean= mean(disp)), 
  mtcars %>% 
    group_by(cyl) %>% 
    summarise(carb=NA, Mean=mean(disp)), 
  mtcars %>% 
    group_by(carb) %>% 
    summarise(cyl=NA, Mean=mean(disp))
) %>% arrange(cyl, carb)

We could wrap the repeat operations in a function

library(lazyeval)
f1 <- function(df, grp, Var, func){
  FUN <- match.fun(func)
   df %>% 
     group_by_(.dots=grp) %>%
     summarise_(interp(~FUN(v), v=as.name(Var)))
  }

 m1 <- f1(mtcars, c('carb', 'cyl'), 'disp', 'mean')
 m2 <- f1(mtcars, 'carb', 'disp', 'mean')
 m3 <- f1(mtcars, 'cyl', 'disp', 'mean')

 bind_rows(list(m1, m2, m3)) %>%
              arrange(cyl, carb) %>%
              rename(Mean=`FUN(disp)`)
   carb cyl     Mean
1     1   4  91.3800
2     2   4 116.6000
3    NA   4 105.1364
4     1   6 241.5000
5     4   6 163.8000
6     6   6 145.0000
7    NA   6 183.3143
8     2   8 345.5000
9     3   8 275.8000
10    4   8 405.5000
11    8   8 301.0000
12   NA   8 353.1000
13    1  NA 134.2714
14    2  NA 208.1600
15    3  NA 275.8000
16    4  NA 308.8200
17    6  NA 145.0000
18    8  NA 301.0000

Either option can be made a little less ugly with data.table's rbindlist with fill:

rbindlist(list(
  mtcars %>% group_by(cyl) %>% summarise(mean(disp)),
  mtcars %>% group_by(carb) %>% summarise(mean(disp)),
  mtcars %>% group_by(cyl,carb) %>% summarise(mean(disp))
),fill=TRUE) %>% arrange(cyl,carb)

rbindlist(list(
  DT[,mean(disp),by=.(cyl,carb)],
  DT[,mean(disp),by=.(cyl)],
  DT[,mean(disp),by=.(carb)]
),fill=TRUE)[order(cyl,carb)]
查看更多
兄弟一词,经得起流年.
6楼-- · 2019-01-25 04:17

You can use this wrapper around ddply, which applies ddply for each possible margin and rbinds the results with its usual output.

To marginalize over all grouping factors:

mtcars %>% ddplym(.variables = .(cyl, carb), .fun = summarise, mean(disp))

To marginalize over carb only:

mtcars %>% ddplym(
  .variables = .(carb),
  .fun = function(data) data %>% group_by(cyl) %>% summarise(mean(disp)))

Wrapper:

require(plyr)
require(dplyr)

ddplym <- function(.data, .variables, .fun, ..., .margin = TRUE, .margin_name = '(all)') {
  if (.margin) {
    df <- .ddplym(.data, .variables, .fun, ..., .margin_name = .margin_name)
  } else {
    df <- ddply(.data, .variables, .fun, ...)
    if (.variables %>% length == 0) {
      df$.id <- NULL
    }
  }

  return(df)
}

.ddplym <- function(.data,
                    .variables,
                    .fun,
                    ...,
                    .margin_name = '(all)'
) {

  .variables <- as.quoted(.variables)

  n <- length(.variables)

  var_combn_idx <- lapply(0:n, function(x) {
    combn(1:n, n - x) %>% alply(2, c)
  }) %>%
    unlist(recursive = FALSE, use.names = FALSE)

  data_list <- lapply(var_combn_idx, function(x) {
    data <- ddply(.data, .variables[x], .fun, ...)

    # drop '.id' column created when no variables to split by specified
    if (!length(.variables[x]))
      data <- data[, -1, drop = FALSE]

    return(data)
  })

  # workaround for NULL .variables
  if (unlist(.variables) %>% is.null && names(.variables) %>% is.null) {
    data_list <- data_list[1]
  } else if (unlist(.variables) %>% is.null) {
    data_list <- data_list[2]
  }

  if (length(data_list) > 1) {
    data_list <- lapply(data_list, function(data)
      rbind_pre(
        data = data,
        colnames = colnames(data_list[[1]]),
        fill = .margin_name
      )) 
  }

  Reduce(rbind, data_list)
}

rbind_pre <- function(data, colnames, fill = NA) {
  colnames_fill <- setdiff(colnames, colnames(data))
  data_fill <- matrix(fill,
                      nrow = nrow(data),
                      ncol = length(colnames_fill)) %>%
    as.data.frame %>% setNames(colnames_fill)
  cbind(data, data_fill)[, colnames]
}
查看更多
啃猪蹄的小仙女
7楼-- · 2019-01-25 04:19

I know that this may not be a very elegant solution, but I hope it helps anyway:

p <-mtcars %>% group_by(cyl,carb) 
p$cyl <- as.factor(p$cyl)
average_disp <- sapply(1:length(levels(p$cyl)), function(x)mean(subset(p,p$cyl==levels(p$cyl)[x])$disp))
df <- data.frame(levels(p$cyl),average_disp)
colnames(df)[1]<-"cyl"

#> df
#  cyl average_disp
#1   4     105.1364
#2   6     183.3143
#3   8     353.1000

(Edit: After a minor modification in the definition of p this now yields the same results as @Frank's and @akrun's solution)

查看更多
登录 后发表回答