R summarise_at dynamically by condition : mean for

2020-08-01 05:21发布

I would like that but with the conditions in the summarise_at()

edit:

edit 1

I've added the word dynamically in the title: When I use vars(c()) in the summarise_at() it's for fast and clear examples, but in fact it's for use contains(), starts_with() and matches(,, perl=TRUE), because I have 50 columns, with many sum() and some mean().

And the goal is for generate dynamic SQL with tbl()..%>% group_by() ... %>% summarise_at()...%>% collect().

edit 2

I added example with SQL generated in my second example

end edit



library(tidyverse)
(mtcars 
  %>% group_by(carb)
  %>% summarise_at(vars(c("mpg","cyl","disp")), list (~mean(.),~sum(.)))
  # I don't want this line below, I would like a conditionnal in summarise_at() because I have 50 columns in my real case
  %>% select(carb,cyl_mean,disp_mean,mpg_sum)
)
#> # A tibble: 6 x 4
#>    carb cyl_mean disp_mean mpg_sum
#>   <dbl>    <dbl>     <dbl>   <dbl>
#> 1     1     4.57      134.   177. 
#> 2     2     5.6       208.   224  
#> 3     3     8         276.    48.9
#> 4     4     7.2       309.   158. 
#> 5     6     6         145     19.7
#> 6     8     8         301     15

Created on 2020-02-19 by the reprex package (v0.3.0)

This works, but I want only sum for mpg, and only mean for cyl and disp

library(RSQLite)
library(dbplyr)
library(tidyverse)
library(DBI)

db <- dbConnect(SQLite(),":memory:")

dbCreateTable(db, "mtcars_table", mtcars)

(tbl( db, build_sql( con=db,"select * from mtcars_table" ))
  %>% group_by(carb)
  %>% summarise_at(vars(c("mpg","cyl","disp")), list (~mean(.),~sum(.)))
  %>% select(carb,cyl_mean,disp_mean,mpg_sum)                   
  %>% show_query()
)
#> <SQL>
#> Warning: Missing values are always removed in SQL.[...]  to silence this warning
#> SELECT `carb`, `cyl_mean`, `disp_mean`, `mpg_sum`
#> FROM (SELECT `carb`, AVG(`mpg`) AS `mpg_mean`, AVG(`cyl`) AS `cyl_mean`, AVG(`disp`) AS `disp_mean`, SUM(`mpg`) AS `mpg_sum`, SUM(`cyl`) AS `cyl_sum`, SUM(`disp`) AS `disp_sum`
#> FROM (select * from mtcars_table)
#> GROUP BY `carb`)
#> # Source:   lazy query [?? x 4]
#> # Database: sqlite 3.30.1 [:memory:]
#> # … with 4 variables: carb <dbl>, cyl_mean <lgl>, disp_mean <lgl>,
#> #   mpg_sum <lgl>

I tried all possibilities like that but it doesn't work or it produces error.

error

(mtcars %>% group_by(carb)%>% summarise_at(vars(c("mpg","cyl","disp")),ifelse(vars(contains(names(.),"mpg")),list(sum(.)),list(mean(.)))) )

not good, too many columns

library(tidyverse)
(mtcars %>% group_by(carb)%>% summarise_at(vars(c("mpg","cyl","disp")),ifelse ((names(.)=="mpg"), list(~sum(.)) , list(~mean(.)))))
#> # A tibble: 6 x 34
#>    carb mpg_sum cyl_sum disp_sum mpg_mean..2 cyl_mean..2 disp_mean..2
#>   <dbl>   <dbl>   <dbl>    <dbl>       <dbl>       <dbl>        <dbl>
#> 1     1   177.       32     940.        25.3        4.57         134.
#> 2     2   224        56    2082.        22.4        5.6          208.
#> 3     3    48.9      24     827.        16.3        8            276.
#> 4     4   158.       72    3088.        15.8        7.2          309.
#> 5     6    19.7       6     145         19.7        6            145 
#> 6     8    15         8     301         15          8            301 
#> # … with 27 more variables: mpg_mean..3 <dbl>, cyl_mean..3 <dbl>,
#> #   disp_mean..3 <dbl>, mpg_mean..4 <dbl>, cyl_mean..4 <dbl>,
#> #   disp_mean..4 <dbl>, mpg_mean..5 <dbl>, cyl_mean..5 <dbl>,
#> #   disp_mean..5 <dbl>, mpg_mean..6 <dbl>, cyl_mean..6 <dbl>,
#> #   disp_mean..6 <dbl>, mpg_mean..7 <dbl>, cyl_mean..7 <dbl>,
#> #   disp_mean..7 <dbl>, mpg_mean..8 <dbl>, cyl_mean..8 <dbl>,
#> #   disp_mean..8 <dbl>, mpg_mean..9 <dbl>, cyl_mean..9 <dbl>,
#> #   disp_mean..9 <dbl>, mpg_mean..10 <dbl>, cyl_mean..10 <dbl>,
#> #   disp_mean..10 <dbl>, mpg_mean..11 <dbl>, cyl_mean..11 <dbl>,
#> #   disp_mean..11 <dbl>

Some other tries and remarks

I would like conditional sum(.) or mean(.) depending of the name of the column in the summarise().

It could be good if it accepts not only primitive functions.

At then end it's for tbl()..%>% group_by() ... %>% summarise_at()...%>% collect() to generate conditional SQL with AVG() and SUM().

Mssql SQL function like ~(convert(varchar()) works for mutate_at() and similar ~AVG()works for summarise_at() but I arrive at the same point: conditional summarise_at() doesn't work depending of name of columns.

:)

2条回答
ら.Afraid
2楼-- · 2020-08-01 06:01

workaround waiting across() with regex

library(RSQLite)
library(dbplyr)
library(tidyverse)
library(DBI)

db <- dbConnect(SQLite())

mtcars_table <- mtcars %>% rename(mpg_sum=mpg,cyl_mean=cyl,disp_mean=disp )

RSQLite::dbWriteTable(db, "mtcars_table", mtcars_table)

req<-as.character((tbl( db, build_sql( con=db,"select * from mtcars_table" ))
                   %>% group_by(carb)
                   %>% summarise_at(vars(c(ends_with("mean"), ends_with("sum")) ), ~sum(.))

) %>% sql_render())
#> Warning: Missing values are always removed in SQL.
#> Use `SUM(x, na.rm = TRUE)` to silence this warning
#> This warning is displayed only once per session.

req<-gsub("(SUM)(\\(.{1,30}mean.{1,10}\\))", "AVG\\2", req, perl=TRUE)
print(req)
#> [1] "SELECT `carb`, AVG(`cyl_mean`) AS `cyl_mean`, AVG(`disp_mean`) AS `disp_mean`, 
# SUM(`mpg_sum`) AS `mpg_sum`\nFROM (select * from mtcars_table)\n
# GROUP BY `carb`"

dbGetQuery(db, req)
#>   carb cyl_mean disp_mean mpg_sum
#> 1    1 4.571429  134.2714   177.4
#> 2    2 5.600000  208.1600   224.0
#> 3    3 8.000000  275.8000    48.9
#> 4    4 7.200000  308.8200   157.9
#> 5    6 6.000000  145.0000    19.7
#> 6    8 8.000000  301.0000    15.0

sessionInfo()

R version 3.6.1 (2019-07-05)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 16.04.6 LTS

Matrix products: default
BLAS:   /usr/lib/libblas/libblas.so.3.6.0
LAPACK: /usr/lib/lapack/liblapack.so.3.6.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8       
 [4] LC_COLLATE=en_US.UTF-8     LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                  LC_ADDRESS=C              
[10] LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] DBI_1.1.0       forcats_0.4.0   stringr_1.4.0   dplyr_0.8.4     purrr_0.3.3    
 [6] readr_1.3.1     tidyr_1.0.2     tibble_2.1.3    ggplot2_3.2.1   tidyverse_1.3.0
[11] dbplyr_1.4.2    RSQLite_2.2.0  

loaded via a namespace (and not attached):
 [1] xfun_0.10        tidyselect_1.0.0 haven_2.2.0      lattice_0.20-38  colorspace_1.4-1
 [6] vctrs_0.2.2      generics_0.0.2   htmltools_0.4.0  blob_1.2.1       rlang_0.4.4     
[11] pillar_1.4.3     glue_1.3.1       withr_2.1.2      bit64_0.9-7      modelr_0.1.5    
[16] readxl_1.3.1     lifecycle_0.1.0  munsell_0.5.0    gtable_0.3.0     cellranger_1.1.0
[21] rvest_0.3.5      memoise_1.1.0    evaluate_0.14    knitr_1.25       callr_3.3.2     
[26] ps_1.3.0         fansi_0.4.1      broom_0.5.2      Rcpp_1.0.3       clipr_0.7.0     
[31] scales_1.1.0     backports_1.1.5  jsonlite_1.6.1   fs_1.3.1         bit_1.1-15.1    
[36] hms_0.5.3        digest_0.6.23    stringi_1.4.5    processx_3.4.1   grid_3.6.1      
[41] cli_2.0.1        tools_3.6.1      magrittr_1.5     lazyeval_0.2.2   whisker_0.4     
[46] crayon_1.3.4     pkgconfig_2.0.3  xml2_1.2.2       reprex_0.3.0     lubridate_1.7.4 
[51] assertthat_0.2.1 rmarkdown_1.16   httr_1.4.1       rstudioapi_0.10  R6_2.4.1        
[56] nlme_3.1-141     compiler_3.6.1  
查看更多
Melony?
3楼-- · 2020-08-01 06:15

An option is to group_by the 'carb', and then create the sum of 'mpg' as another grouping variable and then use summarise_at with the rest of the variables needed

library(dplyr)
mtcars %>%
    group_by(carb) %>%
    group_by(mpg_sum = sum(mpg), .add = TRUE) %>%
    summarise_at(vars(cyl, disp), list(mean = mean))
# A tibble: 6 x 4
# Groups:   carb [6]
#   carb mpg_sum cyl_mean disp_mean
#  <dbl>   <dbl>    <dbl>     <dbl>
#1     1   177.      4.57      134.
#2     2   224       5.6       208.
#3     3    48.9     8         276.
#4     4   158.      7.2       309.
#5     6    19.7     6         145 
#6     8    15       8         301 

Or using the devel version of dplyr, this can be done in a single summarise by wrapping the blocks of columns in across and the single column by themselves and apply different functions on it

mtcars %>%
  group_by(carb) %>% 
  summarise(across(one_of(c("cyl", "disp")), list(mean = mean)), 
            mpg_sum = sum(mpg))
# A tibble: 6 x 4
#   carb cyl_mean disp_mean mpg_sum
#  <dbl>    <dbl>     <dbl>   <dbl>
#1     1     4.57      134.   177. 
#2     2     5.6       208.   224  
#3     3     8         276.    48.9
#4     4     7.2       309.   158. 
#5     6     6         145     19.7
#6     8     8         301     15  

NOTE: summarise_at/summarise_if/mutate_at/mutate_if/... etc. will be superseded by the across verb with the default functions (summarise/mutate/filter/...) in the upcoming releases

查看更多
登录 后发表回答