dplyr with name of columns in a function

2020-04-20 23:56发布

问题:

I am not able to figure out how to use column names as arguments in a function that uses the dplyr R package. A reproducible example is below:

Data

# Reproducible toy data: 10 standard-normal responses laid out over
# 2 environments (E1, E2) x 5 genotypes (G1..G5), one observation per cell.
set.seed(12345)
Y <- rnorm(n = 10)
Env <- rep(paste0("E", 1:2), each = 5)
Gen <- rep(paste0("G", 1:5), times = 2)
df1 <- data.frame(Y = Y, Env = Env, Gen = Gen)

Works outside function

library(dplyr)

# Mean of Y for every Env x Gen combination, one row per group.
# The grouping columns of df1 are named Env and Gen; there are no columns
# called E or G, so the real column names must be used here.
df1 %>%
  dplyr::group_by(Env, Gen) %>%
  dplyr::summarize(mean(Y))

# The same group means with base R, returned as an Env-by-Gen matrix.
with(data = df1, expr = tapply(X = Y, INDEX = list(Env, Gen), FUN = mean))

First function

# First attempt (base R): compute the mean of column Y for each combination of
# the grouping columns E and G, all supplied as bare names.
# As written this fails; the error reported for the call is shown below it.
fn1 <- function(Y, E, G, data){
  # deparse(substitute(x)) turns each bare argument into a character string,
  # e.g. "Y", "Env" and "Gen" for the call below.
  Y <- deparse(substitute(Y))
  E <- deparse(substitute(E))
  G <- deparse(substitute(G))
  # with() looks names up in `data` first. `Y` is a column of df1, so it
  # resolves to the 10-element column; `E` and `G` are not columns, so they
  # resolve to the length-1 strings created above. tapply() therefore gets
  # X and INDEX of different lengths.
  # NOTE(review): the extra parent.frame() argument appears to be absorbed by
  # with()'s `...` and to have no effect - confirm.
  Out <- with(data = data, tapply(X = Y, INDEX = list(E, G), FUN = mean), parent.frame())
  return(Out)
}  

# Example call that triggers the error.
fn1(Y = Y, E = Env, G = Gen, data = df1)

Error in tapply(X = Y, INDEX = list(E, G), FUN = mean) : arguments must have same length

Second function

# Second attempt (dplyr): same goal as fn1, using group_by() + summarize().
# As written this fails; the error reported for the call is shown below it.
fn2 <- function(Y, E, G, data){
  # deparse(substitute(x)) turns each bare argument into a character string,
  # e.g. "Y", "Env" and "Gen" for the call below.
  Y <- deparse(substitute(Y))
  E <- deparse(substitute(E))
  G <- deparse(substitute(G))
  # Attaches dplyr every time the function is called.
  library(dplyr)
  # Note: the pipeline starts from the global `df1`, not the `data` argument.
  # group_by(E, G) names `E` and `G` literally; df1 has no such columns (its
  # columns are Y, Env, Gen), and the local E/G are strings, not columns.
  # NOTE(review): mean(Y) presumably picks up the df1 column Y rather than the
  # local string "Y" - confirm.
  Out <- df1 %>%
    dplyr::group_by(E, G) %>%
    dplyr::summarize(mean(Y))
  return(Out)
}  

# Example call that triggers the error.
fn2(Y = Y, E = Env, G = Gen, data = df1)

Error in grouped_df_impl(data, unname(vars), drop) : Column E is unknown

回答1:

One option is to use enquo to capture the expression and its environment in a quosure object, which can then be evaluated inside group_by, summarise, mutate, etc. by unquoting it with the !! operator (or UQ).

# Grouped mean using tidy evaluation.
#   Y     bare name of the numeric column to average
#   E, G  bare names of the two grouping columns
#   data  data frame containing those columns
# Returns one row per E x G combination with the mean stored in column `Y`.
fn2 <- function(Y, E, G, data) {
  # Capture each bare column name, together with the caller's environment,
  # as a quosure so it can be unquoted inside the dplyr verbs.
  value_quo <- enquo(Y)
  first_group_quo <- enquo(E)
  second_group_quo <- enquo(G)

  grouped <- dplyr::group_by(data, !!first_group_quo, !!second_group_quo)
  dplyr::summarize(grouped, Y = mean(!!value_quo))
}

fn2(Y, E = Env, G = Gen, df1)
# A tibble: 10 x 3
# Groups: Env [?]
#   Env    Gen         Y
#   <fctr> <fctr>  <dbl>
# 1 E1     G1      0.586
# 2 E1     G2      0.709
# 3 E1     G3     -0.109
# 4 E1     G4     -0.453
# 5 E1     G5      0.606
# 6 E2     G1     -1.82 
# 7 E2     G2      0.630
# 8 E2     G3     -0.276
# 9 E2     G4     -0.284
#10 E2     G5     -0.919

In the OP's function, the expression is captured by substitute, but deparse then converts it to a string. Using sym from rlang, the string can be converted back to a symbol and then evaluated with !! or UQ as above.

# Grouped mean, keeping the OP's deparse(substitute()) capture.
#   Y     bare name of the numeric column to average
#   E, G  bare names of the two grouping columns
#   data  data frame containing those columns
# Returns one row per E x G combination with the mean stored in column `Y`.
fn2 <- function(Y, E, G, data) {
  # Each bare argument becomes a character string such as "Y", "Env", "Gen".
  Y <- deparse(substitute(Y))
  E <- deparse(substitute(E))
  G <- deparse(substitute(G))

  # Operate on the `data` argument (not a global data frame), so the function
  # works for whatever data frame the caller passes in.
  # rlang::sym() turns each string back into a symbol, and !! unquotes it so
  # dplyr sees a column reference instead of a constant string.
  data %>%
    dplyr::group_by(!!rlang::sym(E), !!rlang::sym(G)) %>%
    dplyr::summarize(Y = mean(!!rlang::sym(Y)))
}

fn2(Y = Y, E = Env, G = Gen, data = df1)

Another variant of the OP's function, without using rlang, is to use group_by_at and summarise_at, which can take column names as strings.

# Grouped mean using the scoped verbs, which accept column names as strings.
#   Y     bare name of the numeric column to average
#   E, G  bare names of the two grouping columns
#   data  data frame containing those columns
# Returns one row per E x G combination with the mean stored under the
# original name of the averaged column.
fn3 <- function(Y, E, G, data) {
  # Each bare argument becomes a character string such as "Y", "Env", "Gen".
  Y <- deparse(substitute(Y))
  E <- deparse(substitute(E))
  G <- deparse(substitute(G))

  # Operate on the `data` argument (not a global data frame), and hand the
  # column names to the *_at verbs as plain character vectors. This avoids
  # relying on vars() falling back to variables outside the data when no
  # column of that name exists.
  data %>%
    dplyr::group_by_at(c(E, G)) %>%
    dplyr::summarize_at(Y, mean)
}

fn3(Y = Y, E = Env, G = Gen, data = df1)


标签: r function dplyr