Direct way of telling ifelse to ignore NA

2020-07-17 16:14发布

问题:

As explained here when the test condition in ifelse(test, yes, no) is NA, the evaluation is also NA. Hence the following returns...

df <- data.frame(a = c(1, 1, NA, NA, NA ,NA),
                 b = c(NA, NA, 1, 1, NA, NA),
                 c = c(rep(NA, 4), 1, 1))
ifelse(df$a==1, "a==1", 
    ifelse(df$b==1, "b==1", 
        ifelse(df$c==1, "c==1", NA)))
#[1] "a==1" "a==1" NA     NA     NA     NA    

... instead of the desired

#[1] "a==1" "a==1" "b==1" "b==1"  "c==1" "c==1" 

As suggested by Cath, I can circumvent this problem by formally specifying that the test condition should not include NA:

ifelse(df$a==1 &  !is.na(df$a), "a==1", 
    ifelse(df$b==1 & !is.na(df$b), "b==1", 
        ifelse(df$c==1 & !is.na(df$c), "c==1", NA)))

However, as akrun also noted, this solution becomes rather lengthy with increasing number of columns.


A workaround would be to first replace all NAs with a value not present in the data.frame (e.g, 2 in this case):

df_noNA <- data.frame(a = c(1, 1, 2, 2, 2 ,2),
                 b = c(2, 2, 1, 1, 2, 2),
                 c = c(rep(2, 4), 1, 1))

ifelse(df_noNA$a==1, "a==1", 
    ifelse(df_noNA$b==1, "b==1", 
        ifelse(df_noNA$c==1, "c==1", NA)))
#[1] "a==1" "a==1" "b==1" "b==1"  "c==1" "c==1" 

However, I was wondering if there was a more direct way to tell ifelse to ignore NAs? Or is writing a function for & !is.na the most direct way?

ignorena <- function(column) {
        column ==1 & !is.na(column)
}
ifelse(ignorena(df$a), "a==1", 
    ifelse(ignorena(df$b), "b==1", 
        ifelse(ignorena(df$c), "c==1", NA)))
#[1] "a==1" "a==1" "b==1" "b==1"  "c==1" "c==1" 

回答1:

You can use %in% instead of == to sort-of ignore NAs.

ifelse(df$a %in% 1, "a==1", 
       ifelse(df$b %in% 1, "b==1", 
              ifelse(df$c %in% 1, "c==1", NA)))

Unfortunately, this does not give any performance gain compared to the original while @arkun's solution is about 3 times faster.

solution_original <- function(){
  ifelse(df$a==1 &  !is.na(df$a), "a==1", 
         ifelse(df$b==1 & !is.na(df$b), "b==1", 
                ifelse(df$c==1 & !is.na(df$c), "c==1", NA)))
}

solution_akrun <- function(){
  v1 <- names(df)[max.col(!is.na(df)) * NA^!rowSums(!is.na(df))]
  i1 <- !is.na(v1)
  v1[i1] <- paste0(v1[i1], "==1")
}

solution_mine <- function(x){
  ifelse(df$a %in% 1, "a==1", 
         ifelse(df$b %in% 1, "b==1", 
                ifelse(df$c %in% 1, "c==1", NA)))
}
set.seed(1)
df <- data.frame(a = sample(c(1, rep(NA, 4)), 1e6, T),
                 b = sample(c(1, rep(NA, 4)), 1e6, T),
                 c = sample(c(1, rep(NA, 4)), 1e6, T))
microbenchmark::microbenchmark(
  solution_original(),
  solution_akrun(),
  solution_mine()
)
## Unit: milliseconds
##                expr      min       lq     mean   median       uq       max neval
## solution_original() 701.9413 839.3715 845.0720 853.1960 875.6151 1051.6659   100
##    solution_akrun() 217.4129 242.5113 293.2987 253.2144 387.1598  564.3981   100
##     solution_mine() 698.7628 845.0822 848.6717 858.7892 877.9676 1006.2872   100

Was inspired by this: R: Dealing with TRUE, FALSE, NA and NaN

Edit

Following the comment by @arkun, I redid the benchmark and revised the statement.



回答2:

dplyr::case_when is a convenient alternative to cascading ifelse calls:

library(dplyr)

df <- data.frame(a = c(1, 1, NA, NA, NA ,NA),
                 b = c(NA, NA, 1, 1, NA, NA),
                 c = c(rep(NA, 4), 1, 1))

df %>% mutate(equals = case_when(a == 1 ~ 'a==1', 
                                 b == 1 ~ 'b==1', 
                                 c == 1 ~ 'c==1'))
#>    a  b  c equals
#> 1  1 NA NA   a==1
#> 2  1 NA NA   a==1
#> 3 NA  1 NA   b==1
#> 4 NA  1 NA   b==1
#> 5 NA NA  1   c==1
#> 6 NA NA  1   c==1

It cascades like ifelse, so if the first condition is true, the first result is returned even if the second and third conditions are true too. If none are true, it returns NA:

set.seed(47)
df <- setNames(as.data.frame(matrix(sample(c(1, NA), 30, replace = TRUE), 10)), letters[1:3])

df %>% mutate(equals = case_when(a == 1 ~ 'a==1', 
                                 b == 1 ~ 'b==1', 
                                 c == 1 ~ 'c==1'))
#>     a  b  c equals
#> 1  NA  1  1   b==1
#> 2   1 NA NA   a==1
#> 3  NA  1 NA   b==1
#> 4  NA NA  1   c==1
#> 5  NA NA NA   <NA>
#> 6  NA NA  1   c==1
#> 7   1  1  1   a==1
#> 8   1  1  1   a==1
#> 9  NA  1 NA   b==1
#> 10 NA  1 NA   b==1

Plus it's fast:

set.seed(47)
df <- setNames(as.data.frame(matrix(sample(c(1, NA), 3 * 1e5, replace = TRUE), ncol = 3)), letters[1:3])

microbenchmark::microbenchmark(
    original = {
        ifelse(df$a == 1 &  !is.na(df$a), "a==1", 
               ifelse(df$b == 1 & !is.na(df$b), "b==1", 
                      ifelse(df$c == 1 & !is.na(df$c), "c==1", NA)))},
    akrun = {
        v1 <- names(df)[max.col(!is.na(df)) * NA^!rowSums(!is.na(df))]
        i1 <- !is.na(v1)
        v1[i1] <- paste0(v1[i1], "==1")
    },
    amatsuo_net = {
        ifelse(df$a %in% 1, "a==1", 
               ifelse(df$b %in% 1, "b==1", 
                      ifelse(df$c %in% 1, "c==1", NA)))
    },
    alistaire = {
        df %>% mutate(equals = case_when(a == 1 ~ 'a==1', 
                                         b == 1 ~ 'b==1', 
                                         c == 1 ~ 'c==1'))
    }
)
#> Unit: milliseconds
#>         expr      min       lq      mean    median        uq       max neval
#>     original 81.19896 86.11843 110.93882 123.92463 128.58037 171.11026   100
#>        akrun 27.50351 30.99127  38.98353  32.67991  34.64947  77.98958   100
#>  amatsuo_net 83.75744 88.54095 109.22226 110.40066 129.02168 170.92911   100
#>    alistaire 16.57426 18.91951  21.73293  19.29925  24.30350  33.83180   100


回答3:

We can do this more efficiently without nested ifelse loops. For the first dataset, we create a logical matrix (!is.na(df)) for non-NA elements, get the column index of the maximum value i.e. TRUE for each row, use that index to get the column names and paste with ==1

paste0(names(df)[max.col(!is.na(df))], "==1")
#[1] "a==1" "a==1" "b==1" "b==1" "c==1" "c==1"

If there are rows with only NAs

v1 <- names(df)[max.col(!is.na(df)) * NA^!rowSums(!is.na(df))]
i1 <- !is.na(v1)
v1[i1] <- paste0(v1[i1], "==1")

and for the second dataset as there are no NAs, we can directly compare with 1 to get a logical matrix, do the same step as before

paste0(names(df_noNA)[max.col(df_noNA == 1)], "==1")
#[1] "a==1" "a==1" "b==1" "b==1" "c==1" "c==1"