告诉ifelse忽略NA的直接方式

Flo*_*Flo 3 if-statement r na

如此处所述,当 ifelse(test, yes, no) 的测试条件 test 为 NA 时,求值结果也是 NA。因此下面的代码返回......

# Toy data: each row has exactly one non-missing entry, in column a, b or c.
df <- data.frame(
  a = c(1, 1, NA, NA, NA, NA),
  b = c(NA, NA, 1, 1, NA, NA),
  c = c(rep(NA, 4), 1, 1)
)
# Naive cascade: wherever a test evaluates to NA, ifelse() yields NA for that
# element, so the nested fallbacks never show through.
with(
  df,
  ifelse(a == 1, "a==1", ifelse(b == 1, "b==1", ifelse(c == 1, "c==1", NA)))
)
#[1] "a==1" "a==1" NA     NA     NA     NA    
Run Code Online (Sandbox Code Playgroud)

......而不是期望的

#[1] "a==1" "a==1" "b==1" "b==1"  "c==1" "c==1" 
Run Code Online (Sandbox Code Playgroud)

正如Cath所建议的,我可以通过正式指定测试条件不应包含NA来规避这个问题:

# Guard every comparison with !is.na() so that a missing value makes the test
# FALSE (fall through to the next column) instead of NA.
ifelse(
  !is.na(df[["a"]]) & df[["a"]] == 1, "a==1",
  ifelse(
    !is.na(df[["b"]]) & df[["b"]] == 1, "b==1",
    ifelse(!is.na(df[["c"]]) & df[["c"]] == 1, "c==1", NA)
  )
)
Run Code Online (Sandbox Code Playgroud)

然而,正如akrun所指出的,随着列数的增加,这种解决方案变得相当冗长.


一种变通办法是先用 data.frame 中不存在的值(例如本例中的 2)替换所有的 NA:

# Workaround data: the placeholder 2 stands in for every missing entry, so no
# comparison against 1 can evaluate to NA.
df_noNA <- data.frame(
  a = rep(c(1, 2), times = c(2, 4)),
  b = rep(c(2, 1, 2), each = 2),
  c = rep(c(2, 1), times = c(4, 2))
)

# With no NA tests left, the plain cascade falls through as intended.
with(
  df_noNA,
  ifelse(a == 1, "a==1", ifelse(b == 1, "b==1", ifelse(c == 1, "c==1", NA)))
)
#[1] "a==1" "a==1" "b==1" "b==1"  "c==1" "c==1" 
Run Code Online (Sandbox Code Playgroud)

但是,我想知道是否有更直接的方法让 ifelse 忽略 NA?还是说,把 & !is.na 写成一个函数就是最直接的方式?

#' Test a vector for equality with a value, treating NA as "no match"
#'
#' @param column Vector to test (for example a data-frame column).
#' @param value Single value to compare against. Defaults to 1, the value
#'   the helper originally hard-coded, so existing calls behave the same.
#' @return Logical vector with one element per element of `column`: TRUE
#'   where the element equals `value`, FALSE otherwise -- including where
#'   the element is NA, so the result is safe to use as an ifelse() test.
ignorena <- function(column, value = 1) {
  # FALSE & NA is FALSE, so the is.na() guard turns NA comparisons into FALSE.
  !is.na(column) & column == value
}
# Cascade over the columns of `df`, using the NA-safe helper as each test.
with(
  df,
  ifelse(
    ignorena(a), "a==1",
    ifelse(ignorena(b), "b==1", ifelse(ignorena(c), "c==1", NA))
  )
)
#[1] "a==1" "a==1" "b==1" "b==1"  "c==1" "c==1" 
Run Code Online (Sandbox Code Playgroud)

ama*_*net 9

您可以使用 %in% 代替 ==,这样在一定程度上可以忽略 NA。

# %in% never yields NA (a missing value simply does not match 1), so every
# test is TRUE or FALSE and the cascade falls through to the next column.
ifelse(
  df[["a"]] %in% 1, "a==1",
  ifelse(
    df[["b"]] %in% 1, "b==1",
    ifelse(df[["c"]] %in% 1, "c==1", NA)
  )
)
Run Code Online (Sandbox Code Playgroud)

不幸的是,与原版相比,这并没有带来任何性能提升,而 @akrun 的解决方案速度提高了约 3 倍。

# Baseline candidate: nested ifelse() over the global `df`, with every
# equality test paired with an explicit !is.na() guard.
solution_original <- function() {
  hit_a <- !is.na(df$a) & df$a == 1
  hit_b <- !is.na(df$b) & df$b == 1
  hit_c <- !is.na(df$c) & df$c == 1
  ifelse(hit_a, "a==1", ifelse(hit_b, "b==1", ifelse(hit_c, "c==1", NA)))
}

# Candidate by akrun: label each row of the global `df` with the name of its
# first non-NA column followed by "==1", or NA when the whole row is missing.
# Only missingness is inspected, not the value itself, so this matches the
# ifelse() candidates only when every non-missing entry is 1.
#
# Returns a character vector with one element per row of `df`.
solution_akrun <- function() {
  present <- !is.na(df)
  # max.col() breaks ties at random by default; "first" reproduces the
  # left-to-right precedence of the nested ifelse() candidates.
  first_col <- max.col(present, ties.method = "first")
  # A row with nothing present has no winner: blank its index so that the
  # name lookup below yields NA.
  first_col[rowSums(present) == 0] <- NA
  labels <- names(df)[first_col]
  found <- !is.na(labels)
  labels[found] <- paste0(labels[found], "==1")
  # Return the whole vector explicitly; ending on the assignment above would
  # invisibly return only its right-hand side (the non-NA labels).
  labels
}

# %in%-based candidate: the same cascade over the global `df`, relying on
# %in% returning FALSE (not NA) for missing values. The argument `x` is
# accepted but never used.
solution_mine <- function(x) {
  is_a <- df$a %in% 1
  is_b <- df$b %in% 1
  is_c <- df$c %in% 1
  ifelse(is_a, "a==1", ifelse(is_b, "b==1", ifelse(is_c, "c==1", NA)))
}
# Benchmark data: three columns of one million entries, each drawn with
# replacement from one 1 and four NAs. The seed makes the draw reproducible.
# `replace = TRUE` is spelled out: `T` is an ordinary variable that can be
# reassigned, and naming the argument documents what the flag controls.
set.seed(1)
df <- data.frame(
  a = sample(c(1, rep(NA, 4)), 1e6, replace = TRUE),
  b = sample(c(1, rep(NA, 4)), 1e6, replace = TRUE),
  c = sample(c(1, rep(NA, 4)), 1e6, replace = TRUE)
)
# Time the three candidate functions; each reads the global `df`.
# The report below labels every row with the text of the benchmarked call and
# shows 100 evaluations per expression (neval column).
microbenchmark::microbenchmark(
  solution_original(),
  solution_akrun(),
  solution_mine()
)
## Unit: milliseconds
##                expr      min       lq     mean   median       uq       max neval
## solution_original() 701.9413 839.3715 845.0720 853.1960 875.6151 1051.6659   100
##    solution_akrun() 217.4129 242.5113 293.2987 253.2144 387.1598  564.3981   100
##     solution_mine() 698.7628 845.0822 848.6717 858.7892 877.9676 1006.2872   100
Run Code Online (Sandbox Code Playgroud)

受此启发:R:处理TRUE,FALSE,NA和NaN

编辑

在 @akrun 的评论之后,我重新运行了基准测试并修改了上述说法。


ali*_*ire 5

dplyr::case_when是级联ifelse调用的便捷替代方案:

library(dplyr)

# Toy data: each row has exactly one non-missing entry, in column a, b or c.
df <- data.frame(
  a = c(1, 1, NA, NA, NA, NA),
  b = c(NA, NA, 1, 1, NA, NA),
  c = c(rep(NA, 4), 1, 1)
)

# case_when() takes the first formula whose left-hand side is TRUE; as the
# printed result shows, an NA condition simply falls through to the next one.
mutate(
  df,
  equals = case_when(
    a == 1 ~ "a==1",
    b == 1 ~ "b==1",
    c == 1 ~ "c==1"
  )
)
#>    a  b  c equals
#> 1  1 NA NA   a==1
#> 2  1 NA NA   a==1
#> 3 NA  1 NA   b==1
#> 4 NA  1 NA   b==1
#> 5 NA NA  1   c==1
#> 6 NA NA  1   c==1
Run Code Online (Sandbox Code Playgroud)

它像 ifelse 一样级联,所以如果第一个条件为真,即使第二个和第三个条件也为真,也会返回第一个结果。如果都不为真,则返回 NA:

# Second example: rows may satisfy several conditions at once, or none.
# Draw 30 values from {1, NA} with a fixed seed, lay them out as a 10-row
# matrix (hence 3 columns) and name the columns a, b, c.
set.seed(47)
df <- setNames(as.data.frame(matrix(sample(c(1, NA), 30, replace = TRUE), 10)), letters[1:3])

# In the printed result, rows 7 and 8 (all three columns are 1) get the first
# label, and row 5 (all NA) gets NA because no condition matched.
df %>% mutate(equals = case_when(a == 1 ~ 'a==1', 
                                 b == 1 ~ 'b==1', 
                                 c == 1 ~ 'c==1'))
#>     a  b  c equals
#> 1  NA  1  1   b==1
#> 2   1 NA NA   a==1
#> 3  NA  1 NA   b==1
#> 4  NA NA  1   c==1
#> 5  NA NA NA   <NA>
#> 6  NA NA  1   c==1
#> 7   1  1  1   a==1
#> 8   1  1  1   a==1
#> 9  NA  1 NA   b==1
#> 10 NA  1 NA   b==1
Run Code Online (Sandbox Code Playgroud)

另外它很快:

# Benchmark data: 3 * 1e5 values drawn from {1, NA} with a fixed seed,
# arranged in three columns named a, b, c.
set.seed(47)
df <- setNames(as.data.frame(matrix(sample(c(1, NA), 3 * 1e5, replace = TRUE), ncol = 3)), letters[1:3])

# Time the four approaches on the same `df`; each expression is given an
# explicit name, which is what the report's `expr` column shows.
microbenchmark::microbenchmark(
    # Nested ifelse() with explicit !is.na() guards.
    original = {
        ifelse(df$a == 1 &  !is.na(df$a), "a==1", 
               ifelse(df$b == 1 & !is.na(df$b), "b==1", 
                      ifelse(df$c == 1 & !is.na(df$c), "c==1", NA)))},
    # Column-index approach: looks up the name of a non-NA column per row.
    # NOTE(review): max.col() defaults to ties.method = "random", so for rows
    # with several non-NA columns this does not necessarily pick the first
    # column as the other candidates do; it also tests missingness only, not
    # equality to 1.
    akrun = {
        v1 <- names(df)[max.col(!is.na(df)) * NA^!rowSums(!is.na(df))]
        i1 <- !is.na(v1)
        v1[i1] <- paste0(v1[i1], "==1")
    },
    # Nested ifelse() with %in% tests, which are never NA.
    amatsuo_net = {
        ifelse(df$a %in% 1, "a==1", 
               ifelse(df$b %in% 1, "b==1", 
                      ifelse(df$c %in% 1, "c==1", NA)))
    },
    # dplyr::case_when() inside mutate(); unlike the others, this expression
    # produces the data frame with an added `equals` column.
    alistaire = {
        df %>% mutate(equals = case_when(a == 1 ~ 'a==1', 
                                         b == 1 ~ 'b==1', 
                                         c == 1 ~ 'c==1'))
    }
)
#> Unit: milliseconds
#>         expr      min       lq      mean    median        uq       max neval
#>     original 81.19896 86.11843 110.93882 123.92463 128.58037 171.11026   100
#>        akrun 27.50351 30.99127  38.98353  32.67991  34.64947  77.98958   100
#>  amatsuo_net 83.75744 88.54095 109.22226 110.40066 129.02168 170.92911   100
#>    alistaire 16.57426 18.91951  21.73293  19.29925  24.30350  33.83180   100
Run Code Online (Sandbox Code Playgroud)