我有一个用 Hmisc R 包添加了标签的列。该列的类是 c("labelled", "numeric")。如果我对整个列计算 median(),返回的中位数的类仍然是 c("labelled", "numeric")。
但是,如果我分别在两个子组中计算 median(),其中一个中位数返回相同的类,而另一个返回的类却是 "numeric"。返回的类不一致会导致 dplyr::summarize() 报错。
# Reprex: median() on an Hmisc-labelled column returns a different class
# depending on the subgroup, which makes a grouped dplyr summary fail.
library(magrittr)
# Example data frame with 50 rows: `cd4_count` (numeric, contains NAs) and
# `unsuccessful` (0/1 grouping indicator).
data <-
structure(
list(
cd4_count = c(
30, 97, 210, NA, 358, 242, 126,
792, 6, 145, 22, 150, 43, 23, 39, 953, 357, 427, 367, 239, 72,
61, 61, 438, 392, 1092, 245, 326, 42, 135, 199, 158, 17, NA,
287, 187, 252, 477, 157, NA, NA, 362, NA, 183, 885, 109, 321,
286, 142, 797
),
unsuccessful = c(
0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0
)
),
row.names = c(NA, 50L),
class = "data.frame"
)
# Add label to CD4 count, using Hmisc package
Hmisc::label(data$cd4_count) <- "CD4 count"
# Before summarising, the full column and both subgroups all have the same
# class: c("labelled", "numeric").
data$cd4_count %>% class()
#> [1] "labelled" "numeric"
data$cd4_count[data$unsuccessful == 0] %>% class()
#> [1] "labelled" "numeric"
data$cd4_count[data$unsuccessful == 1] %>% class()
#> [1] "labelled" "numeric"
# After median(), the classes differ: the unsuccessful == 0 subgroup keeps
# "labelled", while the unsuccessful == 1 subgroup comes back as plain "numeric".
data$cd4_count[data$unsuccessful == 0] %>% median(na.rm = TRUE) %>% class()
#> [1] "labelled" "numeric"
data$cd4_count[data$unsuccessful == 1] %>% median(na.rm = TRUE) %>% class()
#> [1] "numeric"
# Because the per-group results have different classes, the grouped summary
# below stops with an error (see the recorded message).
data %>%
dplyr::group_by(unsuccessful) %>%
dplyr::summarize_at(dplyr::vars(cd4_count), median, na.rm = TRUE)
#> Error: Problem with `summarise()` input `cd4_count`.
#> x Input `cd4_count` must return compatible vectors across groups
#> i Result type for group 1 (unsuccessful = 0): <labelled>.
#> i Result type for group 2 (unsuccessful = 1): <double>.
#> i Input `cd4_count` is `(function (x, na.rm = FALSE, ...) ...`.
Run Code Online (Sandbox Code Playgroud)
正如 user20650 在评论中指出的,属性是被删除还是被保留,取决于向量 x 的长度。
查看 median.default 方法的代码就能明白原因。如果 length(x) 是偶数,median 内部会调用 mean;否则只对 x 进行排序(sort)并取子集,而这与 mean 不同,不会删除属性。
# Let's have a look at the median.default method (its printed source, with
# explanatory comments added).
function (x, na.rm = FALSE, ...)
{
# Factors and data frames are rejected up front.
if (is.factor(x) || is.data.frame(x))
stop("need numeric data")
# Names are removed before the computation.
if (length(names(x)))
names(x) <- NULL
# With na.rm = TRUE the missing values are dropped; otherwise any NA makes
# the function return a single NA obtained by subsetting x.
if (na.rm)
x <- x[!is.na(x)]
else if (any(is.na(x)))
return(x[FALSE][NA])
n <- length(x)
# Empty input (after NA removal) also returns a single NA built from x.
if (n == 0L)
return(x[FALSE][NA])
# Position of the middle element (the lower of the two middles when n is even).
half <- (n + 1L)%/%2L
if (n%%2L == 1L)
# when length is odd: the result is produced only by sort() and `[`, so the
# attribute is kept (see the class() outputs for labelled vectors in this post)
sort(x, partial = half)[half]
# when length is even: the two middle values are averaged with mean(), and
# `mean` drops the attribute
else mean(sort(x, partial = half + 0L:1L)[half + 0L:1L])
}
Run Code Online (Sandbox Code Playgroud)
由reprex 包(v0.3.0)于 2021 年 4 月 28 日创建
让我们再看看不同的向量及其行为。我们可以定义一个 keep_attr 函数,它包装另一个函数,并把输入的属性保留到结果上。
# Three small labelled vectors showing when median()/mean() keep or drop
# the "labelled" class. Assignment uses `<-` throughout (idiomatic R).

# x1: length 1 (odd) -> median() only sorts/subsets, so the class survives;
# mean() returns a plain numeric.
x1 <- 1
Hmisc::label(x1) <- "qw"
class(median(x1)) # keeps attribute
#> [1] "labelled" "numeric"
class(mean(x1)) # drops attribute
#> [1] "numeric"

# x2: length 2 (even) -> median() averages the two middle values via mean(),
# so the class is lost.
x2 <- c(1, 2)
Hmisc::label(x2) <- "qw"
class(median(x2)) # uses mean
#> [1] "numeric"
class(mean(x2))
#> [1] "numeric"

# x3: length 3 with an NA; median() is called without na.rm, so it returns
# early via subsetting and never reaches mean() -> the class survives.
x3 <- c(1, 2, NA)
Hmisc::label(x3) <- "qw"
class(median(x3)) # doesn't use mean
#> [1] "labelled" "numeric"
class(mean(x3))
#> [1] "numeric"
#' Apply a function and carry the input's attributes over to the result
#'
#' Calls `.f(x, ...)` and then replaces the attributes of the result with
#' those of `x`, so that e.g. a class and label set on `x` survive summary
#' functions such as `mean()` that return a bare numeric.
#'
#' @param .f Function to call; it receives `x` as its first argument.
#' @param x Input object whose attributes should be kept.
#' @param ... Further arguments forwarded to `.f` (e.g. `na.rm = TRUE`).
#' @return The value of `.f(x, ...)` carrying the attributes of `x`. When the
#'   result has a different length than `x`, the length-dependent attributes
#'   `names`, `dim` and `dimnames` of `x` are not copied.
keep_attr <- function(.f, x, ...) {
  x_att <- attributes(x)
  res <- .f(x, ...)
  if (length(res) != length(x)) {
    # names/dim/dimnames describe the layout of `x`; assigning them to a
    # result of another length (e.g. the median of a named vector) would
    # make `attributes<-` fail, so they are left out in that case.
    shape_attrs <- c("names", "dim", "dimnames")
    x_att <- x_att[setdiff(names(x_att), shape_attrs)]
  }
  attributes(res) <- x_att
  res
}
# With keep_attr(), both median() and mean() on the even-length vector x2
# return the "labelled" class again (without it, both gave plain "numeric").
class(keep_attr(median, x2))
#> [1] "labelled" "numeric"
class(keep_attr(mean, x2))
#> [1] "labelled" "numeric"
# Extra arguments such as na.rm are passed through to the wrapped function;
# the printed result shows the "qw" label above the value.
keep_attr(median, x3, na.rm = TRUE)
#> qw
#> [1] 1.5
Run Code Online (Sandbox Code Playgroud)
由reprex 包(v0.3.0)于 2021 年 4 月 28 日创建
更新
关于您的 dplyr 问题,我现在能够重现它了(起初我忘了给 cd4_count 列加标签,还以为是 dplyr 版本的问题)。不过,使用 keep_attr 的变通方法似乎是有效的。
# Reprex: reproduce the grouped-summary error from the question, then show
# that wrapping median() in keep_attr() (defined earlier in this answer)
# makes the summary work.
library(dplyr)
# Same example data as in the question: 50 rows, `cd4_count` with NAs and a
# 0/1 `unsuccessful` grouping indicator.
data <-
structure(
list(
cd4_count = c(
30, 97, 210, NA, 358, 242, 126,
792, 6, 145, 22, 150, 43, 23, 39, 953, 357, 427, 367, 239, 72,
61, 61, 438, 392, 1092, 245, 326, 42, 135, 199, 158, 17, NA,
287, 187, 252, 477, 157, NA, NA, 362, NA, 183, 885, 109, 321,
286, 142, 797
),
unsuccessful = c(
0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0
)
),
row.names = c(NA, 50L),
class = "data.frame"
)
# Add label to CD4 count, using Hmisc package
Hmisc::label(data$cd4_count) <- "CD4 count"
# Plain median(): the two groups return different classes (<labelled> vs
# <double>), so the summary fails with the error recorded below.
# NOTE(review): summarize_at() is superseded by across() in current dplyr;
# kept here so the code matches the recorded output.
data %>%
dplyr::group_by(unsuccessful) %>%
dplyr::summarize_at(dplyr::vars(cd4_count), median, na.rm = TRUE)
#> Error: Problem with `summarise()` input `cd4_count`.
#> x Input `cd4_count` must return compatible vectors across groups
#> i Input `cd4_count` is `(function (x, na.rm = FALSE, ...) ...`.
#> i Result type for group 1 (unsuccessful = 0): <labelled>.
#> i Result type for group 2 (unsuccessful = 1): <double>.
# Workaround: keep_attr() re-applies the column's attributes to each group's
# median, so both groups return <labelled> and the summary succeeds.
data %>%
dplyr::group_by(unsuccessful) %>%
dplyr::summarize_at(dplyr::vars(cd4_count), ~ keep_attr(median, .x, na.rm = TRUE))
#> # A tibble: 2 x 2
#> unsuccessful cd4_count
#> <dbl> <labelled>
#> 1 0 210.0
#> 2 1 135.5
Run Code Online (Sandbox Code Playgroud)
由reprex 包(v0.3.0)于 2021 年 4 月 28 日创建