我试图找出一种方法来聚合组的级别,根据您正在聚合的阈值创建一个新级别。
创建一些数据:
library(tidyr)
library(dplyr)
demo_data <- as_tibble(VADeaths) %>%
mutate(age_bucket = row.names(VADeaths)) %>%
pivot_longer(-age_bucket) %>%
arrange(name)
Run Code Online (Sandbox Code Playgroud)
这里有一堆低于我们阈值的值(这里说 15)
demo_data %>%
filter(value < 15)
#> # A tibble: 5 x 3
#> age_bucket name value
#> <chr> <chr> <dbl>
#> 1 50-54 Rural Female 8.7
#> 2 55-59 Rural Female 11.7
#> 3 50-54 Rural Male 11.7
#> 4 50-54 Urban Female 8.4
#> 5 55-59 Urban Female 13.6
Run Code Online (Sandbox Code Playgroud)
现在我可以使用一些逻辑来做到这一点,case_when但这似乎很脆弱,因为它太具体了。然而,这确实说明了我的追求:
demo_data %>%
mutate(age_bucket_agg = case_when(
age_bucket %in% c("50-54", "55-59") & name == "Rural Female" ~ "50-59",
age_bucket %in% c("50-54", "55-59") & name == "Urban Female" ~ "50-59",
age_bucket %in% c("50-54", "55-59") & name == "Rural Male" ~ "50-59",
TRUE ~ age_bucket
)
) %>%
group_by(age_bucket_agg, name) %>%
summarise(value = sum(value))
#> `summarise()` regrouping output by 'age_bucket_agg' (override with `.groups` argument)
#> # A tibble: 17 x 3
#> # Groups: age_bucket_agg [6]
#> age_bucket_agg name value
#> <chr> <chr> <dbl>
#> 1 50-54 Urban Male 15.4
#> 2 50-59 Rural Female 20.4
#> 3 50-59 Rural Male 29.8
#> 4 50-59 Urban Female 22
#> 5 55-59 Urban Male 24.3
#> 6 60-64 Rural Female 20.3
#> 7 60-64 Rural Male 26.9
#> 8 60-64 Urban Female 19.3
#> 9 60-64 Urban Male 37
#> 10 65-69 Rural Female 30.9
#> 11 65-69 Rural Male 41
#> 12 65-69 Urban Female 35.1
#> 13 65-69 Urban Male 54.6
#> 14 70-74 Rural Female 54.3
#> 15 70-74 Rural Male 66
#> 16 70-74 Urban Female 50
#> 17 70-74 Urban Male 71.1
Run Code Online (Sandbox Code Playgroud)
我的问题是有人能想出一种自动化的方式来做到这一点吗?我如何告诉 dplyr(或一般的 R)将所有低于阈值的值作为阈值并将它们添加到下一个age_bucket,然后重新编码该分组级别以获取最小值和最大值并创建一个新范围。
我认为你的例子对于这个真正具有挑战性的问题来说有点太小了。我向您的数据添加了一些挑战,我认为其他答案的方法还无法解决这些挑战。我的方法相当冗长。本质上,它检查可以合并年龄桶的每个逻辑组合/方向,然后递归地合并年龄桶,直到满足阈值或直到没有其他年龄桶可以合并在一起。通过更多的工作,我们可以将其变成一个更通用的函数。
library(tidyverse)
demo_data <- as_tibble(VADeaths) %>%
mutate(age_bucket = row.names(VADeaths)) %>%
pivot_longer(-age_bucket) %>%
arrange(name) %>%
# lets add more challenges to the data
mutate(value = case_when(
age_bucket == "55-59" & name == "Rural Female" ~ 2,
age_bucket == "70-74" & name == "Rural Male" ~ 13,
age_bucket == "65-69" & name == "Urban Female" ~ 8,
age_bucket == "70-74" & name == "Urban Male" ~ 3,
T ~ value))
# function that implements merging age buckets
merge_impl <- function(x) {
if(any(x$first)) {
e <- filter(x, first == 1)
if (e$id & !is.na(e$age_max_lead)) {
out <- mutate(x,
age_max = if_else(first,
age_max_lead,
age_max),
value = if_else(first,
value + value_lead,
value))
out <- filter(out, !lag(first, default = FALSE))
} else if (e$id & is.na(e$age_max_lead & !is.na(e$age_min_lag))) {
out <- mutate(x,
age_min = if_else(first,
age_min_lag,
age_min),
value = if_else(first,
value + value_lag,
value))
out <- filter(out, !lead(first, default = FALSE))
} else if (e$id & is.na(e$age_max_lead & is.na(e$age_min_lag))) {
out <- x
} else if (!e$id & !is.na(e$age_min_lag)) {
out <- mutate(x,
age_min = if_else(first,
age_min_lag,
age_min),
value = if_else(first,
value + value_lag,
value))
out <- filter(out, !lead(first, default = FALSE))
} else if (!e$id & is.na(e$age_min_lag) & !is.na(e$age_max_lead)) {
out <- mutate(x,
age_max = if_else(first,
age_max_lead,
age_max),
value = if_else(first,
value + value_lead,
value)) %>%
out <- filter(out, !lag(first, default = FALSE))
} else if (!e$id & is.na(e$age_min_lag) & is.na(e$age_max_lead)) {
out <- x
}
} else {
out <- x
}
select(out,
-contains("lead"), -contains("lag"),
-first, -id)
}
merge_age_buckets <- function(x, threshold) {
# initialize
data_ls <-
x %>%
separate(age_bucket,
c("age_min", "age_max"),
convert = TRUE) %>%
group_by(name) %>%
mutate(across(c(age_min, age_max, value),
list(lead = ~ lead(.x),
lag = ~ lag(.x))
)
) %>%
mutate(id = age_min %% 10 == 0,
first = value < threshold & cumsum(value < threshold) == 1) %>%
group_split
# check & proceed
if(any(map_lgl(data_ls, ~ any(.x$first & nrow(.x) > 1)))) {
res <- map_dfr(data_ls, merge_impl) %>%
mutate(age_bucket = paste0(age_min, "-", age_max)) %>%
select(- c(age_min, age_max))
# if result still needs adjustment repeat
if(any(res$value < threshold)) {
merge_age_buckets(res, threshold = threshold)
} else {
return(res)
}
} else {
out <- reduce(data_ls, bind_rows) %>%
mutate(age_buckets = paste0(age_min, "-", age_max)) %>%
select(- c(age_min, age_max))
return(out)
}
}
merge_age_buckets(demo_data, 15)
#> # A tibble: 13 x 3
#> name value age_bucket
#> <chr> <dbl> <chr>
#> 1 Rural Female 31 50-64
#> 2 Rural Female 30.9 65-69
#> 3 Rural Female 54.3 70-74
#> 4 Rural Male 29.8 50-59
#> 5 Rural Male 26.9 60-64
#> 6 Rural Male 54 65-74
#> 7 Urban Female 22 50-59
#> 8 Urban Female 27.3 60-69
#> 9 Urban Female 50 70-74
#> 10 Urban Male 15.4 50-54
#> 11 Urban Male 24.3 55-59
#> 12 Urban Male 37 60-64
#> 13 Urban Male 57.6 65-74
Run Code Online (Sandbox Code Playgroud)
由reprex 包(v0.3.0)创建于 2020-06-23