如果值低于阈值,则与相邻组聚合

bos*_*hek 5 r dplyr

我试图找出一种方法来聚合组的级别,根据您正在聚合的阈值创建一个新级别。

创建一些数据:

library(tidyr)
library(dplyr)

demo_data <- as_tibble(VADeaths) %>% 
  mutate(age_bucket = row.names(VADeaths)) %>% 
  pivot_longer(-age_bucket) %>% 
  arrange(name)
Run Code Online (Sandbox Code Playgroud)

这里有一堆低于我们阈值的值(这里说 15)

demo_data %>% 
  filter(value < 15)
#> # A tibble: 5 x 3
#>   age_bucket name         value
#>   <chr>      <chr>        <dbl>
#> 1 50-54      Rural Female   8.7
#> 2 55-59      Rural Female  11.7
#> 3 50-54      Rural Male    11.7
#> 4 50-54      Urban Female   8.4
#> 5 55-59      Urban Female  13.6

Run Code Online (Sandbox Code Playgroud)

现在我可以使用一些逻辑来做到这一点,case_when但这似乎很脆弱,因为它太具体了。然而,这确实说明了我的追求:

demo_data %>% 
  mutate(age_bucket_agg = case_when(
    age_bucket %in% c("50-54", "55-59") & name == "Rural Female" ~ "50-59",
    age_bucket %in% c("50-54", "55-59") & name == "Urban Female" ~ "50-59",
    age_bucket %in% c("50-54", "55-59") & name == "Rural Male" ~ "50-59",
    TRUE ~ age_bucket
  )
  ) %>% 
  group_by(age_bucket_agg, name) %>% 
  summarise(value = sum(value))
#> `summarise()` regrouping output by 'age_bucket_agg' (override with `.groups` argument)
#> # A tibble: 17 x 3
#> # Groups:   age_bucket_agg [6]
#>    age_bucket_agg name         value
#>    <chr>          <chr>        <dbl>
#>  1 50-54          Urban Male    15.4
#>  2 50-59          Rural Female  20.4
#>  3 50-59          Rural Male    29.8
#>  4 50-59          Urban Female  22  
#>  5 55-59          Urban Male    24.3
#>  6 60-64          Rural Female  20.3
#>  7 60-64          Rural Male    26.9
#>  8 60-64          Urban Female  19.3
#>  9 60-64          Urban Male    37  
#> 10 65-69          Rural Female  30.9
#> 11 65-69          Rural Male    41  
#> 12 65-69          Urban Female  35.1
#> 13 65-69          Urban Male    54.6
#> 14 70-74          Rural Female  54.3
#> 15 70-74          Rural Male    66  
#> 16 70-74          Urban Female  50  
#> 17 70-74          Urban Male    71.1
Run Code Online (Sandbox Code Playgroud)

我的问题是有人能想出一种自动化的方式来做到这一点吗?我如何告诉 dplyr(或一般的 R)将所有低于阈值的值作为阈值并将它们添加到下一个age_bucket,然后重新编码该分组级别以获取最小值和最大值并创建一个新范围。

Tim*_*Fan 1

我认为你的例子对于这个真正具有挑战性的问题来说有点太小了。我向您的数据添加了一些挑战,我认为其他答案的方法还无法解决这些挑战。我的方法相当冗长。本质上,它检查可以合并年龄桶的每个逻辑组合/方向,然后递归地合并年龄桶,直到满足阈值或直到没有其他年龄桶可以合并在一起。通过更多的工作,我们可以将其变成一个更通用的函数。

library(tidyverse)

demo_data <- as_tibble(VADeaths) %>% 
  mutate(age_bucket = row.names(VADeaths)) %>% 
  pivot_longer(-age_bucket) %>% 
  arrange(name) %>% 
  # lets add more challenges to the data
  mutate(value = case_when(
    age_bucket == "55-59" & name == "Rural Female" ~ 2,
    age_bucket == "70-74" & name == "Rural Male" ~ 13,
    age_bucket == "65-69" & name == "Urban Female" ~ 8,
    age_bucket == "70-74" & name == "Urban Male" ~ 3,
    T ~ value))

# function that implements merging age buckets
merge_impl <- function(x) {
  
  if(any(x$first)) {
    e <- filter(x, first == 1)
    
    if (e$id & !is.na(e$age_max_lead)) {
      out <- mutate(x,
                     age_max = if_else(first,
                                       age_max_lead,
                                       age_max),
                     value = if_else(first,
                                     value + value_lead,
                                     value)) 
      out <- filter(out, !lag(first, default = FALSE))

      
    } else if (e$id & is.na(e$age_max_lead & !is.na(e$age_min_lag))) {
      out <- mutate(x,
                     age_min = if_else(first,
                                       age_min_lag,
                                       age_min),
                     value = if_else(first,
                                     value + value_lag,
                                     value))
      out <- filter(out, !lead(first, default = FALSE))
      
    } else if (e$id & is.na(e$age_max_lead & is.na(e$age_min_lag))) {
      out <- x
    } else if (!e$id & !is.na(e$age_min_lag)) {
      out <- mutate(x,
                     age_min = if_else(first,
                                       age_min_lag,
                                       age_min),
                     value = if_else(first,
                                     value + value_lag,
                                     value)) 
      out <- filter(out, !lead(first, default = FALSE))

    } else if (!e$id & is.na(e$age_min_lag) & !is.na(e$age_max_lead)) {
      out <- mutate(x,
                     age_max = if_else(first,
                                       age_max_lead,
                                       age_max),
                     value = if_else(first,
                                     value + value_lead,
                                     value)) %>% 
        out <- filter(out, !lag(first, default = FALSE))

    } else if (!e$id & is.na(e$age_min_lag) & is.na(e$age_max_lead)) {
      out <- x
    }
  } else { 
    out <- x
  }

  select(out,
         -contains("lead"), -contains("lag"),
         -first, -id)
}

merge_age_buckets <- function(x, threshold) {
  
  # initialize
  data_ls <-
    x %>% 
    separate(age_bucket,
             c("age_min", "age_max"),
             convert = TRUE) %>% 
    group_by(name) %>% 
    mutate(across(c(age_min, age_max, value),
                    list(lead = ~ lead(.x),
                         lag  = ~ lag(.x))
                   )
    ) %>% 
    mutate(id = age_min %% 10 == 0,
           first = value < threshold & cumsum(value < threshold) == 1) %>% 
    group_split 

   # check & proceed
   if(any(map_lgl(data_ls, ~ any(.x$first & nrow(.x) > 1)))) {
     res <- map_dfr(data_ls, merge_impl) %>% 
       mutate(age_bucket = paste0(age_min, "-", age_max)) %>% 
       select(- c(age_min, age_max))
     # if result still needs adjustment repeat
     if(any(res$value < threshold)) {
       merge_age_buckets(res, threshold = threshold)
     } else {
       return(res)
       }
   } else {
     out <- reduce(data_ls, bind_rows) %>% 
       mutate(age_buckets = paste0(age_min, "-", age_max)) %>% 
       select(- c(age_min, age_max))
     return(out)
   }
}
 
merge_age_buckets(demo_data, 15)
#> # A tibble: 13 x 3
#>    name         value age_bucket
#>    <chr>        <dbl> <chr>     
#>  1 Rural Female  31   50-64     
#>  2 Rural Female  30.9 65-69     
#>  3 Rural Female  54.3 70-74     
#>  4 Rural Male    29.8 50-59     
#>  5 Rural Male    26.9 60-64     
#>  6 Rural Male    54   65-74     
#>  7 Urban Female  22   50-59     
#>  8 Urban Female  27.3 60-69     
#>  9 Urban Female  50   70-74     
#> 10 Urban Male    15.4 50-54     
#> 11 Urban Male    24.3 55-59     
#> 12 Urban Male    37   60-64     
#> 13 Urban Male    57.6 65-74
Run Code Online (Sandbox Code Playgroud)

由reprex 包(v0.3.0)创建于 2020-06-23