使用 tidyverse:在组内对值变化之前和之后分别计数,并为每个唯一的变化(shift)生成新变量

Eri*_*ail 14 r dplyr tidyr tidyverse

我正在寻找一个 tidyverse 解决方案,用于在数据 `tbl` 中按 `id` 分组统计 `TF` 各唯一值的出现次数。当 `TF` 的值发生变化时,我想从该变化点起分别向前和向后计数。此计数应存储在新变量 `PM##` 中,使 `PM##` 同时保存 `TF` 每个唯一变化对应的正向(加号)和反向(减号)计数。

这个问题类似于我之前提出的问题,但在这里我特意寻找使用 tidyverse 工具的解决方案。Uwe 在那个问题下用 data.table 给出了一个优雅的答案。

如果这个问题违反了任何SO政策,请告诉我,我会很乐意重新打开我的初步问题,或者附上一个赏金问题.

用一个最小的工作例来说明我的问题.我有这样的数据,

# install.packages(c("tidyverse"), dependencies = TRUE)
library(tibble)

# Sample data: 30 rows in three `id` groups (0, 1, 7) of 11, 8 and 11 rows.
# `TF` is a 0/1 indicator with missing values; one line per `id` group below.
tbl <- tibble(
  id = rep(c(0, 1, 7), times = c(11, 8, 11)),
  TF = c(
    NA, 0, NA, 0, 0, 1, 1, 1, NA, 0, 0,
    NA, 0, 0, 0, 1, 1, 1, NA,
    NA, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1
  )
)
tbl
#> # A tibble: 30 x 2
#>       id    TF
#>    <dbl> <dbl>
#>  1     0    NA
#>  2     0     0
#>  3     0    NA
#>  4     0     0
#>  5     0     0
#>  6     0     1
#>  7     0     1
#>  8     0     1
#>  9     0    NA
#> 10     0     0
#> # ... with 20 more rows
Run Code Online (Sandbox Code Playgroud)

这就是我想要获得的,

# Expected result: the sample data plus one PM## column per change of TF
# within an id. In each PM## column the rows before the change count down to
# -1 and the rows after it count up from 1. One line per `id` group below.
dfa <- tibble(
  id = rep(c(0, 1, 7), times = c(11, 8, 11)),
  TF = c(
    NA, 0, NA, 0, 0, 1, 1, 1, NA, 0, 0,
    NA, 0, 0, 0, 1, 1, 1, NA,
    NA, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1
  ),
  PM01 = c(
    NA, -3, NA, -2, -1, 1, 2, 3, NA, NA, NA,
    NA, -3, -2, -1, 1, 2, 3, NA,
    NA, -2, -1, 1, NA, NA, NA, NA, NA, NA, NA
  ),
  PM02 = c(
    NA, NA, NA, NA, NA, -3, -2, -1, NA, 1, 2,
    rep(NA, 8),
    NA, NA, NA, -1, 1, 2, NA, NA, NA, NA, NA
  ),
  PM03 = c(
    rep(NA, 11),
    rep(NA, 8),
    NA, NA, NA, NA, -2, -1, 1, NA, NA, NA, NA
  ),
  PM04 = c(
    rep(NA, 11),
    rep(NA, 8),
    NA, NA, NA, NA, NA, NA, -1, 1, NA, NA, NA
  ),
  PM05 = c(
    rep(NA, 11),
    rep(NA, 8),
    NA, NA, NA, NA, NA, NA, NA, -1, 1, 2, 3
  )
)

dfa
#> # A tibble: 30 x 7
#>       id    TF  PM01  PM02  PM03  PM04  PM05
#>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#>  1     0    NA    NA    NA    NA    NA    NA
#>  2     0     0    -3    NA    NA    NA    NA
#>  3     0    NA    NA    NA    NA    NA    NA
#>  4     0     0    -2    NA    NA    NA    NA
#>  5     0     0    -1    NA    NA    NA    NA
#>  6     0     1     1    -3    NA    NA    NA
#>  7     0     1     2    -2    NA    NA    NA
#>  8     0     1     3    -1    NA    NA    NA
#>  9     0    NA    NA    NA    NA    NA    NA
#> 10     0     0    NA     1    NA    NA    NA
#> # ... with 20 more rows
Run Code Online (Sandbox Code Playgroud)

Psi*_*dom 5

这里是另一种 tidyverse 方法,使用了 dplyr、tidyr 和 zoo(用于其 na.locf 函数)包:

首先,我没有像其他所有建议的方法(包括 data.table 方法)那样先删除 TF 列中的 NA、之后再连接回来,而是编写了一个辅助函数,它按连续相同值的块向前计数,并忽略 NA:

# Number the elements of each run of equal values, ignoring NAs.
#
# NAs are skipped when runs are detected, so a run continues across them:
# c(1, NA, 1) is one run of length two. NA positions stay NA in the result.
#
# v: an atomic vector, possibly containing NA.
# Returns a vector of the same length as `v` holding 1, 2, 3, ... within each
# run of identical non-NA values (as doubles) and NA where `v` is NA.
forward_count <- function(v) {
    valid <- !is.na(v)
    run_lengths <- rle(v[valid])$lengths
    # sequence() restarts at 1 for every run length; as.numeric() keeps the
    # result a double vector so it can be combined with other double counts.
    v[valid] <- as.numeric(sequence(run_lengths))
    v
}
Run Code Online (Sandbox Code Playgroud)

它满足“变化之后计数”的要求:

# Draw a random vector of NA/0/1 values; the output shown is from one such
# draw, so a fresh run will print different values.
v <- sample(c(NA, 0, 1), 15, replace = TRUE)
v
# [1] NA NA NA  0  1 NA  1 NA  1  1  0  1  0  0  0
forward_count(v)
# [1] NA NA NA  1  1 NA  2 NA  3  4  1  1  1  2  3
Run Code Online (Sandbox Code Playgroud)

可以通过使用此完全相同的函数将向量反转两次来实现更改之前的计数:

# Reverse the input, count forward, reverse back and negate: each run is then
# numbered so that its last element is -1 (the count before the next change).
-rev(forward_count(rev(v)))
# [1] NA NA NA -1 -4 NA -3 NA -2 -1 -1 -1 -3 -2 -1
Run Code Online (Sandbox Code Playgroud)

现在使用 dplyr 包定义列标题,并将向前计数存为 fd 列、向后计数存为 bd 列:

library(dplyr); library(tidyr); library(zoo);

# Build the PM## columns with dplyr/tidyr (uses zoo::na.locf and the
# forward_count() helper).
#
# df: data frame with a grouping column `id` and an indicator column `TF`
#     that may contain NA.
# Returns the rows of `df` ordered by id and original within-group position,
# with columns id, TF and one PM## column per change of TF inside a group.
# In PM## the rows before the change hold negative counts ending at -1 and
# the rows after it hold positive counts starting at 1.
# NOTE(review): no ungroup() is applied, so the result presumably still
# carries the grouping by id.
tidy_method <- function(df) {
    df %>%
        group_by(id) %>%
        mutate(
            # Chunk id for constant TF; NA rows inherit the previous chunk.
            # The fill is seeded with the group's first observed TF value (0
            # if the whole group is NA) so that the first chunk is always
            # chunk 0. With a fixed seed of 0, a group whose first observed
            # TF was 1 got its first chunk numbered 1, which shifted or
            # corrupted the PM## labels for that group.
            rle_id = cumsum(
                diff(na.locf(c(c(TF[!is.na(TF)], 0)[1], TF))) != 0
            ),
            PM_fd = if_else(                 # PM count after change headers
                rle_id == head(rle_id, 1),
                "head", sprintf('PM%02d', rle_id)
            ),
            PM_bd = if_else(                 # shift the header up as before change headers
                rle_id == tail(rle_id, 1),
                "tail", sprintf('PM%02d', rle_id+1)
            ),
            fd = forward_count(TF),             # after change count
            bd = -rev(forward_count(rev(TF))),  # before change count
            rn = seq_along(id)) %>%             # row number within the group
        gather(key, value, PM_fd, PM_bd) %>%    # align headers with the count
        mutate(count_ = if_else(key == "PM_fd", fd, bd)) %>%
        select(-key) %>% spread(value, count_) %>%    # reshape PM headers into columns
        select(id, TF, rn, matches('PM')) %>%  # drop helper and head/tail columns
        arrange(id, rn) %>% select(-rn)
}
Run Code Online (Sandbox Code Playgroud)

与 data.table 方法的耗时比较:

data.table方法定义为:

# data.table implementation of the PM## counting.
#
# df: data frame (or data.table) with columns `id` and `TF`.
# Returns a data.table holding the rows of `df` together with the PM##
# columns produced by dcast(); the helper row-number column is dropped.
dt_method <- function(df) {
    # Work on a private copy. setDT(df)[, rn := .I] converted the caller's
    # object to a data.table and added an `rn` column to it by reference.
    dt <- as.data.table(df)
    dt[, rn := .I]

    # Drop NA rows, number the runs of TF within each id, and precompute the
    # ascending (up) and descending (dn) counts for every run.
    tmp_dt <- dt[!is.na(TF)][, rl := rleid(TF), by = id][
        , c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)][]

    # For every change V1 within an id: take dn from the run before it and up
    # from the run after it, reshape to wide, then join back onto all rows so
    # the NA rows reappear.
    res_dt <- tmp_dt[tmp_dt[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE][
        rl == V1, PM := dn][rl == V1 + 1L, PM := up][
            , dcast(.SD, id + TF + rn ~ sprintf("PM%02d", V1), value.var = "PM")][
                dt, on = .(rn, id, TF)][, -"rn"]
    res_dt
}
Run Code Online (Sandbox Code Playgroud)

数据:通过重复样本数据帧200次来获得中等大小的数据:

# Stack 200 copies of the sample data frame `df` into one test data set.
df_test <- bind_rows(replicate(200, df, simplify = FALSE))

# Time both implementations on the same input, 10 runs each.
microbenchmark::microbenchmark(
    dt_method(df_test),
    tidy_method(df_test),
    times = 10
)
#Unit: milliseconds
#                 expr       min        lq      mean    median        uq       max neval
#   dt_method(df_test) 2321.5852 2439.8393 2490.8583 2456.1118 2557.4423 2834.2399    10
# tidy_method(df_test)  402.3624  412.2838  437.0801  414.5655  418.6564  540.9667    10
Run Code Online (Sandbox Code Playgroud)

将 data.table 方法的结果按 id 排序,并将所有列的数据类型转换为数值型之后,data.table 方法与 tidyverse 方法的结果完全相同:

# Compare both results as plain data frames: the data.table result is first
# ordered by id and has every column converted to numeric.
identical(
  as.data.frame(
    dt_method(df_test)[order(id), lapply(.SD, as.numeric)]
  ),
  as.data.frame(
    tidy_method(df_test)
  )
)
# [1] TRUE
Run Code Online (Sandbox Code Playgroud)


m-d*_*-dz 3

使用稍微优化的 data.table 函数进行更新:

可能应该回到老问题,但这也许会引发一些进一步的优化。

为了继续推进,我对这个 data.table 函数做了一些尝试,并将执行时间减少到 tidyverse 版本的大约两倍——瓶颈是 dcast() 函数,请参见下面的 profvis 屏幕截图。

# Tuned data.table implementation: dcast() runs on a single pasted key
# (`United`) instead of the three columns id + TF + rn.
#
# dt_test: data.table (a plain data frame also works) with columns `id`, `TF`.
# Returns a data.table holding the rows of `dt_test` together with the PM##
# columns produced by dcast(); the helper columns rn and United are dropped.
# NOTE(review): id, TF and rn are pasted into text and parsed back with
# as.numeric(), so this presumably only round-trips cleanly for numeric ids.
dt_method <- function(dt_test) {
  # as.data.table() returns a copy, so `rn := .I` below no longer adds a
  # column to the caller's table by reference.
  dt <- as.data.table(dt_test)

  # Drop NA rows, number the runs of TF within each id, precompute the
  # ascending/descending counts per run and build the pasted key.
  tmp_dt <- dt[, rn := .I][!is.na(TF)][, rl := rleid(TF), by = id][
    , c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)][, ':='(
      rl_PM = sprintf("PM%02d", rl),
      United = paste(id, TF, rn, sep = '_')
    )]

  # V1 = PM## label, V2 = index of the change within the id. Take dn from the
  # run before each change and up from the run after it, reshape to wide,
  # recover id/TF/rn from the key and join back onto all rows.
  res_dt <- tmp_dt[, .(sprintf("PM%02d", seq_len(max(rl) - 1L)), seq_len(max(rl) - 1L)), by = .(id)] %>%
    tmp_dt[., on = .(id), allow.cartesian = TRUE] %>%
    .[rl == V2, PM := dn] %>%
    .[rl == V2 + 1L, PM := up] %>%
    dcast(., United ~ V1, value.var = "PM") %>%
    .[, c('id', 'TF', 'rn') := lapply(tstrsplit(United, '_'), as.numeric)] %>%
    .[dt, on = .(rn, id, TF)] %>% .[, -c('rn', 'United')]
  res_dt
}
Run Code Online (Sandbox Code Playgroud)

需要使用管道来绕过一些奇怪的错误,但我认为即使对于 data.table,这样写出的代码仍然是可读的。

微基准测试结果:

Unit: milliseconds
                 expr      min       lq      mean    median        uq       max neval
   dt_method(dt_test) 868.1491 932.8076 1048.5077 1029.9609 1078.0735 1518.0327    10
 tidy_method(df_test) 478.6824 515.5639  557.9644  565.9422  585.3143  622.1093    10
Run Code Online (Sandbox Code Playgroud)

identical()具有固定的列顺序:

# Same comparison as before, but the data.table result first gets its columns
# reordered so that id and TF come first, followed by the remaining columns.
identical(
  dt_method(dt_test)[order(id), lapply(.SD, as.numeric)] %>%
    setcolorder(c('id', 'TF', setdiff(names(.), c('id', 'TF')))) %>%
    as.data.frame(),
  as.data.frame(tidy_method(df_test))
)
Run Code Online (Sandbox Code Playgroud)

profvis时间安排:

在此输入图像描述

旧部分:

使用 Uwe 的答案作为基础:

(免责声明:我没有使用dplyr太多,将其视为对自己的练习,所以它肯定不是dplyr最佳的,请参见例如dcast。)

library(data.table)
library(magrittr)
library(dplyr)
library(tibble)

# Sample input: 30 rows, ids 0 / 1 / 7, TF is a 0/1 indicator with NAs.
df <- tibble(id = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 
                    1, 1, 1, 1,7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7),
             TF = c(NA, 0, NA, 0, 0, 1, 1, 1, NA, 0, 0, NA, 0, 0, 0,
                    1, 1, 1, NA, NA, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1))

# Expected output: one PM## column per change of TF within an id, with
# negative counts before the change and positive counts after it.
dfa <- tibble(id = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
                     1, 1, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7),
              TF = c(NA, 0, NA, 0, 0, 1, 1, 1, NA, 0, 0, NA, 0, 0,
                     0, 1, 1, 1, NA, NA, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1),
              PM01 = c(NA, -3, NA, -2, -1, 1, 2, 3, NA, NA, NA, NA, -3, -2, -1,
                       1, 2, 3, NA, NA, -2, -1, 1, NA, NA, NA, NA, NA, NA, NA),
              PM02 = c(NA, NA, NA, NA, NA, -3, -2, -1, NA, 1, 2, NA, NA, NA, NA,
                       NA, NA, NA, NA, NA, NA, NA, -1, 1, 2, NA, NA, NA, NA, NA),
              PM03 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
                       NA, NA, NA, NA, NA, NA, NA, NA, -2, -1, 1, NA, NA, NA, NA),
              PM04 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
                       NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, 1, NA, NA, NA),
              PM05 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
                       NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, 1, 2, 3))

# setDT() converts `df` to a data.table in place and `rn := .I` adds a row
# number column to it by reference; later code joins on that `rn` column.
# NA rows are then dropped, runs of TF are numbered per id (rl), and the
# ascending (up) / descending (dn) counts are computed once per run.
tmp_dt <- setDT(df)[, rn := .I][!is.na(TF)][, rl := rleid(TF), by = id][
  , c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)][]

# For each change V1 within an id: use dn for the run before it and up for the
# run after it, reshape to one PM## column per change, then join back onto
# `df` (which now carries rn) so the NA rows reappear, and drop rn.
res_dt <- tmp_dt[tmp_dt[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE][
  rl == V1, PM := dn][rl == V1 + 1L, PM := up][
    , dcast(.SD, id + TF + rn ~ sprintf("PM%02d", V1), value.var = "PM")][
      df, on = .(rn, id, TF)][, -"rn"]
res_dt

# Check the result against the expected output.
all.equal(res_dt, as.data.table(dfa))
Run Code Online (Sandbox Code Playgroud)

尽可能的整洁:

# dplyr re-implementation of the data.table chain. It expects `df` to carry
# the columns id, TF and rn (the final full_join uses all three as keys).
tmp_dplyr <- df %>%
  # create row id column (required for final join to get NA rows back in)
  mutate(rn = row_number()) %>%
  # ignore NA rows
  filter(complete.cases(.)) %>%
  # number streaks of unique values within each group
  group_by(id) %>%
  mutate(rl = rleid(TF)) %>%
  # create ascending and descending counts for each streak
  # this is done once to avoid repeated creation of counts for each PM
  # (slight performance gain)
  group_by(id, rl) %>%
  mutate(
    up = seq_len(n()),
    dn = -rev(seq_len(n()))
  )

res_dplyr <- tmp_dplyr %>%
  ## Replicating tmp[tmp[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE]
  group_by(id) %>%
  ## Part below can for sure be optimized for code length, it's just too early now...
  transmute(rl = max(rl)) %>% # Cannot transmute id directly
  unique() %>%
  ungroup() %>%
  ## seq_len(n()) instead of 1:n(): 1:0 would yield c(1, 0) on empty input
  slice(rep(seq_len(n()), times = rl - 1L)) %>%
  group_by(id) %>%
  transmute(V1 = seq_len(max(rl) - 1L)) %>%
  ungroup() %>%
  ## The data.table join X[Y] keeps the rows of Y (the V1 table), not every
  ## row of tmp. A right_join onto tmp_dplyr kept ids without any change,
  ## giving V1 = NA and a spurious "PMNA" column after dcast. Those rows are
  ## restored by the full_join with df below.
  inner_join(tmp_dplyr, by = 'id') %>%
  ## End of replicating tmp[tmp[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE]
  ## Copy descending counts to rows before the switch and ascending counts to rows after the switch
  mutate(
    PM = ifelse(rl == V1, dn, NA),
    PM = ifelse(rl == V1 + 1L, up, PM)
  ) %>%
  ## This is very not tidyverse-esque, but I don't get the gather/spread ...
  dcast(id + TF + rn ~ sprintf("PM%02d", V1), value.var = "PM") %>%
  ## bring back the rows dropped above (NA rows, ids without any change)
  full_join(df, by = c('rn', 'id', 'TF')) %>%
  select(-rn)

## Row order differs between the two results, so sort both before comparing
all.equal( ## Using data.table all.equal
  res_dplyr[do.call(order, res_dplyr),] %>% as.data.table(),
  res_dt[do.call(order, res_dt),]
)
Run Code Online (Sandbox Code Playgroud)