在TRUE / FALSE和NA分组序列中对组进行排名

Hum*_*hen 10 r rank dplyr data.table

我有一点坚果要破解。

我有一个data.frame这样的:

   group criterium
1      A        NA
2      A      TRUE
3      A      TRUE
4      A      TRUE
5      A     FALSE
6      A     FALSE
7      A      TRUE
8      A      TRUE
9      A     FALSE
10     A      TRUE
11     A      TRUE
12     A      TRUE
13     B        NA
14     B     FALSE
15     B      TRUE
16     B      TRUE
17     B      TRUE
18     B     FALSE

structure(list(group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A", 
"B"), class = "factor"), criterium = c(NA, TRUE, TRUE, TRUE, 
FALSE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, NA, FALSE, 
TRUE, TRUE, TRUE, FALSE)), class = "data.frame", row.names = c(NA, 
-18L))

Run Code Online (Sandbox Code Playgroud)

而且我想按升序排列TRUEcriterium中的组,而忽略FALSENA。目标是在的每个组中都有一个唯一的组标识符group

因此结果应如下所示:

    group criterium goal
1      A        NA   NA
2      A      TRUE    1
3      A      TRUE    1
4      A      TRUE    1
5      A     FALSE   NA
6      A     FALSE   NA
7      A      TRUE    2
8      A      TRUE    2
9      A     FALSE   NA
10     A      TRUE    3
11     A      TRUE    3
12     A      TRUE    3
13     B        NA   NA
14     B     FALSE   NA
15     B      TRUE    1
16     B      TRUE    1
17     B      TRUE    1
18     B     FALSE   NA

Run Code Online (Sandbox Code Playgroud)

我敢肯定有一个相对简单的方法可以做到这一点,我想不出一个。我尝试了dense_rank()和的其他窗口功能dplyr,但无济于事。

chi*_*n12 7

另一种data.table方法:

library(data.table)
setDT(dt)
dt[, cr := rleid(criterium)][
    (criterium), goal := rleid(cr), by=.(group)]
Run Code Online (Sandbox Code Playgroud)


Ron*_*hah 6

也许我把这个复杂化了,但是一种方法dplyr

library(dplyr)

df %>%
  mutate(temp = replace(criterium, is.na(criterium), FALSE), 
         temp1 = cumsum(!temp)) %>%
   group_by(temp1) %>%
   mutate(goal =  +(row_number() == which.max(temp) & any(temp))) %>%
   group_by(group) %>%
   mutate(goal = ifelse(temp, cumsum(goal), NA)) %>%
   select(-temp, -temp1)

#  group criterium  goal
#   <fct> <lgl>     <int>
# 1 A     NA           NA
# 2 A     TRUE          1
# 3 A     TRUE          1
# 4 A     TRUE          1
# 5 A     FALSE        NA
# 6 A     FALSE        NA
# 7 A     TRUE          2
# 8 A     TRUE          2
# 9 A     FALSE        NA
#10 A     TRUE          3
#11 A     TRUE          3
#12 A     TRUE          3
#13 B     NA           NA
#14 B     FALSE        NA
#15 B     TRUE          1
#16 B     TRUE          1
#17 B     TRUE          1
#18 B     FALSE        NA
Run Code Online (Sandbox Code Playgroud)

我们首先replace NAcriterium到列中添加,FALSE然后对它的负数(temp1)求和。我们group_by temp1将1赋给TRUE组中的每个第一个值。最后,通过分组,group我们对TRUE值进行累加或NAFALSE和进行返回NA


mar*_*kus 5

一个data.table选项使用rle

library(data.table)
DT <- as.data.table(dat)
DT[, goal := {
  r <- rle(replace(criterium, is.na(criterium), FALSE))
  r$values <- with(r, cumsum(values) * values)          
  out <- inverse.rle(r)                                 
  replace(out, out == 0, NA)
}, by = group]
DT
#    group criterium goal
# 1:     A        NA   NA
# 2:     A      TRUE    1
# 3:     A      TRUE    1
# 4:     A      TRUE    1
# 5:     A     FALSE   NA
# 6:     A     FALSE   NA
# 7:     A      TRUE    2
# 8:     A      TRUE    2
# 9:     A     FALSE   NA
#10:     A      TRUE    3
#11:     A      TRUE    3
#12:     A      TRUE    3
#13:     B        NA   NA
#14:     B     FALSE   NA
#15:     B      TRUE    1
#16:     B      TRUE    1
#17:     B      TRUE    1
#18:     B     FALSE   NA
Run Code Online (Sandbox Code Playgroud)

一步步

当我们调用时,r <- rle(replace(criterium, is.na(criterium), FALSE))我们得到一个类的对象rle

r
#Run Length Encoding
#  lengths: int [1:9] 1 3 2 2 1 3 2 3 1
#  values : logi [1:9] FALSE TRUE FALSE TRUE FALSE TRUE ...
Run Code Online (Sandbox Code Playgroud)

我们values通过以下方式操作组件

r$values <- with(r, cumsum(values) * values)
r
#Run Length Encoding
#  lengths: int [1:9] 1 3 2 2 1 3 2 3 1
#  values : int [1:9] 0 1 0 2 0 3 0 4 0 
Run Code Online (Sandbox Code Playgroud)

也就是说,我们将TRUEs替换为 的累积values和并将FALSEs设置为0。现在inverse.rle返回一个向量,其中values将重复lenghts次数

out <- inverse.rle(r)
out
# [1] 0 1 1 1 0 0 2 2 0 3 3 3 0 0 4 4 4 0 
Run Code Online (Sandbox Code Playgroud)

这几乎是 OP 想要的,但我们需要0NA

replace(out, out == 0, NA)
Run Code Online (Sandbox Code Playgroud)

这是为每个group.

数据

dat <- structure(list(group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A", 
"B"), class = "factor"), criterium = c(NA, TRUE, TRUE, TRUE, 
FALSE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, NA, FALSE, 
TRUE, TRUE, TRUE, FALSE)), class = "data.frame", row.names = c(NA, 
-18L))
Run Code Online (Sandbox Code Playgroud)