在R中,我有以下示例数据表:
library(data.table)
# Example data: types d1..d6 appearing in consecutive runs of
# 2, 1, 3, 5, 2 and 2 rows respectively (15 rows in total).
type <- rep(paste0("d", 1:6), times = c(2L, 1L, 3L, 5L, 2L, 2L))
DT <- data.table(type = type)
# `id` is a 1-based row counter that restarts for every type.
DT[, id := seq_len(.N), by = type]
Run Code Online (Sandbox Code Playgroud)
看起来像这样:
# Input:
#
type id
1: d1 1
2: d1 2
3: d2 1
4: d3 1
5: d3 2
6: d3 3
7: d4 1
8: d4 2
9: d4 3
10: d4 4
11: d4 5
12: d5 1
13: d5 2
14: d6 1
15: d6 2
Run Code Online (Sandbox Code Playgroud)
我想把 `type` 列中的观测值按每五个一组进行分块,方法是新增一列,为每个块赋予一个唯一的 ID。但是,`type` 列中连续相同的值不能被分配到不同的块 ID 中,这意味着一个块可能包含五个以上的元素。换句话说,我想添加一个名为 `chunk` 的计数器列:一旦已经累计了至少五个元素,并且 `type` 列中当前这一段连续相同的值已经结束,计数器就加 1。因此,所需的输出是:
# Desired output
type id chunk
1: d1 1 1
2: d1 2 1
3: d2 1 1
4: d3 1 1
5: d3 2 1
6: d3 3 1
7: d4 1 2
8: d4 2 2
9: d4 3 2
10: d4 4 2
11: d4 5 2
12: d5 1 3
13: d5 2 3
14: d6 1 3
15: d6 2 3
Run Code Online (Sandbox Code Playgroud)
欢迎任何建议和帮助,尤其是向量化的解决方案。在此先行致谢。
# Add an integer `chunk` column to DT. Rows are consumed in windows of five;
# each window is stretched to the end of the run of consecutive identical
# `type` values it lands in, so a run is never split across two chunks and a
# chunk may therefore hold more than five rows.
#
# Runs are identified with rleid() rather than by grouping on `type`, so a
# type that reappears later in the table starts a new run instead of being
# pulled into an earlier chunk. Tables with fewer than five rows (including
# zero rows) are handled without error.
run_id <- rleid(DT$type)               # one id per run of equal types
run_last <- cumsum(tabulate(run_id))   # last row index of each run
n_rows <- nrow(DT)
chunk_id <- integer(n_rows)
i <- 0L
last_row <- 0L                         # last row already assigned to a chunk
while (last_row < n_rows) {
  i <- i + 1L
  # Nominal end of the window: five rows on, capped at the end of the table.
  window_end <- min(last_row + 5L, n_rows)
  # Stretch the window so the run it ends in is fully included.
  chunk_end <- run_last[run_id[window_end]]
  chunk_id[(last_row + 1L):chunk_end] <- i
  last_row <- chunk_end
}
DT[, chunk := chunk_id][]
# type id chunk
# 1: d1 1 1
# 2: d1 2 1
# 3: d2 1 1
# 4: d3 1 1
# 5: d3 2 1
# 6: d3 3 1
# 7: d4 1 2
# 8: d4 2 2
# 9: d4 3 2
# 10: d4 4 2
# 11: d4 5 2
# 12: d5 1 3
# 13: d5 2 3
# 14: d6 1 3
# 15: d6 2 3
Run Code Online (Sandbox Code Playgroud)
@Frank 在评论中发布了一个更简单的解决方案
# Assign a chunk id `g` to every row of DT by working on one row per type.
#
# gDT holds the row count N of each type. Walking over the types in order, a
# new chunk is opened only when the current chunk has already collected at
# least five rows; the type's rows are then counted towards whichever chunk
# it was placed in. All rows of a type share one chunk, so a chunk may hold
# more than five rows.
#
# The ids are joined back onto DT by `type`, so this assumes each type
# occupies a single contiguous run of rows in DT.
gDT <- DT[, .N, by = type]
run_chunk <- integer(nrow(gDT))
s <- 0L    # rows collected so far in the current chunk
gg <- 1L   # current chunk id
for (ii in seq_len(nrow(gDT))) {
  if (s >= 5L) {
    # Current chunk is full: this type starts the next one.
    s <- 0L
    gg <- gg + 1L
  }
  run_chunk[ii] <- gg
  # Always count this type's rows, including when it opened a new chunk.
  s <- s + gDT$N[ii]
}
gDT[, g := run_chunk]
DT[gDT, on = .(type), g := i.g]
Run Code Online (Sandbox Code Playgroud)