我有一个较长的数据集,其中的列分别代表开始时间和结束时间,并且如果行与另一行重叠并且具有较高的优先级(例如1为最高优先级),我想删除该行。我的示例数据是
library(tidyverse)
library(lubridate)
times_df <- tibble(start = as_datetime(c("2019-10-05 14:05:25",
"2019-10-05 17:30:20",
"2019-10-05 17:37:00",
"2019-10-06 04:43:55",
"2019-10-06 04:53:45")),
stop = as_datetime(c("2019-10-05 14:19:20",
"2019-10-05 17:45:15",
"2019-10-05 17:50:45",
"2019-10-06 04:59:00",
"2019-10-06 05:07:10")), priority = c(5,3,4,3,4))
Run Code Online (Sandbox Code Playgroud)
我想出的方法是通过找到具有较高优先级值的重叠,然后使用anti_join
从原始数据帧中将其删除来向后攻击该问题。如果三个时间段重叠相同的时间点,那么此代码将无法正常工作,而且我敢肯定,有一种更高效,更实用的方法可以执行此操作。
dropOverlaps <- function(df) {
drops <- df %>%
filter(stop > lead(start) | lag(stop) > start) %>%
mutate(group = ({seq(1, nrow(.)/2)} %>%
rep(each=2))) %>%
group_by(group) %>%
filter(priority == max(priority))
anti_join(df, drops)
}
dropOverlaps(times_df)
#> Joining, by = c("start", "stop", "priority")
#> # A tibble: 3 x 3
#> start stop priority
#> <dttm> <dttm> <dbl>
#> 1 2019-10-05 14:05:25 2019-10-05 14:19:20 5
#> 2 2019-10-05 17:30:20 2019-10-05 17:45:15 3
#> 3 2019-10-06 04:43:55 2019-10-06 04:59:00 3
Run Code Online (Sandbox Code Playgroud)
谁能帮助我获得相同的输出,但功能更简洁?如果它可以处理三个或三个以上全部重叠的时间段的输入,则奖励。
这是一个用于检测重叠记录的data.table
解决方案(正如@GenesRus 已经提到的)。foverlaps
重叠的记录被分配到组中以过滤具有最大值的记录。组内优先。我在示例数据中添加了另外两条记录,以表明此过程也适用于三个或更多重叠记录:
编辑:我修改并翻译了 @pgcudahy 的解决方案,它data.table
提供了更快的代码:
library(data.table)
library(lubridate)
times_df <- data.frame(
start = as_datetime(
c(
"2019-10-05 14:05:25",
"2019-10-05 17:30:20",
"2019-10-05 17:37:00",
"2019-10-06 04:43:55",
"2019-10-06 04:53:45",
"2019-10-06 04:53:46",
"2019-10-06 04:53:47"
)
),
stop = as_datetime(
c(
"2019-10-05 14:19:20",
"2019-10-05 17:45:15",
"2019-10-05 17:50:45",
"2019-10-06 04:59:00",
"2019-10-06 05:07:10",
"2019-10-06 05:07:11",
"2019-10-06 05:07:12"
)
),
priority = c(5, 3, 4, 3, 4, 5, 6)
)
resultDT <- setDT(times_df, key="start")[!(stop >= shift(start, type="lead", fill = TRUE) & priority > shift(priority, type="lead", fill = TRUE)) &
!(start <= shift(stop, type="lag", fill = FALSE) & priority > shift(priority, type="lag", fill = TRUE))]
# old approach ------------------------------------------------------------
# times_dt <- as.data.table(times_df)
# setkey(times_dt, start, stop)[, index := .I]
# overlaps_dt <- foverlaps(times_dt, times_dt, type = "any", which = TRUE)[xid != yid][, group := fifelse(xid > yid, yes = paste0(yid, "_", xid), no = paste0(xid, "_", yid))]
# overlaps_merged <- merge(times_dt, overlaps_dt, by.x = "index", by.y = "xid")[, .(delete_index = index[priority == max(priority)]), by = "group"]
# result_dt <- times_dt[!unique(overlaps_merged$delete_index)][, index := NULL]
Run Code Online (Sandbox Code Playgroud)
有关更多详细信息,请参阅?foverlaps
- 实现了一些更有用的功能来控制重叠的内容,例如maxgap
,minoverlap
或type
(任何、内部、开始、结束和等于)。
更新-新基准
Unit: microseconds
expr min lq mean median uq max neval
Paul 25572.550 26105.2710 30183.930 26514.342 29614.272 153810.600 100
MKa 5100.447 5276.8350 6508.333 5401.275 5832.270 23137.879 100
pgcudahy 3330.243 3474.4345 4284.640 3556.802 3748.203 21241.260 100
ismirsehregal 711.084 913.3475 1144.829 1013.096 1433.427 2316.159 100
Run Code Online (Sandbox Code Playgroud)
基准代码:
#### library ----
library(dplyr)
library(lubridate)
library(igraph)
library(data.table)
library(microbenchmark)
#### data ----
times_df <- data.frame(
start = as_datetime(
c(
"2019-10-05 14:05:25",
"2019-10-05 17:30:20",
"2019-10-05 17:37:00",
"2019-10-06 04:43:55",
"2019-10-06 04:53:45",
"2019-10-06 04:53:46",
"2019-10-06 04:53:47"
)
),
stop = as_datetime(
c(
"2019-10-05 14:19:20",
"2019-10-05 17:45:15",
"2019-10-05 17:50:45",
"2019-10-06 04:59:00",
"2019-10-06 05:07:10",
"2019-10-06 05:07:11",
"2019-10-06 05:07:12"
)
),
priority = c(5, 3, 4, 3, 4, 5, 6)
)
times_tib <- as_tibble(times_df)
times_dt <- as.data.table(times_df)
#### group_interval function ----
# buffer to take a form similar to: days(1), weeks(2), etc.
group_interval <- function(start, end, buffer = 0) {
dat <- tibble(rid = 1:length(start),
start = start,
end = end,
intervals = case_when(!is.na(start) & !is.na(end) ~ interval(start, end),
is.na(start) ~ interval(end, end),
is.na(end) ~ interval(start, start),
TRUE ~ interval(NA, NA)))
# apply buffer period to intervals
int_start(dat$intervals) <- int_start(dat$intervals) - buffer + seconds(0.01)
int_end(dat$intervals) <- int_end(dat$intervals) + buffer - seconds(0.01)
df_overlap <- bind_cols(
expand.grid(dat$rid, dat$rid), # make a 2 col table with every combination of id numbers
expand.grid(dat$intervals, dat$intervals)) %>% # make a combination of every interval
mutate(overlap = int_overlaps(.data$Var11, .data$Var21)) %>% # determine if intervals overlap
rename("row" = "Var1", "col" = "Var2")
# Find groups via graph theory See igraph package
dat_graph <- graph_from_data_frame(filter(df_overlap, overlap) %>% select(row, col))
groups <- components(dat_graph)$membership[df_overlap$row]
# create a 2 column df with row (index) and group number, arrange on row number and return distinct values
df_groups <- tibble(row = as.integer(names(groups)), group = groups) %>%
unique()
# returns
left_join(select(dat, rid), df_groups, by = c("rid" = "row"))$group
}
#### benchmark ----
library(igraph)
library(data.table)
library(dplyr)
library(lubridate)
library(microbenchmark)
df_Paul <- df_MKa <- df_pgcudahy <- df_ismirsehregal <- times_df <- data.frame(
start = as_datetime(
c(
"2019-10-05 14:05:25",
"2019-10-05 17:30:20",
"2019-10-05 17:37:00",
"2019-10-06 04:43:55",
"2019-10-06 04:53:45",
"2019-10-06 04:53:46",
"2019-10-07 06:00:00",
"2019-10-07 06:10:00",
"2019-10-07 06:20:00",
"2019-10-08 06:00:00",
"2019-10-08 06:10:00",
"2019-10-08 06:20:00",
"2019-10-09 03:00:00",
"2019-10-09 03:10:00",
"2019-10-10 03:00:00",
"2019-10-10 03:10:00",
"2019-10-11 05:00:00",
"2019-10-11 05:00:00")
),
stop = as_datetime(
c(
"2019-10-05 14:19:20",
"2019-10-05 17:45:15",
"2019-10-05 17:50:45",
"2019-10-06 04:59:00",
"2019-10-06 05:07:10",
"2019-10-06 05:07:11",
"2019-10-07 06:18:00",
"2019-10-07 06:28:00",
"2019-10-07 06:38:00",
"2019-10-08 06:18:00",
"2019-10-08 06:28:00",
"2019-10-08 06:38:00",
"2019-10-09 03:30:00",
"2019-10-09 03:20:00",
"2019-10-10 03:30:00",
"2019-10-10 03:20:00",
"2019-10-11 05:40:00",
"2019-10-11 05:40:00")
),
priority = c(5, 3, 4, 3, 4, 5, 4, 3, 4, 3, 4, 3, 1, 2, 2, 1, 3, 4)
)
benchmarks <- microbenchmark(Paul = {
group_interval <- function(start, end, buffer = 0) {
dat <- tibble(rid = 1:length(start),
start = start,
end = end,
intervals = case_when(!is.na(start) & !is.na(end) ~ interval(start, end),
is.na(start) ~ interval(end, end),
is.na(end) ~ interval(start, start),
TRUE ~ interval(NA, NA)))
int_start(dat$intervals) <- int_start(dat$intervals) - buffer + seconds(0.01)
int_end(dat$intervals) <- int_end(dat$intervals) + buffer - seconds(0.01)
df_overlap <- bind_cols(
expand.grid(dat$rid, dat$rid), # make a 2 col table with every combination of id numbers
expand.grid(dat$intervals, dat$intervals)) %>% # make a combination of every interval
mutate(overlap = int_overlaps(.data$Var11, .data$Var21)) %>% # determine if intervals overlap
rename("row" = "Var1", "col" = "Var2")
dat_graph <- graph_from_data_frame(filter(df_overlap, overlap) %>% select(row, col))
groups <- components(dat_graph)$membership[df_overlap$row]
df_groups <- tibble(row = as.integer(names(groups)), group = groups) %>%
unique()
left_join(select(dat, rid), df_groups, by = c("rid" = "row"))$group
}
times_tib <- as_tibble(df_Paul)
mutate(times_tib, group = group_interval(start, stop)) %>%
group_by(group) %>%
top_n(1, desc(priority)) %>%
ungroup() %>%
select(-group)
},
MKa = {
df_MKa$id <- 1:nrow(df_MKa)
# Create consolidated df which we will use to check if stop date is in between start and stop
my_df <- bind_rows(replicate(n = nrow(df_MKa), expr = df_MKa, simplify = FALSE))
my_df$stop_chk <- rep(df_MKa$stop, each = nrow(df_MKa))
# Flag if stop date sits in between start and stop
my_df$chk <- my_df$stop_chk >= my_df$start & my_df$stop_chk <= my_df$stop
my_df$chk_id <- df_MKa[match(my_df$stop_chk, df_MKa$stop), "id"]
# Using igrpah to cluster ids to create unique groups
# this will identify any overlapping groups
library(igraph)
g <- graph.data.frame(my_df[my_df$chk == TRUE, c("id", "chk_id")])
df_g <- data.frame(clusters(g)$membership)
df_g$chk_id <- row.names(df_g)
# copy the unique groups to the df
my_df$new_id <- df_g[match(my_df$chk_id, df_g$chk_id), "clusters.g..membership"]
my_df %>%
filter(chk == TRUE) %>%
arrange(priority) %>%
filter(!duplicated(new_id)) %>%
select(start, stop, priority) %>%
arrange(start)
}, pgcudahy = {
df_pgcudahy %>%
arrange(start) %>%
mutate(remove1 = ifelse((stop >= lead(start, default=FALSE)) &
(priority > lead(priority, default=(max(priority) + 1))), TRUE, FALSE)) %>%
mutate(remove2 = ifelse((start <= lag(stop, default=FALSE)) &
(priority > lag(priority, default=(max(priority) + 1))), TRUE, FALSE)) %>%
filter(remove1 == FALSE & remove2 == FALSE) %>%
select(1:3)
}, ismirsehregal = {
setDT(df_ismirsehregal, key="start")[!(stop >= shift(start, type="lead", fill = TRUE) & priority > shift(priority, type="lead", fill = TRUE)) &
!(start <= shift(stop, type="lag", fill = FALSE) & priority > shift(priority, type="lag", fill = TRUE))]
})
benchmarks
Run Code Online (Sandbox Code Playgroud)