Nic*_*ick 6 r dplyr data.table
我是 dplyr 用户。我经常使用它的连接、管道函数、group_by 和汇总函数。但是有一天,我在正在处理的一个大数据集上试用了 data.table,它将我的计算时间减少了 87%。现在我想从 dplyr 切换到 data.table。但是,在我的代码中,我有下表:
structure(list(tariff_label = c("tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tariff", "tariff", "tariff", "tariff", "tariff",
"tariff", "tariff", "tariff", "tariff", "tariff", "tariff", "tariff",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tariff", "tariff", "tariff", "tariff",
"tariff", "tariff", "tariff", "tariff"), d = c("7", "7", "7",
"7", "7", "7", "7", "1", "1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2",
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2",
"3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3",
"3", "3", "3"), h = c("17", "18", "19", "20", "21", "22", "23",
"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11",
"12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22",
"23", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
"11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21",
"22", "23", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
"10", "11", "12", "13", "14", "15"), id = c(1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
71), period = c("day", "day", "day", "night", "night", "night",
"night", "night", "night", "night", "night", "night", "night",
"night", "night", "day", "day", "day", "day", "day", "day", "day",
"day", "day", "day", "day", "day", "night", "night", "night",
"night", "night", "night", "night", "night", "night", "night",
"night", "night", "day", "day", "day", "day", "day", "day", "day",
"day", "day", "day", "day", "day", "night", "night", "night",
"night", "night", "night", "night", "night", "night", "night",
"night", "night", "day", "day", "day", "day", "day", "day", "day",
"day"), week_period = c("weekend", "weekend", "weekend", "weekend",
"weekend", "weekend", "weekend", "weekend", "weekend", "weekend",
"weekend", "weekend", "weekend", "weekend", "weekend", "weekend",
"weekend", "weekend", "weekend", "weekend", "weekend", "weekend",
"weekend", "weekend", "weekend", "weekend", "weekend", "weekend",
"weekend", "weekend", "weekend", "weekend", "weekend", "weekend",
"weekend", "weekend", "weekend", "weekend", "weekend", "week day",
"week day", "week day", "week day", "week day", "week day", "week day",
"week day", "week day", "week day", "week day", "week day", "week day",
"week day", "week day", "week day", "week day", "week day", "week day",
"week day", "week day", "week day", "week day", "week day", "week day",
"week day", "week day", "week day", "week day", "week day", "week day",
"week day"), center_id = c("12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc"
), network_price = c(1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4,
1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4,
1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4,
1.4, 1.4, 1.4, 1.4, 1.4, 3.85, 3.85, 3.85, 3.85, 3.85, 3.85,
3.85, 3.85, 3.85, 3.85, 3.85, 3.85, 1.4, 1.4, 1.4, 1.4, 1.4,
1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 3.85, 3.85, 3.85, 3.85, 3.85,
3.85, 3.85, 3.85), group_id = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L), .Label = c("1", "2", "3"), class = "factor")), row.names = c(NA,
-71L), class = c("data.table", "data.frame"))
Run Code Online (Sandbox Code Playgroud)
然后我做了以下计算
# Normalise the column types, then convert the table to a data.table.
df$network_price <- as.numeric(df$network_price)
df$group_id <- factor(df$group_id)
df$id <- as.numeric(df$id)
df <- data.table(df)

# Threshold that the running total of network_price is compared against.
daily_cap <- 13.5

# One summary row per group_id. Each column may build on the columns created
# before it inside the same summarise() call.
msub_list <- df %>%
  group_by(group_id) %>%
  summarise(
    # Running total of network_price within the group, stored as a list column.
    cum_sum = list(cumsum(network_price)),
    # Element-wise flag: TRUE where the running total is above the cap.
    exceeded = list(cum_sum[[1L]] > daily_cap),
    # TRUE when the cap is exceeded anywhere in the group.
    exceeded_indicator = any(exceeded[[1L]]),
    # Number of rows whose running total is not above the cap.
    last_false = sum(!exceeded[[1L]]),
    # Smallest id in the group.
    start_group = min(id),
    # Last row to keep: the non-exceeding rows plus, if the cap is exceeded,
    # one further row.
    end_group = start_group + last_false - 1 + exceeded_indicator,
    # Rows start_group..end_group of the full table, selected by row position.
    subdf = list(df[start_group:end_group, ]),
    # Headroom left under the cap after the last non-exceeding row.
    difference = daily_cap - cum_sum[[1L]][last_false]
  )
Run Code Online (Sandbox Code Playgroud)
我的目标是现在在 data.table 中得到与上面 msub_list 相同的结果。请注意这些列是如何使用之前创建的列进行计算的。例如,exceeded 列使用了新创建的 cum_sum 列,等等。我尝试使用 data.table 实现此目的,但收到错误 object cum_sum is not found。有没有办法在 data.table 中实现这一点?仅供参考,我以这种方式进行计算,是为了避免使用循环;这里所说的循环,是指使用 for 循环,或者拆分数据表后使用 lapply()。这种写法极大地加快了我的代码速度,我只是在寻找 data.table 的解决方案。我尝试在网上搜索此问题,但找不到任何解决方案。这是我的尝试:
# Attempt with data.table
# NOTE(review): the post reports that this call fails with
# "object cum_sum is not found". The arguments of .() are evaluated like those
# of an ordinary list() call, so a name introduced by one argument (cum_sum,
# exceeded, last_false, ...) is not visible to the arguments that follow it.
# NOTE(review): `x` and `tariff_price` do not appear in the sample data or in
# the dplyr version, which use `df` and `network_price` -- presumably names
# from the author's real data set; confirm before running.
msub_list <- x[, .(cum_sum = list(cumsum(tariff_price)),
exceeded = list(unlist(cum_sum) > daily_cap),
exceeded_indicator = any(unlist(exceeded)),
last_false = sum(!unlist(exceeded)),
start_group = range(id)[1],
end_group = start_group + last_false - 1 + exceeded_indicator,
subdf = list(x[start_group:end_group,]),
difference = daily_cap - unlist(cum_sum)[last_false]
),
by = group_id]
Run Code Online (Sandbox Code Playgroud)
您可以在 j 中使用 {} 创建临时变量,然后选择要保留的列。
下面为您提供与 dplyr 管道相同的信息
# Per-group summary intended to mirror the dplyr pipeline. Inside the braced
# j expression the intermediate values are ordinary local variables, so each
# step can use the ones computed before it; only the final list() defines the
# columns of the result. Relies on `df` and `daily_cap` from the calling scope.
df[, {
  # Running total of network_price within the group. The double list() keeps
  # the nested list-column shape of the result (printed as <list[1]>).
  cum_sum <- list(list(cumsum(network_price)))
  # Element-wise flag: TRUE where the running total is above the cap.
  exceeded <- list(list(unlist(cum_sum) > daily_cap))
  # TRUE when the cap is exceeded anywhere in the group.
  exceeded_indicator <- any(unlist(exceeded))
  # Smallest id in the group.
  start_group <- min(id)
  # Number of rows whose running total is not above the cap. Plain negation
  # is used instead of `== F`, because `F` is a reassignable variable.
  last_false <- sum(!unlist(exceeded))
  # Last row to keep: the non-exceeding rows plus, if the cap is exceeded,
  # one further row.
  end_group <- start_group + last_false - 1 + exceeded_indicator
  # Rows of the full table selected by position.
  # NOTE(review): this assumes id equals the row number in df -- confirm.
  subdf <- list(list(df[start_group:end_group, ]))
  # Headroom left under the cap after the last non-exceeding row.
  difference <- daily_cap - unlist(cum_sum)[last_false]
  list(
    cum_sum = cum_sum,
    exceeded = exceeded,
    exceeded_indicator = exceeded_indicator,
    start_group = start_group,
    last_false = last_false,
    end_group = end_group,
    subdf = subdf,
    difference = difference
  )
}, by = group_id]
Run Code Online (Sandbox Code Playgroud)
输出:
group_id cum_sum exceeded exceeded_indicator start_group last_false end_group subdf difference
1: 1 <list[1]> <list[1]> TRUE 1 9 10 <list[1]> 0.90
2: 2 <list[1]> <list[1]> TRUE 25 9 34 <list[1]> 0.90
3: 3 <list[1]> <list[1]> TRUE 49 4 53 <list[1]> 0.55
Run Code Online (Sandbox Code Playgroud)