我希望通过ID获得7天的滚动金额.假设我的数据如下所示:
data<-as.data.frame(matrix(NA,42,3))
data$V1<-seq(as.Date("2014-05-01"),as.Date("2014-09-01"),by=3)
data$V2<-rep(1:6,7)
data$V3<-rep(c(1,2),21)
colnames(data)<-c("Date","USD","ID")
Date USD ID
1 2014-05-01 1 1
2 2014-05-04 2 2
3 2014-05-07 3 1
4 2014-05-10 4 2
5 2014-05-13 5 1
6 2014-05-16 6 2
7 2014-05-19 1 1
8 2014-05-22 2 2
9 2014-05-25 3 1
10 2014-05-28 4 2
Run Code Online (Sandbox Code Playgroud)
如何添加一个新列,其中包含按ID划分的7天滚动总和?
如果您的数据很大,您可能需要查看使用的此解决方案data.table
.它很快.如果您需要更多的速度,你可以随时更改mapply
到mcmapply
并使用多个内核.
#Load data.table and convert to data.table object
require(data.table)
setDT(data)[,ID2:=.GRP,by=c("ID")]
#Build reference table
Ref <- data[,list(Compare_Value=list(I(USD)),Compare_Date=list(I(Date))), by=c("ID2")]
#Use mapply to get last seven days of value by id
data[,Roll.Val := mapply(RD = Date,NUM=ID2, function(RD, NUM) {
d <- as.numeric(Ref$Compare_Date[[NUM]] - RD)
sum((d <= 0 & d >= -7)*Ref$Compare_Value[[NUM]])})]
Run Code Online (Sandbox Code Playgroud)
OP提供的数据集不会暴露任务的复杂性.至于解决OP问题到目前为止,只有迈克的回答是正确的.
事实上,对于8个滚动天,而不是7个滚动天,由于d <= 0 & d >= -7
.
zoo
@G的解决方案.格洛腾迪克几乎是有效的,只有merge
这样才能对每一组人都有效ID
.
下面是第二个data.table解决方案,这次有效结果,使用dev RcppRoll允许na.rm=TRUE
.
并略微格式化迈克的解决方案输出.
data<-as.data.frame(matrix(NA,42,3))
data$V1<-seq(as.Date("2014-05-01"),as.Date("2014-09-01"),by=3)
data$V2<-rep(1:6,7)
data$V3<-rep(c(1,2),21)
colnames(data)<-c("Date","USD","ID")
library(microbenchmark)
library(RcppRoll) # install_github("kevinushey/RcppRoll")
library(data.table) # install_github("Rdatatable/data.table")
correct_jan_dt = function(n, partial=TRUE){
DT = as.data.table(data) # this can be speedup by setDT()
date.range = DT[,range(Date)]
all.dates = seq.Date(date.range[1],date.range[2],by=1)
setkey(DT,ID,Date)
r = DT[CJ(unique(ID),all.dates)][, c("roll") := as.integer(roll_sumr(USD, n, normalize = FALSE, na.rm = TRUE)), by="ID"][!is.na(USD)]
# This could be simplified when `partial` arg will be implemented in [kevinushey/RcppRoll](https://github.com/kevinushey/RcppRoll)
if(isTRUE(partial)){
r[is.na(roll), roll := cumsum(USD), by="ID"][]
}
return(r[order(Date,ID)])
}
correct_mike_dt = function(){
data = as.data.table(data)[,ID2:=.GRP,by=c("ID")]
#Build reference table
Ref <- data[,list(Compare_Value=list(I(USD)),Compare_Date=list(I(Date))), by=c("ID2")]
#Use mapply to get last seven days of value by id
data[, c("roll") := mapply(RD = Date,NUM=ID2, function(RD, NUM){
d <- as.numeric(Ref$Compare_Date[[NUM]] - RD)
sum((d <= 0 & d >= -7)*Ref$Compare_Value[[NUM]])})][,ID2:=NULL][]
}
identical(correct_mike_dt(), correct_jan_dt(n=8,partial=TRUE))
# [1] TRUE
microbenchmark(unit="relative", times=5L, correct_mike_dt(), correct_jan_dt(8))
# Unit: relative
# expr min lq mean median uq max neval
# correct_mike_dt() 274.0699 273.9892 267.2886 266.6009 266.2254 256.7296 5
# correct_jan_dt(8) 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 5
Run Code Online (Sandbox Code Playgroud)
期待@Khashaa的更新.
定时更大(仍然很小)的数据集,5439行:
library(zoo)
library(data.table)
library(dplyr)
library(RcppRoll)
library(microbenchmark)
data<-as.data.frame(matrix(NA,5439,3))
data$V1<-seq(as.Date("1970-01-01"),as.Date("2014-09-01"),by=3)
data$V2<-sample(1:6,5439,TRUE)
data$V3<-sample(c(1,2),5439,TRUE)
colnames(data)<-c("Date","USD","ID")
zoo_f = function(){
z <- read.zoo(data)
z0 <- merge(z, zoo(, seq(start(z), end(z), "day")), fill = 0) # expand to daily
roll <- function(x) rollsumr(x, 7, fill = NA)
transform(data, roll = ave(z0$USD, z0$ID, FUN = roll)[time(z)])
}
dt_f = function(){
DT = as.data.table(data) # this can be speedup by setDT()
date.range = DT[,range(Date)]
all.dates = seq.Date(date.range[1],date.range[2],by=1)
setkey(DT,Date)
DT[.(all.dates)
][order(Date), c("roll") := rowSums(setDT(shift(USD, 0:6, NA, "lag")),na.rm=FALSE), by="ID"
][!is.na(ID)]
}
dp_f = function(){
data %>% group_by(ID) %>%
mutate(roll=roll_sum(c(rep(NA,6), USD), 7))
}
dt2_f = function(){
# this can be speedup by setDT()
as.data.table(data)[, c("roll") := roll_sum(c(rep(NA,6), USD), 7), by="ID"][]
}
identical(as.data.table(zoo_f()),dt_f())
# [1] TRUE
identical(setDT(as.data.frame(dp_f())),dt_f())
# [1] TRUE
identical(dt2_f(),dt_f())
# [1] TRUE
microbenchmark(unit="relative", times=20L, zoo_f(), dt_f(), dp_f(), dt2_f())
# Unit: relative
# expr min lq mean median uq max neval
# zoo_f() 140.331889 141.891917 138.064126 139.381336 136.029019 137.730171 20
# dt_f() 14.917166 14.464199 15.210757 16.898931 16.543811 14.221987 20
# dp_f() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20
# dt2_f() 1.536896 1.521983 1.500392 1.518641 1.629916 1.337903 20
Run Code Online (Sandbox Code Playgroud)
但我不确定我的data.table代码是否已经是最优的.
以上功能没有回答OP问题.阅读帖子顶部的更新.迈克的解决方案是正确的.
1)假设您的意思是该 ID 的每连续重叠 7 行:
library(zoo)
transform(data, roll = ave(USD, ID, FUN = function(x) rollsumr(x, 7, fill = NA)))
Run Code Online (Sandbox Code Playgroud)
2)如果你确实是指 7 天而不是 7 行,那么试试这个:
library(zoo)
z <- read.zoo(data)
z0 <- merge(z, zoo(, seq(start(z), end(z), "day")), fill = 0) # expand to daily
roll <- function(x) rollsumr(x, 7, fill = NA)
transform(data, roll = ave(z0$USD, z0$ID, FUN = roll)[time(z)])
Run Code Online (Sandbox Code Playgroud)
更新添加了(2)并做了一些改进。