使用R中的data.table有效地重塑预测数据

h.l*_*l.m 12 r data.table

我正在寻找一种更有效的方法来重塑R中的data.table数据.

目前,我正在循环以执行多个时间序列预测的重新形成.

我得到了正确答案,但我觉得这种方法非常不优雅/(un-data.table).因此,我期待SO社区看看是否有更优雅的解决方案.

请参阅下面的数据设置以及两次尝试获得所需答案.

# load libraries
require(data.table)
require(lubridate)


# set up data assumptions
id_vec <- letters
len_id_vec <- length(id_vec)
num_orig_dates <- 7
set.seed(123)


# create original data frame
orig <- data.table(ID=rep(id_vec,each=num_orig_dates),
                   date=rep(c(Sys.Date() %m+% months(0: (num_orig_dates-1))),times=len_id_vec),
                   most_recent_bal=unlist(lapply(round(runif(len_id_vec)*100),function(y){
                     y*cumprod(1+rnorm(num_orig_dates,0.001,0.002))})))


# add 24 months ahead predictions of balances using a random walk from the original dates
nrow_orig <- nrow(orig)

for(i in seq(24)){
  orig[,paste0('pred',i,'_bal'):=most_recent_bal*(1+rnorm(nrow_orig,0.001,0.003))]
  orig[,paste0('pred',i,'_date'):=date %m+% months(i)]
}


# First attempt
t0 <- Sys.time()
tmp1 <- rbindlist(lapply(unique(orig$ID),function(x){
  orig1 <- orig[ID==x,]

  bal_cols <- c('most_recent_bal',paste0('pred',seq(24),'_bal'))
  date_cols <- c('date',paste0('pred',seq(24),'_date'))

  # Go through each original date to realign predicted date and predicted balance
  date_vec <- orig1$date
  tmp <- rbindlist(lapply(date_vec,function(y){

    tmp <- data.table(dates=as.Date(as.vector(t(orig1[date==y,date_cols,with=FALSE]))),
                      bals=as.vector(t(orig1[date==y,bal_cols,with=FALSE])))
    tmp[,type:='prediction']
    tmp[,date_prediction_run:=y]

    # collect historical information too for plotting perposes.
    tmp1 <- orig1[date<=y,c('date','most_recent_bal'),with=FALSE]
    if(nrow(tmp1)!=0){

      setnames(tmp1,c('date','most_recent_bal'),c('dates','bals'))
      tmp1[,type:='history']
      tmp1[,date_prediction_run:=y]

      tmp <- rbind(tmp,tmp1)

    }

    tmp
  }))
  tmp[,ID:=x]
}))
t1 <- Sys.time()
t1-t0 #Time difference of 1.117216 secs

# Second Attempt: a slightly more data.table way which is faster but still very inelegant....
t2 <- Sys.time()
bal_cols <- c('most_recent_bal',paste0('pred',seq(24),'_bal'))
date_cols <- c('date',paste0('pred',seq(24),'_date'))
tmp1a <- rbindlist(lapply(unique(orig$ID),function(x){
  tmp <- cbind(melt(orig[ID==x,c('date',bal_cols),with=FALSE],
                    measure.vars = bal_cols,value.name='bals')[,-('variable'),with=FALSE],
               data.table(dates=melt(orig[ID==x,date_cols,with=FALSE],
                                     measure.vars = date_cols)[,value]))
  setnames(tmp,'date','date_prediction_run')
  tmp[,type:='prediction']

  tmp1 <- orig[ID==x,orig[ID==x & date<=.BY,c('date','most_recent_bal'),with=FALSE],by=date]
  setnames(tmp1,c('date_prediction_run','dates','bals'))
  tmp1[,type:='history']
  setcolorder(tmp1,colnames(tmp1)[match(colnames(tmp),colnames(tmp1))])
  tmp <- rbind(tmp,tmp1)
  tmp[,ID:=x]
  tmp
}))
t3 <- Sys.time()
t3-t2 # Time difference of 0.2309799 secs
Run Code Online (Sandbox Code Playgroud)

phi*_*l_t 0

我尝试使用dplyrand reshape2for this ,我觉得它稍微更优雅(不,apply这在技术上是 for 循环)。它还减少了大约 0.04 秒的运行时间。

t0 = Sys.time()

# Extract predicted values in long form
trial_bal = reshape2::melt(orig, id.vars = c("ID", "date"), measure.vars = 
c(colnames(orig)[grep("pred[0-9]{1,}_bal", colnames(orig))]))
colnames(trial_bal) = c("ID", "date_prediction_run", "type", "balance")
trial_bal$type = gsub("_bal", "", trial_bal$type)

trial_date = reshape2::melt(orig, id.vars = c("ID", "date"), measure.vars = 
c(colnames(orig)[grep("pred[0-9]{1,}_date", colnames(orig))]))
colnames(trial_date) = c("ID", "date_prediction_run", "type", "dates")
trial_date$type = gsub("_date", "", trial_date$type)

trial = merge.data.frame(trial_date, trial_bal, by = c("ID", "date_prediction_run", "type"))
trial$type = "prediction"
trial = trial %>% select(dates, balance, type, date_prediction_run, ID)

# Extract historical values in long form
temp = orig[, c("ID", "date", "most_recent_bal")]
temp = merge(temp[, c("ID", "date")], temp, by = "ID", allow.cartesian = TRUE)
temp = temp[temp$date.x >= temp$date.y, ]
temp$type = "history"
temp = temp %>% select(dates = date.y, balance = most_recent_bal, type, 
date_prediction_run = date.x, ID)

# Combine prediction and history
trial = rbind(trial, temp)
trial = trial %>% arrange(ID, date_prediction_run, desc(type), dates)

t1 = Sys.time()
t1 - t0 #Time difference of 0.1900001 secs
Run Code Online (Sandbox Code Playgroud)

这比您拥有的行数少 182 行,因为您有dates = date_prediction_run两次 - 一个 undertype prediction和一个 under history