Roh*_*uja 6 r forecasting data.table
我在这里遇到了用于批量预测的Joseph Owen 代码。我有一个包含接近 19k 行的数据集,但问题是即使应用了批量预测方法,我的代码仍然运行速度非常慢。
在进行实际预测之前,我需要评估使用 MAPE 作为评估标准的最佳模型。以下是相同的可行代码片段。我的问题是如何优化以下代码以使其在可接受的时间内运行(2 分钟以内)
fcnChooseETS <- function(Ts){
TsPositive <- ( min( as.numeric(Ts) ) > 0 ) # Check if all values of timeseries are positive or not
ModelsUsed <- c("ANN","MNN","ANA","AAN","AAA","MAA","MNM","MMN","MMM","MNA","MAN","MAM")
ModelsNonPositive <- c("ANN","ANA","AAN","AAA") # Multiplicative models cannot take non positive data
if( !TsPositive ){
ModelsUsed <- ModelsNonPositive
}
lAllModels <- lapply(ModelsUsed, function(M){
ets(Ts, damped = NULL, model = M)
})
vecResult <- sapply(lAllModels, function(M) accuracy(M)[2])
names(vecResult) <- ModelsUsed
min(vecResult)
}
fcnTrending <- function( dt){
Ts <- lapply(transpose(dt), ts , frequency = 12 , end = FeedDate)
fit <- lapply(Ts , fcnChooseETS )
}
Run Code Online (Sandbox Code Playgroud)
以下脚本测试了在问题中拟合模型的 3 种不同方法。第一个是问题中发布的代码的更惯用版本,接下来的两个并行适合多个模型。
\n该脚本保存在文件中so_62497397.R并按如下方式运行。
#\n# filename: so_62497397.R\n# Test serial and two types of parallel execution of\n# exponential smoothing time series fitting.\n\nlibrary(parallel)\nlibrary(foreach)\nlibrary(doParallel)\nlibrary(forecast)\n\nfcnChooseETS <- function(Ts){\n\n TsPositive <- ( min( as.numeric(Ts) ) > 0 ) # Check if all values of timeseries are positive or not\n\n ModelsUsed <- c("ANN","MNN","ANA","AAN","AAA","MAA","MNM","MMN","MMM","MNA","MAN","MAM")\n ModelsNonPositive <- c("ANN","ANA","AAN","AAA") # Multiplicative models cannot take non positive data\n\n if( !TsPositive ){\n ModelsUsed <- ModelsNonPositive\n }\n\n lAllModels <- lapply(ModelsUsed, function(M){\n ets(Ts, damped = NULL, model = M)\n })\n\n vecResult <- sapply(lAllModels, function(M) accuracy(M)[2])\n\n names(vecResult) <- ModelsUsed\n vecResult[which.min(vecResult)]\n}\nfcnChooseETS2 <- function(Ts, Ncpus = 2){\n\n TsPositive <- ( min( as.numeric(Ts) ) > 0 ) # Check if all values of timeseries are positive or not\n\n ModelsUsed <- c("ANN","MNN","ANA","AAN","AAA","MAA","MNM","MMN","MMM","MNA","MAN","MAM")\n ModelsNonPositive <- c("ANN","ANA","AAN","AAA") # Multiplicative models cannot take non positive data\n\n if( !TsPositive ){\n ModelsUsed <- ModelsNonPositive\n }\n\n vecResult <- mclapply(ModelsUsed, function(M){\n fit <- ets(Ts, damped = NULL, model = M)\n accuracy(fit)[2]\n }, mc.cores = Ncpus)\n\n vecResult <- unlist(vecResult)\n names(vecResult) <- ModelsUsed\n vecResult[which.min(vecResult)]\n}\n\nfcnChooseETS3 <- function(Ts, Ncpus = 2){\n\n TsPositive <- ( min( as.numeric(Ts) ) > 0 ) # Check if all values of timeseries are positive or not\n\n ModelsUsed <- c("ANN","MNN","ANA","AAN","AAA","MAA","MNM","MMN","MMM","MNA","MAN","MAM")\n ModelsNonPositive <- c("ANN","ANA","AAN","AAA") # Multiplicative models cannot take non positive data\n\n if( !TsPositive ){\n ModelsUsed <- ModelsNonPositive\n }\n\n cl <- makeCluster(Ncpus)\n clusterExport(cl, 'ts')\n clusterEvalQ(cl, library(forecast))\n vecResult <- parLapply(cl, ModelsUsed, function(M){\n fit <- ets(Ts, damped = NULL, model = M)\n accuracy(fit)[2]\n })\n stopCluster(cl)\n\n vecResult <- unlist(vecResult)\n names(vecResult) <- ModelsUsed\n vecResult[which.min(vecResult)]\n}\n\nmakeTestdata <- function(N){\n n <- length(USAccDeaths)\n m <- ceiling(log2(N/n))\n x <- as.numeric(USAccDeaths)\n for(i in seq_len(m)) x <- c(x, x)\n L <- length(x)/12 - 1\n x <- ts(x, start = 2000 - L, frequency = 12)\n x\n}\n\n\nnumCores <- detectCores()\ncat("numCores:", numCores, "\\n")\n\nx <- makeTestdata(5e3)\n\nt1 <- system.time(\n res1 <- fcnChooseETS(x)\n)\nt2 <- system.time(\n res2 <- fcnChooseETS2(x, Ncpus = numCores)\n)\nt3 <- system.time(\n res3 <- fcnChooseETS3(x, Ncpus = numCores)\n)\n\nrbind(t.lapply = t1,\n t.mclapply = t2,\n t.parLapply = t3)\n\nc(res1, res2, res3)\nRun Code Online (Sandbox Code Playgroud)\n运行Rscript
时间显示这mclapply是最好的选择,尽管并不比 快多少parLapply。在拟合的模型中,使用 MAPE 选择的模型都是相同的,因为它们应该是相同的。
rui@rui:~$ Rscript --vanilla so_62497397.R\n#Loading required package: iterators\n#Registered S3 method overwritten by 'quantmod':\n# method from\n# as.zoo.data.frame zoo \n#numCores: 4 \n# user.self sys.self elapsed user.child sys.child\n#t.lapply 56.505 0.063 57.389 0.000 0.00\n#t.mclapply 0.039 0.024 33.983 30.506 0.26\n#t.parLapply 0.040 0.012 36.317 0.001 0.00\n# ANA ANA ANA \n#263.0876 263.0876 263.0876 \nRun Code Online (Sandbox Code Playgroud)\n