提高性能 - 摆脱.SD?

cha*_*u13 -3 r data.table

桌上大师!这是我几天前在这里发布的一个问题的后续问题.我正在向您提出一个问题,如何提高以下应用程序的性能data.table:

功能(为了设想尽可能最快的目的):

prob <- function(a, ie1, b, a1, ie2, b2, ...) {
ipf  <- function(a, b, ...) {
m    <- length(a)
n    <- length(b)
if (m < n) {
    r <- rank(c(a, b), ...)[1:m] - 1:m
} else {
    r <- rank(c(a, b), ...)[(m + 1):(m + n)] - 1:n
}
s <- ifelse((n + m)^2 > 2^31, sum(as.double(r)), sum(r))/(as.double(m) * n)
return(ifelse(m < n, s, 1 - s))
}

expand.grid.alt <- function(seq1, seq2) {
cbind(rep.int(seq1, length(seq2)), c(t(matrix(rep.int(seq2, length(seq1)), nrow =   length(seq2)))))
}

if (missing(a1) | missing(b2) | missing(ie2)) {
if (ie1 == ">") {
    return(ipf(a, b))
} else {
    return(ipf(b, a))
}
} else {
if (ie1 == ">") {
    if (ie2 == ">") {
        return(ipf(a, apply(expand.grid.alt(b, b2), 1, max))/ipf(a1, b2))
    } else {
        return(1 - ipf(apply(expand.grid.alt(b, b2), 1, min), a)/(1 - ipf(a1, b2)))
    }
 } else {
    if (ie2 == ">") {
        return(1 - ipf(a, apply(expand.grid.alt(b, b2), 1, max))/ipf(a1, b2))
    } else {
        return(ipf(apply(expand.grid.alt(b, b2), 1, min), a)/(1 - ipf(a1, b2)))
    }
}
}
}
Run Code Online (Sandbox Code Playgroud)

函数的一些注释:该函数允许通过秩和过程比较不同的样本.它允许有效地计算例如样品A的抽取超过样品B的抽取的概率,假设样品A的抽取超过样品C的抽取.在这种情况下,我只想计算抽取A的概率.考虑到A [.I]的平局超过A [ - .I]的平局,[.I]超过B的平局.哪里.我代表所有的ID.除此之外,我想在所有日期都这样做.这是不好的.SD发挥作用的地方.请注意,对于上述任务,prob()已经是最快找到的.

数据集:

dt <- data.table(id=rep(c(rep(1,50),rep(2,50),rep(3,100),rep(4,50),rep(5,100),rep(6,50),rep(7,50),rep(8,50),rep(9,50),rep(10,50)),5),date=c(rep("2004-01-01",600),rep("2004-02-01",600),rep("2004-03-01",600),rep("2004-04-01",600),rep("2004-05-01",600)),A=runif(3000,-5,5),B=runif(3000,-5,5))
Run Code Online (Sandbox Code Playgroud)

data.table的应用:

setkey(dt, id)
setkey(dt, id)
dt[,{
.SD1 <- .SD;
.SD1[,prob(.SD1$A[.I],">",.SD1$B,.SD1$A[.I],">",.SD1$A[-.I]), by=id ]
},by=date]
Run Code Online (Sandbox Code Playgroud)

在我的机器上执行此任务大约需要52.1秒.考虑到我的真实数据集有几百万行并且全部在57个组(ID)中,这太过分了.你有什么建议让我提高性能吗?我实际上正在寻找一个data.table解决方案.我认为数据表语法可能效率低下.也许有人可以摆脱.SD?但我也对并行化的想法持开放态度.

----------

更新

以下是现状.我将程序并行化,从而改善了性能.每个提示如何使程序更有效率都受到高度赞赏,因为我认为我在并行化中缺少一些东西 - 我预计性能会有更大的提升.

library(multicore)
library(doMC)
library(data.table)
registerDoMC(cores=4)
Run Code Online (Sandbox Code Playgroud)

数据集

dt <-     data.table(id=rep(c(rep(1,50),rep(2,50),rep(3,100),rep(4,50),rep(5,100),rep(6,50),rep(7,50),rep(8,50),rep(9,50),rep(10,50)),5),date=c(rep("2004-01-01",600),rep("2004-02-01",600),rep("2004-03-01",600),rep("2004-04-01",600),rep("2004-05-01",600)),A=runif(3000,-5,5),B=runif(3000,-5,5))
Run Code Online (Sandbox Code Playgroud)

prob()函数OP

prob1 <- function(a, ie1, b, a1, ie2, b2, ...) {
ipf  <- function(a, b, ...) {
m    <- length(a)
n    <- length(b)
sm <- seq_len(m)
sn <- seq_len(n)
if (m < n) {
  r <- rank(c(a, b), ...)[sm] - sm
} else {
  r <- rank(c(a, b), ...)[(m + sn)] - sn
}
s <- ifelse((n + m)^2 > 2^31, sum(as.double(r)), sum(r))/(as.double(m) * n)
return(ifelse(m < n, s, 1 - s))
}


if (missing(a1) | missing(b2) | missing(ie2)) {
if (ie1 == ">") {
  return(ipf(a, b))
} else {
  return(ipf(b, a))
}
} else {
if (ie1 == ">") {
  if (ie2 == ">") {
    return(ipf(a,CJ(b, b2)[,pmax(V1,V2)])/ipf(a1, b2))
  } else {
    return(1 - ipf(CJ(b, b2)[,pmin(V1,V2)], a)/(1 - ipf(a1, b2)))
  }
} else {
  if (ie2 == ">") {
    return(1 - ipf(a,CJ(b, b2)[,pmax(V1,V2)])/ipf(a1, b2))
  } else {
    return(ipf(CJ(b, b2)[,pmin(V1,V2)], a)/(1 - ipf(a1, b2)))
  }
}
}
}
Run Code Online (Sandbox Code Playgroud)

prob()函数mnel

prob2 <- function(a, ie1, b, a1, ie2, b2, ...) { 
ipf  <- function(a, b, ...) {
m    <- length(a)
n    <- length(b)
sm <- seq_len(m)
sn <- seq_len(n)
if (m < n) {
  r <- rank(c(a, b), ...)[sm] - sm
} else {
  r <- rank(c(a, b), ...)[(m + sn)] - sn
}
s <- if((n + m)^2 > 2^31){sum(as.double(r))/(as.double(m) * n)} else{     sum(r)/(as.double(m) * n)}
return(if(m < n){s} else{1 - s})
}

if (missing(a1) | missing(b2) | missing(ie2)) {
if (ie1 == ">") {
  return(ipf(a, b))
} else {
  return(ipf(b, a))
}
} else {
if (ie1 == ">") {
  if (ie2 == ">") {

    ipfb <- pmax(rep.int(b,length(b2)), rep(b2, each = length(b)))
    return(ipf(a, ipfb) /ipf(a1, b2))
  } else {
    ipfb <- pmin(rep.int(b,length(b2)), rep(b2, each = length(b)))
    return(1 - ipf(ipfb, a)/(1 - ipf(a1, b2)))
  }
} else {
  if (ie2 == ">") {
    ipfb <- pmax(rep.int(b,length(b2)), rep(b2, each = length(b)))
    return(1 - ipf(a, ipfb )/ipf(a1, b2))
  } else {
    ipfb <- pmin(rep.int(b,length(b2)), rep(b2, each = length(b)))
    return(ipf(ipfb, a)/(1 - ipf(a1, b2)))
  }
}
}
}
Run Code Online (Sandbox Code Playgroud)

prob函数OP - 应用于data.table

ptm <- proc.time()
setkey(dt, id)
res1 <- dt[,{
.SD1 <- .SD;
.SD1[,prob1(.SD1$A[.I],">",.SD1$B,.SD1$A[.I],">",.SD1$A[-.I]),by=id]},by=date]
proc.time() - ptm

user  system elapsed 
6.645   0.110   6.778 
Run Code Online (Sandbox Code Playgroud)

prob函数mnel - 应用于data.table

ptm <- proc.time()
setkey(dt, id)
res2 <- dt[,{
.SD1 <- .SD;
.SD1[,prob2(.SD1$A[.I],">",.SD1$B,.SD1$A[.I],">",.SD1$A[-.I]),by=id]},by=date]
proc.time() - ptm

user  system elapsed 
5.914   0.065   5.999
Run Code Online (Sandbox Code Playgroud)

并行化概率函数 - OP - 应用于data.table

ptm <- proc.time()
jo        <- dt[,list(jobs=list(parallel({.SD1 <- .SD; .SD1[,prob1(.SD1$A[.I],">",.SD1$B,.SD1$A[.I],">",.SD1$A[-.I]),by=id]}))),by=date]
res3      <- data.table(date=rep(jo[,date],each=length(unique(dt$id))),rbindlist(collect(jo[,jobs])))
proc.time() - ptm


user  system elapsed 
13.882   0.537   4.715
Run Code Online (Sandbox Code Playgroud)

并行化概率函数 - mnel - 应用于data.table

ptm <- proc.time()
jo        <- dt[,list(jobs=list(parallel({.SD1 <- .SD; .SD1[,prob2(.SD1$A[.I],">",.SD1$B,.SD1$A[.I],">",.SD1$A[-.I]),by=id]}))),by=date]
res4      <- data.table(date=rep(jo[,date],each=length(unique(dt$id))),rbindlist(collect(jo[,jobs])))
proc.time() - ptm

user  system elapsed 
13.682   0.560   4.545
Run Code Online (Sandbox Code Playgroud)

mne*_*nel 5

data.table与使用和使用无关.SD 如果您对原始呼叫进行了分析(profr例如使用),您会注意到大部分时间花费在t.defaultFUN参数上apply.

library(profr); library(ggplot2
xpr <- profr(dt[,{
  .SD1 <- .SD;
  .SD1[,prob(.SD1$A[.I],">",.SD1$B,.SD1$A[.I],">",.SD1$A[-.I]), by=id ]
  },by=date])

ggplot(xpr)
Run Code Online (Sandbox Code Playgroud)

在此输入图像描述

您可以重写您的expand.grid.alt函数以删除对matrix和的不必要的调用t

 expand.grid.alt <- function(seq1,seq2){
   cbind(rep.int(seq1,length(seq2)), rep(seq2,each=length(seq1)))
 }
Run Code Online (Sandbox Code Playgroud)

但这不会解决您使用的事实,apply以获得向量化的最大值和最小值accros两行.您可以使用pminpmax执行此操作(即使是一个小例子也可以增加8-9倍)

pm <- function(seq1,seq2){
 pmax(rep.int(seq1,length(seq2)), rep(seq2, each = length(seq1)))}
ea <- function(seq1, seq2) {
  apply(cbind(rep.int(seq1, length(seq2)), rep(seq2, each = length(seq1))),1,max)
  }
 eaorig <- function(seq1, seq2) {
   cbind(rep.int(seq1, length(seq2)), 
         c(t(matrix(rep.int(seq2, length(seq1)), nrow =   length(seq2)))))
    }
 eao <- function(seq1,seq2) {apply( eaorig(seq1,seq2), 1, max)}

 library(microbenchmark)

 microbenchmark(pm(1:5,2:8), ea(1:5,2:8),eao(1:5,2:8))
Unit: microseconds
          expr    min      lq  median      uq     max neval
  pm(1:5, 2:8) 10.867 11.7730 12.6790 13.2820  56.446   100
  ea(1:5, 2:8) 80.895 83.6130 85.5745 88.4420 172.054   100
 eao(1:5, 2:8) 91.460 94.0265 95.6860 99.3085 137.341   100
Run Code Online (Sandbox Code Playgroud)

然后我们可以重新定义

prob <- function(a, ie1, b, a1, ie2, b2, ...) {
  ipf  <- function(a, b, ...) {
    m    <- length(a)
    n    <- length(b)
    sm <- seq_len(m)
    sn <- seq_len(n)
    if (m < n) {
      r <- rank(c(a, b), ...)[sm] - sm
    } else {
      r <- rank(c(a, b), ...)[(m + sn)] - sn
    }
    s <- if((n + m)^2 > 2^31){sum(as.double(r))/(as.double(m) * n)} else{ sum(r)/(as.double(m) * n)}
    return(if(m < n){s} else{1 - s})
  }

  if (missing(a1) | missing(b2) | missing(ie2)) {
    if (ie1 == ">") {
      return(ipf(a, b))
    } else {
      return(ipf(b, a))
    }
  } else {
    if (ie1 == ">") {
      if (ie2 == ">") {

        ipfb <- pmax(rep.int(b,length(b2)), rep(b2, each = length(b)))
        return(ipf(a, ipfb) /ipf(a1, b2))
      } else {
        ipfb <- pmin(rep.int(b,length(b2)), rep(b2, each = length(b)))
        return(1 - ipf(ipfb, a)/(1 - ipf(a1, b2)))
      }
    } else {
      if (ie2 == ">") {
        ipfb <- pmax(rep.int(b,length(b2)), rep(b2, each = length(b)))
        return(1 - ipf(a, ipfb )/ipf(a1, b2))
      } else {
        ipfb <- pmin(rep.int(b,length(b2)), rep(b2, each = length(b)))
        return(ipf(ipfb, a)/(1 - ipf(a1, b2)))
      }
    }
  }
}
Run Code Online (Sandbox Code Playgroud)

重写后prob我们加速了10倍

# avoiding .SD
system.time({ 
 # create lists of B by 'date
 Bs <- dt[,list(x = list(B)),keyby='date'];

 dt[,{
   z <- .BY
   BB <- Bs[z[[1]]][['x']][[1]]
   AA <- dt[!list(z[['id']]), A[date == z[['date']]]]
  prob(A, '>',BB,A,'>',AA) 
 },by='date,id']  
 })
   user  system elapsed 
   4.66    0.00    4.67 

# using original .SD approach 
system.time( dt[,{
  .SD1 <- .SD;
  .SD1[,prob(.SD1$A[.I],">",.SD1$B,.SD1$A[.I],">",.SD1$A[-.I]), by=id ]
 },by=date])
   user  system elapsed 
   4.51    0.00    4.52 

 # using probo == original function

system.time( dt[,{
 .SD1 <- .SD;
 .SD1[,probo(.SD1$A[.I],">",.SD1$B,.SD1$A[.I],">",.SD1$A[-.I]), by=id ]
},by=date])

 user  system elapsed 
  43.98    0.02   44.01
Run Code Online (Sandbox Code Playgroud)

使用OP中的更新功能进行编辑

CJ在调用中使用和pmax[.

# if we compare this with the updated version using `CJ`

system.time( dt[,{
     .SD1 <- .SD;
     .SD1[,prob1(.SD1$A[.I],">",.SD1$B,.SD1$A[.I],">",.SD1$A[-.I]), by=id ]
 },by=date])
   user  system elapsed 
  17.23    0.00   17.27 
Run Code Online (Sandbox Code Playgroud)

由于调用的开销[.data.table以及CJ在创建时设置密钥的事实,这会变慢data.table

  • @ chameau13 - 使用`CJ`并随后调用`[.data.table`将有不必要的开销.... (2认同)
  • +1这个投入更多的努力值得更多的爱 (2认同)