内存有效创建稀疏矩阵

Tha*_*uys 17 memory r sparse-matrix

我有一个50000字符串向量的列表,包含6000个独特字符串的各种组合.

目标:我想在"相对频率"(table(x)/length(x))中对它们进行转换,并将它们存储在稀疏矩阵中.低内存消耗比速度更重要.目前记忆是瓶颈.(即使源数据大约有~50 mb,目标格式的数据~10mb - >转换似乎效率低下,......)

生成样本数据

dims <- c(50000, 6000)
nms <- paste0("A", 1:dims[2])
lengths <- sample(5:30, dims[1], replace = T)
data <- lapply(lengths, sample, x = nms, replace = T)
Run Code Online (Sandbox Code Playgroud)

可能的尝试:

1)sapply()简化为稀疏矩阵?

library(Matrix)  
sparseRow <- function(stringVec){
  relFreq <- c(table(factor(stringVec, levels = nms)) / length(stringVec))
  Matrix(relFreq, 1, dims[2], sparse = TRUE)
}
sparseRows <- sapply(data[1:5], sparseRow)
sparseMat <- do.call(rbind, sparseRows)
Run Code Online (Sandbox Code Playgroud)

问题:我的瓶颈似乎是sparseRows因为行没有直接组合成稀疏矩阵.(如果我在完整样本上运行上面的代码,我得到一个Error: cannot allocate vector of size 194 Kb Error during wrapup: memory exhausted (limit reached?)- 我的硬件有8 GB RAM.)

显然,在组合行之前,有更多的内存消耗用于创建行列表,而不是直接填充稀疏矩阵. - >所以在我的情况下使用(s/l)apply对内存不友好?

object.size(sparseRows)
object.size(sparseMat)
Run Code Online (Sandbox Code Playgroud)

2)肮脏的解决方法(?)

我的目标似乎是创建一个空的稀疏矩阵并按行填充.下面是一个脏的方法(它适用于我的硬件).

indxs <- lapply(data, function(data) sapply(data, function(x) which(x == nms), 
   USE.NAMES = FALSE))
relFreq <- lapply(indxs, function(idx) table(idx)/length(idx))

mm <- Matrix(0, nrow = dims[1], ncol = dims[2])
for(idx in 1:dims[1]){
  mm[idx, as.numeric(names(relFreq[[idx]]))] <- as.numeric(relFreq[[idx]])
}
#sapply(1:dims[1], function(idx) mm[idx, 
#     as.numeric(names(relFreq[[idx]]))] <<- as.numeric(relFreq[[idx]]))
Run Code Online (Sandbox Code Playgroud)

我想问一下,是否有一种更优雅/更有效的方法来实现最低RAM量.

min*_*nem 12

我会转换为data.table然后进行必要的计算:

ld <- lengths(data)
D <- data.table(val = unlist(data),
                id = rep(1:length(data), times = ld),
                Ntotal = rep(ld, times = ld))
D <- D[, .N, keyby = .(id, val, Ntotal)]
D[, freq := N/Ntotal]
ii <- data.table(val = nms, ind = seq_along(nms))
D <- ii[D, on = 'val']
sp <- with(D, sparseMatrix(i = id, j = ind, x = freq,
                           dims = c(max(id), length(nms))))
Run Code Online (Sandbox Code Playgroud)

n = 100的基准

data2 <- data[1:100]
Unit: milliseconds
      expr        min         lq       mean    median        uq        max neval cld
        OP 102.150200 106.235148 113.117848 109.98310 116.79734 142.859832    10  b 
  F. Privé 122.314496 123.804442 149.999595 126.76936 164.97166 233.034447    10   c
     minem   5.617658   5.827209   6.307891   6.10946   6.15137   9.199257    10 a  
 user20650  11.012509  11.752350  13.580099  12.59034  14.31870  21.961725    10 a  
Run Code Online (Sandbox Code Playgroud)

所有数据的基准

让最快的函数基准测试3,因为其余的(OP,user20650_v1和F.Privé)会减慢所有数据.

user20650_v2 <- function(x) {
  dt2 = data.table(lst = rep(1:length(x), lengths(x)),
                   V1 = unlist(x))
  dt2[, V1 := factor(V1, levels = nms)]
  x3 = xtabs(~ lst + V1, data = dt2, sparse = TRUE)
  x3/rowSums(x3)
}
user20650_v3 <- function(x) {
  x3 = xtabs(~ rep(1:length(x), lengths(x)) + factor(unlist(x), levels = nms),
             sparse = TRUE)
  x3/rowSums(x3)
}
minem <- function(x) {
  ld <- lengths(x)
  D <- data.table(val = unlist(x), id = rep(1:length(x), times = ld),
                  Ntotal = rep(ld, times = ld))
  D <- D[, .N, keyby = .(id, val, Ntotal)]
  D[, freq := N/Ntotal]
  ii <- data.table(val = nms, ind = seq_along(nms))
  D <- ii[D, on = 'val']
  sparseMatrix(i = D$id, j = D$ind, x = D$freq,
               dims = c(max(D$id), length(nms)))
}
Run Code Online (Sandbox Code Playgroud)

比较的结果minemuser20650_v3:

x1 <- minem(data)
x2 <- user20650_v3(data)
all.equal(x1, x2)
# [1] "Component “Dimnames”: names for current but not for target"             
# [2] "Component “Dimnames”: Component 1: target is NULL, current is character"
# [3] "Component “Dimnames”: Component 2: target is NULL, current is character"
# [4] "names for target but not for current"  
Run Code Online (Sandbox Code Playgroud)

x2有其他名称.删除它们:

dimnames(x2) <- names(x2@x) <- NULL
all.equal(x1, x2)
# [1] TRUE # all equal
Run Code Online (Sandbox Code Playgroud)

时序:

x <- bench::mark(minem(data),
            user20650_v2(data),
            user20650_v3(data),
            iterations = 5, check = F)
as.data.table(x)[, 1:10]

#            expression   min  mean median   max  itr/sec mem_alloc n_gc n_itr total_time
# 1:        minem(data) 324ms 345ms  352ms 371ms 2.896187     141MB    7     5      1.73s
# 2: user20650_v2(data) 604ms 648ms  624ms 759ms 1.544380     222MB   10     5      3.24s
# 3: user20650_v3(data) 587ms 607ms  605ms 633ms 1.646977     209MB   10     5      3.04s
Run Code Online (Sandbox Code Playgroud)

相关记忆:

OPdirty <- function(x) {
  indxs <- lapply(x, function(x) sapply(x, function(x) which(x == nms), 
                                        USE.NAMES = FALSE))
  relFreq <- lapply(indxs, function(idx) table(idx)/length(idx))
  dims <- c(length(indxs), length(nms))
  mm <- Matrix(0, nrow = dims[1], ncol = dims[2])
  for (idx in 1:dims[1]) {
    mm[idx, as.numeric(names(relFreq[[idx]]))] <- as.numeric(relFreq[[idx]])
  }
  mm
}


xx <- data[1:1000]
all.equal(OPdirty(xx), minem(xx))
# true
x <- bench::mark(minem(xx),
                 FPrive(xx),
                 OPdirty(xx),
                 iterations = 3, check = T)
as.data.table(x)[, 1:10]
    expression     min    mean  median     max    itr/sec mem_alloc n_gc n_itr total_time
1:   minem(xx) 12.69ms 14.11ms 12.71ms 16.93ms 70.8788647    3.04MB    0     3    42.33ms
2:  FPrive(xx)   1.46s   1.48s   1.47s   1.52s  0.6740317  214.95MB    4     3      4.45s
3: OPdirty(xx)   2.12s   2.14s   2.15s   2.16s  0.4666106  914.91MB    9     3      6.43s
Run Code Online (Sandbox Code Playgroud)

见专栏mem_alloc......

  • ps你可以使用`all.equal(x1,x2,check.attributes = FALSE)` (2认同)

F. *_*ivé 7

使用循环以列方式填充预先分配的稀疏矩阵(然后转置它):

res <- Matrix(0, dims[2], length(data), sparse = TRUE)
for (i in seq_along(data)) {
  ind.match <- match(data[[i]], nms)
  tab.match <- table(ind.match)
  res[as.integer(names(tab.match)), i] <- as.vector(tab.match) / length(data[[i]])
}
# Verif
stopifnot(identical(t(res), sparseMat))
Run Code Online (Sandbox Code Playgroud)

基准测试:

data2 <- data[1:50]
microbenchmark::microbenchmark(
  OP = {
    sparseMat <- do.call(rbind, sapply(data2, sparseRow))
  },
  ME = {
    res <- Matrix(0, dims[2], length(data2), sparse = TRUE)
    for (i in seq_along(data2)) {
      ind.match <- match(data2[[i]], nms)
      tab.match <- table(ind.match)
      res[as.integer(names(tab.match)), i] <- as.vector(tab.match) / length(data2[[i]])
    }
    res2 <- t(res)
  }
)
stopifnot(identical(res2, sparseMat))

Unit: milliseconds
 expr      min       lq     mean   median       uq       max neval cld
   OP 56.28020 59.61689 63.24816 61.16986 62.80294 206.18689   100   b
   ME 46.60318 48.27268 49.77190 49.50714 50.92287  55.23727   100  a 
Run Code Online (Sandbox Code Playgroud)

所以,它的内存效率并不慢.