Cro*_*ops 15 r list lapply set-operations data.table
我有一个向量列表如下.
data <- list(v1=c("a", "b", "c"), v2=c("g", "h", "k"),
v3=c("c", "d"), v4=c("n", "a"), v5=c("h", "i"))
Run Code Online (Sandbox Code Playgroud)
我正在努力实现以下目标
1)检查任何矢量是否相互交叉.
2)如果找到相交的向量,得到它们的联合.
所以期望的输出是
out <- list(v1=c("a", "b", "c", "d", "n"), v2=c("g", "h", "k", "i"))
Run Code Online (Sandbox Code Playgroud)
我可以得到一组相交集的并集,如下所示.
Reduce(union, list(data[[1]], data[[3]], data[[4]]))
Reduce(union, list(data[[2]], data[[5]])
Run Code Online (Sandbox Code Playgroud)
如何首先识别交叉向量?有没有办法将列表划分为相交矢量组列表?
这是使用data.table的尝试.获得所需的结果.但是对于大型列表仍然很慢,就像在这个示例数据集中一样
datasets.
data <- sapply(data, function(x) paste(x, collapse=", "))
data <- as.data.frame(data, stringsAsFactors = F)
repeat {
M <- nrow(data)
data <- data.table( data , key = "data" )
data <- data[ , list(dataelement = unique(unlist(strsplit(data , ", " )))), by = list(data)]
data <- data.table(data , key = "dataelement" )
data <- data[, list(data = paste0(sort(unique(unlist(strsplit(data, split=", ")))), collapse=", ")), by = "dataelement"]
data$dataelement <- NULL
data <- unique(data)
N <- nrow(data)
if (M == N)
break
}
data <- strsplit(as.character(data$data) , "," )
Run Code Online (Sandbox Code Playgroud)
MrF*_*ick 21
这有点像图形问题所以我喜欢使用这个igraph库,你可以使用你的样本数据
library(igraph)
#build edgelist
el <- do.call("rbind",lapply(data, embed, 2))
#make a graph
gg <- graph.edgelist(el, directed=F)
#partition the graph into disjoint sets
split(V(gg)$name, clusters(gg)$membership)
# $`1`
# [1] "b" "a" "c" "d" "n"
#
# $`2`
# [1] "h" "g" "k" "i"
Run Code Online (Sandbox Code Playgroud)
我们可以查看结果
V(gg)$color=c("green","purple")[clusters(gg)$membership]
plot(gg)
Run Code Online (Sandbox Code Playgroud)

tal*_*lat 16
这是另一种仅使用基础R的方法
akrun评论后的下一次更新及其样本数据:
data <- list(v1=c('g', 'k'), v2= letters[1:4], v3= c('b', 'c', 'd', 'a'))
Run Code Online (Sandbox Code Playgroud)
修改功能:
x <- lapply(seq_along(data), function(i) {
if(!any(data[[i]] %in% unlist(data[-i]))) {
data[[i]]
} else if (any(data[[i]] %in% unlist(data[seq_len(i-1)]))) {
NULL
} else {
z <- lapply(data[-seq_len(i)], intersect, data[[i]])
z <- names(z[sapply(z, length) >= 1L])
if (is.null(z)) NULL else union(data[[i]], unlist(data[z]))
}
})
x[!sapply(x, is.null)]
#[[1]]
#[1] "g" "k"
#
#[[2]]
#[1] "a" "b" "c" "d"
Run Code Online (Sandbox Code Playgroud)
这适用于原始样本数据,MrFlick的样本数据和akrun的样本数据.
效率被诅咒,你们甚至睡觉吗?仅基础R并且比最快的答案慢得多.自从我写完以后,不妨发布它.
f.union = function(x) {
repeat{
n = length(x)
m = matrix(F, nrow = n, ncol = n)
for (i in 1:n){
for (j in 1:n) {
m[i,j] = any(x[[i]] %in% x[[j]])
}
}
o = apply(m, 2, function(v) Reduce(union, x[v]))
if (all(apply(m, 1, sum)==1)) {return(o)} else {x=unique(o)}
}
}
f.union(data)
[[1]]
[1] "a" "b" "c" "d" "n"
[[2]]
[1] "g" "h" "k" "i"
Run Code Online (Sandbox Code Playgroud)
因为我喜欢慢.(基准之外的加载库)
Unit: microseconds
expr min lq mean median uq max neval
vlo() 896.435 1070.6540 1315.8194 1129.4710 1328.6630 7859.999 1000
akrun() 596.263 658.6590 789.9889 694.1360 804.9035 3470.158 1000
flick() 805.854 928.8160 1160.9509 1001.8345 1172.0965 5780.824 1000
josh() 2427.752 2693.0065 3344.8671 2943.7860 3524.1550 16505.909 1000 <- deleted :-(
doc() 254.462 288.9875 354.6084 302.6415 338.9565 2734.795 1000
Run Code Online (Sandbox Code Playgroud)
一种选择是使用combn然后找到相交.会有更简单的选择.
indx <- combn(names(data),2)
lst <- lapply(split(indx, col(indx)),
function(i) Reduce(`intersect`,data[i]))
indx1 <- names(lst[sapply(lst, length)>0])
indx2 <- indx[,as.numeric(indx1)]
indx3 <- apply(indx2,2, sort)
lapply(split(1:ncol(indx3), indx3[1,]),
function(i) unique(unlist(data[c(indx3[,i])], use.names=FALSE)))
#$v1
#[1] "a" "b" "c" "d" "n"
#$v2
#[1] "g" "h" "k" "i"
Run Code Online (Sandbox Code Playgroud)
您可以使用combnPrimfrom library(gRbase)来加快速度.使用稍大的数据集
library(gRbase)
set.seed(25)
data <- setNames(lapply(1:1e3,function(i)sample(letters,
sample(1:20), replace=FALSE)), paste0("v", 1:1000))
Run Code Online (Sandbox Code Playgroud)
并与之比较fastest.这些是基于OP对@docendo discimus的评论的修改函数.
akrun2M <- function(){
ind <- sapply(seq_along(data), function(i){#copied from @docendo discimus
!any(data[[i]] %in% unlist(data[-i]))
})
data1 <- data[!ind]
indx <- combnPrim(names(data1),2)
lst <- lapply(split(indx, col(indx)),
function(i) Reduce(`intersect`,data1[i]))
indx1 <- names(lst[sapply(lst, length)>0])
indx2 <- indx[,as.numeric(indx1)]
indx3 <- apply(indx2,2, sort)
c(data[ind],lapply(split(1:ncol(indx3), indx3[1,]),
function(i) unique(unlist(data[c(indx3[,i])], use.names=FALSE))))
}
doc2 <- function(){
x <- lapply(seq_along(data), function(i) {
if(!any(data[[i]] %in% unlist(data[-i]))) {
data[[i]]
}
else {
z <- unlist(data[names(unlist(lapply(data[-c(1:i)],
intersect, data[[i]])))])
if (is.null(z)){
z
}
else union(data[[i]], z)
}
})
x[!sapply(x, is.null)]
}
Run Code Online (Sandbox Code Playgroud)
microbenchmark(doc2(), akrun2M(), times=10L)
# Unit: seconds
# expr min lq mean median uq max neval cld
# doc2() 35.43687 53.76418 54.77813 54.34668 62.86665 67.76754 10 b
#akrun2M() 26.64997 28.74721 38.02259 35.35081 47.56781 49.82158 10 a
Run Code Online (Sandbox Code Playgroud)