找到最快的方法来获取向量中相同元素之间的所有间隔

jal*_*pic 14 r

假设我有一个包含8个字母的字符向量,每个字符出现两次:

x <- rep(LETTERS[1:8],2)
set.seed(1)
y <- sample(x)
y

# [1] "E" "F" "A" "D" "C" "B" "C" "G" "F" "A" "B" "G" "E" "H" "D" "H"
Run Code Online (Sandbox Code Playgroud)

我想找到每对字母之间的间隔.这里,interval是指两个相同字母之间的字母数.我可以像这样手动完成:

abs(diff(which(y=="A")))-1  #6
abs(diff(which(y=="D")))-1  #10
abs(diff(which(y=="H")))-1  #1
Run Code Online (Sandbox Code Playgroud)

我写了一个for循环来做这个...

res<-NULL
for(i in 1:8){  res[[i]] <- abs(diff(which(y==LETTERS[i])))-1 }

names(res)<-LETTERS[1:8]
res

# A  B  C  D  E  F  G  H 
# 6  4  1 10 11  6  3  1 
Run Code Online (Sandbox Code Playgroud)

但是,我想在具有很长向量的随机化过程中使用这种方法.速度对此至关重要 - 我想知道是否有人有尽可能快速解决这个问题的好主意.

Fra*_*ank 16

您需要设置索引向量,然后执行diff(索引向量)-by-group操作.


以下是它在data.table包装中的外观:

require(data.table)
yDT <- data.table(y)

yDT[,diff(.I)-1,keyby=y]
#    y V1
# 1: A  6
# 2: B  4
# 3: C  1
# 4: D 10
# 5: E 11
# 6: F  6
# 7: G  3
# 8: H  1
Run Code Online (Sandbox Code Playgroud)

这里的索引向量是特殊(内置)变量.I,用于存储行号.

keyby=yy按字母顺序分组和排序结果; 或者by=y,我们会看到结果按照组的首次出现排序.(谢谢@Arun,指出这一点.)


基础R中的类似解决方案看起来像

tapply(1:length(y),y,diff)-1
# A  B  C  D  E  F  G  H 
# 6  4  1 10 11  6  3  1 
Run Code Online (Sandbox Code Playgroud)


Kha*_*haa 13

使用data.table::chmatch速度要快得多.

library(data.table)   
f <- function(x){
  ux <- unique(x)
  out <- length(x) - chmatch(ux, rev(x)) - chmatch(ux, x)
  setNames(out, ux)
}

f(y)
# E  F  A  D  C  B  G  H 
#11  6  6 10  1  4  3  1 
Run Code Online (Sandbox Code Playgroud)

它比快2倍快cmpalex.

set.seed(007); xx = sample(rep(make.unique(rep_len(LETTERS, 1e3)), each = 2))
microbenchmark::microbenchmark(cmpalex(xx), f(xx), unit="relative")
#Unit: relative
#        expr      min       lq    mean   median       uq      max neval
# cmpalex(xx) 2.402806 2.366553 2.33802 2.359145 2.324677 2.232852   100
#       f(xx) 1.000000 1.000000 1.00000 1.000000 1.000000 1.000000   100
Run Code Online (Sandbox Code Playgroud)
R version 3.2.0 (2015-04-16)
Running under: Windows 8 x64 (build 9200)   

other attached packages:
[1] data.table_1.9.5
Run Code Online (Sandbox Code Playgroud)


ale*_*laz 12

另一种选择:

alex = function(x)
{
    ux = unique(x)
    mux = match(x, ux)      
    ans = integer(length(ux))       
    for(i in seq_along(x)) ans[mux[i]] = i - ans[mux[i]]        
    return(setNames(ans - 1L, ux))
}
alex(y)
# E  F  A  D  C  B  G  H 
#11  6  6 10  1  4  3  1
Run Code Online (Sandbox Code Playgroud)

与其他替代方案相比:

frank1 = function(x) tapply(1:length(x), x, diff) - 1

library(data.table)
frank2 = function(x) data.table(x)[, diff(.I) - 1, by = x]

jaehyeon = function(x) sapply(unique(x), function(X) abs(diff(which(x == X))) - 1)

library(data.table)
khashaa = function(x)
{
    ux = unique(x)
    setNames(length(x) - chmatch(ux, rev(x)) - chmatch(ux, x), ux)
}

khashaa_base = function(x)
{
    ux = unique(x)
    setNames(length(x) - match(ux, rev(x)) - match(ux, x), ux) 
}

frank1(y)
# A  B  C  D  E  F  G  H 
# 6  4  1 10 11  6  3  1 
frank2(y)
#   x V1
#1: E 11
#2: F  6
#3: A  6
#4: D 10
#5: C  1
#6: B  4
#7: G  3
#8: H  1
jaehyeon(y)
# E  F  A  D  C  B  G  H 
#11  6  6 10  1  4  3  1
khashaa(y)
# E  F  A  D  C  B  G  H 
#11  6  6 10  1  4  3  1
khashaa_base(y)
# E  F  A  D  C  B  G  H 
#11  6  6 10  1  4  3  1
Run Code Online (Sandbox Code Playgroud)

并在基准上:

#compiled versions for all for consistency:
cmpalex = compiler::cmpfun(alex)
cmpfrank1 = compiler::cmpfun(frank1)
cmpfrank2 = compiler::cmpfun(frank2)
cmpjaehyeon = compiler::cmpfun(jaehyeon)
cmpkhashaa = compiler::cmpfun(khashaa)
cmpkhashaa_base = compiler::cmpfun(khashaa_base)

set.seed(007); xx = sample(rep(make.unique(rep_len(LETTERS, 1e3)), each = 2))

sort_by_names = function(x) x[order(names(x))]
sum(sort_by_names(alex(xx)) != frank1(xx))
#[1] 0
sum(alex(xx) != setNames(frank2(xx)[[2]], frank2(xx)[[1]]))
#[1] 0
sum(alex(xx) != jaehyeon(xx))
#[1] 0
sum(alex(xx) != khashaa(xx))
#[1] 0
sum(alex(xx) != khashaa_base(xx))
#[1] 0


microbenchmark::microbenchmark(alex(xx), cmpalex(xx), 
                               frank1(xx), cmpfrank1(xx), 
                               frank2(xx), cmpfrank2(xx), 
                               jaehyeon(xx), cmpjaehyeon(xx), 
                               khashaa(xx), cmpkhashaa(xx), 
                               khashaa_base(xx), cmpkhashaa_base(xx), times = 20)
#Unit: microseconds
#                expr       min         lq    median         uq       max neval
#            alex(xx)  3472.726  3620.1055  3764.005  4157.9445  5382.221    20
#         cmpalex(xx)  1056.538  1074.6345  1115.177  1251.0720  2131.172    20
#          frank1(xx) 19441.559 19858.8145 20356.808 21159.3035 27471.738    20
#       cmpfrank1(xx) 19166.288 19566.4925 20572.222 21108.8430 22243.335    20
#          frank2(xx) 12592.156 12931.6325 13337.057 14092.5725 24015.020    20
#       cmpfrank2(xx) 12396.578 12861.3365 13376.904 14012.3575 14542.715    20
#        jaehyeon(xx) 45313.525 46875.1900 47514.821 48728.3085 49513.578    20
#     cmpjaehyeon(xx) 44899.401 46496.7365 47748.330 49561.9505 82592.347    20
#         khashaa(xx)   189.314   204.1045   220.982   235.0760   259.959    20
#      cmpkhashaa(xx)   190.010   201.3200   234.032   240.1225   389.415    20
#    khashaa_base(xx)   295.802   315.1170   328.167   360.5320  1353.038    20
# cmpkhashaa_base(xx)   295.803   301.8930   317.901   332.8650   379.323    20
Run Code Online (Sandbox Code Playgroud)

编辑: 包括/修复其他替代方案.字节代码编译仅改进了具有显式循环的函数; 编译其他替代方案只是为了完整性.到目前为止,Khashaa的智能解决方案也是最快的.

  • @jalapic:`tapply`只是R的基本"拆分/应用"操作的包装器,这些操作成本很高.我猜"data.table"必须执行额外的"数据库-y"操作(?).`sapply`遍历向量多次以检查是否相等,而操作一次迭代哈希与`match`应该更快.@DavidArenburg:根据你的意见修正答案; 谢谢. (2认同)