对向量中值的实例进行编号的最快方法

Hen*_*olm 1 r dataframe dplyr

我希望对向量中出现的值的每个实例进行编号。例如,某个值第一次出现时记为“1”,第二次出现时记为“2”,依此类推——也就是统计该值到当前位置为止(含当前位置)在向量中出现的次数。我可以用 for 循环实现,下面以 R 的 datasets 包中的 EuStockMarkets 数据集为例。

# Load the DAX series from the built-in EuStockMarkets dataset and record the
# original row position next to each value.
data <- as.data.frame(datasets::EuStockMarkets)
df <- data.frame(order = seq_len(nrow(data)), value = data$DAX)
head(df)

# For each row i, count how often its value occurs in rows 1..i (inclusive),
# so the first occurrence of a value gets 1, the second gets 2, and so on.
start_time <- Sys.time()
# Create the result column up front rather than on the first indexed write.
df$instance <- rep(NA_integer_, nrow(df))
# seq_len() yields an empty sequence for zero rows, unlike 1:nrow(df).
for (i in seq_len(nrow(df))) {
  df[i, "instance"] <- sum(df[seq_len(i), "value"] == df[i, "value"])
}
end_time <- Sys.time()
end_time - start_time
#Time difference of 0.1126978 secs
Run Code Online (Sandbox Code Playgroud)

这样可以得到正确结果,但如果对于更大的数据集有更快的方法,我宁愿不使用 for 循环。我也想知道是否已经有现成的函数(也许在 tidyverse 的某个包中)可以做到这一点。

r2e*_*ans 6

Base R

这是 `ave` 的一个很好的用例:

# Distribution of the loop-based numbering from the question, for reference
table(df[["instance"]])
#    1    2    3    4 
# 1774   66   17    3 
# Group `value` by itself and number the members of each group with
# seq_along(), which gives the running occurrence count per distinct value.
df[["instance3"]] <- with(df, ave(value, value, FUN = seq_along))
table(df[["instance3"]])
#    1    2    3    4 
# 1774   66   17    3 
# Element-wise agreement between the two numberings
all(df[["instance"]] == df[["instance3"]])
# [1] TRUE
Run Code Online (Sandbox Code Playgroud)

dplyr

library(dplyr)
# Number the rows within each group of equal `value`, then drop the grouping
# so the result is an ordinary (ungrouped) tibble. Uses the native pipe,
# which requires R >= 4.1.
df |>
  group_by(value) |>
  mutate(instance4 = row_number()) |>
  ungroup()
Run Code Online (Sandbox Code Playgroud)

基准

毫不意外,Waldi 的 data.table 方案遥遥领先:

# data.table must be attached for as.data.table() below.
library(data.table)

# data.table version of `df`, used only by the data.table entry.
DT <- as.data.table(df)

# Row-wise helper for the apply() entry. `x` is one row of `df` as passed by
# apply(): x[1] is the first column (`order`, the row position) and x[2] is
# the second column (`value`). Returns how many of the first x[1] values of
# df$value equal x[2].
useful.function <- function(x) {
  temp.idx <- x[1]
  sum(x[2] == df$value[1:temp.idx])
}

# Compare all proposed approaches. Each entry is kept as its author wrote it
# so the timings reflect the original code.
bench::mark(
  OP = {
    for (i in 1:nrow(df)) {
      df[i,"instance"]<- sum(df[1:i,"value"] == df[i,"value"])
    }
  },
  Waldi = {
    DT[,instance2:=seq_len(.N),by=value]
  },
  r2evans_base = {
    df$instance3 <- ave(df$value, df$value, FUN = seq_along)
  },
  r2evans_dplyr = {
    df %>%
      group_by(value) %>%
      mutate(instance4 = row_number()) %>%
      ungroup()
  },
  PlasticMan = {
    df$instance5 <- apply(df, MARGIN = 1, useful.function)
  },
  # The argument name must be spelled exactly: it comes after `...`, so a
  # misspelled name would be treated as one more expression to benchmark.
  # check = FALSE because the entries do not return comparable values.
  min_iterations = 500, check = FALSE)
# # A tibble: 5 x 13
#   expression         min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory                 time    gc     
#   <bch:expr>    <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>                 <list>  <list> 
# 1 OP             54.42ms  59.45ms      16.6    39.9MB    11.1    300   200     18.07s <NULL> <Rprofmem [7,360 x 3]> <bench~ <tibbl~
# 2 Waldi          997.3us   1.11ms     828.     81.6KB     1.66   499     1   602.36ms <NULL> <Rprofmem [12 x 3]>    <bench~ <tibbl~
# 3 r2evans_base    3.11ms   3.52ms     273.    389.1KB     1.65   497     3      1.82s <NULL> <Rprofmem [35 x 3]>    <bench~ <tibbl~
# 4 r2evans_dplyr  13.04ms  15.85ms      59.5     347KB     6.03   454    46      7.63s <NULL> <Rprofmem [35 x 3]>    <bench~ <tibbl~
# 5 PlasticMan      12.7ms  14.12ms      65.3    26.9MB     4.47   468    32      7.16s <NULL> <Rprofmem [5,507 x 3]> <bench~ <tibbl~
Run Code Online (Sandbox Code Playgroud)


Wal*_*ldi 5

data.table

library(data.table)
# Convert `df` to a data.table in place (no copy is assigned back).
setDT(df)

# Within each group of equal `value`, number the rows 1..N (.N is the group
# size) and store the result in a new column `instance2` by reference.
df[, instance2 := seq_len(.N), by = value]

# Confirm the result matches the loop-based `instance` column exactly.
identical(df$instance, df$instance2)
# [1] TRUE
Run Code Online (Sandbox Code Playgroud)