删除时间序列中异常值的有效方法

Tun*_*ung 13 r time-series outliers data-science

我正在寻找有效的方法来删除数据中的异常值。我尝试了在 StackOverflow 和其他地方找到的几种解决方案,但没有一个对我有用(应该在样本数据中检测并删除 1993 年 6 月、1994 年 8 月和 1995 年 3 月的 4 个高值 21637、19590、21659 和 200000发布在这篇文章的末尾)。任何建议将不胜感激!

这是我到目前为止测试过的:

数据概览

  1. 如何从数据集中删除异常值

3 个异常值仍然存在,并且时间序列末尾的许多合法高值已被删除。

y <- dat$Value
y_filter <- y[!y %in% boxplot.stats(y)$out]
plot(y_filter)
Run Code Online (Sandbox Code Playgroud)

  1. 计算 R 中的异常值

与第一种方法类似的问题

FindOutliers <- function(data) {
  data <- data[!is.na(data)]
  lowerq = quantile(data)[2]
  upperq = quantile(data)[4]
  iqr = upperq - lowerq #Or use IQR(data)
  # we identify extreme outliers
  extreme.threshold.upper = (iqr * 1.5) + upperq
  extreme.threshold.lower = lowerq - (iqr * 1.5)
  result <- which(data > extreme.threshold.upper | data < extreme.threshold.lower)
  
  return(result)
}

# use the function to identify outliers
temp <- FindOutliers(y)
# remove the outliers
y1 <- y[-temp]
# show the data with the outliers removed
y1
#>   [1]   511   524   310   721   514   318   511   511   318   510 21637     0
#>  [13]   319   511   305   317     0     0     0     0     0     0     0     0
#>  [25] 19590     0     0     0     0     0     0     0     0 21659     0     0
#>  [37]     0     0    19     7     0     5     9    21    50   187   291   321
#>  [49]   445   462   462   695   694   693  1276  2560  5500 11663 24307 25205
#>  [61] 21667 18610 16739 14294 13517 12296 11247 11209 10954 11228 11387 11190
#>  [73] 11193 11562 12279 11994 12073 11965 11386 10512 10677 10115 10118  9530
#>  [85]  9016  9086  8167  8171  7561  7268  7359  7753  7168  7206  6926  6646
#>  [97]  6674  6100  6177  5975  6033  5767  5497  5862  5594  5319
plot(y1)
Run Code Online (Sandbox Code Playgroud)

  1. 通过性能检查异常值

与其他两种方法相同的问题

library(performance)
outliers_list <- check_outliers(y) # Find outliers
outliers_list # Show the row index of the outliers
#> 9 outliers detected: cases 60, 61, 62, 63, 64, 65, 66, 67, 68.
#> - Based on the following method and threshold: zscore_robust (3.291).
#> - For variable: y.
#> 
#> -----------------------------------------------------------------------------
#> Outliers per variable (zscore_robust): 
#> 
#> $y
#>    Row Distance_Zscore_robust
#> 60  60               3.688817
#> 61  61               4.384524
#> 62  62               4.491276
#> 63  63               4.362517
#> 64  64               4.438994
#> 65  65               4.525319
#> 66  66               4.576871
#> 67  67               4.393886
#> 68  68               3.804809
str(outliers_list)
#>  'check_outliers' logi [1:116] FALSE FALSE FALSE FALSE FALSE FALSE ...
#>  - attr(*, "data")='data.frame': 116 obs. of  4 variables:
#>   ..$ Row                   : int [1:116] 1 2 3 4 5 6 7 8 9 10 ...
#>   ..$ Distance_Zscore_robust: num [1:116] 0.645 0.643 0.669 0.619 0.644 ...
#>   ..$ Outlier_Zscore_robust : num [1:116] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ Outlier               : num [1:116] 0 0 0 0 0 0 0 0 0 0 ...
#>  - attr(*, "threshold")=List of 1
#>   ..$ zscore_robust: num 3.29
#>  - attr(*, "method")= chr "zscore_robust"
#>  - attr(*, "text_size")= num 3
#>  - attr(*, "variables")= chr "y"
#>  - attr(*, "raw_data")='data.frame': 116 obs. of  1 variable:
#>   ..$ y: num [1:116] 511 524 310 721 514 318 511 511 318 510 ...
#>  - attr(*, "outlier_var")=List of 1
#>   ..$ zscore_robust:List of 1
#>   .. ..$ y:'data.frame': 9 obs. of  2 variables:
#>   .. .. ..$ Row                   : int [1:9] 60 61 62 63 64 65 66 67 68
#>   .. .. ..$ Distance_Zscore_robust: num [1:9] 3.69 4.38 4.49 4.36 4.44 ...
#>  - attr(*, "outlier_count")=List of 2
#>   ..$ zscore_robust:'data.frame':    0 obs. of  2 variables:
#>   .. ..$ Row            : int(0) 
#>   .. ..$ n_Zscore_robust: int(0) 
#>   ..$ all          :'data.frame':    0 obs. of  2 variables:
#>   .. ..$ Row            : num(0) 
#>   .. ..$ n_Zscore_robust: num(0)

y[!outliers_list]
#>   [1]   511   524   310   721   514   318   511   511   318   510 21637     0
#>  [13]   319   511   305   317     0     0     0     0     0     0     0     0
#>  [25] 19590     0     0     0     0     0     0     0     0 21659     0     0
#>  [37]     0     0    19     7     0     5     9    21    50   187   291   321
#>  [49]   445   462   462   695   694   693  1276  2560  5500 11663 24307 30864
#>  [61] 25205 21667 18610 16739 14294 13517 12296 11247 11209 10954 11228 11387
#>  [73] 11190 11193 11562 12279 11994 12073 11965 11386 10512 10677 10115 10118
#>  [85]  9530  9016  9086  8167  8171  7561  7268  7359  7753  7168  7206  6926
#>  [97]  6646  6674  6100  6177  5975  6033  5767  5497  5862  5594  5319

par(mfrow = c(2,1), oma = c(2,2,0,0) + 0.1, mar = c(0,0,2,1) + 0.2)
plot(y)
plot(y[!outliers_list])
Run Code Online (Sandbox Code Playgroud)

测试数据

library(tibble)

dat <- tibble::tribble(
  ~DateTime, ~Value,
  "1993-06-06 11:00:00",     NA,
  "1993-06-06 12:00:00",    524,
  "1993-06-06 13:00:00",    310,
  "1993-06-06 14:00:00",    721,
  "1993-06-06 15:00:00",    514,
  "1993-06-06 16:00:00",    318,
  "1993-06-06 17:00:00",    511,
  "1993-06-06 18:00:00",    511,
  "1993-06-06 19:00:00",    318,
  "1993-06-06 20:00:00",    510,
  "1993-06-06 21:00:00",  21637,
  "1993-06-06 22:00:00",     NA,
  "1993-06-06 23:00:00",    319,
  "1993-06-07 24:00:00",    511,
  "1993-06-07 01:00:00",    305,
  "1993-06-07 02:00:00",    317,
  "1994-08-25 06:00:00",      0,
  "1994-08-25 07:00:00",      0,
  "1994-08-25 08:00:00",      0,
  "1994-08-25 09:00:00",     NA,
  "1994-08-25 10:00:00",      0,
  "1994-08-25 11:00:00",      0,
  "1994-08-25 12:00:00",      0,
  "1994-08-25 13:00:00",      0,
  "1994-08-25 14:00:00",  19590,
  "1994-08-26 06:00:00",      0,
  "1994-08-26 07:00:00",      0,
  "1994-08-26 08:00:00",      0,
  "1994-08-26 09:00:00",      0,
  "1994-08-26 10:00:00",     NA,
  "1994-08-26 11:00:00",     NA,
  "1994-08-26 12:00:00",      0,
  "1994-08-26 13:00:00",      0,
  "1994-08-26 14:00:00",  21659,
  "1994-08-26 15:00:00",      0,
  "1994-08-26 16:00:00",      0,
  "1994-08-26 17:00:00",      0,
  "1994-08-26 20:00:00",      0,
  "1994-08-26 21:00:00",     19,
  "1994-08-26 22:00:00",     NA,
  "1995-03-09 18:00:00",     NA,
  "1995-03-09 19:00:00",     NA,
  "1995-03-09 20:00:00",      9,
  "1995-03-09 21:00:00",     21,
  "1995-03-09 22:00:00",     50,
  "1995-03-09 23:00:00",    187,
  "1995-03-10 24:00:00",    291,
  "1995-03-10 01:00:00",    321,
  "1995-03-10 02:00:00",    445,
  "1995-03-10 03:00:00",  2e+05,
  "1995-03-10 04:00:00",    462,
  "1995-03-10 05:00:00",    695,
  "1995-03-10 06:00:00",    694,
  "1995-03-10 07:00:00",    693,
  "1995-03-10 08:00:00",   1276,
  "1995-03-10 09:00:00",   2560,
  "1995-03-10 10:00:00",   5500,
  "1995-03-10 11:00:00",  11663,
  "1995-03-10 12:00:00",  24307,
  "1995-03-10 15:00:00",  36154,
  "1995-03-10 17:00:00",  41876,
  "1995-03-10 18:00:00",  42754,
  "1995-03-10 19:00:00",     NA,
  "1995-03-10 20:00:00",     NA,
  "1995-03-10 21:00:00",  43034,
  "1995-03-10 22:00:00",  43458,
  "1995-03-10 23:00:00",  41953,
  "1995-03-11 24:00:00",  37108,
  "1995-03-11 01:00:00",  30864,
  "1995-03-11 02:00:00",  25205,
  "1995-03-11 03:00:00",     NA,
  "1995-03-11 04:00:00",     NA,
  "1995-03-11 05:00:00",     NA,
  "1995-03-11 06:00:00",     NA,
  "1995-03-11 07:00:00",  13517,
  "1995-03-11 08:00:00",  12296,
  "1995-03-11 09:00:00",  11247,
  "1995-03-11 10:00:00",  11209,
  "1995-03-11 11:00:00",  10954,
  "1995-03-11 12:00:00",  11228,
  "1995-03-11 13:00:00",  11387,
  "1995-03-11 14:00:00",  11190,
  "1995-03-11 15:00:00",  11193,
  "1995-03-11 16:00:00",  11562,
  "1995-03-11 17:00:00",  12279,
  "1995-03-11 18:00:00",  11994,
  "1995-03-11 19:00:00",  12073,
  "1995-03-11 20:00:00",  11965,
  "1995-03-11 21:00:00",  11386,
  "1995-03-11 22:00:00",  10512,
  "1995-03-11 23:00:00",  10677,
  "1995-03-12 24:00:00",  10115,
  "1995-03-12 01:00:00",  10118,
  "1995-03-12 02:00:00",   9530,
  "1995-03-12 03:00:00",   9016,
  "1995-03-12 04:00:00",   9086,
  "1995-03-12 05:00:00",   8167,
  "1995-03-12 06:00:00",   8171,
  "1995-03-12 07:00:00",   7561,
  "1995-03-12 08:00:00",   7268,
  "1995-03-12 09:00:00",   7359,
  "1995-03-12 10:00:00",   7753,
  "1995-03-12 11:00:00",   7168,
  "1995-03-12 12:00:00",   7206,
  "1995-03-12 13:00:00",   6926,
  "1995-03-12 14:00:00",   6646,
  "1995-03-12 15:00:00",   6674,
  "1995-03-12 16:00:00",   6100,
  "1995-03-12 17:00:00",   6177,
  "1995-03-12 18:00:00",   5975,
  "1995-03-12 19:00:00",   6033,
  "1995-03-12 20:00:00",   5767,
  "1995-03-12 21:00:00",   5497,
  "1995-03-12 22:00:00",   5862,
  "1995-03-12 23:00:00",   5594,
  "1995-03-13 24:00:00",     NA
)
Run Code Online (Sandbox Code Playgroud)

创建于 2023-10-05,使用reprex v2.0.2

小智 18

在您的时间序列中,检测局部异常值而不是全局异常值可能是更好的主意。要删除这些异常值,您可以使用时间窗口,例如 (n = 10):

library(data.table)

setDT(dat)

dat[, upper := frollapply(Value, n = 10, FUN = quantile, p = 0.75, na.rm = TRUE, align = "center")]
dat[, lower := frollapply(Value, n = 10, FUN = quantile, p = 0.25, na.rm = TRUE, align = "center")]
dat[, iqr := frollapply(Value, n = 10, FUN = IQR, na.rm = TRUE, align = "center")]
dat[Value < lower - iqr * 3 | Value > upper + iqr * 3]

              DateTime Value upper  lower    iqr
1: 1993-06-06 21:00:00 21637   511 317.25 193.75
2: 1994-08-25 14:00:00 19590     0   0.00   0.00
3: 1994-08-26 14:00:00 21659     0   0.00   0.00

dat[Value < lower - iqr * 3 | Value > upper + iqr * 3, Value := NA_integer_]

dat[, mean := frollmean(Value, n = 10, na.rm = TRUE, align = "center")]
dat[, Value := fcoalesce(Value, mean)]
dat[Value < lower - iqr * 3 | Value > upper + iqr * 3]

plot(dat$Value)
Run Code Online (Sandbox Code Playgroud)

阴谋

  • 这太棒了!您能否在代码中添加一些解释,以便刚接触异常值检测(和“data.table”)的人能够理解?谢谢你! (3认同)

DrJ*_*TAO 8

您可以使用 {forecast} 包中的函数来识别有或没有季节性的时间序列异常值。它使用弗里德曼的超级平滑器。它还可以使用其他数据点将异常值和缺失值插入到预期值中。它首先消除趋势和季节性,因此剩下的部分是误差项。如果一个观察到的误差值低于第一个四分位数或高于第三个四分位数三个 IQR,则将其视为异常值。

阅读 Hyndman (2021)“检测时间序列离群值”了解更多信息https://robjhyndman.com/hyndsight/tsoutliers/

library(dplyr)
library(forecast)
dat %>%
  mutate(
    outlier = row_number() %in% tsoutliers(Value)$index, 
    replacement = tsclean(Value)
  )
Run Code Online (Sandbox Code Playgroud)


Wal*_*ldi 6

您可以使用Hampel 滤波器,它是一种有效的信号处理滤波器来消除异常值。

该过滤器在包中实现pracma请参阅

使用提供的示例:

library(pracma)

# non NA values index
ValInd <- which(!is.na(dat$Value))

# Find outliers index using Hampel filter
OutlierInd <- ValInd[pracma::hampel(dat$Value[ValInd],k=6)$ind]

dat$Value[OutlierInd]
#> [1] 21637 19590 21659

# Remove outliers
dat$Value[OutlierInd] <- NA

plot(dat$Value)
Run Code Online (Sandbox Code Playgroud)

请注意,结果取决于观察窗口长度,在本例中k=6提供了预期结果。

性能对比:

Hampel <- function(){
ValInd <- which(!is.na(dat$Value))
OutlierInd <- ValInd[pracma::hampel(dat$Value[ValInd],k=6)$ind]
dat$Value[OutlierInd] <- NA
}

dt <- function() {
setDT(dat)
dat[, upper := frollapply(Value, n = 10, FUN = quantile, p = 0.75, na.rm = TRUE, align = "center")]
dat[, lower := frollapply(Value, n = 10, FUN = quantile, p = 0.25, na.rm = TRUE, align = "center")]
dat[, iqr := frollapply(Value, n = 10, FUN = IQR, na.rm = TRUE, align = "center")]
#dat[Value < lower - iqr * 3 | Value > upper + iqr * 3]
dat[Value < lower - iqr * 3 | Value > upper + iqr * 3, Value := NA_integer_]
dat[, mean := frollmean(Value, n = 10, na.rm = TRUE, align = "center")]
#dat[, Value := fcoalesce(Value, mean)]
#dat[Value < lower - iqr * 3 | Value > upper + iqr * 3]
}

Forecast <- function() {
  dat %>%
    mutate(
      outlier = row_number() %in% tsoutliers(
        Value, lambda = "auto")$index, 
      replacement = tsclean(Value, lambda = "auto")
    )
}

microbenchmark::microbenchmark(Hampel(),dt(),Forecast())

#Unit: milliseconds
       expr      min        lq       mean   median        uq      max neval
#   Hampel()   3.4124   3.62625   4.568761   3.8711   4.35435  14.1761   100
#       dt()  23.6843  24.78875  27.691938  25.5765  31.51740  40.2852   100
# Forecast() 214.8229 222.16470 230.230439 227.2018 232.95140 340.5815   100
Run Code Online (Sandbox Code Playgroud)

  • 简单、直接、简洁、有效,酷!+1! (2认同)