Tun*_*ung 13 r time-series outliers data-science
我正在寻找有效的方法来删除数据中的异常值。我尝试了在 StackOverflow 和其他地方找到的几种解决方案,但没有一个对我有用(应该在样本数据中检测并删除 1993 年 6 月、1994 年 8 月和 1995 年 3 月的 4 个高值 21637、19590、21659 和 200000发布在这篇文章的末尾)。任何建议将不胜感激!
这是我到目前为止测试过的:
数据概览

3 个异常值仍然存在,并且时间序列末尾的许多合法高值已被删除。
y <- dat$Value
y_filter <- y[!y %in% boxplot.stats(y)$out]
plot(y_filter)
Run Code Online (Sandbox Code Playgroud)

与第一种方法类似的问题
FindOutliers <- function(data) {
data <- data[!is.na(data)]
lowerq = quantile(data)[2]
upperq = quantile(data)[4]
iqr = upperq - lowerq #Or use IQR(data)
# we identify extreme outliers
extreme.threshold.upper = (iqr * 1.5) + upperq
extreme.threshold.lower = lowerq - (iqr * 1.5)
result <- which(data > extreme.threshold.upper | data < extreme.threshold.lower)
return(result)
}
# use the function to identify outliers
temp <- FindOutliers(y)
# remove the outliers
y1 <- y[-temp]
# show the data with the outliers removed
y1
#> [1] 511 524 310 721 514 318 511 511 318 510 21637 0
#> [13] 319 511 305 317 0 0 0 0 0 0 0 0
#> [25] 19590 0 0 0 0 0 0 0 0 21659 0 0
#> [37] 0 0 19 7 0 5 9 21 50 187 291 321
#> [49] 445 462 462 695 694 693 1276 2560 5500 11663 24307 25205
#> [61] 21667 18610 16739 14294 13517 12296 11247 11209 10954 11228 11387 11190
#> [73] 11193 11562 12279 11994 12073 11965 11386 10512 10677 10115 10118 9530
#> [85] 9016 9086 8167 8171 7561 7268 7359 7753 7168 7206 6926 6646
#> [97] 6674 6100 6177 5975 6033 5767 5497 5862 5594 5319
plot(y1)
Run Code Online (Sandbox Code Playgroud)

与其他两种方法相同的问题
library(performance)
outliers_list <- check_outliers(y) # Find outliers
outliers_list # Show the row index of the outliers
#> 9 outliers detected: cases 60, 61, 62, 63, 64, 65, 66, 67, 68.
#> - Based on the following method and threshold: zscore_robust (3.291).
#> - For variable: y.
#>
#> -----------------------------------------------------------------------------
#> Outliers per variable (zscore_robust):
#>
#> $y
#> Row Distance_Zscore_robust
#> 60 60 3.688817
#> 61 61 4.384524
#> 62 62 4.491276
#> 63 63 4.362517
#> 64 64 4.438994
#> 65 65 4.525319
#> 66 66 4.576871
#> 67 67 4.393886
#> 68 68 3.804809
str(outliers_list)
#> 'check_outliers' logi [1:116] FALSE FALSE FALSE FALSE FALSE FALSE ...
#> - attr(*, "data")='data.frame': 116 obs. of 4 variables:
#> ..$ Row : int [1:116] 1 2 3 4 5 6 7 8 9 10 ...
#> ..$ Distance_Zscore_robust: num [1:116] 0.645 0.643 0.669 0.619 0.644 ...
#> ..$ Outlier_Zscore_robust : num [1:116] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ Outlier : num [1:116] 0 0 0 0 0 0 0 0 0 0 ...
#> - attr(*, "threshold")=List of 1
#> ..$ zscore_robust: num 3.29
#> - attr(*, "method")= chr "zscore_robust"
#> - attr(*, "text_size")= num 3
#> - attr(*, "variables")= chr "y"
#> - attr(*, "raw_data")='data.frame': 116 obs. of 1 variable:
#> ..$ y: num [1:116] 511 524 310 721 514 318 511 511 318 510 ...
#> - attr(*, "outlier_var")=List of 1
#> ..$ zscore_robust:List of 1
#> .. ..$ y:'data.frame': 9 obs. of 2 variables:
#> .. .. ..$ Row : int [1:9] 60 61 62 63 64 65 66 67 68
#> .. .. ..$ Distance_Zscore_robust: num [1:9] 3.69 4.38 4.49 4.36 4.44 ...
#> - attr(*, "outlier_count")=List of 2
#> ..$ zscore_robust:'data.frame': 0 obs. of 2 variables:
#> .. ..$ Row : int(0)
#> .. ..$ n_Zscore_robust: int(0)
#> ..$ all :'data.frame': 0 obs. of 2 variables:
#> .. ..$ Row : num(0)
#> .. ..$ n_Zscore_robust: num(0)
y[!outliers_list]
#> [1] 511 524 310 721 514 318 511 511 318 510 21637 0
#> [13] 319 511 305 317 0 0 0 0 0 0 0 0
#> [25] 19590 0 0 0 0 0 0 0 0 21659 0 0
#> [37] 0 0 19 7 0 5 9 21 50 187 291 321
#> [49] 445 462 462 695 694 693 1276 2560 5500 11663 24307 30864
#> [61] 25205 21667 18610 16739 14294 13517 12296 11247 11209 10954 11228 11387
#> [73] 11190 11193 11562 12279 11994 12073 11965 11386 10512 10677 10115 10118
#> [85] 9530 9016 9086 8167 8171 7561 7268 7359 7753 7168 7206 6926
#> [97] 6646 6674 6100 6177 5975 6033 5767 5497 5862 5594 5319
par(mfrow = c(2,1), oma = c(2,2,0,0) + 0.1, mar = c(0,0,2,1) + 0.2)
plot(y)
plot(y[!outliers_list])
Run Code Online (Sandbox Code Playgroud)

测试数据
library(tibble)
dat <- tibble::tribble(
~DateTime, ~Value,
"1993-06-06 11:00:00", NA,
"1993-06-06 12:00:00", 524,
"1993-06-06 13:00:00", 310,
"1993-06-06 14:00:00", 721,
"1993-06-06 15:00:00", 514,
"1993-06-06 16:00:00", 318,
"1993-06-06 17:00:00", 511,
"1993-06-06 18:00:00", 511,
"1993-06-06 19:00:00", 318,
"1993-06-06 20:00:00", 510,
"1993-06-06 21:00:00", 21637,
"1993-06-06 22:00:00", NA,
"1993-06-06 23:00:00", 319,
"1993-06-07 24:00:00", 511,
"1993-06-07 01:00:00", 305,
"1993-06-07 02:00:00", 317,
"1994-08-25 06:00:00", 0,
"1994-08-25 07:00:00", 0,
"1994-08-25 08:00:00", 0,
"1994-08-25 09:00:00", NA,
"1994-08-25 10:00:00", 0,
"1994-08-25 11:00:00", 0,
"1994-08-25 12:00:00", 0,
"1994-08-25 13:00:00", 0,
"1994-08-25 14:00:00", 19590,
"1994-08-26 06:00:00", 0,
"1994-08-26 07:00:00", 0,
"1994-08-26 08:00:00", 0,
"1994-08-26 09:00:00", 0,
"1994-08-26 10:00:00", NA,
"1994-08-26 11:00:00", NA,
"1994-08-26 12:00:00", 0,
"1994-08-26 13:00:00", 0,
"1994-08-26 14:00:00", 21659,
"1994-08-26 15:00:00", 0,
"1994-08-26 16:00:00", 0,
"1994-08-26 17:00:00", 0,
"1994-08-26 20:00:00", 0,
"1994-08-26 21:00:00", 19,
"1994-08-26 22:00:00", NA,
"1995-03-09 18:00:00", NA,
"1995-03-09 19:00:00", NA,
"1995-03-09 20:00:00", 9,
"1995-03-09 21:00:00", 21,
"1995-03-09 22:00:00", 50,
"1995-03-09 23:00:00", 187,
"1995-03-10 24:00:00", 291,
"1995-03-10 01:00:00", 321,
"1995-03-10 02:00:00", 445,
"1995-03-10 03:00:00", 2e+05,
"1995-03-10 04:00:00", 462,
"1995-03-10 05:00:00", 695,
"1995-03-10 06:00:00", 694,
"1995-03-10 07:00:00", 693,
"1995-03-10 08:00:00", 1276,
"1995-03-10 09:00:00", 2560,
"1995-03-10 10:00:00", 5500,
"1995-03-10 11:00:00", 11663,
"1995-03-10 12:00:00", 24307,
"1995-03-10 15:00:00", 36154,
"1995-03-10 17:00:00", 41876,
"1995-03-10 18:00:00", 42754,
"1995-03-10 19:00:00", NA,
"1995-03-10 20:00:00", NA,
"1995-03-10 21:00:00", 43034,
"1995-03-10 22:00:00", 43458,
"1995-03-10 23:00:00", 41953,
"1995-03-11 24:00:00", 37108,
"1995-03-11 01:00:00", 30864,
"1995-03-11 02:00:00", 25205,
"1995-03-11 03:00:00", NA,
"1995-03-11 04:00:00", NA,
"1995-03-11 05:00:00", NA,
"1995-03-11 06:00:00", NA,
"1995-03-11 07:00:00", 13517,
"1995-03-11 08:00:00", 12296,
"1995-03-11 09:00:00", 11247,
"1995-03-11 10:00:00", 11209,
"1995-03-11 11:00:00", 10954,
"1995-03-11 12:00:00", 11228,
"1995-03-11 13:00:00", 11387,
"1995-03-11 14:00:00", 11190,
"1995-03-11 15:00:00", 11193,
"1995-03-11 16:00:00", 11562,
"1995-03-11 17:00:00", 12279,
"1995-03-11 18:00:00", 11994,
"1995-03-11 19:00:00", 12073,
"1995-03-11 20:00:00", 11965,
"1995-03-11 21:00:00", 11386,
"1995-03-11 22:00:00", 10512,
"1995-03-11 23:00:00", 10677,
"1995-03-12 24:00:00", 10115,
"1995-03-12 01:00:00", 10118,
"1995-03-12 02:00:00", 9530,
"1995-03-12 03:00:00", 9016,
"1995-03-12 04:00:00", 9086,
"1995-03-12 05:00:00", 8167,
"1995-03-12 06:00:00", 8171,
"1995-03-12 07:00:00", 7561,
"1995-03-12 08:00:00", 7268,
"1995-03-12 09:00:00", 7359,
"1995-03-12 10:00:00", 7753,
"1995-03-12 11:00:00", 7168,
"1995-03-12 12:00:00", 7206,
"1995-03-12 13:00:00", 6926,
"1995-03-12 14:00:00", 6646,
"1995-03-12 15:00:00", 6674,
"1995-03-12 16:00:00", 6100,
"1995-03-12 17:00:00", 6177,
"1995-03-12 18:00:00", 5975,
"1995-03-12 19:00:00", 6033,
"1995-03-12 20:00:00", 5767,
"1995-03-12 21:00:00", 5497,
"1995-03-12 22:00:00", 5862,
"1995-03-12 23:00:00", 5594,
"1995-03-13 24:00:00", NA
)
Run Code Online (Sandbox Code Playgroud)
创建于 2023-10-05,使用reprex v2.0.2
小智 18
在您的时间序列中,检测局部异常值而不是全局异常值可能是更好的主意。要删除这些异常值,您可以使用时间窗口,例如 (n = 10):
library(data.table)
setDT(dat)
dat[, upper := frollapply(Value, n = 10, FUN = quantile, p = 0.75, na.rm = TRUE, align = "center")]
dat[, lower := frollapply(Value, n = 10, FUN = quantile, p = 0.25, na.rm = TRUE, align = "center")]
dat[, iqr := frollapply(Value, n = 10, FUN = IQR, na.rm = TRUE, align = "center")]
dat[Value < lower - iqr * 3 | Value > upper + iqr * 3]
DateTime Value upper lower iqr
1: 1993-06-06 21:00:00 21637 511 317.25 193.75
2: 1994-08-25 14:00:00 19590 0 0.00 0.00
3: 1994-08-26 14:00:00 21659 0 0.00 0.00
dat[Value < lower - iqr * 3 | Value > upper + iqr * 3, Value := NA_integer_]
dat[, mean := frollmean(Value, n = 10, na.rm = TRUE, align = "center")]
dat[, Value := fcoalesce(Value, mean)]
dat[Value < lower - iqr * 3 | Value > upper + iqr * 3]
plot(dat$Value)
Run Code Online (Sandbox Code Playgroud)
您可以使用 {forecast} 包中的函数来识别有或没有季节性的时间序列异常值。它使用弗里德曼的超级平滑器。它还可以使用其他数据点将异常值和缺失值插入到预期值中。它首先消除趋势和季节性,因此剩下的部分是误差项。如果一个观察到的误差值低于第一个四分位数或高于第三个四分位数三个 IQR,则将其视为异常值。
阅读 Hyndman (2021)“检测时间序列离群值”了解更多信息https://robjhyndman.com/hyndsight/tsoutliers/。
library(dplyr)
library(forecast)
dat %>%
mutate(
outlier = row_number() %in% tsoutliers(Value)$index,
replacement = tsclean(Value)
)
Run Code Online (Sandbox Code Playgroud)
您可以使用Hampel 滤波器,它是一种有效的信号处理滤波器来消除异常值。
该过滤器在包中实现pracma,请参阅。
使用提供的示例:
library(pracma)
# non NA values index
ValInd <- which(!is.na(dat$Value))
# Find outliers index using Hampel filter
OutlierInd <- ValInd[pracma::hampel(dat$Value[ValInd],k=6)$ind]
dat$Value[OutlierInd]
#> [1] 21637 19590 21659
# Remove outliers
dat$Value[OutlierInd] <- NA
plot(dat$Value)
Run Code Online (Sandbox Code Playgroud)

请注意,结果取决于观察窗口长度,在本例中k=6提供了预期结果。
性能对比:
Hampel <- function(){
ValInd <- which(!is.na(dat$Value))
OutlierInd <- ValInd[pracma::hampel(dat$Value[ValInd],k=6)$ind]
dat$Value[OutlierInd] <- NA
}
dt <- function() {
setDT(dat)
dat[, upper := frollapply(Value, n = 10, FUN = quantile, p = 0.75, na.rm = TRUE, align = "center")]
dat[, lower := frollapply(Value, n = 10, FUN = quantile, p = 0.25, na.rm = TRUE, align = "center")]
dat[, iqr := frollapply(Value, n = 10, FUN = IQR, na.rm = TRUE, align = "center")]
#dat[Value < lower - iqr * 3 | Value > upper + iqr * 3]
dat[Value < lower - iqr * 3 | Value > upper + iqr * 3, Value := NA_integer_]
dat[, mean := frollmean(Value, n = 10, na.rm = TRUE, align = "center")]
#dat[, Value := fcoalesce(Value, mean)]
#dat[Value < lower - iqr * 3 | Value > upper + iqr * 3]
}
Forecast <- function() {
dat %>%
mutate(
outlier = row_number() %in% tsoutliers(
Value, lambda = "auto")$index,
replacement = tsclean(Value, lambda = "auto")
)
}
microbenchmark::microbenchmark(Hampel(),dt(),Forecast())
#Unit: milliseconds
expr min lq mean median uq max neval
# Hampel() 3.4124 3.62625 4.568761 3.8711 4.35435 14.1761 100
# dt() 23.6843 24.78875 27.691938 25.5765 31.51740 40.2852 100
# Forecast() 214.8229 222.16470 230.230439 227.2018 232.95140 340.5815 100
Run Code Online (Sandbox Code Playgroud)