检查每行单调递增

Wal*_*Yo_ 13 r dataframe dplyr

我有一个数据框如下:

   COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
   <int> <int> <int> <int> <int> <int>
 1     1     1     1     1     1     1
 2     1     1     1     1     1     2
 3     1     1     1     1     1     3
 4     1     1     1     1     1     4
 5     1     2     1     1     1     5
 6     1     1     1     1     1     6
 7     1     3     4     5     6     7
 8     1     1     1     1     1     8
 9     1     1     9     1     1     9
10     1     3     5     7     9    10
Run Code Online (Sandbox Code Playgroud)

我想过滤此数据集,只保留从 COL_1 到 COL_6 严格递增的行,结果如下所示:

   COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
   <int> <int> <int> <int> <int> <int>
 7     1     3     4     5     6     7
10     1     3     5     7     9    10
Run Code Online (Sandbox Code Playgroud)

编辑:这段代码要用在一个列数不固定的函数中(各列将命名为 COL_1 到 COL_N)。因此,“基本”代码,例如

df %>% filter(COL_6 > COL_5 & ... & COL_2 > COL_1)
Run Code Online (Sandbox Code Playgroud)

在我的情况下不起作用。非常感谢

Dar*_*sai 13

方法一:apply按行

# Row-wise check: apply(df, 1, diff) yields one column of successive
# differences per row of df. A row is kept only when none of its differences
# is <= 0, i.e. when colSums() of the violations is 0, which `!` turns
# into TRUE.
# NOTE(review): with only two columns diff() returns a single value per row,
# so apply() returns a plain vector and colSums() fails — confirm the data
# always has at least three columns before relying on this form.
df[!colSums(apply(df, 1, diff) <= 0), ]

#    COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
# 7      1     3     4     5     6     7
# 10     1     3     5     7     9    10
Run Code Online (Sandbox Code Playgroud)

技巧在于:对数值向量使用 `!` 时,非零值会被转换为 FALSE,零会被转换为 TRUE

# `!` coerces the numbers to logical first (0 is FALSE, anything else TRUE)
# and then negates, so only the zero in -3:3 comes out TRUE.
!(-3:3)
# [1] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
Run Code Online (Sandbox Code Playgroud)

方法 2:Map+ Reduce(更快)

# Map() compares every column with the one to its left (df[-1] drops the
# first column, df[-ncol(df)] drops the last), giving one logical vector per
# adjacent pair. Reduce(`&`, ...) ANDs them into a single row mask that is
# TRUE only where every comparison holds.
df[Reduce(`&`, Map(`>`, df[-1], df[-ncol(df)])), ]
Run Code Online (Sandbox Code Playgroud)

在更大的数据集上进行基准测试

library(microbenchmark)

# Time the proposed row filters against each other. Each named argument is
# one candidate expression; every one subsets `df` with a row mask built from
# comparisons between neighbouring columns.
bm <- microbenchmark(
  MrFlick = df[Reduce(function(x, y) { list(y, x[[2]] & (x[[1]] < y)) }, df, init = list(df[[1]]-1, TRUE))[[2]], ],
  Darren_1 = df[!colSums(apply(df, 1, diff) <= 0), ],
  Darren_2 = df[Reduce(`&`, Map(`>`, df[-1], df[-ncol(df)])), ],
  zx8754_1 = df[ apply(df, 1, function(i) !is.unsorted(i, strictly = TRUE)), ],
  zx8754_2 = df[ apply(df, 1, function(i) all(rank(i) == seq.int(ncol(df)))), ],
  Thomas_1 = df[rowMeans(df[-1] > df[-ncol(df)]) == 1, ],
  Thomas_2 = df[rowSums(df[-1] > df[-ncol(df)]) == ncol(df) - 1, ],
  # `setup` rebuilds the test data; it is not one of the timed expressions.
  setup = {
    # 10,000 x 100 data frame of uniform random numbers between 0 and 100
    df <- as.data.frame(matrix(runif(1e6, 0, 100), 1e4, 1e2))
    # pick 100 rows to sort
    ind <- sample(1:1e4, 100)
    # sort the values within each picked row, so those rows are increasing
    # (barring ties) and the filters have something to keep
    df[ind, ] <- t(apply(df[ind, ], 1, sort))
  }
)

Unit: milliseconds
     expr        min         lq      mean     median        uq       max neval
  MrFlick   7.470067   8.731615  12.28941   9.275596  14.16288 101.26516   100
 Darren_1 218.653701 239.837480 280.04091 264.328409 318.58590 451.99662   100
 Darren_2   9.142422  10.514188  13.94889  10.982153  16.88846  36.41603   100
 zx8754_1  51.888907  61.551056  75.29113  65.437290  73.57925 224.47821   100
 zx8754_2 388.489916 411.315632 448.10555 422.858870 475.91631 732.39909   100
 Thomas_1  12.699829  14.744726  19.51238  15.957306  20.82237  94.52955   100
 Thomas_2  12.592598  14.836320  18.21198  15.572325  20.95979  40.23583   100
Run Code Online (Sandbox Code Playgroud)

在此输入图像描述


MrF*_*ick 11

稍加处理,您就可以使用 Reduce 来实现此目的。例如:

# Walk across the columns from left to right, carrying a two-element state:
# [[1]] is the previous column and [[2]] is a row mask that stays TRUE while
# every column so far has been strictly greater than the one before it.
# Seeding the state with the first column minus 1 lets the first comparison
# pass for ordinary finite numbers.
keep <- Reduce(
  function(state, column) {
    still_increasing <- state[[2]] & (state[[1]] < column)
    list(column, still_increasing)
  },
  dd,
  init = list(dd[[1]] - 1, TRUE)
)[[2]]
which(keep)
# [1]  7 10
dd[keep, ]
#    COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
# 7      1     3     4     5     6     7
# 10     1     3     5     7     9    10
Run Code Online (Sandbox Code Playgroud)

测试所用的数据:

# Rebuild the example data from the question. The header line has one field
# fewer than the data lines, so read.table() takes the first field of every
# data line as the row name and the remaining six as COL_1 ... COL_6.
dd <- read.table(
  header = TRUE,
  text = "
COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
1     1     1     1     1     1     1
2     1     1     1     1     1     2
3     1     1     1     1     1     3
4     1     1     1     1     1     4
5     1     2     1     1     1     5
6     1     1     1     1     1     6
7     1     3     4     5     6     7
8     1     1     1     1     1     8
9     1     1     9     1     1     9
10     1     3     5     7     9    10"
)
Run Code Online (Sandbox Code Playgroud)


one*_*one 5

一种基础 R(base R)方法

# apply(df, 1, diff) gives one column of successive differences per row of
# df; a row is kept only when all ncol(df) - 1 of them are positive.
df[colSums(apply(df, 1, diff) > 0) == ncol(df) - 1, ]

   COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
7      1     3     4     5     6     7
10     1     3     5     7     9    10
Run Code Online (Sandbox Code Playgroud)


Tho*_*ing 5

可以尝试以下几种基础 R(base R)方案

  • rowMeans
> df[rowMeans(df[-1] - df[-ncol(df)] > 0) == 1, ]
   COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
7      1     3     4     5     6     7
10     1     3     5     7     9    10
Run Code Online (Sandbox Code Playgroud)
  • rowSums
> df[rowSums(df[-1] > df[-ncol(df)]) == ncol(df) - 1, ]
   COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
7      1     3     4     5     6     7
10     1     3     5     7     9    10
Run Code Online (Sandbox Code Playgroud)
  • NA+&
> df[complete.cases(NA & (df[-1] <= df[-ncol(df)])), ]
   COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
7      1     3     4     5     6     7
10     1     3     5     7     9    10
Run Code Online (Sandbox Code Playgroud)

基准测试(借自 @Darren Tsai)

# Time the candidate row filters on the same kind of input. Each named
# argument is one expression that subsets `df` with a row mask built from
# comparisons between neighbouring columns; tic1-tic3 are the rowMeans,
# rowSums and NA-and-complete.cases variants.
bm <- microbenchmark(
    MrFlick = df[Reduce(function(x, y) {
        list(y, x[[2]] & (x[[1]] < y))
    }, df, init = list(df[[1]] - 1, TRUE))[[2]], ],
    Darren_1 = df[!colSums(apply(df, 1, diff) <= 0), ],
    Darren_2 = df[Reduce(`&`, Map(`>`, df[-1], df[-ncol(df)])), ],
    zx8754_1 = df[apply(df, 1, function(i) !is.unsorted(i, strictly = TRUE)), ],
    zx8754_2 = df[apply(df, 1, function(i) all(rank(i) == seq.int(ncol(df)))), ],
    tic1 = df[rowMeans(df[-1] > df[-ncol(df)]) == 1, ],
    tic2 = df[rowSums(df[-1] > df[-ncol(df)]) == ncol(df) - 1, ],
    # NA & TRUE is NA while NA & FALSE is FALSE, so any row with a
    # non-increasing step contains an NA and complete.cases() drops it.
    tic3 = df[complete.cases(NA & (df[-1] <= df[-ncol(df)])), ],
    # `setup` rebuilds the test data; it is not one of the timed expressions.
    setup = {
        # 10,000 x 100 data frame of uniform random numbers between 0 and 100
        df <- as.data.frame(matrix(runif(1e6, 0, 100), 1e4, 1e2))
        # choose 100 rows and sort the values within each, so those rows are
        # increasing (barring ties) and the filters have something to keep
        ind <- sample(1:1e4, 1e2)
        df[ind, ] <- t(apply(df[ind, ], 1, sort))
    },
    # run each expression 10 times
    times = 10L,
    # print timings as ratios rather than absolute times
    unit = "relative"
)
Run Code Online (Sandbox Code Playgroud)

结果如下:

> bm
Unit: relative
     expr        min         lq      mean    median         uq        max neval
  MrFlick  0.9080191  0.9144448  1.028867  1.127537  1.0952573  1.0503769    10
 Darren_1 16.5282125 17.7915946 19.161257 19.417784 20.7691135 19.3924344    10
 Darren_2  1.0000000  1.0000000  1.000000  1.000000  1.0000000  1.0000000    10
 zx8754_1  4.3833846  4.5916794  4.958092  4.617921  4.1234763  9.0226479    10
 zx8754_2 27.4681979 27.4405513 25.276613 26.550560 22.5151429 24.6191662    10
     tic1  1.0823147  1.3835146  1.997294  1.511849  1.5489377  6.0274525    10
     tic2  1.0455388  1.0989379  1.016991  1.069731  0.9690896  0.9463357    10
     tic3  2.1156887  2.1455595  2.289243  2.501887  2.1517687  2.5138369    10
Run Code Online (Sandbox Code Playgroud)