假设我有这样的事情:
set.seed(0)
the.df <- data.frame( x=rep(letters[1:3], each=4),
n=rep(0:3, 3),
val=round(runif(12)))
the.df
x n val
1 a 0 1
2 a 1 0
3 a 2 0
4 a 3 1
5 b 0 1
6 b 1 0
7 b 2 1
8 b 3 1
9 c 0 1
10 c 1 1
11 c 2 0
12 c 3 0
Run Code Online (Sandbox Code Playgroud)
在每个内部x,从n==2(从小到大)开始,val如果前一个val(按照n)为0 ,我想设置为0; 否则,保持原样.
例如,在子集中x=="b",我首先忽略n<2 行中的两行.现在,在第7行中,因为前一个val是0(the.df$val[the.df$x=="b" & the.df$n==1]),所以我设置val为0(the.df$val[the.df$x=="b" & the.df$n==2] <- 0).然后在第8行,现在val前一个n是0(我们只是设置它),我也想在val这里设置为0(the.df$val[the.df$x=="b" & the.df$n==3] <- 0).
想象一下,data.frame没有排序.因此,依赖于订单的程序需要排序.我也不能假设存在相邻的行(例如,行the.df[the.df$x=="a" & the.df$n==1, ]可能丢失).
最棘手的部分似乎是val按顺序进行评估.我可以使用循环来做到这一点,但我想它会效率低下(我有数百万行).有没有办法可以更有效地做到这一点?
编辑:想要输出
the.df
x n val wanted
1 a 0 1 1
2 a 1 0 0
3 a 2 0 0
4 a 3 1 0
5 b 0 1 1
6 b 1 0 0
7 b 2 1 0
8 b 3 1 0
9 c 0 1 1
10 c 1 1 1
11 c 2 0 0
12 c 3 0 0
Run Code Online (Sandbox Code Playgroud)
另外,我不介意制作新列(例如,将所需的值放在那里).
使用data.table我会尝试以下
library(data.table)
setDT(the.df)[order(n),
val := if(length(indx <- which(val[2:.N] == 0L)))
c(val[1:(indx[1L] + 1L)], rep(0L, .N - (indx[1L] + 1L))),
by = x]
the.df
# x n val
# 1: a 0 1
# 2: a 1 0
# 3: a 2 0
# 4: a 3 0
# 5: b 0 1
# 6: b 1 0
# 7: b 2 0
# 8: b 3 0
# 9: c 0 1
# 10: c 1 1
# 11: c 2 0
# 12: c 3 0
Run Code Online (Sandbox Code Playgroud)
这将同时对数据进行排序n(如您所说,它在现实生活中没有排序)并按val条件重新创建(意味着如果条件不满足,val将不受影响).
希望在不久的将来,这将实现,然后代码可能是
setDT(the.df)[order(n), val[n > 2] := if(val[2L] == 0) 0L, by = x]
Run Code Online (Sandbox Code Playgroud)
这可能是性能和语法方面的巨大改进
嗯,如果你切换到data.table......应该会非常有效
library(data.table)
# Define the.df as a data.table (or use data.table::setDT() function)
set.seed(0)
the.df <- data.table(
x = rep(letters[1:3], each = 4),
n = rep(0:3, 3),
val = round(runif(12))
)
m_dz <- function() {
setorder(the.df, x, n)
repeat{
# Get IDs of rows to change
# ids <- which(the.df[, (n > 1) & (val == 1) & (shift(val, 1L, type = "lag") == 0)])
ids <- the.df[(n > 1) & (val == 1) & (shift(val, 1L, type = "lag") == 0), , which = TRUE]
# If no IDs break
if(length(ids) == 0){
break
}
# Set val to 0
# for (i in ids) set(the.df, i = i, j = "val", value = 0)
set(the.df, i = ids, j = "val", value = 0)
}
return(the.df)
}
Run Code Online (Sandbox Code Playgroud)
which = TRUE和set(the.df, i = ids, j = "val", value = 0),这使得计时更加稳定(没有非常高的最大计时)。m-dz()已更新(@FoldedChromatin 的答案由于结果不同而被跳过)。我的函数在中位数和上分位数方面稍快一些,但时间上有很大的差距(参见最大值...),我不明白为什么。希望计时方法是正确的(将结果返回到不同的对象等)。
任何更大的东西都会毁掉我的电脑:(
set.seed(0)
groups_ids <- replicate(300, paste(sample(LETTERS, 5, replace=TRUE), collapse = ""))
size1 <- length(unique(groups_ids))
size2 <- round(1e7/size1)
the.df1 <- data.table(
x = rep(groups_ids, each = size2), # 52 * 500 = 26000
n = rep(0:(size2-1), size1),
val = round(runif(size1*size2))
)
the.df2 <- copy(the.df1)
# m-dz
m_dz <- function() {
setorder(df1, x, n)
repeat{
ids <- df1[(n > 1) & (val == 1) & (shift(val, 1L, type = "lag") == 0), , which = TRUE]
if(length(ids) == 0){
break
}
set(df1, i = ids, j = "val", value = 0)
}
return(df1)
}
# David Arenburg
DavidArenburg <- function() {
setorder(df2, x, n)
df2[, val := if(length(indx <- which.max(val[2:.N] == 0) + 1L)) c(val[1:indx], rep(0L, .N - indx)), by = x]
return(df2)
}
library(microbenchmark)
microbenchmark(
res1 <- m_dz(),
res2 <- DavidArenburg(),
times = 100
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# res1 <- m_dz() 247.4136 268.5005 363.0117 288.4216 312.7307 7071.0960 100 a
# res2 <- DavidArenburg() 270.6074 281.3935 314.7864 303.5229 328.1210 525.8095 100 a
identical(res1, res2)
# [1] TRUE
Run Code Online (Sandbox Code Playgroud)
set.seed(0)
groups_ids <- replicate(300, paste(sample(LETTERS, 5, replace=TRUE), collapse = ""))
size1 <- length(unique(groups_ids))
size2 <- round(1e8/size1)
# Unit: seconds
# expr min lq mean median uq max neval cld
# res1 <- m_dz() 5.599855 5.800264 8.773817 5.923721 6.021132 289.85107 100 a
# res2 <- m_dz2() 5.571911 5.836191 9.047958 5.970952 6.123419 310.65280 100 a
# res3 <- DavidArenburg() 9.183145 9.519756 9.714105 9.723325 9.918377 10.28965 100 a
Run Code Online (Sandbox Code Playgroud)