Luk*_*Sci 11 performance for-loop r
我对 R 比较陌生。我创建了代码来检查数据框并根据特定条件识别数据行,并用 1 和“检查”列标记这些行。该代码与测试数据完全按照我的预期工作。我的问题是真实的数据集有 100 万多行,虽然它可以工作,但速度太慢了。我希望能帮助提高这段代码的效率。
#create test data
alarm <- c(0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0)
setpoint <- c(10,10,10,10,10,10,10,10,8,8,9,8,8,10,10,10,10,10,10,10,10,10,10,10,8,10,10,8,10,10,10)
temp <- data.frame(alarm, setpoint)
#create a new column to capture if there is any changes to setpoint after any alarm
temp$check <- ""
#review everyrow in dataframe
for(i in 1:nrow(temp)){
cat(round(i/nrow(temp)*100,2),"% \r") # prints the percentage complete in realtime.
if(temp$alarm[i]==1 && temp$setpoint[i] >= 10){
#for when alarm has occurred and the setpoint is 10 or above review the next 5 rows
for(j in 0:5){
if(temp$setpoint[i] != temp$setpoint[i+j]){
#for when there has been a change in the setpoint
for(j in 0:10){
if(temp$setpoint[i] != temp$setpoint[i+j]){
temp$check[i+j]<-'1'
if(temp$setpoint[i+j] != (temp$setpoint[i+j+1])){break}
}
}
}
}
}
}
> print(temp)
alarm setpoint check
1 0 10
2 0 10
3 0 10
4 0 10
5 0 10
6 0 10
7 1 10
8 1 10
9 0 8 1
10 0 8 1
11 0 9
12 0 8
13 0 8
14 0 10
15 0 10
16 0 10
17 1 10
18 0 10
19 0 10
20 0 10
21 0 10
22 1 10
23 0 10
24 0 10
25 0 8 1
26 0 10
27 0 10
28 0 8
29 0 10
30 0 10
31 0 10
Run Code Online (Sandbox Code Playgroud)
Wal*_*ldi 11
为了提高效率,由于循环已经编写,您可以使用Rcpp.
C++语法与语法相差并不远R,主要变化是:
j重命名为k,因为虽然j在 中保持工作R很容易出错并且在 中不起作用C++,因为第二个内部循环将被覆盖i+j或i+k永远不要超过行总数这导致check_data.cpp:
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
DataFrame check_data(DataFrame df) {
NumericVector alarm = df["alarm"];
NumericVector setpoint = df["setpoint"];
int n = alarm.size();
LogicalVector check(n);
int i,j,k;
for(i=0; i<n;i++){
if(alarm[i]==1 && setpoint[i] >= 10){
Rcout << "pct = " << i*100/n << "%" << std::endl; // prints the percentage complete in realtime.
//for when alarm has occured and the setpoint is 10 or above review the next 5 rows
for(j=1; j<5; j++){
if (i+j > n-1) break;
if(setpoint[i] != setpoint[i+j]){
//for when there has been a change in the setpoint
for(k=1; k<10;k++){
if (i+k> n-1) break;
if(setpoint[i] != setpoint[i+k]){
check[i+k] = true;
if (i+k+1> n-1) break;
if(setpoint[i+k] != (setpoint[i+k+1])){break;}
}
}
}
}
}
}
df["check"]=check;
return(df);
}
// You can include R code blocks in C++ files processed with sourceCpp
// (useful for testing and development). The R code will be automatically
// run after the compilation.
//
/*** R
alarm <- c(0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0)
setpoint <- c(10,10,10,10,10,10,10,10,8,8,9,8,8,10,10,10,10,10,10,10,10,10,10,10,8,10,10,8,10,10,10)
temp <- data.frame(alarm, setpoint)
check_data(temp)
*/
Run Code Online (Sandbox Code Playgroud)
然后,您可以check_data通过运行以下命令使函数在 R 环境中可用:
library(Rcpp)
sourceCpp('check_data.cpp')
check_data(temp)
pct = 19%
pct = 22%
pct = 51%
pct = 67%
alarm setpoint check
1 0 10 FALSE
2 0 10 FALSE
3 0 10 FALSE
4 0 10 FALSE
5 0 10 FALSE
6 0 10 FALSE
7 1 10 FALSE
8 1 10 FALSE
9 0 8 TRUE
10 0 8 TRUE
11 0 9 FALSE
12 0 8 FALSE
13 0 8 FALSE
14 0 10 FALSE
15 0 10 FALSE
16 0 10 FALSE
17 1 10 FALSE
18 0 10 FALSE
19 0 10 FALSE
20 0 10 FALSE
21 0 10 FALSE
22 1 10 FALSE
23 0 10 FALSE
24 0 10 FALSE
25 0 8 TRUE
26 0 10 FALSE
27 0 10 FALSE
28 0 8 FALSE
29 0 10 FALSE
30 0 10 FALSE
31 0 10 FALSE
Run Code Online (Sandbox Code Playgroud)
性能对比:
Unit: microseconds
expr min lq mean median uq max neval
ref 13051.8 16832.4 18510.316 18448.95 19930.10 28335.9 100
Rcpp 68.3 108.7 179.845 168.60 236.85 515.1 100
Run Code Online (Sandbox Code Playgroud)
如果我正确理解你的目标,也许你可以尝试data.table下面的方法
library(data.table)
setDT(temp)[
,
check := +({
d <- cumsum(c(FALSE, diff(setpoint)) != 0) == 1
d & min(c(which(d), Inf)) <= 5
}),
cumsum(alarm == 1 & setpoint >= 10)
]
Run Code Online (Sandbox Code Playgroud)
这使
alarm setpoint check
1: 0 10 0
2: 0 10 0
3: 0 10 0
4: 0 10 0
5: 0 10 0
6: 0 10 0
7: 1 10 0
8: 1 10 0
9: 0 8 1
10: 0 8 1
11: 0 9 0
12: 0 8 0
13: 0 8 0
14: 0 10 0
15: 0 10 0
16: 0 10 0
17: 1 10 0
18: 0 10 0
19: 0 10 0
20: 0 10 0
21: 0 10 0
22: 1 10 0
23: 0 10 0
24: 0 10 0
25: 0 8 1
26: 0 10 0
27: 0 10 0
28: 0 8 0
29: 0 10 0
30: 0 10 0
31: 0 10 0
Run Code Online (Sandbox Code Playgroud)
使用管道可以很简单地完成此操作dplyr。我没有测试过速度,但肯定会比你的方法快很多......
library(dplyr)
alarm <- c(0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0)
setpoint <- c(10,10,10,10,10,10,10,10,8,8,9,8,8,10,10,10,10,10,
10,10,10,10,10,10,8,10,10,8,10,10,10)
temp <- data.frame(alarm, setpoint)
temp %>%
mutate(grp = cumsum(alarm == 1 & setpoint >=10)) %>% #set grouping variable
group_by(grp) %>%
mutate(check = as.numeric((cumsum(c(0, diff(setpoint)) != 0) == 1) &
(row_number() <= 5))) %>% #see note
ungroup() %>%
select(-grp) #remove grouping variable
alarm setpoint check
<dbl> <dbl> <dbl>
1 0 10 0
2 0 10 0
3 0 10 0
4 0 10 0
5 0 10 0
6 0 10 0
7 1 10 0
8 1 10 0
9 0 8 1
10 0 8 1
11 0 9 0
12 0 8 0
13 0 8 0
14 0 10 0
15 0 10 0
16 0 10 0
17 1 10 0
18 0 10 0
19 0 10 0
20 0 10 0
21 0 10 0
22 1 10 0
23 0 10 0
24 0 10 0
25 0 8 1
26 0 10 0
27 0 10 0
28 0 8 0
29 0 10 0
30 0 10 0
31 0 10 0
Run Code Online (Sandbox Code Playgroud)
注意 - 此行用于diff检查setpoint每个组内的更改(用零填充以保持长度相同),并设置check以识别第一个更改之后和第二个更改之前的项目,前提是它们位于前五行内该组的。将as.numeric其从逻辑更改为数字 (0/1),这更接近您正在做的事情。
这个答案提供了示例数据集的正确答案,但没有提供 @Luke_DataSci 的实际数据集。
这是一个潜在的“强力”解决方案,应该会更快:
library(dplyr)
alarm <- c(0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0)
setpoint <- c(10,10,10,10,10,10,10,10,8,8,8,8,8,10,10,10,10,10,10,10,10,10,10,10,8,10,10,8,10,10,10)
test_dataset_1 <- data.frame(alarm, setpoint)
alarm2 <- c(0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0)
setpoint2 <- c(10,10,10,10,10,10,10,10,8,8,9,8,8,10,10,10,10,10,10,10,10,10,10,10,8,10,10,8,10,10,10)
test_dataset_2 <- data.frame(alarm2, setpoint2)
ifelse_func <- function(df){
df$check <- ifelse(
(lag(df$alarm, n = 1, default = 0) == 1 &
lag(df$setpoint, n = 1, default = 0) >= 10 &
df$setpoint != 10) |
(lag(df$alarm, n = 2, default = 0) == 1 &
lag(df$setpoint, n = 2, default = 0) >= 10 &
df$setpoint != 10 &
df$setpoint == lag(df$setpoint, n = 1, default = 0)) |
(lag(df$alarm, n = 3, default = 0) == 1 &
lag(df$setpoint, n = 3, default = 0) >= 10 &
df$setpoint != 10 &
(df$setpoint == lag(df$setpoint, n = 1, default = 0) |
lag(df$setpoint, n = 1, default = 0) == 10) &
(df$setpoint == lag(df$setpoint, n = 2, default = 0) |
lag(df$setpoint, n = 2, default = 0) == 10)) |
(lag(df$alarm, n = 4, default = 0) == 1 &
lag(df$setpoint, n = 4, default = 0) >= 10 &
df$setpoint != 10 &
(df$setpoint == lag(df$setpoint, n = 1, default = 0) |
lag(df$setpoint, n = 1, default = 0) == 10) &
(df$setpoint == lag(df$setpoint, n = 2, default = 0) |
lag(df$setpoint, n = 2, default = 0) == 10) &
(df$setpoint == lag(df$setpoint, n = 3, default = 0) |
lag(df$setpoint, n = 3, default = 0) == 10)) |
(lag(df$alarm, n = 5, default = 0) == 1 &
lag(df$setpoint, n = 5, default = 0) >= 10 &
df$setpoint != 10 &
(df$setpoint == lag(df$setpoint, n = 1, default = 0) |
lag(df$setpoint, n = 1, default = 0) == 10) &
(df$setpoint == lag(df$setpoint, n = 2, default = 0) |
lag(df$setpoint, n = 2, default = 0) == 10) &
(df$setpoint == lag(df$setpoint, n = 3, default = 0) |
lag(df$setpoint, n = 3, default = 0) == 10) &
(df$setpoint == lag(df$setpoint, n = 4, default = 0) |
lag(df$setpoint, n = 4, default = 0) == 10)),
1, "")
return(df)
}
forloop_func <- function(df){
df$check <- ""
for(i in 1:nrow(df)){
# cat(round(i/nrow(temp)*100,2),"% \r") # prints the percentage complete in realtime.
if(df$alarm[i]==1 && df$setpoint[i] >= 10){
#for when alarm has occurred and the setpoint is 10 or above review the next 5 rows
for(j in 0:5){
if(df$setpoint[i] != df$setpoint[i+j]){
#for when there has been a change in the setpoint
for(j in 0:10){
if(df$setpoint[i] != df$setpoint[i+j]){
df$check[i+j]<-'1'
if(df$setpoint[i+j] != (df$setpoint[i+j+1])){break}
}
}
}
}
}
}
return(df)
}
all_equal(ifelse_func(test_dataset_1), forloop_func(test_dataset_1))
#> [1] TRUE
all_equal(ifelse_func(test_dataset_2), forloop_func(test_dataset_2))
#> [1] TRUE
library(microbenchmark)
library(ggplot2)
res <- microbenchmark(ifelse_func(test_dataset_2),
forloop_func(test_dataset_2),
times = 10)
autoplot(res) + ggtitle("Time difference for 31 rows")
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.
Run Code Online (Sandbox Code Playgroud)

set.seed(123)
temp2 <- data.frame(alarm = sample(alarm, 1000, replace = TRUE),
setpoint = sample(setpoint, 1000, replace = TRUE))
res2 <- microbenchmark(ifelse_func(temp2), forloop_func(temp2), times = 10)
autoplot(res2) + ggtitle("Time difference for 1,000 rows")
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.
Run Code Online (Sandbox Code Playgroud)

temp3 <- data.frame(alarm = sample(alarm, 10000, replace = TRUE),
setpoint = sample(setpoint, 10000, replace = TRUE))
res3 <- microbenchmark(ifelse_func(temp3), forloop_func(temp3), times = 10)
autoplot(res3) + ggtitle("Time difference for 10,000 rows")
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.
Run Code Online (Sandbox Code Playgroud)

temp4 <- data.frame(alarm = sample(alarm, 100000, replace = TRUE),
setpoint = sample(setpoint, 100000, replace = TRUE))
res4 <- microbenchmark(ifelse_func(temp4), forloop_func(temp4), times = 6)
autoplot(res4) + ggtitle("Time difference for 100,000 rows")
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.
Run Code Online (Sandbox Code Playgroud)

对于 100 万行:
temp5 <- data.frame(alarm = sample(alarm, 1000000, replace = TRUE),
setpoint = sample(setpoint, 1000000, replace = TRUE))
Unit: milliseconds
expr min lq mean median uq max neval cld
ifelse_func(temp5) 873.8556 873.8556 1181.997 1181.997 1490.138 1490.138 2 a
forloop_func(temp5) 292242.7181 292242.7181 295101.463 295101.463 297960.208 297960.208 2 b
Run Code Online (Sandbox Code Playgroud)
由reprex 包(v2.0.1)于 2022-04-07 创建
因此,尽管比 31 行的 for 循环方法慢约 3 倍,但这种方法在 100 万行时快约 250 倍。
现在的问题是它是否提供了正确的答案......
| 归档时间: |
|
| 查看次数: |
699 次 |
| 最近记录: |