将这个丑陋的for循环转换为更友好的R形式

And*_*ier 7 loops aggregate r vectorization

一直在使用SO作为我工作的资源.感谢您召集这样一个伟大的社区.

我正在尝试做一些有点复杂的事情,我现在能想到的唯一方法就是使用一对嵌套的for循环(我知道在R中不赞成)......我有三百万的记录 - 奇数课程注册:学生UserID与CourseID配对.在每一行中,都有一堆数据,包括开始/结束日期和分数等等.我需要做的是,对于每次注册,计算该用户在注册课程之前所参加的课程的平均分数.

我用于for循环的代码如下:

data$Mean.Prior.Score <- 0
for (i in as.numeric(rownames(data)) {
    sum <- 0
    count <- 0
    for (j in as.numeric(rownames(data[data$UserID == data$UserID[i],]))) {
            if (data$Course.End.Date[j] < data$Course.Start.Date[i]) {
                sum <- sum + data$Score[j]
                count <- count + 1
            }
    }
if (count != 0)
    data$Mean.Prior.Score[i] <- sum / count
}
Run Code Online (Sandbox Code Playgroud)

我很确定这会起作用,但它的运行速度非常慢...我的数据框有超过三百万行,但经过10分钟的好处后,外部循环只运行了850条记录.这似乎比时间复杂性所暗示的要慢,特别是考虑到每个用户平均只有5或6个课程.

哦,我应该提一下,我在运行循环之前用as.POSIXct()转换了日期字符串,所以日期比较步骤不应该太慢......

必须有更好的方法来做到这一点...任何建议?


编辑:根据mnel的要求......终于得到dput了很好的发挥.不得不补充一下control = NULL.这是'tis:

structure(list(Username = structure(1:20, .Label = c("100225", 
"100226", "100228", "1013170", "102876", "105796", "106753", 
"106755", "108568", "109038", "110150", "110200", "110350", "111873", 
"111935", "113579", "113670", "117562", "117869", "118329"), class = "factor"), 
User.ID = c(2313737L, 2314278L, 2314920L, 9708829L, 2325896L, 
2315617L, 2314644L, 2314977L, 2330148L, 2315081L, 2314145L, 
2316213L, 2317734L, 2314363L, 2361187L, 2315374L, 2314250L, 
2361507L, 2325592L, 2360182L), Course.ID = c(2106468L, 2106578L, 
2106493L, 5426406L, 2115455L, 2107320L, 2110286L, 2110101L, 
2118574L, 2106876L, 2110108L, 2110058L, 2109958L, 2108222L, 
2127976L, 2106638L, 2107020L, 2127451L, 2117022L, 2126506L
), Course = structure(c(1L, 7L, 10L, 15L, 11L, 19L, 4L, 6L, 
3L, 12L, 2L, 9L, 17L, 8L, 20L, 18L, 13L, 16L, 5L, 14L), .Label = c("ACCT212_A", 
"BIOS200_N", "BIS220_T", "BUSN115_A", "BUSN115_T", "CARD205_A", 
"CIS211_A", "CIS275_X", "CIS438_S", "ENGL112_A", "ENGL112_B", 
"ENGL227_K", "GM400_A", "GM410_A", "HUMN232_M", "HUMN432_W", 
"HUMN445_A", "MATH100_X", "MM575_A", "PSYC110_Y"), class = "factor"), 
Course.Start.Date = structure(c(1098662400, 1098662400, 1098662400, 
1309737600, 1099267200, 1098662400, 1099267200, 1099267200, 
1098662400, 1098662400, 1099267200, 1099267200, 1099267200, 
1098662400, 1104105600, 1098662400, 1098662400, 1104105600, 
1098662400, 1104105600), class = c("POSIXct", "POSIXt"), tzone = "GMT"), 
Term.ID = c(12056L, 12056L, 12056L, 66282L, 12057L, 12056L, 
12057L, 12057L, 12056L, 12056L, 12057L, 12057L, 12057L, 12056L, 
13469L, 12056L, 12056L, 13469L, 12056L, 13469L), Term.Name = structure(c(2L, 
2L, 2L, 4L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 3L, 2L, 
2L, 3L, 2L, 3L), .Label = c("Fall 2004", "Fall 2004 Session A", 
"Fall 2004 Session B", "Summer Session A 2011"), class = "factor"), 
Term.Start.Date = structure(c(1L, 1L, 1L, 4L, 2L, 1L, 2L, 
2L, 1L, 1L, 2L, 2L, 2L, 1L, 3L, 1L, 1L, 3L, 1L, 3L), .Label = c("2004-10-21", 
"2004-10-28", "2004-12-27", "2011-06-26"), class = "factor"), 
Score = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.125, 
0, 0, 0, 0, 0), First.Course.Date = structure(c(1L, 1L, 1L, 
4L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 3L, 1L, 1L, 3L, 
1L, 3L), .Label = c("2004-10-25", "2004-11-01", "2004-12-27", 
"2011-07-04"), class = "factor"), First.Term.Date = structure(c(1L, 
1L, 1L, 4L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 3L, 1L, 
1L, 3L, 1L, 3L), .Label = c("2004-10-21", "2004-10-28", "2004-12-27", 
"2011-06-26"), class = "factor"), First.Timer = c(TRUE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE), Course.Code = structure(c(1L, 
6L, 9L, 13L, 9L, 17L, 4L, 5L, 3L, 10L, 2L, 8L, 15L, 7L, 18L, 
16L, 11L, 14L, 4L, 12L), .Label = c("ACCT212", "BIOS200", 
"BIS220", "BUSN115", "CARD205", "CIS211", "CIS275", "CIS438", 
"ENGL112", "ENGL227", "GM400", "GM410", "HUMN232", "HUMN432", 
"HUMN445", "MATH100", "MM575", "PSYC110"), class = "factor"), 
Course.End.Date = structure(c(1L, 1L, 1L, 4L, 2L, 1L, 2L, 
2L, 1L, 1L, 2L, 2L, 2L, 1L, 3L, 1L, 1L, 3L, 1L, 3L), .Label = c("2004-12-19", 
"2005-02-27", "2005-03-26", "2011-08-28"), class = "factor")), .Names = c("Username", 
"User.ID", "Course.ID", "Course", "Course.Start.Date", "Term.ID", 
"Term.Name", "Term.Start.Date", "Score", "First.Course.Date", 
"First.Term.Date", "First.Timer", "Course.Code", "Course.End.Date"
), row.names = c(NA, 20L), class = "data.frame")
Run Code Online (Sandbox Code Playgroud)

nog*_*pes 1

我发现data.table效果很好。

# Create some data.
library(data.table)
set.seed(1)
n=3e6
numCourses=5 # Average courses per student
data=data.table(UserID=as.character(round(runif(n,1,round(n/numCourses)))),course=1:n,Score=runif(n),CourseStartDate=as.Date('2000-01-01')+round(runif(n,1,365)))
data$CourseEndDate=data$CourseStartDate+round(runif(n,1,100))
setkey(data,UserID)
# test=function(CourseEndDate,Score,CourseStartDate) sapply(CourseStartDate, function(y) mean(Score[y>CourseEndDate]))
# I vastly reduced the number of comparisons with a better "test" function.
test2=function(CourseEndDate,Score,CourseStartDate) {
    o.end = order(CourseEndDate)
    run.avg = cumsum(Score[o.end])/seq_along(CourseEndDate)
    idx=findInterval(CourseStartDate,CourseEndDate[o.end])
    idx=ifelse(idx==0,NA,idx)
    run.avg[idx]
}
system.time(data$MeanPriorScore<-data[,test2(CourseEndDate,Score,CourseStartDate),by=UserID]$V1) 
#  For three million courses, at an average of 5 courses per student:
#    user  system elapsed 
#    122.06    0.22  122.45 
Run Code Online (Sandbox Code Playgroud)

运行测试看看它是否与您的代码相同:

set.seed(1)
n=1e2
data=data.table(UserID=as.character(round(runif(n,1,1000))),course=1:n,Score=runif(n),CourseStartDate=as.Date('2000-01-01')+round(runif(n,1,365)))
data$CourseEndDate=data$CourseStartDate+round(runif(n,1,100))
setkey(data,UserID)
data$MeanPriorScore<-data[,test2(CourseEndDate,Score,CourseStartDate),by=UserID]$V1
data["246"]
#   UserID course     Score CourseStartDate CourseEndDate MeanPriorScore
#1:    246     54 0.4531314      2000-08-09    2000-09-20      0.9437248
#2:    246     89 0.9437248      2000-02-19    2000-03-02             NA

# A comparison with your for loop (slightly modified)
data$MeanPriorScore.old<-NA # Set to NaN instead of zero for easy comparison.
# I think you forgot a bracket here. Also, There is no need to work with the rownames.
for (i in seq(nrow(data))) { 
    sum <- 0
    count <- 0
    # I reduced the complexity of figuring out the vector to loop through.
    # It will result in the exact same thing if there are no rownames.
    for (j in which(data$UserID == data$UserID[i])) {
            if (data$CourseEndDate[j] <= data$CourseStartDate[i]) {
                sum <- sum + data$Score[j]
                count <- count + 1
            }
    }
    # I had to add "[i]" here. I think that is what you meant.
    if (count != 0) data$MeanPriorScore.old[i] <- sum / count 
}
identical(data$MeanPriorScore,data$MeanPriorScore.old)
# [1] TRUE
Run Code Online (Sandbox Code Playgroud)