Emi*_*ily 3 time r match datetime-format
我有一个数据框,其中包含两个HH:MM:SS格式的'时间'列.我想压缩这个数据框,使每个唯一的'id'值只保留一行.对于每个唯一的'id'值,我想保留'time1'值与'time2'值最接近的那一行.但是,'time1'必须大于'time2'.
这是一个简单的例子:
> dput(df)
structure(list(id = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 4L, 4L, 4L, 4L), count = c(23L, 23L, 23L, 23L, 45L, 45L,
45L, 45L, 67L, 67L, 67L, 67L, 88L, 88L, 88L, 88L), time1 = structure(c(1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 3L), .Label = c("00:13:00",
"01:13:00", "07:18:00", "18:14:00"), class = "factor"), time2 = structure(c(4L,
1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L), .Label = c("00:00:00",
"06:00:00", "12:00:00", "18:00:00"), class = "factor"), afn = c(3.36,
0.63, 1.77, 3.89, 3.36, 0.63, 1.77, 3.89, 3.36, 0.63, 1.77, 3.89,
3.36, 0.63, 1.77, 3.89), dfn = c(201.67, 157.27, 103.55, 191.41,
201.67, 157.27, 103.55, 191.41, 201.67, 157.27, 103.55, 191.41,
201.67, 157.27, 103.55, 191.41)), .Names = c("id", "count", "time1",
"time2", "afn", "dfn"), class = "data.frame", row.names = c(NA,
-16L))
> df
id count time1 time2 afn dfn
1 1 23 00:13:00 18:00:00 3.36 201.67
2 1 23 00:13:00 00:00:00 0.63 157.27
3 1 23 00:13:00 06:00:00 1.77 103.55
4 1 23 00:13:00 12:00:00 3.89 191.41
5 2 45 01:13:00 18:00:00 3.36 201.67
6 2 45 01:13:00 00:00:00 0.63 157.27
7 2 45 01:13:00 06:00:00 1.77 103.55
8 2 45 01:13:00 12:00:00 3.89 191.41
9 3 67 18:14:00 18:00:00 3.36 201.67
10 3 67 18:14:00 00:00:00 0.63 157.27
11 3 67 18:14:00 06:00:00 1.77 103.55
12 3 67 18:14:00 12:00:00 3.89 191.41
13 4 88 07:18:00 18:00:00 3.36 201.67
14 4 88 07:18:00 00:00:00 0.63 157.27
15 4 88 07:18:00 06:00:00 1.77 103.55
16 4 88 07:18:00 12:00:00 3.89 191.41
Run Code Online (Sandbox Code Playgroud)
在上述情况下,我想最终得到这个矩阵:
id count time1 time2 afn dfn
1 23 00:13:00 00:00:00 0.63 157.27
2 45 01:13:00 00:00:00 0.63 157.27
3 67 18:14:00 18:00:00 3.36 201.67
4 88 07:18:00 06:00:00 1.77 103.55
Run Code Online (Sandbox Code Playgroud)
我过去曾使用ddply()函数来压缩数据框,但没有在其中结合过匹配规则.我需要把它应用到一个包含大量列的数据框上(比这里给出的简单示例多得多),所以任何关于如何做到这一点的建议都会非常棒.任何帮助将不胜感激.非常感谢!
以下是一些解决方案.
1)ave 这使用chron包的times以及R基础包中的subset和ave:
library(chron)
# For each id, keep the row whose time1 is later than time2 and whose
# time1 - time2 gap is the smallest.
# The gap is stored as a column so that subset() filters it together with the
# rows; a free-standing vector would keep its original length and no longer
# line up with df2$id inside ave().
df2 <- transform(df, delta = as.vector(times(time1) - times(time2)))
df2 <- subset(df2, delta > 0)
# ave() yields 1 on the rows that hold their id's minimum gap and 0 elsewhere.
is_closest <- ave(df2$delta, df2$id, FUN = function(gap) gap == min(gap)) == 1
# Selecting names(df) drops the helper delta column from the result.
df2[is_closest, names(df)]
Run Code Online (Sandbox Code Playgroud)
2)dplyr这使用chron times和dplyr包:
library(chron)
library(dplyr)
# For each id, keep the row with the smallest positive time1 - time2 gap.
# `%>%` is used because the older `%.%` chaining operator is no longer
# provided by current dplyr releases.
df %>%
  mutate(delta = as.vector(times(time1) - times(time2))) %>%
  filter(delta > 0) %>%            # time1 must be later than time2
  group_by(id) %>%
  filter(delta == min(delta)) %>%  # closest time2 within each id
  ungroup() %>%                    # do not leak the grouping to later steps
  select(-delta)                   # drop the helper column
Run Code Online (Sandbox Code Playgroud)
3)sqldf
library(sqldf)
# The inner query keeps the rows whose time1 is later than time2; the outer
# query groups them by id and takes the minimum time1 - time2 difference,
# with strftime('%s', ...) turning each time into a number of seconds.
# The query appends a delta column after the columns of df, so keeping only
# the first ncol(df) columns removes it again.
sqldf(
"select *, min(strftime('%s', time1) - strftime('%s', time2)) delta
from (select * from df where strftime('%s', time1) > strftime('%s', time2))
group by id"
)[, seq_along(df), drop = FALSE]
Run Code Online (Sandbox Code Playgroud)
或者使用下面这个变体:我们先在R中计算delta,然后使用sqldf:
library(sqldf)
library(chron)
# Compute the time1 - time2 gap in R, then let SQL pick the per-id minimum
# among the rows where the gap is positive.
df2 <- transform(df, delta = as.vector(times(time1) - times(time2)))
# The query returns every column of df2 (which already includes delta) plus
# the min(delta) column, i.e. two trailing helper columns. Keeping only the
# first ncol(df) columns drops both of them; indexing with [-ncol(df2)]
# would remove just one and leave a delta column in the output.
sqldf("select *, min(delta) delta
from (select * from df2 where delta > 0)
group by id")[seq_along(df)]
Run Code Online (Sandbox Code Playgroud)
4)data.table
library(data.table)
library(chron)
# Convert df to a data.table and add the time1 - time2 gap as column delta.
DT <- as.data.table(df)
DT[, delta := times(time1) - times(time2)]
# Keep the rows with time1 later than time2, then within each id the row(s)
# with the smallest delta, and finally only the first ncol(df) columns,
# which leaves out the delta helper.
DT[delta > 0][
  , .SD[delta == min(delta)], by = id
][
  , seq_along(df), with = FALSE
]
Run Code Online (Sandbox Code Playgroud)
增加了额外的解决方案.更正了library和subset语句.小改进.