G-s*_*pot 0 r dplyr data.table tidyr
mydata
# Example target table: 12 rows, two workers (ID_WORKES) with six ID_SP_0R
# entries each. Only the very last row carries non-zero KOD_DEPO, KOD_DOR and
# COLUMN_MASH values; prop_violations and mash_score are zero throughout.
mydata <- data.frame(
  ID_WORKES = rep(c(1000561L, 1000562L), each = 6L),
  ID_SP_0R = rep(c(21L, 463L, 465L, 500L, 600L, 1951L), times = 2L),
  KOD_DEPO = c(rep(0L, 11L), 1L),
  KOD_DOR = c(rep(0L, 11L), 1L),
  COLUMN_MASH = c(rep(0L, 11L), 1L),
  prop_violations = rep(0L, 12L),
  mash_score = rep(0L, 12L)
)
Run Code Online (Sandbox Code Playgroud)
第二个数据集的格式如下:
# Example lookup table: six records keyed by ID_MASH (four for 1000561, two
# for 1000562). prop_violations is stored as a factor whose levels are the
# sorted string representations of the proportions.
mydata2 <- data.frame(
  ID_SP_NAR = c(146L, 1088L, 1612L, 30L, 745L, 905L),
  KOD_DEPO = c(4575L, 8998L, 8134L, 4038L, 9540L, 683L),
  KOD_DOR = c(94L, 94L, 76L, 76L, 94L, 94L),
  ID_MASH = rep(c(1000561L, 1000562L), times = c(4L, 2L)),
  COLUMN_MASH = c(10L, 2L, 1L, 1L, 17L, 5L),
  n_routes_total = c(15L, 14L, 25L, 11L, 18L, 4L),
  n_violations = c(15L, 10L, 13L, 8L, 7L, 4L),
  is_violation = rep(1L, 6L),
  prop_violations = factor(
    c("0.06666667", "0.07142857", "0.04000000",
      "0.09090909", "0.05555556", "0.25000000"),
    levels = c("0.04000000", "0.05555556", "0.06666667",
               "0.07142857", "0.09090909", "0.25000000")
  )
)
Run Code Online (Sandbox Code Playgroud)
对于每个 ID_WORKES,如何将 mydata 数据集中变量 KOD_DEPO、KOD_DOR、COLUMN_MASH 的零值,替换为 mydata2 数据集中这些变量的最后一个值?
其中 ID_WORKES = ID_MASH,这是用于联接的关键变量。
期望的输出如下。对于 ID_MASH = 1000561,mydata2 中最后一个 KOD_DEPO 为 4038,KOD_DOR 为 76,COLUMN_MASH 为 1;对于 ID_MASH = 1000562,mydata2 中最后一个 KOD_DEPO 为 683,KOD_DOR 为 94,COLUMN_MASH 为 5。
ID_WORKES ID_SP_0R KOD_DEPO KOD_DOR COLUMN_MASH prop_violations mash_score
1 1000561 21 4038 76 1 0 0
2 1000561 463 4038 76 1 0 0
3 1000561 465 4038 76 1 0 0
4 1000561 500 4038 76 1 0 0
5 1000561 600 4038 76 1 0 0
6 1000561 1951 4038 76 1 0 0
7 1000562 21 683 94 5 0 0
8 1000562 463 683 94 5 0 0
9 1000562 465 683 94 5 0 0
10 1000562 500 683 94 5 0 0
11 1000562 600 683 94 5 0 0
12 1000562 1951 1 1 1 0 0
Run Code Online (Sandbox Code Playgroud)
该怎么做?简单的合并是行不通的。另外,prop_violations 和 mash_score 不需要替换。
首先,您需要把 mydata2 单独缩减为每个 ID_MASH 的“最后”一条记录。请注意,您的数据中除了初始的行顺序之外,没有其他可用于排序的信息,因此这里的“最后”是按行的出现顺序确定的。
library(dplyr)
# Reduce mydata2 to the record that appears last within each ID_MASH group.
# `row` holds the within-group position; it stays in the result because the
# final step of this answer removes it explicitly. The surrounding
# parentheses print the result as well as assigning it.
(last_mash <- mydata2 %>%
  group_by(ID_MASH) %>%
  mutate(row = row_number()) %>%
  filter(row == max(row)))
# A tibble: 2 x 10
# Groups: ID_MASH [2]
ID_SP_NAR KOD_DEPO KOD_DOR ID_MASH COLUMN_MASH n_routes_total n_violations is_violation prop_violations row
<int> <int> <int> <int> <int> <int> <int> <int> <fct> <int>
1 30 4038 76 1000561 1 11 8 1 0.09090909 4
2 905 683 94 1000562 5 4 4 1 0.25000000 2
Run Code Online (Sandbox Code Playgroud)
接下来,我们将其与 mydata 合并,不过这里我使用的是 inner_join:
# Attach each worker's last mydata2 record to every matching mydata row.
# Columns present in both tables keep their plain name for the mydata side
# and get a ".y" suffix for the last_mash side.
mydata %>%
  inner_join(
    last_mash,
    by = c("ID_WORKES" = "ID_MASH"),
    suffix = c("", ".y")
  )
ID_WORKES ID_SP_0R KOD_DEPO KOD_DOR COLUMN_MASH prop_violations mash_score ID_SP_NAR KOD_DEPO.y KOD_DOR.y
1 1000561 21 0 0 0 0 0 30 4038 76
2 1000561 463 0 0 0 0 0 30 4038 76
3 1000561 465 0 0 0 0 0 30 4038 76
4 1000561 500 0 0 0 0 0 30 4038 76
5 1000561 600 0 0 0 0 0 30 4038 76
6 1000561 1951 0 0 0 0 0 30 4038 76
7 1000562 21 0 0 0 0 0 905 683 94
8 1000562 463 0 0 0 0 0 905 683 94
9 1000562 465 0 0 0 0 0 905 683 94
10 1000562 500 0 0 0 0 0 905 683 94
11 1000562 600 0 0 0 0 0 905 683 94
12 1000562 1951 1 1 1 0 0 905 683 94
COLUMN_MASH.y n_routes_total n_violations is_violation prop_violations.y row
1 1 11 8 1 0.09090909 4
2 1 11 8 1 0.09090909 4
3 1 11 8 1 0.09090909 4
4 1 11 8 1 0.09090909 4
5 1 11 8 1 0.09090909 4
6 1 11 8 1 0.09090909 4
7 5 4 4 1 0.25000000 2
8 5 4 4 1 0.25000000 2
9 5 4 4 1 0.25000000 2
10 5 4 4 1 0.25000000 2
11 5 4 4 1 0.25000000 2
12 5 4 4 1 0.25000000 2
Run Code Online (Sandbox Code Playgroud)
您会注意到,有几列出现了两次:一列不带后缀,另一列带 .y 后缀。请试着分辨它们各自来自哪个数据集。
从这里开始,更新零值就轻而易举了:
# Join the last mydata2 record onto mydata, then keep every non-zero value
# as is and fall back to the joined ".y" value where the original is zero.
# Finally drop the helper columns (`row` and everything suffixed ".y").
mydata %>%
  inner_join(
    last_mash,
    by = c("ID_WORKES" = "ID_MASH"),
    suffix = c("", ".y")
  ) %>%
  mutate(
    KOD_DEPO = ifelse(KOD_DEPO != 0, KOD_DEPO, KOD_DEPO.y),
    KOD_DOR = ifelse(KOD_DOR != 0, KOD_DOR, KOD_DOR.y),
    COLUMN_MASH = ifelse(COLUMN_MASH != 0, COLUMN_MASH, COLUMN_MASH.y)
  ) %>%
  select(-row, -ends_with(".y"))
Run Code Online (Sandbox Code Playgroud)
下面是一种使用 base R 的方法。我们根据 ID 对 mydata 和 mydata2 分别进行 split,这样在 Map 调用中相同 ID 的数据会配对在一起;然后用 replace 把 cols 各列中所有的 0 值替换为 mydata2 对应列中的最后一个值。
# Base R approach: split both tables by ID, pair the pieces that share an ID,
# and replace every 0 in `cols` with the last value of the same column in the
# matching mydata2 piece.
#
# Compared with a plain positional Map + rbind this version
#   * pairs the groups by ID (name), so an ID that is missing from mydata2
#     leaves its mydata rows untouched instead of shifting or recycling the
#     pairing;
#   * puts the rows back with unsplit(), so the result lines up with mydata
#     even when mydata is not sorted by ID_WORKES;
#   * uses lapply() instead of sapply(), whose return shape depends on the
#     number of rows in a group.
cols <- c("KOD_DEPO", "KOD_DOR", "COLUMN_MASH")
mydata[cols] <- unsplit(
  Map(
    function(x, y) {
      # No record for this ID in mydata2: nothing to fill in.
      if (is.null(y)) {
        return(x)
      }
      # x[] keeps the data.frame shape while swapping in the updated columns.
      x[] <- lapply(
        cols,
        function(p) replace(x[[p]], x[[p]] == 0, tail(y[[p]], 1))
      )
      x
    },
    split(mydata[cols], mydata$ID_WORKES),
    # Index by name so each mydata group meets the mydata2 group with the
    # same ID; absent IDs yield NULL.
    split(mydata2[cols], mydata2$ID_MASH)[
      as.character(sort(unique(mydata$ID_WORKES)))
    ]
  ),
  mydata$ID_WORKES
)
mydata
# ID_WORKES ID_SP_0R KOD_DEPO KOD_DOR COLUMN_MASH prop_violations mash_score
#1 1000561 21 4038 76 1 0 0
#2 1000561 463 4038 76 1 0 0
#3 1000561 465 4038 76 1 0 0
#4 1000561 500 4038 76 1 0 0
#5 1000561 600 4038 76 1 0 0
#6 1000561 1951 4038 76 1 0 0
#7 1000562 21 683 94 5 0 0
#8 1000562 463 683 94 5 0 0
#9 1000562 465 683 94 5 0 0
#10 1000562 500 683 94 5 0 0
#11 1000562 600 683 94 5 0 0
#12 1000562 1951 1 1 1 0 0
Run Code Online (Sandbox Code Playgroud)
Here is an option with data.table:
# Setup
library(data.table)
# Convert both tables to data.table so the update join below can be used.
setDT(mydata)
setDT(mydata2)
# Columns whose zero values should be filled in from mydata2.
vars2update <- c("KOD_DEPO", "KOD_DOR", "COLUMN_MASH")
# Logical row selector: TRUE only where all three columns equal 0
# (the count of zero cells in the row is exactly 3).
rows2update <- mydata[, rowSums(.SD == 0L) == 3L, .SDcols = vars2update]
# Join and update variables
# For the selected rows (.SD), look up mydata2 where ID_MASH equals the row's
# ID_WORKES; mult = "last" keeps only the last matching mydata2 row per lookup.
# mget(vars2update) returns the three columns from that match, and := assigns
# them into mydata for the selected rows.
mydata[rows2update,
(vars2update) := mydata2[.SD,
on = .(ID_MASH = ID_WORKES),
mult = "last",
mget(vars2update)]]
# ID_WORKES ID_SP_0R KOD_DEPO KOD_DOR COLUMN_MASH prop_violations mash_score
# 1: 1000561 21 4038 76 1 0 0
# 2: 1000561 463 4038 76 1 0 0
# 3: 1000561 465 4038 76 1 0 0
# 4: 1000561 500 4038 76 1 0 0
# 5: 1000561 600 4038 76 1 0 0
# 6: 1000561 1951 4038 76 1 0 0
# 7: 1000562 21 683 94 5 0 0
# 8: 1000562 463 683 94 5 0 0
# 9: 1000562 465 683 94 5 0 0
# 10: 1000562 500 683 94 5 0 0
# 11: 1000562 600 683 94 5 0 0
# 12: 1000562 1951 1 1 1 0 0
Run Code Online (Sandbox Code Playgroud)