在 R 中按键将一个数据集中的零值替换为第二个数据集中的最后一个值

G-s*_*pot 0 r dplyr data.table tidyr

mydata

  # Sample target table: two ID_WORKES values, each paired with the same six
  # ID_SP_0R codes. Every lookup column is 0 except in the very last row,
  # where KOD_DEPO, KOD_DOR and COLUMN_MASH are already 1.
  mydata <- data.frame(
    ID_WORKES = rep(c(1000561L, 1000562L), each = 6L),
    ID_SP_0R = rep(c(21L, 463L, 465L, 500L, 600L, 1951L), times = 2L),
    KOD_DEPO = c(rep(0L, 11L), 1L),
    KOD_DOR = c(rep(0L, 11L), 1L),
    COLUMN_MASH = c(rep(0L, 11L), 1L),
    prop_violations = rep(0L, 12L),
    mash_score = rep(0L, 12L)
  )
Run Code Online (Sandbox Code Playgroud)

第二个数据具有这样的格式

# Sample lookup table: four rows for ID_MASH 1000561 followed by two rows for
# ID_MASH 1000562. prop_violations is stored as a factor of decimal strings.
mydata2 <- data.frame(
  ID_SP_NAR = c(146L, 1088L, 1612L, 30L, 745L, 905L),
  KOD_DEPO = c(4575L, 8998L, 8134L, 4038L, 9540L, 683L),
  KOD_DOR = c(94L, 94L, 76L, 76L, 94L, 94L),
  ID_MASH = rep(c(1000561L, 1000562L), times = c(4L, 2L)),
  COLUMN_MASH = c(10L, 2L, 1L, 1L, 17L, 5L),
  n_routes_total = c(15L, 14L, 25L, 11L, 18L, 4L),
  n_violations = c(15L, 10L, 13L, 8L, 7L, 4L),
  is_violation = rep(1L, 6L),
  prop_violations = factor(
    c("0.06666667", "0.07142857", "0.04000000",
      "0.09090909", "0.05555556", "0.25000000"),
    levels = c("0.04000000", "0.05555556", "0.06666667",
               "0.07142857", "0.09090909", "0.25000000")
  )
)
Run Code Online (Sandbox Code Playgroud)

对于每个 ID_WORKES,如何将 mydata 数据集中 KOD_DEPO、KOD_DOR、COLUMN_MASH 这几个变量的零值替换为 mydata2 数据集中这些变量的最后一个值?联接的键是 ID_WORKES = ID_MASH。

期望的输出如下。对于 ID_MASH = 1000561,mydata2 中最后一个 KOD_DEPO 为 4038,KOD_DOR 为 76,COLUMN_MASH 为 1;对于 ID_MASH = 1000562,mydata2 中最后一个 KOD_DEPO 为 683,KOD_DOR 为 94,COLUMN_MASH 为 5。

   ID_WORKES ID_SP_0R KOD_DEPO KOD_DOR COLUMN_MASH prop_violations mash_score
1    1000561       21     4038      76           1               0          0
2    1000561      463     4038      76           1               0          0
3    1000561      465     4038      76           1               0          0
4    1000561      500     4038      76           1               0          0
5    1000561      600     4038      76           1               0          0
6    1000561     1951     4038      76           1               0          0
7    1000562       21      683      94           5               0          0
8    1000562      463      683      94           5               0          0
9    1000562      465      683      94           5               0          0
10   1000562      500      683      94           5               0          0
11   1000562      600      683      94           5               0          0
12   1000562     1951        1       1           1               0          0
Run Code Online (Sandbox Code Playgroud)

该怎么做?简单的 merge 行不通。prop_violations 和 mash_score 不需要替换。

MrG*_*ble 5

首先,需要把 mydata2 缩减为每个 ID_MASH 只保留“最后”一条记录。请注意,除了原始的行顺序之外,您的数据中没有其他可用于排序的依据。

library(dplyr)
# Reduce mydata2 to the last row (in current row order) of each ID_MASH group.
# `row` is the within-group row index; it is kept in the result, and the
# result stays grouped by ID_MASH. The outer parentheses print the value.
# row_number() replaces 1:n(), and filter(row == n()) replaces the superseded
# top_n(1, row); both select the row with the largest within-group index.
(last_mash <- mydata2 %>%
  group_by(ID_MASH) %>%
  mutate(row = row_number()) %>%
  filter(row == n()))
# A tibble: 2 x 10
# Groups:   ID_MASH [2]
  ID_SP_NAR KOD_DEPO KOD_DOR ID_MASH COLUMN_MASH n_routes_total n_violations is_violation prop_violations   row
      <int>    <int>   <int>   <int>       <int>          <int>        <int>        <int> <fct>           <int>
1        30     4038      76 1000561           1             11            8            1 0.09090909          4
2       905      683      94 1000562           5              4            4            1 0.25000000          2
Run Code Online (Sandbox Code Playgroud)

接下来,将其与 mydata 合并,不过这里我使用的是 inner_join:

# Join each mydata row to the last mydata2 row of the matching ID.
# Columns present in both tables keep their name for mydata and get a
# ".y" suffix for last_mash.
mydata %>%
  inner_join(
    last_mash,
    by = c("ID_WORKES" = "ID_MASH"),
    suffix = c("", ".y")
  )
   ID_WORKES ID_SP_0R KOD_DEPO KOD_DOR COLUMN_MASH prop_violations mash_score ID_SP_NAR KOD_DEPO.y KOD_DOR.y
1    1000561       21        0       0           0               0          0        30       4038        76
2    1000561      463        0       0           0               0          0        30       4038        76
3    1000561      465        0       0           0               0          0        30       4038        76
4    1000561      500        0       0           0               0          0        30       4038        76
5    1000561      600        0       0           0               0          0        30       4038        76
6    1000561     1951        0       0           0               0          0        30       4038        76
7    1000562       21        0       0           0               0          0       905        683        94
8    1000562      463        0       0           0               0          0       905        683        94
9    1000562      465        0       0           0               0          0       905        683        94
10   1000562      500        0       0           0               0          0       905        683        94
11   1000562      600        0       0           0               0          0       905        683        94
12   1000562     1951        1       1           1               0          0       905        683        94
   COLUMN_MASH.y n_routes_total n_violations is_violation prop_violations.y row
1              1             11            8            1        0.09090909   4
2              1             11            8            1        0.09090909   4
3              1             11            8            1        0.09090909   4
4              1             11            8            1        0.09090909   4
5              1             11            8            1        0.09090909   4
6              1             11            8            1        0.09090909   4
7              5              4            4            1        0.25000000   2
8              5              4            4            1        0.25000000   2
9              5              4            4            1        0.25000000   2
10             5              4            4            1        0.25000000   2
11             5              4            4            1        0.25000000   2
12             5              4            4            1        0.25000000   2
Run Code Online (Sandbox Code Playgroud)

您会注意到,有几列出现了两次:一次不带后缀,一次带 .y 后缀。请试着弄清楚它们各自来自哪个数据集。

到了这一步,更新零值就轻而易举了:

# Join mydata to the last mydata2 row per ID, then overwrite zeros in the
# three lookup columns with the ".y" value coming from last_mash. Non-zero
# values are left untouched. Finally drop the ".y" helper columns and `row`.
mydata %>%
  inner_join(
    last_mash,
    by = c("ID_WORKES" = "ID_MASH"),
    suffix = c("", ".y")
  ) %>%
  mutate(
    KOD_DEPO = ifelse(KOD_DEPO == 0, KOD_DEPO.y, KOD_DEPO),
    KOD_DOR = ifelse(KOD_DOR == 0, KOD_DOR.y, KOD_DOR),
    COLUMN_MASH = ifelse(COLUMN_MASH == 0, COLUMN_MASH.y, COLUMN_MASH)
  ) %>%
  select(-ends_with(".y"), -row)
Run Code Online (Sandbox Code Playgroud)


Ron*_*hah 5

下面是一种使用 base R 的方法。我们根据 ID 分别对 mydata 和 mydata2 进行 split,再用 Map 将相同 ID 的分组配对处理,然后用 mydata2 对应列中的最后一个值 replace cols 各列中的所有 0 值。

# Columns whose zero values are replaced by the last mydata2 value per ID.
cols <- c("KOD_DEPO", "KOD_DOR", "COLUMN_MASH")

# Split both tables by ID, replace zeros group by group, then put the rows
# back in their original positions.
mydata[cols] <- local({
  by_worker <- split(mydata[cols], mydata$ID_WORKES)
  # Align the mydata2 groups to the mydata groups by ID (list name) rather
  # than by position, so IDs present in only one table cannot shift the
  # pairing. IDs missing from mydata2 yield a NULL element.
  by_mash <- split(mydata2[cols], mydata2$ID_MASH)[names(by_worker)]

  filled <- Map(function(x, y) {
    if (is.null(y)) {
      # No rows for this ID in mydata2: leave the group unchanged.
      return(x)
    }
    # lapply (not sapply) keeps one vector per column whatever the group size.
    x[] <- lapply(cols, function(p) {
      replace(x[[p]], x[[p]] == 0, tail(y[[p]], 1))
    })
    x
  }, by_worker, by_mash)

  # unsplit() restores the original row order, so the result is correct even
  # when mydata is not sorted by ID_WORKES.
  unsplit(filled, mydata$ID_WORKES)
})


mydata

#   ID_WORKES ID_SP_0R KOD_DEPO KOD_DOR COLUMN_MASH prop_violations mash_score
#1    1000561       21     4038      76           1               0          0
#2    1000561      463     4038      76           1               0          0
#3    1000561      465     4038      76           1               0          0
#4    1000561      500     4038      76           1               0          0
#5    1000561      600     4038      76           1               0          0
#6    1000561     1951     4038      76           1               0          0
#7    1000562       21      683      94           5               0          0
#8    1000562      463      683      94           5               0          0
#9    1000562      465      683      94           5               0          0
#10   1000562      500      683      94           5               0          0
#11   1000562      600      683      94           5               0          0
#12   1000562     1951        1       1           1               0          0
Run Code Online (Sandbox Code Playgroud)


sin*_*dur 5

Here is an option with data.table:

# Setup
library(data.table)
# Convert both data frames to data.table by reference.
setDT(mydata)
setDT(mydata2)
vars2update <- c("KOD_DEPO", "KOD_DOR", "COLUMN_MASH")
# Flag rows in which every variable in vars2update is zero. The count is
# compared with length(vars2update) instead of a hard-coded 3L so the filter
# stays correct if the variable list changes.
rows2update <- mydata[, rowSums(.SD == 0L) == length(vars2update),
                      .SDcols = vars2update]

# Join and update variables
# For the flagged rows, look up the last matching mydata2 row per ID
# (mult = "last") and assign its values by reference with `:=`.
mydata[rows2update, 
       (vars2update) := mydata2[.SD, 
                                on = .(ID_MASH = ID_WORKES), 
                                mult = "last", 
                                mget(vars2update)]]

#     ID_WORKES ID_SP_0R KOD_DEPO KOD_DOR COLUMN_MASH prop_violations mash_score
#  1:   1000561       21     4038      76           1               0          0
#  2:   1000561      463     4038      76           1               0          0
#  3:   1000561      465     4038      76           1               0          0
#  4:   1000561      500     4038      76           1               0          0
#  5:   1000561      600     4038      76           1               0          0
#  6:   1000561     1951     4038      76           1               0          0
#  7:   1000562       21      683      94           5               0          0
#  8:   1000562      463      683      94           5               0          0
#  9:   1000562      465      683      94           5               0          0
# 10:   1000562      500      683      94           5               0          0
# 11:   1000562      600      683      94           5               0          0
# 12:   1000562     1951        1       1           1               0          0
Run Code Online (Sandbox Code Playgroud)