我想通过范围或虚拟列加入两个小贴士。但似乎by-参数,只允许处理chr奥德vector(chr)现有列名的。
在我的示例中,我有一个d带有列value的小标题,还有一个r带有a from和一to列的小标题。
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
> d
# A tibble: 26 x 1
value
<dbl>
1 1.0
2 1.2
3 1.4
4 1.6
5 1.8
6 2.0
7 2.2
8 2.4
9 2.6
10 2.8
# ... with 16 more rows
> r
# A tibble: 6 x 3
from to class
<int> <dbl> <chr>
1 1 2 A
2 2 3 B
3 3 4 C
4 4 5 D
5 5 6 E
6 6 Inf F
Run Code Online (Sandbox Code Playgroud)
现在我想加入value在列d的范围内from,并to在r:
d %>% inner_join(r, by = "value between from and to") # >= and <
Run Code Online (Sandbox Code Playgroud)
我不能找到一种方法,这样做,所以决定加入floor的value在d与from列r
d %>% inner_join(r, by = c("floor(value)" = "from"))
Run Code Online (Sandbox Code Playgroud)
当然,我可以创建第二列来解决该问题:
d %>%
mutate(join_value = floor(value)) %>%
inner_join(r, by = c("join_value" = "from")) %>%
select(value, class)
# A tibble: 26 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
# ... with 16 more rows
Run Code Online (Sandbox Code Playgroud)
但是没有更舒适的方法吗?
谢谢
avi*_*seR 10
我认为不等式连接dplyr尚未实现,或者将来会实现(请参阅有关不等式约束的连接的讨论),但这是使用 SQL 连接的好情况:
library(tibble)
library(sqldf)
as.tibble(sqldf("select d.value, r.class from d
join r on d.value >= r.'from' and
d.value < r.'to'"))
Run Code Online (Sandbox Code Playgroud)
或者,如果您想将连接集成到您的dplyr链中,您可以使用fuzzyjoin::fuzzy_join:
library(dplyr)
library(fuzzyjoin)
d %>%
fuzzy_join(r, by = c("value" = "from", "value" = "to"),
match_fun = list(`>=`, `<`)) %>%
select(value, class)
Run Code Online (Sandbox Code Playgroud)
结果:
# A tibble: 31 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 A
7 2.0 B
8 2.2 B
9 2.4 B
10 2.6 B
# ... with 21 more rows
Run Code Online (Sandbox Code Playgroud)
请注意,我在周围添加了单引号from,to因为这些是 SQL 语言的保留字。
好的,谢谢你的建议,这很有趣。我终于写了一个函数range_join(灵感来自@ycw 的代码)并根据运行时比较了所有描述的解决方案。
我喜欢fuzzy_join,但只有50k 行d需要超过40 秒。那太慢了。
这里有 5k 行的结果 d
library(dplyr)
library(fuzzyjoin)
library(sqldf)
#join by range by @WiWeber
range_join <- function(x, y, value, left, right){
x_result <- tibble()
for (y_ in split(y, 1:nrow(y)))
x_result <- x_result %>% bind_rows(x[x[[value]] >= y_[[left]] & x[[value]] < y_[[right]],] %>% cbind(y_))
return(x_result)
}
#dynamic join by @ycw
dynamic_join <- function(d, r){
d$type <- NA_character_
for (r_ in split(r, r$type))
d <- d %>% mutate(type = ifelse(value >= r_$from & value < r_$to, r_$type, type))
return(d)
}
d <- tibble(value = seq(1,6, by = 0.001), join = TRUE)
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), type = LETTERS[seq(1,6)], join = TRUE)
# @useR sqldf - fast and intuitive but extra library with horrible code
start <- Sys.time()
d2 <- tbl_df(sqldf("select d.value, r.type from d
join r on d.value >= r.'from' and
d.value < r.'to'"))
Sys.time() - start
# @useR fuzzy_join .... very cool but veeeeeeeeeeeeeeeery slow
start <- Sys.time()
d2 <- d %>%
fuzzy_join(r, by = c("value" = "from", "value" = "to"), match_fun = list(`>=`, `<`)) %>%
select(value, type)
Sys.time() - start
# @jonathande4 cut pretty fast
start <- Sys.time()
d2 <- d
d2$type <- cut(d$value, unique(c(r$from, r$to)), r$type, right = FALSE)
Sys.time() - start
# @WiWeber floor
start <- Sys.time()
d2 <- d %>%
mutate(join_value = floor(value)) %>%
inner_join(r, by = c("join_value" = "from")) %>%
select(value, type)
Sys.time() - start
# @WiWeber cross join - filter
start <- Sys.time()
d2 <- d %>%
inner_join(r, by = "join") %>%
filter(value >= from, value < to) %>%
select(value, type)
Sys.time() - start
# @hardik-gupta sapply
start <- Sys.time()
d2 <- d %>%
mutate(
type = unlist(sapply(value, function (x) r[which(x >= r$from & x < r$to), "type"]))
) %>%
select(value, type)
Sys.time() - start
# @ycw re-dynamic join
start <- Sys.time()
d2 <- d %>% dynamic_join(r)
Sys.time() - start
# @WiWeber range_join
start <- Sys.time()
d2 <- d %>%
range_join(r, "value", "from", "to") %>%
select(value, type)
Sys.time() - start
Run Code Online (Sandbox Code Playgroud)
结果:
# @useR sqldf - fast and intuitive but extra library with horrible code
Time difference of 0.06221986 secs
# @useR fuzzy_join .... very cool but veeeeeeeeeeeeeeeery slow
Time difference of 4.765595 secs
# @jonathande4 cut pretty fast
Time difference of 0.004637003 secs
# @WiWeber floor
Time difference of 0.02223396 secs
# @WiWeber cross join - filter
Time difference of 0.0201931 secs
# @hardik-gupta sapply
Time difference of 5.166633 secs
# @ycw dynamic join
Time difference of 0.03124094 secs
# @WiWeber range_join
Time difference of 0.02691698 secs
Run Code Online (Sandbox Code Playgroud)
格瑞兹威伯
您使用 cut 函数在对象 d 中创建一个“类”,然后使用左连接。
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
d[["class"]] <- cut(d[["value"]], c(0,2,3,4,5,6,Inf), c('A',"B", "C", "D", "E", "F"), right = FALSE)
d <- left_join(d, r)
Run Code Online (Sandbox Code Playgroud)
要获得正确的存储桶,您只需使用 cut 函数即可获得您想要的内容。