考虑两个向量.
a <- c(123, 234, 432, 223)
b <- c(234, 238, 342, 325, 326)
现在,我想的最后两位数字匹配a到的前两位b,并创建一个新的向量粘贴第一个数字a,匹配的部分和最后一位数字b.我的预期输出是:
[1] 1234 1238 2342 4325 4326 2234 2238
为简单起见,考虑所有元素的长度始终为3.
我试过了 :
sub_a <- substr(a, 2, 3)   #get last two digits of a
sub_b <- substr(b, 1, 2)   #get first two digits of b
common <- intersect(sub_a, sub_b) 
common给了我两个共同的元素a,b它们是:
[1] "23" "34" "32"
然后我一起使用match并paste0得到不完整的输出.
paste0(a[match(common, sub_a)], substr(b[match(common, sub_b)], 3, 3))
#[1] "1234" "2342" "4325"
因为match只有第一个分身相匹配.
如何实现预期的输出?
Jaa*_*aap 19
可能的解决方案:
a <- setNames(a, substr(a,2,3))
b <- setNames(b, substr(b,1,2))
df <- merge(stack(a), stack(b), by = 'ind')
paste0(substr(df$values.x,1,1), df$values.y)
这使:
Run Code Online (Sandbox Code Playgroud)[1] "1234" "1238" "2234" "2238" "4325" "4326" "2342"
第二种选择:
a <- setNames(a, substr(a,2,3))
b <- setNames(b, substr(b,1,2))
l <- lapply(names(a), function(x) b[x == names(b)])
paste0(substr(rep(a, lengths(l)),1,1), unlist(l))
它给出了相同的结果并且速度相当快(参见基准测试).
Ten*_*bai 12
可能有点复杂,但有效:
unlist( sapply( a, function(x) {
  regex <- paste0( substr(x, 2, 3), '(\\d)')
  z <- sub(regex, paste0(x, "\\1"), b)
  z[!b %in% z] 
} ))
给出: [1] "1234" "1238" "2342" "4325" "4326" "2234" "2238"
主要思想是为a中的每个条目创建一个正则表达式,将此正则表达式应用于b并将值替换为当前值,并仅追加捕获的最后一个数字((\\d)正则表达式的一部分,然后过滤生成的向量以返回只有修改后的值.
出于好奇,我做了一个小的基准测试(将sub_a和sub_b创建添加到Sotos和Heikki的答案中,所以每个人都从相同的初始向量a开始400个观测值和b 500个观测值):
Unit: milliseconds
            expr      min       lq     mean   median       uq      max neval
      Jaap(a, b) 341.0224 342.6853 345.2182 344.3482 347.3161 350.2840     3
     Tensi(a, b) 415.9175 416.2672 421.9148 416.6168 424.9134 433.2100     3
    Heikki(a, b) 126.9859 139.6727 149.3252 152.3594 160.4948 168.6302     3
     Sotos(a, b) 151.1264 164.9869 172.0310 178.8474 182.4833 186.1191     3
 MattWBase(a, b) 286.9651 290.8923 293.3795 294.8195 296.5867 298.3538     3
Sot*_*tos 11
另一种方法是使用expand.grid,所以拿起你的,sub_a和sub_b,
d1 <- expand.grid(a, b, stringsAsFactors = FALSE)
d2 <- expand.grid(sub_a, sub_b, stringsAsFactors = FALSE)
i1 <- d2$Var1 == d2$Var2
d1 <- d1[i1,] 
d1$Var1 <- substr(d1$Var1, 1, 1)
do.call(paste0, d1)
#[1] "1234" "2234" "1238" "2238" "2342" "4325" "4326"
这是基础R中的另一个选项:
foo <- function(a, b) {
  split_a <- split(a,  substr(a, 2, 3))
  split_b <- split(substr(b, 3, 3), substr(b, 1, 2))
  idx <- intersect(names(split_a), names(split_b))
  stopifnot(length(idx) > 0)
  unlist(Map(function(x,y) outer(x, y, paste0), split_a[idx], split_b[idx]), 
                      use.names = FALSE)
}
foo(a, b)
# [1] "1234" "2234" "1238" "2238" "4325" "4326" "2342"
更新:
我使用/sf/answers/3320454721/中的函数定义来制作所有答案和更大数据的另一个基准.我得到的输入数据和结果是:
set.seed(123)
a <- sample(100:999, 1e4, TRUE)
b <- sample(100:999, 1e3, TRUE)
library(microbenchmark)
library(dplyr)
res <- microbenchmark(docendo(a, b), 
               Jaap1(a, b), 
               Jaap2(a, b), 
               Sotos(a, b), 
               Tensi(a, b), 
               Heikki(a, b), 
               Matt_base(a, b),
               Matt_dplyr(a, b), 
               zx8754(a, b),
               times = 10, unit = "relative")
Unit: relative
             expr        min         lq       mean     median         uq        max neval
    docendo(a, b)   1.000000   1.000000   1.000000   1.000000   1.000000   1.000000    10
      Jaap1(a, b)  14.002977  13.724432  13.347755  13.433175  12.788948  13.301811    10
      Jaap2(a, b)   4.364993   4.936248   5.201879   5.125639   5.060425   7.520069    10
      Sotos(a, b)  22.215750  23.850280  25.743047  25.177676  28.274083  28.288089    10
      Tensi(a, b) 231.230360 234.830000 246.587532 242.345573 260.784725 273.184452    10
     Heikki(a, b) 135.615708 136.900943 144.775845 146.314048 150.546406 156.873954    10
  Matt_base(a, b)  13.274675  12.995334  13.402940  12.723798  12.432802  18.881093    10
 Matt_dplyr(a, b)   1.299223   1.314568   1.420479   1.345850   1.380378   1.807671    10
     zx8754(a, b)   9.607226  10.175381  10.486580  10.136439  10.096818  13.410858    10
有趣的是,当我从基准重现Frank的答案和我的答案时,我得到了相反的结果:
Frank <- function(a, b) {
  aDT <- as.data.table(tstrsplit(a, ""))
  bDT <- setnames(as.data.table(tstrsplit(b, "")), c("V2", "V3", "V4"))
  merge(aDT, bDT, allow.cartesian = TRUE)[, paste0(V1, V2, V3, V4)]
}
set.seed(1)  # same input size as in the cw benchmark answer
a <- sample(100:999, 8e3, TRUE)
b <- sample(100:999, 1e4, TRUE)
microbenchmark(Frank(a, b), docendo(a, b), unit = "relative", times = 10)
Unit: relative
          expr     min       lq     mean   median       uq      max neval
   Frank(a, b) 1.37435 1.390417 1.500996 1.470548 1.644079 1.616446    10
 docendo(a, b) 1.00000 1.000000 1.000000 1.000000 1.000000 1.000000    10
all.equal(sort(docendo(a, b)), sort(Frank(a, b)))
#[1] TRUE
一个基准(将sub_a和sub_b创建添加到Sotos和Heikki中,所以每个人都从a800个观察和b1000个观察的相同初始向量开始).
运行基准测试:
library(dplyr)
library(data.table)
library(microbenchmark)
a <- sample(100:999, 8e3, TRUE)
b <- sample(100:999, 1e4, TRUE)
microbenchmark(Jaap1(a,b), Jaap2(a,b), Tensi(a,b), Heikki(a,b), Sotos(a,b),
               Matt_base(a,b), Matt_dplyr(a,b), Docendo(a,b),
               zx8754(a,b), zx8754for(a,b), Frank(a,b),
               times = 50, unit = 'relative')
得到:
Run Code Online (Sandbox Code Playgroud)Unit: relative expr min lq mean median uq max neval cld Jaap1(a, b) 19.668483 19.316194 17.2373827 18.921573 18.829932 7.8792713 50 d Jaap2(a, b) 4.253151 4.365420 4.0557281 4.309247 4.398149 2.2149125 50 b Tensi(a, b) 241.682216 238.197815 212.2844582 233.473689 233.367619 93.3562331 50 h Heikki(a, b) 114.895836 113.754054 101.2781709 111.637570 110.541708 44.9437229 50 g Sotos(a, b) 27.598767 28.725937 25.7469518 28.534011 28.638413 11.6995642 50 e Matt_base(a, b) 19.159883 18.834180 16.8853660 18.513498 18.416194 7.8329323 50 d Matt_dplyr(a, b) 1.108230 1.106051 1.0203776 1.102078 1.098476 1.0131898 50 a Docendo(a, b) 1.000000 1.000000 1.0000000 1.000000 1.000000 1.0000000 50 a zx8754(a, b) 11.601730 12.986763 11.7859245 13.054720 13.234842 5.6944437 50 c zx8754for(a, b) 90.448168 92.906445 82.4905438 91.092609 90.160010 36.1277145 50 f Frank(a, b) 1.070775 1.070202 0.9621499 1.063978 1.055540 0.4459918 50 a
使用的功能:
Jaap1 <- function(a,b) {
  a <- setNames(a, substr(a,2,3))
  b <- setNames(b, substr(b,1,2))
  df <- merge(stack(a), stack(b), by = 'ind')
  paste0(substr(df$values.x,1,1), df$values.y)
}
Jaap2 <- function(a,b) {
  a <- setNames(a, substr(a,2,3))
  b <- setNames(b, substr(b,1,2))
  l <- lapply(names(a), function(x) b[x == names(b)])
  paste0(substr(rep(a, lengths(l)),1,1), unlist(l))
}
Tensi <- function(a,b) {
  unlist(sapply(a,function(x) {regex <- paste0(substr(x,2,3),'(\\d)'); z <- sub(regex,paste0(x,"\\1"),b); z[!b %in% z] } ))
}
Heikki <- function(a,b) {
  sub_a <- substr(a, 2, 3)
  sub_b <- substr(b, 1, 2)
  result <- c()
  for (ai in a) {
    sub_ai <- substr(ai,2,3)
    if (sub_ai %in% sub_a) {
      b_match <- (sub_b == sub_ai)
      result <- c(result,paste0(ai,substr(b[b_match],3,4)))
    }
  }
  result
}
Sotos <- function(a,b) {
  sub_a <- substr(a, 2, 3)
  sub_b <- substr(b, 1, 2)
  d1 <- expand.grid(a, b, stringsAsFactors = FALSE)
  d2 <- expand.grid(sub_a, sub_b, stringsAsFactors = FALSE)
  i1 <- d2$Var1 == d2$Var2
  d1 <- d1[i1,] 
  d1$Var1 <- substr(d1$Var1, 1, 1)
  do.call(paste0, d1)
}
Matt_base <- function(a,b) {
  a1 <- data.frame(a)
  b1 <- data.frame(b)
  a1$first_a = substr(a1$a, 1, 1)
  a1$last_a = substr(a1$a, 2, 3)
  b1$first_b = substr(b1$b, 1, 2)
  b1$last_b = substr(b1$b, 3, 3)
  c1 <- merge(a1, b1, by.x = "last_a", by.y = "first_b")
  results <- paste0(c1$a, c1$last_b)
}
Matt_dplyr <- function(a,b) {
  a1 <- data.frame(a)
  b1 <- data.frame(b)
  a1 <- a1 %>% mutate(first_a = substr(a, 1, 1), last_a = substr(a, 2, 3))
  b1 <- b1 %>% mutate(first_b = substr(b, 1, 2), last_b = substr(b, 3, 3))
  c1 <- inner_join(a1, b1, by = c("last_a" = "first_b"))
  results <- paste0(c1$a, c1$last_b)
}
Docendo <- function(a, b) {
  split_a <- split(a,  substr(a, 2, 3))
  split_b <- split(substr(b, 3, 3), substr(b, 1, 2))
  idx <- intersect(names(split_a), names(split_b))
  stopifnot(length(idx) > 0)
  unlist(Map(function(x,y) outer(x, y, paste0), split_a[idx], split_b[idx]), 
         use.names = FALSE)
}
zx8754 <- function(a, b) {
  unlist(sapply(a, function(i) i * 10 + (b %% 10)[i %% 100 == b %/% 10]))
}
zx8754for <- function(a, b) {
  res <- integer()
  for(i in a)  res <- c(res, i * 10 + (b %% 10)[i %% 100 == b %/% 10])
  res
}
Frank <- function(a, b) {
  aDT <- as.data.table(tstrsplit(a, ""))
  bDT <- setnames(as.data.table(tstrsplit(b, "")), c("V2", "V3", "V4"))
  merge(aDT, bDT, allow.cartesian = TRUE)[, paste0(V1, V2, V3, V4)]
}
一点数学怎么样*:
unlist(sapply(a, function(i)
  i * 10 + (b %% 10)[i %% 100 == b %/% 10]))
*假设:所有数字都是3位数,但这当然可以在sapply内调整.
检查输出,输出的顺序与其他答案不同,输出为数字,而不是字符.
identical(sort(as.numeric(docendo(a, b))), sort(zx8754(a, b)))
# [1] TRUE
identical(sort(as.numeric(jaap(a, b))), sort(zx8754(a, b)))
# [1] TRUE
编辑: forloop版本似乎快了3倍(例如小数据,更大的设置它实际上是3倍慢,请参阅基准测试wiki帖子).
zx8754 <- function(a, b) {
  unlist(sapply(a, function(i) i * 10 + (b %% 10)[i %% 100 == b %/% 10]))
}
zx8754_forloop <- function(a, b) {
  res <- integer()
  for(i in a)  res <- c(res, i * 10 + (b %% 10)[i %% 100 == b %/% 10])
  res
}
microbenchmark::microbenchmark(
  zx8754 = zx8754(a, b),
  zx8754_forloop = zx8754_forloop(a, b)
)
# Unit: microseconds
#           expr    min      lq     mean median     uq      max neval
# zx8754         16.535 17.3910 55.05348 17.676 18.246 3672.223   100
# zx8754_forloop  4.562  5.4165 46.74887  5.987  6.272 4080.469   100
#check output
identical(zx8754(a, b), zx8754_forloop(a, b))
# [1] TRUE
另一个选择是将它放入列并加入:
library(data.table)
Frank <- function(a, b) {
  aDT <- setDT(tstrsplit(a, ""))
  bDT <- setnames(setDT(tstrsplit(b, "")), c("V2", "V3", "V4"))
  merge(aDT, bDT, allow.cartesian = TRUE)[, paste0(V1, V2, V3, V4)]
}
MattDT <- function(a,b){
  aDT2 <- data.table(V1 = substring(a,1,1), V23 = substring(a,2,3))
  bDT2 <- data.table(V23 = substring(b,1,2), V4 = substring(b,3,3))
  merge(aDT2, bDT2, allow.cartesian = TRUE)[, paste0(V1, V23, V4)]
}
这是列表经历的方法:
result <- c()
for (ai in a) {
  sub_ai <- substr(ai,2,3)
  if (sub_ai %in% sub_a) {
    b_match <- (sub_b == sub_ai)
    result <- c(result,paste0(ai,substr(b[b_match],3,4)))
  }
}
> result
[1] "1234" "1238" "2342" "4325" "4326" "2234" "2238"
那么你可能想要独特的结果.
在中间件上使用dplyr :: inner_join:
library(dplyr)
a <- c(123, 234, 432, 223)
b <- c(234, 238, 342, 325, 326)
a1 <- data.frame(a)
b1 <- data.frame(b)
a1 <- a1 %>% mutate(first_a = substr(a, 1, 1), last_a = substr(a, 2, 3))
b1 <- b1 %>% mutate(first_b = substr(b, 1, 2), last_b = substr(b, 3, 3))
c1 <- inner_join(a1, b1, by = c("last_a" = "first_b"))
results <- paste0(c1$a, c1$last_b)
使用base :: merge:
a1 <- data.frame(a)
b1 <- data.frame(b)
a1$first_a = substr(a1$a, 1, 1)
a1$last_a = substr(a1$a, 2, 3)
b1$first_b = substr(b1$b, 1, 2)
b1$last_b = substr(b1$b, 3, 3)
c1 <- merge(a1, b1, by.x = "last_a", by.y = "first_b")
results <- paste0(c1$a, c1$last_b)