R - mgsub 问题:被替换的子字符串不是整个字符串

jva*_*nti 8 string performance r gsub

我已经从 USPS 下载了街道缩写。这是数据:

dput(usps_streets)
structure(list(common_abbrev = c("allee", "alley", "ally", "aly", 
"anex", "annex", "annx", "anx", "arc", "arcade", "av", "ave", 
"aven", "avenu", "avenue", "avn", "avnue", "bayoo", "bayou", 
"bch", "beach", "bend", "bnd", "blf", "bluf", "bluff", "bluffs", 
"bot", "btm", "bottm", "bottom", "blvd", "boul", "boulevard", 
"boulv", "br", "brnch", "branch", "brdge", "brg", "bridge", "brk", 
"brook", "brooks", "burg", "burgs", "byp", "bypa", "bypas", "bypass", 
"byps", "camp", "cp", "cmp", "canyn", "canyon", "cnyn", "cape", 
"cpe", "causeway", "causwa", "cswy", "cen", "cent", "center", 
"centr", "centre", "cnter", "cntr", "ctr", "centers", "cir", 
"circ", "circl", "circle", "crcl", "crcle", "circles", "clf", 
"cliff", "clfs", "cliffs", "clb", "club", "common", "commons", 
"cor", "corner", "corners", "cors", "course", "crse", "court", 
"ct", "courts", "cts", "cove", "cv", "coves", "creek", "crk", 
"crescent", "cres", "crsent", "crsnt", "crest", "crossing", "crssng", 
"xing", "crossroad", "crossroads", "curve", "dale", "dl", "dam", 
"dm", "div", "divide", "dv", "dvd", "dr", "driv", "drive", "drv", 
"drives", "est", "estate", "estates", "ests", "exp", "expr", 
"express", "expressway", "expw", "expy", "ext", "extension", 
"extn", "extnsn", "exts", "fall", "falls", "fls", "ferry", "frry", 
"fry", "field", "fld", "fields", "flds", "flat", "flt", "flats", 
"flts", "ford", "frd", "fords", "forest", "forests", "frst", 
"forg", "forge", "frg", "forges", "fork", "frk", "forks", "frks", 
"fort", "frt", "ft", "freeway", "freewy", "frway", "frwy", "fwy", 
"garden", "gardn", "grden", "grdn", "gardens", "gdns", "grdns", 
"gateway", "gatewy", "gatway", "gtway", "gtwy", "glen", "gln", 
"glens", "green", "grn", "greens", "grov", "grove", "grv", "groves", 
"harb", "harbor", "harbr", "hbr", "hrbor", "harbors", "haven", 
"hvn", "ht", "hts", "highway", "highwy", "hiway", "hiwy", "hway", 
"hwy", "hill", "hl", "hills", "hls", "hllw", "hollow", "hollows", 
"holw", "holws", "inlt", "is", "island", "islnd", "islands", 
"islnds", "iss", "isle", "isles", "jct", "jction", "jctn", "junction", 
"junctn", "juncton", "jctns", "jcts", "junctions", "key", "ky", 
"keys", "kys", "knl", "knol", "knoll", "knls", "knolls", "lk", 
"lake", "lks", "lakes", "land", "landing", "lndg", "lndng", "lane", 
"ln", "lgt", "light", "lights", "lf", "loaf", "lck", "lock", 
"lcks", "locks", "ldg", "ldge", "lodg", "lodge", "loop", "loops", 
"mall", "mnr", "manor", "manors", "mnrs", "meadow", "mdw", "mdws", 
"meadows", "medows", "mews", "mill", "mills", "missn", "mssn", 
"motorway", "mnt", "mt", "mount", "mntain", "mntn", "mountain", 
"mountin", "mtin", "mtn", "mntns", "mountains", "nck", "neck", 
"orch", "orchard", "orchrd", "oval", "ovl", "overpass", "park", 
"prk", "parks", "parkway", "parkwy", "pkway", "pkwy", "pky", 
"parkways", "pkwys", "pass", "passage", "path", "paths", "pike", 
"pikes", "pine", "pines", "pnes", "pl", "plain", "pln", "plains", 
"plns", "plaza", "plz", "plza", "point", "pt", "points", "pts", 
"port", "prt", "ports", "prts", "pr", "prairie", "prr", "rad", 
"radial", "radiel", "radl", "ramp", "ranch", "ranches", "rnch", 
"rnchs", "rapid", "rpd", "rapids", "rpds", "rest", "rst", "rdg", 
"rdge", "ridge", "rdgs", "ridges", "riv", "river", "rvr", "rivr", 
"rd", "road", "roads", "rds", "route", "row", "rue", "run", "shl", 
"shoal", "shls", "shoals", "shoar", "shore", "shr", "shoars", 
"shores", "shrs", "skyway", "spg", "spng", "spring", "sprng", 
"spgs", "spngs", "springs", "sprngs", "spur", "spurs", "sq", 
"sqr", "sqre", "squ", "square", "sqrs", "squares", "sta", "station", 
"statn", "stn", "stra", "strav", "straven", "stravenue", "stravn", 
"strvn", "strvnue", "stream", "streme", "strm", "street", "strt", 
"st", "str", "streets", "smt", "suite", "sumit", "sumitt", "summit", 
"ter", "terr", "terrace", "throughway", "trace", "traces", "trce", 
"track", "tracks", "trak", "trk", "trks", "trafficway", "trail", 
"trails", "trl", "trls", "trailer", "trlr", "trlrs", "tunel", 
"tunl", "tunls", "tunnel", "tunnels", "tunnl", "trnpk", "turnpike", 
"turnpk", "underpass", "un", "union", "unions", "valley", "vally", 
"vlly", "vly", "valleys", "vlys", "vdct", "via", "viadct", "viaduct", 
"view", "vw", "views", "vws", "vill", "villag", "village", "villg", 
"villiage", "vlg", "villages", "vlgs", "ville", "vl", "vis", 
"vist", "vista", "vst", "vsta", "walk", "walks", "wall", "wy", 
"way", "ways", "well", "wells", "wls"), usps_abbrev = c("aly", 
"aly", "aly", "aly", "anx", "anx", "anx", "anx", "arc", "arc", 
"ave", "ave", "ave", "ave", "ave", "ave", "ave", "byu", "byu", 
"bch", "bch", "bnd", "bnd", "blf", "blf", "blf", "blfs", "btm", 
"btm", "btm", "btm", "blvd", "blvd", "blvd", "blvd", "br", "br", 
"br", "brg", "brg", "brg", "brk", "brk", "brks", "bg", "bgs", 
"byp", "byp", "byp", "byp", "byp", "cp", "cp", "cp", "cyn", "cyn", 
"cyn", "cpe", "cpe", "cswy", "cswy", "cswy", "ctr", "ctr", "ctr", 
"ctr", "ctr", "ctr", "ctr", "ctr", "ctrs", "cir", "cir", "cir", 
"cir", "cir", "cir", "cirs", "clf", "clf", "clfs", "clfs", "clb", 
"clb", "cmn", "cmns", "cor", "cor", "cors", "cors", "crse", "crse", 
"ct", "ct", "cts", "cts", "cv", "cv", "cvs", "crk", "crk", "cres", 
"cres", "cres", "cres", "crst", "xing", "xing", "xing", "xrd", 
"xrds", "curv", "dl", "dl", "dm", "dm", "dv", "dv", "dv", "dv", 
"dr", "dr", "dr", "dr", "drs", "est", "est", "ests", "ests", 
"expy", "expy", "expy", "expy", "expy", "expy", "ext", "ext", 
"ext", "ext", "exts", "fall", "fls", "fls", "fry", "fry", "fry", 
"fld", "fld", "flds", "flds", "flt", "flt", "flts", "flts", "frd", 
"frd", "frds", "frst", "frst", "frst", "frg", "frg", "frg", "frgs", 
"frk", "frk", "frks", "frks", "ft", "ft", "ft", "fwy", "fwy", 
"fwy", "fwy", "fwy", "gdn", "gdn", "gdn", "gdn", "gdns", "gdns", 
"gdns", "gtwy", "gtwy", "gtwy", "gtwy", "gtwy", "gln", "gln", 
"glns", "grn", "grn", "grns", "grv", "grv", "grv", "grvs", "hbr", 
"hbr", "hbr", "hbr", "hbr", "hbrs", "hvn", "hvn", "hts", "hts", 
"hwy", "hwy", "hwy", "hwy", "hwy", "hwy", "hl", "hl", "hls", 
"hls", "holw", "holw", "holw", "holw", "holw", "inlt", "is", 
"is", "is", "iss", "iss", "iss", "isle", "isle", "jct", "jct", 
"jct", "jct", "jct", "jct", "jcts", "jcts", "jcts", "ky", "ky", 
"kys", "kys", "knl", "knl", "knl", "knls", "knls", "lk", "lk", 
"lks", "lks", "land", "lndg", "lndg", "lndg", "ln", "ln", "lgt", 
"lgt", "lgts", "lf", "lf", "lck", "lck", "lcks", "lcks", "ldg", 
"ldg", "ldg", "ldg", "loop", "loop", "mall", "mnr", "mnr", "mnrs", 
"mnrs", "mdw", "mdws", "mdws", "mdws", "mdws", "mews", "ml", 
"mls", "msn", "msn", "mtwy", "mt", "mt", "mt", "mtn", "mtn", 
"mtn", "mtn", "mtn", "mtn", "mtns", "mtns", "nck", "nck", "orch", 
"orch", "orch", "oval", "oval", "opas", "park", "park", "park", 
"pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pass", 
"psge", "path", "path", "pike", "pike", "pne", "pnes", "pnes", 
"pl", "pln", "pln", "plns", "plns", "plz", "plz", "plz", "pt", 
"pt", "pts", "pts", "prt", "prt", "prts", "prts", "pr", "pr", 
"pr", "radl", "radl", "radl", "radl", "ramp", "rnch", "rnch", 
"rnch", "rnch", "rpd", "rpd", "rpds", "rpds", "rst", "rst", "rdg", 
"rdg", "rdg", "rdgs", "rdgs", "riv", "riv", "riv", "riv", "rd", 
"rd", "rds", "rds", "rte", "row", "rue", "run", "shl", "shl", 
"shls", "shls", "shr", "shr", "shr", "shrs", "shrs", "shrs", 
"skwy", "spg", "spg", "spg", "spg", "spgs", "spgs", "spgs", "spgs", 
"spur", "spur", "sq", "sq", "sq", "sq", "sq", "sqs", "sqs", "sta", 
"sta", "sta", "sta", "stra", "stra", "stra", "stra", "stra", 
"stra", "stra", "strm", "strm", "strm", "st", "st", "st", "st", 
"sts", "smt", "ste", "smt", "smt", "smt", "ter", "ter", "ter", 
"trwy", "trce", "trce", "trce", "trak", "trak", "trak", "trak", 
"trak", "trfy", "trl", "trl", "trl", "trl", "trlr", "trlr", "trlr", 
"tunl", "tunl", "tunl", "tunl", "tunl", "tunl", "tpke", "tpke", 
"tpke", "upas", "un", "un", "uns", "vly", "vly", "vly", "vly", 
"vlys", "vlys", "via", "via", "via", "via", "vw", "vw", "vws", 
"vws", "vlg", "vlg", "vlg", "vlg", "vlg", "vlg", "vlgs", "vlgs", 
"vl", "vl", "vis", "vis", "vis", "vis", "vis", "walk", "walk", 
"wall", "way", "way", "ways", "wl", "wls", "wls")), class = "data.frame", row.names = c(NA, 
-503L))
Run Code Online (Sandbox Code Playgroud)

我想用它们来处理街道地址和州。玩具数据:

a <- c("10900 harper ave", "12235 davis annex", "24 van cortland parkway")
Run Code Online (Sandbox Code Playgroud)

为了将常见缩写转换为 usps 缩写(标准化数据),我构建了一个小函数:

mr_zip <- function(x){
  x <-textclean::mgsub(usps_streets$common_abbrev, usps_streets$usps_abbrev, x, fixed = T,
                   order.pattern = T)
  return(x)
}
Run Code Online (Sandbox Code Playgroud)

当我将函数应用于数据时,问题就出现了:

f <- sapply(a, mr_zip)
Run Code Online (Sandbox Code Playgroud)

我得到错误的结果:

 "10900 harper avee"       "1235 davis anx" "24 van cortland pkway"
Run Code Online (Sandbox Code Playgroud)

因为我应该得到的是:

"10900 harper ave"       "1235 davis anx" "24 van cortland pkwy"
Run Code Online (Sandbox Code Playgroud)

我的问题:

  1. 为什么当我在函数中指定order.pattern = Tand时会发生这种情况?fixed = Tmgsub
  2. 我能做什么来修复它?
  3. 是否有其他方法可以在文本的多种替换模式中使用向量?

预先感谢,欢迎所有建议。

编辑:感谢@RichieSacramento,我发现使用单词边界确实有帮助,但在大型数据帧(> 400,000 行)上使用时该函数仍然非常慢。使用safe = TRUEinmgsub可以使函数正常工作,但速度非常慢。人们需要快速的东西——因此有赏金。

Mar*_*łka 8

那么让我们开始享受乐趣吧。

步骤 1 首先,我们将您的数据加载到tibble命名的USPS.

library(tidyverse)
USPS = tibble(
 common_abbrev = c("allee", "alley", "ally", "aly", 
 "anex", "annex", "annx", "anx", "arc", "arcade", "av", "ave", 
 "aven", "avenu", "avenue", "avn", "avnue", "bayoo", "bayou", 
 "bch", "beach", "bend", "bnd", "blf", "bluf", "bluff", "bluffs", 
 "bot", "btm", "bottm", "bottom", "blvd", "boul", "boulevard", 
 "boulv", "br", "brnch", "branch", "brdge", "brg", "bridge", "brk", 
 "brook", "brooks", "burg", "burgs", "byp", "bypa", "bypas", "bypass", 
 "byps", "camp", "cp", "cmp", "canyn", "canyon", "cnyn", "cape", 
 "cpe", "causeway", "causwa", "cswy", "cen", "cent", "center", 
 "centr", "centre", "cnter", "cntr", "ctr", "centers", "cir", 
 "circ", "circl", "circle", "crcl", "crcle", "circles", "clf", 
 "cliff", "clfs", "cliffs", "clb", "club", "common", "commons", 
 "cor", "corner", "corners", "cors", "course", "crse", "court", 
 "ct", "courts", "cts", "cove", "cv", "coves", "creek", "crk", 
 "crescent", "cres", "crsent", "crsnt", "crest", "crossing", "crssng", 
 "xing", "crossroad", "crossroads", "curve", "dale", "dl", "dam", 
 "dm", "div", "divide", "dv", "dvd", "dr", "driv", "drive", "drv", 
 "drives", "est", "estate", "estates", "ests", "exp", "expr", 
 "express", "expressway", "expw", "expy", "ext", "extension", 
 "extn", "extnsn", "exts", "fall", "falls", "fls", "ferry", "frry", 
 "fry", "field", "fld", "fields", "flds", "flat", "flt", "flats", 
 "flts", "ford", "frd", "fords", "forest", "forests", "frst", 
 "forg", "forge", "frg", "forges", "fork", "frk", "forks", "frks", 
 "fort", "frt", "ft", "freeway", "freewy", "frway", "frwy", "fwy", 
 "garden", "gardn", "grden", "grdn", "gardens", "gdns", "grdns", 
 "gateway", "gatewy", "gatway", "gtway", "gtwy", "glen", "gln", 
 "glens", "green", "grn", "greens", "grov", "grove", "grv", "groves", 
 "harb", "harbor", "harbr", "hbr", "hrbor", "harbors", "haven", 
 "hvn", "ht", "hts", "highway", "highwy", "hiway", "hiwy", "hway", 
 "hwy", "hill", "hl", "hills", "hls", "hllw", "hollow", "hollows", 
 "holw", "holws", "inlt", "is", "island", "islnd", "islands", 
 "islnds", "iss", "isle", "isles", "jct", "jction", "jctn", "junction", 
 "junctn", "juncton", "jctns", "jcts", "junctions", "key", "ky", 
 "keys", "kys", "knl", "knol", "knoll", "knls", "knolls", "lk", 
 "lake", "lks", "lakes", "land", "landing", "lndg", "lndng", "lane", 
 "ln", "lgt", "light", "lights", "lf", "loaf", "lck", "lock", 
 "lcks", "locks", "ldg", "ldge", "lodg", "lodge", "loop", "loops", 
 "mall", "mnr", "manor", "manors", "mnrs", "meadow", "mdw", "mdws", 
 "meadows", "medows", "mews", "mill", "mills", "missn", "mssn", 
 "motorway", "mnt", "mt", "mount", "mntain", "mntn", "mountain", 
 "mountin", "mtin", "mtn", "mntns", "mountains", "nck", "neck", 
 "orch", "orchard", "orchrd", "oval", "ovl", "overpass", "park", 
 "prk", "parks", "parkway", "parkwy", "pkway", "pkwy", "pky", 
 "parkways", "pkwys", "pass", "passage", "path", "paths", "pike", 
 "pikes", "pine", "pines", "pnes", "pl", "plain", "pln", "plains", 
 "plns", "plaza", "plz", "plza", "point", "pt", "points", "pts", 
 "port", "prt", "ports", "prts", "pr", "prairie", "prr", "rad", 
 "radial", "radiel", "radl", "ramp", "ranch", "ranches", "rnch", 
 "rnchs", "rapid", "rpd", "rapids", "rpds", "rest", "rst", "rdg", 
 "rdge", "ridge", "rdgs", "ridges", "riv", "river", "rvr", "rivr", 
 "rd", "road", "roads", "rds", "route", "row", "rue", "run", "shl", 
 "shoal", "shls", "shoals", "shoar", "shore", "shr", "shoars", 
 "shores", "shrs", "skyway", "spg", "spng", "spring", "sprng", 
 "spgs", "spngs", "springs", "sprngs", "spur", "spurs", "sq", 
 "sqr", "sqre", "squ", "square", "sqrs", "squares", "sta", "station", 
 "statn", "stn", "stra", "strav", "straven", "stravenue", "stravn", 
 "strvn", "strvnue", "stream", "streme", "strm", "street", "strt", 
 "st", "str", "streets", "smt", "suite", "sumit", "sumitt", "summit", 
 "ter", "terr", "terrace", "throughway", "trace", "traces", "trce", 
 "track", "tracks", "trak", "trk", "trks", "trafficway", "trail", 
 "trails", "trl", "trls", "trailer", "trlr", "trlrs", "tunel", 
 "tunl", "tunls", "tunnel", "tunnels", "tunnl", "trnpk", "turnpike", 
 "turnpk", "underpass", "un", "union", "unions", "valley", "vally", 
 "vlly", "vly", "valleys", "vlys", "vdct", "via", "viadct", "viaduct", 
 "view", "vw", "views", "vws", "vill", "villag", "village", "villg", 
 "villiage", "vlg", "villages", "vlgs", "ville", "vl", "vis", 
 "vist", "vista", "vst", "vsta", "walk", "walks", "wall", "wy", 
 "way", "ways", "well", "wells", "wls"), 
 usps_abbrev = c("aly", 
 "aly", "aly", "aly", "anx", "anx", "anx", "anx", "arc", "arc", 
 "ave", "ave", "ave", "ave", "ave", "ave", "ave", "byu", "byu", 
 "bch", "bch", "bnd", "bnd", "blf", "blf", "blf", "blfs", "btm", 
 "btm", "btm", "btm", "blvd", "blvd", "blvd", "blvd", "br", "br", 
 "br", "brg", "brg", "brg", "brk", "brk", "brks", "bg", "bgs", 
 "byp", "byp", "byp", "byp", "byp", "cp", "cp", "cp", "cyn", "cyn", 
 "cyn", "cpe", "cpe", "cswy", "cswy", "cswy", "ctr", "ctr", "ctr", 
 "ctr", "ctr", "ctr", "ctr", "ctr", "ctrs", "cir", "cir", "cir", 
 "cir", "cir", "cir", "cirs", "clf", "clf", "clfs", "clfs", "clb", 
 "clb", "cmn", "cmns", "cor", "cor", "cors", "cors", "crse", "crse", 
 "ct", "ct", "cts", "cts", "cv", "cv", "cvs", "crk", "crk", "cres", 
 "cres", "cres", "cres", "crst", "xing", "xing", "xing", "xrd", 
 "xrds", "curv", "dl", "dl", "dm", "dm", "dv", "dv", "dv", "dv", 
 "dr", "dr", "dr", "dr", "drs", "est", "est", "ests", "ests", 
 "expy", "expy", "expy", "expy", "expy", "expy", "ext", "ext", 
 "ext", "ext", "exts", "fall", "fls", "fls", "fry", "fry", "fry", 
 "fld", "fld", "flds", "flds", "flt", "flt", "flts", "flts", "frd", 
 "frd", "frds", "frst", "frst", "frst", "frg", "frg", "frg", "frgs", 
 "frk", "frk", "frks", "frks", "ft", "ft", "ft", "fwy", "fwy", 
 "fwy", "fwy", "fwy", "gdn", "gdn", "gdn", "gdn", "gdns", "gdns", 
 "gdns", "gtwy", "gtwy", "gtwy", "gtwy", "gtwy", "gln", "gln", 
 "glns", "grn", "grn", "grns", "grv", "grv", "grv", "grvs", "hbr", 
 "hbr", "hbr", "hbr", "hbr", "hbrs", "hvn", "hvn", "hts", "hts", 
 "hwy", "hwy", "hwy", "hwy", "hwy", "hwy", "hl", "hl", "hls", 
 "hls", "holw", "holw", "holw", "holw", "holw", "inlt", "is", 
 "is", "is", "iss", "iss", "iss", "isle", "isle", "jct", "jct", 
 "jct", "jct", "jct", "jct", "jcts", "jcts", "jcts", "ky", "ky", 
 "kys", "kys", "knl", "knl", "knl", "knls", "knls", "lk", "lk", 
 "lks", "lks", "land", "lndg", "lndg", "lndg", "ln", "ln", "lgt", 
 "lgt", "lgts", "lf", "lf", "lck", "lck", "lcks", "lcks", "ldg", 
 "ldg", "ldg", "ldg", "loop", "loop", "mall", "mnr", "mnr", "mnrs", 
 "mnrs", "mdw", "mdws", "mdws", "mdws", "mdws", "mews", "ml", 
 "mls", "msn", "msn", "mtwy", "mt", "mt", "mt", "mtn", "mtn", 
 "mtn", "mtn", "mtn", "mtn", "mtns", "mtns", "nck", "nck", "orch", 
 "orch", "orch", "oval", "oval", "opas", "park", "park", "park", 
 "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pass", 
 "psge", "path", "path", "pike", "pike", "pne", "pnes", "pnes", 
 "pl", "pln", "pln", "plns", "plns", "plz", "plz", "plz", "pt", 
 "pt", "pts", "pts", "prt", "prt", "prts", "prts", "pr", "pr", 
 "pr", "radl", "radl", "radl", "radl", "ramp", "rnch", "rnch", 
 "rnch", "rnch", "rpd", "rpd", "rpds", "rpds", "rst", "rst", "rdg", 
 "rdg", "rdg", "rdgs", "rdgs", "riv", "riv", "riv", "riv", "rd", 
 "rd", "rds", "rds", "rte", "row", "rue", "run", "shl", "shl", 
 "shls", "shls", "shr", "shr", "shr", "shrs", "shrs", "shrs", 
 "skwy", "spg", "spg", "spg", "spg", "spgs", "spgs", "spgs", "spgs", 
 "spur", "spur", "sq", "sq", "sq", "sq", "sq", "sqs", "sqs", "sta", 
 "sta", "sta", "sta", "stra", "stra", "stra", "stra", "stra", 
 "stra", "stra", "strm", "strm", "strm", "st", "st", "st", "st", 
 "sts", "smt", "ste", "smt", "smt", "smt", "ter", "ter", "ter", 
 "trwy", "trce", "trce", "trce", "trak", "trak", "trak", "trak", 
 "trak", "trfy", "trl", "trl", "trl", "trl", "trlr", "trlr", "trlr", 
 "tunl", "tunl", "tunl", "tunl", "tunl", "tunl", "tpke", "tpke", 
 "tpke", "upas", "un", "un", "uns", "vly", "vly", "vly", "vly", 
 "vlys", "vlys", "via", "via", "via", "via", "vw", "vw", "vws", 
 "vws", "vlg", "vlg", "vlg", "vlg", "vlg", "vlg", "vlgs", "vlgs", 
 "vl", "vl", "vis", "vis", "vis", "vis", "vis", "walk", "walk", 
 "wall", "way", "way", "ways", "wl", "wls", "wls"))

USPS
Run Code Online (Sandbox Code Playgroud)

输出

# A tibble: 503 x 2
   common_abbrev usps_abbrev
   <chr>         <chr>      
 1 allee         aly        
 2 alley         aly        
 3 ally          aly        
 4 aly           aly        
 5 anex          anx        
 6 annex         anx        
 7 annx          anx        
 8 anx           anx        
 9 arc           arc        
10 arcade        arc        
# ... with 493 more rows
Run Code Online (Sandbox Code Playgroud)

步骤 2 现在我们将把你的USPS表转换成一个带有命名元素的向量。

USPSv = array(data = USPS$usps_abbrev, 
              dimnames= list(USPS$common_abbrev))
Run Code Online (Sandbox Code Playgroud)

让我们看看它给我们带来了什么

USPSv['viadct']
# viadct 
#  "via" 

USPSv['coves'] 
# coves 
# "cvs" 
Run Code Online (Sandbox Code Playgroud)

看起来很诱人。

步骤 3 现在让我们创建一个转换(向量化)函数,该函数将USPSv向量与命名元素一起使用。

USPS_conv = function(x) {
  comm = str_split(x, " ") %>% .[[1]] %>% .[length(.)]
  str_replace(x, comm, USPSv[comm])
}
USPS_conv = Vectorize(USPS_conv)
Run Code Online (Sandbox Code Playgroud)

让我们看看我们的USPS_conv工作方式。

USPS_conv("10900 harper coves")
# 10900 harper coves 
# "10900 harper cvs"

USPS_conv("10900 harper viadct")
# 10900 harper viadct 
# "10900 harper via"
Run Code Online (Sandbox Code Playgroud)

很好,但是它能处理向量吗?

USPS_conv(c("10900 harper coves", "10900 harper viadct", "10900 harper ave"))
# 10900 harper coves 10900 harper viadct    10900 harper ave 
# "10900 harper cvs"  "10900 harper via"  "10900 harper ave"   
Run Code Online (Sandbox Code Playgroud)

到目前为止一切都很顺利。

第四步 现在是时候USPS_conv在函数中使用我们的函数了mutate。但是,我们需要一些输入数据。我们将自己生成它们。

n=10
set.seed(1111)
df = tibble(
  addresses = paste(
    sample(10:10000, n, replace = TRUE),
    sample(c("harper", "davis", "van cortland", "marry", "von brown"), n, replace = TRUE),
    sample(USPS$common_abbrev, n, replace = TRUE)
  )
)
df
Run Code Online (Sandbox Code Playgroud)

输出

# A tibble: 10 x 1
   addresses          
   <chr>              
 1 8995 davis crk     
 2 8527 davis tunnl   
 3 7663 von brown wall
 4 3043 harper lake   
 5 9192 von brown grdn
 6 120 marry rvr      
 7 72 von brown locks 
 8 8752 marry gardn   
 9 7754 davis corner  
10 3745 davis jcts  
Run Code Online (Sandbox Code Playgroud)

让我们执行一个突变

df %>% mutate(addresses = USPS_conv(addresses))
Run Code Online (Sandbox Code Playgroud)

输出

# A tibble: 10 x 1
   addresses          
   <chr>              
 1 8995 davis crk     
 2 8527 davis tunl    
 3 7663 von brown wall
 4 3043 harper lk     
 5 9192 von brown gdn 
 6 120 marry riv      
 7 72 von brown lcks  
 8 8752 marry gdn     
 9 7754 davis cor     
10 3745 davis jcts 
Run Code Online (Sandbox Code Playgroud)

看起来还好吗?好像是最多的。

第 5 步 现在是时候对 1,000,000 个地址进行一次伟大的测试了!我们将像以前一样生成数据。

n=1000000
set.seed(1111)
df = tibble(
  addresses = paste(
    sample(10:10000, n, replace = TRUE),
    sample(c("harper", "davis", "van cortland", "marry", "von brown"), n, replace = TRUE),
    sample(USPS$common_abbrev, n, replace = TRUE)
  )
)
df
Run Code Online (Sandbox Code Playgroud)

输出

# A tibble: 1,000,000 x 1
   addresses              
   <chr>                  
 1 8995 marry pass        
 2 8527 davis spng        
 3 7663 marry loaf        
 4 3043 davis common      
 5 9192 marry bnd         
 6 120 von brown corner   
 7 72 van cortland plains 
 8 8752 van cortland crcle
 9 7754 von brown sqrs    
10 3745 marry key         
# ... with 999,990 more rows
Run Code Online (Sandbox Code Playgroud)

那么我们走吧。但让我们立即衡量一下需要多长时间。

start_time =Sys.time()
df %>% mutate(addresses = USPS_conv(addresses))
Sys.time()-start_time
#Time difference of 3.610211 mins
Run Code Online (Sandbox Code Playgroud)

正如你所看到的,我只花了不到 4 分钟。我不知道你是否期待更快的事情,以及你对这次是否满意。我会等待你的评论。

最后一刻更新

事实证明,USPS_conv如果我们稍微改变一下它的代码,就可以稍微加快速度。

USPS_conv2 = function(x) {
  t = str_split(x, " ")
  comm = t[[1]][length(t[[1]])]
  str_replace(x, comm, USPSv[comm])
}
USPS_conv2 = Vectorize(USPS_conv2)
Run Code Online (Sandbox Code Playgroud)

USPS_conv2功能的运行速度稍快一些。

在此输入图像描述

所有这些意味着一百万条记录的突变时间减少到 3.3 分钟。

超级速度的大更新!

我意识到我的第一个版本的答案虽然结构简单,但有点慢:-(。所以我决定想出更快的东西。我将在这里分享我的想法,但请注意,一些解决方案会有点“神奇”。

魔法词典-环境

为了加快操作速度,我们需要创建一个字典,将键快速转换为值。我们将使用 R 中的环境来创建它。

这是我们词典的一个小界面。

#Simple Dictionary (hash Table) Interface for R
ht.create = function() new.env()

ht.insert = function(ht, key, value)  ht[[key]] <- value
ht.insert = Vectorize(ht.insert, c("key", "value"))

ht.lookup = function(ht, key) ht[[key]]
ht.lookup = Vectorize(ht.lookup, "key")

ht.delete = function(ht, key) rm(list=key,envir=ht,inherits=FALSE)
ht.delete = Vectorize(ht.delete, "key")
Run Code Online (Sandbox Code Playgroud)

这是怎么发生的。我已经展示了。下面我将创建一个新的字典环境,ht.create()向其中添加两个元素“a1”和“a2”,ht.insert分别具有值“va1”和“va2”。最后,我将询问我的环境字典这些键的值ht.lookup

ht1 = ht.create()
ht.insert(ht1, "a1", "va1" )
ht1 %>% ht.insert("a2", "va2")
ht.lookup(ht1, "a1")
# a1
# "va1"
ht1 %>% ht.lookup("a2")
# a2
# "va2"
Run Code Online (Sandbox Code Playgroud)

请注意,函数ht.insert ht.lookup是向量化的,这意味着我将能够将整个向量添加到字典中。以同样的方式,我将能够通过给出整个向量来查询我的字典。

ht.insert(ht1, paste0("a", 1:10),paste0("va", 1:10))
ht1 %>% ht.insert( paste0("a", 11:20),paste0("va", 11:20))

ht.lookup(ht1, paste0("a", 10:1))
# a10     a9     a8     a7     a6     a5     a4     a3     a2     a1
# "va10"  "va9"  "va8"  "va7"  "va6"  "va5"  "va4"  "va3"  "va2"  "va1"
ht1 %>% ht.lookup(paste0("a", 20:11))
# a20    a19    a18    a17    a16    a15    a14    a13    a12    a11
# "va20" "va19" "va18" "va17" "va16" "va15" "va14" "va13" "va12" "va11"
Run Code Online (Sandbox Code Playgroud)

魔法属性

现在我们将执行一个函数,将附加属性添加到选定的字典环境表中。

#Functions that add a dictionary attribute to tibble
addHashTable = function(.data, key, value){
  key = enquo(key)
  value = enquo(value)

  if (!all(c(as_label(key), as_label(value)) %in% names(.data))) {
    stop(paste0("`.data` must contain `", as_label(key),
                "` and `", as_label(value), "` columns"))
  }

  if((.data %>% distinct(!!key, !!value) %>% nrow)!=
     (.data %>% distinct(!!key) %>% nrow)){
    warning(paste0(
      "\nThe number of unique values of the ", as_label(key),
      " variable is different\n",
      " from the number of unique values of the ",
      as_label(key), " and ", as_label(value)," pairs!\n",
      "The dictionary will only return the last values for a given key!"))
  }

  ht = ht.create()
  ht %>% ht.insert(.data %>% distinct(!!key, !!value) %>% pull(!!key),
                   .data %>% distinct(!!key, !!value) %>% pull(!!value))
  attr(.data, "hashTab") = ht
  .data
}


addHashTable2 = function(.x, .y, key, value){
  key = enquo(key)
  value = enquo(value)

  if (!all(c(as_label(key), as_label(value)) %in% names(.y))) {
    stop(paste0("`.y` must contain `", as_label(key),
                "` and `", as_label(value), "` columns"))
  }

  if((.y %>% distinct(!!key, !!value) %>% nrow)!=
     (.y %>% distinct(!!key) %>% nrow)){
    warning(paste0(
      "\nThe number of unique values of the ", as_label(key),
      " variable is different\n",
      " from the number of unique values of the ",
      as_label(key), " and ", as_label(value)," pairs!\n",
      "The dictionary will only return the last values for a given key!"))
  }

  ht = ht.create()
  ht %>% ht.insert(.y %>% distinct(!!key, !!value) %>% pull(!!key),
                   .y %>% distinct(!!key, !!value) %>% pull(!!value))
  attr(.x, "hashTab") = ht
  .x
}
Run Code Online (Sandbox Code Playgroud)

实际上有两个功能。该addHashTable函数将字典环境属性添加到从中获取键值对的同一个表中。该addHashTable2函数同样添加到字典环境表中,但从另一个表中检索密钥对。

让我们看看如何addHashTable运作。

USPS = USPS %>% addHashTable(common_abbrev, usps_abbrev)
str(USPS)
# tibble [503 x 2] (S3: tbl_df/tbl/data.frame)
# $ common_abbrev: chr [1:503] "allee" "alley" "ally" "aly" ...
# $ usps_abbrev  : chr [1:503] "aly" "aly" "aly" "aly" ...
# - attr(*, "hashTab")=<environment: 0x000000001591bbf0>
Run Code Online (Sandbox Code Playgroud)

正如您所看到的,表中添加了一个USPS指向0x000000001591bbf0环境的属性。

替换功能

我们需要创建一个函数,该函数将使用以这种方式添加的字典环境来替换(在本例中)指示变量中的最后一个单词与字典中的相应值。这里是。

replaceString = function(.data, value){
  value = enquo(value)

  #Test whether the value variable is in .data
  if(!(as_label(value) %in% names(.data))){
    stop(paste("The", as_label(value),
               "variable does not exist in the .data table!"))
  }

  #Dictionary attribute presence test
  if(!("hashTab" %in% names(attributes(.data)))) {
    stop(paste0(
      "\nThere is no dictionary attribute in the .data table!\n",
      "Use addHashTable or addHashTable2 to add a dictionary attribute."))
  }

  txt = .data %>% pull(!!value)
  i = sapply(strsplit(txt, ""), function(x) max(which(x==" ")))
  txt = paste0(str_sub(txt, end=i),
               ht.lookup(attr(.data, "hashTab"),
                         str_sub(txt, start=i+1)))
  .data %>% mutate(!!value := txt)
}
Run Code Online (Sandbox Code Playgroud)

第一次测试

到了第一篇文字的时间了。为了避免复制代码,我添加了一个小函数,该函数返回一个包含随机选择地址的表。

randomAddresses = function(n){
  tibble(
    addresses = paste(
      sample(10:10000, n, replace = TRUE),
      sample(c("harper", "davis", "van cortland", "marry", "von brown"), n, replace = TRUE),
      sample(USPS$common_abbrev, n, replace = TRUE)
    )
  )
}

set.seed(1111)
df = randomAddresses(10)
df
# # A tibble: 10 x 1
#   addresses
#   <chr>
# 1 74 marry forges
# 2 787 von brown knol
# 3 2755 van cortland summit
# 4 9405 harper plaza
# 5 5376 marry pass
# 6 1857 marry trailer
# 7 9810 von brown drv
# 8 7984 davis garden
# 9 9110 marry alley
# 10 6458 von brown row
Run Code Online (Sandbox Code Playgroud)

是时候使用我们的魔术文本替换功能了。但是,请记住首先将字典环境添加到表中。

df = df %>% addHashTable2(USPS, common_abbrev, usps_abbrev)
df %>% replaceString(addresses)
# A tibble: 10 x 1
#   addresses
#   <chr>
# 1 74 marry frgs
# 2 787 von brown knl
# 3 2755 van cortland smt
# 4 9405 harper plz
# 5 5376 marry pass
# 6 1857 marry trlr
# 7 9810 von brown dr
# 8 7984 davis gdn
# 9 9110 marry aly
# 10 6458 von brown row
Run Code Online (Sandbox Code Playgroud)

看起来确实有效!

大考验

好吧,没有什么可等待的。现在让我们在一个有一百万行的表上尝试一下。让我们立即测量一下绘制地址和添加字典环境需要多长时间。

start_time =Sys.time()
df = randomAddresses(1000000)
df = df %>% addHashTable2(USPS, common_abbrev, usps_abbrev)
Sys.time()-start_time
#Time difference of 1.56609 secs
Run Code Online (Sandbox Code Playgroud)

输出

df
# A tibble: 1,000,000 x 1
#   addresses              
#   <chr>                  
# 1 8995 marry pass        
# 2 8527 davis spng        
# 3 7663 marry loaf        
# 4 3043 davis common      
# 5 9192 marry bnd         
# 6 120 von brown corner   
# 7 72 van cortland plains 
# 8 8752 van cortland crcle
# 9 7754 von brown sqrs    
# 10 3745 marry key         
# # ... with 999,990 more rows
Run Code Online (Sandbox Code Playgroud)

1.6秒可能并不算太多。然而,最大的问题是更换缩写需要多长时间。

start_time =Sys.time()
df = df %>% replaceString(addresses)
Sys.time()-start_time
#Time difference of 8.316476 secs
Run Code Online (Sandbox Code Playgroud)

输出

# A tibble: 1,000,000 x 1
#   addresses            
#   <chr>                
#   1 8995 marry pass      
# 2 8527 davis spg       
# 3 7663 marry lf        
# 4 3043 davis cmn       
# 5 9192 marry bnd       
# 6 120 von brown cor    
# 7 72 van cortland plns 
# 8 8752 van cortland cir
# 9 7754 von brown sqs   
# 10 3745 marry ky        
# # ... with 99


Tho*_*ing 6

更新

\n

这是现有OP问题的基准测试(从@Marek Fio\xc5\x82ka借用测试数据,但带有n <- 10000

\n
> mb1\nUnit: milliseconds\n                              expr       min        lq       mean    median\n          f_MK_conv2(df$addresses) 1409.0643 1470.3992 1612.09037 1631.3014\n f_MK_replaceString(df, addresses)   50.1582   54.3035   94.53149   62.5772\n              f_TIC1(df$addresses)  394.5972  420.3283  461.50675  447.6186\n              f_TIC2(df$addresses) 1579.1868 1852.6873 2052.28388 1964.8845\n              f_TIC3(df$addresses)   65.8436   71.5448   93.36210   84.9698\n        uq       max neval\n 1710.3459 1898.6773    20\n  116.3108  264.2616    20\n  499.4052  626.9240    20\n 2246.5562 2916.2253    20\n  102.7689  183.5121    20\n
Run Code Online (Sandbox Code Playgroud)\n

在此输入图像描述

\n

其中基准代码给出如下

\n
f_MK_conv2 <- function(x) {\n  USPSv <- array(\n    data = USPS$usps_abbrev,\n    dimnames = list(USPS$common_abbrev)\n  )\n  USPS_conv2 <- function(x) {\n    t <- str_split(x, " ")\n    comm <- t[[1]][length(t[[1]])]\n    str_replace(x, comm, USPSv[comm])\n  }\n  Vectorize(USPS_conv2)(x)\n}\n\nf_MK_replaceString <- function(.data, value) {\n  ht.create <- function() new.env()\n\n  ht.insert <- function(ht, key, value) ht[[key]] <- value\n  ht.insert <- Vectorize(ht.insert, c("key", "value"))\n\n  ht.lookup <- function(ht, key) ht[[key]]\n  ht.lookup <- Vectorize(ht.lookup, "key")\n\n  ht.delete <- function(ht, key) rm(list = key, envir = ht, inherits = FALSE)\n  ht.delete <- Vectorize(ht.delete, "key")\n\n  addHashTable2 <- function(.x, .y, key, value) {\n    key <- enquo(key)\n    value <- enquo(value)\n\n    if (!all(c(as_label(key), as_label(value)) %in% names(.y))) {\n      stop(paste0(\n        "`.y` must contain `", as_label(key),\n        "` and `", as_label(value), "` columns"\n      ))\n    }\n\n    if ((.y %>% distinct(!!key, !!value) %>% nrow()) !=\n      (.y %>% distinct(!!key) %>% nrow())) {\n      warning(paste0(\n        "\\nThe number of unique values of the ", as_label(key),\n        " variable is different\\n",\n        " from the number of unique values of the ",\n        as_label(key), " and ", as_label(value), " pairs!\\n",\n        "The dictionary will only return the last values for a given key!"\n      ))\n    }\n\n    ht <- ht.create()\n    ht %>% ht.insert(\n      .y %>% distinct(!!key, !!value) %>% pull(!!key),\n      .y %>% distinct(!!key, !!value) %>% pull(!!value)\n    )\n    attr(.x, "hashTab") <- ht\n    .x\n  }\n\n  .data <- .data %>% addHashTable2(USPS, common_abbrev, usps_abbrev)\n\n  value <- enquo(value)\n  # Test whether the value variable is in .data\n  if (!(as_label(value) %in% names(.data))) {\n    stop(paste(\n      "The", as_label(value),\n      "variable does not exist in the .data table!"\n    ))\n  }\n\n  # Dictionary attribute presence test\n  if (!("hashTab" %in% names(attributes(.data)))) {\n    stop(paste0(\n      "\\nThere is no dictionary attribute in the .data table!\\n",\n      "Use addHashTable or addHashTable2 to add a dictionary attribute."\n    ))\n  }\n\n  txt <- .data %>% pull(!!value)\n  i <- sapply(strsplit(txt, ""), function(x) max(which(x == " ")))\n  txt <- paste0(\n    str_sub(txt, end = i),\n    ht.lookup(\n      attr(.data, "hashTab"),\n      str_sub(txt, start = i + 1)\n    )\n  )\n  .data %>% mutate(!!value := txt)\n}\n\nf_TIC1 <- function(x) {\n  sapply(\n    strsplit(x, " "),\n    function(x) {\n      with(USPS, {\n        idx <- match(x, common_abbrev)\n        paste0(ifelse(is.na(idx), x, usps_abbrev[idx]),\n          collapse = " "\n        )\n      })\n    }\n  )\n}\n\nf_TIC2 <- function(x) {\n  res <- c()\n  for (s in x) {\n    v <- unlist(strsplit(s, "\\\\W+"))\n    for (p in v) {\n      k <- match(p, USPS$common_abbrev)\n      if (!is.na(k)) {\n        s <- with(\n          USPS,\n          gsub(\n            sprintf("\\\\b%s\\\\b", common_abbrev[k]),\n            usps_abbrev[k],\n            s\n          )\n        )\n      }\n    }\n    res <- append(res, s)\n  }\n  res\n}\n\nf_TIC3 <- function(x) {\n  x.split <- strsplit(x, " ")\n  lut <- with(USPS, setNames(usps_abbrev, common_abbrev))\n  grp <- rep(seq_along(x.split), lengths(x.split))\n  xx <- unlist(x.split)\n  r <- lut[xx]\n  tapply(\n    replace(xx, !is.na(r), na.omit(r)),\n    grp,\n    function(s) paste0(s, collapse = " ")\n  )\n}\n\nf_TIC4 <- function(x) {\n  xb <- gsub("^.*\\\\s+", "", x)\n  rp <- with(USPS, usps_abbrev[match(xb, common_abbrev)])\n  paste0(gsub("\\\\w+$", "", x), replace(xb, !is.na(rp), na.omit(rp)))\n}\n\nf_JM <- function(x) {\n  x$abbreviation <- gsub("^.* ", "", x$addresses)\n  setDT(x)\n  setDT(USPS)\n  x[USPS, abbreviation := usps_abbrev, on = .(abbreviation = common_abbrev)]\n\n  x$usps_abbreviation <- paste(str_extract(x$addresses, "^.* "), x$abbreviation, sep = "")\n}\n\nset.seed(1111)\ndf <- randomAddresses(10000)\n\nlibrary(microbenchmark)\nmb1 <- microbenchmark(\n  f_MK_conv2(df$addresses),\n  f_MK_replaceString(df, addresses),\n  f_JM(df),\n  f_TIC1(df$addresses),\n  f_TIC2(df$addresses),\n  f_TIC3(df$addresses),\n  f_TIC4(df$addresses),\n  times = 20L\n)\nggplot2::autoplot(mb1)\n
Run Code Online (Sandbox Code Playgroud)\n
\n

可能的解决方案

\n

也许以下基本 R 选项之一可以提供帮助

\n
    \n
  • 解决方案1
  • \n
\n
f_TIC1 <- function(x) {\n  sapply(\n    strsplit(x, " "),\n    function(x) {\n      with(USPS, {\n        idx <- match(x, common_abbrev)\n        paste0(ifelse(is.na(idx), x, usps_abbrev[idx]),\n          collapse = " "\n        )\n      })\n    }\n  )\n}\n
Run Code Online (Sandbox Code Playgroud)\n
    \n
  • 解决方案2
  • \n
\n
\nf_TIC2 <- function(x) {\n  res <- c()\n  for (s in x) {\n    v <- unlist(strsplit(s, "\\\\W+"))\n    for (p in v) {\n      k <- match(p, USPS$common_abbrev)\n      if (!is.na(k)) {\n        s <- with(\n          USPS,\n          gsub(\n            sprintf("\\\\b%s\\\\b", common_abbrev[k]),\n            usps_abbrev[k],\n            s\n          )\n        )\n      }\n    }\n    res <- append(res, s)\n  }\n  res\n}\n
Run Code Online (Sandbox Code Playgroud)\n
    \n
  • 解决方案3
  • \n
\n
\nf_TIC3 <- function(x) {\n  x.split <- strsplit(x, " ")\n  lut <- with(USPS, setNames(usps_abbrev, common_abbrev))\n  grp <- rep(seq_along(x.split), lengths(x.split))\n  xx <- unlist(x.split)\n  r <- lut[xx]\n  tapply(\n    replace(xx, !is.na(r), na.omit(r)),\n    grp,\n    function(s) paste0(s, collapse = " ")\n  )\n}\n
Run Code Online (Sandbox Code Playgroud)\n
    \n
  • 解决方案4(这是针对特殊情况,即仅最后一个单词的缩写
  • \n
\n
f_TIC4 <- function(x) {\n  xb <- gsub("^.*\\\\s+", "", x)\n  rp <- with(USPS, usps_abbrev[match(xb, common_abbrev)])\n  paste0(gsub("\\\\w+$", "", x), replace(xb, !is.na(rp), na.omit(rp)))\n}\n
Run Code Online (Sandbox Code Playgroud)\n
\n

输出

\n
[1] "10900 harper ave"     "12235 davis anx"      "24 van cortland pkwy"\n
Run Code Online (Sandbox Code Playgroud)\n


jar*_*rot 5

更新:

\n

我花了一些时间调整我现有的答案(如下),我相信这是最快的方法。另外,值得注意的是,如果您添加perl = TRUE到 f_JM 和 TIC4 中的 gsub,则此示例的速度会显着提高(可能不适用于“现实世界”数据)。我的答案还有一个重要的警告,因为它是基于缩写术语是地址中的最后一个术语(例如 TIC1、TIC2 和 TIC3 不依赖于该假设)。

\n

非常感谢@Marek 和@TIC 的基准测试代码和建设性意见:

\n
## Benchmarking with updated f_JM() and TIC4()\nlibrary(data.table)\nlibrary(tidyverse)\n\nUSPS = tibble(\n  common_abbrev = c("allee", "alley", "ally", "aly",\n                    "anex", "annex", "annx", "anx", "arc", "arcade", "av", "ave",\n                    "aven", "avenu", "avenue", "avn", "avnue", "bayoo", "bayou",\n                    "bch", "beach", "bend", "bnd", "blf", "bluf", "bluff", "bluffs",\n                    "bot", "btm", "bottm", "bottom", "blvd", "boul", "boulevard",\n                    "boulv", "br", "brnch", "branch", "brdge", "brg", "bridge", "brk",\n                    "brook", "brooks", "burg", "burgs", "byp", "bypa", "bypas", "bypass",\n                    "byps", "camp", "cp", "cmp", "canyn", "canyon", "cnyn", "cape",\n                    "cpe", "causeway", "causwa", "cswy", "cen", "cent", "center",\n                    "centr", "centre", "cnter", "cntr", "ctr", "centers", "cir",\n                    "circ", "circl", "circle", "crcl", "crcle", "circles", "clf",\n                    "cliff", "clfs", "cliffs", "clb", "club", "common", "commons",\n                    "cor", "corner", "corners", "cors", "course", "crse", "court",\n                    "ct", "courts", "cts", "cove", "cv", "coves", "creek", "crk",\n                    "crescent", "cres", "crsent", "crsnt", "crest", "crossing", "crssng",\n                    "xing", "crossroad", "crossroads", "curve", "dale", "dl", "dam",\n                    "dm", "div", "divide", "dv", "dvd", "dr", "driv", "drive", "drv",\n                    "drives", "est", "estate", "estates", "ests", "exp", "expr",\n                    "express", "expressway", "expw", "expy", "ext", "extension",\n                    "extn", "extnsn", "exts", "fall", "falls", "fls", "ferry", "frry",\n                    "fry", "field", "fld", "fields", "flds", "flat", "flt", "flats",\n                    "flts", "ford", "frd", "fords", "forest", "forests", "frst",\n                    "forg", "forge", "frg", "forges", "fork", "frk", "forks", "frks",\n                    "fort", "frt", "ft", "freeway", "freewy", "frway", "frwy", "fwy",\n                    "garden", "gardn", "grden", "grdn", "gardens", "gdns", "grdns",\n                    "gateway", "gatewy", "gatway", "gtway", "gtwy", "glen", "gln",\n                    "glens", "green", "grn", "greens", "grov", "grove", "grv", "groves",\n                    "harb", "harbor", "harbr", "hbr", "hrbor", "harbors", "haven",\n                    "hvn", "ht", "hts", "highway", "highwy", "hiway", "hiwy", "hway",\n                    "hwy", "hill", "hl", "hills", "hls", "hllw", "hollow", "hollows",\n                    "holw", "holws", "inlt", "is", "island", "islnd", "islands",\n                    "islnds", "iss", "isle", "isles", "jct", "jction", "jctn", "junction",\n                    "junctn", "juncton", "jctns", "jcts", "junctions", "key", "ky",\n                    "keys", "kys", "knl", "knol", "knoll", "knls", "knolls", "lk",\n                    "lake", "lks", "lakes", "land", "landing", "lndg", "lndng", "lane",\n                    "ln", "lgt", "light", "lights", "lf", "loaf", "lck", "lock",\n                    "lcks", "locks", "ldg", "ldge", "lodg", "lodge", "loop", "loops",\n                    "mall", "mnr", "manor", "manors", "mnrs", "meadow", "mdw", "mdws",\n                    "meadows", "medows", "mews", "mill", "mills", "missn", "mssn",\n                    "motorway", "mnt", "mt", "mount", "mntain", "mntn", "mountain",\n                    "mountin", "mtin", "mtn", "mntns", "mountains", "nck", "neck",\n                    "orch", "orchard", "orchrd", "oval", "ovl", "overpass", "park",\n                    "prk", "parks", "parkway", "parkwy", "pkway", "pkwy", "pky",\n                    "parkways", "pkwys", "pass", "passage", "path", "paths", "pike",\n                    "pikes", "pine", "pines", "pnes", "pl", "plain", "pln", "plains",\n                    "plns", "plaza", "plz", "plza", "point", "pt", "points", "pts",\n                    "port", "prt", "ports", "prts", "pr", "prairie", "prr", "rad",\n                    "radial", "radiel", "radl", "ramp", "ranch", "ranches", "rnch",\n                    "rnchs", "rapid", "rpd", "rapids", "rpds", "rest", "rst", "rdg",\n                    "rdge", "ridge", "rdgs", "ridges", "riv", "river", "rvr", "rivr",\n                    "rd", "road", "roads", "rds", "route", "row", "rue", "run", "shl",\n                    "shoal", "shls", "shoals", "shoar", "shore", "shr", "shoars",\n                    "shores", "shrs", "skyway", "spg", "spng", "spring", "sprng",\n                    "spgs", "spngs", "springs", "sprngs", "spur", "spurs", "sq",\n                    "sqr", "sqre", "squ", "square", "sqrs", "squares", "sta", "station",\n                    "statn", "stn", "stra", "strav", "straven", "stravenue", "stravn",\n                    "strvn", "strvnue", "stream", "streme", "strm", "street", "strt",\n                    "st", "str", "streets", "smt", "suite", "sumit", "sumitt", "summit",\n                    "ter", "terr", "terrace", "throughway", "trace", "traces", "trce",\n                    "track", "tracks", "trak", "trk", "trks", "trafficway", "trail",\n                    "trails", "trl", "trls", "trailer", "trlr", "trlrs", "tunel",\n                    "tunl", "tunls", "tunnel", "tunnels", "tunnl", "trnpk", "turnpike",\n                    "turnpk", "underpass", "un", "union", "unions", "valley", "vally",\n                    "vlly", "vly", "valleys", "vlys", "vdct", "via", "viadct", "viaduct",\n                    "view", "vw", "views", "vws", "vill", "villag", "village", "villg",\n                    "villiage", "vlg", "villages", "vlgs", "ville", "vl", "vis",\n                    "vist", "vista", "vst", "vsta", "walk", "walks", "wall", "wy",\n                    "way", "ways", "well", "wells", "wls"),\n  usps_abbrev = c("aly",\n                  "aly", "aly", "aly", "anx", "anx", "anx", "anx", "arc", "arc",\n                  "ave", "ave", "ave", "ave", "ave", "ave", "ave", "byu", "byu",\n                  "bch", "bch", "bnd", "bnd", "blf", "blf", "blf", "blfs", "btm",\n                  "btm", "btm", "btm", "blvd", "blvd", "blvd", "blvd", "br", "br",\n                  "br", "brg", "brg", "brg", "brk", "brk", "brks", "bg", "bgs",\n                  "byp", "byp", "byp", "byp", "byp", "cp", "cp", "cp", "cyn", "cyn",\n                  "cyn", "cpe", "cpe", "cswy", "cswy", "cswy", "ctr", "ctr", "ctr",\n                  "ctr", "ctr", "ctr", "ctr", "ctr", "ctrs", "cir", "cir", "cir",\n                  "cir", "cir", "cir", "cirs", "clf", "clf", "clfs", "clfs", "clb",\n                  "clb", "cmn", "cmns", "cor", "cor", "cors", "cors", "crse", "crse",\n                  "ct", "ct", "cts", "cts", "cv", "cv", "cvs", "crk", "crk", "cres",\n                  "cres", "cres", "cres", "crst", "xing", "xing", "xing", "xrd",\n                  "xrds", "curv", "dl", "dl", "dm", "dm", "dv", "dv", "dv", "dv",\n                  "dr", "dr", "dr", "dr", "drs", "est", "est", "ests", "ests",\n                  "expy", "expy", "expy", "expy", "expy", "expy", "ext", "ext",\n                  "ext", "ext", "exts", "fall", "fls", "fls", "fry", "fry", "fry",\n                  "fld", "fld", "flds", "flds", "flt", "flt", "flts", "flts", "frd",\n                  "frd", "frds", "frst", "frst", "frst", "frg", "frg", "frg", "frgs",\n                  "frk", "frk", "frks", "frks", "ft", "ft", "ft", "fwy", "fwy",\n                  "fwy", "fwy", "fwy", "gdn", "gdn", "gdn", "gdn", "gdns", "gdns",\n                  "gdns", "gtwy", "gtwy", "gtwy", "gtwy", "gtwy", "gln", "gln",\n                  "glns", "grn", "grn", "grns", "grv", "grv", "grv", "grvs", "hbr",\n                  "hbr", "hbr", "hbr", "hbr", "hbrs", "hvn", "hvn", "hts", "hts",\n                  "hwy", "hwy", "hwy", "hwy", "hwy", "hwy", "hl", "hl", "hls",\n                  "hls", "holw", "holw", "holw", "holw", "holw", "inlt", "is",\n                  "is", "is", "iss", "iss", "iss", "isle", "isle", "jct", "jct",\n                  "jct", "jct", "jct", "jct", "jcts", "jcts", "jcts", "ky", "ky",\n                  "kys", "kys", "knl", "knl", "knl", "knls", "knls", "lk", "lk",\n                  "lks", "lks", "land", "lndg", "lndg", "lndg", "ln", "ln", "lgt",\n                  "lgt", "lgts", "lf", "lf", "lck", "lck", "lcks", "lcks", "ldg",\n                  "ldg", "ldg", "ldg", "loop", "loop", "mall", "mnr", "mnr", "mnrs",\n                  "mnrs", "mdw", "mdws", "mdws", "mdws", "mdws", "mews", "ml",\n                  "mls", "msn", "msn", "mtwy", "mt", "mt", "mt", "mtn", "mtn",\n                  "mtn", "mtn", "mtn", "mtn", "mtns", "mtns", "nck", "nck", "orch",\n                  "orch", "orch", "oval", "oval", "opas", "park", "park", "park",\n                  "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pass",\n                  "psge", "path", "path", "pike", "pike", "pne", "pnes", "pnes",\n                  "pl", "pln", "pln", "plns", "plns", "plz", "plz", "plz", "pt",\n                  "pt", "pts", "pts", "prt", "prt", "prts", "prts", "pr", "pr",\n                  "pr", "radl", "radl", "radl", "radl", "ramp", "rnch", "rnch",\n                  "rnch", "rnch", "rpd", "rpd", "rpds", "rpds", "rst", "rst", "rdg",\n                  "rdg", "rdg", "rdgs", "rdgs", "riv", "riv", "riv", "riv", "rd",\n                  "rd", "rds", "rds", "rte", "row", "rue", "run", "shl", "shl",\n                  "shls", "shls", "shr", "shr", "shr", "shrs", "shrs", "shrs",\n                  "skwy", "spg", "spg", "spg", "spg", "spgs", "spgs", "spgs", "spgs",\n                  "spur", "spur", "sq", "sq", "sq", "sq", "sq", "sqs", "sqs", "sta",\n                  "sta", "sta", "sta", "stra", "stra", "stra", "stra", "stra",\n                  "stra", "stra", "strm", "strm", "strm", "st", "st", "st", "st",\n                  "sts", "smt", "ste", "smt", "smt", "smt", "ter", "ter", "ter",\n                  "trwy", "trce", "trce", "trce", "trak", "trak", "trak", "trak",\n                  "trak", "trfy", "trl", "trl", "trl", "trl", "trlr", "trlr", "trlr",\n                  "tunl", "tunl", "tunl", "tunl", "tunl", "tunl", "tpke", "tpke",\n                  "tpke", "upas", "un", "un", "uns", "vly", "vly", "vly", "vly",\n                  "vlys", "vlys", "via", "via", "via", "via", "vw", "vw", "vws",\n                  "vws", "vlg", "vlg", "vlg", "vlg", "vlg", "vlg", "vlgs", "vlgs",\n                  "vl", "vl", "vis", "vis", "vis", "vis", "vis", "walk", "walk",\n                  "wall", "way", "way", "ways", "wl", "wls", "wls"))\n\nrandomAddresses = function(n){\n  tibble(\n    addresses = paste(\n      sample(10:10000, n, replace = TRUE),\n      sample(c("harper", "davis", "van cortland", "marry", "von brown"), n, replace = TRUE),\n      sample(USPS$common_abbrev, n, replace = TRUE)\n    )\n  )\n}\n\nset.seed(1111)\ndf = randomAddresses(10)\n\nUSPS_conv2 = function(x, y) {\n  t = str_split(x, " ")\n  comm = t[[1]][length(t[[1]])]\n  str_replace(x, comm, y[comm])\n}\nUSPS_conv2 = Vectorize(USPS_conv2, "x")\n\nf_MK_conv2 <- function(x, y) {\n  x %>% mutate(\n    addresses = USPS_conv2(addresses, \n                           array(data = y$usps_abbrev, dimnames = list(y$common_abbrev))))\n}\nf_MK_conv2(df, USPS)\n#> # A tibble: 10 \xc3\x97 1\n#>    addresses          \n#>    <chr>              \n#>  1 8995 davis crk     \n#>  2 8527 davis tunl    \n#>  3 7663 von brown wall\n#>  4 3043 harper lk     \n#>  5 9192 von brown gdn \n#>  6 120 marry riv      \n#>  7 72 von brown lcks  \n#>  8 8752 marry gdn     \n#>  9 7754 davis cor     \n#> 10 3745 davis jcts\n\n\nht.create <- function() new.env()\n\nht.insert <- function(ht, key, value) ht[[key]] <- value\nht.insert <- Vectorize(ht.insert, c("key", "value"))\n\nht.lookup <- function(ht, key) ht[[key]]\nht.lookup <- Vectorize(ht.lookup, "key")\n\nht.delete <- function(ht, key) rm(list = key, envir = ht, inherits = FALSE)\nht.delete <- Vectorize(ht.delete, "key")\n\n\nf_MK_replaceString <- function(x, y) {\n  ht <- ht.create()\n  ht.insert(ht, y$common_abbrev, y$usps_abbrev)\n  \n  txt <- x$addresses\n  i <- sapply(strsplit(txt, ""), function(x) max(which(x == " ")))\n  txt <- paste0(\n    str_sub(txt, end = i),\n    ht.lookup(ht, str_sub(txt, start = i + 1))\n  )\n  x %>% mutate(addresses = txt)\n}\nf_MK_replaceString(df, USPS)\n#> # A tibble: 10 \xc3\x97 1\n#>    addresses          \n#>    <chr>              \n#>  1 8995 davis crk     \n#>  2 8527 davis tunl    \n#>  3 7663 von brown wall\n#>  4 3043 harper lk     \n#>  5 9192 von brown gdn \n#>  6 120 marry riv      \n#>  7 72 von brown lcks  \n#>  8 8752 marry gdn     \n#>  9 7754 davis cor     \n#> 10 3745 davis jcts\n\nf_TIC1 <- function(x, y) {\n  x %>% mutate(addresses = sapply(\n    strsplit(x$addresses, " "),\n    function(x) {\n      with(y, {\n        idx <- match(x, common_abbrev)\n        paste0(ifelse(is.na(idx), x, usps_abbrev[idx]),\n               collapse = " "\n        )\n      })\n    }\n  )\n  )\n}\nf_TIC1(df, USPS)\n#> # A tibble: 10 \xc3\x97 1\n#>    addresses          \n#>    <chr>              \n#>  1 8995 davis crk     \n#>  2 8527 davis tunl    \n#>  3 7663 von brown wall\n#>  4 3043 harper lk     \n#>  5 9192 von brown gdn \n#>  6 120 marry riv      \n#>  7 72 von brown lcks  \n#>  8 8752 marry gdn     \n#>  9 7754 davis cor     \n#> 10 3745 davis jcts\n\n\nf_TIC2 <- function(x, y) {\n  res <- c()\n  for (s in x$addresses) {\n    v <- unlist(strsplit(s, "\\\\W+"))\n    for (p in v) {\n      k <- match(p, y$common_abbrev)\n      if (!is.na(k)) {\n        s <- with(\n          y,\n          gsub(\n            sprintf("\\\\b%s\\\\b", common_abbrev[k]),\n            usps_abbrev[k],\n            s\n          )\n        )\n      }\n    }\n    res <- append(res, s)\n  }\n  x %>% mutate(addresses = res)\n}\nf_TIC2(df, USPS)\n#> # A tibble: 10 \xc3\x97 1\n#>    addresses          \n#>    <chr>              \n#>  1 8995 davis crk     \n#>  2 8527 davis tunl    \n#>  3 7663 von brown wall\n#>  4 3043 harper lk     \n#>  5 9192 von brown gdn \n#>  6 120 marry riv      \n#>  7 72 von brown lcks  \n#>  8 8752 marry gdn     \n#>  9 7754 davis cor     \n#> 10 3745 davis jcts\n\n\nf_TIC3 <- function(x, y) {\n  x.split <- strsplit(x$addresses, " ")\n  lut <- with(y, setNames(usps_abbrev, common_abbrev))\n  grp <- rep(seq_along(x.split), lengths(x.split))\n  xx <- unlist(x.split)\n  r <- lut[xx]\n  x %>% mutate(addresses = tapply(\n    replace(xx, !is.na(r), na.omit(r)),\n    grp,\n    function(s) paste0(s, collapse = " ")\n  ))\n}\nf_TIC3(df, USPS)\n#> # A tibble: 10 \xc3\x97 1\n#>    addresses          \n#>    <chr>              \n#>  1 8995 davis crk     \n#>  2 8527 davis tunl    \n#>  3 7663 von brown wall\n#>  4 3043 harper lk     \n#>  5 9192 von brown gdn \n#>  6 120 marry riv      \n#>  7 72 von brown lcks  \n#>  8 8752 marry gdn     \n#>  9 7754 davis cor     \n#> 10 3745 davis jcts\n\nf_TIC4 <- function(x, y) {\n  xb <- gsub("^.*\\\\s+", "", x$addresses, perl = TRUE)\n  rp <- with(USPS, usps_abbrev[match(xb, common_abbrev)])\n  x %>% mutate(addresses = paste0(gsub("\\\\w+$", "", x$addresses), replace(xb, !is.na(rp), na.omit(rp))))\n}\nf_TIC4(df, USPS)\n#> # A tibble: 10 \xc3\x97 1\n#>    addresses          \n#>    <chr>              \n#>  1 8995 davis crk     \n#>  2 8527 davis tunl    \n#>  3 7663 von brown wall\n#>  4 3043 harper lk     \n#>  5 9192 von brown gdn \n#>  6 120 marry riv      \n#>  7 72 von brown lcks  \n#>  8 8752 marry gdn     \n#>  9 7754 davis cor     \n#> 10 3745 davis jcts\n\nf_JM <- function(x, y) {\n  x$abbreviation <- gsub("^.* ", "", x$addresses, perl = TRUE)\n  setDT(x)\n  setDT(y)\n  x[y, abbreviation := usps_abbrev, on = .(abbreviation = common_abbrev)]\n  x$addresses <- paste(str_extract(x$addresses, "^.* "), x$abbreviation, sep = "")\n  x$abbreviation <- NULL\n  return(as_tibble(x))\n}\nf_JM(df, USPS)\n#> # A tibble: 10 \xc3\x97 1\n#>    addresses          \n#>    <chr>              \n#>  1 8995 davis crk     \n#>  2 8527 davis tunl    \n#>  3 7663 von brown wall\n#>  4 3043 harper lk     \n#>  5 9192 von brown gdn \n#>  6 120 marry riv      \n#>  7 72 von brown lcks  \n#>  8 8752 marry gdn     \n#>  9 7754 davis cor     \n#> 10 3745 davis jcts\n\nset.seed(1111)\ndf = randomAddresses(100)\n\nlibrary(microbenchmark)\nmb1 <- microbenchmark(\n  f_MK_conv2(df, USPS),\n  f_MK_replaceString(df, USPS),\n  f_TIC1(df, USPS),\n  f_TIC2(df, USPS),\n  f_TIC3(df, USPS),\n  f_TIC4(df, USPS),\n  f_JM(df, USPS),\n  times = 20L\n)\nggplot2::autoplot(mb1)\n#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.\n
Run Code Online (Sandbox Code Playgroud)\n

\n
set.seed(1111)\ndf = randomAddresses(1000)\n\nlibrary(microbenchmark)\nmb1 <- microbenchmark(\n  f_MK_conv2(df, USPS),\n  f_MK_replaceString(df, USPS),\n  f_TIC1(df, USPS),\n  f_TIC2(df, USPS),\n  f_TIC3(df, USPS),\n  f_TIC4(df, USPS),\n  f_JM(df, USPS),\n  times = 20L\n)\nggplot2::autoplot(mb1)\n#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.\n
Run Code Online (Sandbox Code Playgroud)\n

\n
set.seed(1111)\ndf = randomAddresses(10000)\n\nlibrary(microbenchmark)\nmb1 <- microbenchmark(\n  f_MK_conv2(df, USPS),\n  f_MK_replaceString(df, USPS),\n  f_TIC1(df, USPS),\n  f_TIC2(df, USPS),\n  f_TIC3(df, USPS),\n  f_TIC4(df, USPS),\n  f_JM(df, USPS),\n  times = 20L\n)\nggplot2::autoplot(mb1)\n#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.\n
Run Code Online (Sandbox Code Playgroud)\n

\n
set.seed(1111)\ndf = randomAddresses(100000)\n\nlibrary(microbenchmark)\nmb1 <- microbenchmark(\n  f_MK_replaceString(df, USPS),\n  f_TIC3(df, USPS),\n  f_TIC4(df, USPS),\n  f_JM(df, USPS),\n  times = 20L\n)\nggplot2::autoplot(mb1)\n#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.\n
Run Code Online (Sandbox Code Playgroud)\n

\n
set.seed(1111)\ndf = randomAddresses(1000000)\n\nlibrary(microbenchmark)\nmb1 <- microbenchmark(\n  f_MK_replaceString(df, USPS),\n  f_TIC4(df, USPS),\n  f_JM(df, USPS),\n  times = 20L\n)\nggplot2::autoplot(mb1)\n#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.\n
Run Code Online (Sandbox Code Playgroud)\n

\n

由reprex 包于 2021 年 11 月 4 日创建(v2.0.1)

\n

原来的:

\n

精彩的答案@Marek@TIC!经过一些调整和基准测试后,我认为这种 data.table \'split/lookup-replace/paste\' 方法可能会更快:

\n
library(tidyverse)\nlibrary(data.table)\n\nn=1000000\nset.seed(1111)\ndf = tibble(\n  add