jva*_*nti 7 geocoding r list dataframe split-apply-combine
我已经从人口普查局下载了美国所有城镇等的列表。这是一个随机样本:
dput(somewhere)
structure(list(state = structure(c(30L, 31L, 5L, 31L, 24L, 36L,
13L, 21L, 6L, 10L, 31L, 28L, 10L, 5L, 5L, 8L, 23L, 11L, 34L,
19L, 29L, 4L, 24L, 13L, 21L, 31L, 2L, 3L, 29L, 24L, 1L, 13L,
15L, 10L, 11L, 33L, 35L, 8L, 11L, 12L, 36L, 28L, 9L, 31L, 8L,
14L, 11L, 12L, 36L, 13L, 8L, 5L, 29L, 8L, 7L, 23L, 25L, 39L,
16L, 28L, 10L, 29L, 26L, 8L, 32L, 40L, 28L, 23L, 37L, 31L, 18L,
5L, 1L, 31L, 18L, 13L, 11L, 10L, 25L, 18L, 21L, 18L, 11L, 35L,
31L, 36L, 20L, 19L, 38L, 2L, 40L, 13L, 36L, 11L, 29L, 27L, 22L,
17L, 12L, 20L), .Label = c("ak", "al", "ar", "az", "ca", "co",
"fl", "ga", "hi", "ia", "il", "in", "ks", "ky", "la", "md", "mi",
"mo", "ms", "mt", "nc", "nd", "ne", "nj", "nm", "nv", "ny", "oh",
"ok", "or", "pa", "pr", "ri", "sc", "sd", "tx", "ut", "va", "wa",
"wi"), class = "factor"), geoid = c(4120100L, 4280962L, 668028L,
4243944L, 3460600L, 4871948L, 2046000L, 3747695L, 839965L, 1909910L,
4244824L, 3902204L, 1963750L, 622360L, 669088L, 1382972L, 3125230L,
1722736L, 4539265L, 2804705L, 4039625L, 451465L, 3467020L, 2077150L,
3765260L, 4221792L, 133904L, 566320L, 4033150L, 3463180L, 223460L,
2013675L, 2232405L, 1951600L, 1752142L, 4445010L, 4655684L, 1336416L,
1729080L, 1840842L, 4804672L, 3932207L, 1523675L, 4260384L, 1321912L,
2159232L, 1735307L, 1867176L, 4839304L, 2057350L, 1309656L, 655380L,
4082250L, 1350680L, 1275475L, 3147745L, 3505010L, 5352285L, 2483337L,
3909834L, 1912945L, 4068200L, 3227900L, 1366304L, 7286831L, 5505350L,
3982390L, 3149915L, 4974480L, 4249440L, 2943346L, 677430L, 280770L,
4247872L, 2902242L, 2039075L, 1735281L, 1932565L, 3580120L, 2973852L,
3722620L, 2943238L, 1755938L, 4643100L, 4251904L, 4830920L, 3056575L,
2801940L, 5156048L, 137000L, 5508925L, 2057300L, 4861172L, 1736477L,
4021200L, 3677783L, 3832060L, 2614900L, 1820332L, 3043750L),
ansicode = c(2410344L, 2390453L, 2411793L, 1214759L, 885360L,
2412035L, 485621L, 2403359L, 2412812L, 2583481L, 2390095L,
2397971L, 2396237L, 2585422L, 2411819L, 2405746L, 2398338L,
2394628L, 2812929L, 2586582L, 2408478L, 2582836L, 885393L,
2397270L, 2402898L, 2584453L, 2405811L, 2405518L, 2412737L,
2389752L, 2418574L, 2393549L, 2402559L, 2629970L, 2399453L,
2378109L, 2812999L, 2402563L, 2398956L, 2396699L, 2409759L,
2393028L, 2414061L, 2805542L, 2404192L, 2404475L, 2398514L,
2629884L, 2408486L, 2396265L, 2405306L, 2411363L, 2413515L,
2405064L, 2402989L, 2583899L, 2629103L, 2585016L, 2390487L,
2397481L, 2393811L, 2413298L, 2583927L, 2812702L, 2415078L,
1582764L, 2400116L, 2400036L, 2412013L, 2633665L, 2787908L,
2583158L, 2418866L, 1214943L, 2393998L, 485611L, 2398513L,
2394969L, 2806756L, 2397053L, 2406485L, 2395719L, 2399572L,
1267480L, 2389516L, 2410660L, 2409026L, 2806379L, 2584894L,
2404746L, 2586459L, 2396263L, 2411528L, 2398556L, 2412443L,
2584298L, 1036064L, 2806333L, 2396920L, 2804282L), city = c("donald",
"warminster heights", "san juan capistrano", "littlestown",
"port republic", "taylor", "merriam", "northlakes", "julesburg",
"california junction", "lower allen", "antwerp", "pleasantville",
"el rancho", "santa clarita", "willacoochee", "kennard",
"effingham", "la france", "beechwood", "keys", "orange grove mobile manor",
"shiloh", "west mineral", "stony point", "east salem", "heath",
"stamps", "haworth", "rio grande", "ester", "clayton", "hackberry",
"middle amana", "new baden", "melville", "rolland colony",
"hannahs mill", "germantown hills", "la fontaine", "aurora",
"green meadows", "kaiminani", "pinecroft", "dawson", "park city",
"hinsdale", "st. meinrad", "kingsland", "powhattan", "bowersville",
"palos verdes estates", "wyandotte", "meigs", "waverly",
"sunol", "arroyo hondo", "outlook", "west pocomoke", "buchtel",
"chatsworth", "smith village", "glenbrook", "rock spring",
"villalba", "bayfield", "waynesfield", "utica", "sunset",
"milford square", "lithium", "swall meadows", "unalaska",
"martinsburg", "ashland", "leawood", "hindsboro", "gray",
"turley", "trimble", "falcon", "linn", "olympia fields",
"mitchell", "mount pleasant mills", "greenville", "park city",
"arkabutla", "new river", "huntsville", "boulder junction",
"potwin", "red lick", "huey", "dougherty", "wadsworth", "grand forks",
"chassell", "edgewood", "lindsay"), lsad = c("25", "57",
"25", "21", "25", "25", "25", "57", "43", "57", "57", "47",
"25", "57", "25", "25", "47", "25", "57", "57", "57", "57",
"21", "25", "57", "57", "43", "25", "43", "57", "57", "25",
"57", "57", "47", "57", "57", "57", "47", "43", "25", "57",
"57", "57", "25", "25", "47", "57", "57", "25", "43", "25",
"43", "25", "57", "57", "57", "57", "57", "47", "25", "43",
"57", "57", "62", "25", "47", "47", "25", "57", "57", "57",
"25", "21", "25", "25", "47", "25", "57", "25", "43", "25",
"47", "25", "57", "25", "57", "57", "57", "25", "57", "25",
"25", "47", "43", "57", "25", "57", "43", "57"), funcstat = c("a",
"s", "a", "a", "a", "a", "a", "s", "a", "s", "s", "a", "a",
"s", "a", "a", "a", "a", "s", "s", "s", "s", "a", "a", "s",
"s", "a", "a", "a", "s", "s", "a", "s", "s", "a", "s", "s",
"s", "a", "a", "a", "s", "s", "s", "a", "a", "a", "s", "s",
"a", "a", "a", "a", "a", "s", "s", "s", "s", "s", "a", "a",
"a", "s", "s", "s", "a", "a", "a", "a", "s", "s", "s", "a",
"a", "a", "a", "a", "a", "s", "a", "a", "a", "a", "a", "s",
"a", "s", "s", "s", "a", "s", "a", "a", "a", "a", "s", "a",
"s", "a", "s"), latitude = c(45.221487, 40.18837, 33.500889,
39.74517, 39.534798, 30.573263, 39.017607, 35.780523, 40.984864,
41.56017, 40.226748, 41.180176, 41.387011, 36.220684, 34.414083,
31.335094, 41.474697, 39.120662, 34.616281, 32.336723, 35.802786,
32.598451, 39.462418, 37.283906, 35.867809, 40.608713, 31.344839,
33.354959, 33.840898, 39.019051, 64.879056, 39.736866, 29.964958,
41.794765, 38.536765, 41.559549, 44.3437, 32.937302, 40.768954,
40.673893, 33.055942, 39.867193, 19.757709, 40.564189, 31.771864,
37.093499, 41.800683, 38.168142, 30.666141, 39.761734, 34.373065,
33.774271, 36.807143, 31.071788, 27.985282, 41.154105, 36.534599,
46.331153, 38.096527, 39.463511, 42.916301, 35.45079, 39.100123,
34.81467, 18.127809, 46.81399, 40.602442, 40.895279, 41.13806,
40.433182, 37.831844, 37.50606, 53.910255, 40.310917, 38.812464,
38.907263, 39.684775, 41.841711, 36.736661, 39.476152, 35.194804,
38.478798, 41.521996, 43.730057, 40.724697, 33.111939, 45.630946,
34.700227, 37.142945, 34.782275, 46.1148, 37.938624, 33.485081,
38.605285, 34.399808, 42.821447, 47.921291, 47.036116, 40.103208,
47.224885), longitude = c(-122.837813, -75.084089, -117.654388,
-77.089213, -74.476099, -97.427116, -94.693955, -81.367835,
-102.262708, -95.994752, -76.902769, -84.736099, -93.272787,
-119.068357, -118.494729, -83.044003, -96.203696, -88.550859,
-82.770697, -90.808692, -94.941358, -114.660588, -75.29244,
-94.926801, -81.044121, -77.23694, -86.46905, -93.497879,
-94.657035, -74.87787, -148.041153, -100.176484, -93.410178,
-91.901539, -89.707193, -71.301933, -96.59226, -84.340945,
-89.462982, -85.722023, -97.509615, -83.945334, -156.001765,
-78.353464, -84.443499, -86.048077, -87.928172, -86.832128,
-98.454026, -95.634011, -83.084305, -118.425754, -94.729305,
-84.092683, -81.625304, -102.762746, -105.666602, -120.092812,
-75.579197, -82.180426, -96.514499, -97.457006, -119.927289,
-85.238869, -66.481897, -90.822546, -83.973881, -97.345349,
-112.028388, -75.405024, -89.88325, -118.642656, -166.529029,
-78.324286, -92.239531, -94.62524, -88.134729, -94.985863,
-107.792147, -94.561898, -78.65389, -91.844989, -87.691648,
-98.029974, -77.026451, -96.110256, -108.925311, -90.121565,
-80.595817, -86.532599, -89.654438, -97.01835, -94.161474,
-89.289973, -97.05148, -77.893875, -97.08933, -88.530745,
-85.737461, -105.152791), designation = c("city", "cdp",
"city", "borough", "city", "city", "city", "cdp", "town",
"cdp", "cdp", "village", "city", "cdp", "city", "city", "village",
"city", "cdp", "cdp", "cdp", "cdp", "borough", "city", "cdp",
"cdp", "town", "city", "town", "cdp", "cdp", "city", "cdp",
"cdp", "village", "cdp", "cdp", "cdp", "village", "town",
"city", "cdp", "cdp", "cdp", "city", "city", "village", "cdp",
"cdp", "city", "town", "city", "town", "city", "cdp", "cdp",
"cdp", "cdp", "cdp", "village", "city", "town", "cdp", "cdp",
"urbana", "city", "village", "village", "city", "cdp", "cdp",
"cdp", "city", "borough", "city", "city", "village", "city",
"cdp", "city", "town", "city", "village", "city", "cdp",
"city", "cdp", "cdp", "cdp", "city", "cdp", "city", "city",
"village", "town", "cdp", "city", "cdp", "town", "cdp")), row.names = c(22769L,
24845L, 3314L, 24015L, 17360L, 28139L, 10085L, 19881L, 3886L,
8750L, 24027L, 20585L, 9362L, 2499L, 3333L, 6041L, 16321L, 6847L,
25249L, 14051L, 22233L, 1210L, 17425L, 10353L, 20053L, 23545L,
253L, 1951L, 22166L, 17386L, 685L, 9771L, 11134L, 9225L, 7386L,
25001L, 25862L, 5663L, 6950L, 8239L, 26555L, 20991L, 6108L, 24388L,
5551L, 10772L, 7056L, 8470L, 27292L, 10202L, 5451L, 3116L, 22660L,
5776L, 5317L, 16546L, 17582L, 29958L, 12103L, 20709L, 8779L,
22515L, 16665L, 5902L, 31901L, 30658L, 21745L, 16574L, 28632L,
24127L, 15046L, 3455L, 930L, 24087L, 14494L, 10016L, 7055L, 8993L,
18048L, 15434L, 19615L, 15043L, 7454L, 25775L, 24194L, 27115L,
15857L, 14038L, 29305L, 276L, 30693L, 10201L, 27863L, 7075L,
22046L, 19267L, 20311L, 12502L, 8093L, 15798L), class = "data.frame")
Run Code Online (Sandbox Code Playgroud)
我想city使用longitudeand latitudecolumns 和gdist函数来计算列中每个条目之间的距离。我知道以下for循环有效并且易于阅读:
dist_list <- list()
for (i in 1:nrow(somewhere)) {
dist_list[[i]] <- gdist(lon.1 = somewhere$longitude[i],
lat.1 = somewhere$latitude[i],
lon.2 = somewhere$longitude,
lat.2 = somewhere$latitude,
units="miles")
}
Run Code Online (Sandbox Code Playgroud)
然而:在完整数据集(31K+行)上运行需要很长时间——就像几个小时一样。我正在寻找可以加快计算速度的东西。我认为该split-apply-combine方法中的某些内容会很好用,因为我希望最终涉及一对分组变量,geo_block但ansi_block老实说,任何内容都会比我所拥有的更好。
我已经尝试过以下方法:
somewhere$geo_block <- substr(somewhere$geoid, 1, 1)
somewhere$ansi_block <- substr(somewhere$ansicode, 1, 1)
somewhere <- somewhere %>%
split(.$geo_block, .$ansi_block) %>%
mutate(dist = gdist(longlat = somewhere[, c("longitude", "latitude")]))
Run Code Online (Sandbox Code Playgroud)
但我不确定如何在标准循环之外指定第二组长纬度输入for。
我的问题:
split-apply-combine来解决这个问题?我想返回最短距离以及该距离对应的名称和值。geo_blockansi_blockcitygeo_block欢迎所有建议。理想情况下,所需的结果会相当快,因为我正在使用的实际数据集非常大。因为我有点在树林里,所以我给这个问题添加了一个赏金,以引起更多的兴趣,并希望我可以从中学习到一系列广泛的潜在答案。非常感谢!
我将展示 tidyverse 和基本 R 的分割-应用-组合方法。我的理解是,对于每组城市中的每个城市(无论您的分组变量是什么),您想要报告最近的组内城市以及到最近城市的距离。
\n首先,有几点评论Imap::gdist:
lonlat论证。您必须使用参数(lon|lat).(1|2)来传递坐标。while (abs(lamda - lamda.old) > 1e-11) {...}值为。为了使循环条件的长度为 1(应该如此),参数和 的长度必须为 1。lamda(lon.1 - lon.2) * pi / 180lon.1lon.2因此,至少现在,我的答案将基于更规范的geosphere::distm. 它将整个n对称n距离矩阵存储在内存中,因此您可能会遇到内存分配限制,但在拆分-应用-组合中发生这种情况的可能性要小得多,其中n是组内(而不是总数)数的城市。
我将处理您的数据框的这一部分:
\nlibrary("dplyr")\n\ndd <- somewhere %>%\n as_tibble() %>%\n mutate(geo_block = as.factor(as.integer(substr(geoid, 1L, 1L))),\n ansi_block = as.factor(as.integer(substr(ansicode, 1L, 1L)))) %>%\n select(geo_block, ansi_block, city, longitude, latitude)\ndd\n# # A tibble: 100 \xc3\x97 5\n# geo_block ansi_block city longitude latitude\n# <fct> <fct> <chr> <dbl> <dbl>\n# 1 4 2 donald -123. 45.2\n# 2 4 2 warminster heights -75.1 40.2\n# 3 6 2 san juan capistrano -118. 33.5\n# 4 4 1 littlestown -77.1 39.7\n# 5 3 8 port republic -74.5 39.5\n# 6 4 2 taylor -97.4 30.6\n# 7 2 4 merriam -94.7 39.0\n# 8 3 2 northlakes -81.4 35.8\n# 9 8 2 julesburg -102. 41.0\n# 10 1 2 california junction -96.0 41.6\n# # \xe2\x80\xa6 with 90 more rows\nRun Code Online (Sandbox Code Playgroud)\n以下函数执行识别给定 维度量空间中的find_nearest一组点的最近邻居的基本任务。其论据如下:nd
datan:包含行(每个点一个)、d指定点坐标的变量以及与每个点关联的 1 个或多个 ID 变量的数据框。dist:一个函数,将n坐标d矩阵作为参数并返回相应的n对称n距离矩阵。coordvar:列出坐标变量名称的字符向量。idvar:列出 ID 变量名称的字符向量。在我们的数据框中,坐标变量为longitude,并且latitude有一个 ID 变量city。coordvar为了简单起见,我相应地定义了和 的默认值idvar。
find_nearest <- function(data, dist, \n coordvar = c("longitude", "latitude"), \n idvar = "city") {\n m <- match(coordvar, names(data), 0L)\n n <- nrow(data)\n if (n < 2L) {\n argmin <- NA_integer_[n]\n distance <- NA_real_[n]\n } else {\n ## Compute distance matrix\n D <- dist(as.matrix(data[m]))\n ## Extract minimum distances\n diag(D) <- Inf # want off-diagonal distances\n argmin <- apply(D, 2L, which.min)\n distance <- D[cbind(argmin, seq_len(n))]\n }\n ## Return focal point data, nearest neighbour ID, distance\n r1 <- data[-m]\n r2 <- data[argmin, idvar, drop = FALSE]\n names(r2) <- paste0(idvar, "_nearest")\n data.frame(r1, r2, distance, row.names = NULL, stringsAsFactors = FALSE)\n}\nRun Code Online (Sandbox Code Playgroud)\n这是应用于我们的数据框的输出find_nearest,没有分组,距离由 Vincenty 椭球方法计算。
dist_vel <- function(x) {\n geosphere::distm(x, fun = geosphere::distVincentyEllipsoid)\n}\n\nres <- find_nearest(dd, dist = dist_vel)\nhead(res, 10L)\n# geo_block ansi_block city city_nearest distance\n# 1 4 2 donald outlook 246536.39\n# 2 4 2 warminster heights milford square 38512.79\n# 3 6 2 san juan capistrano palos verdes estates 77722.35\n# 4 4 1 littlestown lower allen 55792.53\n# 5 3 8 port republic rio grande 66935.70\n# 6 4 2 taylor kingsland 98997.90\n# 7 2 4 merriam leawood 13620.87\n# 8 3 2 northlakes stony point 30813.46\n# 9 8 2 julesburg sunol 46037.81\n# 10 1 2 california junction kennard 19857.41\nRun Code Online (Sandbox Code Playgroud)\n这是 tidyverse split-apply-combine,对geo_block和进行分组ansi_block:
dd %>%\n group_by(geo_block, ansi_block) %>%\n group_modify(~find_nearest(., dist = dist_vel))\n# # A tibble: 100 \xc3\x97 5\n# # Groups: geo_block, ansi_block [13]\n# geo_block ansi_block city city_nearest distance\n# <fct> <fct> <chr> <chr> <dbl>\n# 1 1 2 california junction gray 89610.\n# 2 1 2 pleasantville middle amana 122974.\n# 3 1 2 willacoochee meigs 104116.\n# 4 1 2 effingham hindsboro 72160.\n# 5 1 2 heath dawson 198052.\n# 6 1 2 middle amana pleasantville 122974.\n# 7 1 2 new baden huey 37147.\n# 8 1 2 hannahs mill dawson 129599.\n# 9 1 2 germantown hills hindsboro 165140.\n# 10 1 2 la fontaine edgewood 63384.\n# # \xe2\x80\xa6 with 90 more rows\nRun Code Online (Sandbox Code Playgroud)\n例如,请注意,在该分组下,距离 California Junction 最近的城市如何从 Kennard 变为距离较远的 Gray。
\n这是基本的 R split-apply-combine,基于基本函数 构建by。除了结果中组的排序之外,它是等效的:
find_nearest_by <- function(data, by, ...) {\n do.call(rbind, base::by(data, by, find_nearest, ...))\n}\n\nres <- find_nearest_by(dd, by = dd[c("geo_block", "ansi_block")], dist = dist_vel)\nhead(res, 10L)\n# geo_block ansi_block city city_nearest distance\n# 1 3 1 grand forks <NA> NA\n# 2 4 1 littlestown martinsburg 122718.95\n# 3 4 1 martinsburg littlestown 122718.95\n# 4 4 1 mitchell martinsburg 1671365.58\n# 5 5 1 bayfield <NA> NA\n# 6 1 2 california junction gray 89609.71\n# 7 1 2 pleasantville middle amana 122974.32\n# 8 1 2 willacoochee meigs 104116.21\n# 9 1 2 effingham hindsboro 72160.43\n# 10 1 2 heath dawson 198051.76\nRun Code Online (Sandbox Code Playgroud)\n这一排序表明只包含一个城市的组的find_nearest返回结果NA。NA如果我们打印整个 tibble,我们会在 tidyverse 结果中看到这些。
FWIW,这里是计算测地距离所实现的各种函数的基准geosphere,没有提到准确性,尽管您可以从结果推测文森蒂椭球距离切割最少的角落(哈哈)......
fn <- function(dist) find_nearest(dd, dist = dist)\n\nlibrary("geosphere")\ndist_geo <- function(x) distm(x, fun = distGeo)\ndist_cos <- function(x) distm(x, fun = distCosine)\ndist_hav <- function(x) distm(x, fun = distHaversine)\ndist_vsp <- function(x) distm(x, fun = distVincentySphere)\ndist_vel <- function(x) distm(x, fun = distVincentyEllipsoid)\ndist_mee <- function(x) distm(x, fun = distMeeus)\n\nmicrobenchmark::microbenchmark(\n fn(dist_geo),\n fn(dist_cos),\n fn(dist_hav),\n fn(dist_vsp),\n fn(dist_vel),\n fn(dist_mee),\n times = 1000L\n)\n# Unit: milliseconds\n# expr min lq mean median uq max neval\n# fn(dist_geo) 6.143276 6.291737 6.718329 6.362257 6.459345 45.91131 1000\n# fn(dist_cos) 4.239236 4.399977 4.918079 4.461804 4.572033 45.70233 1000\n# fn(dist_hav) 4.005331 4.156067 4.641016 4.210721 4.307542 41.91619 1000\n# fn(dist_vsp) 3.827227 3.979829 4.446428 4.033621 4.123924 44.29160 1000\n# fn(dist_vel) 129.712069 132.549638 135.006170 133.935479 135.248135 174.88874 1000\n# fn(dist_mee) 3.716814 3.830999 4.234231 3.883582 3.962712 42.12947 1000\nRun Code Online (Sandbox Code Playgroud)\nImap::gdist似乎是 Vincenty 椭球距离的纯 R 实现。这可能是其缓慢的原因......
最后一些评论:
\ndist的论证不必find_nearest建立在其中一个geosphere距离上。您可以传递任何您想要满足我上面所述的约束的函数。find_nearest可以适应接受dist返回 class 对象的函数"dist"。这些对象仅存储对称距离矩阵n*(n-1)/2的下三角元素(请参阅)。这将改善对大的支持,并与半正弦距离的内存高效实现兼容(由@dww 在评论中建议)。只需要弄清楚如何翻译像. [更新:我已在单独的答案中实现了此更改,我在其中提供了新的基准。]nn?distnfind_nearestapply(D, 2L, which.min)find_nearest