我需要在一个同时包含因子列和数值列的大型(数 GB)表中反复查找“最接近”的行。使用 dplyr 时,代码大致如下:
# Synthetic example table: two categorical key columns (factorA, factorB)
# and two numeric columns (numC, numD), 300,000 rows in total.
df <- data.frame(
  # "a", "b", "c" cycled in a fixed order.
  factorA = rep(letters[1:3], times = 100000),
  # The same multiset of labels, randomly permuted: every element is drawn
  # exactly once because sampling is without replacement.
  factorB = sample(
    rep(letters[1:3], times = 100000),
    size = 3 * 100000,
    replace = FALSE
  ),
  # Normal draws (rnorm defaults) rounded to two decimal places.
  numC = round(rnorm(3 * 100000), digits = 2),
  numD = round(rnorm(3 * 100000), digits = 2)
)
closest <- function(ValueA, ValueB, ValueC, ValueD) {
df_sub <- df %>%
filter(factorA == ValueA,
factorB == ValueB,
numC >= 0.9 * ValueC,
numC <= 1.1 * ValueC,
numD >= 0.9 * ValueD,
numD <= 1.1 * ValueD)
if (nrow(df_sub) == 0) stop("Oh-oh, no candidates.")
minC <- df_sub[which.min(abs(df_sub$numC - ValueC)), "numC"]
df_sub %>%
filter(numC == …Run Code Online (Sandbox Code Playgroud)