dplyr - 按组大小过滤

Ren*_*rop 6 r subset filter dataframe dplyr

过滤data.frame以获得大小为5的组的最佳方法是什么?

所以我的数据如下:

# Attach dplyr with library() so a missing package fails immediately with a
# clear error (require() would only warn and return FALSE).
library(dplyr)
n <- 1e5
# Fix the RNG seed so the example data is reproducible.
set.seed(42)
x <- rnorm(n)
# Category size ranging each from 1 to 5.
# Sizes are drawn for n candidate categories: every size is at least 1, so the
# expanded vector always has >= n entries and truncating it to n never pads
# with NA (drawing only n/3 sizes can sum to fewer than n).
cat <- rep(seq_len(n), sample(1:5, n, replace = TRUE))[seq_len(n)]

dat <- data.frame(x = x, cat = cat)
Run Code Online (Sandbox Code Playgroud)

我能想到的dplyr方式是

# Group by category once, outside the timed expression.
dat <- dat %>% group_by(cat)

# Time the grouped filter that keeps only groups with exactly five rows.
system.time({
  out1 <- filter(dat, n() == 5L)
})
#    user  system elapsed 
#   1.157   0.218   1.497
Run Code Online (Sandbox Code Playgroud)

但这很慢...... dplyr有更好的方法吗?

到目前为止,我的解决方案如下:

# Manual alternative to filter(n() == 5L): build a per-row group id and keep
# the rows whose group has exactly five rows.
system.time({
  # Group id repeated once per row of that group.
  # NOTE(review): this lines up with the rows only if `dat` is ordered by
  # `cat`, as in the generated example data -- confirm before reusing.
  all_ind <- rep(seq_len(n_groups(dat)), group_size(dat))
  # Ids of the groups that contain exactly five rows.
  take_only <- which(group_size(dat) == 5L)
  out2 <- dat[all_ind %in% take_only, ]
})
#    user  system elapsed 
#   0.026   0.008   0.036
all.equal(out1, out2) # TRUE
Run Code Online (Sandbox Code Playgroud)

但这感觉并不像是应有的做法......

Joe*_*Joe 14

您可以使用 n() 更简洁地实现:

library(dplyr)
# Keep only the rows that belong to a `cat` group with exactly five rows.
dat %>%
  group_by(cat) %>%
  filter(n() == 5)
Run Code Online (Sandbox Code Playgroud)


tal*_*lat 6

这是你可以尝试的另一种dplyr方法

# Count rows per `cat`, keep the categories with exactly five rows, and use a
# semi-join to retain only the rows of `dat` in those categories.
semi_join(
  dat,
  filter(count(dat, cat), n == 5),
  by = "cat"
)
Run Code Online (Sandbox Code Playgroud)

-

这是基于OP原始方法的另一种方法,稍作修改:

n <- 1e5
# Fix the RNG seed so the example data is reproducible.
set.seed(42)
x <- rnorm(n)
# Category size ranging each from 1 to 5.
# Sizes are drawn for n candidate categories: every size is at least 1, so the
# expanded vector always has >= n entries and truncating it to n never pads
# with NA (drawing only n/3 sizes can sum to fewer than n).
cat <- rep(seq_len(n), sample(1:5, n, replace = TRUE))[seq_len(n)]

dat <- data.frame(x = x, cat = cat)

# second data set for the dt approach
dat2 <- data.frame(x = x, cat = cat)

#' Keep the rows of `dat` whose `cat` group has exactly five rows.
#'
#' @param dat A data frame with a `cat` column identifying the group of each
#'   row.
#' @return `dat` grouped by `cat`, restricted to the rows whose group
#'   contains exactly five rows.
sol_floo0 <- function(dat){
  dat <- group_by(dat, cat)
  # group_indices() reports each row's group id at the row's actual position,
  # so the selection is correct even when rows are not sorted by `cat`
  # (rep(seq_len(n_groups), group_size) is only valid for sorted input).
  all_ind <- group_indices(dat)
  # Ids of the groups that contain exactly five rows.
  take_only <- which(group_size(dat) == 5L)
  dat[all_ind %in% take_only, ]
}

#' Keep the rows of `dat` whose `cat` group has exactly five rows.
#'
#' Variant of `sol_floo0()` that subsets the original, ungrouped input.
#'
#' @param dat A data frame with a `cat` column identifying the group of each
#'   row.
#' @return The rows of `dat` (not grouped) whose `cat` group contains exactly
#'   five rows.
sol_floo0_v2 <- function(dat){
  grouped <- group_by(dat, cat)
  # One flag per group: TRUE when the group has exactly five rows.
  keep_group <- group_size(grouped) == 5L
  # Look the flag up through each row's actual group id, so unsorted input is
  # handled correctly (rep(flag, size) is only valid when rows are sorted by
  # `cat`).
  ind <- keep_group[group_indices(grouped)]
  dat[ind, ]
}



# Time both implementations, ten evaluations each.
microbenchmark::microbenchmark(
  sol_floo0(dat),
  sol_floo0_v2(dat2),
  times = 10
)
#Unit: milliseconds
#               expr      min       lq     mean   median       uq      max neval cld
#     sol_floo0(dat) 43.72903 44.89957 45.71121 45.10773 46.59019 48.64595    10   b
# sol_floo0_v2(dat2) 29.83724 30.56719 32.92777 31.97169 34.10451 38.31037    10  a 
# Compare the results of the two implementations.
all.equal(sol_floo0(dat), sol_floo0_v2(dat2))
#[1] TRUE
Run Code Online (Sandbox Code Playgroud)


cee*_*fel 5

我知道你想要一个dplyr解决方案,但如果把它与purrr结合起来,就可以用一条管道得到结果,而无需定义任何新函数。(不过会慢一点。)

library(dplyr)
library(purrr)
library(tidyr)

# Nest each `cat` group into a list-column, count the rows of each nested
# data frame, and list the categories that have exactly five rows.
dat %>%
  group_by(cat) %>%
  nest() %>%
  # nrow() gives the true group size; n_distinct() counts distinct rows and
  # would undercount a group containing duplicated rows. map_int() returns an
  # integer column directly, so no unnest() step is needed.
  mutate(n = map_int(data, nrow)) %>%
  filter(n == 5) %>%
  select(cat, n)
Run Code Online (Sandbox Code Playgroud)