鉴于以下情况
library(dplyr)
# Example data: 100 rows with one numeric column (var1) and three factor
# columns (var2, var3, var4), each sampled with replacement from three labels.
myData <- tbl_df(
  data.frame(
    var1 = rnorm(100),
    var2 = factor(sample(letters[1:3], 100, replace = TRUE)),
    var3 = factor(sample(LETTERS[1:3], 100, replace = TRUE)),
    var4 = factor(sample(month.abb[1:3], 100, replace = TRUE))
  )
)
Run Code Online (Sandbox Code Playgroud)
我想对 myData 进行分组,最终得到按 var2、var3 和 var4 的所有可能组合分组的汇总数据.
我可以创建一个列表,其中包含所有可能的变量组合作为字符值
# Names of the candidate grouping columns (columns 2 to 4 of myData).
groupNames <- names(myData)[2:4]

# Every non-empty combination of those names, as a flat list of character
# vectors: all combinations of size 1 first, then size 2, and so on.
myGroups <- unlist(
  lapply(seq_along(groupNames), function(k) {
    combn(groupNames, k, simplify = FALSE)
  }),
  recursive = FALSE
)
Run Code Online (Sandbox Code Playgroud)
我的计划是使用for()循环为每个变量组合创建单独的数据集
### This Does Not Work
# Intended to build one summary data set per grouping combination in myGroups
# and bind each one to its own variable through assign().
for (i in 1:length(myGroups)){
# Variable name: the grouping variable names of combination i pasted together
# and followed by "Data".
assign( myGroups[i]%>%
unlist() %>%
paste0(collapse = "")%>%
paste0("Data"),
# Value: myData grouped by combination i, then summarised.
# NOTE(review): the whole list of symbols is handed to group_by_() as a single
# positional argument; this may be why the loop fails -- confirm against the
# group_by_() documentation (`.dots` argument).
myData %>%
group_by_(lapply(myGroups[[i]], as.symbol)) %>%
# NOTE(review): var2 is created as a factor in the example data, so mean() of
# it is unlikely to yield a meaningful average -- confirm the intended column.
summarise( n = length(var1),
avgVar2 = var2 %>%
mean()))
}
Run Code Online (Sandbox Code Playgroud)
不可否认,我对列表不是很了解,而且由于 dplyr 的更新改变了分组的工作方式,所以查找这个问题有点挑战性.
如果有更好的方法来做这个比单独的数据集我很想知道.
当我只用一个变量进行分组时,与上面类似的循环是可以正常运行的.
非常感谢任何和所有的帮助!谢谢!
这看起来有些繁琐,也许有办法简化它,或者改用 do 来完成,但它确实可行.使用你的 myData 和 myGroups:
# For every grouping combination in myGroups, group myData by those columns
# and compute the group size and the mean of var1. The result is a list with
# one summary table per combination, in the same order as myGroups.
results <- lapply(myGroups, function(groupVars) {
  grouped <- group_by_(myData, .dots = groupVars)
  summarise(grouped,
            n = length(var1),
            avgVar1 = mean(var1))
})
> results[[1]]
Source: local data frame [3 x 3]
var2 n avgVar1
1 a 31 0.38929738
2 b 31 -0.07451717
3 c 38 -0.22522129
> results[[4]]
Source: local data frame [9 x 4]
Groups: var2
var2 var3 n avgVar1
1 a A 11 -0.1159160
2 a B 11 0.5663312
3 a C 9 0.7904056
4 b A 7 0.0856384
5 b B 13 0.1309756
6 b C 11 -0.4192895
7 c A 15 -0.2783099
8 c B 10 -0.1110877
9 c C 13 -0.2517602
> results[[7]]
# I won't paste them here, but it has all 27 rows, grouped by var2, var3 and var4.
Run Code Online (Sandbox Code Playgroud)
我把你的 summarise 调用改成了对 var1 求平均值,因为 var2 不是数值型.
我根据 @Gregor 的答案和随后的评论创建了一个函数:
library(magrittr)
# Example data: 100 rows with one numeric column (var1) and three factor
# columns (var2, var3, var4), each sampled with replacement from three labels.
# tbl_df() is namespace-qualified because this snippet attaches only magrittr.
myData <- dplyr::tbl_df(
  data.frame(
    var1 = rnorm(100),
    var2 = letters[1:3] %>%
      sample(100, replace = TRUE) %>%
      factor(),
    var3 = LETTERS[1:3] %>%
      sample(100, replace = TRUE) %>%
      factor(),
    var4 = month.abb[1:3] %>%
      sample(100, replace = TRUE) %>%
      factor()
  )
)
Run Code Online (Sandbox Code Playgroud)
#' Summarise a data frame over every combination of grouping variables
#'
#' Groups `data` by each non-empty combination of `variables`, evaluates the
#' `summarise` expressions within every grouping, and stacks the per-grouping
#' results into one data frame. Grouping columns that are not part of a given
#' combination are filled with `NA` by `plyr::rbind.fill()`.
#'
#' @param data A data frame (or tbl) to summarise.
#' @param variables Character vector of column names to group by.
#' @param summarise Character vector of summary expressions, e.g.
#'   `c("length(var1)", "mean(var1)")`.
#' @return A data frame with the grouping columns first (in the order given
#'   in `variables`), followed by one column per summary expression.
combSummarise <- function(data, variables, summarise) {
  stopifnot(
    is.character(variables), length(variables) > 0,
    is.character(summarise), length(summarise) > 0
  )

  # All non-empty combinations of the selected variables, smallest
  # combinations first (credit to @Michael).
  myGroups <- unlist(
    lapply(seq_along(variables), function(k) {
      combn(variables, k, simplify = FALSE)
    }),
    recursive = FALSE
  )

  # Group by each combination and summarise (credit to @konvas). The
  # expression strings go through `.dots`, so no call has to be assembled as
  # text and run through eval(parse()).
  summaries <- lapply(myGroups, function(groupVars) {
    grouped <- dplyr::group_by_(data, .dots = groupVars)
    dplyr::summarise_(grouped, .dots = summarise)
  })
  df <- do.call(plyr::rbind.fill, summaries)

  # Grouping columns first, then the summary columns. drop = FALSE keeps a
  # data frame even when only one grouping variable and one summary exist.
  newNames <- setdiff(names(df), variables)
  df[, c(variables, newNames), drop = FALSE]
}

# Backward-compatible alias: the function was originally defined under this
# doubled name, so existing calls that use it keep working.
combSummarisecombSummarise <- combSummarise
Run Code Online (Sandbox Code Playgroud)
# Count, mean and maximum of var1 for every combination of var2, var3 and
# var4. The `variables` argument is spelled out in full instead of relying on
# partial matching of `var`.
combSummarisecombSummarise(
  myData,
  variables = c("var2", "var3", "var4"),
  summarise = c("length(var1)", "mean(var1)", "max(var1)")
)
Run Code Online (Sandbox Code Playgroud)
或者
# Same three summaries of var1, restricted to combinations of var2 and var4.
# The `variables` argument is spelled out in full instead of relying on
# partial matching of `var`.
combSummarise(
  myData,
  variables = c("var2", "var4"),
  summarise = c("length(var1)", "mean(var1)", "max(var1)")
)
Run Code Online (Sandbox Code Playgroud)
或者
# A single summary (row count of var1) over combinations of var2 and var4.
# The `variables` argument is spelled out in full instead of relying on
# partial matching of `var`.
combSummarise(
  myData,
  variables = c("var2", "var4"),
  summarise = c("length(var1)")
)
Run Code Online (Sandbox Code Playgroud)
ETC