如何为绘图森伯斯特图格式化数据

Question

如何为绘图森伯斯特图格式化数据

mtr*_*reg 7 r plotly sunburst-diagram r-plotly

我正在尝试通过 R 使用 Plotly 制作森伯斯特图。我正在努力处理层次结构所需的数据模型，无论是在概念化它是如何工作的方面，还是看看是否有任何简单的方法来转换常规数据框，用代表不同层次级别的列，转换为所需的格式。

我已经查看了 R 中绘图森伯斯特图表的示例，例如，here，并查看了参考页面，但没有完全获得数据格式的模型。

# Create some fake data - say ownership and land use data with acreage
df <- data.frame(ownership=c(rep("private", 3), rep("public",3),rep("mixed", 3)), 
                 landuse=c(rep(c("residential", "recreation", "commercial"),3)),
                 acres=c(108,143,102, 300,320,500, 37,58,90))

# Just try some quick pie charts of acreage by landuse and ownership
plot_ly(data=df, labels= ~landuse, values= ~acres, type='pie')
plot_ly(data=df, labels= ~ownership, values= ~acres, type='pie')

# This doesn't render anything... not that I'd expect it to given the data format doesn't seem to match what's needed, 
# but this is what I'd intuitively expect to work
plot_ly(data=df, labels= ~landuse, parents = ~ownership, values= ~acres, type='sunburst')

Run Code Online (Sandbox Code Playgroud)

根据上面的示例代码或类似代码，了解如何从数据 ( df) 转换为绘图森伯斯特图所需的格式会很有帮助。

Answer 1

ism*_*gal 9

你是绝对正确的，与 plotly 的 R API 的其余直觉用法相比，为森伯斯特（或树状图）图表准备数据相当烦人。

我遇到了同样的问题，写了一个基于library(data.table)准备数据的函数，接受两种不同的data.frame输入格式。

所需的格式生成使用类似结构的数据旭日情节一样你可以看到这里下的部分旭日反复标签。

对于您的示例，它应该如下所示：

         labels values         parents                           ids
 1:       total   1658            <NA>                         total
 2:     private    353           total               total - private
 3:      public   1120           total                total - public
 4:       mixed    185           total                 total - mixed
 5: residential    108 total - private total - private - residential
 6:  recreation    143 total - private  total - private - recreation
 7:  commercial    102 total - private  total - private - commercial
 8: residential    300  total - public  total - public - residential
 9:  recreation    320  total - public   total - public - recreation
10:  commercial    500  total - public   total - public - commercial
11: residential     37   total - mixed   total - mixed - residential
12:  recreation     58   total - mixed    total - mixed - recreation
13:  commercial     90   total - mixed    total - mixed - commercial

Run Code Online (Sandbox Code Playgroud)

这是到达那里的代码：

library(data.table)
library(plotly)

DF <- data.table(ownership=c(rep("private", 3), rep("public",3),rep("mixed", 3)),
                 landuse=c(rep(c("residential", "recreation", "commercial"),3)),
                 acres=c(108, 143, 102, 300, 320, 500, 37, 58, 90))

as.sunburstDF <- function(DF, value_column = NULL, add_root = FALSE){
  require(data.table)
  
  colNamesDF <- names(DF)
  
  if(is.data.table(DF)){
    DT <- copy(DF)
  } else {
    DT <- data.table(DF, stringsAsFactors = FALSE)
  }
  
  if(add_root){
    DT[, root := "Total"]  
  }
  
  colNamesDT <- names(DT)
  hierarchy_columns <- setdiff(colNamesDT, value_column)
  DT[, (hierarchy_columns) := lapply(.SD, as.factor), .SDcols = hierarchy_columns]
  
  if(is.null(value_column) && add_root){
    setcolorder(DT, c("root", colNamesDF))
  } else if(!is.null(value_column) && !add_root) {
    setnames(DT, value_column, "values", skip_absent=TRUE)
    setcolorder(DT, c(setdiff(colNamesDF, value_column), "values"))
  } else if(!is.null(value_column) && add_root) {
    setnames(DT, value_column, "values", skip_absent=TRUE)
    setcolorder(DT, c("root", setdiff(colNamesDF, value_column), "values"))
  }
  
  hierarchyList <- list()
  
  for(i in seq_along(hierarchy_columns)){
    current_columns <- colNamesDT[1:i]
    if(is.null(value_column)){
      currentDT <- unique(DT[, ..current_columns][, values := .N, by = current_columns], by = current_columns)
    } else {
      currentDT <- DT[, lapply(.SD, sum, na.rm = TRUE), by=current_columns, .SDcols = "values"]
    }
    setnames(currentDT, length(current_columns), "labels")
    hierarchyList[[i]] <- currentDT
  }
  
  hierarchyDT <- rbindlist(hierarchyList, use.names = TRUE, fill = TRUE)
  
  parent_columns <- setdiff(names(hierarchyDT), c("labels", "values", value_column))
  hierarchyDT[, parents := apply(.SD, 1, function(x){fifelse(all(is.na(x)), yes = NA_character_, no = paste(x[!is.na(x)], sep = ":", collapse = " - "))}), .SDcols = parent_columns]
  hierarchyDT[, ids := apply(.SD, 1, function(x){paste(x[!is.na(x)], collapse = " - ")}), .SDcols = c("parents", "labels")]
  hierarchyDT[, c(parent_columns) := NULL]
  return(hierarchyDT)
}


sunburstDF <- as.sunburstDF(DF, value_column = "acres", add_root = TRUE)

plot_ly(data = sunburstDF, ids = ~ids, labels= ~labels, parents = ~parents, values= ~values, type='sunburst', branchvalues = 'total')

Run Code Online (Sandbox Code Playgroud)

下面是data.frame函数接受的第二种格式的示例( value_column = NULL，因为它是根据数据计算出来的)：

DF2 <- data.frame(sample(LETTERS[1:3], 100, replace = TRUE),
                  sample(LETTERS[4:6], 100, replace = TRUE),
                  sample(LETTERS[7:9], 100, replace = TRUE),
                  sample(LETTERS[10:12], 100, replace = TRUE),
                  sample(LETTERS[13:15], 100, replace = TRUE),
                  stringsAsFactors = FALSE)

plot_ly(data = as.sunburstDF(DF2, add_root = TRUE), ids = ~ids, labels= ~labels, parents = ~parents, values= ~values, type='sunburst', branchvalues = 'total')

Run Code Online (Sandbox Code Playgroud)

另请参阅 library( sunburstR ) 作为替代方案。

编辑：添加了一个关于基于 dplyr 的count_to_sunburst()函数的基准library(plotme)（见下文），在我的系统上它比data.table版本慢 5 倍左右。

Unit: milliseconds
          expr     min       lq     mean   median       uq      max neval
        plotme 50.4618 53.09425 60.92404 55.37815 63.62315 122.3842   100
 ismirsehregal  8.6553 10.28870 12.63881 11.53760 12.26620 108.2025   100

Run Code Online (Sandbox Code Playgroud)

重现基准的代码：

# devtools::install_github("yogevherz/plotme")

library(microbenchmark)
library(plotme)
library(dplyr)
library(data.table)
library(plotly)

DF <- data.frame(ownership=c(rep("private", 3), rep("public",3),rep("mixed", 3)),
                 landuse=c(rep(c("residential", "recreation", "commercial"),3)),
                 acres=c(108, 143, 102, 300, 320, 500, 37, 58, 90))

as.sunburstDF <- function(DF, value_column = NULL, add_root = FALSE){
  require(data.table)
  
  colNamesDF <- names(DF)
  
  if(is.data.table(DF)){
    DT <- copy(DF)
  } else {
    DT <- data.table(DF, stringsAsFactors = FALSE)
  }
  
  if(add_root){
    DT[, root := "Total"]  
  }
  
  colNamesDT <- names(DT)
  hierarchy_columns <- setdiff(colNamesDT, value_column)
  DT[, (hierarchy_columns) := lapply(.SD, as.factor), .SDcols = hierarchy_columns]
  
  if(is.null(value_column) && add_root){
    setcolorder(DT, c("root", colNamesDF))
  } else if(!is.null(value_column) && !add_root) {
    setnames(DT, value_column, "values", skip_absent=TRUE)
    setcolorder(DT, c(setdiff(colNamesDF, value_column), "values"))
  } else if(!is.null(value_column) && add_root) {
    setnames(DT, value_column, "values", skip_absent=TRUE)
    setcolorder(DT, c("root", setdiff(colNamesDF, value_column), "values"))
  }
  
  hierarchyList <- list()
  
  for(i in seq_along(hierarchy_columns)){
    current_columns <- colNamesDT[1:i]
    if(is.null(value_column)){
      currentDT <- unique(DT[, ..current_columns][, values := .N, by = current_columns], by = current_columns)
    } else {
      currentDT <- DT[, lapply(.SD, sum, na.rm = TRUE), by=current_columns, .SDcols = "values"]
    }
    setnames(currentDT, length(current_columns), "labels")
    hierarchyList[[i]] <- currentDT
  }
  
  hierarchyDT <- rbindlist(hierarchyList, use.names = TRUE, fill = TRUE)
  
  parent_columns <- setdiff(names(hierarchyDT), c("labels", "values", value_column))
  hierarchyDT[, parents := apply(.SD, 1, function(x){fifelse(all(is.na(x)), yes = NA_character_, no = paste(x[!is.na(x)], sep = ":", collapse = " - "))}), .SDcols = parent_columns]
  hierarchyDT[, ids := apply(.SD, 1, function(x){paste(x[!is.na(x)], collapse = " - ")}), .SDcols = c("parents", "labels")]
  hierarchyDT[, c(parent_columns) := NULL]
  return(hierarchyDT)
}

microbenchmark(plotme = {
  DF %>% 
    rename(n = acres) %>% 
    count_to_sunburst()
}, ismirsehregal = {
  plot_ly(data = as.sunburstDF(DF, value_column = "acres", add_root = TRUE), ids = ~ids, labels= ~labels, parents = ~parents, values= ~values, type='sunburst', branchvalues = 'total')  
})

Run Code Online (Sandbox Code Playgroud)

归档时间：	6 年，6 月前
查看次数：	3204 次
最近记录：	4 年，6 月前