分层(分类)数据到树状图

Cod*_*oob 5 tree r cluster-analysis taxonomy ggplot2

数据
我有以下(简化的)数据集,从现在起我们称之为 df:

                     species    rank                   value
1           Pseudomonas putida  family        Pseudomonadaceae
2       Pseudomonas aeruginosa  family        Pseudomonadaceae
3  Enterobacter xiangfangensis  family      Enterobacteriaceae
4          Salmonella enterica  family      Enterobacteriaceae
5        Klebsiella pneumoniae  family      Enterobacteriaceae
6           Pseudomonas putida   genus             Pseudomonas
7       Pseudomonas aeruginosa   genus             Pseudomonas
8  Enterobacter xiangfangensis   genus            Enterobacter
9          Salmonella enterica   genus              Salmonella
10       Klebsiella pneumoniae   genus              Klebsiella
11          Pseudomonas putida species      Pseudomonas putida
12      Pseudomonas aeruginosa species  Pseudomonas aeruginosa
13 Enterobacter xiangfangensis species Enterobacter hormaechei
14         Salmonella enterica species     Salmonella enterica
15       Klebsiella pneumoniae species   Klebsiella pneumoniae
Run Code Online (Sandbox Code Playgroud)

我想要达到的目标

此数据是描述各物种(species 列)分类归属的分类学数据,其中 rank 的层级顺序为 科 > 属 > 种。由于数据具有层级性质,我想将其显示为一棵树(最好使用 ggplot2),如下所示: 在此处输入图片说明


我尝试过的
我找到了一个名为 taxize 的包,可以使用 class2tree 将其(实际上是完整的分类,此处仅显示了一部分)转换为树:

# Look up the NCBI classification of each species with taxize. Both calls are
# namespace-qualified so the snippet runs without attaching the package.
# NOTE(review): db = "ncbi" presumably queries a remote service and therefore
# needs network access — confirm before running offline.
species_names <- c(
  "Pseudomonas putida",
  "Pseudomonas aeruginosa",
  "Enterobacter xiangfangensis",
  "Salmonella enterica",
  "Klebsiella pneumoniae"
)
class.dat <- taxize::classification(species_names, db = "ncbi")

# Convert the list of classifications into a tree object (printed at top level).
taxize::class2tree(class.dat)
Run Code Online (Sandbox Code Playgroud)

但它不像我手工绘制的图那样显示分类等级,而这正是我的可视化所需要的:

在此处输入图片说明


编辑:数据的dput

# Example taxonomy table in long format: three rows per organism (one per
# rank), with `value` holding the taxon name at that rank.
data.frame(
  species = rep(
    c(
      "Pseudomonas putida",
      "Pseudomonas aeruginosa",
      "Enterobacter xiangfangensis",
      "Salmonella enterica",
      "Klebsiella pneumoniae"
    ),
    each = 3L
  ),
  rank = rep(c("family", "genus", "species"), times = 5L),
  value = c(
    "Pseudomonadaceae", "Pseudomonas", "Pseudomonas putida",
    "Pseudomonadaceae", "Pseudomonas", "Pseudomonas aeruginosa",
    "Enterobacteriaceae", "Enterobacter", "Enterobacter hormaechei",
    "Enterobacteriaceae", "Salmonella", "Salmonella enterica",
    "Enterobacteriaceae", "Klebsiella", "Klebsiella pneumoniae"
  ),
  stringsAsFactors = FALSE
)
Run Code Online (Sandbox Code Playgroud)

编辑:回复 @StupidWolf
我已经能够将 class.dat 转换为数据框,然后再转换为父子关系数据框,以用作 ggraph 的输入。唯一剩下的就是添加 x 轴标签,在这里即 interest 向量。但我不确定 ggraph 是否能做到这一点。

# Retrieve the classification of each species from NCBI via taxize.
# Assumes dplyr, tidyr, igraph and ggraph are attached.
class.dat <- taxize::classification(
  c(
    "Pseudomonas putida",
    "Pseudomonas aeruginosa",
    "Enterobacter xiangfangensis",
    "Salmonella enterica",
    "Klebsiella pneumoniae"
  ),
  db = "ncbi"
)

# Ranks to show, ordered from the root of the tree down to the leaves.
interest <- c("superkingdom", "phylum", "class", "order", "genus", "species")

# Wide character matrix: one row per queried species, one column per rank,
# with the columns in `interest` order.
df2 <- bind_rows(class.dat, .id = "column_label") %>%
  dplyr::select(-id) %>%
  filter(rank %in% interest) %>%
  pivot_wider(names_from = rank, values_from = name) %>%
  # all_of() selects by the character vector, fixes the column order and
  # drops the `column_label` helper column in one step.
  dplyr::select(dplyr::all_of(interest)) %>%
  as.matrix()

# Pair every rank column with the next one (parent -> child). as.vector()
# flattens a matrix column by column, so `from` and `to` stay aligned.
last_rank <- ncol(df2)
parent.child <- data.frame(
  from = as.vector(df2[, -last_rank, drop = FALSE]),
  to = as.vector(df2[, -1L, drop = FALSE]),
  stringsAsFactors = FALSE
)

# Species that share part of their lineage yield the same edge several times;
# keep each edge once so every child has a single incoming edge.
parent.child <- unique(parent.child)

# Convert the edge list to a graph and draw it as a dendrogram.
g <- graph_from_data_frame(parent.child)

# NOTE(review): assumes the dendrogram layout puts leaves at y = 0 and each
# parent one unit higher, so rank k of `interest` sits at
# length(interest) - k — confirm against the ggraph version in use.
rank_breaks <- rev(seq_along(interest)) - 1

ggraph(g, layout = "dendrogram", circular = FALSE) +
  geom_edge_link() +
  # `name` is the vertex name column carried in the layout data.
  geom_node_label(aes(label = name), size = 3, nudge_y = -0.1) +
  scale_y_reverse(breaks = rank_breaks, labels = interest) +
  coord_flip() +
  theme_classic()
Run Code Online (Sandbox Code Playgroud)

Stu*_*olf 5

首先我们创建一个层级结构(父节点 → 子节点的边列表):

# Parent -> child edge lists, one data frame per level of the taxonomy.
# "origin" is an artificial root that joins the two families into one tree.
d1 <- data.frame(
  from = "origin",
  to = c("Pseudomonadaceae", "Enterobacteriaceae")
)

# family -> genus (one row per species, so Pseudomonas appears twice)
d2 <- data.frame(
  from = c(
    "Pseudomonadaceae", "Pseudomonadaceae",
    "Enterobacteriaceae", "Enterobacteriaceae", "Enterobacteriaceae"
  ),
  to = c(
    "Pseudomonas", "Pseudomonas",
    "Enterobacter", "Salmonella", "Klebsiella"
  )
)

# genus -> species
d3 <- data.frame(
  from = c(
    "Pseudomonas", "Pseudomonas",
    "Enterobacter", "Salmonella", "Klebsiella"
  ),
  to = c(
    "Pseudomonas putida", "Pseudomonas aeruginosa",
    "Enterobacter hormaechei", "Salmonella enterica",
    "Klebsiella pneumoniae"
  )
)

# Stack the levels and drop the repeated family -> genus edge so that every
# child has exactly one incoming edge (a tree, not a multigraph).
hierarchy <- unique(rbind(d1, d2, d3))
rownames(hierarchy) <- NULL

# Every node that occurs in the edge list, in order of first appearance.
vertices <- data.frame(
  name = unique(c(as.character(hierarchy$from), as.character(hierarchy$to)))
)
Run Code Online (Sandbox Code Playgroud)

然后我们使用 igraph 绘制它们:

library(igraph)

# Build a graph from the parent -> child edge list and its vertex table.
g <- graph_from_data_frame(hierarchy, vertices = vertices)

# Tree layout; layout_as_tree() is the current name of
# layout.reingold.tilford().
lay <- layout_as_tree(g)

# Drop the plot margins for this figure only and restore them afterwards.
old_par <- par(mar = c(0, 0, 0, 0))
plot(
  g,
  # Swap the two coordinate columns and negate them to reorient the tree.
  layout = -lay[, 2:1],
  vertex.label.cex = 0.7,
  vertex.size = 1,
  edge.arrow.size = 0.4
)
par(old_par)
Run Code Online (Sandbox Code Playgroud)

在此输入图像描述

或者在 ggraph 中类似这样的东西:

library(ggraph)

# Draw the graph `g` as a non-circular dendrogram, flipped so the tree grows
# along the horizontal axis, with every node labelled by its vertex name.
ggraph(g, layout = "dendrogram", circular = FALSE) +
  geom_edge_link() +
  # `name` is the vertex name column carried in the layout data, so the label
  # does not depend on the global `g` staying in sync with the plotted graph.
  geom_node_label(aes(label = name), size = 2, nudge_y = -0.1) +
  scale_y_reverse() +
  coord_flip() +
  theme_void()
Run Code Online (Sandbox Code Playgroud)

在此输入图像描述


d.b*_*d.b 3

这是一种基于图(graph)的方法。

library(igraph)

# Build the edge rows for one organism from its rows of the long taxonomy
# table: family -> genus (tagged "root" in the third column) and
# genus -> species (third column NA).
edges_for_species <- function(x) {
  values_at <- function(ranks) x$value[match(ranks, x$rank)]
  data.frame(
    rbind(
      c(values_at(c("family", "genus")), "root"),
      c(values_at(c("genus", "species")), NA)
    ),
    stringsAsFactors = FALSE
  )
}

# `d` is the long-format taxonomy table (columns species, rank, value); it is
# not defined in this snippet.
df <- do.call(rbind, lapply(split(d, d$species), edges_for_species))
df <- unique(df)
rownames(df) <- NULL
df

# The first two columns are the edge endpoints; the third becomes an edge
# attribute.
g <- graph_from_data_frame(df, directed = FALSE)

# Root the tree layout at the families, i.e. the first-column entries of the
# rows tagged "root". %in% treats the NA tags as non-matches, so no NA index
# is produced.
root_names <- sort(unique(df[[1]][df[[3]] %in% "root"]))
plot(g, layout = layout_as_tree(g, root = which(V(g)$name %in% root_names)))
Run Code Online (Sandbox Code Playgroud)

以及使用 ggplot2 的方法

# Turn the long taxonomy table `d` (columns species, rank, value; not defined
# in this snippet) into one point per organism and rank, then draw each
# family -> genus -> species path as a line. Assumes dplyr, tidyr and ggplot2
# are attached.
d2 <- d %>%
  # The identifier column shares its name with the "species" rank, so widening
  # on `rank` would yield two `species` columns; rename the identifier first.
  rename(taxon = species) %>%
  pivot_wider(names_from = rank, values_from = value) %>%
  arrange(family, genus, species) %>%
  mutate(
    # Leaf labels show only the second word of the species name.
    species = vapply(
      strsplit(species, " "),
      function(words) words[2],
      character(1)
    ),
    y3 = row_number(),                # leaf position: one row per organism
    grp = row_number(),               # one line (path) per organism
    y2 = ave(y3, genus, FUN = mean),  # genus at the mean of its rows' y3
    y1 = ave(y2, family, FUN = mean)  # family at the mean of its rows' y2
  ) %>%
  pivot_longer(c(y1, y2, y3), names_to = "key", values_to = "y") %>%
  mutate(
    # x is the depth in the tree: 1 = family, 2 = genus, 3 = species.
    x = as.numeric(factor(key, c("y1", "y2", "y3"))),
    lbl = case_when(
      key == "y1" ~ family,
      key == "y2" ~ genus,
      key == "y3" ~ species,
      TRUE ~ NA_character_
    )
  ) %>%
  arrange(x, y)

# Close any open graphics devices so the plot is drawn on a fresh one.
graphics.off()
ggplot(d2, aes(x, y, group = grp, label = lbl)) +
  geom_point(size = 2, shape = 21) +
  geom_line() +
  geom_text(hjust = "inward", vjust = "inward")
Run Code Online (Sandbox Code Playgroud)