Shu*_*nov 40 python plot cluster-analysis dendrogram
我正在尝试使用AgglomerativeClustering提供的children_属性构建树状图,但到目前为止运气不佳。我无法使用scipy.cluster,因为scipy提供的凝聚聚类缺少一些对我很重要的选项(例如指定聚类数量的选项)。非常感谢任何建议。
import sklearn.cluster

# Refer to the estimator class through the module that was actually imported;
# the bare name `cluster` is never bound by `import sklearn.cluster`.
clstr = sklearn.cluster.AgglomerativeClustering(n_clusters=2)
# Read the merge tree from the same object that was created above (the
# original referenced an undefined `clusterer`).
# NOTE(review): children_ is a fitted attribute, so clstr.fit(...) has to be
# called on some data before this line can succeed.
clstr.children_
Run Code Online (Sandbox Code Playgroud)
seb*_*gel 12
改为使用凝聚聚类的scipy实现.这是一个例子.
from scipy.cluster.hierarchy import dendrogram, linkage

# Four 2-D observations: two points near the origin and two near (1, 1).
data = [
    [0., 0.],
    [0.1, -0.1],
    [1., 1.],
    [1.1, 1.1],
]

# Hierarchically cluster the observations with linkage's default settings.
Z = linkage(data)

# Draw the dendrogram described by the linkage matrix.
dendrogram(Z)
Run Code Online (Sandbox Code Playgroud)
你可以在这里找到linkage的文档,在这里找到dendrogram的文档。
我前段时间遇到了完全相同的问题。我设法绘制这该死的树状图的方法是使用软件包ete3。该软件包能够通过各种选项灵活地绘制树。唯一的困难是将sklearn的children_输出转换为ete3可以读取和理解的Newick树格式。此外,我需要手动计算分支(dendrite)的跨度,因为children_没有提供这一信息。这是我使用的代码片段。它计算Newick树,然后显示ete3的树数据结构。有关如何绘图的更多详细信息,请查看此处
import numpy as np
from sklearn.cluster import AgglomerativeClustering
import ete3
def build_Newick_tree(children,n_leaves,X,leaf_labels,spanner):
    """
    ntree = build_Newick_tree(children,n_leaves,X,leaf_labels,spanner)

    Serialize the hierarchy produced by sklearn's
    AgglomerativeClustering.fit as a Newick tree string.

    Input:
        children: AgglomerativeClustering.children_
        n_leaves: AgglomerativeClustering.n_leaves_
        X: parameters supplied to AgglomerativeClustering.fit
        leaf_labels: The label of each parameter array in X
        spanner: Callable that computes the dendrite's span

    Output:
        ntree: A str with the Newick tree representation, terminated
            by ';'
    """
    # Start the descent at the highest node id, n_leaves+len(children)-1,
    # which is treated as the root of the whole tree.
    root_name = n_leaves + len(children) - 1
    root_result = go_down_tree(
        children, n_leaves, X, leaf_labels, root_name, spanner
    )
    # Only the Newick text (first element) is needed; Newick strings end
    # with a semicolon.
    newick_body = root_result[0]
    return newick_body + ';'
def go_down_tree(children,n_leaves,X,leaf_labels,nodename,spanner):
    """
    ntree, samples = go_down_tree(children,n_leaves,X,leaf_labels,nodename,spanner)

    Recursive function that traverses the subtree that descends from
    nodename and returns the Newick representation of the subtree.

    Input:
        children: AgglomerativeClustering.children_
        n_leaves: AgglomerativeClustering.n_leaves_
        X: parameters supplied to AgglomerativeClustering.fit
        leaf_labels: The label of each parameter array in X
        nodename: An int that is the intermediate node name whose
            children are located in children[nodename-n_leaves].
            Values below n_leaves name a leaf, i.e. row nodename of X.
        spanner: Callable that computes the dendrite's span

    Output:
        ntree: The Newick representation of the subtree (without the
            trailing ';'). For a leaf this is leaf_labels[nodename]
            returned as-is.
        samples: 2D numpy array stacking the rows of X that belong to
            every leaf below nodename.
    """
    if nodename < n_leaves:
        # Leaf: index directly with the node name instead of relying on the
        # negative offset nodename-n_leaves wrapping around to the same row.
        return leaf_labels[nodename], np.array([X[nodename]])

    left_child, right_child = children[nodename - n_leaves]
    # spanner must be forwarded: it is a required positional parameter, and
    # the sub-branches need it for their own internal nodes.
    branch0, branch0samples = go_down_tree(
        children, n_leaves, X, leaf_labels, left_child, spanner
    )
    branch1, branch1samples = go_down_tree(
        children, n_leaves, X, leaf_labels, right_child, spanner
    )

    # The merged cluster holds the samples of both branches.
    node = np.vstack((branch0samples, branch1samples))
    nodespan = spanner(node)
    # A branch length is how much the span grows when the branch is merged
    # into this node.
    branch0distance = nodespan - spanner(branch0samples)
    branch1distance = nodespan - spanner(branch1samples)

    newick = '({branch0}:{branch0distance},{branch1}:{branch1distance})'.format(
        branch0=branch0,
        branch0distance=branch0distance,
        branch1=branch1,
        branch1distance=branch1distance,
    )
    return newick, node
def get_cluster_spanner(aggClusterer):
    """
    spanner = get_cluster_spanner(aggClusterer)

    Get a callable that computes a given cluster's span. To compute
    a cluster's span, call spanner(cluster)

    The cluster must be a 2D numpy array, where the axis=0 holds
    separate cluster members and the axis=1 holds the different
    variables.

    Input:
        aggClusterer: sklearn.cluster.AgglomerativeClustering instance.
            Its linkage attribute is read, plus affinity (or metric when
            affinity is missing, None or 'deprecated') and, for ward
            linkage, pooling_func when it is present and callable.

    Output:
        spanner: Callable mapping a 2D cluster array to a scalar span:
            ward     -> sum of squared deviations from the pooled centre
            complete -> maximum pairwise distance inside the cluster
            average  -> mean of the full pairwise distance matrix

    Raises:
        AttributeError: if the linkage is not 'ward', 'complete' or
            'average', or if the affinity is not supported for the
            selected linkage.
    """
    linkage = aggClusterer.linkage

    # Resolve the distance name. Older estimators expose `affinity`; newer
    # scikit-learn releases renamed it to `metric` (see the
    # AgglomerativeClustering documentation), so fall back to it.
    affinity = getattr(aggClusterer, 'affinity', 'deprecated')
    if affinity is None or affinity == 'deprecated':
        affinity = getattr(aggClusterer, 'metric', None)
        if affinity is None:
            affinity = 'euclidean'

    if linkage == 'ward':
        if affinity != 'euclidean':
            # Previously this path fell through and failed with an
            # UnboundLocalError at the return statement.
            raise AttributeError(
                'Ward linkage only supports the euclidean affinity, '
                'got {0}.'.format(affinity)
            )
        # pooling_func is absent (or a non-callable placeholder) on newer
        # estimators; the mean is used as the cluster centre in that case.
        pooling_func = getattr(aggClusterer, 'pooling_func', np.mean)
        if not callable(pooling_func):
            pooling_func = np.mean

        def spanner(x):
            return np.sum((x - pooling_func(x, axis=0)) ** 2)

        return spanner

    if linkage == 'complete':
        reduce_pairs = np.max
    elif linkage == 'average':
        reduce_pairs = np.mean
    else:
        raise AttributeError('Unknown linkage attribute value {0}.'.format(linkage))

    # Every helper below returns the full (n, n) pairwise matrix of a cluster,
    # diagonal included, built by broadcasting x against itself.
    def squared_euclidean(x):
        # NOTE(review): 'euclidean' uses squared distances while 'l2' takes
        # the square root; kept as in the original -- confirm it is intended.
        return np.sum((x[:, None, :] - x[None, :, :]) ** 2, axis=2)

    def manhattan(x):
        return np.sum(np.abs(x[:, None, :] - x[None, :, :]), axis=2)

    def l2(x):
        return np.sqrt(np.sum((x[:, None, :] - x[None, :, :]) ** 2, axis=2))

    def cosine(x):
        # Cosine distance = 1 - cosine similarity. The dot products must be
        # reduced over the variable axis only so the result stays (n, n).
        dots = np.sum(x[:, None, :] * x[None, :, :], axis=2)
        norms = np.sqrt(np.sum(x * x, axis=1))
        return 1.0 - dots / (norms[:, None] * norms[None, :])

    pairwise_by_affinity = {
        'euclidean': squared_euclidean,
        'l1': manhattan,
        'manhattan': manhattan,
        'l2': l2,
        'cosine': cosine,
    }
    # Only string names are looked up; anything else (e.g. a callable
    # affinity) is reported as unknown, as before.
    pairwise = None
    if isinstance(affinity, str):
        pairwise = pairwise_by_affinity.get(affinity)
    if pairwise is None:
        raise AttributeError('Unknown affinity attribute value {0}.'.format(affinity))

    def spanner(x):
        return reduce_pairs(pairwise(x))

    return spanner
# compute_full_tree can also be set to 'auto'; it is left as True here to get
# the entire tree plotted.
clusterer = AgglomerativeClustering(
    n_clusters=2,
    compute_full_tree=True,
)

# X stands for whatever data you want to fit.
clusterer.fit(X)

spanner = get_cluster_spanner(clusterer)

# leaf_labels is a list of labels, one for each entry in X.
newick_tree = build_Newick_tree(
    clusterer.children_,
    clusterer.n_leaves_,
    X,
    leaf_labels,
    spanner,
)

# Load the Newick string into ete3's tree structure and display it.
tree = ete3.Tree(newick_tree)
tree.show()
Run Code Online (Sandbox Code Playgroud)