Riy*_*yaz 25 python r cluster-analysis k-means
我正在尝试把间隔统计量(gap statistic)和预测强度(prediction strength)的 R 实现(http://edchedch.wordpress.com/2011/03/19/counting-clusters/)转换为 Python 脚本,用来估计鸢尾花(iris)数据集中的簇数,该数据集实际包含 3 个簇。但我得到的结果并不是 3:每次运行的结果都不一样,而且几乎从未估计出 3(实际的簇数)。图表显示估计的簇数是 10 而不是 3。我遗漏了什么?有人能帮我找出问题所在吗?
import random
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
def dispersion(data, k):
    """Return the log of the pooled within-cluster sum of squares.

    Args:
        data: 2-D array-like with one row per observation and one column
            per feature.
        k: Number of clusters. For ``k == 1`` no clustering is run; the
            single centre is the column-wise mean of ``data``.

    Returns:
        ``log(W_k)``, where ``W_k`` is the sum over all points of the
        squared Euclidean distance to the centre of the cluster the point
        is assigned to.
    """
    data = np.asarray(data)
    if k == 1:
        centre = np.mean(data, axis=0)
        within_ss = np.sum((data - centre) ** 2)
    else:
        model = KMeans(n_clusters=k, max_iter=50, n_init=5).fit(data)
        # Indexing the centres by label yields, for every row, the centre
        # of its own cluster, so one vectorised subtraction replaces the
        # per-cluster/per-point Python loops (which also relied on
        # assigning into ``range(k)``, a TypeError on Python 3).
        assigned_centres = model.cluster_centers_[model.labels_]
        within_ss = np.sum((data - assigned_centres) ** 2)
    return np.log(within_ss)
def reference_dispersion(data, num_clusters, num_reference_bootstraps,
                         return_stddev=False):
    """Estimate the expected log-dispersion under a uniform reference.

    Draws ``num_reference_bootstraps`` reference data sets with
    ``generate_uniform_points(data)``, computes ``dispersion`` of each for
    ``num_clusters`` clusters, and averages the results.

    Args:
        data: Observed data set; used only to generate reference sets.
        num_clusters: Number of clusters passed to ``dispersion``.
        num_reference_bootstraps: Number of reference data sets (B).
        return_stddev: When true, also return the simulation-error term
            ``s_k = sd_k * sqrt(1 + 1/B)``.

    Returns:
        The mean reference log-dispersion, or a
        ``(mean, s_k)`` tuple when ``return_stddev`` is true.
    """
    dispersions = [
        dispersion(generate_uniform_points(data), num_clusters)
        for _ in range(num_reference_bootstraps)
    ]
    mean_dispersion = np.mean(dispersions)
    if not return_stddev:
        return mean_dispersion
    # The gap-statistic paper inflates the standard deviation by
    # sqrt(1 + 1/B) to account for simulation error; the earlier code
    # divided by that factor and then discarded the value.
    stddev_dispersion = float(np.std(dispersions)) * np.sqrt(
        1. + 1. / num_reference_bootstraps)
    return mean_dispersion, stddev_dispersion
def generate_uniform_points(data):
    """Draw a reference data set uniformly over the bounding box of ``data``.

    Args:
        data: 2-D array-like with one row per observation and one column
            per feature.

    Returns:
        A float array with the same shape as ``data`` in which column ``j``
        is sampled uniformly between the minimum and maximum of column
        ``j`` of ``data``.
    """
    data = np.asarray(data)
    # Column-wise bounds taken directly, rather than via argmin/argmax
    # followed by a second lookup into ``data``.
    lows = data.min(axis=0)
    highs = data.max(axis=0)
    num_datapoints, num_dimensions = data.shape
    reference_data_set = np.zeros((num_datapoints, num_dimensions))
    # Draws use the stdlib ``random`` module in row-major order, so output
    # is reproducible through ``random.seed``.
    for row in reference_data_set:
        for j in range(num_dimensions):
            row[j] = random.uniform(lows[j], highs[j])
    return reference_data_set
def gap_statistic(data, nthCluster, referenceDatasets):
    """Return the observed and reference log-dispersions for one cluster count.

    Args:
        data: Observed data set.
        nthCluster: Number of clusters to evaluate.
        referenceDatasets: Number of uniform reference data sets to average
            over when estimating the reference dispersion.

    Returns:
        ``(actual_dispersion, ref_dispersion)``; the gap for this cluster
        count is ``ref_dispersion - actual_dispersion``.
    """
    actual_dispersion = dispersion(data, nthCluster)
    # Use the argument the caller passed; the previous code ignored it and
    # read a module-level ``num_reference_bootstraps`` instead.
    ref_dispersion = reference_dispersion(data, nthCluster, referenceDatasets)
    return actual_dispersion, ref_dispersion
if __name__ == "__main__":
    # Script entry point: compute the gap for 1..maxClusters clusters on
    # the data in 'iris.mat', report the cluster count with the largest
    # gap, and plot gap against cluster count.
    data = np.loadtxt('iris.mat', delimiter=',', dtype=float)
    maxClusters = 10
    num_reference_bootstraps = 10

    # Column 0: observed log-dispersion, column 1: reference log-dispersion.
    dispersion_values = np.zeros((maxClusters, 2))
    for cluster in range(1, maxClusters + 1):
        actual, reference = gap_statistic(data, cluster, num_reference_bootstraps)
        dispersion_values[cluster - 1] = (actual, reference)

    gaps = dispersion_values[:, 1] - dispersion_values[:, 0]
    print(gaps)

    # NOTE(review): the gap-statistic paper picks the smallest k with
    # Gap(k) >= Gap(k+1) - s_{k+1}; the plain argmax used here tends to
    # favour large k because the gap can keep growing slowly.
    cluster_counts = list(range(1, maxClusters + 1))
    print("The estimated number of clusters is ", cluster_counts[int(np.argmax(gaps))])

    # Plot against the actual cluster counts (1..maxClusters), not the
    # zero-based array index.
    plt.plot(cluster_counts, gaps)
    plt.show()
Run Code Online (Sandbox Code Playgroud)
你可以参考下面这段代码,并根据需要调整输出图形的格式。
```python
# coding: utf-8

# Implémentation de K-means clustering python


#Chargement des bibliothèques
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import datasets


#chargement de jeu des données Iris
iris = datasets.load_iris()

#importer le jeu de données Iris dataset à l'aide du module pandas
x = pd.DataFrame(iris.data)

x.columns = ['Sepal_Length','Sepal_width','Petal_Length','Petal_width']


y = pd.DataFrame(iris.target)


y.columns = ['Targets']


#Création d'un objet K-Means avec un regroupement en 3 clusters (groupes)
model=KMeans(n_clusters=3)



#application du modèle sur notre jeu de données Iris
model.fit(x)



#Visualisation des clusters
plt.scatter(x.Petal_Length, x.Petal_width)
plt.show()




colormap=np.array(['Red','green','blue'])



#Visualisation du jeu de données sans altération de ce dernier (affichage des fleurs selon leur étiquettes)
plt.scatter(x.Petal_Length, x.Petal_width,c=colormap[y.Targets],s=40)
plt.title('Classification réelle')
plt.show()

#Visualisation des clusters formés par K-Means
plt.scatter(x.Petal_Length, x.Petal_width,c=colormap[model.labels_],s=40)
plt.title('Classification K-means ')
plt.show()
```
Run Code Online (Sandbox Code Playgroud)

输出 1