Tags: kdtree, scipy, networkx, pandas, connected-components
Goal: cluster points based on distance and label, using connected components.
Problem: the attributes stored on NetworkX nodes have to be shuttled back and forth between the graph and a Pandas DataFrame.
Tried: other functions such as scikit-learn's NearestNeighbors, but they lead to the same back-and-forth movement of the data.
Question: is there a simpler way to perform this connected-components operation?
Example:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import networkx as nx
from scipy import spatial
# generate example dataframe
pdf = pd.DataFrame({'x': [1.0, 2.0, 3.0, 4.0, 5.0],
                    'y': [1.0, 2.0, 3.0, 4.0, 5.0],
                    'z': [1.0, 2.0, 3.0, 4.0, 5.0],
                    'label': [1, 2, 1, 2, 1]},
                   index=[1, 2, 3, 4, 5])
df = dd.from_pandas(pdf, npartitions=2)
object_id = 0

def cluster(df, object_id=object_id):
    # create kd-tree over the point coordinates
    tree = spatial.cKDTree(df[['x', 'y', 'z']])
    # get neighbours within distance for every point, store them as edges
    edges = pd.DataFrame({'src': [], 'tgt': []}, dtype=int)
    for source, target in enumerate(tree.query_ball_tree(tree, r=2)):
        target.remove(source)  # drop the self-match returned by the query
        if target:
            edges = pd.concat([edges,
                               pd.DataFrame({'src': [source] * len(target), 'tgt': target})],
                              ignore_index=True)
    # create graph for the points using the edges from the ball-tree query
    G = nx.from_pandas_edgelist(edges, 'src', 'tgt')
    # copy the point attributes onto the graph nodes
    # (node ids are positional, matching the order the kd-tree was built in)
    for i in sorted(G.nodes()):
        G.nodes[i]['label'] = df['label'].iloc[i]
        G.nodes[i]['x'] = df['x'].iloc[i]
        G.nodes[i]['y'] = df['y'].iloc[i]
        G.nodes[i]['z'] = df['z'].iloc[i]
    # remove edges between points of different classes
    G.remove_edges_from([(u, v) for (u, v) in G.edges()
                         if G.nodes[u]['label'] != G.nodes[v]['label']])
    # find connected components, create a dataframe per component and assign an object id
    components = [G.subgraph(c) for c in nx.connected_components(G)]
    df_objects = []
    for c in components:
        df_object = pd.DataFrame([[i[0], i[1]['x'], i[1]['y'], i[1]['z'], i[1]['label']]
                                  for i in c.nodes(data=True)],
                                 columns=['point_id', 'x', 'y', 'z', 'label']).set_index('point_id')
        df_object['object_id'] = object_id
        df_objects.append(df_object)
        object_id += 1
    return pd.concat(df_objects)

meta = pd.DataFrame(np.empty(0, dtype=[('x', float), ('y', float), ('z', float),
                                       ('label', int), ('object_id', int)]))
# run the clustering on each partition
df.map_partitions(cluster, meta=meta).head(10)
Answer:
You can use DBSCAN from scikit-learn. With min_samples=1 it essentially finds connected components. The algorithm used for the nearest-neighbour computation is configurable via the algorithm parameter (kd-tree is one of the options).
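For instance, a minimal sketch reusing the question's pdf and the radius r=2 (the column names and the radius are taken from the question above; the label column is ignored here, which is what the next suggestion addresses):

from sklearn.cluster import DBSCAN

# min_samples=1 puts every point into a cluster, so the clusters coincide with
# the connected components of the "within distance eps" graph;
# algorithm="kd_tree" explicitly selects the KD-tree neighbour search.
labels = DBSCAN(eps=2, min_samples=1, algorithm="kd_tree").fit_predict(pdf[['x', 'y', 'z']])
pdf.assign(cluster=labels)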
My other suggestion is to run the computation separately for each label. That simplifies the implementation and allows parallelization.
Both suggestions can be implemented as follows:
from sklearn.cluster import DBSCAN

def add_cluster(df, distance):
    db = DBSCAN(eps=distance, min_samples=1).fit(df[["x", "y", ...]])
    return df.assign(cluster=db.labels_)

df = df.groupby("label", group_keys=False).apply(add_cluster, distance)
It should work for both Pandas and Dask dataframes. Note that the cluster id starts at 0 for each label, i.e. a cluster is uniquely identified by the tuple (label, cluster).
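If a single integer id like the question's object_id is preferred, one possible follow-up (a pandas-only sketch, not part of the original answer) is to number the (label, cluster) pairs:

# collapse each (label, cluster) pair into one integer, mirroring the question's object_id
df = df.assign(object_id=df.groupby(['label', 'cluster']).ngroup())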
Here is a complete example with artificial data:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import DBSCAN

plt.rc("figure", dpi=100)
plt.style.use("ggplot")

# create fake data: four blobs collapsed onto two labels
centers = [[1, 1], [-1, -1], [1, -1], [-1, 1]]
XY, labels = make_blobs(n_samples=100, centers=centers, cluster_std=0.2, random_state=0)
inp = (
    pd.DataFrame(XY, columns=["x", "y"])
    .assign(label=labels)
    .replace({"label": {2: 0, 3: 1}})
)

def add_cluster(df, distance):
    db = DBSCAN(eps=distance, min_samples=1).fit(df[["x", "y"]])
    return df.assign(cluster=db.labels_)

out = inp.groupby("label", group_keys=False).apply(add_cluster, 0.5)

# visualize: the marker encodes the label, each (label, cluster) group gets its own colour
label_marker = ["o", "s"]
ax = plt.gca()
ax.set_aspect('equal')
for (label, cluster), group in out.groupby(["label", "cluster"]):
    plt.scatter(group.x, group.y, marker=label_marker[label])
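Since the question operates on a Dask dataframe, one way to hook the same per-label DBSCAN into Dask is a grouped apply with an explicit meta. This is only a sketch under the assumption that each label group fits into memory inside a single task; the meta construction and the radius 2.0 are illustrative choices, not from the original answer:

import pandas as pd
import dask.dataframe as dd
from sklearn.cluster import DBSCAN

# the same toy points as in the question
pdf = pd.DataFrame({'x': [1.0, 2.0, 3.0, 4.0, 5.0],
                    'y': [1.0, 2.0, 3.0, 4.0, 5.0],
                    'z': [1.0, 2.0, 3.0, 4.0, 5.0],
                    'label': [1, 2, 1, 2, 1]},
                   index=[1, 2, 3, 4, 5])
df = dd.from_pandas(pdf, npartitions=2)

def add_cluster(part, distance):
    # 'part' is a plain pandas DataFrame holding all rows of one label
    db = DBSCAN(eps=distance, min_samples=1).fit(part[['x', 'y', 'z']])
    return part.assign(cluster=db.labels_)

# an empty frame with the right columns and dtypes describes the output schema
meta = pdf.assign(cluster=0).head(0)
out = df.groupby('label').apply(lambda g: add_cluster(g, 2.0), meta=meta).compute()
print(out)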