-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhdbscan_cluster.py
103 lines (86 loc) · 4.09 KB
/
hdbscan_cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import numpy as np
from hdbscan import HDBSCAN
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler
import time
def make_var_density_blobs(n_samples=750, centers=None, cluster_std=None, random_state=0):
    """Generate Gaussian blobs where each blob has its own standard deviation.

    One ``make_blobs`` call is made per centre and the results are stacked.

    Parameters
    ----------
    n_samples : int
        Total number of points to generate. When it is not a multiple of
        ``len(centers)`` the remainder is handed to the first blobs, one
        extra point each, so exactly ``n_samples`` rows are returned.
    centers : sequence of coordinate sequences, optional
        One centre per blob. Defaults to a single blob at ``[0, 0]``.
    cluster_std : sequence of float, optional
        Standard deviation of each blob, indexed in step with ``centers``.
        Defaults to ``[0.5]``.
    random_state : int, numpy.random.RandomState or None
        Seed (or ready-made generator) for the draws. ``None`` yields a
        different sample on every call.

    Returns
    -------
    points : ndarray
        The blobs stacked vertically, in the order of ``centers``.
    labels : ndarray of float
        For every row of ``points``, the index of the centre it was drawn
        around.
    """
    # None sentinels instead of list literals: default lists are shared
    # between calls.
    if centers is None:
        centers = [[0, 0]]
    if cluster_std is None:
        cluster_std = [0.5]

    # One generator shared by all make_blobs calls: the result is reproducible
    # for a given seed, yet each blob consumes its own part of the stream
    # (re-passing the same integer seed would give every blob identical noise).
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Spread the remainder over the first blobs so no requested sample is lost.
    base, extra = divmod(n_samples, len(centers))
    counts = [base + (1 if i < extra else 0) for i in range(len(centers))]

    blobs = [
        make_blobs(n_samples=count, centers=[center],
                   cluster_std=cluster_std[i], random_state=rng)[0]
        for i, (center, count) in enumerate(zip(centers, counts))
    ]
    labels = [np.full(count, i, dtype=float) for i, count in enumerate(counts)]
    return np.vstack(blobs), np.hstack(labels)
##############################################################################
# Generate sample data
# Three blob centres, each paired with its own standard deviation.
centers = [[1, 1], [-1, -1], [1, -1]]
densities = [0.2, 0.35, 0.5]
X, labels_true = make_var_density_blobs(
    n_samples=750,
    centers=centers,
    cluster_std=densities,
    random_state=0,
)
# Rescale the features with StandardScaler before clustering.
scaler = StandardScaler()
X = scaler.fit_transform(X)
##############################################################################
# Compute HDBSCAN and DBSCAN
# Each fit is timed with time.perf_counter(), a monotonic high-resolution
# clock; time.time() follows the wall clock and can jump if the system time
# is adjusted mid-run.
hdb_t1 = time.perf_counter()
hdb = HDBSCAN(min_cluster_size=10).fit(X)
hdb_labels = hdb.labels_
hdb_elapsed_time = time.perf_counter() - hdb_t1

db_t1 = time.perf_counter()
db = DBSCAN(eps=0.1).fit(X)
db_labels = db.labels_
db_elapsed_time = time.perf_counter() - db_t1
# Number of clusters in labels, ignoring noise if present (noise is labelled -1).
n_clusters_hdb_ = len(set(hdb_labels)) - (1 if -1 in hdb_labels else 0)

print('\n\n++ HDBSCAN Results')
print('Estimated number of clusters: %d' % n_clusters_hdb_)
print('Elapsed time to cluster: %.4f s' % hdb_elapsed_time)
print('Homogeneity: %0.3f' % metrics.homogeneity_score(labels_true, hdb_labels))
print('Completeness: %0.3f' % metrics.completeness_score(labels_true, hdb_labels))
print('V-measure: %0.3f' % metrics.v_measure_score(labels_true, hdb_labels))
print('Adjusted Rand Index: %0.3f'
      % metrics.adjusted_rand_score(labels_true, hdb_labels))
print('Adjusted Mutual Information: %0.3f'
      % metrics.adjusted_mutual_info_score(labels_true, hdb_labels))
# The silhouette score is undefined without at least two groups and
# silhouette_score raises in that case, so report NaN instead of crashing.
if n_clusters_hdb_ > 1:
    print('Silhouette Coefficient: %0.3f'
          % metrics.silhouette_score(X, hdb_labels))
else:
    print('Silhouette Coefficient: NaN (too few clusters)')
# Distinct DBSCAN labels, not counting the noise label (-1).
n_clusters_db_ = len(set(db_labels).difference([-1]))

print('\n\n++ DBSCAN Results')
print('Estimated number of clusters: %d' % n_clusters_db_)
print('Elapsed time to cluster: %.4f s' % db_elapsed_time)

# Scores that compare the predicted labels against the ground truth, printed
# in a fixed order.
for caption, score in (
        ('Homogeneity: %0.3f', metrics.homogeneity_score),
        ('Completeness: %0.3f', metrics.completeness_score),
        ('V-measure: %0.3f', metrics.v_measure_score),
        ('Adjusted Rand Index: %0.3f', metrics.adjusted_rand_score),
        ('Adjusted Mutual Information: %0.3f', metrics.adjusted_mutual_info_score),
):
    print(caption % score(labels_true, db_labels))

# The silhouette score is only computed when more than one cluster was found.
if n_clusters_db_ <= 1:
    print('Silhouette Coefficient: NaN (too few clusters)')
else:
    print('Silhouette Coefficient: %0.3f' % metrics.silhouette_score(X, db_labels))
##############################################################################
# Plot result
import matplotlib.pyplot as plt

# One colour per label, sampled evenly from the Spectral colormap. The noise
# label (-1) is drawn in black instead of its colormap colour.
hdb_unique_labels = set(hdb_labels)
db_unique_labels = set(db_labels)
hdb_colors = plt.cm.Spectral(np.linspace(0, 1, len(hdb_unique_labels)))
db_colors = plt.cm.Spectral(np.linspace(0, 1, len(db_unique_labels)))

# Two panels side by side. The position is given as integers because current
# Matplotlib rejects the string form ('121').
fig = plt.figure(figsize=plt.figaspect(0.5))
hdb_axis = fig.add_subplot(1, 2, 1)
db_axis = fig.add_subplot(1, 2, 2)

panels = (
    (hdb_axis, hdb_labels, hdb_unique_labels, hdb_colors),
    (db_axis, db_labels, db_unique_labels, db_colors),
)
for axis, point_labels, unique_labels, colors in panels:
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = 'k'
        # Build the membership mask once and reuse it for both coordinates.
        members = point_labels == k
        axis.plot(X[members, 0], X[members, 1], 'o', markerfacecolor=col,
                  markeredgecolor='k', markersize=6)

hdb_axis.set_title('HDBSCAN\nEstimated number of clusters: %d' % n_clusters_hdb_)
db_axis.set_title('DBSCAN\nEstimated number of clusters: %d' % n_clusters_db_)
plt.show()