-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathKMeansClusterer.py
108 lines (90 loc) · 3.85 KB
/
KMeansClusterer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scoreFunction import scoreFunction
class KMeansClusterer:
"""
A class for clustering using kMeans and finding the optimal number of clusters
"""
def __init__(self, embeddings, max_clusters):
"""
Initializes the KMeansClusterer class with the given dataset and maxiumum number of clusters to test.
Args:
embeddings (dict): A dictionary of word or document embeddings.
max_clusters (int): The maximum number of clusters to consider.
"""
self.max_clusters = max_clusters
self.dataset = pd.DataFrame.from_dict(embeddings,orient='index')
self.clusterings = {}
for n_clust in range(2,max_clusters+1):
self.clusterings[n_clust] = self.cluster_data(n_clust)
def cluster_data(self, n_clust):
"""
Cluster the dataset using the specified metric and a range of cluster sizes.
Args:
n_clust (int): The number of clusters to use.
Returns:
cluster_labels (numpy.ndarray): The cluster labels assigned to each data point.
"""
if n_clust in self.clusterings:
return self.clusterings[n_clust]
init_method = 'k-means++'
n_clust = n_clust
n_init = 100
max_iter = 1000
usetol = 1e-10
verbose = 0
random_state = 47
algorithm = "auto"
sk_KMeans = KMeans(n_clusters = n_clust,
init = init_method,
n_init = n_init,
max_iter = max_iter,
tol = usetol,
verbose = verbose,
random_state = random_state,
algorithm = algorithm)
return sk_KMeans.fit(self.dataset)
def evaluate_clustering(self, metric="si",sf_weight=1):
"""
Create a graph showing how the chosen metric varies with the number of clusters.
Args:
metric (str): The clustering metric to use (e.g. 'si' for silhoute score or 'sf for score function).
sf_weight (int): The weight to use to adjust the score function to avoid overflow
"""
scores = []
for k in range(2, self.max_clusters + 1):
# Cluster the data for each value of k
if metric == "si":
score = silhouette_score(self.dataset, self.clusterings[k].labels_)
scores.append(score)
elif metric == "sf":
score = scoreFunction(self.dataset,self.clusterings[k], sf_weight)
scores.append(score)
# Plot the scores vs. the number of clusters
plt.plot(range(2, self.max_clusters + 1), scores, marker='o')
plt.xlabel('Number of Clusters')
if metric == "si":
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score vs. Number of Clusters')
elif metric == "sf":
plt.ylabel('SF Score')
plt.title('SF Score vs. Number of Clusters')
plt.show()
def get_labled_cluster(self, n_clust):
"""
Returns a dictionary of clustered words or documents based on the number of clusters specified.
Args:
n_clust (int): The number of clusters to use.
Returns:
clustered (dict): A dictionary of clusters where the keys are 0 through n_clust - 1 and the values are lists of words belonging to that cluster.
"""
clustered = {}
for word, clust_id in zip(self.dataset.index[:], self.clusterings[n_clust].labels_):
if clust_id in clustered:
clustered[clust_id].append(word)
else:
clustered[clust_id] = [word]
return clustered