Source code for maayanlab_bioinformatics.clustering.silhouette_analysis
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
[docs]
def silhouette_analysis(mat: pd.DataFrame, min_clusters=2, max_clusters=25, metric='cosine', random_state=None, **kwargs):
    ''' Fit KMeans for every cluster count between min_clusters and
    max_clusters (inclusive), score each fit with silhouette_score, and
    return the best-scoring model together with its predictions.

    :param mat: Samples-by-features matrix; rows are clustered and the row
      index is reused as the index of the returned predictions.
    :param min_clusters: Smallest number of clusters to try.
    :param max_clusters: Largest number of clusters to try (inclusive).
    :param metric: Distance metric forwarded to silhouette_score.
    :param random_state: Seed forwarded to KMeans.
    :param kwargs: Accepted for call-site compatibility; not used here.
    :return: An ad-hoc class named `SilhouetteAnalysis` exposing
      `silhouette_scores` (DataFrame with columns `N Clusters` and
      `Silhouette Score`), `best_score`, `best_km` (the fitted KMeans
      with the highest score) and `best_preds` (DataFrame with a single
      `Cluster` column of `Cluster <label>` strings, indexed like `mat`).
    :raises ValueError: If min_clusters is greater than max_clusters, so
      there is no cluster count to evaluate.
    '''
    if min_clusters > max_clusters:
        raise ValueError(
            'min_clusters ({lo}) must not exceed max_clusters ({hi})'.format(
                lo=min_clusters, hi=max_clusters,
            )
        )
    # Extract the underlying array once instead of on every iteration.
    data = mat.values
    records = []
    best = None
    for n in range(min_clusters, max_clusters + 1):
        km = KMeans(n_clusters=n, random_state=random_state)
        y_pred = km.fit_predict(data)
        # Honor the caller's metric rather than always using cosine.
        score = silhouette_score(data, y_pred, metric=metric)
        records.append({'N Clusters': n, 'Silhouette Score': score})
        # Strict '>' keeps the smallest cluster count when scores tie.
        if best is None or score > best[0]:
            best = (score, km, y_pred)
    silhouette_scores = pd.DataFrame(records)
    score, km, y_pred = best
    # Reuse the labels that produced the best score. Refitting here would
    # repeat the work and, with random_state=None, could yield labels that
    # no longer correspond to best_score.
    best_preds = pd.DataFrame({
        'Cluster': [
            'Cluster {c}'.format(c=c)
            for c in y_pred
        ]
    }, index=mat.index)
    # The result is a dynamically created class (not an instance); callers
    # read the values as class attributes.
    return type('SilhouetteAnalysis', tuple(), dict(
        silhouette_scores=silhouette_scores,
        best_score=score,
        best_km=km,
        best_preds=best_preds,
    ))