794 lines
24 KiB
Python
794 lines
24 KiB
Python
#
|
|
# OtterTune - cluster.py
|
|
#
|
|
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
|
|
#
|
|
'''
|
|
Created on Jul 4, 2016
|
|
|
|
@author: dva
|
|
'''
|
|
from abc import ABCMeta, abstractproperty
|
|
from collections import OrderedDict
|
|
|
|
import os
|
|
import json
|
|
import copy
|
|
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
|
|
from scipy.spatial.distance import cdist
|
|
from sklearn.metrics import silhouette_score
|
|
from sklearn.cluster import KMeans as SklearnKMeans
|
|
from celery.utils.log import get_task_logger
|
|
|
|
from .base import ModelBase
|
|
|
|
# Log debug messages
|
|
LOGGER = get_task_logger(__name__)
|
|
|
|
|
|
class KMeans(ModelBase):
|
|
"""
|
|
KMeans:
|
|
|
|
Fits an Sklearn KMeans model to X.
|
|
|
|
|
|
See also
|
|
--------
|
|
http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
|
|
|
|
|
|
Attributes
|
|
----------
|
|
n_clusters_ : int
|
|
The number of clusters, K
|
|
|
|
cluster_inertia_ : float
|
|
Sum of squared distances of samples to their closest cluster center
|
|
|
|
cluster_labels_ : array, [n_clusters_]
|
|
Labels indicating the membership of each point
|
|
|
|
cluster_centers_ : array, [n_clusters, n_features]
|
|
Coordinates of cluster centers
|
|
|
|
sample_labels_ : array, [n_samples]
|
|
Labels for each of the samples in X
|
|
|
|
sample_distances_ : array, [n_samples]
|
|
The distance between each sample point and its cluster's center
|
|
|
|
|
|
Constants
|
|
---------
|
|
SAMPLE_CUTOFF_ : int
|
|
If n_samples > SAMPLE_CUTOFF_ then sample distances
|
|
are NOT recorded
|
|
"""
|
|
|
|
SAMPLE_CUTOFF_ = 1000
|
|
|
|
def __init__(self):
|
|
self.model_ = None
|
|
self.n_clusters_ = None
|
|
self.sample_labels_ = None
|
|
self.sample_distances_ = None
|
|
|
|
@property
|
|
def cluster_inertia_(self):
|
|
# Sum of squared distances of samples to their closest cluster center
|
|
return None if self.model_ is None else \
|
|
self.model_.inertia_
|
|
|
|
@property
|
|
def cluster_labels_(self):
|
|
# Cluster membership labels for each point
|
|
return None if self.model_ is None else \
|
|
copy.deepcopy(self.model_.labels_)
|
|
|
|
@property
|
|
def cluster_centers_(self):
|
|
# Coordinates of the cluster centers
|
|
return None if self.model_ is None else \
|
|
copy.deepcopy(self.model_.cluster_centers_)
|
|
|
|
def _reset(self):
|
|
"""Resets all attributes (erases the model)"""
|
|
self.model_ = None
|
|
self.n_clusters_ = None
|
|
self.sample_labels_ = None
|
|
self.sample_distances_ = None
|
|
|
|
def fit(self, X, K, sample_labels=None, estimator_params=None):
|
|
"""Fits a Sklearn KMeans model to X.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like, shape (n_samples, n_features)
|
|
Training data.
|
|
|
|
K : int
|
|
The number of clusters.
|
|
|
|
sample_labels : array-like, shape (n_samples), optional
|
|
Labels for each of the samples in X.
|
|
|
|
estimator_params : dict, optional
|
|
The parameters to pass to the KMeans estimators.
|
|
|
|
|
|
Returns
|
|
-------
|
|
self
|
|
"""
|
|
self._reset()
|
|
# Note: previously set n_init=50
|
|
self.model_ = SklearnKMeans(K)
|
|
if estimator_params is not None:
|
|
assert isinstance(estimator_params, dict)
|
|
self.model_.set_params(**estimator_params)
|
|
|
|
# Compute Kmeans model
|
|
self.model_.fit(X)
|
|
if sample_labels is None:
|
|
sample_labels = ["sample_{}".format(i) for i in range(X.shape[0])]
|
|
assert len(sample_labels) == X.shape[0]
|
|
self.sample_labels_ = np.array(sample_labels)
|
|
self.n_clusters_ = K
|
|
|
|
# Record sample label/distance from its cluster center
|
|
self.sample_distances_ = OrderedDict()
|
|
for cluster_label in range(self.n_clusters_):
|
|
assert cluster_label not in self.sample_distances_
|
|
member_rows = X[self.cluster_labels_ == cluster_label, :]
|
|
member_labels = self.sample_labels_[self.cluster_labels_ == cluster_label]
|
|
centroid = np.expand_dims(self.cluster_centers_[cluster_label], axis=0)
|
|
|
|
# "All clusters must have at least 1 member!"
|
|
if member_rows.shape[0] == 0:
|
|
return None
|
|
|
|
# Calculate distance between each member row and the current cluster
|
|
dists = np.empty(member_rows.shape[0])
|
|
dist_labels = []
|
|
for j, (row, label) in enumerate(zip(member_rows, member_labels)):
|
|
dists[j] = cdist(np.expand_dims(row, axis=0), centroid, "euclidean").squeeze()
|
|
dist_labels.append(label)
|
|
|
|
# Sort the distances/labels in ascending order
|
|
sort_order = np.argsort(dists)
|
|
dists = dists[sort_order]
|
|
dist_labels = np.array(dist_labels)[sort_order]
|
|
self.sample_distances_[cluster_label] = {
|
|
"sample_labels": dist_labels,
|
|
"distances": dists,
|
|
}
|
|
return self
|
|
|
|
def get_closest_samples(self):
|
|
"""Returns a list of the labels of the samples that are located closest
|
|
to their cluster's center.
|
|
|
|
|
|
Returns
|
|
----------
|
|
closest_samples : list
|
|
A list of the sample labels that are located the closest to
|
|
their cluster's center.
|
|
"""
|
|
if self.sample_distances_ is None:
|
|
raise Exception("No model has been fit yet!")
|
|
|
|
return [samples['sample_labels'][0] for samples in list(self.sample_distances_.values())]
|
|
|
|
def get_memberships(self):
|
|
'''
|
|
Return the memberships in each cluster
|
|
'''
|
|
memberships = OrderedDict()
|
|
for cluster_label, samples in list(self.sample_distances_.items()):
|
|
memberships[cluster_label] = OrderedDict(
|
|
[(l, d) for l, d in zip(samples["sample_labels"], samples["distances"])])
|
|
return json.dumps(memberships, indent=4)
|
|
|
|
|
|
class KMeansClusters(ModelBase):
|
|
|
|
"""
|
|
KMeansClusters:
|
|
|
|
Fits a KMeans model to X for clusters in the range [min_cluster_, max_cluster_].
|
|
|
|
|
|
Attributes
|
|
----------
|
|
min_cluster_ : int
|
|
The minimum cluster size to fit a KMeans model to
|
|
|
|
max_cluster_ : int
|
|
The maximum cluster size to fit a KMeans model to
|
|
|
|
cluster_map_ : dict
|
|
A dictionary mapping the cluster size (K) to the KMeans
|
|
model fitted to X with K clusters
|
|
|
|
sample_labels_ : array, [n_samples]
|
|
Labels for each of the samples in X
|
|
"""
|
|
|
|
def __init__(self):
|
|
self.min_cluster_ = None
|
|
self.max_cluster_ = None
|
|
self.cluster_map_ = None
|
|
self.sample_labels_ = None
|
|
|
|
def _reset(self):
|
|
"""Resets all attributes (erases the model)"""
|
|
self.min_cluster_ = None
|
|
self.max_cluster_ = None
|
|
self.cluster_map_ = None
|
|
self.sample_labels_ = None
|
|
|
|
def fit(self, X, min_cluster, max_cluster, sample_labels=None, estimator_params=None):
|
|
"""Fits a KMeans model to X for each cluster in the range [min_cluster, max_cluster].
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like, shape (n_samples, n_features)
|
|
Training data.
|
|
|
|
min_cluster : int
|
|
The minimum cluster size to fit a KMeans model to.
|
|
|
|
max_cluster : int
|
|
The maximum cluster size to fit a KMeans model to.
|
|
|
|
sample_labels : array-like, shape (n_samples), optional
|
|
Labels for each of the samples in X.
|
|
|
|
estimator_params : dict, optional
|
|
The parameters to pass to the KMeans estimators.
|
|
|
|
|
|
Returns
|
|
-------
|
|
self
|
|
"""
|
|
self._reset()
|
|
self.min_cluster_ = min_cluster
|
|
self.max_cluster_ = max_cluster
|
|
self.cluster_map_ = {}
|
|
if sample_labels is None:
|
|
sample_labels = ["sample_{}".format(i) for i in range(X.shape[1])]
|
|
self.sample_labels_ = sample_labels
|
|
for K in range(self.min_cluster_, self.max_cluster_ + 1):
|
|
tmp = KMeans().fit(X, K, self.sample_labels_, estimator_params)
|
|
if tmp is None: # Set maximum cluster
|
|
assert K > min_cluster, "min_cluster is too large for the model"
|
|
self.max_cluster_ = K - 1
|
|
break
|
|
else:
|
|
self.cluster_map_[K] = tmp
|
|
|
|
return self
|
|
|
|
def save(self, savedir):
|
|
"""Saves the KMeans model results
|
|
|
|
Parameters
|
|
----------
|
|
savedir : string
|
|
Path to the directory to save the results in.
|
|
"""
|
|
if self.cluster_map_ is None:
|
|
raise Exception("No models have been fitted yet!")
|
|
|
|
cluster_map = OrderedDict()
|
|
inertias = []
|
|
for K, model in sorted(self.cluster_map_.items()):
|
|
cluster_map[K] = {
|
|
"cluster_inertia": model.cluster_inertia_,
|
|
"cluster_labels": model.cluster_labels_,
|
|
"cluster_centers": model.cluster_centers_,
|
|
}
|
|
inertias.append(model.cluster_inertia_)
|
|
|
|
# Save sum of squares plot (elbow curve)
|
|
fig = plt.figure()
|
|
plt.plot(list(cluster_map.keys()), inertias, '--o')
|
|
plt.xlabel("Number of clusters (K)")
|
|
plt.ylabel("Within sum of squares W_k")
|
|
plt.title("Within Sum of Squares vs. Number of Clusters")
|
|
fig.canvas.set_window_title(os.path.basename(savedir))
|
|
savepath = os.path.join(savedir, "kmeans_sum_of_squares.pdf")
|
|
plt.savefig(savepath, bbox_inches="tight")
|
|
plt.close()
|
|
|
|
# save cluster memberships
|
|
for K in range(self.min_cluster_, self.max_cluster_ + 1):
|
|
savepath = os.path.join(savedir,
|
|
"memberships_{}-clusters.json".format(K))
|
|
members = self.cluster_map_[K].get_memberships()
|
|
with open(savepath, "w") as f:
|
|
f.write(members)
|
|
|
|
|
|
class KSelection(ModelBase, metaclass=ABCMeta):
|
|
"""KSelection:
|
|
|
|
Abstract class for techniques that approximate the optimal
|
|
number of clusters (K).
|
|
|
|
|
|
Attributes
|
|
----------
|
|
optimal_num_clusters_ : int
|
|
An estimation of the optimal number of clusters K for
|
|
a KMeans model fit to X
|
|
clusters_ : array, [n_clusters]
|
|
The sizes of the clusters
|
|
|
|
name_ : string
|
|
The name of this technique
|
|
"""
|
|
|
|
NAME_ = None
|
|
|
|
def __init__(self):
|
|
self.optimal_num_clusters_ = None
|
|
self.clusters_ = None
|
|
|
|
def _reset(self):
|
|
"""Resets all attributes (erases the model)"""
|
|
self.optimal_num_clusters_ = None
|
|
self.clusters_ = None
|
|
|
|
@abstractproperty
|
|
def name_(self):
|
|
pass
|
|
|
|
def save(self, savedir):
|
|
"""Saves the estimation of the optimal # of clusters.
|
|
|
|
Parameters
|
|
----------
|
|
savedir : string
|
|
Path to the directory to save the results in.
|
|
"""
|
|
if self.optimal_num_clusters_ is None:
|
|
raise Exception("Optimal number of clusters has not been computed!")
|
|
|
|
# Save the computed optimal number of clusters
|
|
savepath = os.path.join(savedir, self.name_ + "_optimal_num_clusters.txt")
|
|
with open(savepath, "w") as f:
|
|
f.write(str(self.optimal_num_clusters_))
|
|
|
|
|
|
class GapStatistic(KSelection):
|
|
"""GapStatistic:
|
|
|
|
Approximates the optimal number of clusters (K).
|
|
|
|
|
|
References
|
|
----------
|
|
https://web.stanford.edu/~hastie/Papers/gap.pdf
|
|
|
|
|
|
Attributes
|
|
----------
|
|
optimal_num_clusters_ : int
|
|
An estimation of the optimal number of clusters K for
|
|
a KMeans model fit to X
|
|
|
|
clusters_ : array, [n_clusters]
|
|
The sizes of the clusters
|
|
|
|
name_ : string
|
|
The name of this technique
|
|
|
|
log_wks_ : array, [n_clusters]
|
|
The within-dispersion measures of X (log)
|
|
|
|
log_wkbs_ : array, [n_clusters]
|
|
The within-dispersion measures of the generated
|
|
reference data sets
|
|
|
|
khats_ : array, [n_clusters]
|
|
The gap-statistic for each cluster
|
|
"""
|
|
|
|
NAME_ = "gap-statistic"
|
|
|
|
def __init__(self):
|
|
super(GapStatistic, self).__init__()
|
|
self.log_wks_ = None
|
|
self.log_wkbs_ = None
|
|
self.khats_ = None
|
|
|
|
@property
|
|
def name_(self):
|
|
return self.NAME_
|
|
|
|
def _reset(self):
|
|
"""Resets all attributes (erases the model)"""
|
|
super(GapStatistic, self)._reset()
|
|
self.log_wks_ = None
|
|
self.log_wkbs_ = None
|
|
self.khats_ = None
|
|
|
|
def fit(self, X, cluster_map, n_b=50):
|
|
"""Estimates the optimal number of clusters (K) for a
|
|
KMeans model trained on X.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like, shape (n_samples, n_features)
|
|
Training data.
|
|
|
|
cluster_map_ : dict
|
|
A dictionary mapping each cluster size (K) to the KMeans
|
|
model fitted to X with K clusters
|
|
|
|
n_B : int
|
|
The number of reference data sets to generate
|
|
|
|
|
|
Returns
|
|
-------
|
|
self
|
|
"""
|
|
self._reset()
|
|
mins, maxs = GapStatistic.bounding_box(X)
|
|
n_clusters = len(cluster_map)
|
|
|
|
# Dispersion for real distribution
|
|
log_wks = np.zeros(n_clusters)
|
|
log_wkbs = np.zeros(n_clusters)
|
|
sk = np.zeros(n_clusters)
|
|
for indk, (K, model) in enumerate(sorted(cluster_map.items())):
|
|
|
|
# Computes Wk: the within-dispersion of each cluster size (k)
|
|
log_wks[indk] = np.log(model.cluster_inertia_ / (2.0 * K))
|
|
|
|
# Create B reference datasets
|
|
log_bwkbs = np.zeros(n_b)
|
|
for i in range(n_b):
|
|
Xb = np.empty_like(X)
|
|
for j in range(X.shape[1]):
|
|
Xb[:, j] = np.random.uniform(mins[j], maxs[j], size=X.shape[0])
|
|
Xb_model = KMeans().fit(Xb, K)
|
|
log_bwkbs[i] = np.log(Xb_model.cluster_inertia_ / (2.0 * K))
|
|
log_wkbs[indk] = sum(log_bwkbs) / n_b
|
|
sk[indk] = np.sqrt(sum((log_bwkbs - log_wkbs[indk]) ** 2) / n_b)
|
|
sk = sk * np.sqrt(1 + 1.0 / n_b)
|
|
|
|
khats = np.zeros(n_clusters)
|
|
gaps = log_wkbs - log_wks
|
|
gsks = gaps - sk
|
|
khats[1:] = gaps[0:-1] - gsks[1:]
|
|
self.clusters_ = np.array(sorted(cluster_map.keys()))
|
|
|
|
for i in range(1, n_clusters):
|
|
if gaps[i - 1] >= gsks[i]:
|
|
self.optimal_num_clusters_ = self.clusters_[i - 1]
|
|
break
|
|
|
|
if self.optimal_num_clusters_ is None:
|
|
LOGGER.info("GapStatistic NOT found the optimal k, \
|
|
use the last(maximum) k instead ")
|
|
self.optimal_num_clusters_ = self.clusters_[-1]
|
|
|
|
self.log_wks_ = log_wks
|
|
self.log_wkbs_ = log_wkbs
|
|
self.khats_ = khats
|
|
return self
|
|
|
|
@staticmethod
|
|
def bounding_box(X):
|
|
"""Computes the box that tightly bounds X
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like, shape (n_samples, n_features)
|
|
Training data.
|
|
|
|
|
|
Returns
|
|
-------
|
|
The mins and maxs that make up the bounding box
|
|
"""
|
|
mins = np.min(X, axis=0)
|
|
maxs = np.max(X, axis=0)
|
|
return mins, maxs
|
|
|
|
@staticmethod
|
|
def Wk(X, mu, cluster_labels):
|
|
"""Computes the within-dispersion of each cluster size (k)
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like, shape (n_samples, n_features)
|
|
Training data.
|
|
|
|
mu : array-like, shape (n_clusters, n_features)
|
|
Coordinates of cluster centers
|
|
|
|
cluster_labels: array-like, shape (n_samples)
|
|
Labels for each of the samples in X.
|
|
|
|
|
|
Returns
|
|
-------
|
|
The within-dispersion of each cluster (K)
|
|
"""
|
|
K = len(mu)
|
|
return sum([np.linalg.norm(mu[i] - x) ** 2 / (2.0 * K)
|
|
for i in range(K)
|
|
for x in X[cluster_labels == i]])
|
|
|
|
def save(self, savedir):
|
|
"""Saves the estimation results of the optimal # of clusters.
|
|
|
|
Parameters
|
|
----------
|
|
savedir : string
|
|
Path to the directory to save the results in.
|
|
"""
|
|
super(GapStatistic, self).save(savedir)
|
|
|
|
# Plot the calculated gap
|
|
gaps = self.log_wkbs_ - self.log_wks_
|
|
fig = plt.figure()
|
|
plt.plot(self.clusters_, gaps, '--o')
|
|
plt.title("Gap vs. Number of Clusters")
|
|
plt.xlabel("Number of clusters (K)")
|
|
plt.ylabel("gap_K")
|
|
fig.canvas.set_window_title(os.path.basename(savedir))
|
|
plt.savefig(os.path.join(savedir, self.name_ + ".pdf"), bbox_inches="tight")
|
|
plt.close()
|
|
|
|
# Plot the gap statistic
|
|
fig = plt.figure()
|
|
plt.bar(self.clusters_, self.khats_)
|
|
plt.title("Gap Statistic vs. Number of Clusters")
|
|
plt.xlabel("Number of clusters (K)")
|
|
plt.ylabel("gap(K)-(gap(K+1)-s(K+1))")
|
|
fig.canvas.set_window_title(os.path.basename(savedir))
|
|
plt.savefig(os.path.join(savedir, self.name_ + "_final.pdf"),
|
|
bbox_inches="tight")
|
|
plt.close()
|
|
|
|
|
|
class DetK(KSelection):
|
|
"""DetK:
|
|
|
|
Approximates the optimal number of clusters (K).
|
|
|
|
|
|
References
|
|
----------
|
|
https://www.ee.columbia.edu/~dpwe/papers/PhamDN05-kmeans.pdf
|
|
|
|
|
|
Attributes
|
|
----------
|
|
optimal_num_clusters_ : int
|
|
An estimation of the optimal number of clusters K for
|
|
KMeans models fit to X
|
|
|
|
clusters_ : array, [n_clusters]
|
|
The sizes of the clusters
|
|
|
|
name_ : string
|
|
The name of this technique
|
|
|
|
fs_ : array, [n_clusters]
|
|
The computed evaluation functions F(K) for each cluster size K
|
|
"""
|
|
|
|
NAME_ = "det-k"
|
|
|
|
def __init__(self):
|
|
super(DetK, self).__init__()
|
|
self.fs_ = None
|
|
|
|
@property
|
|
def name_(self):
|
|
return DetK.NAME_
|
|
|
|
def _reset(self):
|
|
"""Resets all attributes (erases the model)"""
|
|
super(DetK, self)._reset()
|
|
self.fs_ = None
|
|
|
|
def fit(self, X, cluster_map):
|
|
"""Estimates the optimal number of clusters (K) for a
|
|
KMeans model trained on X.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like, shape (n_samples, n_features)
|
|
Training data.
|
|
|
|
cluster_map_ : dict
|
|
A dictionary mapping each cluster size (K) to the KMeans
|
|
model fitted to X with K clusters
|
|
|
|
|
|
Returns
|
|
-------
|
|
self
|
|
"""
|
|
self._reset()
|
|
n_clusters = len(cluster_map)
|
|
nd = X.shape[1]
|
|
fs = np.empty(n_clusters)
|
|
sks = np.empty(n_clusters)
|
|
alpha = {}
|
|
# K from 1 to maximum_cluster_
|
|
for i, (K, model) \
|
|
in enumerate(sorted(cluster_map.items())):
|
|
# Compute alpha(K, nd) (i.e. alpha[K])
|
|
if K == 2:
|
|
alpha[K] = 1 - 3.0 / (4 * nd)
|
|
elif K > 2:
|
|
alpha[K] = alpha[K - 1] + (1 - alpha[K - 1]) / 6.0
|
|
sks[i] = model.cluster_inertia_
|
|
|
|
if K == 1:
|
|
fs[i] = 1
|
|
elif sks[i - 1] == 0:
|
|
fs[i] = 1
|
|
else:
|
|
fs[i] = sks[i] / (alpha[K] * sks[i - 1])
|
|
self.clusters_ = np.array(sorted(cluster_map.keys()))
|
|
self.optimal_num_clusters_ = self.clusters_[np.argmin(fs)]
|
|
self.fs_ = fs
|
|
return self
|
|
|
|
def save(self, savedir):
|
|
"""Saves the estimation results of the optimal # of clusters.
|
|
|
|
Parameters
|
|
----------
|
|
savedir : string
|
|
Path to the directory to save the results in.
|
|
"""
|
|
super(DetK, self).save(savedir)
|
|
|
|
# Plot the evaluation function
|
|
fig = plt.figure()
|
|
plt.plot(self.clusters_, self.fs_, '--o')
|
|
plt.xlabel("Number of clusters (K)")
|
|
plt.ylabel("Evaluation function (F_k)")
|
|
plt.title("Evaluation Function vs. Number of Clusters")
|
|
fig.canvas.set_window_title(os.path.basename(savedir))
|
|
savepath = os.path.join(savedir, self.name_ + "_eval_function.pdf")
|
|
plt.savefig(savepath, bbox_inches="tight")
|
|
plt.close()
|
|
|
|
|
|
class Silhouette(KSelection):
|
|
"""Det:
|
|
|
|
Approximates the optimal number of clusters (K).
|
|
|
|
|
|
References
|
|
----------
|
|
http://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html
|
|
|
|
|
|
Attributes
|
|
----------
|
|
optimal_num_clusters_ : int
|
|
An estimation of the optimal number of clusters K for
|
|
KMeans models fit to X
|
|
|
|
clusters_ : array, [n_clusters]
|
|
The sizes of the clusters
|
|
|
|
name_ : string
|
|
The name of this technique
|
|
|
|
Score_ : array, [n_clusters]
|
|
The mean Silhouette Coefficient for each cluster size K
|
|
"""
|
|
|
|
# short for Silhouette score
|
|
NAME_ = "s-score"
|
|
|
|
def __init__(self):
|
|
super(Silhouette, self).__init__()
|
|
self.scores_ = None
|
|
|
|
@property
|
|
def name_(self):
|
|
return Silhouette.NAME_
|
|
|
|
def _reset(self):
|
|
"""Resets all attributes (erases the model)"""
|
|
super(Silhouette, self)._reset()
|
|
self.scores_ = None
|
|
|
|
def fit(self, X, cluster_map):
|
|
"""Estimates the optimal number of clusters (K) for a
|
|
KMeans model trained on X.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like, shape (n_samples, n_features)
|
|
Training data.
|
|
|
|
cluster_map_ : dict
|
|
A dictionary mapping each cluster size (K) to the KMeans
|
|
model fitted to X with K clusters
|
|
|
|
Returns
|
|
-------
|
|
self
|
|
"""
|
|
self._reset()
|
|
n_clusters = len(cluster_map)
|
|
# scores = np.empty(n_clusters)
|
|
scores = np.zeros(n_clusters)
|
|
for i, (K, model) \
|
|
in enumerate(sorted(cluster_map.items())):
|
|
if K <= 1: # K >= 2
|
|
continue
|
|
scores[i] = silhouette_score(X, model.cluster_labels_)
|
|
|
|
self.clusters_ = np.array(sorted(cluster_map.keys()))
|
|
self.optimal_num_clusters_ = self.clusters_[np.argmax(scores)]
|
|
self.scores_ = scores
|
|
return self
|
|
|
|
def save(self, savedir):
|
|
"""Saves the estimation results of the optimal # of clusters.
|
|
|
|
Parameters
|
|
----------
|
|
savedir : string
|
|
Path to the directory to save the results in.
|
|
"""
|
|
super(Silhouette, self).save(savedir)
|
|
|
|
# Plot the evaluation function
|
|
fig = plt.figure()
|
|
plt.plot(self.clusters_, self.scores_, '--o')
|
|
plt.xlabel("Number of clusters (K)")
|
|
plt.ylabel("Silhouette scores")
|
|
plt.title("Silhouette Scores vs. Number of Clusters")
|
|
fig.canvas.set_window_title(os.path.basename(savedir))
|
|
savepath = os.path.join(savedir, self.name_ + "_eval_function.pdf")
|
|
plt.savefig(savepath, bbox_inches="tight")
|
|
plt.close()
|
|
|
|
|
|
def create_kselection_model(model_name):
|
|
"""Constructs the KSelection model object with the given name
|
|
|
|
Parameters
|
|
----------
|
|
model_name : string
|
|
Name of the KSelection model.
|
|
One of ['gap-statistic', 'det-k', 's-score']
|
|
|
|
|
|
Returns
|
|
-------
|
|
The constructed model object
|
|
"""
|
|
kselection_map = {
|
|
DetK.NAME_: DetK,
|
|
GapStatistic.NAME_: GapStatistic,
|
|
Silhouette.NAME_: Silhouette
|
|
}
|
|
if model_name not in kselection_map:
|
|
raise Exception("KSelection model {} not supported!".format(model_name))
|
|
else:
|
|
return kselection_map[model_name]()
|