# ottertune/server/analysis/cluster.py

#
# OtterTune - cluster.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Jul 4, 2016

@author: dva
'''
from abc import ABCMeta, abstractproperty
from collections import OrderedDict

import os
import json
import copy
import numpy as np
import matplotlib.pyplot as plt

from scipy.spatial.distance import cdist
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans as SklearnKMeans
from celery.utils.log import get_task_logger

from .base import ModelBase

# Module-level logger created through Celery's task-logger helper; the
# classes below use it to report informational messages.
LOGGER = get_task_logger(__name__)


class KMeans(ModelBase):
    """
    KMeans:

    Fits an Sklearn KMeans model to X and records, for every cluster, the
    distance of each member sample to the cluster's center.


    See also
    --------
    http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html


    Attributes
    ----------
    model_ : sklearn.cluster.KMeans or None
             The underlying estimator (None until `fit` has been called)

    n_clusters_ : int
                  The number of clusters, K

    cluster_inertia_ : float
                       Sum of squared distances of samples to their closest cluster center

    cluster_labels_ : array, [n_samples]
                      Index of the cluster each sample in X was assigned to

    cluster_centers_ : array, [n_clusters, n_features]
                       Coordinates of cluster centers

    sample_labels_ : array, [n_samples]
                     Labels for each of the samples in X

    sample_distances_ : OrderedDict
                        Maps each cluster index to a dict holding the keys
                        "sample_labels" and "distances": the labels of the
                        cluster's members and their Euclidean distances to
                        the cluster's center, both sorted by ascending distance


    Constants
    ---------
    SAMPLE_CUTOFF_ : int
                     If n_samples > SAMPLE_CUTOFF_ then sample distances
                     are NOT recorded
                     (NOTE: no method of this class currently reads it)
    """

    SAMPLE_CUTOFF_ = 1000

    def __init__(self):
        self.model_ = None
        self.n_clusters_ = None
        self.sample_labels_ = None
        self.sample_distances_ = None

    @property
    def cluster_inertia_(self):
        # Sum of squared distances of samples to their closest cluster center
        return None if self.model_ is None else \
            self.model_.inertia_

    @property
    def cluster_labels_(self):
        # Cluster membership labels for each point (returned as a copy)
        return None if self.model_ is None else \
            copy.deepcopy(self.model_.labels_)

    @property
    def cluster_centers_(self):
        # Coordinates of the cluster centers (returned as a copy)
        return None if self.model_ is None else \
            copy.deepcopy(self.model_.cluster_centers_)

    def _reset(self):
        """Resets all attributes (erases the model)"""
        self.model_ = None
        self.n_clusters_ = None
        self.sample_labels_ = None
        self.sample_distances_ = None

    def fit(self, X, K, sample_labels=None, estimator_params=None):
        """Fits a Sklearn KMeans model to X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        K : int
            The number of clusters.

        sample_labels : array-like, shape (n_samples), optional
                        Labels for each of the samples in X. Defaults to
                        "sample_0", "sample_1", ...

        estimator_params : dict, optional
                           The parameters to pass to the KMeans estimators.


        Returns
        -------
        self, or None if at least one of the K clusters has no members
        """
        self._reset()
        # Note: previously set n_init=50
        self.model_ = SklearnKMeans(K)
        if estimator_params is not None:
            assert isinstance(estimator_params, dict)
            self.model_.set_params(**estimator_params)

        # Compute Kmeans model
        self.model_.fit(X)
        if sample_labels is None:
            sample_labels = ["sample_{}".format(i) for i in range(X.shape[0])]
        assert len(sample_labels) == X.shape[0]
        self.sample_labels_ = np.array(sample_labels)
        self.n_clusters_ = K

        # The properties deep-copy on every access, so read them once here
        # instead of once (or twice) per cluster inside the loop.
        assignments = self.cluster_labels_
        centers = self.cluster_centers_

        # Record sample label/distance from its cluster center
        self.sample_distances_ = OrderedDict()
        for cluster_label in range(self.n_clusters_):
            is_member = assignments == cluster_label
            member_rows = X[is_member, :]

            # "All clusters must have at least 1 member!"
            if member_rows.shape[0] == 0:
                return None

            member_labels = self.sample_labels_[is_member]
            centroid = np.expand_dims(centers[cluster_label], axis=0)

            # Distance between every member row and the cluster's center,
            # computed in a single call (result has shape (n_members, 1))
            dists = cdist(member_rows, centroid, "euclidean")[:, 0]

            # Sort the distances/labels in ascending order
            sort_order = np.argsort(dists)
            self.sample_distances_[cluster_label] = {
                "sample_labels": member_labels[sort_order],
                "distances": dists[sort_order],
            }
        return self

    def get_closest_samples(self):
        """Returns a list of the labels of the samples that are located closest
           to their cluster's center.


        Returns
        ----------
        closest_samples : list
                  A list of the sample labels that are located the closest to
                  their cluster's center.

        Raises
        ------
        Exception
            If no model has been fit yet.
        """
        if self.sample_distances_ is None:
            raise Exception("No model has been fit yet!")

        return [samples['sample_labels'][0]
                for samples in self.sample_distances_.values()]

    def get_memberships(self):
        """Returns the memberships in each cluster.


        Returns
        -------
        memberships : string
                      A JSON document mapping each cluster index to a mapping
                      of member sample label -> distance to the cluster center.

        Raises
        ------
        Exception
            If no model has been fit yet.
        """
        if self.sample_distances_ is None:
            raise Exception("No model has been fit yet!")

        memberships = OrderedDict()
        for cluster_label, samples in self.sample_distances_.items():
            # tolist() converts NumPy scalars to built-in Python types so that
            # non-string labels (e.g. integers) are valid JSON keys.
            labels = np.asarray(samples["sample_labels"]).tolist()
            distances = np.asarray(samples["distances"]).tolist()
            memberships[cluster_label] = OrderedDict(zip(labels, distances))
        return json.dumps(memberships, indent=4)


class KMeansClusters(ModelBase):

    """
    KMeansClusters:

    Fits a KMeans model to X for clusters in the range [min_cluster_, max_cluster_].


    Attributes
    ----------
    min_cluster_ : int
                   The minimum cluster size to fit a KMeans model to

    max_cluster_ : int
                   The maximum cluster size to fit a KMeans model to. `fit`
                   lowers it when a larger K produces an empty cluster.

    cluster_map_ : dict
                   A dictionary mapping the cluster size (K) to the KMeans
                   model fitted to X with K clusters

    sample_labels_ : array, [n_samples]
                     Labels for each of the samples in X
    """

    def __init__(self):
        self.min_cluster_ = None
        self.max_cluster_ = None
        self.cluster_map_ = None
        self.sample_labels_ = None

    def _reset(self):
        """Resets all attributes (erases the model)"""
        self.min_cluster_ = None
        self.max_cluster_ = None
        self.cluster_map_ = None
        self.sample_labels_ = None

    def fit(self, X, min_cluster, max_cluster, sample_labels=None, estimator_params=None):
        """Fits a KMeans model to X for each cluster in the range [min_cluster, max_cluster].

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        min_cluster : int
                      The minimum cluster size to fit a KMeans model to.

        max_cluster : int
                      The maximum cluster size to fit a KMeans model to.

        sample_labels : array-like, shape (n_samples), optional
                        Labels for each of the samples in X. Defaults to
                        "sample_0", "sample_1", ... (one per row of X).

        estimator_params : dict, optional
                           The parameters to pass to the KMeans estimators.


        Returns
        -------
        self
        """
        self._reset()
        self.min_cluster_ = min_cluster
        self.max_cluster_ = max_cluster
        self.cluster_map_ = {}
        if sample_labels is None:
            # One label per sample, i.e. per ROW of X (axis 0). KMeans.fit
            # asserts that len(sample_labels) == X.shape[0].
            sample_labels = ["sample_{}".format(i) for i in range(X.shape[0])]
        self.sample_labels_ = sample_labels
        for K in range(self.min_cluster_, self.max_cluster_ + 1):
            model = KMeans().fit(X, K, self.sample_labels_, estimator_params)
            if model is None:
                # KMeans.fit returns None when a cluster is empty: stop here
                # and make the previous K the new maximum.
                assert K > min_cluster, "min_cluster is too large for the model"
                self.max_cluster_ = K - 1
                break
            self.cluster_map_[K] = model

        return self

    def save(self, savedir):
        """Saves the KMeans model results

        Writes an elbow-curve plot ("kmeans_sum_of_squares.pdf") and one
        "memberships_<K>-clusters.json" file per fitted cluster size.

        Parameters
        ----------
        savedir : string
                  Path to the directory to save the results in.

        Raises
        ------
        Exception
            If no models have been fitted yet.
        """
        if self.cluster_map_ is None:
            raise Exception("No models have been fitted yet!")

        cluster_sizes = sorted(self.cluster_map_)
        inertias = [self.cluster_map_[K].cluster_inertia_ for K in cluster_sizes]

        # Save sum of squares plot (elbow curve)
        fig = plt.figure()
        plt.plot(cluster_sizes, inertias, '--o')
        plt.xlabel("Number of clusters (K)")
        plt.ylabel("Within sum of squares W_k")
        plt.title("Within Sum of Squares vs. Number of Clusters")
        # FigureCanvasBase.set_window_title was removed in Matplotlib 3.6;
        # the figure manager offers the same call in old and new releases.
        manager = getattr(fig.canvas, "manager", None)
        if manager is not None:
            manager.set_window_title(os.path.basename(savedir))
        savepath = os.path.join(savedir, "kmeans_sum_of_squares.pdf")
        plt.savefig(savepath, bbox_inches="tight")
        plt.close(fig)

        # save cluster memberships
        for K in range(self.min_cluster_, self.max_cluster_ + 1):
            savepath = os.path.join(savedir,
                                    "memberships_{}-clusters.json".format(K))
            members = self.cluster_map_[K].get_memberships()
            with open(savepath, "w") as f:
                f.write(members)


class KSelection(ModelBase, metaclass=ABCMeta):
    """KSelection:

    Abstract base for techniques that estimate the optimal number of
    clusters (K). Concrete subclasses must provide the `name_` property.


    Attributes
    ----------
    optimal_num_clusters_ : int
                            An estimation of the optimal number of clusters K for
                            a KMeans model fit to X
    clusters_ : array, [n_clusters]
                The sizes of the clusters

    name_ : string
            The name of this technique
    """

    NAME_ = None

    def __init__(self):
        self.optimal_num_clusters_ = self.clusters_ = None

    def _reset(self):
        """Clears the estimation results (erases the model)"""
        self.optimal_num_clusters_ = self.clusters_ = None

    @abstractproperty
    def name_(self):
        """The name of this technique (supplied by each subclass)"""

    def save(self, savedir):
        """Writes the estimated optimal # of clusters to a text file.

        The file is named "<name_>_optimal_num_clusters.txt".

        Parameters
        ----------
        savedir : string
                  Path to the directory to save the results in.

        Raises
        ------
        Exception
            If the optimal number of clusters has not been computed yet.
        """
        if self.optimal_num_clusters_ is None:
            raise Exception("Optimal number of clusters has not been computed!")

        # Save the computed optimal number of clusters
        filename = self.name_ + "_optimal_num_clusters.txt"
        with open(os.path.join(savedir, filename), "w") as out:
            out.write(str(self.optimal_num_clusters_))


class GapStatistic(KSelection):
    """GapStatistic:

    Approximates the optimal number of clusters (K).


    References
    ----------
    https://web.stanford.edu/~hastie/Papers/gap.pdf


    Attributes
    ----------
    optimal_num_clusters_ : int
                            An estimation of the optimal number of clusters K for
                            a KMeans model fit to X

    clusters_ : array, [n_clusters]
                The sizes of the clusters

    name_ : string
            The name of this technique

    log_wks_ : array, [n_clusters]
               The within-dispersion measures of X (log)

    log_wkbs_ : array, [n_clusters]
                The within-dispersion measures of the generated
                reference data sets (log, averaged over the data sets)

    khats_ : array, [n_clusters]
             khats_[0] is 0; for i >= 1, khats_[i] is
             gap(K[i-1]) - (gap(K[i]) - s(K[i])), where K is `clusters_`
    """

    NAME_ = "gap-statistic"

    def __init__(self):
        super(GapStatistic, self).__init__()
        self.log_wks_ = None
        self.log_wkbs_ = None
        self.khats_ = None

    @property
    def name_(self):
        return self.NAME_

    def _reset(self):
        """Resets all attributes (erases the model)"""
        super(GapStatistic, self)._reset()
        self.log_wks_ = None
        self.log_wkbs_ = None
        self.khats_ = None

    def fit(self, X, cluster_map, n_b=50):
        """Estimates the optimal number of clusters (K) for a
           KMeans model trained on X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        cluster_map : dict
                      A dictionary mapping each cluster size (K) to the KMeans
                      model fitted to X with K clusters

        n_b : int
              The number of reference data sets to generate


        Returns
        -------
        self
        """
        self._reset()
        X = np.asarray(X)
        mins, maxs = GapStatistic.bounding_box(X)
        n_clusters = len(cluster_map)
        n_samples, n_features = X.shape[0], X.shape[1]

        # Reference data sets hold uniform *real-valued* samples. Reusing an
        # integer dtype of X would silently truncate them, so only a
        # floating-point dtype of X is kept.
        ref_dtype = X.dtype if np.issubdtype(X.dtype, np.floating) else np.float64

        # Dispersion for real distribution
        log_wks = np.zeros(n_clusters)
        log_wkbs = np.zeros(n_clusters)
        sk = np.zeros(n_clusters)
        for indk, (K, model) in enumerate(sorted(cluster_map.items())):

            # Computes Wk: the within-dispersion of each cluster size (k)
            log_wks[indk] = np.log(model.cluster_inertia_ / (2.0 * K))

            # Create B reference datasets, each drawn uniformly from the
            # bounding box of X
            log_bwkbs = np.zeros(n_b)
            for i in range(n_b):
                Xb = np.empty(X.shape, dtype=ref_dtype)
                for j in range(n_features):
                    Xb[:, j] = np.random.uniform(mins[j], maxs[j], size=n_samples)
                # NOTE(review): KMeans.fit returns None when a cluster ends up
                # empty; that case is not handled here.
                Xb_model = KMeans().fit(Xb, K)
                log_bwkbs[i] = np.log(Xb_model.cluster_inertia_ / (2.0 * K))
            log_wkbs[indk] = sum(log_bwkbs) / n_b
            sk[indk] = np.sqrt(sum((log_bwkbs - log_wkbs[indk]) ** 2) / n_b)
        sk = sk * np.sqrt(1 + 1.0 / n_b)

        khats = np.zeros(n_clusters)
        gaps = log_wkbs - log_wks
        gsks = gaps - sk
        khats[1:] = gaps[0:-1] - gsks[1:]
        self.clusters_ = np.array(sorted(cluster_map.keys()))

        # Pick the first K whose gap is at least the next gap minus its
        # standard-error term
        for i in range(1, n_clusters):
            if gaps[i - 1] >= gsks[i]:
                self.optimal_num_clusters_ = self.clusters_[i - 1]
                break

        if self.optimal_num_clusters_ is None:
            LOGGER.info("GapStatistic NOT found the optimal k, "
                        "use the last(maximum) k instead")
            self.optimal_num_clusters_ = self.clusters_[-1]

        self.log_wks_ = log_wks
        self.log_wkbs_ = log_wkbs
        self.khats_ = khats
        return self

    @staticmethod
    def bounding_box(X):
        """Computes the box that tightly bounds X

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.


        Returns
        -------
        The mins and maxs that make up the bounding box
        (the minimum and maximum of X along axis 0)
        """
        mins = np.min(X, axis=0)
        maxs = np.max(X, axis=0)
        return mins, maxs

    @staticmethod
    def Wk(X, mu, cluster_labels):
        """Computes the within-dispersion of each cluster size (k)

        NOTE: `fit` does not call this helper; it derives the dispersion
        from each model's `cluster_inertia_` instead.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        mu : array-like, shape (n_clusters, n_features)
            Coordinates of cluster centers

        cluster_labels: array-like, shape (n_samples)
                        Labels for each of the samples in X.


        Returns
        -------
        The within-dispersion of each cluster (K): the sum, over every
        sample, of its squared distance to its cluster center divided by 2K
        """
        K = len(mu)
        return sum([np.linalg.norm(mu[i] - x) ** 2 / (2.0 * K)
                    for i in range(K)
                    for x in X[cluster_labels == i]])

    @staticmethod
    def _set_window_title(fig, title):
        """Sets the window title of `fig` through its figure manager.

        FigureCanvasBase.set_window_title was removed in Matplotlib 3.6;
        the manager offers the same call in old and new releases.
        """
        manager = getattr(fig.canvas, "manager", None)
        if manager is not None:
            manager.set_window_title(title)

    def save(self, savedir):
        """Saves the estimation results of the optimal # of clusters.

        Besides the text file written by the base class, this writes
        "<name_>.pdf" (gap per K) and "<name_>_final.pdf" (`khats_` per K).

        Parameters
        ----------
        savedir : string
                  Path to the directory to save the results in.
        """
        super(GapStatistic, self).save(savedir)
        window_title = os.path.basename(savedir)

        # Plot the calculated gap
        gaps = self.log_wkbs_ - self.log_wks_
        fig = plt.figure()
        plt.plot(self.clusters_, gaps, '--o')
        plt.title("Gap vs. Number of Clusters")
        plt.xlabel("Number of clusters (K)")
        plt.ylabel("gap_K")
        GapStatistic._set_window_title(fig, window_title)
        plt.savefig(os.path.join(savedir, self.name_ + ".pdf"), bbox_inches="tight")
        plt.close(fig)

        # Plot the gap statistic
        fig = plt.figure()
        plt.bar(self.clusters_, self.khats_)
        plt.title("Gap Statistic vs. Number of Clusters")
        plt.xlabel("Number of clusters (K)")
        plt.ylabel("gap(K)-(gap(K+1)-s(K+1))")
        GapStatistic._set_window_title(fig, window_title)
        plt.savefig(os.path.join(savedir, self.name_ + "_final.pdf"),
                    bbox_inches="tight")
        plt.close(fig)


class DetK(KSelection):
    """DetK:

    Approximates the optimal number of clusters (K).


    References
    ----------
    https://www.ee.columbia.edu/~dpwe/papers/PhamDN05-kmeans.pdf


    Attributes
    ----------
    optimal_num_clusters_ : int
                            An estimation of the optimal number of clusters K for
                            KMeans models fit to X

    clusters_ : array, [n_clusters]
                The sizes of the clusters

    name_ : string
            The name of this technique

    fs_ : array, [n_clusters]
          The computed evaluation functions F(K) for each cluster size K
    """

    NAME_ = "det-k"

    def __init__(self):
        super(DetK, self).__init__()
        self.fs_ = None

    @property
    def name_(self):
        return DetK.NAME_

    def _reset(self):
        """Resets all attributes (erases the model)"""
        super(DetK, self)._reset()
        self.fs_ = None

    @staticmethod
    def _alpha(K, nd):
        """Computes the weight factor alpha(K, nd) for K >= 2.

        alpha(2, nd) = 1 - 3 / (4 * nd) and, for K > 2,
        alpha(K, nd) = alpha(K-1, nd) + (1 - alpha(K-1, nd)) / 6.
        The recurrence is unrolled from K=2 so the result does not depend
        on which cluster sizes are present in the cluster map.
        """
        alpha = 1 - 3.0 / (4 * nd)
        for _ in range(K - 2):
            alpha = alpha + (1 - alpha) / 6.0
        return alpha

    def fit(self, X, cluster_map):
        """Estimates the optimal number of clusters (K) for a
           KMeans model trained on X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        cluster_map : dict
                      A dictionary mapping each cluster size (K) to the KMeans
                      model fitted to X with K clusters


        Returns
        -------
        self
        """
        self._reset()
        n_clusters = len(cluster_map)
        nd = X.shape[1]
        fs = np.empty(n_clusters)
        sks = np.empty(n_clusters)
        # Visit the cluster sizes in ascending order
        for i, (K, model) \
                in enumerate(sorted(cluster_map.items())):
            sks[i] = model.cluster_inertia_

            if K == 1 or i == 0:
                # F(1) is 1 by definition. The smallest K in the map has no
                # preceding entry to compare against, so it also gets the
                # neutral value instead of reading an unset slot of `sks`.
                fs[i] = 1
            elif sks[i - 1] == 0:
                fs[i] = 1
            else:
                # sks[i - 1] is the inertia of the previous entry in sorted
                # order; it equals S(K-1) when the cluster sizes in the map
                # are consecutive.
                fs[i] = sks[i] / (DetK._alpha(K, nd) * sks[i - 1])
        self.clusters_ = np.array(sorted(cluster_map.keys()))
        # The K with the smallest evaluation function is the estimate
        self.optimal_num_clusters_ = self.clusters_[np.argmin(fs)]
        self.fs_ = fs
        return self

    def save(self, savedir):
        """Saves the estimation results of the optimal # of clusters.

        Besides the text file written by the base class, this writes the
        plot "<name_>_eval_function.pdf".

        Parameters
        ----------
        savedir : string
                  Path to the directory to save the results in.
        """
        super(DetK, self).save(savedir)

        # Plot the evaluation function
        fig = plt.figure()
        plt.plot(self.clusters_, self.fs_, '--o')
        plt.xlabel("Number of clusters (K)")
        plt.ylabel("Evaluation function (F_k)")
        plt.title("Evaluation Function vs. Number of Clusters")
        # FigureCanvasBase.set_window_title was removed in Matplotlib 3.6;
        # the figure manager offers the same call in old and new releases.
        manager = getattr(fig.canvas, "manager", None)
        if manager is not None:
            manager.set_window_title(os.path.basename(savedir))
        savepath = os.path.join(savedir, self.name_ + "_eval_function.pdf")
        plt.savefig(savepath, bbox_inches="tight")
        plt.close(fig)


class Silhouette(KSelection):
    """Silhouette:

    Approximates the optimal number of clusters (K) as the cluster size
    with the highest mean Silhouette Coefficient.


    References
    ----------
    http://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html


    Attributes
    ----------
    optimal_num_clusters_ : int
                            An estimation of the optimal number of clusters K for
                            KMeans models fit to X

    clusters_ : array, [n_clusters]
                The sizes of the clusters

    name_ : string
            The name of this technique

    scores_ : array, [n_clusters]
              The mean Silhouette Coefficient for each cluster size K
              (left at 0 for K <= 1, which is not scored)
    """

    # short for Silhouette score
    NAME_ = "s-score"

    def __init__(self):
        super(Silhouette, self).__init__()
        self.scores_ = None

    @property
    def name_(self):
        return Silhouette.NAME_

    def _reset(self):
        """Resets all attributes (erases the model)"""
        super(Silhouette, self)._reset()
        self.scores_ = None

    def fit(self, X, cluster_map):
        """Estimates the optimal number of clusters (K) for a
           KMeans model trained on X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        cluster_map : dict
                      A dictionary mapping each cluster size (K) to the KMeans
                      model fitted to X with K clusters

        Returns
        -------
        self
        """
        self._reset()
        n_clusters = len(cluster_map)
        # Zero-initialised so that skipped entries (K <= 1) hold a defined score
        scores = np.zeros(n_clusters)
        for i, (K, model) \
                in enumerate(sorted(cluster_map.items())):
            if K <= 1:  # only K >= 2 is scored
                continue
            scores[i] = silhouette_score(X, model.cluster_labels_)

        self.clusters_ = np.array(sorted(cluster_map.keys()))
        self.optimal_num_clusters_ = self.clusters_[np.argmax(scores)]
        self.scores_ = scores
        return self

    def save(self, savedir):
        """Saves the estimation results of the optimal # of clusters.

        Besides the text file written by the base class, this writes the
        plot "<name_>_eval_function.pdf".

        Parameters
        ----------
        savedir : string
                  Path to the directory to save the results in.
        """
        super(Silhouette, self).save(savedir)

        # Plot the evaluation function
        fig = plt.figure()
        plt.plot(self.clusters_, self.scores_, '--o')
        plt.xlabel("Number of clusters (K)")
        plt.ylabel("Silhouette scores")
        plt.title("Silhouette Scores vs. Number of Clusters")
        # FigureCanvasBase.set_window_title was removed in Matplotlib 3.6;
        # the figure manager offers the same call in old and new releases.
        manager = getattr(fig.canvas, "manager", None)
        if manager is not None:
            manager.set_window_title(os.path.basename(savedir))
        savepath = os.path.join(savedir, self.name_ + "_eval_function.pdf")
        plt.savefig(savepath, bbox_inches="tight")
        plt.close(fig)


def create_kselection_model(model_name):
    """Builds a new KSelection model instance from its registered name.

    Parameters
    ----------
    model_name : string
                 Name of the KSelection model.
                 One of ['gap-statistic', 'det-k', 's-score']


    Returns
    -------
    The constructed model object

    Raises
    ------
    Exception
        If `model_name` does not match any known KSelection model.
    """
    # Each technique registers itself under its NAME_ constant
    registry = {technique.NAME_: technique
                for technique in (DetK, GapStatistic, Silhouette)}
    model_class = registry.get(model_name)
    if model_class is None:
        raise Exception("KSelection model {} not supported!".format(model_name))
    return model_class()