Initial commit with BSL

This commit is contained in:
Andy Pavlo
2019-08-23 11:47:19 -04:00
commit 3e564ce922
286 changed files with 177642 additions and 0 deletions

View File

@@ -0,0 +1,5 @@
#
# OtterTune - __init__.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#

19
server/analysis/base.py Normal file
View File

@@ -0,0 +1,19 @@
#
# OtterTune - base.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Oct 25, 2017
@author: dva
'''
from abc import ABCMeta, abstractmethod
class ModelBase(object, metaclass=ABCMeta):
    """Abstract base class for all analysis models.

    Concrete models implement ``_reset`` to erase any fitted state so the
    same instance can be re-fit from scratch.
    """

    @abstractmethod
    def _reset(self):
        """Erase all fitted state from the model."""

793
server/analysis/cluster.py Normal file
View File

@@ -0,0 +1,793 @@
#
# OtterTune - cluster.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Jul 4, 2016
@author: dva
'''
from abc import ABCMeta, abstractproperty
from collections import OrderedDict
import os
import json
import copy
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans as SklearnKMeans
from celery.utils.log import get_task_logger
from .base import ModelBase
# Log debug messages
LOGGER = get_task_logger(__name__)
class KMeans(ModelBase):
    """
    KMeans:

    Fits an Sklearn KMeans model to X.

    See also
    --------
    http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

    Attributes
    ----------
    n_clusters_ : int
        The number of clusters, K
    cluster_inertia_ : float
        Sum of squared distances of samples to their closest cluster center
    cluster_labels_ : array, [n_clusters_]
        Labels indicating the membership of each point
    cluster_centers_ : array, [n_clusters, n_features]
        Coordinates of cluster centers
    sample_labels_ : array, [n_samples]
        Labels for each of the samples in X
    sample_distances_ : array, [n_samples]
        The distance between each sample point and its cluster's center

    Constants
    ---------
    SAMPLE_CUTOFF_ : int
        If n_samples > SAMPLE_CUTOFF_ then sample distances
        are NOT recorded
    """

    # NOTE(review): SAMPLE_CUTOFF_ is documented above but never referenced
    # anywhere in this class -- confirm whether the cutoff was meant to be
    # enforced in fit().
    SAMPLE_CUTOFF_ = 1000

    def __init__(self):
        # model_ holds the fitted sklearn.cluster.KMeans instance
        self.model_ = None
        self.n_clusters_ = None
        self.sample_labels_ = None
        self.sample_distances_ = None

    @property
    def cluster_inertia_(self):
        # Sum of squared distances of samples to their closest cluster center
        return None if self.model_ is None else \
            self.model_.inertia_

    @property
    def cluster_labels_(self):
        # Cluster membership labels for each point (deep-copied so callers
        # cannot mutate the fitted model's internal state)
        return None if self.model_ is None else \
            copy.deepcopy(self.model_.labels_)

    @property
    def cluster_centers_(self):
        # Coordinates of the cluster centers (deep-copied, see above)
        return None if self.model_ is None else \
            copy.deepcopy(self.model_.cluster_centers_)

    def _reset(self):
        """Resets all attributes (erases the model)"""
        self.model_ = None
        self.n_clusters_ = None
        self.sample_labels_ = None
        self.sample_distances_ = None

    def fit(self, X, K, sample_labels=None, estimator_params=None):
        """Fits a Sklearn KMeans model to X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.
        K : int
            The number of clusters.
        sample_labels : array-like, shape (n_samples), optional
            Labels for each of the samples in X.
        estimator_params : dict, optional
            The parameters to pass to the KMeans estimators.

        Returns
        -------
        self, or None if any resulting cluster is empty (callers treat
        None as "K is too large for this data").
        """
        self._reset()
        # Note: previously set n_init=50
        self.model_ = SklearnKMeans(K)
        if estimator_params is not None:
            assert isinstance(estimator_params, dict)
            self.model_.set_params(**estimator_params)

        # Compute Kmeans model
        self.model_.fit(X)
        if sample_labels is None:
            # Default labels: one per row (sample) of X
            sample_labels = ["sample_{}".format(i) for i in range(X.shape[0])]
        assert len(sample_labels) == X.shape[0]
        self.sample_labels_ = np.array(sample_labels)
        self.n_clusters_ = K

        # Record each sample's label and its distance from its cluster center
        self.sample_distances_ = OrderedDict()
        for cluster_label in range(self.n_clusters_):
            assert cluster_label not in self.sample_distances_
            member_rows = X[self.cluster_labels_ == cluster_label, :]
            member_labels = self.sample_labels_[self.cluster_labels_ == cluster_label]
            centroid = np.expand_dims(self.cluster_centers_[cluster_label], axis=0)

            # "All clusters must have at least 1 member!"
            # An empty cluster invalidates this choice of K; signal by
            # returning None instead of raising.
            if member_rows.shape[0] == 0:
                return None

            # Calculate the Euclidean distance between each member row and
            # the current cluster's centroid
            dists = np.empty(member_rows.shape[0])
            dist_labels = []
            for j, (row, label) in enumerate(zip(member_rows, member_labels)):
                dists[j] = cdist(np.expand_dims(row, axis=0), centroid, "euclidean").squeeze()
                dist_labels.append(label)

            # Sort the distances/labels in ascending order
            sort_order = np.argsort(dists)
            dists = dists[sort_order]
            dist_labels = np.array(dist_labels)[sort_order]
            self.sample_distances_[cluster_label] = {
                "sample_labels": dist_labels,
                "distances": dists,
            }
        return self

    def get_closest_samples(self):
        """Returns a list of the labels of the samples that are located closest
        to their cluster's center.

        Returns
        ----------
        closest_samples : list
            A list of the sample labels that are located the closest to
            their cluster's center.
        """
        if self.sample_distances_ is None:
            raise Exception("No model has been fit yet!")
        # Distances are stored sorted ascending (see fit), so index 0 is
        # the closest sample in each cluster.
        return [samples['sample_labels'][0] for samples in list(self.sample_distances_.values())]

    def get_memberships(self):
        '''
        Return the memberships in each cluster as a JSON string mapping
        each cluster label to an ordered {sample_label: distance} mapping
        (distances ascending).
        '''
        memberships = OrderedDict()
        for cluster_label, samples in list(self.sample_distances_.items()):
            memberships[cluster_label] = OrderedDict(
                [(l, d) for l, d in zip(samples["sample_labels"], samples["distances"])])
        return json.dumps(memberships, indent=4)
class KMeansClusters(ModelBase):
    """
    KMeansClusters:

    Fits a KMeans model to X for clusters in the range [min_cluster_, max_cluster_].

    Attributes
    ----------
    min_cluster_ : int
        The minimum cluster size to fit a KMeans model to
    max_cluster_ : int
        The maximum cluster size to fit a KMeans model to
    cluster_map_ : dict
        A dictionary mapping the cluster size (K) to the KMeans
        model fitted to X with K clusters
    sample_labels_ : array, [n_samples]
        Labels for each of the samples in X
    """

    def __init__(self):
        self.min_cluster_ = None
        self.max_cluster_ = None
        self.cluster_map_ = None
        self.sample_labels_ = None

    def _reset(self):
        """Resets all attributes (erases the model)"""
        self.min_cluster_ = None
        self.max_cluster_ = None
        self.cluster_map_ = None
        self.sample_labels_ = None

    def fit(self, X, min_cluster, max_cluster, sample_labels=None, estimator_params=None):
        """Fits a KMeans model to X for each cluster in the range [min_cluster, max_cluster].

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.
        min_cluster : int
            The minimum cluster size to fit a KMeans model to.
        max_cluster : int
            The maximum cluster size to fit a KMeans model to.
        sample_labels : array-like, shape (n_samples), optional
            Labels for each of the samples in X.
        estimator_params : dict, optional
            The parameters to pass to the KMeans estimators.

        Returns
        -------
        self
        """
        self._reset()
        self.min_cluster_ = min_cluster
        self.max_cluster_ = max_cluster
        self.cluster_map_ = {}
        if sample_labels is None:
            # BUGFIX: default labels must be one per *sample* (row), not one
            # per feature. KMeans.fit asserts
            # len(sample_labels) == X.shape[0], so the previous
            # range(X.shape[1]) raised an AssertionError for any non-square X.
            sample_labels = ["sample_{}".format(i) for i in range(X.shape[0])]
        self.sample_labels_ = sample_labels
        for K in range(self.min_cluster_, self.max_cluster_ + 1):
            tmp = KMeans().fit(X, K, self.sample_labels_, estimator_params)
            if tmp is None:  # Set maximum cluster
                # KMeans.fit returns None when a cluster came back empty,
                # i.e. K is already too large for this data set; cap the
                # range and stop.
                assert K > min_cluster, "min_cluster is too large for the model"
                self.max_cluster_ = K - 1
                break
            else:
                self.cluster_map_[K] = tmp
        return self

    def save(self, savedir):
        """Saves the KMeans model results

        Parameters
        ----------
        savedir : string
            Path to the directory to save the results in.
        """
        if self.cluster_map_ is None:
            raise Exception("No models have been fitted yet!")

        cluster_map = OrderedDict()
        inertias = []
        for K, model in sorted(self.cluster_map_.items()):
            cluster_map[K] = {
                "cluster_inertia": model.cluster_inertia_,
                "cluster_labels": model.cluster_labels_,
                "cluster_centers": model.cluster_centers_,
            }
            inertias.append(model.cluster_inertia_)

        # Save sum of squares plot (elbow curve)
        fig = plt.figure()
        plt.plot(list(cluster_map.keys()), inertias, '--o')
        plt.xlabel("Number of clusters (K)")
        plt.ylabel("Within sum of squares W_k")
        plt.title("Within Sum of Squares vs. Number of Clusters")
        fig.canvas.set_window_title(os.path.basename(savedir))
        savepath = os.path.join(savedir, "kmeans_sum_of_squares.pdf")
        plt.savefig(savepath, bbox_inches="tight")
        plt.close()

        # Save cluster memberships, one JSON file per cluster size
        for K in range(self.min_cluster_, self.max_cluster_ + 1):
            savepath = os.path.join(savedir,
                                    "memberships_{}-clusters.json".format(K))
            members = self.cluster_map_[K].get_memberships()
            with open(savepath, "w") as f:
                f.write(members)
class KSelection(ModelBase, metaclass=ABCMeta):
    """KSelection:

    Abstract base for techniques that approximate the optimal number of
    clusters (K) from a map of fitted KMeans models.

    Attributes
    ----------
    optimal_num_clusters_ : int
        An estimation of the optimal number of clusters K for
        a KMeans model fit to X
    clusters_ : array, [n_clusters]
        The sizes of the clusters
    name_ : string
        The name of this technique
    """

    # Subclasses override this with the technique's short identifier.
    NAME_ = None

    def __init__(self):
        self.clusters_ = None
        self.optimal_num_clusters_ = None

    def _reset(self):
        """Resets all attributes (erases the model)"""
        self.clusters_ = None
        self.optimal_num_clusters_ = None

    @abstractproperty
    def name_(self):
        pass

    def save(self, savedir):
        """Saves the estimation of the optimal # of clusters.

        Parameters
        ----------
        savedir : string
            Path to the directory to save the results in.
        """
        if self.optimal_num_clusters_ is None:
            raise Exception("Optimal number of clusters has not been computed!")
        # One small text file per technique, named after the technique.
        filename = self.name_ + "_optimal_num_clusters.txt"
        with open(os.path.join(savedir, filename), "w") as f:
            f.write(str(self.optimal_num_clusters_))
class GapStatistic(KSelection):
    """GapStatistic:

    Approximates the optimal number of clusters (K).

    References
    ----------
    https://web.stanford.edu/~hastie/Papers/gap.pdf

    Attributes
    ----------
    optimal_num_clusters_ : int
        An estimation of the optimal number of clusters K for
        a KMeans model fit to X
    clusters_ : array, [n_clusters]
        The sizes of the clusters
    name_ : string
        The name of this technique
    log_wks_ : array, [n_clusters]
        The within-dispersion measures of X (log)
    log_wkbs_ : array, [n_clusters]
        The within-dispersion measures of the generated
        reference data sets
    khats_ : array, [n_clusters]
        The gap-statistic for each cluster
    """

    NAME_ = "gap-statistic"

    def __init__(self):
        super(GapStatistic, self).__init__()
        self.log_wks_ = None
        self.log_wkbs_ = None
        self.khats_ = None

    @property
    def name_(self):
        return self.NAME_

    def _reset(self):
        """Resets all attributes (erases the model)"""
        super(GapStatistic, self)._reset()
        self.log_wks_ = None
        self.log_wkbs_ = None
        self.khats_ = None

    def fit(self, X, cluster_map, n_b=50):
        """Estimates the optimal number of clusters (K) for a
        KMeans model trained on X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.
        cluster_map : dict
            A dictionary mapping each cluster size (K) to the KMeans
            model fitted to X with K clusters
        n_b : int
            The number of reference data sets to generate

        Returns
        -------
        self
        """
        self._reset()
        mins, maxs = GapStatistic.bounding_box(X)
        n_clusters = len(cluster_map)

        # Dispersion for real distribution
        log_wks = np.zeros(n_clusters)
        log_wkbs = np.zeros(n_clusters)
        sk = np.zeros(n_clusters)
        for indk, (K, model) in enumerate(sorted(cluster_map.items())):
            # Computes Wk: the within-dispersion of each cluster size (k),
            # derived directly from the fitted model's inertia
            log_wks[indk] = np.log(model.cluster_inertia_ / (2.0 * K))

            # Create B reference datasets drawn uniformly from X's bounding
            # box and compute the same log-dispersion for each
            log_bwkbs = np.zeros(n_b)
            for i in range(n_b):
                Xb = np.empty_like(X)
                for j in range(X.shape[1]):
                    Xb[:, j] = np.random.uniform(mins[j], maxs[j], size=X.shape[0])
                # NOTE(review): KMeans.fit returns None when any cluster is
                # empty, which would raise AttributeError on the next line --
                # confirm empty clusters cannot occur for the uniform
                # reference data.
                Xb_model = KMeans().fit(Xb, K)
                log_bwkbs[i] = np.log(Xb_model.cluster_inertia_ / (2.0 * K))
            log_wkbs[indk] = sum(log_bwkbs) / n_b
            # Standard deviation of the reference dispersions
            sk[indk] = np.sqrt(sum((log_bwkbs - log_wkbs[indk]) ** 2) / n_b)
        # Simulation-error correction factor from the paper
        sk = sk * np.sqrt(1 + 1.0 / n_b)

        khats = np.zeros(n_clusters)
        gaps = log_wkbs - log_wks
        gsks = gaps - sk
        # khats[i] = gap(K_{i-1}) - (gap(K_i) - s(K_i)); khats[0] stays 0
        khats[1:] = gaps[0:-1] - gsks[1:]
        self.clusters_ = np.array(sorted(cluster_map.keys()))
        # Choose the smallest K such that gap(K) >= gap(K+1) - s(K+1)
        for i in range(1, n_clusters):
            if gaps[i - 1] >= gsks[i]:
                self.optimal_num_clusters_ = self.clusters_[i - 1]
                break

        if self.optimal_num_clusters_ is None:
            # No K satisfied the criterion; fall back to the largest K
            LOGGER.info("GapStatistic NOT found the optimal k, \
                        use the last(maximum) k instead ")
            self.optimal_num_clusters_ = self.clusters_[-1]

        self.log_wks_ = log_wks
        self.log_wkbs_ = log_wkbs
        self.khats_ = khats
        return self

    @staticmethod
    def bounding_box(X):
        """Computes the box that tightly bounds X

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        Returns
        -------
        The mins and maxs that make up the bounding box
        """
        mins = np.min(X, axis=0)
        maxs = np.max(X, axis=0)
        return mins, maxs

    @staticmethod
    def Wk(X, mu, cluster_labels):
        """Computes the within-dispersion of each cluster size (k)

        NOTE(review): unused by fit(), which derives log(Wk) from
        model.cluster_inertia_ instead.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.
        mu : array-like, shape (n_clusters, n_features)
            Coordinates of cluster centers
        cluster_labels: array-like, shape (n_samples)
            Labels for each of the samples in X.

        Returns
        -------
        The within-dispersion of each cluster (K)
        """
        K = len(mu)
        return sum([np.linalg.norm(mu[i] - x) ** 2 / (2.0 * K)
                    for i in range(K)
                    for x in X[cluster_labels == i]])

    def save(self, savedir):
        """Saves the estimation results of the optimal # of clusters.

        Parameters
        ----------
        savedir : string
            Path to the directory to save the results in.
        """
        super(GapStatistic, self).save(savedir)

        # Plot the calculated gap
        gaps = self.log_wkbs_ - self.log_wks_
        fig = plt.figure()
        plt.plot(self.clusters_, gaps, '--o')
        plt.title("Gap vs. Number of Clusters")
        plt.xlabel("Number of clusters (K)")
        plt.ylabel("gap_K")
        fig.canvas.set_window_title(os.path.basename(savedir))
        plt.savefig(os.path.join(savedir, self.name_ + ".pdf"), bbox_inches="tight")
        plt.close()

        # Plot the gap statistic
        fig = plt.figure()
        plt.bar(self.clusters_, self.khats_)
        plt.title("Gap Statistic vs. Number of Clusters")
        plt.xlabel("Number of clusters (K)")
        plt.ylabel("gap(K)-(gap(K+1)-s(K+1))")
        fig.canvas.set_window_title(os.path.basename(savedir))
        plt.savefig(os.path.join(savedir, self.name_ + "_final.pdf"),
                    bbox_inches="tight")
        plt.close()
class DetK(KSelection):
    """DetK:

    Approximates the optimal number of clusters (K) using the f(K)
    evaluation function of Pham, Dimov & Nguyen (2005).

    References
    ----------
    https://www.ee.columbia.edu/~dpwe/papers/PhamDN05-kmeans.pdf

    Attributes
    ----------
    optimal_num_clusters_ : int
        An estimation of the optimal number of clusters K for
        KMeans models fit to X
    clusters_ : array, [n_clusters]
        The sizes of the clusters
    name_ : string
        The name of this technique
    fs_ : array, [n_clusters]
        The computed evaluation functions F(K) for each cluster size K
    """

    NAME_ = "det-k"

    def __init__(self):
        super(DetK, self).__init__()
        self.fs_ = None

    @property
    def name_(self):
        return DetK.NAME_

    def _reset(self):
        """Resets all attributes (erases the model)"""
        super(DetK, self)._reset()
        self.fs_ = None

    def fit(self, X, cluster_map):
        """Estimates the optimal number of clusters (K) for a
        KMeans model trained on X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data (only the feature count X.shape[1] is used).
        cluster_map : dict
            A dictionary mapping each cluster size (K) to the KMeans
            model fitted to X with K clusters

        Returns
        -------
        self
        """
        self._reset()
        n_clusters = len(cluster_map)
        nd = X.shape[1]  # number of features (dimensions)
        fs = np.empty(n_clusters)
        sks = np.empty(n_clusters)

        # Precompute the weight factor alpha(K, nd) for every K up to the
        # largest cluster size. (Fixed: alpha was previously computed inside
        # the main loop from alpha[K - 1], which raised a KeyError when the
        # smallest K in cluster_map was > 2 or the K values were not
        # contiguous. Values are identical for the contiguous case.)
        alpha = {}
        max_k = max(cluster_map.keys()) if cluster_map else 0
        for K in range(2, max_k + 1):
            if K == 2:
                alpha[K] = 1 - 3.0 / (4 * nd)
            else:  # K > 2
                alpha[K] = alpha[K - 1] + (1 - alpha[K - 1]) / 6.0

        # Evaluation function f(K): ratio of the current distortion to the
        # alpha-weighted previous one. K values are visited in ascending
        # order so sks[i - 1] is the previous cluster size's distortion.
        for i, (K, model) in enumerate(sorted(cluster_map.items())):
            sks[i] = model.cluster_inertia_
            if K == 1 or i == 0:
                # No previous distortion to compare against.
                # (Fixed: when the smallest K was > 1, the previous version
                # read the uninitialized sks[-1] slot of np.empty here.)
                fs[i] = 1
            elif sks[i - 1] == 0:
                # Previous clustering was perfect; avoid division by zero
                fs[i] = 1
            else:
                fs[i] = sks[i] / (alpha[K] * sks[i - 1])
        self.clusters_ = np.array(sorted(cluster_map.keys()))
        # The K that minimizes f(K) gives the best trade-off
        self.optimal_num_clusters_ = self.clusters_[np.argmin(fs)]
        self.fs_ = fs
        return self

    def save(self, savedir):
        """Saves the estimation results of the optimal # of clusters.

        Parameters
        ----------
        savedir : string
            Path to the directory to save the results in.
        """
        super(DetK, self).save(savedir)

        # Plot the evaluation function
        fig = plt.figure()
        plt.plot(self.clusters_, self.fs_, '--o')
        plt.xlabel("Number of clusters (K)")
        plt.ylabel("Evaluation function (F_k)")
        plt.title("Evaluation Function vs. Number of Clusters")
        fig.canvas.set_window_title(os.path.basename(savedir))
        savepath = os.path.join(savedir, self.name_ + "_eval_function.pdf")
        plt.savefig(savepath, bbox_inches="tight")
        plt.close()
class Silhouette(KSelection):
    """Silhouette:

    Approximates the optimal number of clusters (K) using the mean
    Silhouette Coefficient.

    References
    ----------
    http://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html

    Attributes
    ----------
    optimal_num_clusters_ : int
        An estimation of the optimal number of clusters K for
        KMeans models fit to X
    clusters_ : array, [n_clusters]
        The sizes of the clusters
    name_ : string
        The name of this technique
    scores_ : array, [n_clusters]
        The mean Silhouette Coefficient for each cluster size K
    """

    # short for Silhouette score
    NAME_ = "s-score"

    def __init__(self):
        super(Silhouette, self).__init__()
        self.scores_ = None

    @property
    def name_(self):
        return Silhouette.NAME_

    def _reset(self):
        """Resets all attributes (erases the model)"""
        super(Silhouette, self)._reset()
        self.scores_ = None

    def fit(self, X, cluster_map):
        """Estimates the optimal number of clusters (K) for a
        KMeans model trained on X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.
        cluster_map : dict
            A dictionary mapping each cluster size (K) to the KMeans
            model fitted to X with K clusters

        Returns
        -------
        self
        """
        self._reset()
        n_clusters = len(cluster_map)
        # scores = np.empty(n_clusters)
        # np.zeros (not np.empty) so entries skipped below (K <= 1) hold a
        # valid score of 0 rather than garbage
        scores = np.zeros(n_clusters)
        for i, (K, model) \
                in enumerate(sorted(cluster_map.items())):
            if K <= 1:  # silhouette_score is only defined for K >= 2
                continue
            scores[i] = silhouette_score(X, model.cluster_labels_)
        self.clusters_ = np.array(sorted(cluster_map.keys()))
        # The K with the highest mean silhouette score wins
        self.optimal_num_clusters_ = self.clusters_[np.argmax(scores)]
        self.scores_ = scores
        return self

    def save(self, savedir):
        """Saves the estimation results of the optimal # of clusters.

        Parameters
        ----------
        savedir : string
            Path to the directory to save the results in.
        """
        super(Silhouette, self).save(savedir)

        # Plot the silhouette scores per cluster size
        fig = plt.figure()
        plt.plot(self.clusters_, self.scores_, '--o')
        plt.xlabel("Number of clusters (K)")
        plt.ylabel("Silhouette scores")
        plt.title("Silhouette Scores vs. Number of Clusters")
        fig.canvas.set_window_title(os.path.basename(savedir))
        savepath = os.path.join(savedir, self.name_ + "_eval_function.pdf")
        plt.savefig(savepath, bbox_inches="tight")
        plt.close()
def create_kselection_model(model_name):
    """Constructs the KSelection model object with the given name

    Parameters
    ----------
    model_name : string
        Name of the KSelection model.
        One of ['gap-statistic', 'det-k', 's-score']

    Returns
    -------
    The constructed model object
    """
    # Registry keyed by each technique's NAME_ constant.
    registry = {cls.NAME_: cls for cls in (DetK, GapStatistic, Silhouette)}
    if model_name in registry:
        return registry[model_name]()
    raise Exception("KSelection model {} not supported!".format(model_name))

View File

@@ -0,0 +1,115 @@
#
# OtterTune - constraints.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Sep 8, 2016
@author: dvanaken
'''
import numpy as np
class ParamConstraintHelper(object):
    """Snaps continuous configuration samples back to the feasible region.

    Dummy-encoded categorical feature groups are forced to valid one-hot
    vectors and binary features are rounded to 0/1, optionally converting
    between the scaler's normalized space and raw space.
    """

    def __init__(self, scaler, encoder=None, binary_vars=None,
                 init_flip_prob=0.3, flip_prob_decay=0.5):
        # scaler: must provide transform(X)/inverse_transform(X)
        # encoder: assumed to expose `n_values` and an `encoder` attribute
        #          holding the underlying one-hot encoder -- TODO confirm
        #          against the project's encoder wrapper class
        # binary_vars: indices of features constrained to {0, 1}
        # init_flip_prob / flip_prob_decay: control how many categorical
        #          features get re-randomized in
        #          randomize_categorical_features()
        if 'inverse_transform' not in dir(scaler):
            raise Exception("Scaler object must provide function inverse_transform(X)")
        if 'transform' not in dir(scaler):
            raise Exception("Scaler object must provide function transform(X)")
        self.scaler_ = scaler
        if encoder is not None and len(encoder.n_values) > 0:
            self.is_dummy_encoded_ = True
            self.encoder_ = encoder.encoder
        else:
            # NOTE(review): self.encoder_ is left unset on this branch; safe
            # only because every read is guarded by is_dummy_encoded_.
            self.is_dummy_encoded_ = False
        self.binary_vars_ = binary_vars
        self.init_flip_prob_ = init_flip_prob
        self.flip_prob_decay_ = flip_prob_decay

    def apply_constraints(self, sample, scaled=True, rescale=True):
        """Returns a copy of `sample` with categorical/binary constraints
        applied.

        scaled : whether `sample` is in the scaler's normalized space.
        rescale : whether to return the result in normalized space.
        """
        conv_sample = self._handle_scaling(sample, scaled)

        if self.is_dummy_encoded_:
            # apply categorical (ie enum var, >=3 values) constraints:
            # keep only the argmax of each dummy-encoded group, producing a
            # valid one-hot vector
            n_values = self.encoder_.n_values_
            cat_start_indices = self.encoder_.feature_indices_
            for i, nvals in enumerate(n_values):
                start_idx = cat_start_indices[i]
                cvals = conv_sample[start_idx: start_idx + nvals]
                cvals = np.array(np.arange(nvals) == np.argmax(cvals), dtype=float)
                assert np.sum(cvals) == 1
                conv_sample[start_idx: start_idx + nvals] = cvals

        # apply binary (0-1) constraints
        if self.binary_vars_ is not None:
            for i in self.binary_vars_:
                # round to closest
                if conv_sample[i] >= 0.5:
                    conv_sample[i] = 1
                else:
                    conv_sample[i] = 0

        conv_sample = self._handle_rescaling(conv_sample, rescale)
        return conv_sample

    def _handle_scaling(self, sample, scaled):
        # Convert `sample` to raw (un-normalized) space. Both branches
        # return a fresh array so later in-place edits do not mutate the
        # caller's data.
        if scaled:
            if sample.ndim == 1:
                sample = sample.reshape(1, -1)
            sample = self.scaler_.inverse_transform(sample).ravel()
        else:
            sample = np.array(sample)
        return sample

    def _handle_rescaling(self, sample, rescale):
        # Optionally map the raw-space sample back into normalized space.
        if rescale:
            if sample.ndim == 1:
                sample = sample.reshape(1, -1)
            return self.scaler_.transform(sample).ravel()
        return sample

    def randomize_categorical_features(self, sample, scaled=True, rescale=True):
        """Randomly reassigns a subset of categorical feature groups to a
        different one-hot value (useful to escape local optima)."""
        # If there are no categorical features, this function is a no-op.
        if not self.is_dummy_encoded_:
            return sample
        n_values = self.encoder_.n_values_
        cat_start_indices = self.encoder_.feature_indices_
        n_cat_feats = len(n_values)

        conv_sample = self._handle_scaling(sample, scaled)
        flips = np.zeros((n_cat_feats,), dtype=bool)

        # Always flip at least one categorical feature
        flips[0] = True

        # Flip the rest with decreasing probability
        p = self.init_flip_prob_
        for i in range(1, n_cat_feats):
            if np.random.rand() <= p:
                flips[i] = True
            p *= self.flip_prob_decay_

        # Shuffle the flip mask so the guaranteed flip does not always land
        # on the same feature
        flip_shuffle_indices = np.random.choice(np.arange(n_cat_feats),
                                                n_cat_feats,
                                                replace=False)
        flips = flips[flip_shuffle_indices]

        for i, nvals in enumerate(n_values):
            if flips[i]:
                start_idx = cat_start_indices[i]
                current_val = conv_sample[start_idx: start_idx + nvals]
                assert np.all(np.logical_or(current_val == 0, current_val == 1)), \
                    "categorical {0}: value not 0/1: {1}".format(i, current_val)

                # Pick uniformly among the values that are NOT currently set
                choices = np.arange(nvals)[current_val != 1]
                assert choices.size == nvals - 1
                r = np.zeros(nvals)
                r[np.random.choice(choices)] = 1
                assert np.sum(r) == 1
                conv_sample[start_idx: start_idx + nvals] = r

        conv_sample = self._handle_rescaling(conv_sample, rescale)
        return conv_sample

View File

@@ -0,0 +1,111 @@
#
# OtterTune - factor_analysis.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Jul 4, 2016
@author: dvanaken
'''
import numpy as np
from sklearn.decomposition import FactorAnalysis as SklearnFactorAnalysis
from .base import ModelBase
class FactorAnalysis(ModelBase):
    """FactorAnalysis (FA):

    Fits an Sklearn FactorAnalysis model to X.

    See also
    --------
    http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.FactorAnalysis.html

    Attributes
    ----------
    model_ : sklearn.decomposition.FactorAnalysis
        The fitted FA model
    components_ : array, [n_components, n_features]
        Components (i.e., factors) with maximum variance
    feature_labels_ : array, [n_features]
        Labels for each of the features in X
    total_variance_ : float
        The total amount of variance explained by the components
    pvars_ : array, [n_components]
        The percentage of the variance explained by each component
    pvars_noise_ : array, [n_components]
        The percentage of the variance explained by each component also
        accounting for noise
    """

    def __init__(self):
        # All attributes start out empty until fit() is called.
        self._reset()

    def _reset(self):
        """Resets all attributes (erases the model)"""
        self.model_ = None
        self.components_ = None
        self.feature_labels_ = None
        self.total_variance_ = None
        self.pvars_ = None
        self.pvars_noise_ = None

    def fit(self, X, feature_labels=None, n_components=None, estimator_params=None):
        """Fits an Sklearn FA model to X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.
        feature_labels : array-like, shape (n_features), optional
            Labels for each of the features in X.
        n_components : int, optional
            The number of factors to keep (forwarded to sklearn).
        estimator_params : dict, optional
            The parameters to pass to Sklearn's FA estimators.

        Returns
        -------
        self
        """
        self._reset()
        if feature_labels is None:
            feature_labels = ["feature_{}".format(i) for i in range(X.shape[1])]
        self.feature_labels_ = feature_labels

        # Build and (optionally) configure the underlying sklearn estimator.
        kwargs = {} if n_components is None else {"n_components": n_components}
        self.model_ = SklearnFactorAnalysis(**kwargs)
        if estimator_params is not None:
            assert isinstance(estimator_params, dict)
            self.model_.set_params(**estimator_params)
        self.model_.fit(X)

        # Keep only components that are nonzero somewhere
        # (n_components x n_features).
        keep = np.any(self.model_.components_ != 0.0, axis=1)
        self.components_ = self.model_.components_[keep]

        # Percentage of variance explained, with and without the noise term.
        sq_norms = np.sum(self.components_ ** 2, axis=1)
        self.total_variance_ = np.sum(sq_norms)
        self.pvars_ = 100 * sq_norms / self.total_variance_
        self.pvars_noise_ = 100 * sq_norms / (self.total_variance_ +
                                              np.sum(self.model_.noise_variance_))
        return self

148
server/analysis/gp.py Normal file
View File

@@ -0,0 +1,148 @@
#
# OtterTune - gp.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Feb 18, 2018
@author: Bohan Zhang
'''
import numpy as np
from scipy.spatial.distance import cdist as ed
from scipy import special
from analysis.gp_tf import GPRResult
# numpy version of Gaussian Process Regression, not using Tensorflow
class GPRNP(object):
    """Gaussian Process Regression implemented with NumPy only (no
    Tensorflow).

    Uses the kernel k(a, b) = magnitude * exp(-||a - b|| / length_scale)
    (note: the Euclidean distance is NOT squared) plus a per-sample ridge
    term on the diagonal.
    """

    def __init__(self, length_scale=1.0, magnitude=1.0, max_train_size=7000,
                 batch_size=3000, check_numerics=True, debug=False):
        # length_scale / magnitude: positive scalar kernel hyperparameters
        # max_train_size: hard cap on the number of training rows
        # batch_size: number of test rows predicted per batch in predict()
        # check_numerics / debug: stored but not referenced anywhere in this
        #   class -- NOTE(review): possibly kept for interface parity with
        #   the Tensorflow GPR; confirm
        assert np.isscalar(length_scale)
        assert np.isscalar(magnitude)
        assert length_scale > 0 and magnitude > 0
        self.length_scale = length_scale
        self.magnitude = magnitude
        self.max_train_size_ = max_train_size
        self.batch_size_ = batch_size
        self.check_numerics = check_numerics
        self.debug = debug
        self.X_train = None
        self.y_train = None
        self.K = None
        self.K_inv = None
        self.y_best = None

    def __repr__(self):
        # Dump all attributes, sorted by name, one per line
        rep = ""
        for k, v in sorted(self.__dict__.items()):
            rep += "{} = {}\n".format(k, v)
        return rep

    def __str__(self):
        return self.__repr__()

    def _reset(self):
        """Erases all trained state."""
        self.X_train = None
        self.y_train = None
        self.K = None
        self.K_inv = None
        self.y_best = None

    def check_X_y(self, X, y):
        """Validates training inputs via sklearn's check_X_y and enforces
        the max_train_size_ cap."""
        from sklearn.utils.validation import check_X_y
        if X.shape[0] > self.max_train_size_:
            raise Exception("X_train size cannot exceed {} ({})"
                            .format(self.max_train_size_, X.shape[0]))
        return check_X_y(X, y, multi_output=True,
                         allow_nd=True, y_numeric=True,
                         estimator="GPRNP")

    def check_fitted(self):
        # Predictions require the training data and kernel from fit()
        if self.X_train is None or self.y_train is None \
                or self.K is None:
            raise Exception("The model must be trained before making predictions!")

    @staticmethod
    def check_array(X):
        """Validates a test-input array via sklearn's check_array."""
        from sklearn.utils.validation import check_array
        return check_array(X, allow_nd=True, estimator="GPRNP")

    @staticmethod
    def check_output(X):
        # Guard against NaN/inf leaking out of the linear algebra
        finite_els = np.isfinite(X)
        if not np.all(finite_els):
            raise Exception("Input contains non-finite values: {}"
                            .format(X[~finite_els]))

    def fit(self, X_train, y_train, ridge=0.01):
        """Trains the GP: stores the data and precomputes K and K^-1.

        Parameters
        ----------
        X_train : array, shape (n_samples, n_features)
        y_train : array, shape (n_samples, n_outputs)
        ridge : scalar or 1-d array of length n_samples
            Ridge term(s) added to the kernel diagonal.

        Returns
        -------
        self
        """
        self._reset()
        X_train, y_train = self.check_X_y(X_train, y_train)
        if X_train.ndim != 2 or y_train.ndim != 2:
            raise Exception("X_train or y_train should have 2 dimensions! X_dim:{}, y_dim:{}"
                            .format(X_train.ndim, y_train.ndim))
        self.X_train = np.float32(X_train)
        self.y_train = np.float32(y_train)
        sample_size = self.X_train.shape[0]
        if np.isscalar(ridge):
            # Broadcast a scalar ridge to one value per training sample
            ridge = np.ones(sample_size) * ridge
        assert isinstance(ridge, np.ndarray)
        assert ridge.ndim == 1
        # Exponential kernel over pairwise Euclidean distances + ridge
        K = self.magnitude * np.exp(-ed(self.X_train, self.X_train) / self.length_scale) \
            + np.diag(ridge)
        # NOTE(review): explicit matrix inversion; a Cholesky solve would be
        # more numerically stable. Kept as-is to preserve behavior.
        K_inv = np.linalg.inv(K)
        self.K = K
        self.K_inv = K_inv
        # Incumbent minimum, used for expected improvement in predict()
        self.y_best = np.min(y_train)
        return self

    def predict(self, X_test):
        """Predicts posterior mean and standard deviation for each test row.

        Returns
        -------
        GPRResult with ypreds (mean) and sigmas (std), each of shape
        (n_test, 1).
        """
        self.check_fitted()
        if X_test.ndim != 2:
            raise Exception("X_test should have 2 dimensions! X_dim:{}"
                            .format(X_test.ndim))
        X_test = np.float32(GPRNP.check_array(X_test))
        test_size = X_test.shape[0]
        arr_offset = 0
        length_scale = self.length_scale
        yhats = np.zeros([test_size, 1])
        sigmas = np.zeros([test_size, 1])
        eips = np.zeros([test_size, 1])
        # Process the test set in batches of batch_size_ rows to bound the
        # size of the kernel blocks
        while arr_offset < test_size:
            if arr_offset + self.batch_size_ > test_size:
                end_offset = test_size
            else:
                end_offset = arr_offset + self.batch_size_
            xt_ = X_test[arr_offset:end_offset]
            # Cross-kernel (train x batch) and batch self-kernel
            K2 = self.magnitude * np.exp(-ed(self.X_train, xt_) / length_scale)
            K3 = self.magnitude * np.exp(-ed(xt_, xt_) / length_scale)
            K2_trans = np.transpose(K2)
            # Posterior mean and standard deviation
            yhat = np.matmul(K2_trans, np.matmul(self.K_inv, self.y_train))
            sigma = np.sqrt(np.diag(K3 - np.matmul(K2_trans, np.matmul(self.K_inv, K2)))) \
                .reshape(xt_.shape[0], 1)
            # Expected improvement over the incumbent minimum y_best
            u = (self.y_best - yhat) / sigma
            phi1 = 0.5 * special.erf(u / np.sqrt(2.0)) + 0.5  # standard normal CDF
            phi2 = (1.0 / np.sqrt(2.0 * np.pi)) * np.exp(np.square(u) * (-0.5))  # standard normal PDF
            eip = sigma * (u * phi1 + phi2)
            yhats[arr_offset:end_offset] = yhat
            sigmas[arr_offset:end_offset] = sigma
            eips[arr_offset:end_offset] = eip
            arr_offset = end_offset
        GPRNP.check_output(yhats)
        GPRNP.check_output(sigmas)
        # NOTE(review): eips is computed above but not returned (GPRResult
        # only carries ypreds and sigmas) -- confirm whether EI was meant to
        # be exposed.
        return GPRResult(yhats, sigmas)

    def get_params(self, deep=True):
        """Returns the model's hyperparameters and trained state
        (sklearn-style)."""
        return {"length_scale": self.length_scale,
                "magnitude": self.magnitude,
                "X_train": self.X_train,
                "y_train": self.y_train,
                "K": self.K,
                "K_inv": self.K_inv}

    def set_params(self, **parameters):
        """Sets attributes from keyword arguments (sklearn-style); returns
        self."""
        for param, val in list(parameters.items()):
            setattr(self, param, val)
        return self

710
server/analysis/gp_tf.py Normal file
View File

@@ -0,0 +1,710 @@
#
# OtterTune - gp_tf.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Aug 18, 2016
@author: Bohan Zhang, Dana Van Aken
'''
import gc
import numpy as np
import tensorflow as tf
from .util import get_analysis_logger
LOG = get_analysis_logger(__name__)
class GPRResult(object):
    """Container for GPR predictions.

    Attributes
    ----------
    ypreds : predicted means
    sigmas : predicted standard deviations
    """

    def __init__(self, ypreds=None, sigmas=None):
        self.sigmas = sigmas
        self.ypreds = ypreds
class GPRGDResult(GPRResult):
    """GPR prediction container extended with gradient-descent minimum info.

    Attributes
    ----------
    minl : minimum loss value found
    minl_conf : configuration achieving the minimum loss
    """

    def __init__(self, ypreds=None, sigmas=None,
                 minl=None, minl_conf=None):
        super(GPRGDResult, self).__init__(ypreds, sigmas)
        self.minl_conf = minl_conf
        self.minl = minl
class GPR(object):
    """Gaussian Process regression built on the TensorFlow 1.x graph API.

    Uses an exponential kernel of Euclidean distance:
        k(x, x') = magnitude * exp(-||x - x'|| / length_scale)
    (see `dist_op` and `K_op` in build_graph). fit() materializes the
    kernel matrix K (with a ridge term on the diagonal), its inverse, and
    xy_ = K^-1 * y; predict() evaluates the GP posterior mean and standard
    deviation in batches of `batch_size_`.
    """

    def __init__(self, length_scale=1.0, magnitude=1.0, max_train_size=7000,
                 batch_size=3000, num_threads=4, check_numerics=True, debug=False):
        # Kernel hyperparameters must be positive scalars.
        assert np.isscalar(length_scale)
        assert np.isscalar(magnitude)
        assert length_scale > 0 and magnitude > 0
        self.length_scale = length_scale
        self.magnitude = magnitude
        self.max_train_size_ = max_train_size  # hard cap enforced in check_X_y
        self.batch_size_ = batch_size          # test points processed per session batch
        self.num_threads_ = num_threads        # intra-op parallelism for tf.Session
        self.check_numerics = check_numerics   # wrap ops in tf.check_numerics when True
        self.debug = debug
        # Fitted state; populated by fit() / _reset().
        self.X_train = None
        self.y_train = None
        self.xy_ = None   # K^-1 * y_train, computed in fit()
        self.K = None     # kernel matrix incl. ridge
        self.K_inv = None
        self.graph = None
        self.vars = None  # placeholder handles, keyed by name
        self.ops = None   # graph ops, keyed by name

    def build_graph(self):
        """Construct the TF graph: distance, kernel, K^-1*y, and posterior ops.

        Placeholders are stashed in self.vars and ops in self.ops; fit() and
        predict() feed/fetch them by key.
        """
        self.vars = {}
        self.ops = {}
        self.graph = tf.Graph()
        with self.graph.as_default():
            mag_const = tf.constant(self.magnitude,
                                    dtype=np.float32,
                                    name='magnitude')
            ls_const = tf.constant(self.length_scale,
                                   dtype=np.float32,
                                   name='length_scale')
            # Nodes for distance computation: row-wise Euclidean distance
            # between a single vector v1 and each row of v2.
            v1 = tf.placeholder(tf.float32, name="v1")
            v2 = tf.placeholder(tf.float32, name="v2")
            dist_op = tf.sqrt(tf.reduce_sum(tf.pow(tf.subtract(v1, v2), 2), 1), name='dist_op')
            if self.check_numerics:
                dist_op = tf.check_numerics(dist_op, "dist_op: ")
            self.vars['v1_h'] = v1
            self.vars['v2_h'] = v2
            self.ops['dist_op'] = dist_op
            # Nodes for kernel computation: K = magnitude * exp(-dist / ls),
            # optionally with a ridge term added to the diagonal.
            X_dists = tf.placeholder(tf.float32, name='X_dists')
            ridge_ph = tf.placeholder(tf.float32, name='ridge')
            K_op = mag_const * tf.exp(-X_dists / ls_const)
            if self.check_numerics:
                K_op = tf.check_numerics(K_op, "K_op: ")
            K_ridge_op = K_op + tf.diag(ridge_ph)
            if self.check_numerics:
                K_ridge_op = tf.check_numerics(K_ridge_op, "K_ridge_op: ")
            self.vars['X_dists_h'] = X_dists
            self.vars['ridge_h'] = ridge_ph
            self.ops['K_op'] = K_op
            self.ops['K_ridge_op'] = K_ridge_op
            # Nodes for xy computation: K_inv = K^-1 and xy = K^-1 * y.
            K = tf.placeholder(tf.float32, name='K')
            K_inv = tf.placeholder(tf.float32, name='K_inv')
            xy_ = tf.placeholder(tf.float32, name='xy_')
            yt_ = tf.placeholder(tf.float32, name='yt_')
            K_inv_op = tf.matrix_inverse(K)
            if self.check_numerics:
                K_inv_op = tf.check_numerics(K_inv_op, "K_inv: ")
            xy_op = tf.matmul(K_inv, yt_)
            if self.check_numerics:
                xy_op = tf.check_numerics(xy_op, "xy_: ")
            self.vars['K_h'] = K
            self.vars['K_inv_h'] = K_inv
            self.vars['xy_h'] = xy_
            self.vars['yt_h'] = yt_
            self.ops['K_inv_op'] = K_inv_op
            self.ops['xy_op'] = xy_op
            # Nodes for yhat/sigma computation (GP posterior mean and std):
            # yhat = K2^T * xy_ ; sigma = sqrt(diag(K3 - K2^T K^-1 K2)).
            K2 = tf.placeholder(tf.float32, name="K2")
            K3 = tf.placeholder(tf.float32, name="K3")
            yhat_ = tf.cast(tf.matmul(tf.transpose(K2), xy_), tf.float32)
            if self.check_numerics:
                yhat_ = tf.check_numerics(yhat_, "yhat_: ")
            sv1 = tf.matmul(tf.transpose(K2), tf.matmul(K_inv, K2))
            if self.check_numerics:
                sv1 = tf.check_numerics(sv1, "sv1: ")
            sig_val = tf.cast((tf.sqrt(tf.diag_part(K3 - sv1))), tf.float32)
            if self.check_numerics:
                sig_val = tf.check_numerics(sig_val, "sig_val: ")
            self.vars['K2_h'] = K2
            self.vars['K3_h'] = K3
            self.ops['yhat_op'] = yhat_
            self.ops['sig_op'] = sig_val
            # Compute y_best (min y) -- used by subclasses/acquisitions.
            y_best_op = tf.cast(tf.reduce_min(yt_, 0, True), tf.float32)
            if self.check_numerics:
                y_best_op = tf.check_numerics(y_best_op, "y_best_op: ")
            self.ops['y_best_op'] = y_best_op
            sigma = tf.placeholder(tf.float32, name='sigma')
            yhat = tf.placeholder(tf.float32, name='yhat')
            self.vars['sigma_h'] = sigma
            self.vars['yhat_h'] = yhat

    def __repr__(self):
        # Dump all attributes, sorted by name, one per line.
        rep = ""
        for k, v in sorted(self.__dict__.items()):
            rep += "{} = {}\n".format(k, v)
        return rep

    def __str__(self):
        return self.__repr__()

    def check_X_y(self, X, y):
        """Validate training inputs and enforce the max_train_size_ cap."""
        from sklearn.utils.validation import check_X_y
        if X.shape[0] > self.max_train_size_:
            raise Exception("X_train size cannot exceed {} ({})"
                            .format(self.max_train_size_, X.shape[0]))
        return check_X_y(X, y, multi_output=True,
                         allow_nd=True, y_numeric=True,
                         estimator="GPR")

    def check_fitted(self):
        """Raise unless fit() has populated the model state."""
        if self.X_train is None or self.y_train is None \
                or self.xy_ is None or self.K is None:
            raise Exception("The model must be trained before making predictions!")

    @staticmethod
    def check_array(X):
        from sklearn.utils.validation import check_array
        return check_array(X, allow_nd=True, estimator="GPR")

    @staticmethod
    def check_output(X):
        """Raise if X contains NaN or +/-inf values."""
        finite_els = np.isfinite(X)
        if not np.all(finite_els):
            raise Exception("Input contains non-finite values: {}"
                            .format(X[~finite_els]))

    def fit(self, X_train, y_train, ridge=1.0):
        """Fit the GP: compute K (+ridge), K^-1, and xy_ = K^-1 * y.

        `ridge` may be a scalar (broadcast over the diagonal) or a 1-D array
        of per-sample noise terms.
        """
        self._reset()
        X_train, y_train = self.check_X_y(X_train, y_train)
        self.X_train = np.float32(X_train)
        self.y_train = np.float32(y_train)
        sample_size = self.X_train.shape[0]
        if np.isscalar(ridge):
            ridge = np.ones(sample_size) * ridge
        assert isinstance(ridge, np.ndarray)
        assert ridge.ndim == 1
        # Pairwise train-train distances, one row per training point.
        X_dists = np.zeros((sample_size, sample_size), dtype=np.float32)
        with tf.Session(graph=self.graph,
                        config=tf.ConfigProto(
                            intra_op_parallelism_threads=self.num_threads_)) as sess:
            dist_op = self.ops['dist_op']
            v1, v2 = self.vars['v1_h'], self.vars['v2_h']
            for i in range(sample_size):
                X_dists[i] = sess.run(dist_op, feed_dict={v1: self.X_train[i], v2: self.X_train})
            K_ridge_op = self.ops['K_ridge_op']
            X_dists_ph = self.vars['X_dists_h']
            ridge_ph = self.vars['ridge_h']
            self.K = sess.run(K_ridge_op, feed_dict={X_dists_ph: X_dists, ridge_ph: ridge})
            K_ph = self.vars['K_h']
            K_inv_op = self.ops['K_inv_op']
            self.K_inv = sess.run(K_inv_op, feed_dict={K_ph: self.K})
            xy_op = self.ops['xy_op']
            K_inv_ph = self.vars['K_inv_h']
            yt_ph = self.vars['yt_h']
            self.xy_ = sess.run(xy_op, feed_dict={K_inv_ph: self.K_inv,
                                                  yt_ph: self.y_train})
        return self

    def predict(self, X_test):
        """Return GPRResult(yhats, sigmas) for X_test, processed in batches."""
        self.check_fitted()
        X_test = np.float32(GPR.check_array(X_test))
        test_size = X_test.shape[0]
        sample_size = self.X_train.shape[0]
        arr_offset = 0
        yhats = np.zeros([test_size, 1])
        sigmas = np.zeros([test_size, 1])
        with tf.Session(graph=self.graph,
                        config=tf.ConfigProto(
                            intra_op_parallelism_threads=self.num_threads_)) as sess:
            # Nodes for distance operation
            dist_op = self.ops['dist_op']
            v1 = self.vars['v1_h']
            v2 = self.vars['v2_h']
            # Nodes for kernel computation
            K_op = self.ops['K_op']
            X_dists = self.vars['X_dists_h']
            # Nodes to compute yhats/sigmas
            yhat_ = self.ops['yhat_op']
            K_inv_ph = self.vars['K_inv_h']
            K2 = self.vars['K2_h']
            K3 = self.vars['K3_h']
            xy_ph = self.vars['xy_h']
            while arr_offset < test_size:
                # Clamp the final batch to the remaining test points.
                if arr_offset + self.batch_size_ > test_size:
                    end_offset = test_size
                else:
                    end_offset = arr_offset + self.batch_size_
                X_test_batch = X_test[arr_offset:end_offset]
                batch_len = end_offset - arr_offset
                # dists1: train-vs-batch distances -> cross-kernel K2.
                dists1 = np.zeros([sample_size, batch_len])
                for i in range(sample_size):
                    dists1[i] = sess.run(dist_op, feed_dict={v1: self.X_train[i],
                                                             v2: X_test_batch})
                sig_val = self.ops['sig_op']
                K2_ = sess.run(K_op, feed_dict={X_dists: dists1})
                yhat = sess.run(yhat_, feed_dict={K2: K2_, xy_ph: self.xy_})
                # dists2: batch-vs-batch distances -> test kernel K3.
                dists2 = np.zeros([batch_len, batch_len])
                for i in range(batch_len):
                    dists2[i] = sess.run(dist_op, feed_dict={v1: X_test_batch[i], v2: X_test_batch})
                K3_ = sess.run(K_op, feed_dict={X_dists: dists2})
                sigma = np.zeros([1, batch_len], np.float32)
                sigma[0] = sess.run(sig_val, feed_dict={K_inv_ph: self.K_inv, K2: K2_, K3: K3_})
                sigma = np.transpose(sigma)
                yhats[arr_offset: end_offset] = yhat
                sigmas[arr_offset: end_offset] = sigma
                arr_offset = end_offset
        GPR.check_output(yhats)
        GPR.check_output(sigmas)
        return GPRResult(yhats, sigmas)

    def get_params(self, deep=True):
        """Return model parameters as a dict (sklearn-style; `deep` unused)."""
        return {"length_scale": self.length_scale,
                "magnitude": self.magnitude,
                "X_train": self.X_train,
                "y_train": self.y_train,
                "xy_": self.xy_,
                "K": self.K,
                "K_inv": self.K_inv}

    def set_params(self, **parameters):
        """Set attributes from keyword args and return self (sklearn-style)."""
        for param, val in list(parameters.items()):
            setattr(self, param, val)
        return self

    def _reset(self):
        """Erase fitted state and rebuild a fresh TF graph."""
        self.X_train = None
        self.y_train = None
        self.xy_ = None
        self.K = None
        self.K_inv = None
        self.graph = None
        self.build_graph()
        gc.collect()
class GPRGD(GPR):
    """GPR model that gradient-descends each test point over the input space.

    After fitting the GP, predict() treats every test point as a starting
    configuration and runs `max_iter` Adam steps minimizing the
    lower-confidence-bound style loss
        loss = mu_multiplier * yhat - sigma_multiplier * sigma,
    projecting each step back into the box [X_min, X_max].
    """

    GP_BETA_UCB = "UCB"
    GP_BETA_CONST = "CONST"

    def __init__(self,
                 length_scale=1.0,
                 magnitude=1.0,
                 max_train_size=7000,
                 batch_size=3000,
                 num_threads=4,
                 learning_rate=0.01,
                 epsilon=1e-6,
                 max_iter=100,
                 sigma_multiplier=3.0,
                 mu_multiplier=1.0):
        super(GPRGD, self).__init__(length_scale=length_scale,
                                    magnitude=magnitude,
                                    max_train_size=max_train_size,
                                    batch_size=batch_size,
                                    num_threads=num_threads)
        self.learning_rate = learning_rate    # Adam step size
        self.epsilon = epsilon                # Adam epsilon
        self.max_iter = max_iter              # GD iterations per test point
        self.sigma_multiplier = sigma_multiplier  # exploration weight
        self.mu_multiplier = mu_multiplier        # exploitation weight
        # Box constraints for projected gradient descent; set in fit().
        self.X_min = None
        self.X_max = None

    def fit(self, X_train, y_train, X_min, X_max, ridge):  # pylint: disable=arguments-differ
        """Fit the GP, then build the gradient-descent subgraph.

        X_min/X_max are the per-feature bounds that predict() projects onto
        after every gradient step.
        """
        super(GPRGD, self).fit(X_train, y_train, ridge)
        self.X_min = X_min
        self.X_max = X_max
        with tf.Session(graph=self.graph,
                        config=tf.ConfigProto(
                            intra_op_parallelism_threads=self.num_threads_)) as sess:
            # The configuration being optimized. Seeded arbitrarily with the
            # first training point; predict() re-assigns it per test point.
            # (Bug fix: dtype was being passed positionally into tf.Variable's
            # `trainable` slot; pass it by keyword.)
            xt_ = tf.Variable(self.X_train[0], dtype=tf.float32)
            xt_ph = tf.placeholder(tf.float32)
            xt_assign_op = xt_.assign(xt_ph)
            init = tf.global_variables_initializer()
            sess.run(init)
            # Cross-kernel between xt_ and the training set (column vector).
            K2_mat = tf.transpose(tf.expand_dims(tf.sqrt(tf.reduce_sum(tf.pow(
                tf.subtract(xt_, self.X_train), 2), 1)), 0))
            if self.check_numerics is True:
                K2_mat = tf.check_numerics(K2_mat, "K2_mat: ")
            K2__ = tf.cast(self.magnitude * tf.exp(-K2_mat / self.length_scale), tf.float32)
            if self.check_numerics is True:
                K2__ = tf.check_numerics(K2__, "K2__: ")
            # GP posterior mean and std at xt_.
            yhat_gd = tf.cast(tf.matmul(tf.transpose(K2__), self.xy_), tf.float32)
            if self.check_numerics is True:
                yhat_gd = tf.check_numerics(yhat_gd, message="yhat: ")
            sig_val = tf.cast((tf.sqrt(self.magnitude - tf.matmul(
                tf.transpose(K2__), tf.matmul(self.K_inv, K2__)))), tf.float32)
            if self.check_numerics is True:
                sig_val = tf.check_numerics(sig_val, message="sigma: ")
            LOG.debug("\nyhat_gd : %s", str(sess.run(yhat_gd)))
            LOG.debug("\nsig_val : %s", str(sess.run(sig_val)))
            # LCB-style acquisition: minimize mean minus (weighted) std.
            loss = tf.squeeze(tf.subtract(self.mu_multiplier * yhat_gd,
                                          self.sigma_multiplier * sig_val))
            if self.check_numerics is True:
                loss = tf.check_numerics(loss, "loss: ")
            optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate,
                                               epsilon=self.epsilon)
            # optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate)
            train = optimizer.minimize(loss)
            self.vars['xt_'] = xt_
            self.vars['xt_ph'] = xt_ph
            self.ops['xt_assign_op'] = xt_assign_op
            self.ops['yhat_gd'] = yhat_gd
            self.ops['sig_val2'] = sig_val
            self.ops['loss_op'] = loss
            self.ops['train_op'] = train
        return self

    def predict(self, X_test, constraint_helper=None,  # pylint: disable=arguments-differ
                categorical_feature_method='hillclimbing',
                categorical_feature_steps=3):
        """Gradient-descend each test point and return a GPRGDResult with the
        best (minimum-loss) configuration found along each trajectory.

        If `constraint_helper` is given, configurations are projected onto
        the feasible set after every step, and categorical features are
        optimized by random hill climbing every `categorical_feature_steps`
        iterations.
        """
        self.check_fitted()
        X_test = np.float32(GPR.check_array(X_test))
        test_size = X_test.shape[0]
        nfeats = self.X_train.shape[1]
        arr_offset = 0
        yhats = np.zeros([test_size, 1])
        sigmas = np.zeros([test_size, 1])
        minls = np.zeros([test_size, 1])
        minl_confs = np.zeros([test_size, nfeats])
        with tf.Session(graph=self.graph,
                        config=tf.ConfigProto(
                            intra_op_parallelism_threads=self.num_threads_)) as sess:
            while arr_offset < test_size:
                if arr_offset + self.batch_size_ > test_size:
                    end_offset = test_size
                else:
                    end_offset = arr_offset + self.batch_size_
                X_test_batch = X_test[arr_offset:end_offset]
                batch_len = end_offset - arr_offset
                xt_ = self.vars['xt_']
                init = tf.global_variables_initializer()
                sess.run(init)
                sig_val = self.ops['sig_val2']
                yhat_gd = self.ops['yhat_gd']
                loss = self.ops['loss_op']
                train = self.ops['train_op']
                xt_ph = self.vars['xt_ph']
                assign_op = self.ops['xt_assign_op']
                yhat = np.empty((batch_len, 1))
                sigma = np.empty((batch_len, 1))
                minl = np.empty((batch_len, 1))
                minl_conf = np.empty((batch_len, nfeats))
                for i in range(batch_len):
                    if self.debug is True:
                        LOG.info("-------------------------------------------")
                    # Per-iteration traces (max_iter steps + the final state).
                    yhats_it = np.empty((self.max_iter + 1,)) * np.nan
                    sigmas_it = np.empty((self.max_iter + 1,)) * np.nan
                    losses_it = np.empty((self.max_iter + 1,)) * np.nan
                    confs_it = np.empty((self.max_iter + 1, nfeats)) * np.nan
                    sess.run(assign_op, feed_dict={xt_ph: X_test_batch[i]})
                    step = 0
                    for step in range(self.max_iter):
                        if self.debug is True:
                            LOG.info("Batch %d, iter %d:", i, step)
                        yhats_it[step] = sess.run(yhat_gd)[0][0]
                        sigmas_it[step] = sess.run(sig_val)[0][0]
                        losses_it[step] = sess.run(loss)
                        confs_it[step] = sess.run(xt_)
                        if self.debug is True:
                            LOG.info("  yhat:  %s", str(yhats_it[step]))
                            LOG.info("  sigma: %s", str(sigmas_it[step]))
                            LOG.info("  loss:  %s", str(losses_it[step]))
                            LOG.info("  conf:  %s", str(confs_it[step]))
                        sess.run(train)
                        # constraint Projected Gradient Descent: clip back
                        # into the [X_min, X_max] box after the step.
                        xt = sess.run(xt_)
                        xt_valid = np.minimum(xt, self.X_max)
                        xt_valid = np.maximum(xt_valid, self.X_min)
                        sess.run(assign_op, feed_dict={xt_ph: xt_valid})
                        if constraint_helper is not None:
                            xt_valid = constraint_helper.apply_constraints(sess.run(xt_))
                            sess.run(assign_op, feed_dict={xt_ph: xt_valid})
                            if categorical_feature_method == 'hillclimbing':
                                if step % categorical_feature_steps == 0:
                                    current_xt = sess.run(xt_)
                                    current_loss = sess.run(loss)
                                    new_xt = \
                                        constraint_helper.randomize_categorical_features(
                                            current_xt)
                                    sess.run(assign_op, feed_dict={xt_ph: new_xt})
                                    new_loss = sess.run(loss)
                                    if current_loss < new_loss:
                                        # Bug fix: revert to the previous
                                        # configuration when the random draw
                                        # made the loss worse (the original
                                        # code re-assigned new_xt, a no-op,
                                        # so the worse config was kept).
                                        sess.run(assign_op, feed_dict={xt_ph: current_xt})
                            else:
                                raise Exception("Unknown categorical feature method: {}".format(
                                    categorical_feature_method))
                        if step == self.max_iter - 1:
                            # Record results from final iteration
                            yhats_it[-1] = sess.run(yhat_gd)[0][0]
                            sigmas_it[-1] = sess.run(sig_val)[0][0]
                            losses_it[-1] = sess.run(loss)
                            confs_it[-1] = sess.run(xt_)
                            assert np.all(np.isfinite(yhats_it))
                            assert np.all(np.isfinite(sigmas_it))
                            assert np.all(np.isfinite(losses_it))
                            assert np.all(np.isfinite(confs_it))
                    # Store info for conf with min loss from all iters
                    if np.all(~np.isfinite(losses_it)):
                        min_loss_idx = 0
                    else:
                        min_loss_idx = np.nanargmin(losses_it)
                    yhat[i] = yhats_it[min_loss_idx]
                    sigma[i] = sigmas_it[min_loss_idx]
                    minl[i] = losses_it[min_loss_idx]
                    minl_conf[i] = confs_it[min_loss_idx]
                minls[arr_offset:end_offset] = minl
                minl_confs[arr_offset:end_offset] = minl_conf
                yhats[arr_offset:end_offset] = yhat
                sigmas[arr_offset:end_offset] = sigma
                arr_offset = end_offset
        GPR.check_output(yhats)
        GPR.check_output(sigmas)
        GPR.check_output(minls)
        GPR.check_output(minl_confs)
        return GPRGDResult(yhats, sigmas, minls, minl_confs)

    @staticmethod
    def calculate_sigma_multiplier(t, ndim, bound=0.1):
        """Compute the GP-UCB exploration coefficient sqrt(beta_t).

        Implements beta_t = 2 * log(|D| * t^2 * pi^2 / (6 * delta)) from
        Srinivas et al.'s GP-UCB regret bound, with |D| = ndim and
        delta = bound (failure probability; smaller bound => more
        exploration).

        Bug fix: the original expression `... / 6 * bound` multiplied by
        `bound` instead of dividing by it, inverting the dependence on the
        confidence parameter.
        """
        assert t > 0
        assert ndim > 0
        assert 0 < bound <= 1
        beta = 2 * np.log(ndim * (t ** 2) * (np.pi ** 2) / (6 * bound))
        if beta > 0:
            beta = np.sqrt(beta)
        else:
            # Degenerate/early-round case: fall back to a unit multiplier.
            beta = 1
        return beta
# def gp_tf(X_train, y_train, X_test, ridge, length_scale, magnitude, batch_size=3000):
# with tf.Graph().as_default():
# y_best = tf.cast(tf.reduce_min(y_train, 0, True), tf.float32)
# sample_size = X_train.shape[0]
# train_size = X_test.shape[0]
# arr_offset = 0
# yhats = np.zeros([train_size, 1])
# sigmas = np.zeros([train_size, 1])
# eips = np.zeros([train_size, 1])
# X_train = np.float32(X_train)
# y_train = np.float32(y_train)
# X_test = np.float32(X_test)
# ridge = np.float32(ridge)
#
# v1 = tf.placeholder(tf.float32,name="v1")
# v2 = tf.placeholder(tf.float32,name="v2")
# dist_op = tf.sqrt(tf.reduce_sum(tf.pow(tf.subtract(v1, v2), 2), 1))
# try:
# sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))
#
# dists = np.zeros([sample_size,sample_size])
# for i in range(sample_size):
# dists[i] = sess.run(dist_op,feed_dict={v1:X_train[i], v2:X_train})
#
#
# dists = tf.cast(dists, tf.float32)
# K = magnitude * tf.exp(-dists/length_scale) + tf.diag(ridge);
#
# K2 = tf.placeholder(tf.float32, name="K2")
# K3 = tf.placeholder(tf.float32, name="K3")
#
# x = tf.matmul(tf.matrix_inverse(K), y_train)
# yhat_ = tf.cast(tf.matmul(tf.transpose(K2), x), tf.float32);
# sig_val = tf.cast((tf.sqrt(tf.diag_part(K3 - tf.matmul(tf.transpose(K2),
# tf.matmul(tf.matrix_inverse(K),
# K2))))),
# tf.float32)
#
# u = tf.placeholder(tf.float32, name="u")
# phi1 = 0.5 * tf.erf(u / np.sqrt(2.0)) + 0.5
# phi2 = (1.0 / np.sqrt(2.0 * np.pi)) * tf.exp(tf.square(u) * (-0.5));
# eip = (tf.multiply(u, phi1) + phi2);
#
# while arr_offset < train_size:
# if arr_offset + batch_size > train_size:
# end_offset = train_size
# else:
# end_offset = arr_offset + batch_size;
#
# xt_ = X_test[arr_offset:end_offset];
# batch_len = end_offset - arr_offset
#
# dists = np.zeros([sample_size, batch_len])
# for i in range(sample_size):
# dists[i] = sess.run(dist_op, feed_dict={v1:X_train[i], v2:xt_})
#
# K2_ = magnitude * tf.exp(-dists / length_scale);
# K2_ = sess.run(K2_)
#
# dists = np.zeros([batch_len, batch_len])
# for i in range(batch_len):
# dists[i] = sess.run(dist_op, feed_dict={v1:xt_[i], v2:xt_})
# K3_ = magnitude * tf.exp(-dists / length_scale);
# K3_ = sess.run(K3_)
#
# yhat = sess.run(yhat_, feed_dict={K2:K2_})
#
# sigma = np.zeros([1, batch_len], np.float32)
# sigma[0] = (sess.run(sig_val, feed_dict={K2:K2_, K3:K3_}))
# sigma = np.transpose(sigma)
#
# u_ = tf.cast(tf.div(tf.subtract(y_best, yhat), sigma), tf.float32)
# u_ = sess.run(u_)
# eip_p = sess.run(eip, feed_dict={u:u_})
# eip_ = tf.multiply(sigma, eip_p)
# yhats[arr_offset:end_offset] = yhat
# sigmas[arr_offset:end_offset] = sigma;
# eips[arr_offset:end_offset] = sess.run(eip_);
# arr_offset = end_offset
#
# finally:
# sess.close()
#
# return yhats, sigmas, eips
def euclidean_mat(X, y, sess):
    """Compute the pairwise Euclidean distance matrix between rows of X and y.

    Builds one TF op per (row of X, row of y) pair and evaluates them a row
    at a time in the supplied session. Returns an [X.shape[0], y.shape[0]]
    numpy array.
    """
    n_rows = X.shape[0]
    n_cols = y.shape[0]
    distances = np.zeros([n_rows, n_cols])
    for row_idx in range(n_rows):
        row_vec = X[row_idx]
        row_ops = [tf.sqrt(tf.reduce_sum(tf.pow(tf.subtract(row_vec, y[col_idx]), 2)))
                   for col_idx in range(n_cols)]
        distances[row_idx] = sess.run(row_ops)
    return distances
def gd_tf(xs, ys, xt, ridge, length_scale=1.0, magnitude=1.0, max_iter=50):
    """Gradient-descend each starting point in `xt` toward lower GP loss.

    Standalone (pre-class) version of GPRGD: fits an exponential-kernel GP
    on (xs, ys), then for every row of xt runs `max_iter` Adam steps
    minimizing loss = yhat - sigma, and records the final mean, std, loss,
    and configuration.

    Parameters
    ----------
    xs : array, [n_samples, n_feats] -- training inputs
    ys : array, [n_samples, 1] -- training targets
    xt : array, [n_test, n_feats] -- starting configurations to optimize
    ridge : scalar or array -- diagonal noise added to the kernel matrix
    length_scale, magnitude : kernel hyperparameters
    max_iter : gradient steps per test point

    Returns
    -------
    (yhats, sigmas, minl, new_conf) -- final values for each test point.
    """
    LOG.debug("xs shape: %s", str(xs.shape))
    LOG.debug("ys shape: %s", str(ys.shape))
    LOG.debug("xt shape: %s", str(xt.shape))
    with tf.Graph().as_default():
        # y_best = tf.cast(tf.reduce_min(ys,0,True),tf.float32); #array
        # yhat_gd = tf.check_numerics(yhat_gd, message="yhat: ")
        sample_size = xs.shape[0]
        nfeats = xs.shape[1]
        test_size = xt.shape[0]
        # arr_offset = 0
        # ini_size duplicates test_size (both xt.shape[0]).
        ini_size = xt.shape[0]
        yhats = np.zeros([test_size, 1])
        sigmas = np.zeros([test_size, 1])
        minl = np.zeros([test_size, 1])
        new_conf = np.zeros([test_size, nfeats])
        xs = np.float32(xs)
        ys = np.float32(ys)
        # NOTE(review): tf.Variable's second positional argument is
        # `trainable`; passing tf.float32 here is merely truthy -- dtype was
        # probably intended. Confirm before changing.
        xt_ = tf.Variable(xt[0], tf.float32)
        sess = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=8))
        init = tf.global_variables_initializer()
        sess.run(init)
        ridge = np.float32(ridge)
        v1 = tf.placeholder(tf.float32, name="v1")
        v2 = tf.placeholder(tf.float32, name="v2")
        dist = tf.sqrt(tf.reduce_sum(tf.pow(tf.subtract(v1, v2), 2), 1))
        # Pairwise train-train distances -> kernel matrix K with ridge.
        tmp = np.zeros([sample_size, sample_size])
        for i in range(sample_size):
            tmp[i] = sess.run(dist, feed_dict={v1: xs[i], v2: xs})
        tmp = tf.cast(tmp, tf.float32)
        K = magnitude * tf.exp(-tmp / length_scale) + tf.diag(ridge)
        LOG.debug("K shape: %s", str(sess.run(K).shape))
        # Cross-kernel between the variable xt_ and the training set.
        K2_mat = tf.sqrt(tf.reduce_sum(tf.pow(tf.subtract(xt_, xs), 2), 1))
        K2_mat = tf.transpose(tf.expand_dims(K2_mat, 0))
        # NOTE(review): unlike the class version, the `magnitude` factor is
        # missing from K2 here -- confirm whether that is intentional.
        K2 = tf.cast(tf.exp(-K2_mat / length_scale), tf.float32)
        # x = K^-1 * y, evaluated once up front.
        x = tf.matmul(tf.matrix_inverse(K), ys)
        x = sess.run(x)
        # GP posterior mean and std at xt_.
        yhat_ = tf.cast(tf.matmul(tf.transpose(K2), x), tf.float32)
        sig_val = tf.cast((tf.sqrt(magnitude - tf.matmul(
            tf.transpose(K2), tf.matmul(tf.matrix_inverse(K), K2)))), tf.float32)
        LOG.debug('yhat shape: %s', str(sess.run(yhat_).shape))
        LOG.debug('sig_val shape: %s', str(sess.run(sig_val).shape))
        yhat_ = tf.check_numerics(yhat_, message='yhat: ')
        sig_val = tf.check_numerics(sig_val, message='sig_val: ')
        # LCB-style loss: mean minus std.
        loss = tf.squeeze(tf.subtract(yhat_, sig_val))
        loss = tf.check_numerics(loss, message='loss: ')
        # optimizer = tf.train.GradientDescentOptimizer(0.1)
        LOG.debug('loss: %s', str(sess.run(loss)))
        optimizer = tf.train.AdamOptimizer(0.1)
        train = optimizer.minimize(loss)
        init = tf.global_variables_initializer()
        sess.run(init)
        # Optimize each starting configuration independently.
        for i in range(ini_size):
            assign_op = xt_.assign(xt[i])
            sess.run(assign_op)
            for step in range(max_iter):
                LOG.debug('sample #: %d, iter #: %d, loss: %s', i, step, str(sess.run(loss)))
                sess.run(train)
            yhats[i] = sess.run(yhat_)[0][0]
            sigmas[i] = sess.run(sig_val)[0][0]
            minl[i] = sess.run(loss)
            new_conf[i] = sess.run(xt_)
        return yhats, sigmas, minl, new_conf
def main():
    """Placeholder entry point; this module is used as a library."""
    pass
def create_random_matrices(n_samples=3000, n_feats=12, n_test=4444):
    """Generate random GPR inputs for testing/benchmarking.

    Returns (X_train, y_train, X_test, length_scale, magnitude, ridge),
    all drawn from numpy's global RNG. The draw order is fixed so results
    are reproducible under a prior np.random.seed() call.
    """
    train_x = np.random.rand(n_samples, n_feats)
    train_y = np.random.rand(n_samples, 1)
    test_x = np.random.rand(n_test, n_feats)
    scale = np.random.rand()
    mag = np.random.rand()
    ridge_vec = np.ones(n_samples) * np.random.rand()
    return train_x, train_y, test_x, scale, mag, ridge_vec
# Script entry point: main() is currently a no-op placeholder.
if __name__ == "__main__":
    main()

109
server/analysis/lasso.py Normal file
View File

@@ -0,0 +1,109 @@
#
# OtterTune - lasso.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Jul 8, 2016
@author: dvanaken
'''
import numpy as np
from sklearn.linear_model import lasso_path
from .base import ModelBase
class LassoPath(ModelBase):
    """Lasso:
    Computes the Lasso path using Sklearn's lasso_path method.

    See also
    --------
    http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.lasso_path.html

    Attributes
    ----------
    feature_labels_ : array, [n_features]
        Labels for each of the features in X.
    alphas_ : array, [n_alphas]
        The alphas along the path where models are computed (decreasing
        values of the penalty along the path).
    coefs_ : array, [n_outputs, n_features, n_alphas]
        Coefficients along the path.
    rankings_ : array, [n_features]
        The average ranking of each feature across all target values.
    """

    def __init__(self):
        self.feature_labels_ = None
        self.alphas_ = None
        self.coefs_ = None
        self.rankings_ = None

    def _reset(self):
        """Resets all attributes (erases the model)"""
        self.feature_labels_ = None
        self.alphas_ = None
        self.coefs_ = None
        self.rankings_ = None

    def fit(self, X, y, feature_labels, estimator_params=None):
        """Computes the Lasso path using Sklearn's lasso_path method.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data (the independent variables).
        y : array-like, shape (n_samples, n_outputs)
            Training data (the output/target values).
        feature_labels : array-like, shape (n_features)
            Labels for each of the features in X.
        estimator_params : dict, optional
            The parameters to pass to Sklearn's Lasso estimator.

        Returns
        -------
        self
        """
        self._reset()
        params = {} if estimator_params is None else estimator_params
        self.feature_labels_ = feature_labels
        alphas, coefs, _ = lasso_path(X, y, **params)
        self.alphas_ = alphas.copy()
        self.coefs_ = coefs.copy()
        # Rank the features in X by order of importance: the earlier a
        # feature's coefficient becomes nonzero along the path, the MORE
        # important it is. The entrance step is 1 + (number of leading
        # zeros in that feature's coefficient path).
        per_feature_steps = [[] for _ in range(X.shape[1])]
        for coef_paths in self.coefs_:
            for feat_idx, path in enumerate(coef_paths):
                nonzero_positions = np.nonzero(path)[0]
                if nonzero_positions.size > 0:
                    entrance_step = int(nonzero_positions[0]) + 1
                else:
                    # Feature never entered the regression.
                    entrance_step = len(path) + 1
                per_feature_steps[feat_idx].append(entrance_step)
        self.rankings_ = np.array([np.mean(steps) for steps in per_feature_steps])
        return self

    def get_ranked_features(self):
        """Return feature labels ordered from most to least important."""
        if self.rankings_ is None:
            raise Exception("No lasso path has been fit yet!")
        order = np.argsort(self.rankings_)
        return [self.feature_labels_[idx] for idx in order]

View File

@@ -0,0 +1,489 @@
#
# OtterTune - preprocessing.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
from abc import ABCMeta, abstractmethod
from itertools import chain, combinations, combinations_with_replacement
import numpy as np
from sklearn.preprocessing import MinMaxScaler as SklearnMinMaxScaler
from .util import is_numeric_matrix, is_lexical_matrix
# ==========================================================
# Preprocessing Base Class
# ==========================================================
class Preprocess(object, metaclass=ABCMeta):
    """Abstract base class for matrix preprocessing steps.

    Subclasses implement fit()/transform()/inverse_transform(); the `copy`
    flag indicates whether transform may mutate its input in place.
    """

    @abstractmethod
    def fit(self, matrix):
        """Learn any parameters needed to transform `matrix`."""
        pass

    @abstractmethod
    def transform(self, matrix, copy=True):
        """Apply the learned transformation to `matrix`."""
        pass

    def fit_transform(self, matrix, copy=True):
        """Fit to `matrix`, then transform it.

        Bug fix: the `copy` argument is now forwarded to transform()
        instead of being hard-coded to True.
        """
        self.fit(matrix)
        return self.transform(matrix, copy=copy)

    @abstractmethod
    def inverse_transform(self, matrix, copy=True):
        """Undo the transformation (where supported)."""
        pass
# ==========================================================
# Bin by Deciles
# ==========================================================
class Bin(Preprocess):
    """Discretize matrix values into decile bins.

    axis=None bins the whole matrix against one set of deciles; axis=0 bins
    each column independently; axis=1 bins each row independently. Bin
    labels start at `bin_start`.
    """

    def __init__(self, bin_start, axis=None):
        if axis not in (None, 0, 1):
            raise NotImplementedError("Axis={} is not yet implemented".format(axis))
        self.deciles_ = None
        self.bin_start_ = bin_start
        self.axis_ = axis

    def fit(self, matrix):
        """Compute decile boundaries according to the configured axis."""
        if self.axis_ is None:
            self.deciles_ = get_deciles(matrix, self.axis_)
        else:
            # axis=0: one decile set per column; axis=1: one per row.
            vectors = matrix.T if self.axis_ == 0 else matrix
            self.deciles_ = [get_deciles(vec, axis=None) for vec in vectors]
        return self

    def transform(self, matrix, copy=True):
        """Replace each value with its decile bin label."""
        assert self.deciles_ is not None
        if self.axis_ is None:
            binned = bin_by_decile(matrix, self.deciles_,
                                   self.bin_start_, self.axis_)
        elif self.axis_ == 0:
            cols = [bin_by_decile(col, dec, self.bin_start_, axis=None)
                    for col, dec in zip(matrix.T, self.deciles_)]
            binned = np.vstack(cols).T
        else:  # self.axis_ == 1
            rows = [bin_by_decile(row, dec, self.bin_start_, axis=None)
                    for row, dec in zip(matrix, self.deciles_)]
            binned = np.vstack(rows)
        assert binned.shape == matrix.shape
        return binned

    def inverse_transform(self, matrix, copy=True):
        raise NotImplementedError("This method is not supported")
def get_deciles(matrix, axis=None):
    """Return the 10 decile boundaries (10th..100th percentiles) of `matrix`.

    The top boundary is replaced with +inf so that every value (including
    the maximum) falls into some bin when used with bin_by_decile().

    Parameters
    ----------
    matrix : non-empty ndarray (any shape; flattened by np.percentile)
    axis : must be None (per-axis deciles are not implemented)

    Returns
    -------
    ndarray of 10 boundaries, the last being +inf.
    """
    if axis is not None:
        raise NotImplementedError("Axis is not yet implemented")
    assert matrix.ndim > 0
    assert matrix.size > 0
    decile_range = np.arange(10, 101, 10)
    deciles = np.percentile(matrix, decile_range, axis=axis)
    # Bug fix: np.Inf is a removed alias as of NumPy 2.0; use np.inf.
    deciles[-1] = np.inf
    return deciles
def bin_by_decile(matrix, deciles, bin_start, axis=None):
    """Map each entry of `matrix` to the index of the smallest decile
    boundary it fits under, offset by `bin_start`.

    `deciles` must contain exactly 10 ascending boundaries (as produced by
    get_deciles).
    """
    if axis is not None:
        raise NotImplementedError("Axis is not yet implemented")
    assert matrix.ndim > 0
    assert matrix.size > 0
    assert deciles is not None
    assert len(deciles) == 10
    binned = np.zeros_like(matrix)
    # Walk boundaries from largest to smallest so each value ends up
    # labeled with the smallest decile that contains it.
    for idx in reversed(range(10)):
        binned[matrix <= deciles[idx]] = idx + bin_start
    return binned
# ==========================================================
# Shuffle Indices
# ==========================================================
class Shuffler(Preprocess):
    """Randomly permute the rows and/or columns of a labeled matrix.

    Operates on matrix objects exposing .data, .rowlabels, and
    .columnlabels. Permutation indices are drawn once in fit() (unless
    supplied up front) so transform() is repeatable and invertible.
    """

    def __init__(self, shuffle_rows=True, shuffle_columns=False,
                 row_indices=None, column_indices=None, seed=0):
        self.shuffle_rows_ = shuffle_rows
        self.shuffle_columns_ = shuffle_columns
        self.row_indices_ = row_indices
        self.column_indices_ = column_indices
        # Seed the global RNG so index draws in fit() are reproducible.
        np.random.seed(seed)
        self.fitted_ = False

    def fit(self, matrix):
        """Draw permutation indices from the matrix dimensions (once)."""
        if self.shuffle_rows_ and self.row_indices_ is None:
            self.row_indices_ = get_shuffle_indices(matrix.data.shape[0])
        if self.shuffle_columns_ and self.column_indices_ is None:
            self.column_indices_ = get_shuffle_indices(matrix.data.shape[1])
        self.fitted_ = True

    def transform(self, matrix, copy=True):
        """Apply the stored permutations to the data and its labels."""
        if not self.fitted_:
            raise Exception("The fit() function must be called before transform()")
        if copy:
            matrix = matrix.copy()
        if self.shuffle_rows_:
            perm = self.row_indices_
            matrix.data = matrix.data[perm]
            matrix.rowlabels = matrix.rowlabels[perm]
        if self.shuffle_columns_:
            perm = self.column_indices_
            matrix.data = matrix.data[:, perm]
            matrix.columnlabels = matrix.columnlabels[perm]
        return matrix

    def inverse_transform(self, matrix, copy=True):
        """Undo transform() by applying the inverse permutations."""
        if copy:
            matrix = matrix.copy()
        if self.shuffle_rows_:
            undo = np.argsort(self.row_indices_)
            matrix.data = matrix.data[undo]
            matrix.rowlabels = matrix.rowlabels[undo]
        if self.shuffle_columns_:
            undo = np.argsort(self.column_indices_)
            matrix.data = matrix.data[:, undo]
            matrix.columnlabels = matrix.columnlabels[undo]
        return matrix
def get_shuffle_indices(size, seed=None):
    """Return random permutation indices.

    If `size` is an int, returns one permutation of range(size); if it is
    an iterable of ints, returns a list with one permutation per dimension.
    An integer `seed` (optional) reseeds numpy's global RNG first.
    """
    if seed is not None:
        assert isinstance(seed, int)
        np.random.seed(seed)
    if isinstance(size, int):
        return np.random.choice(size, size, replace=False)
    return [np.random.choice(dim, dim, replace=False) for dim in size]
# ==========================================================
# Polynomial Features
# ==========================================================
class PolynomialFeatures(Preprocess):
    """Compute the polynomial features of the input array.
    This code was copied and modified from sklearn's
    implementation.
    """

    def __init__(self, degree=2, interaction_only=False, include_bias=True):
        # degree: maximum total degree of the generated terms.
        # interaction_only: if True, skip repeated factors (e.g. x0*x0).
        # include_bias: if True, the first output column is the constant term.
        self.degree_ = degree
        self.interaction_only_ = interaction_only
        self.include_bias_ = include_bias
        self.n_input_features_ = None
        self.n_output_features_ = None

    # @property
    # def powers_(self):
    #     combinations = self._combinations(self.n_input_features_, self.degree_,
    #                                       self.interaction_only_,
    #                                       self.include_bias_)
    #     return np.vstack(np.bincount(c, minlength=self.n_input_features_)
    #                      for c in combinations)

    @staticmethod
    def _combinations(n_features, degree, interaction_only, include_bias):
        """Yield one index tuple per output feature (e.g. (0, 1) -> x0*x1)."""
        comb = (combinations if interaction_only else combinations_with_replacement)
        start = int(not include_bias)
        return chain.from_iterable(comb(list(range(n_features)), i)
                                   for i in range(start, degree + 1))

    def fit(self, matrix):
        """Record the input width and the resulting number of output features."""
        assert matrix.ndim == 2
        assert matrix.size > 0
        _, n_features = matrix.shape
        combos = self._combinations(n_features, self.degree_,
                                    self.interaction_only_,
                                    self.include_bias_)
        self.n_input_features_ = matrix.shape[1]
        self.n_output_features_ = sum(1 for _ in combos)
        return self

    def transform(self, matrix, copy=True):
        """Transform data to polynomial features
        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to transform, row by row.

        Returns
        -------
        XP : np.ndarray shape [n_samples, NP]
            The matrix of features, where NP is the number of polynomial
            features generated from the combination of inputs.
        """
        assert matrix.ndim == 2
        assert matrix.size > 0
        n_samples, n_features = matrix.shape
        if n_features != self.n_input_features_:
            raise ValueError("X shape does not match training shape")
        is_numeric_type = is_numeric_matrix(matrix)
        is_lexical_type = is_lexical_matrix(matrix)
        if is_lexical_type:
            # String matrices produce concatenated strings; size the
            # fixed-width dtype to fit a join of two longest entries.
            strs = matrix.reshape((matrix.size,))
            maxlen = max([len(s) for s in strs])
            dtype = "S{}".format(maxlen * 2 + 1)
        else:
            dtype = matrix.dtype
        # allocate output data
        poly_matrix = np.empty((n_samples, self.n_output_features_), dtype=dtype)
        combos = self._combinations(n_features, self.degree_,
                                    self.interaction_only_,
                                    self.include_bias_)
        for i, c in enumerate(combos):
            if is_numeric_type:
                # Product of the selected columns, per sample.
                poly_matrix[:, i] = matrix[:, c].prod(1)
            elif is_lexical_type:
                n_poly1_feats = n_features + int(self.include_bias_)
                # Degree>=2 terms are joined with '*'; degree<=1 terms plain.
                # NOTE(review): np.squeeze collapses the sample axis, so for
                # n_samples > 1 a single joined string is broadcast to the
                # whole column -- confirm this is the intended semantics.
                if i >= n_poly1_feats:
                    x = "*".join(np.squeeze(matrix[:, c]).tolist())
                else:
                    x = "".join(np.squeeze(matrix[:, c]).tolist())
                poly_matrix[:, i] = x
            else:
                raise TypeError("Unsupported matrix type {}".format(matrix.dtype))
        return poly_matrix

    def inverse_transform(self, matrix, copy=True):
        raise NotImplementedError("This method is not supported")
# ==========================================================
# Dummy Encoding
# ==========================================================
class DummyEncoder(Preprocess):
    """One-hot ("dummy") encode the categorical columns of a matrix.

    Wraps sklearn's OneHotEncoder and additionally tracks human-readable
    column labels for the expanded matrix.

    NOTE(review): relies on the legacy OneHotEncoder API (`n_values`,
    `categorical_features`, `feature_indices_`), which was removed in
    sklearn 0.22 -- this class requires an older sklearn.
    """

    def __init__(self, n_values, categorical_features, cat_columnlabels, noncat_columnlabels):
        from sklearn.preprocessing import OneHotEncoder
        if not isinstance(n_values, np.ndarray):
            n_values = np.array(n_values)
        if not isinstance(categorical_features, np.ndarray):
            categorical_features = np.array(categorical_features)
        # assert categorical_features.size > 0
        assert categorical_features.shape == n_values.shape
        # Binary features should be left as-is; only 3+-valued categoricals
        # are worth expanding.
        for nv in n_values:
            if nv <= 2:
                raise Exception("Categorical features must have 3+ labels")
        self.n_values = n_values                      # cardinality per categorical column
        self.cat_columnlabels = cat_columnlabels      # labels of categorical columns
        self.noncat_columnlabels = noncat_columnlabels  # labels of the remaining columns
        self.encoder = OneHotEncoder(
            n_values=n_values, categorical_features=categorical_features, sparse=False)
        self.new_labels = None        # post-encoding column labels, set in fit()
        self.cat_idxs_old = categorical_features  # original categorical column indices

    def fit(self, matrix):
        """Fit the underlying encoder and derive the expanded column labels."""
        self.encoder.fit(matrix)
        # determine new columnlabels
        # categorical variables are done in order specified by categorical_features
        new_labels = []
        for i, cat_label in enumerate(self.cat_columnlabels):
            low = self.encoder.feature_indices_[i]
            high = self.encoder.feature_indices_[i + 1]
            for j in range(low, high):
                # eg the categorical variable named cat_var with 5 possible values
                # turns into 0/1 variables named cat_var____0, ..., cat_var____4
                new_labels.append(cat_label + "____" + str(j - low))
        # according to sklearn documentation,
        # "non-categorical features are always stacked to the right of the matrix"
        # by observation, it looks like the non-categorical features' relative order is preserved
        # BUT: there is no guarantee made about that behavior!
        # We either trust OneHotEncoder to be sensible, or look for some other way
        new_labels += self.noncat_columnlabels
        self.new_labels = new_labels

    def transform(self, matrix, copy=True):
        """Return the dummy-encoded matrix (categoricals expanded to 0/1)."""
        # actually transform the matrix
        matrix_encoded = self.encoder.transform(matrix)
        return matrix_encoded

    def fit_transform(self, matrix, copy=True):
        """Convenience: fit() followed by transform()."""
        self.fit(matrix)
        return self.transform(matrix)

    def inverse_transform(self, matrix, copy=True):
        """Collapse a dummy-encoded matrix back to the original column layout.

        Each group of 0/1 columns is replaced by the index of the column that
        is 1; non-categorical columns are copied back into their original
        positions.
        """
        n_values = self.n_values
        # If there are no categorical variables, no transformation happened.
        if len(n_values) == 0:
            return matrix
        # Otherwise, this is a dummy-encoded matrix. Transform it back to original form.
        n_features = matrix.shape[-1] - self.encoder.feature_indices_[-1] + len(n_values)
        noncat_start_idx = self.encoder.feature_indices_[-1]
        inverted_matrix = np.empty((matrix.shape[0], n_features))
        cat_idx = 0
        noncat_idx = 0
        for i in range(n_features):
            if i in self.cat_idxs_old:
                # Recover the categorical value: the offset j of the dummy
                # column that is set to 1 within this feature's group.
                new_col = np.ones((matrix.shape[0],))
                start_idx = self.encoder.feature_indices_[cat_idx]
                for j in range(n_values[cat_idx]):
                    col = matrix[:, start_idx + j]
                    new_col[col == 1] = j
                cat_idx += 1
            else:
                new_col = np.array(matrix[:, noncat_start_idx + noncat_idx])
                noncat_idx += 1
            inverted_matrix[:, i] = new_col
        return inverted_matrix

    def total_dummies(self):
        """Total number of 0/1 columns produced by the categorical expansion."""
        return sum(self.n_values)
def consolidate_columnlabels(columnlabels):
    """Collapse dummy-encoded column labels back to their original names.

    Labels of the form ``name____<k>`` (as produced by DummyEncoder) are
    reduced to a single ``name`` entry; all other labels pass through
    unchanged. Order of first appearance is preserved.
    """
    import re
    # Labels created by the dummy encoder look like "<name>____<digit(s)>".
    dummy_pattern = re.compile(r'(.*)____\d+')
    consolidated = []
    seen_categoricals = set()  # avoid emitting a categorical name twice
    for label in columnlabels:
        match = dummy_pattern.match(label)
        if match is None:
            # Non-categorical column: keep as-is.
            consolidated.append(label)
        else:
            original_name = match.group(1)
            if original_name not in seen_categoricals:
                seen_categoricals.add(original_name)
                consolidated.append(original_name)
    return consolidated
def fix_scaler(scaler, encoder, params):
    """Overwrite a fitted scaler's statistics for dummy-encoded columns.

    Each categorical parameter's columns get the theoretical
    Bernoulli(p=0.5) moments instead of the data-estimated ones;
    non-categorical columns are left untouched. ``scaler`` is modified
    in place (mean_, var_, scale_).
    """
    flip_prob = 0.5
    means = scaler.mean_
    variances = scaler.var_
    col = 0       # current column in the encoded feature space
    cat_pos = 0   # index into the encoder's categorical metadata
    for param in params:
        if not param.iscategorical:
            col += 1
            continue
        if param.isboolean:
            # Booleans occupy a single 0/1 column and are not tracked
            # in the encoder's n_values.
            width = 1
        else:
            # Sanity-check alignment with the encoder's column layout.
            assert encoder.xform_start_indices[cat_pos] == col
            width = encoder.n_values[cat_pos]
            cat_pos += 1
        dummy_mean = width * flip_prob
        dummy_var = dummy_mean * (1 - flip_prob)
        means[col: col + width] = dummy_mean
        variances[col: col + width] = dummy_var
        col += width
    scaler.mean_ = means
    scaler.var_ = variances
    scaler.scale_ = np.sqrt(variances)
def get_min_max(params, encoder=None):
    """Return per-column (mins, maxs) bounds in the encoded feature space.

    Dummy-encoded / boolean categorical columns are bounded by [0, 1];
    numeric parameters use their ``true_range``. If ``encoder`` is None
    the parameters are assumed to map one-to-one onto columns.
    """
    if encoder is None:
        n_values = np.array([])
        cat_start_idxs = np.array([])
        nfeats = len(params)
    else:
        n_values = encoder.n_values
        cat_start_idxs = encoder.xform_start_indices
        # Each categorical feature expands from 1 column to n_values[i].
        nfeats = len(params) - n_values.size + np.sum(n_values)
    mins = np.empty((nfeats,))
    maxs = np.empty((nfeats,))
    col = 0       # current column in the encoded feature space
    cat_pos = 0   # index into the encoder's categorical metadata
    for param in params:
        if param.iscategorical:
            if param.isboolean:
                width = 1
            else:
                # Sanity-check alignment with the encoder's layout.
                assert cat_start_idxs[cat_pos] == col
                width = n_values[cat_pos]
                cat_pos += 1
            # Dummy / boolean columns take values in {0, 1}.
            mins[col: col + width] = 0
            maxs[col: col + width] = 1
            col += width
        else:
            mins[col] = param.true_range[0]  # valid_values[0]
            maxs[col] = param.true_range[1]  # valid_values[-1]
            col += 1
    return mins, maxs
# ==========================================================
# Min-max scaler
# ==========================================================
class MinMaxScaler(Preprocess):
    """Scales features into [0, 1], wrapping sklearn's MinMaxScaler.

    May be seeded with known ``mins``/``maxs`` bounds at construction;
    because all fitting goes through ``partial_fit``, a later ``fit()``
    call *extends* the seeded bounds rather than replacing them.
    """
    def __init__(self, mins=None, maxs=None):
        self.scaler_ = SklearnMinMaxScaler()
        if mins is not None:
            assert isinstance(mins, np.ndarray)
            if mins.ndim == 1:
                mins = mins.reshape(1, -1)
            # Seed the scaler's data_min_ bound with the known minimums.
            self.scaler_.partial_fit(mins)
            self.mins_ = mins
        else:
            self.mins_ = None
        if maxs is not None:
            assert isinstance(maxs, np.ndarray)
            if maxs.ndim == 1:
                maxs = maxs.reshape(1, -1)
            # Seed the scaler's data_max_ bound with the known maximums.
            self.scaler_.partial_fit(maxs)
            self.maxs_ = maxs
        else:
            self.maxs_ = None
        # Only usable for transform() once both bounds are known.
        self.fitted_ = self.mins_ is not None and self.maxs_ is not None
    def fit(self, matrix):
        """Update (extend) the min/max bounds from ``matrix``."""
        if matrix.ndim == 1:
            matrix = matrix.reshape(1, -1)
        self.scaler_.partial_fit(matrix)
        self.mins_ = self.scaler_.data_min_
        self.maxs_ = self.scaler_.data_max_
        self.fitted_ = True
        return self
    def transform(self, matrix, copy=True):
        """Scale ``matrix`` into [0, 1] using the learned bounds.

        NOTE(review): the ``copy`` argument is accepted but ignored.
        """
        if not self.fitted_:
            raise Exception("Model not fitted!")
        if matrix.ndim == 1:
            matrix = matrix.reshape(1, -1)
        return self.scaler_.transform(matrix)
    def inverse_transform(self, matrix, copy=True):
        """Undo the scaling.

        NOTE(review): unlike transform(), this does not check fitted_.
        """
        if matrix.ndim == 1:
            matrix = matrix.reshape(1, -1)
        return self.scaler_.inverse_transform(matrix)

View File

@@ -0,0 +1,5 @@
#
# OtterTune - __init__.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#

View File

@@ -0,0 +1,91 @@
#
# OtterTune - test_cluster.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
import unittest
import numpy as np
from sklearn import datasets
from analysis.cluster import KMeans, KMeansClusters, create_kselection_model
class TestKMeans(unittest.TestCase):
    """Regression tests for analysis.cluster.KMeans on the Iris dataset.

    The expected values below are pinned to K=5 with n_init=50 and
    random_state=42; they may shift if sklearn's KMeans implementation
    changes between versions.
    """
    @classmethod
    def setUpClass(cls):
        super(TestKMeans, cls).setUpClass()
        # Fit once for all tests: 5 clusters on the 150x4 Iris matrix.
        iris = datasets.load_iris()
        cls.model = KMeans()
        cls.model.fit(iris.data, 5, iris.target,
                      estimator_params={'n_init': 50, 'random_state': 42})
    def test_kmeans_n_clusters(self):
        self.assertEqual(self.model.n_clusters_, 5)
    def test_kmeans_cluster_inertia(self):
        # Sum of squared distances to the closest center (2 decimals).
        self.assertAlmostEqual(self.model.cluster_inertia_, 46.535, 2)
    def test_kmeans_cluster_labels(self):
        # Pinned cluster assignment for each of the 150 Iris samples.
        expected_labels = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 2, 3, 3, 3,
                           2, 3, 2, 2, 3, 2, 3, 2, 3, 3, 2, 3, 2, 3, 2, 3, 3, 3, 3,
                           3, 3, 3, 2, 2, 2, 2, 3, 2, 3, 3, 3, 2, 2, 2, 3, 2, 2, 2,
                           2, 2, 3, 2, 2, 4, 3, 0, 4, 4, 0, 2, 0, 4, 0, 4, 4, 4, 3,
                           4, 4, 4, 0, 0, 3, 4, 3, 0, 3, 4, 0, 3, 3, 4, 0, 0, 0, 4,
                           3, 3, 0, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 3]
        for lab_actual, lab_expected in zip(self.model.cluster_labels_, expected_labels):
            self.assertEqual(lab_actual, lab_expected)
    def test_kmeans_sample_labels(self):
        # sample_labels_ should echo the labels passed into fit().
        for lab_actual, lab_expected in zip(self.model.sample_labels_, datasets.load_iris().target):
            self.assertEqual(lab_actual, lab_expected)
    def test_kmeans_cluster_centers(self):
        # Pinned 5x4 cluster-center coordinates (2-decimal tolerance).
        expected_centers = [[7.475, 3.125, 6.300, 2.050],
                            [5.006, 3.418, 1.464, 0.244],
                            [5.508, 2.600, 3.908, 1.204],
                            [6.207, 2.853, 4.746, 1.564],
                            [6.529, 3.058, 5.508, 2.162]]
        for row_actual, row_expected in zip(self.model.cluster_centers_, expected_centers):
            for val_actual, val_expected in zip(row_actual, row_expected):
                self.assertAlmostEqual(val_actual, val_expected, 2)
class TestKSelection(unittest.TestCase):
    """Tests the three K-selection strategies on Iris KMeans models.

    Expected optimal cluster counts are pinned to the fixed seeds below.
    """
    def setUp(self):
        # Seed numpy's RNG before every test; presumably needed because
        # at least one selection method (gap-statistic) draws random
        # reference samples -- confirm in analysis.cluster.
        np.random.seed(seed=42)
    @classmethod
    def setUpClass(cls):
        super(TestKSelection, cls).setUpClass()
        # Load Iris data
        iris = datasets.load_iris()
        cls.matrix = iris.data
        # Fit one KMeans model per K in [1, 10].
        cls.kmeans_models = KMeansClusters()
        cls.kmeans_models.fit(cls.matrix,
                              min_cluster=1,
                              max_cluster=10,
                              sample_labels=iris.target,
                              estimator_params={'n_init': 50, 'random_state': 42})
    def test_detk_optimal_num_clusters(self):
        # Compute optimal # cluster using det-k
        detk = create_kselection_model("det-k")
        detk.fit(self.matrix, self.kmeans_models.cluster_map_)
        self.assertEqual(detk.optimal_num_clusters_, 2)
    def test_gap_statistic_optimal_num_clusters(self):
        # Compute optimal # cluster using gap-statistics
        gap = create_kselection_model("gap-statistic")
        gap.fit(self.matrix, self.kmeans_models.cluster_map_)
        self.assertEqual(gap.optimal_num_clusters_, 8)
    def test_silhouette_optimal_num_clusters(self):
        # Compute optimal # cluster using Silhouette Analysis
        sil = create_kselection_model("s-score")
        sil.fit(self.matrix, self.kmeans_models.cluster_map_)
        self.assertEqual(sil.optimal_num_clusters_, 2)

View File

@@ -0,0 +1,116 @@
#
# OtterTune - test_constraints.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
import unittest
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from analysis.constraints import ParamConstraintHelper
from analysis.preprocessing import DummyEncoder
class ConstraintHelperTestCase(unittest.TestCase):
    """Tests ParamConstraintHelper's scaling round-trip, constraint
    projection for dummy-encoded categoricals, and random flipping of
    categorical features."""
    def test_scale_rescale(self):
        # Scaling then un-scaling a row must round-trip (up to float
        # precision, hence the rounding below).
        X = datasets.load_boston()['data']
        X_scaler = StandardScaler()
        # params hard-coded for test (messy to import constant from website module)
        constraint_helper = ParamConstraintHelper(X_scaler, None,
                                                  init_flip_prob=0.3,
                                                  flip_prob_decay=0.5)
        X_scaled = X_scaler.fit_transform(X)
        # there may be some floating point imprecision between scaling and rescaling
        row_unscaled = np.round(constraint_helper._handle_scaling(X_scaled[0], True), 10)  # pylint: disable=protected-access
        self.assertTrue(np.all(X[0] == row_unscaled))
        row_rescaled = constraint_helper._handle_rescaling(row_unscaled, True)  # pylint: disable=protected-access
        self.assertTrue(np.all(X_scaled[0] == row_rescaled))
    def test_apply_constraints_unscaled(self):
        # A fractional one-hot group [0.1, 0.2, 0.3] must be projected to
        # a valid encoding (argmax -> [0, 0, 1]); the numeric column is
        # left unchanged.
        n_values = [3]
        categorical_features = [0]
        encoder = DummyEncoder(n_values, categorical_features, ['a'], [])
        encoder.fit([[0, 17]])
        X_scaler = StandardScaler()
        constraint_helper = ParamConstraintHelper(X_scaler, encoder,
                                                  init_flip_prob=0.3,
                                                  flip_prob_decay=0.5)
        X = [0.1, 0.2, 0.3, 17]
        X_expected = [0, 0, 1, 17]
        X_corrected = constraint_helper.apply_constraints(X, scaled=False, rescale=False)
        self.assertTrue(np.all(X_corrected == X_expected))
    def test_apply_constraints(self):
        # Same projection, but in scaled space: a small perturbation of a
        # valid encoding must be snapped back to the original row.
        n_values = [3]
        categorical_features = [0]
        encoder = DummyEncoder(n_values, categorical_features, ['a'], [])
        encoder.fit([[0, 17]])
        X_scaler = StandardScaler()
        X = np.array([[0, 0, 1, 17], [1, 0, 0, 17]], dtype=float)
        X_scaled = X_scaler.fit_transform(X)
        constraint_helper = ParamConstraintHelper(X_scaler, encoder,
                                                  init_flip_prob=0.3,
                                                  flip_prob_decay=0.5)
        row = X_scaled[0]
        new_row = np.copy(row)
        new_row[0: 3] += 0.1  # should still represent [0, 0, 1] encoding
        row_corrected = constraint_helper.apply_constraints(new_row)
        self.assertTrue(np.all(row == row_corrected))
    # tests that repeatedly applying randomize_categorical_features
    # always results in valid configurations of categorical dummy encodings
    # and will lead to all possible values of categorical variables being tried
    def test_randomize_categorical_features(self):
        # variable 0 is categorical, 3 values
        # variable 1 is not categorical
        # variable 2 is categorical, 4 values
        cat_var_0_levels = 3
        cat_var_2_levels = 4
        cat_var_0_idx = 0
        cat_var_2_idx = 2
        n_values = [cat_var_0_levels, cat_var_2_levels]
        categorical_features = [cat_var_0_idx, cat_var_2_idx]
        encoder = DummyEncoder(n_values, categorical_features, ['a', 'b'], [])
        encoder.fit([[0, 17, 0]])
        X_scaler = StandardScaler()
        constraint_helper = ParamConstraintHelper(X_scaler, encoder,
                                                  init_flip_prob=0.3,
                                                  flip_prob_decay=0.5)
        # row is a sample encoded set of features,
        # note that the non-categorical variable is on the right
        row = np.array([0, 0, 1, 1, 0, 0, 0, 17], dtype=float)
        trials = 20
        cat_var_0_counts = np.zeros(cat_var_0_levels)
        cat_var_2_counts = np.zeros(cat_var_2_levels)
        for _ in range(trials):
            # possibly flip the categorical features
            row = constraint_helper.randomize_categorical_features(row, scaled=False, rescale=False)
            # check that result is valid for cat_var_0
            cat_var_0_dummies = row[0: cat_var_0_levels]
            self.assertTrue(np.all(np.logical_or(cat_var_0_dummies == 0, cat_var_0_dummies == 1)))
            self.assertEqual(np.sum(cat_var_0_dummies), 1)
            cat_var_0_counts[np.argmax(cat_var_0_dummies)] += 1
            # check that result is valid for cat_var_2
            cat_var_2_dummies = row[cat_var_0_levels: cat_var_0_levels + cat_var_2_levels]
            self.assertTrue(np.all(np.logical_or(cat_var_2_dummies == 0, cat_var_2_dummies == 1)))
            self.assertEqual(np.sum(cat_var_2_dummies), 1)
            cat_var_2_counts[np.argmax(cat_var_2_dummies)] += 1
            # the non-categorical value must never be touched
            self.assertEqual(row[-1], 17)
        # over 20 trials every level of both variables should appear
        for ct in cat_var_0_counts:
            self.assertTrue(ct > 0)
        for ct in cat_var_2_counts:
            self.assertTrue(ct > 0)
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,61 @@
#
# OtterTune - test_gpr.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
import unittest
from sklearn import datasets
from analysis.gp import GPRNP
from analysis.gp_tf import GPR
# test numpy version GPR
class TestGPRNP(unittest.TestCase):
    """Regression tests for the NumPy GPR implementation on the Boston
    dataset; expected values are pinned to length_scale=1.0,
    magnitude=1.0, ridge=1.0."""
    @classmethod
    def setUpClass(cls):
        super(TestGPRNP, cls).setUpClass()
        # Train on the first 500 rows, predict on the remaining 6.
        boston = datasets.load_boston()
        data = boston['data']
        X_train = data[0:500]
        X_test = data[500:]
        y_train = boston['target'][0:500].reshape(500, 1)
        cls.model = GPRNP(length_scale=1.0, magnitude=1.0)
        cls.model.fit(X_train, y_train, ridge=1.0)
        cls.gpr_result = cls.model.predict(X_test)
    def test_gprnp_ypreds(self):
        ypreds_round = [round(x[0], 4) for x in self.gpr_result.ypreds]
        expected_ypreds = [0.0181, 0.0014, 0.0006, 0.0015, 0.0039, 0.0014]
        self.assertEqual(ypreds_round, expected_ypreds)
    def test_gprnp_sigmas(self):
        sigmas_round = [round(x[0], 4) for x in self.gpr_result.sigmas]
        expected_sigmas = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
        self.assertEqual(sigmas_round, expected_sigmas)
# test Tensorflow version GPR
class TestGPRTF(unittest.TestCase):
    """Regression tests for the TensorFlow GPR implementation on the
    Boston dataset.

    Expected values are pinned to length_scale=1.0, magnitude=1.0,
    ridge=1.0 and must match the NumPy implementation (TestGPRNP).
    """
    @classmethod
    def setUpClass(cls):
        super(TestGPRTF, cls).setUpClass()
        # Train on the first 500 rows, predict on the remaining 6.
        boston = datasets.load_boston()
        data = boston['data']
        X_train = data[0:500]
        X_test = data[500:]
        y_train = boston['target'][0:500].reshape(500, 1)
        cls.model = GPR(length_scale=1.0, magnitude=1.0)
        cls.model.fit(X_train, y_train, ridge=1.0)
        cls.gpr_result = cls.model.predict(X_test)
    def test_gprtf_ypreds(self):
        # Renamed from test_gprnp_ypreds: these tests exercise the
        # TensorFlow implementation, not the NumPy one.
        ypreds_round = [round(x[0], 4) for x in self.gpr_result.ypreds]
        expected_ypreds = [0.0181, 0.0014, 0.0006, 0.0015, 0.0039, 0.0014]
        self.assertEqual(ypreds_round, expected_ypreds)
    def test_gprtf_sigmas(self):
        sigmas_round = [round(x[0], 4) for x in self.gpr_result.sigmas]
        expected_sigmas = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
        self.assertEqual(sigmas_round, expected_sigmas)

View File

@@ -0,0 +1,83 @@
#
# OtterTune - test_preprocessing.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
import unittest
import numpy as np
from analysis.preprocessing import DummyEncoder, consolidate_columnlabels
class TestDummyEncoder(unittest.TestCase):
    """Tests DummyEncoder's encode / label / inverse-transform behavior."""
    def test_no_categoricals(self):
        # With no categorical features the matrix and labels pass through
        # unchanged.
        X = [[1, 2, 3], [4, 5, 6]]
        n_values = []
        categorical_features = []
        cat_columnlabels = []
        noncat_columnlabels = ['a', 'b', 'c']
        enc = DummyEncoder(n_values, categorical_features,
                           cat_columnlabels, noncat_columnlabels)
        X_encoded = enc.fit_transform(X)
        new_labels = enc.new_labels
        self.assertTrue(np.all(X == X_encoded))
        self.assertEqual(noncat_columnlabels, new_labels)
    def test_simple_categorical(self):
        # A leading 3-level categorical becomes three 0/1 columns named
        # label____0..label____2, followed by the numeric columns.
        X = [[0, 1, 2], [1, 1, 2], [2, 1, 2]]
        n_values = [3]
        categorical_features = [0]
        cat_columnlabels = ['label']
        noncat_columnlabels = ['a', 'b']
        X_expected = [[1, 0, 0, 1, 2], [0, 1, 0, 1, 2], [0, 0, 1, 1, 2]]
        new_labels_expected = ['label____0', 'label____1', 'label____2', 'a', 'b']
        enc = DummyEncoder(n_values, categorical_features,
                           cat_columnlabels, noncat_columnlabels)
        X_encoded = enc.fit_transform(X)
        new_labels = enc.new_labels
        self.assertTrue(np.all(X_expected == X_encoded))
        self.assertEqual(new_labels_expected, new_labels)
    def test_mixed_categorical(self):
        # Categorical in the middle: its dummies still come first in the
        # encoded output, non-categorical columns are stacked to the right.
        X = [[1, 0, 2], [1, 1, 2], [1, 2, 2]]
        n_values = [3]
        categorical_features = [1]
        cat_columnlabels = ['label']
        noncat_columnlabels = ['a', 'b']
        X_expected = [[1, 0, 0, 1, 2], [0, 1, 0, 1, 2], [0, 0, 1, 1, 2]]
        new_labels_expected = ['label____0', 'label____1', 'label____2', 'a', 'b']
        enc = DummyEncoder(n_values, categorical_features,
                           cat_columnlabels, noncat_columnlabels)
        X_encoded = enc.fit_transform(X)
        new_labels = enc.new_labels
        self.assertTrue(np.all(X_expected == X_encoded))
        self.assertEqual(new_labels_expected, new_labels)
    def test_consolidate(self):
        # Dummy labels collapse back to one entry per original column.
        labels = ['label1____0', 'label1____1', 'label2____0', 'label2____1', 'noncat']
        consolidated = consolidate_columnlabels(labels)
        expected = ['label1', 'label2', 'noncat']
        self.assertEqual(expected, consolidated)
    def test_inverse_transform(self):
        # encode -> decode must round-trip to the original matrix.
        X = [[1, 0, 2], [1, 1, 2], [1, 2, 2]]
        n_values = [3]
        categorical_features = [1]
        cat_columnlabels = ['label']
        noncat_columnlabels = ['a', 'b']
        X_expected = [[1, 0, 0, 1, 2], [0, 1, 0, 1, 2], [0, 0, 1, 1, 2]]
        enc = DummyEncoder(n_values, categorical_features,
                           cat_columnlabels, noncat_columnlabels)
        X_encoded = enc.fit_transform(X)
        self.assertTrue(np.all(X_encoded == X_expected))
        X_decoded = enc.inverse_transform(X_encoded)
        self.assertTrue(np.all(X == X_decoded))
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()

106
server/analysis/util.py Normal file
View File

@@ -0,0 +1,106 @@
#
# OtterTune - util.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Oct 24, 2017
@author: dva
'''
import logging
from numbers import Number
import contextlib
import datetime
import numpy as np
def get_analysis_logger(name, level=logging.INFO):
    """Return a logger for analysis modules with a console handler attached.

    Also sets numpy's global float print format as a side effect (as the
    original did).

    Args:
        name: logger name, typically the calling module's __name__.
        level: logging level to apply (default: logging.INFO).

    Returns:
        The configured logging.Logger instance.
    """
    logger = logging.getLogger(name)
    # Fix: the original attached a new StreamHandler on *every* call, so
    # calling this twice for the same name duplicated every log line.
    # Only attach a handler if the logger doesn't already have one.
    if not logger.handlers:
        log_handler = logging.StreamHandler()
        log_formatter = logging.Formatter(
            fmt='%(asctime)s [%(funcName)s:%(lineno)03d] %(levelname)-5s: %(message)s',
            datefmt='%m-%d-%Y %H:%M:%S'
        )
        log_handler.setFormatter(log_formatter)
        logger.addHandler(log_handler)
    logger.setLevel(level)
    np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
    return logger
# Module-level logger shared by the helpers in this module.
LOG = get_analysis_logger(__name__)
def stdev_zero(data, axis=None, nearzero=1e-8):
    """Return a boolean mask of entries whose standard deviation along
    ``axis`` is (near) zero.

    Fix: the original unconditionally called np.expand_dims(..., axis),
    which raises for the default axis=None, making the default unusable.
    With axis=None a single bool for the whole array is now returned.

    Args:
        data: numpy array to inspect.
        axis: axis along which to compute the std, or None for all entries.
        nearzero: threshold below which a std counts as zero.
    """
    mstd = data.std(axis=axis)
    if axis is None:
        # Scalar std over the whole array -> a single bool.
        return bool(np.abs(mstd) < nearzero)
    # Re-insert the reduced axis (as the original did), then squeeze the
    # mask back down so the result shape matches the original behavior.
    mstd = np.expand_dims(mstd, axis=axis)
    return (np.abs(mstd) < nearzero).squeeze()
def get_datetime():
    """Return the current UTC time as a timezone-naive datetime.

    NOTE(review): utcnow() yields a naive value; callers here (e.g.
    TimerStruct) only subtract two such values, which is safe as long as
    naive and aware datetimes are never mixed.
    """
    return datetime.datetime.utcnow()
class TimerStruct(object):
    """A simple wall-clock stopwatch.

    Call start(), then stop(); elapsed_seconds then reports the frozen
    duration between the two. While the timer is still running (stop()
    not yet called), elapsed_seconds reports the time since start().
    """
    def __init__(self):
        self.__start_time = 0.0
        self.__stop_time = 0.0
        self.__elapsed = None

    @property
    def elapsed_seconds(self):
        # Once stop() has run, report the frozen duration; otherwise
        # measure against "now".
        if self.__elapsed is not None:
            return self.__elapsed.total_seconds()
        now = datetime.datetime.utcnow()
        return (now - self.__start_time).total_seconds()

    def start(self):
        self.__start_time = datetime.datetime.utcnow()

    def stop(self):
        self.__stop_time = datetime.datetime.utcnow()
        self.__elapsed = self.__stop_time - self.__start_time
@contextlib.contextmanager
def stopwatch(message=None):
    """Context manager that times the execution of its body.

    Yields a TimerStruct so the caller can read elapsed_seconds; if
    `message` is given, the total elapsed time is logged on exit (even
    when the body raises, via the finally block).
    """
    ts = TimerStruct()
    ts.start()
    try:
        yield ts
    finally:
        ts.stop()
        if message is not None:
            LOG.info('Total elapsed_seconds time for %s: %.3fs', message, ts.elapsed_seconds)
def get_data_base(arr):
    """For a given Numpy array, walk the ``.base`` chain and return the
    array that actually owns the underlying data buffer."""
    owner = arr
    while isinstance(owner.base, np.ndarray):
        owner = owner.base
    return owner
def arrays_share_data(x, y):
    """True iff x and y are ultimately backed by the same data buffer
    (i.e. their base-chain walks end at the identical owning array)."""
    base_x = x
    while isinstance(base_x.base, np.ndarray):
        base_x = base_x.base
    base_y = y
    while isinstance(base_y.base, np.ndarray):
        base_y = base_y.base
    return base_x is base_y
def array_tostring(arr):
    """Return a new array of the same shape whose entries are the str()
    representations of the originals."""
    original_shape = arr.shape
    as_strings = np.array([str(item) for item in arr.ravel()])
    return as_strings.reshape(original_shape)
def is_numeric_matrix(matrix):
    """True if the (non-empty) matrix holds numeric entries, judged by
    inspecting its first element only."""
    assert matrix.size > 0
    first_entry = matrix.ravel()[0]
    return isinstance(first_entry, Number)
def is_lexical_matrix(matrix):
    """True if the (non-empty) matrix holds string entries, judged by
    inspecting its first element only."""
    assert matrix.size > 0
    first_entry = matrix.ravel()[0]
    return isinstance(first_entry, str)