Initial commit with BSL

This commit is contained in:
Andy Pavlo
2019-08-23 11:47:19 -04:00
commit 3e564ce922
286 changed files with 177642 additions and 0 deletions

View File

@@ -0,0 +1,5 @@
#
# OtterTune - __init__.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#

19
server/analysis/base.py Normal file
View File

@@ -0,0 +1,19 @@
#
# OtterTune - base.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Oct 25, 2017
@author: dva
'''
from abc import ABCMeta, abstractmethod
class ModelBase(object, metaclass=ABCMeta):
    """Abstract base class for all analysis models.

    Concrete models implement ``_reset`` to erase any fitted state so the
    same instance can be re-fit from scratch.
    """

    @abstractmethod
    def _reset(self):
        """Erase all fitted state from the model."""

793
server/analysis/cluster.py Normal file
View File

@@ -0,0 +1,793 @@
#
# OtterTune - cluster.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Jul 4, 2016
@author: dva
'''
from abc import ABCMeta, abstractproperty
from collections import OrderedDict
import os
import json
import copy
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans as SklearnKMeans
from celery.utils.log import get_task_logger
from .base import ModelBase
# Log debug messages
LOGGER = get_task_logger(__name__)
class KMeans(ModelBase):
    """
    KMeans:

    Fits an Sklearn KMeans model to X.

    See also
    --------
    http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

    Attributes
    ----------
    n_clusters_ : int
        The number of clusters, K
    cluster_inertia_ : float
        Sum of squared distances of samples to their closest cluster center
    cluster_labels_ : array, [n_clusters_]
        Labels indicating the membership of each point
    cluster_centers_ : array, [n_clusters, n_features]
        Coordinates of cluster centers
    sample_labels_ : array, [n_samples]
        Labels for each of the samples in X
    sample_distances_ : array, [n_samples]
        The distance between each sample point and its cluster's center

    Constants
    ---------
    SAMPLE_CUTOFF_ : int
        If n_samples > SAMPLE_CUTOFF_ then sample distances
        are NOT recorded
    """

    # NOTE(review): SAMPLE_CUTOFF_ is documented above but never referenced
    # anywhere in this class -- confirm whether the cutoff was meant to be
    # enforced in fit().
    SAMPLE_CUTOFF_ = 1000

    def __init__(self):
        # model_ holds the fitted sklearn.cluster.KMeans instance
        self.model_ = None
        self.n_clusters_ = None
        self.sample_labels_ = None
        self.sample_distances_ = None

    @property
    def cluster_inertia_(self):
        # Sum of squared distances of samples to their closest cluster center
        return None if self.model_ is None else \
            self.model_.inertia_

    @property
    def cluster_labels_(self):
        # Cluster membership labels for each point (deep-copied so callers
        # cannot mutate the fitted model's internal state)
        return None if self.model_ is None else \
            copy.deepcopy(self.model_.labels_)

    @property
    def cluster_centers_(self):
        # Coordinates of the cluster centers (deep-copied, see above)
        return None if self.model_ is None else \
            copy.deepcopy(self.model_.cluster_centers_)

    def _reset(self):
        """Resets all attributes (erases the model)"""
        self.model_ = None
        self.n_clusters_ = None
        self.sample_labels_ = None
        self.sample_distances_ = None

    def fit(self, X, K, sample_labels=None, estimator_params=None):
        """Fits a Sklearn KMeans model to X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.
        K : int
            The number of clusters.
        sample_labels : array-like, shape (n_samples), optional
            Labels for each of the samples in X.
        estimator_params : dict, optional
            The parameters to pass to the KMeans estimators.

        Returns
        -------
        self, or None if any resulting cluster is empty (callers treat
        None as "K is too large for this data").
        """
        self._reset()
        # Note: previously set n_init=50
        self.model_ = SklearnKMeans(K)
        if estimator_params is not None:
            assert isinstance(estimator_params, dict)
            self.model_.set_params(**estimator_params)

        # Compute Kmeans model
        self.model_.fit(X)
        if sample_labels is None:
            # Default labels: one per row (sample) of X
            sample_labels = ["sample_{}".format(i) for i in range(X.shape[0])]
        assert len(sample_labels) == X.shape[0]
        self.sample_labels_ = np.array(sample_labels)
        self.n_clusters_ = K

        # Record each sample's label and its distance from its cluster center
        self.sample_distances_ = OrderedDict()
        for cluster_label in range(self.n_clusters_):
            assert cluster_label not in self.sample_distances_
            member_rows = X[self.cluster_labels_ == cluster_label, :]
            member_labels = self.sample_labels_[self.cluster_labels_ == cluster_label]
            centroid = np.expand_dims(self.cluster_centers_[cluster_label], axis=0)

            # "All clusters must have at least 1 member!"
            # An empty cluster invalidates this choice of K; signal by
            # returning None instead of raising.
            if member_rows.shape[0] == 0:
                return None

            # Calculate the Euclidean distance between each member row and
            # the current cluster's centroid
            dists = np.empty(member_rows.shape[0])
            dist_labels = []
            for j, (row, label) in enumerate(zip(member_rows, member_labels)):
                dists[j] = cdist(np.expand_dims(row, axis=0), centroid, "euclidean").squeeze()
                dist_labels.append(label)

            # Sort the distances/labels in ascending order
            sort_order = np.argsort(dists)
            dists = dists[sort_order]
            dist_labels = np.array(dist_labels)[sort_order]
            self.sample_distances_[cluster_label] = {
                "sample_labels": dist_labels,
                "distances": dists,
            }
        return self

    def get_closest_samples(self):
        """Returns a list of the labels of the samples that are located closest
        to their cluster's center.

        Returns
        ----------
        closest_samples : list
            A list of the sample labels that are located the closest to
            their cluster's center.
        """
        if self.sample_distances_ is None:
            raise Exception("No model has been fit yet!")
        # Distances are stored sorted ascending (see fit), so index 0 is
        # the closest sample in each cluster.
        return [samples['sample_labels'][0] for samples in list(self.sample_distances_.values())]

    def get_memberships(self):
        '''
        Return the memberships in each cluster as a JSON string mapping
        each cluster label to an ordered {sample_label: distance} mapping
        (distances ascending).
        '''
        memberships = OrderedDict()
        for cluster_label, samples in list(self.sample_distances_.items()):
            memberships[cluster_label] = OrderedDict(
                [(l, d) for l, d in zip(samples["sample_labels"], samples["distances"])])
        return json.dumps(memberships, indent=4)
class KMeansClusters(ModelBase):
    """
    KMeansClusters:

    Fits a KMeans model to X for clusters in the range [min_cluster_, max_cluster_].

    Attributes
    ----------
    min_cluster_ : int
        The minimum cluster size to fit a KMeans model to
    max_cluster_ : int
        The maximum cluster size to fit a KMeans model to
    cluster_map_ : dict
        A dictionary mapping the cluster size (K) to the KMeans
        model fitted to X with K clusters
    sample_labels_ : array, [n_samples]
        Labels for each of the samples in X
    """

    def __init__(self):
        self.min_cluster_ = None
        self.max_cluster_ = None
        self.cluster_map_ = None
        self.sample_labels_ = None

    def _reset(self):
        """Resets all attributes (erases the model)"""
        self.min_cluster_ = None
        self.max_cluster_ = None
        self.cluster_map_ = None
        self.sample_labels_ = None

    def fit(self, X, min_cluster, max_cluster, sample_labels=None, estimator_params=None):
        """Fits a KMeans model to X for each cluster in the range [min_cluster, max_cluster].

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.
        min_cluster : int
            The minimum cluster size to fit a KMeans model to.
        max_cluster : int
            The maximum cluster size to fit a KMeans model to.
        sample_labels : array-like, shape (n_samples), optional
            Labels for each of the samples in X.
        estimator_params : dict, optional
            The parameters to pass to the KMeans estimators.

        Returns
        -------
        self
        """
        self._reset()
        self.min_cluster_ = min_cluster
        self.max_cluster_ = max_cluster
        self.cluster_map_ = {}
        if sample_labels is None:
            # BUGFIX: default labels must be one per *sample* (row), not one
            # per feature. KMeans.fit asserts
            # len(sample_labels) == X.shape[0], so the previous
            # range(X.shape[1]) raised an AssertionError for any non-square X.
            sample_labels = ["sample_{}".format(i) for i in range(X.shape[0])]
        self.sample_labels_ = sample_labels
        for K in range(self.min_cluster_, self.max_cluster_ + 1):
            tmp = KMeans().fit(X, K, self.sample_labels_, estimator_params)
            if tmp is None:  # Set maximum cluster
                # KMeans.fit returns None when a cluster came back empty,
                # i.e. K is already too large for this data set; cap the
                # range and stop.
                assert K > min_cluster, "min_cluster is too large for the model"
                self.max_cluster_ = K - 1
                break
            else:
                self.cluster_map_[K] = tmp
        return self

    def save(self, savedir):
        """Saves the KMeans model results

        Parameters
        ----------
        savedir : string
            Path to the directory to save the results in.
        """
        if self.cluster_map_ is None:
            raise Exception("No models have been fitted yet!")

        cluster_map = OrderedDict()
        inertias = []
        for K, model in sorted(self.cluster_map_.items()):
            cluster_map[K] = {
                "cluster_inertia": model.cluster_inertia_,
                "cluster_labels": model.cluster_labels_,
                "cluster_centers": model.cluster_centers_,
            }
            inertias.append(model.cluster_inertia_)

        # Save sum of squares plot (elbow curve)
        fig = plt.figure()
        plt.plot(list(cluster_map.keys()), inertias, '--o')
        plt.xlabel("Number of clusters (K)")
        plt.ylabel("Within sum of squares W_k")
        plt.title("Within Sum of Squares vs. Number of Clusters")
        fig.canvas.set_window_title(os.path.basename(savedir))
        savepath = os.path.join(savedir, "kmeans_sum_of_squares.pdf")
        plt.savefig(savepath, bbox_inches="tight")
        plt.close()

        # Save cluster memberships, one JSON file per cluster size
        for K in range(self.min_cluster_, self.max_cluster_ + 1):
            savepath = os.path.join(savedir,
                                    "memberships_{}-clusters.json".format(K))
            members = self.cluster_map_[K].get_memberships()
            with open(savepath, "w") as f:
                f.write(members)
class KSelection(ModelBase, metaclass=ABCMeta):
    """KSelection:

    Abstract base for techniques that approximate the optimal number of
    clusters (K) from a map of fitted KMeans models.

    Attributes
    ----------
    optimal_num_clusters_ : int
        An estimation of the optimal number of clusters K for
        a KMeans model fit to X
    clusters_ : array, [n_clusters]
        The sizes of the clusters
    name_ : string
        The name of this technique
    """

    # Subclasses override this with the technique's short identifier.
    NAME_ = None

    def __init__(self):
        self.clusters_ = None
        self.optimal_num_clusters_ = None

    def _reset(self):
        """Resets all attributes (erases the model)"""
        self.clusters_ = None
        self.optimal_num_clusters_ = None

    @abstractproperty
    def name_(self):
        pass

    def save(self, savedir):
        """Saves the estimation of the optimal # of clusters.

        Parameters
        ----------
        savedir : string
            Path to the directory to save the results in.
        """
        if self.optimal_num_clusters_ is None:
            raise Exception("Optimal number of clusters has not been computed!")
        # One small text file per technique, named after the technique.
        filename = self.name_ + "_optimal_num_clusters.txt"
        with open(os.path.join(savedir, filename), "w") as f:
            f.write(str(self.optimal_num_clusters_))
class GapStatistic(KSelection):
    """GapStatistic:

    Approximates the optimal number of clusters (K).

    References
    ----------
    https://web.stanford.edu/~hastie/Papers/gap.pdf

    Attributes
    ----------
    optimal_num_clusters_ : int
        An estimation of the optimal number of clusters K for
        a KMeans model fit to X
    clusters_ : array, [n_clusters]
        The sizes of the clusters
    name_ : string
        The name of this technique
    log_wks_ : array, [n_clusters]
        The within-dispersion measures of X (log)
    log_wkbs_ : array, [n_clusters]
        The within-dispersion measures of the generated
        reference data sets
    khats_ : array, [n_clusters]
        The gap-statistic for each cluster
    """

    NAME_ = "gap-statistic"

    def __init__(self):
        super(GapStatistic, self).__init__()
        self.log_wks_ = None
        self.log_wkbs_ = None
        self.khats_ = None

    @property
    def name_(self):
        return self.NAME_

    def _reset(self):
        """Resets all attributes (erases the model)"""
        super(GapStatistic, self)._reset()
        self.log_wks_ = None
        self.log_wkbs_ = None
        self.khats_ = None

    def fit(self, X, cluster_map, n_b=50):
        """Estimates the optimal number of clusters (K) for a
        KMeans model trained on X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.
        cluster_map : dict
            A dictionary mapping each cluster size (K) to the KMeans
            model fitted to X with K clusters
        n_b : int
            The number of reference data sets to generate

        Returns
        -------
        self
        """
        self._reset()
        mins, maxs = GapStatistic.bounding_box(X)
        n_clusters = len(cluster_map)

        # Dispersion for real distribution
        log_wks = np.zeros(n_clusters)
        log_wkbs = np.zeros(n_clusters)
        sk = np.zeros(n_clusters)
        for indk, (K, model) in enumerate(sorted(cluster_map.items())):
            # Computes Wk: the within-dispersion of each cluster size (k),
            # derived directly from the fitted model's inertia
            log_wks[indk] = np.log(model.cluster_inertia_ / (2.0 * K))

            # Create B reference datasets drawn uniformly from X's bounding
            # box and compute the same log-dispersion for each
            log_bwkbs = np.zeros(n_b)
            for i in range(n_b):
                Xb = np.empty_like(X)
                for j in range(X.shape[1]):
                    Xb[:, j] = np.random.uniform(mins[j], maxs[j], size=X.shape[0])
                # NOTE(review): KMeans.fit returns None when any cluster is
                # empty, which would raise AttributeError on the next line --
                # confirm empty clusters cannot occur for the uniform
                # reference data.
                Xb_model = KMeans().fit(Xb, K)
                log_bwkbs[i] = np.log(Xb_model.cluster_inertia_ / (2.0 * K))
            log_wkbs[indk] = sum(log_bwkbs) / n_b
            # Standard deviation of the reference dispersions
            sk[indk] = np.sqrt(sum((log_bwkbs - log_wkbs[indk]) ** 2) / n_b)
        # Simulation-error correction factor from the paper
        sk = sk * np.sqrt(1 + 1.0 / n_b)

        khats = np.zeros(n_clusters)
        gaps = log_wkbs - log_wks
        gsks = gaps - sk
        # khats[i] = gap(K_{i-1}) - (gap(K_i) - s(K_i)); khats[0] stays 0
        khats[1:] = gaps[0:-1] - gsks[1:]
        self.clusters_ = np.array(sorted(cluster_map.keys()))
        # Choose the smallest K such that gap(K) >= gap(K+1) - s(K+1)
        for i in range(1, n_clusters):
            if gaps[i - 1] >= gsks[i]:
                self.optimal_num_clusters_ = self.clusters_[i - 1]
                break

        if self.optimal_num_clusters_ is None:
            # No K satisfied the criterion; fall back to the largest K
            LOGGER.info("GapStatistic NOT found the optimal k, \
                        use the last(maximum) k instead ")
            self.optimal_num_clusters_ = self.clusters_[-1]

        self.log_wks_ = log_wks
        self.log_wkbs_ = log_wkbs
        self.khats_ = khats
        return self

    @staticmethod
    def bounding_box(X):
        """Computes the box that tightly bounds X

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        Returns
        -------
        The mins and maxs that make up the bounding box
        """
        mins = np.min(X, axis=0)
        maxs = np.max(X, axis=0)
        return mins, maxs

    @staticmethod
    def Wk(X, mu, cluster_labels):
        """Computes the within-dispersion of each cluster size (k)

        NOTE(review): unused by fit(), which derives log(Wk) from
        model.cluster_inertia_ instead.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.
        mu : array-like, shape (n_clusters, n_features)
            Coordinates of cluster centers
        cluster_labels: array-like, shape (n_samples)
            Labels for each of the samples in X.

        Returns
        -------
        The within-dispersion of each cluster (K)
        """
        K = len(mu)
        return sum([np.linalg.norm(mu[i] - x) ** 2 / (2.0 * K)
                    for i in range(K)
                    for x in X[cluster_labels == i]])

    def save(self, savedir):
        """Saves the estimation results of the optimal # of clusters.

        Parameters
        ----------
        savedir : string
            Path to the directory to save the results in.
        """
        super(GapStatistic, self).save(savedir)

        # Plot the calculated gap
        gaps = self.log_wkbs_ - self.log_wks_
        fig = plt.figure()
        plt.plot(self.clusters_, gaps, '--o')
        plt.title("Gap vs. Number of Clusters")
        plt.xlabel("Number of clusters (K)")
        plt.ylabel("gap_K")
        fig.canvas.set_window_title(os.path.basename(savedir))
        plt.savefig(os.path.join(savedir, self.name_ + ".pdf"), bbox_inches="tight")
        plt.close()

        # Plot the gap statistic
        fig = plt.figure()
        plt.bar(self.clusters_, self.khats_)
        plt.title("Gap Statistic vs. Number of Clusters")
        plt.xlabel("Number of clusters (K)")
        plt.ylabel("gap(K)-(gap(K+1)-s(K+1))")
        fig.canvas.set_window_title(os.path.basename(savedir))
        plt.savefig(os.path.join(savedir, self.name_ + "_final.pdf"),
                    bbox_inches="tight")
        plt.close()
class DetK(KSelection):
    """DetK:

    Approximates the optimal number of clusters (K) using the f(K)
    evaluation function of Pham, Dimov & Nguyen (2005).

    References
    ----------
    https://www.ee.columbia.edu/~dpwe/papers/PhamDN05-kmeans.pdf

    Attributes
    ----------
    optimal_num_clusters_ : int
        An estimation of the optimal number of clusters K for
        KMeans models fit to X
    clusters_ : array, [n_clusters]
        The sizes of the clusters
    name_ : string
        The name of this technique
    fs_ : array, [n_clusters]
        The computed evaluation functions F(K) for each cluster size K
    """

    NAME_ = "det-k"

    def __init__(self):
        super(DetK, self).__init__()
        self.fs_ = None

    @property
    def name_(self):
        return DetK.NAME_

    def _reset(self):
        """Resets all attributes (erases the model)"""
        super(DetK, self)._reset()
        self.fs_ = None

    def fit(self, X, cluster_map):
        """Estimates the optimal number of clusters (K) for a
        KMeans model trained on X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data (only the feature count X.shape[1] is used).
        cluster_map : dict
            A dictionary mapping each cluster size (K) to the KMeans
            model fitted to X with K clusters

        Returns
        -------
        self
        """
        self._reset()
        n_clusters = len(cluster_map)
        nd = X.shape[1]  # number of features (dimensions)
        fs = np.empty(n_clusters)
        sks = np.empty(n_clusters)

        # Precompute the weight factor alpha(K, nd) for every K up to the
        # largest cluster size. (Fixed: alpha was previously computed inside
        # the main loop from alpha[K - 1], which raised a KeyError when the
        # smallest K in cluster_map was > 2 or the K values were not
        # contiguous. Values are identical for the contiguous case.)
        alpha = {}
        max_k = max(cluster_map.keys()) if cluster_map else 0
        for K in range(2, max_k + 1):
            if K == 2:
                alpha[K] = 1 - 3.0 / (4 * nd)
            else:  # K > 2
                alpha[K] = alpha[K - 1] + (1 - alpha[K - 1]) / 6.0

        # Evaluation function f(K): ratio of the current distortion to the
        # alpha-weighted previous one. K values are visited in ascending
        # order so sks[i - 1] is the previous cluster size's distortion.
        for i, (K, model) in enumerate(sorted(cluster_map.items())):
            sks[i] = model.cluster_inertia_
            if K == 1 or i == 0:
                # No previous distortion to compare against.
                # (Fixed: when the smallest K was > 1, the previous version
                # read the uninitialized sks[-1] slot of np.empty here.)
                fs[i] = 1
            elif sks[i - 1] == 0:
                # Previous clustering was perfect; avoid division by zero
                fs[i] = 1
            else:
                fs[i] = sks[i] / (alpha[K] * sks[i - 1])
        self.clusters_ = np.array(sorted(cluster_map.keys()))
        # The K that minimizes f(K) gives the best trade-off
        self.optimal_num_clusters_ = self.clusters_[np.argmin(fs)]
        self.fs_ = fs
        return self

    def save(self, savedir):
        """Saves the estimation results of the optimal # of clusters.

        Parameters
        ----------
        savedir : string
            Path to the directory to save the results in.
        """
        super(DetK, self).save(savedir)

        # Plot the evaluation function
        fig = plt.figure()
        plt.plot(self.clusters_, self.fs_, '--o')
        plt.xlabel("Number of clusters (K)")
        plt.ylabel("Evaluation function (F_k)")
        plt.title("Evaluation Function vs. Number of Clusters")
        fig.canvas.set_window_title(os.path.basename(savedir))
        savepath = os.path.join(savedir, self.name_ + "_eval_function.pdf")
        plt.savefig(savepath, bbox_inches="tight")
        plt.close()
class Silhouette(KSelection):
    """Silhouette:

    Approximates the optimal number of clusters (K) using the mean
    Silhouette Coefficient.

    References
    ----------
    http://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html

    Attributes
    ----------
    optimal_num_clusters_ : int
        An estimation of the optimal number of clusters K for
        KMeans models fit to X
    clusters_ : array, [n_clusters]
        The sizes of the clusters
    name_ : string
        The name of this technique
    scores_ : array, [n_clusters]
        The mean Silhouette Coefficient for each cluster size K
    """

    # short for Silhouette score
    NAME_ = "s-score"

    def __init__(self):
        super(Silhouette, self).__init__()
        self.scores_ = None

    @property
    def name_(self):
        return Silhouette.NAME_

    def _reset(self):
        """Resets all attributes (erases the model)"""
        super(Silhouette, self)._reset()
        self.scores_ = None

    def fit(self, X, cluster_map):
        """Estimates the optimal number of clusters (K) for a
        KMeans model trained on X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.
        cluster_map : dict
            A dictionary mapping each cluster size (K) to the KMeans
            model fitted to X with K clusters

        Returns
        -------
        self
        """
        self._reset()
        n_clusters = len(cluster_map)
        # scores = np.empty(n_clusters)
        # np.zeros (not np.empty) so entries skipped below (K <= 1) hold a
        # valid score of 0 rather than garbage
        scores = np.zeros(n_clusters)
        for i, (K, model) \
                in enumerate(sorted(cluster_map.items())):
            if K <= 1:  # silhouette_score is only defined for K >= 2
                continue
            scores[i] = silhouette_score(X, model.cluster_labels_)
        self.clusters_ = np.array(sorted(cluster_map.keys()))
        # The K with the highest mean silhouette score wins
        self.optimal_num_clusters_ = self.clusters_[np.argmax(scores)]
        self.scores_ = scores
        return self

    def save(self, savedir):
        """Saves the estimation results of the optimal # of clusters.

        Parameters
        ----------
        savedir : string
            Path to the directory to save the results in.
        """
        super(Silhouette, self).save(savedir)

        # Plot the silhouette scores per cluster size
        fig = plt.figure()
        plt.plot(self.clusters_, self.scores_, '--o')
        plt.xlabel("Number of clusters (K)")
        plt.ylabel("Silhouette scores")
        plt.title("Silhouette Scores vs. Number of Clusters")
        fig.canvas.set_window_title(os.path.basename(savedir))
        savepath = os.path.join(savedir, self.name_ + "_eval_function.pdf")
        plt.savefig(savepath, bbox_inches="tight")
        plt.close()
def create_kselection_model(model_name):
    """Constructs the KSelection model object with the given name

    Parameters
    ----------
    model_name : string
        Name of the KSelection model.
        One of ['gap-statistic', 'det-k', 's-score']

    Returns
    -------
    The constructed model object
    """
    # Registry keyed by each technique's NAME_ constant.
    registry = {cls.NAME_: cls for cls in (DetK, GapStatistic, Silhouette)}
    if model_name in registry:
        return registry[model_name]()
    raise Exception("KSelection model {} not supported!".format(model_name))

View File

@@ -0,0 +1,115 @@
#
# OtterTune - constraints.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Sep 8, 2016
@author: dvanaken
'''
import numpy as np
class ParamConstraintHelper(object):
    """Snaps continuous configuration samples back to the feasible region.

    Dummy-encoded categorical feature groups are forced to valid one-hot
    vectors and binary features are rounded to 0/1, optionally converting
    between the scaler's normalized space and raw space.
    """

    def __init__(self, scaler, encoder=None, binary_vars=None,
                 init_flip_prob=0.3, flip_prob_decay=0.5):
        # scaler: must provide transform(X)/inverse_transform(X)
        # encoder: assumed to expose `n_values` and an `encoder` attribute
        #          holding the underlying one-hot encoder -- TODO confirm
        #          against the project's encoder wrapper class
        # binary_vars: indices of features constrained to {0, 1}
        # init_flip_prob / flip_prob_decay: control how many categorical
        #          features get re-randomized in
        #          randomize_categorical_features()
        if 'inverse_transform' not in dir(scaler):
            raise Exception("Scaler object must provide function inverse_transform(X)")
        if 'transform' not in dir(scaler):
            raise Exception("Scaler object must provide function transform(X)")
        self.scaler_ = scaler
        if encoder is not None and len(encoder.n_values) > 0:
            self.is_dummy_encoded_ = True
            self.encoder_ = encoder.encoder
        else:
            # NOTE(review): self.encoder_ is left unset on this branch; safe
            # only because every read is guarded by is_dummy_encoded_.
            self.is_dummy_encoded_ = False
        self.binary_vars_ = binary_vars
        self.init_flip_prob_ = init_flip_prob
        self.flip_prob_decay_ = flip_prob_decay

    def apply_constraints(self, sample, scaled=True, rescale=True):
        """Returns a copy of `sample` with categorical/binary constraints
        applied.

        scaled : whether `sample` is in the scaler's normalized space.
        rescale : whether to return the result in normalized space.
        """
        conv_sample = self._handle_scaling(sample, scaled)

        if self.is_dummy_encoded_:
            # apply categorical (ie enum var, >=3 values) constraints:
            # keep only the argmax of each dummy-encoded group, producing a
            # valid one-hot vector
            n_values = self.encoder_.n_values_
            cat_start_indices = self.encoder_.feature_indices_
            for i, nvals in enumerate(n_values):
                start_idx = cat_start_indices[i]
                cvals = conv_sample[start_idx: start_idx + nvals]
                cvals = np.array(np.arange(nvals) == np.argmax(cvals), dtype=float)
                assert np.sum(cvals) == 1
                conv_sample[start_idx: start_idx + nvals] = cvals

        # apply binary (0-1) constraints
        if self.binary_vars_ is not None:
            for i in self.binary_vars_:
                # round to closest
                if conv_sample[i] >= 0.5:
                    conv_sample[i] = 1
                else:
                    conv_sample[i] = 0

        conv_sample = self._handle_rescaling(conv_sample, rescale)
        return conv_sample

    def _handle_scaling(self, sample, scaled):
        # Convert `sample` to raw (un-normalized) space. Both branches
        # return a fresh array so later in-place edits do not mutate the
        # caller's data.
        if scaled:
            if sample.ndim == 1:
                sample = sample.reshape(1, -1)
            sample = self.scaler_.inverse_transform(sample).ravel()
        else:
            sample = np.array(sample)
        return sample

    def _handle_rescaling(self, sample, rescale):
        # Optionally map the raw-space sample back into normalized space.
        if rescale:
            if sample.ndim == 1:
                sample = sample.reshape(1, -1)
            return self.scaler_.transform(sample).ravel()
        return sample

    def randomize_categorical_features(self, sample, scaled=True, rescale=True):
        """Randomly reassigns a subset of categorical feature groups to a
        different one-hot value (useful to escape local optima)."""
        # If there are no categorical features, this function is a no-op.
        if not self.is_dummy_encoded_:
            return sample
        n_values = self.encoder_.n_values_
        cat_start_indices = self.encoder_.feature_indices_
        n_cat_feats = len(n_values)

        conv_sample = self._handle_scaling(sample, scaled)
        flips = np.zeros((n_cat_feats,), dtype=bool)

        # Always flip at least one categorical feature
        flips[0] = True

        # Flip the rest with decreasing probability
        p = self.init_flip_prob_
        for i in range(1, n_cat_feats):
            if np.random.rand() <= p:
                flips[i] = True
            p *= self.flip_prob_decay_

        # Shuffle the flip mask so the guaranteed flip does not always land
        # on the same feature
        flip_shuffle_indices = np.random.choice(np.arange(n_cat_feats),
                                                n_cat_feats,
                                                replace=False)
        flips = flips[flip_shuffle_indices]

        for i, nvals in enumerate(n_values):
            if flips[i]:
                start_idx = cat_start_indices[i]
                current_val = conv_sample[start_idx: start_idx + nvals]
                assert np.all(np.logical_or(current_val == 0, current_val == 1)), \
                    "categorical {0}: value not 0/1: {1}".format(i, current_val)

                # Pick uniformly among the values that are NOT currently set
                choices = np.arange(nvals)[current_val != 1]
                assert choices.size == nvals - 1
                r = np.zeros(nvals)
                r[np.random.choice(choices)] = 1
                assert np.sum(r) == 1
                conv_sample[start_idx: start_idx + nvals] = r

        conv_sample = self._handle_rescaling(conv_sample, rescale)
        return conv_sample

View File

@@ -0,0 +1,111 @@
#
# OtterTune - factor_analysis.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Jul 4, 2016
@author: dvanaken
'''
import numpy as np
from sklearn.decomposition import FactorAnalysis as SklearnFactorAnalysis
from .base import ModelBase
class FactorAnalysis(ModelBase):
    """FactorAnalysis (FA):

    Fits an Sklearn FactorAnalysis model to X.

    See also
    --------
    http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.FactorAnalysis.html

    Attributes
    ----------
    model_ : sklearn.decomposition.FactorAnalysis
        The fitted FA model
    components_ : array, [n_components, n_features]
        Components (i.e., factors) with maximum variance
    feature_labels_ : array, [n_features]
        Labels for each of the features in X
    total_variance_ : float
        The total amount of variance explained by the components
    pvars_ : array, [n_components]
        The percentage of the variance explained by each component
    pvars_noise_ : array, [n_components]
        The percentage of the variance explained by each component also
        accounting for noise
    """

    def __init__(self):
        # All attributes start out empty until fit() is called.
        self._reset()

    def _reset(self):
        """Resets all attributes (erases the model)"""
        self.model_ = None
        self.components_ = None
        self.feature_labels_ = None
        self.total_variance_ = None
        self.pvars_ = None
        self.pvars_noise_ = None

    def fit(self, X, feature_labels=None, n_components=None, estimator_params=None):
        """Fits an Sklearn FA model to X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.
        feature_labels : array-like, shape (n_features), optional
            Labels for each of the features in X.
        n_components : int, optional
            The number of factors to keep (forwarded to sklearn).
        estimator_params : dict, optional
            The parameters to pass to Sklearn's FA estimators.

        Returns
        -------
        self
        """
        self._reset()
        if feature_labels is None:
            feature_labels = ["feature_{}".format(i) for i in range(X.shape[1])]
        self.feature_labels_ = feature_labels

        # Build and (optionally) configure the underlying sklearn estimator.
        kwargs = {} if n_components is None else {"n_components": n_components}
        self.model_ = SklearnFactorAnalysis(**kwargs)
        if estimator_params is not None:
            assert isinstance(estimator_params, dict)
            self.model_.set_params(**estimator_params)
        self.model_.fit(X)

        # Keep only components that are nonzero somewhere
        # (n_components x n_features).
        keep = np.any(self.model_.components_ != 0.0, axis=1)
        self.components_ = self.model_.components_[keep]

        # Percentage of variance explained, with and without the noise term.
        sq_norms = np.sum(self.components_ ** 2, axis=1)
        self.total_variance_ = np.sum(sq_norms)
        self.pvars_ = 100 * sq_norms / self.total_variance_
        self.pvars_noise_ = 100 * sq_norms / (self.total_variance_ +
                                              np.sum(self.model_.noise_variance_))
        return self

148
server/analysis/gp.py Normal file
View File

@@ -0,0 +1,148 @@
#
# OtterTune - gp.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Feb 18, 2018
@author: Bohan Zhang
'''
import numpy as np
from scipy.spatial.distance import cdist as ed
from scipy import special
from analysis.gp_tf import GPRResult
# numpy version of Gaussian Process Regression, not using Tensorflow
class GPRNP(object):
    """Gaussian Process Regression implemented with NumPy only (no
    Tensorflow).

    Uses the kernel k(a, b) = magnitude * exp(-||a - b|| / length_scale)
    (note: the Euclidean distance is NOT squared) plus a per-sample ridge
    term on the diagonal.
    """

    def __init__(self, length_scale=1.0, magnitude=1.0, max_train_size=7000,
                 batch_size=3000, check_numerics=True, debug=False):
        # length_scale / magnitude: positive scalar kernel hyperparameters
        # max_train_size: hard cap on the number of training rows
        # batch_size: number of test rows predicted per batch in predict()
        # check_numerics / debug: stored but not referenced anywhere in this
        #   class -- NOTE(review): possibly kept for interface parity with
        #   the Tensorflow GPR; confirm
        assert np.isscalar(length_scale)
        assert np.isscalar(magnitude)
        assert length_scale > 0 and magnitude > 0
        self.length_scale = length_scale
        self.magnitude = magnitude
        self.max_train_size_ = max_train_size
        self.batch_size_ = batch_size
        self.check_numerics = check_numerics
        self.debug = debug
        self.X_train = None
        self.y_train = None
        self.K = None
        self.K_inv = None
        self.y_best = None

    def __repr__(self):
        # Dump all attributes, sorted by name, one per line
        rep = ""
        for k, v in sorted(self.__dict__.items()):
            rep += "{} = {}\n".format(k, v)
        return rep

    def __str__(self):
        return self.__repr__()

    def _reset(self):
        """Erases all trained state."""
        self.X_train = None
        self.y_train = None
        self.K = None
        self.K_inv = None
        self.y_best = None

    def check_X_y(self, X, y):
        """Validates training inputs via sklearn's check_X_y and enforces
        the max_train_size_ cap."""
        from sklearn.utils.validation import check_X_y
        if X.shape[0] > self.max_train_size_:
            raise Exception("X_train size cannot exceed {} ({})"
                            .format(self.max_train_size_, X.shape[0]))
        return check_X_y(X, y, multi_output=True,
                         allow_nd=True, y_numeric=True,
                         estimator="GPRNP")

    def check_fitted(self):
        # Predictions require the training data and kernel from fit()
        if self.X_train is None or self.y_train is None \
                or self.K is None:
            raise Exception("The model must be trained before making predictions!")

    @staticmethod
    def check_array(X):
        """Validates a test-input array via sklearn's check_array."""
        from sklearn.utils.validation import check_array
        return check_array(X, allow_nd=True, estimator="GPRNP")

    @staticmethod
    def check_output(X):
        # Guard against NaN/inf leaking out of the linear algebra
        finite_els = np.isfinite(X)
        if not np.all(finite_els):
            raise Exception("Input contains non-finite values: {}"
                            .format(X[~finite_els]))

    def fit(self, X_train, y_train, ridge=0.01):
        """Trains the GP: stores the data and precomputes K and K^-1.

        Parameters
        ----------
        X_train : array, shape (n_samples, n_features)
        y_train : array, shape (n_samples, n_outputs)
        ridge : scalar or 1-d array of length n_samples
            Ridge term(s) added to the kernel diagonal.

        Returns
        -------
        self
        """
        self._reset()
        X_train, y_train = self.check_X_y(X_train, y_train)
        if X_train.ndim != 2 or y_train.ndim != 2:
            raise Exception("X_train or y_train should have 2 dimensions! X_dim:{}, y_dim:{}"
                            .format(X_train.ndim, y_train.ndim))
        self.X_train = np.float32(X_train)
        self.y_train = np.float32(y_train)
        sample_size = self.X_train.shape[0]
        if np.isscalar(ridge):
            # Broadcast a scalar ridge to one value per training sample
            ridge = np.ones(sample_size) * ridge
        assert isinstance(ridge, np.ndarray)
        assert ridge.ndim == 1
        # Exponential kernel over pairwise Euclidean distances + ridge
        K = self.magnitude * np.exp(-ed(self.X_train, self.X_train) / self.length_scale) \
            + np.diag(ridge)
        # NOTE(review): explicit matrix inversion; a Cholesky solve would be
        # more numerically stable. Kept as-is to preserve behavior.
        K_inv = np.linalg.inv(K)
        self.K = K
        self.K_inv = K_inv
        # Incumbent minimum, used for expected improvement in predict()
        self.y_best = np.min(y_train)
        return self

    def predict(self, X_test):
        """Predicts posterior mean and standard deviation for each test row.

        Returns
        -------
        GPRResult with ypreds (mean) and sigmas (std), each of shape
        (n_test, 1).
        """
        self.check_fitted()
        if X_test.ndim != 2:
            raise Exception("X_test should have 2 dimensions! X_dim:{}"
                            .format(X_test.ndim))
        X_test = np.float32(GPRNP.check_array(X_test))
        test_size = X_test.shape[0]
        arr_offset = 0
        length_scale = self.length_scale
        yhats = np.zeros([test_size, 1])
        sigmas = np.zeros([test_size, 1])
        eips = np.zeros([test_size, 1])
        # Process the test set in batches of batch_size_ rows to bound the
        # size of the kernel blocks
        while arr_offset < test_size:
            if arr_offset + self.batch_size_ > test_size:
                end_offset = test_size
            else:
                end_offset = arr_offset + self.batch_size_
            xt_ = X_test[arr_offset:end_offset]
            # Cross-kernel (train x batch) and batch self-kernel
            K2 = self.magnitude * np.exp(-ed(self.X_train, xt_) / length_scale)
            K3 = self.magnitude * np.exp(-ed(xt_, xt_) / length_scale)
            K2_trans = np.transpose(K2)
            # Posterior mean and standard deviation
            yhat = np.matmul(K2_trans, np.matmul(self.K_inv, self.y_train))
            sigma = np.sqrt(np.diag(K3 - np.matmul(K2_trans, np.matmul(self.K_inv, K2)))) \
                .reshape(xt_.shape[0], 1)
            # Expected improvement over the incumbent minimum y_best
            u = (self.y_best - yhat) / sigma
            phi1 = 0.5 * special.erf(u / np.sqrt(2.0)) + 0.5  # standard normal CDF
            phi2 = (1.0 / np.sqrt(2.0 * np.pi)) * np.exp(np.square(u) * (-0.5))  # standard normal PDF
            eip = sigma * (u * phi1 + phi2)
            yhats[arr_offset:end_offset] = yhat
            sigmas[arr_offset:end_offset] = sigma
            eips[arr_offset:end_offset] = eip
            arr_offset = end_offset
        GPRNP.check_output(yhats)
        GPRNP.check_output(sigmas)
        # NOTE(review): eips is computed above but not returned (GPRResult
        # only carries ypreds and sigmas) -- confirm whether EI was meant to
        # be exposed.
        return GPRResult(yhats, sigmas)

    def get_params(self, deep=True):
        """Returns the model's hyperparameters and trained state
        (sklearn-style)."""
        return {"length_scale": self.length_scale,
                "magnitude": self.magnitude,
                "X_train": self.X_train,
                "y_train": self.y_train,
                "K": self.K,
                "K_inv": self.K_inv}

    def set_params(self, **parameters):
        """Sets attributes from keyword arguments (sklearn-style); returns
        self."""
        for param, val in list(parameters.items()):
            setattr(self, param, val)
        return self

710
server/analysis/gp_tf.py Normal file
View File

@@ -0,0 +1,710 @@
#
# OtterTune - gp_tf.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Aug 18, 2016
@author: Bohan Zhang, Dana Van Aken
'''
import gc
import numpy as np
import tensorflow as tf
from .util import get_analysis_logger
LOG = get_analysis_logger(__name__)
class GPRResult(object):
    """Container for GPR predictions.

    Attributes
    ----------
    ypreds : predicted means
    sigmas : predicted standard deviations
    """

    def __init__(self, ypreds=None, sigmas=None):
        self.sigmas = sigmas
        self.ypreds = ypreds
class GPRGDResult(GPRResult):
    """GPR prediction container extended with gradient-descent minimum info.

    Attributes
    ----------
    minl : minimum loss value found
    minl_conf : configuration achieving the minimum loss
    """

    def __init__(self, ypreds=None, sigmas=None,
                 minl=None, minl_conf=None):
        super(GPRGDResult, self).__init__(ypreds, sigmas)
        self.minl_conf = minl_conf
        self.minl = minl
class GPR(object):
    """Gaussian Process regression built on the TensorFlow 1.x graph API.

    Uses an exponential kernel of Euclidean distance:
        k(x, x') = magnitude * exp(-||x - x'|| / length_scale)
    (see `dist_op` and `K_op` in build_graph). fit() materializes the
    kernel matrix K (with a ridge term on the diagonal), its inverse, and
    xy_ = K^-1 * y; predict() evaluates the GP posterior mean and standard
    deviation in batches of `batch_size_`.
    """

    def __init__(self, length_scale=1.0, magnitude=1.0, max_train_size=7000,
                 batch_size=3000, num_threads=4, check_numerics=True, debug=False):
        # Kernel hyperparameters must be positive scalars.
        assert np.isscalar(length_scale)
        assert np.isscalar(magnitude)
        assert length_scale > 0 and magnitude > 0
        self.length_scale = length_scale
        self.magnitude = magnitude
        self.max_train_size_ = max_train_size  # hard cap enforced in check_X_y
        self.batch_size_ = batch_size          # test points processed per session batch
        self.num_threads_ = num_threads        # intra-op parallelism for tf.Session
        self.check_numerics = check_numerics   # wrap ops in tf.check_numerics when True
        self.debug = debug
        # Fitted state; populated by fit() / _reset().
        self.X_train = None
        self.y_train = None
        self.xy_ = None   # K^-1 * y_train, computed in fit()
        self.K = None     # kernel matrix incl. ridge
        self.K_inv = None
        self.graph = None
        self.vars = None  # placeholder handles, keyed by name
        self.ops = None   # graph ops, keyed by name

    def build_graph(self):
        """Construct the TF graph: distance, kernel, K^-1*y, and posterior ops.

        Placeholders are stashed in self.vars and ops in self.ops; fit() and
        predict() feed/fetch them by key.
        """
        self.vars = {}
        self.ops = {}
        self.graph = tf.Graph()
        with self.graph.as_default():
            mag_const = tf.constant(self.magnitude,
                                    dtype=np.float32,
                                    name='magnitude')
            ls_const = tf.constant(self.length_scale,
                                   dtype=np.float32,
                                   name='length_scale')
            # Nodes for distance computation: row-wise Euclidean distance
            # between a single vector v1 and each row of v2.
            v1 = tf.placeholder(tf.float32, name="v1")
            v2 = tf.placeholder(tf.float32, name="v2")
            dist_op = tf.sqrt(tf.reduce_sum(tf.pow(tf.subtract(v1, v2), 2), 1), name='dist_op')
            if self.check_numerics:
                dist_op = tf.check_numerics(dist_op, "dist_op: ")
            self.vars['v1_h'] = v1
            self.vars['v2_h'] = v2
            self.ops['dist_op'] = dist_op
            # Nodes for kernel computation: K = magnitude * exp(-dist / ls),
            # optionally with a ridge term added to the diagonal.
            X_dists = tf.placeholder(tf.float32, name='X_dists')
            ridge_ph = tf.placeholder(tf.float32, name='ridge')
            K_op = mag_const * tf.exp(-X_dists / ls_const)
            if self.check_numerics:
                K_op = tf.check_numerics(K_op, "K_op: ")
            K_ridge_op = K_op + tf.diag(ridge_ph)
            if self.check_numerics:
                K_ridge_op = tf.check_numerics(K_ridge_op, "K_ridge_op: ")
            self.vars['X_dists_h'] = X_dists
            self.vars['ridge_h'] = ridge_ph
            self.ops['K_op'] = K_op
            self.ops['K_ridge_op'] = K_ridge_op
            # Nodes for xy computation: K_inv = K^-1 and xy = K^-1 * y.
            K = tf.placeholder(tf.float32, name='K')
            K_inv = tf.placeholder(tf.float32, name='K_inv')
            xy_ = tf.placeholder(tf.float32, name='xy_')
            yt_ = tf.placeholder(tf.float32, name='yt_')
            K_inv_op = tf.matrix_inverse(K)
            if self.check_numerics:
                K_inv_op = tf.check_numerics(K_inv_op, "K_inv: ")
            xy_op = tf.matmul(K_inv, yt_)
            if self.check_numerics:
                xy_op = tf.check_numerics(xy_op, "xy_: ")
            self.vars['K_h'] = K
            self.vars['K_inv_h'] = K_inv
            self.vars['xy_h'] = xy_
            self.vars['yt_h'] = yt_
            self.ops['K_inv_op'] = K_inv_op
            self.ops['xy_op'] = xy_op
            # Nodes for yhat/sigma computation (GP posterior mean and std):
            # yhat = K2^T * xy_ ; sigma = sqrt(diag(K3 - K2^T K^-1 K2)).
            K2 = tf.placeholder(tf.float32, name="K2")
            K3 = tf.placeholder(tf.float32, name="K3")
            yhat_ = tf.cast(tf.matmul(tf.transpose(K2), xy_), tf.float32)
            if self.check_numerics:
                yhat_ = tf.check_numerics(yhat_, "yhat_: ")
            sv1 = tf.matmul(tf.transpose(K2), tf.matmul(K_inv, K2))
            if self.check_numerics:
                sv1 = tf.check_numerics(sv1, "sv1: ")
            sig_val = tf.cast((tf.sqrt(tf.diag_part(K3 - sv1))), tf.float32)
            if self.check_numerics:
                sig_val = tf.check_numerics(sig_val, "sig_val: ")
            self.vars['K2_h'] = K2
            self.vars['K3_h'] = K3
            self.ops['yhat_op'] = yhat_
            self.ops['sig_op'] = sig_val
            # Compute y_best (min y) -- used by subclasses/acquisitions.
            y_best_op = tf.cast(tf.reduce_min(yt_, 0, True), tf.float32)
            if self.check_numerics:
                y_best_op = tf.check_numerics(y_best_op, "y_best_op: ")
            self.ops['y_best_op'] = y_best_op
            sigma = tf.placeholder(tf.float32, name='sigma')
            yhat = tf.placeholder(tf.float32, name='yhat')
            self.vars['sigma_h'] = sigma
            self.vars['yhat_h'] = yhat

    def __repr__(self):
        # Dump all attributes, sorted by name, one per line.
        rep = ""
        for k, v in sorted(self.__dict__.items()):
            rep += "{} = {}\n".format(k, v)
        return rep

    def __str__(self):
        return self.__repr__()

    def check_X_y(self, X, y):
        """Validate training inputs and enforce the max_train_size_ cap."""
        from sklearn.utils.validation import check_X_y
        if X.shape[0] > self.max_train_size_:
            raise Exception("X_train size cannot exceed {} ({})"
                            .format(self.max_train_size_, X.shape[0]))
        return check_X_y(X, y, multi_output=True,
                         allow_nd=True, y_numeric=True,
                         estimator="GPR")

    def check_fitted(self):
        """Raise unless fit() has populated the model state."""
        if self.X_train is None or self.y_train is None \
                or self.xy_ is None or self.K is None:
            raise Exception("The model must be trained before making predictions!")

    @staticmethod
    def check_array(X):
        from sklearn.utils.validation import check_array
        return check_array(X, allow_nd=True, estimator="GPR")

    @staticmethod
    def check_output(X):
        """Raise if X contains NaN or +/-inf values."""
        finite_els = np.isfinite(X)
        if not np.all(finite_els):
            raise Exception("Input contains non-finite values: {}"
                            .format(X[~finite_els]))

    def fit(self, X_train, y_train, ridge=1.0):
        """Fit the GP: compute K (+ridge), K^-1, and xy_ = K^-1 * y.

        `ridge` may be a scalar (broadcast over the diagonal) or a 1-D array
        of per-sample noise terms.
        """
        self._reset()
        X_train, y_train = self.check_X_y(X_train, y_train)
        self.X_train = np.float32(X_train)
        self.y_train = np.float32(y_train)
        sample_size = self.X_train.shape[0]
        if np.isscalar(ridge):
            ridge = np.ones(sample_size) * ridge
        assert isinstance(ridge, np.ndarray)
        assert ridge.ndim == 1
        # Pairwise train-train distances, one row per training point.
        X_dists = np.zeros((sample_size, sample_size), dtype=np.float32)
        with tf.Session(graph=self.graph,
                        config=tf.ConfigProto(
                            intra_op_parallelism_threads=self.num_threads_)) as sess:
            dist_op = self.ops['dist_op']
            v1, v2 = self.vars['v1_h'], self.vars['v2_h']
            for i in range(sample_size):
                X_dists[i] = sess.run(dist_op, feed_dict={v1: self.X_train[i], v2: self.X_train})
            K_ridge_op = self.ops['K_ridge_op']
            X_dists_ph = self.vars['X_dists_h']
            ridge_ph = self.vars['ridge_h']
            self.K = sess.run(K_ridge_op, feed_dict={X_dists_ph: X_dists, ridge_ph: ridge})
            K_ph = self.vars['K_h']
            K_inv_op = self.ops['K_inv_op']
            self.K_inv = sess.run(K_inv_op, feed_dict={K_ph: self.K})
            xy_op = self.ops['xy_op']
            K_inv_ph = self.vars['K_inv_h']
            yt_ph = self.vars['yt_h']
            self.xy_ = sess.run(xy_op, feed_dict={K_inv_ph: self.K_inv,
                                                  yt_ph: self.y_train})
        return self

    def predict(self, X_test):
        """Return GPRResult(yhats, sigmas) for X_test, processed in batches."""
        self.check_fitted()
        X_test = np.float32(GPR.check_array(X_test))
        test_size = X_test.shape[0]
        sample_size = self.X_train.shape[0]
        arr_offset = 0
        yhats = np.zeros([test_size, 1])
        sigmas = np.zeros([test_size, 1])
        with tf.Session(graph=self.graph,
                        config=tf.ConfigProto(
                            intra_op_parallelism_threads=self.num_threads_)) as sess:
            # Nodes for distance operation
            dist_op = self.ops['dist_op']
            v1 = self.vars['v1_h']
            v2 = self.vars['v2_h']
            # Nodes for kernel computation
            K_op = self.ops['K_op']
            X_dists = self.vars['X_dists_h']
            # Nodes to compute yhats/sigmas
            yhat_ = self.ops['yhat_op']
            K_inv_ph = self.vars['K_inv_h']
            K2 = self.vars['K2_h']
            K3 = self.vars['K3_h']
            xy_ph = self.vars['xy_h']
            while arr_offset < test_size:
                # Clamp the final batch to the remaining test points.
                if arr_offset + self.batch_size_ > test_size:
                    end_offset = test_size
                else:
                    end_offset = arr_offset + self.batch_size_
                X_test_batch = X_test[arr_offset:end_offset]
                batch_len = end_offset - arr_offset
                # dists1: train-vs-batch distances -> cross-kernel K2.
                dists1 = np.zeros([sample_size, batch_len])
                for i in range(sample_size):
                    dists1[i] = sess.run(dist_op, feed_dict={v1: self.X_train[i],
                                                             v2: X_test_batch})
                sig_val = self.ops['sig_op']
                K2_ = sess.run(K_op, feed_dict={X_dists: dists1})
                yhat = sess.run(yhat_, feed_dict={K2: K2_, xy_ph: self.xy_})
                # dists2: batch-vs-batch distances -> test kernel K3.
                dists2 = np.zeros([batch_len, batch_len])
                for i in range(batch_len):
                    dists2[i] = sess.run(dist_op, feed_dict={v1: X_test_batch[i], v2: X_test_batch})
                K3_ = sess.run(K_op, feed_dict={X_dists: dists2})
                sigma = np.zeros([1, batch_len], np.float32)
                sigma[0] = sess.run(sig_val, feed_dict={K_inv_ph: self.K_inv, K2: K2_, K3: K3_})
                sigma = np.transpose(sigma)
                yhats[arr_offset: end_offset] = yhat
                sigmas[arr_offset: end_offset] = sigma
                arr_offset = end_offset
        GPR.check_output(yhats)
        GPR.check_output(sigmas)
        return GPRResult(yhats, sigmas)

    def get_params(self, deep=True):
        """Return model parameters as a dict (sklearn-style; `deep` unused)."""
        return {"length_scale": self.length_scale,
                "magnitude": self.magnitude,
                "X_train": self.X_train,
                "y_train": self.y_train,
                "xy_": self.xy_,
                "K": self.K,
                "K_inv": self.K_inv}

    def set_params(self, **parameters):
        """Set attributes from keyword args and return self (sklearn-style)."""
        for param, val in list(parameters.items()):
            setattr(self, param, val)
        return self

    def _reset(self):
        """Erase fitted state and rebuild a fresh TF graph."""
        self.X_train = None
        self.y_train = None
        self.xy_ = None
        self.K = None
        self.K_inv = None
        self.graph = None
        self.build_graph()
        gc.collect()
class GPRGD(GPR):
    """GPR model that gradient-descends each test point over the input space.

    After fitting the GP, predict() treats every test point as a starting
    configuration and runs `max_iter` Adam steps minimizing the
    lower-confidence-bound style loss
        loss = mu_multiplier * yhat - sigma_multiplier * sigma,
    projecting each step back into the box [X_min, X_max].
    """

    GP_BETA_UCB = "UCB"
    GP_BETA_CONST = "CONST"

    def __init__(self,
                 length_scale=1.0,
                 magnitude=1.0,
                 max_train_size=7000,
                 batch_size=3000,
                 num_threads=4,
                 learning_rate=0.01,
                 epsilon=1e-6,
                 max_iter=100,
                 sigma_multiplier=3.0,
                 mu_multiplier=1.0):
        super(GPRGD, self).__init__(length_scale=length_scale,
                                    magnitude=magnitude,
                                    max_train_size=max_train_size,
                                    batch_size=batch_size,
                                    num_threads=num_threads)
        self.learning_rate = learning_rate    # Adam step size
        self.epsilon = epsilon                # Adam epsilon
        self.max_iter = max_iter              # GD iterations per test point
        self.sigma_multiplier = sigma_multiplier  # exploration weight
        self.mu_multiplier = mu_multiplier        # exploitation weight
        # Box constraints for projected gradient descent; set in fit().
        self.X_min = None
        self.X_max = None

    def fit(self, X_train, y_train, X_min, X_max, ridge):  # pylint: disable=arguments-differ
        """Fit the GP, then build the gradient-descent subgraph.

        X_min/X_max are the per-feature bounds that predict() projects onto
        after every gradient step.
        """
        super(GPRGD, self).fit(X_train, y_train, ridge)
        self.X_min = X_min
        self.X_max = X_max
        with tf.Session(graph=self.graph,
                        config=tf.ConfigProto(
                            intra_op_parallelism_threads=self.num_threads_)) as sess:
            # The configuration being optimized. Seeded arbitrarily with the
            # first training point; predict() re-assigns it per test point.
            # (Bug fix: dtype was being passed positionally into tf.Variable's
            # `trainable` slot; pass it by keyword.)
            xt_ = tf.Variable(self.X_train[0], dtype=tf.float32)
            xt_ph = tf.placeholder(tf.float32)
            xt_assign_op = xt_.assign(xt_ph)
            init = tf.global_variables_initializer()
            sess.run(init)
            # Cross-kernel between xt_ and the training set (column vector).
            K2_mat = tf.transpose(tf.expand_dims(tf.sqrt(tf.reduce_sum(tf.pow(
                tf.subtract(xt_, self.X_train), 2), 1)), 0))
            if self.check_numerics is True:
                K2_mat = tf.check_numerics(K2_mat, "K2_mat: ")
            K2__ = tf.cast(self.magnitude * tf.exp(-K2_mat / self.length_scale), tf.float32)
            if self.check_numerics is True:
                K2__ = tf.check_numerics(K2__, "K2__: ")
            # GP posterior mean and std at xt_.
            yhat_gd = tf.cast(tf.matmul(tf.transpose(K2__), self.xy_), tf.float32)
            if self.check_numerics is True:
                yhat_gd = tf.check_numerics(yhat_gd, message="yhat: ")
            sig_val = tf.cast((tf.sqrt(self.magnitude - tf.matmul(
                tf.transpose(K2__), tf.matmul(self.K_inv, K2__)))), tf.float32)
            if self.check_numerics is True:
                sig_val = tf.check_numerics(sig_val, message="sigma: ")
            LOG.debug("\nyhat_gd : %s", str(sess.run(yhat_gd)))
            LOG.debug("\nsig_val : %s", str(sess.run(sig_val)))
            # LCB-style acquisition: minimize mean minus (weighted) std.
            loss = tf.squeeze(tf.subtract(self.mu_multiplier * yhat_gd,
                                          self.sigma_multiplier * sig_val))
            if self.check_numerics is True:
                loss = tf.check_numerics(loss, "loss: ")
            optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate,
                                               epsilon=self.epsilon)
            # optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate)
            train = optimizer.minimize(loss)
            self.vars['xt_'] = xt_
            self.vars['xt_ph'] = xt_ph
            self.ops['xt_assign_op'] = xt_assign_op
            self.ops['yhat_gd'] = yhat_gd
            self.ops['sig_val2'] = sig_val
            self.ops['loss_op'] = loss
            self.ops['train_op'] = train
        return self

    def predict(self, X_test, constraint_helper=None,  # pylint: disable=arguments-differ
                categorical_feature_method='hillclimbing',
                categorical_feature_steps=3):
        """Gradient-descend each test point and return a GPRGDResult with the
        best (minimum-loss) configuration found along each trajectory.

        If `constraint_helper` is given, configurations are projected onto
        the feasible set after every step, and categorical features are
        optimized by random hill climbing every `categorical_feature_steps`
        iterations.
        """
        self.check_fitted()
        X_test = np.float32(GPR.check_array(X_test))
        test_size = X_test.shape[0]
        nfeats = self.X_train.shape[1]
        arr_offset = 0
        yhats = np.zeros([test_size, 1])
        sigmas = np.zeros([test_size, 1])
        minls = np.zeros([test_size, 1])
        minl_confs = np.zeros([test_size, nfeats])
        with tf.Session(graph=self.graph,
                        config=tf.ConfigProto(
                            intra_op_parallelism_threads=self.num_threads_)) as sess:
            while arr_offset < test_size:
                if arr_offset + self.batch_size_ > test_size:
                    end_offset = test_size
                else:
                    end_offset = arr_offset + self.batch_size_
                X_test_batch = X_test[arr_offset:end_offset]
                batch_len = end_offset - arr_offset
                xt_ = self.vars['xt_']
                init = tf.global_variables_initializer()
                sess.run(init)
                sig_val = self.ops['sig_val2']
                yhat_gd = self.ops['yhat_gd']
                loss = self.ops['loss_op']
                train = self.ops['train_op']
                xt_ph = self.vars['xt_ph']
                assign_op = self.ops['xt_assign_op']
                yhat = np.empty((batch_len, 1))
                sigma = np.empty((batch_len, 1))
                minl = np.empty((batch_len, 1))
                minl_conf = np.empty((batch_len, nfeats))
                for i in range(batch_len):
                    if self.debug is True:
                        LOG.info("-------------------------------------------")
                    # Per-iteration traces (max_iter steps + the final state).
                    yhats_it = np.empty((self.max_iter + 1,)) * np.nan
                    sigmas_it = np.empty((self.max_iter + 1,)) * np.nan
                    losses_it = np.empty((self.max_iter + 1,)) * np.nan
                    confs_it = np.empty((self.max_iter + 1, nfeats)) * np.nan
                    sess.run(assign_op, feed_dict={xt_ph: X_test_batch[i]})
                    step = 0
                    for step in range(self.max_iter):
                        if self.debug is True:
                            LOG.info("Batch %d, iter %d:", i, step)
                        yhats_it[step] = sess.run(yhat_gd)[0][0]
                        sigmas_it[step] = sess.run(sig_val)[0][0]
                        losses_it[step] = sess.run(loss)
                        confs_it[step] = sess.run(xt_)
                        if self.debug is True:
                            LOG.info("  yhat:  %s", str(yhats_it[step]))
                            LOG.info("  sigma: %s", str(sigmas_it[step]))
                            LOG.info("  loss:  %s", str(losses_it[step]))
                            LOG.info("  conf:  %s", str(confs_it[step]))
                        sess.run(train)
                        # constraint Projected Gradient Descent: clip back
                        # into the [X_min, X_max] box after the step.
                        xt = sess.run(xt_)
                        xt_valid = np.minimum(xt, self.X_max)
                        xt_valid = np.maximum(xt_valid, self.X_min)
                        sess.run(assign_op, feed_dict={xt_ph: xt_valid})
                        if constraint_helper is not None:
                            xt_valid = constraint_helper.apply_constraints(sess.run(xt_))
                            sess.run(assign_op, feed_dict={xt_ph: xt_valid})
                            if categorical_feature_method == 'hillclimbing':
                                if step % categorical_feature_steps == 0:
                                    current_xt = sess.run(xt_)
                                    current_loss = sess.run(loss)
                                    new_xt = \
                                        constraint_helper.randomize_categorical_features(
                                            current_xt)
                                    sess.run(assign_op, feed_dict={xt_ph: new_xt})
                                    new_loss = sess.run(loss)
                                    if current_loss < new_loss:
                                        # Bug fix: revert to the previous
                                        # configuration when the random draw
                                        # made the loss worse (the original
                                        # code re-assigned new_xt, a no-op,
                                        # so the worse config was kept).
                                        sess.run(assign_op, feed_dict={xt_ph: current_xt})
                            else:
                                raise Exception("Unknown categorical feature method: {}".format(
                                    categorical_feature_method))
                        if step == self.max_iter - 1:
                            # Record results from final iteration
                            yhats_it[-1] = sess.run(yhat_gd)[0][0]
                            sigmas_it[-1] = sess.run(sig_val)[0][0]
                            losses_it[-1] = sess.run(loss)
                            confs_it[-1] = sess.run(xt_)
                            assert np.all(np.isfinite(yhats_it))
                            assert np.all(np.isfinite(sigmas_it))
                            assert np.all(np.isfinite(losses_it))
                            assert np.all(np.isfinite(confs_it))
                    # Store info for conf with min loss from all iters
                    if np.all(~np.isfinite(losses_it)):
                        min_loss_idx = 0
                    else:
                        min_loss_idx = np.nanargmin(losses_it)
                    yhat[i] = yhats_it[min_loss_idx]
                    sigma[i] = sigmas_it[min_loss_idx]
                    minl[i] = losses_it[min_loss_idx]
                    minl_conf[i] = confs_it[min_loss_idx]
                minls[arr_offset:end_offset] = minl
                minl_confs[arr_offset:end_offset] = minl_conf
                yhats[arr_offset:end_offset] = yhat
                sigmas[arr_offset:end_offset] = sigma
                arr_offset = end_offset
        GPR.check_output(yhats)
        GPR.check_output(sigmas)
        GPR.check_output(minls)
        GPR.check_output(minl_confs)
        return GPRGDResult(yhats, sigmas, minls, minl_confs)

    @staticmethod
    def calculate_sigma_multiplier(t, ndim, bound=0.1):
        """Compute the GP-UCB exploration coefficient sqrt(beta_t).

        Implements beta_t = 2 * log(|D| * t^2 * pi^2 / (6 * delta)) from
        Srinivas et al.'s GP-UCB regret bound, with |D| = ndim and
        delta = bound (failure probability; smaller bound => more
        exploration).

        Bug fix: the original expression `... / 6 * bound` multiplied by
        `bound` instead of dividing by it, inverting the dependence on the
        confidence parameter.
        """
        assert t > 0
        assert ndim > 0
        assert 0 < bound <= 1
        beta = 2 * np.log(ndim * (t ** 2) * (np.pi ** 2) / (6 * bound))
        if beta > 0:
            beta = np.sqrt(beta)
        else:
            # Degenerate/early-round case: fall back to a unit multiplier.
            beta = 1
        return beta
# def gp_tf(X_train, y_train, X_test, ridge, length_scale, magnitude, batch_size=3000):
# with tf.Graph().as_default():
# y_best = tf.cast(tf.reduce_min(y_train, 0, True), tf.float32)
# sample_size = X_train.shape[0]
# train_size = X_test.shape[0]
# arr_offset = 0
# yhats = np.zeros([train_size, 1])
# sigmas = np.zeros([train_size, 1])
# eips = np.zeros([train_size, 1])
# X_train = np.float32(X_train)
# y_train = np.float32(y_train)
# X_test = np.float32(X_test)
# ridge = np.float32(ridge)
#
# v1 = tf.placeholder(tf.float32,name="v1")
# v2 = tf.placeholder(tf.float32,name="v2")
# dist_op = tf.sqrt(tf.reduce_sum(tf.pow(tf.subtract(v1, v2), 2), 1))
# try:
# sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))
#
# dists = np.zeros([sample_size,sample_size])
# for i in range(sample_size):
# dists[i] = sess.run(dist_op,feed_dict={v1:X_train[i], v2:X_train})
#
#
# dists = tf.cast(dists, tf.float32)
# K = magnitude * tf.exp(-dists/length_scale) + tf.diag(ridge);
#
# K2 = tf.placeholder(tf.float32, name="K2")
# K3 = tf.placeholder(tf.float32, name="K3")
#
# x = tf.matmul(tf.matrix_inverse(K), y_train)
# yhat_ = tf.cast(tf.matmul(tf.transpose(K2), x), tf.float32);
# sig_val = tf.cast((tf.sqrt(tf.diag_part(K3 - tf.matmul(tf.transpose(K2),
# tf.matmul(tf.matrix_inverse(K),
# K2))))),
# tf.float32)
#
# u = tf.placeholder(tf.float32, name="u")
# phi1 = 0.5 * tf.erf(u / np.sqrt(2.0)) + 0.5
# phi2 = (1.0 / np.sqrt(2.0 * np.pi)) * tf.exp(tf.square(u) * (-0.5));
# eip = (tf.multiply(u, phi1) + phi2);
#
# while arr_offset < train_size:
# if arr_offset + batch_size > train_size:
# end_offset = train_size
# else:
# end_offset = arr_offset + batch_size;
#
# xt_ = X_test[arr_offset:end_offset];
# batch_len = end_offset - arr_offset
#
# dists = np.zeros([sample_size, batch_len])
# for i in range(sample_size):
# dists[i] = sess.run(dist_op, feed_dict={v1:X_train[i], v2:xt_})
#
# K2_ = magnitude * tf.exp(-dists / length_scale);
# K2_ = sess.run(K2_)
#
# dists = np.zeros([batch_len, batch_len])
# for i in range(batch_len):
# dists[i] = sess.run(dist_op, feed_dict={v1:xt_[i], v2:xt_})
# K3_ = magnitude * tf.exp(-dists / length_scale);
# K3_ = sess.run(K3_)
#
# yhat = sess.run(yhat_, feed_dict={K2:K2_})
#
# sigma = np.zeros([1, batch_len], np.float32)
# sigma[0] = (sess.run(sig_val, feed_dict={K2:K2_, K3:K3_}))
# sigma = np.transpose(sigma)
#
# u_ = tf.cast(tf.div(tf.subtract(y_best, yhat), sigma), tf.float32)
# u_ = sess.run(u_)
# eip_p = sess.run(eip, feed_dict={u:u_})
# eip_ = tf.multiply(sigma, eip_p)
# yhats[arr_offset:end_offset] = yhat
# sigmas[arr_offset:end_offset] = sigma;
# eips[arr_offset:end_offset] = sess.run(eip_);
# arr_offset = end_offset
#
# finally:
# sess.close()
#
# return yhats, sigmas, eips
def euclidean_mat(X, y, sess):
    """Compute the pairwise Euclidean distance matrix between rows of X and y.

    Builds one TF op per (row of X, row of y) pair and evaluates them a row
    at a time in the supplied session. Returns an [X.shape[0], y.shape[0]]
    numpy array.
    """
    n_rows = X.shape[0]
    n_cols = y.shape[0]
    distances = np.zeros([n_rows, n_cols])
    for row_idx in range(n_rows):
        row_vec = X[row_idx]
        row_ops = [tf.sqrt(tf.reduce_sum(tf.pow(tf.subtract(row_vec, y[col_idx]), 2)))
                   for col_idx in range(n_cols)]
        distances[row_idx] = sess.run(row_ops)
    return distances
def gd_tf(xs, ys, xt, ridge, length_scale=1.0, magnitude=1.0, max_iter=50):
    """Gradient-descend each starting point in `xt` toward lower GP loss.

    Standalone (pre-class) version of GPRGD: fits an exponential-kernel GP
    on (xs, ys), then for every row of xt runs `max_iter` Adam steps
    minimizing loss = yhat - sigma, and records the final mean, std, loss,
    and configuration.

    Parameters
    ----------
    xs : array, [n_samples, n_feats] -- training inputs
    ys : array, [n_samples, 1] -- training targets
    xt : array, [n_test, n_feats] -- starting configurations to optimize
    ridge : scalar or array -- diagonal noise added to the kernel matrix
    length_scale, magnitude : kernel hyperparameters
    max_iter : gradient steps per test point

    Returns
    -------
    (yhats, sigmas, minl, new_conf) -- final values for each test point.
    """
    LOG.debug("xs shape: %s", str(xs.shape))
    LOG.debug("ys shape: %s", str(ys.shape))
    LOG.debug("xt shape: %s", str(xt.shape))
    with tf.Graph().as_default():
        # y_best = tf.cast(tf.reduce_min(ys,0,True),tf.float32); #array
        # yhat_gd = tf.check_numerics(yhat_gd, message="yhat: ")
        sample_size = xs.shape[0]
        nfeats = xs.shape[1]
        test_size = xt.shape[0]
        # arr_offset = 0
        # ini_size duplicates test_size (both xt.shape[0]).
        ini_size = xt.shape[0]
        yhats = np.zeros([test_size, 1])
        sigmas = np.zeros([test_size, 1])
        minl = np.zeros([test_size, 1])
        new_conf = np.zeros([test_size, nfeats])
        xs = np.float32(xs)
        ys = np.float32(ys)
        # NOTE(review): tf.Variable's second positional argument is
        # `trainable`; passing tf.float32 here is merely truthy -- dtype was
        # probably intended. Confirm before changing.
        xt_ = tf.Variable(xt[0], tf.float32)
        sess = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=8))
        init = tf.global_variables_initializer()
        sess.run(init)
        ridge = np.float32(ridge)
        v1 = tf.placeholder(tf.float32, name="v1")
        v2 = tf.placeholder(tf.float32, name="v2")
        dist = tf.sqrt(tf.reduce_sum(tf.pow(tf.subtract(v1, v2), 2), 1))
        # Pairwise train-train distances -> kernel matrix K with ridge.
        tmp = np.zeros([sample_size, sample_size])
        for i in range(sample_size):
            tmp[i] = sess.run(dist, feed_dict={v1: xs[i], v2: xs})
        tmp = tf.cast(tmp, tf.float32)
        K = magnitude * tf.exp(-tmp / length_scale) + tf.diag(ridge)
        LOG.debug("K shape: %s", str(sess.run(K).shape))
        # Cross-kernel between the variable xt_ and the training set.
        K2_mat = tf.sqrt(tf.reduce_sum(tf.pow(tf.subtract(xt_, xs), 2), 1))
        K2_mat = tf.transpose(tf.expand_dims(K2_mat, 0))
        # NOTE(review): unlike the class version, the `magnitude` factor is
        # missing from K2 here -- confirm whether that is intentional.
        K2 = tf.cast(tf.exp(-K2_mat / length_scale), tf.float32)
        # x = K^-1 * y, evaluated once up front.
        x = tf.matmul(tf.matrix_inverse(K), ys)
        x = sess.run(x)
        # GP posterior mean and std at xt_.
        yhat_ = tf.cast(tf.matmul(tf.transpose(K2), x), tf.float32)
        sig_val = tf.cast((tf.sqrt(magnitude - tf.matmul(
            tf.transpose(K2), tf.matmul(tf.matrix_inverse(K), K2)))), tf.float32)
        LOG.debug('yhat shape: %s', str(sess.run(yhat_).shape))
        LOG.debug('sig_val shape: %s', str(sess.run(sig_val).shape))
        yhat_ = tf.check_numerics(yhat_, message='yhat: ')
        sig_val = tf.check_numerics(sig_val, message='sig_val: ')
        # LCB-style loss: mean minus std.
        loss = tf.squeeze(tf.subtract(yhat_, sig_val))
        loss = tf.check_numerics(loss, message='loss: ')
        # optimizer = tf.train.GradientDescentOptimizer(0.1)
        LOG.debug('loss: %s', str(sess.run(loss)))
        optimizer = tf.train.AdamOptimizer(0.1)
        train = optimizer.minimize(loss)
        init = tf.global_variables_initializer()
        sess.run(init)
        # Optimize each starting configuration independently.
        for i in range(ini_size):
            assign_op = xt_.assign(xt[i])
            sess.run(assign_op)
            for step in range(max_iter):
                LOG.debug('sample #: %d, iter #: %d, loss: %s', i, step, str(sess.run(loss)))
                sess.run(train)
            yhats[i] = sess.run(yhat_)[0][0]
            sigmas[i] = sess.run(sig_val)[0][0]
            minl[i] = sess.run(loss)
            new_conf[i] = sess.run(xt_)
        return yhats, sigmas, minl, new_conf
def main():
    """Placeholder entry point; this module is used as a library."""
    pass
def create_random_matrices(n_samples=3000, n_feats=12, n_test=4444):
    """Generate random GPR inputs for testing/benchmarking.

    Returns (X_train, y_train, X_test, length_scale, magnitude, ridge),
    all drawn from numpy's global RNG. The draw order is fixed so results
    are reproducible under a prior np.random.seed() call.
    """
    train_x = np.random.rand(n_samples, n_feats)
    train_y = np.random.rand(n_samples, 1)
    test_x = np.random.rand(n_test, n_feats)
    scale = np.random.rand()
    mag = np.random.rand()
    ridge_vec = np.ones(n_samples) * np.random.rand()
    return train_x, train_y, test_x, scale, mag, ridge_vec
# Script entry point: main() is currently a no-op placeholder.
if __name__ == "__main__":
    main()

109
server/analysis/lasso.py Normal file
View File

@@ -0,0 +1,109 @@
#
# OtterTune - lasso.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Jul 8, 2016
@author: dvanaken
'''
import numpy as np
from sklearn.linear_model import lasso_path
from .base import ModelBase
class LassoPath(ModelBase):
    """Lasso:
    Computes the Lasso path using Sklearn's lasso_path method.

    See also
    --------
    http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.lasso_path.html

    Attributes
    ----------
    feature_labels_ : array, [n_features]
        Labels for each of the features in X.
    alphas_ : array, [n_alphas]
        The alphas along the path where models are computed (decreasing
        values of the penalty along the path).
    coefs_ : array, [n_outputs, n_features, n_alphas]
        Coefficients along the path.
    rankings_ : array, [n_features]
        The average ranking of each feature across all target values.
    """

    def __init__(self):
        self.feature_labels_ = None
        self.alphas_ = None
        self.coefs_ = None
        self.rankings_ = None

    def _reset(self):
        """Resets all attributes (erases the model)"""
        self.feature_labels_ = None
        self.alphas_ = None
        self.coefs_ = None
        self.rankings_ = None

    def fit(self, X, y, feature_labels, estimator_params=None):
        """Computes the Lasso path using Sklearn's lasso_path method.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data (the independent variables).
        y : array-like, shape (n_samples, n_outputs)
            Training data (the output/target values).
        feature_labels : array-like, shape (n_features)
            Labels for each of the features in X.
        estimator_params : dict, optional
            The parameters to pass to Sklearn's Lasso estimator.

        Returns
        -------
        self
        """
        self._reset()
        params = {} if estimator_params is None else estimator_params
        self.feature_labels_ = feature_labels
        alphas, coefs, _ = lasso_path(X, y, **params)
        self.alphas_ = alphas.copy()
        self.coefs_ = coefs.copy()
        # Rank the features in X by order of importance: the earlier a
        # feature's coefficient becomes nonzero along the path, the MORE
        # important it is. The entrance step is 1 + (number of leading
        # zeros in that feature's coefficient path).
        per_feature_steps = [[] for _ in range(X.shape[1])]
        for coef_paths in self.coefs_:
            for feat_idx, path in enumerate(coef_paths):
                nonzero_positions = np.nonzero(path)[0]
                if nonzero_positions.size > 0:
                    entrance_step = int(nonzero_positions[0]) + 1
                else:
                    # Feature never entered the regression.
                    entrance_step = len(path) + 1
                per_feature_steps[feat_idx].append(entrance_step)
        self.rankings_ = np.array([np.mean(steps) for steps in per_feature_steps])
        return self

    def get_ranked_features(self):
        """Return feature labels ordered from most to least important."""
        if self.rankings_ is None:
            raise Exception("No lasso path has been fit yet!")
        order = np.argsort(self.rankings_)
        return [self.feature_labels_[idx] for idx in order]

View File

@@ -0,0 +1,489 @@
#
# OtterTune - preprocessing.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
from abc import ABCMeta, abstractmethod
from itertools import chain, combinations, combinations_with_replacement
import numpy as np
from sklearn.preprocessing import MinMaxScaler as SklearnMinMaxScaler
from .util import is_numeric_matrix, is_lexical_matrix
# ==========================================================
# Preprocessing Base Class
# ==========================================================
class Preprocess(object, metaclass=ABCMeta):
    """Abstract base class for matrix preprocessing steps.

    Subclasses implement fit()/transform()/inverse_transform(); the `copy`
    flag indicates whether transform may mutate its input in place.
    """

    @abstractmethod
    def fit(self, matrix):
        """Learn any parameters needed to transform `matrix`."""
        pass

    @abstractmethod
    def transform(self, matrix, copy=True):
        """Apply the learned transformation to `matrix`."""
        pass

    def fit_transform(self, matrix, copy=True):
        """Fit to `matrix`, then transform it.

        Bug fix: the `copy` argument is now forwarded to transform()
        instead of being hard-coded to True.
        """
        self.fit(matrix)
        return self.transform(matrix, copy=copy)

    @abstractmethod
    def inverse_transform(self, matrix, copy=True):
        """Undo the transformation (where supported)."""
        pass
# ==========================================================
# Bin by Deciles
# ==========================================================
class Bin(Preprocess):
    """Discretize matrix values into decile bins.

    axis=None bins the whole matrix against one set of deciles; axis=0 bins
    each column independently; axis=1 bins each row independently. Bin
    labels start at `bin_start`.
    """

    def __init__(self, bin_start, axis=None):
        if axis not in (None, 0, 1):
            raise NotImplementedError("Axis={} is not yet implemented".format(axis))
        self.deciles_ = None
        self.bin_start_ = bin_start
        self.axis_ = axis

    def fit(self, matrix):
        """Compute decile boundaries according to the configured axis."""
        if self.axis_ is None:
            self.deciles_ = get_deciles(matrix, self.axis_)
        else:
            # axis=0: one decile set per column; axis=1: one per row.
            vectors = matrix.T if self.axis_ == 0 else matrix
            self.deciles_ = [get_deciles(vec, axis=None) for vec in vectors]
        return self

    def transform(self, matrix, copy=True):
        """Replace each value with its decile bin label."""
        assert self.deciles_ is not None
        if self.axis_ is None:
            binned = bin_by_decile(matrix, self.deciles_,
                                   self.bin_start_, self.axis_)
        elif self.axis_ == 0:
            cols = [bin_by_decile(col, dec, self.bin_start_, axis=None)
                    for col, dec in zip(matrix.T, self.deciles_)]
            binned = np.vstack(cols).T
        else:  # self.axis_ == 1
            rows = [bin_by_decile(row, dec, self.bin_start_, axis=None)
                    for row, dec in zip(matrix, self.deciles_)]
            binned = np.vstack(rows)
        assert binned.shape == matrix.shape
        return binned

    def inverse_transform(self, matrix, copy=True):
        raise NotImplementedError("This method is not supported")
def get_deciles(matrix, axis=None):
    """Return the 10 decile boundaries (10th..100th percentiles) of `matrix`.

    The top boundary is replaced with +inf so that every value (including
    the maximum) falls into some bin when used with bin_by_decile().

    Parameters
    ----------
    matrix : non-empty ndarray (any shape; flattened by np.percentile)
    axis : must be None (per-axis deciles are not implemented)

    Returns
    -------
    ndarray of 10 boundaries, the last being +inf.
    """
    if axis is not None:
        raise NotImplementedError("Axis is not yet implemented")
    assert matrix.ndim > 0
    assert matrix.size > 0
    decile_range = np.arange(10, 101, 10)
    deciles = np.percentile(matrix, decile_range, axis=axis)
    # Bug fix: np.Inf is a removed alias as of NumPy 2.0; use np.inf.
    deciles[-1] = np.inf
    return deciles
def bin_by_decile(matrix, deciles, bin_start, axis=None):
    """Map each entry of `matrix` to the index of the smallest decile
    boundary it fits under, offset by `bin_start`.

    `deciles` must contain exactly 10 ascending boundaries (as produced by
    get_deciles).
    """
    if axis is not None:
        raise NotImplementedError("Axis is not yet implemented")
    assert matrix.ndim > 0
    assert matrix.size > 0
    assert deciles is not None
    assert len(deciles) == 10
    binned = np.zeros_like(matrix)
    # Walk boundaries from largest to smallest so each value ends up
    # labeled with the smallest decile that contains it.
    for idx in reversed(range(10)):
        binned[matrix <= deciles[idx]] = idx + bin_start
    return binned
# ==========================================================
# Shuffle Indices
# ==========================================================
class Shuffler(Preprocess):
    """Randomly permute the rows and/or columns of a labeled matrix.

    Operates on matrix objects exposing .data, .rowlabels, and
    .columnlabels. Permutation indices are drawn once in fit() (unless
    supplied up front) so transform() is repeatable and invertible.
    """

    def __init__(self, shuffle_rows=True, shuffle_columns=False,
                 row_indices=None, column_indices=None, seed=0):
        self.shuffle_rows_ = shuffle_rows
        self.shuffle_columns_ = shuffle_columns
        self.row_indices_ = row_indices
        self.column_indices_ = column_indices
        # Seed the global RNG so index draws in fit() are reproducible.
        np.random.seed(seed)
        self.fitted_ = False

    def fit(self, matrix):
        """Draw permutation indices from the matrix dimensions (once)."""
        if self.shuffle_rows_ and self.row_indices_ is None:
            self.row_indices_ = get_shuffle_indices(matrix.data.shape[0])
        if self.shuffle_columns_ and self.column_indices_ is None:
            self.column_indices_ = get_shuffle_indices(matrix.data.shape[1])
        self.fitted_ = True

    def transform(self, matrix, copy=True):
        """Apply the stored permutations to the data and its labels."""
        if not self.fitted_:
            raise Exception("The fit() function must be called before transform()")
        if copy:
            matrix = matrix.copy()
        if self.shuffle_rows_:
            perm = self.row_indices_
            matrix.data = matrix.data[perm]
            matrix.rowlabels = matrix.rowlabels[perm]
        if self.shuffle_columns_:
            perm = self.column_indices_
            matrix.data = matrix.data[:, perm]
            matrix.columnlabels = matrix.columnlabels[perm]
        return matrix

    def inverse_transform(self, matrix, copy=True):
        """Undo transform() by applying the inverse permutations."""
        if copy:
            matrix = matrix.copy()
        if self.shuffle_rows_:
            undo = np.argsort(self.row_indices_)
            matrix.data = matrix.data[undo]
            matrix.rowlabels = matrix.rowlabels[undo]
        if self.shuffle_columns_:
            undo = np.argsort(self.column_indices_)
            matrix.data = matrix.data[:, undo]
            matrix.columnlabels = matrix.columnlabels[undo]
        return matrix
def get_shuffle_indices(size, seed=None):
    """Return random permutation indices.

    If `size` is an int, returns one permutation of range(size); if it is
    an iterable of ints, returns a list with one permutation per dimension.
    An integer `seed` (optional) reseeds numpy's global RNG first.
    """
    if seed is not None:
        assert isinstance(seed, int)
        np.random.seed(seed)
    if isinstance(size, int):
        return np.random.choice(size, size, replace=False)
    return [np.random.choice(dim, dim, replace=False) for dim in size]
# ==========================================================
# Polynomial Features
# ==========================================================
class PolynomialFeatures(Preprocess):
    """Compute the polynomial features of the input array.
    This code was copied and modified from sklearn's
    implementation.
    """

    def __init__(self, degree=2, interaction_only=False, include_bias=True):
        # degree: maximum total degree of the generated terms.
        # interaction_only: if True, skip repeated factors (e.g. x0*x0).
        # include_bias: if True, the first output column is the constant term.
        self.degree_ = degree
        self.interaction_only_ = interaction_only
        self.include_bias_ = include_bias
        self.n_input_features_ = None
        self.n_output_features_ = None

    # @property
    # def powers_(self):
    #     combinations = self._combinations(self.n_input_features_, self.degree_,
    #                                       self.interaction_only_,
    #                                       self.include_bias_)
    #     return np.vstack(np.bincount(c, minlength=self.n_input_features_)
    #                      for c in combinations)

    @staticmethod
    def _combinations(n_features, degree, interaction_only, include_bias):
        """Yield one index tuple per output feature (e.g. (0, 1) -> x0*x1)."""
        comb = (combinations if interaction_only else combinations_with_replacement)
        start = int(not include_bias)
        return chain.from_iterable(comb(list(range(n_features)), i)
                                   for i in range(start, degree + 1))

    def fit(self, matrix):
        """Record the input width and the resulting number of output features."""
        assert matrix.ndim == 2
        assert matrix.size > 0
        _, n_features = matrix.shape
        combos = self._combinations(n_features, self.degree_,
                                    self.interaction_only_,
                                    self.include_bias_)
        self.n_input_features_ = matrix.shape[1]
        self.n_output_features_ = sum(1 for _ in combos)
        return self

    def transform(self, matrix, copy=True):
        """Transform data to polynomial features
        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to transform, row by row.

        Returns
        -------
        XP : np.ndarray shape [n_samples, NP]
            The matrix of features, where NP is the number of polynomial
            features generated from the combination of inputs.
        """
        assert matrix.ndim == 2
        assert matrix.size > 0
        n_samples, n_features = matrix.shape
        if n_features != self.n_input_features_:
            raise ValueError("X shape does not match training shape")
        is_numeric_type = is_numeric_matrix(matrix)
        is_lexical_type = is_lexical_matrix(matrix)
        if is_lexical_type:
            # String matrices produce concatenated strings; size the
            # fixed-width dtype to fit a join of two longest entries.
            strs = matrix.reshape((matrix.size,))
            maxlen = max([len(s) for s in strs])
            dtype = "S{}".format(maxlen * 2 + 1)
        else:
            dtype = matrix.dtype
        # allocate output data
        poly_matrix = np.empty((n_samples, self.n_output_features_), dtype=dtype)
        combos = self._combinations(n_features, self.degree_,
                                    self.interaction_only_,
                                    self.include_bias_)
        for i, c in enumerate(combos):
            if is_numeric_type:
                # Product of the selected columns, per sample.
                poly_matrix[:, i] = matrix[:, c].prod(1)
            elif is_lexical_type:
                n_poly1_feats = n_features + int(self.include_bias_)
                # Degree>=2 terms are joined with '*'; degree<=1 terms plain.
                # NOTE(review): np.squeeze collapses the sample axis, so for
                # n_samples > 1 a single joined string is broadcast to the
                # whole column -- confirm this is the intended semantics.
                if i >= n_poly1_feats:
                    x = "*".join(np.squeeze(matrix[:, c]).tolist())
                else:
                    x = "".join(np.squeeze(matrix[:, c]).tolist())
                poly_matrix[:, i] = x
            else:
                raise TypeError("Unsupported matrix type {}".format(matrix.dtype))
        return poly_matrix

    def inverse_transform(self, matrix, copy=True):
        raise NotImplementedError("This method is not supported")
# ==========================================================
# Dummy Encoding
# ==========================================================
class DummyEncoder(Preprocess):
    """One-hot ("dummy") encode the categorical columns of a matrix.

    Wraps sklearn's OneHotEncoder and additionally tracks human-readable
    column labels for the expanded matrix.

    NOTE(review): relies on the legacy OneHotEncoder API (`n_values`,
    `categorical_features`, `feature_indices_`), which was removed in
    sklearn 0.22 -- this class requires an older sklearn.
    """

    def __init__(self, n_values, categorical_features, cat_columnlabels, noncat_columnlabels):
        from sklearn.preprocessing import OneHotEncoder
        if not isinstance(n_values, np.ndarray):
            n_values = np.array(n_values)
        if not isinstance(categorical_features, np.ndarray):
            categorical_features = np.array(categorical_features)
        # assert categorical_features.size > 0
        assert categorical_features.shape == n_values.shape
        # Binary features should be left as-is; only 3+-valued categoricals
        # are worth expanding.
        for nv in n_values:
            if nv <= 2:
                raise Exception("Categorical features must have 3+ labels")
        self.n_values = n_values                      # cardinality per categorical column
        self.cat_columnlabels = cat_columnlabels      # labels of categorical columns
        self.noncat_columnlabels = noncat_columnlabels  # labels of the remaining columns
        self.encoder = OneHotEncoder(
            n_values=n_values, categorical_features=categorical_features, sparse=False)
        self.new_labels = None        # post-encoding column labels, set in fit()
        self.cat_idxs_old = categorical_features  # original categorical column indices

    def fit(self, matrix):
        """Fit the underlying encoder and derive the expanded column labels."""
        self.encoder.fit(matrix)
        # determine new columnlabels
        # categorical variables are done in order specified by categorical_features
        new_labels = []
        for i, cat_label in enumerate(self.cat_columnlabels):
            low = self.encoder.feature_indices_[i]
            high = self.encoder.feature_indices_[i + 1]
            for j in range(low, high):
                # eg the categorical variable named cat_var with 5 possible values
                # turns into 0/1 variables named cat_var____0, ..., cat_var____4
                new_labels.append(cat_label + "____" + str(j - low))
        # according to sklearn documentation,
        # "non-categorical features are always stacked to the right of the matrix"
        # by observation, it looks like the non-categorical features' relative order is preserved
        # BUT: there is no guarantee made about that behavior!
        # We either trust OneHotEncoder to be sensible, or look for some other way
        new_labels += self.noncat_columnlabels
        self.new_labels = new_labels

    def transform(self, matrix, copy=True):
        """Return the dummy-encoded matrix (categoricals expanded to 0/1)."""
        # actually transform the matrix
        matrix_encoded = self.encoder.transform(matrix)
        return matrix_encoded

    def fit_transform(self, matrix, copy=True):
        """Convenience: fit() followed by transform()."""
        self.fit(matrix)
        return self.transform(matrix)

    def inverse_transform(self, matrix, copy=True):
        """Collapse a dummy-encoded matrix back to the original column layout.

        Each group of 0/1 columns is replaced by the index of the column that
        is 1; non-categorical columns are copied back into their original
        positions.
        """
        n_values = self.n_values
        # If there are no categorical variables, no transformation happened.
        if len(n_values) == 0:
            return matrix
        # Otherwise, this is a dummy-encoded matrix. Transform it back to original form.
        n_features = matrix.shape[-1] - self.encoder.feature_indices_[-1] + len(n_values)
        noncat_start_idx = self.encoder.feature_indices_[-1]
        inverted_matrix = np.empty((matrix.shape[0], n_features))
        cat_idx = 0
        noncat_idx = 0
        for i in range(n_features):
            if i in self.cat_idxs_old:
                # Recover the categorical value: the offset j of the dummy
                # column that is set to 1 within this feature's group.
                new_col = np.ones((matrix.shape[0],))
                start_idx = self.encoder.feature_indices_[cat_idx]
                for j in range(n_values[cat_idx]):
                    col = matrix[:, start_idx + j]
                    new_col[col == 1] = j
                cat_idx += 1
            else:
                new_col = np.array(matrix[:, noncat_start_idx + noncat_idx])
                noncat_idx += 1
            inverted_matrix[:, i] = new_col
        return inverted_matrix

    def total_dummies(self):
        """Total number of 0/1 columns produced by the categorical expansion."""
        return sum(self.n_values)
def consolidate_columnlabels(columnlabels):
    """Collapse dummy-encoded column labels back to their original names.

    Labels of the form ``name____<k>`` (as produced by DummyEncoder) are
    reduced to a single ``name`` entry; all other labels pass through
    unchanged. Order of first appearance is preserved.
    """
    import re
    # Labels created by the dummy encoder look like "<name>____<digit(s)>".
    dummy_pattern = re.compile(r'(.*)____\d+')
    consolidated = []
    seen_categoricals = set()  # avoid emitting a categorical name twice
    for label in columnlabels:
        match = dummy_pattern.match(label)
        if match is None:
            # Non-categorical column: keep as-is.
            consolidated.append(label)
        else:
            original_name = match.group(1)
            if original_name not in seen_categoricals:
                seen_categoricals.add(original_name)
                consolidated.append(original_name)
    return consolidated
def fix_scaler(scaler, encoder, params):
    """Overwrite a fitted scaler's statistics for dummy-encoded columns.

    Each categorical parameter's columns get the theoretical
    Bernoulli(p=0.5) moments instead of the data-estimated ones;
    non-categorical columns are left untouched. ``scaler`` is modified
    in place (mean_, var_, scale_).
    """
    flip_prob = 0.5
    means = scaler.mean_
    variances = scaler.var_
    col = 0       # current column in the encoded feature space
    cat_pos = 0   # index into the encoder's categorical metadata
    for param in params:
        if not param.iscategorical:
            col += 1
            continue
        if param.isboolean:
            # Booleans occupy a single 0/1 column and are not tracked
            # in the encoder's n_values.
            width = 1
        else:
            # Sanity-check alignment with the encoder's column layout.
            assert encoder.xform_start_indices[cat_pos] == col
            width = encoder.n_values[cat_pos]
            cat_pos += 1
        dummy_mean = width * flip_prob
        dummy_var = dummy_mean * (1 - flip_prob)
        means[col: col + width] = dummy_mean
        variances[col: col + width] = dummy_var
        col += width
    scaler.mean_ = means
    scaler.var_ = variances
    scaler.scale_ = np.sqrt(variances)
def get_min_max(params, encoder=None):
    """Return per-column (mins, maxs) bounds in the encoded feature space.

    Dummy-encoded / boolean categorical columns are bounded by [0, 1];
    numeric parameters use their ``true_range``. If ``encoder`` is None
    the parameters are assumed to map one-to-one onto columns.
    """
    if encoder is None:
        n_values = np.array([])
        cat_start_idxs = np.array([])
        nfeats = len(params)
    else:
        n_values = encoder.n_values
        cat_start_idxs = encoder.xform_start_indices
        # Each categorical feature expands from 1 column to n_values[i].
        nfeats = len(params) - n_values.size + np.sum(n_values)
    mins = np.empty((nfeats,))
    maxs = np.empty((nfeats,))
    col = 0       # current column in the encoded feature space
    cat_pos = 0   # index into the encoder's categorical metadata
    for param in params:
        if param.iscategorical:
            if param.isboolean:
                width = 1
            else:
                # Sanity-check alignment with the encoder's layout.
                assert cat_start_idxs[cat_pos] == col
                width = n_values[cat_pos]
                cat_pos += 1
            # Dummy / boolean columns take values in {0, 1}.
            mins[col: col + width] = 0
            maxs[col: col + width] = 1
            col += width
        else:
            mins[col] = param.true_range[0]  # valid_values[0]
            maxs[col] = param.true_range[1]  # valid_values[-1]
            col += 1
    return mins, maxs
# ==========================================================
# Min-max scaler
# ==========================================================
class MinMaxScaler(Preprocess):
    """Scales features into [0, 1], wrapping sklearn's MinMaxScaler.

    May be seeded with known ``mins``/``maxs`` bounds at construction;
    because all fitting goes through ``partial_fit``, a later ``fit()``
    call *extends* the seeded bounds rather than replacing them.
    """
    def __init__(self, mins=None, maxs=None):
        self.scaler_ = SklearnMinMaxScaler()
        if mins is not None:
            assert isinstance(mins, np.ndarray)
            if mins.ndim == 1:
                mins = mins.reshape(1, -1)
            # Seed the scaler's data_min_ bound with the known minimums.
            self.scaler_.partial_fit(mins)
            self.mins_ = mins
        else:
            self.mins_ = None
        if maxs is not None:
            assert isinstance(maxs, np.ndarray)
            if maxs.ndim == 1:
                maxs = maxs.reshape(1, -1)
            # Seed the scaler's data_max_ bound with the known maximums.
            self.scaler_.partial_fit(maxs)
            self.maxs_ = maxs
        else:
            self.maxs_ = None
        # Only usable for transform() once both bounds are known.
        self.fitted_ = self.mins_ is not None and self.maxs_ is not None
    def fit(self, matrix):
        """Update (extend) the min/max bounds from ``matrix``."""
        if matrix.ndim == 1:
            matrix = matrix.reshape(1, -1)
        self.scaler_.partial_fit(matrix)
        self.mins_ = self.scaler_.data_min_
        self.maxs_ = self.scaler_.data_max_
        self.fitted_ = True
        return self
    def transform(self, matrix, copy=True):
        """Scale ``matrix`` into [0, 1] using the learned bounds.

        NOTE(review): the ``copy`` argument is accepted but ignored.
        """
        if not self.fitted_:
            raise Exception("Model not fitted!")
        if matrix.ndim == 1:
            matrix = matrix.reshape(1, -1)
        return self.scaler_.transform(matrix)
    def inverse_transform(self, matrix, copy=True):
        """Undo the scaling.

        NOTE(review): unlike transform(), this does not check fitted_.
        """
        if matrix.ndim == 1:
            matrix = matrix.reshape(1, -1)
        return self.scaler_.inverse_transform(matrix)

View File

@@ -0,0 +1,5 @@
#
# OtterTune - __init__.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#

View File

@@ -0,0 +1,91 @@
#
# OtterTune - test_cluster.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
import unittest
import numpy as np
from sklearn import datasets
from analysis.cluster import KMeans, KMeansClusters, create_kselection_model
class TestKMeans(unittest.TestCase):
    """Regression tests for analysis.cluster.KMeans on the Iris dataset.

    The expected values below are pinned to K=5 with n_init=50 and
    random_state=42; they may shift if sklearn's KMeans implementation
    changes between versions.
    """
    @classmethod
    def setUpClass(cls):
        super(TestKMeans, cls).setUpClass()
        # Fit once for all tests: 5 clusters on the 150x4 Iris matrix.
        iris = datasets.load_iris()
        cls.model = KMeans()
        cls.model.fit(iris.data, 5, iris.target,
                      estimator_params={'n_init': 50, 'random_state': 42})
    def test_kmeans_n_clusters(self):
        self.assertEqual(self.model.n_clusters_, 5)
    def test_kmeans_cluster_inertia(self):
        # Sum of squared distances to the closest center (2 decimals).
        self.assertAlmostEqual(self.model.cluster_inertia_, 46.535, 2)
    def test_kmeans_cluster_labels(self):
        # Pinned cluster assignment for each of the 150 Iris samples.
        expected_labels = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 2, 3, 3, 3,
                           2, 3, 2, 2, 3, 2, 3, 2, 3, 3, 2, 3, 2, 3, 2, 3, 3, 3, 3,
                           3, 3, 3, 2, 2, 2, 2, 3, 2, 3, 3, 3, 2, 2, 2, 3, 2, 2, 2,
                           2, 2, 3, 2, 2, 4, 3, 0, 4, 4, 0, 2, 0, 4, 0, 4, 4, 4, 3,
                           4, 4, 4, 0, 0, 3, 4, 3, 0, 3, 4, 0, 3, 3, 4, 0, 0, 0, 4,
                           3, 3, 0, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 3]
        for lab_actual, lab_expected in zip(self.model.cluster_labels_, expected_labels):
            self.assertEqual(lab_actual, lab_expected)
    def test_kmeans_sample_labels(self):
        # sample_labels_ should echo the labels passed into fit().
        for lab_actual, lab_expected in zip(self.model.sample_labels_, datasets.load_iris().target):
            self.assertEqual(lab_actual, lab_expected)
    def test_kmeans_cluster_centers(self):
        # Pinned 5x4 cluster-center coordinates (2-decimal tolerance).
        expected_centers = [[7.475, 3.125, 6.300, 2.050],
                            [5.006, 3.418, 1.464, 0.244],
                            [5.508, 2.600, 3.908, 1.204],
                            [6.207, 2.853, 4.746, 1.564],
                            [6.529, 3.058, 5.508, 2.162]]
        for row_actual, row_expected in zip(self.model.cluster_centers_, expected_centers):
            for val_actual, val_expected in zip(row_actual, row_expected):
                self.assertAlmostEqual(val_actual, val_expected, 2)
class TestKSelection(unittest.TestCase):
    """Tests the three K-selection strategies on Iris KMeans models.

    Expected optimal cluster counts are pinned to the fixed seeds below.
    """
    def setUp(self):
        # Seed numpy's RNG before every test; presumably needed because
        # at least one selection method (gap-statistic) draws random
        # reference samples -- confirm in analysis.cluster.
        np.random.seed(seed=42)
    @classmethod
    def setUpClass(cls):
        super(TestKSelection, cls).setUpClass()
        # Load Iris data
        iris = datasets.load_iris()
        cls.matrix = iris.data
        # Fit one KMeans model per K in [1, 10].
        cls.kmeans_models = KMeansClusters()
        cls.kmeans_models.fit(cls.matrix,
                              min_cluster=1,
                              max_cluster=10,
                              sample_labels=iris.target,
                              estimator_params={'n_init': 50, 'random_state': 42})
    def test_detk_optimal_num_clusters(self):
        # Compute optimal # cluster using det-k
        detk = create_kselection_model("det-k")
        detk.fit(self.matrix, self.kmeans_models.cluster_map_)
        self.assertEqual(detk.optimal_num_clusters_, 2)
    def test_gap_statistic_optimal_num_clusters(self):
        # Compute optimal # cluster using gap-statistics
        gap = create_kselection_model("gap-statistic")
        gap.fit(self.matrix, self.kmeans_models.cluster_map_)
        self.assertEqual(gap.optimal_num_clusters_, 8)
    def test_silhouette_optimal_num_clusters(self):
        # Compute optimal # cluster using Silhouette Analysis
        sil = create_kselection_model("s-score")
        sil.fit(self.matrix, self.kmeans_models.cluster_map_)
        self.assertEqual(sil.optimal_num_clusters_, 2)

View File

@@ -0,0 +1,116 @@
#
# OtterTune - test_constraints.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
import unittest
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from analysis.constraints import ParamConstraintHelper
from analysis.preprocessing import DummyEncoder
class ConstraintHelperTestCase(unittest.TestCase):
    """Tests ParamConstraintHelper's scaling round-trip, constraint
    projection for dummy-encoded categoricals, and random flipping of
    categorical features."""
    def test_scale_rescale(self):
        # Scaling then un-scaling a row must round-trip (up to float
        # precision, hence the rounding below).
        X = datasets.load_boston()['data']
        X_scaler = StandardScaler()
        # params hard-coded for test (messy to import constant from website module)
        constraint_helper = ParamConstraintHelper(X_scaler, None,
                                                  init_flip_prob=0.3,
                                                  flip_prob_decay=0.5)
        X_scaled = X_scaler.fit_transform(X)
        # there may be some floating point imprecision between scaling and rescaling
        row_unscaled = np.round(constraint_helper._handle_scaling(X_scaled[0], True), 10)  # pylint: disable=protected-access
        self.assertTrue(np.all(X[0] == row_unscaled))
        row_rescaled = constraint_helper._handle_rescaling(row_unscaled, True)  # pylint: disable=protected-access
        self.assertTrue(np.all(X_scaled[0] == row_rescaled))
    def test_apply_constraints_unscaled(self):
        # A fractional one-hot group [0.1, 0.2, 0.3] must be projected to
        # a valid encoding (argmax -> [0, 0, 1]); the numeric column is
        # left unchanged.
        n_values = [3]
        categorical_features = [0]
        encoder = DummyEncoder(n_values, categorical_features, ['a'], [])
        encoder.fit([[0, 17]])
        X_scaler = StandardScaler()
        constraint_helper = ParamConstraintHelper(X_scaler, encoder,
                                                  init_flip_prob=0.3,
                                                  flip_prob_decay=0.5)
        X = [0.1, 0.2, 0.3, 17]
        X_expected = [0, 0, 1, 17]
        X_corrected = constraint_helper.apply_constraints(X, scaled=False, rescale=False)
        self.assertTrue(np.all(X_corrected == X_expected))
    def test_apply_constraints(self):
        # Same projection, but in scaled space: a small perturbation of a
        # valid encoding must be snapped back to the original row.
        n_values = [3]
        categorical_features = [0]
        encoder = DummyEncoder(n_values, categorical_features, ['a'], [])
        encoder.fit([[0, 17]])
        X_scaler = StandardScaler()
        X = np.array([[0, 0, 1, 17], [1, 0, 0, 17]], dtype=float)
        X_scaled = X_scaler.fit_transform(X)
        constraint_helper = ParamConstraintHelper(X_scaler, encoder,
                                                  init_flip_prob=0.3,
                                                  flip_prob_decay=0.5)
        row = X_scaled[0]
        new_row = np.copy(row)
        new_row[0: 3] += 0.1  # should still represent [0, 0, 1] encoding
        row_corrected = constraint_helper.apply_constraints(new_row)
        self.assertTrue(np.all(row == row_corrected))
    # tests that repeatedly applying randomize_categorical_features
    # always results in valid configurations of categorical dummy encodings
    # and will lead to all possible values of categorical variables being tried
    def test_randomize_categorical_features(self):
        # variable 0 is categorical, 3 values
        # variable 1 is not categorical
        # variable 2 is categorical, 4 values
        cat_var_0_levels = 3
        cat_var_2_levels = 4
        cat_var_0_idx = 0
        cat_var_2_idx = 2
        n_values = [cat_var_0_levels, cat_var_2_levels]
        categorical_features = [cat_var_0_idx, cat_var_2_idx]
        encoder = DummyEncoder(n_values, categorical_features, ['a', 'b'], [])
        encoder.fit([[0, 17, 0]])
        X_scaler = StandardScaler()
        constraint_helper = ParamConstraintHelper(X_scaler, encoder,
                                                  init_flip_prob=0.3,
                                                  flip_prob_decay=0.5)
        # row is a sample encoded set of features,
        # note that the non-categorical variable is on the right
        row = np.array([0, 0, 1, 1, 0, 0, 0, 17], dtype=float)
        trials = 20
        cat_var_0_counts = np.zeros(cat_var_0_levels)
        cat_var_2_counts = np.zeros(cat_var_2_levels)
        for _ in range(trials):
            # possibly flip the categorical features
            row = constraint_helper.randomize_categorical_features(row, scaled=False, rescale=False)
            # check that result is valid for cat_var_0
            cat_var_0_dummies = row[0: cat_var_0_levels]
            self.assertTrue(np.all(np.logical_or(cat_var_0_dummies == 0, cat_var_0_dummies == 1)))
            self.assertEqual(np.sum(cat_var_0_dummies), 1)
            cat_var_0_counts[np.argmax(cat_var_0_dummies)] += 1
            # check that result is valid for cat_var_2
            cat_var_2_dummies = row[cat_var_0_levels: cat_var_0_levels + cat_var_2_levels]
            self.assertTrue(np.all(np.logical_or(cat_var_2_dummies == 0, cat_var_2_dummies == 1)))
            self.assertEqual(np.sum(cat_var_2_dummies), 1)
            cat_var_2_counts[np.argmax(cat_var_2_dummies)] += 1
            # the non-categorical value must never be touched
            self.assertEqual(row[-1], 17)
        # over 20 trials every level of both variables should appear
        for ct in cat_var_0_counts:
            self.assertTrue(ct > 0)
        for ct in cat_var_2_counts:
            self.assertTrue(ct > 0)
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,61 @@
#
# OtterTune - test_gpr.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
import unittest
from sklearn import datasets
from analysis.gp import GPRNP
from analysis.gp_tf import GPR
# test numpy version GPR
class TestGPRNP(unittest.TestCase):
    """Regression tests for the NumPy GPR implementation on the Boston
    dataset; expected values are pinned to length_scale=1.0,
    magnitude=1.0, ridge=1.0."""
    @classmethod
    def setUpClass(cls):
        super(TestGPRNP, cls).setUpClass()
        # Train on the first 500 rows, predict on the remaining 6.
        boston = datasets.load_boston()
        data = boston['data']
        X_train = data[0:500]
        X_test = data[500:]
        y_train = boston['target'][0:500].reshape(500, 1)
        cls.model = GPRNP(length_scale=1.0, magnitude=1.0)
        cls.model.fit(X_train, y_train, ridge=1.0)
        cls.gpr_result = cls.model.predict(X_test)
    def test_gprnp_ypreds(self):
        ypreds_round = [round(x[0], 4) for x in self.gpr_result.ypreds]
        expected_ypreds = [0.0181, 0.0014, 0.0006, 0.0015, 0.0039, 0.0014]
        self.assertEqual(ypreds_round, expected_ypreds)
    def test_gprnp_sigmas(self):
        sigmas_round = [round(x[0], 4) for x in self.gpr_result.sigmas]
        expected_sigmas = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
        self.assertEqual(sigmas_round, expected_sigmas)
# test Tensorflow version GPR
class TestGPRTF(unittest.TestCase):
    """Regression tests for the TensorFlow GPR implementation on the
    Boston dataset.

    Expected values are pinned to length_scale=1.0, magnitude=1.0,
    ridge=1.0 and must match the NumPy implementation (TestGPRNP).
    """
    @classmethod
    def setUpClass(cls):
        super(TestGPRTF, cls).setUpClass()
        # Train on the first 500 rows, predict on the remaining 6.
        boston = datasets.load_boston()
        data = boston['data']
        X_train = data[0:500]
        X_test = data[500:]
        y_train = boston['target'][0:500].reshape(500, 1)
        cls.model = GPR(length_scale=1.0, magnitude=1.0)
        cls.model.fit(X_train, y_train, ridge=1.0)
        cls.gpr_result = cls.model.predict(X_test)
    def test_gprtf_ypreds(self):
        # Renamed from test_gprnp_ypreds: these tests exercise the
        # TensorFlow implementation, not the NumPy one.
        ypreds_round = [round(x[0], 4) for x in self.gpr_result.ypreds]
        expected_ypreds = [0.0181, 0.0014, 0.0006, 0.0015, 0.0039, 0.0014]
        self.assertEqual(ypreds_round, expected_ypreds)
    def test_gprtf_sigmas(self):
        sigmas_round = [round(x[0], 4) for x in self.gpr_result.sigmas]
        expected_sigmas = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
        self.assertEqual(sigmas_round, expected_sigmas)

View File

@@ -0,0 +1,83 @@
#
# OtterTune - test_preprocessing.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
import unittest
import numpy as np
from analysis.preprocessing import DummyEncoder, consolidate_columnlabels
class TestDummyEncoder(unittest.TestCase):
    """Tests DummyEncoder's encode / label / inverse-transform behavior."""
    def test_no_categoricals(self):
        # With no categorical features the matrix and labels pass through
        # unchanged.
        X = [[1, 2, 3], [4, 5, 6]]
        n_values = []
        categorical_features = []
        cat_columnlabels = []
        noncat_columnlabels = ['a', 'b', 'c']
        enc = DummyEncoder(n_values, categorical_features,
                           cat_columnlabels, noncat_columnlabels)
        X_encoded = enc.fit_transform(X)
        new_labels = enc.new_labels
        self.assertTrue(np.all(X == X_encoded))
        self.assertEqual(noncat_columnlabels, new_labels)
    def test_simple_categorical(self):
        # A leading 3-level categorical becomes three 0/1 columns named
        # label____0..label____2, followed by the numeric columns.
        X = [[0, 1, 2], [1, 1, 2], [2, 1, 2]]
        n_values = [3]
        categorical_features = [0]
        cat_columnlabels = ['label']
        noncat_columnlabels = ['a', 'b']
        X_expected = [[1, 0, 0, 1, 2], [0, 1, 0, 1, 2], [0, 0, 1, 1, 2]]
        new_labels_expected = ['label____0', 'label____1', 'label____2', 'a', 'b']
        enc = DummyEncoder(n_values, categorical_features,
                           cat_columnlabels, noncat_columnlabels)
        X_encoded = enc.fit_transform(X)
        new_labels = enc.new_labels
        self.assertTrue(np.all(X_expected == X_encoded))
        self.assertEqual(new_labels_expected, new_labels)
    def test_mixed_categorical(self):
        # Categorical in the middle: its dummies still come first in the
        # encoded output, non-categorical columns are stacked to the right.
        X = [[1, 0, 2], [1, 1, 2], [1, 2, 2]]
        n_values = [3]
        categorical_features = [1]
        cat_columnlabels = ['label']
        noncat_columnlabels = ['a', 'b']
        X_expected = [[1, 0, 0, 1, 2], [0, 1, 0, 1, 2], [0, 0, 1, 1, 2]]
        new_labels_expected = ['label____0', 'label____1', 'label____2', 'a', 'b']
        enc = DummyEncoder(n_values, categorical_features,
                           cat_columnlabels, noncat_columnlabels)
        X_encoded = enc.fit_transform(X)
        new_labels = enc.new_labels
        self.assertTrue(np.all(X_expected == X_encoded))
        self.assertEqual(new_labels_expected, new_labels)
    def test_consolidate(self):
        # Dummy labels collapse back to one entry per original column.
        labels = ['label1____0', 'label1____1', 'label2____0', 'label2____1', 'noncat']
        consolidated = consolidate_columnlabels(labels)
        expected = ['label1', 'label2', 'noncat']
        self.assertEqual(expected, consolidated)
    def test_inverse_transform(self):
        # encode -> decode must round-trip to the original matrix.
        X = [[1, 0, 2], [1, 1, 2], [1, 2, 2]]
        n_values = [3]
        categorical_features = [1]
        cat_columnlabels = ['label']
        noncat_columnlabels = ['a', 'b']
        X_expected = [[1, 0, 0, 1, 2], [0, 1, 0, 1, 2], [0, 0, 1, 1, 2]]
        enc = DummyEncoder(n_values, categorical_features,
                           cat_columnlabels, noncat_columnlabels)
        X_encoded = enc.fit_transform(X)
        self.assertTrue(np.all(X_encoded == X_expected))
        X_decoded = enc.inverse_transform(X_encoded)
        self.assertTrue(np.all(X == X_decoded))
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()

106
server/analysis/util.py Normal file
View File

@@ -0,0 +1,106 @@
#
# OtterTune - util.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Oct 24, 2017
@author: dva
'''
import logging
from numbers import Number
import contextlib
import datetime
import numpy as np
def get_analysis_logger(name, level=logging.INFO):
    """Return a logger for analysis modules with a console handler attached.

    Also sets numpy's global float print format as a side effect (as the
    original did).

    Args:
        name: logger name, typically the calling module's __name__.
        level: logging level to apply (default: logging.INFO).

    Returns:
        The configured logging.Logger instance.
    """
    logger = logging.getLogger(name)
    # Fix: the original attached a new StreamHandler on *every* call, so
    # calling this twice for the same name duplicated every log line.
    # Only attach a handler if the logger doesn't already have one.
    if not logger.handlers:
        log_handler = logging.StreamHandler()
        log_formatter = logging.Formatter(
            fmt='%(asctime)s [%(funcName)s:%(lineno)03d] %(levelname)-5s: %(message)s',
            datefmt='%m-%d-%Y %H:%M:%S'
        )
        log_handler.setFormatter(log_formatter)
        logger.addHandler(log_handler)
    logger.setLevel(level)
    np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
    return logger
# Module-level logger shared by the helpers in this module.
LOG = get_analysis_logger(__name__)
def stdev_zero(data, axis=None, nearzero=1e-8):
    """Return a boolean mask of entries whose standard deviation along
    ``axis`` is (near) zero.

    Fix: the original unconditionally called np.expand_dims(..., axis),
    which raises for the default axis=None, making the default unusable.
    With axis=None a single bool for the whole array is now returned.

    Args:
        data: numpy array to inspect.
        axis: axis along which to compute the std, or None for all entries.
        nearzero: threshold below which a std counts as zero.
    """
    mstd = data.std(axis=axis)
    if axis is None:
        # Scalar std over the whole array -> a single bool.
        return bool(np.abs(mstd) < nearzero)
    # Re-insert the reduced axis (as the original did), then squeeze the
    # mask back down so the result shape matches the original behavior.
    mstd = np.expand_dims(mstd, axis=axis)
    return (np.abs(mstd) < nearzero).squeeze()
def get_datetime():
    """Return the current UTC time as a timezone-naive datetime.

    NOTE(review): utcnow() yields a naive value; callers here (e.g.
    TimerStruct) only subtract two such values, which is safe as long as
    naive and aware datetimes are never mixed.
    """
    return datetime.datetime.utcnow()
class TimerStruct(object):
    """A simple wall-clock stopwatch.

    Call start(), then stop(); elapsed_seconds then reports the frozen
    duration between the two. While the timer is still running (stop()
    not yet called), elapsed_seconds reports the time since start().
    """
    def __init__(self):
        self.__start_time = 0.0
        self.__stop_time = 0.0
        self.__elapsed = None

    @property
    def elapsed_seconds(self):
        # Once stop() has run, report the frozen duration; otherwise
        # measure against "now".
        if self.__elapsed is not None:
            return self.__elapsed.total_seconds()
        now = datetime.datetime.utcnow()
        return (now - self.__start_time).total_seconds()

    def start(self):
        self.__start_time = datetime.datetime.utcnow()

    def stop(self):
        self.__stop_time = datetime.datetime.utcnow()
        self.__elapsed = self.__stop_time - self.__start_time
@contextlib.contextmanager
def stopwatch(message=None):
    """Context manager that times the execution of its body.

    Yields a TimerStruct so the caller can read elapsed_seconds; if
    `message` is given, the total elapsed time is logged on exit (even
    when the body raises, via the finally block).
    """
    ts = TimerStruct()
    ts.start()
    try:
        yield ts
    finally:
        ts.stop()
        if message is not None:
            LOG.info('Total elapsed_seconds time for %s: %.3fs', message, ts.elapsed_seconds)
def get_data_base(arr):
    """For a given Numpy array, walk the ``.base`` chain and return the
    array that actually owns the underlying data buffer."""
    owner = arr
    while isinstance(owner.base, np.ndarray):
        owner = owner.base
    return owner
def arrays_share_data(x, y):
    """True iff x and y are ultimately backed by the same data buffer
    (i.e. their base-chain walks end at the identical owning array)."""
    base_x = x
    while isinstance(base_x.base, np.ndarray):
        base_x = base_x.base
    base_y = y
    while isinstance(base_y.base, np.ndarray):
        base_y = base_y.base
    return base_x is base_y
def array_tostring(arr):
    """Return a new array of the same shape whose entries are the str()
    representations of the originals."""
    original_shape = arr.shape
    as_strings = np.array([str(item) for item in arr.ravel()])
    return as_strings.reshape(original_shape)
def is_numeric_matrix(matrix):
    """True if the (non-empty) matrix holds numeric entries, judged by
    inspecting its first element only."""
    assert matrix.size > 0
    first_entry = matrix.ravel()[0]
    return isinstance(first_entry, Number)
def is_lexical_matrix(matrix):
    """True if the (non-empty) matrix holds string entries, judged by
    inspecting its first element only."""
    assert matrix.size > 0
    first_entry = matrix.ravel()[0]
    return isinstance(first_entry, str)