Initial commit with BSL
server/analysis/__init__.py (new file, 5 lines)
@@ -0,0 +1,5 @@
#
# OtterTune - __init__.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
server/analysis/base.py (new file, 19 lines)
@@ -0,0 +1,19 @@
#
# OtterTune - base.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Oct 25, 2017

@author: dva
'''

from abc import ABCMeta, abstractmethod


class ModelBase(object, metaclass=ABCMeta):

    @abstractmethod
    def _reset(self):
        pass
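For reference, a minimal sketch (not part of the commit) of the contract this base class defines: every model in the package only has to implement _reset(), which erases any fitted state. The MeanModel name and behavior below are hypothetical, purely to illustrate the pattern the concrete models that follow all use.

import numpy as np
from analysis.base import ModelBase


class MeanModel(ModelBase):
    """Hypothetical example model: fits the per-feature mean of X."""

    def __init__(self):
        self.mean_ = None

    def _reset(self):
        # Erase the fitted state, mirroring the _reset() contract above
        self.mean_ = None

    def fit(self, X):
        self._reset()
        self.mean_ = np.asarray(X).mean(axis=0)
        return self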
server/analysis/cluster.py (new file, 793 lines)
@@ -0,0 +1,793 @@
#
# OtterTune - cluster.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Jul 4, 2016

@author: dva
'''
from abc import ABCMeta, abstractproperty
from collections import OrderedDict

import os
import json
import copy
import numpy as np
import matplotlib.pyplot as plt

from scipy.spatial.distance import cdist
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans as SklearnKMeans
from celery.utils.log import get_task_logger

from .base import ModelBase

# Log debug messages
LOGGER = get_task_logger(__name__)


class KMeans(ModelBase):
    """
    KMeans:

    Fits an Sklearn KMeans model to X.


    See also
    --------
    http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html


    Attributes
    ----------
    n_clusters_ : int
        The number of clusters, K

    cluster_inertia_ : float
        Sum of squared distances of samples to their closest cluster center

    cluster_labels_ : array, [n_clusters_]
        Labels indicating the membership of each point

    cluster_centers_ : array, [n_clusters, n_features]
        Coordinates of cluster centers

    sample_labels_ : array, [n_samples]
        Labels for each of the samples in X

    sample_distances_ : array, [n_samples]
        The distance between each sample point and its cluster's center


    Constants
    ---------
    SAMPLE_CUTOFF_ : int
        If n_samples > SAMPLE_CUTOFF_ then sample distances
        are NOT recorded
    """

    SAMPLE_CUTOFF_ = 1000

    def __init__(self):
        self.model_ = None
        self.n_clusters_ = None
        self.sample_labels_ = None
        self.sample_distances_ = None

    @property
    def cluster_inertia_(self):
        # Sum of squared distances of samples to their closest cluster center
        return None if self.model_ is None else \
            self.model_.inertia_

    @property
    def cluster_labels_(self):
        # Cluster membership labels for each point
        return None if self.model_ is None else \
            copy.deepcopy(self.model_.labels_)

    @property
    def cluster_centers_(self):
        # Coordinates of the cluster centers
        return None if self.model_ is None else \
            copy.deepcopy(self.model_.cluster_centers_)

    def _reset(self):
        """Resets all attributes (erases the model)"""
        self.model_ = None
        self.n_clusters_ = None
        self.sample_labels_ = None
        self.sample_distances_ = None

    def fit(self, X, K, sample_labels=None, estimator_params=None):
        """Fits a Sklearn KMeans model to X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        K : int
            The number of clusters.

        sample_labels : array-like, shape (n_samples), optional
            Labels for each of the samples in X.

        estimator_params : dict, optional
            The parameters to pass to the KMeans estimators.


        Returns
        -------
        self
        """
        self._reset()
        # Note: previously set n_init=50
        self.model_ = SklearnKMeans(K)
        if estimator_params is not None:
            assert isinstance(estimator_params, dict)
            self.model_.set_params(**estimator_params)

        # Compute Kmeans model
        self.model_.fit(X)
        if sample_labels is None:
            sample_labels = ["sample_{}".format(i) for i in range(X.shape[0])]
        assert len(sample_labels) == X.shape[0]
        self.sample_labels_ = np.array(sample_labels)
        self.n_clusters_ = K

        # Record sample label/distance from its cluster center
        self.sample_distances_ = OrderedDict()
        for cluster_label in range(self.n_clusters_):
            assert cluster_label not in self.sample_distances_
            member_rows = X[self.cluster_labels_ == cluster_label, :]
            member_labels = self.sample_labels_[self.cluster_labels_ == cluster_label]
            centroid = np.expand_dims(self.cluster_centers_[cluster_label], axis=0)

            # "All clusters must have at least 1 member!"
            if member_rows.shape[0] == 0:
                return None

            # Calculate distance between each member row and the current cluster
            dists = np.empty(member_rows.shape[0])
            dist_labels = []
            for j, (row, label) in enumerate(zip(member_rows, member_labels)):
                dists[j] = cdist(np.expand_dims(row, axis=0), centroid, "euclidean").squeeze()
                dist_labels.append(label)

            # Sort the distances/labels in ascending order
            sort_order = np.argsort(dists)
            dists = dists[sort_order]
            dist_labels = np.array(dist_labels)[sort_order]
            self.sample_distances_[cluster_label] = {
                "sample_labels": dist_labels,
                "distances": dists,
            }
        return self

    def get_closest_samples(self):
        """Returns a list of the labels of the samples that are located closest
        to their cluster's center.


        Returns
        ----------
        closest_samples : list
            A list of the sample labels that are located the closest to
            their cluster's center.
        """
        if self.sample_distances_ is None:
            raise Exception("No model has been fit yet!")

        return [samples['sample_labels'][0] for samples in list(self.sample_distances_.values())]

    def get_memberships(self):
        '''
        Return the memberships in each cluster
        '''
        memberships = OrderedDict()
        for cluster_label, samples in list(self.sample_distances_.items()):
            memberships[cluster_label] = OrderedDict(
                [(l, d) for l, d in zip(samples["sample_labels"], samples["distances"])])
        return json.dumps(memberships, indent=4)


class KMeansClusters(ModelBase):

    """
    KMeansClusters:

    Fits a KMeans model to X for clusters in the range [min_cluster_, max_cluster_].


    Attributes
    ----------
    min_cluster_ : int
        The minimum cluster size to fit a KMeans model to

    max_cluster_ : int
        The maximum cluster size to fit a KMeans model to

    cluster_map_ : dict
        A dictionary mapping the cluster size (K) to the KMeans
        model fitted to X with K clusters

    sample_labels_ : array, [n_samples]
        Labels for each of the samples in X
    """

    def __init__(self):
        self.min_cluster_ = None
        self.max_cluster_ = None
        self.cluster_map_ = None
        self.sample_labels_ = None

    def _reset(self):
        """Resets all attributes (erases the model)"""
        self.min_cluster_ = None
        self.max_cluster_ = None
        self.cluster_map_ = None
        self.sample_labels_ = None

    def fit(self, X, min_cluster, max_cluster, sample_labels=None, estimator_params=None):
        """Fits a KMeans model to X for each cluster in the range [min_cluster, max_cluster].

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        min_cluster : int
            The minimum cluster size to fit a KMeans model to.

        max_cluster : int
            The maximum cluster size to fit a KMeans model to.

        sample_labels : array-like, shape (n_samples), optional
            Labels for each of the samples in X.

        estimator_params : dict, optional
            The parameters to pass to the KMeans estimators.


        Returns
        -------
        self
        """
        self._reset()
        self.min_cluster_ = min_cluster
        self.max_cluster_ = max_cluster
        self.cluster_map_ = {}
        if sample_labels is None:
            # KMeans.fit() requires one label per sample (i.e., per row of X)
            sample_labels = ["sample_{}".format(i) for i in range(X.shape[0])]
        self.sample_labels_ = sample_labels
        for K in range(self.min_cluster_, self.max_cluster_ + 1):
            tmp = KMeans().fit(X, K, self.sample_labels_, estimator_params)
            if tmp is None:  # Set maximum cluster
                assert K > min_cluster, "min_cluster is too large for the model"
                self.max_cluster_ = K - 1
                break
            else:
                self.cluster_map_[K] = tmp

        return self

    def save(self, savedir):
        """Saves the KMeans model results

        Parameters
        ----------
        savedir : string
            Path to the directory to save the results in.
        """
        if self.cluster_map_ is None:
            raise Exception("No models have been fitted yet!")

        cluster_map = OrderedDict()
        inertias = []
        for K, model in sorted(self.cluster_map_.items()):
            cluster_map[K] = {
                "cluster_inertia": model.cluster_inertia_,
                "cluster_labels": model.cluster_labels_,
                "cluster_centers": model.cluster_centers_,
            }
            inertias.append(model.cluster_inertia_)

        # Save sum of squares plot (elbow curve)
        fig = plt.figure()
        plt.plot(list(cluster_map.keys()), inertias, '--o')
        plt.xlabel("Number of clusters (K)")
        plt.ylabel("Within sum of squares W_k")
        plt.title("Within Sum of Squares vs. Number of Clusters")
        fig.canvas.set_window_title(os.path.basename(savedir))
        savepath = os.path.join(savedir, "kmeans_sum_of_squares.pdf")
        plt.savefig(savepath, bbox_inches="tight")
        plt.close()

        # Save cluster memberships
        for K in range(self.min_cluster_, self.max_cluster_ + 1):
            savepath = os.path.join(savedir,
                                    "memberships_{}-clusters.json".format(K))
            members = self.cluster_map_[K].get_memberships()
            with open(savepath, "w") as f:
                f.write(members)


class KSelection(ModelBase, metaclass=ABCMeta):
    """KSelection:

    Abstract class for techniques that approximate the optimal
    number of clusters (K).


    Attributes
    ----------
    optimal_num_clusters_ : int
        An estimation of the optimal number of clusters K for
        a KMeans model fit to X

    clusters_ : array, [n_clusters]
        The sizes of the clusters

    name_ : string
        The name of this technique
    """

    NAME_ = None

    def __init__(self):
        self.optimal_num_clusters_ = None
        self.clusters_ = None

    def _reset(self):
        """Resets all attributes (erases the model)"""
        self.optimal_num_clusters_ = None
        self.clusters_ = None

    @abstractproperty
    def name_(self):
        pass

    def save(self, savedir):
        """Saves the estimation of the optimal # of clusters.

        Parameters
        ----------
        savedir : string
            Path to the directory to save the results in.
        """
        if self.optimal_num_clusters_ is None:
            raise Exception("Optimal number of clusters has not been computed!")

        # Save the computed optimal number of clusters
        savepath = os.path.join(savedir, self.name_ + "_optimal_num_clusters.txt")
        with open(savepath, "w") as f:
            f.write(str(self.optimal_num_clusters_))


class GapStatistic(KSelection):
    """GapStatistic:

    Approximates the optimal number of clusters (K).


    References
    ----------
    https://web.stanford.edu/~hastie/Papers/gap.pdf


    Attributes
    ----------
    optimal_num_clusters_ : int
        An estimation of the optimal number of clusters K for
        a KMeans model fit to X

    clusters_ : array, [n_clusters]
        The sizes of the clusters

    name_ : string
        The name of this technique

    log_wks_ : array, [n_clusters]
        The within-dispersion measures of X (log)

    log_wkbs_ : array, [n_clusters]
        The within-dispersion measures of the generated
        reference data sets

    khats_ : array, [n_clusters]
        The gap-statistic for each cluster
    """

    NAME_ = "gap-statistic"

    def __init__(self):
        super(GapStatistic, self).__init__()
        self.log_wks_ = None
        self.log_wkbs_ = None
        self.khats_ = None

    @property
    def name_(self):
        return self.NAME_

    def _reset(self):
        """Resets all attributes (erases the model)"""
        super(GapStatistic, self)._reset()
        self.log_wks_ = None
        self.log_wkbs_ = None
        self.khats_ = None

    def fit(self, X, cluster_map, n_b=50):
        """Estimates the optimal number of clusters (K) for a
        KMeans model trained on X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        cluster_map : dict
            A dictionary mapping each cluster size (K) to the KMeans
            model fitted to X with K clusters

        n_b : int
            The number of reference data sets to generate


        Returns
        -------
        self
        """
        self._reset()
        mins, maxs = GapStatistic.bounding_box(X)
        n_clusters = len(cluster_map)

        # Dispersion for real distribution
        log_wks = np.zeros(n_clusters)
        log_wkbs = np.zeros(n_clusters)
        sk = np.zeros(n_clusters)
        for indk, (K, model) in enumerate(sorted(cluster_map.items())):

            # Computes Wk: the within-dispersion of each cluster size (k)
            log_wks[indk] = np.log(model.cluster_inertia_ / (2.0 * K))

            # Create B reference datasets
            log_bwkbs = np.zeros(n_b)
            for i in range(n_b):
                Xb = np.empty_like(X)
                for j in range(X.shape[1]):
                    Xb[:, j] = np.random.uniform(mins[j], maxs[j], size=X.shape[0])
                Xb_model = KMeans().fit(Xb, K)
                log_bwkbs[i] = np.log(Xb_model.cluster_inertia_ / (2.0 * K))
            log_wkbs[indk] = sum(log_bwkbs) / n_b
            sk[indk] = np.sqrt(sum((log_bwkbs - log_wkbs[indk]) ** 2) / n_b)
        sk = sk * np.sqrt(1 + 1.0 / n_b)

        khats = np.zeros(n_clusters)
        gaps = log_wkbs - log_wks
        gsks = gaps - sk
        khats[1:] = gaps[0:-1] - gsks[1:]
        self.clusters_ = np.array(sorted(cluster_map.keys()))

        for i in range(1, n_clusters):
            if gaps[i - 1] >= gsks[i]:
                self.optimal_num_clusters_ = self.clusters_[i - 1]
                break

        if self.optimal_num_clusters_ is None:
            LOGGER.info("GapStatistic did not find the optimal k; "
                        "using the last (maximum) k instead")
            self.optimal_num_clusters_ = self.clusters_[-1]

        self.log_wks_ = log_wks
        self.log_wkbs_ = log_wkbs
        self.khats_ = khats
        return self

    @staticmethod
    def bounding_box(X):
        """Computes the box that tightly bounds X

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.


        Returns
        -------
        The mins and maxs that make up the bounding box
        """
        mins = np.min(X, axis=0)
        maxs = np.max(X, axis=0)
        return mins, maxs

    @staticmethod
    def Wk(X, mu, cluster_labels):
        """Computes the within-dispersion of each cluster size (k)

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        mu : array-like, shape (n_clusters, n_features)
            Coordinates of cluster centers

        cluster_labels: array-like, shape (n_samples)
            Labels for each of the samples in X.


        Returns
        -------
        The within-dispersion of each cluster (K)
        """
        K = len(mu)
        return sum([np.linalg.norm(mu[i] - x) ** 2 / (2.0 * K)
                    for i in range(K)
                    for x in X[cluster_labels == i]])

    def save(self, savedir):
        """Saves the estimation results of the optimal # of clusters.

        Parameters
        ----------
        savedir : string
            Path to the directory to save the results in.
        """
        super(GapStatistic, self).save(savedir)

        # Plot the calculated gap
        gaps = self.log_wkbs_ - self.log_wks_
        fig = plt.figure()
        plt.plot(self.clusters_, gaps, '--o')
        plt.title("Gap vs. Number of Clusters")
        plt.xlabel("Number of clusters (K)")
        plt.ylabel("gap_K")
        fig.canvas.set_window_title(os.path.basename(savedir))
        plt.savefig(os.path.join(savedir, self.name_ + ".pdf"), bbox_inches="tight")
        plt.close()

        # Plot the gap statistic
        fig = plt.figure()
        plt.bar(self.clusters_, self.khats_)
        plt.title("Gap Statistic vs. Number of Clusters")
        plt.xlabel("Number of clusters (K)")
        plt.ylabel("gap(K)-(gap(K+1)-s(K+1))")
        fig.canvas.set_window_title(os.path.basename(savedir))
        plt.savefig(os.path.join(savedir, self.name_ + "_final.pdf"),
                    bbox_inches="tight")
        plt.close()


class DetK(KSelection):
    """DetK:

    Approximates the optimal number of clusters (K).


    References
    ----------
    https://www.ee.columbia.edu/~dpwe/papers/PhamDN05-kmeans.pdf


    Attributes
    ----------
    optimal_num_clusters_ : int
        An estimation of the optimal number of clusters K for
        KMeans models fit to X

    clusters_ : array, [n_clusters]
        The sizes of the clusters

    name_ : string
        The name of this technique

    fs_ : array, [n_clusters]
        The computed evaluation functions F(K) for each cluster size K
    """

    NAME_ = "det-k"

    def __init__(self):
        super(DetK, self).__init__()
        self.fs_ = None

    @property
    def name_(self):
        return DetK.NAME_

    def _reset(self):
        """Resets all attributes (erases the model)"""
        super(DetK, self)._reset()
        self.fs_ = None

    def fit(self, X, cluster_map):
        """Estimates the optimal number of clusters (K) for a
        KMeans model trained on X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        cluster_map : dict
            A dictionary mapping each cluster size (K) to the KMeans
            model fitted to X with K clusters


        Returns
        -------
        self
        """
        self._reset()
        n_clusters = len(cluster_map)
        nd = X.shape[1]
        fs = np.empty(n_clusters)
        sks = np.empty(n_clusters)
        alpha = {}
        # K from 1 to maximum_cluster_
        for i, (K, model) \
                in enumerate(sorted(cluster_map.items())):
            # Compute alpha(K, nd) (i.e. alpha[K])
            if K == 2:
                alpha[K] = 1 - 3.0 / (4 * nd)
            elif K > 2:
                alpha[K] = alpha[K - 1] + (1 - alpha[K - 1]) / 6.0
            sks[i] = model.cluster_inertia_

            if K == 1:
                fs[i] = 1
            elif sks[i - 1] == 0:
                fs[i] = 1
            else:
                fs[i] = sks[i] / (alpha[K] * sks[i - 1])
        self.clusters_ = np.array(sorted(cluster_map.keys()))
        self.optimal_num_clusters_ = self.clusters_[np.argmin(fs)]
        self.fs_ = fs
        return self

    def save(self, savedir):
        """Saves the estimation results of the optimal # of clusters.

        Parameters
        ----------
        savedir : string
            Path to the directory to save the results in.
        """
        super(DetK, self).save(savedir)

        # Plot the evaluation function
        fig = plt.figure()
        plt.plot(self.clusters_, self.fs_, '--o')
        plt.xlabel("Number of clusters (K)")
        plt.ylabel("Evaluation function (F_k)")
        plt.title("Evaluation Function vs. Number of Clusters")
        fig.canvas.set_window_title(os.path.basename(savedir))
        savepath = os.path.join(savedir, self.name_ + "_eval_function.pdf")
        plt.savefig(savepath, bbox_inches="tight")
        plt.close()


class Silhouette(KSelection):
    """Silhouette:

    Approximates the optimal number of clusters (K).


    References
    ----------
    http://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html


    Attributes
    ----------
    optimal_num_clusters_ : int
        An estimation of the optimal number of clusters K for
        KMeans models fit to X

    clusters_ : array, [n_clusters]
        The sizes of the clusters

    name_ : string
        The name of this technique

    scores_ : array, [n_clusters]
        The mean Silhouette Coefficient for each cluster size K
    """

    # Short for Silhouette score
    NAME_ = "s-score"

    def __init__(self):
        super(Silhouette, self).__init__()
        self.scores_ = None

    @property
    def name_(self):
        return Silhouette.NAME_

    def _reset(self):
        """Resets all attributes (erases the model)"""
        super(Silhouette, self)._reset()
        self.scores_ = None

    def fit(self, X, cluster_map):
        """Estimates the optimal number of clusters (K) for a
        KMeans model trained on X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        cluster_map : dict
            A dictionary mapping each cluster size (K) to the KMeans
            model fitted to X with K clusters

        Returns
        -------
        self
        """
        self._reset()
        n_clusters = len(cluster_map)
        # scores = np.empty(n_clusters)
        scores = np.zeros(n_clusters)
        for i, (K, model) \
                in enumerate(sorted(cluster_map.items())):
            if K <= 1:  # silhouette_score requires K >= 2
                continue
            scores[i] = silhouette_score(X, model.cluster_labels_)

        self.clusters_ = np.array(sorted(cluster_map.keys()))
        self.optimal_num_clusters_ = self.clusters_[np.argmax(scores)]
        self.scores_ = scores
        return self

    def save(self, savedir):
        """Saves the estimation results of the optimal # of clusters.

        Parameters
        ----------
        savedir : string
            Path to the directory to save the results in.
        """
        super(Silhouette, self).save(savedir)

        # Plot the evaluation function
        fig = plt.figure()
        plt.plot(self.clusters_, self.scores_, '--o')
        plt.xlabel("Number of clusters (K)")
        plt.ylabel("Silhouette scores")
        plt.title("Silhouette Scores vs. Number of Clusters")
        fig.canvas.set_window_title(os.path.basename(savedir))
        savepath = os.path.join(savedir, self.name_ + "_eval_function.pdf")
        plt.savefig(savepath, bbox_inches="tight")
        plt.close()


def create_kselection_model(model_name):
    """Constructs the KSelection model object with the given name

    Parameters
    ----------
    model_name : string
        Name of the KSelection model.
        One of ['gap-statistic', 'det-k', 's-score']


    Returns
    -------
    The constructed model object
    """
    kselection_map = {
        DetK.NAME_: DetK,
        GapStatistic.NAME_: GapStatistic,
        Silhouette.NAME_: Silhouette
    }
    if model_name not in kselection_map:
        raise Exception("KSelection model {} not supported!".format(model_name))
    else:
        return kselection_map[model_name]()
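A minimal usage sketch of the module above (illustrative only, not part of the commit): fit one KMeans model per K over a range, then let one of the registered KSelection techniques estimate the best K. It assumes the package is importable as `analysis` and uses random data purely for shape.

import numpy as np
from analysis.cluster import KMeansClusters, create_kselection_model

X = np.random.rand(100, 5)  # 100 samples, 5 features

# Fit one KMeans model per K in [1, 10]
kmeans_models = KMeansClusters().fit(X, min_cluster=1, max_cluster=10)

# Estimate K with the "det-k" evaluation function; "gap-statistic"
# and "s-score" are the other names registered above
detk = create_kselection_model("det-k").fit(X, kmeans_models.cluster_map_)
print(detk.optimal_num_clusters_)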
server/analysis/constraints.py (new file, 115 lines)
@@ -0,0 +1,115 @@
#
# OtterTune - constraints.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Sep 8, 2016

@author: dvanaken
'''

import numpy as np


class ParamConstraintHelper(object):

    def __init__(self, scaler, encoder=None, binary_vars=None,
                 init_flip_prob=0.3, flip_prob_decay=0.5):
        if 'inverse_transform' not in dir(scaler):
            raise Exception("Scaler object must provide function inverse_transform(X)")
        if 'transform' not in dir(scaler):
            raise Exception("Scaler object must provide function transform(X)")
        self.scaler_ = scaler
        if encoder is not None and len(encoder.n_values) > 0:
            self.is_dummy_encoded_ = True
            self.encoder_ = encoder.encoder
        else:
            self.is_dummy_encoded_ = False
        self.binary_vars_ = binary_vars
        self.init_flip_prob_ = init_flip_prob
        self.flip_prob_decay_ = flip_prob_decay

    def apply_constraints(self, sample, scaled=True, rescale=True):
        conv_sample = self._handle_scaling(sample, scaled)

        if self.is_dummy_encoded_:
            # Apply categorical (i.e., enum var, >= 3 values) constraints
            n_values = self.encoder_.n_values_
            cat_start_indices = self.encoder_.feature_indices_
            for i, nvals in enumerate(n_values):
                start_idx = cat_start_indices[i]
                cvals = conv_sample[start_idx: start_idx + nvals]
                cvals = np.array(np.arange(nvals) == np.argmax(cvals), dtype=float)
                assert np.sum(cvals) == 1
                conv_sample[start_idx: start_idx + nvals] = cvals

        # Apply binary (0-1) constraints
        if self.binary_vars_ is not None:
            for i in self.binary_vars_:
                # Round to closest
                if conv_sample[i] >= 0.5:
                    conv_sample[i] = 1
                else:
                    conv_sample[i] = 0

        conv_sample = self._handle_rescaling(conv_sample, rescale)
        return conv_sample

    def _handle_scaling(self, sample, scaled):
        if scaled:
            if sample.ndim == 1:
                sample = sample.reshape(1, -1)
            sample = self.scaler_.inverse_transform(sample).ravel()
        else:
            sample = np.array(sample)
        return sample

    def _handle_rescaling(self, sample, rescale):
        if rescale:
            if sample.ndim == 1:
                sample = sample.reshape(1, -1)
            return self.scaler_.transform(sample).ravel()
        return sample

    def randomize_categorical_features(self, sample, scaled=True, rescale=True):
        # If there are no categorical features, this function is a no-op.
        if not self.is_dummy_encoded_:
            return sample
        n_values = self.encoder_.n_values_
        cat_start_indices = self.encoder_.feature_indices_
        n_cat_feats = len(n_values)

        conv_sample = self._handle_scaling(sample, scaled)
        flips = np.zeros((n_cat_feats,), dtype=bool)

        # Always flip at least one categorical feature
        flips[0] = True

        # Flip the rest with decreasing probability
        p = self.init_flip_prob_
        for i in range(1, n_cat_feats):
            if np.random.rand() <= p:
                flips[i] = True
            p *= self.flip_prob_decay_

        flip_shuffle_indices = np.random.choice(np.arange(n_cat_feats),
                                                n_cat_feats,
                                                replace=False)
        flips = flips[flip_shuffle_indices]

        for i, nvals in enumerate(n_values):
            if flips[i]:
                start_idx = cat_start_indices[i]
                current_val = conv_sample[start_idx: start_idx + nvals]
                assert np.all(np.logical_or(current_val == 0, current_val == 1)), \
                    "categorical {0}: value not 0/1: {1}".format(i, current_val)
                choices = np.arange(nvals)[current_val != 1]
                assert choices.size == nvals - 1
                r = np.zeros(nvals)
                r[np.random.choice(choices)] = 1
                assert np.sum(r) == 1
                conv_sample[start_idx: start_idx + nvals] = r

        conv_sample = self._handle_rescaling(conv_sample, rescale)
        return conv_sample
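A small sketch of how ParamConstraintHelper is meant to be driven (illustrative only, not part of the commit). The only hard requirement on the scaler is that it exposes transform/inverse_transform; sklearn's StandardScaler and the feature index below are assumptions for the example.

import numpy as np
from sklearn.preprocessing import StandardScaler
from analysis.constraints import ParamConstraintHelper

X = np.random.rand(50, 4)
scaler = StandardScaler().fit(X)

# Treat feature 3 as a binary (0/1) knob; no dummy-encoded categoricals
helper = ParamConstraintHelper(scaler, encoder=None, binary_vars=[3])

sample = scaler.transform(X[:1]).ravel()   # a sample in scaled space
valid = helper.apply_constraints(sample)   # feature 3 rounded to 0/1 in the
                                           # original space, then re-scaled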
server/analysis/factor_analysis.py (new file, 111 lines)
@@ -0,0 +1,111 @@
#
# OtterTune - factor_analysis.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Jul 4, 2016

@author: dvanaken
'''

import numpy as np
from sklearn.decomposition import FactorAnalysis as SklearnFactorAnalysis

from .base import ModelBase


class FactorAnalysis(ModelBase):
    """FactorAnalysis (FA):

    Fits an Sklearn FactorAnalysis model to X.


    See also
    --------
    http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.FactorAnalysis.html


    Attributes
    ----------
    model_ : sklearn.decomposition.FactorAnalysis
        The fitted FA model

    components_ : array, [n_components, n_features]
        Components (i.e., factors) with maximum variance

    feature_labels_ : array, [n_features]
        Labels for each of the features in X

    total_variance_ : float
        The total amount of variance explained by the components

    pvars_ : array, [n_components]
        The percentage of the variance explained by each component

    pvars_noise_ : array, [n_components]
        The percentage of the variance explained by each component also
        accounting for noise
    """

    def __init__(self):
        self.model_ = None
        self.components_ = None
        self.feature_labels_ = None
        self.total_variance_ = None
        self.pvars_ = None
        self.pvars_noise_ = None

    def _reset(self):
        """Resets all attributes (erases the model)"""
        self.model_ = None
        self.components_ = None
        self.feature_labels_ = None
        self.total_variance_ = None
        self.pvars_ = None
        self.pvars_noise_ = None

    def fit(self, X, feature_labels=None, n_components=None, estimator_params=None):
        """Fits an Sklearn FA model to X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        feature_labels : array-like, shape (n_features), optional
            Labels for each of the features in X.

        n_components : int, optional
            The number of components to keep.

        estimator_params : dict, optional
            The parameters to pass to Sklearn's FA estimators.


        Returns
        -------
        self
        """
        self._reset()
        if feature_labels is None:
            feature_labels = ["feature_{}".format(i) for i in range(X.shape[1])]
        self.feature_labels_ = feature_labels
        if n_components is not None:
            model = SklearnFactorAnalysis(n_components=n_components)
        else:
            model = SklearnFactorAnalysis()
        self.model_ = model
        if estimator_params is not None:
            # Update Sklearn estimator params
            assert isinstance(estimator_params, dict)
            self.model_.set_params(**estimator_params)
        self.model_.fit(X)

        # Remove zero-valued components (n_components x n_features)
        components_mask = np.sum(self.model_.components_ != 0.0, axis=1) > 0.0
        self.components_ = self.model_.components_[components_mask]

        # Compute the % variance explained (with/without noise)
        c2 = np.sum(self.components_ ** 2, axis=1)
        self.total_variance_ = np.sum(c2)
        self.pvars_ = 100 * c2 / self.total_variance_
        self.pvars_noise_ = 100 * c2 / (self.total_variance_ +
                                        np.sum(self.model_.noise_variance_))
        return self
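Illustrative usage (not part of the commit): fit the FA wrapper and inspect the variance it explains. The data shapes are arbitrary and `analysis` is assumed to be the importable package name.

import numpy as np
from analysis.factor_analysis import FactorAnalysis

X = np.random.rand(200, 10)  # 200 samples, 10 features
fa = FactorAnalysis().fit(X, n_components=5)
print(fa.components_.shape)  # (n_nonzero_components, 10)
print(fa.total_variance_)
print(fa.pvars_)             # % variance explained per component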
server/analysis/gp.py (new file, 148 lines)
@@ -0,0 +1,148 @@
#
# OtterTune - gp.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Feb 18, 2018

@author: Bohan Zhang
'''
import numpy as np
from scipy.spatial.distance import cdist as ed
from scipy import special
from analysis.gp_tf import GPRResult


# numpy version of Gaussian Process Regression, not using Tensorflow
class GPRNP(object):

    def __init__(self, length_scale=1.0, magnitude=1.0, max_train_size=7000,
                 batch_size=3000, check_numerics=True, debug=False):
        assert np.isscalar(length_scale)
        assert np.isscalar(magnitude)
        assert length_scale > 0 and magnitude > 0
        self.length_scale = length_scale
        self.magnitude = magnitude
        self.max_train_size_ = max_train_size
        self.batch_size_ = batch_size
        self.check_numerics = check_numerics
        self.debug = debug
        self.X_train = None
        self.y_train = None
        self.K = None
        self.K_inv = None
        self.y_best = None

    def __repr__(self):
        rep = ""
        for k, v in sorted(self.__dict__.items()):
            rep += "{} = {}\n".format(k, v)
        return rep

    def __str__(self):
        return self.__repr__()

    def _reset(self):
        self.X_train = None
        self.y_train = None
        self.K = None
        self.K_inv = None
        self.y_best = None

    def check_X_y(self, X, y):
        from sklearn.utils.validation import check_X_y

        if X.shape[0] > self.max_train_size_:
            raise Exception("X_train size cannot exceed {} ({})"
                            .format(self.max_train_size_, X.shape[0]))
        return check_X_y(X, y, multi_output=True,
                         allow_nd=True, y_numeric=True,
                         estimator="GPRNP")

    def check_fitted(self):
        if self.X_train is None or self.y_train is None \
                or self.K is None:
            raise Exception("The model must be trained before making predictions!")

    @staticmethod
    def check_array(X):
        from sklearn.utils.validation import check_array
        return check_array(X, allow_nd=True, estimator="GPRNP")

    @staticmethod
    def check_output(X):
        finite_els = np.isfinite(X)
        if not np.all(finite_els):
            raise Exception("Input contains non-finite values: {}"
                            .format(X[~finite_els]))

    def fit(self, X_train, y_train, ridge=0.01):
        self._reset()
        X_train, y_train = self.check_X_y(X_train, y_train)
        if X_train.ndim != 2 or y_train.ndim != 2:
            raise Exception("X_train or y_train should have 2 dimensions! X_dim:{}, y_dim:{}"
                            .format(X_train.ndim, y_train.ndim))
        self.X_train = np.float32(X_train)
        self.y_train = np.float32(y_train)
        sample_size = self.X_train.shape[0]
        if np.isscalar(ridge):
            ridge = np.ones(sample_size) * ridge
        assert isinstance(ridge, np.ndarray)
        assert ridge.ndim == 1
        K = self.magnitude * np.exp(-ed(self.X_train, self.X_train) / self.length_scale) \
            + np.diag(ridge)
        K_inv = np.linalg.inv(K)
        self.K = K
        self.K_inv = K_inv
        self.y_best = np.min(y_train)
        return self

    def predict(self, X_test):
        self.check_fitted()
        if X_test.ndim != 2:
            raise Exception("X_test should have 2 dimensions! X_dim:{}"
                            .format(X_test.ndim))
        X_test = np.float32(GPRNP.check_array(X_test))
        test_size = X_test.shape[0]
        arr_offset = 0
        length_scale = self.length_scale
        yhats = np.zeros([test_size, 1])
        sigmas = np.zeros([test_size, 1])
        eips = np.zeros([test_size, 1])
        while arr_offset < test_size:
            if arr_offset + self.batch_size_ > test_size:
                end_offset = test_size
            else:
                end_offset = arr_offset + self.batch_size_
            xt_ = X_test[arr_offset:end_offset]
            K2 = self.magnitude * np.exp(-ed(self.X_train, xt_) / length_scale)
            K3 = self.magnitude * np.exp(-ed(xt_, xt_) / length_scale)
            K2_trans = np.transpose(K2)
            yhat = np.matmul(K2_trans, np.matmul(self.K_inv, self.y_train))
            sigma = np.sqrt(np.diag(K3 - np.matmul(K2_trans, np.matmul(self.K_inv, K2)))) \
                .reshape(xt_.shape[0], 1)
            u = (self.y_best - yhat) / sigma
            phi1 = 0.5 * special.erf(u / np.sqrt(2.0)) + 0.5
            phi2 = (1.0 / np.sqrt(2.0 * np.pi)) * np.exp(np.square(u) * (-0.5))
            eip = sigma * (u * phi1 + phi2)
            yhats[arr_offset:end_offset] = yhat
            sigmas[arr_offset:end_offset] = sigma
            eips[arr_offset:end_offset] = eip
            arr_offset = end_offset
        GPRNP.check_output(yhats)
        GPRNP.check_output(sigmas)
        return GPRResult(yhats, sigmas)

    def get_params(self, deep=True):
        return {"length_scale": self.length_scale,
                "magnitude": self.magnitude,
                "X_train": self.X_train,
                "y_train": self.y_train,
                "K": self.K,
                "K_inv": self.K_inv}

    def set_params(self, **parameters):
        for param, val in list(parameters.items()):
            setattr(self, param, val)
        return self
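Illustrative usage of the NumPy GPR above (not part of the commit): both X and y must be 2-dimensional, and the returned GPRResult carries the posterior means and standard deviations. Shapes and hyperparameter values are arbitrary.

import numpy as np
from analysis.gp import GPRNP

X_train = np.random.rand(100, 3)
y_train = np.random.rand(100, 1)  # must be 2-dimensional
X_test = np.random.rand(10, 3)

model = GPRNP(length_scale=1.0, magnitude=1.0).fit(X_train, y_train, ridge=0.01)
res = model.predict(X_test)
print(res.ypreds.shape, res.sigmas.shape)  # (10, 1) (10, 1)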
server/analysis/gp_tf.py (new file, 710 lines)
@@ -0,0 +1,710 @@
|
||||
#
|
||||
# OtterTune - gp_tf.py
|
||||
#
|
||||
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
|
||||
#
|
||||
'''
|
||||
Created on Aug 18, 2016
|
||||
|
||||
@author: Bohan Zhang, Dana Van Aken
|
||||
'''
|
||||
|
||||
import gc
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
from .util import get_analysis_logger
|
||||
|
||||
LOG = get_analysis_logger(__name__)
|
||||
|
||||
|
||||
class GPRResult(object):
|
||||
|
||||
def __init__(self, ypreds=None, sigmas=None):
|
||||
self.ypreds = ypreds
|
||||
self.sigmas = sigmas
|
||||
|
||||
|
||||
class GPRGDResult(GPRResult):
|
||||
|
||||
def __init__(self, ypreds=None, sigmas=None,
|
||||
minl=None, minl_conf=None):
|
||||
super(GPRGDResult, self).__init__(ypreds, sigmas)
|
||||
self.minl = minl
|
||||
self.minl_conf = minl_conf
|
||||
|
||||
|
||||
class GPR(object):
|
||||
|
||||
def __init__(self, length_scale=1.0, magnitude=1.0, max_train_size=7000,
|
||||
batch_size=3000, num_threads=4, check_numerics=True, debug=False):
|
||||
assert np.isscalar(length_scale)
|
||||
assert np.isscalar(magnitude)
|
||||
assert length_scale > 0 and magnitude > 0
|
||||
self.length_scale = length_scale
|
||||
self.magnitude = magnitude
|
||||
self.max_train_size_ = max_train_size
|
||||
self.batch_size_ = batch_size
|
||||
self.num_threads_ = num_threads
|
||||
self.check_numerics = check_numerics
|
||||
self.debug = debug
|
||||
self.X_train = None
|
||||
self.y_train = None
|
||||
self.xy_ = None
|
||||
self.K = None
|
||||
self.K_inv = None
|
||||
self.graph = None
|
||||
self.vars = None
|
||||
self.ops = None
|
||||
|
||||
def build_graph(self):
|
||||
self.vars = {}
|
||||
self.ops = {}
|
||||
self.graph = tf.Graph()
|
||||
with self.graph.as_default():
|
||||
mag_const = tf.constant(self.magnitude,
|
||||
dtype=np.float32,
|
||||
name='magnitude')
|
||||
ls_const = tf.constant(self.length_scale,
|
||||
dtype=np.float32,
|
||||
name='length_scale')
|
||||
|
||||
# Nodes for distance computation
|
||||
v1 = tf.placeholder(tf.float32, name="v1")
|
||||
v2 = tf.placeholder(tf.float32, name="v2")
|
||||
dist_op = tf.sqrt(tf.reduce_sum(tf.pow(tf.subtract(v1, v2), 2), 1), name='dist_op')
|
||||
if self.check_numerics:
|
||||
dist_op = tf.check_numerics(dist_op, "dist_op: ")
|
||||
|
||||
self.vars['v1_h'] = v1
|
||||
self.vars['v2_h'] = v2
|
||||
self.ops['dist_op'] = dist_op
|
||||
|
||||
# Nodes for kernel computation
|
||||
X_dists = tf.placeholder(tf.float32, name='X_dists')
|
||||
ridge_ph = tf.placeholder(tf.float32, name='ridge')
|
||||
K_op = mag_const * tf.exp(-X_dists / ls_const)
|
||||
if self.check_numerics:
|
||||
K_op = tf.check_numerics(K_op, "K_op: ")
|
||||
K_ridge_op = K_op + tf.diag(ridge_ph)
|
||||
if self.check_numerics:
|
||||
K_ridge_op = tf.check_numerics(K_ridge_op, "K_ridge_op: ")
|
||||
|
||||
self.vars['X_dists_h'] = X_dists
|
||||
self.vars['ridge_h'] = ridge_ph
|
||||
self.ops['K_op'] = K_op
|
||||
self.ops['K_ridge_op'] = K_ridge_op
|
||||
|
||||
# Nodes for xy computation
|
||||
K = tf.placeholder(tf.float32, name='K')
|
||||
K_inv = tf.placeholder(tf.float32, name='K_inv')
|
||||
xy_ = tf.placeholder(tf.float32, name='xy_')
|
||||
yt_ = tf.placeholder(tf.float32, name='yt_')
|
||||
K_inv_op = tf.matrix_inverse(K)
|
||||
if self.check_numerics:
|
||||
K_inv_op = tf.check_numerics(K_inv_op, "K_inv: ")
|
||||
xy_op = tf.matmul(K_inv, yt_)
|
||||
if self.check_numerics:
|
||||
xy_op = tf.check_numerics(xy_op, "xy_: ")
|
||||
|
||||
self.vars['K_h'] = K
|
||||
self.vars['K_inv_h'] = K_inv
|
||||
self.vars['xy_h'] = xy_
|
||||
self.vars['yt_h'] = yt_
|
||||
self.ops['K_inv_op'] = K_inv_op
|
||||
self.ops['xy_op'] = xy_op
|
||||
|
||||
# Nodes for yhat/sigma computation
|
||||
K2 = tf.placeholder(tf.float32, name="K2")
|
||||
K3 = tf.placeholder(tf.float32, name="K3")
|
||||
yhat_ = tf.cast(tf.matmul(tf.transpose(K2), xy_), tf.float32)
|
||||
if self.check_numerics:
|
||||
yhat_ = tf.check_numerics(yhat_, "yhat_: ")
|
||||
sv1 = tf.matmul(tf.transpose(K2), tf.matmul(K_inv, K2))
|
||||
if self.check_numerics:
|
||||
sv1 = tf.check_numerics(sv1, "sv1: ")
|
||||
sig_val = tf.cast((tf.sqrt(tf.diag_part(K3 - sv1))), tf.float32)
|
||||
if self.check_numerics:
|
||||
sig_val = tf.check_numerics(sig_val, "sig_val: ")
|
||||
|
||||
self.vars['K2_h'] = K2
|
||||
self.vars['K3_h'] = K3
|
||||
self.ops['yhat_op'] = yhat_
|
||||
self.ops['sig_op'] = sig_val
|
||||
|
||||
# Compute y_best (min y)
|
||||
y_best_op = tf.cast(tf.reduce_min(yt_, 0, True), tf.float32)
|
||||
if self.check_numerics:
|
||||
y_best_op = tf.check_numerics(y_best_op, "y_best_op: ")
|
||||
self.ops['y_best_op'] = y_best_op
|
||||
|
||||
sigma = tf.placeholder(tf.float32, name='sigma')
|
||||
yhat = tf.placeholder(tf.float32, name='yhat')
|
||||
|
||||
self.vars['sigma_h'] = sigma
|
||||
self.vars['yhat_h'] = yhat
|
||||
|
||||
def __repr__(self):
|
||||
rep = ""
|
||||
for k, v in sorted(self.__dict__.items()):
|
||||
rep += "{} = {}\n".format(k, v)
|
||||
return rep
|
||||
|
||||
def __str__(self):
|
||||
return self.__repr__()
|
||||
|
||||
def check_X_y(self, X, y):
|
||||
from sklearn.utils.validation import check_X_y
|
||||
|
||||
if X.shape[0] > self.max_train_size_:
|
||||
raise Exception("X_train size cannot exceed {} ({})"
|
||||
.format(self.max_train_size_, X.shape[0]))
|
||||
return check_X_y(X, y, multi_output=True,
|
||||
allow_nd=True, y_numeric=True,
|
||||
estimator="GPR")
|
||||
|
||||
def check_fitted(self):
|
||||
if self.X_train is None or self.y_train is None \
|
||||
or self.xy_ is None or self.K is None:
|
||||
raise Exception("The model must be trained before making predictions!")
|
||||
|
||||
@staticmethod
|
||||
def check_array(X):
|
||||
from sklearn.utils.validation import check_array
|
||||
return check_array(X, allow_nd=True, estimator="GPR")
|
||||
|
||||
@staticmethod
|
||||
def check_output(X):
|
||||
finite_els = np.isfinite(X)
|
||||
if not np.all(finite_els):
|
||||
raise Exception("Input contains non-finite values: {}"
|
||||
.format(X[~finite_els]))
|
||||
|
||||
def fit(self, X_train, y_train, ridge=1.0):
|
||||
self._reset()
|
||||
X_train, y_train = self.check_X_y(X_train, y_train)
|
||||
self.X_train = np.float32(X_train)
|
||||
self.y_train = np.float32(y_train)
|
||||
sample_size = self.X_train.shape[0]
|
||||
|
||||
if np.isscalar(ridge):
|
||||
ridge = np.ones(sample_size) * ridge
|
||||
assert isinstance(ridge, np.ndarray)
|
||||
assert ridge.ndim == 1
|
||||
|
||||
X_dists = np.zeros((sample_size, sample_size), dtype=np.float32)
|
||||
with tf.Session(graph=self.graph,
|
||||
config=tf.ConfigProto(
|
||||
intra_op_parallelism_threads=self.num_threads_)) as sess:
|
||||
dist_op = self.ops['dist_op']
|
||||
v1, v2 = self.vars['v1_h'], self.vars['v2_h']
|
||||
for i in range(sample_size):
|
||||
X_dists[i] = sess.run(dist_op, feed_dict={v1: self.X_train[i], v2: self.X_train})
|
||||
|
||||
K_ridge_op = self.ops['K_ridge_op']
|
||||
X_dists_ph = self.vars['X_dists_h']
|
||||
ridge_ph = self.vars['ridge_h']
|
||||
|
||||
self.K = sess.run(K_ridge_op, feed_dict={X_dists_ph: X_dists, ridge_ph: ridge})
|
||||
|
||||
K_ph = self.vars['K_h']
|
||||
|
||||
K_inv_op = self.ops['K_inv_op']
|
||||
self.K_inv = sess.run(K_inv_op, feed_dict={K_ph: self.K})
|
||||
|
||||
xy_op = self.ops['xy_op']
|
||||
K_inv_ph = self.vars['K_inv_h']
|
||||
yt_ph = self.vars['yt_h']
|
||||
self.xy_ = sess.run(xy_op, feed_dict={K_inv_ph: self.K_inv,
|
||||
yt_ph: self.y_train})
|
||||
return self
|
||||
|
||||
def predict(self, X_test):
|
||||
self.check_fitted()
|
||||
X_test = np.float32(GPR.check_array(X_test))
|
||||
test_size = X_test.shape[0]
|
||||
sample_size = self.X_train.shape[0]
|
||||
|
||||
arr_offset = 0
|
||||
yhats = np.zeros([test_size, 1])
|
||||
sigmas = np.zeros([test_size, 1])
|
||||
with tf.Session(graph=self.graph,
|
||||
config=tf.ConfigProto(
|
||||
intra_op_parallelism_threads=self.num_threads_)) as sess:
|
||||
# Nodes for distance operation
|
||||
dist_op = self.ops['dist_op']
|
||||
v1 = self.vars['v1_h']
|
||||
v2 = self.vars['v2_h']
|
||||
|
||||
# Nodes for kernel computation
|
||||
K_op = self.ops['K_op']
|
||||
X_dists = self.vars['X_dists_h']
|
||||
|
||||
# Nodes to compute yhats/sigmas
|
||||
yhat_ = self.ops['yhat_op']
|
||||
K_inv_ph = self.vars['K_inv_h']
|
||||
K2 = self.vars['K2_h']
|
||||
K3 = self.vars['K3_h']
|
||||
xy_ph = self.vars['xy_h']
|
||||
|
||||
while arr_offset < test_size:
|
||||
if arr_offset + self.batch_size_ > test_size:
|
||||
end_offset = test_size
|
||||
else:
|
||||
end_offset = arr_offset + self.batch_size_
|
||||
|
||||
X_test_batch = X_test[arr_offset:end_offset]
|
||||
batch_len = end_offset - arr_offset
|
||||
|
||||
dists1 = np.zeros([sample_size, batch_len])
|
||||
for i in range(sample_size):
|
||||
dists1[i] = sess.run(dist_op, feed_dict={v1: self.X_train[i],
|
||||
v2: X_test_batch})
|
||||
|
||||
sig_val = self.ops['sig_op']
|
||||
K2_ = sess.run(K_op, feed_dict={X_dists: dists1})
|
||||
yhat = sess.run(yhat_, feed_dict={K2: K2_, xy_ph: self.xy_})
|
||||
dists2 = np.zeros([batch_len, batch_len])
|
||||
for i in range(batch_len):
|
||||
dists2[i] = sess.run(dist_op, feed_dict={v1: X_test_batch[i], v2: X_test_batch})
|
||||
K3_ = sess.run(K_op, feed_dict={X_dists: dists2})
|
||||
|
||||
sigma = np.zeros([1, batch_len], np.float32)
|
||||
sigma[0] = sess.run(sig_val, feed_dict={K_inv_ph: self.K_inv, K2: K2_, K3: K3_})
|
||||
sigma = np.transpose(sigma)
|
||||
yhats[arr_offset: end_offset] = yhat
|
||||
sigmas[arr_offset: end_offset] = sigma
|
||||
arr_offset = end_offset
|
||||
GPR.check_output(yhats)
|
||||
GPR.check_output(sigmas)
|
||||
return GPRResult(yhats, sigmas)
|
||||
|
||||
def get_params(self, deep=True):
|
||||
return {"length_scale": self.length_scale,
|
||||
"magnitude": self.magnitude,
|
||||
"X_train": self.X_train,
|
||||
"y_train": self.y_train,
|
||||
"xy_": self.xy_,
|
||||
"K": self.K,
|
||||
"K_inv": self.K_inv}
|
||||
|
||||
def set_params(self, **parameters):
|
||||
for param, val in list(parameters.items()):
|
||||
setattr(self, param, val)
|
||||
return self
|
||||
|
||||
def _reset(self):
|
||||
self.X_train = None
|
||||
self.y_train = None
|
||||
self.xy_ = None
|
||||
self.K = None
|
||||
self.K_inv = None
|
||||
self.graph = None
|
||||
self.build_graph()
|
||||
gc.collect()
|
||||
|
||||
|
||||
class GPRGD(GPR):
|
||||
|
||||
GP_BETA_UCB = "UCB"
|
||||
GP_BETA_CONST = "CONST"
|
||||
|
||||
def __init__(self,
|
||||
length_scale=1.0,
|
||||
magnitude=1.0,
|
||||
max_train_size=7000,
|
||||
batch_size=3000,
|
||||
num_threads=4,
|
||||
learning_rate=0.01,
|
||||
epsilon=1e-6,
|
||||
max_iter=100,
|
||||
sigma_multiplier=3.0,
|
||||
mu_multiplier=1.0):
|
||||
super(GPRGD, self).__init__(length_scale=length_scale,
|
||||
magnitude=magnitude,
|
||||
max_train_size=max_train_size,
|
||||
batch_size=batch_size,
|
||||
num_threads=num_threads)
|
||||
self.learning_rate = learning_rate
|
||||
self.epsilon = epsilon
|
||||
self.max_iter = max_iter
|
||||
self.sigma_multiplier = sigma_multiplier
|
||||
self.mu_multiplier = mu_multiplier
|
||||
self.X_min = None
|
||||
self.X_max = None
|
||||
|
||||
def fit(self, X_train, y_train, X_min, X_max, ridge): # pylint: disable=arguments-differ
|
||||
super(GPRGD, self).fit(X_train, y_train, ridge)
|
||||
self.X_min = X_min
|
||||
self.X_max = X_max
|
||||
|
||||
with tf.Session(graph=self.graph,
|
||||
config=tf.ConfigProto(
|
||||
intra_op_parallelism_threads=self.num_threads_)) as sess:
|
||||
xt_ = tf.Variable(self.X_train[0], tf.float32)
|
||||
xt_ph = tf.placeholder(tf.float32)
|
||||
xt_assign_op = xt_.assign(xt_ph)
|
||||
init = tf.global_variables_initializer()
|
||||
sess.run(init)
|
||||
K2_mat = tf.transpose(tf.expand_dims(tf.sqrt(tf.reduce_sum(tf.pow(
|
||||
tf.subtract(xt_, self.X_train), 2), 1)), 0))
|
||||
if self.check_numerics is True:
|
||||
K2_mat = tf.check_numerics(K2_mat, "K2_mat: ")
|
||||
K2__ = tf.cast(self.magnitude * tf.exp(-K2_mat / self.length_scale), tf.float32)
|
||||
if self.check_numerics is True:
|
||||
K2__ = tf.check_numerics(K2__, "K2__: ")
|
||||
yhat_gd = tf.cast(tf.matmul(tf.transpose(K2__), self.xy_), tf.float32)
|
||||
if self.check_numerics is True:
|
||||
yhat_gd = tf.check_numerics(yhat_gd, message="yhat: ")
|
||||
sig_val = tf.cast((tf.sqrt(self.magnitude - tf.matmul(
|
||||
tf.transpose(K2__), tf.matmul(self.K_inv, K2__)))), tf.float32)
|
||||
if self.check_numerics is True:
|
||||
sig_val = tf.check_numerics(sig_val, message="sigma: ")
|
||||
LOG.debug("\nyhat_gd : %s", str(sess.run(yhat_gd)))
|
||||
LOG.debug("\nsig_val : %s", str(sess.run(sig_val)))
|
||||
|
||||
loss = tf.squeeze(tf.subtract(self.mu_multiplier * yhat_gd,
|
||||
self.sigma_multiplier * sig_val))
|
||||
if self.check_numerics is True:
|
||||
loss = tf.check_numerics(loss, "loss: ")
|
||||
optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate,
|
||||
epsilon=self.epsilon)
|
||||
# optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate)
|
||||
train = optimizer.minimize(loss)
|
||||
|
||||
self.vars['xt_'] = xt_
|
||||
self.vars['xt_ph'] = xt_ph
|
||||
self.ops['xt_assign_op'] = xt_assign_op
|
||||
self.ops['yhat_gd'] = yhat_gd
|
||||
self.ops['sig_val2'] = sig_val
|
||||
self.ops['loss_op'] = loss
|
||||
self.ops['train_op'] = train
|
||||
return self
|
||||
|
||||
def predict(self, X_test, constraint_helper=None, # pylint: disable=arguments-differ
|
||||
categorical_feature_method='hillclimbing',
|
||||
categorical_feature_steps=3):
|
||||
self.check_fitted()
|
||||
X_test = np.float32(GPR.check_array(X_test))
|
||||
test_size = X_test.shape[0]
|
||||
nfeats = self.X_train.shape[1]
|
||||
|
||||
arr_offset = 0
|
||||
yhats = np.zeros([test_size, 1])
|
||||
sigmas = np.zeros([test_size, 1])
|
||||
minls = np.zeros([test_size, 1])
|
||||
minl_confs = np.zeros([test_size, nfeats])
|
||||
|
||||
with tf.Session(graph=self.graph,
|
||||
config=tf.ConfigProto(
|
||||
intra_op_parallelism_threads=self.num_threads_)) as sess:
|
||||
while arr_offset < test_size:
|
||||
if arr_offset + self.batch_size_ > test_size:
|
||||
end_offset = test_size
|
||||
else:
|
||||
end_offset = arr_offset + self.batch_size_
|
||||
|
||||
X_test_batch = X_test[arr_offset:end_offset]
|
||||
batch_len = end_offset - arr_offset
|
||||
|
||||
xt_ = self.vars['xt_']
|
||||
init = tf.global_variables_initializer()
|
||||
sess.run(init)
|
||||
|
||||
sig_val = self.ops['sig_val2']
|
||||
yhat_gd = self.ops['yhat_gd']
|
||||
loss = self.ops['loss_op']
|
||||
train = self.ops['train_op']
|
||||
|
||||
xt_ph = self.vars['xt_ph']
|
||||
assign_op = self.ops['xt_assign_op']
|
||||
|
||||
yhat = np.empty((batch_len, 1))
|
||||
sigma = np.empty((batch_len, 1))
|
||||
minl = np.empty((batch_len, 1))
|
||||
minl_conf = np.empty((batch_len, nfeats))
|
||||
for i in range(batch_len):
|
||||
if self.debug is True:
|
||||
LOG.info("-------------------------------------------")
|
||||
yhats_it = np.empty((self.max_iter + 1,)) * np.nan
|
||||
sigmas_it = np.empty((self.max_iter + 1,)) * np.nan
|
||||
losses_it = np.empty((self.max_iter + 1,)) * np.nan
|
||||
confs_it = np.empty((self.max_iter + 1, nfeats)) * np.nan
|
||||
|
||||
sess.run(assign_op, feed_dict={xt_ph: X_test_batch[i]})
|
||||
step = 0
|
||||
for step in range(self.max_iter):
|
||||
if self.debug is True:
|
||||
LOG.info("Batch %d, iter %d:", i, step)
|
||||
yhats_it[step] = sess.run(yhat_gd)[0][0]
|
||||
sigmas_it[step] = sess.run(sig_val)[0][0]
|
||||
losses_it[step] = sess.run(loss)
|
||||
confs_it[step] = sess.run(xt_)
|
||||
if self.debug is True:
|
||||
LOG.info(" yhat: %s", str(yhats_it[step]))
|
||||
LOG.info(" sigma: %s", str(sigmas_it[step]))
|
||||
LOG.info(" loss: %s", str(losses_it[step]))
|
||||
LOG.info(" conf: %s", str(confs_it[step]))
|
||||
sess.run(train)
|
||||
# constraint Projected Gradient Descent
|
||||
xt = sess.run(xt_)
|
||||
xt_valid = np.minimum(xt, self.X_max)
|
||||
xt_valid = np.maximum(xt_valid, self.X_min)
|
||||
sess.run(assign_op, feed_dict={xt_ph: xt_valid})
|
||||
if constraint_helper is not None:
|
||||
xt_valid = constraint_helper.apply_constraints(sess.run(xt_))
|
||||
sess.run(assign_op, feed_dict={xt_ph: xt_valid})
|
||||
if categorical_feature_method == 'hillclimbing':
|
||||
if step % categorical_feature_steps == 0:
|
||||
current_xt = sess.run(xt_)
|
||||
current_loss = sess.run(loss)
|
||||
new_xt = \
|
||||
constraint_helper.randomize_categorical_features(
|
||||
current_xt)
|
||||
sess.run(assign_op, feed_dict={xt_ph: new_xt})
|
||||
new_loss = sess.run(loss)
|
||||
if current_loss < new_loss:
|
||||
sess.run(assign_op, feed_dict={xt_ph: new_xt})
|
||||
else:
|
||||
raise Exception("Unknown categorial feature method: {}".format(
|
||||
categorical_feature_method))
|
||||
if step == self.max_iter - 1:
|
||||
# Record results from final iteration
|
||||
yhats_it[-1] = sess.run(yhat_gd)[0][0]
|
||||
sigmas_it[-1] = sess.run(sig_val)[0][0]
|
||||
losses_it[-1] = sess.run(loss)
|
||||
confs_it[-1] = sess.run(xt_)
|
||||
assert np.all(np.isfinite(yhats_it))
|
||||
assert np.all(np.isfinite(sigmas_it))
|
||||
assert np.all(np.isfinite(losses_it))
|
||||
assert np.all(np.isfinite(confs_it))
|
||||
|
||||
# Store info for conf with min loss from all iters
|
||||
if np.all(~np.isfinite(losses_it)):
|
||||
min_loss_idx = 0
|
||||
else:
|
||||
min_loss_idx = np.nanargmin(losses_it)
|
||||
yhat[i] = yhats_it[min_loss_idx]
|
||||
sigma[i] = sigmas_it[min_loss_idx]
|
||||
minl[i] = losses_it[min_loss_idx]
|
||||
minl_conf[i] = confs_it[min_loss_idx]
|
||||
|
||||
minls[arr_offset:end_offset] = minl
|
||||
minl_confs[arr_offset:end_offset] = minl_conf
|
||||
yhats[arr_offset:end_offset] = yhat
|
||||
sigmas[arr_offset:end_offset] = sigma
|
||||
arr_offset = end_offset
|
||||
|
||||
GPR.check_output(yhats)
|
||||
GPR.check_output(sigmas)
|
||||
GPR.check_output(minls)
|
||||
GPR.check_output(minl_confs)
|
||||
|
||||
return GPRGDResult(yhats, sigmas, minls, minl_confs)
|
||||
|
||||
@staticmethod
|
||||
def calculate_sigma_multiplier(t, ndim, bound=0.1):
|
||||
assert t > 0
|
||||
assert ndim > 0
|
||||
assert bound > 0 and bound <= 1
|
||||
beta = 2 * np.log(ndim * (t**2) * (np.pi**2) / 6 * bound)
|
||||
if beta > 0:
|
||||
beta = np.sqrt(beta)
|
||||
else:
|
||||
beta = 1
|
||||
return beta
|
||||
|
||||
|
||||
# def gp_tf(X_train, y_train, X_test, ridge, length_scale, magnitude, batch_size=3000):
#     with tf.Graph().as_default():
#         y_best = tf.cast(tf.reduce_min(y_train, 0, True), tf.float32)
#         sample_size = X_train.shape[0]
#         train_size = X_test.shape[0]
#         arr_offset = 0
#         yhats = np.zeros([train_size, 1])
#         sigmas = np.zeros([train_size, 1])
#         eips = np.zeros([train_size, 1])
#         X_train = np.float32(X_train)
#         y_train = np.float32(y_train)
#         X_test = np.float32(X_test)
#         ridge = np.float32(ridge)
#
#         v1 = tf.placeholder(tf.float32, name="v1")
#         v2 = tf.placeholder(tf.float32, name="v2")
#         dist_op = tf.sqrt(tf.reduce_sum(tf.pow(tf.subtract(v1, v2), 2), 1))
#         try:
#             sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))
#
#             dists = np.zeros([sample_size, sample_size])
#             for i in range(sample_size):
#                 dists[i] = sess.run(dist_op, feed_dict={v1: X_train[i], v2: X_train})
#
#             dists = tf.cast(dists, tf.float32)
#             K = magnitude * tf.exp(-dists / length_scale) + tf.diag(ridge)
#
#             K2 = tf.placeholder(tf.float32, name="K2")
#             K3 = tf.placeholder(tf.float32, name="K3")
#
#             x = tf.matmul(tf.matrix_inverse(K), y_train)
#             yhat_ = tf.cast(tf.matmul(tf.transpose(K2), x), tf.float32)
#             sig_val = tf.cast((tf.sqrt(tf.diag_part(K3 - tf.matmul(tf.transpose(K2),
#                                                                    tf.matmul(tf.matrix_inverse(K),
#                                                                              K2))))),
#                               tf.float32)
#
#             u = tf.placeholder(tf.float32, name="u")
#             phi1 = 0.5 * tf.erf(u / np.sqrt(2.0)) + 0.5
#             phi2 = (1.0 / np.sqrt(2.0 * np.pi)) * tf.exp(tf.square(u) * (-0.5))
#             eip = (tf.multiply(u, phi1) + phi2)
#
#             while arr_offset < train_size:
#                 if arr_offset + batch_size > train_size:
#                     end_offset = train_size
#                 else:
#                     end_offset = arr_offset + batch_size
#
#                 xt_ = X_test[arr_offset:end_offset]
#                 batch_len = end_offset - arr_offset
#
#                 dists = np.zeros([sample_size, batch_len])
#                 for i in range(sample_size):
#                     dists[i] = sess.run(dist_op, feed_dict={v1: X_train[i], v2: xt_})
#
#                 K2_ = magnitude * tf.exp(-dists / length_scale)
#                 K2_ = sess.run(K2_)
#
#                 dists = np.zeros([batch_len, batch_len])
#                 for i in range(batch_len):
#                     dists[i] = sess.run(dist_op, feed_dict={v1: xt_[i], v2: xt_})
#                 K3_ = magnitude * tf.exp(-dists / length_scale)
#                 K3_ = sess.run(K3_)
#
#                 yhat = sess.run(yhat_, feed_dict={K2: K2_})
#
#                 sigma = np.zeros([1, batch_len], np.float32)
#                 sigma[0] = (sess.run(sig_val, feed_dict={K2: K2_, K3: K3_}))
#                 sigma = np.transpose(sigma)
#
#                 u_ = tf.cast(tf.div(tf.subtract(y_best, yhat), sigma), tf.float32)
#                 u_ = sess.run(u_)
#                 eip_p = sess.run(eip, feed_dict={u: u_})
#                 eip_ = tf.multiply(sigma, eip_p)
#                 yhats[arr_offset:end_offset] = yhat
#                 sigmas[arr_offset:end_offset] = sigma
#                 eips[arr_offset:end_offset] = sess.run(eip_)
#                 arr_offset = end_offset
#
#         finally:
#             sess.close()
#
#         return yhats, sigmas, eips

def euclidean_mat(X, y, sess):
    # Pairwise Euclidean distance matrix between the rows of X and y,
    # computed one row at a time with TensorFlow ops.
    x_n = X.shape[0]
    y_n = y.shape[0]
    z = np.zeros([x_n, y_n])
    for i in range(x_n):
        v1 = X[i]
        tmp = []
        for j in range(y_n):
            v2 = y[j]
            tmp.append(tf.sqrt(tf.reduce_sum(tf.pow(tf.subtract(v1, v2), 2))))
        z[i] = sess.run(tmp)
    return z

def gd_tf(xs, ys, xt, ridge, length_scale=1.0, magnitude=1.0, max_iter=50):
    LOG.debug("xs shape: %s", str(xs.shape))
    LOG.debug("ys shape: %s", str(ys.shape))
    LOG.debug("xt shape: %s", str(xt.shape))
    with tf.Graph().as_default():
        # y_best = tf.cast(tf.reduce_min(ys, 0, True), tf.float32)  # array
        # yhat_gd = tf.check_numerics(yhat_gd, message="yhat: ")
        sample_size = xs.shape[0]
        nfeats = xs.shape[1]
        test_size = xt.shape[0]
        # arr_offset = 0
        ini_size = xt.shape[0]

        yhats = np.zeros([test_size, 1])
        sigmas = np.zeros([test_size, 1])
        minl = np.zeros([test_size, 1])
        new_conf = np.zeros([test_size, nfeats])

        xs = np.float32(xs)
        ys = np.float32(ys)
        xt_ = tf.Variable(xt[0], dtype=tf.float32)

        sess = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=8))
        init = tf.global_variables_initializer()
        sess.run(init)

        ridge = np.float32(ridge)
        v1 = tf.placeholder(tf.float32, name="v1")
        v2 = tf.placeholder(tf.float32, name="v2")
        dist = tf.sqrt(tf.reduce_sum(tf.pow(tf.subtract(v1, v2), 2), 1))

        # Pairwise distances between the training points
        tmp = np.zeros([sample_size, sample_size])
        for i in range(sample_size):
            tmp[i] = sess.run(dist, feed_dict={v1: xs[i], v2: xs})

        tmp = tf.cast(tmp, tf.float32)
        K = magnitude * tf.exp(-tmp / length_scale) + tf.diag(ridge)
        LOG.debug("K shape: %s", str(sess.run(K).shape))

        K2_mat = tf.sqrt(tf.reduce_sum(tf.pow(tf.subtract(xt_, xs), 2), 1))
        K2_mat = tf.transpose(tf.expand_dims(K2_mat, 0))
        K2 = tf.cast(tf.exp(-K2_mat / length_scale), tf.float32)

        x = tf.matmul(tf.matrix_inverse(K), ys)
        x = sess.run(x)
        yhat_ = tf.cast(tf.matmul(tf.transpose(K2), x), tf.float32)
        sig_val = tf.cast((tf.sqrt(magnitude - tf.matmul(
            tf.transpose(K2), tf.matmul(tf.matrix_inverse(K), K2)))), tf.float32)

        LOG.debug('yhat shape: %s', str(sess.run(yhat_).shape))
        LOG.debug('sig_val shape: %s', str(sess.run(sig_val).shape))
        yhat_ = tf.check_numerics(yhat_, message='yhat: ')
        sig_val = tf.check_numerics(sig_val, message='sig_val: ')
        # Minimize the lower confidence bound: yhat - sigma
        loss = tf.squeeze(tf.subtract(yhat_, sig_val))
        loss = tf.check_numerics(loss, message='loss: ')
        # optimizer = tf.train.GradientDescentOptimizer(0.1)
        LOG.debug('loss: %s', str(sess.run(loss)))
        optimizer = tf.train.AdamOptimizer(0.1)
        train = optimizer.minimize(loss)
        init = tf.global_variables_initializer()
        sess.run(init)

        for i in range(ini_size):
            assign_op = xt_.assign(xt[i])
            sess.run(assign_op)
            for step in range(max_iter):
                LOG.debug('sample #: %d, iter #: %d, loss: %s', i, step, str(sess.run(loss)))
                sess.run(train)
            yhats[i] = sess.run(yhat_)[0][0]
            sigmas[i] = sess.run(sig_val)[0][0]
            minl[i] = sess.run(loss)
            new_conf[i] = sess.run(xt_)
        return yhats, sigmas, minl, new_conf


def main():
    pass


def create_random_matrices(n_samples=3000, n_feats=12, n_test=4444):
    X_train = np.random.rand(n_samples, n_feats)
    y_train = np.random.rand(n_samples, 1)
    X_test = np.random.rand(n_test, n_feats)

    length_scale = np.random.rand()
    magnitude = np.random.rand()
    ridge = np.ones(n_samples) * np.random.rand()

    return X_train, y_train, X_test, length_scale, magnitude, ridge


if __name__ == "__main__":
    main()
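A minimal driver sketch for the standalone gd_tf helper above (illustrative only; the sizes are trimmed here because the defaults would be slow):

    X_train, y_train, X_test, length_scale, magnitude, ridge = \
        create_random_matrices(n_samples=100, n_feats=4, n_test=2)
    yhats, sigmas, minl, new_conf = gd_tf(X_train, y_train, X_test, ridge,
                                          length_scale=length_scale,
                                          magnitude=magnitude, max_iter=10)
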
109
server/analysis/lasso.py
Normal file
@@ -0,0 +1,109 @@
#
# OtterTune - lasso.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Jul 8, 2016

@author: dvanaken
'''

import numpy as np
from sklearn.linear_model import lasso_path

from .base import ModelBase


class LassoPath(ModelBase):
    """Lasso:

    Computes the Lasso path using Sklearn's lasso_path method.


    See also
    --------
    http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.lasso_path.html


    Attributes
    ----------
    feature_labels_ : array, [n_features]
        Labels for each of the features in X.

    alphas_ : array, [n_alphas]
        The alphas along the path where models are computed. (These are
        the decreasing values of the penalty along the path.)

    coefs_ : array, [n_outputs, n_features, n_alphas]
        Coefficients along the path.

    rankings_ : array, [n_features]
        The average ranking of each feature across all target values.
    """

    def __init__(self):
        self.feature_labels_ = None
        self.alphas_ = None
        self.coefs_ = None
        self.rankings_ = None

    def _reset(self):
        """Resets all attributes (erases the model)"""
        self.feature_labels_ = None
        self.alphas_ = None
        self.coefs_ = None
        self.rankings_ = None

    def fit(self, X, y, feature_labels, estimator_params=None):
        """Computes the Lasso path using Sklearn's lasso_path method.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data (the independent variables).

        y : array-like, shape (n_samples, n_outputs)
            Training data (the output/target values).

        feature_labels : array-like, shape (n_features)
            Labels for each of the features in X.

        estimator_params : dict, optional
            The parameters to pass to Sklearn's Lasso estimator.


        Returns
        -------
        self
        """
        self._reset()
        if estimator_params is None:
            estimator_params = {}
        self.feature_labels_ = feature_labels

        alphas, coefs, _ = lasso_path(X, y, **estimator_params)
        self.alphas_ = alphas.copy()
        self.coefs_ = coefs.copy()

        # Rank the features in X by order of importance. This ranking is based
        # on how early a given feature enters the regression (the earlier a
        # feature enters the regression, the MORE important it is).
        feature_rankings = [[] for _ in range(X.shape[1])]
        for target_coef_paths in self.coefs_:
            for i, feature_path in enumerate(target_coef_paths):
                entrance_step = 1
                for val_at_step in feature_path:
                    if val_at_step == 0:
                        entrance_step += 1
                    else:
                        break
                feature_rankings[i].append(entrance_step)
        self.rankings_ = np.array([np.mean(ranks) for ranks in feature_rankings])
        return self

    def get_ranked_features(self):
        if self.rankings_ is None:
            raise Exception("No lasso path has been fit yet!")

        rank_idxs = np.argsort(self.rankings_)
        return [self.feature_labels_[i] for i in rank_idxs]
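A quick usage sketch for LassoPath (illustrative, not part of the commit): feature 0 dominates the synthetic target, so it should enter the Lasso path first and be ranked highest.

    import numpy as np
    from analysis.lasso import LassoPath

    rng = np.random.RandomState(0)
    X = rng.rand(100, 4)
    y = (3.0 * X[:, 0] + 0.1 * X[:, 2]).reshape(-1, 1)
    lasso = LassoPath()
    lasso.fit(X, y, feature_labels=['f0', 'f1', 'f2', 'f3'])
    print(lasso.get_ranked_features())  # 'f0' should come first
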
489
server/analysis/preprocessing.py
Normal file
@@ -0,0 +1,489 @@
#
# OtterTune - preprocessing.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
from abc import ABCMeta, abstractmethod
from itertools import chain, combinations, combinations_with_replacement

import numpy as np
from sklearn.preprocessing import MinMaxScaler as SklearnMinMaxScaler

from .util import is_numeric_matrix, is_lexical_matrix


# ==========================================================
# Preprocessing Base Class
# ==========================================================
class Preprocess(object, metaclass=ABCMeta):

    @abstractmethod
    def fit(self, matrix):
        pass

    @abstractmethod
    def transform(self, matrix, copy=True):
        pass

    def fit_transform(self, matrix, copy=True):
        self.fit(matrix)
        return self.transform(matrix, copy=copy)

    @abstractmethod
    def inverse_transform(self, matrix, copy=True):
        pass

# ==========================================================
# Bin by Deciles
# ==========================================================
class Bin(Preprocess):

    def __init__(self, bin_start, axis=None):
        if axis is not None and \
                axis != 1 and axis != 0:
            raise NotImplementedError("Axis={} is not yet implemented".format(axis))
        self.deciles_ = None
        self.bin_start_ = bin_start
        self.axis_ = axis

    def fit(self, matrix):
        if self.axis_ is None:
            self.deciles_ = get_deciles(matrix, self.axis_)
        elif self.axis_ == 0:  # Bin columns
            self.deciles_ = []
            for col in matrix.T:
                self.deciles_.append(get_deciles(col, axis=None))
        elif self.axis_ == 1:  # Bin rows
            self.deciles_ = []
            for row in matrix:
                self.deciles_.append(get_deciles(row, axis=None))
        return self

    def transform(self, matrix, copy=True):
        assert self.deciles_ is not None
        if self.axis_ is None:
            res = bin_by_decile(matrix, self.deciles_,
                                self.bin_start_, self.axis_)
        elif self.axis_ == 0:  # Transform columns
            columns = []
            for col, decile in zip(matrix.T, self.deciles_):
                columns.append(bin_by_decile(col, decile,
                                             self.bin_start_, axis=None))
            res = np.vstack(columns).T
        elif self.axis_ == 1:  # Transform rows
            rows = []
            for row, decile in zip(matrix, self.deciles_):
                rows.append(bin_by_decile(row, decile,
                                          self.bin_start_, axis=None))
            res = np.vstack(rows)
        assert res.shape == matrix.shape
        return res

    def inverse_transform(self, matrix, copy=True):
        raise NotImplementedError("This method is not supported")


def get_deciles(matrix, axis=None):
    if axis is not None:
        raise NotImplementedError("Axis is not yet implemented")

    assert matrix.ndim > 0
    assert matrix.size > 0

    decile_range = np.arange(10, 101, 10)
    deciles = np.percentile(matrix, decile_range, axis=axis)
    deciles[-1] = np.Inf
    return deciles


def bin_by_decile(matrix, deciles, bin_start, axis=None):
    if axis is not None:
        raise NotImplementedError("Axis is not yet implemented")

    assert matrix.ndim > 0
    assert matrix.size > 0
    assert deciles is not None
    assert len(deciles) == 10

    binned_matrix = np.zeros_like(matrix)
    for i in range(10)[::-1]:
        decile = deciles[i]
        binned_matrix[matrix <= decile] = i + bin_start

    return binned_matrix

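A small sketch of the decile binning helpers above (illustrative only): each value is mapped to the decile it falls in, offset by bin_start.

    data = np.arange(1, 101, dtype=float)
    deciles = get_deciles(data)          # 10 thresholds; the last is +inf
    binned = bin_by_decile(data, deciles, bin_start=1)
    # binned[0] == 1 (lowest decile), binned[-1] == 10 (highest decile)
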
# ==========================================================
# Shuffle Indices
# ==========================================================
class Shuffler(Preprocess):

    def __init__(self, shuffle_rows=True, shuffle_columns=False,
                 row_indices=None, column_indices=None, seed=0):
        self.shuffle_rows_ = shuffle_rows
        self.shuffle_columns_ = shuffle_columns
        self.row_indices_ = row_indices
        self.column_indices_ = column_indices
        np.random.seed(seed)
        self.fitted_ = False

    def fit(self, matrix):
        if self.shuffle_rows_ and self.row_indices_ is None:
            self.row_indices_ = get_shuffle_indices(matrix.data.shape[0])
        if self.shuffle_columns_ and self.column_indices_ is None:
            self.column_indices_ = get_shuffle_indices(matrix.data.shape[1])
        self.fitted_ = True

    def transform(self, matrix, copy=True):
        if not self.fitted_:
            raise Exception("The fit() function must be called before transform()")
        if copy:
            matrix = matrix.copy()

        if self.shuffle_rows_:
            matrix.data = matrix.data[self.row_indices_]
            matrix.rowlabels = matrix.rowlabels[self.row_indices_]
        if self.shuffle_columns_:
            matrix.data = matrix.data[:, self.column_indices_]
            matrix.columnlabels = matrix.columnlabels[self.column_indices_]
        return matrix

    def inverse_transform(self, matrix, copy=True):
        if copy:
            matrix = matrix.copy()

        if self.shuffle_rows_:
            inverse_row_indices = np.argsort(self.row_indices_)
            matrix.data = matrix.data[inverse_row_indices]
            matrix.rowlabels = matrix.rowlabels[inverse_row_indices]
        if self.shuffle_columns_:
            inverse_column_indices = np.argsort(self.column_indices_)
            matrix.data = matrix.data[:, inverse_column_indices]
            matrix.columnlabels = matrix.columnlabels[inverse_column_indices]
        return matrix


def get_shuffle_indices(size, seed=None):
    if seed is not None:
        assert isinstance(seed, int)
        np.random.seed(seed)
    if isinstance(size, int):
        return np.random.choice(size, size, replace=False)
    else:
        indices = []
        for d in size:
            indices.append(np.random.choice(d, d, replace=False))
        return indices

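A round-trip sketch for Shuffler (illustrative only; the Matrix class here is a hypothetical stand-in for the repo's matrix wrapper, which exposes data, rowlabels, and columnlabels):

    class Matrix(object):
        def __init__(self, data, rowlabels, columnlabels):
            self.data, self.rowlabels, self.columnlabels = data, rowlabels, columnlabels
        def copy(self):
            return Matrix(self.data.copy(), self.rowlabels.copy(), self.columnlabels.copy())

    m = Matrix(np.arange(12).reshape(4, 3), np.arange(4), np.arange(3))
    shuffler = Shuffler(shuffle_rows=True, shuffle_columns=False)
    shuffler.fit(m)
    shuffled = shuffler.transform(m)
    restored = shuffler.inverse_transform(shuffled)
    assert np.array_equal(restored.data, m.data)
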
# ==========================================================
# Polynomial Features
# ==========================================================
class PolynomialFeatures(Preprocess):
    """Compute the polynomial features of the input array.
    This code was copied and modified from sklearn's
    implementation.
    """

    def __init__(self, degree=2, interaction_only=False, include_bias=True):
        self.degree_ = degree
        self.interaction_only_ = interaction_only
        self.include_bias_ = include_bias
        self.n_input_features_ = None
        self.n_output_features_ = None

    # @property
    # def powers_(self):
    #     combinations = self._combinations(self.n_input_features_, self.degree_,
    #                                       self.interaction_only_,
    #                                       self.include_bias_)
    #     return np.vstack(np.bincount(c, minlength=self.n_input_features_)
    #                      for c in combinations)

    @staticmethod
    def _combinations(n_features, degree, interaction_only, include_bias):
        comb = (combinations if interaction_only else combinations_with_replacement)
        start = int(not include_bias)
        return chain.from_iterable(comb(list(range(n_features)), i)
                                   for i in range(start, degree + 1))

    def fit(self, matrix):
        assert matrix.ndim == 2
        assert matrix.size > 0

        _, n_features = matrix.shape
        combos = self._combinations(n_features, self.degree_,
                                    self.interaction_only_,
                                    self.include_bias_)
        self.n_input_features_ = matrix.shape[1]
        self.n_output_features_ = sum(1 for _ in combos)
        return self

    def transform(self, matrix, copy=True):
        """Transform data to polynomial features

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to transform, row by row.

        Returns
        -------
        XP : np.ndarray shape [n_samples, NP]
            The matrix of features, where NP is the number of polynomial
            features generated from the combination of inputs.
        """
        assert matrix.ndim == 2
        assert matrix.size > 0

        n_samples, n_features = matrix.shape

        if n_features != self.n_input_features_:
            raise ValueError("X shape does not match training shape")

        is_numeric_type = is_numeric_matrix(matrix)
        is_lexical_type = is_lexical_matrix(matrix)
        if is_lexical_type:
            strs = matrix.reshape((matrix.size,))
            maxlen = max([len(s) for s in strs])
            dtype = "S{}".format(maxlen * 2 + 1)
        else:
            dtype = matrix.dtype

        # allocate output data
        poly_matrix = np.empty((n_samples, self.n_output_features_), dtype=dtype)

        combos = self._combinations(n_features, self.degree_,
                                    self.interaction_only_,
                                    self.include_bias_)
        for i, c in enumerate(combos):
            if is_numeric_type:
                poly_matrix[:, i] = matrix[:, c].prod(1)
            elif is_lexical_type:
                n_poly1_feats = n_features + int(self.include_bias_)
                if i >= n_poly1_feats:
                    x = "*".join(np.squeeze(matrix[:, c]).tolist())
                else:
                    x = "".join(np.squeeze(matrix[:, c]).tolist())
                poly_matrix[:, i] = x
            else:
                raise TypeError("Unsupported matrix type {}".format(matrix.dtype))

        return poly_matrix

    def inverse_transform(self, matrix, copy=True):
        raise NotImplementedError("This method is not supported")

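A numeric sketch of PolynomialFeatures (illustrative only): with degree=2 and a bias column, two features [a, b] expand to [1, a, b, a^2, ab, b^2].

    poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
    X = np.array([[2.0, 3.0]])
    poly.fit(X)
    print(poly.transform(X))  # [[1. 2. 3. 4. 6. 9.]]
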
# ==========================================================
# Dummy Encoding
# ==========================================================
class DummyEncoder(Preprocess):

    def __init__(self, n_values, categorical_features, cat_columnlabels, noncat_columnlabels):
        from sklearn.preprocessing import OneHotEncoder

        if not isinstance(n_values, np.ndarray):
            n_values = np.array(n_values)
        if not isinstance(categorical_features, np.ndarray):
            categorical_features = np.array(categorical_features)
        # assert categorical_features.size > 0
        assert categorical_features.shape == n_values.shape
        for nv in n_values:
            if nv <= 2:
                raise Exception("Categorical features must have 3+ labels")

        self.n_values = n_values
        self.cat_columnlabels = cat_columnlabels
        self.noncat_columnlabels = noncat_columnlabels
        self.encoder = OneHotEncoder(
            n_values=n_values, categorical_features=categorical_features, sparse=False)
        self.new_labels = None
        self.cat_idxs_old = categorical_features

    def fit(self, matrix):
        self.encoder.fit(matrix)
        # Determine the new columnlabels. Categorical variables are handled
        # in the order specified by categorical_features.
        new_labels = []
        for i, cat_label in enumerate(self.cat_columnlabels):
            low = self.encoder.feature_indices_[i]
            high = self.encoder.feature_indices_[i + 1]
            for j in range(low, high):
                # e.g., the categorical variable named cat_var with 5 possible values
                # turns into 0/1 variables named cat_var____0, ..., cat_var____4
                new_labels.append(cat_label + "____" + str(j - low))
        # According to the sklearn documentation,
        # "non-categorical features are always stacked to the right of the matrix".
        # By observation, it looks like the non-categorical features' relative order
        # is preserved, BUT there is no guarantee made about that behavior!
        # We either trust OneHotEncoder to be sensible, or look for some other way.
        new_labels += self.noncat_columnlabels
        self.new_labels = new_labels

    def transform(self, matrix, copy=True):
        # actually transform the matrix
        matrix_encoded = self.encoder.transform(matrix)
        return matrix_encoded

    def fit_transform(self, matrix, copy=True):
        self.fit(matrix)
        return self.transform(matrix)

    def inverse_transform(self, matrix, copy=True):
        n_values = self.n_values
        # If there are no categorical variables, no transformation happened.
        if len(n_values) == 0:
            return matrix

        # Otherwise, this is a dummy-encoded matrix. Transform it back to original form.
        n_features = matrix.shape[-1] - self.encoder.feature_indices_[-1] + len(n_values)
        noncat_start_idx = self.encoder.feature_indices_[-1]
        inverted_matrix = np.empty((matrix.shape[0], n_features))
        cat_idx = 0
        noncat_idx = 0
        for i in range(n_features):
            if i in self.cat_idxs_old:
                new_col = np.ones((matrix.shape[0],))
                start_idx = self.encoder.feature_indices_[cat_idx]
                for j in range(n_values[cat_idx]):
                    col = matrix[:, start_idx + j]
                    new_col[col == 1] = j
                cat_idx += 1
            else:
                new_col = np.array(matrix[:, noncat_start_idx + noncat_idx])
                noncat_idx += 1
            inverted_matrix[:, i] = new_col
        return inverted_matrix

    def total_dummies(self):
        return sum(self.n_values)

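An encode/decode sketch for DummyEncoder (illustrative only): a 3-level categorical in column 0 becomes three 0/1 columns, and inverse_transform recovers the original layout.

    enc = DummyEncoder(n_values=[3], categorical_features=[0],
                       cat_columnlabels=['cat'], noncat_columnlabels=['x'])
    X = np.array([[0, 7], [2, 9]], dtype=float)
    X_enc = enc.fit_transform(X)      # [[1. 0. 0. 7.], [0. 0. 1. 9.]]
    X_dec = enc.inverse_transform(X_enc)
    assert np.array_equal(X, X_dec)
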
def consolidate_columnlabels(columnlabels):
    import re
    # use this to check if a label was created by the dummy encoder
    p = re.compile(r'(.*)____\d+')

    consolidated_columnlabels = []
    cat_seen = set()  # avoid duplicate cat_labels
    for lab in columnlabels:
        m = p.match(lab)
        # m.group(1) is the original column name
        if m:
            if m.group(1) not in cat_seen:
                cat_seen.add(m.group(1))
                consolidated_columnlabels.append(m.group(1))
        else:
            # non-categorical variable
            consolidated_columnlabels.append(lab)
    return consolidated_columnlabels

def fix_scaler(scaler, encoder, params):
    # Overwrite the scaler's statistics for dummy-encoded (and boolean) columns
    # with fixed values derived from p = 0.5, so one-hot columns are scaled
    # consistently rather than by their observed moments.
    p = 0.5
    mean = scaler.mean_
    var = scaler.var_
    n_values = encoder.n_values
    cat_start_idxs = encoder.xform_start_indices
    current_idx = 0
    cat_idx = 0
    for param in params:
        if param.iscategorical:
            if param.isboolean:
                nvals = 1
            else:
                assert cat_start_idxs[cat_idx] == current_idx
                nvals = n_values[cat_idx]
                cat_idx += 1
            cat_mean = nvals * p
            cat_var = cat_mean * (1 - p)
            mean[current_idx: current_idx + nvals] = cat_mean
            var[current_idx: current_idx + nvals] = cat_var
            current_idx += nvals
        else:
            current_idx += 1

    scaler.mean_ = mean
    scaler.var_ = var
    scaler.scale_ = np.sqrt(var)


def get_min_max(params, encoder=None):
    if encoder is not None:
        num_cat_feats = encoder.n_values.size
        nfeats = len(params) - num_cat_feats + np.sum(encoder.n_values)
        n_values = encoder.n_values
        cat_start_idxs = encoder.xform_start_indices
    else:
        num_cat_feats = 0
        nfeats = len(params)
        n_values = np.array([])
        cat_start_idxs = np.array([])

    mins = np.empty((nfeats,))
    maxs = np.empty((nfeats,))
    current_idx = 0
    cat_idx = 0
    for param in params:
        if param.iscategorical:
            if param.isboolean:
                nvals = 1
            else:
                assert cat_start_idxs[cat_idx] == current_idx
                nvals = n_values[cat_idx]
                cat_idx += 1
            mins[current_idx: current_idx + nvals] = 0
            maxs[current_idx: current_idx + nvals] = 1
            current_idx += nvals
        else:
            mins[current_idx] = param.true_range[0]  # valid_values[0]
            maxs[current_idx] = param.true_range[1]  # valid_values[-1]
            current_idx += 1
    return mins, maxs

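A sketch of get_min_max without an encoder (illustrative only; Param here is a hypothetical stand-in for the knob objects used elsewhere, exposing the iscategorical, isboolean, and true_range attributes the function reads):

    from collections import namedtuple
    Param = namedtuple('Param', ['iscategorical', 'isboolean', 'true_range'])
    params = [Param(False, False, (0.0, 10.0)),
              Param(True, True, None)]
    mins, maxs = get_min_max(params)
    # mins == [0., 0.], maxs == [10., 1.]
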
# ==========================================================
# Min-max scaler
# ==========================================================
class MinMaxScaler(Preprocess):

    def __init__(self, mins=None, maxs=None):
        self.scaler_ = SklearnMinMaxScaler()
        if mins is not None:
            assert isinstance(mins, np.ndarray)
            if mins.ndim == 1:
                mins = mins.reshape(1, -1)
            self.scaler_.partial_fit(mins)
            self.mins_ = mins
        else:
            self.mins_ = None
        if maxs is not None:
            assert isinstance(maxs, np.ndarray)
            if maxs.ndim == 1:
                maxs = maxs.reshape(1, -1)
            self.scaler_.partial_fit(maxs)
            self.maxs_ = maxs
        else:
            self.maxs_ = None
        self.fitted_ = self.mins_ is not None and self.maxs_ is not None

    def fit(self, matrix):
        if matrix.ndim == 1:
            matrix = matrix.reshape(1, -1)
        self.scaler_.partial_fit(matrix)
        self.mins_ = self.scaler_.data_min_
        self.maxs_ = self.scaler_.data_max_
        self.fitted_ = True
        return self

    def transform(self, matrix, copy=True):
        if not self.fitted_:
            raise Exception("Model not fitted!")
        if matrix.ndim == 1:
            matrix = matrix.reshape(1, -1)
        return self.scaler_.transform(matrix)

    def inverse_transform(self, matrix, copy=True):
        if matrix.ndim == 1:
            matrix = matrix.reshape(1, -1)
        return self.scaler_.inverse_transform(matrix)
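A sketch of the MinMaxScaler wrapper above (illustrative only): passing fixed mins and maxs pins the scaling range without calling fit.

    scaler = MinMaxScaler(mins=np.array([0.0, 0.0]), maxs=np.array([10.0, 5.0]))
    print(scaler.transform(np.array([5.0, 5.0])))  # [[0.5 1. ]]
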
5
server/analysis/tests/__init__.py
Normal file
@@ -0,0 +1,5 @@
#
# OtterTune - __init__.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
91
server/analysis/tests/test_cluster.py
Normal file
@@ -0,0 +1,91 @@
#
# OtterTune - test_cluster.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
import unittest
import numpy as np
from sklearn import datasets

from analysis.cluster import KMeans, KMeansClusters, create_kselection_model


class TestKMeans(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        super(TestKMeans, cls).setUpClass()
        iris = datasets.load_iris()
        cls.model = KMeans()
        cls.model.fit(iris.data, 5, iris.target,
                      estimator_params={'n_init': 50, 'random_state': 42})

    def test_kmeans_n_clusters(self):
        self.assertEqual(self.model.n_clusters_, 5)

    def test_kmeans_cluster_inertia(self):
        self.assertAlmostEqual(self.model.cluster_inertia_, 46.535, 2)

    def test_kmeans_cluster_labels(self):
        expected_labels = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 2, 3, 3, 3,
                           2, 3, 2, 2, 3, 2, 3, 2, 3, 3, 2, 3, 2, 3, 2, 3, 3, 3, 3,
                           3, 3, 3, 2, 2, 2, 2, 3, 2, 3, 3, 3, 2, 2, 2, 3, 2, 2, 2,
                           2, 2, 3, 2, 2, 4, 3, 0, 4, 4, 0, 2, 0, 4, 0, 4, 4, 4, 3,
                           4, 4, 4, 0, 0, 3, 4, 3, 0, 3, 4, 0, 3, 3, 4, 0, 0, 0, 4,
                           3, 3, 0, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 3]
        for lab_actual, lab_expected in zip(self.model.cluster_labels_, expected_labels):
            self.assertEqual(lab_actual, lab_expected)

    def test_kmeans_sample_labels(self):
        for lab_actual, lab_expected in zip(self.model.sample_labels_, datasets.load_iris().target):
            self.assertEqual(lab_actual, lab_expected)

    def test_kmeans_cluster_centers(self):
        expected_centers = [[7.475, 3.125, 6.300, 2.050],
                            [5.006, 3.418, 1.464, 0.244],
                            [5.508, 2.600, 3.908, 1.204],
                            [6.207, 2.853, 4.746, 1.564],
                            [6.529, 3.058, 5.508, 2.162]]
        for row_actual, row_expected in zip(self.model.cluster_centers_, expected_centers):
            for val_actual, val_expected in zip(row_actual, row_expected):
                self.assertAlmostEqual(val_actual, val_expected, 2)


class TestKSelection(unittest.TestCase):

    def setUp(self):
        np.random.seed(seed=42)

    @classmethod
    def setUpClass(cls):
        super(TestKSelection, cls).setUpClass()

        # Load Iris data
        iris = datasets.load_iris()
        cls.matrix = iris.data
        cls.kmeans_models = KMeansClusters()
        cls.kmeans_models.fit(cls.matrix,
                              min_cluster=1,
                              max_cluster=10,
                              sample_labels=iris.target,
                              estimator_params={'n_init': 50, 'random_state': 42})

    def test_detk_optimal_num_clusters(self):
        # Compute the optimal number of clusters using det-k
        detk = create_kselection_model("det-k")
        detk.fit(self.matrix, self.kmeans_models.cluster_map_)
        self.assertEqual(detk.optimal_num_clusters_, 2)

    def test_gap_statistic_optimal_num_clusters(self):
        # Compute the optimal number of clusters using the gap statistic
        gap = create_kselection_model("gap-statistic")
        gap.fit(self.matrix, self.kmeans_models.cluster_map_)
        self.assertEqual(gap.optimal_num_clusters_, 8)

    def test_silhouette_optimal_num_clusters(self):
        # Compute the optimal number of clusters using silhouette analysis
        sil = create_kselection_model("s-score")
        sil.fit(self.matrix, self.kmeans_models.cluster_map_)
        self.assertEqual(sil.optimal_num_clusters_, 2)
116
server/analysis/tests/test_constraints.py
Normal file
@@ -0,0 +1,116 @@
#
# OtterTune - test_constraints.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
import unittest
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import StandardScaler

from analysis.constraints import ParamConstraintHelper
from analysis.preprocessing import DummyEncoder


class ConstraintHelperTestCase(unittest.TestCase):

    def test_scale_rescale(self):
        X = datasets.load_boston()['data']
        X_scaler = StandardScaler()
        # params hard-coded for test (messy to import constant from website module)
        constraint_helper = ParamConstraintHelper(X_scaler, None,
                                                  init_flip_prob=0.3,
                                                  flip_prob_decay=0.5)
        X_scaled = X_scaler.fit_transform(X)
        # there may be some floating point imprecision between scaling and rescaling
        row_unscaled = np.round(constraint_helper._handle_scaling(X_scaled[0], True), 10)  # pylint: disable=protected-access
        self.assertTrue(np.all(X[0] == row_unscaled))
        row_rescaled = constraint_helper._handle_rescaling(row_unscaled, True)  # pylint: disable=protected-access
        self.assertTrue(np.all(X_scaled[0] == row_rescaled))

    def test_apply_constraints_unscaled(self):
        n_values = [3]
        categorical_features = [0]
        encoder = DummyEncoder(n_values, categorical_features, ['a'], [])
        encoder.fit([[0, 17]])
        X_scaler = StandardScaler()
        constraint_helper = ParamConstraintHelper(X_scaler, encoder,
                                                  init_flip_prob=0.3,
                                                  flip_prob_decay=0.5)

        X = [0.1, 0.2, 0.3, 17]
        X_expected = [0, 0, 1, 17]
        X_corrected = constraint_helper.apply_constraints(X, scaled=False, rescale=False)
        self.assertTrue(np.all(X_corrected == X_expected))

    def test_apply_constraints(self):
        n_values = [3]
        categorical_features = [0]
        encoder = DummyEncoder(n_values, categorical_features, ['a'], [])
        encoder.fit([[0, 17]])
        X_scaler = StandardScaler()
        X = np.array([[0, 0, 1, 17], [1, 0, 0, 17]], dtype=float)
        X_scaled = X_scaler.fit_transform(X)
        constraint_helper = ParamConstraintHelper(X_scaler, encoder,
                                                  init_flip_prob=0.3,
                                                  flip_prob_decay=0.5)

        row = X_scaled[0]
        new_row = np.copy(row)
        new_row[0: 3] += 0.1  # should still represent [0, 0, 1] encoding
        row_corrected = constraint_helper.apply_constraints(new_row)
        self.assertTrue(np.all(row == row_corrected))

    # Tests that repeatedly applying randomize_categorical_features
    # always results in valid configurations of categorical dummy encodings
    # and will lead to all possible values of categorical variables being tried.
    def test_randomize_categorical_features(self):
        # variable 0 is categorical, 3 values
        # variable 1 is not categorical
        # variable 2 is categorical, 4 values
        cat_var_0_levels = 3
        cat_var_2_levels = 4
        cat_var_0_idx = 0
        cat_var_2_idx = 2
        n_values = [cat_var_0_levels, cat_var_2_levels]
        categorical_features = [cat_var_0_idx, cat_var_2_idx]
        encoder = DummyEncoder(n_values, categorical_features, ['a', 'b'], [])
        encoder.fit([[0, 17, 0]])
        X_scaler = StandardScaler()
        constraint_helper = ParamConstraintHelper(X_scaler, encoder,
                                                  init_flip_prob=0.3,
                                                  flip_prob_decay=0.5)

        # row is a sample encoded set of features;
        # note that the non-categorical variable is on the right
        row = np.array([0, 0, 1, 1, 0, 0, 0, 17], dtype=float)
        trials = 20
        cat_var_0_counts = np.zeros(cat_var_0_levels)
        cat_var_2_counts = np.zeros(cat_var_2_levels)
        for _ in range(trials):
            # possibly flip the categorical features
            row = constraint_helper.randomize_categorical_features(row, scaled=False, rescale=False)

            # check that the result is valid for cat_var_0
            cat_var_0_dummies = row[0: cat_var_0_levels]
            self.assertTrue(np.all(np.logical_or(cat_var_0_dummies == 0, cat_var_0_dummies == 1)))
            self.assertEqual(np.sum(cat_var_0_dummies), 1)
            cat_var_0_counts[np.argmax(cat_var_0_dummies)] += 1

            # check that the result is valid for cat_var_2
            cat_var_2_dummies = row[cat_var_0_levels: cat_var_0_levels + cat_var_2_levels]
            self.assertTrue(np.all(np.logical_or(cat_var_2_dummies == 0, cat_var_2_dummies == 1)))
            self.assertEqual(np.sum(cat_var_2_dummies), 1)
            cat_var_2_counts[np.argmax(cat_var_2_dummies)] += 1

        self.assertEqual(row[-1], 17)

        for ct in cat_var_0_counts:
            self.assertTrue(ct > 0)

        for ct in cat_var_2_counts:
            self.assertTrue(ct > 0)


if __name__ == '__main__':
    unittest.main()
61
server/analysis/tests/test_gpr.py
Normal file
@@ -0,0 +1,61 @@
#
# OtterTune - test_gpr.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
import unittest
from sklearn import datasets
from analysis.gp import GPRNP
from analysis.gp_tf import GPR


# test the numpy version of GPR
class TestGPRNP(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        super(TestGPRNP, cls).setUpClass()
        boston = datasets.load_boston()
        data = boston['data']
        X_train = data[0:500]
        X_test = data[500:]
        y_train = boston['target'][0:500].reshape(500, 1)
        cls.model = GPRNP(length_scale=1.0, magnitude=1.0)
        cls.model.fit(X_train, y_train, ridge=1.0)
        cls.gpr_result = cls.model.predict(X_test)

    def test_gprnp_ypreds(self):
        ypreds_round = [round(x[0], 4) for x in self.gpr_result.ypreds]
        expected_ypreds = [0.0181, 0.0014, 0.0006, 0.0015, 0.0039, 0.0014]
        self.assertEqual(ypreds_round, expected_ypreds)

    def test_gprnp_sigmas(self):
        sigmas_round = [round(x[0], 4) for x in self.gpr_result.sigmas]
        expected_sigmas = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
        self.assertEqual(sigmas_round, expected_sigmas)


# test the TensorFlow version of GPR
class TestGPRTF(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        super(TestGPRTF, cls).setUpClass()
        boston = datasets.load_boston()
        data = boston['data']
        X_train = data[0:500]
        X_test = data[500:]
        y_train = boston['target'][0:500].reshape(500, 1)
        cls.model = GPR(length_scale=1.0, magnitude=1.0)
        cls.model.fit(X_train, y_train, ridge=1.0)
        cls.gpr_result = cls.model.predict(X_test)

    def test_gprtf_ypreds(self):
        ypreds_round = [round(x[0], 4) for x in self.gpr_result.ypreds]
        expected_ypreds = [0.0181, 0.0014, 0.0006, 0.0015, 0.0039, 0.0014]
        self.assertEqual(ypreds_round, expected_ypreds)

    def test_gprtf_sigmas(self):
        sigmas_round = [round(x[0], 4) for x in self.gpr_result.sigmas]
        expected_sigmas = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
        self.assertEqual(sigmas_round, expected_sigmas)
83
server/analysis/tests/test_preprocessing.py
Normal file
@@ -0,0 +1,83 @@
#
# OtterTune - test_preprocessing.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
import unittest
import numpy as np

from analysis.preprocessing import DummyEncoder, consolidate_columnlabels


class TestDummyEncoder(unittest.TestCase):

    def test_no_categoricals(self):
        X = [[1, 2, 3], [4, 5, 6]]
        n_values = []
        categorical_features = []
        cat_columnlabels = []
        noncat_columnlabels = ['a', 'b', 'c']

        enc = DummyEncoder(n_values, categorical_features,
                           cat_columnlabels, noncat_columnlabels)
        X_encoded = enc.fit_transform(X)
        new_labels = enc.new_labels
        self.assertTrue(np.all(X == X_encoded))
        self.assertEqual(noncat_columnlabels, new_labels)

    def test_simple_categorical(self):
        X = [[0, 1, 2], [1, 1, 2], [2, 1, 2]]
        n_values = [3]
        categorical_features = [0]
        cat_columnlabels = ['label']
        noncat_columnlabels = ['a', 'b']

        X_expected = [[1, 0, 0, 1, 2], [0, 1, 0, 1, 2], [0, 0, 1, 1, 2]]
        new_labels_expected = ['label____0', 'label____1', 'label____2', 'a', 'b']
        enc = DummyEncoder(n_values, categorical_features,
                           cat_columnlabels, noncat_columnlabels)
        X_encoded = enc.fit_transform(X)
        new_labels = enc.new_labels
        self.assertTrue(np.all(X_expected == X_encoded))
        self.assertEqual(new_labels_expected, new_labels)

    def test_mixed_categorical(self):
        X = [[1, 0, 2], [1, 1, 2], [1, 2, 2]]
        n_values = [3]
        categorical_features = [1]
        cat_columnlabels = ['label']
        noncat_columnlabels = ['a', 'b']

        X_expected = [[1, 0, 0, 1, 2], [0, 1, 0, 1, 2], [0, 0, 1, 1, 2]]
        new_labels_expected = ['label____0', 'label____1', 'label____2', 'a', 'b']
        enc = DummyEncoder(n_values, categorical_features,
                           cat_columnlabels, noncat_columnlabels)
        X_encoded = enc.fit_transform(X)
        new_labels = enc.new_labels
        self.assertTrue(np.all(X_expected == X_encoded))
        self.assertEqual(new_labels_expected, new_labels)

    def test_consolidate(self):
        labels = ['label1____0', 'label1____1', 'label2____0', 'label2____1', 'noncat']
        consolidated = consolidate_columnlabels(labels)
        expected = ['label1', 'label2', 'noncat']
        self.assertEqual(expected, consolidated)

    def test_inverse_transform(self):
        X = [[1, 0, 2], [1, 1, 2], [1, 2, 2]]
        n_values = [3]
        categorical_features = [1]
        cat_columnlabels = ['label']
        noncat_columnlabels = ['a', 'b']

        X_expected = [[1, 0, 0, 1, 2], [0, 1, 0, 1, 2], [0, 0, 1, 1, 2]]
        enc = DummyEncoder(n_values, categorical_features,
                           cat_columnlabels, noncat_columnlabels)
        X_encoded = enc.fit_transform(X)
        self.assertTrue(np.all(X_encoded == X_expected))
        X_decoded = enc.inverse_transform(X_encoded)
        self.assertTrue(np.all(X == X_decoded))


if __name__ == '__main__':
    unittest.main()
106
server/analysis/util.py
Normal file
@@ -0,0 +1,106 @@
#
# OtterTune - util.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Oct 24, 2017

@author: dva
'''

import logging
from numbers import Number

import contextlib
import datetime
import numpy as np


def get_analysis_logger(name, level=logging.INFO):
    logger = logging.getLogger(name)
    log_handler = logging.StreamHandler()
    log_formatter = logging.Formatter(
        fmt='%(asctime)s [%(funcName)s:%(lineno)03d] %(levelname)-5s: %(message)s',
        datefmt='%m-%d-%Y %H:%M:%S'
    )
    log_handler.setFormatter(log_formatter)
    logger.addHandler(log_handler)
    logger.setLevel(level)
    np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
    return logger


LOG = get_analysis_logger(__name__)


def stdev_zero(data, axis=None, nearzero=1e-8):
    mstd = np.expand_dims(data.std(axis=axis), axis=axis)
    return (np.abs(mstd) < nearzero).squeeze()


def get_datetime():
    return datetime.datetime.utcnow()


class TimerStruct(object):

    def __init__(self):
        self.__start_time = 0.0
        self.__stop_time = 0.0
        self.__elapsed = None

    @property
    def elapsed_seconds(self):
        if self.__elapsed is None:
            return (get_datetime() - self.__start_time).total_seconds()
        return self.__elapsed.total_seconds()

    def start(self):
        self.__start_time = get_datetime()

    def stop(self):
        self.__stop_time = get_datetime()
        self.__elapsed = (self.__stop_time - self.__start_time)


@contextlib.contextmanager
def stopwatch(message=None):
    ts = TimerStruct()
    ts.start()
    try:
        yield ts
    finally:
        ts.stop()
        if message is not None:
            LOG.info('Total elapsed_seconds time for %s: %.3fs', message, ts.elapsed_seconds)


def get_data_base(arr):
    """For a given Numpy array, finds the
    base array that "owns" the actual data."""
    base = arr
    while isinstance(base.base, np.ndarray):
        base = base.base
    return base


def arrays_share_data(x, y):
    return get_data_base(x) is get_data_base(y)


def array_tostring(arr):
    arr_shape = arr.shape
    arr = arr.ravel()
    arr = np.array([str(a) for a in arr])
    return arr.reshape(arr_shape)


def is_numeric_matrix(matrix):
    assert matrix.size > 0
    return isinstance(matrix.ravel()[0], Number)


def is_lexical_matrix(matrix):
    assert matrix.size > 0
    return isinstance(matrix.ravel()[0], str)
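A usage sketch for the timing helpers above (illustrative only): the stopwatch context manager logs the elapsed time when a message is given.

    with stopwatch('matrix multiply') as ts:
        np.dot(np.random.rand(200, 200), np.random.rand(200, 200))
    # logs: Total elapsed_seconds time for matrix multiply: ...s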