ottertune/server/analysis/preprocessing.py

490 lines
17 KiB
Python

#
# OtterTune - preprocessing.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
from abc import ABCMeta, abstractmethod
from itertools import chain, combinations, combinations_with_replacement
import numpy as np
from sklearn.preprocessing import MinMaxScaler as SklearnMinMaxScaler
from .util import is_numeric_matrix, is_lexical_matrix
# ==========================================================
# Preprocessing Base Class
# ==========================================================
class Preprocess(object, metaclass=ABCMeta):
    """Abstract interface implemented by every preprocessing step here.

    Subclasses must provide ``fit``, ``transform`` and
    ``inverse_transform``; ``fit_transform`` is supplied by this base class.
    """

    @abstractmethod
    def fit(self, matrix):
        """Learn whatever state the step needs from ``matrix``."""

    @abstractmethod
    def transform(self, matrix, copy=True):
        """Apply the step to ``matrix`` and return the result."""

    def fit_transform(self, matrix, copy=True):
        """Fit on ``matrix`` and then transform that same ``matrix``.

        The caller's ``copy`` flag is forwarded to ``transform`` (it used to
        be silently replaced by ``True``).
        """
        self.fit(matrix)
        return self.transform(matrix, copy=copy)

    @abstractmethod
    def inverse_transform(self, matrix, copy=True):
        """Undo ``transform`` on ``matrix`` where the step supports it."""
# ==========================================================
# Bin by Deciles
# ==========================================================
class Bin(Preprocess):
    """Replace every value by the index of the decile it falls into.

    ``axis=None`` computes one set of deciles over the whole matrix,
    ``axis=0`` one set per column and ``axis=1`` one set per row. Bin
    numbers start at ``bin_start``.
    """

    def __init__(self, bin_start, axis=None):
        # Only "whole matrix", "per column" and "per row" are supported.
        if not (axis is None or axis == 1 or axis == 0):
            raise NotImplementedError("Axis={} is not yet implemented".format(axis))
        self.deciles_ = None
        self.bin_start_ = bin_start
        self.axis_ = axis

    def fit(self, matrix):
        """Compute and store the decile boundaries of ``matrix``; return self."""
        if self.axis_ is None:
            self.deciles_ = get_deciles(matrix, self.axis_)
        elif self.axis_ == 0:
            # Iterating over the transpose yields one column at a time.
            self.deciles_ = [get_deciles(column, axis=None) for column in matrix.T]
        elif self.axis_ == 1:
            self.deciles_ = [get_deciles(line, axis=None) for line in matrix]
        return self

    def transform(self, matrix, copy=True):
        """Return a matrix of bin numbers with the same shape as ``matrix``.

        ``copy`` is accepted for interface compatibility and is not used.
        """
        assert self.deciles_ is not None
        if self.axis_ is None:
            binned = bin_by_decile(matrix, self.deciles_,
                                   self.bin_start_, self.axis_)
        elif self.axis_ == 0:
            # Bin each column against its own deciles, then stack the binned
            # columns as rows and transpose back.
            binned = np.vstack([
                bin_by_decile(column, cuts, self.bin_start_, axis=None)
                for column, cuts in zip(matrix.T, self.deciles_)
            ]).T
        elif self.axis_ == 1:
            binned = np.vstack([
                bin_by_decile(line, cuts, self.bin_start_, axis=None)
                for line, cuts in zip(matrix, self.deciles_)
            ])
        assert binned.shape == matrix.shape
        return binned

    def inverse_transform(self, matrix, copy=True):
        """Binning cannot be undone; always raises NotImplementedError."""
        raise NotImplementedError("This method is not supported")
def get_deciles(matrix, axis=None):
    """Return the ten decile upper bounds of ``matrix``.

    The 10th, 20th, ..., 100th percentiles are computed over the flattened
    input, and the last bound is then replaced by positive infinity so that
    every finite value falls below some bound.

    Parameters
    ----------
    matrix : np.ndarray
        Non-empty array with at least one dimension.
    axis : None
        Only ``None`` is supported.

    Raises
    ------
    NotImplementedError
        If ``axis`` is not ``None``.
    """
    if axis is not None:
        raise NotImplementedError("Axis is not yet implemented")
    assert matrix.ndim > 0
    assert matrix.size > 0
    decile_range = np.arange(10, 101, 10)
    deciles = np.percentile(matrix, decile_range, axis=axis)
    # ``np.Inf`` was removed in NumPy 2.0; ``np.inf`` works on every version.
    deciles[-1] = np.inf
    return deciles
def bin_by_decile(matrix, deciles, bin_start, axis=None):
    """Map each entry of ``matrix`` to the number of its decile bin.

    An entry receives ``i + bin_start`` where ``i`` is the smallest index
    with ``entry <= deciles[i]``. Entries that match no bound keep the zero
    they were initialised with. The result has the shape and dtype of
    ``matrix``.

    Raises
    ------
    NotImplementedError
        If ``axis`` is not ``None``.
    """
    if axis is not None:
        raise NotImplementedError("Axis is not yet implemented")
    assert matrix.ndim > 0
    assert matrix.size > 0
    assert deciles is not None
    assert len(deciles) == 10
    labels = np.zeros_like(matrix)
    # Go from the widest bound to the narrowest: later (smaller) bounds
    # overwrite earlier ones, leaving each entry in its lowest matching bin.
    for offset, upper_bound in reversed(list(enumerate(deciles))):
        labels[matrix <= upper_bound] = offset + bin_start
    return labels
# ==========================================================
# Shuffle Indices
# ==========================================================
class Shuffler(Preprocess):
    """Shuffle the rows and/or columns of a labelled matrix.

    The ``matrix`` objects handled here must expose ``data``, ``rowlabels``,
    ``columnlabels`` and ``copy()``, because those are the members this class
    reads and writes.
    """

    def __init__(self, shuffle_rows=True, shuffle_columns=False,
                 row_indices=None, column_indices=None, seed=0):
        self.shuffle_rows_ = shuffle_rows
        self.shuffle_columns_ = shuffle_columns
        self.row_indices_ = row_indices
        self.column_indices_ = column_indices
        # NOTE: this reseeds NumPy's global random state.
        np.random.seed(seed)
        self.fitted_ = False

    def fit(self, matrix):
        """Draw any permutation that was not supplied to the constructor.

        Returns ``self`` so calls can be chained, as the other ``fit``
        methods in this module do.
        """
        if self.shuffle_rows_ and self.row_indices_ is None:
            self.row_indices_ = get_shuffle_indices(matrix.data.shape[0])
        if self.shuffle_columns_ and self.column_indices_ is None:
            self.column_indices_ = get_shuffle_indices(matrix.data.shape[1])
        self.fitted_ = True
        return self

    def transform(self, matrix, copy=True):
        """Apply the stored permutations to data and labels.

        Works on ``matrix.copy()`` when ``copy`` is true, otherwise mutates
        ``matrix`` in place. Raises ``Exception`` if ``fit`` was not called.
        """
        if not self.fitted_:
            raise Exception("The fit() function must be called before transform()")
        if copy:
            matrix = matrix.copy()
        if self.shuffle_rows_:
            matrix.data = matrix.data[self.row_indices_]
            matrix.rowlabels = matrix.rowlabels[self.row_indices_]
        if self.shuffle_columns_:
            matrix.data = matrix.data[:, self.column_indices_]
            matrix.columnlabels = matrix.columnlabels[self.column_indices_]
        return matrix

    def inverse_transform(self, matrix, copy=True):
        """Restore the original row/column order of a shuffled matrix."""
        if copy:
            matrix = matrix.copy()
        if self.shuffle_rows_:
            # argsort of a permutation yields its inverse permutation.
            inverse_row_indices = np.argsort(self.row_indices_)
            matrix.data = matrix.data[inverse_row_indices]
            matrix.rowlabels = matrix.rowlabels[inverse_row_indices]
        if self.shuffle_columns_:
            inverse_column_indices = np.argsort(self.column_indices_)
            matrix.data = matrix.data[:, inverse_column_indices]
            matrix.columnlabels = matrix.columnlabels[inverse_column_indices]
        return matrix
def get_shuffle_indices(size, seed=None):
    """Return random permutation(s) of ``range(size)``.

    Parameters
    ----------
    size : int, NumPy integer, or iterable of those
        A single length yields one permutation array; an iterable of
        lengths yields a list with one permutation per length.
    seed : int or NumPy integer, optional
        When given, NumPy's global random state is reseeded with it first.

    Returns
    -------
    np.ndarray or list of np.ndarray
    """
    # NumPy integer scalars (np.int64 etc.) are not ``int`` instances on
    # Python 3, so accept them explicitly instead of treating them as
    # iterables.
    integer_types = (int, np.integer)
    if seed is not None:
        assert isinstance(seed, integer_types)
        np.random.seed(seed)
    if isinstance(size, integer_types):
        return np.random.choice(size, size, replace=False)
    return [np.random.choice(d, d, replace=False) for d in size]
# ==========================================================
# Polynomial Features
# ==========================================================
class PolynomialFeatures(Preprocess):
    """Expand a 2-D array into polynomial combinations of its columns.

    Adapted from sklearn's implementation. Numeric matrices get products of
    the selected columns; lexical (string) matrices get the selected entries
    joined into a label.
    """

    def __init__(self, degree=2, interaction_only=False, include_bias=True):
        self.degree_ = degree
        self.interaction_only_ = interaction_only
        self.include_bias_ = include_bias
        # Both counts are filled in by fit().
        self.n_input_features_ = None
        self.n_output_features_ = None

    # Disabled upstream, retained for reference:
    # @property
    # def powers_(self):
    #     combinations = self._combinations(self.n_input_features_, self.degree_,
    #                                       self.interaction_only_,
    #                                       self.include_bias_)
    #     return np.vstack(np.bincount(c, minlength=self.n_input_features_)
    #                      for c in combinations)

    @staticmethod
    def _combinations(n_features, degree, interaction_only, include_bias):
        """Lazily yield the column-index tuples of every output feature."""
        if interaction_only:
            picker = combinations
        else:
            picker = combinations_with_replacement
        # Degree 0 (the empty tuple) is the bias column; skip it on request.
        lowest_degree = int(not include_bias)
        return chain.from_iterable(
            picker(list(range(n_features)), order)
            for order in range(lowest_degree, degree + 1))

    def fit(self, matrix):
        """Record the input width and the number of output features."""
        assert matrix.ndim == 2
        assert matrix.size > 0
        _, width = matrix.shape
        index_tuples = self._combinations(width, self.degree_,
                                          self.interaction_only_,
                                          self.include_bias_)
        self.n_input_features_ = matrix.shape[1]
        self.n_output_features_ = len(list(index_tuples))
        return self

    def transform(self, matrix, copy=True):
        """Transform data to polynomial features

        Parameters
        ----------
        matrix : array-like, shape [n_samples, n_features]
            The data to transform, row by row.

        Returns
        -------
        poly_matrix : np.ndarray shape [n_samples, NP]
            The matrix of features, where NP is the number of polynomial
            features generated from the combination of inputs.

        Raises
        ------
        ValueError
            If the column count differs from the one seen by ``fit``.
        TypeError
            If the matrix is neither numeric nor lexical.
        """
        assert matrix.ndim == 2
        assert matrix.size > 0
        n_samples, n_features = matrix.shape
        if n_features != self.n_input_features_:
            raise ValueError("X shape does not match training shape")

        numeric = is_numeric_matrix(matrix)
        lexical = is_lexical_matrix(matrix)
        if lexical:
            # Size the byte-string dtype to twice the longest entry plus one.
            longest = max(len(entry) for entry in matrix.reshape((matrix.size,)))
            out_dtype = "S{}".format(longest * 2 + 1)
        else:
            out_dtype = matrix.dtype

        # allocate output data
        poly_matrix = np.empty((n_samples, self.n_output_features_), dtype=out_dtype)
        index_tuples = self._combinations(n_features, self.degree_,
                                          self.interaction_only_,
                                          self.include_bias_)
        for out_col, feature_ids in enumerate(index_tuples):
            if numeric:
                poly_matrix[:, out_col] = matrix[:, feature_ids].prod(1)
            elif lexical:
                # Columns beyond the bias/degree-1 block get a "*" separator.
                n_poly1_feats = n_features + int(self.include_bias_)
                glue = "*" if out_col >= n_poly1_feats else ""
                poly_matrix[:, out_col] = glue.join(
                    np.squeeze(matrix[:, feature_ids]).tolist())
            else:
                raise TypeError("Unsupported matrix type {}".format(matrix.dtype))
        return poly_matrix

    def inverse_transform(self, matrix, copy=True):
        """The expansion cannot be undone; always raises NotImplementedError."""
        raise NotImplementedError("This method is not supported")
# ==========================================================
# Dummy Encoding
# ==========================================================
class DummyEncoder(Preprocess):
    """One-hot ("dummy") encode the categorical columns of a matrix.

    Wraps sklearn's ``OneHotEncoder`` and tracks the column labels that the
    encoded matrix should carry.

    NOTE(review): the ``n_values`` / ``categorical_features`` constructor
    arguments and the ``feature_indices_`` attribute belong to the legacy
    ``OneHotEncoder`` API; confirm the pinned sklearn version still provides
    them.
    """

    def __init__(self, n_values, categorical_features, cat_columnlabels, noncat_columnlabels):
        """Validate the arguments and build the underlying encoder.

        ``n_values`` and ``categorical_features`` are converted to arrays and
        must have the same shape. Raises ``Exception`` when any categorical
        feature has fewer than three labels.
        """
        from sklearn.preprocessing import OneHotEncoder

        if not isinstance(n_values, np.ndarray):
            n_values = np.array(n_values)
        if not isinstance(categorical_features, np.ndarray):
            categorical_features = np.array(categorical_features)
        # assert categorical_features.size > 0
        assert categorical_features.shape == n_values.shape
        for nv in n_values:
            if nv <= 2:
                raise Exception("Categorical features must have 3+ labels")

        self.n_values = n_values
        self.cat_columnlabels = cat_columnlabels
        self.noncat_columnlabels = noncat_columnlabels
        self.encoder = OneHotEncoder(
            n_values=n_values, categorical_features=categorical_features, sparse=False)
        self.new_labels = None
        # Column positions of the categorical features in the un-encoded matrix.
        self.cat_idxs_old = categorical_features

    def fit(self, matrix):
        """Fit the encoder and derive the encoded column labels.

        Returns ``self`` so calls can be chained, as the other ``fit``
        methods in this module do.
        """
        self.encoder.fit(matrix)
        # determine new columnlabels
        # categorical variables are done in order specified by categorical_features
        new_labels = []
        for i, cat_label in enumerate(self.cat_columnlabels):
            low = self.encoder.feature_indices_[i]
            high = self.encoder.feature_indices_[i + 1]
            for j in range(low, high):
                # eg the categorical variable named cat_var with 5 possible values
                # turns into 0/1 variables named cat_var____0, ..., cat_var____4
                new_labels.append(cat_label + "____" + str(j - low))
        # according to sklearn documentation,
        # "non-categorical features are always stacked to the right of the matrix"
        # by observation, it looks like the non-categorical features' relative order is preserved
        # BUT: there is no guarantee made about that behavior!
        # We either trust OneHotEncoder to be sensible, or look for some other way
        new_labels += self.noncat_columnlabels
        self.new_labels = new_labels
        return self

    def transform(self, matrix, copy=True):
        """Return the one-hot encoded matrix (``copy`` is not used)."""
        # actually transform the matrix
        matrix_encoded = self.encoder.transform(matrix)
        return matrix_encoded

    def fit_transform(self, matrix, copy=True):
        """Fit on ``matrix`` and return its encoded form."""
        self.fit(matrix)
        return self.transform(matrix)

    def inverse_transform(self, matrix, copy=True):
        """Collapse a dummy-encoded matrix back to one column per feature.

        Each categorical feature becomes the index of the dummy column that
        equals 1; non-categorical columns are read from the right-hand part
        of the encoded matrix in order.
        """
        n_values = self.n_values
        # If there are no categorical variables, no transformation happened.
        if len(n_values) == 0:
            return matrix

        # Otherwise, this is a dummy-encoded matrix. Transform it back to original form.
        n_features = matrix.shape[-1] - self.encoder.feature_indices_[-1] + len(n_values)
        noncat_start_idx = self.encoder.feature_indices_[-1]
        inverted_matrix = np.empty((matrix.shape[0], n_features))
        cat_idx = 0
        noncat_idx = 0
        for i in range(n_features):
            if i in self.cat_idxs_old:
                # Rows where no dummy column equals 1 keep this initial value.
                new_col = np.ones((matrix.shape[0],))
                start_idx = self.encoder.feature_indices_[cat_idx]
                for j in range(n_values[cat_idx]):
                    col = matrix[:, start_idx + j]
                    new_col[col == 1] = j
                cat_idx += 1
            else:
                new_col = np.array(matrix[:, noncat_start_idx + noncat_idx])
                noncat_idx += 1
            inverted_matrix[:, i] = new_col
        return inverted_matrix

    def total_dummies(self):
        """Return the total number of dummy columns (sum of ``n_values``)."""
        return sum(self.n_values)
def consolidate_columnlabels(columnlabels):
    """Collapse dummy-encoded labels back to their original column names.

    Labels of the form ``<name>____<digits>`` are replaced by ``<name>``,
    emitted once at the position of its first occurrence. All other labels
    are passed through unchanged, duplicates included.
    """
    import re

    # Recognises labels produced by the dummy encoder; group 1 is the
    # original column name.
    # NOTE(review): the pattern is not anchored at the end, so a label with
    # text after the digits is also treated as a dummy label.
    dummy_label = re.compile(r'(.*)____\d+')
    merged = []
    emitted = set()  # original names already written to ``merged``
    for label in columnlabels:
        hit = dummy_label.match(label)
        if hit is None:
            # non-categorical variable
            merged.append(label)
            continue
        base = hit.group(1)
        if base in emitted:
            continue
        emitted.add(base)
        merged.append(base)
    return merged
def fix_scaler(scaler, encoder, params):
    """Overwrite a scaler's statistics for the categorical columns, in place.

    ``params`` is walked in order. Each non-categorical parameter occupies
    one column and is left alone. Each categorical parameter occupies one
    column if it is boolean, otherwise ``encoder.n_values[k]`` columns for
    the k-th non-boolean categorical parameter. Those columns get mean
    ``width * 0.5`` and variance ``mean * 0.5``. Finally ``scaler.scale_``
    is recomputed as the square root of the variances.

    NOTE(review): reads ``encoder.xform_start_indices``, which the
    ``DummyEncoder`` class in this file does not define; confirm which
    encoder type callers pass.
    """
    prob = 0.5
    means = scaler.mean_
    variances = scaler.var_
    widths = encoder.n_values
    starts = encoder.xform_start_indices
    column = 0
    cats_seen = 0
    for param in params:
        if not param.iscategorical:
            column += 1
            continue
        if param.isboolean:
            width = 1
        else:
            # The encoder's recorded start must agree with our running column.
            assert starts[cats_seen] == column
            width = widths[cats_seen]
            cats_seen += 1
        span = slice(column, column + width)
        span_mean = width * prob
        means[span] = span_mean
        variances[span] = span_mean * (1 - prob)
        column += width
    scaler.mean_ = means
    scaler.var_ = variances
    scaler.scale_ = np.sqrt(variances)
def get_min_max(params, encoder=None):
    """Build per-column minimum and maximum arrays for a list of parameters.

    ``params`` is walked in order. A non-categorical parameter contributes
    one column bounded by ``param.true_range``. A categorical parameter
    contributes columns bounded by 0 and 1: one column if it is boolean,
    otherwise ``encoder.n_values[k]`` columns for the k-th non-boolean
    categorical parameter.

    Parameters
    ----------
    params : sequence
        Objects exposing ``iscategorical``, ``isboolean`` and ``true_range``.
    encoder : optional
        Object exposing ``n_values`` and ``xform_start_indices``. Required
        when ``params`` contains a non-boolean categorical parameter.
        NOTE(review): the ``DummyEncoder`` class in this file does not
        define ``xform_start_indices``; confirm which encoder callers pass.

    Returns
    -------
    (mins, maxs) : tuple of np.ndarray
    """
    if encoder is not None:
        n_values = encoder.n_values
        cat_start_idxs = encoder.xform_start_indices
        num_cat_feats = n_values.size
        # An empty ``n_values`` array is float-typed, which makes np.sum
        # return a float; coerce so the count is usable as an array shape.
        nfeats = int(len(params) - num_cat_feats + np.sum(n_values))
    else:
        nfeats = len(params)
        n_values = np.array([])
        cat_start_idxs = np.array([])

    mins = np.empty((nfeats,))
    maxs = np.empty((nfeats,))
    current_idx = 0
    cat_idx = 0
    for param in params:
        if param.iscategorical:
            if param.isboolean:
                nvals = 1
            else:
                # The encoder's recorded start must agree with our position.
                assert cat_start_idxs[cat_idx] == current_idx
                nvals = int(n_values[cat_idx])
                cat_idx += 1
            mins[current_idx: current_idx + nvals] = 0
            maxs[current_idx: current_idx + nvals] = 1
            current_idx += nvals
        else:
            mins[current_idx] = param.true_range[0]  # valid_values[0]
            maxs[current_idx] = param.true_range[1]  # valid_values[-1]
            current_idx += 1
    return mins, maxs
# ==========================================================
# Min-max scaler
# ==========================================================
class MinMaxScaler(Preprocess):
    """Min-max scaler backed by sklearn's ``MinMaxScaler``.

    Bounds may be supplied up front through ``mins`` / ``maxs`` and/or
    learned from data with ``fit``. All fitting goes through
    ``partial_fit``, so successive calls accumulate into the same scaler.
    """

    def __init__(self, mins=None, maxs=None):
        """Create the scaler, optionally seeding it with known bounds.

        The instance only counts as fitted when both ``mins`` and ``maxs``
        are given; each must be an ``np.ndarray``.
        """
        self.scaler_ = SklearnMinMaxScaler()
        if mins is not None:
            assert isinstance(mins, np.ndarray)
            mins = self._as_2d(mins)
            self.scaler_.partial_fit(mins)
            self.mins_ = mins
        else:
            self.mins_ = None
        if maxs is not None:
            assert isinstance(maxs, np.ndarray)
            maxs = self._as_2d(maxs)
            self.scaler_.partial_fit(maxs)
            self.maxs_ = maxs
        else:
            self.maxs_ = None
        self.fitted_ = self.mins_ is not None and self.maxs_ is not None

    @staticmethod
    def _as_2d(matrix):
        """Return a 1-D array reshaped to a single row; pass others through."""
        if matrix.ndim == 1:
            return matrix.reshape(1, -1)
        return matrix

    def fit(self, matrix):
        """Fold ``matrix`` into the scaler's running bounds; return self."""
        self.scaler_.partial_fit(self._as_2d(matrix))
        self.mins_ = self.scaler_.data_min_
        self.maxs_ = self.scaler_.data_max_
        self.fitted_ = True
        return self

    def transform(self, matrix, copy=True):
        """Scale ``matrix``; raises ``Exception`` if the model is not fitted."""
        if not self.fitted_:
            raise Exception("Model not fitted!")
        return self.scaler_.transform(self._as_2d(matrix))

    def inverse_transform(self, matrix, copy=True):
        """Undo the scaling; raises ``Exception`` if the model is not fitted.

        The guard mirrors ``transform``: a scaler seeded with only one of
        ``mins`` / ``maxs`` must not be used to map values back.
        """
        if not self.fitted_:
            raise Exception("Model not fitted!")
        return self.scaler_.inverse_transform(self._as_2d(matrix))