# ottertune/server/analysis/preprocessing.py

#
# OtterTune - preprocessing.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
from abc import ABCMeta, abstractmethod

from itertools import chain, combinations, combinations_with_replacement
import numpy as np
from sklearn.preprocessing import MinMaxScaler as SklearnMinMaxScaler

from .util import is_numeric_matrix, is_lexical_matrix


# ==========================================================
#  Preprocessing Base Class
# ==========================================================
class Preprocess(object, metaclass=ABCMeta):
    """Abstract interface implemented by every preprocessing step.

    Subclasses must provide ``fit``, ``transform`` and
    ``inverse_transform``. ``fit_transform`` is supplied here and simply
    chains ``fit`` and ``transform``.
    """

    @abstractmethod
    def fit(self, matrix):
        """Learn whatever state the step needs from ``matrix``."""

    @abstractmethod
    def transform(self, matrix, copy=True):
        """Apply the step to ``matrix`` and return the result."""

    def fit_transform(self, matrix, copy=True):
        """Fit on ``matrix`` and return its transformed version.

        ``copy`` is forwarded unchanged to ``transform`` so that a caller
        asking for ``copy=False`` actually gets it.
        """
        self.fit(matrix)
        return self.transform(matrix, copy=copy)

    @abstractmethod
    def inverse_transform(self, matrix, copy=True):
        """Undo ``transform`` on ``matrix`` and return the result."""


# ==========================================================
#   Bin by Deciles
# ==========================================================
class Bin(Preprocess):
    """Replace values by the label of the decile they fall into.

    ``axis=None`` computes one set of deciles over the whole matrix,
    ``axis=0`` one set per column and ``axis=1`` one set per row. Any
    other axis raises ``NotImplementedError``. ``bin_start`` is handed to
    ``bin_by_decile`` unchanged.
    """

    def __init__(self, bin_start, axis=None):
        if axis not in (None, 0, 1):
            raise NotImplementedError("Axis={} is not yet implemented".format(axis))
        self.axis_ = axis
        self.bin_start_ = bin_start
        self.deciles_ = None  # filled in by fit()

    def fit(self, matrix):
        """Compute and store the decile boundaries of ``matrix``; returns self."""
        if self.axis_ is None:
            self.deciles_ = get_deciles(matrix, self.axis_)
        elif self.axis_ == 0:
            # One boundary vector per column.
            self.deciles_ = [get_deciles(column, axis=None)
                             for column in matrix.T]
        elif self.axis_ == 1:
            # One boundary vector per row.
            self.deciles_ = [get_deciles(line, axis=None)
                             for line in matrix]
        return self

    def transform(self, matrix, copy=True):
        """Return a new array of bin labels with the same shape as ``matrix``.

        ``copy`` is accepted for interface compatibility and is not used.
        """
        assert self.deciles_ is not None
        if self.axis_ is None:
            res = bin_by_decile(matrix, self.deciles_,
                                self.bin_start_, self.axis_)
        elif self.axis_ == 0:
            binned_columns = [
                bin_by_decile(column, bounds, self.bin_start_, axis=None)
                for column, bounds in zip(matrix.T, self.deciles_)
            ]
            # Columns were stacked as rows, so flip back.
            res = np.vstack(binned_columns).T
        elif self.axis_ == 1:
            binned_rows = [
                bin_by_decile(line, bounds, self.bin_start_, axis=None)
                for line, bounds in zip(matrix, self.deciles_)
            ]
            res = np.vstack(binned_rows)
        assert res.shape == matrix.shape
        return res

    def inverse_transform(self, matrix, copy=True):
        """Binning cannot be undone; always raises ``NotImplementedError``."""
        raise NotImplementedError("This method is not supported")


def get_deciles(matrix, axis=None):
    """Return the ten decile boundaries of ``matrix``.

    Parameters
    ----------
    matrix : np.ndarray
        Non-empty array with at least one dimension.
    axis : None
        Only ``None`` (flattened input) is supported.

    Returns
    -------
    np.ndarray
        The 10th, 20th, ..., 90th percentiles followed by ``+inf`` in
        place of the 100th percentile.

    Raises
    ------
    NotImplementedError
        If ``axis`` is not ``None``.
    """
    if axis is not None:
        raise NotImplementedError("Axis is not yet implemented")

    assert matrix.ndim > 0
    assert matrix.size > 0

    decile_range = np.arange(10, 101, 10)
    deciles = np.percentile(matrix, decile_range, axis=axis)
    # The last boundary is made unbounded. ``np.inf`` is used because the
    # ``np.Inf`` alias no longer exists in NumPy 2.0.
    deciles[-1] = np.inf
    return deciles


def bin_by_decile(matrix, deciles, bin_start, axis=None):
    """Label every entry of ``matrix`` with the decile it belongs to.

    An entry receives ``k + bin_start`` where ``k`` is the smallest index
    with ``entry <= deciles[k]``. Entries that compare ``<=`` to none of
    the ten boundaries keep the initial value 0. The result has the same
    shape and dtype as ``matrix``.

    Raises ``NotImplementedError`` when ``axis`` is not ``None``.
    """
    if axis is not None:
        raise NotImplementedError("Axis is not yet implemented")

    assert matrix.ndim > 0
    assert matrix.size > 0
    assert deciles is not None
    assert len(deciles) == 10

    labels = np.zeros_like(matrix)
    # Walk from the highest boundary down so lower bins overwrite higher
    # ones and each entry ends up with its smallest matching decile.
    for rank in reversed(range(10)):
        labels[matrix <= deciles[rank]] = rank + bin_start

    return labels


# ==========================================================
#   Shuffle Indices
# ==========================================================
class Shuffler(Preprocess):
    """Permute the rows and/or columns of a labelled matrix.

    The matrix objects passed in must expose ``data``, ``rowlabels``,
    ``columnlabels`` and a ``copy()`` method, since those are the
    attributes this class reads and reassigns.

    Parameters
    ----------
    shuffle_rows, shuffle_columns : bool
        Which axes to permute.
    row_indices, column_indices : array-like or None
        Pre-computed permutations. When ``None`` they are drawn in
        ``fit`` for the axes that are enabled.
    seed : int
        Passed to ``np.random.seed``. Note that this reseeds NumPy's
        global random state as a side effect of construction.
    """

    def __init__(self, shuffle_rows=True, shuffle_columns=False,
                 row_indices=None, column_indices=None, seed=0):
        self.shuffle_rows_ = shuffle_rows
        self.shuffle_columns_ = shuffle_columns
        self.row_indices_ = row_indices
        self.column_indices_ = column_indices
        np.random.seed(seed)
        self.fitted_ = False

    def fit(self, matrix):
        """Draw any missing permutation from the shape of ``matrix.data``.

        Returns ``self`` so calls can be chained, like the other
        preprocessing steps in this module.
        """
        if self.shuffle_rows_ and self.row_indices_ is None:
            self.row_indices_ = get_shuffle_indices(matrix.data.shape[0])
        if self.shuffle_columns_ and self.column_indices_ is None:
            self.column_indices_ = get_shuffle_indices(matrix.data.shape[1])
        self.fitted_ = True
        return self

    def transform(self, matrix, copy=True):
        """Apply the stored permutation(s) to data and labels.

        With ``copy=True`` the work is done on ``matrix.copy()``;
        otherwise ``matrix`` itself is modified. Raises ``Exception`` if
        ``fit`` has not been called.
        """
        if not self.fitted_:
            raise Exception("The fit() function must be called before transform()")
        if copy:
            matrix = matrix.copy()

        if self.shuffle_rows_:
            matrix.data = matrix.data[self.row_indices_]
            matrix.rowlabels = matrix.rowlabels[self.row_indices_]
        if self.shuffle_columns_:
            matrix.data = matrix.data[:, self.column_indices_]
            matrix.columnlabels = matrix.columnlabels[self.column_indices_]
        return matrix

    def inverse_transform(self, matrix, copy=True):
        """Restore the original row/column order of a shuffled matrix.

        Raises ``Exception`` when a permutation that is needed has never
        been set, instead of handing ``None`` to ``np.argsort``.
        """
        if self.shuffle_rows_ and self.row_indices_ is None:
            raise Exception(
                "The fit() function must be called before inverse_transform()")
        if self.shuffle_columns_ and self.column_indices_ is None:
            raise Exception(
                "The fit() function must be called before inverse_transform()")
        if copy:
            matrix = matrix.copy()

        if self.shuffle_rows_:
            # argsort of a permutation is its inverse permutation.
            inverse_row_indices = np.argsort(self.row_indices_)
            matrix.data = matrix.data[inverse_row_indices]
            matrix.rowlabels = matrix.rowlabels[inverse_row_indices]
        if self.shuffle_columns_:
            inverse_column_indices = np.argsort(self.column_indices_)
            matrix.data = matrix.data[:, inverse_column_indices]
            matrix.columnlabels = matrix.columnlabels[inverse_column_indices]
        return matrix


def get_shuffle_indices(size, seed=None):
    """Draw random permutation(s) of ``range(size)``.

    Parameters
    ----------
    size : int or iterable of int
        A single length, or several lengths (for example a shape tuple).
    seed : int or None
        When given, NumPy's global random state is reseeded with it
        before drawing.

    Returns
    -------
    np.ndarray or list of np.ndarray
        One permutation for an ``int`` size, otherwise a list holding one
        permutation per entry of ``size``, drawn in order.
    """
    if seed is not None:
        assert isinstance(seed, int)
        np.random.seed(seed)

    if not isinstance(size, int):
        return [np.random.choice(extent, extent, replace=False)
                for extent in size]
    return np.random.choice(size, size, replace=False)


# ==========================================================
#   Polynomial Features
# ==========================================================
class PolynomialFeatures(Preprocess):
    """Expand the columns of a 2-D array into polynomial terms.

    Copied and modified from scikit-learn's implementation. Besides
    numeric input, matrices accepted by ``is_lexical_matrix`` are
    supported: for those, the entries of the participating columns are
    joined into a single string per output column.
    """

    def __init__(self, degree=2, interaction_only=False, include_bias=True):
        self.n_input_features_ = None   # set by fit()
        self.n_output_features_ = None  # set by fit()
        self.degree_ = degree
        self.interaction_only_ = interaction_only
        self.include_bias_ = include_bias

    # Disabled ``powers_`` property, kept for reference:
#     @property
#     def powers_(self):
#         combinations = self._combinations(self.n_input_features_, self.degree_,
#                                           self.interaction_only_,
#                                           self.include_bias_)
#         return np.vstack(np.bincount(c, minlength=self.n_input_features_)
#                          for c in combinations)

    @staticmethod
    def _combinations(n_features, degree, interaction_only, include_bias):
        """Lazily yield the column-index tuple of every output term.

        Terms are ordered by increasing degree; the empty tuple (bias
        term) comes first when ``include_bias`` is true.
        """
        if interaction_only:
            pick = combinations
        else:
            pick = combinations_with_replacement
        lowest_degree = int(not include_bias)
        return chain.from_iterable(pick(list(range(n_features)), order)
                                   for order in range(lowest_degree, degree + 1))

    def fit(self, matrix):
        """Record the input width and count the output terms; returns self."""
        assert matrix.ndim == 2
        assert matrix.size > 0

        _n_rows, n_columns = matrix.shape
        terms = self._combinations(n_columns, self.degree_,
                                   self.interaction_only_,
                                   self.include_bias_)
        self.n_input_features_ = matrix.shape[1]
        # The iterator is lazy, so counting means exhausting it.
        self.n_output_features_ = sum(1 for _ in terms)
        return self

    def transform(self, matrix, copy=True):
        """Build the polynomial feature matrix for ``matrix``.

        Parameters
        ----------
        matrix : np.ndarray, shape [n_samples, n_features]
            Input data; ``n_features`` must equal the width seen in
            ``fit``.
        copy : bool
            Accepted for interface compatibility; not used.

        Returns
        -------
        np.ndarray, shape [n_samples, n_output_features_]
            For numeric input each column is the row-wise product of the
            selected input columns. For lexical input each column is
            filled with one string: the selected entries joined with
            ``"*"`` once the term index is past the bias/first-degree
            terms, and joined with ``""`` before that.
            NOTE(review): the lexical path assigns the same string to
            every row, so it presumably expects a single-row matrix of
            labels - confirm with callers.

        Raises
        ------
        ValueError
            If the number of columns differs from the fitted width.
        TypeError
            If the matrix is neither numeric nor lexical.
        """
        assert matrix.ndim == 2
        assert matrix.size > 0

        n_samples, n_features = matrix.shape

        if n_features != self.n_input_features_:
            raise ValueError("X shape does not match training shape")

        numeric_input = is_numeric_matrix(matrix)
        lexical_input = is_lexical_matrix(matrix)
        if lexical_input:
            # Fixed-width byte strings sized for two entries plus one
            # separator character.
            flat_entries = matrix.reshape((matrix.size,))
            longest = max([len(entry) for entry in flat_entries])
            out_dtype = "S{}".format(longest * 2 + 1)
        else:
            out_dtype = matrix.dtype

        poly_matrix = np.empty((n_samples, self.n_output_features_),
                               dtype=out_dtype)

        terms = self._combinations(n_features, self.degree_,
                                   self.interaction_only_,
                                   self.include_bias_)
        for position, columns in enumerate(terms):
            if numeric_input:
                poly_matrix[:, position] = matrix[:, columns].prod(1)
            elif lexical_input:
                # Number of bias + first-degree terms at the front.
                n_poly1_feats = n_features + int(self.include_bias_)
                glue = "*" if position >= n_poly1_feats else ""
                pieces = np.squeeze(matrix[:, columns]).tolist()
                poly_matrix[:, position] = glue.join(pieces)
            else:
                raise TypeError("Unsupported matrix type {}".format(matrix.dtype))

        return poly_matrix

    def inverse_transform(self, matrix, copy=True):
        """The expansion cannot be undone; always raises ``NotImplementedError``."""
        raise NotImplementedError("This method is not supported")


# ==========================================================
#   Dummy Encoding
# ==========================================================
class DummyEncoder(Preprocess):
    """One-hot ("dummy") encode selected categorical columns.

    Thin wrapper around scikit-learn's ``OneHotEncoder`` that also keeps
    track of the column labels of the encoded matrix.

    Parameters
    ----------
    n_values : array-like of int
        Number of distinct labels of each categorical feature; every
        entry must be at least 3.
    categorical_features : array-like of int
        Column indices of the categorical features; same shape as
        ``n_values``.
    cat_columnlabels : sequence of str
        Labels of the categorical columns, in the order of
        ``categorical_features``.
    noncat_columnlabels : list of str
        Labels of the remaining columns.

    NOTE(review): the ``n_values`` / ``categorical_features`` keyword
    arguments of ``OneHotEncoder`` only exist in older scikit-learn
    releases - confirm the pinned version before upgrading.
    """

    def __init__(self, n_values, categorical_features, cat_columnlabels, noncat_columnlabels):
        from sklearn.preprocessing import OneHotEncoder

        if not isinstance(n_values, np.ndarray):
            n_values = np.array(n_values)
        if not isinstance(categorical_features, np.ndarray):
            categorical_features = np.array(categorical_features)
        # assert categorical_features.size > 0
        assert categorical_features.shape == n_values.shape
        for nv in n_values:
            if nv <= 2:
                raise Exception("Categorical features must have 3+ labels")

        self.n_values = n_values
        self.cat_columnlabels = cat_columnlabels
        self.noncat_columnlabels = noncat_columnlabels
        self.encoder = OneHotEncoder(
            n_values=n_values, categorical_features=categorical_features, sparse=False)
        self.new_labels = None  # column labels after encoding, set by fit()
        self.cat_idxs_old = categorical_features

    def fit(self, matrix):
        """Fit the wrapped encoder and derive the encoded column labels.

        Returns ``self`` so calls can be chained, like the other
        preprocessing steps in this module.
        """
        self.encoder.fit(matrix)
        # determine new columnlabels
        # categorical variables are done in order specified by categorical_features
        new_labels = []
        for i, cat_label in enumerate(self.cat_columnlabels):
            low = self.encoder.feature_indices_[i]
            high = self.encoder.feature_indices_[i + 1]
            for j in range(low, high):
                # eg the categorical variable named cat_var with 5 possible values
                # turns into 0/1 variables named cat_var____0, ..., cat_var____4
                new_labels.append(cat_label + "____" + str(j - low))
        # according to sklearn documentation,
        # "non-categorical features are always stacked to the right of the matrix"
        # by observation, it looks like the non-categorical features' relative order is preserved
        # BUT: there is no guarantee made about that behavior!
        # We either trust OneHotEncoder to be sensible, or look for some other way
        new_labels += self.noncat_columnlabels
        self.new_labels = new_labels
        return self

    def transform(self, matrix, copy=True):
        """Return the one-hot encoded matrix; ``copy`` is not used."""
        # actually transform the matrix
        matrix_encoded = self.encoder.transform(matrix)
        return matrix_encoded

    def fit_transform(self, matrix, copy=True):
        """Fit on ``matrix`` and return its encoded form; ``copy`` is not used."""
        self.fit(matrix)
        return self.transform(matrix)

    def inverse_transform(self, matrix, copy=True):
        """Collapse an encoded matrix back to one column per feature.

        For each categorical feature the restored value is the index of
        the last dummy column equal to 1; rows in which no dummy column
        equals 1 keep the initial value 1. When there are no categorical
        features the input is returned unchanged.
        """
        n_values = self.n_values
        # If there are no categorical variables, no transformation happened.
        if len(n_values) == 0:
            return matrix

        # Otherwise, this is a dummy-encoded matrix. Transform it back to original form.
        # Original width = encoded width - all dummy columns + one column
        # per categorical feature.
        n_features = matrix.shape[-1] - self.encoder.feature_indices_[-1] + len(n_values)
        # Non-categorical columns start right after the last dummy column.
        noncat_start_idx = self.encoder.feature_indices_[-1]
        inverted_matrix = np.empty((matrix.shape[0], n_features))
        cat_idx = 0
        noncat_idx = 0
        for i in range(n_features):
            if i in self.cat_idxs_old:
                new_col = np.ones((matrix.shape[0],))
                start_idx = self.encoder.feature_indices_[cat_idx]
                for j in range(n_values[cat_idx]):
                    col = matrix[:, start_idx + j]
                    new_col[col == 1] = j
                cat_idx += 1
            else:
                new_col = np.array(matrix[:, noncat_start_idx + noncat_idx])
                noncat_idx += 1
            inverted_matrix[:, i] = new_col
        return inverted_matrix

    def total_dummies(self):
        """Return the total number of dummy columns (sum of ``n_values``)."""
        return sum(self.n_values)


def consolidate_columnlabels(columnlabels):
    """Collapse dummy-encoded labels back to their original names.

    A label that starts with ``<name>____<digits>`` is treated as a dummy
    column of ``<name>``; each such name is emitted once, at the position
    of its first dummy column. All other labels are passed through as-is
    (including repeats). Order is preserved.
    """
    import re
    # Shape of a label produced by the dummy encoder.
    dummy_label = re.compile(r'(.*)____\d+')

    result = []
    emitted = set()  # categorical names already written to ``result``
    for label in columnlabels:
        hit = dummy_label.match(label)
        if not hit:
            # non-categorical variable
            result.append(label)
            continue
        original_name = hit.group(1)
        if original_name in emitted:
            continue
        emitted.add(original_name)
        result.append(original_name)
    return result


def fix_scaler(scaler, encoder, params):
    """Overwrite a scaler's statistics for categorical columns, in place.

    ``params`` is walked in order while tracking the current column
    position. A non-categorical parameter occupies one column and is left
    alone. A boolean parameter occupies one column; any other categorical
    parameter occupies ``encoder.n_values[k]`` columns. For those columns
    ``mean_`` is set to ``n * 0.5`` and ``var_`` to ``n * 0.5 * 0.5``,
    where ``n`` is the number of columns occupied. Finally ``scale_`` is
    recomputed as ``sqrt(var_)``. Returns ``None``.

    NOTE(review): reads ``encoder.xform_start_indices``, which the
    ``DummyEncoder`` visible in this module does not define - confirm
    which encoder type callers pass.
    """
    prob = 0.5
    means = scaler.mean_
    variances = scaler.var_
    level_counts = encoder.n_values
    cat_starts = encoder.xform_start_indices

    position = 0   # current column in the transformed layout
    next_cat = 0   # index of the next non-boolean categorical
    for param in params:
        if not param.iscategorical:
            position += 1
            continue

        if param.isboolean:
            width = 1
        else:
            # The encoder's layout must agree with our running position.
            assert cat_starts[next_cat] == position
            width = level_counts[next_cat]
            next_cat += 1

        block_mean = width * prob
        block_var = block_mean * (1 - prob)
        means[position: position + width] = block_mean
        variances[position: position + width] = block_var
        position += width

    scaler.mean_ = means
    scaler.var_ = variances
    scaler.scale_ = np.sqrt(variances)


def get_min_max(params, encoder=None):
    """Build per-column lower and upper bounds for a list of parameters.

    ``params`` is walked in order. A non-categorical parameter occupies
    one column bounded by ``param.true_range[0]`` and
    ``param.true_range[1]``. A boolean parameter occupies one column and
    any other categorical parameter occupies ``encoder.n_values[k]``
    columns; those columns are bounded by 0 and 1.

    Parameters
    ----------
    params : sequence
        Objects exposing ``iscategorical``, ``isboolean`` and, for
        non-categorical ones, ``true_range``.
    encoder : object or None
        Supplies ``n_values`` and ``xform_start_indices``. With ``None``
        no parameter may be a non-boolean categorical.
        NOTE(review): the ``DummyEncoder`` visible in this module does
        not define ``xform_start_indices`` - confirm which encoder type
        callers pass.

    Returns
    -------
    (np.ndarray, np.ndarray)
        The ``mins`` and ``maxs`` vectors, one entry per column.
    """
    if encoder is not None:
        n_values = encoder.n_values
        cat_start_idxs = encoder.xform_start_indices
        num_cat_feats = n_values.size
        # int(): when there are no categorical features ``n_values`` is an
        # empty float array whose sum is 0.0, and a float is not a valid
        # array dimension for np.empty.
        nfeats = len(params) - num_cat_feats + int(np.sum(n_values))
    else:
        n_values = np.array([])
        cat_start_idxs = np.array([])
        nfeats = len(params)

    mins = np.empty((nfeats,))
    maxs = np.empty((nfeats,))
    current_idx = 0  # current column in the transformed layout
    cat_idx = 0      # index of the next non-boolean categorical
    for param in params:
        if param.iscategorical:
            if param.isboolean:
                nvals = 1
            else:
                # The encoder's layout must agree with our running position.
                assert cat_start_idxs[cat_idx] == current_idx
                nvals = int(n_values[cat_idx])
                cat_idx += 1
            mins[current_idx: current_idx + nvals] = 0
            maxs[current_idx: current_idx + nvals] = 1
            current_idx += nvals
        else:
            mins[current_idx] = param.true_range[0]  # valid_values[0]
            maxs[current_idx] = param.true_range[1]  # valid_values[-1]
            current_idx += 1
    return mins, maxs


# ==========================================================
#   Min-max scaler
# ==========================================================
class MinMaxScaler(Preprocess):
    """Min-max scaling backed by scikit-learn's ``MinMaxScaler``.

    Optional ``mins`` / ``maxs`` arrays can be supplied up front; each one
    given is fed to the wrapped scaler through ``partial_fit``. The
    instance counts as fitted once both were supplied or ``fit`` has been
    called. One-dimensional inputs are treated as a single row everywhere.
    """

    def __init__(self, mins=None, maxs=None):
        self.scaler_ = SklearnMinMaxScaler()
        self.mins_ = self._register_bound(mins)
        self.maxs_ = self._register_bound(maxs)
        self.fitted_ = not (self.mins_ is None or self.maxs_ is None)

    def _register_bound(self, bound):
        """Feed one optional bound vector to the scaler; return it as 2-D."""
        if bound is None:
            return None
        assert isinstance(bound, np.ndarray)
        bound = self._as_rows(bound)
        self.scaler_.partial_fit(bound)
        return bound

    @staticmethod
    def _as_rows(matrix):
        """Reshape a 1-D array to a single row; leave anything else alone."""
        if matrix.ndim == 1:
            return matrix.reshape(1, -1)
        return matrix

    def fit(self, matrix):
        """Update the scaler with ``matrix`` via ``partial_fit``; returns self.

        ``mins_`` and ``maxs_`` are refreshed from the scaler's
        ``data_min_`` and ``data_max_`` afterwards.
        """
        self.scaler_.partial_fit(self._as_rows(matrix))
        self.mins_ = self.scaler_.data_min_
        self.maxs_ = self.scaler_.data_max_
        self.fitted_ = True
        return self

    def transform(self, matrix, copy=True):
        """Scale ``matrix``; raises ``Exception`` if not fitted. ``copy`` is unused."""
        if not self.fitted_:
            raise Exception("Model not fitted!")
        return self.scaler_.transform(self._as_rows(matrix))

    def inverse_transform(self, matrix, copy=True):
        """Map scaled values back to the original range. ``copy`` is unused."""
        return self.scaler_.inverse_transform(self._as_rows(matrix))