117 lines
5.1 KiB
Python
117 lines
5.1 KiB
Python
|
#
|
||
|
# OtterTune - test_constraints.py
|
||
|
#
|
||
|
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
|
||
|
#
|
||
|
import unittest
|
||
|
import numpy as np
|
||
|
from sklearn import datasets
|
||
|
from sklearn.preprocessing import StandardScaler
|
||
|
|
||
|
from analysis.constraints import ParamConstraintHelper
|
||
|
from analysis.preprocessing import DummyEncoder
|
||
|
|
||
|
|
||
|
class ConstraintHelperTestCase(unittest.TestCase):
    """Unit tests for ``ParamConstraintHelper``.

    The tests cover three behaviors of the helper as exercised below:
    undoing/redoing feature scaling, snapping dummy-encoded categorical
    features back to a valid one-hot encoding, and randomly flipping
    categorical features while keeping the encoding valid.
    """

    # params hard-coded for test (messy to import constant from website module)
    INIT_FLIP_PROB = 0.3
    FLIP_PROB_DECAY = 0.5

    # Absolute tolerance used when comparing values that went through a
    # scale/rescale round trip. Exact float equality is brittle there; this
    # matches the 10 decimal places the test previously rounded to.
    ROUND_TRIP_ATOL = 1e-10

    @classmethod
    def _make_helper(cls, scaler, encoder):
        """Build a ``ParamConstraintHelper`` with the shared test parameters."""
        return ParamConstraintHelper(scaler, encoder,
                                     init_flip_prob=cls.INIT_FLIP_PROB,
                                     flip_prob_decay=cls.FLIP_PROB_DECAY)

    @staticmethod
    def _make_encoder(n_values, categorical_features, labels, sample):
        """Build a ``DummyEncoder`` and fit it on ``sample``."""
        encoder = DummyEncoder(n_values, categorical_features, labels, [])
        encoder.fit(sample)
        return encoder

    def _assert_close(self, actual, expected):
        """Assert ``actual`` equals ``expected`` up to ``ROUND_TRIP_ATOL``."""
        self.assertTrue(
            np.allclose(actual, expected, rtol=0, atol=self.ROUND_TRIP_ATOL),
            msg='expected {}, got {}'.format(expected, actual))

    def test_scale_rescale(self):
        """Unscaling a scaled row and rescaling it again are inverse operations."""
        # NOTE(review): load_boston is deprecated in scikit-learn 1.0 and
        # removed in 1.2; this test requires an older scikit-learn.
        X = datasets.load_boston()['data']
        X_scaler = StandardScaler()
        constraint_helper = self._make_helper(X_scaler, None)
        X_scaled = X_scaler.fit_transform(X)

        # there may be some floating point imprecision between scaling and
        # rescaling, so compare with a tolerance instead of exact equality
        row_unscaled = constraint_helper._handle_scaling(  # pylint: disable=protected-access
            X_scaled[0], True)
        self._assert_close(row_unscaled, X[0])

        row_rescaled = constraint_helper._handle_rescaling(  # pylint: disable=protected-access
            row_unscaled, True)
        self._assert_close(row_rescaled, X_scaled[0])

    def test_apply_constraints_unscaled(self):
        """Unscaled dummy values are snapped to a valid one-hot encoding."""
        encoder = self._make_encoder([3], [0], ['a'], [[0, 17]])
        constraint_helper = self._make_helper(StandardScaler(), encoder)

        X = [0.1, 0.2, 0.3, 17]
        X_expected = [0, 0, 1, 17]
        X_corrected = constraint_helper.apply_constraints(X, scaled=False, rescale=False)
        # No scaling is involved here, so exact comparison is appropriate.
        self.assertTrue(np.all(X_corrected == X_expected))

    def test_apply_constraints(self):
        """A perturbed scaled row is corrected back to the original scaled row."""
        encoder = self._make_encoder([3], [0], ['a'], [[0, 17]])
        X_scaler = StandardScaler()
        X = np.array([[0, 0, 1, 17], [1, 0, 0, 17]], dtype=float)
        X_scaled = X_scaler.fit_transform(X)
        constraint_helper = self._make_helper(X_scaler, encoder)

        row = X_scaled[0]
        new_row = np.copy(row)
        new_row[0: 3] += 0.1  # should still represent [0, 0, 1] encoding
        row_corrected = constraint_helper.apply_constraints(new_row)
        # The row goes through unscaling and rescaling, so allow for
        # floating point imprecision.
        self._assert_close(row_corrected, row)

    # tests that repeatedly applying randomize_categorical_features
    # always results in valid configurations of categorical dummy encodings
    # and will lead to all possible values of categorical variables being tried
    def test_randomize_categorical_features(self):
        """Random flips keep encodings valid and eventually visit every level."""
        # variable 0 is categorical, 3 values
        # variable 1 is not categorical
        # variable 2 is categorical, 4 values
        cat_var_0_levels = 3
        cat_var_2_levels = 4
        cat_var_0_idx = 0
        cat_var_2_idx = 2
        encoder = self._make_encoder([cat_var_0_levels, cat_var_2_levels],
                                     [cat_var_0_idx, cat_var_2_idx],
                                     ['a', 'b'],
                                     [[0, 17, 0]])
        constraint_helper = self._make_helper(StandardScaler(), encoder)

        # row is a sample encoded set of features,
        # note that the non-categorical variable is on the right
        row = np.array([0, 0, 1, 1, 0, 0, 0, 17], dtype=float)
        trials = 20
        cat_var_0_counts = np.zeros(cat_var_0_levels)
        cat_var_2_counts = np.zeros(cat_var_2_levels)
        cat_var_2_end = cat_var_0_levels + cat_var_2_levels

        for _ in range(trials):
            # possibly flip the categorical features
            row = constraint_helper.randomize_categorical_features(
                row, scaled=False, rescale=False)

            # check that result is valid for cat_var_0
            cat_var_0_dummies = row[0: cat_var_0_levels]
            self.assertTrue(np.all(np.logical_or(cat_var_0_dummies == 0,
                                                 cat_var_0_dummies == 1)))
            self.assertEqual(np.sum(cat_var_0_dummies), 1)
            cat_var_0_counts[np.argmax(cat_var_0_dummies)] += 1

            # check that result is valid for cat_var_2
            cat_var_2_dummies = row[cat_var_0_levels: cat_var_2_end]
            self.assertTrue(np.all(np.logical_or(cat_var_2_dummies == 0,
                                                 cat_var_2_dummies == 1)))
            self.assertEqual(np.sum(cat_var_2_dummies), 1)
            cat_var_2_counts[np.argmax(cat_var_2_dummies)] += 1

            # the non-categorical feature must never be modified
            self.assertEqual(row[-1], 17)

        # every level of each categorical variable must have been tried
        for level, ct in enumerate(cat_var_0_counts):
            self.assertGreater(ct, 0, msg='cat_var_0 level {} never tried'.format(level))

        for level, ct in enumerate(cat_var_2_counts):
            self.assertGreater(ct, 0, msg='cat_var_2 level {} never tried'.format(level))
|
||
|
|
||
|
|
||
|
# Run the whole test module when it is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
|