Initial commit with BSL

This commit is contained in:
Andy Pavlo
2019-08-23 11:47:19 -04:00
commit 3e564ce922
286 changed files with 177642 additions and 0 deletions

View File

@@ -0,0 +1,5 @@
#
# OtterTune - __init__.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#

View File

@@ -0,0 +1,91 @@
#
# OtterTune - test_cluster.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
import unittest
import numpy as np
from sklearn import datasets
from analysis.cluster import KMeans, KMeansClusters, create_kselection_model
class TestKMeans(unittest.TestCase):
    """Verify KMeans clustering output on the Iris dataset with k=5."""

    @classmethod
    def setUpClass(cls):
        super(TestKMeans, cls).setUpClass()
        iris = datasets.load_iris()
        cls.model = KMeans()
        cls.model.fit(iris.data, 5, iris.target,
                      estimator_params={'n_init': 50, 'random_state': 42})

    def test_kmeans_n_clusters(self):
        """The fitted model reports the requested number of clusters."""
        self.assertEqual(self.model.n_clusters_, 5)

    def test_kmeans_cluster_inertia(self):
        """Inertia matches the recorded reference value to 2 decimals."""
        self.assertAlmostEqual(self.model.cluster_inertia_, 46.535, 2)

    def test_kmeans_cluster_labels(self):
        """Cluster assignments match the recorded reference labels."""
        expected_labels = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 2, 3, 3, 3,
                           2, 3, 2, 2, 3, 2, 3, 2, 3, 3, 2, 3, 2, 3, 2, 3, 3, 3, 3,
                           3, 3, 3, 2, 2, 2, 2, 3, 2, 3, 3, 3, 2, 2, 2, 3, 2, 2, 2,
                           2, 2, 3, 2, 2, 4, 3, 0, 4, 4, 0, 2, 0, 4, 0, 4, 4, 4, 3,
                           4, 4, 4, 0, 0, 3, 4, 3, 0, 3, 4, 0, 3, 3, 4, 0, 0, 0, 4,
                           3, 3, 0, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 3]
        for expected, actual in zip(expected_labels, self.model.cluster_labels_):
            self.assertEqual(actual, expected)

    def test_kmeans_sample_labels(self):
        """Sample labels stored on the model equal the Iris targets."""
        targets = datasets.load_iris().target
        for expected, actual in zip(targets, self.model.sample_labels_):
            self.assertEqual(actual, expected)

    def test_kmeans_cluster_centers(self):
        """Cluster centers match the reference values to 2 decimals."""
        expected_centers = [[7.475, 3.125, 6.300, 2.050],
                            [5.006, 3.418, 1.464, 0.244],
                            [5.508, 2.600, 3.908, 1.204],
                            [6.207, 2.853, 4.746, 1.564],
                            [6.529, 3.058, 5.508, 2.162]]
        for expected_row, actual_row in zip(expected_centers, self.model.cluster_centers_):
            for expected_val, actual_val in zip(expected_row, actual_row):
                self.assertAlmostEqual(actual_val, expected_val, 2)
class TestKSelection(unittest.TestCase):
    """Check that each k-selection strategy picks the expected cluster count."""

    def setUp(self):
        # Re-seed before every test so selectors that draw random numbers
        # (e.g. gap statistic reference samples) are deterministic.
        np.random.seed(seed=42)

    @classmethod
    def setUpClass(cls):
        super(TestKSelection, cls).setUpClass()
        # Fit KMeans models over a range of cluster counts on the Iris data.
        iris = datasets.load_iris()
        cls.matrix = iris.data
        cls.kmeans_models = KMeansClusters()
        cls.kmeans_models.fit(cls.matrix,
                              min_cluster=1,
                              max_cluster=10,
                              sample_labels=iris.target,
                              estimator_params={'n_init': 50, 'random_state': 42})

    def _optimal_k(self, model_name):
        # Build the named selection model, fit it against the precomputed
        # cluster map, and return the k it recommends.
        selector = create_kselection_model(model_name)
        selector.fit(self.matrix, self.kmeans_models.cluster_map_)
        return selector.optimal_num_clusters_

    def test_detk_optimal_num_clusters(self):
        """Det-k selects k=2 on Iris."""
        self.assertEqual(self._optimal_k("det-k"), 2)

    def test_gap_statistic_optimal_num_clusters(self):
        """Gap statistic selects k=8 on Iris."""
        self.assertEqual(self._optimal_k("gap-statistic"), 8)

    def test_silhouette_optimal_num_clusters(self):
        """Silhouette analysis selects k=2 on Iris."""
        self.assertEqual(self._optimal_k("s-score"), 2)

View File

@@ -0,0 +1,116 @@
#
# OtterTune - test_constraints.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
import unittest
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from analysis.constraints import ParamConstraintHelper
from analysis.preprocessing import DummyEncoder
class ConstraintHelperTestCase(unittest.TestCase):
    """Tests for ParamConstraintHelper scaling and categorical-feature handling."""

    def test_scale_rescale(self):
        """Unscaling a standardized row recovers the original values, and
        rescaling recovers the standardized row."""
        data = datasets.load_boston()['data']
        scaler = StandardScaler()
        # params hard-coded for test (messy to import constant from website module)
        helper = ParamConstraintHelper(scaler, None,
                                       init_flip_prob=0.3,
                                       flip_prob_decay=0.5)
        scaled = scaler.fit_transform(data)
        # there may be some floating point imprecision between scaling and rescaling
        unscaled = np.round(helper._handle_scaling(scaled[0], True), 10)  # pylint: disable=protected-access
        self.assertTrue(np.all(data[0] == unscaled))
        rescaled = helper._handle_rescaling(unscaled, True)  # pylint: disable=protected-access
        self.assertTrue(np.all(scaled[0] == rescaled))

    def test_apply_constraints_unscaled(self):
        """Applying constraints snaps a raw dummy-encoded row to a valid one-hot block."""
        encoder = DummyEncoder([3], [0], ['a'], [])
        encoder.fit([[0, 17]])
        helper = ParamConstraintHelper(StandardScaler(), encoder,
                                       init_flip_prob=0.3,
                                       flip_prob_decay=0.5)
        raw_row = [0.1, 0.2, 0.3, 17]
        expected = [0, 0, 1, 17]
        corrected = helper.apply_constraints(raw_row, scaled=False, rescale=False)
        self.assertTrue(np.all(corrected == expected))

    def test_apply_constraints(self):
        """Applying constraints on a scaled, slightly perturbed row restores
        the nearest valid encoding."""
        encoder = DummyEncoder([3], [0], ['a'], [])
        encoder.fit([[0, 17]])
        scaler = StandardScaler()
        matrix = np.array([[0, 0, 1, 17], [1, 0, 0, 17]], dtype=float)
        scaled = scaler.fit_transform(matrix)
        helper = ParamConstraintHelper(scaler, encoder,
                                       init_flip_prob=0.3,
                                       flip_prob_decay=0.5)
        row = scaled[0]
        perturbed = np.copy(row)
        perturbed[0: 3] += 0.1  # should still represent [0, 0, 1] encoding
        corrected = helper.apply_constraints(perturbed)
        self.assertTrue(np.all(row == corrected))

    # tests that repeatedly applying randomize_categorical_features
    # always results in valid configurations of categorical dummy encodings
    # and will lead to all possible values of categorical variables being tried
    def test_randomize_categorical_features(self):
        """Randomization keeps each one-hot block valid and eventually visits
        every categorical level."""
        # variable 0 is categorical, 3 values
        # variable 1 is not categorical
        # variable 2 is categorical, 4 values
        var0_levels = 3
        var2_levels = 4
        encoder = DummyEncoder([var0_levels, var2_levels], [0, 2], ['a', 'b'], [])
        encoder.fit([[0, 17, 0]])
        helper = ParamConstraintHelper(StandardScaler(), encoder,
                                       init_flip_prob=0.3,
                                       flip_prob_decay=0.5)
        # row is a sample encoded set of features,
        # note that the non-categorical variable is on the right
        row = np.array([0, 0, 1, 1, 0, 0, 0, 17], dtype=float)
        var0_counts = np.zeros(var0_levels)
        var2_counts = np.zeros(var2_levels)
        for _ in range(20):
            # possibly flip the categorical features
            row = helper.randomize_categorical_features(row, scaled=False, rescale=False)
            # check that result is valid for variable 0
            var0_dummies = row[0: var0_levels]
            self.assertTrue(np.all(np.logical_or(var0_dummies == 0, var0_dummies == 1)))
            self.assertEqual(np.sum(var0_dummies), 1)
            var0_counts[np.argmax(var0_dummies)] += 1
            # check that result is valid for variable 2
            var2_dummies = row[var0_levels: var0_levels + var2_levels]
            self.assertTrue(np.all(np.logical_or(var2_dummies == 0, var2_dummies == 1)))
            self.assertEqual(np.sum(var2_dummies), 1)
            var2_counts[np.argmax(var2_dummies)] += 1
            # the non-categorical value must never be disturbed
            self.assertEqual(row[-1], 17)
        for count in var0_counts:
            self.assertTrue(count > 0)
        for count in var2_counts:
            self.assertTrue(count > 0)
# Allow running this test module directly (outside a test runner).
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,61 @@
#
# OtterTune - test_gpr.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
import unittest
from sklearn import datasets
from analysis.gp import GPRNP
from analysis.gp_tf import GPR
# test numpy version GPR
class TestGPRNP(unittest.TestCase):
    """Exercise the NumPy GPR implementation on the Boston housing data."""

    @classmethod
    def setUpClass(cls):
        super(TestGPRNP, cls).setUpClass()
        # Train on the first 500 rows; predict on the remainder.
        boston = datasets.load_boston()
        data = boston['data']
        train_x = data[0:500]
        test_x = data[500:]
        train_y = boston['target'][0:500].reshape(500, 1)
        cls.model = GPRNP(length_scale=1.0, magnitude=1.0)
        cls.model.fit(train_x, train_y, ridge=1.0)
        cls.gpr_result = cls.model.predict(test_x)

    def test_gprnp_ypreds(self):
        """Predicted means match the recorded reference values."""
        expected = [0.0181, 0.0014, 0.0006, 0.0015, 0.0039, 0.0014]
        actual = [round(pred[0], 4) for pred in self.gpr_result.ypreds]
        self.assertEqual(actual, expected)

    def test_gprnp_sigmas(self):
        """Predicted standard deviations match the recorded reference values."""
        expected = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
        actual = [round(sigma[0], 4) for sigma in self.gpr_result.sigmas]
        self.assertEqual(actual, expected)
# test Tensorflow version GPR
class TestGPRTF(unittest.TestCase):
    """Exercise the TensorFlow GPR implementation on the Boston housing data.

    Mirrors TestGPRNP: both implementations are expected to produce identical
    predictions for identical inputs.
    """

    @classmethod
    def setUpClass(cls):
        super(TestGPRTF, cls).setUpClass()
        # Train on the first 500 rows; predict on the remainder.
        boston = datasets.load_boston()
        data = boston['data']
        X_train = data[0:500]
        X_test = data[500:]
        y_train = boston['target'][0:500].reshape(500, 1)
        cls.model = GPR(length_scale=1.0, magnitude=1.0)
        cls.model.fit(X_train, y_train, ridge=1.0)
        cls.gpr_result = cls.model.predict(X_test)

    # Fixed copy-paste defect: these tests were named test_gprnp_* (copied
    # from the NumPy suite), which misreports which implementation failed.
    def test_gprtf_ypreds(self):
        """Predicted means from the TF model match the reference values."""
        ypreds_round = [round(x[0], 4) for x in self.gpr_result.ypreds]
        expected_ypreds = [0.0181, 0.0014, 0.0006, 0.0015, 0.0039, 0.0014]
        self.assertEqual(ypreds_round, expected_ypreds)

    def test_gprtf_sigmas(self):
        """Predicted standard deviations from the TF model match the reference values."""
        sigmas_round = [round(x[0], 4) for x in self.gpr_result.sigmas]
        expected_sigmas = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
        self.assertEqual(sigmas_round, expected_sigmas)

View File

@@ -0,0 +1,83 @@
#
# OtterTune - test_preprocessing.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
import unittest
import numpy as np
from analysis.preprocessing import DummyEncoder, consolidate_columnlabels
class TestDummyEncoder(unittest.TestCase):
    """Verify DummyEncoder's one-hot encoding, label expansion, and inversion."""

    def test_no_categoricals(self):
        """With no categorical columns the input passes through unchanged."""
        matrix = [[1, 2, 3], [4, 5, 6]]
        noncat_labels = ['a', 'b', 'c']
        enc = DummyEncoder([], [], [], noncat_labels)
        encoded = enc.fit_transform(matrix)
        self.assertTrue(np.all(matrix == encoded))
        self.assertEqual(noncat_labels, enc.new_labels)

    def test_simple_categorical(self):
        """A leading categorical column expands to a one-hot block with
        suffixed labels."""
        matrix = [[0, 1, 2], [1, 1, 2], [2, 1, 2]]
        expected_matrix = [[1, 0, 0, 1, 2], [0, 1, 0, 1, 2], [0, 0, 1, 1, 2]]
        expected_labels = ['label____0', 'label____1', 'label____2', 'a', 'b']
        enc = DummyEncoder([3], [0], ['label'], ['a', 'b'])
        encoded = enc.fit_transform(matrix)
        self.assertTrue(np.all(expected_matrix == encoded))
        self.assertEqual(expected_labels, enc.new_labels)

    def test_mixed_categorical(self):
        """A categorical column in the middle is moved to the front and
        one-hot expanded; non-categorical columns follow."""
        matrix = [[1, 0, 2], [1, 1, 2], [1, 2, 2]]
        expected_matrix = [[1, 0, 0, 1, 2], [0, 1, 0, 1, 2], [0, 0, 1, 1, 2]]
        expected_labels = ['label____0', 'label____1', 'label____2', 'a', 'b']
        enc = DummyEncoder([3], [1], ['label'], ['a', 'b'])
        encoded = enc.fit_transform(matrix)
        self.assertTrue(np.all(expected_matrix == encoded))
        self.assertEqual(expected_labels, enc.new_labels)

    def test_consolidate(self):
        """Dummy-expanded labels collapse back to their base column names."""
        expanded = ['label1____0', 'label1____1', 'label2____0', 'label2____1', 'noncat']
        self.assertEqual(['label1', 'label2', 'noncat'],
                         consolidate_columnlabels(expanded))

    def test_inverse_transform(self):
        """inverse_transform undoes fit_transform exactly."""
        matrix = [[1, 0, 2], [1, 1, 2], [1, 2, 2]]
        expected_encoded = [[1, 0, 0, 1, 2], [0, 1, 0, 1, 2], [0, 0, 1, 1, 2]]
        enc = DummyEncoder([3], [1], ['label'], ['a', 'b'])
        encoded = enc.fit_transform(matrix)
        self.assertTrue(np.all(encoded == expected_encoded))
        decoded = enc.inverse_transform(encoded)
        self.assertTrue(np.all(matrix == decoded))
# Allow running this test module directly (outside a test runner).
if __name__ == '__main__':
    unittest.main()