84 lines
3.0 KiB
Python
84 lines
3.0 KiB
Python
#
# OtterTune - test_preprocessing.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
|
|
import unittest
|
|
import numpy as np
|
|
|
|
from analysis.preprocessing import DummyEncoder, consolidate_columnlabels
|
|
|
|
|
|
class TestDummyEncoder(unittest.TestCase):
    """Unit tests for ``DummyEncoder`` and ``consolidate_columnlabels``.

    Array results are compared with ``np.testing.assert_array_equal`` rather
    than ``assertTrue(np.all(a == b))``: the former rejects shape mismatches
    instead of relying on broadcasting, and on failure it reports the
    differing elements instead of a bare "False is not true".
    """

    def test_no_categoricals(self):
        """Without categorical features, data and labels are expected unchanged."""
        X = [[1, 2, 3], [4, 5, 6]]
        n_values = []
        categorical_features = []
        cat_columnlabels = []
        noncat_columnlabels = ['a', 'b', 'c']

        enc = DummyEncoder(n_values, categorical_features,
                           cat_columnlabels, noncat_columnlabels)
        X_encoded = enc.fit_transform(X)

        np.testing.assert_array_equal(X_encoded, X)
        self.assertEqual(noncat_columnlabels, enc.new_labels)

    def test_simple_categorical(self):
        """A 3-valued categorical in column 0 is expected as three dummy columns."""
        X = [[0, 1, 2], [1, 1, 2], [2, 1, 2]]
        n_values = [3]
        categorical_features = [0]
        cat_columnlabels = ['label']
        noncat_columnlabels = ['a', 'b']

        # Expected layout: dummy columns first, then the non-categorical ones.
        X_expected = [[1, 0, 0, 1, 2], [0, 1, 0, 1, 2], [0, 0, 1, 1, 2]]
        new_labels_expected = ['label____0', 'label____1', 'label____2', 'a', 'b']

        enc = DummyEncoder(n_values, categorical_features,
                           cat_columnlabels, noncat_columnlabels)
        X_encoded = enc.fit_transform(X)

        np.testing.assert_array_equal(X_encoded, X_expected)
        self.assertEqual(new_labels_expected, enc.new_labels)

    def test_mixed_categorical(self):
        """A categorical in the middle column is expected to move to the front."""
        X = [[1, 0, 2], [1, 1, 2], [1, 2, 2]]
        n_values = [3]
        categorical_features = [1]
        cat_columnlabels = ['label']
        noncat_columnlabels = ['a', 'b']

        # The categorical sits at index 1 in the input, yet the expected output
        # still lists its dummy columns before the non-categorical columns.
        X_expected = [[1, 0, 0, 1, 2], [0, 1, 0, 1, 2], [0, 0, 1, 1, 2]]
        new_labels_expected = ['label____0', 'label____1', 'label____2', 'a', 'b']

        enc = DummyEncoder(n_values, categorical_features,
                           cat_columnlabels, noncat_columnlabels)
        X_encoded = enc.fit_transform(X)

        np.testing.assert_array_equal(X_encoded, X_expected)
        self.assertEqual(new_labels_expected, enc.new_labels)

    def test_consolidate(self):
        """Dummy labels sharing a prefix are expected to collapse to that prefix."""
        labels = ['label1____0', 'label1____1', 'label2____0', 'label2____1', 'noncat']
        expected = ['label1', 'label2', 'noncat']

        consolidated = consolidate_columnlabels(labels)

        self.assertEqual(expected, consolidated)

    def test_inverse_transform(self):
        """Encoding then decoding is expected to reproduce the original data."""
        X = [[1, 0, 2], [1, 1, 2], [1, 2, 2]]
        n_values = [3]
        categorical_features = [1]
        cat_columnlabels = ['label']
        noncat_columnlabels = ['a', 'b']

        X_expected = [[1, 0, 0, 1, 2], [0, 1, 0, 1, 2], [0, 0, 1, 1, 2]]

        enc = DummyEncoder(n_values, categorical_features,
                           cat_columnlabels, noncat_columnlabels)

        # Check the forward transform first so a round-trip failure can be
        # attributed to inverse_transform rather than to fit_transform.
        X_encoded = enc.fit_transform(X)
        np.testing.assert_array_equal(X_encoded, X_expected)

        X_decoded = enc.inverse_transform(X_encoded)
        np.testing.assert_array_equal(X_decoded, X)
|
|
|
|
|
|
# Run the test suite when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
|