92 lines
3.7 KiB
Python
92 lines
3.7 KiB
Python
|
#
|
||
|
# OtterTune - test_cluster.py
|
||
|
#
|
||
|
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
|
||
|
#
|
||
|
import unittest
|
||
|
import numpy as np
|
||
|
from sklearn import datasets
|
||
|
|
||
|
from analysis.cluster import KMeans, KMeansClusters, create_kselection_model
|
||
|
|
||
|
|
||
|
class TestKMeans(unittest.TestCase):
    """Regression tests for ``KMeans`` fitted on the Iris dataset.

    A single model is fitted once for the whole class (cluster count 5,
    fixed ``random_state``); each test compares one fitted attribute
    against hard-coded expected values.
    """

    @classmethod
    def setUpClass(cls):
        """Fit the shared KMeans model on the Iris data."""
        super(TestKMeans, cls).setUpClass()
        iris = datasets.load_iris()
        cls.model = KMeans()
        cls.model.fit(iris.data, 5, iris.target,
                      estimator_params={'n_init': 50, 'random_state': 42})

    def _assert_same_length(self, actual, expected):
        # zip() stops at the shorter input, so without this guard a truncated
        # (or empty) result would make the element-wise loops pass vacuously.
        self.assertEqual(len(actual), len(expected))

    def test_kmeans_n_clusters(self):
        self.assertEqual(self.model.n_clusters_, 5)

    def test_kmeans_cluster_inertia(self):
        # Compared to 2 decimal places.
        self.assertAlmostEqual(self.model.cluster_inertia_, 46.535, 2)

    def test_kmeans_cluster_labels(self):
        expected_labels = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 2, 3, 3, 3,
                           2, 3, 2, 2, 3, 2, 3, 2, 3, 3, 2, 3, 2, 3, 2, 3, 3, 3, 3,
                           3, 3, 3, 2, 2, 2, 2, 3, 2, 3, 3, 3, 2, 2, 2, 3, 2, 2, 2,
                           2, 2, 3, 2, 2, 4, 3, 0, 4, 4, 0, 2, 0, 4, 0, 4, 4, 4, 3,
                           4, 4, 4, 0, 0, 3, 4, 3, 0, 3, 4, 0, 3, 3, 4, 0, 0, 0, 4,
                           3, 3, 0, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 3]
        actual_labels = self.model.cluster_labels_
        self._assert_same_length(actual_labels, expected_labels)
        for lab_actual, lab_expected in zip(actual_labels, expected_labels):
            self.assertEqual(lab_actual, lab_expected)

    def test_kmeans_sample_labels(self):
        # The sample labels handed to fit() were the Iris targets, so the
        # model is expected to report exactly those back.
        expected_labels = datasets.load_iris().target
        actual_labels = self.model.sample_labels_
        self._assert_same_length(actual_labels, expected_labels)
        for lab_actual, lab_expected in zip(actual_labels, expected_labels):
            self.assertEqual(lab_actual, lab_expected)

    def test_kmeans_cluster_centers(self):
        expected_centers = [[7.475, 3.125, 6.300, 2.050],
                            [5.006, 3.418, 1.464, 0.244],
                            [5.508, 2.600, 3.908, 1.204],
                            [6.207, 2.853, 4.746, 1.564],
                            [6.529, 3.058, 5.508, 2.162]]
        actual_centers = self.model.cluster_centers_
        self._assert_same_length(actual_centers, expected_centers)
        for row_actual, row_expected in zip(actual_centers, expected_centers):
            self._assert_same_length(row_actual, row_expected)
            for val_actual, val_expected in zip(row_actual, row_expected):
                # Compared to 2 decimal places.
                self.assertAlmostEqual(val_actual, val_expected, 2)
|
||
|
|
||
|
|
||
|
class TestKSelection(unittest.TestCase):
    """Checks the optimal cluster count picked by each k-selection model on Iris."""

    @classmethod
    def setUpClass(cls):
        """Fit the shared ``KMeansClusters`` instance once for all tests."""
        super(TestKSelection, cls).setUpClass()

        # Load Iris data
        iris_dataset = datasets.load_iris()
        cls.matrix = iris_dataset.data

        kmeans_kwargs = {'n_init': 50, 'random_state': 42}
        cls.kmeans_models = KMeansClusters()
        cls.kmeans_models.fit(cls.matrix,
                              min_cluster=1,
                              max_cluster=10,
                              sample_labels=iris_dataset.target,
                              estimator_params=kmeans_kwargs)

    def setUp(self):
        # Re-seed NumPy's global RNG before every individual test.
        np.random.seed(seed=42)

    def _fit_kselection(self, model_name):
        # Build the named k-selection model and fit it against the shared
        # data matrix and the cluster map produced in setUpClass.
        selector = create_kselection_model(model_name)
        selector.fit(self.matrix, self.kmeans_models.cluster_map_)
        return selector

    def test_detk_optimal_num_clusters(self):
        # Optimal number of clusters according to det-k
        selector = self._fit_kselection("det-k")
        self.assertEqual(selector.optimal_num_clusters_, 2)

    def test_gap_statistic_optimal_num_clusters(self):
        # Optimal number of clusters according to the gap statistic
        selector = self._fit_kselection("gap-statistic")
        self.assertEqual(selector.optimal_num_clusters_, 8)

    def test_silhouette_optimal_num_clusters(self):
        # Optimal number of clusters according to Silhouette Analysis
        selector = self._fit_kselection("s-score")
        self.assertEqual(selector.optimal_num_clusters_, 2)
|