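Improved and reordered the preprocessing of metrics in the workload characterization task: metrics are now binned by decile before constant columns are removed, and duplicate columns are dropped afterwards, to further reduce the number of columns and speed up the computation. Tweaked a few admin views (PipelineRun, ExecutionTime) to show the most recent entries first.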

2020-04-14 05:27:35 -04:00 · 2020-04-14 05:27:35 -04:00 · 99c80e2e83
parent efa02899b5
commit 99c80e2e83
2 changed files with 20 additions and 10 deletions
--- a/server/website/website/admin.py
+++ b/server/website/website/admin.py
@@ -118,7 +118,7 @@ class PipelineDataAdmin(admin.ModelAdmin):

 class PipelineRunAdmin(admin.ModelAdmin):
    list_display = ('id', 'start_time', 'end_time')
-    ordering = ('id', 'start_time')
+    ordering = ('-id', '-start_time')


 class WorkloadAdmin(admin.ModelAdmin):
@@ -187,7 +187,9 @@ class CustomStatusLogAdmin(StatusLogAdmin):


 class ExecutionTimeAdmin(admin.ModelAdmin):
-    list_display = ('event', 'result', 'exec_time')
+    list_display = ('event', 'start_time', 'exec_time', 'result')
+    list_filter = ('module', 'function', 'tag')
+    ordering = ('-start_time',)

    def exec_time(self, instance):  # pylint: disable=no-self-use
        return '{:.0f} sec'.format(instance.execution_time)
--- a/server/website/website/tasks/periodic_tasks.py
+++ b/server/website/website/tasks/periodic_tasks.py
@@ -247,30 +247,38 @@ def run_workload_characterization(metric_data):

    matrix = metric_data['data']
    columnlabels = metric_data['columnlabels']
+    LOG.debug("Workload characterization ~ initial data size: %s", matrix.shape)
+
+    # Bin each column (metric) in the matrix by its decile
+    binner = Bin(bin_start=1, axis=0)
+    binned_matrix = binner.fit_transform(matrix)

    # Remove any constant columns
    nonconst_matrix = []
    nonconst_columnlabels = []
-    for col, cl in zip(matrix.T, columnlabels):
+    for col, cl in zip(binned_matrix.T, columnlabels):
        if np.any(col != col[0]):
            nonconst_matrix.append(col.reshape(-1, 1))
            nonconst_columnlabels.append(cl)
    assert len(nonconst_matrix) > 0, "Need more data to train the model"
    nonconst_matrix = np.hstack(nonconst_matrix)
-    n_rows, n_cols = nonconst_matrix.shape
+    LOG.debug("Workload characterization ~ nonconst data size: %s", nonconst_matrix.shape)

-    # Bin each column (metric) in the matrix by its decile
-    binner = Bin(bin_start=1, axis=0)
-    binned_matrix = binner.fit_transform(nonconst_matrix)
+    # Remove any duplicate columns
+    unique_matrix, unique_idxs = np.unique(nonconst_matrix, axis=1, return_index=True)
+    unique_columnlabels = [nonconst_columnlabels[idx] for idx in unique_idxs]
+
+    LOG.debug("Workload characterization ~ final data size: %s", unique_matrix.shape)
+    n_rows, n_cols = unique_matrix.shape

    # Shuffle the matrix rows
    shuffle_indices = get_shuffle_indices(n_rows)
-    shuffled_matrix = binned_matrix[shuffle_indices, :]
+    shuffled_matrix = unique_matrix[shuffle_indices, :]

    # Fit factor analysis model
    fa_model = FactorAnalysis()
    # For now we use 5 latent variables
-    fa_model.fit(shuffled_matrix, nonconst_columnlabels, n_components=5)
+    fa_model.fit(unique_matrix, unique_columnlabels, n_components=5)

    # Components: metrics * factors
    components = fa_model.components_.T.copy()
@@ -280,7 +288,7 @@ def run_workload_characterization(metric_data):
    kmeans_models = KMeansClusters()
    kmeans_models.fit(components, min_cluster=1,
                      max_cluster=min(n_cols - 1, 20),
-                      sample_labels=nonconst_columnlabels,
+                      sample_labels=unique_columnlabels,
                      estimator_params={'n_init': 50})

    # Compute optimal # clusters, k, using gap statistics