Improved and reordered the preprocessing of metrics in the workload characterization task (bin the metrics first, then remove constant columns, then remove duplicate columns) to further reduce the number of columns and speed up the computation. Tweaked a few admin views to show the most recent info first.
This commit is contained in:
		
							parent
							
								
									efa02899b5
								
							
						
					
					
						commit
						99c80e2e83
					
				|  | @ -118,7 +118,7 @@ class PipelineDataAdmin(admin.ModelAdmin): | |||
| 
 | ||||
| class PipelineRunAdmin(admin.ModelAdmin): | ||||
|     list_display = ('id', 'start_time', 'end_time') | ||||
|     ordering = ('id', 'start_time') | ||||
|     ordering = ('-id', '-start_time') | ||||
| 
 | ||||
| 
 | ||||
| class WorkloadAdmin(admin.ModelAdmin): | ||||
|  | @ -187,7 +187,9 @@ class CustomStatusLogAdmin(StatusLogAdmin): | |||
| 
 | ||||
| 
 | ||||
| class ExecutionTimeAdmin(admin.ModelAdmin): | ||||
|     list_display = ('event', 'result', 'exec_time') | ||||
|     list_display = ('event', 'start_time', 'exec_time', 'result') | ||||
|     list_filter = ('module', 'function', 'tag') | ||||
|     ordering = ('-start_time',) | ||||
| 
 | ||||
|     def exec_time(self, instance):  # pylint: disable=no-self-use | ||||
|         return '{:.0f} sec'.format(instance.execution_time) | ||||
|  |  | |||
|  | @ -247,30 +247,38 @@ def run_workload_characterization(metric_data): | |||
| 
 | ||||
|     matrix = metric_data['data'] | ||||
|     columnlabels = metric_data['columnlabels'] | ||||
|     LOG.debug("Workload characterization ~ initial data size: %s", matrix.shape) | ||||
| 
 | ||||
|     # Bin each column (metric) in the matrix by its decile | ||||
|     binner = Bin(bin_start=1, axis=0) | ||||
|     binned_matrix = binner.fit_transform(matrix) | ||||
| 
 | ||||
|     # Remove any constant columns | ||||
|     nonconst_matrix = [] | ||||
|     nonconst_columnlabels = [] | ||||
|     for col, cl in zip(matrix.T, columnlabels): | ||||
|     for col, cl in zip(binned_matrix.T, columnlabels): | ||||
|         if np.any(col != col[0]): | ||||
|             nonconst_matrix.append(col.reshape(-1, 1)) | ||||
|             nonconst_columnlabels.append(cl) | ||||
|     assert len(nonconst_matrix) > 0, "Need more data to train the model" | ||||
|     nonconst_matrix = np.hstack(nonconst_matrix) | ||||
|     n_rows, n_cols = nonconst_matrix.shape | ||||
|     LOG.debug("Workload characterization ~ nonconst data size: %s", nonconst_matrix.shape) | ||||
| 
 | ||||
|     # Bin each column (metric) in the matrix by its decile | ||||
|     binner = Bin(bin_start=1, axis=0) | ||||
|     binned_matrix = binner.fit_transform(nonconst_matrix) | ||||
|     # Remove any duplicate columns | ||||
|     unique_matrix, unique_idxs = np.unique(nonconst_matrix, axis=1, return_index=True) | ||||
|     unique_columnlabels = [nonconst_columnlabels[idx] for idx in unique_idxs] | ||||
| 
 | ||||
|     LOG.debug("Workload characterization ~ final data size: %s", unique_matrix.shape) | ||||
|     n_rows, n_cols = unique_matrix.shape | ||||
| 
 | ||||
|     # Shuffle the matrix rows | ||||
|     shuffle_indices = get_shuffle_indices(n_rows) | ||||
|     shuffled_matrix = binned_matrix[shuffle_indices, :] | ||||
|     shuffled_matrix = unique_matrix[shuffle_indices, :] | ||||
| 
 | ||||
|     # Fit factor analysis model | ||||
|     fa_model = FactorAnalysis() | ||||
|     # For now we use 5 latent variables | ||||
|     fa_model.fit(shuffled_matrix, nonconst_columnlabels, n_components=5) | ||||
|     fa_model.fit(unique_matrix, unique_columnlabels, n_components=5) | ||||
| 
 | ||||
|     # Components: metrics * factors | ||||
|     components = fa_model.components_.T.copy() | ||||
|  | @ -280,7 +288,7 @@ def run_workload_characterization(metric_data): | |||
|     kmeans_models = KMeansClusters() | ||||
|     kmeans_models.fit(components, min_cluster=1, | ||||
|                       max_cluster=min(n_cols - 1, 20), | ||||
|                       sample_labels=nonconst_columnlabels, | ||||
|                       sample_labels=unique_columnlabels, | ||||
|                       estimator_params={'n_init': 50}) | ||||
| 
 | ||||
|     # Compute optimal # clusters, k, using gap statistics | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue