clean metrics before pruning
commit e7d5b0338a
parent 8da203fdc3
@@ -24,3 +24,6 @@ KNOB_IDENT_USE_PRUNED_METRICS = False
 # The background tasks only process workloads containing this minimum amount of results
 MIN_WORKLOAD_RESULTS_COUNT = 5
 
+# The views used for metrics pruning
+VIEWS_FOR_PRUNING = ['dba_hist_osstat', 'dba_hist_sysstat', 'dba_hist_system_event']
+
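The three entries look like Oracle AWR view names; as the pruning code further down shows, a metric label is kept when any one of these strings occurs inside it as a substring. A tiny illustration with a made-up label (the real label format depends on the metric collector):

    # Hypothetical metric label; real labels come from the collected Oracle data.
    label = 'dba_hist_sysstat.user commits'
    views = ['dba_hist_osstat', 'dba_hist_sysstat', 'dba_hist_system_event']
    print(any(view in label for view in views))  # True -> this metric would survive pruning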
@@ -89,30 +89,6 @@ class ConfigurationRecommendation(BaseTask):  # pylint: disable=abstract-method
         task_meta.save()
 
 
-def clean_metric_data(metric_matrix, metric_labels, session):
-    # Makes sure that all knobs in the dbms are included in the knob_matrix and knob_labels
-    metric_objs = MetricCatalog.objects.filter(dbms=session.dbms)
-    metric_cat = [session.target_objective]
-    for metric_obj in metric_objs:
-        metric_cat.append(metric_obj.name)
-    missing_columns = sorted(set(metric_cat) - set(metric_labels))
-    unused_columns = set(metric_labels) - set(metric_cat)
-    LOG.debug("clean_metric_data: added %d metrics and removed %d metric.", len(missing_columns),
-              len(unused_columns))
-    default_val = 0
-    metric_cat_size = len(metric_cat)
-    matrix = np.ones((len(metric_matrix), metric_cat_size)) * default_val
-    metric_labels_dict = {n: i for i, n in enumerate(metric_labels)}
-    # column labels in matrix has the same order as ones in metric catalog
-    # missing values are filled with default_val
-    for i, metric_name in enumerate(metric_cat):
-        if metric_name in metric_labels_dict:
-            index = metric_labels_dict[metric_name]
-            matrix[:, i] = metric_matrix[:, index]
-    LOG.debug("clean_metric_data: final ~ matrix: %s, labels: %s", matrix.shape, len(metric_cat))
-    return matrix, metric_cat
-
-
 def save_execution_time(start_ts, fn, result):
     end_ts = time.time()
     exec_time = end_ts - start_ts
@@ -513,12 +489,12 @@ def train_ddpg(train_ddpg_input):
     prev_objective = prev_metric_data[target_obj_idx]
 
     # Clean metric data
-    metric_data, metric_labels = clean_metric_data(agg_data['y_matrix'],
-                                                   agg_data['y_columnlabels'], session)
+    metric_data, _ = DataUtil.clean_metric_data(agg_data['y_matrix'],
+                                                agg_data['y_columnlabels'], session)
     metric_data = metric_data.flatten()
     metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1))
     normalized_metric_data = metric_scalar.transform(metric_data.reshape(1, -1))[0]
-    prev_metric_data, _ = clean_metric_data(prev_agg_data['y_matrix'],
-                                            prev_agg_data['y_columnlabels'], session)
+    prev_metric_data, _ = DataUtil.clean_metric_data(prev_agg_data['y_matrix'],
+                                                     prev_agg_data['y_columnlabels'], session)
     prev_metric_data = prev_metric_data.flatten()
     prev_metric_scalar = MinMaxScaler().fit(prev_metric_data.reshape(1, -1))
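For orientation, a small shape walk-through of the clean/flatten/scale pattern used in this hunk, with an arbitrary 2 x 3 stand-in for the cleaned metric matrix (sizes and values are invented):

    import numpy as np
    from sklearn.preprocessing import MinMaxScaler

    metric_data = np.array([[1.0, 5.0, 9.0],
                            [2.0, 6.0, 8.0]])         # stand-in for clean_metric_data output
    flat = metric_data.flatten()                       # shape (6,)
    scaler = MinMaxScaler().fit(flat.reshape(1, -1))   # fitted on one sample with 6 features
    normalized = scaler.transform(flat.reshape(1, -1))[0]
    print(flat.shape, normalized.shape)                # (6,) (6,)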
@@ -642,7 +618,8 @@ def configuration_recommendation_ddpg(recommendation_ddpg_input):  # pylint: dis
 
     params = JSONUtil.loads(session.hyperparameters)
     agg_data = DataUtil.aggregate_data(result_list)
-    metric_data, _ = clean_metric_data(agg_data['y_matrix'], agg_data['y_columnlabels'], session)
+    metric_data, _ = DataUtil.clean_metric_data(agg_data['y_matrix'],
+                                                agg_data['y_columnlabels'], session)
     metric_data = metric_data.flatten()
     metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1))
     normalized_metric_data = metric_scalar.transform(metric_data.reshape(1, -1))[0]
@@ -22,7 +22,7 @@ from analysis.preprocessing import (Bin, get_shuffle_indices,
                                     consolidate_columnlabels)
 from website.models import PipelineData, PipelineRun, Result, Workload, ExecutionTime
 from website.settings import (ENABLE_DUMMY_ENCODER, KNOB_IDENT_USE_PRUNED_METRICS,
-                              MIN_WORKLOAD_RESULTS_COUNT, TIME_ZONE)
+                              MIN_WORKLOAD_RESULTS_COUNT, TIME_ZONE, VIEWS_FOR_PRUNING)
 from website.types import PipelineTaskType, WorkloadStatusType
 from website.utils import DataUtil, JSONUtil
 
@@ -82,8 +82,6 @@ def run_background_tasks():
             # Check that there are enough results in the workload
             LOG.info("Not enough results in workload %s (# results: %s, # required: %s).",
                      workload_name, num_wkld_results, MIN_WORKLOAD_RESULTS_COUNT)
-            workload.status = WorkloadStatusType.PROCESSED
-            workload.save()
             continue
 
         LOG.info("Aggregating data for workload %s...", workload_name)
@@ -92,18 +90,11 @@ def run_background_tasks():
         LOG.debug("Aggregated knob data: rowlabels=%s, columnlabels=%s, data=%s.",
                   len(knob_data['rowlabels']), len(knob_data['columnlabels']),
                   knob_data['data'].shape)
+        LOG.debug("Aggregated metric data: rowlabels=%s, columnlabels=%s, data=%s.",
+                  len(metric_data['rowlabels']), len(metric_data['columnlabels']),
+                  metric_data['data'].shape)
         LOG.info("Done aggregating data for workload %s.", workload_name)
 
-        num_valid_results = knob_data['data'].shape[0]  # pylint: disable=unsubscriptable-object
-        if num_valid_results < MIN_WORKLOAD_RESULTS_COUNT:
-            # Check that there are enough valid results in the workload
-            LOG.info("Not enough valid results in workload %s (# valid results: "
-                     "%s, # required: %s).", workload_name, num_valid_results,
-                     MIN_WORKLOAD_RESULTS_COUNT)
-            workload.status = WorkloadStatusType.PROCESSED
-            workload.save()
-            continue
-
         # Knob_data and metric_data are 2D numpy arrays. Convert them into a
         # JSON-friendly (nested) lists and then save them as new PipelineData
         # objects.
@@ -264,6 +255,14 @@ def run_workload_characterization(metric_data):
     matrix = metric_data['data']
     columnlabels = metric_data['columnlabels']
     LOG.debug("Workload characterization ~ initial data size: %s", matrix.shape)
+    useful_labels = []
+    for label in columnlabels:
+        for view in VIEWS_FOR_PRUNING:
+            if view in label:
+                useful_labels.append(label)
+                break
+    matrix, columnlabels = DataUtil.clean_metric_data(matrix, columnlabels, None, useful_labels)
+    LOG.debug("Workload characterization ~ cleaned data size: %s", matrix.shape)
 
     # Bin each column (metric) in the matrix by its decile
     binner = Bin(bin_start=1, axis=0)
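A self-contained sketch of the filter added here, with made-up column labels in a '<view>.<metric>' shape (assumed for illustration only). The plain column slice at the end stands in for the DataUtil.clean_metric_data call in the hunk, which gives the same result when every useful label is already present in the data:

    import numpy as np

    VIEWS_FOR_PRUNING = ['dba_hist_osstat', 'dba_hist_sysstat', 'dba_hist_system_event']
    columnlabels = ['dba_hist_osstat.busy_time',          # kept: matches dba_hist_osstat
                    'dba_hist_sqlstat.executions_delta',  # dropped: view not whitelisted
                    'dba_hist_sysstat.user commits']      # kept: matches dba_hist_sysstat
    matrix = np.arange(6, dtype=float).reshape(2, 3)      # 2 results x 3 metrics

    useful_labels = []
    for label in columnlabels:
        for view in VIEWS_FOR_PRUNING:
            if view in label:          # substring match against the view name
                useful_labels.append(label)
                break

    keep = [columnlabels.index(lab) for lab in useful_labels]
    pruned = matrix[:, keep]
    print(useful_labels)   # ['dba_hist_osstat.busy_time', 'dba_hist_sysstat.user commits']
    print(pruned.shape)    # (2, 2)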
|
@ -298,6 +297,7 @@ def run_workload_characterization(metric_data):
|
||||||
|
|
||||||
# Components: metrics * factors
|
# Components: metrics * factors
|
||||||
components = fa_model.components_.T.copy()
|
components = fa_model.components_.T.copy()
|
||||||
|
LOG.info("Workload characterization first part costs %.0f seconds.", time.time() - start_ts)
|
||||||
|
|
||||||
# Run Kmeans for # clusters k in range(1, num_nonduplicate_metrics - 1)
|
# Run Kmeans for # clusters k in range(1, num_nonduplicate_metrics - 1)
|
||||||
# K should be much smaller than n_cols in detK, For now max_cluster <= 20
|
# K should be much smaller than n_cols in detK, For now max_cluster <= 20
|
||||||
|
|
|
@@ -25,7 +25,7 @@ from django.utils.text import capfirst
 from django_db_logger.models import StatusLog
 from djcelery.models import TaskMeta
 
-from .models import DBMSCatalog, KnobCatalog, Result, Session, SessionKnob
+from .models import DBMSCatalog, MetricCatalog, KnobCatalog, Result, Session, SessionKnob
 from .settings import common
 from .types import LabelStyleType, VarType
 
@@ -211,6 +211,35 @@ class DataUtil(object):
                 rowlabels_unique[i] = tuple(rowlabels[dup_idxs])
         return X_unique, y_unique, rowlabels_unique
 
+    @staticmethod
+    def clean_metric_data(metric_matrix, metric_labels, session, useful_labels=None):
+        # Make metric_labels identical to useful_labels (if given).
+        # If useful_labels is not given, let it be all metrics in the catalog.
+        if useful_labels is None:
+            metric_objs = MetricCatalog.objects.filter(dbms=session.dbms)
+            metric_cat = [session.target_objective]
+            for metric_obj in metric_objs:
+                metric_cat.append(metric_obj.name)
+        else:
+            metric_cat = useful_labels
+        missing_columns = sorted(set(metric_cat) - set(metric_labels))
+        unused_columns = set(metric_labels) - set(metric_cat)
+        LOG.debug("clean_metric_data: added %d metrics and removed %d metrics.",
+                  len(missing_columns), len(unused_columns))
+        default_val = 0
+        metric_cat_size = len(metric_cat)
+        matrix = np.ones((len(metric_matrix), metric_cat_size)) * default_val
+        metric_labels_dict = {n: i for i, n in enumerate(metric_labels)}
+        # Column labels in the matrix have the same order as the ones in the metric catalog;
+        # missing values are filled with default_val.
+        for i, metric_name in enumerate(metric_cat):
+            if metric_name in metric_labels_dict:
+                index = metric_labels_dict[metric_name]
+                matrix[:, i] = metric_matrix[:, index]
+        LOG.debug("clean_metric_data: final ~ matrix: %s, labels: %s", matrix.shape,
+                  len(metric_cat))
+        return matrix, metric_cat
+
     @staticmethod
     def clean_knob_data(knob_matrix, knob_labels, sessions):
         # Filter and amend knob_matrix and knob_labels according to the tunable knobs in the session
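A standalone sketch of the useful_labels branch of the new method (the session branch needs the Django MetricCatalog model, so it is skipped here): output columns follow the order of useful_labels, existing columns are copied over, and any requested label missing from the input is filled with the default value 0. The metric names below are hypothetical:

    import numpy as np

    def clean_metric_data_sketch(metric_matrix, metric_labels, useful_labels):
        # Mirrors DataUtil.clean_metric_data when useful_labels is given.
        metric_cat = list(useful_labels)
        matrix = np.zeros((len(metric_matrix), len(metric_cat)))   # default_val = 0
        label_to_idx = {name: i for i, name in enumerate(metric_labels)}
        for i, name in enumerate(metric_cat):
            if name in label_to_idx:
                matrix[:, i] = metric_matrix[:, label_to_idx[name]]
        return matrix, metric_cat

    raw = np.array([[1.0, 2.0],
                    [3.0, 4.0]])                  # 2 results x 2 metrics
    labels = ['metric_b', 'metric_a']             # hypothetical metric names
    cleaned, cat = clean_metric_data_sketch(raw, labels, ['metric_a', 'metric_missing'])
    print(cleaned)    # [[2. 0.]
                      #  [4. 0.]]  -- columns reordered, missing column zero-filled
    print(cat)        # ['metric_a', 'metric_missing']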