clean metrics for DDPG

parent 2916330e89
commit 285522a0f5
@@ -32,3 +32,9 @@ VIEWS_FOR_PRUNING = {
     DBMSType.ORACLE: ['dba_hist_osstat', 'dba_hist_sysstat', 'dba_hist_system_event',
                       'dba_workload_replays', 'dba_hist_sys_time_model'],
 }
+
+# The views used for DDPG
+# WARNING: modifying this parameter will break all existing DDPG sessions
+VIEWS_FOR_DDPG = {
+    DBMSType.ORACLE: ['dba_hist_sys_time_model'],
+}
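For context: clean_metric_data (reworked in the last hunk below) keeps a metric only if its label contains one of these view names as a substring, so this setting restricts DDPG's metric space to dba_hist_sys_time_model statistics on Oracle. A minimal sketch of the matching, with made-up labels:

    # Made-up labels; a label survives if any configured view name is a substring of it.
    views = ['dba_hist_sys_time_model']
    labels = ['dba_hist_sys_time_model.db_time',   # kept
              'dba_hist_osstat.busy_time']         # dropped
    kept = [label for label in labels if any(view in label for view in views)]
    assert kept == ['dba_hist_sys_time_model.db_time']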
@@ -34,7 +34,7 @@ from website.models import (PipelineData, PipelineRun, Result, Workload, Session
 from website import db
 from website.types import PipelineTaskType, AlgorithmType, VarType
 from website.utils import DataUtil, JSONUtil
-from website.settings import ENABLE_DUMMY_ENCODER, TIME_ZONE
+from website.settings import ENABLE_DUMMY_ENCODER, TIME_ZONE, VIEWS_FOR_DDPG
 
 
 LOG = get_task_logger(__name__)
@@ -420,6 +420,7 @@ def train_ddpg(train_ddpg_input):
     result_id, algorithm, target_data = train_ddpg_input
     result = Result.objects.get(pk=result_id)
     session = result.session
+    dbms = session.dbms
     task_name = _get_task_name(session, result_id)
 
     # If the preprocessing method has already generated a config, bypass this method.
@@ -480,13 +481,14 @@ def train_ddpg(train_ddpg_input):
     prev_objective = prev_metric_data[target_obj_idx]
 
     # Clean metric data
+    views = VIEWS_FOR_DDPG.get(dbms.type, None)
     metric_data, _ = DataUtil.clean_metric_data(agg_data['y_matrix'],
-                                                agg_data['y_columnlabels'], session)
+                                                agg_data['y_columnlabels'], views)
     metric_data = metric_data.flatten()
     metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1))
     normalized_metric_data = metric_scalar.transform(metric_data.reshape(1, -1))[0]
     prev_metric_data, _ = DataUtil.clean_metric_data(prev_agg_data['y_matrix'],
-                                                     prev_agg_data['y_columnlabels'], session)
+                                                     prev_agg_data['y_columnlabels'], views)
     prev_metric_data = prev_metric_data.flatten()
     prev_metric_scalar = MinMaxScaler().fit(prev_metric_data.reshape(1, -1))
     prev_normalized_metric_data = prev_metric_scalar.transform(prev_metric_data.reshape(1, -1))[0]
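The normalization above operates on a single flattened row: flatten() turns the aggregated matrix into a 1-D vector, reshape(1, -1) makes it a one-row 2-D array as scikit-learn expects, and indexing with [0] recovers the 1-D vector afterwards. Toy numbers only:

    import numpy as np
    from sklearn.preprocessing import MinMaxScaler

    metric_data = np.array([[1.0, 2.0], [3.0, 4.0]]).flatten()  # shape (4,)
    row = metric_data.reshape(1, -1)                            # shape (1, 4)
    scaler = MinMaxScaler().fit(row)        # per-column min/max over one row
    normalized = scaler.transform(row)[0]   # back to shape (4,)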
@@ -597,6 +599,7 @@ def configuration_recommendation_ddpg(recommendation_ddpg_input):  # pylint: dis
     result_list = Result.objects.filter(pk=result_id)
     result = result_list.first()
     session = result.session
+    dbms = session.dbms
     task_name = _get_task_name(session, result_id)
 
     early_return, target_data_res = check_early_return(target_data, algorithm)
@@ -609,8 +612,9 @@ def configuration_recommendation_ddpg(recommendation_ddpg_input):  # pylint: dis
 
     params = JSONUtil.loads(session.hyperparameters)
     agg_data = DataUtil.aggregate_data(result_list)
+    views = VIEWS_FOR_DDPG.get(dbms.type, None)
     metric_data, _ = DataUtil.clean_metric_data(agg_data['y_matrix'],
-                                                agg_data['y_columnlabels'], session)
+                                                agg_data['y_columnlabels'], views)
     metric_data = metric_data.flatten()
     metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1))
     normalized_metric_data = metric_scalar.transform(metric_data.reshape(1, -1))[0]
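Note the fallback: a DBMS type with no entry in VIEWS_FOR_DDPG yields views = None, and the reworked clean_metric_data then returns the matrix and labels untouched, so sessions on other DBMSs keep their full metric set. A stand-in illustration (string keys instead of the real DBMSType values):

    views_for_ddpg = {'oracle': ['dba_hist_sys_time_model']}
    views = views_for_ddpg.get('mysql', None)
    assert views is None   # clean_metric_data(..., None) passes data through unchanged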
@@ -269,16 +269,8 @@ def run_workload_characterization(metric_data, dbms=None):
     LOG.debug("Workload characterization ~ initial data size: %s", matrix.shape)
 
     views = None if dbms is None else VIEWS_FOR_PRUNING.get(dbms.type, None)
-    if views is not None:
-        useful_labels = []
-        for label in columnlabels:
-            for view in views:
-                if view in label:
-                    useful_labels.append(label)
-                    break
-        matrix, columnlabels = DataUtil.clean_metric_data(matrix, columnlabels, None,
-                                                          useful_labels)
-        LOG.debug("Workload characterization ~ cleaned data size: %s", matrix.shape)
+    matrix, columnlabels = DataUtil.clean_metric_data(matrix, columnlabels, views)
+    LOG.debug("Workload characterization ~ cleaned data size: %s", matrix.shape)
 
     # Bin each column (metric) in the matrix by its decile
     binner = Bin(bin_start=1, axis=0)
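The inline filtering loop removed above moves into DataUtil.clean_metric_data, which now always runs (and is a no-op when views is None), so behavior is unchanged for DBMS types without pruning views. The substring match is the same in both versions; a toy check:

    views = ['dba_hist_osstat']
    columnlabels = ['dba_hist_osstat.busy_time', 'other.metric']
    # Old inline filter and the new helper both keep only the first label.
    useful = [label for label in columnlabels if any(view in label for view in views)]
    assert useful == ['dba_hist_osstat.busy_time']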
@@ -212,33 +212,35 @@ class DataUtil(object):
         return X_unique, y_unique, rowlabels_unique
 
     @staticmethod
-    def clean_metric_data(metric_matrix, metric_labels, session, useful_labels=None):
-        # Make metric_labels identical to useful_labels (if given)
-        # If useful_labels is not given, let it to be all metrics in the catalog.
-        if useful_labels is None:
-            metric_objs = MetricCatalog.objects.filter(dbms=session.dbms)
-            metric_cat = [session.target_objective]
-            for metric_obj in metric_objs:
-                metric_cat.append(metric_obj.name)
-        else:
-            metric_cat = useful_labels
-        missing_columns = sorted(set(metric_cat) - set(metric_labels))
-        unused_columns = set(metric_labels) - set(metric_cat)
+    def clean_metric_data(metric_matrix, metric_labels, useful_views):
+        # Keep only the metric labels that contain one of the useful views; if None, return the data unchanged.
+        if useful_views is None:
+            return metric_matrix, metric_labels
+
+        useful_labels = []
+        for label in metric_labels:
+            for view in useful_views:
+                if view in label:
+                    useful_labels.append(label)
+                    break
+
+        missing_columns = sorted(set(useful_labels) - set(metric_labels))
+        unused_columns = set(metric_labels) - set(useful_labels)
         LOG.debug("clean_metric_data: added %d metrics and removed %d metric.",
                   len(missing_columns), len(unused_columns))
         default_val = 0
-        metric_cat_size = len(metric_cat)
-        matrix = np.ones((len(metric_matrix), metric_cat_size)) * default_val
+        useful_labels_size = len(useful_labels)
+        matrix = np.ones((len(metric_matrix), useful_labels_size)) * default_val
         metric_labels_dict = {n: i for i, n in enumerate(metric_labels)}
-        # column labels in matrix has the same order as ones in metric catalog
+        # column labels in matrix have the same order as the ones in useful_labels
         # missing values are filled with default_val
-        for i, metric_name in enumerate(metric_cat):
+        for i, metric_name in enumerate(useful_labels):
             if metric_name in metric_labels_dict:
                 index = metric_labels_dict[metric_name]
                 matrix[:, i] = metric_matrix[:, index]
         LOG.debug("clean_metric_data: final ~ matrix: %s, labels: %s", matrix.shape,
-                  len(metric_cat))
-        return matrix, metric_cat
+                  useful_labels_size)
+        return matrix, useful_labels
 
     @staticmethod
     def clean_knob_data(knob_matrix, knob_labels, sessions):
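End-to-end, the new signature behaves like this (made-up matrix and labels; selected columns keep their original relative order):

    import numpy as np
    from website.utils import DataUtil

    metric_matrix = np.array([[1., 2., 3.],
                              [4., 5., 6.]])
    metric_labels = ['dba_hist_osstat.busy_time',
                     'dba_hist_sys_time_model.db_time',
                     'dba_hist_sys_time_model.db_cpu']

    matrix, labels = DataUtil.clean_metric_data(
        metric_matrix, metric_labels, ['dba_hist_sys_time_model'])
    # matrix -> [[2., 3.], [5., 6.]]
    # labels -> ['dba_hist_sys_time_model.db_time', 'dba_hist_sys_time_model.db_cpu']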