From 285522a0f5f2be538fa095c10e156e4a8638ed8d Mon Sep 17 00:00:00 2001 From: yangdsh Date: Tue, 28 Apr 2020 17:58:50 +0000 Subject: [PATCH] clean metrics for DDPG --- server/website/website/settings/constants.py | 6 +++ server/website/website/tasks/async_tasks.py | 12 ++++-- .../website/website/tasks/periodic_tasks.py | 12 +----- server/website/website/utils.py | 38 ++++++++++--------- 4 files changed, 36 insertions(+), 32 deletions(-) diff --git a/server/website/website/settings/constants.py b/server/website/website/settings/constants.py index 64ed521..c2477bc 100644 --- a/server/website/website/settings/constants.py +++ b/server/website/website/settings/constants.py @@ -32,3 +32,9 @@ VIEWS_FOR_PRUNING = { DBMSType.ORACLE: ['dba_hist_osstat', 'dba_hist_sysstat', 'dba_hist_system_event', 'dba_workload_replays', 'dba_hist_sys_time_model'], } + +# The views used for DDPG +# WARNING: modifying this parameter will cause all existing DDPG sessions broken +VIEWS_FOR_DDPG = { + DBMSType.ORACLE: ['dba_hist_sys_time_model'], +} diff --git a/server/website/website/tasks/async_tasks.py b/server/website/website/tasks/async_tasks.py index 78ec30f..e63d377 100644 --- a/server/website/website/tasks/async_tasks.py +++ b/server/website/website/tasks/async_tasks.py @@ -34,7 +34,7 @@ from website.models import (PipelineData, PipelineRun, Result, Workload, Session from website import db from website.types import PipelineTaskType, AlgorithmType, VarType from website.utils import DataUtil, JSONUtil -from website.settings import ENABLE_DUMMY_ENCODER, TIME_ZONE +from website.settings import ENABLE_DUMMY_ENCODER, TIME_ZONE, VIEWS_FOR_DDPG LOG = get_task_logger(__name__) @@ -420,6 +420,7 @@ def train_ddpg(train_ddpg_input): result_id, algorithm, target_data = train_ddpg_input result = Result.objects.get(pk=result_id) session = result.session + dbms = session.dbms task_name = _get_task_name(session, result_id) # If the preprocessing method has already generated a config, bypass this method. @@ -480,13 +481,14 @@ def train_ddpg(train_ddpg_input): prev_objective = prev_metric_data[target_obj_idx] # Clean metric data + views = VIEWS_FOR_DDPG.get(dbms.type, None) metric_data, _ = DataUtil.clean_metric_data(agg_data['y_matrix'], - agg_data['y_columnlabels'], session) + agg_data['y_columnlabels'], views) metric_data = metric_data.flatten() metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1)) normalized_metric_data = metric_scalar.transform(metric_data.reshape(1, -1))[0] prev_metric_data, _ = DataUtil.clean_metric_data(prev_agg_data['y_matrix'], - prev_agg_data['y_columnlabels'], session) + prev_agg_data['y_columnlabels'], views) prev_metric_data = prev_metric_data.flatten() prev_metric_scalar = MinMaxScaler().fit(prev_metric_data.reshape(1, -1)) prev_normalized_metric_data = prev_metric_scalar.transform(prev_metric_data.reshape(1, -1))[0] @@ -597,6 +599,7 @@ def configuration_recommendation_ddpg(recommendation_ddpg_input): # pylint: dis result_list = Result.objects.filter(pk=result_id) result = result_list.first() session = result.session + dbms = session.dbms task_name = _get_task_name(session, result_id) early_return, target_data_res = check_early_return(target_data, algorithm) @@ -609,8 +612,9 @@ def configuration_recommendation_ddpg(recommendation_ddpg_input): # pylint: dis params = JSONUtil.loads(session.hyperparameters) agg_data = DataUtil.aggregate_data(result_list) + views = VIEWS_FOR_DDPG.get(dbms.type, None) metric_data, _ = DataUtil.clean_metric_data(agg_data['y_matrix'], - agg_data['y_columnlabels'], session) + agg_data['y_columnlabels'], views) metric_data = metric_data.flatten() metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1)) normalized_metric_data = metric_scalar.transform(metric_data.reshape(1, -1))[0] diff --git a/server/website/website/tasks/periodic_tasks.py b/server/website/website/tasks/periodic_tasks.py index d247a86..9fffd0b 100644 --- a/server/website/website/tasks/periodic_tasks.py +++ b/server/website/website/tasks/periodic_tasks.py @@ -269,16 +269,8 @@ def run_workload_characterization(metric_data, dbms=None): LOG.debug("Workload characterization ~ initial data size: %s", matrix.shape) views = None if dbms is None else VIEWS_FOR_PRUNING.get(dbms.type, None) - if views is not None: - useful_labels = [] - for label in columnlabels: - for view in views: - if view in label: - useful_labels.append(label) - break - matrix, columnlabels = DataUtil.clean_metric_data(matrix, columnlabels, None, - useful_labels) - LOG.debug("Workload characterization ~ cleaned data size: %s", matrix.shape) + matrix, columnlabels = DataUtil.clean_metric_data(matrix, columnlabels, views) + LOG.debug("Workload characterization ~ cleaned data size: %s", matrix.shape) # Bin each column (metric) in the matrix by its decile binner = Bin(bin_start=1, axis=0) diff --git a/server/website/website/utils.py b/server/website/website/utils.py index 9d59a27..9af261a 100644 --- a/server/website/website/utils.py +++ b/server/website/website/utils.py @@ -212,33 +212,35 @@ class DataUtil(object): return X_unique, y_unique, rowlabels_unique @staticmethod - def clean_metric_data(metric_matrix, metric_labels, session, useful_labels=None): - # Make metric_labels identical to useful_labels (if given) - # If useful_labels is not given, let it to be all metrics in the catalog. - if useful_labels is None: - metric_objs = MetricCatalog.objects.filter(dbms=session.dbms) - metric_cat = [session.target_objective] - for metric_obj in metric_objs: - metric_cat.append(metric_obj.name) - else: - metric_cat = useful_labels - missing_columns = sorted(set(metric_cat) - set(metric_labels)) - unused_columns = set(metric_labels) - set(metric_cat) + def clean_metric_data(metric_matrix, metric_labels, useful_views): + # Make metric_labels identical to useful_labels (if not None) + if useful_views is None: + return metric_matrix, metric_labels + + useful_labels = [] + for label in metric_labels: + for view in useful_views: + if view in label: + useful_labels.append(label) + break + + missing_columns = sorted(set(useful_labels) - set(metric_labels)) + unused_columns = set(metric_labels) - set(useful_labels) LOG.debug("clean_metric_data: added %d metrics and removed %d metric.", len(missing_columns), len(unused_columns)) default_val = 0 - metric_cat_size = len(metric_cat) - matrix = np.ones((len(metric_matrix), metric_cat_size)) * default_val + useful_labels_size = len(useful_labels) + matrix = np.ones((len(metric_matrix), useful_labels_size)) * default_val metric_labels_dict = {n: i for i, n in enumerate(metric_labels)} - # column labels in matrix has the same order as ones in metric catalog + # column labels in matrix has the same order as ones in useful_labels # missing values are filled with default_val - for i, metric_name in enumerate(metric_cat): + for i, metric_name in enumerate(useful_labels): if metric_name in metric_labels_dict: index = metric_labels_dict[metric_name] matrix[:, i] = metric_matrix[:, index] LOG.debug("clean_metric_data: final ~ matrix: %s, labels: %s", matrix.shape, - len(metric_cat)) - return matrix, metric_cat + useful_labels_size) + return matrix, useful_labels @staticmethod def clean_knob_data(knob_matrix, knob_labels, sessions):