clean metrics for DDPG

This commit is contained in:
yangdsh 2020-04-28 17:58:50 +00:00 committed by Dana Van Aken
parent 2916330e89
commit 285522a0f5
4 changed files with 36 additions and 32 deletions

View File

@ -32,3 +32,9 @@ VIEWS_FOR_PRUNING = {
DBMSType.ORACLE: ['dba_hist_osstat', 'dba_hist_sysstat', 'dba_hist_system_event',
'dba_workload_replays', 'dba_hist_sys_time_model'],
}
# The views used for DDPG
# WARNING: modifying this parameter will cause all existing DDPG sessions broken
VIEWS_FOR_DDPG = {
DBMSType.ORACLE: ['dba_hist_sys_time_model'],
}

View File

@ -34,7 +34,7 @@ from website.models import (PipelineData, PipelineRun, Result, Workload, Session
from website import db
from website.types import PipelineTaskType, AlgorithmType, VarType
from website.utils import DataUtil, JSONUtil
from website.settings import ENABLE_DUMMY_ENCODER, TIME_ZONE
from website.settings import ENABLE_DUMMY_ENCODER, TIME_ZONE, VIEWS_FOR_DDPG
LOG = get_task_logger(__name__)
@ -420,6 +420,7 @@ def train_ddpg(train_ddpg_input):
result_id, algorithm, target_data = train_ddpg_input
result = Result.objects.get(pk=result_id)
session = result.session
dbms = session.dbms
task_name = _get_task_name(session, result_id)
# If the preprocessing method has already generated a config, bypass this method.
@ -480,13 +481,14 @@ def train_ddpg(train_ddpg_input):
prev_objective = prev_metric_data[target_obj_idx]
# Clean metric data
views = VIEWS_FOR_DDPG.get(dbms.type, None)
metric_data, _ = DataUtil.clean_metric_data(agg_data['y_matrix'],
agg_data['y_columnlabels'], session)
agg_data['y_columnlabels'], views)
metric_data = metric_data.flatten()
metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1))
normalized_metric_data = metric_scalar.transform(metric_data.reshape(1, -1))[0]
prev_metric_data, _ = DataUtil.clean_metric_data(prev_agg_data['y_matrix'],
prev_agg_data['y_columnlabels'], session)
prev_agg_data['y_columnlabels'], views)
prev_metric_data = prev_metric_data.flatten()
prev_metric_scalar = MinMaxScaler().fit(prev_metric_data.reshape(1, -1))
prev_normalized_metric_data = prev_metric_scalar.transform(prev_metric_data.reshape(1, -1))[0]
@ -597,6 +599,7 @@ def configuration_recommendation_ddpg(recommendation_ddpg_input): # pylint: dis
result_list = Result.objects.filter(pk=result_id)
result = result_list.first()
session = result.session
dbms = session.dbms
task_name = _get_task_name(session, result_id)
early_return, target_data_res = check_early_return(target_data, algorithm)
@ -609,8 +612,9 @@ def configuration_recommendation_ddpg(recommendation_ddpg_input): # pylint: dis
params = JSONUtil.loads(session.hyperparameters)
agg_data = DataUtil.aggregate_data(result_list)
views = VIEWS_FOR_DDPG.get(dbms.type, None)
metric_data, _ = DataUtil.clean_metric_data(agg_data['y_matrix'],
agg_data['y_columnlabels'], session)
agg_data['y_columnlabels'], views)
metric_data = metric_data.flatten()
metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1))
normalized_metric_data = metric_scalar.transform(metric_data.reshape(1, -1))[0]

View File

@ -269,15 +269,7 @@ def run_workload_characterization(metric_data, dbms=None):
LOG.debug("Workload characterization ~ initial data size: %s", matrix.shape)
views = None if dbms is None else VIEWS_FOR_PRUNING.get(dbms.type, None)
if views is not None:
useful_labels = []
for label in columnlabels:
for view in views:
if view in label:
useful_labels.append(label)
break
matrix, columnlabels = DataUtil.clean_metric_data(matrix, columnlabels, None,
useful_labels)
matrix, columnlabels = DataUtil.clean_metric_data(matrix, columnlabels, views)
LOG.debug("Workload characterization ~ cleaned data size: %s", matrix.shape)
# Bin each column (metric) in the matrix by its decile

View File

@ -212,33 +212,35 @@ class DataUtil(object):
return X_unique, y_unique, rowlabels_unique
@staticmethod
def clean_metric_data(metric_matrix, metric_labels, session, useful_labels=None):
# Make metric_labels identical to useful_labels (if given)
# If useful_labels is not given, let it to be all metrics in the catalog.
if useful_labels is None:
metric_objs = MetricCatalog.objects.filter(dbms=session.dbms)
metric_cat = [session.target_objective]
for metric_obj in metric_objs:
metric_cat.append(metric_obj.name)
else:
metric_cat = useful_labels
missing_columns = sorted(set(metric_cat) - set(metric_labels))
unused_columns = set(metric_labels) - set(metric_cat)
def clean_metric_data(metric_matrix, metric_labels, useful_views):
# Make metric_labels identical to useful_labels (if not None)
if useful_views is None:
return metric_matrix, metric_labels
useful_labels = []
for label in metric_labels:
for view in useful_views:
if view in label:
useful_labels.append(label)
break
missing_columns = sorted(set(useful_labels) - set(metric_labels))
unused_columns = set(metric_labels) - set(useful_labels)
LOG.debug("clean_metric_data: added %d metrics and removed %d metric.",
len(missing_columns), len(unused_columns))
default_val = 0
metric_cat_size = len(metric_cat)
matrix = np.ones((len(metric_matrix), metric_cat_size)) * default_val
useful_labels_size = len(useful_labels)
matrix = np.ones((len(metric_matrix), useful_labels_size)) * default_val
metric_labels_dict = {n: i for i, n in enumerate(metric_labels)}
# column labels in matrix has the same order as ones in metric catalog
# column labels in matrix has the same order as ones in useful_labels
# missing values are filled with default_val
for i, metric_name in enumerate(metric_cat):
for i, metric_name in enumerate(useful_labels):
if metric_name in metric_labels_dict:
index = metric_labels_dict[metric_name]
matrix[:, i] = metric_matrix[:, index]
LOG.debug("clean_metric_data: final ~ matrix: %s, labels: %s", matrix.shape,
len(metric_cat))
return matrix, metric_cat
useful_labels_size)
return matrix, useful_labels
@staticmethod
def clean_knob_data(knob_matrix, knob_labels, sessions):