clean metrics for DDPG

This commit is contained in:
yangdsh 2020-04-28 17:58:50 +00:00 committed by Dana Van Aken
parent 2916330e89
commit 285522a0f5
4 changed files with 36 additions and 32 deletions

View File

@ -32,3 +32,9 @@ VIEWS_FOR_PRUNING = {
DBMSType.ORACLE: ['dba_hist_osstat', 'dba_hist_sysstat', 'dba_hist_system_event', DBMSType.ORACLE: ['dba_hist_osstat', 'dba_hist_sysstat', 'dba_hist_system_event',
'dba_workload_replays', 'dba_hist_sys_time_model'], 'dba_workload_replays', 'dba_hist_sys_time_model'],
} }
# The DBMS views whose metrics are used as the observation/reward input
# for the DDPG algorithm.
# WARNING: modifying this setting will break all existing DDPG sessions,
# since their saved models depend on the current metric layout.
VIEWS_FOR_DDPG = {
    DBMSType.ORACLE: ['dba_hist_sys_time_model'],
}

View File

@ -34,7 +34,7 @@ from website.models import (PipelineData, PipelineRun, Result, Workload, Session
from website import db from website import db
from website.types import PipelineTaskType, AlgorithmType, VarType from website.types import PipelineTaskType, AlgorithmType, VarType
from website.utils import DataUtil, JSONUtil from website.utils import DataUtil, JSONUtil
from website.settings import ENABLE_DUMMY_ENCODER, TIME_ZONE from website.settings import ENABLE_DUMMY_ENCODER, TIME_ZONE, VIEWS_FOR_DDPG
LOG = get_task_logger(__name__) LOG = get_task_logger(__name__)
@ -420,6 +420,7 @@ def train_ddpg(train_ddpg_input):
result_id, algorithm, target_data = train_ddpg_input result_id, algorithm, target_data = train_ddpg_input
result = Result.objects.get(pk=result_id) result = Result.objects.get(pk=result_id)
session = result.session session = result.session
dbms = session.dbms
task_name = _get_task_name(session, result_id) task_name = _get_task_name(session, result_id)
# If the preprocessing method has already generated a config, bypass this method. # If the preprocessing method has already generated a config, bypass this method.
@ -480,13 +481,14 @@ def train_ddpg(train_ddpg_input):
prev_objective = prev_metric_data[target_obj_idx] prev_objective = prev_metric_data[target_obj_idx]
# Clean metric data # Clean metric data
views = VIEWS_FOR_DDPG.get(dbms.type, None)
metric_data, _ = DataUtil.clean_metric_data(agg_data['y_matrix'], metric_data, _ = DataUtil.clean_metric_data(agg_data['y_matrix'],
agg_data['y_columnlabels'], session) agg_data['y_columnlabels'], views)
metric_data = metric_data.flatten() metric_data = metric_data.flatten()
metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1)) metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1))
normalized_metric_data = metric_scalar.transform(metric_data.reshape(1, -1))[0] normalized_metric_data = metric_scalar.transform(metric_data.reshape(1, -1))[0]
prev_metric_data, _ = DataUtil.clean_metric_data(prev_agg_data['y_matrix'], prev_metric_data, _ = DataUtil.clean_metric_data(prev_agg_data['y_matrix'],
prev_agg_data['y_columnlabels'], session) prev_agg_data['y_columnlabels'], views)
prev_metric_data = prev_metric_data.flatten() prev_metric_data = prev_metric_data.flatten()
prev_metric_scalar = MinMaxScaler().fit(prev_metric_data.reshape(1, -1)) prev_metric_scalar = MinMaxScaler().fit(prev_metric_data.reshape(1, -1))
prev_normalized_metric_data = prev_metric_scalar.transform(prev_metric_data.reshape(1, -1))[0] prev_normalized_metric_data = prev_metric_scalar.transform(prev_metric_data.reshape(1, -1))[0]
@ -597,6 +599,7 @@ def configuration_recommendation_ddpg(recommendation_ddpg_input): # pylint: dis
result_list = Result.objects.filter(pk=result_id) result_list = Result.objects.filter(pk=result_id)
result = result_list.first() result = result_list.first()
session = result.session session = result.session
dbms = session.dbms
task_name = _get_task_name(session, result_id) task_name = _get_task_name(session, result_id)
early_return, target_data_res = check_early_return(target_data, algorithm) early_return, target_data_res = check_early_return(target_data, algorithm)
@ -609,8 +612,9 @@ def configuration_recommendation_ddpg(recommendation_ddpg_input): # pylint: dis
params = JSONUtil.loads(session.hyperparameters) params = JSONUtil.loads(session.hyperparameters)
agg_data = DataUtil.aggregate_data(result_list) agg_data = DataUtil.aggregate_data(result_list)
views = VIEWS_FOR_DDPG.get(dbms.type, None)
metric_data, _ = DataUtil.clean_metric_data(agg_data['y_matrix'], metric_data, _ = DataUtil.clean_metric_data(agg_data['y_matrix'],
agg_data['y_columnlabels'], session) agg_data['y_columnlabels'], views)
metric_data = metric_data.flatten() metric_data = metric_data.flatten()
metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1)) metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1))
normalized_metric_data = metric_scalar.transform(metric_data.reshape(1, -1))[0] normalized_metric_data = metric_scalar.transform(metric_data.reshape(1, -1))[0]

View File

@ -269,15 +269,7 @@ def run_workload_characterization(metric_data, dbms=None):
LOG.debug("Workload characterization ~ initial data size: %s", matrix.shape) LOG.debug("Workload characterization ~ initial data size: %s", matrix.shape)
views = None if dbms is None else VIEWS_FOR_PRUNING.get(dbms.type, None) views = None if dbms is None else VIEWS_FOR_PRUNING.get(dbms.type, None)
if views is not None: matrix, columnlabels = DataUtil.clean_metric_data(matrix, columnlabels, views)
useful_labels = []
for label in columnlabels:
for view in views:
if view in label:
useful_labels.append(label)
break
matrix, columnlabels = DataUtil.clean_metric_data(matrix, columnlabels, None,
useful_labels)
LOG.debug("Workload characterization ~ cleaned data size: %s", matrix.shape) LOG.debug("Workload characterization ~ cleaned data size: %s", matrix.shape)
# Bin each column (metric) in the matrix by its decile # Bin each column (metric) in the matrix by its decile

View File

@ -212,33 +212,35 @@ class DataUtil(object):
return X_unique, y_unique, rowlabels_unique return X_unique, y_unique, rowlabels_unique
@staticmethod @staticmethod
def clean_metric_data(metric_matrix, metric_labels, session, useful_labels=None): def clean_metric_data(metric_matrix, metric_labels, useful_views):
# Make metric_labels identical to useful_labels (if given) # Make metric_labels identical to useful_labels (if not None)
# If useful_labels is not given, let it to be all metrics in the catalog. if useful_views is None:
if useful_labels is None: return metric_matrix, metric_labels
metric_objs = MetricCatalog.objects.filter(dbms=session.dbms)
metric_cat = [session.target_objective] useful_labels = []
for metric_obj in metric_objs: for label in metric_labels:
metric_cat.append(metric_obj.name) for view in useful_views:
else: if view in label:
metric_cat = useful_labels useful_labels.append(label)
missing_columns = sorted(set(metric_cat) - set(metric_labels)) break
unused_columns = set(metric_labels) - set(metric_cat)
missing_columns = sorted(set(useful_labels) - set(metric_labels))
unused_columns = set(metric_labels) - set(useful_labels)
LOG.debug("clean_metric_data: added %d metrics and removed %d metric.", LOG.debug("clean_metric_data: added %d metrics and removed %d metric.",
len(missing_columns), len(unused_columns)) len(missing_columns), len(unused_columns))
default_val = 0 default_val = 0
metric_cat_size = len(metric_cat) useful_labels_size = len(useful_labels)
matrix = np.ones((len(metric_matrix), metric_cat_size)) * default_val matrix = np.ones((len(metric_matrix), useful_labels_size)) * default_val
metric_labels_dict = {n: i for i, n in enumerate(metric_labels)} metric_labels_dict = {n: i for i, n in enumerate(metric_labels)}
# column labels in matrix has the same order as ones in metric catalog # column labels in matrix has the same order as ones in useful_labels
# missing values are filled with default_val # missing values are filled with default_val
for i, metric_name in enumerate(metric_cat): for i, metric_name in enumerate(useful_labels):
if metric_name in metric_labels_dict: if metric_name in metric_labels_dict:
index = metric_labels_dict[metric_name] index = metric_labels_dict[metric_name]
matrix[:, i] = metric_matrix[:, index] matrix[:, i] = metric_matrix[:, index]
LOG.debug("clean_metric_data: final ~ matrix: %s, labels: %s", matrix.shape, LOG.debug("clean_metric_data: final ~ matrix: %s, labels: %s", matrix.shape,
len(metric_cat)) useful_labels_size)
return matrix, metric_cat return matrix, useful_labels
@staticmethod @staticmethod
def clean_knob_data(knob_matrix, knob_labels, sessions): def clean_knob_data(knob_matrix, knob_labels, sessions):