clean metrics before pruning

yangdsh 2020-04-22 02:51:20 +00:00 committed by Dana Van Aken
parent 8da203fdc3
commit e7d5b0338a
4 changed files with 52 additions and 43 deletions

View File

@@ -24,3 +24,6 @@ KNOB_IDENT_USE_PRUNED_METRICS = False
# The background tasks only process workloads containing this minimum amount of results
MIN_WORKLOAD_RESULTS_COUNT = 5
# The views used for metrics pruning
VIEWS_FOR_PRUNING = ['dba_hist_osstat', 'dba_hist_sysstat', 'dba_hist_system_event']
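The three entries are Oracle AWR history views; only metrics whose labels come from these views are kept for pruning. A minimal sketch of the intended consumption (the label strings below are hypothetical; the plain substring test is the mechanism actually used in run_workload_characterization further down):

VIEWS_FOR_PRUNING = ['dba_hist_osstat', 'dba_hist_sysstat', 'dba_hist_system_event']

# Hypothetical metric labels for illustration only.
labels = [
    'global.dba_hist_sysstat.user commits',  # kept: contains a pruning view name
    'global.dba_hist_osstat.num_cpus',       # kept: contains a pruning view name
    'global.other_view.redo size',           # dropped: matches no pruning view
]
useful_labels = [l for l in labels if any(view in l for view in VIEWS_FOR_PRUNING)]
assert useful_labels == labels[:2]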

View File

@@ -89,30 +89,6 @@ class ConfigurationRecommendation(BaseTask): # pylint: disable=abstract-method
task_meta.save()
def clean_metric_data(metric_matrix, metric_labels, session):
# Make sure that all metrics in the DBMS are included in the metric_matrix and metric_labels
metric_objs = MetricCatalog.objects.filter(dbms=session.dbms)
metric_cat = [session.target_objective]
for metric_obj in metric_objs:
metric_cat.append(metric_obj.name)
missing_columns = sorted(set(metric_cat) - set(metric_labels))
unused_columns = set(metric_labels) - set(metric_cat)
LOG.debug("clean_metric_data: added %d metrics and removed %d metric.", len(missing_columns),
len(unused_columns))
default_val = 0
metric_cat_size = len(metric_cat)
matrix = np.ones((len(metric_matrix), metric_cat_size)) * default_val
metric_labels_dict = {n: i for i, n in enumerate(metric_labels)}
# Column labels in the matrix have the same order as those in the metric catalog;
# missing values are filled with default_val
for i, metric_name in enumerate(metric_cat):
if metric_name in metric_labels_dict:
index = metric_labels_dict[metric_name]
matrix[:, i] = metric_matrix[:, index]
LOG.debug("clean_metric_data: final ~ matrix: %s, labels: %s", matrix.shape, len(metric_cat))
return matrix, metric_cat
def save_execution_time(start_ts, fn, result):
end_ts = time.time()
exec_time = end_ts - start_ts
@@ -513,12 +489,12 @@ def train_ddpg(train_ddpg_input):
prev_objective = prev_metric_data[target_obj_idx]
# Clean metric data
metric_data, metric_labels = clean_metric_data(agg_data['y_matrix'],
metric_data, _ = DataUtil.clean_metric_data(agg_data['y_matrix'],
agg_data['y_columnlabels'], session)
metric_data = metric_data.flatten()
metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1))
normalized_metric_data = metric_scalar.transform(metric_data.reshape(1, -1))[0]
prev_metric_data, _ = clean_metric_data(prev_agg_data['y_matrix'],
prev_metric_data, _ = DataUtil.clean_metric_data(prev_agg_data['y_matrix'],
prev_agg_data['y_columnlabels'], session)
prev_metric_data = prev_metric_data.flatten()
prev_metric_scalar = MinMaxScaler().fit(prev_metric_data.reshape(1, -1))
@@ -642,7 +618,8 @@ def configuration_recommendation_ddpg(recommendation_ddpg_input): # pylint: dis
params = JSONUtil.loads(session.hyperparameters)
agg_data = DataUtil.aggregate_data(result_list)
metric_data, _ = clean_metric_data(agg_data['y_matrix'], agg_data['y_columnlabels'], session)
metric_data, _ = DataUtil.clean_metric_data(agg_data['y_matrix'],
agg_data['y_columnlabels'], session)
metric_data = metric_data.flatten()
metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1))
normalized_metric_data = metric_scalar.transform(metric_data.reshape(1, -1))[0]
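Both DDPG call sites follow the same pattern: clean the metrics, flatten them into a single row, then min-max scale that row. A self-contained sketch with made-up numbers (only numpy and scikit-learn are assumed):

import numpy as np
from sklearn.preprocessing import MinMaxScaler

metric_data = np.array([[10.0, 0.5], [20.0, 0.7]]).flatten()  # toy cleaned metrics
# reshape(1, -1) because scikit-learn scalers expect a 2D (samples x features) array
metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1))
normalized_metric_data = metric_scalar.transform(metric_data.reshape(1, -1))[0]
# Note: fitting and transforming the same single sample makes per-feature
# min == max, so every entry maps to the lower bound of the range (0 here).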

View File

@@ -22,7 +22,7 @@ from analysis.preprocessing import (Bin, get_shuffle_indices,
consolidate_columnlabels)
from website.models import PipelineData, PipelineRun, Result, Workload, ExecutionTime
from website.settings import (ENABLE_DUMMY_ENCODER, KNOB_IDENT_USE_PRUNED_METRICS,
MIN_WORKLOAD_RESULTS_COUNT, TIME_ZONE)
MIN_WORKLOAD_RESULTS_COUNT, TIME_ZONE, VIEWS_FOR_PRUNING)
from website.types import PipelineTaskType, WorkloadStatusType
from website.utils import DataUtil, JSONUtil
@@ -82,8 +82,6 @@ def run_background_tasks():
# Check that there are enough results in the workload
LOG.info("Not enough results in workload %s (# results: %s, # required: %s).",
workload_name, num_wkld_results, MIN_WORKLOAD_RESULTS_COUNT)
workload.status = WorkloadStatusType.PROCESSED
workload.save()
continue
LOG.info("Aggregating data for workload %s...", workload_name)
@@ -92,18 +90,11 @@
LOG.debug("Aggregated knob data: rowlabels=%s, columnlabels=%s, data=%s.",
len(knob_data['rowlabels']), len(knob_data['columnlabels']),
knob_data['data'].shape)
LOG.debug("Aggregated metric data: rowlabels=%s, columnlabels=%s, data=%s.",
len(metric_data['rowlabels']), len(metric_data['columnlabels']),
metric_data['data'].shape)
LOG.info("Done aggregating data for workload %s.", workload_name)
num_valid_results = knob_data['data'].shape[0] # pylint: disable=unsubscriptable-object
if num_valid_results < MIN_WORKLOAD_RESULTS_COUNT:
# Check that there are enough valid results in the workload
LOG.info("Not enough valid results in workload %s (# valid results: "
"%s, # required: %s).", workload_name, num_valid_results,
MIN_WORKLOAD_RESULTS_COUNT)
workload.status = WorkloadStatusType.PROCESSED
workload.save()
continue
# Knob_data and metric_data are 2D numpy arrays. Convert them into
# JSON-friendly (nested) lists and then save them as new PipelineData
# objects.
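The conversion described in the comment is a one-liner per array: ndarray.tolist() produces nested Python lists that serialize cleanly. A sketch with the standard json module (the real code goes through JSONUtil and PipelineData):

import json
import numpy as np

matrix = np.arange(6.0).reshape(2, 3)             # stand-in for knob/metric data
payload = json.dumps({'data': matrix.tolist()})   # nested lists are JSON-friendly
restored = np.array(json.loads(payload)['data'])  # round-trips back to a 2D array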
@@ -264,6 +255,14 @@ def run_workload_characterization(metric_data):
matrix = metric_data['data']
columnlabels = metric_data['columnlabels']
LOG.debug("Workload characterization ~ initial data size: %s", matrix.shape)
useful_labels = []
for label in columnlabels:
for view in VIEWS_FOR_PRUNING:
if view in label:
useful_labels.append(label)
break
matrix, columnlabels = DataUtil.clean_metric_data(matrix, columnlabels, None, useful_labels)
LOG.debug("Workload characterization ~ cleaned data size: %s", matrix.shape)
# Bin each column (metric) in the matrix by its decile
binner = Bin(bin_start=1, axis=0)
@@ -298,6 +297,7 @@
# Components: metrics * factors
components = fa_model.components_.T.copy()
LOG.info("Workload characterization first part costs %.0f seconds.", time.time() - start_ts)
# Run KMeans for # clusters k in range(1, num_nonduplicate_metrics - 1).
# K should be much smaller than n_cols in DetK; for now, max_cluster <= 20.
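The sweep described in the comment can be sketched with plain scikit-learn; OtterTune's own DetK/K-selection lives in its analysis package, so the stand-in below only illustrates fitting KMeans across candidate cluster counts:

import numpy as np
from sklearn.cluster import KMeans

components = np.random.rand(50, 5)  # stand-in for fa_model.components_.T
max_cluster = min(20, len(components) - 1)
kmeans_models = {
    k: KMeans(n_clusters=k, n_init=10, random_state=0).fit(components)
    for k in range(1, max_cluster + 1)
}
# A K-selection heuristic (e.g. DetK or the gap statistic) then picks k.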

View File

@@ -25,7 +25,7 @@ from django.utils.text import capfirst
from django_db_logger.models import StatusLog
from djcelery.models import TaskMeta
from .models import DBMSCatalog, KnobCatalog, Result, Session, SessionKnob
from .models import DBMSCatalog, MetricCatalog, KnobCatalog, Result, Session, SessionKnob
from .settings import common
from .types import LabelStyleType, VarType
@@ -211,6 +211,35 @@ class DataUtil(object):
rowlabels_unique[i] = tuple(rowlabels[dup_idxs])
return X_unique, y_unique, rowlabels_unique
@staticmethod
def clean_metric_data(metric_matrix, metric_labels, session, useful_labels=None):
# Make metric_labels identical to useful_labels (if given).
# If useful_labels is not given, let it be all metrics in the catalog.
if useful_labels is None:
metric_objs = MetricCatalog.objects.filter(dbms=session.dbms)
metric_cat = [session.target_objective]
for metric_obj in metric_objs:
metric_cat.append(metric_obj.name)
else:
metric_cat = useful_labels
missing_columns = sorted(set(metric_cat) - set(metric_labels))
unused_columns = set(metric_labels) - set(metric_cat)
LOG.debug("clean_metric_data: added %d metrics and removed %d metric.",
len(missing_columns), len(unused_columns))
default_val = 0
metric_cat_size = len(metric_cat)
matrix = np.ones((len(metric_matrix), metric_cat_size)) * default_val
metric_labels_dict = {n: i for i, n in enumerate(metric_labels)}
# Column labels in the matrix have the same order as those in the metric catalog;
# missing values are filled with default_val
for i, metric_name in enumerate(metric_cat):
if metric_name in metric_labels_dict:
index = metric_labels_dict[metric_name]
matrix[:, i] = metric_matrix[:, index]
LOG.debug("clean_metric_data: final ~ matrix: %s, labels: %s", matrix.shape,
len(metric_cat))
return matrix, metric_cat
@staticmethod
def clean_knob_data(knob_matrix, knob_labels, sessions):
# Filter and amend knob_matrix and knob_labels according to the tunable knobs in the session
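For reference, the alignment behavior of the new DataUtil.clean_metric_data in isolation: output columns follow metric_cat order, metrics missing from the input are filled with default_val, and input columns outside the catalog are dropped. A standalone sketch with hypothetical metric names:

import numpy as np

metric_cat = ['throughput', 'cpu_util', 'buffer_hit_ratio']  # catalog order
metric_labels = ['buffer_hit_ratio', 'cpu_util']             # input lacks 'throughput'
metric_matrix = np.array([[0.90, 40.0],
                          [0.85, 55.0]])

matrix = np.zeros((len(metric_matrix), len(metric_cat)))     # default_val = 0
label_idx = {n: i for i, n in enumerate(metric_labels)}
for i, name in enumerate(metric_cat):
    if name in label_idx:
        matrix[:, i] = metric_matrix[:, label_idx[name]]
# matrix -> [[0.0, 40.0, 0.90],
#            [0.0, 55.0, 0.85]]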