When ranking knobs in the background tasks use the target objective(s) by default. Added an option that when enabled also uses the pruned metrics.

This commit is contained in:
dvanaken 2020-04-13 23:54:44 -04:00 committed by Dana Van Aken
parent aeaae7d9f4
commit efa02899b5
3 changed files with 34 additions and 12 deletions

View File

@ -146,7 +146,7 @@ class TaskMetaAdmin(admin.ModelAdmin):
fields = readonly_fields
list_filter = ('status',)
list_per_page = 10
ordering = ('date_done',)
ordering = ('-date_done',)
max_field_length = 1000
@staticmethod

View File

@ -11,3 +11,16 @@ CHECK_CELERY = True
# address categorical knobs (enum, boolean)
ENABLE_DUMMY_ENCODER = False
# Whether to include the pruned metrics from the workload characterization subtask in
# the output (y) when ranking the knobs for a given workload in the knob identification
# subtask.
# When computing the ranked knobs in the knob identification subtask, the output (y) is
# the set of target objectives used to tune the given workload. If this flag is enabled
# then the pruned metrics from the workload characterization subtask are also included
# in the output. (See website/tasks/periodic_tasks.py)
KNOB_IDENT_USE_PRUNED_METRICS = False
# The background tasks only process workloads containing this minimum amount of results
MIN_WORKLOAD_RESULTS_COUNT = 5

View File

@ -21,14 +21,13 @@ from analysis.preprocessing import (Bin, get_shuffle_indices,
DummyEncoder,
consolidate_columnlabels)
from website.models import PipelineData, PipelineRun, Result, Workload, ExecutionTime
from website.settings import ENABLE_DUMMY_ENCODER, TIME_ZONE
from website.settings import (ENABLE_DUMMY_ENCODER, KNOB_IDENT_USE_PRUNED_METRICS,
MIN_WORKLOAD_RESULTS_COUNT, TIME_ZONE)
from website.types import PipelineTaskType, WorkloadStatusType
from website.utils import DataUtil, JSONUtil
# Log debug messages
LOG = get_task_logger(__name__)
# Only process workload containing this minimum amount of results
MIN_WORKLOAD_RESULTS_COUNT = 5
def save_execution_time(start_ts, fn):
@ -134,21 +133,31 @@ def run_background_tasks():
creation_time=now())
pruned_metrics_entry.save()
# Use the pruned metrics to filter the metric_data
pruned_metric_idxs = [i for i, metric_name in enumerate(metric_data['columnlabels'])
if metric_name in pruned_metrics]
pruned_metric_data = {
'data': metric_data['data'][:, pruned_metric_idxs],
# Workload target objective data
ranked_knob_metrics = sorted(wkld_results.distinct('session').values_list(
'session__target_objective', flat=True).distinct())
LOG.debug("Target objectives for workload %s: %s", workload_name,
', '.join(ranked_knob_metrics))
if KNOB_IDENT_USE_PRUNED_METRICS:
ranked_knob_metrics = sorted(set(ranked_knob_metrics) + set(pruned_metrics))
# Use the set of metrics to filter the metric_data
metric_idxs = [i for i, metric_name in enumerate(metric_data['columnlabels'])
if metric_name in ranked_knob_metrics]
ranked_metric_data = {
'data': metric_data['data'][:, metric_idxs],
'rowlabels': copy.deepcopy(metric_data['rowlabels']),
'columnlabels': [metric_data['columnlabels'][i] for i in pruned_metric_idxs]
'columnlabels': [metric_data['columnlabels'][i] for i in metric_idxs]
}
# Execute the Knob Identification task to compute an ordered list of knobs
# ranked by their impact on the DBMS's performance. Save them in a new
# PipelineData object.
LOG.info("Ranking knobs for workload %s...", workload_name)
LOG.info("Ranking knobs for workload %s (use pruned metric data: %s)...",
workload_name, KNOB_IDENT_USE_PRUNED_METRICS)
ranked_knobs = run_knob_identification(knob_data=knob_data,
metric_data=pruned_metric_data,
metric_data=ranked_metric_data,
dbms=workload.dbms)
LOG.info("Done ranking knobs for workload %s (# ranked knobs: %s).\n\n"
"Ranked knobs: %s\n", workload_name, len(ranked_knobs), ranked_knobs)