When ranking knobs in the background tasks, use the target objective(s) by default. Added an option that, when enabled, also uses the pruned metrics.

2020-04-13 23:54:44 -04:00 · 2020-04-13 23:54:44 -04:00 · efa02899b5
parent aeaae7d9f4
commit efa02899b5
3 changed files with 34 additions and 12 deletions
--- a/server/website/website/admin.py
+++ b/server/website/website/admin.py
@ -146,7 +146,7 @@ class TaskMetaAdmin(admin.ModelAdmin):
    fields = readonly_fields
    list_filter = ('status',)
    list_per_page = 10
-    ordering = ('date_done',)
+    ordering = ('-date_done',)
    max_field_length = 1000

    @staticmethod
--- a/server/website/website/settings/constants.py
+++ b/server/website/website/settings/constants.py
@ -11,3 +11,16 @@ CHECK_CELERY = True

 # address categorical knobs (enum, boolean)
 ENABLE_DUMMY_ENCODER = False
+
+# Whether to include the pruned metrics from the workload characterization subtask in
+# the output (y) when ranking the knobs for a given workload in the knob identification
+# subtask.
+
+# When computing the ranked knobs in the knob identification subtask, the output (y) is
+# the set of target objectives used to tune the given workload. If this flag is enabled
+# then the pruned metrics from the workload characterization subtask are also included
+# in the output. (See website/tasks/periodic_tasks.py)
+KNOB_IDENT_USE_PRUNED_METRICS = False
+
+# The background tasks only process workloads containing at least this many results
+MIN_WORKLOAD_RESULTS_COUNT = 5
--- a/server/website/website/tasks/periodic_tasks.py
+++ b/server/website/website/tasks/periodic_tasks.py
@ -21,14 +21,13 @@ from analysis.preprocessing import (Bin, get_shuffle_indices,
                                    DummyEncoder,
                                    consolidate_columnlabels)
 from website.models import PipelineData, PipelineRun, Result, Workload, ExecutionTime
-from website.settings import ENABLE_DUMMY_ENCODER, TIME_ZONE
+from website.settings import (ENABLE_DUMMY_ENCODER, KNOB_IDENT_USE_PRUNED_METRICS,
+                              MIN_WORKLOAD_RESULTS_COUNT, TIME_ZONE)
 from website.types import PipelineTaskType, WorkloadStatusType
 from website.utils import DataUtil, JSONUtil

 # Log debug messages
 LOG = get_task_logger(__name__)
-# Only process workload containing this minimum amount of results
-MIN_WORKLOAD_RESULTS_COUNT = 5


 def save_execution_time(start_ts, fn):
@ -134,21 +133,31 @@ def run_background_tasks():
                                            creation_time=now())
        pruned_metrics_entry.save()

-        # Use the pruned metrics to filter the metric_data
-        pruned_metric_idxs = [i for i, metric_name in enumerate(metric_data['columnlabels'])
-                              if metric_name in pruned_metrics]
-        pruned_metric_data = {
-            'data': metric_data['data'][:, pruned_metric_idxs],
+        # Workload target objective data
+        ranked_knob_metrics = sorted(wkld_results.distinct('session').values_list(
+            'session__target_objective', flat=True).distinct())
+        LOG.debug("Target objectives for workload %s: %s", workload_name,
+                  ', '.join(ranked_knob_metrics))
+
+        if KNOB_IDENT_USE_PRUNED_METRICS:
+            ranked_knob_metrics = sorted(set(ranked_knob_metrics) | set(pruned_metrics))
+
+        # Use the set of metrics to filter the metric_data
+        metric_idxs = [i for i, metric_name in enumerate(metric_data['columnlabels'])
+                       if metric_name in ranked_knob_metrics]
+        ranked_metric_data = {
+            'data': metric_data['data'][:, metric_idxs],
            'rowlabels': copy.deepcopy(metric_data['rowlabels']),
-            'columnlabels': [metric_data['columnlabels'][i] for i in pruned_metric_idxs]
+            'columnlabels': [metric_data['columnlabels'][i] for i in metric_idxs]
        }

        # Execute the Knob Identification task to compute an ordered list of knobs
        # ranked by their impact on the DBMS's performance. Save them in a new
        # PipelineData object.
-        LOG.info("Ranking knobs for workload %s...", workload_name)
+        LOG.info("Ranking knobs for workload %s (use pruned metric data: %s)...",
+                 workload_name, KNOB_IDENT_USE_PRUNED_METRICS)
        ranked_knobs = run_knob_identification(knob_data=knob_data,
-                                               metric_data=pruned_metric_data,
+                                               metric_data=ranked_metric_data,
                                               dbms=workload.dbms)
        LOG.info("Done ranking knobs for workload %s (# ranked knobs: %s).\n\n"
                 "Ranked knobs: %s\n", workload_name, len(ranked_knobs), ranked_knobs)