From efa02899b5a2b46f4260095bb75d2610d9874f74 Mon Sep 17 00:00:00 2001
From: dvanaken <dvanaken@cs.cmu.edu>
Date: Mon, 13 Apr 2020 23:54:44 -0400
Subject: [PATCH] When ranking knobs in the background tasks use the target
 objective(s) by default. Added an option that when enabled also uses the
 pruned metrics.

---
 server/website/website/admin.py               |  2 +-
 server/website/website/settings/constants.py  | 13 ++++++++
 .../website/website/tasks/periodic_tasks.py   | 31 ++++++++++++-------
 3 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/server/website/website/admin.py b/server/website/website/admin.py
index d3b7b49..0862836 100644
--- a/server/website/website/admin.py
+++ b/server/website/website/admin.py
@@ -146,7 +146,7 @@ class TaskMetaAdmin(admin.ModelAdmin):
     fields = readonly_fields
     list_filter = ('status',)
     list_per_page = 10
-    ordering = ('date_done',)
+    ordering = ('-date_done',)
     max_field_length = 1000
 
     @staticmethod
diff --git a/server/website/website/settings/constants.py b/server/website/website/settings/constants.py
index 7fb1fec..f078417 100644
--- a/server/website/website/settings/constants.py
+++ b/server/website/website/settings/constants.py
@@ -11,3 +11,16 @@ CHECK_CELERY = True
 
 # address categorical knobs (enum, boolean)
 ENABLE_DUMMY_ENCODER = False
+
+# Whether to include the pruned metrics from the workload characterization subtask in
+# the output (y) when ranking the knobs for a given workload in the knob identification
+# subtask.
+
+# When computing the ranked knobs in the knob identification subtask, the output (y) is
+# the set of target objectives used to tune the given workload. If this flag is enabled
+# then the pruned metrics from the workload characterization subtask are also included
+# in the output. (See website/tasks/periodic_tasks.py)
+KNOB_IDENT_USE_PRUNED_METRICS = False
+
+# The background tasks only process workloads that contain at least this many results
+MIN_WORKLOAD_RESULTS_COUNT = 5
diff --git a/server/website/website/tasks/periodic_tasks.py b/server/website/website/tasks/periodic_tasks.py
index 466ad19..a744a61 100644
--- a/server/website/website/tasks/periodic_tasks.py
+++ b/server/website/website/tasks/periodic_tasks.py
@@ -21,14 +21,13 @@ from analysis.preprocessing import (Bin, get_shuffle_indices,
                                     DummyEncoder,
                                     consolidate_columnlabels)
 from website.models import PipelineData, PipelineRun, Result, Workload, ExecutionTime
-from website.settings import ENABLE_DUMMY_ENCODER, TIME_ZONE
+from website.settings import (ENABLE_DUMMY_ENCODER, KNOB_IDENT_USE_PRUNED_METRICS,
+                              MIN_WORKLOAD_RESULTS_COUNT, TIME_ZONE)
 from website.types import PipelineTaskType, WorkloadStatusType
 from website.utils import DataUtil, JSONUtil
 
 # Log debug messages
 LOG = get_task_logger(__name__)
-# Only process workload containing this minimum amount of results
-MIN_WORKLOAD_RESULTS_COUNT = 5
 
 
 def save_execution_time(start_ts, fn):
@@ -134,21 +133,31 @@ def run_background_tasks():
                                             creation_time=now())
         pruned_metrics_entry.save()
 
-        # Use the pruned metrics to filter the metric_data
-        pruned_metric_idxs = [i for i, metric_name in enumerate(metric_data['columnlabels'])
-                              if metric_name in pruned_metrics]
-        pruned_metric_data = {
-            'data': metric_data['data'][:, pruned_metric_idxs],
+        # Workload target objective data
+        ranked_knob_metrics = sorted(wkld_results.distinct('session').values_list(
+            'session__target_objective', flat=True).distinct())
+        LOG.debug("Target objectives for workload %s: %s", workload_name,
+                  ', '.join(ranked_knob_metrics))
+
+        if KNOB_IDENT_USE_PRUNED_METRICS:  # set union ('+' is undefined for sets), re-sorted
+            ranked_knob_metrics = sorted(set(ranked_knob_metrics) | set(pruned_metrics))
+
+        # Use the set of metrics to filter the metric_data
+        metric_idxs = [i for i, metric_name in enumerate(metric_data['columnlabels'])
+                       if metric_name in ranked_knob_metrics]
+        ranked_metric_data = {
+            'data': metric_data['data'][:, metric_idxs],
             'rowlabels': copy.deepcopy(metric_data['rowlabels']),
-            'columnlabels': [metric_data['columnlabels'][i] for i in pruned_metric_idxs]
+            'columnlabels': [metric_data['columnlabels'][i] for i in metric_idxs]
         }
 
         # Execute the Knob Identification task to compute an ordered list of knobs
         # ranked by their impact on the DBMS's performance. Save them in a new
         # PipelineData object.
-        LOG.info("Ranking knobs for workload %s...", workload_name)
+        LOG.info("Ranking knobs for workload %s (use pruned metric data: %s)...",
+                 workload_name, KNOB_IDENT_USE_PRUNED_METRICS)
         ranked_knobs = run_knob_identification(knob_data=knob_data,
-                                               metric_data=pruned_metric_data,
+                                               metric_data=ranked_metric_data,
                                                dbms=workload.dbms)
         LOG.info("Done ranking knobs for workload %s (# ranked knobs: %s).\n\n"
                  "Ranked knobs: %s\n", workload_name, len(ranked_knobs), ranked_knobs)