From d5f7ae65eb122390da57749b39e0b3073b00a298 Mon Sep 17 00:00:00 2001
From: bohanjason <bohanzhang95@gmail.com>
Date: Sat, 14 Dec 2019 23:41:58 -0500
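Subject: [PATCH] disable dummy encoder in knob identification

Run the DummyEncoder step in run_knob_identification only when the
ENABLE_DUMMY_ENCODER setting is true. Also sort the ranked knobs in
configuration_recommendation.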

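---
NOTE(review): when ENABLE_DUMMY_ENCODER is False, the new else branch in
run_knob_identification sets encoded_knob_columnlabels but never assigns
encoded_knob_matrix, which was assigned unconditionally before this patch.
If the code after this hunk still reads encoded_knob_matrix (TODO confirm),
that path will raise NameError; the else branch presumably also needs
`encoded_knob_matrix = nonconst_knob_matrix`.
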
 server/website/website/tasks/async_tasks.py   |  2 ++
 .../website/website/tasks/periodic_tasks.py   | 27 ++++++++++---------
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/server/website/website/tasks/async_tasks.py b/server/website/website/tasks/async_tasks.py
index 45cb8a7..1591474 100644
--- a/server/website/website/tasks/async_tasks.py
+++ b/server/website/website/tasks/async_tasks.py
@@ -482,6 +482,8 @@ def configuration_recommendation(recommendation_input):
         workload=mapped_workload,
         task_type=PipelineTaskType.RANKED_KNOBS)
     ranked_knobs = JSONUtil.loads(ranked_knobs.data)[:IMPORTANT_KNOB_NUMBER]
+    # Sort the important knobs to keep the columns of input X in a fixed order
+    ranked_knobs = sorted(ranked_knobs)
     ranked_knob_idxs = [i for i, cl in enumerate(X_columnlabels) if cl in ranked_knobs]
     X_workload = X_workload[:, ranked_knob_idxs]
     X_target = X_target[:, ranked_knob_idxs]
diff --git a/server/website/website/tasks/periodic_tasks.py b/server/website/website/tasks/periodic_tasks.py
index 670bea8..ef259e7 100644
--- a/server/website/website/tasks/periodic_tasks.py
+++ b/server/website/website/tasks/periodic_tasks.py
@@ -18,7 +18,7 @@ from analysis.preprocessing import (Bin, get_shuffle_indices,
                                     DummyEncoder,
                                     consolidate_columnlabels)
 from website.models import PipelineData, PipelineRun, Result, Workload
-from website.settings import RUN_EVERY
+from website.settings import RUN_EVERY, ENABLE_DUMMY_ENCODER
 from website.types import PipelineTaskType, WorkloadStatusType
 from website.utils import DataUtil, JSONUtil
 
@@ -296,18 +296,21 @@ def run_knob_identification(knob_data, metric_data, dbms):
             nonconst_metric_columnlabels.append(cl)
     nonconst_metric_matrix = np.hstack(nonconst_metric_matrix)
 
-    # determine which knobs need encoding (enums with >2 possible values)
+    if ENABLE_DUMMY_ENCODER:
+        # determine which knobs need encoding (enums with >2 possible values)
 
-    categorical_info = DataUtil.dummy_encoder_helper(nonconst_knob_columnlabels,
-                                                     dbms)
-    # encode categorical variable first (at least, before standardize)
-    dummy_encoder = DummyEncoder(categorical_info['n_values'],
-                                 categorical_info['categorical_features'],
-                                 categorical_info['cat_columnlabels'],
-                                 categorical_info['noncat_columnlabels'])
-    encoded_knob_matrix = dummy_encoder.fit_transform(
-        nonconst_knob_matrix)
-    encoded_knob_columnlabels = dummy_encoder.new_labels
+        categorical_info = DataUtil.dummy_encoder_helper(nonconst_knob_columnlabels,
+                                                         dbms)
+        # encode categorical variable first (at least, before standardize)
+        dummy_encoder = DummyEncoder(categorical_info['n_values'],
+                                     categorical_info['categorical_features'],
+                                     categorical_info['cat_columnlabels'],
+                                     categorical_info['noncat_columnlabels'])
+        encoded_knob_matrix = dummy_encoder.fit_transform(
+            nonconst_knob_matrix)
+        encoded_knob_columnlabels = dummy_encoder.new_labels
+    else:
+        encoded_knob_columnlabels = nonconst_knob_columnlabels
 
     # standardize values in each column to N(0, 1)
     standardizer = StandardScaler()