From d5f7ae65eb122390da57749b39e0b3073b00a298 Mon Sep 17 00:00:00 2001 From: bohanjason Date: Sat, 14 Dec 2019 23:41:58 -0500 Subject: [PATCH] disable dummy encoder in knob identification --- server/website/website/tasks/async_tasks.py | 2 ++ .../website/website/tasks/periodic_tasks.py | 27 ++++++++++--------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/server/website/website/tasks/async_tasks.py b/server/website/website/tasks/async_tasks.py index 45cb8a7..1591474 100644 --- a/server/website/website/tasks/async_tasks.py +++ b/server/website/website/tasks/async_tasks.py @@ -482,6 +482,8 @@ def configuration_recommendation(recommendation_input): workload=mapped_workload, task_type=PipelineTaskType.RANKED_KNOBS) ranked_knobs = JSONUtil.loads(ranked_knobs.data)[:IMPORTANT_KNOB_NUMBER] + # Sort the important knobs to fix the columns of input X + ranked_knobs = sorted(ranked_knobs) ranked_knob_idxs = [i for i, cl in enumerate(X_columnlabels) if cl in ranked_knobs] X_workload = X_workload[:, ranked_knob_idxs] X_target = X_target[:, ranked_knob_idxs] diff --git a/server/website/website/tasks/periodic_tasks.py b/server/website/website/tasks/periodic_tasks.py index 670bea8..ef259e7 100644 --- a/server/website/website/tasks/periodic_tasks.py +++ b/server/website/website/tasks/periodic_tasks.py @@ -18,7 +18,7 @@ from analysis.preprocessing import (Bin, get_shuffle_indices, DummyEncoder, consolidate_columnlabels) from website.models import PipelineData, PipelineRun, Result, Workload -from website.settings import RUN_EVERY +from website.settings import RUN_EVERY, ENABLE_DUMMY_ENCODER from website.types import PipelineTaskType, WorkloadStatusType from website.utils import DataUtil, JSONUtil @@ -296,18 +296,21 @@ def run_knob_identification(knob_data, metric_data, dbms): nonconst_metric_columnlabels.append(cl) nonconst_metric_matrix = np.hstack(nonconst_metric_matrix) - # determine which knobs need encoding (enums with >2 possible values) + if ENABLE_DUMMY_ENCODER: + # determine which knobs need encoding (enums with >2 possible values) - categorical_info = DataUtil.dummy_encoder_helper(nonconst_knob_columnlabels, - dbms) - # encode categorical variable first (at least, before standardize) - dummy_encoder = DummyEncoder(categorical_info['n_values'], - categorical_info['categorical_features'], - categorical_info['cat_columnlabels'], - categorical_info['noncat_columnlabels']) - encoded_knob_matrix = dummy_encoder.fit_transform( - nonconst_knob_matrix) - encoded_knob_columnlabels = dummy_encoder.new_labels + categorical_info = DataUtil.dummy_encoder_helper(nonconst_knob_columnlabels, + dbms) + # encode categorical variable first (at least, before standardize) + dummy_encoder = DummyEncoder(categorical_info['n_values'], + categorical_info['categorical_features'], + categorical_info['cat_columnlabels'], + categorical_info['noncat_columnlabels']) + encoded_knob_matrix = dummy_encoder.fit_transform( + nonconst_knob_matrix) + encoded_knob_columnlabels = dummy_encoder.new_labels + else: + encoded_knob_columnlabels = nonconst_knob_columnlabels # standardize values in each column to N(0, 1) standardizer = StandardScaler()