clean metrics for DDPG
This commit is contained in:
		
							parent
							
								
									2916330e89
								
							
						
					
					
						commit
						285522a0f5
					
				| 
						 | 
				
			
			@ -32,3 +32,9 @@ VIEWS_FOR_PRUNING = {
 | 
			
		|||
    DBMSType.ORACLE: ['dba_hist_osstat', 'dba_hist_sysstat', 'dba_hist_system_event',
 | 
			
		||||
                      'dba_workload_replays', 'dba_hist_sys_time_model'],
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# The views used for DDPG
 | 
			
		||||
# WARNING: modifying this parameter will break all existing DDPG sessions
 | 
			
		||||
VIEWS_FOR_DDPG = {
 | 
			
		||||
    DBMSType.ORACLE: ['dba_hist_sys_time_model'],
 | 
			
		||||
}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -34,7 +34,7 @@ from website.models import (PipelineData, PipelineRun, Result, Workload, Session
 | 
			
		|||
from website import db
 | 
			
		||||
from website.types import PipelineTaskType, AlgorithmType, VarType
 | 
			
		||||
from website.utils import DataUtil, JSONUtil
 | 
			
		||||
from website.settings import ENABLE_DUMMY_ENCODER, TIME_ZONE
 | 
			
		||||
from website.settings import ENABLE_DUMMY_ENCODER, TIME_ZONE, VIEWS_FOR_DDPG
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
LOG = get_task_logger(__name__)
 | 
			
		||||
| 
						 | 
				
			
			@ -420,6 +420,7 @@ def train_ddpg(train_ddpg_input):
 | 
			
		|||
    result_id, algorithm, target_data = train_ddpg_input
 | 
			
		||||
    result = Result.objects.get(pk=result_id)
 | 
			
		||||
    session = result.session
 | 
			
		||||
    dbms = session.dbms
 | 
			
		||||
    task_name = _get_task_name(session, result_id)
 | 
			
		||||
 | 
			
		||||
    # If the preprocessing method has already generated a config, bypass this method.
 | 
			
		||||
| 
						 | 
				
			
			@ -480,13 +481,14 @@ def train_ddpg(train_ddpg_input):
 | 
			
		|||
    prev_objective = prev_metric_data[target_obj_idx]
 | 
			
		||||
 | 
			
		||||
    # Clean metric data
 | 
			
		||||
    views = VIEWS_FOR_DDPG.get(dbms.type, None)
 | 
			
		||||
    metric_data, _ = DataUtil.clean_metric_data(agg_data['y_matrix'],
 | 
			
		||||
                                                agg_data['y_columnlabels'], session)
 | 
			
		||||
                                                agg_data['y_columnlabels'], views)
 | 
			
		||||
    metric_data = metric_data.flatten()
 | 
			
		||||
    metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1))
 | 
			
		||||
    normalized_metric_data = metric_scalar.transform(metric_data.reshape(1, -1))[0]
 | 
			
		||||
    prev_metric_data, _ = DataUtil.clean_metric_data(prev_agg_data['y_matrix'],
 | 
			
		||||
                                                     prev_agg_data['y_columnlabels'], session)
 | 
			
		||||
                                                     prev_agg_data['y_columnlabels'], views)
 | 
			
		||||
    prev_metric_data = prev_metric_data.flatten()
 | 
			
		||||
    prev_metric_scalar = MinMaxScaler().fit(prev_metric_data.reshape(1, -1))
 | 
			
		||||
    prev_normalized_metric_data = prev_metric_scalar.transform(prev_metric_data.reshape(1, -1))[0]
 | 
			
		||||
| 
						 | 
				
			
			@ -597,6 +599,7 @@ def configuration_recommendation_ddpg(recommendation_ddpg_input):  # pylint: dis
 | 
			
		|||
    result_list = Result.objects.filter(pk=result_id)
 | 
			
		||||
    result = result_list.first()
 | 
			
		||||
    session = result.session
 | 
			
		||||
    dbms = session.dbms
 | 
			
		||||
    task_name = _get_task_name(session, result_id)
 | 
			
		||||
 | 
			
		||||
    early_return, target_data_res = check_early_return(target_data, algorithm)
 | 
			
		||||
| 
						 | 
				
			
			@ -609,8 +612,9 @@ def configuration_recommendation_ddpg(recommendation_ddpg_input):  # pylint: dis
 | 
			
		|||
 | 
			
		||||
    params = JSONUtil.loads(session.hyperparameters)
 | 
			
		||||
    agg_data = DataUtil.aggregate_data(result_list)
 | 
			
		||||
    views = VIEWS_FOR_DDPG.get(dbms.type, None)
 | 
			
		||||
    metric_data, _ = DataUtil.clean_metric_data(agg_data['y_matrix'],
 | 
			
		||||
                                                agg_data['y_columnlabels'], session)
 | 
			
		||||
                                                agg_data['y_columnlabels'], views)
 | 
			
		||||
    metric_data = metric_data.flatten()
 | 
			
		||||
    metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1))
 | 
			
		||||
    normalized_metric_data = metric_scalar.transform(metric_data.reshape(1, -1))[0]
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -269,16 +269,8 @@ def run_workload_characterization(metric_data, dbms=None):
 | 
			
		|||
    LOG.debug("Workload characterization ~ initial data size: %s", matrix.shape)
 | 
			
		||||
 | 
			
		||||
    views = None if dbms is None else VIEWS_FOR_PRUNING.get(dbms.type, None)
 | 
			
		||||
    if views is not None:
 | 
			
		||||
        useful_labels = []
 | 
			
		||||
        for label in columnlabels:
 | 
			
		||||
            for view in views:
 | 
			
		||||
                if view in label:
 | 
			
		||||
                    useful_labels.append(label)
 | 
			
		||||
                    break
 | 
			
		||||
        matrix, columnlabels = DataUtil.clean_metric_data(matrix, columnlabels, None,
 | 
			
		||||
                                                          useful_labels)
 | 
			
		||||
        LOG.debug("Workload characterization ~ cleaned data size: %s", matrix.shape)
 | 
			
		||||
    matrix, columnlabels = DataUtil.clean_metric_data(matrix, columnlabels, views)
 | 
			
		||||
    LOG.debug("Workload characterization ~ cleaned data size: %s", matrix.shape)
 | 
			
		||||
 | 
			
		||||
    # Bin each column (metric) in the matrix by its decile
 | 
			
		||||
    binner = Bin(bin_start=1, axis=0)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -212,33 +212,35 @@ class DataUtil(object):
 | 
			
		|||
        return X_unique, y_unique, rowlabels_unique
 | 
			
		||||
 | 
			
		||||
    @staticmethod
 | 
			
		||||
    def clean_metric_data(metric_matrix, metric_labels, session, useful_labels=None):
 | 
			
		||||
        # Make metric_labels identical to useful_labels (if given)
 | 
			
		||||
        # If useful_labels is not given, let it to be all metrics in the catalog.
 | 
			
		||||
        if useful_labels is None:
 | 
			
		||||
            metric_objs = MetricCatalog.objects.filter(dbms=session.dbms)
 | 
			
		||||
            metric_cat = [session.target_objective]
 | 
			
		||||
            for metric_obj in metric_objs:
 | 
			
		||||
                metric_cat.append(metric_obj.name)
 | 
			
		||||
        else:
 | 
			
		||||
            metric_cat = useful_labels
 | 
			
		||||
        missing_columns = sorted(set(metric_cat) - set(metric_labels))
 | 
			
		||||
        unused_columns = set(metric_labels) - set(metric_cat)
 | 
			
		||||
    def clean_metric_data(metric_matrix, metric_labels, useful_views):
 | 
			
		||||
        # Keep only the metrics whose labels contain one of useful_views (if not None)
 | 
			
		||||
        if useful_views is None:
 | 
			
		||||
            return metric_matrix, metric_labels
 | 
			
		||||
 | 
			
		||||
        useful_labels = []
 | 
			
		||||
        for label in metric_labels:
 | 
			
		||||
            for view in useful_views:
 | 
			
		||||
                if view in label:
 | 
			
		||||
                    useful_labels.append(label)
 | 
			
		||||
                    break
 | 
			
		||||
 | 
			
		||||
        missing_columns = sorted(set(useful_labels) - set(metric_labels))
 | 
			
		||||
        unused_columns = set(metric_labels) - set(useful_labels)
 | 
			
		||||
        LOG.debug("clean_metric_data: added %d metrics and removed %d metric.",
 | 
			
		||||
                  len(missing_columns), len(unused_columns))
 | 
			
		||||
        default_val = 0
 | 
			
		||||
        metric_cat_size = len(metric_cat)
 | 
			
		||||
        matrix = np.ones((len(metric_matrix), metric_cat_size)) * default_val
 | 
			
		||||
        useful_labels_size = len(useful_labels)
 | 
			
		||||
        matrix = np.ones((len(metric_matrix), useful_labels_size)) * default_val
 | 
			
		||||
        metric_labels_dict = {n: i for i, n in enumerate(metric_labels)}
 | 
			
		||||
        # column labels in matrix has the same order as ones in metric catalog
 | 
			
		||||
        # column labels in matrix have the same order as those in useful_labels
 | 
			
		||||
        # missing values are filled with default_val
 | 
			
		||||
        for i, metric_name in enumerate(metric_cat):
 | 
			
		||||
        for i, metric_name in enumerate(useful_labels):
 | 
			
		||||
            if metric_name in metric_labels_dict:
 | 
			
		||||
                index = metric_labels_dict[metric_name]
 | 
			
		||||
                matrix[:, i] = metric_matrix[:, index]
 | 
			
		||||
        LOG.debug("clean_metric_data: final ~ matrix: %s, labels: %s", matrix.shape,
 | 
			
		||||
                  len(metric_cat))
 | 
			
		||||
        return matrix, metric_cat
 | 
			
		||||
                  useful_labels_size)
 | 
			
		||||
        return matrix, useful_labels
 | 
			
		||||
 | 
			
		||||
    @staticmethod
 | 
			
		||||
    def clean_knob_data(knob_matrix, knob_labels, sessions):
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue