ottertune/server/website/website/db/base/parser.py

#
# OtterTune - parser.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
import logging

from collections import OrderedDict

from website.models import KnobCatalog, MetricCatalog
from website.types import BooleanType, KnobUnitType, MetricType, VarType
from website.utils import ConversionUtil
from .. import target_objectives

LOG = logging.getLogger(__name__)


# pylint: disable=no-self-use
class BaseParser:

    def __init__(self, dbms_obj):
        """Bind the parser to one DBMS and set its default conversion settings.

        Args:
            dbms_obj: object whose ``pk`` attribute is stored (as an int)
                in ``self.dbms_id``.
        """
        # Used by the other methods to filter the knob/metric catalogs.
        self.dbms_id = int(dbms_obj.pk)

        # Spellings accepted when parsing booleans (compared after lower-casing
        # string input) and the canonical spellings emitted by format_bool().
        self.valid_true_val = ("on", "true", "yes")
        self.true_value = 'on'
        self.valid_false_val = ("off", "false", "no")
        self.false_value = 'off'

        # Unit systems handed to ConversionUtil, plus the smallest unit used
        # when formatting byte sizes and durations.
        self.bytes_system = ConversionUtil.DEFAULT_BYTES_SYSTEM
        self.min_bytes_unit = 'kB'
        self.time_system = ConversionUtil.DEFAULT_TIME_SYSTEM
        self.min_time_unit = 'ms'

    def parse_version_string(self, version_string):
        """Return ``version_string`` unchanged.

        The base implementation performs no parsing at all.
        """
        return version_string

    def convert_bool(self, bool_value, metadata):
        """Map a boolean spelling onto ``BooleanType.TRUE`` / ``BooleanType.FALSE``.

        String input is lower-cased before it is compared against
        ``self.valid_true_val`` and ``self.valid_false_val``; ``metadata`` is
        not used here.

        Raises:
            Exception: if the (lower-cased) value is in neither tuple.
        """
        candidate = bool_value.lower() if isinstance(bool_value, str) else bool_value

        if candidate in self.valid_true_val:
            return BooleanType.TRUE
        if candidate in self.valid_false_val:
            return BooleanType.FALSE
        raise Exception("Invalid Boolean {}".format(candidate))

    def convert_enum(self, enum_value, metadata):
        """Return the position of ``enum_value`` in the knob's enum list.

        ``metadata.enumvals`` is a comma-separated string; the comparison is
        case-insensitive and the first matching position wins.

        Raises:
            Exception: if ``enum_value`` is not one of the listed values.
        """
        choices = [choice.lower() for choice in metadata.enumvals.split(',')]
        wanted = enum_value.lower()
        try:
            position = choices.index(wanted)
        except ValueError:
            raise Exception('Invalid enum value for variable {} ({})'.format(
                metadata.name, enum_value))
        return position

    def convert_integer(self, int_value, metadata):
        """Convert a raw value to an integer.

        Conversion is attempted in this order:
          1. an empty value (``str(int_value) == ''``) becomes 0;
          2. ``int(int_value)``;
          3. ``int(float(int_value))`` (truncates e.g. ``'3.9'`` to 3);
          4. ``ConversionUtil.get_raw_size`` with the unit system selected by
             ``metadata.unit``.

        Raises:
            Exception: if every attempt leaves the result as ``None``.
        """
        if not str(int_value):
            # The value collected from the database is empty
            return 0

        converted = None
        try:
            converted = int(int_value)
        except ValueError:
            try:
                converted = int(float(int_value))
            except ValueError:
                # Not a plain number: treat it as a value carrying a unit suffix.
                if metadata.unit == KnobUnitType.BYTES:
                    systems = (self.bytes_system,)
                elif metadata.unit == KnobUnitType.MILLISECONDS:
                    systems = (self.time_system,)
                else:
                    # Any other unit: try the bytes system first, then the time system
                    systems = (self.bytes_system, self.time_system)
                for system in systems:
                    converted = ConversionUtil.get_raw_size(int_value, system=system)
                    if converted is not None:
                        break

        if converted is None:
            raise Exception('Invalid integer format for {}: {}'.format(
                metadata.name, int_value))
        return converted

    def convert_real(self, real_value, metadata):
        """Convert ``real_value`` to a ``float``.

        Raises:
            Exception: if ``float()`` rejects the value with a ``ValueError``;
                the message names ``metadata.name``.
        """
        try:
            converted = float(real_value)
        except ValueError:
            raise Exception('Cannot convert knob {} from {} to float'.format(
                metadata.name, real_value))
        return converted
    def convert_string(self, string_value, metadata):
        """Return ``string_value`` unchanged; ``metadata`` is not used."""
        return string_value

    def convert_timestamp(self, timestamp_value, metadata):
        """Return ``timestamp_value`` unchanged; ``metadata`` is not used."""
        return timestamp_value

    def valid_boolean_val_to_string(self):
        """Describe the accepted boolean spellings for use in error messages.

        Each value is followed by a single space, so both halves of the
        returned string end with a trailing space.
        """
        true_part = ''.join(str(bval) + ' ' for bval in self.valid_true_val)
        false_part = ''.join(str(bval) + ' ' for bval in self.valid_false_val)
        return 'valid true values: ' + true_part + '; ' + 'valid false values: ' + false_part

    def convert_dbms_knobs(self, knobs, knob_catalog=None):
        """Convert raw knob values into their typed representation.

        Args:
            knobs: mapping of knob name to raw value.
            knob_catalog: iterable of catalog entries to convert; when ``None``
                the tunable ``KnobCatalog`` rows of this DBMS are used.

        Returns:
            dict mapping knob name to converted value. Only catalog entries
            that are present in ``knobs`` appear in the result.

        Raises:
            Exception: for an invalid boolean, an out-of-range number, an
                unknown variable type, or a conversion that yields ``None``.
        """
        if knob_catalog is None:
            knob_catalog = KnobCatalog.objects.filter(dbms__id=self.dbms_id, tunable=True)

        knob_data = {}
        for metadata in knob_catalog:
            name = metadata.name
            # Fall back to the name without its 'global.' prefix when the
            # fully qualified name is not among the collected knobs.
            if name not in knobs and name.startswith('global.'):
                name = name[len('global.'):]
            if name not in knobs:
                continue

            value = knobs[name]
            if isinstance(value, str):
                # Drop single quotes from string values.
                value = value.replace('\'', '')

            vartype = metadata.vartype
            conv_value = None

            if vartype == VarType.BOOL:
                if not self._check_knob_bool_val(value):
                    raise Exception('Knob {} boolean value not valid! '
                                    'Boolean values should be one of: {}, '
                                    'but the actual value is: {}'
                                    .format(name, self.valid_boolean_val_to_string(),
                                            str(value)))
                conv_value = self.convert_bool(value, metadata)

            elif vartype == VarType.ENUM:
                conv_value = self.convert_enum(value, metadata)

            elif vartype == VarType.INTEGER:
                conv_value = self.convert_integer(value, metadata)
                in_range = self._check_knob_num_in_range(conv_value, metadata)
                if not in_range:
                    raise Exception('Knob {} integer num value not in range! '
                                    'min: {}, max: {}, actual: {}'
                                    .format(name, metadata.minval,
                                            metadata.maxval, str(conv_value)))

            elif vartype == VarType.REAL:
                conv_value = self.convert_real(value, metadata)
                in_range = self._check_knob_num_in_range(conv_value, metadata)
                if not in_range:
                    raise Exception('Knob {} real num value not in range! '
                                    'min: {}, max: {}, actual: {}'
                                    .format(name, metadata.minval,
                                            metadata.maxval, str(conv_value)))

            elif vartype == VarType.STRING:
                conv_value = self.convert_string(value, metadata)

            elif vartype == VarType.TIMESTAMP:
                conv_value = self.convert_timestamp(value, metadata)

            else:
                raise Exception(
                    'Unknown variable type: {}'.format(metadata.vartype))

            if conv_value is None:
                raise Exception('Param value for {} cannot be null'.format(name))
            knob_data[name] = conv_value

        return knob_data

    def _check_knob_num_in_range(self, value, mdata, fix_knob_range=True):
        """Check ``value`` against the knob's ``minval``/``maxval`` bounds.

        Args:
            value: the converted numeric knob value.
            mdata: catalog entry providing ``name``, ``minval``, ``maxval``
                and ``save()``.
            fix_knob_range: when true, a bound violated by ``value`` is widened
                to ``value`` and the catalog entry is saved.

        Returns:
            True when either bound is ``None`` (no check is performed) or when
            ``value`` lies within the (possibly widened) bounds, else False.
        """
        if mdata.minval is None or mdata.maxval is None:
            return True
        minval = float(mdata.minval)
        maxval = float(mdata.maxval)
        if fix_knob_range:
            range_changed = False
            if minval > value:
                LOG.debug("Changing knob %s minval from %f to %f", mdata.name, minval, value)
                mdata.minval = str(value)
                range_changed = True
            if maxval < value:
                LOG.debug("Changing knob %s maxval from %f to %f", mdata.name, maxval, value)
                mdata.maxval = str(value)
                range_changed = True
            # Persist only when a bound actually moved; saving unconditionally
            # costs one write per numeric knob even when nothing changed.
            if range_changed:
                mdata.save()
        return float(mdata.minval) <= value <= float(mdata.maxval)

    def _check_knob_bool_val(self, value):
        """Return True if ``value`` is one of the accepted boolean spellings.

        String input is lower-cased before the comparison.
        """
        candidate = value.lower() if isinstance(value, str) else value
        accepted_groups = (self.valid_true_val, self.valid_false_val)
        return any(candidate in group for group in accepted_groups)

    def convert_dbms_metrics(self, metrics, observation_time, target_objective):
        """Convert the numeric metrics to floats and add the target objectives.

        Args:
            metrics: mapping of metric name to raw value; it must contain every
                metric in this DBMS's numeric catalog.
            observation_time: passed through to each objective's ``compute``.
            target_objective: name of the requested objective; only validated
                against the available objectives.

        Returns:
            dict with one float per numeric catalog metric plus one entry per
            available target objective.

        Raises:
            ValueError: for a non-numeric vartype in the numeric catalog, an
                unknown metric type, or an unknown ``target_objective``.
        """
        catalog = MetricCatalog.objects.filter(
            dbms__id=self.dbms_id, metric_type__in=MetricType.numeric())
        numeric_metric_data = {}

        for metadata in catalog:
            name = metadata.name
            value = metrics[name]

            if metadata.vartype == VarType.INTEGER:
                converted = float(self.convert_integer(value, metadata))
            elif metadata.vartype == VarType.REAL:
                converted = self.convert_real(value, metadata)
            else:
                raise ValueError(
                    ("Found non-numeric metric '{}' in the numeric "
                     "metric catalog: value={}, type={}").format(
                         name, value, VarType.name(metadata.vartype)))

            # COUNTER and STATISTICS metrics are stored the same way here.
            known_type = (metadata.metric_type == MetricType.COUNTER or
                          metadata.metric_type == MetricType.STATISTICS)
            if not known_type:
                raise ValueError(
                    'Unknown metric type for {}: {}'.format(name, metadata.metric_type))
            assert isinstance(converted, float)
            numeric_metric_data[name] = converted

        objectives = target_objectives.get_all(self.dbms_id)
        if target_objective not in objectives:
            raise ValueError(
                "Invalid target objective '{}'. Expected one of: {}.".format(
                    target_objective, ', '.join(objectives.keys())))

        for objective_name, objective in objectives.items():
            # wait_class is needed to calculate target_objectives, but it is not
            # numeric, so the raw metrics are passed rather than the converted ones
            numeric_metric_data[objective_name] = objective.compute(
                metrics, observation_time)

        return numeric_metric_data

    def extract_valid_variables(self, variables, catalog, default_value=None):
        """Reconcile collected variables with the official catalog.

        Args:
            variables: mapping of collected variable name to value.
            catalog: mapping of official name to a catalog entry exposing
                ``name`` and ``default``.
            default_value: value given to variables missing from ``variables``;
                when ``None`` the entry's own ``default`` is used.

        Returns:
            ``(valid_variables, diff_log)`` where ``diff_log`` is an
            ``OrderedDict`` with the lists ``'miscapitalized'`` (pairs of
            official name and collected name), ``'extra'`` and ``'missing'``.
        """
        diff_log = OrderedDict()
        for category in ('miscapitalized', 'extra', 'missing'):
            diff_log[category] = []

        lc_catalog = {}
        for official_name, entry in catalog.items():
            lc_catalog[official_name.lower()] = entry

        # Pass 1: keep variables whose name is in the catalog. A name that
        # matches only case-insensitively is kept under its official spelling
        # and logged as 'miscapitalized'; anything else is logged as 'extra'.
        valid_variables = {}
        for var_name, var_value in variables.items():
            if var_name in catalog:
                valid_variables[var_name] = var_value
                continue
            folded_name = var_name.lower()
            if folded_name not in lc_catalog:
                diff_log['extra'].append(var_name)
                continue
            official = lc_catalog[folded_name].name
            diff_log['miscapitalized'].append((official, var_name))
            valid_variables[official] = var_value

        # Pass 2: catalog entries that were not collected are logged as
        # 'missing' and filled in with default_value, or with the entry's own
        # default when no default_value was given.
        collected = {name.lower() for name in variables.keys()}
        for folded_name, metadata in lc_catalog.items():
            if folded_name in collected:
                continue
            diff_log['missing'].append(metadata.name)
            if default_value is not None:
                valid_variables[metadata.name] = default_value
            else:
                valid_variables[metadata.name] = metadata.default

        assert len(valid_variables) == len(catalog)
        return valid_variables, diff_log

    def parse_helper(self, scope, valid_variables, view_variables):
        """Collect values under ``'<view>.<variable>'`` keys.

        ``valid_variables`` is updated in place (each key maps to the list of
        values seen so far) and is also returned. ``scope`` is not used here.
        """
        for view_name, variables in list(view_variables.items()):
            for var_name, var_value in list(variables.items()):
                key = '{}.{}'.format(view_name, var_name)
                valid_variables.setdefault(key, []).append(var_value)
        return valid_variables

    def parse_dbms_variables(self, variables):
        """Flatten scoped variables into ``'<view>.<variable>' -> [values]``.

        ``'global'`` scopes map view name to variables. ``'local'`` scopes have
        two extra nesting levels (one above and one below the view name) whose
        keys are ignored. Scopes whose value is ``None`` are skipped.

        Raises:
            Exception: for any scope other than ``'global'`` or ``'local'``.
        """
        valid_variables = {}
        for scope, sub_vars in list(variables.items()):
            if sub_vars is None:
                continue

            if scope == 'global':
                collected = self.parse_helper(scope, valid_variables, sub_vars)
                valid_variables.update(collected)
                continue

            if scope != 'local':
                raise Exception('Unsupported variable scope: {}'.format(scope))

            for viewnames in list(sub_vars.values()):
                for viewname, objnames in list(viewnames.items()):
                    for view_vars in list(objnames.values()):
                        collected = self.parse_helper(
                            scope, valid_variables, {viewname: view_vars})
                        valid_variables.update(collected)
        return valid_variables

    def parse_dbms_knobs(self, knobs):
        """Flatten the collected knobs and reconcile them with the knob catalog.

        Every knob must have been collected exactly once (asserted).

        Returns:
            The ``(valid_variables, diff_log)`` pair produced by
            ``extract_valid_variables``.
        """
        flattened = self.parse_dbms_variables(knobs)

        # Unwrap the single-element value lists built by parse_dbms_variables.
        for knob_name, collected in list(flattened.items()):
            assert len(collected) == 1
            flattened[knob_name] = collected[0]

        # Extract all valid knobs
        knob_catalog = {}
        for entry in KnobCatalog.objects.filter(dbms__id=self.dbms_id):
            knob_catalog[entry.name] = entry
        return self.extract_valid_variables(flattened, knob_catalog)

    def parse_dbms_metrics(self, metrics):
        """Flatten the collected metrics and combine values seen more than once.

        Returns:
            ``(valid_metrics, diffs)``: the metric values keyed by official
            name, and the diff log from ``extract_valid_variables``.

        Raises:
            Exception: if a multi-valued metric has a type that is neither
                numeric nor non-numeric.
        """
        # Some DBMSs measure different types of stats (e.g., global, local)
        # at different scopes (e.g. indexes, # tables, database) so for now
        # we just combine them
        flattened = self.parse_dbms_variables(metrics)

        # Extract all valid metrics
        metric_catalog = {}
        for entry in MetricCatalog.objects.filter(dbms__id=self.dbms_id):
            metric_catalog[entry.name] = entry

        valid_metrics, diffs = self.extract_valid_variables(
            flattened, metric_catalog, default_value='0')

        # Combine values
        for name, values in list(valid_metrics.items()):
            metric = metric_catalog[name]

            # Single values and non-numeric metrics keep their first value.
            # Missing metrics were filled with the string '0' (length 1) and
            # therefore also take this branch.
            if len(values) == 1 or metric.metric_type in MetricType.nonnumeric():
                valid_metrics[name] = values[0]
                continue

            if metric.metric_type not in MetricType.numeric():
                raise Exception(
                    'Invalid metric type: {}'.format(metric.metric_type))

            # Numeric metric collected several times: sum the non-null values.
            if metric.vartype == VarType.INTEGER:
                conv_fn = int
            else:
                conv_fn = float
            numbers = [conv_fn(v) for v in values if v is not None]
            valid_metrics[name] = str(sum(numbers)) if numbers else 0
        return valid_metrics, diffs

    def calculate_change_in_metrics(self, metrics_start, metrics_end,
                                    fix_metric_type=True, allow_negative=True):
        """Compute each metric's value over the observation period.

        INTEGER/REAL metrics are converted to numbers; COUNTER metrics yield
        ``end - start`` and all other numeric metrics yield the end value.
        Metrics of any other vartype yield the raw end value.

        Args:
            metrics_start: mapping of metric name to value at the start.
            metrics_end: mapping of metric name to value at the end; it must
                contain every key of ``metrics_start``.
            fix_metric_type: when true, a COUNTER whose difference is negative
                is re-typed to STATISTICS (and saved) and its end value is
                used instead of the difference.
            allow_negative: when true, a negative result is only logged;
                otherwise it trips an assertion.

        Returns:
            dict mapping metric name to the adjusted value.
        """
        metric_catalog = {m.name: m for m in MetricCatalog.objects.filter(dbms__id=self.dbms_id)}
        adjusted_metrics = {}

        for met_name, start_val in metrics_start.items():
            end_val = metrics_end[met_name]
            met_info = metric_catalog[met_name]

            if met_info.vartype == VarType.INTEGER:
                conversion_fn = self.convert_integer
            elif met_info.vartype == VarType.REAL:
                conversion_fn = self.convert_real
            else:
                # This metric is either a bool, enum, string, or timestamp
                # so take last recorded value from metrics_end
                adjusted_metrics[met_name] = end_val
                continue

            start_val = conversion_fn(start_val, met_info)
            end_val = conversion_fn(end_val, met_info)
            is_counter = met_info.metric_type == MetricType.COUNTER
            if is_counter:
                adj_val = end_val - start_val
            else:  # MetricType.STATISTICS or MetricType.INFO
                adj_val = end_val

            # Only a COUNTER can be re-typed. For any other metric adj_val is
            # already end_val, so re-typing would merely log a misleading
            # message and overwrite the stored metric type.
            if fix_metric_type and is_counter and adj_val < 0:
                adj_val = end_val
                LOG.warning("Changing metric %s from COUNTER to STATISTICS", met_name)
                met_info.metric_type = MetricType.STATISTICS
                met_info.save()

            if allow_negative and adj_val < 0:
                LOG.warning('%s metric type %s value is negative (start=%s, end=%s, diff=%s)',
                            met_name, MetricType.name(met_info.metric_type), start_val, end_val,
                            end_val - start_val)
            else:
                assert adj_val >= 0, \
                    '{} wrong metric type: {} (start={}, end={}, diff={})'.format(
                        met_name, MetricType.name(met_info.metric_type), start_val,
                        end_val, end_val - start_val)

            adjusted_metrics[met_name] = adj_val
        return adjusted_metrics

    def create_knob_configuration(self, tuning_knobs):
        configuration = {}
        for knob_name, knob_value in sorted(tuning_knobs.items()):
            # FIX ME: for now it only shows the global knobs, works for Postgres
            if knob_name.startswith('global.'):
                knob_name_global = knob_name[knob_name.find('.') + 1:]
                configuration[knob_name_global] = knob_value

        configuration = OrderedDict(sorted(configuration.items()))
        return configuration

    def format_bool(self, bool_value, metadata):
        """Format a numeric boolean as ``self.true_value`` or ``self.false_value``.

        The value is rounded to the nearest integer and compared with
        ``BooleanType.TRUE``; ``metadata`` is not used.
        """
        if int(round(bool_value)) == BooleanType.TRUE:
            return self.true_value
        return self.false_value

    def format_enum(self, enum_value, metadata):
        """Return the enum label at the (rounded) position ``enum_value``.

        Labels come from the comma-separated ``metadata.enumvals`` string.
        """
        position = int(round(enum_value))
        labels = metadata.enumvals.split(',')
        return labels[position]

    def format_integer(self, int_value, metadata):
        """Round ``int_value`` and, for unit-bearing knobs, add a unit suffix.

        Values that are not positive, and knobs whose unit is
        ``KnobUnitType.OTHER``, are returned as plain rounded integers. BYTES
        and MILLISECONDS knobs are passed through
        ``ConversionUtil.get_human_readable2``.

        Raises:
            Exception: if a positive value has any other unit type.
        """
        rounded = int(round(int_value))
        if metadata.unit == KnobUnitType.OTHER or rounded <= 0:
            return rounded

        if metadata.unit == KnobUnitType.BYTES:
            return ConversionUtil.get_human_readable2(
                rounded, self.bytes_system, self.min_bytes_unit)
        if metadata.unit == KnobUnitType.MILLISECONDS:
            return ConversionUtil.get_human_readable2(
                rounded, self.time_system, self.min_time_unit)
        raise Exception(
            'Invalid unit type for {}: {}'.format(
                metadata.name, metadata.unit))

    def format_real(self, real_value, metadata):
        """Return ``real_value`` as a float rounded to three decimal places.

        ``metadata`` is not used.
        """
        as_float = float(real_value)
        return round(as_float, 3)

    def format_string(self, string_value, metadata):
        """Return ``string_value`` unchanged; ``metadata`` is not used."""
        return string_value

    def format_timestamp(self, timestamp_value, metadata):
        """Return ``timestamp_value`` unchanged; ``metadata`` is not used."""
        return timestamp_value

    def format_dbms_knobs(self, knobs):
        """Format knob values according to their catalog variable type.

        Args:
            knobs: mapping of knob name to value; each name is looked up in
                ``KnobCatalog`` for this DBMS.

        Returns:
            dict mapping knob name to its formatted value.

        Raises:
            Exception: for an unknown variable type or when a formatter
                returns ``None``.
        """
        formatted_knobs = {}
        for knob_name, knob_value in list(knobs.items()):
            metadata = KnobCatalog.objects.get(dbms__id=self.dbms_id, name=knob_name)
            vartype = metadata.vartype

            # Select the formatter for this variable type, then apply it once.
            if vartype == VarType.BOOL:
                formatter = self.format_bool
            elif vartype == VarType.ENUM:
                formatter = self.format_enum
            elif vartype == VarType.INTEGER:
                formatter = self.format_integer
            elif vartype == VarType.REAL:
                formatter = self.format_real
            elif vartype == VarType.STRING:
                formatter = self.format_string
            elif vartype == VarType.TIMESTAMP:
                formatter = self.format_timestamp
            else:
                raise Exception('Unknown variable type for {}: {}'.format(
                    knob_name, metadata.vartype))

            fvalue = formatter(knob_value, metadata)
            if fvalue is None:
                raise Exception('Cannot format value for {}: {}'.format(
                    knob_name, knob_value))
            formatted_knobs[knob_name] = fvalue
        return formatted_knobs

# pylint: enable=no-self-use