diff --git a/client/driver/driver_config.json b/client/driver/driver_config.json index f7ed573..efd6bb7 100644 --- a/client/driver/driver_config.json +++ b/client/driver/driver_config.json @@ -1,7 +1,7 @@ { "database_type" : "postgres", "database_name" : "tpcc", - "database_disk": "/dev/xvda1", + "database_disk": "", "database_conf": "/etc/postgresql/9.6/main/postgresql.conf", "database_save_path": "~/ottertune/client/driver/dumpfiles", "username" : "dbuser", diff --git a/client/driver/fabfile.py b/client/driver/fabfile.py index f095d63..e89126f 100644 --- a/client/driver/fabfile.py +++ b/client/driver/fabfile.py @@ -34,6 +34,8 @@ fabric_output.update({ RELOAD_INTERVAL = 10 # maximum disk usage MAX_DISK_USAGE = 90 +# Postgres datadir +PG_DATADIR = '/var/lib/postgresql/11/main' # Load config with open('driver_config.json', 'r') as _f: @@ -119,7 +121,7 @@ def create_controller_config(): @task def restart_database(): if CONF['database_type'] == 'postgres': - cmd = 'sudo service postgresql restart' + cmd = 'sudo -u postgres pg_ctl -D {} -w restart'.format(PG_DATADIR) elif CONF['database_type'] == 'oracle': cmd = 'sh oracleScripts/shutdownOracle.sh && sh oracleScripts/startupOracle.sh' else: @@ -149,11 +151,11 @@ def create_database(): @task def reset_conf(): - change_conf(next_conf='') + change_conf() @task -def change_conf(next_conf='next_config'): +def change_conf(next_conf=None): signal = "# configurations recommended by ottertune:\n" next_conf = next_conf or {} @@ -240,6 +242,16 @@ def save_dbms_result(): srcfile = os.path.join(CONF['controller_home'], 'output', f_) dstfile = os.path.join(CONF['save_path'], '{}__{}'.format(t, f_)) local('cp {} {}'.format(srcfile, dstfile)) + return t + + +@task +def save_next_config(next_config, t=None): + if not t: + t = int(time.time()) + with open(os.path.join(CONF['save_path'], '{}__next_config.json'.format(t)), 'w') as f: + json.dump(next_config, f, indent=2) + return t @task @@ -282,52 +294,77 @@ def upload_result(result_dir=None, prefix=None): @task -def get_result(max_time_sec=180, interval_sec=1): +def get_result(max_time_sec=180, interval_sec=5): max_time_sec = int(max_time_sec) interval_sec = int(interval_sec) url = CONF['upload_url'] + '/query_and_get/' + CONF['upload_code'] - elapsed = 0.0 + elapsed = 0 response_dict = None response = '' - start_time = time.time() while elapsed <= max_time_sec: rsp = requests.get(url) response = rsp.content.decode() - LOG.debug('Response:\n\n%s\n', response) + assert response != 'null' + + LOG.debug('%s [status code: %d, content_type: %s, elapsed: %ds]', response, + rsp.status_code, rsp.headers.get('content-type', ''), elapsed) if rsp.status_code == 200: # Success response_dict = json.loads(rsp.json(), object_pairs_hook=OrderedDict) break + elif rsp.status_code == 202: # Not ready time.sleep(interval_sec) + elapsed += interval_sec + elif rsp.status_code == 400: # Failure raise Exception( "Failed to download the next config.\nStatus code: {}\nMessage: {}\n".format( rsp.status_code, response)) + else: raise NotImplementedError( "Unhandled status code: '{}'.\nMessage: {}".format(rsp.status_code, response)) - elapsed = time.time() - start_time - if not response_dict: assert elapsed > max_time_sec, \ - 'response={} but elapsed={:.1f}s <= max_time={:.1f}s'.format( + 'response={} but elapsed={}s <= max_time={}s'.format( response, elapsed, max_time_sec) raise Exception( - 'Failed to download the next config in {}s: {} (elapsed: {:.1f}s)'.format( + 'Failed to download the next config in {}s: {} (elapsed: {}s)'.format( max_time_sec, response, elapsed)) - LOG.info('Downloaded the next config in %.0fs: %s', elapsed, - json.dumps(response_dict, indent=4)) + LOG.info('Downloaded the next config in %ds: %s', elapsed, json.dumps(response_dict, indent=4)) return response_dict +@task +def download_debug_info(pprint=False): + pprint = _parse_bool(pprint) + url = '{}/dump/{}'.format(CONF['upload_url'], CONF['upload_code']) + params = {'pp': int(True)} if pprint else {} + rsp = requests.get(url, params=params) + + if rsp.status_code != 200: + raise Exception('Error downloading debug info.') + + filename = rsp.headers.get('Content-Disposition').split('=')[-1] + file_len, exp_len = len(rsp.content), int(rsp.headers.get('Content-Length')) + assert file_len == exp_len, 'File {}: content length != expected length: {} != {}'.format( + filename, file_len, exp_len) + + with open(filename, 'wb') as f: + f.write(rsp.content) + LOG.info('Downloaded debug info to %s', filename) + + return filename + + @task def add_udf(): cmd = 'sudo python3 ./LatencyUDF.py ../controller/output/' @@ -469,16 +506,19 @@ def loop(): # add_udf() # save result - save_dbms_result() + result_timestamp = save_dbms_result() # upload result upload_result() # get result - get_result() + response = get_result() + + # save next config + save_next_config(response, t=result_timestamp) # change config - change_conf() + change_conf(response['recommendation']) @task @@ -489,6 +529,7 @@ def run_lhs(): # dump database if it's not done before. dump = dump_database() + result_timestamp = None for i, sample in enumerate(samples): # reload database periodically if RELOAD_INTERVAL > 0: @@ -508,15 +549,16 @@ def run_lhs(): if check_disk_usage() > MAX_DISK_USAGE: LOG.warning('Exceeds max disk usage %s', MAX_DISK_USAGE) - # copy lhs-sampled config to the to-be-used config - cmd = 'cp {} next_config'.format(sample) - local(cmd) + # load the next lhs-sampled config + with open(sample, 'r') as f: + next_config = json.load(f, object_pairs_hook=OrderedDict) + save_next_config(next_config, t=result_timestamp) # remove oltpbench log and controller log clean_logs() # change config - change_conf() + change_conf(next_config) # restart database restart_database() @@ -550,7 +592,7 @@ def run_lhs(): p.join() # save result - save_dbms_result() + result_timestamp = save_dbms_result() # upload result upload_result() diff --git a/server/website/website/management/commands/dumpdebuginfo.py b/server/website/website/management/commands/dumpdebuginfo.py new file mode 100644 index 0000000..4f4dda2 --- /dev/null +++ b/server/website/website/management/commands/dumpdebuginfo.py @@ -0,0 +1,58 @@ +# +# OtterTune - setuploadcode.py +# +# Copyright (c) 2017-18, Carnegie Mellon University Database Group +# +import os + +from django.core.management.base import BaseCommand, CommandError + +from website.models import Session +from website.utils import dump_debug_info + + +class Command(BaseCommand): + help = 'Dump debug information for the session with the given upload code.' + + def add_arguments(self, parser): + parser.add_argument( + 'uploadcode', + metavar='UPLOADCODE', + help="The session's upload code to.") + parser.add_argument( + '-f', '--filename', + metavar='FILE', + help='Name of the file to write the debug information to. ' + 'Default: debug_[timestamp].tar.gz') + parser.add_argument( + '-d', '--directory', + metavar='DIR', + help='Path of the directory to write the debug information to. ' + 'Default: current directory') + parser.add_argument( + '--prettyprint', + action='store_true', + help='Pretty print the output.') + + def handle(self, *args, **options): + directory = options['directory'] or '' + if directory and not os.path.exists(directory): + os.makedirs(directory) + try: + session = Session.objects.get(upload_code=options['uploadcode']) + except Session.DoesNotExist: + raise CommandError( + "ERROR: Session with upload code '{}' not exist.".format(options['uploadcode'])) + + debug_info, root = dump_debug_info(session, pretty_print=options['prettyprint']) + + filename = options['filename'] or root + if not filename.endswith('.tar.gz'): + filename += '.tar.gz' + path = os.path.join(directory, filename) + + with open(path, 'wb') as f: + f.write(debug_info.getvalue()) + + self.stdout.write(self.style.SUCCESS( + "Successfully dumped debug information to '{}'.".format(path))) diff --git a/server/website/website/templates/session.html b/server/website/website/templates/session.html index b5566f5..32dc6fd 100644 --- a/server/website/website/templates/session.html +++ b/server/website/website/templates/session.html @@ -61,12 +61,11 @@ caption span {float: right;}
- + diff --git a/server/website/website/urls.py b/server/website/website/urls.py index d1f5f06..f638fdf 100644 --- a/server/website/website/urls.py +++ b/server/website/website/urls.py @@ -37,6 +37,7 @@ urlpatterns = [ url(r'^projects/(?P[0-9]+)/sessions/(?P[0-9]+)/edit/$', website_views.create_or_edit_session, name='edit_session'), url(r'^projects/(?P[0-9]+)/sessions/(?P[0-9]+)/editKnobs/$', website_views.edit_knobs, name='edit_knobs'), url(r'^projects/(?P[0-9]+)/sessions/delete/$', website_views.delete_session, name='delete_session'), + url(r'^projects/(?P[0-9]+)/sessions/(?P[0-9]+)/dump/$', website_views.download_debug_info, name='dump_debug_data'), # URLs for result views url(r'^new_result/', website_views.new_result, name='new_result'), @@ -65,6 +66,7 @@ urlpatterns = [ # Back door url(r'^query_and_get/(?P[0-9a-zA-Z]+)$', website_views.give_result, name="backdoor"), + url(r'^dump/(?P[0-9a-zA-Z]+)', website_views.get_debug_info, name="backdoor_debug"), # train ddpg with results in the given session url(r'^train_ddpg/sessions/(?P[0-9]+)$', website_views.train_ddpg_loops, name='train_ddpg_loops'), diff --git a/server/website/website/utils.py b/server/website/website/utils.py index 3b37637..259f418 100644 --- a/server/website/website/utils.py +++ b/server/website/website/utils.py @@ -3,24 +3,25 @@ # # Copyright (c) 2017-18, Carnegie Mellon University Database Group # -''' -Created on Jul 8, 2017 - -@author: dvanaken -''' - +import datetime import json import logging +import os import string +import tarfile +import time from collections import OrderedDict +from io import BytesIO from random import choice import numpy as np from django.utils.text import capfirst +from django_db_logger.models import StatusLog from djcelery.models import TaskMeta +from .models import DBMSCatalog, KnobCatalog, Result, Session, SessionKnob +from .settings import constants from .types import LabelStyleType, VarType -from .models import KnobCatalog, DBMSCatalog, SessionKnob LOG = logging.getLogger(__name__) @@ -34,17 +35,27 @@ class JSONUtil(object): object_pairs_hook=OrderedDict) @staticmethod - def dumps(config, pprint=False, sort=False): - indent = 4 if pprint is True else None + def dumps(config, pprint=False, sort=False, encoder='custom'): + json_args = dict(indent=4 if pprint is True else None, + ensure_ascii=False) + + if encoder == 'custom': + json_args.update(default=JSONUtil.custom_converter) + if sort is True: if isinstance(config, dict): config = OrderedDict(sorted(config.items())) else: config = sorted(config) - return json.dumps(config, - ensure_ascii=False, - indent=indent) + return json.dumps(config, **json_args) + + @staticmethod + def custom_converter(o): + if isinstance(o, datetime.datetime): + return str(o) + elif isinstance(o, np.ndarray): + return o.tolist() class MediaUtil(object): @@ -279,3 +290,108 @@ class LabelUtil(object): label = label.replace('Dbms', 'DBMS') style_labels[name] = str(label) return style_labels + + +def dump_debug_info(session, pretty_print=False): + files = {} + + # Session + session_values = Session.objects.filter(pk=session.pk).values()[0] + session_values['dbms'] = session.dbms.full_name + session_values['hardware'] = session.hardware.name + + # Session knobs + knob_instances = SessionKnob.objects.filter( + session=session, tunable=True).select_related('knob') + knob_values = list(knob_instances.values()) + for knob, knob_dict in zip(knob_instances, knob_values): + assert knob.pk == knob_dict['id'] + knob_dict['knob'] = knob.name + session_values['knobs'] = knob_values + + # Save binary field types to separate files + binary_fields = [ + 'ddpg_actor_model', + 'ddpg_critic_model', + 'ddpg_reply_memory', + 'dnn_model', + ] + for bf in binary_fields: + if session_values[bf]: + filename = os.path.join('binaries', '{}.pickle'.format(bf)) + content = session_values[bf] + session_values[bf] = filename + files[filename] = content + + files['session.json'] = session_values + + # Results from session + result_instances = Result.objects.filter(session=session).select_related( + 'knob_data', 'metric_data').order_by('creation_time') + results = [] + + for result, result_dict in zip(result_instances, result_instances.values()): + assert result.pk == result_dict['id'] + result_dict = OrderedDict(result_dict) + next_config = result.next_configuration or '{}' + result_dict['next_configuration'] = JSONUtil.loads(next_config) + + tasks = {} + task_ids = result.task_ids + task_ids = task_ids.split(',') if task_ids else [] + for task_id in task_ids: + task = TaskMeta.objects.filter(task_id=task_id).values() + task = task[0] if task else None + tasks[task_id] = task + result_dict['tasks'] = tasks + + knob_data = result.knob_data.data or '{}' + metric_data = result.metric_data.data or '{}' + result_dict['knob_data'] = JSONUtil.loads(knob_data) + result_dict['metric_data'] = JSONUtil.loads(metric_data) + results.append(result_dict) + + files['results.json'] = results + + # Log messages written to the database using django-db-logger + logs = StatusLog.objects.filter(create_datetime__gte=session.creation_time) + logger_names = logs.order_by().values_list('logger_name', flat=True).distinct() + + # Write log files at app scope (e.g., django, website, celery) + logger_names = set([l.split('.', 1)[0] for l in logger_names]) + + for logger_name in logger_names: + log_values = list(logs.filter(logger_name__startswith=logger_name).order_by( + 'create_datetime').values()) + for lv in log_values: + lv['level'] = logging.getLevelName(lv['level']) + files['logs/{}.log'.format(logger_name)] = log_values + + # Save settings + constants_dict = OrderedDict() + for name, value in sorted(constants.__dict__.items()): + if not name.startswith('_') and name == name.upper(): + constants_dict[name] = value + files['constants.json'] = constants_dict + + timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + root = 'debug_{}'.format(timestamp) + + mtime = time.time() + tarstream = BytesIO() + with tarfile.open(mode='w:gz', fileobj=tarstream) as tar: + for filename, content in files.items(): # pylint: disable=not-an-iterable + if isinstance(content, (dict, list)): + content = JSONUtil.dumps(content, pprint=pretty_print) + if isinstance(content, str): + content = content.encode('utf-8') + assert isinstance(content, bytes), (filename, type(content)) + bio = BytesIO(content) + path = os.path.join(root, filename) + tarinfo = tarfile.TarInfo(name=path) + tarinfo.size = len(bio.getvalue()) + tarinfo.mtime = mtime + tar.addfile(tarinfo, bio) + + tarstream.seek(0) + return tarstream, root diff --git a/server/website/website/views.py b/server/website/website/views.py index 6c4c8ef..83b2f70 100644 --- a/server/website/website/views.py +++ b/server/website/website/views.py @@ -3,6 +3,7 @@ # # Copyright (c) 2017-18, Carnegie Mellon University Database Group # +# pylint: disable=too-many-lines import logging import datetime import re @@ -14,6 +15,7 @@ from django.contrib.auth import update_session_auth_hash from django.contrib.auth.forms import AuthenticationForm, UserCreationForm from django.contrib.auth.forms import PasswordChangeForm from django.core.exceptions import ObjectDoesNotExist +from django.core.files.base import ContentFile from django.http import HttpResponse, QueryDict from django.shortcuts import redirect, render, get_object_or_404 from django.template.context_processors import csrf @@ -34,7 +36,7 @@ from .tasks import (aggregate_target_results, map_workload, train_ddpg, configuration_recommendation, configuration_recommendation_ddpg) from .types import (DBMSType, KnobUnitType, MetricType, TaskType, VarType, WorkloadStatusType, AlgorithmType) -from .utils import JSONUtil, LabelUtil, MediaUtil, TaskUtil +from .utils import dump_debug_info, JSONUtil, LabelUtil, MediaUtil, TaskUtil from .settings import TIME_ZONE from .set_default_knobs import set_default_knobs @@ -336,18 +338,24 @@ def edit_knobs(request, project_id, session_id): {'project': project, 'session': session, 'form': form}) instance = form.instance instance.session = session - instance.knob = KnobCatalog.objects.filter(dbms=session.dbms, - name=form.cleaned_data["name"])[0] + instance.knob = KnobCatalog.objects.get(dbms=session.dbms, + name=form.cleaned_data["name"]) SessionKnob.objects.filter(session=instance.session, knob=instance.knob).delete() instance.save() return HttpResponse(status=204) else: + # knobs = KnobCatalog.objects.filter(dbms=session.dbms).order_by('-tunable') knobs = SessionKnob.objects.filter(session=session).order_by('-tunable', 'knob__name') forms = [] for knob in knobs: knob_values = model_to_dict(knob) knob_values['session'] = session knob_values['name'] = KnobCatalog.objects.get(pk=knob.knob.pk).name + # if SessionKnob.objects.filter(session=session, knob=knob).exists(): + # new_knob = SessionKnob.objects.filter(session=session, knob=knob)[0] + # knob_values["minval"] = new_knob.minval + # knob_values["maxval"] = new_knob.maxval + # knob_values["tunable"] = new_knob.tunable forms.append(SessionKnobForm(initial=knob_values)) context = { 'project': project, @@ -412,7 +420,6 @@ def new_result(request): if not form.is_valid(): LOG.warning("New result form is not valid: %s", str(form.errors)) return HttpResponse("New result form is not valid: " + str(form.errors), status=400) - upload_code = form.cleaned_data['upload_code'] try: session = Session.objects.get(upload_code=upload_code) @@ -421,7 +428,6 @@ def new_result(request): return HttpResponse("Invalid upload code: " + upload_code, status=400) return handle_result_files(session, request.FILES) - LOG.warning("Request type was not POST") return HttpResponse("Request type was not POST", status=400) @@ -721,6 +727,17 @@ def download_next_config(request): return response +@login_required(login_url=reverse_lazy('login')) +def download_debug_info(request, project_id, session_id): # pylint: disable=unused-argument + session = Session.objects.get(pk=session_id) + content, filename = dump_debug_info(session, pretty_print=False) + file = ContentFile(content.getvalue()) + response = HttpResponse(file, content_type='application/x-gzip') + response['Content-Length'] = file.size + response['Content-Disposition'] = 'attachment; filename={}.tar.gz'.format(filename) + return response + + @login_required(login_url=reverse_lazy('login')) def tuner_status_view(request, project_id, session_id, result_id): # pylint: disable=unused-argument res = Result.objects.get(pk=result_id) @@ -949,29 +966,38 @@ def get_timeline_data(request): # get the lastest result def give_result(request, upload_code): # pylint: disable=unused-argument + + def _failed_response(_latest_result, _tasks, _num_completed, _status, _msg): + _msg = "{}\nSTATUS: {}\nRESULT ID: {}\n".format(_msg, _status, _latest_result) + if tasks: + _failed_task_idx = min(len(_tasks) - 1, _num_completed + 1) + _failed_task = _tasks[_failed_task_idx] + _msg += "TRACEBACK: {}".format(_failed_task.traceback) + return HttpResponse(_msg, status=400) + try: session = Session.objects.get(upload_code=upload_code) except Session.DoesNotExist: LOG.warning("Invalid upload code: %s", upload_code) return HttpResponse("Invalid upload code: " + upload_code, status=400) - results = Result.objects.filter(session=session) - lastest_result = results[len(results) - 1] - tasks = TaskUtil.get_tasks(lastest_result.task_ids) + latest_result = Result.objects.filter(session=session).latest('creation_time') + tasks = TaskUtil.get_tasks(latest_result.task_ids) overall_status, num_completed = TaskUtil.get_task_status(tasks) if overall_status == 'SUCCESS': - res = Result.objects.get(pk=lastest_result.pk) - response = HttpResponse(JSONUtil.dumps(res.next_configuration), - content_type='application/json') + next_config = latest_result.next_configuration + if not next_config: + overall_status = 'FAILURE' + response = _failed_response(latest_result, tasks, num_completed, overall_status, + 'Failed to get the next configuration.') + else: + response = HttpResponse(JSONUtil.dumps(next_config), + content_type='application/json') elif overall_status in ('FAILURE', 'REVOKED', 'RETRY'): - msg = "STATUS: {}\nRESULT ID: {}\n".format(overall_status, lastest_result) - if tasks: - failed_task_idx = min(len(tasks) - 1, num_completed + 1) - failed_task = tasks[failed_task_idx] - msg += "TRACEBACK: {}".format(failed_task.traceback) - response = HttpResponse(msg, status=400) + response = _failed_response(latest_result, tasks, num_completed, overall_status, + 'Celery failed to get the next configuration.') else: # overall_status in ('PENDING', 'RECEIVED', 'STARTED'): response = HttpResponse("{}: Result not ready".format(overall_status), status=202) @@ -979,6 +1005,23 @@ def give_result(request, upload_code): # pylint: disable=unused-argument return response +# get the lastest result +def get_debug_info(request, upload_code): # pylint: disable=unused-argument + pprint = bool(int(request.GET.get('pp', False))) + try: + session = Session.objects.get(upload_code=upload_code) + except Session.DoesNotExist: + LOG.warning("Invalid upload code: %s", upload_code) + return HttpResponse("Invalid upload code: " + upload_code, status=400) + + content, filename = dump_debug_info(session, pretty_print=pprint) + file = ContentFile(content.getvalue()) + response = HttpResponse(file, content_type='application/x-gzip') + response['Content-Length'] = file.size + response['Content-Disposition'] = 'attachment; filename={}.tar.gz'.format(filename) + return response + + def train_ddpg_loops(request, session_id): # pylint: disable=unused-argument session = get_object_or_404(Session, pk=session_id, user=request.user) # pylint: disable=unused-variable results = Result.objects.filter(session=session_id)
-

{{ labels.title }}

- (edit) - (edit knobs) - -

{{ labels.title }} + (edit) + (edit knobs) + (dump debug info) +

{{ labels.name }}