Added option to dump debug info in 3 places: (1) the session view, (2) as a command in manage.py, and (3) as a fab command in the driver.

This commit is contained in:
Dana Van Aken 2019-10-03 22:21:51 -04:00
parent 309d327a44
commit 162dc48c53
7 changed files with 317 additions and 57 deletions

View File

@ -1,7 +1,7 @@
{
"database_type" : "postgres",
"database_name" : "tpcc",
"database_disk": "/dev/xvda1",
"database_disk": "",
"database_conf": "/etc/postgresql/9.6/main/postgresql.conf",
"database_save_path": "~/ottertune/client/driver/dumpfiles",
"username" : "dbuser",

View File

@ -34,6 +34,8 @@ fabric_output.update({
RELOAD_INTERVAL = 10
# maximum disk usage
MAX_DISK_USAGE = 90
# Postgres datadir
PG_DATADIR = '/var/lib/postgresql/11/main'
# Load config
with open('driver_config.json', 'r') as _f:
@ -119,7 +121,7 @@ def create_controller_config():
@task
def restart_database():
if CONF['database_type'] == 'postgres':
cmd = 'sudo service postgresql restart'
cmd = 'sudo -u postgres pg_ctl -D {} -w restart'.format(PG_DATADIR)
elif CONF['database_type'] == 'oracle':
cmd = 'sh oracleScripts/shutdownOracle.sh && sh oracleScripts/startupOracle.sh'
else:
@ -149,11 +151,11 @@ def create_database():
@task
def reset_conf():
change_conf(next_conf='')
change_conf()
@task
def change_conf(next_conf='next_config'):
def change_conf(next_conf=None):
signal = "# configurations recommended by ottertune:\n"
next_conf = next_conf or {}
@ -240,6 +242,16 @@ def save_dbms_result():
srcfile = os.path.join(CONF['controller_home'], 'output', f_)
dstfile = os.path.join(CONF['save_path'], '{}__{}'.format(t, f_))
local('cp {} {}'.format(srcfile, dstfile))
return t
@task
def save_next_config(next_config, t=None):
if not t:
t = int(time.time())
with open(os.path.join(CONF['save_path'], '{}__next_config.json'.format(t)), 'w') as f:
json.dump(next_config, f, indent=2)
return t
@task
@ -282,52 +294,77 @@ def upload_result(result_dir=None, prefix=None):
@task
def get_result(max_time_sec=180, interval_sec=1):
def get_result(max_time_sec=180, interval_sec=5):
max_time_sec = int(max_time_sec)
interval_sec = int(interval_sec)
url = CONF['upload_url'] + '/query_and_get/' + CONF['upload_code']
elapsed = 0.0
elapsed = 0
response_dict = None
response = ''
start_time = time.time()
while elapsed <= max_time_sec:
rsp = requests.get(url)
response = rsp.content.decode()
LOG.debug('Response:\n\n%s\n', response)
assert response != 'null'
LOG.debug('%s [status code: %d, content_type: %s, elapsed: %ds]', response,
rsp.status_code, rsp.headers.get('content-type', ''), elapsed)
if rsp.status_code == 200:
# Success
response_dict = json.loads(rsp.json(), object_pairs_hook=OrderedDict)
break
elif rsp.status_code == 202:
# Not ready
time.sleep(interval_sec)
elapsed += interval_sec
elif rsp.status_code == 400:
# Failure
raise Exception(
"Failed to download the next config.\nStatus code: {}\nMessage: {}\n".format(
rsp.status_code, response))
else:
raise NotImplementedError(
"Unhandled status code: '{}'.\nMessage: {}".format(rsp.status_code, response))
elapsed = time.time() - start_time
if not response_dict:
assert elapsed > max_time_sec, \
'response={} but elapsed={:.1f}s <= max_time={:.1f}s'.format(
'response={} but elapsed={}s <= max_time={}s'.format(
response, elapsed, max_time_sec)
raise Exception(
'Failed to download the next config in {}s: {} (elapsed: {:.1f}s)'.format(
'Failed to download the next config in {}s: {} (elapsed: {}s)'.format(
max_time_sec, response, elapsed))
LOG.info('Downloaded the next config in %.0fs: %s', elapsed,
json.dumps(response_dict, indent=4))
LOG.info('Downloaded the next config in %ds: %s', elapsed, json.dumps(response_dict, indent=4))
return response_dict
@task
def download_debug_info(pprint=False):
pprint = _parse_bool(pprint)
url = '{}/dump/{}'.format(CONF['upload_url'], CONF['upload_code'])
params = {'pp': int(True)} if pprint else {}
rsp = requests.get(url, params=params)
if rsp.status_code != 200:
raise Exception('Error downloading debug info.')
filename = rsp.headers.get('Content-Disposition').split('=')[-1]
file_len, exp_len = len(rsp.content), int(rsp.headers.get('Content-Length'))
assert file_len == exp_len, 'File {}: content length != expected length: {} != {}'.format(
filename, file_len, exp_len)
with open(filename, 'wb') as f:
f.write(rsp.content)
LOG.info('Downloaded debug info to %s', filename)
return filename
@task
def add_udf():
cmd = 'sudo python3 ./LatencyUDF.py ../controller/output/'
@ -469,16 +506,19 @@ def loop():
# add_udf()
# save result
save_dbms_result()
result_timestamp = save_dbms_result()
# upload result
upload_result()
# get result
get_result()
response = get_result()
# save next config
save_next_config(response, t=result_timestamp)
# change config
change_conf()
change_conf(response['recommendation'])
@task
@ -489,6 +529,7 @@ def run_lhs():
# dump database if it's not done before.
dump = dump_database()
result_timestamp = None
for i, sample in enumerate(samples):
# reload database periodically
if RELOAD_INTERVAL > 0:
@ -508,15 +549,16 @@ def run_lhs():
if check_disk_usage() > MAX_DISK_USAGE:
LOG.warning('Exceeds max disk usage %s', MAX_DISK_USAGE)
# copy lhs-sampled config to the to-be-used config
cmd = 'cp {} next_config'.format(sample)
local(cmd)
# load the next lhs-sampled config
with open(sample, 'r') as f:
next_config = json.load(f, object_pairs_hook=OrderedDict)
save_next_config(next_config, t=result_timestamp)
# remove oltpbench log and controller log
clean_logs()
# change config
change_conf()
change_conf(next_config)
# restart database
restart_database()
@ -550,7 +592,7 @@ def run_lhs():
p.join()
# save result
save_dbms_result()
result_timestamp = save_dbms_result()
# upload result
upload_result()

View File

@ -0,0 +1,58 @@
#
# OtterTune - dumpdebuginfo.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
import os
from django.core.management.base import BaseCommand, CommandError
from website.models import Session
from website.utils import dump_debug_info
class Command(BaseCommand):
help = 'Dump debug information for the session with the given upload code.'
def add_arguments(self, parser):
parser.add_argument(
'uploadcode',
metavar='UPLOADCODE',
help="The upload code of the session to dump debug information for.")
parser.add_argument(
'-f', '--filename',
metavar='FILE',
help='Name of the file to write the debug information to. '
'Default: debug_[timestamp].tar.gz')
parser.add_argument(
'-d', '--directory',
metavar='DIR',
help='Path of the directory to write the debug information to. '
'Default: current directory')
parser.add_argument(
'--prettyprint',
action='store_true',
help='Pretty print the output.')
def handle(self, *args, **options):
directory = options['directory'] or ''
if directory and not os.path.exists(directory):
os.makedirs(directory)
try:
session = Session.objects.get(upload_code=options['uploadcode'])
except Session.DoesNotExist:
raise CommandError(
"ERROR: Session with upload code '{}' does not exist.".format(options['uploadcode']))
debug_info, root = dump_debug_info(session, pretty_print=options['prettyprint'])
filename = options['filename'] or root
if not filename.endswith('.tar.gz'):
filename += '.tar.gz'
path = os.path.join(directory, filename)
with open(path, 'wb') as f:
f.write(debug_info.getvalue())
self.stdout.write(self.style.SUCCESS(
"Successfully dumped debug information to '{}'.".format(path)))

View File

@ -61,12 +61,11 @@ caption span {float: right;}
<div id="session" class="container">
<table class="table table-striped table-bordered table-condensed table-hover">
<caption >
<h4> {{ labels.title }}</h4>
<span> (<a href="{% url 'edit_session' project.pk session.pk %}">edit</a>)
<caption><h4>{{ labels.title }}
(<a href="{% url 'edit_session' project.pk session.pk %}">edit</a>)
(<a href="{% url 'edit_knobs' project.pk session.pk %}">edit knobs</a>)
</span>
</caption>
(<a href="{% url 'dump_debug_data' project.pk session.pk %}">dump debug info</a>)
</h4></caption>
<tbody>
<tr>
<td style="width: 50%"><div class="text-right">{{ labels.name }}</div></td>

View File

@ -37,6 +37,7 @@ urlpatterns = [
url(r'^projects/(?P<project_id>[0-9]+)/sessions/(?P<session_id>[0-9]+)/edit/$', website_views.create_or_edit_session, name='edit_session'),
url(r'^projects/(?P<project_id>[0-9]+)/sessions/(?P<session_id>[0-9]+)/editKnobs/$', website_views.edit_knobs, name='edit_knobs'),
url(r'^projects/(?P<project_id>[0-9]+)/sessions/delete/$', website_views.delete_session, name='delete_session'),
url(r'^projects/(?P<project_id>[0-9]+)/sessions/(?P<session_id>[0-9]+)/dump/$', website_views.download_debug_info, name='dump_debug_data'),
# URLs for result views
url(r'^new_result/', website_views.new_result, name='new_result'),
@ -65,6 +66,7 @@ urlpatterns = [
# Back door
url(r'^query_and_get/(?P<upload_code>[0-9a-zA-Z]+)$', website_views.give_result, name="backdoor"),
url(r'^dump/(?P<upload_code>[0-9a-zA-Z]+)', website_views.get_debug_info, name="backdoor_debug"),
# train ddpg with results in the given session
url(r'^train_ddpg/sessions/(?P<session_id>[0-9]+)$', website_views.train_ddpg_loops, name='train_ddpg_loops'),

View File

@ -3,24 +3,25 @@
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Jul 8, 2017
@author: dvanaken
'''
import datetime
import json
import logging
import os
import string
import tarfile
import time
from collections import OrderedDict
from io import BytesIO
from random import choice
import numpy as np
from django.utils.text import capfirst
from django_db_logger.models import StatusLog
from djcelery.models import TaskMeta
from .models import DBMSCatalog, KnobCatalog, Result, Session, SessionKnob
from .settings import constants
from .types import LabelStyleType, VarType
from .models import KnobCatalog, DBMSCatalog, SessionKnob
LOG = logging.getLogger(__name__)
@ -34,17 +35,27 @@ class JSONUtil(object):
object_pairs_hook=OrderedDict)
@staticmethod
def dumps(config, pprint=False, sort=False):
indent = 4 if pprint is True else None
def dumps(config, pprint=False, sort=False, encoder='custom'):
json_args = dict(indent=4 if pprint is True else None,
ensure_ascii=False)
if encoder == 'custom':
json_args.update(default=JSONUtil.custom_converter)
if sort is True:
if isinstance(config, dict):
config = OrderedDict(sorted(config.items()))
else:
config = sorted(config)
return json.dumps(config,
ensure_ascii=False,
indent=indent)
return json.dumps(config, **json_args)
@staticmethod
def custom_converter(o):
if isinstance(o, datetime.datetime):
return str(o)
elif isinstance(o, np.ndarray):
return o.tolist()
class MediaUtil(object):
@ -279,3 +290,108 @@ class LabelUtil(object):
label = label.replace('Dbms', 'DBMS')
style_labels[name] = str(label)
return style_labels
def dump_debug_info(session, pretty_print=False):
files = {}
# Session
session_values = Session.objects.filter(pk=session.pk).values()[0]
session_values['dbms'] = session.dbms.full_name
session_values['hardware'] = session.hardware.name
# Session knobs
knob_instances = SessionKnob.objects.filter(
session=session, tunable=True).select_related('knob')
knob_values = list(knob_instances.values())
for knob, knob_dict in zip(knob_instances, knob_values):
assert knob.pk == knob_dict['id']
knob_dict['knob'] = knob.name
session_values['knobs'] = knob_values
# Save binary field types to separate files
binary_fields = [
'ddpg_actor_model',
'ddpg_critic_model',
'ddpg_reply_memory',
'dnn_model',
]
for bf in binary_fields:
if session_values[bf]:
filename = os.path.join('binaries', '{}.pickle'.format(bf))
content = session_values[bf]
session_values[bf] = filename
files[filename] = content
files['session.json'] = session_values
# Results from session
result_instances = Result.objects.filter(session=session).select_related(
'knob_data', 'metric_data').order_by('creation_time')
results = []
for result, result_dict in zip(result_instances, result_instances.values()):
assert result.pk == result_dict['id']
result_dict = OrderedDict(result_dict)
next_config = result.next_configuration or '{}'
result_dict['next_configuration'] = JSONUtil.loads(next_config)
tasks = {}
task_ids = result.task_ids
task_ids = task_ids.split(',') if task_ids else []
for task_id in task_ids:
task = TaskMeta.objects.filter(task_id=task_id).values()
task = task[0] if task else None
tasks[task_id] = task
result_dict['tasks'] = tasks
knob_data = result.knob_data.data or '{}'
metric_data = result.metric_data.data or '{}'
result_dict['knob_data'] = JSONUtil.loads(knob_data)
result_dict['metric_data'] = JSONUtil.loads(metric_data)
results.append(result_dict)
files['results.json'] = results
# Log messages written to the database using django-db-logger
logs = StatusLog.objects.filter(create_datetime__gte=session.creation_time)
logger_names = logs.order_by().values_list('logger_name', flat=True).distinct()
# Write log files at app scope (e.g., django, website, celery)
logger_names = set([l.split('.', 1)[0] for l in logger_names])
for logger_name in logger_names:
log_values = list(logs.filter(logger_name__startswith=logger_name).order_by(
'create_datetime').values())
for lv in log_values:
lv['level'] = logging.getLevelName(lv['level'])
files['logs/{}.log'.format(logger_name)] = log_values
# Save settings
constants_dict = OrderedDict()
for name, value in sorted(constants.__dict__.items()):
if not name.startswith('_') and name == name.upper():
constants_dict[name] = value
files['constants.json'] = constants_dict
timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
root = 'debug_{}'.format(timestamp)
mtime = time.time()
tarstream = BytesIO()
with tarfile.open(mode='w:gz', fileobj=tarstream) as tar:
for filename, content in files.items(): # pylint: disable=not-an-iterable
if isinstance(content, (dict, list)):
content = JSONUtil.dumps(content, pprint=pretty_print)
if isinstance(content, str):
content = content.encode('utf-8')
assert isinstance(content, bytes), (filename, type(content))
bio = BytesIO(content)
path = os.path.join(root, filename)
tarinfo = tarfile.TarInfo(name=path)
tarinfo.size = len(bio.getvalue())
tarinfo.mtime = mtime
tar.addfile(tarinfo, bio)
tarstream.seek(0)
return tarstream, root

View File

@ -3,6 +3,7 @@
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
# pylint: disable=too-many-lines
import logging
import datetime
import re
@ -14,6 +15,7 @@ from django.contrib.auth import update_session_auth_hash
from django.contrib.auth.forms import AuthenticationForm, UserCreationForm
from django.contrib.auth.forms import PasswordChangeForm
from django.core.exceptions import ObjectDoesNotExist
from django.core.files.base import ContentFile
from django.http import HttpResponse, QueryDict
from django.shortcuts import redirect, render, get_object_or_404
from django.template.context_processors import csrf
@ -34,7 +36,7 @@ from .tasks import (aggregate_target_results, map_workload, train_ddpg,
configuration_recommendation, configuration_recommendation_ddpg)
from .types import (DBMSType, KnobUnitType, MetricType,
TaskType, VarType, WorkloadStatusType, AlgorithmType)
from .utils import JSONUtil, LabelUtil, MediaUtil, TaskUtil
from .utils import dump_debug_info, JSONUtil, LabelUtil, MediaUtil, TaskUtil
from .settings import TIME_ZONE
from .set_default_knobs import set_default_knobs
@ -336,18 +338,24 @@ def edit_knobs(request, project_id, session_id):
{'project': project, 'session': session, 'form': form})
instance = form.instance
instance.session = session
instance.knob = KnobCatalog.objects.filter(dbms=session.dbms,
name=form.cleaned_data["name"])[0]
instance.knob = KnobCatalog.objects.get(dbms=session.dbms,
name=form.cleaned_data["name"])
SessionKnob.objects.filter(session=instance.session, knob=instance.knob).delete()
instance.save()
return HttpResponse(status=204)
else:
# knobs = KnobCatalog.objects.filter(dbms=session.dbms).order_by('-tunable')
knobs = SessionKnob.objects.filter(session=session).order_by('-tunable', 'knob__name')
forms = []
for knob in knobs:
knob_values = model_to_dict(knob)
knob_values['session'] = session
knob_values['name'] = KnobCatalog.objects.get(pk=knob.knob.pk).name
# if SessionKnob.objects.filter(session=session, knob=knob).exists():
# new_knob = SessionKnob.objects.filter(session=session, knob=knob)[0]
# knob_values["minval"] = new_knob.minval
# knob_values["maxval"] = new_knob.maxval
# knob_values["tunable"] = new_knob.tunable
forms.append(SessionKnobForm(initial=knob_values))
context = {
'project': project,
@ -412,7 +420,6 @@ def new_result(request):
if not form.is_valid():
LOG.warning("New result form is not valid: %s", str(form.errors))
return HttpResponse("New result form is not valid: " + str(form.errors), status=400)
upload_code = form.cleaned_data['upload_code']
try:
session = Session.objects.get(upload_code=upload_code)
@ -421,7 +428,6 @@ def new_result(request):
return HttpResponse("Invalid upload code: " + upload_code, status=400)
return handle_result_files(session, request.FILES)
LOG.warning("Request type was not POST")
return HttpResponse("Request type was not POST", status=400)
@ -721,6 +727,17 @@ def download_next_config(request):
return response
@login_required(login_url=reverse_lazy('login'))
def download_debug_info(request, project_id, session_id): # pylint: disable=unused-argument
session = Session.objects.get(pk=session_id)
content, filename = dump_debug_info(session, pretty_print=False)
file = ContentFile(content.getvalue())
response = HttpResponse(file, content_type='application/x-gzip')
response['Content-Length'] = file.size
response['Content-Disposition'] = 'attachment; filename={}.tar.gz'.format(filename)
return response
@login_required(login_url=reverse_lazy('login'))
def tuner_status_view(request, project_id, session_id, result_id): # pylint: disable=unused-argument
res = Result.objects.get(pk=result_id)
@ -949,29 +966,38 @@ def get_timeline_data(request):
# get the latest result
def give_result(request, upload_code): # pylint: disable=unused-argument
def _failed_response(_latest_result, _tasks, _num_completed, _status, _msg):
_msg = "{}\nSTATUS: {}\nRESULT ID: {}\n".format(_msg, _status, _latest_result)
if tasks:
_failed_task_idx = min(len(_tasks) - 1, _num_completed + 1)
_failed_task = _tasks[_failed_task_idx]
_msg += "TRACEBACK: {}".format(_failed_task.traceback)
return HttpResponse(_msg, status=400)
try:
session = Session.objects.get(upload_code=upload_code)
except Session.DoesNotExist:
LOG.warning("Invalid upload code: %s", upload_code)
return HttpResponse("Invalid upload code: " + upload_code, status=400)
results = Result.objects.filter(session=session)
lastest_result = results[len(results) - 1]
tasks = TaskUtil.get_tasks(lastest_result.task_ids)
latest_result = Result.objects.filter(session=session).latest('creation_time')
tasks = TaskUtil.get_tasks(latest_result.task_ids)
overall_status, num_completed = TaskUtil.get_task_status(tasks)
if overall_status == 'SUCCESS':
res = Result.objects.get(pk=lastest_result.pk)
response = HttpResponse(JSONUtil.dumps(res.next_configuration),
next_config = latest_result.next_configuration
if not next_config:
overall_status = 'FAILURE'
response = _failed_response(latest_result, tasks, num_completed, overall_status,
'Failed to get the next configuration.')
else:
response = HttpResponse(JSONUtil.dumps(next_config),
content_type='application/json')
elif overall_status in ('FAILURE', 'REVOKED', 'RETRY'):
msg = "STATUS: {}\nRESULT ID: {}\n".format(overall_status, lastest_result)
if tasks:
failed_task_idx = min(len(tasks) - 1, num_completed + 1)
failed_task = tasks[failed_task_idx]
msg += "TRACEBACK: {}".format(failed_task.traceback)
response = HttpResponse(msg, status=400)
response = _failed_response(latest_result, tasks, num_completed, overall_status,
'Celery failed to get the next configuration.')
else: # overall_status in ('PENDING', 'RECEIVED', 'STARTED'):
response = HttpResponse("{}: Result not ready".format(overall_status), status=202)
@ -979,6 +1005,23 @@ def give_result(request, upload_code): # pylint: disable=unused-argument
return response
# download the debug-info archive for the session with the given upload code
def get_debug_info(request, upload_code): # pylint: disable=unused-argument
pprint = bool(int(request.GET.get('pp', False)))
try:
session = Session.objects.get(upload_code=upload_code)
except Session.DoesNotExist:
LOG.warning("Invalid upload code: %s", upload_code)
return HttpResponse("Invalid upload code: " + upload_code, status=400)
content, filename = dump_debug_info(session, pretty_print=pprint)
file = ContentFile(content.getvalue())
response = HttpResponse(file, content_type='application/x-gzip')
response['Content-Length'] = file.size
response['Content-Disposition'] = 'attachment; filename={}.tar.gz'.format(filename)
return response
def train_ddpg_loops(request, session_id): # pylint: disable=unused-argument
session = get_object_or_404(Session, pk=session_id, user=request.user) # pylint: disable=unused-variable
results = Result.objects.filter(session=session_id)