Create, format, and save the recommended config for the latest result before the on_success method is called.

This commit is contained in:
dvanaken 2020-01-07 16:10:30 -05:00 committed by Dana Van Aken
parent 8001b658c9
commit 8cec62160f
3 changed files with 66 additions and 78 deletions

View File

@@ -109,19 +109,9 @@ class ConfigurationRecommendation(UpdateTask):  # pylint: disable=abstract-method
     def on_success(self, retval, task_id, args, kwargs):
         super(ConfigurationRecommendation, self).on_success(retval, task_id, args, kwargs)
-        result_id = retval['result_id']
-        result = Result.objects.get(pk=result_id)
-        # Replace result with formatted result
-        formatted_params = db.parser.format_dbms_knobs(result.dbms.pk, retval['recommendation'])
-        # Create next configuration to try
-        config = db.parser.create_knob_configuration(result.dbms.pk, formatted_params)
         task_meta = TaskMeta.objects.get(task_id=task_id)
-        retval['recommendation'] = config
         task_meta.result = retval
         task_meta.save()
-        result.next_configuration = JSONUtil.dumps(retval)
-        result.save()

 def clean_knob_data(knob_matrix, knob_labels, session):
@@ -419,13 +409,31 @@ def train_ddpg(result_id):
     return result_info

+def create_and_save_recommendation(recommended_knobs, result, status, **kwargs):
+    dbms_id = result.dbms.pk
+    formatted_knobs = db.parser.format_dbms_knobs(dbms_id, recommended_knobs)
+    config = db.parser.create_knob_configuration(dbms_id, formatted_knobs)
+    retval = dict(**kwargs)
+    retval.update(
+        status=status,
+        result_id=result.pk,
+        recommendation=config,
+    )
+    result.next_configuration = JSONUtil.dumps(retval)
+    result.save()
+    return retval

 @task(base=ConfigurationRecommendation, name='configuration_recommendation_ddpg')
 def configuration_recommendation_ddpg(result_info):  # pylint: disable=invalid-name
     LOG.info('Use ddpg to recommend configuration')
     result_id = result_info['newest_result_id']
-    result = Result.objects.filter(pk=result_id)
-    session = Result.objects.get(pk=result_id).session
-    agg_data = DataUtil.aggregate_data(result)
+    result_list = Result.objects.filter(pk=result_id)
+    result = result_list.first()
+    session = result.session
+    agg_data = DataUtil.aggregate_data(result_list)
     metric_data, _ = clean_metric_data(agg_data['y_matrix'], agg_data['y_columnlabels'], session)
     metric_data = metric_data.flatten()
     metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1))
@@ -447,11 +455,10 @@ def configuration_recommendation_ddpg(result_info):  # pylint: disable=invalid-name
     knob_bounds = np.vstack(DataUtil.get_knob_bounds(knob_labels, session))
     knob_data = MinMaxScaler().fit(knob_bounds).inverse_transform(knob_data.reshape(1, -1))[0]
     conf_map = {k: knob_data[i] for i, k in enumerate(knob_labels)}
-    conf_map_res = {}
-    conf_map_res['status'] = 'good'
-    conf_map_res['result_id'] = result_id
-    conf_map_res['recommendation'] = conf_map
-    conf_map_res['info'] = 'INFO: ddpg'
+    conf_map_res = create_and_save_recommendation(recommended_knobs=conf_map, result=result,
+                                                  status='good', info='INFO: ddpg')

     return conf_map_res
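A note on the two rescaling lines kept as context above: fitting a MinMaxScaler on the stacked (min, max) knob bounds and then calling inverse_transform maps the normalized DDPG output in [0, 1] back into each knob's real value range. A minimal standalone sketch with made-up bounds for two knobs:

    import numpy as np
    from sklearn.preprocessing import MinMaxScaler

    # Hypothetical bounds for two knobs; row 0 holds the minimums, row 1 the maximums.
    knob_bounds = np.array([[64.0, 0.0],
                            [1024.0, 1.0]])
    normalized = np.array([0.5, 0.2])  # e.g. raw actions from the DDPG actor, in [0, 1]

    # inverse_transform computes value * (max - min) + min per knob.
    real_values = MinMaxScaler().fit(knob_bounds).inverse_transform(normalized.reshape(1, -1))[0]
    print(real_values)  # -> [544.    0.2], i.e. 0.5 * (1024 - 64) + 64 and 0.2 * (1.0 - 0.0)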
@@ -638,20 +645,18 @@ def combine_workload(target_data):
 def configuration_recommendation(recommendation_input):
     target_data, algorithm = recommendation_input
     LOG.info('configuration_recommendation called')
+    newest_result = Result.objects.get(pk=target_data['newest_result_id'])

     if target_data['bad'] is True:
-        target_data_res = dict(
-            status='bad',
-            result_id=target_data['newest_result_id'],
-            info='WARNING: no training data, the config is generated randomly',
-            recommendation=target_data['config_recommend'],
+        target_data_res = create_and_save_recommendation(
+            recommended_knobs=target_data['config_recommend'], result=newest_result,
+            status='bad', info='WARNING: no training data, the config is generated randomly',
             pipeline_run=target_data['pipeline_run'])
         LOG.debug('%s: Skipping configuration recommendation.\n\ndata=%s\n',
                   AlgorithmType.name(algorithm), JSONUtil.dumps(target_data, pprint=True))
         return target_data_res

     latest_pipeline_run = PipelineRun.objects.get(pk=target_data['pipeline_run'])
-    newest_result = Result.objects.get(pk=target_data['newest_result_id'])

     X_columnlabels, X_scaler, X_scaled, y_scaled, X_max, X_min = combine_workload(target_data)
@@ -756,12 +761,10 @@ def configuration_recommendation(recommendation_input):
     best_config = np.maximum(best_config, X_min_inv)

     conf_map = {k: best_config[i] for i, k in enumerate(X_columnlabels)}
-    conf_map_res = dict(
-        status='good',
-        result_id=target_data['newest_result_id'],
-        recommendation=conf_map,
-        info='INFO: training data size is {}'.format(X_scaled.shape[0]),
-        pipeline_run=latest_pipeline_run.pk)
+    conf_map_res = create_and_save_recommendation(
+        recommended_knobs=conf_map, result=newest_result,
+        status='good', info='INFO: training data size is {}'.format(X_scaled.shape[0]),
+        pipeline_run=target_data['pipeline_run'])

     LOG.debug('%s: Finished selecting the next config.\n\ndata=%s\n',
               AlgorithmType.name(algorithm), JSONUtil.dumps(conf_map_res, pprint=True))
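Both recommendation paths above now go through the new create_and_save_recommendation helper, which formats the knobs for the result's DBMS, builds the config, saves it on the Result as next_configuration, and returns the dict that on_success later stores in TaskMeta. A minimal usage sketch (the knob names, values, and result id below are hypothetical):

    # Sketch only: knob names/values and the result id are illustrative, not from the repo.
    recommended_knobs = {'global.shared_buffers': 4294967296, 'global.work_mem': 4194304}
    result = Result.objects.get(pk=123)  # the newest result for the tuning session

    retval = create_and_save_recommendation(
        recommended_knobs=recommended_knobs, result=result,
        status='good', info='INFO: example')

    # retval holds status, result_id, recommendation (the formatted config), and any extra
    # kwargs; result.next_configuration now contains JSONUtil.dumps(retval).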

View File

@@ -17,6 +17,7 @@ from random import choice

 import numpy as np
 from django.contrib.auth.models import User
+from django.db.models import Case, When
 from django.utils.text import capfirst
 from django_db_logger.models import StatusLog
 from djcelery.models import TaskMeta
@@ -72,34 +73,34 @@ class MediaUtil(object):

 class TaskUtil(object):

     @staticmethod
-    def get_tasks(tasks):
-        if not tasks:
-            return []
-        task_ids = tasks.split(',')
-        res = []
-        for task_id in task_ids:
-            task = TaskMeta.objects.filter(task_id=task_id)
-            if len(task) == 0:
-                continue  # Task Not Finished
-            res.append(task[0])
-        return res
+    def get_tasks(task_ids):
+        task_ids = task_ids or []
+        if isinstance(task_ids, str):
+            task_ids = task_ids.split(',')
+        preserved = Case(*[
+            When(task_id=task_id, then=pos) for pos, task_id in enumerate(task_ids)])
+        return TaskMeta.objects.filter(task_id__in=task_ids).order_by(preserved)

     @staticmethod
     def get_task_status(tasks):
-        if len(tasks) == 0:
+        if not tasks:
             return None, 0
         overall_status = 'SUCCESS'
         num_completed = 0
         for task in tasks:
             status = task.status
             if status == "SUCCESS":
                 num_completed += 1
-            elif status in ['FAILURE', 'REVOKED', 'RETRY']:
+            elif status in ('FAILURE', 'REVOKED', 'RETRY'):
                 overall_status = status
                 break
             else:
-                assert status in ['PENDING', 'RECEIVED', 'STARTED']
+                if status not in ('PENDING', 'RECEIVED', 'STARTED'):
+                    LOG.warning("Task %s: invalid task status: '%s' (task_id=%s)",
+                                task.id, status, task.task_id)
                 overall_status = status
         return overall_status, num_completed
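get_tasks now fetches all TaskMeta rows in a single query while preserving the caller's ordering via Django's Case/When conditional expression; unfinished tasks simply have no TaskMeta row yet and drop out of the queryset, just as the old loop skipped them. A small sketch of the same pattern (the task ids are placeholders):

    from django.db.models import Case, When
    from djcelery.models import TaskMeta

    task_ids = ['aaa-111', 'bbb-222', 'ccc-333']  # hypothetical Celery task ids

    # Annotate each matching row with its position in task_ids and sort by it, so the
    # queryset comes back in the order the ids were given rather than in table order.
    preserved = Case(*[When(task_id=tid, then=pos) for pos, tid in enumerate(task_ids)])
    tasks = TaskMeta.objects.filter(task_id__in=task_ids).order_by(preserved)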

View File

@@ -1067,14 +1067,6 @@ def get_timeline_data(request):

 # get the lastest result
 def give_result(request, upload_code):  # pylint: disable=unused-argument
-
-    def _failed_response(_latest_result, _tasks, _num_completed, _status, _msg):
-        _msg = "{}\nSTATUS: {}\nRESULT ID: {}\n".format(_msg, _status, _latest_result)
-        if tasks:
-            _failed_task_idx = min(len(_tasks) - 1, _num_completed + 1)
-            _failed_task = _tasks[_failed_task_idx]
-            _msg += "TRACEBACK: {}".format(_failed_task.traceback)
-        return HttpResponse(_msg, status=400)
     try:
         session = Session.objects.get(upload_code=upload_code)
     except Session.DoesNotExist:
@@ -1084,39 +1076,31 @@ def give_result(request, upload_code):  # pylint: disable=unused-argument
     latest_result = Result.objects.filter(session=session).latest('creation_time')
     tasks = TaskUtil.get_tasks(latest_result.task_ids)
     overall_status, num_completed = TaskUtil.get_task_status(tasks)

+    response = {
+        'celery_status': overall_status,
+        'result_id': latest_result.pk,
+        'message': '',
+        'errors': [],
+    }
+
     if overall_status == 'SUCCESS':
-        # The task status is set to SUCCESS before the next config is saved in
-        # the latest result so we must wait for it to be updated
-        max_wait_sec = 20
-        elapsed_sec = 0
-        while not latest_result.next_configuration and elapsed_sec <= max_wait_sec:
-            time.sleep(5)
-            elapsed_sec += 5
-            latest_result = Result.objects.get(id=latest_result.pk)
-            LOG.debug("Waiting for the next config for result %s to be updated... "
-                      "(elapsed: %ss): %s", latest_result.pk, elapsed_sec,
-                      model_to_dict(latest_result))
-        if not latest_result.next_configuration:
-            LOG.warning(
-                "Failed to get the next configuration from the latest result after %ss: %s",
-                elapsed_sec, model_to_dict(latest_result))
-            overall_status = 'FAILURE'
-            response = _failed_response(latest_result, tasks, num_completed, overall_status,
-                                        'Failed to get the next configuration.')
-        else:
-            response = HttpResponse(JSONUtil.dumps(latest_result.next_configuration),
-                                    content_type='application/json')
+        response.update(JSONUtil.loads(latest_result.next_configuration),
+                        message='Celery successfully recommended the next configuration')
+        status_code = 200

     elif overall_status in ('FAILURE', 'REVOKED', 'RETRY'):
-        response = _failed_response(latest_result, tasks, num_completed, overall_status,
-                                    'Celery failed to get the next configuration.')
+        task_errors = [t.traceback for t in tasks if t.traceback]
+        if task_errors:
+            LOG.warning('\n\n'.join(task_errors))
+        response.update(message='Celery failed to get the next configuration', errors=task_errors)
+        status_code = 400

     else:  # overall_status in ('PENDING', 'RECEIVED', 'STARTED'):
-        response = HttpResponse("{}: Result not ready".format(overall_status), status=202)
+        response.update(message='Result not ready')
+        status_code = 202

-    return response
+    return HttpResponse(JSONUtil.dumps(response, pprint=True), status=status_code,
+                        content_type='application/json')


 # get the lastest result
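After this change give_result always returns a JSON body with the same top-level fields and signals progress through the HTTP status code (200 on success, 202 while pending, 400 on failure). A sketch of what a successful body might look like, with illustrative values; the fields after 'errors' are merged in from the saved next_configuration:

    # Illustrative only: example response body for a 200 from give_result.
    {
        'celery_status': 'SUCCESS',
        'result_id': 123,
        'message': 'Celery successfully recommended the next configuration',
        'errors': [],
        'status': 'good',
        'recommendation': {'global.shared_buffers': '4GB'},  # hypothetical knob/value
        'info': 'INFO: training data size is 100',
        'pipeline_run': 42,
    }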