# ottertune/server/website/script/controller_simulator/data_generator.py
#
# OtterTune - data_generator.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Nov 30, 2017
@author: dvanaken
'''
import copy
import datetime
import logging
import os
import shutil
import sys
import json
import numpy as np
LOG = logging.getLogger(__name__)

# --- Data generator configuration ---
OBSERVATION_TIME_SEC = 300  # length of one observation window (5 minutes)
# Generated sample timestamps begin one week in the past...
START_TIME = datetime.datetime.now() - datetime.timedelta(weeks=1)
START_FREQUENCY = datetime.timedelta(minutes=10)  # ...and advance by 10 min per sample
END_FREQUENCY = datetime.timedelta(seconds=OBSERVATION_TIME_SEC)  # start -> end offset
EPOCH = datetime.datetime.utcfromtimestamp(0)  # Unix epoch as a naive-UTC datetime

# --- Paths ---
ROOT_DIR = os.path.abspath(os.path.dirname(__file__))
SAMPLE_DIR = os.path.join(ROOT_DIR, 'samples')          # input JSON templates
OUTPUT_DIR = os.path.join(ROOT_DIR, 'generated_data')   # generated workload data


def unix_time_millis(dt):
    """Return *dt* (naive, assumed UTC) as milliseconds since the Unix epoch."""
    elapsed = dt - EPOCH
    return int(elapsed.total_seconds() * 1000.0)
def _load_sample(filename):
    """Load one JSON template from SAMPLE_DIR."""
    with open(os.path.join(SAMPLE_DIR, filename), 'r') as f:
        return json.load(f)


def _randomize_knobs(knob_data):
    """Overwrite the global knob settings with random values (in place)."""
    knob_vals = np.random.randint(1, 11, 4)
    global_knobs = knob_data['global']['global']
    global_knobs['shared_buffers'] = str(knob_vals[0]) + 'GB'
    global_knobs['work_mem'] = str(knob_vals[1]) + 'GB'
    # NOTE(review): Postgres spells this knob 'checkpoint_timeout'; the key
    # here must match the sample template, so it is left unchanged.
    global_knobs['checkpoint_timing'] = str(knob_vals[2]) + 'min'
    global_knobs['effective_io_concurrency'] = str(knob_vals[3])


def _randomize_metrics(metrics_start_data, metrics_end_data):
    """Fill selected metric counters with random values (in place).

    The "start" snapshot always receives smaller values than the matching
    "end" snapshot so the observed deltas are positive.
    """
    # int(...) casts: np.random.randint returns numpy integer types
    # (e.g. np.int64), which json.dump cannot serialize.
    metrics_start_data['global']['pg_stat_bgwriter']['buffers_alloc'] = int(
        np.random.randint(3000, 7000))
    metrics_end_data['global']['pg_stat_bgwriter']['buffers_alloc'] = int(
        np.random.randint(7000, 10000))
    # (metric name, dict of per-object stat rows) pairs; start/end alternate.
    locations = [
        ('xact_commit', metrics_start_data['local']['database']['pg_stat_database']),
        ('xact_commit', metrics_end_data['local']['database']['pg_stat_database']),
        ('n_tup_ins', metrics_start_data['local']['table']['pg_stat_user_tables']),
        ('n_tup_ins', metrics_end_data['local']['table']['pg_stat_user_tables']),
        ('idx_blks_hit', metrics_start_data['local']['indexes']['pg_statio_user_indexes']),
        ('idx_blks_hit', metrics_end_data['local']['indexes']['pg_statio_user_indexes']),
    ]
    for k, (name, loc) in enumerate(locations):
        for kvs in list(loc.values()):
            if k % 2 == 0:  # even entries are start-time metrics: smaller range
                met_val = np.random.randint(30000, 70000)
            else:
                met_val = np.random.randint(70000, 100000)
            kvs[name] = int(met_val)


def generate_data(n_workloads, n_samples_per_workload):
    """Generate randomized sample files for simulated workloads.

    For each of *n_workloads* workloads, writes *n_samples_per_workload*
    samples into OUTPUT_DIR/workload-<i>/, each sample consisting of four
    files: sample-<j>__{knobs,metrics_start,metrics_end,summary}.json.
    Contents are deep copies of the JSON templates in SAMPLE_DIR with
    randomized knob/metric values; timestamps start at START_TIME and
    advance by START_FREQUENCY per sample across all workloads.
    """
    knob_sample = _load_sample('knobs.json')
    metrics_start_sample = _load_sample('metrics_before.json')
    metrics_end_sample = _load_sample('metrics_after.json')
    summary_sample = _load_sample('summary.json')

    start_time = START_TIME
    end_time = START_TIME + END_FREQUENCY
    for i in range(n_workloads):
        workload_name = 'workload-{}'.format(i)
        wkld_dir = os.path.join(OUTPUT_DIR, workload_name)
        os.mkdir(wkld_dir)
        for j in range(n_samples_per_workload):
            # Deep copies so each sample mutates its own data.
            knob_data = copy.deepcopy(knob_sample)
            metrics_start_data = copy.deepcopy(metrics_start_sample)
            metrics_end_data = copy.deepcopy(metrics_end_sample)
            summary_data = copy.deepcopy(summary_sample)

            summary_data['workload_name'] = workload_name
            summary_data['observation_time'] = OBSERVATION_TIME_SEC
            summary_data['start_time'] = unix_time_millis(start_time)
            summary_data['end_time'] = unix_time_millis(end_time)
            # Advance the clock for the next sample (shared across workloads).
            start_time = start_time + START_FREQUENCY
            end_time = start_time + END_FREQUENCY

            _randomize_knobs(knob_data)
            _randomize_metrics(metrics_start_data, metrics_end_data)

            basepath = os.path.join(wkld_dir, 'sample-{}'.format(j))
            with open(basepath + "__knobs.json", 'w') as f:
                json.dump(knob_data, f, indent=4)
            with open(basepath + '__metrics_start.json', 'w') as f:
                json.dump(metrics_start_data, f, indent=4)
            with open(basepath + '__metrics_end.json', 'w') as f:
                json.dump(metrics_end_data, f, indent=4)
            with open(basepath + '__summary.json', 'w') as f:
                json.dump(summary_data, f, indent=4)
def main():
    """Command-line entry point.

    Usage: python data_generator.py n_workloads n_samples_per_workload [seed]
    Wipes and recreates OUTPUT_DIR, then generates the sample data.
    """
    # Without a configured handler the root logger drops INFO records (the
    # last-resort handler only emits WARNING and above), so the seed and
    # "Finished" messages below would never be shown.
    logging.basicConfig(level=logging.INFO)
    if len(sys.argv) < 3:
        LOG.error('Usage: python data_generator.py [n_workloads] [n_samples_per_workload] '
                  '[optional: random_seed]')
        sys.exit(1)
    if len(sys.argv) == 4:
        random_seed = int(sys.argv[3])
        LOG.info("Seeding the generator with value: %d", random_seed)
        np.random.seed(seed=random_seed)
    # Start from a clean output directory each run.
    shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
    os.mkdir(OUTPUT_DIR)
    generate_data(int(sys.argv[1]), int(sys.argv[2]))
    LOG.info("Finished. Generated data written to %s.", OUTPUT_DIR)


if __name__ == "__main__":
    main()