#
# OtterTune - data_generator.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Nov 30, 2017

@author: dvanaken
'''

import copy
import datetime
import json
import logging
import os
import shutil
import sys

import numpy as np

LOG = logging.getLogger(__name__)


# Data generator configuration
OBSERVATION_TIME_SEC = 300  # 5 minutes
START_TIME = datetime.datetime.now() - datetime.timedelta(weeks=1)
START_FREQUENCY = datetime.timedelta(minutes=10)
END_FREQUENCY = datetime.timedelta(seconds=OBSERVATION_TIME_SEC)
EPOCH = datetime.datetime.utcfromtimestamp(0)

# Paths
ROOT_DIR = os.path.abspath(os.path.dirname(__file__))
SAMPLE_DIR = os.path.join(ROOT_DIR, 'samples')
OUTPUT_DIR = os.path.join(ROOT_DIR, 'generated_data')


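# Convert a datetime to milliseconds since the Unix epoch, the timestamp
# format written into the generated summary files.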
def unix_time_millis(dt):
    return int((dt - EPOCH).total_seconds() * 1000.0)


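# Output layout: one directory per workload (workload-<i>), each holding four
# JSON files per sample -- sample-<j>__knobs.json, __metrics_start.json,
# __metrics_end.json, and __summary.json -- cloned from the templates in
# SAMPLE_DIR and filled with randomized values.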
def generate_data(n_workloads, n_samples_per_workload):
    # Load the JSON templates that every generated sample is cloned from.
    with open(os.path.join(SAMPLE_DIR, 'knobs.json'), 'r') as f:
        knob_sample = json.load(f)
    with open(os.path.join(SAMPLE_DIR, 'metrics_before.json'), 'r') as f:
        metrics_start_sample = json.load(f)
    with open(os.path.join(SAMPLE_DIR, 'metrics_after.json'), 'r') as f:
        metrics_end_sample = json.load(f)
    with open(os.path.join(SAMPLE_DIR, 'summary.json'), 'r') as f:
        summary_sample = json.load(f)

    start_time = START_TIME
    end_time = START_TIME + END_FREQUENCY

    for i in range(n_workloads):
        workload_name = 'workload-{}'.format(i)
        wkld_dir = os.path.join(OUTPUT_DIR, workload_name)
        os.mkdir(wkld_dir)

        for j in range(n_samples_per_workload):
            knob_data = copy.deepcopy(knob_sample)
            metrics_start_data = copy.deepcopy(metrics_start_sample)
            metrics_end_data = copy.deepcopy(metrics_end_sample)
            summary_data = copy.deepcopy(summary_sample)

            summary_data['workload_name'] = workload_name
            summary_data['observation_time'] = OBSERVATION_TIME_SEC
            summary_data['start_time'] = unix_time_millis(start_time)
            summary_data['end_time'] = unix_time_millis(end_time)
            # Advance the simulated clock: consecutive samples start
            # START_FREQUENCY apart and each observation lasts
            # OBSERVATION_TIME_SEC.
            start_time = start_time + START_FREQUENCY
            end_time = start_time + END_FREQUENCY

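            # Randomize a few knob settings (values drawn from [1, 10]).
            # Note: the key 'checkpoint_timing' is kept as-is to match the
            # sample templates; the actual PostgreSQL parameter is spelled
            # 'checkpoint_timeout'.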
            knob_vals = np.random.randint(1, 11, 4)
            global_knobs = knob_data['global']['global']
            global_knobs['shared_buffers'] = str(knob_vals[0]) + 'GB'
            global_knobs['work_mem'] = str(knob_vals[1]) + 'GB'
            global_knobs['checkpoint_timing'] = str(knob_vals[2]) + 'min'
            global_knobs['effective_io_concurrency'] = str(knob_vals[3])

            metrics_start_data['global']['pg_stat_bgwriter']['buffers_alloc'] = np.random.randint(
                3000, 7000)
            metrics_end_data['global']['pg_stat_bgwriter']['buffers_alloc'] = np.random.randint(
                7000, 10000)

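            # Pairs of (metric name, stats dict): even indices come from the
            # start-of-observation snapshot, odd indices from the end, so end
            # values are always drawn from a higher range and computed deltas
            # stay positive.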
            locations = [
                ('xact_commit', metrics_start_data['local']['database']['pg_stat_database']),
                ('xact_commit', metrics_end_data['local']['database']['pg_stat_database']),
                ('n_tup_ins', metrics_start_data['local']['table']['pg_stat_user_tables']),
                ('n_tup_ins', metrics_end_data['local']['table']['pg_stat_user_tables']),
                ('idx_blks_hit', metrics_start_data['local']['indexes']['pg_statio_user_indexes']),
                ('idx_blks_hit', metrics_end_data['local']['indexes']['pg_statio_user_indexes']),
            ]

            for k, (name, loc) in enumerate(locations):
                for kvs in list(loc.values()):
                    if k % 2 == 0:  # the start snapshot must hold the smaller value
                        met_val = np.random.randint(30000, 70000)
                    else:
                        met_val = np.random.randint(70000, 100000)
                    kvs[name] = met_val

            basepath = os.path.join(wkld_dir, 'sample-{}'.format(j))

            with open(basepath + '__knobs.json', 'w') as f:
                json.dump(knob_data, f, indent=4)
            with open(basepath + '__metrics_start.json', 'w') as f:
                json.dump(metrics_start_data, f, indent=4)
            with open(basepath + '__metrics_end.json', 'w') as f:
                json.dump(metrics_end_data, f, indent=4)
            with open(basepath + '__summary.json', 'w') as f:
                json.dump(summary_data, f, indent=4)


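# Example invocation (hypothetical arguments; the JSON templates must exist
# under ./samples):
#   python data_generator.py 5 20 42
# generates 5 workloads with 20 samples each, seeding numpy with 42.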
def main():
    # Send log output to the console; without a configured handler the INFO
    # messages below would not be shown.
    logging.basicConfig(level=logging.INFO)
    if len(sys.argv) < 3:
        LOG.error('Usage: python data_generator.py <n_workloads> <n_samples_per_workload> '
                  '[random_seed]')
        sys.exit(1)
    if len(sys.argv) == 4:
        random_seed = int(sys.argv[3])
        LOG.info("Seeding the generator with value: %d", random_seed)
        np.random.seed(seed=random_seed)
    # Start from a clean output directory.
    shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
    os.mkdir(OUTPUT_DIR)

    generate_data(int(sys.argv[1]), int(sys.argv[2]))
    LOG.info("Finished. Generated data written to %s.", OUTPUT_DIR)


if __name__ == "__main__":
    main()