From 21fce27291130b00ce3b4fe9ea83da2e2f12949f Mon Sep 17 00:00:00 2001 From: yangdsh Date: Sat, 9 Nov 2019 01:58:10 +0000 Subject: [PATCH] improve ddpg --- server/analysis/ddpg/ddpg.py | 49 +++++++++++--------- server/analysis/simulation.py | 49 +++++++++++--------- server/analysis/tests/test_ddpg.py | 13 ++++-- server/website/website/settings/constants.py | 11 ++++- server/website/website/tasks/async_tasks.py | 16 +++---- 5 files changed, 79 insertions(+), 59 deletions(-) diff --git a/server/analysis/ddpg/ddpg.py b/server/analysis/ddpg/ddpg.py index 0cebf88..04b74de 100644 --- a/server/analysis/ddpg/ddpg.py +++ b/server/analysis/ddpg/ddpg.py @@ -23,21 +23,21 @@ LOG = get_analysis_logger(__name__) class Actor(nn.Module): - def __init__(self, n_states, n_actions): + def __init__(self, n_states, n_actions, hidden_sizes): super(Actor, self).__init__() self.layers = nn.Sequential( - nn.Linear(n_states, 128), + nn.Linear(n_states, hidden_sizes[0]), nn.LeakyReLU(negative_slope=0.2), - nn.BatchNorm1d(128), - nn.Linear(128, 128), + nn.BatchNorm1d(hidden_sizes[0]), + nn.Linear(hidden_sizes[0], hidden_sizes[1]), nn.Tanh(), nn.Dropout(0.3), - nn.BatchNorm1d(128), - - nn.Linear(128, 64), + nn.BatchNorm1d(hidden_sizes[1]), + nn.Linear(hidden_sizes[1], hidden_sizes[2]), nn.Tanh(), - nn.BatchNorm1d(64), - nn.Linear(64, n_actions) + nn.Dropout(0.3), + nn.BatchNorm1d(hidden_sizes[2]), + nn.Linear(hidden_sizes[2], n_actions) ) # This act layer maps the output to (0, 1) self.act = nn.Sigmoid() @@ -58,21 +58,21 @@ class Actor(nn.Module): class Critic(nn.Module): - def __init__(self, n_states, n_actions): + def __init__(self, n_states, n_actions, hidden_sizes): super(Critic, self).__init__() - self.state_input = nn.Linear(n_states, 128) - self.action_input = nn.Linear(n_actions, 128) + self.state_input = nn.Linear(n_states, hidden_sizes[0]) + self.action_input = nn.Linear(n_actions, hidden_sizes[0]) self.act = nn.Tanh() self.layers = nn.Sequential( - nn.Linear(256, 256), + nn.Linear(hidden_sizes[0] * 2, hidden_sizes[1]), nn.LeakyReLU(negative_slope=0.2), - nn.BatchNorm1d(256), - - nn.Linear(256, 64), + nn.Dropout(0.3), + nn.BatchNorm1d(hidden_sizes[1]), + nn.Linear(hidden_sizes[1], hidden_sizes[2]), nn.Tanh(), nn.Dropout(0.3), - nn.BatchNorm1d(64), - nn.Linear(64, 1), + nn.BatchNorm1d(hidden_sizes[2]), + nn.Linear(hidden_sizes[2], 1), ) self._init_weights() @@ -100,7 +100,8 @@ class Critic(nn.Module): class DDPG(object): def __init__(self, n_states, n_actions, model_name='', alr=0.001, clr=0.001, - gamma=0.9, batch_size=32, tau=0.002, shift=0, memory_size=100000): + gamma=0.9, batch_size=32, tau=0.002, shift=0, memory_size=100000, + a_hidden_sizes=[128, 128, 64], c_hidden_sizes=[128, 256, 64]): self.n_states = n_states self.n_actions = n_actions self.alr = alr @@ -109,6 +110,8 @@ class DDPG(object): self.batch_size = batch_size self.gamma = gamma self.tau = tau + self.a_hidden_sizes = a_hidden_sizes + self.c_hidden_sizes = c_hidden_sizes self.shift = shift self._build_network() @@ -121,10 +124,10 @@ class DDPG(object): return Variable(torch.FloatTensor(x)) def _build_network(self): - self.actor = Actor(self.n_states, self.n_actions) - self.target_actor = Actor(self.n_states, self.n_actions) - self.critic = Critic(self.n_states, self.n_actions) - self.target_critic = Critic(self.n_states, self.n_actions) + self.actor = Actor(self.n_states, self.n_actions, self.a_hidden_sizes) + self.target_actor = Actor(self.n_states, self.n_actions, self.a_hidden_sizes) + self.critic = Critic(self.n_states, self.n_actions, 
self.c_hidden_sizes) + self.target_critic = Critic(self.n_states, self.n_actions, self.c_hidden_sizes) # Copy actor's parameters self._update_target(self.target_actor, self.actor, tau=1.0) diff --git a/server/analysis/simulation.py b/server/analysis/simulation.py index eeed454..31e4bb9 100644 --- a/server/analysis/simulation.py +++ b/server/analysis/simulation.py @@ -25,6 +25,7 @@ from analysis.gpr import gpr_models # noqa from analysis.gpr import ucb # noqa from analysis.gpr.optimize import tf_optimize # noqa + LOG = get_analysis_logger(__name__) @@ -106,8 +107,10 @@ def ddpg(env, config, n_loops=100): a_lr = config['a_lr'] c_lr = config['c_lr'] n_epochs = config['n_epochs'] + ahs = config['a_hidden_sizes'] + chs = config['c_hidden_sizes'] model_ddpg = DDPG(n_actions=env.knob_dim, n_states=env.metric_dim, gamma=gamma, - clr=c_lr, alr=a_lr, shift=0.1) + clr=c_lr, alr=a_lr, shift=0, a_hidden_sizes=ahs, c_hidden_sizes=chs) knob_data = np.random.rand(env.knob_dim) prev_metric_data = np.zeros(env.metric_dim) @@ -122,7 +125,7 @@ def ddpg(env, config, n_loops=100): for i in range(n_loops): reward, metric_data = env.simulate(knob_data) - model_ddpg.add_sample(prev_metric_data, prev_knob_data, prev_reward, metric_data) + model_ddpg.add_sample(prev_metric_data, prev_knob_data, prev_reward, prev_metric_data) prev_metric_data = metric_data prev_knob_data = knob_data prev_reward = reward @@ -184,6 +187,7 @@ def dnn(env, config, n_loops=100): actions, rewards = memory.get_all() model_nn.fit(np.array(actions), -np.array(rewards), fit_epochs=50) res = model_nn.recommend(X_samples, Xmin, Xmax, recommend_epochs=10, explore=False) + best_config_idx = np.argmin(res.minl.ravel()) best_config = res.minl_conf[best_config_idx, :] if ou_process: @@ -313,12 +317,12 @@ def gpr_new(env, config, n_loops=100): model_kwargs['hyperparameters'] = None model_kwargs['optimize_hyperparameters'] = optimize_hyperparams - X_new, ypred, model_params, hyperparameters = run_optimize(np.array(actions), - -np.array(rewards), - X_samples, - model_name, - opt_kwargs, - model_kwargs) + X_new, ypred, _, hyperparameters = run_optimize(np.array(actions), + -np.array(rewards), + X_samples, + model_name, + opt_kwargs, + model_kwargs) sort_index = np.argsort(ypred.squeeze()) X_new = X_new[sort_index] @@ -342,8 +346,8 @@ def plotlines(xs, results, labels, title, path): N = 1 weights = np.ones(N) for x_axis, result, label in zip(xs, results, labels): - result = np.convolve(weights/weights.sum(), result.flatten())[N-1:-N+1] - lines.append(plt.plot(x_axis[:-N+1], result, label=label, lw=4)[0]) + result = np.convolve(weights/weights.sum(), result.flatten())[N-1:-N] + lines.append(plt.plot(x_axis[:-N], result, label=label, lw=4)[0]) plt.legend(handles=lines, fontsize=30) plt.title(title, fontsize=25) plt.xticks(fontsize=25) @@ -357,8 +361,8 @@ def plotlines(xs, results, labels, title, path): def run(tuners, configs, labels, title, env, n_loops, n_repeats): if not plt: LOG.info("Cannot import matplotlib. 
Will write results to files instead of figures.") - random.seed(0) - np.random.seed(1) + random.seed(2) + np.random.seed(2) torch.manual_seed(0) results = [] xs = [] @@ -385,16 +389,17 @@ def run(tuners, configs, labels, title, env, n_loops, n_repeats): def main(): - env = Environment(knob_dim=24, metric_dim=60, modes=[2], reward_variance=0.05) - title = 'compare' - n_repeats = [1, 1, 1, 1] - n_loops = 80 - configs = [{'gamma': 0., 'c_lr': 0.001, 'a_lr': 0.01, 'num_collections': 50, 'n_epochs': 50}, - {'num_samples': 30, 'num_collections': 50}, - {'num_samples': 30, 'num_collections': 50}, - {'num_samples': 30, 'num_collections': 50}] - tuners = [ddpg, gpr_new, dnn, gpr] - labels = [tuner.__name__ for tuner in tuners] + env = Environment(knob_dim=8, metric_dim=60, modes=[2], reward_variance=0.15) + title = 'ddpg_structure_nodrop' + n_repeats = [2, 2] + n_loops = 100 + configs = [{'gamma': 0., 'c_lr': 0.001, 'a_lr': 0.02, 'num_collections': 1, 'n_epochs': 30, + 'a_hidden_sizes': [128, 128, 64], 'c_hidden_sizes': [64, 128, 64]}, + {'gamma': 0., 'c_lr': 0.001, 'a_lr': 0.02, 'num_collections': 1, 'n_epochs': 30, + 'a_hidden_sizes': [64, 64, 32], 'c_hidden_sizes': [64, 128, 64]}, + ] + tuners = [ddpg, ddpg] + labels = ['1', '2'] run(tuners, configs, labels, title, env, n_loops, n_repeats) diff --git a/server/analysis/tests/test_ddpg.py b/server/analysis/tests/test_ddpg.py index b293ec3..ee336b5 100644 --- a/server/analysis/tests/test_ddpg.py +++ b/server/analysis/tests/test_ddpg.py @@ -25,15 +25,18 @@ class TestDDPG(unittest.TestCase): np.random.seed(0) torch.manual_seed(0) super(TestDDPG, cls).setUpClass() - cls.ddpg = DDPG(n_actions=1, n_states=1, gamma=0) - for _ in range(700): - knob_data = np.array([random.random()]) - prev_metric_data = np.array([random.random()]) + cls.ddpg = DDPG(n_actions=1, n_states=1, gamma=0, alr=0.02) + knob_data = np.zeros(1) + metric_data = np.array([random.random()]) + for _ in range(100): + prev_metric_data = metric_data metric_data = np.array([random.random()]) reward = 1.0 if (prev_metric_data[0] - 0.5) * (knob_data[0] - 0.5) > 0 else 0.0 reward = np.array([reward]) cls.ddpg.add_sample(prev_metric_data, knob_data, reward, metric_data) - cls.ddpg.update() + for _ in range(10): + cls.ddpg.update() + knob_data = cls.ddpg.choose_action(metric_data) def test_ddpg_ypreds(self): total_reward = 0.0 diff --git a/server/website/website/settings/constants.py b/server/website/website/settings/constants.py index c52796f..ec54760 100644 --- a/server/website/website/settings/constants.py +++ b/server/website/website/settings/constants.py @@ -77,7 +77,16 @@ DNN_DEBUG_INTERVAL = 100 DDPG_BATCH_SIZE = 32 # Learning rate of actor network -ACTOR_LEARNING_RATE = 0.01 +ACTOR_LEARNING_RATE = 0.02 # Learning rate of critic network CRITIC_LEARNING_RATE = 0.001 + +# Number of update epochs per iteration +UPDATE_EPOCHS = 30 + +# The number of hidden units in each layer of the actor MLP +ACTOR_HIDDEN_SIZES = [128, 128, 64] + +# The number of hidden units in each layer of the critic MLP +CRITIC_HIDDEN_SIZES = [64, 128, 64] diff --git a/server/website/website/tasks/async_tasks.py b/server/website/website/tasks/async_tasks.py index 322348d..9bd4c89 100644 --- a/server/website/website/tasks/async_tasks.py +++ b/server/website/website/tasks/async_tasks.py @@ -31,7 +31,8 @@ from website.settings import (DEFAULT_LENGTH_SCALE, DEFAULT_MAGNITUDE, DEFAULT_EPSILON, MAX_ITER, GPR_EPS, DEFAULT_SIGMA_MULTIPLIER, DEFAULT_MU_MULTIPLIER, DDPG_BATCH_SIZE, ACTOR_LEARNING_RATE, - CRITIC_LEARNING_RATE, + 
CRITIC_LEARNING_RATE, UPDATE_EPOCHS, + ACTOR_HIDDEN_SIZES, CRITIC_HIDDEN_SIZES, DNN_TRAIN_ITER, DNN_EXPLORE, DNN_EXPLORE_ITER, DNN_NOISE_SCALE_BEGIN, DNN_NOISE_SCALE_END, DNN_DEBUG, DNN_DEBUG_INTERVAL) @@ -278,12 +279,9 @@ def train_ddpg(result_id): result = Result.objects.get(pk=result_id) session = Result.objects.get(pk=result_id).session session_results = Result.objects.filter(session=session, - creation_time__lt=result.creation_time) + creation_time__lte=result.creation_time) result_info = {} result_info['newest_result_id'] = result_id - if len(session_results) == 0: - LOG.info('No previous result. Abort.') - return result_info # Extract data from result result = Result.objects.filter(pk=result_id) @@ -332,13 +330,14 @@ def train_ddpg(result_id): # Update ddpg ddpg = DDPG(n_actions=knob_num, n_states=metric_num, alr=ACTOR_LEARNING_RATE, - clr=CRITIC_LEARNING_RATE, gamma=0, batch_size=DDPG_BATCH_SIZE) + clr=CRITIC_LEARNING_RATE, gamma=0, batch_size=DDPG_BATCH_SIZE, + a_hidden_sizes=ACTOR_HIDDEN_SIZES, c_hidden_sizes=CRITIC_HIDDEN_SIZES) if session.ddpg_actor_model and session.ddpg_critic_model: ddpg.set_model(session.ddpg_actor_model, session.ddpg_critic_model) if session.ddpg_reply_memory: ddpg.replay_memory.set(session.ddpg_reply_memory) ddpg.add_sample(normalized_metric_data, knob_data, reward, normalized_metric_data) - for _ in range(25): + for _ in range(UPDATE_EPOCHS): ddpg.update() session.ddpg_actor_model, session.ddpg_critic_model = ddpg.get_model() session.ddpg_reply_memory = ddpg.replay_memory.get() @@ -362,7 +361,8 @@ def configuration_recommendation_ddpg(result_info): # pylint: disable=invalid-n knob_num = len(knob_labels) metric_num = len(metric_data) - ddpg = DDPG(n_actions=knob_num, n_states=metric_num) + ddpg = DDPG(n_actions=knob_num, n_states=metric_num, a_hidden_sizes=ACTOR_HIDDEN_SIZES, + c_hidden_sizes=CRITIC_HIDDEN_SIZES) if session.ddpg_actor_model is not None and session.ddpg_critic_model is not None: ddpg.set_model(session.ddpg_actor_model, session.ddpg_critic_model) if session.ddpg_reply_memory is not None:
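
A minimal usage sketch of the interface this patch introduces, assuming the DDPG constructor signature and the constants shown in the diff above (the dimensions n_actions=8 / n_states=60 and the sample values are illustrative only, and the import path assumes the server's analysis package is importable):

    import numpy as np
    from analysis.ddpg.ddpg import DDPG

    # Values added to website/settings/constants.py by this patch
    ACTOR_LEARNING_RATE = 0.02
    CRITIC_LEARNING_RATE = 0.001
    DDPG_BATCH_SIZE = 32
    UPDATE_EPOCHS = 30
    ACTOR_HIDDEN_SIZES = [128, 128, 64]   # hidden units per actor layer
    CRITIC_HIDDEN_SIZES = [64, 128, 64]   # hidden units per critic layer

    # Build the model with the new hidden-size arguments; in the server,
    # train_ddpg() derives the real dimensions from the session's knobs
    # and metrics, so the ones below are placeholders.
    ddpg = DDPG(n_actions=8, n_states=60, alr=ACTOR_LEARNING_RATE,
                clr=CRITIC_LEARNING_RATE, gamma=0, batch_size=DDPG_BATCH_SIZE,
                a_hidden_sizes=ACTOR_HIDDEN_SIZES,
                c_hidden_sizes=CRITIC_HIDDEN_SIZES)

    # One training step, mirroring the updated train_ddpg() loop: store the
    # latest (state, action, reward, next_state) sample, then run UPDATE_EPOCHS
    # gradient updates instead of the previously hard-coded 25.
    metric_data = np.random.rand(60)
    knob_data = np.random.rand(8)
    reward = np.array([1.0])
    ddpg.add_sample(metric_data, knob_data, reward, metric_data)
    for _ in range(UPDATE_EPOCHS):
        ddpg.update()
    next_knobs = ddpg.choose_action(metric_data)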