improve ddpg

This commit is contained in:
yangdsh 2019-11-09 01:58:10 +00:00 committed by Dana Van Aken
parent 67a4a70c09
commit 21fce27291
5 changed files with 79 additions and 59 deletions

View File

@ -23,21 +23,21 @@ LOG = get_analysis_logger(__name__)
class Actor(nn.Module): class Actor(nn.Module):
def __init__(self, n_states, n_actions): def __init__(self, n_states, n_actions, hidden_sizes):
super(Actor, self).__init__() super(Actor, self).__init__()
self.layers = nn.Sequential( self.layers = nn.Sequential(
nn.Linear(n_states, 128), nn.Linear(n_states, hidden_sizes[0]),
nn.LeakyReLU(negative_slope=0.2), nn.LeakyReLU(negative_slope=0.2),
nn.BatchNorm1d(128), nn.BatchNorm1d(hidden_sizes[0]),
nn.Linear(128, 128), nn.Linear(hidden_sizes[0], hidden_sizes[1]),
nn.Tanh(), nn.Tanh(),
nn.Dropout(0.3), nn.Dropout(0.3),
nn.BatchNorm1d(128), nn.BatchNorm1d(hidden_sizes[1]),
nn.Linear(hidden_sizes[1], hidden_sizes[2]),
nn.Linear(128, 64),
nn.Tanh(), nn.Tanh(),
nn.BatchNorm1d(64), nn.Dropout(0.3),
nn.Linear(64, n_actions) nn.BatchNorm1d(hidden_sizes[2]),
nn.Linear(hidden_sizes[2], n_actions)
) )
# This act layer maps the output to (0, 1) # This act layer maps the output to (0, 1)
self.act = nn.Sigmoid() self.act = nn.Sigmoid()
@ -58,21 +58,21 @@ class Actor(nn.Module):
class Critic(nn.Module): class Critic(nn.Module):
def __init__(self, n_states, n_actions): def __init__(self, n_states, n_actions, hidden_sizes):
super(Critic, self).__init__() super(Critic, self).__init__()
self.state_input = nn.Linear(n_states, 128) self.state_input = nn.Linear(n_states, hidden_sizes[0])
self.action_input = nn.Linear(n_actions, 128) self.action_input = nn.Linear(n_actions, hidden_sizes[0])
self.act = nn.Tanh() self.act = nn.Tanh()
self.layers = nn.Sequential( self.layers = nn.Sequential(
nn.Linear(256, 256), nn.Linear(hidden_sizes[0] * 2, hidden_sizes[1]),
nn.LeakyReLU(negative_slope=0.2), nn.LeakyReLU(negative_slope=0.2),
nn.BatchNorm1d(256), nn.Dropout(0.3),
nn.BatchNorm1d(hidden_sizes[1]),
nn.Linear(256, 64), nn.Linear(hidden_sizes[1], hidden_sizes[2]),
nn.Tanh(), nn.Tanh(),
nn.Dropout(0.3), nn.Dropout(0.3),
nn.BatchNorm1d(64), nn.BatchNorm1d(hidden_sizes[2]),
nn.Linear(64, 1), nn.Linear(hidden_sizes[2], 1),
) )
self._init_weights() self._init_weights()
@ -100,7 +100,8 @@ class Critic(nn.Module):
class DDPG(object): class DDPG(object):
def __init__(self, n_states, n_actions, model_name='', alr=0.001, clr=0.001, def __init__(self, n_states, n_actions, model_name='', alr=0.001, clr=0.001,
gamma=0.9, batch_size=32, tau=0.002, shift=0, memory_size=100000): gamma=0.9, batch_size=32, tau=0.002, shift=0, memory_size=100000,
a_hidden_sizes=[128, 128, 64], c_hidden_sizes=[128, 256, 64]):
self.n_states = n_states self.n_states = n_states
self.n_actions = n_actions self.n_actions = n_actions
self.alr = alr self.alr = alr
@ -109,6 +110,8 @@ class DDPG(object):
self.batch_size = batch_size self.batch_size = batch_size
self.gamma = gamma self.gamma = gamma
self.tau = tau self.tau = tau
self.a_hidden_sizes = a_hidden_sizes
self.c_hidden_sizes = c_hidden_sizes
self.shift = shift self.shift = shift
self._build_network() self._build_network()
@ -121,10 +124,10 @@ class DDPG(object):
return Variable(torch.FloatTensor(x)) return Variable(torch.FloatTensor(x))
def _build_network(self): def _build_network(self):
self.actor = Actor(self.n_states, self.n_actions) self.actor = Actor(self.n_states, self.n_actions, self.a_hidden_sizes)
self.target_actor = Actor(self.n_states, self.n_actions) self.target_actor = Actor(self.n_states, self.n_actions, self.a_hidden_sizes)
self.critic = Critic(self.n_states, self.n_actions) self.critic = Critic(self.n_states, self.n_actions, self.c_hidden_sizes)
self.target_critic = Critic(self.n_states, self.n_actions) self.target_critic = Critic(self.n_states, self.n_actions, self.c_hidden_sizes)
# Copy actor's parameters # Copy actor's parameters
self._update_target(self.target_actor, self.actor, tau=1.0) self._update_target(self.target_actor, self.actor, tau=1.0)

View File

@ -25,6 +25,7 @@ from analysis.gpr import gpr_models # noqa
from analysis.gpr import ucb # noqa from analysis.gpr import ucb # noqa
from analysis.gpr.optimize import tf_optimize # noqa from analysis.gpr.optimize import tf_optimize # noqa
LOG = get_analysis_logger(__name__) LOG = get_analysis_logger(__name__)
@ -106,8 +107,10 @@ def ddpg(env, config, n_loops=100):
a_lr = config['a_lr'] a_lr = config['a_lr']
c_lr = config['c_lr'] c_lr = config['c_lr']
n_epochs = config['n_epochs'] n_epochs = config['n_epochs']
ahs = config['a_hidden_sizes']
chs = config['c_hidden_sizes']
model_ddpg = DDPG(n_actions=env.knob_dim, n_states=env.metric_dim, gamma=gamma, model_ddpg = DDPG(n_actions=env.knob_dim, n_states=env.metric_dim, gamma=gamma,
clr=c_lr, alr=a_lr, shift=0.1) clr=c_lr, alr=a_lr, shift=0, a_hidden_sizes=ahs, c_hidden_sizes=chs)
knob_data = np.random.rand(env.knob_dim) knob_data = np.random.rand(env.knob_dim)
prev_metric_data = np.zeros(env.metric_dim) prev_metric_data = np.zeros(env.metric_dim)
@ -122,7 +125,7 @@ def ddpg(env, config, n_loops=100):
for i in range(n_loops): for i in range(n_loops):
reward, metric_data = env.simulate(knob_data) reward, metric_data = env.simulate(knob_data)
model_ddpg.add_sample(prev_metric_data, prev_knob_data, prev_reward, metric_data) model_ddpg.add_sample(prev_metric_data, prev_knob_data, prev_reward, prev_metric_data)
prev_metric_data = metric_data prev_metric_data = metric_data
prev_knob_data = knob_data prev_knob_data = knob_data
prev_reward = reward prev_reward = reward
@ -184,6 +187,7 @@ def dnn(env, config, n_loops=100):
actions, rewards = memory.get_all() actions, rewards = memory.get_all()
model_nn.fit(np.array(actions), -np.array(rewards), fit_epochs=50) model_nn.fit(np.array(actions), -np.array(rewards), fit_epochs=50)
res = model_nn.recommend(X_samples, Xmin, Xmax, recommend_epochs=10, explore=False) res = model_nn.recommend(X_samples, Xmin, Xmax, recommend_epochs=10, explore=False)
best_config_idx = np.argmin(res.minl.ravel()) best_config_idx = np.argmin(res.minl.ravel())
best_config = res.minl_conf[best_config_idx, :] best_config = res.minl_conf[best_config_idx, :]
if ou_process: if ou_process:
@ -313,7 +317,7 @@ def gpr_new(env, config, n_loops=100):
model_kwargs['hyperparameters'] = None model_kwargs['hyperparameters'] = None
model_kwargs['optimize_hyperparameters'] = optimize_hyperparams model_kwargs['optimize_hyperparameters'] = optimize_hyperparams
X_new, ypred, model_params, hyperparameters = run_optimize(np.array(actions), X_new, ypred, _, hyperparameters = run_optimize(np.array(actions),
-np.array(rewards), -np.array(rewards),
X_samples, X_samples,
model_name, model_name,
@ -342,8 +346,8 @@ def plotlines(xs, results, labels, title, path):
N = 1 N = 1
weights = np.ones(N) weights = np.ones(N)
for x_axis, result, label in zip(xs, results, labels): for x_axis, result, label in zip(xs, results, labels):
result = np.convolve(weights/weights.sum(), result.flatten())[N-1:-N+1] result = np.convolve(weights/weights.sum(), result.flatten())[N-1:-N]
lines.append(plt.plot(x_axis[:-N+1], result, label=label, lw=4)[0]) lines.append(plt.plot(x_axis[:-N], result, label=label, lw=4)[0])
plt.legend(handles=lines, fontsize=30) plt.legend(handles=lines, fontsize=30)
plt.title(title, fontsize=25) plt.title(title, fontsize=25)
plt.xticks(fontsize=25) plt.xticks(fontsize=25)
@ -357,8 +361,8 @@ def plotlines(xs, results, labels, title, path):
def run(tuners, configs, labels, title, env, n_loops, n_repeats): def run(tuners, configs, labels, title, env, n_loops, n_repeats):
if not plt: if not plt:
LOG.info("Cannot import matplotlib. Will write results to files instead of figures.") LOG.info("Cannot import matplotlib. Will write results to files instead of figures.")
random.seed(0) random.seed(2)
np.random.seed(1) np.random.seed(2)
torch.manual_seed(0) torch.manual_seed(0)
results = [] results = []
xs = [] xs = []
@ -385,16 +389,17 @@ def run(tuners, configs, labels, title, env, n_loops, n_repeats):
def main(): def main():
env = Environment(knob_dim=24, metric_dim=60, modes=[2], reward_variance=0.05) env = Environment(knob_dim=8, metric_dim=60, modes=[2], reward_variance=0.15)
title = 'compare' title = 'ddpg_structure_nodrop'
n_repeats = [1, 1, 1, 1] n_repeats = [2, 2]
n_loops = 80 n_loops = 100
configs = [{'gamma': 0., 'c_lr': 0.001, 'a_lr': 0.01, 'num_collections': 50, 'n_epochs': 50}, configs = [{'gamma': 0., 'c_lr': 0.001, 'a_lr': 0.02, 'num_collections': 1, 'n_epochs': 30,
{'num_samples': 30, 'num_collections': 50}, 'a_hidden_sizes': [128, 128, 64], 'c_hidden_sizes': [64, 128, 64]},
{'num_samples': 30, 'num_collections': 50}, {'gamma': 0., 'c_lr': 0.001, 'a_lr': 0.02, 'num_collections': 1, 'n_epochs': 30,
{'num_samples': 30, 'num_collections': 50}] 'a_hidden_sizes': [64, 64, 32], 'c_hidden_sizes': [64, 128, 64]},
tuners = [ddpg, gpr_new, dnn, gpr] ]
labels = [tuner.__name__ for tuner in tuners] tuners = [ddpg, ddpg]
labels = ['1', '2']
run(tuners, configs, labels, title, env, n_loops, n_repeats) run(tuners, configs, labels, title, env, n_loops, n_repeats)

View File

@ -25,15 +25,18 @@ class TestDDPG(unittest.TestCase):
np.random.seed(0) np.random.seed(0)
torch.manual_seed(0) torch.manual_seed(0)
super(TestDDPG, cls).setUpClass() super(TestDDPG, cls).setUpClass()
cls.ddpg = DDPG(n_actions=1, n_states=1, gamma=0) cls.ddpg = DDPG(n_actions=1, n_states=1, gamma=0, alr=0.02)
for _ in range(700): knob_data = np.zeros(1)
knob_data = np.array([random.random()]) metric_data = np.array([random.random()])
prev_metric_data = np.array([random.random()]) for _ in range(100):
prev_metric_data = metric_data
metric_data = np.array([random.random()]) metric_data = np.array([random.random()])
reward = 1.0 if (prev_metric_data[0] - 0.5) * (knob_data[0] - 0.5) > 0 else 0.0 reward = 1.0 if (prev_metric_data[0] - 0.5) * (knob_data[0] - 0.5) > 0 else 0.0
reward = np.array([reward]) reward = np.array([reward])
cls.ddpg.add_sample(prev_metric_data, knob_data, reward, metric_data) cls.ddpg.add_sample(prev_metric_data, knob_data, reward, metric_data)
for _ in range(10):
cls.ddpg.update() cls.ddpg.update()
knob_data = cls.ddpg.choose_action(metric_data)
def test_ddpg_ypreds(self): def test_ddpg_ypreds(self):
total_reward = 0.0 total_reward = 0.0

View File

@ -77,7 +77,16 @@ DNN_DEBUG_INTERVAL = 100
DDPG_BATCH_SIZE = 32 DDPG_BATCH_SIZE = 32
# Learning rate of actor network # Learning rate of actor network
ACTOR_LEARNING_RATE = 0.01 ACTOR_LEARNING_RATE = 0.02
# Learning rate of critic network # Learning rate of critic network
CRITIC_LEARNING_RATE = 0.001 CRITIC_LEARNING_RATE = 0.001
# Number of update epochs per iteration
UPDATE_EPOCHS = 30
# The number of hidden units in each layer of the actor MLP
ACTOR_HIDDEN_SIZES = [128, 128, 64]
# The number of hidden units in each layer of the critic MLP
CRITIC_HIDDEN_SIZES = [64, 128, 64]

View File

@ -31,7 +31,8 @@ from website.settings import (DEFAULT_LENGTH_SCALE, DEFAULT_MAGNITUDE,
DEFAULT_EPSILON, MAX_ITER, GPR_EPS, DEFAULT_EPSILON, MAX_ITER, GPR_EPS,
DEFAULT_SIGMA_MULTIPLIER, DEFAULT_MU_MULTIPLIER, DEFAULT_SIGMA_MULTIPLIER, DEFAULT_MU_MULTIPLIER,
DDPG_BATCH_SIZE, ACTOR_LEARNING_RATE, DDPG_BATCH_SIZE, ACTOR_LEARNING_RATE,
CRITIC_LEARNING_RATE, CRITIC_LEARNING_RATE, UPDATE_EPOCHS,
ACTOR_HIDDEN_SIZES, CRITIC_HIDDEN_SIZES,
DNN_TRAIN_ITER, DNN_EXPLORE, DNN_EXPLORE_ITER, DNN_TRAIN_ITER, DNN_EXPLORE, DNN_EXPLORE_ITER,
DNN_NOISE_SCALE_BEGIN, DNN_NOISE_SCALE_END, DNN_NOISE_SCALE_BEGIN, DNN_NOISE_SCALE_END,
DNN_DEBUG, DNN_DEBUG_INTERVAL) DNN_DEBUG, DNN_DEBUG_INTERVAL)
@ -278,12 +279,9 @@ def train_ddpg(result_id):
result = Result.objects.get(pk=result_id) result = Result.objects.get(pk=result_id)
session = Result.objects.get(pk=result_id).session session = Result.objects.get(pk=result_id).session
session_results = Result.objects.filter(session=session, session_results = Result.objects.filter(session=session,
creation_time__lt=result.creation_time) creation_time__lte=result.creation_time)
result_info = {} result_info = {}
result_info['newest_result_id'] = result_id result_info['newest_result_id'] = result_id
if len(session_results) == 0:
LOG.info('No previous result. Abort.')
return result_info
# Extract data from result # Extract data from result
result = Result.objects.filter(pk=result_id) result = Result.objects.filter(pk=result_id)
@ -332,13 +330,14 @@ def train_ddpg(result_id):
# Update ddpg # Update ddpg
ddpg = DDPG(n_actions=knob_num, n_states=metric_num, alr=ACTOR_LEARNING_RATE, ddpg = DDPG(n_actions=knob_num, n_states=metric_num, alr=ACTOR_LEARNING_RATE,
clr=CRITIC_LEARNING_RATE, gamma=0, batch_size=DDPG_BATCH_SIZE) clr=CRITIC_LEARNING_RATE, gamma=0, batch_size=DDPG_BATCH_SIZE,
a_hidden_sizes=ACTOR_HIDDEN_SIZES, c_hidden_sizes=CRITIC_HIDDEN_SIZES)
if session.ddpg_actor_model and session.ddpg_critic_model: if session.ddpg_actor_model and session.ddpg_critic_model:
ddpg.set_model(session.ddpg_actor_model, session.ddpg_critic_model) ddpg.set_model(session.ddpg_actor_model, session.ddpg_critic_model)
if session.ddpg_reply_memory: if session.ddpg_reply_memory:
ddpg.replay_memory.set(session.ddpg_reply_memory) ddpg.replay_memory.set(session.ddpg_reply_memory)
ddpg.add_sample(normalized_metric_data, knob_data, reward, normalized_metric_data) ddpg.add_sample(normalized_metric_data, knob_data, reward, normalized_metric_data)
for _ in range(25): for _ in range(UPDATE_EPOCHS):
ddpg.update() ddpg.update()
session.ddpg_actor_model, session.ddpg_critic_model = ddpg.get_model() session.ddpg_actor_model, session.ddpg_critic_model = ddpg.get_model()
session.ddpg_reply_memory = ddpg.replay_memory.get() session.ddpg_reply_memory = ddpg.replay_memory.get()
@ -362,7 +361,8 @@ def configuration_recommendation_ddpg(result_info): # pylint: disable=invalid-n
knob_num = len(knob_labels) knob_num = len(knob_labels)
metric_num = len(metric_data) metric_num = len(metric_data)
ddpg = DDPG(n_actions=knob_num, n_states=metric_num) ddpg = DDPG(n_actions=knob_num, n_states=metric_num, a_hidden_sizes=ACTOR_HIDDEN_SIZES,
c_hidden_sizes=CRITIC_HIDDEN_SIZES)
if session.ddpg_actor_model is not None and session.ddpg_critic_model is not None: if session.ddpg_actor_model is not None and session.ddpg_critic_model is not None:
ddpg.set_model(session.ddpg_actor_model, session.ddpg_critic_model) ddpg.set_model(session.ddpg_actor_model, session.ddpg_critic_model)
if session.ddpg_reply_memory is not None: if session.ddpg_reply_memory is not None: