Add machine learning model ddpg

This commit is contained in:
Dongsheng Yang
2019-09-13 17:55:55 -04:00
committed by Dana Van Aken
parent b9dc726b9c
commit c83f2649b6
10 changed files with 863 additions and 4 deletions

View File

@@ -0,0 +1,41 @@
#
# OUProcess.py
#
# Copyright
#
import numpy as np
# from https://github.com/songrotek/DDPG/blob/master/ou_noise.py
class OUProcess(object):
def __init__(self, n_actions, theta=0.15, mu=0, sigma=0.1, ):
self.n_actions = n_actions
self.theta = theta
self.mu = mu
self.sigma = sigma
self.current_value = np.ones(self.n_actions) * self.mu
def reset(self, sigma=0):
self.current_value = np.ones(self.n_actions) * self.mu
if sigma != 0:
self.sigma = sigma
def noise(self):
x = self.current_value
dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
self.current_value = x + dx
return self.current_value
if __name__ == '__main__':
import matplotlib.pyplot as plt # pylint: disable=wrong-import-position
ou = OUProcess(3, theta=0.3) # pylint: disable=invalid-name
states = [] # pylint: disable=invalid-name
for i in range(1000):
states.append(ou.noise())
plt.plot(states)
plt.show()

View File

@@ -0,0 +1,10 @@
#
# __init__.py
#
# Copyright
#
from analysis.ddpg.ddpg import DDPG
__all__ = ["DDPG"]

View File

@@ -0,0 +1,509 @@
#
# ddpg.py
#
# Copyright
#
"""
Deep Deterministic Policy Gradient Model
"""
import logging
import os
import sys
import math
import pickle
import numpy as np
import torch
import torch.nn as nn
from torch.nn import init, Parameter
import torch.nn.functional as F
import torch.optim as optimizer
from torch.autograd import Variable
from analysis.ddpg.OUProcess import OUProcess
from analysis.ddpg.prioritized_replay_memory import PrioritizedReplayMemory
LOG = logging.getLogger(__name__)
sys.path.append('../')
# code from https://github.com/Kaixhin/NoisyNet-A3C/blob/master/model.py
class NoisyLinear(nn.Linear):
def __init__(self, in_features, out_features, sigma_init=0.05, bias=True):
super(NoisyLinear, self).__init__(in_features, out_features, bias=True)
# reuse self.weight and self.bias
self.sigma_init = sigma_init
self.sigma_weight = Parameter(torch.Tensor(out_features, in_features))
self.sigma_bias = Parameter(torch.Tensor(out_features))
self.register_buffer('epsilon_weight', torch.zeros(out_features, in_features))
self.register_buffer('epsilon_bias', torch.zeros(out_features))
self.reset_parameters()
def reset_parameters(self):
# Only init after all params added (otherwise super().__init__() fails)
if hasattr(self, 'sigma_weight'):
init.uniform(self.weight, -math.sqrt(3 / self.in_features),
math.sqrt(3 / self.in_features))
init.uniform(self.bias, -math.sqrt(3 / self.in_features),
math.sqrt(3 / self.in_features))
init.constant(self.sigma_weight, self.sigma_init)
init.constant(self.sigma_bias, self.sigma_init)
def forward(self, x):
return F.linear(x, self.weight + self.sigma_weight * Variable(self.epsilon_weight),
self.bias + self.sigma_bias * Variable(self.epsilon_bias))
# pylint: disable=attribute-defined-outside-init
def sample_noise(self):
self.epsilon_weight = torch.randn(self.out_features, self.in_features)
self.epsilon_bias = torch.randn(self.out_features)
def remove_noise(self):
self.epsilon_weight = torch.zeros(self.out_features, self.in_features)
self.epsilon_bias = torch.zeros(self.out_features)
# pylint: enable=attribute-defined-outside-init
class Normalizer(object):
def __init__(self, mean, variance):
if isinstance(mean, list):
mean = np.array(mean)
if isinstance(variance, list):
variance = np.array(variance)
self.mean = mean
self.std = np.sqrt(variance + 0.00001)
def normalize(self, x):
if isinstance(x, list):
x = np.array(x)
x = x - self.mean
x = x / self.std
return Variable(torch.FloatTensor(x))
def __call__(self, x, *args, **kwargs):
return self.normalize(x)
class ActorLow(nn.Module):
def __init__(self, n_states, n_actions, ):
super(ActorLow, self).__init__()
self.layers = nn.Sequential(
nn.BatchNorm1d(n_states),
nn.Linear(n_states, 32),
nn.LeakyReLU(negative_slope=0.2),
nn.BatchNorm1d(32),
nn.Linear(32, n_actions),
nn.LeakyReLU(negative_slope=0.2)
)
self._init_weights()
self.out_func = nn.Tanh()
def _init_weights(self):
for m in self.layers:
if isinstance(m, nn.Linear):
m.weight.data.normal_(0.0, 1e-3)
m.bias.data.uniform_(-0.1, 0.1)
def forward(self, x): # pylint: disable=arguments-differ
out = self.layers(x)
return self.out_func(out)
class CriticLow(nn.Module):
def __init__(self, n_states, n_actions):
super(CriticLow, self).__init__()
self.state_input = nn.Linear(n_states, 32)
self.action_input = nn.Linear(n_actions, 32)
self.act = nn.LeakyReLU(negative_slope=0.2)
self.state_bn = nn.BatchNorm1d(n_states)
self.layers = nn.Sequential(
nn.Linear(64, 1),
nn.LeakyReLU(negative_slope=0.2),
)
self._init_weights()
def _init_weights(self):
self.state_input.weight.data.normal_(0.0, 1e-3)
self.state_input.bias.data.uniform_(-0.1, 0.1)
self.action_input.weight.data.normal_(0.0, 1e-3)
self.action_input.bias.data.uniform_(-0.1, 0.1)
for m in self.layers:
if isinstance(m, nn.Linear):
m.weight.data.normal_(0.0, 1e-3)
m.bias.data.uniform_(-0.1, 0.1)
def forward(self, x, action): # pylint: disable=arguments-differ
x = self.state_bn(x)
x = self.act(self.state_input(x))
action = self.act(self.action_input(action))
_input = torch.cat([x, action], dim=1)
value = self.layers(_input)
return value
class Actor(nn.Module):
def __init__(self, n_states, n_actions, noisy=False):
super(Actor, self).__init__()
self.layers = nn.Sequential(
nn.Linear(n_states, 128),
nn.LeakyReLU(negative_slope=0.2),
nn.BatchNorm1d(128),
nn.Linear(128, 128),
nn.Tanh(),
nn.Dropout(0.3),
nn.Linear(128, 64),
nn.Tanh(),
nn.BatchNorm1d(64),
)
if noisy:
self.out = NoisyLinear(64, n_actions)
else:
self.out = nn.Linear(64, n_actions)
self._init_weights()
self.act = nn.Sigmoid()
def _init_weights(self):
for m in self.layers:
if isinstance(m, nn.Linear):
m.weight.data.normal_(0.0, 1e-2)
m.bias.data.uniform_(-0.1, 0.1)
def sample_noise(self):
self.out.sample_noise()
def forward(self, x): # pylint: disable=arguments-differ
out = self.act(self.out(self.layers(x)))
return out
class Critic(nn.Module):
def __init__(self, n_states, n_actions):
super(Critic, self).__init__()
self.state_input = nn.Linear(n_states, 128)
self.action_input = nn.Linear(n_actions, 128)
self.act = nn.Tanh()
self.layers = nn.Sequential(
nn.Linear(256, 256),
nn.LeakyReLU(negative_slope=0.2),
nn.BatchNorm1d(256),
nn.Linear(256, 64),
nn.Tanh(),
nn.Dropout(0.3),
nn.BatchNorm1d(64),
nn.Linear(64, 1),
)
self._init_weights()
def _init_weights(self):
self.state_input.weight.data.normal_(0.0, 1e-2)
self.state_input.bias.data.uniform_(-0.1, 0.1)
self.action_input.weight.data.normal_(0.0, 1e-2)
self.action_input.bias.data.uniform_(-0.1, 0.1)
for m in self.layers:
if isinstance(m, nn.Linear):
m.weight.data.normal_(0.0, 1e-2)
m.bias.data.uniform_(-0.1, 0.1)
def forward(self, x, action): # pylint: disable=arguments-differ
x = self.act(self.state_input(x))
action = self.act(self.action_input(action))
_input = torch.cat([x, action], dim=1)
value = self.layers(_input)
return value
class DDPG(object):
def __init__(self, n_states, n_actions, opt=None, ouprocess=True, mean_var_path=None,
supervised=False):
""" DDPG Algorithms
Args:
n_states: int, dimension of states
n_actions: int, dimension of actions
opt: dict, params
supervised, bool, pre-train the actor with supervised learning
"""
self.n_states = n_states
self.n_actions = n_actions
if opt is None:
opt = {
'model': '',
'alr': 0.001,
'clr': 0.001,
'gamma': 0.9,
'batch_size': 32,
'tau': 0.002,
'memory_size': 100000
}
# Params
self.alr = opt['alr']
self.clr = opt['clr']
self.model_name = opt['model']
self.batch_size = opt['batch_size']
self.gamma = opt['gamma']
self.tau = opt['tau']
self.ouprocess = ouprocess
if mean_var_path is None:
mean = np.zeros(n_states)
var = np.zeros(n_states)
elif not os.path.exists(mean_var_path):
mean = np.zeros(n_states)
var = np.zeros(n_states)
else:
with open(mean_var_path, 'rb') as f:
mean, var = pickle.load(f)
self.normalizer = Normalizer(mean, var)
if supervised:
self._build_actor()
LOG.info("Supervised Learning Initialized")
else:
# Build Network
self._build_network()
LOG.info('Finish Initializing Networks')
self.replay_memory = PrioritizedReplayMemory(capacity=opt['memory_size'])
self.noise = OUProcess(n_actions)
# LOG.info('DDPG Initialzed!')
@staticmethod
def totensor(x):
return Variable(torch.FloatTensor(x))
def _build_actor(self):
if self.ouprocess:
noisy = False
else:
noisy = True
self.actor = Actor(self.n_states, self.n_actions, noisy=noisy)
self.actor_criterion = nn.MSELoss()
self.actor_optimizer = optimizer.Adam(lr=self.alr, params=self.actor.parameters())
def _build_network(self):
if self.ouprocess:
noisy = False
else:
noisy = True
self.actor = Actor(self.n_states, self.n_actions, noisy=noisy)
self.target_actor = Actor(self.n_states, self.n_actions)
self.critic = Critic(self.n_states, self.n_actions)
self.target_critic = Critic(self.n_states, self.n_actions)
# if model params are provided, load them
if len(self.model_name):
self.load_model(model_name=self.model_name)
LOG.info("Loading model from file: %s", self.model_name)
# Copy actor's parameters
self._update_target(self.target_actor, self.actor, tau=1.0)
# Copy critic's parameters
self._update_target(self.target_critic, self.critic, tau=1.0)
self.loss_criterion = nn.MSELoss()
self.actor_optimizer = optimizer.Adam(lr=self.alr, params=self.actor.parameters(),
weight_decay=1e-5)
self.critic_optimizer = optimizer.Adam(lr=self.clr, params=self.critic.parameters(),
weight_decay=1e-5)
@staticmethod
def _update_target(target, source, tau):
for (target_param, param) in zip(target.parameters(), source.parameters()):
target_param.data.copy_(
target_param.data * (1 - tau) + param.data * tau
)
def reset(self, sigma):
self.noise.reset(sigma)
def _sample_batch(self):
batch, idx = self.replay_memory.sample(self.batch_size)
# batch = self.replay_memory.sample(self.batch_size)
states = list(map(lambda x: x[0].tolist(), batch)) # pylint: disable=W0141
next_states = list(map(lambda x: x[3].tolist(), batch)) # pylint: disable=W0141
actions = list(map(lambda x: x[1].tolist(), batch)) # pylint: disable=W0141
rewards = list(map(lambda x: x[2], batch)) # pylint: disable=W0141
terminates = list(map(lambda x: x[4], batch)) # pylint: disable=W0141
return idx, states, next_states, actions, rewards, terminates
def add_sample(self, state, action, reward, next_state, terminate):
self.critic.eval()
self.actor.eval()
self.target_critic.eval()
self.target_actor.eval()
batch_state = self.normalizer([state.tolist()])
batch_next_state = self.normalizer([next_state.tolist()])
current_value = self.critic(batch_state, self.totensor([action.tolist()]))
target_action = self.target_actor(batch_next_state)
target_value = self.totensor([reward]) \
+ self.totensor([0 if x else 1 for x in [terminate]]) \
* self.target_critic(batch_next_state, target_action) * self.gamma
error = float(torch.abs(current_value - target_value).data.numpy()[0])
self.target_actor.train()
self.actor.train()
self.critic.train()
self.target_critic.train()
self.replay_memory.add(error, (state, action, reward, next_state, terminate))
def update(self):
idxs, states, next_states, actions, rewards, terminates = self._sample_batch()
batch_states = self.normalizer(states)
batch_next_states = self.normalizer(next_states)
batch_actions = self.totensor(actions)
batch_rewards = self.totensor(rewards)
mask = [0 if x else 1 for x in terminates]
mask = self.totensor(mask)
target_next_actions = self.target_actor(batch_next_states).detach()
target_next_value = self.target_critic(batch_next_states, target_next_actions).detach()
current_value = self.critic(batch_states, batch_actions)
# TODO (dongshen): This clause is the original clause, but it has some mistakes
# next_value = batch_rewards + mask * target_next_value * self.gamma
# Since terminate is always false, I remove the mask here.
next_value = batch_rewards + target_next_value * self.gamma
# Update Critic
# update prioritized memory
error = torch.abs(current_value - next_value).data.numpy()
for i in range(self.batch_size):
idx = idxs[i]
self.replay_memory.update(idx, error[i][0])
loss = self.loss_criterion(current_value, next_value)
self.critic_optimizer.zero_grad()
loss.backward()
self.critic_optimizer.step()
# Update Actor
self.critic.eval()
policy_loss = -self.critic(batch_states, self.actor(batch_states))
policy_loss = policy_loss.mean()
self.actor_optimizer.zero_grad()
policy_loss.backward()
self.actor_optimizer.step()
self.critic.train()
self._update_target(self.target_critic, self.critic, tau=self.tau)
self._update_target(self.target_actor, self.actor, tau=self.tau)
return loss.data, policy_loss.data
def choose_action(self, x):
""" Select Action according to the current state
Args:
x: np.array, current state
"""
self.actor.eval()
act = self.actor(self.normalizer([x.tolist()])).squeeze(0)
self.actor.train()
action = act.data.numpy()
if self.ouprocess:
action += self.noise.noise()
return action.clip(0, 1)
def sample_noise(self):
self.actor.sample_noise()
def load_model(self, model_name):
""" Load Torch Model from files
Args:
model_name: str, model path
"""
self.actor.load_state_dict(
torch.load('{}_actor.pth'.format(model_name))
)
self.critic.load_state_dict(
torch.load('{}_critic.pth'.format(model_name))
)
def save_model(self, model_name):
""" Save Torch Model from files
Args:
model_dir: str, model dir
title: str, model name
"""
torch.save(
self.actor.state_dict(),
'{}_actor.pth'.format(model_name)
)
torch.save(
self.critic.state_dict(),
'{}_critic.pth'.format(model_name)
)
def save_actor(self, path):
""" save actor network
Args:
path, str, path to save
"""
torch.save(
self.actor.state_dict(),
path
)
def load_actor(self, path):
""" load actor network
Args:
path, str, path to load
"""
self.actor.load_state_dict(
torch.load(path)
)
def train_actor(self, batch_data, is_train=True):
""" Train the actor separately with data
Args:
batch_data: tuple, (states, actions)
is_train: bool
Return:
_loss: float, training loss
"""
states, action = batch_data
if is_train:
self.actor.train()
pred = self.actor(self.normalizer(states))
action = self.totensor(action)
_loss = self.actor_criterion(pred, action)
self.actor_optimizer.zero_grad()
_loss.backward()
self.actor_optimizer.step()
else:
self.actor.eval()
pred = self.actor(self.normalizer(states))
action = self.totensor(action)
_loss = self.actor_criterion(pred, action)
return _loss.data[0]

View File

@@ -0,0 +1,121 @@
#
# prioritized_replay_memory.py
#
# Copyright
#
import random
import pickle
import numpy as np
class SumTree(object):
write = 0
def __init__(self, capacity):
self.capacity = capacity
self.tree = np.zeros(2 * capacity - 1)
self.data = np.zeros(capacity, dtype=object)
self.num_entries = 0
def _propagate(self, idx, change):
parent = (idx - 1) // 2
self.tree[parent] += change
if parent != 0:
self._propagate(parent, change)
def _retrieve(self, idx, s):
left = 2 * idx + 1
right = left + 1
if left >= len(self.tree):
return idx
if s <= self.tree[left]:
return self._retrieve(left, s)
else:
return self._retrieve(right, s - self.tree[left])
def total(self):
return self.tree[0]
def add(self, p, data):
idx = self.write + self.capacity - 1
self.data[self.write] = data
self.update(idx, p)
self.write += 1
if self.write >= self.capacity:
self.write = 0
if self.num_entries < self.capacity:
self.num_entries += 1
def update(self, idx, p):
change = p - self.tree[idx]
self.tree[idx] = p
self._propagate(idx, change)
def get(self, s):
idx = self._retrieve(0, s)
data_idx = idx - self.capacity + 1
return [idx, self.tree[idx], self.data[data_idx]]
class PrioritizedReplayMemory(object):
def __init__(self, capacity):
self.tree = SumTree(capacity)
self.capacity = capacity
self.e = 0.01 # pylint: disable=invalid-name
self.a = 0.6 # pylint: disable=invalid-name
self.beta = 0.4
self.beta_increment_per_sampling = 0.001
def _get_priority(self, error):
return (error + self.e) ** self.a
def add(self, error, sample):
# (s, a, r, s, t)
p = self._get_priority(error)
self.tree.add(p, sample)
def __len__(self):
return self.tree.num_entries
def sample(self, n):
batch = []
idxs = []
segment = self.tree.total() / n
priorities = []
self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])
for i in range(n):
a = segment * i
b = segment * (i + 1)
s = random.uniform(a, b)
(idx, p, data) = self.tree.get(s)
priorities.append(p)
batch.append(data)
idxs.append(idx)
return batch, idxs
# sampling_probabilities = priorities / self.tree.total()
# is_weight = np.power(self.tree.num_entries * sampling_probabilities, -self.beta)
# is_weight /= is_weight.max()
def update(self, idx, error):
p = self._get_priority(error)
self.tree.update(idx, p)
def save(self, path):
f = open(path, 'wb')
pickle.dump({"tree": self.tree}, f)
f.close()
def load_memory(self, path):
with open(path, 'rb') as f:
_memory = pickle.load(f)
self.tree = _memory['tree']