#
# ddpg.py
#
# Copyright
#
"""
Deep Deterministic Policy Gradient Model
"""
import logging
import os
import sys
import math
import pickle
import numpy as np
import torch
import torch.nn as nn
from torch.nn import init, Parameter
import torch.nn.functional as F
import torch.optim as optimizer
from torch.autograd import Variable
from analysis.ddpg.OUProcess import OUProcess
from analysis.ddpg.prioritized_replay_memory import PrioritizedReplayMemory
LOG = logging.getLogger(__name__)
sys.path.append('../')
# code from https://github.com/Kaixhin/NoisyNet-A3C/blob/master/model.py
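# NoisyLinear adds learnable parameter-space noise ("noisy networks") to a standard
# nn.Linear: the effective weights are weight + sigma_weight * epsilon, where epsilon is
# resampled via sample_noise() and zeroed via remove_noise(). The DDPG agent below uses it
# as the actor's output layer when Ornstein-Uhlenbeck action noise is disabled.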
class NoisyLinear(nn.Linear):
    def __init__(self, in_features, out_features, sigma_init=0.05, bias=True):
        super(NoisyLinear, self).__init__(in_features, out_features, bias=True)
        # reuse self.weight and self.bias
        self.sigma_init = sigma_init
        self.sigma_weight = Parameter(torch.Tensor(out_features, in_features))
        self.sigma_bias = Parameter(torch.Tensor(out_features))
        self.register_buffer('epsilon_weight', torch.zeros(out_features, in_features))
        self.register_buffer('epsilon_bias', torch.zeros(out_features))
        self.reset_parameters()

    def reset_parameters(self):
        # Only init after all params added (otherwise super().__init__() fails)
        if hasattr(self, 'sigma_weight'):
            # use the in-place initializers (uniform_/constant_); the non-underscore
            # variants are deprecated in torch.nn.init
            init.uniform_(self.weight, -math.sqrt(3 / self.in_features),
                          math.sqrt(3 / self.in_features))
            init.uniform_(self.bias, -math.sqrt(3 / self.in_features),
                          math.sqrt(3 / self.in_features))
            init.constant_(self.sigma_weight, self.sigma_init)
            init.constant_(self.sigma_bias, self.sigma_init)

    def forward(self, x):
        return F.linear(x, self.weight + self.sigma_weight * Variable(self.epsilon_weight),
                        self.bias + self.sigma_bias * Variable(self.epsilon_bias))

    # pylint: disable=attribute-defined-outside-init
    def sample_noise(self):
        self.epsilon_weight = torch.randn(self.out_features, self.in_features)
        self.epsilon_bias = torch.randn(self.out_features)

    def remove_noise(self):
        self.epsilon_weight = torch.zeros(self.out_features, self.in_features)
        self.epsilon_bias = torch.zeros(self.out_features)
    # pylint: enable=attribute-defined-outside-init
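
# Standardizes inputs with a precomputed mean/variance: (x - mean) / sqrt(variance + 1e-5),
# returning a torch Variable ready to feed into the networks.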
class Normalizer(object):
    def __init__(self, mean, variance):
        if isinstance(mean, list):
            mean = np.array(mean)
        if isinstance(variance, list):
            variance = np.array(variance)
        self.mean = mean
        self.std = np.sqrt(variance + 0.00001)

    def normalize(self, x):
        if isinstance(x, list):
            x = np.array(x)
        x = x - self.mean
        x = x / self.std
        return Variable(torch.FloatTensor(x))

    def __call__(self, x, *args, **kwargs):
        return self.normalize(x)
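
# Compact actor/critic pair ("Low"): single 32-unit hidden layers with a tanh-squashed
# actor output in [-1, 1]. The larger Actor/Critic below are the ones DDPG instantiates.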
class ActorLow(nn.Module):
    def __init__(self, n_states, n_actions):
        super(ActorLow, self).__init__()
        self.layers = nn.Sequential(
            nn.BatchNorm1d(n_states),
            nn.Linear(n_states, 32),
            nn.LeakyReLU(negative_slope=0.2),
            nn.BatchNorm1d(32),
            nn.Linear(32, n_actions),
            nn.LeakyReLU(negative_slope=0.2)
        )
        self._init_weights()
        self.out_func = nn.Tanh()

    def _init_weights(self):
        for m in self.layers:
            if isinstance(m, nn.Linear):
                m.weight.data.normal_(0.0, 1e-3)
                m.bias.data.uniform_(-0.1, 0.1)

    def forward(self, x):  # pylint: disable=arguments-differ
        out = self.layers(x)
        return self.out_func(out)
class CriticLow(nn.Module):
    def __init__(self, n_states, n_actions):
        super(CriticLow, self).__init__()
        self.state_input = nn.Linear(n_states, 32)
        self.action_input = nn.Linear(n_actions, 32)
        self.act = nn.LeakyReLU(negative_slope=0.2)
        self.state_bn = nn.BatchNorm1d(n_states)
        self.layers = nn.Sequential(
            nn.Linear(64, 1),
            nn.LeakyReLU(negative_slope=0.2),
        )
        self._init_weights()

    def _init_weights(self):
        self.state_input.weight.data.normal_(0.0, 1e-3)
        self.state_input.bias.data.uniform_(-0.1, 0.1)
        self.action_input.weight.data.normal_(0.0, 1e-3)
        self.action_input.bias.data.uniform_(-0.1, 0.1)
        for m in self.layers:
            if isinstance(m, nn.Linear):
                m.weight.data.normal_(0.0, 1e-3)
                m.bias.data.uniform_(-0.1, 0.1)

    def forward(self, x, action):  # pylint: disable=arguments-differ
        x = self.state_bn(x)
        x = self.act(self.state_input(x))
        action = self.act(self.action_input(action))
        _input = torch.cat([x, action], dim=1)
        value = self.layers(_input)
        return value
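
# Main actor network: maps a (normalized) state to an action vector squashed into [0, 1]
# by a sigmoid. With noisy=True the output layer is a NoisyLinear, so exploration comes
# from parameter noise rather than from the OU process.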
class Actor(nn.Module):
    def __init__(self, n_states, n_actions, noisy=False):
        super(Actor, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(n_states, 128),
            nn.LeakyReLU(negative_slope=0.2),
            nn.BatchNorm1d(128),
            nn.Linear(128, 128),
            nn.Tanh(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.Tanh(),
            nn.BatchNorm1d(64),
        )
        if noisy:
            self.out = NoisyLinear(64, n_actions)
        else:
            self.out = nn.Linear(64, n_actions)
        self._init_weights()
        self.act = nn.Sigmoid()

    def _init_weights(self):
        for m in self.layers:
            if isinstance(m, nn.Linear):
                m.weight.data.normal_(0.0, 1e-2)
                m.bias.data.uniform_(-0.1, 0.1)

    def sample_noise(self):
        self.out.sample_noise()

    def forward(self, x):  # pylint: disable=arguments-differ
        out = self.act(self.out(self.layers(x)))
        return out
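
# Main critic network: estimates Q(state, action) by embedding state and action separately
# (128 units each), concatenating them, and regressing to a scalar value.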
class Critic(nn.Module):
    def __init__(self, n_states, n_actions):
        super(Critic, self).__init__()
        self.state_input = nn.Linear(n_states, 128)
        self.action_input = nn.Linear(n_actions, 128)
        self.act = nn.Tanh()
        self.layers = nn.Sequential(
            nn.Linear(256, 256),
            nn.LeakyReLU(negative_slope=0.2),
            nn.BatchNorm1d(256),
            nn.Linear(256, 64),
            nn.Tanh(),
            nn.Dropout(0.3),
            nn.BatchNorm1d(64),
            nn.Linear(64, 1),
        )
        self._init_weights()

    def _init_weights(self):
        self.state_input.weight.data.normal_(0.0, 1e-2)
        self.state_input.bias.data.uniform_(-0.1, 0.1)
        self.action_input.weight.data.normal_(0.0, 1e-2)
        self.action_input.bias.data.uniform_(-0.1, 0.1)
        for m in self.layers:
            if isinstance(m, nn.Linear):
                m.weight.data.normal_(0.0, 1e-2)
                m.bias.data.uniform_(-0.1, 0.1)

    def forward(self, x, action):  # pylint: disable=arguments-differ
        x = self.act(self.state_input(x))
        action = self.act(self.action_input(action))
        _input = torch.cat([x, action], dim=1)
        value = self.layers(_input)
        return value
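
# DDPG agent: actor/critic plus slowly updated target copies, a prioritized replay buffer,
# and either Ornstein-Uhlenbeck action noise or NoisyLinear parameter noise for exploration.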
class DDPG(object):

    def __init__(self, n_states, n_actions, opt=None, ouprocess=True, mean_var_path=None,
                 supervised=False):
        """ DDPG Algorithms
        Args:
            n_states: int, dimension of states
            n_actions: int, dimension of actions
            opt: dict, hyperparameters (learning rates, gamma, tau, batch size, memory size)
            ouprocess: bool, use Ornstein-Uhlenbeck action noise instead of a noisy actor
            mean_var_path: str, path to a pickled (mean, variance) pair for state normalization
            supervised: bool, pre-train the actor with supervised learning
        """
        self.n_states = n_states
        self.n_actions = n_actions

        if opt is None:
            opt = {
                'model': '',
                'alr': 0.001,
                'clr': 0.001,
                'gamma': 0.9,
                'batch_size': 32,
                'tau': 0.002,
                'memory_size': 100000
            }

        # Params
        self.alr = opt['alr']
        self.clr = opt['clr']
        self.model_name = opt['model']
        self.batch_size = opt['batch_size']
        self.gamma = opt['gamma']
        self.tau = opt['tau']
        self.ouprocess = ouprocess

        if mean_var_path is None:
            mean = np.zeros(n_states)
            var = np.zeros(n_states)
        elif not os.path.exists(mean_var_path):
            mean = np.zeros(n_states)
            var = np.zeros(n_states)
        else:
            with open(mean_var_path, 'rb') as f:
                mean, var = pickle.load(f)

        self.normalizer = Normalizer(mean, var)

        if supervised:
            self._build_actor()
            LOG.info("Supervised Learning Initialized")
        else:
            # Build Network
            self._build_network()
            LOG.info('Finish Initializing Networks')

        self.replay_memory = PrioritizedReplayMemory(capacity=opt['memory_size'])
        self.noise = OUProcess(n_actions)
        # LOG.info('DDPG Initialized!')

    @staticmethod
    def totensor(x):
        return Variable(torch.FloatTensor(x))
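
    # Network construction: when OU action noise is enabled the actor uses a plain linear
    # output layer; otherwise it gets a NoisyLinear output layer (parameter noise).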
    def _build_actor(self):
        if self.ouprocess:
            noisy = False
        else:
            noisy = True
        self.actor = Actor(self.n_states, self.n_actions, noisy=noisy)
        self.actor_criterion = nn.MSELoss()
        self.actor_optimizer = optimizer.Adam(lr=self.alr, params=self.actor.parameters())

    def _build_network(self):
        if self.ouprocess:
            noisy = False
        else:
            noisy = True
        self.actor = Actor(self.n_states, self.n_actions, noisy=noisy)
        self.target_actor = Actor(self.n_states, self.n_actions)
        self.critic = Critic(self.n_states, self.n_actions)
        self.target_critic = Critic(self.n_states, self.n_actions)

        # if model params are provided, load them
        if len(self.model_name):
            self.load_model(model_name=self.model_name)
            LOG.info("Loading model from file: %s", self.model_name)

        # Copy actor's parameters
        self._update_target(self.target_actor, self.actor, tau=1.0)
        # Copy critic's parameters
        self._update_target(self.target_critic, self.critic, tau=1.0)

        self.loss_criterion = nn.MSELoss()
        self.actor_optimizer = optimizer.Adam(lr=self.alr, params=self.actor.parameters(),
                                              weight_decay=1e-5)
        self.critic_optimizer = optimizer.Adam(lr=self.clr, params=self.critic.parameters(),
                                               weight_decay=1e-5)

    @staticmethod
    def _update_target(target, source, tau):
        # Soft update: target <- (1 - tau) * target + tau * source
        for (target_param, param) in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(
                target_param.data * (1 - tau) + param.data * tau
            )
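
    # Each incoming transition is scored by its temporal-difference error
    # |Q(s, a) - (r + gamma * Q'(s', mu'(s')))|, which the prioritized replay
    # memory uses as the sampling priority.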
    def reset(self, sigma):
        self.noise.reset(sigma)

    def _sample_batch(self):
        batch, idx = self.replay_memory.sample(self.batch_size)
        # batch = self.replay_memory.sample(self.batch_size)
        states = list(map(lambda x: x[0].tolist(), batch))  # pylint: disable=W0141
        next_states = list(map(lambda x: x[3].tolist(), batch))  # pylint: disable=W0141
        actions = list(map(lambda x: x[1].tolist(), batch))  # pylint: disable=W0141
        rewards = list(map(lambda x: x[2], batch))  # pylint: disable=W0141
        terminates = list(map(lambda x: x[4], batch))  # pylint: disable=W0141
        return idx, states, next_states, actions, rewards, terminates

    def add_sample(self, state, action, reward, next_state, terminate):
        self.critic.eval()
        self.actor.eval()
        self.target_critic.eval()
        self.target_actor.eval()
        batch_state = self.normalizer([state.tolist()])
        batch_next_state = self.normalizer([next_state.tolist()])
        current_value = self.critic(batch_state, self.totensor([action.tolist()]))
        target_action = self.target_actor(batch_next_state)
        target_value = self.totensor([reward]) \
            + self.totensor([0 if x else 1 for x in [terminate]]) \
            * self.target_critic(batch_next_state, target_action) * self.gamma
        error = float(torch.abs(current_value - target_value).data.numpy()[0])

        self.target_actor.train()
        self.actor.train()
        self.critic.train()
        self.target_critic.train()
        self.replay_memory.add(error, (state, action, reward, next_state, terminate))
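
    # One DDPG training step: sample a prioritized batch, refresh its priorities with the
    # new TD errors, regress the critic toward the Bellman target, update the actor by
    # ascending the critic's value estimate, then soft-update both target networks.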
    def update(self):
        idxs, states, next_states, actions, rewards, terminates = self._sample_batch()
        batch_states = self.normalizer(states)
        batch_next_states = self.normalizer(next_states)
        batch_actions = self.totensor(actions)
        batch_rewards = self.totensor(rewards)
        mask = [0 if x else 1 for x in terminates]
        mask = self.totensor(mask)

        target_next_actions = self.target_actor(batch_next_states).detach()
        target_next_value = self.target_critic(batch_next_states, target_next_actions).detach()
        current_value = self.critic(batch_states, batch_actions)
        # TODO (dongshen): This clause is the original clause, but it has some mistakes
        # next_value = batch_rewards + mask * target_next_value * self.gamma
        # Since terminate is always false, I remove the mask here.
        next_value = batch_rewards + target_next_value * self.gamma

        # Update Critic
        # update prioritized memory
        error = torch.abs(current_value - next_value).data.numpy()
        for i in range(self.batch_size):
            idx = idxs[i]
            self.replay_memory.update(idx, error[i][0])
        loss = self.loss_criterion(current_value, next_value)
        self.critic_optimizer.zero_grad()
        loss.backward()
        self.critic_optimizer.step()

        # Update Actor
        self.critic.eval()
        policy_loss = -self.critic(batch_states, self.actor(batch_states))
        policy_loss = policy_loss.mean()
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()
        self.critic.train()

        self._update_target(self.target_critic, self.critic, tau=self.tau)
        self._update_target(self.target_actor, self.actor, tau=self.tau)

        return loss.data, policy_loss.data
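
    # Exploration at action-selection time: with ouprocess=True, Ornstein-Uhlenbeck noise is
    # added to the deterministic policy output before clipping back into [0, 1].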
    def choose_action(self, x):
        """ Select Action according to the current state
        Args:
            x: np.array, current state
        """
        self.actor.eval()
        act = self.actor(self.normalizer([x.tolist()])).squeeze(0)
        self.actor.train()
        action = act.data.numpy()
        if self.ouprocess:
            action += self.noise.noise()
        return action.clip(0, 1)

    def sample_noise(self):
        self.actor.sample_noise()
    def load_model(self, model_name):
        """ Load torch models from files
        Args:
            model_name: str, model path prefix; reads {model_name}_actor.pth
                and {model_name}_critic.pth
        """
        self.actor.load_state_dict(
            torch.load('{}_actor.pth'.format(model_name))
        )
        self.critic.load_state_dict(
            torch.load('{}_critic.pth'.format(model_name))
        )

    def save_model(self, model_name):
        """ Save torch models to files
        Args:
            model_name: str, model path prefix; writes {model_name}_actor.pth
                and {model_name}_critic.pth
        """
        torch.save(
            self.actor.state_dict(),
            '{}_actor.pth'.format(model_name)
        )
        torch.save(
            self.critic.state_dict(),
            '{}_critic.pth'.format(model_name)
        )

    def save_actor(self, path):
        """ Save the actor network
        Args:
            path: str, path to save
        """
        torch.save(
            self.actor.state_dict(),
            path
        )

    def load_actor(self, path):
        """ Load the actor network
        Args:
            path: str, path to load
        """
        self.actor.load_state_dict(
            torch.load(path)
        )
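
    # Supervised pre-training path (used when the agent is built with supervised=True):
    # the actor is fit directly to (state, action) pairs with an MSE loss.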
    def train_actor(self, batch_data, is_train=True):
        """ Train the actor separately with data
        Args:
            batch_data: tuple, (states, actions)
            is_train: bool
        Return:
            _loss: float, training loss
        """
        states, action = batch_data

        if is_train:
            self.actor.train()
            pred = self.actor(self.normalizer(states))
            action = self.totensor(action)
            _loss = self.actor_criterion(pred, action)
            self.actor_optimizer.zero_grad()
            _loss.backward()
            self.actor_optimizer.step()
        else:
            self.actor.eval()
            pred = self.actor(self.normalizer(states))
            action = self.totensor(action)
            _loss = self.actor_criterion(pred, action)

        # .item() replaces the legacy _loss.data[0] indexing, which fails on the
        # 0-dim loss tensors returned by recent PyTorch versions
        return _loss.item()
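

if __name__ == '__main__':
    # Minimal smoke-test sketch (not part of the original module): it drives the agent with
    # random states and rewards purely to illustrate the expected call sequence
    # (choose_action -> add_sample -> update). The dimensions below are made up.
    n_states_demo, n_actions_demo = 10, 4
    agent = DDPG(n_states_demo, n_actions_demo, ouprocess=True)
    for _ in range(64):  # collect more transitions than one batch before updating
        demo_state = np.random.rand(n_states_demo)
        demo_action = agent.choose_action(demo_state)
        demo_reward = float(np.random.rand())  # stand-in for a real reward signal
        demo_next_state = np.random.rand(n_states_demo)
        agent.add_sample(demo_state, demo_action, demo_reward, demo_next_state, False)
    critic_loss, actor_loss = agent.update()
    LOG.info('critic loss: %s, actor loss: %s', critic_loss, actor_loss)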