Source code for keras_gym.core.policy_normal

from tensorflow import keras

from ..utils import check_tensor
from ..proba_dists import NormalDist
from .base import BaseUpdateablePolicy


class GaussianPolicy(BaseUpdateablePolicy):
    """
    An :term:`updateable policy` for environments with a continuous action
    space, i.e. a :class:`Box <gym.spaces.Box>`. It models the policy
    :math:`\\pi_\\theta(a|s)` as a normal distribution with conditional
    parameters :math:`(\\mu_\\theta(s), \\sigma_\\theta(s))`.

    .. important::

        This policy requires that the ``env`` is wrapped with:

        .. code::

            env = km.wrappers.BoxToReals(env)

        This wrapper decompactifies the Box action space.

    Parameters
    ----------
    function_approximator : FunctionApproximator object

        The main :term:`function approximator`.

    update_strategy : str, optional

        The strategy for updating our policy. This typically determines the
        loss function that we use for our policy function approximator.
        Options are:

            'vanilla'
                Plain vanilla policy gradient. The corresponding (surrogate)
                loss function that we use is:

                .. math::

                    J(\\theta)\\ =\\ \\hat{\\mathbb{E}}_t\\left\\{
                        -\\mathcal{A}_t\\,\\log\\pi_\\theta(A_t|S_t)
                    \\right\\}

                where :math:`\\mathcal{A}_t=\\mathcal{A}(S_t,A_t)` is the
                advantage at time step :math:`t`.

            'ppo'
                `Proximal policy optimization
                <https://arxiv.org/abs/1707.06347>`_ uses a clipped proximal
                loss:

                .. math::

                    J(\\theta)\\ =\\ \\hat{\\mathbb{E}}_t\\left\\{
                        \\min\\Big(
                            \\rho_t(\\theta)\\,\\mathcal{A}_t\\,,\\
                            \\tilde{\\rho}_t(\\theta)\\,\\mathcal{A}_t
                        \\Big)
                    \\right\\}

                where :math:`\\rho_t(\\theta)` is the probability ratio:

                .. math::

                    \\rho_t(\\theta)\\ =\\ \\frac
                        {\\pi_\\theta(A_t|S_t)}
                        {\\pi_{\\theta_\\text{old}}(A_t|S_t)}

                and :math:`\\tilde{\\rho}_t(\\theta)` is its clipped version:

                .. math::

                    \\tilde{\\rho}_t(\\theta)\\ =\\ \\text{clip}\\big(
                        \\rho_t(\\theta), 1-\\epsilon, 1+\\epsilon\\big)

            'cross_entropy'
                Straightforward categorical cross-entropy (from logits). This
                loss function does *not* make use of the advantages
                :term:`Adv`. Instead, it minimizes the cross entropy between
                the behavior policy :math:`\\pi_b(a|s)` and the learned policy
                :math:`\\pi_\\theta(a|s)`:

                .. math::

                    J(\\theta)\\ =\\ \\hat{\\mathbb{E}}_t\\left\\{
                        -\\sum_a \\pi_b(a|S_t)\\, \\log \\pi_\\theta(a|S_t)
                    \\right\\}

    ppo_clip_eps : float, optional

        The clipping parameter :math:`\\epsilon` in the PPO clipped surrogate
        loss. This option is only applicable if ``update_strategy='ppo'``.

    entropy_beta : float, optional

        The coefficient of the entropy bonus term in the policy objective.

    """
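
    # _init_models (below) builds three pairs of predict/target models: one
    # that samples an action, one that returns the greedy action (the mean),
    # and one that returns the distribution parameters (mu, logvar). It also
    # builds the train_model whose loss is the surrogate selected by
    # ``update_strategy``.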
""" def _init_models(self): S = keras.Input( shape=self.env.observation_space.shape, dtype=self.env.observation_space.dtype, name='policy/S') A = keras.Input( shape=self.env.action_space.shape, dtype=self.env.action_space.dtype, name='policy/A') Adv = keras.Input(shape=(), dtype='float', name='policy/Adv') # forward pass X = self.function_approximator.body(S) mu, logvar = self.function_approximator.head_pi(X) check_tensor(mu, ndim=2, axis_size=self.actions_ndim, axis=1) check_tensor(logvar, same_as=mu) # special layers A_sample = keras.layers.Lambda( lambda args: NormalDist(*args).sample())([mu, logvar]) A_greedy = mu # output models self.predict_model = keras.Model(S, A_sample) self.target_model = self._create_target_model(self.predict_model) self.predict_greedy_model = keras.Model(S, A_greedy) self.target_greedy_model = self._create_target_model( self.predict_greedy_model) self.predict_param_model = keras.Model(S, [mu, logvar]) self.target_param_model = self._create_target_model( self.predict_param_model) # loss and target tensor (depends on self.update_strategy) self.dist = NormalDist(mu=mu, logvar=logvar) self.target_dist = NormalDist(*self.target_param_model(S)) loss, metrics = self.policy_loss_with_metrics(Adv, A) check_tensor(loss, ndim=0) # models self.train_model = keras.Model([S, A, Adv], loss) self.train_model.add_loss(loss) for name, metric in metrics.items(): self.train_model.add_metric(metric, name=name, aggregation='mean') self.train_model.compile( optimizer=self.function_approximator.optimizer)