import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K

from ..utils import (
    check_numpy_array, check_tensor, is_vfunction, is_qfunction, is_policy)
from ..base.mixins import ActionSpaceMixin
from ..base.errors import ActionSpaceError
from ..policies.base import BasePolicy

from .base import BaseFunctionApproximator
from .value_v import V
from .value_q import QTypeI, QTypeII

# Public API of this module.
# NOTE(review): the original tuple literal was truncated; the entries below
# were reconstructed from the classes defined in this file -- confirm.
__all__ = (
    'ActorCritic',
    'SoftActorCritic',
)

class BaseActorCritic(BasePolicy, BaseFunctionApproximator, ActionSpaceMixin):
    """
    Shared behavior for actor-critic objects.

    The methods below read ``self.policy`` (the actor) and ``self.v_func``
    (the critic); subclasses are responsible for setting both attributes
    before any of these methods are called.

    """
    @property
    def env(self):
        """ The environment, taken from the underlying policy. """
        return self.policy.env

    def __call__(self, s):
        """
        Draw an action from the current policy :math:`\\pi(a|s)` and get the
        expected value :math:`v(s)`.

        Parameters
        ----------
        s : state observation

            A single state observation.

        Returns
        -------
        a, v : tuple (1d array of floats, float)

            Returns a pair representing :math:`(a, v(s))`.

        """
        return self.policy(s), self.v_func(s)

    def dist_params(self, s):
        """
        Get the distribution parameters under the current policy
        :math:`\\pi(a|s)` and get the expected value :math:`v(s)`.

        Parameters
        ----------
        s : state observation

            A single state observation.

        Returns
        -------
        dist_params, v : tuple (1d array of floats, float)

            Returns a pair representing the distribution parameters of
            :math:`\\pi(a|s)` and the estimated state value :math:`v(s)`.

        """
        return self.policy.dist_params(s), self.v_func(s)

    def batch_eval(self, S, use_target_model=False):
        """
        Evaluate the actor-critic on a batch of state observations.

        Parameters
        ----------
        S : nd array, shape: [batch_size, ...]

            A batch of state observations.

        use_target_model : bool, optional

            Whether to use the :term:`target_model` internally. If False
            (default), the :term:`predict_model` is used.

        Returns
        -------
        A, V : arrays, shapes: [batch_size, ...] and [batch_size]

            A batch of sampled actions :term:`A` and state values :term:`V`.

        """
        A = self.policy.batch_eval(S, use_target_model=use_target_model)
        V = self.v_func.batch_eval(
            S, use_target_model=use_target_model)
        return A, V

    def greedy(self, s):
        """
        Draw a greedy action :math:`a=\\arg\\max_{a'}\\pi(a'|s)` and get the
        expected value :math:`v(s)`.

        Parameters
        ----------
        s : state observation

            A single state observation.

        Returns
        -------
        a, v : tuple (1d array of floats, float)

            Returns a pair representing :math:`(a, v(s))`.

        """
        return self.policy.greedy(s), self.v_func(s)

    def update(self, s, a, r, done):
        """
        Update both actor and critic.

        The transition is added to the cache of ``v_func``; the cache is
        then drained, with every popped batch passed on to
        :func:`batch_update`.

        Parameters
        ----------
        s : state observation

            A single state observation.

        a : action

            A single action.

        r : float

            A single observed reward.

        done : bool

            Whether the episode has finished.

        """
        assert self.env.observation_space.contains(s)
        self.v_func._cache.add(s, a, r, done)

        # eager updates
        while self.v_func._cache:
            # pop with batch_size=1
            # NOTE(review): the loop body was missing from the recovered
            # source; this assumes that ``_cache.pop()`` returns the tuple
            # ``(S, A, Rn, In, S_next, A_next)`` -- confirm.
            self.batch_update(*self.v_func._cache.pop())

    def batch_update(self, S, A, Rn, In, S_next, A_next=None):
        """
        Update both actor and critic on a batch of transitions.

        Parameters
        ----------
        S : nd array, shape: [batch_size, ...]

            A batch of state observations.

        A : 2d array, dtype: float, shape: [batch_size, ...]

            A batch of actions taken. The size of the second axis must be
            ``num_actions`` for discrete action spaces and ``actions_ndim``
            for box action spaces.

        Rn : 1d array, dtype: float, shape: [batch_size]

            A batch of partial returns. For example, in n-step bootstrapping
            this is given by:

            .. math::

                R^{(n)}_t\\ =\\ R_t + \\gamma\\,R_{t+1} + \\dots
                    + \\gamma^{n-1}\\,R_{t+n-1}

            In other words, it's the non-bootstrapped part of the n-step
            return.

        In : 1d array, dtype: float, shape: [batch_size]

            A batch bootstrapping factor. For instance, in n-step bootstrapping
            this is given by :math:`I^{(n)}_t=\\gamma^n` if the episode is
            ongoing and :math:`I^{(n)}_t=0` otherwise. This allows us to write
            the bootstrapped target as
            :math:`G^{(n)}_t=R^{(n)}_t+I^{(n)}_t\\,v(S_{t+n})`.

        S_next : nd array, shape: [batch_size, ...]

            A batch of next-state observations.

        A_next : 2d array, shape: [batch_size, ...], optional

            A batch of (potential) next actions :term:`A_next`. This argument
            is not used by this implementation; it is accepted to keep the
            signature uniform.

        Returns
        -------
        losses : dict

            A dict of losses/metrics, of type ``{name <str>: value <float>}``.

        Raises
        ------
        ActionSpaceError

            If the action space is neither discrete nor a box.

        """
        # bootstrapped target: G = Rn + In * v(S_next)
        use_target_model = self.v_func.bootstrap_with_target_model
        V_next = self.v_func.batch_eval(S_next, use_target_model)
        G = Rn + In * V_next

        # check shapes / dtypes
        check_numpy_array(G, ndim=1, dtype='float')
        if self.action_space_is_discrete:
            check_numpy_array(
                A, ndim=2, dtype=('float32', 'float64'),
                axis_size=self.num_actions, axis=1)
        elif self.action_space_is_box:
            check_numpy_array(
                A, ndim=2, dtype=('float32', 'float64'),
                axis_size=self.actions_ndim, axis=1)
        else:
            raise ActionSpaceError.feature_request(self.env)

        losses = self._train_on_batch([S, A, G])
        return losses

    def sync_target_model(self, tau=1.0):
        """
        Synchronize the target models of both the policy and the value
        function.

        Parameters
        ----------
        tau : float, optional

            Passed on unchanged to the ``sync_target_model`` methods of
            ``policy`` and ``v_func``.

        """
        # NOTE(review): the body was missing from the recovered source; it is
        # reconstructed here as a plain delegation to actor and critic --
        # confirm against the upstream implementation.
        self.policy.sync_target_model(tau=tau)
        self.v_func.sync_target_model(tau=tau)

class ActorCritic(BaseActorCritic):
    """
    A generic actor-critic, combining an :term:`updateable policy` with a
    :term:`value function <state value function>`.

    The added value of using an :class:`ActorCritic` to combine a policy with
    a value function is that it avoids having to feed in :term:`S`
    (potentially very large) three times at training time. Instead, it only
    feeds it in once.

    Parameters
    ----------
    policy : Policy object

        An :term:`updateable policy`.

    v_func : value-function object

        A :term:`state value function` :math:`v(s)`.

    value_loss_weight : float, optional

        Relative weight to give to the value-function loss:

        .. code:: python

            loss = policy_loss + value_loss_weight * value_loss

    """
    def __init__(self, policy, v_func, value_loss_weight=1.0):
        self.policy = policy
        self.v_func = v_func
        self.value_loss_weight = value_loss_weight
        self._check_function_types()
        self._init_models()

    @classmethod
    def from_func(
            cls, function_approximator,
            gamma=0.9,
            bootstrap_n=1,
            bootstrap_with_target_model=False,
            entropy_beta=0.01,
            update_strategy='vanilla',
            random_seed=None):
        """
        Create instance directly from a :class:`FunctionApproximator
        <keras_gym.FunctionApproximator>` object.

        Parameters
        ----------
        function_approximator : FunctionApproximator object

            The main :term:`function approximator`.

        gamma : float, optional

            The discount factor for discounting future rewards.

        bootstrap_n : positive int, optional

            The number of steps in n-step bootstrapping. It specifies the
            number of steps over which we're willing to delay bootstrapping.
            Large :math:`n` corresponds to Monte Carlo updates and
            :math:`n=1` corresponds to TD(0).

        bootstrap_with_target_model : bool, optional

            Whether to use the :term:`target_model` when constructing a
            bootstrapped target. If False (default), the primary
            :term:`predict_model` is used.

        entropy_beta : float, optional

            The coefficient of the entropy bonus term in the policy objective.

        update_strategy : str, callable, optional

            The strategy for updating our policy. This determines the loss
            function that we use for our policy function approximator. If you
            wish to use a custom policy loss, you can override the
            :func:`policy_loss_with_metrics` method.

            Provided options are:

                'vanilla'
                    Plain vanilla policy gradient. The corresponding
                    (surrogate) loss function that we use is:

                    .. math::

                        J(\\theta)\\ =\\
                            -\\mathcal{A}(s,a)\\,\\ln\\pi(a|s,\\theta)

                'ppo'
                    `Proximal policy optimization
                    <https://arxiv.org/abs/1707.06347>`_ uses a clipped
                    proximal loss:

                    .. math::

                        J(\\theta)\\ =\\ \\min\\Big(
                            r(\\theta)\\,\\mathcal{A}(s,a)\\,,\\
                            \\text{clip}\\big(
                                r(\\theta), 1-\\epsilon, 1+\\epsilon\\big)
                                    \\,\\mathcal{A}(s,a)\\Big)

                    where :math:`r(\\theta)` is the probability ratio:

                    .. math::

                        r(\\theta)\\ =\\ \\frac
                            {\\pi(a|s,\\theta)}
                            {\\pi(a|s,\\theta_\\text{old})}

                'cross_entropy'
                    Straightforward categorical cross-entropy (from logits).
                    This loss function does *not* make use of the advantages
                    :term:`Adv`. Instead, it minimizes the cross entropy
                    between the behavior policy :math:`\\pi_b(a|s)` and the
                    learned policy :math:`\\pi_\\theta(a|s)`:

                    .. math::

                        J(\\theta)\\ =\\ \\hat{\\mathbb{E}}_t\\left\\{
                            -\\sum_a \\pi_b(a|S_t)\\,
                                \\log \\pi_\\theta(a|S_t)
                        \\right\\}

        random_seed : int, optional

            Sets the random state to get reproducible results.

        Returns
        -------
        actor_critic : ActorCritic

            A new instance of ``cls`` built from a freshly created policy
            and state value function.

        """
        func = function_approximator  # just an abbreviation
        policy_cls = func._get_policy_class()
        # forward 'update_strategy'; it used to be accepted but silently
        # dropped, which left the policy on its own default strategy
        pi = policy_cls(
            func,
            update_strategy=update_strategy,
            entropy_beta=entropy_beta,
            random_seed=random_seed)
        v = V(
            func, gamma=gamma, bootstrap_n=bootstrap_n,
            bootstrap_with_target_model=bootstrap_with_target_model)
        return cls(pi, v)

    def _check_function_types(self):
        """
        Validate the types of ``policy`` and ``v_func``.

        Raises
        ------
        NotImplementedError

            If ``v_func`` is a q-function.

        TypeError

            If ``v_func`` is not a state value function or if ``policy`` is
            not an updateable policy.

        ValueError

            If ``policy`` and ``v_func`` were created for different
            environments.

        """
        if not is_vfunction(self.v_func):
            if is_qfunction(self.v_func):
                raise NotImplementedError(
                    "ActorCritic hasn't been yet implemented for Q-functions, "
                    "please let me know is you need this; for the time being, "
                    "please use V-function instead.")
            # anything else used to slip through unnoticed and only failed
            # later on, while the joint model was being built
            raise TypeError("expected a state value function for 'v_func'")
        if not is_policy(self.policy, check_updateable=True):
            raise TypeError("expected an updateable policy")
        if self.policy.env != self.v_func.env:
            raise ValueError(
                "the envs of policy and v_func do not match")

    def _init_models(self):
        """
        Build and compile the joint ``train_model``.

        The joint model takes ``[S, A, G]`` as inputs and carries the
        combined loss ``policy_loss + value_loss_weight * value_loss``. It
        is compiled with the optimizer of the policy's ``train_model``.

        """
        # inputs
        S, A = self.policy.train_model.inputs[:2]
        G = keras.Input(name='G', shape=(1,), dtype='float')

        # get TD advantages; the gradient is stopped so that the policy loss
        # does not back-propagate into the value function through Adv
        V_pred = self.v_func.predict_model(S)
        Adv = K.stop_gradient(G - V_pred)

        # update loss with advantage coming directly from graph
        policy_loss, metrics = self.policy.policy_loss_with_metrics(Adv, A)
        value_loss = self.v_func.train_model([S, G])
        metrics['policy/loss'] = policy_loss
        metrics['value/loss'] = value_loss
        loss = policy_loss + self.value_loss_weight * value_loss

        # joint model
        self.train_model = keras.Model([S, A, G], loss)
        self.train_model.add_loss(loss)
        for name, metric in metrics.items():
            self.train_model.add_metric(metric, name=name, aggregation='mean')
        self.train_model.compile(optimizer=self.policy.train_model.optimizer)
class SoftActorCritic(BaseActorCritic):
    """
    Implementation of a `soft actor-critic
    <https://arxiv.org/abs/1801.01290>`_ (SAC), which uses entropy
    regularization in the value function as well as in its policy updates.

    Parameters
    ----------
    policy : a policy object

        An :term:`updateable policy` object :math:`\\pi(a|s)`.

    v_func : v-function object

        A :term:`state value function` :math:`v(s)`. This is used as the
        entropy-regularized value function (critic).

    q_func1 : q-function object

        A :term:`type-I state-action value function`. This is used as the
        target for both the policy (actor) and the state value function
        (critic).

    q_func2 : q-function object

        Same as ``q_func1``. SAC uses two q-functions to avoid overfitting due
        to overly optimistic value estimates.

    value_loss_weight : float, optional

        Relative weight to give to the value-function loss:

        .. code:: python

            loss = policy_loss + value_loss_weight * value_loss

    """
    def __init__(
            self, policy, v_func, q_func1, q_func2,
            value_loss_weight=1.0):

        self.policy = policy
        self.v_func = v_func
        self.q_func1 = q_func1
        self.q_func2 = q_func2
        self.value_loss_weight = value_loss_weight
        self._check_function_types()
        self._init_models()

    @classmethod
    def from_func(
            cls, function_approximator,
            gamma=0.9,
            bootstrap_n=1,
            q_type=None,
            entropy_beta=0.01,
            random_seed=None):
        """
        Create instance directly from a :class:`FunctionApproximator
        <keras_gym.FunctionApproximator>` object.

        Parameters
        ----------
        function_approximator : FunctionApproximator object

            The main :term:`function approximator`.

        gamma : float, optional

            The discount factor for discounting future rewards.

        bootstrap_n : positive int, optional

            The number of steps in n-step bootstrapping. It specifies the
            number of steps over which we're willing to delay bootstrapping.
            Large :math:`n` corresponds to Monte Carlo updates and
            :math:`n=1` corresponds to TD(0).

        q_type : 1 or 2, optional

            Whether to model the q-function as :term:`type-I
            <type-I state-action value function>` or :term:`type-II
            <type-II state-action value function>`. This defaults to type-II
            for discrete action spaces and type-I otherwise.

        entropy_beta : float, optional

            The coefficient of the entropy bonus term in the policy objective.

        random_seed : int, optional

            Sets the random state to get reproducible results.

        Returns
        -------
        soft_actor_critic : SoftActorCritic

            A new instance of ``cls`` built from a freshly created policy,
            state value function and two q-functions.

        """
        func = function_approximator  # just an abbreviation

        if q_type is None:
            q_type = 2 if func.action_space_is_discrete else 1
        q_func_cls = QTypeII if q_type == 2 else QTypeI

        policy_cls = func._get_policy_class()
        pi = policy_cls(
            func, entropy_beta=entropy_beta, random_seed=random_seed,
            update_strategy='sac')
        v = V(
            func, gamma=gamma, bootstrap_n=bootstrap_n,
            bootstrap_with_target_model=True)
        q1 = q_func_cls(
            func, gamma=gamma, bootstrap_n=bootstrap_n)
        q2 = q_func_cls(
            func, gamma=gamma, bootstrap_n=bootstrap_n)
        return cls(pi, v, q1, q2)

    def _check_function_types(self):
        """
        Validate the types of the policy and the value functions.

        Raises
        ------
        TypeError

            If ``v_func`` is not a v-function, if ``q_func1`` or ``q_func2``
            is not a q-function, or if ``policy`` is not an updateable
            policy.

        ValueError

            If the policy and value functions do not all share the same
            environment.

        """
        if not is_vfunction(self.v_func):
            raise TypeError("'v_func' must be a v-function: v(s)")
        if not is_qfunction(self.q_func1):
            raise TypeError("'q_func1' must be a q-function: q(s,a)")
        if not is_qfunction(self.q_func2):
            raise TypeError("'q_func2' must be a q-function: q(s,a)")
        if not is_policy(self.policy, check_updateable=True):
            raise TypeError("'policy' must be an updateable policy")

        funcs = (self.policy, self.v_func, self.q_func1, self.q_func2)
        if not all(f.env == self.env for f in funcs):
            raise ValueError(
                "the envs of policy and value function(s) do not match")

    @staticmethod
    def _get_q_value(q_func, S, A):
        """
        Evaluate the target model of ``q_func`` at ``(S, A)``.

        Parameters
        ----------
        q_func : q-function object

            Either a type-I or a type-II q-function.

        S : Tensor

            A batch of state observations.

        A : Tensor

            A batch of actions. For type-II q-functions this is contracted
            against the per-action q-values over axis 1.

        Returns
        -------
        Q : 2d Tensor, shape: [batch_size, 1]

            The q-values for the given state-action pairs.

        """
        if is_qfunction(q_func, qtype=2):
            # type-II models return one value per action; select the value
            # that belongs to A by contracting over the action axis
            Q = q_func.target_model(S)
            check_tensor(Q, ndim=2, axis_size=q_func.num_actions, axis=1)
            check_tensor(A, ndim=2, axis_size=q_func.num_actions, axis=1)
            Q = tf.expand_dims(tf.einsum('ij,ij->i', Q, A), axis=1)
        else:
            Q = q_func.target_model([S, A])
        check_tensor(Q, ndim=2, axis_size=1, axis=1)
        return Q

    def _init_models(self):
        """
        Build and compile the joint ``train_model``.

        The joint model takes ``[S, A, G]`` as inputs and carries the
        combined loss ``policy_loss + value_loss_weight * value_loss``, in
        which ``value_loss`` is the average of the losses of ``v_func``,
        ``q_func1`` and ``q_func2``. It is compiled with the optimizer of
        the policy's ``train_model``.

        """
        # make sure that the policy loss is set to 'sac'
        if self.policy.update_strategy != 'sac':
            self.policy.update_strategy = 'sac'
            # 'warning' instead of the deprecated 'warn' alias
            # NOTE(review): assumes self.logger is a standard
            # logging.Logger -- confirm.
            self.logger.warning(
                "policy.update_strategy has been set to 'sac'")

        # inputs
        S, A = self.policy.train_model.inputs[:2]
        G = keras.Input(name='G', shape=(1,), dtype='float')

        # construct log(pi(a_sampled, s))
        A_sampled = self.policy.dist.sample()  # differentiable
        log_pi = self.policy.dist.log_proba(A_sampled)

        # use target models for q-values, because they're non-trainable
        Q1 = self._get_q_value(self.q_func1, S, A_sampled)
        Q2 = self._get_q_value(self.q_func2, S, A_sampled)
        Q_both = keras.layers.Concatenate()([Q1, Q2])
        check_tensor(Q_both, ndim=2, axis_size=2, axis=1)

        # construct entropy-corrected target for state value function
        Q_min = keras.layers.Lambda(lambda x: K.min(x, axis=1))(Q_both)
        V_target = K.stop_gradient(Q_min - self.policy.entropy_beta * log_pi)
        check_tensor(V_target, ndim=1)

        # compute advantages from q-function
        V_pred = self.v_func.predict_model(S)
        check_tensor(V_pred, axis_size=1, axis=1)
        V_pred = K.stop_gradient(K.squeeze(V_pred, axis=1))
        Q = keras.layers.Lambda(lambda x: K.mean(x, axis=1))(Q_both)
        Adv = Q - self.policy.entropy_beta * log_pi - V_pred

        # update loss with advantage coming directly from graph
        policy_loss, metrics = self.policy.policy_loss_with_metrics(Adv)
        v_loss = self.v_func.train_model([S, V_target])
        q_loss1 = self.q_func1.train_model([S, A, G])
        q_loss2 = self.q_func2.train_model([S, A, G])
        value_loss = (v_loss + q_loss1 + q_loss2) / 3.

        # add losses to metrics dict
        metrics.update({
            'policy/loss': policy_loss,
            'v_func/loss': v_loss,
            'q_func1/loss': q_loss1,
            'q_func2/loss': q_loss2,
            'value/loss': value_loss,
        })

        # combined loss function
        loss = policy_loss + self.value_loss_weight * value_loss
        check_tensor(loss, ndim=0)  # should be a scalar

        # joint model
        self.train_model = keras.Model([S, A, G], loss)
        self.train_model.add_loss(loss)
        for name, metric in metrics.items():
            self.train_model.add_metric(metric, name=name, aggregation='mean')
        self.train_model.compile(optimizer=self.policy.train_model.optimizer)

    def batch_update(self, S, A, Rn, In, S_next, A_next=None):
        """
        Update actor and critics on a batch of transitions, then sync the
        target models of both q-functions.

        The arguments are passed on unchanged to the ``batch_update``
        method of the base class.

        Returns
        -------
        losses : dict

            The dict of losses/metrics returned by the base-class update.

        """
        # hand the losses back to the caller; they used to be dropped here
        losses = super().batch_update(S, A, Rn, In, S_next, A_next)

        # hard sync (tau=1.0) of the q-function target models
        self.q_func1.sync_target_model(tau=1.0)
        self.q_func2.sync_target_model(tau=1.0)
        return losses