import warnings

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K

from ..utils import check_tensor
from ..base.mixins import ActionSpaceMixin
from ..base.errors import ActionSpaceError
# Public API of this module.
__all__ = ('FunctionApproximator',)
class FunctionApproximator(ActionSpaceMixin):
    """
    A generic function approximator.

    This is the central object that provides an interface between a
    gym-type environment and function approximators like :term:`value
    functions <state value function>` and :term:`updateable policies
    <updateable policy>`.

    In order to create a valid function approximator, you need to implement
    the :term:`body` method. For example, to implement a simple multi-layer
    perceptron function approximator you would do something like:

    .. code:: python

        import gym
        import keras_gym as km
        from tensorflow.keras.layers import Flatten, Dense

        class MLP(km.FunctionApproximator):
            \"\"\" multi-layer perceptron with one hidden layer \"\"\"
            def body(self, S):
                X = Flatten()(S)
                X = Dense(units=4)(X)
                return X

        # environment
        env = gym.make(...)

        # generic function approximator
        mlp = MLP(env, lr=0.001)

        # policy and value function
        pi, v = km.SoftmaxPolicy(mlp), km.V(mlp)

    The default :term:`heads <head>` are simple (multi) linear regression
    layers, which can be overridden by your own implementation.

    Parameters
    ----------
    env : environment

        A gym-style environment.

    optimizer : keras.optimizers.Optimizer, optional

        If left unspecified (``optimizer=None``), the function
        approximator's DEFAULT_OPTIMIZER is used. See `keras documentation
        <https://keras.io/optimizers/>`_ for more details.

    **optimizer_kwargs : keyword arguments

        Keyword arguments for the optimizer. This is useful when you want
        to use the default optimizer with a different setting, e.g.
        changing the learning rate.

    """
    # Subclasses may override either of these to change the default
    # optimizer class or the loss used for value-function updates.
    DEFAULT_OPTIMIZER = keras.optimizers.Adam
    VALUE_LOSS_FUNCTION = keras.losses.Huber()

    def __init__(self, env, optimizer=None, **optimizer_kwargs):
        self.env = env
        self._init_optimizer(optimizer, optimizer_kwargs)

    def head_v(self, X):
        """
        This is the :term:`state value <state value function>` head. It
        returns a scalar V-value :math:`v(s)\\in\\mathbb{R}`.

        Parameters
        ----------
        X : nd Tensor, shape: [batch_size, ...]

            ``X`` is an intermediate tensor in the full forward-pass of the
            computation graph; it's the output of the last layer of the
            :func:`body` method.

        Returns
        -------
        V : 2d Tensor, shape: [batch_size, 1]

            The output :term:`state values <V>` :math:`v(s)\\in\\mathbb{R}`.

        """
        # Zero-initialized linear head: initial value estimates are all 0.
        V = keras.layers.Dense(
            units=1,
            activation='linear',
            kernel_initializer='zeros',
            name='value/v')(X)
        return V

    def head_q1(self, X):
        """
        This is the :term:`type-I <type-I state-action value function>`
        Q-value head. It returns a scalar Q-value
        :math:`q(s,a)\\in\\mathbb{R}`.

        Parameters
        ----------
        X : nd Tensor, shape: [batch_size, ...]

            ``X`` is an intermediate tensor in the full forward-pass of the
            computation graph; it's the output of the last layer of the
            :func:`body` method.

        Returns
        -------
        Q_sa : 2d Tensor, shape: [batch_size, 1]

            The output :term:`type-I <Q_sa>` Q-values
            :math:`q(s,a)\\in\\mathbb{R}`.

        """
        Q_sa = keras.layers.Dense(
            units=1,
            activation='linear',
            kernel_initializer='zeros',
            name='value/qtype1')(X)
        return Q_sa

    def head_q2(self, X):
        """
        This is the :term:`type-II <type-II state-action value function>`
        Q-value head. It returns a vector of Q-values
        :math:`q(s,.)\\in\\mathbb{R}^n`.

        Parameters
        ----------
        X : nd Tensor, shape: [batch_size, ...]

            ``X`` is an intermediate tensor in the full forward-pass of the
            computation graph; it's the output of the last layer of the
            :func:`body` method.

        Returns
        -------
        Q_s : 2d Tensor, shape: [batch_size, num_actions]

            The output :term:`type-II <Q_s>` Q-values
            :math:`q(s,.)\\in\\mathbb{R}^n`.

        """
        # One output unit per action (requires a discrete action space).
        Q_s = keras.layers.Dense(
            units=self.num_actions,
            activation='linear',
            kernel_initializer='zeros',
            name='value/qtype2')(X)
        return Q_s

    def head_pi(self, X):
        """
        This is the policy head. It returns logits, i.e. not probabilities.
        Use a softmax to turn the output into probabilities.

        Parameters
        ----------
        X : nd Tensor, shape: [batch_size, ...]

            ``X`` is an intermediate tensor in the full forward-pass of the
            computation graph; it's the output of the last layer of the
            :func:`body` method.

        Returns
        -------
        \\*params : Tensor or tuple of Tensors, shape: [batch_size, ...]

            These constitute the raw policy distribution parameters.

        Raises
        ------
        ActionSpaceError

            If the action space is neither discrete nor a box.

        """
        if self.action_space_is_discrete:
            # Discrete actions: one logit per action.
            logits = keras.layers.Dense(
                units=self.num_actions,
                activation='linear',
                kernel_initializer='zeros',
                name='policy/logits')(X)
            return logits

        if self.action_space_is_box:
            # Continuous (box) actions: parametrize a diagonal Gaussian by
            # its mean and log-variance (log keeps the variance positive).
            mu = keras.layers.Dense(
                units=self.actions_ndim,
                activation='linear',
                kernel_initializer='zeros',
                name='policy/mu')(X)
            logvar = keras.layers.Dense(
                units=self.actions_ndim,
                activation='linear',
                kernel_initializer='zeros',
                name='policy/logvar')(X)
            return mu, logvar

        raise ActionSpaceError.feature_request(self.env)

    def body(self, S):
        """
        This is the part of the computation graph that may be shared between
        e.g. policy (actor) and value function (critic). It is typically the
        part of a neural net that does most of the heavy lifting. One may
        think of the :func:`body` as an elaborate automatic feature
        extractor.

        Parameters
        ----------
        S : nd Tensor: shape: [batch_size, ...]

            The input state observation.

        Returns
        -------
        X : nd Tensor, shape: [batch_size, ...]

            The intermediate keras tensor.

        """
        # Default body: just flatten/one-hot the raw observation; subclasses
        # are expected to override this with a real feature extractor.
        return keras.layers.Lambda(
            lambda x: self._to_vector(x, self.env.observation_space))(S)

    def body_q1(self, S, A):
        """
        This is similar to :func:`body`, except that it takes a state-action
        pair as input instead of only state observations.

        Parameters
        ----------
        S : nd Tensor: shape: [batch_size, ...]

            The input state observation.

        A : nd Tensor: shape: [batch_size, ...]

            The input actions.

        Returns
        -------
        X : nd Tensor, shape: [batch_size, ...]

            The intermediate keras tensor.

        """
        def kronecker_product(args):
            # Batched outer product of the vectorized state and action:
            # result[i, j, k] = S[i, j] * A[i, k]
            S, A = args
            S = self._to_vector(S, self.env.observation_space)
            A = self._to_vector(A, self.env.action_space)
            check_tensor(S, ndim=2, dtype=('float32', 'float64'))
            check_tensor(A, ndim=2, dtype=('float32', 'float64'))
            return tf.einsum('ij,ik->ijk', S, A)

        X = keras.layers.Lambda(kronecker_product)([S, A])
        X = keras.layers.Flatten()(X)
        # Feed the flattened interaction features through the shared body.
        return self.body(X)

    def _init_optimizer(self, optimizer, optimizer_kwargs):
        """
        Set the ``self.optimizer`` attribute.

        Parameters
        ----------
        optimizer : keras.optimizers.Optimizer or None

            An explicit optimizer instance, or ``None`` to construct
            ``self.DEFAULT_OPTIMIZER(**optimizer_kwargs)``.

        optimizer_kwargs : dict

            Keyword arguments for the default optimizer; only used when
            ``optimizer`` is ``None``.

        Raises
        ------
        ValueError

            If ``optimizer`` is neither ``None`` nor a keras optimizer.

        """
        if optimizer is None:
            self.optimizer = self.DEFAULT_OPTIMIZER(**optimizer_kwargs)
        elif isinstance(optimizer, keras.optimizers.Optimizer):
            if optimizer_kwargs:
                # bug fix: these kwargs used to be silently dropped, which
                # made e.g. ``lr=...`` a no-op whenever an explicit
                # optimizer instance was also supplied
                warnings.warn(
                    "optimizer_kwargs are ignored when an explicit "
                    "optimizer instance is provided: {}"
                    .format(sorted(optimizer_kwargs)))
            self.optimizer = optimizer
        else:
            # bug fix: the old message hard-coded Adam as the fallback, but
            # the actual fallback is the (overridable) DEFAULT_OPTIMIZER
            raise ValueError(
                "unknown optimizer, expected a keras.optimizers.Optimizer "
                "or None (which selects the class-level DEFAULT_OPTIMIZER)")

    @staticmethod
    def _to_vector(X, space):
        """
        Coerce a batch of observations or actions into a 2d tensor.

        A 1d integer tensor is treated as a batch of indices into a
        discrete space and is one-hot encoded; input with more than two
        dimensions is flattened; 2d input passes through unchanged.
        """
        if K.ndim(X) == 1 and K.dtype(X).startswith('int'):
            # NOTE(review): assumes ``space`` is discrete (has ``.n``) when
            # the input holds integer indices -- confirm with callers.
            X = K.one_hot(X, space.n)
        elif K.ndim(X) > 2:
            X = keras.layers.Flatten()(X)
        return X