Source code for keras_gym.caching.experience_replay

import numpy as np

from ..base.mixins import RandomStateMixin, ActionSpaceMixin
from ..base.errors import NumpyArrayCheckError, InsufficientCacheError
from ..utils import check_numpy_array, get_env_attr


# Public names exported by this module (used by ``from ... import *``).
__all__ = (
    'ExperienceReplayBuffer',
)


class ExperienceReplayBuffer(RandomStateMixin, ActionSpaceMixin):
    """
    A simple numpy implementation of an experience replay buffer. This is
    written primarily with computer game environments (Atari) in mind.

    It implements a generic experience replay buffer for environments in which
    individual observations (frames) are stacked to represent the state.

    Only the last frame of each state is stored; stacked states are rebuilt
    from consecutive stored frames when sampling.

    Parameters
    ----------
    env : gym environment

        The main gym environment. This is needed to infer the number of
        stacked frames ``num_frames`` as well as the number of actions
        ``num_actions``.

    capacity : positive int

        The capacity of the experience replay buffer. DQN typically uses
        ``capacity=1000000``.

    batch_size : positive int, optional

        The desired batch size of the sample.

    bootstrap_n : positive int

        The number of steps over which to delay bootstrapping, i.e. n-step
        bootstrapping.

    gamma : float between 0 and 1

        Reward discount factor.

    random_seed : int or None

        To get reproducible results.

    """
    def __init__(
            self, env, capacity,
            batch_size=32,
            bootstrap_n=1,
            gamma=0.99,
            random_seed=None):

        self.env = env
        self.capacity = int(capacity)
        self.batch_size = int(batch_size)
        self.num_frames = get_env_attr(env, 'num_frames', 1)
        self.bootstrap_n = int(bootstrap_n)
        self.gamma = float(gamma)
        self.random_seed = random_seed

        # internal
        self._initialized = False
        # Counters are defined up front so that len(), bool() and clear()
        # are usable before the first call to add(); the storage arrays
        # themselves are allocated lazily in _init_cache().
        self._i = 0
        self._num_transitions = 0

    @classmethod
    def from_value_function(cls, value_function, capacity, batch_size=32):
        """
        Create a new instance by extracting some settings from a Q-function.

        The settings that are extracted from the value function are:
        ``gamma``, ``bootstrap_n`` and ``num_frames``. The latter is taken from
        the value function's ``env`` attribute.

        Parameters
        ----------
        value_function : value-function object

            A state value function or a state-action value function.

        capacity : positive int

            The capacity of the experience replay buffer. DQN typically uses
            ``capacity=1000000``.

        batch_size : positive int, optional

            The desired batch size of the sample.

        Returns
        -------
        experience_replay_buffer

            A new instance.

        """
        return cls(
            env=value_function.env,
            capacity=capacity,
            batch_size=batch_size,
            gamma=value_function.gamma,
            bootstrap_n=value_function.bootstrap_n)

    def add(self, s, a, r, done, episode_id):
        """
        Add a transition to the experience replay buffer.

        Parameters
        ----------
        s : state

            A single state observation.

        a : action

            A single action.

        r : float

            The observed rewards associated with this transition.

        done : bool

            Whether the episode has finished.

        episode_id : int

            The episode in which the transition took place. This is needed for
            generating consistent samples. The sampler compares ids with
            ``<`` and ``>``, so ids are presumably expected to increase from
            one episode to the next.

        """
        s = self._extract_last_frame(s)

        if not self._initialized:
            # The first transition determines the shapes/dtypes of the cache.
            self._s_dtype = s.dtype
            self._s_shape = s.shape
            if self.action_space_is_discrete:
                self._a_shape = (self.num_actions,)  # we do one-hot encoding
                self._a_dtype = 'float'
            else:
                self._a_shape = self.env.action_space.shape
                self._a_dtype = self.env.action_space.dtype
            self._init_cache()

        if self.action_space_is_discrete:
            a = self._one_hot_encode_discrete(a)

        self._s[self._i] = s
        self._a[self._i] = a
        self._r[self._i] = r
        self._d[self._i] = done
        self._e[self._i] = episode_id

        # The cache is a ring of ``capacity + bootstrap_n`` slots: advance the
        # write cursor with wrap-around and grow the counter until it is full.
        size = self.capacity + self.bootstrap_n
        self._i = (self._i + 1) % size
        if self._num_transitions < size:
            self._num_transitions += 1

    def sample(self):
        """
        Get a batch of transitions to be used for bootstrapped updates.

        Returns
        -------
        S, A, Rn, In, S_next, A_next : tuple of arrays

            The returned tuple represents a batch of preprocessed transitions:

                (:term:`S`, :term:`A`, :term:`Rn`, :term:`In`, :term:`S_next`, :term:`A_next`)

            These are typically used for bootstrapped updates, e.g. minimizing
            the bootstrapped MSE:

            .. math::

                \\left(
                    R^{(n)}_t
                    + I^{(n)}_t\\,\\sum_aP(a|S_{t+n})\\,Q(S_{t+n},a)
                    - \\sum_aP(a|S_t)\\,Q(S_t,a) \\right)^2

        Raises
        ------
        InsufficientCacheError

            If fewer than ``batch_size`` usable transitions are cached.

        RuntimeError

            If no full batch of valid transitions could be drawn within
            ``10 * batch_size`` attempts.

        """  # noqa: E501
        if not self._initialized or len(self) < self.batch_size:
            raise InsufficientCacheError(
                "insufficient cached data to sample from")

        S = []
        A = []
        Rn = []
        In = []
        S_next = []
        A_next = []

        # loop-invariant quantities
        size = self.capacity + self.bootstrap_n
        J = len(self) - self.num_frames
        # permutation to transpose 'num_frames' axis to axis=-1
        perm = np.roll(np.arange(self._s.ndim), -1)

        for _ in range(10 * self.batch_size):
            # js are the S indices and ks are the S_next indices
            assert J > 0, "please insert more transitions before sampling"
            js = self.random.randint(J) + np.arange(self.num_frames)
            ks = js + self.bootstrap_n
            ls = np.arange(js[-1], ks[-1])

            # wrap around
            js %= size
            ks %= size
            ls %= size

            # check if S indices are all from the same episode
            ep = self._e[js[-1]]
            if any(self._e[j] > ep for j in js[:-1]):
                # Check if all js are from the current episode or from the
                # immediately preceding episodes. Otherwise, we would generate
                # spurious data because it would probably mean that 'js' spans
                # the overwrite-boundary.
                continue
            for i, j in reversed(list(enumerate(js[:-1]))):
                # if j is from a previous episode, replace it by its successor
                if self._e[j] < ep:
                    js[i] = js[i + 1]

            # gather partial returns
            rn = np.zeros(1)
            done = False
            for t, idx in enumerate(ls):
                rn[0] += pow(self.gamma, t) * self._r[idx]
                done = self._d[idx]
                if done:
                    break

            # Unless the episode terminated within the n steps, the S_next
            # frames must belong to the same episode as S.
            if not done and any(self._e[k] != ep for k in ks):
                continue

            S.append(self._s[js].transpose(perm))
            A.append(self._a[js[-1:]])
            Rn.append(rn)
            S_next.append(self._s[ks].transpose(perm))
            A_next.append(self._a[ks[-1:]])
            if done:
                # no bootstrapping past a terminal transition
                In.append(np.zeros(1))
            else:
                In.append(np.power([self.gamma], self.bootstrap_n))

            if len(S) == self.batch_size:
                break

        if len(S) < self.batch_size:
            raise RuntimeError("couldn't construct valid sample")

        S = np.stack(S, axis=0)
        A = np.concatenate(A, axis=0)
        Rn = np.concatenate(Rn, axis=0)
        In = np.concatenate(In, axis=0)
        S_next = np.stack(S_next, axis=0)
        A_next = np.concatenate(A_next, axis=0)

        if self.num_frames == 1:
            # drop the trailing frame axis when states are not stacked
            S = np.squeeze(S, axis=-1)
            S_next = np.squeeze(S_next, axis=-1)

        return S, A, Rn, In, S_next, A_next

    def clear(self):
        """
        Clear the experience replay buffer.

        Only the write cursor and the transition counter are reset; the
        underlying arrays are kept and overwritten by subsequent calls to
        :func:`add`.

        """
        self._i = 0
        self._num_transitions = 0

    def __len__(self):
        # The last ``bootstrap_n`` stored transitions are not counted.
        return max(0, self._num_transitions - self.bootstrap_n)

    def __bool__(self):
        return bool(len(self))

    def _init_cache(self):
        """ Allocate the storage arrays and reset the counters. """
        self._i = 0
        self._num_transitions = 0

        # construct appropriate shapes
        n = (self.capacity + self.bootstrap_n,)

        # create cache attrs
        self._s = np.empty(n + self._s_shape, self._s_dtype)  # frames
        self._a = np.zeros(n + self._a_shape, self._a_dtype)  # actions
        self._r = np.zeros(n, 'float')                        # rewards
        self._d = np.zeros(n, 'bool')                         # done?
        self._e = np.zeros(n, 'int32')                        # episode id
        self._initialized = True

    def _extract_last_frame(self, s):
        """
        Return the most recent frame of a (possibly frame-stacked) state.

        When ``num_frames == 1`` the state is returned unchanged. Otherwise
        the state must be a numpy array of rank 3 or 4 whose last axis has
        size ``num_frames``; the slice at index ``-1`` of that axis is
        returned.

        Raises
        ------
        NumpyArrayCheckError

            If the state has a rank other than 3 or 4.

        """
        if self.num_frames == 1:
            return s

        check_numpy_array(s, axis_size=self.num_frames, axis=-1)
        if s.ndim not in (3, 4):
            # The original built this exception without raising it, which let
            # states of unsupported rank pass through silently.
            raise NumpyArrayCheckError(
                "expected ndim equal to 3 or 4, got shape: {}".format(s.shape))

        return s[..., -1]