62 commits
100ba3b
adds an empty HER class
prabhatnagarajan Nov 26, 2018
a43dac1
fixes flake
prabhatnagarajan Nov 26, 2018
e5d8c74
Merge branch 'master' into her
prabhatnagarajan Dec 12, 2018
e15a4e1
Merge branch 'master' into her
prabhatnagarajan Jan 16, 2019
8399f10
adds a train_her_example file
prabhatnagarajan Jan 18, 2019
98565f0
modifies phi function to remove bug
prabhatnagarajan Jan 18, 2019
4633be3
uses hindsightbuffer instead of replay buffer (hindsight is a shell c…
prabhatnagarajan Jan 18, 2019
87cfb8d
Merge branch 'master' into her
prabhatnagarajan Feb 5, 2019
6bb6990
small modifications for her
prabhatnagarajan Feb 6, 2019
0e5b063
adds current episode variable and assert that n != 1 to hindsight rep…
prabhatnagarajan Feb 8, 2019
0329358
adds part of HER transition-storing loop
prabhatnagarajan Feb 8, 2019
d65e6b4
minor changes to support future goals:
prabhatnagarajan Feb 14, 2019
0796caa
makes HindsightBuffer and EpisodicBuffer
prabhatnagarajan Feb 14, 2019
3899264
implements future sampling
prabhatnagarajan Feb 15, 2019
4a20443
updates the update frequency
prabhatnagarajan Feb 15, 2019
e65e784
changes default gamma to be 0.98, to match paper
prabhatnagarajan Feb 15, 2019
119e2ed
changes buffer size to avoid error
prabhatnagarajan Feb 18, 2019
38582aa
adds some starter code for the HER explorer
prabhatnagarajan Feb 18, 2019
5595404
implements HER exploration
prabhatnagarajan Feb 18, 2019
e353632
adds normalization to DDPG and HER
prabhatnagarajan Feb 19, 2019
07575ad
adds a clip threshold argument
prabhatnagarajan Feb 19, 2019
160831c
Merge branch 'master' into her
prabhatnagarajan Feb 20, 2019
7602139
Merge branch 'master' into her
prabhatnagarajan Feb 20, 2019
466cae5
gets rid of batch normalization option in her example
prabhatnagarajan Feb 21, 2019
26ae4e3
sets eval interval to match paper
prabhatnagarajan Feb 21, 2019
8a10d1a
adds wrapper class to get success rate
prabhatnagarajan Feb 21, 2019
17804df
makes some fixes to normalization code
prabhatnagarajan Feb 22, 2019
6fd78ba
implement clipped critic target to her
prabhatnagarajan Feb 25, 2019
4afb548
updates target update interval to match paper
prabhatnagarajan Feb 25, 2019
13aae0f
sets polyak averaging parameter
prabhatnagarajan Feb 25, 2019
67b83fc
uses chainer functions clip
prabhatnagarajan Feb 25, 2019
e325dbd
Merge branch 'master' into her
prabhatnagarajan Feb 25, 2019
fdabdcb
changes actor learning rate to match baselines and paper
prabhatnagarajan Feb 28, 2019
c8e546d
cleans up some redundant code and removes reward filter as it's not used
prabhatnagarajan Feb 28, 2019
9377ef5
removes reward scaling from env wrapper
prabhatnagarajan Feb 28, 2019
e25a320
Merge branch 'master' into her
prabhatnagarajan Mar 14, 2019
caebfa1
adds some tests and comments to HER
prabhatnagarajan Mar 20, 2019
e809d6a
fixes merge conflict
prabhatnagarajan Mar 26, 2019
7fc391d
makes a batch HER agent
prabhatnagarajan Mar 26, 2019
0d43949
Merge branch 'master' into her
prabhatnagarajan Apr 2, 2019
73dc692
merges and addresses flakes
prabhatnagarajan May 21, 2019
02fa906
Merge branch 'master' into her
prabhatnagarajan May 28, 2019
d9b110b
reverts epsilon to match original paper
prabhatnagarajan Jun 7, 2019
280571f
Merge branch 'master' into her
prabhatnagarajan Jun 7, 2019
b4b2cea
merges with master
prabhatnagarajan Jun 13, 2019
620be35
adds args to Hindsight Buffer
prabhatnagarajan Jun 13, 2019
25e563b
adds action penalty to ddpg
prabhatnagarajan Jun 14, 2019
753dc70
only switches goals if achieved goal is not none
prabhatnagarajan Jun 17, 2019
a98e20c
Merge branch 'master' into her
prabhatnagarajan Jul 1, 2019
3c7e4e0
adds observation normalization for batch training
prabhatnagarajan Jul 1, 2019
289e551
fixes incorrect naming
prabhatnagarajan Jul 1, 2019
d63beca
fixes more bugs
prabhatnagarajan Jul 1, 2019
56eb3dc
Merge branch 'master' into her
prabhatnagarajan Jul 5, 2019
db24b8e
fixes a minor bug
prabhatnagarajan Jul 5, 2019
23e8edb
merges with master, moves HER to replay_buffers directory
prabhatnagarajan Jul 18, 2019
52b538e
refactors hindsight code, addresses flakes, conforms to new master
prabhatnagarajan Jul 18, 2019
dd57c94
Merge branch 'master' into her
prabhatnagarajan Jul 18, 2019
1d102fd
adds HER to readme
prabhatnagarajan Jul 18, 2019
159b2af
changes reward structure for HER
prabhatnagarajan Jul 18, 2019
e4f7a2e
Merge branch 'master' into her
prabhatnagarajan Jul 26, 2019
a87cbce
fixes action penalty bug and actually uses action penalty
prabhatnagarajan Jul 26, 2019
2a3207a
makes default exploration match paper
prabhatnagarajan Jul 27, 2019
2 changes: 2 additions & 0 deletions README.md
@@ -86,6 +86,8 @@ Following useful techniques have been also implemented in ChainerRL:
- examples: [[Rainbow]](examples/atari/reproduction/rainbow) [[DQN/DoubleDQN/PAL]](examples/atari/train_dqn_ale.py)
- [Prioritized Experience Replay](https://arxiv.org/abs/1511.05952)
- examples: [[Rainbow]](examples/atari/reproduction/rainbow) [[DQN/DoubleDQN/PAL]](examples/atari/train_dqn_ale.py)
- [Hindsight Experience Replay](https://arxiv.org/abs/1707.01495)
- examples: [[HER]](examples/gym/train_her_gym.py)
- [Dueling Network](https://arxiv.org/abs/1511.06581)
- examples: [[Rainbow]](examples/atari/reproduction/rainbow) [[DQN/DoubleDQN/PAL]](examples/atari/train_dqn_ale.py)
- [Normalized Advantage Function](https://arxiv.org/abs/1603.00748)
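The new README entry points at examples/gym/train_her_gym.py. As a rough sketch of the core wiring that example sets up (the reward threshold and buffer capacity below are illustrative assumptions, not values taken from that script):

import numpy as np

from chainerrl.replay_buffers import HindsightReplayBuffer

def reward_fn(achieved_goal, desired_goal):
    # Sparse goal-conditioned reward in {-1, 0}; the 0.05 threshold is an assumption.
    return 0.0 if np.linalg.norm(achieved_goal - desired_goal) < 0.05 else -1.0

# future_k=4 gives the paper's "future" relabelling strategy (see hindsight.py below).
rbuf = HindsightReplayBuffer(reward_function=reward_fn,
                             capacity=10 ** 6,
                             future_k=4)
# rbuf is then handed to DDPG as its replay_buffer argument, together with the new
# obs_normalizer, clip_critic_tgt and l2_action_penalty options added in this diff.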
52 changes: 45 additions & 7 deletions chainerrl/agents/ddpg.py
@@ -75,6 +75,8 @@ class DDPG(AttributeSavingMixin, BatchAgent):
logger (Logger): Logger used
batch_states (callable): method which makes a batch of observations.
default is `chainerrl.misc.batch_states.batch_states`
clip_critic_tgt (tuple or None): tuple containing (min, max) to clip
the target of the critic. If None, the target will not be clipped.
burnin_action_func (callable or None): If not None, this callable
object is used to select actions before the model is updated
one or more times during training.
@@ -83,10 +85,11 @@ class DDPG(AttributeSavingMixin, BatchAgent):
saved_attributes = ('model',
'target_model',
'actor_optimizer',
'critic_optimizer')
'critic_optimizer',
'obs_normalizer')

def __init__(self, model, actor_optimizer, critic_optimizer, replay_buffer,
gamma, explorer,
gamma, explorer, obs_normalizer=None,
gpu=None, replay_start_size=50000,
minibatch_size=32, update_interval=1,
target_update_interval=10000,
@@ -99,14 +102,19 @@ def __init__(self, model, actor_optimizer, critic_optimizer, replay_buffer,
episodic_update_len=None,
logger=getLogger(__name__),
batch_states=batch_states,
l2_action_penalty=None,
clip_critic_tgt=None,
burnin_action_func=None,
):

self.model = model
self.obs_normalizer = obs_normalizer

if gpu is not None and gpu >= 0:
cuda.get_device(gpu).use()
self.model.to_gpu(device=gpu)
if self.obs_normalizer is not None:
self.obs_normalizer.to_gpu(device=gpu)

self.xp = self.model.xp
self.replay_buffer = replay_buffer
@@ -137,6 +145,8 @@ def __init__(self, model, actor_optimizer, critic_optimizer, replay_buffer,
update_interval=update_interval,
)
self.batch_states = batch_states
self.clip_critic_tgt = clip_critic_tgt
self.l2_action_penalty = l2_action_penalty
self.burnin_action_func = burnin_action_func

self.t = 0
@@ -204,6 +214,10 @@ def compute_critic_loss(self, batch):

target_q = batch_rewards + self.gamma * \
(1.0 - batch_terminal) * F.reshape(next_q, (batchsize,))
if self.clip_critic_tgt:
target_q = F.clip(target_q,
self.clip_critic_tgt[0],
self.clip_critic_tgt[1])

# Estimated Q-function observes s_t and a_t
predict_q = F.reshape(
@@ -251,6 +265,8 @@ def compute_actor_loss(self, batch):

# Since we want to maximize Q, loss is negation of Q
loss = - F.sum(q) / batch_size
if self.l2_action_penalty:
loss += self.l2_action_penalty * F.mean(F.square(onpolicy_actions))

# Update stats
self.average_actor_loss *= self.average_loss_decay
@@ -260,8 +276,11 @@ def compute_actor_loss(self, batch):

def update(self, experiences, errors_out=None):
"""Update the model from experiences"""

batch = batch_experiences(experiences, self.xp, self.phi, self.gamma)
if self.obs_normalizer:
batch['state'] = self.obs_normalizer(batch['state'], update=False)
batch['next_state'] = self.obs_normalizer(batch['next_state'],
update=False)
self.critic_optimizer.update(lambda: self.compute_critic_loss(batch))
self.actor_optimizer.update(lambda: self.compute_actor_loss(batch))

@@ -280,6 +299,11 @@ def update_from_episodes(self, episodes, errors_out=None):
transitions.append([ep[i]])
batch = batch_experiences(
transitions, xp=self.xp, phi=self.phi, gamma=self.gamma)
if self.obs_normalizer:
batch['state'] = self.obs_normalizer(batch['state'],
update=False)
batch['next_state'] = self.obs_normalizer(batch['next_state'],
update=False)
batches.append(batch)

with self.model.state_reset(), self.target_model.state_reset():
@@ -330,6 +354,9 @@ def act_and_train(self, obs, reward):
next_state=obs,
next_action=action,
is_state_terminal=False)
# Add to Normalizer
if self.obs_normalizer:
self.obs_normalizer.experience([obs])

self.last_state = obs
self.last_action = action
@@ -339,9 +366,10 @@ def act(self, obs):
return self.last_action

def act(self, obs):

with chainer.using_config('train', False):
s = self.batch_states([obs], self.xp, self.phi)
if self.obs_normalizer:
s = self.obs_normalizer(s, update=False)
action = self.policy(s).sample()
# Q is not needed here, but log it just for information
q = self.q_function(s, action)
@@ -363,9 +391,10 @@ def batch_act(self, batch_obs):
Returns:
Sequence of ~object: Actions.
"""

with chainer.using_config('train', False), chainer.no_backprop_mode():
batch_xs = self.batch_states(batch_obs, self.xp, self.phi)
if self.obs_normalizer:
batch_xs = self.obs_normalizer(batch_xs, update=False)
batch_action = self.policy(batch_xs).sample()
# Q is not needed here, but log it just for information
q = self.q_function(batch_xs, batch_action)
@@ -398,7 +427,12 @@ def batch_act_and_train(self, batch_obs):
self.explorer.select_action(
self.t, lambda: batch_greedy_action[i])
for i in range(len(batch_greedy_action))]

# Add to Normalizer
if self.obs_normalizer:
self.obs_normalizer.experience(
self.batch_states(batch_obs,
self.xp,
self.phi))
self.batch_last_obs = list(batch_obs)
self.batch_last_action = list(batch_action)

@@ -459,7 +493,11 @@ def stop_episode_and_train(self, state, reward, done=False):
next_state=state,
next_action=self.last_action,
is_state_terminal=done)

# Add to Normalizer
if self.obs_normalizer:
self.obs_normalizer(self.batch_states([state],
self.xp,
self.phi))
self.stop_episode()

def stop_episode(self):
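To make clip_critic_tgt concrete: with the sparse {-1, 0} rewards used for HER and the discount of 0.98 this PR adopts, discounted returns lie in [-1/(1 - gamma), 0] ≈ [-50, 0], which is the clipping range the HER paper uses for critic targets; obs_normalizer and l2_action_penalty likewise mirror the paper's observation normalization and L2 penalty on the actor's actions. A minimal sketch of the clipped target computation, with made-up numbers:

import chainer.functions as F
import numpy as np

gamma = 0.98
clip_critic_tgt = (-1.0 / (1.0 - gamma), 0.0)  # approximately (-50.0, 0.0)

# Toy batch; the values are made up purely for illustration.
batch_rewards = np.float32([-1.0, 0.0, -1.0])
batch_terminal = np.float32([0.0, 1.0, 0.0])
next_q = np.float32([-75.0, -10.0, 3.0])

target_q = batch_rewards + gamma * (1.0 - batch_terminal) * next_q
target_q = F.clip(target_q, clip_critic_tgt[0], clip_critic_tgt[1])
print(target_q.array)  # [-50.  0.  0.] -- out-of-range targets are squashed into [-50, 0]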
5 changes: 5 additions & 0 deletions chainerrl/misc/collections.py
@@ -8,6 +8,8 @@

import itertools

import numpy as np

from chainerrl.misc.random import sample_n_k


@@ -108,3 +110,6 @@ def popleft(self):

def sample(self, k):
return [self[i] for i in sample_n_k(len(self), k)]

def sample_with_replacement(self, k):
return [self[i] for i in np.random.randint(0, len(self), k)]
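The new sample_with_replacement complements the existing sample: it may return duplicates and allows k to exceed the queue length, which is what HindsightReplayBuffer.sample_episodes below relies on. A small usage sketch, assuming the surrounding class is chainerrl.misc.collections.RandomAccessQueue (the class name does not appear in this hunk):

from chainerrl.misc.collections import RandomAccessQueue  # assumed class name

q = RandomAccessQueue()
for i in range(3):
    q.append(i)

print(q.sample(2))                   # two distinct items; k must not exceed len(q)
print(q.sample_with_replacement(5))  # five draws, duplicates allowed, k may exceed len(q)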
1 change: 1 addition & 0 deletions chainerrl/replay_buffers/__init__.py
@@ -1,4 +1,5 @@
from chainerrl.replay_buffers.episodic import EpisodicReplayBuffer # NOQA
from chainerrl.replay_buffers.hindsight import HindsightReplayBuffer # NOQA
from chainerrl.replay_buffers.prioritized import PrioritizedReplayBuffer # NOQA
from chainerrl.replay_buffers.prioritized import PriorityWeightError # NOQA
from chainerrl.replay_buffers.prioritized_episodic import PrioritizedEpisodicReplayBuffer # NOQA
85 changes: 85 additions & 0 deletions chainerrl/replay_buffers/hindsight.py
@@ -0,0 +1,85 @@
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from builtins import * # NOQA
from future import standard_library
standard_library.install_aliases() # NOQA

import copy

import numpy as np

from chainerrl import replay_buffer
from chainerrl.replay_buffers.episodic import EpisodicReplayBuffer # NOQA


class HindsightReplayBuffer(EpisodicReplayBuffer):
"""Hindsight Replay Buffer

https://arxiv.org/abs/1707.01495

We currently do not support N-step transitions for the Hindsight Buffer.

Args:
reward_function (callable): maps an (achieved_goal, desired_goal) pair to a reward
capacity (int): Capacity of the replay buffer
future_k (int): number of future goals to sample per true sample
"""

def __init__(self, reward_function,
capacity=None,
future_k=0):
super(HindsightReplayBuffer, self).__init__(capacity)
self.reward_function = reward_function
# probability of sampling a future goal instead of a
# true goal
self.future_prob = 1.0 - 1.0/(float(future_k) + 1)

def _replace_goal(self, transition, future_transition):
transition = copy.deepcopy(transition)
future_state = future_transition['next_state']
assert future_state['achieved_goal'] is not None
new_goal = future_state['achieved_goal']
transition['state']['desired_goal'] = new_goal
transition['next_state']['desired_goal'] = new_goal
transition['reward'] = self.reward_function(
transition['next_state']['achieved_goal'],
new_goal)
return transition

def sample(self, n):
assert len(self.memory) >= n
# Select n episodes
episodes = self.sample_episodes(n)
# Select timesteps from each episode
episode_lens = np.array([len(episode) for episode in episodes])
timesteps = np.array(
[np.random.randint(episode_lens[i]) for i in range(n)])
# Select episodes for which we use a future goal instead of the true goal
do_replace = np.random.uniform(size=n) < self.future_prob
# Randomly select offsets of future goals
future_offset = np.random.uniform(size=n) * (episode_lens - timesteps)
future_offset = future_offset.astype(int)
future_times = timesteps + future_offset
batch = []
# Go through episodes
for episode, timestep, future_timestep, replace in zip(
episodes, timesteps, future_times, do_replace):
transition = episode[timestep]
if replace:
future_transition = episode[future_timestep]
transition = self._replace_goal(transition, future_transition)
batch.append([transition])
return batch

def sample_episodes(self, n_episodes, max_len=None):
episodes = self.episodic_memory.sample_with_replacement(n_episodes)
if max_len is not None:
return [replay_buffer.random_subseq(ep, max_len)
for ep in episodes]
else:
return episodes
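To make the relabelling probability concrete: with future_k=4 (the paper's "future" strategy with k=4), future_prob = 1 - 1/(4 + 1) = 0.8, so roughly four out of five sampled transitions have their desired_goal replaced by an achieved_goal from later in the same episode and their reward recomputed. A usage sketch under assumptions (the reward function and the gym.GoalEnv-style observation dicts are illustrative; the real entry point is examples/gym/train_her_gym.py):

import numpy as np

from chainerrl.replay_buffers import HindsightReplayBuffer

# Reward and observation layout below are illustrative assumptions.
rbuf = HindsightReplayBuffer(
    reward_function=lambda ag, dg: -float(not np.allclose(ag, dg)),
    capacity=10 ** 6,
    future_k=4)
assert abs(rbuf.future_prob - 0.8) < 1e-6  # future_k=4 -> relabel with probability 0.8

def obs(x, goal=1.0):
    # Dict observation with the achieved_goal / desired_goal keys sample() expects.
    return {'observation': np.float32([x]),
            'achieved_goal': np.float32([x]),
            'desired_goal': np.float32([goal])}

# Store one five-step episode, then draw a minibatch; about 80% of the sampled
# transitions will have desired_goal swapped for a later achieved_goal and the
# reward recomputed by reward_function.
for t in range(5):
    rbuf.append(state=obs(0.2 * t), action=np.float32([0.1]), reward=-1.0,
                next_state=obs(0.2 * (t + 1)))
rbuf.stop_current_episode()
minibatch = rbuf.sample(4)  # list of length-1 lists, one transition each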
2 changes: 1 addition & 1 deletion chainerrl/wrappers/vector_frame_stack.py
@@ -10,8 +10,8 @@
from gym import spaces
import numpy as np

from chainerrl.wrappers.atari_wrappers import LazyFrames
from chainerrl.env import VectorEnv
from chainerrl.wrappers.atari_wrappers import LazyFrames


class VectorEnvWrapper(VectorEnv):