From c8fe4f7f60e4c3806114c77e4460c356a0ed4877 Mon Sep 17 00:00:00 2001
From: Yuxiang Li
Date: Wed, 14 Feb 2018 15:37:29 +0000
Subject: [PATCH 1/2] Change docstring of t_max in PCL and add a new parameter max_len_replay

---
 chainerrl/agents/pcl.py        | 14 ++++++++++----
 examples/gym/train_pcl_gym.py  |  4 +++-
 tests/agents_tests/test_pcl.py | 21 +++++++++++++--------
 3 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/chainerrl/agents/pcl.py b/chainerrl/agents/pcl.py
index 0c8777606..2e83f0720 100644
--- a/chainerrl/agents/pcl.py
+++ b/chainerrl/agents/pcl.py
@@ -50,8 +50,8 @@ class PCL(agent.AttributeSavingMixin, agent.AsyncAgent):
             - action distributions (Distribution)
             - state values (chainer.Variable)
         optimizer (chainer.Optimizer): optimizer used to train the model
-        t_max (int or None): The model is updated after every t_max local
-            steps. If set None, the model is updated after every episode.
+        t_max (int): The model is updated after every t_max local
+            steps. Defaults to 1.
         gamma (float): Discount factor [0,1]
         tau (float): Weight coefficient for the entropy regularizaiton term.
         phi (callable): Feature extractor function
@@ -64,6 +64,10 @@ class PCL(agent.AttributeSavingMixin, agent.AsyncAgent):
             (batchsize x t_max).
         disable_online_update (bool): If set true, disable online on-policy
             update and rely only on experience replay.
+        max_len_replay (int or None): Maximum length of trajectories sampled
+            from the replay buffer. If set to None, there is no limit on it.
+            Refer to the behavior of AbstractEpisodicReplayBuffer for
+            more details.
         n_times_replay (int): Number of times experience replay is repeated per
             one time of online update.
         replay_start_size (int): Experience replay is disabled if the number of
@@ -95,7 +99,7 @@ class PCL(agent.AttributeSavingMixin, agent.AsyncAgent):
 
     def __init__(self, model, optimizer, replay_buffer=None,
-                 t_max=None,
+                 t_max=1,
                  gamma=0.99,
                  tau=1e-2,
                  phi=lambda x: x,
@@ -104,6 +108,7 @@ def __init__(self, model, optimizer,
                  rollout_len=10,
                  batchsize=1,
                  disable_online_update=False,
+                 max_len_replay=None,
                  n_times_replay=1,
                  replay_start_size=10 ** 2,
                  normalize_loss_by_steps=True,
@@ -148,6 +153,7 @@ def __init__(self, model, optimizer,
         self.normalize_loss_by_steps = normalize_loss_by_steps
         self.act_deterministically = act_deterministically
         self.disable_online_update = disable_online_update
+        self.max_len_replay = max_len_replay
        self.n_times_replay = n_times_replay
         self.replay_start_size = replay_start_size
         self.average_loss_decay = average_loss_decay
@@ -281,7 +287,7 @@ def update_from_replay(self):
 
         self.logger.debug('update_from_replay')
         episodes = self.replay_buffer.sample_episodes(
-            self.batchsize, max_len=self.t_max)
+            self.batchsize, max_len=self.max_len_replay)
         if isinstance(episodes, tuple):
             # Prioritized replay
             episodes, weights = episodes
diff --git a/examples/gym/train_pcl_gym.py b/examples/gym/train_pcl_gym.py
index 0ac47542c..e9385b79a 100644
--- a/examples/gym/train_pcl_gym.py
+++ b/examples/gym/train_pcl_gym.py
@@ -47,9 +47,10 @@ def main():
     parser.add_argument('--rollout-len', type=int, default=10)
     parser.add_argument('--n-hidden-channels', type=int, default=100)
     parser.add_argument('--n-hidden-layers', type=int, default=2)
+    parser.add_argument('--max-len-replay', type=int, default=None)
     parser.add_argument('--n-times-replay', type=int, default=1)
     parser.add_argument('--replay-start-size', type=int, default=10000)
-    parser.add_argument('--t-max', type=int, default=None)
+    parser.add_argument('--t-max', type=int, default=1)
     parser.add_argument('--tau', type=float, default=1e-2)
     parser.add_argument('--profile', action='store_true')
     parser.add_argument('--steps', type=int, default=8 * 10 ** 7)
@@ -160,6 +161,7 @@ def make_env(process_idx, test):
         t_max=args.t_max, gamma=0.99,
         tau=args.tau, phi=lambda x: x.astype(np.float32, copy=False),
         rollout_len=args.rollout_len,
+        max_len_replay=args.max_len_replay,
         n_times_replay=args.n_times_replay,
         replay_start_size=args.replay_start_size,
         batchsize=args.batchsize,
diff --git a/tests/agents_tests/test_pcl.py b/tests/agents_tests/test_pcl.py
index 81ff0fc19..e35adde3b 100644
--- a/tests/agents_tests/test_pcl.py
+++ b/tests/agents_tests/test_pcl.py
@@ -28,6 +28,7 @@
 @testing.parameterize(*(
     testing.product({
         't_max': [1],
+        'max_len_replay': [10],
         'use_lstm': [False],
         'episodic': [True],  # PCL doesn't work well with continuing envs
         'disable_online_update': [True, False],
@@ -36,7 +37,8 @@
         'batchsize': [1, 5],
     }) +
     testing.product({
-        't_max': [None],
+        't_max': [1],
+        'max_len_replay': [None],
         'use_lstm': [True, False],
         'episodic': [True],
         'disable_online_update': [True, False],
@@ -53,25 +55,27 @@ def setUp(self):
 
     @testing.attr.slow
     def test_abc_discrete(self):
-        self._test_abc(self.t_max, self.use_lstm, episodic=self.episodic)
+        self._test_abc(self.t_max, self.max_len_replay, self.use_lstm,
+                       episodic=self.episodic)
 
     def test_abc_discrete_fast(self):
-        self._test_abc(self.t_max, self.use_lstm, episodic=self.episodic,
-                       steps=10, require_success=False)
+        self._test_abc(self.t_max, self.max_len_replay, self.use_lstm,
+                       episodic=self.episodic, steps=10,
+                       require_success=False)
 
     @testing.attr.slow
     def test_abc_gaussian(self):
-        self._test_abc(self.t_max, self.use_lstm,
+        self._test_abc(self.t_max, self.max_len_replay, self.use_lstm,
                        discrete=False, episodic=self.episodic,
                        steps=100000)
 
     def test_abc_gaussian_fast(self):
-        self._test_abc(self.t_max, self.use_lstm,
+        self._test_abc(self.t_max, self.max_len_replay, self.use_lstm,
                        discrete=False, episodic=self.episodic, steps=10,
                        require_success=False)
 
-    def _test_abc(self, t_max, use_lstm, discrete=True, episodic=True,
-                  steps=100000, require_success=True):
+    def _test_abc(self, t_max, max_len_replay, use_lstm, discrete=True,
+                  episodic=True, steps=100000, require_success=True):
 
         nproc = 8
 
@@ -185,6 +189,7 @@ def phi(x):
                         gamma=gamma,
                         tau=tau,
                         phi=phi,
+                        max_len_replay=max_len_replay,
                         n_times_replay=1,
                         batchsize=self.batchsize,
                         train_async=self.train_async,
From b8c069feb3373ac6ed0469b83d0594f538a5b9cc Mon Sep 17 00:00:00 2001
From: Yuxiang Li
Date: Wed, 14 Feb 2018 16:08:29 +0000
Subject: [PATCH 2/2] Rename parameters of PCL

---
 chainerrl/agents/pcl.py        | 24 ++++++++++++------------
 examples/gym/train_pcl_gym.py  |  8 ++++----
 tests/agents_tests/test_pcl.py | 20 ++++++++++----------
 3 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/chainerrl/agents/pcl.py b/chainerrl/agents/pcl.py
index 2e83f0720..9828690e1 100644
--- a/chainerrl/agents/pcl.py
+++ b/chainerrl/agents/pcl.py
@@ -50,8 +50,8 @@ class PCL(agent.AttributeSavingMixin, agent.AsyncAgent):
             - action distributions (Distribution)
             - state values (chainer.Variable)
         optimizer (chainer.Optimizer): optimizer used to train the model
-        t_max (int): The model is updated after every t_max local
-            steps. Defaults to 1.
+        update_interval (int): The model is updated after every
+            update_interval local steps.
         gamma (float): Discount factor [0,1]
         tau (float): Weight coefficient for the entropy regularizaiton term.
         phi (callable): Feature extractor function
@@ -64,10 +64,10 @@ class PCL(agent.AttributeSavingMixin, agent.AsyncAgent):
             (batchsize x t_max).
         disable_online_update (bool): If set true, disable online on-policy
             update and rely only on experience replay.
-        max_len_replay (int or None): Maximum length of trajectories sampled
-            from the replay buffer. If set to None, there is no limit on it.
-            Refer to the behavior of AbstractEpisodicReplayBuffer for
-            more details.
+        t_max (int or None): Maximum length of trajectories sampled from the
+            replay buffer. If set to None, there is no limit, and complete
+            trajectories / episodes will be sampled. Refer to the behavior
+            of AbstractEpisodicReplayBuffer for more details.
         n_times_replay (int): Number of times experience replay is repeated per
             one time of online update.
         replay_start_size (int): Experience replay is disabled if the number of
@@ -99,7 +99,7 @@ class PCL(agent.AttributeSavingMixin, agent.AsyncAgent):
 
     def __init__(self, model, optimizer, replay_buffer=None,
-                 t_max=1,
+                 update_interval=1,
                  gamma=0.99,
                  tau=1e-2,
                  phi=lambda x: x,
@@ -108,7 +108,7 @@ def __init__(self, model, optimizer,
                  rollout_len=10,
                  batchsize=1,
                  disable_online_update=False,
-                 max_len_replay=None,
+                 t_max=None,
                  n_times_replay=1,
                  replay_start_size=10 ** 2,
                  normalize_loss_by_steps=True,
@@ -136,7 +136,7 @@ def __init__(self, model, optimizer,
         self.optimizer = optimizer
         self.replay_buffer = replay_buffer
-        self.t_max = t_max
+        self.update_interval = update_interval
         self.gamma = gamma
         self.tau = tau
         self.phi = phi
@@ -153,7 +153,7 @@ def __init__(self, model, optimizer,
         self.normalize_loss_by_steps = normalize_loss_by_steps
         self.act_deterministically = act_deterministically
         self.disable_online_update = disable_online_update
-        self.max_len_replay = max_len_replay
+        self.t_max = t_max
         self.n_times_replay = n_times_replay
         self.replay_start_size = replay_start_size
         self.average_loss_decay = average_loss_decay
@@ -287,7 +287,7 @@ def update_from_replay(self):
 
         self.logger.debug('update_from_replay')
         episodes = self.replay_buffer.sample_episodes(
-            self.batchsize, max_len=self.max_len_replay)
+            self.batchsize, max_len=self.t_max)
         if isinstance(episodes, tuple):
             # Prioritized replay
             episodes, weights = episodes
@@ -391,7 +391,7 @@ def act_and_train(self, obs, reward):
         if self.last_state is not None:
             self.past_rewards[self.t - 1] = reward
 
-        if self.t - self.t_start == self.t_max:
+        if self.t - self.t_start == self.update_interval:
             self.update_on_policy(statevar)
             if len(self.online_batch_losses) == 0:
                 for _ in range(self.n_times_replay):
diff --git a/examples/gym/train_pcl_gym.py b/examples/gym/train_pcl_gym.py
index e9385b79a..a6301722e 100644
--- a/examples/gym/train_pcl_gym.py
+++ b/examples/gym/train_pcl_gym.py
@@ -47,10 +47,10 @@ def main():
     parser.add_argument('--rollout-len', type=int, default=10)
     parser.add_argument('--n-hidden-channels', type=int, default=100)
     parser.add_argument('--n-hidden-layers', type=int, default=2)
-    parser.add_argument('--max-len-replay', type=int, default=None)
     parser.add_argument('--n-times-replay', type=int, default=1)
     parser.add_argument('--replay-start-size', type=int, default=10000)
-    parser.add_argument('--t-max', type=int, default=1)
+    parser.add_argument('--update-interval', type=int, default=1)
+    parser.add_argument('--t-max', type=int, default=None)
     parser.add_argument('--tau', type=float, default=1e-2)
     parser.add_argument('--profile', action='store_true')
     parser.add_argument('--steps', type=int, default=8 * 10 ** 7)
@@ -157,11 +157,11 @@ def make_env(process_idx, test):
     agent = chainerrl.agents.PCL(
         model, opt, replay_buffer=replay_buffer,
-        t_max=args.t_max, gamma=0.99,
+        update_interval=args.update_interval, gamma=0.99,
         tau=args.tau, phi=lambda x: x.astype(np.float32, copy=False),
         rollout_len=args.rollout_len,
-        max_len_replay=args.max_len_replay,
+        t_max=args.t_max,
         n_times_replay=args.n_times_replay,
         replay_start_size=args.replay_start_size,
         batchsize=args.batchsize,
diff --git a/tests/agents_tests/test_pcl.py b/tests/agents_tests/test_pcl.py
index e35adde3b..84b56c1ca 100644
--- a/tests/agents_tests/test_pcl.py
+++ b/tests/agents_tests/test_pcl.py
@@ -27,8 +27,8 @@
 
 @testing.parameterize(*(
     testing.product({
-        't_max': [1],
-        'max_len_replay': [10],
+        'update_interval': [1],
+        't_max': [10],
         'use_lstm': [False],
         'episodic': [True],  # PCL doesn't work well with continuing envs
         'disable_online_update': [True, False],
@@ -37,8 +37,8 @@
         'batchsize': [1, 5],
     }) +
     testing.product({
-        't_max': [1],
-        'max_len_replay': [None],
+        'update_interval': [1],
+        't_max': [None],
         'use_lstm': [True, False],
         'episodic': [True],
         'disable_online_update': [True, False],
@@ -55,26 +55,26 @@ def setUp(self):
 
     @testing.attr.slow
     def test_abc_discrete(self):
-        self._test_abc(self.t_max, self.max_len_replay, self.use_lstm,
+        self._test_abc(self.t_max, self.update_interval, self.use_lstm,
                        episodic=self.episodic)
 
     def test_abc_discrete_fast(self):
-        self._test_abc(self.t_max, self.max_len_replay, self.use_lstm,
+        self._test_abc(self.t_max, self.update_interval, self.use_lstm,
                        episodic=self.episodic, steps=10,
                        require_success=False)
 
     @testing.attr.slow
     def test_abc_gaussian(self):
-        self._test_abc(self.t_max, self.max_len_replay, self.use_lstm,
+        self._test_abc(self.t_max, self.update_interval, self.use_lstm,
                        discrete=False, episodic=self.episodic,
                        steps=100000)
 
     def test_abc_gaussian_fast(self):
-        self._test_abc(self.t_max, self.max_len_replay, self.use_lstm,
+        self._test_abc(self.t_max, self.update_interval, self.use_lstm,
                        discrete=False, episodic=self.episodic, steps=10,
                        require_success=False)
 
-    def _test_abc(self, t_max, max_len_replay, use_lstm, discrete=True,
+    def _test_abc(self, t_max, update_interval, use_lstm, discrete=True,
                   episodic=True, steps=100000, require_success=True):
 
         nproc = 8
 
@@ -186,10 +186,10 @@ def phi(x):
         agent = pcl.PCL(model, opt,
                         replay_buffer=replay_buffer,
                         t_max=t_max,
+                        update_interval=update_interval,
                         gamma=gamma,
                         tau=tau,
                         phi=phi,
-                        max_len_replay=max_len_replay,
                         n_times_replay=1,
                         batchsize=self.batchsize,
                         train_async=self.train_async,
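
Usage sketch: after this series, update_interval controls how often the on-policy update runs, while t_max only limits how much of an episode is drawn from the replay buffer. The snippet below mirrors the call in examples/gym/train_pcl_gym.py, assuming a model, optimizer, and episodic replay buffer have already been built as in that script (those three objects are placeholders here); the keyword values shown are the defaults visible in this diff.

    import chainerrl

    # Placeholders: construct these as in examples/gym/train_pcl_gym.py
    # (a model returning an action distribution and a state value, a Chainer
    # optimizer set up for that model, and an episodic replay buffer).
    model = ...
    opt = ...
    replay_buffer = ...

    agent = chainerrl.agents.PCL(
        model, opt, replay_buffer=replay_buffer,
        update_interval=1,   # run the on-policy update after every local step
        t_max=None,          # None: sample complete episodes from the buffer
        gamma=0.99,
        tau=1e-2,
        rollout_len=10,
        n_times_replay=1,
        replay_start_size=10000,
        batchsize=1,
    )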