diff --git a/.gitignore b/.gitignore index aa1c430c9..a601f95e9 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,5 @@ rlkit/launchers/conf_private.py MANIFEST *.egg-info \.idea/ +.pytest_cache +.coverage diff --git a/README.md b/README.md index 4fd665194..50752c785 100644 --- a/README.md +++ b/README.md @@ -214,6 +214,13 @@ This requires some knowledge of AWS and/or GCP, which is beyond the scope of this README. To learn more about `doodad`, [go to the repository](https://github.com/vitchyr/doodad/), which is based on [this original repository](https://github.com/justinjfu/doodad/). +### Testing +A set of basic regression tests can be run with: +``` +nose2 -v -B -s tests/regression/basic +``` +Other directories in `tests/regression` have project-specific tests. + # Requests for pull-requests - Implement policy-gradient algorithms. - Implement model-based algorithms. diff --git a/examples/__init__.py b/examples/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/awac/__init__.py b/examples/awac/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/awac/hand/__init__.py b/examples/awac/hand/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/awac/hand/awac1.py b/examples/awac/hand/awac1.py index ee0252730..456c60103 100644 --- a/examples/awac/hand/awac1.py +++ b/examples/awac/hand/awac1.py @@ -3,19 +3,22 @@ import rlkit.util.hyperparameter as hyp from rlkit.launchers.launcher_util import run_experiment +from rlkit.launchers.arglauncher import run_variants from rlkit.torch.sac.policies import GaussianPolicy from rlkit.torch.networks import Clamp -if __name__ == "__main__": +def main(): variant = dict( - num_epochs=501, - num_eval_steps_per_epoch=1000, - num_trains_per_train_loop=1000, - num_expl_steps_per_train_loop=1000, - min_num_steps_before_training=1000, + algo_kwargs=dict( + num_epochs=501, + num_eval_steps_per_epoch=1000, + num_trains_per_train_loop=1000, + num_expl_steps_per_train_loop=1000, + min_num_steps_before_training=1000, + batch_size=1024, + ), max_path_length=1000, - batch_size=1024, algorithm="AWAC", replay_buffer_size=int(1E6), @@ -99,35 +102,11 @@ search_space, default_parameters=variant, ) - # n_seeds = 1 - # mode = 'local' - # exp_prefix = 'dev-{}'.format( - # __file__.replace('/', '-').replace('_', '-').split('.')[0] - # ) - - n_seeds = 3 - mode = 'ec2' - exp_prefix = 'hand-awac1' + variants = [] + for variant in sweeper.iterate_hyperparameters(): + variants.append(variant) - for exp_id, variant in enumerate(sweeper.iterate_hyperparameters()): - for _ in range(n_seeds): - run_experiment( - experiment, - exp_prefix=exp_prefix, - mode=mode, - variant=variant, - use_gpu=True, - snapshot_gap=200, - snapshot_mode='gap_and_last', - num_exps_per_instance=3, - gcp_kwargs=dict( - zone='us-west1-b', - ), + run_variants(experiment, variants, process_args) - ) - - # variants = [] - # for variant in sweeper.iterate_hyperparameters(): - # variants.append(variant) - - # run_variants(experiment, variants, process_args) +if __name__ == "__main__": + main() diff --git a/examples/awac/mujoco/__init__.py b/examples/awac/mujoco/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/awac/mujoco/awac1.py b/examples/awac/mujoco/awac1.py index 1fac9db13..ca13221fa 100644 --- a/examples/awac/mujoco/awac1.py +++ b/examples/awac/mujoco/awac1.py @@ -3,19 +3,22 @@ import rlkit.util.hyperparameter as hyp from rlkit.launchers.launcher_util import run_experiment +from 
rlkit.launchers.arglauncher import run_variants from rlkit.torch.sac.policies import GaussianPolicy from rlkit.torch.networks import Clamp -if __name__ == "__main__": +def main(): variant = dict( - num_epochs=501, - num_eval_steps_per_epoch=1000, - num_trains_per_train_loop=1000, - num_expl_steps_per_train_loop=1000, - min_num_steps_before_training=1000, + algo_kwargs=dict( + num_epochs=501, + num_eval_steps_per_epoch=1000, + num_trains_per_train_loop=1000, + num_expl_steps_per_train_loop=1000, + min_num_steps_before_training=1000, + batch_size=1024, + ), max_path_length=1000, - batch_size=1024, replay_buffer_size=int(1E6), layer_size=256, num_layers=2, @@ -87,35 +90,11 @@ search_space, default_parameters=variant, ) - n_seeds = 1 - mode = 'local' - exp_prefix = 'dev-{}'.format( - __file__.replace('/', '-').replace('_', '-').split('.')[0] - ) - - # n_seeds = 3 - # mode = 'gcp' - # exp_prefix = 'skew-fit-pickup-reference-post-refactor' + variants = [] + for variant in sweeper.iterate_hyperparameters(): + variants.append(variant) - for exp_id, variant in enumerate(sweeper.iterate_hyperparameters()): - for _ in range(n_seeds): - run_experiment( - experiment, - exp_prefix=exp_prefix, - mode=mode, - variant=variant, - use_gpu=use_gpu, - snapshot_gap=200, - snapshot_mode='gap_and_last', - num_exps_per_instance=3, - gcp_kwargs=dict( - zone='us-west1-b', - ), + run_variants(experiment, variants, process_args) - ) - - # variants = [] - # for variant in sweeper.iterate_hyperparameters(): - # variants.append(variant) - - # run_variants(experiment, variants, process_args) +if __name__ == "__main__": + main() diff --git a/examples/ddpg.py b/examples/ddpg.py index c9c59a1db..a44a8d443 100644 --- a/examples/ddpg.py +++ b/examples/ddpg.py @@ -18,6 +18,9 @@ import rlkit.torch.pytorch_util as ptu from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm +import random +import torch +import numpy as np def experiment(variant): eval_env = NormalizedBoxEnv(HalfCheetahEnv()) @@ -25,6 +28,14 @@ def experiment(variant): # Or for a specific version: # import gym # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1')) + + seed = variant["seed"] + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + eval_env.seed(seed) + expl_env.seed(seed) + obs_dim = eval_env.observation_space.low.size action_dim = eval_env.action_space.low.size qf = ConcatMlp( @@ -65,34 +76,36 @@ def experiment(variant): algorithm.to(ptu.device) algorithm.train() +variant = dict( + algorithm_kwargs=dict( + num_epochs=1000, + num_eval_steps_per_epoch=1000, + num_trains_per_train_loop=1000, + num_expl_steps_per_train_loop=1000, + min_num_steps_before_training=10000, + max_path_length=1000, + batch_size=128, + ), + trainer_kwargs=dict( + use_soft_update=True, + tau=1e-2, + discount=0.99, + qf_learning_rate=1e-3, + policy_learning_rate=1e-4, + ), + qf_kwargs=dict( + hidden_sizes=[400, 300], + ), + policy_kwargs=dict( + hidden_sizes=[400, 300], + ), + replay_buffer_size=int(1E6), + seed=random.randint(0, 100000), +) -if __name__ == "__main__": - # noinspection PyTypeChecker - variant = dict( - algorithm_kwargs=dict( - num_epochs=1000, - num_eval_steps_per_epoch=1000, - num_trains_per_train_loop=1000, - num_expl_steps_per_train_loop=1000, - min_num_steps_before_training=10000, - max_path_length=1000, - batch_size=128, - ), - trainer_kwargs=dict( - use_soft_update=True, - tau=1e-2, - discount=0.99, - qf_learning_rate=1e-3, - policy_learning_rate=1e-4, - ), - qf_kwargs=dict( - hidden_sizes=[400, 300], - ), - policy_kwargs=dict( - 
hidden_sizes=[400, 300], - ), - replay_buffer_size=int(1E6), - ) - # ptu.set_gpu_mode(True) # optionally set the GPU (default=False) +def main(): setup_logger('name-of-experiment', variant=variant) experiment(variant) + +if __name__ == "__main__": + main() diff --git a/examples/dqn_and_double_dqn.py b/examples/dqn_and_double_dqn.py index 2ef275d6f..03dbabe09 100644 --- a/examples/dqn_and_double_dqn.py +++ b/examples/dqn_and_double_dqn.py @@ -17,6 +17,9 @@ from rlkit.samplers.data_collector import MdpPathCollector from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm +import random +import torch +import numpy as np def experiment(variant): expl_env = gym.make('CartPole-v0').env @@ -24,6 +27,13 @@ def experiment(variant): obs_dim = expl_env.observation_space.low.size action_dim = eval_env.action_space.n + seed = variant["seed"] + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + eval_env.seed(seed) + expl_env.seed(seed) + qf = Mlp( hidden_sizes=[32, 32], input_size=obs_dim, @@ -70,28 +80,31 @@ def experiment(variant): algorithm.to(ptu.device) algorithm.train() +variant = dict( + algorithm="DQN", + version="normal", + layer_size=256, + replay_buffer_size=int(1E6), + algorithm_kwargs=dict( + num_epochs=3000, + num_eval_steps_per_epoch=5000, + num_trains_per_train_loop=1000, + num_expl_steps_per_train_loop=1000, + min_num_steps_before_training=1000, + max_path_length=1000, + batch_size=256, + ), + trainer_kwargs=dict( + discount=0.99, + learning_rate=3E-4, + ), + seed=random.randint(0, 100000), +) -if __name__ == "__main__": - # noinspection PyTypeChecker - variant = dict( - algorithm="DQN", - version="normal", - layer_size=256, - replay_buffer_size=int(1E6), - algorithm_kwargs=dict( - num_epochs=3000, - num_eval_steps_per_epoch=5000, - num_trains_per_train_loop=1000, - num_expl_steps_per_train_loop=1000, - min_num_steps_before_training=1000, - max_path_length=1000, - batch_size=256, - ), - trainer_kwargs=dict( - discount=0.99, - learning_rate=3E-4, - ), - ) +def main(): setup_logger('dqn-CartPole', variant=variant) # ptu.set_gpu_mode(True) # optionally set the GPU (default=False) experiment(variant) + +if __name__ == "__main__": + main() diff --git a/examples/her/__init__.py b/examples/her/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/her/her_dqn_gridworld.py b/examples/her/her_dqn_gridworld.py index 68b77cd19..89e073d43 100644 --- a/examples/her/her_dqn_gridworld.py +++ b/examples/her/her_dqn_gridworld.py @@ -27,11 +27,21 @@ "https://github.com/vitchyr/multiworld.") raise e +import random +import torch +import numpy as np def experiment(variant): expl_env = gym.make('GoalGridworld-v0') eval_env = gym.make('GoalGridworld-v0') + seed = variant["seed"] + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + eval_env.seed(seed) + expl_env.seed(seed) + obs_dim = expl_env.observation_space.spaces['observation'].low.size goal_dim = expl_env.observation_space.spaces['desired_goal'].low.size action_dim = expl_env.action_space.n @@ -91,25 +101,30 @@ def experiment(variant): algorithm.train() -if __name__ == "__main__": - variant = dict( - algo_kwargs=dict( - num_epochs=100, - max_path_length=50, - num_eval_steps_per_epoch=1000, - num_expl_steps_per_train_loop=1000, - num_trains_per_train_loop=1000, - min_num_steps_before_training=1000, - batch_size=128, - ), - trainer_kwargs=dict( - discount=0.99, - ), - replay_buffer_kwargs=dict( - max_size=100000, - fraction_goals_rollout_goals=0.2, # equal to k = 4 in HER paper - 
fraction_goals_env_goals=0.0, - ), - ) +variant = dict( + algo_kwargs=dict( + num_epochs=100, + max_path_length=50, + num_eval_steps_per_epoch=1000, + num_expl_steps_per_train_loop=1000, + num_trains_per_train_loop=1000, + min_num_steps_before_training=1000, + batch_size=128, + ), + trainer_kwargs=dict( + discount=0.99, + ), + replay_buffer_kwargs=dict( + max_size=100000, + fraction_goals_rollout_goals=0.2, # equal to k = 4 in HER paper + fraction_goals_env_goals=0.0, + ), + seed=random.randint(0, 100000), +) + +def main(): setup_logger('her-dqn-gridworld-experiment', variant=variant) experiment(variant) + +if __name__ == "__main__": + main() diff --git a/examples/her/her_sac_gym_fetch_reach.py b/examples/her/her_sac_gym_fetch_reach.py index 96f6356e6..06dc446cf 100644 --- a/examples/her/her_sac_gym_fetch_reach.py +++ b/examples/her/her_sac_gym_fetch_reach.py @@ -10,12 +10,22 @@ from rlkit.torch.sac.sac import SACTrainer from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm +import random +import torch +import numpy as np def experiment(variant): # unwrap the TimeLimitEnv wrapper since we manually terminate after 50 steps eval_env = gym.make('FetchReach-v1').env expl_env = gym.make('FetchReach-v1').env + seed = variant["seed"] + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + eval_env.seed(seed) + expl_env.seed(seed) + observation_key = 'observation' desired_goal_key = 'desired_goal' @@ -90,40 +100,44 @@ def experiment(variant): algorithm.to(ptu.device) algorithm.train() +variant = dict( + algorithm='HER-SAC', + version='normal', + algo_kwargs=dict( + batch_size=128, + num_epochs=100, + num_eval_steps_per_epoch=5000, + num_expl_steps_per_train_loop=1000, + num_trains_per_train_loop=1000, + min_num_steps_before_training=1000, + max_path_length=50, + ), + sac_trainer_kwargs=dict( + discount=0.99, + soft_target_tau=5e-3, + target_update_period=1, + policy_lr=3E-4, + qf_lr=3E-4, + reward_scale=1, + use_automatic_entropy_tuning=True, + ), + replay_buffer_kwargs=dict( + max_size=int(1E6), + fraction_goals_rollout_goals=0.2, # equal to k = 4 in HER paper + fraction_goals_env_goals=0, + ), + qf_kwargs=dict( + hidden_sizes=[400, 300], + ), + policy_kwargs=dict( + hidden_sizes=[400, 300], + ), + seed=random.randint(0, 100000), +) -if __name__ == "__main__": - variant = dict( - algorithm='HER-SAC', - version='normal', - algo_kwargs=dict( - batch_size=128, - num_epochs=100, - num_eval_steps_per_epoch=5000, - num_expl_steps_per_train_loop=1000, - num_trains_per_train_loop=1000, - min_num_steps_before_training=1000, - max_path_length=50, - ), - sac_trainer_kwargs=dict( - discount=0.99, - soft_target_tau=5e-3, - target_update_period=1, - policy_lr=3E-4, - qf_lr=3E-4, - reward_scale=1, - use_automatic_entropy_tuning=True, - ), - replay_buffer_kwargs=dict( - max_size=int(1E6), - fraction_goals_rollout_goals=0.2, # equal to k = 4 in HER paper - fraction_goals_env_goals=0, - ), - qf_kwargs=dict( - hidden_sizes=[400, 300], - ), - policy_kwargs=dict( - hidden_sizes=[400, 300], - ), - ) +def main(): setup_logger('her-sac-fetch-experiment', variant=variant) experiment(variant) + +if __name__ == "__main__": + main() diff --git a/examples/her/her_td3_multiworld_sawyer_reach.py b/examples/her/her_td3_multiworld_sawyer_reach.py index fff1924f8..8edb13c37 100644 --- a/examples/her/her_td3_multiworld_sawyer_reach.py +++ b/examples/her/her_td3_multiworld_sawyer_reach.py @@ -22,12 +22,23 @@ from rlkit.torch.td3.td3 import TD3Trainer from rlkit.torch.torch_rl_algorithm import 
TorchBatchRLAlgorithm +import random +import torch +import numpy as np def experiment(variant): import multiworld multiworld.register_all_envs() eval_env = gym.make('SawyerReachXYZEnv-v0') expl_env = gym.make('SawyerReachXYZEnv-v0') + + seed = variant["seed"] + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + eval_env.seed(seed) + expl_env.seed(seed) + observation_key = 'state_observation' desired_goal_key = 'state_desired_goal' achieved_goal_key = desired_goal_key.replace("desired", "achieved") @@ -116,30 +127,35 @@ def experiment(variant): algorithm.train() -if __name__ == "__main__": - variant = dict( - algo_kwargs=dict( - num_epochs=100, - max_path_length=50, - batch_size=128, - num_eval_steps_per_epoch=1000, - num_expl_steps_per_train_loop=1000, - num_trains_per_train_loop=1000, - ), - trainer_kwargs=dict( - discount=0.99, - ), - replay_buffer_kwargs=dict( - max_size=100000, - fraction_goals_rollout_goals=0.2, - fraction_goals_env_goals=0.0, - ), - qf_kwargs=dict( - hidden_sizes=[400, 300], - ), - policy_kwargs=dict( - hidden_sizes=[400, 300], - ), - ) +variant = dict( + algo_kwargs=dict( + num_epochs=100, + max_path_length=50, + batch_size=128, + num_eval_steps_per_epoch=1000, + num_expl_steps_per_train_loop=1000, + num_trains_per_train_loop=1000, + ), + trainer_kwargs=dict( + discount=0.99, + ), + replay_buffer_kwargs=dict( + max_size=100000, + fraction_goals_rollout_goals=0.2, + fraction_goals_env_goals=0.0, + ), + qf_kwargs=dict( + hidden_sizes=[400, 300], + ), + policy_kwargs=dict( + hidden_sizes=[400, 300], + ), + seed=random.randint(0, 100000), +) + +def main(): setup_logger('her-td3-sawyer-experiment', variant=variant) experiment(variant) + +if __name__ == "__main__": + main() diff --git a/examples/sac.py b/examples/sac.py index 0d863964c..95cb6a900 100644 --- a/examples/sac.py +++ b/examples/sac.py @@ -10,6 +10,9 @@ from rlkit.torch.networks import ConcatMlp from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm +import random +import torch +import numpy as np def experiment(variant): expl_env = NormalizedBoxEnv(HalfCheetahEnv()) @@ -17,6 +20,13 @@ def experiment(variant): obs_dim = expl_env.observation_space.low.size action_dim = eval_env.action_space.low.size + seed = variant["seed"] + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + eval_env.seed(seed) + expl_env.seed(seed) + M = variant['layer_size'] qf1 = ConcatMlp( input_size=obs_dim + action_dim, @@ -77,35 +87,36 @@ def experiment(variant): algorithm.to(ptu.device) algorithm.train() +variant = dict( + algorithm="SAC", + version="normal", + layer_size=256, + replay_buffer_size=int(1E6), + algorithm_kwargs=dict( + num_epochs=3000, + num_eval_steps_per_epoch=5000, + num_trains_per_train_loop=1000, + num_expl_steps_per_train_loop=1000, + min_num_steps_before_training=1000, + max_path_length=1000, + batch_size=256, + ), + trainer_kwargs=dict( + discount=0.99, + soft_target_tau=5e-3, + target_update_period=1, + policy_lr=3E-4, + qf_lr=3E-4, + reward_scale=1, + use_automatic_entropy_tuning=True, + ), + seed=random.randint(0, 100000), +) - - -if __name__ == "__main__": - # noinspection PyTypeChecker - variant = dict( - algorithm="SAC", - version="normal", - layer_size=256, - replay_buffer_size=int(1E6), - algorithm_kwargs=dict( - num_epochs=3000, - num_eval_steps_per_epoch=5000, - num_trains_per_train_loop=1000, - num_expl_steps_per_train_loop=1000, - min_num_steps_before_training=1000, - max_path_length=1000, - batch_size=256, - ), - trainer_kwargs=dict( - discount=0.99, - 
soft_target_tau=5e-3, - target_update_period=1, - policy_lr=3E-4, - qf_lr=3E-4, - reward_scale=1, - use_automatic_entropy_tuning=True, - ), - ) +def main(): setup_logger('name-of-experiment', variant=variant) # ptu.set_gpu_mode(True) # optionally set the GPU (default=False) experiment(variant) + +if __name__ == "__main__": + main() diff --git a/examples/td3.py b/examples/td3.py index b86fa506f..aa0178318 100644 --- a/examples/td3.py +++ b/examples/td3.py @@ -20,10 +20,21 @@ from rlkit.torch.td3.td3 import TD3Trainer from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm +import random +import torch +import numpy as np def experiment(variant): expl_env = NormalizedBoxEnv(HalfCheetahEnv()) eval_env = NormalizedBoxEnv(HalfCheetahEnv()) + + seed = variant["seed"] + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + eval_env.seed(seed) + expl_env.seed(seed) + obs_dim = expl_env.observation_space.low.size action_dim = expl_env.action_space.low.size qf1 = ConcatMlp( @@ -98,29 +109,33 @@ def experiment(variant): algorithm.to(ptu.device) algorithm.train() +variant = dict( + algorithm_kwargs=dict( + num_epochs=3000, + num_eval_steps_per_epoch=5000, + num_trains_per_train_loop=1000, + num_expl_steps_per_train_loop=1000, + min_num_steps_before_training=1000, + max_path_length=1000, + batch_size=256, + ), + trainer_kwargs=dict( + discount=0.99, + ), + qf_kwargs=dict( + hidden_sizes=[400, 300], + ), + policy_kwargs=dict( + hidden_sizes=[400, 300], + ), + replay_buffer_size=int(1E6), + seed=random.randint(0, 100000), +) -if __name__ == "__main__": - variant = dict( - algorithm_kwargs=dict( - num_epochs=3000, - num_eval_steps_per_epoch=5000, - num_trains_per_train_loop=1000, - num_expl_steps_per_train_loop=1000, - min_num_steps_before_training=1000, - max_path_length=1000, - batch_size=256, - ), - trainer_kwargs=dict( - discount=0.99, - ), - qf_kwargs=dict( - hidden_sizes=[400, 300], - ), - policy_kwargs=dict( - hidden_sizes=[400, 300], - ), - replay_buffer_size=int(1E6), - ) +def main(): # ptu.set_gpu_mode(True) # optionally set the GPU (default=False) setup_logger('rlkit-post-refactor-td3-half-cheetah', variant=variant) experiment(variant) + +if __name__ == "__main__": + main() diff --git a/rlkit/core/__init__.py b/rlkit/core/__init__.py index b3ae95dba..2458ec66c 100644 --- a/rlkit/core/__init__.py +++ b/rlkit/core/__init__.py @@ -1,7 +1,6 @@ """ General classes, functions, utilities that are used throughout rlkit. 
""" -from rlkit.core.logging import logger - -__all__ = ['logger'] +from rlkit.core.logging import logger, setup_logger_custom +__all__ = ["logger", "setup_logger_custom"] diff --git a/rlkit/core/batch_rl_algorithm.py b/rlkit/core/batch_rl_algorithm.py index 848c5eb80..30d18ee44 100644 --- a/rlkit/core/batch_rl_algorithm.py +++ b/rlkit/core/batch_rl_algorithm.py @@ -1,84 +1,80 @@ -import abc +from collections import OrderedDict -import gtimer as gt -from rlkit.core.rl_algorithm import BaseRLAlgorithm -from rlkit.data_management.replay_buffer import ReplayBuffer -from rlkit.samplers.data_collector import PathCollector +from rlkit.core.timer import timer +from rlkit.core import logger, eval_util +from rlkit.data_management.replay_buffer import ReplayBuffer +from rlkit.samplers.data_collector.path_collector import PathCollector +from rlkit.core.rl_algorithm import BaseRLAlgorithm -class BatchRLAlgorithm(BaseRLAlgorithm, metaclass=abc.ABCMeta): +class BatchRLAlgorithm(BaseRLAlgorithm): def __init__( self, - trainer, - exploration_env, - evaluation_env, - exploration_data_collector: PathCollector, - evaluation_data_collector: PathCollector, - replay_buffer: ReplayBuffer, batch_size, max_path_length, - num_epochs, num_eval_steps_per_epoch, num_expl_steps_per_train_loop, num_trains_per_train_loop, num_train_loops_per_epoch=1, min_num_steps_before_training=0, + offline_rl=False, + *args, + **kwargs ): - super().__init__( - trainer, - exploration_env, - evaluation_env, - exploration_data_collector, - evaluation_data_collector, - replay_buffer, - ) + + super().__init__(*args, **kwargs) self.batch_size = batch_size self.max_path_length = max_path_length - self.num_epochs = num_epochs self.num_eval_steps_per_epoch = num_eval_steps_per_epoch self.num_trains_per_train_loop = num_trains_per_train_loop self.num_train_loops_per_epoch = num_train_loops_per_epoch self.num_expl_steps_per_train_loop = num_expl_steps_per_train_loop self.min_num_steps_before_training = min_num_steps_before_training + self.offline_rl = offline_rl def _train(self): - if self.min_num_steps_before_training > 0: + done = (self.epoch == self.num_epochs) + if done: + return OrderedDict(), done + + if self.epoch == 0 and self.min_num_steps_before_training > 0: init_expl_paths = self.expl_data_collector.collect_new_paths( self.max_path_length, self.min_num_steps_before_training, discard_incomplete_paths=False, ) - self.replay_buffer.add_paths(init_expl_paths) + if not self.offline_rl: + self.replay_buffer.add_paths(init_expl_paths) self.expl_data_collector.end_epoch(-1) - for epoch in gt.timed_for( - range(self._start_epoch, self.num_epochs), - save_itrs=True, - ): + timer.start_timer('evaluation sampling') + if self.epoch % self._eval_epoch_freq == 0: self.eval_data_collector.collect_new_paths( self.max_path_length, self.num_eval_steps_per_epoch, discard_incomplete_paths=True, ) - gt.stamp('evaluation sampling') + timer.stop_timer('evaluation sampling') + if not self._eval_only: for _ in range(self.num_train_loops_per_epoch): + timer.start_timer('exploration sampling', unique=False) new_expl_paths = self.expl_data_collector.collect_new_paths( self.max_path_length, self.num_expl_steps_per_train_loop, discard_incomplete_paths=False, ) - gt.stamp('exploration sampling', unique=False) + timer.stop_timer('exploration sampling') - self.replay_buffer.add_paths(new_expl_paths) - gt.stamp('data storing', unique=False) + timer.start_timer('replay buffer data storing', unique=False) + if not self.offline_rl: + 
self.replay_buffer.add_paths(new_expl_paths) + timer.stop_timer('replay buffer data storing') - self.training_mode(True) + timer.start_timer('training', unique=False) for _ in range(self.num_trains_per_train_loop): - train_data = self.replay_buffer.random_batch( - self.batch_size) + train_data = self.replay_buffer.random_batch(self.batch_size) self.trainer.train(train_data) - gt.stamp('training', unique=False) - self.training_mode(False) - - self._end_epoch(epoch) + timer.stop_timer('training') + log_stats = self._get_diagnostics() + return log_stats, False diff --git a/rlkit/core/eval_util.py b/rlkit/core/eval_util.py index d232b387a..f01111bea 100644 --- a/rlkit/core/eval_util.py +++ b/rlkit/core/eval_util.py @@ -3,11 +3,58 @@ """ from collections import OrderedDict +from typing import Dict, List from numbers import Number +import rlkit.pythonplusplus as ppp import numpy as np -import rlkit.pythonplusplus as ppp + +def create_stats_ordered_dict( + name, + data, + stat_prefix=None, + always_show_all_stats=True, + exclude_max_min=False, +): + if stat_prefix is not None: + name = "{}{}".format(stat_prefix, name) + if isinstance(data, Number): + return OrderedDict({name: data}) + + if len(data) == 0: + return OrderedDict() + + if isinstance(data, tuple): + ordered_dict = OrderedDict() + for number, d in enumerate(data): + sub_dict = create_stats_ordered_dict( + "{0}_{1}".format(name, number), + d, + ) + ordered_dict.update(sub_dict) + return ordered_dict + + if isinstance(data, list): + try: + iter(data[0]) + except TypeError: + pass + else: + data = np.concatenate(data) + + if (isinstance(data, np.ndarray) and data.size == 1 + and not always_show_all_stats): + return OrderedDict({name: float(data)}) + + stats = OrderedDict([ + (name + ' Mean', np.mean(data)), + (name + ' Std', np.std(data)), + ]) + if not exclude_max_min: + stats[name + ' Max'] = np.max(data) + stats[name + ' Min'] = np.min(data) + return stats def get_generic_path_information(paths, stat_prefix=''): @@ -15,6 +62,8 @@ def get_generic_path_information(paths, stat_prefix=''): Get an OrderedDict with a bunch of statistic names and values. 
""" statistics = OrderedDict() + if len(paths) == 0: + return statistics returns = [sum(path["rewards"]) for path in paths] rewards = np.vstack([path["rewards"] for path in paths]) @@ -30,16 +79,21 @@ def get_generic_path_information(paths, stat_prefix=''): statistics.update(create_stats_ordered_dict( 'Actions', actions, stat_prefix=stat_prefix )) - statistics['Num Paths'] = len(paths) - statistics[stat_prefix + 'Average Returns'] = get_average_returns(paths) + statistics[stat_prefix + 'Num Paths'] = len(paths) + statistics[stat_prefix + 'Average Returns'] = get_average_returns(paths) for info_key in ['env_infos', 'agent_infos']: if info_key in paths[0]: all_env_infos = [ - ppp.list_of_dicts__to__dict_of_lists(p[info_key]) + ppp.list_of_dicts__to__dict_of_lists(p[info_key], enforce_consistent_keys=False) for p in paths ] for k in all_env_infos[0].keys(): + if not ( + isinstance(all_env_infos[0][k], Number) + or isinstance(all_env_infos[0][k], np.ndarray) + ): + continue final_ks = np.array([info[k][-1] for info in all_env_infos]) first_ks = np.array([info[k][0] for info in all_env_infos]) all_ks = np.concatenate([info[k] for info in all_env_infos]) @@ -67,48 +121,42 @@ def get_average_returns(paths): return np.mean(returns) -def create_stats_ordered_dict( - name, - data, - stat_prefix=None, - always_show_all_stats=True, - exclude_max_min=False, -): - if stat_prefix is not None: - name = "{}{}".format(stat_prefix, name) - if isinstance(data, Number): - return OrderedDict({name: data}) +def get_path_lengths(paths): + return [len(path['observations']) for path in paths] - if len(data) == 0: - return OrderedDict() - if isinstance(data, tuple): - ordered_dict = OrderedDict() - for number, d in enumerate(data): - sub_dict = create_stats_ordered_dict( - "{0}_{1}".format(name, number), - d, - ) - ordered_dict.update(sub_dict) - return ordered_dict +def get_stat_in_paths(paths, dict_name, scalar_name): + if len(paths) == 0: + return np.array([[]]) - if isinstance(data, list): - try: - iter(data[0]) - except TypeError: - pass - else: - data = np.concatenate(data) + if type(paths[0][dict_name]) == dict: + # Support rllab interface + return [path[dict_name][scalar_name] for path in paths] - if (isinstance(data, np.ndarray) and data.size == 1 - and not always_show_all_stats): - return OrderedDict({name: float(data)}) + return [ + [info[scalar_name] for info in path[dict_name]] + for path in paths + ] - stats = OrderedDict([ - (name + ' Mean', np.mean(data)), - (name + ' Std', np.std(data)), - ]) - if not exclude_max_min: - stats[name + ' Max'] = np.max(data) - stats[name + ' Min'] = np.min(data) - return stats + +def diagnostics_from_paths_statistics(stat_to_lists: Dict[str, List[List]]): + statistics = OrderedDict() + for stat_name, stat_list in stat_to_lists.items(): + statistics.update(create_stats_ordered_dict( + stat_name, + stat_list, + always_show_all_stats=True, + )) + statistics.update(create_stats_ordered_dict( + '{}/final'.format(stat_name), + [s[-1:] for s in stat_list], + always_show_all_stats=True, + exclude_max_min=True, + )) + statistics.update(create_stats_ordered_dict( + '{}/initial'.format(stat_name), + [s[:1] for s in stat_list], + always_show_all_stats=True, + exclude_max_min=True, + )) + return statistics diff --git a/rlkit/core/logging.py b/rlkit/core/logging.py index ca5e0d2cf..69ca1b79a 100644 --- a/rlkit/core/logging.py +++ b/rlkit/core/logging.py @@ -3,22 +3,33 @@ https://github.com/rll/rllab """ -from enum import Enum -from contextlib import contextmanager -import numpy as 
np -import os -import os.path as osp -import sys -import datetime -import dateutil.tz import csv +import datetime +import errno +import joblib import json +import os +import os.path as osp import pickle -import errno +import sys import torch +from collections import OrderedDict +from contextlib import contextmanager +from enum import Enum + +import dateutil.tz +import numpy as np +import uuid from rlkit.core.tabulate import tabulate -from collections import OrderedDict +from rlkit import pythonplusplus as ppp + + +def reopen(f): + f.close() + return open(f.name, 'a') + def add_prefix(log_dict: OrderedDict, prefix: str, divider=''): with_prefix = OrderedDict() @@ -27,9 +38,9 @@ def add_prefix(log_dict: OrderedDict, prefix: str, divider=''): return with_prefix -def append_log(log_dict, to_add_dict, prefix=None): +def append_log(log_dict, to_add_dict, prefix=None, divider=''): if prefix is not None: - to_add_dict = add_prefix(to_add_dict, prefix=prefix) + to_add_dict = add_prefix(to_add_dict, prefix=prefix, divider=divider) return log_dict.update(to_add_dict) @@ -82,6 +93,7 @@ def mkdir_p(path): class Logger(object): def __init__(self): + self.reopen_files_on_flush = False # useful for Azure blobfuse self._prefixes = [] self._prefix_str = '' @@ -89,10 +101,10 @@ def __init__(self): self._tabular_prefix_str = '' self._tabular = [] - self._tabular_keys = {} self._text_outputs = [] self._tabular_outputs = [] + self._tabular_keys = None self._text_fds = {} self._tabular_fds = {} @@ -106,6 +118,11 @@ def __init__(self): self._header_printed = False self.table_printer = TerminalTablePrinter() + self._use_tensorboard = False + self.epoch = 0 + + self._save_param_mode = 'torch' + def reset(self): self.__init__() @@ -120,6 +137,7 @@ def _remove_output(self, file_name, arr, fds): fds[file_name].close() del fds[file_name] arr.remove(file_name) + self._tabular_keys = None def push_prefix(self, prefix): self._prefixes.append(prefix) @@ -129,6 +147,11 @@ def add_text_output(self, file_name): self._add_output(file_name, self._text_outputs, self._text_fds, mode='a') + def add_tensorboard_output(self, file_name): + import tensorboard_logger + self._use_tensorboard = True + self.tensorboard_logger = tensorboard_logger.Logger(file_name) + def remove_text_output(self, file_name): self._remove_output(file_name, self._text_outputs, self._text_fds) @@ -137,7 +160,6 @@ def add_tabular_output(self, file_name, relative_to_snapshot_dir=False): file_name = osp.join(self._snapshot_dir, file_name) self._add_output(file_name, self._tabular_outputs, self._tabular_fds, mode='w') - self._tabular_keys[file_name] = None def remove_tabular_output(self, file_name, relative_to_snapshot_dir=False): if relative_to_snapshot_dir: @@ -164,6 +186,13 @@ def get_snapshot_gap(self, ): def set_snapshot_gap(self, gap): self._snapshot_gap = gap + def get_save_param_mode(self, ): + return self._save_param_mode + + def set_save_param_mode(self, mode): + assert mode in ['pickle', 'torch', 'joblib'] + self._save_param_mode = mode + def set_log_tabular_only(self, log_tabular_only): self._log_tabular_only = log_tabular_only @@ -184,10 +213,16 @@ def log(self, s, with_prefix=True, with_timestamp=True): for fd in list(self._text_fds.values()): fd.write(out + '\n') fd.flush() + if self.reopen_files_on_flush: + self._text_fds = { + k: reopen(fd) for k, fd in self._text_fds.items() + } sys.stdout.flush() def record_tabular(self, key, val): self._tabular.append((self._tabular_prefix_str + str(key), str(val))) + if self._use_tensorboard: 
+ self.tensorboard_logger.log_value(self._tabular_prefix_str + str(key), val, self.epoch) def record_dict(self, d, prefix=None): if prefix is not None: @@ -205,24 +240,14 @@ def pop_tabular_prefix(self, ): del self._tabular_prefixes[-1] self._tabular_prefix_str = ''.join(self._tabular_prefixes) - def save_extra_data(self, data, file_name='extra_data.pkl', mode='joblib'): + def save_extra_data(self, data, file_name='extra_data', mode='joblib'): """ Data saved here will always override the last entry :param data: Something pickle'able. """ file_name = osp.join(self._snapshot_dir, file_name) - if mode == 'joblib': - import joblib - joblib.dump(data, file_name, compress=3) - elif mode == 'pickle': - pickle.dump(data, open(file_name, "wb")) - elif mode == 'cloudpickle': - import cloudpickle - full_filename = file_name + ".cpkl" - cloudpickle.dump(data, open(full_filename, "wb")) - else: - raise ValueError("Invalid mode: {}".format(mode)) + self._save_params_to_file(data, file_name, mode=mode) return file_name def get_table_dict(self, ): @@ -271,7 +296,9 @@ def record_tabular_misc_stat(self, key, values, placement='back'): self.record_tabular(prefix + "Max" + suffix, np.nan) def dump_tabular(self, *args, **kwargs): + self.epoch += 1 wh = kwargs.pop("write_header", None) + if len(self._tabular) > 0: if self._log_tabular_only: self.table_printer.print_tabular(self._tabular) @@ -279,24 +306,16 @@ def dump_tabular(self, *args, **kwargs): for line in tabulate(self._tabular).split('\n'): self.log(line, *args, **kwargs) tabular_dict = dict(self._tabular) - # Also write to the csv files - for filename, tabular_fd in list(self._tabular_fds.items()): - # Only saves keys in first iteration to CSV! - # (But every key is printed out in text) - itr0_keys = self._tabular_keys.get(filename) - if itr0_keys is None: - itr0_keys = list(sorted(tabular_dict.keys())) - self._tabular_keys[filename] = itr0_keys - else: - prev_keys = set(itr0_keys) - curr_keys = set(tabular_dict.keys()) - if curr_keys != prev_keys: - print("Warning: CSV key mismatch") - print("extra keys in 0th iter", prev_keys - curr_keys) - print("extra keys in cur iter", curr_keys - prev_keys) + # Only saves keys in first iteration to CSV! 
+ # (But every key is printed out in text) + if self._tabular_keys is None: + self._tabular_keys = list(sorted(tabular_dict.keys())) + + # Write to the csv files + for tabular_fd in list(self._tabular_fds.values()): writer = csv.DictWriter(tabular_fd, - fieldnames=itr0_keys, + fieldnames=self._tabular_keys, extrasaction="ignore",) if wh or ( wh is None and tabular_fd not in self._tabular_header_written): @@ -304,36 +323,239 @@ def dump_tabular(self, *args, **kwargs): self._tabular_header_written.add(tabular_fd) writer.writerow(tabular_dict) tabular_fd.flush() + if self.reopen_files_on_flush: + new_tabular_fds = {} + for k, fd in self._tabular_fds.items(): + new_fd = reopen(fd) + new_tabular_fds[k] = new_fd + if fd in self._tabular_header_written: + self._tabular_header_written.remove(fd) + self._tabular_header_written.add(new_fd) + self._tabular_fds = new_tabular_fds del self._tabular[:] def pop_prefix(self, ): del self._prefixes[-1] self._prefix_str = ''.join(self._prefixes) + def _save_params_to_file(self, params, file_name, mode): + if mode == 'joblib': + joblib.dump(params, file_name + ".pkl", compress=3) + elif mode == 'pickle': + pickle.dump(params, open(file_name + ".pkl", "wb")) + elif mode == 'cloudpickle': + import cloudpickle + cloudpickle.dump(params, open(file_name + ".cpkl", "wb")) + elif mode == 'torch': + torch.save(params, file_name + ".pt") + elif mode == 'txt': + with open(file_name + ".txt", "w") as f: + f.write(params) + else: + raise ValueError("Invalid mode: {}".format(mode)) + def save_itr_params(self, itr, params): if self._snapshot_dir: if self._snapshot_mode == 'all': - file_name = osp.join(self._snapshot_dir, 'itr_%d.pkl' % itr) - torch.save(params, file_name) + file_name = osp.join(self._snapshot_dir, 'itr_%d' % itr) + self._save_params_to_file(params, file_name, mode=self._save_param_mode) elif self._snapshot_mode == 'last': # override previous params - file_name = osp.join(self._snapshot_dir, 'params.pkl') - torch.save(params, file_name) + file_name = osp.join(self._snapshot_dir, 'params') + self._save_params_to_file(params, file_name, mode=self._save_param_mode) elif self._snapshot_mode == "gap": if itr % self._snapshot_gap == 0: - file_name = osp.join(self._snapshot_dir, 'itr_%d.pkl' % itr) - torch.save(params, file_name) + file_name = osp.join(self._snapshot_dir, 'itr_%d' % itr) + self._save_params_to_file(params, file_name, mode=self._save_param_mode) elif self._snapshot_mode == "gap_and_last": if itr % self._snapshot_gap == 0: - file_name = osp.join(self._snapshot_dir, 'itr_%d.pkl' % itr) - torch.save(params, file_name) - file_name = osp.join(self._snapshot_dir, 'params.pkl') - torch.save(params, file_name) + file_name = osp.join(self._snapshot_dir, 'itr_%d' % itr) + self._save_params_to_file(params, file_name, mode=self._save_param_mode) + file_name = osp.join(self._snapshot_dir, 'params') + self._save_params_to_file(params, file_name, mode=self._save_param_mode) elif self._snapshot_mode == 'none': pass else: raise NotImplementedError -logger = Logger() +def setup_logger_custom( + logger, + exp_name, + base_log_dir, + variant=None, + text_log_file="debug.log", + variant_log_file="variant.json", + tabular_log_file="progress.csv", + snapshot_mode="last", + snapshot_gap=1, + log_tabular_only=False, + log_dir=None, + tensorboard=False, + unique_id=None, + git_infos=None, + script_name=None, + run_id=None, + first_time=True, + reopen_files_on_flush=False, + **create_log_dir_kwargs +): + """ + A more configurable logger setup 
function. + Will save log output to + base_log_dir/exp_name/exp_name. + exp_name will be auto-generated to be unique. + If log_dir is specified, then that directory is used as the output dir. + :param exp_name: The sub-directory for this specific experiment. + :param variant: + :param base_log_dir: The directory where all log should be saved. + :param text_log_file: + :param variant_log_file: + :param tabular_log_file: + :param snapshot_mode: + :param log_tabular_only: + :param snapshot_gap: + :param log_dir: + :return: + """ + logger.reset() + variant = variant or {} + unique_id = unique_id or str(uuid.uuid4()) + + if log_dir is None: + log_dir = create_log_dir( + exp_name=exp_name, + base_log_dir=base_log_dir, + variant=variant, + run_id=run_id, + **create_log_dir_kwargs + ) + + if tensorboard: + tensorboard_log_path = osp.join(log_dir, "tensorboard") + logger.add_tensorboard_output(tensorboard_log_path) + + logger.log("Variant:") + variant_to_save = variant.copy() + variant_to_save['unique_id'] = unique_id + variant_to_save['exp_name'] = exp_name + variant_to_save['trial_name'] = log_dir.split('/')[-1] + logger.log( + json.dumps(ppp.dict_to_safe_json(variant_to_save, sort=True), indent=2) + ) + variant_log_path = osp.join(log_dir, variant_log_file) + logger.log_variant(variant_log_path, variant_to_save) + + tabular_log_path = osp.join(log_dir, tabular_log_file) + text_log_path = osp.join(log_dir, text_log_file) + + logger.add_text_output(text_log_path) + if first_time: + logger.add_tabular_output(tabular_log_path) + else: + logger._add_output(tabular_log_path, logger._tabular_outputs, + logger._tabular_fds, mode='a') + for tabular_fd in logger._tabular_fds: + logger._tabular_header_written.add(tabular_fd) + logger.set_snapshot_dir(log_dir) + logger.set_snapshot_mode(snapshot_mode) + logger.set_snapshot_gap(snapshot_gap) + logger.set_log_tabular_only(log_tabular_only) + logger.reopen_files_on_flush = reopen_files_on_flush + exp_name = log_dir.split("/")[-1] + logger.push_prefix("[%s] " % exp_name) + + if git_infos: + save_git_infos(git_infos, log_dir) + if script_name: + with open(osp.join(log_dir, "script_name.txt"), "w") as f: + f.write(script_name) + return log_dir + + +def save_git_infos(git_infos, log_dir): + for ( + directory, code_diff, code_diff_staged, commit_hash, branch_name + ) in git_infos: + if directory[-1] == '/': + diff_file_name = directory[1:-1].replace("/", "-") + ".patch" + diff_staged_file_name = ( + directory[1:-1].replace("/", "-") + "_staged.patch" + ) + else: + diff_file_name = directory[1:].replace("/", "-") + ".patch" + diff_staged_file_name = ( + directory[1:].replace("/", "-") + "_staged.patch" + ) + if code_diff is not None and len(code_diff) > 0: + with open(osp.join(log_dir, diff_file_name), "w") as f: + f.write(code_diff + '\n') + if code_diff_staged is not None and len(code_diff_staged) > 0: + with open(osp.join(log_dir, diff_staged_file_name), "w") as f: + f.write(code_diff_staged + '\n') + with open(osp.join(log_dir, "git_infos.txt"), "a") as f: + f.write("directory: {}".format(directory)) + f.write('\n') + f.write("git hash: {}".format(commit_hash)) + f.write('\n') + f.write("git branch name: {}".format(branch_name)) + f.write('\n\n') + + +def create_log_dir( + exp_name, + base_log_dir, + exp_id=0, + seed=0, + variant=None, + trial_dir_suffix=None, + add_time_suffix=True, + include_exp_name_sub_dir=True, + run_id=None, +): + """ + Creates and returns a unique log directory. 
+ :param exp_name: All experiments with this prefix will have log + directories be under this directory. + :param exp_id: Different exp_ids will be in different directories. + :return: + """ + if run_id is not None: + exp_id = variant["exp_id"] + if variant.get("num_exps_per_instance", 0) > 1: + now = datetime.datetime.now(dateutil.tz.tzlocal()) + timestamp = now.strftime('%Y_%m_%d_%H_%M_%S') + trial_name = "run%s/id%s/%s--s%d" % (run_id, exp_id, timestamp, seed) + else: + trial_name = "run{}/id{}".format(run_id, exp_id) + else: + trial_name = create_trial_name(exp_name, exp_id=exp_id, seed=seed, add_time_suffix=add_time_suffix) + if trial_dir_suffix is not None: + trial_name = "{}-{}".format(trial_name, trial_dir_suffix) + if include_exp_name_sub_dir: + log_dir = osp.join(base_log_dir, exp_name.replace("_", "-"), trial_name) + else: + log_dir = osp.join(base_log_dir, trial_name) + if osp.exists(log_dir): + print("WARNING: Log directory already exists {}".format(log_dir)) + os.makedirs(log_dir, exist_ok=True) + return log_dir + + +def create_trial_name(exp_name, exp_id=0, seed=0, add_time_suffix=True): + """ + Create a semi-unique experiment name that has a timestamp + :param exp_name: + :param exp_id: + :return: + """ + now = datetime.datetime.now(dateutil.tz.tzlocal()) + timestamp = now.strftime('%Y_%m_%d_%H_%M_%S') + if add_time_suffix: + return "%s_%s_id%03d--s%d" % (exp_name, timestamp, exp_id, seed) + else: + return "%s_id%03d--s%d" % (exp_name, exp_id, seed) + +logger = Logger() diff --git a/rlkit/core/rl_algorithm.py b/rlkit/core/rl_algorithm.py index 284ef46fd..84e7bf3f5 100644 --- a/rlkit/core/rl_algorithm.py +++ b/rlkit/core/rl_algorithm.py @@ -1,23 +1,22 @@ import abc from collections import OrderedDict -import gtimer as gt +from rlkit.core.timer import timer from rlkit.core import logger, eval_util +from rlkit.core.logging import append_log from rlkit.data_management.replay_buffer import ReplayBuffer from rlkit.samplers.data_collector import DataCollector def _get_epoch_timings(): - times_itrs = gt.get_times().stamps.itrs + times_itrs = timer.get_times() times = OrderedDict() epoch_time = 0 for key in sorted(times_itrs): - time = times_itrs[key][-1] + time = times_itrs[key] epoch_time += time times['time/{} (s)'.format(key)] = time - times['time/epoch (s)'] = epoch_time - times['time/total (s)'] = gt.get_times().total return times @@ -30,6 +29,16 @@ def __init__( exploration_data_collector: DataCollector, evaluation_data_collector: DataCollector, replay_buffer: ReplayBuffer, + num_epochs, + exploration_get_diagnostic_functions=None, + evaluation_get_diagnostic_functions=None, + eval_epoch_freq=1, + eval_only=False, + save_algorithm=False, + save_replay_buffer=False, + save_logger=False, + save_extra_manual_epoch_list=(), + keep_only_last_extra=True, ): self.trainer = trainer self.expl_env = exploration_env @@ -38,12 +47,47 @@ def __init__( self.eval_data_collector = evaluation_data_collector self.replay_buffer = replay_buffer self._start_epoch = 0 - + self.post_train_funcs = [] self.post_epoch_funcs = [] - - def train(self, start_epoch=0): - self._start_epoch = start_epoch - self._train() + self.epoch = self._start_epoch + self.num_epochs = num_epochs + self.save_algorithm = save_algorithm + self.save_replay_buffer = save_replay_buffer + self.save_extra_manual_epoch_list = save_extra_manual_epoch_list + self.save_logger = save_logger + self.keep_only_last_extra = keep_only_last_extra + if exploration_get_diagnostic_functions is None: + exploration_get_diagnostic_functions = [ 
+ eval_util.get_generic_path_information, + ] + if hasattr(self.expl_env, 'get_diagnostics'): + exploration_get_diagnostic_functions.append( + self.expl_env.get_diagnostics) + if evaluation_get_diagnostic_functions is None: + evaluation_get_diagnostic_functions = [ + eval_util.get_generic_path_information, + ] + if hasattr(self.eval_env, 'get_diagnostics'): + evaluation_get_diagnostic_functions.append( + self.eval_env.get_diagnostics) + self._eval_get_diag_fns = evaluation_get_diagnostic_functions + self._expl_get_diag_fns = exploration_get_diagnostic_functions + + self._eval_epoch_freq = eval_epoch_freq + self._eval_only = eval_only + + def train(self): + timer.return_global_times = True + for _ in range(self.epoch, self.num_epochs): + self._begin_epoch() + timer.start_timer('saving') + logger.save_itr_params(self.epoch, self._get_snapshot()) + timer.stop_timer('saving') + log_dict, _ = self._train() + logger.record_dict(log_dict) + logger.dump_tabular(with_prefix=True, with_timestamp=False) + self._end_epoch() + logger.save_itr_params(self.epoch, self._get_snapshot()) def _train(self): """ @@ -51,19 +95,41 @@ def _train(self): """ raise NotImplementedError('_train must implemented by inherited class') - def _end_epoch(self, epoch): - snapshot = self._get_snapshot() - logger.save_itr_params(epoch, snapshot) - gt.stamp('saving') - self._log_stats(epoch) + def _begin_epoch(self): + timer.reset() + + def _end_epoch(self): + for post_train_func in self.post_train_funcs: + post_train_func(self, self.epoch) - self.expl_data_collector.end_epoch(epoch) - self.eval_data_collector.end_epoch(epoch) - self.replay_buffer.end_epoch(epoch) - self.trainer.end_epoch(epoch) + self.expl_data_collector.end_epoch(self.epoch) + self.eval_data_collector.end_epoch(self.epoch) + self.replay_buffer.end_epoch(self.epoch) + self.trainer.end_epoch(self.epoch) for post_epoch_func in self.post_epoch_funcs: - post_epoch_func(self, epoch) + post_epoch_func(self, self.epoch) + + if self.epoch in self.save_extra_manual_epoch_list: + if self.keep_only_last_extra: + file_name = 'extra_snapshot' + info_lines = [ + 'extra_snapshot_itr = {}'.format(self.epoch), + 'snapshot_dir = {}'.format(logger.get_snapshot_dir()) + ] + logger.save_extra_data( + '\n'.join(info_lines), + file_name='snapshot_info', + mode='txt', + ) + else: + file_name = 'extra_snapshot_itr{}'.format(self.epoch) + logger.save_extra_data( + self.get_extra_data_to_save(self.epoch), + file_name=file_name, + mode='cloudpickle', + ) + self.epoch += 1 def _get_snapshot(self): snapshot = {} @@ -77,64 +143,61 @@ def _get_snapshot(self): snapshot['replay_buffer/' + k] = v return snapshot - def _log_stats(self, epoch): - logger.log("Epoch {} finished".format(epoch), with_timestamp=True) - - """ - Replay Buffer - """ - logger.record_dict( - self.replay_buffer.get_diagnostics(), - prefix='replay_buffer/' - ) - - """ - Trainer - """ - logger.record_dict(self.trainer.get_diagnostics(), prefix='trainer/') - + def get_extra_data_to_save(self, epoch): """ - Exploration + Save things that shouldn't be saved every snapshot but rather + overwritten every time. 
+ :param epoch: + :return: """ - logger.record_dict( - self.expl_data_collector.get_diagnostics(), - prefix='exploration/' + data_to_save = dict( + epoch=epoch, ) + if self.save_replay_buffer: + data_to_save['replay_buffer'] = self.replay_buffer + if self.save_algorithm: + data_to_save['algorithm'] = self + if self.save_logger: + data_to_save['logger'] = logger + return data_to_save + + def _get_diagnostics(self): + timer.start_timer('logging', unique=False) + algo_log = OrderedDict() + append_log(algo_log, self.replay_buffer.get_diagnostics(), + prefix='replay_buffer/') + append_log(algo_log, self.trainer.get_diagnostics(), prefix='trainer/') + # Exploration + append_log(algo_log, self.expl_data_collector.get_diagnostics(), + prefix='expl/') expl_paths = self.expl_data_collector.get_epoch_paths() - if hasattr(self.expl_env, 'get_diagnostics'): - logger.record_dict( - self.expl_env.get_diagnostics(expl_paths), - prefix='exploration/', - ) - logger.record_dict( - eval_util.get_generic_path_information(expl_paths), - prefix="exploration/", - ) - """ - Evaluation - """ - logger.record_dict( - self.eval_data_collector.get_diagnostics(), - prefix='evaluation/', - ) - eval_paths = self.eval_data_collector.get_epoch_paths() - if hasattr(self.eval_env, 'get_diagnostics'): - logger.record_dict( - self.eval_env.get_diagnostics(eval_paths), - prefix='evaluation/', - ) - logger.record_dict( - eval_util.get_generic_path_information(eval_paths), - prefix="evaluation/", - ) - - """ - Misc - """ - gt.stamp('logging') - logger.record_dict(_get_epoch_timings()) - logger.record_tabular('Epoch', epoch) - logger.dump_tabular(with_prefix=False, with_timestamp=False) + for fn in self._expl_get_diag_fns: + append_log(algo_log, fn(expl_paths), prefix='expl/') + # Eval + if self.epoch % self._eval_epoch_freq == 0: + self._prev_eval_log = OrderedDict() + eval_diag = self.eval_data_collector.get_diagnostics() + self._prev_eval_log.update(eval_diag) + append_log(algo_log, eval_diag, prefix='eval/') + eval_paths = self.eval_data_collector.get_epoch_paths() + for fn in self._eval_get_diag_fns: + addl_diag = fn(eval_paths) + self._prev_eval_log.update(addl_diag) + append_log(algo_log, addl_diag, prefix='eval/') + else: + append_log(algo_log, self._prev_eval_log, prefix='eval/') + + append_log(algo_log, _get_epoch_timings()) + algo_log['epoch'] = self.epoch + try: + import os + import psutil + process = psutil.Process(os.getpid()) + algo_log['RAM Usage (Mb)'] = int(process.memory_info().rss / 1000000) + except ImportError: + pass + timer.stop_timer('logging') + return algo_log @abc.abstractmethod def training_mode(self, mode): diff --git a/rlkit/envs/proxy_env.py b/rlkit/envs/proxy_env.py index bb9a51a7a..032afd236 100644 --- a/rlkit/envs/proxy_env.py +++ b/rlkit/envs/proxy_env.py @@ -28,6 +28,9 @@ def terminate(self): if hasattr(self.wrapped_env, "terminate"): self.wrapped_env.terminate() + def seed(self, _seed): + return self.wrapped_env.seed(_seed) + def __getattr__(self, attr): if attr == '_wrapped_env': raise AttributeError() @@ -47,4 +50,4 @@ def __setstate__(self, state): self.__dict__.update(state) def __str__(self): - return '{}({})'.format(type(self).__name__, self.wrapped_env) \ No newline at end of file + return '{}({})'.format(type(self).__name__, self.wrapped_env) diff --git a/rlkit/envs/wrappers.py b/rlkit/envs/wrappers.py index ba83fe95c..3ba11b792 100644 --- a/rlkit/envs/wrappers.py +++ b/rlkit/envs/wrappers.py @@ -34,6 +34,9 @@ def terminate(self): if hasattr(self.wrapped_env, "terminate"): 
self.wrapped_env.terminate() + def seed(self, _seed): + return self.wrapped_env.seed(_seed) + def __getattr__(self, attr): if attr == '_wrapped_env': raise AttributeError() diff --git a/rlkit/launchers/arglauncher.py b/rlkit/launchers/arglauncher.py new file mode 100644 index 000000000..0f69bc97c --- /dev/null +++ b/rlkit/launchers/arglauncher.py @@ -0,0 +1,203 @@ +"""Wraps launcher_util to make launching experiments one step easier - Ashvin +- Names experiments based on the running filename +- Adds some modes like --1 to run only one variant of a set for testing +- Control the GPU used and other experiment attributes through command line args +""" + +from rlkit.launchers import launcher_util as lu +import argparse # TODO: migrate to argparse if necessary +import sys +from multiprocessing import Process, Pool +import pdb +import random + +def run_variants(experiment, vs, process_args_fn=None, run_id=0, ): + # preprocess + variants = [] + for i, v in enumerate(vs): + v["exp_id"] = i + v["run_id"] = i + process_run_args(v) + process_logger_args(v, run_id=run_id) + process_launcher_args(v) + if process_args_fn: + process_args_fn(v) + variants.append(v) + + if "--variants" in sys.argv: # takes either 3-7 or 3,6,7,8,10 as next arg + i = sys.argv.index("--variants") + val = sys.argv[i+1] + ids = [] + if "," in val: + ids = map(int, val.split(',')) + elif "-" in val: + start, end = map(int, val.split('-')) + ids = range(start, end) + else: + ids = [int(val), ] + new_variants = [] + for v in variants: + if v["exp_id"] in ids: + new_variants.append(v) + variants = new_variants + if "--1" in sys.argv: # only run the first experiment for testing + variants = variants[:1] + + print("Running", len(variants), "variants") + + # run + parallel = variants[0].get("parallel", False) + if parallel: + parallel_run(experiment, variants, parallel) + else: + for variant in variants: + run_variant(experiment, variant) + + print("Running", len(variants), "variants") + +def run_variant(experiment, variant): + launcher_config = variant.pop("launcher_config") + lu.run_experiment( + experiment, + variant=variant, + **launcher_config, + ) + +def parallel_run(experiment, variants, n_p): + i = 0 + # import pdb; pdb.set_trace() + while i < len(variants): + prs = [] + for p in range(n_p): + if i < len(variants): + v = variants[i] + v["gpu_id"] = v["gpus"][p] + pr = Process(target=run_variant, args=(experiment, v)) + prs.append(pr) + pr.start() + i += 1 + for pr in prs: + pr.join() + +def process_run_args(variant): + if "--sync" in sys.argv: + variant["sync"] = True + if "--nosync" in sys.argv: + variant["sync"] = False + + if "--render" in sys.argv: + variant["render"] = True + if "algo_kwargs" in variant: + if "base_kwargs" in variant["algo_kwargs"]: + variant["algo_kwargs"]["base_kwargs"]["render"] = True + if "--norender" in sys.argv: + variant["render"] = False + if "--debug" in sys.argv: + variant["debug"] = True + + if "--parallel" in sys.argv: + i = sys.argv.index("--parallel") + parallel = int(sys.argv[i+1]) + variant["parallel"] = parallel + if "--gpus" in sys.argv: + i = sys.argv.index("--gpus") + variant["gpus"] = list(map(int, sys.argv[i+1].split(","))) + variant["use_gpu"] = True + else: + variant["gpus"] = list(range(parallel)) + + +def process_logger_args(variant, run_id=None): + logger_config = variant.setdefault("logger_config", dict()) + + logger_config["snapshot_mode"] = logger_config.get("snapshot_mode", "gap") + logger_config["snapshot_gap"] = logger_config.get("snapshot_gap", 100) + if "--snapshot" in 
sys.argv: + logger_config["snapshot_mode"] = 'gap_and_last' + logger_config["snapshot_gap"] = 20 + elif "--nosnapshot" in sys.argv: + logger_config["snapshot_mode"] = 'none' + logger_config["snapshot_gap"] = 1 + + if "--run" in sys.argv: + i = sys.argv.index("--run") + logger_config["run_id"] = int(sys.argv[i+1]) + variant["run_id"] = int(sys.argv[i+1]) + else: + logger_config["run_id"] = run_id + + +def process_launcher_args(variant): + launcher_config = variant.setdefault("launcher_config", dict()) + + # launcher_config.setdefault("gpu_id", 0) + launcher_config.setdefault("prepend_date_to_exp_prefix", False) + launcher_config.setdefault("region", "us-west-2") + launcher_config.setdefault("time_in_mins", None) + launcher_config.setdefault("ssh_host", None) + # launcher_config.setdefault("slurm_config_name", None) + launcher_config.setdefault("unpack_variant", False) + # launcher_config.setdefault("s3_log_prefix", "") + + if "--seed" in sys.argv: + i = sys.argv.index("--seed") + launcher_config["seed"] = int(sys.argv[i+1]) + + if "--ec2" in sys.argv: + launcher_config["mode"] = "ec2" + if "--local" in sys.argv: + launcher_config["mode"] = "here_no_doodad" + if "--localdocker" in sys.argv: + launcher_config["mode"] = "local_docker" + if "--sss" in sys.argv: + launcher_config["mode"] = "sss" + if "--singularity" in sys.argv: + launcher_config["mode"] = "local_singularity" + if "--slurm" in sys.argv: + launcher_config["mode"] = "slurm" + if "--ss" in sys.argv: + launcher_config["mode"] = "slurm_singularity" + if "--htp" in sys.argv: + launcher_config["mode"] = "htp" + if "--ssh" in sys.argv: + launcher_config["mode"] = "ssh" + i = sys.argv.index("--ssh") + launcher_config["ssh_host"] = sys.argv[i+1] + + if "--slurmconfig" in sys.argv: + i = sys.argv.index("--slurmconfig") + launcher_config["slurm_config_name"] = sys.argv[i+1] + if "--lowprio" in sys.argv: + launcher_config["slurm_config_name"] = "gpu_lowprio" + + if "--verbose" in sys.argv: + launcher_config["verbose"] = True + + if "--gpu_id" in sys.argv: + i = sys.argv.index("--gpu_id") + launcher_config["gpu_id"] = int(sys.argv[i+1]) + launcher_config["use_gpu"] = True + if "--gpu" in sys.argv: + launcher_config["use_gpu"] = True + if "use_gpu" in launcher_config and launcher_config["use_gpu"]: + if "instance_type" not in launcher_config: + launcher_config["instance_type"] = "g3.4xlarge" + if "--time" in sys.argv: + i = sys.argv.index("--time") + launcher_config["time_in_mins"] = int(sys.argv[i+1]) + + if "instance_type" not in launcher_config: + launcher_config["instance_type"] = "c4.xlarge" + if "use_gpu" not in launcher_config: + launcher_config["use_gpu"] = None + + if "base_log_dir" not in launcher_config: + launcher_config["base_log_dir"] = None + if "--mac" in sys.argv: + launcher_config["base_log_dir"] = "/Users/ashvin/data/s3doodad/" + + if "exp_prefix" not in launcher_config: + launcher_config["exp_prefix"] = sys.argv[0][:-3] + diff --git a/rlkit/launchers/experiments/awac/awac_rl.py b/rlkit/launchers/experiments/awac/awac_rl.py index 8b381006c..41f0a862e 100644 --- a/rlkit/launchers/experiments/awac/awac_rl.py +++ b/rlkit/launchers/experiments/awac/awac_rl.py @@ -157,12 +157,14 @@ def resume(variant): def process_args(variant): if variant.get("debug", False): variant['max_path_length'] = 50 - variant['batch_size'] = 5 - variant['num_epochs'] = 5 - # variant['num_eval_steps_per_epoch'] = 100 - # variant['num_expl_steps_per_train_loop'] = 100 - 
variant['num_trains_per_train_loop'] = 10 - # variant['min_num_steps_before_training'] = 100 + variant.setdefault('algo_kwargs', {}).update(dict( + batch_size=5, + num_epochs=5, + num_eval_steps_per_epoch=100, + num_expl_steps_per_train_loop=100, + num_trains_per_train_loop=10, + min_num_steps_before_training=10, + )) variant['trainer_kwargs']['bc_num_pretrain_steps'] = min(10, variant['trainer_kwargs'].get('bc_num_pretrain_steps', 0)) variant['trainer_kwargs']['q_num_pretrain1_steps'] = min(10, variant['trainer_kwargs'].get('q_num_pretrain1_steps', 0)) variant['trainer_kwargs']['q_num_pretrain2_steps'] = min(10, variant['trainer_kwargs'].get('q_num_pretrain2_steps', 0)) @@ -328,12 +330,7 @@ def experiment(variant): evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, max_path_length=variant['max_path_length'], - batch_size=variant['batch_size'], - num_epochs=variant['num_epochs'], - num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'], - num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'], - num_trains_per_train_loop=variant['num_trains_per_train_loop'], - min_num_steps_before_training=variant['min_num_steps_before_training'], + **variant['algo_kwargs'] ) else: expl_path_collector = MdpPathCollector( @@ -348,12 +345,7 @@ evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, max_path_length=variant['max_path_length'], - batch_size=variant['batch_size'], - num_epochs=variant['num_epochs'], - num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'], - num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'], - num_trains_per_train_loop=variant['num_trains_per_train_loop'], - min_num_steps_before_training=variant['min_num_steps_before_training'], + **variant['algo_kwargs'] ) algorithm.to(ptu.device) diff --git a/rlkit/launchers/launcher_util.py b/rlkit/launchers/launcher_util.py index ce330cb2f..62377740d 100644 --- a/rlkit/launchers/launcher_util.py +++ b/rlkit/launchers/launcher_util.py @@ -17,6 +17,8 @@ from rlkit.torch.pytorch_util import set_gpu_mode import rlkit.pythonplusplus as ppp +import torch + GitInfo = namedtuple( 'GitInfo', [ @@ -353,6 +355,7 @@ def set_seed(seed): seed = int(seed) random.seed(seed) np.random.seed(seed) + torch.manual_seed(seed) def reset_execution_environment(): diff --git a/rlkit/pythonplusplus.py b/rlkit/pythonplusplus.py index d73c2291c..61d3379f2 100644 --- a/rlkit/pythonplusplus.py +++ b/rlkit/pythonplusplus.py @@ -1,7 +1,7 @@ """ General purpose Python functions. -TODO(vitchyr): I should probably move this to its own module +TODO(vitchyr): probably move this to its own module, not under railrl """ import math import sys @@ -96,7 +96,7 @@ def nested_dict_to_dot_map_dict(d, parent_key=''): def merge_recursive_dicts(a, b, path=None, - ignore_duplicate_keys_in_second_dict=False): + ignore_duplicate_keys_in_second_dict=False, print_info=True): """ Merge two dicts that may have nested dicts.
""" @@ -105,13 +105,16 @@ def merge_recursive_dicts(a, b, path=None, if key in a: if isinstance(a[key], dict) and isinstance(b[key], dict): merge_recursive_dicts(a[key], b[key], path + [str(key)], - ignore_duplicate_keys_in_second_dict=ignore_duplicate_keys_in_second_dict) + ignore_duplicate_keys_in_second_dict=ignore_duplicate_keys_in_second_dict, + print_info=print_info) elif a[key] == b[key]: - print("Same value for key: {}".format(key)) + if print_info: + print("Same value for key: {}".format(key)) else: duplicate_key = '.'.join(path + [str(key)]) if ignore_duplicate_keys_in_second_dict: - print("duplicate key ignored: {}".format(duplicate_key)) + if print_info: + print("duplicate key ignored: {}".format(duplicate_key)) else: raise Exception( 'Duplicate keys at {}'.format(duplicate_key) @@ -144,7 +147,7 @@ def dict_of_list__to__list_of_dicts(dict, n_items): return new_dicts -def list_of_dicts__to__dict_of_lists(lst): +def list_of_dicts__to__dict_of_lists(lst, enforce_consistent_keys=True): """ ``` x = [ @@ -162,7 +165,10 @@ def list_of_dicts__to__dict_of_lists(lst): keys = lst[0].keys() output_dict = collections.defaultdict(list) for d in lst: - assert set(d.keys()) == set(keys), (d.keys(), keys) + if set(d.keys()) != set(keys): + print("dropping some keys", d.keys()) + if enforce_consistent_keys: + assert set(d.keys()) == set(keys) for k in keys: output_dict[k].append(d[k]) return output_dict @@ -232,6 +238,38 @@ def recursive_items(dictionary): yield from recursive_items(value) +def recursive_string_replace(x, original, new): + if isinstance(x, dict): + return {k: recursive_string_replace(v, original, new) + for k, v in x.items()} + elif isinstance(x, str): + return x.replace(original, new) + elif isinstance(x, list): + return [recursive_string_replace(i, original, new) for i in x] + elif isinstance(x, tuple): + return tuple(recursive_string_replace(i, original, new) for i in x) + else: + return x + + +def recursive_to_dict(maybe_dictable): + """Try to convert something into a recursive dictionary. + + The main use case is to have some generic config/loading module (e.g. + pyhocon) but to convert everything to a dictionary right before you use it. + + :param maybe_dictable: Something that may be a dictionary. + :return: + """ + try: + dict_version = dict(maybe_dictable) + return { + k: recursive_to_dict(v) + for k, v in dict_version.items() + } + except (TypeError, ValueError): + return maybe_dictable + # TODO(vitchyr): test methods/classes below @@ -240,25 +278,61 @@ def recursive_items(dictionary): """ -def map_recursive(fctn, x_or_iterable): +def treemap(f, *args, atomic_type=None, **kwargs): """ - Apply `fctn` to each element in x_or_iterable. + Recursively apply a function to a data structure. + + Usage: + ``` + def add(x, y, constant=0): + return x + y + constant + + tree1 = ( + [1, 2], + {'foo': 3, 'bar': 4}, + 5 + ) + tree2 = ( + [6, 7], + {'foo': 8, 'bar': 9}, + 10 + ) - This is a generalization of the map function since this will work - recursively for iterables. + treemap(add, tree1, tree2, constant=100) + ``` + will return + ``` + ( + [107, 109], + {'foo': 111, 'bar': 113}, + 115 + ) + ``` - :param fctn: Function from element of iterable to something. - :param x_or_iterable: An element or an Iterable of an element. - :return: The same (potentially recursive) iterable but with - all the elements transformed by fctn. + Currently only supports Mapping and Iterable data structures. + :param f: Function to apply. + :param args: Data structures over which to apply the function. 
+ :param kwargs: keyword arguments that are passed to the base function + directly. + :return: + """ - # if isinstance(x_or_iterable, Iterable): - if isinstance(x_or_iterable, list) or isinstance(x_or_iterable, tuple): - return type(x_or_iterable)( - map_recursive(fctn, item) for item in x_or_iterable - ) + if len(args) == 0: + return f(**kwargs) + if atomic_type and isinstance(args[0], atomic_type): + return f(*args, **kwargs) + if isinstance(args[0], collections.abc.Mapping): + keys0 = args[0].keys() + assert all(keys0 == arg.keys() for arg in args) # same keys in all args + return type(args[0])({ + k: treemap(f, *tuple(d[k] for d in args), + atomic_type=atomic_type, **kwargs) + for k in args[0] + }) + elif isinstance(args[0], collections.abc.Iterable): + return type(args[0])(treemap(f, *a, atomic_type=atomic_type, **kwargs) + for a in zip(*args)) else: - return fctn(x_or_iterable) + return f(*args, **kwargs) def filter_recursive(x_or_iterable): diff --git a/rlkit/testing/__init__.py b/rlkit/testing/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/rlkit/testing/csv_util.py b/rlkit/testing/csv_util.py new file mode 100644 index 000000000..08a159775 --- /dev/null +++ b/rlkit/testing/csv_util.py @@ -0,0 +1,26 @@ +import csv + +def get_exp(fname): + with open(fname) as csvfile: + reader = csv.DictReader(csvfile) + output = [] + for row in reader: + output.append(row) + return output + +def check_equal(reference, output, keys): + for i in range(len(reference)): + reference_row = reference[i] + output_row = output[i] + for key in keys: + assert key in reference_row, "line %d key %s not in reference" % (i, key) + assert key in output_row, "line %d key %s not in output" % (i, key) + assert reference_row[key] == output_row[key], "line %d key %s reference: %s, output: %s" % (i, key, reference_row[key], output_row[key]) + +def check_exactly_equal(reference, output): + for i in range(len(reference)): + reference_row = reference[i] + output_row = output[i] + for key in reference_row: + assert key in output_row, key + assert reference_row[key] == output_row[key], "%s reference: %s, output: %s" % (key, reference_row[key], output_row[key]) diff --git a/rlkit/testing/np_test_case.py b/rlkit/testing/np_test_case.py new file mode 100644 index 000000000..07bc4ae6a --- /dev/null +++ b/rlkit/testing/np_test_case.py @@ -0,0 +1,99 @@ +import unittest + +import numpy as np + +from rlkit.testing.testing_utils import are_np_arrays_equal, \ + are_np_array_iterables_equal + + +class NPTestCase(unittest.TestCase): + """ + Numpy test case, providing useful assert methods.
+ """ + def assertNpEqual(self, np_arr1, np_arr2, msg="Numpy arrays not equal."): + self.assertTrue(are_np_arrays_equal(np_arr1, np_arr2), msg) + + def assertNpAlmostEqual( + self, + np_arr1, + np_arr2, + msg="Numpy arrays not equal.", + threshold=1e-5, + ): + self.assertTrue( + are_np_arrays_equal(np_arr1, np_arr2, threshold=threshold), + msg + ) + + def assertNpNotEqual(self, np_arr1, np_arr2, msg="Numpy arrays equal"): + self.assertFalse(are_np_arrays_equal(np_arr1, np_arr2), msg) + + def assertNpArraysEqual( + self, + np_arrays1, + np_arrays2, + msg=None, + ): + msg = msg or "Numpy arrays {} and {} are not equal".format( + np_arrays1, + np_arrays2, + ) + self.assertTrue( + are_np_array_iterables_equal( + np_arrays1, + np_arrays2, + ), + msg + ) + + # TODO(vpong): see why such a high threshold is needed + def assertNpArraysAlmostEqual( + self, + np_arrays1, + np_arrays2, + msg="Numpy array lists are not almost equal.", + threshold=1e-4, + ): + self.assertTrue( + are_np_array_iterables_equal( + np_arrays1, + np_arrays2, + threshold=threshold, + ), + msg + ) + + def assertNpArraysNotEqual( + self, + np_arrays1, + np_arrays2, + msg="Numpy array lists are equal." + ): + self.assertFalse(are_np_array_iterables_equal(np_arrays1, np_arrays2), + msg) + + def assertNpArraysNotAlmostEqual( + self, + np_arrays1, + np_arrays2, + msg="Numpy array lists are equal.", + threshold=1e-4, + ): + self.assertFalse( + are_np_array_iterables_equal( + np_arrays1, + np_arrays2, + threshold=threshold, + ), + msg + ) + + def assertNpArrayConstant( + self, + np_array: np.ndarray, + constant + ): + self.assertTrue( + (np_array == constant).all(), + msg="Not all values equal {0}".format(constant) + ) \ No newline at end of file diff --git a/rlkit/testing/stub_classes.py b/rlkit/testing/stub_classes.py new file mode 100644 index 000000000..48ef94108 --- /dev/null +++ b/rlkit/testing/stub_classes.py @@ -0,0 +1,107 @@ +import numpy as np +from gym.spaces import Box, Dict + +from rlkit.exploration_strategies.base import RawExplorationStrategy + +class StubEnv(object): + def __init__(self, obs_dim=1, action_dim=1, **kwargs): + self.obs_dim = obs_dim + obs_low = np.ones(obs_dim) * -1 + obs_high = np.ones(obs_dim) + self._observation_space = Box(obs_low, obs_high) + + self.action_dim = action_dim + action_low = np.ones(action_dim) * -1 + action_high = np.ones(action_dim) + self._action_space = Box(action_low, action_high) + + self.env = self # for env wrappers + + print("stub env unused kwargs", kwargs) + + def reset(self): + return np.zeros(self.obs_dim) + + def step(self, action): + return np.zeros(self.obs_dim), 0, 0, {} + + @property + def action_space(self): + return self._action_space + + @property + def horizon(self): + return 99999 + + @property + def observation_space(self): + return self._observation_space + +class StubMultiEnv(object): + def __init__(self, obs_dims=None, action_dim=1, **kwargs): + self.obs_dims = obs_dims + + spaces = [] + for name in self.obs_dims: + obs_dim = self.obs_dims[name] + obs_low = np.ones(obs_dim) * -1 + obs_high = np.ones(obs_dim) + spaces.append((name, Box(obs_low, obs_high))) + self._observation_space = Dict(spaces) + + self.action_dim = action_dim + action_low = np.ones(action_dim) * -1 + action_high = np.ones(action_dim) + self._action_space = Box(action_low, action_high) + + self.env = self # for env wrappers + + print("stub env unused kwargs", kwargs) + + def reset(self): + return self.get_obs() + + def step(self, action): + return self.get_obs(), 0, 0, {} + + def 
get_obs(self): + obs = dict() + for name in self.obs_dims: + obs_dim = self.obs_dims[name] + obs[name] = np.zeros(obs_dim) + return obs + + @property + def action_space(self): + return self._action_space + + @property + def horizon(self): + return 99999 + + @property + def observation_space(self): + return self._observation_space + + +class StubPolicy(object): + def __init__(self, action): + self._action = action + + def get_action(self, *arg, **kwargs): + return self._action, {} + + +class AddEs(RawExplorationStrategy): + """ + Returns action + constant. + """ + def __init__(self, number): + self._number = number + + def get_action(self, t, observation, policy, **kwargs): + action, _ = policy.get_action(observation) + return self.get_action_from_raw_action(action) + + def get_action_from_raw_action(self, action, **kwargs): + return self._number + action diff --git a/rlkit/testing/testing_utils.py b/rlkit/testing/testing_utils.py new file mode 100644 index 000000000..b66e9c32c --- /dev/null +++ b/rlkit/testing/testing_utils.py @@ -0,0 +1,45 @@ +import math +import numpy as np + + +def is_binomial_trial_likely(n, p, num_success, num_std=3): + """ + Returns whether or not seeing `num_success` successes is likely. + :param n: Number of trials. + :param p: Probability of success. + :param num_success: Number of successes + :param num_std: Number of standard deviations the results must be within. + :return: + """ + mean = n * p + std = math.sqrt(n * p * (1 - p)) + margin = num_std * std + return mean - margin < num_success < mean + margin + + +def are_np_array_iterables_equal(np_list1, np_list2, threshold=1e-5): + if np_list1.shape == () and np_list2.shape == (): + return are_np_arrays_equal(np_list1, np_list2) + return ( + len(np_list1) == len(np_list2) and + all(are_np_arrays_equal(arr1, arr2, threshold=threshold) + for arr1, arr2 in zip(np_list1, np_list2)) + ) + + +def are_np_arrays_equal(arr1, arr2, threshold=1e-5): + if arr1.shape != arr2.shape: + return False + return (np.abs(arr1 - arr2) <= threshold).all() + + +def is_list_subset(list1, list2): + for a in list1: + if a not in list2: + return False + return True + + +def are_dict_lists_equal(list1, list2): + return is_list_subset(list1, list2) and is_list_subset(list2, list1) + diff --git a/rlkit/testing/tf_test_case.py b/rlkit/testing/tf_test_case.py new file mode 100644 index 000000000..a128b913a --- /dev/null +++ b/rlkit/testing/tf_test_case.py @@ -0,0 +1,40 @@ +import numpy as np +import tensorflow as tf + +from rlkit.testing.np_test_case import NPTestCase + + +class TFTestCase(NPTestCase): + """ + Tensorflow test case, providing useful assert methods and clean default + session.
+ """ + def setUp(self): + tf.reset_default_graph() + self.sess = tf.get_default_session() or tf.Session() + self.sess_context = self.sess.as_default() + self.sess_context.__enter__() + + def tearDown(self): + self.sess_context.__exit__(None, None, None) + self.sess.close() + + def assertParamsEqual(self, network1, network2): + self.assertNpArraysEqual( + network1.get_param_values(), + network2.get_param_values(), + msg="Parameters are not equal.", + ) + + def assertParamsNotEqual(self, network1, network2): + self.assertNpArraysNotEqual( + network1.get_param_values(), + network2.get_param_values(), + msg="Parameters are equal.", + ) + + def randomize_param_values(self, network): + for v in network.get_params(): + self.sess.run( + v.assign(np.random.rand(*v.get_shape())) + ) diff --git a/rlkit/torch/sac/awac_trainer.py b/rlkit/torch/sac/awac_trainer.py index 56feeb4f2..b16b7511b 100644 --- a/rlkit/torch/sac/awac_trainer.py +++ b/rlkit/torch/sac/awac_trainer.py @@ -786,6 +786,7 @@ def train_from_torch(self, batch, train=True, pretrain=False,): 'terminals', ptu.get_numpy(terminals), )) + self.eval_statistics['replay_buffer_len'] = self.replay_buffer._size policy_statistics = add_prefix(dist.get_diagnostics(), "policy/") self.eval_statistics.update(policy_statistics) self.eval_statistics.update(create_stats_ordered_dict( diff --git a/tests/regression/README.md b/tests/regression/README.md new file mode 100644 index 000000000..38e9979c2 --- /dev/null +++ b/tests/regression/README.md @@ -0,0 +1,30 @@ +# Regression Tests + +The purpose of these tests is mainly to 1. catch bugs being introduced that cause past experiments/projects to fail 2. after code set up, know that the experiment you are running is exactly the one eg. reported in a paper. + +Tests may be run by individually running an individual file with `python`, or `pip install nose2` and run: +``` +nose2 -v -B -s tests/regression/ +``` + +The tests are divided into different project folders, and some require specific software setup. They are described below: + +### random + +Tests whether stochasticity is fully controlled and match the source tests. Unfortunately if `tests/regression/random/test_mujoco_env.py` fails then any tests that collect online data with MuJoCo environments are likely to also fail. + +### basic + +Tests whether the basic RL algorithm reference scripts run and the results exactly match with smaller batch sizes and epochs. + +### sac + +Tests SAC - this is redundant with `basic` but a different style of test. + +### awac + +Tests AWAC, which runs offline RL followed by online finetuning, in different domains. Any tests that are "offline" do not collect additional data in the environment and just test the algorithm. In particular `gcrl/test_awac_gcrl_offline.py` is a quick test to run the data is included in the repo. + +### val + +Tests VAL, split into two parts: one tests training the VQVAE and one tests running RL with a pretrained VQVAE. 
diff --git a/tests/regression/__init__.py b/tests/regression/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/regression/awac/__init__.py b/tests/regression/awac/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/regression/awac/hand/__init__.py b/tests/regression/awac/hand/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/regression/awac/hand/id0/debug.log b/tests/regression/awac/hand/id0/debug.log new file mode 100644 index 000000000..0090585d4 --- /dev/null +++ b/tests/regression/awac/hand/id0/debug.log @@ -0,0 +1,528 @@ +[id0] ------------------------------ --------------- +[id0] trainer/QF1 Loss 0.430996 +[id0] trainer/QF2 Loss 0.430787 +[id0] trainer/Policy Loss 37.697 +[id0] trainer/Q1 Predictions Mean -3.02359e-06 +[id0] trainer/Q1 Predictions Std 3.40742e-05 +[id0] trainer/Q1 Predictions Max 0 +[id0] trainer/Q1 Predictions Min -0.00038702 +[id0] trainer/Q2 Predictions Mean -0.000706951 +[id0] trainer/Q2 Predictions Std 0.000794925 +[id0] trainer/Q2 Predictions Max 0 +[id0] trainer/Q2 Predictions Min -0.00266178 +[id0] trainer/Q Targets Mean -0.430984 +[id0] trainer/Q Targets Std 0.495232 +[id0] trainer/Q Targets Max 0 +[id0] trainer/Q Targets Min -1.00543 +[id0] trainer/Log Pis Mean 37.6967 +[id0] trainer/Log Pis Std 3.7997 +[id0] trainer/Log Pis Max 45.9274 +[id0] trainer/Log Pis Min 26.4242 +[id0] trainer/rewards Mean -0.429688 +[id0] trainer/rewards Std 0.495031 +[id0] trainer/rewards Max 0 +[id0] trainer/rewards Min -1 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 94378 +[id0] trainer/policy/mean Mean 2.644e-05 +[id0] trainer/policy/mean Std 0.000604046 +[id0] trainer/policy/mean Max 0.00306807 +[id0] trainer/policy/mean Min -0.00326424 +[id0] trainer/policy/std Mean 0.0497871 +[id0] trainer/policy/std Std 3.72529e-09 +[id0] trainer/policy/std Max 0.0497871 +[id0] trainer/policy/std Min 0.0497871 +[id0] trainer/Advantage Weights Mean 0.0078125 +[id0] trainer/Advantage Weights Std 9.27213e-06 +[id0] trainer/Advantage Weights Max 0.00782936 +[id0] trainer/Advantage Weights Min 0.00778225 +[id0] trainer/Advantage Score Mean -0.000492835 +[id0] trainer/Advantage Score Std 0.000593715 +[id0] trainer/Advantage Score Max 0.000585229 +[id0] trainer/Advantage Score Min -0.00243256 +[id0] trainer/batch 0 +[id0] trainer/epoch_time 0.0268393 +[id0] ------------------------------ --------------- +[id0] ----------------------------------- --------------- +[id0] replay_buffer/size 94488 +[id0] trainer/num train calls 10 +[id0] trainer/QF1 Loss 0.541809 +[id0] trainer/QF2 Loss 0.53458 +[id0] trainer/Policy Loss 37.4076 +[id0] trainer/Q1 Predictions Mean -0.0578201 +[id0] trainer/Q1 Predictions Std 0.00994116 +[id0] trainer/Q1 Predictions Max -0.0389091 +[id0] trainer/Q1 Predictions Min -0.0677077 +[id0] trainer/Q2 Predictions Mean -0.0627062 +[id0] trainer/Q2 Predictions Std 0.0100067 +[id0] trainer/Q2 Predictions Max -0.0432269 +[id0] trainer/Q2 Predictions Min -0.0707417 +[id0] trainer/Q Targets Mean -0.601419 +[id0] trainer/Q Targets Std 0.489972 +[id0] trainer/Q Targets Max -0.00121503 +[id0] trainer/Q Targets Min -1.00257 +[id0] trainer/Log Pis Mean 37.3661 +[id0] trainer/Log Pis Std 3.27023 +[id0] trainer/Log Pis Max 42.6848 +[id0] trainer/Log Pis Min 33.2804 +[id0] trainer/rewards Mean -0.6 +[id0] trainer/rewards Std 0.489898 +[id0] trainer/rewards Max 0 +[id0] trainer/rewards Min -1 +[id0] 
trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 94488 +[id0] trainer/policy/mean Mean -0.00648146 +[id0] trainer/policy/mean Std 0.0393754 +[id0] trainer/policy/mean Max 0.0571199 +[id0] trainer/policy/mean Min -0.0556537 +[id0] trainer/policy/std Mean 0.0500107 +[id0] trainer/policy/std Std 2.66422e-07 +[id0] trainer/policy/std Max 0.0500114 +[id0] trainer/policy/std Min 0.0500103 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.000590957 +[id0] trainer/Advantage Weights Max 0.200814 +[id0] trainer/Advantage Weights Min 0.199178 +[id0] trainer/Advantage Score Mean -0.0211827 +[id0] trainer/Advantage Score Std 0.00147735 +[id0] trainer/Advantage Score Max -0.0191485 +[id0] trainer/Advantage Score Min -0.0232408 +[id0] expl/num steps total 110 +[id0] expl/num paths total 2 +[id0] expl/path length Mean 100 +[id0] expl/path length Std 0 +[id0] expl/path length Max 100 +[id0] expl/path length Min 100 +[id0] expl/Rewards Mean -1 +[id0] expl/Rewards Std 0 +[id0] expl/Rewards Max -1 +[id0] expl/Rewards Min -1 +[id0] expl/Returns Mean -100 +[id0] expl/Returns Std 0 +[id0] expl/Returns Max -100 +[id0] expl/Returns Min -100 +[id0] expl/Actions Mean -0.00604862 +[id0] expl/Actions Std 0.0633161 +[id0] expl/Actions Max 0.217646 +[id0] expl/Actions Min -0.211709 +[id0] expl/Num Paths 1 +[id0] expl/Average Returns -100 +[id0] eval/num steps total 100 +[id0] eval/num paths total 1 +[id0] eval/path length Mean 100 +[id0] eval/path length Std 0 +[id0] eval/path length Max 100 +[id0] eval/path length Min 100 +[id0] eval/Rewards Mean -1 +[id0] eval/Rewards Std 0 +[id0] eval/Rewards Max -1 +[id0] eval/Rewards Min -1 +[id0] eval/Returns Mean -100 +[id0] eval/Returns Std 0 +[id0] eval/Returns Max -100 +[id0] eval/Returns Min -100 +[id0] eval/Actions Mean -0.00671205 +[id0] eval/Actions Std 0.0393274 +[id0] eval/Actions Max 0.0751831 +[id0] eval/Actions Min -0.0744021 +[id0] eval/Num Paths 1 +[id0] eval/Average Returns -100 +[id0] time/epoch_time (s) 0.44418 +[id0] time/evaluation sampling (s) 0.132373 +[id0] time/exploration sampling (s) 0.138283 +[id0] time/global_time (s) 6.3876 +[id0] time/replay buffer data storing (s) 0.000680447 +[id0] time/saving (s) 0.0107133 +[id0] time/training (s) 0.143939 +[id0] epoch 0 +[id0] ----------------------------------- --------------- +[id0] ----------------------------------- --------------- +[id0] replay_buffer/size 94588 +[id0] trainer/num train calls 20 +[id0] trainer/QF1 Loss 0.289459 +[id0] trainer/QF2 Loss 0.288048 +[id0] trainer/Policy Loss 39.2572 +[id0] trainer/Q1 Predictions Mean -0.209552 +[id0] trainer/Q1 Predictions Std 0.0315131 +[id0] trainer/Q1 Predictions Max -0.149196 +[id0] trainer/Q1 Predictions Min -0.241264 +[id0] trainer/Q2 Predictions Mean -0.216391 +[id0] trainer/Q2 Predictions Std 0.0327007 +[id0] trainer/Q2 Predictions Max -0.154755 +[id0] trainer/Q2 Predictions Min -0.246627 +[id0] trainer/Q Targets Mean -0.402311 +[id0] trainer/Q Targets Std 0.489789 +[id0] trainer/Q Targets Max -0.00222209 +[id0] trainer/Q Targets Min -1.00322 +[id0] trainer/Log Pis Mean 39.1055 +[id0] trainer/Log Pis Std 1.76641 +[id0] trainer/Log Pis Max 41.0172 +[id0] trainer/Log Pis Min 36.1102 +[id0] trainer/rewards Mean -0.4 +[id0] trainer/rewards Std 0.489898 +[id0] trainer/rewards Max 0 +[id0] trainer/rewards Min -1 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals 
Min 0 +[id0] trainer/replay_buffer_len 94588 +[id0] trainer/policy/mean Mean -0.0283767 +[id0] trainer/policy/mean Std 0.175846 +[id0] trainer/policy/mean Max 0.264322 +[id0] trainer/policy/mean Min -0.267117 +[id0] trainer/policy/std Mean 0.0502269 +[id0] trainer/policy/std Std 4.20873e-06 +[id0] trainer/policy/std Max 0.050234 +[id0] trainer/policy/std Min 0.0502148 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.00318273 +[id0] trainer/Advantage Weights Max 0.203914 +[id0] trainer/Advantage Weights Min 0.194703 +[id0] trainer/Advantage Score Mean -0.0655858 +[id0] trainer/Advantage Score Std 0.00799222 +[id0] trainer/Advantage Score Max -0.0558326 +[id0] trainer/Advantage Score Min -0.0789438 +[id0] expl/num steps total 210 +[id0] expl/num paths total 3 +[id0] expl/path length Mean 100 +[id0] expl/path length Std 0 +[id0] expl/path length Max 100 +[id0] expl/path length Min 100 +[id0] expl/Rewards Mean -1 +[id0] expl/Rewards Std 0 +[id0] expl/Rewards Max -1 +[id0] expl/Rewards Min -1 +[id0] expl/Returns Mean -100 +[id0] expl/Returns Std 0 +[id0] expl/Returns Max -100 +[id0] expl/Returns Min -100 +[id0] expl/Actions Mean -0.0298466 +[id0] expl/Actions Std 0.182544 +[id0] expl/Actions Max 0.388272 +[id0] expl/Actions Min -0.394536 +[id0] expl/Num Paths 1 +[id0] expl/Average Returns -100 +[id0] eval/num steps total 200 +[id0] eval/num paths total 2 +[id0] eval/path length Mean 100 +[id0] eval/path length Std 0 +[id0] eval/path length Max 100 +[id0] eval/path length Min 100 +[id0] eval/Rewards Mean -1 +[id0] eval/Rewards Std 0 +[id0] eval/Rewards Max -1 +[id0] eval/Rewards Min -1 +[id0] eval/Returns Mean -100 +[id0] eval/Returns Std 0 +[id0] eval/Returns Max -100 +[id0] eval/Returns Min -100 +[id0] eval/Actions Mean -0.0298399 +[id0] eval/Actions Std 0.175925 +[id0] eval/Actions Max 0.327232 +[id0] eval/Actions Min -0.330864 +[id0] eval/Num Paths 1 +[id0] eval/Average Returns -100 +[id0] time/epoch_time (s) 0.410806 +[id0] time/evaluation sampling (s) 0.131974 +[id0] time/exploration sampling (s) 0.131596 +[id0] time/global_time (s) 6.80294 +[id0] time/replay buffer data storing (s) 0.000659943 +[id0] time/saving (s) 2.71797e-05 +[id0] time/training (s) 0.144542 +[id0] epoch 1 +[id0] ----------------------------------- --------------- +[id0] ----------------------------------- --------------- +[id0] replay_buffer/size 94688 +[id0] trainer/num train calls 30 +[id0] trainer/QF1 Loss 0.323031 +[id0] trainer/QF2 Loss 0.304242 +[id0] trainer/Policy Loss 37.0603 +[id0] trainer/Q1 Predictions Mean -0.670611 +[id0] trainer/Q1 Predictions Std 0.109438 +[id0] trainer/Q1 Predictions Max -0.561595 +[id0] trainer/Q1 Predictions Min -0.83286 +[id0] trainer/Q2 Predictions Mean -0.637274 +[id0] trainer/Q2 Predictions Std 0.10316 +[id0] trainer/Q2 Predictions Max -0.541261 +[id0] trainer/Q2 Predictions Min -0.79081 +[id0] trainer/Q Targets Mean -0.406869 +[id0] trainer/Q Targets Std 0.49003 +[id0] trainer/Q Targets Max -0.00506353 +[id0] trainer/Q Targets Min -1.00817 +[id0] trainer/Log Pis Mean 36.4262 +[id0] trainer/Log Pis Std 4.86645 +[id0] trainer/Log Pis Max 41.9765 +[id0] trainer/Log Pis Min 28.7056 +[id0] trainer/rewards Mean -0.4 +[id0] trainer/rewards Std 0.489898 +[id0] trainer/rewards Max 0 +[id0] trainer/rewards Min -1 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 94688 +[id0] trainer/policy/mean Mean -0.0845189 +[id0] trainer/policy/mean Std 0.532795 
+[id0] trainer/policy/mean Max 0.753737 +[id0] trainer/policy/mean Min -0.758149 +[id0] trainer/policy/std Mean 0.050423 +[id0] trainer/policy/std Std 1.82354e-05 +[id0] trainer/policy/std Max 0.0504539 +[id0] trainer/policy/std Min 0.0503917 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.00936739 +[id0] trainer/Advantage Weights Max 0.212892 +[id0] trainer/Advantage Weights Min 0.185305 +[id0] trainer/Advantage Score Mean -0.0368531 +[id0] trainer/Advantage Score Std 0.023583 +[id0] trainer/Advantage Score Max -0.00506538 +[id0] trainer/Advantage Score Min -0.0744564 +[id0] expl/num steps total 310 +[id0] expl/num paths total 4 +[id0] expl/path length Mean 100 +[id0] expl/path length Std 0 +[id0] expl/path length Max 100 +[id0] expl/path length Min 100 +[id0] expl/Rewards Mean -1 +[id0] expl/Rewards Std 0 +[id0] expl/Rewards Max -1 +[id0] expl/Rewards Min -1 +[id0] expl/Returns Mean -100 +[id0] expl/Returns Std 0 +[id0] expl/Returns Max -100 +[id0] expl/Returns Min -100 +[id0] expl/Actions Mean -0.069673 +[id0] expl/Actions Std 0.447311 +[id0] expl/Actions Max 0.837576 +[id0] expl/Actions Min -0.865931 +[id0] expl/Num Paths 1 +[id0] expl/Average Returns -100 +[id0] eval/num steps total 300 +[id0] eval/num paths total 3 +[id0] eval/path length Mean 100 +[id0] eval/path length Std 0 +[id0] eval/path length Max 100 +[id0] eval/path length Min 100 +[id0] eval/Rewards Mean -1 +[id0] eval/Rewards Std 0 +[id0] eval/Rewards Max -1 +[id0] eval/Rewards Min -1 +[id0] eval/Returns Mean -100 +[id0] eval/Returns Std 0 +[id0] eval/Returns Max -100 +[id0] eval/Returns Min -100 +[id0] eval/Actions Mean -0.0690714 +[id0] eval/Actions Std 0.442378 +[id0] eval/Actions Max 0.83838 +[id0] eval/Actions Min -0.840991 +[id0] eval/Num Paths 1 +[id0] eval/Average Returns -100 +[id0] time/epoch_time (s) 0.433728 +[id0] time/evaluation sampling (s) 0.144165 +[id0] time/exploration sampling (s) 0.143284 +[id0] time/global_time (s) 7.24049 +[id0] time/replay buffer data storing (s) 0.00064826 +[id0] time/saving (s) 2.45571e-05 +[id0] time/training (s) 0.143998 +[id0] epoch 2 +[id0] ----------------------------------- --------------- +[id0] ----------------------------------- --------------- +[id0] replay_buffer/size 94788 +[id0] trainer/num train calls 40 +[id0] trainer/QF1 Loss 0.212688 +[id0] trainer/QF2 Loss 0.224051 +[id0] trainer/Policy Loss 38.1473 +[id0] trainer/Q1 Predictions Mean -0.567949 +[id0] trainer/Q1 Predictions Std 0.0991776 +[id0] trainer/Q1 Predictions Max -0.431312 +[id0] trainer/Q1 Predictions Min -0.7372 +[id0] trainer/Q2 Predictions Mean -0.584185 +[id0] trainer/Q2 Predictions Std 0.0955245 +[id0] trainer/Q2 Predictions Max -0.452816 +[id0] trainer/Q2 Predictions Min -0.749657 +[id0] trainer/Q Targets Mean -0.409292 +[id0] trainer/Q Targets Std 0.490638 +[id0] trainer/Q Targets Max -0.0079322 +[id0] trainer/Q Targets Min -1.0153 +[id0] trainer/Log Pis Mean 37.6143 +[id0] trainer/Log Pis Std 1.28059 +[id0] trainer/Log Pis Max 40.025 +[id0] trainer/Log Pis Min 36.4052 +[id0] trainer/rewards Mean -0.4 +[id0] trainer/rewards Std 0.489898 +[id0] trainer/rewards Max 0 +[id0] trainer/rewards Min -1 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 94788 +[id0] trainer/policy/mean Mean -0.0731318 +[id0] trainer/policy/mean Std 0.573299 +[id0] trainer/policy/mean Max 0.897316 +[id0] trainer/policy/mean Min -0.894554 +[id0] trainer/policy/std Mean 0.0505917 +[id0] 
trainer/policy/std Std 5.44389e-05 +[id0] trainer/policy/std Max 0.0506736 +[id0] trainer/policy/std Min 0.0504877 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.0359507 +[id0] trainer/Advantage Weights Max 0.230955 +[id0] trainer/Advantage Weights Min 0.134236 +[id0] trainer/Advantage Score Mean -0.0508601 +[id0] trainer/Advantage Score Std 0.101171 +[id0] trainer/Advantage Score Max 0.0305518 +[id0] trainer/Advantage Score Min -0.240757 +[id0] expl/num steps total 410 +[id0] expl/num paths total 5 +[id0] expl/path length Mean 100 +[id0] expl/path length Std 0 +[id0] expl/path length Max 100 +[id0] expl/path length Min 100 +[id0] expl/Rewards Mean -1 +[id0] expl/Rewards Std 0 +[id0] expl/Rewards Max -1 +[id0] expl/Rewards Min -1 +[id0] expl/Returns Mean -100 +[id0] expl/Returns Std 0 +[id0] expl/Returns Max -100 +[id0] expl/Returns Min -100 +[id0] expl/Actions Mean -0.0746132 +[id0] expl/Actions Std 0.582549 +[id0] expl/Actions Max 1.05004 +[id0] expl/Actions Min -0.979273 +[id0] expl/Num Paths 1 +[id0] expl/Average Returns -100 +[id0] eval/num steps total 400 +[id0] eval/num paths total 4 +[id0] eval/path length Mean 100 +[id0] eval/path length Std 0 +[id0] eval/path length Max 100 +[id0] eval/path length Min 100 +[id0] eval/Rewards Mean -1 +[id0] eval/Rewards Std 0 +[id0] eval/Rewards Max -1 +[id0] eval/Rewards Min -1 +[id0] eval/Returns Mean -100 +[id0] eval/Returns Std 0 +[id0] eval/Returns Max -100 +[id0] eval/Returns Min -100 +[id0] eval/Actions Mean -0.0765953 +[id0] eval/Actions Std 0.589325 +[id0] eval/Actions Max 0.955766 +[id0] eval/Actions Min -0.953699 +[id0] eval/Num Paths 1 +[id0] eval/Average Returns -100 +[id0] time/epoch_time (s) 0.433787 +[id0] time/evaluation sampling (s) 0.14487 +[id0] time/exploration sampling (s) 0.141525 +[id0] time/global_time (s) 7.67767 +[id0] time/replay buffer data storing (s) 0.000645399 +[id0] time/saving (s) 2.19345e-05 +[id0] time/training (s) 0.144945 +[id0] epoch 3 +[id0] ----------------------------------- --------------- +[id0] ----------------------------------- --------------- +[id0] replay_buffer/size 94888 +[id0] trainer/num train calls 50 +[id0] trainer/QF1 Loss 0.245918 +[id0] trainer/QF2 Loss 0.221803 +[id0] trainer/Policy Loss 35.0431 +[id0] trainer/Q1 Predictions Mean -0.529338 +[id0] trainer/Q1 Predictions Std 0.118553 +[id0] trainer/Q1 Predictions Max -0.343747 +[id0] trainer/Q1 Predictions Min -0.675423 +[id0] trainer/Q2 Predictions Mean -0.552998 +[id0] trainer/Q2 Predictions Std 0.11044 +[id0] trainer/Q2 Predictions Max -0.373542 +[id0] trainer/Q2 Predictions Min -0.694036 +[id0] trainer/Q Targets Mean -1.01097 +[id0] trainer/Q Targets Std 0.00297953 +[id0] trainer/Q Targets Max -1.00546 +[id0] trainer/Q Targets Min -1.01356 +[id0] trainer/Log Pis Mean 34.6563 +[id0] trainer/Log Pis Std 2.59986 +[id0] trainer/Log Pis Max 39.3036 +[id0] trainer/Log Pis Min 31.745 +[id0] trainer/rewards Mean -1 +[id0] trainer/rewards Std 0 +[id0] trainer/rewards Max -1 +[id0] trainer/rewards Min -1 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 94888 +[id0] trainer/policy/mean Mean -0.0465579 +[id0] trainer/policy/mean Std 0.429905 +[id0] trainer/policy/mean Max 0.886022 +[id0] trainer/policy/mean Min -0.875919 +[id0] trainer/policy/std Mean 0.0507521 +[id0] trainer/policy/std Std 9.82493e-05 +[id0] trainer/policy/std Max 0.0508838 +[id0] trainer/policy/std Min 0.0505329 +[id0] trainer/Advantage 
Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.047163 +[id0] trainer/Advantage Weights Max 0.26051 +[id0] trainer/Advantage Weights Min 0.145292 +[id0] trainer/Advantage Score Mean -0.168038 +[id0] trainer/Advantage Score Std 0.119359 +[id0] trainer/Advantage Score Max -0.0217271 +[id0] trainer/Advantage Score Min -0.313674 +[id0] expl/num steps total 510 +[id0] expl/num paths total 6 +[id0] expl/path length Mean 100 +[id0] expl/path length Std 0 +[id0] expl/path length Max 100 +[id0] expl/path length Min 100 +[id0] expl/Rewards Mean -1 +[id0] expl/Rewards Std 0 +[id0] expl/Rewards Max -1 +[id0] expl/Rewards Min -1 +[id0] expl/Returns Mean -100 +[id0] expl/Returns Std 0 +[id0] expl/Returns Max -100 +[id0] expl/Returns Min -100 +[id0] expl/Actions Mean -0.0576382 +[id0] expl/Actions Std 0.534035 +[id0] expl/Actions Max 1.0441 +[id0] expl/Actions Min -0.995019 +[id0] expl/Num Paths 1 +[id0] expl/Average Returns -100 +[id0] eval/num steps total 500 +[id0] eval/num paths total 5 +[id0] eval/path length Mean 100 +[id0] eval/path length Std 0 +[id0] eval/path length Max 100 +[id0] eval/path length Min 100 +[id0] eval/Rewards Mean -1 +[id0] eval/Rewards Std 0 +[id0] eval/Rewards Max -1 +[id0] eval/Rewards Min -1 +[id0] eval/Returns Mean -100 +[id0] eval/Returns Std 0 +[id0] eval/Returns Max -100 +[id0] eval/Returns Min -100 +[id0] eval/Actions Mean -0.0611262 +[id0] eval/Actions Std 0.549484 +[id0] eval/Actions Max 0.962641 +[id0] eval/Actions Min -0.956991 +[id0] eval/Num Paths 1 +[id0] eval/Average Returns -100 +[id0] time/epoch_time (s) 0.449554 +[id0] time/evaluation sampling (s) 0.152599 +[id0] time/exploration sampling (s) 0.150493 +[id0] time/global_time (s) 8.13115 +[id0] time/replay buffer data storing (s) 0.000631094 +[id0] time/saving (s) 1.81198e-05 +[id0] time/training (s) 0.144146 +[id0] epoch 4 +[id0] ----------------------------------- --------------- diff --git a/tests/regression/awac/hand/id0/pretrain_policy.csv b/tests/regression/awac/hand/id0/pretrain_policy.csv new file mode 100644 index 000000000..e69de29bb diff --git a/tests/regression/awac/hand/id0/pretrain_q.csv b/tests/regression/awac/hand/id0/pretrain_q.csv new file mode 100644 index 000000000..650d9d15b --- /dev/null +++ b/tests/regression/awac/hand/id0/pretrain_q.csv @@ -0,0 +1,2 @@ +trainer/Advantage Score Max,trainer/Advantage Score Mean,trainer/Advantage Score Min,trainer/Advantage Score Std,trainer/Advantage Weights Max,trainer/Advantage Weights Mean,trainer/Advantage Weights Min,trainer/Advantage Weights Std,trainer/Log Pis Max,trainer/Log Pis Mean,trainer/Log Pis Min,trainer/Log Pis Std,trainer/Policy Loss,trainer/Q Targets Max,trainer/Q Targets Mean,trainer/Q Targets Min,trainer/Q Targets Std,trainer/Q1 Predictions Max,trainer/Q1 Predictions Mean,trainer/Q1 Predictions Min,trainer/Q1 Predictions Std,trainer/Q2 Predictions Max,trainer/Q2 Predictions Mean,trainer/Q2 Predictions Min,trainer/Q2 Predictions Std,trainer/QF1 Loss,trainer/QF2 Loss,trainer/batch,trainer/epoch_time,trainer/policy/mean Max,trainer/policy/mean Mean,trainer/policy/mean Min,trainer/policy/mean Std,trainer/policy/std Max,trainer/policy/std Mean,trainer/policy/std Min,trainer/policy/std Std,trainer/replay_buffer_len,trainer/rewards Max,trainer/rewards Mean,trainer/rewards Min,trainer/rewards Std,trainer/terminals Max,trainer/terminals Mean,trainer/terminals Min,trainer/terminals Std 
+0.0005852294,-0.00049283454,-0.0024325612,0.00059371465,0.007829357,0.007812499,0.007782245,9.2721275e-06,45.92739,37.696743,26.42419,3.7997017,37.69696,0.0,-0.43098378,-1.0054277,0.495232,0.0,-3.023595e-06,-0.00038702015,3.407418e-05,0.0,-0.0007069509,-0.0026617819,0.000794925,0.43099567,0.43078744,0,0.026839256286621094,0.0030680718,2.6439986e-05,-0.0032642372,0.0006040457,0.049787067,0.04978707,0.049787067,3.7252903e-09,94378,0.0,-0.4296875,-1.0,0.49503148,0.0,0.0,0.0,0.0 diff --git a/tests/regression/awac/hand/id0/progress.csv b/tests/regression/awac/hand/id0/progress.csv new file mode 100644 index 000000000..97eaa86ae --- /dev/null +++ b/tests/regression/awac/hand/id0/progress.csv @@ -0,0 +1,6 @@ +epoch,eval/Actions Max,eval/Actions Mean,eval/Actions Min,eval/Actions Std,eval/Average Returns,eval/Num Paths,eval/Returns Max,eval/Returns Mean,eval/Returns Min,eval/Returns Std,eval/Rewards Max,eval/Rewards Mean,eval/Rewards Min,eval/Rewards Std,eval/num paths total,eval/num steps total,eval/path length Max,eval/path length Mean,eval/path length Min,eval/path length Std,expl/Actions Max,expl/Actions Mean,expl/Actions Min,expl/Actions Std,expl/Average Returns,expl/Num Paths,expl/Returns Max,expl/Returns Mean,expl/Returns Min,expl/Returns Std,expl/Rewards Max,expl/Rewards Mean,expl/Rewards Min,expl/Rewards Std,expl/num paths total,expl/num steps total,expl/path length Max,expl/path length Mean,expl/path length Min,expl/path length Std,replay_buffer/size,time/epoch_time (s),time/evaluation sampling (s),time/exploration sampling (s),time/global_time (s),time/replay buffer data storing (s),time/saving (s),time/training (s),trainer/Advantage Score Max,trainer/Advantage Score Mean,trainer/Advantage Score Min,trainer/Advantage Score Std,trainer/Advantage Weights Max,trainer/Advantage Weights Mean,trainer/Advantage Weights Min,trainer/Advantage Weights Std,trainer/Log Pis Max,trainer/Log Pis Mean,trainer/Log Pis Min,trainer/Log Pis Std,trainer/Policy Loss,trainer/Q Targets Max,trainer/Q Targets Mean,trainer/Q Targets Min,trainer/Q Targets Std,trainer/Q1 Predictions Max,trainer/Q1 Predictions Mean,trainer/Q1 Predictions Min,trainer/Q1 Predictions Std,trainer/Q2 Predictions Max,trainer/Q2 Predictions Mean,trainer/Q2 Predictions Min,trainer/Q2 Predictions Std,trainer/QF1 Loss,trainer/QF2 Loss,trainer/num train calls,trainer/policy/mean Max,trainer/policy/mean Mean,trainer/policy/mean Min,trainer/policy/mean Std,trainer/policy/std Max,trainer/policy/std Mean,trainer/policy/std Min,trainer/policy/std Std,trainer/replay_buffer_len,trainer/rewards Max,trainer/rewards Mean,trainer/rewards Min,trainer/rewards Std,trainer/terminals Max,trainer/terminals Mean,trainer/terminals Min,trainer/terminals Std 
+0,0.0751831,-0.0067120497,-0.07440208,0.039327394,-100.0,1,-100,-100.0,-100,0.0,-1,-1.0,-1,0.0,1,100,100,100.0,100,0.0,0.21764643,-0.0060486193,-0.21170878,0.063316055,-100.0,1,-100,-100.0,-100,0.0,-1,-1.0,-1,0.0,2,110,100,100.0,100,0.0,94488,0.4441797733306885,0.13237333297729492,0.13828301429748535,6.387598037719727,0.0006804466247558594,0.010713338851928711,0.14393854141235352,-0.019148499,-0.021182675,-0.023240812,0.0014773487,0.20081446,0.2,0.19917756,0.0005909566,42.684845,37.366127,33.2804,3.2702262,37.40755,-0.0012150254,-0.6014195,-1.0025667,0.48997208,-0.038909107,-0.05782012,-0.06770767,0.009941163,-0.043226924,-0.06270619,-0.07074172,0.01000666,0.54180855,0.5345802,10,0.057119895,-0.0064814645,-0.055653658,0.03937538,0.050011396,0.050010722,0.050010297,2.664223e-07,94488,0.0,-0.6,-1.0,0.48989794,0.0,0.0,0.0,0.0 +1,0.32723245,-0.02983992,-0.33086422,0.17592543,-100.0,1,-100,-100.0,-100,0.0,-1,-1.0,-1,0.0,2,200,100,100.0,100,0.0,0.38827184,-0.029846583,-0.39453623,0.18254384,-100.0,1,-100,-100.0,-100,0.0,-1,-1.0,-1,0.0,3,210,100,100.0,100,0.0,94588,0.41080617904663086,0.1319742202758789,0.13159561157226562,6.802939176559448,0.000659942626953125,2.7179718017578125e-05,0.14454197883605957,-0.055832624,-0.06558577,-0.07894382,0.007992218,0.20391357,0.19999999,0.19470271,0.00318273,41.01719,39.105507,36.110214,1.766412,39.257187,-0.0022220896,-0.40231103,-1.0032197,0.4897892,-0.14919621,-0.20955181,-0.24126376,0.031513114,-0.15475512,-0.21639073,-0.24662727,0.032700714,0.2894588,0.28804755,20,0.2643216,-0.028376682,-0.26711684,0.17584603,0.050233953,0.050226945,0.050214797,4.208725e-06,94588,0.0,-0.4,-1.0,0.48989797,0.0,0.0,0.0,0.0 +2,0.83838034,-0.06907141,-0.84099126,0.44237795,-100.0,1,-100,-100.0,-100,0.0,-1,-1.0,-1,0.0,3,300,100,100.0,100,0.0,0.8375761,-0.06967298,-0.86593086,0.44731107,-100.0,1,-100,-100.0,-100,0.0,-1,-1.0,-1,0.0,4,310,100,100.0,100,0.0,94688,0.4337282180786133,0.1441652774810791,0.14328360557556152,7.2404868602752686,0.0006482601165771484,2.4557113647460938e-05,0.1439981460571289,-0.0050653815,-0.036853082,-0.07445642,0.023583008,0.21289217,0.20000002,0.1853051,0.009367392,41.976456,36.426247,28.705608,4.8664484,37.060284,-0.0050635263,-0.40686882,-1.0081694,0.49003008,-0.5615947,-0.670611,-0.8328599,0.109437816,-0.5412614,-0.6372739,-0.7908095,0.10316037,0.32303113,0.30424166,30,0.753737,-0.08451888,-0.75814915,0.53279483,0.05045392,0.050423015,0.050391745,1.8235358e-05,94688,0.0,-0.4,-1.0,0.48989797,0.0,0.0,0.0,0.0 +3,0.9557665,-0.07659533,-0.9536987,0.58932513,-100.0,1,-100,-100.0,-100,0.0,-1,-1.0,-1,0.0,4,400,100,100.0,100,0.0,1.0500412,-0.074613206,-0.97927314,0.58254904,-100.0,1,-100,-100.0,-100,0.0,-1,-1.0,-1,0.0,5,410,100,100.0,100,0.0,94788,0.43378710746765137,0.14487004280090332,0.1415250301361084,7.677665710449219,0.0006453990936279297,2.193450927734375e-05,0.1449449062347412,0.03055185,-0.05086013,-0.24075744,0.101171255,0.23095457,0.19999999,0.1342364,0.035950724,40.02499,37.614285,36.40517,1.2805876,38.147266,-0.007932198,-0.40929207,-1.0153013,0.4906375,-0.4313117,-0.5679489,-0.7371997,0.09917764,-0.4528161,-0.58418524,-0.7496567,0.0955245,0.2126884,0.22405101,40,0.8973159,-0.07313178,-0.8945535,0.5732988,0.05067363,0.05059171,0.050487734,5.4438875e-05,94788,0.0,-0.4,-1.0,0.48989797,0.0,0.0,0.0,0.0 
+4,0.9626407,-0.06112624,-0.95699066,0.54948413,-100.0,1,-100,-100.0,-100,0.0,-1,-1.0,-1,0.0,5,500,100,100.0,100,0.0,1.0440977,-0.05763822,-0.99501944,0.5340355,-100.0,1,-100,-100.0,-100,0.0,-1,-1.0,-1,0.0,6,510,100,100.0,100,0.0,94888,0.449554443359375,0.15259909629821777,0.15049290657043457,8.1311514377594,0.0006310939788818359,1.811981201171875e-05,0.14414620399475098,-0.021727145,-0.16803777,-0.3136738,0.11935903,0.26051006,0.2,0.14529239,0.047163025,39.303642,34.656303,31.744986,2.5998602,35.0431,-1.0054611,-1.010975,-1.0135633,0.0029795296,-0.34374738,-0.5293378,-0.67542297,0.118552946,-0.373542,-0.5529977,-0.6940357,0.11044008,0.24591793,0.22180313,50,0.8860216,-0.04655789,-0.87591904,0.42990506,0.050883844,0.050752144,0.050532945,9.8249286e-05,94888,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0 diff --git a/tests/regression/awac/hand/id0/script_name.txt b/tests/regression/awac/hand/id0/script_name.txt new file mode 100644 index 000000000..84b593ca0 --- /dev/null +++ b/tests/regression/awac/hand/id0/script_name.txt @@ -0,0 +1 @@ +tests/regression/awac/hand/test_awac_hand.py \ No newline at end of file diff --git a/tests/regression/awac/hand/id0/variant.json b/tests/regression/awac/hand/id0/variant.json new file mode 100644 index 000000000..156d2e67e --- /dev/null +++ b/tests/regression/awac/hand/id0/variant.json @@ -0,0 +1,106 @@ +{ + "add_env_demos": true, + "add_env_offpolicy_data": true, + "algo_kwargs": { + "batch_size": 5, + "min_num_steps_before_training": 10, + "num_epochs": 5, + "num_eval_steps_per_epoch": 100, + "num_expl_steps_per_train_loop": 100, + "num_trains_per_train_loop": 10 + }, + "algorithm": "AWAC", + "collection_mode": "batch", + "debug": true, + "env_demo_path": { + "is_demo": true, + "obs_dict": true, + "path": "demos/icml2020/hand/pen2_sparse.npy" + }, + "env_id": "pen-binary-v0", + "env_offpolicy_data_path": { + "is_demo": false, + "obs_dict": false, + "path": "demos/icml2020/hand/pen_bc_sparse4.npy", + "train_split": 0.9 + }, + "exp_id": 0, + "exp_name": "references/awac/hand/awac1", + "load_demos": true, + "logger_config": { + "run_id": 0, + "snapshot_gap": 100, + "snapshot_mode": "gap" + }, + "max_path_length": 200, + "normalize_env": false, + "path_loader_class": { + "$class": "rlkit.demos.source.dict_to_mdp_path_loader.DictToMDPPathLoader" + }, + "path_loader_kwargs": { + "demo_paths": [], + "obs_key": "state_observation" + }, + "policy_class": { + "$class": "rlkit.torch.sac.policies.gaussian_policy.GaussianPolicy" + }, + "policy_kwargs": { + "hidden_sizes": [ + 256, + 256 + ], + "max_log_std": 0, + "min_log_std": -6, + "std_architecture": "values" + }, + "pretrain_policy": true, + "pretrain_rl": true, + "qf_kwargs": { + "hidden_sizes": [ + 256, + 256 + ], + "output_activation": { + "$function": "rlkit.torch.networks.basic.Clamp" + } + }, + "replay_buffer_size": 1000000, + "run_id": 0, + "seed": "0", + "seedid": 0, + "sparse_reward": true, + "trainer_kwargs": { + "alpha": 0, + "awr_min_q": true, + "awr_use_mle_for_vf": true, + "awr_weight": 1.0, + "bc_num_pretrain_steps": 0, + "bc_weight": 0.0, + "beta": 0.5, + "clip_score": 0.5, + "compute_bc": false, + "discount": 0.99, + "policy_lr": 0.0003, + "policy_weight_decay": 0.0001, + "q_num_pretrain1_steps": 0, + "q_num_pretrain2_steps": 10, + "q_weight_decay": 0, + "qf_lr": 0.0003, + "reparam_weight": 0.0, + "reward_scale": 1, + "reward_transform_kwargs": null, + "rl_weight": 1.0, + "soft_target_tau": 0.005, + "target_update_period": 1, + "terminal_transform_kwargs": { + "b": 0, + "m": 0 + }, + 
"use_automatic_entropy_tuning": false, + "use_awr_update": true, + "use_reparam_update": false + }, + "trial_name": "id0", + "unique_id": "c7763e50-2054-4f0c-a288-addb1e964fac", + "version": "normal" +} \ No newline at end of file diff --git a/tests/regression/awac/hand/id0_offline/debug.log b/tests/regression/awac/hand/id0_offline/debug.log new file mode 100644 index 000000000..1eafc449f --- /dev/null +++ b/tests/regression/awac/hand/id0_offline/debug.log @@ -0,0 +1,528 @@ +[id0] ------------------------------ -------------- +[id0] trainer/QF1 Loss 0.587335 +[id0] trainer/QF2 Loss 0.586703 +[id0] trainer/Policy Loss 37.6971 +[id0] trainer/Q1 Predictions Mean -2.0639e-05 +[id0] trainer/Q1 Predictions Std 0.000144708 +[id0] trainer/Q1 Predictions Max 0 +[id0] trainer/Q1 Predictions Min -0.00117335 +[id0] trainer/Q2 Predictions Mean -0.000909909 +[id0] trainer/Q2 Predictions Std 0.000862791 +[id0] trainer/Q2 Predictions Max 0 +[id0] trainer/Q2 Predictions Min -0.00243598 +[id0] trainer/Q Targets Mean -0.58711 +[id0] trainer/Q Targets Std 0.492624 +[id0] trainer/Q Targets Max 0 +[id0] trainer/Q Targets Min -1.00652 +[id0] trainer/Log Pis Mean 37.6967 +[id0] trainer/Log Pis Std 3.7997 +[id0] trainer/Log Pis Max 45.9274 +[id0] trainer/Log Pis Min 26.4242 +[id0] trainer/rewards Mean -0.585938 +[id0] trainer/rewards Std 0.492559 +[id0] trainer/rewards Max 0 +[id0] trainer/rewards Min -1 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 4378 +[id0] trainer/policy/mean Mean 1.41675e-05 +[id0] trainer/policy/mean Std 0.000566262 +[id0] trainer/policy/mean Max 0.0028541 +[id0] trainer/policy/mean Min -0.00361637 +[id0] trainer/policy/std Mean 0.0497871 +[id0] trainer/policy/std Std 3.72529e-09 +[id0] trainer/policy/std Max 0.0497871 +[id0] trainer/policy/std Min 0.0497871 +[id0] trainer/Advantage Weights Mean 0.0078125 +[id0] trainer/Advantage Weights Std 9.11405e-06 +[id0] trainer/Advantage Weights Max 0.00783496 +[id0] trainer/Advantage Weights Min 0.00778689 +[id0] trainer/Advantage Score Mean -0.000567973 +[id0] trainer/Advantage Score Std 0.000583401 +[id0] trainer/Advantage Score Max 0.000867629 +[id0] trainer/Advantage Score Min -0.00220922 +[id0] trainer/batch 0 +[id0] trainer/epoch_time 0.0215847 +[id0] ------------------------------ -------------- +[id0] ----------------------------------- -------------- +[id0] replay_buffer/size 4488 +[id0] trainer/num train calls 10 +[id0] trainer/QF1 Loss 0.530111 +[id0] trainer/QF2 Loss 0.530509 +[id0] trainer/Policy Loss 37.4115 +[id0] trainer/Q1 Predictions Mean -0.0631376 +[id0] trainer/Q1 Predictions Std 0.0164455 +[id0] trainer/Q1 Predictions Max -0.0359238 +[id0] trainer/Q1 Predictions Min -0.0875088 +[id0] trainer/Q2 Predictions Mean -0.0667733 +[id0] trainer/Q2 Predictions Std 0.0137011 +[id0] trainer/Q2 Predictions Max -0.0402223 +[id0] trainer/Q2 Predictions Min -0.079071 +[id0] trainer/Q Targets Mean -0.601106 +[id0] trainer/Q Targets Std 0.489749 +[id0] trainer/Q Targets Max -0.00115218 +[id0] trainer/Q Targets Min -1.00137 +[id0] trainer/Log Pis Mean 37.366 +[id0] trainer/Log Pis Std 3.27023 +[id0] trainer/Log Pis Max 42.6848 +[id0] trainer/Log Pis Min 33.2803 +[id0] trainer/rewards Mean -0.6 +[id0] trainer/rewards Std 0.489898 +[id0] trainer/rewards Max 0 +[id0] trainer/rewards Min -1 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] 
trainer/replay_buffer_len 4488 +[id0] trainer/policy/mean Mean -0.00621008 +[id0] trainer/policy/mean Std 0.0420172 +[id0] trainer/policy/mean Max 0.0644337 +[id0] trainer/policy/mean Min -0.0634647 +[id0] trainer/policy/std Mean 0.0500109 +[id0] trainer/policy/std Std 6.22487e-07 +[id0] trainer/policy/std Max 0.0500119 +[id0] trainer/policy/std Min 0.0500096 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.00131027 +[id0] trainer/Advantage Weights Max 0.201731 +[id0] trainer/Advantage Weights Min 0.198306 +[id0] trainer/Advantage Score Mean -0.0228875 +[id0] trainer/Advantage Score Std 0.0032768 +[id0] trainer/Advantage Score Max -0.0185681 +[id0] trainer/Advantage Score Min -0.0271295 +[id0] expl/num steps total 110 +[id0] expl/num paths total 3 +[id0] expl/path length Mean 50 +[id0] expl/path length Std 0 +[id0] expl/path length Max 50 +[id0] expl/path length Min 50 +[id0] expl/Rewards Mean 0 +[id0] expl/Rewards Std 0 +[id0] expl/Rewards Max 0 +[id0] expl/Rewards Min 0 +[id0] expl/Returns Mean 0 +[id0] expl/Returns Std 0 +[id0] expl/Returns Max 0 +[id0] expl/Returns Min 0 +[id0] expl/Actions Mean -0.000271306 +[id0] expl/Actions Std 0.0491936 +[id0] expl/Actions Max 0.176716 +[id0] expl/Actions Min -0.176366 +[id0] expl/Num Paths 2 +[id0] expl/Average Returns 0 +[id0] eval/num steps total 100 +[id0] eval/num paths total 2 +[id0] eval/path length Mean 50 +[id0] eval/path length Std 0 +[id0] eval/path length Max 50 +[id0] eval/path length Min 50 +[id0] eval/Rewards Mean 0 +[id0] eval/Rewards Std 0 +[id0] eval/Rewards Max 0 +[id0] eval/Rewards Min 0 +[id0] eval/Returns Mean 0 +[id0] eval/Returns Std 0 +[id0] eval/Returns Max 0 +[id0] eval/Returns Min 0 +[id0] eval/Actions Mean -0.000916671 +[id0] eval/Actions Std 0.00486171 +[id0] eval/Actions Max 0.0057998 +[id0] eval/Actions Min -0.00579818 +[id0] eval/Num Paths 2 +[id0] eval/Average Returns 0 +[id0] time/epoch_time (s) 0.257109 +[id0] time/evaluation sampling (s) 0.0522325 +[id0] time/exploration sampling (s) 0.052789 +[id0] time/global_time (s) 4.11667 +[id0] time/replay buffer data storing (s) 0.000616074 +[id0] time/saving (s) 0.0116429 +[id0] time/training (s) 0.131897 +[id0] epoch 0 +[id0] ----------------------------------- -------------- +[id0] ----------------------------------- -------------- +[id0] replay_buffer/size 4588 +[id0] trainer/num train calls 20 +[id0] trainer/QF1 Loss 0.187533 +[id0] trainer/QF2 Loss 0.174136 +[id0] trainer/Policy Loss 39.3108 +[id0] trainer/Q1 Predictions Mean -0.276973 +[id0] trainer/Q1 Predictions Std 0.0452016 +[id0] trainer/Q1 Predictions Max -0.227012 +[id0] trainer/Q1 Predictions Min -0.345765 +[id0] trainer/Q2 Predictions Mean -0.261092 +[id0] trainer/Q2 Predictions Std 0.0236579 +[id0] trainer/Q2 Predictions Max -0.235645 +[id0] trainer/Q2 Predictions Min -0.302091 +[id0] trainer/Q Targets Mean -0.20258 +[id0] trainer/Q Targets Std 0.399967 +[id0] trainer/Q Targets Max -0.00244766 +[id0] trainer/Q Targets Min -1.00251 +[id0] trainer/Log Pis Mean 39.1055 +[id0] trainer/Log Pis Std 1.76641 +[id0] trainer/Log Pis Max 41.0172 +[id0] trainer/Log Pis Min 36.1102 +[id0] trainer/rewards Mean -0.2 +[id0] trainer/rewards Std 0.4 +[id0] trainer/rewards Max 0 +[id0] trainer/rewards Min -1 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 4588 +[id0] trainer/policy/mean Mean -0.0265185 +[id0] trainer/policy/mean Std 0.21096 +[id0] trainer/policy/mean Max 0.292389 
+[id0] trainer/policy/mean Min -0.293355 +[id0] trainer/policy/std Mean 0.0502269 +[id0] trainer/policy/std Std 4.40045e-06 +[id0] trainer/policy/std Max 0.0502351 +[id0] trainer/policy/std Min 0.0502205 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.00329152 +[id0] trainer/Advantage Weights Max 0.204288 +[id0] trainer/Advantage Weights Min 0.194871 +[id0] trainer/Advantage Score Mean -0.076051 +[id0] trainer/Advantage Score Std 0.00824854 +[id0] trainer/Advantage Score Max -0.0653758 +[id0] trainer/Advantage Score Min -0.088972 +[id0] expl/num steps total 210 +[id0] expl/num paths total 5 +[id0] expl/path length Mean 50 +[id0] expl/path length Std 0 +[id0] expl/path length Max 50 +[id0] expl/path length Min 50 +[id0] expl/Rewards Mean 0 +[id0] expl/Rewards Std 0 +[id0] expl/Rewards Max 0 +[id0] expl/Rewards Min 0 +[id0] expl/Returns Mean 0 +[id0] expl/Returns Std 0 +[id0] expl/Returns Max 0 +[id0] expl/Returns Min 0 +[id0] expl/Actions Mean -0.00244175 +[id0] expl/Actions Std 0.0535764 +[id0] expl/Actions Max 0.174067 +[id0] expl/Actions Min -0.191247 +[id0] expl/Num Paths 2 +[id0] expl/Average Returns 0 +[id0] eval/num steps total 200 +[id0] eval/num paths total 4 +[id0] eval/path length Mean 50 +[id0] eval/path length Std 0 +[id0] eval/path length Max 50 +[id0] eval/path length Min 50 +[id0] eval/Rewards Mean 0 +[id0] eval/Rewards Std 0 +[id0] eval/Rewards Max 0 +[id0] eval/Rewards Min 0 +[id0] eval/Returns Mean 0 +[id0] eval/Returns Std 0 +[id0] eval/Returns Max 0 +[id0] eval/Returns Min 0 +[id0] eval/Actions Mean -0.00253839 +[id0] eval/Actions Std 0.0176202 +[id0] eval/Actions Max 0.0221246 +[id0] eval/Actions Min -0.0223982 +[id0] eval/Num Paths 2 +[id0] eval/Average Returns 0 +[id0] time/epoch_time (s) 0.232375 +[id0] time/evaluation sampling (s) 0.0493882 +[id0] time/exploration sampling (s) 0.0502512 +[id0] time/global_time (s) 4.35213 +[id0] time/replay buffer data storing (s) 0.00065589 +[id0] time/saving (s) 1.38283e-05 +[id0] time/training (s) 0.130432 +[id0] epoch 1 +[id0] ----------------------------------- -------------- +[id0] ----------------------------------- -------------- +[id0] replay_buffer/size 4688 +[id0] trainer/num train calls 30 +[id0] trainer/QF1 Loss 0.242659 +[id0] trainer/QF2 Loss 0.242312 +[id0] trainer/Policy Loss 37.0019 +[id0] trainer/Q1 Predictions Mean -0.649627 +[id0] trainer/Q1 Predictions Std 0.108881 +[id0] trainer/Q1 Predictions Max -0.512828 +[id0] trainer/Q1 Predictions Min -0.750237 +[id0] trainer/Q2 Predictions Mean -0.606173 +[id0] trainer/Q2 Predictions Std 0.0692651 +[id0] trainer/Q2 Predictions Max -0.510576 +[id0] trainer/Q2 Predictions Min -0.671952 +[id0] trainer/Q Targets Mean -0.604844 +[id0] trainer/Q Targets Std 0.489962 +[id0] trainer/Q Targets Max -0.00471973 +[id0] trainer/Q Targets Min -1.00523 +[id0] trainer/Log Pis Mean 36.4276 +[id0] trainer/Log Pis Std 4.86645 +[id0] trainer/Log Pis Max 41.9779 +[id0] trainer/Log Pis Min 28.707 +[id0] trainer/rewards Mean -0.6 +[id0] trainer/rewards Std 0.489898 +[id0] trainer/rewards Max 0 +[id0] trainer/rewards Min -1 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 4688 +[id0] trainer/policy/mean Mean -0.0502694 +[id0] trainer/policy/mean Std 0.503637 +[id0] trainer/policy/mean Max 0.6963 +[id0] trainer/policy/mean Min -0.699244 +[id0] trainer/policy/std Mean 0.0504201 +[id0] trainer/policy/std Std 2.21303e-05 +[id0] trainer/policy/std Max 
0.0504548 +[id0] trainer/policy/std Min 0.050386 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.00570983 +[id0] trainer/Advantage Weights Max 0.210345 +[id0] trainer/Advantage Weights Min 0.195276 +[id0] trainer/Advantage Score Mean -0.0756233 +[id0] trainer/Advantage Score Std 0.0140877 +[id0] trainer/Advantage Score Max -0.0502083 +[id0] trainer/Advantage Score Min -0.0873759 +[id0] expl/num steps total 310 +[id0] expl/num paths total 7 +[id0] expl/path length Mean 50 +[id0] expl/path length Std 0 +[id0] expl/path length Max 50 +[id0] expl/path length Min 50 +[id0] expl/Rewards Mean 0 +[id0] expl/Rewards Std 0 +[id0] expl/Rewards Max 0 +[id0] expl/Rewards Min 0 +[id0] expl/Returns Mean 0 +[id0] expl/Returns Std 0 +[id0] expl/Returns Max 0 +[id0] expl/Returns Min 0 +[id0] expl/Actions Mean -0.00426805 +[id0] expl/Actions Std 0.0644527 +[id0] expl/Actions Max 0.21705 +[id0] expl/Actions Min -0.172821 +[id0] expl/Num Paths 2 +[id0] expl/Average Returns 0 +[id0] eval/num steps total 300 +[id0] eval/num paths total 6 +[id0] eval/path length Mean 50 +[id0] eval/path length Std 0 +[id0] eval/path length Max 50 +[id0] eval/path length Min 50 +[id0] eval/Rewards Mean 0 +[id0] eval/Rewards Std 0 +[id0] eval/Rewards Max 0 +[id0] eval/Rewards Min 0 +[id0] eval/Returns Mean 0 +[id0] eval/Returns Std 0 +[id0] eval/Returns Max 0 +[id0] eval/Returns Min 0 +[id0] eval/Actions Mean -0.00411224 +[id0] eval/Actions Std 0.0409106 +[id0] eval/Actions Max 0.0553754 +[id0] eval/Actions Min -0.0561151 +[id0] eval/Num Paths 2 +[id0] eval/Average Returns 0 +[id0] time/epoch_time (s) 0.246871 +[id0] time/evaluation sampling (s) 0.0508497 +[id0] time/exploration sampling (s) 0.0508993 +[id0] time/global_time (s) 4.6022 +[id0] time/replay buffer data storing (s) 0.000656843 +[id0] time/saving (s) 1.45435e-05 +[id0] time/training (s) 0.142996 +[id0] epoch 2 +[id0] ----------------------------------- -------------- +[id0] ----------------------------------- -------------- +[id0] replay_buffer/size 4788 +[id0] trainer/num train calls 40 +[id0] trainer/QF1 Loss 0.200712 +[id0] trainer/QF2 Loss 0.221927 +[id0] trainer/Policy Loss 38.1063 +[id0] trainer/Q1 Predictions Mean -0.489174 +[id0] trainer/Q1 Predictions Std 0.0613553 +[id0] trainer/Q1 Predictions Max -0.417736 +[id0] trainer/Q1 Predictions Min -0.594545 +[id0] trainer/Q2 Predictions Mean -0.506482 +[id0] trainer/Q2 Predictions Std 0.0459208 +[id0] trainer/Q2 Predictions Max -0.44493 +[id0] trainer/Q2 Predictions Min -0.579774 +[id0] trainer/Q Targets Mean -0.208323 +[id0] trainer/Q Targets Std 0.400345 +[id0] trainer/Q Targets Max -0.00777229 +[id0] trainer/Q Targets Min -1.00901 +[id0] trainer/Log Pis Mean 37.6202 +[id0] trainer/Log Pis Std 1.28059 +[id0] trainer/Log Pis Max 40.0309 +[id0] trainer/Log Pis Min 36.4111 +[id0] trainer/rewards Mean -0.2 +[id0] trainer/rewards Std 0.4 +[id0] trainer/rewards Max 0 +[id0] trainer/rewards Min -1 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 4788 +[id0] trainer/policy/mean Mean -0.0453844 +[id0] trainer/policy/mean Std 0.652906 +[id0] trainer/policy/mean Max 0.899897 +[id0] trainer/policy/mean Min -0.893193 +[id0] trainer/policy/std Mean 0.0505792 +[id0] trainer/policy/std Std 6.09606e-05 +[id0] trainer/policy/std Max 0.0506763 +[id0] trainer/policy/std Min 0.0504869 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.00573491 +[id0] 
trainer/Advantage Weights Max 0.210629 +[id0] trainer/Advantage Weights Min 0.194751 +[id0] trainer/Advantage Score Mean -0.0234178 +[id0] trainer/Advantage Score Std 0.0141337 +[id0] trainer/Advantage Score Max 0.00267541 +[id0] trainer/Advantage Score Min -0.0365132 +[id0] expl/num steps total 410 +[id0] expl/num paths total 9 +[id0] expl/path length Mean 50 +[id0] expl/path length Std 0 +[id0] expl/path length Max 50 +[id0] expl/path length Min 50 +[id0] expl/Rewards Mean 0 +[id0] expl/Rewards Std 0 +[id0] expl/Rewards Max 0 +[id0] expl/Rewards Min 0 +[id0] expl/Returns Mean 0 +[id0] expl/Returns Std 0 +[id0] expl/Returns Max 0 +[id0] expl/Returns Min 0 +[id0] expl/Actions Mean -0.00290322 +[id0] expl/Actions Std 0.0797602 +[id0] expl/Actions Max 0.229309 +[id0] expl/Actions Min -0.234993 +[id0] expl/Num Paths 2 +[id0] expl/Average Returns 0 +[id0] eval/num steps total 400 +[id0] eval/num paths total 8 +[id0] eval/path length Mean 50 +[id0] eval/path length Std 0 +[id0] eval/path length Max 50 +[id0] eval/path length Min 50 +[id0] eval/Rewards Mean 0 +[id0] eval/Rewards Std 0 +[id0] eval/Rewards Max 0 +[id0] eval/Rewards Min 0 +[id0] eval/Returns Mean 0 +[id0] eval/Returns Std 0 +[id0] eval/Returns Max 0 +[id0] eval/Returns Min 0 +[id0] eval/Actions Mean -0.00327218 +[id0] eval/Actions Std 0.0611877 +[id0] eval/Actions Max 0.0900031 +[id0] eval/Actions Min -0.0884993 +[id0] eval/Num Paths 2 +[id0] eval/Average Returns 0 +[id0] time/epoch_time (s) 0.247592 +[id0] time/evaluation sampling (s) 0.0504873 +[id0] time/exploration sampling (s) 0.0508873 +[id0] time/global_time (s) 4.85258 +[id0] time/replay buffer data storing (s) 0.000612259 +[id0] time/saving (s) 1.38283e-05 +[id0] time/training (s) 0.143778 +[id0] epoch 3 +[id0] ----------------------------------- -------------- +[id0] ----------------------------------- -------------- +[id0] replay_buffer/size 4888 +[id0] trainer/num train calls 50 +[id0] trainer/QF1 Loss 0.278162 +[id0] trainer/QF2 Loss 0.296417 +[id0] trainer/Policy Loss 35.0882 +[id0] trainer/Q1 Predictions Mean -0.437299 +[id0] trainer/Q1 Predictions Std 0.0387452 +[id0] trainer/Q1 Predictions Max -0.375038 +[id0] trainer/Q1 Predictions Min -0.495727 +[id0] trainer/Q2 Predictions Mean -0.423185 +[id0] trainer/Q2 Predictions Std 0.0229848 +[id0] trainer/Q2 Predictions Max -0.381832 +[id0] trainer/Q2 Predictions Min -0.449819 +[id0] trainer/Q Targets Mean -0.813099 +[id0] trainer/Q Targets Std 0.400558 +[id0] trainer/Q Targets Max -0.0119891 +[id0] trainer/Q Targets Min -1.01662 +[id0] trainer/Log Pis Mean 34.6726 +[id0] trainer/Log Pis Std 2.59986 +[id0] trainer/Log Pis Max 39.3199 +[id0] trainer/Log Pis Min 31.7613 +[id0] trainer/rewards Mean -0.8 +[id0] trainer/rewards Std 0.4 +[id0] trainer/rewards Max 0 +[id0] trainer/rewards Min -1 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 4888 +[id0] trainer/policy/mean Mean -0.0412092 +[id0] trainer/policy/mean Std 0.649419 +[id0] trainer/policy/mean Max 0.919502 +[id0] trainer/policy/mean Min -0.909987 +[id0] trainer/policy/std Mean 0.0507177 +[id0] trainer/policy/std Std 0.000108938 +[id0] trainer/policy/std Max 0.0508712 +[id0] trainer/policy/std Min 0.0505374 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.00593372 +[id0] trainer/Advantage Weights Max 0.20982 +[id0] trainer/Advantage Weights Min 0.191195 +[id0] trainer/Advantage Score Mean -0.0238996 +[id0] trainer/Advantage 
Score Std 0.0147918 +[id0] trainer/Advantage Score Max 0.000285596 +[id0] trainer/Advantage Score Min -0.0461929 +[id0] expl/num steps total 510 +[id0] expl/num paths total 11 +[id0] expl/path length Mean 50 +[id0] expl/path length Std 0 +[id0] expl/path length Max 50 +[id0] expl/path length Min 50 +[id0] expl/Rewards Mean 0 +[id0] expl/Rewards Std 0 +[id0] expl/Rewards Max 0 +[id0] expl/Rewards Min 0 +[id0] expl/Returns Mean 0 +[id0] expl/Returns Std 0 +[id0] expl/Returns Max 0 +[id0] expl/Returns Min 0 +[id0] expl/Actions Mean -0.0021044 +[id0] expl/Actions Std 0.077818 +[id0] expl/Actions Max 0.231755 +[id0] expl/Actions Min -0.248131 +[id0] expl/Num Paths 2 +[id0] expl/Average Returns 0 +[id0] eval/num steps total 500 +[id0] eval/num paths total 10 +[id0] eval/path length Mean 50 +[id0] eval/path length Std 0 +[id0] eval/path length Max 50 +[id0] eval/path length Min 50 +[id0] eval/Rewards Mean 0 +[id0] eval/Rewards Std 0 +[id0] eval/Rewards Max 0 +[id0] eval/Rewards Min 0 +[id0] eval/Returns Mean 0 +[id0] eval/Returns Std 0 +[id0] eval/Returns Max 0 +[id0] eval/Returns Min 0 +[id0] eval/Actions Mean -0.0026689 +[id0] eval/Actions Std 0.0594182 +[id0] eval/Actions Max 0.0957613 +[id0] eval/Actions Min -0.0931032 +[id0] eval/Num Paths 2 +[id0] eval/Average Returns 0 +[id0] time/epoch_time (s) 0.249234 +[id0] time/evaluation sampling (s) 0.051774 +[id0] time/exploration sampling (s) 0.0512688 +[id0] time/global_time (s) 5.10534 +[id0] time/replay buffer data storing (s) 0.00061202 +[id0] time/saving (s) 1.64509e-05 +[id0] time/training (s) 0.143926 +[id0] epoch 4 +[id0] ----------------------------------- -------------- diff --git a/tests/regression/awac/hand/id0_offline/pretrain_policy.csv b/tests/regression/awac/hand/id0_offline/pretrain_policy.csv new file mode 100644 index 000000000..e69de29bb diff --git a/tests/regression/awac/hand/id0_offline/pretrain_q.csv b/tests/regression/awac/hand/id0_offline/pretrain_q.csv new file mode 100644 index 000000000..e6724dbb7 --- /dev/null +++ b/tests/regression/awac/hand/id0_offline/pretrain_q.csv @@ -0,0 +1,2 @@ +trainer/Advantage Score Max,trainer/Advantage Score Mean,trainer/Advantage Score Min,trainer/Advantage Score Std,trainer/Advantage Weights Max,trainer/Advantage Weights Mean,trainer/Advantage Weights Min,trainer/Advantage Weights Std,trainer/Log Pis Max,trainer/Log Pis Mean,trainer/Log Pis Min,trainer/Log Pis Std,trainer/Policy Loss,trainer/Q Targets Max,trainer/Q Targets Mean,trainer/Q Targets Min,trainer/Q Targets Std,trainer/Q1 Predictions Max,trainer/Q1 Predictions Mean,trainer/Q1 Predictions Min,trainer/Q1 Predictions Std,trainer/Q2 Predictions Max,trainer/Q2 Predictions Mean,trainer/Q2 Predictions Min,trainer/Q2 Predictions Std,trainer/QF1 Loss,trainer/QF2 Loss,trainer/batch,trainer/epoch_time,trainer/policy/mean Max,trainer/policy/mean Mean,trainer/policy/mean Min,trainer/policy/mean Std,trainer/policy/std Max,trainer/policy/std Mean,trainer/policy/std Min,trainer/policy/std Std,trainer/replay_buffer_len,trainer/rewards Max,trainer/rewards Mean,trainer/rewards Min,trainer/rewards Std,trainer/terminals Max,trainer/terminals Mean,trainer/terminals Min,trainer/terminals Std 
+0.0008676289,-0.0005679735,-0.0022092168,0.00058340095,0.007834959,0.007812501,0.0077868924,9.1140455e-06,45.92739,37.696743,26.42419,3.7997017,37.69711,0.0,-0.58710957,-1.006516,0.49262425,0.0,-2.0639023e-05,-0.0011733524,0.00014470788,0.0,-0.000909909,-0.00243598,0.00086279056,0.5873349,0.58670294,0,0.021584749221801758,0.0028540972,1.4167544e-05,-0.0036163693,0.00056626234,0.049787067,0.04978707,0.049787067,3.7252903e-09,4378,0.0,-0.5859375,-1.0,0.49255937,0.0,0.0,0.0,0.0 diff --git a/tests/regression/awac/hand/id0_offline/progress.csv b/tests/regression/awac/hand/id0_offline/progress.csv new file mode 100644 index 000000000..8b65ee4d3 --- /dev/null +++ b/tests/regression/awac/hand/id0_offline/progress.csv @@ -0,0 +1,6 @@ +epoch,eval/Actions Max,eval/Actions Mean,eval/Actions Min,eval/Actions Std,eval/Average Returns,eval/Num Paths,eval/Returns Max,eval/Returns Mean,eval/Returns Min,eval/Returns Std,eval/Rewards Max,eval/Rewards Mean,eval/Rewards Min,eval/Rewards Std,eval/num paths total,eval/num steps total,eval/path length Max,eval/path length Mean,eval/path length Min,eval/path length Std,expl/Actions Max,expl/Actions Mean,expl/Actions Min,expl/Actions Std,expl/Average Returns,expl/Num Paths,expl/Returns Max,expl/Returns Mean,expl/Returns Min,expl/Returns Std,expl/Rewards Max,expl/Rewards Mean,expl/Rewards Min,expl/Rewards Std,expl/num paths total,expl/num steps total,expl/path length Max,expl/path length Mean,expl/path length Min,expl/path length Std,replay_buffer/size,time/epoch_time (s),time/evaluation sampling (s),time/exploration sampling (s),time/global_time (s),time/replay buffer data storing (s),time/saving (s),time/training (s),trainer/Advantage Score Max,trainer/Advantage Score Mean,trainer/Advantage Score Min,trainer/Advantage Score Std,trainer/Advantage Weights Max,trainer/Advantage Weights Mean,trainer/Advantage Weights Min,trainer/Advantage Weights Std,trainer/Log Pis Max,trainer/Log Pis Mean,trainer/Log Pis Min,trainer/Log Pis Std,trainer/Policy Loss,trainer/Q Targets Max,trainer/Q Targets Mean,trainer/Q Targets Min,trainer/Q Targets Std,trainer/Q1 Predictions Max,trainer/Q1 Predictions Mean,trainer/Q1 Predictions Min,trainer/Q1 Predictions Std,trainer/Q2 Predictions Max,trainer/Q2 Predictions Mean,trainer/Q2 Predictions Min,trainer/Q2 Predictions Std,trainer/QF1 Loss,trainer/QF2 Loss,trainer/num train calls,trainer/policy/mean Max,trainer/policy/mean Mean,trainer/policy/mean Min,trainer/policy/mean Std,trainer/policy/std Max,trainer/policy/std Mean,trainer/policy/std Min,trainer/policy/std Std,trainer/replay_buffer_len,trainer/rewards Max,trainer/rewards Mean,trainer/rewards Min,trainer/rewards Std,trainer/terminals Max,trainer/terminals Mean,trainer/terminals Min,trainer/terminals Std 
+0,0.0057997955,-0.00091667136,-0.0057981755,0.0048617073,0.0,2,0,0.0,0,0.0,0,0.0,0,0.0,2,100,50,50.0,50,0.0,0.17671596,-0.00027130623,-0.17636557,0.049193554,0.0,2,0,0.0,0,0.0,0,0.0,0,0.0,3,110,50,50.0,50,0.0,4488,0.2571086883544922,0.05223250389099121,0.05278897285461426,4.116673469543457,0.0006160736083984375,0.011642932891845703,0.1318972110748291,-0.018568095,-0.022887528,-0.027129482,0.0032767975,0.20173094,0.20000002,0.19830613,0.0013102691,42.684753,37.366028,33.280308,3.2702258,37.41147,-0.0011521813,-0.6011065,-1.001372,0.48974857,-0.035923805,-0.06313755,-0.08750885,0.01644555,-0.040222347,-0.06677331,-0.07907102,0.0137011465,0.5301114,0.53050923,10,0.064433694,-0.0062100836,-0.06346466,0.04201718,0.050011873,0.05001092,0.05000961,6.224868e-07,4488,0.0,-0.6,-1.0,0.48989794,0.0,0.0,0.0,0.0 +1,0.022124615,-0.002538385,-0.022398187,0.01762015,0.0,2,0,0.0,0,0.0,0,0.0,0,0.0,4,200,50,50.0,50,0.0,0.1740673,-0.002441747,-0.19124705,0.053576443,0.0,2,0,0.0,0,0.0,0,0.0,0,0.0,5,210,50,50.0,50,0.0,4588,0.2323753833770752,0.04938817024230957,0.05025124549865723,4.352134704589844,0.0006558895111083984,1.3828277587890625e-05,0.13043212890625,-0.06537576,-0.07605104,-0.08897197,0.008248537,0.20428829,0.2,0.19487137,0.003291521,41.017193,39.105515,36.11022,1.766411,39.310795,-0.0024476647,-0.20258045,-1.0025142,0.39996693,-0.22701214,-0.27697256,-0.34576467,0.045201637,-0.2356447,-0.26109186,-0.3020911,0.023657888,0.18753262,0.17413564,20,0.29238892,-0.026518514,-0.2933545,0.21095972,0.050235093,0.050226934,0.050220545,4.4004523e-06,4588,0.0,-0.2,-1.0,0.40000004,0.0,0.0,0.0,0.0 +2,0.05537542,-0.0041122404,-0.056115124,0.04091064,0.0,2,0,0.0,0,0.0,0,0.0,0,0.0,6,300,50,50.0,50,0.0,0.2170499,-0.004268051,-0.17282078,0.06445268,0.0,2,0,0.0,0,0.0,0,0.0,0,0.0,7,310,50,50.0,50,0.0,4688,0.2468705177307129,0.05084967613220215,0.05089926719665527,4.602200269699097,0.0006568431854248047,1.4543533325195312e-05,0.14299607276916504,-0.05020833,-0.07562328,-0.08737588,0.014087667,0.21034452,0.2,0.19527556,0.005709826,41.97786,36.42765,28.707008,4.866449,37.001884,-0.0047197314,-0.60484385,-1.005229,0.4899623,-0.51282763,-0.6496271,-0.750237,0.108881496,-0.5105761,-0.6061728,-0.6719518,0.06926513,0.24265863,0.24231184,30,0.6962998,-0.050269432,-0.69924355,0.503637,0.05045482,0.05042008,0.050386023,2.2130274e-05,4688,0.0,-0.6,-1.0,0.48989794,0.0,0.0,0.0,0.0 +3,0.090003096,-0.0032721753,-0.088499345,0.061187737,0.0,2,0,0.0,0,0.0,0,0.0,0,0.0,8,400,50,50.0,50,0.0,0.22930935,-0.0029032195,-0.23499307,0.07976017,0.0,2,0,0.0,0,0.0,0,0.0,0,0.0,9,410,50,50.0,50,0.0,4788,0.24759197235107422,0.05048727989196777,0.050887346267700195,4.852576017379761,0.0006122589111328125,1.3828277587890625e-05,0.14377832412719727,0.002675414,-0.0234178,-0.03651324,0.014133674,0.21062946,0.2,0.19475125,0.005734913,40.030922,37.62022,36.41111,1.2805856,38.1063,-0.0077722864,-0.20832333,-1.0090127,0.40034488,-0.41773587,-0.4891737,-0.594545,0.06135534,-0.4449304,-0.50648177,-0.57977355,0.04592085,0.20071249,0.22192657,40,0.899897,-0.04538444,-0.89319265,0.6529056,0.050676323,0.050579205,0.050486892,6.0960647e-05,4788,0.0,-0.2,-1.0,0.40000004,0.0,0.0,0.0,0.0 
+4,0.09576128,-0.002668901,-0.09310322,0.059418164,0.0,2,0,0.0,0,0.0,0,0.0,0,0.0,10,500,50,50.0,50,0.0,0.23175487,-0.0021043965,-0.24813086,0.077817984,0.0,2,0,0.0,0,0.0,0,0.0,0,0.0,11,510,50,50.0,50,0.0,4888,0.24923419952392578,0.051774024963378906,0.051268815994262695,5.105340242385864,0.0006120204925537109,1.6450881958007812e-05,0.14392614364624023,0.00028559566,-0.023899574,-0.046192855,0.014791807,0.20981985,0.2,0.19119473,0.0059337234,39.319942,34.6726,31.761282,2.5998607,35.088203,-0.011989091,-0.8130989,-1.0166211,0.4005584,-0.37503818,-0.4372994,-0.4957269,0.038745232,-0.38183162,-0.42318493,-0.4498194,0.022984754,0.27816167,0.29641694,50,0.91950196,-0.041209158,-0.9099869,0.64941895,0.050871197,0.050717708,0.05053735,0.00010893781,4888,0.0,-0.8,-1.0,0.40000004,0.0,0.0,0.0,0.0 diff --git a/tests/regression/awac/hand/id0_offline/script_name.txt b/tests/regression/awac/hand/id0_offline/script_name.txt new file mode 100644 index 000000000..e79cb6baa --- /dev/null +++ b/tests/regression/awac/hand/id0_offline/script_name.txt @@ -0,0 +1 @@ +tests/regression/awac/hand/test_awac_hand_offline.py \ No newline at end of file diff --git a/tests/regression/awac/hand/id0_offline/variant.json b/tests/regression/awac/hand/id0_offline/variant.json new file mode 100644 index 000000000..9b91545ba --- /dev/null +++ b/tests/regression/awac/hand/id0_offline/variant.json @@ -0,0 +1,110 @@ +{ + "add_env_demos": false, + "add_env_offpolicy_data": false, + "algo_kwargs": { + "batch_size": 5, + "min_num_steps_before_training": 10, + "num_epochs": 5, + "num_eval_steps_per_epoch": 100, + "num_expl_steps_per_train_loop": 100, + "num_trains_per_train_loop": 10 + }, + "algorithm": "AWAC", + "batch_size": 5, + "collection_mode": "batch", + "debug": true, + "env_class": { + "$class": "rlkit.testing.stub_classes.StubEnv" + }, + "env_kwargs": { + "action_dim": 24, + "obs_dim": 45 + }, + "exp_id": 0, + "exp_name": "references/awac/hand/awac_offline1", + "load_demos": true, + "logger_config": { + "run_id": 0, + "snapshot_gap": 100, + "snapshot_mode": "gap" + }, + "max_path_length": 50, + "normalize_env": false, + "num_epochs": 0, + "path_loader_class": { + "$class": "rlkit.demos.source.dict_to_mdp_path_loader.DictToMDPPathLoader" + }, + "path_loader_kwargs": { + "demo_paths": [ + { + "data_split": 1, + "is_demo": true, + "obs_dict": true, + "path": "/home/ashvin/code/railrl-private/tests/regression/awac/hand/pen2_sparse.npy" + } + ], + "obs_key": "state_observation" + }, + "policy_class": { + "$class": "rlkit.torch.sac.policies.gaussian_policy.GaussianPolicy" + }, + "policy_kwargs": { + "hidden_sizes": [ + 256, + 256 + ], + "max_log_std": 0, + "min_log_std": -6, + "std_architecture": "values" + }, + "pretrain_policy": true, + "pretrain_rl": true, + "pretraining_logging_period": 1, + "qf_kwargs": { + "hidden_sizes": [ + 256, + 256 + ], + "output_activation": { + "$function": "rlkit.torch.networks.basic.Clamp" + } + }, + "replay_buffer_size": 1000000, + "run_id": 0, + "seed": "0", + "seedid": 0, + "trainer_kwargs": { + "alpha": 0, + "awr_min_q": true, + "awr_use_mle_for_vf": true, + "awr_weight": 1.0, + "bc_num_pretrain_steps": 0, + "bc_weight": 0.0, + "beta": 0.5, + "clip_score": 0.5, + "compute_bc": false, + "discount": 0.99, + "policy_lr": 0.0003, + "policy_weight_decay": 0.0001, + "q_num_pretrain1_steps": 0, + "q_num_pretrain2_steps": 10, + "q_weight_decay": 0, + "qf_lr": 0.0003, + "reparam_weight": 0.0, + "reward_scale": 1, + "reward_transform_kwargs": null, + "rl_weight": 1.0, + "soft_target_tau": 0.005, + 
"target_update_period": 1, + "terminal_transform_kwargs": { + "b": 0, + "m": 0 + }, + "use_automatic_entropy_tuning": false, + "use_awr_update": true, + "use_reparam_update": false + }, + "trial_name": "id0", + "unique_id": "78d5532f-cbe6-4907-9ae1-2cc8ac332a5d", + "version": "normal" +} \ No newline at end of file diff --git a/tests/regression/awac/hand/pen2_sparse.npy b/tests/regression/awac/hand/pen2_sparse.npy new file mode 100644 index 000000000..5113ae865 Binary files /dev/null and b/tests/regression/awac/hand/pen2_sparse.npy differ diff --git a/tests/regression/awac/hand/test_awac_hand.py b/tests/regression/awac/hand/test_awac_hand.py new file mode 100644 index 000000000..4bf58b5e2 --- /dev/null +++ b/tests/regression/awac/hand/test_awac_hand.py @@ -0,0 +1,54 @@ +"""Test AWAC offline+online on Mujoco dextrous manipulation tasks. + +Running the dexterous manipulation experiments requires setting up the +environments in this repository: https://github.com/aravindr93/hand_dapg. +You can also use the follwing docker image, which has the required +dependencies set up: anair17/railrl-hand-v3 + +For the mj_envs repository, please use: https://github.com/anair13/mj_envs + +Data available for download: +https://drive.google.com/file/d/1SsVaQKZnY5UkuR78WrInp9XxTdKHbF0x/view +""" +import os +import sys + +from examples.awac.hand.awac1 import main + +from rlkit.core import logger +from rlkit.testing import csv_util + +def test_awac_hand_online(): + # the following hack required because of a conflict between env naming in d4rl and mj_envs + import gym + custom_envs = ['door-v0', 'pen-v0', 'relocate-v0', 'hammer-v0'] + env_dict = gym.envs.registration.registry.env_specs.copy() + for custom_env in custom_envs: + if custom_env in env_dict: + print("Remove {} from registry".format(custom_env)) + del gym.envs.registration.registry.env_specs[custom_env] + + cmd = "python examples/awac/hand/awac1.py --1 --local --gpu --run 0 --seed 0 --debug" + sys.argv = cmd.split(" ")[1:] + main() + + # check if online training results matches + reference_csv = "tests/regression/awac/hand/id0/pretrain_q.csv" + output_csv = os.path.join(logger.get_snapshot_dir(), "pretrain_q.csv") + print("comparing reference %s against output %s" % (reference_csv, output_csv)) + output = csv_util.get_exp(output_csv) + reference = csv_util.get_exp(reference_csv) + keys = ["trainer/batch", "trainer/Advantage Score Max", "trainer/Q1 Predictions Mean", "trainer/replay_buffer_len"] + csv_util.check_equal(reference, output, keys) + + # TODO: this test seems to have some extra stochasticity to control, perhaps from the env? + # check if online training results match + # reference_csv = "tests/regression/awac/hand/id0/progress.csv" + # output_csv = os.path.join(logger.get_snapshot_dir(), "progress.csv") + # output = csv_util.get_exp(output_csv) + # reference = csv_util.get_exp(reference_csv) + # keys = ["epoch", "eval/Actions Mean", "expl/Actions Mean", "eval/Average Returns", "expl/Average Returns", "trainer/Advantage Score Max", "trainer/Q1 Predictions Mean", "trainer/replay_buffer_len"] + # csv_util.check_equal(reference, output, keys) + +if __name__ == "__main__": + test_awac_hand_online() diff --git a/tests/regression/awac/hand/test_awac_hand_offline.py b/tests/regression/awac/hand/test_awac_hand_offline.py new file mode 100644 index 000000000..556ed8b76 --- /dev/null +++ b/tests/regression/awac/hand/test_awac_hand_offline.py @@ -0,0 +1,169 @@ +"""Test AWAC offline on Mujoco dextrous manipulation tasks. 
diff --git a/tests/regression/awac/hand/test_awac_hand_offline.py b/tests/regression/awac/hand/test_awac_hand_offline.py
new file mode 100644
index 000000000..556ed8b76
--- /dev/null
+++ b/tests/regression/awac/hand/test_awac_hand_offline.py
@@ -0,0 +1,169 @@
+"""Test AWAC offline on Mujoco dexterous manipulation tasks.
+
+Running the dexterous manipulation experiments requires setting up the
+environments in this repository: https://github.com/aravindr93/hand_dapg.
+You can also use the following docker image, which has the required
+dependencies set up: anair17/railrl-hand-v3
+
+For the mj_envs repository, please use: https://github.com/anair13/mj_envs
+
+Data available for download:
+https://drive.google.com/file/d/1SsVaQKZnY5UkuR78WrInp9XxTdKHbF0x/view
+"""
+import os
+import sys
+
+from rlkit.demos.source.dict_to_mdp_path_loader import DictToMDPPathLoader
+from rlkit.launchers.experiments.awac.awac_rl import experiment, process_args
+
+import rlkit.util.hyperparameter as hyp
+from rlkit.launchers.arglauncher import run_variants
+
+from rlkit.torch.sac.policies import GaussianPolicy
+from rlkit.torch.networks import Clamp
+
+from rlkit.testing.stub_classes import StubEnv, StubMultiEnv
+
+from rlkit.core import logger
+from rlkit.testing import csv_util
+
+def main():
+    variant = dict(
+        algo_kwargs=dict(
+            num_epochs=501,
+            num_eval_steps_per_epoch=1000,
+            num_trains_per_train_loop=1000,
+            num_expl_steps_per_train_loop=1000,
+            min_num_steps_before_training=1000,
+            batch_size=1024,
+        ),
+        max_path_length=1000,
+        algorithm="AWAC",
+        replay_buffer_size=int(1E6),
+
+        policy_class=GaussianPolicy,
+        policy_kwargs=dict(
+            hidden_sizes=[256, 256, ],
+            max_log_std=0,
+            min_log_std=-6,
+            std_architecture="values",
+        ),
+        qf_kwargs=dict(
+            hidden_sizes=[256, 256, ],
+            output_activation=Clamp(max=0),  # rewards are <= 0
+        ),
+
+        version="normal",
+        collection_mode='batch',
+        trainer_kwargs=dict(
+            discount=0.99,
+            soft_target_tau=5e-3,
+            target_update_period=1,
+            policy_lr=3E-4,
+            qf_lr=3E-4,
+            reward_scale=1,
+            beta=1,
+            use_automatic_entropy_tuning=False,
+            alpha=0,
+            compute_bc=False,
+            awr_min_q=True,
+
+            bc_num_pretrain_steps=0,
+            q_num_pretrain1_steps=0,
+            q_num_pretrain2_steps=25000,
+            policy_weight_decay=1e-4,
+            q_weight_decay=0,
+
+            rl_weight=1.0,
+            use_awr_update=True,
+            use_reparam_update=False,
+            reparam_weight=0.0,
+            awr_weight=1.0,
+            bc_weight=0.0,
+
+            reward_transform_kwargs=None,
+            terminal_transform_kwargs=dict(m=0, b=0),
+        ),
+        launcher_config=dict(
+            num_exps_per_instance=1,
+            region='us-west-2',
+        ),
+
+        path_loader_class=DictToMDPPathLoader,
+        path_loader_kwargs=dict(
+            obs_key="state_observation",
+            demo_paths=[  # these can be loaded in awac_rl.py per env
+                # dict(
+                #     path="demos/icml2020/hand/pen_bc5.npy",
+                #     obs_dict=False,
+                #     is_demo=False,
+                #     train_split=0.9,
+                # ),
+            ],
+        ),
+        add_env_demos=True,
+        add_env_offpolicy_data=True,
+        normalize_env=False,
+
+        load_demos=True,
+        pretrain_policy=True,
+        pretrain_rl=True,
+    )
+
+    search_space = {
+        # 'env_id': ["pen-binary-v0", "door-binary-v0", "relocate-binary-v0", ],
+        'seedid': range(5),
+        'trainer_kwargs.beta': [0.5, ],
+        'trainer_kwargs.clip_score': [0.5, ],
+        'trainer_kwargs.awr_use_mle_for_vf': [True, ],
+
+        # make the experiment short and offline-only
+        'batch_size': [5],
+        'num_epochs': [0],
+        'pretraining_logging_period': [1],
+        'trainer_kwargs.q_num_pretrain2_steps': [10],
+        'path_loader_kwargs.demo_paths': [
+            [dict(
+                path=os.getcwd() + "/tests/regression/awac/hand/pen2_sparse.npy",
+                obs_dict=True,  # misleading, but this arg really means "unwrap_obs_dict"
+                is_demo=True,
+                data_split=1,
+            ), ],
+        ],
+        'env_class': [StubEnv],  # replaces pen
+        'env_kwargs': [dict(
+            obs_dim=45,
+            action_dim=24,
+        ), ],
+        'add_env_demos': [False],
+        'add_env_offpolicy_data': [False],
+    }
+
+    sweeper = hyp.DeterministicHyperparameterSweeper(
+        search_space, default_parameters=variant,
+    )
+
+    variants = []
+    for variant in sweeper.iterate_hyperparameters():
+        variants.append(variant)
+
+    run_variants(experiment, variants, process_args)
+
+def test_awac_hand_offline():
+    cmd = "python examples/awac/hand/awac_offline1.py --1 --local --gpu --run 0 --seed 0 --debug"
+    sys.argv = cmd.split(" ")[1:]
+    main()
+
+    # check that the offline pretraining results match
+    reference_csv = "tests/regression/awac/hand/id0_offline/pretrain_q.csv"
+    output_csv = os.path.join(logger.get_snapshot_dir(), "pretrain_q.csv")
+    print("comparing reference %s against output %s" % (reference_csv, output_csv))
+    output = csv_util.get_exp(output_csv)
+    reference = csv_util.get_exp(reference_csv)
+    keys = ["trainer/batch", "trainer/Advantage Score Max", "trainer/Q1 Predictions Mean", "trainer/replay_buffer_len"]
+    csv_util.check_equal(reference, output, keys)
+
+
+if __name__ == "__main__":
+    test_awac_hand_offline()
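The offline test above swaps the real pen environment for rlkit.testing.stub_classes.StubEnv (obs_dim=45, action_dim=24) so the regression can run without a mujoco/mj_envs install. StubEnv's implementation is not part of this diff; a minimal sketch of such a stub, assuming the pre-0.26 gym Env/Box interface used elsewhere in rlkit, might look like:

    # Hypothetical sketch; the real StubEnv lives in rlkit/testing/stub_classes.py
    # and is not shown in this diff.
    import numpy as np
    from gym import Env
    from gym.spaces import Box

    class StubEnvSketch(Env):
        """No-op env: zero observations, zero reward, never terminates."""

        def __init__(self, obs_dim=45, action_dim=24):
            self.observation_space = Box(-1.0, 1.0, (obs_dim,), dtype=np.float32)
            self.action_space = Box(-1.0, 1.0, (action_dim,), dtype=np.float32)

        def reset(self):
            return np.zeros(self.observation_space.shape, dtype=np.float32)

        def step(self, action):
            obs = np.zeros(self.observation_space.shape, dtype=np.float32)
            return obs, 0.0, False, {}

A zero-reward, never-terminating stub is consistent with the reference logs above, where the eval/Rewards and eval/Returns rows are all zero and every path runs to the same fixed length.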
diff --git a/tests/regression/awac/mujoco/__init__.py b/tests/regression/awac/mujoco/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/regression/awac/mujoco/hc_action_noise_15.npy b/tests/regression/awac/mujoco/hc_action_noise_15.npy
new file mode 100644
index 000000000..53641429c
Binary files /dev/null and b/tests/regression/awac/mujoco/hc_action_noise_15.npy differ
diff --git a/tests/regression/awac/mujoco/id0/debug.log b/tests/regression/awac/mujoco/id0/debug.log
new file mode 100644
index 000000000..c488d8843
--- /dev/null
+++ b/tests/regression/awac/mujoco/id0/debug.log
@@ -0,0 +1,7238 @@
+[id0] ------------------------------ ------------ +[id0] trainer/QF1 Loss 33.5189 +[id0] trainer/QF2 Loss 33.4663 +[id0] trainer/Policy Loss 9.77056 +[id0] trainer/Q1 Predictions Mean -0.00249345 +[id0] trainer/Q1 Predictions Std 0.00498761 +[id0] trainer/Q1 Predictions Max 0.0140494 +[id0] trainer/Q1 Predictions Min -0.016885 +[id0] trainer/Q2 Predictions Mean 0.000312903 +[id0] trainer/Q2 Predictions Std 0.00627891 +[id0] trainer/Q2 Predictions Max 0.0147451 +[id0] trainer/Q2 Predictions Min -0.0143751 +[id0] trainer/Q Targets Mean 3.37967 +[id0] trainer/Q Targets Std 4.69738 +[id0] trainer/Q Targets Max 12.2831 +[id0] trainer/Q Targets Min -1.67562 +[id0] trainer/Log Pis Mean 9.76541 +[id0] trainer/Log Pis Std 1.54756 +[id0] trainer/Log Pis Max 11.9642 +[id0] trainer/Log Pis Min 3.07995 +[id0] trainer/rewards Mean 3.37886 +[id0] trainer/rewards Std 4.69639 +[id0] trainer/rewards Max 12.2775 +[id0] trainer/rewards Min -1.67873 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/policy/mean Mean -2.27958e-05 +[id0] trainer/policy/mean Std 0.000390531 +[id0] trainer/policy/mean Max 0.00158385 +[id0] trainer/policy/mean Min -0.00106018 +[id0] trainer/policy/std Mean 0.0497871 +[id0] trainer/policy/std Std 3.72529e-09 +[id0] trainer/policy/std Max 0.0497871 +[id0] trainer/policy/std Min 0.0497871 +[id0] trainer/Advantage Weights Mean 0.0078125 +[id0] trainer/Advantage Weights Std 1.74008e-06 +[id0] trainer/Advantage Weights Max 0.00781774 +[id0] trainer/Advantage Weights Min 0.00780844 +[id0] trainer/Advantage Score Mean 5.04166e-05 +[id0] trainer/Advantage Score Std 0.000445457 +[id0] trainer/Advantage Score Max 0.00139067 +[id0] trainer/Advantage Score Min -0.000988897 +[id0] trainer/batch 0 +[id0] trainer/epoch_time 0.0219638 +[id0] ------------------------------ ------------ +[id0] ------------------------------ -------------- +[id0] 
trainer/QF1 Loss 1.10636 +[id0] trainer/QF2 Loss 1.17687 +[id0] trainer/Policy Loss -4.37908 +[id0] trainer/Q1 Predictions Mean 12.6011 +[id0] trainer/Q1 Predictions Std 18.5229 +[id0] trainer/Q1 Predictions Max 46.2724 +[id0] trainer/Q1 Predictions Min -2.81333 +[id0] trainer/Q2 Predictions Mean 12.6603 +[id0] trainer/Q2 Predictions Std 18.5387 +[id0] trainer/Q2 Predictions Max 46.2575 +[id0] trainer/Q2 Predictions Min -2.73023 +[id0] trainer/Q Targets Mean 12.5188 +[id0] trainer/Q Targets Std 18.42 +[id0] trainer/Q Targets Max 46.5481 +[id0] trainer/Q Targets Min -3.75254 +[id0] trainer/Log Pis Mean 8.16459 +[id0] trainer/Log Pis Std 1.50557 +[id0] trainer/Log Pis Max 10.6785 +[id0] trainer/Log Pis Min 2.8423 +[id0] trainer/rewards Mean 2.85022 +[id0] trainer/rewards Std 4.3826 +[id0] trainer/rewards Max 11.4352 +[id0] trainer/rewards Min -1.33225 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/policy/mean Mean 0.118643 +[id0] trainer/policy/mean Std 0.679188 +[id0] trainer/policy/mean Max 0.999853 +[id0] trainer/policy/mean Min -0.998398 +[id0] trainer/policy/std Mean 0.0621908 +[id0] trainer/policy/std Std 0.00241011 +[id0] trainer/policy/std Max 0.0652779 +[id0] trainer/policy/std Min 0.0586058 +[id0] trainer/Advantage Weights Mean 0.0078125 +[id0] trainer/Advantage Weights Std 0.000768811 +[id0] trainer/Advantage Weights Max 0.010311 +[id0] trainer/Advantage Weights Min 0.00564109 +[id0] trainer/Advantage Score Mean 0.00272017 +[id0] trainer/Advantage Score Std 0.194321 +[id0] trainer/Advantage Score Max 0.567186 +[id0] trainer/Advantage Score Min -0.639083 +[id0] trainer/batch 1000 +[id0] trainer/epoch_time 17.3256 +[id0] ------------------------------ -------------- +[id0] ------------------------------ ------------ +[id0] trainer/QF1 Loss 36.9449 +[id0] trainer/QF2 Loss 36.8878 +[id0] trainer/Policy Loss 9.65703 +[id0] trainer/Q1 Predictions Mean -0.00135076 +[id0] trainer/Q1 Predictions Std 0.00403503 +[id0] trainer/Q1 Predictions Max 0.0101064 +[id0] trainer/Q1 Predictions Min -0.0096822 +[id0] trainer/Q2 Predictions Mean 0.00284615 +[id0] trainer/Q2 Predictions Std 0.00358465 +[id0] trainer/Q2 Predictions Max 0.0116807 +[id0] trainer/Q2 Predictions Min -0.00417278 +[id0] trainer/Q Targets Mean 3.74862 +[id0] trainer/Q Targets Std 4.78119 +[id0] trainer/Q Targets Max 12.0778 +[id0] trainer/Q Targets Min -1.7004 +[id0] trainer/Log Pis Mean 9.65528 +[id0] trainer/Log Pis Std 1.57625 +[id0] trainer/Log Pis Max 11.9813 +[id0] trainer/Log Pis Min 5.00643 +[id0] trainer/rewards Mean 3.7576 +[id0] trainer/rewards Std 4.78427 +[id0] trainer/rewards Max 12.0879 +[id0] trainer/rewards Min -1.691 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/policy/mean Mean 5.72647e-05 +[id0] trainer/policy/mean Std 0.000304584 +[id0] trainer/policy/mean Max 0.00130345 +[id0] trainer/policy/mean Min -0.000712585 +[id0] trainer/policy/std Mean 0.0497871 +[id0] trainer/policy/std Std 3.72529e-09 +[id0] trainer/policy/std Max 0.0497871 +[id0] trainer/policy/std Min 0.0497871 +[id0] trainer/Advantage Weights Mean 0.0078125 +[id0] trainer/Advantage Weights Std 1.91135e-06 +[id0] trainer/Advantage Weights Max 0.00782011 +[id0] trainer/Advantage Weights Min 0.00780832 +[id0] trainer/Advantage Score Mean 1.51016e-05 +[id0] trainer/Advantage Score Std 0.000489254 +[id0] trainer/Advantage Score Max 0.00196306 +[id0] 
trainer/Advantage Score Min -0.00105644 +[id0] trainer/batch 0 +[id0] trainer/epoch_time 0.0218916 +[id0] ------------------------------ ------------ +[id0] ------------------------------ ------------ +[id0] trainer/QF1 Loss 39.1479 +[id0] trainer/QF2 Loss 39.1395 +[id0] trainer/Policy Loss 9.25703 +[id0] trainer/Q1 Predictions Mean 0.00113849 +[id0] trainer/Q1 Predictions Std 0.00296468 +[id0] trainer/Q1 Predictions Max 0.00909882 +[id0] trainer/Q1 Predictions Min -0.00882283 +[id0] trainer/Q2 Predictions Mean -0.000210945 +[id0] trainer/Q2 Predictions Std 0.00447923 +[id0] trainer/Q2 Predictions Max 0.0127143 +[id0] trainer/Q2 Predictions Min -0.00956595 +[id0] trainer/Q Targets Mean 3.91035 +[id0] trainer/Q Targets Std 4.8844 +[id0] trainer/Q Targets Max 11.671 +[id0] trainer/Q Targets Min -1.70705 +[id0] trainer/Log Pis Mean 9.25529 +[id0] trainer/Log Pis Std 1.57173 +[id0] trainer/Log Pis Max 11.8969 +[id0] trainer/Log Pis Min 4.98222 +[id0] trainer/rewards Mean 3.91579 +[id0] trainer/rewards Std 4.88588 +[id0] trainer/rewards Max 11.6849 +[id0] trainer/rewards Min -1.70557 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/policy/mean Mean -6.88463e-05 +[id0] trainer/policy/mean Std 0.00032387 +[id0] trainer/policy/mean Max 0.000914816 +[id0] trainer/policy/mean Min -0.00136786 +[id0] trainer/policy/std Mean 0.0497871 +[id0] trainer/policy/std Std 3.72529e-09 +[id0] trainer/policy/std Max 0.0497871 +[id0] trainer/policy/std Min 0.0497871 +[id0] trainer/Advantage Weights Mean 0.0078125 +[id0] trainer/Advantage Weights Std 2.03445e-06 +[id0] trainer/Advantage Weights Max 0.00781688 +[id0] trainer/Advantage Weights Min 0.00780573 +[id0] trainer/Advantage Score Mean -0.000257812 +[id0] trainer/Advantage Score Std 0.000520833 +[id0] trainer/Advantage Score Max 0.000864113 +[id0] trainer/Advantage Score Min -0.00199137 +[id0] trainer/batch 0 +[id0] trainer/epoch_time 0.0218341 +[id0] ------------------------------ ------------ +[id0] ----------------------------------- ------------- +[id0] trainer/num train calls 10 +[id0] trainer/QF1 Loss 39.6099 +[id0] trainer/QF2 Loss 39.5914 +[id0] trainer/Policy Loss 10.0304 +[id0] trainer/Q1 Predictions Mean 0.206157 +[id0] trainer/Q1 Predictions Std 0.0716584 +[id0] trainer/Q1 Predictions Max 0.327408 +[id0] trainer/Q1 Predictions Min 0.120331 +[id0] trainer/Q2 Predictions Mean 0.205103 +[id0] trainer/Q2 Predictions Std 0.0814219 +[id0] trainer/Q2 Predictions Max 0.330909 +[id0] trainer/Q2 Predictions Min 0.110514 +[id0] trainer/Q Targets Mean 4.89164 +[id0] trainer/Q Targets Std 4.24767 +[id0] trainer/Q Targets Max 10.8606 +[id0] trainer/Q Targets Min -0.134648 +[id0] trainer/Log Pis Mean 10.2293 +[id0] trainer/Log Pis Std 0.974036 +[id0] trainer/Log Pis Max 11.1982 +[id0] trainer/Log Pis Min 8.52036 +[id0] trainer/rewards Mean 4.89314 +[id0] trainer/rewards Std 4.24648 +[id0] trainer/rewards Max 10.868 +[id0] trainer/rewards Min -0.128855 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/policy/mean Mean 0.0157861 +[id0] trainer/policy/mean Std 0.0374136 +[id0] trainer/policy/mean Max 0.0695026 +[id0] trainer/policy/mean Min -0.0565063 +[id0] trainer/policy/std Mean 0.0500107 +[id0] trainer/policy/std Std 7.34906e-07 +[id0] trainer/policy/std Max 0.0500114 +[id0] trainer/policy/std Min 0.0500094 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights 
Std 0.000155717 +[id0] trainer/Advantage Weights Max 0.200201 +[id0] trainer/Advantage Weights Min 0.199822 +[id0] trainer/Advantage Score Mean -8.04335e-05 +[id0] trainer/Advantage Score Std 0.00155706 +[id0] trainer/Advantage Score Max 0.00192887 +[id0] trainer/Advantage Score Min -0.0018598 +[id0] expl/num steps total 110 +[id0] expl/num paths total 2 +[id0] expl/path length Mean 100 +[id0] expl/path length Std 0 +[id0] expl/path length Max 100 +[id0] expl/path length Min 100 +[id0] expl/Rewards Mean -0.00891122 +[id0] expl/Rewards Std 0.0732151 +[id0] expl/Rewards Max 0.168586 +[id0] expl/Rewards Min -0.176729 +[id0] expl/Returns Mean -0.891122 +[id0] expl/Returns Std 0 +[id0] expl/Returns Max -0.891122 +[id0] expl/Returns Min -0.891122 +[id0] expl/Actions Mean 0.00418862 +[id0] expl/Actions Std 0.0488052 +[id0] expl/Actions Max 0.144731 +[id0] expl/Actions Min -0.142584 +[id0] expl/Num Paths 1 +[id0] expl/Average Returns -0.891122 +[id0] eval/num steps total 0 +[id0] eval/num paths total 0 +[id0] time/epoch_time (s) 0.345869 +[id0] time/evaluation sampling (s) 0.0869439 +[id0] time/exploration sampling (s) 0.0841613 +[id0] time/global_time (s) 6.08635 +[id0] time/replay buffer data storing (s) 0.000600338 +[id0] time/saving (s) 0.0135102 +[id0] time/training (s) 0.148214 +[id0] epoch 0 +[id0] ----------------------------------- ------------- +[id0] ----------------------------------- ------------- +[id0] trainer/num train calls 20 +[id0] trainer/QF1 Loss 0.607627 +[id0] trainer/QF2 Loss 0.610418 +[id0] trainer/Policy Loss 9.25632 +[id0] trainer/Q1 Predictions Mean 0.521341 +[id0] trainer/Q1 Predictions Std 0.113618 +[id0] trainer/Q1 Predictions Max 0.658644 +[id0] trainer/Q1 Predictions Min 0.376657 +[id0] trainer/Q2 Predictions Mean 0.512161 +[id0] trainer/Q2 Predictions Std 0.105168 +[id0] trainer/Q2 Predictions Max 0.657816 +[id0] trainer/Q2 Predictions Min 0.381929 +[id0] trainer/Q Targets Mean -0.0692358 +[id0] trainer/Q Targets Std 0.586632 +[id0] trainer/Q Targets Max 0.739792 +[id0] trainer/Q Targets Min -0.949559 +[id0] trainer/Log Pis Mean 9.76232 +[id0] trainer/Log Pis Std 1.82962 +[id0] trainer/Log Pis Max 11.701 +[id0] trainer/Log Pis Min 6.32611 +[id0] trainer/rewards Mean -0.0719136 +[id0] trainer/rewards Std 0.588887 +[id0] trainer/rewards Max 0.738594 +[id0] trainer/rewards Min -0.955556 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/policy/mean Mean 0.0887968 +[id0] trainer/policy/mean Std 0.253995 +[id0] trainer/policy/mean Max 0.425883 +[id0] trainer/policy/mean Min -0.353937 +[id0] trainer/policy/std Mean 0.0502249 +[id0] trainer/policy/std Std 5.31756e-06 +[id0] trainer/policy/std Max 0.0502326 +[id0] trainer/policy/std Min 0.0502159 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.00050095 +[id0] trainer/Advantage Weights Max 0.200636 +[id0] trainer/Advantage Weights Min 0.19919 +[id0] trainer/Advantage Score Mean 0.00305707 +[id0] trainer/Advantage Score Std 0.00501217 +[id0] trainer/Advantage Score Max 0.00940925 +[id0] trainer/Advantage Score Min -0.00505379 +[id0] expl/num steps total 210 +[id0] expl/num paths total 3 +[id0] expl/path length Mean 100 +[id0] expl/path length Std 0 +[id0] expl/path length Max 100 +[id0] expl/path length Min 100 +[id0] expl/Rewards Mean -0.00792291 +[id0] expl/Rewards Std 0.063436 +[id0] expl/Rewards Max 0.120925 +[id0] expl/Rewards Min -0.158852 +[id0] expl/Returns Mean -0.792291 +[id0] expl/Returns Std 
0 +[id0] expl/Returns Max -0.792291 +[id0] expl/Returns Min -0.792291 +[id0] expl/Actions Mean 0.00748742 +[id0] expl/Actions Std 0.0589161 +[id0] expl/Actions Max 0.170901 +[id0] expl/Actions Min -0.165447 +[id0] expl/Num Paths 1 +[id0] expl/Average Returns -0.792291 +[id0] eval/num steps total 0 +[id0] eval/num paths total 0 +[id0] time/epoch_time (s) 0.283542 +[id0] time/evaluation sampling (s) 0.082824 +[id0] time/exploration sampling (s) 0.0818107 +[id0] time/global_time (s) 6.37261 +[id0] time/replay buffer data storing (s) 0.000624418 +[id0] time/saving (s) 1.50204e-05 +[id0] time/training (s) 0.117351 +[id0] epoch 1 +[id0] ----------------------------------- ------------- +[id0] ----------------------------------- ------------- +[id0] trainer/num train calls 30 +[id0] trainer/QF1 Loss 37.3653 +[id0] trainer/QF2 Loss 37.7417 +[id0] trainer/Policy Loss 8.93955 +[id0] trainer/Q1 Predictions Mean 0.879653 +[id0] trainer/Q1 Predictions Std 0.37879 +[id0] trainer/Q1 Predictions Max 1.42376 +[id0] trainer/Q1 Predictions Min 0.430082 +[id0] trainer/Q2 Predictions Mean 0.84467 +[id0] trainer/Q2 Predictions Std 0.395051 +[id0] trainer/Q2 Predictions Max 1.50713 +[id0] trainer/Q2 Predictions Min 0.403993 +[id0] trainer/Q Targets Mean 4.11413 +[id0] trainer/Q Targets Std 5.5447 +[id0] trainer/Q Targets Max 10.9152 +[id0] trainer/Q Targets Min -0.916915 +[id0] trainer/Log Pis Mean 9.75643 +[id0] trainer/Log Pis Std 0.86159 +[id0] trainer/Log Pis Max 11.0454 +[id0] trainer/Log Pis Min 8.76103 +[id0] trainer/rewards Mean 4.10483 +[id0] trainer/rewards Std 5.54617 +[id0] trainer/rewards Max 10.9107 +[id0] trainer/rewards Min -0.934062 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/policy/mean Mean 0.104748 +[id0] trainer/policy/mean Std 0.374738 +[id0] trainer/policy/mean Max 0.820446 +[id0] trainer/policy/mean Min -0.801323 +[id0] trainer/policy/std Mean 0.0504248 +[id0] trainer/policy/std Std 1.74858e-05 +[id0] trainer/policy/std Max 0.0504474 +[id0] trainer/policy/std Min 0.0503928 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.0011713 +[id0] trainer/Advantage Weights Max 0.201475 +[id0] trainer/Advantage Weights Min 0.198515 +[id0] trainer/Advantage Score Mean 0.0078394 +[id0] trainer/Advantage Score Std 0.0117173 +[id0] trainer/Advantage Score Max 0.0225664 +[id0] trainer/Advantage Score Min -0.00702929 +[id0] expl/num steps total 310 +[id0] expl/num paths total 4 +[id0] expl/path length Mean 100 +[id0] expl/path length Std 0 +[id0] expl/path length Max 100 +[id0] expl/path length Min 100 +[id0] expl/Rewards Mean -0.00972317 +[id0] expl/Rewards Std 0.0867267 +[id0] expl/Rewards Max 0.167552 +[id0] expl/Rewards Min -0.175151 +[id0] expl/Returns Mean -0.972317 +[id0] expl/Returns Std 0 +[id0] expl/Returns Max -0.972317 +[id0] expl/Returns Min -0.972317 +[id0] expl/Actions Mean 0.0173206 +[id0] expl/Actions Std 0.0769402 +[id0] expl/Actions Max 0.237627 +[id0] expl/Actions Min -0.208343 +[id0] expl/Num Paths 1 +[id0] expl/Average Returns -0.972317 +[id0] eval/num steps total 0 +[id0] eval/num paths total 0 +[id0] time/epoch_time (s) 0.315411 +[id0] time/evaluation sampling (s) 0.0791221 +[id0] time/exploration sampling (s) 0.0823536 +[id0] time/global_time (s) 6.69066 +[id0] time/replay buffer data storing (s) 0.000601292 +[id0] time/saving (s) 1.54972e-05 +[id0] time/training (s) 0.152391 +[id0] epoch 2 +[id0] ----------------------------------- ------------- +[id0] 
----------------------------------- ------------- +[id0] trainer/num train calls 40 +[id0] trainer/QF1 Loss 17.2463 +[id0] trainer/QF2 Loss 16.0504 +[id0] trainer/Policy Loss 8.59143 +[id0] trainer/Q1 Predictions Mean 1.90116 +[id0] trainer/Q1 Predictions Std 0.58867 +[id0] trainer/Q1 Predictions Max 2.88221 +[id0] trainer/Q1 Predictions Min 1.1464 +[id0] trainer/Q2 Predictions Mean 1.95097 +[id0] trainer/Q2 Predictions Std 0.665129 +[id0] trainer/Q2 Predictions Max 3.0759 +[id0] trainer/Q2 Predictions Min 1.14595 +[id0] trainer/Q Targets Mean 4.19348 +[id0] trainer/Q Targets Std 3.83143 +[id0] trainer/Q Targets Max 8.9047 +[id0] trainer/Q Targets Min 0.887791 +[id0] trainer/Log Pis Mean 10.4459 +[id0] trainer/Log Pis Std 0.92806 +[id0] trainer/Log Pis Max 11.199 +[id0] trainer/Log Pis Min 8.73429 +[id0] trainer/rewards Mean 4.17041 +[id0] trainer/rewards Std 3.82532 +[id0] trainer/rewards Max 8.86589 +[id0] trainer/rewards Min 0.869449 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/policy/mean Mean 0.116528 +[id0] trainer/policy/mean Std 0.529735 +[id0] trainer/policy/mean Max 0.813348 +[id0] trainer/policy/mean Min -0.859628 +[id0] trainer/policy/std Mean 0.0506099 +[id0] trainer/policy/std Std 2.64224e-05 +[id0] trainer/policy/std Max 0.0506531 +[id0] trainer/policy/std Min 0.0505769 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.00208328 +[id0] trainer/Advantage Weights Max 0.20169 +[id0] trainer/Advantage Weights Min 0.195922 +[id0] trainer/Advantage Score Mean 0.0077806 +[id0] trainer/Advantage Score Std 0.0209809 +[id0] trainer/Advantage Score Max 0.0247157 +[id0] trainer/Advantage Score Min -0.0333155 +[id0] expl/num steps total 410 +[id0] expl/num paths total 5 +[id0] expl/path length Mean 100 +[id0] expl/path length Std 0 +[id0] expl/path length Max 100 +[id0] expl/path length Min 100 +[id0] expl/Rewards Mean -0.0240241 +[id0] expl/Rewards Std 0.115636 +[id0] expl/Rewards Max 0.254035 +[id0] expl/Rewards Min -0.428281 +[id0] expl/Returns Mean -2.40241 +[id0] expl/Returns Std 0 +[id0] expl/Returns Max -2.40241 +[id0] expl/Returns Min -2.40241 +[id0] expl/Actions Mean 0.0146732 +[id0] expl/Actions Std 0.0803057 +[id0] expl/Actions Max 0.231781 +[id0] expl/Actions Min -0.26722 +[id0] expl/Num Paths 1 +[id0] expl/Average Returns -2.40241 +[id0] eval/num steps total 0 +[id0] eval/num paths total 0 +[id0] time/epoch_time (s) 0.322704 +[id0] time/evaluation sampling (s) 0.0862281 +[id0] time/exploration sampling (s) 0.08301 +[id0] time/global_time (s) 7.01616 +[id0] time/replay buffer data storing (s) 0.000590801 +[id0] time/saving (s) 1.81198e-05 +[id0] time/training (s) 0.151856 +[id0] epoch 3 +[id0] ----------------------------------- ------------- +[id0] ----------------------------------- ------------- +[id0] trainer/num train calls 50 +[id0] trainer/QF1 Loss 14.4287 +[id0] trainer/QF2 Loss 13.7903 +[id0] trainer/Policy Loss 8.34756 +[id0] trainer/Q1 Predictions Mean 2.14834 +[id0] trainer/Q1 Predictions Std 0.992247 +[id0] trainer/Q1 Predictions Max 3.83773 +[id0] trainer/Q1 Predictions Min 1.04134 +[id0] trainer/Q2 Predictions Mean 2.23217 +[id0] trainer/Q2 Predictions Std 1.10552 +[id0] trainer/Q2 Predictions Max 4.13452 +[id0] trainer/Q2 Predictions Min 1.01635 +[id0] trainer/Q Targets Mean 2.60241 +[id0] trainer/Q Targets Std 4.57101 +[id0] trainer/Q Targets Max 11.3185 +[id0] trainer/Q Targets Min -0.978749 +[id0] trainer/Log Pis Mean 10.4799 +[id0] 
trainer/Log Pis Std 0.422812 +[id0] trainer/Log Pis Max 11.2896 +[id0] trainer/Log Pis Min 10.0659 +[id0] trainer/rewards Mean 2.56521 +[id0] trainer/rewards Std 4.57024 +[id0] trainer/rewards Max 11.285 +[id0] trainer/rewards Min -1.00007 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/policy/mean Mean 0.0797901 +[id0] trainer/policy/mean Std 0.496851 +[id0] trainer/policy/mean Max 0.757134 +[id0] trainer/policy/mean Min -0.853617 +[id0] trainer/policy/std Mean 0.0507786 +[id0] trainer/policy/std Std 3.24643e-05 +[id0] trainer/policy/std Max 0.0508337 +[id0] trainer/policy/std Min 0.0507399 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.000770751 +[id0] trainer/Advantage Weights Max 0.201244 +[id0] trainer/Advantage Weights Min 0.198896 +[id0] trainer/Advantage Score Mean -0.0045176 +[id0] trainer/Advantage Score Std 0.00770382 +[id0] trainer/Advantage Score Max 0.00789416 +[id0] trainer/Advantage Score Min -0.0155728 +[id0] expl/num steps total 510 +[id0] expl/num paths total 6 +[id0] expl/path length Mean 100 +[id0] expl/path length Std 0 +[id0] expl/path length Max 100 +[id0] expl/path length Min 100 +[id0] expl/Rewards Mean -0.0193309 +[id0] expl/Rewards Std 0.103352 +[id0] expl/Rewards Max 0.257825 +[id0] expl/Rewards Min -0.207475 +[id0] expl/Returns Mean -1.93309 +[id0] expl/Returns Std 0 +[id0] expl/Returns Max -1.93309 +[id0] expl/Returns Min -1.93309 +[id0] expl/Actions Mean 0.0186586 +[id0] expl/Actions Std 0.0829721 +[id0] expl/Actions Max 0.289438 +[id0] expl/Actions Min -0.279665 +[id0] expl/Num Paths 1 +[id0] expl/Average Returns -1.93309 +[id0] eval/num steps total 0 +[id0] eval/num paths total 0 +[id0] time/epoch_time (s) 0.322238 +[id0] time/evaluation sampling (s) 0.0854766 +[id0] time/exploration sampling (s) 0.0825238 +[id0] time/global_time (s) 7.34121 +[id0] time/replay buffer data storing (s) 0.000594854 +[id0] time/saving (s) 1.74046e-05 +[id0] time/training (s) 0.152554 +[id0] epoch 4 +[id0] ----------------------------------- ------------- +[id0] ------------------------------ --------------- +[id0] trainer/QF1 Loss 0.0267284 +[id0] trainer/QF2 Loss 0.0267345 +[id0] trainer/Policy Loss 3.15012 +[id0] trainer/Q1 Predictions Mean -0.000483587 +[id0] trainer/Q1 Predictions Std 0.000263562 +[id0] trainer/Q1 Predictions Max -2.54349e-05 +[id0] trainer/Q1 Predictions Min -0.00114263 +[id0] trainer/Q2 Predictions Mean -0.000438495 +[id0] trainer/Q2 Predictions Std 0.000220916 +[id0] trainer/Q2 Predictions Max 4.79189e-05 +[id0] trainer/Q2 Predictions Min -0.000895643 +[id0] trainer/Q Targets Mean -0.124641 +[id0] trainer/Q Targets Std 0.106343 +[id0] trainer/Q Targets Max 0.000171593 +[id0] trainer/Q Targets Min -0.380185 +[id0] trainer/Log Pis Mean 3.14974 +[id0] trainer/Log Pis Std 1.1 +[id0] trainer/Log Pis Max 4.15625 +[id0] trainer/Log Pis Min -1.85168 +[id0] trainer/rewards Mean -0.124673 +[id0] trainer/rewards Std 0.106319 +[id0] trainer/rewards Max 0 +[id0] trainer/rewards Min -0.380129 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 90000 +[id0] trainer/policy/mean Mean -3.94223e-05 +[id0] trainer/policy/mean Std 5.48776e-06 +[id0] trainer/policy/mean Max -2.87341e-05 +[id0] trainer/policy/mean Min -5.66508e-05 +[id0] trainer/policy/std Mean 0.0497949 +[id0] trainer/policy/std Std 7.06956e-06 +[id0] trainer/policy/std Max 
0.049803 +[id0] trainer/policy/std Min 0.0497869 +[id0] trainer/Advantage Weights Mean 0.0078125 +[id0] trainer/Advantage Weights Std 0.00198469 +[id0] trainer/Advantage Weights Max 0.0130013 +[id0] trainer/Advantage Weights Min 0.00423533 +[id0] trainer/Advantage Score Mean -0.000101316 +[id0] trainer/Advantage Score Std 0.000263755 +[id0] trainer/Advantage Score Max 0.000441706 +[id0] trainer/Advantage Score Min -0.000679884 +[id0] trainer/bc/Train Logprob Loss 254.926 +[id0] trainer/bc/Test Logprob Loss 270.137 +[id0] trainer/bc/Train MSE 0.642499 +[id0] trainer/bc/Test MSE 0.689321 +[id0] trainer/bc/train_policy_loss 254.926 +[id0] trainer/bc/test_policy_loss 270.137 +[id0] trainer/batch 0 +[id0] trainer/epoch_time 0.0311177 +[id0] ------------------------------ --------------- +[id0] --------------------------------------- --------------- +[id0] trainer/num train calls 10 +[id0] trainer/QF1 Loss 0.0250762 +[id0] trainer/QF2 Loss 0.0254459 +[id0] trainer/Policy Loss 3.42755 +[id0] trainer/Q1 Predictions Mean -0.02271 +[id0] trainer/Q1 Predictions Std 0.0013751 +[id0] trainer/Q1 Predictions Max -0.020538 +[id0] trainer/Q1 Predictions Min -0.0247885 +[id0] trainer/Q2 Predictions Mean -0.0215133 +[id0] trainer/Q2 Predictions Std 0.00105855 +[id0] trainer/Q2 Predictions Max -0.0195664 +[id0] trainer/Q2 Predictions Min -0.0226589 +[id0] trainer/Q Targets Mean -0.136798 +[id0] trainer/Q Targets Std 0.110915 +[id0] trainer/Q Targets Max -0.00019019 +[id0] trainer/Q Targets Min -0.317915 +[id0] trainer/Log Pis Mean 3.40936 +[id0] trainer/Log Pis Std 0.616096 +[id0] trainer/Log Pis Max 3.98308 +[id0] trainer/Log Pis Min 2.43023 +[id0] trainer/rewards Mean -0.136573 +[id0] trainer/rewards Std 0.110873 +[id0] trainer/rewards Max -1.48851e-06 +[id0] trainer/rewards Min -0.317618 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 90200 +[id0] trainer/policy/mean Mean -0.000476395 +[id0] trainer/policy/mean Std 0.0094533 +[id0] trainer/policy/mean Max 0.00918785 +[id0] trainer/policy/mean Min -0.0101835 +[id0] trainer/policy/std Mean 0.0506216 +[id0] trainer/policy/std Std 1.96462e-05 +[id0] trainer/policy/std Max 0.0506586 +[id0] trainer/policy/std Min 0.0505911 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.286094 +[id0] trainer/Advantage Weights Max 0.768934 +[id0] trainer/Advantage Weights Min 0.0102757 +[id0] trainer/Advantage Score Mean -0.0045227 +[id0] trainer/Advantage Score Std 0.00139399 +[id0] trainer/Advantage Score Max -0.00221948 +[id0] trainer/Advantage Score Min -0.00653471 +[id0] trainer/bc/Train Logprob Loss 245.501 +[id0] trainer/bc/Test Logprob Loss 246.347 +[id0] trainer/bc/Train MSE 0.643397 +[id0] trainer/bc/Test MSE 0.648696 +[id0] trainer/bc/train_policy_loss 245.501 +[id0] trainer/bc/test_policy_loss 246.347 +[id0] expl/num steps total 200 +[id0] expl/num paths total 4 +[id0] expl/path length Mean 50 +[id0] expl/path length Std 0 +[id0] expl/path length Max 50 +[id0] expl/path length Min 50 +[id0] expl/Rewards Mean -0.197639 +[id0] expl/Rewards Std 0.011234 +[id0] expl/Rewards Max -0.186405 +[id0] expl/Rewards Min -0.208873 +[id0] expl/Returns Mean -9.88193 +[id0] expl/Returns Std 0.561699 +[id0] expl/Returns Max -9.32024 +[id0] expl/Returns Min -10.4436 +[id0] expl/Actions Mean -0.000557166 +[id0] expl/Actions Std 0.0502919 +[id0] expl/Actions Max 0.146695 +[id0] expl/Actions Min -0.165994 +[id0] expl/Num Paths 2 +[id0] expl/Average 
+[id0] [reference AWAC training log, epochs 0-3 on the Sawyer hand/puck pushing task: each epoch emits one two-column "key  value" table covering trainer/ metrics (QF1/QF2/Policy losses, Q predictions and targets, Log Pis, advantage weights and scores, behavior-cloning losses, replay buffer length), expl/ and eval/ rollout statistics (rewards, returns, actions, hand/puck/state/touch distances and the corresponding success rates), and time/ profiling, closed by the epoch index; the tail of the epoch 0 table is shown below as a format sample]
+[id0] time/epoch_time (s)                      4.97374
+[id0] time/evaluation sampling (s)             1.5735
+[id0] time/exploration sampling (s)            1.57095
+[id0] time/global_time (s)                     21.6473
+[id0] time/replay buffer data storing (s)      0.000350714
+[id0] time/saving (s)                          0.0271418
+[id0] time/training (s)                        0.201885
+[id0] epoch                                    0
+[id0] ---------------------------------------  ---------------
eval/Final hand_success Std 0 +[id0] eval/Final hand_success Max 0 +[id0] eval/Final hand_success Min 0 +[id0] eval/hand_success_2 Mean 0.31 +[id0] eval/hand_success_2 Std 0.462493 +[id0] eval/hand_success_2 Max 1 +[id0] eval/hand_success_2 Min 0 +[id0] eval/Final hand_success_2 Mean 0.5 +[id0] eval/Final hand_success_2 Std 0.5 +[id0] eval/Final hand_success_2 Max 1 +[id0] eval/Final hand_success_2 Min 0 +[id0] eval/hand_success_3 Mean 0.8 +[id0] eval/hand_success_3 Std 0.4 +[id0] eval/hand_success_3 Max 1 +[id0] eval/hand_success_3 Min 0 +[id0] eval/Final hand_success_3 Mean 1 +[id0] eval/Final hand_success_3 Std 0 +[id0] eval/Final hand_success_3 Max 1 +[id0] eval/Final hand_success_3 Min 1 +[id0] eval/puck_success Mean 0 +[id0] eval/puck_success Std 0 +[id0] eval/puck_success Max 0 +[id0] eval/puck_success Min 0 +[id0] eval/Final puck_success Mean 0 +[id0] eval/Final puck_success Std 0 +[id0] eval/Final puck_success Max 0 +[id0] eval/Final puck_success Min 0 +[id0] eval/puck_success_2 Mean 0 +[id0] eval/puck_success_2 Std 0 +[id0] eval/puck_success_2 Max 0 +[id0] eval/puck_success_2 Min 0 +[id0] eval/Final puck_success_2 Mean 0 +[id0] eval/Final puck_success_2 Std 0 +[id0] eval/Final puck_success_2 Max 0 +[id0] eval/Final puck_success_2 Min 0 +[id0] eval/puck_success_3 Mean 0 +[id0] eval/puck_success_3 Std 0 +[id0] eval/puck_success_3 Max 0 +[id0] eval/puck_success_3 Min 0 +[id0] eval/Final puck_success_3 Mean 0 +[id0] eval/Final puck_success_3 Std 0 +[id0] eval/Final puck_success_3 Max 0 +[id0] eval/Final puck_success_3 Min 0 +[id0] eval/hand_and_puck_success Mean 0 +[id0] eval/hand_and_puck_success Std 0 +[id0] eval/hand_and_puck_success Max 0 +[id0] eval/hand_and_puck_success Min 0 +[id0] eval/Final hand_and_puck_success Mean 0 +[id0] eval/Final hand_and_puck_success Std 0 +[id0] eval/Final hand_and_puck_success Max 0 +[id0] eval/Final hand_and_puck_success Min 0 +[id0] eval/hand_and_puck_success_2 Mean 0 +[id0] eval/hand_and_puck_success_2 Std 0 +[id0] eval/hand_and_puck_success_2 Max 0 +[id0] eval/hand_and_puck_success_2 Min 0 +[id0] eval/Final hand_and_puck_success_2 Mean 0 +[id0] eval/Final hand_and_puck_success_2 Std 0 +[id0] eval/Final hand_and_puck_success_2 Max 0 +[id0] eval/Final hand_and_puck_success_2 Min 0 +[id0] eval/hand_and_puck_success_3 Mean 0 +[id0] eval/hand_and_puck_success_3 Std 0 +[id0] eval/hand_and_puck_success_3 Max 0 +[id0] eval/hand_and_puck_success_3 Min 0 +[id0] eval/Final hand_and_puck_success_3 Mean 0 +[id0] eval/Final hand_and_puck_success_3 Std 0 +[id0] eval/Final hand_and_puck_success_3 Max 0 +[id0] eval/Final hand_and_puck_success_3 Min 0 +[id0] eval/state_success Mean 0 +[id0] eval/state_success Std 0 +[id0] eval/state_success Max 0 +[id0] eval/state_success Min 0 +[id0] eval/Final state_success Mean 0 +[id0] eval/Final state_success Std 0 +[id0] eval/Final state_success Max 0 +[id0] eval/Final state_success Min 0 +[id0] eval/state_success_2 Mean 0 +[id0] eval/state_success_2 Std 0 +[id0] eval/state_success_2 Max 0 +[id0] eval/state_success_2 Min 0 +[id0] eval/Final state_success_2 Mean 0 +[id0] eval/Final state_success_2 Std 0 +[id0] eval/Final state_success_2 Max 0 +[id0] eval/Final state_success_2 Min 0 +[id0] eval/state_success_3 Mean 0 +[id0] eval/state_success_3 Std 0 +[id0] eval/state_success_3 Max 0 +[id0] eval/state_success_3 Min 0 +[id0] eval/Final state_success_3 Mean 0 +[id0] eval/Final state_success_3 Std 0 +[id0] eval/Final state_success_3 Max 0 +[id0] eval/Final state_success_3 Min 0 +[id0] eval/touch_success Mean 0 +[id0] 
eval/touch_success Std 0 +[id0] eval/touch_success Max 0 +[id0] eval/touch_success Min 0 +[id0] eval/Final touch_success Mean 0 +[id0] eval/Final touch_success Std 0 +[id0] eval/Final touch_success Max 0 +[id0] eval/Final touch_success Min 0 +[id0] eval/touch_success_2 Mean 0.36 +[id0] eval/touch_success_2 Std 0.48 +[id0] eval/touch_success_2 Max 1 +[id0] eval/touch_success_2 Min 0 +[id0] eval/Final touch_success_2 Mean 0.5 +[id0] eval/Final touch_success_2 Std 0.5 +[id0] eval/Final touch_success_2 Max 1 +[id0] eval/Final touch_success_2 Min 0 +[id0] eval/touch_success_3 Mean 0.4 +[id0] eval/touch_success_3 Std 0.489898 +[id0] eval/touch_success_3 Max 1 +[id0] eval/touch_success_3 Min 0 +[id0] eval/Final touch_success_3 Mean 0.5 +[id0] eval/Final touch_success_3 Std 0.5 +[id0] eval/Final touch_success_3 Max 1 +[id0] eval/Final touch_success_3 Min 0 +[id0] time/epoch_time (s) 3.35237 +[id0] time/evaluation sampling (s) 1.55379 +[id0] time/exploration sampling (s) 1.58229 +[id0] time/global_time (s) 35.1789 +[id0] time/replay buffer data storing (s) 0.000374079 +[id0] time/saving (s) 2.71797e-05 +[id0] time/training (s) 0.204778 +[id0] epoch 4 +[id0] --------------------------------------- --------------- +[id0] ------------------------------ --------------- +[id0] trainer/QF1 Loss 0.0267284 +[id0] trainer/QF2 Loss 0.0267345 +[id0] trainer/Policy Loss 3.15012 +[id0] trainer/Q1 Predictions Mean -0.000483587 +[id0] trainer/Q1 Predictions Std 0.000263562 +[id0] trainer/Q1 Predictions Max -2.54349e-05 +[id0] trainer/Q1 Predictions Min -0.00114263 +[id0] trainer/Q2 Predictions Mean -0.000438495 +[id0] trainer/Q2 Predictions Std 0.000220916 +[id0] trainer/Q2 Predictions Max 4.79189e-05 +[id0] trainer/Q2 Predictions Min -0.000895643 +[id0] trainer/Q Targets Mean -0.124641 +[id0] trainer/Q Targets Std 0.106343 +[id0] trainer/Q Targets Max 0.000171593 +[id0] trainer/Q Targets Min -0.380185 +[id0] trainer/Log Pis Mean 3.14974 +[id0] trainer/Log Pis Std 1.1 +[id0] trainer/Log Pis Max 4.15625 +[id0] trainer/Log Pis Min -1.85168 +[id0] trainer/rewards Mean -0.124673 +[id0] trainer/rewards Std 0.106319 +[id0] trainer/rewards Max 0 +[id0] trainer/rewards Min -0.380129 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 90000 +[id0] trainer/policy/mean Mean -3.94223e-05 +[id0] trainer/policy/mean Std 5.48776e-06 +[id0] trainer/policy/mean Max -2.87341e-05 +[id0] trainer/policy/mean Min -5.66508e-05 +[id0] trainer/policy/std Mean 0.0497949 +[id0] trainer/policy/std Std 7.06956e-06 +[id0] trainer/policy/std Max 0.049803 +[id0] trainer/policy/std Min 0.0497869 +[id0] trainer/Advantage Weights Mean 0.0078125 +[id0] trainer/Advantage Weights Std 0.00198469 +[id0] trainer/Advantage Weights Max 0.0130013 +[id0] trainer/Advantage Weights Min 0.00423533 +[id0] trainer/Advantage Score Mean -0.000101316 +[id0] trainer/Advantage Score Std 0.000263755 +[id0] trainer/Advantage Score Max 0.000441706 +[id0] trainer/Advantage Score Min -0.000679884 +[id0] trainer/bc/Train Logprob Loss 254.926 +[id0] trainer/bc/Test Logprob Loss 270.137 +[id0] trainer/bc/Train MSE 0.642499 +[id0] trainer/bc/Test MSE 0.689321 +[id0] trainer/bc/train_policy_loss 254.926 +[id0] trainer/bc/test_policy_loss 270.137 +[id0] trainer/batch 0 +[id0] trainer/epoch_time 0.0321772 +[id0] ------------------------------ --------------- +[id0] --------------------------------------- --------------- +[id0] trainer/num train calls 10 +[id0] trainer/QF1 
Loss 0.0250762 +[id0] trainer/QF2 Loss 0.0254459 +[id0] trainer/Policy Loss 3.42755 +[id0] trainer/Q1 Predictions Mean -0.02271 +[id0] trainer/Q1 Predictions Std 0.0013751 +[id0] trainer/Q1 Predictions Max -0.020538 +[id0] trainer/Q1 Predictions Min -0.0247885 +[id0] trainer/Q2 Predictions Mean -0.0215133 +[id0] trainer/Q2 Predictions Std 0.00105855 +[id0] trainer/Q2 Predictions Max -0.0195664 +[id0] trainer/Q2 Predictions Min -0.0226589 +[id0] trainer/Q Targets Mean -0.136798 +[id0] trainer/Q Targets Std 0.110915 +[id0] trainer/Q Targets Max -0.00019019 +[id0] trainer/Q Targets Min -0.317915 +[id0] trainer/Log Pis Mean 3.40936 +[id0] trainer/Log Pis Std 0.616096 +[id0] trainer/Log Pis Max 3.98308 +[id0] trainer/Log Pis Min 2.43023 +[id0] trainer/rewards Mean -0.136573 +[id0] trainer/rewards Std 0.110873 +[id0] trainer/rewards Max -1.48851e-06 +[id0] trainer/rewards Min -0.317618 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 90200 +[id0] trainer/policy/mean Mean -0.000476395 +[id0] trainer/policy/mean Std 0.0094533 +[id0] trainer/policy/mean Max 0.00918785 +[id0] trainer/policy/mean Min -0.0101835 +[id0] trainer/policy/std Mean 0.0506216 +[id0] trainer/policy/std Std 1.96462e-05 +[id0] trainer/policy/std Max 0.0506586 +[id0] trainer/policy/std Min 0.0505911 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.286094 +[id0] trainer/Advantage Weights Max 0.768934 +[id0] trainer/Advantage Weights Min 0.0102757 +[id0] trainer/Advantage Score Mean -0.0045227 +[id0] trainer/Advantage Score Std 0.00139399 +[id0] trainer/Advantage Score Max -0.00221948 +[id0] trainer/Advantage Score Min -0.00653471 +[id0] trainer/bc/Train Logprob Loss 245.501 +[id0] trainer/bc/Test Logprob Loss 246.347 +[id0] trainer/bc/Train MSE 0.643397 +[id0] trainer/bc/Test MSE 0.648696 +[id0] trainer/bc/train_policy_loss 245.501 +[id0] trainer/bc/test_policy_loss 246.347 +[id0] expl/num steps total 200 +[id0] expl/num paths total 4 +[id0] expl/path length Mean 50 +[id0] expl/path length Std 0 +[id0] expl/path length Max 50 +[id0] expl/path length Min 50 +[id0] expl/Rewards Mean -0.197639 +[id0] expl/Rewards Std 0.011234 +[id0] expl/Rewards Max -0.186405 +[id0] expl/Rewards Min -0.208873 +[id0] expl/Returns Mean -9.88193 +[id0] expl/Returns Std 0.561699 +[id0] expl/Returns Max -9.32024 +[id0] expl/Returns Min -10.4436 +[id0] expl/Actions Mean -0.000557166 +[id0] expl/Actions Std 0.0502919 +[id0] expl/Actions Max 0.146695 +[id0] expl/Actions Min -0.165994 +[id0] expl/Num Paths 2 +[id0] expl/Average Returns -9.88193 +[id0] expl/hand_distance Mean 0.0856795 +[id0] expl/hand_distance Std 0.0329738 +[id0] expl/hand_distance Max 0.120603 +[id0] expl/hand_distance Min 0.0476662 +[id0] expl/Final hand_distance Mean 0.0895555 +[id0] expl/Final hand_distance Std 0.0310472 +[id0] expl/Final hand_distance Max 0.120603 +[id0] expl/Final hand_distance Min 0.0585082 +[id0] expl/puck_distance Mean 0.197639 +[id0] expl/puck_distance Std 0.011234 +[id0] expl/puck_distance Max 0.208873 +[id0] expl/puck_distance Min 0.186405 +[id0] expl/Final puck_distance Mean 0.197639 +[id0] expl/Final puck_distance Std 0.0112338 +[id0] expl/Final puck_distance Max 0.208872 +[id0] expl/Final puck_distance Min 0.186405 +[id0] expl/state_distance Mean 0.21697 +[id0] expl/state_distance Std 0.0232233 +[id0] expl/state_distance Max 0.24119 +[id0] expl/state_distance Min 0.192403 +[id0] expl/Final state_distance Mean 0.218281 
+[id0] expl/Final state_distance Std 0.0229095 +[id0] expl/Final state_distance Max 0.24119 +[id0] expl/Final state_distance Min 0.195371 +[id0] expl/touch_distance Mean 0.232212 +[id0] expl/touch_distance Std 0.0408655 +[id0] expl/touch_distance Max 0.278074 +[id0] expl/touch_distance Min 0.181348 +[id0] expl/Final touch_distance Mean 0.229711 +[id0] expl/Final touch_distance Std 0.048363 +[id0] expl/Final touch_distance Max 0.278074 +[id0] expl/Final touch_distance Min 0.181348 +[id0] expl/hand_success Mean 0.5 +[id0] expl/hand_success Std 0.5 +[id0] expl/hand_success Max 1 +[id0] expl/hand_success Min 0 +[id0] expl/Final hand_success Mean 0.5 +[id0] expl/Final hand_success Std 0.5 +[id0] expl/Final hand_success Max 1 +[id0] expl/Final hand_success Min 0 +[id0] expl/hand_success_2 Mean 0.5 +[id0] expl/hand_success_2 Std 0.5 +[id0] expl/hand_success_2 Max 1 +[id0] expl/hand_success_2 Min 0 +[id0] expl/Final hand_success_2 Mean 0.5 +[id0] expl/Final hand_success_2 Std 0.5 +[id0] expl/Final hand_success_2 Max 1 +[id0] expl/Final hand_success_2 Min 0 +[id0] expl/hand_success_3 Mean 0.97 +[id0] expl/hand_success_3 Std 0.170587 +[id0] expl/hand_success_3 Max 1 +[id0] expl/hand_success_3 Min 0 +[id0] expl/Final hand_success_3 Mean 0.5 +[id0] expl/Final hand_success_3 Std 0.5 +[id0] expl/Final hand_success_3 Max 1 +[id0] expl/Final hand_success_3 Min 0 +[id0] expl/puck_success Mean 0 +[id0] expl/puck_success Std 0 +[id0] expl/puck_success Max 0 +[id0] expl/puck_success Min 0 +[id0] expl/Final puck_success Mean 0 +[id0] expl/Final puck_success Std 0 +[id0] expl/Final puck_success Max 0 +[id0] expl/Final puck_success Min 0 +[id0] expl/puck_success_2 Mean 0 +[id0] expl/puck_success_2 Std 0 +[id0] expl/puck_success_2 Max 0 +[id0] expl/puck_success_2 Min 0 +[id0] expl/Final puck_success_2 Mean 0 +[id0] expl/Final puck_success_2 Std 0 +[id0] expl/Final puck_success_2 Max 0 +[id0] expl/Final puck_success_2 Min 0 +[id0] expl/puck_success_3 Mean 0 +[id0] expl/puck_success_3 Std 0 +[id0] expl/puck_success_3 Max 0 +[id0] expl/puck_success_3 Min 0 +[id0] expl/Final puck_success_3 Mean 0 +[id0] expl/Final puck_success_3 Std 0 +[id0] expl/Final puck_success_3 Max 0 +[id0] expl/Final puck_success_3 Min 0 +[id0] expl/hand_and_puck_success Mean 0 +[id0] expl/hand_and_puck_success Std 0 +[id0] expl/hand_and_puck_success Max 0 +[id0] expl/hand_and_puck_success Min 0 +[id0] expl/Final hand_and_puck_success Mean 0 +[id0] expl/Final hand_and_puck_success Std 0 +[id0] expl/Final hand_and_puck_success Max 0 +[id0] expl/Final hand_and_puck_success Min 0 +[id0] expl/hand_and_puck_success_2 Mean 0 +[id0] expl/hand_and_puck_success_2 Std 0 +[id0] expl/hand_and_puck_success_2 Max 0 +[id0] expl/hand_and_puck_success_2 Min 0 +[id0] expl/Final hand_and_puck_success_2 Mean 0 +[id0] expl/Final hand_and_puck_success_2 Std 0 +[id0] expl/Final hand_and_puck_success_2 Max 0 +[id0] expl/Final hand_and_puck_success_2 Min 0 +[id0] expl/hand_and_puck_success_3 Mean 0 +[id0] expl/hand_and_puck_success_3 Std 0 +[id0] expl/hand_and_puck_success_3 Max 0 +[id0] expl/hand_and_puck_success_3 Min 0 +[id0] expl/Final hand_and_puck_success_3 Mean 0 +[id0] expl/Final hand_and_puck_success_3 Std 0 +[id0] expl/Final hand_and_puck_success_3 Max 0 +[id0] expl/Final hand_and_puck_success_3 Min 0 +[id0] expl/state_success Mean 0 +[id0] expl/state_success Std 0 +[id0] expl/state_success Max 0 +[id0] expl/state_success Min 0 +[id0] expl/Final state_success Mean 0 +[id0] expl/Final state_success Std 0 +[id0] expl/Final state_success Max 0 +[id0] expl/Final 
state_success Min 0 +[id0] expl/state_success_2 Mean 0 +[id0] expl/state_success_2 Std 0 +[id0] expl/state_success_2 Max 0 +[id0] expl/state_success_2 Min 0 +[id0] expl/Final state_success_2 Mean 0 +[id0] expl/Final state_success_2 Std 0 +[id0] expl/Final state_success_2 Max 0 +[id0] expl/Final state_success_2 Min 0 +[id0] expl/state_success_3 Mean 0 +[id0] expl/state_success_3 Std 0 +[id0] expl/state_success_3 Max 0 +[id0] expl/state_success_3 Min 0 +[id0] expl/Final state_success_3 Mean 0 +[id0] expl/Final state_success_3 Std 0 +[id0] expl/Final state_success_3 Max 0 +[id0] expl/Final state_success_3 Min 0 +[id0] expl/touch_success Mean 0 +[id0] expl/touch_success Std 0 +[id0] expl/touch_success Max 0 +[id0] expl/touch_success Min 0 +[id0] expl/Final touch_success Mean 0 +[id0] expl/Final touch_success Std 0 +[id0] expl/Final touch_success Max 0 +[id0] expl/Final touch_success Min 0 +[id0] expl/touch_success_2 Mean 0 +[id0] expl/touch_success_2 Std 0 +[id0] expl/touch_success_2 Max 0 +[id0] expl/touch_success_2 Min 0 +[id0] expl/Final touch_success_2 Mean 0 +[id0] expl/Final touch_success_2 Std 0 +[id0] expl/Final touch_success_2 Max 0 +[id0] expl/Final touch_success_2 Min 0 +[id0] expl/touch_success_3 Mean 0 +[id0] expl/touch_success_3 Std 0 +[id0] expl/touch_success_3 Max 0 +[id0] expl/touch_success_3 Min 0 +[id0] expl/Final touch_success_3 Mean 0 +[id0] expl/Final touch_success_3 Std 0 +[id0] expl/Final touch_success_3 Max 0 +[id0] expl/Final touch_success_3 Min 0 +[id0] eval/num steps total 100 +[id0] eval/num paths total 2 +[id0] eval/path length Mean 50 +[id0] eval/path length Std 0 +[id0] eval/path length Max 50 +[id0] eval/path length Min 50 +[id0] eval/Rewards Mean -0.262271 +[id0] eval/Rewards Std 0.0382676 +[id0] eval/Rewards Max -0.224004 +[id0] eval/Rewards Min -0.300539 +[id0] eval/Returns Mean -13.1136 +[id0] eval/Returns Std 1.91338 +[id0] eval/Returns Max -11.2002 +[id0] eval/Returns Min -15.0269 +[id0] eval/Actions Mean -0.000489309 +[id0] eval/Actions Std 0.00942212 +[id0] eval/Actions Max 0.00906735 +[id0] eval/Actions Min -0.0100524 +[id0] eval/Num Paths 2 +[id0] eval/Average Returns -13.1136 +[id0] eval/hand_distance Mean 0.168322 +[id0] eval/hand_distance Std 0.0479489 +[id0] eval/hand_distance Max 0.222622 +[id0] eval/hand_distance Min 0.116261 +[id0] eval/Final hand_distance Mean 0.162978 +[id0] eval/Final hand_distance Std 0.0467166 +[id0] eval/Final hand_distance Max 0.209695 +[id0] eval/Final hand_distance Min 0.116261 +[id0] eval/puck_distance Mean 0.262271 +[id0] eval/puck_distance Std 0.0382676 +[id0] eval/puck_distance Max 0.300539 +[id0] eval/puck_distance Min 0.224004 +[id0] eval/Final puck_distance Mean 0.262271 +[id0] eval/Final puck_distance Std 0.0382675 +[id0] eval/Final puck_distance Max 0.300539 +[id0] eval/Final puck_distance Min 0.224004 +[id0] eval/state_distance Mean 0.317552 +[id0] eval/state_distance Std 0.00655355 +[id0] eval/state_distance Max 0.325431 +[id0] eval/state_distance Min 0.306838 +[id0] eval/Final state_distance Mean 0.31454 +[id0] eval/Final state_distance Std 0.00770235 +[id0] eval/Final state_distance Max 0.322243 +[id0] eval/Final state_distance Min 0.306838 +[id0] eval/touch_distance Mean 0.10999 +[id0] eval/touch_distance Std 0.00448419 +[id0] eval/touch_distance Max 0.116016 +[id0] eval/touch_distance Min 0.100666 +[id0] eval/Final touch_distance Mean 0.108341 +[id0] eval/Final touch_distance Std 0.00767521 +[id0] eval/Final touch_distance Max 0.116016 +[id0] eval/Final touch_distance Min 0.100666 +[id0] 
eval/hand_success Mean 0 +[id0] eval/hand_success Std 0 +[id0] eval/hand_success Max 0 +[id0] eval/hand_success Min 0 +[id0] eval/Final hand_success Mean 0 +[id0] eval/Final hand_success Std 0 +[id0] eval/Final hand_success Max 0 +[id0] eval/Final hand_success Min 0 +[id0] eval/hand_success_2 Mean 0 +[id0] eval/hand_success_2 Std 0 +[id0] eval/hand_success_2 Max 0 +[id0] eval/hand_success_2 Min 0 +[id0] eval/Final hand_success_2 Mean 0 +[id0] eval/Final hand_success_2 Std 0 +[id0] eval/Final hand_success_2 Max 0 +[id0] eval/Final hand_success_2 Min 0 +[id0] eval/hand_success_3 Mean 0.22 +[id0] eval/hand_success_3 Std 0.414246 +[id0] eval/hand_success_3 Max 1 +[id0] eval/hand_success_3 Min 0 +[id0] eval/Final hand_success_3 Mean 0.5 +[id0] eval/Final hand_success_3 Std 0.5 +[id0] eval/Final hand_success_3 Max 1 +[id0] eval/Final hand_success_3 Min 0 +[id0] eval/puck_success Mean 0 +[id0] eval/puck_success Std 0 +[id0] eval/puck_success Max 0 +[id0] eval/puck_success Min 0 +[id0] eval/Final puck_success Mean 0 +[id0] eval/Final puck_success Std 0 +[id0] eval/Final puck_success Max 0 +[id0] eval/Final puck_success Min 0 +[id0] eval/puck_success_2 Mean 0 +[id0] eval/puck_success_2 Std 0 +[id0] eval/puck_success_2 Max 0 +[id0] eval/puck_success_2 Min 0 +[id0] eval/Final puck_success_2 Mean 0 +[id0] eval/Final puck_success_2 Std 0 +[id0] eval/Final puck_success_2 Max 0 +[id0] eval/Final puck_success_2 Min 0 +[id0] eval/puck_success_3 Mean 0 +[id0] eval/puck_success_3 Std 0 +[id0] eval/puck_success_3 Max 0 +[id0] eval/puck_success_3 Min 0 +[id0] eval/Final puck_success_3 Mean 0 +[id0] eval/Final puck_success_3 Std 0 +[id0] eval/Final puck_success_3 Max 0 +[id0] eval/Final puck_success_3 Min 0 +[id0] eval/hand_and_puck_success Mean 0 +[id0] eval/hand_and_puck_success Std 0 +[id0] eval/hand_and_puck_success Max 0 +[id0] eval/hand_and_puck_success Min 0 +[id0] eval/Final hand_and_puck_success Mean 0 +[id0] eval/Final hand_and_puck_success Std 0 +[id0] eval/Final hand_and_puck_success Max 0 +[id0] eval/Final hand_and_puck_success Min 0 +[id0] eval/hand_and_puck_success_2 Mean 0 +[id0] eval/hand_and_puck_success_2 Std 0 +[id0] eval/hand_and_puck_success_2 Max 0 +[id0] eval/hand_and_puck_success_2 Min 0 +[id0] eval/Final hand_and_puck_success_2 Mean 0 +[id0] eval/Final hand_and_puck_success_2 Std 0 +[id0] eval/Final hand_and_puck_success_2 Max 0 +[id0] eval/Final hand_and_puck_success_2 Min 0 +[id0] eval/hand_and_puck_success_3 Mean 0 +[id0] eval/hand_and_puck_success_3 Std 0 +[id0] eval/hand_and_puck_success_3 Max 0 +[id0] eval/hand_and_puck_success_3 Min 0 +[id0] eval/Final hand_and_puck_success_3 Mean 0 +[id0] eval/Final hand_and_puck_success_3 Std 0 +[id0] eval/Final hand_and_puck_success_3 Max 0 +[id0] eval/Final hand_and_puck_success_3 Min 0 +[id0] eval/state_success Mean 0 +[id0] eval/state_success Std 0 +[id0] eval/state_success Max 0 +[id0] eval/state_success Min 0 +[id0] eval/Final state_success Mean 0 +[id0] eval/Final state_success Std 0 +[id0] eval/Final state_success Max 0 +[id0] eval/Final state_success Min 0 +[id0] eval/state_success_2 Mean 0 +[id0] eval/state_success_2 Std 0 +[id0] eval/state_success_2 Max 0 +[id0] eval/state_success_2 Min 0 +[id0] eval/Final state_success_2 Mean 0 +[id0] eval/Final state_success_2 Std 0 +[id0] eval/Final state_success_2 Max 0 +[id0] eval/Final state_success_2 Min 0 +[id0] eval/state_success_3 Mean 0 +[id0] eval/state_success_3 Std 0 +[id0] eval/state_success_3 Max 0 +[id0] eval/state_success_3 Min 0 +[id0] eval/Final state_success_3 Mean 0 +[id0] 
eval/Final state_success_3 Std 0 +[id0] eval/Final state_success_3 Max 0 +[id0] eval/Final state_success_3 Min 0 +[id0] eval/touch_success Mean 0 +[id0] eval/touch_success Std 0 +[id0] eval/touch_success Max 0 +[id0] eval/touch_success Min 0 +[id0] eval/Final touch_success Mean 0 +[id0] eval/Final touch_success Std 0 +[id0] eval/Final touch_success Max 0 +[id0] eval/Final touch_success Min 0 +[id0] eval/touch_success_2 Mean 0 +[id0] eval/touch_success_2 Std 0 +[id0] eval/touch_success_2 Max 0 +[id0] eval/touch_success_2 Min 0 +[id0] eval/Final touch_success_2 Mean 0 +[id0] eval/Final touch_success_2 Std 0 +[id0] eval/Final touch_success_2 Max 0 +[id0] eval/Final touch_success_2 Min 0 +[id0] eval/touch_success_3 Mean 1 +[id0] eval/touch_success_3 Std 0 +[id0] eval/touch_success_3 Max 1 +[id0] eval/touch_success_3 Min 1 +[id0] eval/Final touch_success_3 Mean 1 +[id0] eval/Final touch_success_3 Std 0 +[id0] eval/Final touch_success_3 Max 1 +[id0] eval/Final touch_success_3 Min 1 +[id0] time/epoch_time (s) 4.89415 +[id0] time/evaluation sampling (s) 1.54094 +[id0] time/exploration sampling (s) 1.55405 +[id0] time/global_time (s) 21.5592 +[id0] time/replay buffer data storing (s) 0.000367403 +[id0] time/saving (s) 0.0125396 +[id0] time/training (s) 0.203122 +[id0] epoch 0 +[id0] --------------------------------------- --------------- +[id0] --------------------------------------- --------------- +[id0] trainer/num train calls 20 +[id0] trainer/QF1 Loss 0.00250073 +[id0] trainer/QF2 Loss 0.00207108 +[id0] trainer/Policy Loss 2.22325 +[id0] trainer/Q1 Predictions Mean -0.0737635 +[id0] trainer/Q1 Predictions Std 0.00398001 +[id0] trainer/Q1 Predictions Max -0.0687237 +[id0] trainer/Q1 Predictions Min -0.0776319 +[id0] trainer/Q2 Predictions Mean -0.0691382 +[id0] trainer/Q2 Predictions Std 0.00233655 +[id0] trainer/Q2 Predictions Max -0.0668909 +[id0] trainer/Q2 Predictions Min -0.0736087 +[id0] trainer/Q Targets Mean -0.0295868 +[id0] trainer/Q Targets Std 0.0228406 +[id0] trainer/Q Targets Max -0.000883632 +[id0] trainer/Q Targets Min -0.0600125 +[id0] trainer/Log Pis Mean 2.16272 +[id0] trainer/Log Pis Std 1.56912 +[id0] trainer/Log Pis Max 3.88093 +[id0] trainer/Log Pis Min -0.660848 +[id0] trainer/rewards Mean -0.0287089 +[id0] trainer/rewards Std 0.0228596 +[id0] trainer/rewards Max -1.95569e-07 +[id0] trainer/rewards Min -0.0592206 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 90300 +[id0] trainer/policy/mean Mean -0.000350953 +[id0] trainer/policy/mean Std 0.0518476 +[id0] trainer/policy/mean Max 0.0528517 +[id0] trainer/policy/mean Min -0.0535691 +[id0] trainer/policy/std Mean 0.0543918 +[id0] trainer/policy/std Std 0.000164085 +[id0] trainer/policy/std Max 0.0545799 +[id0] trainer/policy/std Min 0.054039 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.376826 +[id0] trainer/Advantage Weights Max 0.953306 +[id0] trainer/Advantage Weights Min 0.00159713 +[id0] trainer/Advantage Score Mean -0.0132283 +[id0] trainer/Advantage Score Std 0.00232157 +[id0] trainer/Advantage Score Max -0.00913966 +[id0] trainer/Advantage Score Min -0.0155314 +[id0] trainer/bc/Train Logprob Loss 212.68 +[id0] trainer/bc/Test Logprob Loss 222.854 +[id0] trainer/bc/Train MSE 0.638763 +[id0] trainer/bc/Test MSE 0.68769 +[id0] trainer/bc/train_policy_loss 212.68 +[id0] trainer/bc/test_policy_loss 222.854 +[id0] expl/num steps total 300 +[id0] expl/num paths total 6 +[id0] 
expl/path length Mean 50 +[id0] expl/path length Std 0 +[id0] expl/path length Max 50 +[id0] expl/path length Min 50 +[id0] expl/Rewards Mean -0.197181 +[id0] expl/Rewards Std 0.124835 +[id0] expl/Rewards Max -0.0723456 +[id0] expl/Rewards Min -0.322017 +[id0] expl/Returns Mean -9.85907 +[id0] expl/Returns Std 6.24177 +[id0] expl/Returns Max -3.61729 +[id0] expl/Returns Min -16.1008 +[id0] expl/Actions Mean 0.00306798 +[id0] expl/Actions Std 0.0800417 +[id0] expl/Actions Max 0.22433 +[id0] expl/Actions Min -0.187217 +[id0] expl/Num Paths 2 +[id0] expl/Average Returns -9.85907 +[id0] expl/hand_distance Mean 0.147538 +[id0] expl/hand_distance Std 0.0153105 +[id0] expl/hand_distance Max 0.16193 +[id0] expl/hand_distance Min 0.111969 +[id0] expl/Final hand_distance Mean 0.160255 +[id0] expl/Final hand_distance Std 0.000688567 +[id0] expl/Final hand_distance Max 0.160944 +[id0] expl/Final hand_distance Min 0.159567 +[id0] expl/puck_distance Mean 0.197181 +[id0] expl/puck_distance Std 0.124835 +[id0] expl/puck_distance Max 0.322017 +[id0] expl/puck_distance Min 0.0723456 +[id0] expl/Final puck_distance Mean 0.197181 +[id0] expl/Final puck_distance Std 0.124836 +[id0] expl/Final puck_distance Max 0.322017 +[id0] expl/Final puck_distance Min 0.0723456 +[id0] expl/state_distance Mean 0.256835 +[id0] expl/state_distance Std 0.102479 +[id0] expl/state_distance Max 0.359867 +[id0] expl/state_distance Min 0.133308 +[id0] expl/Final state_distance Mean 0.26792 +[id0] expl/Final state_distance Std 0.0914635 +[id0] expl/Final state_distance Max 0.359383 +[id0] expl/Final state_distance Min 0.176456 +[id0] expl/touch_distance Mean 0.18962 +[id0] expl/touch_distance Std 0.108421 +[id0] expl/touch_distance Max 0.317678 +[id0] expl/touch_distance Min 0.0735865 +[id0] expl/Final touch_distance Mean 0.184087 +[id0] expl/Final touch_distance Std 0.0925809 +[id0] expl/Final touch_distance Max 0.276668 +[id0] expl/Final touch_distance Min 0.0915062 +[id0] expl/hand_success Mean 0 +[id0] expl/hand_success Std 0 +[id0] expl/hand_success Max 0 +[id0] expl/hand_success Min 0 +[id0] expl/Final hand_success Mean 0 +[id0] expl/Final hand_success Std 0 +[id0] expl/Final hand_success Max 0 +[id0] expl/Final hand_success Min 0 +[id0] expl/hand_success_2 Mean 0 +[id0] expl/hand_success_2 Std 0 +[id0] expl/hand_success_2 Max 0 +[id0] expl/hand_success_2 Min 0 +[id0] expl/Final hand_success_2 Mean 0 +[id0] expl/Final hand_success_2 Std 0 +[id0] expl/Final hand_success_2 Max 0 +[id0] expl/Final hand_success_2 Min 0 +[id0] expl/hand_success_3 Mean 0.08 +[id0] expl/hand_success_3 Std 0.271293 +[id0] expl/hand_success_3 Max 1 +[id0] expl/hand_success_3 Min 0 +[id0] expl/Final hand_success_3 Mean 0 +[id0] expl/Final hand_success_3 Std 0 +[id0] expl/Final hand_success_3 Max 0 +[id0] expl/Final hand_success_3 Min 0 +[id0] expl/puck_success Mean 0 +[id0] expl/puck_success Std 0 +[id0] expl/puck_success Max 0 +[id0] expl/puck_success Min 0 +[id0] expl/Final puck_success Mean 0 +[id0] expl/Final puck_success Std 0 +[id0] expl/Final puck_success Max 0 +[id0] expl/Final puck_success Min 0 +[id0] expl/puck_success_2 Mean 0.5 +[id0] expl/puck_success_2 Std 0.5 +[id0] expl/puck_success_2 Max 1 +[id0] expl/puck_success_2 Min 0 +[id0] expl/Final puck_success_2 Mean 0.5 +[id0] expl/Final puck_success_2 Std 0.5 +[id0] expl/Final puck_success_2 Max 1 +[id0] expl/Final puck_success_2 Min 0 +[id0] expl/puck_success_3 Mean 0.5 +[id0] expl/puck_success_3 Std 0.5 +[id0] expl/puck_success_3 Max 1 +[id0] expl/puck_success_3 Min 0 +[id0] expl/Final 
puck_success_3 Mean 0.5 +[id0] expl/Final puck_success_3 Std 0.5 +[id0] expl/Final puck_success_3 Max 1 +[id0] expl/Final puck_success_3 Min 0 +[id0] expl/hand_and_puck_success Mean 0 +[id0] expl/hand_and_puck_success Std 0 +[id0] expl/hand_and_puck_success Max 0 +[id0] expl/hand_and_puck_success Min 0 +[id0] expl/Final hand_and_puck_success Mean 0 +[id0] expl/Final hand_and_puck_success Std 0 +[id0] expl/Final hand_and_puck_success Max 0 +[id0] expl/Final hand_and_puck_success Min 0 +[id0] expl/hand_and_puck_success_2 Mean 0 +[id0] expl/hand_and_puck_success_2 Std 0 +[id0] expl/hand_and_puck_success_2 Max 0 +[id0] expl/hand_and_puck_success_2 Min 0 +[id0] expl/Final hand_and_puck_success_2 Mean 0 +[id0] expl/Final hand_and_puck_success_2 Std 0 +[id0] expl/Final hand_and_puck_success_2 Max 0 +[id0] expl/Final hand_and_puck_success_2 Min 0 +[id0] expl/hand_and_puck_success_3 Mean 0 +[id0] expl/hand_and_puck_success_3 Std 0 +[id0] expl/hand_and_puck_success_3 Max 0 +[id0] expl/hand_and_puck_success_3 Min 0 +[id0] expl/Final hand_and_puck_success_3 Mean 0 +[id0] expl/Final hand_and_puck_success_3 Std 0 +[id0] expl/Final hand_and_puck_success_3 Max 0 +[id0] expl/Final hand_and_puck_success_3 Min 0 +[id0] expl/state_success Mean 0 +[id0] expl/state_success Std 0 +[id0] expl/state_success Max 0 +[id0] expl/state_success Min 0 +[id0] expl/Final state_success Mean 0 +[id0] expl/Final state_success Std 0 +[id0] expl/Final state_success Max 0 +[id0] expl/Final state_success Min 0 +[id0] expl/state_success_2 Mean 0 +[id0] expl/state_success_2 Std 0 +[id0] expl/state_success_2 Max 0 +[id0] expl/state_success_2 Min 0 +[id0] expl/Final state_success_2 Mean 0 +[id0] expl/Final state_success_2 Std 0 +[id0] expl/Final state_success_2 Max 0 +[id0] expl/Final state_success_2 Min 0 +[id0] expl/state_success_3 Mean 0 +[id0] expl/state_success_3 Std 0 +[id0] expl/state_success_3 Max 0 +[id0] expl/state_success_3 Min 0 +[id0] expl/Final state_success_3 Mean 0 +[id0] expl/Final state_success_3 Std 0 +[id0] expl/Final state_success_3 Max 0 +[id0] expl/Final state_success_3 Min 0 +[id0] expl/touch_success Mean 0 +[id0] expl/touch_success Std 0 +[id0] expl/touch_success Max 0 +[id0] expl/touch_success Min 0 +[id0] expl/Final touch_success Mean 0 +[id0] expl/Final touch_success Std 0 +[id0] expl/Final touch_success Max 0 +[id0] expl/Final touch_success Min 0 +[id0] expl/touch_success_2 Mean 0.24 +[id0] expl/touch_success_2 Std 0.427083 +[id0] expl/touch_success_2 Max 1 +[id0] expl/touch_success_2 Min 0 +[id0] expl/Final touch_success_2 Mean 0 +[id0] expl/Final touch_success_2 Std 0 +[id0] expl/Final touch_success_2 Max 0 +[id0] expl/Final touch_success_2 Min 0 +[id0] expl/touch_success_3 Mean 0.5 +[id0] expl/touch_success_3 Std 0.5 +[id0] expl/touch_success_3 Max 1 +[id0] expl/touch_success_3 Min 0 +[id0] expl/Final touch_success_3 Mean 0.5 +[id0] expl/Final touch_success_3 Std 0.5 +[id0] expl/Final touch_success_3 Max 1 +[id0] expl/Final touch_success_3 Min 0 +[id0] eval/num steps total 200 +[id0] eval/num paths total 4 +[id0] eval/path length Mean 50 +[id0] eval/path length Std 0 +[id0] eval/path length Max 50 +[id0] eval/path length Min 50 +[id0] eval/Rewards Mean -0.272303 +[id0] eval/Rewards Std 0.0149697 +[id0] eval/Rewards Max -0.25069 +[id0] eval/Rewards Min -0.29101 +[id0] eval/Returns Mean -13.6151 +[id0] eval/Returns Std 0.645758 +[id0] eval/Returns Max -12.9694 +[id0] eval/Returns Min -14.2609 +[id0] eval/Actions Mean -0.000333342 +[id0] eval/Actions Std 0.0505545 +[id0] eval/Actions Max 0.0510213 +[id0] 
eval/Actions Min -0.0517202 +[id0] eval/Num Paths 2 +[id0] eval/Average Returns -13.6151 +[id0] eval/hand_distance Mean 0.122697 +[id0] eval/hand_distance Std 0.0300833 +[id0] eval/hand_distance Max 0.180573 +[id0] eval/hand_distance Min 0.0919758 +[id0] eval/Final hand_distance Mean 0.140525 +[id0] eval/Final hand_distance Std 0.0400476 +[id0] eval/Final hand_distance Max 0.180573 +[id0] eval/Final hand_distance Min 0.100478 +[id0] eval/puck_distance Mean 0.272303 +[id0] eval/puck_distance Std 0.0149697 +[id0] eval/puck_distance Max 0.29101 +[id0] eval/puck_distance Min 0.25069 +[id0] eval/Final puck_distance Mean 0.288114 +[id0] eval/Final puck_distance Std 0.00289617 +[id0] eval/Final puck_distance Max 0.29101 +[id0] eval/Final puck_distance Min 0.285218 +[id0] eval/state_distance Mean 0.300344 +[id0] eval/state_distance Std 0.0112235 +[id0] eval/state_distance Max 0.342481 +[id0] eval/state_distance Min 0.281135 +[id0] eval/Final state_distance Mean 0.32244 +[id0] eval/Final state_distance Std 0.0200413 +[id0] eval/Final state_distance Max 0.342481 +[id0] eval/Final state_distance Min 0.302399 +[id0] eval/touch_distance Mean 0.136578 +[id0] eval/touch_distance Std 0.0700787 +[id0] eval/touch_distance Max 0.235992 +[id0] eval/touch_distance Min 0.0600804 +[id0] eval/Final touch_distance Mean 0.15151 +[id0] eval/Final touch_distance Std 0.0844817 +[id0] eval/Final touch_distance Max 0.235992 +[id0] eval/Final touch_distance Min 0.067028 +[id0] eval/hand_success Mean 0 +[id0] eval/hand_success Std 0 +[id0] eval/hand_success Max 0 +[id0] eval/hand_success Min 0 +[id0] eval/Final hand_success Mean 0 +[id0] eval/Final hand_success Std 0 +[id0] eval/Final hand_success Max 0 +[id0] eval/Final hand_success Min 0 +[id0] eval/hand_success_2 Mean 0 +[id0] eval/hand_success_2 Std 0 +[id0] eval/hand_success_2 Max 0 +[id0] eval/hand_success_2 Min 0 +[id0] eval/Final hand_success_2 Mean 0 +[id0] eval/Final hand_success_2 Std 0 +[id0] eval/Final hand_success_2 Max 0 +[id0] eval/Final hand_success_2 Min 0 +[id0] eval/hand_success_3 Mean 0.5 +[id0] eval/hand_success_3 Std 0.5 +[id0] eval/hand_success_3 Max 1 +[id0] eval/hand_success_3 Min 0 +[id0] eval/Final hand_success_3 Mean 0.5 +[id0] eval/Final hand_success_3 Std 0.5 +[id0] eval/Final hand_success_3 Max 1 +[id0] eval/Final hand_success_3 Min 0 +[id0] eval/puck_success Mean 0 +[id0] eval/puck_success Std 0 +[id0] eval/puck_success Max 0 +[id0] eval/puck_success Min 0 +[id0] eval/Final puck_success Mean 0 +[id0] eval/Final puck_success Std 0 +[id0] eval/Final puck_success Max 0 +[id0] eval/Final puck_success Min 0 +[id0] eval/puck_success_2 Mean 0 +[id0] eval/puck_success_2 Std 0 +[id0] eval/puck_success_2 Max 0 +[id0] eval/puck_success_2 Min 0 +[id0] eval/Final puck_success_2 Mean 0 +[id0] eval/Final puck_success_2 Std 0 +[id0] eval/Final puck_success_2 Max 0 +[id0] eval/Final puck_success_2 Min 0 +[id0] eval/puck_success_3 Mean 0 +[id0] eval/puck_success_3 Std 0 +[id0] eval/puck_success_3 Max 0 +[id0] eval/puck_success_3 Min 0 +[id0] eval/Final puck_success_3 Mean 0 +[id0] eval/Final puck_success_3 Std 0 +[id0] eval/Final puck_success_3 Max 0 +[id0] eval/Final puck_success_3 Min 0 +[id0] eval/hand_and_puck_success Mean 0 +[id0] eval/hand_and_puck_success Std 0 +[id0] eval/hand_and_puck_success Max 0 +[id0] eval/hand_and_puck_success Min 0 +[id0] eval/Final hand_and_puck_success Mean 0 +[id0] eval/Final hand_and_puck_success Std 0 +[id0] eval/Final hand_and_puck_success Max 0 +[id0] eval/Final hand_and_puck_success Min 0 +[id0] 
eval/hand_and_puck_success_2 Mean 0 +[id0] eval/hand_and_puck_success_2 Std 0 +[id0] eval/hand_and_puck_success_2 Max 0 +[id0] eval/hand_and_puck_success_2 Min 0 +[id0] eval/Final hand_and_puck_success_2 Mean 0 +[id0] eval/Final hand_and_puck_success_2 Std 0 +[id0] eval/Final hand_and_puck_success_2 Max 0 +[id0] eval/Final hand_and_puck_success_2 Min 0 +[id0] eval/hand_and_puck_success_3 Mean 0 +[id0] eval/hand_and_puck_success_3 Std 0 +[id0] eval/hand_and_puck_success_3 Max 0 +[id0] eval/hand_and_puck_success_3 Min 0 +[id0] eval/Final hand_and_puck_success_3 Mean 0 +[id0] eval/Final hand_and_puck_success_3 Std 0 +[id0] eval/Final hand_and_puck_success_3 Max 0 +[id0] eval/Final hand_and_puck_success_3 Min 0 +[id0] eval/state_success Mean 0 +[id0] eval/state_success Std 0 +[id0] eval/state_success Max 0 +[id0] eval/state_success Min 0 +[id0] eval/Final state_success Mean 0 +[id0] eval/Final state_success Std 0 +[id0] eval/Final state_success Max 0 +[id0] eval/Final state_success Min 0 +[id0] eval/state_success_2 Mean 0 +[id0] eval/state_success_2 Std 0 +[id0] eval/state_success_2 Max 0 +[id0] eval/state_success_2 Min 0 +[id0] eval/Final state_success_2 Mean 0 +[id0] eval/Final state_success_2 Std 0 +[id0] eval/Final state_success_2 Max 0 +[id0] eval/Final state_success_2 Min 0 +[id0] eval/state_success_3 Mean 0 +[id0] eval/state_success_3 Std 0 +[id0] eval/state_success_3 Max 0 +[id0] eval/state_success_3 Min 0 +[id0] eval/Final state_success_3 Mean 0 +[id0] eval/Final state_success_3 Std 0 +[id0] eval/Final state_success_3 Max 0 +[id0] eval/Final state_success_3 Min 0 +[id0] eval/touch_success Mean 0 +[id0] eval/touch_success Std 0 +[id0] eval/touch_success Max 0 +[id0] eval/touch_success Min 0 +[id0] eval/Final touch_success Mean 0 +[id0] eval/Final touch_success Std 0 +[id0] eval/Final touch_success Max 0 +[id0] eval/Final touch_success Min 0 +[id0] eval/touch_success_2 Mean 0.45 +[id0] eval/touch_success_2 Std 0.497494 +[id0] eval/touch_success_2 Max 1 +[id0] eval/touch_success_2 Min 0 +[id0] eval/Final touch_success_2 Mean 0.5 +[id0] eval/Final touch_success_2 Std 0.5 +[id0] eval/Final touch_success_2 Max 1 +[id0] eval/Final touch_success_2 Min 0 +[id0] eval/touch_success_3 Mean 0.5 +[id0] eval/touch_success_3 Std 0.5 +[id0] eval/touch_success_3 Max 1 +[id0] eval/touch_success_3 Min 0 +[id0] eval/Final touch_success_3 Mean 0.5 +[id0] eval/Final touch_success_3 Std 0.5 +[id0] eval/Final touch_success_3 Max 1 +[id0] eval/Final touch_success_3 Min 0 +[id0] time/epoch_time (s) 3.32287 +[id0] time/evaluation sampling (s) 1.55291 +[id0] time/exploration sampling (s) 1.55204 +[id0] time/global_time (s) 24.9089 +[id0] time/replay buffer data storing (s) 0.000366211 +[id0] time/saving (s) 3.05176e-05 +[id0] time/training (s) 0.206526 +[id0] epoch 1 +[id0] --------------------------------------- --------------- +[id0] --------------------------------------- --------------- +[id0] trainer/num train calls 30 +[id0] trainer/QF1 Loss 0.00454581 +[id0] trainer/QF2 Loss 0.004166 +[id0] trainer/Policy Loss 2.72957 +[id0] trainer/Q1 Predictions Mean -0.126527 +[id0] trainer/Q1 Predictions Std 0.00315079 +[id0] trainer/Q1 Predictions Max -0.123503 +[id0] trainer/Q1 Predictions Min -0.131618 +[id0] trainer/Q2 Predictions Mean -0.126105 +[id0] trainer/Q2 Predictions Std 0.00710658 +[id0] trainer/Q2 Predictions Max -0.11948 +[id0] trainer/Q2 Predictions Min -0.139396 +[id0] trainer/Q Targets Mean -0.112228 +[id0] trainer/Q Targets Std 0.0672478 +[id0] trainer/Q Targets Max -0.0255882 +[id0] trainer/Q 
Targets Min -0.209882 +[id0] trainer/Log Pis Mean 2.6193 +[id0] trainer/Log Pis Std 1.044 +[id0] trainer/Log Pis Max 3.409 +[id0] trainer/Log Pis Min 0.627569 +[id0] trainer/rewards Mean -0.110253 +[id0] trainer/rewards Std 0.0672446 +[id0] trainer/rewards Max -0.0236798 +[id0] trainer/rewards Min -0.207899 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 90400 +[id0] trainer/policy/mean Mean 0.00452171 +[id0] trainer/policy/mean Std 0.199858 +[id0] trainer/policy/mean Max 0.211937 +[id0] trainer/policy/mean Min -0.202561 +[id0] trainer/policy/std Mean 0.0727627 +[id0] trainer/policy/std Std 0.000733147 +[id0] trainer/policy/std Max 0.0741526 +[id0] trainer/policy/std Min 0.0716186 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.303689 +[id0] trainer/Advantage Weights Max 0.803173 +[id0] trainer/Advantage Weights Min 0.000376373 +[id0] trainer/Advantage Score Mean -0.0162524 +[id0] trainer/Advantage Score Std 0.00252286 +[id0] trainer/Advantage Score Max -0.0131438 +[id0] trainer/Advantage Score Min -0.0208096 +[id0] trainer/bc/Train Logprob Loss 137.941 +[id0] trainer/bc/Test Logprob Loss 114.999 +[id0] trainer/bc/Train MSE 0.72753 +[id0] trainer/bc/Test MSE 0.686215 +[id0] trainer/bc/train_policy_loss 137.941 +[id0] trainer/bc/test_policy_loss 114.999 +[id0] expl/num steps total 400 +[id0] expl/num paths total 8 +[id0] expl/path length Mean 50 +[id0] expl/path length Std 0 +[id0] expl/path length Max 50 +[id0] expl/path length Min 50 +[id0] expl/Rewards Mean -0.145173 +[id0] expl/Rewards Std 0.0133283 +[id0] expl/Rewards Max -0.131845 +[id0] expl/Rewards Min -0.158502 +[id0] expl/Returns Mean -7.25866 +[id0] expl/Returns Std 0.666417 +[id0] expl/Returns Max -6.59225 +[id0] expl/Returns Min -7.92508 +[id0] expl/Actions Mean 0.0140931 +[id0] expl/Actions Std 0.209096 +[id0] expl/Actions Max 0.371834 +[id0] expl/Actions Min -0.3301 +[id0] expl/Num Paths 2 +[id0] expl/Average Returns -7.25866 +[id0] expl/hand_distance Mean 0.202981 +[id0] expl/hand_distance Std 0.0395357 +[id0] expl/hand_distance Max 0.271631 +[id0] expl/hand_distance Min 0.118211 +[id0] expl/Final hand_distance Mean 0.265121 +[id0] expl/Final hand_distance Std 0.00650981 +[id0] expl/Final hand_distance Max 0.271631 +[id0] expl/Final hand_distance Min 0.258611 +[id0] expl/puck_distance Mean 0.145173 +[id0] expl/puck_distance Std 0.0133283 +[id0] expl/puck_distance Max 0.158502 +[id0] expl/puck_distance Min 0.131845 +[id0] expl/Final puck_distance Mean 0.145173 +[id0] expl/Final puck_distance Std 0.0133284 +[id0] expl/Final puck_distance Max 0.158502 +[id0] expl/Final puck_distance Min 0.131845 +[id0] expl/state_distance Mean 0.250709 +[id0] expl/state_distance Std 0.0340966 +[id0] expl/state_distance Max 0.303319 +[id0] expl/state_distance Min 0.177079 +[id0] expl/Final state_distance Mean 0.302628 +[id0] expl/Final state_distance Std 0.000690746 +[id0] expl/Final state_distance Max 0.303319 +[id0] expl/Final state_distance Min 0.301938 +[id0] expl/touch_distance Mean 0.202697 +[id0] expl/touch_distance Std 0.0637517 +[id0] expl/touch_distance Max 0.312716 +[id0] expl/touch_distance Min 0.0831949 +[id0] expl/Final touch_distance Mean 0.197956 +[id0] expl/Final touch_distance Std 0.114761 +[id0] expl/Final touch_distance Max 0.312716 +[id0] expl/Final touch_distance Min 0.0831949 +[id0] expl/hand_success Mean 0 +[id0] expl/hand_success Std 0 +[id0] expl/hand_success Max 0 +[id0] 
expl/hand_success Min 0 +[id0] expl/Final hand_success Mean 0 +[id0] expl/Final hand_success Std 0 +[id0] expl/Final hand_success Max 0 +[id0] expl/Final hand_success Min 0 +[id0] expl/hand_success_2 Mean 0 +[id0] expl/hand_success_2 Std 0 +[id0] expl/hand_success_2 Max 0 +[id0] expl/hand_success_2 Min 0 +[id0] expl/Final hand_success_2 Mean 0 +[id0] expl/Final hand_success_2 Std 0 +[id0] expl/Final hand_success_2 Max 0 +[id0] expl/Final hand_success_2 Min 0 +[id0] expl/hand_success_3 Mean 0.01 +[id0] expl/hand_success_3 Std 0.0994987 +[id0] expl/hand_success_3 Max 1 +[id0] expl/hand_success_3 Min 0 +[id0] expl/Final hand_success_3 Mean 0 +[id0] expl/Final hand_success_3 Std 0 +[id0] expl/Final hand_success_3 Max 0 +[id0] expl/Final hand_success_3 Min 0 +[id0] expl/puck_success Mean 0 +[id0] expl/puck_success Std 0 +[id0] expl/puck_success Max 0 +[id0] expl/puck_success Min 0 +[id0] expl/Final puck_success Mean 0 +[id0] expl/Final puck_success Std 0 +[id0] expl/Final puck_success Max 0 +[id0] expl/Final puck_success Min 0 +[id0] expl/puck_success_2 Mean 0 +[id0] expl/puck_success_2 Std 0 +[id0] expl/puck_success_2 Max 0 +[id0] expl/puck_success_2 Min 0 +[id0] expl/Final puck_success_2 Mean 0 +[id0] expl/Final puck_success_2 Std 0 +[id0] expl/Final puck_success_2 Max 0 +[id0] expl/Final puck_success_2 Min 0 +[id0] expl/puck_success_3 Mean 0 +[id0] expl/puck_success_3 Std 0 +[id0] expl/puck_success_3 Max 0 +[id0] expl/puck_success_3 Min 0 +[id0] expl/Final puck_success_3 Mean 0 +[id0] expl/Final puck_success_3 Std 0 +[id0] expl/Final puck_success_3 Max 0 +[id0] expl/Final puck_success_3 Min 0 +[id0] expl/hand_and_puck_success Mean 0 +[id0] expl/hand_and_puck_success Std 0 +[id0] expl/hand_and_puck_success Max 0 +[id0] expl/hand_and_puck_success Min 0 +[id0] expl/Final hand_and_puck_success Mean 0 +[id0] expl/Final hand_and_puck_success Std 0 +[id0] expl/Final hand_and_puck_success Max 0 +[id0] expl/Final hand_and_puck_success Min 0 +[id0] expl/hand_and_puck_success_2 Mean 0 +[id0] expl/hand_and_puck_success_2 Std 0 +[id0] expl/hand_and_puck_success_2 Max 0 +[id0] expl/hand_and_puck_success_2 Min 0 +[id0] expl/Final hand_and_puck_success_2 Mean 0 +[id0] expl/Final hand_and_puck_success_2 Std 0 +[id0] expl/Final hand_and_puck_success_2 Max 0 +[id0] expl/Final hand_and_puck_success_2 Min 0 +[id0] expl/hand_and_puck_success_3 Mean 0 +[id0] expl/hand_and_puck_success_3 Std 0 +[id0] expl/hand_and_puck_success_3 Max 0 +[id0] expl/hand_and_puck_success_3 Min 0 +[id0] expl/Final hand_and_puck_success_3 Mean 0 +[id0] expl/Final hand_and_puck_success_3 Std 0 +[id0] expl/Final hand_and_puck_success_3 Max 0 +[id0] expl/Final hand_and_puck_success_3 Min 0 +[id0] expl/state_success Mean 0 +[id0] expl/state_success Std 0 +[id0] expl/state_success Max 0 +[id0] expl/state_success Min 0 +[id0] expl/Final state_success Mean 0 +[id0] expl/Final state_success Std 0 +[id0] expl/Final state_success Max 0 +[id0] expl/Final state_success Min 0 +[id0] expl/state_success_2 Mean 0 +[id0] expl/state_success_2 Std 0 +[id0] expl/state_success_2 Max 0 +[id0] expl/state_success_2 Min 0 +[id0] expl/Final state_success_2 Mean 0 +[id0] expl/Final state_success_2 Std 0 +[id0] expl/Final state_success_2 Max 0 +[id0] expl/Final state_success_2 Min 0 +[id0] expl/state_success_3 Mean 0 +[id0] expl/state_success_3 Std 0 +[id0] expl/state_success_3 Max 0 +[id0] expl/state_success_3 Min 0 +[id0] expl/Final state_success_3 Mean 0 +[id0] expl/Final state_success_3 Std 0 +[id0] expl/Final state_success_3 Max 0 +[id0] expl/Final 
[Captured AWAC regression-test log output (elided): one metric table per epoch, covering a first run's epochs 2-4 followed by a second run's epochs 0-1. Each table lists trainer/ statistics (QF1/QF2 and policy losses, Q1/Q2 predictions and Q targets, log-pis, rewards, terminals, replay-buffer length, policy mean/std, advantage weights and scores, behavior-cloning train/test losses and MSE), expl/ and eval/ rollout statistics (step and path counts, path length, rewards, returns, actions, average returns, and hand/puck/state/touch distances and success rates, each reported as Mean/Std/Max/Min), and time/ profiling entries (sampling, training, saving, epoch, and global time).]
trainer/Advantage Weights Std 0.303689 +[id0] trainer/Advantage Weights Max 0.803173 +[id0] trainer/Advantage Weights Min 0.000376373 +[id0] trainer/Advantage Score Mean -0.0162524 +[id0] trainer/Advantage Score Std 0.00252286 +[id0] trainer/Advantage Score Max -0.0131438 +[id0] trainer/Advantage Score Min -0.0208096 +[id0] trainer/bc/Train Logprob Loss 137.941 +[id0] trainer/bc/Test Logprob Loss 114.999 +[id0] trainer/bc/Train MSE 0.72753 +[id0] trainer/bc/Test MSE 0.686215 +[id0] trainer/bc/train_policy_loss 137.941 +[id0] trainer/bc/test_policy_loss 114.999 +[id0] expl/num steps total 400 +[id0] expl/num paths total 8 +[id0] expl/path length Mean 50 +[id0] expl/path length Std 0 +[id0] expl/path length Max 50 +[id0] expl/path length Min 50 +[id0] expl/Rewards Mean -0.145173 +[id0] expl/Rewards Std 0.0133283 +[id0] expl/Rewards Max -0.131845 +[id0] expl/Rewards Min -0.158502 +[id0] expl/Returns Mean -7.25866 +[id0] expl/Returns Std 0.666417 +[id0] expl/Returns Max -6.59225 +[id0] expl/Returns Min -7.92508 +[id0] expl/Actions Mean 0.0140931 +[id0] expl/Actions Std 0.209096 +[id0] expl/Actions Max 0.371834 +[id0] expl/Actions Min -0.3301 +[id0] expl/Num Paths 2 +[id0] expl/Average Returns -7.25866 +[id0] expl/hand_distance Mean 0.202981 +[id0] expl/hand_distance Std 0.0395357 +[id0] expl/hand_distance Max 0.271631 +[id0] expl/hand_distance Min 0.118211 +[id0] expl/Final hand_distance Mean 0.265121 +[id0] expl/Final hand_distance Std 0.00650981 +[id0] expl/Final hand_distance Max 0.271631 +[id0] expl/Final hand_distance Min 0.258611 +[id0] expl/puck_distance Mean 0.145173 +[id0] expl/puck_distance Std 0.0133283 +[id0] expl/puck_distance Max 0.158502 +[id0] expl/puck_distance Min 0.131845 +[id0] expl/Final puck_distance Mean 0.145173 +[id0] expl/Final puck_distance Std 0.0133284 +[id0] expl/Final puck_distance Max 0.158502 +[id0] expl/Final puck_distance Min 0.131845 +[id0] expl/state_distance Mean 0.250709 +[id0] expl/state_distance Std 0.0340966 +[id0] expl/state_distance Max 0.303319 +[id0] expl/state_distance Min 0.177079 +[id0] expl/Final state_distance Mean 0.302628 +[id0] expl/Final state_distance Std 0.000690746 +[id0] expl/Final state_distance Max 0.303319 +[id0] expl/Final state_distance Min 0.301938 +[id0] expl/touch_distance Mean 0.202697 +[id0] expl/touch_distance Std 0.0637517 +[id0] expl/touch_distance Max 0.312716 +[id0] expl/touch_distance Min 0.0831949 +[id0] expl/Final touch_distance Mean 0.197956 +[id0] expl/Final touch_distance Std 0.114761 +[id0] expl/Final touch_distance Max 0.312716 +[id0] expl/Final touch_distance Min 0.0831949 +[id0] expl/hand_success Mean 0 +[id0] expl/hand_success Std 0 +[id0] expl/hand_success Max 0 +[id0] expl/hand_success Min 0 +[id0] expl/Final hand_success Mean 0 +[id0] expl/Final hand_success Std 0 +[id0] expl/Final hand_success Max 0 +[id0] expl/Final hand_success Min 0 +[id0] expl/hand_success_2 Mean 0 +[id0] expl/hand_success_2 Std 0 +[id0] expl/hand_success_2 Max 0 +[id0] expl/hand_success_2 Min 0 +[id0] expl/Final hand_success_2 Mean 0 +[id0] expl/Final hand_success_2 Std 0 +[id0] expl/Final hand_success_2 Max 0 +[id0] expl/Final hand_success_2 Min 0 +[id0] expl/hand_success_3 Mean 0.01 +[id0] expl/hand_success_3 Std 0.0994987 +[id0] expl/hand_success_3 Max 1 +[id0] expl/hand_success_3 Min 0 +[id0] expl/Final hand_success_3 Mean 0 +[id0] expl/Final hand_success_3 Std 0 +[id0] expl/Final hand_success_3 Max 0 +[id0] expl/Final hand_success_3 Min 0 +[id0] expl/puck_success Mean 0 +[id0] expl/puck_success Std 0 +[id0] expl/puck_success Max 
0 +[id0] expl/puck_success Min 0 +[id0] expl/Final puck_success Mean 0 +[id0] expl/Final puck_success Std 0 +[id0] expl/Final puck_success Max 0 +[id0] expl/Final puck_success Min 0 +[id0] expl/puck_success_2 Mean 0 +[id0] expl/puck_success_2 Std 0 +[id0] expl/puck_success_2 Max 0 +[id0] expl/puck_success_2 Min 0 +[id0] expl/Final puck_success_2 Mean 0 +[id0] expl/Final puck_success_2 Std 0 +[id0] expl/Final puck_success_2 Max 0 +[id0] expl/Final puck_success_2 Min 0 +[id0] expl/puck_success_3 Mean 0 +[id0] expl/puck_success_3 Std 0 +[id0] expl/puck_success_3 Max 0 +[id0] expl/puck_success_3 Min 0 +[id0] expl/Final puck_success_3 Mean 0 +[id0] expl/Final puck_success_3 Std 0 +[id0] expl/Final puck_success_3 Max 0 +[id0] expl/Final puck_success_3 Min 0 +[id0] expl/hand_and_puck_success Mean 0 +[id0] expl/hand_and_puck_success Std 0 +[id0] expl/hand_and_puck_success Max 0 +[id0] expl/hand_and_puck_success Min 0 +[id0] expl/Final hand_and_puck_success Mean 0 +[id0] expl/Final hand_and_puck_success Std 0 +[id0] expl/Final hand_and_puck_success Max 0 +[id0] expl/Final hand_and_puck_success Min 0 +[id0] expl/hand_and_puck_success_2 Mean 0 +[id0] expl/hand_and_puck_success_2 Std 0 +[id0] expl/hand_and_puck_success_2 Max 0 +[id0] expl/hand_and_puck_success_2 Min 0 +[id0] expl/Final hand_and_puck_success_2 Mean 0 +[id0] expl/Final hand_and_puck_success_2 Std 0 +[id0] expl/Final hand_and_puck_success_2 Max 0 +[id0] expl/Final hand_and_puck_success_2 Min 0 +[id0] expl/hand_and_puck_success_3 Mean 0 +[id0] expl/hand_and_puck_success_3 Std 0 +[id0] expl/hand_and_puck_success_3 Max 0 +[id0] expl/hand_and_puck_success_3 Min 0 +[id0] expl/Final hand_and_puck_success_3 Mean 0 +[id0] expl/Final hand_and_puck_success_3 Std 0 +[id0] expl/Final hand_and_puck_success_3 Max 0 +[id0] expl/Final hand_and_puck_success_3 Min 0 +[id0] expl/state_success Mean 0 +[id0] expl/state_success Std 0 +[id0] expl/state_success Max 0 +[id0] expl/state_success Min 0 +[id0] expl/Final state_success Mean 0 +[id0] expl/Final state_success Std 0 +[id0] expl/Final state_success Max 0 +[id0] expl/Final state_success Min 0 +[id0] expl/state_success_2 Mean 0 +[id0] expl/state_success_2 Std 0 +[id0] expl/state_success_2 Max 0 +[id0] expl/state_success_2 Min 0 +[id0] expl/Final state_success_2 Mean 0 +[id0] expl/Final state_success_2 Std 0 +[id0] expl/Final state_success_2 Max 0 +[id0] expl/Final state_success_2 Min 0 +[id0] expl/state_success_3 Mean 0 +[id0] expl/state_success_3 Std 0 +[id0] expl/state_success_3 Max 0 +[id0] expl/state_success_3 Min 0 +[id0] expl/Final state_success_3 Mean 0 +[id0] expl/Final state_success_3 Std 0 +[id0] expl/Final state_success_3 Max 0 +[id0] expl/Final state_success_3 Min 0 +[id0] expl/touch_success Mean 0 +[id0] expl/touch_success Std 0 +[id0] expl/touch_success Max 0 +[id0] expl/touch_success Min 0 +[id0] expl/Final touch_success Mean 0 +[id0] expl/Final touch_success Std 0 +[id0] expl/Final touch_success Max 0 +[id0] expl/Final touch_success Min 0 +[id0] expl/touch_success_2 Mean 0 +[id0] expl/touch_success_2 Std 0 +[id0] expl/touch_success_2 Max 0 +[id0] expl/touch_success_2 Min 0 +[id0] expl/Final touch_success_2 Mean 0 +[id0] expl/Final touch_success_2 Std 0 +[id0] expl/Final touch_success_2 Max 0 +[id0] expl/Final touch_success_2 Min 0 +[id0] expl/touch_success_3 Mean 0.14 +[id0] expl/touch_success_3 Std 0.346987 +[id0] expl/touch_success_3 Max 1 +[id0] expl/touch_success_3 Min 0 +[id0] expl/Final touch_success_3 Mean 0.5 +[id0] expl/Final touch_success_3 Std 0.5 +[id0] expl/Final 
touch_success_3 Max 1 +[id0] expl/Final touch_success_3 Min 0 +[id0] eval/num steps total 300 +[id0] eval/num paths total 6 +[id0] eval/path length Mean 50 +[id0] eval/path length Std 0 +[id0] eval/path length Max 50 +[id0] eval/path length Min 50 +[id0] eval/Rewards Mean -0.160745 +[id0] eval/Rewards Std 0.0629678 +[id0] eval/Rewards Max -0.0980016 +[id0] eval/Rewards Min -0.248806 +[id0] eval/Returns Mean -8.03726 +[id0] eval/Returns Std 3.13716 +[id0] eval/Returns Max -4.90009 +[id0] eval/Returns Min -11.1744 +[id0] eval/Actions Mean 0.00448906 +[id0] eval/Actions Std 0.198369 +[id0] eval/Actions Max 0.213171 +[id0] eval/Actions Min -0.203783 +[id0] eval/Num Paths 2 +[id0] eval/Average Returns -8.03726 +[id0] eval/hand_distance Mean 0.172838 +[id0] eval/hand_distance Std 0.0793458 +[id0] eval/hand_distance Max 0.338509 +[id0] eval/hand_distance Min 0.100819 +[id0] eval/Final hand_distance Mean 0.241763 +[id0] eval/Final hand_distance Std 0.0967464 +[id0] eval/Final hand_distance Max 0.338509 +[id0] eval/Final hand_distance Min 0.145016 +[id0] eval/puck_distance Mean 0.160745 +[id0] eval/puck_distance Std 0.0629678 +[id0] eval/puck_distance Max 0.248806 +[id0] eval/puck_distance Min 0.0980016 +[id0] eval/Final puck_distance Mean 0.173404 +[id0] eval/Final puck_distance Std 0.0754021 +[id0] eval/Final puck_distance Max 0.248806 +[id0] eval/Final puck_distance Min 0.0980023 +[id0] eval/state_distance Mean 0.252767 +[id0] eval/state_distance Std 0.0456258 +[id0] eval/state_distance Max 0.35241 +[id0] eval/state_distance Min 0.149458 +[id0] eval/Final state_distance Mean 0.320197 +[id0] eval/Final state_distance Std 0.0322134 +[id0] eval/Final state_distance Max 0.35241 +[id0] eval/Final state_distance Min 0.287983 +[id0] eval/touch_distance Mean 0.133625 +[id0] eval/touch_distance Std 0.0489162 +[id0] eval/touch_distance Max 0.22003 +[id0] eval/touch_distance Min 0.0647734 +[id0] eval/Final touch_distance Mean 0.143407 +[id0] eval/Final touch_distance Std 0.0766229 +[id0] eval/Final touch_distance Max 0.22003 +[id0] eval/Final touch_distance Min 0.0667838 +[id0] eval/hand_success Mean 0 +[id0] eval/hand_success Std 0 +[id0] eval/hand_success Max 0 +[id0] eval/hand_success Min 0 +[id0] eval/Final hand_success Mean 0 +[id0] eval/Final hand_success Std 0 +[id0] eval/Final hand_success Max 0 +[id0] eval/Final hand_success Min 0 +[id0] eval/hand_success_2 Mean 0 +[id0] eval/hand_success_2 Std 0 +[id0] eval/hand_success_2 Max 0 +[id0] eval/hand_success_2 Min 0 +[id0] eval/Final hand_success_2 Mean 0 +[id0] eval/Final hand_success_2 Std 0 +[id0] eval/Final hand_success_2 Max 0 +[id0] eval/Final hand_success_2 Min 0 +[id0] eval/hand_success_3 Mean 0.42 +[id0] eval/hand_success_3 Std 0.493559 +[id0] eval/hand_success_3 Max 1 +[id0] eval/hand_success_3 Min 0 +[id0] eval/Final hand_success_3 Mean 0 +[id0] eval/Final hand_success_3 Std 0 +[id0] eval/Final hand_success_3 Max 0 +[id0] eval/Final hand_success_3 Min 0 +[id0] eval/puck_success Mean 0 +[id0] eval/puck_success Std 0 +[id0] eval/puck_success Max 0 +[id0] eval/puck_success Min 0 +[id0] eval/Final puck_success Mean 0 +[id0] eval/Final puck_success Std 0 +[id0] eval/Final puck_success Max 0 +[id0] eval/Final puck_success Min 0 +[id0] eval/puck_success_2 Mean 0 +[id0] eval/puck_success_2 Std 0 +[id0] eval/puck_success_2 Max 0 +[id0] eval/puck_success_2 Min 0 +[id0] eval/Final puck_success_2 Mean 0 +[id0] eval/Final puck_success_2 Std 0 +[id0] eval/Final puck_success_2 Max 0 +[id0] eval/Final puck_success_2 Min 0 +[id0] eval/puck_success_3 Mean 0.5 
+[id0] eval/puck_success_3 Std 0.5 +[id0] eval/puck_success_3 Max 1 +[id0] eval/puck_success_3 Min 0 +[id0] eval/Final puck_success_3 Mean 0.5 +[id0] eval/Final puck_success_3 Std 0.5 +[id0] eval/Final puck_success_3 Max 1 +[id0] eval/Final puck_success_3 Min 0 +[id0] eval/hand_and_puck_success Mean 0 +[id0] eval/hand_and_puck_success Std 0 +[id0] eval/hand_and_puck_success Max 0 +[id0] eval/hand_and_puck_success Min 0 +[id0] eval/Final hand_and_puck_success Mean 0 +[id0] eval/Final hand_and_puck_success Std 0 +[id0] eval/Final hand_and_puck_success Max 0 +[id0] eval/Final hand_and_puck_success Min 0 +[id0] eval/hand_and_puck_success_2 Mean 0 +[id0] eval/hand_and_puck_success_2 Std 0 +[id0] eval/hand_and_puck_success_2 Max 0 +[id0] eval/hand_and_puck_success_2 Min 0 +[id0] eval/Final hand_and_puck_success_2 Mean 0 +[id0] eval/Final hand_and_puck_success_2 Std 0 +[id0] eval/Final hand_and_puck_success_2 Max 0 +[id0] eval/Final hand_and_puck_success_2 Min 0 +[id0] eval/hand_and_puck_success_3 Mean 0 +[id0] eval/hand_and_puck_success_3 Std 0 +[id0] eval/hand_and_puck_success_3 Max 0 +[id0] eval/hand_and_puck_success_3 Min 0 +[id0] eval/Final hand_and_puck_success_3 Mean 0 +[id0] eval/Final hand_and_puck_success_3 Std 0 +[id0] eval/Final hand_and_puck_success_3 Max 0 +[id0] eval/Final hand_and_puck_success_3 Min 0 +[id0] eval/state_success Mean 0 +[id0] eval/state_success Std 0 +[id0] eval/state_success Max 0 +[id0] eval/state_success Min 0 +[id0] eval/Final state_success Mean 0 +[id0] eval/Final state_success Std 0 +[id0] eval/Final state_success Max 0 +[id0] eval/Final state_success Min 0 +[id0] eval/state_success_2 Mean 0 +[id0] eval/state_success_2 Std 0 +[id0] eval/state_success_2 Max 0 +[id0] eval/state_success_2 Min 0 +[id0] eval/Final state_success_2 Mean 0 +[id0] eval/Final state_success_2 Std 0 +[id0] eval/Final state_success_2 Max 0 +[id0] eval/Final state_success_2 Min 0 +[id0] eval/state_success_3 Mean 0 +[id0] eval/state_success_3 Std 0 +[id0] eval/state_success_3 Max 0 +[id0] eval/state_success_3 Min 0 +[id0] eval/Final state_success_3 Mean 0 +[id0] eval/Final state_success_3 Std 0 +[id0] eval/Final state_success_3 Max 0 +[id0] eval/Final state_success_3 Min 0 +[id0] eval/touch_success Mean 0 +[id0] eval/touch_success Std 0 +[id0] eval/touch_success Max 0 +[id0] eval/touch_success Min 0 +[id0] eval/Final touch_success Mean 0 +[id0] eval/Final touch_success Std 0 +[id0] eval/Final touch_success Max 0 +[id0] eval/Final touch_success Min 0 +[id0] eval/touch_success_2 Mean 0.16 +[id0] eval/touch_success_2 Std 0.366606 +[id0] eval/touch_success_2 Max 1 +[id0] eval/touch_success_2 Min 0 +[id0] eval/Final touch_success_2 Mean 0.5 +[id0] eval/Final touch_success_2 Std 0.5 +[id0] eval/Final touch_success_2 Max 1 +[id0] eval/Final touch_success_2 Min 0 +[id0] eval/touch_success_3 Mean 0.46 +[id0] eval/touch_success_3 Std 0.498397 +[id0] eval/touch_success_3 Max 1 +[id0] eval/touch_success_3 Min 0 +[id0] eval/Final touch_success_3 Mean 0.5 +[id0] eval/Final touch_success_3 Std 0.5 +[id0] eval/Final touch_success_3 Max 1 +[id0] eval/Final touch_success_3 Min 0 +[id0] time/epoch_time (s) 3.31803 +[id0] time/evaluation sampling (s) 1.54555 +[id0] time/exploration sampling (s) 1.55969 +[id0] time/global_time (s) 28.2217 +[id0] time/replay buffer data storing (s) 0.000371933 +[id0] time/saving (s) 3.50475e-05 +[id0] time/training (s) 0.202231 +[id0] epoch 2 +[id0] --------------------------------------- --------------- +[id0] --------------------------------------- --------------- +[id0] 
trainer/num train calls 40 +[id0] trainer/QF1 Loss 0.00812755 +[id0] trainer/QF2 Loss 0.00844011 +[id0] trainer/Policy Loss 1.44796 +[id0] trainer/Q1 Predictions Mean -0.125104 +[id0] trainer/Q1 Predictions Std 0.00825019 +[id0] trainer/Q1 Predictions Max -0.115534 +[id0] trainer/Q1 Predictions Min -0.140015 +[id0] trainer/Q2 Predictions Mean -0.126615 +[id0] trainer/Q2 Predictions Std 0.00817811 +[id0] trainer/Q2 Predictions Max -0.115739 +[id0] trainer/Q2 Predictions Min -0.141015 +[id0] trainer/Q Targets Mean -0.0874891 +[id0] trainer/Q Targets Std 0.0767401 +[id0] trainer/Q Targets Max -0.00298979 +[id0] trainer/Q Targets Min -0.219601 +[id0] trainer/Log Pis Mean 1.32672 +[id0] trainer/Log Pis Std 0.705888 +[id0] trainer/Log Pis Max 1.97718 +[id0] trainer/Log Pis Min 0.302602 +[id0] trainer/rewards Mean -0.0843445 +[id0] trainer/rewards Std 0.0768233 +[id0] trainer/rewards Max -1.76237e-06 +[id0] trainer/rewards Min -0.216733 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 90500 +[id0] trainer/policy/mean Mean -0.0363329 +[id0] trainer/policy/mean Std 0.381041 +[id0] trainer/policy/mean Max 0.369571 +[id0] trainer/policy/mean Min -0.446347 +[id0] trainer/policy/std Mean 0.140297 +[id0] trainer/policy/std Std 0.0092635 +[id0] trainer/policy/std Max 0.156963 +[id0] trainer/policy/std Min 0.123789 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.399073 +[id0] trainer/Advantage Weights Max 0.998145 +[id0] trainer/Advantage Weights Min 3.91133e-10 +[id0] trainer/Advantage Score Mean -0.00386002 +[id0] trainer/Advantage Score Std 0.00749892 +[id0] trainer/Advantage Score Max 0.0071685 +[id0] trainer/Advantage Score Min -0.0144916 +[id0] trainer/bc/Train Logprob Loss 35.3732 +[id0] trainer/bc/Test Logprob Loss 31.4472 +[id0] trainer/bc/Train MSE 0.762608 +[id0] trainer/bc/Test MSE 0.794041 +[id0] trainer/bc/train_policy_loss 35.3732 +[id0] trainer/bc/test_policy_loss 31.4472 +[id0] expl/num steps total 500 +[id0] expl/num paths total 10 +[id0] expl/path length Mean 50 +[id0] expl/path length Std 0 +[id0] expl/path length Max 50 +[id0] expl/path length Min 50 +[id0] expl/Rewards Mean -0.113284 +[id0] expl/Rewards Std 0.00896539 +[id0] expl/Rewards Max -0.104318 +[id0] expl/Rewards Min -0.122249 +[id0] expl/Returns Mean -5.66419 +[id0] expl/Returns Std 0.448269 +[id0] expl/Returns Max -5.21592 +[id0] expl/Returns Min -6.11246 +[id0] expl/Actions Mean -0.0486082 +[id0] expl/Actions Std 0.417547 +[id0] expl/Actions Max 0.674142 +[id0] expl/Actions Min -0.728117 +[id0] expl/Num Paths 2 +[id0] expl/Average Returns -5.66419 +[id0] expl/hand_distance Mean 0.150248 +[id0] expl/hand_distance Std 0.0805957 +[id0] expl/hand_distance Max 0.233022 +[id0] expl/hand_distance Min 0.0281943 +[id0] expl/Final hand_distance Mean 0.162135 +[id0] expl/Final hand_distance Std 0.0708874 +[id0] expl/Final hand_distance Max 0.233022 +[id0] expl/Final hand_distance Min 0.0912476 +[id0] expl/puck_distance Mean 0.113284 +[id0] expl/puck_distance Std 0.00896539 +[id0] expl/puck_distance Max 0.122249 +[id0] expl/puck_distance Min 0.104318 +[id0] expl/Final puck_distance Mean 0.113284 +[id0] expl/Final puck_distance Std 0.00896559 +[id0] expl/Final puck_distance Max 0.122249 +[id0] expl/Final puck_distance Min 0.104318 +[id0] expl/state_distance Mean 0.193878 +[id0] expl/state_distance Std 0.0662949 +[id0] expl/state_distance Max 0.263143 +[id0] expl/state_distance Min 0.108062 +[id0] 
expl/Final state_distance Mean 0.200869 +[id0] expl/Final state_distance Std 0.0622744 +[id0] expl/Final state_distance Max 0.263143 +[id0] expl/Final state_distance Min 0.138594 +[id0] expl/touch_distance Mean 0.320496 +[id0] expl/touch_distance Std 0.0821977 +[id0] expl/touch_distance Max 0.413936 +[id0] expl/touch_distance Min 0.172705 +[id0] expl/Final touch_distance Mean 0.330567 +[id0] expl/Final touch_distance Std 0.0833687 +[id0] expl/Final touch_distance Max 0.413936 +[id0] expl/Final touch_distance Min 0.247198 +[id0] expl/hand_success Mean 0.17 +[id0] expl/hand_success Std 0.375633 +[id0] expl/hand_success Max 1 +[id0] expl/hand_success Min 0 +[id0] expl/Final hand_success Mean 0 +[id0] expl/Final hand_success Std 0 +[id0] expl/Final hand_success Max 0 +[id0] expl/Final hand_success Min 0 +[id0] expl/hand_success_2 Mean 0.2 +[id0] expl/hand_success_2 Std 0.4 +[id0] expl/hand_success_2 Max 1 +[id0] expl/hand_success_2 Min 0 +[id0] expl/Final hand_success_2 Mean 0 +[id0] expl/Final hand_success_2 Std 0 +[id0] expl/Final hand_success_2 Max 0 +[id0] expl/Final hand_success_2 Min 0 +[id0] expl/hand_success_3 Mean 0.5 +[id0] expl/hand_success_3 Std 0.5 +[id0] expl/hand_success_3 Max 1 +[id0] expl/hand_success_3 Min 0 +[id0] expl/Final hand_success_3 Mean 0.5 +[id0] expl/Final hand_success_3 Std 0.5 +[id0] expl/Final hand_success_3 Max 1 +[id0] expl/Final hand_success_3 Min 0 +[id0] expl/puck_success Mean 0 +[id0] expl/puck_success Std 0 +[id0] expl/puck_success Max 0 +[id0] expl/puck_success Min 0 +[id0] expl/Final puck_success Mean 0 +[id0] expl/Final puck_success Std 0 +[id0] expl/Final puck_success Max 0 +[id0] expl/Final puck_success Min 0 +[id0] expl/puck_success_2 Mean 0 +[id0] expl/puck_success_2 Std 0 +[id0] expl/puck_success_2 Max 0 +[id0] expl/puck_success_2 Min 0 +[id0] expl/Final puck_success_2 Mean 0 +[id0] expl/Final puck_success_2 Std 0 +[id0] expl/Final puck_success_2 Max 0 +[id0] expl/Final puck_success_2 Min 0 +[id0] expl/puck_success_3 Mean 0.5 +[id0] expl/puck_success_3 Std 0.5 +[id0] expl/puck_success_3 Max 1 +[id0] expl/puck_success_3 Min 0 +[id0] expl/Final puck_success_3 Mean 0.5 +[id0] expl/Final puck_success_3 Std 0.5 +[id0] expl/Final puck_success_3 Max 1 +[id0] expl/Final puck_success_3 Min 0 +[id0] expl/hand_and_puck_success Mean 0 +[id0] expl/hand_and_puck_success Std 0 +[id0] expl/hand_and_puck_success Max 0 +[id0] expl/hand_and_puck_success Min 0 +[id0] expl/Final hand_and_puck_success Mean 0 +[id0] expl/Final hand_and_puck_success Std 0 +[id0] expl/Final hand_and_puck_success Max 0 +[id0] expl/Final hand_and_puck_success Min 0 +[id0] expl/hand_and_puck_success_2 Mean 0 +[id0] expl/hand_and_puck_success_2 Std 0 +[id0] expl/hand_and_puck_success_2 Max 0 +[id0] expl/hand_and_puck_success_2 Min 0 +[id0] expl/Final hand_and_puck_success_2 Mean 0 +[id0] expl/Final hand_and_puck_success_2 Std 0 +[id0] expl/Final hand_and_puck_success_2 Max 0 +[id0] expl/Final hand_and_puck_success_2 Min 0 +[id0] expl/hand_and_puck_success_3 Mean 0 +[id0] expl/hand_and_puck_success_3 Std 0 +[id0] expl/hand_and_puck_success_3 Max 0 +[id0] expl/hand_and_puck_success_3 Min 0 +[id0] expl/Final hand_and_puck_success_3 Mean 0 +[id0] expl/Final hand_and_puck_success_3 Std 0 +[id0] expl/Final hand_and_puck_success_3 Max 0 +[id0] expl/Final hand_and_puck_success_3 Min 0 +[id0] expl/state_success Mean 0 +[id0] expl/state_success Std 0 +[id0] expl/state_success Max 0 +[id0] expl/state_success Min 0 +[id0] expl/Final state_success Mean 0 +[id0] expl/Final state_success Std 0 +[id0] 
expl/Final state_success Max 0 +[id0] expl/Final state_success Min 0 +[id0] expl/state_success_2 Mean 0 +[id0] expl/state_success_2 Std 0 +[id0] expl/state_success_2 Max 0 +[id0] expl/state_success_2 Min 0 +[id0] expl/Final state_success_2 Mean 0 +[id0] expl/Final state_success_2 Std 0 +[id0] expl/Final state_success_2 Max 0 +[id0] expl/Final state_success_2 Min 0 +[id0] expl/state_success_3 Mean 0.17 +[id0] expl/state_success_3 Std 0.375633 +[id0] expl/state_success_3 Max 1 +[id0] expl/state_success_3 Min 0 +[id0] expl/Final state_success_3 Mean 0 +[id0] expl/Final state_success_3 Std 0 +[id0] expl/Final state_success_3 Max 0 +[id0] expl/Final state_success_3 Min 0 +[id0] expl/touch_success Mean 0 +[id0] expl/touch_success Std 0 +[id0] expl/touch_success Max 0 +[id0] expl/touch_success Min 0 +[id0] expl/Final touch_success Mean 0 +[id0] expl/Final touch_success Std 0 +[id0] expl/Final touch_success Max 0 +[id0] expl/Final touch_success Min 0 +[id0] expl/touch_success_2 Mean 0 +[id0] expl/touch_success_2 Std 0 +[id0] expl/touch_success_2 Max 0 +[id0] expl/touch_success_2 Min 0 +[id0] expl/Final touch_success_2 Mean 0 +[id0] expl/Final touch_success_2 Std 0 +[id0] expl/Final touch_success_2 Max 0 +[id0] expl/Final touch_success_2 Min 0 +[id0] expl/touch_success_3 Mean 0 +[id0] expl/touch_success_3 Std 0 +[id0] expl/touch_success_3 Max 0 +[id0] expl/touch_success_3 Min 0 +[id0] expl/Final touch_success_3 Mean 0 +[id0] expl/Final touch_success_3 Std 0 +[id0] expl/Final touch_success_3 Max 0 +[id0] expl/Final touch_success_3 Min 0 +[id0] eval/num steps total 400 +[id0] eval/num paths total 8 +[id0] eval/path length Mean 50 +[id0] eval/path length Std 0 +[id0] eval/path length Max 50 +[id0] eval/path length Min 50 +[id0] eval/Rewards Mean -0.127591 +[id0] eval/Rewards Std 0.0872006 +[id0] eval/Rewards Max -0.0403897 +[id0] eval/Rewards Min -0.214791 +[id0] eval/Returns Mean -6.37953 +[id0] eval/Returns Std 4.36003 +[id0] eval/Returns Max -2.0195 +[id0] eval/Returns Min -10.7396 +[id0] eval/Actions Mean -0.0376967 +[id0] eval/Actions Std 0.39848 +[id0] eval/Actions Max 0.365947 +[id0] eval/Actions Min -0.442161 +[id0] eval/Num Paths 2 +[id0] eval/Average Returns -6.37953 +[id0] eval/hand_distance Mean 0.132352 +[id0] eval/hand_distance Std 0.0460267 +[id0] eval/hand_distance Max 0.18476 +[id0] eval/hand_distance Min 0.000587539 +[id0] eval/Final hand_distance Mean 0.163061 +[id0] eval/Final hand_distance Std 0.0216984 +[id0] eval/Final hand_distance Max 0.18476 +[id0] eval/Final hand_distance Min 0.141363 +[id0] eval/puck_distance Mean 0.127591 +[id0] eval/puck_distance Std 0.0872006 +[id0] eval/puck_distance Max 0.214791 +[id0] eval/puck_distance Min 0.0403897 +[id0] eval/Final puck_distance Mean 0.127591 +[id0] eval/Final puck_distance Std 0.0872005 +[id0] eval/Final puck_distance Max 0.214791 +[id0] eval/Final puck_distance Min 0.0403904 +[id0] eval/state_distance Mean 0.194302 +[id0] eval/state_distance Std 0.0759312 +[id0] eval/state_distance Max 0.269184 +[id0] eval/state_distance Min 0.0403941 +[id0] eval/Final state_distance Mean 0.223129 +[id0] eval/Final state_distance Std 0.0340064 +[id0] eval/Final state_distance Max 0.257136 +[id0] eval/Final state_distance Min 0.189123 +[id0] eval/touch_distance Mean 0.242917 +[id0] eval/touch_distance Std 0.0917106 +[id0] eval/touch_distance Max 0.337246 +[id0] eval/touch_distance Min 0.0830543 +[id0] eval/Final touch_distance Mean 0.275227 +[id0] eval/Final touch_distance Std 0.0620179 +[id0] eval/Final touch_distance Max 0.337245 +[id0] 
eval/Final touch_distance Min 0.213209 +[id0] eval/hand_success Mean 0.11 +[id0] eval/hand_success Std 0.31289 +[id0] eval/hand_success Max 1 +[id0] eval/hand_success Min 0 +[id0] eval/Final hand_success Mean 0 +[id0] eval/Final hand_success Std 0 +[id0] eval/Final hand_success Max 0 +[id0] eval/Final hand_success Min 0 +[id0] eval/hand_success_2 Mean 0.16 +[id0] eval/hand_success_2 Std 0.366606 +[id0] eval/hand_success_2 Max 1 +[id0] eval/hand_success_2 Min 0 +[id0] eval/Final hand_success_2 Mean 0 +[id0] eval/Final hand_success_2 Std 0 +[id0] eval/Final hand_success_2 Max 0 +[id0] eval/Final hand_success_2 Min 0 +[id0] eval/hand_success_3 Mean 0.24 +[id0] eval/hand_success_3 Std 0.427083 +[id0] eval/hand_success_3 Max 1 +[id0] eval/hand_success_3 Min 0 +[id0] eval/Final hand_success_3 Mean 0 +[id0] eval/Final hand_success_3 Std 0 +[id0] eval/Final hand_success_3 Max 0 +[id0] eval/Final hand_success_3 Min 0 +[id0] eval/puck_success Mean 0.5 +[id0] eval/puck_success Std 0.5 +[id0] eval/puck_success Max 1 +[id0] eval/puck_success Min 0 +[id0] eval/Final puck_success Mean 0.5 +[id0] eval/Final puck_success Std 0.5 +[id0] eval/Final puck_success Max 1 +[id0] eval/Final puck_success Min 0 +[id0] eval/puck_success_2 Mean 0.5 +[id0] eval/puck_success_2 Std 0.5 +[id0] eval/puck_success_2 Max 1 +[id0] eval/puck_success_2 Min 0 +[id0] eval/Final puck_success_2 Mean 0.5 +[id0] eval/Final puck_success_2 Std 0.5 +[id0] eval/Final puck_success_2 Max 1 +[id0] eval/Final puck_success_2 Min 0 +[id0] eval/puck_success_3 Mean 0.5 +[id0] eval/puck_success_3 Std 0.5 +[id0] eval/puck_success_3 Max 1 +[id0] eval/puck_success_3 Min 0 +[id0] eval/Final puck_success_3 Mean 0.5 +[id0] eval/Final puck_success_3 Std 0.5 +[id0] eval/Final puck_success_3 Max 1 +[id0] eval/Final puck_success_3 Min 0 +[id0] eval/hand_and_puck_success Mean 0.03 +[id0] eval/hand_and_puck_success Std 0.170587 +[id0] eval/hand_and_puck_success Max 1 +[id0] eval/hand_and_puck_success Min 0 +[id0] eval/Final hand_and_puck_success Mean 0 +[id0] eval/Final hand_and_puck_success Std 0 +[id0] eval/Final hand_and_puck_success Max 0 +[id0] eval/Final hand_and_puck_success Min 0 +[id0] eval/hand_and_puck_success_2 Mean 0.07 +[id0] eval/hand_and_puck_success_2 Std 0.255147 +[id0] eval/hand_and_puck_success_2 Max 1 +[id0] eval/hand_and_puck_success_2 Min 0 +[id0] eval/Final hand_and_puck_success_2 Mean 0 +[id0] eval/Final hand_and_puck_success_2 Std 0 +[id0] eval/Final hand_and_puck_success_2 Max 0 +[id0] eval/Final hand_and_puck_success_2 Min 0 +[id0] eval/hand_and_puck_success_3 Mean 0.16 +[id0] eval/hand_and_puck_success_3 Std 0.366606 +[id0] eval/hand_and_puck_success_3 Max 1 +[id0] eval/hand_and_puck_success_3 Min 0 +[id0] eval/Final hand_and_puck_success_3 Mean 0 +[id0] eval/Final hand_and_puck_success_3 Std 0 +[id0] eval/Final hand_and_puck_success_3 Max 0 +[id0] eval/Final hand_and_puck_success_3 Min 0 +[id0] eval/state_success Mean 0.08 +[id0] eval/state_success Std 0.271293 +[id0] eval/state_success Max 1 +[id0] eval/state_success Min 0 +[id0] eval/Final state_success Mean 0 +[id0] eval/Final state_success Std 0 +[id0] eval/Final state_success Max 0 +[id0] eval/Final state_success Min 0 +[id0] eval/state_success_2 Mean 0.14 +[id0] eval/state_success_2 Std 0.346987 +[id0] eval/state_success_2 Max 1 +[id0] eval/state_success_2 Min 0 +[id0] eval/Final state_success_2 Mean 0 +[id0] eval/Final state_success_2 Std 0 +[id0] eval/Final state_success_2 Max 0 +[id0] eval/Final state_success_2 Min 0 +[id0] eval/state_success_3 Mean 0.23 +[id0] 
eval/state_success_3 Std 0.420833 +[id0] eval/state_success_3 Max 1 +[id0] eval/state_success_3 Min 0 +[id0] eval/Final state_success_3 Mean 0 +[id0] eval/Final state_success_3 Std 0 +[id0] eval/Final state_success_3 Max 0 +[id0] eval/Final state_success_3 Min 0 +[id0] eval/touch_success Mean 0 +[id0] eval/touch_success Std 0 +[id0] eval/touch_success Max 0 +[id0] eval/touch_success Min 0 +[id0] eval/Final touch_success Mean 0 +[id0] eval/Final touch_success Std 0 +[id0] eval/Final touch_success Max 0 +[id0] eval/Final touch_success Min 0 +[id0] eval/touch_success_2 Mean 0 +[id0] eval/touch_success_2 Std 0 +[id0] eval/touch_success_2 Max 0 +[id0] eval/touch_success_2 Min 0 +[id0] eval/Final touch_success_2 Mean 0 +[id0] eval/Final touch_success_2 Std 0 +[id0] eval/Final touch_success_2 Max 0 +[id0] eval/Final touch_success_2 Min 0 +[id0] eval/touch_success_3 Mean 0.15 +[id0] eval/touch_success_3 Std 0.357071 +[id0] eval/touch_success_3 Max 1 +[id0] eval/touch_success_3 Min 0 +[id0] eval/Final touch_success_3 Mean 0 +[id0] eval/Final touch_success_3 Std 0 +[id0] eval/Final touch_success_3 Max 0 +[id0] eval/Final touch_success_3 Min 0 +[id0] time/epoch_time (s) 3.31187 +[id0] time/evaluation sampling (s) 1.55201 +[id0] time/exploration sampling (s) 1.53991 +[id0] time/global_time (s) 31.5589 +[id0] time/replay buffer data storing (s) 0.000349522 +[id0] time/saving (s) 2.47955e-05 +[id0] time/training (s) 0.208356 +[id0] epoch 3 +[id0] --------------------------------------- --------------- +[id0] --------------------------------------- --------------- +[id0] trainer/num train calls 50 +[id0] trainer/QF1 Loss 0.0135741 +[id0] trainer/QF2 Loss 0.0132348 +[id0] trainer/Policy Loss -1.0125 +[id0] trainer/Q1 Predictions Mean -0.115927 +[id0] trainer/Q1 Predictions Std 0.015483 +[id0] trainer/Q1 Predictions Max -0.0978391 +[id0] trainer/Q1 Predictions Min -0.137299 +[id0] trainer/Q2 Predictions Mean -0.11971 +[id0] trainer/Q2 Predictions Std 0.0153614 +[id0] trainer/Q2 Predictions Max -0.099684 +[id0] trainer/Q2 Predictions Min -0.141439 +[id0] trainer/Q Targets Mean -0.124712 +[id0] trainer/Q Targets Std 0.111004 +[id0] trainer/Q Targets Max -0.00466002 +[id0] trainer/Q Targets Min -0.280714 +[id0] trainer/Log Pis Mean -1.13064 +[id0] trainer/Log Pis Std 1.98296 +[id0] trainer/Log Pis Max 0.0934451 +[id0] trainer/Log Pis Min -5.06595 +[id0] trainer/rewards Mean -0.120223 +[id0] trainer/rewards Std 0.111247 +[id0] trainer/rewards Max -8.42937e-08 +[id0] trainer/rewards Min -0.276419 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 90600 +[id0] trainer/policy/mean Mean -0.0841174 +[id0] trainer/policy/mean Std 0.459796 +[id0] trainer/policy/mean Max 0.393837 +[id0] trainer/policy/mean Min -0.566948 +[id0] trainer/policy/std Mean 0.343178 +[id0] trainer/policy/std Std 0.0279913 +[id0] trainer/policy/std Max 0.393021 +[id0] trainer/policy/std Min 0.302896 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.392735 +[id0] trainer/Advantage Weights Max 0.985392 +[id0] trainer/Advantage Weights Min 7.05268e-16 +[id0] trainer/Advantage Score Mean 0.00222292 +[id0] trainer/Advantage Score Std 0.0124918 +[id0] trainer/Advantage Score Max 0.0155132 +[id0] trainer/Advantage Score Min -0.0193601 +[id0] trainer/bc/Train Logprob Loss 6.82 +[id0] trainer/bc/Test Logprob Loss 5.87462 +[id0] trainer/bc/Train MSE 1.0784 +[id0] trainer/bc/Test MSE 1.02518 +[id0] 
trainer/bc/train_policy_loss 6.82 +[id0] trainer/bc/test_policy_loss 5.87462 +[id0] expl/num steps total 600 +[id0] expl/num paths total 12 +[id0] expl/path length Mean 50 +[id0] expl/path length Std 0 +[id0] expl/path length Max 50 +[id0] expl/path length Min 50 +[id0] expl/Rewards Mean -0.195256 +[id0] expl/Rewards Std 0.107581 +[id0] expl/Rewards Max -0.0480694 +[id0] expl/Rewards Min -0.301743 +[id0] expl/Returns Mean -9.76281 +[id0] expl/Returns Std 5.32436 +[id0] expl/Returns Max -4.43845 +[id0] expl/Returns Min -15.0872 +[id0] expl/Actions Mean -0.0733707 +[id0] expl/Actions Std 0.62436 +[id0] expl/Actions Max 1.42151 +[id0] expl/Actions Min -1.46905 +[id0] expl/Num Paths 2 +[id0] expl/Average Returns -9.76281 +[id0] expl/hand_distance Mean 0.158642 +[id0] expl/hand_distance Std 0.0608491 +[id0] expl/hand_distance Max 0.245079 +[id0] expl/hand_distance Min 0.0291545 +[id0] expl/Final hand_distance Mean 0.197413 +[id0] expl/Final hand_distance Std 0.0476639 +[id0] expl/Final hand_distance Max 0.245077 +[id0] expl/Final hand_distance Min 0.149749 +[id0] expl/puck_distance Mean 0.195256 +[id0] expl/puck_distance Std 0.107581 +[id0] expl/puck_distance Max 0.301743 +[id0] expl/puck_distance Min 0.0480694 +[id0] expl/Final puck_distance Mean 0.20168 +[id0] expl/Final puck_distance Std 0.100063 +[id0] expl/Final puck_distance Max 0.301743 +[id0] expl/Final puck_distance Min 0.101617 +[id0] expl/state_distance Mean 0.256775 +[id0] expl/state_distance Std 0.112407 +[id0] expl/state_distance Max 0.388732 +[id0] expl/state_distance Min 0.0562197 +[id0] expl/Final state_distance Mean 0.284851 +[id0] expl/Final state_distance Std 0.10388 +[id0] expl/Final state_distance Max 0.388731 +[id0] expl/Final state_distance Min 0.180972 +[id0] expl/touch_distance Mean 0.105648 +[id0] expl/touch_distance Std 0.0463963 +[id0] expl/touch_distance Max 0.261491 +[id0] expl/touch_distance Min 0.0616572 +[id0] expl/Final touch_distance Mean 0.0855933 +[id0] expl/Final touch_distance Std 0.00990345 +[id0] expl/Final touch_distance Max 0.0954967 +[id0] expl/Final touch_distance Min 0.0756899 +[id0] expl/hand_success Mean 0.06 +[id0] expl/hand_success Std 0.237487 +[id0] expl/hand_success Max 1 +[id0] expl/hand_success Min 0 +[id0] expl/Final hand_success Mean 0 +[id0] expl/Final hand_success Std 0 +[id0] expl/Final hand_success Max 0 +[id0] expl/Final hand_success Min 0 +[id0] expl/hand_success_2 Mean 0.11 +[id0] expl/hand_success_2 Std 0.31289 +[id0] expl/hand_success_2 Max 1 +[id0] expl/hand_success_2 Min 0 +[id0] expl/Final hand_success_2 Mean 0 +[id0] expl/Final hand_success_2 Std 0 +[id0] expl/Final hand_success_2 Max 0 +[id0] expl/Final hand_success_2 Min 0 +[id0] expl/hand_success_3 Mean 0.22 +[id0] expl/hand_success_3 Std 0.414246 +[id0] expl/hand_success_3 Max 1 +[id0] expl/hand_success_3 Min 0 +[id0] expl/Final hand_success_3 Mean 0 +[id0] expl/Final hand_success_3 Std 0 +[id0] expl/Final hand_success_3 Max 0 +[id0] expl/Final hand_success_3 Min 0 +[id0] expl/puck_success Mean 0.1 +[id0] expl/puck_success Std 0.3 +[id0] expl/puck_success Max 1 +[id0] expl/puck_success Min 0 +[id0] expl/Final puck_success Mean 0 +[id0] expl/Final puck_success Std 0 +[id0] expl/Final puck_success Max 0 +[id0] expl/Final puck_success Min 0 +[id0] expl/puck_success_2 Mean 0.12 +[id0] expl/puck_success_2 Std 0.324962 +[id0] expl/puck_success_2 Max 1 +[id0] expl/puck_success_2 Min 0 +[id0] expl/Final puck_success_2 Mean 0 +[id0] expl/Final puck_success_2 Std 0 +[id0] expl/Final puck_success_2 Max 0 +[id0] expl/Final 
puck_success_2 Min 0 +[id0] expl/puck_success_3 Mean 0.5 +[id0] expl/puck_success_3 Std 0.5 +[id0] expl/puck_success_3 Max 1 +[id0] expl/puck_success_3 Min 0 +[id0] expl/Final puck_success_3 Mean 0.5 +[id0] expl/Final puck_success_3 Std 0.5 +[id0] expl/Final puck_success_3 Max 1 +[id0] expl/Final puck_success_3 Min 0 +[id0] expl/hand_and_puck_success Mean 0 +[id0] expl/hand_and_puck_success Std 0 +[id0] expl/hand_and_puck_success Max 0 +[id0] expl/hand_and_puck_success Min 0 +[id0] expl/Final hand_and_puck_success Mean 0 +[id0] expl/Final hand_and_puck_success Std 0 +[id0] expl/Final hand_and_puck_success Max 0 +[id0] expl/Final hand_and_puck_success Min 0 +[id0] expl/hand_and_puck_success_2 Mean 0.03 +[id0] expl/hand_and_puck_success_2 Std 0.170587 +[id0] expl/hand_and_puck_success_2 Max 1 +[id0] expl/hand_and_puck_success_2 Min 0 +[id0] expl/Final hand_and_puck_success_2 Mean 0 +[id0] expl/Final hand_and_puck_success_2 Std 0 +[id0] expl/Final hand_and_puck_success_2 Max 0 +[id0] expl/Final hand_and_puck_success_2 Min 0 +[id0] expl/hand_and_puck_success_3 Mean 0.08 +[id0] expl/hand_and_puck_success_3 Std 0.271293 +[id0] expl/hand_and_puck_success_3 Max 1 +[id0] expl/hand_and_puck_success_3 Min 0 +[id0] expl/Final hand_and_puck_success_3 Mean 0 +[id0] expl/Final hand_and_puck_success_3 Std 0 +[id0] expl/Final hand_and_puck_success_3 Max 0 +[id0] expl/Final hand_and_puck_success_3 Min 0 +[id0] expl/state_success Mean 0.04 +[id0] expl/state_success Std 0.195959 +[id0] expl/state_success Max 1 +[id0] expl/state_success Min 0 +[id0] expl/Final state_success Mean 0 +[id0] expl/Final state_success Std 0 +[id0] expl/Final state_success Max 0 +[id0] expl/Final state_success Min 0 +[id0] expl/state_success_2 Mean 0.06 +[id0] expl/state_success_2 Std 0.237487 +[id0] expl/state_success_2 Max 1 +[id0] expl/state_success_2 Min 0 +[id0] expl/Final state_success_2 Mean 0 +[id0] expl/Final state_success_2 Std 0 +[id0] expl/Final state_success_2 Max 0 +[id0] expl/Final state_success_2 Min 0 +[id0] expl/state_success_3 Mean 0.13 +[id0] expl/state_success_3 Std 0.336303 +[id0] expl/state_success_3 Max 1 +[id0] expl/state_success_3 Min 0 +[id0] expl/Final state_success_3 Mean 0 +[id0] expl/Final state_success_3 Std 0 +[id0] expl/Final state_success_3 Max 0 +[id0] expl/Final state_success_3 Min 0 +[id0] expl/touch_success Mean 0 +[id0] expl/touch_success Std 0 +[id0] expl/touch_success Max 0 +[id0] expl/touch_success Min 0 +[id0] expl/Final touch_success Mean 0 +[id0] expl/Final touch_success Std 0 +[id0] expl/Final touch_success Max 0 +[id0] expl/Final touch_success Min 0 +[id0] expl/touch_success_2 Mean 0.42 +[id0] expl/touch_success_2 Std 0.493559 +[id0] expl/touch_success_2 Max 1 +[id0] expl/touch_success_2 Min 0 +[id0] expl/Final touch_success_2 Mean 0.5 +[id0] expl/Final touch_success_2 Std 0.5 +[id0] expl/Final touch_success_2 Max 1 +[id0] expl/Final touch_success_2 Min 0 +[id0] expl/touch_success_3 Mean 0.77 +[id0] expl/touch_success_3 Std 0.420833 +[id0] expl/touch_success_3 Max 1 +[id0] expl/touch_success_3 Min 0 +[id0] expl/Final touch_success_3 Mean 1 +[id0] expl/Final touch_success_3 Std 0 +[id0] expl/Final touch_success_3 Max 1 +[id0] expl/Final touch_success_3 Min 1 +[id0] eval/num steps total 500 +[id0] eval/num paths total 10 +[id0] eval/path length Mean 50 +[id0] eval/path length Std 0 +[id0] eval/path length Max 50 +[id0] eval/path length Min 50 +[id0] eval/Rewards Mean -0.197671 +[id0] eval/Rewards Std 0.043969 +[id0] eval/Rewards Max -0.157676 +[id0] eval/Rewards Min -0.25822 +[id0] 
eval/Returns Mean -9.88357 +[id0] eval/Returns Std 1.99975 +[id0] eval/Returns Max -7.88382 +[id0] eval/Returns Min -11.8833 +[id0] eval/Actions Mean -0.0890177 +[id0] eval/Actions Std 0.504219 +[id0] eval/Actions Max 0.420863 +[id0] eval/Actions Min -0.600135 +[id0] eval/Num Paths 2 +[id0] eval/Average Returns -9.88357 +[id0] eval/hand_distance Mean 0.113758 +[id0] eval/hand_distance Std 0.0444896 +[id0] eval/hand_distance Max 0.272936 +[id0] eval/hand_distance Min 0.0675899 +[id0] eval/Final hand_distance Mean 0.0937614 +[id0] eval/Final hand_distance Std 0.0240546 +[id0] eval/Final hand_distance Max 0.117816 +[id0] eval/Final hand_distance Min 0.0697068 +[id0] eval/puck_distance Mean 0.197671 +[id0] eval/puck_distance Std 0.043969 +[id0] eval/puck_distance Max 0.25822 +[id0] eval/puck_distance Min 0.157676 +[id0] eval/Final puck_distance Mean 0.207933 +[id0] eval/Final puck_distance Std 0.0502558 +[id0] eval/Final puck_distance Max 0.258188 +[id0] eval/Final puck_distance Min 0.157677 +[id0] eval/state_distance Mean 0.233133 +[id0] eval/state_distance Std 0.0397063 +[id0] eval/state_distance Max 0.339762 +[id0] eval/state_distance Min 0.193193 +[id0] eval/Final state_distance Mean 0.232132 +[id0] eval/Final state_distance Std 0.0353007 +[id0] eval/Final state_distance Max 0.267433 +[id0] eval/Final state_distance Min 0.196831 +[id0] eval/touch_distance Mean 0.152677 +[id0] eval/touch_distance Std 0.0686253 +[id0] eval/touch_distance Max 0.229003 +[id0] eval/touch_distance Min 0.0658695 +[id0] eval/Final touch_distance Mean 0.139983 +[id0] eval/Final touch_distance Std 0.0722409 +[id0] eval/Final touch_distance Max 0.212224 +[id0] eval/Final touch_distance Min 0.0677426 +[id0] eval/hand_success Mean 0 +[id0] eval/hand_success Std 0 +[id0] eval/hand_success Max 0 +[id0] eval/hand_success Min 0 +[id0] eval/Final hand_success Mean 0 +[id0] eval/Final hand_success Std 0 +[id0] eval/Final hand_success Max 0 +[id0] eval/Final hand_success Min 0 +[id0] eval/hand_success_2 Mean 0.31 +[id0] eval/hand_success_2 Std 0.462493 +[id0] eval/hand_success_2 Max 1 +[id0] eval/hand_success_2 Min 0 +[id0] eval/Final hand_success_2 Mean 0.5 +[id0] eval/Final hand_success_2 Std 0.5 +[id0] eval/Final hand_success_2 Max 1 +[id0] eval/Final hand_success_2 Min 0 +[id0] eval/hand_success_3 Mean 0.8 +[id0] eval/hand_success_3 Std 0.4 +[id0] eval/hand_success_3 Max 1 +[id0] eval/hand_success_3 Min 0 +[id0] eval/Final hand_success_3 Mean 1 +[id0] eval/Final hand_success_3 Std 0 +[id0] eval/Final hand_success_3 Max 1 +[id0] eval/Final hand_success_3 Min 1 +[id0] eval/puck_success Mean 0 +[id0] eval/puck_success Std 0 +[id0] eval/puck_success Max 0 +[id0] eval/puck_success Min 0 +[id0] eval/Final puck_success Mean 0 +[id0] eval/Final puck_success Std 0 +[id0] eval/Final puck_success Max 0 +[id0] eval/Final puck_success Min 0 +[id0] eval/puck_success_2 Mean 0 +[id0] eval/puck_success_2 Std 0 +[id0] eval/puck_success_2 Max 0 +[id0] eval/puck_success_2 Min 0 +[id0] eval/Final puck_success_2 Mean 0 +[id0] eval/Final puck_success_2 Std 0 +[id0] eval/Final puck_success_2 Max 0 +[id0] eval/Final puck_success_2 Min 0 +[id0] eval/puck_success_3 Mean 0 +[id0] eval/puck_success_3 Std 0 +[id0] eval/puck_success_3 Max 0 +[id0] eval/puck_success_3 Min 0 +[id0] eval/Final puck_success_3 Mean 0 +[id0] eval/Final puck_success_3 Std 0 +[id0] eval/Final puck_success_3 Max 0 +[id0] eval/Final puck_success_3 Min 0 +[id0] eval/hand_and_puck_success Mean 0 +[id0] eval/hand_and_puck_success Std 0 +[id0] eval/hand_and_puck_success Max 0 +[id0] 
eval/hand_and_puck_success Min 0 +[id0] eval/Final hand_and_puck_success Mean 0 +[id0] eval/Final hand_and_puck_success Std 0 +[id0] eval/Final hand_and_puck_success Max 0 +[id0] eval/Final hand_and_puck_success Min 0 +[id0] eval/hand_and_puck_success_2 Mean 0 +[id0] eval/hand_and_puck_success_2 Std 0 +[id0] eval/hand_and_puck_success_2 Max 0 +[id0] eval/hand_and_puck_success_2 Min 0 +[id0] eval/Final hand_and_puck_success_2 Mean 0 +[id0] eval/Final hand_and_puck_success_2 Std 0 +[id0] eval/Final hand_and_puck_success_2 Max 0 +[id0] eval/Final hand_and_puck_success_2 Min 0 +[id0] eval/hand_and_puck_success_3 Mean 0 +[id0] eval/hand_and_puck_success_3 Std 0 +[id0] eval/hand_and_puck_success_3 Max 0 +[id0] eval/hand_and_puck_success_3 Min 0 +[id0] eval/Final hand_and_puck_success_3 Mean 0 +[id0] eval/Final hand_and_puck_success_3 Std 0 +[id0] eval/Final hand_and_puck_success_3 Max 0 +[id0] eval/Final hand_and_puck_success_3 Min 0 +[id0] eval/state_success Mean 0 +[id0] eval/state_success Std 0 +[id0] eval/state_success Max 0 +[id0] eval/state_success Min 0 +[id0] eval/Final state_success Mean 0 +[id0] eval/Final state_success Std 0 +[id0] eval/Final state_success Max 0 +[id0] eval/Final state_success Min 0 +[id0] eval/state_success_2 Mean 0 +[id0] eval/state_success_2 Std 0 +[id0] eval/state_success_2 Max 0 +[id0] eval/state_success_2 Min 0 +[id0] eval/Final state_success_2 Mean 0 +[id0] eval/Final state_success_2 Std 0 +[id0] eval/Final state_success_2 Max 0 +[id0] eval/Final state_success_2 Min 0 +[id0] eval/state_success_3 Mean 0 +[id0] eval/state_success_3 Std 0 +[id0] eval/state_success_3 Max 0 +[id0] eval/state_success_3 Min 0 +[id0] eval/Final state_success_3 Mean 0 +[id0] eval/Final state_success_3 Std 0 +[id0] eval/Final state_success_3 Max 0 +[id0] eval/Final state_success_3 Min 0 +[id0] eval/touch_success Mean 0 +[id0] eval/touch_success Std 0 +[id0] eval/touch_success Max 0 +[id0] eval/touch_success Min 0 +[id0] eval/Final touch_success Mean 0 +[id0] eval/Final touch_success Std 0 +[id0] eval/Final touch_success Max 0 +[id0] eval/Final touch_success Min 0 +[id0] eval/touch_success_2 Mean 0.36 +[id0] eval/touch_success_2 Std 0.48 +[id0] eval/touch_success_2 Max 1 +[id0] eval/touch_success_2 Min 0 +[id0] eval/Final touch_success_2 Mean 0.5 +[id0] eval/Final touch_success_2 Std 0.5 +[id0] eval/Final touch_success_2 Max 1 +[id0] eval/Final touch_success_2 Min 0 +[id0] eval/touch_success_3 Mean 0.4 +[id0] eval/touch_success_3 Std 0.489898 +[id0] eval/touch_success_3 Max 1 +[id0] eval/touch_success_3 Min 0 +[id0] eval/Final touch_success_3 Mean 0.5 +[id0] eval/Final touch_success_3 Std 0.5 +[id0] eval/Final touch_success_3 Max 1 +[id0] eval/Final touch_success_3 Min 0 +[id0] time/epoch_time (s) 3.34013 +[id0] time/evaluation sampling (s) 1.55183 +[id0] time/exploration sampling (s) 1.57141 +[id0] time/global_time (s) 34.9261 +[id0] time/replay buffer data storing (s) 0.000357151 +[id0] time/saving (s) 2.64645e-05 +[id0] time/training (s) 0.205462 +[id0] epoch 4 +[id0] --------------------------------------- --------------- +[id0] ------------------------------ --------------- +[id0] trainer/QF1 Loss 36.8715 +[id0] trainer/QF2 Loss 36.8755 +[id0] trainer/Policy Loss 9.28352 +[id0] trainer/Q1 Predictions Mean -0.000248199 +[id0] trainer/Q1 Predictions Std 0.00343049 +[id0] trainer/Q1 Predictions Max 0.0135826 +[id0] trainer/Q1 Predictions Min -0.00886565 +[id0] trainer/Q2 Predictions Mean 0.000417902 +[id0] trainer/Q2 Predictions Std 0.00379635 +[id0] trainer/Q2 Predictions Max 0.0113527 
+[id0] trainer/Q2 Predictions Min -0.0108211 +[id0] trainer/Q Targets Mean 3.5774 +[id0] trainer/Q Targets Std 4.90573 +[id0] trainer/Q Targets Max 12.3087 +[id0] trainer/Q Targets Min -1.71089 +[id0] trainer/Log Pis Mean 9.2814 +[id0] trainer/Log Pis Std 1.9335 +[id0] trainer/Log Pis Max 12.1028 +[id0] trainer/Log Pis Min 1.67644 +[id0] trainer/rewards Mean 3.58356 +[id0] trainer/rewards Std 4.9069 +[id0] trainer/rewards Max 12.316 +[id0] trainer/rewards Min -1.70161 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 89000 +[id0] trainer/policy/mean Mean -0.000101677 +[id0] trainer/policy/mean Std 0.000327078 +[id0] trainer/policy/mean Max 0.000704126 +[id0] trainer/policy/mean Min -0.0014804 +[id0] trainer/policy/std Mean 0.0497871 +[id0] trainer/policy/std Std 3.72529e-09 +[id0] trainer/policy/std Max 0.0497871 +[id0] trainer/policy/std Min 0.0497871 +[id0] trainer/Advantage Weights Mean 0.0078125 +[id0] trainer/Advantage Weights Std 1.97209e-06 +[id0] trainer/Advantage Weights Max 0.00781739 +[id0] trainer/Advantage Weights Min 0.00780739 +[id0] trainer/Advantage Score Mean 5.75556e-05 +[id0] trainer/Advantage Score Std 0.000504857 +[id0] trainer/Advantage Score Max 0.00130917 +[id0] trainer/Advantage Score Min -0.00124966 +[id0] trainer/batch 0 +[id0] trainer/epoch_time 0.0230172 +[id0] ------------------------------ --------------- +[id0] ----------------------------------- --------------- +[id0] trainer/num train calls 10 +[id0] trainer/QF1 Loss 92.1218 +[id0] trainer/QF2 Loss 91.9529 +[id0] trainer/Policy Loss 8.70062 +[id0] trainer/Q1 Predictions Mean 0.243832 +[id0] trainer/Q1 Predictions Std 0.04779 +[id0] trainer/Q1 Predictions Max 0.299389 +[id0] trainer/Q1 Predictions Min 0.179337 +[id0] trainer/Q2 Predictions Mean 0.242781 +[id0] trainer/Q2 Predictions Std 0.0598606 +[id0] trainer/Q2 Predictions Max 0.301608 +[id0] trainer/Q2 Predictions Min 0.137358 +[id0] trainer/Q Targets Mean 8.68214 +[id0] trainer/Q Targets Std 4.60552 +[id0] trainer/Q Targets Max 11.1447 +[id0] trainer/Q Targets Min -0.527146 +[id0] trainer/Log Pis Mean 8.93413 +[id0] trainer/Log Pis Std 1.90035 +[id0] trainer/Log Pis Max 10.6362 +[id0] trainer/Log Pis Min 5.43167 +[id0] trainer/rewards Mean 8.68726 +[id0] trainer/rewards Std 4.60638 +[id0] trainer/rewards Max 11.1469 +[id0] trainer/rewards Min -0.523798 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 89110 +[id0] trainer/policy/mean Mean 0.0501946 +[id0] trainer/policy/mean Std 0.0769959 +[id0] trainer/policy/mean Max 0.164765 +[id0] trainer/policy/mean Min -0.141686 +[id0] trainer/policy/std Mean 0.0500109 +[id0] trainer/policy/std Std 1.7139e-07 +[id0] trainer/policy/std Max 0.0500112 +[id0] trainer/policy/std Min 0.0500107 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.000215922 +[id0] trainer/Advantage Weights Max 0.200264 +[id0] trainer/Advantage Weights Min 0.199614 +[id0] trainer/Advantage Score Mean 0.00193172 +[id0] trainer/Advantage Score Std 0.00216014 +[id0] trainer/Advantage Score Max 0.00457418 +[id0] trainer/Advantage Score Min -0.00193453 +[id0] expl/num steps total 110 +[id0] expl/num paths total 2 +[id0] expl/path length Mean 100 +[id0] expl/path length Std 0 +[id0] expl/path length Max 100 +[id0] expl/path length Min 100 +[id0] expl/Rewards Mean -0.0213552 +[id0] expl/Rewards Std 
0.103067 +[id0] expl/Rewards Max 0.154351 +[id0] expl/Rewards Min -0.358819 +[id0] expl/Returns Mean -2.13552 +[id0] expl/Returns Std 0 +[id0] expl/Returns Max -2.13552 +[id0] expl/Returns Min -2.13552 +[id0] expl/Actions Mean 0.00360301 +[id0] expl/Actions Std 0.0495008 +[id0] expl/Actions Max 0.156353 +[id0] expl/Actions Min -0.179061 +[id0] expl/Num Paths 1 +[id0] expl/Average Returns -2.13552 +[id0] eval/num steps total 0 +[id0] eval/num paths total 0 +[id0] time/epoch_time (s) 0.35254 +[id0] time/evaluation sampling (s) 0.0868323 +[id0] time/exploration sampling (s) 0.0838516 +[id0] time/global_time (s) 6.16777 +[id0] time/replay buffer data storing (s) 0.000583887 +[id0] time/saving (s) 0.0111945 +[id0] time/training (s) 0.157702 +[id0] epoch 0 +[id0] ----------------------------------- --------------- +[id0] ----------------------------------- --------------- +[id0] trainer/num train calls 20 +[id0] trainer/QF1 Loss 57.5256 +[id0] trainer/QF2 Loss 57.7265 +[id0] trainer/Policy Loss 9.03843 +[id0] trainer/Q1 Predictions Mean 0.610775 +[id0] trainer/Q1 Predictions Std 0.183916 +[id0] trainer/Q1 Predictions Max 0.854407 +[id0] trainer/Q1 Predictions Min 0.396609 +[id0] trainer/Q2 Predictions Mean 0.610474 +[id0] trainer/Q2 Predictions Std 0.159908 +[id0] trainer/Q2 Predictions Max 0.800834 +[id0] trainer/Q2 Predictions Min 0.367608 +[id0] trainer/Q Targets Mean 6.21657 +[id0] trainer/Q Targets Std 5.26535 +[id0] trainer/Q Targets Max 11.7897 +[id0] trainer/Q Targets Min -0.836062 +[id0] trainer/Log Pis Mean 9.60988 +[id0] trainer/Log Pis Std 0.939456 +[id0] trainer/Log Pis Max 11.2394 +[id0] trainer/Log Pis Min 8.45266 +[id0] trainer/rewards Mean 6.21509 +[id0] trainer/rewards Std 5.26596 +[id0] trainer/rewards Max 11.7865 +[id0] trainer/rewards Min -0.839532 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 89210 +[id0] trainer/policy/mean Mean 0.1687 +[id0] trainer/policy/mean Std 0.298864 +[id0] trainer/policy/mean Max 0.587666 +[id0] trainer/policy/mean Min -0.548678 +[id0] trainer/policy/std Mean 0.0502259 +[id0] trainer/policy/std Std 2.68281e-06 +[id0] trainer/policy/std Max 0.0502315 +[id0] trainer/policy/std Min 0.0502227 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.000468149 +[id0] trainer/Advantage Weights Max 0.200907 +[id0] trainer/Advantage Weights Min 0.199639 +[id0] trainer/Advantage Score Mean 0.00256994 +[id0] trainer/Advantage Score Std 0.00467449 +[id0] trainer/Advantage Score Max 0.0116208 +[id0] trainer/Advantage Score Min -0.00104058 +[id0] expl/num steps total 210 +[id0] expl/num paths total 3 +[id0] expl/path length Mean 100 +[id0] expl/path length Std 0 +[id0] expl/path length Max 100 +[id0] expl/path length Min 100 +[id0] expl/Rewards Mean -0.00513658 +[id0] expl/Rewards Std 0.0732153 +[id0] expl/Rewards Max 0.176331 +[id0] expl/Rewards Min -0.15102 +[id0] expl/Returns Mean -0.513658 +[id0] expl/Returns Std 0 +[id0] expl/Returns Max -0.513658 +[id0] expl/Returns Min -0.513658 +[id0] expl/Actions Mean 0.0171286 +[id0] expl/Actions Std 0.0538859 +[id0] expl/Actions Max 0.170002 +[id0] expl/Actions Min -0.128606 +[id0] expl/Num Paths 1 +[id0] expl/Average Returns -0.513658 +[id0] eval/num steps total 0 +[id0] eval/num paths total 0 +[id0] time/epoch_time (s) 0.328234 +[id0] time/evaluation sampling (s) 0.0877585 +[id0] time/exploration sampling (s) 0.0837185 +[id0] time/global_time (s) 6.49861 +[id0] time/replay buffer 
data storing (s) 0.000593662 +[id0] time/saving (s) 1.52588e-05 +[id0] time/training (s) 0.155221 +[id0] epoch 1 +[id0] ----------------------------------- --------------- +[id0] ----------------------------------- --------------- +[id0] trainer/num train calls 30 +[id0] trainer/QF1 Loss 27.1734 +[id0] trainer/QF2 Loss 27.0553 +[id0] trainer/Policy Loss 9.20581 +[id0] trainer/Q1 Predictions Mean 1.22454 +[id0] trainer/Q1 Predictions Std 0.446581 +[id0] trainer/Q1 Predictions Max 1.82785 +[id0] trainer/Q1 Predictions Min 0.645031 +[id0] trainer/Q2 Predictions Mean 1.17344 +[id0] trainer/Q2 Predictions Std 0.463408 +[id0] trainer/Q2 Predictions Max 1.8045 +[id0] trainer/Q2 Predictions Min 0.545251 +[id0] trainer/Q Targets Mean 4.36755 +[id0] trainer/Q Targets Std 4.29701 +[id0] trainer/Q Targets Max 9.30498 +[id0] trainer/Q Targets Min -0.915291 +[id0] trainer/Log Pis Mean 10.3622 +[id0] trainer/Log Pis Std 0.543725 +[id0] trainer/Log Pis Max 11.2877 +[id0] trainer/Log Pis Min 9.68874 +[id0] trainer/rewards Mean 4.35784 +[id0] trainer/rewards Std 4.29613 +[id0] trainer/rewards Max 9.2929 +[id0] trainer/rewards Min -0.920588 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 89310 +[id0] trainer/policy/mean Mean 0.0729202 +[id0] trainer/policy/mean Std 0.519411 +[id0] trainer/policy/mean Max 0.812271 +[id0] trainer/policy/mean Min -0.82246 +[id0] trainer/policy/std Mean 0.0504347 +[id0] trainer/policy/std Std 4.48409e-06 +[id0] trainer/policy/std Max 0.0504402 +[id0] trainer/policy/std Min 0.0504286 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.000429373 +[id0] trainer/Advantage Weights Max 0.20049 +[id0] trainer/Advantage Weights Min 0.199444 +[id0] trainer/Advantage Score Mean 0.00394787 +[id0] trainer/Advantage Score Std 0.0042947 +[id0] trainer/Advantage Score Max 0.00884658 +[id0] trainer/Advantage Score Min -0.00161982 +[id0] expl/num steps total 310 +[id0] expl/num paths total 4 +[id0] expl/path length Mean 100 +[id0] expl/path length Std 0 +[id0] expl/path length Max 100 +[id0] expl/path length Min 100 +[id0] expl/Rewards Mean -0.0111399 +[id0] expl/Rewards Std 0.109831 +[id0] expl/Rewards Max 0.205804 +[id0] expl/Rewards Min -0.360678 +[id0] expl/Returns Mean -1.11399 +[id0] expl/Returns Std 0 +[id0] expl/Returns Max -1.11399 +[id0] expl/Returns Min -1.11399 +[id0] expl/Actions Mean 0.0279911 +[id0] expl/Actions Std 0.067341 +[id0] expl/Actions Max 0.254027 +[id0] expl/Actions Min -0.163547 +[id0] expl/Num Paths 1 +[id0] expl/Average Returns -1.11399 +[id0] eval/num steps total 0 +[id0] eval/num paths total 0 +[id0] time/epoch_time (s) 0.329881 +[id0] time/evaluation sampling (s) 0.087302 +[id0] time/exploration sampling (s) 0.0841551 +[id0] time/global_time (s) 6.8316 +[id0] time/replay buffer data storing (s) 0.000627041 +[id0] time/saving (s) 1.78814e-05 +[id0] time/training (s) 0.156898 +[id0] epoch 2 +[id0] ----------------------------------- --------------- +[id0] ----------------------------------- --------------- +[id0] trainer/num train calls 40 +[id0] trainer/QF1 Loss 27.5874 +[id0] trainer/QF2 Loss 24.4089 +[id0] trainer/Policy Loss 7.8244 +[id0] trainer/Q1 Predictions Mean 1.43631 +[id0] trainer/Q1 Predictions Std 0.695864 +[id0] trainer/Q1 Predictions Max 2.35928 +[id0] trainer/Q1 Predictions Min 0.501491 +[id0] trainer/Q2 Predictions Mean 1.72835 +[id0] trainer/Q2 Predictions Std 0.874067 +[id0] trainer/Q2 Predictions Max 2.90332 
+[id0] trainer/Q2 Predictions Min 0.604359 +[id0] trainer/Q Targets Mean 4.25011 +[id0] trainer/Q Targets Std 5.1101 +[id0] trainer/Q Targets Max 11.5322 +[id0] trainer/Q Targets Min -1.23925 +[id0] trainer/Log Pis Mean 9.25355 +[id0] trainer/Log Pis Std 0.578658 +[id0] trainer/Log Pis Max 10.16 +[id0] trainer/Log Pis Min 8.5383 +[id0] trainer/rewards Mean 4.22912 +[id0] trainer/rewards Std 5.11 +[id0] trainer/rewards Max 11.5097 +[id0] trainer/rewards Min -1.25702 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 89410 +[id0] trainer/policy/mean Mean 0.236795 +[id0] trainer/policy/mean Std 0.2788 +[id0] trainer/policy/mean Max 0.742046 +[id0] trainer/policy/mean Min -0.260477 +[id0] trainer/policy/std Mean 0.0506312 +[id0] trainer/policy/std Std 1.46128e-05 +[id0] trainer/policy/std Max 0.0506556 +[id0] trainer/policy/std Min 0.0506164 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.000854139 +[id0] trainer/Advantage Weights Max 0.201628 +[id0] trainer/Advantage Weights Min 0.199138 +[id0] trainer/Advantage Score Mean 0.00716419 +[id0] trainer/Advantage Score Std 0.00852061 +[id0] trainer/Advantage Score Max 0.0233918 +[id0] trainer/Advantage Score Min -0.00145853 +[id0] expl/num steps total 410 +[id0] expl/num paths total 5 +[id0] expl/path length Mean 100 +[id0] expl/path length Std 0 +[id0] expl/path length Max 100 +[id0] expl/path length Min 100 +[id0] expl/Rewards Mean -0.0171546 +[id0] expl/Rewards Std 0.0997752 +[id0] expl/Rewards Max 0.208582 +[id0] expl/Rewards Min -0.365759 +[id0] expl/Returns Mean -1.71546 +[id0] expl/Returns Std 0 +[id0] expl/Returns Max -1.71546 +[id0] expl/Returns Min -1.71546 +[id0] expl/Actions Mean 0.0157296 +[id0] expl/Actions Std 0.060933 +[id0] expl/Actions Max 0.199297 +[id0] expl/Actions Min -0.158053 +[id0] expl/Num Paths 1 +[id0] expl/Average Returns -1.71546 +[id0] eval/num steps total 0 +[id0] eval/num paths total 0 +[id0] time/epoch_time (s) 0.329376 +[id0] time/evaluation sampling (s) 0.086175 +[id0] time/exploration sampling (s) 0.0836077 +[id0] time/global_time (s) 7.16354 +[id0] time/replay buffer data storing (s) 0.000646591 +[id0] time/saving (s) 1.43051e-05 +[id0] time/training (s) 0.158074 +[id0] epoch 3 +[id0] ----------------------------------- --------------- +[id0] ----------------------------------- --------------- +[id0] trainer/num train calls 50 +[id0] trainer/QF1 Loss 12.0632 +[id0] trainer/QF2 Loss 12.4672 +[id0] trainer/Policy Loss 5.22708 +[id0] trainer/Q1 Predictions Mean 3.13504 +[id0] trainer/Q1 Predictions Std 1.28296 +[id0] trainer/Q1 Predictions Max 4.55434 +[id0] trainer/Q1 Predictions Min 1.47195 +[id0] trainer/Q2 Predictions Mean 3.27497 +[id0] trainer/Q2 Predictions Std 1.37992 +[id0] trainer/Q2 Predictions Max 4.7188 +[id0] trainer/Q2 Predictions Min 1.39781 +[id0] trainer/Q Targets Mean 4.40037 +[id0] trainer/Q Targets Std 4.29655 +[id0] trainer/Q Targets Max 10.2577 +[id0] trainer/Q Targets Min -0.60194 +[id0] trainer/Log Pis Mean 8.32517 +[id0] trainer/Log Pis Std 1.02499 +[id0] trainer/Log Pis Max 9.8133 +[id0] trainer/Log Pis Min 7.10904 +[id0] trainer/rewards Mean 4.36342 +[id0] trainer/rewards Std 4.28988 +[id0] trainer/rewards Max 10.2231 +[id0] trainer/rewards Min -0.621302 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 89510 +[id0] trainer/policy/mean Mean 
0.0279084 +[id0] trainer/policy/mean Std 0.513607 +[id0] trainer/policy/mean Max 0.844352 +[id0] trainer/policy/mean Min -0.899527 +[id0] trainer/policy/std Mean 0.0508098 +[id0] trainer/policy/std Std 2.20989e-05 +[id0] trainer/policy/std Max 0.05085 +[id0] trainer/policy/std Min 0.0507816 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.00161367 +[id0] trainer/Advantage Weights Max 0.202964 +[id0] trainer/Advantage Weights Min 0.198459 +[id0] trainer/Advantage Score Mean 0.0103492 +[id0] trainer/Advantage Score Std 0.0160741 +[id0] trainer/Advantage Score Max 0.0398329 +[id0] trainer/Advantage Score Min -0.00505543 +[id0] expl/num steps total 510 +[id0] expl/num paths total 6 +[id0] expl/path length Mean 100 +[id0] expl/path length Std 0 +[id0] expl/path length Max 100 +[id0] expl/path length Min 100 +[id0] expl/Rewards Mean -0.00338446 +[id0] expl/Rewards Std 0.117742 +[id0] expl/Rewards Max 0.285116 +[id0] expl/Rewards Min -0.281986 +[id0] expl/Returns Mean -0.338446 +[id0] expl/Returns Std 0 +[id0] expl/Returns Max -0.338446 +[id0] expl/Returns Min -0.338446 +[id0] expl/Actions Mean 0.00301742 +[id0] expl/Actions Std 0.0810042 +[id0] expl/Actions Max 0.227759 +[id0] expl/Actions Min -0.208984 +[id0] expl/Num Paths 1 +[id0] expl/Average Returns -0.338446 +[id0] eval/num steps total 0 +[id0] eval/num paths total 0 +[id0] time/epoch_time (s) 0.326353 +[id0] time/evaluation sampling (s) 0.08706 +[id0] time/exploration sampling (s) 0.0842543 +[id0] time/global_time (s) 7.49245 +[id0] time/replay buffer data storing (s) 0.000594378 +[id0] time/saving (s) 1.43051e-05 +[id0] time/training (s) 0.153602 +[id0] epoch 4 +[id0] ----------------------------------- --------------- diff --git a/tests/regression/awac/mujoco/id0/pretrain_policy.csv b/tests/regression/awac/mujoco/id0/pretrain_policy.csv new file mode 100644 index 000000000..e69de29bb diff --git a/tests/regression/awac/mujoco/id0/pretrain_q.csv b/tests/regression/awac/mujoco/id0/pretrain_q.csv new file mode 100644 index 000000000..a7a20f295 --- /dev/null +++ b/tests/regression/awac/mujoco/id0/pretrain_q.csv @@ -0,0 +1,2 @@ +trainer/Advantage Score Max,trainer/Advantage Score Mean,trainer/Advantage Score Min,trainer/Advantage Score Std,trainer/Advantage Weights Max,trainer/Advantage Weights Mean,trainer/Advantage Weights Min,trainer/Advantage Weights Std,trainer/Log Pis Max,trainer/Log Pis Mean,trainer/Log Pis Min,trainer/Log Pis Std,trainer/Policy Loss,trainer/Q Targets Max,trainer/Q Targets Mean,trainer/Q Targets Min,trainer/Q Targets Std,trainer/Q1 Predictions Max,trainer/Q1 Predictions Mean,trainer/Q1 Predictions Min,trainer/Q1 Predictions Std,trainer/Q2 Predictions Max,trainer/Q2 Predictions Mean,trainer/Q2 Predictions Min,trainer/Q2 Predictions Std,trainer/QF1 Loss,trainer/QF2 Loss,trainer/batch,trainer/epoch_time,trainer/policy/mean Max,trainer/policy/mean Mean,trainer/policy/mean Min,trainer/policy/mean Std,trainer/policy/std Max,trainer/policy/std Mean,trainer/policy/std Min,trainer/policy/std Std,trainer/replay_buffer_len,trainer/rewards Max,trainer/rewards Mean,trainer/rewards Min,trainer/rewards Std,trainer/terminals Max,trainer/terminals Mean,trainer/terminals Min,trainer/terminals Std 
+0.0013091738,5.7555604e-05,-0.0012496571,0.0005048572,0.00781739,0.0078124995,0.007807395,1.972087e-06,12.102844,9.2814045,1.676443,1.9334962,9.283523,12.308665,3.5774043,-1.7108884,4.905726,0.0135826375,-0.00024819933,-0.008865653,0.003430491,0.011352685,0.00041790164,-0.010821134,0.003796346,36.871548,36.875504,0,0.023017168045043945,0.0007041259,-0.000101676735,-0.0014804011,0.0003270782,0.049787067,0.04978707,0.049787067,3.7252903e-09,89000,12.31599,3.5835624,-1.7016138,4.9069014,0.0,0.0,0.0,0.0 diff --git a/tests/regression/awac/mujoco/id0/progress.csv b/tests/regression/awac/mujoco/id0/progress.csv new file mode 100644 index 000000000..6f62c0f6c --- /dev/null +++ b/tests/regression/awac/mujoco/id0/progress.csv @@ -0,0 +1,6 @@ +epoch,eval/num paths total,eval/num steps total,expl/Actions Max,expl/Actions Mean,expl/Actions Min,expl/Actions Std,expl/Average Returns,expl/Num Paths,expl/Returns Max,expl/Returns Mean,expl/Returns Min,expl/Returns Std,expl/Rewards Max,expl/Rewards Mean,expl/Rewards Min,expl/Rewards Std,expl/num paths total,expl/num steps total,expl/path length Max,expl/path length Mean,expl/path length Min,expl/path length Std,time/epoch_time (s),time/evaluation sampling (s),time/exploration sampling (s),time/global_time (s),time/replay buffer data storing (s),time/saving (s),time/training (s),trainer/Advantage Score Max,trainer/Advantage Score Mean,trainer/Advantage Score Min,trainer/Advantage Score Std,trainer/Advantage Weights Max,trainer/Advantage Weights Mean,trainer/Advantage Weights Min,trainer/Advantage Weights Std,trainer/Log Pis Max,trainer/Log Pis Mean,trainer/Log Pis Min,trainer/Log Pis Std,trainer/Policy Loss,trainer/Q Targets Max,trainer/Q Targets Mean,trainer/Q Targets Min,trainer/Q Targets Std,trainer/Q1 Predictions Max,trainer/Q1 Predictions Mean,trainer/Q1 Predictions Min,trainer/Q1 Predictions Std,trainer/Q2 Predictions Max,trainer/Q2 Predictions Mean,trainer/Q2 Predictions Min,trainer/Q2 Predictions Std,trainer/QF1 Loss,trainer/QF2 Loss,trainer/num train calls,trainer/policy/mean Max,trainer/policy/mean Mean,trainer/policy/mean Min,trainer/policy/mean Std,trainer/policy/std Max,trainer/policy/std Mean,trainer/policy/std Min,trainer/policy/std Std,trainer/replay_buffer_len,trainer/rewards Max,trainer/rewards Mean,trainer/rewards Min,trainer/rewards Std,trainer/terminals Max,trainer/terminals Mean,trainer/terminals Min,trainer/terminals Std +0,0,0,0.15635289,0.003603011,-0.17906095,0.04950077,-2.135522789199266,1,-2.135522789199266,-2.135522789199266,-2.135522789199266,0.0,0.15435091308607227,-0.021355227891992658,-0.3588185610585467,0.10306718573438323,2,110,100,100.0,100,0.0,0.3525400161743164,0.08683228492736816,0.08385157585144043,6.1677656173706055,0.0005838871002197266,0.011194467544555664,0.15770244598388672,0.0045741796,0.001931715,-0.0019345284,0.0021601364,0.2002643,0.2,0.19961365,0.00021592183,10.63615,8.934129,5.431668,1.9003453,8.700624,11.14474,8.682143,-0.5271463,4.605519,0.29938892,0.24383172,0.1793371,0.047789965,0.3016076,0.24278085,0.13735807,0.05986061,92.12183,91.95293,10,0.16476473,0.050194602,-0.14168571,0.07699586,0.050011206,0.050010867,0.05001067,1.7139035e-07,89110,11.146929,8.6872635,-0.5237985,4.60638,0.0,0.0,0.0,0.0 
+1,0,0,0.1700022,0.017128626,-0.12860556,0.053885933,-0.5136583697448278,1,-0.5136583697448278,-0.5136583697448278,-0.5136583697448278,0.0,0.17633074769437393,-0.005136583697448281,-0.1510203465927745,0.07321532353103812,3,210,100,100.0,100,0.0,0.3282339572906494,0.08775854110717773,0.08371853828430176,6.498608350753784,0.0005936622619628906,1.52587890625e-05,0.15522098541259766,0.01162076,0.0025699376,-0.0010405779,0.004674491,0.20090659,0.2,0.19963872,0.0004681491,11.239445,9.609881,8.452656,0.9394557,9.038435,11.789696,6.2165704,-0.83606213,5.2653494,0.85440683,0.61077535,0.39660922,0.18391582,0.80083364,0.61047417,0.36760753,0.15990822,57.525616,57.726475,20,0.5876659,0.16869982,-0.5486779,0.298864,0.050231464,0.050225925,0.0502227,2.6828088e-06,89210,11.786497,6.2150893,-0.83953166,5.265964,0.0,0.0,0.0,0.0 +2,0,0,0.25402707,0.027991101,-0.16354725,0.06734105,-1.1139865646877456,1,-1.1139865646877456,-1.1139865646877456,-1.1139865646877456,0.0,0.20580445400230984,-0.011139865646877453,-0.3606784776937927,0.10983129003577123,4,310,100,100.0,100,0.0,0.3298814296722412,0.08730196952819824,0.08415508270263672,6.831604242324829,0.0006270408630371094,1.7881393432617188e-05,0.15689849853515625,0.008846581,0.003947866,-0.0016198158,0.0042947046,0.20049001,0.19999999,0.19944353,0.0004293735,11.2876835,10.362177,9.688736,0.54372483,9.205812,9.304983,4.367554,-0.9152912,4.2970104,1.8278465,1.2245387,0.6450309,0.44658068,1.8045048,1.1734407,0.5452507,0.46340814,27.173355,27.055313,30,0.8122705,0.072920226,-0.82245976,0.51941067,0.050440244,0.05043467,0.05042863,4.4840895e-06,89310,9.292902,4.3578415,-0.92058784,4.296131,0.0,0.0,0.0,0.0 +3,0,0,0.19929746,0.015729608,-0.15805301,0.060932975,-1.715457124973294,1,-1.715457124973294,-1.715457124973294,-1.715457124973294,0.0,0.20858203085351046,-0.017154571249732943,-0.3657590158395002,0.09977523794852798,5,410,100,100.0,100,0.0,0.3293764591217041,0.08617496490478516,0.08360767364501953,7.163540840148926,0.0006465911865234375,1.430511474609375e-05,0.15807390213012695,0.023391843,0.0071641863,-0.0014585257,0.008520608,0.20162754,0.19999999,0.19913776,0.0008541388,10.160042,9.25355,8.5383005,0.5786578,7.8244004,11.532185,4.2501097,-1.2392461,5.1100993,2.3592813,1.4363134,0.5014911,0.6958636,2.9033215,1.7283541,0.60435903,0.8740667,27.587402,24.40886,40,0.74204636,0.23679481,-0.26047686,0.2788,0.05065557,0.050631236,0.05061636,1.4612753e-05,89410,11.509733,4.2291183,-1.257019,5.1099987,0.0,0.0,0.0,0.0 +4,0,0,0.22775936,0.0030174232,-0.20898446,0.08100421,-0.3384462867570016,1,-0.3384462867570016,-0.3384462867570016,-0.3384462867570016,0.0,0.28511649657945803,-0.0033844628675700162,-0.2819858919971847,0.11774201087608019,6,510,100,100.0,100,0.0,0.3263530731201172,0.08705997467041016,0.08425426483154297,7.492448806762695,0.0005943775177001953,1.430511474609375e-05,0.15360236167907715,0.03983295,0.010349202,-0.0050554276,0.016074125,0.20296365,0.2,0.19845903,0.0016136708,9.813296,8.325169,7.1090374,1.0249859,5.227079,10.257714,4.4003663,-0.60194004,4.296547,4.5543356,3.1350446,1.4719478,1.2829554,4.718795,3.2749665,1.3978143,1.3799181,12.063154,12.467193,50,0.844352,0.027908402,-0.8995265,0.5136073,0.050850045,0.0508098,0.050781608,2.2098871e-05,89510,10.22309,4.3634233,-0.6213017,4.289878,0.0,0.0,0.0,0.0 diff --git a/tests/regression/awac/mujoco/id0/script_name.txt b/tests/regression/awac/mujoco/id0/script_name.txt new file mode 100644 index 000000000..ba4cc0812 --- /dev/null +++ b/tests/regression/awac/mujoco/id0/script_name.txt @@ -0,0 +1 @@ 
+tests/regression/awac/mujoco/test_awac_mujoco.py \ No newline at end of file diff --git a/tests/regression/awac/mujoco/id0/variant.json b/tests/regression/awac/mujoco/id0/variant.json new file mode 100644 index 000000000..627c2bffc --- /dev/null +++ b/tests/regression/awac/mujoco/id0/variant.json @@ -0,0 +1,100 @@ +{ + "add_env_demos": true, + "add_env_offpolicy_data": true, + "algo_kwargs": { + "batch_size": 5, + "min_num_steps_before_training": 10, + "num_epochs": 5, + "num_eval_steps_per_epoch": 100, + "num_expl_steps_per_train_loop": 100, + "num_trains_per_train_loop": 10 + }, + "algorithm": "AWAC", + "collection_mode": "batch", + "debug": true, + "env_demo_path": { + "is_demo": true, + "obs_dict": false, + "path": "demos/icml2020/mujoco/hc_action_noise_15.npy" + }, + "env_id": "HalfCheetah-v2", + "env_offpolicy_data_path": { + "is_demo": false, + "obs_dict": false, + "path": "demos/icml2020/mujoco/hc_off_policy_15_demos_100.npy", + "train_split": 0.9 + }, + "exp_id": 0, + "exp_name": "references/awac/mujoco/awac1", + "layer_size": 256, + "load_demos": true, + "logger_config": { + "run_id": 0, + "snapshot_gap": 100, + "snapshot_mode": "gap" + }, + "max_path_length": 1000, + "num_expl_steps_per_train_loop": 1000, + "num_layers": 2, + "path_loader_class": { + "$class": "rlkit.demos.source.dict_to_mdp_path_loader.DictToMDPPathLoader" + }, + "path_loader_kwargs": { + "demo_paths": [] + }, + "policy_class": { + "$class": "rlkit.torch.sac.policies.gaussian_policy.GaussianPolicy" + }, + "policy_kwargs": { + "hidden_sizes": [ + 256, + 256, + 256, + 256 + ], + "max_log_std": 0, + "min_log_std": -6, + "std_architecture": "values" + }, + "pretrain_policy": false, + "pretrain_rl": true, + "qf_kwargs": { + "hidden_sizes": [ + 256, + 256 + ] + }, + "replay_buffer_size": 1000000, + "run_id": 0, + "seed": "0", + "seedid": 0, + "train_rl": true, + "trainer_kwargs": { + "alpha": 0, + "awr_min_q": true, + "awr_sample_actions": false, + "awr_use_mle_for_vf": false, + "awr_weight": 1.0, + "bc_num_pretrain_steps": 0, + "bc_weight": 0.0, + "beta": 2, + "buffer_policy_sample_actions": false, + "compute_bc": false, + "discount": 0.99, + "policy_lr": 0.0003, + "policy_weight_decay": 0.0001, + "q_num_pretrain1_steps": 0, + "q_num_pretrain2_steps": 10, + "qf_lr": 0.0003, + "reparam_weight": 0.0, + "reward_scale": 1, + "soft_target_tau": 0.005, + "target_update_period": 1, + "train_bc_on_rl_buffer": false, + "use_automatic_entropy_tuning": false + }, + "trial_name": "id0", + "unique_id": "affca07d-712d-462a-aeaa-4bdf14cde7c6", + "use_validation_buffer": true, + "version": "normal" +} \ No newline at end of file diff --git a/tests/regression/awac/mujoco/id0_offline/debug.log b/tests/regression/awac/mujoco/id0_offline/debug.log new file mode 100644 index 000000000..b6330bf3b --- /dev/null +++ b/tests/regression/awac/mujoco/id0_offline/debug.log @@ -0,0 +1,523 @@ +[id0] ------------------------------ --------------- +[id0] trainer/QF1 Loss 105.663 +[id0] trainer/QF2 Loss 105.656 +[id0] trainer/Policy Loss 9.2854 +[id0] trainer/Q1 Predictions Mean -0.00176684 +[id0] trainer/Q1 Predictions Std 0.00436927 +[id0] trainer/Q1 Predictions Max 0.0140987 +[id0] trainer/Q1 Predictions Min -0.00976394 +[id0] trainer/Q2 Predictions Mean -0.00156584 +[id0] trainer/Q2 Predictions Std 0.00312412 +[id0] trainer/Q2 Predictions Max 0.00693969 +[id0] trainer/Q2 Predictions Min -0.0108211 +[id0] trainer/Q Targets Mean 10.0992 +[id0] trainer/Q Targets Std 1.90509 +[id0] trainer/Q Targets Max 12.3087 +[id0] trainer/Q Targets Min 0.460343 
+[id0] trainer/Log Pis Mean 9.2814 +[id0] trainer/Log Pis Std 1.9335 +[id0] trainer/Log Pis Max 12.1028 +[id0] trainer/Log Pis Min 1.67644 +[id0] trainer/rewards Mean 10.108 +[id0] trainer/rewards Std 1.90519 +[id0] trainer/rewards Max 12.316 +[id0] trainer/rewards Min 0.46292 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 12000 +[id0] trainer/policy/mean Mean -0.00011057 +[id0] trainer/policy/mean Std 0.000409417 +[id0] trainer/policy/mean Max 0.000723418 +[id0] trainer/policy/mean Min -0.0014804 +[id0] trainer/policy/std Mean 0.0497871 +[id0] trainer/policy/std Std 3.72529e-09 +[id0] trainer/policy/std Max 0.0497871 +[id0] trainer/policy/std Min 0.0497871 +[id0] trainer/Advantage Weights Mean 0.0078125 +[id0] trainer/Advantage Weights Std 1.65823e-06 +[id0] trainer/Advantage Weights Max 0.00781602 +[id0] trainer/Advantage Weights Min 0.00780831 +[id0] trainer/Advantage Score Mean 4.32669e-05 +[id0] trainer/Advantage Score Std 0.000424512 +[id0] trainer/Advantage Score Max 0.000945322 +[id0] trainer/Advantage Score Min -0.00103077 +[id0] trainer/batch 0 +[id0] trainer/epoch_time 0.0220866 +[id0] ------------------------------ --------------- +[id0] ----------------------------------- --------------- +[id0] trainer/num train calls 10 +[id0] trainer/QF1 Loss 110.416 +[id0] trainer/QF2 Loss 110.426 +[id0] trainer/Policy Loss 8.75761 +[id0] trainer/Q1 Predictions Mean 0.188749 +[id0] trainer/Q1 Predictions Std 0.0604501 +[id0] trainer/Q1 Predictions Max 0.301579 +[id0] trainer/Q1 Predictions Min 0.135304 +[id0] trainer/Q2 Predictions Mean 0.18873 +[id0] trainer/Q2 Predictions Std 0.0538581 +[id0] trainer/Q2 Predictions Max 0.256944 +[id0] trainer/Q2 Predictions Min 0.124673 +[id0] trainer/Q Targets Mean 10.6619 +[id0] trainer/Q Targets Std 0.877905 +[id0] trainer/Q Targets Max 11.7227 +[id0] trainer/Q Targets Min 9.18379 +[id0] trainer/Log Pis Mean 8.93408 +[id0] trainer/Log Pis Std 1.90035 +[id0] trainer/Log Pis Max 10.6361 +[id0] trainer/Log Pis Min 5.43162 +[id0] trainer/rewards Mean 10.6676 +[id0] trainer/rewards Std 0.875117 +[id0] trainer/rewards Max 11.7213 +[id0] trainer/rewards Min 9.19344 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 12110 +[id0] trainer/policy/mean Mean 0.0154436 +[id0] trainer/policy/mean Std 0.0484534 +[id0] trainer/policy/mean Max 0.0994135 +[id0] trainer/policy/mean Min -0.0910146 +[id0] trainer/policy/std Mean 0.0500113 +[id0] trainer/policy/std Std 7.11701e-07 +[id0] trainer/policy/std Max 0.0500118 +[id0] trainer/policy/std Min 0.0500098 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.000118501 +[id0] trainer/Advantage Weights Max 0.200219 +[id0] trainer/Advantage Weights Min 0.199888 +[id0] trainer/Advantage Score Mean 0.00122384 +[id0] trainer/Advantage Score Std 0.00118466 +[id0] trainer/Advantage Score Max 0.00341064 +[id0] trainer/Advantage Score Min 0.000105202 +[id0] expl/num steps total 110 +[id0] expl/num paths total 3 +[id0] expl/path length Mean 50 +[id0] expl/path length Std 0 +[id0] expl/path length Max 50 +[id0] expl/path length Min 50 +[id0] expl/Rewards Mean 0 +[id0] expl/Rewards Std 0 +[id0] expl/Rewards Max 0 +[id0] expl/Rewards Min 0 +[id0] expl/Returns Mean 0 +[id0] expl/Returns Std 0 +[id0] expl/Returns Max 0 +[id0] expl/Returns Min 0 +[id0] expl/Actions Mean 0.0015648 +[id0] 
expl/Actions Std 0.0488781 +[id0] expl/Actions Max 0.148999 +[id0] expl/Actions Min -0.171592 +[id0] expl/Num Paths 2 +[id0] expl/Average Returns 0 +[id0] eval/num steps total 100 +[id0] eval/num paths total 2 +[id0] eval/path length Mean 50 +[id0] eval/path length Std 0 +[id0] eval/path length Max 50 +[id0] eval/path length Min 50 +[id0] eval/Rewards Mean 0 +[id0] eval/Rewards Std 0 +[id0] eval/Rewards Max 0 +[id0] eval/Rewards Min 0 +[id0] eval/Returns Mean 0 +[id0] eval/Returns Std 0 +[id0] eval/Returns Max 0 +[id0] eval/Returns Min 0 +[id0] eval/Actions Mean 0.00198674 +[id0] eval/Actions Std 0.00245479 +[id0] eval/Actions Max 0.00403596 +[id0] eval/Actions Min -0.00245622 +[id0] eval/Num Paths 2 +[id0] eval/Average Returns 0 +[id0] time/epoch_time (s) 0.309243 +[id0] time/evaluation sampling (s) 0.070143 +[id0] time/exploration sampling (s) 0.0686193 +[id0] time/global_time (s) 4.41927 +[id0] time/replay buffer data storing (s) 0.000622034 +[id0] time/saving (s) 0.0115674 +[id0] time/training (s) 0.14824 +[id0] epoch 0 +[id0] ----------------------------------- --------------- +[id0] ----------------------------------- -------------- +[id0] trainer/num train calls 20 +[id0] trainer/QF1 Loss 52.1679 +[id0] trainer/QF2 Loss 52.0353 +[id0] trainer/Policy Loss 9.01049 +[id0] trainer/Q1 Predictions Mean 0.631065 +[id0] trainer/Q1 Predictions Std 0.191857 +[id0] trainer/Q1 Predictions Max 0.861882 +[id0] trainer/Q1 Predictions Min 0.391539 +[id0] trainer/Q2 Predictions Mean 0.615324 +[id0] trainer/Q2 Predictions Std 0.211701 +[id0] trainer/Q2 Predictions Max 0.894524 +[id0] trainer/Q2 Predictions Min 0.34987 +[id0] trainer/Q Targets Mean 6.81921 +[id0] trainer/Q Targets Std 3.80028 +[id0] trainer/Q Targets Max 11.248 +[id0] trainer/Q Targets Min 0.120901 +[id0] trainer/Log Pis Mean 9.61044 +[id0] trainer/Log Pis Std 0.939455 +[id0] trainer/Log Pis Max 11.24 +[id0] trainer/Log Pis Min 8.45321 +[id0] trainer/rewards Mean 6.81824 +[id0] trainer/rewards Std 3.8021 +[id0] trainer/rewards Max 11.2481 +[id0] trainer/rewards Min 0.115804 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 1050 +[id0] trainer/policy/mean Mean -0.0018095 +[id0] trainer/policy/mean Std 0.255604 +[id0] trainer/policy/mean Max 0.435848 +[id0] trainer/policy/mean Min -0.446678 +[id0] trainer/policy/std Mean 0.0502213 +[id0] trainer/policy/std Std 5.39185e-06 +[id0] trainer/policy/std Max 0.0502296 +[id0] trainer/policy/std Min 0.0502124 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.000395878 +[id0] trainer/Advantage Weights Max 0.200611 +[id0] trainer/Advantage Weights Min 0.199596 +[id0] trainer/Advantage Score Mean 0.00307283 +[id0] trainer/Advantage Score Std 0.00395706 +[id0] trainer/Advantage Score Max 0.00917578 +[id0] trainer/Advantage Score Min -0.00097096 +[id0] expl/num steps total 210 +[id0] expl/num paths total 5 +[id0] expl/path length Mean 50 +[id0] expl/path length Std 0 +[id0] expl/path length Max 50 +[id0] expl/path length Min 50 +[id0] expl/Rewards Mean 0 +[id0] expl/Rewards Std 0 +[id0] expl/Rewards Max 0 +[id0] expl/Rewards Min 0 +[id0] expl/Returns Mean 0 +[id0] expl/Returns Std 0 +[id0] expl/Returns Max 0 +[id0] expl/Returns Min 0 +[id0] expl/Actions Mean 0.0067328 +[id0] expl/Actions Std 0.0502179 +[id0] expl/Actions Max 0.156779 +[id0] expl/Actions Min -0.1635 +[id0] expl/Num Paths 2 +[id0] expl/Average Returns 0 +[id0] eval/num steps total 200 +[id0] 
eval/num paths total 4 +[id0] eval/path length Mean 50 +[id0] eval/path length Std 0 +[id0] eval/path length Max 50 +[id0] eval/path length Min 50 +[id0] eval/Rewards Mean 0 +[id0] eval/Rewards Std 0 +[id0] eval/Rewards Max 0 +[id0] eval/Rewards Min 0 +[id0] eval/Returns Mean 0 +[id0] eval/Returns Std 0 +[id0] eval/Returns Max 0 +[id0] eval/Returns Min 0 +[id0] eval/Actions Mean 0.00624839 +[id0] eval/Actions Std 0.0053632 +[id0] eval/Actions Max 0.0126386 +[id0] eval/Actions Min -0.00239006 +[id0] eval/Num Paths 2 +[id0] eval/Average Returns 0 +[id0] time/epoch_time (s) 0.293301 +[id0] time/evaluation sampling (s) 0.0710528 +[id0] time/exploration sampling (s) 0.0693829 +[id0] time/global_time (s) 4.71605 +[id0] time/replay buffer data storing (s) 0.000627518 +[id0] time/saving (s) 1.57356e-05 +[id0] time/training (s) 0.150614 +[id0] epoch 1 +[id0] ----------------------------------- -------------- +[id0] ----------------------------------- -------------- +[id0] trainer/num train calls 30 +[id0] trainer/QF1 Loss 81.512 +[id0] trainer/QF2 Loss 79.3319 +[id0] trainer/Policy Loss 8.60279 +[id0] trainer/Q1 Predictions Mean 1.76 +[id0] trainer/Q1 Predictions Std 0.412954 +[id0] trainer/Q1 Predictions Max 2.3335 +[id0] trainer/Q1 Predictions Min 1.24309 +[id0] trainer/Q2 Predictions Mean 1.87768 +[id0] trainer/Q2 Predictions Std 0.432051 +[id0] trainer/Q2 Predictions Max 2.43641 +[id0] trainer/Q2 Predictions Min 1.24154 +[id0] trainer/Q Targets Mean 10.7561 +[id0] trainer/Q Targets Std 0.672906 +[id0] trainer/Q Targets Max 11.9525 +[id0] trainer/Q Targets Min 10.1027 +[id0] trainer/Log Pis Mean 10.3654 +[id0] trainer/Log Pis Std 0.543725 +[id0] trainer/Log Pis Max 11.2909 +[id0] trainer/Log Pis Min 9.69192 +[id0] trainer/rewards Mean 10.7414 +[id0] trainer/rewards Std 0.673268 +[id0] trainer/rewards Max 11.9371 +[id0] trainer/rewards Min 10.0875 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 1100 +[id0] trainer/policy/mean Mean 0.211427 +[id0] trainer/policy/mean Std 0.57911 +[id0] trainer/policy/mean Max 0.943453 +[id0] trainer/policy/mean Min -0.734058 +[id0] trainer/policy/std Mean 0.0504079 +[id0] trainer/policy/std Std 1.08582e-05 +[id0] trainer/policy/std Max 0.0504208 +[id0] trainer/policy/std Min 0.0503883 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.00135166 +[id0] trainer/Advantage Weights Max 0.201368 +[id0] trainer/Advantage Weights Min 0.197556 +[id0] trainer/Advantage Score Mean -0.00288568 +[id0] trainer/Advantage Score Std 0.0135574 +[id0] trainer/Advantage Score Max 0.0107968 +[id0] trainer/Advantage Score Min -0.0274303 +[id0] expl/num steps total 310 +[id0] expl/num paths total 7 +[id0] expl/path length Mean 50 +[id0] expl/path length Std 0 +[id0] expl/path length Max 50 +[id0] expl/path length Min 50 +[id0] expl/Rewards Mean 0 +[id0] expl/Rewards Std 0 +[id0] expl/Rewards Max 0 +[id0] expl/Rewards Min 0 +[id0] expl/Returns Mean 0 +[id0] expl/Returns Std 0 +[id0] expl/Returns Max 0 +[id0] expl/Returns Min 0 +[id0] expl/Actions Mean 0.00999501 +[id0] expl/Actions Std 0.0552161 +[id0] expl/Actions Max 0.17964 +[id0] expl/Actions Min -0.141274 +[id0] expl/Num Paths 2 +[id0] expl/Average Returns 0 +[id0] eval/num steps total 300 +[id0] eval/num paths total 6 +[id0] eval/path length Mean 50 +[id0] eval/path length Std 0 +[id0] eval/path length Max 50 +[id0] eval/path length Min 50 +[id0] eval/Rewards Mean 0 +[id0] eval/Rewards Std 0 
+[id0] eval/Rewards Max 0 +[id0] eval/Rewards Min 0 +[id0] eval/Returns Mean 0 +[id0] eval/Returns Std 0 +[id0] eval/Returns Max 0 +[id0] eval/Returns Min 0 +[id0] eval/Actions Mean 0.0064345 +[id0] eval/Actions Std 0.0146827 +[id0] eval/Actions Max 0.0218623 +[id0] eval/Actions Min -0.0166687 +[id0] eval/Num Paths 2 +[id0] eval/Average Returns 0 +[id0] time/epoch_time (s) 0.29058 +[id0] time/evaluation sampling (s) 0.0714369 +[id0] time/exploration sampling (s) 0.0691419 +[id0] time/global_time (s) 5.00972 +[id0] time/replay buffer data storing (s) 0.000591278 +[id0] time/saving (s) 1.43051e-05 +[id0] time/training (s) 0.147866 +[id0] epoch 2 +[id0] ----------------------------------- -------------- +[id0] ----------------------------------- --------------- +[id0] trainer/num train calls 40 +[id0] trainer/QF1 Loss 42.8198 +[id0] trainer/QF2 Loss 38.7256 +[id0] trainer/Policy Loss 6.74625 +[id0] trainer/Q1 Predictions Mean 2.52629 +[id0] trainer/Q1 Predictions Std 1.49044 +[id0] trainer/Q1 Predictions Max 4.63162 +[id0] trainer/Q1 Predictions Min 0.122619 +[id0] trainer/Q2 Predictions Mean 2.82724 +[id0] trainer/Q2 Predictions Std 1.72747 +[id0] trainer/Q2 Predictions Max 5.27479 +[id0] trainer/Q2 Predictions Min 0.118848 +[id0] trainer/Q Targets Mean 8.34129 +[id0] trainer/Q Targets Std 4.20369 +[id0] trainer/Q Targets Max 11.0752 +[id0] trainer/Q Targets Min 0.00151104 +[id0] trainer/Log Pis Mean 9.26061 +[id0] trainer/Log Pis Std 0.578658 +[id0] trainer/Log Pis Max 10.1671 +[id0] trainer/Log Pis Min 8.54536 +[id0] trainer/rewards Mean 8.3179 +[id0] trainer/rewards Std 4.19312 +[id0] trainer/rewards Max 11.0475 +[id0] trainer/rewards Min 0 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 12260 +[id0] trainer/policy/mean Mean 0.140981 +[id0] trainer/policy/mean Std 0.518752 +[id0] trainer/policy/mean Max 0.983747 +[id0] trainer/policy/mean Min -0.636301 +[id0] trainer/policy/std Mean 0.0505717 +[id0] trainer/policy/std Std 3.4236e-05 +[id0] trainer/policy/std Max 0.0506245 +[id0] trainer/policy/std Min 0.0505321 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.00302391 +[id0] trainer/Advantage Weights Max 0.204801 +[id0] trainer/Advantage Weights Min 0.195571 +[id0] trainer/Advantage Score Mean -0.00526473 +[id0] trainer/Advantage Score Std 0.0302046 +[id0] trainer/Advantage Score Max 0.042403 +[id0] trainer/Advantage Score Min -0.0498209 +[id0] expl/num steps total 410 +[id0] expl/num paths total 9 +[id0] expl/path length Mean 50 +[id0] expl/path length Std 0 +[id0] expl/path length Max 50 +[id0] expl/path length Min 50 +[id0] expl/Rewards Mean 0 +[id0] expl/Rewards Std 0 +[id0] expl/Rewards Max 0 +[id0] expl/Rewards Min 0 +[id0] expl/Returns Mean 0 +[id0] expl/Returns Std 0 +[id0] expl/Returns Max 0 +[id0] expl/Returns Min 0 +[id0] expl/Actions Mean 0.00970836 +[id0] expl/Actions Std 0.0531246 +[id0] expl/Actions Max 0.160464 +[id0] expl/Actions Min -0.167047 +[id0] expl/Num Paths 2 +[id0] expl/Average Returns 0 +[id0] eval/num steps total 400 +[id0] eval/num paths total 8 +[id0] eval/path length Mean 50 +[id0] eval/path length Std 0 +[id0] eval/path length Max 50 +[id0] eval/path length Min 50 +[id0] eval/Rewards Mean 0 +[id0] eval/Rewards Std 0 +[id0] eval/Rewards Max 0 +[id0] eval/Rewards Min 0 +[id0] eval/Returns Mean 0 +[id0] eval/Returns Std 0 +[id0] eval/Returns Max 0 +[id0] eval/Returns Min 0 +[id0] eval/Actions Mean 0.0062658 +[id0] 
eval/Actions Std 0.0147267 +[id0] eval/Actions Max 0.0217953 +[id0] eval/Actions Min -0.0196141 +[id0] eval/Num Paths 2 +[id0] eval/Average Returns 0 +[id0] time/epoch_time (s) 0.248886 +[id0] time/evaluation sampling (s) 0.0705061 +[id0] time/exploration sampling (s) 0.0690467 +[id0] time/global_time (s) 5.26157 +[id0] time/replay buffer data storing (s) 0.000605345 +[id0] time/saving (s) 1.38283e-05 +[id0] time/training (s) 0.107416 +[id0] epoch 3 +[id0] ----------------------------------- --------------- +[id0] ----------------------------------- --------------- +[id0] trainer/num train calls 50 +[id0] trainer/QF1 Loss 25.877 +[id0] trainer/QF2 Loss 21.9453 +[id0] trainer/Policy Loss 3.76137 +[id0] trainer/Q1 Predictions Mean 4.63133 +[id0] trainer/Q1 Predictions Std 0.454815 +[id0] trainer/Q1 Predictions Max 5.41268 +[id0] trainer/Q1 Predictions Min 4.19083 +[id0] trainer/Q2 Predictions Mean 5.01998 +[id0] trainer/Q2 Predictions Std 0.561715 +[id0] trainer/Q2 Predictions Max 5.83242 +[id0] trainer/Q2 Predictions Min 4.26122 +[id0] trainer/Q Targets Mean 9.6649 +[id0] trainer/Q Targets Std 0.783043 +[id0] trainer/Q Targets Max 10.3778 +[id0] trainer/Q Targets Min 8.15492 +[id0] trainer/Log Pis Mean 8.33577 +[id0] trainer/Log Pis Std 1.02499 +[id0] trainer/Log Pis Max 9.8239 +[id0] trainer/Log Pis Min 7.11964 +[id0] trainer/rewards Mean 9.58611 +[id0] trainer/rewards Std 0.786294 +[id0] trainer/rewards Max 10.3004 +[id0] trainer/rewards Min 8.06987 +[id0] trainer/terminals Mean 0 +[id0] trainer/terminals Std 0 +[id0] trainer/terminals Max 0 +[id0] trainer/terminals Min 0 +[id0] trainer/replay_buffer_len 12360 +[id0] trainer/policy/mean Mean -0.0360792 +[id0] trainer/policy/mean Std 0.363043 +[id0] trainer/policy/mean Max 0.602992 +[id0] trainer/policy/mean Min -0.687834 +[id0] trainer/policy/std Mean 0.0507201 +[id0] trainer/policy/std Std 6.73447e-05 +[id0] trainer/policy/std Max 0.0508204 +[id0] trainer/policy/std Min 0.0506453 +[id0] trainer/Advantage Weights Mean 0.2 +[id0] trainer/Advantage Weights Std 0.0078605 +[id0] trainer/Advantage Weights Max 0.206832 +[id0] trainer/Advantage Weights Min 0.184856 +[id0] trainer/Advantage Score Mean 0.0569316 +[id0] trainer/Advantage Score Std 0.0806522 +[id0] trainer/Advantage Score Max 0.125707 +[id0] trainer/Advantage Score Min -0.0989513 +[id0] expl/num steps total 510 +[id0] expl/num paths total 11 +[id0] expl/path length Mean 50 +[id0] expl/path length Std 0 +[id0] expl/path length Max 50 +[id0] expl/path length Min 50 +[id0] expl/Rewards Mean 0 +[id0] expl/Rewards Std 0 +[id0] expl/Rewards Max 0 +[id0] expl/Rewards Min 0 +[id0] expl/Returns Mean 0 +[id0] expl/Returns Std 0 +[id0] expl/Returns Max 0 +[id0] expl/Returns Min 0 +[id0] expl/Actions Mean 0.00277007 +[id0] expl/Actions Std 0.0533851 +[id0] expl/Actions Max 0.173962 +[id0] expl/Actions Min -0.161638 +[id0] expl/Num Paths 2 +[id0] expl/Average Returns 0 +[id0] eval/num steps total 500 +[id0] eval/num paths total 10 +[id0] eval/path length Mean 50 +[id0] eval/path length Std 0 +[id0] eval/path length Max 50 +[id0] eval/path length Min 50 +[id0] eval/Rewards Mean 0 +[id0] eval/Rewards Std 0 +[id0] eval/Rewards Max 0 +[id0] eval/Rewards Min 0 +[id0] eval/Returns Mean 0 +[id0] eval/Returns Std 0 +[id0] eval/Returns Max 0 +[id0] eval/Returns Min 0 +[id0] eval/Actions Mean 0.00495222 +[id0] eval/Actions Std 0.0137168 +[id0] eval/Actions Max 0.0171238 +[id0] eval/Actions Min -0.0204181 +[id0] eval/Num Paths 2 +[id0] eval/Average Returns 0 +[id0] time/epoch_time (s) 0.241812 +[id0] 
time/evaluation sampling (s) 0.0640893 +[id0] time/exploration sampling (s) 0.0679815 +[id0] time/global_time (s) 5.50601 +[id0] time/replay buffer data storing (s) 0.000584364 +[id0] time/saving (s) 1.16825e-05 +[id0] time/training (s) 0.107869 +[id0] epoch 4 +[id0] ----------------------------------- --------------- diff --git a/tests/regression/awac/mujoco/id0_offline/pretrain_q.csv b/tests/regression/awac/mujoco/id0_offline/pretrain_q.csv new file mode 100644 index 000000000..c44d34a1e --- /dev/null +++ b/tests/regression/awac/mujoco/id0_offline/pretrain_q.csv @@ -0,0 +1,2 @@ +trainer/Advantage Score Max,trainer/Advantage Score Mean,trainer/Advantage Score Min,trainer/Advantage Score Std,trainer/Advantage Weights Max,trainer/Advantage Weights Mean,trainer/Advantage Weights Min,trainer/Advantage Weights Std,trainer/Log Pis Max,trainer/Log Pis Mean,trainer/Log Pis Min,trainer/Log Pis Std,trainer/Policy Loss,trainer/Q Targets Max,trainer/Q Targets Mean,trainer/Q Targets Min,trainer/Q Targets Std,trainer/Q1 Predictions Max,trainer/Q1 Predictions Mean,trainer/Q1 Predictions Min,trainer/Q1 Predictions Std,trainer/Q2 Predictions Max,trainer/Q2 Predictions Mean,trainer/Q2 Predictions Min,trainer/Q2 Predictions Std,trainer/QF1 Loss,trainer/QF2 Loss,trainer/batch,trainer/epoch_time,trainer/policy/mean Max,trainer/policy/mean Mean,trainer/policy/mean Min,trainer/policy/mean Std,trainer/policy/std Max,trainer/policy/std Mean,trainer/policy/std Min,trainer/policy/std Std,trainer/replay_buffer_len,trainer/rewards Max,trainer/rewards Mean,trainer/rewards Min,trainer/rewards Std,trainer/terminals Max,trainer/terminals Mean,trainer/terminals Min,trainer/terminals Std +0.00094532245,4.326693e-05,-0.0010307734,0.0004245118,0.007816024,0.0078124995,0.0078083053,1.6582343e-06,12.102844,9.2814045,1.676443,1.9334962,9.285402,12.308713,10.0991955,0.46034268,1.9050921,0.014098693,-0.001766838,-0.0097639365,0.0043692724,0.0069396878,-0.001565838,-0.010821134,0.0031241195,105.66298,105.65559,0,0.022086620330810547,0.00072341814,-0.000110570305,-0.0014804011,0.00040941662,0.049787067,0.04978707,0.049787067,3.7252903e-09,12000,12.31599,10.108,0.46291977,1.9051892,0.0,0.0,0.0,0.0 diff --git a/tests/regression/awac/mujoco/id0_offline/progress.csv b/tests/regression/awac/mujoco/id0_offline/progress.csv new file mode 100644 index 000000000..44bce192c --- /dev/null +++ b/tests/regression/awac/mujoco/id0_offline/progress.csv @@ -0,0 +1,6 @@ +epoch,eval/Actions Max,eval/Actions Mean,eval/Actions Min,eval/Actions Std,eval/Average Returns,eval/Num Paths,eval/Returns Max,eval/Returns Mean,eval/Returns Min,eval/Returns Std,eval/Rewards Max,eval/Rewards Mean,eval/Rewards Min,eval/Rewards Std,eval/num paths total,eval/num steps total,eval/path length Max,eval/path length Mean,eval/path length Min,eval/path length Std,expl/Actions Max,expl/Actions Mean,expl/Actions Min,expl/Actions Std,expl/Average Returns,expl/Num Paths,expl/Returns Max,expl/Returns Mean,expl/Returns Min,expl/Returns Std,expl/Rewards Max,expl/Rewards Mean,expl/Rewards Min,expl/Rewards Std,expl/num paths total,expl/num steps total,expl/path length Max,expl/path length Mean,expl/path length Min,expl/path length Std,time/epoch_time (s),time/evaluation sampling (s),time/exploration sampling (s),time/global_time (s),time/replay buffer data storing (s),time/saving (s),time/training (s),trainer/Advantage Score Max,trainer/Advantage Score Mean,trainer/Advantage Score Min,trainer/Advantage Score Std,trainer/Advantage Weights Max,trainer/Advantage Weights 
Mean,trainer/Advantage Weights Min,trainer/Advantage Weights Std,trainer/Log Pis Max,trainer/Log Pis Mean,trainer/Log Pis Min,trainer/Log Pis Std,trainer/Policy Loss,trainer/Q Targets Max,trainer/Q Targets Mean,trainer/Q Targets Min,trainer/Q Targets Std,trainer/Q1 Predictions Max,trainer/Q1 Predictions Mean,trainer/Q1 Predictions Min,trainer/Q1 Predictions Std,trainer/Q2 Predictions Max,trainer/Q2 Predictions Mean,trainer/Q2 Predictions Min,trainer/Q2 Predictions Std,trainer/QF1 Loss,trainer/QF2 Loss,trainer/num train calls,trainer/policy/mean Max,trainer/policy/mean Mean,trainer/policy/mean Min,trainer/policy/mean Std,trainer/policy/std Max,trainer/policy/std Mean,trainer/policy/std Min,trainer/policy/std Std,trainer/replay_buffer_len,trainer/rewards Max,trainer/rewards Mean,trainer/rewards Min,trainer/rewards Std,trainer/terminals Max,trainer/terminals Mean,trainer/terminals Min,trainer/terminals Std +0,0.0040359604,0.0019867404,-0.00245622,0.002454789,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,100,50,50.0,50,0.0,0.14899917,0.0015648025,-0.17159222,0.04887811,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,110,50,50.0,50,0.0,0.30924296379089355,0.07014298439025879,0.0686192512512207,4.419266223907471,0.0006220340728759766,0.011567354202270508,0.148240327835083,0.0034106374,0.0012238442,0.0001052022,0.0011846588,0.20021877,0.2,0.19988814,0.00011850115,10.636104,8.934083,5.4316216,1.9003453,8.757611,11.722731,10.661857,9.183795,0.87790495,0.30157864,0.1887492,0.13530414,0.060450103,0.25694412,0.18873005,0.12467281,0.053858086,110.41571,110.42584,10,0.09941348,0.015443613,-0.09101463,0.048453398,0.050011825,0.05001126,0.050009772,7.117011e-07,12110,11.721305,10.66757,9.193444,0.87511677,0.0,0.0,0.0,0.0 +1,0.0126385875,0.006248394,-0.0023900562,0.005363199,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,200,50,50.0,50,0.0,0.15677911,0.0067327977,-0.16349992,0.050217863,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,210,50,50.0,50,0.0,0.2933011054992676,0.07105278968811035,0.06938290596008301,4.7160515785217285,0.0006275177001953125,1.5735626220703125e-05,0.15061426162719727,0.009175777,0.003072834,-0.00097095966,0.0039570634,0.20061082,0.19999999,0.19959563,0.00039587833,11.239998,9.6104355,8.453209,0.93945545,9.0104885,11.24798,6.819205,0.12090105,3.800278,0.8618817,0.6310652,0.39153916,0.19185658,0.89452434,0.61532426,0.3498703,0.21170053,52.16795,52.035328,20,0.43584758,-0.0018095046,-0.44667792,0.25560436,0.05022962,0.050221298,0.05021245,5.3918543e-06,1050,11.248069,6.818242,0.115803644,3.8021011,0.0,0.0,0.0,0.0 +2,0.02186232,0.006434499,-0.016668703,0.014682745,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,300,50,50.0,50,0.0,0.17963977,0.009995011,-0.14127403,0.05521614,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,310,50,50.0,50,0.0,0.29057979583740234,0.07143688201904297,0.06914186477661133,5.009724855422974,0.000591278076171875,1.430511474609375e-05,0.14786624908447266,0.010796785,-0.0028856755,-0.027430296,0.013557364,0.20136832,0.2,0.197556,0.001351659,11.290871,10.365364,9.691922,0.5437251,8.602791,11.952537,10.756134,10.102677,0.67290604,2.333503,1.7599968,1.2430896,0.41295424,2.436408,1.8776823,1.2415384,0.4320512,81.51204,79.33188,30,0.9434526,0.21142718,-0.734058,0.57911015,0.05042078,0.050407894,0.050388318,1.0858161e-05,1100,11.937101,10.741376,10.087513,0.6732681,0.0,0.0,0.0,0.0 
+3,0.02179531,0.0062657977,-0.019614106,0.014726656,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8,400,50,50.0,50,0.0,0.16046432,0.009708362,-0.16704701,0.053124588,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9,410,50,50.0,50,0.0,0.2488858699798584,0.07050609588623047,0.0690467357635498,5.261571645736694,0.0006053447723388672,1.3828277587890625e-05,0.10741615295410156,0.042402983,-0.0052647265,-0.0498209,0.030204618,0.20480067,0.19999999,0.19557132,0.0030239078,10.167101,9.260609,8.545359,0.57865787,6.7462454,11.075172,8.341286,0.001511037,4.203693,4.6316233,2.5262907,0.122619346,1.4904397,5.2747893,2.8272429,0.11884822,1.7274737,42.81978,38.72561,40,0.98374677,0.14098135,-0.6363006,0.51875216,0.050624505,0.050571714,0.05053212,3.4236007e-05,12260,11.047477,8.3179035,0.0,4.1931157,0.0,0.0,0.0,0.0 +4,0.017123755,0.0049522216,-0.020418145,0.013716821,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,500,50,50.0,50,0.0,0.17396162,0.0027700714,-0.16163819,0.05338514,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11,510,50,50.0,50,0.0,0.24181151390075684,0.06408929824829102,0.06798148155212402,5.506011247634888,0.0005843639373779297,1.1682510375976562e-05,0.10786914825439453,0.12570715,0.056931593,-0.09895134,0.08065216,0.2068318,0.2,0.18485591,0.007860499,9.8239,8.3357725,7.1196394,1.0249865,3.7613707,10.377833,9.664903,8.154922,0.7830427,5.4126797,4.631333,4.190825,0.45481482,5.832421,5.0199785,4.2612243,0.5617146,25.876968,21.945274,50,0.6029924,-0.03607917,-0.6878341,0.363043,0.050820414,0.05072013,0.05064529,6.734472e-05,12360,10.300411,9.586108,8.069872,0.7862939,0.0,0.0,0.0,0.0 diff --git a/tests/regression/awac/mujoco/id0_offline/script_name.txt b/tests/regression/awac/mujoco/id0_offline/script_name.txt new file mode 100644 index 000000000..f88034849 --- /dev/null +++ b/tests/regression/awac/mujoco/id0_offline/script_name.txt @@ -0,0 +1 @@ +tests/regression/awac/mujoco/test_awac_mujoco_offline.py \ No newline at end of file diff --git a/tests/regression/awac/mujoco/id0_offline/variant.json b/tests/regression/awac/mujoco/id0_offline/variant.json new file mode 100644 index 000000000..8534a9be3 --- /dev/null +++ b/tests/regression/awac/mujoco/id0_offline/variant.json @@ -0,0 +1,104 @@ +{ + "add_env_demos": false, + "add_env_offpolicy_data": false, + "algo_kwargs": { + "batch_size": 5, + "min_num_steps_before_training": 10, + "num_epochs": 5, + "num_eval_steps_per_epoch": 100, + "num_expl_steps_per_train_loop": 100, + "num_trains_per_train_loop": 10 + }, + "algorithm": "AWAC", + "batch_size": 5, + "collection_mode": "batch", + "debug": true, + "env_class": { + "$class": "rlkit.testing.stub_classes.StubEnv" + }, + "env_kwargs": { + "action_dim": 6, + "obs_dim": 17 + }, + "exp_id": 0, + "exp_name": "references/awac/mujoco/awac_offline1", + "layer_size": 256, + "load_demos": true, + "logger_config": { + "run_id": 0, + "snapshot_gap": 100, + "snapshot_mode": "gap" + }, + "max_path_length": 50, + "num_epochs": 0, + "num_layers": 2, + "path_loader_class": { + "$class": "rlkit.demos.source.dict_to_mdp_path_loader.DictToMDPPathLoader" + }, + "path_loader_kwargs": { + "demo_paths": [ + { + "data_split": 1, + "is_demo": true, + "obs_dict": false, + "path": "/home/ashvin/code/railrl-private/tests/regression/awac/mujoco/hc_action_noise_15.npy" + } + ] + }, + "policy_class": { + "$class": "rlkit.torch.sac.policies.gaussian_policy.GaussianPolicy" + }, + "policy_kwargs": { + "hidden_sizes": [ + 256, + 256, + 256, + 256 + ], + "max_log_std": 0, + "min_log_std": -6, + "std_architecture": "values" + }, + "pretrain_policy": false, + 
"pretrain_rl": true, + "pretraining_logging_period": 1, + "qf_kwargs": { + "hidden_sizes": [ + 256, + 256 + ] + }, + "replay_buffer_size": 1000000, + "run_id": 0, + "seed": "0", + "seedid": 0, + "train_rl": true, + "trainer_kwargs": { + "alpha": 0, + "awr_min_q": true, + "awr_sample_actions": false, + "awr_use_mle_for_vf": false, + "awr_weight": 1.0, + "bc_num_pretrain_steps": 0, + "bc_weight": 0.0, + "beta": 2, + "buffer_policy_sample_actions": false, + "compute_bc": false, + "discount": 0.99, + "policy_lr": 0.0003, + "policy_weight_decay": 0.0001, + "q_num_pretrain1_steps": 0, + "q_num_pretrain2_steps": 10, + "qf_lr": 0.0003, + "reparam_weight": 0.0, + "reward_scale": 1, + "soft_target_tau": 0.005, + "target_update_period": 1, + "train_bc_on_rl_buffer": false, + "use_automatic_entropy_tuning": false + }, + "trial_name": "id0", + "unique_id": "23a68c9c-d7b0-451a-a876-3a7bbeaf0266", + "use_validation_buffer": true, + "version": "normal" +} \ No newline at end of file diff --git a/tests/regression/awac/mujoco/test_awac_mujoco.py b/tests/regression/awac/mujoco/test_awac_mujoco.py new file mode 100644 index 000000000..aedc9bf9f --- /dev/null +++ b/tests/regression/awac/mujoco/test_awac_mujoco.py @@ -0,0 +1,38 @@ +"""Test AWAC online on Mujoco benchmark tasks. + +Data available for download: +https://drive.google.com/file/d/1edcuicVv2d-PqH1aZUVbO5CeRq3lqK89/view +""" +import os +import sys + +from examples.awac.mujoco.awac1 import main + +from rlkit.core import logger +from rlkit.testing import csv_util + +def test_awac_mujoco_online(): + cmd = "python examples/awac/mujoco/awac1.py --1 --local --gpu --run 0 --seed 0 --debug" + sys.argv = cmd.split(" ")[1:] + main() + + # check if online training results matches + reference_csv = "tests/regression/awac/mujoco/id0/pretrain_q.csv" + output_csv = os.path.join(logger.get_snapshot_dir(), "pretrain_q.csv") + print("comparing reference %s against output %s" % (reference_csv, output_csv)) + output = csv_util.get_exp(output_csv) + reference = csv_util.get_exp(reference_csv) + keys = ["trainer/batch", "trainer/Advantage Score Max", "trainer/Q1 Predictions Mean", "trainer/replay_buffer_len"] + csv_util.check_equal(reference, output, keys) + + # TODO: this test seems to have some extra stochasticity to control, perhaps from the env? + # check if online training results match + # reference_csv = "tests/regression/awac/mujoco/id0/progress.csv" + # output_csv = os.path.join(logger.get_snapshot_dir(), "progress.csv") + # output = csv_util.get_exp(output_csv) + # reference = csv_util.get_exp(reference_csv) + # keys = ["epoch", "expl/Average Returns", "trainer/Advantage Score Max", "trainer/Q1 Predictions Mean", "trainer/replay_buffer_len"] + # csv_util.check_equal(reference, output, keys) + +if __name__ == "__main__": + test_awac_mujoco_online() diff --git a/tests/regression/awac/mujoco/test_awac_mujoco_offline.py b/tests/regression/awac/mujoco/test_awac_mujoco_offline.py new file mode 100644 index 000000000..c2d6759ff --- /dev/null +++ b/tests/regression/awac/mujoco/test_awac_mujoco_offline.py @@ -0,0 +1,149 @@ +"""Test AWAC online on Mujoco benchmark tasks. 
+ +Data available for download: +https://drive.google.com/file/d/1edcuicVv2d-PqH1aZUVbO5CeRq3lqK89/view +""" + +import os +import sys + +from rlkit.demos.source.dict_to_mdp_path_loader import DictToMDPPathLoader +from rlkit.launchers.experiments.awac.awac_rl import experiment, process_args + +import rlkit.util.hyperparameter as hyp +from rlkit.launchers.arglauncher import run_variants + +from rlkit.torch.sac.policies import GaussianPolicy + +from rlkit.testing.stub_classes import StubEnv + +from rlkit.core import logger +from rlkit.testing import csv_util + +def main(): + variant = dict( + algo_kwargs=dict( + num_epochs=501, + num_eval_steps_per_epoch=1000, + num_trains_per_train_loop=1000, + num_expl_steps_per_train_loop=1000, + min_num_steps_before_training=1000, + batch_size=1024, + ), + max_path_length=1000, + replay_buffer_size=int(1E6), + layer_size=256, + num_layers=2, + algorithm="AWAC", + version="normal", + collection_mode='batch', + + policy_class=GaussianPolicy, + policy_kwargs=dict( + hidden_sizes=[256] * 4, + max_log_std=0, + min_log_std=-6, + std_architecture="values", + ), + qf_kwargs=dict( + hidden_sizes=[256, 256] + ), + trainer_kwargs=dict( + discount=0.99, + soft_target_tau=5e-3, + target_update_period=1, + policy_lr=3E-4, + qf_lr=3E-4, + reward_scale=1, + beta=1, + alpha=0, + use_automatic_entropy_tuning=False, + q_num_pretrain1_steps=0, + q_num_pretrain2_steps=50000, + policy_weight_decay=1e-4, + train_bc_on_rl_buffer=False, + buffer_policy_sample_actions=False, + + reparam_weight=0.0, + awr_weight=1.0, + bc_weight=0.0, + compute_bc=False, + awr_use_mle_for_vf=False, + awr_sample_actions=False, + awr_min_q=True, + ), + path_loader_kwargs=dict( + demo_paths=[ # these can be loaded in awac_rl.py per env + # dict( + # path='demos/ant_action_noise_15.npy', + # obs_dict=False, + # is_demo=True, + # train_split=.9, + # ), + ], + ), + path_loader_class=DictToMDPPathLoader, + + pretrain_rl=True, + use_validation_buffer=True, + add_env_demos=True, + add_env_offpolicy_data=True, + load_demos=True, + ) + + search_space = { + 'trainer_kwargs.beta':[2, ], + 'train_rl':[True], + 'pretrain_rl':[True], + 'pretrain_policy':[False], + # 'env_id': ['HalfCheetah-v2', 'Ant-v2', 'Walker2d-v2', ], + 'seedid': range(5), + + # make experiment short and only offline + 'batch_size': [5], + 'num_epochs': [0], + 'pretraining_logging_period': [1], + 'trainer_kwargs.q_num_pretrain2_steps': [10], + 'path_loader_kwargs.demo_paths': [ + [dict( + path=os.getcwd() + "/tests/regression/awac/mujoco/hc_action_noise_15.npy", + obs_dict=False, # misleading but this arg is really "unwrap_obs_dict" + is_demo=True, + data_split=1, + ),], + ], + 'env_class': [StubEnv], # replaces half-cheetah + 'env_kwargs': [dict( + obs_dim=17, + action_dim=6, + ),], + 'add_env_demos': [False], + 'add_env_offpolicy_data': [False], + } + sweeper = hyp.DeterministicHyperparameterSweeper( + search_space, default_parameters=variant, + ) + + variants = [] + for variant in sweeper.iterate_hyperparameters(): + variants.append(variant) + + run_variants(experiment, variants, process_args) + + +def test_awac_mujoco_offline(): + cmd = "python experiments/references/awac/mujoco/awac_offline1.py --1 --local --gpu --run 0 --seed 0 --debug" + sys.argv = cmd.split(" ")[1:] + main() + + # check if offline training results match + reference_csv = "tests/regression/awac/mujoco/id0_offline/pretrain_q.csv" + output_csv = os.path.join(logger.get_snapshot_dir(), "pretrain_q.csv") + print("comparing reference %s against output %s" % (reference_csv,
output_csv)) + output = csv_util.get_exp(output_csv) + reference = csv_util.get_exp(reference_csv) + keys = ["trainer/batch", "trainer/Advantage Score Max", "trainer/Q1 Predictions Mean", "trainer/replay_buffer_len"] + csv_util.check_equal(reference, output, keys) + +if __name__ == "__main__": + test_awac_mujoco_offline() diff --git a/tests/regression/basic/__init__.py b/tests/regression/basic/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/regression/basic/data_ddpg/debug.log b/tests/regression/basic/data_ddpg/debug.log new file mode 100644 index 000000000..bda40792f --- /dev/null +++ b/tests/regression/basic/data_ddpg/debug.log @@ -0,0 +1,142 @@ +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] ----------------------------------- --------------- +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] replay_buffer/size 11000 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/QF Loss 0.316443 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Policy Loss 0.000451458 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Raw Policy Loss 0.000451458 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Preactivation Policy Loss 0 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Q Predictions Mean -0.000400974 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Q Predictions Std 0.00013214 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Q Predictions Max -0.000268835 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Q Predictions Min -0.000533114 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Q Targets Mean 0.5249 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Q Targets Std 0.20112 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Q Targets Max 0.72602 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Q Targets Min 0.32378 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Bellman Errors Mean 0.316443 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Bellman Errors Std 0.211436 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Bellman Errors Max 0.527879 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Bellman Errors Min 0.105008 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Policy Action Mean 0 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Policy Action Std 0 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Policy Action Max 0 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Policy Action Min 0 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/num steps total 11000 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/num paths total 11 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/path length Mean 1000 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/path length Std 0 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/path length Max 1000 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/path length Min 1000 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Rewards Mean -0.158665 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Rewards Std 0.287571 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Rewards Max 0.870555 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Rewards Min -1.19111 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Returns Mean -158.665 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Returns Std 0 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] 
expl/Returns Max -158.665 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Returns Min -158.665 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Actions Mean 0.0159522 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Actions Std 0.544067 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Actions Max 1 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Actions Min -1 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Num Paths 1 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Average Returns -158.665 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/num steps total 1000 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/num paths total 1 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/path length Mean 1000 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/path length Std 0 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/path length Max 1000 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/path length Min 1000 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Rewards Mean -0.000567671 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Rewards Std 0.013464 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Rewards Max 0.132052 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Rewards Min -0.189206 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Returns Mean -0.567671 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Returns Std 0 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Returns Max -0.567671 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Returns Min -0.567671 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Actions Mean -2.57517e-05 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Actions Std 7.41497e-05 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Actions Max 0.000939435 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Actions Min -0.00118844 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Num Paths 1 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Average Returns -0.567671 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] time/epoch_time (s) 8.45713 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] time/evaluation sampling (s) 0.31752 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] time/exploration sampling (s) 0.355206 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] time/global_time (s) 8.48654 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] time/replay buffer data storing (s) 0.00572181 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] time/saving (s) 0.00324774 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] time/training (s) 4.18211 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] epoch 0 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] ----------------------------------- --------------- +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] ----------------------------------- -------------- +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] replay_buffer/size 12000 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/QF Loss 0.0694934 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Policy Loss 0.825903 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Raw Policy Loss 0.825903 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Preactivation Policy Loss 0 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Q Predictions Mean -0.808841 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Q 
Predictions Std 0.327327 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Q Predictions Max -0.481514 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Q Predictions Min -1.13617 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Q Targets Mean -0.761048 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Q Targets Std 0.586575 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Q Targets Max -0.174474 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Q Targets Min -1.34762 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Bellman Errors Mean 0.0694934 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Bellman Errors Std 0.0247802 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Bellman Errors Max 0.0942736 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Bellman Errors Min 0.0447132 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Policy Action Mean 0.0416398 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Policy Action Std 0.118181 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Policy Action Max 0.150039 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] trainer/Policy Action Min -0.146716 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/num steps total 12000 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/num paths total 12 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/path length Mean 1000 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/path length Std 0 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/path length Max 1000 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/path length Min 1000 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Rewards Mean -0.184372 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Rewards Std 0.373552 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Rewards Max 0.923601 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Rewards Min -2.0592 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Returns Mean -184.372 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Returns Std 0 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Returns Max -184.372 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Returns Min -184.372 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Actions Mean 0.0680128 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Actions Std 0.538613 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Actions Max 1 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Actions Min -1 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Num Paths 1 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] expl/Average Returns -184.372 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/num steps total 2000 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/num paths total 2 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/path length Mean 1000 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/path length Std 0 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/path length Max 1000 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/path length Min 1000 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Rewards Mean -0.0102187 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Rewards Std 0.0205813 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Rewards Max 0.382765 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Rewards Min 
-0.145901 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Returns Mean -10.2187 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Returns Std 0 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Returns Max -10.2187 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Returns Min -10.2187 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Actions Mean 0.0451494 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Actions Std 0.127398 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Actions Max 0.17525 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Actions Min -0.170668 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Num Paths 1 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] eval/Average Returns -10.2187 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] time/epoch_time (s) 3.37273 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] time/evaluation sampling (s) 0.341391 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] time/exploration sampling (s) 0.352289 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] time/global_time (s) 11.8699 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] time/replay buffer data storing (s) 0.00548053 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] time/saving (s) 0.00345588 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] time/training (s) 2.66486 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] epoch 1 +[name-of-experiment_2021_08_20_00_11_36_0000--s-0] ----------------------------------- -------------- diff --git a/tests/regression/basic/data_ddpg/progress.csv b/tests/regression/basic/data_ddpg/progress.csv new file mode 100644 index 000000000..56c829dbf --- /dev/null +++ b/tests/regression/basic/data_ddpg/progress.csv @@ -0,0 +1,3 @@ +epoch,eval/Actions Max,eval/Actions Mean,eval/Actions Min,eval/Actions Std,eval/Average Returns,eval/Num Paths,eval/Returns Max,eval/Returns Mean,eval/Returns Min,eval/Returns Std,eval/Rewards Max,eval/Rewards Mean,eval/Rewards Min,eval/Rewards Std,eval/num paths total,eval/num steps total,eval/path length Max,eval/path length Mean,eval/path length Min,eval/path length Std,expl/Actions Max,expl/Actions Mean,expl/Actions Min,expl/Actions Std,expl/Average Returns,expl/Num Paths,expl/Returns Max,expl/Returns Mean,expl/Returns Min,expl/Returns Std,expl/Rewards Max,expl/Rewards Mean,expl/Rewards Min,expl/Rewards Std,expl/num paths total,expl/num steps total,expl/path length Max,expl/path length Mean,expl/path length Min,expl/path length Std,replay_buffer/size,time/epoch_time (s),time/evaluation sampling (s),time/exploration sampling (s),time/global_time (s),time/replay buffer data storing (s),time/saving (s),time/training (s),trainer/Bellman Errors Max,trainer/Bellman Errors Mean,trainer/Bellman Errors Min,trainer/Bellman Errors Std,trainer/Policy Action Max,trainer/Policy Action Mean,trainer/Policy Action Min,trainer/Policy Action Std,trainer/Policy Loss,trainer/Preactivation Policy Loss,trainer/Q Predictions Max,trainer/Q Predictions Mean,trainer/Q Predictions Min,trainer/Q Predictions Std,trainer/Q Targets Max,trainer/Q Targets Mean,trainer/Q Targets Min,trainer/Q Targets Std,trainer/QF Loss,trainer/Raw Policy Loss 
+0,0.0009394348,-2.575173e-05,-0.0011884357,7.4149735e-05,-0.5676708947219838,1,-0.5676708947219838,-0.5676708947219838,-0.5676708947219838,0.0,0.13205208635980237,-0.0005676708947219878,-0.18920589871478904,0.01346403852392649,1,1000,1000,1000.0,1000,0.0,1.0,0.01595221522542903,-1.0,0.5440672682673976,-158.66463782825045,1,-158.66463782825045,-158.66463782825045,-158.66463782825045,0.0,0.8705545737722478,-0.1586646378282505,-1.1911117725806701,0.28757149125768267,11,11000,1000,1000.0,1000,0.0,11000,8.45713186264038,0.3175199031829834,0.3552060127258301,8.486541509628296,0.0057218074798583984,0.0032477378845214844,4.182106256484985,0.52787924,0.3164434,0.10500758,0.21143582,0.0,0.0,0.0,0.0,0.00045145772,0.0,-0.00026883493,-0.00040097444,-0.00053311395,0.00013213951,0.72601986,0.52489984,0.32377988,0.20111999,0.31644338,0.00045145772 +1,0.1752497,0.045149375,-0.17066804,0.12739834,-10.218731674245209,1,-10.218731674245209,-10.218731674245209,-10.218731674245209,0.0,0.38276471473529733,-0.010218731674245298,-0.14590142818814536,0.020581276243253184,2,2000,1000,1000.0,1000,0.0,1.0,0.06801284130544095,-1.0,0.5386126116190025,-184.3722984168325,1,-184.3722984168325,-184.3722984168325,-184.3722984168325,0.0,0.923600792459433,-0.1843722984168323,-2.0592014442312245,0.37355204241341083,12,12000,1000,1000.0,1000,0.0,12000,3.3727293014526367,0.34139132499694824,0.35228943824768066,11.869916439056396,0.005480527877807617,0.0034558773040771484,2.6648571491241455,0.09427358,0.06949337,0.044713162,0.02478021,0.1500395,0.041639764,-0.14671618,0.11818099,0.82590324,0.0,-0.4815136,-0.8088408,-1.136168,0.32732722,-0.17447355,-0.7610482,-1.3476229,0.5865747,0.069493376,0.82590324 diff --git a/tests/regression/basic/data_ddpg/variant.json b/tests/regression/basic/data_ddpg/variant.json new file mode 100644 index 000000000..4bfa7779a --- /dev/null +++ b/tests/regression/basic/data_ddpg/variant.json @@ -0,0 +1,32 @@ +{ + "algorithm_kwargs": { + "batch_size": 2, + "max_path_length": 1000, + "min_num_steps_before_training": 10000, + "num_epochs": 2, + "num_eval_steps_per_epoch": 1000, + "num_expl_steps_per_train_loop": 1000, + "num_trains_per_train_loop": 1000 + }, + "policy_kwargs": { + "hidden_sizes": [ + 2, + 2 + ] + }, + "qf_kwargs": { + "hidden_sizes": [ + 2, + 2 + ] + }, + "replay_buffer_size": 1000000, + "seed": 25580, + "trainer_kwargs": { + "discount": 0.99, + "policy_learning_rate": 0.0001, + "qf_learning_rate": 0.001, + "tau": 0.01, + "use_soft_update": true + } +} \ No newline at end of file diff --git a/tests/regression/basic/data_dqn/debug.log b/tests/regression/basic/data_dqn/debug.log new file mode 100644 index 000000000..75c29e1b1 --- /dev/null +++ b/tests/regression/basic/data_dqn/debug.log @@ -0,0 +1,112 @@ +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] ----------------------------------- -------------- +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] replay_buffer/size 2000 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] trainer/QF Loss 1.0002 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] trainer/Y Predictions Mean 2.7425e-05 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] trainer/Y Predictions Std 4.64033e-05 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] trainer/Y Predictions Max 7.38283e-05 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] trainer/Y Predictions Min -1.89784e-05 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/num steps total 2000 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/num paths total 30 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/path length Mean 55.5556 
+[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/path length Std 23.1114 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/path length Max 100 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/path length Min 22 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Rewards Mean 1 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Rewards Std 0 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Rewards Max 1 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Rewards Min 1 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Returns Mean 55.5556 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Returns Std 23.1114 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Returns Max 100 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Returns Min 22 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Actions Mean 0.51 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Actions Std 0.4999 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Actions Max 1 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Actions Min 0 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Num Paths 18 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Average Returns 55.5556 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/num steps total 4980 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/num paths total 80 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/path length Mean 62.25 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/path length Std 26.7748 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/path length Max 166 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/path length Min 22 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Rewards Mean 1 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Rewards Std 0 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Rewards Max 1 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Rewards Min 1 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Returns Mean 62.25 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Returns Std 26.7748 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Returns Max 166 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Returns Min 22 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Actions Mean 0.511044 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Actions Std 0.499878 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Actions Max 1 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Actions Min 0 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Num Paths 80 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Average Returns 62.25 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] time/epoch_time (s) 2.58453 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] time/evaluation sampling (s) 0.818132 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] time/exploration sampling (s) 0.162215 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] time/global_time (s) 2.75982 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] time/replay buffer data storing (s) 0.00668001 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] time/saving (s) 0.00590634 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] time/training (s) 1.38845 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] epoch 0 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] ----------------------------------- -------------- +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] ----------------------------------- ------------- +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] replay_buffer/size 3000 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] trainer/QF Loss 0.0011098 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] trainer/Y Predictions 
Mean 1.26111 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] trainer/Y Predictions Std 0.042638 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] trainer/Y Predictions Max 1.30375 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] trainer/Y Predictions Min 1.21848 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/num steps total 3000 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/num paths total 132 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/path length Mean 9.80392 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/path length Std 1.54691 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/path length Max 15 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/path length Min 2 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Rewards Mean 1 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Rewards Std 0 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Rewards Max 1 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Rewards Min 1 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Returns Mean 9.80392 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Returns Std 1.54691 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Returns Max 15 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Returns Min 2 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Actions Mean 0.947 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Actions Std 0.224033 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Actions Max 1 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Actions Min 0 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Num Paths 102 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] expl/Average Returns 9.80392 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/num steps total 9975 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/num paths total 617 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/path length Mean 9.30168 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/path length Std 0.773308 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/path length Max 11 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/path length Min 8 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Rewards Mean 1 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Rewards Std 0 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Rewards Max 1 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Rewards Min 1 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Returns Mean 9.30168 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Returns Std 0.773308 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Returns Max 11 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Returns Min 8 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Actions Mean 1 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Actions Std 0 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Actions Max 1 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Actions Min 1 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Num Paths 537 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] eval/Average Returns 9.30168 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] time/epoch_time (s) 2.31997 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] time/evaluation sampling (s) 0.810525 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] time/exploration sampling (s) 0.168954 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] time/global_time (s) 5.08436 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] time/replay buffer data storing (s) 0.0068624 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] time/saving (s) 0.00544405 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] time/training (s) 
1.31033 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] epoch 1 +[dqn-CartPole_2021_08_20_00_26_35_0000--s-0] ----------------------------------- ------------- diff --git a/tests/regression/basic/data_dqn/progress.csv b/tests/regression/basic/data_dqn/progress.csv new file mode 100644 index 000000000..4c23abfe7 --- /dev/null +++ b/tests/regression/basic/data_dqn/progress.csv @@ -0,0 +1,3 @@ +epoch,eval/Actions Max,eval/Actions Mean,eval/Actions Min,eval/Actions Std,eval/Average Returns,eval/Num Paths,eval/Returns Max,eval/Returns Mean,eval/Returns Min,eval/Returns Std,eval/Rewards Max,eval/Rewards Mean,eval/Rewards Min,eval/Rewards Std,eval/num paths total,eval/num steps total,eval/path length Max,eval/path length Mean,eval/path length Min,eval/path length Std,expl/Actions Max,expl/Actions Mean,expl/Actions Min,expl/Actions Std,expl/Average Returns,expl/Num Paths,expl/Returns Max,expl/Returns Mean,expl/Returns Min,expl/Returns Std,expl/Rewards Max,expl/Rewards Mean,expl/Rewards Min,expl/Rewards Std,expl/num paths total,expl/num steps total,expl/path length Max,expl/path length Mean,expl/path length Min,expl/path length Std,replay_buffer/size,time/epoch_time (s),time/evaluation sampling (s),time/exploration sampling (s),time/global_time (s),time/replay buffer data storing (s),time/saving (s),time/training (s),trainer/QF Loss,trainer/Y Predictions Max,trainer/Y Predictions Mean,trainer/Y Predictions Min,trainer/Y Predictions Std +0,1,0.5110441767068273,0,0.4998780112796205,62.25,80,166.0,62.25,22.0,26.774754900838964,1.0,1.0,1.0,0.0,80,4980,166,62.25,22,26.774754900838964,1,0.51,0,0.4998999899979995,55.55555555555556,18,100.0,55.55555555555556,22.0,23.111378203584827,1.0,1.0,1.0,0.0,30,2000,100,55.55555555555556,22,23.111378203584827,2000,2.584531307220459,0.818131685256958,0.1622154712677002,2.7598235607147217,0.006680011749267578,0.005906343460083008,1.3884549140930176,1.0001953,7.382831e-05,2.742497e-05,-1.8978371e-05,4.6403344e-05 +1,1,1.0,1,0.0,9.30167597765363,537,11.0,9.30167597765363,8.0,0.7733075972008009,1.0,1.0,1.0,0.0,617,9975,11,9.30167597765363,8,0.7733075972008009,1,0.947,0,0.22403347964087864,9.803921568627452,102,15.0,9.803921568627452,2.0,1.5469084645738056,1.0,1.0,1.0,0.0,132,3000,15,9.803921568627452,2,1.5469084645738056,3000,2.3199687004089355,0.8105249404907227,0.16895389556884766,5.084362745285034,0.0068624019622802734,0.005444049835205078,1.3103294372558594,0.0011097997,1.3037528,1.2611148,1.2184768,0.042638004 diff --git a/tests/regression/basic/data_dqn/variant.json b/tests/regression/basic/data_dqn/variant.json new file mode 100644 index 000000000..6398a1832 --- /dev/null +++ b/tests/regression/basic/data_dqn/variant.json @@ -0,0 +1,20 @@ +{ + "algorithm": "DQN", + "algorithm_kwargs": { + "batch_size": 2, + "max_path_length": 1000, + "min_num_steps_before_training": 1000, + "num_epochs": 2, + "num_eval_steps_per_epoch": 5000, + "num_expl_steps_per_train_loop": 1000, + "num_trains_per_train_loop": 1000 + }, + "layer_size": 256, + "replay_buffer_size": 1000000, + "seed": 25580, + "trainer_kwargs": { + "discount": 0.99, + "learning_rate": 0.0003 + }, + "version": "normal" +} \ No newline at end of file diff --git a/tests/regression/basic/data_sac/debug.log b/tests/regression/basic/data_sac/debug.log new file mode 100644 index 000000000..0e4cf3d0f --- /dev/null +++ b/tests/regression/basic/data_sac/debug.log @@ -0,0 +1,170 @@ +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] ----------------------------------- -------------- 
+[name-of-experiment_2021_08_20_00_32_20_0000--s-0] replay_buffer/size 2000 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/num train calls 1000 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/QF1 Loss 13.5868 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/QF2 Loss 13.6571 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Policy Loss -3.69907 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Q1 Predictions Mean 0.00934327 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Q1 Predictions Std 0.00314975 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Q1 Predictions Max 0.012493 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Q1 Predictions Min 0.00619352 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Q2 Predictions Mean -0.00216014 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Q2 Predictions Std 0.0042975 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Q2 Predictions Max 0.00213736 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Q2 Predictions Min -0.00645764 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Q Targets Mean 3.59894 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Q Targets Std 0.834459 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Q Targets Max 4.43339 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Q Targets Min 2.76448 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Log Pis Mean -3.70052 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Log Pis Std 0.28082 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Log Pis Max -3.4197 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Log Pis Min -3.98134 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/policy/mean Mean -0.000228407 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/policy/mean Std 0.00185078 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/policy/mean Max 0.0028966 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/policy/mean Min -0.00292742 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/policy/normal/std Mean 1 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/policy/normal/std Std 0.00110054 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/policy/normal/std Max 1.00152 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/policy/normal/std Min 0.998095 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/policy/normal/log_std Mean -3.97151e-07 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/policy/normal/log_std Std 0.00110076 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/policy/normal/log_std Max 0.00152293 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/policy/normal/log_std Min -0.00190684 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Alpha 1 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Alpha Loss -0 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/num steps total 2000 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/num paths total 2 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/path length Mean 1000 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/path length Std 0 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/path length Max 1000 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/path length Min 1000 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Rewards Mean -0.186742 
+[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Rewards Std 0.653587 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Rewards Max 1.89113 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Rewards Min -2.12768 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Returns Mean -186.742 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Returns Std 0 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Returns Max -186.742 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Returns Min -186.742 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Actions Mean 0.000695572 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Actions Std 0.628869 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Actions Max 0.998728 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Actions Min -0.998485 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Num Paths 1 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Average Returns -186.742 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/num steps total 5000 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/num paths total 5 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/path length Mean 1000 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/path length Std 0 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/path length Max 1000 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/path length Min 1000 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Rewards Mean -0.000277411 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Rewards Std 0.0140067 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Rewards Max 0.241348 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Rewards Min -0.289714 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Returns Mean -0.277411 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Returns Std 0.547906 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Returns Max 0.679503 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Returns Min -0.839934 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Actions Mean -1.35809e-05 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Actions Std 3.58651e-05 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Actions Max 0.000581184 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Actions Min -0.00086283 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Num Paths 5 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Average Returns -0.277411 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] time/epoch_time (s) 12.7593 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] time/evaluation sampling (s) 2.09937 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] time/exploration sampling (s) 0.49046 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] time/global_time (s) 12.837 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] time/replay buffer data storing (s) 0.0068388 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] time/saving (s) 0.0087781 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] time/training (s) 9.64209 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] epoch 0 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] ----------------------------------- -------------- +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] ----------------------------------- --------------- +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] replay_buffer/size 3000 
+[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/num train calls 2000 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/QF1 Loss 0.835819 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/QF2 Loss 0.850335 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Policy Loss -14.3326 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Q1 Predictions Mean 11.4556 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Q1 Predictions Std 0.749654 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Q1 Predictions Max 12.2053 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Q1 Predictions Min 10.706 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Q2 Predictions Mean 11.4674 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Q2 Predictions Std 0.713215 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Q2 Predictions Max 12.1806 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Q2 Predictions Min 10.7542 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Q Targets Mean 10.5484 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Q Targets Std 0.636489 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Q Targets Max 11.1849 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Q Targets Min 9.91194 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Log Pis Mean -4.12219 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Log Pis Std 0.378276 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Log Pis Max -3.74392 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Log Pis Min -4.50047 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/policy/mean Mean -0.0206859 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/policy/mean Std 0.0564667 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/policy/mean Max 0.0648408 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/policy/mean Min -0.133037 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/policy/normal/std Mean 0.892203 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/policy/normal/std Std 0.0442898 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/policy/normal/std Max 0.98269 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/policy/normal/std Min 0.818071 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/policy/normal/log_std Mean -0.115284 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/policy/normal/log_std Std 0.0493709 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/policy/normal/log_std Max -0.0174612 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/policy/normal/log_std Min -0.200806 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Alpha 0.740733 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] trainer/Alpha Loss -3.03782 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/num steps total 3000 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/num paths total 3 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/path length Mean 1000 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/path length Std 0 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/path length Max 1000 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/path length Min 1000 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Rewards Mean -0.403968 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Rewards Std 0.73118 
+[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Rewards Max 2.37418 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Rewards Min -2.7083 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Returns Mean -403.968 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Returns Std 0 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Returns Max -403.968 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Returns Min -403.968 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Actions Mean -0.0203944 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Actions Std 0.596303 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Actions Max 0.998637 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Actions Min -0.996339 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Num Paths 1 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] expl/Average Returns -403.968 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/num steps total 10000 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/num paths total 10 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/path length Mean 1000 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/path length Std 0 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/path length Max 1000 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/path length Min 1000 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Rewards Mean -8.18254e-05 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Rewards Std 0.0150251 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Rewards Max 0.24441 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Rewards Min -0.322935 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Returns Mean -0.0818254 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Returns Std 0.515369 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Returns Max 0.581243 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Returns Min -0.878872 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Actions Mean -0.00158871 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Actions Std 0.0078556 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Actions Max 0.0200246 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Actions Min -0.0494994 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Num Paths 5 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] eval/Average Returns -0.0818254 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] time/epoch_time (s) 12.8613 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] time/evaluation sampling (s) 2.13829 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] time/exploration sampling (s) 0.487076 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] time/global_time (s) 25.7049 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] time/replay buffer data storing (s) 0.00657558 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] time/saving (s) 0.00871992 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] time/training (s) 10.2043 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] epoch 1 +[name-of-experiment_2021_08_20_00_32_20_0000--s-0] ----------------------------------- --------------- diff --git a/tests/regression/basic/data_sac/progress.csv b/tests/regression/basic/data_sac/progress.csv new file mode 100644 index 000000000..f56544a5c --- /dev/null +++ b/tests/regression/basic/data_sac/progress.csv @@ -0,0 +1,3 @@ +epoch,eval/Actions Max,eval/Actions Mean,eval/Actions 
Min,eval/Actions Std,eval/Average Returns,eval/Num Paths,eval/Returns Max,eval/Returns Mean,eval/Returns Min,eval/Returns Std,eval/Rewards Max,eval/Rewards Mean,eval/Rewards Min,eval/Rewards Std,eval/num paths total,eval/num steps total,eval/path length Max,eval/path length Mean,eval/path length Min,eval/path length Std,expl/Actions Max,expl/Actions Mean,expl/Actions Min,expl/Actions Std,expl/Average Returns,expl/Num Paths,expl/Returns Max,expl/Returns Mean,expl/Returns Min,expl/Returns Std,expl/Rewards Max,expl/Rewards Mean,expl/Rewards Min,expl/Rewards Std,expl/num paths total,expl/num steps total,expl/path length Max,expl/path length Mean,expl/path length Min,expl/path length Std,replay_buffer/size,time/epoch_time (s),time/evaluation sampling (s),time/exploration sampling (s),time/global_time (s),time/replay buffer data storing (s),time/saving (s),time/training (s),trainer/Alpha,trainer/Alpha Loss,trainer/Log Pis Max,trainer/Log Pis Mean,trainer/Log Pis Min,trainer/Log Pis Std,trainer/Policy Loss,trainer/Q Targets Max,trainer/Q Targets Mean,trainer/Q Targets Min,trainer/Q Targets Std,trainer/Q1 Predictions Max,trainer/Q1 Predictions Mean,trainer/Q1 Predictions Min,trainer/Q1 Predictions Std,trainer/Q2 Predictions Max,trainer/Q2 Predictions Mean,trainer/Q2 Predictions Min,trainer/Q2 Predictions Std,trainer/QF1 Loss,trainer/QF2 Loss,trainer/num train calls,trainer/policy/mean Max,trainer/policy/mean Mean,trainer/policy/mean Min,trainer/policy/mean Std,trainer/policy/normal/log_std Max,trainer/policy/normal/log_std Mean,trainer/policy/normal/log_std Min,trainer/policy/normal/log_std Std,trainer/policy/normal/std Max,trainer/policy/normal/std Mean,trainer/policy/normal/std Min,trainer/policy/normal/std Std +0,0.0005811839,-1.358085e-05,-0.0008628296,3.58651e-05,-0.27741123351005426,5,0.679502600817786,-0.27741123351005426,-0.8399343828093677,0.5479060059943178,0.24134825784457262,-0.00027741123351005627,-0.28971371612459795,0.01400672126605945,5,5000,1000,1000.0,1000,0.0,0.9987282,0.0006955716,-0.9984847,0.62886876,-186.74236183371255,1,-186.74236183371255,-186.74236183371255,-186.74236183371255,0.0,1.8911265554202854,-0.18674236183371243,-2.127676507647777,0.6535867865010028,2,2000,1000,1000.0,1000,0.0,2000,12.759305953979492,2.0993728637695312,0.4904603958129883,12.837042570114136,0.006838798522949219,0.008778095245361328,9.642087459564209,1.0,-0.0,-3.4196992,-3.7005188,-3.9813385,0.28081965,-3.6990724,4.433394,3.5989351,2.7644763,0.8344588,0.012493022,0.009343271,0.00619352,0.0031497509,0.002137361,-0.0021601378,-0.0064576366,0.0042974986,13.586757,13.657056,1000,0.0028965985,-0.00022840746,-0.0029274235,0.0018507849,0.0015229306,-3.9715087e-07,-0.001906841,0.0011007573,1.0015241,1.0000001,0.998095,0.0011005357 
+1,0.020024562,-0.0015887116,-0.04949943,0.007855596,-0.08182537464844046,5,0.5812429864923967,-0.08182537464844046,-0.8788718056921059,0.5153690810941282,0.24440954150111469,-8.18253746484356e-05,-0.32293497251885983,0.015025099521363234,10,10000,1000,1000.0,1000,0.0,0.9986366,-0.020394377,-0.99633896,0.5963033,-403.9682654062123,1,-403.9682654062123,-403.9682654062123,-403.9682654062123,0.0,2.3741750786717035,-0.40396826540621295,-2.7082982579696493,0.731179885819233,3,3000,1000,1000.0,1000,0.0,3000,12.861341714859009,2.1382904052734375,0.4870762825012207,25.704946041107178,0.006575584411621094,0.008719921112060547,10.20429277420044,0.7407328486442566,-3.0378241539001465,-3.7439165,-4.122193,-4.500469,0.37827635,-14.332607,11.184919,10.54843,9.911942,0.6364889,12.205285,11.455631,10.7059765,0.7496543,12.180584,11.467369,10.754154,0.7132149,0.8358188,0.85033506,2000,0.06484082,-0.020685883,-0.13303688,0.05646669,-0.01746117,-0.115284264,-0.20080556,0.049370933,0.9826904,0.89220315,0.8180715,0.04428978 diff --git a/tests/regression/basic/data_sac/variant.json b/tests/regression/basic/data_sac/variant.json new file mode 100644 index 000000000..56ba0bfd5 --- /dev/null +++ b/tests/regression/basic/data_sac/variant.json @@ -0,0 +1,37 @@ +{ + "algorithm": "SAC", + "algorithm_kwargs": { + "batch_size": 2, + "max_path_length": 1000, + "min_num_steps_before_training": 1000, + "num_epochs": 2, + "num_eval_steps_per_epoch": 5000, + "num_expl_steps_per_train_loop": 1000, + "num_trains_per_train_loop": 1000 + }, + "layer_size": 256, + "policy_kwargs": { + "hidden_sizes": [ + 2, + 2 + ] + }, + "qf_kwargs": { + "hidden_sizes": [ + 2, + 2 + ] + }, + "replay_buffer_size": 1000000, + "seed": 25580, + "trainer_kwargs": { + "discount": 0.99, + "policy_lr": 0.0003, + "qf_lr": 0.0003, + "reward_scale": 1, + "soft_target_tau": 0.005, + "target_update_period": 1, + "use_automatic_entropy_tuning": true + }, + "version": "normal" +} \ No newline at end of file diff --git a/tests/regression/basic/data_td3/debug.log b/tests/regression/basic/data_td3/debug.log new file mode 100644 index 000000000..81c75bdf6 --- /dev/null +++ b/tests/regression/basic/data_td3/debug.log @@ -0,0 +1,156 @@ +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] ----------------------------------- -------------- +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] replay_buffer/size 2000 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/QF1 Loss 0.0198549 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/QF2 Loss 0.0197657 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Policy Loss -0.000842675 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Q1 Predictions Mean -0.000233052 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Q1 Predictions Std 0.000233052 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Q1 Predictions Max 0 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Q1 Predictions Min -0.000466104 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Q2 Predictions Mean -2.02651e-05 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Q2 Predictions Std 2.02651e-05 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Q2 Predictions Max 0 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Q2 Predictions Min 
-4.05302e-05 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Q Targets Mean 0.0557477 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Q Targets Std 0.129077 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Q Targets Max 0.184825 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Q Targets Min -0.0733291 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Bellman Errors 1 Mean 0.0198549 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Bellman Errors 1 Std 0.0144777 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Bellman Errors 1 Max 0.0343327 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Bellman Errors 1 Min 0.00537716 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Bellman Errors 2 Mean 0.0197657 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Bellman Errors 2 Std 0.0143945 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Bellman Errors 2 Max 0.0341602 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Bellman Errors 2 Min 0.00537122 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Policy Action Mean 0 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Policy Action Std 0 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Policy Action Max 0 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Policy Action Min 0 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/num steps total 2000 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/num paths total 2 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/path length Mean 1000 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/path length Std 0 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/path length Max 1000 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/path length Min 1000 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Rewards Mean -0.00826605 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Rewards Std 0.120397 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Rewards Max 0.349028 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Rewards Min -0.408541 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Returns Mean -8.26605 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Returns Std 0 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Returns Max -8.26605 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Returns Min -8.26605 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Actions Mean 0.00147068 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Actions Std 0.0995527 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Actions Max 0.382974 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Actions Min -0.355687 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Num Paths 1 
+[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Average Returns -8.26605 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/num steps total 5000 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/num paths total 5 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/path length Mean 1000 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/path length Std 0 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/path length Max 1000 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/path length Min 1000 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Rewards Mean -0.000278785 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Rewards Std 0.0140093 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Rewards Max 0.241317 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Rewards Min -0.289879 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Returns Mean -0.278785 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Returns Std 0.548003 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Returns Max 0.6795 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Returns Min -0.84064 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Actions Mean 0 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Actions Std 0 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Actions Max 0 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Actions Min 0 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Num Paths 5 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Average Returns -0.278785 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] time/epoch_time (s) 6.25102 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] time/evaluation sampling (s) 1.51935 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] time/exploration sampling (s) 0.335109 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] time/global_time (s) 6.29512 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] time/replay buffer data storing (s) 0.00528622 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] time/saving (s) 0.0049026 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] time/training (s) 4.02953 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] epoch 0 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] ----------------------------------- -------------- +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] ----------------------------------- --------------- +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] replay_buffer/size 3000 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/QF1 Loss 0.0425722 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/QF2 Loss 0.0551591 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Policy Loss -0.0736262 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Q1 Predictions Mean 0.0102167 
+[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Q1 Predictions Std 0.0652914 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Q1 Predictions Max 0.075508 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Q1 Predictions Min -0.0550747 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Q2 Predictions Mean 0.000761375 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Q2 Predictions Std 0.0348157 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Q2 Predictions Max 0.035577 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Q2 Predictions Min -0.0340543 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Q Targets Mean 0.158049 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Q Targets Std 0.209228 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Q Targets Max 0.367277 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Q Targets Min -0.0511789 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Bellman Errors 1 Mean 0.0425722 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Bellman Errors 1 Std 0.042557 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Bellman Errors 1 Max 0.0851292 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Bellman Errors 1 Min 1.5177e-05 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Bellman Errors 2 Mean 0.0551591 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Bellman Errors 2 Std 0.0548659 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Bellman Errors 2 Max 0.110025 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Bellman Errors 2 Min 0.000293254 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Policy Action Mean 0.00576749 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Policy Action Std 0.510843 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Policy Action Max 0.645545 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] trainer/Policy Action Min -0.653989 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/num steps total 3000 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/num paths total 3 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/path length Mean 1000 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/path length Std 0 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/path length Max 1000 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/path length Min 1000 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Rewards Mean -0.166073 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Rewards Std 0.130721 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Rewards Max 0.234898 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Rewards Min -0.670388 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Returns Mean -166.073 
+[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Returns Std 0 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Returns Max -166.073 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Returns Min -166.073 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Actions Mean 0.00600685 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Actions Std 0.52099 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Actions Max 0.963568 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Actions Min -1 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Num Paths 1 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] expl/Average Returns -166.073 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/num steps total 10000 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/num paths total 10 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/path length Mean 1000 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/path length Std 0 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/path length Max 1000 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/path length Min 1000 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Rewards Mean -0.157461 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Rewards Std 0.0225605 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Rewards Max 0.0405168 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Rewards Min -0.645774 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Returns Mean -157.461 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Returns Std 0.433748 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Returns Max -156.947 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Returns Min -158.211 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Actions Mean 0.0057675 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Actions Std 0.510843 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Actions Max 0.645545 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Actions Min -0.653989 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Num Paths 5 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] eval/Average Returns -157.461 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] time/epoch_time (s) 6.82463 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] time/evaluation sampling (s) 1.58153 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] time/exploration sampling (s) 0.33517 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] time/global_time (s) 13.1261 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] time/replay buffer data storing (s) 0.00542688 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] time/saving (s) 0.00517988 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] time/training (s) 4.88098 
+[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] epoch 1 +[rlkit-post-refactor-td3-half-cheetah_2021_08_20_00_21_31_0000--s-0] ----------------------------------- --------------- diff --git a/tests/regression/basic/data_td3/progress.csv b/tests/regression/basic/data_td3/progress.csv new file mode 100644 index 000000000..dfa138f12 --- /dev/null +++ b/tests/regression/basic/data_td3/progress.csv @@ -0,0 +1,3 @@ +epoch,eval/Actions Max,eval/Actions Mean,eval/Actions Min,eval/Actions Std,eval/Average Returns,eval/Num Paths,eval/Returns Max,eval/Returns Mean,eval/Returns Min,eval/Returns Std,eval/Rewards Max,eval/Rewards Mean,eval/Rewards Min,eval/Rewards Std,eval/num paths total,eval/num steps total,eval/path length Max,eval/path length Mean,eval/path length Min,eval/path length Std,expl/Actions Max,expl/Actions Mean,expl/Actions Min,expl/Actions Std,expl/Average Returns,expl/Num Paths,expl/Returns Max,expl/Returns Mean,expl/Returns Min,expl/Returns Std,expl/Rewards Max,expl/Rewards Mean,expl/Rewards Min,expl/Rewards Std,expl/num paths total,expl/num steps total,expl/path length Max,expl/path length Mean,expl/path length Min,expl/path length Std,replay_buffer/size,time/epoch_time (s),time/evaluation sampling (s),time/exploration sampling (s),time/global_time (s),time/replay buffer data storing (s),time/saving (s),time/training (s),trainer/Bellman Errors 1 Max,trainer/Bellman Errors 1 Mean,trainer/Bellman Errors 1 Min,trainer/Bellman Errors 1 Std,trainer/Bellman Errors 2 Max,trainer/Bellman Errors 2 Mean,trainer/Bellman Errors 2 Min,trainer/Bellman Errors 2 Std,trainer/Policy Action Max,trainer/Policy Action Mean,trainer/Policy Action Min,trainer/Policy Action Std,trainer/Policy Loss,trainer/Q Targets Max,trainer/Q Targets Mean,trainer/Q Targets Min,trainer/Q Targets Std,trainer/Q1 Predictions Max,trainer/Q1 Predictions Mean,trainer/Q1 Predictions Min,trainer/Q1 Predictions Std,trainer/Q2 Predictions Max,trainer/Q2 Predictions Mean,trainer/Q2 Predictions Min,trainer/Q2 Predictions Std,trainer/QF1 Loss,trainer/QF2 Loss +0,0.0,0.0,0.0,0.0,-0.2787846557649052,5,0.6794999918771203,-0.2787846557649052,-0.8406401498369198,0.5480029988483005,0.2413171250821968,-0.0002787846557649049,-0.28987948135644215,0.014009323460651259,5,5000,1000,1000.0,1000,0.0,0.3829738402811096,0.0014706836856775468,-0.35568701205548586,0.09955273459174718,-8.266052198817784,1,-8.266052198817784,-8.266052198817784,-8.266052198817784,0.0,0.34902824897936163,-0.008266052198817781,-0.4085406559252864,0.12039725106027663,2,2000,1000,1000.0,1000,0.0,2000,6.251018285751343,1.519348382949829,0.33510899543762207,6.295122385025024,0.005286216735839844,0.00490260124206543,4.02953028678894,0.034332663,0.019854914,0.005377164,0.014477749,0.03416015,0.019765684,0.005371222,0.014394463,0.0,0.0,0.0,0.0,-0.00084267475,0.18482465,0.055747747,-0.07332915,0.1290769,0.0,-0.0002330518,-0.0004661036,0.0002330518,0.0,-2.0265119e-05,-4.0530238e-05,2.0265119e-05,0.019854914,0.019765684 
+1,0.64554536,0.005767496,-0.6539888,0.5108426,-157.46104887113364,5,-156.9470033800888,-157.46104887113364,-158.21080747766192,0.4337479348270325,0.04051678583571533,-0.15746104887113463,-0.6457742223258414,0.022560546418144544,10,10000,1000,1000.0,1000,0.0,0.9635679193102207,0.006006854040585884,-1.0,0.5209904912201095,-166.0727840984679,1,-166.0727840984679,-166.0727840984679,-166.0727840984679,0.0,0.2348981855526169,-0.16607278409846776,-0.6703879093539248,0.13072079968290107,3,3000,1000,1000.0,1000,0.0,3000,6.82463264465332,1.5815272331237793,0.33517003059387207,13.126054286956787,0.005426883697509766,0.005179882049560547,4.880982398986816,0.08512924,0.042572208,1.5176972e-05,0.04255703,0.110025,0.05515913,0.00029325375,0.054865874,0.64554536,0.0057674893,-0.6539888,0.5108426,-0.073626205,0.3672772,0.15804914,-0.05117894,0.20922807,0.07550805,0.010216674,-0.055074703,0.065291375,0.035577036,0.00076137483,-0.034054287,0.03481566,0.042572208,0.05515913 diff --git a/tests/regression/basic/data_td3/variant.json b/tests/regression/basic/data_td3/variant.json new file mode 100644 index 000000000..509e843c2 --- /dev/null +++ b/tests/regression/basic/data_td3/variant.json @@ -0,0 +1,28 @@ +{ + "algorithm_kwargs": { + "batch_size": 2, + "max_path_length": 1000, + "min_num_steps_before_training": 1000, + "num_epochs": 2, + "num_eval_steps_per_epoch": 5000, + "num_expl_steps_per_train_loop": 1000, + "num_trains_per_train_loop": 1000 + }, + "policy_kwargs": { + "hidden_sizes": [ + 2, + 2 + ] + }, + "qf_kwargs": { + "hidden_sizes": [ + 2, + 2 + ] + }, + "replay_buffer_size": 1000000, + "seed": 25580, + "trainer_kwargs": { + "discount": 0.99 + } +} \ No newline at end of file diff --git a/tests/regression/basic/test_ddpg.py b/tests/regression/basic/test_ddpg.py new file mode 100644 index 000000000..e9a9c74bf --- /dev/null +++ b/tests/regression/basic/test_ddpg.py @@ -0,0 +1,30 @@ +import os +import sys + +from examples import ddpg + +from rlkit.core import logger +from rlkit.testing import csv_util + +def test_ddpg_online(): + logger.reset() + + # make tests small by mutating variant + ddpg.variant["algorithm_kwargs"]["num_epochs"] = 2 + ddpg.variant["algorithm_kwargs"]["batch_size"] = 2 + ddpg.variant["qf_kwargs"] = dict(hidden_sizes=[2, 2]) + ddpg.variant["policy_kwargs"] = dict(hidden_sizes=[2, 2]) + ddpg.variant["seed"] = 25580 + + ddpg.main() + + reference_csv = "tests/regression/basic/data_ddpg/progress.csv" + output_csv = os.path.join(logger.get_snapshot_dir(), "progress.csv") + print("comparing reference %s against output %s" % (reference_csv, output_csv)) + output = csv_util.get_exp(output_csv) + reference = csv_util.get_exp(reference_csv) + keys = ["epoch", "expl/num steps total", "eval/Average Returns", "trainer/Q Predictions Mean", ] + csv_util.check_equal(reference, output, keys) + +if __name__ == "__main__": + test_ddpg_online() diff --git a/tests/regression/basic/test_dqn.py b/tests/regression/basic/test_dqn.py new file mode 100644 index 000000000..21469c65b --- /dev/null +++ b/tests/regression/basic/test_dqn.py @@ -0,0 +1,29 @@ +import os +import sys + +from examples import dqn_and_double_dqn as dqn + +from rlkit.core import logger +from rlkit.testing import csv_util + +def test_dqn_online(): + logger.reset() + + # make tests small by mutating variant + dqn.variant["algorithm_kwargs"]["num_epochs"] = 2 + dqn.variant["algorithm_kwargs"]["batch_size"] = 2 + dqn.variant["seed"] = 25580 + + dqn.main() + + # TODO: there is an extra source of randomness so it doesn't match numerically + 
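# as a workaround, only deterministic counters (epoch, expl/num steps total) are compared below +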
reference_csv = "tests/regression/basic/data_dqn/progress.csv" + output_csv = os.path.join(logger.get_snapshot_dir(), "progress.csv") + print("comparing reference %s against output %s" % (reference_csv, output_csv)) + output = csv_util.get_exp(output_csv) + reference = csv_util.get_exp(reference_csv) + keys = ["epoch", "expl/num steps total", ] # "eval/Average Returns", "trainer/QF Loss"] + csv_util.check_equal(reference, output, keys) + +if __name__ == "__main__": + test_dqn_online() diff --git a/tests/regression/basic/test_sac.py b/tests/regression/basic/test_sac.py new file mode 100644 index 000000000..80132e520 --- /dev/null +++ b/tests/regression/basic/test_sac.py @@ -0,0 +1,30 @@ +import os +import sys + +from examples import sac + +from rlkit.core import logger +from rlkit.testing import csv_util + +def test_sac_online(): + logger.reset() + + # make tests small by mutating variant + sac.variant["algorithm_kwargs"]["num_epochs"] = 2 + sac.variant["algorithm_kwargs"]["batch_size"] = 2 + sac.variant["qf_kwargs"] = dict(hidden_sizes=[2, 2]) + sac.variant["policy_kwargs"] = dict(hidden_sizes=[2, 2]) + sac.variant["seed"] = 25580 + + sac.main() + + reference_csv = "tests/regression/basic/data_sac/progress.csv" + output_csv = os.path.join(logger.get_snapshot_dir(), "progress.csv") + print("comparing reference %s against output %s" % (reference_csv, output_csv)) + output = csv_util.get_exp(output_csv) + reference = csv_util.get_exp(reference_csv) + keys = ["epoch", "expl/num steps total", "eval/Average Returns", "trainer/Q1 Predictions Mean", ] + csv_util.check_equal(reference, output, keys) + +if __name__ == "__main__": + test_sac_online() diff --git a/tests/regression/basic/test_td3.py b/tests/regression/basic/test_td3.py new file mode 100644 index 000000000..728f7338f --- /dev/null +++ b/tests/regression/basic/test_td3.py @@ -0,0 +1,30 @@ +import os +import sys + +from examples import td3 + +from rlkit.core import logger +from rlkit.testing import csv_util + +def test_td3_online(): + logger.reset() + + # make tests small by mutating variant + td3.variant["algorithm_kwargs"]["num_epochs"] = 2 + td3.variant["algorithm_kwargs"]["batch_size"] = 2 + td3.variant["qf_kwargs"] = dict(hidden_sizes=[2, 2]) + td3.variant["policy_kwargs"] = dict(hidden_sizes=[2, 2]) + td3.variant["seed"] = 25580 + + td3.main() + + reference_csv = "tests/regression/basic/data_td3/progress.csv" + output_csv = os.path.join(logger.get_snapshot_dir(), "progress.csv") + print("comparing reference %s against output %s" % (reference_csv, output_csv)) + output = csv_util.get_exp(output_csv) + reference = csv_util.get_exp(reference_csv) + keys = ["epoch", "expl/num steps total", "eval/Average Returns", "trainer/Q1 Predictions Mean", ] + csv_util.check_equal(reference, output, keys) + +if __name__ == "__main__": + test_td3_online() diff --git a/tests/regression/her/__init__.py b/tests/regression/her/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/regression/her/data_her_dqn/debug.log b/tests/regression/her/data_her_dqn/debug.log new file mode 100644 index 000000000..5280a232f --- /dev/null +++ b/tests/regression/her/data_her_dqn/debug.log @@ -0,0 +1,110 @@ +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] ----------------------------------- ------------- +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] trainer/QF Loss 5.38033 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] trainer/Y Predictions Mean 0.000245351 
+[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] trainer/Y Predictions Std 0 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] trainer/Y Predictions Max 0.000245351 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] trainer/Y Predictions Min 0.000245351 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/num steps total 20 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/num paths total 4 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/path length Mean 5 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/path length Std 0 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/path length Max 5 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/path length Min 5 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Rewards Mean -2.44 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Rewards Std 0.04 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Rewards Max -2.4 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Rewards Min -2.48 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Returns Mean -12.2 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Returns Std 0.2 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Returns Max -12 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Returns Min -12.4 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Actions Mean 2.4 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Actions Std 0.916515 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Actions Max 3 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Actions Min 1 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Num Paths 2 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Average Returns -12.2 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/num steps total 10 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/num paths total 2 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/path length Mean 5 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/path length Std 0 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/path length Max 5 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/path length Min 5 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Rewards Mean -2.176 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Rewards Std 0.285769 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Rewards Max -1.68 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Rewards Min -2.48 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Returns Mean -10.88 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Returns Std 0.72 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Returns Max -10.16 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Returns Min -11.6 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Actions Mean 1 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Actions Std 0 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Actions Max 1 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Actions Min 1 
+[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Num Paths 2 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Average Returns -10.88 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] time/epoch_time (s) 0.124921 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] time/evaluation sampling (s) 0.00424433 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] time/exploration sampling (s) 0.00542784 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] time/global_time (s) 0.560171 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] time/replay buffer data storing (s) 0.000172138 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] time/saving (s) 0.00541902 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] time/training (s) 0.103057 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] epoch 0 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] ----------------------------------- ------------- +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] ----------------------------------- ------------- +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] trainer/QF Loss 5.06572 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] trainer/Y Predictions Mean -0.0289561 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] trainer/Y Predictions Std 0.0104967 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] trainer/Y Predictions Max -0.0184594 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] trainer/Y Predictions Min -0.0394528 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/num steps total 30 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/num paths total 6 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/path length Mean 5 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/path length Std 0 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/path length Max 5 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/path length Min 5 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Rewards Mean -2.292 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Rewards Std 0.227719 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Rewards Max -1.72 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Rewards Min -2.48 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Returns Mean -11.46 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Returns Std 0.86 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Returns Max -10.6 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Returns Min -12.32 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Actions Mean 2.1 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Actions Std 0.3 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Actions Max 3 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Actions Min 2 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Num Paths 2 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] expl/Average Returns -11.46 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/num steps total 20 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/num paths total 4 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] 
eval/path length Mean 5 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/path length Std 0 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/path length Max 5 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/path length Min 5 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Rewards Mean -2.28 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Rewards Std 0 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Rewards Max -2.28 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Rewards Min -2.28 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Returns Mean -11.4 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Returns Std 0 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Returns Max -11.4 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Returns Min -11.4 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Actions Mean 2 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Actions Std 0 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Actions Max 2 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Actions Min 2 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Num Paths 2 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] eval/Average Returns -11.4 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] time/epoch_time (s) 0.0643942 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] time/evaluation sampling (s) 0.00411153 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] time/exploration sampling (s) 0.00386906 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] time/global_time (s) 0.626971 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] time/replay buffer data storing (s) 0.000205755 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] time/saving (s) 0.00550675 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] time/training (s) 0.0496237 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] epoch 1 +[her-dqn-gridworld-experiment_2021_08_20_08_55_45_0000--s-0] ----------------------------------- ------------- diff --git a/tests/regression/her/data_her_dqn/progress.csv b/tests/regression/her/data_her_dqn/progress.csv new file mode 100644 index 000000000..cfa6ee2ed --- /dev/null +++ b/tests/regression/her/data_her_dqn/progress.csv @@ -0,0 +1,3 @@ +epoch,eval/Actions Max,eval/Actions Mean,eval/Actions Min,eval/Actions Std,eval/Average Returns,eval/Num Paths,eval/Returns Max,eval/Returns Mean,eval/Returns Min,eval/Returns Std,eval/Rewards Max,eval/Rewards Mean,eval/Rewards Min,eval/Rewards Std,eval/num paths total,eval/num steps total,eval/path length Max,eval/path length Mean,eval/path length Min,eval/path length Std,expl/Actions Max,expl/Actions Mean,expl/Actions Min,expl/Actions Std,expl/Average Returns,expl/Num Paths,expl/Returns Max,expl/Returns Mean,expl/Returns Min,expl/Returns Std,expl/Rewards Max,expl/Rewards Mean,expl/Rewards Min,expl/Rewards Std,expl/num paths total,expl/num steps total,expl/path length Max,expl/path length Mean,expl/path length Min,expl/path length Std,time/epoch_time (s),time/evaluation sampling (s),time/exploration sampling (s),time/global_time (s),time/replay buffer data storing (s),time/saving (s),time/training (s),trainer/QF Loss,trainer/Y Predictions Max,trainer/Y Predictions Mean,trainer/Y Predictions 
Min,trainer/Y Predictions Std +0,1,1.0,1,0.0,-10.880000000000003,2,-10.160000000000002,-10.880000000000003,-11.600000000000001,0.7199999999999999,-1.6800000000000004,-2.1760000000000006,-2.4800000000000004,0.285769137591868,2,10,5,5.0,5,0.0,3,2.4,1,0.9165151389911679,-12.200000000000003,2,-12.000000000000002,-12.200000000000003,-12.400000000000002,0.20000000000000018,-2.4000000000000004,-2.44,-2.4800000000000004,0.040000000000000036,4,20,5,5.0,5,0.0,0.12492108345031738,0.004244327545166016,0.005427837371826172,0.5601711273193359,0.00017213821411132812,0.005419015884399414,0.1030571460723877,5.380331,0.00024535076,0.00024535076,0.00024535076,0.0 +1,2,2.0,2,0.0,-11.400000000000002,2,-11.400000000000002,-11.400000000000002,-11.400000000000002,0.0,-2.2800000000000002,-2.2800000000000002,-2.2800000000000002,0.0,4,20,5,5.0,5,0.0,3,2.1,2,0.3,-11.46,2,-10.600000000000001,-11.46,-12.320000000000002,0.8600000000000003,-1.7200000000000002,-2.2920000000000003,-2.4800000000000004,0.22771912523984464,6,30,5,5.0,5,0.0,0.06439423561096191,0.004111528396606445,0.0038690567016601562,0.6269712448120117,0.00020575523376464844,0.005506753921508789,0.049623727798461914,5.065722,-0.018459443,-0.02895614,-0.039452836,0.010496696 diff --git a/tests/regression/her/data_her_dqn/variant.json b/tests/regression/her/data_her_dqn/variant.json new file mode 100644 index 000000000..082eae1da --- /dev/null +++ b/tests/regression/her/data_her_dqn/variant.json @@ -0,0 +1,20 @@ +{ + "algo_kwargs": { + "batch_size": 2, + "max_path_length": 5, + "min_num_steps_before_training": 10, + "num_epochs": 2, + "num_eval_steps_per_epoch": 10, + "num_expl_steps_per_train_loop": 10, + "num_trains_per_train_loop": 10 + }, + "replay_buffer_kwargs": { + "fraction_goals_env_goals": 0.0, + "fraction_goals_rollout_goals": 0.2, + "max_size": 100000 + }, + "seed": 25580, + "trainer_kwargs": { + "discount": 0.99 + } +} \ No newline at end of file diff --git a/tests/regression/her/data_her_sac/debug.log b/tests/regression/her/data_her_sac/debug.log new file mode 100644 index 000000000..ffcd9f527 --- /dev/null +++ b/tests/regression/her/data_her_sac/debug.log @@ -0,0 +1,168 @@ +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] ----------------------------------- ------------ +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/num train calls 0 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/QF1 Loss 7.49643 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/QF2 Loss 7.50385 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Policy Loss -2.47502 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Q1 Predictions Mean 0.000904009 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Q1 Predictions Std 3.76896e-05 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Q1 Predictions Max 0.000941699 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Q1 Predictions Min 0.00086632 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Q2 Predictions Mean -0.000476467 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Q2 Predictions Std 0.000228151 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Q2 Predictions Max -0.000248316 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Q2 Predictions Min -0.000704618 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Q Targets Mean 2.72973 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Q Targets Std 
0.22346 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Q Targets Max 2.95319 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Q Targets Min 2.50627 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Log Pis Mean -2.47566 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Log Pis Std 0.105101 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Log Pis Max -2.37056 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Log Pis Min -2.58076 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/policy/mean Mean 3.23676e-05 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/policy/mean Std 0.000592543 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/policy/mean Max 0.000728342 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/policy/mean Min -0.000566301 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/policy/normal/std Mean 1.00019 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/policy/normal/std Std 0.00112417 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/policy/normal/std Max 1.00163 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/policy/normal/std Min 0.99896 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/policy/normal/log_std Mean 0.000192268 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/policy/normal/log_std Std 0.00112389 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/policy/normal/log_std Max 0.00162707 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/policy/normal/log_std Min -0.00104076 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Alpha 1 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Alpha Loss -0 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/num steps total 20 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/num paths total 4 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/path length Mean 5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/path length Std 0 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/path length Max 5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/path length Min 5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Rewards Mean -1 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Rewards Std 0 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Rewards Max -1 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Rewards Min -1 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Returns Mean -5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Returns Std 0 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Returns Max -5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Returns Min -5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Actions Mean -0.00821653 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Actions Std 0.569609 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Actions Max 0.977579 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Actions Min -0.998943 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Num Paths 2 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Average Returns -5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/num steps 
total 10 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/num paths total 2 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/path length Mean 5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/path length Std 0 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/path length Max 5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/path length Min 5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Rewards Mean -1 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Rewards Std 0 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Rewards Max -1 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Rewards Min -1 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Returns Mean -5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Returns Std 0 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Returns Max -5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Returns Min -5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Actions Mean 3.2829e-05 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Actions Std 0.000601366 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Actions Max 0.000757367 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Actions Min -0.000588868 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Num Paths 2 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Average Returns -5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] time/epoch_time (s) 0.212232 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] time/evaluation sampling (s) 0.0179353 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] time/exploration sampling (s) 0.0185997 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] time/global_time (s) 1.12265 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] time/replay buffer data storing (s) 0.000101566 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] time/saving (s) 0.0099926 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] time/training (s) 0.119463 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] epoch 0 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] ----------------------------------- ------------ +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] ----------------------------------- ------------ +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/num train calls 0 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/QF1 Loss 7.31758 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/QF2 Loss 7.3401 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Policy Loss -2.46695 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Q1 Predictions Mean 0.00733391 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Q1 Predictions Std 5.56116e-05 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Q1 Predictions Max 0.00738953 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Q1 Predictions Min 0.0072783 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Q2 Predictions Mean 0.00317477 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Q2 Predictions Std 0.000185155 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Q2 Predictions Max 0.00335993 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Q2 
Predictions Min 0.00298962 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Q Targets Mean 2.71244 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Q Targets Std 0.00232387 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Q Targets Max 2.71476 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Q Targets Min 2.71011 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Log Pis Mean -2.47074 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Log Pis Std 0.29424 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Log Pis Max -2.1765 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Log Pis Min -2.76498 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/policy/mean Mean 7.67683e-05 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/policy/mean Std 0.00122954 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/policy/mean Max 0.00147136 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/policy/mean Min -0.00189778 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/policy/normal/std Mean 1.00001 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/policy/normal/std Std 0.00151354 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/policy/normal/std Max 1.0015 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/policy/normal/std Min 0.997472 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/policy/normal/log_std Mean 1.07968e-05 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/policy/normal/log_std Std 0.00151457 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/policy/normal/log_std Max 0.00149543 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/policy/normal/log_std Min -0.00253097 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Alpha 0.997008 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] trainer/Alpha Loss -0.0193924 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/num steps total 30 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/num paths total 6 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/path length Mean 5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/path length Std 0 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/path length Max 5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/path length Min 5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Rewards Mean -1 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Rewards Std 0 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Rewards Max -1 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Rewards Min -1 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Returns Mean -5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Returns Std 0 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Returns Max -5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Returns Min -5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Actions Mean 0.0746636 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Actions Std 0.629169 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Actions Max 0.948969 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Actions Min -0.896441 
+[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Num Paths 2 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] expl/Average Returns -5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/num steps total 20 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/num paths total 4 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/path length Mean 5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/path length Std 0 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/path length Max 5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/path length Min 5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Rewards Mean -1 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Rewards Std 0 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Rewards Max -1 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Rewards Min -1 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Returns Mean -5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Returns Std 0 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Returns Max -5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Returns Min -5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Actions Mean 7.57047e-05 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Actions Std 0.00121639 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Actions Max 0.0014672 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Actions Min -0.0018842 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Num Paths 2 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] eval/Average Returns -5 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] time/epoch_time (s) 0.11834 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] time/evaluation sampling (s) 0.0179975 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] time/exploration sampling (s) 0.0193708 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] time/global_time (s) 1.24424 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] time/replay buffer data storing (s) 0.00014782 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] time/saving (s) 0.00665903 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] time/training (s) 0.0728889 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] epoch 1 +[her-sac-fetch-experiment_2021_08_20_09_00_40_0000--s-0] ----------------------------------- ------------ diff --git a/tests/regression/her/data_her_sac/progress.csv b/tests/regression/her/data_her_sac/progress.csv new file mode 100644 index 000000000..7d7c8fa82 --- /dev/null +++ b/tests/regression/her/data_her_sac/progress.csv @@ -0,0 +1,3 @@ +epoch,eval/Actions Max,eval/Actions Mean,eval/Actions Min,eval/Actions Std,eval/Average Returns,eval/Num Paths,eval/Returns Max,eval/Returns Mean,eval/Returns Min,eval/Returns Std,eval/Rewards Max,eval/Rewards Mean,eval/Rewards Min,eval/Rewards Std,eval/num paths total,eval/num steps total,eval/path length Max,eval/path length Mean,eval/path length Min,eval/path length Std,expl/Actions Max,expl/Actions Mean,expl/Actions Min,expl/Actions Std,expl/Average Returns,expl/Num Paths,expl/Returns Max,expl/Returns Mean,expl/Returns Min,expl/Returns Std,expl/Rewards Max,expl/Rewards Mean,expl/Rewards Min,expl/Rewards Std,expl/num paths total,expl/num steps total,expl/path length Max,expl/path length Mean,expl/path length 
Min,expl/path length Std,time/epoch_time (s),time/evaluation sampling (s),time/exploration sampling (s),time/global_time (s),time/replay buffer data storing (s),time/saving (s),time/training (s),trainer/Alpha,trainer/Alpha Loss,trainer/Log Pis Max,trainer/Log Pis Mean,trainer/Log Pis Min,trainer/Log Pis Std,trainer/Policy Loss,trainer/Q Targets Max,trainer/Q Targets Mean,trainer/Q Targets Min,trainer/Q Targets Std,trainer/Q1 Predictions Max,trainer/Q1 Predictions Mean,trainer/Q1 Predictions Min,trainer/Q1 Predictions Std,trainer/Q2 Predictions Max,trainer/Q2 Predictions Mean,trainer/Q2 Predictions Min,trainer/Q2 Predictions Std,trainer/QF1 Loss,trainer/QF2 Loss,trainer/num train calls,trainer/policy/mean Max,trainer/policy/mean Mean,trainer/policy/mean Min,trainer/policy/mean Std,trainer/policy/normal/log_std Max,trainer/policy/normal/log_std Mean,trainer/policy/normal/log_std Min,trainer/policy/normal/log_std Std,trainer/policy/normal/std Max,trainer/policy/normal/std Mean,trainer/policy/normal/std Min,trainer/policy/normal/std Std +0,0.0007573665,3.2828986e-05,-0.00058886805,0.00060136605,-5.0,2,-5.0,-5.0,-5.0,0.0,-1.0,-1.0,-1.0,0.0,2,10,5,5.0,5,0.0,0.97757876,-0.00821653,-0.9989433,0.5696091,-5.0,2,-5.0,-5.0,-5.0,0.0,-1.0,-1.0,-1.0,0.0,4,20,5,5.0,5,0.0,0.21223235130310059,0.01793527603149414,0.018599748611450195,1.1226472854614258,0.00010156631469726562,0.009992599487304688,0.11946344375610352,1.0,-0.0,-2.370561,-2.475662,-2.580763,0.10510111,-2.4750183,2.9531884,2.7297287,2.5062687,0.22345984,0.000941699,0.00090400944,0.00086631987,3.7689577e-05,-0.000248316,-0.00047646684,-0.00070461765,0.00022815082,7.496434,7.503852,0,0.00072834216,3.236763e-05,-0.00056630105,0.0005925432,0.0016270744,0.00019226767,-0.0010407617,0.001123888,1.0016284,1.0001929,0.9989598,0.0011241715 +1,0.0014672047,7.5704666e-05,-0.0018841992,0.0012163888,-5.0,2,-5.0,-5.0,-5.0,0.0,-1.0,-1.0,-1.0,0.0,4,20,5,5.0,5,0.0,0.94896895,0.074663624,-0.8964408,0.629169,-5.0,2,-5.0,-5.0,-5.0,0.0,-1.0,-1.0,-1.0,0.0,6,30,5,5.0,5,0.0,0.11833953857421875,0.01799750328063965,0.01937079429626465,1.2442359924316406,0.00014781951904296875,0.006659030914306641,0.07288885116577148,0.9970075488090515,-0.019392408430576324,-2.1764956,-2.4707358,-2.764976,0.29424024,-2.4669516,2.7147596,2.7124357,2.7101119,0.002323866,0.0073895263,0.0073339147,0.007278303,5.56116e-05,0.0033599273,0.003174772,0.0029896165,0.00018515543,7.317581,7.3400993,0,0.0014713614,7.676828e-05,-0.001897785,0.001229544,0.0014954347,1.0796779e-05,-0.0025309736,0.0015145651,1.0014966,1.0000119,0.9974722,0.0015135449 diff --git a/tests/regression/her/data_her_sac/variant.json b/tests/regression/her/data_her_sac/variant.json new file mode 100644 index 000000000..aef765f31 --- /dev/null +++ b/tests/regression/her/data_her_sac/variant.json @@ -0,0 +1,40 @@ +{ + "algo_kwargs": { + "batch_size": 2, + "max_path_length": 5, + "min_num_steps_before_training": 10, + "num_epochs": 2, + "num_eval_steps_per_epoch": 10, + "num_expl_steps_per_train_loop": 10, + "num_trains_per_train_loop": 10 + }, + "algorithm": "HER-SAC", + "policy_kwargs": { + "hidden_sizes": [ + 2, + 2 + ] + }, + "qf_kwargs": { + "hidden_sizes": [ + 2, + 2 + ] + }, + "replay_buffer_kwargs": { + "fraction_goals_env_goals": 0, + "fraction_goals_rollout_goals": 0.2, + "max_size": 1000000 + }, + "sac_trainer_kwargs": { + "discount": 0.99, + "policy_lr": 0.0003, + "qf_lr": 0.0003, + "reward_scale": 1, + "soft_target_tau": 0.005, + "target_update_period": 1, + "use_automatic_entropy_tuning": true + }, + "seed": 25580, + 
"version": "normal" +} \ No newline at end of file diff --git a/tests/regression/her/data_her_td3/debug.log b/tests/regression/her/data_her_td3/debug.log new file mode 100644 index 000000000..2182869e2 --- /dev/null +++ b/tests/regression/her/data_her_td3/debug.log @@ -0,0 +1,282 @@ +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] ----------------------------------- ------------ +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/QF1 Loss 0.000127872 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/QF2 Loss 0.000131456 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Policy Loss 0.00134107 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Q1 Predictions Mean -0.000226959 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Q1 Predictions Std 6.23965e-05 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Q1 Predictions Max -0.000164563 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Q1 Predictions Min -0.000289356 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Q2 Predictions Mean 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Q2 Predictions Std 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Q2 Predictions Max 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Q2 Predictions Min 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Q Targets Mean -0.0096989 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Q Targets Std 0.00611456 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Q Targets Max -0.00358434 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Q Targets Min -0.0158135 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Bellman Errors 1 Mean 0.000127872 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Bellman Errors 1 Std 0.000117016 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Bellman Errors 1 Max 0.000244888 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Bellman Errors 1 Min 1.08569e-05 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Bellman Errors 2 Mean 0.000131456 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Bellman Errors 2 Std 0.000118609 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Bellman Errors 2 Max 0.000250065 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Bellman Errors 2 Min 1.28475e-05 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Policy Action Mean -2.30222e-05 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Policy Action Std 0.000172028 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Policy Action Max 0.000222309 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Policy Action Min -0.000179257 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/num steps total 20 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/num paths total 4 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/path length Mean 5 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/path length Std 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/path length Max 5 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/path length Min 5 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Rewards Mean -0.173503 
+[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Rewards Std 0.0669977 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Rewards Max -0.103657 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Rewards Min -0.245565 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Returns Mean -0.867514 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Returns Std 0.334262 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Returns Max -0.533252 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Returns Min -1.20178 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Actions Mean 0.0573666 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Actions Std 0.23091 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Actions Max 0.640042 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Actions Min -0.427158 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Num Paths 2 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Average Returns -0.867514 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_distance Mean 0.173503 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_distance Std 0.0669977 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_distance Max 0.245565 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_distance Min 0.103657 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_distance Mean 0.168724 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_distance Std 0.0650669 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_distance Max 0.233791 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_distance Min 0.103657 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_distance_l1 Mean 0.291814 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_distance_l1 Std 0.115831 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_distance_l1 Max 0.416994 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_distance_l1 Min 0.170934 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_distance_l1 Mean 0.283126 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_distance_l1 Std 0.111779 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_distance_l1 Max 0.394905 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_distance_l1 Min 0.171347 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_distance_l2 Mean 0.173503 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_distance_l2 Std 0.0669977 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_distance_l2 Max 0.245565 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_distance_l2 Min 0.103657 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_distance_l2 Mean 0.168724 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_distance_l2 Std 0.0650669 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_distance_l2 Max 0.233791 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_distance_l2 Min 0.103657 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_success Mean 0 
+[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_success Std 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_success Max 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_success Min 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_success Mean 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_success Std 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_success Max 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_success Min 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/num steps total 10 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/num paths total 2 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/path length Mean 5 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/path length Std 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/path length Max 5 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/path length Min 5 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Rewards Mean -0.219461 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Rewards Std 0.00853623 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Rewards Max -0.21053 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Rewards Min -0.229518 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Returns Mean -1.09731 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Returns Std 0.0425397 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Returns Max -1.05477 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Returns Min -1.13985 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Actions Mean -2.15041e-05 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Actions Std 0.000293242 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Actions Max 0.000405435 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Actions Min -0.000286149 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Num Paths 2 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Average Returns -1.09731 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_distance Mean 0.219461 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_distance Std 0.00853623 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_distance Max 0.229518 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_distance Min 0.21053 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_distance Mean 0.218957 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_distance Std 0.00842709 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_distance Max 0.227384 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_distance Min 0.21053 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_distance_l1 Mean 0.36472 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_distance_l1 Std 0.0266812 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_distance_l1 Max 0.393655 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_distance_l1 Min 0.337015 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_distance_l1 Mean 0.363758 
+[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_distance_l1 Std 0.0267429 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_distance_l1 Max 0.3905 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_distance_l1 Min 0.337015 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_distance_l2 Mean 0.219461 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_distance_l2 Std 0.00853623 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_distance_l2 Max 0.229518 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_distance_l2 Min 0.21053 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_distance_l2 Mean 0.218957 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_distance_l2 Std 0.00842709 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_distance_l2 Max 0.227384 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_distance_l2 Min 0.21053 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_success Mean 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_success Std 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_success Max 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_success Min 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_success Mean 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_success Std 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_success Max 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_success Min 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] time/epoch_time (s) 0.309838 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] time/evaluation sampling (s) 0.0594292 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] time/exploration sampling (s) 0.0602119 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] time/global_time (s) 1.20551 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] time/replay buffer data storing (s) 0.00013113 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] time/saving (s) 0.00461841 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] time/training (s) 0.0952432 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] epoch 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] ----------------------------------- ------------ +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] ----------------------------------- ------------ +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/QF1 Loss 5.29412e-05 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/QF2 Loss 4.674e-05 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Policy Loss 0.00756379 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Q1 Predictions Mean -0.00748128 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Q1 Predictions Std 0.000159119 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Q1 Predictions Max -0.00732216 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Q1 Predictions Min -0.0076404 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Q2 Predictions Mean -0.00704535 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Q2 Predictions Std 0 
+[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Q2 Predictions Max -0.00704535 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Q2 Predictions Min -0.00704535 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Q Targets Mean -0.000209156 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Q Targets Std 8.05506e-05 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Q Targets Max -0.000128606 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Q Targets Min -0.000289707 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Bellman Errors 1 Mean 5.29412e-05 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Bellman Errors 1 Std 3.48581e-06 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Bellman Errors 1 Max 5.6427e-05 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Bellman Errors 1 Min 4.94554e-05 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Bellman Errors 2 Mean 4.674e-05 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Bellman Errors 2 Std 1.10132e-06 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Bellman Errors 2 Max 4.78414e-05 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Bellman Errors 2 Min 4.56387e-05 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Policy Action Mean 0.0019138 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Policy Action Std 0.00542599 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Policy Action Max 0.00597471 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] trainer/Policy Action Min -0.0058046 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/num steps total 30 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/num paths total 6 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/path length Mean 5 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/path length Std 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/path length Max 5 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/path length Min 5 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Rewards Mean -0.186881 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Rewards Std 0.058378 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Rewards Max -0.122179 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Rewards Min -0.249904 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Returns Mean -0.934404 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Returns Std 0.290372 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Returns Max -0.644033 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Returns Min -1.22478 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Actions Mean 0.0698069 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Actions Std 0.348039 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Actions Max 0.71338 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Actions Min -0.935515 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Num Paths 2 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Average Returns -0.934404 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_distance Mean 0.186881 
+[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_distance Std 0.058378 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_distance Max 0.249904 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_distance Min 0.122179 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_distance Mean 0.179782 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_distance Std 0.0557268 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_distance Max 0.235509 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_distance Min 0.124056 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_distance_l1 Mean 0.272028 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_distance_l1 Std 0.0819001 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_distance_l1 Max 0.365866 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_distance_l1 Min 0.181539 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_distance_l1 Mean 0.260928 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_distance_l1 Std 0.0771957 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_distance_l1 Max 0.338124 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_distance_l1 Min 0.183732 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_distance_l2 Mean 0.186881 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_distance_l2 Std 0.058378 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_distance_l2 Max 0.249904 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_distance_l2 Min 0.122179 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_distance_l2 Mean 0.179782 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_distance_l2 Std 0.0557268 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_distance_l2 Max 0.235509 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_distance_l2 Min 0.124056 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_success Mean 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_success Std 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_success Max 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/hand_success Min 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_success Mean 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_success Std 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_success Max 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] expl/Final hand_success Min 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/num steps total 20 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/num paths total 4 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/path length Mean 5 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/path length Std 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/path length Max 5 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/path length Min 5 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Rewards Mean -0.184288 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] 
eval/Rewards Std 0.0253106 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Rewards Max -0.157971 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Rewards Min -0.211177 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Returns Mean -0.921442 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Returns Std 0.12645 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Returns Max -0.794991 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Returns Min -1.04789 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Actions Mean 0.00198059 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Actions Std 0.00566782 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Actions Max 0.00631524 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Actions Min -0.00604316 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Num Paths 2 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Average Returns -0.921442 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_distance Mean 0.184288 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_distance Std 0.0253106 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_distance Max 0.211177 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_distance Min 0.157971 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_distance Mean 0.18344 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_distance Std 0.0254691 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_distance Max 0.208909 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_distance Min 0.157971 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_distance_l1 Mean 0.296706 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_distance_l1 Std 0.0404009 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_distance_l1 Max 0.33993 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_distance_l1 Min 0.255005 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_distance_l1 Mean 0.295487 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_distance_l1 Std 0.0404823 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_distance_l1 Max 0.33597 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_distance_l1 Min 0.255005 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_distance_l2 Mean 0.184288 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_distance_l2 Std 0.0253106 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_distance_l2 Max 0.211177 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_distance_l2 Min 0.157971 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_distance_l2 Mean 0.18344 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_distance_l2 Std 0.0254691 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_distance_l2 Max 0.208909 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_distance_l2 Min 0.157971 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_success Mean 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_success Std 0 
+[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_success Max 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/hand_success Min 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_success Mean 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_success Std 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_success Max 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] eval/Final hand_success Min 0 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] time/epoch_time (s) 0.174882 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] time/evaluation sampling (s) 0.0611782 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] time/exploration sampling (s) 0.0609596 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] time/global_time (s) 1.38531 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] time/replay buffer data storing (s) 0.000152826 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] time/saving (s) 0.00482249 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] time/training (s) 0.0451498 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] epoch 1 +[her-td3-sawyer-experiment_2021_08_20_09_02_57_0000--s-0] ----------------------------------- ------------ diff --git a/tests/regression/her/data_her_td3/params.pt b/tests/regression/her/data_her_td3/params.pt new file mode 100644 index 000000000..49efc542e Binary files /dev/null and b/tests/regression/her/data_her_td3/params.pt differ diff --git a/tests/regression/her/data_her_td3/progress.csv b/tests/regression/her/data_her_td3/progress.csv new file mode 100644 index 000000000..ba5cd3c16 --- /dev/null +++ b/tests/regression/her/data_her_td3/progress.csv @@ -0,0 +1,3 @@ +epoch,eval/Actions Max,eval/Actions Mean,eval/Actions Min,eval/Actions Std,eval/Average Returns,eval/Final hand_distance Max,eval/Final hand_distance Mean,eval/Final hand_distance Min,eval/Final hand_distance Std,eval/Final hand_distance_l1 Max,eval/Final hand_distance_l1 Mean,eval/Final hand_distance_l1 Min,eval/Final hand_distance_l1 Std,eval/Final hand_distance_l2 Max,eval/Final hand_distance_l2 Mean,eval/Final hand_distance_l2 Min,eval/Final hand_distance_l2 Std,eval/Final hand_success Max,eval/Final hand_success Mean,eval/Final hand_success Min,eval/Final hand_success Std,eval/Num Paths,eval/Returns Max,eval/Returns Mean,eval/Returns Min,eval/Returns Std,eval/Rewards Max,eval/Rewards Mean,eval/Rewards Min,eval/Rewards Std,eval/hand_distance Max,eval/hand_distance Mean,eval/hand_distance Min,eval/hand_distance Std,eval/hand_distance_l1 Max,eval/hand_distance_l1 Mean,eval/hand_distance_l1 Min,eval/hand_distance_l1 Std,eval/hand_distance_l2 Max,eval/hand_distance_l2 Mean,eval/hand_distance_l2 Min,eval/hand_distance_l2 Std,eval/hand_success Max,eval/hand_success Mean,eval/hand_success Min,eval/hand_success Std,eval/num paths total,eval/num steps total,eval/path length Max,eval/path length Mean,eval/path length Min,eval/path length Std,expl/Actions Max,expl/Actions Mean,expl/Actions Min,expl/Actions Std,expl/Average Returns,expl/Final hand_distance Max,expl/Final hand_distance Mean,expl/Final hand_distance Min,expl/Final hand_distance Std,expl/Final hand_distance_l1 Max,expl/Final hand_distance_l1 Mean,expl/Final hand_distance_l1 Min,expl/Final hand_distance_l1 Std,expl/Final hand_distance_l2 Max,expl/Final hand_distance_l2 Mean,expl/Final hand_distance_l2 Min,expl/Final hand_distance_l2 Std,expl/Final 
hand_success Max,expl/Final hand_success Mean,expl/Final hand_success Min,expl/Final hand_success Std,expl/Num Paths,expl/Returns Max,expl/Returns Mean,expl/Returns Min,expl/Returns Std,expl/Rewards Max,expl/Rewards Mean,expl/Rewards Min,expl/Rewards Std,expl/hand_distance Max,expl/hand_distance Mean,expl/hand_distance Min,expl/hand_distance Std,expl/hand_distance_l1 Max,expl/hand_distance_l1 Mean,expl/hand_distance_l1 Min,expl/hand_distance_l1 Std,expl/hand_distance_l2 Max,expl/hand_distance_l2 Mean,expl/hand_distance_l2 Min,expl/hand_distance_l2 Std,expl/hand_success Max,expl/hand_success Mean,expl/hand_success Min,expl/hand_success Std,expl/num paths total,expl/num steps total,expl/path length Max,expl/path length Mean,expl/path length Min,expl/path length Std,time/epoch_time (s),time/evaluation sampling (s),time/exploration sampling (s),time/global_time (s),time/replay buffer data storing (s),time/saving (s),time/training (s),trainer/Bellman Errors 1 Max,trainer/Bellman Errors 1 Mean,trainer/Bellman Errors 1 Min,trainer/Bellman Errors 1 Std,trainer/Bellman Errors 2 Max,trainer/Bellman Errors 2 Mean,trainer/Bellman Errors 2 Min,trainer/Bellman Errors 2 Std,trainer/Policy Action Max,trainer/Policy Action Mean,trainer/Policy Action Min,trainer/Policy Action Std,trainer/Policy Loss,trainer/Q Targets Max,trainer/Q Targets Mean,trainer/Q Targets Min,trainer/Q Targets Std,trainer/Q1 Predictions Max,trainer/Q1 Predictions Mean,trainer/Q1 Predictions Min,trainer/Q1 Predictions Std,trainer/Q2 Predictions Max,trainer/Q2 Predictions Mean,trainer/Q2 Predictions Min,trainer/Q2 Predictions Std,trainer/QF1 Loss,trainer/QF2 Loss +0,0.0004054351,-2.150413e-05,-0.00028614886,0.00029324184,-1.0973054839340641,0.2273842964297562,0.21895720859466888,0.21053012075958158,0.008427087835087305,0.3905004857805467,0.3637576349695517,0.3370147841585568,0.02674285081099495,0.2273842964297562,0.21895720859466888,0.21053012075958158,0.008427087835087305,0.0,0.0,0.0,0.0,2,-1.054765826005259,-1.0973054839340641,-1.1398451418628692,0.0425396579288051,-0.21053012075958158,-0.21946109678681286,-0.2295176237287723,0.008536225362702066,0.2295176237287723,0.21946109678681286,0.21053012075958158,0.008536225362702066,0.3936553835985243,0.36471989495011503,0.3370147841585568,0.02668118631055801,0.2295176237287723,0.21946109678681286,0.21053012075958158,0.008536225362702066,0.0,0.0,0.0,0.0,2,10,5,5.0,5,0.0,0.640041708946228,0.05736659229172929,-0.427158385515213,0.23090975414228854,-0.8675138392619195,0.23379131972009715,0.16872440275848405,0.10365748579687094,0.06506691696161311,0.3949048903340666,0.28312575346613794,0.1713466165982093,0.11177913686792866,0.23379131972009715,0.16872440275848405,0.10365748579687094,0.06506691696161311,0.0,0.0,0.0,0.0,2,-0.5332515580248091,-0.8675138392619195,-1.2017761204990298,0.3342622812371104,-0.10365748579687094,-0.17350276785238392,-0.24556493533687146,0.06699769780405994,0.24556493533687146,0.17350276785238392,0.10365748579687094,0.06699769780405994,0.4169941665534834,0.2918139872825595,0.17093379893934676,0.11583114419136979,0.24556493533687146,0.17350276785238392,0.10365748579687094,0.06699769780405994,0.0,0.0,0.0,0.0,4,20,5,5.0,5,0.0,0.30983805656433105,0.059429168701171875,0.060211896896362305,1.2055063247680664,0.00013113021850585938,0.004618406295776367,0.09524321556091309,0.00024488795,0.00012787242,1.0856888e-05,0.00011701554,0.00025006547,0.00013145646,1.28474585e-05,0.000118609,0.00022230897,-2.3022198e-05,-0.00017925714,0.00017202772,0.0013410698,-0.003584335,-0.009698897,-0.015
813459,0.0061145616,-0.00016456282,-0.0002269593,-0.00028935575,6.2396466e-05,0.0,0.0,0.0,0.0,0.00012787242,0.00013145646 +1,0.0063152444,0.0019805946,-0.006043161,0.005667816,-0.9214418104533926,0.20890941151444659,0.1834403327295425,0.1579712539446384,0.025469078784904092,0.33596974411605496,0.29548744362472523,0.2550051431333955,0.040482300491329726,0.20890941151444659,0.1834403327295425,0.1579712539446384,0.025469078784904092,0.0,0.0,0.0,0.0,2,-0.7949914066575262,-0.9214418104533926,-1.0478922142492588,0.1264504037958663,-0.1579712539446384,-0.1842883620906785,-0.21117722975199968,0.0253105635650462,0.21117722975199968,0.1842883620906785,0.1579712539446384,0.0253105635650462,0.3399301362965096,0.2967061316018946,0.2550051431333955,0.040400868913801555,0.21117722975199968,0.1842883620906785,0.1579712539446384,0.0253105635650462,0.0,0.0,0.0,0.0,4,20,5,5.0,5,0.0,0.7133803367614746,0.06980689336646934,-0.9355153441429138,0.348038631648611,-0.9344042528257721,0.2355092886963321,0.1797824560300162,0.12405562336370031,0.05572683266631589,0.33812377219588474,0.2609280761137638,0.18373238003164283,0.07719569608212096,0.2355092886963321,0.1797824560300162,0.12405562336370031,0.05572683266631589,0.0,0.0,0.0,0.0,2,-0.6440325896225111,-0.9344042528257721,-1.224775916029033,0.29037166320326097,-0.12217868466484308,-0.1868808505651544,-0.249904445224928,0.058377978710212206,0.249904445224928,0.1868808505651544,0.12217868466484308,0.058377978710212206,0.3658664324506281,0.2720279127214639,0.1815388943413173,0.08190012480689134,0.249904445224928,0.1868808505651544,0.12217868466484308,0.058377978710212206,0.0,0.0,0.0,0.0,6,30,5,5.0,5,0.0,0.1748816967010498,0.06117820739746094,0.060959577560424805,1.385305404663086,0.00015282630920410156,0.004822492599487305,0.045149803161621094,5.6426987e-05,5.294117e-05,4.9455357e-05,3.485815e-06,4.784135e-05,4.6740028e-05,4.5638706e-05,1.1013217e-06,0.0059747086,0.0019137971,-0.005804599,0.0054259934,0.0075637856,-0.00012860558,-0.00020915618,-0.00028970678,8.05506e-05,-0.0073221573,-0.007481276,-0.0076403953,0.00015911902,-0.0070453496,-0.0070453496,-0.0070453496,0.0,5.294117e-05,4.6740028e-05 diff --git a/tests/regression/her/data_her_td3/variant.json b/tests/regression/her/data_her_td3/variant.json new file mode 100644 index 000000000..57fb342f7 --- /dev/null +++ b/tests/regression/her/data_her_td3/variant.json @@ -0,0 +1,32 @@ +{ + "algo_kwargs": { + "batch_size": 2, + "max_path_length": 5, + "min_num_steps_before_training": 10, + "num_epochs": 2, + "num_eval_steps_per_epoch": 10, + "num_expl_steps_per_train_loop": 10, + "num_trains_per_train_loop": 10 + }, + "policy_kwargs": { + "hidden_sizes": [ + 2, + 2 + ] + }, + "qf_kwargs": { + "hidden_sizes": [ + 2, + 2 + ] + }, + "replay_buffer_kwargs": { + "fraction_goals_env_goals": 0.0, + "fraction_goals_rollout_goals": 0.2, + "max_size": 100000 + }, + "seed": 25580, + "trainer_kwargs": { + "discount": 0.99 + } +} \ No newline at end of file diff --git a/tests/regression/her/test_her_dqn.py b/tests/regression/her/test_her_dqn.py new file mode 100644 index 000000000..278c59176 --- /dev/null +++ b/tests/regression/her/test_her_dqn.py @@ -0,0 +1,36 @@ +import os +import sys + +from examples.her import her_dqn_gridworld as her + +from rlkit.core import logger +from rlkit.testing import csv_util + +def test_her_online(): + logger.reset() + + # make tests small by mutating variant + her.variant["algo_kwargs"] = dict( + num_epochs=2, + max_path_length=5, + num_eval_steps_per_epoch=10, + num_expl_steps_per_train_loop=10, + 
num_trains_per_train_loop=10,
+        min_num_steps_before_training=10,
+        batch_size=2,
+    )
+    her.variant["seed"] = 25580
+
+    her.main()
+
+    # this env seems to seed inconsistently, so this test does not check algorithm-dependent values
+    reference_csv = "tests/regression/her/data_her_dqn/progress.csv"
+    output_csv = os.path.join(logger.get_snapshot_dir(), "progress.csv")
+    print("comparing reference %s against output %s" % (reference_csv, output_csv))
+    output = csv_util.get_exp(output_csv)
+    reference = csv_util.get_exp(reference_csv)
+    keys = ["epoch", "expl/num steps total", "eval/Average Returns",]  # "trainer/Y Predictions Mean",
+    csv_util.check_equal(reference, output, keys)
+
+if __name__ == "__main__":
+    test_her_online()
diff --git a/tests/regression/her/test_her_sac.py b/tests/regression/her/test_her_sac.py
new file mode 100644
index 000000000..b5fd27965
--- /dev/null
+++ b/tests/regression/her/test_her_sac.py
@@ -0,0 +1,41 @@
+import os
+import sys
+
+from examples.her import her_sac_gym_fetch_reach as her
+
+from rlkit.core import logger
+from rlkit.testing import csv_util
+
+def test_her_online():
+    logger.reset()
+
+    # make tests small by mutating variant
+    her.variant["algo_kwargs"] = dict(
+        num_epochs=2,
+        max_path_length=5,
+        num_eval_steps_per_epoch=10,
+        num_expl_steps_per_train_loop=10,
+        num_trains_per_train_loop=10,
+        min_num_steps_before_training=10,
+        batch_size=2,
+    )
+    her.variant["qf_kwargs"] = dict(
+        hidden_sizes=[2, 2],
+    )
+    her.variant["policy_kwargs"] = dict(
+        hidden_sizes=[2, 2],
+    )
+    her.variant["seed"] = 25580
+
+    her.main()
+
+    reference_csv = "tests/regression/her/data_her_sac/progress.csv"
+    output_csv = os.path.join(logger.get_snapshot_dir(), "progress.csv")
+    print("comparing reference %s against output %s" % (reference_csv, output_csv))
+    output = csv_util.get_exp(output_csv)
+    reference = csv_util.get_exp(reference_csv)
+    keys = ["epoch", "expl/num steps total", "eval/Average Returns", "trainer/Q1 Predictions Mean", ]
+    csv_util.check_equal(reference, output, keys)
+
+if __name__ == "__main__":
+    test_her_online()
diff --git a/tests/regression/her/test_her_td3.py b/tests/regression/her/test_her_td3.py
new file mode 100644
index 000000000..d83e8e9be
--- /dev/null
+++ b/tests/regression/her/test_her_td3.py
@@ -0,0 +1,42 @@
+import os
+import sys
+
+from examples.her import her_td3_multiworld_sawyer_reach as her
+
+from rlkit.core import logger
+from rlkit.testing import csv_util
+
+def test_her_online():
+    logger.reset()
+
+    # make tests small by mutating variant
+    her.variant["algo_kwargs"] = dict(
+        num_epochs=2,
+        max_path_length=5,
+        num_eval_steps_per_epoch=10,
+        num_expl_steps_per_train_loop=10,
+        num_trains_per_train_loop=10,
+        min_num_steps_before_training=10,
+        batch_size=2,
+    )
+    her.variant["qf_kwargs"] = dict(
+        hidden_sizes=[2, 2],
+    )
+    her.variant["policy_kwargs"] = dict(
+        hidden_sizes=[2, 2],
+    )
+    her.variant["seed"] = 25580
+
+    her.main()
+
+    # this env seems to seed inconsistently, so this test does not check algorithm-dependent values
+    reference_csv = "tests/regression/her/data_her_td3/progress.csv"
+    output_csv = os.path.join(logger.get_snapshot_dir(), "progress.csv")
+    print("comparing reference %s against output %s" % (reference_csv, output_csv))
+    output = csv_util.get_exp(output_csv)
+    reference = csv_util.get_exp(reference_csv)
+    keys = ["epoch", "expl/num steps total", ]
+    csv_util.check_equal(reference, output, keys)
+
+if __name__ == "__main__":
+    test_her_online()
diff --git a/tests/regression/random/test_mlp.py
b/tests/regression/random/test_mlp.py new file mode 100644 index 000000000..c6c360b53 --- /dev/null +++ b/tests/regression/random/test_mlp.py @@ -0,0 +1,16 @@ +from rlkit.torch.networks import Mlp +import rlkit.torch.pytorch_util as ptu +import numpy as np +import torch + +def test_mlp_run(): + torch.manual_seed(0) + f = Mlp(hidden_sizes=[100, 100, 100], output_size=20, input_size=10) + x = ptu.from_numpy(np.ones(10,)) + y = ptu.get_numpy(f(x)) + print(list(y)) + expected_result = np.array([-0.00051861757, -0.0002842698, 0.0009188504, 0.0002975871, 0.00061783846, -0.0004398331, -0.00053419394, -6.92709e-06, -0.00042290357, -6.476462e-05, 0.00031131462, -0.00036673213, -0.0004935546, -4.3982916e-05, -0.00041407614, -0.00028983108, 0.00072777556, -2.5328445e-05, -0.00015854833, -0.00013315874]) + assert np.isclose(expected_result, y).all() + +if __name__ == "__main__": + test_mlp_run() diff --git a/tests/regression/random/test_mujoco_env.py b/tests/regression/random/test_mujoco_env.py new file mode 100644 index 000000000..24b38b327 --- /dev/null +++ b/tests/regression/random/test_mujoco_env.py @@ -0,0 +1,34 @@ +from gym.envs.mujoco import HalfCheetahEnv +import numpy as np + +def test_mujoco_env_hc(): + env = HalfCheetahEnv() + env.seed(0) + + obs = env.reset() + print("reset observation") + print(list(obs)) + expected_result = np.array([0.09307818744846408, 0.026538189116820654, -0.04199653123045835, -0.07950314858277296, 0.03461526961155151, -0.02148465198872647, 0.03396921345769463, -0.08803220624881061, -0.05727510457667884, 0.0033623362111972334, -0.003173373363900766, 0.04092179272969336, 0.05138553542843286, 0.01490225253385475, -0.04298222544789362, 0.1840846875514338, -0.004267549831952004]) + assert np.isclose(obs, expected_result).all() + + obs, _, _, _ = env.step(np.zeros(6,)) + print("after 1 step") + print(list(obs)) + expected_result = np.array([0.07598225250646283, 0.019175277630375635, -0.026049271400655116, -0.02524626742505337, 0.0022975114812461717, -0.0022682205794546627, 0.012561418192984201, -0.03680577283317282, 0.011517471035147164, -0.5486275103205266, -0.18959104789073566, 0.5238752657158512, 1.2951048434195913, -0.7176542510873759, 0.4801683485682522, -0.5874709352139998, 1.4057385188694944]) + assert np.isclose(obs, expected_result).all() + + np.random.seed(0) + env.reset() + N = 1000 + M = 17 + observations = np.zeros((N, M)) + expected_observations = np.load("tests/regression/random/test_mujoco_env_obs.npy") + for i in range(1000): + obs, _, _, _ = env.step(np.random.random(6)) + assert np.isclose(obs, expected_observations[i, :]).all(), "observation %d diverged" % i + observations[i, :] = obs + print("results matched after %d steps" % N) + # np.save("tests/regression/random/test_mujoco_env_obs.npy", observations) + +if __name__ == "__main__": + test_mujoco_env_hc() diff --git a/tests/regression/random/test_mujoco_env_obs.npy b/tests/regression/random/test_mujoco_env_obs.npy new file mode 100644 index 000000000..16066fced Binary files /dev/null and b/tests/regression/random/test_mujoco_env_obs.npy differ diff --git a/tests/regression/sac/21-08-18-default_2021_08_18_14_36_11_id478649--s977983/debug.log b/tests/regression/sac/21-08-18-default_2021_08_18_14_36_11_id478649--s977983/debug.log new file mode 100644 index 000000000..154d819fe --- /dev/null +++ b/tests/regression/sac/21-08-18-default_2021_08_18_14_36_11_id478649--s977983/debug.log @@ -0,0 +1,190 @@ +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] ----------------------------------- 
-------------- +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] replay_buffer/size 2000 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/num train calls 1000 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/QF1 Loss 7.51889 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/QF2 Loss 7.54049 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Policy Loss -4.39813 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Q1 Predictions Mean 0.00458353 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Q1 Predictions Std 0.00458353 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Q1 Predictions Max 0.00916706 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Q1 Predictions Min 0 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Q2 Predictions Mean 0.000466318 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Q2 Predictions Std 0.000881484 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Q2 Predictions Max 0.0013478 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Q2 Predictions Min -0.000415165 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Q Targets Mean 2.74508 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Q Targets Std 0.0878264 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Q Targets Max 2.83291 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Q Targets Min 2.65726 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Log Pis Mean -4.39827 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Log Pis Std 0.229529 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Log Pis Max -4.16875 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Log Pis Min -4.6278 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/rewards Mean -0.680844 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/rewards Std 0.054372 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/rewards Max -0.626472 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/rewards Min -0.735216 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/terminals Mean 0 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/terminals Std 0 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/terminals Max 0 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/terminals Min 0 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/reward scale 1 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/policy/mean Mean 0 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/policy/mean Std 0 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/policy/mean Max 0 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/policy/mean Min 0 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/policy/normal/std Mean 1.00039 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/policy/normal/std Std 0.000467339 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/policy/normal/std Max 1.00084 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/policy/normal/std Min 0.999512 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/policy/normal/log_std Mean 0.00038531 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] 
trainer/policy/normal/log_std Std 0.000467251 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/policy/normal/log_std Max 0.000842574 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/policy/normal/log_std Min -0.000488102 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Alpha 1 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Alpha Loss -0 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/num steps total 2000 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/num paths total 2 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/path length Mean 1000 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/path length Std 0 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/path length Max 1000 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/path length Min 1000 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Rewards Mean -0.278543 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Rewards Std 0.68186 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Rewards Max 1.97229 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Rewards Min -2.44078 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Returns Mean -278.543 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Returns Std 0 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Returns Max -278.543 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Returns Min -278.543 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Actions Mean 0.008082 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Actions Std 0.631892 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Actions Max 0.997034 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Actions Min -0.998462 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Num Paths 1 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Average Returns -278.543 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/num steps total 1000 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/num paths total 1 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/path length Mean 1000 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/path length Std 0 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/path length Max 1000 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/path length Min 1000 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Rewards Mean -0.000895229 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Rewards Std 0.019424 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Rewards Max 0.127781 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Rewards Min -0.351519 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Returns Mean -0.895229 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Returns Std 0 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Returns Max -0.895229 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Returns Min -0.895229 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Actions Mean 1.1957e-07 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Actions Std 3.77022e-06 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Actions Max 9.07126e-05 
+[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Actions Min -0.000104064 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Num Paths 1 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Average Returns -0.895229 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] time/epoch_time (s) 8.01915 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] time/evaluation sampling (s) 0.377766 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] time/exploration sampling (s) 0.450897 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] time/global_time (s) 8.06562 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] time/replay buffer data storing (s) 0.00562119 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] time/sac training (s) 6.46483 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] time/saving (s) 0.0071342 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] time/training (s) 6.71693 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] epoch 0 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] ----------------------------------- -------------- +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] ----------------------------------- -------------- +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] replay_buffer/size 3000 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/num train calls 2000 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/QF1 Loss 1.11979 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/QF2 Loss 2.32574 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Policy Loss -7.3159 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Q1 Predictions Mean 4.97316 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Q1 Predictions Std 0.0135632 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Q1 Predictions Max 4.98672 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Q1 Predictions Min 4.9596 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Q2 Predictions Mean 4.70634 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Q2 Predictions Std 0.451723 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Q2 Predictions Max 5.15807 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Q2 Predictions Min 4.25462 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Q Targets Mean 4.85932 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Q Targets Std 1.06562 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Q Targets Max 5.92495 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Q Targets Min 3.7937 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Log Pis Mean -4.23955 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Log Pis Std 0.247615 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Log Pis Max -3.99193 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Log Pis Min -4.48716 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/rewards Mean -0.458908 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/rewards Std 0.273369 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/rewards Max -0.185539 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/rewards Min -0.732277 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] 
trainer/terminals Mean 0 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/terminals Std 0 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/terminals Max 0 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/terminals Min 0 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/reward scale 1 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/policy/mean Mean 4.17832e-05 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/policy/mean Std 0.0185309 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/policy/mean Max 0.0296929 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/policy/mean Min -0.031448 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/policy/normal/std Mean 0.900724 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/policy/normal/std Std 0.0274135 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/policy/normal/std Max 0.944814 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/policy/normal/std Min 0.843748 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/policy/normal/log_std Mean -0.105023 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/policy/normal/log_std Std 0.03058 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/policy/normal/log_std Max -0.0567676 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/policy/normal/log_std Min -0.169902 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Alpha 0.740502 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] trainer/Alpha Loss -3.07624 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/num steps total 3000 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/num paths total 3 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/path length Mean 1000 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/path length Std 0 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/path length Max 1000 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/path length Min 1000 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Rewards Mean -0.179153 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Rewards Std 0.673567 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Rewards Max 1.78125 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Rewards Min -2.40707 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Returns Mean -179.153 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Returns Std 0 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Returns Max -179.153 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Returns Min -179.153 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Actions Mean 0.00706013 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Actions Std 0.609613 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Actions Max 0.997411 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Actions Min -0.997045 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Num Paths 1 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] expl/Average Returns -179.153 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/num steps total 2000 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/num paths total 2 
+[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/path length Mean 1000 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/path length Std 0 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/path length Max 1000 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/path length Min 1000 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Rewards Mean -0.000701264 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Rewards Std 0.0153608 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Rewards Max 0.0909554 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Rewards Min -0.271044 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Returns Mean -0.701264 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Returns Std 0 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Returns Max -0.701264 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Returns Min -0.701264 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Actions Mean -0.00134304 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Actions Std 0.00609567 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Actions Max 0.0110001 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Actions Min -0.00952567 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Num Paths 1 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] eval/Average Returns -0.701264 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] time/epoch_time (s) 7.77494 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] time/evaluation sampling (s) 0.40692 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] time/exploration sampling (s) 0.448364 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] time/global_time (s) 15.8451 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] time/replay buffer data storing (s) 0.00544906 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] time/sac training (s) 6.645 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] time/saving (s) 0.00674033 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] time/training (s) 6.90003 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] epoch 1 +[21-08-18-default_2021_08_18_14_36_11_id478649--s977983] ----------------------------------- -------------- diff --git a/tests/regression/sac/21-08-18-default_2021_08_18_14_36_11_id478649--s977983/progress.csv b/tests/regression/sac/21-08-18-default_2021_08_18_14_36_11_id478649--s977983/progress.csv new file mode 100644 index 000000000..4907cade7 --- /dev/null +++ b/tests/regression/sac/21-08-18-default_2021_08_18_14_36_11_id478649--s977983/progress.csv @@ -0,0 +1,3 @@ +epoch,eval/Actions Max,eval/Actions Mean,eval/Actions Min,eval/Actions Std,eval/Average Returns,eval/Num Paths,eval/Returns Max,eval/Returns Mean,eval/Returns Min,eval/Returns Std,eval/Rewards Max,eval/Rewards Mean,eval/Rewards Min,eval/Rewards Std,eval/num paths total,eval/num steps total,eval/path length Max,eval/path length Mean,eval/path length Min,eval/path length Std,expl/Actions Max,expl/Actions Mean,expl/Actions Min,expl/Actions Std,expl/Average Returns,expl/Num Paths,expl/Returns Max,expl/Returns Mean,expl/Returns Min,expl/Returns Std,expl/Rewards Max,expl/Rewards Mean,expl/Rewards Min,expl/Rewards Std,expl/num paths total,expl/num steps total,expl/path length Max,expl/path length Mean,expl/path length Min,expl/path length 
Std,replay_buffer/size,time/epoch_time (s),time/evaluation sampling (s),time/exploration sampling (s),time/global_time (s),time/replay buffer data storing (s),time/sac training (s),time/saving (s),time/training (s),trainer/Alpha,trainer/Alpha Loss,trainer/Log Pis Max,trainer/Log Pis Mean,trainer/Log Pis Min,trainer/Log Pis Std,trainer/Policy Loss,trainer/Q Targets Max,trainer/Q Targets Mean,trainer/Q Targets Min,trainer/Q Targets Std,trainer/Q1 Predictions Max,trainer/Q1 Predictions Mean,trainer/Q1 Predictions Min,trainer/Q1 Predictions Std,trainer/Q2 Predictions Max,trainer/Q2 Predictions Mean,trainer/Q2 Predictions Min,trainer/Q2 Predictions Std,trainer/QF1 Loss,trainer/QF2 Loss,trainer/num train calls,trainer/policy/mean Max,trainer/policy/mean Mean,trainer/policy/mean Min,trainer/policy/mean Std,trainer/policy/normal/log_std Max,trainer/policy/normal/log_std Mean,trainer/policy/normal/log_std Min,trainer/policy/normal/log_std Std,trainer/policy/normal/std Max,trainer/policy/normal/std Mean,trainer/policy/normal/std Min,trainer/policy/normal/std Std,trainer/reward scale,trainer/rewards Max,trainer/rewards Mean,trainer/rewards Min,trainer/rewards Std,trainer/terminals Max,trainer/terminals Mean,trainer/terminals Min,trainer/terminals Std +0,9.0712594e-05,1.1957016e-07,-0.000104064056,3.7702214e-06,-0.8952290231208841,1,-0.8952290231208841,-0.8952290231208841,-0.8952290231208841,0.0,0.12778112217645218,-0.0008952290231209042,-0.3515186784366525,0.01942398467443055,1,1000,1000,1000.0,1000,0.0,0.99703395,0.008081997,-0.99846154,0.631892,-278.5426965066304,1,-278.5426965066304,-278.5426965066304,-278.5426965066304,0.0,1.9722930947979778,-0.27854269650663044,-2.4407773121749865,0.6818599705855102,2,2000,1000,1000.0,1000,0.0,2000,8.019153833389282,0.3777656555175781,0.4508967399597168,8.065624952316284,0.005621194839477539,6.464830636978149,0.007134199142456055,6.716925621032715,1.0,-0.0,-4.168746,-4.398275,-4.627804,0.2295289,-4.3981266,2.832911,2.7450848,2.6572583,0.08782637,0.009167064,0.004583532,0.0,0.004583532,0.0013478022,0.00046631842,-0.00041516538,0.0008814838,7.518886,7.540489,1000,0.0,0.0,0.0,0.0,0.0008425738,0.0003853096,-0.00048810232,0.00046725076,1.0008429,1.0003854,0.999512,0.00046733918,1,-0.62647223,-0.6808442,-0.7352162,0.054371983,0.0,0.0,0.0,0.0 +1,0.011000051,-0.0013430383,-0.009525669,0.006095669,-0.7012644561706887,1,-0.7012644561706887,-0.7012644561706887,-0.7012644561706887,0.0,0.09095537207406806,-0.0007012644561706894,-0.271044494429758,0.015360811409931894,2,2000,1000,1000.0,1000,0.0,0.9974111,0.0070601325,-0.99704456,0.60961276,-179.15326251134772,1,-179.15326251134772,-179.15326251134772,-179.15326251134772,0.0,1.7812511614876059,-0.17915326251134767,-2.4070713463174256,0.6735669191445811,3,3000,1000,1000.0,1000,0.0,3000,7.774937868118286,0.4069199562072754,0.4483642578125,15.845108985900879,0.005449056625366211,6.64499568939209,0.0067403316497802734,6.900032997131348,0.7405017018318176,-3.076240301132202,-3.991932,-4.2395473,-4.4871626,0.24761534,-7.315896,5.924946,4.859323,3.7937,1.0656229,4.986722,4.973159,4.9595957,0.013563156,5.158066,4.706343,4.2546206,0.45172262,1.1197884,2.3257403,2000,0.029692946,4.1783245e-05,-0.03144796,0.018530913,-0.05676761,-0.10502273,-0.16990194,0.030579962,0.9448136,0.9007237,0.84374756,0.0274135,1,-0.18553922,-0.45890814,-0.7322771,0.27336892,0.0,0.0,0.0,0.0 diff --git a/tests/regression/sac/21-08-18-default_2021_08_18_14_36_11_id478649--s977983/script_name.txt 
b/tests/regression/sac/21-08-18-default_2021_08_18_14_36_11_id478649--s977983/script_name.txt new file mode 100644 index 000000000..f0cafee7b --- /dev/null +++ b/tests/regression/sac/21-08-18-default_2021_08_18_14_36_11_id478649--s977983/script_name.txt @@ -0,0 +1 @@ +tests/regression/sac/test_sac.py \ No newline at end of file diff --git a/tests/regression/sac/21-08-18-default_2021_08_18_14_36_11_id478649--s977983/variant.json b/tests/regression/sac/21-08-18-default_2021_08_18_14_36_11_id478649--s977983/variant.json new file mode 100644 index 000000000..ef41721ff --- /dev/null +++ b/tests/regression/sac/21-08-18-default_2021_08_18_14_36_11_id478649--s977983/variant.json @@ -0,0 +1,28 @@ +{ + "algorithm": "SAC", + "algorithm_kwargs": { + "batch_size": 2, + "max_path_length": 1000, + "min_num_steps_before_training": 1000, + "num_epochs": 2, + "num_eval_steps_per_epoch": 1000, + "num_expl_steps_per_train_loop": 1000, + "num_trains_per_train_loop": 1000 + }, + "exp_name": "21-08-18-default", + "layer_size": 5, + "replay_buffer_size": 1000000, + "seed": 977983, + "trainer_kwargs": { + "discount": 0.99, + "policy_lr": 0.0003, + "qf_lr": 0.0003, + "reward_scale": 1, + "soft_target_tau": 0.005, + "target_update_period": 1, + "use_automatic_entropy_tuning": true + }, + "trial_name": "21-08-18-default_2021_08_18_14_36_11_id478649--s977983", + "unique_id": "f1aea952-7b36-4ea2-91ee-1205992a917d", + "version": "normal" +} \ No newline at end of file diff --git a/tests/regression/sac/__init__.py b/tests/regression/sac/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/regression/sac/test_sac.py b/tests/regression/sac/test_sac.py new file mode 100644 index 000000000..d138bdda2 --- /dev/null +++ b/tests/regression/sac/test_sac.py @@ -0,0 +1,130 @@ +"""Test whether this matches exactly SAC debug""" + +import unittest + +from gym.envs.mujoco import HalfCheetahEnv + +import rlkit.torch.pytorch_util as ptu +from rlkit.data_management.env_replay_buffer import EnvReplayBuffer +from rlkit.envs.wrappers import NormalizedBoxEnv +from rlkit.launchers.launcher_util import run_experiment +from rlkit.samplers.data_collector import MdpPathCollector +from rlkit.torch.sac.policies import TanhGaussianPolicy, MakeDeterministic +from rlkit.torch.sac.sac import SACTrainer +from rlkit.torch.networks import ConcatMlp +from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm + + +def experiment(variant): + expl_env = NormalizedBoxEnv(HalfCheetahEnv()) + eval_env = NormalizedBoxEnv(HalfCheetahEnv()) + obs_dim = expl_env.observation_space.low.size + action_dim = eval_env.action_space.low.size + + expl_env.seed(0) + eval_env.seed(0) + + M = variant['layer_size'] + qf1 = ConcatMlp( + input_size=obs_dim + action_dim, + output_size=1, + hidden_sizes=[M, M], + ) + qf2 = ConcatMlp( + input_size=obs_dim + action_dim, + output_size=1, + hidden_sizes=[M, M], + ) + target_qf1 = ConcatMlp( + input_size=obs_dim + action_dim, + output_size=1, + hidden_sizes=[M, M], + ) + target_qf2 = ConcatMlp( + input_size=obs_dim + action_dim, + output_size=1, + hidden_sizes=[M, M], + ) + policy = TanhGaussianPolicy( + obs_dim=obs_dim, + action_dim=action_dim, + hidden_sizes=[M, M], + ) + eval_policy = MakeDeterministic(policy) + eval_path_collector = MdpPathCollector( + eval_env, + eval_policy, + ) + expl_path_collector = MdpPathCollector( + expl_env, + policy, + ) + replay_buffer = EnvReplayBuffer( + variant['replay_buffer_size'], + expl_env, + ) + trainer = SACTrainer( + env=eval_env, + policy=policy, + qf1=qf1, + 
qf2=qf2,
+        target_qf1=target_qf1,
+        target_qf2=target_qf2,
+        **variant['trainer_kwargs']
+    )
+    algorithm = TorchBatchRLAlgorithm(
+        trainer=trainer,
+        exploration_env=expl_env,
+        evaluation_env=eval_env,
+        exploration_data_collector=expl_path_collector,
+        evaluation_data_collector=eval_path_collector,
+        replay_buffer=replay_buffer,
+        **variant['algorithm_kwargs']
+    )
+    algorithm.to(ptu.device)
+    algorithm.train()
+
+import os
+import sys
+
+from rlkit.core import logger
+from rlkit.testing import csv_util
+
+def test_sac_online():
+    # noinspection PyTypeChecker
+    variant = dict(
+        algorithm="SAC",
+        version="normal",
+        layer_size=5,
+        replay_buffer_size=int(1E6),
+        algorithm_kwargs=dict(
+            num_epochs=2,
+            num_eval_steps_per_epoch=1000,
+            num_trains_per_train_loop=1000,
+            num_expl_steps_per_train_loop=1000,
+            min_num_steps_before_training=1000,
+            max_path_length=1000,
+            batch_size=2,
+        ),
+        trainer_kwargs=dict(
+            discount=0.99,
+            soft_target_tau=5e-3,
+            target_update_period=1,
+            policy_lr=3E-4,
+            qf_lr=3E-4,
+            reward_scale=1,
+            use_automatic_entropy_tuning=True,
+        ),
+    )
+    run_experiment(experiment, variant=variant, unpack_variant=False, mode="here_no_doodad", seed=977983)
+
+    # check that the online training results match the reference run
+    reference_csv = "tests/regression/sac/21-08-18-default_2021_08_18_14_36_11_id478649--s977983/progress.csv"
+    output_csv = os.path.join(logger.get_snapshot_dir(), "progress.csv")
+    output = csv_util.get_exp(output_csv)
+    reference = csv_util.get_exp(reference_csv)
+    keys = ["eval/Average Returns", "trainer/Q1 Predictions Mean", ]
+    csv_util.check_equal(reference, output, keys)
+
+if __name__ == "__main__":
+    test_sac_online()
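
A note on the pattern shared by all of the regression tests above: each test shrinks an example script's `variant` so it runs in seconds, executes it, and then compares selected columns of the freshly written `progress.csv` against a reference CSV checked in next to the test. The `rlkit.testing.csv_util` helpers themselves are not part of this diff, so the following is only a hypothetical sketch of the contract the tests appear to rely on (`get_exp` loads rows as dicts; `check_equal` compares the chosen keys row by row), not the actual implementation.

```python
# Hypothetical sketch of the csv_util helpers used by the regression tests.
# ASSUMPTION: the real rlkit.testing.csv_util may differ (e.g. in tolerance
# handling); this only illustrates the comparison contract.
import csv
import math


def get_exp(path):
    # Load a progress.csv as a list of {column-name: string-value} dicts.
    with open(path) as f:
        return list(csv.DictReader(f))


def check_equal(reference, output, keys, rel_tol=1e-8):
    # Compare the selected columns row by row, i.e. epoch by epoch.
    assert len(reference) == len(output), "different number of epochs logged"
    for i, (ref_row, out_row) in enumerate(zip(reference, output)):
        for key in keys:
            ref_val = float(ref_row[key])
            out_val = float(out_row[key])
            assert math.isclose(ref_val, out_val, rel_tol=rel_tol), (
                "epoch %d, key %r: expected %s, got %s"
                % (i, key, ref_val, out_val))
```

Since every test module ends in an `if __name__ == "__main__"` guard, a single regression can also be run directly, e.g. `python tests/regression/sac/test_sac.py`.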
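
The reference fixtures (the checked-in `progress.csv` files and `test_mujoco_env_obs.npy`) have to be regenerated whenever behavior changes legitimately; the commented-out `np.save(...)` in `test_mujoco_env.py` marks where that happens. Below is a sketch of the regeneration flow that comment implies. It mirrors the test's own seeding and rollout prefix; whether it reproduces bit-identical observations still depends on the installed gym and MuJoCo versions.

```python
# Regenerate the HalfCheetah observation fixture for test_mujoco_env_hc.
# This repeats the test's exact prefix (seeded reset, one zero-action step,
# reseeded NumPy RNG, second reset) so the saved rollout lines up with the
# test's assertions.
from gym.envs.mujoco import HalfCheetahEnv
import numpy as np

env = HalfCheetahEnv()
env.seed(0)
env.reset()
env.step(np.zeros(6))  # the test steps once with zero actions before the rollout

np.random.seed(0)
env.reset()
N, M = 1000, 17  # number of steps, observation dimension
observations = np.zeros((N, M))
for i in range(N):
    obs, _, _, _ = env.step(np.random.random(6))
    observations[i, :] = obs

np.save("tests/regression/random/test_mujoco_env_obs.npy", observations)
```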