diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml index 8541dbee2..70e06374c 100644 --- a/.github/workflows/install.yml +++ b/.github/workflows/install.yml @@ -3,6 +3,7 @@ name: install on: push: pull_request: + workflow_dispatch: jobs: test: diff --git a/.gitignore b/.gitignore index 5906b9179..1578783b4 100644 --- a/.gitignore +++ b/.gitignore @@ -149,3 +149,5 @@ checkpoints/ experiments/ wandb/ raylib/ +.vscode/launch.json +raylib-5.0_* diff --git a/clean_pufferl.py b/clean_pufferl.py index 247a8127f..3c3998902 100644 --- a/clean_pufferl.py +++ b/clean_pufferl.py @@ -1,29 +1,24 @@ -from pdb import set_trace as T -import numpy as np - import os import random -import psutil import time - -from threading import Thread from collections import defaultdict, deque +from threading import Thread +import numpy as np +import psutil +import pufferlib +import pufferlib.pytorch +import pufferlib.utils import rich +import torch from rich.console import Console from rich.table import Table -import torch - -import pufferlib -import pufferlib.utils -import pufferlib.pytorch - -torch.set_float32_matmul_precision('high') +torch.set_float32_matmul_precision("high") # Fast Cython GAE implementation -#import pyximport -#pyximport.install(setup_args={"include_dirs": np.get_include()}) +# import pyximport +# pyximport.install(setup_args={"include_dirs": np.get_include()}) from c_gae import compute_gae @@ -33,7 +28,7 @@ def create(config, vecenv, policy, optimizer=None, wandb=None): losses = make_losses() utilization = Utilization() - msg = f'Model Size: {abbreviate(count_params(policy))} parameters' + msg = f"Model Size: {abbreviate(count_params(policy))} parameters" print_dashboard(config.env, utilization, 0, 0, profile, losses, {}, msg, clear=True) vecenv.async_reset(config.seed) @@ -43,18 +38,28 @@ def create(config, vecenv, policy, optimizer=None, wandb=None): atn_dtype = vecenv.single_action_space.dtype total_agents = vecenv.num_agents - lstm = policy.lstm if hasattr(policy, 'lstm') else None - experience = Experience(config.batch_size, config.bptt_horizon, - config.minibatch_size, obs_shape, obs_dtype, atn_shape, atn_dtype, - config.cpu_offload, config.device, lstm, total_agents) + lstm = policy.lstm if hasattr(policy, "lstm") else None + experience = Experience( + config.batch_size, + config.bptt_horizon, + config.minibatch_size, + policy.hidden_size, + obs_shape, + obs_dtype, + atn_shape, + atn_dtype, + config.cpu_offload, + config.device, + lstm, + total_agents, + ) uncompiled_policy = policy if config.compile: policy = torch.compile(policy, mode=config.compile_mode) - optimizer = torch.optim.Adam(policy.parameters(), - lr=config.learning_rate, eps=1e-5) + optimizer = torch.optim.Adam(policy.parameters(), lr=config.learning_rate, eps=1e-5) return pufferlib.namespace( config=config, @@ -74,6 +79,7 @@ def create(config, vecenv, policy, optimizer=None, wandb=None): utilization=utilization, ) + @pufferlib.utils.profile def evaluate(data): config, profile, experience = data.config, data.profile, data.experience @@ -82,6 +88,7 @@ def evaluate(data): policy = data.policy infos = defaultdict(list) lstm_h, lstm_c = experience.lstm_h, experience.lstm_c + e3b_inv = experience.e3b_inv while not experience.full: with profile.env: @@ -99,22 +106,26 @@ def evaluate(data): with profile.eval_forward, torch.no_grad(): # TODO: In place-update should be faster. 
Leaking 7% speed max # Also should be using a cuda tensor to index + e3b = e3b_inv[env_id] if lstm_h is not None: h = lstm_h[:, env_id] c = lstm_c[:, env_id] - actions, logprob, _, value, (h, c) = policy(o_device, (h, c)) + actions, logprob, _, value, (h, c), next_e3b, intrinsic_reward = policy(o_device, (h, c), e3b=e3b) lstm_h[:, env_id] = h lstm_c[:, env_id] = c else: - actions, logprob, _, value = policy(o_device) + actions, logprob, _, value, next_e3b, intrinsic_reward = policy(o_device, e3b=e3b) + + e3b_inv[env_id] = next_e3b + r += intrinsic_reward.cpu() - if config.device == 'cuda': + if config.device == "cuda": torch.cuda.synchronize() with profile.eval_misc: value = value.flatten() actions = actions.cpu().numpy() - mask = torch.as_tensor(mask)# * policy.mask) + mask = torch.as_tensor(mask) # * policy.mask) o = o if config.cpu_offload else o_device experience.store(o, value, actions, logprob, r, d, env_id, mask) @@ -127,8 +138,8 @@ def evaluate(data): with profile.eval_misc: for k, v in infos.items(): - if '_map' in k and data.wandb is not None: - data.stats[f'Media/{k}'] = data.wandb.Image(v[0]) + if "_map" in k and data.wandb is not None: + data.stats[f"Media/{k}"] = data.wandb.Image(v[0]) continue if isinstance(v, np.ndarray): @@ -145,6 +156,7 @@ def evaluate(data): data.experience.step = 0 return data.stats, infos + @pufferlib.utils.profile def train(data): config, profile, experience = data.config, data.profile, data.experience @@ -157,8 +169,7 @@ def train(data): values_np = experience.values_np[idxs] rewards_np = experience.rewards_np[idxs] # TODO: bootstrap between segment bounds - advantages_np = compute_gae(dones_np, values_np, - rewards_np, config.gamma, config.gae_lambda) + advantages_np = compute_gae(dones_np, values_np, rewards_np, config.gamma, config.gae_lambda) experience.flatten_batch(advantages_np) # Optimizing the policy and value network @@ -179,16 +190,15 @@ def train(data): with profile.train_forward: if experience.lstm_h is not None: - _, newlogprob, entropy, newvalue, lstm_state = data.policy( - obs, state=lstm_state, action=atn) + _, newlogprob, entropy, newvalue, lstm_state, _, _ = data.policy(obs, state=lstm_state, action=atn) lstm_state = (lstm_state[0].detach(), lstm_state[1].detach()) else: - _, newlogprob, entropy, newvalue = data.policy( + _, newlogprob, entropy, newvalue, _, _ = data.policy( obs.reshape(-1, *data.vecenv.single_observation_space.shape), action=atn, ) - if config.device == 'cuda': + if config.device == "cuda": torch.cuda.synchronize() with profile.train_misc: @@ -207,9 +217,7 @@ def train(data): # Policy loss pg_loss1 = -adv * ratio - pg_loss2 = -adv * torch.clamp( - ratio, 1 - config.clip_coef, 1 + config.clip_coef - ) + pg_loss2 = -adv * torch.clamp(ratio, 1 - config.clip_coef, 1 + config.clip_coef) pg_loss = torch.max(pg_loss1, pg_loss2).mean() # Value loss @@ -235,7 +243,7 @@ def train(data): loss.backward() torch.nn.utils.clip_grad_norm_(data.policy.parameters(), config.max_grad_norm) data.optimizer.step() - if config.device == 'cuda': + if config.device == "cuda": torch.cuda.synchronize() with profile.train_misc: @@ -268,13 +276,15 @@ def train(data): # TODO: make this appear faster if done_training or profile.update(data): mean_and_log(data) - print_dashboard(config.env, data.utilization, data.global_step, data.epoch, - profile, data.losses, data.stats, data.msg) + print_dashboard( + config.env, data.utilization, data.global_step, data.epoch, profile, data.losses, data.stats, data.msg + ) data.stats = defaultdict(list) if 
data.epoch % config.checkpoint_interval == 0 or done_training: save_checkpoint(data) - data.msg = f'Checkpoint saved at update {data.epoch}' + data.msg = f"Checkpoint saved at update {data.epoch}" + def mean_and_log(data): for k in list(data.stats.keys()): @@ -290,15 +300,18 @@ def mean_and_log(data): return data.last_log_time = time.time() - data.wandb.log({ - '0verview/SPS': data.profile.SPS, - '0verview/agent_steps': data.global_step, - '0verview/epoch': data.epoch, - '0verview/learning_rate': data.optimizer.param_groups[0]["lr"], - **{f'environment/{k}': v for k, v in data.stats.items()}, - **{f'losses/{k}': v for k, v in data.losses.items()}, - **{f'performance/{k}': v for k, v in data.profile}, - }) + data.wandb.log( + { + "0verview/SPS": data.profile.SPS, + "0verview/agent_steps": data.global_step, + "0verview/epoch": data.epoch, + "0verview/learning_rate": data.optimizer.param_groups[0]["lr"], + **{f"environment/{k}": v for k, v in data.stats.items()}, + **{f"losses/{k}": v for k, v in data.losses.items()}, + **{f"performance/{k}": v for k, v in data.profile}, + } + ) + def close(data): data.vecenv.close() @@ -312,6 +325,7 @@ def close(data): data.wandb.run.log_artifact(artifact) data.wandb.finish() + class Profile: SPS: ... = 0 uptime: ... = 0 @@ -324,6 +338,7 @@ class Profile: train_forward_time: ... = 0 learn_time: ... = 0 train_misc_time: ... = 0 + def __init__(self): self.start = time.time() self.env = pufferlib.utils.Profiler() @@ -335,17 +350,17 @@ def __init__(self): self.prev_steps = 0 def __iter__(self): - yield 'SPS', self.SPS - yield 'uptime', self.uptime - yield 'remaining', self.remaining - yield 'eval_time', self.eval_time - yield 'env_time', self.env_time - yield 'eval_forward_time', self.eval_forward_time - yield 'eval_misc_time', self.eval_misc_time - yield 'train_time', self.train_time - yield 'train_forward_time', self.train_forward_time - yield 'learn_time', self.learn_time - yield 'train_misc_time', self.train_misc_time + yield "SPS", self.SPS + yield "uptime", self.uptime + yield "remaining", self.remaining + yield "eval_time", self.eval_time + yield "env_time", self.env_time + yield "eval_forward_time", self.eval_forward_time + yield "eval_misc_time", self.eval_misc_time + yield "train_time", self.train_time + yield "train_forward_time", self.train_forward_time + yield "learn_time", self.learn_time + yield "train_misc_time", self.train_misc_time @property def epoch_time(self): @@ -365,16 +380,17 @@ def update(self, data, interval_s=1): self.uptime = uptime self.remaining = (data.config.total_timesteps - global_step) / self.SPS - self.eval_time = data._timers['evaluate'].elapsed + self.eval_time = data._timers["evaluate"].elapsed self.eval_forward_time = self.eval_forward.elapsed self.env_time = self.env.elapsed self.eval_misc_time = self.eval_misc.elapsed - self.train_time = data._timers['train'].elapsed + self.train_time = data._timers["train"].elapsed self.train_forward_time = self.train_forward.elapsed self.learn_time = self.learn.elapsed self.train_misc_time = self.train_misc.elapsed return True + def make_losses(): return pufferlib.namespace( policy_loss=0, @@ -386,27 +402,43 @@ def make_losses(): explained_variance=0, ) + class Experience: - '''Flat tensor storage and array views for faster indexing''' - def __init__(self, batch_size, bptt_horizon, minibatch_size, obs_shape, obs_dtype, atn_shape, atn_dtype, - cpu_offload=False, device='cuda', lstm=None, lstm_total_agents=0): + """Flat tensor storage and array views for faster indexing""" + + def 
__init__( + self, + batch_size, + bptt_horizon, + minibatch_size, + hidden_size, + obs_shape, + obs_dtype, + atn_shape, + atn_dtype, + cpu_offload=False, + device="cuda", + lstm=None, + lstm_total_agents=0, + ): if minibatch_size is None: minibatch_size = batch_size obs_dtype = pufferlib.pytorch.numpy_to_torch_dtype_dict[obs_dtype] atn_dtype = pufferlib.pytorch.numpy_to_torch_dtype_dict[atn_dtype] - pin = device == 'cuda' and cpu_offload - obs_device = device if not pin else 'cpu' - self.obs=torch.zeros(batch_size, *obs_shape, dtype=obs_dtype, - pin_memory=pin, device=device if not pin else 'cpu') - self.actions=torch.zeros(batch_size, *atn_shape, dtype=atn_dtype, pin_memory=pin) - self.logprobs=torch.zeros(batch_size, pin_memory=pin) - self.rewards=torch.zeros(batch_size, pin_memory=pin) - self.dones=torch.zeros(batch_size, pin_memory=pin) - self.truncateds=torch.zeros(batch_size, pin_memory=pin) - self.values=torch.zeros(batch_size, pin_memory=pin) - - #self.obs_np = np.asarray(self.obs) + pin = device == "cuda" and cpu_offload + obs_device = device if not pin else "cpu" + self.obs = torch.zeros( + batch_size, *obs_shape, dtype=obs_dtype, pin_memory=pin, device=device if not pin else "cpu" + ) + self.actions = torch.zeros(batch_size, *atn_shape, dtype=atn_dtype, pin_memory=pin) + self.logprobs = torch.zeros(batch_size, pin_memory=pin) + self.rewards = torch.zeros(batch_size, pin_memory=pin) + self.dones = torch.zeros(batch_size, pin_memory=pin) + self.truncateds = torch.zeros(batch_size, pin_memory=pin) + self.values = torch.zeros(batch_size, pin_memory=pin) + self.e3b_inv = 1 * torch.eye(hidden_size).repeat(lstm_total_agents, 1, 1).to(device) + self.actions_np = np.asarray(self.actions) self.logprobs_np = np.asarray(self.logprobs) self.rewards_np = np.asarray(self.rewards) @@ -424,12 +456,12 @@ def __init__(self, batch_size, bptt_horizon, minibatch_size, obs_shape, obs_dtyp num_minibatches = batch_size / minibatch_size self.num_minibatches = int(num_minibatches) if self.num_minibatches != num_minibatches: - raise ValueError('batch_size must be divisible by minibatch_size') + raise ValueError("batch_size must be divisible by minibatch_size") minibatch_rows = minibatch_size / bptt_horizon self.minibatch_rows = int(minibatch_rows) if self.minibatch_rows != minibatch_rows: - raise ValueError('minibatch_size must be divisible by bptt_horizon') + raise ValueError("minibatch_size must be divisible by bptt_horizon") self.batch_size = batch_size self.bptt_horizon = bptt_horizon @@ -446,9 +478,9 @@ def full(self): def store(self, obs, value, action, logprob, reward, done, env_id, mask): # Mask learner and Ensure indices do not exceed batch size ptr = self.ptr - indices = torch.where(mask)[0].numpy()[:self.batch_size - ptr] + indices = torch.where(mask)[0].numpy()[: self.batch_size - ptr] end = ptr + len(indices) - + self.obs[ptr:end] = obs.to(self.obs.device)[indices] self.values_np[ptr:end] = value.cpu().numpy()[indices] self.actions_np[ptr:end] = action[indices] @@ -460,14 +492,16 @@ def store(self, obs, value, action, logprob, reward, done, env_id, mask): self.step += 1 def sort_training_data(self): - idxs = np.asarray(sorted( - range(len(self.sort_keys)), key=self.sort_keys.__getitem__)) - self.b_idxs_obs = torch.as_tensor(idxs.reshape( - self.minibatch_rows, self.num_minibatches, self.bptt_horizon - ).transpose(1,0,-1)).to(self.obs.device).long() + idxs = np.asarray(sorted(range(len(self.sort_keys)), key=self.sort_keys.__getitem__)) + self.b_idxs_obs = ( + torch.as_tensor( + 
idxs.reshape(self.minibatch_rows, self.num_minibatches, self.bptt_horizon).transpose(1, 0, -1) + ) + .to(self.obs.device) + .long() + ) self.b_idxs = self.b_idxs_obs.to(self.device) - self.b_idxs_flat = self.b_idxs.reshape( - self.num_minibatches, self.minibatch_size) + self.b_idxs_flat = self.b_idxs.reshape(self.num_minibatches, self.minibatch_size) self.sort_keys = [] return idxs @@ -478,9 +512,11 @@ def flatten_batch(self, advantages_np): self.b_logprobs = self.logprobs.to(self.device, non_blocking=True) self.b_dones = self.dones.to(self.device, non_blocking=True) self.b_values = self.values.to(self.device, non_blocking=True) - self.b_advantages = advantages.reshape(self.minibatch_rows, - self.num_minibatches, self.bptt_horizon).transpose(0, 1).reshape( - self.num_minibatches, self.minibatch_size) + self.b_advantages = ( + advantages.reshape(self.minibatch_rows, self.num_minibatches, self.bptt_horizon) + .transpose(0, 1) + .reshape(self.num_minibatches, self.minibatch_size) + ) self.returns_np = advantages_np + self.values_np self.b_obs = self.obs[self.b_idxs_obs] self.b_actions = self.b_actions[b_idxs].contiguous() @@ -489,6 +525,7 @@ def flatten_batch(self, advantages_np): self.b_values = self.b_values[b_flat] self.b_returns = self.b_advantages + self.b_values + class Utilization(Thread): def __init__(self, delay=1, maxlen=20): super().__init__() @@ -503,13 +540,13 @@ def __init__(self, delay=1, maxlen=20): def run(self): while not self.stopped: - self.cpu_util.append(100*psutil.cpu_percent()) + self.cpu_util.append(100 * psutil.cpu_percent()) mem = psutil.virtual_memory() - self.cpu_mem.append(100*mem.active/mem.total) + self.cpu_mem.append(100 * mem.active / mem.total) if torch.cuda.is_available(): self.gpu_util.append(torch.cuda.utilization()) free, total = torch.cuda.mem_get_info() - self.gpu_mem.append(100*free/total) + self.gpu_mem.append(100 * free / total) else: self.gpu_util.append(0) self.gpu_mem.append(0) @@ -518,13 +555,14 @@ def run(self): def stop(self): self.stopped = True + def save_checkpoint(data): config = data.config path = os.path.join(config.data_dir, config.exp_id) if not os.path.exists(path): os.makedirs(path) - model_name = f'model_{data.epoch:06d}.pt' + model_name = f"model_{data.epoch:06d}.pt" model_path = os.path.join(path, model_name) if os.path.exists(model_path): return model_path @@ -532,40 +570,52 @@ def save_checkpoint(data): torch.save(data.uncompiled_policy, model_path) state = { - 'optimizer_state_dict': data.optimizer.state_dict(), - 'global_step': data.global_step, - 'agent_step': data.global_step, - 'update': data.epoch, - 'model_name': model_name, - 'exp_id': config.exp_id, + "optimizer_state_dict": data.optimizer.state_dict(), + "global_step": data.global_step, + "agent_step": data.global_step, + "update": data.epoch, + "model_name": model_name, + "exp_id": config.exp_id, } - state_path = os.path.join(path, 'trainer_state.pt') - torch.save(state, state_path + '.tmp') - os.rename(state_path + '.tmp', state_path) + state_path = os.path.join(path, "trainer_state.pt") + torch.save(state, state_path + ".tmp") + os.rename(state_path + ".tmp", state_path) return model_path + def try_load_checkpoint(data): config = data.config path = os.path.join(config.data_dir, config.exp_id) if not os.path.exists(path): - print('No checkpoints found. Assuming new experiment') + print("No checkpoints found. 
Assuming new experiment") return - trainer_path = os.path.join(path, 'trainer_state.pt') + trainer_path = os.path.join(path, "trainer_state.pt") resume_state = torch.load(trainer_path) - model_path = os.path.join(path, resume_state['model_name']) + model_path = os.path.join(path, resume_state["model_name"]) data.policy.uncompiled.load_state_dict(model_path, map_location=config.device) - data.optimizer.load_state_dict(resume_state['optimizer_state_dict']) - print(f'Loaded checkpoint {resume_state["model_name"]}') + data.optimizer.load_state_dict(resume_state["optimizer_state_dict"]) + print(f"Loaded checkpoint {resume_state['model_name']}") + def count_params(policy): return sum(p.numel() for p in policy.parameters() if p.requires_grad) -def rollout(env_creator, env_kwargs, policy_cls, rnn_cls, agent_creator, agent_kwargs, - backend, render_mode='auto', model_path=None, device='cuda'): - if render_mode != 'auto': - env_kwargs['render_mode'] = render_mode +def rollout( + env_creator, + env_kwargs, + policy_cls, + rnn_cls, + agent_creator, + agent_kwargs, + backend, + render_mode="auto", + model_path=None, + device="cuda", +): + if render_mode != "auto": + env_kwargs["render_mode"] = render_mode # We are just using Serial vecenv to give a consistent # single-agent/multi-agent API for evaluation @@ -578,7 +628,7 @@ def rollout(env_creator, env_kwargs, policy_cls, rnn_cls, agent_creator, agent_k ob, info = env.reset() driver = env.driver_env - os.system('clear') + os.system("clear") state = None frames = [] @@ -586,22 +636,23 @@ def rollout(env_creator, env_kwargs, policy_cls, rnn_cls, agent_creator, agent_k while tick <= 2000: if tick % 1 == 0: render = driver.render() - if driver.render_mode == 'ansi': - print('\033[0;0H' + render + '\n') + if driver.render_mode == "ansi": + print("\033[0;0H" + render + "\n") time.sleep(0.05) - elif driver.render_mode == 'rgb_array': + elif driver.render_mode == "rgb_array": frames.append(render) import cv2 + render = cv2.cvtColor(render, cv2.COLOR_RGB2BGR) - cv2.imshow('frame', render) + cv2.imshow("frame", render) cv2.waitKey(1) - time.sleep(1/24) - elif driver.render_mode in ('human', 'raylib') and render is not None: + time.sleep(1 / 24) + elif driver.render_mode in ("human", "raylib") and render is not None: frames.append(render) with torch.no_grad(): ob = torch.as_tensor(ob).to(device) - if hasattr(agent, 'lstm'): + if hasattr(agent, "lstm"): action, _, _, _, state = agent(ob, state) else: action, _, _, _ = agent(ob) @@ -611,13 +662,15 @@ def rollout(env_creator, env_kwargs, policy_cls, rnn_cls, agent_creator, agent_k ob, reward = env.step(action)[:2] reward = reward.mean() if tick % 128 == 0: - print(f'Reward: {reward:.4f}, Tick: {tick}') + print(f"Reward: {reward:.4f}, Tick: {tick}") tick += 1 # Save frames as gif if frames: import imageio - os.makedirs('../docker', exist_ok=True) or imageio.mimsave('../docker/eval.gif', frames, fps=15, loop=0) + + os.makedirs("../docker", exist_ok=True) or imageio.mimsave("../docker/eval.gif", frames, fps=15, loop=0) + def seed_everything(seed, torch_deterministic): random.seed(seed) @@ -626,34 +679,28 @@ def seed_everything(seed, torch_deterministic): torch.manual_seed(seed) torch.backends.cudnn.deterministic = torch_deterministic -ROUND_OPEN = rich.box.Box( - "╭──╮\n" - "│ │\n" - "│ │\n" - "│ │\n" - "│ │\n" - "│ │\n" - "│ │\n" - "╰──╯\n" -) - -c1 = '[bright_cyan]' -c2 = '[white]' -c3 = '[cyan]' -b1 = '[bright_cyan]' -b2 = '[bright_white]' + +ROUND_OPEN = rich.box.Box("╭──╮\n│ │\n│ │\n│ │\n│ │\n│ │\n│ │\n╰──╯\n") + 
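For readability: rich.box.Box parses the collapsed literal above as eight four-character rows (top, head, head-row, mid, row, foot-row, foot, bottom). The same box spelled out row by row, as in the multi-line form this hunk replaces (assuming rich's documented 8x4 template, so each middle row is "│  │" with two interior spaces):

```python
import rich.box

# Same rounded, open-sided box as ROUND_OPEN above, one template row per line.
ROUND_OPEN = rich.box.Box(
    "╭──╮\n"  # top
    "│  │\n"  # head
    "│  │\n"  # head_row
    "│  │\n"  # mid
    "│  │\n"  # row
    "│  │\n"  # foot_row
    "│  │\n"  # foot
    "╰──╯\n"  # bottom
)
```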
+c1 = "[bright_cyan]" +c2 = "[white]" +c3 = "[cyan]" +b1 = "[bright_cyan]" +b2 = "[bright_white]" + def abbreviate(num): if num < 1e3: - return f'{b2}{num:.0f}' + return f"{b2}{num:.0f}" elif num < 1e6: - return f'{b2}{num/1e3:.1f}{c2}k' + return f"{b2}{num / 1e3:.1f}{c2}k" elif num < 1e9: - return f'{b2}{num/1e6:.1f}{c2}m' + return f"{b2}{num / 1e6:.1f}{c2}m" elif num < 1e12: - return f'{b2}{num/1e9:.1f}{c2}b' + return f"{b2}{num / 1e9:.1f}{c2}b" else: - return f'{b2}{num/1e12:.1f}{c2}t' + return f"{b2}{num / 1e12:.1f}{c2}t" + def duration(seconds): seconds = int(seconds) @@ -662,19 +709,19 @@ def duration(seconds): s = seconds % 60 return f"{b2}{h}{c2}h {b2}{m}{c2}m {b2}{s}{c2}s" if h else f"{b2}{m}{c2}m {b2}{s}{c2}s" if m else f"{b2}{s}{c2}s" + def fmt_perf(name, time, uptime): - percent = 0 if uptime == 0 else int(100*time/uptime - 1e-5) - return f'{c1}{name}', duration(time), f'{b2}{percent:2d}%' + percent = 0 if uptime == 0 else int(100 * time / uptime - 1e-5) + return f"{c1}{name}", duration(time), f"{b2}{percent:2d}%" + # TODO: Add env name to print_dashboard -def print_dashboard(env_name, utilization, global_step, epoch, - profile, losses, stats, msg, clear=False, max_stats=[0]): +def print_dashboard(env_name, utilization, global_step, epoch, profile, losses, stats, msg, clear=False, max_stats=[0]): console = Console() if clear: console.clear() - dashboard = Table(box=ROUND_OPEN, expand=True, - show_header=False, border_style='bright_cyan') + dashboard = Table(box=ROUND_OPEN, expand=True, show_header=False, border_style="bright_cyan") table = Table(box=None, expand=True, show_header=False) dashboard.add_row(table) @@ -688,41 +735,44 @@ def print_dashboard(env_name, utilization, global_step, epoch, table.add_column(justify="center", width=13) table.add_column(justify="right", width=13) table.add_row( - f':blowfish: {c1}PufferLib {b2}2.0.0', - f'{c1}CPU: {c3}{cpu_percent:.1f}%', - f'{c1}GPU: {c3}{gpu_percent:.1f}%', - f'{c1}DRAM: {c3}{dram_percent:.1f}%', - f'{c1}VRAM: {c3}{vram_percent:.1f}%', + f":blowfish: {c1}PufferLib {b2}2.0.0", + f"{c1}CPU: {c3}{cpu_percent:.1f}%", + f"{c1}GPU: {c3}{gpu_percent:.1f}%", + f"{c1}DRAM: {c3}{dram_percent:.1f}%", + f"{c1}VRAM: {c3}{vram_percent:.1f}%", ) - + s = Table(box=None, expand=True) - s.add_column(f"{c1}Summary", justify='left', vertical='top', width=16) - s.add_column(f"{c1}Value", justify='right', vertical='top', width=8) - s.add_row(f'{c2}Environment', f'{b2}{env_name}') - s.add_row(f'{c2}Agent Steps', abbreviate(global_step)) - s.add_row(f'{c2}SPS', abbreviate(profile.SPS)) - s.add_row(f'{c2}Epoch', abbreviate(epoch)) - s.add_row(f'{c2}Uptime', duration(profile.uptime)) - s.add_row(f'{c2}Remaining', duration(profile.remaining)) + s.add_column(f"{c1}Summary", justify="left", vertical="top", width=16) + s.add_column(f"{c1}Value", justify="right", vertical="top", width=8) + s.add_row(f"{c2}Environment", f"{b2}{env_name}") + s.add_row(f"{c2}Agent Steps", abbreviate(global_step)) + s.add_row(f"{c2}SPS", abbreviate(profile.SPS)) + s.add_row(f"{c2}Epoch", abbreviate(epoch)) + s.add_row(f"{c2}Uptime", duration(profile.uptime)) + s.add_row(f"{c2}Remaining", duration(profile.remaining)) p = Table(box=None, expand=True, show_header=False) p.add_column(f"{c1}Performance", justify="left", width=10) p.add_column(f"{c1}Time", justify="right", width=8) p.add_column(f"{c1}%", justify="right", width=4) - p.add_row(*fmt_perf('Evaluate', profile.eval_time, profile.uptime)) - p.add_row(*fmt_perf(' Forward', profile.eval_forward_time, profile.uptime)) - 
p.add_row(*fmt_perf(' Env', profile.env_time, profile.uptime)) - p.add_row(*fmt_perf(' Misc', profile.eval_misc_time, profile.uptime)) - p.add_row(*fmt_perf('Train', profile.train_time, profile.uptime)) - p.add_row(*fmt_perf(' Forward', profile.train_forward_time, profile.uptime)) - p.add_row(*fmt_perf(' Learn', profile.learn_time, profile.uptime)) - p.add_row(*fmt_perf(' Misc', profile.train_misc_time, profile.uptime)) - - l = Table(box=None, expand=True, ) - l.add_column(f'{c1}Losses', justify="left", width=16) - l.add_column(f'{c1}Value', justify="right", width=8) + p.add_row(*fmt_perf("Evaluate", profile.eval_time, profile.uptime)) + p.add_row(*fmt_perf(" Forward", profile.eval_forward_time, profile.uptime)) + p.add_row(*fmt_perf(" Env", profile.env_time, profile.uptime)) + p.add_row(*fmt_perf(" Misc", profile.eval_misc_time, profile.uptime)) + p.add_row(*fmt_perf("Train", profile.train_time, profile.uptime)) + p.add_row(*fmt_perf(" Forward", profile.train_forward_time, profile.uptime)) + p.add_row(*fmt_perf(" Learn", profile.learn_time, profile.uptime)) + p.add_row(*fmt_perf(" Misc", profile.train_misc_time, profile.uptime)) + + l = Table( + box=None, + expand=True, + ) + l.add_column(f"{c1}Losses", justify="left", width=16) + l.add_column(f"{c1}Value", justify="right", width=8) for metric, value in losses.items(): - l.add_row(f'{c2}{metric}', f'{b2}{value:.3f}') + l.add_row(f"{c2}{metric}", f"{b2}{value:.3f}") monitor = Table(box=None, expand=True, pad_edge=False) monitor.add_row(s, p, l) @@ -739,28 +789,28 @@ def print_dashboard(env_name, utilization, global_step, epoch, right.add_column(f"{c1}Value", justify="right", width=10) i = 0 for metric, value in stats.items(): - try: # Discard non-numeric values + try: # Discard non-numeric values int(value) except: continue u = left if i % 2 == 0 else right - u.add_row(f'{c2}{metric}', f'{b2}{value:.3f}') + u.add_row(f"{c2}{metric}", f"{b2}{value:.3f}") i += 1 if i == 30: break for i in range(max_stats[0] - i): u = left if i % 2 == 0 else right - u.add_row('', '') + u.add_row("", "") max_stats[0] = max(max_stats[0], i) table = Table(box=None, expand=True, pad_edge=False) dashboard.add_row(table) - table.add_row(f' {c1}Message: {c2}{msg}') + table.add_row(f" {c1}Message: {c2}{msg}") with console.capture() as capture: console.print(dashboard) - print('\033[0;0H' + capture.get()) + print("\033[0;0H" + capture.get()) diff --git a/cleanrl_ppo_atari.py b/cleanrl_ppo_atari.py index 595f62ab2..b621f5d5b 100644 --- a/cleanrl_ppo_atari.py +++ b/cleanrl_ppo_atari.py @@ -1,5 +1,4 @@ # docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/ppo/#ppo_ataripy -import os import random import time from dataclasses import dataclass @@ -21,9 +20,10 @@ NoopResetEnv, ) + @dataclass class Args: - exp_name: str = 'cleanrl_ppo_atari' + exp_name: str = "cleanrl_ppo_atari" """the name of this experiment""" seed: int = 1 """seed of the experiment""" @@ -180,8 +180,9 @@ def get_action_and_value(self, x, action=None): device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") # PufferLib vectorization makes CleanRL ~65% faster! 
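The vectorized env construction below follows the same pufferlib.vector.make pattern used in demo.py later in this diff, where the backend (Serial, Multiprocessing, Ray, or the native PufferEnv) is chosen explicitly. A minimal sketch of that call shape; the env id and worker counts are placeholders, not values from this PR:

```python
import pufferlib.environments.atari
import pufferlib.vector

# Placeholder arguments; only the call shape mirrors the code in this diff.
envs = pufferlib.vector.make(
    pufferlib.environments.atari.env_creator("BreakoutNoFrameskip-v4"),
    env_kwargs=dict(framestack=4),
    num_envs=8,
    num_workers=4,
    backend=pufferlib.vector.Multiprocessing,  # or Serial / Ray / pufferlib.environment.PufferEnv
)
```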
- import pufferlib.vector import pufferlib.environments.atari + import pufferlib.vector + envs = pufferlib.vector.make( pufferlib.environments.atari.env_creator(args.env_id), env_kwargs=dict(framestack=4), diff --git a/config/ocean/pong.ini b/config/ocean/pong.ini index 1b01fb91f..f53ed2883 100644 --- a/config/ocean/pong.ini +++ b/config/ocean/pong.ini @@ -13,18 +13,18 @@ checkpoint_interval = 25 num_envs = 1 num_workers = 1 env_batch_size = 1 -batch_size = 131072 -update_epochs = 3 +batch_size = 32768 +update_epochs = 1 minibatch_size = 8192 bptt_horizon = 16 -ent_coef = 0.004602 +ent_coef = 0.003 gae_lambda = 0.979 gamma = 0.9879 learning_rate = 0.001494 anneal_lr = False device = cuda -max_grad_norm = 3.592 -vf_coef = 0.4122 +max_grad_norm = 0.5 +vf_coef = 0.5 [sweep.metric] goal = maximize diff --git a/demo.py b/demo.py index 6a7192c0b..c00b8ce8c 100644 --- a/demo.py +++ b/demo.py @@ -1,42 +1,46 @@ -import configparser import argparse -import shutil -import glob -import uuid import ast +import configparser +import glob import os +import shutil +import uuid import pufferlib +import pufferlib.cleanrl import pufferlib.utils import pufferlib.vector -import pufferlib.cleanrl - -from rich_argparse import RichHelpFormatter from rich.console import Console from rich.traceback import install -install(show_locals=False) # Rich tracebacks +from rich_argparse import RichHelpFormatter + +install(show_locals=False) # Rich tracebacks + +import signal # Aggressively exit on ctrl+c -import signal # Aggressively exit on ctrl+c signal.signal(signal.SIGINT, lambda sig, frame: os._exit(0)) import clean_pufferl - + + def make_policy(env, policy_cls, rnn_cls, args): - policy = policy_cls(env, **args['policy']) + policy = policy_cls(env, **args["policy"]) if rnn_cls is not None: - policy = rnn_cls(env, policy, **args['rnn']) + policy = rnn_cls(env, policy, **args["rnn"]) policy = pufferlib.cleanrl.RecurrentPolicy(policy) else: policy = pufferlib.cleanrl.Policy(policy) - return policy.to(args['train']['device']) + return policy.to(args["train"]["device"]) + def init_wandb(args, name, id=None, resume=True): import wandb + wandb.init( id=id or wandb.util.generate_id(), - project=args['wandb_project'], - group=args['wandb_group'], + project=args["wandb_project"], + group=args["wandb_group"], allow_val_change=True, save_code=True, resume=resume, @@ -45,71 +49,75 @@ def init_wandb(args, name, id=None, resume=True): ) return wandb + def sweep(args, env_name, make_env, policy_cls, rnn_cls): import wandb - sweep_id = wandb.sweep(sweep=args['sweep'], project=args['wandb_project']) + + sweep_id = wandb.sweep(sweep=args["sweep"], project=args["wandb_project"]) def main(): try: - wandb = init_wandb(args, env_name, id=args['exp_id']) - args['train'].update(wandb.config.train) + wandb = init_wandb(args, env_name, id=args["exp_id"]) + args["train"].update(wandb.config.train) train(args, make_env, policy_cls, rnn_cls, wandb) - except Exception as e: + except Exception: Console().print_exception() wandb.agent(sweep_id, main, count=100) + ### CARBS Sweeps def sweep_carbs(args, env_name, make_env, policy_cls, rnn_cls): - import numpy as np - import sys + from math import ceil, floor, log - from math import log, ceil, floor - - from carbs import CARBS - from carbs import CARBSParams - from carbs import LinearSpace - from carbs import LogSpace - from carbs import LogitSpace - from carbs import ObservationInParam - from carbs import ParamDictType - from carbs import Param + import numpy as np + from carbs import CARBS, CARBSParams, 
LinearSpace, LogitSpace, LogSpace, ObservationInParam, Param def closest_power(x): possible_results = floor(log(x, 2)), ceil(log(x, 2)) - return int(2**min(possible_results, key= lambda z: abs(x-2**z))) - - def carbs_param(group, name, space, wandb_params, mmin=None, mmax=None, - search_center=None, is_integer=False, rounding_factor=1, scale=1): - wandb_param = wandb_params[group]['parameters'][name] - if 'values' in wandb_param: - values = wandb_param['values'] + return int(2 ** min(possible_results, key=lambda z: abs(x - 2**z))) + + def carbs_param( + group, + name, + space, + wandb_params, + mmin=None, + mmax=None, + search_center=None, + is_integer=False, + rounding_factor=1, + scale=1, + ): + wandb_param = wandb_params[group]["parameters"][name] + if "values" in wandb_param: + values = wandb_param["values"] mmin = min(values) mmax = max(values) if mmin is None: - mmin = float(wandb_param['min']) + mmin = float(wandb_param["min"]) if mmax is None: - mmax = float(wandb_param['max']) + mmax = float(wandb_param["max"]) - if space == 'log': + if space == "log": Space = LogSpace if search_center is None: - search_center = 2**(np.log2(mmin) + np.log2(mmax)/2) - elif space == 'linear': + search_center = 2 ** (np.log2(mmin) + np.log2(mmax) / 2) + elif space == "linear": Space = LinearSpace if search_center is None: - search_center = (mmin + mmax)/2 - elif space == 'logit': + search_center = (mmin + mmax) / 2 + elif space == "logit": Space = LogitSpace assert mmin == 0 assert mmax == 1 assert search_center is not None else: - raise ValueError(f'Invalid CARBS space: {space} (log/linear)') + raise ValueError(f"Invalid CARBS space: {space} (log/linear)") return Param( - name=f'{group}/{name}', + name=f"{group}/{name}", space=Space( min=mmin, max=mmax, @@ -120,78 +128,82 @@ def carbs_param(group, name, space, wandb_params, mmin=None, mmax=None, search_center=search_center, ) - if not os.path.exists('checkpoints'): - os.system('mkdir checkpoints') + if not os.path.exists("checkpoints"): + os.system("mkdir checkpoints") import wandb + sweep_id = wandb.sweep( - sweep=args['sweep'], + sweep=args["sweep"], project="carbs", ) - target_metric = args['sweep']['metric']['name'].split('/')[-1] - sweep_parameters = args['sweep']['parameters'] - #wandb_env_params = sweep_parameters['env']['parameters'] - #wandb_policy_params = sweep_parameters['policy']['parameters'] + target_metric = args["sweep"]["metric"]["name"].split("/")[-1] + sweep_parameters = args["sweep"]["parameters"] + # wandb_env_params = sweep_parameters['env']['parameters'] + # wandb_policy_params = sweep_parameters['policy']['parameters'] # Must be hardcoded and match wandb sweep space for now param_spaces = [] - if 'total_timesteps' in sweep_parameters['train']['parameters']: - time_param = sweep_parameters['train']['parameters']['total_timesteps'] - min_timesteps = time_param['min'] - param_spaces.append(carbs_param('train', 'total_timesteps', 'log', sweep_parameters, - search_center=min_timesteps, is_integer=True)) + if "total_timesteps" in sweep_parameters["train"]["parameters"]: + time_param = sweep_parameters["train"]["parameters"]["total_timesteps"] + min_timesteps = time_param["min"] + param_spaces.append( + carbs_param( + "train", "total_timesteps", "log", sweep_parameters, search_center=min_timesteps, is_integer=True + ) + ) - batch_param = sweep_parameters['train']['parameters']['batch_size'] - default_batch = (batch_param['max'] - batch_param['min']) // 2 + batch_param = sweep_parameters["train"]["parameters"]["batch_size"] + 
default_batch = (batch_param["max"] - batch_param["min"]) // 2 - minibatch_param = sweep_parameters['train']['parameters']['minibatch_size'] - default_minibatch = (minibatch_param['max'] - minibatch_param['min']) // 2 + minibatch_param = sweep_parameters["train"]["parameters"]["minibatch_size"] + default_minibatch = (minibatch_param["max"] - minibatch_param["min"]) // 2 - if 'env' in sweep_parameters: - env_params = sweep_parameters['env']['parameters'] + if "env" in sweep_parameters: + env_params = sweep_parameters["env"]["parameters"] # MOBA - if 'reward_death' in env_params: - param_spaces.append(carbs_param('env', 'reward_death', - 'linear', sweep_parameters, search_center=-0.42)) - if 'reward_xp' in env_params: - param_spaces.append(carbs_param('env', 'reward_xp', - 'linear', sweep_parameters, search_center=0.015, scale=0.05)) - if 'reward_distance' in env_params: - param_spaces.append(carbs_param('env', 'reward_distance', - 'linear', sweep_parameters, search_center=0.15, scale=0.5)) - if 'reward_tower' in env_params: - param_spaces.append(carbs_param('env', 'reward_tower', - 'linear', sweep_parameters, search_center=4.0)) + if "reward_death" in env_params: + param_spaces.append(carbs_param("env", "reward_death", "linear", sweep_parameters, search_center=-0.42)) + if "reward_xp" in env_params: + param_spaces.append( + carbs_param("env", "reward_xp", "linear", sweep_parameters, search_center=0.015, scale=0.05) + ) + if "reward_distance" in env_params: + param_spaces.append( + carbs_param("env", "reward_distance", "linear", sweep_parameters, search_center=0.15, scale=0.5) + ) + if "reward_tower" in env_params: + param_spaces.append(carbs_param("env", "reward_tower", "linear", sweep_parameters, search_center=4.0)) # Atari - if 'frameskip' in env_params: - param_spaces.append(carbs_param('env', 'frameskip', - 'linear', sweep_parameters, search_center=4, is_integer=True)) - if 'repeat_action_probability' in env_params: - param_spaces.append(carbs_param('env', 'repeat_action_probability', - 'logit', sweep_parameters, search_center=0.25)) + if "frameskip" in env_params: + param_spaces.append( + carbs_param("env", "frameskip", "linear", sweep_parameters, search_center=4, is_integer=True) + ) + if "repeat_action_probability" in env_params: + param_spaces.append( + carbs_param("env", "repeat_action_probability", "logit", sweep_parameters, search_center=0.25) + ) param_spaces += [ - #carbs_param('cnn_channels', 'linear', wandb_policy_params, search_center=32, is_integer=True), - #carbs_param('hidden_size', 'linear', wandb_policy_params, search_center=128, is_integer=True), - #carbs_param('vision', 'linear', search_center=5, is_integer=True), - carbs_param('train', 'learning_rate', 'log', sweep_parameters, search_center=1e-3), - carbs_param('train', 'gamma', 'logit', sweep_parameters, search_center=0.95), - carbs_param('train', 'gae_lambda', 'logit', sweep_parameters, search_center=0.90), - carbs_param('train', 'update_epochs', 'linear', sweep_parameters, - search_center=1, scale=3, is_integer=True), - carbs_param('train', 'clip_coef', 'logit', sweep_parameters, search_center=0.1), - carbs_param('train', 'vf_coef', 'logit', sweep_parameters, search_center=0.5), - carbs_param('train', 'vf_clip_coef', 'logit', sweep_parameters, search_center=0.1), - carbs_param('train', 'max_grad_norm', 'linear', sweep_parameters, search_center=0.5), - carbs_param('train', 'ent_coef', 'log', sweep_parameters, search_center=0.01), - carbs_param('train', 'batch_size', 'log', sweep_parameters, - 
search_center=default_batch, is_integer=True), - carbs_param('train', 'minibatch_size', 'log', sweep_parameters, - search_center=default_minibatch, is_integer=True), - carbs_param('train', 'bptt_horizon', 'log', sweep_parameters, - search_center=16, is_integer=True), + # carbs_param('cnn_channels', 'linear', wandb_policy_params, search_center=32, is_integer=True), + # carbs_param('hidden_size', 'linear', wandb_policy_params, search_center=128, is_integer=True), + # carbs_param('vision', 'linear', search_center=5, is_integer=True), + carbs_param("train", "learning_rate", "log", sweep_parameters, search_center=1e-3), + carbs_param("train", "gamma", "logit", sweep_parameters, search_center=0.95), + carbs_param("train", "gae_lambda", "logit", sweep_parameters, search_center=0.90), + carbs_param("train", "update_epochs", "linear", sweep_parameters, search_center=1, scale=3, is_integer=True), + carbs_param("train", "clip_coef", "logit", sweep_parameters, search_center=0.1), + carbs_param("train", "vf_coef", "logit", sweep_parameters, search_center=0.5), + carbs_param("train", "vf_clip_coef", "logit", sweep_parameters, search_center=0.1), + carbs_param("train", "max_grad_norm", "linear", sweep_parameters, search_center=0.5), + carbs_param("train", "ent_coef", "log", sweep_parameters, search_center=0.01), + carbs_param("train", "batch_size", "log", sweep_parameters, search_center=default_batch, is_integer=True), + carbs_param( + "train", "minibatch_size", "log", sweep_parameters, search_center=default_minibatch, is_integer=True + ), + carbs_param("train", "bptt_horizon", "log", sweep_parameters, search_center=16, is_integer=True), ] carbs_params = CARBSParams( @@ -199,69 +211,71 @@ def carbs_param(group, name, space, wandb_params, mmin=None, mmax=None, is_wandb_logging_enabled=False, resample_frequency=5, num_random_samples=len(param_spaces), - max_suggestion_cost=args['max_suggestion_cost'], + max_suggestion_cost=args["max_suggestion_cost"], is_saved_on_every_observation=False, ) carbs = CARBS(carbs_params, param_spaces) # GPUDrive doesn't let you reinit the vecenv, so we have to cache it - cache_vecenv = args['env_name'] == 'gpudrive' + cache_vecenv = args["env_name"] == "gpudrive" + + elos = {"model_random.pt": 1000} + vecenv = {"vecenv": None} # can't reassign otherwise + shutil.rmtree("moba_elo", ignore_errors=True) + os.mkdir("moba_elo") + import time + + import torch - elos = {'model_random.pt': 1000} - vecenv = {'vecenv': None} # can't reassign otherwise - shutil.rmtree('moba_elo', ignore_errors=True) - os.mkdir('moba_elo') - import time, torch def main(): - print('Vecenv:', vecenv) + print("Vecenv:", vecenv) # set torch and pytorch seeds to current time np.random.seed(int(time.time())) torch.manual_seed(int(time.time())) - wandb = init_wandb(args, env_name, id=args['exp_id']) - wandb.config.__dict__['_locked'] = {} + wandb = init_wandb(args, env_name, id=args["exp_id"]) + wandb.config.__dict__["_locked"] = {} orig_suggestion = carbs.suggest().suggestion suggestion = orig_suggestion.copy() - print('Suggestion:', suggestion) - #cnn_channels = suggestion.pop('cnn_channels') - #hidden_size = suggestion.pop('hidden_size') - #vision = suggestion.pop('vision') - #wandb.config.env['vision'] = vision - #wandb.config.policy['cnn_channels'] = cnn_channels - #wandb.config.policy['hidden_size'] = hidden_size - train_suggestion = {k.split('/')[1]: v for k, v in suggestion.items() if k.startswith('train/')} - env_suggestion = {k.split('/')[1]: v for k, v in suggestion.items() if k.startswith('env/')} - 
args['train'].update(train_suggestion) - args['train']['batch_size'] = closest_power( - train_suggestion['batch_size']) - args['train']['minibatch_size'] = closest_power( - train_suggestion['minibatch_size']) - args['train']['bptt_horizon'] = closest_power( - train_suggestion['bptt_horizon']) - - args['env'].update(env_suggestion) - args['track'] = True - wandb.config.update({'train': args['train']}, allow_val_change=True) - wandb.config.update({'env': args['env']}, allow_val_change=True) - - #args.env.__dict__['vision'] = vision - #args['policy']['cnn_channels'] = cnn_channels - #args['policy']['hidden_size'] = hidden_size - #args['rnn']['input_size'] = hidden_size - #args['rnn']['hidden_size'] = hidden_size + print("Suggestion:", suggestion) + # cnn_channels = suggestion.pop('cnn_channels') + # hidden_size = suggestion.pop('hidden_size') + # vision = suggestion.pop('vision') + # wandb.config.env['vision'] = vision + # wandb.config.policy['cnn_channels'] = cnn_channels + # wandb.config.policy['hidden_size'] = hidden_size + train_suggestion = {k.split("/")[1]: v for k, v in suggestion.items() if k.startswith("train/")} + env_suggestion = {k.split("/")[1]: v for k, v in suggestion.items() if k.startswith("env/")} + args["train"].update(train_suggestion) + args["train"]["batch_size"] = closest_power(train_suggestion["batch_size"]) + args["train"]["minibatch_size"] = closest_power(train_suggestion["minibatch_size"]) + args["train"]["bptt_horizon"] = closest_power(train_suggestion["bptt_horizon"]) + + args["env"].update(env_suggestion) + args["track"] = True + wandb.config.update({"train": args["train"]}, allow_val_change=True) + wandb.config.update({"env": args["env"]}, allow_val_change=True) + + # args.env.__dict__['vision'] = vision + # args['policy']['cnn_channels'] = cnn_channels + # args['policy']['hidden_size'] = hidden_size + # args['rnn']['input_size'] = hidden_size + # args['rnn']['hidden_size'] = hidden_size print(wandb.config.train) print(wandb.config.env) print(wandb.config.policy) try: - stats, uptime, new_elos, vecenv['vecenv'] = train(args, make_env, policy_cls, rnn_cls, - wandb, elos=elos, vecenv=vecenv['vecenv'] if cache_vecenv else None) + stats, uptime, new_elos, vecenv["vecenv"] = train( + args, make_env, policy_cls, rnn_cls, wandb, elos=elos, vecenv=vecenv["vecenv"] if cache_vecenv else None + ) elos.update(new_elos) - except Exception as e: + except Exception: import traceback + traceback.print_exc() else: observed_value = stats[target_metric] - print('Observed value:', observed_value) - print('Uptime:', uptime) + print("Observed value:", observed_value) + print("Uptime:", uptime) carbs.observe( ObservationInParam( @@ -273,51 +287,63 @@ def main(): wandb.agent(sweep_id, main, count=500) -def train(args, make_env, policy_cls, rnn_cls, wandb, - eval_frac=0.1, elos={'model_random.pt': 1000}, vecenv=None, subprocess=False, queue=None): + +def train( + args, + make_env, + policy_cls, + rnn_cls, + wandb, + eval_frac=0.1, + elos={"model_random.pt": 1000}, + vecenv=None, + subprocess=False, + queue=None, +): if subprocess: from multiprocessing import Process, Queue + queue = Queue() - p = Process(target=train, args=(args, make_env, policy_cls, rnn_cls, wandb, - eval_frac, elos, False, queue)) + p = Process(target=train, args=(args, make_env, policy_cls, rnn_cls, wandb, eval_frac, elos, False, queue)) p.start() p.join() stats, uptime, elos = queue.get() - if args['vec'] == 'serial': + if args["vec"] == "serial": vec = pufferlib.vector.Serial - elif args['vec'] == 
'multiprocessing': + elif args["vec"] == "multiprocessing": vec = pufferlib.vector.Multiprocessing - elif args['vec'] == 'ray': + elif args["vec"] == "ray": vec = pufferlib.vector.Ray - elif args['vec'] == 'native': + elif args["vec"] == "native": vec = pufferlib.environment.PufferEnv else: - raise ValueError(f'Invalid --vec (serial/multiprocessing/ray/native).') + raise ValueError("Invalid --vec (serial/multiprocessing/ray/native).") if vecenv is None: vecenv = pufferlib.vector.make( make_env, - env_kwargs=args['env'], - num_envs=args['train']['num_envs'], - num_workers=args['train']['num_workers'], - batch_size=args['train']['env_batch_size'], - zero_copy=args['train']['zero_copy'], - overwork=args['vec_overwork'], + env_kwargs=args["env"], + num_envs=args["train"]["num_envs"], + num_workers=args["train"]["num_workers"], + batch_size=args["train"]["env_batch_size"], + zero_copy=args["train"]["zero_copy"], + overwork=args["vec_overwork"], backend=vec, ) policy = make_policy(vecenv.driver_env, policy_cls, rnn_cls, args) - ''' + """ if env_name == 'moba': import torch os.makedirs('moba_elo', exist_ok=True) torch.save(policy, os.path.join('moba_elo', 'model_random.pt')) - ''' + """ - train_config = pufferlib.namespace(**args['train'], env=env_name, - exp_id=args['exp_id'] or env_name + '-' + str(uuid.uuid4())[:8]) + train_config = pufferlib.namespace( + **args["train"], env=env_name, exp_id=args["exp_id"] or env_name + "-" + str(uuid.uuid4())[:8] + ) data = clean_pufferl.create(train_config, vecenv, policy, wandb=wandb) while data.global_step < train_config.total_timesteps: clean_pufferl.evaluate(data) @@ -325,15 +351,15 @@ def train(args, make_env, policy_cls, rnn_cls, wandb, uptime = data.profile.uptime steps_evaluated = 0 - steps_to_eval = int(args['train']['total_timesteps'] * eval_frac) - batch_size = args['train']['batch_size'] + steps_to_eval = int(args["train"]["total_timesteps"] * eval_frac) + batch_size = args["train"]["batch_size"] while steps_evaluated < steps_to_eval: stats, _ = clean_pufferl.evaluate(data) steps_evaluated += batch_size clean_pufferl.mean_and_log(data) - ''' + """ if env_name == 'moba': exp_n = len(elos) model_name = f'model_{exp_n}.pt' @@ -343,7 +369,7 @@ def train(args, make_env, policy_cls, rnn_cls, wandb, stats['elo'] = elos[model_name] if wandb is not None: wandb.log({'environment/elo': elos[model_name]}) - ''' + """ clean_pufferl.close(data) if queue is not None: @@ -351,64 +377,68 @@ def train(args, make_env, policy_cls, rnn_cls, wandb, return stats, uptime, elos, vecenv -if __name__ == '__main__': + +if __name__ == "__main__": parser = argparse.ArgumentParser( - description=f':blowfish: PufferLib [bright_cyan]{pufferlib.__version__}[/]' - ' demo options. Shows valid args for your env and policy', - formatter_class=RichHelpFormatter, add_help=False) - parser.add_argument('--env', '--environment', type=str, - default='puffer_squared', help='Name of specific environment to run') - parser.add_argument('--mode', type=str, default='train', - choices='train eval evaluate sweep sweep-carbs autotune profile'.split()) - parser.add_argument('--vec-overwork', action='store_true', - help='Allow vectorization to use >1 worker/core. 
Not recommended.') - parser.add_argument('--eval-model-path', type=str, default=None, - help='Path to a pretrained checkpoint') - parser.add_argument('--baseline', action='store_true', - help='Load pretrained model from WandB if available') - parser.add_argument('--render-mode', type=str, default='auto', - choices=['auto', 'human', 'ansi', 'rgb_array', 'raylib', 'None']) - parser.add_argument('--exp-id', '--exp-name', type=str, - default=None, help='Resume from experiment') - parser.add_argument('--track', action='store_true', help='Track on WandB') - parser.add_argument('--wandb-project', type=str, default='pufferlib') - parser.add_argument('--wandb-group', type=str, default='debug') + description=f":blowfish: PufferLib [bright_cyan]{pufferlib.__version__}[/]" + " demo options. Shows valid args for your env and policy", + formatter_class=RichHelpFormatter, + add_help=False, + ) + parser.add_argument( + "--env", "--environment", type=str, default="puffer_squared", help="Name of specific environment to run" + ) + parser.add_argument( + "--mode", type=str, default="train", choices="train eval evaluate sweep sweep-carbs autotune profile".split() + ) + parser.add_argument( + "--vec-overwork", action="store_true", help="Allow vectorization to use >1 worker/core. Not recommended." + ) + parser.add_argument("--eval-model-path", type=str, default=None, help="Path to a pretrained checkpoint") + parser.add_argument("--baseline", action="store_true", help="Load pretrained model from WandB if available") + parser.add_argument( + "--render-mode", type=str, default="auto", choices=["auto", "human", "ansi", "rgb_array", "raylib", "None"] + ) + parser.add_argument("--exp-id", "--exp-name", type=str, default=None, help="Resume from experiment") + parser.add_argument("--track", action="store_true", help="Track on WandB") + parser.add_argument("--wandb-project", type=str, default="pufferlib") + parser.add_argument("--wandb-group", type=str, default="debug") args = parser.parse_known_args()[0] - - file_paths = glob.glob('config/**/*.ini', recursive=True) + file_paths = glob.glob("config/**/*.ini", recursive=True) for path in file_paths: p = configparser.ConfigParser() - p.read('config/default.ini') + p.read("config/default.ini") - subconfig = os.path.join(*path.split('/')[:-1] + ['default.ini']) + subconfig = os.path.join(*path.split("/")[:-1] + ["default.ini"]) if subconfig in file_paths: p.read(subconfig) p.read(path) - if args.env in p['base']['env_name'].split(): + if args.env in p["base"]["env_name"].split(): break else: - raise Exception('No config for env_name {}'.format(args.env)) + raise Exception("No config for env_name {}".format(args.env)) for section in p.sections(): for key in p[section]: - if section == 'base': - argparse_key = f'--{key}'.replace('_', '-') + if section == "base": + argparse_key = f"--{key}".replace("_", "-") else: - argparse_key = f'--{section}.{key}'.replace('_', '-') + argparse_key = f"--{section}.{key}".replace("_", "-") parser.add_argument(argparse_key, default=p[section][key]) # Late add help so you get a dynamic menu based on the env - parser.add_argument('-h', '--help', default=argparse.SUPPRESS, - action='help', help='Show this help message and exit') + parser.add_argument( + "-h", "--help", default=argparse.SUPPRESS, action="help", help="Show this help message and exit" + ) parsed = parser.parse_args().__dict__ - args = {'env': {}, 'policy': {}, 'rnn': {}} - env_name = parsed.pop('env') + args = {"env": {}, "policy": {}, "rnn": {}} + env_name = parsed.pop("env") for 
key, value in parsed.items(): next = args - for subkey in key.split('.'): + for subkey in key.split("."): if subkey not in next: next[subkey] = {} prev = next @@ -418,69 +448,72 @@ def train(args, make_env, policy_cls, rnn_cls, wandb, except: prev[subkey] = value - package = args['package'] - module_name = f'pufferlib.environments.{package}' - if package == 'ocean': - module_name = 'pufferlib.ocean' + package = args["package"] + module_name = f"pufferlib.environments.{package}" + if package == "ocean": + module_name = "pufferlib.ocean" import importlib + env_module = importlib.import_module(module_name) make_env = env_module.env_creator(env_name) - policy_cls = getattr(env_module.torch, args['policy_name']) - - rnn_name = args['rnn_name'] + policy_cls = getattr(env_module.torch, args["policy_name"]) + + rnn_name = args["rnn_name"] rnn_cls = None if rnn_name is not None: - rnn_cls = getattr(env_module.torch, args['rnn_name']) - - if args['baseline']: - assert args['mode'] in ('train', 'eval', 'evaluate') - args['track'] = True - version = '.'.join(pufferlib.__version__.split('.')[:2]) - args['exp_id'] = f'puf-{version}-{env_name}' - args['wandb_group'] = f'puf-{version}-baseline' - shutil.rmtree(f'experiments/{args["exp_id"]}', ignore_errors=True) - run = init_wandb(args, args['exp_id'], resume=False) - if args['mode'] in ('eval', 'evaluate'): - model_name = f'puf-{version}-{env_name}_model:latest' + rnn_cls = getattr(env_module.torch, args["rnn_name"]) + + if args["baseline"]: + assert args["mode"] in ("train", "eval", "evaluate") + args["track"] = True + version = ".".join(pufferlib.__version__.split(".")[:2]) + args["exp_id"] = f"puf-{version}-{env_name}" + args["wandb_group"] = f"puf-{version}-baseline" + shutil.rmtree(f"experiments/{args['exp_id']}", ignore_errors=True) + run = init_wandb(args, args["exp_id"], resume=False) + if args["mode"] in ("eval", "evaluate"): + model_name = f"puf-{version}-{env_name}_model:latest" artifact = run.use_artifact(model_name) data_dir = artifact.download() model_file = max(os.listdir(data_dir)) - args['eval_model_path'] = os.path.join(data_dir, model_file) - if args['mode'] == 'train': + args["eval_model_path"] = os.path.join(data_dir, model_file) + if args["mode"] == "train": wandb = None - if args['track']: - wandb = init_wandb(args, env_name, id=args['exp_id']) + if args["track"]: + wandb = init_wandb(args, env_name, id=args["exp_id"]) train(args, make_env, policy_cls, rnn_cls, wandb=wandb) - elif args['mode'] in ('eval', 'evaluate'): + elif args["mode"] in ("eval", "evaluate"): vec = pufferlib.vector.Serial - if args['vec'] == 'native': + if args["vec"] == "native": vec = pufferlib.environment.PufferEnv clean_pufferl.rollout( make_env, - args['env'], + args["env"], policy_cls=policy_cls, rnn_cls=rnn_cls, agent_creator=make_policy, agent_kwargs=args, backend=vec, - model_path=args['eval_model_path'], - render_mode=args['render_mode'], - device=args['train']['device'], + model_path=args["eval_model_path"], + render_mode=args["render_mode"], + device=args["train"]["device"], ) - elif args['mode'] == 'sweep': - args['track'] = True + elif args["mode"] == "sweep": + args["track"] = True sweep(args, env_name, make_env, policy_cls, rnn_cls) - elif args['mode'] == 'sweep-carbs': + elif args["mode"] == "sweep-carbs": sweep_carbs(args, env_name, make_env, policy_cls, rnn_cls) - elif args['mode'] == 'autotune': - pufferlib.vector.autotune(make_env, batch_size=args['train']['env_batch_size']) - elif args['mode'] == 'profile': + elif args["mode"] == 
"autotune": + pufferlib.vector.autotune(make_env, batch_size=args["train"]["env_batch_size"]) + elif args["mode"] == "profile": import cProfile - cProfile.run('train(args, make_env, policy_cls, rnn_cls, wandb=None)', 'stats.profile') + + cProfile.run("train(args, make_env, policy_cls, rnn_cls, wandb=None)", "stats.profile") import pstats from pstats import SortKey - p = pstats.Stats('stats.profile') + + p = pstats.Stats("stats.profile") p.sort_stats(SortKey.TIME).print_stats(10) diff --git a/evaluate_elos.py b/evaluate_elos.py index 95ea5318c..a09dd6e37 100644 --- a/evaluate_elos.py +++ b/evaluate_elos.py @@ -1,20 +1,16 @@ -import numpy as np -import torch -import time - -import pufferlib -import random import glob import os +import random +import time -from pufferlib.policy_ranker import update_elos +import numpy as np +import torch from pufferlib.environments.ocean.environment import env_creator -from pufferlib.environments.ocean.torch import MOBA, Recurrent -import pufferlib.cleanrl +from pufferlib.policy_ranker import update_elos -def load_policies(checkpoint_dir, n, map_location='cuda'): - paths = glob.glob(f'{checkpoint_dir}/model_*.pt', recursive=True) +def load_policies(checkpoint_dir, n, map_location="cuda"): + paths = glob.glob(f"{checkpoint_dir}/model_*.pt", recursive=True) # Sample with replacement if not enough models if len(paths) < n: @@ -22,16 +18,16 @@ def load_policies(checkpoint_dir, n, map_location='cuda'): else: samples = random.sample(paths, n) - names = [path.split('/')[-1] for path in samples] - return {name: torch.load(path, map_location=map_location) - for name, path in zip(names, samples)} + names = [path.split("/")[-1] for path in samples] + return {name: torch.load(path, map_location=map_location) for name, path in zip(names, samples, strict=False)} + def rollout(envs, policy, opponents, num_games, timeout=180, render=False): obs, _ = envs.reset() # Double reset clears randomizations obs, _ = envs.reset() - #cenv = envs.c_envs[0] + # cenv = envs.c_envs[0] start = time.time() step = 0 @@ -45,18 +41,18 @@ def rollout(envs, policy, opponents, num_games, timeout=180, render=False): prev_dire_victories = [c.dire_victories for c in envs.c_envs] scores = [] - atn_shape = (10*num_envs, len(envs.action_space.nvec)) + atn_shape = (10 * num_envs, len(envs.action_space.nvec)) actions = torch.zeros(atn_shape, dtype=torch.int64).cuda() actions_struct = actions.view(num_opponents, envs_per_opponent, 2, 5, len(envs.action_space.nvec)) - slice_idxs = torch.arange(10*num_envs).reshape(num_opponents, envs_per_opponent, 2, 5).cuda() + slice_idxs = torch.arange(10 * num_envs).reshape(num_opponents, envs_per_opponent, 2, 5).cuda() flat_teams = np.random.randint(0, 2, num_envs) team_assignments = torch.from_numpy(flat_teams.reshape(num_opponents, envs_per_opponent)).cuda() arange = torch.arange(envs_per_opponent).cuda() games_played = 0 while games_played < num_games and time.time() - start < timeout: - #if render and step % 10 == 0: + # if render and step % 10 == 0: # env.render() step += 1 @@ -66,10 +62,10 @@ def rollout(envs, policy, opponents, num_games, timeout=180, render=False): idxs = slice_idxs[i] teams = team_assignments[i] - my_obs = obs[idxs[arange, teams]].view(5*envs_per_opponent, -1) - opp_obs = obs[idxs[arange, 1 - teams]].view(5*envs_per_opponent, -1) + my_obs = obs[idxs[arange, teams]].view(5 * envs_per_opponent, -1) + opp_obs = obs[idxs[arange, 1 - teams]].view(5 * envs_per_opponent, -1) - if hasattr(policy, 'lstm'): + if hasattr(policy, "lstm"): my_actions, _, _, 
_, my_states[i] = policy(my_obs, my_states[i]) opp_atn, _, _, _, opp_states[i] = opponents[i](opp_obs, opp_states[i]) else: @@ -88,45 +84,46 @@ def rollout(envs, policy, opponents, num_games, timeout=180, render=False): prev_radiant_victories[i] = c.radiant_victories scores.append((opp_idx, flat_teams[i] == 0)) games_played += 1 - print('Radiant Victory') + print("Radiant Victory") elif c.dire_victories > prev_dire_victories[i]: prev_dire_victories[i] = c.dire_victories scores.append((opp_idx, flat_teams[i] == 1)) games_played += 1 - print('Dire Victory') + print("Dire Victory") return scores + def calc_elo(checkpoint, checkpoint_dir, elos, num_envs=128, num_games=128, num_opponents=8, k=24.0): - print(f'Calculating ELO for {checkpoint}') - make_env = env_creator('moba') + print(f"Calculating ELO for {checkpoint}") + make_env = env_creator("moba") envs = make_env(num_envs=num_envs) - policy = torch.load(os.path.join(checkpoint_dir, checkpoint), map_location='cuda') - print(f'Loaded policy {checkpoint}') + policy = torch.load(os.path.join(checkpoint_dir, checkpoint), map_location="cuda") + print(f"Loaded policy {checkpoint}") - paths = glob.glob(f'{checkpoint_dir}/model_*.pt', recursive=True) - names = [path.split('/')[-1] for path in paths] - print(f'Loaded {len(paths)} models') - paths.remove(f'{checkpoint_dir}/{checkpoint}') - print(f'Removed {checkpoint} from paths') + paths = glob.glob(f"{checkpoint_dir}/model_*.pt", recursive=True) + names = [path.split("/")[-1] for path in paths] + print(f"Loaded {len(paths)} models") + paths.remove(f"{checkpoint_dir}/{checkpoint}") + print(f"Removed {checkpoint} from paths") elos[checkpoint] = 1000 - + # Sample with replacement if not enough models - print(f'Sampling {num_opponents} opponents') + print(f"Sampling {num_opponents} opponents") n_models = len(paths) if n_models < num_opponents: idxs = random.choices(range(n_models), k=num_opponents) else: idxs = random.sample(range(n_models), num_opponents) - print(f'Sampled {num_opponents} opponents') + print(f"Sampled {num_opponents} opponents") opponent_names = [names[i] for i in idxs] - opponents = [torch.load(paths[i], map_location='cuda') for i in idxs] - print(f'Loaded {num_opponents} opponents') + opponents = [torch.load(paths[i], map_location="cuda") for i in idxs] + print(f"Loaded {num_opponents} opponents") results = rollout(envs, policy, opponents, num_games=num_games, render=False) - print(f'Finished {num_games} games') + print(f"Finished {num_games} games") for game in results: opponent, win = game @@ -139,15 +136,15 @@ def calc_elo(checkpoint, checkpoint_dir, elos, num_envs=128, num_games=128, num_ elo_pair = np.array([elos[checkpoint], elos[opp_name]]) elo_pair = update_elos(elo_pair, score, k=24.0) elos[checkpoint] = elo_pair[0] - #elos[opp_name] = elo_pair[1] + # elos[opp_name] = elo_pair[1] - print(f'Finished calculating ELO for {checkpoint}') + print(f"Finished calculating ELO for {checkpoint}") for k, v in elos.items(): - print(f'{k}: {v}') + print(f"{k}: {v}") return elos - ''' + """ for game in range(1000): opponent, name = load_policy(checkpoint_dir) print(f'Game: {game} Opponent: {name}') @@ -163,12 +160,11 @@ def calc_elo(checkpoint, checkpoint_dir, elos, num_envs=128, num_games=128, num_ for k, v in elos.items(): print(f'{k}: {v}') print() - ''' + """ -if __name__ == '__main__': - checkpoint_dir = 'moba_elo' - checkpoint = 'model_0.pt' - elos = {'model_random.pt': 1000} +if __name__ == "__main__": + checkpoint_dir = "moba_elo" + checkpoint = "model_0.pt" + elos = 
{"model_random.pt": 1000} calc_elo(checkpoint, checkpoint_dir, elos, num_games=16) - diff --git a/pufferlib/__init__.py b/pufferlib/__init__.py index c501c0194..f346a0032 100644 --- a/pufferlib/__init__.py +++ b/pufferlib/__init__.py @@ -1,4 +1,5 @@ from pufferlib import version + __version__ = version.__version__ import os @@ -6,13 +7,14 @@ # Silence noisy dependencies import warnings + warnings.filterwarnings("ignore", category=DeprecationWarning) # Silence noisy packages original_stdout = sys.stdout original_stderr = sys.stderr -sys.stdout = open(os.devnull, 'w') -sys.stderr = open(os.devnull, 'w') +sys.stdout = open(os.devnull, "w") +sys.stderr = open(os.devnull, "w") try: import gymnasium import pygame @@ -23,6 +25,6 @@ sys.stdout = original_stdout sys.stderr = original_stderr -from pufferlib.namespace import namespace, dataclass, Namespace from pufferlib import environments from pufferlib.environment import PufferEnv +from pufferlib.namespace import Namespace, dataclass, namespace diff --git a/pufferlib/cleanrl.py b/pufferlib/cleanrl.py index 28de11385..1935b96e7 100644 --- a/pufferlib/cleanrl.py +++ b/pufferlib/cleanrl.py @@ -1,12 +1,8 @@ -from pdb import set_trace as T from typing import List, Union import torch -from torch.distributions import Categorical from torch.distributions.utils import logits_to_probs -import pufferlib.models - # taken from torch.distributions.Categorical def log_prob(logits, value): @@ -15,6 +11,7 @@ def log_prob(logits, value): value = value[..., :1] return log_pmf.gather(-1, value).squeeze(-1) + # taken from torch.distributions.Categorical def entropy(logits): min_real = torch.finfo(logits.dtype).min @@ -22,8 +19,8 @@ def entropy(logits): p_log_p = logits * logits_to_probs(logits) return -p_log_p.sum(-1) -def sample_logits(logits: Union[torch.Tensor, List[torch.Tensor]], - action=None, is_continuous=False): + +def sample_logits(logits: Union[torch.Tensor, List[torch.Tensor]], action=None, is_continuous=False): is_discrete = isinstance(logits, torch.Tensor) if is_continuous: batch = logits.loc.shape[0] @@ -36,10 +33,9 @@ def sample_logits(logits: Union[torch.Tensor, List[torch.Tensor]], elif is_discrete: normalized_logits = [logits - logits.logsumexp(dim=-1, keepdim=True)] logits = [logits] - else: # not sure what else it could be + else: # not sure what else it could be normalized_logits = [l - l.logsumexp(dim=-1, keepdim=True) for l in logits] - if action is None: action = torch.stack([torch.multinomial(logits_to_probs(l), 1).squeeze() for l in logits]) else: @@ -47,7 +43,7 @@ def sample_logits(logits: Union[torch.Tensor, List[torch.Tensor]], action = action.view(batch, -1).T assert len(logits) == len(action) - logprob = torch.stack([log_prob(l, a) for l, a in zip(normalized_logits, action)]).T.sum(1) + logprob = torch.stack([log_prob(l, a) for l, a in zip(normalized_logits, action, strict=False)]).T.sum(1) logits_entropy = torch.stack([entropy(l) for l in normalized_logits]).T.sum(1) if is_discrete: @@ -57,48 +53,52 @@ def sample_logits(logits: Union[torch.Tensor, List[torch.Tensor]], class Policy(torch.nn.Module): - '''Wrap a non-recurrent PyTorch model for use with CleanRL''' + """Wrap a non-recurrent PyTorch model for use with CleanRL""" + def __init__(self, policy): super().__init__() self.policy = policy - self.is_continuous = hasattr(policy, 'is_continuous') and policy.is_continuous + self.is_continuous = hasattr(policy, "is_continuous") and policy.is_continuous + self.hidden_size = policy.hidden_size def get_value(self, x, state=None): _, value 
= self.policy(x) return value - def get_action_and_value(self, x, action=None): + def get_action_and_value(self, x, action=None, e3b=None): - logits, value = self.policy(x) - action, logprob, entropy = sample_logits(logits, action, self.is_continuous) - return action, logprob, entropy, value + logits, value, e3b, intrinsic_reward = self.policy(x, e3b=e3b) + action, logprob, entropy = sample_logits(logits, action, self.is_continuous) + return action, logprob, entropy, value, e3b, intrinsic_reward - def forward(self, x, action=None): - return self.get_action_and_value(x, action) + def forward(self, x, action=None, e3b=None): + return self.get_action_and_value(x, action, e3b) class RecurrentPolicy(torch.nn.Module): - '''Wrap a recurrent PyTorch model for use with CleanRL''' + """Wrap a recurrent PyTorch model for use with CleanRL""" + def __init__(self, policy): super().__init__() self.policy = policy - self.is_continuous = hasattr(policy.policy, 'is_continuous') and policy.policy.is_continuous + self.is_continuous = hasattr(policy.policy, "is_continuous") and policy.policy.is_continuous + self.hidden_size = policy.hidden_size @property def lstm(self): - if hasattr(self.policy, 'recurrent'): + if hasattr(self.policy, "recurrent"): return self.policy.recurrent - elif hasattr(self.policy, 'lstm'): + elif hasattr(self.policy, "lstm"): return self.policy.lstm else: - raise ValueError('Policy must have a subnetwork named lstm or recurrent') + raise ValueError("Policy must have a subnetwork named lstm or recurrent") def get_value(self, x, state=None): _, value, _ = self.policy(x, state) - def get_action_and_value(self, x, state=None, action=None): - logits, value, state = self.policy(x, state) + def get_action_and_value(self, x, state=None, action=None, e3b=None): + logits, value, state, e3b, intrinsic_reward = self.policy(x, state, e3b=e3b) action, logprob, entropy = sample_logits(logits, action, self.is_continuous) - return action, logprob, entropy, value, state + return action, logprob, entropy, value, state, e3b, intrinsic_reward - def forward(self, x, state=None, action=None): - return self.get_action_and_value(x, state, action) + def forward(self, x, state=None, action=None, e3b=None): + return self.get_action_and_value(x, state, action, e3b) diff --git a/pufferlib/emulation.py b/pufferlib/emulation.py index 8b8089b76..87fcfb4e1 100644 --- a/pufferlib/emulation.py +++ b/pufferlib/emulation.py @@ -1,16 +1,15 @@ -from pdb import set_trace as T -import numpy as np +import inspect import warnings import gymnasium -import inspect +import numpy as np import pufferlib import pufferlib.spaces -from pufferlib import utils, exceptions +from pufferlib import exceptions, utils from pufferlib.environment import set_buffers -from pufferlib.spaces import Discrete, Tuple, Dict +from pufferlib.spaces import Dict, Discrete, Tuple def emulate(struct, sample): @@ -19,18 +18,19 @@ def emulate(struct, sample): emulate(struct[k], v) elif isinstance(sample, tuple): for i, v in enumerate(sample): - emulate(struct[f'f{i}'], v) + emulate(struct[f"f{i}"], v) else: struct[()] = sample + def make_buffer(arr_dtype, struct_dtype, struct, n=None): - '''None instead of 1 makes it work for 1 agent PZ envs''' - ''' + """None instead of 1 makes it work for 1 agent PZ envs""" + """ if n is None: struct = np.zeros(1, dtype=struct_dtype) else: struct = np.zeros(n, dtype=struct_dtype) - ''' + """ arr = struct.view(arr_dtype) @@ -41,34 +41,36 @@ def make_buffer(arr_dtype, struct_dtype, struct, n=None): return arr + def _nativize(struct, space): if isinstance(space, Discrete): return
struct.item() elif isinstance(space, Tuple): - return tuple(_nativize(struct[f'f{i}'], elem) - for i, elem in enumerate(space)) + return tuple(_nativize(struct[f"f{i}"], elem) for i, elem in enumerate(space)) elif isinstance(space, Dict): - return {k: _nativize(struct[k], value) - for k, value in space.items()} + return {k: _nativize(struct[k], value) for k, value in space.items()} else: return struct + def nativize(arr, space, struct_dtype): struct = np.asarray(arr).view(struct_dtype)[0] return _nativize(struct, space) -''' + +""" try: from pufferlib.extensions import emulate, nativize except ImportError: warnings.warn('PufferLib Cython extensions not installed. Using slow Python versions') -''' +""" + def dtype_from_space(space): if isinstance(space, pufferlib.spaces.Tuple): dtype = [] for i, elem in enumerate(space): - dtype.append((f'f{i}', dtype_from_space(elem))) + dtype.append((f"f{i}", dtype_from_space(elem))) elif isinstance(space, pufferlib.spaces.Dict): dtype = [] for k, value in space.items(): @@ -82,6 +84,7 @@ def dtype_from_space(space): return np.dtype(dtype, align=True) + def flatten_space(space): if isinstance(space, pufferlib.spaces.Tuple): subspaces = [] @@ -96,6 +99,7 @@ def flatten_space(space): else: return [space] + def emulate_observation_space(space): emulated_dtype = dtype_from_space(space) @@ -114,6 +118,7 @@ def emulate_observation_space(space): emulated_space = gymnasium.spaces.Box(low=mmin, high=mmax, shape=(numel,), dtype=dtype) return emulated_space, emulated_dtype + def emulate_action_space(space): if isinstance(space, pufferlib.spaces.Box): return space, space.dtype @@ -136,10 +141,8 @@ def __init__(self, env=None, env_creator=None, env_args=[], env_kwargs={}, buf=N self.is_observation_checked = False self.is_action_checked = False - self.observation_space, self.obs_dtype = emulate_observation_space( - self.env.observation_space) - self.action_space, self.atn_dtype = emulate_action_space( - self.env.action_space) + self.observation_space, self.obs_dtype = emulate_observation_space(self.env.observation_space) + self.action_space, self.atn_dtype = emulate_action_space(self.env.action_space) self.single_observation_space = self.observation_space self.single_action_space = self.action_space self.num_agents = 1 @@ -147,18 +150,18 @@ def __init__(self, env=None, env_creator=None, env_args=[], env_kwargs={}, buf=N self.is_obs_emulated = self.single_observation_space is not self.env.observation_space self.is_atn_emulated = self.single_action_space is not self.env.action_space self.emulated = pufferlib.namespace( - observation_dtype = self.observation_space.dtype, - emulated_observation_dtype = self.obs_dtype, + observation_dtype=self.observation_space.dtype, + emulated_observation_dtype=self.obs_dtype, ) - self.render_modes = 'human rgb_array'.split() + self.render_modes = "human rgb_array".split() set_buffers(self, buf) if isinstance(self.env.observation_space, pufferlib.spaces.Box): self.obs_struct = self.observations else: self.obs_struct = self.observations.view(self.obs_dtype) - + @property def render_mode(self): return self.env.render_mode @@ -172,8 +175,7 @@ def reset(self, seed=None): ob, info = _seed_and_reset(self.env, seed) if not self.is_observation_checked: - self.is_observation_checked = check_space( - ob, self.env.observation_space) + self.is_observation_checked = check_space(ob, self.env.observation_space) if self.is_obs_emulated: emulate(self.obs_struct, ob) @@ -184,15 +186,15 @@ def reset(self, seed=None): self.terminals[0] = False 
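# Illustrative sketch (not part of the patch): the emulate()/nativize() machinery above
# writes structured observations through a structured-dtype view of a flat buffer, so the
# learner only ever sees a flat Box. The field names and sizes below are made up for the
# example; the real dtypes come from dtype_from_space().
import numpy as np

struct_dtype = np.dtype([("pos", np.float32, (2,)), ("goal", np.float32)])
flat = np.zeros(3, dtype=np.float32)       # flat Box-style buffer, numel = 3
struct = flat.view(struct_dtype)[0]        # structured view over the same memory
struct["pos"] = [0.5, -1.0]                # emulate(): write fields into the buffer
struct["goal"] = 7.0
assert flat[2] == 7.0                      # nativize() reads the same bytes back out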
self.truncations[0] = False self.masks[0] = True - + return self.observations, info - + def step(self, action): - '''Execute an action and return (observation, reward, done, info)''' + """Execute an action and return (observation, reward, done, info)""" if not self.initialized: - raise exceptions.APIUsageError('step() called before reset()') + raise exceptions.APIUsageError("step() called before reset()") if self.done: - raise exceptions.APIUsageError('step() called after environment is done') + raise exceptions.APIUsageError("step() called after environment is done") # Unpack actions from multidiscrete into the original action space if self.is_atn_emulated: @@ -204,8 +206,7 @@ def step(self, action): action = action[0] if not self.is_action_checked: - self.is_action_checked = check_space( - action, self.env.action_space) + self.is_action_checked = check_space(action, self.env.action_space) ob, reward, done, truncated, info = self.env.step(action) @@ -218,7 +219,7 @@ def step(self, action): self.terminals[0] = done self.truncations[0] = truncated self.masks[0] = True - + self.done = done or truncated return self.observations, reward, done, truncated, info @@ -228,6 +229,7 @@ def render(self): def close(self): return self.env.close() + class PettingZooPufferEnv: def __init__(self, env=None, env_creator=None, env_args=[], buf=None, env_kwargs={}, to_puffer=False): self.env = make_object(env, env_creator, env_args, env_kwargs) @@ -242,15 +244,13 @@ def __init__(self, env=None, env_creator=None, env_args=[], buf=None, env_kwargs single_agent = self.possible_agents[0] self.env_single_observation_space = self.env.observation_space(single_agent) self.env_single_action_space = self.env.action_space(single_agent) - self.single_observation_space, self.obs_dtype = ( - emulate_observation_space(self.env_single_observation_space)) - self.single_action_space, self.atn_dtype = ( - emulate_action_space(self.env_single_action_space)) + self.single_observation_space, self.obs_dtype = emulate_observation_space(self.env_single_observation_space) + self.single_action_space, self.atn_dtype = emulate_action_space(self.env_single_action_space) self.is_obs_emulated = self.single_observation_space is not self.env_single_observation_space self.is_atn_emulated = self.single_action_space is not self.env_single_action_space self.emulated = pufferlib.namespace( - observation_dtype = self.single_observation_space.dtype, - emulated_observation_dtype = self.obs_dtype, + observation_dtype=self.single_observation_space.dtype, + emulated_observation_dtype=self.obs_dtype, ) self.num_agents = len(self.possible_agents) @@ -278,14 +278,14 @@ def done(self): return len(self.agents) == 0 or self.all_done def observation_space(self, agent): - '''Returns the observation space for a single agent''' + """Returns the observation space for a single agent""" if agent not in self.possible_agents: raise pufferlib.exceptions.InvalidAgentError(agent, self.possible_agents) return self.single_observation_space def action_space(self, agent): - '''Returns the action space for a single agent''' + """Returns the action space for a single agent""" if agent not in self.possible_agents: raise pufferlib.exceptions.InvalidAgentError(agent, self.possible_agents) @@ -303,8 +303,7 @@ def reset(self, seed=None): if not self.is_observation_checked: for k, ob in obs.items(): - self.is_observation_checked = check_space( - ob, self.env.observation_space(k)) + self.is_observation_checked = check_space(ob, self.env.observation_space(k)) # Call user featurizer and 
flatten the observations self.observations[:] = 0 @@ -326,16 +325,17 @@ def reset(self, seed=None): return self.dict_obs, info def step(self, actions): - '''Step the environment and return (observations, rewards, dones, infos)''' + """Step the environment and return (observations, rewards, dones, infos)""" if not self.initialized: - raise exceptions.APIUsageError('step() called before reset()') + raise exceptions.APIUsageError("step() called before reset()") if self.done: - raise exceptions.APIUsageError('step() called after environment is done') + raise exceptions.APIUsageError("step() called after environment is done") if isinstance(actions, np.ndarray): if not self.is_action_checked and len(actions) != self.num_agents: raise exceptions.APIUsageError( - f'Actions specified as len {len(actions)} but environment has {self.num_agents} agents') + f"Actions specified as len {len(actions)} but environment has {self.num_agents} agents" + ) actions = {agent: actions[i] for i, agent in enumerate(self.possible_agents)} @@ -345,10 +345,7 @@ def step(self, actions): if agent not in self.possible_agents: raise exceptions.InvalidAgentError(agent, self.possible_agents) - self.is_action_checked = check_space( - next(iter(actions.values())), - self.single_action_space - ) + self.is_action_checked = check_space(next(iter(actions.values())), self.single_action_space) # Unpack actions from multidiscrete into the original action space unpacked_actions = {} @@ -381,7 +378,7 @@ def step(self, actions): self.masks[i] = False continue - ob = obs[agent] + ob = obs[agent] self.mask[agent] = True if self.is_obs_emulated: emulate(self.obs_struct[i], ob) @@ -392,10 +389,12 @@ def step(self, actions): self.terminals[i] = dones[agent] self.truncations[i] = truncateds[agent] self.masks[i] = True - + self.all_done = all(dones.values()) or all(truncateds.values()) rewards = pad_agent_data(rewards, self.possible_agents, 0) - dones = pad_agent_data(dones, self.possible_agents, True) # You changed this from false to match api test... is this correct? + dones = pad_agent_data( + dones, self.possible_agents, True + ) # You changed this from false to match api test... is this correct? 
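# Illustrative sketch (not part of the patch): pad_agent_data, defined a little further
# down, fills in entries for agents that returned nothing this step. Per the inline
# comment above, dones are now padded with True so already-departed agents read as
# finished. A stand-in copy of the helper is used here so the example runs on its own;
# the agent names are placeholders.
def _pad_agent_data(data, agents, pad_value):
    return {agent: data[agent] if agent in data else pad_value for agent in agents}

step_dones = {"agent_0": False}            # agent_1 returned nothing this step
padded = _pad_agent_data(step_dones, ["agent_0", "agent_1"], pad_value=True)
assert padded == {"agent_0": False, "agent_1": True}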
truncateds = pad_agent_data(truncateds, self.possible_agents, False) return self.dict_obs, rewards, dones, truncateds, infos @@ -405,23 +404,24 @@ def render(self): def close(self): return self.env.close() + def pad_agent_data(data, agents, pad_value): - return {agent: data[agent] if agent in data else pad_value - for agent in agents} - + return {agent: data[agent] if agent in data else pad_value for agent in agents} + + def make_object(object_instance=None, object_creator=None, creator_args=[], creator_kwargs={}): if (object_instance is None) == (object_creator is None): - raise ValueError('Exactly one of object_instance or object_creator must be provided') + raise ValueError("Exactly one of object_instance or object_creator must be provided") if object_instance is not None: if callable(object_instance) or inspect.isclass(object_instance): - raise TypeError('object_instance must be an instance, not a function or class') + raise TypeError("object_instance must be an instance, not a function or class") return object_instance if object_creator is not None: if not callable(object_creator): - raise TypeError('object_creator must be a callable') - + raise TypeError("object_creator must be a callable") + if creator_args is None: creator_args = [] @@ -430,19 +430,19 @@ def make_object(object_instance=None, object_creator=None, creator_args=[], crea return object_creator(*creator_args, **creator_kwargs) + def check_space(data, space): try: contains = space.contains(data) except: - raise exceptions.APIUsageError( - f'Error checking space {space} with sample :\n{data}') + raise exceptions.APIUsageError(f"Error checking space {space} with sample :\n{data}") if not contains: - raise exceptions.APIUsageError( - f'Data:\n{data}\n not in space:\n{space}') - + raise exceptions.APIUsageError(f"Data:\n{data}\n not in space:\n{space}") + return True + def _seed_and_reset(env, seed): if seed is None: # Gym bug: does not reset env correctly @@ -457,6 +457,6 @@ def _seed_and_reset(env, seed): obs, info = env.reset() except: obs, info = env.reset() - warnings.warn('WARNING: Environment does not support seeding.', DeprecationWarning) + warnings.warn("WARNING: Environment does not support seeding.", DeprecationWarning) return obs, info diff --git a/pufferlib/environment.py b/pufferlib/environment.py index bce092fbc..bc3d26255 100644 --- a/pufferlib/environment.py +++ b/pufferlib/environment.py @@ -1,12 +1,13 @@ import numpy as np -from pufferlib.exceptions import APIUsageError import pufferlib.spaces +from pufferlib.exceptions import APIUsageError -ERROR = ''' +ERROR = """ Environment missing required attribute {}. The most common cause is calling super() before you have assigned the attribute. 
-''' +""" + def set_buffers(env, buf=None): if buf is None: @@ -31,25 +32,28 @@ def set_buffers(env, buf=None): env.masks = buf.masks env.actions = buf.actions + class PufferEnv: def __init__(self, buf=None): - if not hasattr(self, 'single_observation_space'): - raise APIUsageError(ERROR.format('single_observation_space')) - if not hasattr(self, 'single_action_space'): - raise APIUsageError(ERROR.format('single_action_space')) - if not hasattr(self, 'num_agents'): - raise APIUsageError(ERROR.format('num_agents')) - - if hasattr(self, 'observation_space'): - raise APIUsageError('PufferEnvs must define single_observation_space, not observation_space') - if hasattr(self, 'action_space'): - raise APIUsageError('PufferEnvs must define single_action_space, not action_space') + if not hasattr(self, "single_observation_space"): + raise APIUsageError(ERROR.format("single_observation_space")) + if not hasattr(self, "single_action_space"): + raise APIUsageError(ERROR.format("single_action_space")) + if not hasattr(self, "num_agents"): + raise APIUsageError(ERROR.format("num_agents")) + + if hasattr(self, "observation_space"): + raise APIUsageError("PufferEnvs must define single_observation_space, not observation_space") + if hasattr(self, "action_space"): + raise APIUsageError("PufferEnvs must define single_action_space, not action_space") if not isinstance(self.single_observation_space, pufferlib.spaces.Box): - raise APIUsageError('Native observation_space must be a Box') - if (not isinstance(self.single_action_space, pufferlib.spaces.Discrete) - and not isinstance(self.single_action_space, pufferlib.spaces.MultiDiscrete) - and not isinstance(self.single_action_space, pufferlib.spaces.Box)): - raise APIUsageError('Native action_space must be a Discrete, MultiDiscrete, or Box') + raise APIUsageError("Native observation_space must be a Box") + if ( + not isinstance(self.single_action_space, pufferlib.spaces.Discrete) + and not isinstance(self.single_action_space, pufferlib.spaces.MultiDiscrete) + and not isinstance(self.single_action_space, pufferlib.spaces.Box) + ): + raise APIUsageError("Native action_space must be a Discrete, MultiDiscrete, or Box") set_buffers(self, buf) @@ -59,17 +63,17 @@ def __init__(self, buf=None): @property def emulated(self): - '''Native envs do not use emulation''' + """Native envs do not use emulation""" return False @property def done(self): - '''Native envs handle resets internally''' + """Native envs handle resets internally""" return False @property def driver_env(self): - '''For compatibility with Multiprocessing''' + """For compatibility with Multiprocessing""" return self def reset(self, seed=None): @@ -83,12 +87,37 @@ def close(self): def async_reset(self, seed=None): _, self.infos = self.reset(seed) - assert isinstance(self.infos, list), 'PufferEnvs must return info as a list of dicts' + assert isinstance(self.infos, list), "PufferEnvs must return info as a list of dicts" def send(self, actions): _, _, _, _, self.infos = self.step(actions) - assert isinstance(self.infos, list), 'PufferEnvs must return info as a list of dicts' + assert isinstance(self.infos, list), "PufferEnvs must return info as a list of dicts" def recv(self): - return (self.observations, self.rewards, self.terminals, - self.truncations, self.infos, self.agent_ids, self.masks) + return ( + self.observations, + self.rewards, + self.terminals, + self.truncations, + self.infos, + self.agent_ids, + self.masks, + ) + + # Add stub methods for action-related functionality + def action_names(self) -> 
list[str]: + """Return the list of action names. + + Raises: + NotImplementedError: If the environment doesn't implement this method. + """ + raise NotImplementedError("This environment doesn't support action_names()") + + @property + def _c_env(self): + """Access to the C++ environment. + + Raises: + NotImplementedError: If the environment doesn't have a C++ extension. + """ + raise NotImplementedError("This environment doesn't have a C++ extension") diff --git a/pufferlib/environments/__init__.py b/pufferlib/environments/__init__.py index d6c969523..e86699cca 100644 --- a/pufferlib/environments/__init__.py +++ b/pufferlib/environments/__init__.py @@ -1,21 +1,22 @@ from pdb import set_trace as T + import pufferlib + def try_import(module_path, package_name=None): if package_name is None: package_name = module_path try: - package = __import__(module_path) + package = __import__(module_path) except ImportError as e: raise ImportError( - f'{e.args[0]}\n\n' - 'This is probably an installation error. Try: ' - f'pip install pufferlib[{package_name}]. ' - - 'Note that some environments have non-python dependencies. ' - 'These are included in PufferTank. Or, you can install ' - 'manually by following the instructions provided by the ' - 'environment meaintainers. But some are finicky, so we ' - 'recommend using PufferTank.' + f"{e.args[0]}\n\n" + "This is probably an installation error. Try: " + f"pip install pufferlib[{package_name}]. " + "Note that some environments have non-python dependencies. " + "These are included in PufferTank. Or, you can install " + "manually by following the instructions provided by the " + "environment meaintainers. But some are finicky, so we " + "recommend using PufferTank." ) from e return package diff --git a/pufferlib/environments/atari/__init__.py b/pufferlib/environments/atari/__init__.py index 59cda9e7c..c24135317 100644 --- a/pufferlib/environments/atari/__init__.py +++ b/pufferlib/environments/atari/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/atari/environment.py b/pufferlib/environments/atari/environment.py index 604eb2546..7cc8b9417 100644 --- a/pufferlib/environments/atari/environment.py +++ b/pufferlib/environments/atari/environment.py @@ -1,62 +1,76 @@ -from pdb import set_trace as T -import numpy as np import functools import gymnasium as gym +import numpy as np import pufferlib import pufferlib.emulation import pufferlib.environments -import pufferlib.utils -import pufferlib.postprocess import pufferlib.postprocess +import pufferlib.utils + -def env_creator(name='breakout'): +def env_creator(name="breakout"): return functools.partial(make, name) -def make(name, obs_type='grayscale', frameskip=4, - full_action_space=False, framestack=1, - repeat_action_probability=0.0, render_mode='rgb_array', buf=None): - '''Atari creation function''' - pufferlib.environments.try_import('ale_py', 'AtariEnv') + +def make( + name, + obs_type="grayscale", + frameskip=4, + full_action_space=False, + framestack=1, + repeat_action_probability=0.0, + render_mode="rgb_array", + buf=None, +): + """Atari creation function""" + pufferlib.environments.try_import("ale_py", "AtariEnv") ale_render_mode = render_mode - if render_mode == 'human': - ale_render_mode = 'rgb_array' - obs_type = 'rgb' + if render_mode == "human": + ale_render_mode = "rgb_array" + obs_type = "rgb" frameskip = 1 full_action_space = True upscale = 4 - elif render_mode == 'raylib': - ale_render_mode = 
'rgb_array' + elif render_mode == "raylib": + ale_render_mode = "rgb_array" upscale = 8 from ale_py import AtariEnv - env = AtariEnv(name, obs_type=obs_type, frameskip=frameskip, + + env = AtariEnv( + name, + obs_type=obs_type, + frameskip=frameskip, repeat_action_probability=repeat_action_probability, full_action_space=full_action_space, - render_mode=ale_render_mode) + render_mode=ale_render_mode, + ) action_set = env._action_set - - if render_mode != 'human': + + if render_mode != "human": env = pufferlib.postprocess.ResizeObservation(env, downscale=2) if framestack > 1: env = gym.wrappers.FrameStack(env, framestack) - if render_mode in ('human', 'raylib'): + if render_mode in ("human", "raylib"): env = RaylibClient(env, action_set, frameskip, upscale) else: - env = AtariPostprocessor(env) # Don't use standard postprocessor + env = AtariPostprocessor(env) # Don't use standard postprocessor env = pufferlib.postprocess.EpisodeStats(env) env = pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) return env + class AtariPostprocessor(gym.Wrapper): - '''Atari breaks the normal PufferLib postprocessor because - it sends terminal=True every live, not every episode''' + """Atari breaks the normal PufferLib postprocessor because + it sends terminal=True every live, not every episode""" + def __init__(self, env): super().__init__(env) shape = env.observation_space.shape @@ -65,8 +79,7 @@ def __init__(self, env): else: shape = (shape[2], shape[0], shape[1]) - self.observation_space = gym.spaces.Box(low=0, high=255, - shape=shape, dtype=env.observation_space.dtype) + self.observation_space = gym.spaces.Box(low=0, high=255, shape=shape, dtype=env.observation_space.dtype) def unsqueeze_transpose(self, obs): if len(obs.shape) == 3: @@ -82,6 +95,7 @@ def step(self, action): obs, reward, terminal, truncated, _ = self.env.step(action) return self.unsqueeze_transpose(obs), reward, terminal, truncated, {} + class RaylibClient(gym.Wrapper): def __init__(self, env, action_set, frameskip=4, upscale=4): self.env = env @@ -99,20 +113,23 @@ def __init__(self, env, action_set, frameskip=4, upscale=4): height *= upscale width *= upscale - from raylib import rl, colors + from raylib import colors, rl + rl.InitWindow(width, height, "Atari".encode()) - rl.SetTargetFPS(60//frameskip) + rl.SetTargetFPS(60 // frameskip) self.rl = rl self.colors = colors - import numpy as np + rendered = np.zeros((width, height, 4), dtype=np.uint8) import pyray from cffi import FFI - raylib_image = pyray.Image(FFI().from_buffer(rendered.data), - width, height, 1, pyray.PIXELFORMAT_UNCOMPRESSED_R8G8B8) + + raylib_image = pyray.Image( + FFI().from_buffer(rendered.data), width, height, 1, pyray.PIXELFORMAT_UNCOMPRESSED_R8G8B8 + ) self.texture = rl.LoadTextureFromImage(raylib_image) self.action = 0 @@ -191,7 +208,7 @@ def render(self): else: self.action = Action.NOOP.value - #frame = self.env.render() + # frame = self.env.render() frame = self.frame if len(frame.shape) < 3: frame = np.expand_dims(frame, 2) diff --git a/pufferlib/environments/atari/torch.py b/pufferlib/environments/atari/torch.py index 1d678a845..cc4f89b7a 100644 --- a/pufferlib/environments/atari/torch.py +++ b/pufferlib/environments/atari/torch.py @@ -5,9 +5,9 @@ class Recurrent(pufferlib.models.LSTMWrapper): def __init__(self, env, policy, input_size=512, hidden_size=512, num_layers=1): super().__init__(env, policy, input_size, hidden_size, num_layers) + class Policy(pufferlib.models.Convolutional): - def __init__(self, env, input_size=512, hidden_size=512, 
output_size=512, - framestack=1, flat_size=64*6*9): + def __init__(self, env, input_size=512, hidden_size=512, output_size=512, framestack=1, flat_size=64 * 6 * 9): super().__init__( env=env, input_size=input_size, diff --git a/pufferlib/environments/box2d/__init__.py b/pufferlib/environments/box2d/__init__.py index eff86ef02..a38c82d6d 100644 --- a/pufferlib/environments/box2d/__init__.py +++ b/pufferlib/environments/box2d/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/box2d/environment.py b/pufferlib/environments/box2d/environment.py index f2fd12b7e..ef19743ed 100644 --- a/pufferlib/environments/box2d/environment.py +++ b/pufferlib/environments/box2d/environment.py @@ -1,21 +1,21 @@ -from pdb import set_trace as T -import gymnasium import functools +import gymnasium + import pufferlib.emulation import pufferlib.environments import pufferlib.postprocess -def env_creator(name='car-racing'): +def env_creator(name="car-racing"): return functools.partial(make, name=name) -def make(name, domain_randomize=True, continuous=False, render_mode='rgb_array', buf=None): - if name == 'car-racing': - name = 'CarRacing-v2' - env = gymnasium.make(name, render_mode=render_mode, - domain_randomize=domain_randomize, continuous=continuous) +def make(name, domain_randomize=True, continuous=False, render_mode="rgb_array", buf=None): + if name == "car-racing": + name = "CarRacing-v2" + + env = gymnasium.make(name, render_mode=render_mode, domain_randomize=domain_randomize, continuous=continuous) env = pufferlib.postprocess.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) diff --git a/pufferlib/environments/box2d/torch.py b/pufferlib/environments/box2d/torch.py index b7cdd6acb..924dfe4d5 100644 --- a/pufferlib/environments/box2d/torch.py +++ b/pufferlib/environments/box2d/torch.py @@ -1,18 +1,14 @@ -from functools import partial -import torch import pufferlib.models + class Recurrent(pufferlib.models.LSTMWrapper): - def __init__(self, env, policy, - input_size=128, hidden_size=128, num_layers=1): - super().__init__(env, policy, - input_size, hidden_size, num_layers) + def __init__(self, env, policy, input_size=128, hidden_size=128, num_layers=1): + super().__init__(env, policy, input_size, hidden_size, num_layers) + class Policy(pufferlib.models.Convolutional): - def __init__(self, env, - input_size=128, hidden_size=128, output_size=128, - framestack=3, flat_size=64*8*8): + def __init__(self, env, input_size=128, hidden_size=128, output_size=128, framestack=3, flat_size=64 * 8 * 8): super().__init__( env=env, input_size=input_size, diff --git a/pufferlib/environments/bsuite/__init__.py b/pufferlib/environments/bsuite/__init__.py index eff86ef02..a38c82d6d 100644 --- a/pufferlib/environments/bsuite/__init__.py +++ b/pufferlib/environments/bsuite/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/bsuite/environment.py b/pufferlib/environments/bsuite/environment.py index f39c661e5..87e7af598 100644 --- a/pufferlib/environments/bsuite/environment.py +++ b/pufferlib/environments/bsuite/environment.py @@ -1,20 +1,19 @@ -from pdb import set_trace as T -import gym import functools +from bsuite.utils import gym_wrapper + import pufferlib.emulation import pufferlib.wrappers -import bsuite -from bsuite.utils import gym_wrapper -def env_creator(name='bandit/0'): +def env_creator(name="bandit/0"): return 
functools.partial(make, name) -def make(name='bandit/0', results_dir='experiments/bsuite', overwrite=True, buf=None): - '''BSuite environments''' - bsuite = pufferlib.environments.try_import('bsuite') - from bsuite.utils import gym_wrapper + +def make(name="bandit/0", results_dir="experiments/bsuite", overwrite=True, buf=None): + """BSuite environments""" + bsuite = pufferlib.environments.try_import("bsuite") + env = bsuite.load_and_record_to_csv(name, results_dir, overwrite=overwrite) env = gym_wrapper.GymFromDMEnv(env) env = BSuiteStopper(env) @@ -22,6 +21,7 @@ def make(name='bandit/0', results_dir='experiments/bsuite', overwrite=True, buf= env = pufferlib.emulation.GymnasiumPufferEnv(env, buf=buf) return env + class BSuiteStopper: def __init__(self, env): self.env = env @@ -34,8 +34,8 @@ def __init__(self, env): self.action_space = self.env.action_space def reset(self): - '''Forces the environment to stop after the - number of episodes required by bsuite''' + """Forces the environment to stop after the + number of episodes required by bsuite""" self.num_episodes += 1 if self.num_episodes >= self.env.bsuite_num_episodes: diff --git a/pufferlib/environments/bsuite/torch.py b/pufferlib/environments/bsuite/torch.py index 8b13194e9..e69de29bb 100644 --- a/pufferlib/environments/bsuite/torch.py +++ b/pufferlib/environments/bsuite/torch.py @@ -1 +0,0 @@ -from pufferlib.models import Default as Policy diff --git a/pufferlib/environments/butterfly/__init__.py b/pufferlib/environments/butterfly/__init__.py index eff86ef02..a38c82d6d 100644 --- a/pufferlib/environments/butterfly/__init__.py +++ b/pufferlib/environments/butterfly/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/butterfly/environment.py b/pufferlib/environments/butterfly/environment.py index 994e53e9e..c33e47928 100644 --- a/pufferlib/environments/butterfly/environment.py +++ b/pufferlib/environments/butterfly/environment.py @@ -1,24 +1,27 @@ -from pdb import set_trace as T -from pettingzoo.utils.conversions import aec_to_parallel_wrapper import functools +from pettingzoo.utils.conversions import aec_to_parallel_wrapper + import pufferlib.emulation import pufferlib.environments -def env_creator(name='cooperative_pong_v5'): +def env_creator(name="cooperative_pong_v5"): return functools.partial(make, name) + def make(name, buf=None): - pufferlib.environments.try_import('pettingzoo.butterfly', 'butterfly') - if name == 'cooperative_pong_v5': + pufferlib.environments.try_import("pettingzoo.butterfly", "butterfly") + if name == "cooperative_pong_v5": from pettingzoo.butterfly import cooperative_pong_v5 as pong + env_cls = pong.raw_env - elif name == 'knights_archers_zombies_v10': + elif name == "knights_archers_zombies_v10": from pettingzoo.butterfly import knights_archers_zombies_v10 as kaz + env_cls = kaz.raw_env else: - raise ValueError(f'Unknown environment: {name}') + raise ValueError(f"Unknown environment: {name}") env = env_cls() env = aec_to_parallel_wrapper(env) diff --git a/pufferlib/environments/butterfly/torch.py b/pufferlib/environments/butterfly/torch.py index 5965a3971..29e590533 100644 --- a/pufferlib/environments/butterfly/torch.py +++ b/pufferlib/environments/butterfly/torch.py @@ -3,16 +3,16 @@ class Policy(pufferlib.models.Convolutional): def __init__( - self, - env, - flat_size=3520, - channels_last=True, - downsample=4, - input_size=512, - hidden_size=128, - output_size=128, - **kwargs - ): + self, + env, + 
flat_size=3520, + channels_last=True, + downsample=4, + input_size=512, + hidden_size=128, + output_size=128, + **kwargs, + ): super().__init__( env, framestack=3, @@ -22,5 +22,5 @@ def __init__( input_size=input_size, hidden_size=hidden_size, output_size=output_size, - **kwargs + **kwargs, ) diff --git a/pufferlib/environments/classic_control/__init__.py b/pufferlib/environments/classic_control/__init__.py index eff86ef02..a38c82d6d 100644 --- a/pufferlib/environments/classic_control/__init__.py +++ b/pufferlib/environments/classic_control/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/classic_control/environment.py b/pufferlib/environments/classic_control/environment.py index 9da76d3e9..593953a46 100644 --- a/pufferlib/environments/classic_control/environment.py +++ b/pufferlib/environments/classic_control/environment.py @@ -1,6 +1,6 @@ -import gymnasium -from gymnasium.envs import classic_control import functools + +import gymnasium import numpy as np import pufferlib @@ -8,33 +8,35 @@ import pufferlib.postprocess ALIASES = { - 'cartpole': 'CartPole-v0', - 'mountaincar': 'MountainCar-v0', + "cartpole": "CartPole-v0", + "mountaincar": "MountainCar-v0", } -def env_creator(name='cartpole'): + +def env_creator(name="cartpole"): return functools.partial(make, name) -def make(name, render_mode='rgb_array', buf=None): - '''Create an environment by name''' + +def make(name, render_mode="rgb_array", buf=None): + """Create an environment by name""" if name in ALIASES: name = ALIASES[name] env = gymnasium.make(name, render_mode=render_mode) - if name == 'MountainCar-v0': + if name == "MountainCar-v0": env = MountainCarWrapper(env) - #env = gymnasium.wrappers.NormalizeObservation(env) + # env = gymnasium.wrappers.NormalizeObservation(env) env = gymnasium.wrappers.TransformObservation(env, lambda obs: np.clip(obs, -1, 1)) - #env = gymnasium.wrappers.NormalizeReward(env, gamma=gamma) + # env = gymnasium.wrappers.NormalizeReward(env, gamma=gamma) env = gymnasium.wrappers.TransformReward(env, lambda reward: np.clip(reward, -1, 1)) env = pufferlib.postprocess.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + class MountainCarWrapper(gymnasium.Wrapper): def step(self, action): obs, reward, terminated, truncated, info = self.env.step(action) - reward = abs(obs[0]+0.5) + reward = abs(obs[0] + 0.5) return obs, reward, terminated, truncated, info - diff --git a/pufferlib/environments/classic_control_continuous/__init__.py b/pufferlib/environments/classic_control_continuous/__init__.py index eff86ef02..a38c82d6d 100644 --- a/pufferlib/environments/classic_control_continuous/__init__.py +++ b/pufferlib/environments/classic_control_continuous/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/classic_control_continuous/environment.py b/pufferlib/environments/classic_control_continuous/environment.py index 47f293657..28b896ff2 100644 --- a/pufferlib/environments/classic_control_continuous/environment.py +++ b/pufferlib/environments/classic_control_continuous/environment.py @@ -1,27 +1,29 @@ -import gymnasium import functools +import gymnasium + import pufferlib import pufferlib.emulation import pufferlib.postprocess -def env_creator(name='MountainCarContinuous-v0'): +def env_creator(name="MountainCarContinuous-v0"): return functools.partial(make, name) -def make(name, 
render_mode='rgb_array', buf=None): - '''Create an environment by name''' + +def make(name, render_mode="rgb_array", buf=None): + """Create an environment by name""" env = gymnasium.make(name, render_mode=render_mode) - if name == 'MountainCarContinuous-v0': + if name == "MountainCarContinuous-v0": env = MountainCarWrapper(env) env = pufferlib.postprocess.ClipAction(env) env = pufferlib.postprocess.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + class MountainCarWrapper(gymnasium.Wrapper): def step(self, action): obs, reward, terminated, truncated, info = self.env.step(action) - reward = abs(obs[0]+0.5) + reward = abs(obs[0] + 0.5) return obs, reward, terminated, truncated, info - diff --git a/pufferlib/environments/crafter/__init__.py b/pufferlib/environments/crafter/__init__.py index eff86ef02..a38c82d6d 100644 --- a/pufferlib/environments/crafter/__init__.py +++ b/pufferlib/environments/crafter/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/crafter/environment.py b/pufferlib/environments/crafter/environment.py index e320e347d..df4941047 100644 --- a/pufferlib/environments/crafter/environment.py +++ b/pufferlib/environments/crafter/environment.py @@ -1,9 +1,8 @@ -from pdb import set_trace as T + +import functools import gym -import gymnasium import shimmy -import functools import pufferlib import pufferlib.emulation @@ -16,15 +15,17 @@ class TransposeObs(gym.Wrapper): def observation(self, observation): return observation.transpose(2, 0, 1) -def env_creator(name='crafter'): + +def env_creator(name="crafter"): return functools.partial(make, name) + def make(name, buf=None): - '''Crafter creation function''' - if name == 'crafter': - name = 'CrafterReward-v1' + """Crafter creation function""" + if name == "crafter": + name = "CrafterReward-v1" - pufferlib.environments.try_import('crafter') + pufferlib.environments.try_import("crafter") env = gym.make(name) env.reset = pufferlib.utils.silence_warnings(env.reset) env = shimmy.GymV21CompatibilityV0(env=env) @@ -33,6 +34,7 @@ def make(name, buf=None): env = pufferlib.postprocess.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + class RenderWrapper(gym.Wrapper): def __init__(self, env): super().__init__(env) @@ -40,7 +42,7 @@ def __init__(self, env): @property def render_mode(self): - return 'rgb_array' + return "rgb_array" def render(self, *args, **kwargs): - return self.env.unwrapped.env.unwrapped.render((256,256)) + return self.env.unwrapped.env.unwrapped.render((256, 256)) diff --git a/pufferlib/environments/crafter/torch.py b/pufferlib/environments/crafter/torch.py index f6f4e5237..8e7ed9c18 100644 --- a/pufferlib/environments/crafter/torch.py +++ b/pufferlib/environments/crafter/torch.py @@ -3,16 +3,16 @@ class Policy(pufferlib.models.Convolutional): def __init__( - self, - env, - flat_size=1024, - channels_last=True, - downsample=1, - input_size=512, - hidden_size=128, - output_size=128, - **kwargs - ): + self, + env, + flat_size=1024, + channels_last=True, + downsample=1, + input_size=512, + hidden_size=128, + output_size=128, + **kwargs, + ): super().__init__( env, framestack=3, @@ -22,5 +22,5 @@ def __init__( input_size=input_size, hidden_size=hidden_size, output_size=output_size, - **kwargs + **kwargs, ) diff --git a/pufferlib/environments/dm_control/__init__.py b/pufferlib/environments/dm_control/__init__.py index eff86ef02..a38c82d6d 100644 --- 
a/pufferlib/environments/dm_control/__init__.py +++ b/pufferlib/environments/dm_control/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/dm_control/environment.py b/pufferlib/environments/dm_control/environment.py index f492fa659..4e0815a2e 100644 --- a/pufferlib/environments/dm_control/environment.py +++ b/pufferlib/environments/dm_control/environment.py @@ -1,24 +1,24 @@ -from pdb import set_trace as T -import gym -import shimmy import functools +import shimmy + import pufferlib import pufferlib.emulation import pufferlib.environments -def env_creator(name='walker'): - '''Deepmind Control environment creation function +def env_creator(name="walker"): + """Deepmind Control environment creation function No support for bindings yet because PufferLib does - not support continuous action spaces.''' + not support continuous action spaces.""" return functools.partial(make, name) -def make(name, task_name='walk', buf=None): - '''Untested. Let us know in Discord if you want to use dmc in PufferLib.''' - dm_control = pufferlib.environments.try_import('dm_control.suite', 'dmc') + +def make(name, task_name="walk", buf=None): + """Untested. Let us know in Discord if you want to use dmc in PufferLib.""" + dm_control = pufferlib.environments.try_import("dm_control.suite", "dmc") env = dm_control.suite.load(name, task_name) env = shimmy.DmControlCompatibilityV0(env=env) return pufferlib.emulation.GymnasiumPufferEnv(env, buf=buf) diff --git a/pufferlib/environments/dm_control/torch.py b/pufferlib/environments/dm_control/torch.py index 8b13194e9..e69de29bb 100644 --- a/pufferlib/environments/dm_control/torch.py +++ b/pufferlib/environments/dm_control/torch.py @@ -1 +0,0 @@ -from pufferlib.models import Default as Policy diff --git a/pufferlib/environments/dm_lab/__init__.py b/pufferlib/environments/dm_lab/__init__.py index eff86ef02..a38c82d6d 100644 --- a/pufferlib/environments/dm_lab/__init__.py +++ b/pufferlib/environments/dm_lab/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/dm_lab/environment.py b/pufferlib/environments/dm_lab/environment.py index 17bbde770..b808bd88d 100644 --- a/pufferlib/environments/dm_lab/environment.py +++ b/pufferlib/environments/dm_lab/environment.py @@ -1,24 +1,24 @@ -from pdb import set_trace as T -import gym -import shimmy import functools +import shimmy + import pufferlib import pufferlib.emulation import pufferlib.environments -def env_creator(name='seekavoid_arena_01'): - '''Deepmind Lab binding creation function - dm-lab requires extensive setup. Use PufferTank.''' +def env_creator(name="seekavoid_arena_01"): + """Deepmind Lab binding creation function + dm-lab requires extensive setup. Use PufferTank.""" return functools.partial(make, name=name) + def make(name, buf=None): - '''Deepmind Lab binding creation function + """Deepmind Lab binding creation function dm-lab requires extensive setup. Currently dropped frop PufferTank. 
- Let us know if you need this for your work.''' - dm_lab = pufferlib.environments.try_import('deepmind_lab', 'dm-lab') - env = dm_lab.Lab(name, ['RGB_INTERLEAVED']) + Let us know if you need this for your work.""" + dm_lab = pufferlib.environments.try_import("deepmind_lab", "dm-lab") + env = dm_lab.Lab(name, ["RGB_INTERLEAVED"]) env = shimmy.DmLabCompatibilityV0(env=env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) diff --git a/pufferlib/environments/dm_lab/torch.py b/pufferlib/environments/dm_lab/torch.py index c8e411498..46e61023a 100644 --- a/pufferlib/environments/dm_lab/torch.py +++ b/pufferlib/environments/dm_lab/torch.py @@ -3,16 +3,16 @@ class Policy(pufferlib.models.Convolutional): def __init__( - self, - env, - flat_size=3136, - channels_last=True, - downsample=1, - input_size=512, - hidden_size=128, - output_size=128, - **kwargs - ): + self, + env, + flat_size=3136, + channels_last=True, + downsample=1, + input_size=512, + hidden_size=128, + output_size=128, + **kwargs, + ): super().__init__( env, framestack=3, @@ -22,5 +22,5 @@ def __init__( input_size=input_size, hidden_size=hidden_size, output_size=output_size, - **kwargs + **kwargs, ) diff --git a/pufferlib/environments/gpudrive/__init__.py b/pufferlib/environments/gpudrive/__init__.py index 59cda9e7c..c24135317 100644 --- a/pufferlib/environments/gpudrive/__init__.py +++ b/pufferlib/environments/gpudrive/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/gpudrive/environment.py b/pufferlib/environments/gpudrive/environment.py index 31302e65d..4c0af061e 100644 --- a/pufferlib/environments/gpudrive/environment.py +++ b/pufferlib/environments/gpudrive/environment.py @@ -1,17 +1,19 @@ import os -import numpy as np from pathlib import Path -import torch -import gymnasium +import gymnasium +import numpy as np +import torch from pygpudrive.env.config import EnvConfig, RenderConfig, SceneConfig, SelectionDiscipline from pygpudrive.env.env_torch import GPUDriveTorchEnv -def env_creator(name='gpudrive'): + +def env_creator(name="gpudrive"): return PufferGPUDrive + class PufferGPUDrive: - def __init__(self, device='cuda', max_cont_agents=64, num_worlds=64, k_unique_scenes=1): + def __init__(self, device="cuda", max_cont_agents=64, num_worlds=64, k_unique_scenes=1): self.device = device self.max_cont_agents = max_cont_agents self.num_worlds = num_worlds @@ -19,26 +21,23 @@ def __init__(self, device='cuda', max_cont_agents=64, num_worlds=64, k_unique_sc self.total_agents = max_cont_agents * num_worlds # Set working directory to the base directory 'gpudrive' - working_dir = os.path.join(Path.cwd(), '../gpudrive') + working_dir = os.path.join(Path.cwd(), "../gpudrive") os.chdir(working_dir) scene_config = SceneConfig( - path='biggest_file/', + path="biggest_file/", num_scenes=num_worlds, discipline=SelectionDiscipline.K_UNIQUE_N, k_unique_scenes=k_unique_scenes, ) env_config = EnvConfig( - steer_actions = torch.round( - torch.linspace(-1.0, 1.0, 3), decimals=3), - accel_actions = torch.round( - torch.linspace(-3, 3, 3), decimals=3 - ) + steer_actions=torch.round(torch.linspace(-1.0, 1.0, 3), decimals=3), + accel_actions=torch.round(torch.linspace(-3, 3, 3), decimals=3), ) render_config = RenderConfig( - resolution=(512, 512), # Quality of the rendered images + resolution=(512, 512), # Quality of the rendered images ) self.env = GPUDriveTorchEnv( @@ -52,13 +51,12 @@ def __init__(self, device='cuda', max_cont_agents=64, 
num_worlds=64, k_unique_sc self.obs_size = self.env.observation_space.shape[-1] self.action_space = self.env.action_space self.observation_space = self.env.observation_space - self.observation_space = gymnasium.spaces.Box( - low=0, high=255, shape=(self.obs_size,), dtype=np.float32) + self.observation_space = gymnasium.spaces.Box(low=0, high=255, shape=(self.obs_size,), dtype=np.float32) self.single_observation_space = self.observation_space self.single_action_space = self.action_space self.done = False self.emulated = None - self.render_mode = 'rgb_array' + self.render_mode = "rgb_array" self.num_live = [] self.controlled_agent_mask = self.env.cont_agent_mask.clone() @@ -67,22 +65,21 @@ def __init__(self, device='cuda', max_cont_agents=64, num_worlds=64, k_unique_sc self.num_agents = self.obs.shape[0] self.env_id = np.array([i for i in range(self.num_agents)]) self.mask = np.ones(self.num_agents, dtype=bool) - self.actions = torch.zeros((num_worlds, max_cont_agents), - dtype=torch.int64).to(self.device) + self.actions = torch.zeros((num_worlds, max_cont_agents), dtype=torch.int64).to(self.device) def _obs_and_mask(self, obs): - #self.buf.masks[:] = self.env.cont_agent_mask.numpy().ravel() * self.live_agent_mask - #return np.asarray(obs).reshape(NUM_WORLDS*MAX_NUM_OBJECTS, self.obs_size) - #return obs.numpy().reshape(NUM_WORLDS*MAX_NUM_OBJECTS, self.obs_size)[:, :6] + # self.buf.masks[:] = self.env.cont_agent_mask.numpy().ravel() * self.live_agent_mask + # return np.asarray(obs).reshape(NUM_WORLDS*MAX_NUM_OBJECTS, self.obs_size) + # return obs.numpy().reshape(NUM_WORLDS*MAX_NUM_OBJECTS, self.obs_size)[:, :6] return obs.view(self.total_agents, self.obs_size) def close(self): - '''There is no point in closing the env because + """There is no point in closing the env because Madrona doesn't close correctly anyways. You will want - to cache this copy for later use. Cuda errors if you don't''' - pass - #self.env.close() - #del self.env.sim + to cache this copy for later use. 
Cuda errors if you don't""" + pass + # self.env.close() + # del self.env.sim def reset(self, seed=None, options=None): self.reward = torch.zeros(self.num_agents, dtype=torch.float32).to(self.device) @@ -104,8 +101,7 @@ def step(self, action): terminal = self.env.get_dones().bool() done_worlds = torch.where( - (terminal.nan_to_num(0) * self.controlled_agent_mask).sum(dim=1) - == self.controlled_agent_mask.sum(dim=1) + (terminal.nan_to_num(0) * self.controlled_agent_mask).sum(dim=1) == self.controlled_agent_mask.sum(dim=1) )[0].cpu() self.episode_returns += reward @@ -121,17 +117,19 @@ def step(self, action): controlled_mask = self.controlled_agent_mask[done_worlds] info_tensor = self.env.get_infos()[done_worlds][controlled_mask] num_finished_agents = controlled_mask.sum().item() - info.append({ - 'off_road': info_tensor[:, 0].sum().item() / num_finished_agents, - 'veh_collisions': info_tensor[:, 1].sum().item() / num_finished_agents, - 'non_veh_collisions': info_tensor[:, 2].sum().item() / num_finished_agents, - 'goal_achieved': info_tensor[:, 3].sum().item() / num_finished_agents, - 'num_finished_agents': num_finished_agents, - 'episode_length': self.episode_lengths[done_worlds].mean().item(), - 'mean_reward_per_episode': self.episode_returns[done_worlds].mean().item(), - 'control_density': self.num_controlled / self.num_agents, - 'data_density': np.mean(self.num_live) / self.num_agents, - }) + info.append( + { + "off_road": info_tensor[:, 0].sum().item() / num_finished_agents, + "veh_collisions": info_tensor[:, 1].sum().item() / num_finished_agents, + "non_veh_collisions": info_tensor[:, 2].sum().item() / num_finished_agents, + "goal_achieved": info_tensor[:, 3].sum().item() / num_finished_agents, + "num_finished_agents": num_finished_agents, + "episode_length": self.episode_lengths[done_worlds].mean().item(), + "mean_reward_per_episode": self.episode_returns[done_worlds].mean().item(), + "control_density": self.num_controlled / self.num_agents, + "data_density": np.mean(self.num_live) / self.num_agents, + } + ) self.num_live = [] for idx in done_worlds: diff --git a/pufferlib/environments/gpudrive/torch.py b/pufferlib/environments/gpudrive/torch.py index 5e684e507..4faaae914 100644 --- a/pufferlib/environments/gpudrive/torch.py +++ b/pufferlib/environments/gpudrive/torch.py @@ -1,11 +1,10 @@ -from torch import nn + import torch -import torch.nn.functional as F +from torch import nn -from functools import partial import pufferlib.models - from pufferlib.models import Default as Policy + Recurrent = pufferlib.models.LSTMWrapper EGO_STATE_DIM = 6 @@ -14,7 +13,8 @@ MAX_CONTROLLED_VEHICLES = 32 ROADMAP_AGENT_FEAT_DIM = MAX_CONTROLLED_VEHICLES - 1 -TOP_K_ROADPOINTS = 64 # Number of visible roadpoints from the road graph +TOP_K_ROADPOINTS = 64 # Number of visible roadpoints from the road graph + def unpack_obs(obs_flat): """ @@ -26,16 +26,14 @@ def unpack_obs(obs_flat): """ # Unpack ego and visible state ego_state = obs_flat[:, :EGO_STATE_DIM] - vis_state = obs_flat[:, EGO_STATE_DIM :] - # Visible state object order: road_objects, road_points + vis_state = obs_flat[:, EGO_STATE_DIM:] + # Visible state object order: road_objects, road_points # Find the ends of each section ro_end_idx = PARTNER_DIM * ROADMAP_AGENT_FEAT_DIM rg_end_idx = ro_end_idx + (ROAD_MAP_DIM * TOP_K_ROADPOINTS) - + # Unflatten and reshape to (batch_size, num_objects, object_dim) - road_objects = (vis_state[:, :ro_end_idx]).reshape( - -1, ROADMAP_AGENT_FEAT_DIM, PARTNER_DIM - ) + road_objects = (vis_state[:, 
:ro_end_idx]).reshape(-1, ROADMAP_AGENT_FEAT_DIM, PARTNER_DIM) road_graph = (vis_state[:, ro_end_idx:rg_end_idx]).reshape( -1, TOP_K_ROADPOINTS, @@ -43,6 +41,7 @@ def unpack_obs(obs_flat): ) return ego_state, road_objects, road_graph + class Policy(nn.Module): def __init__(self, env, input_size=64, hidden_size=128, **kwargs): super().__init__() @@ -64,12 +63,10 @@ def __init__(self, env, input_size=64, hidden_size=128, **kwargs): pufferlib.pytorch.layer_init(nn.Linear(input_size, input_size)), ) - self.proj = pufferlib.pytorch.layer_init(nn.Linear(3*input_size, hidden_size)) + self.proj = pufferlib.pytorch.layer_init(nn.Linear(3 * input_size, hidden_size)) - self.actor = pufferlib.pytorch.layer_init( - nn.Linear(hidden_size, env.single_action_space.n), std=0.01) - self.value_fn = pufferlib.pytorch.layer_init( - nn.Linear(hidden_size, 1), std=1) + self.actor = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, env.single_action_space.n), std=0.01) + self.value_fn = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, 1), std=1) def forward(self, observations): hidden, lookup = self.encode_observations(observations) diff --git a/pufferlib/environments/griddly/__init__.py b/pufferlib/environments/griddly/__init__.py index eff86ef02..a38c82d6d 100644 --- a/pufferlib/environments/griddly/__init__.py +++ b/pufferlib/environments/griddly/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/griddly/environment.py b/pufferlib/environments/griddly/environment.py index fdf8c2430..92e2905a1 100644 --- a/pufferlib/environments/griddly/environment.py +++ b/pufferlib/environments/griddly/environment.py @@ -1,8 +1,8 @@ -from pdb import set_trace as T + +import functools import gym import shimmy -import functools import pufferlib import pufferlib.emulation @@ -10,27 +10,32 @@ import pufferlib.postprocess ALIASES = { - 'spiders': 'GDY-Spiders-v0', -} + "spiders": "GDY-Spiders-v0", +} -def env_creator(name='spiders'): + +def env_creator(name="spiders"): return functools.partial(make, name) + # TODO: fix griddly def make(name, buf=None): - '''Griddly creation function + """Griddly creation function Note that Griddly environments do not have observation spaces until - they are created and reset''' + they are created and reset""" if name in ALIASES: name = ALIASES[name] import warnings - warnings.warn('Griddly has been segfaulting in the latest build and we do not know why. Submit a PR if you find a fix!') - pufferlib.environments.try_import('griddly') + + warnings.warn( + "Griddly has been segfaulting in the latest build and we do not know why. Submit a PR if you find a fix!" 
+ ) + pufferlib.environments.try_import("griddly") with pufferlib.utils.Suppress(): env = gym.make(name) - env.reset() # Populate observation space + env.reset() # Populate observation space env = shimmy.GymV21CompatibilityV0(env=env) env = pufferlib.postprocess.EpisodeStats(env) diff --git a/pufferlib/environments/griddly/torch.py b/pufferlib/environments/griddly/torch.py index 8b13194e9..e69de29bb 100644 --- a/pufferlib/environments/griddly/torch.py +++ b/pufferlib/environments/griddly/torch.py @@ -1 +0,0 @@ -from pufferlib.models import Default as Policy diff --git a/pufferlib/environments/gvgai/environment.py b/pufferlib/environments/gvgai/environment.py index fa4da86fd..72392b001 100644 --- a/pufferlib/environments/gvgai/environment.py +++ b/pufferlib/environments/gvgai/environment.py @@ -1,28 +1,33 @@ -from pdb import set_trace as T -import numpy as np import functools import gym - import pufferlib import pufferlib.emulation import pufferlib.environments -import pufferlib.utils import pufferlib.postprocess +import pufferlib.utils import pufferlib.wrappers -def env_creator(name='zelda'): - if name == 'zelda': - name = 'gvgai-zelda-lvl0-v0' + +def env_creator(name="zelda"): + if name == "zelda": + name = "gvgai-zelda-lvl0-v0" return functools.partial(make, name) -def make(name, obs_type='grayscale', frameskip=4, full_action_space=False, - repeat_action_probability=0.0, render_mode='rgb_array', buf=None): - '''Atari creation function''' - pufferlib.environments.try_import('gym_gvgai') + +def make( + name, + obs_type="grayscale", + frameskip=4, + full_action_space=False, + repeat_action_probability=0.0, + render_mode="rgb_array", + buf=None, +): + """Atari creation function""" + pufferlib.environments.try_import("gym_gvgai") env = gym.make(name) env = pufferlib.wrappers.GymToGymnasium(env) env = pufferlib.postprocess.EpisodeStats(env) env = pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) return env - diff --git a/pufferlib/environments/links_awaken/__init__.py b/pufferlib/environments/links_awaken/__init__.py index 1a9590595..776fb93f2 100644 --- a/pufferlib/environments/links_awaken/__init__.py +++ b/pufferlib/environments/links_awaken/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/links_awaken/environment.py b/pufferlib/environments/links_awaken/environment.py index 8016c3cf5..48648d89f 100644 --- a/pufferlib/environments/links_awaken/environment.py +++ b/pufferlib/environments/links_awaken/environment.py @@ -1,15 +1,14 @@ -from pdb import set_trace as T import gymnasium - from links_awaken import LinksAwakenV1 as env_creator import pufferlib.emulation def make_env(headless: bool = True, state_path=None, buf=None): - '''Links Awakening''' + """Links Awakening""" env = env_creator(headless=headless, state_path=state_path) env = gymnasium.wrappers.ResizeObservation(env, shape=(72, 80)) - return pufferlib.emulation.GymnasiumPufferEnv(env=env, - postprocessor_cls=pufferlib.emulation.BasicPostprocessor, buf=buf) + return pufferlib.emulation.GymnasiumPufferEnv( + env=env, postprocessor_cls=pufferlib.emulation.BasicPostprocessor, buf=buf + ) diff --git a/pufferlib/environments/links_awaken/torch.py b/pufferlib/environments/links_awaken/torch.py index 4c92d2fde..ec3be2614 100644 --- a/pufferlib/environments/links_awaken/torch.py +++ b/pufferlib/environments/links_awaken/torch.py @@ -1,5 +1,4 @@ import pufferlib.models -from pufferlib.pytorch import LSTM class Recurrent: @@ -7,9 +6,9 @@ 
class Recurrent: hidden_size = 512 num_layers = 1 + class Policy(pufferlib.models.Convolutional): - def __init__(self, env, input_size=512, hidden_size=512, output_size=512, - framestack=3, flat_size=64*5*6): + def __init__(self, env, input_size=512, hidden_size=512, output_size=512, framestack=3, flat_size=64 * 5 * 6): super().__init__( env=env, input_size=input_size, diff --git a/pufferlib/environments/magent/__init__.py b/pufferlib/environments/magent/__init__.py index eff86ef02..a38c82d6d 100644 --- a/pufferlib/environments/magent/__init__.py +++ b/pufferlib/environments/magent/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/magent/environment.py b/pufferlib/environments/magent/environment.py index 10fbfc2ad..8b0e48a75 100644 --- a/pufferlib/environments/magent/environment.py +++ b/pufferlib/environments/magent/environment.py @@ -1,24 +1,26 @@ -from pdb import set_trace as T -from pettingzoo.utils.conversions import aec_to_parallel_wrapper import functools +from pettingzoo.utils.conversions import aec_to_parallel_wrapper + import pufferlib.emulation import pufferlib.environments import pufferlib.wrappers -def env_creator(name='battle_v4'): +def env_creator(name="battle_v4"): return functools.partial(make, name) - pufferlib.environments.try_import('pettingzoo.magent', 'magent') + pufferlib.environments.try_import("pettingzoo.magent", "magent") + def make(name, buf=None): - '''MAgent Battle V4 creation function''' - if name == 'battle_v4': + """MAgent Battle V4 creation function""" + if name == "battle_v4": from pettingzoo.magent import battle_v4 + env_cls = battle_v4.env else: - raise ValueError(f'Unknown environment name {name}') - + raise ValueError(f"Unknown environment name {name}") + env = env_cls() env = aec_to_parallel_wrapper(env) env = pufferlib.wrappers.PettingZooTruncatedWrapper(env) diff --git a/pufferlib/environments/magent/torch.py b/pufferlib/environments/magent/torch.py index eb43bcd72..0a27e344d 100644 --- a/pufferlib/environments/magent/torch.py +++ b/pufferlib/environments/magent/torch.py @@ -4,12 +4,13 @@ class Policy(pufferlib.models.Policy): - '''Based off of the DQN policy in MAgent''' + """Based off of the DQN policy in MAgent""" + def __init__(self, env, hidden_size=256, output_size=256, kernel_num=32): - '''The CleanRL default Atari policy: a stack of three convolutions followed by a linear layer - + """The CleanRL default Atari policy: a stack of three convolutions followed by a linear layer + Takes framestack as a mandatory keyword arguments. 
Suggested default is 1 frame - with LSTM or 4 frames without.''' + with LSTM or 4 frames without.""" super().__init__(env) self.num_actions = self.action_space.n @@ -19,7 +20,7 @@ def __init__(self, env, hidden_size=256, output_size=256, kernel_num=32): pufferlib.pytorch.layer_init(nn.Conv2d(kernel_num, kernel_num, 3)), nn.ReLU(), nn.Flatten(), - pufferlib.pytorch.layer_init(nn.Linear(kernel_num*9*9, hidden_size)), + pufferlib.pytorch.layer_init(nn.Linear(kernel_num * 9 * 9, hidden_size)), nn.ReLU(), pufferlib.pytorch.layer_init(nn.Linear(hidden_size, hidden_size)), nn.ReLU(), diff --git a/pufferlib/environments/microrts/__init__.py b/pufferlib/environments/microrts/__init__.py index eff86ef02..a38c82d6d 100644 --- a/pufferlib/environments/microrts/__init__.py +++ b/pufferlib/environments/microrts/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/microrts/environment.py b/pufferlib/environments/microrts/environment.py index 6c00f4d52..25af8d87f 100644 --- a/pufferlib/environments/microrts/environment.py +++ b/pufferlib/environments/microrts/environment.py @@ -1,27 +1,26 @@ -from pdb import set_trace as T -import numpy as np +import functools -import warnings +import numpy as np import shimmy -import functools import pufferlib.emulation import pufferlib.environments -def env_creator(name='GlobalAgentCombinedRewardEnv'): +def env_creator(name="GlobalAgentCombinedRewardEnv"): return functools.partial(make, name) + def make(name, buf=None): - '''Gym MicroRTS creation function - + """Gym MicroRTS creation function + This library appears broken. Step crashes in Java. - ''' - pufferlib.environments.try_import('gym_microrts') - if name == 'GlobalAgentCombinedRewardEnv': + """ + pufferlib.environments.try_import("gym_microrts") + if name == "GlobalAgentCombinedRewardEnv": from gym_microrts.envs import GlobalAgentCombinedRewardEnv else: - raise ValueError(f'Unknown environment: {name}') + raise ValueError(f"Unknown environment: {name}") with pufferlib.utils.Suppress(): return GlobalAgentCombinedRewardEnv() @@ -33,6 +32,7 @@ def make(name, buf=None): env = shimmy.GymV21CompatibilityV0(env=env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + class MicroRTS: def __init__(self, env): self.env = env diff --git a/pufferlib/environments/microrts/torch.py b/pufferlib/environments/microrts/torch.py index 8b13194e9..e69de29bb 100644 --- a/pufferlib/environments/microrts/torch.py +++ b/pufferlib/environments/microrts/torch.py @@ -1 +0,0 @@ -from pufferlib.models import Default as Policy diff --git a/pufferlib/environments/minerl/__init__.py b/pufferlib/environments/minerl/__init__.py index eff86ef02..a38c82d6d 100644 --- a/pufferlib/environments/minerl/__init__.py +++ b/pufferlib/environments/minerl/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/minerl/environment.py b/pufferlib/environments/minerl/environment.py index 771c3c600..becda47fe 100644 --- a/pufferlib/environments/minerl/environment.py +++ b/pufferlib/environments/minerl/environment.py @@ -1,8 +1,8 @@ -from pdb import set_trace as T + +import functools import gym import shimmy -import functools import pufferlib import pufferlib.emulation @@ -10,18 +10,19 @@ import pufferlib.utils -def env_creator(name='MineRLBasaltFindCave-v0'): +def env_creator(name="MineRLBasaltFindCave-v0"): return functools.partial(make, name=name) + def make(name, 
buf=None): - '''Minecraft environment creation function''' + """Minecraft environment creation function""" - pufferlib.environments.try_import('minerl') + pufferlib.environments.try_import("minerl") # Monkey patch to add .itmes to old gym.spaces.Dict - #gym.spaces.Dict.items = lambda self: self.spaces.items() + # gym.spaces.Dict.items = lambda self: self.spaces.items() - #with pufferlib.utils.Suppress(): + # with pufferlib.utils.Suppress(): env = gym.make(name) env = shimmy.GymV21CompatibilityV0(env=env) diff --git a/pufferlib/environments/minerl/torch.py b/pufferlib/environments/minerl/torch.py index 8b13194e9..e69de29bb 100644 --- a/pufferlib/environments/minerl/torch.py +++ b/pufferlib/environments/minerl/torch.py @@ -1 +0,0 @@ -from pufferlib.models import Default as Policy diff --git a/pufferlib/environments/minigrid/__init__.py b/pufferlib/environments/minigrid/__init__.py index eff86ef02..a38c82d6d 100644 --- a/pufferlib/environments/minigrid/__init__.py +++ b/pufferlib/environments/minigrid/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/minigrid/environment.py b/pufferlib/environments/minigrid/environment.py index b08ad9599..1deb6ed5b 100644 --- a/pufferlib/environments/minigrid/environment.py +++ b/pufferlib/environments/minigrid/environment.py @@ -1,52 +1,53 @@ -from pdb import set_trace as T -import gymnasium import functools +import gymnasium + import pufferlib.emulation import pufferlib.environments import pufferlib.postprocess ALIASES = { - 'minigrid': 'MiniGrid-LavaGapS7-v0', + "minigrid": "MiniGrid-LavaGapS7-v0", } -def env_creator(name='minigrid'): +def env_creator(name="minigrid"): return functools.partial(make, name=name) -def make(name, render_mode='rgb_array', buf=None): + +def make(name, render_mode="rgb_array", buf=None): if name in ALIASES: name = ALIASES[name] - minigrid = pufferlib.environments.try_import('minigrid') + minigrid = pufferlib.environments.try_import("minigrid") env = gymnasium.make(name, render_mode=render_mode) env = MiniGridWrapper(env) env = pufferlib.postprocess.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + class MiniGridWrapper: def __init__(self, env): self.env = env - self.observation_space = gymnasium.spaces.Dict({ - k: v for k, v in self.env.observation_space.items() if - k != 'mission' - }) + self.observation_space = gymnasium.spaces.Dict( + {k: v for k, v in self.env.observation_space.items() if k != "mission"} + ) self.action_space = self.env.action_space self.close = self.env.close self.render = self.env.render self.close = self.env.close - self.render_mode = 'rgb_array' + self.render_mode = "rgb_array" def reset(self, seed=None, options=None): self.tick = 0 obs, info = self.env.reset(seed=seed) - del obs['mission'] + del obs["mission"] return obs, info def step(self, action): obs, reward, done, truncated, info = self.env.step(action) - del obs['mission'] + del obs["mission"] self.tick += 1 if self.tick == 100: diff --git a/pufferlib/environments/minigrid/torch.py b/pufferlib/environments/minigrid/torch.py index 8b13194e9..e69de29bb 100644 --- a/pufferlib/environments/minigrid/torch.py +++ b/pufferlib/environments/minigrid/torch.py @@ -1 +0,0 @@ -from pufferlib.models import Default as Policy diff --git a/pufferlib/environments/minihack/__init__.py b/pufferlib/environments/minihack/__init__.py index eff86ef02..a38c82d6d 100644 --- a/pufferlib/environments/minihack/__init__.py +++ 
b/pufferlib/environments/minihack/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/minihack/environment.py b/pufferlib/environments/minihack/environment.py index b5ac005ff..a91a68973 100644 --- a/pufferlib/environments/minihack/environment.py +++ b/pufferlib/environments/minihack/environment.py @@ -1,40 +1,43 @@ -from pdb import set_trace as T + +import functools import gym import shimmy -import functools import pufferlib import pufferlib.emulation import pufferlib.environments - EXTRA_OBS_KEYS = [ - 'tty_chars', - 'tty_colors', - 'tty_cursor', + "tty_chars", + "tty_colors", + "tty_cursor", ] ALIASES = { - 'minihack': 'MiniHack-River-v0', + "minihack": "MiniHack-River-v0", } -def env_creator(name='minihack'): + +def env_creator(name="minihack"): return functools.partial(make, name) + def make(name, buf=None): - '''NetHack binding creation function''' + """NetHack binding creation function""" if name in ALIASES: name = ALIASES[name] import minihack - pufferlib.environments.try_import('minihack') + + pufferlib.environments.try_import("minihack") obs_key = minihack.base.MH_DEFAULT_OBS_KEYS + EXTRA_OBS_KEYS env = gym.make(name, observation_keys=obs_key) env = shimmy.GymV21CompatibilityV0(env=env) env = MinihackWrapper(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + class MinihackWrapper: def __init__(self, env): self.env = env @@ -42,7 +45,7 @@ def __init__(self, env): self.action_space = self.env.action_space self.close = self.env.close self.close = self.env.close - self.render_mode = 'ansi' + self.render_mode = "ansi" def reset(self, seed=None): obs, info = self.env.reset(seed=seed) @@ -56,7 +59,6 @@ def step(self, action): def render(self): import nle - chars = nle.nethack.tty_render( - self.obs['tty_chars'], self.obs['tty_colors'], self.obs['tty_cursor']) - return chars + chars = nle.nethack.tty_render(self.obs["tty_chars"], self.obs["tty_colors"], self.obs["tty_cursor"]) + return chars diff --git a/pufferlib/environments/minihack/torch.py b/pufferlib/environments/minihack/torch.py index 7781cb677..7863eda04 100644 --- a/pufferlib/environments/minihack/torch.py +++ b/pufferlib/environments/minihack/torch.py @@ -1,7 +1,6 @@ -from pdb import set_trace as T import pufferlib.pytorch -from pufferlib.environments.nethack import Policy + class Recurrent(pufferlib.models.LSTMWrapper): def __init__(self, env, policy, input_size=512, hidden_size=512, num_layers=1): diff --git a/pufferlib/environments/mujoco/__init__.py b/pufferlib/environments/mujoco/__init__.py index 9bb39504b..e517a67d4 100644 --- a/pufferlib/environments/mujoco/__init__.py +++ b/pufferlib/environments/mujoco/__init__.py @@ -7,7 +7,8 @@ pass else: from .policy import Policy + try: from .policy import Recurrent except: - Recurrent = None \ No newline at end of file + Recurrent = None diff --git a/pufferlib/environments/mujoco/cleanrl.py b/pufferlib/environments/mujoco/cleanrl.py index 6b2bd1426..32330e7a9 100644 --- a/pufferlib/environments/mujoco/cleanrl.py +++ b/pufferlib/environments/mujoco/cleanrl.py @@ -1,10 +1,10 @@ # docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/ppo/#ppo_continuous_actionpy -import configparser import argparse -import random -import time import ast +import configparser import os +import random +import time from types import SimpleNamespace import gymnasium @@ -17,7 +17,6 @@ from pufferlib.environments.mujoco.environment import cleanrl_env_creator from 
pufferlib.environments.mujoco.policy import CleanRLPolicy, Policy - if __name__ == "__main__": # Simpler args parse just for this script. Configs are read from file ONLY. parser = argparse.ArgumentParser(description="Training arguments for cleanrl mujoco", add_help=False) @@ -32,23 +31,23 @@ args = parser.parse_known_args()[0] if not os.path.exists(args.config): - raise Exception(f'Config {args.config} not found') + raise Exception(f"Config {args.config} not found") p = configparser.ConfigParser() p.read(args.config) - assert args.env_id in p['base']['env_name'].split(), f"Env {args.env_id} not found in {args.config}" + assert args.env_id in p["base"]["env_name"].split(), f"Env {args.env_id} not found in {args.config}" for section in p.sections(): for key in p[section]: - argparse_key = f'--{section}.{key}'.replace('_', '-') + argparse_key = f"--{section}.{key}".replace("_", "-") parser.add_argument(argparse_key, default=p[section][key]) parsed = parser.parse_args().__dict__ - args_dict = {'env': {}, 'policy': {}, 'rnn': {}} - env_name = parsed.pop('env_id') + args_dict = {"env": {}, "policy": {}, "rnn": {}} + env_name = parsed.pop("env_id") for key, value in parsed.items(): next = args_dict - for subkey in key.split('.'): + for subkey in key.split("."): if subkey not in next: next[subkey] = {} prev = next @@ -84,6 +83,7 @@ wandb = None if args.track: import wandb + wandb.init( project=args.wandb_project, group=args.wandb_group, @@ -107,30 +107,21 @@ # env setup envs = gymnasium.vector.SyncVectorEnv( - [ - cleanrl_env_creator(args.env_id, run_name, args.capture_video, args.gamma, i) - for i in range(args.num_envs) - ] + [cleanrl_env_creator(args.env_id, run_name, args.capture_video, args.gamma, i) for i in range(args.num_envs)] ) - assert isinstance( - envs.single_action_space, gymnasium.spaces.Box - ), "only continuous action space is supported" + assert isinstance(envs.single_action_space, gymnasium.spaces.Box), "only continuous action space is supported" if args.policy == "cleanrl": policy = CleanRLPolicy(envs) elif args.policy == "puffer": policy = Policy(envs) - + agent = pufferlib.cleanrl.Policy(policy).to(device) optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5) # ALGO Logic: Storage setup - obs = torch.zeros((args.num_steps, args.num_envs) + envs.single_observation_space.shape).to( - device - ) - actions = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to( - device - ) + obs = torch.zeros((args.num_steps, args.num_envs) + envs.single_observation_space.shape).to(device) + actions = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device) logprobs = torch.zeros((args.num_steps, args.num_envs)).to(device) rewards = torch.zeros((args.num_steps, args.num_envs)).to(device) dones = torch.zeros((args.num_steps, args.num_envs)).to(device) @@ -174,9 +165,7 @@ if "final_info" in infos: for info in infos["final_info"]: if info and "episode_return" in info: - print( - f"global_step: {global_step}, episode_return: {int(info['episode_return'])}" - ) + print(f"global_step: {global_step}, episode_return: {int(info['episode_return'])}") episode_stats["episode_return"].append(info["episode_return"]) episode_stats["episode_length"].append(info["episode_length"]) @@ -193,9 +182,7 @@ nextnonterminal = 1.0 - dones[t + 1] nextvalues = values[t + 1] delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t] - advantages[t] = lastgaelam = ( - delta + args.gamma * args.gae_lambda * nextnonterminal * 
lastgaelam - ) + advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam returns = advantages + values # flatten the batch @@ -215,9 +202,7 @@ end = start + args.minibatch_size mb_inds = b_inds[start:end] - _, newlogprob, entropy, newvalue = agent.get_action_and_value( - b_obs[mb_inds], b_actions[mb_inds] - ) + _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mb_inds], b_actions[mb_inds]) logratio = newlogprob - b_logprobs[mb_inds] ratio = logratio.exp() @@ -229,15 +214,11 @@ mb_advantages = b_advantages[mb_inds] if args.norm_adv: - mb_advantages = (mb_advantages - mb_advantages.mean()) / ( - mb_advantages.std() + 1e-8 - ) + mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8) # Policy loss pg_loss1 = -mb_advantages * ratio - pg_loss2 = -mb_advantages * torch.clamp( - ratio, 1 - args.clip_coef, 1 + args.clip_coef - ) + pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - args.clip_coef, 1 + args.clip_coef) pg_loss = torch.max(pg_loss1, pg_loss2).mean() # Value loss diff --git a/pufferlib/environments/mujoco/environment.py b/pufferlib/environments/mujoco/environment.py index 4aad701e9..4e4932378 100644 --- a/pufferlib/environments/mujoco/environment.py +++ b/pufferlib/environments/mujoco/environment.py @@ -1,10 +1,8 @@ -from pdb import set_trace as T - import functools -import numpy as np import gymnasium +import numpy as np import pufferlib import pufferlib.emulation diff --git a/pufferlib/environments/mujoco/policy.py b/pufferlib/environments/mujoco/policy.py index 6969b71df..207c0014d 100644 --- a/pufferlib/environments/mujoco/policy.py +++ b/pufferlib/environments/mujoco/policy.py @@ -5,8 +5,6 @@ import pufferlib from pufferlib.pytorch import layer_init -from pufferlib.models import Default as Policy - # Puffer LSTMWrapper does NOT support separate critic networks for now # Would be good to test he performance between these architectures @@ -31,9 +29,7 @@ def __init__(self, env, hidden_size=64): nn.Tanh(), ) - self.actor_decoder_mean = layer_init( - nn.Linear(hidden_size, env.single_action_space.shape[0]), std=0.01 - ) + self.actor_decoder_mean = layer_init(nn.Linear(hidden_size, env.single_action_space.shape[0]), std=0.01) self.actor_decoder_logstd = nn.Parameter(torch.zeros(1, env.single_action_space.shape[0])) self.critic = nn.Sequential( @@ -62,7 +58,7 @@ def encode_observations(self, observations): def decode_actions(self, hidden, lookup, concat=True): """Decodes a batch of hidden states into (multi)discrete actions. 
Assumes no time dimension (handled by LSTM wrappers).""" - #value = self.value_head(hidden) + # value = self.value_head(hidden) mean = self.actor_decoder_mean(hidden) logstd = self.actor_decoder_logstd.expand_as(mean) diff --git a/pufferlib/environments/nethack/__init__.py b/pufferlib/environments/nethack/__init__.py index eff86ef02..a38c82d6d 100644 --- a/pufferlib/environments/nethack/__init__.py +++ b/pufferlib/environments/nethack/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/nethack/environment.py b/pufferlib/environments/nethack/environment.py index c49ae260f..ce4200e17 100644 --- a/pufferlib/environments/nethack/environment.py +++ b/pufferlib/environments/nethack/environment.py @@ -1,31 +1,35 @@ -from pdb import set_trace as T -import shimmy -import gym import functools +import gym +import shimmy + import pufferlib import pufferlib.emulation import pufferlib.environments import pufferlib.postprocess -#from .wrapper import RenderCharImagesWithNumpyWrapper -def env_creator(name='nethack'): +# from .wrapper import RenderCharImagesWithNumpyWrapper + + +def env_creator(name="nethack"): return functools.partial(make, name) + def make(name, buf=None): - '''NetHack binding creation function''' - if name == 'nethack': - name = 'NetHackScore-v0' + """NetHack binding creation function""" + if name == "nethack": + name = "NetHackScore-v0" - nle = pufferlib.environments.try_import('nle') + nle = pufferlib.environments.try_import("nle") env = gym.make(name) - #env = RenderCharImagesWithNumpyWrapper(env) + # env = RenderCharImagesWithNumpyWrapper(env) env = shimmy.GymV21CompatibilityV0(env=env) env = NethackWrapper(env) env = pufferlib.postprocess.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + class NethackWrapper: def __init__(self, env): self.env = env @@ -33,7 +37,7 @@ def __init__(self, env): self.action_space = self.env.action_space self.close = self.env.close self.close = self.env.close - self.render_mode = 'ansi' + self.render_mode = "ansi" def reset(self, seed=None): obs, info = self.env.reset(seed=seed) @@ -47,6 +51,6 @@ def step(self, action): def render(self): import nle - chars = nle.nethack.tty_render( - self.obs['tty_chars'], self.obs['tty_colors'], self.obs['tty_cursor']) + + chars = nle.nethack.tty_render(self.obs["tty_chars"], self.obs["tty_colors"], self.obs["tty_cursor"]) return chars diff --git a/pufferlib/environments/nethack/torch.py b/pufferlib/environments/nethack/torch.py index 86a54a040..9785686f4 100644 --- a/pufferlib/environments/nethack/torch.py +++ b/pufferlib/environments/nethack/torch.py @@ -1,8 +1,6 @@ -from pdb import set_trace as T import torch import torch.nn as nn -import torch.nn.functional as F import pufferlib.models import pufferlib.pytorch @@ -13,6 +11,7 @@ class Recurrent(pufferlib.models.LSTMWrapper): def __init__(self, env, policy, input_size=256, hidden_size=256, num_layers=1): super().__init__(env, policy, input_size, hidden_size, num_layers) + class Policy(nn.Module): def __init__(self, env): super().__init__() @@ -34,7 +33,7 @@ def __init__(self, env): nn.Flatten(), ) - self.proj = nn.Linear(864+960, 256) + self.proj = nn.Linear(864 + 960, 256) self.actor = layer_init(nn.Linear(256, 8), std=0.01) self.critic = layer_init(nn.Linear(256, 1), std=1) @@ -44,13 +43,13 @@ def forward(self, x): return actions, value def encode_observations(self, x): - x = x.type(torch.uint8) # Undo bad cleanrl cast + x = 
x.type(torch.uint8) # Undo bad cleanrl cast x = pufferlib.pytorch.nativize_tensor(x, self.dtype) - blstats = torch.clip(x['blstats'] + 1, 0, 255).int() + blstats = torch.clip(x["blstats"] + 1, 0, 255).int() blstats = self.blstats_net(blstats) - chars = self.char_embed(x['chars'].int()) + chars = self.char_embed(x["chars"].int()) chars = torch.permute(chars, (0, 3, 1, 2)) chars = self.chars_net(chars) diff --git a/pufferlib/environments/nethack/wrapper.py b/pufferlib/environments/nethack/wrapper.py index 17781a6ac..a5d581d99 100644 --- a/pufferlib/environments/nethack/wrapper.py +++ b/pufferlib/environments/nethack/wrapper.py @@ -30,13 +30,11 @@ import cv2 import gym import numpy as np -from numba import njit from nle import nethack -from PIL import Image -from PIL import ImageDraw -from PIL import ImageFont +from numba import njit +from PIL import Image, ImageDraw, ImageFont -#import render_utils +# import render_utils SMALL_FONT_PATH = os.path.join(__package__.replace(".", "/"), "Hack-Regular.ttf") @@ -93,9 +91,7 @@ def _tile_characters_to_image( color = colors[h_char, w_char] h_pixel = h * char_height w_pixel = w * char_width - out_image[ - :, h_pixel : h_pixel + char_height, w_pixel : w_pixel + char_width - ] = char_array[char, color] + out_image[:, h_pixel : h_pixel + char_height, w_pixel : w_pixel + char_width] = char_array[char, color] def _initialize_char_array(font_size, rescale_font_size): @@ -106,9 +102,7 @@ def _initialize_char_array(font_size, rescale_font_size): Returns a np array of (num_chars, num_colors, char_height, char_width, 3) """ font = ImageFont.truetype(SMALL_FONT_PATH, font_size) - dummy_text = "".join( - [(chr(i) if chr(i).isprintable() else " ") for i in range(256)] - ) + dummy_text = "".join([(chr(i) if chr(i).isprintable() else " ") for i in range(256)]) _, _, image_width, image_height = font.getbbox(dummy_text) # Above can not be trusted (or its siblings).... 
image_width = int(np.ceil(image_width / 256) * 256) @@ -168,11 +162,9 @@ def __init__( self.output_width_chars * self.char_width, ) - self.observation_space = gym.spaces.Box( - low=0, high=255, shape=self.chw_image_shape, dtype=np.uint8 - ) - - ''' + self.observation_space = gym.spaces.Box(low=0, high=255, shape=self.chw_image_shape, dtype=np.uint8) + + """ obs_spaces = { "screen_image": gym.spaces.Box( low=0, high=255, shape=self.chw_image_shape, dtype=np.uint8 @@ -186,9 +178,9 @@ def __init__( ] ) self.observation_space = gym.spaces.Dict(obs_spaces) - ''' + """ - self.render_mode = 'rgb_array' + self.render_mode = "rgb_array" def _render_text_to_image(self, obs): chars = obs["tty_chars"] @@ -235,7 +227,7 @@ def reset(self): obs = self._render_text_to_image(obs) return obs - def render(self, mode='rgb_array'): + def render(self, mode="rgb_array"): return self.obs @@ -269,11 +261,7 @@ def __init__( crop_cols * self.char_width, ) - obs_spaces = { - "screen_image": gym.spaces.Box( - low=0, high=255, shape=self.chw_image_shape, dtype=np.uint8 - ) - } + obs_spaces = {"screen_image": gym.spaces.Box(low=0, high=255, shape=self.chw_image_shape, dtype=np.uint8)} obs_spaces.update( [ (k, self.env.observation_space[k]) diff --git a/pufferlib/environments/nmmo/__init__.py b/pufferlib/environments/nmmo/__init__.py index eff86ef02..a38c82d6d 100644 --- a/pufferlib/environments/nmmo/__init__.py +++ b/pufferlib/environments/nmmo/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/nmmo/environment.py b/pufferlib/environments/nmmo/environment.py index f4ef1c998..ab16957fc 100644 --- a/pufferlib/environments/nmmo/environment.py +++ b/pufferlib/environments/nmmo/environment.py @@ -1,34 +1,37 @@ -from pdb import set_trace as T -import numpy as np import functools +import numpy as np + import pufferlib import pufferlib.emulation import pufferlib.environments -import pufferlib.wrappers import pufferlib.postprocess +import pufferlib.wrappers -def env_creator(name='nmmo'): +def env_creator(name="nmmo"): return functools.partial(make, name) + def make(name, *args, buf=None, **kwargs): - '''Neural MMO creation function''' - nmmo = pufferlib.environments.try_import('nmmo') + """Neural MMO creation function""" + nmmo = pufferlib.environments.try_import("nmmo") env = nmmo.Env(*args, **kwargs) env = NMMOWrapper(env) env = pufferlib.postprocess.MultiagentEpisodeStats(env) env = pufferlib.postprocess.MeanOverAgents(env) return pufferlib.emulation.PettingZooPufferEnv(env=env, buf=buf) + class NMMOWrapper(pufferlib.postprocess.PettingZooWrapper): - '''Remove task spam''' + """Remove task spam""" + @property def render_mode(self): - return 'rgb_array' - + return "rgb_array" + def render(self): - '''Quick little renderer for NMMO''' + """Quick little renderer for NMMO""" tiles = self.env.tile_map[:, :, 2].astype(np.uint8) render = np.zeros((tiles.shape[0], tiles.shape[1], 3), dtype=np.uint8) BROWN = (136, 69, 19) @@ -67,11 +70,9 @@ def reset(self, seed=None): def step(self, actions): obs, rewards, dones, truncateds, infos = self.env.step(actions) - infos = {k: list(v['task'].values())[0] for k, v in infos.items()} + infos = {k: list(v["task"].values())[0] for k, v in infos.items()} self.obs = obs return obs, rewards, dones, truncateds, infos def close(self): return self.env.close() - - diff --git a/pufferlib/environments/nmmo/torch.py b/pufferlib/environments/nmmo/torch.py index e98134f3a..e98dc1ea7 100644 --- 
a/pufferlib/environments/nmmo/torch.py +++ b/pufferlib/environments/nmmo/torch.py @@ -1,4 +1,3 @@ -from pdb import set_trace as T import torch import torch.nn.functional as F @@ -17,80 +16,76 @@ class Recurrent(pufferlib.models.LSTMWrapper): def __init__(self, env, policy, input_size=256, hidden_size=256, num_layers=1): super().__init__(env, policy, input_size, hidden_size, num_layers) + class Policy(torch.nn.Module): - NUM_ATTRS = 34 - EntityId = EntityState.State.attr_name_to_col["id"] - tile_offset = torch.tensor([i*256 for i in range(3)]) - entity_offset = torch.tensor([i*256 for i in range(3, 34)]) - - def __init__(self, env, input_size=256, hidden_size=256, output_size=256): - super().__init__() - self.dtype = pufferlib.pytorch.nativize_dtype(env.emulated) - - # A dumb example encoder that applies a linear layer to agent self features - self.embedding = torch.nn.Embedding(self.NUM_ATTRS*256, 32) - self.tile_conv_1 = torch.nn.Conv2d(96, 32, 3) - self.tile_conv_2 = torch.nn.Conv2d(32, 8, 3) - self.tile_fc = torch.nn.Linear(8*11*11, input_size) - - self.entity_fc = torch.nn.Linear(31*32, input_size) - - self.proj_fc = torch.nn.Linear(2*input_size, input_size) - - self.decoders = torch.nn.ModuleList([torch.nn.Linear(hidden_size, n) - for n in env.single_action_space.nvec]) - self.value_head = torch.nn.Linear(hidden_size, 1) - - def forward(self, x): - hidden, lookup = self.encode_observations(x) - actions, value = self.decode_actions(hidden, lookup) - return actions, value - - def encode_observations(self, env_outputs): - env_outputs = pufferlib.pytorch.nativize_tensor(env_outputs, self.dtype) - - tile = env_outputs['Tile'] - # Center on player - # This is cursed without clone?? - tile[:, :, :2] -= tile[:, 112:113, :2].clone() - tile[:, :, :2] += 7 - tile = self.embedding( - tile.long().clip(0, 255) + self.tile_offset.to(tile.device) - ) - - agents, tiles, features, embed = tile.shape - tile = tile.view(agents, tiles, features*embed).transpose(1, 2).view(agents, features*embed, 15, 15) - - tile = self.tile_conv_1(tile) - tile = F.relu(tile) - tile = self.tile_conv_2(tile) - tile = F.relu(tile) - tile = tile.contiguous().view(agents, -1) - tile = self.tile_fc(tile) - tile = F.relu(tile) - - # Pull out rows corresponding to the agent - agentEmb = env_outputs["Entity"] - my_id = env_outputs["AgentId"][:,0] - entity_ids = agentEmb[:,:,self.EntityId] - mask = (entity_ids == my_id.unsqueeze(1)) & (entity_ids != 0) - mask = mask.int() - row_indices = torch.where(mask.any(dim=1), mask.argmax(dim=1), torch.zeros_like(mask.sum(dim=1))) - entity = agentEmb[torch.arange(agentEmb.shape[0]), row_indices] - - entity = self.embedding( - entity.long().clip(0, 255) + self.entity_offset.to(entity.device) - ) - agents, attrs, embed = entity.shape - entity = entity.view(agents, attrs*embed) - - entity = self.entity_fc(entity) - entity = F.relu(entity) - - obs = torch.cat([tile, entity], dim=-1) - return self.proj_fc(obs), None - - def decode_actions(self, hidden, lookup, concat=True): - value = self.value_head(hidden) - actions = [dec(hidden) for dec in self.decoders] - return actions, value + NUM_ATTRS = 34 + EntityId = EntityState.State.attr_name_to_col["id"] + tile_offset = torch.tensor([i * 256 for i in range(3)]) + entity_offset = torch.tensor([i * 256 for i in range(3, 34)]) + + def __init__(self, env, input_size=256, hidden_size=256, output_size=256): + super().__init__() + self.dtype = pufferlib.pytorch.nativize_dtype(env.emulated) + + # A dumb example encoder that applies a linear layer to agent 
self features + self.embedding = torch.nn.Embedding(self.NUM_ATTRS * 256, 32) + self.tile_conv_1 = torch.nn.Conv2d(96, 32, 3) + self.tile_conv_2 = torch.nn.Conv2d(32, 8, 3) + self.tile_fc = torch.nn.Linear(8 * 11 * 11, input_size) + + self.entity_fc = torch.nn.Linear(31 * 32, input_size) + + self.proj_fc = torch.nn.Linear(2 * input_size, input_size) + + self.decoders = torch.nn.ModuleList([torch.nn.Linear(hidden_size, n) for n in env.single_action_space.nvec]) + self.value_head = torch.nn.Linear(hidden_size, 1) + + def forward(self, x): + hidden, lookup = self.encode_observations(x) + actions, value = self.decode_actions(hidden, lookup) + return actions, value + + def encode_observations(self, env_outputs): + env_outputs = pufferlib.pytorch.nativize_tensor(env_outputs, self.dtype) + + tile = env_outputs["Tile"] + # Center on player + # This is cursed without clone?? + tile[:, :, :2] -= tile[:, 112:113, :2].clone() + tile[:, :, :2] += 7 + tile = self.embedding(tile.long().clip(0, 255) + self.tile_offset.to(tile.device)) + + agents, tiles, features, embed = tile.shape + tile = tile.view(agents, tiles, features * embed).transpose(1, 2).view(agents, features * embed, 15, 15) + + tile = self.tile_conv_1(tile) + tile = F.relu(tile) + tile = self.tile_conv_2(tile) + tile = F.relu(tile) + tile = tile.contiguous().view(agents, -1) + tile = self.tile_fc(tile) + tile = F.relu(tile) + + # Pull out rows corresponding to the agent + agentEmb = env_outputs["Entity"] + my_id = env_outputs["AgentId"][:, 0] + entity_ids = agentEmb[:, :, self.EntityId] + mask = (entity_ids == my_id.unsqueeze(1)) & (entity_ids != 0) + mask = mask.int() + row_indices = torch.where(mask.any(dim=1), mask.argmax(dim=1), torch.zeros_like(mask.sum(dim=1))) + entity = agentEmb[torch.arange(agentEmb.shape[0]), row_indices] + + entity = self.embedding(entity.long().clip(0, 255) + self.entity_offset.to(entity.device)) + agents, attrs, embed = entity.shape + entity = entity.view(agents, attrs * embed) + + entity = self.entity_fc(entity) + entity = F.relu(entity) + + obs = torch.cat([tile, entity], dim=-1) + return self.proj_fc(obs), None + + def decode_actions(self, hidden, lookup, concat=True): + value = self.value_head(hidden) + actions = [dec(hidden) for dec in self.decoders] + return actions, value diff --git a/pufferlib/environments/open_spiel/__init__.py b/pufferlib/environments/open_spiel/__init__.py index eff86ef02..a38c82d6d 100644 --- a/pufferlib/environments/open_spiel/__init__.py +++ b/pufferlib/environments/open_spiel/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/open_spiel/environment.py b/pufferlib/environments/open_spiel/environment.py index a5679b759..8fafcb03b 100644 --- a/pufferlib/environments/open_spiel/environment.py +++ b/pufferlib/environments/open_spiel/environment.py @@ -1,38 +1,25 @@ -from pdb import set_trace as T -import numpy as np import functools import pufferlib -from pufferlib import namespace import pufferlib.emulation import pufferlib.environments -def env_creator(name='connect_four'): - '''OpenSpiel creation function''' +def env_creator(name="connect_four"): + """OpenSpiel creation function""" return functools.partial(make, name) -def make( - name, - multiplayer=False, - n_rollouts=5, - max_simulations=10, - min_simulations=None, - buf=None - ): - '''OpenSpiel creation function''' - pyspiel = pufferlib.environments.try_import('pyspiel', 'open_spiel') + +def make(name, multiplayer=False, n_rollouts=5, 
max_simulations=10, min_simulations=None, buf=None): + """OpenSpiel creation function""" + pyspiel = pufferlib.environments.try_import("pyspiel", "open_spiel") env = pyspiel.load_game(name) if min_simulations is None: min_simulations = max_simulations - from pufferlib.environments.open_spiel.gymnasium_environment import ( - OpenSpielGymnasiumEnvironment - ) - from pufferlib.environments.open_spiel.pettingzoo_environment import ( - OpenSpielPettingZooEnvironment - ) + from pufferlib.environments.open_spiel.gymnasium_environment import OpenSpielGymnasiumEnvironment + from pufferlib.environments.open_spiel.pettingzoo_environment import OpenSpielPettingZooEnvironment kwargs = dict( env=env, @@ -40,7 +27,7 @@ def make( min_simulations=int(min_simulations), max_simulations=int(max_simulations), ) - + if multiplayer: env = OpenSpielPettingZooEnvironment(**kwargs) wrapper_cls = pufferlib.emulation.PettingZooPufferEnv @@ -53,4 +40,3 @@ def make( postprocessor_cls=pufferlib.emulation.BasicPostprocessor, buf=buf, ) - diff --git a/pufferlib/environments/open_spiel/gymnasium_environment.py b/pufferlib/environments/open_spiel/gymnasium_environment.py index ae5bfba59..0085b819d 100644 --- a/pufferlib/environments/open_spiel/gymnasium_environment.py +++ b/pufferlib/environments/open_spiel/gymnasium_environment.py @@ -1,41 +1,38 @@ -from pdb import set_trace as T import numpy as np - from open_spiel.python.algorithms import mcts -import pufferlib -from pufferlib import namespace from pufferlib.environments.open_spiel.utils import ( - solve_chance_nodes, - get_obs_and_infos, - observation_space, action_space, + close, + get_obs_and_infos, init, + observation_space, render, - close, + solve_chance_nodes, ) def create_bots(state, seed): - assert seed is not None, 'seed must be set' + assert seed is not None, "seed must be set" rnd_state = np.random.RandomState(seed) - evaluator = mcts.RandomRolloutEvaluator( - n_rollouts=state.n_rollouts, - random_state=rnd_state - ) - - return [mcts.MCTSBot( - game=state.env, - uct_c=2, - max_simulations=a, - evaluator=evaluator, - random_state=rnd_state, - child_selection_fn=mcts.SearchNode.puct_value, - solve=True, - ) for a in range(state.min_simulations, state.max_simulations + 1)] - -def reset(state, seed = None, options = None): + evaluator = mcts.RandomRolloutEvaluator(n_rollouts=state.n_rollouts, random_state=rnd_state) + + return [ + mcts.MCTSBot( + game=state.env, + uct_c=2, + max_simulations=a, + evaluator=evaluator, + random_state=rnd_state, + child_selection_fn=mcts.SearchNode.puct_value, + solve=True, + ) + for a in range(state.min_simulations, state.max_simulations + 1) + ] + + +def reset(state, seed=None, options=None): state.state = state.env.new_initial_state() if not state.has_reset: @@ -49,11 +46,12 @@ def reset(state, seed = None, options = None): if np.random.rand() < 0.5: bot_atn = state.bot.step(state.state) state.state.apply_action(bot_atn) - + obs, infos = get_obs_and_infos(state) player = state.state.current_player() return obs[player], infos[player] + def step(state, action): player = state.state.current_player() solve_chance_nodes(state) @@ -73,11 +71,12 @@ def step(state, action): # Are we done? 
terminated = state.state.is_terminal() if terminated: - key = f'win_mcts_{state.bot.max_simulations}' - info[key] = int(reward==1) + key = f"win_mcts_{state.bot.max_simulations}" + info[key] = int(reward == 1) return obs[player], reward, terminated, False, info + class OpenSpielGymnasiumEnvironment: __init__ = init step = step diff --git a/pufferlib/environments/open_spiel/pettingzoo_environment.py b/pufferlib/environments/open_spiel/pettingzoo_environment.py index cbfca0af4..f4df339d3 100644 --- a/pufferlib/environments/open_spiel/pettingzoo_environment.py +++ b/pufferlib/environments/open_spiel/pettingzoo_environment.py @@ -1,32 +1,33 @@ -from pdb import set_trace as T import numpy as np -import pufferlib -from pufferlib import namespace - from pufferlib.environments.open_spiel.utils import ( - solve_chance_nodes, - get_obs_and_infos, - observation_space, action_space, + close, + get_obs_and_infos, init, + observation_space, render, - close, + solve_chance_nodes, ) + def agents(state): return state.agents + def possible_agents(state): return list(range(state.env.num_players())) + def pz_observation_space(state, agent): return observation_space(state) + def pz_action_space(state, agent): return action_space(state) -def reset(state, seed = None, options = None): + +def reset(state, seed=None, options=None): state.state = state.env.new_initial_state() obs, infos = get_obs_and_infos(state) state.agents = state.possible_agents @@ -38,6 +39,7 @@ def reset(state, seed = None, options = None): return obs, infos + def step(state, actions): curr_player = state.state.current_player() solve_chance_nodes(state) @@ -56,6 +58,7 @@ def step(state, actions): return obs, rewards, terminateds, truncateds, infos + class OpenSpielPettingZooEnvironment: __init__ = init step = step diff --git a/pufferlib/environments/open_spiel/torch.py b/pufferlib/environments/open_spiel/torch.py index 4d1845b6b..7235c46bd 100644 --- a/pufferlib/environments/open_spiel/torch.py +++ b/pufferlib/environments/open_spiel/torch.py @@ -1,44 +1,43 @@ -from pdb import set_trace as T import numpy as np - import torch from torch import nn import pufferlib.emulation from pufferlib.models import Policy as Base + class Policy(Base): def __init__(self, env, input_size=128, hidden_size=128): - '''Default PyTorch policy, meant for debugging. + """Default PyTorch policy, meant for debugging. This should run with any environment but is unlikely to learn anything. - + Uses a single linear layer + relu to encode observations and a list of linear layers to decode actions. The value function is a single linear layer. 
- ''' + """ super().__init__(env) self.flat_observation_space = env.flat_observation_space self.flat_observation_structure = env.flat_observation_structure - self.encoder = nn.Linear(np.prod( - env.structured_observation_space['obs'].shape), hidden_size) + self.encoder = nn.Linear(np.prod(env.structured_observation_space["obs"].shape), hidden_size) self.decoder = nn.Linear(hidden_size, self.action_space.n) self.value_head = nn.Linear(hidden_size, 1) def encode_observations(self, observations): - '''Linear encoder function''' - observations = pufferlib.emulation.unpack_batched_obs(observations, - self.flat_observation_space, self.flat_observation_structure) - obs = observations['obs'].view(observations['obs'].shape[0], -1) - self.action_mask = observations['action_mask'] + """Linear encoder function""" + observations = pufferlib.emulation.unpack_batched_obs( + observations, self.flat_observation_space, self.flat_observation_structure + ) + obs = observations["obs"].view(observations["obs"].shape[0], -1) + self.action_mask = observations["action_mask"] hidden = torch.relu(self.encoder(obs)) return hidden, None def decode_actions(self, hidden, lookup, concat=True): - '''Concatenated linear decoder function''' + """Concatenated linear decoder function""" value = self.value_head(hidden) action = self.decoder(hidden) action = action.masked_fill(self.action_mask == 0, -1e9) - return action, value \ No newline at end of file + return action, value diff --git a/pufferlib/environments/open_spiel/utils.py b/pufferlib/environments/open_spiel/utils.py index 8eab105c2..a7591c801 100644 --- a/pufferlib/environments/open_spiel/utils.py +++ b/pufferlib/environments/open_spiel/utils.py @@ -1,19 +1,13 @@ -from pdb import set_trace as T -import numpy as np - import gymnasium +import numpy as np from pufferlib import namespace -def init(self, - env, - n_rollouts, - min_simulations, - max_simulations - ): - #state.num_agents = state.env.num_players() - return namespace(self, +def init(self, env, n_rollouts, min_simulations, max_simulations): + # state.num_agents = state.env.num_players() + return namespace( + self, env=env, type=env.get_type(), n_rollouts=n_rollouts, @@ -24,37 +18,39 @@ def init(self, has_reset=False, ) + def observation_space(state): - return gymnasium.spaces.Dict({ - 'obs': gymnasium.spaces.Box( - low=0.0, - high=1.0, - shape=(state.env.observation_tensor_size(),), - dtype=np.float32, - ), - 'action_mask': gymnasium.spaces.Box( - low=0, - high=1, - shape=(action_space(state).n,), - dtype=np.int8 - ) - }) + return gymnasium.spaces.Dict( + { + "obs": gymnasium.spaces.Box( + low=0.0, + high=1.0, + shape=(state.env.observation_tensor_size(),), + dtype=np.float32, + ), + "action_mask": gymnasium.spaces.Box(low=0, high=1, shape=(action_space(state).n,), dtype=np.int8), + } + ) + def action_space(state): - return gymnasium.spaces.Discrete( - state.env.num_distinct_actions()) + return gymnasium.spaces.Discrete(state.env.num_distinct_actions()) + def render(state, mode=None) -> None: if mode == "human": print(state.state) + def close(state): pass + def act(state, action): solve_chance_nodes(state) state.state.apply_action(action) + def get_obs_and_infos(state): # Before calculating an observation, there could be chance nodes # (that may have an effect on the actual observations). 
@@ -64,7 +60,7 @@ def get_obs_and_infos(state): if state.state.is_terminal(): return ( - state.last_obs, + state.last_obs, {player: {} for player in range(state.env.num_players())}, ) @@ -74,11 +70,13 @@ def get_obs_and_infos(state): np_mask = np.zeros(action_space(state).n) np_mask[mask] = 1 - state.last_obs = {player: { - 'obs': np.reshape(state.state.observation_tensor(), - [-1]).astype(np.float32), - 'action_mask': np_mask.astype(np.int8), - } for player in range(state.env.num_players())} + state.last_obs = { + player: { + "obs": np.reshape(state.state.observation_tensor(), [-1]).astype(np.float32), + "action_mask": np_mask.astype(np.int8), + } + for player in range(state.env.num_players()) + } state.last_info = {curr_player: {}} @@ -87,6 +85,7 @@ def get_obs_and_infos(state): state.last_info, ) + def solve_chance_nodes(state): # Before applying action(s), there could be chance nodes. # E.g. if env has to figure out, which agent's action should get @@ -94,6 +93,6 @@ def solve_chance_nodes(state): # Chance node(s): Sample a (non-player) action and apply. while state.state.is_chance_node(): assert state.state.current_player() == -1 - actions, probs = zip(*state.state.chance_outcomes()) + actions, probs = zip(*state.state.chance_outcomes(), strict=False) action = np.random.choice(actions, p=probs) state.state.apply_action(action) diff --git a/pufferlib/environments/pokemon_red/__init__.py b/pufferlib/environments/pokemon_red/__init__.py index eff86ef02..a38c82d6d 100644 --- a/pufferlib/environments/pokemon_red/__init__.py +++ b/pufferlib/environments/pokemon_red/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/pokemon_red/environment.py b/pufferlib/environments/pokemon_red/environment.py index 03acee58a..44b5c7edc 100644 --- a/pufferlib/environments/pokemon_red/environment.py +++ b/pufferlib/environments/pokemon_red/environment.py @@ -1,31 +1,32 @@ -from pdb import set_trace as T -import gymnasium import functools +import gymnasium from pokegym import Environment import pufferlib.emulation import pufferlib.postprocess -def env_creator(name='pokemon_red'): +def env_creator(name="pokemon_red"): return functools.partial(make, name) + def make(name, headless: bool = True, state_path=None, buf=None): - '''Pokemon Red''' + """Pokemon Red""" env = Environment(headless=headless, state_path=state_path) env = RenderWrapper(env) env = pufferlib.postprocess.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + class RenderWrapper(gymnasium.Wrapper): def __init__(self, env): self.env = env @property def render_mode(self): - return 'rgb_array' + return "rgb_array" def render(self): return self.env.screen.screen_ndarray() diff --git a/pufferlib/environments/pokemon_red/torch.py b/pufferlib/environments/pokemon_red/torch.py index 370b023bf..11088e26b 100644 --- a/pufferlib/environments/pokemon_red/torch.py +++ b/pufferlib/environments/pokemon_red/torch.py @@ -1,19 +1,14 @@ -from functools import partial -import torch import pufferlib.models class Recurrent(pufferlib.models.LSTMWrapper): - def __init__(self, env, policy, - input_size=512, hidden_size=512, num_layers=1): - super().__init__(env, policy, - input_size, hidden_size, num_layers) + def __init__(self, env, policy, input_size=512, hidden_size=512, num_layers=1): + super().__init__(env, policy, input_size, hidden_size, num_layers) + class Policy(pufferlib.models.Convolutional): - def __init__(self, env, - input_size=512, 
hidden_size=512, output_size=512, - framestack=4, flat_size=64*5*6): + def __init__(self, env, input_size=512, hidden_size=512, output_size=512, framestack=4, flat_size=64 * 5 * 6): super().__init__( env=env, input_size=input_size, @@ -25,7 +20,7 @@ def __init__(self, env, ) -''' +""" class Policy(pufferlib.models.ProcgenResnet): def __init__(self, env, cnn_width=16, mlp_width=512): super().__init__( @@ -33,4 +28,4 @@ def __init__(self, env, cnn_width=16, mlp_width=512): cnn_width=cnn_width, mlp_width=mlp_width, ) -''' +""" diff --git a/pufferlib/environments/procgen/__init__.py b/pufferlib/environments/procgen/__init__.py index eff86ef02..a38c82d6d 100644 --- a/pufferlib/environments/procgen/__init__.py +++ b/pufferlib/environments/procgen/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/procgen/environment.py b/pufferlib/environments/procgen/environment.py index 7dce8a1e0..97c544211 100644 --- a/pufferlib/environments/procgen/environment.py +++ b/pufferlib/environments/procgen/environment.py @@ -1,30 +1,25 @@ -from pdb import set_trace as T -import numpy as np +import functools import gym -import gymnasium +import numpy as np import shimmy -import functools import pufferlib import pufferlib.emulation import pufferlib.environments import pufferlib.postprocess -from stable_baselines3.common.atari_wrappers import ( - MaxAndSkipEnv, -) -def env_creator(name='bigfish'): +def env_creator(name="bigfish"): return functools.partial(make, name) -def make(name, num_envs=1, num_levels=0, start_level=0, - distribution_mode='easy', render_mode=None, buf=None): - '''Atari creation function with default CleanRL preprocessing based on Stable Baselines3 wrappers''' + +def make(name, num_envs=1, num_levels=0, start_level=0, distribution_mode="easy", render_mode=None, buf=None): + """Atari creation function with default CleanRL preprocessing based on Stable Baselines3 wrappers""" assert int(num_envs) == float(num_envs), "num_envs must be an integer" num_envs = int(num_envs) - procgen = pufferlib.environments.try_import('procgen') + procgen = pufferlib.environments.try_import("procgen") envs = procgen.ProcgenEnv( env_name=name, num_envs=num_envs, @@ -43,28 +38,29 @@ def make(name, num_envs=1, num_levels=0, start_level=0, assert isinstance(envs.single_action_space, gym.spaces.Discrete), "only discrete action space is supported" envs = ProcgenWrapper(envs) envs = shimmy.GymV21CompatibilityV0(env=envs, render_mode=render_mode) - #envs = gymnasium.wrappers.GrayScaleObservation(envs) - #envs = gymnasium.wrappers.FrameStack(envs, 4)#, framestack) - #envs = MaxAndSkipEnv(envs, skip=2) + # envs = gymnasium.wrappers.GrayScaleObservation(envs) + # envs = gymnasium.wrappers.FrameStack(envs, 4)#, framestack) + # envs = MaxAndSkipEnv(envs, skip=2) envs = pufferlib.postprocess.EpisodeStats(envs) return pufferlib.emulation.GymnasiumPufferEnv(env=envs, buf=buf) + class ProcgenWrapper: def __init__(self, env): self.env = env - self.observation_space = self.env.observation_space['rgb'] + self.observation_space = self.env.observation_space["rgb"] self.action_space = self.env.action_space @property def render_mode(self): - return 'rgb_array' + return "rgb_array" def reset(self, seed=None): obs = self.env.reset()[0] return obs def render(self, mode=None): - return self.env.env.env.env.env.env.get_info()[0]['rgb'] + return self.env.env.env.env.env.env.get_info()[0]["rgb"] def close(self): return self.env.close() diff --git 
a/pufferlib/environments/procgen/torch.py b/pufferlib/environments/procgen/torch.py index 924926c34..9cfe8d0ba 100644 --- a/pufferlib/environments/procgen/torch.py +++ b/pufferlib/environments/procgen/torch.py @@ -1,25 +1,26 @@ -from pdb import set_trace as T from torch import nn + import pufferlib.models # This policy ended up being useful broadly # so I included it in the defaults + class Recurrent(pufferlib.models.LSTMWrapper): def __init__(self, env, policy, input_size=256, hidden_size=256, num_layers=1): super().__init__(env, policy, input_size, hidden_size, num_layers) -class Policy (nn.Module): - def __init__(self, env, *args, input_size=256, hidden_size=256, - output_size=256, **kwargs): - '''The CleanRL default NatureCNN policy used for Atari. + +class Policy(nn.Module): + def __init__(self, env, *args, input_size=256, hidden_size=256, output_size=256, **kwargs): + """The CleanRL default NatureCNN policy used for Atari. It's just a stack of three convolutions followed by a linear layer - + Takes framestack as a mandatory keyword argument. Suggested default is 1 frame - with LSTM or 4 frames without.''' + with LSTM or 4 frames without.""" super().__init__() - self.network= nn.Sequential( + self.network = nn.Sequential( pufferlib.pytorch.layer_init(nn.Conv2d(3, 16, 8, stride=4)), nn.ReLU(), pufferlib.pytorch.layer_init(nn.Conv2d(16, 32, 4, stride=2)), @@ -28,10 +29,8 @@ def __init__(self, env, *args, input_size=256, hidden_size=256, pufferlib.pytorch.layer_init(nn.Linear(1152, hidden_size)), nn.ReLU(), ) - self.actor = pufferlib.pytorch.layer_init( - nn.Linear(hidden_size, env.single_action_space.n), std=0.01) - self.value_fn = pufferlib.pytorch.layer_init( - nn.Linear(output_size, 1), std=1) + self.actor = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, env.single_action_space.n), std=0.01) + self.value_fn = pufferlib.pytorch.layer_init(nn.Linear(output_size, 1), std=1) def forward(self, observations): hidden, lookup = self.encode_observations(observations) @@ -47,4 +46,5 @@ def decode_actions(self, flat_hidden, lookup, concat=None): value = self.value_fn(flat_hidden) return action, value + Policy = pufferlib.models.ProcgenResnet diff --git a/pufferlib/environments/slimevolley/__init__.py b/pufferlib/environments/slimevolley/__init__.py index 59cda9e7c..c24135317 100644 --- a/pufferlib/environments/slimevolley/__init__.py +++ b/pufferlib/environments/slimevolley/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/slimevolley/environment.py b/pufferlib/environments/slimevolley/environment.py index a46d79562..a5cd43cf7 100644 --- a/pufferlib/environments/slimevolley/environment.py +++ b/pufferlib/environments/slimevolley/environment.py @@ -1,25 +1,26 @@ -from pdb import set_trace as T -import numpy as np import functools import gym +import numpy as np import shimmy import pufferlib import pufferlib.emulation import pufferlib.environments -import pufferlib.utils import pufferlib.postprocess +import pufferlib.utils -def env_creator(name='SlimeVolley-v0'): +def env_creator(name="SlimeVolley-v0"): return functools.partial(make, name) -def make(name, render_mode='rgb_array', buf=None): - if name == 'slimevolley': - name = 'SlimeVolley-v0' + +def make(name, render_mode="rgb_array", buf=None): + if name == "slimevolley": + name = "SlimeVolley-v0" from slimevolleygym import SlimeVolleyEnv + SlimeVolleyEnv.atari_mode = True env = SlimeVolleyEnv() env.policy.predict = lambda obs: 
np.random.randint(0, 2, 3) @@ -29,10 +30,11 @@ def make(name, render_mode='rgb_array', buf=None): env = pufferlib.postprocess.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + class SlimeVolleyMultiDiscrete(gym.Wrapper): def __init__(self, env): super().__init__(env) - #self.action_space = gym.spaces.MultiDiscrete( + # self.action_space = gym.spaces.MultiDiscrete( # [2 for _ in range(env.action_space.n)]) def reset(self, seed=None): @@ -42,11 +44,13 @@ def step(self, action): obs, reward, done, info = self.env.step(action) return obs.astype(np.float32), reward, done, info + class SkipWrapper(gym.Wrapper): """ - Generic common frame skipping wrapper - Will perform action for `x` additional steps + Generic common frame skipping wrapper + Will perform action for `x` additional steps """ + def __init__(self, env, repeat_count): super(SkipWrapper, self).__init__(env) self.repeat_count = repeat_count @@ -67,4 +71,3 @@ def step(self, action): def reset(self): self.stepcount = 0 return self.env.reset() - diff --git a/pufferlib/environments/smac/__init__.py b/pufferlib/environments/smac/__init__.py index eff86ef02..a38c82d6d 100644 --- a/pufferlib/environments/smac/__init__.py +++ b/pufferlib/environments/smac/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/smac/environment.py b/pufferlib/environments/smac/environment.py index e104e860b..95221b8b2 100644 --- a/pufferlib/environments/smac/environment.py +++ b/pufferlib/environments/smac/environment.py @@ -6,15 +6,16 @@ import pufferlib.wrappers -def env_creator(name='smac'): +def env_creator(name="smac"): return functools.partial(make, name) + def make(name, buf=None): - '''Starcraft Multiagent Challenge creation function + """Starcraft Multiagent Challenge creation function Support for SMAC is WIP because environments do not function without - an action-masked baseline policy.''' - pufferlib.environments.try_import('smac') + an action-masked baseline policy.""" + pufferlib.environments.try_import("smac") from smac.env.pettingzoo.StarCraft2PZEnv import _parallel_env as smac_env env = smac_env(1000) diff --git a/pufferlib/environments/smac/torch.py b/pufferlib/environments/smac/torch.py index 8b13194e9..e69de29bb 100644 --- a/pufferlib/environments/smac/torch.py +++ b/pufferlib/environments/smac/torch.py @@ -1 +0,0 @@ -from pufferlib.models import Default as Policy diff --git a/pufferlib/environments/stable_retro/__init__.py b/pufferlib/environments/stable_retro/__init__.py index eff86ef02..a38c82d6d 100644 --- a/pufferlib/environments/stable_retro/__init__.py +++ b/pufferlib/environments/stable_retro/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/stable_retro/environment.py b/pufferlib/environments/stable_retro/environment.py index 092708613..e00729d40 100644 --- a/pufferlib/environments/stable_retro/environment.py +++ b/pufferlib/environments/stable_retro/environment.py @@ -1,27 +1,26 @@ -from pdb import set_trace as T -import numpy as np +import functools import gymnasium as gym -import functools +import numpy as np import pufferlib import pufferlib.emulation import pufferlib.environments -def env_creator(name='Airstriker-Genesis'): +def env_creator(name="Airstriker-Genesis"): return functools.partial(make, name) -def make(name='Airstriker-Genesis', framestack=4, buf=None): - '''Atari creation function with default CleanRL 
preprocessing based on Stable Baselines3 wrappers''' - retro = pufferlib.environments.try_import('retro', 'stable-retro') + +def make(name="Airstriker-Genesis", framestack=4, buf=None): + """Atari creation function with default CleanRL preprocessing based on Stable Baselines3 wrappers""" + retro = pufferlib.environments.try_import("retro", "stable-retro") from stable_baselines3.common.atari_wrappers import ( ClipRewardEnv, - EpisodicLifeEnv, - FireResetEnv, MaxAndSkipEnv, ) + with pufferlib.utils.Suppress(): env = retro.make(name) @@ -31,8 +30,8 @@ def make(name='Airstriker-Genesis', framestack=4, buf=None): env = gym.wrappers.ResizeObservation(env, (84, 84)) env = gym.wrappers.GrayScaleObservation(env) env = gym.wrappers.FrameStack(env, framestack) - return pufferlib.emulation.GymnasiumPufferEnv( - env=env, postprocessor_cls=AtariFeaturizer, buf=buf) + return pufferlib.emulation.GymnasiumPufferEnv(env=env, postprocessor_cls=AtariFeaturizer, buf=buf) + class AtariFeaturizer(pufferlib.emulation.Postprocessor): def reset(self, obs): @@ -40,8 +39,8 @@ def reset(self, obs): self.epoch_length = 0 self.done = False - #@property - #def observation_space(self): + # @property + # def observation_space(self): # return gym.spaces.Box(0, 255, (1, 84, 84), dtype=np.uint8) def observation(self, obs): @@ -50,11 +49,11 @@ def observation(self, obs): def reward_done_truncated_info(self, reward, done, truncated, info): return reward, done, truncated, info - if 'lives' in info: - if info['lives'] == 0 and done: - info['return'] = info['episode']['r'] - info['length'] = info['episode']['l'] - info['time'] = info['episode']['t'] + if "lives" in info: + if info["lives"] == 0 and done: + info["return"] = info["episode"]["r"] + info["length"] = info["episode"]["l"] + info["time"] = info["episode"]["t"] return reward, True, info return reward, False, info @@ -62,8 +61,8 @@ def reward_done_truncated_info(self, reward, done, truncated, info): return reward, done, info if done: - info['return'] = self.epoch_return - info['length'] = self.epoch_length + info["return"] = self.epoch_return + info["length"] = self.epoch_length self.done = True else: self.epoch_length += 1 diff --git a/pufferlib/environments/stable_retro/torch.py b/pufferlib/environments/stable_retro/torch.py index 0ab602183..ade6713dd 100644 --- a/pufferlib/environments/stable_retro/torch.py +++ b/pufferlib/environments/stable_retro/torch.py @@ -6,9 +6,9 @@ class Recurrent: hidden_size = 512 num_layers = 1 + class Policy(pufferlib.models.Convolutional): - def __init__(self, env, input_size=512, hidden_size=512, output_size=512, - framestack=4, flat_size=64*7*7): + def __init__(self, env, input_size=512, hidden_size=512, output_size=512, framestack=4, flat_size=64 * 7 * 7): super().__init__( env=env, input_size=input_size, diff --git a/pufferlib/environments/test/__init__.py b/pufferlib/environments/test/__init__.py index 9d3bfe421..ceaa0e423 100644 --- a/pufferlib/environments/test/__init__.py +++ b/pufferlib/environments/test/__init__.py @@ -1,15 +1,13 @@ from .environment import ( + MOCK_ACTION_SPACES, + MOCK_OBSERVATION_SPACES, GymnasiumPerformanceEnv, + GymnasiumTestEnv, PettingZooPerformanceEnv, - GymnasiumTestEnv, PettingZooTestEnv, make_all_mock_environments, - MOCK_OBSERVATION_SPACES, - MOCK_ACTION_SPACES, ) - -from .mock_environments import MOCK_SINGLE_AGENT_ENVIRONMENTS -from .mock_environments import MOCK_MULTI_AGENT_ENVIRONMENTS +from .mock_environments import MOCK_MULTI_AGENT_ENVIRONMENTS, MOCK_SINGLE_AGENT_ENVIRONMENTS try: import torch @@ 
-17,6 +15,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/test/environment.py b/pufferlib/environments/test/environment.py index ff82c6e47..71960e9f8 100644 --- a/pufferlib/environments/test/environment.py +++ b/pufferlib/environments/test/environment.py @@ -1,19 +1,15 @@ -from pdb import set_trace as T -import numpy as np - import time -import hashlib import gym import gymnasium -from gymnasium.spaces import Box, Discrete, Dict, Tuple -from pufferlib import spaces +import numpy as np +from gymnasium.spaces import Box, Dict, Discrete, Tuple from pettingzoo.utils.env import ParallelEnv import pufferlib import pufferlib.emulation import pufferlib.utils - +from pufferlib import spaces HIGH = 100 LOW = 0 @@ -21,208 +17,267 @@ MOCK_OBSERVATION_SPACES = [ # Atari space Box(low=0, high=255, shape=(4, 84, 84), dtype=np.uint8), - # NetHack space - Dict({ - 'blstats': Box(-2147483648, 2147483647, (27,), 'int64'), - 'chars': Box(0, 255, (21, 79), 'uint8'), - 'colors': Box(0, 15, (21, 79), 'uint8'), - 'glyphs': Box(0, 5976, (21, 79), 'int16'), - 'inv_glyphs': Box(0, 5976, (55,), 'int16'), - 'inv_letters': Box(0, 127, (55,), 'uint8'), - 'inv_oclasses': Box(0, 18, (55,), 'uint8'), - 'inv_strs': Box(0, 255, (55, 80), 'uint8'), - 'message': Box(0, 255, (256,), 'uint8'), - 'screen_descriptions': Box(0, 127, (21, 79, 80), 'uint8'), - 'specials': Box(0, 255, (21, 79), 'uint8'), - 'tty_chars': Box(0, 255, (24, 80), 'uint8'), - 'tty_colors': Box(0, 31, (24, 80), 'int8'), - 'tty_cursor': Box(0, 255, (2,), 'uint8'), - }), - + Dict( + { + "blstats": Box(-2147483648, 2147483647, (27,), "int64"), + "chars": Box(0, 255, (21, 79), "uint8"), + "colors": Box(0, 15, (21, 79), "uint8"), + "glyphs": Box(0, 5976, (21, 79), "int16"), + "inv_glyphs": Box(0, 5976, (55,), "int16"), + "inv_letters": Box(0, 127, (55,), "uint8"), + "inv_oclasses": Box(0, 18, (55,), "uint8"), + "inv_strs": Box(0, 255, (55, 80), "uint8"), + "message": Box(0, 255, (256,), "uint8"), + "screen_descriptions": Box(0, 127, (21, 79, 80), "uint8"), + "specials": Box(0, 255, (21, 79), "uint8"), + "tty_chars": Box(0, 255, (24, 80), "uint8"), + "tty_colors": Box(0, 31, (24, 80), "int8"), + "tty_cursor": Box(0, 255, (2,), "uint8"), + } + ), # Neural MMO space - Dict({ - 'ActionTargets': Dict({ - 'Attack': Dict({ - 'Style': Box(0, 1, (3,), 'int8'), - 'Target': Box(0, 1, (100,), 'int8'), - }), - 'Buy': Dict({ - 'MarketItem': Box(0, 1, (1024,), 'int8'), - }), - 'Comm': Dict({ - 'Token': Box(0, 1, (50,), 'int8'), - }), - 'Destroy': Dict({ - 'InventoryItem': Box(0, 1, (12,), 'int8'), - }), - 'Give': Dict({ - 'InventoryItem': Box(0, 1, (12,), 'int8'), - 'Target': Box(0, 1, (100,), 'int8'), - }), - 'GiveGold': Dict({ - 'Price': Box(0, 1, (99,), 'int8'), - 'Target': Box(0, 1, (100,), 'int8'), - }), - 'Move': Dict({ - 'Direction': Box(0, 1, (5,), 'int8'), - }), - 'Sell': Dict({ - 'InventoryItem': Box(0, 1, (12,), 'int8'), - 'Price': Box(0, 1, (99,), 'int8'), - }), - 'Use': Dict({ - 'InventoryItem': Box(0, 1, (12,), 'int8'), - }) - }), - 'AgentId': Discrete(129), - 'CurrentTick': Discrete(1025), - 'Entity': Box(-32768, 32767, (100, 23), 'int16'), - 'Inventory': Box(-32768, 32767, (12, 16), 'int16'), - 'Market': Box(-32768, 32767, (1024, 16), 'int16'), - 'Task': Box(-32770.0, 32770.0, (1024,), 'float16'), - 'Tile': Box(-32768, 32767, (225, 3), 'int16'), - }), - + Dict( + { + "ActionTargets": Dict( + { + "Attack": Dict( + { + "Style": Box(0, 1, (3,), "int8"), + "Target": Box(0, 1, (100,), "int8"), + 
} + ), + "Buy": Dict( + { + "MarketItem": Box(0, 1, (1024,), "int8"), + } + ), + "Comm": Dict( + { + "Token": Box(0, 1, (50,), "int8"), + } + ), + "Destroy": Dict( + { + "InventoryItem": Box(0, 1, (12,), "int8"), + } + ), + "Give": Dict( + { + "InventoryItem": Box(0, 1, (12,), "int8"), + "Target": Box(0, 1, (100,), "int8"), + } + ), + "GiveGold": Dict( + { + "Price": Box(0, 1, (99,), "int8"), + "Target": Box(0, 1, (100,), "int8"), + } + ), + "Move": Dict( + { + "Direction": Box(0, 1, (5,), "int8"), + } + ), + "Sell": Dict( + { + "InventoryItem": Box(0, 1, (12,), "int8"), + "Price": Box(0, 1, (99,), "int8"), + } + ), + "Use": Dict( + { + "InventoryItem": Box(0, 1, (12,), "int8"), + } + ), + } + ), + "AgentId": Discrete(129), + "CurrentTick": Discrete(1025), + "Entity": Box(-32768, 32767, (100, 23), "int16"), + "Inventory": Box(-32768, 32767, (12, 16), "int16"), + "Market": Box(-32768, 32767, (1024, 16), "int16"), + "Task": Box(-32770.0, 32770.0, (1024,), "float16"), + "Tile": Box(-32768, 32767, (225, 3), "int16"), + } + ), # Simple spaces Discrete(5), Box(low=LOW, high=HIGH, shape=(4,), dtype=np.float32), - # Nested spaces - Dict({ - "foo": Box(low=LOW, high=HIGH, shape=(2,), dtype=np.float32), - "bar": Box(low=LOW, high=HIGH, shape=(2,), dtype=np.float32), - }), + Dict( + { + "foo": Box(low=LOW, high=HIGH, shape=(2,), dtype=np.float32), + "bar": Box(low=LOW, high=HIGH, shape=(2,), dtype=np.float32), + } + ), Tuple((Discrete(3), Discrete(4))), - Tuple(( - Box(low=LOW, high=HIGH, shape=(2,), dtype=np.float32), - Discrete(3), - Dict({ - "baz": Box(low=LOW, high=HIGH, shape=(1,), dtype=np.float32), - "qux": Box(low=LOW, high=HIGH, shape=(1,), dtype=np.float32), - }), - )), - Dict({ - "foo": Tuple(( + Tuple( + ( Box(low=LOW, high=HIGH, shape=(2,), dtype=np.float32), Discrete(3), - )), - "bar": Dict({ - "baz": Discrete(2), - "qux": Discrete(4), - }), - }), + Dict( + { + "baz": Box(low=LOW, high=HIGH, shape=(1,), dtype=np.float32), + "qux": Box(low=LOW, high=HIGH, shape=(1,), dtype=np.float32), + } + ), + ) + ), + Dict( + { + "foo": Tuple( + ( + Box(low=LOW, high=HIGH, shape=(2,), dtype=np.float32), + Discrete(3), + ) + ), + "bar": Dict( + { + "baz": Discrete(2), + "qux": Discrete(4), + } + ), + } + ), ] MOCK_ACTION_SPACES = [ # NetHack action space Discrete(5), - # Neural MMO action space - Dict({ - 'Attack': Dict({ - 'Style': Discrete(3), - 'Target': Discrete(100), - }), - 'Buy': Dict({ - 'MarketItem': Discrete(1024), - }), - 'Comm': Dict({ - 'Token': Discrete(50), - }), - 'Destroy': Dict({ - 'InventoryItem': Discrete(12), - }), - 'Give': Dict({ - 'InventoryItem': Discrete(12), - 'Target': Discrete(100), - }), - 'GiveGold': Dict({ - 'Price': Discrete(99), - 'Target': Discrete(100), - }), - 'Move': Dict({ - 'Direction': Discrete(5), - }), - 'Sell': Dict({ - 'InventoryItem': Discrete(12), - 'Price': Discrete(99), - }), - 'Use': Dict({ - 'InventoryItem': Discrete(12), - }) - }), - + Dict( + { + "Attack": Dict( + { + "Style": Discrete(3), + "Target": Discrete(100), + } + ), + "Buy": Dict( + { + "MarketItem": Discrete(1024), + } + ), + "Comm": Dict( + { + "Token": Discrete(50), + } + ), + "Destroy": Dict( + { + "InventoryItem": Discrete(12), + } + ), + "Give": Dict( + { + "InventoryItem": Discrete(12), + "Target": Discrete(100), + } + ), + "GiveGold": Dict( + { + "Price": Discrete(99), + "Target": Discrete(100), + } + ), + "Move": Dict( + { + "Direction": Discrete(5), + } + ), + "Sell": Dict( + { + "InventoryItem": Discrete(12), + "Price": Discrete(99), + } + ), + "Use": Dict( + { + "InventoryItem": 
Discrete(12), + } + ), + } + ), # Nested spaces Tuple((Discrete(2), Discrete(3))), - Dict({ - "foo": Discrete(4), - "bar": Discrete(2), - }), - Tuple(( - Discrete(4), - Dict({ - "baz": Discrete(2), - "qux": Discrete(2), - }), - )), - Dict({ - "foo": Tuple(( - Discrete(2), - Discrete(3), - )), - "bar": Dict({ - "baz": Discrete(2), - "qux": Discrete(4), - }), - }), + Dict( + { + "foo": Discrete(4), + "bar": Discrete(2), + } + ), + Tuple( + ( + Discrete(4), + Dict( + { + "baz": Discrete(2), + "qux": Discrete(2), + } + ), + ) + ), + Dict( + { + "foo": Tuple( + ( + Discrete(2), + Discrete(3), + ) + ), + "bar": Dict( + { + "baz": Discrete(2), + "qux": Discrete(4), + } + ), + } + ), ] MOCK_TEAMS = { - 'None': None, - 'single': { - 'team_1': ['agent_1'], - 'team_2': ['agent_2'], - 'team_3': ['agent_3'], - 'team_4': ['agent_4'], - 'team_5': ['agent_5'], - 'team_6': ['agent_6'], - 'team_7': ['agent_7'], - 'team_8': ['agent_8'], - 'team_9': ['agent_9'], - 'team_10': ['agent_10'], - 'team_11': ['agent_11'], - 'team_12': ['agent_12'], - 'team_13': ['agent_13'], - 'team_14': ['agent_14'], - 'team_15': ['agent_15'], - 'team_16': ['agent_16'], + "None": None, + "single": { + "team_1": ["agent_1"], + "team_2": ["agent_2"], + "team_3": ["agent_3"], + "team_4": ["agent_4"], + "team_5": ["agent_5"], + "team_6": ["agent_6"], + "team_7": ["agent_7"], + "team_8": ["agent_8"], + "team_9": ["agent_9"], + "team_10": ["agent_10"], + "team_11": ["agent_11"], + "team_12": ["agent_12"], + "team_13": ["agent_13"], + "team_14": ["agent_14"], + "team_15": ["agent_15"], + "team_16": ["agent_16"], }, - 'pairs': { - 'team_1': ['agent_1', 'agent_2'], - 'team_2': ['agent_3', 'agent_4'], - 'team_3': ['agent_5', 'agent_6'], - 'team_4': ['agent_7', 'agent_8'], - 'team_5': ['agent_9', 'agent_10'], - 'team_6': ['agent_11', 'agent_12'], - 'team_7': ['agent_13', 'agent_14'], - 'team_8': ['agent_15', 'agent_16'], + "pairs": { + "team_1": ["agent_1", "agent_2"], + "team_2": ["agent_3", "agent_4"], + "team_3": ["agent_5", "agent_6"], + "team_4": ["agent_7", "agent_8"], + "team_5": ["agent_9", "agent_10"], + "team_6": ["agent_11", "agent_12"], + "team_7": ["agent_13", "agent_14"], + "team_8": ["agent_15", "agent_16"], }, - 'mixed': { - 'team_1': ['agent_1', 'agent_2'], - 'team_2': ['agent_3', 'agent_4', 'agent_5', 'agent_6'], - 'team_3': ['agent_7', 'agent_8', 'agent_9'], - 'team_4': ['agent_10', 'agent_11', 'agent_12', 'agent_13', 'agent_14'], - 'team_5': ['agent_15', 'agent_16'], + "mixed": { + "team_1": ["agent_1", "agent_2"], + "team_2": ["agent_3", "agent_4", "agent_5", "agent_6"], + "team_3": ["agent_7", "agent_8", "agent_9"], + "team_4": ["agent_10", "agent_11", "agent_12", "agent_13", "agent_14"], + "team_5": ["agent_15", "agent_16"], }, } -DEFAULT_OBSERVATION_SPACE = gymnasium.spaces.Box( - low=-2**20, high=2**20, - shape=(1,), dtype=np.float32 -) +DEFAULT_OBSERVATION_SPACE = gymnasium.spaces.Box(low=-(2**20), high=2**20, shape=(1,), dtype=np.float32) DEFAULT_ACTION_SPACE = gymnasium.spaces.Discrete(2) - + def make_all_mock_environments(): mock_single_agent_environments = [] @@ -235,7 +290,7 @@ def make_all_mock_environments(): action_space=act_space, ) ) - + mock_multi_agent_environments.append( PettingZooTestEnv( observation_space=obs_space, @@ -248,13 +303,15 @@ def make_all_mock_environments(): ) return mock_single_agent_environments, mock_multi_agent_environments + def do_work(delay_mean, delay_std): start, idx = time.process_time(), 0 - target_time = delay_mean + delay_std*np.random.randn() + target_time = delay_mean + 
delay_std * np.random.randn() while time.process_time() - start < target_time: idx += 1 return + class GymnasiumPerformanceEnv: def __init__(self, delay_mean=0, delay_std=0): self.observation_space = DEFAULT_OBSERVATION_SPACE @@ -277,6 +334,7 @@ def step(self, action): def close(self): pass + class PettingZooPerformanceEnv: def __init__(self, delay_mean, delay_std): self.possible_agents = [1] @@ -291,7 +349,7 @@ def observation_space(self, agent): def action_space(self, agent): return DEFAULT_ACTION_SPACE - + def reset(self, seed=None): return {1: self.observation_space(1).sample()}, {1: {}} @@ -309,10 +367,9 @@ def step(self, actions): def close(self): pass + class GymnasiumTestEnv(gym.Env): - def __init__(self, - observation_space=DEFAULT_OBSERVATION_SPACE, - action_space=DEFAULT_ACTION_SPACE): + def __init__(self, observation_space=DEFAULT_OBSERVATION_SPACE, action_space=DEFAULT_ACTION_SPACE): self.observation_space = observation_space self.action_space = action_space @@ -320,7 +377,7 @@ def reset(self, seed=None): self.tick = 0 self.rng = pufferlib.utils.RandomState(seed) - ob = _sample_space('agent_1', self.tick, self.observation_space) + ob = _sample_space("agent_1", self.tick, self.observation_space) return ob, {} def step(self, actions): @@ -328,19 +385,24 @@ def step(self, actions): done = self.tick < 10 self.tick += 1 - ob = _sample_space('agent_1', self.tick, self.observation_space) - return ob, reward, done, False, {'dead': done} + ob = _sample_space("agent_1", self.tick, self.observation_space) + return ob, reward, done, False, {"dead": done} def close(self): pass + class PettingZooTestEnv(ParallelEnv): - def __init__(self, - observation_space=DEFAULT_OBSERVATION_SPACE, - action_space=DEFAULT_ACTION_SPACE, - initial_agents=16, max_agents=16, - spawn_per_tick=0, death_per_tick=1, - homogeneous_spaces=True): + def __init__( + self, + observation_space=DEFAULT_OBSERVATION_SPACE, + action_space=DEFAULT_ACTION_SPACE, + initial_agents=16, + max_agents=16, + spawn_per_tick=0, + death_per_tick=1, + homogeneous_spaces=True, + ): self._observation_space = observation_space self._action_space = action_space self.initial_agents = initial_agents @@ -349,15 +411,14 @@ def __init__(self, self.death_per_tick = death_per_tick self.homogeneous_spaces = homogeneous_spaces - self.possible_agents = [f'agent_{i+1}' for i in range(max_agents)] + self.possible_agents = [f"agent_{i + 1}" for i in range(max_agents)] self.agents = [] def reset(self, seed=None): self.tick = 0 - self.agents = self.possible_agents[:self.initial_agents] + self.agents = self.possible_agents[: self.initial_agents] - obs = {a: _sample_space(a, self.tick, self._observation_space) - for a in self.agents} + obs = {a: _sample_space(a, self.tick, self._observation_space) for a in self.agents} infos = {a: {} for a in self.agents} return obs, infos @@ -365,17 +426,16 @@ def step(self, actions): obs, rewards, dones, truncateds, infos = {}, {}, {}, {}, {} self.tick += 1 - dead = self.agents[:self.death_per_tick] + dead = self.agents[: self.death_per_tick] for kill in dead: self.agents.remove(kill) # TODO: Make pufferlib work without pad obs # but still require rewards, dones, and optionally infos - obs[kill] = _sample_space(kill, self.tick, - self._observation_space, zero=True) + obs[kill] = _sample_space(kill, self.tick, self._observation_space, zero=True) rewards[kill] = -1 dones[kill] = True truncateds[kill] = False - infos[kill] = {'dead': True} + infos[kill] = {"dead": True} # TODO: Fix this assert self.spawn_per_tick == 0 @@ 
-391,7 +451,7 @@ def step(self, actions): rewards[agent] = 0.1 * _agent_str_to_int(agent) dones[agent] = False truncateds[agent] = False - infos[agent] = {'dead': False} + infos[agent] = {"dead": False} return obs, rewards, dones, truncateds, infos @@ -401,31 +461,33 @@ def observation_space(self, agent) -> gym.Space: def action_space(self, agent) -> gym.Space: return self._action_space - def render(self, mode='human'): + def render(self, mode="human"): pass def close(self): pass + ### Other Mock environments and utilities def _agent_str_to_int(agent): - return int(agent.split('_')[-1]) + return int(agent.split("_")[-1]) + def _sample_space(agent, tick, space, zero=False): if isinstance(agent, str): - agent = float(agent.split('_')[-1]) + agent = float(agent.split("_")[-1]) if isinstance(space, spaces.Discrete): if zero: return 0 - return int((10*agent + tick) % space.n) + return int((10 * agent + tick) % space.n) elif isinstance(space, spaces.Box): if zero: return np.zeros(space.shape, dtype=space.dtype) # Try to make a relatively unique data pattern # without using RNG - nonce = 10*agent + tick + nonce = 10 * agent + tick low = space.low high = space.high sample = low + np.arange(low.size).reshape(space.shape) + nonce diff --git a/pufferlib/environments/test/mock_environments.py b/pufferlib/environments/test/mock_environments.py index 8e8db330e..b33e71e8a 100644 --- a/pufferlib/environments/test/mock_environments.py +++ b/pufferlib/environments/test/mock_environments.py @@ -1,28 +1,26 @@ -from pdb import set_trace as T -import numpy as np - import time -import hashlib from functools import partial import gymnasium as gym -from gymnasium.spaces import Box, Discrete, Dict, Tuple +import numpy as np +from gymnasium.spaces import Box, Dict, Discrete, Tuple from pettingzoo.utils.env import ParallelEnv import pufferlib import pufferlib.emulation import pufferlib.utils - HIGH = 100 LOW = 0 + def make_performance_env(delay=0, bandwidth=1): return pufferlib.emulation.PettingZooPufferEnv( env_creator=PerformanceEnv, env_args=[delay, bandwidth], ) + class PerformanceEnv: def __init__(self, delay=0, bandwith=1): self.agents = [1] @@ -51,35 +49,32 @@ def step(self, actions): return obs, rewards, dones, truncateds, infos def observation_space(self, agent): - return Box( - low=-2**20, high=2**20, - shape=(self.bandwidth,), dtype=np.float32 - ) + return Box(low=-(2**20), high=2**20, shape=(self.bandwidth,), dtype=np.float32) def action_space(self, agent): return Discrete(2) - + ### Other Mock environments and utilities def _agent_str_to_int(agent): - return int(agent.split('_')[-1]) + return int(agent.split("_")[-1]) def _sample_space(agent, tick, space, zero=False): if isinstance(agent, str): - agent = float(agent.split('_')[-1]) + agent = float(agent.split("_")[-1]) if isinstance(space, Discrete): if zero: return 0 - return int((10*agent + tick) % space.n) + return int((10 * agent + tick) % space.n) elif isinstance(space, Box): if zero: return np.zeros(space.shape, dtype=space.dtype) # Try to make a relatively unique data pattern # without using RNG - nonce = 10*agent + tick + nonce = 10 * agent + tick low = space.low high = space.high sample = low + np.arange(low.size).reshape(space.shape) + nonce @@ -92,6 +87,7 @@ def _sample_space(agent, tick, space, zero=False): else: raise ValueError(f"Invalid space type: {type(space)}") + class GymnasiumTestEnv(gym.Env): def __init__(self, observation_space, action_space): self.observation_space = observation_space @@ -101,7 +97,7 @@ def reset(self, seed=None): 
self.tick = 0 self.rng = pufferlib.utils.RandomState(seed) - ob = _sample_space('agent_1', self.tick, self.observation_space) + ob = _sample_space("agent_1", self.tick, self.observation_space) return ob, {} def step(self, actions): @@ -109,8 +105,8 @@ def step(self, actions): done = self.tick < 10 self.tick += 1 - ob = _sample_space('agent_1', self.tick, self.observation_space) - return ob, reward, done, False, {'dead': done} + ob = _sample_space("agent_1", self.tick, self.observation_space) + return ob, reward, done, False, {"dead": done} def make_mock_singleagent_env(observation_space, action_space): @@ -120,9 +116,9 @@ def make_mock_singleagent_env(observation_space, action_space): action_space=action_space, ) + class TestEnv(ParallelEnv): - def __init__(self, observation_space, action_space, initial_agents, - max_agents, spawn_per_tick, death_per_tick): + def __init__(self, observation_space, action_space, initial_agents, max_agents, spawn_per_tick, death_per_tick): self.single_observation_space = observation_space self.single_action_space = action_space self.initial_agents = initial_agents @@ -130,15 +126,14 @@ def __init__(self, observation_space, action_space, initial_agents, self.spawn_per_tick = spawn_per_tick self.death_per_tick = death_per_tick - self.possible_agents = [f'agent_{i+1}' for i in range(max_agents)] + self.possible_agents = [f"agent_{i + 1}" for i in range(max_agents)] self.agents = [] def reset(self, seed=None): self.tick = 0 - self.agents = self.possible_agents[:self.initial_agents] + self.agents = self.possible_agents[: self.initial_agents] - obs = {a: _sample_space(a, self.tick, self.single_observation_space) - for a in self.agents} + obs = {a: _sample_space(a, self.tick, self.single_observation_space) for a in self.agents} infos = {a: {} for a in self.agents} return obs, infos @@ -146,7 +141,7 @@ def step(self, actions): obs, rewards, dones, truncateds, infos = {}, {}, {}, {}, {} self.tick += 1 - dead = self.agents[:self.death_per_tick] + dead = self.agents[: self.death_per_tick] for kill in dead: self.agents.remove(kill) # TODO: Make pufferlib work without pad obs @@ -155,7 +150,7 @@ def step(self, actions): rewards[kill] = -1 dones[kill] = True truncateds[kill] = False - infos[kill] = {'dead': True} + infos[kill] = {"dead": True} # TODO: Fix this assert self.spawn_per_tick == 0 @@ -171,7 +166,7 @@ def step(self, actions): rewards[agent] = 0.1 * _agent_str_to_int(agent) dones[agent] = False truncateds[agent] = False - infos[agent] = {'dead': False} + infos[agent] = {"dead": False} return obs, rewards, dones, truncateds, infos @@ -181,20 +176,16 @@ def observation_space(self, agent) -> gym.Space: def action_space(self, agent) -> gym.Space: return self.single_action_space - def render(self, mode='human'): + def render(self, mode="human"): pass def close(self): pass + def make_mock_multiagent_env( - observation_space, - action_space, - initial_agents, - max_agents, - spawn_per_tick, - death_per_tick, - homogeneous_spaces=True): + observation_space, action_space, initial_agents, max_agents, spawn_per_tick, death_per_tick, homogeneous_spaces=True +): return partial( TestEnv, observation_space=observation_space, @@ -209,200 +200,262 @@ def make_mock_multiagent_env( MOCK_OBSERVATION_SPACES = [ # Atari space Box(low=0, high=255, shape=(4, 84, 84), dtype=np.uint8), - # NetHack space - Dict({ - 'blstats': Box(-2147483648, 2147483647, (27,), 'int64'), - 'chars': Box(0, 255, (21, 79), 'uint8'), - 'colors': Box(0, 15, (21, 79), 'uint8'), - 'glyphs': Box(0, 5976, (21, 79), 
'int16'), - 'inv_glyphs': Box(0, 5976, (55,), 'int16'), - 'inv_letters': Box(0, 127, (55,), 'uint8'), - 'inv_oclasses': Box(0, 18, (55,), 'uint8'), - 'inv_strs': Box(0, 255, (55, 80), 'uint8'), - 'message': Box(0, 255, (256,), 'uint8'), - 'screen_descriptions': Box(0, 127, (21, 79, 80), 'uint8'), - 'specials': Box(0, 255, (21, 79), 'uint8'), - 'tty_chars': Box(0, 255, (24, 80), 'uint8'), - 'tty_colors': Box(0, 31, (24, 80), 'int8'), - 'tty_cursor': Box(0, 255, (2,), 'uint8'), - }), - + Dict( + { + "blstats": Box(-2147483648, 2147483647, (27,), "int64"), + "chars": Box(0, 255, (21, 79), "uint8"), + "colors": Box(0, 15, (21, 79), "uint8"), + "glyphs": Box(0, 5976, (21, 79), "int16"), + "inv_glyphs": Box(0, 5976, (55,), "int16"), + "inv_letters": Box(0, 127, (55,), "uint8"), + "inv_oclasses": Box(0, 18, (55,), "uint8"), + "inv_strs": Box(0, 255, (55, 80), "uint8"), + "message": Box(0, 255, (256,), "uint8"), + "screen_descriptions": Box(0, 127, (21, 79, 80), "uint8"), + "specials": Box(0, 255, (21, 79), "uint8"), + "tty_chars": Box(0, 255, (24, 80), "uint8"), + "tty_colors": Box(0, 31, (24, 80), "int8"), + "tty_cursor": Box(0, 255, (2,), "uint8"), + } + ), # Neural MMO space - Dict({ - 'ActionTargets': Dict({ - 'Attack': Dict({ - 'Style': Box(0, 1, (3,), 'int8'), - 'Target': Box(0, 1, (100,), 'int8'), - }), - 'Buy': Dict({ - 'MarketItem': Box(0, 1, (1024,), 'int8'), - }), - 'Comm': Dict({ - 'Token': Box(0, 1, (50,), 'int8'), - }), - 'Destroy': Dict({ - 'InventoryItem': Box(0, 1, (12,), 'int8'), - }), - 'Give': Dict({ - 'InventoryItem': Box(0, 1, (12,), 'int8'), - 'Target': Box(0, 1, (100,), 'int8'), - }), - 'GiveGold': Dict({ - 'Price': Box(0, 1, (99,), 'int8'), - 'Target': Box(0, 1, (100,), 'int8'), - }), - 'Move': Dict({ - 'Direction': Box(0, 1, (5,), 'int8'), - }), - 'Sell': Dict({ - 'InventoryItem': Box(0, 1, (12,), 'int8'), - 'Price': Box(0, 1, (99,), 'int8'), - }), - 'Use': Dict({ - 'InventoryItem': Box(0, 1, (12,), 'int8'), - }) - }), - 'AgentId': Discrete(129), - 'CurrentTick': Discrete(1025), - 'Entity': Box(-32768, 32767, (100, 23), 'int16'), - 'Inventory': Box(-32768, 32767, (12, 16), 'int16'), - 'Market': Box(-32768, 32767, (1024, 16), 'int16'), - 'Task': Box(-32770.0, 32770.0, (1024,), 'float16'), - 'Tile': Box(-32768, 32767, (225, 3), 'int16'), - }), - + Dict( + { + "ActionTargets": Dict( + { + "Attack": Dict( + { + "Style": Box(0, 1, (3,), "int8"), + "Target": Box(0, 1, (100,), "int8"), + } + ), + "Buy": Dict( + { + "MarketItem": Box(0, 1, (1024,), "int8"), + } + ), + "Comm": Dict( + { + "Token": Box(0, 1, (50,), "int8"), + } + ), + "Destroy": Dict( + { + "InventoryItem": Box(0, 1, (12,), "int8"), + } + ), + "Give": Dict( + { + "InventoryItem": Box(0, 1, (12,), "int8"), + "Target": Box(0, 1, (100,), "int8"), + } + ), + "GiveGold": Dict( + { + "Price": Box(0, 1, (99,), "int8"), + "Target": Box(0, 1, (100,), "int8"), + } + ), + "Move": Dict( + { + "Direction": Box(0, 1, (5,), "int8"), + } + ), + "Sell": Dict( + { + "InventoryItem": Box(0, 1, (12,), "int8"), + "Price": Box(0, 1, (99,), "int8"), + } + ), + "Use": Dict( + { + "InventoryItem": Box(0, 1, (12,), "int8"), + } + ), + } + ), + "AgentId": Discrete(129), + "CurrentTick": Discrete(1025), + "Entity": Box(-32768, 32767, (100, 23), "int16"), + "Inventory": Box(-32768, 32767, (12, 16), "int16"), + "Market": Box(-32768, 32767, (1024, 16), "int16"), + "Task": Box(-32770.0, 32770.0, (1024,), "float16"), + "Tile": Box(-32768, 32767, (225, 3), "int16"), + } + ), # Simple spaces Discrete(5), Box(low=LOW, high=HIGH, shape=(4,), 
dtype=np.float32), - # Nested spaces - Dict({ - "foo": Box(low=LOW, high=HIGH, shape=(2,), dtype=np.float32), - "bar": Box(low=LOW, high=HIGH, shape=(2,), dtype=np.float32), - }), + Dict( + { + "foo": Box(low=LOW, high=HIGH, shape=(2,), dtype=np.float32), + "bar": Box(low=LOW, high=HIGH, shape=(2,), dtype=np.float32), + } + ), Tuple((Discrete(3), Discrete(4))), - Tuple(( - Box(low=LOW, high=HIGH, shape=(2,), dtype=np.float32), - Discrete(3), - Dict({ - "baz": Box(low=LOW, high=HIGH, shape=(1,), dtype=np.float32), - "qux": Box(low=LOW, high=HIGH, shape=(1,), dtype=np.float32), - }), - )), - Dict({ - "foo": Tuple(( + Tuple( + ( Box(low=LOW, high=HIGH, shape=(2,), dtype=np.float32), Discrete(3), - )), - "bar": Dict({ - "baz": Discrete(2), - "qux": Discrete(4), - }), - }), + Dict( + { + "baz": Box(low=LOW, high=HIGH, shape=(1,), dtype=np.float32), + "qux": Box(low=LOW, high=HIGH, shape=(1,), dtype=np.float32), + } + ), + ) + ), + Dict( + { + "foo": Tuple( + ( + Box(low=LOW, high=HIGH, shape=(2,), dtype=np.float32), + Discrete(3), + ) + ), + "bar": Dict( + { + "baz": Discrete(2), + "qux": Discrete(4), + } + ), + } + ), ] MOCK_ACTION_SPACES = [ # NetHack action space Discrete(5), - # Neural MMO action space - Dict({ - 'Attack': Dict({ - 'Style': Discrete(3), - 'Target': Discrete(100), - }), - 'Buy': Dict({ - 'MarketItem': Discrete(1024), - }), - 'Comm': Dict({ - 'Token': Discrete(50), - }), - 'Destroy': Dict({ - 'InventoryItem': Discrete(12), - }), - 'Give': Dict({ - 'InventoryItem': Discrete(12), - 'Target': Discrete(100), - }), - 'GiveGold': Dict({ - 'Price': Discrete(99), - 'Target': Discrete(100), - }), - 'Move': Dict({ - 'Direction': Discrete(5), - }), - 'Sell': Dict({ - 'InventoryItem': Discrete(12), - 'Price': Discrete(99), - }), - 'Use': Dict({ - 'InventoryItem': Discrete(12), - }) - }), - + Dict( + { + "Attack": Dict( + { + "Style": Discrete(3), + "Target": Discrete(100), + } + ), + "Buy": Dict( + { + "MarketItem": Discrete(1024), + } + ), + "Comm": Dict( + { + "Token": Discrete(50), + } + ), + "Destroy": Dict( + { + "InventoryItem": Discrete(12), + } + ), + "Give": Dict( + { + "InventoryItem": Discrete(12), + "Target": Discrete(100), + } + ), + "GiveGold": Dict( + { + "Price": Discrete(99), + "Target": Discrete(100), + } + ), + "Move": Dict( + { + "Direction": Discrete(5), + } + ), + "Sell": Dict( + { + "InventoryItem": Discrete(12), + "Price": Discrete(99), + } + ), + "Use": Dict( + { + "InventoryItem": Discrete(12), + } + ), + } + ), # Nested spaces Tuple((gym.spaces.Discrete(2), gym.spaces.Discrete(3))), - Dict({ - "foo": Discrete(4), - "bar": Discrete(2), - }), - Tuple(( - Discrete(4), - Dict({ - "baz": Discrete(2), - "qux": Discrete(2), - }), - )), - Dict({ - "foo": Tuple(( - Discrete(2), - Discrete(3), - )), - "bar": Dict({ - "baz": Discrete(2), - "qux": Discrete(4), - }), - }), + Dict( + { + "foo": Discrete(4), + "bar": Discrete(2), + } + ), + Tuple( + ( + Discrete(4), + Dict( + { + "baz": Discrete(2), + "qux": Discrete(2), + } + ), + ) + ), + Dict( + { + "foo": Tuple( + ( + Discrete(2), + Discrete(3), + ) + ), + "bar": Dict( + { + "baz": Discrete(2), + "qux": Discrete(4), + } + ), + } + ), ] MOCK_TEAMS = { - 'None': None, - 'single': { - 'team_1': ['agent_1'], - 'team_2': ['agent_2'], - 'team_3': ['agent_3'], - 'team_4': ['agent_4'], - 'team_5': ['agent_5'], - 'team_6': ['agent_6'], - 'team_7': ['agent_7'], - 'team_8': ['agent_8'], - 'team_9': ['agent_9'], - 'team_10': ['agent_10'], - 'team_11': ['agent_11'], - 'team_12': ['agent_12'], - 'team_13': ['agent_13'], - 'team_14': 
['agent_14'], - 'team_15': ['agent_15'], - 'team_16': ['agent_16'], + "None": None, + "single": { + "team_1": ["agent_1"], + "team_2": ["agent_2"], + "team_3": ["agent_3"], + "team_4": ["agent_4"], + "team_5": ["agent_5"], + "team_6": ["agent_6"], + "team_7": ["agent_7"], + "team_8": ["agent_8"], + "team_9": ["agent_9"], + "team_10": ["agent_10"], + "team_11": ["agent_11"], + "team_12": ["agent_12"], + "team_13": ["agent_13"], + "team_14": ["agent_14"], + "team_15": ["agent_15"], + "team_16": ["agent_16"], }, - 'pairs': { - 'team_1': ['agent_1', 'agent_2'], - 'team_2': ['agent_3', 'agent_4'], - 'team_3': ['agent_5', 'agent_6'], - 'team_4': ['agent_7', 'agent_8'], - 'team_5': ['agent_9', 'agent_10'], - 'team_6': ['agent_11', 'agent_12'], - 'team_7': ['agent_13', 'agent_14'], - 'team_8': ['agent_15', 'agent_16'], + "pairs": { + "team_1": ["agent_1", "agent_2"], + "team_2": ["agent_3", "agent_4"], + "team_3": ["agent_5", "agent_6"], + "team_4": ["agent_7", "agent_8"], + "team_5": ["agent_9", "agent_10"], + "team_6": ["agent_11", "agent_12"], + "team_7": ["agent_13", "agent_14"], + "team_8": ["agent_15", "agent_16"], }, - 'mixed': { - 'team_1': ['agent_1', 'agent_2'], - 'team_2': ['agent_3', 'agent_4', 'agent_5', 'agent_6'], - 'team_3': ['agent_7', 'agent_8', 'agent_9'], - 'team_4': ['agent_10', 'agent_11', 'agent_12', 'agent_13', 'agent_14'], - 'team_5': ['agent_15', 'agent_16'], + "mixed": { + "team_1": ["agent_1", "agent_2"], + "team_2": ["agent_3", "agent_4", "agent_5", "agent_6"], + "team_3": ["agent_7", "agent_8", "agent_9"], + "team_4": ["agent_10", "agent_11", "agent_12", "agent_13", "agent_14"], + "team_5": ["agent_15", "agent_16"], }, } @@ -416,7 +469,7 @@ def make_mock_multiagent_env( action_space=act_space, ) ) - + MOCK_MULTI_AGENT_ENVIRONMENTS.append( make_mock_multiagent_env( observation_space=obs_space, diff --git a/pufferlib/environments/test/torch.py b/pufferlib/environments/test/torch.py index 8b13194e9..e69de29bb 100644 --- a/pufferlib/environments/test/torch.py +++ b/pufferlib/environments/test/torch.py @@ -1 +0,0 @@ -from pufferlib.models import Default as Policy diff --git a/pufferlib/environments/vizdoom/__init__.py b/pufferlib/environments/vizdoom/__init__.py index 59cda9e7c..c24135317 100644 --- a/pufferlib/environments/vizdoom/__init__.py +++ b/pufferlib/environments/vizdoom/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent except: diff --git a/pufferlib/environments/vizdoom/environment.py b/pufferlib/environments/vizdoom/environment.py index d98a263f9..30b3e177d 100644 --- a/pufferlib/environments/vizdoom/environment.py +++ b/pufferlib/environments/vizdoom/environment.py @@ -1,69 +1,66 @@ -from pdb import set_trace as T -import numpy as np import functools import gymnasium as gym +import numpy as np import pufferlib import pufferlib.emulation import pufferlib.environments -import pufferlib.utils import pufferlib.postprocess +import pufferlib.utils -def env_creator(name='doom'): +def env_creator(name="doom"): return functools.partial(make, name) -def make(name, framestack=1, render_mode='rgb_array', buf=None): - '''Atari creation function with default CleanRL preprocessing based on Stable Baselines3 wrappers''' - if name == 'doom': - name = 'VizdoomHealthGatheringSupreme-v0' - - #pufferlib.environments.try_import('vizdoom', 'gymnasium_wrapper') - from stable_baselines3.common.atari_wrappers import ( - ClipRewardEnv, - EpisodicLifeEnv, - FireResetEnv, - MaxAndSkipEnv, - NoopResetEnv, - ) + +def make(name, framestack=1, 
render_mode="rgb_array", buf=None): + """Atari creation function with default CleanRL preprocessing based on Stable Baselines3 wrappers""" + if name == "doom": + name = "VizdoomHealthGatheringSupreme-v0" + + # pufferlib.environments.try_import('vizdoom', 'gymnasium_wrapper') + # Make does not work without this imported # TODO: Fix try_import - from vizdoom import gymnasium_wrapper + with pufferlib.utils.Suppress(): env = gym.make(name, render_mode=render_mode) - env = DoomWrapper(env) # Don't use standard postprocessor + env = DoomWrapper(env) # Don't use standard postprocessor - #env = gym.wrappers.RecordEpisodeStatistics(env) - #env = NoopResetEnv(env, noop_max=30) - #env = MaxAndSkipEnv(env, skip=4) - #env = EpisodicLifeEnv(env) - #if "FIRE" in env.unwrapped.get_action_meanings(): + # env = gym.wrappers.RecordEpisodeStatistics(env) + # env = NoopResetEnv(env, noop_max=30) + # env = MaxAndSkipEnv(env, skip=4) + # env = EpisodicLifeEnv(env) + # if "FIRE" in env.unwrapped.get_action_meanings(): # env = FireResetEnv(env) - #env = ClipRewardEnv(env) - #env = gym.wrappers.GrayScaleObservation(env) - #env = gym.wrappers.FrameStack(env, framestack) + # env = ClipRewardEnv(env) + # env = gym.wrappers.GrayScaleObservation(env) + # env = gym.wrappers.FrameStack(env, framestack) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + class DoomWrapper(gym.Wrapper): - '''Gymnasium env does not expose proper options for screen scale and - render format. This is slow. So we do it ourselves. Not it is fast. Yay!''' + """Gymnasium env does not expose proper options for screen scale and + render format. This is slow. So we do it ourselves. Not it is fast. Yay!""" + def __init__(self, env): super().__init__(env.unwrapped) - if env.observation_space['screen'].shape[0] != 120: - raise ValueError('Wrong screen resolution. Doom does not provide ' - 'a way to change this. You must edit scenarios/.cfg' - 'This is inside your local ViZDoom installation. Likely in python system packages' - 'Set screen resolution to RES_160X120 and screen format to GRAY8') + if env.observation_space["screen"].shape[0] != 120: + raise ValueError( + "Wrong screen resolution. Doom does not provide " + "a way to change this. You must edit scenarios/.cfg" + "This is inside your local ViZDoom installation. 
Likely in python system packages" + "Set screen resolution to RES_160X120 and screen format to GRAY8" + ) - self.observation_space = gym.spaces.Box( - low=0, high=255, shape=(60, 80, 1), dtype=np.uint8) + self.observation_space = gym.spaces.Box(low=0, high=255, shape=(60, 80, 1), dtype=np.uint8) def reset(self, seed=None, options=None): obs, info = self.env.reset(seed=seed, options=options) - return obs['screen'][::2, ::2], {} + return obs["screen"][::2, ::2], {} def step(self, action): obs, reward, terminal, truncated, info = self.env.step(action) - return obs['screen'][::2, ::2], reward, terminal, truncated, info + return obs["screen"][::2, ::2], reward, terminal, truncated, info diff --git a/pufferlib/environments/vizdoom/torch.py b/pufferlib/environments/vizdoom/torch.py index 4fd6964a5..445d87161 100644 --- a/pufferlib/environments/vizdoom/torch.py +++ b/pufferlib/environments/vizdoom/torch.py @@ -5,9 +5,9 @@ class Recurrent(pufferlib.models.LSTMWrapper): def __init__(self, env, policy, input_size=512, hidden_size=512, num_layers=1): super().__init__(env, policy, input_size, hidden_size, num_layers) + class Policy(pufferlib.models.Convolutional): - def __init__(self, env, input_size=512, hidden_size=512, output_size=512, - framestack=1, flat_size=64*4*6): + def __init__(self, env, input_size=512, hidden_size=512, output_size=512, framestack=1, flat_size=64 * 4 * 6): super().__init__( env=env, input_size=input_size, @@ -15,5 +15,5 @@ def __init__(self, env, input_size=512, hidden_size=512, output_size=512, output_size=output_size, framestack=framestack, flat_size=flat_size, - channels_last=True + channels_last=True, ) diff --git a/pufferlib/exceptions.py b/pufferlib/exceptions.py index ec8d2cc84..0b33f313a 100644 --- a/pufferlib/exceptions.py +++ b/pufferlib/exceptions.py @@ -2,6 +2,7 @@ class EnvironmentSetupError(RuntimeError): def __init__(self, e, package): super().__init__(self.message) + class APIUsageError(RuntimeError): """Exception raised when the API is used incorrectly.""" @@ -9,12 +10,10 @@ def __init__(self, message="API usage error."): self.message = message super().__init__(self.message) + class InvalidAgentError(ValueError): """Exception raised when an invalid agent key is used.""" def __init__(self, agent_id, agents): - message = ( - f'Invalid agent/team ({agent_id}) specified. ' - f'Valid values:\n{agents}' - ) + message = f"Invalid agent/team ({agent_id}) specified. Valid values:\n{agents}" super().__init__(message) diff --git a/pufferlib/models.py b/pufferlib/models.py index c6b4754f9..25b130db1 100644 --- a/pufferlib/models.py +++ b/pufferlib/models.py @@ -1,6 +1,4 @@ -from pdb import set_trace as T import numpy as np - import torch import torch.nn as nn @@ -10,7 +8,7 @@ class Default(nn.Module): - '''Default PyTorch policy. Flattens obs and applies a linear layer. + """Default PyTorch policy. Flattens obs and applies a linear layer. PufferLib is not a framework. It does not enforce a base class. You can use any PyTorch policy that returns actions and values. @@ -20,138 +18,236 @@ class Default(nn.Module): for use with our LSTM wrapper, simply put everything from forward() before the recurrent cell into encode_observations and put everything after into decode_actions. 
- ''' - def __init__(self, env, hidden_size=128): + """ + + def __init__(self, env, hidden_size=128, use_p3o=False, p3o_horizon=32, use_diayn=False, diayn_skills=128): super().__init__() self.hidden_size = hidden_size - self.is_multidiscrete = isinstance(env.single_action_space, - pufferlib.spaces.MultiDiscrete) - self.is_continuous = isinstance(env.single_action_space, - pufferlib.spaces.Box) + self.is_multidiscrete = isinstance(env.single_action_space, pufferlib.spaces.MultiDiscrete) + self.is_continuous = isinstance(env.single_action_space, pufferlib.spaces.Box) try: - self.is_dict_obs = isinstance(env.env.observation_space, pufferlib.spaces.Dict) + self.is_dict_obs = isinstance(env.env.observation_space, pufferlib.spaces.Dict) except: - self.is_dict_obs = isinstance(env.observation_space, pufferlib.spaces.Dict) + self.is_dict_obs = isinstance(env.observation_space, pufferlib.spaces.Dict) if self.is_dict_obs: self.dtype = pufferlib.pytorch.nativize_dtype(env.emulated) input_size = int(sum(np.prod(v.shape) for v in env.env.observation_space.values())) self.encoder = nn.Linear(input_size, self.hidden_size) else: - self.encoder = nn.Linear(np.prod(env.single_observation_space.shape), hidden_size) + # self.encoder = nn.Linear(np.prod(env.single_observation_space.shape), hidden_size) + self.encoder = torch.nn.Sequential( + nn.Linear(np.prod(env.single_observation_space.shape), hidden_size), + nn.GELU(), + ) if self.is_multidiscrete: action_nvec = env.single_action_space.nvec - self.decoder = nn.ModuleList([pufferlib.pytorch.layer_init( - nn.Linear(hidden_size, n), std=0.01) for n in action_nvec]) + self.decoder = nn.ModuleList( + [pufferlib.pytorch.layer_init(nn.Linear(hidden_size, n), std=0.01) for n in action_nvec] + ) elif not self.is_continuous: - self.decoder = pufferlib.pytorch.layer_init( - nn.Linear(hidden_size, env.single_action_space.n), std=0.01) + self.decoder = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, env.single_action_space.n), std=0.01) + else: self.decoder_mean = pufferlib.pytorch.layer_init( - nn.Linear(hidden_size, env.single_action_space.shape[0]), std=0.01) - self.decoder_logstd = nn.Parameter(torch.zeros( - 1, env.single_action_space.shape[0])) + nn.Linear(hidden_size, env.single_action_space.shape[0]), std=0.01 + ) + self.decoder_logstd = nn.Parameter(torch.zeros(1, env.single_action_space.shape[0])) + + if use_diayn: + self.diayn_discriminator = nn.Sequential( + pufferlib.pytorch.layer_init(nn.Linear(hidden_size, hidden_size)), + nn.ReLU(), + pufferlib.pytorch.layer_init(nn.Linear(hidden_size, diayn_skills)), + ) + + self.use_p3o = use_p3o + self.p3o_horizon = p3o_horizon + if use_p3o: + self.value_mean = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, p3o_horizon), std=1) + self.value_logstd = nn.Parameter(torch.zeros(1, p3o_horizon)) + + # param = np.log10(np.arange(1, N+1)) + # param = 1 - np.exp(-np.sqrt(np.arange(N))) + # self.value_logstd = nn.Parameter(torch.tensor(param).view(1, N)) + # self.value_logstd = pufferlib.pytorch.layer_init( + # nn.Linear(hidden_size, 32), std=0.01) + else: + self.value = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, 1), std=1) - self.value_head = nn.Linear(hidden_size, 1) + def forward(self, observations, state=None): + hidden = self.encode_observations(observations, state=state) + state.hidden = hidden + logits, values = self.decode_actions(hidden) + return logits, values - def forward(self, observations): - hidden, lookup = self.encode_observations(observations) - actions, value = self.decode_actions(hidden, 
lookup) - return actions, value + def forward_train(self, observations, state=None): + return self.forward(observations, state) - def encode_observations(self, observations): - '''Encodes a batch of observations into hidden states. Assumes - no time dimension (handled by LSTM wrappers).''' + def encode_observations(self, observations, state=None): + """Encodes a batch of observations into hidden states. Assumes + no time dimension (handled by LSTM wrappers).""" batch_size = observations.shape[0] if self.is_dict_obs: observations = pufferlib.pytorch.nativize_tensor(observations, self.dtype) observations = torch.cat([v.view(batch_size, -1) for v in observations.values()], dim=1) - else: + else: observations = observations.view(batch_size, -1) - return torch.relu(self.encoder(observations.float())), None + return self.encoder(observations.float()) - def decode_actions(self, hidden, lookup, concat=True): - '''Decodes a batch of hidden states into (multi)discrete actions. - Assumes no time dimension (handled by LSTM wrappers).''' - value = self.value_head(hidden) + def decode_actions(self, hidden): + """Decodes a batch of hidden states into (multi)discrete actions. + Assumes no time dimension (handled by LSTM wrappers).""" if self.is_multidiscrete: - actions = [dec(hidden) for dec in self.decoder] - return actions, value + logits = [dec(hidden) for dec in self.decoder] elif self.is_continuous: mean = self.decoder_mean(hidden) logstd = self.decoder_logstd.expand_as(mean) std = torch.exp(logstd) - probs = torch.distributions.Normal(mean, std) - batch = hidden.shape[0] - return probs, value + logits = torch.distributions.Normal(mean, std) + else: + logits = self.decoder(hidden) + + if self.use_p3o: + mean = self.value_mean(hidden) + values = pufferlib.namespace( + mean=mean, + std=torch.exp(torch.clamp(self.value_logstd, -10, 10)).expand_as(mean), + ) + else: + values = self.value(hidden) - actions = self.decoder(hidden) - return actions, value + return logits, values -class LSTMWrapper(nn.Module): - def __init__(self, env, policy, input_size=128, hidden_size=128, num_layers=1): - '''Wraps your policy with an LSTM without letting you shoot yourself in the + +class LSTMWrapper(nn.LSTM): + def __init__(self, env, policy, input_size=128, hidden_size=128): + """Wraps your policy with an LSTM without letting you shoot yourself in the foot with bad transpose and shape operations. This saves much pain. Requires that your policy define encode_observations and decode_actions. 
- See the Default policy for an example.''' - super().__init__() + See the Default policy for an example.""" + super().__init__(input_size, hidden_size) self.obs_shape = env.single_observation_space.shape self.policy = policy self.input_size = input_size self.hidden_size = hidden_size - self.recurrent = nn.LSTM(input_size, hidden_size, num_layers) + self.is_continuous = self.policy.is_continuous - for name, param in self.recurrent.named_parameters(): + for name, param in self.named_parameters(): + if "layer_norm" in name: + continue if "bias" in name: nn.init.constant_(param, 0) elif "weight" in name: nn.init.orthogonal_(param, 1.0) - def forward(self, x, state): + self.cell = torch.nn.LSTMCell(input_size, hidden_size) + self.cell.weight_ih = self.weight_ih_l0 + self.cell.weight_hh = self.weight_hh_l0 + self.cell.bias_ih = self.bias_ih_l0 + self.cell.bias_hh = self.bias_hh_l0 + + # self.pre_layernorm = nn.LayerNorm(hidden_size) + # self.post_layernorm = nn.LayerNorm(hidden_size) + + def forward(self, observations, state): + """Forward function for inference. 3x faster than using LSTM directly""" + hidden = self.policy.encode_observations(observations, state=state) + h = state.lstm_h + c = state.lstm_c + + # TODO: Don't break compile + if h is not None: + assert h.shape[0] == c.shape[0] == observations.shape[0], "LSTM state must be (h, c)" + lstm_state = (h, c) + else: + lstm_state = None + + # hidden = self.pre_layernorm(hidden) + hidden, c = self.cell(hidden, lstm_state) + # hidden = self.post_layernorm(hidden) + state.hidden = hidden + state.lstm_h = hidden + state.lstm_c = c + logits, values = self.policy.decode_actions(hidden) + return logits, values + + def forward_train(self, observations, state): + """Forward function for training. Uses LSTM for fast time-batching""" + x = observations + lstm_h = state.lstm_h + lstm_c = state.lstm_c + x_shape, space_shape = x.shape, self.obs_shape x_n, space_n = len(x_shape), len(space_shape) if x_shape[-space_n:] != space_shape: - raise ValueError('Invalid input tensor shape', x.shape) + raise ValueError("Invalid input tensor shape", x.shape) if x_n == space_n + 1: B, TT = x_shape[0], 1 elif x_n == space_n + 2: B, TT = x_shape[:2] else: - raise ValueError('Invalid input tensor shape', x.shape) + raise ValueError("Invalid input tensor shape", x.shape) + + if lstm_h is not None: + assert lstm_h.shape[1] == lstm_c.shape[1] == B, "LSTM state must be (h, c)" + lstm_state = (lstm_h, lstm_c) + else: + lstm_state = None - if state is not None: - assert state[0].shape[1] == state[1].shape[1] == B + x = x.reshape(B * TT, *space_shape) + hidden = self.policy.encode_observations(x, state) + assert hidden.shape == (B * TT, self.input_size) - x = x.reshape(B*TT, *space_shape) - hidden, lookup = self.policy.encode_observations(x) - assert hidden.shape == (B*TT, self.input_size) hidden = hidden.reshape(B, TT, self.input_size) hidden = hidden.transpose(0, 1) - hidden, state = self.recurrent(hidden, state) + # hidden = self.pre_layernorm(hidden) + hidden, (lstm_h, lstm_c) = super().forward(hidden, lstm_state) + # hidden = self.post_layernorm(hidden) hidden = hidden.transpose(0, 1) - hidden = hidden.reshape(B*TT, self.hidden_size) - hidden, critic = self.policy.decode_actions(hidden, lookup) - return hidden, critic, state + flat_hidden = hidden.reshape(B * TT, self.hidden_size) + logits, values = self.policy.decode_actions(flat_hidden) + values = values.reshape(B, TT) + # state.batch_logits = logits.reshape(B, TT, -1) + state.hidden = hidden + state.lstm_h = 
lstm_h.detach() + state.lstm_c = lstm_c.detach() + return logits, values + class Convolutional(nn.Module): - def __init__(self, env, *args, framestack, flat_size, - input_size=512, hidden_size=512, output_size=512, - channels_last=False, downsample=1, **kwargs): - '''The CleanRL default NatureCNN policy used for Atari. + def __init__( + self, + env, + *args, + framestack, + flat_size, + input_size=512, + hidden_size=512, + output_size=512, + channels_last=False, + downsample=1, + **kwargs, + ): + """The CleanRL default NatureCNN policy used for Atari. It's just a stack of three convolutions followed by a linear layer - + Takes framestack as a mandatory keyword argument. Suggested default is 1 frame - with LSTM or 4 frames without.''' + with LSTM or 4 frames without.""" super().__init__() self.channels_last = channels_last self.downsample = downsample - self.network= nn.Sequential( + # TODO: Remove these from required params + self.hidden_size = hidden_size + self.is_continuous = False + + self.network = nn.Sequential( pufferlib.pytorch.layer_init(nn.Conv2d(framestack, 32, 8, stride=4)), nn.ReLU(), pufferlib.pytorch.layer_init(nn.Conv2d(32, 64, 4, stride=2)), @@ -162,37 +258,40 @@ def __init__(self, env, *args, framestack, flat_size, pufferlib.pytorch.layer_init(nn.Linear(flat_size, hidden_size)), nn.ReLU(), ) - self.actor = pufferlib.pytorch.layer_init( - nn.Linear(hidden_size, env.single_action_space.n), std=0.01) - self.value_fn = pufferlib.pytorch.layer_init( - nn.Linear(output_size, 1), std=1) + self.actor = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, env.single_action_space.n), std=0.01) + self.value_fn = pufferlib.pytorch.layer_init(nn.Linear(output_size, 1), std=1) - def forward(self, observations): + def forward(self, observations, state=None): hidden, lookup = self.encode_observations(observations) actions, value = self.decode_actions(hidden, lookup) return actions, value - def encode_observations(self, observations): + def forward_train(self, observations, state=None): + return self.forward(observations, state) + + def encode_observations(self, observations, state=None): if self.channels_last: observations = observations.permute(0, 3, 1, 2) if self.downsample > 1: - observations = observations[:, :, ::self.downsample, ::self.downsample] - return self.network(observations.float() / 255.0), None + observations = observations[:, :, :: self.downsample, :: self.downsample] + return self.network(observations.float() / 255.0) - def decode_actions(self, flat_hidden, lookup, concat=None): + def decode_actions(self, flat_hidden): action = self.actor(flat_hidden) value = self.value_fn(flat_hidden) return action, value + class ProcgenResnet(nn.Module): - '''Procgen baseline from the AICrowd NeurIPS 2020 competition - Based on the ResNet architecture that was used in the Impala paper.''' + """Procgen baseline from the AICrowd NeurIPS 2020 competition + Based on the ResNet architecture that was used in the Impala paper.""" + def __init__(self, env, cnn_width=16, mlp_width=256): super().__init__() h, w, c = env.single_observation_space.shape shape = (c, h, w) conv_seqs = [] - for out_channels in [cnn_width, 2*cnn_width, 2*cnn_width]: + for out_channels in [cnn_width, 2 * cnn_width, 2 * cnn_width]: conv_seq = ConvSequence(shape, out_channels) shape = conv_seq.get_output_shape() conv_seqs.append(conv_seq) @@ -203,10 +302,8 @@ def __init__(self, env, cnn_width=16, mlp_width=256): nn.ReLU(), ] self.network = nn.Sequential(*conv_seqs) - self.actor = pufferlib.pytorch.layer_init( - 
nn.Linear(mlp_width, env.single_action_space.n), std=0.01) - self.value = pufferlib.pytorch.layer_init( - nn.Linear(mlp_width, 1), std=1) + self.actor = pufferlib.pytorch.layer_init(nn.Linear(mlp_width, env.single_action_space.n), std=0.01) + self.value = pufferlib.pytorch.layer_init(nn.Linear(mlp_width, 1), std=1) def forward(self, observations): hidden, lookup = self.encode_observations(observations) @@ -216,13 +313,14 @@ def forward(self, observations): def encode_observations(self, x): hidden = self.network(x.permute((0, 3, 1, 2)) / 255.0) return hidden, None - + def decode_actions(self, hidden, lookup): - '''linear decoder function''' + """linear decoder function""" action = self.actor(hidden) value = self.value(hidden) return action, value + class ResidualBlock(nn.Module): def __init__(self, channels): super().__init__() @@ -237,12 +335,15 @@ def forward(self, x): x = self.conv1(x) return x + inputs + class ConvSequence(nn.Module): def __init__(self, input_shape, out_channels): super().__init__() self._input_shape = input_shape self._out_channels = out_channels - self.conv = nn.Conv2d(in_channels=self._input_shape[0], out_channels=self._out_channels, kernel_size=3, padding=1) + self.conv = nn.Conv2d( + in_channels=self._input_shape[0], out_channels=self._out_channels, kernel_size=3, padding=1 + ) self.res_block0 = ResidualBlock(self._out_channels) self.res_block1 = ResidualBlock(self._out_channels) diff --git a/pufferlib/namespace.py b/pufferlib/namespace.py index dff138f64..e8b18ee57 100644 --- a/pufferlib/namespace.py +++ b/pufferlib/namespace.py @@ -1,25 +1,31 @@ -from pdb import set_trace as T -from types import SimpleNamespace from collections.abc import Mapping +from types import SimpleNamespace + def __getitem__(self, key): return self.__dict__[key] + def keys(self): return self.__dict__.keys() + def values(self): return self.__dict__.values() + def items(self): return self.__dict__.items() + def __iter__(self): return iter(self.__dict__) + def __len__(self): return len(self.__dict__) + class Namespace(SimpleNamespace, Mapping): __getitem__ = __getitem__ __iter__ = __iter__ @@ -28,27 +34,29 @@ class Namespace(SimpleNamespace, Mapping): values = values items = items + def dataclass(cls): # Safely get annotations - annotations = getattr(cls, '__annotations__', {}) + annotations = getattr(cls, "__annotations__", {}) # Combine both annotated and non-annotated fields all_fields = {**{k: None for k in annotations.keys()}, **cls.__dict__} - all_fields = {k: v for k, v in all_fields.items() if not callable(v) and not k.startswith('__')} + all_fields = {k: v for k, v in all_fields.items() if not callable(v) and not k.startswith("__")} def __init__(self, **kwargs): for field, default_value in all_fields.items(): setattr(self, field, kwargs.get(field, default_value)) cls.__init__ = __init__ - setattr(cls, "__getitem__", __getitem__) - setattr(cls, "__iter__", __iter__) - setattr(cls, "__len__", __len__) - setattr(cls, "keys", keys) - setattr(cls, "values", values) - setattr(cls, "items", items) + cls.__getitem__ = __getitem__ + cls.__iter__ = __iter__ + cls.__len__ = __len__ + cls.keys = keys + cls.values = values + cls.items = items return cls + def namespace(self=None, **kwargs): if self is None: return Namespace(**kwargs) diff --git a/pufferlib/ocean/__init__.py b/pufferlib/ocean/__init__.py index 1c91a2b0f..55822f8a5 100644 --- a/pufferlib/ocean/__init__.py +++ b/pufferlib/ocean/__init__.py @@ -6,6 +6,7 @@ pass else: from .torch import Policy + try: from .torch import Recurrent 
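Note on the LSTMWrapper change earlier in this patch: the wrapper now subclasses nn.LSTM but routes single-step inference through an nn.LSTMCell whose parameters are bound to the LSTM's layer-0 weights, so rollouts use the cheap cell while training still uses the time-batched LSTM. A standalone sketch (not part of the patch) showing why that binding is safe — one cell step matches one LSTM step on the same weights:

import torch
import torch.nn as nn

torch.manual_seed(0)
lstm = nn.LSTM(input_size=8, hidden_size=16)
cell = nn.LSTMCell(8, 16)
# Bind the cell's parameters to the LSTM's layer-0 parameters, as LSTMWrapper does.
cell.weight_ih = lstm.weight_ih_l0
cell.weight_hh = lstm.weight_hh_l0
cell.bias_ih = lstm.bias_ih_l0
cell.bias_hh = lstm.bias_hh_l0

x = torch.randn(4, 8)                      # batch of 4 observations, one timestep
h0 = torch.zeros(1, 4, 16)
c0 = torch.zeros(1, 4, 16)
out, _ = lstm(x.unsqueeze(0), (h0, c0))    # full LSTM with a time dimension of 1
h1, _ = cell(x, (h0[0], c0[0]))            # single LSTMCell step on the same weights
assert torch.allclose(out[0], h1, atol=1e-6)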
except: diff --git a/pufferlib/ocean/breakout/breakout.py b/pufferlib/ocean/breakout/breakout.py index eab63f098..d74560d47 100644 --- a/pufferlib/ocean/breakout/breakout.py +++ b/pufferlib/ocean/breakout/breakout.py @@ -1,35 +1,62 @@ -'''High-perf Pong +"""High-perf Pong Inspired from https://gist.github.com/Yttrmin/18ecc3d2d68b407b4be1 & https://jair.org/index.php/jair/article/view/10819/25823 & https://www.youtube.com/watch?v=PSQt5KGv7Vk -''' +""" -import numpy as np import gymnasium - +import numpy as np import pufferlib from pufferlib.ocean.breakout.cy_breakout import CyBreakout + class Breakout(pufferlib.PufferEnv): - def __init__(self, num_envs=1, render_mode=None, report_interval=128, - frameskip=4, width=576, height=330, - paddle_width=62, paddle_height=8, - ball_width=32, ball_height=32, - brick_width=32, brick_height=12, - brick_rows=6, brick_cols=18, buf=None): - self.single_observation_space = gymnasium.spaces.Box(low=0, high=1, - shape=(11 + brick_rows*brick_cols,), dtype=np.float32) + def __init__( + self, + num_envs=1, + render_mode=None, + report_interval=128, + frameskip=4, + width=576, + height=330, + paddle_width=62, + paddle_height=8, + ball_width=32, + ball_height=32, + brick_width=32, + brick_height=12, + brick_rows=6, + brick_cols=18, + buf=None, + ): + self.single_observation_space = gymnasium.spaces.Box( + low=0, high=1, shape=(11 + brick_rows * brick_cols,), dtype=np.float32 + ) self.single_action_space = gymnasium.spaces.Discrete(4) self.report_interval = report_interval self.render_mode = render_mode self.num_agents = num_envs super().__init__(buf) - self.c_envs = CyBreakout(self.observations, self.actions, self.rewards, - self.terminals, num_envs, frameskip, width, height, - paddle_width, paddle_height, ball_width, ball_height, - brick_width, brick_height, brick_rows, brick_cols) + self.c_envs = CyBreakout( + self.observations, + self.actions, + self.rewards, + self.terminals, + num_envs, + frameskip, + width, + height, + paddle_width, + paddle_height, + ball_width, + ball_height, + brick_width, + brick_height, + brick_rows, + brick_cols, + ) def reset(self, seed=None): self.c_envs.reset() @@ -43,12 +70,11 @@ def step(self, actions): info = [] if self.tick % self.report_interval == 0: log = self.c_envs.log() - if log['episode_length'] > 0: + if log["episode_length"] > 0: info.append(log) self.tick += 1 - return (self.observations, self.rewards, - self.terminals, self.truncations, info) + return (self.observations, self.rewards, self.terminals, self.truncations, info) def render(self): self.c_envs.render() @@ -56,6 +82,7 @@ def render(self): def close(self): self.c_envs.close() + def test_performance(timeout=10, atn_cache=1024): env = CyBreakout(num_envs=1000) env.reset() @@ -64,13 +91,15 @@ def test_performance(timeout=10, atn_cache=1024): actions = np.random.randint(0, 2, (atn_cache, env.num_envs)) import time + start = time.time() while time.time() - start < timeout: atn = actions[tick % atn_cache] env.step(atn) tick += 1 - print(f'SPS: %f', env.num_envs * tick / (time.time() - start)) + print("SPS: %f", env.num_envs * tick / (time.time() - start)) + -if __name__ == '__main__': +if __name__ == "__main__": test_performance() diff --git a/pufferlib/ocean/connect4/connect4.py b/pufferlib/ocean/connect4/connect4.py index a6bed18bd..59009db59 100644 --- a/pufferlib/ocean/connect4/connect4.py +++ b/pufferlib/ocean/connect4/connect4.py @@ -1,31 +1,46 @@ -'''High-perf Pong +"""High-perf Pong Inspired from https://gist.github.com/Yttrmin/18ecc3d2d68b407b4be1 & 
https://jair.org/index.php/jair/article/view/10819/25823 & https://www.youtube.com/watch?v=PSQt5KGv7Vk -''' +""" -import numpy as np import gymnasium - +import numpy as np import pufferlib from pufferlib.ocean.connect4.cy_connect4 import CyConnect4 class Connect4(pufferlib.PufferEnv): - def __init__(self, num_envs=1, render_mode=None, report_interval=128, - width=672, height=576, piece_width=96, piece_height=96, buf=None): - - self.single_observation_space = gymnasium.spaces.Box(low=0, high=1, - shape=(42,), dtype=np.float32) + def __init__( + self, + num_envs=1, + render_mode=None, + report_interval=128, + width=672, + height=576, + piece_width=96, + piece_height=96, + buf=None, + ): + self.single_observation_space = gymnasium.spaces.Box(low=0, high=1, shape=(42,), dtype=np.float32) self.single_action_space = gymnasium.spaces.Discrete(7) self.report_interval = report_interval self.render_mode = render_mode self.num_agents = num_envs super().__init__(buf=buf) - self.c_envs = CyConnect4(self.observations, self.actions, self.rewards, - self.terminals, num_envs, width, height, piece_width, piece_height) + self.c_envs = CyConnect4( + self.observations, + self.actions, + self.rewards, + self.terminals, + num_envs, + width, + height, + piece_width, + piece_height, + ) def reset(self, seed=None): self.c_envs.reset() @@ -40,11 +55,10 @@ def step(self, actions): info = [] if self.tick % self.report_interval == 0: log = self.c_envs.log() - if log['episode_length'] > 0: + if log["episode_length"] > 0: info.append(log) - return (self.observations, self.rewards, - self.terminals, self.truncations, info) + return (self.observations, self.rewards, self.terminals, self.truncations, info) def render(self): self.c_envs.render() @@ -68,12 +82,12 @@ def test_performance(timeout=10, atn_cache=1024, num_envs=1024): start = time.time() while time.time() - start < timeout: - atn = actions[tick % atn_cache] + atn = actions[tick % atn_cache] env.step(atn) tick += 1 - print(f'SPS: {num_envs * tick / (time.time() - start)}') + print(f"SPS: {num_envs * tick / (time.time() - start)}") -if __name__ == '__main__': +if __name__ == "__main__": test_performance() diff --git a/pufferlib/ocean/enduro/enduro.py b/pufferlib/ocean/enduro/enduro.py index b2cbb3070..34aeb54c9 100644 --- a/pufferlib/ocean/enduro/enduro.py +++ b/pufferlib/ocean/enduro/enduro.py @@ -1,14 +1,13 @@ # enduro_clone.py -import numpy as np import gymnasium +import numpy as np import pufferlib from pufferlib.ocean.enduro.cy_enduro import CyEnduro -class Enduro(pufferlib.PufferEnv): - def __init__(self, num_envs=1, frame_skip=1, render_mode='human', - report_interval=1, buf=None): +class Enduro(pufferlib.PufferEnv): + def __init__(self, num_envs=1, frame_skip=1, render_mode="human", report_interval=1, buf=None): self.render_mode = render_mode self.num_agents = num_envs self.report_interval = report_interval @@ -16,12 +15,10 @@ def __init__(self, num_envs=1, frame_skip=1, render_mode='human', self.tick = 0 self.max_enemies = 10 - obs_size = (8 + (5 * self.max_enemies) + 9 + 1) + obs_size = 8 + (5 * self.max_enemies) + 9 + 1 self.num_obs = obs_size - self.single_observation_space = gymnasium.spaces.Box( - low=0, high=1, shape=(self.num_obs,), dtype=np.float32 - ) + self.single_observation_space = gymnasium.spaces.Box(low=0, high=1, shape=(self.num_obs,), dtype=np.float32) self.single_action_space = gymnasium.spaces.Discrete(9) # Initialize buffers @@ -37,19 +34,14 @@ def __init__(self, num_envs=1, frame_skip=1, render_mode='human', super().__init__(buf=buf) 
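The reformatted Breakout, Connect4, and Enduro constructors above all share the same wiring: declare single_observation_space / single_action_space and num_agents, call super().__init__(buf) to get the shared step buffers, then hand those buffers to a Cython backend. A minimal sketch of that pattern, with a hypothetical CyExample backend left commented out so the snippet stands on its own:

import gymnasium
import numpy as np
import pufferlib

class Example(pufferlib.PufferEnv):
    """Sketch of the Ocean env pattern used by the files in this patch."""

    def __init__(self, num_envs=1, render_mode=None, buf=None):
        self.single_observation_space = gymnasium.spaces.Box(low=0, high=1, shape=(8,), dtype=np.float32)
        self.single_action_space = gymnasium.spaces.Discrete(3)
        self.render_mode = render_mode
        self.num_agents = num_envs
        super().__init__(buf=buf)  # provides observations/actions/rewards/terminals/truncations
        # self.c_envs = CyExample(self.observations, self.actions, self.rewards, self.terminals, num_envs)

    def reset(self, seed=None):
        # self.c_envs.reset()
        return self.observations, []

    def step(self, actions):
        self.actions[:] = actions
        # self.c_envs.step()
        return (self.observations, self.rewards, self.terminals, self.truncations, [])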
self.c_envs = CyEnduro( - self.observations, - self.actions, - self.rewards, - self.terminals, - self.truncations, - num_envs + self.observations, self.actions, self.rewards, self.terminals, self.truncations, num_envs ) - + def reset(self, seed=None): self.tick = 0 self.c_envs.reset() return self.observations, [] - + def step(self, actions): for _ in range(self.frame_skip): self.actions[:] = actions @@ -65,34 +57,30 @@ def step(self, actions): info = [] if self.tick % self.report_interval == 0: rewards = np.mean(self.rewards_buffer) - info.append({'rewards': rewards}) + info.append({"rewards": rewards}) self.rewards_buffer = [] log = self.c_envs.log() - if log['episode_length'] > 0: + if log["episode_length"] > 0: info.append(log) self.tick += 1 - return ( - self.observations, - self.rewards, - self.terminals, - self.truncations, - info - ) - + return (self.observations, self.rewards, self.terminals, self.truncations, info) + def render(self): self.c_envs.render() def close(self): self.c_envs.close() - + def validate_probabilities(prob_tensor): import torch + if torch.isnan(prob_tensor).any() or torch.isinf(prob_tensor).any() or (prob_tensor < 0).any(): raise ValueError("Invalid probability values detected") return prob_tensor - + + def test_performance(timeout=10, atn_cache=8192): num_envs = 4096 env = Enduro(num_envs=num_envs) @@ -102,6 +90,7 @@ def test_performance(timeout=10, atn_cache=8192): actions = np.random.randint(0, env.single_action_space.n, (atn_cache, num_envs)) import time + start = time.time() while time.time() - start < timeout: atn = actions[tick % atn_cache] @@ -109,7 +98,8 @@ def test_performance(timeout=10, atn_cache=8192): tick += 1 sps = num_envs * tick / (time.time() - start) - print(f'SPS: {sps:,}') + print(f"SPS: {sps:,}") + -if __name__ == '__main__': +if __name__ == "__main__": test_performance() diff --git a/pufferlib/ocean/environment.py b/pufferlib/ocean/environment.py index 91ca68b12..c7dac7561 100644 --- a/pufferlib/ocean/environment.py +++ b/pufferlib/ocean/environment.py @@ -1,168 +1,231 @@ import pufferlib.emulation import pufferlib.postprocess -from .snake.snake import Snake -from .squared.squared import Squared -from .squared.pysquared import PySquared -from .pong.pong import Pong from .breakout.breakout import Breakout -from .enduro.enduro import Enduro from .connect4.connect4 import Connect4 -from .tripletriad.tripletriad import TripleTriad -from .tactical.tactical import Tactical +from .enduro.enduro import Enduro +from .go.go import Go from .moba.moba import Moba from .nmmo3.nmmo3 import NMMO3 -from .go.go import Go +from .pong.pong import Pong from .rware.rware import Rware -#from .rocket_lander import rocket_lander +from .snake.snake import Snake +from .squared.pysquared import PySquared +from .squared.squared import Squared +from .tactical.tactical import Tactical + +# from .rocket_lander import rocket_lander from .trash_pickup.trash_pickup import TrashPickupEnv +from .tripletriad.tripletriad import TripleTriad -def make_foraging(width=1080, height=720, num_agents=4096, horizon=512, - discretize=True, food_reward=0.1, render_mode='rgb_array'): + +def make_foraging( + width=1080, height=720, num_agents=4096, horizon=512, discretize=True, food_reward=0.1, render_mode="rgb_array" +): from .grid import grid + init_fn = grid.init_foraging reward_fn = grid.reward_foraging - return grid.PufferGrid(width, height, num_agents, - horizon, discretize=discretize, food_reward=food_reward, init_fn=init_fn, reward_fn=reward_fn, render_mode=render_mode) - -def 
make_predator_prey(width=1080, height=720, num_agents=4096, horizon=512, - discretize=True, food_reward=0.1, render_mode='rgb_array'): + return grid.PufferGrid( + width, + height, + num_agents, + horizon, + discretize=discretize, + food_reward=food_reward, + init_fn=init_fn, + reward_fn=reward_fn, + render_mode=render_mode, + ) + + +def make_predator_prey( + width=1080, height=720, num_agents=4096, horizon=512, discretize=True, food_reward=0.1, render_mode="rgb_array" +): from .grid import grid + init_fn = grid.init_predator_prey reward_fn = grid.reward_predator_prey - return grid.PufferGrid(width, height, num_agents, - horizon, discretize=discretize, food_reward=food_reward, - init_fn=init_fn, reward_fn=reward_fn, - render_mode=render_mode) - -def make_group(width=1080, height=720, num_agents=4096, horizon=512, - discretize=True, food_reward=0.1, render_mode='rgb_array'): + return grid.PufferGrid( + width, + height, + num_agents, + horizon, + discretize=discretize, + food_reward=food_reward, + init_fn=init_fn, + reward_fn=reward_fn, + render_mode=render_mode, + ) + + +def make_group( + width=1080, height=720, num_agents=4096, horizon=512, discretize=True, food_reward=0.1, render_mode="rgb_array" +): from .grid import grid + init_fn = grid.init_group reward_fn = grid.reward_group - return grid.PufferGrid(width, height, num_agents, - horizon, discretize=discretize, food_reward=food_reward, - init_fn=init_fn, reward_fn=reward_fn, - render_mode=render_mode) - -def make_puffer(width=1080, height=720, num_agents=4096, horizon=512, - discretize=True, food_reward=0.1, render_mode='rgb_array'): + return grid.PufferGrid( + width, + height, + num_agents, + horizon, + discretize=discretize, + food_reward=food_reward, + init_fn=init_fn, + reward_fn=reward_fn, + render_mode=render_mode, + ) + + +def make_puffer( + width=1080, height=720, num_agents=4096, horizon=512, discretize=True, food_reward=0.1, render_mode="rgb_array" +): from .grid import grid + init_fn = grid.init_puffer reward_fn = grid.reward_puffer - return grid.PufferGrid(width, height, num_agents, - horizon, discretize=discretize, food_reward=food_reward, - init_fn=init_fn, reward_fn=reward_fn, - render_mode=render_mode) - -def make_puffergrid(render_mode='rgb_array', vision_range=3): - assert False, 'This env is unfinished. Join our Discord and help us finish it!' + return grid.PufferGrid( + width, + height, + num_agents, + horizon, + discretize=discretize, + food_reward=food_reward, + init_fn=init_fn, + reward_fn=reward_fn, + render_mode=render_mode, + ) + + +def make_puffergrid(render_mode="rgb_array", vision_range=3): + assert False, "This env is unfinished. Join our Discord and help us finish it!" from .grid import grid + return grid.PufferGrid(render_mode, vision_range) + def make_continuous(discretize=False, buf=None, **kwargs): from . import sanity + env = sanity.Continuous(discretize=discretize) if not discretize: env = pufferlib.postprocess.ClipAction(env) env = pufferlib.postprocess.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + def make_squared(distance_to_target=3, num_targets=1, buf=None, **kwargs): from . import sanity + env = sanity.Squared(distance_to_target=distance_to_target, num_targets=num_targets, **kwargs) env = pufferlib.postprocess.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf, **kwargs) + def make_bandit(num_actions=10, reward_scale=1, reward_noise=1, buf=None): from . 
import sanity - env = sanity.Bandit(num_actions=num_actions, reward_scale=reward_scale, - reward_noise=reward_noise) + + env = sanity.Bandit(num_actions=num_actions, reward_scale=reward_scale, reward_noise=reward_noise) env = pufferlib.postprocess.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + def make_memory(mem_length=2, mem_delay=2, buf=None, **kwargs): from . import sanity + env = sanity.Memory(mem_length=mem_length, mem_delay=mem_delay) env = pufferlib.postprocess.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + def make_password(password_length=5, buf=None, **kwargs): from . import sanity + env = sanity.Password(password_length=password_length) env = pufferlib.postprocess.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + def make_performance(delay_mean=0, delay_std=0, bandwidth=1, buf=None, **kwargs): from . import sanity + env = sanity.Performance(delay_mean=delay_mean, delay_std=delay_std, bandwidth=bandwidth) env = pufferlib.postprocess.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + def make_performance_empiric(count_n=0, count_std=0, bandwidth=1, buf=None, **kwargs): from . import sanity + env = sanity.PerformanceEmpiric(count_n=count_n, count_std=count_std, bandwidth=bandwidth) env = pufferlib.postprocess.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + def make_stochastic(p=0.7, horizon=100, buf=None, **kwargs): from . import sanity + env = sanity.Stochastic(p=p, horizon=100) env = pufferlib.postprocess.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + def make_spaces(buf=None, **kwargs): from . import sanity + env = sanity.Spaces() env = pufferlib.postprocess.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf, **kwargs) + def make_multiagent(buf=None, **kwargs): from . 
import sanity + env = sanity.Multiagent() env = pufferlib.postprocess.MultiagentEpisodeStats(env) return pufferlib.emulation.PettingZooPufferEnv(env=env, buf=buf) -MAKE_FNS = { - 'breakout': Breakout, - 'pong': Pong, - 'enduro': Enduro, - 'moba': Moba, - 'nmmo3': NMMO3, - 'snake': Snake, - 'squared': Squared, - 'pysquared': PySquared, - 'connect4': Connect4, - 'tripletriad': TripleTriad, - 'tactical': Tactical, - 'go': Go, - 'rware': Rware, - 'trash_pickup': TrashPickupEnv, +MAKE_FNS = { + "breakout": Breakout, + "pong": Pong, + "enduro": Enduro, + "moba": Moba, + "nmmo3": NMMO3, + "snake": Snake, + "squared": Squared, + "pysquared": PySquared, + "connect4": Connect4, + "tripletriad": TripleTriad, + "tactical": Tactical, + "go": Go, + "rware": Rware, + "trash_pickup": TrashPickupEnv, #'rocket_lander': rocket_lander.RocketLander, - 'foraging': make_foraging, - 'predator_prey': make_predator_prey, - 'group': make_group, - 'puffer': make_puffer, - 'puffer_grid': make_puffergrid, - 'continuous': make_continuous, - 'bandit': make_bandit, - 'memory': make_memory, - 'password': make_password, - 'stochastic': make_stochastic, - 'multiagent': make_multiagent, - 'spaces': make_spaces, - 'performance': make_performance, - 'performance_empiric': make_performance_empiric, + "foraging": make_foraging, + "predator_prey": make_predator_prey, + "group": make_group, + "puffer": make_puffer, + "puffer_grid": make_puffergrid, + "continuous": make_continuous, + "bandit": make_bandit, + "memory": make_memory, + "password": make_password, + "stochastic": make_stochastic, + "multiagent": make_multiagent, + "spaces": make_spaces, + "performance": make_performance, + "performance_empiric": make_performance_empiric, } # Alias puffer_ to all names -MAKE_FNS = {**MAKE_FNS, **{'puffer_' + k: v for k, v in MAKE_FNS.items()}} +MAKE_FNS = {**MAKE_FNS, **{"puffer_" + k: v for k, v in MAKE_FNS.items()}} -def env_creator(name='squared'): + +def env_creator(name="squared"): if name in MAKE_FNS: return MAKE_FNS[name] else: - raise ValueError(f'Invalid environment name: {name}') - - + raise ValueError(f"Invalid environment name: {name}") diff --git a/pufferlib/ocean/go/go.py b/pufferlib/ocean/go/go.py index 69d54560e..4291342f3 100644 --- a/pufferlib/ocean/go/go.py +++ b/pufferlib/ocean/go/go.py @@ -1,51 +1,73 @@ -'''High-perf Pong +"""High-perf Pong Inspired from https://gist.github.com/Yttrmin/18ecc3d2d68b407b4be1 & https://jair.org/index.php/jair/article/view/10819/25823 & https://www.youtube.com/watch?v=PSQt5KGv7Vk -''' +""" -import numpy as np import gymnasium - +import numpy as np import pufferlib from pufferlib.ocean.go.cy_go import CyGo -class Go(pufferlib.PufferEnv): - def __init__(self, num_envs=1, render_mode=None, report_interval=1, - width=950, height=800, - grid_size=7, - board_width=600, board_height=600, - grid_square_size=600/9, - moves_made=0, - komi=7.5, - score = 0.0, - last_capture_position=-1, - reward_move_pass = -0.25, - reward_move_invalid = -0.1, - reward_move_valid = 0.1, - reward_player_capture = 0.25, - reward_opponent_capture = -0.25, - buf = None): +class Go(pufferlib.PufferEnv): + def __init__( + self, + num_envs=1, + render_mode=None, + report_interval=1, + width=950, + height=800, + grid_size=7, + board_width=600, + board_height=600, + grid_square_size=600 / 9, + moves_made=0, + komi=7.5, + score=0.0, + last_capture_position=-1, + reward_move_pass=-0.25, + reward_move_invalid=-0.1, + reward_move_valid=0.1, + reward_player_capture=0.25, + reward_opponent_capture=-0.25, + buf=None, + ): # env 
self.num_agents = num_envs self.render_mode = render_mode self.report_interval = report_interval - - self.num_obs = (grid_size) * (grid_size)*2 + 2 + + self.num_obs = (grid_size) * (grid_size) * 2 + 2 self.num_act = (grid_size) * (grid_size) + 1 - self.single_observation_space = gymnasium.spaces.Box(low=0, high=1, - shape=(self.num_obs,), dtype=np.float32) + self.single_observation_space = gymnasium.spaces.Box(low=0, high=1, shape=(self.num_obs,), dtype=np.float32) self.single_action_space = gymnasium.spaces.Discrete(self.num_act) super().__init__(buf=buf) - height = 64*(grid_size+1) - self.c_envs = CyGo(self.observations, self.actions, self.rewards, - self.terminals, num_envs, width, height, grid_size, board_width, - board_height, grid_square_size, moves_made, komi, score, - last_capture_position, reward_move_pass, reward_move_invalid, - reward_move_valid, reward_player_capture, reward_opponent_capture) + height = 64 * (grid_size + 1) + self.c_envs = CyGo( + self.observations, + self.actions, + self.rewards, + self.terminals, + num_envs, + width, + height, + grid_size, + board_width, + board_height, + grid_square_size, + moves_made, + komi, + score, + last_capture_position, + reward_move_pass, + reward_move_invalid, + reward_move_valid, + reward_player_capture, + reward_opponent_capture, + ) def reset(self, seed=None): self.c_envs.reset() @@ -59,19 +81,19 @@ def step(self, actions): info = [] if self.tick % self.report_interval == 0: log = self.c_envs.log() - if log['episode_length'] > 0: + if log["episode_length"] > 0: info.append(log) - return (self.observations, self.rewards, - self.terminals, self.truncations, info) + return (self.observations, self.rewards, self.terminals, self.truncations, info) def render(self): self.c_envs.render() - + def close(self): - self.c_envs.close() + self.c_envs.close() + def test_performance(timeout=10, atn_cache=1024): - num_envs=1000; + num_envs = 1000 env = Go(num_envs=num_envs) env.reset() tick = 0 @@ -79,6 +101,7 @@ def test_performance(timeout=10, atn_cache=1024): actions = np.random.randint(0, env.single_action_space.n, (atn_cache, num_envs)) import time + start = time.time() while time.time() - start < timeout: atn = actions[tick % atn_cache] @@ -86,6 +109,8 @@ def test_performance(timeout=10, atn_cache=1024): tick += 1 sps = num_envs * tick / (time.time() - start) - print(f'SPS: {sps:,}') -if __name__ == '__main__': + print(f"SPS: {sps:,}") + + +if __name__ == "__main__": test_performance() diff --git a/pufferlib/ocean/grid/grid.py b/pufferlib/ocean/grid/grid.py index 7cbfae50b..405deb308 100644 --- a/pufferlib/ocean/grid/grid.py +++ b/pufferlib/ocean/grid/grid.py @@ -1,12 +1,8 @@ -import numpy as np -import os import gymnasium - -from raylib import rl, colors +import numpy as np import pufferlib -from pufferlib.ocean import render from pufferlib.ocean.grid.cy_grid import CGrid EMPTY = 0 @@ -25,29 +21,29 @@ class PufferGrid(pufferlib.PufferEnv): - def __init__(self, render_mode='rgb_array', vision_range=3, num_envs=4096, report_interval=1024): + def __init__(self, render_mode="rgb_array", vision_range=3, num_envs=4096, report_interval=1024): super().__init__() self.vision_range = vision_range - self.obs_size = 2*self.vision_range + 1 + self.obs_size = 2 * self.vision_range + 1 self.report_interval = report_interval self.emulated = None self.buf = pufferlib.namespace( - observations = np.zeros( - (num_envs, self.obs_size*self.obs_size + 3), dtype=np.uint8), - rewards = np.zeros(num_envs, dtype=np.float32), - terminals = np.zeros(num_envs, 
dtype=bool), - truncations = np.zeros(num_envs, dtype=bool), - masks = np.ones(num_envs, dtype=bool), + observations=np.zeros((num_envs, self.obs_size * self.obs_size + 3), dtype=np.uint8), + rewards=np.zeros(num_envs, dtype=np.float32), + terminals=np.zeros(num_envs, dtype=bool), + truncations=np.zeros(num_envs, dtype=bool), + masks=np.ones(num_envs, dtype=bool), ) self.actions = np.zeros(num_envs, dtype=np.uint32) self.dones = np.ones(num_envs, dtype=bool) self.not_done = np.zeros(num_envs, dtype=bool) self.render_mode = render_mode - self.observation_space = gymnasium.spaces.Box(low=0, high=255, - shape=(self.obs_size*self.obs_size+3,), dtype=np.uint8) + self.observation_space = gymnasium.spaces.Box( + low=0, high=255, shape=(self.obs_size * self.obs_size + 3,), dtype=np.uint8 + ) self.action_space = gymnasium.spaces.Discrete(5) @@ -64,8 +60,7 @@ def render(self): def reset(self, seed=0): if self.cenv is None: - self.cenv = CGrid(self.buf.observations, self.actions, - self.buf.rewards, self.dones, self.num_agents) + self.cenv = CGrid(self.buf.observations, self.actions, self.buf.rewards, self.dones, self.num_agents) self.cenv.reset() self.agents = [0] @@ -86,10 +81,9 @@ def step(self, actions): infos = {} if self.tick % self.report_interval == 0: - infos['episode_return'] = self.cenv.get_returns() - infos['sum_rewards'] = self.sum_rewards - infos['has_key'] = self.cenv.has_key() + infos["episode_return"] = self.cenv.get_returns() + infos["sum_rewards"] = self.sum_rewards + infos["has_key"] = self.cenv.has_key() self.sum_rewards = 0 - return (self.buf.observations, self.buf.rewards, - self.buf.terminals, self.buf.truncations, infos) + return (self.buf.observations, self.buf.rewards, self.buf.terminals, self.buf.truncations, infos) diff --git a/pufferlib/ocean/moba/moba.py b/pufferlib/ocean/moba/moba.py index 51ff827cf..402da3054 100644 --- a/pufferlib/ocean/moba/moba.py +++ b/pufferlib/ocean/moba/moba.py @@ -1,35 +1,54 @@ -from pdb import set_trace as T -import numpy as np -import os - -import pettingzoo import gymnasium +import numpy as np import pufferlib from pufferlib.ocean.moba.cy_moba import CyMOBA -from pufferlib.ocean.moba.cy_moba import entity_dtype, reward_dtype -MAP_OBS_N = 11*11*4 +MAP_OBS_N = 11 * 11 * 4 PLAYER_OBS_N = 26 -class Moba(pufferlib.PufferEnv): - def __init__(self, num_envs=4, vision_range=5, agent_speed=1.0, - discretize=True, reward_death=-1.0, reward_xp=0.006, - reward_distance=0.05, reward_tower=3.0, report_interval=32, - script_opponents=True, render_mode='human', buf=None): +class Moba(pufferlib.PufferEnv): + def __init__( + self, + num_envs=4, + vision_range=5, + agent_speed=1.0, + discretize=True, + reward_death=-1.0, + reward_xp=0.006, + reward_distance=0.05, + reward_tower=3.0, + report_interval=32, + script_opponents=True, + render_mode="human", + buf=None, + ): self.report_interval = report_interval self.render_mode = render_mode - self.num_agents = 5*num_envs if script_opponents else 10*num_envs + self.num_agents = 5 * num_envs if script_opponents else 10 * num_envs - self.single_observation_space = gymnasium.spaces.Box(low=0, high=255, - shape=(MAP_OBS_N + PLAYER_OBS_N,), dtype=np.uint8) + self.single_observation_space = gymnasium.spaces.Box( + low=0, high=255, shape=(MAP_OBS_N + PLAYER_OBS_N,), dtype=np.uint8 + ) self.single_action_space = gymnasium.spaces.MultiDiscrete([7, 7, 3, 2, 2, 2]) super().__init__(buf=buf) - self.c_envs = CyMOBA(self.observations, self.actions, self.rewards, - self.terminals, num_envs, vision_range, agent_speed, True, - 
reward_death, reward_xp, reward_distance, reward_tower, script_opponents) + self.c_envs = CyMOBA( + self.observations, + self.actions, + self.rewards, + self.terminals, + num_envs, + vision_range, + agent_speed, + True, + reward_death, + reward_xp, + reward_distance, + reward_tower, + script_opponents, + ) def reset(self, seed=0): self.c_envs.reset() @@ -38,19 +57,18 @@ def reset(self, seed=0): def step(self, actions): self.actions[:] = actions - self.actions[:, 0] = 100*(self.actions[:, 0] - 3) - self.actions[:, 1] = 100*(self.actions[:, 1] - 3) + self.actions[:, 0] = 100 * (self.actions[:, 0] - 3) + self.actions[:, 1] = 100 * (self.actions[:, 1] - 3) self.c_envs.step() infos = [] self.tick += 1 if self.tick % self.report_interval == 0: log = self.c_envs.log() - if log['episode_length'] > 0: + if log["episode_length"] > 0: infos.append(dict(pufferlib.utils.unroll_nested_dict(log))) - return (self.observations, self.rewards, - self.terminals, self.truncations, infos) + return (self.observations, self.rewards, self.terminals, self.truncations, infos) def render(self): for frame in range(12): @@ -64,29 +82,33 @@ def test_performance(timeout=20, atn_cache=1024, num_envs=400): tick = 0 import time + start = time.time() while time.time() - start < timeout: atns = actions[tick % atn_cache] env.step(atns) tick += 1 - print(f'SPS: %f', 10*num_envs*tick / (time.time() - start)) + print("SPS: %f", 10 * num_envs * tick / (time.time() - start)) + -if __name__ == '__main__': +if __name__ == "__main__": # Run with c profile from cProfile import run + num_envs = 400 env = Moba(num_envs=num_envs, report_interval=10000000) env.reset() - actions = np.random.randint(0, env.single_action_space.nvec, (1024, 10*num_envs, 6)) + actions = np.random.randint(0, env.single_action_space.nvec, (1024, 10 * num_envs, 6)) test_performance(20, 1024, num_envs) exit(0) - run('test_performance(20)', 'stats.profile') + run("test_performance(20)", "stats.profile") import pstats from pstats import SortKey - p = pstats.Stats('stats.profile') + + p = pstats.Stats("stats.profile") p.sort_stats(SortKey.TIME).print_stats(25) exit(0) - #test_performance(10) + # test_performance(10) diff --git a/pufferlib/ocean/nmmo3/make_sprite_sheets.py b/pufferlib/ocean/nmmo3/make_sprite_sheets.py index be4d52c33..6ed816b82 100644 --- a/pufferlib/ocean/nmmo3/make_sprite_sheets.py +++ b/pufferlib/ocean/nmmo3/make_sprite_sheets.py @@ -1,4 +1,4 @@ -''' +""" This script is used to generate scaled and combined sprite sheets for nmmo3 You will need the to put the following folders into the same directory. They can be purchased from ManaSeed on itch.io @@ -14,27 +14,27 @@ 21.07b - Sword & Shield Combat 2.3 21.10a - Forester Pointed Hat & Tunic 2.1a (comp. 
v01) 21.10a - Forester Pointed Hat & Tunic 2.2 (optional, combat animations) -''' +""" -from itertools import product -from PIL import Image -import pyray as ray -import numpy as np +import os import random import sys -import os -import cv2 +from itertools import product +import cv2 +import numpy as np +import pyray as ray +from PIL import Image SHEET_SIZE = 2048 N_GENERATE = 10 ELEMENTS = ( - ('neutral', 1, ray.Color(255, 255, 255, 255)), - ('fire', 5, ray.Color(255, 128, 128, 255)), - ('water', 9, ray.Color(128, 128, 255, 255)), - ('earth', 11, ray.Color(128, 255, 128, 255)), - ('air', 3, ray.Color(255, 255, 128, 255)), + ("neutral", 1, ray.Color(255, 255, 255, 255)), + ("fire", 5, ray.Color(255, 128, 128, 255)), + ("water", 9, ray.Color(128, 128, 255, 255)), + ("earth", 11, ray.Color(128, 255, 128, 255)), + ("air", 3, ray.Color(255, 255, 128, 255)), ) BASE = list(range(8)) @@ -45,7 +45,7 @@ QUIVER = list(range(1, 9)) # Hair colors, indices into files -''' +""" HAIR = { ELEM_NEUTRAL: 1, ELEM_FIRE: 5, @@ -53,73 +53,80 @@ ELEM_EARTH: 11, ELEM_AIR: 3 } -''' +""" # Character base -character = 'char_a_p1_0bas_humn_v{i:02}.png' -demon = 'char_a_p1_0bas_demn_v{i:02}.png' -goblin = 'char_a_p1_0bas_gbln_v{i:02}.png' -hair_dap = 'char_a_p1_4har_dap1_v{i:02}.png' -hair_bob = 'char_a_p1_4har_bob1_v{i:02}.png' +character = "char_a_p1_0bas_humn_v{i:02}.png" +demon = "char_a_p1_0bas_demn_v{i:02}.png" +goblin = "char_a_p1_0bas_gbln_v{i:02}.png" +hair_dap = "char_a_p1_4har_dap1_v{i:02}.png" +hair_bob = "char_a_p1_4har_bob1_v{i:02}.png" # Combat animations -sword_character = 'char_a_pONE3_0bas_humn_v{i:02}.png' -sword_weapon = 'char_a_pONE3_6tla_sw01_v{i:02}.png' -sword_hair_bob = 'char_a_pONE3_4har_bob1_v{i:02}.png' -sword_hair_dap = 'char_a_pONE3_4har_dap1_v{i:02}.png' -bow_character = 'char_a_pBOW3_0bas_humn_v{i:02}.png' -bow_hair_dap = 'char_a_pBOW3_4har_dap1_v{i:02}.png' -bow_hair_bob = 'char_a_pBOW3_4har_bob1_v{i:02}.png' -bow_weapon = 'char_a_pBOW3_6tla_bo01_v{i:02}.png' -bow_quiver = 'char_a_pBOW3_7tlb_qv01_v{i:02}.png' -arrow = 'aro_comn_v{i:02}.png' +sword_character = "char_a_pONE3_0bas_humn_v{i:02}.png" +sword_weapon = "char_a_pONE3_6tla_sw01_v{i:02}.png" +sword_hair_bob = "char_a_pONE3_4har_bob1_v{i:02}.png" +sword_hair_dap = "char_a_pONE3_4har_dap1_v{i:02}.png" +bow_character = "char_a_pBOW3_0bas_humn_v{i:02}.png" +bow_hair_dap = "char_a_pBOW3_4har_dap1_v{i:02}.png" +bow_hair_bob = "char_a_pBOW3_4har_bob1_v{i:02}.png" +bow_weapon = "char_a_pBOW3_6tla_bo01_v{i:02}.png" +bow_quiver = "char_a_pBOW3_7tlb_qv01_v{i:02}.png" +arrow = "aro_comn_v{i:02}.png" # Peasant character alternative -peasant_clothes = 'char_a_p1_1out_pfpn_v{i:02}.png' -sword_peasant_clothes = 'char_a_pONE3_1out_pfpn_v{i:02}.png' -bow_peasant_clothes = 'char_a_pBOW3_1out_pfpn_v{i:02}.png' +peasant_clothes = "char_a_p1_1out_pfpn_v{i:02}.png" +sword_peasant_clothes = "char_a_pONE3_1out_pfpn_v{i:02}.png" +bow_peasant_clothes = "char_a_pBOW3_1out_pfpn_v{i:02}.png" # Forester character alternative -forester_hat = 'char_a_p1_5hat_pnty_v{i:02}.png' -forester_clothes = 'char_a_p1_1out_fstr_v{i:02}.png' -sword_forester_hat = 'char_a_pONE3_5hat_pnty_v{i:02}.png' -sword_forester_clothes = 'char_a_pONE3_1out_fstr_v{i:02}.png' -bow_forester_hat = 'char_a_pBOW3_5hat_pnty_v{i:02}.png' -bow_forester_clothes = 'char_a_pBOW3_1out_fstr_v{i:02}.png' - -sword_mask = np.array(( - (1, 1, 1, 1, 1, 1, 1, 1), - (1, 1, 1, 1, 1, 1, 1, 1), - (1, 0, 1, 1, 1, 1, 1, 1), - (1, 0, 1, 1, 1, 1, 1, 1), - (0, 0, 1, 1, 1, 1, 1, 1), - (1, 1, 1, 1, 1, 1, 1, 1), - (0, 
0, 1, 1, 0, 0, 0, 0), - (0, 0, 1, 1, 0, 0, 0, 0), -)) - -bow_mask = np.array(( - (0, 0, 0, 0, 0, 0, 0, 0), - (1, 1, 1, 1, 1, 1, 1, 1), - (1, 0, 0, 0, 0, 0, 0, 0), - (1, 0, 0, 0, 0, 0, 0, 0), - (0, 0, 0, 0, 0, 0, 0, 0), - (1, 1, 1, 1, 1, 1, 1, 1), - (1, 0, 0, 0, 0, 0, 0, 0), - (1, 0, 0, 0, 0, 0, 0, 0), -)) - -quiver_mask = np.array(( - (1, 1, 1, 1, 1, 1, 1, 1), - (0, 0, 0, 0, 0, 0, 0, 0), - (1, 1, 1, 1, 1, 1, 1, 1), - (1, 1, 1, 1, 1, 1, 1, 1), - (1, 1, 1, 1, 1, 1, 1, 1), - (0, 0, 0, 0, 0, 0, 0, 0), - (1, 1, 1, 1, 1, 1, 1, 1), - (1, 1, 1, 1, 1, 1, 1, 1), -)) +forester_hat = "char_a_p1_5hat_pnty_v{i:02}.png" +forester_clothes = "char_a_p1_1out_fstr_v{i:02}.png" +sword_forester_hat = "char_a_pONE3_5hat_pnty_v{i:02}.png" +sword_forester_clothes = "char_a_pONE3_1out_fstr_v{i:02}.png" +bow_forester_hat = "char_a_pBOW3_5hat_pnty_v{i:02}.png" +bow_forester_clothes = "char_a_pBOW3_1out_fstr_v{i:02}.png" + +sword_mask = np.array( + ( + (1, 1, 1, 1, 1, 1, 1, 1), + (1, 1, 1, 1, 1, 1, 1, 1), + (1, 0, 1, 1, 1, 1, 1, 1), + (1, 0, 1, 1, 1, 1, 1, 1), + (0, 0, 1, 1, 1, 1, 1, 1), + (1, 1, 1, 1, 1, 1, 1, 1), + (0, 0, 1, 1, 0, 0, 0, 0), + (0, 0, 1, 1, 0, 0, 0, 0), + ) +) + +bow_mask = np.array( + ( + (0, 0, 0, 0, 0, 0, 0, 0), + (1, 1, 1, 1, 1, 1, 1, 1), + (1, 0, 0, 0, 0, 0, 0, 0), + (1, 0, 0, 0, 0, 0, 0, 0), + (0, 0, 0, 0, 0, 0, 0, 0), + (1, 1, 1, 1, 1, 1, 1, 1), + (1, 0, 0, 0, 0, 0, 0, 0), + (1, 0, 0, 0, 0, 0, 0, 0), + ) +) + +quiver_mask = np.array( + ( + (1, 1, 1, 1, 1, 1, 1, 1), + (0, 0, 0, 0, 0, 0, 0, 0), + (1, 1, 1, 1, 1, 1, 1, 1), + (1, 1, 1, 1, 1, 1, 1, 1), + (1, 1, 1, 1, 1, 1, 1, 1), + (0, 0, 0, 0, 0, 0, 0, 0), + (1, 1, 1, 1, 1, 1, 1, 1), + (1, 1, 1, 1, 1, 1, 1, 1), + ) +) + def draw_tex(path, f_name, i, x, y, tint=None): if tint is None: @@ -131,6 +138,7 @@ def draw_tex(path, f_name, i, x, y, tint=None): dest_rect = ray.Rectangle(x, y, texture.width, texture.height) ray.draw_texture_pro(texture, source_rect, dest_rect, (0, 0), 0, tint) + def draw_masked_tex(path, f_name, i, x, y, mask, tint=None): if tint is None: tint = ray.WHITE @@ -148,16 +156,18 @@ def draw_masked_tex(path, f_name, i, x, y, mask, tint=None): source_rect = ray.Rectangle(src_x, src_y, 128, -128) dst_x = x + src_x - dst_y = y + (Y-r-1)*128 + dst_y = y + (Y - r - 1) * 128 dest_rect = ray.Rectangle(dst_x, dst_y, 128, 128) ray.draw_texture_pro(texture, source_rect, dest_rect, (0, 0), 0, tint) + def draw_arrow(tex, src_x, src_y, dst_x, dst_y, offset_x, offset_y, rot): - source_rect = ray.Rectangle(src_x*32, src_y*32, 32, -32) - dest_rect = ray.Rectangle(dst_x*128 + offset_x, SHEET_SIZE-(dst_y+1)*128+ offset_y, 32, 32) + source_rect = ray.Rectangle(src_x * 32, src_y * 32, 32, -32) + dest_rect = ray.Rectangle(dst_x * 128 + offset_x, SHEET_SIZE - (dst_y + 1) * 128 + offset_y, 32, 32) ray.draw_texture_pro(tex, source_rect, dest_rect, (0, 0), rot, ray.WHITE) + def draw_sheet(src, hair_i, tint, seed=None): if seed is not None: random.seed(seed) @@ -192,7 +202,7 @@ def draw_sheet(src, hair_i, tint, seed=None): draw_tex(src, sword_character, base_i, x, y) draw_tex(src, sword_hair, hair_i, x, y) draw_tex(src, sword_clothes, clothes_i, x, y) - draw_masked_tex(src, sword_weapon, sword_i, x, y, 1-sword_mask, tint=tint) + draw_masked_tex(src, sword_weapon, sword_i, x, y, 1 - sword_mask, tint=tint) x = 1024 y = 1024 @@ -201,8 +211,8 @@ def draw_sheet(src, hair_i, tint, seed=None): draw_tex(src, bow_character, base_i, x, y) draw_tex(src, bow_hair, hair_i, x, y) draw_tex(src, bow_clothes, clothes_i, x, y) - draw_masked_tex(src, bow_weapon, bow_i, x, y, 
1-bow_mask, tint=tint) - draw_masked_tex(src, bow_quiver, quiver_i, x, y, 1-quiver_mask, tint=tint) + draw_masked_tex(src, bow_weapon, bow_i, x, y, 1 - bow_mask, tint=tint) + draw_masked_tex(src, bow_quiver, quiver_i, x, y, 1 - quiver_mask, tint=tint) arrow_path = os.path.join(src, arrow).format(i=quiver_i) arrow_tex = ray.load_texture(arrow_path) @@ -215,45 +225,42 @@ def draw_sheet(src, hair_i, tint, seed=None): draw_arrow(arrow_tex, 1, 1, 12, 3, 38, 64, 0) # Right facing arrows - draw_arrow(arrow_tex, 4, 1, 9, 2, 64+42, 48, 120) - draw_arrow(arrow_tex, 4, 1, 10, 2, 64+42, 48, 120) - draw_arrow(arrow_tex, 3, 1, 11, 2, 64+32, 82, 180) - draw_arrow(arrow_tex, 1, 1, 12, 2, 56, 98, 180+80) + draw_arrow(arrow_tex, 4, 1, 9, 2, 64 + 42, 48, 120) + draw_arrow(arrow_tex, 4, 1, 10, 2, 64 + 42, 48, 120) + draw_arrow(arrow_tex, 3, 1, 11, 2, 64 + 32, 82, 180) + draw_arrow(arrow_tex, 1, 1, 12, 2, 56, 98, 180 + 80) def scale_image(image_array, scale_factor): if scale_factor < 1: # Scale down with exact interpolation - scaled_image_array = image_array[::int(1/scale_factor), ::int(1/scale_factor)] + scaled_image_array = image_array[:: int(1 / scale_factor), :: int(1 / scale_factor)] elif scale_factor > 1: # Scale up (duplicate pixels) - scaled_image_array = np.repeat( - np.repeat( - image_array, scale_factor, axis=0 - ), scale_factor, axis=1 - ) + scaled_image_array = np.repeat(np.repeat(image_array, scale_factor, axis=0), scale_factor, axis=1) else: # No scaling scaled_image_array = image_array return scaled_image_array + def copy_and_scale_files(source_directory, target_directory, scale_factor): for root, dirs, files in os.walk(source_directory): relative_path = os.path.relpath(root, source_directory) target_path = os.path.join(target_directory) os.makedirs(target_path, exist_ok=True) - + for file in files: src_file_path = os.path.join(root, file) target_file_path = os.path.join(target_directory, file) - + path = src_file_path.lower() - if path.endswith('.ttf'): + if path.endswith(".ttf"): os.copy(src_file_path, target_file_path) continue - if not src_file_path.lower().endswith(('.png', '.jpg', '.jpeg')): + if not src_file_path.lower().endswith((".png", ".jpg", ".jpeg")): continue image = Image.open(src_file_path) @@ -262,6 +269,7 @@ def copy_and_scale_files(source_directory, target_directory, scale_factor): scaled_image = Image.fromarray(scaled_image_array) scaled_image.save(target_file_path) + if len(sys.argv) != 4: print("Usage: script.py source_directory target_directory scale_factor") sys.exit(1) @@ -276,9 +284,9 @@ def copy_and_scale_files(source_directory, target_directory, scale_factor): valid_scales = [0.125, 0.25, 0.5, 1, 2, 4] if scale_factor not in valid_scales: - print(f'Scale factor must be one of {valid_scales}') + print(f"Scale factor must be one of {valid_scales}") -intermediate_directory = os.path.join(target_directory, 'temp') +intermediate_directory = os.path.join(target_directory, "temp") if not os.path.exists(intermediate_directory): os.makedirs(intermediate_directory) copy_and_scale_files(source_directory, intermediate_directory, scale_factor) @@ -290,7 +298,7 @@ def copy_and_scale_files(source_directory, target_directory, scale_factor): i = 0 while not ray.window_should_close() and i < N_GENERATE: - ray.set_window_title(f'Generating sheet {i+1}/{N_GENERATE}') + ray.set_window_title(f"Generating sheet {i + 1}/{N_GENERATE}") for elem in ELEMENTS: elem_name, hair_i, tint = elem @@ -301,7 +309,7 @@ def copy_and_scale_files(source_directory, target_directory, scale_factor): 
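The scale_image() helper above is straightforward to sanity-check: down-scaling keeps every Nth pixel via integer-stride slicing, while up-scaling duplicates pixels with np.repeat along both axes. A tiny round-trip example (illustrative only, not from the patch):

import numpy as np

img = np.arange(4, dtype=np.uint8).reshape(2, 2)            # [[0, 1], [2, 3]]
up = np.repeat(np.repeat(img, 2, axis=0), 2, axis=1)         # 4x4, each pixel duplicated
down = up[::2, ::2]                                          # back to the original 2x2
assert (down == img).all()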
ray.end_texture_mode() image = ray.load_image_from_texture(output_image.texture) - f_path = os.path.join(target_directory, f'{elem_name}_{i}.png') + f_path = os.path.join(target_directory, f"{elem_name}_{i}.png") ray.export_image(image, f_path) ray.clear_background(ray.GRAY) @@ -311,10 +319,10 @@ def copy_and_scale_files(source_directory, target_directory, scale_factor): i += 1 coords = (0, 1) -spring = cv2.imread(intermediate_directory + '/spring forest.png') -summer = cv2.imread(intermediate_directory + '/summer forest.png') -autumn = cv2.imread(intermediate_directory + '/autumn forest (bare).png') -winter = cv2.imread(intermediate_directory + '/winter forest (clean).png') +spring = cv2.imread(intermediate_directory + "/spring forest.png") +summer = cv2.imread(intermediate_directory + "/summer forest.png") +autumn = cv2.imread(intermediate_directory + "/autumn forest (bare).png") +winter = cv2.imread(intermediate_directory + "/winter forest (clean).png") spring = scale_image(spring, 2) summer = scale_image(summer, 2) @@ -323,10 +331,10 @@ def copy_and_scale_files(source_directory, target_directory, scale_factor): SEASONS = [spring, summer, autumn, winter] -spring_sparkle = cv2.imread(intermediate_directory + '/spring water sparkles B.png') -summer_sparkle = cv2.imread(intermediate_directory + '/summer water sparkles B 16x16.png') -autumn_sparkle = cv2.imread(intermediate_directory + '/autumn water sparkles B 16x16.png') -winter_sparkle = cv2.imread(intermediate_directory + '/winter water sparkles B 16x16.png') +spring_sparkle = cv2.imread(intermediate_directory + "/spring water sparkles B.png") +summer_sparkle = cv2.imread(intermediate_directory + "/summer water sparkles B 16x16.png") +autumn_sparkle = cv2.imread(intermediate_directory + "/autumn water sparkles B 16x16.png") +winter_sparkle = cv2.imread(intermediate_directory + "/winter water sparkles B 16x16.png") spring_sparkle = scale_image(spring_sparkle, 2) summer_sparkle = scale_image(summer_sparkle, 2) @@ -394,14 +402,30 @@ def copy_and_scale_files(source_directory, target_directory, scale_factor): TRR_DIAG = (2, 3) BRR_DIAG = (2, 4) -OFFSETS = [TL_CORNER, T_FLAT, TR_CORNER, L_FLAT, CENTER, R_FLAT, BL_CORNER, - B_FLAT, BR_CORNER, TL_DIAG, TR_DIAG, BL_DIAG, BR_DIAG, TRR_DIAG, BRR_DIAG] +OFFSETS = [ + TL_CORNER, + T_FLAT, + TR_CORNER, + L_FLAT, + CENTER, + R_FLAT, + BL_CORNER, + B_FLAT, + BR_CORNER, + TL_DIAG, + TR_DIAG, + BL_DIAG, + BR_DIAG, + TRR_DIAG, + BRR_DIAG, +] TILE_SIZE = int(32 * scale_factor) SHEET_SIZE = 64 SHEET_PX = TILE_SIZE * SHEET_SIZE merged_sheet = np.zeros((SHEET_PX, SHEET_PX, 3), dtype=np.uint8) + def gen_lerps(): valid_combinations = [] for combo in product(range(10), repeat=4): @@ -410,6 +434,7 @@ def gen_lerps(): return valid_combinations + def gen_lerps(): valid_combinations = [] for total_sum in range(1, 10): # Loop through all possible sums from 1 to 9 @@ -418,15 +443,14 @@ def gen_lerps(): valid_combinations.append(combo) return valid_combinations + def slice(r, c): - return np.s_[ - r*TILE_SIZE:(r+1)*TILE_SIZE, - c*TILE_SIZE:(c+1)*TILE_SIZE - ] + return np.s_[r * TILE_SIZE : (r + 1) * TILE_SIZE, c * TILE_SIZE : (c + 1) * TILE_SIZE] + idx = 0 for sheet in SEASONS: - for offset, material in zip(ALL_OFFSETS, ALL_MATERIALS): + for offset, material in zip(ALL_OFFSETS, ALL_MATERIALS, strict=False): src_dx, src_dy = offset # Write full tile textures. 
These are irregularly @@ -445,8 +469,8 @@ def slice(r, c): for dx, dy in OFFSETS: dst_r, dst_c = divmod(idx, SHEET_SIZE) idx += 1 - - src_pos = slice(dy+src_dy, dx+src_dx) + + src_pos = slice(dy + src_dy, dx + src_dx) tile_tex = sheet[src_pos] dst_pos = slice(dst_r, dst_c) @@ -455,7 +479,7 @@ def slice(r, c): for x, y in WATER_N: # 3 animations for anim_y in range(3): - for season, sparkle in zip(SEASONS, SPARKLES): + for season, sparkle in zip(SEASONS, SPARKLES, strict=False): src_pos = slice(y, x) tile_tex = season[src_pos] @@ -480,7 +504,7 @@ def slice(r, c): tex = np.zeros((TILE_SIZE, TILE_SIZE, 3)) total_weight = sum(combo) for i, weight in enumerate(combo): - tex += weight/total_weight * tiles[i] + tex += weight / total_weight * tiles[i] tex = tex.astype(np.uint8) @@ -493,6 +517,6 @@ def slice(r, c): print(idx) # save image -cv2.imwrite('merged_sheet.png', merged_sheet) -cv2.imshow('merged_sheet', merged_sheet) +cv2.imwrite("merged_sheet.png", merged_sheet) +cv2.imshow("merged_sheet", merged_sheet) cv2.waitKey(0) diff --git a/pufferlib/ocean/nmmo3/nmmo3.py b/pufferlib/ocean/nmmo3/nmmo3.py index 3648e4dfb..990b021e4 100644 --- a/pufferlib/ocean/nmmo3/nmmo3.py +++ b/pufferlib/ocean/nmmo3/nmmo3.py @@ -1,23 +1,36 @@ -from pdb import set_trace as T -import numpy as np -from types import SimpleNamespace -import gymnasium -import pettingzoo import time +import gymnasium +import numpy as np +import pufferlib from pufferlib.ocean.nmmo3.cy_nmmo3 import Environment, entity_dtype, reward_dtype -import pufferlib class NMMO3(pufferlib.PufferEnv): - def __init__(self, width=4*[512], height=4*[512], num_envs=4, - num_players=1024, num_enemies=2048, num_resources=2048, - num_weapons=1024, num_gems=512, tiers=5, levels=40, - teleportitis_prob=0.001, enemy_respawn_ticks=2, - item_respawn_ticks=100, x_window=7, y_window=5, - reward_combat_level=1.0, reward_prof_level=1.0, - reward_item_level=0.5, reward_market=0.01, - reward_death=-1.0, buf=None): + def __init__( + self, + width=4 * [512], + height=4 * [512], + num_envs=4, + num_players=1024, + num_enemies=2048, + num_resources=2048, + num_weapons=1024, + num_gems=512, + tiers=5, + levels=40, + teleportitis_prob=0.001, + enemy_respawn_ticks=2, + item_respawn_ticks=100, + x_window=7, + y_window=5, + reward_combat_level=1.0, + reward_prof_level=1.0, + reward_item_level=0.5, + reward_market=0.01, + reward_death=-1.0, + buf=None, + ): if not isinstance(width, list): width = num_envs * [width] if not isinstance(height, list): @@ -113,13 +126,13 @@ def __init__(self, width=4*[512], height=4*[512], num_envs=4, total_players += num_players[idx] total_enemies += num_enemies[idx] - self.players_flat = np.zeros((total_players, 51+501+3), dtype=np.intc) - self.enemies_flat = np.zeros((total_enemies, 51+501+3), dtype=np.intc) + self.players_flat = np.zeros((total_players, 51 + 501 + 3), dtype=np.intc) + self.enemies_flat = np.zeros((total_enemies, 51 + 501 + 3), dtype=np.intc) self.rewards_flat = np.zeros((total_players, 10), dtype=np.float32) - #map_obs = np.zeros((total_players, 11*15 + 47 + 10), dtype=np.intc) - #counts = np.zeros((num_envs, height, width), dtype=np.uint8) - #terrain = np.zeros((num_envs, height, width), dtype=np.uint8) - #rendered = np.zeros((num_envs, height, width, 3), dtype=np.uint8) + # map_obs = np.zeros((total_players, 11*15 + 47 + 10), dtype=np.intc) + # counts = np.zeros((num_envs, height, width), dtype=np.uint8) + # terrain = np.zeros((num_envs, height, width), dtype=np.uint8) + # rendered = np.zeros((num_envs, height, width, 3), 
dtype=np.uint8) actions = np.zeros((total_players), dtype=np.intc) self.actions = actions @@ -127,30 +140,48 @@ def __init__(self, width=4*[512], height=4*[512], num_envs=4, self.num_players = total_players self.num_enemies = total_enemies - self.players = np.frombuffer(self.players_flat, - dtype=entity_dtype()).view(np.recarray) - self.enemies = np.frombuffer(self.enemies_flat, - dtype=entity_dtype()).view(np.recarray) - self.struct_rewards = np.frombuffer(self.rewards_flat, - dtype=reward_dtype()).view(np.recarray) + self.players = np.frombuffer(self.players_flat, dtype=entity_dtype()).view(np.recarray) + self.enemies = np.frombuffer(self.enemies_flat, dtype=entity_dtype()).view(np.recarray) + self.struct_rewards = np.frombuffer(self.rewards_flat, dtype=reward_dtype()).view(np.recarray) self.comb_goal_mask = np.array([1, 0, 1, 0, 1, 1, 0, 1, 1, 1]) self.prof_goal_mask = np.array([0, 0, 0, 1, 0, 0, 1, 1, 1, 1]) self.tick = 0 - self.single_observation_space = gymnasium.spaces.Box(low=-1, - high=2**32-1, shape=(11*15*10+47+10,), dtype=np.uint8) + self.single_observation_space = gymnasium.spaces.Box( + low=-1, high=2**32 - 1, shape=(11 * 15 * 10 + 47 + 10,), dtype=np.uint8 + ) self.single_action_space = gymnasium.spaces.Discrete(26) - self.render_mode = 'human' + self.render_mode = "human" super().__init__(buf) - self.c_env = Environment(self.observations, self.players_flat, - self.enemies_flat, self.rewards_flat, self.actions, - width, height, num_envs, num_players, num_enemies, - num_resources, num_weapons, num_gems, tiers, levels, - teleportitis_prob, enemy_respawn_ticks, item_respawn_ticks, - reward_combat_level, reward_prof_level, reward_item_level, - reward_market, reward_death, x_window, y_window) + self.c_env = Environment( + self.observations, + self.players_flat, + self.enemies_flat, + self.rewards_flat, + self.actions, + width, + height, + num_envs, + num_players, + num_enemies, + num_resources, + num_weapons, + num_gems, + tiers, + levels, + teleportitis_prob, + enemy_respawn_ticks, + item_respawn_ticks, + reward_combat_level, + reward_prof_level, + reward_item_level, + reward_market, + reward_death, + x_window, + y_window, + ) def reset(self, seed=None): self.struct_rewards.fill(0) @@ -160,8 +191,8 @@ def reset(self, seed=None): return self.observations, [] def step(self, actions): - if not hasattr(self, 'is_reset'): - raise Exception('Must call reset before step') + if not hasattr(self, "is_reset"): + raise Exception("Must call reset before step") self.rewards.fill(0) rewards = self.struct_rewards rewards.fill(0) @@ -172,20 +203,20 @@ def step(self, actions): infos = [] if self.tick % 128 == 0: log = self.c_env.log() - if log['episode_length'] > 0: + if log["episode_length"] > 0: infos.append(log) - ''' + """ print( f'Comb lvl: {np.mean(self.players.comb_lvl)} (max {np.max(self.players.comb_lvl)})', f'Prof lvl: {np.mean(self.players.prof_lvl)} (max {np.max(self.players.prof_lvl)})', f'Time alive: {np.mean(self.players.time_alive)} (max {np.max(self.players.time_alive)})', ) - ''' + """ if False and self.tick % 128 == 0: # TODO: Log images to Wandb in latest version - infos['nmmo3_map'] = self.render() + infos["nmmo3_map"] = self.render() self.tick += 1 @@ -195,10 +226,10 @@ def step(self, actions): def render(self): self.c_env.render() - #all_maps = [e.rendered.astype(np.float32) for e in self.c_env.envs] - #all_counts = [e.counts.astype(np.float32) for e in self.c_env.envs] + # all_maps = [e.rendered.astype(np.float32) for e in self.c_env.envs] + # all_counts = 
[e.counts.astype(np.float32) for e in self.c_env.envs] - ''' + """ agg_maps = np.zeros((2048, 2048, 3), dtype=np.float32) agg_counts = np.zeros((2048, 2048), dtype=np.float32) @@ -239,27 +270,26 @@ def render(self): lerped = lerped[::2, ::2] return lerped.astype(np.uint8) - ''' + """ def close(self): self.c_envs.close() + class Overlays: def __init__(self, width, height): self.counts = np.zeros((width, height), dtype=int) self.value_function = np.zeros((width, height), dtype=np.float32) + def test_env_performance(env, timeout=10): num_agents = env.num_players - actions = {t: - {agent: np.random.randint(0, 6) for agent in range(1, num_agents+1)} - for t in range(100) - } + actions = {t: {agent: np.random.randint(0, 6) for agent in range(1, num_agents + 1)} for t in range(100)} actions = {t: np.random.randint(0, 6, num_agents) for t in range(100)} idx = 0 - import time + start = time.time() num_steps = 0 while time.time() - start < timeout: @@ -269,5 +299,3 @@ def test_env_performance(env, timeout=10): end = time.time() fps = num_agents * num_steps / (end - start) print(f"Test Environment Performance FPS: {fps:.2f}") - - diff --git a/pufferlib/ocean/pong/pong.py b/pufferlib/ocean/pong/pong.py index dc2685e03..2a6120844 100644 --- a/pufferlib/ocean/pong/pong.py +++ b/pufferlib/ocean/pong/pong.py @@ -1,26 +1,42 @@ -'''High-perf Pong +"""High-perf Pong Inspired from https://gist.github.com/Yttrmin/18ecc3d2d68b407b4be1 & https://jair.org/index.php/jair/article/view/10819/25823 & https://www.youtube.com/watch?v=PSQt5KGv7Vk -''' +""" -import numpy as np import gymnasium - +import numpy as np import pufferlib from pufferlib.ocean.pong.cy_pong import CyPong class Pong(pufferlib.PufferEnv): - def __init__(self, num_envs=1, render_mode=None, - width=500, height=640, paddle_width=20, paddle_height=70, - ball_width=32, ball_height=32, paddle_speed=8, - ball_initial_speed_x=10, ball_initial_speed_y=1, - ball_speed_y_increment=3, ball_max_speed_y=13, - max_score=21, frameskip=1, report_interval=1, buf=None): + def __init__( + self, + num_envs=1, + render_mode=None, + width=500, + height=640, + paddle_width=20, + paddle_height=70, + ball_width=32, + ball_height=32, + paddle_speed=8, + ball_initial_speed_x=10, + ball_initial_speed_y=1, + ball_speed_y_increment=3, + ball_max_speed_y=13, + max_score=21, + frameskip=1, + report_interval=1, + buf=None, + ): self.single_observation_space = gymnasium.spaces.Box( - low=0, high=1, shape=(8,), dtype=np.float32, + low=0, + high=1, + shape=(8,), + dtype=np.float32, ) self.single_action_space = gymnasium.spaces.Discrete(3) self.render_mode = render_mode @@ -31,12 +47,27 @@ def __init__(self, num_envs=1, render_mode=None, self.tick = 0 super().__init__(buf) - self.c_envs = CyPong(self.observations, self.actions, self.rewards, - self.terminals, num_envs, width, height, - paddle_width, paddle_height, ball_width, ball_height, - paddle_speed, ball_initial_speed_x, ball_initial_speed_y, - ball_max_speed_y, ball_speed_y_increment, max_score, frameskip) - + self.c_envs = CyPong( + self.observations, + self.actions, + self.rewards, + self.terminals, + num_envs, + width, + height, + paddle_width, + paddle_height, + ball_width, + ball_height, + paddle_speed, + ball_initial_speed_x, + ball_initial_speed_y, + ball_max_speed_y, + ball_speed_y_increment, + max_score, + frameskip, + ) + def reset(self, seed=None): self.tick = 0 self.c_envs.reset() @@ -49,12 +80,11 @@ def step(self, actions): info = [] if self.tick % self.report_interval == 0: log = self.c_envs.log() - if 
log['episode_length'] > 0: - info.append(log) + if log["episode_length"] > 0: + info.append(log) self.tick += 1 - return (self.observations, self.rewards, - self.terminals, self.truncations, info) + return (self.observations, self.rewards, self.terminals, self.truncations, info) def render(self): self.c_envs.render() @@ -62,6 +92,7 @@ def render(self): def close(self): self.c_envs.close() + def test_performance(timeout=10, atn_cache=1024): env = Pong(num_envs=1000) env.reset() @@ -70,14 +101,15 @@ def test_performance(timeout=10, atn_cache=1024): actions = np.random.randint(0, 2, (atn_cache, env.num_agents)) import time + start = time.time() while time.time() - start < timeout: atn = actions[tick % atn_cache] env.step(atn) tick += 1 - print(f'SPS: {env.num_agents * tick / (time.time() - start)}') + print(f"SPS: {env.num_agents * tick / (time.time() - start)}") -if __name__ == '__main__': +if __name__ == "__main__": test_performance() diff --git a/pufferlib/ocean/render.py b/pufferlib/ocean/render.py index 1a6b02404..da4164ae1 100644 --- a/pufferlib/ocean/render.py +++ b/pufferlib/ocean/render.py @@ -1,25 +1,27 @@ -import numpy as np -import os -from cffi import FFI -from raylib import rl, colors +import numpy as np import pyray +from cffi import FFI +from raylib import colors, rl PUFF_BACKGROUND = [6, 24, 24, 255] PUFF_TEXT = [0, 187, 187, 255] ANSI_COLORS = [30, 34, 36, 90, 31, 97, 91, 37] -COLORS = np.array([ - [6, 24, 24 ], # Background - [0, 0, 255], # Food - [0, 128, 255], # Corpse - [128, 128, 128], # Wall - [255, 0, 0], # Snake - [255, 255, 255], # Snake - [255, 85, 85], # Snake - [170, 170, 170], # Snake -], dtype=np.uint8) +COLORS = np.array( + [ + [6, 24, 24], # Background + [0, 0, 255], # Food + [0, 128, 255], # Corpse + [128, 128, 128], # Wall + [255, 0, 0], # Snake + [255, 255, 255], # Snake + [255, 85, 85], # Snake + [170, 170, 170], # Snake + ], + dtype=np.uint8, +) def any_key_down(keys): @@ -28,12 +30,14 @@ def any_key_down(keys): return True return False + def any_key_pressed(keys): for key in keys: if rl.IsKeyPressed(key): return True return False + def cdata_to_numpy(): image = rl.LoadImageFromScreen() data_pointer = image.data @@ -42,15 +46,17 @@ def cdata_to_numpy(): channels = 4 data_size = width * height * channels cdata = FFI().buffer(data_pointer, data_size) - return np.frombuffer(cdata, dtype=np.uint8 - ).reshape((height, width, channels)) + return np.frombuffer(cdata, dtype=np.uint8).reshape((height, width, channels)) + def make_texture(width, height): rendered = np.zeros((height, width, 4), dtype=np.uint8) - raylib_image = pyray.Image(FFI().from_buffer(rendered.data), - width, height, 1, pyray.PIXELFORMAT_UNCOMPRESSED_R8G8B8) + raylib_image = pyray.Image( + FFI().from_buffer(rendered.data), width, height, 1, pyray.PIXELFORMAT_UNCOMPRESSED_R8G8B8 + ) return rl.LoadTextureFromImage(raylib_image) + class AnsiRender: def __init__(self, colors=None): self.colors = colors @@ -58,17 +64,17 @@ def __init__(self, colors=None): self.colors = ANSI_COLORS def render(self, grid): - frame = '' + frame = "" for v in range(grid.shape[0]): lines = [] - for line in grid[v-1:-v, v-1:-v]: - lines.append(''.join([ - f'\033[{ANSI_COLORS[val]}m██\033[0m' for val in line])) + for line in grid[v - 1 : -v, v - 1 : -v]: + lines.append("".join([f"\033[{ANSI_COLORS[val]}m██\033[0m" for val in line])) - frame = '\n'.join(lines) + frame = "\n".join(lines) return frame - + + class RGBArrayRender: def __init__(self, colors=None, upscale=1): self.colors = colors @@ -86,9 +92,11 @@ def 
render(self, grid): return frame + class GridRender: - def __init__(self, width, height, screen_width=1080, screen_height=720, - colors=None, fps=60, name='PufferLib Raylib Renderer'): + def __init__( + self, width, height, screen_width=1080, screen_height=720, colors=None, fps=60, name="PufferLib Raylib Renderer" + ): self.width = width self.height = height self.fps = fps @@ -103,9 +111,9 @@ def __init__(self, width, height, screen_width=1080, screen_height=720, self.height = height camera = pyray.Camera2D() - camera.target= (width/2, height/2) - camera.rotation = 0.0 - camera.zoom = min(screen_width/width, screen_height/height) + camera.target = (width / 2, height / 2) + camera.rotation = 0.0 + camera.zoom = min(screen_width / width, screen_height / height) self.camera = camera self.speed = min(screen_width, screen_height) / 100 @@ -127,8 +135,8 @@ def render(self, grid, *args, end_drawing=True): screen_height = rl.GetScreenHeight() camera = self.camera - camera.offset.x = screen_width/2 - camera.offset.y = screen_height/2 + camera.offset.x = screen_width / 2 + camera.offset.y = screen_height / 2 fps = rl.GetFPS() or self.fps fps_mul = self.fps / fps @@ -136,9 +144,9 @@ def render(self, grid, *args, end_drawing=True): zoom_speed = 0.01 * fps_mul if any_key_down([rl.KEY_SPACE]): - camera.zoom = min(screen_width/self.width, screen_height/self.height) - camera.target.x = self.width/2 - camera.target.y = self.height/2 + camera.zoom = min(screen_width / self.width, screen_height / self.height) + camera.target.x = self.width / 2 + camera.target.y = self.height / 2 if any_key_down([rl.KEY_LEFT_SHIFT]): speed *= 3 @@ -170,38 +178,37 @@ def render(self, grid, *args, end_drawing=True): rl.EndMode2D() if self.show_help: # Stats - rl.DrawText(f'FPS: {fps}'.encode(), 10, 10, 20, PUFF_TEXT) - rl.DrawText(f'Zoom: {camera.zoom:.2f}'.encode(), 10, 30, 20, PUFF_TEXT) - rl.DrawText(f'X: {camera.offset.x:.2f}'.encode(), 10, 50, 20, PUFF_TEXT) - rl.DrawText(f'Y: {camera.offset.y:.2f}'.encode(), 10, 70, 20, PUFF_TEXT) - rl.DrawText(f'Speed: {speed:.2f}'.encode(), 10, 90, 20, PUFF_TEXT) + rl.DrawText(f"FPS: {fps}".encode(), 10, 10, 20, PUFF_TEXT) + rl.DrawText(f"Zoom: {camera.zoom:.2f}".encode(), 10, 30, 20, PUFF_TEXT) + rl.DrawText(f"X: {camera.offset.x:.2f}".encode(), 10, 50, 20, PUFF_TEXT) + rl.DrawText(f"Y: {camera.offset.y:.2f}".encode(), 10, 70, 20, PUFF_TEXT) + rl.DrawText(f"Speed: {speed:.2f}".encode(), 10, 90, 20, PUFF_TEXT) # Controls - rl.DrawText('Move: WASD/HJKL'.encode(), 10, 120, 20, PUFF_TEXT) - rl.DrawText('Zoom: QE/-+'.encode(), 10, 140, 20, PUFF_TEXT) - rl.DrawText('Turbo: Shift'.encode(), 10, 160, 20, PUFF_TEXT) - rl.DrawText('Help: Tab/~'.encode(), 10, 180, 20, PUFF_TEXT) - rl.DrawText('Reset: Space'.encode(), 10, 200, 20, PUFF_TEXT) + rl.DrawText("Move: WASD/HJKL".encode(), 10, 120, 20, PUFF_TEXT) + rl.DrawText("Zoom: QE/-+".encode(), 10, 140, 20, PUFF_TEXT) + rl.DrawText("Turbo: Shift".encode(), 10, 160, 20, PUFF_TEXT) + rl.DrawText("Help: Tab/~".encode(), 10, 180, 20, PUFF_TEXT) + rl.DrawText("Reset: Space".encode(), 10, 200, 20, PUFF_TEXT) if end_drawing: rl.EndDrawing() return cdata_to_numpy() + class GameRender: - def __init__(self, width, height, screen_width=1080, screen_height=720, - colors=None, name='PufferLib Raylib Game'): - self.client = GridRender(width, height, - screen_width, screen_height, colors, name) + def __init__(self, width, height, screen_width=1080, screen_height=720, colors=None, name="PufferLib Raylib Game"): + self.client = GridRender(width, height, 
screen_width, screen_height, colors, name) def render(self, grid, x, y): self.client.camera.target.x = x self.client.camera.target.y = y return self.client.render(grid) + class TestGameRender: - def __init__(self, width, height, colors=None, - tile_size=16, name='PufferLib Raylib Game'): + def __init__(self, width, height, colors=None, tile_size=16, name="PufferLib Raylib Game"): assert width % tile_size == 0 assert height % tile_size == 0 assert (width // tile_size) % 2 == 1 @@ -239,9 +246,8 @@ def render(self, grid, agent_positions): dy = self.y_tiles // 2 -if __name__ == '__main__': +if __name__ == "__main__": renderer = GridRender(256, 256) grid = np.random.randint(0, 3, (256, 256), dtype=np.uint8) while True: frame = renderer.render(grid) - diff --git a/pufferlib/ocean/rocket_lander/rocket_lander.py b/pufferlib/ocean/rocket_lander/rocket_lander.py index aee2f67a8..49e81879d 100644 --- a/pufferlib/ocean/rocket_lander/rocket_lander.py +++ b/pufferlib/ocean/rocket_lander/rocket_lander.py @@ -1,20 +1,19 @@ -'''High-perf Pong +"""High-perf Pong Inspired from https://gist.github.com/Yttrmin/18ecc3d2d68b407b4be1 & https://jair.org/index.php/jair/article/view/10819/25823 & https://www.youtube.com/watch?v=PSQt5KGv7Vk -''' +""" -import numpy as np import gymnasium - +import numpy as np import pufferlib from pufferlib.ocean.rocket_lander.cy_rocket_lander import CyRocketLander + class RocketLander(pufferlib.PufferEnv): def __init__(self, num_envs=1, render_mode=None, report_interval=32, buf=None): - self.single_observation_space = gymnasium.spaces.Box(low=0, high=1, - shape=(6,), dtype=np.float32) + self.single_observation_space = gymnasium.spaces.Box(low=0, high=1, shape=(6,), dtype=np.float32) self.single_action_space = gymnasium.spaces.Discrete(4) self.render_mode = render_mode self.num_agents = num_envs @@ -22,9 +21,10 @@ def __init__(self, num_envs=1, render_mode=None, report_interval=32, buf=None): super().__init__(buf) self.float_actions = np.zeros((num_envs, 3), dtype=np.float32) - self.c_envs = CyRocketLander(self.observations, self.float_actions, self.rewards, - self.terminals, self.truncations, num_envs) - + self.c_envs = CyRocketLander( + self.observations, self.float_actions, self.rewards, self.terminals, self.truncations, num_envs + ) + def reset(self, seed=None): self.tick = 0 self.c_envs.reset() @@ -40,12 +40,11 @@ def step(self, actions): info = [] if self.tick % self.report_interval == 0: log = self.c_envs.log() - if log['episode_length'] > 0: + if log["episode_length"] > 0: info.append(log) self.tick += 1 - return (self.observations, self.rewards, - self.terminals, self.truncations, info) + return (self.observations, self.rewards, self.terminals, self.truncations, info) def render(self): self.c_envs.render() @@ -53,6 +52,7 @@ def render(self): def close(self): self.c_envs.close() + def test_performance(timeout=10, atn_cache=1024): env = RocketLander(num_envs=1000) env.reset() @@ -61,13 +61,15 @@ def test_performance(timeout=10, atn_cache=1024): actions = np.random.randint(0, 2, (atn_cache, env.num_envs)) import time + start = time.time() while time.time() - start < timeout: atn = actions[tick % atn_cache] env.step(atn) tick += 1 - print(f'SPS: %f', env.num_envs * tick / (time.time() - start)) + print("SPS: %f", env.num_envs * tick / (time.time() - start)) + -if __name__ == '__main__': +if __name__ == "__main__": test_performance() diff --git a/pufferlib/ocean/rware/rware.py b/pufferlib/ocean/rware/rware.py index f0c6cfb96..93b18fe43 100644 --- a/pufferlib/ocean/rware/rware.py 
+++ b/pufferlib/ocean/rware/rware.py @@ -1,43 +1,58 @@ -'''High-perf Pong +"""High-perf Pong Inspired from https://gist.github.com/Yttrmin/18ecc3d2d68b407b4be1 & https://jair.org/index.php/jair/article/view/10819/25823 & https://www.youtube.com/watch?v=PSQt5KGv7Vk -''' +""" -import numpy as np import gymnasium - +import numpy as np import pufferlib from pufferlib.ocean.rware.cy_rware import CyRware PLAYER_OBS_N = 27 -class Rware(pufferlib.PufferEnv): - def __init__(self, num_envs=1, render_mode=None, report_interval=1, - width=1280, height=1024, - num_agents=4, - map_choice=1, - num_requested_shelves=4, - grid_square_size=64, - human_agent_idx=0, - reward_type=1, - buf = None): +class Rware(pufferlib.PufferEnv): + def __init__( + self, + num_envs=1, + render_mode=None, + report_interval=1, + width=1280, + height=1024, + num_agents=4, + map_choice=1, + num_requested_shelves=4, + grid_square_size=64, + human_agent_idx=0, + reward_type=1, + buf=None, + ): # env - self.num_agents = num_envs*num_agents + self.num_agents = num_envs * num_agents self.render_mode = render_mode self.report_interval = report_interval - + self.num_obs = 27 - self.single_observation_space = gymnasium.spaces.Box(low=0, high=1, - shape=(self.num_obs,), dtype=np.float32) + self.single_observation_space = gymnasium.spaces.Box(low=0, high=1, shape=(self.num_obs,), dtype=np.float32) self.single_action_space = gymnasium.spaces.Discrete(5) super().__init__(buf=buf) - self.c_envs = CyRware(self.observations, self.actions, self.rewards, - self.terminals, num_envs, width, height, map_choice, num_agents, num_requested_shelves, grid_square_size, human_agent_idx) - + self.c_envs = CyRware( + self.observations, + self.actions, + self.rewards, + self.terminals, + num_envs, + width, + height, + map_choice, + num_agents, + num_requested_shelves, + grid_square_size, + human_agent_idx, + ) def reset(self, seed=None): self.c_envs.reset() @@ -52,26 +67,27 @@ def step(self, actions): info = [] if self.tick % self.report_interval == 0: log = self.c_envs.log() - if log['episode_length'] > 0: + if log["episode_length"] > 0: info.append(log) - return (self.observations, self.rewards, - self.terminals, self.truncations, info) + return (self.observations, self.rewards, self.terminals, self.truncations, info) def render(self): self.c_envs.render() - + def close(self): - self.c_envs.close() + self.c_envs.close() + def test_performance(timeout=10, atn_cache=1024): - num_envs=1000; + num_envs = 1000 env = MyRware(num_envs=num_envs) env.reset() tick = 0 - actions = np.random.randint(0, env.single_action_space.n, (atn_cache, 5*num_envs)) + actions = np.random.randint(0, env.single_action_space.n, (atn_cache, 5 * num_envs)) import time + start = time.time() while time.time() - start < timeout: atn = actions[tick % atn_cache] @@ -79,6 +95,8 @@ def test_performance(timeout=10, atn_cache=1024): tick += 1 sps = num_envs * tick / (time.time() - start) - print(f'SPS: {sps:,}') -if __name__ == '__main__': + print(f"SPS: {sps:,}") + + +if __name__ == "__main__": test_performance() diff --git a/pufferlib/ocean/sanity.py b/pufferlib/ocean/sanity.py index a90d1f094..02157f769 100644 --- a/pufferlib/ocean/sanity.py +++ b/pufferlib/ocean/sanity.py @@ -1,12 +1,13 @@ -import gymnasium -import pettingzoo -import numpy as np import random import time +import gymnasium +import numpy as np +import pettingzoo + class Bandit(gymnasium.Env): - '''Pufferlib Bandit environment + """Pufferlib Bandit environment Simulates a classic multiarmed bandit problem. 
@@ -17,18 +18,17 @@ class Bandit(gymnasium.Env): reward_scale: The scale of the reward reward_noise: The standard deviation of the reward signal hard_fixed_seed: All instances of the environment should share the same seed. - ''' - def __init__(self, num_actions=4, reward_scale=1, - reward_noise=0, hard_fixed_seed=42): + """ + + def __init__(self, num_actions=4, reward_scale=1, reward_noise=0, hard_fixed_seed=42): self.num_actions = num_actions self.reward_scale = reward_scale self.reward_noise = reward_noise self.hard_fixed_seed = hard_fixed_seed - self.observation=np.ones(1, dtype=np.float32) - self.observation_space=gymnasium.spaces.Box( - low=-1, high=1, shape=(1,)) - self.action_space=gymnasium.spaces.Discrete(num_actions) - self.render_mode = 'ansi' + self.observation = np.ones(1, dtype=np.float32) + self.observation_space = gymnasium.spaces.Box(low=-1, high=1, shape=(1,)) + self.action_space = gymnasium.spaces.Discrete(num_actions) + self.render_mode = "ansi" def reset(self, seed=None): # Bandit problem requires a single fixed seed @@ -59,10 +59,11 @@ def step(self, action): # Couples reward noise to scale reward = (reward + reward_noise) * self.reward_scale - return self.observation, reward, True, False, {'score': correct} + return self.observation, reward, True, False, {"score": correct} + class Memory(gymnasium.Env): - '''Pufferlib Memory environment + """Pufferlib Memory environment Repeat the observed sequence after a delay. It is randomly generated upon every reset. This is a test of memory length and capacity. It starts requiring credit assignment if you make the sequence too long. @@ -74,15 +75,15 @@ class Memory(gymnasium.Env): Args: mem_length: The length of the sequence mem_delay: The number of 0s between the sequence and the agent's response - ''' + """ + def __init__(self, mem_length=1, mem_delay=0): self.mem_length = mem_length self.mem_delay = mem_delay self.horizon = 2 * mem_length + mem_delay - self.observation_space=gymnasium.spaces.Box( - low=-1, high=1, shape=(1,)) - self.action_space=gymnasium.spaces.Discrete(2) - self.render_mode = 'ansi' + self.observation_space = gymnasium.spaces.Box(low=-1, high=1, shape=(1,)) + self.action_space = gymnasium.spaces.Discrete(2) + self.render_mode = "ansi" def reset(self, seed=None): if seed is not None: @@ -90,7 +91,7 @@ def reset(self, seed=None): np.random.seed(seed) self.solution = np.random.randint(0, 2, size=self.horizon).astype(np.float32) - self.solution[-(self.mem_length + self.mem_delay):] = -1 + self.solution[-(self.mem_length + self.mem_delay) :] = -1 self.submission = np.zeros(self.horizon) - 1 self.tick = 1 @@ -117,8 +118,7 @@ def step(self, action): info = {} if terminal: - info['score'] = np.all( - self.solution[:self.mem_length] == self.submission[-self.mem_length:]) + info["score"] = np.all(self.solution[: self.mem_length] == self.submission[-self.mem_length :]) return ob, reward, terminal, False, info @@ -130,30 +130,31 @@ def _render(val): c = 91 else: c = 90 - return f'\033[{c}m██\033[0m' + return f"\033[{c}m██\033[0m" chars = [] for val in self.solution: c = _render(val) chars.append(c) - chars.append(' Solution\n') + chars.append(" Solution\n") for val in self.submission: c = _render(val) chars.append(c) - chars.append(' Prediction\n') + chars.append(" Prediction\n") - return ''.join(chars) + return "".join(chars) class Multiagent(pettingzoo.ParallelEnv): - '''Pufferlib Multiagent environment + """Pufferlib Multiagent environment Agent 1 must pick action 0 and Agent 2 must pick action 1 Observation space: 
Box(0, 1, (1,)). 0 for Agent 1 and 1 for Agent 2 Action space: Discrete(2). Which action to take. - ''' + """ + def __init__(self): self.observation = { 1: np.zeros(1, dtype=np.float32), @@ -167,20 +168,19 @@ def __init__(self): 1: False, 2: False, } - self.possible_agents=[1, 2] - self.agents=[1, 2] - self.render_mode = 'ansi' + self.possible_agents = [1, 2] + self.agents = [1, 2] + self.render_mode = "ansi" def observation_space(self, agent): - return gymnasium.spaces.Box( - low=0, high=1, shape=(1,)) + return gymnasium.spaces.Box(low=0, high=1, shape=(1,)) def action_space(self, agent): return gymnasium.spaces.Discrete(2) def reset(self, seed=None): # Reallocating is faster than zeroing - self.view=np.zeros((2, 5), dtype=np.float32) + self.view = np.zeros((2, 5), dtype=np.float32) return self.observation, {} def step(self, action): @@ -202,8 +202,8 @@ def step(self, action): reward[2] = 0 info = { - 1: {'score': reward[1]}, - 2: {'score': reward[2]}, + 1: {"score": reward[1]}, + 2: {"score": reward[2]}, } return self.observation, reward, self.terminal, self.truncated, info @@ -215,22 +215,23 @@ def _render(val): c = 90 else: c = 90 - return f'\033[{c}m██\033[0m' + return f"\033[{c}m██\033[0m" chars = [] for row in self.view: for val in row: c = _render(val) chars.append(c) - chars.append('\n') - return ''.join(chars) + chars.append("\n") + return "".join(chars) + class Password(gymnasium.Env): - '''Pufferlib Password environment + """Pufferlib Password environment Guess the password, which is a static binary string. Your policy has to not determinize before it happens to get the reward, and it also has to - latch onto the reward within a few instances of getting it. + latch onto the reward within a few instances of getting it. Observation space: Box(0, 1, (password_length,)). A binary vector containing your guesses so far, so that the environment will be solvable without memory. Action space: Discrete(2). Your guess for the next digit. @@ -238,15 +239,14 @@ class Password(gymnasium.Env): Args: password_length: The number of binary digits in the password. hard_fixed_seed: A fixed seed for the environment. It should be the same for all instances. This environment does not make sense when randomly generated. 
- ''' - + """ + def __init__(self, password_length=5, hard_fixed_seed=42): self.password_length = password_length self.hard_fixed_seed = hard_fixed_seed - self.observation_space=gymnasium.spaces.Box( - low=-1, high=1, shape=(password_length,)) - self.action_space=gymnasium.spaces.Discrete(2) - self.render_mode = 'ansi' + self.observation_space = gymnasium.spaces.Box(low=-1, high=1, shape=(password_length,)) + self.action_space = gymnasium.spaces.Discrete(2) + self.render_mode = "ansi" def reset(self, seed=None): # Bandit problem requires a single fixed seed @@ -257,8 +257,7 @@ def reset(self, seed=None): np.random.seed(seed) self.observation = np.zeros(self.password_length, dtype=np.float32) - 1 - self.solution = np.random.randint( - 0, 2, size=self.password_length).astype(np.float32) + self.solution = np.random.randint(0, 2, size=self.password_length).astype(np.float32) self.tick = 0 return self.observation, {} @@ -276,7 +275,7 @@ def step(self, action): if terminal: reward = float(np.all(self.observation == self.solution)) - info['score'] = reward + info["score"] = reward return self.observation, reward, terminal, False, info @@ -288,32 +287,30 @@ def _render(val): c = 91 else: c = 90 - return f'\033[{c}m██\033[0m' + return f"\033[{c}m██\033[0m" chars = [] for val in self.solution: c = _render(val) chars.append(c) - chars.append(' Solution\n') + chars.append(" Solution\n") for val in self.observation: c = _render(val) chars.append(c) - chars.append(' Prediction\n') + chars.append(" Prediction\n") + + return "".join(chars) - return ''.join(chars) class Performance(gymnasium.Env): def __init__(self, delay_mean=0, delay_std=0, bandwidth=1): np.random.seed(time.time_ns() % 2**32) - self.observation_space = gymnasium.spaces.Box( - low=-2**20, high=2**20, - shape=(bandwidth,), dtype=np.float32 - ) + self.observation_space = gymnasium.spaces.Box(low=-(2**20), high=2**20, shape=(bandwidth,), dtype=np.float32) self.action_space = gymnasium.spaces.Discrete(2) self.observation = self.observation_space.sample() - self.render_mode = 'ansi' + self.render_mode = "ansi" def reset(self, seed=None): return self.observation, {} @@ -321,40 +318,39 @@ def reset(self, seed=None): def step(self, action): start = time.process_time() idx = 0 - target_time = self.delay_mean + self.delay_std*np.random.randn() + target_time = self.delay_mean + self.delay_std * np.random.randn() while time.process_time() - start < target_time: idx += 1 return self.observation, 0, False, False, {} + class PerformanceEmpiric(gymnasium.Env): def __init__(self, count_n=0, count_std=0, bandwidth=1): np.random.seed(time.time_ns() % 2**32) - self.observation_space = gymnasium.spaces.Box( - low=-2**20, high=2**20, - shape=(bandwidth,), dtype=np.float32 - ) + self.observation_space = gymnasium.spaces.Box(low=-(2**20), high=2**20, shape=(bandwidth,), dtype=np.float32) self.action_space = gymnasium.spaces.Discrete(2) self.observation = self.observation_space.sample() self.count_n = count_n self.count_std = count_std self.bandwidth = bandwidth - self.render_mode = 'ansi' + self.render_mode = "ansi" def reset(self, seed=None): return self.observation, {} def step(self, action): idx = 0 - target = self.count_n + self.count_std * np.random.randn() + target = self.count_n + self.count_std * np.random.randn() while idx < target: idx += 1 return self.observation, 0, False, False, {} + class Spaces(gymnasium.Env): - '''Pufferlib Spaces environment + """Pufferlib Spaces environment A simple environment with hierarchical observation and action spaces @@ 
-364,47 +360,51 @@ class Spaces(gymnasium.Env): 0.5 reward is given for each correct action Does not provide rendering - ''' + """ + def __init__(self): - self.observation_space = gymnasium.spaces.Dict({ - 'image': gymnasium.spaces.Box( - low=0, high=1, shape=(5, 5), dtype=np.float32), - 'flat': gymnasium.spaces.Box( - low=-1, high=1, shape=(5,), dtype=np.int8), - }) - self.action_space = gymnasium.spaces.Dict({ - 'image': gymnasium.spaces.Discrete(2), - 'flat': gymnasium.spaces.Discrete(2), - }) - self.render_mode = 'ansi' + self.observation_space = gymnasium.spaces.Dict( + { + "image": gymnasium.spaces.Box(low=0, high=1, shape=(5, 5), dtype=np.float32), + "flat": gymnasium.spaces.Box(low=-1, high=1, shape=(5,), dtype=np.int8), + } + ) + self.action_space = gymnasium.spaces.Dict( + { + "image": gymnasium.spaces.Discrete(2), + "flat": gymnasium.spaces.Discrete(2), + } + ) + self.render_mode = "ansi" def reset(self, seed=None): self.observation = { - 'image': np.random.rand(5, 5).astype(np.float32), - 'flat': np.random.randint(-1, 2, (5,), dtype=np.int8), + "image": np.random.rand(5, 5).astype(np.float32), + "flat": np.random.randint(-1, 2, (5,), dtype=np.int8), } - self.image_sign = np.sum(self.observation['image']) > 0 - self.flat_sign = np.sum(self.observation['flat']) > 0 + self.image_sign = np.sum(self.observation["image"]) > 0 + self.flat_sign = np.sum(self.observation["flat"]) > 0 return self.observation, {} def step(self, action): assert isinstance(action, dict) - assert 'image' in action and action['image'] in (0, 1) - assert 'flat' in action and action['flat'] in (0, 1) + assert "image" in action and action["image"] in (0, 1) + assert "flat" in action and action["flat"] in (0, 1) reward = 0 - if self.image_sign == action['image']: + if self.image_sign == action["image"]: reward += 0.5 - if self.flat_sign == action['flat']: + if self.flat_sign == action["flat"]: reward += 0.5 info = dict(score=reward) return self.observation, reward, True, False, info + class Squared(gymnasium.Env): - '''Pufferlib Squared environment + """Pufferlib Squared environment Agent starts at the center of a square grid. Targets are placed on the perimeter of the grid. @@ -418,15 +418,16 @@ class Squared(gymnasium.Env): Args: distance_to_target: The distance from the center to the closest target. num_targets: The number of targets to randomly generate. 
- - ''' + + """ MOVES = [(0, -1), (0, 1), (-1, 0), (1, 0), (1, -1), (-1, -1), (1, 1), (-1, 1)] - def __init__(self, + def __init__( + self, distance_to_target=1, num_targets=-1, - ): + ): grid_size = 2 * distance_to_target + 1 if num_targets == -1: num_targets = 4 * distance_to_target @@ -436,14 +437,17 @@ def __init__(self, self.num_targets = num_targets self.grid_size = grid_size self.max_ticks = num_targets * distance_to_target - self.observation_space = gymnasium.spaces.Box( - low=-1, high=1, shape=(grid_size, grid_size)) + self.observation_space = gymnasium.spaces.Box(low=-1, high=1, shape=(grid_size, grid_size)) self.action_space = gymnasium.spaces.Discrete(8) - self.render_mode = 'ansi' + self.render_mode = "ansi" def _all_possible_targets(self, grid_size): - return [(x, y) for x in range(grid_size) for y in range(grid_size) - if x == 0 or y == 0 or x == grid_size - 1 or y == grid_size - 1] + return [ + (x, y) + for x in range(grid_size) + for y in range(grid_size) + if x == 0 or y == 0 or x == grid_size - 1 or y == grid_size - 1 + ] def reset(self, seed=None): if seed is not None: @@ -470,7 +474,7 @@ def step(self, action): x += dx y += dy - min_dist = min([max(abs(x-tx), abs(y-ty)) for tx, ty in self.targets]) + min_dist = min([max(abs(x - tx), abs(y - ty)) for tx, ty in self.targets]) # This reward function will return 0.46 average reward for an unsuccessful # episode with distance_to_target=4 and num_targets=1 (0.5 for solve) # It looks reasonable but is not very discriminative @@ -479,36 +483,35 @@ def step(self, action): # This reward function will return 1 when the agent moves in the right direction # (plus an adjustment for the 0 reset reward) to average 1 for success # It is not much better than the previous one. - #reward = state.distance_to_target - min_dist - state.tick + 1/state.max_ticks + # reward = state.distance_to_target - min_dist - state.tick + 1/state.max_ticks # This function will return 0, 0.2, 0.4, ... 
1 for successful episodes (n=5) # And will drop rewards to 0 or less as soon as an error is made # Somewhat smoother but actually worse than the previous ones # reward = (state.distance_to_target - min_dist - state.tick) / (state.max_ticks - state.tick) - # This one nicely tracks the task completed metric but does not optimize well - #if state.distance_to_target - min_dist - state.tick == 1: + # if state.distance_to_target - min_dist - state.tick == 1: # reward = 1 - #else: + # else: # reward = -state.tick if (x, y) in self.targets: self.targets.remove((x, y)) - #state.grid[x, y] = 0 + # state.grid[x, y] = 0 - dist_from_origin = max(abs(x-self.distance_to_target), abs(y-self.distance_to_target)) + dist_from_origin = max(abs(x - self.distance_to_target), abs(y - self.distance_to_target)) if dist_from_origin >= self.distance_to_target: self.agent_pos = self.distance_to_target, self.distance_to_target else: self.agent_pos = x, y - + self.grid[self.agent_pos] = -1 self.tick += 1 done = self.tick >= self.max_ticks score = (self.num_targets - len(self.targets)) / self.num_targets - info = {'score': score} if done else {} + info = {"score": score} if done else {} return self.grid, reward, done, False, info @@ -522,12 +525,13 @@ def render(self): color = 91 else: color = 90 - chars.append(f'\033[{color}m██\033[0m') - chars.append('\n') - return ''.join(chars) + chars.append(f"\033[{color}m██\033[0m") + chars.append("\n") + return "".join(chars) + class Stochastic(gymnasium.Env): - '''Pufferlib Stochastic environment + """Pufferlib Stochastic environment The optimal policy is to play action 0 < p % of the time and action 1 < (1 - p) % This is a test of whether your algorithm can learn a nontrivial stochastic policy. @@ -539,14 +543,14 @@ class Stochastic(gymnasium.Env): Args: p: The optimal probability for action 0 horizon: How often the environment should reset - ''' + """ + def __init__(self, p=0.75, horizon=1000): self.p = p self.horizon = horizon - self.observation_space = gymnasium.spaces.Box( - low=0, high=1, shape=(1,)) + self.observation_space = gymnasium.spaces.Box(low=0, high=1, shape=(1,)) self.action_space = gymnasium.spaces.Discrete(2) - self.render_mode = 'ansi' + self.render_mode = "ansi" def reset(self, seed=None): if seed is not None: @@ -569,15 +573,15 @@ def step(self, action): terminal = self.tick == self.horizon atn0_frac = self.count / self.tick - proximity_to_p = 1 - (self.p - atn0_frac)**2 + proximity_to_p = 1 - (self.p - atn0_frac) ** 2 - reward = proximity_to_p if ( - (action == 0 and atn0_frac < self.p) or - (action == 1 and atn0_frac >= self.p)) else 0 + reward = ( + proximity_to_p if ((action == 0 and atn0_frac < self.p) or (action == 1 and atn0_frac >= self.p)) else 0 + ) info = {} if terminal: - info['score'] = proximity_to_p + info["score"] = proximity_to_p return np.zeros(1, dtype=np.float32), reward, terminal, False, info @@ -589,37 +593,37 @@ def _render(val): c = 91 else: c = 90 - return f'\033[{c}m██\033[0m' + return f"\033[{c}m██\033[0m" + chars = [] if self.tick == 0: solution = 0 else: solution = 0 if self.count / self.tick < self.p else 1 chars.append(_render(solution)) - chars.append(' Solution\n') + chars.append(" Solution\n") chars.append(_render(self.action)) - chars.append(' Prediction\n') + chars.append(" Prediction\n") + + return "".join(chars) - return ''.join(chars) class Continuous(gymnasium.Env): def __init__(self, discretize=False): - self.observation_space=gymnasium.spaces.Box( - low=-1, high=1, shape=(6,)) + self.observation_space = 
gymnasium.spaces.Box(low=-1, high=1, shape=(6,)) self.discretize = discretize if discretize: - self.action_space=gymnasium.spaces.Discrete(4) + self.action_space = gymnasium.spaces.Discrete(4) else: - self.action_space=gymnasium.spaces.Box( - low=-1, high=1, shape=(2,)) + self.action_space = gymnasium.spaces.Box(low=-1, high=1, shape=(2,)) - self.render_mode = 'human' + self.render_mode = "human" self.client = None def reset(self, seed=None, options=None): # pos_x, pos_y, vel_x, vel_y, target_x, target_y - self.state = 2*np.random.rand(6)-1 + self.state = 2 * np.random.rand(6) - 1 self.state[2:4] = 0 self.tick = 0 @@ -637,7 +641,7 @@ def step(self, action): elif action == 3: accel_y = 0.1 else: - accel_x, accel_y = 0.1*action + accel_x, accel_y = 0.1 * action self.state[2] += accel_x self.state[3] += accel_y @@ -647,9 +651,9 @@ def step(self, action): pos_x, pos_y, vel_x, vel_y, target_x, target_y = self.state if pos_x < -1 or pos_x > 1 or pos_y < -1 or pos_y > 1: - return self.state, -1, True, False, {'score': 0} + return self.state, -1, True, False, {"score": 0} - dist = np.sqrt((pos_x - target_x)**2 + (pos_y - target_y)**2) + dist = np.sqrt((pos_x - target_x) ** 2 + (pos_y - target_y) ** 2) reward = 0.02 * (1 - dist) self.tick += 1 @@ -661,10 +665,10 @@ def step(self, action): info = {} if done: reward = 5.0 - info = {'score': 1} + info = {"score": 1} elif truncated: reward = 0.0 - info = {'score': 0} + info = {"score": 0} return self.state, reward, done, truncated, info @@ -676,6 +680,7 @@ def render(self): frame, atn = self.client.render(pos_x, pos_y, target_x, target_y) return frame + class RaylibClient: def __init__(self, width=1080, height=720, size=20): self.width = width @@ -683,20 +688,20 @@ def __init__(self, width=1080, height=720, size=20): self.size = size from raylib import rl - rl.InitWindow(width, height, - "PufferLib Simple Continuous".encode()) + + rl.InitWindow(width, height, "PufferLib Simple Continuous".encode()) rl.SetTargetFPS(10) self.rl = rl from cffi import FFI + self.ffi = FFI() def _cdata_to_numpy(self): image = self.rl.LoadImageFromScreen() width, height, channels = image.width, image.height, 4 - cdata = self.ffi.buffer(image.data, width*height*channels) - return np.frombuffer(cdata, dtype=np.uint8 - ).reshape((height, width, channels))[:, :, :3] + cdata = self.ffi.buffer(image.data, width * height * channels) + return np.frombuffer(cdata, dtype=np.uint8).reshape((height, width, channels))[:, :, :3] def render(self, pos_x, pos_y, target_x, target_y): rl = self.rl @@ -713,10 +718,10 @@ def render(self, pos_x, pos_y, target_x, target_y): rl.BeginDrawing() rl.ClearBackground([6, 24, 24, 255]) - pos_x = int((0.5+pos_x/2) * self.width) - pos_y = int((0.5+pos_y/2) * self.height) - target_x = int((0.5+target_x/2) * self.width) - target_y = int((0.5+target_y/2) * self.height) + pos_x = int((0.5 + pos_x / 2) * self.width) + pos_y = int((0.5 + pos_y / 2) * self.height) + target_x = int((0.5 + target_x / 2) * self.width) + target_y = int((0.5 + target_y / 2) * self.height) rl.DrawCircle(pos_x, pos_y, self.size, [255, 0, 0, 255]) rl.DrawCircle(target_x, target_y, self.size, [0, 0, 255, 255]) diff --git a/pufferlib/ocean/snake/snake.py b/pufferlib/ocean/snake/snake.py index 05f7d1cb1..d153761db 100644 --- a/pufferlib/ocean/snake/snake.py +++ b/pufferlib/ocean/snake/snake.py @@ -1,41 +1,51 @@ -'''High-perf many-agent snake. Inspired by snake env from https://github.com/dnbt777''' +"""High-perf many-agent snake. 
Inspired by snake env from https://github.com/dnbt777""" -import numpy as np import gymnasium +import numpy as np import pufferlib from pufferlib.exceptions import APIUsageError from pufferlib.ocean.snake.cy_snake import CySnake -class Snake(pufferlib.PufferEnv): - def __init__(self, widths=16*[640], heights=16*[360], - num_snakes=16*[256], num_food=16*[4096], - vision=5, leave_corpse_on_death=True, - reward_food=0.1, reward_corpse=0.1, reward_death=-1.0, - report_interval=128, max_snake_length=1024, - render_mode='human', buf=None): +class Snake(pufferlib.PufferEnv): + def __init__( + self, + widths=16 * [640], + heights=16 * [360], + num_snakes=16 * [256], + num_food=16 * [4096], + vision=5, + leave_corpse_on_death=True, + reward_food=0.1, + reward_corpse=0.1, + reward_death=-1.0, + report_interval=128, + max_snake_length=1024, + render_mode="human", + buf=None, + ): if not isinstance(vision, int): - raise APIUsageError('vision must be an integer') + raise APIUsageError("vision must be an integer") if isinstance(leave_corpse_on_death, bool): - leave_corpse_on_death = len(widths)*[leave_corpse_on_death] + leave_corpse_on_death = len(widths) * [leave_corpse_on_death] - valid_params = (len(widths) == len(heights) == len(num_snakes) - == len(num_food) == len(leave_corpse_on_death)) + valid_params = len(widths) == len(heights) == len(num_snakes) == len(num_food) == len(leave_corpse_on_death) if not valid_params: - raise APIUsageError('widths, heights, num_snakes, num_food must be lists of equal length') + raise APIUsageError("widths, heights, num_snakes, num_food must be lists of equal length") - for w, h in zip(widths, heights): - if w < 2*vision+2 or h < 2*vision+2: - raise APIUsageError('width and height must be at least 2*vision+2') + for w, h in zip(widths, heights, strict=False): + if w < 2 * vision + 2 or h < 2 * vision + 2: + raise APIUsageError("width and height must be at least 2*vision+2") - max_area = max([w*h for h, w in zip(heights, widths)]) + max_area = max([w * h for h, w in zip(heights, widths, strict=False)]) self.max_snake_length = min(max_snake_length, max_area) self.report_interval = report_interval # This block required by advanced PufferLib env spec self.single_observation_space = gymnasium.spaces.Box( - low=0, high=2, shape=(2*vision+1, 2*vision+1), dtype=np.int8) + low=0, high=2, shape=(2 * vision + 1, 2 * vision + 1), dtype=np.int8 + ) self.single_action_space = gymnasium.spaces.Discrete(4) self.num_agents = sum(num_snakes) self.render_mode = render_mode @@ -44,12 +54,23 @@ def __init__(self, widths=16*[640], heights=16*[360], self.cell_size = int(np.ceil(1280 / max(max(widths), max(heights)))) super().__init__(buf) - self.c_envs = CySnake(self.observations, self.actions, - self.rewards, self.terminals, widths, heights, - num_snakes, num_food, vision, max_snake_length, - leave_corpse_on_death, reward_food, reward_corpse, - reward_death) - + self.c_envs = CySnake( + self.observations, + self.actions, + self.rewards, + self.terminals, + widths, + heights, + num_snakes, + num_food, + vision, + max_snake_length, + leave_corpse_on_death, + reward_food, + reward_corpse, + reward_death, + ) + def reset(self, seed=None): self.c_envs.reset() self.tick = 0 @@ -62,11 +83,10 @@ def step(self, actions): info = [] if self.tick % self.report_interval == 0: log = self.c_envs.log() - if log['episode_length'] > 0: + if log["episode_length"] > 0: info.append(log) - return (self.observations, self.rewards, - self.terminals, self.truncations, info) + return (self.observations, 
self.rewards, self.terminals, self.truncations, info) def render(self): self.c_envs.render(self.cell_size) @@ -74,6 +94,7 @@ def render(self): def close(self): self.c_envs.close() + def test_performance(timeout=10, atn_cache=1024): env = Snake() env.reset() @@ -83,13 +104,15 @@ def test_performance(timeout=10, atn_cache=1024): actions = np.random.randint(0, 4, (atn_cache, total_snakes)) import time + start = time.time() while time.time() - start < timeout: atns = actions[tick % atn_cache] env.step(atns) tick += 1 - print(f'SPS: %f', total_snakes * tick / (time.time() - start)) + print("SPS: %f", total_snakes * tick / (time.time() - start)) + -if __name__ == '__main__': +if __name__ == "__main__": test_performance() diff --git a/pufferlib/ocean/squared/pysquared.py b/pufferlib/ocean/squared/pysquared.py index 59c004f10..3611217e1 100644 --- a/pufferlib/ocean/squared/pysquared.py +++ b/pufferlib/ocean/squared/pysquared.py @@ -1,10 +1,8 @@ -'''A simple sample environment. Use this as a template for your own envs.''' +"""A simple sample environment. Use this as a template for your own envs.""" import gymnasium import numpy as np - import pufferlib -from pufferlib.ocean.squared.cy_squared import CySquared NOOP = 0 DOWN = 1 @@ -16,10 +14,10 @@ AGENT = 1 TARGET = 2 + class PySquared(pufferlib.PufferEnv): - def __init__(self, num_envs=1, render_mode='ansi', size=11, buf=None): - self.single_observation_space = gymnasium.spaces.Box(low=0, high=1, - shape=(size*size,), dtype=np.uint8) + def __init__(self, num_envs=1, render_mode="ansi", size=11, buf=None): + self.single_observation_space = gymnasium.spaces.Box(low=0, high=1, shape=(size * size,), dtype=np.uint8) self.single_action_space = gymnasium.spaces.Discrete(5) self.render_mode = render_mode self.num_agents = 1 @@ -29,14 +27,14 @@ def __init__(self, num_envs=1, render_mode='ansi', size=11, buf=None): def reset(self, seed=None): self.observations[0, :] = EMPTY - self.observations[0, self.size*self.size//2] = AGENT - self.r = self.size//2 - self.c = self.size//2 + self.observations[0, self.size * self.size // 2] = AGENT + self.r = self.size // 2 + self.c = self.size // 2 self.tick = 0 while True: target_r, target_c = np.random.randint(0, self.size, 2) if target_r != self.r or target_c != self.c: - self.observations[0, target_r*self.size + target_c] = TARGET + self.observations[0, target_r * self.size + target_c] = TARGET break return self.observations, [] @@ -46,7 +44,7 @@ def step(self, actions): self.terminals[0] = False self.rewards[0] = 0 - self.observations[0, self.r*self.size + self.c] = EMPTY + self.observations[0, self.r * self.size + self.c] = EMPTY if atn == DOWN: self.r += 1 @@ -58,20 +56,16 @@ def step(self, actions): self.c -= 1 info = [] - pos = self.r*self.size + self.c - if (self.tick > 3*self.size - or self.r < 0 - or self.c < 0 - or self.r >= self.size - or self.c >= self.size): + pos = self.r * self.size + self.c + if self.tick > 3 * self.size or self.r < 0 or self.c < 0 or self.r >= self.size or self.c >= self.size: self.terminals[0] = True self.rewards[0] = -1.0 - info = {'reward': -1.0} + info = {"reward": -1.0} self.reset() elif self.observations[0, pos] == TARGET: self.terminals[0] = True self.rewards[0] = 1.0 - info = {'reward': 1.0} + info = {"reward": 1.0} self.reset() else: self.observations[0, pos] = AGENT @@ -90,9 +84,9 @@ def render(self): color = 91 else: color = 90 - chars.append(f'\033[{color}m██\033[0m') - chars.append('\n') - return ''.join(chars) + chars.append(f"\033[{color}m██\033[0m") + chars.append("\n") 
+ return "".join(chars) def close(self): pass diff --git a/pufferlib/ocean/squared/squared.py b/pufferlib/ocean/squared/squared.py index 29a3c84d3..b39cfbf29 100644 --- a/pufferlib/ocean/squared/squared.py +++ b/pufferlib/ocean/squared/squared.py @@ -1,24 +1,21 @@ -'''A simple sample environment. Use this as a template for your own envs.''' +"""A simple sample environment. Use this as a template for your own envs.""" import gymnasium import numpy as np - import pufferlib from pufferlib.ocean.squared.cy_squared import CySquared class Squared(pufferlib.PufferEnv): def __init__(self, num_envs=1, render_mode=None, size=11, buf=None): - self.single_observation_space = gymnasium.spaces.Box(low=0, high=1, - shape=(size*size,), dtype=np.uint8) + self.single_observation_space = gymnasium.spaces.Box(low=0, high=1, shape=(size * size,), dtype=np.uint8) self.single_action_space = gymnasium.spaces.Discrete(5) self.render_mode = render_mode self.num_agents = num_envs super().__init__(buf) - self.c_envs = CySquared(self.observations, self.actions, - self.rewards, self.terminals, num_envs, size) - + self.c_envs = CySquared(self.observations, self.actions, self.rewards, self.terminals, num_envs, size) + def reset(self, seed=None): self.c_envs.reset() return self.observations, [] @@ -31,12 +28,13 @@ def step(self, actions): info = [] if len(episode_returns) > 0: - info = [{ - 'reward': np.mean(episode_returns), - }] + info = [ + { + "reward": np.mean(episode_returns), + } + ] - return (self.observations, self.rewards, - self.terminals, self.truncations, info) + return (self.observations, self.rewards, self.terminals, self.truncations, info) def render(self): self.c_envs.render() diff --git a/pufferlib/ocean/tactical/tactical.py b/pufferlib/ocean/tactical/tactical.py index 3ae3370df..2aa9bdbce 100644 --- a/pufferlib/ocean/tactical/tactical.py +++ b/pufferlib/ocean/tactical/tactical.py @@ -1,11 +1,12 @@ -import numpy as np + import gymnasium -import os -#from raylib import rl -#import heapq +import numpy as np +# from raylib import rl +# import heapq import pufferlib from pufferlib.ocean.tactical.c_tactical import CTactical + # from pufferlib.environments.ocean import render EMPTY = 0 @@ -14,21 +15,20 @@ WALL = 3 MAP_DICT = { - '_': EMPTY, - '.': GROUND, - '|': HOLE, - '#': WALL, + "_": EMPTY, + ".": GROUND, + "|": HOLE, + "#": WALL, } class Tactical: - def __init__(self, num_envs=200, render_mode='human'): + def __init__(self, num_envs=200, render_mode="human"): self.num_envs = num_envs self.render_mode = render_mode # env spec (TODO) - self.observation_space = gymnasium.spaces.Box( - low=0, high=2, shape=(10,), dtype=np.uint8) + self.observation_space = gymnasium.spaces.Box(low=0, high=2, shape=(10,), dtype=np.uint8) self.action_space = gymnasium.spaces.Discrete(4) self.single_observation_space = self.observation_space self.single_action_space = self.action_space @@ -37,12 +37,11 @@ def __init__(self, num_envs=200, render_mode='human'): self.emulated = None self.done = False self.buf = pufferlib.namespace( - observations = np.zeros( - (num_envs, 10), dtype=np.uint8), - rewards = np.zeros(num_envs, dtype=np.float32), - terminals = np.zeros(num_envs, dtype=bool), - truncations = np.zeros(num_envs, dtype=bool), - masks = np.ones(num_envs, dtype=bool), + observations=np.zeros((num_envs, 10), dtype=np.uint8), + rewards=np.zeros(num_envs, dtype=np.float32), + terminals=np.zeros(num_envs, dtype=bool), + truncations=np.zeros(num_envs, dtype=bool), + masks=np.ones(num_envs, dtype=bool), ) self.actions = 
np.zeros(num_envs, dtype=np.uint32) @@ -54,10 +53,10 @@ def __init__(self, num_envs=200, render_mode='human'): # map_path = 'pufferlib/environments/ocean/tactical/map_test.txt' # print(map_path) # self.load_map(map_path) - + def load_map(self, filename): - with open(filename, 'r') as f: - self.map_str = [line.strip() for line in f.read().strip().split('\n') if line[0] != ';'] + with open(filename, "r") as f: + self.map_str = [line.strip() for line in f.read().strip().split("\n") if line[0] != ";"] self.map_width = len(self.map_str[0]) self.map_height = len(self.map_str) self.map = np.zeros((self.map_height, self.map_width), dtype=np.uint8) @@ -68,10 +67,9 @@ def load_map(self, filename): def reset(self, seed=None): self.c_envs = [] for i in range(self.num_envs): - self.c_envs.append(CTactical( - self.buf.observations[i], - self.actions[i:i+1], - self.buf.rewards[i:i+1])) + self.c_envs.append( + CTactical(self.buf.observations[i], self.actions[i : i + 1], self.buf.rewards[i : i + 1]) + ) self.c_envs[i].reset() return self.buf.observations, {} @@ -80,11 +78,10 @@ def step(self, actions): self.actions[:] = actions for c_env in self.c_envs: c_env.step() - + info = {} - return (self.buf.observations, self.buf.rewards, - self.buf.terminals, self.buf.truncations, info) + return (self.buf.observations, self.buf.rewards, self.buf.terminals, self.buf.truncations, info) def render(self): return self.c_envs[0].render() @@ -95,7 +92,8 @@ def close(self): for c_env in self.c_envs: c_env.close() -''' + +""" def a_star_search(map, start, goal): frontier = [] heapq.heappush(frontier, (0, start)) @@ -477,14 +475,15 @@ def render(self, map): rl.EndDrawing() return render.cdata_to_numpy() -''' +""" -if __name__ == '__main__': +if __name__ == "__main__": PROFILE = False - env = Tactical(num_envs=1, render_mode='human') + env = Tactical(num_envs=1, render_mode="human") env.reset() import time + t0 = time.time() steps = 0 while not PROFILE or time.time() - t0 < 10: @@ -493,5 +492,4 @@ def render(self, map): if env.render() == 1: # exit code break steps += 1 - print('SPS:', 1 * steps / (time.time() - t0)) - + print("SPS:", 1 * steps / (time.time() - t0)) diff --git a/pufferlib/ocean/torch.py b/pufferlib/ocean/torch.py index ce99d7dcc..9698b81fb 100644 --- a/pufferlib/ocean/torch.py +++ b/pufferlib/ocean/torch.py @@ -1,23 +1,23 @@ -from torch import nn + import torch import torch.nn.functional as F +from torch import nn -from functools import partial import pufferlib.models -from pufferlib.models import Default as Policy -from pufferlib.models import Convolutional as Conv Recurrent = pufferlib.models.LSTMWrapper import numpy as np + class NMMO3LSTM(pufferlib.models.LSTMWrapper): def __init__(self, env, policy, input_size=256, hidden_size=256, num_layers=1): super().__init__(env, policy, input_size, hidden_size, num_layers) + class NMMO3(nn.Module): def __init__(self, env, hidden_size=256, output_size=256): super().__init__() - #self.dtype = pufferlib.pytorch.nativize_dtype(env.emulated) + # self.dtype = pufferlib.pytorch.nativize_dtype(env.emulated) self.num_actions = env.single_action_space.n self.factors = np.array([4, 4, 17, 5, 3, 5, 5, 5, 7, 4]) self.offsets = torch.tensor([0] + list(np.cumsum(self.factors)[:-1])).cuda().view(1, -1, 1, 1) @@ -42,8 +42,7 @@ def __init__(self, env, hidden_size=256, output_size=256): nn.ReLU(), ) - self.actor = pufferlib.pytorch.layer_init( - nn.Linear(output_size, self.num_actions), std=0.01) + self.actor = pufferlib.pytorch.layer_init(nn.Linear(output_size, self.num_actions), 
std=0.01) self.value_fn = pufferlib.pytorch.layer_init(nn.Linear(output_size, 1), std=1) def forward(self, x): @@ -53,8 +52,8 @@ def forward(self, x): def encode_observations(self, observations, unflatten=False): batch = observations.shape[0] - ob_map = observations[:, :11*15*10].view(batch, 11, 15, 10) - ob_player = observations[:, 11*15*10:-10] + ob_map = observations[:, : 11 * 15 * 10].view(batch, 11, 15, 10) + ob_player = observations[:, 11 * 15 * 10 : -10] ob_reward = observations[:, -10:] map_buf = torch.zeros(batch, self.multihot_dim, 11, 15, device=ob_map.device, dtype=torch.float32) @@ -73,24 +72,21 @@ def decode_actions(self, flat_hidden, lookup, concat=None): value = self.value_fn(flat_hidden) return action, value + class Snake(nn.Module): def __init__(self, env, cnn_channels=32, hidden_size=128, **kwargs): super().__init__() - self.network= nn.Sequential( - pufferlib.pytorch.layer_init( - nn.Conv2d(8, cnn_channels, 5, stride=3)), + self.network = nn.Sequential( + pufferlib.pytorch.layer_init(nn.Conv2d(8, cnn_channels, 5, stride=3)), nn.ReLU(), - pufferlib.pytorch.layer_init( - nn.Conv2d(cnn_channels, cnn_channels, 3, stride=1)), + pufferlib.pytorch.layer_init(nn.Conv2d(cnn_channels, cnn_channels, 3, stride=1)), nn.ReLU(), nn.Flatten(), pufferlib.pytorch.layer_init(nn.Linear(cnn_channels, hidden_size)), nn.ReLU(), ) - self.actor = pufferlib.pytorch.layer_init( - nn.Linear(hidden_size, env.single_action_space.n), std=0.01) - self.value_fn = pufferlib.pytorch.layer_init( - nn.Linear(hidden_size, 1), std=1) + self.actor = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, env.single_action_space.n), std=0.01) + self.value_fn = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, 1), std=1) def forward(self, observations): hidden, lookup = self.encode_observations(observations) @@ -106,32 +102,29 @@ def decode_actions(self, flat_hidden, lookup, concat=None): value = self.value_fn(flat_hidden) return action, value + class Grid(nn.Module): def __init__(self, env, cnn_channels=32, hidden_size=128, **kwargs): super().__init__() self.cnn = nn.Sequential( - pufferlib.pytorch.layer_init( - nn.Conv2d(7, cnn_channels, 5, stride=3)), + pufferlib.pytorch.layer_init(nn.Conv2d(7, cnn_channels, 5, stride=3)), nn.ReLU(), - pufferlib.pytorch.layer_init( - nn.Conv2d(cnn_channels, cnn_channels, 3, stride=1)), + pufferlib.pytorch.layer_init(nn.Conv2d(cnn_channels, cnn_channels, 3, stride=1)), nn.Flatten(), ) self.flat = pufferlib.pytorch.layer_init(nn.Linear(3, 32)) - self.proj = pufferlib.pytorch.layer_init(nn.Linear(32+cnn_channels, hidden_size)) + self.proj = pufferlib.pytorch.layer_init(nn.Linear(32 + cnn_channels, hidden_size)) self.is_continuous = isinstance(env.single_action_space, pufferlib.spaces.Box) if self.is_continuous: self.decoder_mean = pufferlib.pytorch.layer_init( - nn.Linear(hidden_size, env.single_action_space.shape[0]), std=0.01) - self.decoder_logstd = nn.Parameter(torch.zeros( - 1, env.single_action_space.shape[0])) + nn.Linear(hidden_size, env.single_action_space.shape[0]), std=0.01 + ) + self.decoder_logstd = nn.Parameter(torch.zeros(1, env.single_action_space.shape[0])) else: - self.actor = pufferlib.pytorch.layer_init( - nn.Linear(hidden_size, 6), std=0.01) + self.actor = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, 6), std=0.01) - self.value_fn = pufferlib.pytorch.layer_init( - nn.Linear(hidden_size, 1), std=1) + self.value_fn = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, 1), std=1) def forward(self, observations): hidden, lookup = 
self.encode_observations(observations) @@ -163,36 +156,33 @@ def decode_actions(self, flat_hidden, lookup, concat=None): action = self.actor(flat_hidden).split(3, dim=1) return action, value + class Go(nn.Module): def __init__(self, env, cnn_channels=64, hidden_size=128, **kwargs): super().__init__() - # 3 categories 2 boards. + # 3 categories 2 boards. # categories = player, opponent, empty # boards = current, previous self.cnn = nn.Sequential( - pufferlib.pytorch.layer_init( - nn.Conv2d(2, cnn_channels, 3, stride=1)), + pufferlib.pytorch.layer_init(nn.Conv2d(2, cnn_channels, 3, stride=1)), nn.ReLU(), - pufferlib.pytorch.layer_init( - nn.Conv2d(cnn_channels, cnn_channels, 3, stride = 1)), + pufferlib.pytorch.layer_init(nn.Conv2d(cnn_channels, cnn_channels, 3, stride=1)), nn.Flatten(), ) obs_size = env.single_observation_space.shape[0] - self.grid_size = int(np.sqrt((obs_size-2)/2)) + self.grid_size = int(np.sqrt((obs_size - 2) / 2)) output_size = self.grid_size - 4 cnn_flat_size = cnn_channels * output_size * output_size - - self.flat = pufferlib.pytorch.layer_init(nn.Linear(2,32)) - + + self.flat = pufferlib.pytorch.layer_init(nn.Linear(2, 32)) + self.proj = pufferlib.pytorch.layer_init(nn.Linear(cnn_flat_size + 32, hidden_size)) - self.actor = pufferlib.pytorch.layer_init( - nn.Linear(hidden_size, env.single_action_space.n), std=0.01) + self.actor = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, env.single_action_space.n), std=0.01) + + self.value_fn = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, 1), std=1) - self.value_fn = pufferlib.pytorch.layer_init( - nn.Linear(hidden_size, 1), std=1) - def forward(self, observations): hidden, lookup = self.encode_observations(observations) actions, value = self.decode_actions(hidden, lookup) @@ -200,16 +190,16 @@ def forward(self, observations): def encode_observations(self, observations): grid_size = int(np.sqrt((observations.shape[1] - 2) / 2)) - full_board = grid_size * grid_size - black_board = observations[:, :full_board].view(-1,1, grid_size,grid_size).float() - white_board = observations[:, full_board:-2].view(-1,1, grid_size, grid_size).float() - board_features = torch.cat([black_board, white_board],dim=1) + full_board = grid_size * grid_size + black_board = observations[:, :full_board].view(-1, 1, grid_size, grid_size).float() + white_board = observations[:, full_board:-2].view(-1, 1, grid_size, grid_size).float() + board_features = torch.cat([black_board, white_board], dim=1) flat_feature1 = observations[:, -2].unsqueeze(1).float() flat_feature2 = observations[:, -1].unsqueeze(1).float() # Pass board through cnn cnn_features = self.cnn(board_features) # Pass extra feature - flat_features = torch.cat([flat_feature1, flat_feature2],dim=1) + flat_features = torch.cat([flat_feature1, flat_feature2], dim=1) flat_features = self.flat(flat_features) # pass all features features = torch.cat([cnn_features, flat_features], dim=1) @@ -221,34 +211,31 @@ def decode_actions(self, flat_hidden, lookup, concat=None): value = self.value_fn(flat_hidden) action = self.actor(flat_hidden) return action, value - + + class MOBA(nn.Module): def __init__(self, env, cnn_channels=128, hidden_size=128, **kwargs): super().__init__() self.cnn = nn.Sequential( - pufferlib.pytorch.layer_init( - nn.Conv2d(16 + 3, cnn_channels, 5, stride=3)), + pufferlib.pytorch.layer_init(nn.Conv2d(16 + 3, cnn_channels, 5, stride=3)), nn.ReLU(), - pufferlib.pytorch.layer_init( - nn.Conv2d(cnn_channels, cnn_channels, 3, stride=1)), + 
pufferlib.pytorch.layer_init(nn.Conv2d(cnn_channels, cnn_channels, 3, stride=1)), nn.Flatten(), ) self.flat = pufferlib.pytorch.layer_init(nn.Linear(26, 128)) - self.proj = pufferlib.pytorch.layer_init(nn.Linear(128+cnn_channels, hidden_size)) + self.proj = pufferlib.pytorch.layer_init(nn.Linear(128 + cnn_channels, hidden_size)) self.is_continuous = isinstance(env.single_action_space, pufferlib.spaces.Box) if self.is_continuous: self.decoder_mean = pufferlib.pytorch.layer_init( - nn.Linear(hidden_size, env.single_action_space.shape[0]), std=0.01) - self.decoder_logstd = nn.Parameter(torch.zeros( - 1, env.single_action_space.shape[0])) + nn.Linear(hidden_size, env.single_action_space.shape[0]), std=0.01 + ) + self.decoder_logstd = nn.Parameter(torch.zeros(1, env.single_action_space.shape[0])) else: self.atn_dim = env.single_action_space.nvec.tolist() - self.actor = pufferlib.pytorch.layer_init( - nn.Linear(hidden_size, sum(self.atn_dim)), std=0.01) + self.actor = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, sum(self.atn_dim)), std=0.01) - self.value_fn = pufferlib.pytorch.layer_init( - nn.Linear(hidden_size, 1), std=1) + self.value_fn = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, 1), std=1) def forward(self, observations): hidden, lookup = self.encode_observations(observations) @@ -258,28 +245,28 @@ def forward(self, observations): def encode_observations(self, observations): cnn_features = observations[:, :-26].view(-1, 11, 11, 4).long() if cnn_features[:, :, :, 0].max() > 15: - print('Invalid map value:', cnn_features[:, :, :, 0].max()) + print("Invalid map value:", cnn_features[:, :, :, 0].max()) breakpoint() exit(1) map_features = F.one_hot(cnn_features[:, :, :, 0], 16).permute(0, 3, 1, 2).float() extra_map_features = (cnn_features[:, :, :, -3:].float() / 255).permute(0, 3, 1, 2) cnn_features = torch.cat([map_features, extra_map_features], dim=1) - #print('observations 2d: ', map_features[0].cpu().numpy().tolist()) + # print('observations 2d: ', map_features[0].cpu().numpy().tolist()) cnn_features = self.cnn(cnn_features) - #print('cnn features: ', cnn_features[0].detach().cpu().numpy().tolist()) + # print('cnn features: ', cnn_features[0].detach().cpu().numpy().tolist()) flat_features = observations[:, -26:].float() / 255.0 - #print('observations 1d: ', flat_features[0, 0]) + # print('observations 1d: ', flat_features[0, 0]) flat_features = self.flat(flat_features) - #print('flat features: ', flat_features[0].detach().cpu().numpy().tolist()) + # print('flat features: ', flat_features[0].detach().cpu().numpy().tolist()) features = torch.cat([cnn_features, flat_features], dim=1) features = F.relu(self.proj(F.relu(features))) - #print('features: ', features[0].detach().cpu().numpy().tolist()) + # print('features: ', features[0].detach().cpu().numpy().tolist()) return features, None def decode_actions(self, flat_hidden, lookup, concat=None): - #print('lstm: ', flat_hidden[0].detach().cpu().numpy().tolist()) + # print('lstm: ', flat_hidden[0].detach().cpu().numpy().tolist()) value = self.value_fn(flat_hidden) if self.is_continuous: mean = self.decoder_mean(flat_hidden) @@ -292,29 +279,26 @@ def decode_actions(self, flat_hidden, lookup, concat=None): action = self.actor(flat_hidden) action = torch.split(action, self.atn_dim, dim=1) - #argmax_samples = [torch.argmax(a, dim=1).detach().cpu().numpy().tolist() for a in action] - #print('argmax samples: ', argmax_samples) + # argmax_samples = [torch.argmax(a, dim=1).detach().cpu().numpy().tolist() for a in action] + # print('argmax 
samples: ', argmax_samples) return action, value + class TrashPickup(nn.Module): def __init__(self, env, cnn_channels=32, hidden_size=128, **kwargs): super().__init__() - self.network= nn.Sequential( - pufferlib.pytorch.layer_init( - nn.Conv2d(5, cnn_channels, 5, stride=3)), + self.network = nn.Sequential( + pufferlib.pytorch.layer_init(nn.Conv2d(5, cnn_channels, 5, stride=3)), nn.ReLU(), - pufferlib.pytorch.layer_init( - nn.Conv2d(cnn_channels, cnn_channels, 3, stride=1)), + pufferlib.pytorch.layer_init(nn.Conv2d(cnn_channels, cnn_channels, 3, stride=1)), nn.ReLU(), nn.Flatten(), pufferlib.pytorch.layer_init(nn.Linear(cnn_channels, hidden_size)), nn.ReLU(), ) - self.actor = pufferlib.pytorch.layer_init( - nn.Linear(hidden_size, env.single_action_space.n), std=0.01) - self.value_fn = pufferlib.pytorch.layer_init( - nn.Linear(hidden_size, 1), std=1) + self.actor = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, env.single_action_space.n), std=0.01) + self.value_fn = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, 1), std=1) def forward(self, observations): hidden, lookup = self.encode_observations(observations) diff --git a/pufferlib/ocean/trash_pickup/trash_pickup.py b/pufferlib/ocean/trash_pickup/trash_pickup.py index 2b15b6a32..2df2354d6 100644 --- a/pufferlib/ocean/trash_pickup/trash_pickup.py +++ b/pufferlib/ocean/trash_pickup/trash_pickup.py @@ -1,13 +1,23 @@ import numpy as np -from gymnasium import spaces - import pufferlib +from gymnasium import spaces from pufferlib.ocean.trash_pickup.cy_trash_pickup import CyTrashPickup class TrashPickupEnv(pufferlib.PufferEnv): - def __init__(self, num_envs=1, render_mode=None, report_interval=1, buf=None, - grid_size=10, num_agents=3, num_trash=15, num_bins=2, max_steps=300, agent_sight_range=5): + def __init__( + self, + num_envs=1, + render_mode=None, + report_interval=1, + buf=None, + grid_size=10, + num_agents=3, + num_trash=15, + num_bins=2, + max_steps=300, + agent_sight_range=5, + ): # Env Setup self.render_mode = render_mode self.report_interval = report_interval @@ -50,16 +60,29 @@ def __init__(self, num_envs=1, render_mode=None, report_interval=1, buf=None, # num_obs_bin = num_bins * 2 # [x pos, y pos] for each bin # num_obs_agent = num_agents * 3 # [carrying trash, x pos, y pos] for each agent # self.num_obs = num_obs_trash + num_obs_bin + num_obs_agent; - + # 2D Local crop obs space - self.num_obs = ((((agent_sight_range * 2 + 1) * (agent_sight_range * 2 + 1)) * 5)); # one-hot encoding for all cell types in local crop around agent (minus the cell the agent is currently in) + self.num_obs = ( + ((agent_sight_range * 2 + 1) * (agent_sight_range * 2 + 1)) * 5 + ) # one-hot encoding for all cell types in local crop around agent (minus the cell the agent is currently in) - self.single_observation_space = spaces.Box(low=0, high=1, - shape=(self.num_obs,), dtype=np.int8) + self.single_observation_space = spaces.Box(low=0, high=1, shape=(self.num_obs,), dtype=np.int8) self.single_action_space = spaces.Discrete(4) super().__init__(buf=buf) - self.c_envs = CyTrashPickup(self.observations, self.actions, self.rewards, self.terminals, num_envs, num_agents, grid_size, num_trash, num_bins, max_steps, agent_sight_range) + self.c_envs = CyTrashPickup( + self.observations, + self.actions, + self.rewards, + self.terminals, + num_envs, + num_agents, + grid_size, + num_trash, + num_bins, + max_steps, + agent_sight_range, + ) def reset(self, seed=None): self.c_envs.reset() @@ -75,35 +98,38 @@ def step(self, actions): if self.tick % 
self.report_interval == 0: log = self.c_envs.log() # print(f"tha log: {log}") - if log['episode_length'] > 0: + if log["episode_length"] > 0: info.append(log) - return (self.observations, self.rewards, - self.terminals, self.truncations, info) + return (self.observations, self.rewards, self.terminals, self.truncations, info) def render(self): self.c_envs.render() - + def close(self): - self.c_envs.close() + self.c_envs.close() + def test_performance(timeout=10, atn_cache=1024): - env = TrashPickupEnv(num_envs=1024, grid_size=10, num_agents=4, - num_trash=20, num_bins=1, max_steps=150, agent_sight_range=5) - + env = TrashPickupEnv( + num_envs=1024, grid_size=10, num_agents=4, num_trash=20, num_bins=1, max_steps=150, agent_sight_range=5 + ) + env.reset() tick = 0 actions = np.random.randint(0, 4, (atn_cache, env.num_agents)) import time + start = time.time() while time.time() - start < timeout: atn = actions[tick % atn_cache] env.step(atn) tick += 1 - print(f'SPS: %f', env.num_agents * tick / (time.time() - start)) + print(f"SPS: {env.num_agents * tick / (time.time() - start):.2f}") + -if __name__ == '__main__': +if __name__ == "__main__": test_performance() diff --git a/pufferlib/ocean/tripletriad/tripletriad.py b/pufferlib/ocean/tripletriad/tripletriad.py index 67cf97829..dd55b5c48 100644 --- a/pufferlib/ocean/tripletriad/tripletriad.py +++ b/pufferlib/ocean/tripletriad/tripletriad.py @@ -1,23 +1,39 @@ -import numpy as np import gymnasium - +import numpy as np import pufferlib from pufferlib.ocean.tripletriad.cy_tripletriad import CyTripleTriad + class TripleTriad(pufferlib.PufferEnv): - def __init__(self, num_envs=1, render_mode=None, report_interval=1, - width=990, height=690, piece_width=192, piece_height=224, buf=None): - self.single_observation_space = gymnasium.spaces.Box(low=0, high=1, - shape=(114,), dtype=np.float32) + def __init__( + self, + num_envs=1, + render_mode=None, + report_interval=1, + width=990, + height=690, + piece_width=192, + piece_height=224, + buf=None, + ): + self.single_observation_space = gymnasium.spaces.Box(low=0, high=1, shape=(114,), dtype=np.float32) self.single_action_space = gymnasium.spaces.Discrete(15) self.report_interval = report_interval self.render_mode = render_mode self.num_agents = num_envs super().__init__(buf=buf) - self.c_envs = CyTripleTriad(self.observations, self.actions, - self.rewards, self.terminals, num_envs, width, height, - piece_width, piece_height) + self.c_envs = CyTripleTriad( + self.observations, + self.actions, + self.rewards, + self.terminals, + num_envs, + width, + height, + piece_width, + piece_height, + ) def reset(self, seed=None): self.c_envs.reset() @@ -32,11 +48,10 @@ def step(self, actions): info = [] if self.tick % self.report_interval == 0: log = self.c_envs.log() - if log['episode_length'] > 0: + if log["episode_length"] > 0: info.append(log) - return (self.observations, self.rewards, - self.terminals, self.truncations, info) + return (self.observations, self.rewards, self.terminals, self.truncations, info) def render(self): self.c_envs.render() @@ -44,6 +59,7 @@ def render(self): def close(self): self.c_envs.close() + def test_performance(timeout=10, atn_cache=1024): env = TripleTriad(num_envs=1000) env.reset() @@ -52,13 +68,15 @@ def test_performance(timeout=10, atn_cache=1024): actions = np.random.randint(0, 2, (atn_cache, env.num_envs)) import time + start = time.time() while time.time() - start < timeout: atn = actions[tick % atn_cache] env.step(atn) tick += 1 - print(f'SPS: %f', env.num_envs * tick /
(time.time() - start)) + print(f"SPS: {env.num_envs * tick / (time.time() - start):.2f}") + -if __name__ == '__main__': +if __name__ == "__main__": test_performance() diff --git a/pufferlib/policy_ranker.py b/pufferlib/policy_ranker.py index 8282f9fc9..db7e1645b 100644 --- a/pufferlib/policy_ranker.py +++ b/pufferlib/policy_ranker.py @@ -1,18 +1,18 @@ -from pdb import set_trace as T -import numpy as np - import sqlite3 +import numpy as np + ANCHOR_ELO = 1000.0 def win_prob(elo1, elo2): - '''Calculate win probability such that a difference of - 50/100/150 elo corresponds to win probabilitit 68/95/99.7%''' + """Calculate win probability such that a difference of + 50/100/150 elo corresponds to win probability 68/95/99.7%""" return 1 / (1 + 10 ** ((elo2 - elo1) / 400)) + def update_elos(elos: np.ndarray, scores: np.ndarray, k: float = 4.0): - '''Update elos based on the result of a game + """Update elos based on the result of a game The parameter k controls the magnitude of the update. A higher k means that the elo will change more after a game. @@ -23,13 +23,13 @@ def update_elos(elos: np.ndarray, scores: np.ndarray, k: float = 4.0): The default is tuned for normally distributed player skill You should lower it if you have very similar players. Raise it if you are evaluating a diverse skill pool. - ''' + """ num_players = len(elos) assert num_players == len(scores) elo_update = [[] for _ in range(num_players)] for i in range(num_players): - for j in range(i+1, num_players): + for j in range(i + 1, num_players): delta = scores[i] - scores[j] # Convert to elo scoring format @@ -49,7 +49,8 @@ def update_elos(elos: np.ndarray, scores: np.ndarray, k: float = 4.0): elo_update[j].append(k * (score_j - expected_j)) elo_update = [np.mean(e) for e in elo_update] - return [elo + update for elo, update in zip(elos, elo_update)] + return [elo + update for elo, update in zip(elos, elo_update, strict=False)] + class Ranker: def __init__(self, db_path): @@ -64,13 +65,10 @@ def __init__(self, db_path): def __repr__(self): if len(self.ratings) == 0: - return '' + return "" sorted_dict = sorted(self.ratings.items(), key=lambda x: x[1], reverse=True) - return '\n'.join([ - f' - Policy: {name}, Elo: {elo:.3f}' - for name, elo in sorted_dict - ]) + return "\n".join([f" - Policy: {name}, Elo: {elo:.3f}" for name, elo in sorted_dict]) @property def ratings(self): @@ -96,9 +94,12 @@ def update(self, scores: dict): flat_elos.append(ANCHOR_ELO) flat_elos = update_elos(flat_elos, flat_scores) - elos = zip(scores.keys(), flat_elos) + elos = zip(scores.keys(), flat_elos, strict=False) with self.conn: - self.conn.executemany(""" + self.conn.executemany( + """ INSERT OR REPLACE INTO ratings (policy, elo) VALUES (?, ?); - """, elos) + """, + elos, + ) diff --git a/pufferlib/policy_store.py b/pufferlib/policy_store.py index 7bbd96ad1..16f59f8f0 100644 --- a/pufferlib/policy_store.py +++ b/pufferlib/policy_store.py @@ -1,5 +1,5 @@ -from pdb import set_trace as T import os + import torch @@ -7,10 +7,11 @@ def get_policy_names(path: str) -> list: # Assuming that all pt files other than trainer_state.pt in the path are policy files names = [] for file in os.listdir(path): - if file.endswith(".pt") and file != 'trainer_state.pt': + if file.endswith(".pt") and file != "trainer_state.pt": names.append(file[:-3]) return sorted(names) + class PolicyStore: def __init__(self, path: str): self.path = path @@ -19,8 +20,8 @@ def policy_names(self) -> list: return get_policy_names(self.path) def get_policy(self, name: str) ->
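A minimal usage sketch of the Elo helpers added in pufferlib/policy_ranker.py above. The import path, the k default, and the win_prob formula are taken from this diff; the three scores are made-up episode returns.

```python
# Sketch: rating three policies with the helpers from pufferlib/policy_ranker.py.
import numpy as np
from pufferlib.policy_ranker import update_elos, win_prob

elos = np.array([1000.0, 1000.0, 1000.0])  # every policy starts at ANCHOR_ELO
scores = np.array([3.0, 1.0, 2.0])         # per-policy returns from one match

# Equal ratings give a 50% expected result.
print(win_prob(elos[0], elos[1]))  # 0.5

# Each pairwise score difference becomes a 1 / 0.5 / 0 result and is compared
# to the expected result; k=4.0 scales how far ratings move after one game.
new_elos = update_elos(elos, scores, k=4.0)
print(new_elos)  # highest scorer gains rating, lowest loses it; updates are zero-sum
```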
torch.nn.Module: - path = os.path.join(self.path, name + '.pt') + path = os.path.join(self.path, name + ".pt") try: return torch.load(path) except: - return torch.load(path, map_location=torch.device('cpu')) + return torch.load(path, map_location=torch.device("cpu")) diff --git a/pufferlib/postprocess.py b/pufferlib/postprocess.py index cb311cc14..eae7de2c5 100644 --- a/pufferlib/postprocess.py +++ b/pufferlib/postprocess.py @@ -1,12 +1,13 @@ -from pdb import set_trace as T -import numpy as np import gymnasium +import numpy as np import pufferlib.utils + class ResizeObservation(gymnasium.Wrapper): - '''Fixed downscaling wrapper. Do NOT use gym.wrappers.ResizeObservation - It uses a laughably slow OpenCV resize. -50% on Atari just from that.''' + """Fixed downscaling wrapper. Do NOT use gym.wrappers.ResizeObservation + It uses a laughably slow OpenCV resize. -50% on Atari just from that.""" + def __init__(self, env, downscale=2): super().__init__(env) self.downscale = downscale @@ -14,19 +15,20 @@ def __init__(self, env, downscale=2): assert y_size % downscale == 0 and x_size % downscale == 0 y_size = env.observation_space.shape[0] // downscale x_size = env.observation_space.shape[1] // downscale - self.observation_space = gymnasium.spaces.Box( - low=0, high=255, shape=(y_size, x_size), dtype=np.uint8) + self.observation_space = gymnasium.spaces.Box(low=0, high=255, shape=(y_size, x_size), dtype=np.uint8) def reset(self, seed=None, options=None): obs, info = self.env.reset(seed=seed, options=options) - return obs[::self.downscale, ::self.downscale], info + return obs[:: self.downscale, :: self.downscale], info def step(self, action): obs, reward, terminal, truncated, info = self.env.step(action) - return obs[::self.downscale, ::self.downscale], reward, terminal, truncated, info + return obs[:: self.downscale, :: self.downscale], reward, terminal, truncated, info + class ClipAction(gymnasium.Wrapper): - '''Wrapper for Gymnasium environments that clips actions''' + """Wrapper for Gymnasium environments that clips actions""" + def __init__(self, env): self.env = env assert isinstance(env.action_space, gymnasium.spaces.Box) @@ -44,8 +46,9 @@ def step(self, action): class EpisodeStats(gymnasium.Wrapper): - '''Wrapper for Gymnasium environments that stores - episodic returns and lengths in infos''' + """Wrapper for Gymnasium environments that stores + episodic returns and lengths in infos""" + def __init__(self, env): self.env = env self.observation_space = env.observation_space @@ -55,7 +58,7 @@ def __init__(self, env): def reset(self, seed=None, options=None): self.info = dict(episode_return=[], episode_length=0) # TODO: options - return self.env.reset(seed=seed)#, options=options) + return self.env.reset(seed=seed) # , options=options) def step(self, action): observation, reward, terminated, truncated, info = super().step(action) @@ -66,8 +69,8 @@ def step(self, action): self.info[k].append(v) - self.info['episode_return'].append(reward) - self.info['episode_length'] += 1 + self.info["episode_return"].append(reward) + self.info["episode_length"] += 1 info = {} if terminated or truncated: @@ -83,7 +86,7 @@ def step(self, action): continue try: - x = int(v) # probably a value + x = int(v) # probably a value info[k] = v continue except TypeError: @@ -91,15 +94,17 @@ def step(self, action): return observation, reward, terminated, truncated, info + class PettingZooWrapper: - '''PettingZoo does not provide a ParallelEnv wrapper. 
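A rough usage sketch of the EpisodeStats wrapper from pufferlib/postprocess.py above. CartPole-v1 and the constant action are stand-ins, and the exact keys surfaced in `info` at episode end live in unchanged lines of this file, so the sketch only prints the dict.

```python
# Sketch: wrapping a Gymnasium env with EpisodeStats from pufferlib/postprocess.py.
import gymnasium
from pufferlib.postprocess import EpisodeStats

env = EpisodeStats(gymnasium.make("CartPole-v1"))  # stand-in environment
obs, info = env.reset(seed=0)

terminated = truncated = False
while not (terminated or truncated):
    # Constant action just to reach the end of an episode quickly.
    obs, reward, terminated, truncated, info = env.step(0)

# Per-step rewards and the step counter accumulate inside the wrapper and are
# reported through `info` once the episode terminates or truncates.
print(info)
```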
This code is adapted from - their AEC wrapper, to prevent unneeded conversions to/from AEC''' + """PettingZoo does not provide a ParallelEnv wrapper. This code is adapted from + their AEC wrapper, to prevent unneeded conversions to/from AEC""" + def __init__(self, env): self.env = env def __getattr__(self, name): - '''Returns an attribute with ``name``, unless ``name`` starts with an underscore.''' - if name.startswith('_') and name != '_cumulative_rewards': + """Returns an attribute with ``name``, unless ``name`` starts with an underscore.""" + if name.startswith("_") and name != "_cumulative_rewards": raise AttributeError(f'accessing private attribute "{name}" is prohibited') return getattr(self.env, name) @@ -135,11 +140,13 @@ def action_space(self, agent): return self.env.action_space(agent) def __str__(self) -> str: - '''Returns a name which looks like: "max_observation".''' - return f'{type(self).__name__}<{str(self.env)}>' + """Returns a name which looks like: "max_observation".""" + return f"{type(self).__name__}<{str(self.env)}>" + class MeanOverAgents(PettingZooWrapper): - '''Averages over agent infos''' + """Averages over agent infos""" + def _mean(self, infos): list_infos = {} for agent, info in infos.items(): @@ -168,15 +175,14 @@ def step(self, actions): infos = self._mean(infos) return observations, rewards, terminations, truncations, infos + class MultiagentEpisodeStats(PettingZooWrapper): - '''Wrapper for PettingZoo environments that stores - episodic returns and lengths in infos''' + """Wrapper for PettingZoo environments that stores + episodic returns and lengths in infos""" + def reset(self, seed=None, options=None): observations, infos = super().reset(seed=seed, options=options) - self.infos = { - agent: dict(episode_return=[], episode_length=0) - for agent in self.possible_agents - } + self.infos = {agent: dict(episode_return=[], episode_length=0) for agent in self.possible_agents} return observations, infos def step(self, actions): @@ -192,8 +198,8 @@ def step(self, actions): agent_info[k].append(v) # Saved to self. 
TODO: Clean up - agent_info['episode_return'].append(rewards[agent]) - agent_info['episode_length'] += 1 + agent_info["episode_return"].append(rewards[agent]) + agent_info["episode_length"] += 1 agent_info = {} all_infos[agent] = agent_info @@ -210,7 +216,7 @@ def step(self, actions): continue try: - x = int(v) # probably a value + x = int(v) # probably a value agent_info[k] = v continue except TypeError: diff --git a/pufferlib/pytorch.py b/pufferlib/pytorch.py index 54bcc24df..c58928436 100644 --- a/pufferlib/pytorch.py +++ b/pufferlib/pytorch.py @@ -1,7 +1,6 @@ +import contextlib import sys -from pdb import set_trace as T from typing import Dict, List, Tuple, Union -import contextlib import numpy as np import torch @@ -9,7 +8,6 @@ import pufferlib - numpy_to_torch_dtype_dict = { np.dtype("float64"): torch.float64, np.dtype("float32"): torch.float32, @@ -58,12 +56,12 @@ def nativize_dtype(emulated: pufferlib.namespace) -> NativeDType: else: return subviews + def round_to(x, base): - return int(base * np.ceil(x/base)) + return int(base * np.ceil(x / base)) -def _nativize_dtype(sample_dtype: np.dtype, - structured_dtype: np.dtype, - offset: int = 0) -> NativeDType: + +def _nativize_dtype(sample_dtype: np.dtype, structured_dtype: np.dtype, offset: int = 0) -> NativeDType: if structured_dtype.fields is None: if structured_dtype.subdtype is not None: dtype, shape = structured_dtype.subdtype @@ -84,8 +82,7 @@ def _nativize_dtype(sample_dtype: np.dtype, start_offset = offset all_delta = 0 for name, (dtype, _) in structured_dtype.fields.items(): - views, dtype, shape, offset, delta = _nativize_dtype( - sample_dtype, dtype, offset) + views, dtype, shape, offset, delta = _nativize_dtype(sample_dtype, dtype, offset) if views is not None: subviews[name] = views @@ -122,9 +119,7 @@ def compilable_cast(u8, dtype): return u8.view(dtype) # breaking cast -def _nativize_tensor( - observation: torch.Tensor, native_dtype: NativeDType -) -> torch.Tensor | dict[str, torch.Tensor]: +def _nativize_tensor(observation: torch.Tensor, native_dtype: NativeDType) -> torch.Tensor | dict[str, torch.Tensor]: if isinstance(native_dtype, tuple): dtype, shape, offset, delta = native_dtype torch._check_is_size(offset) @@ -202,27 +197,26 @@ def __init__(self, input_size=128, hidden_size=128, num_layers=1): super().__init__(input_size, hidden_size, num_layers) layer_init(self) + def cycle_selector(sample_idx, num_policies): return sample_idx % num_policies + class PolicyPool(torch.nn.Module): - def __init__(self, vecenv, policies, learner_mask, device, - policy_selector=cycle_selector): - '''Experimental utility for running multiple different policies''' + def __init__(self, vecenv, policies, learner_mask, device, policy_selector=cycle_selector): + """Experimental utility for running multiple different policies""" super().__init__() assert len(learner_mask) == len(policies) - self.policy_map = torch.tensor([policy_selector(i, len(policies)) - for i in range(vecenv.num_agents)]) + self.policy_map = torch.tensor([policy_selector(i, len(policies)) for i in range(vecenv.num_agents)]) self.learner_mask = learner_mask self.policies = torch.nn.ModuleList(policies) self.vecenv = vecenv # Assumes that all policies have the same LSTM or no LSTM - self.lstm = policies[0].lstm if hasattr(policies[0], 'lstm') else None + self.lstm = policies[0].lstm if hasattr(policies[0], "lstm") else None # Allocate buffers - self.actions = torch.zeros(vecenv.num_agents, - *vecenv.single_action_space.shape, dtype=int).to(device) + self.actions = 
torch.zeros(vecenv.num_agents, *vecenv.single_action_space.shape, dtype=int).to(device) self.logprobs = torch.zeros(vecenv.num_agents).to(device) self.entropy = torch.zeros(vecenv.num_agents).to(device) self.values = torch.zeros(vecenv.num_agents).to(device) diff --git a/pufferlib/spaces.py b/pufferlib/spaces.py index b5bab9e6c..a19826718 100644 --- a/pufferlib/spaces.py +++ b/pufferlib/spaces.py @@ -1,6 +1,6 @@ -import numpy as np -import gym import gymnasium +import gymnasium as gym +import numpy as np Box = (gym.spaces.Box, gymnasium.spaces.Box) Dict = (gym.spaces.Dict, gymnasium.spaces.Dict) @@ -9,17 +9,20 @@ MultiDiscrete = (gym.spaces.MultiDiscrete, gymnasium.spaces.MultiDiscrete) Tuple = (gym.spaces.Tuple, gymnasium.spaces.Tuple) + def joint_space(space, n): if isinstance(space, Discrete): return gymnasium.spaces.MultiDiscrete([space.n] * n) elif isinstance(space, MultiDiscrete): - return gymnasium.spaces.Box(low=0, - high=np.repeat(space.nvec[None] - 1, n, axis=0), - shape=(n, len(space)), dtype=space.dtype) + return gymnasium.spaces.Box( + low=0, high=np.repeat(space.nvec[None] - 1, n, axis=0), shape=(n, len(space)), dtype=space.dtype + ) elif isinstance(space, Box): return gymnasium.spaces.Box( low=np.repeat(space.low[None], n, axis=0), high=np.repeat(space.high[None], n, axis=0), - shape=(n, *space.shape), dtype=space.dtype) + shape=(n, *space.shape), + dtype=space.dtype, + ) else: - raise ValueError(f'Unsupported space: {space}') + raise ValueError(f"Unsupported space: {space}") diff --git a/pufferlib/sweep.py b/pufferlib/sweep.py new file mode 100644 index 000000000..874dbd3f4 --- /dev/null +++ b/pufferlib/sweep.py @@ -0,0 +1,753 @@ +import math +import random +import warnings +from copy import deepcopy + +import numpy as np +import pyro +import torch +from pyro.contrib import gp as gp + +import pufferlib + + +class Space: + def __init__(self, min, max, scale, mean, is_integer=False): + self.min = min + self.max = max + self.scale = scale + self.mean = mean # TODO: awkward to have just this normalized + self.norm_min = self.normalize(min) + self.norm_max = self.normalize(max) + self.norm_mean = self.normalize(mean) + self.is_integer = is_integer + + +class Linear(Space): + def __init__(self, min, max, scale, mean, is_integer=False): + if scale == "auto": + scale = 0.5 + + super().__init__(min, max, scale, mean, is_integer) + + def normalize(self, value): + # assert isinstance(value, (int, float)) + zero_one = (value - self.min) / (self.max - self.min) + return 2 * zero_one - 1 + + def unnormalize(self, value): + zero_one = (value + 1) / 2 + value = zero_one * (self.max - self.min) + self.min + if self.is_integer: + value = round(value) + return value + + +class Pow2(Space): + def __init__(self, min, max, scale, mean, is_integer=False): + if scale == "auto": + scale = 0.5 + # scale = 2 / (np.log2(max) - np.log2(min)) + + super().__init__(min, max, scale, mean, is_integer) + + def normalize(self, value): + # assert isinstance(value, (int, float)) + # assert value != 0.0 + zero_one = (math.log(value, 2) - math.log(self.min, 2)) / (math.log(self.max, 2) - math.log(self.min, 2)) + return 2 * zero_one - 1 + + def unnormalize(self, value): + zero_one = (value + 1) / 2 + log_spaced = zero_one * (math.log(self.max, 2) - math.log(self.min, 2)) + math.log(self.min, 2) + rounded = round(log_spaced) + return 2**rounded + + +class Log(Space): + base: int = 10 + + def __init__(self, min, max, scale, mean, is_integer=False): + if scale == "time": + # TODO: Set scaling param intuitively based 
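A tiny sketch of joint_space from pufferlib/spaces.py above; the Discrete branch shown in this hunk maps a single-agent space to a MultiDiscrete over n agents.

```python
# Sketch: batching a single-agent space across 8 agents with joint_space.
import gymnasium
from pufferlib.spaces import joint_space

single = gymnasium.spaces.Discrete(4)
print(joint_space(single, 8))  # MultiDiscrete([4 4 4 4 4 4 4 4])
```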
on number of jumps from min to max + scale = 1 / (np.log2(max) - np.log2(min)) + elif scale == "auto": + scale = 0.5 + + super().__init__(min, max, scale, mean, is_integer) + + def normalize(self, value): + # assert isinstance(value, (int, float)) + # assert value != 0.0 + zero_one = (math.log(value, self.base) - math.log(self.min, self.base)) / ( + math.log(self.max, self.base) - math.log(self.min, self.base) + ) + return 2 * zero_one - 1 + + def unnormalize(self, value): + zero_one = (value + 1) / 2 + log_spaced = zero_one * (math.log(self.max, self.base) - math.log(self.min, self.base)) + math.log( + self.min, self.base + ) + value = self.base**log_spaced + if self.is_integer: + value = round(value) + return value + + +class Logit(Space): + base: int = 10 + + def __init__(self, min, max, scale, mean, is_integer=False): + if scale == "auto": + scale = 0.5 + + super().__init__(min, max, scale, mean, is_integer) + + def normalize(self, value): + # assert isinstance(value, (int, float)) + # assert value != 0.0 + # assert value != 1.0 + zero_one = (math.log(1 - value, self.base) - math.log(1 - self.min, self.base)) / ( + math.log(1 - self.max, self.base) - math.log(1 - self.min, self.base) + ) + return 2 * zero_one - 1 + + def unnormalize(self, value): + zero_one = (value + 1) / 2 + log_spaced = zero_one * (math.log(1 - self.max, self.base) - math.log(1 - self.min, self.base)) + math.log( + 1 - self.min, self.base + ) + return 1 - self.base**log_spaced + + +def _params_from_puffer_sweep(sweep_config): + param_spaces = {} + for name, param in sweep_config.items(): + if name in ("method", "name", "metric", "max_score"): + continue + + assert isinstance(param, dict) + if any(isinstance(param[k], dict) for k in param): + param_spaces[name] = _params_from_puffer_sweep(param) + continue + + assert "distribution" in param + distribution = param["distribution"] + search_center = param["mean"] + kwargs = dict( + min=param["min"], + max=param["max"], + scale=param["scale"], + mean=search_center, + ) + if distribution == "uniform": + space = Linear(**kwargs) + elif distribution == "int_uniform": + space = Linear(**kwargs, is_integer=True) + elif distribution == "uniform_pow2": + space = Pow2(**kwargs, is_integer=True) + elif distribution == "log_normal": + space = Log(**kwargs) + elif distribution == "logit_normal": + space = Logit(**kwargs) + else: + raise ValueError(f"Invalid distribution: {distribution}") + + param_spaces[name] = space + + return param_spaces + + +class Hyperparameters: + def __init__(self, config, verbose=True): + self.spaces = _params_from_puffer_sweep(config) + self.flat_spaces = dict(pufferlib.utils.unroll_nested_dict(self.spaces)) + self.num = len(self.flat_spaces) + + self.metric = config["metric"] + assert self.metric["goal"] in ["maximize", "minimize"] + self.optimize_direction = 1 if self.metric["goal"] == "maximize" else -1 + + self.search_centers = np.array([e.norm_mean for e in self.flat_spaces.values()]) + self.min_bounds = np.array([e.norm_min for e in self.flat_spaces.values()]) + self.max_bounds = np.array([e.norm_max for e in self.flat_spaces.values()]) + self.search_scales = np.array([e.scale for e in self.flat_spaces.values()]) + + if verbose: + print("Min random sample:") + for name, space in self.flat_spaces.items(): + print(f"\t{name}: {space.unnormalize(max(space.norm_mean - space.scale, space.norm_min))}") + + print("Max random sample:") + for name, space in self.flat_spaces.items(): + print(f"\t{name}: {space.unnormalize(min(space.norm_mean + space.scale, 
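A worked sketch of the Log search space from pufferlib/sweep.py above, showing how normalize/unnormalize map a learning-rate range onto the [-1, 1] cube linearly in log10 space. Importing pufferlib.sweep assumes its torch/pyro dependencies are installed.

```python
# Sketch: the Log search space maps a hyperparameter range onto [-1, 1]
# linearly in log10 space, which is the coordinate system the sweep works in.
from pufferlib.sweep import Log

lr_space = Log(min=1e-4, max=1e-1, scale="auto", mean=1e-3)

print(lr_space.normalize(1e-4))      # -1.0 (lower bound)
print(lr_space.normalize(1e-3))      # -1/3: one third of the way up in log10 space
print(lr_space.unnormalize(-1 / 3))  # back to ~1e-3
```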
space.norm_max))}") + + def sample(self, n, mu=None, scale=1): + if mu is None: + mu = self.search_centers + + if len(mu.shape) == 1: + mu = mu[None, :] + + n_input, n_dim = mu.shape + scale = scale * self.search_scales + mu_idxs = np.random.randint(0, n_input, n) + samples = scale * (2 * np.random.rand(n, n_dim) - 1) + mu[mu_idxs] + return np.clip(samples, self.min_bounds, self.max_bounds) + + def from_dict(self, params): + flat_params = dict(pufferlib.utils.unroll_nested_dict(params)) + values = [] + for key, space in self.flat_spaces.items(): + assert key in flat_params, f"Missing hyperparameter {key}" + val = flat_params[key] + normed = space.normalize(val) + values.append(normed) + + return np.array(values) + + def to_dict(self, sample, fill=None): + params = deepcopy(self.spaces) if fill is None else fill + self._fill(params, self.spaces, sample) + return params + + def _fill(self, params, spaces, flat_sample, idx=0): + for name, space in spaces.items(): + if isinstance(space, dict): + idx = self._fill(params[name], spaces[name], flat_sample, idx=idx) + else: + params[name] = spaces[name].unnormalize(flat_sample[idx]) + idx += 1 + + return idx + + +def pareto_points(observations, eps=1e-6): + scores = np.array([e["output"] for e in observations]) + costs = np.array([e["cost"] for e in observations]) + pareto = [] + idxs = [] + for idx, obs in enumerate(observations): + # TODO: Ties and groups + higher_score = scores + eps > scores[idx] + lower_cost = costs - eps < costs[idx] + better = higher_score & lower_cost + better[idx] = False + if not better.any(): + pareto.append(obs) + idxs.append(idx) + + return pareto, idxs + + +class Random: + def __init__( + self, + sweep_config, + global_search_scale=1, + random_suggestions=1024, + ): + self.hyperparameters = Hyperparameters(sweep_config) + self.global_search_scale = global_search_scale + self.random_suggestions = random_suggestions + self.success_observations = [] + + def suggest(self, fill=None): + suggestions = self.hyperparameters.sample(self.random_suggestions) + self.suggestion = random.choice(suggestions) + return self.hyperparameters.to_dict(self.suggestion, fill) + + def observe(self, score, cost, is_failure=False): + self.success_observations.append( + dict( + input=self.suggestion, + output=score, + cost=cost, + is_failure=is_failure, + ) + ) + + +class ParetoGenetic: + def __init__( + self, + sweep_config, + global_search_scale=1, + suggestions_per_pareto=1, + bias_cost=True, + log_bias=False, + ): + self.hyperparameters = Hyperparameters(sweep_config) + self.global_search_scale = global_search_scale + self.suggestions_per_pareto = suggestions_per_pareto + self.bias_cost = bias_cost + self.log_bias = log_bias + self.success_observations = [] + + def suggest(self, fill=None): + if len(self.success_observations) == 0: + suggestion = self.hyperparameters.search_centers + return self.hyperparameters.to_dict(suggestion, fill), {} + + candidates, _ = pareto_points(self.success_observations) + pareto_costs = np.array([e["cost"] for e in candidates]) + + if self.bias_cost: + if self.log_bias: + cost_dists = np.abs(np.log(pareto_costs[:, None]) - np.log(pareto_costs[None, :])) + else: + cost_dists = np.abs(pareto_costs[:, None] - pareto_costs[None, :]) + + cost_dists += (np.max(pareto_costs) + 1) * np.eye(len(pareto_costs)) # mask self-distance + idx = np.argmax(np.min(cost_dists, axis=1)) + search_centers = candidates[idx]["input"] + else: + search_centers = np.stack([e["input"] for e in candidates]) + + suggestions = 
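A small sketch of pareto_points from the same module: an observation is kept only if no other run scores at least as well at lower cost. The dicts mirror the input/output/cost records the optimizers above build.

```python
# Sketch: pareto_points keeps observations that no other run dominates on
# both score ("output") and cost at once.
from pufferlib.sweep import pareto_points

observations = [
    dict(input=None, output=1.0, cost=10.0),  # dominated by the second entry
    dict(input=None, output=1.5, cost=8.0),   # on the front
    dict(input=None, output=2.0, cost=20.0),  # on the front (best score)
]
front, idxs = pareto_points(observations)
print(idxs)  # [1, 2]
```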
self.hyperparameters.sample(len(candidates) * self.suggestions_per_pareto, mu=search_centers) + suggestion = suggestions[np.random.randint(0, len(suggestions))] + return self.hyperparameters.to_dict(suggestion, fill), {} + + def observe(self, hypers, score, cost, is_failure=False): + params = self.hyperparameters.from_dict(hypers) + self.success_observations.append( + dict( + input=params, + output=score, + cost=cost, + is_failure=is_failure, + ) + ) + + +def create_gp(x_dim, scale_length=1.0): + # Dummy data + X = scale_length * torch.ones((1, x_dim)) + y = torch.zeros((1,)) + + matern_kernel = gp.kernels.Matern32(input_dim=x_dim, lengthscale=X) + linear_kernel = gp.kernels.Polynomial(x_dim, degree=1) + kernel = gp.kernels.Sum(linear_kernel, matern_kernel) + + # Params taken from HEBO: https://arxiv.org/abs/2012.03826 + model = gp.models.GPRegression(X, y, kernel=kernel, jitter=1.0e-4) + model.noise = pyro.nn.PyroSample(pyro.distributions.LogNormal(math.log(1e-2), 0.5)) + optimizer = torch.optim.Adam(model.parameters(), lr=0.0001) + return model, optimizer + + +class Protein: + def __init__( + self, + sweep_config, + max_suggestion_cost=None, + resample_frequency=5, + num_random_samples=10, + global_search_scale=1, + random_suggestions=1024, + suggestions_per_pareto=256, + min_score=None, + max_score=None, + ): + self.hyperparameters = Hyperparameters(sweep_config) + + self.min_score = min_score + self.max_score = max_score + if self.min_score is None: + warnings.warn("No min_score specified. This can destabilize tuning.") + if self.max_score is None: + warnings.warn("No max_score specified. This can destabilize tuning.") + + self.num_random_samples = num_random_samples + self.global_search_scale = global_search_scale + self.random_suggestions = random_suggestions + self.suggestions_per_pareto = suggestions_per_pareto + self.resample_frequency = resample_frequency + self.max_suggestion_cost = max_suggestion_cost + + self.success_observations = [] + self.failure_observations = [] + self.suggestion_idx = 0 + + self.gp_score, self.score_opt = create_gp(self.hyperparameters.num) + self.gp_cost, self.cost_opt = create_gp(self.hyperparameters.num) + + def suggest(self, fill): + # TODO: Clip random samples to bounds so we don't get bad high cost samples + info = {} + # if self.suggestion_idx <= self.num_random_samples: + # suggestions = self.hyperparameters.sample(self.random_suggestions) + # best_idx = np.random.randint(0, self.random_suggestions) + # best = suggestions[best_idx] + self.suggestion_idx += 1 + if len(self.success_observations) == 0: + best = self.hyperparameters.search_centers + return self.hyperparameters.to_dict(best, fill), info + elif len(self.success_observations) < self.num_random_samples: + suggestions = self.hyperparameters.sample(self.random_suggestions) + self.suggestion = random.choice(suggestions) + return self.hyperparameters.to_dict(self.suggestion, fill), info + elif self.resample_frequency and self.suggestion_idx % self.resample_frequency == 0: + candidates, _ = pareto_points(self.success_observations) + suggestions = np.stack([e["input"] for e in candidates]) + best_idx = np.random.randint(0, len(candidates)) + best = suggestions[best_idx] + return self.hyperparameters.to_dict(best, fill), info + + params = np.array([e["input"] for e in self.success_observations]) + params = torch.from_numpy(params) + eps = 1e-2 + + # Scores variable y + y = np.array([e["output"] for e in self.success_observations]) + + # Transformed scores + min_score = self.min_score + if 
min_score is None: + min_score = np.min(y) - abs(np.min(y)) + + if np.min(y) < min_score - 1e-6: + raise ValueError(f"Min score {min_score} is less than min score in data {np.min(y)}") + + max_score = self.max_score + if max_score is None: + max_score = np.max(y) + abs(np.max(y)) + + if np.max(y) > max_score + 1e-6: + raise ValueError(f"Max score {max_score} is greater than max score in data {np.max(y)}") + + # Linearize, exp transform, linearize + y_norm = (y - min_score) / (max_score - min_score) + # yt = -np.log(1 - y_norm + eps) + # yt_min = np.min(yt) + # yt_max = np.max(yt) + # yt_norm = (yt - yt_min) / (yt_max - yt_min) + + # self.gp_score.set_data(params, torch.from_numpy(yt_norm)) + self.gp_score.set_data(params, torch.from_numpy(y_norm)) + self.gp_score.train() + gp.util.train(self.gp_score, self.score_opt) + self.gp_score.eval() + + # Log costs + c = np.array([e["cost"] for e in self.success_observations]) + + log_c = np.log(c) + + # Linear input norm creates clean 1 mean fn + log_c_min = np.min(log_c) + log_c_max = np.max(log_c) + log_c_norm = (log_c - log_c_min) / (log_c_max - log_c_min) + + self.gp_cost.mean_function = lambda x: 1 + self.gp_cost.set_data(params, torch.from_numpy(log_c_norm)) + self.gp_cost.train() + gp.util.train(self.gp_cost, self.cost_opt) + self.gp_cost.eval() + + candidates, pareto_idxs = pareto_points(self.success_observations) + pareto_costs = np.array([e["cost"] for e in candidates]) + + # cost_dists = np.abs(np.log(pareto_costs[:, None]) - np.log(pareto_costs[None, :])) + ###cost_dists = np.abs(pareto_costs[:, None] - pareto_costs[None, :]) + # cost_dists += (np.max(pareto_costs) + 1)*np.eye(len(pareto_costs)) # mask self-distance + # idx = np.argmax(np.min(cost_dists, axis=1)) + # search_centers = candidates[idx]['input'] + + ### Sample suggestions + search_centers = np.stack([e["input"] for e in candidates]) + suggestions = self.hyperparameters.sample(len(candidates) * self.suggestions_per_pareto, mu=search_centers) + + ### Predict scores and costs + suggestions = torch.from_numpy(suggestions) + with torch.no_grad(): + gp_y_norm, gp_y_norm_var = self.gp_score(suggestions) + gp_log_c_norm, gp_log_c_norm_var = self.gp_cost(suggestions) + + gp_y_norm = gp_y_norm.numpy() + gp_log_c_norm = gp_log_c_norm.numpy() + + # Unlinearize, inverse exp transform, unlinearize + # gp_yt = gp_yt_norm*(yt_max - yt_min) + yt_min + # gp_y_norm = -(np.exp(-gp_yt) - 1 - eps) + # gp_y = gp_y_norm*(max_score - min_score) + min_score + gp_y = gp_y_norm * (max_score - min_score) + min_score + + gp_log_c = gp_log_c_norm * (log_c_max - log_c_min) + log_c_min + gp_c = np.exp(gp_log_c) + + gp_c_min = np.min(gp_c) + gp_c_max = np.max(gp_c) + gp_c_norm = (gp_c - gp_c_min) / (gp_c_max - gp_c_min) + + pareto_y = y[pareto_idxs] + # pareto_yt = yt[pareto_idxs] + # pareto_yt_norm = yt_norm[pareto_idxs] + pareto_c = c[pareto_idxs] + pareto_log_c_norm = log_c_norm[pareto_idxs] + + max_c = np.max(c) + min_c = np.min(c) + + c_right = abs(pareto_log_c_norm[None, :] - gp_log_c_norm[:, None]) + + # pareto_c_norm = (pareto_c - min_c) / (max_c - min_c) + # gp_c_norm = (gp_c - min_c) / (max_c - min_c) + # c_right = np.abs(pareto_c_norm[None, :] - gp_c_norm[:, None]) + + # pareto_log_c_norm = (np.log(pareto_c) - log_c_min) / (log_c_max - log_c_min) + # c_right = np.abs(pareto_log_c_norm[None, :] - gp_log_c_norm[:, None]) + + sorted_dist = np.sort(c_right, axis=1) + # top_k = sorted_dist[:, :5] + # pareto_dist_weight = np.sum(top_k, axis=1) / top_k.shape[1] + + nearest_idx = np.argmin(c_right, 
axis=1) + nearest_pareto_dist = np.min(c_right, axis=1) + nearest_pareto_y = pareto_y[nearest_idx] + + # c_left = np.abs(gp_c[:, None] - pareto_c[None, :]) + # c_left[c_left < 0] = np.inf + # nearest_idx = np.argmin(c_left, axis=1) + # nearest_pareto_yt_norm = pareto_yt_norm[nearest_idx] + + max_c_mask = gp_c < self.max_suggestion_cost + # suggestion_scores = self.hyperparameters.optimize_direction * max_c_mask * ( + # gp_yt_norm - nearest_pareto_yt_norm) * nearest_pareto_dist + + # suggestion_scores = self.hyperparameters.optimize_direction * max_c_mask * ( + # gp_yt_norm - nearest_pareto_yt_norm)# / gp_c + + # np.argwhere(gp_c > c) + cumsum_mask = c[None, :] <= np.clip(gp_c[:, None], min_c, max_c) + cumsum_mask = cumsum_mask * c[None, :] + cumsum = np.sum(cumsum_mask, axis=1) / np.sum(c) + target = gp_c_norm + weight = target - cumsum + + # if np.random.rand() < 0.5: + # score = gp_y_norm + # else: + # score = gp_y_norm * weight + # suggestion_scores = self.hyperparameters.optimize_direction * max_c_mask * ( + # score)# / gp_c + + target = 1.25 * np.random.rand() + weight = 1 - abs(target - gp_log_c_norm) + + suggestion_scores = self.hyperparameters.optimize_direction * max_c_mask * (gp_y_norm * weight) # / gp_c + + # suggestion_scores = self.hyperparameters.optimize_direction * max_c_mask * ( + # gp_y_norm*nearest_pareto_dist)# / gp_c + + # exp_scores = np.exp(suggestion_scores) + # sum_exp_scores = np.sum(exp_scores) + # softmax_scores = exp_scores / sum_exp_scores + # idxs = np.arange(len(softmax_scores)) + # best_idx = np.random.choice(idxs, p=softmax_scores) + + # This works and uncovers approximate binary search when the GP is perfect + # Can't include cost in denom because it biases this case + # Instead, use conservative score and/or cost estimates + # Just need to figure out why the GP is overconfident + + best_idx = np.argmax(suggestion_scores) + # best_idx = np.argmax(gp_y_norm) + info = dict( + cost=gp_c[best_idx].item(), + score=gp_y[best_idx].item(), + nearby=nearest_pareto_y[best_idx].item(), + dist=nearest_pareto_dist[best_idx].item(), + rating=suggestion_scores[best_idx].item(), + ) + print( + "Predicted -- ", + f"Score: {info['score']:.3f}", + f"Nearby: {info['nearby']:.3f}", + f"Dist: {info['dist']:.3f}", + f"Cost: {info['cost']:.3f}", + f"Rating: {info['rating']:.3f}", + ) + """ + if info['rating'] < 10: + from bokeh.models import ColumnDataSource, LinearColorMapper + from bokeh.plotting import figure, show + from bokeh.palettes import Turbo256 + + source = ColumnDataSource(data=dict( + x=c, + y=y, + order=np.argsort(c), + )) + mapper = LinearColorMapper( + palette=Turbo256, + low=0, + high=len(c) + ) + + idxs = np.argsort(pareto_c) + pareto_source = ColumnDataSource(data=dict( + x=pareto_c[idxs], + y=pareto_y[idxs], + )) + + c_sorted = sorted(c) + cost_source = ColumnDataSource(data=dict( + x = c_sorted, + y = np.cumsum(c_sorted) / np.sum(c_sorted), + )) + + #gp_pareto_source = ColumnDataSource(data=dict( + # x=gp_c, + # y=gp_y, + # order=np.argsort(gp_c), + #)) + + preds = [{ + 'output': gp_y[i], + 'cost': gp_c[i], + } for i in range(len(gp_c))] + _, pareto_idxs = pareto_points(preds) + + gp_c_pareto = gp_c[pareto_idxs] + gp_y_pareto = gp_y[pareto_idxs] + idxs = np.argsort(gp_c_pareto) + gp_source = ColumnDataSource(data=dict( + x=gp_c_pareto[idxs], + y=gp_y_pareto[idxs], + )) + + p = figure(title='Hyperparam Test', + x_axis_label='Cost', + y_axis_label='Score') + + # Original data + p.scatter( + x='x', + y='y', + color={'field': 'order', 'transform': mapper}, + 
size=10, + source=source + ) + + p.line(x='x', y='y', color='red', source=pareto_source) + p.line(x='x', y='y', color='blue', source=gp_source) + p.line(x='x', y='y', color='green', source=cost_source) + #p.line(x='x', y='y', color='green', source=gp_pareto_source) + + show(p) + """ + + best = suggestions[best_idx].numpy() + return self.hyperparameters.to_dict(best, fill), info + + def observe(self, hypers, score, cost, is_failure=False): + params = self.hyperparameters.from_dict(hypers) + new_observation = dict( + input=params, + output=score, + cost=cost, + is_failure=is_failure, + ) + + if len(self.success_observations) == 0: + self.success_observations.append(new_observation) + return + + success_params = np.stack([e["input"] for e in self.success_observations]) + dist = np.linalg.norm(params - success_params, axis=1) + same = np.where(dist < 1e-6)[0] + if len(same) > 0: + self.success_observations[same[0]] = new_observation + else: + self.success_observations.append(new_observation) + + +def _carbs_params_from_puffer_sweep(sweep_config): + from carbs import ( + LinearSpace, + LogitSpace, + LogSpace, + Param, + ) + + param_spaces = {} + for name, param in sweep_config.items(): + if name in ("method", "name", "metric", "max_score"): + continue + + assert isinstance(param, dict) + if any(isinstance(param[k], dict) for k in param): + param_spaces[name] = _carbs_params_from_puffer_sweep(param) + continue + + assert "distribution" in param + distribution = param["distribution"] + search_center = param["mean"] + kwargs = dict( + min=param["min"], + max=param["max"], + ) + if distribution == "uniform": + space = LinearSpace(**kwargs) + elif distribution in ("int_uniform", "uniform_pow2"): + space = LinearSpace(**kwargs, is_integer=True) + elif distribution == "log_normal": + space = LogSpace(**kwargs) + elif distribution == "logit_normal": + space = LogitSpace(**kwargs) + else: + raise ValueError(f"Invalid distribution: {distribution}") + + param_spaces[name] = Param(name=name, space=space, search_center=search_center) + + return param_spaces + + +class Carbs: + def __init__( + self, + sweep_config: dict, + max_suggestion_cost: float = None, + resample_frequency: int = 5, + num_random_samples: int = 10, + ): + param_spaces = _carbs_params_from_puffer_sweep(sweep_config) + flat_spaces = [e[1] for e in pufferlib.utils.unroll_nested_dict(param_spaces)] + for e in flat_spaces: + print(e.name, e.space) + + from carbs import ( + CARBS, + CARBSParams, + ) + + carbs_params = CARBSParams( + better_direction_sign=1, + is_wandb_logging_enabled=False, + resample_frequency=resample_frequency, + num_random_samples=num_random_samples, + max_suggestion_cost=max_suggestion_cost, + is_saved_on_every_observation=False, + ) + self.carbs = CARBS(carbs_params, flat_spaces) + + def suggest(self, args): + self.suggestion = self.carbs.suggest().suggestion + for k in ("train", "env"): + for name, param in args["sweep"][k].items(): + if name in self.suggestion: + args[k][name] = self.suggestion[name] + + def observe(self, hypers, score, cost, is_failure=False): + from carbs import ObservationInParam + + self.carbs.observe( + ObservationInParam( + input=self.suggestion, + output=score, + cost=cost, + is_failure=is_failure, + ) + ) diff --git a/pufferlib/utils.py b/pufferlib/utils.py index ad50c3054..a1c3f8e29 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -1,35 +1,30 @@ -from pdb import set_trace as T -from collections import OrderedDict - -import numpy as np - -import time +import functools +import 
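A rough sketch of the intended suggest/observe loop for the Protein optimizer above. run_training is a hypothetical stand-in for a full training run returning (score, cost in seconds); the config keys mirror what _params_from_puffer_sweep expects, and the specific hyperparameter ranges are illustrative only.

```python
# Sketch: suggest/observe loop for the Protein sweep. run_training is a
# hypothetical user function that trains with the suggested hyperparameters
# and returns (score, cost_in_seconds); the ranges below are illustrative.
from pufferlib.sweep import Protein

sweep_config = {
    "metric": {"goal": "maximize"},
    "train": {
        "learning_rate": dict(distribution="log_normal", min=1e-4, max=1e-1, mean=1e-3, scale="auto"),
        "batch_size": dict(distribution="uniform_pow2", min=1024, max=65536, mean=16384, scale="auto"),
    },
}
protein = Protein(sweep_config, max_suggestion_cost=3600, min_score=0.0, max_score=1.0)

for _ in range(20):
    hypers, info = protein.suggest(fill=None)  # nested dict of unnormalized values
    score, cost = run_training(hypers)         # hypothetical stand-in
    protein.observe(hypers, score, cost)       # feeds the score and cost GPs
```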
importlib +import inspect import os -import sys -import pickle import subprocess -from contextlib import redirect_stdout, redirect_stderr, contextmanager -from io import StringIO -import psutil - +import sys +import time import warnings +from collections import OrderedDict +from contextlib import redirect_stderr, redirect_stdout from functools import wraps +from io import StringIO + +import numpy as np +import psutil -import functools -import inspect -import importlib def validate_args(fn, kwargs): fn_kwargs = get_init_args(fn) for param, val in kwargs.items(): if param not in fn_kwargs: raise ValueError( - f'Invalid argument\n{param}\nto\n{fn}\n' - f'which takes \n{fn_kwargs}\n' - f'Double check your config' + f"Invalid argument\n{param}\nto\n{fn}\nwhich takes \n{fn_kwargs}\nDouble check your config" ) + def get_init_args(fn): if fn is None: return {} @@ -40,7 +35,7 @@ def get_init_args(fn): sig = inspect.signature(fn) kwargs = {} for name, param in sig.parameters.items(): - if name in ['env', 'policy']: + if name in ["env", "policy"]: # Hack to avoid duplicate kwargs continue if param.kind == inspect.Parameter.VAR_POSITIONAL: @@ -63,15 +58,17 @@ def unroll_nested_dict(d): else: yield k, v + def install_requirements(env): - '''Pip install dependencies for specified environment''' - pip_install_cmd = [sys.executable, "-m", "pip", "install", "-e" f".[{env}]"] + """Pip install dependencies for specified environment""" + pip_install_cmd = [sys.executable, "-m", "pip", "install", f"-e.[{env}]"] proc = subprocess.run(pip_install_cmd, capture_output=True, text=True) if proc.returncode != 0: raise RuntimeError(f"Error installing requirements: {proc.stderr}") + def install_and_import(package): - '''Install and import a package''' + """Install and import a package""" try: module = importlib.import_module(package) except ImportError: @@ -80,23 +77,27 @@ def install_and_import(package): return module + def silence_warnings(original_func, category=DeprecationWarning): @wraps(original_func) def wrapper(*args, **kwargs): with warnings.catch_warnings(): warnings.simplefilter("ignore", category=category) return original_func(*args, **kwargs) + return wrapper + def check_env(env): - #assert issubclass(env_cls, gym.Env), "Not a gymnasium env (are you on old gym?)" - assert hasattr(env, 'possible_agents') + # assert issubclass(env_cls, gym.Env), "Not a gymnasium env (are you on old gym?)" + assert hasattr(env, "possible_agents") assert len(env.possible_agents) obs_space = env.observation_space(env.possible_agents[0]) atn_space = env.action_space(env.possible_agents[0]) for e in env.possible_agents: - assert env.observation_space(e) == obs_space, 'All agents must have same obs space' - assert env.action_space(e) == atn_space, 'All agents must have same atn space' + assert env.observation_space(e) == obs_space, "All agents must have same obs space" + assert env.action_space(e) == atn_space, "All agents must have same atn space" + def make_zeros_like(data): if isinstance(data, dict): @@ -108,7 +109,8 @@ def make_zeros_like(data): elif isinstance(data, (int, float)): return 0 else: - raise ValueError(f'Unsupported type: {type(data)}') + raise ValueError(f"Unsupported type: {type(data)}") + def compare_arrays(array_1, array_2): assert isinstance(array_1, np.ndarray) @@ -116,6 +118,7 @@ def compare_arrays(array_1, array_2): assert array_1.shape == array_2.shape return np.allclose(array_1, array_2) + def compare_dicts(dict_1, dict_2, idx): assert isinstance(dict_1, (dict, OrderedDict)) assert isinstance(dict_2, 
(dict, OrderedDict)) @@ -129,6 +132,7 @@ def compare_dicts(dict_1, dict_2, idx): return True + def compare_lists(list_1, list_2, idx): assert isinstance(list_1, (list, tuple)) assert isinstance(list_2, (list, tuple)) @@ -136,19 +140,20 @@ def compare_lists(list_1, list_2, idx): if len(list_1) != len(list_2): raise ValueError("Lengths do not match between lists/tuples.") - for v1, v2 in zip(list_1, list_2): + for v1, v2 in zip(list_1, list_2, strict=False): if not compare_space_samples(v1, v2, idx): return False - + return True - + + def compare_space_samples(sample_1, sample_2, sample_2_batch_idx=None): - '''Compare two samples from the same space - + """Compare two samples from the same space + Optionally, sample_2 may be a batch of samples from the same space concatenated along the first dimension of the leaves. In this case, sample_2_batch_idx specifies which sample to compare. - ''' + """ if isinstance(sample_1, (dict, OrderedDict)): return compare_dicts(sample_1, sample_2, sample_2_batch_idx) elif isinstance(sample_1, (list, tuple)): @@ -168,6 +173,7 @@ def compare_space_samples(sample_1, sample_2, sample_2_batch_idx=None): else: raise ValueError(f"Unsupported type: {type(sample_1)}") + def _get_dtype_bounds(dtype): if dtype == bool: return 0, 1 @@ -181,25 +187,30 @@ def _get_dtype_bounds(dtype): else: raise ValueError(f"Unsupported dtype: {dtype}") + def is_dict_space(space): # Compatible with gym/gymnasium - return type(space).__name__ == 'Dict' + return type(space).__name__ == "Dict" + def is_multiagent(env): - import pettingzoo import gym + import pettingzoo + if inspect.isclass(env): env_cls = env else: env_cls = type(env) if not issubclass(env_cls, pettingzoo.AECEnv) and not issubclass(env_cls, pettingzoo.ParallelEnv): - assert issubclass(env_cls, gym.Env), 'Environment must subclass pettingzoo.AECEnv/ParallelEnv or gym.Env' + assert issubclass(env_cls, gym.Env), "Environment must subclass pettingzoo.AECEnv/ParallelEnv or gym.Env" return False return True + def current_datetime(): - return time.strftime('%Y-%m-%d_%H-%M-%S', time.localtime()) + return time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()) + def myprint(d): stack = d.items() @@ -210,6 +221,7 @@ def myprint(d): else: print("%s: %s" % (k, v)) + class RandomState: def __init__(self, seed): self.rng = np.random.RandomState(seed) @@ -218,11 +230,11 @@ def random(self): return self.rng.random() def probabilistic_round(self, n): - frac, integer = np.modf(n) - if self.random() < frac: - return int(integer) + 1 - else: - return int(integer) + frac, integer = np.modf(n) + if self.random() < frac: + return int(integer) + 1 + else: + return int(integer) def sample(self, ary, n): n_rounded = self.probabilistic_round(n) @@ -231,17 +243,19 @@ def sample(self, ary, n): def choice(self, ary): return self.sample(ary, 1)[0] + def format_bytes(size): - if size >= 1024 ** 4: - return f'{size / (1024 ** 4):.2f} TB' - elif size >= 1024 ** 3: - return f'{size / (1024 ** 3):.2f} GB' - elif size >= 1024 ** 2: - return f'{size / (1024 ** 2):.2f} MB' + if size >= 1024**4: + return f"{size / (1024**4):.2f} TB" + elif size >= 1024**3: + return f"{size / (1024**3):.2f} GB" + elif size >= 1024**2: + return f"{size / (1024**2):.2f} MB" elif size >= 1024: - return f'{size / 1024:.2f} KB' + return f"{size / 1024:.2f} KB" else: - return f'{size} B' + return f"{size} B" + class Profiler: def __init__(self, elapsed=True, calls=True, memory=False, pytorch_memory=False): @@ -250,27 +264,28 @@ def __init__(self, elapsed=True, calls=True, memory=False, 
pytorch_memory=False) self.memory = None self.pytorch_memory = None self.prev = 0 - + self.track_elapsed = elapsed self.track_calls = calls self.track_memory = memory self.track_pytorch_memory = pytorch_memory - + if memory: self.process = psutil.Process() if pytorch_memory: import torch + self.torch = torch @property def serial(self): return { - 'elapsed': self.elapsed, - 'calls': self.calls, - 'memory': self.memory, - 'pytorch_memory': self.pytorch_memory, - 'delta': self.delta + "elapsed": self.elapsed, + "calls": self.calls, + "memory": self.memory, + "pytorch_memory": self.pytorch_memory, + "delta": self.delta, } @property @@ -304,26 +319,27 @@ def __exit__(self, *args): def __repr__(self): parts = [] if self.track_elapsed: - parts.append(f'Elapsed: {self.elapsed:.4f} s') + parts.append(f"Elapsed: {self.elapsed:.4f} s") if self.track_calls: - parts.append(f'Calls: {self.calls}') + parts.append(f"Calls: {self.calls}") if self.track_memory: - parts.append(f'Memory: {format_bytes(self.memory)}') + parts.append(f"Memory: {format_bytes(self.memory)}") if self.track_pytorch_memory: - parts.append(f'PyTorch Memory: {format_bytes(self.pytorch_memory)}') + parts.append(f"PyTorch Memory: {format_bytes(self.pytorch_memory)}") return ", ".join(parts) # Aliases for use without context manager start = __enter__ stop = __exit__ + def profile(func): name = func.__name__ def wrapper(*args, **kwargs): self = args[0] - if not hasattr(self, '_timers'): + if not hasattr(self, "_timers"): self._timers = {} if name not in self._timers: @@ -338,6 +354,7 @@ def wrapper(*args, **kwargs): return wrapper + def aggregate_profilers(profiler_dicts): merged = {} @@ -349,7 +366,8 @@ def aggregate_profilers(profiler_dicts): return merged -class Suppress(): + +class Suppress: def __init__(self): self.f = StringIO() self.null_1 = os.open(os.devnull, os.O_WRONLY | os.O_TRUNC | os.O_CREAT) diff --git a/pufferlib/vector.py b/pufferlib/vector.py index f8339e372..07415c8de 100644 --- a/pufferlib/vector.py +++ b/pufferlib/vector.py @@ -1,18 +1,17 @@ # TODO: Check actions passed to envs are right shape? 
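A short usage sketch of the Profiler utility from pufferlib/utils.py above, exercising only the context-manager path and the fields visible in this diff.

```python
# Sketch: timing a block with the Profiler from pufferlib/utils.py.
import time
from pufferlib.utils import Profiler

prof = Profiler()     # elapsed time and call count are tracked by default
with prof:
    time.sleep(0.25)  # the profiled section

print(prof)           # e.g. "Elapsed: 0.2503 s, Calls: 1"
print(prof.serial)    # plain-dict form, consumed by aggregate_profilers
```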
On first call at least -from pdb import set_trace as T -import numpy as np import time + +import numpy as np import psutil +import pufferlib.spaces from pufferlib import namespace from pufferlib.emulation import GymnasiumPufferEnv, PettingZooPufferEnv from pufferlib.environment import PufferEnv, set_buffers from pufferlib.exceptions import APIUsageError from pufferlib.namespace import Namespace -import pufferlib.spaces -import gymnasium RESET = 0 STEP = 1 @@ -22,35 +21,40 @@ MAIN = 5 INFO = 6 + def recv_precheck(vecenv): if vecenv.flag != RECV: - raise APIUsageError('Call reset before stepping') + raise APIUsageError("Call reset before stepping") vecenv.flag = SEND + def send_precheck(vecenv, actions): if vecenv.flag != SEND: - raise APIUsageError('Call (async) reset + recv before sending') + raise APIUsageError("Call (async) reset + recv before sending") actions = np.asarray(actions) if not vecenv.initialized: vecenv.initialized = True if not vecenv.action_space.contains(actions): - raise APIUsageError('Actions do not match action space') + raise APIUsageError("Actions do not match action space") vecenv.flag = RECV return actions + def reset(vecenv, seed=42): vecenv.async_reset(seed) obs, rewards, terminals, truncations, infos, env_ids, masks = vecenv.recv() return obs, infos + def step(vecenv, actions): actions = np.asarray(actions) vecenv.send(actions) obs, rewards, terminals, truncations, infos, env_ids, masks = vecenv.recv() - return obs, rewards, terminals, truncations, infos # include env_ids or no? + return obs, rewards, terminals, truncations, infos # include env_ids or no? + class Serial: reset = reset @@ -59,7 +63,7 @@ class Serial: @property def num_envs(self): return self.agents_per_batch - + def __init__(self, env_creators, env_args, env_kwargs, num_envs, buf=None, **kwargs): self.driver_env = env_creators[0](*env_args[0], **env_kwargs[0]) self.agents_per_batch = self.driver_env.num_agents * num_envs @@ -70,7 +74,6 @@ def __init__(self, env_creators, env_args, env_kwargs, num_envs, buf=None, **kwa self.action_space = pufferlib.spaces.joint_space(self.single_action_space, self.agents_per_batch) self.observation_space = pufferlib.spaces.joint_space(self.single_observation_space, self.agents_per_batch) - set_buffers(self, buf) self.envs = [] @@ -83,7 +86,7 @@ def __init__(self, env_creators, env_args, env_kwargs, num_envs, buf=None, **kwa terminals=self.terminals[ptr:end], truncations=self.truncations[ptr:end], masks=self.masks[ptr:end], - actions=self.actions[ptr:end] + actions=self.actions[ptr:end], ) ptr = end env = env_creators[i](*env_args[i], buf=buf_i, **env_kwargs[i]) @@ -103,9 +106,9 @@ def async_reset(self, seed=42): seed = make_seeds(seed, len(self.envs)) infos = [] - for env, s in zip(self.envs, seed): + for env, s in zip(self.envs, seed, strict=False): ob, i = env.reset(seed=s) - + if isinstance(i, list): infos.extend(i) else: @@ -138,23 +141,43 @@ def send(self, actions): def recv(self): recv_precheck(self) - return (self.observations, self.rewards, self.terminals, self.truncations, - self.infos, self.agent_ids, self.masks) + return ( + self.observations, + self.rewards, + self.terminals, + self.truncations, + self.infos, + self.agent_ids, + self.masks, + ) def close(self): for env in self.envs: env.close() -def _worker_process(env_creators, env_args, env_kwargs, obs_shape, obs_dtype, atn_shape, atn_dtype, - num_envs, num_agents, num_workers, worker_idx, send_pipe, recv_pipe, shm, is_native): +def _worker_process( + env_creators, + env_args, + env_kwargs, + 
obs_shape, + obs_dtype, + atn_shape, + atn_dtype, + num_envs, + num_agents, + num_workers, + worker_idx, + send_pipe, + recv_pipe, + shm, + is_native, +): # Environments read and write directly to shared memory - shape = (num_workers, num_envs*num_agents) - atn_arr = np.ndarray((*shape, *atn_shape), - dtype=atn_dtype, buffer=shm.actions)[worker_idx] + shape = (num_workers, num_envs * num_agents) + atn_arr = np.ndarray((*shape, *atn_shape), dtype=atn_dtype, buffer=shm.actions)[worker_idx] buf = namespace( - observations=np.ndarray((*shape, *obs_shape), - dtype=obs_dtype, buffer=shm.observations)[worker_idx], + observations=np.ndarray((*shape, *obs_shape), dtype=obs_dtype, buffer=shm.observations)[worker_idx], rewards=np.ndarray(shape, dtype=np.float32, buffer=shm.rewards)[worker_idx], terminals=np.ndarray(shape, dtype=bool, buffer=shm.terminals)[worker_idx], truncations=np.ndarray(shape, dtype=bool, buffer=shm.truncateds)[worker_idx], @@ -168,7 +191,7 @@ def _worker_process(env_creators, env_args, env_kwargs, obs_shape, obs_dtype, at else: envs = Serial(env_creators, env_args, env_kwargs, num_envs, buf=buf) - semaphores=np.ndarray(num_workers, dtype=np.uint8, buffer=shm.semaphores) + semaphores = np.ndarray(num_workers, dtype=np.uint8, buffer=shm.semaphores) start = time.time() while True: sem = semaphores[worker_idx] @@ -194,40 +217,55 @@ def _worker_process(env_creators, env_args, env_kwargs, obs_shape, obs_dtype, at else: semaphores[worker_idx] = MAIN + class Multiprocessing: - '''Runs environments in parallel using multiprocessing + """Runs environments in parallel using multiprocessing Use this vectorization module for most applications - ''' + """ + reset = reset step = step @property def num_envs(self): return self.agents_per_batch - - def __init__(self, env_creators, env_args, env_kwargs, - num_envs, num_workers=None, batch_size=None, - zero_copy=True, overwork=False, **kwargs): + + def __init__( + self, + env_creators, + env_args, + env_kwargs, + num_envs, + num_workers=None, + batch_size=None, + zero_copy=True, + overwork=False, + **kwargs, + ): if batch_size is None: batch_size = num_envs if num_workers is None: num_workers = num_envs import psutil + cpu_cores = psutil.cpu_count(logical=False) if num_workers > cpu_cores and not overwork: - raise APIUsageError(' '.join([ - f'num_workers ({num_workers}) > hardware cores ({cpu_cores}) is disallowed by default.', - 'PufferLib multiprocessing is heavily optimized for 1 process per hardware core.', - 'If you really want to do this, set overwork=True (--vec-overwork in our demo.py).', - ])) + raise APIUsageError( + " ".join( + [ + f"num_workers ({num_workers}) > hardware cores ({cpu_cores}) is disallowed by default.", + "PufferLib multiprocessing is heavily optimized for 1 process per hardware core.", + "If you really want to do this, set overwork=True (--vec-overwork in our demo.py).", + ] + ) + ) num_batches = num_envs / batch_size if zero_copy and num_batches != int(num_batches): # This is so you can have n equal buffers - raise APIUsageError( - 'zero_copy: num_envs must be divisible by batch_size') + raise APIUsageError("zero_copy: num_envs must be divisible by batch_size") self.num_environments = num_envs envs_per_worker = num_envs // num_workers @@ -266,23 +304,22 @@ def __init__(self, env_creators, env_args, env_kwargs, self.agent_ids = np.arange(num_agents).reshape(num_workers, agents_per_worker) from multiprocessing import RawArray + self.shm = namespace( observations=RawArray(obs_ctype, num_agents * int(np.prod(obs_shape))), 
actions=RawArray(atn_ctype, num_agents * int(np.prod(atn_shape))), - rewards=RawArray('f', num_agents), - terminals=RawArray('b', num_agents), - truncateds=RawArray('b', num_agents), - masks=RawArray('b', num_agents), - semaphores=RawArray('c', num_workers), + rewards=RawArray("f", num_agents), + terminals=RawArray("b", num_agents), + truncateds=RawArray("b", num_agents), + masks=RawArray("b", num_agents), + semaphores=RawArray("c", num_workers), ) shape = (num_workers, agents_per_worker) self.obs_batch_shape = (self.agents_per_batch, *obs_shape) self.atn_batch_shape = (self.workers_per_batch, agents_per_worker, *atn_shape) - self.actions = np.ndarray((*shape, *atn_shape), - dtype=atn_dtype, buffer=self.shm.actions) + self.actions = np.ndarray((*shape, *atn_shape), dtype=atn_dtype, buffer=self.shm.actions) self.buf = namespace( - observations=np.ndarray((*shape, *obs_shape), - dtype=obs_dtype, buffer=self.shm.observations), + observations=np.ndarray((*shape, *obs_shape), dtype=obs_dtype, buffer=self.shm.observations), rewards=np.ndarray(shape, dtype=np.float32, buffer=self.shm.rewards), terminals=np.ndarray(shape, dtype=bool, buffer=self.shm.terminals), truncations=np.ndarray(shape, dtype=bool, buffer=self.shm.truncateds), @@ -292,8 +329,9 @@ def __init__(self, env_creators, env_args, env_kwargs, self.buf.semaphores[:] = MAIN from multiprocessing import Pipe, Process - self.send_pipes, w_recv_pipes = zip(*[Pipe() for _ in range(num_workers)]) - w_send_pipes, self.recv_pipes = zip(*[Pipe() for _ in range(num_workers)]) + + self.send_pipes, w_recv_pipes = zip(*[Pipe() for _ in range(num_workers)], strict=False) + w_send_pipes, self.recv_pipes = zip(*[Pipe() for _ in range(num_workers)], strict=False) self.recv_pipe_dict = {p: i for i, p in enumerate(self.recv_pipes)} self.processes = [] @@ -302,11 +340,23 @@ def __init__(self, env_creators, env_args, env_kwargs, end = start + envs_per_worker p = Process( target=_worker_process, - args=(env_creators[start:end], env_args[start:end], - env_kwargs[start:end], obs_shape, obs_dtype, - atn_shape, atn_dtype, envs_per_worker, driver_env.num_agents, - num_workers, i, w_send_pipes[i], w_recv_pipes[i], - self.shm, is_native) + args=( + env_creators[start:end], + env_args[start:end], + env_kwargs[start:end], + obs_shape, + obs_dtype, + atn_shape, + atn_dtype, + envs_per_worker, + driver_env.num_agents, + num_workers, + i, + w_send_pipes[i], + w_recv_pipes[i], + self.shm, + is_native, + ), ) p.start() self.processes.append(p) @@ -354,8 +404,7 @@ def recv(self): # microseconds of extra index processing time completed = np.zeros(self.num_workers, dtype=bool) completed[self.ready_workers] = True - buffers = completed.reshape( - -1, self.workers_per_batch).all(axis=1) + buffers = completed.reshape(-1, self.workers_per_batch).all(axis=1) start = buffers.argmax() if not buffers[start]: continue @@ -365,17 +414,16 @@ def recv(self): w_slice = slice(start, end) s_range = range(start, end) self.waiting_workers.extend(s_range) - self.ready_workers = [e for e in self.ready_workers - if e not in s_range] + self.ready_workers = [e for e in self.ready_workers if e not in s_range] break elif len(self.ready_workers) >= self.workers_per_batch: # Full async path for batch size > 1. 
Alawys copies # data because of non-contiguous worker indices # Can be faster for envs with small observations - w_slice = self.ready_workers[:self.workers_per_batch] + w_slice = self.ready_workers[: self.workers_per_batch] s_range = w_slice self.waiting_workers.extend(s_range) - self.ready_workers = self.ready_workers[self.workers_per_batch:] + self.ready_workers = self.ready_workers[self.workers_per_batch :] break self.w_slice = w_slice @@ -401,7 +449,7 @@ def recv(self): def send(self, actions): actions = send_precheck(self, actions).reshape(self.atn_batch_shape) # TODO: What shape? - + idxs = self.w_slice self.actions[idxs] = actions self.buf.semaphores[idxs] = STEP @@ -418,12 +466,12 @@ def async_reset(self, seed=42): self.buf.semaphores[:] = RESET for i in range(self.num_workers): - start = i*self.envs_per_worker - end = (i+1)*self.envs_per_worker + start = i * self.envs_per_worker + end = (i + 1) * self.envs_per_worker self.send_pipes[i].send(seed[start:end]) def close(self): - ''' + """ while self.waiting_workers: worker = self.waiting_workers.pop(0) sem = self.buf.semaphores[worker] @@ -444,24 +492,25 @@ def close(self): self.ready_workers.append(worker) if sem == INFO: self.recv_pipes[worker].recv() - + else: self.waiting_workers.append(worker) - ''' + """ for p in self.processes: p.terminate() -class Ray(): - '''Runs environments in parallel on multiple processes using Ray + +class Ray: + """Runs environments in parallel on multiple processes using Ray Use this module for distributed simulation on a cluster. - ''' + """ + reset = reset step = step - def __init__(self, env_creators, env_args, env_kwargs, num_envs, - num_workers=None, batch_size=None, **kwargs): + def __init__(self, env_creators, env_args, env_kwargs, num_envs, num_workers=None, batch_size=None, **kwargs): if batch_size is None: batch_size = num_envs if num_workers is None: @@ -491,12 +540,14 @@ def __init__(self, env_creators, env_args, env_kwargs, num_envs, self.single_action_space = driver_env.single_action_space self.action_space = pufferlib.spaces.joint_space(self.single_action_space, self.agents_per_batch) self.observation_space = pufferlib.spaces.joint_space(self.single_observation_space, self.agents_per_batch) - + self.agent_ids = np.arange(num_agents).reshape(num_workers, agents_per_worker) import ray + if not ray.is_initialized(): import logging + ray.init( include_dashboard=False, # WSL Compatibility logging_level=logging.ERROR, @@ -508,10 +559,7 @@ def __init__(self, env_creators, env_args, env_kwargs, num_envs, end = start + envs_per_worker self.envs.append( ray.remote(Serial).remote( - env_creators[start:end], - env_args[start:end], - env_kwargs[start:end], - envs_per_worker + env_creators[start:end], env_args[start:end], env_kwargs[start:end], envs_per_worker ) ) @@ -530,12 +578,11 @@ def recv(self): recvs = self.ray.get(self.async_handles[:workers_per_batch]) env_id = [_ for _ in range(workers_per_batch)] else: - ready, busy = self.ray.wait( - self.async_handles, num_returns=workers_per_batch) + ready, busy = self.ray.wait(self.async_handles, num_returns=workers_per_batch) env_id = [self.async_handles.index(e) for e in ready] recvs = self.ray.get(ready) - o, r, d, t, infos, ids, m = zip(*recvs) + o, r, d, t, infos, ids, m = zip(*recvs, strict=False) self.prev_env_id = env_id infos = [i for ii in infos for i in ii] @@ -582,38 +629,40 @@ def close(self): def make(env_creator_or_creators, env_args=None, env_kwargs=None, backend=PufferEnv, num_envs=1, **kwargs): if num_envs < 1: - raise 
APIUsageError('num_envs must be at least 1') + raise APIUsageError("num_envs must be at least 1") if num_envs != int(num_envs): - raise APIUsageError('num_envs must be an integer') + raise APIUsageError("num_envs must be an integer") if backend == PufferEnv: env_args = env_args or [] env_kwargs = env_kwargs or {} vecenv = env_creator_or_creators(*env_args, **env_kwargs) if not isinstance(vecenv, PufferEnv): - raise APIUsageError('Native vectorization requires a native PufferEnv. Use Serial or Multiprocessing instead.') + raise APIUsageError( + "Native vectorization requires a native PufferEnv. Use Serial or Multiprocessing instead." + ) if num_envs != 1: - raise APIUsageError('Native vectorization is for PufferEnvs that handle all per-process vectorization internally. If you want to run multiple separate Python instances on a single process, use Serial or Multiprocessing instead') + raise APIUsageError( + "Native vectorization is for PufferEnvs that handle all per-process vectorization internally. If you want to run multiple separate Python instances on a single process, use Serial or Multiprocessing instead" + ) return vecenv - if 'num_workers' in kwargs: - num_workers = kwargs['num_workers'] + if "num_workers" in kwargs: + num_workers = kwargs["num_workers"] # TODO: None? envs_per_worker = num_envs / num_workers if envs_per_worker != int(envs_per_worker): - raise APIUsageError('num_envs must be divisible by num_workers') + raise APIUsageError("num_envs must be divisible by num_workers") - if 'batch_size' in kwargs: - batch_size = kwargs['batch_size'] + if "batch_size" in kwargs: + batch_size = kwargs["batch_size"] if batch_size is None: batch_size = num_envs if batch_size % envs_per_worker != 0: - raise APIUsageError( - 'batch_size must be divisible by (num_envs / num_workers)') - - + raise APIUsageError("batch_size must be divisible by (num_envs / num_workers)") + if env_args is None: env_args = [] @@ -628,41 +677,42 @@ def make(env_creator_or_creators, env_args=None, env_kwargs=None, backend=Puffer env_creators = env_creator_or_creators if len(env_creators) != num_envs: - raise APIUsageError('env_creators must be a list of length num_envs') + raise APIUsageError("env_creators must be a list of length num_envs") if len(env_args) != num_envs: - raise APIUsageError('env_args must be a list of length num_envs') + raise APIUsageError("env_args must be a list of length num_envs") if len(env_kwargs) != num_envs: - raise APIUsageError('env_kwargs must be a list of length num_envs') + raise APIUsageError("env_kwargs must be a list of length num_envs") for i in range(num_envs): if not callable(env_creators[i]): - raise APIUsageError('env_creators must be a list of callables') + raise APIUsageError("env_creators must be a list of callables") if not isinstance(env_args[i], (list, tuple)): - raise APIUsageError('env_args must be a list of lists or tuples') + raise APIUsageError("env_args must be a list of lists or tuples") if not isinstance(env_kwargs[i], (dict, Namespace)): - raise APIUsageError('env_kwargs must be a list of dictionaries') + raise APIUsageError("env_kwargs must be a list of dictionaries") # Keeps batch size consistent when debugging with Serial backend - if backend is Serial and 'batch_size' in kwargs: - num_envs = kwargs['batch_size'] + if backend is Serial and "batch_size" in kwargs: + num_envs = kwargs["batch_size"] # TODO: Check num workers is not greater than num envs. 
This results in # different Serial vs Multiprocessing behavior # Sanity check args for k in kwargs: - if k not in ['num_workers', 'batch_size', 'zero_copy', 'overwork', 'backend']: - raise APIUsageError(f'Invalid argument: {k}') + if k not in ["num_workers", "batch_size", "zero_copy", "overwork", "backend"]: + raise APIUsageError(f"Invalid argument: {k}") # TODO: First step action space check - + return backend(env_creators, env_args, env_kwargs, num_envs, **kwargs) + def make_seeds(seed, num_envs): if isinstance(seed, int): return [seed + i for i in range(num_envs)] - err = f'seed {seed} must be an integer or a list of integers' + err = f"seed {seed} must be an integer or a list of integers" if isinstance(seed, (list, tuple)): if len(seed) != num_envs: raise APIUsageError(err) @@ -671,48 +721,53 @@ def make_seeds(seed, num_envs): raise APIUsageError(err) + def check_envs(envs, driver): valid = (PufferEnv, GymnasiumPufferEnv, PettingZooPufferEnv) if not isinstance(driver, valid): - raise APIUsageError(f'env_creator must be {valid}') + raise APIUsageError(f"env_creator must be {valid}") driver_obs = driver.single_observation_space driver_atn = driver.single_action_space for env in envs: if not isinstance(env, valid): - raise APIUsageError(f'env_creators must be {valid}') + raise APIUsageError(f"env_creators must be {valid}") obs_space = env.single_observation_space if obs_space != driver_obs: - raise APIUsageError(f'\n{obs_space}\n{driver_obs} obs space mismatch') + raise APIUsageError(f"\n{obs_space}\n{driver_obs} obs space mismatch") atn_space = env.single_action_space if atn_space != driver_atn: - raise APIUsageError(f'\n{atn_space}\n{driver_atn} atn space mismatch') - -def autotune(env_creator, batch_size, max_envs=194, model_forward_s=0.0, - max_env_ram_gb=32, max_batch_vram_gb=0.05, time_per_test=5): - '''Determine the optimal vectorization parameters for your system''' + raise APIUsageError(f"\n{atn_space}\n{driver_atn} atn space mismatch") + + +def autotune( + env_creator, + batch_size, + max_envs=194, + model_forward_s=0.0, + max_env_ram_gb=32, + max_batch_vram_gb=0.05, + time_per_test=5, +): + """Determine the optimal vectorization parameters for your system""" # TODO: fix multiagent if batch_size is None: - raise ValueError('batch_size must not be None') + raise ValueError("batch_size must not be None") if max_envs < batch_size: - raise ValueError('max_envs < min_batch_size') + raise ValueError("max_envs < min_batch_size") num_cores = psutil.cpu_count(logical=False) idle_ram = psutil.Process().memory_info().rss load_ram = idle_ram # Initial profile to estimate single-core performance - print('Profiling single-core performance for ~', time_per_test, 'seconds') + print("Profiling single-core performance for ~", time_per_test, "seconds") env = env_creator() env.reset() obs_space = env.single_observation_space - actions = [ - np.array([env.single_action_space.sample() - for _ in range(env.num_agents)]) - for _ in range(1000) - ] + actions = [np.array([env.single_action_space.sample() for _ in range(env.num_agents)]) for _ in range(1000)] num_agents = env.num_agents steps = 0 @@ -726,7 +781,7 @@ def autotune(env_creator, batch_size, max_envs=194, model_forward_s=0.0, env.reset() reset_times.append(time.time() - s) else: - env.step(actions[steps%1000]) + env.step(actions[steps % 1000]) step_times.append(time.time() - s) steps += 1 @@ -738,35 +793,30 @@ def autotune(env_creator, batch_size, max_envs=194, model_forward_s=0.0, reset_mean = np.mean(reset_times) ram_usage = max(1, (idle_ram 
- load_ram)) / 1e9 - obs_size_gb = ( - np.prod(obs_space.shape) - * np.dtype(obs_space.dtype).itemsize - * num_agents - / 1e9 - ) + obs_size_gb = np.prod(obs_space.shape) * np.dtype(obs_space.dtype).itemsize * num_agents / 1e9 # Max bandwidth bandwidth = obs_size_gb * sps throughput = bandwidth * num_cores - print('Profile complete') - print(f' SPS: {sps:.3f}') - print(f' STD: {step_variance:.3f}%') - print(f' Reset: {reset_percent:.3f}%') - print(f' RAM: {1000*ram_usage:.3f} MB/env') - print(f' Bandwidth: {bandwidth:.3f} GB/s') - print(f' Throughput: {throughput:.3f} GB/s ({num_cores} cores)') + print("Profile complete") + print(f" SPS: {sps:.3f}") + print(f" STD: {step_variance:.3f}%") + print(f" Reset: {reset_percent:.3f}%") + print(f" RAM: {1000 * ram_usage:.3f} MB/env") + print(f" Bandwidth: {bandwidth:.3f} GB/s") + print(f" Throughput: {throughput:.3f} GB/s ({num_cores} cores)") print() # Cap envs based on max allowed RAM max_allowed_by_ram = max_env_ram_gb // ram_usage if max_allowed_by_ram < max_envs: max_envs = int(max_allowed_by_ram) - print('Reducing max envs to', max_envs, 'based on RAM') + print("Reducing max envs to", max_envs, "based on RAM") # Cap envs based on estimated max speedup - #linear_speedup = (num_cores * steps / sum_time) // 500 - #if linear_speedup < max_envs and linear_speedup > num_cores: + # linear_speedup = (num_cores * steps / sum_time) // 500 + # if linear_speedup < max_envs and linear_speedup > num_cores: # max_envs = int(linear_speedup) # print('Reducing max envs to', max_envs, 'based on single-core speed') @@ -774,25 +824,27 @@ def autotune(env_creator, batch_size, max_envs=194, model_forward_s=0.0, hardware_envs = max_envs - (max_envs % num_cores) if hardware_envs > batch_size and hardware_envs != max_envs: max_envs = int(hardware_envs) - print('Reducing max envs to', max_envs, 'based on core division') + print("Reducing max envs to", max_envs, "based on core division") max_allowed_by_vram = max_batch_vram_gb // obs_size_gb if max_allowed_by_vram < batch_size: - raise ValueError('max_allowed_by_vram < batch_size') + raise ValueError("max_allowed_by_vram < batch_size") print() configs = [] # Strategy 1: one batch per core strategy_cores = min(num_cores, max_envs // batch_size) - configs.append(dict( - num_envs=batch_size*strategy_cores, - num_workers=strategy_cores, - batch_size=batch_size, - backend=Multiprocessing, - )) - - strategy_min_envs_per_worker = int(np.ceil((batch_size+1) / num_cores)) + configs.append( + dict( + num_envs=batch_size * strategy_cores, + num_workers=strategy_cores, + batch_size=batch_size, + backend=Multiprocessing, + ) + ) + + strategy_min_envs_per_worker = int(np.ceil((batch_size + 1) / num_cores)) strategy_num_envs = [] for envs_per_worker in range(strategy_min_envs_per_worker, batch_size): num_envs = envs_per_worker * num_cores @@ -802,44 +854,52 @@ def autotune(env_creator, batch_size, max_envs=194, model_forward_s=0.0, continue # Strategy 2: Full async. Only reasonable for low bandwidth - #if throughput < 1.5: - configs.append(dict( - num_envs=num_envs, - num_workers=num_cores, - batch_size=batch_size, - zero_copy=False, - backend=Multiprocessing, - )) + # if throughput < 1.5: + configs.append( + dict( + num_envs=num_envs, + num_workers=num_cores, + batch_size=batch_size, + zero_copy=False, + backend=Multiprocessing, + ) + ) # Strategy 3: Contiguous blocks. 
Only reasonable for high bandwidth num_batchs = num_envs / batch_size if num_batchs != int(num_batchs): continue if throughput > 0.5: - configs.append(dict( - num_envs=num_envs, - num_workers=num_cores, - batch_size=batch_size, - backend=Multiprocessing, - )) - + configs.append( + dict( + num_envs=num_envs, + num_workers=num_cores, + batch_size=batch_size, + backend=Multiprocessing, + ) + ) + # Strategy 4: Full sync - perhaps nichely useful for strategy_cores in range(num_cores, 1, -1): if batch_size % strategy_cores != 0: continue - configs.append(dict( - num_envs=batch_size, - num_workers=strategy_cores, - batch_size=batch_size, - backend=Multiprocessing, - )) + configs.append( + dict( + num_envs=batch_size, + num_workers=strategy_cores, + batch_size=batch_size, + backend=Multiprocessing, + ) + ) # Strategy 5: Serial - configs.append(dict( - num_envs=batch_size, - backend=Serial, - )) + configs.append( + dict( + num_envs=batch_size, + backend=Serial, + ) + ) for config in configs: with pufferlib.utils.Suppress(): @@ -851,7 +911,7 @@ def autotune(env_creator, batch_size, max_envs=194, model_forward_s=0.0, start = time.time() while time.time() - start < time_per_test: s = time.time() - envs.send(actions[steps%1000]) + envs.send(actions[steps % 1000]) step_time += time.time() - s if model_forward_s > 0: @@ -866,11 +926,11 @@ def autotune(env_creator, batch_size, max_envs=194, model_forward_s=0.0, end = time.time() envs.close() sps = steps * envs.agents_per_batch / step_time - print(f'SPS: {sps:.3f}') + print(f"SPS: {sps:.3f}") for k, v in config.items(): - if k == 'backend': + if k == "backend": v = v.__name__ - print(f' {k}: {v}') + print(f" {k}: {v}") print() diff --git a/pufferlib/version.py b/pufferlib/version.py index 13ce17d8e..ff6ef86de 100644 --- a/pufferlib/version.py +++ b/pufferlib/version.py @@ -1 +1 @@ -__version__ = '2.0.6' +__version__ = "2.0.6" diff --git a/pufferlib/wrappers.py b/pufferlib/wrappers.py index 3bda41971..c2e2ec0e3 100644 --- a/pufferlib/wrappers.py +++ b/pufferlib/wrappers.py @@ -1,4 +1,4 @@ -from pdb import set_trace as T + class GymToGymnasium: def __init__(self, env): @@ -22,6 +22,7 @@ def step(self, action): def close(self): self.env.close() + class PettingZooTruncatedWrapper: def __init__(self, env): self.env = env diff --git a/save_net_flat.py b/save_net_flat.py index 585eecae6..6d26bd546 100644 --- a/save_net_flat.py +++ b/save_net_flat.py @@ -1,74 +1,79 @@ +import numpy as np import torch from torch.nn import functional as F -import numpy as np + def save_model_weights(model, filename): weights = [] for name, param in model.named_parameters(): weights.append(param.data.cpu().numpy().flatten()) print(name, param.shape, param.data.cpu().numpy().ravel()[0]) - + weights = np.concatenate(weights) - print('Num weights:', len(weights)) + print("Num weights:", len(weights)) weights.tofile(filename) # Save the model architecture (you may want to adjust this based on your specific model) - #with open(filename + "_architecture.txt", "w") as f: + # with open(filename + "_architecture.txt", "w") as f: # for name, param in model.named_parameters(): # f.write(f"{name}: {param.shape}\n") + def test_model(model): model = model.cpu().policy batch_size = 16 obs_window = 11 obs_window_channels = 4 obs_flat = 26 - x = torch.arange( - 0, batch_size*(obs_window*obs_window*obs_window_channels + obs_flat) - ).reshape(batch_size, -1) % 16 + x = ( + torch.arange(0, batch_size * (obs_window * obs_window * obs_window_channels + obs_flat)).reshape(batch_size, -1) + % 16 + ) - 
cnn_features = x[:, :-obs_flat].view( - batch_size, obs_window, obs_window, obs_window_channels).long() + cnn_features = x[:, :-obs_flat].view(batch_size, obs_window, obs_window, obs_window_channels).long() map_features = F.one_hot(cnn_features[:, :, :, 0], 16).permute(0, 3, 1, 2).float() extra_map_features = (cnn_features[:, :, :, -3:].float() / 255.0).permute(0, 3, 1, 2) cnn_features = torch.cat([map_features, extra_map_features], dim=1) cnn = model.policy.cnn cnn_features = torch.from_numpy( - np.arange(batch_size*11*11*19).reshape( - batch_size, 19, obs_window, obs_window) + np.arange(batch_size * 11 * 11 * 19).reshape(batch_size, 19, obs_window, obs_window) ).float() conv1_out = cnn[0](cnn_features) - #(cnn[0].weight[0] * cnn_features[0, :, :5, :5]).sum() + cnn[0].bias[0] + # (cnn[0].weight[0] * cnn_features[0, :, :5, :5]).sum() + cnn[0].bias[0] breakpoint() hidden = model.encoder(x) output = model.decoder(hidden) atn = output.argmax(dim=1) - print('Encode weight sum:', model.encoder.weight.sum()) - print('encode decode weight and bias sum:', model.encoder.weight.sum() + model.encoder.bias.sum() + model.decoder.weight.sum() + model.decoder.bias.sum()) - print('X sum:', x.sum()) - print('Hidden sum:', hidden.sum()) - print('Hidden 1-10:', hidden[0, :10]) - print('Output sum:', output.sum()) - print('Atn sum:', atn.sum()) + print("Encode weight sum:", model.encoder.weight.sum()) + print( + "encode decode weight and bias sum:", + model.encoder.weight.sum() + model.encoder.bias.sum() + model.decoder.weight.sum() + model.decoder.bias.sum(), + ) + print("X sum:", x.sum()) + print("Hidden sum:", hidden.sum()) + print("Hidden 1-10:", hidden[0, :10]) + print("Output sum:", output.sum()) + print("Atn sum:", atn.sum()) breakpoint() exit(0) + def test_lstm(): batch_size = 16 input_size = 128 hidden_size = 128 - input = torch.arange(batch_size*input_size).reshape(1, batch_size, -1).float()/ 100000 + input = torch.arange(batch_size * input_size).reshape(1, batch_size, -1).float() / 100000 state = ( - torch.arange(batch_size*hidden_size).reshape(1, batch_size, -1).float()/ 100000, - torch.arange(batch_size*hidden_size).reshape(1, batch_size, -1).float() / 100000 + torch.arange(batch_size * hidden_size).reshape(1, batch_size, -1).float() / 100000, + torch.arange(batch_size * hidden_size).reshape(1, batch_size, -1).float() / 100000, ) - weights_input = torch.arange(4*hidden_size*input_size).reshape(4*hidden_size, -1).float()/ 100000 - weights_state = torch.arange(4*hidden_size*hidden_size).reshape(4*hidden_size, -1).float()/ 100000 - bias_input = torch.arange(4*hidden_size).reshape(4*hidden_size).float() / 100000 - bias_state = torch.arange(4*hidden_size).reshape(4*hidden_size).float() / 100000 + weights_input = torch.arange(4 * hidden_size * input_size).reshape(4 * hidden_size, -1).float() / 100000 + weights_state = torch.arange(4 * hidden_size * hidden_size).reshape(4 * hidden_size, -1).float() / 100000 + bias_input = torch.arange(4 * hidden_size).reshape(4 * hidden_size).float() / 100000 + bias_state = torch.arange(4 * hidden_size).reshape(4 * hidden_size).float() / 100000 lstm = torch.nn.LSTM(input_size=128, hidden_size=128, num_layers=1) lstm.weight_ih_l0.data = weights_input @@ -81,10 +86,7 @@ def test_lstm(): input = input.squeeze(0) h, c = state - buffer = ( - torch.matmul(input, weights_input.T) + bias_input - + torch.matmul(h, weights_state.T) + bias_state - )[0] + buffer = (torch.matmul(input, weights_input.T) + bias_input + torch.matmul(h, weights_state.T) + bias_state)[0] i, f, g, o = 
torch.split(buffer, hidden_size, dim=1) @@ -93,26 +95,27 @@ def test_lstm(): g = torch.tanh(g) o = torch.sigmoid(o) - c = f*c + i*g - h = o*torch.tanh(c) + c = f * c + i * g + h = o * torch.tanh(c) breakpoint() - print('Output:', output) + print("Output:", output) + def test_model_forward(model): - data = torch.arange(10*(11*11*4 + 26)) % 16 - data[(11*11*4 + 26):] = 0 + data = torch.arange(10 * (11 * 11 * 4 + 26)) % 16 + data[(11 * 11 * 4 + 26) :] = 0 data = data.reshape(10, -1).float() output = model(data) breakpoint() pass - -if __name__ == '__main__': - #test_lstm() - model = torch.load('snake.pt', map_location='cpu') - #test_model_forward(model) - #test_model(model) - save_model_weights(model, 'snake_weights.bin') - print('saved') +if __name__ == "__main__": + # test_lstm() + model = torch.load("snake.pt", map_location="cpu") + # test_model_forward(model) + # test_model(model) + + save_model_weights(model, "snake_weights.bin") + print("saved") diff --git a/sb3_demo.py b/sb3_demo.py index 5363ce1d0..3bd027f8f 100644 --- a/sb3_demo.py +++ b/sb3_demo.py @@ -2,13 +2,12 @@ import argparse +from pufferlib.environments import atari from stable_baselines3 import PPO -from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv from stable_baselines3.common.env_util import make_vec_env +from stable_baselines3.common.vec_env import DummyVecEnv -from pufferlib.environments import atari - -''' +""" elif args.backend == 'sb3': from stable_baselines3 import PPO from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv @@ -25,25 +24,24 @@ ) model.learn(total_timesteps=args.train.total_timesteps) -''' +""" parser = argparse.ArgumentParser() -parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4') +parser.add_argument("--env", type=str, default="BreakoutNoFrameskip-v4") args = parser.parse_args() env_creator = atari.env_creator(args.env) -envs = make_vec_env(lambda: env_creator(), - n_envs=4, seed=0, vec_env_cls=DummyVecEnv) +envs = make_vec_env(lambda: env_creator(), n_envs=4, seed=0, vec_env_cls=DummyVecEnv) model = PPO("CnnPolicy", envs, verbose=1) model.learn(total_timesteps=2000) # Demonstrate loading -model.save(f'ppo_{args.env}') -model = PPO.load(f'ppo_{args.env}') +model.save(f"ppo_{args.env}") +model = PPO.load(f"ppo_{args.env}") # Watch the agent play -env = atari.make_env(args.env, render_mode='human') +env = atari.make_env(args.env, render_mode="human") terminal = True for _ in range(1000): if terminal or truncated: @@ -53,4 +51,3 @@ action, _states = model.predict(ob) ob, reward, terminal, truncated, info = env.step(action[0]) env.render() - diff --git a/setup.py b/setup.py index 537a45c1c..9a969228c 100644 --- a/setup.py +++ b/setup.py @@ -1,46 +1,54 @@ -from setuptools import find_packages, find_namespace_packages, setup, Extension -from Cython.Build import cythonize -import numpy import os -import urllib.request -import zipfile -import tarfile import platform - +import tarfile +import urllib.request + +import numpy +from Cython.Build import cythonize +from setuptools import Extension, find_namespace_packages, find_packages, setup + # python3 setup.py built_ext --inplace -VERSION = '2.0.6' +VERSION = "2.0.6" -RAYLIB_BASE = 'https://github.com/raysan5/raylib/releases/download/5.0/' +RAYLIB_BASE = "https://github.com/raysan5/raylib/releases/download/5.0/" -RAYLIB_NAME = 'raylib-5.0_macos' if platform.system() == "Darwin" else 'raylib-5.0_linux_amd64' +RAYLIB_NAME = "raylib-5.0_macos" if platform.system() == "Darwin" else "raylib-5.0_linux_amd64" 
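# A condensed sketch of the download-and-extract pattern that the hunks below apply to
# both raylib archives, shown here only for the current platform's RAYLIB_NAME.
# Illustrative only, not part of this diff: the helper name `fetch_raylib` is assumed,
# and RAYLIB_BASE / RAYLIB_NAME are the module-level constants defined just above.
import os
import tarfile
import urllib.request

def fetch_raylib(name, base=RAYLIB_BASE):
    archive = name + ".tar.gz"
    if os.path.exists(name):
        return  # release already extracted next to setup.py
    urllib.request.urlretrieve(base + archive, archive)
    with tarfile.open(archive, "r") as tar_ref:
        tar_ref.extractall()
    os.remove(archive)

# e.g. fetch_raylib(RAYLIB_NAME) fetches raylib-5.0 for the detected platform and unpacks it.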
-RAYLIB_LINUX = 'raylib-5.0_linux_amd64' -RAYLIB_LINUX_URL = RAYLIB_BASE + RAYLIB_LINUX + '.tar.gz' +RAYLIB_LINUX = "raylib-5.0_linux_amd64" +RAYLIB_LINUX_URL = RAYLIB_BASE + RAYLIB_LINUX + ".tar.gz" if not os.path.exists(RAYLIB_LINUX): - urllib.request.urlretrieve(RAYLIB_LINUX_URL, RAYLIB_LINUX + '.tar.gz') - with tarfile.open(RAYLIB_LINUX + '.tar.gz', 'r') as tar_ref: + urllib.request.urlretrieve(RAYLIB_LINUX_URL, RAYLIB_LINUX + ".tar.gz") + with tarfile.open(RAYLIB_LINUX + ".tar.gz", "r") as tar_ref: tar_ref.extractall() - os.remove(RAYLIB_LINUX + '.tar.gz') + os.remove(RAYLIB_LINUX + ".tar.gz") -RAYLIB_MACOS = 'raylib-5.0_macos' -RAYLIB_MACOS_URL = RAYLIB_BASE + RAYLIB_MACOS + '.tar.gz' +RAYLIB_MACOS = "raylib-5.0_macos" +RAYLIB_MACOS_URL = RAYLIB_BASE + RAYLIB_MACOS + ".tar.gz" if not os.path.exists(RAYLIB_MACOS): - urllib.request.urlretrieve(RAYLIB_MACOS_URL, RAYLIB_MACOS + '.tar.gz') - with tarfile.open(RAYLIB_MACOS + '.tar.gz', 'r') as tar_ref: + urllib.request.urlretrieve(RAYLIB_MACOS_URL, RAYLIB_MACOS + ".tar.gz") + with tarfile.open(RAYLIB_MACOS + ".tar.gz", "r") as tar_ref: tar_ref.extractall() - os.remove(RAYLIB_MACOS + '.tar.gz') + os.remove(RAYLIB_MACOS + ".tar.gz") + +# RAYLIB_WASM = 'raylib-5.0_webassembly' +# RAYLIB_WASM_URL = RAYLIB_BASE + RAYLIB_WASM + '.zip' +# if not os.path.exists(RAYLIB_WASM): +# urllib.request.urlretrieve(RAYLIB_WASM_URL, RAYLIB_WASM + '.zip') +# with zipfile.ZipFile(RAYLIB_WASM + '.zip', 'r') as zip_ref: +# zip_ref.extractall() +# if os.path.exists('raylib_wasm'): +# # move old directory to new name +# os.rename('raylib_wasm', 'raylib_wasm_old') +# os.rename('raylib-5.0_webassembly', 'raylib_wasm') + +# import os +# os.environ['CFLAGS'] = '-O3 -march=native -Wall' -RAYLIB_WASM = 'raylib-5.0_webassembly' -RAYLIB_WASM_URL = RAYLIB_BASE + RAYLIB_WASM + '.zip' -if not os.path.exists(RAYLIB_WASM): - urllib.request.urlretrieve(RAYLIB_WASM_URL, RAYLIB_WASM + '.zip') - with zipfile.ZipFile(RAYLIB_WASM + '.zip', 'r') as zip_ref: - zip_ref.extractall() - os.remove(RAYLIB_WASM + '.zip') +# os.remove(RAYLIB_WASM + '.zip') # Default Gym/Gymnasium/PettingZoo versions # Gym: @@ -52,169 +60,169 @@ # - <= 0.20 is missing dict methods for gym.spaces.Dict # - 0.18-0.21 require setuptools<=65.5.0 -GYMNASIUM_VERSION = '0.29.1' -GYM_VERSION = '0.23' -PETTINGZOO_VERSION = '1.24.1' +GYMNASIUM_VERSION = "0.29.1" +GYM_VERSION = "0.23" +PETTINGZOO_VERSION = "1.24.1" docs = [ - 'sphinx==5.0.0', - 'sphinx-rtd-theme==0.5.1', - 'sphinxcontrib-youtube==1.0.1', - 'sphinx-rtd-theme==0.5.1', - 'sphinx-design==0.4.1', - 'furo==2023.3.27', + "sphinx==5.0.0", + "sphinx-rtd-theme==0.5.1", + "sphinxcontrib-youtube==1.0.1", + "sphinx-rtd-theme==0.5.1", + "sphinx-design==0.4.1", + "furo==2023.3.27", ] cleanrl = [ - 'stable_baselines3==2.1.0', - 'tensorboard==2.11.2', - 'torch', - 'tyro==0.8.6', - 'wandb==0.19.1', + "stable_baselines3==2.1.0", + "tensorboard==2.11.2", + "torch", + "tyro==0.8.6", + "wandb==0.19.1", ] ray = [ - 'ray==2.23.0', + "ray==2.23.0", ] environments = { - 'avalon': [ - f'gym=={GYM_VERSION}', - f'gymnasium=={GYMNASIUM_VERSION}', - 'avalon-rl==1.0.0', + "avalon": [ + f"gym=={GYM_VERSION}", + f"gymnasium=={GYMNASIUM_VERSION}", + "avalon-rl==1.0.0", ], - 'atari': [ - f'gym=={GYM_VERSION}', - f'gymnasium[accept-rom-license]=={GYMNASIUM_VERSION}', - 'ale_py==0.9.0', + "atari": [ + f"gym=={GYM_VERSION}", + f"gymnasium[accept-rom-license]=={GYMNASIUM_VERSION}", + "ale_py==0.9.0", ], - 'box2d': [ - f'gym=={GYM_VERSION}', - f'gymnasium[box2d]=={GYMNASIUM_VERSION}', - 
'swig==4.1.1', + "box2d": [ + f"gym=={GYM_VERSION}", + f"gymnasium[box2d]=={GYMNASIUM_VERSION}", + "swig==4.1.1", ], - 'bsuite': [ - f'gym=={GYM_VERSION}', - f'gymnasium=={GYMNASIUM_VERSION}', - 'bsuite==0.3.5', + "bsuite": [ + f"gym=={GYM_VERSION}", + f"gymnasium=={GYMNASIUM_VERSION}", + "bsuite==0.3.5", ], - 'butterfly': [ - f'gym=={GYM_VERSION}', - f'gymnasium=={GYMNASIUM_VERSION}', - f'pettingzoo[butterfly]=={PETTINGZOO_VERSION}', + "butterfly": [ + f"gym=={GYM_VERSION}", + f"gymnasium=={GYMNASIUM_VERSION}", + f"pettingzoo[butterfly]=={PETTINGZOO_VERSION}", ], - 'classic_control': [ - f'gym=={GYM_VERSION}', - f'gymnasium=={GYMNASIUM_VERSION}', + "classic_control": [ + f"gym=={GYM_VERSION}", + f"gymnasium=={GYMNASIUM_VERSION}", ], - 'crafter': [ - f'gym=={GYM_VERSION}', - f'gymnasium=={GYMNASIUM_VERSION}', - 'crafter==1.8.3', + "crafter": [ + f"gym=={GYM_VERSION}", + f"gymnasium=={GYMNASIUM_VERSION}", + "crafter==1.8.3", ], - 'dm_control': [ - f'gym=={GYM_VERSION}', - f'gymnasium=={GYMNASIUM_VERSION}', - 'dm_control==1.0.11', + "dm_control": [ + f"gym=={GYM_VERSION}", + f"gymnasium=={GYMNASIUM_VERSION}", + "dm_control==1.0.11", ], - 'dm_lab': [ - f'gym=={GYM_VERSION}', - f'gymnasium=={GYMNASIUM_VERSION}', - 'gym_deepmindlab==0.1.2', - 'dm_env==1.6', + "dm_lab": [ + f"gym=={GYM_VERSION}", + f"gymnasium=={GYMNASIUM_VERSION}", + "gym_deepmindlab==0.1.2", + "dm_env==1.6", ], - 'griddly': [ - f'gym=={GYM_VERSION}', - f'gymnasium=={GYMNASIUM_VERSION}', - 'griddly==1.6.7', - 'imageio', + "griddly": [ + f"gym=={GYM_VERSION}", + f"gymnasium=={GYMNASIUM_VERSION}", + "griddly==1.6.7", + "imageio", ], - 'magent': [ - f'gym=={GYM_VERSION}', - f'gymnasium=={GYMNASIUM_VERSION}', - 'pettingzoo==1.19.0', - 'magent==0.2.4', + "magent": [ + f"gym=={GYM_VERSION}", + f"gymnasium=={GYMNASIUM_VERSION}", + "pettingzoo==1.19.0", + "magent==0.2.4", # The Magent2 package is broken for now #'magent2==0.3.2', ], - 'microrts': [ - f'gym=={GYM_VERSION}', - f'gymnasium=={GYMNASIUM_VERSION}', - 'ffmpeg==1.4', - 'gym_microrts==0.3.2', + "microrts": [ + f"gym=={GYM_VERSION}", + f"gymnasium=={GYMNASIUM_VERSION}", + "ffmpeg==1.4", + "gym_microrts==0.3.2", ], - 'minerl': [ - 'gym==0.17.0', - f'gymnasium=={GYMNASIUM_VERSION}', + "minerl": [ + "gym==0.17.0", + f"gymnasium=={GYMNASIUM_VERSION}", #'git+https://github.com/minerllabs/minerl' # Compatiblity warning with urllib3 and chardet #'requests==2.31.0', ], - 'minigrid': [ - f'gym=={GYM_VERSION}', - f'gymnasium=={GYMNASIUM_VERSION}', - 'minigrid==2.3.1', + "minigrid": [ + f"gym=={GYM_VERSION}", + f"gymnasium=={GYMNASIUM_VERSION}", + "minigrid==2.3.1", ], - 'minihack': [ - f'gym=={GYM_VERSION}', - f'gymnasium=={GYMNASIUM_VERSION}', - 'minihack==0.1.5', + "minihack": [ + f"gym=={GYM_VERSION}", + f"gymnasium=={GYMNASIUM_VERSION}", + "minihack==0.1.5", ], - 'mujoco': [ - f'gymnasium[mujoco]=={GYMNASIUM_VERSION}', - 'mujoco==2.3.7', # mujuco > 3 is supported by gymnasium > 1.0 - 'moviepy', + "mujoco": [ + f"gymnasium[mujoco]=={GYMNASIUM_VERSION}", + "mujoco==2.3.7", # mujuco > 3 is supported by gymnasium > 1.0 + "moviepy", ], - 'nethack': [ - f'gym=={GYM_VERSION}', - f'gymnasium=={GYMNASIUM_VERSION}', - 'nle==0.9.1', + "nethack": [ + f"gym=={GYM_VERSION}", + f"gymnasium=={GYMNASIUM_VERSION}", + "nle==0.9.1", ], - 'nmmo': [ - f'gym=={GYM_VERSION}', - f'gymnasium=={GYMNASIUM_VERSION}', - f'pettingzoo=={PETTINGZOO_VERSION}', - 'nmmo>=2.1', + "nmmo": [ + f"gym=={GYM_VERSION}", + f"gymnasium=={GYMNASIUM_VERSION}", + f"pettingzoo=={PETTINGZOO_VERSION}", + "nmmo>=2.1", ], - 
'open_spiel': [ - f'gym=={GYM_VERSION}', - f'gymnasium=={GYMNASIUM_VERSION}', - 'open_spiel==1.3', - 'pettingzoo==1.19.0', + "open_spiel": [ + f"gym=={GYM_VERSION}", + f"gymnasium=={GYMNASIUM_VERSION}", + "open_spiel==1.3", + "pettingzoo==1.19.0", ], - 'pokemon_red': [ - f'gym=={GYM_VERSION}', - f'gymnasium=={GYMNASIUM_VERSION}', - 'pokegym>=0.2.0', - 'einops==0.6.1', - 'matplotlib', - 'scikit-image', - 'pyboy<2.0.0', - 'hnswlib==0.7.0', - 'mediapy', - 'pandas==2.0.2', - 'pettingzoo', - 'websockets', + "pokemon_red": [ + f"gym=={GYM_VERSION}", + f"gymnasium=={GYMNASIUM_VERSION}", + "pokegym>=0.2.0", + "einops==0.6.1", + "matplotlib", + "scikit-image", + "pyboy<2.0.0", + "hnswlib==0.7.0", + "mediapy", + "pandas==2.0.2", + "pettingzoo", + "websockets", ], - 'procgen': [ - f'gym=={GYM_VERSION}', - f'gymnasium=={GYMNASIUM_VERSION}', - 'procgen-mirror==0.10.7', # Procgen mirror for 3.11 and 3.12 support + "procgen": [ + f"gym=={GYM_VERSION}", + f"gymnasium=={GYMNASIUM_VERSION}", + "procgen-mirror==0.10.7", # Procgen mirror for 3.11 and 3.12 support # Note: You need glfw==2.7 after installing for some torch versions ], #'smac': [ # 'git+https://github.com/oxwhirl/smac.git', - #], + # ], #'stable-retro': [ # 'git+https://github.com/Farama-Foundation/stable-retro.git', - #] - 'slimevolley': [ - f'gym=={GYM_VERSION}', - f'gymnasium=={GYMNASIUM_VERSION}', - 'slimevolley==0.1.0', + # ] + "slimevolley": [ + f"gym=={GYM_VERSION}", + f"gymnasium=={GYMNASIUM_VERSION}", + "slimevolley==0.1.0", ], - 'vizdoom': [ - 'vizdoom==1.2.3', + "vizdoom": [ + "vizdoom==1.2.3", ], } @@ -225,134 +233,134 @@ # We force updated versions of Gym/Gymnasium/PettingZoo here to # ensure that users do not have issues with conflicting versions # when switching to incompatible environments -common = cleanrl + [environments[env] for env in [ - 'atari', - #'box2d', - 'bsuite', - #'butterfly', - 'classic_control', - 'crafter', - 'dm_control', - 'dm_lab', - 'griddly', - 'microrts', - 'minigrid', - 'minihack', - 'nethack', - 'nmmo', - 'pokemon_red', - 'procgen', - 'vizdoom', -]] +common = cleanrl + [ + environments[env] + for env in [ + "atari", + #'box2d', + "bsuite", + #'butterfly', + "classic_control", + "crafter", + "dm_control", + "dm_lab", + "griddly", + "microrts", + "minigrid", + "minihack", + "nethack", + "nmmo", + "pokemon_red", + "procgen", + "vizdoom", + ] +] extension_paths = [ - 'pufferlib/ocean/nmmo3/cy_nmmo3', - 'pufferlib/ocean/moba/cy_moba', - 'pufferlib/ocean/tactical/c_tactical', - 'pufferlib/ocean/squared/cy_squared', - 'pufferlib/ocean/snake/cy_snake', - 'pufferlib/ocean/pong/cy_pong', - 'pufferlib/ocean/breakout/cy_breakout', - 'pufferlib/ocean/enduro/cy_enduro', - 'pufferlib/ocean/connect4/cy_connect4', - 'pufferlib/ocean/grid/cy_grid', - 'pufferlib/ocean/tripletriad/cy_tripletriad', - 'pufferlib/ocean/go/cy_go', - 'pufferlib/ocean/rware/cy_rware', - 'pufferlib/ocean/trash_pickup/cy_trash_pickup' + # Don't build the ocean. 
+ # 'pufferlib/ocean/nmmo3/cy_nmmo3', + # 'pufferlib/ocean/moba/cy_moba', + # 'pufferlib/ocean/tactical/c_tactical', + # 'pufferlib/ocean/squared/cy_squared', + # 'pufferlib/ocean/snake/cy_snake', + # 'pufferlib/ocean/pong/cy_pong', + # 'pufferlib/ocean/breakout/cy_breakout', + # 'pufferlib/ocean/enduro/cy_enduro', + # 'pufferlib/ocean/connect4/cy_connect4', + # 'pufferlib/ocean/grid/cy_grid', + # 'pufferlib/ocean/tripletriad/cy_tripletriad', + # 'pufferlib/ocean/go/cy_go', + # 'pufferlib/ocean/rware/cy_rware', + # 'pufferlib/ocean/trash_pickup/cy_trash_pickup' ] system = platform.system() -if system == 'Darwin': +if system == "Darwin": # On macOS, use @loader_path. # The extension “.so” is typically in pufferlib/ocean/..., # and “raylib/lib” is (maybe) two directories up from ocean/. # So @loader_path/../../raylib/lib is common. - RAYLIB_INCLUDE = f'{RAYLIB_MACOS}/include' - RAYLIB_LIB = f'{RAYLIB_MACOS}/lib' -elif system == 'Linux': + RAYLIB_INCLUDE = f"{RAYLIB_MACOS}/include" + RAYLIB_LIB = f"{RAYLIB_MACOS}/lib" +elif system == "Linux": # TODO: Check if anything moves packages around after they are installed. # That would break this linking. Rel path doesn't work outside the pufferlib dir - RAYLIB_INCLUDE = f'{RAYLIB_LINUX}/include' - RAYLIB_LIB = f'{RAYLIB_LINUX}/lib' + RAYLIB_INCLUDE = f"{RAYLIB_LINUX}/include" + RAYLIB_LIB = f"{RAYLIB_LINUX}/lib" else: - raise ValueError(f'Unsupported system: {system}') + raise ValueError(f"Unsupported system: {system}") -extensions = [Extension( - path.replace('/', '.'), - [path + '.pyx'], - include_dirs=[numpy.get_include(), RAYLIB_INCLUDE], - extra_compile_args=['-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION', '-DPLATFORM_DESKTOP', '-O2', '-Wno-alloc-size-larger-than', '-fwrapv'],#, '-g'], - extra_link_args=['-Bsymbolic-functions', '-O2', '-fwrapv'], - extra_objects=[f'{RAYLIB_LIB}/libraylib.a'] -) for path in extension_paths] - -# Prevent Conda from injecting garbage compile flags -from distutils.sysconfig import get_config_vars -cfg_vars = get_config_vars() -for key in ('CC', 'CXX', 'LDSHARED'): - if cfg_vars[key]: - cfg_vars[key] = cfg_vars[key].replace('-B /root/anaconda3/compiler_compat', '') - cfg_vars[key] = cfg_vars[key].replace('-pthread', '') - cfg_vars[key] = cfg_vars[key].replace('-fno-strict-overflow', '') - -for key, value in cfg_vars.items(): - if value and '-fno-strict-overflow' in str(value): - cfg_vars[key] = value.replace('-fno-strict-overflow', '') +extensions = [ + Extension( + path.replace("/", "."), + [path + ".pyx"], + include_dirs=[numpy.get_include(), RAYLIB_INCLUDE], + extra_compile_args=[ + "-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION", + "-DPLATFORM_DESKTOP", + "-O2", + "-Wno-alloc-size-larger-than", + "-fwrapv", + ], # , '-g'], + extra_link_args=["-Bsymbolic-functions", "-O2", "-fwrapv"], + extra_objects=[f"{RAYLIB_LIB}/libraylib.a"], + ) + for path in extension_paths +] setup( name="pufferlib", - description="PufferAI Library" - "PufferAI's library of RL tools and utilities", + description="PufferAI LibraryPufferAI's library of RL tools and utilities", long_description_content_type="text/markdown", version=VERSION, packages=find_namespace_packages() + find_packages(), package_data={ "pufferlib": [ - f'{RAYLIB_LIB}/libraylib.a', + f"{RAYLIB_LIB}/libraylib.a", ] }, include_package_data=True, install_requires=[ - 'numpy>=1.23.3', - 'opencv-python==3.4.17.63', - 'cython>=3.0.0', - 'rich', - 'rich_argparse', - f'gym<={GYM_VERSION}', - f'gymnasium<={GYMNASIUM_VERSION}', - f'pettingzoo<={PETTINGZOO_VERSION}', - 
'shimmy[gym-v21]', - 'psutil==5.9.5', - 'pynvml', - 'imageio', + "numpy>=1.23.3", + "opencv-python==3.4.17.63", + "cython>=3.0.0", + "rich", + "rich_argparse", + f"gym<={GYM_VERSION}", + f"gymnasium<={GYMNASIUM_VERSION}", + f"pettingzoo<={PETTINGZOO_VERSION}", + "shimmy[gym-v21]", + "psutil==5.9.5", + "pynvml", + "imageio", ], extras_require={ - 'docs': docs, - 'ray': ray, - 'cleanrl': cleanrl, - 'common': common, + "docs": docs, + "ray": ray, + "cleanrl": cleanrl, + "common": common, **environments, }, - ext_modules = cythonize([ - "pufferlib/extensions.pyx", - "c_gae.pyx", - "pufferlib/puffernet.pyx", - "pufferlib/ocean/grid/c_grid.pyx", - *extensions, - ], - compiler_directives={ - 'language_level': 3, - 'boundscheck': False, - 'initializedcheck': False, - 'wraparound': False, - 'cdivision': True, - 'nonecheck': False, - 'profile': False, - }, - #nthreads=6, - #annotate=True, - #compiler_directives={'profile': True},# annotate=True + ext_modules=cythonize( + [ + "pufferlib/extensions.pyx", + "c_gae.pyx", + "pufferlib/puffernet.pyx", + "pufferlib/ocean/grid/c_grid.pyx", + *extensions, + ], + compiler_directives={ + "language_level": 3, + "boundscheck": False, + "initializedcheck": False, + "wraparound": False, + "cdivision": True, + "nonecheck": False, + "profile": False, + }, + # nthreads=6, + # annotate=True, + # compiler_directives={'profile': True},# annotate=True ), include_dirs=[numpy.get_include(), RAYLIB_INCLUDE], python_requires=">=3.9", @@ -372,11 +380,11 @@ ], ) -#stable_baselines3 -#supersuit==3.3.5 +# stable_baselines3 +# supersuit==3.3.5 #'git+https://github.com/oxwhirl/smac.git', -#curl -L -o smac.zip https://blzdistsc2-a.akamaihd.net/Linux/SC2.4.10.zip -#unzip -P iagreetotheeula smac.zip -#curl -L -o maps.zip https://github.com/oxwhirl/smac/releases/download/v0.1-beta1/SMAC_Maps.zip -#unzip maps.zip && mv SMAC_Maps/ StarCraftII/Maps/ +# curl -L -o smac.zip https://blzdistsc2-a.akamaihd.net/Linux/SC2.4.10.zip +# unzip -P iagreetotheeula smac.zip +# curl -L -o maps.zip https://github.com/oxwhirl/smac/releases/download/v0.1-beta1/SMAC_Maps.zip +# unzip maps.zip && mv SMAC_Maps/ StarCraftII/Maps/ diff --git a/tests/mem_test.py b/tests/mem_test.py index d0ca0c4d4..0b220ce09 100644 --- a/tests/mem_test.py +++ b/tests/mem_test.py @@ -1,42 +1,51 @@ -from pdb import set_trace as T -import numpy as np +import selectors import time +from multiprocessing import Array, Pipe, Process + +import numpy as np -import selectors -from multiprocessing import Process, Pipe, Array -def worker_process(envs_per_worker, shared_mem, bandwidth, - delay_mean, delay_std, send_pipe, recv_pipe): +def worker_process(envs_per_worker, shared_mem, bandwidth, delay_mean, delay_std, send_pipe, recv_pipe): data = np.random.randn(bandwidth) while True: request = recv_pipe.recv() for _ in range(envs_per_worker): start = time.process_time() idx = 0 - target_time = delay_mean + delay_std*np.random.randn() + target_time = delay_mean + delay_std * np.random.randn() while time.process_time() - start < target_time: idx += 1 shared_mem[:bandwidth] = data - send_pipe.send('end') - - -def test_speed(envs_per_worker=1, bandwidth=1, delay_mean=0.01, delay_std=0.001, - num_workers=4, batch_size=4, timeout=10): - - main_send_pipes, work_recv_pipes = zip(*[Pipe() for _ in range(num_workers)]) - work_send_pipes, main_recv_pipes = zip(*[Pipe() for _ in range(num_workers)]) - - shared_mem = [Array('d', bandwidth) for _ in range(num_workers)] - processes = [Process( - target=worker_process, - args=(envs_per_worker, shared_mem, 
bandwidth, - delay_mean, delay_std, work_send_pipes[i], work_recv_pipes[i])) - for i in range(num_workers)] + send_pipe.send("end") + + +def test_speed( + envs_per_worker=1, bandwidth=1, delay_mean=0.01, delay_std=0.001, num_workers=4, batch_size=4, timeout=10 +): + main_send_pipes, work_recv_pipes = zip(*[Pipe() for _ in range(num_workers)], strict=False) + work_send_pipes, main_recv_pipes = zip(*[Pipe() for _ in range(num_workers)], strict=False) + + shared_mem = [Array("d", bandwidth) for _ in range(num_workers)] + processes = [ + Process( + target=worker_process, + args=( + envs_per_worker, + shared_mem, + bandwidth, + delay_mean, + delay_std, + work_send_pipes[i], + work_recv_pipes[i], + ), + ) + for i in range(num_workers) + ] for p in processes: p.start() - + send_idxs = {i for i in range(num_workers)} # Register all receive pipes with the selector @@ -48,7 +57,7 @@ def test_speed(envs_per_worker=1, bandwidth=1, delay_mean=0.01, delay_std=0.001, start = time.time() while time.time() - start < timeout: for idx in send_idxs: - main_send_pipes[idx].send('start') + main_send_pipes[idx].send("start") send_idxs = set() @@ -57,7 +66,7 @@ def test_speed(envs_per_worker=1, bandwidth=1, delay_mean=0.01, delay_std=0.001, idx = main_recv_pipes.index(pipe) if pipe.poll(): - assert pipe.recv() == 'end' + assert pipe.recv() == "end" send_idxs.add(idx) if len(send_idxs) == batch_size: @@ -70,19 +79,19 @@ def test_speed(envs_per_worker=1, bandwidth=1, delay_mean=0.01, delay_std=0.001, sps = steps_collected / (end - start) print( - f'SPS: {sps:.2f}', - f'envs_per_worker: {envs_per_worker}', - f'delay_mean: {delay_mean}', - f'delay_std: {delay_std}', - f'num_workers: {num_workers}', - f'batch_size: {batch_size}', - f'sync: {sync}', + f"SPS: {sps:.2f}", + f"envs_per_worker: {envs_per_worker}", + f"delay_mean: {delay_mean}", + f"delay_std: {delay_std}", + f"num_workers: {num_workers}", + f"batch_size: {batch_size}", + f"sync: {sync}", ) -if __name__ == '__main__': - #timeout = 1 - #test_speed(timeout=1) +if __name__ == "__main__": + # timeout = 1 + # test_speed(timeout=1) test_speed(delay_mean=0, delay_std=0, num_workers=1, batch_size=1) test_speed(delay_mean=0, delay_std=0, num_workers=1, batch_size=1) test_speed(delay_mean=0, delay_std=0, num_workers=6, batch_size=6) @@ -90,4 +99,3 @@ def test_speed(envs_per_worker=1, bandwidth=1, delay_mean=0.01, delay_std=0.001, test_speed(delay_mean=0, delay_std=0, num_workers=24, batch_size=6) test_speed(delay_mean=0, delay_std=0, num_workers=24, batch_size=24) test_speed(delay_mean=0, delay_std=0, num_workers=24, batch_size=6) - diff --git a/tests/pool/plot_packing.py b/tests/pool/plot_packing.py index b7c1ca546..123b45eed 100644 --- a/tests/pool/plot_packing.py +++ b/tests/pool/plot_packing.py @@ -1,39 +1,43 @@ -import plotly.graph_objects as go import numpy as np +import plotly.graph_objects as go # Parameters n_bars = 24 mu = 0.002 std = 0.002 -background = '#061a1a' -forground = '#f1f1f1' +background = "#061a1a" +forground = "#f1f1f1" # Sampling from the normal distribution bar_heights = mu + np.clip(np.random.normal(mu, std, n_bars), 0, np.inf) # Creating the bar chart -fig = go.Figure(go.Bar( - x=[i for i in range(n_bars)], - y=bar_heights, - marker_line_width=0, - marker_color=forground, -)) +fig = go.Figure( + go.Bar( + x=[i for i in range(n_bars)], + y=bar_heights, + marker_line_width=0, + marker_color=forground, + ) +) # Updating the layout -fig.update_layout({ - 'plot_bgcolor': background, - 'paper_bgcolor': background, - 'showlegend': False, - 
'xaxis': {'visible': False}, - 'yaxis': {'visible': False, 'range': [0, max(bar_heights)]}, - 'margin': {'l': 0, 'r': 0, 't': 0, 'b': 0}, - 'height': 400, - 'width': 800, - 'bargap': 0.0, - 'bargroupgap': 0.0, -}) +fig.update_layout( + { + "plot_bgcolor": background, + "paper_bgcolor": background, + "showlegend": False, + "xaxis": {"visible": False}, + "yaxis": {"visible": False, "range": [0, max(bar_heights)]}, + "margin": {"l": 0, "r": 0, "t": 0, "b": 0}, + "height": 400, + "width": 800, + "bargap": 0.0, + "bargroupgap": 0.0, + } +) fig.show() -fig.write_image('../docker/env_variance.png', scale=3) +fig.write_image("../docker/env_variance.png", scale=3) diff --git a/tests/pool/test_basic_multprocessing.py b/tests/pool/test_basic_multprocessing.py index 3d6e95747..9a91fa234 100644 --- a/tests/pool/test_basic_multprocessing.py +++ b/tests/pool/test_basic_multprocessing.py @@ -1,9 +1,9 @@ -from pdb import set_trace as T -import numpy as np +import selectors import time +from multiprocessing import Pipe, Process + +import numpy as np -import selectors -from multiprocessing import Process, Pipe def worker_process(envs_per_worker, delay_mean, delay_std, send_pipe, recv_pipe): while True: @@ -11,24 +11,27 @@ def worker_process(envs_per_worker, delay_mean, delay_std, send_pipe, recv_pipe) for _ in range(envs_per_worker): start = time.process_time() idx = 0 - target_time = delay_mean + delay_std*np.random.randn() + target_time = delay_mean + delay_std * np.random.randn() while time.process_time() - start < target_time: idx += 1 - send_pipe.send('end') + send_pipe.send("end") + def test_speed(envs_per_worker=1, delay_mean=0.01, delay_std=0.001, num_workers=4, batch_size=4, sync=True, timeout=10): - main_send_pipes, work_recv_pipes = zip(*[Pipe() for _ in range(num_workers)]) - work_send_pipes, main_recv_pipes = zip(*[Pipe() for _ in range(num_workers)]) + main_send_pipes, work_recv_pipes = zip(*[Pipe() for _ in range(num_workers)], strict=False) + work_send_pipes, main_recv_pipes = zip(*[Pipe() for _ in range(num_workers)], strict=False) - processes = [Process( - target=worker_process, - args=(envs_per_worker, delay_mean, delay_std, work_send_pipes[i], work_recv_pipes[i])) - for i in range(num_workers)] + processes = [ + Process( + target=worker_process, args=(envs_per_worker, delay_mean, delay_std, work_send_pipes[i], work_recv_pipes[i]) + ) + for i in range(num_workers) + ] for p in processes: p.start() - + send_idxs = {i for i in range(num_workers)} # Register all receive pipes with the selector @@ -40,29 +43,29 @@ def test_speed(envs_per_worker=1, delay_mean=0.01, delay_std=0.001, num_workers= start = time.time() while time.time() - start < timeout: for idx in send_idxs: - main_send_pipes[idx].send('start') + main_send_pipes[idx].send("start") send_idxs = set() if sync: for idx, pipe in enumerate(main_recv_pipes): - assert pipe.recv() == 'end' + assert pipe.recv() == "end" send_idxs.add(idx) - steps_collected += num_workers*envs_per_worker + steps_collected += num_workers * envs_per_worker else: for key, _ in sel.select(timeout=None): pipe = key.fileobj idx = main_recv_pipes.index(pipe) if pipe.poll(): - assert pipe.recv() == 'end' + assert pipe.recv() == "end" send_idxs.add(idx) if len(send_idxs) == batch_size: break - steps_collected += batch_size*envs_per_worker + steps_collected += batch_size * envs_per_worker end = time.time() @@ -71,19 +74,19 @@ def test_speed(envs_per_worker=1, delay_mean=0.01, delay_std=0.001, num_workers= sps = steps_collected / (end - start) print( - f'SPS: 
{sps:.2f}', - f'envs_per_worker: {envs_per_worker}', - f'delay_mean: {delay_mean}', - f'delay_std: {delay_std}', - f'num_workers: {num_workers}', - f'batch_size: {batch_size}', - f'sync: {sync}', + f"SPS: {sps:.2f}", + f"envs_per_worker: {envs_per_worker}", + f"delay_mean: {delay_mean}", + f"delay_std: {delay_std}", + f"num_workers: {num_workers}", + f"batch_size: {batch_size}", + f"sync: {sync}", ) -if __name__ == '__main__': - #timeout = 1 - #test_speed(timeout=1) +if __name__ == "__main__": + # timeout = 1 + # test_speed(timeout=1) test_speed(delay_mean=0, delay_std=0, num_workers=1, batch_size=1, sync=False) test_speed(delay_mean=0, delay_std=0, num_workers=1, batch_size=1, sync=True) test_speed(delay_mean=0, delay_std=0, num_workers=6, batch_size=6, sync=False) @@ -91,4 +94,3 @@ def test_speed(envs_per_worker=1, delay_mean=0.01, delay_std=0.001, num_workers= test_speed(delay_mean=0, delay_std=0, num_workers=24, batch_size=6, sync=False) test_speed(delay_mean=0, delay_std=0, num_workers=24, batch_size=24, sync=False) test_speed(delay_mean=0, delay_std=0, num_workers=24, batch_size=6, sync=True) - diff --git a/tests/pool/test_envpool.py b/tests/pool/test_envpool.py index 28b4da568..253c16d78 100644 --- a/tests/pool/test_envpool.py +++ b/tests/pool/test_envpool.py @@ -1,12 +1,9 @@ -from pdb import set_trace as T -import numpy as np import time import gymnasium - +import numpy as np import pufferlib -from pufferlib.vectorization import Serial, Multiprocessing, Ray - +from pufferlib.vectorization import Multiprocessing, Ray, Serial # This is about 1 second on a good CPU core. It is quite difficult to # find good sources of a 1 second delay without using a timer that can swap @@ -18,24 +15,20 @@ class PerformanceEnv: def __init__(self, delay_mean, delay_std): np.random.seed(time.time_ns() % 2**32) - self.observation_space = gymnasium.spaces.Box( - low=-2**20, high=2**20, - shape=(1,), dtype=np.float32 - ) + self.observation_space = gymnasium.spaces.Box(low=-(2**20), high=2**20, shape=(1,), dtype=np.float32) self.action_space = gymnasium.spaces.Discrete(2) self.observation = self.observation_space.sample() self.delay_mean = delay_mean self.delay_std = delay_std - def reset(self, seed=None): return self.observation, {} def step(self, action): start = time.process_time() idx = 0 - target_time = self.delay_mean + self.delay_std*np.random.randn() + target_time = self.delay_mean + self.delay_std * np.random.randn() while time.process_time() - start < target_time: idx += 1 @@ -45,23 +38,21 @@ def close(self): pass -def test_performance(vectorization, workers, envs_per_worker, - delay_mean, delay_std, batch_size=None, timeout=1): +def test_performance(vectorization, workers, envs_per_worker, delay_mean, delay_std, batch_size=None, timeout=1): def make_env(): - return pufferlib.emulation.GymnasiumPufferEnv( - env_creator=PerformanceEnv, env_args=(delay_mean, delay_std)) + return pufferlib.emulation.GymnasiumPufferEnv(env_creator=PerformanceEnv, env_args=(delay_mean, delay_std)) if batch_size is None: batch_size = workers * envs_per_worker actions = np.array([make_env().action_space.sample() for _ in range(batch_size)]) - if vectorization in (Serial, Multiprocessing, 'SyncMultiprocessing', 'SyncRay', Ray): + if vectorization in (Serial, Multiprocessing, "SyncMultiprocessing", "SyncRay", Ray): synchronous = False - if vectorization == 'SyncMultiprocessing': + if vectorization == "SyncMultiprocessing": vectorization = Multiprocessing synchronous = True - if vectorization == 'SyncRay': + if 
vectorization == "SyncRay": vectorization = Ray synchronous = True @@ -90,21 +81,24 @@ def make_env(): def sweep_performance_tests(): backends = ( - gymnasium.vector.SyncVectorEnv, Serial, - gymnasium.vector.AsyncVectorEnv, 'SyncMultiprocessing', - Multiprocessing, - 'SyncRay', Ray, + gymnasium.vector.SyncVectorEnv, + Serial, + gymnasium.vector.AsyncVectorEnv, + "SyncMultiprocessing", + Multiprocessing, + "SyncRay", + Ray, ) results = {} delay_means = (1e-2, 1e-2, 1e-3, 1e-3, 1e-4, 1e-4) delay_stds = (1e-3, 1e-2, 1e-4, 1e-3, 1e-5, 1e-4) - for mean, std in zip(delay_means, delay_stds): + for mean, std in zip(delay_means, delay_stds, strict=False): results[(mean, std)] = {} - print('Environment delay: ', mean, std) + print("Environment delay: ", mean, std) for workers in (1, 6, 24, 96, 192): resul = {} results[(mean, std)][workers] = resul - print('\t', workers) + print("\t", workers) for vec in backends: res = {} if type(vec) != str: @@ -113,27 +107,26 @@ def sweep_performance_tests(): name = vec resul[name] = res - print(2*'\t', name) + print(2 * "\t", name) for envs_per_worker in (1, 2, 4): - batch_sizes=[workers * envs_per_worker] + batch_sizes = [workers * envs_per_worker] if vec in (Multiprocessing, Ray) and workers != 1: batch_sizes.append(workers * envs_per_worker // 2) batch_sizes.append(workers * envs_per_worker // 3) for batch in batch_sizes: - steps, duration = test_performance( - vec, workers, envs_per_worker, mean, std, batch) + steps, duration = test_performance(vec, workers, envs_per_worker, mean, std, batch) res[(envs_per_worker, batch)] = (steps, duration) - print('SPS, envs/worker, batch size: ', - steps / duration, envs_per_worker, batch) + print("SPS, envs/worker, batch size: ", steps / duration, envs_per_worker, batch) + + # np.save('envpool_results.npy', results, allow_pickle=True) - #np.save('envpool_results.npy', results, allow_pickle=True) def plot_performance_tests(): - data = np.load('envpool_results.npy', allow_pickle=True).item() + data = np.load("envpool_results.npy", allow_pickle=True).item() n_envs = len(data) inner_data = list(data.items())[0][1] @@ -143,11 +136,10 @@ def plot_performance_tests(): n_backends, backends = len(inner_inner_data), list(inner_inner_data.keys()) from matplotlib import pyplot as plt - import matplotlib.colors as mcolors # Create figure and axes fig, ax = plt.subplots(figsize=(15, 5)) # Adjust size as needed - #plt.yscale('log') + # plt.yscale('log') # Bar settings bar_width = 0.15 @@ -159,45 +151,39 @@ def plot_performance_tests(): backend_colors = [str(g) for g in grayscale_colors] # Hue colors for cores - hue_colors = 255*plt.cm.hsv(np.linspace(0, 0.6, n_cores))[:, :3] + hue_colors = 255 * plt.cm.hsv(np.linspace(0, 0.6, n_cores))[:, :3] bars_data = [] grayscale_colors = np.linspace(0.4, 1, n_cores) - hue_colors = 255*plt.cm.hsv(np.linspace(0, 0.6, n_backends))[:, :3] + hue_colors = 255 * plt.cm.hsv(np.linspace(0, 0.6, n_backends))[:, :3] import plotly.graph_objects as go - import dash - import dash_core_components as dcc - import dash_html_components as html # Plotting the bars pos = 0 - x_labels = [f'{mean}±{std}' for mean, std in data.keys()] - tick_vals = np.linspace(0, bar_width*n_envs*n_cores*(n_backends+1), n_envs) + x_labels = [f"{mean}±{std}" for mean, std in data.keys()] + tick_vals = np.linspace(0, bar_width * n_envs * n_cores * (n_backends + 1), n_envs) # Set up layout configuration layout = go.Layout( - title=dict( - text='Performance of Vectorization Backends on Various Workloads (24 core machine)', - y=0.9 - ), - 
width=2000,# 1000, + title=dict(text="Performance of Vectorization Backends on Various Workloads (24 core machine)", y=0.9), + width=2000, # 1000, height=500, - yaxis=dict(title='Speedup over Expected Serial Performance'), - plot_bgcolor='rgba(6, 26, 26, 1)', # Dark cyan background - paper_bgcolor='rgba(6, 26, 26, 1)', - font=dict(color='rgba(241, 241, 241, 1)'), # Light text - barmode='group', - xaxis = dict( - title='Test Environment Delays (mean/std) and Process Counts', - tickmode='array', - tickvals = tick_vals, - ticktext = x_labels, + yaxis=dict(title="Speedup over Expected Serial Performance"), + plot_bgcolor="rgba(6, 26, 26, 1)", # Dark cyan background + paper_bgcolor="rgba(6, 26, 26, 1)", + font=dict(color="rgba(241, 241, 241, 1)"), # Light text + barmode="group", + xaxis=dict( + title="Test Environment Delays (mean/std) and Process Counts", + tickmode="array", + tickvals=tick_vals, + ticktext=x_labels, ), legend=dict( y=1.20, - x=0.9,#0.80 + x=0.9, # 0.80 ), ) @@ -205,7 +191,7 @@ def plot_performance_tests(): x = 0 for env_idx, (mean, std) in enumerate(data): env = data[(mean, std)] - label = ('mean = %.1e, std = %.1e' % (mean, std)) + label = "mean = %.1e, std = %.1e" % (mean, std) for workers_idx, workers in enumerate(env): runs = env[workers] for vec_idx, vec in enumerate(runs): @@ -214,51 +200,57 @@ def plot_performance_tests(): speedup = best_sps * mean color = hue_colors[vec_idx] * grayscale_colors[workers_idx] - color = f'rgb{tuple(color[:3])}' # Convert to RGB string - fig.add_trace(go.Bar( - x=[x], - y=[speedup], # Y value - marker_color=color, # Color - text=label, - showlegend=False, - )) + color = f"rgb{tuple(color[:3])}" # Convert to RGB string + fig.add_trace( + go.Bar( + x=[x], + y=[speedup], # Y value + marker_color=color, # Color + text=label, + showlegend=False, + ) + ) x += bar_width - label = '' + label = "" x += bar_width - x += 3*bar_width + x += 3 * bar_width # Create figure with the collected bar data and layout for idx, vec in enumerate(backends): - if vec == 'Serial': - vec = 'Puffer Serial' - elif vec == 'SyncMultiprocessing': - vec = 'Puffer Multiproc.' - elif vec == 'Multiprocessing': - vec = 'Puffer Pool' - - color = f'rgb{tuple(hue_colors[idx])}' # Convert to RGB string - fig.add_trace(go.Bar( - x=[None], # No x value - y=[None], # No y value - name=vec, # Name for the legend entry - marker_color=color, # Transparent color - showlegend=True, # Show in legend - )) + if vec == "Serial": + vec = "Puffer Serial" + elif vec == "SyncMultiprocessing": + vec = "Puffer Multiproc." 
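Editor's note: the legend in plot_performance_tests is built with the usual plotly workaround — the real bars are added with showlegend=False, and dummy traces with no data are appended afterwards purely to generate one legend entry per backend and per core count. A minimal, standalone sketch of that pattern follows; the backend names and colors here are made up for illustration and are not the benchmark's own data.

import plotly.graph_objects as go

backends = ["Serial", "Multiprocessing", "Ray"]          # illustrative names only
colors = ["rgb(255, 99, 71)", "rgb(60, 179, 113)", "rgb(100, 149, 237)"]

fig = go.Figure()

# Real data bars: legend suppressed so each bar does not create its own entry.
for i, color in enumerate(colors):
    fig.add_trace(go.Bar(x=[i], y=[i + 1.0], marker_color=color, showlegend=False))

# Dummy traces with no data: they exist only to produce one legend entry per backend.
for name, color in zip(backends, colors, strict=False):
    fig.add_trace(go.Bar(x=[None], y=[None], name=name, marker_color=color, showlegend=True))

fig.show()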
+ elif vec == "Multiprocessing": + vec = "Puffer Pool" + + color = f"rgb{tuple(hue_colors[idx])}" # Convert to RGB string + fig.add_trace( + go.Bar( + x=[None], # No x value + y=[None], # No y value + name=vec, # Name for the legend entry + marker_color=color, # Transparent color + showlegend=True, # Show in legend + ) + ) for idx, core in enumerate(cores): - color = f'rgb{tuple(3*[grayscale_colors[idx]])}' - fig.add_trace(go.Bar( - x=[None], # No x value - y=[None], # No y value - name=core, # Name for the legend entry - marker_color=color, # Transparent color - showlegend=True, # Show in legend - )) + color = f"rgb{tuple(3 * [grayscale_colors[idx]])}" + fig.add_trace( + go.Bar( + x=[None], # No x value + y=[None], # No y value + name=core, # Name for the legend entry + marker_color=color, # Transparent color + showlegend=True, # Show in legend + ) + ) # Save the figure to a file - fig.write_image('../docker/envpool_sps.png', scale=3) + fig.write_image("../docker/envpool_sps.png", scale=3) -if __name__ == '__main__': - #sweep_performance_tests() +if __name__ == "__main__": + # sweep_performance_tests() plot_performance_tests() diff --git a/tests/pool/test_multiprocessing.py b/tests/pool/test_multiprocessing.py index aefd71b85..93689ae47 100644 --- a/tests/pool/test_multiprocessing.py +++ b/tests/pool/test_multiprocessing.py @@ -1,13 +1,16 @@ -from pdb import set_trace as T -import numpy as np import time -from pufferlib.vectorization import Multiprocessing +import numpy as np from pufferlib.environments import pokemon_red +from pufferlib.vectorization import Multiprocessing + def test_envpool(num_envs, envs_per_worker, envs_per_batch, steps=1000, env_pool=True): - pool = Multiprocessing(pokemon_red.env_creator(), num_envs=num_envs, - envs_per_worker=envs_per_worker, envs_per_batch=envs_per_batch, + pool = Multiprocessing( + pokemon_red.env_creator(), + num_envs=num_envs, + envs_per_worker=envs_per_worker, + envs_per_batch=envs_per_batch, env_pool=True, ) pool.async_reset() @@ -18,21 +21,21 @@ def test_envpool(num_envs, envs_per_worker, envs_per_batch, steps=1000, env_pool o, r, d, t, i, mask, env_id = pool.recv() pool.send(a) end = time.time() - print('Steps per second: ', envs_per_batch * steps / (end - start)) + print("Steps per second: ", envs_per_batch * steps / (end - start)) pool.close() -if __name__ == '__main__': +if __name__ == "__main__": # 225 sps - #test_envpool(num_envs=1, envs_per_worker=1, envs_per_batch=1, env_pool=False) + # test_envpool(num_envs=1, envs_per_worker=1, envs_per_batch=1, env_pool=False) # 600 sps - #test_envpool(num_envs=6, envs_per_worker=1, envs_per_batch=6, env_pool=False) + # test_envpool(num_envs=6, envs_per_worker=1, envs_per_batch=6, env_pool=False) # 645 sps - #test_envpool(num_envs=24, envs_per_worker=4, envs_per_batch=24, env_pool=False) + # test_envpool(num_envs=24, envs_per_worker=4, envs_per_batch=24, env_pool=False) - # 755 sps + # 755 sps # test_envpool(num_envs=24, envs_per_worker=4, envs_per_batch=24) # 1050 sps diff --git a/tests/test.py b/tests/test.py index 813bd8dbe..6f7dab006 100644 --- a/tests/test.py +++ b/tests/test.py @@ -1,16 +1,14 @@ -from pdb import set_trace as T -import numpy as np +# Deprecation warnings from gymnasium +import warnings +import numpy as np import pufferlib import pufferlib.emulation import pufferlib.utils import pufferlib.vector from pufferlib.environments import test -# Deprecation warnings from gymnasium -import gymnasium -import warnings warnings.filterwarnings("ignore") @@ -31,8 +29,7 @@ def 
test_gymnasium_emulation(env_cls, steps=100): # Reconstruct original obs format from puffer env and compare to raw if puf_env.is_obs_emulated: - puf_ob = pufferlib.emulation.nativize( - puf_ob, puf_env.env.observation_space, puf_env.obs_dtype) + puf_ob = pufferlib.emulation.nativize(puf_ob, puf_env.env.observation_space, puf_env.obs_dtype) pufferlib.utils.compare_space_samples(raw_ob, puf_ob) @@ -41,12 +38,12 @@ def test_gymnasium_emulation(env_cls, steps=100): # Convert raw actions to puffer format if puf_env.is_atn_emulated: - action = pufferlib.emulation.emulate_copy( - action, puf_env.action_space.dtype, puf_env.atn_dtype) + action = pufferlib.emulation.emulate_copy(action, puf_env.action_space.dtype, puf_env.atn_dtype) puf_ob, puf_reward, puf_done, puf_truncated, _ = puf_env.step(action) assert puf_reward == raw_reward + def test_pettingzoo_emulation(env_cls, steps=100): raw_env = env_cls() puf_env = pufferlib.emulation.PettingZooPufferEnv(env_creator=env_cls) @@ -71,13 +68,11 @@ def test_pettingzoo_emulation(env_cls, steps=100): # Reconstruct original obs format from puffer env and compare to raw if puf_env.is_obs_emulated: - puf_ob = pufferlib.emulation.nativize( - puf_ob, puf_env.env.single_observation_space, puf_env.obs_dtype) - + puf_ob = pufferlib.emulation.nativize(puf_ob, puf_env.env.single_observation_space, puf_env.obs_dtype) + assert pufferlib.utils.compare_space_samples(raw_ob, puf_ob) - raw_actions = {a: raw_env.action_space(a).sample() - for a in raw_env.agents} + raw_actions = {a: raw_env.action_space(a).sample() for a in raw_env.agents} puf_actions = raw_actions raw_obs, raw_rewards, raw_dones, raw_truncateds, _ = raw_env.step(raw_actions) @@ -91,7 +86,8 @@ def test_pettingzoo_emulation(env_cls, steps=100): continue puf_actions[agent] = pufferlib.emulation.emulate_copy( - raw_actions[agent], puf_env.single_action_space.dtype, puf_env.atn_dtype) + raw_actions[agent], puf_env.single_action_space.dtype, puf_env.atn_dtype + ) puf_obs, puf_rewards, puf_dones, puf_truncateds, _ = puf_env.step(puf_actions) @@ -101,10 +97,10 @@ def test_pettingzoo_emulation(env_cls, steps=100): for agent in raw_dones: assert puf_dones[agent] == raw_dones[agent] + def test_puffer_vectorization(env_cls, puffer_cls, steps=100, num_envs=1, **kwargs): raw_envs = [puffer_cls(env_creator=env_cls) for _ in range(num_envs)] - vec_envs = pufferlib.vector.make(puffer_cls, - env_kwargs={'env_creator': env_cls}, num_envs=num_envs, **kwargs) + vec_envs = pufferlib.vector.make(puffer_cls, env_kwargs={"env_creator": env_cls}, num_envs=num_envs, **kwargs) num_agents = sum(env.num_agents for env in raw_envs) assert num_agents == vec_envs.num_agents @@ -138,7 +134,7 @@ def test_puffer_vectorization(env_cls, puffer_cls, steps=100, num_envs=1, **kwar raw_rewards.append(r_rew) raw_terminals.append(r_term) raw_truncations.append(r_trunc) - + vec_obs, vec_rewards, vec_terminals, vec_truncations, _ = vec_envs.step(actions) rew = raw_rewards @@ -159,22 +155,21 @@ def test_puffer_vectorization(env_cls, puffer_cls, steps=100, num_envs=1, **kwar for raw_env in raw_envs: raw_env.close() + def test_emulation(): for env_cls in test.MOCK_SINGLE_AGENT_ENVIRONMENTS: test_gymnasium_emulation(env_cls) - print('Gymnasium emulation tests passed') + print("Gymnasium emulation tests passed") for env_cls in test.MOCK_MULTI_AGENT_ENVIRONMENTS: test_pettingzoo_emulation(env_cls) - print('PettingZoo emulation tests passed') + print("PettingZoo emulation tests passed") + def test_vectorization(): - for vectorization in [ - 
pufferlib.vector.Serial, - pufferlib.vector.Multiprocessing, - pufferlib.vector.Ray]: + for vectorization in [pufferlib.vector.Serial, pufferlib.vector.Multiprocessing, pufferlib.vector.Ray]: for env_cls in test.MOCK_SINGLE_AGENT_ENVIRONMENTS: test_puffer_vectorization( env_cls, @@ -185,7 +180,7 @@ def test_vectorization(): backend=vectorization, ) - print(f'Gymnasium {vectorization.__name__} vectorization tests passed') + print(f"Gymnasium {vectorization.__name__} vectorization tests passed") for env_cls in test.MOCK_MULTI_AGENT_ENVIRONMENTS: test_puffer_vectorization( @@ -197,9 +192,10 @@ def test_vectorization(): backend=vectorization, ) - print(f'PettingZoo {vectorization.__name__} vectorization tests passed') + print(f"PettingZoo {vectorization.__name__} vectorization tests passed") + -if __name__ == '__main__': +if __name__ == "__main__": test_emulation() test_vectorization() - exit(0) # For Ray + exit(0) # For Ray diff --git a/tests/test_api.py b/tests/test_api.py index 3bf57ea3b..072809297 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1,20 +1,20 @@ -from pdb import set_trace as T import pufferlib import pufferlib.emulation import pufferlib.vector -from pufferlib.exceptions import APIUsageError, InvalidAgentError from pufferlib.environments import test +from pufferlib.exceptions import APIUsageError, InvalidAgentError + def print_if(e, print_errors): if print_errors: - print(type(e).__name__ + ':', e) - print('#################') + print(type(e).__name__ + ":", e) + print("#################") print() + def test_gymnasium_api(print_errors=False): - env = pufferlib.emulation.GymnasiumPufferEnv( - env_creator=test.GymnasiumTestEnv) + env = pufferlib.emulation.GymnasiumPufferEnv(env_creator=test.GymnasiumTestEnv) try: env.step({}) @@ -39,9 +39,9 @@ def test_gymnasium_api(print_errors=False): action = env.action_space.sample() obs, rewards, terminals, truncateds, infos = env.step(action) + def test_pettingzoo_api_usage(print_errors=False): - env = pufferlib.emulation.PettingZooPufferEnv( - env_creator=test.PettingZooTestEnv) + env = pufferlib.emulation.PettingZooPufferEnv(env_creator=test.PettingZooTestEnv) try: env.step({}) @@ -54,17 +54,17 @@ def test_pettingzoo_api_usage(print_errors=False): print_if(e, print_errors) try: - env.observation_space('foo') + env.observation_space("foo") except InvalidAgentError as e: print_if(e, print_errors) try: - env.action_space('foo') + env.action_space("foo") except InvalidAgentError as e: print_if(e, print_errors) - env.observation_space('agent_1') - env.action_space('agent_1') + env.observation_space("agent_1") + env.action_space("agent_1") obs = env.reset() try: @@ -74,28 +74,21 @@ def test_pettingzoo_api_usage(print_errors=False): print_if(e, print_errors) try: - env.step({'foo': None}) + env.step({"foo": None}) except InvalidAgentError as e: print_if(e, print_errors) - actions = {agent: env.action_space(agent).sample() for agent in env.agents} obs, rewards, terminals, truncateds, infos = env.step(actions) + def test_vectorization_api(print_errors=False): - gymnasium_creator = lambda: pufferlib.emulation.GymnasiumPufferEnv( - env_creator=test.GymnasiumTestEnv) - pettingzoo_creator = lambda: pufferlib.emulation.PettingZooPufferEnv( - env_creator=test.PettingZooTestEnv) - - for backend in [ - pufferlib.vector.Serial, - pufferlib.vector.Multiprocessing, - pufferlib.vector.Ray]: - + gymnasium_creator = lambda: pufferlib.emulation.GymnasiumPufferEnv(env_creator=test.GymnasiumTestEnv) + pettingzoo_creator = lambda: 
pufferlib.emulation.PettingZooPufferEnv(env_creator=test.PettingZooTestEnv) + + for backend in [pufferlib.vector.Serial, pufferlib.vector.Multiprocessing, pufferlib.vector.Ray]: for creator in [gymnasium_creator, pettingzoo_creator]: - vec = pufferlib.vector.make(creator, num_envs=6, - num_workers=3, backend=backend) + vec = pufferlib.vector.make(creator, num_envs=6, num_workers=3, backend=backend) # Sync API _, _ = vec.reset() @@ -104,8 +97,7 @@ def test_vectorization_api(print_errors=False): vec.close() # Async API - vec = pufferlib.vector.make(creator, num_envs=8, - num_workers=4, batch_size=4, backend=backend) + vec = pufferlib.vector.make(creator, num_envs=8, num_workers=4, batch_size=4, backend=backend) vec.async_reset() actions = vec.action_space.sample() _, _, _, _, _, _, _ = vec.recv() @@ -118,19 +110,17 @@ def test_vectorization_api(print_errors=False): print_if(e, print_errors) try: - vec = pufferlib.vector.make(gymnasium_creator, - num_envs=3, num_workers=2) + vec = pufferlib.vector.make(gymnasium_creator, num_envs=3, num_workers=2) except APIUsageError as e: print_if(e, print_errors) try: - vec = pufferlib.vector.make(gymnasium_creator, - num_envs=4, num_workers=2, batch_size=3) + vec = pufferlib.vector.make(gymnasium_creator, num_envs=4, num_workers=2, batch_size=3) except APIUsageError as e: print_if(e, print_errors) -if __name__ == '__main__': +if __name__ == "__main__": test_gymnasium_api() test_pettingzoo_api_usage() test_vectorization_api() diff --git a/tests/test_atari_reset.py b/tests/test_atari_reset.py index a82635d24..c292d6375 100644 --- a/tests/test_atari_reset.py +++ b/tests/test_atari_reset.py @@ -1,12 +1,11 @@ -from pdb import set_trace as T from pufferlib.environments import atari def test_atari_reset(): - '''Common way to bug the wrappers can be detected + """Common way to bug the wrappers can be detected by checking that the environment properly resets - after hitting 0 lives''' - env = atari.env_creator('BreakoutNoFrameskip-v4')(4) + after hitting 0 lives""" + env = atari.env_creator("BreakoutNoFrameskip-v4")(4) obs, info = env.reset() prev_lives = 5 @@ -16,14 +15,15 @@ def test_atari_reset(): action = env.action_space.sample() obs, reward, terminal, truncated, info = env.step(action) - if info['lives'] != prev_lives: + if info["lives"] != prev_lives: lives.append(i) - prev_lives = info['lives'] + prev_lives = info["lives"] if terminal or truncated: obs = env.reset() assert len(lives) > 10 -if __name__ == '__main__': + +if __name__ == "__main__": test_atari_reset() diff --git a/tests/test_carbs.py b/tests/test_carbs.py index 0feb11700..7544f600b 100644 --- a/tests/test_carbs.py +++ b/tests/test_carbs.py @@ -1,15 +1,6 @@ import numpy as np - -from carbs import CARBS -from carbs import CARBSParams -from carbs import LinearSpace -from carbs import LogSpace -from carbs import LogitSpace -from carbs import ObservationInParam -from carbs import ParamDictType -from carbs import Param - import wandb +from carbs import CARBS, CARBSParams, LinearSpace, ObservationInParam, Param class SyntheticExperiment: @@ -20,11 +11,12 @@ def __init__(self, n_params, noise=0.1): self.param_optima = np.random.randn(n_params) def optimize(self, params): - dist = (params-self.param_optima)**2 - reward = 2**(-dist) - noise = 1 + self.noise*np.random.randn() + dist = (params - self.param_optima) ** 2 + reward = 2 ** (-dist) + noise = 1 + self.noise * np.random.randn() return noise * np.prod(reward) + class CARBSSearch: def __init__(self, experiment): self.experiment = experiment @@ 
-32,9 +24,7 @@ def __init__(self, experiment): self.best_params = None param_spaces = [ - Param(name=str(i), - space=LinearSpace(min=-10, max=10, is_integer=False), - search_center=0.0) + Param(name=str(i), space=LinearSpace(min=-10, max=10, is_integer=False), search_center=0.0) for i in range(self.experiment.n_params) ] carbs_params = CARBSParams( @@ -56,11 +46,12 @@ def sample(self): ObservationInParam( input=suggestion, output=reward, - cost=1,#uptime, + cost=1, # uptime, ) ) return params, reward + class GeneticAlgorithm: def __init__(self, experiment, mutation_rate=0.1): self.experiment = experiment @@ -69,7 +60,7 @@ def __init__(self, experiment, mutation_rate=0.1): self.best_params = np.random.randn(self.experiment.n_params) def sample(self): - mutation = self.mutation_rate*np.random.randn(self.experiment.n_params) + mutation = self.mutation_rate * np.random.randn(self.experiment.n_params) params = self.best_params + mutation reward = self.experiment.optimize(params) if self.best_reward is None or reward > self.best_reward: @@ -78,23 +69,22 @@ def sample(self): return params, reward + class WandbSearch: - def __init__(self, experiment, method='bayes', strategy=None): + def __init__(self, experiment, method="bayes", strategy=None): self.experiment = experiment self.strategy = strategy - self.parameters = {f'param_{i}': - {'distribution': 'normal', 'mu': 0, 'sigma': 1} - for i in range(10)} + self.parameters = {f"param_{i}": {"distribution": "normal", "mu": 0, "sigma": 1} for i in range(10)} name = strategy.__class__.__name__ if strategy is not None else method self.sweep_id = wandb.sweep( sweep=dict( method=method, - name=f'sweep-{name}', + name=f"sweep-{name}", metric=dict( - goal='maximize', - name='reward', + goal="maximize", + name="reward", ), parameters=self.parameters, ), @@ -105,32 +95,32 @@ def __init__(self, experiment, method='bayes', strategy=None): def run(self): def main(): self.idx += 1 - wandb.init(name=f'experiment-{self.idx}') - wandb.config.__dict__['_locked'] = {} + wandb.init(name=f"experiment-{self.idx}") + wandb.config.__dict__["_locked"] = {} if self.strategy is not None: params, reward = self.strategy.sample() else: params = np.array([float(wandb.config[k]) for k in self.parameters]) reward = self.experiment.optimize(params) - param_dict = dict(zip(self.parameters.keys(), params)) + param_dict = dict(zip(self.parameters.keys(), params, strict=False)) wandb.config.update(param_dict, allow_val_change=True) - wandb.log({'reward': reward}) + wandb.log({"reward": reward}) wandb.agent(self.sweep_id, main, count=100) -if __name__ == '__main__': +if __name__ == "__main__": experiment = SyntheticExperiment(10) strategy = CARBSSearch(experiment) wandb_search = WandbSearch(experiment, strategy=strategy) wandb_search.run() - wandb_search = WandbSearch(experiment, method='random') + wandb_search = WandbSearch(experiment, method="random") wandb_search.run() - wandb_search = WandbSearch(experiment, method='bayes') + wandb_search = WandbSearch(experiment, method="bayes") wandb_search.run() strategy = GeneticAlgorithm(experiment) diff --git a/tests/test_cleanrl_utils.py b/tests/test_cleanrl_utils.py index 08dc939b5..87261de2d 100644 --- a/tests/test_cleanrl_utils.py +++ b/tests/test_cleanrl_utils.py @@ -1,24 +1,19 @@ -from pdb import set_trace as T -import numpy as np - -import torch -from torch.distributions import Categorical - import gym - +import numpy as np import pufferlib -import pufferlib.models import pufferlib.cleanrl import pufferlib.environments.classic_control 
+import pufferlib.models import pufferlib.vectorization +import torch +from torch.distributions import Categorical def test_cleanrl_utils(): envs = pufferlib.vectorization.Serial( - env_creator=pufferlib.environments.classic_control.env_creator('cartpole'), - num_envs=4, envs_per_worker=2 + env_creator=pufferlib.environments.classic_control.env_creator("cartpole"), num_envs=4, envs_per_worker=2 ) - + obs, info, _, _ = envs.reset() policy = pufferlib.models.Default(envs.driver_env) @@ -28,11 +23,13 @@ def test_cleanrl_utils(): obs = torch.tensor(obs).unsqueeze(1).float() actions = policy.get_action_and_value(obs) + def shape_check(a1, l1, e1, a2, l2, e2): assert a1.shape == a2.shape assert l1.shape == l2.shape assert e1.shape == e2.shape + def test_sample_logits(): batch = 8 @@ -57,8 +54,9 @@ def test_sample_logits(): a2, l2, e2 = pufferlib.cleanrl.sample_logits(md_logits, action=md_action) shape_check(a1, l1, e1, a2, l2, e2) + def correct_sample_logits(logits, action=None): - '''A bad but known correct implementation''' + """A bad but known correct implementation""" if isinstance(logits, torch.Tensor): categorical = Categorical(logits=logits) if action is None: @@ -77,11 +75,11 @@ def correct_sample_logits(logits, action=None): else: action = action.view(-1, action.shape[-1]).T - logprob = torch.stack([c.log_prob(a) for c, a in zip(multi_categorical, action)]).T.sum(1) + logprob = torch.stack([c.log_prob(a) for c, a in zip(multi_categorical, action, strict=False)]).T.sum(1) entropy = torch.stack([c.entropy() for c in multi_categorical]).T.sum(1) return action, logprob, entropy - -if __name__ == '__main__': + +if __name__ == "__main__": test_cleanrl_utils() - #test_sample_logits() + # test_sample_logits() diff --git a/tests/test_extensions.py b/tests/test_extensions.py index bef142266..e28ce1a24 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -1,11 +1,11 @@ -from pdb import set_trace as T -import numpy as np import timeit -import gym -from pufferlib.emulation import flatten_structure, flatten_space, flatten, unflatten, concatenate, split +import gym +import numpy as np import pufferlib.utils +from pufferlib.emulation import concatenate, flatten, flatten_space, flatten_structure, split, unflatten + def test_pack_unpack(): for space in nested_spaces: @@ -18,44 +18,48 @@ def test_pack_unpack(): unpack_sample = split(pack_sample, flat_space, sz, batched=False) unflat_sample = unflatten(unpack_sample, space) assert pufferlib.utils.compare_space_samples(sample, unflat_sample), "Unflatten failed." 
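Editor's note: the "known correct" reference in correct_sample_logits reduces multi-discrete sampling to one Categorical per action component, with log-probabilities and entropies summed across components. A self-contained sketch of that reduction is below; the batch/component/class sizes are arbitrary, and this mirrors the reference used by the test rather than pufferlib.cleanrl.sample_logits itself.

import torch
from torch.distributions import Categorical

batch, components, classes = 8, 3, 5
logits = [torch.randn(batch, classes) for _ in range(components)]

dists = [Categorical(logits=l) for l in logits]
action = torch.stack([d.sample() for d in dists])                   # (components, batch)
logprob = torch.stack(
    [d.log_prob(a) for d, a in zip(dists, action, strict=False)]
).T.sum(1)                                                          # (batch,): summed over components
entropy = torch.stack([d.entropy() for d in dists]).T.sum(1)        # (batch,)

assert action.shape == (components, batch)
assert logprob.shape == (batch,) and entropy.shape == (batch,)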
- + + test_cases = [ # Nested Dict with Box and Discrete spaces - gym.spaces.Dict({ - "a": gym.spaces.Box(low=0, high=1, shape=(3,)), - "b": gym.spaces.MultiDiscrete([3, 10]), - "c": gym.spaces.Dict({ - "d": gym.spaces.Box(low=-10, high=10, shape=(100,)), - "e": gym.spaces.Discrete(1000) - }) - }), - + gym.spaces.Dict( + { + "a": gym.spaces.Box(low=0, high=1, shape=(3,)), + "b": gym.spaces.MultiDiscrete([3, 10]), + "c": gym.spaces.Dict({"d": gym.spaces.Box(low=-10, high=10, shape=(100,)), "e": gym.spaces.Discrete(1000)}), + } + ), # Nested Tuple with Box spaces of different shapes - gym.spaces.Tuple(( - gym.spaces.Box(low=0, high=1, shape=(1,)), - gym.spaces.Box(low=-5, high=5, shape=(10,)), - gym.spaces.Tuple(( - gym.spaces.Box(low=-100, high=100, shape=(1000,)), - gym.spaces.Box(low=-1000, high=1000, shape=(10000,)) - )) - )), - + gym.spaces.Tuple( + ( + gym.spaces.Box(low=0, high=1, shape=(1,)), + gym.spaces.Box(low=-5, high=5, shape=(10,)), + gym.spaces.Tuple( + ( + gym.spaces.Box(low=-100, high=100, shape=(1000,)), + gym.spaces.Box(low=-1000, high=1000, shape=(10000,)), + ) + ), + ) + ), # Nested Dict with Tuple, Box, and Discrete spaces - gym.spaces.Dict({ - "f": gym.spaces.Tuple((gym.spaces.Discrete(2), gym.spaces.Discrete(3))), - "g": gym.spaces.Box(low=-10, high=10, shape=(50,)), - "h": gym.spaces.Tuple(( - gym.spaces.Box(low=0, high=1, shape=(500,)), - gym.spaces.Dict({ - "i": gym.spaces.Discrete(5000), - "j": gym.spaces.Box(low=-100, high=100, shape=(10000,)) - }) - )) - }), - + gym.spaces.Dict( + { + "f": gym.spaces.Tuple((gym.spaces.Discrete(2), gym.spaces.Discrete(3))), + "g": gym.spaces.Box(low=-10, high=10, shape=(50,)), + "h": gym.spaces.Tuple( + ( + gym.spaces.Box(low=0, high=1, shape=(500,)), + gym.spaces.Dict( + {"i": gym.spaces.Discrete(5000), "j": gym.spaces.Box(low=-100, high=100, shape=(10000,))} + ), + ) + ), + } + ), # Flat spaces for control gym.spaces.Box(low=0, high=1, shape=(10,)), - gym.spaces.Discrete(100) + gym.spaces.Discrete(100), ] @@ -75,21 +79,17 @@ def test_flatten_unflatten(iterations=10_000): unflat = unflatten(unmerged, structure) assert pufferlib.utils.compare_space_samples(data, unflat), "Unflatten failed." 
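Editor's note: the property these tests assert is a round-trip invariant — flattening a nested space sample and unflattening it again must reproduce the original structure. The toy version below illustrates that invariant with a hand-rolled recursive flatten over a gymnasium sample; it deliberately does not use pufferlib's own flatten/concatenate/split/unflatten helpers, whose full signatures are not shown in this diff.

import gymnasium
import numpy as np

space = gymnasium.spaces.Dict({
    "a": gymnasium.spaces.Box(low=0, high=1, shape=(3,)),
    "b": gymnasium.spaces.Tuple((gymnasium.spaces.Discrete(4),
                                 gymnasium.spaces.Box(low=-1, high=1, shape=(2,)))),
})

def flatten_sample(sample, out):
    # Depth-first walk: dicts by key order, tuples by position, leaves appended as-is.
    if isinstance(sample, dict):
        for key in sample:
            flatten_sample(sample[key], out)
    elif isinstance(sample, tuple):
        for item in sample:
            flatten_sample(item, out)
    else:
        out.append(sample)
    return out

def unflatten_sample(space, leaves):
    # Rebuild the nested structure by consuming leaves in the same depth-first order.
    if isinstance(space, gymnasium.spaces.Dict):
        return {k: unflatten_sample(sub, leaves) for k, sub in space.spaces.items()}
    if isinstance(space, gymnasium.spaces.Tuple):
        return tuple(unflatten_sample(sub, leaves) for sub in space.spaces)
    return leaves.pop(0)

sample = space.sample()
leaves = flatten_sample(sample, [])
rebuilt = unflatten_sample(space, list(leaves))
assert np.allclose(rebuilt["a"], sample["a"]) and rebuilt["b"][0] == sample["b"][0]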
- flatten_times.append(timeit.timeit( - lambda: flatten(data), number=iterations)) - concatenate_times.append(timeit.timeit( - lambda: concatenate(flat), number=iterations)) - split_times.append(timeit.timeit( - lambda: split(merged, flat_space, sz, batched=False), number=iterations)) - unflatten_times.append(timeit.timeit( - lambda: unflatten(unmerged, structure), number=iterations)) + flatten_times.append(timeit.timeit(lambda: flatten(data), number=iterations)) + concatenate_times.append(timeit.timeit(lambda: concatenate(flat), number=iterations)) + split_times.append(timeit.timeit(lambda: split(merged, flat_space, sz, batched=False), number=iterations)) + unflatten_times.append(timeit.timeit(lambda: unflatten(unmerged, structure), number=iterations)) - print(f'{np.mean(flatten_times)/iterations:.8f}: Flatten time') - print(f'{np.mean(concatenate_times)/iterations:.8f}: Concatenate time') - print(f'{np.mean(split_times)/iterations:.8f}: Split time') - print(f'{np.mean(unflatten_times)/iterations:.8f}: Unflatten time') + print(f"{np.mean(flatten_times) / iterations:.8f}: Flatten time") + print(f"{np.mean(concatenate_times) / iterations:.8f}: Concatenate time") + print(f"{np.mean(split_times) / iterations:.8f}: Split time") + print(f"{np.mean(unflatten_times) / iterations:.8f}: Unflatten time") -if __name__ == '__main__': +if __name__ == "__main__": iterations = 10_000 test_flatten_unflatten(iterations=iterations) diff --git a/tests/test_flatten.py b/tests/test_flatten.py index c2b7de346..c7f9fe352 100644 --- a/tests/test_flatten.py +++ b/tests/test_flatten.py @@ -1,27 +1,27 @@ -import pufferlib.extensions as c -from pufferlib.emulation import flatten_structure import timeit +import pufferlib.extensions as c +from pufferlib.emulation import flatten_structure samples = [ - [1, {'foo': (1, 2, 3)}], - {'foo': 1, 'bar': {'baz': 2, 'qux': 3}}, + [1, {"foo": (1, 2, 3)}], + {"foo": 1, "bar": {"baz": 2, "qux": 3}}, 1, - {'a': [1, 2, {'b': (3, 4)}]}, - {'x': {'y': {'z': [1, 2, 3]}}}, - (1, 2, [3, 4], {'a': 5}), - {'nested': {'more': {'and_more': (1, 2, [3, {'deep': 4}])}}}, + {"a": [1, 2, {"b": (3, 4)}]}, + {"x": {"y": {"z": [1, 2, 3]}}}, + (1, 2, [3, 4], {"a": 5}), + {"nested": {"more": {"and_more": (1, 2, [3, {"deep": 4}])}}}, [[1, 2], [3, 4], [5, 6]], - {'a': 1, 'b': 2, 'c': {'d': 3, 'e': [4, 5]}}, - (1, {'a': 2, 'b': {'c': 3, 'd': [4, 5]}}), - {'a': {'b': {'c': {'d': 1}}}}, - [1, 2, 3, [4, 5, {'a': 6}]], - {'single': 1}, + {"a": 1, "b": 2, "c": {"d": 3, "e": [4, 5]}}, + (1, {"a": 2, "b": {"c": 3, "d": [4, 5]}}), + {"a": {"b": {"c": {"d": 1}}}}, + [1, 2, 3, [4, 5, {"a": 6}]], + {"single": 1}, (1,), - {'a': {'b': [1, 2, (3, 4, {'e': 5})]}}, - [[1, 2], 3, {'a': (4, 5)}], - (1, [2, {'a': 3}], {'b': 4}, [5, 6]), - {'mixed': (1, [2, 3], {'a': 4, 'b': (5, [6, 7])})} + {"a": {"b": [1, 2, (3, 4, {"e": 5})]}}, + [[1, 2], 3, {"a": (4, 5)}], + (1, [2, {"a": 3}], {"b": 4}, [5, 6]), + {"mixed": (1, [2, 3], {"a": 4, "b": (5, [6, 7])})}, ] @@ -29,7 +29,7 @@ def compare_data(data, unflat): if isinstance(data, (list, tuple)) and isinstance(unflat, (list, tuple)): if len(data) != len(unflat): return False - return all(compare_data(d, f) for d, f in zip(data, unflat)) + return all(compare_data(d, f) for d, f in zip(data, unflat, strict=False)) elif isinstance(data, dict) and isinstance(unflat, dict): if len(data) != len(unflat): return False @@ -37,6 +37,7 @@ def compare_data(data, unflat): else: return data == unflat + def test_flatten_unflatten(): for sample in samples: structure = flatten_structure(sample) @@ 
-49,6 +50,7 @@ def test_flatten_unflatten(): breakpoint() assert compare_data(sample, unflat) + def test_flatten_performance(n=100_000): print("\nFlatten Performance Testing:") total_calls_per_second = 0 @@ -62,6 +64,7 @@ def test_flatten_performance(n=100_000): avg_calls_per_second_in_k = total_calls_per_second // num_samples print(f"Average flatten calls per second across all samples: {avg_calls_per_second_in_k} K") + def test_unflatten_performance(n=100_000): print("\nUnflatten Performance Testing:") total_calls_per_second = 0 diff --git a/tests/test_import_performance.py b/tests/test_import_performance.py index c224e3431..589023ce2 100644 --- a/tests/test_import_performance.py +++ b/tests/test_import_performance.py @@ -1,11 +1,13 @@ import time + def test_import_speed(): - start = time.time() - import pufferlib + start = time.time() + end = time.time() - print(end - start, ' seconds to import pufferlib') + print(end - start, " seconds to import pufferlib") assert end - start < 0.25 -if __name__ == '__main__': - test_import_speed() \ No newline at end of file + +if __name__ == "__main__": + test_import_speed() diff --git a/tests/test_namespace.py b/tests/test_namespace.py index 08095017c..9e1924181 100644 --- a/tests/test_namespace.py +++ b/tests/test_namespace.py @@ -1,29 +1,33 @@ -from pufferlib import namespace, dataclass +from pufferlib import dataclass, namespace + def test_namespace_as_function(): ns = namespace(x=1, y=2, z=3) - + assert ns.x == 1 assert ns.y == 2 assert ns.z == 3 - assert list(ns.keys()) == ['x', 'y', 'z'] + assert list(ns.keys()) == ["x", "y", "z"] assert list(ns.values()) == [1, 2, 3] - assert list(ns.items()) == [('x', 1), ('y', 2), ('z', 3)] + assert list(ns.items()) == [("x", 1), ("y", 2), ("z", 3)] + @dataclass class TestClass: a: int b = 1 + def test_namespace_as_decorator(): obj = TestClass(a=4, b=5) - + assert obj.a == 4 assert obj.b == 5 - assert list(obj.keys()) == ['a', 'b'] + assert list(obj.keys()) == ["a", "b"] assert list(obj.values()) == [4, 5] - assert list(obj.items()) == [('a', 4), ('b', 5)] + assert list(obj.items()) == [("a", 4), ("b", 5)] + -if __name__ == '__main__': +if __name__ == "__main__": test_namespace_as_function() test_namespace_as_decorator() diff --git a/tests/test_nested.py b/tests/test_nested.py index dcef1cc49..ebe47557f 100644 --- a/tests/test_nested.py +++ b/tests/test_nested.py @@ -1,17 +1,16 @@ -from pdb import set_trace as T import numpy as np - import pufferlib.spaces from pufferlib.emulation import flatten + def dtype_from_space(space): if isinstance(space, pufferlib.spaces.Tuple): dtype = [] num_bytes = 0 for i, elem in enumerate(space): dtype_ext, bytes_ext = dtype_from_space(elem) - dtype.append((f'f{i}', dtype_ext)) - #dtype.append((dtype_ext,)) + dtype.append((f"f{i}", dtype_ext)) + # dtype.append((dtype_ext,)) num_bytes += bytes_ext elif isinstance(space, pufferlib.spaces.Dict): dtype = [] @@ -32,11 +31,12 @@ def flat_dtype_from_space(space, name=None): _flat_dtype_from_space(space, dtype, name) return dtype + def _flat_dtype_from_space(space, dtype, name=None): if isinstance(space, pufferlib.spaces.Tuple): for i, elem in enumerate(space): - _flat_dtype_from_space(elem, dtype, name=f'f{i}') - #_flat_dtype_from_space(elem, dtype, name=None) + _flat_dtype_from_space(elem, dtype, name=f"f{i}") + # _flat_dtype_from_space(elem, dtype, name=None) elif isinstance(space, pufferlib.spaces.Dict): for k, value in space.items(): _flat_dtype_from_space(value, dtype, name=k) @@ -48,28 +48,30 @@ def 
_flat_dtype_from_space(space, dtype, name=None): return dtype + def fill_with_sample(arr, sample): if isinstance(sample, dict): for k, v in sample.items(): fill_with_sample(arr[k], v) elif isinstance(sample, tuple): for i, v in enumerate(sample): - fill_with_sample(arr[f'f{i}'], v) + fill_with_sample(arr[f"f{i}"], v) else: arr[()] = sample -from gymnasium.spaces import Tuple, Dict, Box +from gymnasium.spaces import Box, Dict, Tuple -test_space = Tuple([ - Dict({ - 'a': Box(0, 1, shape=(2,)), - 'b': Box(0, 1, shape=(3,)) - }), - Dict({ - 'c': Box(0, 1, shape=(4,)), - }) -]) +test_space = Tuple( + [ + Dict({"a": Box(0, 1, shape=(2,)), "b": Box(0, 1, shape=(3,))}), + Dict( + { + "c": Box(0, 1, shape=(4,)), + } + ), + ] +) # Some notes: # The flat version may be faster. It allows you to fill in a single @@ -83,15 +85,15 @@ def fill_with_sample(arr, sample): dtype, num_bytes = dtype_from_space(test_space) dtype = np.dtype(dtype) elem = np.zeros(1, dtype=dtype) -#flat_dtype = flat_dtype_from_space(test_space) +# flat_dtype = flat_dtype_from_space(test_space) sample = test_space.sample() fill_with_sample(elem, sample) breakpoint() -#flat_sample = flatten(sample) +# flat_sample = flatten(sample) rec_array = np.rec.array(flat_sample, dtype=flat_dtype) rec_array = rec_array.view(dtype) -''' +""" test_space = Dict({ 'a': Box(0, 1, shape=(3,)), 'b': Dict({ @@ -100,44 +102,47 @@ def fill_with_sample(arr, sample): }), 'e': Box(0, 1, shape=(3,)) }) -''' - +""" + breakpoint() exit() + def mkdt(d): ll = [] sz_bytes = 0 - for k,v in d.items(): - if isinstance(v,np.ndarray): - ll.append((k,v.dtype)) + for k, v in d.items(): + if isinstance(v, np.ndarray): + ll.append((k, v.dtype)) sz_bytes += v.nbytes else: l_ext, sz_ext = mkdt(v) - ll.append((k,l_ext)) + ll.append((k, l_ext)) sz_bytes += sz_ext return ll, sz_bytes + def mkdt_flat(d): dtype = [] return _mkdt_flat(d, dtype) + def _mkdt_flat(d, dtype): - for k,v in d.items(): - if isinstance(v,np.ndarray): - dtype.append((k,v.dtype)) + for k, v in d.items(): + if isinstance(v, np.ndarray): + dtype.append((k, v.dtype)) else: _mkdt_flat(v, dtype) return dtype -arr1=np.arange(10).astype(np.float32) -arr2=np.arange(100.,110.).astype(np.uint8) -arr3=np.arange(200,210).astype(np.int32) -d={'a':arr1, 'b':{'b1':arr2, 'b2':{'c':arr3}}} +arr1 = np.arange(10).astype(np.float32) +arr2 = np.arange(100.0, 110.0).astype(np.uint8) +arr3 = np.arange(200, 210).astype(np.int32) +d = {"a": arr1, "b": {"b1": arr2, "b2": {"c": arr3}}} dt, sz_bytes = mkdt(d) -#A = np.zeros(sz_bytes, dtype=np.uint8) +# A = np.zeros(sz_bytes, dtype=np.uint8) flat = flatten(d) flat_dtype = mkdt_flat(d) rec_array = np.rec.array(flat, dtype=flat_dtype).view(dt) diff --git a/tests/test_nmmo3_compile.py b/tests/test_nmmo3_compile.py index 4f92f90cb..a0a2c33b6 100644 --- a/tests/test_nmmo3_compile.py +++ b/tests/test_nmmo3_compile.py @@ -1,27 +1,28 @@ -from pdb import set_trace as T import time -import torch + import numpy as np +import torch -@torch.compile(fullgraph=True, mode='reduce-overhead') +@torch.compile(fullgraph=True, mode="reduce-overhead") def fast_decode_map(codes, obs, factors, add, div): codes = codes.view(codes.shape[0], 1, -1) - dec = add + (codes//div) % factors + dec = add + (codes // div) % factors obs.scatter_(1, dec, 1) return obs -#@torch.compile(fullgraph=True, mode='reduce-overhead') + +# @torch.compile(fullgraph=True, mode='reduce-overhead') def decode_map(codes): codes = codes.unsqueeze(1).long() factors = [4, 4, 16, 5, 3, 5, 5, 6, 7, 4] n_channels = sum(factors) - obs = 
torch.zeros(codes.shape[0], n_channels, 11, 15, device='cuda') + obs = torch.zeros(codes.shape[0], n_channels, 11, 15, device="cuda") add, div = 0, 1 # TODO: check item/tier order for mod in factors: - obs.scatter_(1, add+(codes//div)%mod, 1) + obs.scatter_(1, add + (codes // div) % mod, 1) add += mod div *= mod @@ -38,8 +39,8 @@ def test_perf(n=100, agents=1024): add = torch.tensor(add).cuda() div = torch.tensor(div).cuda() - codes = torch.randint(0, 4*4*16*5*3*5*5*6*7*4, (agents, 11, 15)).cuda() - obs = torch.zeros(agents, n_channels, 11*15, device='cuda') + codes = torch.randint(0, 4 * 4 * 16 * 5 * 3 * 5 * 5 * 6 * 7 * 4, (agents, 11, 15)).cuda() + obs = torch.zeros(agents, n_channels, 11 * 15, device="cuda") obs_view = obs.view(agents, n_channels, 11, 15) # Warm up @@ -50,15 +51,14 @@ def test_perf(n=100, agents=1024): start = time.time() for _ in range(n): fast_decode_map(codes, obs, factors, add, div) - #obs2 = decode_map(codes) - #print(torch.all(obs_view == obs2)) - + # obs2 = decode_map(codes) + # print(torch.all(obs_view == obs2)) torch.cuda.synchronize() end = time.time() sps = n / (end - start) - print(f'SPS: {sps:.2f}') + print(f"SPS: {sps:.2f}") -if __name__ == '__main__': - test_perf() +if __name__ == "__main__": + test_perf() diff --git a/tests/test_performance.py b/tests/test_performance.py index ae6e385d6..357428176 100644 --- a/tests/test_performance.py +++ b/tests/test_performance.py @@ -1,30 +1,19 @@ -from pdb import set_trace as T -import time -from tqdm import tqdm -import importlib import random -import sys +import time +import gymnasium +import numpy as np import pufferlib -import pufferlib.utils -import pufferlib.exceptions import pufferlib.emulation import pufferlib.environments - -import numpy as np - -import pufferlib +import pufferlib.exceptions +import pufferlib.utils from pufferlib.environments import ocean -from pufferlib.vector import Multiprocessing, Serial, Ray, make, autotune - -import time -import psutil -import gymnasium +from pufferlib.vector import Multiprocessing, Serial, make DEFAULT_TIMEOUT = 10 -import time -from functools import wraps + class TimedEnv: def __init__(self, env): @@ -51,6 +40,7 @@ def reset(self, *args, **kwargs): self.reset_times.append(elapsed) return result + def profile_emulation(env_creator, timeout=DEFAULT_TIMEOUT, seed=42): reset_times = [] step_times = [] @@ -75,7 +65,7 @@ def profile_emulation(env_creator, timeout=DEFAULT_TIMEOUT, seed=42): if multiagent: action = {agent: env.action_space(agent).sample() for agent in ob} - agent_step_count += len(env.agents) + agent_step_count += len(env.agents) else: action = env.action_space.sample() agent_step_count += 1 @@ -103,13 +93,15 @@ def profile_emulation(env_creator, timeout=DEFAULT_TIMEOUT, seed=42): env_sps = agent_step_count / (puf_total_step + puf_total_reset) env_percent_reset = 100 * puf_total_reset / (puf_total_reset + puf_total_step) env_percent_step_std = 100 * puf_step_std / puf_step_mean - env_overhead = 100 * (puf_total_step - raw_total_step + puf_total_reset - raw_total_reset) / (puf_total_step + puf_total_reset) + env_overhead = ( + 100 * (puf_total_step - raw_total_step + puf_total_reset - raw_total_reset) / (puf_total_step + puf_total_reset) + ) + print(f" SPS : {env_sps:.1f}") + print(f" Overhead : {env_overhead:.2g}%") + print(f" Reset : {env_percent_reset:.3g}%") + print(f" Step STD : {env_percent_step_std:.3g}%") - print(f' SPS : {env_sps:.1f}') - print(f' Overhead : {env_overhead:.2g}%') - print(f' Reset : {env_percent_reset:.3g}%') - print(f' Step STD : 
{env_percent_step_std:.3g}%') def profile_puffer(env_creator, timeout=DEFAULT_TIMEOUT, **kwargs): vecenv = make(env_creator, **kwargs) @@ -119,20 +111,21 @@ def profile_puffer(env_creator, timeout=DEFAULT_TIMEOUT, **kwargs): vecenv.reset() start = time.time() while time.time() - start < timeout: - vecenv.send(actions[agent_steps%1000]) + vecenv.send(actions[agent_steps % 1000]) o, r, d, t, i, env_id, mask = vecenv.recv() agent_steps += sum(mask) sps = agent_steps / (time.time() - start) vecenv.close() - backend = kwargs.get('backend', Serial) - if backend == Multiprocessing and 'batch_size' in kwargs: - print(f' Puffer : {(sps):.1f} - Pool') + backend = kwargs.get("backend", Serial) + if backend == Multiprocessing and "batch_size" in kwargs: + print(f" Puffer : {(sps):.1f} - Pool") else: - print(f' Puffer : {(sps):.1f} - {backend.__name__}') + print(f" Puffer : {(sps):.1f} - {backend.__name__}") return sps + def profile_gymnasium_vec(env_creator, num_envs, timeout=DEFAULT_TIMEOUT): vecenv = gymnasium.vector.AsyncVectorEnv([env_creator] * num_envs) actions = [vecenv.action_space.sample() for _ in range(1000)] @@ -141,127 +134,149 @@ def profile_gymnasium_vec(env_creator, num_envs, timeout=DEFAULT_TIMEOUT): vecenv.reset() start = time.time() while time.time() - start < timeout: - vecenv.step(actions[steps%1000]) + vecenv.step(actions[steps % 1000]) steps += 1 sps = steps * vecenv.num_envs / (time.time() - start) vecenv.close() - print(f' Gymnasium : {(sps):.1f}') + print(f" Gymnasium : {(sps):.1f}") return sps + def profile_sb3_vec(env_creator, num_envs, timeout=DEFAULT_TIMEOUT): with pufferlib.utils.Suppress(): from stable_baselines3.common.vec_env import SubprocVecEnv + vecenv = SubprocVecEnv([env_creator] * num_envs) - actions = [[vecenv.action_space.sample() for _ in range(num_envs)] - for _ in range(1000)] + actions = [[vecenv.action_space.sample() for _ in range(num_envs)] for _ in range(1000)] steps = 0 vecenv.reset() start = time.time() while time.time() - start < timeout: - vecenv.step(actions[steps%1000]) + vecenv.step(actions[steps % 1000]) steps += 1 sps = steps * vecenv.num_envs / (time.time() - start) vecenv.close() - print(f' SB3 : {(sps):.1f}') + print(f" SB3 : {(sps):.1f}") return sps -def profile_all(name, env_creator, num_envs, num_workers=24, - env_batch_size=None, zero_copy=True, timeout=DEFAULT_TIMEOUT): + +def profile_all( + name, env_creator, num_envs, num_workers=24, env_batch_size=None, zero_copy=True, timeout=DEFAULT_TIMEOUT +): if env_batch_size is None: env_batch_size = num_envs print(name) profile_emulation(env_creator, timeout=timeout) - profile_puffer(env_creator, num_envs=env_batch_size, - backend=Multiprocessing, timeout=timeout, + profile_puffer( + env_creator, + num_envs=env_batch_size, + backend=Multiprocessing, + timeout=timeout, num_workers=min(num_workers, env_batch_size), ) if env_batch_size is not None and env_batch_size != num_envs: - profile_puffer(env_creator, num_envs=num_envs, - backend=Multiprocessing, timeout=timeout, num_workers=num_workers, - batch_size=env_batch_size, zero_copy=zero_copy + profile_puffer( + env_creator, + num_envs=num_envs, + backend=Multiprocessing, + timeout=timeout, + num_workers=num_workers, + batch_size=env_batch_size, + zero_copy=zero_copy, ) profile_gymnasium_vec(env_creator, num_envs=env_batch_size, timeout=timeout) profile_sb3_vec(env_creator, num_envs=env_batch_size, timeout=timeout) print() -if __name__ == '__main__': + +if __name__ == "__main__": from pufferlib.environments import nocturne + env_creator = 
nocturne.env_creator() profile_emulation(env_creator) - #profile_puffer(env_creator, num_envs=8, backend=Multiprocessing) + # profile_puffer(env_creator, num_envs=8, backend=Multiprocessing) exit(0) from pufferlib.environments import vizdoom + env_creator = vizdoom.env_creator() - #profile_emulation(env_creator) - profile_puffer(env_creator, num_envs=24, - batch_size=8, backend=Multiprocessing, zero_copy=False) + # profile_emulation(env_creator) + profile_puffer(env_creator, num_envs=24, batch_size=8, backend=Multiprocessing, zero_copy=False) from pufferlib.environments import ocean - env_creator = ocean.env_creator('grid') - #profile_emulation(env_creator) + + env_creator = ocean.env_creator("grid") + # profile_emulation(env_creator) import cProfile - cProfile.run('profile_emulation(env_creator)', 'stats.profile') + + cProfile.run("profile_emulation(env_creator)", "stats.profile") import pstats from pstats import SortKey - p = pstats.Stats('stats.profile') + + p = pstats.Stats("stats.profile") p.sort_stats(SortKey.TIME).print_stats(10) exit(0) from pufferlib.environments import nmmo - print('Neural MMO') + + print("Neural MMO") env_creator = nmmo.env_creator() profile_emulation(env_creator) - #profile_puffer(env_creator, num_envs=8, backend=Multiprocessing) - profile_puffer(env_creator, num_envs=96, - batch_size=48, backend=Multiprocessing, zero_copy=False) + # profile_puffer(env_creator, num_envs=8, backend=Multiprocessing) + profile_puffer(env_creator, num_envs=96, batch_size=48, backend=Multiprocessing, zero_copy=False) print() from pufferlib.environments import nethack - profile_all('NetHack', nethack.env_creator(), num_envs=48) + + profile_all("NetHack", nethack.env_creator(), num_envs=48) from pufferlib.environments import minihack - profile_all('MiniHack', minihack.env_creator(), num_envs=48) + + profile_all("MiniHack", minihack.env_creator(), num_envs=48) from pufferlib.environments import pokemon_red - profile_all('Pokemon Red', pokemon_red.env_creator(), - num_envs=144, env_batch_size=48, zero_copy=False) + + profile_all("Pokemon Red", pokemon_red.env_creator(), num_envs=144, env_batch_size=48, zero_copy=False) from pufferlib.environments import procgen - profile_all('ProcGen', procgen.env_creator('bigfish'), - num_envs=144, env_batch_size=48, num_workers=24, zero_copy=False) + + profile_all( + "ProcGen", procgen.env_creator("bigfish"), num_envs=144, env_batch_size=48, num_workers=24, zero_copy=False + ) from pufferlib.environments import classic_control - profile_all('Classic Control', classic_control.env_creator(), - num_envs=1152, env_batch_size=48) + + profile_all("Classic Control", classic_control.env_creator(), num_envs=1152, env_batch_size=48) from pufferlib.environments import ocean - profile_all('Ocean Squared', ocean.env_creator('squared'), - num_envs=1152, env_batch_size=48) + + profile_all("Ocean Squared", ocean.env_creator("squared"), num_envs=1152, env_batch_size=48) from pufferlib.environments import atari - profile_all('Atari Breakout', atari.env_creator('BreakoutNoFrameskip-v4'), - num_envs=144, env_batch_size=48, zero_copy=False) + + profile_all( + "Atari Breakout", atari.env_creator("BreakoutNoFrameskip-v4"), num_envs=144, env_batch_size=48, zero_copy=False + ) from pufferlib.environments import crafter - profile_all('Crafter', crafter.env_creator(), - num_envs=24, env_batch_size=8, zero_copy=False) + + profile_all("Crafter", crafter.env_creator(), num_envs=24, env_batch_size=8, zero_copy=False) from pufferlib.environments import minigrid - 
profile_all('MiniGrid', minigrid.env_creator(), - num_envs=192, env_batch_size=48, zero_copy=False) + + profile_all("MiniGrid", minigrid.env_creator(), num_envs=192, env_batch_size=48, zero_copy=False) exit(0) - ''' + """ # Small scale version for laptop from pufferlib.environments import nmmo print('Neural MMO') @@ -303,26 +318,25 @@ def profile_all(name, env_creator, num_envs, num_workers=24, num_envs=36, num_workers=6, env_batch_size=12, zero_copy=False) exit(0) - ''' + """ - #from functools import partial - #counts = [1e5, 1e6, 1e7, 1e8] - #delays = [0, 0.1, 0.25, 0.5, 1] - #bandwidth = [1, 1e4, 1e5, 1e6] + # from functools import partial + # counts = [1e5, 1e6, 1e7, 1e8] + # delays = [0, 0.1, 0.25, 0.5, 1] + # bandwidth = [1, 1e4, 1e5, 1e6] - - #synthetic_creators = {} - #for count in counts: + # synthetic_creators = {} + # for count in counts: # name = f'test_delay_{count}' - #env_creators.test = partial( + # env_creators.test = partial( # ocean.env_creator('performance_empiric'), # count_n=270_000, bandwidth=150_000 - #) + # ) - #timeout = 60 - #cores = psutil.cpu_count(logical=False) - #for key, creator in env_creators.items(): + # timeout = 60 + # cores = psutil.cpu_count(logical=False) + # for key, creator in env_creators.items(): # prof = profile_emulation(creator, timeout) # profile_vec(creator, cores, timeout, prof.puf.sps) # print() diff --git a/tests/test_pokemon_red.py b/tests/test_pokemon_red.py index 12ea02f70..28ba76ef5 100644 --- a/tests/test_pokemon_red.py +++ b/tests/test_pokemon_red.py @@ -4,6 +4,6 @@ ob, info = env.reset() for i in range(100): ob, reward, terminal, truncated, info = env.step(env.action_space.sample()) - print(f'Step: {i}, Info: {info}') + print(f"Step: {i}, Info: {info}") env.close() diff --git a/tests/test_policy_pool.py b/tests/test_policy_pool.py index 9f7a3689b..777f05c77 100644 --- a/tests/test_policy_pool.py +++ b/tests/test_policy_pool.py @@ -1,9 +1,8 @@ import unittest import numpy as np -import torch - import pufferlib.policy_pool as pp +import torch NUM_AGENTS = 4 NUM_ENVS = 2 @@ -25,10 +24,11 @@ def __call__(self, obs): values = torch.arange(batch_size, dtype=torch.float32) + 10 # add to make the values different return actions, logprobs, None, values + class MockPolicyStore: def __init__(self, num_policies): - self._policies = {f'Policy{i+1}': MockPolicy() for i in range(num_policies)} - self.path = 'mock_policy_store' + self._policies = {f"Policy{i + 1}": MockPolicy() for i in range(num_policies)} + self.path = "mock_policy_store" def policy_names(self): return list(self._policies.keys()) @@ -36,17 +36,18 @@ def policy_names(self): def get_policy(self, name): return self._policies[name] + class TestPolicyPool(unittest.TestCase): def setUp(self): self.mock_nonrecurrent_policy = MockPolicy() - self.mock_nonrecurrent_policy.name = 'BasePolicy1' + self.mock_nonrecurrent_policy.name = "BasePolicy1" self.nonrecurrent_policy_pool = pp.PolicyPool( policy=self.mock_nonrecurrent_policy, total_agents=POOL_AGENTS, atn_shape=(ACTION_DIM,), - device='cpu', + device="cpu", policy_store=MockPolicyStore(3), - kernel = [0, 1, 0, 2], + kernel=[0, 1, 0, 2], skip_ranker=True, ) @@ -67,28 +68,29 @@ def test_update_policies(self): policy_store = MockPolicyStore(0) policy_pool.update_policies(policy_ids=np.array([0, 1, 2]), store=policy_store) for pol in policy_pool.current_policies.values(): - self.assertEqual(pol['name'], 'learner') - self.assertEqual(pol['policy'], policy_pool.learner_policy) + self.assertEqual(pol["name"], "learner") + 
self.assertEqual(pol["policy"], policy_pool.learner_policy) # Sample 2 policies when there is only one policy in the policy store # Both policies should be Policy1 policy_store = MockPolicyStore(1) policy_pool.update_policies(policy_ids=np.array([0, 1, 2]), store=policy_store) for pol in policy_pool.current_policies.values(): - self.assertEqual(pol['name'], 'Policy1') - self.assertEqual(pol['policy'], policy_store.get_policy('Policy1')) + self.assertEqual(pol["name"], "Policy1") + self.assertEqual(pol["policy"], policy_store.get_policy("Policy1")) # Sample 3 policies when there are 10 policies in the policy store # All sampled policies should be different policy_store = MockPolicyStore(10) policy_pool.update_policies(policy_ids=np.array([0, 1, 2, 3]), store=policy_store) - self.assertEqual(len(set(p['name'] for p in policy_pool.current_policies.values())), 3) + self.assertEqual(len(set(p["name"] for p in policy_pool.current_policies.values())), 3) # Use all_selector policy_store = MockPolicyStore(5) - policy_pool.update_policies(policy_ids=np.array([0, 1, 2, 3, 4, 5]), store=policy_store, - policy_selector=pp.AllPolicySelector(seed=0)) - self.assertEqual(len(set(p['name'] for p in policy_pool.current_policies.values())), 5) + policy_pool.update_policies( + policy_ids=np.array([0, 1, 2, 3, 4, 5]), store=policy_store, policy_selector=pp.AllPolicySelector(seed=0) + ) + self.assertEqual(len(set(p["name"] for p in policy_pool.current_policies.values())), 5) def test_nonrecurrent_forward(self): policy_pool = self.nonrecurrent_policy_pool @@ -98,8 +100,7 @@ def test_nonrecurrent_forward(self): for policy_id in policy_pool.policy_ids: samp = policy_pool.sample_idxs[policy_id] - policy = policy_pool.learner_policy if policy_id == 0 \ - else policy_pool.current_policies[policy_id]['policy'] + policy = policy_pool.learner_policy if policy_id == 0 else policy_pool.current_policies[policy_id]["policy"] atn1, lgprob1, _, val1 = policy(obs[samp]) self.assertTrue(torch.equal(atn[samp], atn1)) @@ -110,20 +111,23 @@ def test_update_scores(self): policy_pool = self.nonrecurrent_policy_pool # With the kernel [0, 1, 0, 2], agents 1 and 3 are learner, and agents 2 and 4 are different - infos = [{1: {'return': 1}, 2: {'return': 2}, 3: {'return': 3}, 4: {'return': 4}}, - {1: {'return': 10}, 2: {'return': 20}, 4: {'return': 40}}] + infos = [ + {1: {"return": 1}, 2: {"return": 2}, 3: {"return": 3}, 4: {"return": 4}}, + {1: {"return": 10}, 2: {"return": 20}, 4: {"return": 40}}, + ] pol1_name = policy_pool._get_policy_name(2) pol2_name = policy_pool._get_policy_name(4) - policy_infos = policy_pool.update_scores(infos, 'return') - self.assertEqual(policy_infos['learner'], [{'return': 1}, {'return': 3}, {'return': 10}]) - self.assertEqual(policy_infos[pol1_name], [{'return': 2}, {'return': 20}]) - self.assertEqual(policy_infos[pol2_name], [{'return': 4}, {'return': 40}]) + policy_infos = policy_pool.update_scores(infos, "return") + self.assertEqual(policy_infos["learner"], [{"return": 1}, {"return": 3}, {"return": 10}]) + self.assertEqual(policy_infos[pol1_name], [{"return": 2}, {"return": 20}]) + self.assertEqual(policy_infos[pol2_name], [{"return": 4}, {"return": 40}]) # policy_pool.scores only keep the last game's results - self.assertEqual(policy_pool.scores['learner'], 10) + self.assertEqual(policy_pool.scores["learner"], 10) self.assertEqual(policy_pool.scores[pol1_name], 20) self.assertEqual(policy_pool.scores[pol2_name], 40) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git 
a/tests/test_puffernet.py b/tests/test_puffernet.py index 7f90b4261..2fd3608e4 100644 --- a/tests/test_puffernet.py +++ b/tests/test_puffernet.py @@ -1,38 +1,42 @@ -import torch import numpy as np - +import torch from pufferlib import puffernet # TODO: Should probably add a safe mode that type checks input arrays # It's user error, but it is a big foot gun + def make_dummy_data(*shape, seed=42): np.random.seed(seed) n = np.prod(shape) ary = np.random.rand(*shape).astype(np.float32) - 0.5 return np.ascontiguousarray(ary) + def make_dummy_int_data(num_classes, *shape, seed=42): np.random.seed(seed) n = np.prod(shape) ary = np.random.randint(0, num_classes, shape).astype(np.int32) return np.ascontiguousarray(ary) + def assert_near(a, b): assert a.shape == b.shape assert np.all(np.abs(a - b) < 1e-4) + def test_puffernet_relu(batch_size=16, input_size=128): input_puffer = make_dummy_data(batch_size, input_size) input_torch = torch.from_numpy(input_puffer) output_torch = torch.relu(input_torch).detach() - + # PufferNet done second because it is in-place on the input - puffernet.puf_relu(input_puffer, input_puffer, batch_size*input_size) + puffernet.puf_relu(input_puffer, input_puffer, batch_size * input_size) assert_near(input_puffer, output_torch.numpy()) + def test_puffernet_sigmoid(n=1024, epsilon=1e-4): input_np = make_dummy_data(n) @@ -44,13 +48,13 @@ def test_puffernet_sigmoid(n=1024, epsilon=1e-4): out_puffer = puffernet.puf_sigmoid(input_np[i]) assert abs(out_puffer - out_torch) < epsilon + def test_puffernet_linear_layer(batch_size=16, input_size=128, hidden_size=128): input_np = make_dummy_data(batch_size, input_size, seed=42) weights_np = make_dummy_data(hidden_size, input_size, seed=43) bias_np = make_dummy_data(hidden_size, seed=44) output_puffer = np.zeros((batch_size, hidden_size), dtype=np.float32) - puffernet.puf_linear_layer(input_np, weights_np, bias_np, output_puffer, - batch_size, input_size, hidden_size) + puffernet.puf_linear_layer(input_np, weights_np, bias_np, output_puffer, batch_size, input_size, hidden_size) input_torch = torch.from_numpy(input_np) weights_torch = torch.from_numpy(weights_np) @@ -62,16 +66,29 @@ def test_puffernet_linear_layer(batch_size=16, input_size=128, hidden_size=128): assert_near(output_puffer, output_torch.numpy()) -def test_puffernet_convolution_layer(batch_size=16, in_width=11, in_height=11, - in_channels=19, out_channels=32, kernel_size=5, stride=3): + +def test_puffernet_convolution_layer( + batch_size=16, in_width=11, in_height=11, in_channels=19, out_channels=32, kernel_size=5, stride=3 +): input_np = make_dummy_data(batch_size, in_channels, in_height, in_width) weights_np = make_dummy_data(out_channels, in_channels, kernel_size, kernel_size) bias_np = make_dummy_data(out_channels) - out_height = int((in_height - kernel_size)/stride + 1) - out_width = int((in_width - kernel_size)/stride + 1) + out_height = int((in_height - kernel_size) / stride + 1) + out_width = int((in_width - kernel_size) / stride + 1) output_puffer = np.zeros((batch_size, out_channels, out_height, out_width), dtype=np.float32) - puffernet.puf_convolution_layer(input_np, weights_np, bias_np, output_puffer, - batch_size, in_width, in_height, in_channels, out_channels, kernel_size, stride) + puffernet.puf_convolution_layer( + input_np, + weights_np, + bias_np, + output_puffer, + batch_size, + in_width, + in_height, + in_channels, + out_channels, + kernel_size, + stride, + ) input_torch = torch.from_numpy(input_np) weights_torch = torch.from_numpy(weights_np) @@ -83,15 
+100,16 @@ def test_puffernet_convolution_layer(batch_size=16, in_width=11, in_height=11, assert_near(output_puffer, output_torch.numpy()) + def test_puffernet_lstm(batch_size=16, input_size=128, hidden_size=128): input_np = make_dummy_data(batch_size, input_size, seed=42) state_h_np = make_dummy_data(batch_size, hidden_size, seed=43) state_c_np = make_dummy_data(batch_size, hidden_size, seed=44) - weights_input_np = make_dummy_data(4*hidden_size, input_size, seed=45) - weights_state_np = make_dummy_data(4*hidden_size, hidden_size, seed=46) - bias_input_np = make_dummy_data(4*hidden_size, seed=47) - bias_state_np = make_dummy_data(4*hidden_size, seed=48) - buffer_np = make_dummy_data(4*batch_size*hidden_size, seed=49) + weights_input_np = make_dummy_data(4 * hidden_size, input_size, seed=45) + weights_state_np = make_dummy_data(4 * hidden_size, hidden_size, seed=46) + bias_input_np = make_dummy_data(4 * hidden_size, seed=47) + bias_state_np = make_dummy_data(4 * hidden_size, seed=48) + buffer_np = make_dummy_data(4 * batch_size * hidden_size, seed=49) input_torch = torch.from_numpy(input_np).view(1, batch_size, input_size) state_h_torch = torch.from_numpy(state_h_np).view(1, batch_size, hidden_size) @@ -110,19 +128,29 @@ def test_puffernet_lstm(batch_size=16, input_size=128, hidden_size=128): state_c_torch = state_c_torch.detach() # PufferNet done second because it is in-place on the state vars - puffernet.puf_lstm(input_np, state_h_np, state_c_np, weights_input_np, - weights_state_np, bias_input_np, bias_state_np, buffer_np, - batch_size, input_size, hidden_size) + puffernet.puf_lstm( + input_np, + state_h_np, + state_c_np, + weights_input_np, + weights_state_np, + bias_input_np, + bias_state_np, + buffer_np, + batch_size, + input_size, + hidden_size, + ) assert_near(state_h_np, state_h_torch.numpy()[0]) assert_near(state_c_np, state_c_torch.numpy()[0]) + def test_puffernet_embedding(batch_size=16, num_embeddings=128, embedding_dim=32): input_np = make_dummy_int_data(num_embeddings, batch_size, seed=42) weights_np = make_dummy_data(num_embeddings, embedding_dim, seed=43) output_puffer = np.zeros((batch_size, embedding_dim), dtype=np.float32) - puffernet.puf_embedding(input_np, weights_np, output_puffer, - batch_size, num_embeddings, embedding_dim) + puffernet.puf_embedding(input_np, weights_np, output_puffer, batch_size, num_embeddings, embedding_dim) input_torch = torch.from_numpy(input_np).long() weights_torch = torch.from_numpy(weights_np) @@ -136,6 +164,7 @@ def test_puffernet_embedding(batch_size=16, num_embeddings=128, embedding_dim=32 assert_near(output_puffer, output_torch.numpy()) + def test_puffernet_one_hot(batch_size=16, input_size=128, num_classes=10): input_np = make_dummy_int_data(num_classes, batch_size, input_size) output_puffer = np.zeros((batch_size, input_size, num_classes), dtype=np.int32) @@ -146,6 +175,7 @@ def test_puffernet_one_hot(batch_size=16, input_size=128, num_classes=10): assert_near(output_puffer, output_torch.numpy()) + def test_puffernet_cat_dim1(batch_size=16, x_size=32, y_size=64): x_np = make_dummy_data(batch_size, x_size) y_np = make_dummy_data(batch_size, y_size) @@ -158,7 +188,8 @@ def test_puffernet_cat_dim1(batch_size=16, x_size=32, y_size=64): assert_near(output_puffer, output_torch.numpy()) -def test_puffernet_argmax_multidiscrete(batch_size=16, logit_sizes=[5,7,2]): + +def test_puffernet_argmax_multidiscrete(batch_size=16, logit_sizes=[5, 7, 2]): logit_sizes = np.array(logit_sizes).astype(np.int32) num_actions = len(logit_sizes) input_np = 
make_dummy_data(batch_size, logit_sizes.sum()) @@ -171,7 +202,8 @@ def test_puffernet_argmax_multidiscrete(batch_size=16, logit_sizes=[5,7,2]): assert_near(output_puffer, output_torch.numpy()) -if __name__ == '__main__': + +if __name__ == "__main__": test_puffernet_relu() test_puffernet_sigmoid() test_puffernet_linear_layer() diff --git a/tests/test_pytorch.py b/tests/test_pytorch.py index 9b72f195e..6185f8992 100644 --- a/tests/test_pytorch.py +++ b/tests/test_pytorch.py @@ -2,11 +2,10 @@ import gymnasium as gym import numpy as np -import torch -import pytest - import pufferlib import pufferlib.emulation +import pytest +import torch from pufferlib.pytorch import NativeDType, nativize_dtype, nativize_tensor @@ -109,7 +108,10 @@ np.dtype( [ ("xx", np.float32, (1, 2)), - ("yy", [("aa", np.uint8, (7, 7)), ("bb", np.int32, (2, 3))],), + ( + "yy", + [("aa", np.uint8, (7, 7)), ("bb", np.int32, (2, 3))], + ), ], align=True, ), @@ -123,9 +125,7 @@ ), ], ) -def test_nativize_dtype( - observation_dtype: np.array, emulated_dtype: np.array, expected: NativeDType -): +def test_nativize_dtype(observation_dtype: np.array, emulated_dtype: np.array, expected: NativeDType): assert expected == nativize_dtype( pufferlib.namespace( observation_dtype=observation_dtype, @@ -172,18 +172,14 @@ def test_nativize_dtype( ) def test_nativize_tensor(space: gym.spaces.Space, sample_dtype: np.dtype): emulated_dtype = pufferlib.emulation.dtype_from_space(space) - observation_space, observation_dtype = ( - pufferlib.emulation.emulate_observation_space(space) - ) + observation_space, observation_dtype = pufferlib.emulation.emulate_observation_space(space) native_dtype = nativize_dtype( pufferlib.namespace( observation_dtype=sample_dtype, emulated_observation_dtype=emulated_dtype, ) ) - flat = np.zeros(observation_space.shape, dtype=observation_space.dtype).view( - observation_dtype - ) + flat = np.zeros(observation_space.shape, dtype=observation_space.dtype).view(observation_dtype) structured = space.sample() pufferlib.emulation.emulate(flat, structured) @@ -204,8 +200,7 @@ def flatten(inp: Any | Dict[str, Any]) -> List[Any | Tuple[str, Any]]: observation = torch.tensor(flat.view(observation_space.dtype)).unsqueeze(0) nativized_tensor = nativize_tensor(observation, native_dtype) assert all( - nx == ny and np.all(vx == vy) - for (nx, vx), (ny, vy) in zip(flatten(nativized_tensor), flatten(structured)) + nx == ny and np.all(vx == vy) for (nx, vx), (ny, vy) in zip(flatten(nativized_tensor), flatten(structured), strict=False) ) explain_out = torch._dynamo.explain(nativize_tensor)(observation, native_dtype) assert len(explain_out.break_reasons) == 0 diff --git a/tests/test_record_array.py b/tests/test_record_array.py index 094ebb54a..d140603ad 100644 --- a/tests/test_record_array.py +++ b/tests/test_record_array.py @@ -2,19 +2,25 @@ import numpy as np # Create a custom Gym space using Dict, Tuple, and Box -space = gym.spaces.Dict({ - "position": gym.spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32), - "velocity": gym.spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32), - "description": gym.spaces.Tuple(( - #gym.spaces.Discrete(10), - gym.spaces.Box(low=0, high=100, shape=(), dtype=np.int32), - gym.spaces.Box(low=0, high=100, shape=(), dtype=np.int32) - )) -}) +space = gym.spaces.Dict( + { + "position": gym.spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32), + "velocity": gym.spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32), + "description": gym.spaces.Tuple( + ( + # 
gym.spaces.Discrete(10), + gym.spaces.Box(low=0, high=100, shape=(), dtype=np.int32), + gym.spaces.Box(low=0, high=100, shape=(), dtype=np.int32), + ) + ), + } +) -space = gym.spaces.Dict({ - "position": gym.spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32), -}) +space = gym.spaces.Dict( + { + "position": gym.spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32), + } +) # Define a function to create a dtype from the Gym space @@ -23,13 +29,16 @@ def create_dtype_from_space(space): dtype_fields = [(name, create_dtype_from_space(subspace)) for name, subspace in space.spaces.items()] return np.dtype(dtype_fields) elif isinstance(space, gym.spaces.Tuple): - dtype_fields = [('field' + str(i), create_dtype_from_space(subspace)) for i, subspace in enumerate(space.spaces)] + dtype_fields = [ + ("field" + str(i), create_dtype_from_space(subspace)) for i, subspace in enumerate(space.spaces) + ] return np.dtype(dtype_fields) elif isinstance(space, gym.spaces.Box): return (space.dtype, space.shape) elif isinstance(space, gym.spaces.Discrete): return np.int64 # Assuming np.int64 for Discrete spaces + # Compute the dtype from the space space_dtype = create_dtype_from_space(space) @@ -37,10 +46,12 @@ def create_dtype_from_space(space): breakpoint() np.rec.array(sample, dtype=space_dtype) + # Function to sample from the space and convert to a structured numpy array def sample_and_convert(space, dtype): sample = space.sample() flat_sample = {} + def flatten(sample, name_prefix=""): for key, item in sample.items(): full_key = name_prefix + key if name_prefix == "" else name_prefix + "_" + key @@ -48,9 +59,11 @@ def flatten(sample, name_prefix=""): flatten(item, full_key) else: flat_sample[full_key] = item + flatten(sample) return np.array(tuple(flat_sample.values()), dtype=dtype) + num_samples = 3 samples = [sample_and_convert(space, space_dtype) for _ in range(num_samples)] print("Samples:", samples) diff --git a/tests/test_record_emulation.py b/tests/test_record_emulation.py index 38a1cf23f..20dea5cc2 100644 --- a/tests/test_record_emulation.py +++ b/tests/test_record_emulation.py @@ -1,8 +1,7 @@ -import pufferlib.emulation from pufferlib.environments.ocean import env_creator -env = env_creator('spaces')() +env = env_creator("spaces")() env.reset() -env.step([1,0]) +env.step([1, 0]) breakpoint() diff --git a/tests/test_render.py b/tests/test_render.py index f8bb38309..81b5357b8 100644 --- a/tests/test_render.py +++ b/tests/test_render.py @@ -1,4 +1,3 @@ -from pdb import set_trace as T import argparse import importlib @@ -6,20 +5,19 @@ import cv2 - # Tested human: classic_control, atari, minigrid # Tested rbg_array: atari, pokemon_red, crafter # Tested ansii: minihack, nethack, squared -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--env', type=str, default='atari') - parser.add_argument('--render-mode', type=str, default='rgb_array') + parser.add_argument("--env", type=str, default="atari") + parser.add_argument("--render-mode", type=str, default="rgb_array") args = parser.parse_args() - env_module = importlib.import_module(f'pufferlib.environments.{args.env}') + env_module = importlib.import_module(f"pufferlib.environments.{args.env}") - if args.render_mode == 'human': - env = env_module.env_creator()(render_mode='human') + if args.render_mode == "human": + env = env_module.env_creator()(render_mode="human") else: env = env_module.env_creator()() @@ -29,16 +27,16 @@ if terminal or truncated: ob, _ = env.reset() - if 
args.render_mode == 'rgb_array': + if args.render_mode == "rgb_array": frame = env.render() frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) - #if ob.shape[0] in (1, 3, 4): + # if ob.shape[0] in (1, 3, 4): # ob = ob.transpose(1, 2, 0) - cv2.imshow('frame', frame) + cv2.imshow("frame", frame) - #cv2.imshow('ob', ob) + # cv2.imshow('ob', ob) cv2.waitKey(1) - elif args.render_mode == 'ansi': + elif args.render_mode == "ansi": chars = env.render() print("\033c", end="") print(chars) @@ -48,6 +46,5 @@ ob, reward, terminal, truncated, info = env.step(action) env.render() start = time.time() - if time.time() - start < 1/60: - time.sleep(1/60 - (time.time() - start)) - + if time.time() - start < 1 / 60: + time.sleep(1 / 60 - (time.time() - start)) diff --git a/tests/test_rich.py b/tests/test_rich.py index e54c5663d..9b912f7c0 100644 --- a/tests/test_rich.py +++ b/tests/test_rich.py @@ -1,46 +1,36 @@ -import psutil -import GPUtil import time -import sys +import GPUtil +import psutil import rich from rich.console import Console from rich.layout import Layout -from rich.live import Live +from rich.progress import BarColumn, MofNCompleteColumn, Progress, TextColumn from rich.table import Table -from rich.panel import Panel -from rich.progress import Progress, BarColumn, TextColumn, MofNCompleteColumn - -#import pufferlib - -ROUND_OPEN = rich.box.Box( - "╭──╮\n" - "│ │\n" - "│ │\n" - "│ │\n" - "│ │\n" - "│ │\n" - "│ │\n" - "╰──╯\n" -) - -c1 = '[bright_cyan]' -c2 = '[white]' -c3 = '[cyan]' -b1 = '[bright_cyan]' -b2 = '[bright_white]' + +# import pufferlib + +ROUND_OPEN = rich.box.Box("╭──╮\n│ │\n│ │\n│ │\n│ │\n│ │\n│ │\n╰──╯\n") + +c1 = "[bright_cyan]" +c2 = "[white]" +c3 = "[cyan]" +b1 = "[bright_cyan]" +b2 = "[bright_white]" + def abbreviate(num): if num < 1e3: return f"{num:.0f}" elif num < 1e6: - return f"{num/1e3:.1f}k" + return f"{num / 1e3:.1f}k" elif num < 1e9: - return f"{num/1e6:.1f}m" + return f"{num / 1e6:.1f}m" elif num < 1e12: - return f"{num/1e9:.1f}b" + return f"{num / 1e9:.1f}b" else: - return f"{num/1e12:.1f}t" + return f"{num / 1e12:.1f}t" + def duration(seconds): h = seconds // 3600 @@ -48,6 +38,7 @@ def duration(seconds): s = seconds % 60 return f"{h}h {m}m {s}s" if h else f"{m}m {s}s" if m else f"{s}s" + def print_dashboard(performance_data, loss_data, user_data, min_interval=0.25, last_print=[0]): console = Console() @@ -63,40 +54,40 @@ def print_dashboard(performance_data, loss_data, user_data, min_interval=0.25, l util.add_column(justify="center") util.add_column(justify="right") util.add_row( - f':blowfish: {c1}PufferLib {b2}1.0.0', - f'{c1}CPU: {c3}{cpu_percent:.1f}%', - f'{c1}GPU: {c3}{gpu_percent:.1f}%', - f'{c1}DRAM: {c3}{dram_percent:.1f}%', - f'{c1}VRAM: {c3}{vram_percent:.1f}%', + f":blowfish: {c1}PufferLib {b2}1.0.0", + f"{c1}CPU: {c3}{cpu_percent:.1f}%", + f"{c1}GPU: {c3}{gpu_percent:.1f}%", + f"{c1}DRAM: {c3}{dram_percent:.1f}%", + f"{c1}VRAM: {c3}{vram_percent:.1f}%", ) - - summary= Table(box=None, expand=True) - summary.add_column(f"{c1}Summary", justify='left', vertical='top') - summary.add_column(f"{c1}Value", justify='right', vertical='top') - summary.add_row(f'{c2}Epoch', f'{b2}{performance.epoch}') - summary.add_row(f'{c2}Uptime', f'{b2}{duration(performance.uptime)}') + + summary = Table(box=None, expand=True) + summary.add_column(f"{c1}Summary", justify="left", vertical="top") + summary.add_column(f"{c1}Value", justify="right", vertical="top") + summary.add_row(f"{c2}Epoch", f"{b2}{performance.epoch}") + summary.add_row(f"{c2}Uptime", 
f"{b2}{duration(performance.uptime)}") estimated_time = performance.total_steps / performance.sps - summary.add_row(f'{c2}Estim', f'{b2}{duration(estimated_time)}') - summary.add_row(f'{c2}Agent Steps', f'{b2}{abbreviate(performance.agent_steps)}') - summary.add_row(f'{c2}Steps/sec', f'{b2}{abbreviate(performance.sps)}') - summary.add_row(f'{c2}sec/Batch', f'{b2}{performance.epoch_time:.2f}') - + summary.add_row(f"{c2}Estim", f"{b2}{duration(estimated_time)}") + summary.add_row(f"{c2}Agent Steps", f"{b2}{abbreviate(performance.agent_steps)}") + summary.add_row(f"{c2}Steps/sec", f"{b2}{abbreviate(performance.sps)}") + summary.add_row(f"{c2}sec/Batch", f"{b2}{performance.epoch_time:.2f}") + perf = Table(box=None, expand=True) perf.add_column(f"{c1}Performance", justify="left", ratio=1.0) perf.add_column(f"{c1}Time", justify="right", ratio=0.5) - perf.add_row(f'{c2}Training', f'{b2}{performance.epoch_train_time:.2f}') - perf.add_row(f'{c2}Evaluation', f'{b2}{performance.epoch_eval_time:.2f}') - perf.add_row(f'{c2}Environment', f'{b2}{performance.epoch_env_time:.2f}') - perf.add_row(f'{c2}Forward', f'{b2}{performance.epoch_forward_time:.2f}') - perf.add_row(f'{c2}Misc', f'{b2}{performance.epoch_misc_time:.2f}') - perf.add_row(f'{c2}Allocation', f'{b2}{performance.epoch_alloc_time:.2f}') - perf.add_row(f'{c2}Backward', f'{b2}{performance.epoch_backward_time:.2f}') + perf.add_row(f"{c2}Training", f"{b2}{performance.epoch_train_time:.2f}") + perf.add_row(f"{c2}Evaluation", f"{b2}{performance.epoch_eval_time:.2f}") + perf.add_row(f"{c2}Environment", f"{b2}{performance.epoch_env_time:.2f}") + perf.add_row(f"{c2}Forward", f"{b2}{performance.epoch_forward_time:.2f}") + perf.add_row(f"{c2}Misc", f"{b2}{performance.epoch_misc_time:.2f}") + perf.add_row(f"{c2}Allocation", f"{b2}{performance.epoch_alloc_time:.2f}") + perf.add_row(f"{c2}Backward", f"{b2}{performance.epoch_backward_time:.2f}") losses = Table(box=None, expand=True) - losses.add_column(f'{c1}Losses', justify="left", ratio=1.0) - losses.add_column(f'{c1}Value', justify="right", ratio=0.5) + losses.add_column(f"{c1}Losses", justify="left", ratio=1.0) + losses.add_column(f"{c1}Value", justify="right", ratio=0.5) for metric, value in loss_data.items(): - losses.add_row(f'{c2}{metric}', f'{b2}{value}') + losses.add_row(f"{c2}{metric}", f"{b2}{value}") monitor = Table(box=None, expand=True, pad_edge=False) monitor.add_row(summary, perf, losses) @@ -106,16 +97,16 @@ def print_dashboard(performance_data, loss_data, user_data, min_interval=0.25, l user2 = Table(box=None, expand=True) user.add_row(user1, user2) user1.add_column(f"{c1}User Stats", justify="left", ratio=1.0) - user1.add_column(f"{c1}Value", justify="right",ratio=1.0) + user1.add_column(f"{c1}Value", justify="right", ratio=1.0) user2.add_column(f"{c1}User Stats", justify="left", ratio=1.0) - user2.add_column(f"{c1}Value", justify="right",ratio=1.0) + user2.add_column(f"{c1}Value", justify="right", ratio=1.0) i = 0 for metric, value in user_data.items(): u = user1 if i % 2 == 0 else user2 - u.add_row(f'{c2}{metric}', f'{b2}{value}') + u.add_row(f"{c2}{metric}", f"{b2}{value}") i += 1 - table = Table(box=ROUND_OPEN, expand=True, show_header=False, width=80, border_style='bright_cyan') + table = Table(box=ROUND_OPEN, expand=True, show_header=False, width=80, border_style="bright_cyan") table.add_row(util) table.add_row(monitor) table.add_row(user) @@ -132,9 +123,9 @@ def __init__(self): Layout(name="utilization", size=5), Layout(name="monitoring"), ) - + self.layout = layout - ''' + """ 
layout.split( Layout(name="utilization", size=5), Layout(name="puffer", size=2), @@ -148,13 +139,13 @@ def __init__(self): layout["user"].split_row( Layout(name="user_stats") ) - ''' + """ utilization = Progress( BarColumn(bar_width=None, style="bar.back", complete_style="bar.complete"), TextColumn("[progress.description]{task.description}"), MofNCompleteColumn(), - expand=True + expand=True, ) self.cpu_task = utilization.add_task("[cyan]CPU", total=100) self.gpu_task = utilization.add_task("[red]GPU", total=100) @@ -163,8 +154,8 @@ def __init__(self): self.layout["utilization"].update(utilization) self.utilization = utilization - #self.live = Live(self.layout, console=self.console)#, auto_refresh=4) - #self.live.start() + # self.live = Live(self.layout, console=self.console)#, auto_refresh=4) + # self.live.start() def _update_utilization(self): self.utilization.update(self.cpu_task, completed=psutil.cpu_percent()) @@ -177,10 +168,12 @@ def _update_utilization(self): self.utilization.update(self.gpu_task, completed=0) self.utilization.update(self.vram_task, completed=0) - #self.layout['puffer'].update(f':blowfish: PufferLib {pufferlib.__version__}') - #self.layout['puffer'].update(f':blowfish: PufferLib 1.0.0') + # self.layout['puffer'].update(f':blowfish: PufferLib {pufferlib.__version__}') + # self.layout['puffer'].update(f':blowfish: PufferLib 1.0.0') - def update(self, total_uptime, estimated_time, total_steps, steps_per_second, performance_data, loss_data, user_data): + def update( + self, total_uptime, estimated_time, total_steps, steps_per_second, performance_data, loss_data, user_data + ): topline = self.update_topline(total_uptime, estimated_time, total_steps, steps_per_second) performance = self.update_performance(performance_data) losses = self.update_losses(loss_data) @@ -188,19 +181,18 @@ def update(self, total_uptime, estimated_time, total_steps, steps_per_second, pe megatable = Table(box=ROUND_OPEN, expand=True, show_header=False) megatable.add_row(topline) - megatable.add_row('') + megatable.add_row("") perf = Table(box=None, expand=True) perf.add_column(performance, ratio=1.0) perf.add_column(losses, ratio=1.0) - #megatable.add_row(performance) - #megatable.add_row(losses) + # megatable.add_row(performance) + # megatable.add_row(losses) megatable.add_row(perf) - megatable.add_row('') + megatable.add_row("") megatable.add_row(user) self.layout["monitoring"].update(megatable) self.console.clear() - self.console.print(self.layout) - + self.console.print(self.layout) def update_topline(self, total_uptime, estimated_time, total_steps, steps_per_second): table = Table(box=None, expand=True) @@ -208,15 +200,15 @@ def update_topline(self, total_uptime, estimated_time, total_steps, steps_per_se table.add_column(justify="center") table.add_column(justify="right") table.add_row( - f':blowfish: PufferLib 1.0.0', - f'[bold magenta]Uptime: [cyan]{total_uptime}/{estimated_time}(est)', - f'[bold magenta]Agent Steps: [cyan]{total_steps} ({steps_per_second}/s)' + ":blowfish: PufferLib 1.0.0", + f"[bold magenta]Uptime: [cyan]{total_uptime}/{estimated_time}(est)", + f"[bold magenta]Agent Steps: [cyan]{total_steps} ({steps_per_second}/s)", ) return table def update_performance(self, data): table = Table(box=None, expand=True) - #self.layout["performance"].update(table) + # self.layout["performance"].update(table) table.add_column("[bold magenta]Performance", justify="right", ratio=1.0) table.add_column("Latency", justify="left", style="cyan", ratio=1.0) for metric, value in data.items(): @@ 
-224,11 +216,11 @@ def update_performance(self, data): return table self.console.clear() - self.console.print(self.layout) + self.console.print(self.layout) def update_losses(self, data): table = Table(box=None, expand=True) - #self.layout["losses"].update(table) + # self.layout["losses"].update(table) table.add_column("[bold magenta]Losses", justify="right", ratio=1.0) table.add_column("Value", justify="left", style="bright_cyan", ratio=1.0) for metric, value in data.items(): @@ -238,54 +230,53 @@ def update_losses(self, data): return table self.console.clear() - self.console.print(self.layout) + self.console.print(self.layout) def update_user_stats(self, data): table = Table(box=None, expand=True) table.add_column("[bold magenta]User Stats", justify="right", style="bold yellow", ratio=1.0) - table.add_column("Value", justify="left",ratio=1.0) - #self.layout["user_stats"].update(table) + table.add_column("Value", justify="left", ratio=1.0) + # self.layout["user_stats"].update(table) for metric, value in data.items(): table.add_row(metric, str(value)) return table self.console.clear() - self.console.print(self.layout) + self.console.print(self.layout) -#dashboard = Dashboard() +# dashboard = Dashboard() # Update loop try: while True: - #dashboard._update_utilization() + # dashboard._update_utilization() topline = (5000, 100000, 102332, 1038, 1.3) performance = { - 'training': 0.7, - 'evaluation': 0.6, - 'environment': 0.2, - 'forward': 0.3, - 'misc': 0.1, - 'allocation': 0.2, - 'backward': 0.3, + "training": 0.7, + "evaluation": 0.6, + "environment": 0.2, + "forward": 0.3, + "misc": 0.1, + "allocation": 0.2, + "backward": 0.3, } losses = { - 'policy': 0.4, - 'value': 0.2, - 'entropy': 0.1, - 'old_approx_kl': 0.1, - 'approx_kl': 0.2, - 'clip_fraction': 0.1, - 'explained_variance': 0.3, + "policy": 0.4, + "value": 0.2, + "entropy": 0.1, + "old_approx_kl": 0.1, + "approx_kl": 0.2, + "clip_fraction": 0.1, + "explained_variance": 0.3, } user_stats = { - 'time_alive': 128, - 'exploration': 0.1, - 'experience': 1000, + "time_alive": 128, + "exploration": 0.1, + "experience": 1000, } - #dashboard.update(*topline, performance, losses, user_stats) + # dashboard.update(*topline, performance, losses, user_stats) print_dashboard(*topline, performance, losses, user_stats) time.sleep(1) except KeyboardInterrupt: dashboard.stop() - diff --git a/tests/test_utils.py b/tests/test_utils.py index e193d0aaf..04a24f89c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,14 +1,16 @@ import sys -import gym +import gym import pufferlib import pufferlib.utils + def test_suppress(): with pufferlib.utils.Suppress(): - gym.make('Breakout-v4') - print('stdout (you should not see this)', file=sys.stdout) - print('stderr (you should not see this)', file=sys.stderr) + gym.make("Breakout-v4") + print("stdout (you should not see this)", file=sys.stdout) + print("stderr (you should not see this)", file=sys.stderr) + -if __name__ == '__main__': - test_suppress() \ No newline at end of file +if __name__ == "__main__": + test_suppress() diff --git a/tests/time_alloc.py b/tests/time_alloc.py index fd938d4ca..0ba752301 100644 --- a/tests/time_alloc.py +++ b/tests/time_alloc.py @@ -1,15 +1,17 @@ -import numpy as np import timeit +import numpy as np + # Time np.zeros(2, 5) for 100000 iterations -time_zeros = timeit.timeit('np.zeros((2, 5))', setup='import numpy as np', number=100000) +time_zeros = timeit.timeit("np.zeros((2, 5))", setup="import numpy as np", number=100000) # Pre-allocate the array preallocated_array = 
np.zeros((2, 5)) # Time setting the pre-allocated array to zero for 100000 iterations -time_preallocated = timeit.timeit('preallocated_array[:] = 0', setup='import numpy as np; preallocated_array = np.zeros((2, 5))', number=100000) +time_preallocated = timeit.timeit( + "preallocated_array[:] = 0", setup="import numpy as np; preallocated_array = np.zeros((2, 5))", number=100000 +) print(f"Time for np.zeros(2, 5) over 100000 iterations: {time_zeros} seconds") print(f"Time for preallocated *= 0 over 100000 iterations: {time_preallocated} seconds") -