diff --git a/assets/x8.xml b/assets/x8.xml index 3368f7b..bcc8bde 100644 --- a/assets/x8.xml +++ b/assets/x8.xml @@ -70,10 +70,10 @@ 0.8 0.5 - 0.1 + .02 8.0 - 0.85 - 0 + 2.85 + 360 NONE 0 @@ -85,10 +85,10 @@ 0.8 0.5 - 0.1 + 0.03 4.0 - 0.95 - 0.0 + 1.95 + 360 LEFT 0 @@ -100,10 +100,10 @@ 0.8 0.5 - 0.1 + 0.03 4.0 - 0.95 - 0.0 + 1.95 + 360 RIGHT 0 @@ -145,7 +145,7 @@ attitude/heading-true-rad - 0.003 + 0.0 diff --git a/data/ppo/stats.txt b/data/ppo/stats.txt new file mode 100644 index 0000000..e69de29 diff --git a/learning/autopilot.py b/learning/autopilot.py index 5cd3f47..d20c6ed 100644 --- a/learning/autopilot.py +++ b/learning/autopilot.py @@ -4,6 +4,9 @@ from tensordict.nn.distributions import NormalParamExtractor from tensordict.nn import TensorDictModule, InteractionType from torchrl.modules import ProbabilisticActor, TanhNormal, ValueOperator +from torch.distributions import Categorical +from tensordict.nn import CompositeDistribution +from learning.utils import CategoricalControlsExtractor import os """ @@ -20,7 +23,7 @@ def __init__(self): - 3 angular velocities: w_roll, w_pitch, w_yaw - 3d relative position of next waypoint: wx, wy, wz """ - self.inputs = 13 + self.inputs = 15 """ Action: @@ -36,7 +39,7 @@ def __init__(self): nn.Linear(self.inputs, self.inputs), nn.ReLU(), nn.Linear(self.inputs, self.outputs), - nn.Sigmoid(), + nn.Tanh(), ) # Returns the action selected and 0, representing the log-prob of the @@ -50,10 +53,10 @@ def get_control(self, action): Clamps various control outputs and sets the mean for control surfaces to 0. Assumes [action] is a 4-item tensor of throttle, aileron cmd, elevator cmd, rudder cmd. """ - action[0] = 0.5 * action[0] - action[1] = 0.1 * (action[1] - 0.5) - action[2] = 0.5 * (action[2] - 0.5) - action[3] = 0.5 * (action[3] - 0.5) + action[0] = 0.8 * (0.5*(action[0] + 1)) + action[1] = 0.1 * action[1] + action[2] = 0.4 * action[2] + action[3] = 0.1 * action[3] return action # flattened_params = flattened dx1 numpy array of all params to init from @@ -82,12 +85,11 @@ def init_from_params(self, flattened_params): layer1, nn.ReLU(), layer2, - nn.Sigmoid(), + nn.Tanh() ) - # Loads the network from dir/name.pth - def init_from_saved(self, dir, name): - path = os.path.join(dir, name + '.pth') + # Loads the network from path + def init_from_saved(self, path): self.policy_network = torch.load(path) # Saves the network to dir/name.pth @@ -125,8 +127,8 @@ def transform_from_deterministic_learner(self): self.policy_network[-2] = nn.Linear(self.inputs, self.outputs * 2) # Update the output layer to include the original weights and biases - self.policy_network[-2].weight = nn.Parameter(torch.cat((w, torch.ones(w.shape) * 100), 0)) - self.policy_network[-2].bias = nn.Parameter(torch.cat((b, torch.ones(b.shape) * 100), 0)) + self.policy_network[-2].weight = nn.Parameter(torch.cat((w, torch.zeros(w.shape)), 0)) + self.policy_network[-2].bias = nn.Parameter(torch.cat((b, torch.zeros(b.shape)), 0)) # Add a normal param extractor to the network to extract (means, sigmas) tuple self.policy_network.append(NormalParamExtractor()) @@ -141,10 +143,6 @@ def transform_from_deterministic_learner(self): module=policy_module, in_keys=["loc", "scale"], distribution_class=TanhNormal, - distribution_kwargs={ - "min": 0, # minimum control - "max": 1, # maximum control - }, default_interaction_type=InteractionType.RANDOM, return_log_prob=True, ) @@ -153,7 +151,6 @@ def transform_from_deterministic_learner(self): def get_action(self, observation): data = TensorDict({"observation": 
observation}, []) policy_forward = self.policy_module(data) - print("action", self.policy_network(observation)) return policy_forward["action"], policy_forward["sample_log_prob"] # NOTE: This initializes from *deterministic* learner parameters and picks @@ -173,10 +170,114 @@ def init_from_saved(self, path): module=policy_module, in_keys=["loc", "scale"], distribution_class=TanhNormal, - distribution_kwargs={ - "min": 0, # minimum control - "max": 1, # maximum control - }, default_interaction_type=InteractionType.RANDOM, return_log_prob=True, ) + +""" +Slew rate autopilot learner. +Each control (throttle/aileron/elevator/rudder) has three options: +stay constant, go down, or go up. +""" +class SlewRateAutopilotLearner: + def __init__(self): + self.inputs = 15 + self.outputs = 4 + + # Slew rates are wrt sim clock + self.throttle_slew_rate = 0.005 + self.aileron_slew_rate = 0.0001 + self.elevator_slew_rate = 0.00025 + self.rudder_slew_rate = 0.0001 + + self.policy_network = nn.Sequential( + nn.Linear(self.inputs, self.inputs), + nn.ReLU(), + nn.Linear(self.inputs, 3 * self.outputs), + nn.Sigmoid(), + CategoricalControlsExtractor() + ) + + self.instantiate_policy_module() + + def instantiate_policy_module(self): + policy_module = TensorDictModule(self.policy_network, in_keys=["observation"], out_keys=[("params", "throttle", "probs"),("params", "aileron", "probs"),("params", "elevator", "probs"),("params", "rudder", "probs")]) + self.policy_module = policy_module = ProbabilisticActor( + module=policy_module, + in_keys=["params"], + distribution_class=CompositeDistribution, + distribution_kwargs={ + "distribution_map": { + "throttle": Categorical, + "aileron": Categorical, + "elevator": Categorical, + "rudder": Categorical, + } + }, + default_interaction_type=InteractionType.RANDOM, + return_log_prob=True + ) + + # Returns the action selected and the log_prob of that action + def get_action(self, observation): + data = TensorDict({"observation": observation}, []) + policy_forward = self.policy_module(data) + action = torch.Tensor([policy_forward['throttle'], policy_forward['aileron'], policy_forward['elevator'], policy_forward['rudder']]) + return action, policy_forward["sample_log_prob"] + + # Always samples the mode of the output for each control + def get_deterministic_action(self, observation): + throttle_probs, aileron_probs, elevator_probs, rudder_probs = self.policy_network(observation) + action = torch.Tensor([torch.argmax(throttle_probs), torch.argmax(aileron_probs), torch.argmax(elevator_probs), torch.argmax(rudder_probs)]) + return action, 0 + + # Apply a -1 transformation to the action to create control tensor such that: + # -1 means go down, 0 means stay same, and +1 means go up + def get_control(self, action): + return action - 1 + + # flattened_params = flattened dx1 numpy array of all params to init from + # NOTE: the way the params are broken up into the weights/biases of each layer + # would need to be manually edited for changes in network architecture + def init_from_params(self, flattened_params): + flattened_params = torch.from_numpy(flattened_params).to(torch.float32) + + pl, pr = 0, 0 + layer1 = nn.Linear(self.inputs, self.inputs) + pr += layer1.weight.nelement() + layer1.weight = nn.Parameter(flattened_params[pl:pr].reshape(layer1.weight.shape)) + pl = pr + pr += layer1.bias.nelement() + layer1.bias = nn.Parameter(flattened_params[pl:pr].reshape(layer1.bias.shape)) + + layer2 = nn.Linear(self.inputs, 3*self.outputs) + pl = pr + pr += layer2.weight.nelement() + 
layer2.weight = nn.Parameter(flattened_params[pl:pr].reshape(layer2.weight.shape)) + pl = pr + pr += layer2.bias.nelement() + layer2.bias = nn.Parameter(flattened_params[pl:pr].reshape(layer2.bias.shape)) + + self.policy_network = nn.Sequential( + layer1, + nn.ReLU(), + layer2, + nn.Sigmoid(), + CategoricalControlsExtractor() + ) + self.instantiate_policy_module() + + # Loads the network from path + def init_from_saved(self, path): + self.policy_network = torch.load(path) + self.instantiate_policy_module() + + # Saves the network to dir/name.pth + def save(self, dir, name): + path = os.path.join(dir, name + '.pth') + torch.save(self.policy_network, path) + + + + + diff --git a/learning/crossentropy.py b/learning/crossentropy.py index 677fcf1..a768312 100644 --- a/learning/crossentropy.py +++ b/learning/crossentropy.py @@ -1,6 +1,6 @@ import torch import numpy as np -from learning.autopilot import AutopilotLearner +from learning.autopilot import AutopilotLearner, SlewRateAutopilotLearner from simulation.simulate import FullIntegratedSim from simulation.jsbsim_aircraft import x8 import os @@ -25,7 +25,7 @@ def __init__(self, learners, num_params): def init_using_torch_default(generation_size, num_params): learners = [] for i in range(generation_size): - learners.append(AutopilotLearner()) + learners.append(SlewRateAutopilotLearner()) return Generation(learners, num_params) # Utilizes [rewards], which contains the reward obtained by each learner as a @@ -89,12 +89,12 @@ def make_new_generation(mean, cov, generation_size, num_params): # Generate each learner from params learners = [] for param_list in selected_params: - l = AutopilotLearner() + l = SlewRateAutopilotLearner() l.init_from_params(param_list) learners.append(l) return Generation(learners, num_params) -def cross_entropy_train(epochs, generation_size, num_survive, num_params=238, sim_time=60.0, save_dir='cross_entropy'): +def cross_entropy_train(epochs, generation_size, num_survive, num_params=432, sim_time=60.0, save_dir='cross_entropy'): # Create save_dir (and if one already exists, rename it with some rand int) if os.path.exists(os.path.join('data', save_dir)): os.rename(os.path.join('data', save_dir), os.path.join('data', save_dir + '_old' + str(randint(0, 100000)))) @@ -117,7 +117,7 @@ def cross_entropy_train(epochs, generation_size, num_survive, num_params=238, si # Evaluate generation through rollouts rewards = [] for i in range(len(generation.learners)): - id = str(100*(epoch+1) + (i+1)) + id = str(1000*(epoch+1) + (i+1)) learner = generation.learners[i] # Run simulation to evaluate learner @@ -127,7 +127,7 @@ def cross_entropy_train(epochs, generation_size, num_survive, num_params=238, si integrated_sim.simulation_loop() # Acquire/save data - integrated_sim.mdp_data_collector.save(os.path.join(save_dir, 'generation' + str(epoch+1)), 'trajectory_learner#' + str(i+1)) + #integrated_sim.mdp_data_collector.save(os.path.join(save_dir, 'generation' + str(epoch+1)), 'trajectory_learner#' + str(i+1)) rewards.append(integrated_sim.mdp_data_collector.get_cum_reward()) print('Reward for Learner #', id, ': ', integrated_sim.mdp_data_collector.get_cum_reward()) @@ -149,4 +149,4 @@ def cross_entropy_train(epochs, generation_size, num_survive, num_params=238, si if __name__ == "__main__": os.environ["JSBSIM_DEBUG"]=str(0) # epochs, generation_size, num_survive - cross_entropy_train(100, 99, 50) \ No newline at end of file + cross_entropy_train(100, 200, 50) \ No newline at end of file diff --git a/learning/ppo.py b/learning/ppo.py index 
e012f67..76138c2 100644 --- a/learning/ppo.py +++ b/learning/ppo.py @@ -7,11 +7,14 @@ from torchrl.data.replay_buffers.storages import LazyTensorStorage from torchrl.objectives import ClipPPOLoss from torchrl.objectives.value import GAE -from learning.autopilot import StochasticAutopilotLearner +from learning.autopilot import StochasticAutopilotLearner, SlewRateAutopilotLearner from simulation.simulate import FullIntegratedSim from simulation.jsbsim_aircraft import x8 +import numpy as np import os +device = "cpu" if not torch.has_cuda else "cuda:0" + """ Gathers rollout data and returns it in the way the Proximal Policy Optimization loss_module expects """ @@ -24,15 +27,27 @@ def gather_rollout_data(autopilot_learner, policy_num, num_trajectories=100, sim sample_log_probs = torch.empty(0) rewards = torch.empty(0) dones = torch.empty(0, dtype=torch.bool) + best_cum_reward = -float('inf') + worst_cum_reward = float('inf') + total_cum_reward = 0 + total_timesteps = 0 for t in range(num_trajectories): integrated_sim = FullIntegratedSim(x8, autopilot_learner, sim_time) integrated_sim.simulation_loop() # Acquire data observation, next_observation, action, sample_log_prob, reward, done = integrated_sim.mdp_data_collector.get_trajectory_data() + cum_reward = integrated_sim.mdp_data_collector.get_cum_reward() + total_cum_reward += cum_reward + total_timesteps += reward.shape[0] - # Save data - # integrated_sim.mdp_data_collector.save(os.path.join('ppo', 'trajectories'), 'rollout' + str(policy_num * num_trajectories + t)) + # Save trajectory if worst or best so far + if cum_reward > best_cum_reward: + integrated_sim.mdp_data_collector.save(os.path.join('ppo', 'trajectories'), 'best_rollout#' + str(policy_num)) + best_cum_reward = cum_reward + if cum_reward < worst_cum_reward: + integrated_sim.mdp_data_collector.save(os.path.join('ppo', 'trajectories'), 'worst_rollout#' + str(policy_num)) + worst_cum_reward = cum_reward # Add to the data observations = torch.cat((observations, observation)) @@ -47,14 +62,34 @@ def gather_rollout_data(autopilot_learner, policy_num, num_trajectories=100, sim # that entry for one step in a rollout. 
data = TensorDict({ "observation": observations, - "action": actions.detach(), + "action": TensorDict({ + "throttle": actions[:,0].detach(), + "aileron": actions[:,1].detach(), + "elevator": actions[:,2].detach(), + "rudder": actions[:,3].detach(), + "throttle_log_prob": torch.zeros(data_size).detach(), + "aileron_log_prob": torch.zeros(data_size).detach(), + "elevator_log_prob": torch.zeros(data_size).detach(), + "rudder_log_prob": torch.zeros(data_size).detach(), + }, [data_size,]), "sample_log_prob": sample_log_probs.detach(), # log probability that each action was selected + # "sample_log_prob": TensorDict({ + # }, [data_size,]), # log probability that each action was selected ("next", "done"): dones, ("next", "terminated"): dones, ("next", "reward"): rewards, ("next", "observation"): next_observations, }, [data_size,]) + # Write to stats file + stats_file = open(os.path.join('data', 'ppo', 'stats.txt'), 'a') + stats_file.write('Policy Number #' + str(policy_num) + ':\n') + stats_file.write('Average Time Steps =' + str(total_timesteps/num_trajectories) + ':\n') + stats_file.write('Average Reward: ' + str(total_cum_reward/num_trajectories) + '\n') + stats_file.write('Best Reward: ' + str(best_cum_reward) + '\n') + stats_file.write('Worst Reward: ' + str(worst_cum_reward) + '\n') + stats_file.write('\n\n') + return data, data_size """ @@ -64,12 +99,12 @@ def gather_rollout_data(autopilot_learner, policy_num, num_trajectories=100, sim action taken during PPO learning. """ def make_value_estimator_module(n_obs): + global device + value_net = nn.Sequential( - nn.Linear(n_obs, n_obs), + nn.Linear(n_obs, 2*n_obs, device=device), nn.Tanh(), - nn.Linear(n_obs, n_obs), - nn.Tanh(), - nn.Linear(n_obs, 1), # one value is computed for the state + nn.Linear(2*n_obs, 1, device=device), # one value is computed for the state ) return ValueOperator( @@ -115,15 +150,16 @@ def train_ppo_once(policy_num, autopilot_learner, loss_module, advantage_module, for b in range(0, data_size // batch_size): # Gather batch and calculate PPO loss on it batch = replay_buffer.sample(batch_size) - loss_vals = loss_module(batch.to("cpu")) + loss_vals = loss_module(batch.to(device)) loss_value = ( loss_vals["loss_objective"] + loss_vals["loss_critic"] - + loss_vals["loss_entropy"] ) # Optimize via gradient descent with the optimizer loss_value.backward() + + torch.nn.utils.clip_grad_norm_(loss_module.parameters(), 1.0) optimizer.step() optimizer.zero_grad() @@ -131,17 +167,21 @@ def train_ppo_once(policy_num, autopilot_learner, loss_module, advantage_module, if __name__ == "__main__": # Parameters (see https://pytorch.org/rl/tutorials/coding_ppo.html#ppo-parameters) batch_size = 100 - num_epochs = 10 + num_epochs = 100 + device = "cpu" if not torch.has_cuda else "cuda:0" + clip_epsilon = 0.2 # for PPO loss gamma = 1.0 lmbda = 0.95 entropy_eps = 1e-4 - lr = 3e-4 - num_trajectories = 100 + lr = 1e-4 + num_trajectories = 200 num_policy_iterations = 100 # Build the modules - autopilot_learner = StochasticAutopilotLearner() + #autopilot_learner = StochasticAutopilotLearner() + autopilot_learner = SlewRateAutopilotLearner() + # autopilot_learner.init_from_params(np.random.normal(0, 1, 350)) value_module = make_value_estimator_module(autopilot_learner.inputs) advantage_module = GAE( gamma=gamma, lmbda=lmbda, value_network=value_module, average_gae=True @@ -150,8 +190,8 @@ def train_ppo_once(policy_num, autopilot_learner, loss_module, advantage_module, actor=autopilot_learner.policy_module, critic=value_module, 
clip_epsilon=clip_epsilon, - entropy_bonus=bool(entropy_eps), - entropy_coef=entropy_eps, + entropy_bonus=False, + entropy_coef=0, value_target_key=advantage_module.value_target_key, critic_coef=1.0, gamma=gamma, diff --git a/learning/utils.py b/learning/utils.py new file mode 100644 index 0000000..f0f2002 --- /dev/null +++ b/learning/utils.py @@ -0,0 +1,11 @@ +import torch +import torch.nn as nn + +class CategoricalControlsExtractor(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, *tensors: torch.Tensor) -> tuple[torch.Tensor, ...]: + tensor, *others = tensors + throttle, aileron, elevator, rudder = tensor.chunk(4, -1) + return (nn.functional.softmax(throttle), nn.functional.softmax(aileron), nn.functional.softmax(elevator), nn.functional.softmax(rudder), *others) diff --git a/scripts/sim_autopilot.py b/scripts/sim_autopilot.py index c54daef..060fe23 100644 --- a/scripts/sim_autopilot.py +++ b/scripts/sim_autopilot.py @@ -6,7 +6,7 @@ and rolls out a trajectory in sim for that autopilot learner. """ -from learning.autopilot import AutopilotLearner +from learning.autopilot import AutopilotLearner, SlewRateAutopilotLearner from simulation.simulate import FullIntegratedSim from simulation.jsbsim_aircraft import x8 import sys @@ -17,7 +17,7 @@ file = os.path.join(*(["data"] + policy_file_path.split('/'))) # Initialize autopilot -autopilot = AutopilotLearner() +autopilot = SlewRateAutopilotLearner() autopilot.init_from_saved(file) # Play sim diff --git a/simulation/jsbsim_simulator.py b/simulation/jsbsim_simulator.py index 9cc5a11..308f428 100644 --- a/simulation/jsbsim_simulator.py +++ b/simulation/jsbsim_simulator.py @@ -84,7 +84,8 @@ def __init__(self, aircraft: Aircraft = x8, init_conditions: Dict[prp.Property, float] = None, debug_level: int = 0): - self.fdm = jsbsim.FGFDMExec(root_dir=self.ROOT_DIR) + with HidePrints(): + self.fdm = jsbsim.FGFDMExec(root_dir=self.ROOT_DIR) self.fdm.set_debug_level(debug_level) self.sim_dt = 1.0 / sim_frequency_hz self.aircraft = aircraft @@ -94,7 +95,9 @@ def __init__(self, self.wall_clock_dt = None self.update_airsim(ignore_collisions=True) self.waypoint_id = 0 - self.waypoint_threshold = 2 + self.waypoint_threshold = 3 + self.takeoff_rewarded = False + self.completed_takeoff = False self.waypoint_rewarded = True self.waypoints = [] with open("waypoints.csv", 'r') as file: @@ -241,8 +244,9 @@ def airsim_connect() -> airsim.VehicleClient: :return: the airsim client object """ - client = airsim.VehicleClient() - client.confirmConnection() + with HidePrints(): + client = airsim.VehicleClient() + client.confirmConnection() return client def update_airsim(self, ignore_collisions = False) -> None: diff --git a/simulation/mdp.py b/simulation/mdp.py index 58a292f..fbced7e 100644 --- a/simulation/mdp.py +++ b/simulation/mdp.py @@ -4,6 +4,7 @@ states/observations, actions, and rewards. """ +import math import torch import simulation.jsbsim_properties as prp import numpy as np @@ -12,28 +13,29 @@ """ Extracts agent state data from the sim. 
""" -def state_from_sim(sim, debug=False): - state = torch.zeros(13,) +def state_from_sim(sim): + state = torch.zeros(15,) FT_TO_M = 0.3048 # altitude state[0] = sim[prp.altitude_sl_ft] * FT_TO_M # z - # velocity - state[1] = sim[prp.v_north_fps] * FT_TO_M # x velocity - state[2] = sim[prp.v_east_fps] * FT_TO_M # y velocity - state[3] = -sim[prp.v_down_fps] * FT_TO_M # z velocity - - # angles - state[4] = sim[prp.roll_rad] # roll - state[5] = sim[prp.pitch_rad] # pitch - state[6] = sim[prp.heading_rad] # yaw + # speed + state[1] = FT_TO_M * np.linalg.norm(np.array([sim[prp.v_north_fps], sim[prp.v_east_fps], -sim[prp.v_down_fps]])) + + # roll + state[2] = sim[prp.roll_rad] + # pitch + state[3] = sim[prp.pitch_rad] + # angle rates - state[7] = sim[prp.p_radps] # roll rate - state[8] = sim[prp.q_radps] # pitch rate - state[9] = sim[prp.r_radps] # yaw rate + state[4] = sim[prp.p_radps] # roll rate + state[5] = sim[prp.q_radps] # pitch rate + state[6] = sim[prp.r_radps] # yaw rate + + state[7] = -sim[prp.v_down_fps] * FT_TO_M # z velocity # next waypoint (relative) position = np.array(sim.get_local_position()) @@ -41,25 +43,70 @@ def state_from_sim(sim, debug=False): displacement = waypoint - position + # waypoint ground distance + state[8] = np.linalg.norm(np.array(displacement[0:2])) + + # waypoint altitude distance + state[9] = displacement[2] + + # angles + waypoint_heading = np.arctan2(displacement[1], displacement[0]) + heading = sim[prp.heading_rad] # yaw + + state[10] = waypoint_heading - heading + + if state[10] < -math.pi: + state[10] += 2 * math.pi + elif state[10] > math.pi: + state[10] -= 2* math.pi + + + # print("\t\t\t\t\t\t\t\t\t\t YAW", heading/math.pi*180) + # print("\t\t\t\t\t\t\t\t\t\t waypoint_heading", waypoint_heading/math.pi*180) + + + # print("\t\t\t\t\t\t\t\t\t\theading", state[10]/math.pi*180) + # print("\t\t\t\t\t\t\t\t\t\tdist", state[8]) + # print("\t\t\t\t\t\t\t\t\t\theight", state[9]) + + + + state[11] = sim[prp.throttle_cmd] / 0.8 + state[12] = sim[prp.aileron_cmd] / 0.1 + state[13] = sim[prp.elevator_cmd] / 0.4 + state[14] = sim[prp.rudder_cmd] / 0.1 + + + if state[0] >= 2: + if not sim.completed_takeoff: + print("\t\t\t\t\t\t\t\t\t\tTake off!") + sim.completed_takeoff = True + if np.linalg.norm(displacement) <= sim.waypoint_threshold: - print("Waypoint Hit!") + print(f"\t\t\t\t\t\t\t\t\t\tWaypoint {sim.waypoint_id} Hit!") sim.waypoint_id += 1 sim.waypoint_rewarded = False - state[10] = displacement[0] - state[11] = displacement[1] - state[12] = displacement[2] + if is_unhealthy_state(state): + raise Exception("Unhealthy state, do better") - if debug: - print('State!') - print('Altitude:', state[0]) - print('Velocity: (', state[1], state[2], state[3], ')') - print('Roll:', state[4], '; Pitch:', state[5], '; Yaw:', state[6]) - print('RollRate:', state[7], '; PitchRate:', state[8], '; YawRate:', state[9]) - print('Relative WP: (', state[10], state[11], state[12], ')') return state +""" +Returns a bool for whether the state is unhealthy +""" +def is_unhealthy_state(state): + MAX_BANK = math.pi / 3 + MAX_PITCH = math.pi / 4 + if np.linalg.norm(state[8:10]) > 75: + return True + if not -MAX_BANK < state[2] < MAX_BANK: + return True + if not -MAX_PITCH < state[3] < MAX_PITCH: + return True + return False + """ Updates sim according to a control, assumes [control] is a 4-item tensor of @@ -72,6 +119,23 @@ def update_sim_from_control(sim, control, debug=False): sim[prp.rudder_cmd] = control[3] if debug: print('Control Taken:', control) + + +# Get the 
state/action/log_prob and control from the slewrate autopilot +def query_slewrate_autopilot(sim, autopilot, deterministic=True): + state = state_from_sim(sim) + action, log_prob = autopilot.get_deterministic_action(state) if deterministic else autopilot.get_action(state) + control = autopilot.get_control(action) + + return state, action, log_prob, control + +# Called every single sim step to enact slewrate +# Assumes autopilot is a SlewRateAutopilotLearner +def update_sim_from_slewrate_control(sim, control, autopilot): + sim[prp.throttle_cmd] = np.clip(sim[prp.throttle_cmd] + control[0] * autopilot.throttle_slew_rate, 0, 0.8) + sim[prp.aileron_cmd] = np.clip(sim[prp.aileron_cmd] + control[1] * autopilot.aileron_slew_rate, -0.1, 0.1) + sim[prp.elevator_cmd] = np.clip(sim[prp.elevator_cmd] + control[2] * autopilot.elevator_slew_rate, -0.4, 0.4) + sim[prp.rudder_cmd] = np.clip(sim[prp.rudder_cmd] + control[3] * autopilot.rudder_slew_rate, -0.1, 0.1) """ Follows a predetermined sequence of controls, instead of using autopilot. @@ -115,18 +179,25 @@ def enact_predetermined_controls(sim, autopilot): Basically just updates sim throttle / control surfaces according to the autopilot. """ def enact_autopilot(sim, autopilot): - state = state_from_sim(sim, debug=False) + state = state_from_sim(sim) action, log_prob = autopilot.get_action(state) - update_sim_from_control(sim, autopilot.get_control(action)) return state, action, log_prob + # Takes in the action outputted directly from the network and outputs the # normalized quadratic action cost from 0-1 def quadratic_action_cost(action): - action[1:4] = 2*action[1:4] - 1 # convert control surfaces to [-1, 1] - return float(torch.dot(action, action).detach()) / 4 # divide by 4 to be 0-1 + action_cost_weights = torch.tensor([1.0, 20.0, 10.0, 1.0]) + action[0] = 0.5 * (action[0] + 1) # converts throttle to be 0-1 + return float(torch.dot(action ** 2, action_cost_weights).detach() / sum(action_cost_weights)) # divide by 4 to be 0-1 + +# Takes in the action outputted directly from the network and outputs the +# normalized quadratic action cost from 0-1 +def quadratic_control_cost(control): + control_cost_weights = torch.tensor([1.0, 10.0, 5.0, 1.0]) + return float(torch.dot(control ** 2, control_cost_weights).detach() / sum(control_cost_weights)) # divide by 4 to be 0-1 """ The reward function for the bb autopilots. Since they won't know how to fly, @@ -134,6 +205,7 @@ def quadratic_action_cost(action): + 1 if moving with some velocity threshold vel_reward_threshold + alt_reward_threshold if flying above ground with some threshold alt_reward_threshold - action_coeff * the quadratic control cost + !!! WARNING: Deprecated """ def bb_reward(action, next_state, collided, alt_reward_coeff=10, action_coeff=1, alt_reward_threshold=2, vel_reward_threshold=1): moving_reward = 1 if (next_state[3]**2 + next_state[4]**2 + next_state[5]**2) > vel_reward_threshold else 0 @@ -145,6 +217,7 @@ def bb_reward(action, next_state, collided, alt_reward_coeff=10, action_coeff=1, A reward function that tries to incentivize the plane to go towards the waypoint at each timestep, while also rewarding for being above ground and penalizing for high control effort + !!! 
WARNING: Deprecated """ def new_init_wp_reward(action, next_state, collided, wp_coeff=1, action_coeff=1, alt_reward_threshold=5): alt_reward = 1 if next_state[2] > alt_reward_threshold else 0 @@ -152,29 +225,32 @@ def new_init_wp_reward(action, next_state, collided, wp_coeff=1, action_coeff=1, waypoint_rel_unit = next_state[10:13] / torch.norm(next_state[10:13]) vel = next_state[1:4] - toward_waypoint_reward = wp_coeff * float(torch.dot((vel ** 2 * torch.sign(vel)), waypoint_rel_unit).detach()) + toward_waypoint_reward = wp_coeff * float(torch.dot(vel, waypoint_rel_unit).detach()) return toward_waypoint_reward + alt_reward - action_cost if not collided else 0 """ -A reward function. +A reward function getter. """ def get_wp_reward(sim): - def wp_reward(action, next_state, collided, wp_coeff=0.01, action_coeff=0.01, alt_reward_threshold=10): + def wp_reward(action, next_state, collided, wp_coeff=0.1, action_coeff=0.5): if not sim.waypoint_rewarded: sim.waypoint_rewarded = True - wp_reward = 1_000_000 + wp_reward = 1_000 else: wp_reward = 0 - # print("altitude", next_state[2] ) - alt_reward = 0.1 if next_state[2] > alt_reward_threshold else 0 - action_cost = action_coeff * quadratic_action_cost(action) - - waypoint_rel_unit = next_state[10:13] / torch.norm(next_state[10:13]) - vel = next_state[1:4] - toward_waypoint_reward = wp_coeff * float(torch.dot((vel ** 2 * torch.sign(vel)), waypoint_rel_unit).detach()) - # print("wp_reward: ", wp_reward, " toward_waypoint_reward: ", toward_waypoint_reward) - # print("alt_reward: ", alt_reward, " action_cost: ", -action_cost) - # print("\t reward", wp_reward + toward_waypoint_reward + alt_reward - action_cost if not collided else 0) - return wp_reward + toward_waypoint_reward + alt_reward - action_cost if not collided else 0 + action_cost = action_coeff * quadratic_control_cost(next_state[11:15]) + #print('\t\t\t\t\t\tACTION COST:', action_cost, next_state[11:15]) + # waypoint_rel_unit = torch.nn.functional.normalize(next_state[10:13], dim=0) + # vel = next_state[1:4] + # toward_waypoint_reward = wp_coeff * float(torch.dot(vel, waypoint_rel_unit).detach()) + toward_waypoint_reward = 0 + if not sim.takeoff_rewarded and sim.completed_takeoff: + takeoff_reward = 500 + sim.takeoff_rewarded = True + else: + takeoff_reward = 0 + # toward_waypoint_reward = 0 + # toward_waypoint_reward = torch.min(torch.tensor(10), 0.01 * 1 / torch.max(torch.tensor(0.1), torch.sum(next_state[10:13]**2))) + return wp_reward + toward_waypoint_reward + takeoff_reward - action_cost if not collided else -500 return wp_reward """ @@ -182,7 +258,7 @@ def wp_reward(action, next_state, collided, wp_coeff=0.01, action_coeff=0.01, al rollout, including states/actions/rewards the agent experienced. 
""" class MDPDataCollector: - def __init__(self, sim, reward_fn, expected_trajectory_length, state_dim=13, action_dim=4): + def __init__(self, sim, reward_fn, expected_trajectory_length, state_dim=15, action_dim=4): # parameters self.sim = sim self.reward_fn = reward_fn diff --git a/simulation/simulate.py b/simulation/simulate.py index b0ab2a3..bbec2c7 100644 --- a/simulation/simulate.py +++ b/simulation/simulate.py @@ -1,7 +1,9 @@ +from shared import HidePrints from simulation.jsbsim_simulator import Simulation from simulation.jsbsim_aircraft import Aircraft, x8 import simulation.jsbsim_properties as prp -from learning.autopilot import AutopilotLearner +from learning.autopilot import AutopilotLearner, SlewRateAutopilotLearner +import torch import simulation.mdp as mdp import os import numpy as np @@ -13,10 +15,10 @@ class FullIntegratedSim: def __init__(self, aircraft: Aircraft, - autopilot: AutopilotLearner, + autopilot: SlewRateAutopilotLearner, sim_time: float, display_graphics: bool = True, - agent_interaction_frequency: int = 1, + agent_interaction_frequency: int = 10, airsim_frequency_hz: float = 392.0, sim_frequency_hz: float = 240.0, init_conditions: bool = None, @@ -44,6 +46,9 @@ def __init__(self, # Triggered when sim is complete self.done: bool = False + # + self.unhealthy_termination: bool = False + self.initial_collision = False """ Run loop for one simulation. @@ -76,14 +81,20 @@ def simulation_loop(self): while i < update_num: # Do autopilot controls try: - state, action, log_prob = mdp.enact_autopilot(self.sim, self.autopilot) + #state, action, log_prob = mdp.enact_autopilot(self.sim, self.autopilot) + state, action, log_prob, control = mdp.query_slewrate_autopilot(self.sim, self.autopilot) + if torch.isnan(state).any(): + break except Exception as e: print(e) + self.unhealthy_termination = True # If enacting the autopilot fails, end the simulation immediately break # Update sim while waiting for next agent interaction while True: + mdp.update_sim_from_slewrate_control(self.sim, control, self.autopilot) + # Run another sim step self.sim.run() @@ -100,28 +111,34 @@ def simulation_loop(self): # Check for collisions via airsim and terminate if there is one if self.sim.get_collision_info().has_collided: if self.initial_collision: - print('Aircraft has collided.') + # print('Aircraft has collided.') self.done = True self.sim.reinitialize() else: - print("Aircraft completed initial landing") + # print("Aircraft completed initial landing") self.initial_collision = True # Exit if sim is over or it's time for another agent interaction if self.done or i % self.agent_interaction_frequency == 0: break - + # Get new state try: next_state = mdp.state_from_sim(self.sim) - except: + if torch.isnan(next_state).any(): + next_state = state + self.done = True + except Exception as e: # If we couldn't acquire the state, something crashed with jsbsim # We treat that as the end of the simulation and don't update the state + print("\t\t\t\t\t\t\t\t\t\t", e) next_state = state self.done = True + self.unhealthy_termination = True # Data collection update for this step - self.mdp_data_collector.update(int(i/self.agent_interaction_frequency)-1, state, action, log_prob, next_state, self.done) + self.mdp_data_collector.update(int(i/self.agent_interaction_frequency)-1, state, action, + log_prob, next_state, self.unhealthy_termination) # End if collided if self.done == True: @@ -130,6 +147,8 @@ def simulation_loop(self): self.done = True 
self.mdp_data_collector.terminate(int(i/self.agent_interaction_frequency)) print('Simulation complete.') + print('\t\t\t\t\t\t\t\t\t\tCum reward:', self.mdp_data_collector.cum_reward) + """ Replays a simulation @@ -151,6 +170,7 @@ def simulation_replay(self, actions): # NOTE: for replays, the agent interaction frequency must match what # it was when the trajectory was created if i % self.agent_interaction_frequency == 0: + print("Step") break if __name__ == "__main__": diff --git a/waypoints.csv b/waypoints.csv index 7b87c80..aa70d84 100644 --- a/waypoints.csv +++ b/waypoints.csv @@ -1,7 +1,8 @@ waypoint_id,x,y,z,relative_x,relative_y,relative_z 1,-4461,-417.960938,90,21.5,-0.2,0.89 -2,-701,-417.960938,120,59.1,-0.2,1.19 -3,2869,-57.960938,450,94.8,3.4,4.49 -4,6509,-967.960938,1230,131.2,-5.7,12.29 +2,-1891,-487.960938,120,47.2,-0.9,1.19 +3,3006,-461,586,96.17,-0.63039062,5.85 +4,6639,-237.960938,730,132.5,1.6,7.29 5,9469,-477.960938,1480,160.8,-0.8,14.79 -6,15619,-397.960938,2920,222.3,0,29.19 \ No newline at end of file +6,11919,-287.960938,1980,185.3,1.1,19.79 +7,15619,-407.960938,2920,222.3,-0.1,29.19 \ No newline at end of file
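For reference, a minimal standalone sketch of the control scaling that the reworked AutopilotLearner.get_control in learning/autopilot.py applies now that the network ends in Tanh instead of Sigmoid. scale_tanh_action is an illustrative name, not part of the repo.

import torch

def scale_tanh_action(action: torch.Tensor) -> torch.Tensor:
    # Map a 4-dim tanh output in [-1, 1] onto the commanded control ranges:
    # throttle [0, 0.8], aileron [-0.1, 0.1], elevator [-0.4, 0.4], rudder [-0.1, 0.1].
    scaled = action.clone()
    scaled[0] = 0.8 * (0.5 * (scaled[0] + 1))
    scaled[1] = 0.1 * scaled[1]
    scaled[2] = 0.4 * scaled[2]
    scaled[3] = 0.1 * scaled[3]
    return scaled

print(scale_tanh_action(torch.tensor([1.0, -1.0, 0.5, 0.0])))  # -> tensor([0.8000, -0.1000, 0.2000, 0.0000])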
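The new SlewRateAutopilotLearner wires its policy through torchrl's ProbabilisticActor with a CompositeDistribution of four Categoricals. A rough plain-torch sketch of the same idea (a three-way choice per control, mapped to a slew direction by subtracting 1), deliberately without the torchrl wrappers:

import torch
import torch.nn as nn
from torch.distributions import Categorical

inputs, controls = 15, 4
policy = nn.Sequential(
    nn.Linear(inputs, inputs),
    nn.ReLU(),
    nn.Linear(inputs, 3 * controls),   # 3 logits per control
)

obs = torch.zeros(inputs)
per_control_logits = policy(obs).chunk(controls, dim=-1)
dists = [Categorical(logits=l) for l in per_control_logits]
action = torch.stack([d.sample() for d in dists])             # each in {0, 1, 2}
log_prob = sum(d.log_prob(a) for d, a in zip(dists, action))  # joint log-prob
direction = action - 1                                        # {-1: down, 0: hold, +1: up}
print(direction, log_prob)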
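crossentropy.py's default num_params moves from 238 to 432 because the slew-rate policy is now a 15 -> 15 -> 12 fully connected network. A quick sanity check of that count (sketch only):

def linear_param_count(n_in, n_out):
    return n_in * n_out + n_out  # weights + biases

inputs, outputs = 15, 4
print(linear_param_count(inputs, inputs) + linear_param_count(inputs, 3 * outputs))  # 432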
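The PPO critic in learning/ppo.py is reshaped from two hidden layers of width n_obs to a single hidden layer of width 2*n_obs. A standalone sketch of just the torch part, without the ValueOperator wrapper (make_value_net is an illustrative name):

import torch
import torch.nn as nn

def make_value_net(n_obs: int, device: str = "cpu") -> nn.Sequential:
    return nn.Sequential(
        nn.Linear(n_obs, 2 * n_obs, device=device),
        nn.Tanh(),
        nn.Linear(2 * n_obs, 1, device=device),  # one value per state
    )

print(make_value_net(15)(torch.zeros(15)).shape)  # torch.Size([1])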
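The training loop now drops the entropy term from the PPO objective and clips gradients to unit norm before each optimizer step. A toy sketch of that update pattern with a stand-in module and loss, not the repo's loss_module:

import torch
import torch.nn as nn

model = nn.Linear(15, 1)   # stand-in for the PPO loss_module's parameters
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

loss_value = model(torch.randn(8, 15)).pow(2).mean()     # stand-in for loss_objective + loss_critic
loss_value.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # clip before stepping
optimizer.step()
optimizer.zero_grad()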
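learning/utils.py's CategoricalControlsExtractor calls nn.functional.softmax without an explicit dim, which triggers PyTorch's implicit-dimension warning. A variant with the dim spelled out, shown here as a sketch rather than a drop-in patch:

import torch
import torch.nn as nn

class CategoricalControlsExtractor(nn.Module):
    # Split the final 12 activations into four 3-way probability vectors
    # (throttle, aileron, elevator, rudder), softmax over the last dim.
    def forward(self, *tensors: torch.Tensor):
        tensor, *others = tensors
        throttle, aileron, elevator, rudder = tensor.chunk(4, -1)
        soft = lambda t: nn.functional.softmax(t, dim=-1)
        return (soft(throttle), soft(aileron), soft(elevator), soft(rudder), *others)

probs = CategoricalControlsExtractor()(torch.zeros(12))
print([p.tolist() for p in probs])  # four uniform [1/3, 1/3, 1/3] distributions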
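state_from_sim in simulation/mdp.py now stores the waypoint bearing error in state[10] and wraps it into [-pi, pi]. The wrap logic in isolation (wrap_to_pi is an illustrative helper, not in the repo):

import math

def wrap_to_pi(angle: float) -> float:
    # Assumes the raw difference of two headings in [0, 2*pi), i.e. a value in (-2*pi, 2*pi).
    if angle < -math.pi:
        angle += 2 * math.pi
    elif angle > math.pi:
        angle -= 2 * math.pi
    return angle

print(wrap_to_pi(math.radians(350) - math.radians(10)))  # ~ -0.349 rad, i.e. -20 degrees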
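The new is_unhealthy_state check aborts a rollout when the combined horizontal/vertical distance to the next waypoint exceeds 75 (in the sim's local-position units) or the attitude exceeds 60 degrees of bank or 45 degrees of pitch. A standalone restatement against the 15-dim state layout, using torch.norm in place of np.linalg.norm:

import math
import torch

MAX_BANK = math.pi / 3    # 60 deg roll limit
MAX_PITCH = math.pi / 4   # 45 deg pitch limit

def is_unhealthy_state(state: torch.Tensor) -> bool:
    # state[2]=roll, state[3]=pitch, state[8]=waypoint ground distance, state[9]=altitude error
    if float(torch.norm(state[8:10])) > 75:
        return True
    if not -MAX_BANK < state[2] < MAX_BANK:
        return True
    if not -MAX_PITCH < state[3] < MAX_PITCH:
        return True
    return False

print(is_unhealthy_state(torch.zeros(15)))  # False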
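update_sim_from_slewrate_control integrates each {-1, 0, +1} direction into the commanded controls at the learner's per-step slew rates and clamps the result. The same arithmetic as a pure function (apply_slew is an illustrative name):

import numpy as np

# Slew rates per sim step, from SlewRateAutopilotLearner.
THROTTLE_RATE, AILERON_RATE, ELEVATOR_RATE, RUDDER_RATE = 0.005, 0.0001, 0.00025, 0.0001

def apply_slew(controls, direction):
    throttle, aileron, elevator, rudder = controls
    return (
        float(np.clip(throttle + direction[0] * THROTTLE_RATE, 0, 0.8)),
        float(np.clip(aileron + direction[1] * AILERON_RATE, -0.1, 0.1)),
        float(np.clip(elevator + direction[2] * ELEVATOR_RATE, -0.4, 0.4)),
        float(np.clip(rudder + direction[3] * RUDDER_RATE, -0.1, 0.1)),
    )

print(apply_slew((0.8, 0.1, 0.0, 0.0), (+1, +1, -1, 0)))  # (0.8, 0.1, -0.00025, 0.0): already-saturated channels stay clamped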
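quadratic_control_cost weights the squared normalized controls (state[11:15], each already scaled by its command range) and divides by the weight sum, so the penalty stays in [0, 1]; the trailing "# divide by 4" comment is left over from the earlier unweighted version. The same maths restated in isolation:

import torch

def quadratic_control_cost(control: torch.Tensor) -> float:
    # Weighted quadratic penalty; dividing by the weight sum keeps the result
    # in [0, 1] for controls normalized into [-1, 1].
    weights = torch.tensor([1.0, 10.0, 5.0, 1.0])  # throttle, aileron, elevator, rudder
    return float(torch.dot(control ** 2, weights) / weights.sum())

print(quadratic_control_cost(torch.tensor([1.0, 0.0, 0.0, 0.0])))  # ~0.0588, i.e. 1/17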
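The reworked get_wp_reward closure pays a one-off +1000 per waypoint reached and +500 for the first takeoff, charges a weighted control-effort cost each step, and returns -500 on collision, with the progress-toward-waypoint term disabled. The overall shape, stripped of the sim bookkeeping (the boolean arguments below are hypothetical stand-ins for sim.waypoint_rewarded / sim.takeoff_rewarded):

def wp_reward_shape(hit_new_waypoint, first_takeoff, collided, control_cost, action_coeff=0.5):
    if collided:
        return -500
    reward = 0.0
    if hit_new_waypoint:
        reward += 1_000
    if first_takeoff:
        reward += 500
    return reward - action_coeff * control_cost

print(wp_reward_shape(True, False, False, 0.2))  # 999.9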
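With agent_interaction_frequency raised from 1 to 10 in simulation/simulate.py and sim_frequency_hz at 240, the policy is queried at 24 Hz, and the chosen slew direction appears to be re-applied on every intermediate sim step, so one decision can move the throttle by up to 10 * 0.005. A back-of-envelope check (my reading of the loop, not repo code):

sim_frequency_hz = 240.0
agent_interaction_frequency = 10          # was 1
throttle_slew_rate = 0.005                # per sim step

decisions_per_second = sim_frequency_hz / agent_interaction_frequency
max_throttle_step_per_decision = agent_interaction_frequency * throttle_slew_rate
print(decisions_per_second, max_throttle_step_per_decision)  # 24.0 0.05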