diff --git a/assets/x8.xml b/assets/x8.xml
index 3368f7b..bcc8bde 100644
--- a/assets/x8.xml
+++ b/assets/x8.xml
@@ -70,10 +70,10 @@
0.8
0.5
- 0.1
+ .02
8.0
- 0.85
- 0
+ 2.85
+ 360
NONE
0
@@ -85,10 +85,10 @@
0.8
0.5
- 0.1
+ 0.03
4.0
- 0.95
- 0.0
+ 1.95
+ 360
LEFT
0
@@ -100,10 +100,10 @@
0.8
0.5
- 0.1
+ 0.03
4.0
- 0.95
- 0.0
+ 1.95
+ 360
RIGHT
0
@@ -145,7 +145,7 @@
attitude/heading-true-rad
- 0.003
+ 0.0
diff --git a/data/ppo/stats.txt b/data/ppo/stats.txt
new file mode 100644
index 0000000..e69de29
diff --git a/learning/autopilot.py b/learning/autopilot.py
index 5cd3f47..d20c6ed 100644
--- a/learning/autopilot.py
+++ b/learning/autopilot.py
@@ -4,6 +4,9 @@
from tensordict.nn.distributions import NormalParamExtractor
from tensordict.nn import TensorDictModule, InteractionType
from torchrl.modules import ProbabilisticActor, TanhNormal, ValueOperator
+from torch.distributions import Categorical
+from tensordict.nn import CompositeDistribution
+from learning.utils import CategoricalControlsExtractor
import os
"""
@@ -20,7 +23,7 @@ def __init__(self):
- 3 angular velocities: w_roll, w_pitch, w_yaw
- 3d relative position of next waypoint: wx, wy, wz
"""
- self.inputs = 13
+ self.inputs = 15
"""
Action:
@@ -36,7 +39,7 @@ def __init__(self):
nn.Linear(self.inputs, self.inputs),
nn.ReLU(),
nn.Linear(self.inputs, self.outputs),
- nn.Sigmoid(),
+ nn.Tanh(),
)
# Returns the action selected and 0, representing the log-prob of the
@@ -50,10 +53,10 @@ def get_control(self, action):
Clamps various control outputs and sets the mean for control surfaces to 0.
Assumes [action] is a 4-item tensor of throttle, aileron cmd, elevator cmd, rudder cmd.
"""
- action[0] = 0.5 * action[0]
- action[1] = 0.1 * (action[1] - 0.5)
- action[2] = 0.5 * (action[2] - 0.5)
- action[3] = 0.5 * (action[3] - 0.5)
+ action[0] = 0.8 * (0.5*(action[0] + 1))
+ action[1] = 0.1 * action[1]
+ action[2] = 0.4 * action[2]
+ action[3] = 0.1 * action[3]
return action
# flattened_params = flattened dx1 numpy array of all params to init from
@@ -82,12 +85,11 @@ def init_from_params(self, flattened_params):
layer1,
nn.ReLU(),
layer2,
- nn.Sigmoid(),
+ nn.Tanh()
)
- # Loads the network from dir/name.pth
- def init_from_saved(self, dir, name):
- path = os.path.join(dir, name + '.pth')
+ # Loads the network from path
+ def init_from_saved(self, path):
self.policy_network = torch.load(path)
# Saves the network to dir/name.pth
@@ -125,8 +127,8 @@ def transform_from_deterministic_learner(self):
self.policy_network[-2] = nn.Linear(self.inputs, self.outputs * 2)
# Update the output layer to include the original weights and biases
- self.policy_network[-2].weight = nn.Parameter(torch.cat((w, torch.ones(w.shape) * 100), 0))
- self.policy_network[-2].bias = nn.Parameter(torch.cat((b, torch.ones(b.shape) * 100), 0))
+ self.policy_network[-2].weight = nn.Parameter(torch.cat((w, torch.zeros(w.shape)), 0))
+ self.policy_network[-2].bias = nn.Parameter(torch.cat((b, torch.zeros(b.shape)), 0))
# Add a normal param extractor to the network to extract (means, sigmas) tuple
self.policy_network.append(NormalParamExtractor())
@@ -141,10 +143,6 @@ def transform_from_deterministic_learner(self):
module=policy_module,
in_keys=["loc", "scale"],
distribution_class=TanhNormal,
- distribution_kwargs={
- "min": 0, # minimum control
- "max": 1, # maximum control
- },
default_interaction_type=InteractionType.RANDOM,
return_log_prob=True,
)
@@ -153,7 +151,6 @@ def transform_from_deterministic_learner(self):
def get_action(self, observation):
data = TensorDict({"observation": observation}, [])
policy_forward = self.policy_module(data)
- print("action", self.policy_network(observation))
return policy_forward["action"], policy_forward["sample_log_prob"]
# NOTE: This initializes from *deterministic* learner parameters and picks
@@ -173,10 +170,114 @@ def init_from_saved(self, path):
module=policy_module,
in_keys=["loc", "scale"],
distribution_class=TanhNormal,
- distribution_kwargs={
- "min": 0, # minimum control
- "max": 1, # maximum control
- },
default_interaction_type=InteractionType.RANDOM,
return_log_prob=True,
)
+
+"""
+Slew rate autopilot learner.
+Each control (throttle/aileron/elevator/rudder) has three options:
+stay constant, go down, or go up.
+"""
+class SlewRateAutopilotLearner:
+ def __init__(self):
+ self.inputs = 15
+ self.outputs = 4
+
+ # Slew rates are with respect to the sim clock (applied once per sim step)
+ self.throttle_slew_rate = 0.005
+ self.aileron_slew_rate = 0.0001
+ self.elevator_slew_rate = 0.00025
+ self.rudder_slew_rate = 0.0001
+
+ self.policy_network = nn.Sequential(
+ nn.Linear(self.inputs, self.inputs),
+ nn.ReLU(),
+ nn.Linear(self.inputs, 3 * self.outputs),
+ nn.Sigmoid(),
+ CategoricalControlsExtractor()
+ )
+
+ self.instantiate_policy_module()
+
+ def instantiate_policy_module(self):
+ policy_module = TensorDictModule(
+ self.policy_network,
+ in_keys=["observation"],
+ out_keys=[
+ ("params", "throttle", "probs"),
+ ("params", "aileron", "probs"),
+ ("params", "elevator", "probs"),
+ ("params", "rudder", "probs"),
+ ],
+ )
+ self.policy_module = policy_module = ProbabilisticActor(
+ module=policy_module,
+ in_keys=["params"],
+ distribution_class=CompositeDistribution,
+ distribution_kwargs={
+ "distribution_map": {
+ "throttle": Categorical,
+ "aileron": Categorical,
+ "elevator": Categorical,
+ "rudder": Categorical,
+ }
+ },
+ default_interaction_type=InteractionType.RANDOM,
+ return_log_prob=True
+ )
+
+ # Returns the action selected and the log_prob of that action
+ def get_action(self, observation):
+ data = TensorDict({"observation": observation}, [])
+ policy_forward = self.policy_module(data)
+ action = torch.Tensor([policy_forward['throttle'], policy_forward['aileron'], policy_forward['elevator'], policy_forward['rudder']])
+ return action, policy_forward["sample_log_prob"]
+
+ # Always samples the mode of the output for each control
+ def get_deterministic_action(self, observation):
+ throttle_probs, aileron_probs, elevator_probs, rudder_probs = self.policy_network(observation)
+ action = torch.Tensor([torch.argmax(throttle_probs), torch.argmax(aileron_probs), torch.argmax(elevator_probs), torch.argmax(rudder_probs)])
+ return action, 0
+
+ # Shift the action by -1 to create a control tensor such that:
+ # -1 means go down, 0 means stay the same, and +1 means go up
+ def get_control(self, action):
+ return action - 1
+
+ # flattened_params = flattened dx1 numpy array of all params to init from
+ # NOTE: the way the params are broken up into the weights/biases of each layer
+ # would need to be manually edited for changes in network architecture
+ def init_from_params(self, flattened_params):
+ flattened_params = torch.from_numpy(flattened_params).to(torch.float32)
+
+ pl, pr = 0, 0
+ layer1 = nn.Linear(self.inputs, self.inputs)
+ pr += layer1.weight.nelement()
+ layer1.weight = nn.Parameter(flattened_params[pl:pr].reshape(layer1.weight.shape))
+ pl = pr
+ pr += layer1.bias.nelement()
+ layer1.bias = nn.Parameter(flattened_params[pl:pr].reshape(layer1.bias.shape))
+
+ layer2 = nn.Linear(self.inputs, 3*self.outputs)
+ pl = pr
+ pr += layer2.weight.nelement()
+ layer2.weight = nn.Parameter(flattened_params[pl:pr].reshape(layer2.weight.shape))
+ pl = pr
+ pr += layer2.bias.nelement()
+ layer2.bias = nn.Parameter(flattened_params[pl:pr].reshape(layer2.bias.shape))
+
+ self.policy_network = nn.Sequential(
+ layer1,
+ nn.ReLU(),
+ layer2,
+ nn.Sigmoid(),
+ CategoricalControlsExtractor()
+ )
+ self.instantiate_policy_module()
+
+ # Loads the network from path
+ def init_from_saved(self, path):
+ self.policy_network = torch.load(path)
+ self.instantiate_policy_module()
+
+ # Saves the network to dir/name.pth
+ def save(self, dir, name):
+ path = os.path.join(dir, name + '.pth')
+ torch.save(self.policy_network, path)
+
+
+
+
+
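
# A minimal usage sketch for the SlewRateAutopilotLearner added above (illustrative only;
# the observation is a placeholder, and the throttle bounds mirror those used in
# simulation/mdp.py).
import torch
from learning.autopilot import SlewRateAutopilotLearner

learner = SlewRateAutopilotLearner()
obs = torch.zeros(15)                                # 15-dim observation vector
action, _ = learner.get_deterministic_action(obs)    # each entry is 0, 1, or 2
control = learner.get_control(action)                # -1 = decrease, 0 = hold, +1 = increase

# Each sim step the command is then nudged by the per-control slew rate, e.g. for throttle:
throttle_cmd = 0.0
throttle_cmd = float(min(max(throttle_cmd + control[0] * learner.throttle_slew_rate, 0.0), 0.8))
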
diff --git a/learning/crossentropy.py b/learning/crossentropy.py
index 677fcf1..a768312 100644
--- a/learning/crossentropy.py
+++ b/learning/crossentropy.py
@@ -1,6 +1,6 @@
import torch
import numpy as np
-from learning.autopilot import AutopilotLearner
+from learning.autopilot import AutopilotLearner, SlewRateAutopilotLearner
from simulation.simulate import FullIntegratedSim
from simulation.jsbsim_aircraft import x8
import os
@@ -25,7 +25,7 @@ def __init__(self, learners, num_params):
def init_using_torch_default(generation_size, num_params):
learners = []
for i in range(generation_size):
- learners.append(AutopilotLearner())
+ learners.append(SlewRateAutopilotLearner())
return Generation(learners, num_params)
# Utilizes [rewards], which contains the reward obtained by each learner as a
@@ -89,12 +89,12 @@ def make_new_generation(mean, cov, generation_size, num_params):
# Generate each learner from params
learners = []
for param_list in selected_params:
- l = AutopilotLearner()
+ l = SlewRateAutopilotLearner()
l.init_from_params(param_list)
learners.append(l)
return Generation(learners, num_params)
-def cross_entropy_train(epochs, generation_size, num_survive, num_params=238, sim_time=60.0, save_dir='cross_entropy'):
+def cross_entropy_train(epochs, generation_size, num_survive, num_params=432, sim_time=60.0, save_dir='cross_entropy'):
# Create save_dir (and if one already exists, rename it with some rand int)
if os.path.exists(os.path.join('data', save_dir)):
os.rename(os.path.join('data', save_dir), os.path.join('data', save_dir + '_old' + str(randint(0, 100000))))
@@ -117,7 +117,7 @@ def cross_entropy_train(epochs, generation_size, num_survive, num_params=238, si
# Evaluate generation through rollouts
rewards = []
for i in range(len(generation.learners)):
- id = str(100*(epoch+1) + (i+1))
+ id = str(1000*(epoch+1) + (i+1))
learner = generation.learners[i]
# Run simulation to evaluate learner
@@ -127,7 +127,7 @@ def cross_entropy_train(epochs, generation_size, num_survive, num_params=238, si
integrated_sim.simulation_loop()
# Acquire/save data
- integrated_sim.mdp_data_collector.save(os.path.join(save_dir, 'generation' + str(epoch+1)), 'trajectory_learner#' + str(i+1))
+ #integrated_sim.mdp_data_collector.save(os.path.join(save_dir, 'generation' + str(epoch+1)), 'trajectory_learner#' + str(i+1))
rewards.append(integrated_sim.mdp_data_collector.get_cum_reward())
print('Reward for Learner #', id, ': ', integrated_sim.mdp_data_collector.get_cum_reward())
@@ -149,4 +149,4 @@ def cross_entropy_train(epochs, generation_size, num_survive, num_params=238, si
if __name__ == "__main__":
os.environ["JSBSIM_DEBUG"]=str(0)
# epochs, generation_size, num_survive
- cross_entropy_train(100, 99, 50)
\ No newline at end of file
+ cross_entropy_train(100, 200, 50)
\ No newline at end of file
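
# Quick check of the new num_params default (432) against the slew-rate learner's two
# linear layers (15 -> 15 and 15 -> 3*4), matching SlewRateAutopilotLearner.init_from_params:
inputs, outputs = 15, 4
layer1_params = inputs * inputs + inputs               # 225 weights + 15 biases = 240
layer2_params = inputs * (3 * outputs) + 3 * outputs   # 180 weights + 12 biases = 192
assert layer1_params + layer2_params == 432
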
diff --git a/learning/ppo.py b/learning/ppo.py
index e012f67..76138c2 100644
--- a/learning/ppo.py
+++ b/learning/ppo.py
@@ -7,11 +7,14 @@
from torchrl.data.replay_buffers.storages import LazyTensorStorage
from torchrl.objectives import ClipPPOLoss
from torchrl.objectives.value import GAE
-from learning.autopilot import StochasticAutopilotLearner
+from learning.autopilot import StochasticAutopilotLearner, SlewRateAutopilotLearner
from simulation.simulate import FullIntegratedSim
from simulation.jsbsim_aircraft import x8
+import numpy as np
import os
+device = "cpu" if not torch.has_cuda else "cuda:0"
+
"""
Gathers rollout data and returns it in the way the Proximal Policy Optimization loss_module expects
"""
@@ -24,15 +27,27 @@ def gather_rollout_data(autopilot_learner, policy_num, num_trajectories=100, sim
sample_log_probs = torch.empty(0)
rewards = torch.empty(0)
dones = torch.empty(0, dtype=torch.bool)
+ best_cum_reward = -float('inf')
+ worst_cum_reward = float('inf')
+ total_cum_reward = 0
+ total_timesteps = 0
for t in range(num_trajectories):
integrated_sim = FullIntegratedSim(x8, autopilot_learner, sim_time)
integrated_sim.simulation_loop()
# Acquire data
observation, next_observation, action, sample_log_prob, reward, done = integrated_sim.mdp_data_collector.get_trajectory_data()
+ cum_reward = integrated_sim.mdp_data_collector.get_cum_reward()
+ total_cum_reward += cum_reward
+ total_timesteps += reward.shape[0]
- # Save data
- # integrated_sim.mdp_data_collector.save(os.path.join('ppo', 'trajectories'), 'rollout' + str(policy_num * num_trajectories + t))
+ # Save trajectory if worst or best so far
+ if cum_reward > best_cum_reward:
+ integrated_sim.mdp_data_collector.save(os.path.join('ppo', 'trajectories'), 'best_rollout#' + str(policy_num))
+ best_cum_reward = cum_reward
+ if cum_reward < worst_cum_reward:
+ integrated_sim.mdp_data_collector.save(os.path.join('ppo', 'trajectories'), 'worst_rollout#' + str(policy_num))
+ worst_cum_reward = cum_reward
# Add to the data
observations = torch.cat((observations, observation))
@@ -47,14 +62,34 @@ def gather_rollout_data(autopilot_learner, policy_num, num_trajectories=100, sim
# that entry for one step in a rollout.
data = TensorDict({
"observation": observations,
- "action": actions.detach(),
+ "action": TensorDict({
+ "throttle": actions[:,0].detach(),
+ "aileron": actions[:,1].detach(),
+ "elevator": actions[:,2].detach(),
+ "rudder": actions[:,3].detach(),
+ "throttle_log_prob": torch.zeros(data_size).detach(),
+ "aileron_log_prob": torch.zeros(data_size).detach(),
+ "elevator_log_prob": torch.zeros(data_size).detach(),
+ "rudder_log_prob": torch.zeros(data_size).detach(),
+ }, [data_size,]),
"sample_log_prob": sample_log_probs.detach(), # log probability that each action was selected
+ # "sample_log_prob": TensorDict({
+ # }, [data_size,]), # log probability that each action was selected
("next", "done"): dones,
("next", "terminated"): dones,
("next", "reward"): rewards,
("next", "observation"): next_observations,
}, [data_size,])
+ # Write to stats file
+ stats_file = open(os.path.join('data', 'ppo', 'stats.txt'), 'a')
+ stats_file.write('Policy Number #' + str(policy_num) + ':\n')
+ stats_file.write('Average Time Steps: ' + str(total_timesteps/num_trajectories) + '\n')
+ stats_file.write('Average Reward: ' + str(total_cum_reward/num_trajectories) + '\n')
+ stats_file.write('Best Reward: ' + str(best_cum_reward) + '\n')
+ stats_file.write('Worst Reward: ' + str(worst_cum_reward) + '\n')
+ stats_file.write('\n\n')
+ stats_file.close()
+
return data, data_size
"""
@@ -64,12 +99,12 @@ def gather_rollout_data(autopilot_learner, policy_num, num_trajectories=100, sim
action taken during PPO learning.
"""
def make_value_estimator_module(n_obs):
+ global device
+
value_net = nn.Sequential(
- nn.Linear(n_obs, n_obs),
+ nn.Linear(n_obs, 2*n_obs, device=device),
nn.Tanh(),
- nn.Linear(n_obs, n_obs),
- nn.Tanh(),
- nn.Linear(n_obs, 1), # one value is computed for the state
+ nn.Linear(2*n_obs, 1, device=device), # one value is computed for the state
)
return ValueOperator(
@@ -115,15 +150,16 @@ def train_ppo_once(policy_num, autopilot_learner, loss_module, advantage_module,
for b in range(0, data_size // batch_size):
# Gather batch and calculate PPO loss on it
batch = replay_buffer.sample(batch_size)
- loss_vals = loss_module(batch.to("cpu"))
+ loss_vals = loss_module(batch.to(device))
loss_value = (
loss_vals["loss_objective"]
+ loss_vals["loss_critic"]
- + loss_vals["loss_entropy"]
)
# Optimize via gradient descent with the optimizer
loss_value.backward()
+
+ torch.nn.utils.clip_grad_norm_(loss_module.parameters(), 1.0)
optimizer.step()
optimizer.zero_grad()
@@ -131,17 +167,21 @@ def train_ppo_once(policy_num, autopilot_learner, loss_module, advantage_module,
if __name__ == "__main__":
# Parameters (see https://pytorch.org/rl/tutorials/coding_ppo.html#ppo-parameters)
batch_size = 100
- num_epochs = 10
+ num_epochs = 100
+ device = "cpu" if not torch.has_cuda else "cuda:0"
+
clip_epsilon = 0.2 # for PPO loss
gamma = 1.0
lmbda = 0.95
entropy_eps = 1e-4
- lr = 3e-4
- num_trajectories = 100
+ lr = 1e-4
+ num_trajectories = 200
num_policy_iterations = 100
# Build the modules
- autopilot_learner = StochasticAutopilotLearner()
+ #autopilot_learner = StochasticAutopilotLearner()
+ autopilot_learner = SlewRateAutopilotLearner()
+ # autopilot_learner.init_from_params(np.random.normal(0, 1, 350))
value_module = make_value_estimator_module(autopilot_learner.inputs)
advantage_module = GAE(
gamma=gamma, lmbda=lmbda, value_network=value_module, average_gae=True
@@ -150,8 +190,8 @@ def train_ppo_once(policy_num, autopilot_learner, loss_module, advantage_module,
actor=autopilot_learner.policy_module,
critic=value_module,
clip_epsilon=clip_epsilon,
- entropy_bonus=bool(entropy_eps),
- entropy_coef=entropy_eps,
+ entropy_bonus=False,
+ entropy_coef=0,
value_target_key=advantage_module.value_target_key,
critic_coef=1.0,
gamma=gamma,
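
# With the logging added above, each call to gather_rollout_data appends one block to
# data/ppo/stats.txt; the string below shows the shape of an entry (values are hypothetical):
example_stats_block = (
    "Policy Number #3:\n"
    "Average Time Steps: 412.5\n"
    "Average Reward: -127.4\n"
    "Best Reward: 861.0\n"
    "Worst Reward: -513.2\n"
    "\n\n"
)
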
diff --git a/learning/utils.py b/learning/utils.py
new file mode 100644
index 0000000..f0f2002
--- /dev/null
+++ b/learning/utils.py
@@ -0,0 +1,11 @@
+import torch
+import torch.nn as nn
+
+class CategoricalControlsExtractor(nn.Module):
+ def __init__(self):
+ super().__init__()
+
+ def forward(self, *tensors: torch.Tensor) -> tuple[torch.Tensor, ...]:
+ tensor, *others = tensors
+ throttle, aileron, elevator, rudder = tensor.chunk(4, -1)
+ return (nn.functional.softmax(throttle, dim=-1), nn.functional.softmax(aileron, dim=-1), nn.functional.softmax(elevator, dim=-1), nn.functional.softmax(rudder, dim=-1), *others)
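
# Shape check for CategoricalControlsExtractor: a 12-dim logit vector is split into four
# 3-way probability vectors (throttle, aileron, elevator, rudder), each summing to 1.
import torch
from learning.utils import CategoricalControlsExtractor

extractor = CategoricalControlsExtractor()
probs = extractor(torch.randn(12))
assert len(probs) == 4 and all(p.shape == (3,) for p in probs)
assert all(abs(float(p.sum()) - 1.0) < 1e-6 for p in probs)
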
diff --git a/scripts/sim_autopilot.py b/scripts/sim_autopilot.py
index c54daef..060fe23 100644
--- a/scripts/sim_autopilot.py
+++ b/scripts/sim_autopilot.py
@@ -6,7 +6,7 @@
and rolls out a trajectory in sim for that autopilot learner.
"""
-from learning.autopilot import AutopilotLearner
+from learning.autopilot import AutopilotLearner, SlewRateAutopilotLearner
from simulation.simulate import FullIntegratedSim
from simulation.jsbsim_aircraft import x8
import sys
@@ -17,7 +17,7 @@
file = os.path.join(*(["data"] + policy_file_path.split('/')))
# Initialize autopilot
-autopilot = AutopilotLearner()
+autopilot = SlewRateAutopilotLearner()
autopilot.init_from_saved(file)
# Play sim
diff --git a/simulation/jsbsim_simulator.py b/simulation/jsbsim_simulator.py
index 9cc5a11..308f428 100644
--- a/simulation/jsbsim_simulator.py
+++ b/simulation/jsbsim_simulator.py
@@ -84,7 +84,8 @@ def __init__(self,
aircraft: Aircraft = x8,
init_conditions: Dict[prp.Property, float] = None,
debug_level: int = 0):
- self.fdm = jsbsim.FGFDMExec(root_dir=self.ROOT_DIR)
+ with HidePrints():
+ self.fdm = jsbsim.FGFDMExec(root_dir=self.ROOT_DIR)
self.fdm.set_debug_level(debug_level)
self.sim_dt = 1.0 / sim_frequency_hz
self.aircraft = aircraft
@@ -94,7 +95,9 @@ def __init__(self,
self.wall_clock_dt = None
self.update_airsim(ignore_collisions=True)
self.waypoint_id = 0
- self.waypoint_threshold = 2
+ self.waypoint_threshold = 3
+ self.takeoff_rewarded = False
+ self.completed_takeoff = False
self.waypoint_rewarded = True
self.waypoints = []
with open("waypoints.csv", 'r') as file:
@@ -241,8 +244,9 @@ def airsim_connect() -> airsim.VehicleClient:
:return: the airsim client object
"""
- client = airsim.VehicleClient()
- client.confirmConnection()
+ with HidePrints():
+ client = airsim.VehicleClient()
+ client.confirmConnection()
return client
def update_airsim(self, ignore_collisions = False) -> None:
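
# HidePrints comes from the project's `shared` module, which is not shown in this patch;
# it is assumed here to be a stdout-suppressing context manager used to silence JSBSim
# and AirSim startup output. A minimal sketch along those lines (an assumption, not the
# project's actual implementation):
import os
import sys

class HidePrints:
    def __enter__(self):
        self._stdout = sys.stdout
        sys.stdout = open(os.devnull, "w")   # redirect prints to the null device

    def __exit__(self, exc_type, exc_value, traceback):
        sys.stdout.close()
        sys.stdout = self._stdout            # restore the original stdout
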
diff --git a/simulation/mdp.py b/simulation/mdp.py
index 58a292f..fbced7e 100644
--- a/simulation/mdp.py
+++ b/simulation/mdp.py
@@ -4,6 +4,7 @@
states/observations, actions, and rewards.
"""
+import math
import torch
import simulation.jsbsim_properties as prp
import numpy as np
@@ -12,28 +13,29 @@
"""
Extracts agent state data from the sim.
"""
-def state_from_sim(sim, debug=False):
- state = torch.zeros(13,)
+def state_from_sim(sim):
+ state = torch.zeros(15,)
FT_TO_M = 0.3048
# altitude
state[0] = sim[prp.altitude_sl_ft] * FT_TO_M # z
- # velocity
- state[1] = sim[prp.v_north_fps] * FT_TO_M # x velocity
- state[2] = sim[prp.v_east_fps] * FT_TO_M # y velocity
- state[3] = -sim[prp.v_down_fps] * FT_TO_M # z velocity
-
- # angles
- state[4] = sim[prp.roll_rad] # roll
- state[5] = sim[prp.pitch_rad] # pitch
- state[6] = sim[prp.heading_rad] # yaw
+ # speed
+ state[1] = FT_TO_M * np.linalg.norm(np.array([sim[prp.v_north_fps], sim[prp.v_east_fps], -sim[prp.v_down_fps]]))
+
+ # roll
+ state[2] = sim[prp.roll_rad]
+ # pitch
+ state[3] = sim[prp.pitch_rad]
+
# angle rates
- state[7] = sim[prp.p_radps] # roll rate
- state[8] = sim[prp.q_radps] # pitch rate
- state[9] = sim[prp.r_radps] # yaw rate
+ state[4] = sim[prp.p_radps] # roll rate
+ state[5] = sim[prp.q_radps] # pitch rate
+ state[6] = sim[prp.r_radps] # yaw rate
+
+ state[7] = -sim[prp.v_down_fps] * FT_TO_M # z velocity
# next waypoint (relative)
position = np.array(sim.get_local_position())
@@ -41,25 +43,70 @@ def state_from_sim(sim, debug=False):
displacement = waypoint - position
+ # waypoint ground distance
+ state[8] = np.linalg.norm(np.array(displacement[0:2]))
+
+ # waypoint altitude distance
+ state[9] = displacement[2]
+
+ # angles
+ waypoint_heading = np.arctan2(displacement[1], displacement[0])
+ heading = sim[prp.heading_rad] # yaw
+
+ state[10] = waypoint_heading - heading
+
+ if state[10] < -math.pi:
+ state[10] += 2 * math.pi
+ elif state[10] > math.pi:
+ state[10] -= 2* math.pi
+
+
+ # print("\t\t\t\t\t\t\t\t\t\t YAW", heading/math.pi*180)
+ # print("\t\t\t\t\t\t\t\t\t\t waypoint_heading", waypoint_heading/math.pi*180)
+
+
+ # print("\t\t\t\t\t\t\t\t\t\theading", state[10]/math.pi*180)
+ # print("\t\t\t\t\t\t\t\t\t\tdist", state[8])
+ # print("\t\t\t\t\t\t\t\t\t\theight", state[9])
+
+
+
+ state[11] = sim[prp.throttle_cmd] / 0.8
+ state[12] = sim[prp.aileron_cmd] / 0.1
+ state[13] = sim[prp.elevator_cmd] / 0.4
+ state[14] = sim[prp.rudder_cmd] / 0.1
+
+
+ if state[0] >= 2:
+ if not sim.completed_takeoff:
+ print("\t\t\t\t\t\t\t\t\t\tTake off!")
+ sim.completed_takeoff = True
+
if np.linalg.norm(displacement) <= sim.waypoint_threshold:
- print("Waypoint Hit!")
+ print(f"\t\t\t\t\t\t\t\t\t\tWaypoint {sim.waypoint_id} Hit!")
sim.waypoint_id += 1
sim.waypoint_rewarded = False
- state[10] = displacement[0]
- state[11] = displacement[1]
- state[12] = displacement[2]
+ if is_unhealthy_state(state):
+ raise Exception("Unhealthy state, do better")
- if debug:
- print('State!')
- print('Altitude:', state[0])
- print('Velocity: (', state[1], state[2], state[3], ')')
- print('Roll:', state[4], '; Pitch:', state[5], '; Yaw:', state[6])
- print('RollRate:', state[7], '; PitchRate:', state[8], '; YawRate:', state[9])
- print('Relative WP: (', state[10], state[11], state[12], ')')
return state
+"""
+Returns a bool for whether the state is unhealthy
+"""
+def is_unhealthy_state(state):
+ MAX_BANK = math.pi / 3
+ MAX_PITCH = math.pi / 4
+ if np.linalg.norm(state[8:10]) > 75:
+ return True
+ if not -MAX_BANK < state[2] < MAX_BANK:
+ return True
+ if not -MAX_PITCH < state[3] < MAX_PITCH:
+ return True
+ return False
+
"""
Updates sim according to a control, assumes [control] is a 4-item tensor of
@@ -72,6 +119,23 @@ def update_sim_from_control(sim, control, debug=False):
sim[prp.rudder_cmd] = control[3]
if debug:
print('Control Taken:', control)
+
+
+# Get the state/action/log_prob and control from the slewrate autopilot
+def query_slewrate_autopilot(sim, autopilot, deterministic=True):
+ state = state_from_sim(sim)
+ action, log_prob = autopilot.get_deterministic_action(state) if deterministic else autopilot.get_action(state)
+ control = autopilot.get_control(action)
+
+ return state, action, log_prob, control
+
+# Called every single sim step to enact slewrate
+# Assumes autopilot is a SlewRateAutopilotLearner
+def update_sim_from_slewrate_control(sim, control, autopilot):
+ sim[prp.throttle_cmd] = np.clip(sim[prp.throttle_cmd] + control[0] * autopilot.throttle_slew_rate, 0, 0.8)
+ sim[prp.aileron_cmd] = np.clip(sim[prp.aileron_cmd] + control[1] * autopilot.aileron_slew_rate, -0.1, 0.1)
+ sim[prp.elevator_cmd] = np.clip(sim[prp.elevator_cmd] + control[2] * autopilot.elevator_slew_rate, -0.4, 0.4)
+ sim[prp.rudder_cmd] = np.clip(sim[prp.rudder_cmd] + control[3] * autopilot.rudder_slew_rate, -0.1, 0.1)
"""
Follows a predetermined sequence of controls, instead of using autopilot.
@@ -115,18 +179,25 @@ def enact_predetermined_controls(sim, autopilot):
Basically just updates sim throttle / control surfaces according to the autopilot.
"""
def enact_autopilot(sim, autopilot):
- state = state_from_sim(sim, debug=False)
+ state = state_from_sim(sim)
action, log_prob = autopilot.get_action(state)
-
update_sim_from_control(sim, autopilot.get_control(action))
return state, action, log_prob
+
# Takes in the action outputted directly from the network and outputs the
# normalized quadratic action cost from 0-1
def quadratic_action_cost(action):
- action[1:4] = 2*action[1:4] - 1 # convert control surfaces to [-1, 1]
- return float(torch.dot(action, action).detach()) / 4 # divide by 4 to be 0-1
+ action_cost_weights = torch.tensor([1.0, 20.0, 10.0, 1.0])
+ action[0] = 0.5 * (action[0] + 1) # converts throttle to be 0-1
+ return float(torch.dot(action ** 2, action_cost_weights).detach() / sum(action_cost_weights)) # normalize by total weight to keep the cost in 0-1
+
+# Takes in the normalized control values (e.g. next_state[11:15]) and outputs the
+# normalized quadratic control cost from 0-1
+def quadratic_control_cost(control):
+ control_cost_weights = torch.tensor([1.0, 10.0, 5.0, 1.0])
+ return float(torch.dot(control ** 2, control_cost_weights).detach() / sum(control_cost_weights)) # normalize by total weight to keep the cost in 0-1
"""
The reward function for the bb autopilots. Since they won't know how to fly,
@@ -134,6 +205,7 @@ def quadratic_action_cost(action):
+ 1 if moving with some velocity threshold vel_reward_threshold
+ alt_reward_threshold if flying above ground with some threshold alt_reward_threshold
- action_coeff * the quadratic control cost
+ !!! WARNING: Deprecated
"""
def bb_reward(action, next_state, collided, alt_reward_coeff=10, action_coeff=1, alt_reward_threshold=2, vel_reward_threshold=1):
moving_reward = 1 if (next_state[3]**2 + next_state[4]**2 + next_state[5]**2) > vel_reward_threshold else 0
@@ -145,6 +217,7 @@ def bb_reward(action, next_state, collided, alt_reward_coeff=10, action_coeff=1,
A reward function that tries to incentivize the plane to go towards the waypoint
at each timestep, while also rewarding for being above ground and penalizing
for high control effort
+ !!! WARNING: Deprecated
"""
def new_init_wp_reward(action, next_state, collided, wp_coeff=1, action_coeff=1, alt_reward_threshold=5):
alt_reward = 1 if next_state[2] > alt_reward_threshold else 0
@@ -152,29 +225,32 @@ def new_init_wp_reward(action, next_state, collided, wp_coeff=1, action_coeff=1,
waypoint_rel_unit = next_state[10:13] / torch.norm(next_state[10:13])
vel = next_state[1:4]
- toward_waypoint_reward = wp_coeff * float(torch.dot((vel ** 2 * torch.sign(vel)), waypoint_rel_unit).detach())
+ toward_waypoint_reward = wp_coeff * float(torch.dot(vel, waypoint_rel_unit).detach())
return toward_waypoint_reward + alt_reward - action_cost if not collided else 0
"""
-A reward function.
+A reward function getter.
"""
def get_wp_reward(sim):
- def wp_reward(action, next_state, collided, wp_coeff=0.01, action_coeff=0.01, alt_reward_threshold=10):
+ def wp_reward(action, next_state, collided, wp_coeff=0.1, action_coeff=0.5):
if not sim.waypoint_rewarded:
sim.waypoint_rewarded = True
- wp_reward = 1_000_000
+ wp_reward = 1_000
else: wp_reward = 0
- # print("altitude", next_state[2] )
- alt_reward = 0.1 if next_state[2] > alt_reward_threshold else 0
- action_cost = action_coeff * quadratic_action_cost(action)
-
- waypoint_rel_unit = next_state[10:13] / torch.norm(next_state[10:13])
- vel = next_state[1:4]
- toward_waypoint_reward = wp_coeff * float(torch.dot((vel ** 2 * torch.sign(vel)), waypoint_rel_unit).detach())
- # print("wp_reward: ", wp_reward, " toward_waypoint_reward: ", toward_waypoint_reward)
- # print("alt_reward: ", alt_reward, " action_cost: ", -action_cost)
- # print("\t reward", wp_reward + toward_waypoint_reward + alt_reward - action_cost if not collided else 0)
- return wp_reward + toward_waypoint_reward + alt_reward - action_cost if not collided else 0
+ action_cost = action_coeff * quadratic_control_cost(next_state[11:15])
+ #print('\t\t\t\t\t\tACTION COST:', action_cost, next_state[11:15])
+ # waypoint_rel_unit = torch.nn.functional.normalize(next_state[10:13], dim=0)
+ # vel = next_state[1:4]
+ # toward_waypoint_reward = wp_coeff * float(torch.dot(vel, waypoint_rel_unit).detach())
+ toward_waypoint_reward = 0
+ if not sim.takeoff_rewarded and sim.completed_takeoff:
+ takeoff_reward = 500
+ sim.takeoff_rewarded = True
+ else:
+ takeoff_reward = 0
+ # toward_waypoint_reward = 0
+ # toward_waypoint_reward = torch.min(torch.tensor(10), 0.01 * 1 / torch.max(torch.tensor(0.1), torch.sum(next_state[10:13]**2)))
+ return wp_reward + toward_waypoint_reward + takeoff_reward - action_cost if not collided else -500
return wp_reward
"""
@@ -182,7 +258,7 @@ def wp_reward(action, next_state, collided, wp_coeff=0.01, action_coeff=0.01, al
rollout, including states/actions/rewards the agent experienced.
"""
class MDPDataCollector:
- def __init__(self, sim, reward_fn, expected_trajectory_length, state_dim=13, action_dim=4):
+ def __init__(self, sim, reward_fn, expected_trajectory_length, state_dim=15, action_dim=4):
# parameters
self.sim = sim
self.reward_fn = reward_fn
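
# The heading-error observation (state[10]) above wraps the difference into [-pi, pi]
# with two branches; an equivalent helper (up to the +/-pi boundary) for sanity checks:
import math

def wrap_to_pi(angle):
    # map any angle in radians into [-pi, pi)
    return (angle + math.pi) % (2.0 * math.pi) - math.pi

assert abs(wrap_to_pi(3.0 * math.pi / 2.0) - (-math.pi / 2.0)) < 1e-9
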
diff --git a/simulation/simulate.py b/simulation/simulate.py
index b0ab2a3..bbec2c7 100644
--- a/simulation/simulate.py
+++ b/simulation/simulate.py
@@ -1,7 +1,9 @@
+from shared import HidePrints
from simulation.jsbsim_simulator import Simulation
from simulation.jsbsim_aircraft import Aircraft, x8
import simulation.jsbsim_properties as prp
-from learning.autopilot import AutopilotLearner
+from learning.autopilot import AutopilotLearner, SlewRateAutopilotLearner
+import torch
import simulation.mdp as mdp
import os
import numpy as np
@@ -13,10 +15,10 @@
class FullIntegratedSim:
def __init__(self,
aircraft: Aircraft,
- autopilot: AutopilotLearner,
+ autopilot: SlewRateAutopilotLearner,
sim_time: float,
display_graphics: bool = True,
- agent_interaction_frequency: int = 1,
+ agent_interaction_frequency: int = 10,
airsim_frequency_hz: float = 392.0,
sim_frequency_hz: float = 240.0,
init_conditions: bool = None,
@@ -44,6 +46,9 @@ def __init__(self,
# Triggered when sim is complete
self.done: bool = False
+ # Set when the sim ends early because of an unhealthy state or a sim failure
+ self.unhealthy_termination: bool = False
+
self.initial_collision = False
"""
Run loop for one simulation.
@@ -76,14 +81,20 @@ def simulation_loop(self):
while i < update_num:
# Do autopilot controls
try:
- state, action, log_prob = mdp.enact_autopilot(self.sim, self.autopilot)
+ #state, action, log_prob = mdp.enact_autopilot(self.sim, self.autopilot)
+ state, action, log_prob, control = mdp.query_slewrate_autopilot(self.sim, self.autopilot)
+ if torch.isnan(state).any():
+ break
except Exception as e:
print(e)
+ self.unhealthy_termination = True
# If enacting the autopilot fails, end the simulation immediately
break
# Update sim while waiting for next agent interaction
while True:
+ mdp.update_sim_from_slewrate_control(self.sim, control, self.autopilot)
+
# Run another sim step
self.sim.run()
@@ -100,28 +111,34 @@ def simulation_loop(self):
# Check for collisions via airsim and terminate if there is one
if self.sim.get_collision_info().has_collided:
if self.initial_collision:
- print('Aircraft has collided.')
+ # print('Aircraft has collided.')
self.done = True
self.sim.reinitialize()
else:
- print("Aircraft completed initial landing")
+ # print("Aircraft completed initial landing")
self.initial_collision = True
# Exit if sim is over or it's time for another agent interaction
if self.done or i % self.agent_interaction_frequency == 0:
break
-
+
# Get new state
try:
next_state = mdp.state_from_sim(self.sim)
- except:
+ if torch.isnan(next_state).any():
+ next_state = state
+ self.done = True
+ except Exception as e:
# If we couldn't acquire the state, something crashed with jsbsim
# We treat that as the end of the simulation and don't update the state
+ print("\t\t\t\t\t\t\t\t\t\t", e)
next_state = state
self.done = True
+ self.unhealthy_termination = True
# Data collection update for this step
- self.mdp_data_collector.update(int(i/self.agent_interaction_frequency)-1, state, action, log_prob, next_state, self.done)
+ self.mdp_data_collector.update(int(i/self.agent_interaction_frequency)-1, state, action,
+ log_prob, next_state, self.unhealthy_termination)
# End if collided
if self.done == True:
@@ -130,6 +147,8 @@ def simulation_loop(self):
self.done = True
self.mdp_data_collector.terminate(int(i/self.agent_interaction_frequency))
print('Simulation complete.')
+ print('\t\t\t\t\t\t\t\t\t\tCum reward:', self.mdp_data_collector.cum_reward)
+
"""
Replays a simulation
@@ -151,6 +170,7 @@ def simulation_replay(self, actions):
# NOTE: for replays, the agent interaction frequency must match what
# it was when the trajectory was created
if i % self.agent_interaction_frequency == 0:
+ print("Step")
break
if __name__ == "__main__":
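
# Rough timing math for the defaults above: with sim_frequency_hz=240 and
# agent_interaction_frequency=10, the agent picks a new slew direction 24 times per
# second, while update_sim_from_slewrate_control integrates the command every sim step,
# so e.g. the throttle command can move by at most 10 * 0.005 = 0.05 between decisions.
sim_frequency_hz = 240.0
agent_interaction_frequency = 10
decisions_per_second = sim_frequency_hz / agent_interaction_frequency   # 24.0
max_throttle_delta_per_decision = agent_interaction_frequency * 0.005   # 0.05 (throttle_slew_rate)
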
diff --git a/waypoints.csv b/waypoints.csv
index 7b87c80..aa70d84 100644
--- a/waypoints.csv
+++ b/waypoints.csv
@@ -1,7 +1,8 @@
waypoint_id,x,y,z,relative_x,relative_y,relative_z
1,-4461,-417.960938,90,21.5,-0.2,0.89
-2,-701,-417.960938,120,59.1,-0.2,1.19
-3,2869,-57.960938,450,94.8,3.4,4.49
-4,6509,-967.960938,1230,131.2,-5.7,12.29
+2,-1891,-487.960938,120,47.2,-0.9,1.19
+3,3006,-461,586,96.17,-0.63039062,5.85
+4,6639,-237.960938,730,132.5,1.6,7.29
5,9469,-477.960938,1480,160.8,-0.8,14.79
-6,15619,-397.960938,2920,222.3,0,29.19
\ No newline at end of file
+6,11919,-287.960938,1980,185.3,1.1,19.79
+7,15619,-407.960938,2920,222.3,-0.1,29.19
\ No newline at end of file
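
# The relative_* columns in waypoints.csv appear consistent with a fixed offset and a /100
# scale, i.e. relative = (absolute - origin) / 100 with origin (-6611, -397.960938, 1);
# this is inferred from the rows, not documented in the patch. Quick check against the
# edited waypoint 3:
origin = (-6611.0, -397.960938, 1.0)
wp3 = (3006.0, -461.0, 586.0)
relative = tuple((a - o) / 100.0 for a, o in zip(wp3, origin))
assert all(abs(r - e) < 1e-6 for r, e in zip(relative, (96.17, -0.63039062, 5.85)))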