diff --git a/assets/x8.xml b/assets/x8.xml index 3368f7b..bcc8bde 100644 --- a/assets/x8.xml +++ b/assets/x8.xml @@ -70,10 +70,10 @@ 0.8 0.5 - 0.1 + .02 8.0 - 0.85 - 0 + 2.85 + 360 NONE 0 @@ -85,10 +85,10 @@ 0.8 0.5 - 0.1 + 0.03 4.0 - 0.95 - 0.0 + 1.95 + 360 LEFT 0 @@ -100,10 +100,10 @@ 0.8 0.5 - 0.1 + 0.03 4.0 - 0.95 - 0.0 + 1.95 + 360 RIGHT 0 @@ -145,7 +145,7 @@ attitude/heading-true-rad - 0.003 + 0.0 diff --git a/data/ppo/stats.txt b/data/ppo/stats.txt new file mode 100644 index 0000000..e69de29 diff --git a/learning/autopilot.py b/learning/autopilot.py index 5cd3f47..d20c6ed 100644 --- a/learning/autopilot.py +++ b/learning/autopilot.py @@ -4,6 +4,9 @@ from tensordict.nn.distributions import NormalParamExtractor from tensordict.nn import TensorDictModule, InteractionType from torchrl.modules import ProbabilisticActor, TanhNormal, ValueOperator +from torch.distributions import Categorical +from tensordict.nn import CompositeDistribution +from learning.utils import CategoricalControlsExtractor import os """ @@ -20,7 +23,7 @@ def __init__(self): - 3 angular velocities: w_roll, w_pitch, w_yaw - 3d relative position of next waypoint: wx, wy, wz """ - self.inputs = 13 + self.inputs = 15 """ Action: @@ -36,7 +39,7 @@ def __init__(self): nn.Linear(self.inputs, self.inputs), nn.ReLU(), nn.Linear(self.inputs, self.outputs), - nn.Sigmoid(), + nn.Tanh(), ) # Returns the action selected and 0, representing the log-prob of the @@ -50,10 +53,10 @@ def get_control(self, action): Clamps various control outputs and sets the mean for control surfaces to 0. Assumes [action] is a 4-item tensor of throttle, aileron cmd, elevator cmd, rudder cmd. """ - action[0] = 0.5 * action[0] - action[1] = 0.1 * (action[1] - 0.5) - action[2] = 0.5 * (action[2] - 0.5) - action[3] = 0.5 * (action[3] - 0.5) + action[0] = 0.8 * (0.5*(action[0] + 1)) + action[1] = 0.1 * action[1] + action[2] = 0.4 * action[2] + action[3] = 0.1 * action[3] return action # flattened_params = flattened dx1 numpy array of all params to init from @@ -82,12 +85,11 @@ def init_from_params(self, flattened_params): layer1, nn.ReLU(), layer2, - nn.Sigmoid(), + nn.Tanh() ) - # Loads the network from dir/name.pth - def init_from_saved(self, dir, name): - path = os.path.join(dir, name + '.pth') + # Loads the network from path + def init_from_saved(self, path): self.policy_network = torch.load(path) # Saves the network to dir/name.pth @@ -125,8 +127,8 @@ def transform_from_deterministic_learner(self): self.policy_network[-2] = nn.Linear(self.inputs, self.outputs * 2) # Update the output layer to include the original weights and biases - self.policy_network[-2].weight = nn.Parameter(torch.cat((w, torch.ones(w.shape) * 100), 0)) - self.policy_network[-2].bias = nn.Parameter(torch.cat((b, torch.ones(b.shape) * 100), 0)) + self.policy_network[-2].weight = nn.Parameter(torch.cat((w, torch.zeros(w.shape)), 0)) + self.policy_network[-2].bias = nn.Parameter(torch.cat((b, torch.zeros(b.shape)), 0)) # Add a normal param extractor to the network to extract (means, sigmas) tuple self.policy_network.append(NormalParamExtractor()) @@ -141,10 +143,6 @@ def transform_from_deterministic_learner(self): module=policy_module, in_keys=["loc", "scale"], distribution_class=TanhNormal, - distribution_kwargs={ - "min": 0, # minimum control - "max": 1, # maximum control - }, default_interaction_type=InteractionType.RANDOM, return_log_prob=True, ) @@ -153,7 +151,6 @@ def transform_from_deterministic_learner(self): def get_action(self, observation): data = TensorDict({"observation": 
observation}, []) policy_forward = self.policy_module(data) - print("action", self.policy_network(observation)) return policy_forward["action"], policy_forward["sample_log_prob"] # NOTE: This initializes from *deterministic* learner parameters and picks @@ -173,10 +170,114 @@ def init_from_saved(self, path): module=policy_module, in_keys=["loc", "scale"], distribution_class=TanhNormal, - distribution_kwargs={ - "min": 0, # minimum control - "max": 1, # maximum control - }, default_interaction_type=InteractionType.RANDOM, return_log_prob=True, ) + +""" +Slew rate autopilot learner. +Each control (throttle/aileron/elevator/rudder) has three options: +stay constant, go down, or go up. +""" +class SlewRateAutopilotLearner: + def __init__(self): + self.inputs = 15 + self.outputs = 4 + + # Slew rates are wrt sim clock + self.throttle_slew_rate = 0.005 + self.aileron_slew_rate = 0.0001 + self.elevator_slew_rate = 0.00025 + self.rudder_slew_rate = 0.0001 + + self.policy_network = nn.Sequential( + nn.Linear(self.inputs, self.inputs), + nn.ReLU(), + nn.Linear(self.inputs, 3 * self.outputs), + nn.Sigmoid(), + CategoricalControlsExtractor() + ) + + self.instantiate_policy_module() + + def instantiate_policy_module(self): + policy_module = TensorDictModule(self.policy_network, in_keys=["observation"], out_keys=[("params", "throttle", "probs"),("params", "aileron", "probs"),("params", "elevator", "probs"),("params", "rudder", "probs")]) + self.policy_module = policy_module = ProbabilisticActor( + module=policy_module, + in_keys=["params"], + distribution_class=CompositeDistribution, + distribution_kwargs={ + "distribution_map": { + "throttle": Categorical, + "aileron": Categorical, + "elevator": Categorical, + "rudder": Categorical, + } + }, + default_interaction_type=InteractionType.RANDOM, + return_log_prob=True + ) + + # Returns the action selected and the log_prob of that action + def get_action(self, observation): + data = TensorDict({"observation": observation}, []) + policy_forward = self.policy_module(data) + action = torch.Tensor([policy_forward['throttle'], policy_forward['aileron'], policy_forward['elevator'], policy_forward['rudder']]) + return action, policy_forward["sample_log_prob"] + + # Always samples the mode of the output for each control + def get_deterministic_action(self, observation): + throttle_probs, aileron_probs, elevator_probs, rudder_probs = self.policy_network(observation) + action = torch.Tensor([torch.argmax(throttle_probs), torch.argmax(aileron_probs), torch.argmax(elevator_probs), torch.argmax(rudder_probs)]) + return action, 0 + + # Apply a -1 transformation to the action to create control tensor such that: + # -1 means go down, 0 means stay same, and +1 means go up + def get_control(self, action): + return action - 1 + + # flattened_params = flattened dx1 numpy array of all params to init from + # NOTE: the way the params are broken up into the weights/biases of each layer + # would need to be manually edited for changes in network architecture + def init_from_params(self, flattened_params): + flattened_params = torch.from_numpy(flattened_params).to(torch.float32) + + pl, pr = 0, 0 + layer1 = nn.Linear(self.inputs, self.inputs) + pr += layer1.weight.nelement() + layer1.weight = nn.Parameter(flattened_params[pl:pr].reshape(layer1.weight.shape)) + pl = pr + pr += layer1.bias.nelement() + layer1.bias = nn.Parameter(flattened_params[pl:pr].reshape(layer1.bias.shape)) + + layer2 = nn.Linear(self.inputs, 3*self.outputs) + pl = pr + pr += layer2.weight.nelement() + 
layer2.weight = nn.Parameter(flattened_params[pl:pr].reshape(layer2.weight.shape)) + pl = pr + pr += layer2.bias.nelement() + layer2.bias = nn.Parameter(flattened_params[pl:pr].reshape(layer2.bias.shape)) + + self.policy_network = nn.Sequential( + layer1, + nn.ReLU(), + layer2, + nn.Sigmoid(), + CategoricalControlsExtractor() + ) + self.instantiate_policy_module() + + # Loads the network from path + def init_from_saved(self, path): + self.policy_network = torch.load(path) + self.instantiate_policy_module() + + # Saves the network to dir/name.pth + def save(self, dir, name): + path = os.path.join(dir, name + '.pth') + torch.save(self.policy_network, path) + + + + + diff --git a/learning/crossentropy.py b/learning/crossentropy.py index 677fcf1..a768312 100644 --- a/learning/crossentropy.py +++ b/learning/crossentropy.py @@ -1,6 +1,6 @@ import torch import numpy as np -from learning.autopilot import AutopilotLearner +from learning.autopilot import AutopilotLearner, SlewRateAutopilotLearner from simulation.simulate import FullIntegratedSim from simulation.jsbsim_aircraft import x8 import os @@ -25,7 +25,7 @@ def __init__(self, learners, num_params): def init_using_torch_default(generation_size, num_params): learners = [] for i in range(generation_size): - learners.append(AutopilotLearner()) + learners.append(SlewRateAutopilotLearner()) return Generation(learners, num_params) # Utilizes [rewards], which contains the reward obtained by each learner as a @@ -89,12 +89,12 @@ def make_new_generation(mean, cov, generation_size, num_params): # Generate each learner from params learners = [] for param_list in selected_params: - l = AutopilotLearner() + l = SlewRateAutopilotLearner() l.init_from_params(param_list) learners.append(l) return Generation(learners, num_params) -def cross_entropy_train(epochs, generation_size, num_survive, num_params=238, sim_time=60.0, save_dir='cross_entropy'): +def cross_entropy_train(epochs, generation_size, num_survive, num_params=432, sim_time=60.0, save_dir='cross_entropy'): # Create save_dir (and if one already exists, rename it with some rand int) if os.path.exists(os.path.join('data', save_dir)): os.rename(os.path.join('data', save_dir), os.path.join('data', save_dir + '_old' + str(randint(0, 100000)))) @@ -117,7 +117,7 @@ def cross_entropy_train(epochs, generation_size, num_survive, num_params=238, si # Evaluate generation through rollouts rewards = [] for i in range(len(generation.learners)): - id = str(100*(epoch+1) + (i+1)) + id = str(1000*(epoch+1) + (i+1)) learner = generation.learners[i] # Run simulation to evaluate learner @@ -127,7 +127,7 @@ def cross_entropy_train(epochs, generation_size, num_survive, num_params=238, si integrated_sim.simulation_loop() # Acquire/save data - integrated_sim.mdp_data_collector.save(os.path.join(save_dir, 'generation' + str(epoch+1)), 'trajectory_learner#' + str(i+1)) + #integrated_sim.mdp_data_collector.save(os.path.join(save_dir, 'generation' + str(epoch+1)), 'trajectory_learner#' + str(i+1)) rewards.append(integrated_sim.mdp_data_collector.get_cum_reward()) print('Reward for Learner #', id, ': ', integrated_sim.mdp_data_collector.get_cum_reward()) @@ -149,4 +149,4 @@ def cross_entropy_train(epochs, generation_size, num_survive, num_params=238, si if __name__ == "__main__": os.environ["JSBSIM_DEBUG"]=str(0) # epochs, generation_size, num_survive - cross_entropy_train(100, 99, 50) \ No newline at end of file + cross_entropy_train(100, 200, 50) \ No newline at end of file diff --git a/learning/ppo.py b/learning/ppo.py index 
e012f67..76138c2 100644 --- a/learning/ppo.py +++ b/learning/ppo.py @@ -7,11 +7,14 @@ from torchrl.data.replay_buffers.storages import LazyTensorStorage from torchrl.objectives import ClipPPOLoss from torchrl.objectives.value import GAE -from learning.autopilot import StochasticAutopilotLearner +from learning.autopilot import StochasticAutopilotLearner, SlewRateAutopilotLearner from simulation.simulate import FullIntegratedSim from simulation.jsbsim_aircraft import x8 +import numpy as np import os +device = "cpu" if not torch.has_cuda else "cuda:0" + """ Gathers rollout data and returns it in the way the Proximal Policy Optimization loss_module expects """ @@ -24,15 +27,27 @@ def gather_rollout_data(autopilot_learner, policy_num, num_trajectories=100, sim sample_log_probs = torch.empty(0) rewards = torch.empty(0) dones = torch.empty(0, dtype=torch.bool) + best_cum_reward = -float('inf') + worst_cum_reward = float('inf') + total_cum_reward = 0 + total_timesteps = 0 for t in range(num_trajectories): integrated_sim = FullIntegratedSim(x8, autopilot_learner, sim_time) integrated_sim.simulation_loop() # Acquire data observation, next_observation, action, sample_log_prob, reward, done = integrated_sim.mdp_data_collector.get_trajectory_data() + cum_reward = integrated_sim.mdp_data_collector.get_cum_reward() + total_cum_reward += cum_reward + total_timesteps += reward.shape[0] - # Save data - # integrated_sim.mdp_data_collector.save(os.path.join('ppo', 'trajectories'), 'rollout' + str(policy_num * num_trajectories + t)) + # Save trajectory if worst or best so far + if cum_reward > best_cum_reward: + integrated_sim.mdp_data_collector.save(os.path.join('ppo', 'trajectories'), 'best_rollout#' + str(policy_num)) + best_cum_reward = cum_reward + if cum_reward < worst_cum_reward: + integrated_sim.mdp_data_collector.save(os.path.join('ppo', 'trajectories'), 'worst_rollout#' + str(policy_num)) + worst_cum_reward = cum_reward # Add to the data observations = torch.cat((observations, observation)) @@ -47,14 +62,34 @@ def gather_rollout_data(autopilot_learner, policy_num, num_trajectories=100, sim # that entry for one step in a rollout. 
data = TensorDict({ "observation": observations, - "action": actions.detach(), + "action": TensorDict({ + "throttle": actions[:,0].detach(), + "aileron": actions[:,1].detach(), + "elevator": actions[:,2].detach(), + "rudder": actions[:,3].detach(), + "throttle_log_prob": torch.zeros(data_size).detach(), + "aileron_log_prob": torch.zeros(data_size).detach(), + "elevator_log_prob": torch.zeros(data_size).detach(), + "rudder_log_prob": torch.zeros(data_size).detach(), + }, [data_size,]), "sample_log_prob": sample_log_probs.detach(), # log probability that each action was selected + # "sample_log_prob": TensorDict({ + # }, [data_size,]), # log probability that each action was selected ("next", "done"): dones, ("next", "terminated"): dones, ("next", "reward"): rewards, ("next", "observation"): next_observations, }, [data_size,]) + # Write to stats file + stats_file = open(os.path.join('data', 'ppo', 'stats.txt'), 'a') + stats_file.write('Policy Number #' + str(policy_num) + ':\n') + stats_file.write('Average Time Steps =' + str(total_timesteps/num_trajectories) + ':\n') + stats_file.write('Average Reward: ' + str(total_cum_reward/num_trajectories) + '\n') + stats_file.write('Best Reward: ' + str(best_cum_reward) + '\n') + stats_file.write('Worst Reward: ' + str(worst_cum_reward) + '\n') + stats_file.write('\n\n') + return data, data_size """ @@ -64,12 +99,12 @@ def gather_rollout_data(autopilot_learner, policy_num, num_trajectories=100, sim action taken during PPO learning. """ def make_value_estimator_module(n_obs): + global device + value_net = nn.Sequential( - nn.Linear(n_obs, n_obs), + nn.Linear(n_obs, 2*n_obs, device=device), nn.Tanh(), - nn.Linear(n_obs, n_obs), - nn.Tanh(), - nn.Linear(n_obs, 1), # one value is computed for the state + nn.Linear(2*n_obs, 1, device=device), # one value is computed for the state ) return ValueOperator( @@ -115,15 +150,16 @@ def train_ppo_once(policy_num, autopilot_learner, loss_module, advantage_module, for b in range(0, data_size // batch_size): # Gather batch and calculate PPO loss on it batch = replay_buffer.sample(batch_size) - loss_vals = loss_module(batch.to("cpu")) + loss_vals = loss_module(batch.to(device)) loss_value = ( loss_vals["loss_objective"] + loss_vals["loss_critic"] - + loss_vals["loss_entropy"] ) # Optimize via gradient descent with the optimizer loss_value.backward() + + torch.nn.utils.clip_grad_norm_(loss_module.parameters(), 1.0) optimizer.step() optimizer.zero_grad() @@ -131,17 +167,21 @@ def train_ppo_once(policy_num, autopilot_learner, loss_module, advantage_module, if __name__ == "__main__": # Parameters (see https://pytorch.org/rl/tutorials/coding_ppo.html#ppo-parameters) batch_size = 100 - num_epochs = 10 + num_epochs = 100 + device = "cpu" if not torch.has_cuda else "cuda:0" + clip_epsilon = 0.2 # for PPO loss gamma = 1.0 lmbda = 0.95 entropy_eps = 1e-4 - lr = 3e-4 - num_trajectories = 100 + lr = 1e-4 + num_trajectories = 200 num_policy_iterations = 100 # Build the modules - autopilot_learner = StochasticAutopilotLearner() + #autopilot_learner = StochasticAutopilotLearner() + autopilot_learner = SlewRateAutopilotLearner() + # autopilot_learner.init_from_params(np.random.normal(0, 1, 350)) value_module = make_value_estimator_module(autopilot_learner.inputs) advantage_module = GAE( gamma=gamma, lmbda=lmbda, value_network=value_module, average_gae=True @@ -150,8 +190,8 @@ def train_ppo_once(policy_num, autopilot_learner, loss_module, advantage_module, actor=autopilot_learner.policy_module, critic=value_module, 
clip_epsilon=clip_epsilon, - entropy_bonus=bool(entropy_eps), - entropy_coef=entropy_eps, + entropy_bonus=False, + entropy_coef=0, value_target_key=advantage_module.value_target_key, critic_coef=1.0, gamma=gamma, diff --git a/learning/utils.py b/learning/utils.py new file mode 100644 index 0000000..f0f2002 --- /dev/null +++ b/learning/utils.py @@ -0,0 +1,11 @@ +import torch +import torch.nn as nn + +class CategoricalControlsExtractor(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, *tensors: torch.Tensor) -> tuple[torch.Tensor, ...]: + tensor, *others = tensors + throttle, aileron, elevator, rudder = tensor.chunk(4, -1) + return (nn.functional.softmax(throttle), nn.functional.softmax(aileron), nn.functional.softmax(elevator), nn.functional.softmax(rudder), *others) diff --git a/scripts/sim_autopilot.py b/scripts/sim_autopilot.py index c54daef..060fe23 100644 --- a/scripts/sim_autopilot.py +++ b/scripts/sim_autopilot.py @@ -6,7 +6,7 @@ and rolls out a trajectory in sim for that autopilot learner. """ -from learning.autopilot import AutopilotLearner +from learning.autopilot import AutopilotLearner, SlewRateAutopilotLearner from simulation.simulate import FullIntegratedSim from simulation.jsbsim_aircraft import x8 import sys @@ -17,7 +17,7 @@ file = os.path.join(*(["data"] + policy_file_path.split('/'))) # Initialize autopilot -autopilot = AutopilotLearner() +autopilot = SlewRateAutopilotLearner() autopilot.init_from_saved(file) # Play sim diff --git a/simulation/jsbsim_simulator.py b/simulation/jsbsim_simulator.py index 9cc5a11..308f428 100644 --- a/simulation/jsbsim_simulator.py +++ b/simulation/jsbsim_simulator.py @@ -84,7 +84,8 @@ def __init__(self, aircraft: Aircraft = x8, init_conditions: Dict[prp.Property, float] = None, debug_level: int = 0): - self.fdm = jsbsim.FGFDMExec(root_dir=self.ROOT_DIR) + with HidePrints(): + self.fdm = jsbsim.FGFDMExec(root_dir=self.ROOT_DIR) self.fdm.set_debug_level(debug_level) self.sim_dt = 1.0 / sim_frequency_hz self.aircraft = aircraft @@ -94,7 +95,9 @@ def __init__(self, self.wall_clock_dt = None self.update_airsim(ignore_collisions=True) self.waypoint_id = 0 - self.waypoint_threshold = 2 + self.waypoint_threshold = 3 + self.takeoff_rewarded = False + self.completed_takeoff = False self.waypoint_rewarded = True self.waypoints = [] with open("waypoints.csv", 'r') as file: @@ -241,8 +244,9 @@ def airsim_connect() -> airsim.VehicleClient: :return: the airsim client object """ - client = airsim.VehicleClient() - client.confirmConnection() + with HidePrints(): + client = airsim.VehicleClient() + client.confirmConnection() return client def update_airsim(self, ignore_collisions = False) -> None: diff --git a/simulation/mdp.py b/simulation/mdp.py index 58a292f..fbced7e 100644 --- a/simulation/mdp.py +++ b/simulation/mdp.py @@ -4,6 +4,7 @@ states/observations, actions, and rewards. """ +import math import torch import simulation.jsbsim_properties as prp import numpy as np @@ -12,28 +13,29 @@ """ Extracts agent state data from the sim. 
""" -def state_from_sim(sim, debug=False): - state = torch.zeros(13,) +def state_from_sim(sim): + state = torch.zeros(15,) FT_TO_M = 0.3048 # altitude state[0] = sim[prp.altitude_sl_ft] * FT_TO_M # z - # velocity - state[1] = sim[prp.v_north_fps] * FT_TO_M # x velocity - state[2] = sim[prp.v_east_fps] * FT_TO_M # y velocity - state[3] = -sim[prp.v_down_fps] * FT_TO_M # z velocity - - # angles - state[4] = sim[prp.roll_rad] # roll - state[5] = sim[prp.pitch_rad] # pitch - state[6] = sim[prp.heading_rad] # yaw + # speed + state[1] = FT_TO_M * np.linalg.norm(np.array([sim[prp.v_north_fps], sim[prp.v_east_fps], -sim[prp.v_down_fps]])) + + # roll + state[2] = sim[prp.roll_rad] + # pitch + state[3] = sim[prp.pitch_rad] + # angle rates - state[7] = sim[prp.p_radps] # roll rate - state[8] = sim[prp.q_radps] # pitch rate - state[9] = sim[prp.r_radps] # yaw rate + state[4] = sim[prp.p_radps] # roll rate + state[5] = sim[prp.q_radps] # pitch rate + state[6] = sim[prp.r_radps] # yaw rate + + state[7] = -sim[prp.v_down_fps] * FT_TO_M # z velocity # next waypoint (relative) position = np.array(sim.get_local_position()) @@ -41,25 +43,70 @@ def state_from_sim(sim, debug=False): displacement = waypoint - position + # waypoint ground distance + state[8] = np.linalg.norm(np.array(displacement[0:2])) + + # waypoint altitude distance + state[9] = displacement[2] + + # angles + waypoint_heading = np.arctan2(displacement[1], displacement[0]) + heading = sim[prp.heading_rad] # yaw + + state[10] = waypoint_heading - heading + + if state[10] < -math.pi: + state[10] += 2 * math.pi + elif state[10] > math.pi: + state[10] -= 2* math.pi + + + # print("\t\t\t\t\t\t\t\t\t\t YAW", heading/math.pi*180) + # print("\t\t\t\t\t\t\t\t\t\t waypoint_heading", waypoint_heading/math.pi*180) + + + # print("\t\t\t\t\t\t\t\t\t\theading", state[10]/math.pi*180) + # print("\t\t\t\t\t\t\t\t\t\tdist", state[8]) + # print("\t\t\t\t\t\t\t\t\t\theight", state[9]) + + + + state[11] = sim[prp.throttle_cmd] / 0.8 + state[12] = sim[prp.aileron_cmd] / 0.1 + state[13] = sim[prp.elevator_cmd] / 0.4 + state[14] = sim[prp.rudder_cmd] / 0.1 + + + if state[0] >= 2: + if not sim.completed_takeoff: + print("\t\t\t\t\t\t\t\t\t\tTake off!") + sim.completed_takeoff = True + if np.linalg.norm(displacement) <= sim.waypoint_threshold: - print("Waypoint Hit!") + print(f"\t\t\t\t\t\t\t\t\t\tWaypoint {sim.waypoint_id} Hit!") sim.waypoint_id += 1 sim.waypoint_rewarded = False - state[10] = displacement[0] - state[11] = displacement[1] - state[12] = displacement[2] + if is_unhealthy_state(state): + raise Exception("Unhealthy state, do better") - if debug: - print('State!') - print('Altitude:', state[0]) - print('Velocity: (', state[1], state[2], state[3], ')') - print('Roll:', state[4], '; Pitch:', state[5], '; Yaw:', state[6]) - print('RollRate:', state[7], '; PitchRate:', state[8], '; YawRate:', state[9]) - print('Relative WP: (', state[10], state[11], state[12], ')') return state +""" +Returns a bool for whether the state is unhealthy +""" +def is_unhealthy_state(state): + MAX_BANK = math.pi / 3 + MAX_PITCH = math.pi / 4 + if np.linalg.norm(state[8:10]) > 75: + return True + if not -MAX_BANK < state[2] < MAX_BANK: + return True + if not -MAX_PITCH < state[3] < MAX_PITCH: + return True + return False + """ Updates sim according to a control, assumes [control] is a 4-item tensor of @@ -72,6 +119,23 @@ def update_sim_from_control(sim, control, debug=False): sim[prp.rudder_cmd] = control[3] if debug: print('Control Taken:', control) + + +# Get the 
state/action/log_prob and control from the slewrate autopilot +def query_slewrate_autopilot(sim, autopilot, deterministic=True): + state = state_from_sim(sim) + action, log_prob = autopilot.get_deterministic_action(state) if deterministic else autopilot.get_action(state) + control = autopilot.get_control(action) + + return state, action, log_prob, control + +# Called every single sim step to enact slewrate +# Assumes autopilot is a SlewRateAutopilotLearner +def update_sim_from_slewrate_control(sim, control, autopilot): + sim[prp.throttle_cmd] = np.clip(sim[prp.throttle_cmd] + control[0] * autopilot.throttle_slew_rate, 0, 0.8) + sim[prp.aileron_cmd] = np.clip(sim[prp.aileron_cmd] + control[1] * autopilot.aileron_slew_rate, -0.1, 0.1) + sim[prp.elevator_cmd] = np.clip(sim[prp.elevator_cmd] + control[2] * autopilot.elevator_slew_rate, -0.4, 0.4) + sim[prp.rudder_cmd] = np.clip(sim[prp.rudder_cmd] + control[3] * autopilot.rudder_slew_rate, -0.1, 0.1) """ Follows a predetermined sequence of controls, instead of using autopilot. @@ -115,18 +179,25 @@ def enact_predetermined_controls(sim, autopilot): Basically just updates sim throttle / control surfaces according to the autopilot. """ def enact_autopilot(sim, autopilot): - state = state_from_sim(sim, debug=False) + state = state_from_sim(sim) action, log_prob = autopilot.get_action(state) - update_sim_from_control(sim, autopilot.get_control(action)) return state, action, log_prob + # Takes in the action outputted directly from the network and outputs the # normalized quadratic action cost from 0-1 def quadratic_action_cost(action): - action[1:4] = 2*action[1:4] - 1 # convert control surfaces to [-1, 1] - return float(torch.dot(action, action).detach()) / 4 # divide by 4 to be 0-1 + action_cost_weights = torch.tensor([1.0, 20.0, 10.0, 1.0]) + action[0] = 0.5 * (action[0] + 1) # converts throttle to be 0-1 + return float(torch.dot(action ** 2, action_cost_weights).detach() / sum(action_cost_weights)) # divide by 4 to be 0-1 + +# Takes in the action outputted directly from the network and outputs the +# normalized quadratic action cost from 0-1 +def quadratic_control_cost(control): + control_cost_weights = torch.tensor([1.0, 10.0, 5.0, 1.0]) + return float(torch.dot(control ** 2, control_cost_weights).detach() / sum(control_cost_weights)) # divide by 4 to be 0-1 """ The reward function for the bb autopilots. Since they won't know how to fly, @@ -134,6 +205,7 @@ def quadratic_action_cost(action): + 1 if moving with some velocity threshold vel_reward_threshold + alt_reward_threshold if flying above ground with some threshold alt_reward_threshold - action_coeff * the quadratic control cost + !!! WARNING: Deprecated """ def bb_reward(action, next_state, collided, alt_reward_coeff=10, action_coeff=1, alt_reward_threshold=2, vel_reward_threshold=1): moving_reward = 1 if (next_state[3]**2 + next_state[4]**2 + next_state[5]**2) > vel_reward_threshold else 0 @@ -145,6 +217,7 @@ def bb_reward(action, next_state, collided, alt_reward_coeff=10, action_coeff=1, A reward function that tries to incentivize the plane to go towards the waypoint at each timestep, while also rewarding for being above ground and penalizing for high control effort + !!! 
WARNING: Deprecated """ def new_init_wp_reward(action, next_state, collided, wp_coeff=1, action_coeff=1, alt_reward_threshold=5): alt_reward = 1 if next_state[2] > alt_reward_threshold else 0 @@ -152,29 +225,32 @@ def new_init_wp_reward(action, next_state, collided, wp_coeff=1, action_coeff=1, waypoint_rel_unit = next_state[10:13] / torch.norm(next_state[10:13]) vel = next_state[1:4] - toward_waypoint_reward = wp_coeff * float(torch.dot((vel ** 2 * torch.sign(vel)), waypoint_rel_unit).detach()) + toward_waypoint_reward = wp_coeff * float(torch.dot(vel, waypoint_rel_unit).detach()) return toward_waypoint_reward + alt_reward - action_cost if not collided else 0 """ -A reward function. +A reward function getter. """ def get_wp_reward(sim): - def wp_reward(action, next_state, collided, wp_coeff=0.01, action_coeff=0.01, alt_reward_threshold=10): + def wp_reward(action, next_state, collided, wp_coeff=0.1, action_coeff=0.5): if not sim.waypoint_rewarded: sim.waypoint_rewarded = True - wp_reward = 1_000_000 + wp_reward = 1_000 else: wp_reward = 0 - # print("altitude", next_state[2] ) - alt_reward = 0.1 if next_state[2] > alt_reward_threshold else 0 - action_cost = action_coeff * quadratic_action_cost(action) - - waypoint_rel_unit = next_state[10:13] / torch.norm(next_state[10:13]) - vel = next_state[1:4] - toward_waypoint_reward = wp_coeff * float(torch.dot((vel ** 2 * torch.sign(vel)), waypoint_rel_unit).detach()) - # print("wp_reward: ", wp_reward, " toward_waypoint_reward: ", toward_waypoint_reward) - # print("alt_reward: ", alt_reward, " action_cost: ", -action_cost) - # print("\t reward", wp_reward + toward_waypoint_reward + alt_reward - action_cost if not collided else 0) - return wp_reward + toward_waypoint_reward + alt_reward - action_cost if not collided else 0 + action_cost = action_coeff * quadratic_control_cost(next_state[11:15]) + #print('\t\t\t\t\t\tACTION COST:', action_cost, next_state[11:15]) + # waypoint_rel_unit = torch.nn.functional.normalize(next_state[10:13], dim=0) + # vel = next_state[1:4] + # toward_waypoint_reward = wp_coeff * float(torch.dot(vel, waypoint_rel_unit).detach()) + toward_waypoint_reward = 0 + if not sim.takeoff_rewarded and sim.completed_takeoff: + takeoff_reward = 500 + sim.takeoff_rewarded = True + else: + takeoff_reward = 0 + # toward_waypoint_reward = 0 + # toward_waypoint_reward = torch.min(torch.tensor(10), 0.01 * 1 / torch.max(torch.tensor(0.1), torch.sum(next_state[10:13]**2))) + return wp_reward + toward_waypoint_reward + takeoff_reward - action_cost if not collided else -500 return wp_reward """ @@ -182,7 +258,7 @@ def wp_reward(action, next_state, collided, wp_coeff=0.01, action_coeff=0.01, al rollout, including states/actions/rewards the agent experienced. 
""" class MDPDataCollector: - def __init__(self, sim, reward_fn, expected_trajectory_length, state_dim=13, action_dim=4): + def __init__(self, sim, reward_fn, expected_trajectory_length, state_dim=15, action_dim=4): # parameters self.sim = sim self.reward_fn = reward_fn diff --git a/simulation/simulate.py b/simulation/simulate.py index b0ab2a3..bbec2c7 100644 --- a/simulation/simulate.py +++ b/simulation/simulate.py @@ -1,7 +1,9 @@ +from shared import HidePrints from simulation.jsbsim_simulator import Simulation from simulation.jsbsim_aircraft import Aircraft, x8 import simulation.jsbsim_properties as prp -from learning.autopilot import AutopilotLearner +from learning.autopilot import AutopilotLearner, SlewRateAutopilotLearner +import torch import simulation.mdp as mdp import os import numpy as np @@ -13,10 +15,10 @@ class FullIntegratedSim: def __init__(self, aircraft: Aircraft, - autopilot: AutopilotLearner, + autopilot: SlewRateAutopilotLearner, sim_time: float, display_graphics: bool = True, - agent_interaction_frequency: int = 1, + agent_interaction_frequency: int = 10, airsim_frequency_hz: float = 392.0, sim_frequency_hz: float = 240.0, init_conditions: bool = None, @@ -44,6 +46,9 @@ def __init__(self, # Triggered when sim is complete self.done: bool = False + # + self.unhealthy_termination: bool = False + self.initial_collision = False """ Run loop for one simulation. @@ -76,14 +81,20 @@ def simulation_loop(self): while i < update_num: # Do autopilot controls try: - state, action, log_prob = mdp.enact_autopilot(self.sim, self.autopilot) + #state, action, log_prob = mdp.enact_autopilot(self.sim, self.autopilot) + state, action, log_prob, control = mdp.query_slewrate_autopilot(self.sim, self.autopilot) + if torch.isnan(state).any(): + break except Exception as e: print(e) + self.unhealthy_termination = True # If enacting the autopilot fails, end the simulation immediately break # Update sim while waiting for next agent interaction while True: + mdp.update_sim_from_slewrate_control(self.sim, control, self.autopilot) + # Run another sim step self.sim.run() @@ -100,28 +111,34 @@ def simulation_loop(self): # Check for collisions via airsim and terminate if there is one if self.sim.get_collision_info().has_collided: if self.initial_collision: - print('Aircraft has collided.') + # print('Aircraft has collided.') self.done = True self.sim.reinitialize() else: - print("Aircraft completed initial landing") + # print("Aircraft completed initial landing") self.initial_collision = True # Exit if sim is over or it's time for another agent interaction if self.done or i % self.agent_interaction_frequency == 0: break - + # Get new state try: next_state = mdp.state_from_sim(self.sim) - except: + if torch.isnan(next_state).any(): + next_state = state + self.done = True + except Exception as e: # If we couldn't acquire the state, something crashed with jsbsim # We treat that as the end of the simulation and don't update the state + print("\t\t\t\t\t\t\t\t\t\t", e) next_state = state self.done = True + self.unhealthy_termination = True # Data collection update for this step - self.mdp_data_collector.update(int(i/self.agent_interaction_frequency)-1, state, action, log_prob, next_state, self.done) + self.mdp_data_collector.update(int(i/self.agent_interaction_frequency)-1, state, action, + log_prob, next_state, self.unhealthy_termination) # End if collided if self.done == True: @@ -130,6 +147,8 @@ def simulation_loop(self): self.done = True 
self.mdp_data_collector.terminate(int(i/self.agent_interaction_frequency)) print('Simulation complete.') + print('\t\t\t\t\t\t\t\t\t\tCum reward:', self.mdp_data_collector.cum_reward) + """ Replays a simulation @@ -151,6 +170,7 @@ def simulation_replay(self, actions): # NOTE: for replays, the agent interaction frequency must match what # it was when the trajectory was created if i % self.agent_interaction_frequency == 0: + print("Step") break if __name__ == "__main__": diff --git a/waypoints.csv b/waypoints.csv index 7b87c80..aa70d84 100644 --- a/waypoints.csv +++ b/waypoints.csv @@ -1,7 +1,8 @@ waypoint_id,x,y,z,relative_x,relative_y,relative_z 1,-4461,-417.960938,90,21.5,-0.2,0.89 -2,-701,-417.960938,120,59.1,-0.2,1.19 -3,2869,-57.960938,450,94.8,3.4,4.49 -4,6509,-967.960938,1230,131.2,-5.7,12.29 +2,-1891,-487.960938,120,47.2,-0.9,1.19 +3,3006,-461,586,96.17,-0.63039062,5.85 +4,6639,-237.960938,730,132.5,1.6,7.29 5,9469,-477.960938,1480,160.8,-0.8,14.79 -6,15619,-397.960938,2920,222.3,0,29.19 \ No newline at end of file +6,11919,-287.960938,1980,185.3,1.1,19.79 +7,15619,-407.960938,2920,222.3,-0.1,29.19 \ No newline at end of file
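For reference, a minimal standalone sketch of the control scaling that the reworked AutopilotLearner.get_control in learning/autopilot.py applies now that the network ends in Tanh instead of Sigmoid. scale_tanh_action is an illustrative name, not part of the repo.

import torch

def scale_tanh_action(action: torch.Tensor) -> torch.Tensor:
    # Map a 4-dim tanh output in [-1, 1] onto the commanded control ranges:
    # throttle [0, 0.8], aileron [-0.1, 0.1], elevator [-0.4, 0.4], rudder [-0.1, 0.1].
    scaled = action.clone()
    scaled[0] = 0.8 * (0.5 * (scaled[0] + 1))
    scaled[1] = 0.1 * scaled[1]
    scaled[2] = 0.4 * scaled[2]
    scaled[3] = 0.1 * scaled[3]
    return scaled

print(scale_tanh_action(torch.tensor([1.0, -1.0, 0.5, 0.0])))  # -> tensor([0.8000, -0.1000, 0.2000, 0.0000])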
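The new SlewRateAutopilotLearner wires its policy through torchrl's ProbabilisticActor with a CompositeDistribution of four Categoricals. A rough plain-torch sketch of the same idea (a three-way choice per control, mapped to a slew direction by subtracting 1), deliberately without the torchrl wrappers:

import torch
import torch.nn as nn
from torch.distributions import Categorical

inputs, controls = 15, 4
policy = nn.Sequential(
    nn.Linear(inputs, inputs),
    nn.ReLU(),
    nn.Linear(inputs, 3 * controls),   # 3 logits per control
)

obs = torch.zeros(inputs)
per_control_logits = policy(obs).chunk(controls, dim=-1)
dists = [Categorical(logits=l) for l in per_control_logits]
action = torch.stack([d.sample() for d in dists])             # each in {0, 1, 2}
log_prob = sum(d.log_prob(a) for d, a in zip(dists, action))  # joint log-prob
direction = action - 1                                        # {-1: down, 0: hold, +1: up}
print(direction, log_prob)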
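crossentropy.py's default num_params moves from 238 to 432 because the slew-rate policy is now a 15 -> 15 -> 12 fully connected network. A quick sanity check of that count (sketch only):

def linear_param_count(n_in, n_out):
    return n_in * n_out + n_out  # weights + biases

inputs, outputs = 15, 4
print(linear_param_count(inputs, inputs) + linear_param_count(inputs, 3 * outputs))  # 432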
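The PPO critic in learning/ppo.py is reshaped from two hidden layers of width n_obs to a single hidden layer of width 2*n_obs. A standalone sketch of just the torch part, without the ValueOperator wrapper (make_value_net is an illustrative name):

import torch
import torch.nn as nn

def make_value_net(n_obs: int, device: str = "cpu") -> nn.Sequential:
    return nn.Sequential(
        nn.Linear(n_obs, 2 * n_obs, device=device),
        nn.Tanh(),
        nn.Linear(2 * n_obs, 1, device=device),  # one value per state
    )

print(make_value_net(15)(torch.zeros(15)).shape)  # torch.Size([1])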
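The training loop now drops the entropy term from the PPO objective and clips gradients to unit norm before each optimizer step. A toy sketch of that update pattern with a stand-in module and loss, not the repo's loss_module:

import torch
import torch.nn as nn

model = nn.Linear(15, 1)   # stand-in for the PPO loss_module's parameters
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

loss_value = model(torch.randn(8, 15)).pow(2).mean()     # stand-in for loss_objective + loss_critic
loss_value.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # clip before stepping
optimizer.step()
optimizer.zero_grad()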
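learning/utils.py's CategoricalControlsExtractor calls nn.functional.softmax without an explicit dim, which triggers PyTorch's implicit-dimension warning. A variant with the dim spelled out, shown here as a sketch rather than a drop-in patch:

import torch
import torch.nn as nn

class CategoricalControlsExtractor(nn.Module):
    # Split the final 12 activations into four 3-way probability vectors
    # (throttle, aileron, elevator, rudder), softmax over the last dim.
    def forward(self, *tensors: torch.Tensor):
        tensor, *others = tensors
        throttle, aileron, elevator, rudder = tensor.chunk(4, -1)
        soft = lambda t: nn.functional.softmax(t, dim=-1)
        return (soft(throttle), soft(aileron), soft(elevator), soft(rudder), *others)

probs = CategoricalControlsExtractor()(torch.zeros(12))
print([p.tolist() for p in probs])  # four uniform [1/3, 1/3, 1/3] distributions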
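state_from_sim in simulation/mdp.py now stores the waypoint bearing error in state[10] and wraps it into [-pi, pi]. The wrap logic in isolation (wrap_to_pi is an illustrative helper, not in the repo):

import math

def wrap_to_pi(angle: float) -> float:
    # Assumes the raw difference of two headings in [0, 2*pi), i.e. a value in (-2*pi, 2*pi).
    if angle < -math.pi:
        angle += 2 * math.pi
    elif angle > math.pi:
        angle -= 2 * math.pi
    return angle

print(wrap_to_pi(math.radians(350) - math.radians(10)))  # ~ -0.349 rad, i.e. -20 degrees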
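The new is_unhealthy_state check aborts a rollout when the combined horizontal/vertical distance to the next waypoint exceeds 75 (in the sim's local-position units) or the attitude exceeds 60 degrees of bank or 45 degrees of pitch. A standalone restatement against the 15-dim state layout, using torch.norm in place of np.linalg.norm:

import math
import torch

MAX_BANK = math.pi / 3    # 60 deg roll limit
MAX_PITCH = math.pi / 4   # 45 deg pitch limit

def is_unhealthy_state(state: torch.Tensor) -> bool:
    # state[2]=roll, state[3]=pitch, state[8]=waypoint ground distance, state[9]=altitude error
    if float(torch.norm(state[8:10])) > 75:
        return True
    if not -MAX_BANK < state[2] < MAX_BANK:
        return True
    if not -MAX_PITCH < state[3] < MAX_PITCH:
        return True
    return False

print(is_unhealthy_state(torch.zeros(15)))  # False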
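update_sim_from_slewrate_control integrates each {-1, 0, +1} direction into the commanded controls at the learner's per-step slew rates and clamps the result. The same arithmetic as a pure function (apply_slew is an illustrative name):

import numpy as np

# Slew rates per sim step, from SlewRateAutopilotLearner.
THROTTLE_RATE, AILERON_RATE, ELEVATOR_RATE, RUDDER_RATE = 0.005, 0.0001, 0.00025, 0.0001

def apply_slew(controls, direction):
    throttle, aileron, elevator, rudder = controls
    return (
        float(np.clip(throttle + direction[0] * THROTTLE_RATE, 0, 0.8)),
        float(np.clip(aileron + direction[1] * AILERON_RATE, -0.1, 0.1)),
        float(np.clip(elevator + direction[2] * ELEVATOR_RATE, -0.4, 0.4)),
        float(np.clip(rudder + direction[3] * RUDDER_RATE, -0.1, 0.1)),
    )

print(apply_slew((0.8, 0.1, 0.0, 0.0), (+1, +1, -1, 0)))  # (0.8, 0.1, -0.00025, 0.0): already-saturated channels stay clamped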
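quadratic_control_cost weights the squared normalized controls (state[11:15], each already scaled by its command range) and divides by the weight sum, so the penalty stays in [0, 1]; the trailing "# divide by 4" comment is left over from the earlier unweighted version. The same maths restated in isolation:

import torch

def quadratic_control_cost(control: torch.Tensor) -> float:
    # Weighted quadratic penalty; dividing by the weight sum keeps the result
    # in [0, 1] for controls normalized into [-1, 1].
    weights = torch.tensor([1.0, 10.0, 5.0, 1.0])  # throttle, aileron, elevator, rudder
    return float(torch.dot(control ** 2, weights) / weights.sum())

print(quadratic_control_cost(torch.tensor([1.0, 0.0, 0.0, 0.0])))  # ~0.0588, i.e. 1/17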
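The reworked get_wp_reward closure pays a one-off +1000 per waypoint reached and +500 for the first takeoff, charges a weighted control-effort cost each step, and returns -500 on collision, with the progress-toward-waypoint term disabled. The overall shape, stripped of the sim bookkeeping (the boolean arguments below are hypothetical stand-ins for sim.waypoint_rewarded / sim.takeoff_rewarded):

def wp_reward_shape(hit_new_waypoint, first_takeoff, collided, control_cost, action_coeff=0.5):
    if collided:
        return -500
    reward = 0.0
    if hit_new_waypoint:
        reward += 1_000
    if first_takeoff:
        reward += 500
    return reward - action_coeff * control_cost

print(wp_reward_shape(True, False, False, 0.2))  # 999.9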
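With agent_interaction_frequency raised from 1 to 10 in simulation/simulate.py and sim_frequency_hz at 240, the policy is queried at 24 Hz, and the chosen slew direction appears to be re-applied on every intermediate sim step, so one decision can move the throttle by up to 10 * 0.005. A back-of-envelope check (my reading of the loop, not repo code):

sim_frequency_hz = 240.0
agent_interaction_frequency = 10          # was 1
throttle_slew_rate = 0.005                # per sim step

decisions_per_second = sim_frequency_hz / agent_interaction_frequency
max_throttle_step_per_decision = agent_interaction_frequency * throttle_slew_rate
print(decisions_per_second, max_throttle_step_per_decision)  # 24.0 0.05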