20 changes: 10 additions & 10 deletions assets/x8.xml
@@ -70,10 +70,10 @@
</location>
<static_friction> 0.8 </static_friction>
<dynamic_friction> 0.5 </dynamic_friction>
<rolling_friction> 0.1 </rolling_friction>
<rolling_friction> .02 </rolling_friction>
<spring_coeff unit="LBS/FT"> 8.0 </spring_coeff>
<damping_coeff unit="LBS/FT/SEC"> 0.85 </damping_coeff>
<max_steer unit="DEG"> 0 </max_steer>
<damping_coeff unit="LBS/FT/SEC"> 2.85 </damping_coeff>
<max_steer unit="DEG"> 360 </max_steer>
<brake_group> NONE </brake_group>
<retractable>0</retractable>
</contact>
@@ -85,10 +85,10 @@
</location>
<static_friction> 0.8 </static_friction>
<dynamic_friction> 0.5 </dynamic_friction>
<rolling_friction> 0.1 </rolling_friction>
<rolling_friction> 0.03 </rolling_friction>
<spring_coeff unit="LBS/FT"> 4.0 </spring_coeff>
<damping_coeff unit="LBS/FT/SEC"> 0.95 </damping_coeff>
<max_steer unit="DEG"> 0.0 </max_steer>
<damping_coeff unit="LBS/FT/SEC"> 1.95 </damping_coeff>
<max_steer unit="DEG"> 360 </max_steer>
<brake_group> LEFT </brake_group>
<retractable>0</retractable>
</contact>
@@ -100,10 +100,10 @@
</location>
<static_friction> 0.8 </static_friction>
<dynamic_friction> 0.5 </dynamic_friction>
<rolling_friction> 0.1 </rolling_friction>
<rolling_friction> 0.03 </rolling_friction>
<spring_coeff unit="LBS/FT"> 4.0 </spring_coeff>
<damping_coeff unit="LBS/FT/SEC"> 0.95 </damping_coeff>
<max_steer unit="DEG"> 0.0 </max_steer>
<damping_coeff unit="LBS/FT/SEC"> 1.95 </damping_coeff>
<max_steer unit="DEG"> 360 </max_steer>
<brake_group> RIGHT </brake_group>
<retractable>0</retractable>
</contact>
@@ -145,7 +145,7 @@
<input> attitude/heading-true-rad </input>
<!-- <lag> 2500 </lag> -->
<!-- <noise variation="PERCENT"> .02 </noise> -->
<bias> 0.003 </bias>
<bias> 0.0 </bias>
</sensor>
</channel>
</system>
Empty file added data/ppo/stats.txt
143 changes: 122 additions & 21 deletions learning/autopilot.py
@@ -4,6 +4,9 @@
from tensordict.nn.distributions import NormalParamExtractor
from tensordict.nn import TensorDictModule, InteractionType
from torchrl.modules import ProbabilisticActor, TanhNormal, ValueOperator
from torch.distributions import Categorical
from tensordict.nn import CompositeDistribution
from learning.utils import CategoricalControlsExtractor
import os

"""
@@ -20,7 +20,7 @@ def __init__(self):
- 3 angular velocities: w_roll, w_pitch, w_yaw
- 3d relative position of next waypoint: wx, wy, wz
"""
self.inputs = 13
self.inputs = 15

"""
Action:
@@ -36,7 +39,7 @@
nn.Linear(self.inputs, self.inputs),
nn.ReLU(),
nn.Linear(self.inputs, self.outputs),
nn.Sigmoid(),
nn.Tanh(),
)

# Returns the action selected and 0, representing the log-prob of the
@@ -50,10 +53,10 @@ def get_control(self, action):
Clamps various control outputs and sets the mean for control surfaces to 0.
Assumes [action] is a 4-item tensor of throttle, aileron cmd, elevator cmd, rudder cmd.
"""
action[0] = 0.5 * action[0]
action[1] = 0.1 * (action[1] - 0.5)
action[2] = 0.5 * (action[2] - 0.5)
action[3] = 0.5 * (action[3] - 0.5)
action[0] = 0.8 * (0.5*(action[0] + 1))
action[1] = 0.1 * action[1]
action[2] = 0.4 * action[2]
action[3] = 0.1 * action[3]
return action
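# Note: with the Tanh output layer each raw action lies in [-1, 1], so the
# scalings above bound the controls to throttle in [0, 0.8], aileron in
# [-0.1, 0.1], elevator in [-0.4, 0.4], and rudder in [-0.1, 0.1].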

# flattened_params = flattened dx1 numpy array of all params to init from
@@ -82,12 +85,11 @@ def init_from_params(self, flattened_params):
layer1,
nn.ReLU(),
layer2,
nn.Sigmoid(),
nn.Tanh()
)

# Loads the network from dir/name.pth
def init_from_saved(self, dir, name):
path = os.path.join(dir, name + '.pth')
# Loads the network from path
def init_from_saved(self, path):
self.policy_network = torch.load(path)

# Saves the network to dir/name.pth
@@ -125,8 +127,8 @@ def transform_from_deterministic_learner(self):
self.policy_network[-2] = nn.Linear(self.inputs, self.outputs * 2)

# Update the output layer to include the original weights and biases
self.policy_network[-2].weight = nn.Parameter(torch.cat((w, torch.ones(w.shape) * 100), 0))
self.policy_network[-2].bias = nn.Parameter(torch.cat((b, torch.ones(b.shape) * 100), 0))
self.policy_network[-2].weight = nn.Parameter(torch.cat((w, torch.zeros(w.shape)), 0))
self.policy_network[-2].bias = nn.Parameter(torch.cat((b, torch.zeros(b.shape)), 0))
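# Zero-initializing the added scale rows (rather than the previous large
# constants) keeps the initial exploration noise moderate; the exact starting
# scale depends on NormalParamExtractor's scale mapping.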

# Add a normal param extractor to the network to extract (means, sigmas) tuple
self.policy_network.append(NormalParamExtractor())
@@ -141,10 +143,6 @@ def transform_from_deterministic_learner(self):
module=policy_module,
in_keys=["loc", "scale"],
distribution_class=TanhNormal,
distribution_kwargs={
"min": 0, # minimum control
"max": 1, # maximum control
},
default_interaction_type=InteractionType.RANDOM,
return_log_prob=True,
)
@@ -153,7 +151,6 @@
def get_action(self, observation):
data = TensorDict({"observation": observation}, [])
policy_forward = self.policy_module(data)
print("action", self.policy_network(observation))
return policy_forward["action"], policy_forward["sample_log_prob"]

# NOTE: This initializes from *deterministic* learner parameters and picks
@@ -173,10 +170,114 @@ def init_from_saved(self, path):
module=policy_module,
in_keys=["loc", "scale"],
distribution_class=TanhNormal,
distribution_kwargs={
"min": 0, # minimum control
"max": 1, # maximum control
},
default_interaction_type=InteractionType.RANDOM,
return_log_prob=True,
)

"""
Slew rate autopilot learner.
Each control (throttle/aileron/elevator/rudder) has three options:
stay constant, go down, or go up.
"""
class SlewRateAutopilotLearner:
def __init__(self):
self.inputs = 15
self.outputs = 4

# Slew rates are wrt sim clock
self.throttle_slew_rate = 0.005
self.aileron_slew_rate = 0.0001
self.elevator_slew_rate = 0.00025
self.rudder_slew_rate = 0.0001

self.policy_network = nn.Sequential(
nn.Linear(self.inputs, self.inputs),
nn.ReLU(),
nn.Linear(self.inputs, 3 * self.outputs),
nn.Sigmoid(),
CategoricalControlsExtractor()
)

self.instantiate_policy_module()

def instantiate_policy_module(self):
policy_module = TensorDictModule(
self.policy_network,
in_keys=["observation"],
out_keys=[
("params", "throttle", "probs"),
("params", "aileron", "probs"),
("params", "elevator", "probs"),
("params", "rudder", "probs"),
],
)
self.policy_module = policy_module = ProbabilisticActor(
module=policy_module,
in_keys=["params"],
distribution_class=CompositeDistribution,
distribution_kwargs={
"distribution_map": {
"throttle": Categorical,
"aileron": Categorical,
"elevator": Categorical,
"rudder": Categorical,
}
},
default_interaction_type=InteractionType.RANDOM,
return_log_prob=True
)

# Returns the action selected and the log_prob of that action
def get_action(self, observation):
data = TensorDict({"observation": observation}, [])
policy_forward = self.policy_module(data)
action = torch.Tensor([policy_forward['throttle'], policy_forward['aileron'], policy_forward['elevator'], policy_forward['rudder']])
return action, policy_forward["sample_log_prob"]

# Always samples the mode of the output for each control
def get_deterministic_action(self, observation):
throttle_probs, aileron_probs, elevator_probs, rudder_probs = self.policy_network(observation)
action = torch.Tensor([torch.argmax(throttle_probs), torch.argmax(aileron_probs), torch.argmax(elevator_probs), torch.argmax(rudder_probs)])
return action, 0

# Apply a -1 transformation to the action to create a control tensor such that:
# -1 means go down, 0 means stay same, and +1 means go up
def get_control(self, action):
return action - 1
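# Example: an action of [2, 0, 1, 1] becomes [+1, -1, 0, 0], i.e. raise
# throttle, lower aileron, and hold elevator and rudder; presumably each entry
# is then scaled by the corresponding slew rate wherever the control is applied.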

# flattened_params = flattened dx1 numpy array of all params to init from
# NOTE: the way the params are broken up into the weights/biases of each layer
# would need to be manually edited for changes in network architecture
def init_from_params(self, flattened_params):
flattened_params = torch.from_numpy(flattened_params).to(torch.float32)

pl, pr = 0, 0
layer1 = nn.Linear(self.inputs, self.inputs)
pr += layer1.weight.nelement()
layer1.weight = nn.Parameter(flattened_params[pl:pr].reshape(layer1.weight.shape))
pl = pr
pr += layer1.bias.nelement()
layer1.bias = nn.Parameter(flattened_params[pl:pr].reshape(layer1.bias.shape))

layer2 = nn.Linear(self.inputs, 3*self.outputs)
pl = pr
pr += layer2.weight.nelement()
layer2.weight = nn.Parameter(flattened_params[pl:pr].reshape(layer2.weight.shape))
pl = pr
pr += layer2.bias.nelement()
layer2.bias = nn.Parameter(flattened_params[pl:pr].reshape(layer2.bias.shape))

self.policy_network = nn.Sequential(
layer1,
nn.ReLU(),
layer2,
nn.Sigmoid(),
CategoricalControlsExtractor()
)
self.instantiate_policy_module()

# Loads the network from path
def init_from_saved(self, path):
self.policy_network = torch.load(path)
self.instantiate_policy_module()

# Saves the network to dir/name.pth
def save(self, dir, name):
path = os.path.join(dir, name + '.pth')
torch.save(self.policy_network, path)
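
The new class imports CategoricalControlsExtractor from learning.utils, which this diff does not show. A minimal sketch of what such a module could look like, assuming it only splits the 3 * outputs sigmoid activations into four per-control probability vectors (torch.distributions.Categorical renormalizes probs, so explicit normalization is optional):

import torch
import torch.nn as nn

class CategoricalControlsExtractor(nn.Module):
    # Splits a (..., 12) activation tensor into four (..., 3) probability
    # vectors, one per control: throttle, aileron, elevator, rudder.
    def forward(self, x):
        return x.chunk(4, dim=-1)

A rough usage sketch of the resulting learner (values are illustrative):

learner = SlewRateAutopilotLearner()
observation = torch.zeros(15)
action, log_prob = learner.get_action(observation)   # e.g. tensor([2., 0., 1., 1.])
control = learner.get_control(action)                # e.g. tensor([ 1., -1., 0., 0.])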





14 changes: 7 additions & 7 deletions learning/crossentropy.py
@@ -1,6 +1,6 @@
import torch
import numpy as np
from learning.autopilot import AutopilotLearner
from learning.autopilot import AutopilotLearner, SlewRateAutopilotLearner
from simulation.simulate import FullIntegratedSim
from simulation.jsbsim_aircraft import x8
import os
@@ -25,7 +25,7 @@ def __init__(self, learners, num_params):
def init_using_torch_default(generation_size, num_params):
learners = []
for i in range(generation_size):
learners.append(AutopilotLearner())
learners.append(SlewRateAutopilotLearner())
return Generation(learners, num_params)

# Utilizes [rewards], which contains the reward obtained by each learner as a
@@ -89,12 +89,12 @@ def make_new_generation(mean, cov, generation_size, num_params):
# Generate each learner from params
learners = []
for param_list in selected_params:
l = AutopilotLearner()
l = SlewRateAutopilotLearner()
l.init_from_params(param_list)
learners.append(l)
return Generation(learners, num_params)

def cross_entropy_train(epochs, generation_size, num_survive, num_params=238, sim_time=60.0, save_dir='cross_entropy'):
def cross_entropy_train(epochs, generation_size, num_survive, num_params=432, sim_time=60.0, save_dir='cross_entropy'):
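# num_params=432 matches the SlewRateAutopilotLearner architecture:
# layer1 (15x15 weights + 15 biases) = 240 and layer2 (12x15 weights +
# 12 biases) = 192, giving 240 + 192 = 432 parameters in total.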
# Create save_dir (and if one already exists, rename it with some rand int)
if os.path.exists(os.path.join('data', save_dir)):
os.rename(os.path.join('data', save_dir), os.path.join('data', save_dir + '_old' + str(randint(0, 100000))))
@@ -117,7 +117,7 @@ def cross_entropy_train(epochs, generation_size, num_survive, num_params=238, si
# Evaluate generation through rollouts
rewards = []
for i in range(len(generation.learners)):
id = str(100*(epoch+1) + (i+1))
id = str(1000*(epoch+1) + (i+1))
learner = generation.learners[i]

# Run simulation to evaluate learner
@@ -127,7 +127,7 @@ def cross_entropy_train(epochs, generation_size, num_survive, num_params=238, si
integrated_sim.simulation_loop()

# Acquire/save data
integrated_sim.mdp_data_collector.save(os.path.join(save_dir, 'generation' + str(epoch+1)), 'trajectory_learner#' + str(i+1))
#integrated_sim.mdp_data_collector.save(os.path.join(save_dir, 'generation' + str(epoch+1)), 'trajectory_learner#' + str(i+1))
rewards.append(integrated_sim.mdp_data_collector.get_cum_reward())
print('Reward for Learner #', id, ': ', integrated_sim.mdp_data_collector.get_cum_reward())

@@ -149,4 +149,4 @@ def cross_entropy_train(epochs, generation_size, num_survive, num_params=238, si
if __name__ == "__main__":
os.environ["JSBSIM_DEBUG"]=str(0)
# epochs, generation_size, num_survive
cross_entropy_train(100, 99, 50)
cross_entropy_train(100, 200, 50)