20 changes: 10 additions & 10 deletions assets/x8.xml
@@ -70,10 +70,10 @@
</location>
<static_friction> 0.8 </static_friction>
<dynamic_friction> 0.5 </dynamic_friction>
<rolling_friction> 0.1 </rolling_friction>
<rolling_friction> .02 </rolling_friction>
<spring_coeff unit="LBS/FT"> 8.0 </spring_coeff>
<damping_coeff unit="LBS/FT/SEC"> 0.85 </damping_coeff>
<max_steer unit="DEG"> 0 </max_steer>
<damping_coeff unit="LBS/FT/SEC"> 2.85 </damping_coeff>
<max_steer unit="DEG"> 360 </max_steer>
<brake_group> NONE </brake_group>
<retractable>0</retractable>
</contact>
@@ -85,10 +85,10 @@
</location>
<static_friction> 0.8 </static_friction>
<dynamic_friction> 0.5 </dynamic_friction>
<rolling_friction> 0.1 </rolling_friction>
<rolling_friction> 0.03 </rolling_friction>
<spring_coeff unit="LBS/FT"> 4.0 </spring_coeff>
<damping_coeff unit="LBS/FT/SEC"> 0.95 </damping_coeff>
<max_steer unit="DEG"> 0.0 </max_steer>
<damping_coeff unit="LBS/FT/SEC"> 1.95 </damping_coeff>
<max_steer unit="DEG"> 360 </max_steer>
<brake_group> LEFT </brake_group>
<retractable>0</retractable>
</contact>
@@ -100,10 +100,10 @@
</location>
<static_friction> 0.8 </static_friction>
<dynamic_friction> 0.5 </dynamic_friction>
<rolling_friction> 0.1 </rolling_friction>
<rolling_friction> 0.03 </rolling_friction>
<spring_coeff unit="LBS/FT"> 4.0 </spring_coeff>
<damping_coeff unit="LBS/FT/SEC"> 0.95 </damping_coeff>
<max_steer unit="DEG"> 0.0 </max_steer>
<damping_coeff unit="LBS/FT/SEC"> 1.95 </damping_coeff>
<max_steer unit="DEG"> 360 </max_steer>
<brake_group> RIGHT </brake_group>
<retractable>0</retractable>
</contact>
@@ -145,7 +145,7 @@
<input> attitude/heading-true-rad </input>
<!-- <lag> 2500 </lag> -->
<!-- <noise variation="PERCENT"> .02 </noise> -->
<bias> 0.003 </bias>
<bias> 0.0 </bias>
</sensor>
</channel>
</system>
Empty file added data/ppo/stats.txt
143 changes: 122 additions & 21 deletions learning/autopilot.py
@@ -4,6 +4,9 @@
from tensordict.nn.distributions import NormalParamExtractor
from tensordict.nn import TensorDictModule, InteractionType
from torchrl.modules import ProbabilisticActor, TanhNormal, ValueOperator
from torch.distributions import Categorical
from tensordict.nn import CompositeDistribution
from learning.utils import CategoricalControlsExtractor
import os

"""
@@ -20,7 +20,7 @@ def __init__(self):
- 3 angular velocities: w_roll, w_pitch, w_yaw
- 3d relative position of next waypoint: wx, wy, wz
"""
self.inputs = 13
self.inputs = 15

"""
Action:
@@ -36,7 +39,7 @@
nn.Linear(self.inputs, self.inputs),
nn.ReLU(),
nn.Linear(self.inputs, self.outputs),
nn.Sigmoid(),
nn.Tanh(),
)

# Returns the action selected and 0, representing the log-prob of the
@@ -50,10 +53,10 @@ def get_control(self, action):
Clamps various control outputs and sets the mean for control surfaces to 0.
Assumes [action] is a 4-item tensor of throttle, aileron cmd, elevator cmd, rudder cmd.
"""
action[0] = 0.5 * action[0]
action[1] = 0.1 * (action[1] - 0.5)
action[2] = 0.5 * (action[2] - 0.5)
action[3] = 0.5 * (action[3] - 0.5)
action[0] = 0.8 * (0.5*(action[0] + 1))
action[1] = 0.1 * action[1]
action[2] = 0.4 * action[2]
action[3] = 0.1 * action[3]
return action
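# Note: with the Tanh output layer each raw action lies in [-1, 1], so the
# scalings above bound the controls to throttle in [0, 0.8], aileron in
# [-0.1, 0.1], elevator in [-0.4, 0.4], and rudder in [-0.1, 0.1].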

# flattened_params = flattened dx1 numpy array of all params to init from
@@ -82,12 +85,11 @@ def init_from_params(self, flattened_params):
layer1,
nn.ReLU(),
layer2,
nn.Sigmoid(),
nn.Tanh()
)

# Loads the network from dir/name.pth
def init_from_saved(self, dir, name):
path = os.path.join(dir, name + '.pth')
# Loads the network from path
def init_from_saved(self, path):
self.policy_network = torch.load(path)

# Saves the network to dir/name.pth
@@ -125,8 +127,8 @@ def transform_from_deterministic_learner(self):
self.policy_network[-2] = nn.Linear(self.inputs, self.outputs * 2)

# Update the output layer to include the original weights and biases
self.policy_network[-2].weight = nn.Parameter(torch.cat((w, torch.ones(w.shape) * 100), 0))
self.policy_network[-2].bias = nn.Parameter(torch.cat((b, torch.ones(b.shape) * 100), 0))
self.policy_network[-2].weight = nn.Parameter(torch.cat((w, torch.zeros(w.shape)), 0))
self.policy_network[-2].bias = nn.Parameter(torch.cat((b, torch.zeros(b.shape)), 0))
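# Zero-initializing the added scale rows (rather than the previous large
# constants) keeps the initial exploration noise moderate; the exact starting
# scale depends on NormalParamExtractor's scale mapping.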

# Add a normal param extractor to the network to extract (means, sigmas) tuple
self.policy_network.append(NormalParamExtractor())
@@ -141,10 +143,6 @@ def transform_from_deterministic_learner(self):
module=policy_module,
in_keys=["loc", "scale"],
distribution_class=TanhNormal,
distribution_kwargs={
"min": 0, # minimum control
"max": 1, # maximum control
},
default_interaction_type=InteractionType.RANDOM,
return_log_prob=True,
)
@@ -153,7 +151,6 @@
def get_action(self, observation):
data = TensorDict({"observation": observation}, [])
policy_forward = self.policy_module(data)
print("action", self.policy_network(observation))
return policy_forward["action"], policy_forward["sample_log_prob"]

# NOTE: This initializes from *deterministic* learner parameters and picks
@@ -173,10 +170,114 @@ def init_from_saved(self, path):
module=policy_module,
in_keys=["loc", "scale"],
distribution_class=TanhNormal,
distribution_kwargs={
"min": 0, # minimum control
"max": 1, # maximum control
},
default_interaction_type=InteractionType.RANDOM,
return_log_prob=True,
)

"""
Slew rate autopilot learner.
Each control (throttle/aileron/elevator/rudder) has three options:
stay constant, go down, or go up.
"""
class SlewRateAutopilotLearner:
def __init__(self):
self.inputs = 15
self.outputs = 4

# Slew rates are wrt sim clock
self.throttle_slew_rate = 0.005
self.aileron_slew_rate = 0.0001
self.elevator_slew_rate = 0.00025
self.rudder_slew_rate = 0.0001

self.policy_network = nn.Sequential(
nn.Linear(self.inputs, self.inputs),
nn.ReLU(),
nn.Linear(self.inputs, 3 * self.outputs),
nn.Sigmoid(),
CategoricalControlsExtractor()
)

self.instantiate_policy_module()

def instantiate_policy_module(self):
policy_module = TensorDictModule(
self.policy_network,
in_keys=["observation"],
out_keys=[
("params", "throttle", "probs"),
("params", "aileron", "probs"),
("params", "elevator", "probs"),
("params", "rudder", "probs"),
],
)
self.policy_module = policy_module = ProbabilisticActor(
module=policy_module,
in_keys=["params"],
distribution_class=CompositeDistribution,
distribution_kwargs={
"distribution_map": {
"throttle": Categorical,
"aileron": Categorical,
"elevator": Categorical,
"rudder": Categorical,
}
},
default_interaction_type=InteractionType.RANDOM,
return_log_prob=True
)

# Returns the action selected and the log_prob of that action
def get_action(self, observation):
data = TensorDict({"observation": observation}, [])
policy_forward = self.policy_module(data)
action = torch.Tensor([policy_forward['throttle'], policy_forward['aileron'], policy_forward['elevator'], policy_forward['rudder']])
return action, policy_forward["sample_log_prob"]

# Always samples the mode of the output for each control
def get_deterministic_action(self, observation):
throttle_probs, aileron_probs, elevator_probs, rudder_probs = self.policy_network(observation)
action = torch.Tensor([torch.argmax(throttle_probs), torch.argmax(aileron_probs), torch.argmax(elevator_probs), torch.argmax(rudder_probs)])
return action, 0

# Apply a -1 transformation to the action to create a control tensor such that:
# -1 means go down, 0 means stay same, and +1 means go up
def get_control(self, action):
return action - 1
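# Example: an action of [2, 0, 1, 1] becomes [+1, -1, 0, 0], i.e. raise
# throttle, lower aileron, and hold elevator and rudder; presumably each entry
# is then scaled by the corresponding slew rate wherever the control is applied.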

# flattened_params = flattened dx1 numpy array of all params to init from
# NOTE: the way the params are broken up into the weights/biases of each layer
# would need to be manually edited for changes in network architecture
def init_from_params(self, flattened_params):
flattened_params = torch.from_numpy(flattened_params).to(torch.float32)

pl, pr = 0, 0
layer1 = nn.Linear(self.inputs, self.inputs)
pr += layer1.weight.nelement()
layer1.weight = nn.Parameter(flattened_params[pl:pr].reshape(layer1.weight.shape))
pl = pr
pr += layer1.bias.nelement()
layer1.bias = nn.Parameter(flattened_params[pl:pr].reshape(layer1.bias.shape))

layer2 = nn.Linear(self.inputs, 3*self.outputs)
pl = pr
pr += layer2.weight.nelement()
layer2.weight = nn.Parameter(flattened_params[pl:pr].reshape(layer2.weight.shape))
pl = pr
pr += layer2.bias.nelement()
layer2.bias = nn.Parameter(flattened_params[pl:pr].reshape(layer2.bias.shape))

self.policy_network = nn.Sequential(
layer1,
nn.ReLU(),
layer2,
nn.Sigmoid(),
CategoricalControlsExtractor()
)
self.instantiate_policy_module()

# Loads the network from path
def init_from_saved(self, path):
self.policy_network = torch.load(path)
self.instantiate_policy_module()

# Saves the network to dir/name.pth
def save(self, dir, name):
path = os.path.join(dir, name + '.pth')
torch.save(self.policy_network, path)
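
The new class imports CategoricalControlsExtractor from learning.utils, which this diff does not show. A minimal sketch of what such a module could look like, assuming it only splits the 3 * outputs sigmoid activations into four per-control probability vectors (torch.distributions.Categorical renormalizes probs, so explicit normalization is optional):

import torch
import torch.nn as nn

class CategoricalControlsExtractor(nn.Module):
    # Splits a (..., 12) activation tensor into four (..., 3) probability
    # vectors, one per control: throttle, aileron, elevator, rudder.
    def forward(self, x):
        return x.chunk(4, dim=-1)

A rough usage sketch of the resulting learner (values are illustrative):

learner = SlewRateAutopilotLearner()
observation = torch.zeros(15)
action, log_prob = learner.get_action(observation)   # e.g. tensor([2., 0., 1., 1.])
control = learner.get_control(action)                # e.g. tensor([ 1., -1., 0., 0.])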





14 changes: 7 additions & 7 deletions learning/crossentropy.py
@@ -1,6 +1,6 @@
import torch
import numpy as np
from learning.autopilot import AutopilotLearner
from learning.autopilot import AutopilotLearner, SlewRateAutopilotLearner
from simulation.simulate import FullIntegratedSim
from simulation.jsbsim_aircraft import x8
import os
@@ -25,7 +25,7 @@ def __init__(self, learners, num_params):
def init_using_torch_default(generation_size, num_params):
learners = []
for i in range(generation_size):
learners.append(AutopilotLearner())
learners.append(SlewRateAutopilotLearner())
return Generation(learners, num_params)

# Utilizes [rewards], which contains the reward obtained by each learner as a
@@ -89,12 +89,12 @@ def make_new_generation(mean, cov, generation_size, num_params):
# Generate each learner from params
learners = []
for param_list in selected_params:
l = AutopilotLearner()
l = SlewRateAutopilotLearner()
l.init_from_params(param_list)
learners.append(l)
return Generation(learners, num_params)

def cross_entropy_train(epochs, generation_size, num_survive, num_params=238, sim_time=60.0, save_dir='cross_entropy'):
def cross_entropy_train(epochs, generation_size, num_survive, num_params=432, sim_time=60.0, save_dir='cross_entropy'):
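# num_params=432 matches the SlewRateAutopilotLearner architecture:
# layer1 (15x15 weights + 15 biases) = 240 and layer2 (12x15 weights +
# 12 biases) = 192, giving 240 + 192 = 432 parameters in total.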
# Create save_dir (and if one already exists, rename it with some rand int)
if os.path.exists(os.path.join('data', save_dir)):
os.rename(os.path.join('data', save_dir), os.path.join('data', save_dir + '_old' + str(randint(0, 100000))))
@@ -117,7 +117,7 @@ def cross_entropy_train(epochs, generation_size, num_survive, num_params=238, si
# Evaluate generation through rollouts
rewards = []
for i in range(len(generation.learners)):
id = str(100*(epoch+1) + (i+1))
id = str(1000*(epoch+1) + (i+1))
learner = generation.learners[i]

# Run simulation to evaluate learner
@@ -127,7 +127,7 @@ def cross_entropy_train(epochs, generation_size, num_survive, num_params=238, si
integrated_sim.simulation_loop()

# Acquire/save data
integrated_sim.mdp_data_collector.save(os.path.join(save_dir, 'generation' + str(epoch+1)), 'trajectory_learner#' + str(i+1))
#integrated_sim.mdp_data_collector.save(os.path.join(save_dir, 'generation' + str(epoch+1)), 'trajectory_learner#' + str(i+1))
rewards.append(integrated_sim.mdp_data_collector.get_cum_reward())
print('Reward for Learner #', id, ': ', integrated_sim.mdp_data_collector.get_cum_reward())

@@ -149,4 +149,4 @@ def cross_entropy_train(epochs, generation_size, num_survive, num_params=238, si
if __name__ == "__main__":
os.environ["JSBSIM_DEBUG"]=str(0)
# epochs, generation_size, num_survive
cross_entropy_train(100, 99, 50)
cross_entropy_train(100, 200, 50)