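"""Train and test a gradient Q-learning agent for a feedback transmission environment.

Sweeps the reward weight alpha over a range, training one CNN Q-function per
alpha on a Fritchman burst-channel environment. Runs are distributed over the
available devices (GPUs if present, otherwise CPU cores) via joblib, and the
trained models and test statistics are pickled under Data/.
"""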
import numpy as np
import time
import copy
from joblib import Parallel, delayed
import multiprocessing
import pickle
import ReinforcementLearning.QlearningFunctions as QL
import Envs.PytorchEnvironments as Envs
import torch
# Pool of available compute devices: one entry per GPU if CUDA is present,
# otherwise one 'cpu' entry per CPU core. Workers pop a device, use it, and
# push it back when done.
q = []
if torch.cuda.is_available():
    num_cores = torch.cuda.device_count()
    for i in range(num_cores):
        q.append('cuda:{}'.format(i))
else:
    num_cores = multiprocessing.cpu_count()
    for i in range(num_cores):
        q.append('cpu')
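# Note: this pool is shared across joblib workers via require='sharedmem'
# below; list.pop()/append() are atomic in CPython, which is what keeps this
# simple scheme safe.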
def TrainAndTest(alpha_reward, beta_reward, Tf, Nit, discount_factor, num_episodes, epsilon, batch, Channel):
    # Claim a device from the shared pool for this worker.
    device = q.pop()
    Channel_Local = copy.deepcopy(Channel).to(device)
    string_alpha = str(alpha_reward.numpy())
    alpha_reward = alpha_reward.to(device)

    TransEnv = Envs.EnvFeedbackGeneral(Tf, alpha_reward, beta_reward, Channel_Local, batch, M=5)
    TransEnv = TransEnv.to(device)

    model_file = 'ModelCNNBatch_FritchmanBurstOnly' + string_alpha + '.pickle'

    t0 = time.time()
    Q, policy = Train(TransEnv, discount_factor, num_episodes, epsilon)
    t1 = time.time()
    with open('Data/' + model_file, 'wb') as f:
        pickle.dump(Q, f)
    print('Training takes {} seconds'.format(t1 - t0))

    t0 = time.time()
    result = Test(TransEnv, Q, Nit, batch)
    t1 = time.time()
    print('Testing takes {} seconds'.format(t1 - t0))

    # Return the device to the pool and append this run's result to disk.
    q.append(device)
    with open('Data/AgentCNNRLresultsTestBatch_FritchmanBurstOnly.pickle', 'ab') as f:
        pickle.dump(result, f)
    return result
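# Illustrative (commented out): a saved Q-function can be reloaded later, e.g.
# for the run with alpha = 0.1:
# with open('Data/ModelCNNBatch_FritchmanBurstOnly0.1.pickle', 'rb') as f:
#     Q = pickle.load(f)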
def Train(env, discount_factor, num_episodes, epsilon):
    Qfunction = QL.QApproxFunction(env.observation_space.n, env.action_space.n, 1000).to(env.device)
    # Staged training schedule: stage i runs num_episodes[i] episodes with
    # exploration rate epsilon[i] and learning rate lr_list[i].
    lr_list = [0.001, 0.001, 0.001, 0.0001, 0.00001]
    for i in range(len(num_episodes)):
        Qfunction, policy, _ = QL.GradientQLearningDebug(env, num_episodes[i], Qfunction, discount_factor, epsilon[i],
                                                         UpdateEpisodes=10, UpdateTargetEpisodes=100, lr=lr_list[i])
    return Qfunction, policy
def Test(env, Q, Nit, batch):
    device = env.device
    reward_acc = torch.zeros(batch).to(device)
    transmissions = torch.zeros(batch).to(device)
    time_instant = torch.zeros(batch).to(device)
    number_successes = torch.zeros(batch).to(device)
    reward_save = torch.empty((0, 4)).to(device)
    for i in range(Nit // batch):
        done = 0
        reward_acc[:] = 0
        transmissions[:] = 0
        time_instant[:] = 1
        number_successes[:] = 0
        state = env.reset()
        SuccessF = torch.zeros(batch).to(device)
        while True:
            # Greedy action from the learned Q-function.
            action_index = torch.argmax(Q(state), dim=1)
            # Count a transmission only for episodes that have not yet succeeded.
            not_done = torch.logical_not(SuccessF)
            transmissions[not_done] = transmissions[not_done] + action_index.reshape(len(action_index))[not_done]
            # Take the action, collect the reward, and transition to the next state.
            next_state, reward, done, SuccessF = env.step(action_index)
            # Update statistics.
            reward_acc += reward.reshape(len(reward))
            time_instant[torch.logical_not(SuccessF)] += 1
            state = next_state
            if torch.any(time_instant > env.Tf) and torch.any(transmissions == 0):
                print('Learned bad policy')
                break
            if done:
                break
        temp = torch.cat((reward_acc.reshape(batch, 1), transmissions.reshape(batch, 1),
                          time_instant.reshape(batch, 1), number_successes.reshape(batch, 1)), dim=1)
        reward_save = torch.cat((reward_save, copy.deepcopy(temp)), dim=0)
    average_reward = torch.mean(reward_save[:, 0])
    average_transmissions = torch.mean(reward_save[:, 1])
    average_recovery = torch.mean(reward_save[:, 2]) - env.Tf
    print('Estimated expected reward is {}\nExpected reward is: {}'.format(Q(env.reset()), average_reward))
    return average_reward, average_transmissions, average_recovery
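# Experiment sweep: train and test one agent per alpha in alpha_range, with the
# remaining hyperparameters held fixed across runs.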
alpha_range = torch.arange(0.1, 5.5, 0.1)
#alpha_range = torch.tensor([0.1, 0.5, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 3.5, 4.0])
beta_reward = 5
Tf = 10
Nit = 10000
epsilon = [0.8, 0.6, 0.3, 0.2, 0.1]
discount_factor = 0.95
batches = 100
#Channel = Envs.GilbertElliott(0.25, 0.25, 0, 1, batches)
Channel = Envs.Fritchman(0.25, 1, 0, 5, batches)
num_episodes = [2000, 2000, 10000, 20000, 50000]

# One worker per device; 'sharedmem' lets all workers pop/push the shared
# device pool q.
store_results = Parallel(n_jobs=num_cores, require='sharedmem')(
    delayed(TrainAndTest)(alpha_reward, beta_reward, Tf, Nit, discount_factor, num_episodes, epsilon, batches, Channel)
    for alpha_reward in alpha_range)
#store_results = TrainAndTest(alpha_range[0], beta_reward, Tf, Nit, discount_factor, num_episodes, epsilon, batches, Channel)

# Note: opening with 'wb' truncates the per-run results appended during the
# sweep and rewrites the file with the collected list.
with open('Data/AgentCNNRLresultsTestBatch_FritchmanBurstOnly.pickle', 'wb') as f:
    pickle.dump(store_results, f)
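# Illustrative (commented out): reading the collected results back.
# with open('Data/AgentCNNRLresultsTestBatch_FritchmanBurstOnly.pickle', 'rb') as f:
#     results = pickle.load(f)  # one (avg_reward, avg_transmissions, avg_recovery) per alpha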