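"""Train a DQN agent on the Unity "Banana" collector environment.

The script runs episodes with an epsilon-greedy policy, reports when the
100-episode average score first reaches +13, saves a model checkpoint once it
reaches +16, and writes the per-episode scores to a CSV file together with a
score-vs-episode plot.
"""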
from unityagents import UnityEnvironment
import numpy as np
from utils.dqn_agent import Agent
from collections import deque
import torch
import matplotlib.pyplot as plt
import pandas as pd
import argparse
import os
import os.path as op
# Code based on https://github.com/udacity/deep-reinforcement-learning.git


def dqn(dir_, n_episodes=1500, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning.

    Params
    ======
        dir_ (str): directory where the model checkpoint is saved
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    passed = False                     # whether the +13 benchmark has already been reported
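    # Epsilon decays geometrically per episode: eps_t = max(eps_end, eps_start * eps_decay**t).
    # With the defaults (eps_start=1.0, eps_decay=0.995, eps_end=0.01), the floor is
    # reached after about log(0.01) / log(0.995), i.e. roughly 919 episodes.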
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0                                         # initialize the score
        for t in range(max_t):
            action = agent.act(state, eps)                # epsilon-greedy action selection
            env_info = env.step(action)[brain_name]       # send the action to the environment
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]                  # get the reward
            done = env_info.local_done[0]                 # see if the episode has finished
            agent.step(state, action, reward, next_state, done)  # store the transition and learn (see utils.dqn_agent)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)          # save most recent score
        scores.append(score)                 # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        if np.mean(scores_window) >= 13.0 and not passed:
            print('\nEnvironment reached +13 in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - 100, np.mean(scores_window)))
            passed = True
        if np.mean(scores_window) >= 16.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), op.join(dir_, 'checkpoint.pth'))
            break
    return scores
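

# Note: the script reports when the average score over the last 100 episodes
# (scores_window) first reaches +13, then keeps training until the average
# reaches +16, at which point the model weights are saved to checkpoint.pth.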


def create_directory(logdir):
    try:
        os.makedirs(logdir)
    except FileExistsError:
        pass


if __name__ == '__main__':
    # Inputs for the main function
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--n_episodes', default=500, type=int, help='max number of training episodes')
    parser.add_argument('--max_t', default=1000, type=int, help='max number of timesteps per episode')
    parser.add_argument('--eps_start', default=1.0, type=float, help='starting value of epsilon, for epsilon-greedy action selection')
    parser.add_argument('--eps_end', default=0.01, type=float, help='minimum value of epsilon')
    parser.add_argument('--eps_decay', default=0.995, type=float, help='multiplicative factor (per episode) for decreasing epsilon')
    parser.add_argument('--dest', default='runs', type=str, help='experiment dir')
    args = parser.parse_args()

    # Change the file_name parameter to match the location of the Unity environment.
    env = UnityEnvironment(file_name="Banana.app")
    path = args.dest
    create_directory(path)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents in the environment
    print('Number of agents:', len(env_info.agents))

    # number of actions
    action_size = brain.vector_action_space_size
    print('Number of actions:', action_size)

    # examine the state space
    state = env_info.vector_observations[0]
    print('States look like:', state)
    state_size = len(state)
    print('States have length:', state_size)

    # train the agent, then close the environment
    agent = Agent(state_size=state_size, action_size=action_size, seed=0)
    scores = dqn(dir_=path, n_episodes=args.n_episodes, max_t=args.max_t,
                 eps_start=args.eps_start, eps_end=args.eps_end, eps_decay=args.eps_decay)
    env.close()

    # save the per-episode scores and plot them against the episode number
    df_scores = pd.DataFrame(scores)
    df_scores.to_csv(op.join(path, 'scores_values.csv'))
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(len(scores)), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.savefig(op.join(path, 'score.png'))
    # plt.show()
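
# Example usage (assumes the Unity Banana binary is "Banana.app" in the working
# directory, as set above; the argument values are illustrative):
#   python main.py --n_episodes 1000 --dest runs/experiment_1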