RL_Taxi-v3-AI.py
97 lines (74 loc) · 2.19 KB
import gym
import numpy as np
import random
import os
import time
# create Taxi environment (classic gym API: reset() -> state, step() -> 4-tuple)
env = gym.make('Taxi-v3')
# Q-Learning setup: one table row per discrete state, one column per action
state_size = env.observation_space.n # total number of states (S)
action_size = env.action_space.n # total number of actions (A)
qtable = np.zeros((state_size, action_size))  # Q-values start at zero
learning_rate = 0.9   # alpha: how strongly each update overwrites the old Q-value
discount_rate = 0.8   # gamma: weight of future rewards in the bootstrap target
epsilon = 1.0 # probability that our agent will explore
decay_rate = 0.005 # of epsilon
# create a new instance of taxi, and get the initial state
max_steps = 99        # per-episode step cap (varies later via the slow/fast schedule)
episode = 1           # 1-based episode counter, also drives epsilon decay
total_score = 0       # running sum of episode scores, used for the on-screen average
record = 0            # best episode score seen so far
def prnt(action, score, step):
    """Clear the terminal and redraw the environment plus training stats.

    Reads the module-level globals ``env``, ``episode`` and ``total_score``.
    NOTE(review): ``os.system('clear')`` assumes a POSIX terminal — on
    Windows the command would be ``cls``.
    """
    os.system('clear')
    env.render()
    print(action)
    print()
    # episode is 1-based, so episode-1 is the number of *completed* episodes
    print('Episodes:', episode-1)
    print('TimeStep:', step)
    print('Score:', score)
    # floor division; episode >= 1 always, so no division by zero
    print('Average:', total_score//episode)
# Rendering-speed schedule: at the episodes listed in slowAt the loop switches to
# slow (rendered, sleeping) mode with a short step cap from maxsteps; at the
# matching fastAt episode it switches back to fast mode (no render, cap 99).
# The final fastAt entry of 0 never matches, so the last slow phase is permanent.
slowAt = [1, 100, 500, 1000, 2000]
fastAt = [3, 102, 502, 1002, 0]
maxsteps = [7, 10, 15 , 20, 30]
idx = 0       # index into the schedule lists above
slow = True   # start in slow (rendered) mode
# Main Q-learning training loop: runs episodes forever, epsilon-greedy action
# selection with exponentially decaying epsilon, tabular Q-update per step.
# for episode in range(num_episode):
while True:
    state = env.reset()
    done = False
    score = 0
    step = 0
    while step < max_steps:
        # sample a random action from the list of available actions
        if random.uniform(0,1) < epsilon:
            # explore
            action = env.action_space.sample()
        else:
            # exploit: greedy action from the Q-table
            action = np.argmax(qtable[state,:])
        # perform this action on the environment
        new_state, reward, done, info = env.step(action)
        # tabular Q-learning update:
        # Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
        qtable[state, action] += learning_rate * (reward + discount_rate * np.max(qtable[new_state,:]) - qtable[state,action])
        state = new_state
        # BUGFIX: accumulate the reward BEFORE the terminal break — the original
        # added it after `break`, so the final (dropoff) reward was never counted
        # and score/record/average were all understated.
        score += reward
        # env.render()
        if slow:
            prnt(action, score, step)
            # input()
        if done:
            break
        # switch to slow (rendered) mode at scheduled episodes, with a short step cap
        if idx < len(slowAt) and slowAt[idx] == episode:
            slow = True
            max_steps = maxsteps[idx]
        # switch back to fast mode and advance the schedule
        # (was guarded by len(slowAt); use len(fastAt) for the list actually indexed)
        if idx < len(fastAt) and fastAt[idx] == episode:
            slow = False
            idx+=1
            max_steps = 99
        if slow:
            time.sleep(0.7)  # throttle so the rendered frames are watchable
        step += 1
    # exponential epsilon decay: epsilon = e^(-decay_rate * episode)
    epsilon = np.exp(-decay_rate*episode)
    episode += 1
    record = max(record, score)
    total_score += score