-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgrid.py
More file actions
129 lines (118 loc) · 3.45 KB
/
grid.py
File metadata and controls
129 lines (118 loc) · 3.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 5 14:52:39 2017
@author: Pinar Ozisik
"""
from constants import *
import numpy as np
import pickle
import os
import string
import random
def coordinate_to_index(x, y): # needed for policy lookup
    """Map a grid coordinate (x, y) to a flat row index of the policy matrix.

    The original expression (x+y) + ((y*COL) - y) reduces algebraically to
    x + y*COL, i.e. row-major flattening with y as the row stride.
    """
    return x + (y * COL)
def index_to_coordinate(index): # needed for adversarial tampering
    """Convert a flat index back into an (x, y) grid coordinate.

    NOTE(review): this uses a stride of (ROW-1), while coordinate_to_index
    uses a stride of COL; the two functions are only mutual inverses when
    COL == ROW-1 — confirm against constants.py.
    """
    stride = ROW - 1
    x, y = divmod(index, stride)  # x = index // stride, y = index % stride
    return (x, y)
def action_to_col_num(a):
    """Return the policy-matrix column number for the action name *a*.

    Unrecognized action names yield None, matching the original
    if/elif chain's implicit fall-through.
    """
    return {'UP': 0, 'DOWN': 1, 'RIGHT': 2, 'LEFT': 3}.get(a)
# rows are the total number of actual_states
# cols are the total number of actions
def return_rand_policy():
    """Build a uniformly random stochastic policy.

    Returns a (ROW*COL, len(ACTIONS)) array whose rows each sum to 1,
    so row i is a probability distribution over actions for state i.
    """
    weights = np.random.rand(ROW * COL, len(ACTIONS))
    # keepdims=True keeps the row-sum broadcastable, replacing the
    # original explicit reshape of the sum vector.
    return weights / weights.sum(axis=1, keepdims=True)
def next_state(x, y, a):
    """Return the state reached from (x, y) by taking action *a*.

    Args:
        x, y: current grid coordinate.
        a: action code — 0=up, 1=down, 2=right, 3=left
           (consistent with action_to_col_num).

    Returns:
        (new_x, new_y) if the move stays on the grid, else (x, y)
        (the agent stays in place on an off-grid move).

    Raises:
        ValueError: if *a* is not one of the four action codes.
            (Bug fix: the original fell through its if/elif chain and
            crashed with UnboundLocalError on `new_x` instead.)
    """
    moves = {0: (-1, 0), 1: (1, 0), 2: (0, 1), 3: (0, -1)}
    if a not in moves:
        raise ValueError('unknown action code: %r' % (a,))
    dx, dy = moves[a]
    new_x, new_y = x + dx, y + dy
    if check_valid_state(new_x, new_y):
        return (new_x, new_y)  # move to new place
    else:
        return (x, y)  # stay in current place
def check_valid_state(x, y):
    """Return True iff (x, y) lies inside the ROW x COL grid."""
    return 0 <= x < ROW and 0 <= y < COL
def select_random_action(s, policy):
    """Sample an action index for state *s* from the stochastic *policy*.

    Args:
        s: (x, y) state coordinate.
        policy: (num_states, num_actions) matrix of action probabilities.

    Returns:
        An integer action index drawn according to policy[state_index, :].

    Bug fix: the original loop could fall off the end and implicitly
    return None when np.random.rand() >= the final cumulative sum —
    possible when floating-point rounding leaves cumsum[-1] slightly
    below 1.0.  We now clamp to the last action in that case.
    """
    ind = coordinate_to_index(s[0], s[1])
    probs = np.cumsum(policy[ind, :])
    rand_val = np.random.rand()
    for i in range(probs.shape[0]):
        if rand_val < probs[i]:
            return i
    # Rounding pushed rand_val past cumsum[-1]: pick the last action.
    return probs.shape[0] - 1
def sample_trajectories(policy, num, flag=False):
    """Roll out *num* episodes under *policy* and pickle each trajectory.

    Each episode is a list of (state, action, reward) triples; the reward
    is GAMMA**t on the step that reaches FINISH and 0 on every other step.
    Episodes are written one pickle file per episode into a freshly
    created, randomly named directory under PATH.

    Args:
        policy: (ROW*COL, len(ACTIONS)) stochastic policy matrix.
        num: number of episodes to sample.
        flag: currently unused — it previously gated a TAU-step horizon
            cut-off that is commented out upstream; kept for
            backward-compatible signatures.

    Returns:
        (directory_path_with_trailing_slash, max_time) where max_time is
        the largest per-episode step count observed.

    NOTE(review): episodes only terminate on reaching FINISH, so a policy
    that can never reach it loops forever — confirm that disabling the
    TAU truncation was intentional.
    """
    suffix = ''.join(random.choice(string.ascii_letters + string.digits)
                     for _ in range(10))
    new_directory = PATH + 'trajectories-' + suffix
    os.mkdir(new_directory)
    max_time = -1
    for episode in range(num):
        s = START
        time = 0
        e = []
        while True:
            a = select_random_action(s, policy)
            s_prime = next_state(s[0], s[1], a)
            if s_prime == FINISH:
                # Discounted terminal reward; episode ends here.
                e.append((s, a, (GAMMA**time) * 1))
                break
            e.append((s, a, 0))
            s = s_prime
            time += 1
        # Bug fix: the original called pickle.dump(e, open(..., 'wb')) and
        # leaked the file handle; `with` closes it deterministically.
        with open(new_directory + '/trajectory-' + str(episode) + '.p', 'wb') as f:
            pickle.dump(e, f)
        if time > max_time:
            max_time = time
    return new_directory + '/', max_time  # directory where trajectories are kept
def compute_J(policy):
    """Estimate J(policy): the expected discounted return from START.

    Runs NUM_SAMPLES_FOR_J episodes under *policy* and averages the
    terminal reward GAMMA**t collected upon reaching FINISH.

    Args:
        policy: (ROW*COL, len(ACTIONS)) stochastic policy matrix.

    Returns:
        The sample-mean discounted return (a float).

    Bug fix: the original accumulated NUM_SAMPLES_FOR_J samples but
    divided by TRIALS; a mean must divide by the number of samples
    actually drawn.  (The two constants may coincide in constants.py,
    which would have masked this.)  Also removed the unused list `e`.

    NOTE(review): as in sample_trajectories, an episode only ends on
    reaching FINISH — a policy that never reaches it loops forever.
    """
    total = 0.0
    for _ in range(NUM_SAMPLES_FOR_J):
        s = START
        time = 0
        while True:
            a = select_random_action(s, policy)
            s_prime = next_state(s[0], s[1], a)
            if s_prime == FINISH:
                total += (GAMMA**time) * 1  # discounted terminal reward
                break
            s = s_prime
            time += 1
    return total / NUM_SAMPLES_FOR_J  # returns avg reward