diff --git a/RandomBird/FlappyAgent.py b/RandomBird/FlappyAgent.py deleted file mode 100644 index 9f3ec84..0000000 --- a/RandomBird/FlappyAgent.py +++ /dev/null @@ -1,9 +0,0 @@ -import numpy as np - -def FlappyPolicy(state, screen): - action=None - if(np.random.randint(0,2)<1): - action=119 - return action - - diff --git a/Roy/.ipynb_checkpoints/Flappy - Comments-checkpoint.ipynb b/Roy/.ipynb_checkpoints/Flappy - Comments-checkpoint.ipynb new file mode 100644 index 0000000..ada11d1 --- /dev/null +++ b/Roy/.ipynb_checkpoints/Flappy - Comments-checkpoint.ipynb @@ -0,0 +1,408 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RL challenge\n", + "\n", + "Your challenge is to learn to play [Flappy Bird](https://en.wikipedia.org/wiki/Flappy_Bird)!\n", + "\n", + "Flappybird is a side-scrolling game where the agent must successfully nagivate through gaps between pipes. Only two actions in this game: at each time step, either you click and the bird flaps, or you don't click and gravity plays its role.\n", + "\n", + "\n", + "## Step 1 : FlappyAgentManual & FlappyAgentEasy\n", + "\n", + "\n", + "To begin with this Flappy Challenge, I decided to develop two very easy functions in order to understand the kind of trajectories that may help Flappy to win his challenge. We have:\n", + "\n", + "* FlappyAgentManual : it is only a function that allow you to play Flappy Bird. You should press a key of your keyboard to make the bird flap. This function allow you to play Flappy and to decide the trajectory of the bird.\n", + "\n", + "* FlappyAgentEasy : it is a completely deterministic function. It is not interesting for Reinforcment Learning but it is a first function that uses state variables to help Flappy to fly.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from ple.games.flappybird import FlappyBird\n", + "from ple import PLE\n", + "import pygame\n", + "import numpy as np\n", + "import pickle" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# first policies\n", + "\n", + "def FlappyAgentManual(state, screen):\n", + " action = None\n", + " events = pygame.event.get()\n", + " for event in events:\n", + " print(event)\n", + " if event.type == pygame.KEYUP:\n", + " action = 119\n", + " return action\n", + "\n", + "def FlappyAgentEasy(state, screen):\n", + " if state['player_y'] > state['next_pipe_bottom_y'] - 50:\n", + " action = 119\n", + " else:\n", + " action = None\n", + " return action" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# run.py\n", + "game = FlappyBird()\n", + "p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)\n", + "\n", + "p.init()\n", + "reward = 0.0\n", + "\n", + "nb_games = 1\n", + "nb_data = 200\n", + "cumulated = np.zeros((nb_games))\n", + "\n", + "\n", + "next_pipe_bottom_y = np.array(range(nb_data))\n", + "player_vel = np.array(range(nb_data))\n", + "player_y = np.array(range(nb_data))\n", + "next_pipe_dist_to_player = np.array(range(nb_data))\n", + "count = 0\n", + "for i in range(nb_games):\n", + " p.reset_game()\n", + " \n", + " while(not p.game_over()):\n", + " state = game.getGameState()\n", + " screen = p.getScreenRGB()\n", + " action=FlappyAgentEasy(state, screen) \n", + " reward = p.act(action)\n", + " cumulated[i] = cumulated[i] + reward\n", + " if count <= nb_data-1:\n", + " next_pipe_bottom_y[count] = state['next_pipe_bottom_y']\n", + " player_vel[count] = state['player_vel']\n", + " player_y[count] = state['player_y']\n", + " next_pipe_dist_to_player[count] = state['next_pipe_dist_to_player']\n", + " print(action, reward)\n", + " count = count +1\n", + "\n", + "average_score = np.mean(cumulated)\n", + "max_score = np.max(cumulated)\n", + "\n", + "print(average_score)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(player_y)\n", + "print(player_vel)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you can see, the previous code allow to begin a game and to read the values of the state of the game at every moment of the game. Through this pre-study, we can read the value that may take every variable such as **player_y**, **player_vel**, **next_pipe_bottom_y**, **next_pipe_dist_to_player** and the **reward**\n", + "\n", + "\n", + "## FlappyAgentSarsa\n", + "\n", + "Since we know the values, the next step of the challenge was to simplify the number of possible states in order to reduce the size of the space we want to explore afterwards. We define then a vector for each variable that goes from the min to the max values with a specific resolution. At the beginning we worked with around 20 possible values for each variables. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# grid definition\n", + "resolution = 20\n", + "next_pipe_bottom_y = np.array(range(0,400,resolution))\n", + "player_vel = np.array(range(-10,10, 2))\n", + "player_y = np.array(range(0,400,resolution))\n", + "next_pipe_dist_to_player = np.array(range(0,300,resolution))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we decided to write the code of a Sarsa($\\lambda$) algorithm. The writing of the code is quite easy, all the difficulty for the next step lies in choosing good parameters. A first idea was to take random parameters and to compare the learning converge ratio of the sarsa algorithme on the first 100 games to detect the best parameters. After different tries, here are the values I choosed and why.\n", + "\n", + "* **$\\alpha$ = 0.1 :** I tried the value of the RL notebook at the begining $\\alpha$ = 0.01. This parameter correponds to the compromise between the speed of convergence and the accuracy of convergence. But with $\\alpha$ = 0.01 I had no improvement on the first 100 games, that's why I decided to increase it until $\\alpha$ = 0.1.\n", + "* **$\\gamma$ = 0.9:** It corresponds to factor of the discounted reward. It guarantees the convergence of the reward model. So we want it to be quite close to 1. I used the same value than in the course\n", + "* **$\\lambda$ = 0.9:** It is the parameter of the eligibility trace. It corresponds approximately to the number of step that we want to memorize. A state seen n steps ago will have a weight $\\lambda^n (1-\\lambda)$. I wanted this weight to be approximately 0.01 after 10 states, so I chose $\\lambda$ = 0.9\n", + "* **$\\epsilon$ = 0.05:** We want a random action that allow us to sometimes go through the first pipe. That's why 0.05 is a relevant solution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# parameter definition\n", + "gamma = 0.9\n", + "alpha = 0.1\n", + "epsilon = 1\n", + "lbd = 0.9\n", + "\n", + "# initialize matrix Q and eligibility\n", + "Q = np.zeros((len(player_y), len(next_pipe_bottom_y), len(player_vel), len(next_pipe_dist_to_player) ,2))\n", + "eligibility = np.zeros((len(player_y), len(next_pipe_bottom_y), len(player_vel), len(next_pipe_dist_to_player)))\n", + "\n", + "n = 100000 ##nb de parties utilisées" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Other things to make our algorithm converge faster:\n", + "\n", + "* **Reward shaping:** I decided to reshape the reward. (see function RewardShape). The idea is to make the loss worst when Flappy is really far from going through the pipe. That's why I used a reward that is -1000 multiplied to the distance between flappy when he dies and the middle of the hole he should have gone through.\n", + "* **Context dependent exploration:** The idea is to reduce the value of epsilon with time to reduce the frequency of random action. It allows us to see Flappy's progression. However to keep exploring new states the reducing of epsilon is really slow: I divided epsilon by 1.1 every 1000 games" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# convert boolean to Action\n", + "def ToAction(a):\n", + " if a:\n", + " action = 119\n", + " else:\n", + " action = None\n", + " return action\n", + "\n", + "# reward shaping\n", + "def RewardShape(reward):\n", + " reward_ = reward\n", + " if reward == 1:\n", + " reward_ = 100\n", + " if reward == -5:\n", + " state_end = game.getGameState()\n", + " reward_ = -1000 * abs(state_new['player_y'] - (state_new['next_pipe_top_y']+state_new['next_pipe_bottom_y'])/2)\n", + " return reward_\n", + "\n", + "def epsilon_greedy(Q, i,j,k, l, epsilon, state):\n", + " a = np.argmax(Q[i][j][k][l][:])\n", + " if(np.random.rand()<=epsilon): # random action\n", + " if np.random.rand()<= 0.05:\n", + " a = 1-a\n", + " return a" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## TRAINING GAME\n", + "cumulated = np.zeros((100))\n", + "count = 0\n", + "\n", + "game = FlappyBird()\n", + "p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)\n", + "\n", + "for kk in range(1,n):\n", + " if ((kk+1)%1000==0):\n", + " epsilon = epsilon/1.1\n", + " if ((kk+1)%100 == 0):\n", + " print('Moyenne sur les 100 derniers jeux:')\n", + " print((5 + np.mean(cumulated)))\n", + " cumulated = np.zeros((100))\n", + " count = 0\n", + " p.init()\n", + " reward = 0.0\n", + " p.reset_game()\n", + " state = game.getGameState()\n", + " screen = p.getScreenRGB()\n", + " i = np.argmin(abs(player_y - state['player_y']))\n", + " j = np.argmin(abs(next_pipe_bottom_y - state['next_pipe_bottom_y']))\n", + " k = np.argmin(abs(player_vel - state['player_vel']))\n", + " l = np.argmin(abs(next_pipe_dist_to_player - state['next_pipe_dist_to_player']))\n", + " a = epsilon_greedy(Q, i, j, k, l, epsilon, state)\n", + " \n", + " \n", + " while(not p.game_over()):\n", + " # observe r, s and s' \n", + " reward = p.act(ToAction(a))\n", + " reward_ = RewardShape(reward)\n", + " state_new = game.getGameState()\n", + " screen_new = p.getScreenRGB()\n", + " i_new = np.argmin(abs(player_y - state_new['player_y']))\n", + " j_new = np.argmin(abs(next_pipe_bottom_y - state_new['next_pipe_bottom_y']))\n", + " k_new = np.argmin(abs(player_vel - state_new['player_vel']))\n", + " l_new = np.argmin(abs(next_pipe_dist_to_player - state_new['next_pipe_dist_to_player']))\n", + " aa = epsilon_greedy(Q, i_new, j_new, k_new, l_new, epsilon, state_new)\n", + " \n", + " eligibility = gamma * lbd * eligibility\n", + " eligibility[i][j][k][l] = 1\n", + " delta = reward_ + gamma*Q[i_new][j_new][k_new][l_new][aa] - Q[i][j][k][l][a] \n", + " Q[i][j][k][l][a] = Q[i][j][k][l][a] + alpha * eligibility[i][j][k][l]* delta\n", + " \n", + "\n", + " i = i_new\n", + " j = j_new\n", + " k = k_new \n", + " l = l_new\n", + " a = aa\n", + " \n", + " cumulated[count] = cumulated[count] + reward\n", + " count += 1\n", + "Q\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once Q is computed, we saved it in a python file. So next time, we'll just have to import the matrix we defined the policy below:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "## SAVE\n", + "#with open('Qsarsa', 'wb') as f:\n", + "# pickle.dump(Q,f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def QToPolicy(Q):\n", + " pi = np.zeros((len(player_y), len(next_pipe_bottom_y), len(player_vel), len(next_pipe_dist_to_player)))\n", + " for i in range(len(player_y)):\n", + " for j in range(len(next_pipe_bottom_y)):\n", + " for k in range(len(player_vel)):\n", + " for l in range(len(next_pipe_dist_to_player)):\n", + " pi[i][j][k][l] = np.argmax(Q[i][j][k][l][:])\n", + " return pi\n", + "\n", + "def FlappyAgentSarsa(state, screen):\n", + " \n", + " # determine state\n", + " i = np.argmin(abs(player_y - state['player_y']))\n", + " j = np.argmin(abs(next_pipe_bottom_y - state['next_pipe_bottom_y']))\n", + " k = np.argmin(abs(player_vel - state['player_vel']))\n", + " l = np.argmin(abs(next_pipe_dist_to_player - state['next_pipe_dist_to_player']))\n", + " \n", + " action = ToAction(Pisarsa[i][j][k][l])\n", + " return action\n", + "\n", + "file = open(\"Qsarsa\",'rb')\n", + "Qsarsa = pickle.load(file)\n", + "Pisarsa = QToPolicy(Qsarsa)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# run.py\n", + "\n", + "game = FlappyBird()\n", + "p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)\n", + "\n", + "p.init()\n", + "reward = 0.0\n", + "\n", + "nb_games = 100\n", + "cumulated = np.zeros((nb_games))\n", + "\n", + "for i in range(nb_games):\n", + " p.reset_game()\n", + " \n", + " while(not p.game_over()):\n", + " state = game.getGameState()\n", + " screen = p.getScreenRGB()\n", + " action=FlappyAgentSarsa(state, screen)\n", + " \n", + " reward = p.act(action)\n", + " cumulated[i] = cumulated[i] + reward\n", + "\n", + "average_score = np.mean(cumulated)\n", + "max_score = np.max(cumulated)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "print(average_score)\n", + "print(max_score)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda root]", + "language": "python", + "name": "conda-root-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Roy/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/Roy/.ipynb_checkpoints/Untitled-checkpoint.ipynb new file mode 100644 index 0000000..2fd6442 --- /dev/null +++ b/Roy/.ipynb_checkpoints/Untitled-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Roy/Flappy - Comments.ipynb b/Roy/Flappy - Comments.ipynb new file mode 100644 index 0000000..ada11d1 --- /dev/null +++ b/Roy/Flappy - Comments.ipynb @@ -0,0 +1,408 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RL challenge\n", + "\n", + "Your challenge is to learn to play [Flappy Bird](https://en.wikipedia.org/wiki/Flappy_Bird)!\n", + "\n", + "Flappybird is a side-scrolling game where the agent must successfully nagivate through gaps between pipes. Only two actions in this game: at each time step, either you click and the bird flaps, or you don't click and gravity plays its role.\n", + "\n", + "\n", + "## Step 1 : FlappyAgentManual & FlappyAgentEasy\n", + "\n", + "\n", + "To begin with this Flappy Challenge, I decided to develop two very easy functions in order to understand the kind of trajectories that may help Flappy to win his challenge. We have:\n", + "\n", + "* FlappyAgentManual : it is only a function that allow you to play Flappy Bird. You should press a key of your keyboard to make the bird flap. This function allow you to play Flappy and to decide the trajectory of the bird.\n", + "\n", + "* FlappyAgentEasy : it is a completely deterministic function. It is not interesting for Reinforcment Learning but it is a first function that uses state variables to help Flappy to fly.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from ple.games.flappybird import FlappyBird\n", + "from ple import PLE\n", + "import pygame\n", + "import numpy as np\n", + "import pickle" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# first policies\n", + "\n", + "def FlappyAgentManual(state, screen):\n", + " action = None\n", + " events = pygame.event.get()\n", + " for event in events:\n", + " print(event)\n", + " if event.type == pygame.KEYUP:\n", + " action = 119\n", + " return action\n", + "\n", + "def FlappyAgentEasy(state, screen):\n", + " if state['player_y'] > state['next_pipe_bottom_y'] - 50:\n", + " action = 119\n", + " else:\n", + " action = None\n", + " return action" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# run.py\n", + "game = FlappyBird()\n", + "p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)\n", + "\n", + "p.init()\n", + "reward = 0.0\n", + "\n", + "nb_games = 1\n", + "nb_data = 200\n", + "cumulated = np.zeros((nb_games))\n", + "\n", + "\n", + "next_pipe_bottom_y = np.array(range(nb_data))\n", + "player_vel = np.array(range(nb_data))\n", + "player_y = np.array(range(nb_data))\n", + "next_pipe_dist_to_player = np.array(range(nb_data))\n", + "count = 0\n", + "for i in range(nb_games):\n", + " p.reset_game()\n", + " \n", + " while(not p.game_over()):\n", + " state = game.getGameState()\n", + " screen = p.getScreenRGB()\n", + " action=FlappyAgentEasy(state, screen) \n", + " reward = p.act(action)\n", + " cumulated[i] = cumulated[i] + reward\n", + " if count <= nb_data-1:\n", + " next_pipe_bottom_y[count] = state['next_pipe_bottom_y']\n", + " player_vel[count] = state['player_vel']\n", + " player_y[count] = state['player_y']\n", + " next_pipe_dist_to_player[count] = state['next_pipe_dist_to_player']\n", + " print(action, reward)\n", + " count = count +1\n", + "\n", + "average_score = np.mean(cumulated)\n", + "max_score = np.max(cumulated)\n", + "\n", + "print(average_score)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(player_y)\n", + "print(player_vel)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you can see, the previous code allow to begin a game and to read the values of the state of the game at every moment of the game. Through this pre-study, we can read the value that may take every variable such as **player_y**, **player_vel**, **next_pipe_bottom_y**, **next_pipe_dist_to_player** and the **reward**\n", + "\n", + "\n", + "## FlappyAgentSarsa\n", + "\n", + "Since we know the values, the next step of the challenge was to simplify the number of possible states in order to reduce the size of the space we want to explore afterwards. We define then a vector for each variable that goes from the min to the max values with a specific resolution. At the beginning we worked with around 20 possible values for each variables. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# grid definition\n", + "resolution = 20\n", + "next_pipe_bottom_y = np.array(range(0,400,resolution))\n", + "player_vel = np.array(range(-10,10, 2))\n", + "player_y = np.array(range(0,400,resolution))\n", + "next_pipe_dist_to_player = np.array(range(0,300,resolution))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we decided to write the code of a Sarsa($\\lambda$) algorithm. The writing of the code is quite easy, all the difficulty for the next step lies in choosing good parameters. A first idea was to take random parameters and to compare the learning converge ratio of the sarsa algorithme on the first 100 games to detect the best parameters. After different tries, here are the values I choosed and why.\n", + "\n", + "* **$\\alpha$ = 0.1 :** I tried the value of the RL notebook at the begining $\\alpha$ = 0.01. This parameter correponds to the compromise between the speed of convergence and the accuracy of convergence. But with $\\alpha$ = 0.01 I had no improvement on the first 100 games, that's why I decided to increase it until $\\alpha$ = 0.1.\n", + "* **$\\gamma$ = 0.9:** It corresponds to factor of the discounted reward. It guarantees the convergence of the reward model. So we want it to be quite close to 1. I used the same value than in the course\n", + "* **$\\lambda$ = 0.9:** It is the parameter of the eligibility trace. It corresponds approximately to the number of step that we want to memorize. A state seen n steps ago will have a weight $\\lambda^n (1-\\lambda)$. I wanted this weight to be approximately 0.01 after 10 states, so I chose $\\lambda$ = 0.9\n", + "* **$\\epsilon$ = 0.05:** We want a random action that allow us to sometimes go through the first pipe. That's why 0.05 is a relevant solution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# parameter definition\n", + "gamma = 0.9\n", + "alpha = 0.1\n", + "epsilon = 1\n", + "lbd = 0.9\n", + "\n", + "# initialize matrix Q and eligibility\n", + "Q = np.zeros((len(player_y), len(next_pipe_bottom_y), len(player_vel), len(next_pipe_dist_to_player) ,2))\n", + "eligibility = np.zeros((len(player_y), len(next_pipe_bottom_y), len(player_vel), len(next_pipe_dist_to_player)))\n", + "\n", + "n = 100000 ##nb de parties utilisées" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Other things to make our algorithm converge faster:\n", + "\n", + "* **Reward shaping:** I decided to reshape the reward. (see function RewardShape). The idea is to make the loss worst when Flappy is really far from going through the pipe. That's why I used a reward that is -1000 multiplied to the distance between flappy when he dies and the middle of the hole he should have gone through.\n", + "* **Context dependent exploration:** The idea is to reduce the value of epsilon with time to reduce the frequency of random action. It allows us to see Flappy's progression. However to keep exploring new states the reducing of epsilon is really slow: I divided epsilon by 1.1 every 1000 games" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# convert boolean to Action\n", + "def ToAction(a):\n", + " if a:\n", + " action = 119\n", + " else:\n", + " action = None\n", + " return action\n", + "\n", + "# reward shaping\n", + "def RewardShape(reward):\n", + " reward_ = reward\n", + " if reward == 1:\n", + " reward_ = 100\n", + " if reward == -5:\n", + " state_end = game.getGameState()\n", + " reward_ = -1000 * abs(state_new['player_y'] - (state_new['next_pipe_top_y']+state_new['next_pipe_bottom_y'])/2)\n", + " return reward_\n", + "\n", + "def epsilon_greedy(Q, i,j,k, l, epsilon, state):\n", + " a = np.argmax(Q[i][j][k][l][:])\n", + " if(np.random.rand()<=epsilon): # random action\n", + " if np.random.rand()<= 0.05:\n", + " a = 1-a\n", + " return a" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## TRAINING GAME\n", + "cumulated = np.zeros((100))\n", + "count = 0\n", + "\n", + "game = FlappyBird()\n", + "p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)\n", + "\n", + "for kk in range(1,n):\n", + " if ((kk+1)%1000==0):\n", + " epsilon = epsilon/1.1\n", + " if ((kk+1)%100 == 0):\n", + " print('Moyenne sur les 100 derniers jeux:')\n", + " print((5 + np.mean(cumulated)))\n", + " cumulated = np.zeros((100))\n", + " count = 0\n", + " p.init()\n", + " reward = 0.0\n", + " p.reset_game()\n", + " state = game.getGameState()\n", + " screen = p.getScreenRGB()\n", + " i = np.argmin(abs(player_y - state['player_y']))\n", + " j = np.argmin(abs(next_pipe_bottom_y - state['next_pipe_bottom_y']))\n", + " k = np.argmin(abs(player_vel - state['player_vel']))\n", + " l = np.argmin(abs(next_pipe_dist_to_player - state['next_pipe_dist_to_player']))\n", + " a = epsilon_greedy(Q, i, j, k, l, epsilon, state)\n", + " \n", + " \n", + " while(not p.game_over()):\n", + " # observe r, s and s' \n", + " reward = p.act(ToAction(a))\n", + " reward_ = RewardShape(reward)\n", + " state_new = game.getGameState()\n", + " screen_new = p.getScreenRGB()\n", + " i_new = np.argmin(abs(player_y - state_new['player_y']))\n", + " j_new = np.argmin(abs(next_pipe_bottom_y - state_new['next_pipe_bottom_y']))\n", + " k_new = np.argmin(abs(player_vel - state_new['player_vel']))\n", + " l_new = np.argmin(abs(next_pipe_dist_to_player - state_new['next_pipe_dist_to_player']))\n", + " aa = epsilon_greedy(Q, i_new, j_new, k_new, l_new, epsilon, state_new)\n", + " \n", + " eligibility = gamma * lbd * eligibility\n", + " eligibility[i][j][k][l] = 1\n", + " delta = reward_ + gamma*Q[i_new][j_new][k_new][l_new][aa] - Q[i][j][k][l][a] \n", + " Q[i][j][k][l][a] = Q[i][j][k][l][a] + alpha * eligibility[i][j][k][l]* delta\n", + " \n", + "\n", + " i = i_new\n", + " j = j_new\n", + " k = k_new \n", + " l = l_new\n", + " a = aa\n", + " \n", + " cumulated[count] = cumulated[count] + reward\n", + " count += 1\n", + "Q\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once Q is computed, we saved it in a python file. So next time, we'll just have to import the matrix we defined the policy below:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "## SAVE\n", + "#with open('Qsarsa', 'wb') as f:\n", + "# pickle.dump(Q,f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def QToPolicy(Q):\n", + " pi = np.zeros((len(player_y), len(next_pipe_bottom_y), len(player_vel), len(next_pipe_dist_to_player)))\n", + " for i in range(len(player_y)):\n", + " for j in range(len(next_pipe_bottom_y)):\n", + " for k in range(len(player_vel)):\n", + " for l in range(len(next_pipe_dist_to_player)):\n", + " pi[i][j][k][l] = np.argmax(Q[i][j][k][l][:])\n", + " return pi\n", + "\n", + "def FlappyAgentSarsa(state, screen):\n", + " \n", + " # determine state\n", + " i = np.argmin(abs(player_y - state['player_y']))\n", + " j = np.argmin(abs(next_pipe_bottom_y - state['next_pipe_bottom_y']))\n", + " k = np.argmin(abs(player_vel - state['player_vel']))\n", + " l = np.argmin(abs(next_pipe_dist_to_player - state['next_pipe_dist_to_player']))\n", + " \n", + " action = ToAction(Pisarsa[i][j][k][l])\n", + " return action\n", + "\n", + "file = open(\"Qsarsa\",'rb')\n", + "Qsarsa = pickle.load(file)\n", + "Pisarsa = QToPolicy(Qsarsa)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# run.py\n", + "\n", + "game = FlappyBird()\n", + "p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)\n", + "\n", + "p.init()\n", + "reward = 0.0\n", + "\n", + "nb_games = 100\n", + "cumulated = np.zeros((nb_games))\n", + "\n", + "for i in range(nb_games):\n", + " p.reset_game()\n", + " \n", + " while(not p.game_over()):\n", + " state = game.getGameState()\n", + " screen = p.getScreenRGB()\n", + " action=FlappyAgentSarsa(state, screen)\n", + " \n", + " reward = p.act(action)\n", + " cumulated[i] = cumulated[i] + reward\n", + "\n", + "average_score = np.mean(cumulated)\n", + "max_score = np.max(cumulated)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "print(average_score)\n", + "print(max_score)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda root]", + "language": "python", + "name": "conda-root-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Roy/FlappyAgent.py b/Roy/FlappyAgent.py new file mode 100644 index 0000000..a36d501 --- /dev/null +++ b/Roy/FlappyAgent.py @@ -0,0 +1,44 @@ +import pickle +import numpy as np + +# grid definition +resolution = 20 +next_pipe_bottom_y = np.array(range(0,400,resolution)) +player_vel = np.array(range(-10,10, 2)) +player_y = np.array(range(0,400,resolution)) +next_pipe_dist_to_player = np.array(range(0,300,resolution)) + +# convert boolean to Action +def ToAction(a): + if a: + action = 119 + else: + action = None + return action + +def QToPolicy(Q): + pi = np.zeros((len(player_y), len(next_pipe_bottom_y), len(player_vel), len(next_pipe_dist_to_player))) + for i in range(len(player_y)): + for j in range(len(next_pipe_bottom_y)): + for k in range(len(player_vel)): + for l in range(len(next_pipe_dist_to_player)): + pi[i][j][k][l] = np.argmax(Q[i][j][k][l][:]) + return pi + +def FlappyPolicy(state, screen): + + # determine state + i = np.argmin(abs(player_y - state['player_y'])) + j = np.argmin(abs(next_pipe_bottom_y - state['next_pipe_bottom_y'])) + k = np.argmin(abs(player_vel - state['player_vel'])) + l = np.argmin(abs(next_pipe_dist_to_player - state['next_pipe_dist_to_player'])) + + action = ToAction(Pisarsa[i][j][k][l]) + return action + +file = open("Qsarsa",'rb') +Qsarsa = pickle.load(file) +Pisarsa = QToPolicy(Qsarsa) + + + diff --git a/Roy/Qsarsa b/Roy/Qsarsa new file mode 100644 index 0000000..faa0a95 Binary files /dev/null and b/Roy/Qsarsa differ diff --git a/RandomBird/run.py b/Roy/run.py similarity index 100% rename from RandomBird/run.py rename to Roy/run.py