bandit_benchmark.py
# pylint: disable=wrong-import-position, unnecessary-lambda-assignment
from typing import Callable

import matplotlib as mpl

mpl.use("TkAgg")  # otherwise gymnasium conflicts with matplotlib

import gymnasium as gym
import numpy as np
from algorithms.approximation import TableExp, TableMean
from algorithms.multiarmed_bandit import (
    UCB,
    EpsGreedy,
    GradientBandit,
    GradientBanditBiased,
    MultiArmedBanditPolicy,
    OptimalBandit,
    OptimalGradientBandit,
)
from matplotlib import pyplot as plt
from scipy.ndimage import gaussian_filter1d
from tqdm import tqdm

plt.rcParams["text.usetex"] = True  # render plot labels with LaTeX (requires a TeX install)


def rewards_during_training(
    build_agent: Callable[[gym.Env], MultiArmedBanditPolicy],
    seed=0,
    num_steps=300,
) -> np.ndarray:
    """Train one agent for `num_steps` steps and return the per-step rewards."""
    np.random.seed(seed)
    env = gym.make(
        "MultiArmedBandit",
        means=np.random.randn(5),  # five arms with standard-normal true means
        stds=np.full(5, fill_value=3),
        render_mode="ansi",
    )
    agent = build_agent(env)
    rewards = np.zeros(num_steps)
    _observation, _info = env.reset(seed=seed)
    for i in range(num_steps):
        action = agent.predict(exploration=True)
        _observation, rewards[i], _terminated, _truncated, _info = env.step(action)
        agent.learn(action, rewards[i])
    return rewards
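
# NOTE: gym.make above assumes the custom "MultiArmedBandit" environment is
# registered elsewhere in this repo. A minimal registration sketch (the module
# path below is hypothetical, not the repo's actual entry point):
#
#   gym.register(
#       id="MultiArmedBandit",
#       entry_point="algorithms.envs:MultiArmedBanditEnv",  # hypothetical path
#   )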


def plot_training(
    build_agent: Callable[[gym.Env], MultiArmedBanditPolicy], title: str
):
    """Plot the smoothed mean training curve with standard-error bands."""
    runs = 10000
    rewards = np.stack(
        [rewards_during_training(build_agent, seed) for seed in tqdm(range(runs))]
    )
    mean = gaussian_filter1d(rewards.mean(axis=0), sigma=10)  # smooth for readability
    std = rewards.std(axis=0) / np.sqrt(runs)  # standard error of the mean
    plt.xlim(1, 300)
    plt.ylim(0, 2)
    plt.xticks(np.linspace(0, 300, 11))
    plt.yticks(np.linspace(0, 2, 11))
    plt.grid(visible=True)
    plt.plot(mean, "-", c="red")
    plt.plot(mean + std, "--", c="blue")
    plt.plot(mean - std, "--", c="blue")
    plt.title(title, fontdict={"size": 20})


def under_curve(
    build_agent: Callable[[gym.Env], MultiArmedBanditPolicy]
) -> tuple[float, float]:
    """Return the area under the mean training curve and its error estimate."""
    runs = 10000
    rewards = np.stack(
        [rewards_during_training(build_agent, seed) for seed in tqdm(range(runs))]
    )
    mean = rewards.mean(axis=0)
    stds = rewards.std(axis=0) / np.sqrt(runs)  # per-step standard errors
    # The per-step errors are summed rather than added in quadrature because
    # rewards within a run are correlated; this gives a conservative estimate.
    return mean.sum(), stds.sum()
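

# A minimal sanity-check sketch (helper added for illustration, not part of
# the original benchmark) for why under_curve sums standard errors:
def _correlated_error_demo(n=10_000):
    """For perfectly correlated estimates, the std of the sum equals the sum
    of the stds; independent errors would instead add in quadrature. Since
    per-step rewards within a run are correlated, summing the per-step errors
    is the conservative choice."""
    x = np.random.randn(n)
    pair = np.stack([x, x], axis=1)  # two perfectly correlated "steps"
    assert np.isclose(pair.sum(axis=1).std(), pair.std(axis=0).sum())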


# Agent factories: each takes the environment and returns a configured policy.
greedy = lambda env: EpsGreedy(k=env.means.size, config={"eps": 0.0})
eps_greedy = lambda env: EpsGreedy(
    k=env.means.size,
    config={"eps": 0.05, "q": TableMean({"default": np.inf})},
)
eps_greedy_exp = lambda env: EpsGreedy(
    k=env.means.size,
    config={"eps": 0.05, "q": TableExp({"default": 3.0, "lr": 0.1})},
)
ucb = lambda env: UCB(k=env.means.size, config={"c": 3.0})
optimal = lambda env: OptimalBandit(k=env.means.size, config={"means": env.means})
gradient_biased = lambda env: GradientBanditBiased(k=env.means.size, config={"lr": 0.2})
gradient = lambda env: GradientBandit(k=env.means.size, config={"lr": 0.2})
optimal_gradient = lambda env: OptimalGradientBandit(
    k=env.means.size, config={"means": env.means, "lr": 0.2}
)

# Select the agent to benchmark, report its score, and save its training curve.
current = optimal_gradient
print(under_curve(current))
plot_training(current, r"Optimal gradient ascent, $\alpha=0.2$")
plt.savefig("images/optimal_gradient.svg")
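
# The table below was presumably produced by re-running this script with each
# factory assigned to `current`; a one-pass sketch (names as defined above):
#
#   for name, factory in [("optimal", optimal), ("eps_greedy", eps_greedy),
#                         ("ucb", ucb), ("greedy", greedy),
#                         ("optimal_gradient", optimal_gradient),
#                         ("gradient_biased", gradient_biased),
#                         ("gradient", gradient)]:
#       print(name, under_curve(factory))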

# Area under the training curve (sum of mean rewards ± summed standard error):
# optimal            350±10
# eps_greedy         242±10
# ucb                262±10
# greedy             225±10
# optimal_gradient   308±10
# gradient_biased    256±10
# gradient           247±10