|
1 | 1 | import numpy as np |
2 | 2 |
|
def policy_evaluation(S, A, P, R, pi, gamma=1.0, theta=1e-10):
    """
    Iterative (synchronous) policy evaluation for a tabular MDP with
    general rewards R(s, a, s').

    Parameters
    ----------
    S : sequence of states (length |S|; used only for shape validation)
    A : sequence of actions (length |A|; used only for shape validation)
    P : ndarray [|S|, |A|, |S|] -- transition probabilities P[s' | s, a]
    R : ndarray [|S|, |A|, |S|] -- rewards R(s, a, s')
    pi : ndarray [|S|, |A|] -- row-stochastic policy (one-hot for deterministic)
    gamma : discount factor
    theta : convergence threshold on max |V_new - V|

    Returns
    -------
    V : ndarray of shape [|S|] -- state values of the policy pi

    Raises
    ------
    ValueError
        If the array shapes are mutually inconsistent.
    """
    n_states, n_actions, n_next = P.shape
    # Explicit raises instead of `assert`: asserts are stripped under
    # `python -O`, which would silently skip input validation.
    if n_states != len(S) or n_actions != len(A) or n_next != n_states:
        raise ValueError(
            f"P has shape {P.shape}, expected ({len(S)}, {len(A)}, {len(S)})"
        )
    if pi.shape != (n_states, n_actions):
        raise ValueError(
            f"pi has shape {pi.shape}, expected ({n_states}, {n_actions})"
        )

    V = np.zeros(n_states, dtype=float)
    while True:
        V_new = np.zeros_like(V)
        for s in range(n_states):
            val = 0.0
            for a in range(n_actions):
                p_sa = pi[s, a]
                if p_sa == 0.0:
                    continue  # skip actions the policy never takes
                # Expected one-step return:
                #   sum_s' P(s'|s,a) * (R(s,a,s') + gamma * V(s'))
                val += p_sa * np.sum(P[s, a, :] * (R[s, a, :] + gamma * V))
            V_new[s] = val
        # Sup-norm of the Bellman update; converged when below theta.
        delta = float(np.max(np.abs(V_new - V)))
        V = V_new
        if delta < theta:
            break
    return V
16 | 38 |
|
def q_from_v(S, A, P, R, V, gamma=1.0):
    """
    Build the action-value table Q from a state-value vector V.

    Q(s, a) = sum_s' P(s'|s, a) * (R(s, a, s') + gamma * V(s'))

    Parameters
    ----------
    S, A : state / action sequences (unused here; kept for a uniform API)
    P : ndarray [|S|, |A|, |S|] -- transition probabilities
    R : ndarray [|S|, |A|, |S|] -- rewards R(s, a, s')
    V : ndarray [|S|] -- state values
    gamma : discount factor

    Returns
    -------
    Q : ndarray of shape [|S|, |A|]
    """
    n_states, n_actions, _ = P.shape
    # One Bellman backup per (state, action) pair, collected row by row.
    return np.array(
        [
            [np.sum(P[s, a, :] * (R[s, a, :] + gamma * V)) for a in range(n_actions)]
            for s in range(n_states)
        ],
        dtype=float,
    )
0 commit comments