Add Chapter 4 Dynamic Programming code (policy evaluation, policy iteration, value iteration)

srikanthbaride · srikanthbaride · commit aec264451a66 · 2025-09-02T15:01:23.000-05:00
diff --git a/ch4_dp/.gitignore b/ch4_dp/.gitignore
@@ -0,0 +1,4 @@
+.venv/
+__pycache__/
+*.pyc
+artifacts/latex/*.tex
diff --git a/ch4_dp/Makefile b/ch4_dp/Makefile
@@ -0,0 +1,10 @@
+ART4=artifacts/ch4_4x4
+ART6=artifacts/ch4_6x6
+
+# The example scripts import the `rldp` package, which lives under src/ and is
+# not installed by requirements.txt, so put src/ on the module search path
+# (any PYTHONPATH already set by the caller is kept after it).
+export PYTHONPATH := src$(if $(PYTHONPATH),:$(PYTHONPATH))
+
+.PHONY: ch4-artifacts ch4-tables
+# Generate the value/policy CSVs and value PNGs for both grid sizes.
+ch4-artifacts:
+	python examples/generate_artifacts.py --env 4x4 --outdir $(ART4)
+	python examples/generate_artifacts.py --env 6x6 --outdir $(ART6)
+
+# Convert every CSV of the 4x4 run into a bare tabular under artifacts/latex.
+ch4-tables: ch4-artifacts
+	python examples/csv_to_latex.py $(ART4) --outdir artifacts/latex --no-wrap --round 0
diff --git a/ch4_dp/README.md b/ch4_dp/README.md
@@ -0,0 +1,46 @@
+# Chapter 4 – Dynamic Programming (DP) Code
+
+This repo contains clean, runnable reference code to accompany **Chapter 4: Dynamic Programming Approaches**.
+
+## Contents
+
+- `src/rldp/dp.py` – Policy Evaluation, Policy Iteration, Value Iteration
+- `src/rldp/gridworld.py` – Simple deterministic GridWorld (4×4, 6×6)
+- `src/rldp/latex.py` – CSV → LaTeX table helper (booktabs-ready)
+- `examples/generate_artifacts.py` – Reproduces tables/plots for the chapter
+- `examples/csv_to_latex.py` – Convert CSV matrices to LaTeX tables
+- `Makefile` – Convenience targets
+- `requirements.txt` – Python deps
+
+## Quickstart
+
+```bash
+python -m venv .venv && source .venv/bin/activate  # (Windows: .venv\Scripts\activate)
+pip install -r requirements.txt
+export PYTHONPATH=src  # make the rldp package importable (Windows cmd: set PYTHONPATH=src)
+
+# Generate artifacts for 4×4 and 6×6 worlds
+python examples/generate_artifacts.py --env 4x4 --outdir artifacts/ch4_4x4
+python examples/generate_artifacts.py --env 6x6 --outdir artifacts/ch4_6x6
+
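+# Convert CSV → LaTeX tabular (booktabs); point the first argument at a CSV that exists
+# (generate_artifacts.py as shipped writes e.g. vi_values_4x4.csv)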
+python examples/csv_to_latex.py artifacts/ch4_4x4/vi_values_4x4_k2.csv   --outdir artifacts/latex   --caption 'Value iteration estimates (k=2) on the $4\times4$ gridworld.'   --label tab:vi-4x4-k2   --float-format ".0f"
+```
+
+Then include in LaTeX:
+
+```latex
+\usepackage{booktabs} % in preamble
+% ...
+\input{artifacts/latex/vi_values_4x4_k2.tex}
+```
+
+## Make targets
+
+```bash
+make ch4-artifacts    # build default artifacts
+make ch4-tables       # CSV → LaTeX for default directory
+```
+
+## License
+
+MIT for the code snippets here. Attribution appreciated.
diff --git a/ch4_dp/artifacts/.gitkeep b/ch4_dp/artifacts/.gitkeep
diff --git a/ch4_dp/examples/csv_to_latex.py b/ch4_dp/examples/csv_to_latex.py
@@ -0,0 +1,68 @@
+from __future__ import annotations
+import argparse, os, sys
+from pathlib import Path
+from rldp.latex import grid_csv_to_tabular
+
+def convert_one(csv_path: str, out_dir: str, caption: str | None, label: str | None,
+                colfmt: str | None, float_format: str | None, index: bool,
+                wrap_table: bool, round_digits: int | None,
+                transpose: bool, suffix: str) -> str:
+    tex = grid_csv_to_tabular(csv_path, caption, label, colfmt, float_format,
+                              index, True, wrap_table, round_digits, transpose)
+    name = Path(csv_path).stem + (suffix if suffix else "") + ".tex"
+    os.makedirs(out_dir, exist_ok=True)
+    out_path = str(Path(out_dir) / name)
+    with open(out_path, "w", encoding="utf-8") as f:
+        f.write(tex)
+    return out_path
+
+def main():
+    p = argparse.ArgumentParser(description="Convert CSV grids (values/policies) to LaTeX tables.")
+    p.add_argument("inputs", nargs="+", help="CSV files or a directory (will convert all *.csv).")
+    p.add_argument("--outdir", default="artifacts/latex", help="Output directory for .tex tables.")
+    p.add_argument("--caption", default=None, help="Caption to use (optional).")
+    p.add_argument("--label", default=None, help="LaTeX label (e.g., tab:vi-iterations).")
+    p.add_argument("--colfmt", default=None, help="LaTeX column format, e.g., 'cccc'.")
+    p.add_argument("--float-format", default=None, help="Python format, e.g., '.0f' or '{:.2f}'.")
+    p.add_argument("--index", action="store_true", help="Include DataFrame index.")
+    p.add_argument("--no-wrap", action="store_true", help="Emit only tabular (no table environment).")
+    p.add_argument("--round", type=int, default=None, help="Round all numbers to N decimals.")
+    p.add_argument("--transpose", action="store_true", help="Transpose before rendering.")
+    p.add_argument("--suffix", default="", help="Append to output filename stem (e.g., '_nice').")
+    args = p.parse_args()
+
+    # Expand inputs: if a directory is given, take all CSVs in it
+    files = []
+    for item in args.inputs:
+        pth = Path(item)
+        if pth.is_dir():
+            files.extend(str(p) for p in pth.glob("*.csv"))
+        elif pth.suffix.lower() == ".csv":
+            files.append(str(pth))
+        else:
+            print(f"Skipping non-CSV: {item}", file=sys.stderr)
+
+    if not files:
+        print("No CSV files found.", file=sys.stderr)
+        sys.exit(1)
+
+    created = []
+    for csv in sorted(files):
+        out = convert_one(
+            csv_path=csv,
+            out_dir=args.outdir,
+            caption=args.caption,
+            label=args.label,
+            colfmt=args.colfmt,
+            float_format=args.float_format,
+            index=args.index,
+            wrap_table=not args.no_wrap,
+            round_digits=args.round,
+            transpose=args.transpose,
+            suffix=args.suffix,
+        )
+        created.append(out)
+        print(f"Wrote: {out}")
+
+if __name__ == "__main__":
+    main()
diff --git a/ch4_dp/examples/generate_artifacts.py b/ch4_dp/examples/generate_artifacts.py
@@ -0,0 +1,72 @@
+from __future__ import annotations
+import argparse, os
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+
+from rldp.gridworld import make_gridworld, unravel_index, ACTIONS, arrows_from_policy
+from rldp.dp import policy_evaluation, policy_iteration, value_iteration
+
+def save_grid_csv(V, n, out_csv):
+    M = np.zeros((n, n))
+    for s in range(n*n):
+        i, j = unravel_index(s, n)
+        M[i, j] = V[s]
+    df = pd.DataFrame(M)
+    os.makedirs(os.path.dirname(out_csv), exist_ok=True)
+    df.to_csv(out_csv, index=False)
+
+def save_policy_csv(pi, n, out_csv):
+    arr = arrows_from_policy(pi).reshape(n, n)
+    df = pd.DataFrame(arr)
+    os.makedirs(os.path.dirname(out_csv), exist_ok=True)
+    df.to_csv(out_csv, index=False)
+
+def plot_values(V, n, out_png, title=None):
+    M = np.zeros((n, n))
+    for s in range(n*n):
+        i, j = unravel_index(s, n)
+        M[i, j] = V[s]
+    fig = plt.figure()
+    plt.imshow(M, interpolation='nearest')
+    plt.colorbar()
+    if title:
+        plt.title(title)
+    for i in range(n):
+        for j in range(n):
+            plt.text(j, i, f"{M[i,j]:.0f}", ha='center', va='center')
+    os.makedirs(os.path.dirname(out_png), exist_ok=True)
+    plt.savefig(out_png, bbox_inches='tight', dpi=160)
+    plt.close(fig)
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument('--env', default='4x4', choices=['4x4','6x6'])
+    ap.add_argument('--gamma', type=float, default=1.0)
+    ap.add_argument('--theta', type=float, default=1e-6)
+    ap.add_argument('--outdir', default='artifacts/ch4_4x4')
+    args = ap.parse_args()
+
+    n = 4 if args.env == '4x4' else 6
+    states, actions, P, R, meta = make_gridworld(n=n)
+    # Policy Iteration
+    pi_pi, V_pi = policy_iteration(states, actions, P, R, gamma=args.gamma, theta=args.theta)
+    # Value Iteration
+    pi_vi, V_vi = value_iteration(states, actions, P, R, gamma=args.gamma, theta=args.theta)
+
+    os.makedirs(args.outdir, exist_ok=True)
+
+    # Save values (final)
+    save_grid_csv(V_pi, n, os.path.join(args.outdir, f'pi_values_{args.env}.csv'))
+    save_grid_csv(V_vi, n, os.path.join(args.outdir, f'vi_values_{args.env}.csv'))
+    plot_values(V_pi, n, os.path.join(args.outdir, f'pi_values_{args.env}.png'), 'Policy Iteration Values')
+    plot_values(V_vi, n, os.path.join(args.outdir, f'vi_values_{args.env}.png'), 'Value Iteration Values')
+
+    # Save policies
+    save_policy_csv(pi_pi, n, os.path.join(args.outdir, f'pi_policy_{args.env}.csv'))
+    save_policy_csv(pi_vi, n, os.path.join(args.outdir, f'vi_policy_{args.env}.csv'))
+
+    print('Artifacts written to:', args.outdir)
+
+if __name__ == '__main__':
+    main()
diff --git a/ch4_dp/requirements.txt b/ch4_dp/requirements.txt
@@ -0,0 +1,3 @@
+numpy
+pandas
+matplotlib
diff --git a/ch4_dp/src/rldp/__init__.py b/ch4_dp/src/rldp/__init__.py
@@ -0,0 +1 @@
+__all__ = ["dp", "gridworld", "latex"]
diff --git a/ch4_dp/src/rldp/dp.py b/ch4_dp/src/rldp/dp.py
@@ -0,0 +1,74 @@
+from __future__ import annotations
+import numpy as np
+
+def policy_evaluation(states, actions, P, R, pi, gamma: float = 1.0, theta: float = 1e-6):
+    """Iterative policy evaluation.
+    states: list-like of states (indices 0..S-1)
+    actions: list-like of actions (indices 0..A-1)
+    P: shape [S, A, S] transition probabilities
+    R: shape [S, A, S] expected rewards
+    pi: shape [S, A] policy (row-stochastic)
+    """
+    S = len(states)
+    V = np.zeros(S, dtype=float)
+    while True:
+        delta = 0.0
+        for s in range(S):
+            v_old = V[s]
+            V[s] = sum(
+                pi[s, a] * sum(P[s, a, s2] * (R[s, a, s2] + gamma * V[s2]) for s2 in range(S))
+                for a in range(len(actions))
+            )
+            delta = max(delta, abs(v_old - V[s]))
+        if delta < theta:
+            break
+    return V
+
+def policy_iteration(states, actions, P, R, gamma: float = 1.0, theta: float = 1e-6):
+    """Howard's policy iteration."""
+    S, A = len(states), len(actions)
+    pi = np.ones((S, A)) / A
+    V = np.zeros(S, dtype=float)
+    stable = False
+
+    while not stable:
+        V = policy_evaluation(states, actions, P, R, pi, gamma, theta)
+        stable = True
+        for s in range(S):
+            old_action = np.argmax(pi[s])
+            q_values = [
+                sum(P[s, a, s2] * (R[s, a, s2] + gamma * V[s2]) for s2 in range(S))
+                for a in range(A)
+            ]
+            best = int(np.argmax(q_values))
+            pi[s] = np.eye(A)[best]
+            if best != old_action:
+                stable = False
+    return pi, V
+
+def value_iteration(states, actions, P, R, gamma: float = 1.0, theta: float = 1e-6):
+    """Bellman optimality updates until convergence."""
+    S, A = len(states), len(actions)
+    V = np.zeros(S, dtype=float)
+    while True:
+        delta = 0.0
+        for s in range(S):
+            v_old = V[s]
+            q_values = [
+                sum(P[s, a, s2] * (R[s, a, s2] + gamma * V[s2]) for s2 in range(S))
+                for a in range(A)
+            ]
+            V[s] = max(q_values)
+            delta = max(delta, abs(v_old - V[s]))
+        if delta < theta:
+            break
+    # Derive greedy policy
+    pi = np.zeros((S, A))
+    for s in range(S):
+        q_values = [
+            sum(P[s, a, s2] * (R[s, a, s2] + gamma * V[s2]) for s2 in range(S))
+            for a in range(A)
+        ]
+        best = int(np.argmax(q_values))
+        pi[s] = np.eye(A)[best]
+    return pi, V
diff --git a/ch4_dp/src/rldp/gridworld.py b/ch4_dp/src/rldp/gridworld.py
@@ -0,0 +1,45 @@
+from __future__ import annotations
+import numpy as np
+
+ACTIONS = ['U','R','D','L']  # up, right, down, left
+A_DELTA = {'U':(-1,0), 'R':(0,1), 'D':(1,0), 'L':(0,-1)}
+
+def make_gridworld(n: int = 4, step_reward: float = -1.0, terminal: tuple[int,int] | None = None):
+    """Deterministic gridworld (n×n). Terminal default is (0, n-1)."""
+    if terminal is None:
+        terminal = (0, n-1)
+    S = n*n
+    A = len(ACTIONS)
+    P = np.zeros((S, A, S), dtype=float)
+    R = np.full((S, A, S), 0.0, dtype=float)
+
+    def idx(i,j): return i*n + j
+    term_idx = idx(*terminal)
+
+    for i in range(n):
+        for j in range(n):
+            s = idx(i,j)
+            for a_id, a in enumerate(ACTIONS):
+                if s == term_idx:
+                    P[s, a_id, s] = 1.0
+                    R[s, a_id, s] = 0.0
+                    continue
+                di, dj = A_DELTA[a]
+                ni, nj = i+di, j+dj
+                if ni < 0 or ni >= n or nj < 0 or nj >= n:
+                    ns = s  # bump into wall
+                else:
+                    ns = idx(ni, nj)
+                P[s, a_id, ns] = 1.0
+                R[s, a_id, ns] = step_reward if ns != term_idx else 0.0
+    states = list(range(S))
+    actions = list(range(A))
+    return states, actions, P, R, (n, terminal, term_idx)
+
+def unravel_index(s: int, n: int):
+    return (s // n, s % n)
+
+def arrows_from_policy(pi):
+    """Convert one-hot deterministic policy (S×A) to symbol grid of U/R/D/L."""
+    idx = np.argmax(pi, axis=1)
+    return np.array([['U','R','D','L'][k] for k in idx])
diff --git a/ch4_dp/src/rldp/latex.py b/ch4_dp/src/rldp/latex.py

-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +.venv/
 +__pycache__/
 +*.pyc
 +artifacts/latex/*.tex