diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index baa769a3..2a8c0b48 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -342,3 +342,215 @@ jobs:
         run: |
           docker run --rm -v ${{ github.workspace }}:/workspace frost-dev:latest \
             pre-commit run --all-files
+
+  # ---------------------------------------------------------------------------
+  # FROST no-MMU M-mode Linux: build the image from source, then boot it two
+  # ways off the SAME artifact -- on the FROST RTL in cocotb (the gremlin
+  # regression) and under QEMU (a fast full boot-to-shell reference).
+  # ---------------------------------------------------------------------------
+
+  # Build the kernel + busybox initramfs + FROST memory images from the vendored
+  # Buildroot submodule (linux/buildroot @ pinned SHA) driven by the FROST
+  # BR2_EXTERNAL tree. Runs inside the frost-dev image, which ships Buildroot's
+  # host deps + QEMU, so the host deps are single-sourced in the Dockerfile. The
+  # first build compiles a full rv32 uClibc cross toolchain from source; the dl/
+  # and ccache caches make later runs much faster.
+  build-frost-linux:
+    name: Build FROST Linux Image (Buildroot)
+    runs-on: ubuntu-24.04
+    needs: build-docker
+    timeout-minutes: 120
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          # Recursive so the pinned linux/buildroot submodule is fetched.
+          submodules: recursive
+
+      - name: Download Docker image
+        uses: actions/download-artifact@v4
+        with:
+          name: frost-docker-image
+          path: /tmp
+
+      - name: Load Docker image
+        run: |
+          docker load --input /tmp/frost-dev.tar
+          # The in-workspace Buildroot build needs ~10 GB; free the image tar.
+          rm -f /tmp/frost-dev.tar
+
+      - name: Cache Buildroot downloads (source tarballs)
+        uses: actions/cache@v4
+        with:
+          path: linux/dl
+          # dl/ only changes when a package version (kernel/toolchain) changes.
+          key: br2-dl-${{ hashFiles('linux/buildroot-external/configs/frost_nommu_rv32_defconfig', 'linux/buildroot-external/board/frost/**') }}
+          restore-keys: |
+            br2-dl-
+
+      - name: Cache Buildroot ccache (toolchain + kernel object cache)
+        uses: actions/cache@v4
+        with:
+          path: linux/ccache
+          key: br2-ccache-${{ github.sha }}
+          restore-keys: |
+            br2-ccache-
+
+      - name: Build kernel + initramfs + FROST memory images
+        run: |
+          docker run --rm \
+            -e BR2_DL_DIR=/workspace/linux/dl \
+            -e BR2_CCACHE_DIR=/workspace/linux/ccache \
+            -v ${{ github.workspace }}:/workspace frost-dev:latest \
+            bash -c '
+              set -euo pipefail
+              make -C linux/buildroot O=/workspace/linux/build \
+                BR2_EXTERNAL=/workspace/linux/buildroot-external \
+                frost_nommu_rv32_defconfig
+              # Enable ccache for CI only (kept out of the committed defconfig).
+              echo "BR2_CCACHE=y" >> /workspace/linux/build/.config
+              make -C linux/buildroot O=/workspace/linux/build olddefconfig
+              make -C linux/buildroot O=/workspace/linux/build
+              # Stage the memory images where the cocotb linux_boot test resolves
+              # them (sw/apps/linux_boot/{sw,sw_ddr}.mem).
+              mkdir -p sw/apps/linux_boot
+              cp linux/build/images/sw.mem     sw/apps/linux_boot/sw.mem
+              cp linux/build/images/sw_ddr.mem sw/apps/linux_boot/sw_ddr.mem
+              ls -l linux/build/images/
+            '
+
+      - name: Upload FROST Linux boot images
+        uses: actions/upload-artifact@v4
+        with:
+          name: frost-linux-boot-images
+          path: |
+            linux/build/images/Image
+            linux/build/images/rootfs.cpio.gz
+            linux/build/images/frost-nommu-fpga.dtb
+            linux/build/images/sw.mem
+            linux/build/images/sw_ddr.mem
+          retention-days: 7
+          if-no-files-found: error
+
+  # Boot the freshly built image on the FROST RTL in cocotb, bounded to ~22M
+  # cycles in the genesys2-faithful cache shape (128 KiB L1I, no L2). This is the
+  # "gremlin" regression: that window is silent mem_init after devtmpfs (no deep
+  # console marker), so the run captures the full boot (FROST_LINUX_RUN_FULL) and
+  # check_linux_boot_regression.py then asserts boot health -- banner + devtmpfs,
+  # no panic, the periodic CLINT timer tick serviced (mtimecmp re-armed, the
+  # thing the gremlin hung), and forward progress (retire) all the way to the
+  # cap. CACHED_HAS_L2=0 must be an env/make var (the test's own -GCACHED_HAS_L2=0
+  # is otherwise overridden by the tests/Makefile default).
+  linux-boot-cocotb:
+    name: Cocotb Linux Boot (22M, genesys2 shape)
+    runs-on: ubuntu-24.04
+    needs: [build-docker, build-frost-linux]
+    # Verilator compile of the core plus the bounded 22M-cycle sim; GitHub
+    # runners sim this at roughly 4-8k cycles/s, so leave generous headroom.
+    timeout-minutes: 240
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Download Docker image
+        uses: actions/download-artifact@v4
+        with:
+          name: frost-docker-image
+          path: /tmp
+
+      - name: Load Docker image
+        run: |
+          docker load --input /tmp/frost-dev.tar
+          # Long Verilator compile + 22M-cycle sim in this job; free the tar.
+          rm -f /tmp/frost-dev.tar
+
+      - name: Download FROST Linux boot images
+        uses: actions/download-artifact@v4
+        with:
+          name: frost-linux-boot-images
+          path: /tmp/frost-linux-images
+
+      - name: Stage boot images for the cocotb linux_boot test
+        run: |
+          mkdir -p sw/apps/linux_boot
+          cp /tmp/frost-linux-images/sw.mem     sw/apps/linux_boot/sw.mem
+          cp /tmp/frost-linux-images/sw_ddr.mem sw/apps/linux_boot/sw_ddr.mem
+
+      - name: Run cocotb linux_boot (22M-cycle gremlin regression)
+        run: |
+          # FROST_LINUX_PREBUILT=1: the cocotb runner cleans + re-makes the app
+          # before simulating; this tells the linux_boot Makefile the staged
+          # images are authoritative (no in-job Buildroot rebuild, which would
+          # take ~1h uncached and overwrite the artifact under test).
+          docker run --rm \
+            -e CACHED_HAS_L2=0 \
+            -e FROST_LINUX_RUN_FULL=1 \
+            -e FROST_LINUX_PREBUILT=1 \
+            -e COCOTB_NUM_RUNS=1 \
+            -e COCOTB_LINUX_MAX_CYCLES=22000000 \
+            -e COCOTB_PROGRESS_INTERVAL=500000 \
+            -v ${{ github.workspace }}:/workspace frost-dev:latest \
+            bash -c '
+              set -o pipefail
+              cd tests && make clean
+              # RUN_FULL capture always trips the cocotb never-match assertion by
+              # design; the boot-health verdict comes from the checker below.
+              ./test_run_cocotb.py linux_boot_128k 2>&1 \
+                | tee /workspace/linux_boot_cocotb.log || true
+              python3 check_linux_boot_regression.py /workspace/linux_boot_cocotb.log
+            '
+
+      - name: Upload cocotb boot log
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: linux-boot-cocotb-log
+          path: linux_boot_cocotb.log
+          if-no-files-found: ignore
+
+  # Boot the SAME kernel + rootfs to a login prompt under QEMU: a fast (seconds)
+  # full-userspace reference that the RTL boot is bounded well short of. Confirms
+  # the built image itself reaches a shell, independent of the FROST core.
+  linux-boot-qemu:
+    name: QEMU Linux Boot to Shell
+    runs-on: ubuntu-24.04
+    needs: [build-docker, build-frost-linux]
+    timeout-minutes: 20
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Download Docker image
+        uses: actions/download-artifact@v4
+        with:
+          name: frost-docker-image
+          path: /tmp
+
+      - name: Load Docker image
+        run: docker load --input /tmp/frost-dev.tar
+
+      - name: Download FROST Linux boot images
+        uses: actions/download-artifact@v4
+        with:
+          name: frost-linux-boot-images
+          path: /tmp/frost-linux-images
+
+      - name: Boot to a shell under QEMU (assert login prompt)
+        run: |
+          docker run --rm -v /tmp/frost-linux-images:/img frost-dev:latest \
+            bash -c '
+              set -euo pipefail
+              # QEMU sits at the login prompt forever, so time-box it and assert
+              # the marker from the captured log (file redirect, never a pipe --
+              # piping QEMUs stdout to grep deadlocks).
+              timeout -k5 120 qemu-system-riscv32 -M virt -bios none \
+                -kernel /img/Image \
+                -append "earlycon=sbi console=ttyS0 rdinit=/sbin/init" \
+                -initrd /img/rootfs.cpio.gz -nographic -cpu rv32,mmu=off \
+                </dev/null >/tmp/qemu-boot.log 2>&1 || true
+              echo "===== QEMU boot log (tail) ====="
+              tail -n 20 /tmp/qemu-boot.log
+              grep -q "buildroot login:" /tmp/qemu-boot.log
+            '
diff --git a/.gitignore b/.gitignore
index 9c763545..e73378ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,6 +65,13 @@ sw_ddr.bin
 sw_imem_*.mem
 sw/apps/*/sw.S
 
+# FROST no-MMU Linux: out-of-tree Buildroot build output + download/ccache
+# caches. Regenerated from the linux/buildroot submodule + linux/buildroot-external
+# tree by `make -C sw/apps/linux_boot` (and the CI build-frost-linux job).
+/linux/build/
+/linux/dl/
+/linux/ccache/
+
 # mypy
 .mypy_cache/
 
diff --git a/.gitmodules b/.gitmodules
index 2f663032..e2e8282c 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -15,3 +15,6 @@
 [submodule "sw/apps/coremark_pro/coremark-pro"]
 	path = sw/apps/coremark_pro/coremark-pro
 	url = https://github.com/eembc/coremark-pro.git
+[submodule "linux/buildroot"]
+	path = linux/buildroot
+	url = https://github.com/buildroot/buildroot.git
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index d93c99da..a6f71a84 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -20,7 +20,7 @@ This document provides guidelines for contributors. The detailed style sections
 
 ## Project Overview
 
-FROST is an out-of-order RISC-V processor implementing **RV32GCB** (G = IMAFD) with a Tomasulo back-end and full machine-mode privilege support. Understanding the architecture helps you contribute effectively:
+FROST is an out-of-order RISC-V processor implementing **RV32GCB** (G = IMAFD) with a Tomasulo back-end and Machine + User (M/U) privilege modes. Understanding the architecture helps you contribute effectively:
 
 ### Architecture Outline
 
@@ -446,8 +446,11 @@ The project uses pytest markers to categorize tests:
 
 Run the full CPU test suite:
 ```bash
-# Full random instruction test (16,000+ instructions)
-pytest tests/test_run_cocotb.py::TestCPU -s
+# Full cocotb test suite
+pytest tests/test_run_cocotb.py -s
+
+# Directed trap/exception tests
+./tests/test_run_cocotb.py directed_traps
 
 # ISA compliance tests
 pytest "tests/test_run_cocotb.py::TestRealPrograms::test_real_program[isa_test]" -s
@@ -577,7 +580,7 @@ We welcome contributions in these areas:
 |------|----------|
 | Bug fixes | OOO ordering, instruction encoding, timing issues |
 | ISA extensions | Additional standard or custom extensions |
-| Privilege modes | S-mode (supervisor), U-mode (user) support |
+| Privilege modes | S-mode (supervisor), PMP, virtual memory (M and U modes already supported) |
 | Board support | New FPGA boards, SoC integrations |
 | Performance | Branch predictor, scheduler, memory-system, or cache improvements |
 | Peripherals | SPI, I2C, GPIO, timers |
diff --git a/Dockerfile b/Dockerfile
index 410b3721..026380c7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -151,6 +151,33 @@ ENV RISCV_PREFIX=riscv-none-elf-
 # Fix git "dubious ownership" error when mounting repo as volume
 RUN git config --global --add safe.directory /workspace
 
+# Buildroot host dependencies + QEMU. Used by the FROST no-MMU Linux CI jobs:
+#   * build-frost-linux  - builds the kernel + initramfs + FROST memory images
+#     from the linux/buildroot-external tree (Buildroot compiles its own rv32
+#     uClibc cross toolchain from source, so it needs a full host build env).
+#   * qemu-linux-boot     - boots the same Image + rootfs to a shell under
+#     qemu-system-riscv32 (qemu-system-misc provides the riscv32 target).
+# `load_software.py <board> linux_boot` self-builds via the same path, so these
+# are the single source of truth for the Linux build's host deps. Kept as a late
+# layer so the expensive Verilator/Yosys/SMT source builds above stay cached.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    patch \
+    cpio \
+    rsync \
+    bc \
+    file \
+    unzip \
+    wget \
+    bzip2 \
+    ccache \
+    libssl-dev \
+    libelf-dev \
+    libncurses-dev \
+    device-tree-compiler \
+    qemu-system-misc \
+    && rm -rf /var/lib/apt/lists/*
+
 # Install Python dependencies (cocotb, pytest, pre-commit, etc.)
 RUN pip install --no-cache-dir --break-system-packages \
     "cocotb==2.0.1" \
diff --git a/README.md b/README.md
index 553a607e..7e194353 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 **F**PGA **R**ISC-V **O**pen-sourced in **S**ystemVerilog by **T**woSigma
 
-An out-of-order RISC-V processor implementing **RV32GCB** (G = IMAFD) with a Tomasulo back-end and full machine-mode privilege support for RTOS operation. Achieves 300 MHz on UltraScale+. Designed for FPGA deployment with clean, portable SystemVerilog.
+An out-of-order RISC-V processor implementing **RV32GCB** (G = IMAFD) with a Tomasulo back-end and Machine + User (M/U) privilege modes for RTOS operation. Achieves 300 MHz on UltraScale+. Designed for FPGA deployment with clean, portable SystemVerilog.
 
 ## Why FROST?
 
@@ -13,6 +13,7 @@ There are many RISC-V cores. Here's what makes FROST different:
 - **Solid performance** — 3.08 CoreMark/MHz (924 CoreMark at 300 MHz on UltraScale+) from a Tomasulo out-of-order back-end with 2-wide dispatch/rename, 2-wide commit, branch prediction (BTB + bimodal direction predictor + RAS), an L0 cache, and a fast two-cycle conditional-branch misprediction recovery path.
 - **Layered verification** — constrained-random tests, directed tests, real C programs, the official [riscv-arch-test](https://github.com/riscv-non-isa/riscv-arch-test) compliance suite, [riscv-tests](https://github.com/riscv-software-src/riscv-tests) ISA tests, and random instruction torture tests all run in Cocotb simulation, along with formal verification.
 - **Real workloads included** — all nine official EEMBC CoreMark-PRO workloads (on both supported boards, backed by the DDR cache hierarchy), FreeRTOS demo, CoreMark benchmark, ISA compliance suite, and 400+ architecture compliance tests all run in simulation and on hardware.
+- **Boots no-MMU Linux** — an in-tree Buildroot flow (`linux/`) builds a no-MMU M-mode Linux image; CI builds it from source (`build-frost-linux`) and boots it in both cocotb RTL simulation (`linux-boot-cocotb`) and QEMU (`linux-boot-qemu`).
 - **Portable core RTL** — the CPU core avoids vendor primitives and is checked with generic Yosys coarse synthesis. Full open-source Yosys synthesis is also tested for Xilinx 7-series, UltraScale, and UltraScale+ targets; board wrappers are provided for Kintex-7 and UltraScale+.
 - **Apache 2.0 licensed** — permissive license suitable for commercial and academic use.
 
@@ -55,8 +56,8 @@ There are many RISC-V cores. Here's what makes FROST different:
 │                                                                              │
 │   ┌──────────────────────────┐    ┌─────────────────────────────────────┐    │
 │   │ Trap Unit                │    │ Peripherals                         │    │
-│   │ (M-mode, mret, wfi,      │    │ UART, mtime/mtimecmp, FIFO0/1       │    │
-│   │  interrupts, exceptions) │    │                                     │    │
+│   │ (M/U traps, mret, wfi,   │    │ UART (+ ns16550a face), FIFO0/1     │    │
+│   │  interrupts, exceptions) │    │ CLINT timer (mtime/mtimecmp, msip)  │    │
 │   └──────────────────────────┘    └─────────────────────────────────────┘    │
 │                                                                              │
 └──────────────────────────────────────────────────────────────────────────────┘
@@ -82,6 +83,7 @@ There are many RISC-V cores. Here's what makes FROST different:
 | **Zbkb**         | Bit manipulation for crypto                    |
 | **Zihintpause**  | Pause hint for spin-wait loops                 |
 | **Machine Mode** | M-mode privilege (mret, wfi, ecall, ebreak)    |
+| **User Mode**    | U-mode privilege (ecall traps to M-mode)       |
 
 ### Architecture Highlights
 
@@ -95,11 +97,11 @@ There are many RISC-V cores. Here's what makes FROST different:
 - **Conservative memory disambiguation** — loads gated until older store addresses known, with store-to-load forwarding from the SQ
 - **Two-tier branch recovery** — conditional-branch mispredictions use a fast ~2-cycle path (front-end redirect + RAT restore in the same cycle); JALR and exceptions take the slower commit-time path
 - **Branch prediction** with a 256-entry 2-bit BTB (trained for conditional branches and JAL, with slot-2 lookup support), 1024-entry bimodal direction predictor, 8-entry return address stack, and PD-stage computed-target redirects for conditional BTB misses predicted taken
-- **L0 cache** in front of the load queue reduces load-use latency (direct-mapped, write-through)
-- **M-mode trap handling** for RTOS support (interrupts and exceptions)
+- **L0 cache** in front of the load queue reduces load-use latency (direct-mapped, read-fill; stores invalidate matching lines)
+- **Machine + User (M/U) privilege modes** for RTOS support — traps from both modes are taken in M-mode (interrupts and exceptions)
 - **CLINT-compatible timer** (mtime/mtimecmp) for preemptive scheduling
 - **Harvard architecture** with separate instruction and data memory ports
-- **Write-back cache hierarchy over DDR** — a 1 GiB cached region at `0x8000_0000` served by recursive line-port caches (`frost_cache`: direct-mapped, 32 B lines, write-back/write-allocate). Both instruction fetch (a 16 KiB read-only L1I) and data (a 128 KiB L1D) run through it on every board — so code can execute from DDR, not just from low BRAM — sharing a 2:1 line-port arbiter (data-side priority), plus a 2 MiB UltraRAM L2 spliced in on UltraScale+, over the board's DDR (DDR3 on Genesys2, DDR4 on X3) through a single-beat AXI bridge
+- **Write-back cache hierarchy over DDR** — a 1 GiB cached region at `0x8000_0000` served by recursive line-port caches (`frost_cache`: direct-mapped, 32 B lines, write-back/write-allocate). Both instruction fetch (a read-only L1I — 16 KiB on X3, 128 KiB on Genesys2) and data (a 128 KiB L1D) run through it on every board — so code can execute from DDR, not just from low BRAM — sharing a 2:1 line-port arbiter (data-side priority), plus a 2 MiB UltraRAM L2 spliced in on UltraScale+, over the board's DDR (DDR3 on Genesys2, DDR4 on X3) through a single-beat AXI bridge
 - **One memory map everywhere** — software sees the same layout on every board and in simulation: a 256 KiB fast, uncached BRAM region (code/data/stack, 1-cycle) plus the 1 GiB cached region (execute-from-DDR code, heap, and large data); the hierarchy shape behind it is opaque to software
 - **Portable core RTL** — written in generic SystemVerilog with no vendor-specific primitives in the CPU core; CI checks vendor-agnostic elaboration and coarse synthesis, while full FPGA builds are currently Xilinx-focused
 
@@ -172,10 +174,11 @@ You should see "Hello, world!" in the output.
 ### Run the CPU Verification Suite
 
 ```bash
-make -C tests        # constrained-random regression on the cpu_tb testbench
+pytest tests/                              # full regression (riscv-tests, arch compliance, C programs, …)
+./tests/test_run_cocotb.py directed_traps  # directed M-mode trap/interrupt tests (cpu_tb)
 ```
 
-This runs constrained-random instructions through the CPU, verifying each against a software reference model. (The random regression runs on the `cpu_tb` testbench — the `tests/` Makefile default — rather than as a `test_run_cocotb.py` target.)
+The pytest run validates the CPU against the riscv-tests ISA suites, the riscv-arch-test compliance suite, and real C programs. The legacy constrained-random `cpu_tb` regression is registered as the CLI-only `cpu_random` target; it predates the OOO integration and needs porting before it passes on the current core.
 
 ## Directory Structure
 
@@ -203,6 +206,7 @@ frost/
 │       ├── coremark_pro/     # EEMBC CoreMark-PRO suite (DDR-backed heap)
 │       ├── freertos_demo/    # FreeRTOS RTOS demo
 │       └── ...               # Other applications
+├── linux/                    # Buildroot no-MMU Linux image build (submodule + external tree)
 ├── verif/                    # Verification infrastructure
 │   ├── cocotb_tests/         # Cocotb test cases
 │   ├── models/               # Software reference models
@@ -244,7 +248,7 @@ git submodule update --init
 pytest tests/                              # Run all tests
 pytest tests/ -s                           # With live output
 # Standalone test runner
-make -C tests                              # CPU constrained-random verification (cpu_tb)
+./tests/test_run_cocotb.py directed_traps  # Directed trap/interrupt tests (cpu_tb)
 ./tests/test_run_cocotb.py hello_world     # Hello World program
 ./tests/test_run_cocotb.py isa_test        # ISA compliance
 ./tests/test_run_cocotb.py coremark        # CoreMark benchmark
@@ -256,7 +260,8 @@ make -C tests                              # CPU constrained-random verification
 ./tests/test_run_cocotb.py frost_cache     # Cache-hierarchy unit bench (X3 shape)
 ./tests/test_run_cocotb.py freertos_demo   # FreeRTOS demo
 
-# With waveform output
+# With waveform output (cpu_tb Makefile flow; note the default constrained-random
+# suite is the CLI-only `cpu_random` target and needs porting to the OOO core)
 WAVES=1 make -C tests
 ```
 
@@ -275,8 +280,7 @@ WAVES=1 make -C tests
 
 Running `pytest tests/` exercises:
 
-- **CPU verification** — constrained-random instruction sequences validated against Python reference models
-- **Directed tests** — atomic operations (LR/SC), trap handling, compressed instructions
+- **Directed tests** — M-mode trap/interrupt handling (`directed_traps` on the cpu_tb harness); LR/SC and compressed-instruction coverage is carried by the rv32ua/rv32uc riscv-tests, the arch-compliance suite, and the ddr_atomic_test/c_ext_test programs (the remaining cpu_tb directed suites and the constrained-random regression are CLI-only pending a port to the OOO core)
 - **Architecture compliance** — 400+ tests from the official [riscv-arch-test](https://github.com/riscv-non-isa/riscv-arch-test) suite across I, M, A, F, D, C, B, K, Zicond, and Zifencei extensions, with signature comparison against Spike golden references (Verilator only, parallelized by extension in CI)
 - **ISA pipeline tests** — 126 self-checking tests from [riscv-tests](https://github.com/riscv-software-src/riscv-tests) across rv32ui, rv32um, rv32ua, rv32uf, rv32ud, rv32uc, rv32mi, and B-extension suites, exercising rename, wakeup, CDB arbitration, and OOO commit (Verilator only)
 - **Random instruction torture tests** — 20 randomly generated RV32IMAFDC instruction sequences (ALU, multiply/divide, memory, branch, FP, AMO) verified against Spike golden register signatures (Verilator only)
@@ -317,7 +321,7 @@ Use a serial terminal configured for 115200 baud, 8 data bits, no parity, and
 | Board              | FPGA                 | CPU Clock | Cache hierarchy → main memory               |
 |--------------------|----------------------|-----------|---------------------------------------------|
 | Alveo X3522PV      | UltraScale+ (xcux35) | 300 MHz   | 128 KiB L1D + 16 KiB L1I → 2 MiB URAM L2 → 1 GiB DDR4 |
-| Digilent Genesys2  | Kintex-7 (xc7k325t)  | 133 MHz   | 128 KiB L1D + 16 KiB L1I → 1 GiB DDR3                 |
+| Digilent Genesys2  | Kintex-7 (xc7k325t)  | 133 MHz   | 128 KiB L1D + 128 KiB L1I → 1 GiB DDR3                |
 
 Both boards also carry the 256 KiB fast (uncached, 1-cycle) low BRAM region and
 present the identical software-visible memory map: `[0, 256 KiB)` fast BRAM,
@@ -333,15 +337,15 @@ controller calibrates, so software never observes an uninitialized main memory.
 
 | Resource | Used | Available | Util% |
 |----------|-----:|----------:|------:|
-| CLB LUTs | 148,337 | 1,029,600 | 14.4% |
-|   LUT as Logic | 138,133 | 1,029,600 | 13.4% |
-|   LUT as Distributed RAM | 9,034 | — | — |
-|   LUT as Shift Register | 1,170 | — | — |
-| CLB Registers | 113,144 | 2,059,200 | 5.5% |
+| CLB LUTs | 149,121 | 1,029,600 | 14.5% |
+|   LUT as Logic | 138,878 | 1,029,600 | 13.5% |
+|   LUT as Distributed RAM | 9,074 | — | — |
+|   LUT as Shift Register | 1,169 | — | — |
+| CLB Registers | 113,334 | 2,059,200 | 5.5% |
 | Block RAM Tile | 240 | 2,112 | 11.4% |
 | URAM | 64 | 352 | 18.2% |
 | DSPs | 35 | 1,320 | 2.6% |
-| CARRY8 | 4,415 | 128,700 | 3.4% |
+| CARRY8 | 4,436 | 128,700 | 3.5% |
 | F7 Muxes | 208 | 514,800 | 0.0% |
 | F8 Muxes | 49 | 257,400 | 0.0% |
 | Bonded IOB | 132 | 364 | 36.3% |
@@ -352,12 +356,12 @@ controller calibrates, so software never observes an uninitialized main memory.
 
 | Resource | Used | Available | Util% |
 |----------|-----:|----------:|------:|
-| Slice LUTs | 129,281 | 203,800 | 63.4% |
-|   LUT as Logic | 120,714 | 203,800 | 59.2% |
-|   LUT as Distributed RAM | 7,722 | — | — |
+| Slice LUTs | 130,622 | 203,800 | 64.1% |
+|   LUT as Logic | 122,015 | 203,800 | 59.9% |
+|   LUT as Distributed RAM | 7,762 | — | — |
 |   LUT as Shift Register | 845 | — | — |
-| Slice Registers | 86,734 | 407,600 | 21.3% |
-| Block RAM Tile | 189.5 | 445 | 42.6% |
+| Slice Registers | 87,375 | 407,600 | 21.4% |
+| Block RAM Tile | 219 | 445 | 49.2% |
 | DSPs | 36 | 840 | 4.3% |
 | F7 Muxes | 98 | 101,900 | 0.1% |
 | F8 Muxes | 33 | 50,950 | 0.1% |
@@ -403,7 +407,7 @@ queue, store queue, CDB arbiter, FU shims) has its own README under
 | **CDB**         | Common Data Bus (2-lane result broadcast)        |
 | **FU**          | Functional Unit (ALU, MUL/DIV, FPU, …)           |
 | **L0 Cache**    | Level-0 cache for load-use bypass                |
-| **L1I / L1D**   | Split write-back line caches (16 KiB instruction, 128 KiB data) over the cached DDR region, through a shared 2:1 line-port arbiter |
+| **L1I / L1D**   | Split write-back line caches (16 KiB instruction on X3 / 128 KiB on Genesys2, 128 KiB data) over the cached DDR region, through a shared 2:1 line-port arbiter |
 | **L2 Cache**    | 2 MiB UltraRAM line cache below the L1s (UltraScale+ only)        |
 | **Cached region** | `[0x8000_0000, +1 GiB)` — code (execute-from-DDR), heap, and large data, behind L1[/L2]→DDR |
 | **BTB**         | Branch Target Buffer (256-entry target predictor) |
diff --git a/__init__.py b/__init__.py
index 094cb92f..55a30449 100644
--- a/__init__.py
+++ b/__init__.py
@@ -15,7 +15,7 @@
 """FROST - RISC-V processor package.
 
 This package contains a complete RV32GCB (G = IMAFD) RISC-V processor
-implementation with full machine-mode support and additional extensions
+implementation with Machine (M) and User (U) privilege modes and additional extensions
 (Zicsr, Zicntr, Zifencei, Zicond, Zbkb, and Zihintpause), along with
 verification infrastructure, build tools, and software libraries.
 
diff --git a/boards/README.md b/boards/README.md
index b42b1abe..cfbc35e8 100644
--- a/boards/README.md
+++ b/boards/README.md
@@ -6,7 +6,7 @@ This directory contains board-specific wrappers that enable the FROST RISC-V pro
 
 | Board                  | FPGA                               | CPU Clock  | Cache hierarchy → main memory                         | Features                 |
 |------------------------|------------------------------------|------------|-------------------------------------------------------|--------------------------|
-| [Genesys2](genesys2/)  | Xilinx Kintex-7 (xc7k325t)         | 133.33 MHz | 128 KiB L1D + 16 KiB L1I → 1 GiB DDR3                 | Entry-level development  |
+| [Genesys2](genesys2/)  | Xilinx Kintex-7 (xc7k325t)         | 133.33 MHz | 128 KiB L1D + 128 KiB L1I → 1 GiB DDR3                | Entry-level development  |
 | [X3](x3/)              | Xilinx Alveo X3522PV (UltraScale+) | 300 MHz    | 128 KiB L1D + 16 KiB L1I → 2 MiB URAM L2 → 1 GiB DDR4 | High-performance target  |
 
 Both boards expose the identical software-visible memory map (256 KiB fast
diff --git a/boards/genesys2/genesys2_frost.sv b/boards/genesys2/genesys2_frost.sv
index e47c9218..0e519104 100644
--- a/boards/genesys2/genesys2_frost.sv
+++ b/boards/genesys2/genesys2_frost.sv
@@ -196,7 +196,10 @@ module genesys2_frost (
       // backed by the DDR3 controller through the AXI port below.
       .ENABLE_CACHED_TIER(1),
       .CACHED_HAS_L2(0),
-      .USE_BEHAVIORAL_DDR(0)
+      .USE_BEHAVIORAL_DDR(0),
+      // Bump L1I 16 KiB -> 128 KiB: hold the kernel tick/softirq/scheduler
+      // working set to defeat the periodic-tick catch-up livelock (no L2 here).
+      .L1I_CACHE_BYTES(128 * 1024)
   ) subsystem (
       .i_clk(main_clock),
       .i_clk_div4(divided_clock_by_4),
diff --git a/boards/xilinx_frost_subsystem.sv b/boards/xilinx_frost_subsystem.sv
index 756dc332..7e824f09 100644
--- a/boards/xilinx_frost_subsystem.sv
+++ b/boards/xilinx_frost_subsystem.sv
@@ -32,7 +32,13 @@ module xilinx_frost_subsystem #(
     // 1 = the cached tier ends in the simulation-only behavioral DDR model;
     // 0 = it ends at the o_ddr_axi_*/i_ddr_axi_* ports below, wired to the
     // board's DDR controller subsystem (both boards drive 0).
-    parameter int unsigned USE_BEHAVIORAL_DDR = 1
+    parameter int unsigned USE_BEHAVIORAL_DDR = 1,
+    // L1 instruction-cache size in bytes. genesys2 (L1-only, no L2) bumps this
+    // above the 16 KiB default so the kernel periodic-tick/softirq/scheduler
+    // working set stays resident, addressing the tick-livelock I$ thrash.
+    parameter int unsigned L1I_CACHE_BYTES = 16 * 1024,
+    // Optional boot-hang UART classifier. Leave off for interactive testing.
+    parameter int unsigned ENABLE_HANG_TRIAGE = 0
 ) (
     input logic i_clk,       // Main CPU clock
     input logic i_clk_div4,  // Divided clock for JTAG/UART (1/4 of main clock)
@@ -217,7 +223,9 @@ module xilinx_frost_subsystem #(
       .CLK_FREQ_HZ(CLK_FREQ_HZ),
       .ENABLE_CACHED_TIER(ENABLE_CACHED_TIER),
       .CACHED_HAS_L2(CACHED_HAS_L2),
-      .USE_BEHAVIORAL_DDR(USE_BEHAVIORAL_DDR)
+      .USE_BEHAVIORAL_DDR(USE_BEHAVIORAL_DDR),
+      .L1I_CACHE_BYTES(L1I_CACHE_BYTES),
+      .ENABLE_HANG_TRIAGE(ENABLE_HANG_TRIAGE)
   ) frost_processor (
       .i_clk(i_clk),
       .i_clk_div4(i_clk_div4),
diff --git a/formal/reorder_buffer.sby b/formal/reorder_buffer.sby
index 7f10d441..cd51a06a 100644
--- a/formal/reorder_buffer.sby
+++ b/formal/reorder_buffer.sby
@@ -21,6 +21,7 @@ cover: smtbmc boolector
 read -formal -sv riscv_pkg.sv
 read -formal -sv sdp_dist_ram.sv
 read -formal -sv mwp_dist_ram.sv
+read -formal -sv mwp_dist_ram_ohread.sv
 read -formal -sv rob_serializer.sv
 read -formal -sv reorder_buffer.sv
 prep -top reorder_buffer
@@ -29,5 +30,6 @@ prep -top reorder_buffer
 ../hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv
 ../hw/rtl/lib/ram/sdp_dist_ram.sv
 ../hw/rtl/lib/ram/mwp_dist_ram.sv
+../hw/rtl/lib/ram/mwp_dist_ram_ohread.sv
 ../hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/rob_serializer.sv
 ../hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.sv
diff --git a/formal/store_queue.sby b/formal/store_queue.sby
index af51e708..9fbca9c2 100644
--- a/formal/store_queue.sby
+++ b/formal/store_queue.sby
@@ -14,7 +14,7 @@ smtbmc boolector
 [script]
 read -formal -sv riscv_pkg.sv
 read -sv sdp_dist_ram.sv
-read -sv sq_forwarding_unit.sv
+read -formal -sv sq_forwarding_unit.sv
 read -formal -sv store_queue.sv
 prep -top store_queue
 
diff --git a/formal/tomasulo_wrapper.sby b/formal/tomasulo_wrapper.sby
index 8ccd2631..d1de30db 100644
--- a/formal/tomasulo_wrapper.sby
+++ b/formal/tomasulo_wrapper.sby
@@ -16,6 +16,7 @@ smtbmc boolector
 read -formal -sv riscv_pkg.sv
 read -formal -sv sdp_dist_ram.sv
 read -formal -sv mwp_dist_ram.sv
+read -formal -sv mwp_dist_ram_ohread.sv
 read -sv rob_serializer.sv
 read -sv reorder_buffer.sv
 read -sv register_alias_table.sv
@@ -72,6 +73,7 @@ prep -top tomasulo_wrapper
 ../hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv
 ../hw/rtl/lib/ram/sdp_dist_ram.sv
 ../hw/rtl/lib/ram/mwp_dist_ram.sv
+../hw/rtl/lib/ram/mwp_dist_ram_ohread.sv
 ../hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/rob_serializer.sv
 ../hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.sv
 ../hw/rtl/cpu_and_mem/cpu/tomasulo/register_alias_table/register_alias_table.sv
diff --git a/fpga/load_software/file_to_bram.tcl b/fpga/load_software/file_to_bram.tcl
index 20647787..7ff2727a 100644
--- a/fpga/load_software/file_to_bram.tcl
+++ b/fpga/load_software/file_to_bram.tcl
@@ -18,27 +18,62 @@
 # Reads hex file (one 32-bit word per line) and writes to BRAM through
 # JTAG-to-AXI bridge. Used for loading software without reprogramming FPGA.
 
-proc file2bram {base_memory_address firmware_filename {axi_interface_name hw_axi_1}} {
+proc _file2bram_rearm_image_load_reset {axi_interface_name base_memory_address rearm_word} {
+    set old_txn [get_hw_axi_txns -quiet bramrstkeep]
+    if {[llength $old_txn] > 0} {
+        delete_hw_axi_txn $old_txn
+    }
+    create_hw_axi_txn bramrstkeep [get_hw_axis $axi_interface_name] \
+        -type write -address [format 0x%08x $base_memory_address] -len 1 -data $rearm_word
+    run_hw_axi [get_hw_axi_txns bramrstkeep]
+    delete_hw_axi_txn [get_hw_axi_txns bramrstkeep]
+}
+
+proc file2bram {base_memory_address firmware_filename {axi_interface_name hw_axi_1} {batch_limit 64}} {
 
     # Open firmware file (text format: 8 hex digits per line)
     set file_descriptor [open $firmware_filename r]
     set current_address $base_memory_address
     set transaction_number 0
+    set batch_word_count 0
+    set total_words 0
+    set first_word ""
 
-    # Read file line by line - each line is one 32-bit word in hexadecimal
+    # Read file line by line - each line is one 32-bit word in hexadecimal.
+    # Run bounded batches so the hardware image-load reset one-shot cannot
+    # expire while Vivado is blocked inside one very large run_hw_axi call.
     while {[gets $file_descriptor word_hex_value] >= 0} {
+        set word_hex_value [string trim $word_hex_value]
+        if {$word_hex_value eq ""} {
+            continue
+        }
+        if {$first_word eq ""} {
+            set first_word $word_hex_value
+        }
+
         set formatted_address [format 0x%08x $current_address]
-        # Create AXI write transaction for this word
-        create_hw_axi_txn wr$transaction_number [get_hw_axis $axi_interface_name] \
+        create_hw_axi_txn bramwr$batch_word_count [get_hw_axis $axi_interface_name] \
             -type write -address $formatted_address -len 1 -data $word_hex_value
+        incr batch_word_count
         incr transaction_number
-        # Move to next word (4 bytes)
+        incr total_words
         incr current_address 4
+
+        if {$batch_word_count >= $batch_limit} {
+            run_hw_axi [get_hw_axi_txns bramwr*]
+            delete_hw_axi_txn [get_hw_axi_txns bramwr*]
+            set batch_word_count 0
+            if {$first_word ne ""} {
+                _file2bram_rearm_image_load_reset $axi_interface_name $base_memory_address $first_word
+            }
+        }
     }
     close $file_descriptor
 
-    # Execute all queued AXI transactions
-    run_hw_axi [get_hw_axi_txns]
+    if {$batch_word_count > 0} {
+        run_hw_axi [get_hw_axi_txns bramwr*]
+        delete_hw_axi_txn [get_hw_axi_txns bramwr*]
+    }
 
-    puts "Loaded $transaction_number words starting at [format 0x%08x $base_memory_address]"
+    puts "Loaded $total_words words starting at [format 0x%08x $base_memory_address] in bounded batches"
 }
diff --git a/fpga/load_software/file_to_ddr.tcl b/fpga/load_software/file_to_ddr.tcl
index 1a3ee52f..40c0f852 100644
--- a/fpga/load_software/file_to_ddr.tcl
+++ b/fpga/load_software/file_to_ddr.tcl
@@ -21,45 +21,90 @@
 # Addresses are REGION-RELATIVE: offset 0 = the base of the 1 GiB cached
 # region (0x8000_0000 in the CPU's address map). The CPU must be held in
 # reset while this runs (the image-load reset in xilinx_frost_subsystem
-# asserts on low-BRAM writes, which the loader always performs afterwards;
-# the caches re-invalidate on that reset, so the freshly written DDR contents
-# are never shadowed by stale lines).
+# asserts on low-BRAM writes; the caches re-invalidate on that reset, so the
+# freshly written DDR contents are never shadowed by stale lines).
+#
+# CRITICAL: the image_load_reset is a ~4 s one-shot counter re-armed by each
+# low-BRAM write. A multi-MB DDR image takes much longer than 4 s to burst in,
+# so a single pre-load BRAM write is NOT enough -- the counter expires
+# mid-load, the CPU comes out of reset, and free-runs against the half-written
+# DDR image (nondeterministic -> flaky boot hangs). When bram_axi_name is
+# given we re-arm the reset with a dummy low-BRAM write every poke_interval
+# bursts (sub-second << 4 s), holding the CPU in reset for the ENTIRE load.
+# The DDR loader (S01) is a separate AXI master and keeps running while the CPU
+# is held, so the load still completes.
+
+# Re-arm the image-load CPU reset with a single low-BRAM write (restarts the
+# subsystem's ~4 s reset counter). Called right before every blocking DDR batch
+# run so the counter can never expire mid-load and let the CPU free-run.
+proc _rearm_image_load_reset {bram_axi_name rearm_word} {
+    if {$bram_axi_name eq ""} return
+    create_hw_axi_txn rstkeep [get_hw_axis $bram_axi_name] \
+        -type write -address 0x00000000 -len 1 -data $rearm_word
+    run_hw_axi [get_hw_axi_txns rstkeep]
+    delete_hw_axi_txn [get_hw_axi_txns rstkeep]
+}
 
-proc file2ddr {firmware_filename {axi_interface_name hw_axi_2} {burst_words 256}} {
+proc file2ddr {firmware_filename {axi_interface_name hw_axi_2} {burst_words 256} {bram_axi_name ""} {rearm_word "00000000"}} {
 
     set file_descriptor [open $firmware_filename r]
-    set words [list]
-    while {[gets $file_descriptor word_hex_value] >= 0} {
-        set trimmed [string trim $word_hex_value]
-        if {$trimmed ne ""} {
-            lappend words $trimmed
-        }
-    }
-    close $file_descriptor
 
-    set total_words [llength $words]
+    # Stream the image in burst-sized chunks. Reading the whole file into one
+    # giant Tcl list and indexing it per word (lindex on a multi-MB list) is
+    # pathologically slow in the Vivado tcl interpreter -- THAT, not the JTAG,
+    # is what turned a ~6 MB Linux image into a ~17 min load (the actual
+    # create/run/delete of all ~8.8k bursts is only ~15 s). Reading burst_words
+    # lines at a time keeps every list tiny, so the data-prep is ~linear and
+    # negligible. run+delete in batches so the live hw_axi_txn set stays bounded.
+    set axi [get_hw_axis $axi_interface_name]
     set current_address 0
     set transaction_number 0
-    set index 0
+    set total_words 0
+    set batch 0
+    set batch_limit 128  ;# small batches so each blocking run_hw_axi stays well under the ~4 s reset counter
+
+    while {1} {
+        # Collect up to burst_words words for this burst (skipping blank lines,
+        # so non-blank word N still lands at DDR offset N -- matches the old
+        # read-all-then-index behaviour).
+        set chunk [list]
+        for {set i 0} {$i < $burst_words} {incr i} {
+            if {[gets $file_descriptor word_hex_value] < 0} { break }
+            set trimmed [string trim $word_hex_value]
+            if {$trimmed ne ""} { lappend chunk $trimmed }
+        }
+        set beats [llength $chunk]
+        if {$beats == 0} { break }
 
-    while {$index < $total_words} {
-        set beats [expr {min($burst_words, $total_words - $index)}]
         # hw_axi burst data is one bit-vector with beat 0 in the least
         # significant word: concatenate this burst's words last-to-first.
         set data ""
         for {set b [expr {$beats - 1}]} {$b >= 0} {incr b -1} {
-            append data [lindex $words [expr {$index + $b}]]
+            append data [lindex $chunk $b]
         }
-        set formatted_address [format 0x%08x $current_address]
-        create_hw_axi_txn ddrwr$transaction_number [get_hw_axis $axi_interface_name] \
-            -type write -address $formatted_address -len $beats -data $data
+        create_hw_axi_txn ddrwr$batch $axi \
+            -type write -address [format 0x%08x $current_address] -len $beats -data $data
+        incr batch
         incr transaction_number
-        incr index $beats
+        incr total_words $beats
         incr current_address [expr {4 * $beats}]
+        if {$batch >= $batch_limit} {
+            # Re-arm the reset IMMEDIATELY before the blocking batch run (the only
+            # loop step long enough to risk the ~4 s counter expiring mid-load).
+            _rearm_image_load_reset $bram_axi_name $rearm_word
+            run_hw_axi [get_hw_axi_txns ddrwr*]
+            delete_hw_axi_txn [get_hw_axi_txns ddrwr*]
+            set batch 0
+            puts "  DDR load progress: $total_words words"
+            flush stdout
+        }
     }
+    close $file_descriptor
 
-    if {$transaction_number > 0} {
+    if {$batch > 0} {
+        _rearm_image_load_reset $bram_axi_name $rearm_word
         run_hw_axi [get_hw_axi_txns ddrwr*]
+        delete_hw_axi_txn [get_hw_axi_txns ddrwr*]
     }
 
     puts "Loaded $total_words DDR words in $transaction_number burst transaction(s)"
diff --git a/fpga/load_software/load_software.py b/fpga/load_software/load_software.py
index b835cb07..88feda8c 100755
--- a/fpga/load_software/load_software.py
+++ b/fpga/load_software/load_software.py
@@ -18,6 +18,7 @@
 
 import argparse
 import os
+import shutil
 import subprocess
 import sys
 from pathlib import Path
@@ -47,6 +48,7 @@
     *COREMARK_PRO_APP_NAMES,
     "csr_test",
     "ddr_exec_test",
+    "ddr_atomic_test",
     "ddr_heap_test",
     "ddr_smc_test",
     "ddr_test",
@@ -55,8 +57,13 @@
     "fpu_test",
     "hello_world",
     "isa_test",
+    "linux_irq_active_ddr_test",
+    "linux_boot",
+    "linux_irq_ddr_test",
+    "linux_irq_stack_slot_test",
     "memory_test",
     "packet_parser",
+    "pde_return_hazard",
     "print_clock_speed",
     "ras_stress_test",
     "ras_test",
@@ -89,18 +96,68 @@
 # that address range reads back zero. Rejected below until then.
 DDR_APPS = frozenset(COREMARK_PRO_APP_NAMES) | {
     "ddr_exec_test",
+    "ddr_atomic_test",
     "ddr_heap_test",
     "ddr_smc_test",
     "ddr_test",
+    "linux_irq_active_ddr_test",
+    "linux_boot",
+    "linux_irq_ddr_test",
+    "linux_irq_stack_slot_test",
+    "pde_return_hazard",
 }
 
 
+def _linux_boot_preflight() -> None:
+    """Fail fast (with actionable guidance) before the long linux_boot self-build.
+
+    linux_boot is the only app that builds a whole Linux system from source via
+    the Buildroot submodule, so check its prerequisites up front rather than
+    dying deep inside a 30-60 min build (or after prompting for a hardware
+    target). Also warn on the first, from-scratch build so the runtime is not a
+    surprise.
+    """
+    buildroot_makefile = PROJECT_ROOT / "linux" / "buildroot" / "Makefile"
+    if not buildroot_makefile.exists():
+        print(
+            "Error: the Buildroot submodule (linux/buildroot) is not initialized.\n"
+            "  Run: git submodule update --init linux/buildroot",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    missing = [tool for tool in ("make", "dtc") if shutil.which(tool) is None]
+    if missing:
+        print(
+            "Error: missing host tools required to build the Linux image: "
+            f"{', '.join(missing)}.\n"
+            "  Install Buildroot's host dependencies (see "
+            "linux/buildroot-external/README.md) or run inside the\n"
+            "  frost-dev Docker image, which ships them.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    kimage = PROJECT_ROOT / "linux" / "build" / "images" / "Image"
+    if not kimage.exists():
+        print(
+            "Note: no cached kernel image found -- linux_boot will build the "
+            "kernel + rootfs from source now.\n"
+            "  The FIRST build compiles a full rv32 cross toolchain and can take "
+            "30-60 min; later loads reuse\n"
+            "  the cached build and only re-pack the DDR image for this board "
+            "(seconds).",
+            file=sys.stderr,
+        )
+
+
 def compile_app_for_board(
     app_name: str,
     app_dir: Path,
     clock_freq: int,
     coremark_iterations: int,
     make_vars: dict[str, str] | None = None,
+    mem_config: str | None = None,
 ) -> bool:
     """Compile the application with board-specific settings.
 
@@ -110,6 +167,7 @@ def compile_app_for_board(
         clock_freq: CPU clock frequency for this board
         coremark_iterations: Number of iterations for CoreMark
         make_vars: Extra make variable overrides
+        mem_config: If set, exported as MEM_CONFIG to relink the app (e.g. "ddr")
 
     Returns:
         True if compilation succeeded, False otherwise
@@ -123,6 +181,20 @@ def compile_app_for_board(
     env["FPGA_CPU_CLK_FREQ"] = str(clock_freq)
     if app_name == "coremark":
         env["ITERATIONS"] = str(coremark_iterations)
+    # MEM_CONFIG=ddr relinks the app's code into the cached DDR region (the app
+    # Makefiles default to bram); this lets an arbitrary app run from DDR like
+    # the dedicated ddr_* apps. The Makefile's `?=` honors this env override.
+    if mem_config:
+        env["MEM_CONFIG"] = mem_config
+
+    # linux_boot self-builds the kernel + rootfs from the Buildroot submodule on
+    # a clean checkout, which can take ~30-60 min the first time (a full cross
+    # toolchain build); every other app is a quick cross-compile. `make clean`
+    # for linux_boot only drops the board-dependent pack outputs (the cached
+    # kernel/rootfs survive), so the re-pack after clean is fast either way.
+    is_linux_boot = app_name == "linux_boot"
+    clean_timeout = 300 if is_linux_boot else 30
+    build_timeout = 5400 if is_linux_boot else 120
 
     try:
         # Clean first to force recompilation with new settings
@@ -132,7 +204,7 @@ def compile_app_for_board(
             env=env,
             capture_output=True,
             text=True,
-            timeout=30,
+            timeout=clean_timeout,
         )
 
         # Build with board-specific settings
@@ -147,7 +219,7 @@ def compile_app_for_board(
             env=env,
             capture_output=False,  # Show output
             text=True,
-            timeout=120,
+            timeout=build_timeout,
         )
 
         if result.returncode != 0:
@@ -204,6 +276,15 @@ def main() -> None:
         default="vivado",
         help="Path to Vivado executable (default: vivado from PATH)",
     )
+    parser.add_argument(
+        "--ddr",
+        action="store_true",
+        help=(
+            "Build the app to execute from the cached DDR region (passes "
+            "MEM_CONFIG=ddr to the app Makefile), so an otherwise BRAM-resident "
+            "app runs its code from DDR. Requires a board with has_ddr."
+        ),
+    )
     coremark_pro_mode = parser.add_mutually_exclusive_group()
     coremark_pro_mode.add_argument(
         "-v0",
@@ -349,6 +430,12 @@ def main() -> None:
             f"CoreMark-PRO hardware flow: {coremark_pro_error}."
         )
 
+    # linux_boot builds a full Linux system from source; check its build
+    # prerequisites (and warn about the first-build runtime) before we prompt for
+    # a hardware target or kick off a long compile.
+    if args.software_app == "linux_boot":
+        _linux_boot_preflight()
+
     # Select hardware target (may prompt user if multiple targets)
     # Auto-filters by vendor based on board (e.g., genesys2 -> Digilent, x3 -> Xilinx)
     selected_target = select_target(
@@ -427,7 +514,12 @@ def main() -> None:
         elif args.coremark_pro_mode == "validation":
             print("  CoreMark-PRO run type: validation (-v1)")
     if not compile_app_for_board(
-        args.software_app, app_dir, clock_freq, coremark_iterations, make_vars
+        args.software_app,
+        app_dir,
+        clock_freq,
+        coremark_iterations,
+        make_vars,
+        mem_config="ddr" if args.ddr else None,
     ):
         print(f"Error: Failed to compile {args.software_app}", file=sys.stderr)
         sys.exit(1)
diff --git a/fpga/load_software/load_software.tcl b/fpga/load_software/load_software.tcl
index 1d812293..edfbb336 100644
--- a/fpga/load_software/load_software.tcl
+++ b/fpga/load_software/load_software.tcl
@@ -39,10 +39,10 @@ set coremark_pro_apps [list coremark_pro_core coremark_pro_cjpeg \
 
 # Valid software applications (mirrors load_software.py VALID_APPS)
 set valid_apps [list branch_pred_test c_ext_test call_stress cf_ext_test coremark \
-                     {*}$coremark_pro_apps csr_test ddr_exec_test ddr_heap_test \
+                     {*}$coremark_pro_apps csr_test ddr_atomic_test ddr_exec_test ddr_heap_test \
                      ddr_smc_test ddr_test freertos_demo fpu_assembly_test fpu_test \
-                     hello_world isa_test memory_test \
-                     packet_parser print_clock_speed ras_stress_test ras_test \
+                     hello_world isa_test linux_irq_active_ddr_test linux_boot linux_irq_ddr_test linux_irq_stack_slot_test memory_test \
+                     packet_parser pde_return_hazard print_clock_speed ras_stress_test ras_test \
                      spanning_test sprintf_test strings_test tomasulo_perf \
                      tomasulo_test uart_echo]
 
@@ -165,10 +165,14 @@ set bram_base_address 0x00000000
 set ddr_text_file ${project_root}/sw/apps/${firmware_application_name}/sw_ddr.txt
 
 # DDR image first (when present): assert the image-load CPU reset with a
-# single low-BRAM write, then burst the DDR image through hw_axi_2. The CPU
-# stays in reset until well after the subsequent full BRAM load, and the
-# caches re-invalidate on release, so the fresh DDR contents are never
-# shadowed by stale lines or racing writebacks.
+# low-BRAM write, then burst the DDR image through hw_axi_2 while RE-ARMING
+# that reset periodically (file2ddr pokes bram_axi every poke_interval bursts).
+# The image_load_reset is only a ~4 s one-shot, far shorter than a multi-MB DDR
+# load, so without the periodic re-arm the CPU would leave reset mid-load and
+# free-run against the half-written DDR image (nondeterministic boot hangs).
+# With it the CPU stays in reset until well after the subsequent full BRAM
+# load, and the caches re-invalidate on release, so the fresh DDR contents are
+# never shadowed by stale lines or racing writebacks.
 if { $has_ddr && $ddr_axi ne "" && [file exists $ddr_text_file] && [file size $ddr_text_file] > 12 } {
     set first_word_fd [open $firmware_text_file r]
     gets $first_word_fd first_word
@@ -177,8 +181,8 @@ if { $has_ddr && $ddr_axi ne "" && [file exists $ddr_text_file] && [file size $d
         -type write -address 0x00000000 -len 1 -data $first_word
     run_hw_axi [get_hw_axi_txns rst_assert]
     set ddr_word_count [expr {[file size $ddr_text_file] / 9}]
-    puts "Loading ~${ddr_word_count} words into DDR via ${ddr_axi} (bursts)..."
-    file2ddr $ddr_text_file $ddr_axi
+    puts "Loading ~${ddr_word_count} words into DDR via ${ddr_axi} (bursts, CPU held in reset)..."
+    file2ddr $ddr_text_file $ddr_axi 256 $bram_axi $first_word
 }
 
 # Write software to low BRAM starting at address 0.
diff --git a/fpga/sweep_coremark_pro.py b/fpga/sweep_coremark_pro.py
index a8e4d64e..4cad3afb 100755
--- a/fpga/sweep_coremark_pro.py
+++ b/fpga/sweep_coremark_pro.py
@@ -20,8 +20,19 @@
 (clean rebuild with the official registry args + JTAG load) on the selected
 board while holding the board UART open, then applies the strict pass rule to
 the captured output: ``<<PASS>>`` present, no ``ERROR``/``<<FAIL>>``/``<<TRAP>>``,
-and every ``:fails=N`` counter zero. Each workload's ``time(secs)`` is extracted
-for the summary table. Exits 0 only if every app passes.
+and every ``:fails=N`` counter zero. Each workload's ``time(secs)`` and
+``iterations`` are extracted and reduced to iter/s for the summary table.
+Exits 0 only if every app passes.
+
+A full passing -v0 sweep also reports the official CoreMark-PRO score: each
+workload's iter/s is multiplied by its scale factor and divided by its
+reference-platform score, and the mark is 1000 x the geometric mean of the
+nine normalized results (EEMBC Symmetric Multicore Benchmark User Guide 2.1.4
+sec. 4.4 p.12, identical to coremark-pro's util/perl/cert_mark.pl). FROST is
+single-core, so the single-context result is both the SingleCore and MultiCore
+mark. -v1 sweeps print iter/s but no score (verification runs are not
+score-eligible), and -v0 workloads finishing under the ~10s score-rule minimum
+get a warning to recalibrate their registry iteration count.
 
 The target board is chosen with the required ``--board`` flag (``x3`` or
 ``genesys2``); both expose all nine hardware-supported workloads. With no app
@@ -30,6 +41,9 @@
 
 The UART device (``--serial``) and JTAG target (``--target``) default per board
 (X3: /dev/ttyUSB2; genesys2: /dev/ttyUSB0); override either with its flag.
+The sweep refuses to start while another process holds the UART open, and
+holds the port in exclusive mode (TIOCEXCL) while running -- a second reader
+(e.g. a forgotten minicom) would silently steal chunks of the capture.
 
 Examples (from the repo root):
 
@@ -45,6 +59,9 @@
 
 import argparse
 import collections
+import fcntl
+import glob
+import math
 import os
 import re
 import select
@@ -59,7 +76,10 @@
 REPO_DEFAULT = SCRIPT_DIR.parent
 
 sys.path.insert(0, str(REPO_DEFAULT / "sw" / "apps"))
-from software_registry import COREMARK_PRO_PROGRAMS  # noqa: E402
+from software_registry import (  # noqa: E402
+    COREMARK_PRO_PROGRAM_BY_APP,
+    COREMARK_PRO_PROGRAMS,
+)
 
 HW_APPS = tuple(p.app_name for p in COREMARK_PRO_PROGRAMS if p.hardware_supported)
 
@@ -96,10 +116,109 @@
 # avoid misreading a prior run's <<PASS>>/time as this run's result.
 LOAD_COMPLETE_SENTINEL = "FROST_LOAD_COMPLETE"
 
+# Official CoreMark-PRO scoring constants: workload -> (scale factor,
+# reference-platform score), from the EEMBC Symmetric Multicore Benchmark User
+# Guide 2.1.4 sec. 4.4 Figure 10 and coremark-pro util/perl/cert_mark.pl (the
+# two agree). A workload's normalized result is iter/s * scale / reference,
+# and the mark is 1000 x the geometric mean of the nine normalized results.
+COREMARK_PRO_REFERENCE = {
+    "cjpeg-rose7-preset": (1.0, 40.3438),
+    "core": (10000.0, 2855.0),
+    "linear_alg-mid-100x100-sp": (1.0, 38.5624),
+    "loops-all-mid-10k-sp": (1.0, 0.87959),
+    "nnet_test": (1.0, 1.45853),
+    "parser-125k": (1.0, 4.81116),
+    "radix2-big-64k": (1.0, 99.6587),
+    "sha-test": (1.0, 48.5201),
+    "zip-test": (1.0, 21.3618),
+}
+
+# Minimum -v0 workload runtime for an official score run; the registry
+# calibrates each workload's iteration count to clear this.
+SCORE_RULE_MIN_SECS = 10.0
+
+# mith prints time(secs) with %8g: usually plain decimal, but accept the
+# exponent form %g falls back to for extreme values.
+MITH_NUMBER = r"([0-9]+(?:\.[0-9]*)?(?:[eE][+-]?[0-9]+)?)"
+
+
+def parse_workload_perf(serial_buf: str, workload: str) -> dict[str, Any]:
+    """Extract the workload-level iterations/time(secs) pair and derive iter/s.
+
+    Mirrors coremark-pro's util/perl/results_parser.pl (iter/s = iterations /
+    time(secs)). Anchoring on the official workload name keeps -v1 per-item
+    lines out of the match: mith prints the workload-level block first, and
+    only that block has an ``iterations=`` line.
+    """
+    name = re.escape(workload)
+    iters_match = re.search(rf"-- {name}:iterations=([0-9]+)", serial_buf)
+    secs_match = re.search(rf"-- {name}:time\(secs\)=\s*{MITH_NUMBER}", serial_buf)
+    iterations = int(iters_match.group(1)) if iters_match else None
+    secs = float(secs_match.group(1)) if secs_match else None
+    ips = None
+    if iterations and secs and secs > 0:
+        ips = iterations / secs
+    return {"iterations": iterations, "secs": secs, "ips": ips}
+
+
+def coremark_pro_mark(
+    ips_by_workload: dict[str, float],
+) -> tuple[float | None, list[str]]:
+    """Compute the official CoreMark-PRO mark from per-workload iter/s.
+
+    Returns (mark, []) when every official workload has a positive iter/s,
+    else (None, sorted missing workload names) -- the mark is only defined
+    over the full set of nine.
+    """
+    missing = sorted(
+        workload
+        for workload in COREMARK_PRO_REFERENCE
+        if not ips_by_workload.get(workload)
+    )
+    if missing:
+        return None, missing
+    log_sum = 0.0
+    for workload, (scale, reference) in COREMARK_PRO_REFERENCE.items():
+        log_sum += math.log(ips_by_workload[workload] * scale / reference)
+    return 1000.0 * math.exp(log_sum / len(COREMARK_PRO_REFERENCE)), []
+
+
+def serial_holders(path: str) -> list[str]:
+    """Return 'pid: cmdline' for other processes holding the serial device.
+
+    The tty layer delivers each received byte to exactly one reader, so a
+    second attached process (a forgotten minicom, an old capture script)
+    steals random chunks of the UART stream and silently corrupts the
+    sweep's capture. Scans /proc, so it only sees same-user processes.
+    """
+    try:
+        target = os.stat(path).st_rdev
+    except OSError:
+        return []
+    holders = set()
+    for fd_link in glob.glob("/proc/[0-9]*/fd/*"):
+        pid = fd_link.split("/")[2]
+        if pid == str(os.getpid()):
+            continue
+        try:
+            if os.stat(fd_link).st_rdev != target:
+                continue
+            with open(f"/proc/{pid}/cmdline", "rb") as f:
+                cmdline = f.read().replace(b"\0", b" ").decode().strip()
+        except OSError:
+            continue
+        holders.add(f"pid {pid}: {cmdline or '<unknown>'}")
+    return sorted(holders)
+
 
 def configure_serial(path: str) -> int:
-    """Open the UART raw/non-blocking at 115200 8N1 and flush stale bytes."""
+    """Open the UART raw/non-blocking at 115200 8N1 and flush stale bytes.
+
+    The port is put in exclusive mode (TIOCEXCL) so a terminal opened
+    mid-sweep gets EBUSY instead of silently stealing capture bytes.
+    """
     fd = os.open(path, os.O_RDWR | os.O_NOCTTY | os.O_NONBLOCK)
+    fcntl.ioctl(fd, termios.TIOCEXCL)
     attrs = termios.tcgetattr(fd)
     attrs[0] = 0
     attrs[1] = 0
@@ -154,6 +273,8 @@ def run_one(
     target: str,
 ) -> dict[str, Any]:
     """Load one app on the given board and watch the UART until a marker/timeout."""
+    program = COREMARK_PRO_PROGRAM_BY_APP.get(app)
+    workload = program.workload if program else None
     drain(serial_fd)
     cmd = [
         "./fpga/load_software/load_software.py",
@@ -241,9 +362,13 @@ def consume_loader(text: str) -> None:
             if proc.returncode != 0:
                 return {
                     "app": app,
+                    "workload": workload,
                     "mode": mode,
                     "status": "LOAD_FAIL",
                     "elapsed": None,
+                    "iterations": None,
+                    "secs": None,
+                    "ips": None,
                     "serial": serial_buf,
                     "loader_tail": list(loader_tail),
                 }
@@ -283,16 +408,89 @@ def consume_loader(text: str) -> None:
     if match:
         workload_time = float(match.group(1))
 
+    perf = (
+        parse_workload_perf(serial_buf, workload)
+        if workload
+        else {"iterations": None, "secs": None, "ips": None}
+    )
+
     return {
         "app": app,
+        "workload": workload,
         "mode": mode,
         "status": status,
         "elapsed": workload_time,
+        **perf,
         "serial": serial_buf,
         "loader_tail": list(loader_tail),
     }
 
 
+def print_score_report(results: list[dict[str, Any]], mode: str) -> None:
+    """Print the per-workload iter/s table and, for a -v0 sweep, the mark."""
+    rows = [r for r in results if r["workload"]]
+    if not rows:
+        return
+
+    print("\nCoreMark-PRO WORKLOAD RESULTS (single context)")
+    print(
+        f"{'Workload Name':<27} {'Status':>9} {'iters':>6} "
+        f"{'time(s)':>10} {'iter/s':>12} {'weighted':>10}"
+    )
+    print(f"{'-' * 27} {'-' * 9} {'-' * 6} {'-' * 10} {'-' * 12} {'-' * 10}")
+    for r in rows:
+        scale_ref = COREMARK_PRO_REFERENCE.get(r["workload"])
+        iters_text = "n/a" if r["iterations"] is None else str(r["iterations"])
+        secs_text = "n/a" if r["secs"] is None else f"{r['secs']:.4f}"
+        ips_text = "n/a" if r["ips"] is None else f"{r['ips']:.6g}"
+        weighted_text = "n/a"
+        if r["ips"] is not None and scale_ref is not None:
+            weighted_text = f"{r['ips'] * scale_ref[0] / scale_ref[1]:.6g}"
+        print(
+            f"{r['workload']:<27} {r['status']:>9} {iters_text:>6} "
+            f"{secs_text:>10} {ips_text:>12} {weighted_text:>10}"
+        )
+    print(
+        "weighted = iter/s x scale / reference-platform score "
+        "(EEMBC guide 2.1.4 sec. 4.4 Fig. 10)"
+    )
+
+    if mode == "-v1":
+        print(
+            "\nCoreMark-PRO score: n/a for -v1 validation sweeps (verification "
+            "runs are not score-eligible); rerun with -v0."
+        )
+        return
+
+    for r in rows:
+        if (
+            r["status"] == "PASS"
+            and r["secs"] is not None
+            and r["secs"] < SCORE_RULE_MIN_SECS
+        ):
+            print(
+                f"warning: {r['workload']} ran {r['secs']:.1f}s, under the "
+                f"~{SCORE_RULE_MIN_SECS:.0f}s score-rule minimum; recalibrate "
+                "its iteration count in sw/apps/software_registry.py"
+            )
+
+    ips_by_workload = {
+        r["workload"]: r["ips"] for r in rows if r["status"] == "PASS" and r["ips"]
+    }
+    score, missing = coremark_pro_mark(ips_by_workload)
+    if score is None:
+        print(
+            "\nCoreMark-PRO score: n/a -- the official mark needs a passing "
+            f"iter/s from all 9 workloads; missing: {', '.join(missing)}"
+        )
+    else:
+        print(f"\nCoreMark-PRO score (single context): {score:.2f}")
+        print(
+            "  1000 x geomean of the 9 weighted results; single core, so "
+            "SingleCore == MultiCore"
+        )
+
+
 def main() -> int:
     """Run the sweep and print the summary table."""
     parser = argparse.ArgumentParser(
@@ -307,7 +505,7 @@ def main() -> int:
         const="-v0",
         help=(
             "longer performance/score sweep: runs the registry-calibrated "
-            "iteration counts"
+            "iteration counts and computes the official CoreMark-PRO score"
         ),
     )
     mode_group.add_argument(
@@ -387,6 +585,18 @@ def main() -> int:
     serial = args.serial if args.serial else DEFAULT_SERIALS[args.board]
     timeout = args.timeout if args.timeout is not None else DEFAULT_TIMEOUTS[args.board]
 
+    holders = serial_holders(serial)
+    if holders:
+        print(
+            f"ERROR: {serial} is already open in another process, which would "
+            "steal chunks of the UART capture:",
+            file=sys.stderr,
+        )
+        for holder in holders:
+            print(f"  {holder}", file=sys.stderr)
+        print("Close it (or pass another --serial) and re-run.", file=sys.stderr)
+        return 1
+
     fd = configure_serial(serial)
     results = []
     try:
@@ -408,6 +618,12 @@ def main() -> int:
                 f"{result['status']} time={result['elapsed']}",
                 flush=True,
             )
+            if result["status"] == "PASS" and result["ips"] is None:
+                print(
+                    "warning: PASS but iterations/time(secs) missing from the "
+                    "capture -- UART bytes lost?",
+                    flush=True,
+                )
             if result["status"] == "LOAD_FAIL":
                 print("loader tail:", flush=True)
                 print("\n".join(result["loader_tail"]), flush=True)
@@ -417,7 +633,11 @@ def main() -> int:
     bad = [r for r in results if r["status"] != "PASS"]
     print(f"\nSUMMARY ({args.board})")
     for r in results:
-        print(f"{args.board} {r['app']} {r['mode']} {r['status']} time={r['elapsed']}")
+        line = f"{args.board} {r['app']} {r['mode']} {r['status']} time={r['elapsed']}"
+        if r["ips"] is not None:
+            line += f" iter/s={r['ips']:.6g}"
+        print(line)
+    print_score_report(results, args.mode)
     return 1 if bad else 0
 
 
diff --git a/hw/rtl/README.md b/hw/rtl/README.md
index ddb83852..bcfeba10 100644
--- a/hw/rtl/README.md
+++ b/hw/rtl/README.md
@@ -4,7 +4,7 @@ This directory contains the synthesizable SystemVerilog for FROST. The current
 CPU is an **out-of-order RV32GCB implementation with a 2-wide front-end and
 2-wide commit**: a 2-wide in-order IF/PD/ID front-end, Tomasulo register renaming
 and dynamic scheduling, out-of-order execution across six function units, and
-precise 2-wide in-order commit, with machine-mode traps and separate
+precise 2-wide in-order commit, with M/U-mode traps and separate
 instruction/data memory ports.
 
 The pipeline width is **asymmetric**. Fetch, decode, rename, ROB allocation,
@@ -88,7 +88,7 @@ backend notes.
 | `cpu_and_mem/cpu/csr/` | In use | Zicsr/Zicntr/fcsr support |
 | `cpu_and_mem/cpu/wb_stage/generic_regfile.sv` | In use | Parameterized INT/FP regfiles for OOO commit |
 | `cpu_and_mem/cpu/ex_stage/` | In use | Shared ALU, multiplier/divider, FPU, and `branch_jump_unit.sv` used by the OOO core and FU shims |
-| `cpu_and_mem/cpu/control/trap_unit.sv` | In use | Machine-mode exception/interrupt handling |
+| `cpu_and_mem/cpu/control/trap_unit.sv` | In use | M- and U-mode exception/interrupt handling (traps taken in M-mode) |
 | `lib/` | In use | Portable RAM/FIFO/stall helper primitives, plus `lib/cache/` (the `frost_cache` hierarchy, AXI bridge, and behavioral DDR model) and `lib/ram/sdp_ram_byte_en.sv` (row-granular byte-enable RAM with a selectable block/ultra primitive backing the cache data arrays) |
 | `peripherals/` | In use | UART TX/RX blocks |
 
@@ -102,7 +102,7 @@ served by the cache hierarchy:
 |--------|---------|------|-------------|
 | ROM | `0x0000_0000` | 96 KiB | Code and read-only data (fast BRAM) |
 | RAM | `0x0001_8000` | 160 KiB | Data, BSS, stack (fast BRAM) |
-| MMIO | `0x4000_0000` | 44 B | UART, FIFOs, CLINT-style timer, software interrupt |
+| MMIO | `0x4000_0000` | 112 KiB | UART/FIFOs/timer; plus Linux-facing ns16550a UART (`0x4000_1000`) and SiFive CLINT (`0x4001_0000`) |
 | DDR | `0x8000_0000` | 1 GiB | Cached region: code (`.ddr_text`), heap and large data (see below) |
 
 The cached tier serves both sides of the core: loads/stores through the
@@ -146,10 +146,22 @@ MMIO registers:
 | `0x4000_0020` | MSIP | Machine software interrupt pending |
 | `0x4000_0024` | UART_RX_STATUS | Bit 0 is data available |
 | `0x4000_0028` | UART_TX_STATUS | Bit 0 is can accept byte |
+| `0x4000_1000`–`101C` | ns16550a UART face | 16550 register file (word stride) aliasing UART_TX/RX for the Linux 8250 driver |
+| `0x4001_0000` | CLINT MSIP | SiFive CLINT alias of MSIP |
+| `0x4001_4000`/`4004` | CLINT MTIMECMP_LO/HI | SiFive CLINT alias of MTIMECMP |
+| `0x4001_BFF8`/`BFFC` | CLINT MTIME_LO/HI | SiFive CLINT alias of MTIME |
 
 The hardware UART console is configured for 115200 baud, 8 data bits, no
 parity, and 1 stop bit (8N1).
 
+For no-MMU Linux, the same UART is also reachable through a standard
+ns16550a register face at `0x4000_1000` (word stride; device-tree
+`reg-shift=2`, `reg-io-width=4`; `earlycon=uart8250,mmio32`), and the timer
+through a SiFive-CLINT-compatible window at `0x4001_0000` (`mtimecmp` at
+`+0x4000`, `mtime` at `+0xBFF8`). Both alias the native registers listed
+above onto the same hardware, so the in-tree Linux 8250 console and CLINT
+timer drivers work without a board-specific driver.
+
 If these addresses change, update `cpu_and_mem.sv`, `cpu_ooo.sv` parameters,
 `sw/common/link.ld`, `sw/lib/include/mmio.h`, and the verification constants in
 `verif/config.py`.
@@ -161,7 +173,8 @@ From the repo root:
 ```bash
 # Cocotb/Verilator simulation
 ./tests/test_run_cocotb.py hello_world
-./tests/test_run_cocotb.py cpu
+./tests/test_run_cocotb.py tomasulo_test
+./tests/test_run_cocotb.py --list-tests   # show all registered tests
 
 # Open-source RTL synthesis checks
 ./tests/test_run_yosys.py
@@ -180,7 +193,7 @@ sed -n '1,200p' hw/rtl/frost.f
 The CPU build file list is:
 
 ```bash
-sed -n '1,200p' hw/rtl/cpu_and_mem/cpu/cpu_ooo.f
+sed -n '1,200p' hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.f
 ```
 
 ## Parameters
@@ -199,7 +212,7 @@ sed -n '1,200p' hw/rtl/cpu_and_mem/cpu/cpu_ooo.f
 | `frost.sv` | `DDR_MODEL_BYTES` / `DDR_MODEL_LATENCY` | `64 MiB` / `30` | Behavioral DDR model size and access latency (simulation) |
 | `frost.sv` | `FETCH_VALID_FUZZ` | `0` | Simulation-only: 1 wraps the low BRAM in a variable-latency fetch model (LFSR fetch-valid gaps) that mirrors the L1I provider's fetch contract; hardware keeps 0 |
 | `cpu_ooo.sv` | `MMIO_ADDR` | `32'h4000_0000` | MMIO base |
-| `cpu_ooo.sv` | `MMIO_SIZE_BYTES` | `32'h2C` | MMIO range size |
+| `cpu_ooo.sv` | `MMIO_SIZE_BYTES` | `32'h2C` | MMIO range size; `cpu_and_mem.sv` overrides to `32'h1_C000` (covers the ns16550a face + CLINT alias) |
 
 Simulation overrides parameters through Verilator generics (`-G`): the test
 Makefile enables the cached tier with the X3 hierarchy shape by default
diff --git a/hw/rtl/cpu_and_mem/cpu/README.md b/hw/rtl/cpu_and_mem/cpu/README.md
index 4f3e8166..2532d352 100644
--- a/hw/rtl/cpu_and_mem/cpu/README.md
+++ b/hw/rtl/cpu_and_mem/cpu/README.md
@@ -104,7 +104,7 @@ instruction size.
 | `if_stage/`, `pd_stage/`, `id_stage/` | **In use**  | Reused front-end stages, including BTB/direction/RAS prediction, PD BTB-miss redirects, and RVC handling. IF now drives a stall-capable, variable-latency fetch seam (NOP bubbles + a 1-deep owed-ask while unserved) so code can run from the cached DDR region as well as low BRAM; the seam's `fetch_provider` (low-BRAM fast path vs. a two-line L1I fetch buffer with predecode-on-fill) lives one level up in `cpu_and_mem/`. |
 | `wb_stage/`                         | **In use**    | Only the parameterized regfile is in the OOO build (instantiated twice for INT / FP). |
 | `csr/`                              | **In use**    | Zicsr / Zicntr / fcsr. CSR ops are decoded in ID but read and write the CSR at commit through the ROB serializing FSM. |
-| `control/trap_unit.sv`               | **In use**    | Machine-mode exception/interrupt handling used by `cpu_ooo.sv`. |
+| `control/trap_unit.sv`               | **In use**    | M- and U-mode exception/interrupt handling (traps taken in M-mode) used by `cpu_ooo.sv`. |
 | `ex_stage/`                         | **In use**    | `branch_jump_unit.sv` is instantiated directly at top level. ALU/MUL/DIV/FPU are used via the FU shims in `tomasulo/fu_shims/`. |
 
 `cpu_ooo.f` is the authoritative filelist for what actually gets compiled.
diff --git a/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv b/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv
index 089e4583..446fe9e1 100644
--- a/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv
+++ b/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv
@@ -15,18 +15,21 @@
  */
 
 /*
- * Trap Unit - Machine-mode exception and interrupt handling
+ * Trap Unit - exception and interrupt handling
  *
  * This module implements the RISC-V privileged architecture trap mechanism,
  * supporting both synchronous exceptions and asynchronous interrupts.
+ * Traps originate from M-mode or U-mode and are always taken in M-mode (mtvec).
+ * Machine interrupts are taken while running in U-mode regardless of mstatus.MIE,
+ * so the timer can preempt user code.
  *
  * Responsibilities:
  * =================
- *   - Exception detection from EX stage (ECALL, EBREAK, misaligned access)
+ *   - Exception handling from ROB commit (ECALL, EBREAK, misaligned access)
  *   - Interrupt prioritization and masking
  *   - Trap entry: save state, redirect to mtvec
  *   - Trap exit (MRET): restore state, return to mepc
- *   - WFI: stall until interrupt pending
+ *   - WFI state machine (unused in cpu_ooo; see WFI Behavior below)
  *
  * Trap Priority (highest to lowest):
  * ==================================
@@ -62,6 +65,8 @@
  *   - Stall pipeline until any interrupt is pending
  *   - Resume at next instruction if interrupt not taken
  *   - Take trap if interrupt is both pending and enabled
+ *   - NOTE: unused in cpu_ooo -- i_wfi_start is tied to 0 and o_stall_for_wfi
+ *     is unconnected; WFI stalling is handled by ROB serialization at the head
  *
  * Related Modules:
  *   - csr_file.sv: Provides mstatus/mie/mtvec/mepc, receives trap updates
@@ -96,6 +101,10 @@ module trap_unit #(
     // Direct MIE bit input keeps mstatus bit extraction out of this path.
     input logic i_mstatus_mie_direct,
 
+    // Current privilege mode. Machine interrupts are taken whenever running
+    // below M (priv != PrivM) regardless of mstatus.MIE (RISC-V privileged spec).
+    input logic [1:0] i_priv,
+
     // Interrupt pending inputs
     input riscv_pkg::interrupt_t i_interrupts,
 
@@ -104,6 +113,7 @@ module trap_unit #(
     input logic [XLEN-1:0] i_exception_cause,
     input logic [XLEN-1:0] i_exception_tval,
     input logic [XLEN-1:0] i_exception_pc,
+    input logic [XLEN-1:0] i_interrupt_pc,
 
     // MRET trap-return request
     input logic i_mret_start,
@@ -136,18 +146,39 @@ module trap_unit #(
   assign mie_msie = i_mie[riscv_pkg::MieMsiBit];
 
   // Register trap_taken for one cycle to prevent it from re-asserting immediately
-  // after CSR update (breaks combinational loop with mstatus_mie)
+  // after CSR update (breaks combinational loop with mstatus_mie).  Also keep
+  // a one-cycle MRET recovery marker: CSR privilege/MIE state changes on the
+  // raw MRET pulse, while the OOO front/back-end flush is registered one cycle
+  // later.  During that handoff, an old registered interrupt must not trap with
+  // mepc equal to the MRET instruction itself.
   logic trap_taken_prev;
+  logic mret_taken_prev;
   always_ff @(posedge i_clk) begin
-    if (i_rst) trap_taken_prev <= 1'b0;
-    else trap_taken_prev <= o_trap_taken;
+    if (i_rst) begin
+      trap_taken_prev <= 1'b0;
+      mret_taken_prev <= 1'b0;
+    end else begin
+      trap_taken_prev <= o_trap_taken;
+      mret_taken_prev <= o_mret_taken;
+    end
   end
 
-  // Interrupt pending and enabled (gate by !trap_taken_prev to prevent re-entry)
+  logic mret_interrupt_inhibit;
+  assign mret_interrupt_inhibit = i_mret_start || mret_taken_prev;
+
+  // Interrupt pending and enabled (gate by !trap_taken_prev to prevent re-entry).
+  // Global M-interrupt enable: mstatus.MIE while in M, but ALWAYS enabled while
+  // running below M (priv != PrivM) so a machine timer/SW/ext interrupt can
+  // preempt U-mode even with MIE=0 (RISC-V privileged spec).
+  logic m_int_globally_enabled;
+  assign m_int_globally_enabled = mstatus_mie || (i_priv != riscv_pkg::PrivM);
   logic meip_enabled, mtip_enabled, msip_enabled;
-  assign meip_enabled = i_interrupts.meip && mie_meie && mstatus_mie && !trap_taken_prev;
-  assign mtip_enabled = i_interrupts.mtip && mie_mtie && mstatus_mie && !trap_taken_prev;
-  assign msip_enabled = i_interrupts.msip && mie_msie && mstatus_mie && !trap_taken_prev;
+  assign meip_enabled = i_interrupts.meip && mie_meie && m_int_globally_enabled &&
+      !trap_taken_prev && !mret_interrupt_inhibit;
+  assign mtip_enabled = i_interrupts.mtip && mie_mtie && m_int_globally_enabled &&
+      !trap_taken_prev && !mret_interrupt_inhibit;
+  assign msip_enabled = i_interrupts.msip && mie_msie && m_int_globally_enabled &&
+      !trap_taken_prev && !mret_interrupt_inhibit;
 
   // TIMING OPTIMIZATION: Register interrupt_pending to break critical path.
   // The combinational path from msip -> interrupt_pending -> take_trap -> stall -> cache
@@ -156,11 +187,53 @@ module trap_unit #(
   // Note: mtip is already registered in cpu_and_mem.sv for similar timing reasons.
   logic interrupt_pending_comb;
   logic interrupt_pending;
-  assign interrupt_pending_comb = meip_enabled || mtip_enabled || msip_enabled;
+  // Gate with !o_trap_taken so a still-pending interrupt is NOT re-latched on
+  // the cycle its own trap is taken. interrupt_pending is registered, so
+  // otherwise the latched value fires a second, spurious trap entry the next
+  // cycle (re-saving mstatus.MPP=M and corrupting a U-mode trap). NOT a comb
+  // loop: o_trap_taken derives from the REGISTERED interrupt_pending, so the
+  // feedback path passes through a flop.
+  assign interrupt_pending_comb = (meip_enabled || mtip_enabled || msip_enabled) && !o_trap_taken;
+
+  // Source-level qualification: pending AND locally enabled (mie.x) and not in a
+  // trap/MRET recovery window -- but NOT gated by the live global mstatus.MIE.
+  //
+  // Once interrupt_pending has been LATCHED (while fully eligible, MIE=1), a
+  // YOUNGER csr clear of mstatus.MIE (e.g. the kernel idle `csrsi; ...; csrci`)
+  // must not retroactively erase it: the interrupt was eligible at an instruction
+  // boundary the csr-clear is younger than, so per the spec it is taken (the
+  // csr-clear is squashed by the trap). interrupt_pending is registered (1-cycle
+  // late) and interrupt_pending_eligible re-checks the LIVE global enable, so
+  // without a hold a csr-clear's delayed mstatus.MIE side-effect lands in the
+  // sample-to-service gap, drops interrupt_pending_comb, and clears the
+  // already-qualified bit -> the interrupt is LOST. On the no-MMU kernel that
+  // dropped machine-timer tick freezes jiffies and hangs the boot. (Usually the
+  // service is delayed one cycle by a draining store via i_sq_committed_empty,
+  // widening the window.) Hold across a global-MIE drop; still release when the
+  // source itself de-qualifies (mtip/meip/msip drops or mie.x cleared) or the
+  // trap is taken, so masking and acks behave normally.
+  // interrupt_source_live: a REAL, current interrupt source exists -- pending AND
+  // locally enabled (mie.x), gated ONLY by !trap_taken_prev. NOT gated by the live
+  // global mstatus.MIE and NOT by mret_interrupt_inhibit, so a persistent timer is
+  // HELD across both a global-MIE drop AND the MRET-recovery window rather than
+  // erased. It is still never TAKEN there (interrupt_pending_eligible keeps
+  // !mret_interrupt_inhibit + live m_int_globally_enabled), and the 0x80388bba
+  // panic stays guarded by the cpu_ooo interrupt_resume_pc seed on mret_taken (not
+  // by this latch) -- per commit 718f8cc the seed is THE panic fix and the old
+  // trap_unit MRET/interrupt cancel was incidental bring-up timing. A stale sample
+  // whose source has dropped (source_live=0) is still cleared, preserving the
+  // "cancel a stale one-cycle sample before MRET" property.
+  logic interrupt_source_live;
+  assign interrupt_source_live =
+      ((i_interrupts.meip && mie_meie) || (i_interrupts.mtip && mie_mtie) ||
+       (i_interrupts.msip && mie_msie)) && !trap_taken_prev;
 
   always_ff @(posedge i_clk) begin
     if (i_rst) interrupt_pending <= 1'b0;
-    else interrupt_pending <= interrupt_pending_comb;
+    else if (interrupt_pending_comb) interrupt_pending <= 1'b1;  // latch when fully eligible
+    else if (interrupt_pending && interrupt_source_live && !o_trap_taken)
+      interrupt_pending <= 1'b1;  // hold a live source across a global-MIE drop AND MRET inhibit
+    else interrupt_pending <= 1'b0;  // clear stale (no live source) / on take
   end
 
   // Register synchronous exceptions from the ROB head before trap entry.
@@ -177,6 +250,12 @@ module trap_unit #(
       exception_pending <= 1'b0;
     end else if (o_trap_taken) begin
       exception_pending <= 1'b0;
+    end else if (trap_taken_prev) begin
+      // Hold cleared one extra cycle: i_exception_valid (the ROB's trap_pending)
+      // stays high until the trap is acked (~1 cycle after o_trap_taken), so
+      // without this the exception re-arms and the trap is taken a second time
+      // (now in M, corrupting mstatus.MPP / mcause for a U-mode trap).
+      exception_pending <= 1'b0;
     end else if (i_exception_valid) begin
       exception_pending <= 1'b1;
       exception_cause_q <= i_exception_cause;
@@ -241,23 +320,53 @@ module trap_unit #(
   end
 
   always_ff @(posedge i_clk) begin
-    interrupt_cause <= interrupt_cause_comb;
+    // Hold the cause while interrupt_pending is held (across a global-MIE drop or
+    // the MRET inhibit); interrupt_cause_comb is built from the gated *_enabled so
+    // it decays to 0 there, which would default interrupt_latched_source_enabled
+    // false and leave the held interrupt ineligible when it can finally trap.
+    if (interrupt_cause_comb != '0) interrupt_cause <= interrupt_cause_comb;
+    else if (interrupt_pending && interrupt_source_live) interrupt_cause <= interrupt_cause;
+    else interrupt_cause <= '0;
   end
 
+  // A registered interrupt request must still be enabled when it reaches the
+  // trap decision. This keeps raw interrupt inputs out of the take_trap cone,
+  // while allowing CSR writes such as Linux's ret_from_exception mstatus
+  // restore to cancel a stale one-cycle interrupt sample before MRET.
+  logic interrupt_latched_source_enabled;
+  always_comb begin
+    unique case (interrupt_cause)
+      riscv_pkg::IntMachineExternal: interrupt_latched_source_enabled = mie_meie;
+      riscv_pkg::IntMachineSoftware: interrupt_latched_source_enabled = mie_msie;
+      riscv_pkg::IntMachineTimer:    interrupt_latched_source_enabled = mie_mtie;
+      default:                       interrupt_latched_source_enabled = 1'b0;
+    endcase
+  end
+
+  logic interrupt_pending_eligible;
+  assign interrupt_pending_eligible = interrupt_pending &&
+      interrupt_latched_source_enabled &&
+      m_int_globally_enabled &&
+      !trap_taken_prev &&
+      !mret_interrupt_inhibit;
+
   // Trap taken: either interrupt or exception, the pipeline not stalled
   // (except for WFI stall, which should be broken by interrupt), and no
   // committed store still draining (see i_sq_committed_empty).
   logic take_trap;
-  assign take_trap = (interrupt_pending || exception_pending) && !i_pipeline_stall &&
+  assign take_trap = (interrupt_pending_eligible || exception_pending) &&
+      !i_pipeline_stall &&
       i_sq_committed_empty;
 
-  // MRET execution (trap has priority: if interrupt/exception fires same cycle, trap wins)
+  // MRET execution.  Synchronous exceptions are structurally impossible with
+  // MRET at the ROB head; pending interrupts are deferred across the MRET
+  // recovery window above so the return redirect stays precise.
   logic take_mret;
   assign take_mret = i_mret_start && !i_pipeline_stall && !take_trap && i_sq_committed_empty;
 
   // Hold commit while a trap/MRET waits out the store drain, so the
   // committed set shrinks monotonically and the wait is bounded.
-  assign o_trap_drain_wait = (interrupt_pending || exception_pending || i_mret_start) &&
+  assign o_trap_drain_wait = (interrupt_pending_eligible || exception_pending || i_mret_start) &&
       !i_sq_committed_empty;
 
   // Output trap signals
@@ -272,7 +381,7 @@ module trap_unit #(
       o_trap_target = i_mepc;
     end else if (take_trap) begin
       // Check mtvec mode
-      if (i_mtvec[1:0] == 2'b01 && interrupt_pending) begin
+      if (i_mtvec[1:0] == 2'b01 && interrupt_pending_eligible) begin
         // Vectored mode for interrupts: BASE + 4*cause_code
         // Use pre-computed small offset (6 bits) for faster timing than
         // extracting from full interrupt_cause which synthesis can't optimize
@@ -289,11 +398,13 @@ module trap_unit #(
   // Trap entry information for CSR file
   // Interrupts have priority over synchronous exceptions
   always_comb begin
-    if (interrupt_pending) begin
+    if (interrupt_pending_eligible) begin
       o_trap_cause = interrupt_cause;
       o_trap_value = '0;  // Interrupts have mtval = 0
-      // For interrupts, save PC of next instruction (the one that will be interrupted)
-      o_trap_pc = i_exception_pc;
+      // For interrupts, save the precise architectural resume PC.  The live
+      // ROB head PC can be transient or stale while an async interrupt drains
+      // through the registered commit path.
+      o_trap_pc = i_interrupt_pc;
     end else begin
       o_trap_cause = exception_cause_q;
       o_trap_value = exception_tval_q;
@@ -318,9 +429,9 @@ module trap_unit #(
     assume (!(i_mret_start && i_exception_valid));
     assume (!(i_wfi_start && i_mret_start));
     assume (!(i_wfi_start && i_exception_valid));
-    // Note: MRET + interrupt_pending is NOT assumed away. The RTL handles it
-    // by giving trap priority (!take_trap gate on take_mret), and the
-    // p_trap_mret_mutex assertion proves this without over-constraining.
+    // Note: MRET + interrupt_pending is NOT assumed away. MRET wins that race;
+    // the pending interrupt is re-sampled after the return redirect has had
+    // time to retire the MRET precisely.
   end
 
   always @(posedge i_clk) begin
@@ -329,7 +440,8 @@ module trap_unit #(
       p_trap_mret_mutex : assert (!(o_trap_taken && o_mret_taken));
 
       // Trap needs source: trap_taken requires interrupt or exception.
-      p_trap_needs_source : assert (!o_trap_taken || (interrupt_pending || exception_pending));
+      p_trap_needs_source :
+      assert (!o_trap_taken || (interrupt_pending_eligible || exception_pending));
 
       // Trap not during stall: traps only fire when pipeline not stalled.
       p_trap_not_stalled : assert (!o_trap_taken || !i_pipeline_stall);
@@ -344,6 +456,11 @@ module trap_unit #(
       // MRET target is mepc: when MRET fires, target must be mepc.
       p_mret_target : assert (!o_mret_taken || (o_trap_target == i_mepc));
 
+      // A pending interrupt must not preempt the MRET instruction itself.
+      if (i_mret_start && !exception_pending) begin
+        p_mret_defers_interrupt : assert (!o_trap_taken);
+      end
+
       // WFI stall contract: if stall_for_wfi_comb, wfi must be active.
       p_wfi_stall_needs_active : assert (!stall_for_wfi_comb || wfi_active);
     end
@@ -403,8 +520,8 @@ module trap_unit #(
       cover_wfi_stall : cover (stall_for_wfi_comb);
       cover_wfi_wakeup : cover (f_past_valid && !wfi_active && $past(wfi_active));
       cover_external_interrupt :
-      cover (interrupt_pending && interrupt_cause == riscv_pkg::IntMachineExternal);
-      cover_exception : cover (o_trap_taken && i_exception_valid && !interrupt_pending);
+      cover (interrupt_pending_eligible && interrupt_cause == riscv_pkg::IntMachineExternal);
+      cover_exception : cover (o_trap_taken && i_exception_valid && !interrupt_pending_eligible);
       cover_trap_after_drain : cover (f_past_valid && o_trap_taken && $past(o_trap_drain_wait));
     end
   end
diff --git a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/branch_recovery/branch_resolution.sv b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/branch_recovery/branch_resolution.sv
index 298a03e4..4f065d9d 100644
--- a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/branch_recovery/branch_resolution.sv
+++ b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/branch_recovery/branch_resolution.sv
@@ -86,15 +86,27 @@ module branch_resolution #(
   logic [riscv_pkg::ReorderBufferTagWidth:0] branch_issue_age;
   logic [riscv_pkg::ReorderBufferTagWidth:0] early_flush_age;
   logic [riscv_pkg::ReorderBufferTagWidth:0] commit_flush_age;
+  // TIMING: compare-then-mux instead of mux-then-compare.  The original form
+  // muxed the 5-bit owner tag by checkpoint_id and THEN compared against
+  // rob_tag (8:1 x 5b mux + 5b compare in series).  Computing the per-
+  // checkpoint live bit first lets all eight in_use+owner-tag compares run in
+  // parallel straight out of the checkpoint registers, leaving only a 1-bit
+  // 8:1 select behind checkpoint_id.  Pure boolean identity — for every
+  // checkpoint_id value the selected bit is exactly the original expression.
+  logic [riscv_pkg::NumCheckpoints-1:0] checkpoint_live_per_id;
   always_comb begin
-    branch_issue_checkpoint_live = 1'b1;
-    if (rs_issue_int.has_checkpoint) begin
+    for (int i = 0; i < riscv_pkg::NumCheckpoints; i++) begin
       // Use the registered checkpoint state here to avoid a feedback loop
       // through execute-time checkpoint free.  The owner-tag check still
       // filters out stale/reused checkpoint IDs.
-      branch_issue_checkpoint_live =
-          checkpoint_in_use[rs_issue_int.checkpoint_id] &&
-          (checkpoint_owner_tag[rs_issue_int.checkpoint_id] == rs_issue_int.rob_tag);
+      checkpoint_live_per_id[i] =
+          checkpoint_in_use[i] && (checkpoint_owner_tag[i] == rs_issue_int.rob_tag);
+    end
+  end
+  always_comb begin
+    branch_issue_checkpoint_live = 1'b1;
+    if (rs_issue_int.has_checkpoint) begin
+      branch_issue_checkpoint_live = checkpoint_live_per_id[rs_issue_int.checkpoint_id];
     end
   end
 
@@ -133,10 +145,22 @@ module branch_resolution #(
     // suppress_branch_resolution → is_branch_issue → branch comparison (CARRY8)
     // → branch_update → commit_en created a 16-level combinational chain that
     // was the WNS critical path (-0.739 ns).  Removing it is safe because:
-    //   (a) commit_en already has a direct branch_update collision guard that
-    //       delays commit when the same branch resolves and commits in one cycle;
-    //   (b) resolution writes to entries that will be flushed are harmless;
-    //   (c) early_mispredict_fire still gates on the candidate directly.
+    //   (a) a resolving branch can never BE the committing head: branches have
+    //       no CDB done-bypass (reorder_buffer head_cdb_bypass excludes
+    //       head_is_branch), so a branch's done bit is registered and it can
+    //       only be head_ready the cycle AFTER its branch_update;
+    //   (b) resolution writes to entries that will be flushed are harmless --
+    //       flush-after-head invalidates them next cycle, allocation re-inits
+    //       the branch bits, and the unresolved-branch counter resets on
+    //       flush_pipeline;
+    //   (c) an early_mispredict_fire coinciding with a head-mispredict commit
+    //       is DROPPED one cycle later: early_mispredict_active gates on
+    //       !mispredict_recovery_pending (early_misprediction_recovery.sv),
+    //       which registers the commit-time recovery launch, so the early
+    //       pulse dies before any redirect / RAT restore / rob_early_recovered
+    //       write / backend flush.  (The former fire-time candidate gate was
+    //       removed for timing; o_head_commit_misprediction_candidate is now
+    //       an unconsumed observation output.)
   end
 
   assign suppress_branch_resolution = branch_issue_is_flushed;
diff --git a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/branch_recovery/early_misprediction_recovery.sv b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/branch_recovery/early_misprediction_recovery.sv
index 1740e9fb..bac1f730 100644
--- a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/branch_recovery/early_misprediction_recovery.sv
+++ b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/branch_recovery/early_misprediction_recovery.sv
@@ -97,8 +97,14 @@ module early_misprediction_recovery #(
   logic early_mispredict_fire;
   logic early_mispredict_pending;
   logic early_mispredict_active;
-  logic early_backend_recovery_pending;
-  logic [riscv_pkg::ReorderBufferTagWidth-1:0] early_backend_flush_tag;
+  // TIMING: this single FF broadcast into ~1300 failing endpoints post-opt
+  // (flush_en -> RS/LQ/SQ/ROB kill and capture gating).  Cap the fanout so
+  // synthesis replicates the register per consumer region.  Replication only
+  // — D input, resets, and the sacred recovery conditions are untouched.
+  (* max_fanout = 48 *) logic early_backend_recovery_pending;
+  // TIMING: flush-tag broadcast feeding per-entry age compares across the
+  // backend (CDB kill, LQ/RS squash).  Same register-replication treatment.
+  (* max_fanout = 48 *) logic [riscv_pkg::ReorderBufferTagWidth-1:0] early_backend_flush_tag;
 
   // Captured data from the mispredicting branch
   logic [riscv_pkg::ReorderBufferTagWidth-1:0] early_mispredict_tag;
diff --git a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/branch_recovery/misprediction_flush_controller.sv b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/branch_recovery/misprediction_flush_controller.sv
index bdc6c5cd..080ddbdf 100644
--- a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/branch_recovery/misprediction_flush_controller.sv
+++ b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/branch_recovery/misprediction_flush_controller.sv
@@ -121,9 +121,14 @@ module misprediction_flush_controller #(
   logic dispatch_flush;
   logic full_flush_side_effect_kill;
   logic frontend_state_flush;
-  logic flush_en;
-  logic [riscv_pkg::ReorderBufferTagWidth-1:0] flush_tag;
-  logic flush_all;
+  // TIMING: flush_en / flush_tag / flush_all broadcast into the whole backend
+  // (ROB commit gate, RS/LQ/SQ kills, RAT).  They are shallow functions of
+  // registered recovery state, so cap the fanout and let synthesis replicate
+  // the driver LUTs per consumer region.  Pure fanout splitting — the
+  // priority structure below is untouched.
+  (* max_fanout = 64 *) logic flush_en;
+  (* max_fanout = 64 *) logic [riscv_pkg::ReorderBufferTagWidth-1:0] flush_tag;
+  (* max_fanout = 64 *) logic flush_all;
   logic commit_recovery_flush_after_head;
   logic checkpoint_restore;
   logic [riscv_pkg::CheckpointIdWidth-1:0] checkpoint_restore_id;
diff --git a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv
index 20fb096e..fe8d7648 100644
--- a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv
+++ b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv
@@ -46,6 +46,7 @@ module cpu_ooo #(
     input logic [63:0] i_instr,  // 64-bit fetch: {next_word, current_word}
     input logic [riscv_pkg::ImemFetchSidebandWidth-1:0] i_instr_sideband,
     input logic i_instr_bank_sel_r,  // Fetch-word parity (for spanning select)
+    input logic [31:0] i_served_addr,  // Served fetch-window tag (served-window guard)
     // Fetch window valid (see if_stage).  Tie 1 for fixed 1-cycle providers.
     input logic i_instr_valid,
     // Stall-replay bundle consumed this cycle (see if_stage) -- the fetch
@@ -104,6 +105,10 @@ module cpu_ooo #(
     // Interrupts
     input riscv_pkg::interrupt_t i_interrupts,
     input logic [63:0] i_mtime,
+    output logic [5:0] o_debug_irq_status,
+    output logic [XLEN-1:0] o_debug_commit_pc,
+    output logic [XLEN-1:0] o_debug_commit_2_pc,
+    output logic [1:0] o_debug_commit_valid,
     // Debug
     input logic i_disable_branch_prediction
 );
@@ -298,6 +303,39 @@ module cpu_ooo #(
   logic [riscv_pkg::ReorderBufferTagWidth-1:0] dbg_rat_alloc_rob_tag  /* verilator public_flat_rd */;
   logic [XLEN-1:0] dbg_last_a0_alloc_pc  /* verilator public_flat_rd */;
   logic [riscv_pkg::ReorderBufferTagWidth-1:0] dbg_last_a0_alloc_tag  /* verilator public_flat_rd */;
+  logic dbg_trap_taken_raw  /* verilator public_flat_rd */;
+  logic dbg_trap_taken_q  /* verilator public_flat_rd */;
+  logic [XLEN-1:0] dbg_trap_cause_internal  /* verilator public_flat_rd */;
+  logic [XLEN-1:0] dbg_trap_pc_internal  /* verilator public_flat_rd */;
+  logic [XLEN-1:0] dbg_interrupt_resume_pc  /* verilator public_flat_rd */;
+  logic dbg_port0_int_we  /* verilator public_flat_rd */;
+  logic [riscv_pkg::RegAddrWidth-1:0] dbg_port0_int_addr  /* verilator public_flat_rd */;
+  logic [XLEN-1:0] dbg_port0_int_data  /* verilator public_flat_rd */;
+  logic dbg_port1_int_we  /* verilator public_flat_rd */;
+  logic [riscv_pkg::RegAddrWidth-1:0] dbg_port1_int_addr  /* verilator public_flat_rd */;
+  logic [XLEN-1:0] dbg_port1_int_data  /* verilator public_flat_rd */;
+  logic dbg_commit_dest_valid  /* verilator public_flat_rd */;
+  logic dbg_commit_dest_rf  /* verilator public_flat_rd */;
+  logic [riscv_pkg::RegAddrWidth-1:0] dbg_commit_dest_reg  /* verilator public_flat_rd */;
+  logic [XLEN-1:0] dbg_commit_value  /* verilator public_flat_rd */;
+  logic dbg_commit_2_valid  /* verilator public_flat_rd */;
+  logic [XLEN-1:0] dbg_commit_2_pc  /* verilator public_flat_rd */;
+  logic dbg_commit_2_dest_valid  /* verilator public_flat_rd */;
+  logic dbg_commit_2_dest_rf  /* verilator public_flat_rd */;
+  logic [riscv_pkg::RegAddrWidth-1:0] dbg_commit_2_dest_reg  /* verilator public_flat_rd */;
+  logic [XLEN-1:0] dbg_commit_2_value  /* verilator public_flat_rd */;
+  logic dbg_rob_commit_reg_valid  /* verilator public_flat_rd */;
+  logic [XLEN-1:0] dbg_rob_commit_reg_pc  /* verilator public_flat_rd */;
+  logic dbg_rob_commit_reg_dest_valid  /* verilator public_flat_rd */;
+  logic dbg_rob_commit_reg_dest_rf  /* verilator public_flat_rd */;
+  logic [riscv_pkg::RegAddrWidth-1:0] dbg_rob_commit_reg_dest_reg  /* verilator public_flat_rd */;
+  logic [XLEN-1:0] dbg_rob_commit_reg_value  /* verilator public_flat_rd */;
+  logic dbg_rob_commit_2_reg_valid  /* verilator public_flat_rd */;
+  logic [XLEN-1:0] dbg_rob_commit_2_reg_pc  /* verilator public_flat_rd */;
+  logic dbg_rob_commit_2_reg_dest_valid  /* verilator public_flat_rd */;
+  logic dbg_rob_commit_2_reg_dest_rf  /* verilator public_flat_rd */;
+  logic [riscv_pkg::RegAddrWidth-1:0] dbg_rob_commit_2_reg_dest_reg  /* verilator public_flat_rd */;
+  logic [XLEN-1:0] dbg_rob_commit_2_reg_value  /* verilator public_flat_rd */;
   // verilog_lint: waive-stop line-length
 `endif
 
@@ -407,6 +445,7 @@ module cpu_ooo #(
       .i_instr,
       .i_instr_sideband,
       .i_instr_bank_sel_r,
+      .i_served_addr,
       .i_instr_valid,
       .o_fetch_replay_consume,
       .i_from_ex_comb(from_ex_comb_synth),
@@ -636,6 +675,11 @@ module cpu_ooo #(
   logic rob_commit_2_store_like_raw;
   logic rob_commit_2_valid;
   assign rob_commit_2_valid = rob_commit_2.valid;
+  logic rob_commit_store_like_raw;
+  logic sq_committed_empty_for_trap;
+  assign rob_commit_store_like_raw =
+      rob_commit_valid_raw &&
+      (rob_commit_comb.is_store || rob_commit_comb.is_fp_store || rob_commit_comb.is_sc);
   logic widen_commit_ok;
   assign widen_commit_ok = 1'b1;
   logic [riscv_pkg::ReorderBufferDepth-1:0] rob_entry_epoch;
@@ -839,7 +883,20 @@ module cpu_ooo #(
   logic trap_pending;
   logic trap_mret_commit_hold_q;
   logic [XLEN-1:0] rob_trap_pc;
+  logic rob_head_is_wfi;  // ROB head decodes as WFI (drives the WFI interrupt-resume-PC seed)
+  // Retired-next-PC precompute from the ROB (TIMING): equals
+  // retired_next_pc(rob_commit_comb) / (rob_commit_comb_2) whenever the
+  // corresponding commit valid is high, but computed from ungated head fields
+  // so the RAM read + adder are off the late commit_en cone.
+  logic [XLEN-1:0] rob_head_retired_next_pc;
+  logic [XLEN-1:0] rob_head_next_retired_next_pc;
   riscv_pkg::exc_cause_t rob_trap_cause;
+  riscv_pkg::exc_cause_t rob_trap_cause_remapped;
+  logic [1:0] csr_priv;  // current privilege from csr_file (PrivM/PrivU)
+  // Arbitrated trap cause from trap_unit (interrupt cause with bit 31, or the
+  // remapped synchronous-exception cause) -> csr_file mcause. Declared here so
+  // it is visible above the trap_unit instantiation that drives it.
+  logic [XLEN-1:0] trap_cause_internal;
   logic [XLEN-1:0] rob_trap_value;
   logic rob_trap_taken_ack;
   logic mret_start, mret_done_ack;
@@ -1007,6 +1064,9 @@ module cpu_ooo #(
       .i_alloc_req_2(rob_alloc_req_2),
       .o_alloc_resp_2(rob_alloc_resp_2),
 
+      // Current privilege (PrivM/PrivU) for U-mode CSR/MRET illegal checks
+      .i_priv(csr_priv),
+
       .o_cdb_grant(cdb_grant),
       .o_cdb(cdb_out),
 
@@ -1042,6 +1102,9 @@ module cpu_ooo #(
       .i_csr_done(csr_done_ack),
       .o_trap_pending(trap_pending),
       .o_trap_pc(rob_trap_pc),
+      .o_head_is_wfi(rob_head_is_wfi),
+      .o_head_retired_next_pc(rob_head_retired_next_pc),
+      .o_head_next_retired_next_pc(rob_head_next_retired_next_pc),
       .o_trap_cause(rob_trap_cause),
       .o_trap_value(rob_trap_value),
       .i_trap_taken(rob_trap_taken_ack),
@@ -1630,6 +1693,42 @@ module cpu_ooo #(
   // The wrapper already provides a registered observation port for commit.
   assign rob_commit_valid = rob_commit.valid;
 
+`ifndef SYNTHESIS
+  assign dbg_trap_taken_raw = trap_taken;
+  assign dbg_trap_taken_q = trap_taken_reg;
+  assign dbg_trap_cause_internal = trap_cause_internal;
+  assign dbg_trap_pc_internal = trap_pc_internal;
+  assign dbg_interrupt_resume_pc = interrupt_resume_pc;
+  assign dbg_port0_int_we = port0_int_we;
+  assign dbg_port0_int_addr = port0_int_addr;
+  assign dbg_port0_int_data = port0_int_data;
+  assign dbg_port1_int_we = port1_int_we;
+  assign dbg_port1_int_addr = port1_int_addr;
+  assign dbg_port1_int_data = port1_int_data;
+  assign dbg_commit_dest_valid = rob_commit_comb.dest_valid;
+  assign dbg_commit_dest_rf = rob_commit_comb.dest_rf;
+  assign dbg_commit_dest_reg = rob_commit_comb.dest_reg;
+  assign dbg_commit_value = rob_commit_comb.value[XLEN-1:0];
+  assign dbg_commit_2_valid = rob_commit_comb_2.valid;
+  assign dbg_commit_2_pc = rob_commit_comb_2.pc;
+  assign dbg_commit_2_dest_valid = rob_commit_comb_2.dest_valid;
+  assign dbg_commit_2_dest_rf = rob_commit_comb_2.dest_rf;
+  assign dbg_commit_2_dest_reg = rob_commit_comb_2.dest_reg;
+  assign dbg_commit_2_value = rob_commit_comb_2.value[XLEN-1:0];
+  assign dbg_rob_commit_reg_valid = rob_commit.valid;
+  assign dbg_rob_commit_reg_pc = rob_commit.pc;
+  assign dbg_rob_commit_reg_dest_valid = rob_commit.dest_valid;
+  assign dbg_rob_commit_reg_dest_rf = rob_commit.dest_rf;
+  assign dbg_rob_commit_reg_dest_reg = rob_commit.dest_reg;
+  assign dbg_rob_commit_reg_value = rob_commit.value[XLEN-1:0];
+  assign dbg_rob_commit_2_reg_valid = rob_commit_2.valid;
+  assign dbg_rob_commit_2_reg_pc = rob_commit_2.pc;
+  assign dbg_rob_commit_2_reg_dest_valid = rob_commit_2.dest_valid;
+  assign dbg_rob_commit_2_reg_dest_rf = rob_commit_2.dest_rf;
+  assign dbg_rob_commit_2_reg_dest_reg = rob_commit_2.dest_reg;
+  assign dbg_rob_commit_2_reg_value = rob_commit_2.value[XLEN-1:0];
+`endif
+
   // DEBUG: verify early recovery redirect_pc matches commit-time redirect_pc
   // (Disabled for performance — re-enable for debugging.)
   // always @(posedge i_clk) begin
@@ -1858,6 +1957,25 @@ module cpu_ooo #(
     endcase
   end
 
+  // ECALL cause is privilege-dependent (U-mode = 8, M-mode = 11). The FU shim
+  // tags every ECALL as ExcEcallMmode (it has no architectural privilege), so
+  // remap at commit using the current privilege. csr_file writes this to mcause
+  // -- the load-bearing path. It is also fed to trap_unit.i_exception_cause for
+  // symmetry, though FROST does not vector mtvec on synchronous-exception causes
+  // (only interrupts vector) and trap_unit's own o_trap_cause is unused. The
+  // csr_trap_value (mtval) mux above intentionally keeps the ORIGINAL cause
+  // (ECALL mtval is 0 either way).
+  //
+  // SAFE against the cause==11 / IntMachineExternal (0x8000_000B) low-bit
+  // collision: rob_trap_cause carries synchronous-exception causes ONLY (ROB
+  // o_trap_cause = head_exc_cause; the ROB's i_interrupt_pending is WFI-wakeup
+  // only, never a cause source), so a value of 11 here is unambiguously an
+  // M-mode ECALL.
+  assign rob_trap_cause_remapped =
+      ((rob_trap_cause == riscv_pkg::ExcEcallMmode[riscv_pkg::ExcCauseWidth-1:0]) &&
+       (csr_priv == riscv_pkg::PrivU)) ?
+      riscv_pkg::ExcEcallUmode[riscv_pkg::ExcCauseWidth-1:0] : rob_trap_cause;
+
   csr_file #(
       .XLEN(XLEN)
   ) csr_file_inst (
@@ -1874,8 +1992,11 @@ module cpu_ooo #(
       .i_interrupts(i_interrupts),
       .i_mtime(i_mtime),
       .i_trap_taken(trap_taken),
-      .i_trap_pc(rob_trap_pc),
-      .i_trap_cause({{(XLEN - $bits(rob_trap_cause)) {1'b0}}, rob_trap_cause}),
+      .i_trap_pc(trap_pc_internal),
+      // mcause from trap_unit's arbitrated cause: interrupt cause (with the
+      // interrupt bit) for interrupts, or the remapped exception cause (which
+      // carries the U-mode ECALL remap via trap_unit.i_exception_cause below).
+      .i_trap_cause(trap_cause_internal),
       .i_trap_value(csr_trap_value),
       .i_mret_taken(mret_taken),
       .o_mstatus(csr_mstatus),
@@ -1883,6 +2004,7 @@ module cpu_ooo #(
       .o_mtvec(csr_mtvec),
       .o_mepc(csr_mepc),
       .o_mstatus_mie_direct(csr_mstatus_mie_direct),
+      .o_priv(csr_priv),
       // FP flags: accumulated from ROB commit
       .i_fp_flags(rob_commit_fp_flags_merged),
       .i_fp_flags_valid(rob_commit_any_fp_flags_valid),
@@ -1920,7 +2042,93 @@ module cpu_ooo #(
   assign interrupt_pending = i_interrupts.meip || i_interrupts.mtip || i_interrupts.msip;
 
   logic [XLEN-1:0] trap_target_internal, trap_pc_internal;
-  logic [XLEN-1:0] trap_cause_internal, trap_value_internal;
+  logic [XLEN-1:0] trap_value_internal;
+  logic [XLEN-1:0] interrupt_resume_pc;
+
+  function automatic logic [XLEN-1:0] retired_next_pc(
+      input riscv_pkg::reorder_buffer_commit_t commit);
+    logic [XLEN-1:0] step;
+    begin
+      step = commit.is_compressed ? {{(XLEN - 2) {1'b0}}, 2'b10} : {{(XLEN - 3) {1'b0}}, 3'b100};
+      if (commit.is_branch || commit.is_mret) begin
+        retired_next_pc = commit.redirect_pc;
+      end else begin
+        retired_next_pc = commit.pc + step;
+      end
+    end
+  endfunction
+
+  always_ff @(posedge i_clk) begin
+    if (i_rst) begin
+      interrupt_resume_pc <= '0;
+    end else if (mret_taken) begin
+      // An MRET retires through the trap/MRET full flush, NOT the normal commit
+      // path: the cycle after o_mret_taken, flush_all (from mret_taken_reg)
+      // wipes the ROB head and gates commit_en, so the MRET never appears on
+      // rob_commit_valid_raw and never updates interrupt_resume_pc via the
+      // branches below. Without this seed, interrupt_resume_pc keeps the
+      // architectural next-PC of the instruction *before* the MRET -- which is
+      // the MRET instruction's own PC -- for the entire MRET-to-U window (until
+      // the first post-MRET instruction commits). A machine interrupt taken
+      // after privilege drops below M (eligible once the trap_unit inhibit
+      // lifts, ~2 cycles later, long before that first commit) would then save
+      // mepc = <MRET PC>, an M-mode handler address, which Linux later restores
+      // and MRETs to illegally in U-mode (the ret_from_exception 0x80388bba
+      // panic). Seed the resume PC from the MRET target (mepc, == the MRET
+      // redirect target) now so it is already correct before the inhibit
+      // window closes. csr_mepc is stable here: MRET does not write mepc and
+      // cannot coincide with a trap entry that would.
+      interrupt_resume_pc <= csr_mepc;
+    end else if (rob_commit_2_valid_raw) begin
+      // TIMING: identical value to retired_next_pc(rob_commit_comb_2) in every
+      // cycle this arm is taken (checked below in simulation), but the ROB
+      // precomputes it from ungated head+1 fields so the PC RAM read + 32-bit
+      // add do not sit behind the late commit gating.
+      interrupt_resume_pc <= rob_head_next_retired_next_pc;
+    end else if (rob_commit_valid_raw) begin
+      // TIMING: identical value to retired_next_pc(rob_commit_comb); see above.
+      interrupt_resume_pc <= rob_head_retired_next_pc;
+    end else if (rob_head_is_wfi && head_valid) begin
+      // Bug#2 (drain-gated WFI mepc): while a WFI stalls at the ROB head, the
+      // architectural resume PC is always wfi_pc+4 (WFI never redirects). Seed it
+      // here so that if a machine interrupt is taken at the WFI -- including the
+      // narrow window where a committed store finishes draining and take_trap
+      // fires the same cycle, before the WFI's own commit can advance
+      // interrupt_resume_pc -- mepc is the spec-required wfi_pc+4 rather than the
+      // pre-WFI instruction's next-PC (== wfi_pc). Lowest priority: a real commit
+      // (incl. a dual-commit retiring the WFI and its successor) always wins, and
+      // WFI is never compressed so +4 is exact. Mirrors the mret_taken seed above.
+      interrupt_resume_pc <= rob_trap_pc + 32'd4;
+    end
+  end
+
+`ifndef SYNTHESIS
+  // Equivalence check for the ROB retired-next-PC precompute: whenever a
+  // commit fires, the precomputed value must match the original
+  // retired_next_pc() derivation from the (gated) commit payload.
+  always @(posedge i_clk) begin
+    if (!i_rst) begin
+      if (rob_commit_valid_raw && rob_head_retired_next_pc != retired_next_pc(
+              rob_commit_comb
+          )) begin
+        $error("cpu_ooo: rob_head_retired_next_pc %08x != retired_next_pc(commit) %08x",
+               rob_head_retired_next_pc, retired_next_pc(rob_commit_comb));
+      end
+      if (rob_commit_2_valid_raw && rob_head_next_retired_next_pc != retired_next_pc(
+              rob_commit_comb_2
+          )) begin
+        $error("cpu_ooo: rob_head_next_retired_next_pc %08x != retired_next_pc(commit_2) %08x",
+               rob_head_next_retired_next_pc, retired_next_pc(rob_commit_comb_2));
+      end
+    end
+  end
+`endif
+
+  // A same-cycle store-like ROB commit is not yet in the SQ committed set.
+  // If a trap full-flushes here, the registered commit can be masked before
+  // SQ observes it. Delay trap/MRET one cycle so SQ can own and drain it.
+  assign sq_committed_empty_for_trap =
+      sq_committed_empty && !rob_commit_store_like_raw && !rob_commit_2_store_like_raw;
 
   trap_unit #(
       .XLEN(XLEN)
@@ -1928,19 +2136,23 @@ module cpu_ooo #(
       .i_clk,
       .i_rst,
       .i_pipeline_stall(1'b0),  // OOO: no stall for trap check
-      .i_sq_committed_empty(sq_committed_empty),
+      .i_sq_committed_empty(sq_committed_empty_for_trap),
       .o_trap_drain_wait(trap_drain_wait),
       .i_mstatus(csr_mstatus),
       .i_mie(csr_mie),
       .i_mtvec(csr_mtvec),
       .i_mepc(csr_mepc),
       .i_mstatus_mie_direct(csr_mstatus_mie_direct),
+      .i_priv(csr_priv),
       .i_interrupts(i_interrupts),
       // Exception from ROB commit
       .i_exception_valid(trap_pending),
-      .i_exception_cause({{(XLEN - $bits(rob_trap_cause)) {1'b0}}, rob_trap_cause}),
+      .i_exception_cause({
+        {(XLEN - $bits(rob_trap_cause_remapped)) {1'b0}}, rob_trap_cause_remapped
+      }),
       .i_exception_tval('0),
       .i_exception_pc(rob_trap_pc),
+      .i_interrupt_pc(interrupt_resume_pc),
       .i_mret_start(mret_start),
       .i_wfi_start(1'b0),  // WFI handled by ROB serialization
       .o_trap_taken(trap_taken),
@@ -1972,6 +2184,15 @@ module cpu_ooo #(
   assign rob_trap_taken_ack = trap_taken_reg;
   assign mret_done_ack = mret_taken_reg;
 
+  // Passive on-silicon debug tap for the top-level hang triage UART. Packed as:
+  // [5]=mret, [4]=trap, [3:2]=priv, [1]=mstatus.MIE, [0]=mie.MTIE.
+  assign o_debug_irq_status = {
+    mret_taken, trap_taken, csr_priv, csr_mstatus_mie_direct, csr_mie[riscv_pkg::MieMtiBit]
+  };
+  assign o_debug_commit_pc = rob_commit.pc;
+  assign o_debug_commit_2_pc = rob_commit_2.pc;
+  assign o_debug_commit_valid = {rob_commit_2.valid, rob_commit.valid};
+
   // ===========================================================================
   // Profiling Counter Aggregation
   // ===========================================================================
diff --git a/hw/rtl/cpu_and_mem/cpu/csr/csr_file.sv b/hw/rtl/cpu_and_mem/cpu/csr/csr_file.sv
index 73acfdc8..15c783e6 100644
--- a/hw/rtl/cpu_and_mem/cpu/csr/csr_file.sv
+++ b/hw/rtl/cpu_and_mem/cpu/csr/csr_file.sv
@@ -15,7 +15,7 @@
  */
 
 /*
-  CSR (Control and Status Register) File for RISC-V Zicsr + Zicntr + Machine-mode + F extensions.
+  CSR (Control and Status Register) File for RISC-V Zicsr + Zicntr + Machine/User-mode + F extensions.
 
   This module implements:
 
@@ -31,9 +31,9 @@
     - instret/instreth (0xC02/0xC82): Instructions retired counter (64-bit)
     - minstret/minstreth (0xB02/0xB82): Machine-mode alias for instret counter
 
-  Machine-mode CSRs (for trap/interrupt handling):
-    - mstatus (0x300): Machine status (MIE, MPIE bits)
-    - misa (0x301): Machine ISA (read-only, reports RV32IMAFB)
+  Machine-mode CSRs (for trap/interrupt handling; M and U privilege modes):
+    - mstatus (0x300): Machine status (MIE, MPIE bits; MPP WARL field {M, U}; MPRV bit, inert)
+    - misa (0x301): Machine ISA (read-only, reports RV32GCB + U: 0x4010_112F)
     - mie (0x304): Machine interrupt enable (MEIE, MTIE, MSIE)
     - mtvec (0x305): Machine trap vector base address
     - mscratch (0x340): Machine scratch register
@@ -67,7 +67,7 @@ module csr_file #(
     // counter must increment by the retire count.
     input logic [1:0] i_instruction_retired_count,
 
-    // Interrupt pending inputs (directly from peripherals)
+    // Interrupt pending inputs (meip/mtip registered upstream in cpu_and_mem; msip direct)
     input riscv_pkg::interrupt_t i_interrupts,
 
     // mtime input (from memory-mapped timer)
@@ -91,6 +91,11 @@ module csr_file #(
     // Direct output of mstatus MIE bit for timing and simpler consumers.
     output logic o_mstatus_mie_direct,
 
+    // Current privilege mode (PrivM/PrivU): consumed by trap_unit (interrupt
+    // enable while in U) and the commit-time ECALL cause select. Changes only
+    // on trap entry and MRET.
+    output logic [1:0] o_priv,
+
     // F extension: FP exception flags from FPU (to accumulate in fflags)
     input riscv_pkg::fp_flags_t i_fp_flags,
     input logic i_fp_flags_valid,  // Valid when FP instruction retires (gated by o_vld)
@@ -140,8 +145,14 @@ module csr_file #(
   // do not require read/modify/write of the full CSR word.
   logic            mstatus_mie;  // Machine Interrupt Enable (bit 3)
   logic            mstatus_mpie;  // Machine Previous Interrupt Enable (bit 7)
-  logic [XLEN-1:0] mstatus;  // Constructed from mie and mpie
-  assign mstatus = {19'b0, 2'b11, 3'b0, mstatus_mpie, 3'b0, mstatus_mie, 3'b0};
+  logic [     1:0] mstatus_mpp;  // Previous Privilege [12:11]; WARL {PrivM,PrivU}
+  logic            mstatus_mprv;  // Modify PRiV (bit 17); stored but inert (no PMP/MMU)
+  logic [     1:0] priv_q;  // Current privilege mode (resets to PrivM)
+  logic [XLEN-1:0] mstatus;  // Constructed from the fields above
+  assign mstatus = {
+    14'b0, mstatus_mprv, 4'b0, mstatus_mpp, 3'b0, mstatus_mpie, 3'b0, mstatus_mie, 3'b0
+  };
+  assign o_priv = priv_q;
 
   // mie CSR: store each interrupt enable as separate register
   logic mie_msie;  // Machine Software Interrupt Enable (bit 3)
@@ -153,6 +164,9 @@ module csr_file #(
   // Next-state signals for mstatus bits (computed combinationally)
   logic next_mstatus_mie;
   logic next_mstatus_mpie;
+  logic [1:0] next_mstatus_mpp;
+  logic next_mstatus_mprv;
+  logic [1:0] next_priv;
   // Next-state signals for mie bits
   logic next_mie_msie;
   logic next_mie_mtie;
@@ -169,10 +183,11 @@ module csr_file #(
   logic [XLEN-1:0] mip;
   assign mip = {20'b0, i_interrupts.meip, 3'b0, i_interrupts.mtip, 3'b0, i_interrupts.msip, 3'b0};
 
-  // misa is read-only: RV32IMAFB
-  // Bit 0 (A), Bit 1 (B), Bit 5 (F), Bit 8 (I), Bit 12 (M) = 0x0000_1123
+  // misa is read-only: RV32IMAFDC + B + U (= RV32GCB with User mode)
+  // Bit 0 (A), Bit 1 (B), Bit 2 (C), Bit 3 (D), Bit 5 (F), Bit 8 (I), Bit 12 (M),
+  // Bit 20 (U) = 0x0010_112F
   // MXL = 1 (32-bit) in bits [31:30]
-  localparam logic [XLEN-1:0] MisaValue = 32'h4000_1123;
+  localparam logic [XLEN-1:0] MisaValue = 32'h4010_112F;
 
   // Output CSRs for trap unit
   assign o_mstatus = mstatus;
@@ -237,12 +252,46 @@ module csr_file #(
   // ==========================================================================
   // Instructions Retired Counter
   // ==========================================================================
+  // TIMING RETIME (+1 cycle, architecturally invisible — analysis below):
+  // the per-cycle retire count arrives late (its !trap_taken suppression sits
+  // at the end of the commit/trap serialization cone) and previously entered
+  // the LSB of a 64-bit carry chain, making instret[63]/D the post-opt WNS
+  // (-0.94 ns at 300 MHz).  Stage the FULLY-GATED count through
+  // instruction_retired_count_q so the late cone terminates at a 2-bit
+  // register; the 64-bit add then runs register-to-register.
+  //
+  // Invariant: instret_counter at cycle T equals the total retire count
+  // through cycle T-2 (one staging cycle) instead of T-1.  Architecturally
+  // invisible because the ONLY observation of instret is a CSR read of
+  // instret/instreth/minstret{,h}, and CSR reads are commit-serialized:
+  //   cycle C:   the youngest instruction OLDER than the CSR read commits
+  //              (commit_en); its count is computed at C+1 from the
+  //              REGISTERED commit bus (commit_actions), staged into
+  //              instruction_retired_count_q at the C+1->C+2 edge, and
+  //              accumulated into instret_counter at the C+2->C+3 edge;
+  //   cycle C+1: the CSR reaches the ROB head; rob_serializer asserts
+  //              commit_stall and requests CSR execution (o_csr_start);
+  //   cycle C+2: earliest csr_done_ack (1-cycle handshake in cpu_ooo) ->
+  //              earliest CSR commit_en;
+  //   cycle C+3: csr_commit_fire (registered commit) performs the actual
+  //              csr_file read -> observes a counter that already includes
+  //              cycle C's commits.
+  // Every stall (head not ready, commit_hold, later csr_done) only adds
+  // margin, and the reading instruction itself is never included — exactly
+  // as in the un-retimed design, whose own count also landed after the read.
+  // The staged count preserves the !trap_taken suppression bit-for-bit (the
+  // gated count is registered as-is: the same instructions are counted, one
+  // cycle later).  Proven in the FORMAL section (p_instret_stage_follows /
+  // p_instret_applies_staged_count).
+  logic [1:0] instruction_retired_count_q;
 
   always_ff @(posedge i_clk) begin
     if (i_rst) begin
+      instruction_retired_count_q <= 2'd0;
       instret_counter <= 64'd0;
     end else begin
-      instret_counter <= instret_counter + 64'(i_instruction_retired_count);
+      instruction_retired_count_q <= i_instruction_retired_count;
+      instret_counter <= instret_counter + 64'(instruction_retired_count_q);
     end
   end
 
@@ -318,22 +367,35 @@ module csr_file #(
     // Default: keep current values
     next_mstatus_mie = mstatus_mie;
     next_mstatus_mpie = mstatus_mpie;
+    next_mstatus_mpp = mstatus_mpp;
+    next_mstatus_mprv = mstatus_mprv;
+    next_priv = priv_q;
     next_mie_msie = mie_msie;
     next_mie_mtie = mie_mtie;
     next_mie_meie = mie_meie;
 
     if (i_trap_taken) begin
-      // Trap entry: save MIE to MPIE, clear MIE
+      // Trap entry: save MIE->MPIE, clear MIE, save priv->MPP, enter M-mode.
       next_mstatus_mpie = mstatus_mie;
       next_mstatus_mie  = 1'b0;
+      next_mstatus_mpp  = priv_q;
+      next_priv         = riscv_pkg::PrivM;
     end else if (i_mret_taken) begin
-      // MRET: restore MIE from MPIE, set MPIE to 1
+      // MRET: restore MIE<-MPIE, MPIE=1, return to MPP's privilege, set MPP=U,
+      // and clear MPRV if returning below M (per the privileged spec).
       next_mstatus_mie  = mstatus_mpie;
       next_mstatus_mpie = 1'b1;
+      next_priv         = mstatus_mpp;
+      if (mstatus_mpp != riscv_pkg::PrivM) next_mstatus_mprv = 1'b0;
+      next_mstatus_mpp = riscv_pkg::PrivU;
     end else if (i_csr_write_enable && i_csr_read_enable) begin
       if (i_csr_address == riscv_pkg::CsrMstatus) begin
-        next_mstatus_mie  = csr_new_value[3];
+        next_mstatus_mie = csr_new_value[3];
         next_mstatus_mpie = csr_new_value[7];
+        // MPP is WARL: FROST implements only M and U, so fold S/reserved -> U.
+        next_mstatus_mpp  = (csr_new_value[12:11] == riscv_pkg::PrivM) ?
+            riscv_pkg::PrivM : riscv_pkg::PrivU;
+        next_mstatus_mprv = csr_new_value[17];
       end else if (i_csr_address == riscv_pkg::CsrMie) begin
         next_mie_msie = csr_new_value[3];
         next_mie_mtie = csr_new_value[7];
@@ -348,12 +410,18 @@ module csr_file #(
     if (i_rst) begin
       mstatus_mie <= 1'b0;
       mstatus_mpie <= 1'b0;
+      mstatus_mpp <= riscv_pkg::PrivU;
+      mstatus_mprv <= 1'b0;
+      priv_q <= riscv_pkg::PrivM;
       mie_msie <= 1'b0;
       mie_mtie <= 1'b0;
       mie_meie <= 1'b0;
     end else begin
       mstatus_mie <= next_mstatus_mie;
       mstatus_mpie <= next_mstatus_mpie;
+      mstatus_mpp <= next_mstatus_mpp;
+      mstatus_mprv <= next_mstatus_mprv;
+      priv_q <= next_priv;
       mie_msie <= next_mie_msie;
       mie_mtie <= next_mie_mtie;
       mie_meie <= next_mie_meie;
@@ -525,9 +593,18 @@ module csr_file #(
       // Cycle counter increments every cycle (not in reset).
       p_cycle_increments : assert (cycle_counter == $past(cycle_counter) + 64'd1);
 
-      // Instret increments by the retire count (0, 1, or 2 per cycle).
-      p_instret_increments :
-      assert (instret_counter == $past(instret_counter) + 64'($past(i_instruction_retired_count)));
+      // Instret retime invariants (see the Instructions Retired Counter
+      // comment): the staging register follows the input by one cycle, and
+      // the accumulator applies the staged count.  Composed:
+      //   instret_counter(T) == instret_counter(T-1) + retired_count(T-2)
+      // i.e. instret equals the running total of retired instructions delayed
+      // by exactly one staging cycle; the delay is architecturally invisible
+      // because commit-serialized CSR reads sample the counter no earlier
+      // than <last counted commit> + 3 cycles.
+      p_instret_stage_follows :
+      assert (instruction_retired_count_q == $past(i_instruction_retired_count));
+      p_instret_applies_staged_count :
+      assert (instret_counter == $past(instret_counter) + 64'($past(instruction_retired_count_q)));
 
       // fflags sticky: when no CSR write to fflags/fcsr and no effective fp_flags_valid,
       // fflags does not shrink.
@@ -542,6 +619,7 @@ module csr_file #(
       if ($past(i_rst)) begin
         p_reset_cycle : assert (cycle_counter == 64'd0);
         p_reset_instret : assert (instret_counter == 64'd0);
+        p_reset_instret_stage : assert (instruction_retired_count_q == 2'd0);
         p_reset_mie : assert (!mstatus_mie);
         p_reset_mpie : assert (!mstatus_mpie);
         p_reset_fflags : assert (fflags == 5'b0);
diff --git a/hw/rtl/cpu_and_mem/cpu/ex_stage/alu/alu.sv b/hw/rtl/cpu_and_mem/cpu/ex_stage/alu/alu.sv
index fc482584..bfcc3631 100644
--- a/hw/rtl/cpu_and_mem/cpu/ex_stage/alu/alu.sv
+++ b/hw/rtl/cpu_and_mem/cpu/ex_stage/alu/alu.sv
@@ -164,7 +164,7 @@ module alu #(
       // Use pre-computed link address from IF stage (PC+2 for compressed, PC+4 for 32-bit)
       riscv_pkg::JAL: o_result = i_link_address;
       riscv_pkg::JALR: o_result = i_link_address;
-      // M-extension multiply operations (1-cycle registered, requires stall)
+      // M-extension multiply operations (4-cycle pipelined multiplier, requires stall until o_valid_output)
       riscv_pkg::MUL: begin
         // Start multiply if not already in progress; use lower 32 bits of result
         multiplier_valid_input = ~multiplier_valid_input_registered;
diff --git a/hw/rtl/cpu_and_mem/cpu/ex_stage/alu/divider.sv b/hw/rtl/cpu_and_mem/cpu/ex_stage/alu/divider.sv
index 8a615221..a25182d0 100644
--- a/hw/rtl/cpu_and_mem/cpu/ex_stage/alu/divider.sv
+++ b/hw/rtl/cpu_and_mem/cpu/ex_stage/alu/divider.sv
@@ -102,7 +102,7 @@ module divider #(
     remainder_should_be_negative = dividend_is_negative;
   end
 
-  // 2x-folded radix-2 division requires one pipeline stage per 2 bits (32 stages for 32-bit)
+  // 2x-folded radix-2 division requires one pipeline stage per 2 bits (16 stages for 32-bit)
   localparam int unsigned NumPipelineStages = WIDTH / 2;
 
   // Pipeline arrays for each stage - carry values through division process
diff --git a/hw/rtl/cpu_and_mem/cpu/id_stage/instr_decoder.sv b/hw/rtl/cpu_and_mem/cpu/id_stage/instr_decoder.sv
index 34be8e3f..68351bcc 100644
--- a/hw/rtl/cpu_and_mem/cpu/id_stage/instr_decoder.sv
+++ b/hw/rtl/cpu_and_mem/cpu/id_stage/instr_decoder.sv
@@ -15,7 +15,7 @@
  */
 
 /*
-  Instruction decoder for RISC-V RV32IMAFB + Zicsr + Machine-mode privileged.
+  Instruction decoder for RISC-V RV32GCB + Zicsr + M/U-mode privileged.
   B extension = Zba + Zbb + Zbs (full bit manipulation).
   F extension = Single-precision floating-point.
   This combinational module decodes 32-bit RISC-V instructions into control signals
diff --git a/hw/rtl/cpu_and_mem/cpu/if_stage/if_stage.sv b/hw/rtl/cpu_and_mem/cpu/if_stage/if_stage.sv
index 4edf58a4..43aac51f 100644
--- a/hw/rtl/cpu_and_mem/cpu/if_stage/if_stage.sv
+++ b/hw/rtl/cpu_and_mem/cpu/if_stage/if_stage.sv
@@ -26,7 +26,7 @@
  *   ├── pc_controller               PC management, next-PC selection
  *   │   └── control_flow_tracker        Holdoff signal generation
  *   ├── branch_prediction/          Branch prediction subsystem
- *   │   ├── branch_predictor            32-entry BTB (combinational lookup)
+ *   │   ├── branch_predictor            256-entry BTB (combinational lookup)
  *   │   ├── branch_prediction_controller  Prediction gating and registration
  *   │   └── prediction_metadata_tracker   Stall/spanning metadata handling
  *   └── c_extension/                Compressed instruction subsystem
@@ -66,7 +66,7 @@
  * =========
  *   - RISC-V C extension support (compressed 16-bit instructions)
  *   - Handles 32-bit instructions spanning two memory words (PC[1]=1)
- *   - Branch prediction with 32-entry BTB
+ *   - Branch prediction with 256-entry BTB
  *   - Outputs raw parcel + selection signals for PD stage decompression
  *
  * TIMING OPTIMIZATION:
@@ -86,6 +86,7 @@ module if_stage #(
     input logic [63:0] i_instr,  // 64-bit fetch: {next_word, current_word}
     input logic [riscv_pkg::ImemFetchSidebandWidth-1:0] i_instr_sideband,
     input logic i_instr_bank_sel_r,  // Fetch-word parity (PC[2] from fetch cycle)
+    input logic [XLEN-1:0] i_served_addr,  // Served fetch-window tag (full address)
     // Fetch window valid: the {i_instr, i_instr_sideband, i_instr_bank_sel_r}
     // window corresponds to the fetch address presented last cycle.  When low
     // (variable-latency provider: L1I miss / fuzz), IF emits NOP bubbles,
@@ -506,6 +507,8 @@ module if_stage #(
 
       .i_pd_redirect(i_pd_redirect),
       .i_pd_redirect_target(i_pd_redirect_target),
+      .i_window_cannot_serve(window_resteer_pc_reg),
+      .i_window_cannot_serve_raw(window_cannot_serve_pc_reg),
 
       .i_trap_taken (i_trap_ctrl.trap_taken),
       .i_mret_taken (i_trap_ctrl.mret_taken),
@@ -742,13 +745,55 @@ module if_stage #(
   // stall-held, so a stall covering the bubble cycle let the bubble
   // present-and-dispatch on release alongside the realigned repeat. Fixed
   // by stall-gating pd_redirect_q (see its always_ff above).
-  assign sel_nop = i_pipeline_ctrl.flush || flush_for_c_ext_safe || !fetch_progress ||
+  // Served-window invariant: the fetched 64-bit window covers exactly the two
+  // words {word(i_served_addr), word(i_served_addr)+1}.  pc_reg must lie in that
+  // window or the 1-bit bank-sel parity in instruction_aligner silently selects
+  // the wrong word -> wrong instruction-size sample -> pc_reg advances onto a
+  // mid-instruction byte (the workqueue_init_early epc 0x8038d7fa boot Oops).
+  // A fetch stall (L1I line-fill) can leave the served window >1 word from
+  // pc_reg, which the single parity bit cannot represent.  Detect it from the
+  // full served address; pc_controller squashes (sel_nop below), holds pc_reg,
+  // and resteers fetch onto pc_reg's word until the correct window is served.
+  logic signed [XLEN-1:0] served_word_delta;
+  assign served_word_delta = $signed(
+      {2'b00, i_served_addr[XLEN-1:2]}
+  ) - $signed(
+      {2'b00, pc_reg[XLEN-1:2]}
+  );
+  logic window_cannot_serve_pc_reg;
+  // Gated to the cached region (pc_reg[XLEN-1], i.e. >= CACHED_BASE): the low BRAM
+  // fetch path is fixed 1-cycle/always-valid and never desyncs, and its served-addr
+  // tracking is approximate -- firing there only causes spurious squashes.
+  assign window_cannot_serve_pc_reg = i_instr_valid && pc_reg[XLEN-1] &&
+      (served_word_delta != $signed(
+      0
+  )) && (served_word_delta != -$signed(
+      1
+  )) && !((served_word_delta == $signed(
+      1
+  )) && use_instr_buffer);
+
+  // The existing (pre-served-window-guard) squash conditions.
+  logic sel_nop_existing;
+  assign sel_nop_existing = i_pipeline_ctrl.flush ||
+                   flush_for_c_ext_safe || !fetch_progress ||
                    sel_nop_align || reset_holdoff ||
                    pending_prediction_target_holdoff ||
                    (pending_prediction_fetch_holdoff && !prediction_holdoff) ||
                    (control_flow_holdoff &&
                     (!prediction_holdoff || pd_redirect_q || slot2_redirect_q));
 
+  // Resteer fetch onto pc_reg's word + hold pc_reg ONLY at a real consume cycle
+  // (not during an existing holdoff, where pc_reg is already managed and a resteer
+  // would thrash the front end -- the cause of the earlier isa_test/boot regression).
+  // At a holdoff release with the served window still stale (fetch ran ahead during
+  // the redirect bubble), this fires the cycle the wrong-word decode would otherwise
+  // advance pc_reg onto a mid-instruction byte.
+  logic window_resteer_pc_reg;
+  assign window_resteer_pc_reg = window_cannot_serve_pc_reg && !sel_nop_existing;
+
+  assign sel_nop = sel_nop_existing || window_cannot_serve_pc_reg;
+
   // ===========================================================================
   // Stall State Registers
   // ===========================================================================
@@ -951,11 +996,35 @@ module if_stage #(
   logic [XLEN-1:0] instruction_pc;
   logic [XLEN-1:0] link_address;
 
-  // Use the same stall-safe compressed selection metadata that PD consumes.
-  // This keeps link_address aligned with the actual instruction that will be
-  // seen downstream, including prediction/stall replay cases.
+  // link_address (the fall-through PC) must reflect the TRUE size of the slot-1
+  // instruction held across a stall.  The shared sel_compressed_sc is flush-zeroed
+  // by its stall_capture_reg (stall_capture_reg.sv: `if (i_flush) saved <= '0`),
+  // so on a flush-inside-stall a *compressed* branch held at fetch reads
+  // is_compressed_for_link = 0 -> link_address = pc_reg + 4 (one halfword too far).
+  // That stale fall-through link is then consumed as the not-taken redirect target
+  // (early_misprediction_recovery: `... : rs_issue_int.link_addr`), making fetch
+  // skip the branch's successor parcel.  This is the no-MMU-Linux timer-IRQ
+  // "gremlin": the revmap_size load (`lw a5,80(a0)`) right after a not-taken
+  // `c.beqz` is dropped, so the dependent `bgeu a1,a5` reads a stale a5 and takes
+  // the wrong IRQ-dispatch path.  Capture sel_compressed for the link WITHOUT the
+  // flush-zero so the held size matches the actual held instruction (pc_reg+2/+4
+  // correctly).  sel_compressed_sc's other consumers (o_from_if_to_pd.sel_compressed,
+  // slot2_pc_sc) are replay-gated by sel_nop_saved=1 after a flush, so they are
+  // unaffected; only this link path reads the captured bit in the post-flush window.
   logic is_compressed_for_link;
-  assign is_compressed_for_link = sel_compressed_sc;
+  logic sel_compressed_for_link_sc;
+  stall_capture_reg #(
+      .WIDTH(1)
+  ) u_sel_compressed_for_link_sc (
+      .i_clk,
+      .i_reset(1'b0),
+      .i_flush(1'b0),
+      .i_stall(if_stage_stall),
+      .i_stall_registered(if_stage_stall_registered),
+      .i_data(sel_compressed),
+      .o_data(sel_compressed_for_link_sc)
+  );
+  assign is_compressed_for_link = sel_compressed_for_link_sc;
 
   assign instruction_pc = pc_reg;
   assign link_address = instruction_pc + (is_compressed_for_link ?
@@ -1153,9 +1222,8 @@ module if_stage #(
   // Slot-2 IF→PD Packet (2-wide dispatch — Session F)
   // ===========================================================================
   // Slot-2 follows slot-1 sequentially in program order: PC and link address
-  // are simply slot-1's plus the slot-1 / slot-2 sizes.  No BTB lookup is
-  // performed for slot-2 (decision #3, single-port BTB on slot-1 PC) and no
-  // RAS prediction is consumed for slot-2 (decision #1: slot-2 is invalid
+  // are simply slot-1's plus the slot-1 / slot-2 sizes.  No RAS prediction
+  // is consumed for slot-2 (decision #1: slot-2 is invalid
   // when slot-1 is a branch, so slot-1 cannot have pushed/popped RAS in the
   // same cycle).
   //
diff --git a/hw/rtl/cpu_and_mem/cpu/if_stage/pc_controller.sv b/hw/rtl/cpu_and_mem/cpu/if_stage/pc_controller.sv
index 7882b477..2f03ef6e 100644
--- a/hw/rtl/cpu_and_mem/cpu/if_stage/pc_controller.sv
+++ b/hw/rtl/cpu_and_mem/cpu/if_stage/pc_controller.sv
@@ -51,8 +51,8 @@
     |                       v                                                 |
     |  +-----------------------------------------+                            |
     |  |     Final PC Mux (Priority Encoded)     |                            |
-    |  |  reset > trap > stall > branch >        |------------------> o_pc    |
-    |  |  prediction > sequential                |------------------> o_pc_reg|
+    |  |  reset > trap > fence.i > branch >      |------------------> o_pc    |
+    |  |  PD redirect > hold > prediction > seq. |------------------> o_pc_reg|
     |  +-----------------------------------------+                            |
     |                                                                         |
     +-------------------------------------------------------------------------+
@@ -61,11 +61,13 @@
   ==============
     1. Control flow tracking - Detect stale instruction cycles after redirects
     2. PC increment calculation - C-extension aware (+0, +2, or +4) [submodule]
-    3. Mid-32bit correction - Handle landing in middle of 32-bit instruction
+    3. Mid-32bit correction - DISABLED with 64-bit fetch (output tied to 0)
     4. Final PC selection - Priority mux with timing-optimized flat structure
 
-  All branches and jumps (JAL, JALR, conditional branches) are resolved in the EX
-  stage and come through the i_branch_taken/i_branch_target interface.
+  Branch/jump redirects (JAL, JALR, conditional branches) arrive on the
+  i_branch_taken/i_branch_target interface only on misprediction recovery
+  (early or commit-time, synthesized by ex_comb_synthesizer); correctly
+  predicted branches commit without a redirect here.
 */
 module pc_controller #(
     parameter int unsigned XLEN = 32
@@ -91,8 +93,13 @@ module pc_controller #(
     input logic [XLEN-1:0] i_branch_target,
 
     // PD backward-branch heuristic redirect (from pd_stage)
-    input logic            i_pd_redirect,
+    input logic i_pd_redirect,
     input logic [XLEN-1:0] i_pd_redirect_target,
+    input logic i_window_cannot_serve,  // Served window cannot hold pc_reg -> resteer+hold
+    // Raw window-cannot-serve (UNGATED by sel_nop) -- the exact gremlin DROP condition.
+    // Narrows the immediate-predecessor carve-out to fire ONLY when the load would
+    // actually be dropped (wcs=1), not at the ~50k benign wcs=0 dual-issue sites.
+    input logic i_window_cannot_serve_raw,
 
     // Trap control
     input logic            i_trap_taken,
@@ -323,6 +330,9 @@ module pc_controller #(
   logic [XLEN-1:0] pending_prediction_pc;
   logic [XLEN-1:0] pending_prediction_target;
   logic            pending_prediction_effective;
+  logic            pending_imm_pred_emit;
+  logic            pim_base;  // immediate-predecessor + pending (pre-narrowing)
+  logic            carve_out_engaged_q;  // latched: raw wcs=1 seen this episode
   logic            pending_prediction_from_buffer;
   logic            prediction_needs_pending;
   logic            use_pending_prediction_for_pc_reg;
@@ -407,7 +417,19 @@ module pc_controller #(
       pc_reg_next_bit1_for_prediction = o_pc_reg[1] ^ i_is_compressed;
     end
   end
-  assign pc_reg_next_misses_fetch_pc_for_prediction = pc_reg_next_bit1_for_prediction != o_pc[1];
+  // BOOT-HANG FIX (verification form): the bit1-only fast predictor
+  // (pc_reg_next_bit1_for_prediction != o_pc[1]) diverges from the full result
+  // when pc_reg is >=2 words behind the word-aligned fetch PC -- both are
+  // word-aligned so bit 1 matches, but the words differ. There the fast value
+  // is 0 ("no miss") while the truth (seq_next_pc_reg != o_pc) is 1, so
+  // prediction_needs_pending is wrongly false, the prediction is applied without
+  // the pc_reg handoff, and fetch redirects to the wrong PC (silent on HW where
+  // the assert below is compiled out -> the no-MMU Linux boot hang at pid_max).
+  // Use the full compare; conservative-safe (only ever pends MORE, exactly in
+  // the cases the bit1 proxy missed). NOTE: this reintroduces the
+  // seq_next_pc_reg compare on the prediction cone that the bit1 proxy existed
+  // to avoid -- a timing-friendly correct form is a follow-up if WNS regresses.
+  assign pc_reg_next_misses_fetch_pc_for_prediction = (seq_next_pc_reg != o_pc);
 
   assign prediction_needs_pending =
       i_prediction_used && !i_ras_predicted && !i_slot2_prediction_used &&
@@ -456,9 +478,63 @@ module pc_controller #(
   assign stale_pending_prediction = pending_prediction_effective &&
                                     !use_pending_prediction_for_pc_reg &&
                                     (pc_reg_hw > pending_prediction_pc_hw);
+  // GREMLIN fix (Option 1b): immediate-predecessor carve-out.  When a pending BTB
+  // prediction is in flight for a branch that is the COMPRESSED parcel immediately
+  // after pc_reg (pending_prediction_pc == o_pc_reg + 2) and pc_reg has NOT yet
+  // reached it (!use_pending, !stale), the parcel currently at pc_reg is a
+  // correct-path OLDER instruction that MUST execute (e.g. the no-MMU IRQ revmap_size
+  // load at 0x8005a19a sitting between the fetch point and the predicted bgeu at
+  // 0x8005a19c).  Without this, hold_pending_prediction_fetch squashes it (->
+  // o_pending_prediction_fetch_holdoff -> if_stage sel_nop) and the land-on-branch arm
+  // jumps pc_reg straight to pending_prediction_pc, DROPPING it.  pending_imm_pred_emit
+  // suppresses the fetch-holdoff squash + the land-on-branch jump so the parcel emits
+  // and pc_reg advances SEQUENTIALLY onto the branch.  pending_prediction_valid stays
+  // live, so the prediction still applies (metadata-replay path unchanged) once pc_reg
+  // reaches the branch.  This is the documented design intent of
+  // prediction_metadata_tracker ("IF keeps walking older instructions after a BTB
+  // redirect").
+  //
+  // LOOP-BREAK: the predicate uses ONLY registered state -- o_pc_reg and
+  // pending_prediction_pc + a constant.  An earlier form used seq_next_pc_reg, which
+  // depends on pc_reg_advance_sel -> sel_nop; combined with gate (a) feeding
+  // pending_imm_pred_emit BACK into sel_nop (via o_pending_prediction_fetch_holdoff)
+  // that closed a combinational cycle (Verilator "Active region did not converge" at
+  // ~16.6M, masked by -Wno-UNOPTFLAT).  o_pc_reg + PcIncrementCompressed is exactly the
+  // value seq_next_pc_reg held while the parcel was squashed (pc_reg_advance_sel_live
+  // DEFAULTS to +2 when sel_nop=1, if_stage.sv ~1297), so behaviour is preserved for
+  // the compressed immediate-predecessor (the observed gremlin) while the cycle is
+  // broken.  A 32-bit predecessor is intentionally NOT covered: it cannot be
+  // identified sel_nop-free here (the served instruction-size signals are unreliable
+  // under the coincident served-window guard) and the prior form did not cover it
+  // either (it too saw +2 during the squash), so the scope is unchanged.
+  // NARROWING: the base condition (pim_base, below) by itself fires ~50k times/boot, at
+  // wcs=0 dual-issue load+branch bundles where the load already emits -- and there, the
+  // carve-out clearing sel_nop makes pc_reg_advance_sel_live pick +4 (slot-2) so pc_reg
+  // jumps PAST the branch, mishandling the pending prediction -> stale-ra wild ret
+  // (of_prop_next_string 0x8021fcae).
+  assign pim_base =
+      pending_prediction_effective && !use_pending_prediction_for_pc_reg &&
+      !stale_pending_prediction &&
+      (pending_prediction_pc == (o_pc_reg + riscv_pkg::PcIncrementCompressed));
+  // NARROW to the true gremlin: the load is only DROPPED when the served window cannot
+  // deliver it (raw wcs=1).  But the load can only EMIT on the wcs=0 cycle (one after the
+  // resteer), so a plain "&& wcs" would drop pim exactly then and re-NOP the load.  Instead
+  // LATCH the engagement once wcs=1 is seen during the episode, and hold it until the
+  // episode ends (pc_reg reaches the branch -> pim_base falls) or any redirect.  This is
+  // NOT a pc_reg hold -- pim still advances pc_reg via the carve-out -- so it cannot
+  // deadlock.  At wcs=0 sites it never engages.  Acyclic: raw wcs is independent of sel_nop.
+  assign pending_imm_pred_emit = pim_base && (i_window_cannot_serve_raw || carve_out_engaged_q);
+  always_ff @(posedge i_clk) begin
+    if (i_reset || i_flush || i_trap_taken || i_mret_taken || i_branch_taken ||
+        i_pd_redirect || i_fence_i_flush || !pim_base) begin
+      carve_out_engaged_q <= 1'b0;
+    end else if (!fetch_stall && i_window_cannot_serve_raw) begin
+      carve_out_engaged_q <= 1'b1;
+    end
+  end
   assign hold_pending_prediction_fetch =
       pending_prediction_effective && !use_pending_prediction_for_pc_reg &&
-      !stale_pending_prediction;
+      !stale_pending_prediction && !pending_imm_pred_emit;
   assign hold_pending_prediction_consume_fetch =
       pending_prediction_effective && use_pending_prediction_for_pc_reg;
   // Keep a PC-mux-local copy of the pending-handoff cone so synthesis can
@@ -485,7 +561,7 @@ module pc_controller #(
   assign hold_pending_prediction_fetch_pc_mux =
       pending_prediction_effective &&
       !use_pending_prediction_for_pc_reg_pc_mux &&
-      !stale_pending_prediction_pc_mux;
+      !stale_pending_prediction_pc_mux && !pending_imm_pred_emit;
   assign hold_pending_prediction_consume_fetch_pc_mux =
       pending_prediction_effective &&
       use_pending_prediction_for_pc_reg_pc_mux;
@@ -607,6 +683,7 @@ module pc_controller #(
     else if (i_fence_i_flush) next_pc = i_fence_i_target;
     else if (i_branch_taken) next_pc = i_branch_target;
     else if (i_pd_redirect) next_pc = i_pd_redirect_target;
+    else if (i_window_cannot_serve) next_pc = {o_pc_reg[XLEN-1:2], 2'b00};
     // No fetch progress: hold the fetch address so the provider can keep
     // working on the owed ask.  Sits above the prediction/pending arms
     // (their state is frozen and predictions are suppressed while invalid)
@@ -670,6 +747,7 @@ module pc_controller #(
     else if (i_fence_i_flush) next_pc_reg = i_fence_i_target;
     else if (i_branch_taken) next_pc_reg = i_branch_target;
     else if (i_pd_redirect) next_pc_reg = i_pd_redirect_target;
+    else if (i_window_cannot_serve) next_pc_reg = o_pc_reg;
     // No fetch progress: hold the instruction address (nothing is being
     // delivered).  Same placement rationale as the next_pc hold arm above.
     else if (!i_fetch_progress) next_pc_reg = o_pc_reg;
@@ -684,8 +762,13 @@ module pc_controller #(
     // that bubble; advancing here pairs the arriving target word with the next
     // halfword PC and corrupts C-extension alignment on loop back-edges.
     else if (o_pending_prediction_target_holdoff) next_pc_reg = o_pc_reg;
+    // GREMLIN fix (Option 1b): suppress the land-on-branch JUMP in the immediate-
+    // predecessor carve-out so pc_reg advances SEQUENTIALLY (seq_next_pc_reg, which
+    // equals pending_prediction_pc here) and the intervening older parcel emits first
+    // instead of being skipped.  pending_prediction_valid stays live -> the target
+    // handoff (below) still fires when pc_reg actually reaches the branch.
     else if (pending_prediction_effective && !pending_prediction_allow_cross_pc_mux_q &&
-             !use_pending_prediction_for_pc_reg_pc_mux)
+             !use_pending_prediction_for_pc_reg_pc_mux && !pending_imm_pred_emit)
       next_pc_reg = pending_prediction_pc;
     else if (pending_prediction_cross_handoff_pc_mux) next_pc_reg = pending_prediction_pc;
     else if (pending_prediction_target_handoff_pc_mux) next_pc_reg = pending_prediction_target;
diff --git a/hw/rtl/cpu_and_mem/cpu/if_stage/pc_increment_calculator.sv b/hw/rtl/cpu_and_mem/cpu/if_stage/pc_increment_calculator.sv
index 24ee5f39..13934c9f 100644
--- a/hw/rtl/cpu_and_mem/cpu/if_stage/pc_increment_calculator.sv
+++ b/hw/rtl/cpu_and_mem/cpu/if_stage/pc_increment_calculator.sv
@@ -18,7 +18,7 @@
   PC Increment Calculator
 
   Computes the next sequential PC values using parallel adders for timing optimization.
-  This module pre-computes fetch PC increment results (pc+2, pc+4) in parallel,
+  This module pre-computes fetch PC increment results (pc+2 through pc+8) in parallel,
   then selects the correct result based on instruction type and state.
 
   Key Timing Optimization:
diff --git a/hw/rtl/cpu_and_mem/cpu/if_stage/pc_reg_precompute.sv b/hw/rtl/cpu_and_mem/cpu/if_stage/pc_reg_precompute.sv
index fb190938..c37b1ab2 100644
--- a/hw/rtl/cpu_and_mem/cpu/if_stage/pc_reg_precompute.sv
+++ b/hw/rtl/cpu_and_mem/cpu/if_stage/pc_reg_precompute.sv
@@ -17,7 +17,7 @@
 /*
  * PC Register Pre-computation
  *
- * Computes pc_reg + 0/2/4 in parallel and selects the result for both
+ * Computes pc_reg + 0/2/4/6 in parallel and selects the result for both
  * the "instruction is compressed" and "instruction is 32-bit" cases using
  * ONLY registered select signals.
  *
diff --git a/hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv b/hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv
index eb8b0893..6d09eaea 100644
--- a/hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv
+++ b/hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv
@@ -381,7 +381,7 @@ package riscv_pkg;
   // Section 3: CSR Definitions
   // ===========================================================================
   // Control and Status Register addresses, bit positions, and cause codes.
-  // Includes Zicsr instruction encodings and M-mode trap support.
+  // Includes Zicsr instruction encodings and M/U-mode trap support.
 
   // CSR instruction funct3 encoding
   typedef enum bit [2:0] {
@@ -475,6 +475,13 @@ package riscv_pkg;
   // mstatus bit positions (RV32)
   localparam int unsigned MstatusMieBit = 3;  // Machine Interrupt Enable
   localparam int unsigned MstatusMpieBit = 7;  // Machine Previous Interrupt Enable
+  // mstatus.MPP occupies [12:11]; mstatus.MPRV is bit 17 (RV32).
+  localparam int unsigned MstatusMppLo = 11;
+  localparam int unsigned MstatusMprvBit = 17;
+
+  // Privilege modes (RISC-V encoding). FROST implements Machine and User only.
+  localparam logic [1:0] PrivU = 2'b00;
+  localparam logic [1:0] PrivM = 2'b11;
 
   // mie/mip bit positions
   localparam int unsigned MieMsiBit = 3;  // Machine Software Interrupt
@@ -486,6 +493,7 @@ package riscv_pkg;
   localparam bit [31:0] ExcBreakpoint = 32'd3;
   localparam bit [31:0] ExcLoadAddrMisalign = 32'd4;
   localparam bit [31:0] ExcStoreAddrMisalign = 32'd6;
+  localparam bit [31:0] ExcEcallUmode = 32'd8;
   localparam bit [31:0] ExcEcallMmode = 32'd11;
 
   // Interrupt cause codes (mcause values when interrupt bit = 1)
@@ -836,7 +844,7 @@ package riscv_pkg;
   // Section 9: Trap/Exception Handling
   // ===========================================================================
   // Structures for trap control.
-  // Used by trap_unit.sv for M-mode exception/interrupt handling.
+  // Used by trap_unit.sv for M/U-mode exception/interrupt handling.
   // Trap control signals (from trap unit to pipeline)
   typedef struct packed {
     logic            trap_taken;   // Trap is being taken this cycle
@@ -1075,7 +1083,7 @@ package riscv_pkg;
   localparam int unsigned FLEN = FpWidth;  // 64 bits for D extension
 
   // CDB parameters
-  localparam int unsigned NumCdbLanes = 1;  // Single CDB (future expansion)
+  localparam int unsigned NumCdbLanes = 1;  // unused: the CDB is 2-lane today (o_cdb + o_cdb_2)
   localparam int unsigned NumFus = 7;  // ALU, MUL, DIV, MEM, FP_ADD, FP_MUL, FP_DIV
 
   // ---------------------------------------------------------------------------
diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/README.md b/hw/rtl/cpu_and_mem/cpu/tomasulo/README.md
index 7206be89..9fc75936 100644
--- a/hw/rtl/cpu_and_mem/cpu/tomasulo/README.md
+++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/README.md
@@ -66,11 +66,12 @@ moves, no functional change): `store_queue/sq_forwarding_unit`,
 `serial_state_e` enum lives in `riscv_pkg` so the ROB and submodule share it).
 Each is documented in its parent module's README.
 
-The CPU top-level (`../cpu_ooo.sv`) instantiates `tomasulo_wrapper`
-plus `dispatch` and the front-end stages, and contains a few large
-inline blocks that straddle the front-end / back-end boundary
-(early misprediction recovery, commit flush controller, memory port
-arbitration, …). See [`../README.md`](../README.md).
+The CPU top-level (`../cpu_ooo/cpu_ooo.sv`) instantiates
+`tomasulo_wrapper` plus `dispatch` and the front-end stages; the logic
+that straddles the front-end / back-end boundary (early misprediction
+recovery, the misprediction flush controller, memory port arbitration,
+…) lives in its glue submodules under `../cpu_ooo/branch_recovery/`
+and `../cpu_ooo/memory_if/`. See [`../README.md`](../README.md).
 
 ## Cross-cutting design notes
 
@@ -97,7 +98,8 @@ Branches and JALRs reserve a RAT checkpoint at dispatch (full INT +
 FP RAT snapshot + RAS top + valid count, 8 slots).
 
 Conditional-branch mispredictions resolve in `branch_jump_unit` and
-trigger a fast two-phase recovery directly from `cpu_ooo.sv`: the
+trigger a fast two-phase recovery in the `early_misprediction_recovery`
+submodule (under `cpu_ooo/branch_recovery/`): the
 front-end redirects and the RAT restores in the same cycle, then the
 OOO back-end's partial flush fires one cycle later. This cuts the
 typical penalty from ~15 cycles to ~2.
diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/cdb_arbiter/cdb_arbiter.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/cdb_arbiter/cdb_arbiter.sv
index 3955a167..16a615c2 100644
--- a/hw/rtl/cpu_and_mem/cpu/tomasulo/cdb_arbiter/cdb_arbiter.sv
+++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/cdb_arbiter/cdb_arbiter.sv
@@ -17,8 +17,9 @@
 /*
  * CDB Arbiter
  *
- * Priority-based multiplexer that selects one functional unit result per cycle
- * for broadcast on the Common Data Bus (CDB). Ties FU completions back to:
+ * Priority-based multiplexer that selects up to two functional unit results
+ * per cycle (2-wide CDB: primary o_cdb + secondary o_cdb_2) for broadcast on
+ * the Common Data Bus (CDB). Ties FU completions back to:
  *   - ROB (mark done + store value)
  *   - All RS instances (operand wakeup)
  *
diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/dispatch/dispatch.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/dispatch/dispatch.sv
index e19ea796..3c0870a2 100644
--- a/hw/rtl/cpu_and_mem/cpu/tomasulo/dispatch/dispatch.sv
+++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/dispatch/dispatch.sv
@@ -26,8 +26,10 @@
  *   5. Allocates a checkpoint for branches/jumps
  *   6. Generates back-pressure (stall) when resources are exhausted
  *
- * The dispatch is combinational: all outputs are derived from the registered
- * from_id_to_ex pipeline register in the same cycle.
+ * The dispatch is mostly combinational: outputs are derived from the
+ * registered from_id_to_ex pipeline register in the same cycle, except the
+ * done-repair bypass valid/tag channels, which are registered and appear one
+ * cycle after the dispatch fire.
  *
  * Stall conditions (any one stalls the front-end):
  *   - ROB full
diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/README.md b/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/README.md
index 08aef01f..f860ad32 100644
--- a/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/README.md
+++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/README.md
@@ -52,6 +52,10 @@ Two things the cache intentionally *doesn't* do:
   so there's nothing speculative to throw away. Leaving cached lines
   hot across mispredict recovery roughly doubles the steady-state hit
   rate on CoreMark (36.5% → 72.4%).
+- **No fill from a full-flush-cycle response.** Trap/MRET/FENCE.I full
+  flushes keep existing L0 lines hot, but a memory response that arrives
+  on the flush cycle is treated as a drained response for a killed load
+  and is not allowed to install a new L0 line.
 - **No same-cycle fill → lookup bypass.** Forwarding the in-flight
   fill into a same-cycle lookup dragged the back-end flush cone
   (`i_flush_en` → `accept_mem_response` → fill → bypass → hit →
diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/load_queue.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/load_queue.sv
index 1263ed49..cfb12594 100644
--- a/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/load_queue.sv
+++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/load_queue.sv
@@ -18,10 +18,10 @@
  * Load Queue - Tracks in-flight load instructions
  *
  * Circular buffer of DEPTH entries (8), allocated in program order at
- * dispatch time, freed when the load result is broadcast on the CDB.
+ * dispatch time, freed the cycle the result is captured into cdb_stage.
  *
  * Features:
- *   - Parameterized depth (8 entries, FF-based)
+ *   - Parameterized depth (8 entries; LUTRAM payload, FF control state)
  *   - CAM-style tag search for address update (all entries in parallel)
  *   - Oldest-first priority scan for issue selection
  *   - Two-phase FLD support (64-bit double on 32-bit bus)
@@ -169,7 +169,7 @@ module load_queue #(
     input logic                                        i_early_recovery_flush,
 
     // =========================================================================
-    // L0 Cache Invalidation (from SQ, future)
+    // L0 Cache Invalidation (from SQ store-write launch)
     // =========================================================================
     input logic                       i_cache_invalidate_valid,
     input logic [riscv_pkg::XLEN-1:0] i_cache_invalidate_addr,
@@ -360,8 +360,8 @@ module load_queue #(
   // ===========================================================================
   // lq_data payload is only read at issue_cdb_idx (CDB broadcast).
   // Writes come from two independent sources that can overlap:
-  //   Port 0 (primary): cache hit / store forward / memory response
-  //   Port 1 (AMO):     AMO write completion
+  //   Port 0 (mem resp): memory response (dedicated)
+  //   Port 1 (local):    cache hit / SQ forward / AMO write completion
   // Split into 32-bit lo and hi halves so FLD can write each phase
   // independently without read-modify-write.
 
@@ -525,6 +525,7 @@ module load_queue #(
   // Response acceptance/drain control
   logic flush_all_entries;
   logic issued_entry_flushed;
+  logic full_flush_response_drain;
   logic accept_mem_response;
   logic drop_mem_response_now;
 
@@ -747,8 +748,17 @@ module load_queue #(
   // Issue Selection -> lq_issue_selector.sv (pure boundary move).  issue_cdb_idx
   // still drives the LQ data LUTRAM read below; that RAM stays here.
   // ===========================================================================
-  logic [DEPTH-1:0] mem_issue_stored_mask;
-  logic [DEPTH-1:0] mem_issue_update_mask;
+  logic stored_scan_found;
+  logic [IdxWidth-1:0] stored_scan_idx;
+  logic [IdxWidth-1:0] stored_scan_pos;
+  logic [DEPTH-1:0] stored_scan_onehot;
+  logic [ReorderBufferTagWidth-1:0] stored_scan_rob_tag;
+
+  logic update_scan_found;
+  logic [IdxWidth-1:0] update_scan_idx;
+  logic [IdxWidth-1:0] update_scan_pos;
+  logic [DEPTH-1:0] update_scan_onehot;
+  logic [ReorderBufferTagWidth-1:0] update_scan_rob_tag;
   logic head_mem_stored_found;
   logic [IdxWidth-1:0] head_mem_stored_idx;
   logic [ReorderBufferTagWidth-1:0] head_mem_stored_rob_tag;
@@ -756,6 +766,7 @@ module load_queue #(
   logic [IdxWidth-1:0] head_mem_update_idx;
   logic [ReorderBufferTagWidth-1:0] head_mem_update_rob_tag;
   logic [DEPTH*ReorderBufferTagWidth-1:0] lq_rob_tag_flat;
+  logic force_head_amo;
 
   for (genvar g_lq_tag = 0; g_lq_tag < DEPTH; g_lq_tag++) begin : gen_lq_rob_tag_flat
     assign lq_rob_tag_flat[g_lq_tag*ReorderBufferTagWidth +: ReorderBufferTagWidth] =
@@ -778,10 +789,19 @@ module load_queue #(
       .lq_rob_tag_flat(lq_rob_tag_flat),
       .head_idx(head_idx),
       .i_sq_committed_empty(i_sq_committed_empty),
+      .i_force_head_amo(force_head_amo),
       .o_issue_cdb_found(issue_cdb_found),
       .o_issue_cdb_idx(issue_cdb_idx),
-      .o_mem_issue_stored_mask(mem_issue_stored_mask),
-      .o_mem_issue_update_mask(mem_issue_update_mask),
+      .o_stored_scan_found(stored_scan_found),
+      .o_stored_scan_idx(stored_scan_idx),
+      .o_stored_scan_pos(stored_scan_pos),
+      .o_stored_scan_onehot(stored_scan_onehot),
+      .o_stored_scan_rob_tag(stored_scan_rob_tag),
+      .o_update_scan_found(update_scan_found),
+      .o_update_scan_idx(update_scan_idx),
+      .o_update_scan_pos(update_scan_pos),
+      .o_update_scan_onehot(update_scan_onehot),
+      .o_update_scan_rob_tag(update_scan_rob_tag),
       .o_head_mem_stored_found(head_mem_stored_found),
       .o_head_mem_stored_idx(head_mem_stored_idx),
       .o_head_mem_stored_rob_tag(head_mem_stored_rob_tag),
@@ -790,15 +810,6 @@ module load_queue #(
       .o_head_mem_update_rob_tag(head_mem_update_rob_tag)
   );
 
-  // scan_idx recomputed locally for the head-load diagnostics below; the
-  // selector computes its own identical copy internally (head-relative idx).
-  logic [IdxWidth-1:0] scan_idx[DEPTH];
-  always_comb begin
-    for (int unsigned j = 0; j < DEPTH; j++) begin
-      scan_idx[j] = IdxWidth'(head_idx + IdxWidth'(j));
-    end
-  end
-
   // ===========================================================================
   // Head-load sub-bucket diagnostics
   // ===========================================================================
@@ -914,49 +925,6 @@ module load_queue #(
   // a post-encoder 8-to-1 MUX on lq_rob_tag[issue_mem_idx])
   logic [ReorderBufferTagWidth-1:0] issue_mem_rob_tag;
 
-  logic stored_scan_found;
-  logic [IdxWidth-1:0] stored_scan_idx;
-  logic [IdxWidth-1:0] stored_scan_pos;
-  logic [DEPTH-1:0] stored_scan_onehot;
-  logic [ReorderBufferTagWidth-1:0] stored_scan_rob_tag;
-
-  logic update_scan_found;
-  logic [IdxWidth-1:0] update_scan_idx;
-  logic [IdxWidth-1:0] update_scan_pos;
-  logic [DEPTH-1:0] update_scan_onehot;
-  logic [ReorderBufferTagWidth-1:0] update_scan_rob_tag;
-
-  always_comb begin
-    stored_scan_found   = 1'b0;
-    stored_scan_idx     = '0;
-    stored_scan_pos     = '0;
-    stored_scan_onehot  = '0;
-    stored_scan_rob_tag = '0;
-    update_scan_found   = 1'b0;
-    update_scan_idx     = '0;
-    update_scan_pos     = '0;
-    update_scan_onehot  = '0;
-    update_scan_rob_tag = '0;
-
-    for (int unsigned i = 0; i < DEPTH; i++) begin
-      if (mem_issue_stored_mask[i] && !stored_scan_found) begin
-        stored_scan_found               = 1'b1;
-        stored_scan_idx                 = scan_idx[i];
-        stored_scan_pos                 = IdxWidth'(i);
-        stored_scan_onehot[scan_idx[i]] = 1'b1;
-        stored_scan_rob_tag             = lq_rob_tag[scan_idx[i]];
-      end
-
-      if (mem_issue_update_mask[i] && !update_scan_found) begin
-        update_scan_found               = 1'b1;
-        update_scan_idx                 = scan_idx[i];
-        update_scan_pos                 = IdxWidth'(i);
-        update_scan_onehot[scan_idx[i]] = 1'b1;
-        update_scan_rob_tag             = lq_rob_tag[scan_idx[i]];
-      end
-    end
-  end
-
   logic [IdxWidth-1:0] stored_issue_idx;
   logic [ReorderBufferTagWidth-1:0] stored_issue_rob_tag;
   logic [ReorderBufferTagWidth-1:0] update_issue_rob_tag;
@@ -1128,21 +1096,83 @@ module load_queue #(
       !sq_commit_interlock &&
       i_sq_forward.can_forward
       && !sq_check_is_mmio_q && !sq_check_is_lr_q && !sq_check_is_amo_q;
+
+  // Break the rare ROB-head AMO deadlock without changing steady-state AMO
+  // order.  The normal selector remains pristine until a head AMO is eligible
+  // for issue and the machine has made no useful LQ/SQ progress for a sustained
+  // window.  Once saturated, force_head_amo lets the head-priority path choose
+  // that AMO for one capture/replace cycle.
+  localparam int unsigned AmoDeadlockThresh = 512;
+  localparam int unsigned AmoDeadlockCntW = $clog2(AmoDeadlockThresh + 1);
+
+  logic head_amo_eligible_waiting;
+  logic sq_check_waiting_older_store;
+  logic head_amo_no_issue_deadlock;
+  logic head_amo_sq_deadlock;
+  logic head_amo_deadlock_wait;
+  logic [AmoDeadlockCntW-1:0] amo_deadlock_cnt_q;
+
+  always_comb begin
+    head_amo_eligible_waiting = 1'b0;
+    for (int unsigned i = 0; i < DEPTH; i++) begin
+      if (rob_head_match_q[i] &&
+          lq_valid[i] &&
+          lq_is_amo[i] &&
+          entry_addr_valid_now[i] &&
+          !lq_issued[i] &&
+          !lq_data_valid[i] &&
+          !sq_check_in_flight_mask[i] &&
+          i_sq_committed_empty) begin
+        head_amo_eligible_waiting = 1'b1;
+      end
+    end
+  end
+
+  assign sq_check_waiting_older_store =
+      sq_check_pending && sq_check_phase2 && sq_check_entry_issueable &&
+      !sq_check_misaligned && !sq_commit_interlock && !sq_no_older_store &&
+      (!i_sq_all_older_addrs_known || (i_sq_forward.match && !i_sq_forward.can_forward)) &&
+      !i_mem_bus_busy && !drop_mem_response_pending && !i_flush_all && !i_flush_en;
+
+  assign head_amo_no_issue_deadlock =
+      head_amo_eligible_waiting && !issue_mem_found && !sq_check_pending;
+  assign head_amo_sq_deadlock =
+      head_amo_eligible_waiting && sq_check_waiting_older_store &&
+      (sq_check_rob_tag_q != i_rob_head_tag);
+  assign head_amo_deadlock_wait =
+      !mem_outstanding && (amo_state == AMO_IDLE) &&
+      (head_amo_no_issue_deadlock || head_amo_sq_deadlock);
+
+  always_ff @(posedge i_clk) begin
+    if (!i_rst_n || i_flush_all || i_flush_en || !head_amo_deadlock_wait) begin
+      amo_deadlock_cnt_q <= '0;
+    end else if (amo_deadlock_cnt_q < AmoDeadlockCntW'(AmoDeadlockThresh)) begin
+      amo_deadlock_cnt_q <= amo_deadlock_cnt_q + 1'b1;
+    end
+  end
+
+  assign force_head_amo = (amo_deadlock_cnt_q >= AmoDeadlockCntW'(AmoDeadlockThresh));
+
   assign flush_all_entries = i_flush_en && !i_early_recovery_flush &&
       (i_rob_head_tag == (i_flush_tag + ReorderBufferTagWidth'(1)));
 
   // Data memory has fixed 1-cycle latency in this design. If a partial flush
   // kills the outstanding load, drop that next response explicitly so the slot
-  // can be safely reused before the stale data returns.
+  // can be safely reused before the stale data returns. A full flush clears all
+  // entries at the edge; a same-cycle response is therefore drained here rather
+  // than accepted, so it cannot complete a killed load or refill the persistent
+  // L0 cache from a flushed context.
   assign issued_entry_flushed = i_flush_en && mem_outstanding && lq_valid[issued_idx] &&
       (flush_all_entries || is_younger(
       issued_rob_tag, i_flush_tag, i_rob_head_tag
   ));
+  assign full_flush_response_drain = i_flush_all && i_mem_read_valid && mem_outstanding;
   assign accept_mem_response = i_mem_read_valid && mem_outstanding &&
-                               !drop_mem_response_pending && !issued_entry_flushed &&
-                               lq_valid[issued_idx];
+                               !i_flush_all && !drop_mem_response_pending &&
+                               !issued_entry_flushed && lq_valid[issued_idx];
   assign drop_mem_response_now = i_mem_read_valid &&
-                                 (drop_mem_response_pending || issued_entry_flushed ||
+                                 (full_flush_response_drain ||
+                                  drop_mem_response_pending || issued_entry_flushed ||
                                   (mem_outstanding && !lq_valid[issued_idx]));
 
   // ===========================================================================
@@ -2344,6 +2374,9 @@ module load_queue #(
         $warning("LQ: slot-2 alloc attempted when full_for_2 (and slot-1 firing)");
       if (i_alloc_2.valid && !i_alloc.valid && full)
         $warning("LQ: slot-2 alloc attempted alone when full");
+      if (i_flush_all && accept_mem_response)
+        $error("LQ: accepted memory response during full flush");
+      if (i_flush_all && cache_fill_valid) $error("LQ: filled L0 cache during full flush");
       // Slot-1 and slot-2 must never target the same physical entry.
       if (slot1_alloc_en && slot2_alloc_en && (alloc_target[IdxWidth-1:0] == slot2_alloc_idx))
         $error("LQ: slot-1 and slot-2 alloc collide on entry %0d", alloc_target[IdxWidth-1:0]);
@@ -2539,6 +2572,15 @@ module load_queue #(
     end
   end
 
+  // Full-flush-cycle responses are drains only. They must not perform any
+  // architectural or persistent-cache side effect.
+  always_comb begin
+    if (i_rst_n && i_flush_all) begin
+      p_no_accept_during_full_flush : assert (!accept_mem_response);
+      p_no_l0_fill_during_full_flush : assert (!cache_fill_valid);
+    end
+  end
+
   // -------------------------------------------------------------------------
   // Sequential assertions
   // -------------------------------------------------------------------------
diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/lq_issue_selector.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/lq_issue_selector.sv
index 8e8a887e..15067a08 100644
--- a/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/lq_issue_selector.sv
+++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/lq_issue_selector.sv
@@ -18,7 +18,8 @@
 // lq_issue_selector
 // =============================================================================
 // Extracted verbatim from load_queue.sv (pure RTL boundary move, zero functional
-// change).  Parallel issue selection: Phase A (oldest CDB-ready entry), Phase B
+// change, except for the optional registered deadlock break input).  Parallel
+// issue selection: Phase A (oldest CDB-ready entry), Phase B
 // (memory-issue eligibility masks with MMIO/LR/AMO head gating + older-AMO
 // blocking), and the explicit ROB-head priority result.  Replaces the old serial
 // 16-level scan with per-entry masks + tree encoders.  issue_cdb_idx is exported
@@ -42,11 +43,20 @@ module lq_issue_selector #(
     input logic [(DEPTH*riscv_pkg::ReorderBufferTagWidth)-1:0] lq_rob_tag_flat,
     input logic [$clog2(DEPTH)-1:0] head_idx,
     input logic i_sq_committed_empty,
+    input logic i_force_head_amo,
 
     output logic o_issue_cdb_found,
     output logic [$clog2(DEPTH)-1:0] o_issue_cdb_idx,
-    output logic [DEPTH-1:0] o_mem_issue_stored_mask,
-    output logic [DEPTH-1:0] o_mem_issue_update_mask,
+    output logic o_stored_scan_found,
+    output logic [$clog2(DEPTH)-1:0] o_stored_scan_idx,
+    output logic [$clog2(DEPTH)-1:0] o_stored_scan_pos,
+    output logic [DEPTH-1:0] o_stored_scan_onehot,
+    output logic [riscv_pkg::ReorderBufferTagWidth-1:0] o_stored_scan_rob_tag,
+    output logic o_update_scan_found,
+    output logic [$clog2(DEPTH)-1:0] o_update_scan_idx,
+    output logic [$clog2(DEPTH)-1:0] o_update_scan_pos,
+    output logic [DEPTH-1:0] o_update_scan_onehot,
+    output logic [riscv_pkg::ReorderBufferTagWidth-1:0] o_update_scan_rob_tag,
     output logic o_head_mem_stored_found,
     output logic [$clog2(DEPTH)-1:0] o_head_mem_stored_idx,
     output logic [riscv_pkg::ReorderBufferTagWidth-1:0] o_head_mem_stored_rob_tag,
@@ -173,6 +183,54 @@ module lq_issue_selector #(
   assign mem_issue_stored_mask = mem_eligible_stored_mask & ~blocked_by_amo;
   assign mem_issue_update_mask = mem_eligible_update_mask & ~blocked_by_amo;
 
+  // Encode the oldest normal stored-address and current-update candidates here
+  // while scan_idx is already local. Exporting encoded candidates avoids
+  // re-scanning the masks in load_queue on the SQ-check payload enable path.
+  logic stored_scan_found;
+  logic [IdxWidth-1:0] stored_scan_idx;
+  logic [IdxWidth-1:0] stored_scan_pos;
+  logic [DEPTH-1:0] stored_scan_onehot;
+  logic [ReorderBufferTagWidth-1:0] stored_scan_rob_tag;
+
+  logic update_scan_found;
+  logic [IdxWidth-1:0] update_scan_idx;
+  logic [IdxWidth-1:0] update_scan_pos;
+  logic [DEPTH-1:0] update_scan_onehot;
+  logic [ReorderBufferTagWidth-1:0] update_scan_rob_tag;
+
+  always_comb begin
+    stored_scan_found   = 1'b0;
+    stored_scan_idx     = '0;
+    stored_scan_pos     = '0;
+    stored_scan_onehot  = '0;
+    stored_scan_rob_tag = '0;
+    update_scan_found   = 1'b0;
+    update_scan_idx     = '0;
+    update_scan_pos     = '0;
+    update_scan_onehot  = '0;
+    update_scan_rob_tag = '0;
+
+    for (int unsigned i = 0; i < DEPTH; i++) begin
+      if (mem_issue_stored_mask[i] && !stored_scan_found) begin
+        stored_scan_found = 1'b1;
+        stored_scan_idx = scan_idx[i];
+        stored_scan_pos = IdxWidth'(i);
+        stored_scan_onehot[scan_idx[i]] = 1'b1;
+        stored_scan_rob_tag =
+            lq_rob_tag_flat[scan_idx[i]*ReorderBufferTagWidth+:ReorderBufferTagWidth];
+      end
+
+      if (mem_issue_update_mask[i] && !update_scan_found) begin
+        update_scan_found = 1'b1;
+        update_scan_idx = scan_idx[i];
+        update_scan_pos = IdxWidth'(i);
+        update_scan_onehot[scan_idx[i]] = 1'b1;
+        update_scan_rob_tag =
+            lq_rob_tag_flat[scan_idx[i]*ReorderBufferTagWidth+:ReorderBufferTagWidth];
+      end
+    end
+  end
+
   // The sparse queue can reuse reclaimed holes after flushes, so physical
   // queue order is not always identical to ROB age.  To avoid starving the
   // oldest architectural load behind a younger blocked entry, explicitly
@@ -200,7 +258,7 @@ module lq_issue_selector #(
           !in_flight_mask[i] &&
           !lq_is_mmio[i] &&
           !lq_is_lr[i] &&
-          !lq_is_amo[i]) begin
+          (!lq_is_amo[i] || (i_force_head_amo && i_sq_committed_empty))) begin
         head_mem_stored_found   = 1'b1;
         head_mem_stored_idx     = IdxWidth'(i);
         head_mem_stored_rob_tag = lq_rob_tag_flat[i*ReorderBufferTagWidth+:ReorderBufferTagWidth];
@@ -214,7 +272,7 @@ module lq_issue_selector #(
           !lq_data_valid[i] &&
           !in_flight_mask[i] &&
           !lq_is_lr[i] &&
-          !lq_is_amo[i]) begin
+          (!lq_is_amo[i] || (i_force_head_amo && i_sq_committed_empty))) begin
         head_mem_update_found   = 1'b1;
         head_mem_update_idx     = IdxWidth'(i);
         head_mem_update_rob_tag = lq_rob_tag_flat[i*ReorderBufferTagWidth+:ReorderBufferTagWidth];
@@ -224,8 +282,16 @@ module lq_issue_selector #(
 
   assign o_issue_cdb_found = issue_cdb_found;
   assign o_issue_cdb_idx = issue_cdb_idx;
-  assign o_mem_issue_stored_mask = mem_issue_stored_mask;
-  assign o_mem_issue_update_mask = mem_issue_update_mask;
+  assign o_stored_scan_found = stored_scan_found;
+  assign o_stored_scan_idx = stored_scan_idx;
+  assign o_stored_scan_pos = stored_scan_pos;
+  assign o_stored_scan_onehot = stored_scan_onehot;
+  assign o_stored_scan_rob_tag = stored_scan_rob_tag;
+  assign o_update_scan_found = update_scan_found;
+  assign o_update_scan_idx = update_scan_idx;
+  assign o_update_scan_pos = update_scan_pos;
+  assign o_update_scan_onehot = update_scan_onehot;
+  assign o_update_scan_rob_tag = update_scan_rob_tag;
   assign o_head_mem_stored_found = head_mem_stored_found;
   assign o_head_mem_stored_idx = head_mem_stored_idx;
   assign o_head_mem_stored_rob_tag = head_mem_stored_rob_tag;
diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/lq_l0_cache.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/lq_l0_cache.sv
index fd439d93..7188e4cd 100644
--- a/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/lq_l0_cache.sv
+++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/lq_l0_cache.sv
@@ -26,9 +26,11 @@
  * Features:
  *   - Combinational lookup (hit in same cycle as address)
  *   - Fill on memory response
- *   - MMIO addresses always miss (>= MMIO_ADDR)
+ *   - MMIO addresses always miss (addr[31:30] == 2'b01 quadrant; DDR at
+ *     0x8000_0000+ is cacheable)
  *   - Flush all valid bits on pipeline flush
- *   - Per-address invalidation port (for future SQ integration)
+ *   - Per-address invalidation port (driven by SQ store-write launch and
+ *     AMO completion)
  */
 
 module lq_l0_cache #(
diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/register_alias_table/register_alias_table.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/register_alias_table/register_alias_table.sv
index 3ad2c087..23722832 100644
--- a/hw/rtl/cpu_and_mem/cpu/tomasulo/register_alias_table/register_alias_table.sv
+++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/register_alias_table/register_alias_table.sv
@@ -271,7 +271,7 @@ module register_alias_table (
   logic [   NumCheckpoints-1:0] checkpoint_valid;
 
   // Checkpoint RAT snapshots — distributed RAM
-  // Combined INT + FP snapshot (384 bits wide, 2-bit address)
+  // Combined INT + FP snapshot (448 bits wide, 3-bit address)
   logic                         ckpt_rat_wr_en;
   logic [CheckpointIdWidth-1:0] ckpt_rat_wr_addr;
   logic [ RatSnapshotWidth-1:0] ckpt_rat_wr_data;
diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/README.md b/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/README.md
index c9cd6bd7..39c3685c 100644
--- a/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/README.md
+++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/README.md
@@ -165,7 +165,9 @@ The exceptions:
 ## Performance counters
 
 The ROB drives several of the wrapper's performance counters
-directly: `head_and_next_done` (widen-commit actually fired) and
+directly: `head_and_next_done` (commit fired while head+1 was also
+done — a widen-commit upper bound; the actual fire count is
+`commit_2_fire_actual`) and
 `head_plus_one_done` (ungated head+1 ready, for the drain-backlog
 bucket) come from here, along with `commit_2_opportunity` /
 `commit_2_fire_actual` — the gap between those two measures how
diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.f b/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.f
index 5ee5f997..f6cd36d8 100644
--- a/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.f
+++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.f
@@ -7,6 +7,7 @@
 # RAM primitives (distributed RAM used for multi-bit ROB fields)
 $(ROOT)/hw/rtl/lib/ram/sdp_dist_ram.sv
 $(ROOT)/hw/rtl/lib/ram/mwp_dist_ram.sv
+$(ROOT)/hw/rtl/lib/ram/mwp_dist_ram_ohread.sv
 
 # Reorder Buffer module
 $(ROOT)/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/rob_serializer.sv
diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.sv
index d75a83ed..5348b8e5 100644
--- a/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.sv
+++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.sv
@@ -39,10 +39,12 @@
  *
  * Storage:
  *   Multi-bit fields use distributed RAM (LUTRAM) to reduce FF usage.
- *   Single-write-port fields (written only at allocation) use sdp_dist_ram.
- *   Multi-write-port fields (allocation + CDB/branch) use mwp_dist_ram
- *   with a Live Value Table. 1-bit packed vectors that need per-entry
- *   flush/reset remain in flip-flops.
+ *   Alloc-written fields use mwp_dist_ram_ohread with 2 write ports
+ *   (slot-1 + slot-2 alloc); fields also written by the CDB use 4 write
+ *   ports (+ 2 CDB lanes) via mwp_dist_ram / mwp_dist_ram_ohread with a
+ *   Live Value Table. The branch-update-written resolved-target field is
+ *   the only remaining sdp_dist_ram. 1-bit packed vectors that need
+ *   per-entry flush/reset remain in flip-flops.
  *
  * External Coordination:
  *   The Reorder Buffer coordinates with several external units via handshake signals:
@@ -120,12 +122,11 @@ module reorder_buffer (
     output logic                              o_commit_2_valid_raw,
     output logic                              o_commit_2_store_like_raw,
 
-    // Back-pressure from the cpu_ooo pending-write FIFO.  Asserted when
-    // there is room for a slot-2 regfile write this cycle; deasserted when
-    // the pending register holds a prior slot-2 write that has not yet
-    // drained AND rob_commit (slot 1) also wants the port this cycle.
-    // Driven from a registered cpu_ooo signal, so the feedback path
-    // closes at a flop (no combinational loop).
+    // Slot-2 accept indication from cpu_ooo.  Asserted when the second
+    // retiring entry can write the regfile this cycle.  With the dedicated
+    // second regfile write port cpu_ooo ties this permanently high (1'b1);
+    // the gate plumbing is kept so the signal path stays symmetric with
+    // the earlier back-pressure approach.
     input logic i_widen_commit_ok,
     input logic i_commit_hold,
 
@@ -152,6 +153,18 @@ module reorder_buffer (
     // Exception detected at head - signal trap unit
     output logic o_trap_pending,  // Exception needs handling
     output logic [riscv_pkg::XLEN-1:0] o_trap_pc,  // PC of excepting instruction
+    // Head decodes as WFI (drives WFI interrupt-resume-PC seed in cpu_ooo)
+    output logic o_head_is_wfi,
+    // TIMING precompute of the architectural next-PC of the head / head+1
+    // entry, for cpu_ooo's interrupt_resume_pc capture.  Contract: whenever
+    // o_commit_valid_raw (resp. o_commit_2_valid_raw) is high,
+    // o_head_retired_next_pc (resp. o_head_next_retired_next_pc) equals
+    // retired_next_pc(o_commit_comb) (resp. (o_commit_comb_2)) as computed in
+    // cpu_ooo.  Computed from UNGATED head fields so the RAM read + 32-bit add
+    // run in parallel with (not after) the late commit_en gating; in cycles
+    // without a commit the value is unused (checked in cpu_ooo simulation).
+    output logic [riscv_pkg::XLEN-1:0] o_head_retired_next_pc,
+    output logic [riscv_pkg::XLEN-1:0] o_head_next_retired_next_pc,
     output riscv_pkg::exc_cause_t o_trap_cause,  // Exception cause
     // Head entry's CDB value at trap time. For a misaligned load/store the
     // load_queue/SQ path parks the faulting address here (the value slot is
@@ -169,6 +182,10 @@ module reorder_buffer (
     // =========================================================================
     input logic i_interrupt_pending,  // Interrupt is pending (wake from WFI)
 
+    // Current privilege (PrivM/PrivU). A U-mode access to MRET or to a CSR that
+    // requires more privilege is an illegal instruction, detected at the head.
+    input logic [1:0] i_priv,
+
     // =========================================================================
     // Pipeline Flush Control
     // =========================================================================
@@ -266,8 +283,8 @@ module reorder_buffer (
   // consumer sees slot 2 even though the plumbing exists).  The
   // commit_2_opportunity perf counter is still updated so we can keep
   // measuring the upper bound across incremental steps.  Flipped to 1
-  // after all downstream consumers (RAT, SQ, cpu_ooo FIFO, instret) are
-  // in place.
+  // after all downstream consumers (RAT, SQ, cpu_ooo second regfile write
+  // port, instret) were in place.
   localparam bit EnableWidenCommit = 1'b1;
 
   // ===========================================================================
@@ -295,8 +312,23 @@ module reorder_buffer (
     end
   endfunction
 
+  // TIMING helper: read one bit of a per-entry packed FF vector using a
+  // registered ONE-HOT select instead of a binary index.  Given the invariant
+  // onehot == (1 << idx), |(vec & onehot) === vec[idx] bit-for-bit; the win is
+  // physical only: the select bits come pre-decoded out of registers (no
+  // 5-bit high-fanout head_idx net feeding a 32:1 mux tree on the commit
+  // critical path).
+  function automatic logic onehot_read(input logic [ReorderBufferDepth-1:0] vec,
+                                       input logic [ReorderBufferDepth-1:0] onehot);
+    onehot_read = |(vec & onehot);
+  endfunction
+
   // Forward declarations (used in debug assigns before main decl)
-  logic [ReorderBufferTagWidth:0] head_ptr;
+  // TIMING: head_ptr (via head_idx) drives every head RAM read address plus
+  // pointer arithmetic — post-synth fanout was ~650 with only 4 tool-chosen
+  // replicas. Cap the per-replica load so each copy can be placed next to its
+  // RAM/consumer cluster. Pure register replication; semantics unchanged.
+  (* max_fanout = 96 *) logic [ReorderBufferTagWidth:0] head_ptr;
   logic [ReorderBufferTagWidth:0] tail_ptr;
   logic full;
   logic full_for_2;
@@ -338,8 +370,19 @@ module reorder_buffer (
   logic [ReorderBufferTagWidth-1:0] tail_idx;
   // Slot-2 alloc target, wraps within ReorderBufferTagWidth modulus.
   logic [ReorderBufferTagWidth-1:0] tail_idx_2;
-  logic [ReorderBufferDepth-1:0] head_clear_mask;
-  logic [ReorderBufferDepth-1:0] head_next_clear_mask;
+  // Registered ONE-HOT images of head_idx / head_next_idx.  Invariant (by
+  // construction, checked by assertions below):
+  //   head_clear_mask      == ReorderBufferDepth'(1) << head_idx
+  //   head_next_clear_mask == ReorderBufferDepth'(1) << head_next_idx
+  // Both are written ONLY in the Head Pointer Management block, in lockstep
+  // with head_ptr: reset loads {1, 2} while head_ptr loads 0; commit rotates
+  // them by the same 1/2 steps head_ptr advances; flushes touch neither
+  // (flushes only move the tail).  TIMING: besides gating the rob_valid
+  // commit-clear, they now also replace the binary head_idx as the select of
+  // every head-side 32:1 read (packed FF vectors + LVT bank selects), turning
+  // a high-fanout 5-bit select into per-entry registered one-hot bits.
+  (* max_fanout = 16 *) logic [ReorderBufferDepth-1:0] head_clear_mask;
+  (* max_fanout = 16 *) logic [ReorderBufferDepth-1:0] head_next_clear_mask;
 
   // Status signals (full and empty declared above for forward ref)
   logic [ReorderBufferTagWidth:0] count;
@@ -349,7 +392,10 @@ module reorder_buffer (
   logic head_valid;
   logic head_done;
   logic head_exception;
-  riscv_pkg::exc_cause_t head_exc_cause;  // from RAM
+  logic head_exception_raw;  // stored ROB exception flag (before U-mode priv fault)
+  logic head_priv_fault;  // U-mode access to MRET / an M-CSR -> illegal instruction
+  riscv_pkg::exc_cause_t head_exc_cause;  // effective cause (includes priv fault)
+  riscv_pkg::exc_cause_t head_exc_cause_raw;  // from RAM
   logic [XLEN-1:0] head_pc;  // from RAM
   logic head_dest_rf;
   logic [RegAddrWidth-1:0] head_dest_reg;  // from RAM
@@ -443,16 +489,47 @@ module reorder_buffer (
 
   // Commit control signals
   logic head_ready;  // Head is valid and done
+  // NOTE: deliberately NO synthesis attributes on commit_stall or the
+  // *_early aggregates below.  Three measured rounds on this spine: every
+  // attribute-based constraint made it worse — round-1 (* max_fanout *) on
+  // commit_en/commit_2_fire fragmented the interrupt arc (WNS -1.17);
+  // round-3 (* keep *) on commit_stall + the early aggregates pinned fusion
+  // boundaries in the MIDDLE of the true critical cone (commit_stall is NOT
+  // a late external input — its serializer cone itself reads the head
+  // metadata through the one-hot masks, so mask -> is_csr/store-like ->
+  // FSM stall -> take_trap is one deep register-to-register cone; WNS
+  // -0.938).  Every real structural change (one-hot head reads, ohread LVT
+  // select, meip register, compare-then-mux) helped.  The two-term
+  // factoring below stays as plain RTL only — synthesis is free to refuse
+  // it back into the baseline-style fused tree.
   logic commit_stall;  // Stall commit for serializing instructions
+  // Early/late factoring of the commit gates (pure AND re-association,
+  // bit-identical conjunct sets — see Commit Enable Logic).
+  logic commit_ready_early;
+  logic commit_2_ready_early;
+  logic commit_store_like_early;
+  logic commit_mispredict_early;
+  logic commit_correct_branch_early;
+  logic head_mispredict_candidate_early;
+  logic commit_2_store_like_early;
+  // NOTE: no max_fanout on commit_en.  A (* max_fanout = 96 *) was tried and
+  // measured WORSE overall: the attribute forces the commit_en net to keep its
+  // identity, which blocks opt_design from collapsing the serialization spine
+  // (interrupt_pending -> commit_stall -> commit_en -> store-like ->
+  // sq_committed_empty_for_trap -> trap_taken) into shared LUTs, adding
+  // levels to the late UART/interrupt-pending arc (933 new failing paths,
+  // WNS -1.17).  With the one-hot head reads the head-side arrival is early
+  // enough that the un-split ~655-load net is no longer the limiter.
   logic commit_en;  // Actually commit this cycle
 
   // Widen-commit ("2-wide") gate. Asserted when commit_en is high this
   // cycle AND the entry immediately behind head is also retirable AND
   // neither slot hits a hazard that forces 1-wide commit (serial ops,
   // head mispredict, head+1 branch, FENCE.I, exceptions, AMO/LR/SC).
-  // Step 1 uses this only as a perf-counter input — it does NOT yet
-  // change head_ptr advancement, rob_valid clearing, or the commit
-  // output struct.
+  // commit_2_gate is the ungated opportunity signal (perf-counter input);
+  // commit_2_fire (gate && EnableWidenCommit && i_widen_commit_ok) drives
+  // the actual 2-wide retire: head_ptr advances by 2, rob_valid clears at
+  // head+1, and o_commit_comb_2 carries the second entry.
   logic head_ok_2wide;
   logic head_next_ok_2wide;
   logic commit_2_gate;
@@ -489,13 +566,27 @@ module reorder_buffer (
   // Count of valid entries
   assign count = tail_ptr - head_ptr;
 
-  // Head entry fields from FF-backed packed vectors / distributed RAM
-  assign head_valid = rob_valid[head_idx];
-  assign head_done = rob_done[head_idx];
-  assign head_exception = rob_exception[head_idx];
-  assign head_branch_taken = rob_branch_taken[head_idx];
-  assign head_mispredicted = rob_mispredicted[head_idx];
-  assign head_early_recovered = rob_early_recovered[head_idx];
+  // Head entry fields from FF-backed packed vectors / distributed RAM.
+  // TIMING: indexed with the registered one-hot head image (see onehot_read);
+  // identical value to rob_*[head_idx] under the head_clear_mask invariant.
+  assign head_valid = onehot_read(rob_valid, head_clear_mask);
+  assign head_done = onehot_read(rob_done, head_clear_mask);
+  assign head_exception_raw = onehot_read(rob_exception, head_clear_mask);
+  // U-mode privilege fault: MRET, or a CSR access requiring more privilege than
+  // the current mode (csr_addr[9:8] > priv), is an illegal instruction. Folding
+  // it into head_exception/head_exc_cause makes every consumer (commit_en,
+  // o_csr_start/o_mret_start, o_trap_pending, the serial FSM, the commit record)
+  // treat it as a precise exception, so the faulting op never executes or
+  // retires. The faulting op rides the same single-cycle exception path, so the
+  // double-trap guard in trap_unit already covers it.
+  assign head_priv_fault = (head_is_mret && (i_priv != riscv_pkg::PrivM)) ||
+                           (head_is_csr && (head_csr_addr[9:8] > i_priv));
+  assign head_exception = head_exception_raw || head_priv_fault;
+  assign head_exc_cause   = (head_priv_fault && !head_exception_raw) ?
+      riscv_pkg::exc_cause_t'(riscv_pkg::ExcIllegalInstr) : head_exc_cause_raw;
+  assign head_branch_taken = onehot_read(rob_branch_taken, head_clear_mask);
+  assign head_mispredicted = onehot_read(rob_mispredicted, head_clear_mask);
+  assign head_early_recovered = onehot_read(rob_early_recovered, head_clear_mask);
   assign {
     head_dest_rf,
     head_dest_valid,
@@ -533,12 +624,14 @@ module reorder_buffer (
   // RAMs below.  1-bit packed-vector fields share the existing FF storage
   // and are indexed at head_next_idx for free.
   assign head_next_idx = head_idx + 1'b1;
-  assign head_next_valid = rob_valid[head_next_idx];
-  assign head_next_done = rob_done[head_next_idx];
-  assign head_next_exception = rob_exception[head_next_idx];
-  assign head_next_branch_taken = rob_branch_taken[head_next_idx];
-  assign head_next_mispredicted = rob_mispredicted[head_next_idx];
-  assign head_next_early_recovered = rob_early_recovered[head_next_idx];
+  // TIMING: same one-hot substitution as the head fields, using the
+  // registered head_next_clear_mask (== 1 << head_next_idx by construction).
+  assign head_next_valid = onehot_read(rob_valid, head_next_clear_mask);
+  assign head_next_done = onehot_read(rob_done, head_next_clear_mask);
+  assign head_next_exception = onehot_read(rob_exception, head_next_clear_mask);
+  assign head_next_branch_taken = onehot_read(rob_branch_taken, head_next_clear_mask);
+  assign head_next_mispredicted = onehot_read(rob_mispredicted, head_next_clear_mask);
+  assign head_next_early_recovered = onehot_read(rob_early_recovered, head_next_clear_mask);
   assign {
     head_next_dest_rf,
     head_next_dest_valid,
@@ -657,13 +750,21 @@ module reorder_buffer (
 
   // 2-wide commit gate.  commit_2_gate is the "opportunity" signal — it
   // fires whenever the ROB could theoretically retire two entries this
-  // cycle, independent of the master enable and the FIFO back-pressure.
+  // cycle, independent of the master enable and the slot-2 accept input.
   // This feeds the perf counter so we can keep measuring upper bound
   // even when widen-commit is gated off.  commit_2_fire is what the
   // output / retire logic actually acts on — it ANDs the opportunity with
-  // the master enable and the cpu_ooo pending-write FIFO back-pressure.
-  assign commit_2_gate = commit_en && head_next_valid && head_next_done_eff &&
-                         head_ok_2wide && head_next_ok_2wide;
+  // the master enable and the cpu_ooo slot-2 accept signal
+  // (i_widen_commit_ok, currently tied high).
+  // TIMING (late-side factoring, see Commit Enable Logic): commit_en && X
+  // == (commit_ready_early && X) && !commit_stall — same conjunct set,
+  // re-associated so the late commit_stall enters one final LUT.
+  assign commit_2_ready_early = commit_ready_early && head_next_valid && head_next_done_eff &&
+                                head_ok_2wide && head_next_ok_2wide;
+  assign commit_2_gate = commit_2_ready_early && !commit_stall;
+  // NOTE: no max_fanout on commit_2_fire — a forced net boundary here sat
+  // mid-spine on the late UART/interrupt-pending -> trap_taken arc (it
+  // appeared as a distinct fo=40 level in the round-1 -1.17 post-opt path).
   logic commit_2_fire;
   assign commit_2_fire = commit_2_gate && EnableWidenCommit && i_widen_commit_ok;
 
@@ -802,13 +903,13 @@ module reorder_buffer (
   // ===========================================================================
   // Distributed RAM Instances
   // ===========================================================================
-  // Single-write-port fields (written only at allocation, read at head).
-  // These use sdp_dist_ram — one write port, one async read port.
+  // Alloc-written fields (read at head / head+1).  Since 2-wide dispatch
+  // these use mwp_dist_ram_ohread with 2 write ports (slot-1 + slot-2 alloc).
   // ---------------------------------------------------------------------------
 
   // 2-write port: slot-1 alloc (port 0) + slot-2 alloc (port 1).  Port 1
   // writes when slot-2 allocates its ROB entry in the same cycle as slot-1.
-  mwp_dist_ram #(
+  mwp_dist_ram_ohread #(
       .ADDR_WIDTH     (ReorderBufferTagWidth),
       .DATA_WIDTH     (XLEN),
       .NUM_WRITE_PORTS(2)
@@ -818,11 +919,12 @@ module reorder_buffer (
       .i_write_address({tail_idx_2, tail_idx}),
       .i_write_data   ({i_alloc_req_2.pc, i_alloc_req.pc}),
       .i_read_address (head_idx),
+      .i_read_onehot  (head_clear_mask),
       .o_read_data    (head_pc)
   );
 
   // Widen-commit replica: head+1 read port for pc.
-  mwp_dist_ram #(
+  mwp_dist_ram_ohread #(
       .ADDR_WIDTH     (ReorderBufferTagWidth),
       .DATA_WIDTH     (XLEN),
       .NUM_WRITE_PORTS(2)
@@ -832,10 +934,11 @@ module reorder_buffer (
       .i_write_address({tail_idx_2, tail_idx}),
       .i_write_data   ({i_alloc_req_2.pc, i_alloc_req.pc}),
       .i_read_address (head_next_idx),
+      .i_read_onehot  (head_next_clear_mask),
       .o_read_data    (head_next_pc)
   );
 
-  mwp_dist_ram #(
+  mwp_dist_ram_ohread #(
       .ADDR_WIDTH     (ReorderBufferTagWidth),
       .DATA_WIDTH     (RegAddrWidth),
       .NUM_WRITE_PORTS(2)
@@ -845,11 +948,12 @@ module reorder_buffer (
       .i_write_address({tail_idx_2, tail_idx}),
       .i_write_data   ({i_alloc_req_2.dest_reg, i_alloc_req.dest_reg}),
       .i_read_address (head_idx),
+      .i_read_onehot  (head_clear_mask),
       .o_read_data    (head_dest_reg)
   );
 
   // Widen-commit replica: head+1 read port for dest_reg.
-  mwp_dist_ram #(
+  mwp_dist_ram_ohread #(
       .ADDR_WIDTH     (ReorderBufferTagWidth),
       .DATA_WIDTH     (RegAddrWidth),
       .NUM_WRITE_PORTS(2)
@@ -859,10 +963,11 @@ module reorder_buffer (
       .i_write_address({tail_idx_2, tail_idx}),
       .i_write_data   ({i_alloc_req_2.dest_reg, i_alloc_req.dest_reg}),
       .i_read_address (head_next_idx),
+      .i_read_onehot  (head_next_clear_mask),
       .o_read_data    (head_next_dest_reg)
   );
 
-  mwp_dist_ram #(
+  mwp_dist_ram_ohread #(
       .ADDR_WIDTH     (ReorderBufferTagWidth),
       .DATA_WIDTH     (XLEN),
       .NUM_WRITE_PORTS(2)
@@ -872,11 +977,12 @@ module reorder_buffer (
       .i_write_address({tail_idx_2, tail_idx}),
       .i_write_data   ({i_alloc_req_2.predicted_target, i_alloc_req.predicted_target}),
       .i_read_address (head_idx),
+      .i_read_onehot  (head_clear_mask),
       .o_read_data    (head_predicted_target)
   );
 
   // Widen-commit replica: head+1 read port for predicted_target.
-  mwp_dist_ram #(
+  mwp_dist_ram_ohread #(
       .ADDR_WIDTH     (ReorderBufferTagWidth),
       .DATA_WIDTH     (XLEN),
       .NUM_WRITE_PORTS(2)
@@ -886,10 +992,11 @@ module reorder_buffer (
       .i_write_address({tail_idx_2, tail_idx}),
       .i_write_data   ({i_alloc_req_2.predicted_target, i_alloc_req.predicted_target}),
       .i_read_address (head_next_idx),
+      .i_read_onehot  (head_next_clear_mask),
       .o_read_data    (head_next_predicted_target)
   );
 
-  mwp_dist_ram #(
+  mwp_dist_ram_ohread #(
       .ADDR_WIDTH     (ReorderBufferTagWidth),
       .DATA_WIDTH     (CheckpointIdWidth),
       .NUM_WRITE_PORTS(2)
@@ -899,11 +1006,12 @@ module reorder_buffer (
       .i_write_address({tail_idx_2, tail_idx}),
       .i_write_data   ({alloc_checkpoint_id_data_2, alloc_checkpoint_id_data}),
       .i_read_address (head_idx),
+      .i_read_onehot  (head_clear_mask),
       .o_read_data    (head_checkpoint_id)
   );
 
   // Widen-commit replica: head+1 read port for checkpoint_id.
-  mwp_dist_ram #(
+  mwp_dist_ram_ohread #(
       .ADDR_WIDTH     (ReorderBufferTagWidth),
       .DATA_WIDTH     (CheckpointIdWidth),
       .NUM_WRITE_PORTS(2)
@@ -913,10 +1021,11 @@ module reorder_buffer (
       .i_write_address({tail_idx_2, tail_idx}),
       .i_write_data   ({alloc_checkpoint_id_data_2, alloc_checkpoint_id_data}),
       .i_read_address (head_next_idx),
+      .i_read_onehot  (head_next_clear_mask),
       .o_read_data    (head_next_checkpoint_id)
   );
 
-  mwp_dist_ram #(
+  mwp_dist_ram_ohread #(
       .ADDR_WIDTH     (ReorderBufferTagWidth),
       .DATA_WIDTH     (HeadMetaWidth),
       .NUM_WRITE_PORTS(2)
@@ -926,12 +1035,13 @@ module reorder_buffer (
       .i_write_address({tail_idx_2, tail_idx}),
       .i_write_data   ({alloc_head_meta_data_2, alloc_head_meta_data}),
       .i_read_address (head_idx),
+      .i_read_onehot  (head_clear_mask),
       .o_read_data    (head_meta_rd_data)
   );
 
   // Widen-commit replica: head+1 read port for head_meta.  This feeds the
   // head_next_* hazard flags consumed by the 2-wide commit gate.
-  mwp_dist_ram #(
+  mwp_dist_ram_ohread #(
       .ADDR_WIDTH     (ReorderBufferTagWidth),
       .DATA_WIDTH     (HeadMetaWidth),
       .NUM_WRITE_PORTS(2)
@@ -941,18 +1051,22 @@ module reorder_buffer (
       .i_write_address({tail_idx_2, tail_idx}),
       .i_write_data   ({alloc_head_meta_data_2, alloc_head_meta_data}),
       .i_read_address (head_next_idx),
+      .i_read_onehot  (head_next_clear_mask),
       .o_read_data    (head_next_meta_rd_data)
   );
 
   // ---------------------------------------------------------------------------
-  // Multi-write-port fields (allocation + CDB or branch update).
-  // These use mwp_dist_ram with 3 write ports for 2-wide alloc support.
-  // Port 0 = slot-1 alloc, Port 1 = slot-2 alloc, Port 2 = CDB (highest pri).
+  // Multi-write-port fields (allocation + CDB).
+  // These use mwp_dist_ram (mwp_dist_ram_ohread for head-side reads) with
+  // 4 write ports: Port 0 = slot-1 alloc, Port 1 = slot-2 alloc,
+  // Port 2 = CDB lane 0, Port 3 = CDB lane 1 (highest pri; the arbiter
+  // guarantees the two CDB lanes never collide on an address).
   // ---------------------------------------------------------------------------
 
-  // rob_value: 3 write ports (alloc1 + alloc2 + CDB), 2 read ports (head + RAT bypass).
-  // Two instances with identical writes, different read addresses.
-  mwp_dist_ram #(
+  // rob_value: 4 write ports (alloc1 + alloc2 + CDB lane 0 + CDB lane 1).
+  // Twelve instances with identical writes, different read addresses
+  // (head, head+1, RAT, dispatch bypass x6, fmul-pending x3).
+  mwp_dist_ram_ohread #(
       .ADDR_WIDTH     (ReorderBufferTagWidth),
       .DATA_WIDTH     (FLEN),
       .NUM_WRITE_PORTS(4)
@@ -962,11 +1076,12 @@ module reorder_buffer (
       .i_write_address({i_cdb_write_2.tag, i_cdb_write.tag, tail_idx_2, tail_idx}),
       .i_write_data({i_cdb_write_2.value, i_cdb_write.value, alloc_value_data_2, alloc_value_data}),
       .i_read_address(head_idx),
+      .i_read_onehot(head_clear_mask),
       .o_read_data(head_value)
   );
 
   // Widen-commit replica: head+1 read port for value.
-  mwp_dist_ram #(
+  mwp_dist_ram_ohread #(
       .ADDR_WIDTH     (ReorderBufferTagWidth),
       .DATA_WIDTH     (FLEN),
       .NUM_WRITE_PORTS(4)
@@ -976,6 +1091,7 @@ module reorder_buffer (
       .i_write_address({i_cdb_write_2.tag, i_cdb_write.tag, tail_idx_2, tail_idx}),
       .i_write_data({i_cdb_write_2.value, i_cdb_write.value, alloc_value_data_2, alloc_value_data}),
       .i_read_address(head_next_idx),
+      .i_read_onehot(head_next_clear_mask),
       .o_read_data(head_next_value)
   );
 
@@ -1111,8 +1227,8 @@ module reorder_buffer (
       .o_read_data(o_fmul_pending_bypass_value_3)
   );
 
-  // rob_exc_cause: 3 write ports (alloc1='0 + alloc2='0 + CDB), 1 read port (head)
-  mwp_dist_ram #(
+  // rob_exc_cause: 4 write ports (alloc1='0 + alloc2='0 + CDB lanes 0/1), 1 read port (head)
+  mwp_dist_ram_ohread #(
       .ADDR_WIDTH     (ReorderBufferTagWidth),
       .DATA_WIDTH     (ExcCauseWidth),
       .NUM_WRITE_PORTS(4)
@@ -1124,11 +1240,12 @@ module reorder_buffer (
         i_cdb_write_2.exc_cause, i_cdb_write.exc_cause, ExcCauseWidth'(0), ExcCauseWidth'(0)
       }),
       .i_read_address(head_idx),
-      .o_read_data(head_exc_cause)
+      .i_read_onehot(head_clear_mask),
+      .o_read_data(head_exc_cause_raw)
   );
 
   // Widen-commit replica: head+1 read port for exc_cause.
-  mwp_dist_ram #(
+  mwp_dist_ram_ohread #(
       .ADDR_WIDTH     (ReorderBufferTagWidth),
       .DATA_WIDTH     (ExcCauseWidth),
       .NUM_WRITE_PORTS(4)
@@ -1140,11 +1257,12 @@ module reorder_buffer (
         i_cdb_write_2.exc_cause, i_cdb_write.exc_cause, ExcCauseWidth'(0), ExcCauseWidth'(0)
       }),
       .i_read_address(head_next_idx),
+      .i_read_onehot(head_next_clear_mask),
       .o_read_data(head_next_exc_cause)
   );
 
-  // rob_fp_flags: 3 write ports (alloc1='0 + alloc2='0 + CDB), 1 read port (head)
-  mwp_dist_ram #(
+  // rob_fp_flags: 4 write ports (alloc1='0 + alloc2='0 + CDB lanes 0/1), 1 read port (head)
+  mwp_dist_ram_ohread #(
       .ADDR_WIDTH     (ReorderBufferTagWidth),
       .DATA_WIDTH     (FpFlagsWidth),
       .NUM_WRITE_PORTS(4)
@@ -1156,11 +1274,12 @@ module reorder_buffer (
         i_cdb_write_2.fp_flags, i_cdb_write.fp_flags, FpFlagsWidth'(0), FpFlagsWidth'(0)
       }),
       .i_read_address(head_idx),
+      .i_read_onehot(head_clear_mask),
       .o_read_data(head_fp_flags)
   );
 
   // Widen-commit replica: head+1 read port for fp_flags.
-  mwp_dist_ram #(
+  mwp_dist_ram_ohread #(
       .ADDR_WIDTH     (ReorderBufferTagWidth),
       .DATA_WIDTH     (FpFlagsWidth),
       .NUM_WRITE_PORTS(4)
@@ -1172,6 +1291,7 @@ module reorder_buffer (
         i_cdb_write_2.fp_flags, i_cdb_write.fp_flags, FpFlagsWidth'(0), FpFlagsWidth'(0)
       }),
       .i_read_address(head_next_idx),
+      .i_read_onehot(head_next_clear_mask),
       .o_read_data(head_next_fp_flags)
   );
 
@@ -1180,7 +1300,7 @@ module reorder_buffer (
   // branches/JALR write their resolved target on branch update. Split the
   // field across two single-write memories and select at the head instead of
   // paying the timing cost of a 2-write-port LVT RAM here.
-  mwp_dist_ram #(
+  mwp_dist_ram_ohread #(
       .ADDR_WIDTH     (ReorderBufferTagWidth),
       .DATA_WIDTH     (XLEN),
       .NUM_WRITE_PORTS(2)
@@ -1190,11 +1310,12 @@ module reorder_buffer (
       .i_write_address({tail_idx_2, tail_idx}),
       .i_write_data   ({alloc_branch_target_data_2, alloc_branch_target_data}),
       .i_read_address (head_idx),
+      .i_read_onehot  (head_clear_mask),
       .o_read_data    (head_branch_target_jal)
   );
 
   // Widen-commit replica: head+1 read port for branch_target_jal.
-  mwp_dist_ram #(
+  mwp_dist_ram_ohread #(
       .ADDR_WIDTH     (ReorderBufferTagWidth),
       .DATA_WIDTH     (XLEN),
       .NUM_WRITE_PORTS(2)
@@ -1204,6 +1325,7 @@ module reorder_buffer (
       .i_write_address({tail_idx_2, tail_idx}),
       .i_write_data   ({alloc_branch_target_data_2, alloc_branch_target_data}),
       .i_read_address (head_next_idx),
+      .i_read_onehot  (head_next_clear_mask),
       .o_read_data    (head_next_branch_target_jal)
   );
 
@@ -1233,7 +1355,7 @@ module reorder_buffer (
   );
 
   // CSR address RAM (12-bit, written at allocation)
-  mwp_dist_ram #(
+  mwp_dist_ram_ohread #(
       .ADDR_WIDTH     (ReorderBufferTagWidth),
       .DATA_WIDTH     (12),
       .NUM_WRITE_PORTS(2)
@@ -1243,11 +1365,12 @@ module reorder_buffer (
       .i_write_address({tail_idx_2, tail_idx}),
       .i_write_data   ({i_alloc_req_2.csr_addr, i_alloc_req.csr_addr}),
       .i_read_address (head_idx),
+      .i_read_onehot  (head_clear_mask),
       .o_read_data    (head_csr_addr)
   );
 
   // Widen-commit replica: head+1 read port for csr_addr.
-  mwp_dist_ram #(
+  mwp_dist_ram_ohread #(
       .ADDR_WIDTH     (ReorderBufferTagWidth),
       .DATA_WIDTH     (12),
       .NUM_WRITE_PORTS(2)
@@ -1257,11 +1380,12 @@ module reorder_buffer (
       .i_write_address({tail_idx_2, tail_idx}),
       .i_write_data   ({i_alloc_req_2.csr_addr, i_alloc_req.csr_addr}),
       .i_read_address (head_next_idx),
+      .i_read_onehot  (head_next_clear_mask),
       .o_read_data    (head_next_csr_addr)
   );
 
   // CSR op RAM (3-bit funct3, written at allocation)
-  mwp_dist_ram #(
+  mwp_dist_ram_ohread #(
       .ADDR_WIDTH     (ReorderBufferTagWidth),
       .DATA_WIDTH     (3),
       .NUM_WRITE_PORTS(2)
@@ -1271,11 +1395,12 @@ module reorder_buffer (
       .i_write_address({tail_idx_2, tail_idx}),
       .i_write_data   ({i_alloc_req_2.csr_op, i_alloc_req.csr_op}),
       .i_read_address (head_idx),
+      .i_read_onehot  (head_clear_mask),
       .o_read_data    (head_csr_op)
   );
 
   // Widen-commit replica: head+1 read port for csr_op.
-  mwp_dist_ram #(
+  mwp_dist_ram_ohread #(
       .ADDR_WIDTH     (ReorderBufferTagWidth),
       .DATA_WIDTH     (3),
       .NUM_WRITE_PORTS(2)
@@ -1285,11 +1410,12 @@ module reorder_buffer (
       .i_write_address({tail_idx_2, tail_idx}),
       .i_write_data   ({i_alloc_req_2.csr_op, i_alloc_req.csr_op}),
       .i_read_address (head_next_idx),
+      .i_read_onehot  (head_next_clear_mask),
       .o_read_data    (head_next_csr_op)
   );
 
   // CSR write data RAM (32-bit, written at allocation)
-  mwp_dist_ram #(
+  mwp_dist_ram_ohread #(
       .ADDR_WIDTH     (ReorderBufferTagWidth),
       .DATA_WIDTH     (XLEN),
       .NUM_WRITE_PORTS(2)
@@ -1299,11 +1425,12 @@ module reorder_buffer (
       .i_write_address({tail_idx_2, tail_idx}),
       .i_write_data   ({i_alloc_req_2.csr_write_data, i_alloc_req.csr_write_data}),
       .i_read_address (head_idx),
+      .i_read_onehot  (head_clear_mask),
       .o_read_data    (head_csr_write_data)
   );
 
   // Widen-commit replica: head+1 read port for csr_write_data.
-  mwp_dist_ram #(
+  mwp_dist_ram_ohread #(
       .ADDR_WIDTH     (ReorderBufferTagWidth),
       .DATA_WIDTH     (XLEN),
       .NUM_WRITE_PORTS(2)
@@ -1313,6 +1440,7 @@ module reorder_buffer (
       .i_write_address({tail_idx_2, tail_idx}),
       .i_write_data   ({i_alloc_req_2.csr_write_data, i_alloc_req.csr_write_data}),
       .i_read_address (head_next_idx),
+      .i_read_onehot  (head_next_clear_mask),
       .o_read_data    (head_next_csr_write_data)
   );
 
@@ -1659,28 +1787,63 @@ module reorder_buffer (
   // The old branch_update collision guard (which delayed commit when a
   // mispredicted branch resolved via CDB in the same cycle as commit) is
   // removed: (a) JAL — the stated motivation — never produces branch_update
-  // (is_jal_issue is excluded); (b) for conditional branches, the
-  // rob_head_commit_misprediction_candidate check in early_mispredict_fire
-  // already blocks the early-recovery race; (c) removing the guard breaks
+  // (is_jal_issue is excluded); (b) a conditional branch cannot resolve and
+  // commit in the same cycle (head_cdb_bypass excludes branches, so its done
+  // bit trails branch_update by one cycle), and an early_mispredict_fire
+  // coinciding with a head-mispredict commit is dropped one cycle later by
+  // the !mispredict_recovery_pending term in early_mispredict_active
+  // (early_misprediction_recovery.sv) — the fire-time candidate gate this
+  // comment used to cite no longer exists; (c) removing the guard breaks
   // the commit_en ↔ branch_update critical path (19 LUT levels through the
   // CARRY8 branch-target comparison).
-  assign commit_en = head_ready && !head_exception && !commit_stall && !i_commit_hold &&
-                     !i_early_recovery_en && !i_flush_all && !flush_after_head_commit;
+  // !i_flush_en is REQUIRED for serializing correctness, not just a flush guard.
+  // rob_serializer only recognizes a serial head (CSR/FENCE/FENCE.I/WFI/MRET)
+  // while !i_flush_en (rob_serializer.sv SERIAL_IDLE guard).  During an
+  // early-backend-recovery / mispredict-recovery bubble (i_flush_en=1) the
+  // serializer therefore leaves commit_stall=0 for a head FENCE.I, so without
+  // this term commit_en would RETIRE the FENCE.I unserialized -- skipping the
+  // cache sync (L1D writeback-all + L1I invalidate-all) entirely and letting a
+  // post-fence fetch read pre-fence code (the SMC bug).  Gating commit on
+  // !i_flush_en keeps commit_en a subset of the serializer's guard, so a serial
+  // head can never RETIRE during the bubble; it commits (and is serialized)
+  // after the bubble clears.  The bubble is a fixed hold (early-backend /
+  // mispredict recovery), never waiting on the head committing -> no deadlock.
+  // TIMING (late-side factoring): commit_en and every commit_stall-qualified
+  // derivative are written as <kept early aggregate> && !commit_stall.  The
+  // conjunct SETS are identical to the flat originals (pure AND
+  // re-association; AND is associative/commutative, so the value is
+  // bit-identical for every input combination).  All early conjuncts are
+  // register-sourced and settle well before commit_stall's interrupt arc, so
+  // the late arc traverses exactly one LUT per gate — restoring (and slightly
+  // beating) the baseline netlist's shape, where commit_stall entered the
+  // second-to-last commit_en LUT and the derivatives chained behind the
+  // commit_en broadcast.
+  assign commit_ready_early = head_ready && !head_exception && !i_commit_hold &&
+                              !i_early_recovery_en && !i_flush_en && !i_flush_all &&
+                              !flush_after_head_commit;
+  assign commit_en = commit_ready_early && !commit_stall;
 
   // Raw misprediction at commit (early_recovered handled externally by cpu_ooo)
   assign commit_misprediction = head_is_branch && head_mispredicted;
   assign o_commit_valid_raw = commit_en;
-  assign o_commit_store_like_raw = commit_en && (head_is_store || head_is_fp_store || head_is_sc);
-  assign o_commit_misprediction_raw = commit_en && commit_misprediction && !head_early_recovered;
-  assign o_commit_correct_branch_raw = commit_en && head_has_checkpoint &&
+  assign commit_store_like_early =
+      commit_ready_early && (head_is_store || head_is_fp_store || head_is_sc);
+  assign o_commit_store_like_raw = commit_store_like_early && !commit_stall;
+  assign commit_mispredict_early =
+      commit_ready_early && commit_misprediction && !head_early_recovered;
+  assign o_commit_misprediction_raw = commit_mispredict_early && !commit_stall;
+  assign commit_correct_branch_early = commit_ready_early && head_has_checkpoint &&
                                        !commit_misprediction && !head_early_recovered;
+  assign o_commit_correct_branch_raw = commit_correct_branch_early && !commit_stall;
   // Same-cycle head-mispredict indicator without the branch_update collision
   // term. Outer control logic uses this to suppress younger branch resolution
   // without feeding branch_update back into commit_en.
-  assign o_head_commit_misprediction_candidate =
-      head_ready && !commit_stall && !i_commit_hold && !i_early_recovery_en &&
-      !i_flush_all && !flush_after_head_commit &&
+  // (Same factoring; note the original conjunct set has no !head_exception.)
+  assign head_mispredict_candidate_early =
+      head_ready && !i_commit_hold && !i_early_recovery_en &&
+      !i_flush_en && !i_flush_all && !flush_after_head_commit &&
       commit_misprediction && !head_early_recovered;
+  assign o_head_commit_misprediction_candidate = head_mispredict_candidate_early && !commit_stall;
 
   // ===========================================================================
   // External Coordination Outputs
@@ -1693,14 +1856,35 @@ module reorder_buffer (
                        head_is_csr && !head_exception &&
                        !i_flush_en && !i_flush_all;
 
-  // MRET execution signal - asserted when entering MRET_EXEC state.
+  // MRET execution signal - asserted when entering MRET_EXEC and SUSTAINED while
+  // waiting there for committed stores to drain.
+  //
+  // take_mret (trap_unit) only fires when i_sq_committed_empty is high IN THE
+  // SAME CYCLE as o_mret_start, and it has no retry. Without the
+  // SERIAL_MRET_EXEC sustaining term o_mret_start is a one-cycle pulse on the
+  // IDLE->MRET_EXEC cycle: if a committed store is still draining then, take_mret
+  // misses its only chance and the serializer wedges in SERIAL_MRET_EXEC forever
+  // (no later flush can rescue it -- the stuck MRET never restores MIE, so no
+  // interrupt becomes eligible to flush it). The sustaining term mirrors
+  // o_trap_pending (below) and lets take_mret retry every cycle until the SQ
+  // drains.
+  //
+  // The i_sq_committed_empty gate keeps o_mret_start (hence i_mret_start ->
+  // trap_drain_wait -> i_commit_hold) low during the drain wait, which (a)
+  // prevents a commit-hold/o_mret_start f/2 oscillation and (b) keeps mret_taken
+  // a single-cycle pulse so flush_all fires exactly once. It is free on the
+  // common path: a retiring MRET normally finds the committed SQ already empty.
+  //
   // Note: !i_flush_en/!i_flush_all intentionally omitted — flush signals are
   // derived from mret_taken which is derived from o_mret_start, so gating
   // by them creates an oscillating combinational loop.
-  assign o_mret_start = (serial_state == riscv_pkg::SERIAL_IDLE) && head_ready &&
+  assign o_mret_start = ((serial_state == riscv_pkg::SERIAL_IDLE) ||
+                         (serial_state == riscv_pkg::SERIAL_MRET_EXEC)) &&
+                        head_ready &&
                         !i_commit_hold &&
                         !i_early_recovery_en &&
-                        head_is_mret && !head_exception;
+                        head_is_mret && !head_exception &&
+                        i_sq_committed_empty;
 
   // Trap pending signal - asserted when exception at head.
   // Note: during the IDLE->TRAP_WAIT transition, both the state check and the
@@ -1716,9 +1900,34 @@ module reorder_buffer (
       (serial_state == riscv_pkg::SERIAL_TRAP_WAIT) ||
       (head_ready && !i_commit_hold && !i_early_recovery_en && head_exception);
   assign o_trap_pc = head_pc;
+  // WFI interrupt-resume-PC seed (Bug#2): expose that the ROB head is a WFI so
+  // cpu_ooo can seed interrupt_resume_pc = wfi_pc+4 while the WFI stalls at the
+  // head. A machine interrupt taken at a *drain-gated* WFI (a committed store
+  // still draining) otherwise flushes the WFI before it commits, leaving
+  // interrupt_resume_pc at the pre-WFI instruction's next-PC (== the WFI's own
+  // PC) -> mepc=wfi_pc instead of the spec-required wfi_pc+4.
+  assign o_head_is_wfi = head_is_wfi;
   assign o_trap_cause = head_exc_cause;
   assign o_trap_value = head_value[XLEN-1:0];
 
+  // TIMING: retired-next-PC precompute (see port comment).  Equivalence with
+  // cpu_ooo's retired_next_pc(o_commit_comb) whenever o_commit_comb.valid:
+  //  - head MRET:  retired_next_pc returns redirect_pc, and the o_commit_comb
+  //    redirect chain puts i_mepc there for MRET (highest priority);
+  //  - head branch: retired_next_pc returns redirect_pc = taken ?
+  //    head_branch_target : head_fallthrough_pc;
+  //  - otherwise:  retired_next_pc returns pc + (is_compressed ? 2 : 4) with
+  //    is_compressed == head_is_compressed (the head_link_is_compressed arm
+  //    only applies to branches, which take the redirect arm above)
+  //    == head_fallthrough_pc.
+  // Slot 2 is never a branch/MRET by the 2-wide gate, and o_commit_comb_2
+  // zeroes is_branch/is_mret, so its next-PC is always the sequential one.
+  assign o_head_retired_next_pc =
+      head_is_mret ? i_mepc :
+      (head_is_branch && head_branch_taken) ? head_branch_target :
+      head_fallthrough_pc;
+  assign o_head_next_retired_next_pc = head_next_pc + (head_next_is_compressed ? 32'd2 : 32'd4);
+
   // FENCE.I flush signal - pulse when FENCE.I commits
   always_ff @(posedge i_clk) begin
     if (!i_rst_n) begin
@@ -1902,7 +2111,14 @@ module reorder_buffer (
   end
 
   assign o_commit_2_valid_raw = commit_2_fire;
-  assign o_commit_2_store_like_raw = commit_2_fire && (head_next_is_store || head_next_is_fp_store);
+  // TIMING (late-side factoring): commit_2_fire && X == (commit_2_ready_early
+  // && EnableWidenCommit && i_widen_commit_ok && X) && !commit_stall — same
+  // conjunct set, one late LUT.  This output feeds sq_committed_empty_for_trap
+  // (the trap arc of the uart spine) and the SQ same-cycle commit guard.
+  assign commit_2_store_like_early =
+      commit_2_ready_early && EnableWidenCommit && i_widen_commit_ok &&
+      (head_next_is_store || head_next_is_fp_store);
+  assign o_commit_2_store_like_raw = commit_2_store_like_early && !commit_stall;
 
   // Registered copy of slot 2 commit so external observers can sample it
   // after the head pointer advances.  Mirrors the o_commit register.
@@ -2026,8 +2242,8 @@ module reorder_buffer (
     // because the hazard gate (serial ops, head+1 branches, FENCE.I,
     // exceptions, AMO/LR/SC, head-mispredicting-branches) is already
     // applied.  commit_2_fire_actual additionally folds in the master
-    // enable and the cpu_ooo pending-write FIFO back-pressure term
-    // (i_widen_commit_ok) — this is what the head_ptr increment and
+    // enable and the cpu_ooo slot-2 accept term (i_widen_commit_ok,
+    // currently tied high) — this is what the head_ptr increment and
     // rob_valid clear actually use.
     o_perf_events.commit_2_opportunity = commit_2_gate;
     o_perf_events.commit_2_fire_actual = commit_2_fire;
@@ -2063,6 +2279,30 @@ module reorder_buffer (
 `ifndef SYNTHESIS
 `ifndef FORMAL
 
+  // One-hot head-image invariant (load-bearing for TIMING reads): the
+  // registered masks must mirror the binary pointers every cycle, since
+  // onehot_read() and the mwp_dist_ram_ohread LVT selects substitute them for
+  // binary head_idx / head_next_idx indexing.  Only check once reset has been
+  // observed asserted at least once: at sim time 0 the full-chip bench can
+  // present i_rst_n=1 before the reset synchronizer fires, while the mask FFs
+  // still hold their uninitialized all-zero value (which reads identically to
+  // the pre-fix binary indexing of the equally-uninitialized state).
+  logic dbg_mask_seen_reset;
+  initial dbg_mask_seen_reset = 1'b0;
+  always @(posedge i_clk) begin
+    if (!i_rst_n) dbg_mask_seen_reset <= 1'b1;
+    if (i_rst_n && dbg_mask_seen_reset) begin
+      if (head_clear_mask != (ReorderBufferDepth'(1) << head_idx)) begin
+        $error("Reorder Buffer: head_clear_mask (0x%08x) != 1 << head_idx (%0d)", head_clear_mask,
+               head_idx);
+      end
+      if (head_next_clear_mask != (ReorderBufferDepth'(1) << head_next_idx)) begin
+        $error("Reorder Buffer: head_next_clear_mask (0x%08x) != 1 << head_next_idx (%0d)",
+               head_next_clear_mask, head_next_idx);
+      end
+    end
+  end
+
   // Retire trace: log every committed instruction (for debugging)
   integer retire_trace_fd;
   initial begin
@@ -2214,6 +2454,12 @@ module reorder_buffer (
       // empty iff pointers exactly equal
       p_empty_matches_ptrs : assert (empty == (head_ptr == tail_ptr));
 
+      // Registered one-hot head images track the binary pointers exactly.
+      // The one-hot reads (onehot_read / mwp_dist_ram_ohread) rely on this.
+      p_head_mask_onehot : assert (head_clear_mask == (ReorderBufferDepth'(1) << head_idx));
+      p_head_next_mask_onehot :
+      assert (head_next_clear_mask == (ReorderBufferDepth'(1) << head_next_idx));
+
       // alloc_en implies !full
       p_alloc_not_when_full : assert (!alloc_en || !full);
 
@@ -2257,10 +2503,19 @@ module reorder_buffer (
         assert ($past(serial_state) == riscv_pkg::SERIAL_IDLE && $past(head_is_csr));
       end
 
-      // o_mret_start only in IDLE with MRET at head
+      // o_mret_start is asserted when MRET first reaches the ready head and is
+      // sustained in MRET_EXEC so trap_unit can retry after committed SQ drain.
       if ($past(o_mret_start)) begin
         p_mret_start_contract :
-        assert ($past(serial_state) == riscv_pkg::SERIAL_IDLE && $past(head_is_mret));
+        assert (($past(
+            serial_state
+        ) == riscv_pkg::SERIAL_IDLE || $past(
+            serial_state
+        ) == riscv_pkg::SERIAL_MRET_EXEC) && $past(
+            head_is_mret
+        ) && $past(
+            i_sq_committed_empty
+        ));
       end
 
       // o_fence_i_flush is registered (one cycle after commit of FENCE.I)
diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/README.md b/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/README.md
index 8b74a169..5b53bef7 100644
--- a/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/README.md
+++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/README.md
@@ -101,15 +101,21 @@ Back-pressure is therefore only ever conservatively long, never short.
 ## Widen-commit slot 2
 
 The SQ accepts a parallel slot-2 commit port
-(`i_commit_valid_2`, `i_commit_rob_tag_2`, plus combinational twin
-for the same-cycle partial-flush guard). Slot 2 only ever retires
+(`i_commit_valid_2`, `i_commit_rob_tag_2`, plus a combinational twin
+for the same-cycle flush guard). Slot 2 only ever retires
 plain stores — SC / AMO are forced onto slot 1 by the ROB's
 widen-commit hazard gate — so there's no SC-discard path sharing.
 Forwarding scans both slot 1 and slot 2 commits in the same cycle.
+The wrapper now actually drives the combinational twin
+(`i_commit_valid_comb_2` / `i_commit_rob_tag_comb_2`, previously tied to
+`1'b0`); without it a full-flush trap (e.g. a machine-timer IRQ) could
+observe committed-empty and drop a head+1 store the SQ has not yet seen on
+the registered commit path.
 
 ## Same-cycle commit hazard
 
-When a partial flush and a ROB commit fire on the same cycle, the
+When any same-cycle flush races a registered ROB commit — partial-flush
+misprediction recovery and full-flush trap / MRET / FENCE.I drains alike — the
 registered commit signal is one cycle behind the flush, which means
 the flush could otherwise wipe out a store that's being committed
 right then. The SQ takes a combinational commit guard from the ROB
diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/sq_forwarding_unit.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/sq_forwarding_unit.sv
index 06c11b61..f4349357 100644
--- a/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/sq_forwarding_unit.sv
+++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/sq_forwarding_unit.sv
@@ -73,6 +73,29 @@ module sq_forwarding_unit #(
   localparam int unsigned WordAddrWidth = XLEN - 2;
   localparam int unsigned IdxWidth = $clog2(DEPTH);
 
+  typedef struct packed {
+    logic                           valid;
+    logic [ReorderBufferTagWidth:0] age;
+    logic                           can_forward;
+    logic [IdxWidth-1:0]            idx;
+    logic [1:0]                     extract_type;
+  } fwd_winner_t;
+
+  function automatic fwd_winner_t choose_newer_winner(input fwd_winner_t lhs,
+                                                      input fwd_winner_t rhs);
+    begin
+      if (!lhs.valid) begin
+        choose_newer_winner = rhs;
+      end else if (!rhs.valid) begin
+        choose_newer_winner = lhs;
+      end else if (rhs.age >= lhs.age) begin
+        choose_newer_winner = rhs;
+      end else begin
+        choose_newer_winner = lhs;
+      end
+    end
+  endfunction
+
   // Forwarding scan result index (drives the SQ data-RAM read address in parent)
   logic [IdxWidth-1:0] fwd_match_idx;
 
@@ -184,6 +207,12 @@ module sq_forwarding_unit #(
   logic [ReorderBufferTagWidth:0] fwd_load_age;
   logic [ReorderBufferTagWidth:0] fwd_entry_age[DEPTH];
   logic [1:0] fwd_entry_extract_type[DEPTH];
+`ifndef FORMAL
+  fwd_winner_t fwd_leaf[DEPTH];
+  fwd_winner_t fwd_pair[4];
+  fwd_winner_t fwd_quad[2];
+  fwd_winner_t fwd_winner;
+`endif
 
   assign fwd_load_byte_mask = gen_byte_en(i_sq_check_addr[1:0], i_sq_check_size);
   assign fwd_load_age = {1'b0, i_sq_check_rob_tag} - {1'b0, i_rob_head_tag};
@@ -305,25 +334,60 @@ module sq_forwarding_unit #(
   // Block 2: newest conflicting store wins for data/extract selection. The
   // heavy address/age qualification is already parallelized above, so this
   // block only prioritizes 1-bit match results and their precomputed metadata.
+`ifdef FORMAL
+  // Yosys's formal frontend currently mishandles the balanced tree's unpacked
+  // array of packed structs, treating fields such as fwd_leaf[i].can_forward
+  // as implicit wires. Use an equivalent linear selector for formal only; the
+  // synthesized implementation below remains the timing-optimized tree.
+  logic fwd_formal_winner_valid;
+  logic [ReorderBufferTagWidth:0] fwd_formal_winner_age;
+
   always_comb begin
-    logic have_winner;
-    logic [ReorderBufferTagWidth:0] winner_age;
-
-    have_winner      = 1'b0;
-    winner_age       = '0;
-    fwd_can_fwd      = 1'b0;
-    fwd_match_idx    = '0;
-    fwd_extract_type = 2'd0;
+    fwd_formal_winner_valid = 1'b0;
+    fwd_formal_winner_age   = '0;
+    fwd_can_fwd             = 1'b0;
+    fwd_match_idx           = '0;
+    fwd_extract_type        = 2'd0;
+
     for (int unsigned i = 0; i < DEPTH; i++) begin
-      if (fwd_conflict_mask[i] && (!have_winner || (fwd_entry_age[i] >= winner_age))) begin
-        have_winner      = 1'b1;
-        winner_age       = fwd_entry_age[i];
-        fwd_can_fwd      = fwd_can_forward_mask[i];
-        fwd_match_idx    = IdxWidth'(i);
-        fwd_extract_type = fwd_entry_extract_type[i];
+      if (fwd_conflict_mask[i] &&
+          (!fwd_formal_winner_valid || (fwd_entry_age[i] >= fwd_formal_winner_age))) begin
+        fwd_formal_winner_valid = 1'b1;
+        fwd_formal_winner_age   = fwd_entry_age[i];
+        fwd_can_fwd             = fwd_can_forward_mask[i];
+        fwd_match_idx           = IdxWidth'(i);
+        fwd_extract_type        = fwd_entry_extract_type[i];
       end
     end
   end
+`else
+  // Keep this as a balanced tree: the old serial loop let an SQ-check address
+  // bit feed each entry's conflict logic and then walk an 8-entry winner chain
+  // before reaching o_sq_forward.can_forward.
+  always_comb begin
+    for (int unsigned i = 0; i < DEPTH; i++) begin
+      fwd_leaf[i].valid        = fwd_conflict_mask[i];
+      fwd_leaf[i].age          = fwd_entry_age[i];
+      fwd_leaf[i].can_forward  = fwd_can_forward_mask[i];
+      fwd_leaf[i].idx          = IdxWidth'(i);
+      fwd_leaf[i].extract_type = fwd_entry_extract_type[i];
+    end
+
+    fwd_pair[0]      = choose_newer_winner(fwd_leaf[0], fwd_leaf[1]);
+    fwd_pair[1]      = choose_newer_winner(fwd_leaf[2], fwd_leaf[3]);
+    fwd_pair[2]      = choose_newer_winner(fwd_leaf[4], fwd_leaf[5]);
+    fwd_pair[3]      = choose_newer_winner(fwd_leaf[6], fwd_leaf[7]);
+
+    fwd_quad[0]      = choose_newer_winner(fwd_pair[0], fwd_pair[1]);
+    fwd_quad[1]      = choose_newer_winner(fwd_pair[2], fwd_pair[3]);
+
+    fwd_winner       = choose_newer_winner(fwd_quad[0], fwd_quad[1]);
+
+    fwd_can_fwd      = fwd_winner.valid && fwd_winner.can_forward;
+    fwd_match_idx    = fwd_winner.idx;
+    fwd_extract_type = fwd_winner.extract_type;
+  end
+`endif
 
   // Block 3: Registered forwarding outputs.
   // Keep the SQ compare/forwarding result behind a register so the LQ sees it
@@ -336,7 +400,7 @@ module sq_forwarding_unit #(
     end else begin
       o_sq_all_older_addrs_known <= i_sq_check_valid ? fwd_all_older_known : 1'b0;
       o_sq_forward.match         <= i_sq_check_valid ? fwd_found_match : 1'b0;
-      o_sq_forward.can_forward   <= i_sq_check_valid ? (fwd_found_match && fwd_can_fwd) : 1'b0;
+      o_sq_forward.can_forward   <= i_sq_check_valid ? fwd_can_fwd : 1'b0;
     end
 
     case (fwd_extract_type)
diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/store_queue.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/store_queue.sv
index ffdc8b85..d599d050 100644
--- a/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/store_queue.sv
+++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/store_queue.sv
@@ -22,7 +22,7 @@
  * them (non-speculative writes). Supports store-to-load forwarding.
  *
  * Features:
- *   - Parameterized depth (8 entries, FF-based)
+ *   - Parameterized depth (8 entries, hybrid FF + LUTRAM; see Storage Strategy)
  *   - CAM-style tag search for address/data update (all entries in parallel)
  *   - In-order commit: head entry writes to memory when committed + ready
  *   - Store-to-load forwarding: combinational scan for LQ disambiguation
@@ -1174,14 +1174,22 @@ module store_queue #(
   always @(posedge i_clk) begin
     if (i_rst_n) begin
       if (i_alloc.valid && full) $warning("SQ: allocation attempted when full");
-      if (i_alloc.valid && (i_flush_all || i_flush_en))
-        $warning("SQ: allocation attempted during flush");
+      // Only PARTIAL flush (i_flush_en) is dangerous: there the alloc block in
+      // the !flush_all else-branch actually LANDS (sets sq_valid, line ~1060).
+      // i_flush_all is intentionally excluded — its priority else-if branches
+      // (lines ~859, ~1027) structurally squash the alloc (sq_valid <= '0), a
+      // documented-safe, formally-proven (p_alloc_slot_free) handshake that the
+      // RS issues un-flush-gated for timing closure (see note ~line 1263). The
+      // old (i_flush_all||i_flush_en) form fired ~1178x/run on the benign
+      // flush_all handshake, burying the genuinely-unsafe flush_en case.
+      if (i_alloc.valid && i_flush_en && !i_flush_all)
+        $warning("SQ: allocation attempted during partial flush");
       if (i_alloc_2.valid && i_alloc.valid && full_for_2)
         $warning("SQ: slot-2 alloc attempted when full_for_2 (and slot-1 firing)");
       if (i_alloc_2.valid && !i_alloc.valid && full)
         $warning("SQ: slot-2 alloc attempted alone when full");
-      if (i_alloc_2.valid && (i_flush_all || i_flush_en))
-        $warning("SQ: slot-2 alloc attempted during flush");
+      if (i_alloc_2.valid && i_flush_en && !i_flush_all)
+        $warning("SQ: slot-2 alloc attempted during partial flush");
       if (slot1_alloc_en && slot2_alloc_en && (alloc_target[IdxWidth-1:0] == slot2_alloc_idx))
         $error("SQ: slot-1 and slot-2 alloc collide on entry %0d", alloc_target[IdxWidth-1:0]);
     end
@@ -1356,7 +1364,8 @@ module store_queue #(
     end
   end
 
-  // Forwarding outputs are registered, so they reflect the previous check.
+  // Forwarding outputs are driven from staged SQ CAM results, so they reflect
+  // the previous check.
   always @(posedge i_clk) begin
     if (f_past_valid && i_rst_n && $past(
             i_rst_n
diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo.f b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo.f
index e0718221..fbac7638 100644
--- a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo.f
+++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo.f
@@ -9,6 +9,7 @@
 $(ROOT)/hw/rtl/lib/ram/sdp_dist_ram_2r.sv
 $(ROOT)/hw/rtl/lib/ram/mwp_dist_ram.sv
 $(ROOT)/hw/rtl/lib/ram/mwp_dist_ram_2r.sv
+$(ROOT)/hw/rtl/lib/ram/mwp_dist_ram_ohread.sv
 
 # Reorder Buffer
 $(ROOT)/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/rob_serializer.sv
diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/README.md b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/README.md
index db64dcab..2b646992 100644
--- a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/README.md
+++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/README.md
@@ -17,7 +17,7 @@ verbatim, so the flattened design is unchanged:
 | `commit_bus_pipeline` | `commit_bus/` | The four `always_ff` that register the combinational ROB commit bus into `commit_bus_q` / `commit_bus_2_q` plus the decomposed `commit_q_*` fields. |
 | `sq_early_addr_pipeline` | `store_addr/` | The dual-ported early store-address stage (register dispatch base+imm, add the next cycle off the dispatch critical path) that produces the two SQ early-address update packets. |
 | `dispatch_rs_router` | `dispatch_routing/` | Combinational decode of the dispatch packet(s) into per-RS dispatch-valid signals (slot 1 + slot 2) and the fast slot-1 "intent" signals. |
-| `sc_pending_unit` | `atomics/` | Store-conditional resolution: the SC pending-register FSM (set at MEM_RS SC issue, cleared on fire / flush / age), its rob_tag+addr capture, the fire/success decode, and the `sc_fu_complete` packet. |
+| `sc_pending_unit` | `atomics/` | Store-conditional resolution: a per-ROB-tag table of in-flight SCs (allocated at MEM_RS SC issue, freed on fire / flush), the head-match fire/success decode, and the `sc_fu_complete` packet. |
 
 The per-RS dispatch-valid nets in `dispatch_rs_router` carry `(* max_fanout =
 32 *)`; the attribute is preserved both in the submodule and on the wrapper-side
@@ -48,7 +48,7 @@ while the entry was queued gets a fresh value.
 
 ### SC state machine
 
-The SC pending FSM and its fire/success decode live in
+The SC tracking table and its fire/success decode live in
 `atomics/sc_pending_unit.sv`; the surrounding store-misalign path and MEM-adapter
 mux described below stay in the wrapper.
 
@@ -61,6 +61,20 @@ result is just `~reservation_valid`. On failure, the wrapper sends
 a discard signal to the SQ to drop the SC's entry without writing
 memory.
 
+Several SCs can be in flight at once: a branch-speculated LR/SC retry
+loop issues one SC per speculated iteration, and the MEM_RS may issue
+them out of program order. `sc_pending_unit` therefore tracks every
+in-flight SC in a small table keyed by ROB tag (depth `NumCheckpoints
++ 1`) and fires the entry whose tag matches the ROB head; a flush drops
+only entries younger than the flush boundary, so a surviving older SC
+is never lost. This replaced a single pending register plus a
+`!(sc_pending && mem_rs_next_is_sc)` issue-serialization gate in
+`mem_rs_fu_ready_base`: under speculation a younger SC could take the
+register and the gate would then block the older head SC from issuing
+at all, so it never fired and `sc_pending` never cleared — Linux
+printk's `_prb_commit` cmpxchg on the cached DDR tier deadlocked
+exactly that way. The gate is gone; the table makes concurrent SCs safe.
+
 The `sc_fu_complete` output is registered (`sc_fu_complete_reg`)
 before feeding the MEM adapter. The combinational path from the
 full-flush term `speculative_flush_all` (driven by `i_flush_all` /
@@ -96,6 +110,20 @@ misprediction-detect path in `cpu_ooo.sv`, and the CDB grants remain
 combinational so FU adapters can clear their hold registers on the same cycle as
 a grant.
 
+The registered valid outputs (`o_commit_bus_q_valid`, `o_commit_bus_2_q_valid`)
+are additionally masked combinationally with `!i_flush_all`. The valid flops
+clear on the flush edge, but downstream consumers still observe the previous
+valid value during that same cycle; masking immediately prevents a commit that
+overlaps a trap / MRET / FENCE.I full flush from performing one more
+architectural side effect while the back-end is being squashed.
+
+The wrapper also drives the SQ slot-2 combinational commit guard from the raw
+head+1 store-commit pulse (`i_commit_valid_comb_2 = commit_2_store_like_raw`,
+`i_commit_rob_tag_comb_2 = commit_bus_2.tag`; previously tied to `1'b0`/`'0`).
+Slot 2 has the same raw-commit race as slot 1: `commit_bus_2_q_valid` reaches the
+SQ one cycle late, so without this a full-flush trap (e.g. a machine-timer IRQ)
+could observe `sq_committed_empty` and squash a store the SQ has not yet owned.
+
 ### Dispatch routing
 
 Dispatch now emits already-routed per-RS packets for slot 1 and slot 2. The
diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/atomics/sc_pending_unit.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/atomics/sc_pending_unit.sv
index 170ade76..4ebe9536 100644
--- a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/atomics/sc_pending_unit.sv
+++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/atomics/sc_pending_unit.sv
@@ -17,19 +17,32 @@
 // =============================================================================
 // sc_pending_unit
 // =============================================================================
-// Extracted verbatim from tomasulo_wrapper.sv (pure RTL boundary move, zero
-// functional change).  Store-conditional (SC.W) resolution:
-//   * the SC pending register FSM (set at MEM_RS SC issue, cleared on fire /
-//     flush / age) and its data capture (rob_tag + address),
-//   * the combinational fire/success decode, and
-//   * the sc_fu_complete result packet.
-// The store-misalign exception path, the MEM-adapter input mux, and
-// lq_result_accepted remain in the wrapper; this unit consumes store_misalign_*
-// as inputs and produces sc_pending (visible to dispatch) and sc_fu_complete
-// (registered by the wrapper before the MEM adapter).
+// Store-conditional (SC.W) resolution.
 //
-// is_younger is duplicated here (it is also used elsewhere in the wrapper, and
-// the wrapper comment notes it is identical to the load_queue / RS copies).
+// In-flight SCs are tracked in a small table keyed by ROB tag, so the SC that
+// reaches the ROB head can ALWAYS fire -- even when an LR/SC retry loop is
+// branch-speculated and the core issues several SCs (one per speculated
+// iteration) before the oldest resolves. A single pending-SC register failed
+// here: under speculation the MEM_RS issues SCs out of program order, so a
+// younger SC took the one register, and the wrapper's former issue-
+// serialization gate (!(sc_pending && mem_rs_next_is_sc)) then blocked the
+// OLDER head SC from issuing at all -- so it never fired and the core
+// deadlocked. Observed on Linux printk's _prb_commit cmpxchg loop (11 SCs
+// issued, 8-deep speculation; head=tag15 never issued, the register held
+// tag19). This table pairs with removing that gate (see tomasulo_wrapper.sv).
+// BRAM LR/SC resolves before a second SC issues, so BRAM/FreeRTOS were
+// unaffected; the longer cached-tier (DDR) latency exposes the overlap.
+//
+// Two flush rules matter and were both bugs in the single-register version:
+//   * an SC fires when head_tag matches a VALID entry and the SQ is drained;
+//   * an entry is cleared on a flush ONLY if it is younger than the flush
+//     boundary (is_younger) -- NOT unconditionally on partial flush, which
+//     would drop a surviving older SC.
+// Depth = NumCheckpoints + 1 (branch speculation depth bounds concurrent SCs).
+//
+// The store-misalign exception path, MEM-adapter input mux, and
+// lq_result_accepted remain in the wrapper. is_younger is duplicated here
+// (identical to the load_queue / RS copies).
 // =============================================================================
 module sc_pending_unit (
     input logic i_clk,
@@ -56,8 +69,7 @@ module sc_pending_unit (
 );
 
   // ---------------------------------------------------------------------------
-  // Alias input ports back to the wrapper's local names so the bodies below are
-  // byte-identical to the original tomasulo_wrapper logic.
+  // Alias input ports back to the wrapper's local names.
   // ---------------------------------------------------------------------------
   logic [riscv_pkg::ReorderBufferTagWidth-1:0] head_tag;
   logic sq_committed_empty;
@@ -86,12 +98,13 @@ module sc_pending_unit (
   assign speculative_flush_en = i_speculative_flush_en;
   assign speculative_partial_flush = i_speculative_partial_flush;
 
-  // SC pending state (rob_tag / addr are internal; sc_pending is also output)
-  logic sc_pending;
-  logic [riscv_pkg::ReorderBufferTagWidth-1:0] sc_pending_rob_tag;
-  logic [riscv_pkg::XLEN-1:0] sc_pending_addr;
+  // SC tracking table: one entry per in-flight SC, keyed by ROB tag.
+  localparam int unsigned ScTableDepth = riscv_pkg::NumCheckpoints + 1;
+  logic [ScTableDepth-1:0] sct_valid;
+  logic [riscv_pkg::ReorderBufferTagWidth-1:0] sct_tag[ScTableDepth];
+  logic [riscv_pkg::XLEN-1:0] sct_addr[ScTableDepth];
 
-  // Age comparison for SC flush guard (identical to load_queue/reservation_station)
+  // Age comparison for the SC flush guard (identical to load_queue / RS).
   function automatic logic is_younger(input logic [riscv_pkg::ReorderBufferTagWidth-1:0] entry_tag,
                                       input logic [riscv_pkg::ReorderBufferTagWidth-1:0] flush_tag,
                                       input logic [riscv_pkg::ReorderBufferTagWidth-1:0] head);
@@ -104,70 +117,111 @@ module sc_pending_unit (
     end
   endfunction
 
+  // Head match: an in-flight SC sits at the ROB head.
+  logic                       sct_hit;
+  logic [riscv_pkg::XLEN-1:0] sct_hit_addr;
+  logic [   ScTableDepth-1:0] sct_hit_oh;
+  always_comb begin
+    sct_hit      = 1'b0;
+    sct_hit_addr = '0;
+    sct_hit_oh   = '0;
+    for (int i = 0; i < ScTableDepth; i++) begin
+      if (sct_valid[i] && (sct_tag[i] == head_tag)) begin
+        sct_hit       = 1'b1;
+        sct_hit_addr  = sct_addr[i];
+        sct_hit_oh[i] = 1'b1;
+      end
+    end
+  end
+
+  // First free slot for a newly-issued SC.
+  logic                    sct_has_free;
+  logic [ScTableDepth-1:0] sct_free_oh;
+  always_comb begin
+    sct_has_free = 1'b0;
+    sct_free_oh  = '0;
+    for (int i = 0; i < ScTableDepth; i++) begin
+      if (!sct_valid[i] && !sct_has_free) begin
+        sct_has_free   = 1'b1;
+        sct_free_oh[i] = 1'b1;
+      end
+    end
+  end
+  // Capture an issuing SC. Reject a phantom SC only when it is younger than the
+  // flush boundary (it is being killed); a real SC that survives the flush must
+  // be captured even if its issue coincides with the flush window.
+  logic sct_alloc;
+  assign sct_alloc = o_mem_rs_issue.valid && !speculative_flush_all &&
+      (o_mem_rs_issue.op == riscv_pkg::SC_W) &&
+      !(speculative_flush_en && is_younger(
+      o_mem_rs_issue.rob_tag, i_flush_tag, head_tag
+  ));
+
   logic sc_can_fire;
   logic sc_success;
   logic sc_fire_now;
 
-  assign sc_can_fire = sc_pending && (sc_pending_rob_tag == head_tag) && sq_committed_empty;
+  assign sc_can_fire = sct_hit && sq_committed_empty;
   assign sc_success = lq_reservation_valid
-      && (lq_reservation_addr[riscv_pkg::XLEN-1:2] == sc_pending_addr[riscv_pkg::XLEN-1:2]);
-  // Arm SC only when the MEM adapter has no competing same-cycle producer.
-  // This keeps the rare SC head-tag compare local to the SC register D path;
-  // the registered completion below owns the MEM adapter on the next cycle.
+      && (lq_reservation_addr[riscv_pkg::XLEN-1:2] == sct_hit_addr[riscv_pkg::XLEN-1:2]);
+  // Arm SC only when the MEM adapter has no competing same-cycle producer; the
+  // registered completion below owns the MEM adapter on the next cycle.
   assign sc_fire_now = sc_can_fire &&
                        !mem_adapter_result_pending &&
                        !lq_fu_complete.valid &&
                        !store_misalign_issue &&
                        !store_misalign_fu_complete_reg.valid;
 
-  // SC fu_complete generation
+  // SC fu_complete generation. The firing SC's tag IS head_tag (it matched).
   riscv_pkg::fu_complete_t sc_fu_complete;
   always_comb begin
     sc_fu_complete       = '0;
     sc_fu_complete.valid = sc_fire_now;
-    sc_fu_complete.tag   = sc_pending_rob_tag;
+    sc_fu_complete.tag   = head_tag;
     sc_fu_complete.value = {{(riscv_pkg::FLEN - 1) {1'b0}}, ~sc_success};
   end
 
+  // Table valid bits: allocate on SC issue, free on fire, flush younger entries.
   always_ff @(posedge i_clk) begin
-    if (!i_rst_n) begin
-      sc_pending <= 1'b0;
-    end else if (speculative_flush_all) begin
-      sc_pending <= 1'b0;
+    if (!i_rst_n || speculative_flush_all) begin
+      sct_valid <= '0;
     end else begin
-      // Set when MEM_RS issues SC.  Gate with flush signals because
-      // the RS output valid is no longer suppressed during flush for
-      // timing closure — a phantom SC set during partial flush would
-      // leave sc_pending stuck (the flushed tag never reaches head).
-      if (o_mem_rs_issue.valid && !speculative_flush_all && !speculative_flush_en
-          && (o_mem_rs_issue.op == riscv_pkg::SC_W)) begin
-        sc_pending <= 1'b1;
+      // Clear ONLY entries younger than the flush boundary (i_flush_tag) -- i.e.
+      // actually being flushed. Do NOT clear on speculative_partial_flush alone:
+      // an SC older than the mispredicted branch (e.g. one still waiting for the
+      // head to reach it on the slow cached tier) must survive.
+      if (i_flush_en) begin
+        for (int i = 0; i < ScTableDepth; i++) begin
+          if (sct_valid[i] && is_younger(sct_tag[i], i_flush_tag, head_tag)) begin
+            sct_valid[i] <= 1'b0;
+          end
+        end
       end
-      // Clear when SC fu_complete is armed for the registered MEM path.
+      // Free the firing entry.
       if (sc_fire_now) begin
-        sc_pending <= 1'b0;
+        for (int i = 0; i < ScTableDepth; i++) if (sct_hit_oh[i]) sct_valid[i] <= 1'b0;
       end
-      // A pending SC is speculative if it is younger than the flush boundary,
-      // or if recovery is draining everything younger than the current/just-
-      // retired head.
-      if (i_flush_en && sc_pending && (speculative_partial_flush || is_younger(
-              sc_pending_rob_tag, i_flush_tag, head_tag
-          ))) begin
-        sc_pending <= 1'b0;
+      // Allocate a newly-issued SC into the first free slot. (Alloc targets a
+      // free slot; fire/flush clear valid slots, so the indices never collide.)
+      if (sct_alloc && sct_has_free) begin
+        for (int i = 0; i < ScTableDepth; i++) if (sct_free_oh[i]) sct_valid[i] <= 1'b1;
       end
     end
   end
 
-  // SC data capture (no reset - gated by sc_pending)
+  // SC tag/addr capture (no reset; gated by the alloc one-hot).
   always_ff @(posedge i_clk) begin
-    if (o_mem_rs_issue.valid && !speculative_flush_all && !speculative_flush_en
-        && (o_mem_rs_issue.op == riscv_pkg::SC_W)) begin
-      sc_pending_rob_tag <= o_mem_rs_issue.rob_tag;
-      sc_pending_addr    <= sq_effective_addr;
+    if (sct_alloc && sct_has_free) begin
+      for (int i = 0; i < ScTableDepth; i++) begin
+        if (sct_free_oh[i]) begin
+          sct_tag[i]  <= o_mem_rs_issue.rob_tag;
+          sct_addr[i] <= sq_effective_addr;
+        end
+      end
     end
   end
 
-  assign o_sc_pending     = sc_pending;
+  assign o_sc_pending     = |sct_valid;
   assign o_sc_fu_complete = sc_fu_complete;
 
 endmodule
diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/commit_bus/commit_bus_pipeline.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/commit_bus/commit_bus_pipeline.sv
index 094e27aa..0b900402 100644
--- a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/commit_bus/commit_bus_pipeline.sv
+++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/commit_bus/commit_bus_pipeline.sv
@@ -117,9 +117,13 @@ module commit_bus_pipeline (
     commit_q_2_is_store_like <= commit_bus_2.is_store || commit_bus_2.is_fp_store;
   end
 
-  // Drive the output ports from the registered locals.
+  // Drive the output ports from the registered locals.  The flops above clear
+  // valid on the flush edge, but consumers see the previous valid value during
+  // that same cycle.  Mask the qualified valid outputs immediately so a
+  // commit that overlaps a trap/MRET/FENCE.I full flush cannot perform one
+  // more architectural side effect while the backend is being squashed.
   assign o_commit_bus_q             = commit_bus_q;
-  assign o_commit_bus_q_valid       = commit_bus_q_valid;
+  assign o_commit_bus_q_valid       = commit_bus_q_valid && !i_flush_all;
   assign o_commit_q_dest_valid      = commit_q_dest_valid;
   assign o_commit_q_dest_rf         = commit_q_dest_rf;
   assign o_commit_q_dest_reg        = commit_q_dest_reg;
@@ -128,7 +132,7 @@ module commit_bus_pipeline (
   assign o_commit_q_is_store_like   = commit_q_is_store_like;
   assign o_commit_q_sc_failed       = commit_q_sc_failed;
   assign o_commit_bus_2_q           = commit_bus_2_q;
-  assign o_commit_bus_2_q_valid     = commit_bus_2_q_valid;
+  assign o_commit_bus_2_q_valid     = commit_bus_2_q_valid && !i_flush_all;
   assign o_commit_q_2_dest_valid    = commit_q_2_dest_valid;
   assign o_commit_q_2_dest_rf       = commit_q_2_dest_rf;
   assign o_commit_q_2_dest_reg      = commit_q_2_dest_reg;
diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.f b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.f
index 23834886..31b808fc 100644
--- a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.f
+++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.f
@@ -9,6 +9,7 @@
 $(ROOT)/hw/rtl/lib/ram/sdp_dist_ram_2r.sv
 $(ROOT)/hw/rtl/lib/ram/mwp_dist_ram.sv
 $(ROOT)/hw/rtl/lib/ram/mwp_dist_ram_2r.sv
+$(ROOT)/hw/rtl/lib/ram/mwp_dist_ram_ohread.sv
 
 # Submodules
 $(ROOT)/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/rob_serializer.sv
diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv
index b6980bbe..2bcb4c0a 100644
--- a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv
+++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv
@@ -17,7 +17,7 @@
 /*
  * Tomasulo Integration Wrapper
  *
- * Verification wrapper that instantiates ROB + RAT + six RS instances
+ * Wrapper (instantiated by cpu_ooo) that instantiates ROB + RAT + six RS instances
  * (INT_RS, MUL_RS, MEM_RS, FP_RS, FMUL_RS, FDIV_RS), LQ, SQ, CDB arbiter,
  * FU shims, and hardwires the internal commit bus, dispatch routing,
  * SQ↔LQ forwarding, and shared CDB/flush signals.
@@ -29,7 +29,7 @@
  *
  * Internal wiring:
  *   ROB.o_commit_comb --> commit_bus --> cpu_ooo same-cycle mispredict detect
- *   ROB.o_commit      --> o_commit   (registered testbench observation)
+ *   ROB.o_commit_comb --> commit_bus --> commit_bus_pipeline --> o_commit
  *   commit_bus_q      --> RAT commit-clear signals
  *   FU adapters --> cdb_arbiter --> cdb_bus --> ROB.i_cdb_write (derived)
  *                                           --> all RS .i_cdb (broadcast for wakeup)
@@ -129,6 +129,11 @@ module tomasulo_wrapper #(
     input  logic                                        i_csr_done,
     output logic                                        o_trap_pending,
     output logic                  [riscv_pkg::XLEN-1:0] o_trap_pc,
+    output logic                                        o_head_is_wfi,
+    // Retired-next-PC precompute for cpu_ooo's interrupt_resume_pc (see
+    // reorder_buffer port comment; pure timing restructure).
+    output logic                  [riscv_pkg::XLEN-1:0] o_head_retired_next_pc,
+    output logic                  [riscv_pkg::XLEN-1:0] o_head_next_retired_next_pc,
     output riscv_pkg::exc_cause_t                       o_trap_cause,
     output logic                  [riscv_pkg::XLEN-1:0] o_trap_value,
     input  logic                                        i_trap_taken,
@@ -136,7 +141,11 @@ module tomasulo_wrapper #(
     input  logic                                        i_mret_done,
     input  logic                  [riscv_pkg::XLEN-1:0] i_mepc,
     input  logic                                        i_interrupt_pending,
-    input  logic                                        i_trap_misaligned_accesses,
+
+    // Current privilege (PrivM/PrivU), forwarded to the ROB for U-mode
+    // CSR/MRET illegal-instruction checks.
+    input logic [1:0] i_priv,
+    input logic       i_trap_misaligned_accesses,
 
     // Widen-commit back-pressure: asserted when the downstream slot-2
     // retire path can accept a second commit this cycle.  cpu_ooo ties this
@@ -460,10 +469,11 @@ module tomasulo_wrapper #(
   //
   // commit_bus_q is a one-cycle pipeline register that breaks the critical
   // timing path from ROB head_ready/commit_en through SQ/RAT to LQ.
-  // All internal consumers (RAT, SQ commit, SC logic) use the registered
-  // version.  The valid bit is cleared on full flush for safety — although
-  // overlapping pipelined commits with flush_all only occurs for non-store
-  // instructions (traps, MRET, FENCE.I), so SQ/SC are unaffected.
+  // Internal consumers (RAT, SQ commit, SC logic) use the registered
+  // version, except the SQ same-cycle flush-race guard, which taps the raw
+  // ROB commit pulses.  The valid bit is cleared on full flush for safety —
+  // although overlapping pipelined commits with flush_all only occurs for
+  // non-store instructions (traps, MRET, FENCE.I), so SQ/SC are unaffected.
   riscv_pkg::reorder_buffer_commit_t commit_bus;
   // Split commit_bus_q into separate valid + data to prevent Vivado from
   // dragging the reset net onto payload register bits.
@@ -632,9 +642,15 @@ module tomasulo_wrapper #(
   // CDB Arbiter: FU completions → single CDB broadcast
   // ===========================================================================
   riscv_pkg::cdb_broadcast_t cdb_bus_comb;  // combinational from arbiter
-  riscv_pkg::cdb_broadcast_t cdb_bus;  // registered — feeds RS/ROB wakeup
+  // registered — feeds RS/ROB wakeup
+  (* equivalent_register_removal = "no" *)riscv_pkg::cdb_broadcast_t cdb_bus;
+  // same-cycle INT_RS-local copy
+  (* equivalent_register_removal = "no" *)riscv_pkg::cdb_broadcast_t cdb_bus_int_rs;
   riscv_pkg::cdb_broadcast_t cdb_bus_2_comb;  // 2-wide CDB lane-1, combinational
-  riscv_pkg::cdb_broadcast_t cdb_bus_2;  // registered lane-1 — feeds RS/ROB wakeup
+  // registered lane-1 — feeds RS/ROB wakeup
+  (* equivalent_register_removal = "no" *)riscv_pkg::cdb_broadcast_t cdb_bus_2;
+  // same-cycle INT_RS-local copy
+  (* equivalent_register_removal = "no" *)riscv_pkg::cdb_broadcast_t cdb_bus_2_int_rs;
 
   // Forward declarations: adapter→arbiter signals (used here, defined below)
   riscv_pkg::fu_complete_t   alu_adapter_to_arbiter;
@@ -689,15 +705,22 @@ module tomasulo_wrapper #(
   // max_fanout forces replication across the RS snoop / ROB-write consumers —
   // the high-fanout report (609 loads) showed this net being one of the top
   // drivers into the flush-recovery cone that failed timing at -0.947 ns.
-  (* max_fanout = 32 *) logic cdb_bus_valid;
+  (* max_fanout = 32 *)logic cdb_bus_valid;
+  (* equivalent_register_removal = "no", max_fanout = 32 *)logic cdb_bus_int_rs_valid;
 
   always_ff @(posedge i_clk) begin
     if (!i_rst_n) cdb_bus_valid <= 1'b0;
     else cdb_bus_valid <= cdb_bus_comb.valid;
   end
 
+  always_ff @(posedge i_clk) begin
+    if (!i_rst_n) cdb_bus_int_rs_valid <= 1'b0;
+    else cdb_bus_int_rs_valid <= cdb_bus_comb.valid;
+  end
+
   always_ff @(posedge i_clk) begin
     cdb_bus <= cdb_bus_comb;
+    cdb_bus_int_rs <= cdb_bus_comb;
   end
 
   // Expose combinational CDB for testbench observation (grant timing matches)
@@ -710,6 +733,16 @@ module tomasulo_wrapper #(
     cdb_bus_qualified.valid = cdb_bus_valid;
   end
 
+  // INT_RS is physically far from the shared CDB register on Genesys2 and
+  // snoops many value bits in parallel.  Give it an equivalent same-cycle CDB
+  // register so placement can keep that high-fanout payload local without
+  // changing wakeup latency.
+  riscv_pkg::cdb_broadcast_t cdb_bus_int_rs_qualified;
+  always_comb begin
+    cdb_bus_int_rs_qualified       = cdb_bus_int_rs;
+    cdb_bus_int_rs_qualified.valid = cdb_bus_int_rs_valid;
+  end
+
   // Derive ROB CDB write from CDB broadcast
   riscv_pkg::reorder_buffer_cdb_write_t cdb_write_from_arbiter;
   always_comb begin
@@ -722,19 +755,30 @@ module tomasulo_wrapper #(
   end
 
   // ---- 2-wide CDB lane-1: registered mirror of the lane-0 pipeline above.
-  (* max_fanout = 32 *) logic cdb_bus_2_valid;
+  (* max_fanout = 32 *)logic cdb_bus_2_valid;
+  (* equivalent_register_removal = "no", max_fanout = 32 *)logic cdb_bus_2_int_rs_valid;
   always_ff @(posedge i_clk) begin
     if (!i_rst_n) cdb_bus_2_valid <= 1'b0;
     else cdb_bus_2_valid <= cdb_bus_2_comb.valid;
   end
+  always_ff @(posedge i_clk) begin
+    if (!i_rst_n) cdb_bus_2_int_rs_valid <= 1'b0;
+    else cdb_bus_2_int_rs_valid <= cdb_bus_2_comb.valid;
+  end
   always_ff @(posedge i_clk) begin
     cdb_bus_2 <= cdb_bus_2_comb;
+    cdb_bus_2_int_rs <= cdb_bus_2_comb;
   end
   riscv_pkg::cdb_broadcast_t cdb_bus_2_qualified;
   always_comb begin
     cdb_bus_2_qualified       = cdb_bus_2;
     cdb_bus_2_qualified.valid = cdb_bus_2_valid;
   end
+  riscv_pkg::cdb_broadcast_t cdb_bus_2_int_rs_qualified;
+  always_comb begin
+    cdb_bus_2_int_rs_qualified       = cdb_bus_2_int_rs;
+    cdb_bus_2_int_rs_qualified.valid = cdb_bus_2_int_rs_valid;
+  end
   riscv_pkg::reorder_buffer_cdb_write_t cdb_write_from_arbiter_2;
   always_comb begin
     cdb_write_from_arbiter_2.valid     = cdb_bus_2_valid;
@@ -1327,8 +1371,13 @@ module tomasulo_wrapper #(
   logic mem_rs_fu_ready_base;
   logic mem_rs_fu_ready;
 
+  // Do NOT gate SC issue on (sc_pending && next_is_sc). That single-SC
+  // serialization deadlocked Linux: under speculation a YOUNGER SC issues
+  // out-of-order, sets sc_pending, and then this gate blocked the OLDER head SC
+  // from ever issuing -- so it never fired, sc_pending never cleared, and the
+  // core hung at _prb_commit. sc_pending_unit now tracks multiple in-flight SCs
+  // (a table keyed by ROB tag), so several SCs may legitimately be in flight.
   assign mem_rs_fu_ready_base = i_mem_rs_fu_ready &&
-                                !(sc_pending && mem_rs_next_is_sc) &&
                                 !sc_fu_complete_reg.valid &&
                                 !mem_adapter_result_pending &&
                                 !i_backend_recovery_hold;
@@ -1406,22 +1455,26 @@ module tomasulo_wrapper #(
       .i_widen_commit_ok        (i_widen_commit_ok),
 
       // External coordination
-      .i_sq_empty          (o_sq_empty),
-      .i_sq_committed_empty(sq_committed_empty),
-      .i_fence_i_sync_done (i_fence_i_sync_done),
-      .o_fence_i_sync_req  (o_fence_i_sync_req),
-      .o_csr_start         (o_csr_start),
-      .i_csr_done          (i_csr_done),
-      .o_trap_pending      (o_trap_pending),
-      .o_trap_pc           (o_trap_pc),
-      .o_trap_cause        (o_trap_cause),
-      .o_trap_value        (o_trap_value),
-      .i_trap_taken        (i_trap_taken),
-      .o_mret_start        (o_mret_start),
-      .i_mret_done         (i_mret_done),
-      .i_mepc              (i_mepc),
-      .i_interrupt_pending (i_interrupt_pending),
-      .i_commit_hold       (i_commit_hold),
+      .i_sq_empty                 (o_sq_empty),
+      .i_sq_committed_empty       (sq_committed_empty),
+      .i_fence_i_sync_done        (i_fence_i_sync_done),
+      .o_fence_i_sync_req         (o_fence_i_sync_req),
+      .o_csr_start                (o_csr_start),
+      .i_csr_done                 (i_csr_done),
+      .o_trap_pending             (o_trap_pending),
+      .o_trap_pc                  (o_trap_pc),
+      .o_head_is_wfi              (o_head_is_wfi),
+      .o_head_retired_next_pc     (o_head_retired_next_pc),
+      .o_head_next_retired_next_pc(o_head_next_retired_next_pc),
+      .o_trap_cause               (o_trap_cause),
+      .o_trap_value               (o_trap_value),
+      .i_trap_taken               (i_trap_taken),
+      .o_mret_start               (o_mret_start),
+      .i_mret_done                (i_mret_done),
+      .i_mepc                     (i_mepc),
+      .i_interrupt_pending        (i_interrupt_pending),
+      .i_priv                     (i_priv),
+      .i_commit_hold              (i_commit_hold),
 
       // Flush
       .i_flush_en(i_flush_en),
@@ -1648,8 +1701,8 @@ module tomasulo_wrapper #(
       .o_full_for_2(int_rs_full_for_2_w),
 
       // CDB snoop (from arbiter)
-      .i_cdb(cdb_bus_qualified),
-      .i_cdb_2(cdb_bus_2_qualified),
+      .i_cdb(cdb_bus_int_rs_qualified),
+      .i_cdb_2(cdb_bus_2_int_rs_qualified),
       .i_repair_valid_1(int_done_repair_valid_1),
       .i_repair_tag_1(i_bypass_tag_1),
       .i_repair_value_1(bypass_value_1),
@@ -2652,7 +2705,6 @@ module tomasulo_wrapper #(
   // Effective address: base (src1) + immediate (declared above near SC pending)
   assign sq_effective_addr = o_mem_rs_issue.src1_value[riscv_pkg::XLEN-1:0] + o_mem_rs_issue.imm;
 
-  // MMIO detection: address >= MMIO base
   logic sq_addr_is_mmio;
   // MMIO quadrant test; see lq_addr_is_mmio above.
   assign sq_addr_is_mmio = (sq_effective_addr[31:30] == 2'b01);
@@ -2721,12 +2773,12 @@ module tomasulo_wrapper #(
       .i_commit_valid_comb  (commit_store_like_raw),
       .i_commit_rob_tag_comb(head_tag),
 
-      // Slot 2 is always older than any ordinary partial-flush boundary that
-      // can overlap commit_2_fire, and delayed recovery sees it through the
-      // registered commit path.  Keep the raw head+1 ROB metadata cone out of
-      // the SQ valid flops.
-      .i_commit_valid_comb_2  (1'b0),
-      .i_commit_rob_tag_comb_2('0),
+      // Slot 2 has the same raw commit race as slot 1 for full-trap drains:
+      // commit_bus_2_q_valid is still one cycle away from SQ, so a timer IRQ
+      // must not observe committed-empty and full-flush the entry before SQ
+      // sees the registered commit.
+      .i_commit_valid_comb_2  (commit_2_store_like_raw),
+      .i_commit_rob_tag_comb_2(commit_bus_2.tag),
 
       // Store-to-load forwarding (from LQ)
       .i_sq_check_valid          (sq_check_valid),
diff --git a/hw/rtl/cpu_and_mem/cpu_and_mem.f b/hw/rtl/cpu_and_mem/cpu_and_mem.f
index 5e8abaaa..95cabf80 100644
--- a/hw/rtl/cpu_and_mem/cpu_and_mem.f
+++ b/hw/rtl/cpu_and_mem/cpu_and_mem.f
@@ -25,5 +25,8 @@
 # High-address fetch window provider (two-line L1I buffer)
 $(ROOT)/hw/rtl/cpu_and_mem/fetch_provider.sv
 
+# On-silicon hang triage (synthesizable boot-hang classifier over UART)
+$(ROOT)/hw/rtl/cpu_and_mem/hang_triage.sv
+
 # CPU and memory integration module
 $(ROOT)/hw/rtl/cpu_and_mem/cpu_and_mem.sv
diff --git a/hw/rtl/cpu_and_mem/cpu_and_mem.sv b/hw/rtl/cpu_and_mem/cpu_and_mem.sv
index 2578d6f8..9f7ba994 100644
--- a/hw/rtl/cpu_and_mem/cpu_and_mem.sv
+++ b/hw/rtl/cpu_and_mem/cpu_and_mem.sv
@@ -49,6 +49,9 @@ module cpu_and_mem #(
     parameter int unsigned L1_CACHE_BYTES = 128 * 1024,
     parameter int unsigned L1I_CACHE_BYTES = 16 * 1024,
     parameter int unsigned L2_CACHE_BYTES = 2 * 1024 * 1024,
+    // Simulation-only fast cache maintenance for fence.i (see frost_cache).
+    // 0 = FPGA (cycle-accurate maintenance FSM); non-zero = sim fast path.
+    parameter int unsigned SIM_FAST_MAINT = 0,
     // Behavioral main-memory model (simulation only; hardware integration
     // replaces it with the DDR controller behind the same AXI port).
     parameter int unsigned DDR_MODEL_BYTES = 64 * 1024 * 1024,
@@ -60,7 +63,10 @@ module cpu_and_mem #(
     // provider over the 1-cycle instruction BRAM (LFSR-gated i_instr_valid +
     // owed-ask tracking).  Exercises the core's fetch-invalid machinery
     // before a real I-cache sits behind it; hardware keeps 0.
-    parameter int unsigned FETCH_VALID_FUZZ = 0
+    parameter int unsigned FETCH_VALID_FUZZ = 0,
+    // On-silicon boot-hang classifier that can take over the console UART.
+    // Keep it default-off for normal interactive software and Linux bring-up.
+    parameter int unsigned ENABLE_HANG_TRIAGE = 0
 ) (
     input logic i_clk,
     input logic i_clk_div4,  // Divided clock for instruction memory programming
@@ -95,7 +101,8 @@ module cpu_and_mem #(
     input  logic        i_fifo1_empty,
     output logic        o_fifo1_rd_en,
 
-    // External interrupt input (directly triggers MEIP when high)
+    // External interrupt input (registered +1 cycle and ORed with the
+    // ns16550 UART IRQ before driving MEIP)
     input logic i_external_interrupt,
 
     // DDR AXI master (cache-hierarchy bridge). Quiescent when
@@ -129,7 +136,7 @@ module cpu_and_mem #(
 
   // Memory addressing parameters
   localparam int unsigned MemByteAddrWidth = $clog2(MEM_SIZE_BYTES);
-  // ((128 KiB total memory)/(4 bytes per word)) = 32k words = 2^15 word address bits
+  // (MEM_SIZE_BYTES/(4 bytes per word)) words; e.g. 256 KiB -> 64k words = 16 word address bits
   localparam int unsigned MemWordAddrWidth = MemByteAddrWidth - 2;
 
   // Memory-mapped I/O addresses for peripherals
@@ -137,7 +144,7 @@ module cpu_and_mem #(
   // - sw/common/link.ld (MMIO memory region and PROVIDE statements)
   // - cpu module parameters
   localparam int unsigned MmioAddr = 32'h4000_0000;
-  localparam int unsigned MmioSizeBytes = 32'h2C;
+  localparam int unsigned MmioSizeBytes = 32'h1_C000;  // ns16550 @ +0x1000, CLINT @ +0x10000
   localparam int unsigned UartMmioAddr = 32'h4000_0000;  // UART TX (write-only)
   localparam int unsigned UartRxDataMmioAddr = 32'h4000_0004;  // UART RX data (read consumes byte)
   localparam int unsigned UartRxStatusMmioAddr = 32'h4000_0024;  // RX status (bit0: data available)
@@ -152,17 +159,46 @@ module cpu_and_mem #(
   // Software interrupt register
   localparam int unsigned MsipMmioAddr = 32'h4000_0020;
 
+  // ns16550a UART face for Linux (word-stride; DTB reg-shift=2, reg-io-width=4).
+  // Aliases the native UART TX/RX. DLAB (LCR[7]) remaps offsets 0/4 to DLL/DLM.
+  localparam int unsigned Ns16550ThrRbr = 32'h4000_1000;  // THR(w)/RBR(r) | DLL when DLAB
+  localparam int unsigned Ns16550IerDlm = 32'h4000_1004;  // IER | DLM when DLAB
+  localparam int unsigned Ns16550IirFcr = 32'h4000_1008;  // IIR(r) / FCR(w)
+  localparam int unsigned Ns16550Lcr = 32'h4000_100C;
+  localparam int unsigned Ns16550Mcr = 32'h4000_1010;
+  localparam int unsigned Ns16550Lsr = 32'h4000_1014;  // read-only line status
+  localparam int unsigned Ns16550Msr = 32'h4000_1018;  // read-only modem status
+  localparam int unsigned Ns16550Scr = 32'h4000_101C;  // scratch
+
+  // SiFive CLINT alias for Linux (compatible "sifive,clint0") @ 0x4001_0000.
+  // These map onto the SAME msip/mtimecmp/mtime registers as the native FROST
+  // timer block; the kernel reaches the timer through the CLINT layout via DTB.
+  localparam int unsigned ClintMsip = 32'h4001_0000;  // hart-0 software interrupt
+  localparam int unsigned ClintMtimecmpLo = 32'h4001_4000;  // mtimecmp[31:0]
+  localparam int unsigned ClintMtimecmpHi = 32'h4001_4004;  // mtimecmp[63:32]
+  localparam int unsigned ClintMtimeLo = 32'h4001_BFF8;  // mtime[31:0]
+  localparam int unsigned ClintMtimeHi = 32'h4001_BFFC;  // mtime[63:32]
+
   // Timer register defaults
   // Default mtimecmp to max value so no timer interrupt fires until software configures it
   localparam logic [63:0] MtimecmpDefault = 64'hFFFF_FFFF_FFFF_FFFF;
 
   // CPU interface signals
   logic [31:0] program_counter;
+  logic commit_vld;  // instruction-retire pulse (hang-triage tap)
+  // CPU-side UART write, muxed against the hang-triage byte stream further down.
+  logic cpu_uart_wr_en;
+  logic [7:0] cpu_uart_wr_data;
   logic [31:0] fetch_address;  // imem port B address (the presented fetch ask)
   logic [63:0] instruction;  // 64-bit fetch: {next_word, current_word}
   logic [riscv_pkg::ImemFetchSidebandWidth-1:0] instruction_sideband;
   logic instruction_bank_sel_r;  // Fetch-word parity (for spanning select)
   logic instruction_valid;  // Fetch window valid
+  // Served-window tag for the muxed fetch (drives the if_stage served-window
+  // guard) and the low-BRAM served address (fetch_address delayed one cycle to
+  // match the 1-cycle imem read latency).
+  logic [31:0] instruction_served_addr;
+  logic [31:0] bram_fetch_served_addr_q;
   logic fetch_replay_consume;  // CPU consumed the stall-replay bundle this cycle
   logic pipeline_stall;  // front-end pipeline stall (gates fetch publish-valid)
   logic fence_i_sync_req;  // ROB serializer holding commit for a fence.i cache sync
@@ -229,15 +265,50 @@ module cpu_and_mem #(
 `endif
 
   // Timer registers (CLINT-style)
-  logic                  [63:0] mtime;  // Machine time counter
-  logic                  [63:0] mtimecmp;  // Machine timer compare register
-  logic                         msip;  // Machine software interrupt pending
+  logic [63:0] mtime;  // Machine time counter
+  logic [63:0] mtimecmp;  // Machine timer compare register
+  logic        msip;  // Machine software interrupt pending
+
+  // ns16550a UART face register file (8-bit). DLAB = ns_lcr[7].
+  logic [7:0] ns_dll, ns_dlm, ns_ier, ns_fcr, ns_lcr, ns_mcr, ns_scr;
+  logic       ns_rx_irq_pending;
+  logic       ns_tx_irq_pending;
+  logic       ns_irq_pending;
+  logic [7:0] ns_iir;
+  assign ns_rx_irq_pending = ns_ier[0] && i_uart_rx_valid;
+  assign ns_tx_irq_pending = ns_ier[1] && i_uart_tx_ready;
+  assign ns_irq_pending = ns_rx_irq_pending || ns_tx_irq_pending;
+  always_comb begin
+    if (ns_rx_irq_pending) ns_iir = 8'hC4;  // FIFO enabled, received data available.
+    else if (ns_tx_irq_pending) ns_iir = 8'hC2;  // FIFO enabled, THR empty.
+    else ns_iir = 8'hC1;  // FIFO enabled, no interrupt pending.
+  end
 
   // Interrupt signals to CPU
-  riscv_pkg::interrupt_t        interrupts;
-  // Clamp unknown external interrupt values to 0 for simulation stability.
-  // This avoids X-propagation into mip when the top-level input is left un-driven.
-  assign interrupts.meip = (i_external_interrupt === 1'b1);
+  riscv_pkg::interrupt_t interrupts;
+  // External/UART interrupt: REGISTER the aggregate to break the dominant
+  // post-opt timing spine (uart TX-FIFO CDC read-pointer -> occupancy CARRY
+  // compare -> i_uart_tx_ready -> ns16550 THRE irq -> meip -> trap_unit /
+  // ROB-serializer WFI-wake -> commit_en -> retire/trap/SQ endpoints; ~1256
+  // failing paths, WNS -1.09 at 300 MHz).  The whole combinational compare
+  // cone now terminates at this flop's D.  Mirrors mtip_registered below.
+  //
+  // DELIBERATE +1-cycle interrupt-delivery latency (user-approved
+  // 2026-07-01): meip/THRE/RX are level conditions and a 1-cycle-delayed
+  // level is architecturally benign; interrupt delivery is not on the
+  // CoreMark-scored path.  Only the interrupt VIEW is registered — the MMIO
+  // store-drain handshake on i_uart_tx_ready is untouched, and the ns_iir
+  // register readback stays combinational (matches how a real 8250's IIR
+  // reflects current conditions when the handler reads it).
+  //
+  // The === clamp keeps unknown external-interrupt values from propagating
+  // into mip when the top-level input is left un-driven in simulation.
+  logic meip_registered;
+  always_ff @(posedge i_clk) begin
+    if (i_rst) meip_registered <= 1'b0;
+    else meip_registered <= (i_external_interrupt === 1'b1) || ns_irq_pending;
+  end
+  assign interrupts.meip = meip_registered;
   assign interrupts.msip = msip;
 
   // Timer interrupt: register the 64-bit comparison result to break critical timing path.
@@ -251,7 +322,20 @@ module cpu_and_mem #(
   end
   assign interrupts.mtip = mtip_registered;
 
-  // RISC-V OOO CPU core - Tomasulo out-of-order with RV32IMACBFD + Zicsr + Machine-mode
+  // mtimecmp MMIO write pulse: a kernel/handler timer re-arm. Used by the hang
+  // triage as a "timer tick serviced" event tap.
+  logic mtimecmp_write_pulse;
+  assign mtimecmp_write_pulse = |data_memory_byte_write_enable_registered &&
+      ((data_memory_address_registered == MtimecmpLowMmioAddr) ||
+       (data_memory_address_registered == MtimecmpHighMmioAddr) ||
+       (data_memory_address_registered == ClintMtimecmpLo) ||
+       (data_memory_address_registered == ClintMtimecmpHi));
+  logic [ 5:0] cpu_debug_irq_status;
+  logic [31:0] cpu_debug_commit_pc;
+  logic [31:0] cpu_debug_commit_2_pc;
+  logic [ 1:0] cpu_debug_commit_valid;
+
+  // RISC-V OOO CPU core - Tomasulo out-of-order with RV32IMACBFD + Zicsr + Machine/User-mode
   cpu_ooo #(
       .MEM_BYTE_ADDR_WIDTH(MemByteAddrWidth),
       .MMIO_ADDR(MmioAddr),
@@ -265,6 +349,7 @@ module cpu_and_mem #(
       .i_instr(instruction),
       .i_instr_sideband(instruction_sideband),
       .i_instr_bank_sel_r(instruction_bank_sel_r),
+      .i_served_addr(instruction_served_addr),
       .i_instr_valid(instruction_valid),
       .o_fetch_replay_consume(fetch_replay_consume),
       .o_pipeline_stall(pipeline_stall),
@@ -292,11 +377,15 @@ module cpu_and_mem #(
       .o_mmio_uart_rx_ready_pulse(mmio_uart_rx_ready_pulse),
       .i_data_mem_rd_data(data_memory_or_peripheral_read_data),
       .o_rst_done(/*not connected*/),
-      .o_vld   (/*not connected*/),
+      .o_vld   (commit_vld),
       .o_pc_vld(/*not connected*/),
       // Interrupt and timer interface
       .i_interrupts(interrupts),
       .i_mtime(mtime),
+      .o_debug_irq_status(cpu_debug_irq_status),
+      .o_debug_commit_pc(cpu_debug_commit_pc),
+      .o_debug_commit_2_pc(cpu_debug_commit_2_pc),
+      .o_debug_commit_valid(cpu_debug_commit_valid),
       // Branch prediction enabled by default in production
       .i_disable_branch_prediction(1'b0)
   );
@@ -344,6 +433,7 @@ module cpu_and_mem #(
     // still carries valid (preserving the IF first-cycle capture); the real
     // provider's registered stall produces the same 1-cycle lag.
     assign instruction_valid = fuzz_ok && fuzz_window_ready && !pipeline_stall_q;
+    assign instruction_served_addr = served_addr_q;
     assign fuzz_accepted = instruction_valid && !pipeline_stall;
     // The BRAM chases the owed ask while unserved and the live PC once
     // serving (the 1-cycle BRAM then keeps the window contract-aligned).
@@ -406,6 +496,7 @@ module cpu_and_mem #(
     logic [63:0] cached_fetch_instr;
     logic [riscv_pkg::ImemFetchSidebandWidth-1:0] cached_fetch_sideband;
     logic cached_fetch_bank_sel_r;
+    logic [31:0] cached_fetch_served_addr;
     logic cached_fetch_valid;
 
     assign fetch_address = program_counter;
@@ -434,6 +525,8 @@ module cpu_and_mem #(
                                   bram_fetch_sideband;
     assign instruction_bank_sel_r = fetch_high_valid_q ? cached_fetch_bank_sel_r :
                                                          bram_fetch_bank_sel_cpu_r;
+    assign instruction_served_addr = fetch_high_valid_q ? cached_fetch_served_addr :
+                                                          bram_fetch_served_addr_q;
 
     // High-address provider: two-line L1I fetch buffer for cached/DDR code.
     // It no longer drives the low-BRAM address pins; that path stays direct
@@ -449,6 +542,7 @@ module cpu_and_mem #(
         .o_instr(cached_fetch_instr),
         .o_instr_sideband(cached_fetch_sideband),
         .o_instr_bank_sel_r(cached_fetch_bank_sel_r),
+        .o_served_addr(cached_fetch_served_addr),
         .o_instr_valid(cached_fetch_valid),
         .o_line_req_valid(iup_req_valid),
         .i_line_req_ready(iup_req_ready),
@@ -464,6 +558,7 @@ module cpu_and_mem #(
     );
   end else begin : gen_fetch_direct
     assign instruction_valid = 1'b1;
+    assign instruction_served_addr = bram_fetch_served_addr_q;
     assign fetch_address = program_counter;
     assign instruction = bram_fetch_instr;
     assign instruction_sideband = bram_fetch_sideband;
@@ -491,7 +586,7 @@ module cpu_and_mem #(
       // Port A: Instruction programming (div4 clock, write only)
       .i_port_a_byte_address(i_instr_mem_addr),
       .i_port_a_write_data(i_instr_mem_wrdata),
-      .i_port_a_write_enable(i_instr_mem_en),
+      .i_port_a_write_enable(i_instr_mem_en && (|i_instr_mem_we)),
       .o_port_a_read_data(  /* unused - write only */),
       // Port B: Instruction fetch (main clock, read only)
       .i_port_b_clk(i_clk),
@@ -508,6 +603,7 @@ module cpu_and_mem #(
   // control net.
   always_ff @(posedge i_clk) begin
     bram_fetch_bank_sel_cpu_r <= fetch_address[2];
+    bram_fetch_served_addr_q  <= fetch_address;
   end
 
 `ifndef SYNTHESIS
@@ -603,7 +699,8 @@ module cpu_and_mem #(
         .HAS_L2(CACHED_HAS_L2),
         .L1_CACHE_BYTES(L1_CACHE_BYTES),
         .L1I_CACHE_BYTES(L1I_CACHE_BYTES),
-        .L2_CACHE_BYTES(L2_CACHE_BYTES)
+        .L2_CACHE_BYTES(L2_CACHE_BYTES),
+        .SIM_FAST_MAINT(SIM_FAST_MAINT)
     ) cache_hierarchy (
         .i_clk(i_clk),
         .i_rst(i_rst),
@@ -824,19 +921,36 @@ module cpu_and_mem #(
     // Use MA-stage address captured from CPU for MMIO reads
     unique case (mmio_load_addr)
       // UART RX data - returns received byte in lower 8 bits (reading consumes byte)
-      UartRxDataMmioAddr:   mmio_read_data_comb = {24'b0, i_uart_rx_data};
+      UartRxDataMmioAddr: mmio_read_data_comb = {24'b0, i_uart_rx_data};
       // UART RX status - bit 0 indicates data available (non-destructive read)
       UartRxStatusMmioAddr: mmio_read_data_comb = {31'b0, i_uart_rx_valid};
       // UART TX status - bit 0 indicates the TX FIFO can accept at least one byte.
       UartTxStatusMmioAddr: mmio_read_data_comb = {31'b0, i_uart_tx_ready};
-      Fifo0MmioAddr:        mmio_read_data_comb = i_fifo0_rd_data;
-      Fifo1MmioAddr:        mmio_read_data_comb = i_fifo1_rd_data;
-      MtimeLowMmioAddr:     mmio_read_data_comb = mtime[31:0];
-      MtimeHighMmioAddr:    mmio_read_data_comb = mtime[63:32];
-      MtimecmpLowMmioAddr:  mmio_read_data_comb = mtimecmp[31:0];
+      Fifo0MmioAddr: mmio_read_data_comb = i_fifo0_rd_data;
+      Fifo1MmioAddr: mmio_read_data_comb = i_fifo1_rd_data;
+      MtimeLowMmioAddr: mmio_read_data_comb = mtime[31:0];
+      MtimeHighMmioAddr: mmio_read_data_comb = mtime[63:32];
+      MtimecmpLowMmioAddr: mmio_read_data_comb = mtimecmp[31:0];
       MtimecmpHighMmioAddr: mmio_read_data_comb = mtimecmp[63:32];
-      MsipMmioAddr:         mmio_read_data_comb = {31'b0, msip};
-      default:              ;
+      MsipMmioAddr: mmio_read_data_comb = {31'b0, msip};
+      // ns16550a UART face (aliases native UART TX/RX). DLAB selects DLL/DLM.
+      Ns16550ThrRbr: mmio_read_data_comb = ns_lcr[7] ? {24'b0, ns_dll} : {24'b0, i_uart_rx_data};
+      Ns16550IerDlm: mmio_read_data_comb = ns_lcr[7] ? {24'b0, ns_dlm} : {24'b0, ns_ier};
+      Ns16550IirFcr: mmio_read_data_comb = {24'b0, ns_iir};
+      Ns16550Lcr: mmio_read_data_comb = {24'b0, ns_lcr};
+      Ns16550Mcr: mmio_read_data_comb = {24'b0, ns_mcr};
+      // LSR: TEMT|THRE from TX-ready (bits 6,5); DR from RX-valid (bit 0).
+      Ns16550Lsr:
+      mmio_read_data_comb = {24'b0, 1'b0, i_uart_tx_ready, i_uart_tx_ready, 4'b0, i_uart_rx_valid};
+      Ns16550Msr: mmio_read_data_comb = {24'b0, 8'hB0};  // DCD|DSR|CTS asserted
+      Ns16550Scr: mmio_read_data_comb = {24'b0, ns_scr};
+      // SiFive CLINT alias (same registers as the native timer block).
+      ClintMsip: mmio_read_data_comb = {31'b0, msip};
+      ClintMtimecmpLo: mmio_read_data_comb = mtimecmp[31:0];
+      ClintMtimecmpHi: mmio_read_data_comb = mtimecmp[63:32];
+      ClintMtimeLo: mmio_read_data_comb = mtime[31:0];
+      ClintMtimeHi: mmio_read_data_comb = mtime[63:32];
+      default: ;
     endcase
   end
 
@@ -887,11 +1001,114 @@ module cpu_and_mem #(
     if (mmio_read_data_valid) data_memory_or_peripheral_read_data = mmio_read_data_reg;
   end
 
-  // write to UART
+  // write to UART (native 0x4000_0000 TX, or the ns16550 THR at 0x4000_1000
+  // when DLAB is clear -- both funnel into the same TX byte stream).
+  always_ff @(posedge i_clk) begin
+    cpu_uart_wr_data <= data_memory_write_data_registered[7:0];  // UART uses only lower byte
+    cpu_uart_wr_en   <= |data_memory_byte_write_enable_registered &&
+                       ((data_memory_address_registered == UartMmioAddr) ||
+                        (data_memory_address_registered == Ns16550ThrRbr && !ns_lcr[7]));
+  end
+
+  generate
+    if (ENABLE_HANG_TRIAGE != 0) begin : gen_hang_triage
+      // On-silicon hang triage: classify a silent boot hang over UART. This is
+      // intentionally opt-in because it periodically takes over the console.
+      logic        triage_active;
+      logic        triage_wr_en;
+      logic [ 7:0] triage_wr_data;
+      logic [31:0] triage_mtime_lo;
+      logic [31:0] triage_mtime_hi;
+      logic [31:0] triage_mtimecmp_lo;
+      logic [31:0] triage_mtimecmp_hi;
+      logic [31:0] triage_mtimecmp_delta_lo;
+      logic [31:0] triage_irq_status;
+      always_ff @(posedge i_clk) begin
+        if (i_rst) begin
+          triage_mtime_lo          <= 32'd0;
+          triage_mtime_hi          <= 32'd0;
+          triage_mtimecmp_lo       <= 32'd0;
+          triage_mtimecmp_hi       <= 32'd0;
+          triage_mtimecmp_delta_lo <= 32'd0;
+          triage_irq_status        <= 32'd0;
+        end else begin
+          triage_mtime_lo <= mtime[31:0];
+          triage_mtime_hi <= mtime[63:32];
+          triage_mtimecmp_lo <= mtimecmp[31:0];
+          triage_mtimecmp_hi <= mtimecmp[63:32];
+          triage_mtimecmp_delta_lo <= mtimecmp[31:0] - mtime[31:0];
+          triage_irq_status <= {
+            22'd0,
+            cpu_debug_irq_status[5],
+            cpu_debug_irq_status[4],
+            cpu_debug_irq_status[3:2],
+            cpu_debug_irq_status[1],
+            cpu_debug_irq_status[0],
+            interrupts.meip,
+            interrupts.msip,
+            interrupts.mtip,
+            mtip_comparison
+          };
+        end
+      end
+      hang_triage u_hang_triage (
+          .i_clk              (i_clk),
+          .i_rst              (i_rst),
+          .i_commit           (commit_vld),
+          .i_timer_event      (mtimecmp_write_pulse),
+          .i_cread_req        (data_memory_cached_read_enable),
+          .i_cread_resp       (data_memory_cached_read_valid),
+          .i_cwrite_req       (|data_memory_cached_byte_write_enable),
+          .i_cwrite_done      (data_memory_cached_write_done),
+          .i_pc               (program_counter),
+          .i_commit0_valid    (cpu_debug_commit_valid[0]),
+          .i_commit0_pc       (cpu_debug_commit_pc),
+          .i_commit1_valid    (cpu_debug_commit_valid[1]),
+          .i_commit1_pc       (cpu_debug_commit_2_pc),
+          .i_mtime_lo         (triage_mtime_lo),
+          .i_mtime_hi         (triage_mtime_hi),
+          .i_mtimecmp_lo      (triage_mtimecmp_lo),
+          .i_mtimecmp_hi      (triage_mtimecmp_hi),
+          .i_mtimecmp_delta_lo(triage_mtimecmp_delta_lo),
+          .i_irq_status       (triage_irq_status),
+          .i_uart_busy        (cpu_uart_wr_en),
+          .i_uart_ready       (i_uart_tx_ready),
+          .o_active           (triage_active),
+          .o_wr_en            (triage_wr_en),
+          .o_wr_data          (triage_wr_data)
+      );
+      assign o_uart_wr_en   = triage_active ? triage_wr_en : cpu_uart_wr_en;
+      assign o_uart_wr_data = triage_active ? triage_wr_data : cpu_uart_wr_data;
+    end else begin : gen_no_hang_triage
+      assign o_uart_wr_en   = cpu_uart_wr_en;
+      assign o_uart_wr_data = cpu_uart_wr_data;
+    end
+  endgenerate
+
+  // ns16550a register-file writes. DLAB (LCR[7]) routes offsets 0/4 to the
+  // baud divisor (DLL/DLM); the THR write itself transmits via o_uart_wr_en.
   always_ff @(posedge i_clk) begin
-    o_uart_wr_data <= data_memory_write_data_registered[7:0];  // UART uses only lower byte
-    o_uart_wr_en   <= |data_memory_byte_write_enable_registered &&
-                       data_memory_address_registered == UartMmioAddr;
+    if (i_rst) begin
+      ns_dll <= 8'h01;
+      ns_dlm <= 8'h00;
+      ns_ier <= 8'h00;
+      ns_fcr <= 8'h00;
+      ns_lcr <= 8'h00;
+      ns_mcr <= 8'h00;
+      ns_scr <= 8'h00;
+    end else if (|data_memory_byte_write_enable_registered) begin
+      unique case (data_memory_address_registered)
+        Ns16550ThrRbr: if (ns_lcr[7]) ns_dll <= data_memory_write_data_registered[7:0];
+        Ns16550IerDlm:
+        if (ns_lcr[7]) ns_dlm <= data_memory_write_data_registered[7:0];
+        else ns_ier <= data_memory_write_data_registered[7:0];
+        Ns16550IirFcr: ns_fcr <= data_memory_write_data_registered[7:0];
+        Ns16550Lcr: ns_lcr <= data_memory_write_data_registered[7:0];
+        Ns16550Mcr: ns_mcr <= data_memory_write_data_registered[7:0];
+        Ns16550Scr: ns_scr <= data_memory_write_data_registered[7:0];
+        default: ;
+      endcase
+    end
   end
 
   // FIFO write logic - write to FIFOs when CPU writes to FIFO MMIO addresses
@@ -902,11 +1119,23 @@ module cpu_and_mem #(
   assign o_fifo1_wr_en   = |data_memory_byte_write_enable_registered &&
                             data_memory_address_registered == Fifo1MmioAddr;
 
+  // Linux reads received bytes through the ns16550 RBR alias.  That read must
+  // consume the shared UART RX FIFO just like the native FROST RX-data address,
+  // but only when DLAB is clear; with DLAB set, offset 0 is DLL.
+  logic ns16550_rbr_read_pulse;
+  always_ff @(posedge i_clk) begin
+    if (i_rst) begin
+      ns16550_rbr_read_pulse <= 1'b0;
+    end else begin
+      ns16550_rbr_read_pulse <= mmio_read_pulse && (mmio_load_addr == Ns16550ThrRbr) && !ns_lcr[7];
+    end
+  end
+
   // FIFO/UART consume pulses fire one cycle after the MMIO read request is
   // accepted. The response data itself was already captured above.
-  assign o_fifo0_rd_en = mmio_fifo0_read_pulse;
-  assign o_fifo1_rd_en = mmio_fifo1_read_pulse;
-  assign o_uart_rx_ready = mmio_uart_rx_ready_pulse;
+  assign o_fifo0_rd_en   = mmio_fifo0_read_pulse;
+  assign o_fifo1_rd_en   = mmio_fifo1_read_pulse;
+  assign o_uart_rx_ready = mmio_uart_rx_ready_pulse || ns16550_rbr_read_pulse;
 
   // Timer register updates
   // mtime increments every clock cycle (provides wall-clock time)
@@ -918,9 +1147,11 @@ module cpu_and_mem #(
   // This would cause the non-written half to increment during a write, which is wrong.
   logic writing_mtime_low, writing_mtime_high;
   assign writing_mtime_low = |data_memory_byte_write_enable_registered &&
-                             (data_memory_address_registered == MtimeLowMmioAddr);
+                             ((data_memory_address_registered == MtimeLowMmioAddr) ||
+                              (data_memory_address_registered == ClintMtimeLo));
   assign writing_mtime_high = |data_memory_byte_write_enable_registered &&
-                              (data_memory_address_registered == MtimeHighMmioAddr);
+                              ((data_memory_address_registered == MtimeHighMmioAddr) ||
+                               (data_memory_address_registered == ClintMtimeHi));
 
   always_ff @(posedge i_clk) begin
     if (i_rst) begin
@@ -944,11 +1175,12 @@ module cpu_and_mem #(
       if (|data_memory_byte_write_enable_registered) begin
         unique case (data_memory_address_registered)
           // mtimecmp controls timer interrupt threshold
-          MtimecmpLowMmioAddr:  mtimecmp[31:0] <= data_memory_write_data_registered;
-          MtimecmpHighMmioAddr: mtimecmp[63:32] <= data_memory_write_data_registered;
+          MtimecmpLowMmioAddr, ClintMtimecmpLo: mtimecmp[31:0] <= data_memory_write_data_registered;
+          MtimecmpHighMmioAddr, ClintMtimecmpHi:
+          mtimecmp[63:32] <= data_memory_write_data_registered;
           // msip controls software interrupt (only bit 0 is writable)
-          MsipMmioAddr:         msip <= data_memory_write_data_registered[0];
-          default:              ;
+          MsipMmioAddr, ClintMsip: msip <= data_memory_write_data_registered[0];
+          default: ;
         endcase
       end
     end
diff --git a/hw/rtl/cpu_and_mem/fetch_provider.sv b/hw/rtl/cpu_and_mem/fetch_provider.sv
index 535af9c5..52416061 100644
--- a/hw/rtl/cpu_and_mem/fetch_provider.sv
+++ b/hw/rtl/cpu_and_mem/fetch_provider.sv
@@ -18,7 +18,7 @@
  * fetch_provider -- the variable-latency fetch window provider.
  *
  * Serves the high-address side of the core's fetch seam
- * ({instr64, sideband16, bank_sel_r} + valid) from a two-line fetch buffer
+ * ({instr64, sideband24, bank_sel_r} + valid) from a two-line fetch buffer
  * over the L1I line port.  The low instruction BRAM fast path is selected in
  * cpu_and_mem and drives imem_predecode directly from o_pc; this block never
  * drives the low-BRAM address pins.  Each filled line carries per-word
@@ -70,6 +70,11 @@ module fetch_provider #(
     output logic [63:0] o_instr,
     output logic [riscv_pkg::ImemFetchSidebandWidth-1:0] o_instr_sideband,
     output logic o_instr_bank_sel_r,
+    // Full served-window address (its tag).  if_stage uses this to detect a fetch
+    // stall that left pc_reg outside the served window (>1 word away), which the
+    // 1-bit bank_sel parity cannot represent -> wrong-word size sample / mid-insn
+    // pc_reg drift.  Observe-only output; does not change fetch behaviour here.
+    output logic [31:0] o_served_addr,
     output logic o_instr_valid,
 
     // L1I line port (master; read-only -- write/wdata/wstrb tied inactive).
@@ -221,6 +226,7 @@ module fetch_provider #(
   assign o_instr = ddr_instr_q;
   assign o_instr_sideband = ddr_sb_pair_q;
   assign o_instr_bank_sel_r = bank_sel_q;
+  assign o_served_addr = served_addr_q;
 
   // ===========================================================================
   // Miss engine: single-outstanding line fills + next-line prefetch
diff --git a/hw/rtl/cpu_and_mem/hang_triage.sv b/hw/rtl/cpu_and_mem/hang_triage.sv
new file mode 100644
index 00000000..b4396f5a
--- /dev/null
+++ b/hw/rtl/cpu_and_mem/hang_triage.sv
@@ -0,0 +1,355 @@
+/*
+ *    Copyright 2026 Two Sigma Open Source, LLC
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+/*
+ * hang_triage — on-silicon classifier for the silent boot hang.
+ *
+ * Trigger: the console UART goes quiet (every hang flavor stops the kernel
+ * printing). On a quiet stretch it streams ASCII over the UART and re-emits
+ * periodically so the trajectory is visible:
+ *
+ *   "\n!!HANG c=<commits> t=<timer> q=<cread_req> v=<cread_resp> w=<wreq:wdone>"
+ *   " l=<pc_lo> h=<pc_hi> r=<commit0_pc> s=<commit1_pc> m=<mtime_lo>"
+ *   " n=<mtime_hi> x=<mtimecmp_lo>"
+ *   " y=<mtimecmp_hi> d=<mtimecmp-mtime lo> p=<irq/status>"
+ *   "\nH <hist[0]> <hist[1]> ... <hist[63]>\n"
+ *
+ *   c   committed instructions     climbing => busy-loop; frozen => wedge
+ *   t   mtimecmp writes (timer)     frozen  => timer service stopped
+ *   q/v cached read req/resp        q>v frozen => a DDR read never returned
+ *   w   cached write {req:done}     req>done => a DDR write never landed
+ *   l/h pc_lo..pc_hi               PC range executed since last console output
+ *   r/s last retired PCs            slot-1 / slot-2 commit PCs
+ *   m/n mtime lo/hi                CLINT time at snapshot
+ *   x/y mtimecmp lo/hi             CLINT compare at snapshot
+ *   d   mtimecmp-mtime low word     high bit set usually means compare is overdue
+ *   p   irq/status bits:
+ *       [0]=raw mtime>=mtimecmp, [1]=registered MTIP, [2]=MSIP, [3]=MEIP,
+ *       [4]=mie.MTIE, [5]=mstatus.MIE, [7:6]=priv, [8]=trap, [9]=mret
+ *   H   PC histogram, 64 buckets of 64 KiB keyed on pc[21:16] (kernel pc[31]=1)
+ *       => cycle-weighted hot region of the livelock (bucket k = 0x8000_0000 +
+ *       k*0x10000). The hottest bucket localizes the spin to a 64 KiB window.
+ *
+ * Non-latching: any console write resets the quiet timer + PC window.
+ */
+module hang_triage #(
+    parameter logic [31:0] QUIET_CYCLES  = 32'd400_000_000,  // ~3 s @133 MHz
+    parameter logic [31:0] REEMIT_CYCLES = 32'd134_000_000   // ~1 s
+) (
+    input logic i_clk,
+    input logic i_rst,
+
+    input logic        i_commit,
+    input logic        i_timer_event,
+    input logic        i_cread_req,
+    input logic        i_cread_resp,
+    input logic        i_cwrite_req,
+    input logic        i_cwrite_done,
+    input logic [31:0] i_pc,
+    input logic        i_commit0_valid,
+    input logic [31:0] i_commit0_pc,
+    input logic        i_commit1_valid,
+    input logic [31:0] i_commit1_pc,
+    input logic [31:0] i_mtime_lo,
+    input logic [31:0] i_mtime_hi,
+    input logic [31:0] i_mtimecmp_lo,
+    input logic [31:0] i_mtimecmp_hi,
+    input logic [31:0] i_mtimecmp_delta_lo,
+    input logic [31:0] i_irq_status,
+    input logic        i_uart_busy,
+
+    input  logic       i_uart_ready,
+    output logic       o_active,
+    output logic       o_wr_en,
+    output logic [7:0] o_wr_data
+);
+
+  // ---- Free-running event counters ------------------------------------------
+  logic [31:0] cnt_commit, cnt_timer, cnt_cread_req, cnt_cread_resp;
+  logic [31:0] cnt_cwrite_req, cnt_cwrite_done;
+  always_ff @(posedge i_clk) begin
+    if (i_rst) begin
+      cnt_commit      <= 32'd0;
+      cnt_timer       <= 32'd0;
+      cnt_cread_req   <= 32'd0;
+      cnt_cread_resp  <= 32'd0;
+      cnt_cwrite_req  <= 32'd0;
+      cnt_cwrite_done <= 32'd0;
+    end else begin
+      if (i_commit) cnt_commit <= cnt_commit + 32'd1;
+      if (i_timer_event) cnt_timer <= cnt_timer + 32'd1;
+      if (i_cread_req) cnt_cread_req <= cnt_cread_req + 32'd1;
+      if (i_cread_resp) cnt_cread_resp <= cnt_cread_resp + 32'd1;
+      if (i_cwrite_req) cnt_cwrite_req <= cnt_cwrite_req + 32'd1;
+      if (i_cwrite_done) cnt_cwrite_done <= cnt_cwrite_done + 32'd1;
+    end
+  end
+
+  // ---- PC histogram: 64 x 64 KiB buckets, kernel PCs only -------------------
+  logic [31:0] hist[64];
+  logic [5:0] pc_bucket;
+  assign pc_bucket = i_pc[21:16];
+  always_ff @(posedge i_clk) begin
+    if (i_rst || i_uart_busy) begin
+      // Clear while the console is active so the histogram reflects ONLY the
+      // quiet (hang) window, not the pre-hang boot execution.
+      for (int b = 0; b < 64; b++) hist[b] <= 32'd0;
+    end else if (i_pc[31]) begin  // count only kernel-range PCs
+      hist[pc_bucket] <= hist[pc_bucket] + 32'd1;
+    end
+  end
+
+  // ---- Console-idle timer + PC window ---------------------------------------
+  logic [31:0] quiet_cnt;
+  logic [31:0] pc_lo, pc_hi;
+  logic [31:0] last_commit0_pc, last_commit1_pc;
+  logic win_reset;
+  always_ff @(posedge i_clk) begin
+    if (i_rst) begin
+      quiet_cnt       <= 32'd0;
+      pc_lo           <= 32'hFFFFFFFF;
+      pc_hi           <= 32'h00000000;
+      last_commit0_pc <= 32'd0;
+      last_commit1_pc <= 32'd0;
+    end else if (i_uart_busy) begin
+      quiet_cnt <= 32'd0;
+      pc_lo     <= i_pc;
+      pc_hi     <= i_pc;
+      if (i_commit0_valid) last_commit0_pc <= i_commit0_pc;
+      if (i_commit1_valid) last_commit1_pc <= i_commit1_pc;
+    end else begin
+      if (quiet_cnt != 32'hFFFFFFFF) quiet_cnt <= quiet_cnt + 32'd1;
+      if (i_commit0_valid) last_commit0_pc <= i_commit0_pc;
+      if (i_commit1_valid) last_commit1_pc <= i_commit1_pc;
+      if (win_reset) begin
+        pc_lo <= i_pc;
+        pc_hi <= i_pc;
+      end else begin
+        if (i_pc < pc_lo) pc_lo <= i_pc;
+        if (i_pc > pc_hi) pc_hi <= i_pc;
+      end
+    end
+  end
+
+  // ---- Snapshot -------------------------------------------------------------
+  logic [31:0] snap_c, snap_t, snap_q, snap_v, snap_w, snap_l, snap_h, snap_r, snap_s;
+  logic [31:0] snap_m, snap_n, snap_x, snap_y, snap_d, snap_p;
+
+  // ---- ASCII emit FSM -------------------------------------------------------
+  typedef enum logic [2:0] {
+    EM_IDLE,
+    EM_PREFIX,
+    EM_FIELD,
+    EM_HPRE,
+    EM_HIST,
+    EM_GAP
+  } em_state_e;
+  em_state_e em_state;
+  logic [3:0] pcnt;
+  localparam logic [3:0] FieldLast = 4'd14;
+  logic [ 3:0] fld;
+  logic [ 3:0] fpos;
+  logic [ 5:0] hidx;
+  logic [ 3:0] hpos;  // 0..8 within a hist entry
+  logic [31:0] reemit_cnt;
+
+  assign win_reset = (em_state == EM_IDLE) && (quiet_cnt >= QUIET_CYCLES);
+
+  function automatic logic [7:0] hex4(input logic [3:0] n);
+    hex4 = (n < 4'd10) ? (8'h30 + {4'b0, n}) : (8'h41 + {4'b0, n} - 8'd10);
+  endfunction
+
+  function automatic logic [7:0] prefix_byte(input logic [3:0] i);
+    case (i)
+      4'd0:    prefix_byte = 8'h0A;
+      4'd1:    prefix_byte = "!";
+      4'd2:    prefix_byte = "!";
+      4'd3:    prefix_byte = "H";
+      4'd4:    prefix_byte = "A";
+      4'd5:    prefix_byte = "N";
+      4'd6:    prefix_byte = "G";
+      default: prefix_byte = " ";
+    endcase
+  endfunction
+
+  function automatic logic [7:0] label_byte(input logic [3:0] f);
+    case (f)
+      4'd0:    label_byte = "c";
+      4'd1:    label_byte = "t";
+      4'd2:    label_byte = "q";
+      4'd3:    label_byte = "v";
+      4'd4:    label_byte = "w";
+      4'd5:    label_byte = "l";
+      4'd6:    label_byte = "h";
+      4'd7:    label_byte = "r";
+      4'd8:    label_byte = "s";
+      4'd9:    label_byte = "m";
+      4'd10:   label_byte = "n";
+      4'd11:   label_byte = "x";
+      4'd12:   label_byte = "y";
+      4'd13:   label_byte = "d";
+      default: label_byte = "p";
+    endcase
+  endfunction
+
+  logic [31:0] fld_val;
+  always_comb begin
+    case (fld)
+      4'd0:    fld_val = snap_c;
+      4'd1:    fld_val = snap_t;
+      4'd2:    fld_val = snap_q;
+      4'd3:    fld_val = snap_v;
+      4'd4:    fld_val = snap_w;
+      4'd5:    fld_val = snap_l;
+      4'd6:    fld_val = snap_h;
+      4'd7:    fld_val = snap_r;
+      4'd8:    fld_val = snap_s;
+      4'd9:    fld_val = snap_m;
+      4'd10:   fld_val = snap_n;
+      4'd11:   fld_val = snap_x;
+      4'd12:   fld_val = snap_y;
+      4'd13:   fld_val = snap_d;
+      default: fld_val = snap_p;
+    endcase
+  end
+
+  logic [3:0] nib_idx;
+  always_comb begin
+    nib_idx = 4'd0;
+    if (fpos >= 4'd2 && fpos <= 4'd9) nib_idx = 4'd9 - fpos;
+  end
+
+  logic [3:0] hnib_idx;
+  always_comb begin
+    hnib_idx = 4'd0;
+    if (hpos <= 4'd7) hnib_idx = 4'd7 - hpos;
+  end
+
+  logic [7:0] emit_byte;
+  always_comb begin
+    emit_byte = 8'h20;
+    unique case (em_state)
+      EM_PREFIX: emit_byte = prefix_byte(pcnt);
+      EM_FIELD: begin
+        if (fpos == 4'd0) emit_byte = label_byte(fld);
+        else if (fpos == 4'd1) emit_byte = "=";
+        else if (fpos == 4'd10) emit_byte = 8'h20;
+        else emit_byte = hex4(fld_val[nib_idx*4+:4]);
+      end
+      EM_HPRE: emit_byte = (pcnt == 4'd0) ? 8'h0A : ((pcnt == 4'd1) ? "H" : " ");
+      EM_HIST:
+      emit_byte = (hpos == 4'd8) ? ((hidx == 6'd63) ? 8'h0A : 8'h20) :
+          hex4(hist[hidx][hnib_idx*4+:4]);
+      default: emit_byte = 8'h20;
+    endcase
+  end
+
+  always_ff @(posedge i_clk) begin
+    if (i_rst) begin
+      em_state   <= EM_IDLE;
+      pcnt       <= 4'd0;
+      fld        <= 4'd0;
+      fpos       <= 4'd0;
+      hidx       <= 6'd0;
+      hpos       <= 4'd0;
+      reemit_cnt <= 32'd0;
+      o_active   <= 1'b0;
+      o_wr_en    <= 1'b0;
+      o_wr_data  <= 8'd0;
+    end else begin
+      o_wr_en <= 1'b0;
+      case (em_state)
+        EM_IDLE: begin
+          if (quiet_cnt >= QUIET_CYCLES) begin
+            snap_c   <= cnt_commit;
+            snap_t   <= cnt_timer;
+            snap_q   <= cnt_cread_req;
+            snap_v   <= cnt_cread_resp;
+            snap_w   <= {cnt_cwrite_req[15:0], cnt_cwrite_done[15:0]};
+            snap_l   <= pc_lo;
+            snap_h   <= pc_hi;
+            snap_r   <= last_commit0_pc;
+            snap_s   <= last_commit1_pc;
+            snap_m   <= i_mtime_lo;
+            snap_n   <= i_mtime_hi;
+            snap_x   <= i_mtimecmp_lo;
+            snap_y   <= i_mtimecmp_hi;
+            snap_d   <= i_mtimecmp_delta_lo;
+            snap_p   <= i_irq_status;
+            o_active <= 1'b1;
+            pcnt     <= 4'd0;
+            em_state <= EM_PREFIX;
+          end
+        end
+        EM_PREFIX:
+        if (i_uart_ready) begin
+          o_wr_en   <= 1'b1;
+          o_wr_data <= emit_byte;
+          if (pcnt == 4'd7) begin
+            fld <= 4'd0;
+            fpos <= 4'd0;
+            em_state <= EM_FIELD;
+          end else pcnt <= pcnt + 4'd1;
+        end
+        EM_FIELD:
+        if (i_uart_ready) begin
+          o_wr_en   <= 1'b1;
+          o_wr_data <= emit_byte;
+          if (fpos == 4'd10) begin
+            if (fld == FieldLast) begin
+              pcnt <= 4'd0;
+              em_state <= EM_HPRE;
+            end else begin
+              fld  <= fld + 4'd1;
+              fpos <= 4'd0;
+            end
+          end else fpos <= fpos + 4'd1;
+        end
+        EM_HPRE:
+        if (i_uart_ready) begin
+          o_wr_en   <= 1'b1;
+          o_wr_data <= emit_byte;
+          if (pcnt == 4'd2) begin
+            hidx <= 6'd0;
+            hpos <= 4'd0;
+            em_state <= EM_HIST;
+          end else pcnt <= pcnt + 4'd1;
+        end
+        EM_HIST:
+        if (i_uart_ready) begin
+          o_wr_en   <= 1'b1;
+          o_wr_data <= emit_byte;
+          if (hpos == 4'd8) begin
+            if (hidx == 6'd63) begin
+              em_state   <= EM_GAP;
+              reemit_cnt <= REEMIT_CYCLES;
+            end else begin
+              hidx <= hidx + 6'd1;
+              hpos <= 4'd0;
+            end
+          end else hpos <= hpos + 4'd1;
+        end
+        EM_GAP: begin
+          o_active <= 1'b0;
+          if (reemit_cnt <= 32'd1) em_state <= EM_IDLE;
+          else reemit_cnt <= reemit_cnt - 32'd1;
+        end
+        default: em_state <= EM_IDLE;
+      endcase
+    end
+  end
+
+endmodule : hang_triage
diff --git a/hw/rtl/frost.sv b/hw/rtl/frost.sv
index 8bb657e3..ffde4255 100644
--- a/hw/rtl/frost.sv
+++ b/hw/rtl/frost.sv
@@ -54,6 +54,10 @@ module frost #(
     parameter int unsigned L1_CACHE_BYTES = 128 * 1024,
     parameter int unsigned L1I_CACHE_BYTES = 16 * 1024,
     parameter int unsigned L2_CACHE_BYTES = 2 * 1024 * 1024,
+    // Simulation-only fast cache maintenance for fence.i: 0 = FPGA (cycle-
+    // accurate maintenance FSM, unchanged); non-zero = sim fast path (see
+    // frost_cache). Set to 1 only by the cocotb sim build, never for boards.
+    parameter int unsigned SIM_FAST_MAINT = 0,
     // Behavioral main-memory model knobs (simulation only).
     parameter int unsigned DDR_MODEL_BYTES = 64 * 1024 * 1024,
     parameter int unsigned DDR_MODEL_LATENCY = 30,
@@ -62,7 +66,9 @@ module frost #(
     // them to their DDR controller subsystem).
     parameter int unsigned USE_BEHAVIORAL_DDR = 1,
     // Simulation-only fetch-latency fuzz (see cpu_and_mem). Hardware keeps 0.
-    parameter int unsigned FETCH_VALID_FUZZ = 0
+    parameter int unsigned FETCH_VALID_FUZZ = 0,
+    // Optional on-silicon boot-hang classifier that can emit over UART.
+    parameter int unsigned ENABLE_HANG_TRIAGE = 0
 ) (
     input logic i_clk,
     input logic i_clk_div4,
@@ -193,10 +199,12 @@ module frost #(
       .L1_CACHE_BYTES(L1_CACHE_BYTES),
       .L1I_CACHE_BYTES(L1I_CACHE_BYTES),
       .L2_CACHE_BYTES(L2_CACHE_BYTES),
+      .SIM_FAST_MAINT(SIM_FAST_MAINT),
       .DDR_MODEL_BYTES(DDR_MODEL_BYTES),
       .DDR_MODEL_LATENCY(DDR_MODEL_LATENCY),
       .USE_BEHAVIORAL_DDR(USE_BEHAVIORAL_DDR),
-      .FETCH_VALID_FUZZ(FETCH_VALID_FUZZ)
+      .FETCH_VALID_FUZZ(FETCH_VALID_FUZZ),
+      .ENABLE_HANG_TRIAGE(ENABLE_HANG_TRIAGE)
   ) cpu_and_memory_subsystem (
       .i_clk,
       .i_clk_div4,
diff --git a/hw/rtl/lib/cache/axi_behavioral_memory.sv b/hw/rtl/lib/cache/axi_behavioral_memory.sv
index 637aed1e..627baeea 100644
--- a/hw/rtl/lib/cache/axi_behavioral_memory.sv
+++ b/hw/rtl/lib/cache/axi_behavioral_memory.sv
@@ -16,8 +16,8 @@
 
 /*
  * axi_behavioral_memory -- SIMULATION-ONLY main-memory model (stands in for
- * the DDR controller in Phase 1; replaced by the MIG + SmartConnect on
- * hardware). AXI4 slave, single-beat 256-bit transactions (asserts on
+ * the DDR controller in Phase 1; replaced by the board's DDR controller
+ * (MIG DDR3 / DDR4 IP) + SmartConnect on hardware). AXI4 slave, single-beat 256-bit transactions (asserts on
  * anything else), parameterized response latency to mimic DDR access time.
  *
  * The array is dense and parameter-sized (default 64 MiB) while the DECODED
diff --git a/hw/rtl/lib/cache/frost_cache.sv b/hw/rtl/lib/cache/frost_cache.sv
index d67bb1b0..b413e7a1 100644
--- a/hw/rtl/lib/cache/frost_cache.sv
+++ b/hw/rtl/lib/cache/frost_cache.sv
@@ -72,7 +72,17 @@ module frost_cache #(
     // verilog_lint: waive explicit-parameter-storage-type
     parameter DATA_MEMORY_PRIMITIVE = "block",
     parameter int unsigned DATA_READ_LATENCY = 2,
-    parameter int unsigned DATA_WRITE_LATENCY = 1
+    parameter int unsigned DATA_WRITE_LATENCY = 1,
+    // Simulation-only fast cache maintenance (fence.i). 0 = FPGA: the
+    // cycle-accurate maintenance FSM below is byte-for-byte unchanged. Non-zero
+    // = simulation: invalidate-all completes in a single cycle (a tag bulk
+    // clear) and writeback-all iterates only the dirty lines -- O(dirty) rather
+    // than O(NumLines) -- guided by a sim-only shadow of the dirty bits. The
+    // functional effect is identical to the slow path: every line is left
+    // invalid after invalidate-all, and every valid+dirty line is still written
+    // downstream and marked clean by writeback-all. Threaded in only for the
+    // cocotb sim build; never set for board/synthesis builds.
+    parameter int unsigned SIM_FAST_MAINT = 0
 ) (
     input logic i_clk,
     input logic i_rst,
@@ -156,12 +166,31 @@ module frost_cache #(
   logic   [IndexBits-1:0] flush_idx_q;
   logic   [  TagBits-1:0] flush_tag_q;
 
+  // Real-FSM (FPGA) writeback-all acceleration: bound the index walk to the
+  // [wb_lo_q, wb_hi_q] span of lines made dirty since the last writeback-all,
+  // instead of scanning all NumLines on every fence.i. Cheap and synthesizable
+  // (two index regs + a 1-bit "any dirty" flag), unlike the SIM_FAST_MAINT
+  // shadow's NumLines-bit priority encoder. wb_any_q == 0 means no dirty lines.
+  logic [IndexBits-1:0] wb_lo_q, wb_hi_q;
+  logic wb_any_q;
+
+  // Fast maintenance (SIM_FAST_MAINT, simulation only).
+  // tag_bulk_clear: one-cycle invalidate-all of the whole tag array.
+  // any_dirty_*/first_dirty_*: lowest dirty line index from the sim-only dirty
+  // shadow, used to walk only dirty lines during writeback-all. All driven to
+  // constants when the feature is off, so the FPGA build carries none of it.
+  logic tag_bulk_clear;
+  logic any_dirty_full, any_dirty_excl;
+  logic [IndexBits-1:0] first_dirty_full, first_dirty_excl;
+
   // Writeback-all walk states (data/tag addressing + busy).
-  logic                   flush_active;
+  logic flush_active;
   assign flush_active = (state_q == S_FLUSH_SCAN) || (state_q == S_FLUSH_CHECK) ||
       (state_q == S_FLUSH_DATA) || (state_q == S_FLUSH_WB_REQ) ||
       (state_q == S_FLUSH_WB_WAIT);
   assign o_maint_busy = flush_active || (state_q == S_SWEEP);
+  // Fast invalidate-all: hold the tag bulk clear for the (now one-cycle) sweep.
+  assign tag_bulk_clear = (SIM_FAST_MAINT != 0) && (state_q == S_SWEEP);
   logic [             7:0] wait_cnt_q;  // data-array latency countdown (latencies are small)
   logic [     TagBits-1:0] victim_tag_q;
   logic [    LineBits-1:0] victim_line_q;
@@ -197,16 +226,91 @@ module frost_cache #(
 
   sdp_block_ram #(
       .ADDR_WIDTH(IndexBits),
-      .DATA_WIDTH(TagEntryBits)
+      .DATA_WIDTH(TagEntryBits),
+      .SUPPORT_BULK_CLEAR(SIM_FAST_MAINT)
   ) tag_array (
       .i_clk(i_clk),
       .i_write_enable(tag_we),
+      .i_bulk_clear(tag_bulk_clear),
       .i_write_address(tag_waddr),
       .i_read_address(tag_raddr),
       .i_write_data(tag_wdata),
       .o_read_data(tag_rdata)
   );
 
+  // ---- Fast maintenance dirty shadow (SIM_FAST_MAINT, simulation only) ------
+  // A shadow of the tag array's dirty bits, updated by the exact same writes
+  // that update the tag RAM, so writeback-all can jump straight to dirty lines
+  // instead of scanning every index. Elaborated only when the feature is on:
+  // FPGA/synthesis builds carry none of this logic and read the constant
+  // outputs below.
+  if (SIM_FAST_MAINT != 0) begin : gen_fast_maint
+    logic [NumLines-1:0] dirty_shadow_q;
+    always_ff @(posedge i_clk) begin
+      if (i_rst) dirty_shadow_q <= '0;
+      else if (tag_bulk_clear) dirty_shadow_q <= '0;  // invalidate-all / reset
+      // tag_wdata = {valid, dirty, tag}: bit TagBits is the dirty bit.
+      else if (tag_we) dirty_shadow_q[tag_waddr] <= tag_wdata[TagBits];
+    end
+
+    // Lowest set dirty index over the whole shadow (first_dirty_full) and
+    // excluding the line being written back this cycle (first_dirty_excl). The
+    // scan is gated to the writeback-all states, so ordinary traffic never pays
+    // for it -- a dirty store just toggles one shadow bit above.
+    always_comb begin
+      any_dirty_full   = 1'b0;
+      first_dirty_full = '0;
+      any_dirty_excl   = 1'b0;
+      first_dirty_excl = '0;
+      if ((state_q == S_IDLE && i_writeback_all) || flush_active) begin
+        for (int idx = int'(NumLines) - 1; idx >= 0; idx--) begin
+          if (dirty_shadow_q[idx]) begin
+            any_dirty_full   = 1'b1;
+            first_dirty_full = IndexBits'(idx);
+            if (IndexBits'(idx) != flush_idx_q) begin
+              any_dirty_excl   = 1'b1;
+              first_dirty_excl = IndexBits'(idx);
+            end
+          end
+        end
+      end
+    end
+  end else begin : gen_no_fast_maint
+    assign any_dirty_full   = 1'b0;
+    assign first_dirty_full = '0;
+    assign any_dirty_excl   = 1'b0;
+    assign first_dirty_excl = '0;
+  end
+
+  // ---- Real-FSM writeback-all dirty-range tracker ---------------------------
+  // Mirror the dirty-bit writes (tag_we with the dirty bit set, at tag_waddr --
+  // i.e. the S_TAG_CHECK write-hit and the S_ALLOC write-allocate) into the
+  // lowest/highest dirty index. The real (FPGA) writeback-all walk then scans
+  // only [wb_lo_q, wb_hi_q]. No upstream request is accepted while a walk runs
+  // (o_up_req_ready is low for the duration), so the span is stable across it.
+  // real_wb_done is exactly the cycle the real walk returns to S_IDLE; clearing
+  // the span there is safe because every dirty line in the span has been written
+  // back and lines outside it were never dirty -> wb_any_q==0 iff no dirty line.
+  logic dirty_set;
+  assign dirty_set = tag_we && tag_wdata[TagBits];
+  logic real_wb_done;
+  assign real_wb_done = (SIM_FAST_MAINT == 0) &&
+      ((state_q == S_FLUSH_CHECK && !(tag_rdata_valid && tag_rdata_dirty) &&
+            (!wb_any_q || (flush_idx_q == wb_hi_q))) ||
+       (state_q == S_FLUSH_WB_WAIT && i_down_resp_valid && (flush_idx_q == wb_hi_q)));
+
+  always_ff @(posedge i_clk) begin
+    if (i_rst || real_wb_done) begin
+      wb_lo_q  <= {IndexBits{1'b1}};
+      wb_hi_q  <= '0;
+      wb_any_q <= 1'b0;
+    end else if (dirty_set) begin
+      wb_lo_q  <= (!wb_any_q || (tag_waddr < wb_lo_q)) ? tag_waddr : wb_lo_q;
+      wb_hi_q  <= (!wb_any_q || (tag_waddr > wb_hi_q)) ? tag_waddr : wb_hi_q;
+      wb_any_q <= 1'b1;
+    end
+  end
+
   // Tag read address: the incoming request's index, sampled at the fire so
   // the entry is readable in S_TAG_CHECK; the walk index during the
   // writeback-all scan. Don't-care in every other state.
@@ -273,9 +377,14 @@ module frost_cache #(
 
     unique case (state_q)
       S_SWEEP: begin
-        tag_we    = 1'b1;
-        tag_waddr = sweep_idx_q;
-        tag_wdata = '0;  // valid=0, dirty=0
+        // FPGA: clear one tag entry per cycle. Fast (sim): the tag bulk clear
+        // (tag_bulk_clear -> tag_array.i_bulk_clear) zeroes every entry this
+        // single cycle, so no per-index write is issued here.
+        if (SIM_FAST_MAINT == 0) begin
+          tag_we    = 1'b1;
+          tag_waddr = sweep_idx_q;
+          tag_wdata = '0;  // valid=0, dirty=0
+        end
       end
 
       S_TAG_CHECK: begin
@@ -353,8 +462,13 @@ module frost_cache #(
     end else begin
       unique case (state_q)
         S_SWEEP: begin
-          sweep_idx_q <= sweep_idx_q + 1'b1;
-          if (sweep_idx_q == {IndexBits{1'b1}}) state_q <= S_IDLE;
+          if (SIM_FAST_MAINT != 0) begin
+            // Fast: tag_bulk_clear zeroed every entry this cycle -- done.
+            state_q <= S_IDLE;
+          end else begin
+            sweep_idx_q <= sweep_idx_q + 1'b1;
+            if (sweep_idx_q == {IndexBits{1'b1}}) state_q <= S_IDLE;
+          end
         end
 
         S_IDLE: begin
@@ -364,7 +478,10 @@ module frost_cache #(
             sweep_idx_q <= '0;
             state_q     <= S_SWEEP;
           end else if (i_writeback_all) begin
-            flush_idx_q <= '0;
+            // Fast: jump straight to the first dirty line (O(dirty) walk).
+            // FPGA: start the walk at the bottom of the dirty span (0 if the
+            // cache holds no dirty line -- a single scan cycle then finishes).
+            flush_idx_q <= (SIM_FAST_MAINT != 0) ? first_dirty_full : (wb_any_q ? wb_lo_q : '0);
             state_q     <= S_FLUSH_SCAN;
           end else if (up_req_fire) begin
             req_write_q <= i_up_req_write;
@@ -446,7 +563,12 @@ module frost_cache #(
             wait_cnt_q  <= 8'(DATA_READ_LATENCY);
             flush_tag_q <= tag_rdata_tag;
             state_q     <= S_FLUSH_DATA;
-          end else if (flush_idx_q == {IndexBits{1'b1}}) begin
+          end else if (SIM_FAST_MAINT != 0) begin
+            // Fast: a non-dirty line is only reached when the shadow is empty
+            // (no dirty lines to start with), so the writeback-all is done.
+            state_q <= S_IDLE;
+          end else if (!wb_any_q || (flush_idx_q == wb_hi_q)) begin
+            // Real FSM: scanned the whole dirty span (or nothing was dirty).
             state_q <= S_IDLE;
           end else begin
             flush_idx_q <= flush_idx_q + 1'b1;
@@ -466,7 +588,19 @@ module frost_cache #(
 
         S_FLUSH_WB_WAIT: begin
           if (i_down_resp_valid) begin
-            if (flush_idx_q == {IndexBits{1'b1}}) begin
+            // This line's dirty bit is cleared this cycle (combinational tag
+            // write above), and the sim-only shadow mirrors that clear.
+            if (SIM_FAST_MAINT != 0) begin
+              // Fast: jump to the next still-dirty line (excluding this one);
+              // when none remain the writeback-all is complete.
+              if (any_dirty_excl) begin
+                flush_idx_q <= first_dirty_excl;
+                state_q     <= S_FLUSH_SCAN;
+              end else begin
+                state_q <= S_IDLE;
+              end
+            end else if (flush_idx_q == wb_hi_q) begin
+              // Real FSM: just wrote back the top dirty line of the span -- done.
               state_q <= S_IDLE;
             end else begin
               flush_idx_q <= flush_idx_q + 1'b1;
diff --git a/hw/rtl/lib/cache/frost_cache_hierarchy.sv b/hw/rtl/lib/cache/frost_cache_hierarchy.sv
index bee1d18a..46f40584 100644
--- a/hw/rtl/lib/cache/frost_cache_hierarchy.sv
+++ b/hw/rtl/lib/cache/frost_cache_hierarchy.sv
@@ -48,7 +48,12 @@ module frost_cache_hierarchy #(
     parameter int unsigned L1I_DATA_READ_LATENCY = 2,
     parameter int unsigned L2_CACHE_BYTES = 2 * 1024 * 1024,
     parameter int unsigned L2_DATA_READ_LATENCY = 6,
-    parameter int unsigned L2_DATA_WRITE_LATENCY = 2
+    parameter int unsigned L2_DATA_WRITE_LATENCY = 2,
+    // Simulation-only fast cache maintenance for fence.i (see frost_cache).
+    // 0 = FPGA cycle-accurate FSM; non-zero = sim fast path. Applied to the two
+    // L1s -- the only caches that run fence.i maintenance; the L2 sits below the
+    // arbiter and needs none, so it keeps the default.
+    parameter int unsigned SIM_FAST_MAINT = 0
 ) (
     input logic i_clk,
     input logic i_rst,
@@ -134,7 +139,8 @@ module frost_cache_hierarchy #(
       .LINE_BYTES(LINE_BYTES),
       .DATA_MEMORY_PRIMITIVE("block"),
       .DATA_READ_LATENCY(L1_DATA_READ_LATENCY),
-      .DATA_WRITE_LATENCY(L1_DATA_WRITE_LATENCY)
+      .DATA_WRITE_LATENCY(L1_DATA_WRITE_LATENCY),
+      .SIM_FAST_MAINT(SIM_FAST_MAINT)
   ) l1_cache (
       .i_clk(i_clk),
       .i_rst(i_rst),
@@ -164,7 +170,8 @@ module frost_cache_hierarchy #(
       .CACHE_SIZE_BYTES(L1I_CACHE_BYTES),
       .LINE_BYTES(LINE_BYTES),
       .DATA_MEMORY_PRIMITIVE("block"),
-      .DATA_READ_LATENCY(L1I_DATA_READ_LATENCY)
+      .DATA_READ_LATENCY(L1I_DATA_READ_LATENCY),
+      .SIM_FAST_MAINT(SIM_FAST_MAINT)
   ) l1i_cache (
       .i_clk(i_clk),
       .i_rst(i_rst),
diff --git a/hw/rtl/lib/cache/frost_cache_test_harness.sv b/hw/rtl/lib/cache/frost_cache_test_harness.sv
index 96973797..ae189128 100644
--- a/hw/rtl/lib/cache/frost_cache_test_harness.sv
+++ b/hw/rtl/lib/cache/frost_cache_test_harness.sv
@@ -36,7 +36,10 @@ module frost_cache_test_harness #(
     parameter int unsigned L2_DATA_WRITE_LATENCY = 2,
     parameter logic [31:0] BASE_ADDR = 32'h8000_0000,
     parameter int unsigned MEM_BYTES = 4 * 1024 * 1024,
-    parameter int unsigned MEM_LATENCY = 12
+    parameter int unsigned MEM_LATENCY = 12,
+    // Simulation-only fast cache maintenance for fence.i (see frost_cache). The
+    // cocotb cache registry runs this bench with it both off (default) and on.
+    parameter int unsigned SIM_FAST_MAINT = 0
 ) (
     input  logic                    i_clk,
     input  logic                    i_rst,
@@ -76,7 +79,8 @@ module frost_cache_test_harness #(
       .L1I_CACHE_BYTES(L1I_CACHE_BYTES),
       .L2_CACHE_BYTES(L2_CACHE_BYTES),
       .L2_DATA_READ_LATENCY(L2_DATA_READ_LATENCY),
-      .L2_DATA_WRITE_LATENCY(L2_DATA_WRITE_LATENCY)
+      .L2_DATA_WRITE_LATENCY(L2_DATA_WRITE_LATENCY),
+      .SIM_FAST_MAINT(SIM_FAST_MAINT)
   ) cache_hierarchy (
       .i_clk(i_clk),
       .i_rst(i_rst),
diff --git a/hw/rtl/lib/ram/mwp_dist_ram_ohread.sv b/hw/rtl/lib/ram/mwp_dist_ram_ohread.sv
new file mode 100644
index 00000000..8381edee
--- /dev/null
+++ b/hw/rtl/lib/ram/mwp_dist_ram_ohread.sv
@@ -0,0 +1,134 @@
+/*
+ *    Copyright 2026 Two Sigma Open Source, LLC
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+/*
+ * mwp_dist_ram with a ONE-HOT read select for the Live Value Table.
+ *
+ * Identical storage/write semantics to mwp_dist_ram (one sdp_dist_ram bank per
+ * write port + register LVT, highest-numbered port wins on same-address
+ * writes).  The difference is purely a TIMING restructure of the read path:
+ * the caller supplies BOTH the binary read address (still used for the banks'
+ * LUTRAM address pins, which require binary) AND a registered one-hot image of
+ * the same address (i_read_onehot).  The LVT bank-select lookup — a 32:1 mux
+ * of registered LVT bits behind a high-fanout binary select in the base
+ * module — becomes an AND-OR reduction over per-entry one-hot bits:
+ *
+ *   lvt_read_sel = OR_i (i_read_onehot[i] ? lvt[i] : '0)
+ *
+ * CONTRACT (caller invariant): i_read_onehot == (1 << i_read_address) in
+ * every cycle where o_read_data is consumed.  Under that invariant the
+ * reduction equals lvt[i_read_address] exactly, so o_read_data is
+ * bit-identical to the base module's.  A simulation-only check below fires if
+ * the invariant is ever violated.
+ *
+ * Intended use: the reorder buffer head / head+1 read ports, whose one-hot
+ * images (head_clear_mask / head_next_clear_mask) are already maintained as
+ * registers that move in lockstep with head_ptr.
+ */
+module mwp_dist_ram_ohread #(
+    parameter int unsigned ADDR_WIDTH      = 5,   // Address width in bits
+    parameter int unsigned DATA_WIDTH      = 32,  // Data width in bits
+    parameter int unsigned NUM_WRITE_PORTS = 2    // Number of write ports (>= 2)
+) (
+    input logic i_clk,
+
+    // Write ports (active-high enables, independent addresses and data)
+    input logic [NUM_WRITE_PORTS-1:0]                 i_write_enable,
+    input logic [NUM_WRITE_PORTS-1:0][ADDR_WIDTH-1:0] i_write_address,
+    input logic [NUM_WRITE_PORTS-1:0][DATA_WIDTH-1:0] i_write_data,
+
+    // Read port (asynchronous / combinational).
+    // i_read_address feeds the LUTRAM banks (binary); i_read_onehot must be a
+    // registered one-hot image of the SAME address and steers the LVT select.
+    input  logic [   ADDR_WIDTH-1:0] i_read_address,
+    input  logic [2**ADDR_WIDTH-1:0] i_read_onehot,
+    output logic [   DATA_WIDTH-1:0] o_read_data
+);
+
+  localparam int unsigned RamDepth = 2 ** ADDR_WIDTH;
+  localparam int unsigned SelWidth = $clog2(NUM_WRITE_PORTS);
+
+  // ---------------------------------------------------------------------------
+  // RAM bank per write port (identical to mwp_dist_ram)
+  // ---------------------------------------------------------------------------
+  logic [NUM_WRITE_PORTS-1:0][DATA_WIDTH-1:0] bank_read_data;
+
+  for (genvar wp = 0; wp < NUM_WRITE_PORTS; wp++) begin : g_banks
+    sdp_dist_ram #(
+        .ADDR_WIDTH(ADDR_WIDTH),
+        .DATA_WIDTH(DATA_WIDTH)
+    ) u_bank (
+        .i_clk,
+        .i_write_enable (i_write_enable[wp]),
+        .i_write_address(i_write_address[wp]),
+        .i_read_address (i_read_address),
+        .i_write_data   (i_write_data[wp]),
+        .o_read_data    (bank_read_data[wp])
+    );
+  end : g_banks
+
+  // ---------------------------------------------------------------------------
+  // Live Value Table (register-based, identical write behavior)
+  // ---------------------------------------------------------------------------
+  logic [SelWidth-1:0] lvt[RamDepth];
+
+  initial for (int i = 0; i < RamDepth; ++i) lvt[i] = '0;
+
+  always_ff @(posedge i_clk) begin
+    for (int wp = 0; wp < NUM_WRITE_PORTS; wp++) begin
+      if (i_write_enable[wp]) lvt[i_write_address[wp]] <= SelWidth'(wp);
+    end
+  end
+
+  // ---------------------------------------------------------------------------
+  // Read mux — LVT selected via the one-hot AND-OR instead of a binary mux
+  // ---------------------------------------------------------------------------
+  logic [SelWidth-1:0] lvt_read_sel;
+  always_comb begin
+    lvt_read_sel = '0;
+    for (int i = 0; i < RamDepth; i++) begin
+      if (i_read_onehot[i]) lvt_read_sel |= lvt[i];
+    end
+  end
+
+  assign o_read_data = bank_read_data[lvt_read_sel];
+
+`ifndef SYNTHESIS
+`ifndef FORMAL
+  // Simulation-only contract check: the one-hot select must mirror the binary
+  // read address whenever both are known.  A mismatch would silently return
+  // the wrong bank's data, so treat it as an error.  The all-zero case is
+  // tolerated: it only occurs before the caller's reset has loaded the mask
+  // register (2-state sims read uninitialized FFs as 0), where it selects
+  // bank 0 exactly like the base module's initial lvt='0 read would.
+  // (FORMAL builds exclude this block — yosys cannot elaborate $error in a
+  // clocked process; the equivalent invariant is proven as
+  // p_head_mask_onehot / p_head_next_mask_onehot in the reorder_buffer's
+  // FORMAL section instead.)
+  always @(posedge i_clk) begin
+    if (!$isunknown(
+            i_read_address
+        ) && !$isunknown(
+            i_read_onehot
+        ) && (i_read_onehot != '0) && (i_read_onehot != (RamDepth'(1) << i_read_address))) begin
+      $error("mwp_dist_ram_ohread: i_read_onehot (0x%0h) != 1 << i_read_address (%0d)",
+             i_read_onehot, i_read_address);
+    end
+  end
+`endif
+`endif
+
+endmodule : mwp_dist_ram_ohread
diff --git a/hw/rtl/lib/ram/ram.f b/hw/rtl/lib/ram/ram.f
index bbf43814..3cd92a74 100644
--- a/hw/rtl/lib/ram/ram.f
+++ b/hw/rtl/lib/ram/ram.f
@@ -13,6 +13,9 @@
 # Two-read-port variant of mwp_dist_ram (shared LVT + banks, two async reads)
 $(ROOT)/hw/rtl/lib/ram/mwp_dist_ram_2r.sv
 
+# One-hot-read-select variant of mwp_dist_ram (LVT select via registered one-hot)
+$(ROOT)/hw/rtl/lib/ram/mwp_dist_ram_ohread.sv
+
 # Simple dual-port block RAM (sync read, sync write)
 $(ROOT)/hw/rtl/lib/ram/sdp_block_ram.sv
 
diff --git a/hw/rtl/lib/ram/sdp_block_ram.sv b/hw/rtl/lib/ram/sdp_block_ram.sv
index 81851060..b50b3d93 100644
--- a/hw/rtl/lib/ram/sdp_block_ram.sv
+++ b/hw/rtl/lib/ram/sdp_block_ram.sv
@@ -26,10 +26,21 @@
  */
 module sdp_block_ram #(
     parameter int unsigned ADDR_WIDTH = 5,  // Address width in bits
-    parameter int unsigned DATA_WIDTH = 32  // Data width in bits
+    parameter int unsigned DATA_WIDTH = 32,  // Data width in bits
+    // Simulation-only bulk-clear support. 0 (FPGA/synthesis): this module is
+    // byte-for-byte the plain single-write block RAM -- the clear path is not
+    // elaborated, so inference is unchanged. Non-zero: a sim-only path lets
+    // i_bulk_clear zero every entry in one cycle (frost_cache's fast
+    // invalidate-all). The clear branch lives in a generate that is elaborated
+    // only when this is set, so no synthesis flow ever sees the array-wide
+    // reset.
+    parameter int unsigned SUPPORT_BULK_CLEAR = 0
 ) (
     input logic i_clk,
     input logic i_write_enable,
+    // Sim-only one-cycle clear of every entry (see SUPPORT_BULK_CLEAR). Tied
+    // low / unused on FPGA builds (SUPPORT_BULK_CLEAR = 0).
+    input logic i_bulk_clear,
     input logic [ADDR_WIDTH-1:0] i_write_address,
     input logic [ADDR_WIDTH-1:0] i_read_address,
     input logic [DATA_WIDTH-1:0] i_write_data,
@@ -42,8 +53,18 @@ module sdp_block_ram #(
   // Initialize all memory locations to zero
   initial for (int i = 0; i < RamDepth; ++i) ram[i] = '0;
 
-  // Synchronous write operation
-  always_ff @(posedge i_clk) if (i_write_enable) ram[i_write_address] <= i_write_data;
+  // Synchronous write. SUPPORT_BULK_CLEAR picks the write block at elaboration:
+  // the FPGA path is exactly the original single-port write (so block-RAM
+  // inference is unchanged); the sim-only path adds a one-cycle clear-all that
+  // takes priority over a write. Only one branch ever exists in a build.
+  if (SUPPORT_BULK_CLEAR != 0) begin : gen_clearable_write
+    always_ff @(posedge i_clk) begin
+      if (i_bulk_clear) for (int i = 0; i < int'(RamDepth); ++i) ram[i] <= '0;
+      else if (i_write_enable) ram[i_write_address] <= i_write_data;
+    end
+  end else begin : gen_plain_write
+    always_ff @(posedge i_clk) if (i_write_enable) ram[i_write_address] <= i_write_data;
+  end
 
   // Synchronous read - output registered for block RAM inference and timing
   always_ff @(posedge i_clk) o_read_data <= ram[i_read_address];
diff --git a/hw/rtl/lib/ram/sdp_block_ram_dc.sv b/hw/rtl/lib/ram/sdp_block_ram_dc.sv
index b17a6995..efcb7584 100644
--- a/hw/rtl/lib/ram/sdp_block_ram_dc.sv
+++ b/hw/rtl/lib/ram/sdp_block_ram_dc.sv
@@ -18,7 +18,8 @@
  * Dual-clock simple dual-port block RAM for clock domain crossing.
  * This module implements a block RAM with separate clocks for read and write ports,
  * enabling safe data transfer between different clock domains. The write port operates
- * on i_wr_clk while the read port operates on i_rd_clk, with the block RAM providing
+ * on i_write_clock while the read port operates on i_read_clock, with the block RAM
+ * providing
  * inherent synchronization. Both ports have registered (single-cycle latency) access
  * to ensure clean timing and proper block RAM inference. This module is specifically
  * designed for use in asynchronous FIFOs where write and read operations occur in
diff --git a/hw/rtl/peripherals/uart_rx.sv b/hw/rtl/peripherals/uart_rx.sv
index bb62bed8..8f646e72 100644
--- a/hw/rtl/peripherals/uart_rx.sv
+++ b/hw/rtl/peripherals/uart_rx.sv
@@ -113,7 +113,7 @@ module uart_rx #(
 
       STATE_DATA_BITS: begin
         // Move to stop bit after all 8 data bits received
-        if (baud_rate_prescaler_counter == 0 && bits_remaining_counter == 0) begin
+        if (baud_rate_prescaler_counter == 0 && bits_remaining_counter == 1) begin
           next_state = STATE_STOP_BIT;
         end
       end
diff --git a/hw/sim/cpu_tb.sv b/hw/sim/cpu_tb.sv
index d360a0da..3c474531 100644
--- a/hw/sim/cpu_tb.sv
+++ b/hw/sim/cpu_tb.sv
@@ -48,16 +48,52 @@ module cpu_tb
 );
 
   // Internal signals (names match CPU port names for wildcard connection)
-  logic [31:0] i_instr;  // Registered instruction fed to CPU (raw 32-bit for C extension)
-  logic [1:0] i_instr_sideband;  // Predecode: {is_compressed_hi, is_compressed_lo}
+  // 64-bit fetch window {next_word, current_word} (the CPU fetches a word pair).
+  logic [63:0] i_instr;
+  // Per-32-bit-word predecode sideband (ImemSidebandWidth bits each half).
+  logic [riscv_pkg::ImemFetchSidebandWidth-1:0] i_instr_sideband;
+  logic i_instr_bank_sel_r;  // Fetch-word parity (pc_reg[2]) for the window
+  logic i_instr_valid;  // Fetch window valid (tie 1: fixed 1-cycle provider)
+  logic [31:0] i_served_addr;  // Served fetch-window tag (address fetched last cycle)
   logic [31:0] i_data_mem_rd_data;  // Data memory read data to CPU
   logic pipeline_stall_from_cpu;  // Stall signal monitoring (registered, 1-cycle delay)
   logic pipeline_stall_comb;  // Stall signal (combinational, immediate)
   logic reset_to_cpu;  // Reset signal monitoring
-  logic o_mmio_read_pulse;  // Unused in testbench; required for CPU .* connection
-  logic [31:0] o_mmio_load_addr;  // Unused in testbench; required for CPU .* connection
-  logic o_mmio_load_valid;  // Unused in testbench; required for CPU .* connection
-  logic o_pipeline_stall;  // Unused in testbench; required for CPU .* connection
+
+  // Registered 1-cycle fetch state (mimics block-RAM instruction memory latency)
+  logic [31:0] tb_cur_word;  // current fetch word presented to the CPU
+  logic tb_bank_sel_q;  // parity (PC[2]) of the fetched address
+  logic [31:0] tb_served_addr_q;  // address whose window is presented (o_pc, 1 cycle back)
+  localparam logic [31:0] TbNop = 32'h0000_0013;  // addi x0,x0,0
+
+  // Ports below are unused by this instruction-feed testbench but must exist as
+  // local signals so the wildcard (.*) connection to cpu_ooo resolves.
+  logic o_mmio_read_pulse;
+  logic [31:0] o_mmio_load_addr;
+  logic o_mmio_load_valid;
+  logic o_mmio_fifo0_read_pulse;
+  logic o_mmio_fifo1_read_pulse;
+  logic o_mmio_uart_rx_ready_pulse;
+  logic o_pipeline_stall;
+  logic o_fetch_replay_consume;
+  // FENCE.I cache-sync handshake (no I-cache here; completed immediately below)
+  logic o_fence_i_sync_req;
+  logic i_fence_i_sync_done;
+  logic o_fence_i_flush;
+  // Cached (high-address) tier request outputs + response inputs (tied idle:
+  // the directed programs touch only the low BRAM range, never CACHED_BASE).
+  logic [3:0] o_data_mem_cached_byte_wr_en;
+  logic [31:0] o_data_mem_cached_wr_data;
+  logic o_data_mem_cached_read_enable;
+  logic [31:0] i_cached_read_data;
+  logic i_cached_read_valid;
+  logic i_cached_write_done;
+  logic i_cached_write_inflight;
+  // Debug taps (read from cocotb via device_under_test.*; also exposed here).
+  logic [5:0] o_debug_irq_status;
+  logic [31:0] o_debug_commit_pc;
+  logic [31:0] o_debug_commit_2_pc;
+  logic [1:0] o_debug_commit_valid;
 
   // Interrupt and timer signals for CPU (controllable from testbench)
   // Use reg type to allow testbench to drive values via force/deposit
@@ -81,14 +117,42 @@ module cpu_tb
   always_ff @(posedge i_clk) begin
     // Stall signal from CPU observed on next rising edge
     pipeline_stall_from_cpu <= device_under_test.pipeline_ctrl.stall;
-    // Mimic one cycle read latency of block RAM instruction memory port
-    i_instr <= instruction_from_testbench;
-    // Compute sideband: {is_compressed_hi, is_compressed_lo}
-    // A halfword is compressed when its low 2 bits != 2'b11
-    i_instr_sideband[0] <= (instruction_from_testbench[1:0] != 2'b11);
-    i_instr_sideband[1] <= (instruction_from_testbench[17:16] != 2'b11);
+    // Mimic one cycle read latency of block RAM instruction memory port: the
+    // word for the address requested on o_pc this cycle is presented next cycle.
+    tb_cur_word <= instruction_from_testbench;
+    tb_bank_sel_q <= o_pc[2];  // parity of the fetched address
+    tb_served_addr_q <= o_pc;  // served-window tag: the address fetched last cycle
   end
 
+  // 64-bit fetch window {next_word, current_word}. The testbench feeds only
+  // 32-bit, 4-byte-aligned instructions (no compressed, no halfword spanning),
+  // so the "next word" half is never consumed (spanning only fires at pc[1]);
+  // drive a NOP there.
+  assign i_instr = {TbNop, tb_cur_word};
+  // Per-word predecode sideband, computed by the same pure function the RTL
+  // fetch path uses (riscv_pkg::imem_make_sideband; no lookahead).
+  assign i_instr_sideband = {
+    riscv_pkg::imem_make_sideband(TbNop), riscv_pkg::imem_make_sideband(tb_cur_word)
+  };
+  // bank_sel_r == pc_reg[2] => aligned: current word taken from i_instr[31:0].
+  assign i_instr_bank_sel_r = tb_bank_sel_q;
+  // Served-window tag: this fixed 1-cycle provider always presents the window
+  // for last cycle's o_pc, so the tag is exactly that registered address (the
+  // if_stage served-window guard sees a window that always covers pc_reg).
+  assign i_served_addr = tb_served_addr_q;
+  // Fixed 1-cycle provider: the fetch window is always valid.
+  assign i_instr_valid = 1'b1;
+
+  // FENCE.I cache-sync handshake completes immediately (no I-cache here; the
+  // directed programs never issue FENCE.I, so o_fence_i_sync_req stays low).
+  assign i_fence_i_sync_done = o_fence_i_sync_req;
+
+  // Cached (high-address) tier response inputs tied inactive (tier unused).
+  assign i_cached_read_data = '0;
+  assign i_cached_read_valid = 1'b0;
+  assign i_cached_write_done = 1'b0;
+  assign i_cached_write_inflight = 1'b0;
+
   // Memory addressing parameters
   localparam int unsigned MemByteAddrWidth = $clog2(MEM_SIZE_BYTES);
   localparam int unsigned MemWordAddrWidth = MemByteAddrWidth - 2;
diff --git a/linux/buildroot b/linux/buildroot
new file mode 160000
index 00000000..67449130
--- /dev/null
+++ b/linux/buildroot
@@ -0,0 +1 @@
+Subproject commit 67449130e9fdd71a38ca26539dddfa8c882b1977
diff --git a/linux/buildroot-external/Config.in b/linux/buildroot-external/Config.in
new file mode 100644
index 00000000..4de49724
--- /dev/null
+++ b/linux/buildroot-external/Config.in
@@ -0,0 +1,18 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+# The FROST external tree provides no extra target packages today; the kernel,
+# toolchain and rootfs are all selected by configs/frost_nommu_rv32_defconfig.
+# Add `source "$BR2_EXTERNAL_FROST_PATH/package/<pkg>/Config.in"` lines here if
+# FROST-specific Buildroot packages are introduced later.
diff --git a/linux/buildroot-external/README.md b/linux/buildroot-external/README.md
new file mode 100644
index 00000000..341970c8
--- /dev/null
+++ b/linux/buildroot-external/README.md
@@ -0,0 +1,157 @@
+<!--
+   Copyright 2026 Two Sigma Open Source, LLC
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+# FROST Buildroot external tree (`BR2_EXTERNAL`)
+
+Reproducibly builds the FROST **RV32 / no-MMU / M-mode Linux** kernel (6.18.7),
+a busybox initramfs, and packages them into the memory images the FROST cocotb
+`linux_boot` simulation (and the FPGA JTAG loader) consume.
+
+This is a standard Buildroot [`BR2_EXTERNAL`](https://buildroot.org/downloads/manual/manual.html#outside-br-custom)
+tree. It carries **no** Buildroot source itself — point an out-of-tree build at
+a pinned upstream Buildroot checkout (see *Buildroot pin* below).
+
+## Layout
+
+```
+linux/buildroot-external/
+├── external.desc                          # BR2_EXTERNAL manifest (name: FROST)
+├── external.mk                            # package include hook (no packages today)
+├── Config.in                              # package menu hook (empty today)
+├── configs/
+│   └── frost_nommu_rv32_defconfig         # the FROST Buildroot defconfig
+└── board/frost/
+    ├── linux-nommu-base.config            # base kernel config (from buildroot board/qemu/riscv32-virt)
+    ├── linux-nommu-frost.config.fragment  # FROST kernel CONFIG delta, merged on top of the base
+    ├── frost-nommu-fpga.dts               # reference DTB source (the packer regenerates it per build)
+    ├── build_fpga_boot.py                 # packer: Image + DTB + initramfs -> sw.{mem,txt}, sw_ddr.{mem,txt}
+    ├── post-image.sh                      # Buildroot post-image hook -> runs the packer
+    └── patches/linux/linux.hash           # sha256 for the custom linux-6.18.7 tarball
+```
+
+## Buildroot pin
+
+Buildroot is vendored as a submodule at `linux/buildroot`, pinned to the exact
+commit **`67449130`** (a `2026.08-git` snapshot). That commit provides the
+defaults this defconfig relies on: **gcc 15.2.0**, **binutils 2.45.1**, the
+internal rv32-nommu **uClibc** toolchain, and the **Linux 6.18** host-headers
+option. The pin is the exact commit rather than a release tag so the build is
+reproducible regardless of tag movement.
+
+A fresh checkout only needs the submodule initialized:
+
+```bash
+git submodule update --init linux/buildroot
+```
+
+To bump the pin, checkout the new commit in the submodule and commit the
+updated gitlink:
+
+```bash
+git -C linux/buildroot checkout <new-sha>
+git add linux/buildroot
+git commit -m "linux: bump vendored buildroot to <new-sha>"
+```
+
+> Re-verify a bump ships `BR2_GCC_VERSION_15_X` (15.2.0),
+> `BR2_BINUTILS_VERSION_2_45_X` (2.45.1) and
+> `BR2_PACKAGE_HOST_LINUX_HEADERS_CUSTOM_6_18`, which this defconfig relies on.
+
+## Build
+
+Out-of-tree build (keeps the Buildroot submodule pristine):
+
+```bash
+# from the repo root
+make -C linux/buildroot O="$(pwd)/linux/build" \
+     BR2_EXTERNAL="$(pwd)/linux/buildroot-external" frost_nommu_rv32_defconfig
+make -C linux/buildroot O="$(pwd)/linux/build"
+```
+
+First build is ~30–60 min (it builds the cross toolchain from source). Outputs
+land in `linux/build/images/`:
+
+| File | Purpose |
+|---|---|
+| `Image` | rv32 no-MMU kernel (flat, uncompressed) |
+| `rootfs.cpio.gz` | busybox initramfs |
+| `frost-nommu-fpga.dtb` | generated FROST device tree (UART/CLINT @ 0x4000_xxxx, 133.333 MHz) |
+| `sw.mem` / `sw.txt` | low-BRAM boot shim (`a0=0`, `a1=DTB`, jump to kernel) |
+| `sw_ddr.mem` / `sw_ddr.txt` | DDR image: kernel @ 0x8000_0000, DTB @ 0x8080_0000, initramfs @ 0x8081_0000 |
+
+## Feeding the cocotb `linux_boot` test
+
+`tests/test_run_cocotb.py` resolves an app's images at
+`sw/apps/<app>/sw.mem` (+ `sw_ddr.mem`). Stage the build outputs there:
+
+```bash
+mkdir -p sw/apps/linux_boot
+cp linux/build/images/sw.mem     sw/apps/linux_boot/sw.mem
+cp linux/build/images/sw_ddr.mem sw/apps/linux_boot/sw_ddr.mem
+# then, per the repo CLAUDE.md test flow:
+cd tests && make clean && ./test_run_cocotb.py linux_boot
+```
+
+Or let the app Makefile self-build straight from this tree (it runs the whole
+Buildroot build if `linux/build/images/Image` is absent, then packs for the
+board clock) -- this is what `fpga/load_software/load_software.py <board>
+linux_boot` and the
+CI `build-frost-linux` job drive:
+
+```bash
+make -C sw/apps/linux_boot            # genesys2 clock (133.33 MHz) by default
+make -C sw/apps/linux_boot FPGA_CPU_CLK_FREQ=300000000   # x3 clock
+```
+
+The `linux_boot` cocotb registry entry (`linux_boot` / `linux_boot_128k`) and
+its `build-frost-linux` + `linux-boot-cocotb` + `linux-boot-qemu` CI jobs live
+on this branch (`nommu_linux`); they reach `main` when the branch merges.
+
+## How the kernel config is assembled
+
+`BR2_LINUX_KERNEL_USE_CUSTOM_CONFIG` uses `board/frost/linux-nommu-base.config`
+as the base, and `BR2_LINUX_KERNEL_CONFIG_FRAGMENT_FILES` merges
+`board/frost/linux-nommu-frost.config.fragment` on top (kconfig
+`merge_config.sh` semantics). The fragment retargets the known-good QEMU-virt
+nommu kernel at FROST: it keeps M-mode / rv32 / no-MMU / bFLT, switches the
+rootfs to an initramfs (`BLK_DEV_INITRD` + `RD_GZIP`), and drops
+virtio / PCI / net / ext2 / PLIC. See the header of the fragment for the full,
+per-symbol rationale and the hardware caveats.
+
+## Notes, assumptions and gaps
+
+- **Rootfs reproduction.** `rootfs.cpio.gz` is reproduced from Buildroot's
+  default busybox (`busybox-minimal.config`) + `BR2_TARGET_ROOTFS_CPIO[_GZIP]`,
+  not vendored. It is functionally equivalent to the hand-made
+  `frost-artifacts/rootfs.cpio.gz` but **not** byte-identical. Add a
+  `rootfs-overlay/` + `BR2_ROOTFS_OVERLAY` here if a specific userspace is
+  required.
+- **Fragment vs. the latest hand-built Image.** This defconfig *applies* the
+  FROST fragment (per the build notes' "Option A"). The most recent artifact
+  `Image` checked on the dev box was actually built from the **stock**
+  `qemu_riscv32_nommu_virt_defconfig` *without* the fragment (it still had
+  `CONFIG_NET` / `CONFIG_VIRTIO_BLK` / `CONFIG_SIFIVE_PLIC` / `CONFIG_EXT2_FS`
+  set). Decide whether the fragment-applied kernel here is the intended target
+  (it should be — it is strictly closer to FROST and the generated DTB has no
+  PLIC/virtio nodes) or whether to drop the fragment to match that artifact
+  bit-for-bit.
+- **Boot shim toolchain.** Standalone, the packer uses the xPack
+  `riscv-none-elf-*` bare-metal toolchain (`rv32i_zicsr` / `ilp32`). In CI
+  `post-image.sh` instead uses the Buildroot-built `riscv32-*-` toolchain with
+  its own default `-march`/`-mabi` (the shim is ABI-agnostic integer code).
+- **`dtc`.** `post-image.sh` prefers `$HOST_DIR/bin/dtc`, then the kernel's
+  `scripts/dtc/dtc`, then `$PATH`. Enable `BR2_PACKAGE_HOST_DTC=y` if you want
+  to guarantee a host `dtc` independent of the kernel build.
diff --git a/linux/buildroot-external/board/frost/.gitignore b/linux/buildroot-external/board/frost/.gitignore
new file mode 100644
index 00000000..d3be67cb
--- /dev/null
+++ b/linux/buildroot-external/board/frost/.gitignore
@@ -0,0 +1,8 @@
+# Generated by build_fpga_boot.py (standalone runs default FROST_OUTDIR here).
+# The Buildroot/CI flow writes these into $BINARIES_DIR instead, not the tree.
+/frost-nommu-fpga.dts
+/frost-nommu-fpga.dtb
+/sw.mem
+/sw.txt
+/sw_ddr.mem
+/sw_ddr.txt
diff --git a/linux/buildroot-external/board/frost/build_fpga_boot.py b/linux/buildroot-external/board/frost/build_fpga_boot.py
new file mode 100755
index 00000000..064575e1
--- /dev/null
+++ b/linux/buildroot-external/board/frost/build_fpga_boot.py
@@ -0,0 +1,243 @@
+#!/usr/bin/env python3
+
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+# Vendored from frost-artifacts/build_fpga_boot.py; style carve-outs pending a refactor.
+# ruff: noqa: D103, UP031
+
+"""Build a FROST FPGA / sim no-MMU Linux boot image.
+
+Derived from frost-artifacts/build_fpga_boot.py. The packing logic (memory
+layout, word format, DTB template and boot shim) is unchanged; the only
+additions are environment overrides so the script runs both:
+
+  * standalone on a dev box (xPack riscv-none-elf toolchain, original paths), and
+  * as a Buildroot post-image hook in CI (board/frost/post-image.sh sets the
+    env to point at Buildroot's $BINARIES_DIR and its just-built toolchain).
+
+Emits BOTH forms of each image:
+  sw.{mem,txt}      low BRAM: boot shim (a0=0, a1=DTB, jr kernel entry).
+  sw_ddr.{mem,txt}  DDR (offset 0 == 0x8000_0000): kernel Image @ 0,
+                    DTB @ 0x80_0000, initramfs (cpio.gz) @ 0x81_0000.
+
+  .mem = $readmemh form (sim): "@<word-index>" directives + word values.
+  .txt = FPGA-loader form: dense, one little-endian word value per line from
+         offset 0 (file_to_bram.tcl / file_to_ddr.tcl burst it sequentially).
+Both carry identical little-endian word values.
+
+Environment overrides (all optional; defaults reproduce the standalone build):
+  FROST_IMAGE          kernel Image path        (default: ~/bigger_l0/linux-mvp/buildroot/output/images/Image)
+  FROST_INITRD         rootfs.cpio.gz path      (default: <script dir>/rootfs.cpio.gz)
+  FROST_OUTDIR         where to write outputs   (default: <script dir>)
+  FROST_CROSS_COMPILE  cross toolchain prefix   (default: riscv-none-elf-)
+  FROST_DTC            device-tree compiler     (default: dtc)
+  FROST_SHIM_MARCH     shim -march (empty=omit) (default: rv32i_zicsr)
+  FROST_SHIM_MABI      shim -mabi  (empty=omit) (default: ilp32)
+  FPGA_CPU_CLK_FREQ    timebase/uart clock Hz   (default: 133333333, genesys2)
+"""
+
+import os
+import struct
+import subprocess
+
+ART = os.path.dirname(os.path.abspath(__file__))
+IMAGE = os.environ.get(
+    "FROST_IMAGE",
+    os.path.expanduser("~/bigger_l0/linux-mvp/buildroot/output/images/Image"),
+)
+INITRD = os.environ.get("FROST_INITRD", os.path.join(ART, "rootfs.cpio.gz"))
+OUTDIR = os.environ.get("FROST_OUTDIR", ART)
+DTS = os.path.join(OUTDIR, "frost-nommu-fpga.dts")
+DTB = os.path.join(OUTDIR, "frost-nommu-fpga.dtb")
+
+CROSS = os.environ.get("FROST_CROSS_COMPILE", "riscv-none-elf-")
+GCC = CROSS + "gcc"
+OBJCOPY = CROSS + "objcopy"
+DTC = os.environ.get("FROST_DTC", "dtc")
+SHIM_MARCH = os.environ.get("FROST_SHIM_MARCH", "rv32i_zicsr")
+SHIM_MABI = os.environ.get("FROST_SHIM_MABI", "ilp32")
+
+KERNEL_ENTRY = 0x80000000
+DTB_BASE = 0x80800000  # 8 MiB: clear of the kernel image_size footprint
+INITRD_BASE = 0x80810000  # 8 MiB + 64 KiB: clear of the (small) DTB
+DTB_WORD = (DTB_BASE - KERNEL_ENTRY) // 4  # 0x200000
+INITRD_WORD = (INITRD_BASE - KERNEL_ENTRY) // 4  # 0x204000
+MEM_SIZE = 0x4000000  # 64 MiB.
+CLK = int(os.environ.get("FPGA_CPU_CLK_FREQ", "133333333"))  # genesys2 default
+
+OUT = {
+    k: os.path.join(OUTDIR, k) for k in ("sw.mem", "sw.txt", "sw_ddr.mem", "sw_ddr.txt")
+}
+
+
+def to_words(data: bytes):
+    """Bytes -> 8-hex-digit little-endian WORD VALUES (xxd -e style)."""
+    if len(data) % 4:
+        data += b"\x00" * (4 - len(data) % 4)
+    return [
+        "{:08x}".format(struct.unpack_from("<I", data, i)[0])
+        for i in range(0, len(data), 4)
+    ]
+
+
+def gen_dtb(initrd_size: int) -> bytes:
+    initrd_end = INITRD_BASE + initrd_size
+    dts = f"""/dts-v1/;
+
+/ {{
+\t#address-cells = <0x01>;
+\t#size-cells = <0x01>;
+\tcompatible = "frost,nommu-rv32", "frost";
+\tmodel = "FROST RV32 (no-MMU, M-mode Linux)";
+
+\tchosen {{
+\t\tstdout-path = "/soc/serial@40001000";
+\t\tbootargs = "earlycon=uart8250,mmio32,0x40001000 console=ttyS0 rdinit=/sbin/init";
+\t\tlinux,initrd-start = <0x{INITRD_BASE:08x}>;
+\t\tlinux,initrd-end = <0x{initrd_end:08x}>;
+\t}};
+
+\tcpus {{
+\t\t#address-cells = <0x01>;
+\t\t#size-cells = <0x00>;
+\t\ttimebase-frequency = <{CLK}>;
+
+\t\tcpu@0 {{
+\t\t\tdevice_type = "cpu";
+\t\t\treg = <0x00>;
+\t\t\tstatus = "okay";
+\t\t\tcompatible = "riscv";
+\t\t\triscv,isa-base = "rv32i";
+\t\t\triscv,isa = "rv32imafdc_zicsr_zifencei_zicntr_zba_zbb_zbs_zbkb_zicond_zihintpause";
+\t\t\triscv,isa-extensions = "i", "m", "a", "f", "d", "c",
+\t\t\t\t"zicsr", "zifencei", "zicntr",
+\t\t\t\t"zba", "zbb", "zbs", "zbkb",
+\t\t\t\t"zicond", "zihintpause";
+
+\t\t\tcpu0_intc: interrupt-controller {{
+\t\t\t\t#interrupt-cells = <0x01>;
+\t\t\t\tinterrupt-controller;
+\t\t\t\tcompatible = "riscv,cpu-intc";
+\t\t\t\tphandle = <0x01>;
+\t\t\t}};
+\t\t}};
+\t}};
+
+\tmemory@80000000 {{
+\t\tdevice_type = "memory";
+\t\treg = <0x80000000 0x{MEM_SIZE:08x}>;
+\t}};
+
+\tsoc {{
+\t\t#address-cells = <0x01>;
+\t\t#size-cells = <0x01>;
+\t\tcompatible = "simple-bus";
+\t\tranges;
+
+\t\tserial@40001000 {{
+\t\t\tcompatible = "ns16550a";
+\t\t\treg = <0x40001000 0x100>;
+\t\t\treg-shift = <0x02>;
+\t\t\treg-io-width = <0x04>;
+\t\t\tclock-frequency = <{CLK}>;
+\t\t}};
+
+\t\tclint@40010000 {{
+\t\t\tcompatible = "sifive,clint0", "riscv,clint0";
+\t\t\treg = <0x40010000 0x10000>;
+\t\t\tinterrupts-extended = <&cpu0_intc 3 &cpu0_intc 7>;
+\t\t}};
+\t}};
+}};
+"""
+    with open(DTS, "w") as f:
+        f.write(dts)
+    subprocess.run([DTC, "-I", "dts", "-O", "dtb", "-o", DTB, DTS], check=True)
+    return open(DTB, "rb").read()
+
+
+def build_shim() -> bytes:
+    src = os.path.join(OUTDIR, "frost_boot_shim.S")
+    with open(src, "w") as f:
+        f.write(
+            ".section .text\n.globl _start\n_start:\n"
+            "    li   a0, 0\n"  # boot hart id (FROST single-hart)
+            f"    li   a1, 0x{DTB_BASE:08x}\n"  # a1 = DTB physical address
+            f"    li   t0, 0x{KERNEL_ENTRY:08x}\n"  # kernel entry in DDR
+            "    jr   t0\n"
+        )
+    elf = os.path.join(OUTDIR, "shim.elf")
+    binf = os.path.join(OUTDIR, "shim.bin")
+    cmd = [GCC]
+    if SHIM_MARCH:
+        cmd.append("-march=" + SHIM_MARCH)
+    if SHIM_MABI:
+        cmd.append("-mabi=" + SHIM_MABI)
+    cmd += ["-nostdlib", "-Wl,-Ttext=0", "-o", elf, src]
+    subprocess.run(cmd, check=True)
+    subprocess.run([OBJCOPY, "-O", "binary", elf, binf], check=True)
+    return open(binf, "rb").read()
+
+
+def main():
+    img = open(IMAGE, "rb").read()
+    initrd = open(INITRD, "rb").read()
+    dtb = gen_dtb(len(initrd))
+    shim = build_shim()
+
+    iw, dw, rw = to_words(img), to_words(dtb), to_words(initrd)
+    assert (
+        len(iw) <= DTB_WORD
+    ), f"kernel Image (0x{len(iw):x} words) overruns the DTB slot 0x{DTB_WORD:x}"
+    assert DTB_WORD + len(dw) <= INITRD_WORD, "DTB overruns the initramfs slot"
+
+    sw = to_words(shim)
+    open(OUT["sw.mem"], "w").write("@00000000\n" + "\n".join(sw) + "\n")
+    open(OUT["sw.txt"], "w").write("\n".join(sw) + "\n")
+
+    with open(OUT["sw_ddr.mem"], "w") as f:
+        f.write("@00000000\n" + "\n".join(iw) + "\n")
+        f.write(f"@{DTB_WORD:08x}\n" + "\n".join(dw) + "\n")
+        f.write(f"@{INITRD_WORD:08x}\n" + "\n".join(rw) + "\n")
+    dense = (
+        iw
+        + ["00000000"] * (DTB_WORD - len(iw))
+        + dw
+        + ["00000000"] * (INITRD_WORD - DTB_WORD - len(dw))
+        + rw
+    )
+    open(OUT["sw_ddr.txt"], "w").write("\n".join(dense) + "\n")
+
+    print(
+        "kernel %d B (%d w); DTB %d B @ 0x%08x; initrd %d B @ 0x%08x (end 0x%08x)"
+        % (
+            len(img),
+            len(iw),
+            len(dtb),
+            DTB_BASE,
+            len(initrd),
+            INITRD_BASE,
+            INITRD_BASE + len(initrd),
+        )
+    )
+    print(
+        "sw_ddr.txt: %d dense words (~%.1f MB), timebase/uart-clk = %d Hz"
+        % (len(dense), len(dense) * 4 / 1e6, CLK)
+    )
+    print(f"outputs written to {OUTDIR}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/linux/buildroot-external/board/frost/busybox.config b/linux/buildroot-external/board/frost/busybox.config
new file mode 100644
index 00000000..9682adc0
--- /dev/null
+++ b/linux/buildroot-external/board/frost/busybox.config
@@ -0,0 +1,1236 @@
+#
+# Automatically generated make config: don't edit
+# Busybox version: 1.38.0
+# Tue Jun 30 23:14:13 2026
+#
+CONFIG_HAVE_DOT_CONFIG=y
+
+#
+# Settings
+#
+# CONFIG_DESKTOP is not set
+# CONFIG_EXTRA_COMPAT is not set
+# CONFIG_FEDORA_COMPAT is not set
+CONFIG_INCLUDE_SUSv2=y
+CONFIG_LONG_OPTS=y
+# CONFIG_SHOW_USAGE is not set
+# CONFIG_FEATURE_VERBOSE_USAGE is not set
+# CONFIG_FEATURE_COMPRESS_USAGE is not set
+CONFIG_LFS=y
+CONFIG_TIME64=y
+# CONFIG_PAM is not set
+CONFIG_FEATURE_DEVPTS=y
+CONFIG_FEATURE_UTMP=y
+CONFIG_FEATURE_WTMP=y
+# CONFIG_FEATURE_PIDFILE is not set
+CONFIG_PID_FILE_PATH=""
+CONFIG_BUSYBOX=y
+CONFIG_FEATURE_SHOW_SCRIPT=y
+CONFIG_FEATURE_INSTALLER=y
+CONFIG_FEATURE_VERSION=y
+# CONFIG_INSTALL_NO_USR is not set
+CONFIG_FEATURE_SUID=y
+# CONFIG_FEATURE_SUID_CONFIG is not set
+# CONFIG_FEATURE_SUID_CONFIG_QUIET is not set
+# CONFIG_FEATURE_PREFER_APPLETS is not set
+CONFIG_BUSYBOX_EXEC_PATH="/proc/self/exe"
+# CONFIG_SELINUX is not set
+# CONFIG_FEATURE_CLEAN_UP is not set
+CONFIG_FEATURE_SYSLOG_INFO=y
+CONFIG_FEATURE_SYSLOG=y
+
+#
+# Build Options
+#
+CONFIG_STATIC=y
+# CONFIG_PIE is not set
+CONFIG_NOMMU=y
+# CONFIG_BUILD_LIBBUSYBOX is not set
+# CONFIG_FEATURE_LIBBUSYBOX_STATIC is not set
+# CONFIG_FEATURE_INDIVIDUAL is not set
+# CONFIG_FEATURE_SHARED_BUSYBOX is not set
+CONFIG_CROSS_COMPILER_PREFIX=""
+CONFIG_SYSROOT=""
+CONFIG_EXTRA_CFLAGS=""
+CONFIG_EXTRA_LDFLAGS=""
+CONFIG_EXTRA_LDLIBS=""
+# CONFIG_USE_PORTABLE_CODE is not set
+# CONFIG_STACK_OPTIMIZATION_386 is not set
+CONFIG_STATIC_LIBGCC=y
+
+#
+# Installation Options ("make install" behavior)
+#
+CONFIG_INSTALL_APPLET_SYMLINKS=y
+# CONFIG_INSTALL_APPLET_HARDLINKS is not set
+# CONFIG_INSTALL_APPLET_SCRIPT_WRAPPERS is not set
+# CONFIG_INSTALL_APPLET_DONT is not set
+# CONFIG_INSTALL_SH_APPLET_SYMLINK is not set
+# CONFIG_INSTALL_SH_APPLET_HARDLINK is not set
+# CONFIG_INSTALL_SH_APPLET_SCRIPT_WRAPPER is not set
+CONFIG_PREFIX=""
+
+#
+# Debugging Options
+#
+# CONFIG_DEBUG is not set
+# CONFIG_DEBUG_PESSIMIZE is not set
+# CONFIG_DEBUG_SANITIZE is not set
+# CONFIG_UNIT_TEST is not set
+# CONFIG_WERROR is not set
+# CONFIG_WARN_SIMPLE_MSG is not set
+CONFIG_NO_DEBUG_LIB=y
+# CONFIG_DMALLOC is not set
+# CONFIG_EFENCE is not set
+
+#
+# Library Tuning
+#
+# CONFIG_FEATURE_USE_BSS_TAIL is not set
+CONFIG_FLOAT_DURATION=y
+CONFIG_FEATURE_RTMINMAX=y
+CONFIG_FEATURE_RTMINMAX_USE_LIBC_DEFINITIONS=y
+CONFIG_FEATURE_BUFFERS_USE_MALLOC=y
+# CONFIG_FEATURE_BUFFERS_GO_ON_STACK is not set
+# CONFIG_FEATURE_BUFFERS_GO_IN_BSS is not set
+CONFIG_PASSWORD_MINLEN=6
+CONFIG_MD5_SMALL=1
+CONFIG_SHA1_SMALL=3
+CONFIG_SHA1_HWACCEL=y
+CONFIG_SHA256_HWACCEL=y
+CONFIG_SHA3_SMALL=1
+CONFIG_FEATURE_NON_POSIX_CP=y
+# CONFIG_FEATURE_VERBOSE_CP_MESSAGE is not set
+CONFIG_FEATURE_USE_SENDFILE=y
+CONFIG_FEATURE_COPYBUF_KB=4
+CONFIG_MONOTONIC_SYSCALL=y
+CONFIG_IOCTL_HEX2STR_ERROR=y
+CONFIG_FEATURE_EDITING=y
+CONFIG_FEATURE_EDITING_MAX_LEN=1024
+CONFIG_FEATURE_EDITING_VI=y
+CONFIG_FEATURE_EDITING_HISTORY=999
+CONFIG_FEATURE_EDITING_SAVEHISTORY=y
+CONFIG_FEATURE_EDITING_SAVE_ON_EXIT=y
+CONFIG_FEATURE_REVERSE_SEARCH=y
+CONFIG_FEATURE_TAB_COMPLETION=y
+# CONFIG_FEATURE_USERNAME_COMPLETION is not set
+CONFIG_FEATURE_EDITING_FANCY_PROMPT=y
+CONFIG_FEATURE_EDITING_WINCH=y
+# CONFIG_FEATURE_EDITING_ASK_TERMINAL is not set
+# CONFIG_LOCALE_SUPPORT is not set
+# CONFIG_UNICODE_SUPPORT is not set
+# CONFIG_UNICODE_USING_LOCALE is not set
+# CONFIG_FEATURE_CHECK_UNICODE_IN_ENV is not set
+CONFIG_SUBST_WCHAR=0
+CONFIG_LAST_SUPPORTED_WCHAR=0
+# CONFIG_UNICODE_COMBINING_WCHARS is not set
+# CONFIG_UNICODE_WIDE_WCHARS is not set
+# CONFIG_UNICODE_BIDI_SUPPORT is not set
+# CONFIG_UNICODE_NEUTRAL_TABLE is not set
+# CONFIG_UNICODE_PRESERVE_BROKEN is not set
+# CONFIG_LOOP_CONFIGURE is not set
+# CONFIG_NO_LOOP_CONFIGURE is not set
+CONFIG_TRY_LOOP_CONFIGURE=y
+
+#
+# Applets
+#
+
+#
+# Archival Utilities
+#
+CONFIG_FEATURE_SEAMLESS_XZ=y
+# CONFIG_FEATURE_SEAMLESS_LZMA is not set
+CONFIG_FEATURE_SEAMLESS_BZ2=y
+CONFIG_FEATURE_SEAMLESS_GZ=y
+# CONFIG_FEATURE_SEAMLESS_Z is not set
+# CONFIG_AR is not set
+# CONFIG_FEATURE_AR_LONG_FILENAMES is not set
+# CONFIG_FEATURE_AR_CREATE is not set
+# CONFIG_UNCOMPRESS is not set
+CONFIG_GUNZIP=y
+CONFIG_ZCAT=y
+# CONFIG_FEATURE_GUNZIP_LONG_OPTIONS is not set
+# CONFIG_BUNZIP2 is not set
+CONFIG_BZCAT=y
+# CONFIG_UNLZMA is not set
+CONFIG_LZCAT=y
+# CONFIG_LZMA is not set
+# CONFIG_UNXZ is not set
+CONFIG_XZCAT=y
+# CONFIG_XZ is not set
+# CONFIG_BZIP2 is not set
+CONFIG_BZIP2_SMALL=0
+CONFIG_FEATURE_BZIP2_DECOMPRESS=y
+# CONFIG_CPIO is not set
+# CONFIG_FEATURE_CPIO_O is not set
+# CONFIG_FEATURE_CPIO_P is not set
+# CONFIG_FEATURE_CPIO_IGNORE_DEVNO is not set
+# CONFIG_FEATURE_CPIO_RENUMBER_INODES is not set
+# CONFIG_DPKG is not set
+# CONFIG_DPKG_DEB is not set
+CONFIG_GZIP=y
+CONFIG_FEATURE_GZIP_LONG_OPTIONS=y
+CONFIG_GZIP_FAST=0
+# CONFIG_FEATURE_GZIP_LEVELS is not set
+CONFIG_FEATURE_GZIP_DECOMPRESS=y
+# CONFIG_LZOP is not set
+CONFIG_UNLZOP=y
+CONFIG_LZOPCAT=y
+# CONFIG_LZOP_COMPR_HIGH is not set
+# CONFIG_RPM is not set
+# CONFIG_RPM2CPIO is not set
+CONFIG_TAR=y
+CONFIG_FEATURE_TAR_LONG_OPTIONS=y
+CONFIG_FEATURE_TAR_CREATE=y
+CONFIG_FEATURE_TAR_AUTODETECT=y
+CONFIG_FEATURE_TAR_FROM=y
+CONFIG_FEATURE_TAR_OLDGNU_COMPATIBILITY=y
+CONFIG_FEATURE_TAR_OLDSUN_COMPATIBILITY=y
+CONFIG_FEATURE_TAR_GNU_EXTENSIONS=y
+CONFIG_FEATURE_TAR_TO_COMMAND=y
+CONFIG_FEATURE_TAR_UNAME_GNAME=y
+CONFIG_FEATURE_TAR_NOPRESERVE_TIME=y
+# CONFIG_FEATURE_TAR_SELINUX is not set
+# CONFIG_UNZIP is not set
+# CONFIG_FEATURE_UNZIP_CDF is not set
+# CONFIG_FEATURE_UNZIP_BZIP2 is not set
+# CONFIG_FEATURE_UNZIP_LZMA is not set
+# CONFIG_FEATURE_UNZIP_XZ is not set
+# CONFIG_FEATURE_LZMA_FAST is not set
+CONFIG_FEATURE_PATH_TRAVERSAL_PROTECTION=y
+
+#
+# Coreutils
+#
+CONFIG_FEATURE_VERBOSE=y
+
+#
+# Common options for date and touch
+#
+# CONFIG_FEATURE_TIMEZONE is not set
+
+#
+# Common options for cp and mv
+#
+CONFIG_FEATURE_PRESERVE_HARDLINKS=y
+
+#
+# Common options for df, du, ls
+#
+CONFIG_FEATURE_HUMAN_READABLE=y
+CONFIG_BASENAME=y
+CONFIG_CAT=y
+CONFIG_FEATURE_CATN=y
+CONFIG_FEATURE_CATV=y
+CONFIG_CHGRP=y
+CONFIG_CHMOD=y
+CONFIG_CHOWN=y
+# CONFIG_FEATURE_CHOWN_LONG_OPTIONS is not set
+CONFIG_CHROOT=y
+CONFIG_CKSUM=y
+CONFIG_CRC32=y
+# CONFIG_COMM is not set
+CONFIG_CP=y
+# CONFIG_FEATURE_CP_LONG_OPTIONS is not set
+# CONFIG_FEATURE_CP_REFLINK is not set
+CONFIG_CUT=y
+CONFIG_FEATURE_CUT_REGEX=y
+CONFIG_DATE=y
+CONFIG_FEATURE_DATE_ISOFMT=y
+# CONFIG_FEATURE_DATE_NANO is not set
+CONFIG_FEATURE_DATE_COMPAT=y
+CONFIG_DD=y
+CONFIG_FEATURE_DD_SIGNAL_HANDLING=y
+# CONFIG_FEATURE_DD_THIRD_STATUS_LINE is not set
+CONFIG_FEATURE_DD_IBS_OBS=y
+CONFIG_FEATURE_DD_STATUS=y
+CONFIG_DF=y
+# CONFIG_FEATURE_DF_FANCY is not set
+CONFIG_FEATURE_SKIP_ROOTFS=y
+CONFIG_DIRNAME=y
+CONFIG_DOS2UNIX=y
+CONFIG_UNIX2DOS=y
+CONFIG_DU=y
+CONFIG_FEATURE_DU_DEFAULT_BLOCKSIZE_1K=y
+CONFIG_ECHO=y
+CONFIG_FEATURE_FANCY_ECHO=y
+CONFIG_ENV=y
+# CONFIG_EXPAND is not set
+# CONFIG_UNEXPAND is not set
+CONFIG_EXPR=y
+CONFIG_EXPR_MATH_SUPPORT_64=y
+CONFIG_FACTOR=y
+CONFIG_FALSE=y
+CONFIG_FOLD=y
+CONFIG_HEAD=y
+CONFIG_FEATURE_FANCY_HEAD=y
+CONFIG_HOSTID=y
+CONFIG_ID=y
+# CONFIG_GROUPS is not set
+CONFIG_INSTALL=y
+CONFIG_FEATURE_INSTALL_LONG_OPTIONS=y
+CONFIG_LINK=y
+CONFIG_LN=y
+CONFIG_LOGNAME=y
+CONFIG_LS=y
+CONFIG_FEATURE_LS_FILETYPES=y
+CONFIG_FEATURE_LS_FOLLOWLINKS=y
+CONFIG_FEATURE_LS_RECURSIVE=y
+CONFIG_FEATURE_LS_WIDTH=y
+CONFIG_FEATURE_LS_SORTFILES=y
+CONFIG_FEATURE_LS_TIMESTAMPS=y
+CONFIG_FEATURE_LS_USERNAME=y
+CONFIG_FEATURE_LS_COLOR=y
+CONFIG_FEATURE_LS_COLOR_IS_DEFAULT=y
+CONFIG_MD5SUM=y
+CONFIG_SHA1SUM=y
+CONFIG_SHA256SUM=y
+CONFIG_SHA384SUM=y
+CONFIG_SHA512SUM=y
+CONFIG_SHA3SUM=y
+
+#
+# Common options for md5sum, sha1sum, sha256sum, ..., sha3sum
+#
+CONFIG_FEATURE_MD5_SHA1_SUM_CHECK=y
+CONFIG_MKDIR=y
+CONFIG_MKFIFO=y
+CONFIG_MKNOD=y
+# CONFIG_MKTEMP is not set
+CONFIG_MV=y
+CONFIG_NICE=y
+CONFIG_NL=y
+CONFIG_NOHUP=y
+CONFIG_NPROC=y
+CONFIG_OD=y
+CONFIG_PASTE=y
+CONFIG_PRINTENV=y
+CONFIG_PRINTF=y
+CONFIG_PWD=y
+CONFIG_READLINK=y
+CONFIG_FEATURE_READLINK_FOLLOW=y
+CONFIG_REALPATH=y
+CONFIG_RM=y
+CONFIG_RMDIR=y
+CONFIG_SEQ=y
+CONFIG_SHRED=y
+# CONFIG_SHUF is not set
+CONFIG_SLEEP=y
+CONFIG_FEATURE_FANCY_SLEEP=y
+CONFIG_SORT=y
+CONFIG_FEATURE_SORT_BIG=y
+# CONFIG_FEATURE_SORT_OPTIMIZE_MEMORY is not set
+# CONFIG_SPLIT is not set
+# CONFIG_FEATURE_SPLIT_FANCY is not set
+# CONFIG_STAT is not set
+# CONFIG_FEATURE_STAT_FORMAT is not set
+# CONFIG_FEATURE_STAT_FILESYSTEM is not set
+CONFIG_STTY=y
+# CONFIG_SUM is not set
+CONFIG_SYNC=y
+# CONFIG_FEATURE_SYNC_FANCY is not set
+# CONFIG_FSYNC is not set
+# CONFIG_TAC is not set
+CONFIG_TAIL=y
+CONFIG_FEATURE_FANCY_TAIL=y
+CONFIG_TEE=y
+CONFIG_FEATURE_TEE_USE_BLOCK_IO=y
+CONFIG_TEST=y
+CONFIG_TEST1=y
+CONFIG_TEST2=y
+CONFIG_FEATURE_TEST_64=y
+# CONFIG_TIMEOUT is not set
+CONFIG_TOUCH=y
+CONFIG_FEATURE_TOUCH_SUSV3=y
+CONFIG_TR=y
+CONFIG_FEATURE_TR_CLASSES=y
+CONFIG_FEATURE_TR_EQUIV=y
+CONFIG_TRUE=y
+CONFIG_TRUNCATE=y
+CONFIG_TSORT=y
+CONFIG_TTY=y
+CONFIG_UNAME=y
+CONFIG_UNAME_OSNAME="GNU/Linux"
+CONFIG_BB_ARCH=y
+CONFIG_UNIQ=y
+CONFIG_UNLINK=y
+CONFIG_USLEEP=y
+CONFIG_UUDECODE=y
+CONFIG_BASE32=y
+# CONFIG_BASE64 is not set
+CONFIG_UUENCODE=y
+CONFIG_WC=y
+# CONFIG_FEATURE_WC_LARGE is not set
+CONFIG_WHO=y
+CONFIG_W=y
+# CONFIG_USERS is not set
+CONFIG_WHOAMI=y
+CONFIG_YES=y
+
+#
+# Console Utilities
+#
+# CONFIG_CHVT is not set
+# CONFIG_CLEAR is not set
+# CONFIG_DEALLOCVT is not set
+# CONFIG_DUMPKMAP is not set
+# CONFIG_FGCONSOLE is not set
+# CONFIG_KBD_MODE is not set
+# CONFIG_LOADFONT is not set
+# CONFIG_SETFONT is not set
+# CONFIG_FEATURE_SETFONT_TEXTUAL_MAP is not set
+CONFIG_DEFAULT_SETFONT_DIR=""
+# CONFIG_FEATURE_LOADFONT_PSF2 is not set
+# CONFIG_FEATURE_LOADFONT_RAW is not set
+# CONFIG_LOADKMAP is not set
+# CONFIG_OPENVT is not set
+# CONFIG_RESET is not set
+# CONFIG_RESIZE is not set
+# CONFIG_FEATURE_RESIZE_PRINT is not set
+# CONFIG_SETCONSOLE is not set
+# CONFIG_FEATURE_SETCONSOLE_LONG_OPTIONS is not set
+# CONFIG_SETKEYCODES is not set
+# CONFIG_SETLOGCONS is not set
+# CONFIG_SHOWKEY is not set
+
+#
+# Debian Utilities
+#
+# CONFIG_PIPE_PROGRESS is not set
+CONFIG_RUN_PARTS=y
+CONFIG_FEATURE_RUN_PARTS_LONG_OPTIONS=y
+CONFIG_FEATURE_RUN_PARTS_FANCY=y
+CONFIG_START_STOP_DAEMON=y
+CONFIG_FEATURE_START_STOP_DAEMON_LONG_OPTIONS=y
+CONFIG_FEATURE_START_STOP_DAEMON_FANCY=y
+CONFIG_WHICH=y
+
+#
+# klibc-utils
+#
+# CONFIG_MINIPS is not set
+CONFIG_NUKE=y
+CONFIG_RESUME=y
+CONFIG_RUN_INIT=y
+
+#
+# Editors
+#
+CONFIG_AWK=y
+CONFIG_FEATURE_AWK_LIBM=y
+CONFIG_FEATURE_AWK_GNU_EXTENSIONS=y
+# CONFIG_CMP is not set
+# CONFIG_DIFF is not set
+# CONFIG_FEATURE_DIFF_LONG_OPTIONS is not set
+# CONFIG_FEATURE_DIFF_DIR is not set
+# CONFIG_ED is not set
+# CONFIG_PATCH is not set
+CONFIG_SED=y
+CONFIG_VI=y
+CONFIG_FEATURE_VI_MAX_LEN=4096
+CONFIG_FEATURE_VI_8BIT=y
+CONFIG_FEATURE_VI_COLON=y
+CONFIG_FEATURE_VI_COLON_EXPAND=y
+CONFIG_FEATURE_VI_YANKMARK=y
+CONFIG_FEATURE_VI_SEARCH=y
+CONFIG_FEATURE_VI_REGEX_SEARCH=y
+CONFIG_FEATURE_VI_USE_SIGNALS=y
+CONFIG_FEATURE_VI_DOT_CMD=y
+CONFIG_FEATURE_VI_READONLY=y
+CONFIG_FEATURE_VI_SETOPTS=y
+CONFIG_FEATURE_VI_SET=y
+CONFIG_FEATURE_VI_WIN_RESIZE=y
+CONFIG_FEATURE_VI_ASK_TERMINAL=y
+CONFIG_FEATURE_VI_UNDO=y
+CONFIG_FEATURE_VI_UNDO_QUEUE=y
+CONFIG_FEATURE_VI_UNDO_QUEUE_MAX=256
+CONFIG_FEATURE_VI_VERBOSE_STATUS=y
+# CONFIG_FEATURE_ALLOW_EXEC is not set
+
+#
+# Finding Utilities
+#
+CONFIG_FIND=y
+CONFIG_FEATURE_FIND_PRINT0=y
+CONFIG_FEATURE_FIND_MTIME=y
+CONFIG_FEATURE_FIND_ATIME=y
+CONFIG_FEATURE_FIND_CTIME=y
+CONFIG_FEATURE_FIND_MMIN=y
+CONFIG_FEATURE_FIND_AMIN=y
+CONFIG_FEATURE_FIND_CMIN=y
+CONFIG_FEATURE_FIND_PERM=y
+CONFIG_FEATURE_FIND_TYPE=y
+CONFIG_FEATURE_FIND_EXECUTABLE=y
+CONFIG_FEATURE_FIND_XDEV=y
+CONFIG_FEATURE_FIND_MAXDEPTH=y
+CONFIG_FEATURE_FIND_NEWER=y
+CONFIG_FEATURE_FIND_INUM=y
+CONFIG_FEATURE_FIND_SAMEFILE=y
+CONFIG_FEATURE_FIND_EXEC=y
+CONFIG_FEATURE_FIND_EXEC_PLUS=y
+CONFIG_FEATURE_FIND_EXEC_OK=y
+CONFIG_FEATURE_FIND_USER=y
+CONFIG_FEATURE_FIND_GROUP=y
+CONFIG_FEATURE_FIND_NOT=y
+CONFIG_FEATURE_FIND_DEPTH=y
+CONFIG_FEATURE_FIND_PAREN=y
+CONFIG_FEATURE_FIND_SIZE=y
+CONFIG_FEATURE_FIND_PRUNE=y
+CONFIG_FEATURE_FIND_QUIT=y
+CONFIG_FEATURE_FIND_DELETE=y
+CONFIG_FEATURE_FIND_EMPTY=y
+CONFIG_FEATURE_FIND_PATH=y
+CONFIG_FEATURE_FIND_REGEX=y
+# CONFIG_FEATURE_FIND_CONTEXT is not set
+CONFIG_FEATURE_FIND_LINKS=y
+CONFIG_GREP=y
+CONFIG_EGREP=y
+CONFIG_FGREP=y
+CONFIG_FEATURE_GREP_CONTEXT=y
+CONFIG_XARGS=y
+# CONFIG_FEATURE_XARGS_SUPPORT_CONFIRMATION is not set
+CONFIG_FEATURE_XARGS_SUPPORT_QUOTES=y
+CONFIG_FEATURE_XARGS_SUPPORT_TERMOPT=y
+CONFIG_FEATURE_XARGS_SUPPORT_ZERO_TERM=y
+CONFIG_FEATURE_XARGS_SUPPORT_REPL_STR=y
+CONFIG_FEATURE_XARGS_SUPPORT_PARALLEL=y
+CONFIG_FEATURE_XARGS_SUPPORT_ARGS_FILE=y
+
+#
+# Init Utilities
+#
+# CONFIG_BOOTCHARTD is not set
+# CONFIG_FEATURE_BOOTCHARTD_BLOATED_HEADER is not set
+# CONFIG_FEATURE_BOOTCHARTD_CONFIG_FILE is not set
+CONFIG_HALT=y
+CONFIG_POWEROFF=y
+CONFIG_REBOOT=y
+CONFIG_FEATURE_WAIT_FOR_INIT=y
+# CONFIG_FEATURE_CALL_TELINIT is not set
+CONFIG_TELINIT_PATH=""
+CONFIG_INIT=y
+CONFIG_LINUXRC=y
+CONFIG_FEATURE_USE_INITTAB=y
+CONFIG_FEATURE_KILL_REMOVED=y
+CONFIG_FEATURE_KILL_DELAY=0
+CONFIG_FEATURE_INIT_SCTTY=y
+CONFIG_FEATURE_INIT_SYSLOG=y
+CONFIG_FEATURE_INIT_QUIET=y
+# CONFIG_FEATURE_INIT_COREDUMPS is not set
+CONFIG_INIT_TERMINAL_TYPE="linux"
+CONFIG_FEATURE_INIT_MODIFY_CMDLINE=y
+
+#
+# Login/Password Management Utilities
+#
+CONFIG_FEATURE_SHADOWPASSWDS=y
+# CONFIG_USE_BB_PWD_GRP is not set
+# CONFIG_USE_BB_SHADOW is not set
+CONFIG_USE_BB_CRYPT=y
+CONFIG_USE_BB_CRYPT_SHA=y
+CONFIG_USE_BB_CRYPT_YES=y
+# CONFIG_ADD_SHELL is not set
+# CONFIG_REMOVE_SHELL is not set
+# CONFIG_ADDGROUP is not set
+# CONFIG_FEATURE_ADDUSER_TO_GROUP is not set
+CONFIG_ADDUSER=y
+# CONFIG_FEATURE_CHECK_NAMES is not set
+CONFIG_LAST_ID=60000
+CONFIG_FIRST_SYSTEM_ID=100
+CONFIG_LAST_SYSTEM_ID=999
+# CONFIG_CHPASSWD is not set
+CONFIG_FEATURE_DEFAULT_PASSWD_ALGO="sha256"
+# CONFIG_CRYPTPW is not set
+# CONFIG_MKPASSWD is not set
+# CONFIG_DELUSER is not set
+# CONFIG_DELGROUP is not set
+# CONFIG_FEATURE_DEL_USER_FROM_GROUP is not set
+CONFIG_GETTY=y
+CONFIG_LOGIN=y
+# CONFIG_LOGIN_SESSION_AS_CHILD is not set
+# CONFIG_LOGIN_SCRIPTS is not set
+CONFIG_FEATURE_NOLOGIN=y
+CONFIG_FEATURE_SECURETTY=y
+CONFIG_PASSWD=y
+CONFIG_FEATURE_PASSWD_WEAK_CHECK=y
+# CONFIG_SU is not set
+# CONFIG_FEATURE_SU_SYSLOG is not set
+# CONFIG_FEATURE_SU_CHECKS_SHELLS is not set
+# CONFIG_FEATURE_SU_BLANK_PW_NEEDS_SECURE_TTY is not set
+# CONFIG_SULOGIN is not set
+# CONFIG_VLOCK is not set
+
+#
+# Linux Ext2 FS Progs
+#
+# CONFIG_CHATTR is not set
+# CONFIG_FSCK is not set
+# CONFIG_LSATTR is not set
+# CONFIG_TUNE2FS is not set
+
+#
+# Linux Module Utilities
+#
+# CONFIG_MODPROBE_SMALL is not set
+# CONFIG_DEPMOD is not set
+# CONFIG_INSMOD is not set
+# CONFIG_LSMOD is not set
+# CONFIG_FEATURE_LSMOD_PRETTY_2_6_OUTPUT is not set
+# CONFIG_MODINFO is not set
+# CONFIG_MODPROBE is not set
+# CONFIG_FEATURE_MODPROBE_BLACKLIST is not set
+# CONFIG_RMMOD is not set
+
+#
+# Options common to multiple modutils
+#
+# CONFIG_FEATURE_CMDLINE_MODULE_OPTIONS is not set
+# CONFIG_FEATURE_MODPROBE_SMALL_CHECK_ALREADY_LOADED is not set
+# CONFIG_FEATURE_2_4_MODULES is not set
+# CONFIG_FEATURE_INSMOD_VERSION_CHECKING is not set
+# CONFIG_FEATURE_INSMOD_KSYMOOPS_SYMBOLS is not set
+# CONFIG_FEATURE_INSMOD_LOADINKMEM is not set
+# CONFIG_FEATURE_INSMOD_LOAD_MAP is not set
+# CONFIG_FEATURE_INSMOD_LOAD_MAP_FULL is not set
+# CONFIG_FEATURE_CHECK_TAINTED_MODULE is not set
+# CONFIG_FEATURE_INSMOD_TRY_MMAP is not set
+# CONFIG_FEATURE_MODUTILS_ALIAS is not set
+# CONFIG_FEATURE_MODUTILS_SYMBOLS is not set
+CONFIG_DEFAULT_MODULES_DIR=""
+CONFIG_DEFAULT_DEPMOD_FILE=""
+
+#
+# Linux System Utilities
+#
+# CONFIG_ACPID is not set
+# CONFIG_FEATURE_ACPID_COMPAT is not set
+# CONFIG_BLKDISCARD is not set
+# CONFIG_BLKID is not set
+# CONFIG_FEATURE_BLKID_TYPE is not set
+# CONFIG_BLOCKDEV is not set
+# CONFIG_CAL is not set
+# CONFIG_CHRT is not set
+CONFIG_DMESG=y
+CONFIG_FEATURE_DMESG_PRETTY=y
+# CONFIG_EJECT is not set
+# CONFIG_FEATURE_EJECT_SCSI is not set
+CONFIG_FALLOCATE=y
+# CONFIG_FATATTR is not set
+# CONFIG_FBSET is not set
+# CONFIG_FEATURE_FBSET_FANCY is not set
+# CONFIG_FEATURE_FBSET_READMODE is not set
+# CONFIG_FDFORMAT is not set
+# CONFIG_FDISK is not set
+# CONFIG_FDISK_SUPPORT_LARGE_DISKS is not set
+# CONFIG_FEATURE_FDISK_BLKSIZE is not set
+# CONFIG_FEATURE_FDISK_WRITABLE is not set
+# CONFIG_FEATURE_AIX_LABEL is not set
+# CONFIG_FEATURE_SGI_LABEL is not set
+# CONFIG_FEATURE_SUN_LABEL is not set
+# CONFIG_FEATURE_OSF_LABEL is not set
+# CONFIG_FEATURE_GPT_LABEL is not set
+# CONFIG_FEATURE_FDISK_ADVANCED is not set
+# CONFIG_FINDFS is not set
+# CONFIG_FLOCK is not set
+# CONFIG_FDFLUSH is not set
+CONFIG_FREERAMDISK=y
+# CONFIG_FSCK_MINIX is not set
+CONFIG_FSFREEZE=y
+# CONFIG_FSTRIM is not set
+CONFIG_GETOPT=y
+CONFIG_FEATURE_GETOPT_LONG=y
+# CONFIG_HEXDUMP is not set
+# CONFIG_HD is not set
+CONFIG_XXD=y
+# CONFIG_HWCLOCK is not set
+# CONFIG_FEATURE_HWCLOCK_ADJTIME_FHS is not set
+# CONFIG_IONICE is not set
+# CONFIG_IPCRM is not set
+# CONFIG_IPCS is not set
+# CONFIG_LAST is not set
+# CONFIG_FEATURE_LAST_FANCY is not set
+# CONFIG_LOSETUP is not set
+CONFIG_LSBLK=y
+# CONFIG_LSPCI is not set
+# CONFIG_LSUSB is not set
+# CONFIG_MDEV is not set
+# CONFIG_FEATURE_MDEV_CONF is not set
+# CONFIG_FEATURE_MDEV_RENAME is not set
+# CONFIG_FEATURE_MDEV_RENAME_REGEXP is not set
+# CONFIG_FEATURE_MDEV_EXEC is not set
+# CONFIG_FEATURE_MDEV_LOAD_FIRMWARE is not set
+# CONFIG_FEATURE_MDEV_DAEMON is not set
+CONFIG_MESG=y
+CONFIG_FEATURE_MESG_ENABLE_ONLY_GROUP=y
+CONFIG_MKE2FS=y
+# CONFIG_MKFS_EXT2 is not set
+# CONFIG_MKFS_MINIX is not set
+# CONFIG_FEATURE_MINIX2 is not set
+# CONFIG_MKFS_REISER is not set
+CONFIG_MKDOSFS=y
+# CONFIG_MKFS_VFAT is not set
+# CONFIG_MKSWAP is not set
+# CONFIG_FEATURE_MKSWAP_UUID is not set
+CONFIG_MORE=y
+CONFIG_MOUNT=y
+# CONFIG_FEATURE_MOUNT_FAKE is not set
+# CONFIG_FEATURE_MOUNT_VERBOSE is not set
+# CONFIG_FEATURE_MOUNT_HELPERS is not set
+# CONFIG_FEATURE_MOUNT_LABEL is not set
+# CONFIG_FEATURE_MOUNT_NFS is not set
+CONFIG_FEATURE_MOUNT_CIFS=y
+CONFIG_FEATURE_MOUNT_FLAGS=y
+CONFIG_FEATURE_MOUNT_FSTAB=y
+CONFIG_FEATURE_MOUNT_OTHERTAB=y
+# CONFIG_MOUNTPOINT is not set
+CONFIG_NOLOGIN=y
+# CONFIG_NOLOGIN_DEPENDENCIES is not set
+# CONFIG_NSENTER is not set
+# CONFIG_PIVOT_ROOT is not set
+# CONFIG_RDATE is not set
+# CONFIG_RDEV is not set
+# CONFIG_READPROFILE is not set
+# CONFIG_RENICE is not set
+# CONFIG_REV is not set
+# CONFIG_RTCWAKE is not set
+# CONFIG_SCRIPT is not set
+# CONFIG_SCRIPTREPLAY is not set
+# CONFIG_SETARCH is not set
+CONFIG_LINUX32=y
+CONFIG_LINUX64=y
+CONFIG_SETPRIV=y
+CONFIG_FEATURE_SETPRIV_DUMP=y
+CONFIG_FEATURE_SETPRIV_CAPABILITIES=y
+CONFIG_FEATURE_SETPRIV_CAPABILITY_NAMES=y
+# CONFIG_SETSID is not set
+# CONFIG_SWAPON is not set
+# CONFIG_FEATURE_SWAPON_DISCARD is not set
+# CONFIG_FEATURE_SWAPON_PRI is not set
+# CONFIG_SWAPOFF is not set
+# CONFIG_FEATURE_SWAPONOFF_LABEL is not set
+# CONFIG_SWITCH_ROOT is not set
+# CONFIG_TASKSET is not set
+# CONFIG_FEATURE_TASKSET_FANCY is not set
+# CONFIG_FEATURE_TASKSET_CPULIST is not set
+# CONFIG_UEVENT is not set
+CONFIG_UMOUNT=y
+CONFIG_FEATURE_UMOUNT_ALL=y
+# CONFIG_UNSHARE is not set
+CONFIG_UUIDGEN=y
+# CONFIG_WALL is not set
+
+#
+# Common options for mount/umount
+#
+CONFIG_FEATURE_MOUNT_LOOP=y
+CONFIG_FEATURE_MOUNT_LOOP_CREATE=y
+# CONFIG_FEATURE_MTAB_SUPPORT is not set
+# CONFIG_VOLUMEID is not set
+# CONFIG_FEATURE_VOLUMEID_BCACHE is not set
+# CONFIG_FEATURE_VOLUMEID_BTRFS is not set
+# CONFIG_FEATURE_VOLUMEID_CRAMFS is not set
+# CONFIG_FEATURE_VOLUMEID_EROFS is not set
+# CONFIG_FEATURE_VOLUMEID_EXFAT is not set
+# CONFIG_FEATURE_VOLUMEID_EXT is not set
+# CONFIG_FEATURE_VOLUMEID_F2FS is not set
+# CONFIG_FEATURE_VOLUMEID_FAT is not set
+# CONFIG_FEATURE_VOLUMEID_HFS is not set
+# CONFIG_FEATURE_VOLUMEID_ISO9660 is not set
+# CONFIG_FEATURE_VOLUMEID_JFS is not set
+# CONFIG_FEATURE_VOLUMEID_LFS is not set
+# CONFIG_FEATURE_VOLUMEID_LINUXRAID is not set
+# CONFIG_FEATURE_VOLUMEID_LINUXSWAP is not set
+# CONFIG_FEATURE_VOLUMEID_LUKS is not set
+# CONFIG_FEATURE_VOLUMEID_MINIX is not set
+# CONFIG_FEATURE_VOLUMEID_NILFS is not set
+# CONFIG_FEATURE_VOLUMEID_NTFS is not set
+# CONFIG_FEATURE_VOLUMEID_OCFS2 is not set
+# CONFIG_FEATURE_VOLUMEID_REISERFS is not set
+# CONFIG_FEATURE_VOLUMEID_ROMFS is not set
+# CONFIG_FEATURE_VOLUMEID_SQUASHFS is not set
+# CONFIG_FEATURE_VOLUMEID_SYSV is not set
+# CONFIG_FEATURE_VOLUMEID_UBIFS is not set
+# CONFIG_FEATURE_VOLUMEID_UDF is not set
+# CONFIG_FEATURE_VOLUMEID_XFS is not set
+
+#
+# Miscellaneous Utilities
+#
+# CONFIG_ADJTIMEX is not set
+CONFIG_ASCII=y
+# CONFIG_BBCONFIG is not set
+# CONFIG_FEATURE_COMPRESS_BBCONFIG is not set
+CONFIG_BC=y
+# CONFIG_DC is not set
+CONFIG_FEATURE_DC_BIG=y
+# CONFIG_FEATURE_DC_LIBM is not set
+CONFIG_FEATURE_BC_INTERACTIVE=y
+CONFIG_FEATURE_BC_LONG_OPTIONS=y
+# CONFIG_BEEP is not set
+CONFIG_FEATURE_BEEP_FREQ=0
+CONFIG_FEATURE_BEEP_LENGTH_MS=0
+# CONFIG_CHAT is not set
+# CONFIG_FEATURE_CHAT_NOFAIL is not set
+# CONFIG_FEATURE_CHAT_TTY_HIFI is not set
+# CONFIG_FEATURE_CHAT_IMPLICIT_CR is not set
+# CONFIG_FEATURE_CHAT_SWALLOW_OPTS is not set
+# CONFIG_FEATURE_CHAT_SEND_ESCAPES is not set
+# CONFIG_FEATURE_CHAT_VAR_ABORT_LEN is not set
+# CONFIG_FEATURE_CHAT_CLR_ABORT is not set
+# CONFIG_CONSPY is not set
+# CONFIG_CROND is not set
+# CONFIG_FEATURE_CROND_D is not set
+# CONFIG_FEATURE_CROND_CALL_SENDMAIL is not set
+# CONFIG_FEATURE_CROND_SPECIAL_TIMES is not set
+CONFIG_FEATURE_CROND_DIR=""
+# CONFIG_CRONTAB is not set
+# CONFIG_DEVFSD is not set
+# CONFIG_DEVFSD_MODLOAD is not set
+# CONFIG_DEVFSD_FG_NP is not set
+# CONFIG_DEVFSD_VERBOSE is not set
+# CONFIG_FEATURE_DEVFS is not set
+CONFIG_DEVMEM=y
+# CONFIG_FBSPLASH is not set
+# CONFIG_FLASH_ERASEALL is not set
+# CONFIG_FLASH_LOCK is not set
+# CONFIG_FLASH_UNLOCK is not set
+# CONFIG_FLASHCP is not set
+CONFIG_GETFATTR=y
+# CONFIG_HDPARM is not set
+# CONFIG_FEATURE_HDPARM_GET_IDENTITY is not set
+# CONFIG_FEATURE_HDPARM_HDIO_SCAN_HWIF is not set
+# CONFIG_FEATURE_HDPARM_HDIO_UNREGISTER_HWIF is not set
+# CONFIG_FEATURE_HDPARM_HDIO_DRIVE_RESET is not set
+# CONFIG_FEATURE_HDPARM_HDIO_TRISTATE_HWIF is not set
+# CONFIG_FEATURE_HDPARM_HDIO_GETSET_DMA is not set
+CONFIG_HEXEDIT=y
+# CONFIG_I2CGET is not set
+# CONFIG_I2CSET is not set
+# CONFIG_I2CDUMP is not set
+# CONFIG_I2CDETECT is not set
+CONFIG_I2CTRANSFER=y
+# CONFIG_INOTIFYD is not set
+CONFIG_LESS=y
+CONFIG_FEATURE_LESS_MAXLINES=0
+CONFIG_FEATURE_LESS_BRACKETS=y
+CONFIG_FEATURE_LESS_FLAGS=y
+CONFIG_FEATURE_LESS_TRUNCATE=y
+CONFIG_FEATURE_LESS_MARKS=y
+CONFIG_FEATURE_LESS_REGEXP=y
+CONFIG_FEATURE_LESS_WINCH=y
+CONFIG_FEATURE_LESS_ASK_TERMINAL=y
+CONFIG_FEATURE_LESS_DASHCMD=y
+CONFIG_FEATURE_LESS_LINENUMS=y
+CONFIG_FEATURE_LESS_RAW=y
+CONFIG_FEATURE_LESS_ENV=y
+CONFIG_LSSCSI=y
+# CONFIG_MAKEDEVS is not set
+# CONFIG_FEATURE_MAKEDEVS_LEAF is not set
+# CONFIG_FEATURE_MAKEDEVS_TABLE is not set
+# CONFIG_MAN is not set
+# CONFIG_MICROCOM is not set
+CONFIG_MIM=y
+# CONFIG_MT is not set
+# CONFIG_NANDWRITE is not set
+# CONFIG_NANDDUMP is not set
+CONFIG_PARTPROBE=y
+# CONFIG_RAIDAUTORUN is not set
+# CONFIG_READAHEAD is not set
+# CONFIG_RFKILL is not set
+# CONFIG_RUNLEVEL is not set
+# CONFIG_RX is not set
+CONFIG_SEEDRNG=y
+CONFIG_SETFATTR=y
+CONFIG_SETSERIAL=y
+# CONFIG_STRINGS is not set
+# CONFIG_TIME is not set
+CONFIG_TREE=y
+CONFIG_TS=y
+# CONFIG_TTYSIZE is not set
+# CONFIG_UBIATTACH is not set
+# CONFIG_UBIDETACH is not set
+# CONFIG_UBIMKVOL is not set
+# CONFIG_UBIRMVOL is not set
+# CONFIG_UBIRSVOL is not set
+# CONFIG_UBIUPDATEVOL is not set
+# CONFIG_UBIRENAME is not set
+# CONFIG_VOLNAME is not set
+# CONFIG_WATCHDOG is not set
+# CONFIG_FEATURE_WATCHDOG_OPEN_TWICE is not set
+
+#
+# Networking Utilities
+#
+# CONFIG_FEATURE_IPV6 is not set
+# CONFIG_FEATURE_UNIX_LOCAL is not set
+# CONFIG_FEATURE_PREFER_IPV4_ADDRESS is not set
+# CONFIG_VERBOSE_RESOLUTION_ERRORS is not set
+# CONFIG_FEATURE_ETC_NETWORKS is not set
+# CONFIG_FEATURE_ETC_SERVICES is not set
+CONFIG_FEATURE_HWIB=y
+# CONFIG_FEATURE_TLS_SHA1 is not set
+# CONFIG_ARP is not set
+# CONFIG_ARPING is not set
+# CONFIG_BRCTL is not set
+# CONFIG_FEATURE_BRCTL_FANCY is not set
+# CONFIG_FEATURE_BRCTL_SHOW is not set
+# CONFIG_DNSD is not set
+# CONFIG_ETHER_WAKE is not set
+# CONFIG_FTPD is not set
+# CONFIG_FEATURE_FTPD_WRITE is not set
+# CONFIG_FEATURE_FTPD_ACCEPT_BROKEN_LIST is not set
+# CONFIG_FEATURE_FTPD_AUTHENTICATION is not set
+# CONFIG_FTPGET is not set
+# CONFIG_FTPPUT is not set
+# CONFIG_FEATURE_FTPGETPUT_LONG_OPTIONS is not set
+CONFIG_HOSTNAME=y
+CONFIG_DNSDOMAINNAME=y
+# CONFIG_HTTPD is not set
+CONFIG_FEATURE_HTTPD_PORT_DEFAULT=0
+# CONFIG_FEATURE_HTTPD_RANGES is not set
+# CONFIG_FEATURE_HTTPD_SETUID is not set
+# CONFIG_FEATURE_HTTPD_BASIC_AUTH is not set
+# CONFIG_FEATURE_HTTPD_AUTH_MD5 is not set
+# CONFIG_FEATURE_HTTPD_CGI is not set
+# CONFIG_FEATURE_HTTPD_CONFIG_WITH_SCRIPT_INTERPR is not set
+# CONFIG_FEATURE_HTTPD_SET_REMOTE_PORT_TO_ENV is not set
+# CONFIG_FEATURE_HTTPD_ENCODE_URL_STR is not set
+# CONFIG_FEATURE_HTTPD_ERROR_PAGES is not set
+# CONFIG_FEATURE_HTTPD_PROXY is not set
+# CONFIG_FEATURE_HTTPD_GZIP is not set
+# CONFIG_FEATURE_HTTPD_ETAG is not set
+# CONFIG_FEATURE_HTTPD_LAST_MODIFIED is not set
+# CONFIG_FEATURE_HTTPD_DATE is not set
+# CONFIG_FEATURE_HTTPD_ACL_IP is not set
+CONFIG_IFCONFIG=y
+CONFIG_FEATURE_IFCONFIG_STATUS=y
+# CONFIG_FEATURE_IFCONFIG_SLIP is not set
+# CONFIG_FEATURE_IFCONFIG_MEMSTART_IOADDR_IRQ is not set
+CONFIG_FEATURE_IFCONFIG_HW=y
+CONFIG_FEATURE_IFCONFIG_BROADCAST_PLUS=y
+# CONFIG_IFENSLAVE is not set
+# CONFIG_IFPLUGD is not set
+CONFIG_IFUP=y
+CONFIG_IFDOWN=y
+CONFIG_IFUPDOWN_IFSTATE_PATH="/var/run/ifstate"
+CONFIG_FEATURE_IFUPDOWN_IP=y
+CONFIG_FEATURE_IFUPDOWN_IPV4=y
+# CONFIG_FEATURE_IFUPDOWN_IPV6 is not set
+# CONFIG_FEATURE_IFUPDOWN_MAPPING is not set
+CONFIG_FEATURE_IFUPDOWN_EXTERNAL_DHCP=y
+# CONFIG_INETD is not set
+# CONFIG_FEATURE_INETD_SUPPORT_BUILTIN_ECHO is not set
+# CONFIG_FEATURE_INETD_SUPPORT_BUILTIN_DISCARD is not set
+# CONFIG_FEATURE_INETD_SUPPORT_BUILTIN_TIME is not set
+# CONFIG_FEATURE_INETD_SUPPORT_BUILTIN_DAYTIME is not set
+# CONFIG_FEATURE_INETD_SUPPORT_BUILTIN_CHARGEN is not set
+# CONFIG_FEATURE_INETD_RPC is not set
+CONFIG_IP=y
+CONFIG_IPADDR=y
+CONFIG_IPLINK=y
+CONFIG_IPROUTE=y
+# CONFIG_IPTUNNEL is not set
+# CONFIG_IPRULE is not set
+# CONFIG_IPNEIGH is not set
+CONFIG_FEATURE_IP_ADDRESS=y
+CONFIG_FEATURE_IP_LINK=y
+CONFIG_FEATURE_IP_LINK_CAN=y
+CONFIG_FEATURE_IP_ROUTE=y
+CONFIG_FEATURE_IP_ROUTE_DIR="/etc/iproute2"
+# CONFIG_FEATURE_IP_TUNNEL is not set
+# CONFIG_FEATURE_IP_RULE is not set
+# CONFIG_FEATURE_IP_NEIGH is not set
+# CONFIG_FEATURE_IP_RARE_PROTOCOLS is not set
+# CONFIG_IPCALC is not set
+# CONFIG_FEATURE_IPCALC_LONG_OPTIONS is not set
+# CONFIG_FEATURE_IPCALC_FANCY is not set
+# CONFIG_FAKEIDENTD is not set
+# CONFIG_NAMEIF is not set
+# CONFIG_FEATURE_NAMEIF_EXTENDED is not set
+# CONFIG_NBDCLIENT is not set
+CONFIG_NC=y
+# CONFIG_NETCAT is not set
+CONFIG_NC_SERVER=y
+# CONFIG_NC_EXTRA is not set
+# CONFIG_NC_110_COMPAT is not set
+CONFIG_NETSTAT=y
+# CONFIG_FEATURE_NETSTAT_WIDE is not set
+CONFIG_FEATURE_NETSTAT_PRG=y
+# CONFIG_NSLOOKUP is not set
+# CONFIG_FEATURE_NSLOOKUP_BIG is not set
+# CONFIG_FEATURE_NSLOOKUP_LONG_OPTIONS is not set
+# CONFIG_NTPD is not set
+# CONFIG_FEATURE_NTPD_SERVER is not set
+# CONFIG_FEATURE_NTPD_CONF is not set
+# CONFIG_FEATURE_NTP_AUTH is not set
+CONFIG_PING=y
+# CONFIG_PING6 is not set
+# CONFIG_FEATURE_FANCY_PING is not set
+# CONFIG_PSCAN is not set
+CONFIG_ROUTE=y
+# CONFIG_SLATTACH is not set
+# CONFIG_SSL_CLIENT is not set
+CONFIG_SSL_SERVER=y
+# CONFIG_TC is not set
+# CONFIG_FEATURE_TC_INGRESS is not set
+# CONFIG_TCPSVD is not set
+# CONFIG_UDPSVD is not set
+CONFIG_TELNET=y
+CONFIG_FEATURE_TELNET_TTYPE=y
+CONFIG_FEATURE_TELNET_AUTOLOGIN=y
+CONFIG_FEATURE_TELNET_WIDTH=y
+# CONFIG_TELNETD is not set
+# CONFIG_FEATURE_TELNETD_SELFTEST_DEBUG is not set
+# CONFIG_FEATURE_TELNETD_STANDALONE is not set
+CONFIG_FEATURE_TELNETD_PORT_DEFAULT=0
+# CONFIG_FEATURE_TELNETD_INETD_WAIT is not set
+# CONFIG_TFTP is not set
+# CONFIG_FEATURE_TFTP_PROGRESS_BAR is not set
+# CONFIG_FEATURE_TFTP_HPA_COMPAT is not set
+# CONFIG_TFTPD is not set
+# CONFIG_FEATURE_TFTP_GET is not set
+# CONFIG_FEATURE_TFTP_PUT is not set
+# CONFIG_FEATURE_TFTP_BLOCKSIZE is not set
+# CONFIG_TFTP_DEBUG is not set
+CONFIG_TLS=y
+# CONFIG_TRACEROUTE is not set
+# CONFIG_TRACEROUTE6 is not set
+# CONFIG_FEATURE_TRACEROUTE_VERBOSE is not set
+# CONFIG_FEATURE_TRACEROUTE_USE_ICMP is not set
+# CONFIG_TUNCTL is not set
+# CONFIG_FEATURE_TUNCTL_UG is not set
+# CONFIG_VCONFIG is not set
+CONFIG_WGET=y
+# CONFIG_FEATURE_WGET_LONG_OPTIONS is not set
+# CONFIG_FEATURE_WGET_STATUSBAR is not set
+CONFIG_FEATURE_WGET_FTP=y
+# CONFIG_FEATURE_WGET_AUTHENTICATION is not set
+# CONFIG_FEATURE_WGET_TIMEOUT is not set
+# CONFIG_FEATURE_WGET_HTTPS is not set
+# CONFIG_FEATURE_WGET_OPENSSL is not set
+# CONFIG_WHOIS is not set
+# CONFIG_ZCIP is not set
+# CONFIG_UDHCPD is not set
+# CONFIG_FEATURE_UDHCPD_BOOTP is not set
+# CONFIG_FEATURE_UDHCPD_BASE_IP_ON_MAC is not set
+# CONFIG_FEATURE_UDHCPD_WRITE_LEASES_EARLY is not set
+CONFIG_DHCPD_LEASES_FILE=""
+# CONFIG_DUMPLEASES is not set
+# CONFIG_DHCPRELAY is not set
+CONFIG_UDHCPC=y
+# CONFIG_FEATURE_UDHCPC_ARPING is not set
+CONFIG_FEATURE_UDHCPC_SANITIZEOPT=y
+CONFIG_UDHCPC_DEFAULT_SCRIPT="/usr/share/udhcpc/default.script"
+CONFIG_UDHCPC6_DEFAULT_SCRIPT=""
+# CONFIG_UDHCPC6 is not set
+# CONFIG_FEATURE_UDHCPC6_RFC3646 is not set
+# CONFIG_FEATURE_UDHCPC6_RFC4704 is not set
+# CONFIG_FEATURE_UDHCPC6_RFC4833 is not set
+# CONFIG_FEATURE_UDHCPC6_RFC5970 is not set
+
+#
+# Common options for DHCP applets
+#
+CONFIG_UDHCPC_DEFAULT_INTERFACE="eth0"
+# CONFIG_FEATURE_UDHCP_PORT is not set
+CONFIG_UDHCP_DEBUG=9
+CONFIG_UDHCPC_SLACK_FOR_BUGGY_SERVERS=80
+# CONFIG_FEATURE_UDHCP_RFC3397 is not set
+# CONFIG_FEATURE_UDHCP_8021Q is not set
+CONFIG_IFUPDOWN_UDHCPC_CMD_OPTIONS="-R"
+
+#
+# Print Utilities
+#
+# CONFIG_LPD is not set
+# CONFIG_LPR is not set
+# CONFIG_LPQ is not set
+
+#
+# Mail Utilities
+#
+CONFIG_FEATURE_MIME_CHARSET=""
+# CONFIG_MAKEMIME is not set
+# CONFIG_POPMAILDIR is not set
+# CONFIG_FEATURE_POPMAILDIR_DELIVERY is not set
+# CONFIG_REFORMIME is not set
+# CONFIG_FEATURE_REFORMIME_COMPAT is not set
+# CONFIG_SENDMAIL is not set
+
+#
+# Process Utilities
+#
+# CONFIG_FEATURE_FAST_TOP is not set
+# CONFIG_FEATURE_SHOW_THREADS is not set
+# CONFIG_FREE is not set
+# CONFIG_FUSER is not set
+# CONFIG_IOSTAT is not set
+CONFIG_KILL=y
+CONFIG_KILLALL=y
+CONFIG_KILLALL5=y
+# CONFIG_LSOF is not set
+# CONFIG_MPSTAT is not set
+# CONFIG_NMETER is not set
+# CONFIG_PGREP is not set
+# CONFIG_PKILL is not set
+# CONFIG_PIDOF is not set
+# CONFIG_FEATURE_PIDOF_SINGLE is not set
+# CONFIG_FEATURE_PIDOF_OMIT is not set
+# CONFIG_PMAP is not set
+# CONFIG_POWERTOP is not set
+# CONFIG_FEATURE_POWERTOP_INTERACTIVE is not set
+CONFIG_PS=y
+CONFIG_FEATURE_PS_WIDE=y
+CONFIG_FEATURE_PS_LONG=y
+# CONFIG_FEATURE_PS_TIME is not set
+# CONFIG_FEATURE_PS_UNUSUAL_SYSTEMS is not set
+# CONFIG_FEATURE_PS_ADDITIONAL_COLUMNS is not set
+# CONFIG_PSTREE is not set
+# CONFIG_PWDX is not set
+# CONFIG_SMEMCAP is not set
+# CONFIG_BB_SYSCTL is not set
+CONFIG_TOP=y
+CONFIG_FEATURE_TOP_INTERACTIVE=y
+CONFIG_FEATURE_TOP_CPU_USAGE_PERCENTAGE=y
+CONFIG_FEATURE_TOP_CPU_GLOBAL_PERCENTS=y
+CONFIG_FEATURE_TOP_SMP_CPU=y
+CONFIG_FEATURE_TOP_DECIMALS=y
+CONFIG_FEATURE_TOP_SMP_PROCESS=y
+# CONFIG_FEATURE_TOPMEM is not set
+# CONFIG_UPTIME is not set
+# CONFIG_FEATURE_UPTIME_UTMP_SUPPORT is not set
+CONFIG_VMSTAT=y
+# CONFIG_WATCH is not set
+
+#
+# Runit Utilities
+#
+# CONFIG_CHPST is not set
+# CONFIG_SETUIDGID is not set
+# CONFIG_ENVUIDGID is not set
+# CONFIG_ENVDIR is not set
+# CONFIG_SOFTLIMIT is not set
+# CONFIG_RUNSV is not set
+# CONFIG_RUNSVDIR is not set
+# CONFIG_FEATURE_RUNSVDIR_LOG is not set
+# CONFIG_SV is not set
+CONFIG_SV_DEFAULT_SERVICE_DIR=""
+CONFIG_SVC=y
+CONFIG_SVOK=y
+# CONFIG_SVLOGD is not set
+# CONFIG_CHCON is not set
+# CONFIG_GETENFORCE is not set
+# CONFIG_GETSEBOOL is not set
+# CONFIG_LOAD_POLICY is not set
+# CONFIG_MATCHPATHCON is not set
+# CONFIG_RUNCON is not set
+# CONFIG_SELINUXENABLED is not set
+# CONFIG_SESTATUS is not set
+# CONFIG_SETENFORCE is not set
+# CONFIG_SETFILES is not set
+# CONFIG_FEATURE_SETFILES_CHECK_OPTION is not set
+# CONFIG_RESTORECON is not set
+# CONFIG_SETSEBOOL is not set
+
+#
+# Shells
+#
+# CONFIG_SH_IS_ASH is not set
+CONFIG_SH_IS_HUSH=y
+# CONFIG_SH_IS_NONE is not set
+# CONFIG_BASH_IS_ASH is not set
+# CONFIG_BASH_IS_HUSH is not set
+CONFIG_BASH_IS_NONE=y
+# CONFIG_SHELL_ASH is not set
+# CONFIG_ASH is not set
+# CONFIG_ASH_OPTIMIZE_FOR_SIZE is not set
+# CONFIG_ASH_INTERNAL_GLOB is not set
+# CONFIG_ASH_BASH_COMPAT is not set
+# CONFIG_ASH_BASH_SOURCE_CURDIR is not set
+# CONFIG_ASH_BASH_NOT_FOUND_HOOK is not set
+# CONFIG_ASH_JOB_CONTROL is not set
+# CONFIG_ASH_ALIAS is not set
+# CONFIG_ASH_RANDOM_SUPPORT is not set
+# CONFIG_ASH_EXPAND_PRMT is not set
+# CONFIG_ASH_IDLE_TIMEOUT is not set
+# CONFIG_ASH_MAIL is not set
+# CONFIG_ASH_ECHO is not set
+# CONFIG_ASH_PRINTF is not set
+# CONFIG_ASH_TEST is not set
+# CONFIG_ASH_HELP is not set
+# CONFIG_ASH_GETOPTS is not set
+# CONFIG_ASH_CMDCMD is not set
+# CONFIG_CTTYHACK is not set
+CONFIG_HUSH=y
+CONFIG_SHELL_HUSH=y
+CONFIG_HUSH_NEED_FOR_SPEED=y
+CONFIG_HUSH_BASH_COMPAT=y
+CONFIG_HUSH_BRACE_EXPANSION=y
+# CONFIG_HUSH_BASH_SOURCE_CURDIR is not set
+CONFIG_HUSH_LINENO_VAR=y
+CONFIG_HUSH_INTERACTIVE=y
+CONFIG_HUSH_SAVEHISTORY=y
+CONFIG_HUSH_JOB=y
+CONFIG_HUSH_TICK=y
+CONFIG_HUSH_IF=y
+CONFIG_HUSH_LOOPS=y
+CONFIG_HUSH_CASE=y
+CONFIG_HUSH_ALIAS=y
+CONFIG_HUSH_FUNCTIONS=y
+CONFIG_HUSH_FUNCTION_KEYWORD=y
+CONFIG_HUSH_LOCAL=y
+CONFIG_HUSH_RANDOM_SUPPORT=y
+CONFIG_HUSH_MODE_X=y
+CONFIG_HUSH_ECHO=y
+CONFIG_HUSH_PRINTF=y
+CONFIG_HUSH_TEST=y
+CONFIG_HUSH_HELP=y
+CONFIG_HUSH_EXPORT=y
+CONFIG_HUSH_EXPORT_N=y
+CONFIG_HUSH_READONLY=y
+CONFIG_HUSH_KILL=y
+CONFIG_HUSH_WAIT=y
+CONFIG_HUSH_COMMAND=y
+CONFIG_HUSH_TRAP=y
+CONFIG_HUSH_TYPE=y
+CONFIG_HUSH_TIMES=y
+CONFIG_HUSH_READ=y
+CONFIG_HUSH_SET=y
+CONFIG_HUSH_UNSET=y
+CONFIG_HUSH_ULIMIT=y
+CONFIG_HUSH_UMASK=y
+CONFIG_HUSH_GETOPTS=y
+# CONFIG_HUSH_MEMLEAK is not set
+
+#
+# Options common to all shells
+#
+CONFIG_FEATURE_SH_MATH=y
+CONFIG_FEATURE_SH_MATH_64=y
+CONFIG_FEATURE_SH_MATH_BASE=y
+CONFIG_FEATURE_SH_EXTRA_QUIET=y
+# CONFIG_FEATURE_SH_STANDALONE is not set
+# CONFIG_FEATURE_SH_NOFORK is not set
+CONFIG_FEATURE_SH_READ_FRAC=y
+# CONFIG_FEATURE_SH_HISTFILESIZE is not set
+CONFIG_FEATURE_SH_EMBEDDED_SCRIPTS=y
+
+#
+# System Logging Utilities
+#
+# CONFIG_KLOGD is not set
+# CONFIG_FEATURE_KLOGD_KLOGCTL is not set
+# CONFIG_LOGGER is not set
+# CONFIG_LOGREAD is not set
+# CONFIG_FEATURE_LOGREAD_REDUCED_LOCKING is not set
+# CONFIG_SYSLOGD is not set
+# CONFIG_FEATURE_ROTATE_LOGFILE is not set
+# CONFIG_FEATURE_REMOTE_LOG is not set
+# CONFIG_FEATURE_SYSLOGD_DUP is not set
+# CONFIG_FEATURE_SYSLOGD_CFG is not set
+# CONFIG_FEATURE_SYSLOGD_PRECISE_TIMESTAMPS is not set
+CONFIG_FEATURE_SYSLOGD_READ_BUFFER_SIZE=0
+# CONFIG_FEATURE_IPC_SYSLOG is not set
+CONFIG_FEATURE_IPC_SYSLOG_BUFFER_SIZE=0
+# CONFIG_FEATURE_KMSG_SYSLOG is not set
diff --git a/linux/buildroot-external/board/frost/device_table.txt b/linux/buildroot-external/board/frost/device_table.txt
new file mode 100644
index 00000000..b2161793
--- /dev/null
+++ b/linux/buildroot-external/board/frost/device_table.txt
@@ -0,0 +1,15 @@
+# FROST RV32 no-MMU: static bootstrap device nodes for the initramfs.
+#
+# The rootfs is an initramfs and the kernel does not auto-mount devtmpfs for an
+# initramfs boot, so /dev is empty at exec time. init's own stdio (and getty)
+# need /dev/console before the ::sysinit devtmpfs mount runs, so we ship a
+# minimal static /dev here; devtmpfs is mounted over it in inittab afterwards.
+#
+# <name>       <type> <mode> <uid> <gid> <major> <minor> <start> <inc> <count>
+/dev           d      755    0     0     -       -       -       -     -
+/dev/console   c      600    0     0     5       1       -       -     -
+/dev/null      c      666    0     0     1       3       -       -     -
+/dev/zero      c      666    0     0     1       5       -       -     -
+/dev/kmsg      c      644    0     0     1       11      -       -     -
+/dev/ttyS0     c      660    0     0     4       64      -       -     -
+/dev/tty       c      666    0     0     5       0       -       -     -
diff --git a/linux/buildroot-external/board/frost/linux-nommu-base.config b/linux/buildroot-external/board/frost/linux-nommu-base.config
new file mode 100644
index 00000000..784b12f1
--- /dev/null
+++ b/linux/buildroot-external/board/frost/linux-nommu-base.config
@@ -0,0 +1,21 @@
+CONFIG_BLK_DEV_INITRD=y
+# CONFIG_MMU is not set
+CONFIG_SOC_VIRT=y
+CONFIG_NONPORTABLE=y
+CONFIG_ARCH_RV32I=y
+CONFIG_BINFMT_FLAT=y
+CONFIG_SLOB=y
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_INET=y
+CONFIG_NETDEVICES=y
+CONFIG_VIRTIO_BLK=y
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_OF_PLATFORM=y
+CONFIG_VIRTIO_MMIO=y
+CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y
+CONFIG_VIRTIO_NET=y
+CONFIG_EXT2_FS=y
+CONFIG_PRINTK_TIME=y
diff --git a/linux/buildroot-external/board/frost/linux-nommu-frost.config.fragment b/linux/buildroot-external/board/frost/linux-nommu-frost.config.fragment
new file mode 100644
index 00000000..be7710b4
--- /dev/null
+++ b/linux/buildroot-external/board/frost/linux-nommu-frost.config.fragment
@@ -0,0 +1,188 @@
+# =============================================================================
+# linux-nommu-frost.config.fragment
+#
+# Kernel CONFIG delta to retarget the working rv32 / nommu / M-mode kernel
+# (6.18.7, built for QEMU "virt") at the FROST SoC.
+#
+# Apply ON TOP of the known-good virt config:
+#   board/qemu/riscv32-virt/linux-nommu.config
+#
+# Each line below is: CONFIG symbol = value, with a one-line rationale.
+# A "# CONFIG_x is not set" line DISABLES the symbol (kbuild syntax), and is
+# the correct way to drop a feature in a fragment — do NOT write "=n".
+#
+# -----------------------------------------------------------------------------
+# !!! READ THIS FIRST — HARDWARE REALITY (verified against FROST RTL) !!!
+# -----------------------------------------------------------------------------
+# The task brief and the pre-existing frost-nommu.dts assume an 8250/16550 UART
+# at 0x40001000 and a SiFive CLINT at 0x40010000. NEITHER matches the actual
+# FROST hardware. Verified from:
+#   frost/hw/rtl/cpu_and_mem/cpu_and_mem.sv (MMIO decode, lines ~139-153, 825-838)
+#   frost/hw/rtl/cpu_and_mem/cpu/cpu_ooo/memory_if/data_mem_request_router.sv
+#   frost/sw/common/link.ld  /  link_ddr.ld   (MEMORY{} + PROVIDE() addresses)
+#
+# FROST MMIO map (single 44-byte window at 0x4000_0000):
+#   0x4000_0000  UART TX data        (write low byte to transmit)
+#   0x4000_0004  UART RX data        (read consumes one byte)
+#   0x4000_0008  FIFO0               (general-purpose)
+#   0x4000_000C  FIFO1               (general-purpose)
+#   0x4000_0010  mtime[31:0]         (on-chip, NOT at a SiFive-CLINT offset)
+#   0x4000_0014  mtime[63:32]
+#   0x4000_0018  mtimecmp[31:0]
+#   0x4000_001C  mtimecmp[63:32]
+#   0x4000_0020  msip                (bit0 writable)
+#   0x4000_0024  UART RX status      (bit0 = RX data available)
+#   0x4000_0028  UART TX status      (bit0 = TX can accept a byte)
+#
+# Consequences for Linux (these are DESIGN GAPS, not config knobs):
+#   * The UART is NOT 16550-register-compatible. CONFIG_SERIAL_8250 cannot
+#     drive it: 8250 expects THR@+0, LSR@+5(DR=bit0,THRE=bit5), IER/FCR/LCR.
+#     FROST has TX@+0, RX@+4, RXstat@+0x24, TXstat@+0x28 with 1-bit status.
+#     => A small custom earlycon + a tiny serial driver are required, OR a
+#        16550-register-shim must be added to the RTL UART. Until then, 8250
+#        is kept enabled ONLY so the kernel still links/boots on QEMU virt
+#        (the validation harness), where the UART really is a 16550.
+#   * There is NO PLIC. The only external-IRQ path is a single i_external_interrupt
+#     pin -> MEIP. UART RX is poll-only (no IRQ wire). => no PLIC node, and the
+#     FROST console must be polled (no IRQ-driven RX).
+#   * The timer (mtime/mtimecmp) is on-chip but NOT at the SiFive-CLINT layout
+#     (mtimecmp would need base+0x4000, mtime base+0xBFF8). Linux's
+#     riscv,clint0 driver will mis-address it. => either add a CLINT-layout
+#     alias in RTL, or rely on the rv32 riscv,cpu-intc + a FROST timer driver.
+#     For first bring-up, the in-core CSR cycle/time counters
+#     (riscv,timer / rdtime) are the pragmatic clocksource.
+#
+# This fragment therefore does two things at once:
+#   (A) the FROST-correct CONFIG deltas that ARE expressible today
+#       (rootfs=initramfs, drop virtio/PCI/net, OF/DTB, keep M-mode/rv32);
+#   (B) leaves SERIAL_8250* + EARLYCON on (brief item b) so the SAME Image
+#       boots on QEMU virt for DTB+initramfs validation. The FROST console
+#       itself needs the driver/RTL work noted above before sim bring-up.
+# =============================================================================
+
+
+# -----------------------------------------------------------------------------
+# (c) Core ISA / mode — KEEP from virt (M-mode, no-MMU, rv32, nonportable).
+#     Restated here so the fragment is self-describing and a merge that drops
+#     the base config still asserts them. These MUST stay set for FROST.
+# -----------------------------------------------------------------------------
+CONFIG_NONPORTABLE=y            # gate for the M-mode/no-MMU "not upstream-portable" combo
+CONFIG_RISCV_M_MODE=y           # FROST runs the hart in M-mode only (no S/U, no SBI)
+CONFIG_ARCH_RV32I=y             # FROST is RV32 (XLEN=32)
+# CONFIG_MMU is not set         # FROST has no MMU/page tables -> nommu kernel
+CONFIG_BINFMT_FLAT=y            # nommu userspace needs bFLT executables (busybox FLAT)
+
+
+# -----------------------------------------------------------------------------
+# (a) Rootfs: virtio-blk + ext2 disk  ->  INITRAMFS (cpio in RAM)
+#     FROST has no block device; the rootfs must be an initramfs. Two ways to
+#     supply it: (1) external "-initrd rootfs.cpio" at boot (QEMU) / a second
+#     JTAG blob (FROST), leaving CONFIG_INITRAMFS_SOURCE empty; or (2) embed
+#     the cpio in the Image via CONFIG_INITRAMFS_SOURCE. We default to (1) for
+#     flexibility and leave the embed knob present-but-empty.
+# -----------------------------------------------------------------------------
+CONFIG_BLK_DEV_INITRD=y         # enable initrd/initramfs as the root filesystem
+CONFIG_INITRAMFS_SOURCE=""      # empty => supply cpio externally (-initrd / JTAG blob); set to a path to embed
+CONFIG_RD_GZIP=y                # allow a gzip-compressed external initramfs image
+# --- drop the disk-based rootfs path ---
+# CONFIG_VIRTIO_BLK is not set  # no virtio-blk on FROST
+# CONFIG_EXT2_FS is not set     # ext2 was only for the virtio-blk disk; initramfs uses ramfs/tmpfs
+# CONFIG_BLOCK is not set       # no block layer needed once virtio-blk/ext2 are gone (drop if any kept driver needs it)
+
+
+# -----------------------------------------------------------------------------
+# (b) Console over the FROST UART.
+#     Brief asks for 8250 console + earlycon. We keep these ENABLED, but note
+#     they bind to the FROST UART ONLY on QEMU virt (a real 16550). On FROST
+#     silicon the 8250 layout does not match (see header) — a custom driver or
+#     an RTL 16550-shim is required. Keeping them on lets the one Image be
+#     validated on QEMU before sim.
+# -----------------------------------------------------------------------------
+CONFIG_SERIAL_8250=y            # 8250/16550 core (drives QEMU virt's 16550; FROST needs shim/driver)
+CONFIG_SERIAL_8250_CONSOLE=y    # register ttyS0 as the system console
+CONFIG_SERIAL_EARLYCON=y        # generic earlycon framework (pre-driver boot messages)
+CONFIG_SERIAL_8250_NR_UARTS=1   # FROST exposes one UART; trim the static port table
+CONFIG_SERIAL_8250_RUNTIME_UARTS=1
+CONFIG_SERIAL_OF_PLATFORM=y     # bind the 8250 from the DT "serial@..." node (no PCI probing)
+CONFIG_SERIAL_CORE=y            # (auto-selected) tty serial core
+CONFIG_SERIAL_CORE_CONSOLE=y    # (auto-selected) serial-as-console support
+# NOTE: the matching DTB (frost-nommu.dts) must carry, for QEMU-virt validation,
+#   chosen/bootargs: "earlycon=uart8250,mmio,<UART_BASE> console=ttyS0 ..."
+#   For the REAL FROST UART, plan instead a custom earlycon name, e.g.
+#   "earlycon=frostuart,mmio32,0x40000000" backed by a ~30-line earlycon stub
+#   that writes 0x40000000 and polls bit0 @ 0x40000028.
+
+
+# -----------------------------------------------------------------------------
+# (e) OF / DTB usage with the FROST DTB (frost-nommu.dts), not virt's.
+#     nommu rv32 has no bootloader passing a /dtb pointer the usual way; the
+#     two supported paths are (1) QEMU "-dtb frost-nommu.dtb" (external), or
+#     (2) a DTB appended/built-in to the Image for FROST. We enable OF and the
+#     built-in-DTB knob (empty by default = external -dtb) so both work.
+# -----------------------------------------------------------------------------
+CONFIG_OF=y                     # device-tree support (probe UART/timer/memory from DT)
+CONFIG_OF_EARLY_FLATTREE=y      # parse the FDT early (needed for earlycon + memory setup)
+CONFIG_USE_BUILTIN_DTB=n        # default: pass the DTB externally (QEMU -dtb / FROST blob)
+CONFIG_BUILTIN_DTB_SOURCE=""    # if embedding instead: set to "frost-nommu" and drop frost-nommu.dts into arch/riscv/boot/dts/
+# (Do NOT set CONFIG_SOC_VIRT here — see the SoC section below.)
+
+
+# -----------------------------------------------------------------------------
+# SoC selection: virt -> FROST.
+#     CONFIG_SOC_VIRT pulls in the virt machine glue (goldfish-rtc, virtio
+#     plumbing, the virt CLINT/PLIC expectations). FROST is a bare SoC. There
+#     is no in-tree CONFIG_SOC_FROST, so for first bring-up we KEEP SOC_VIRT
+#     enabled (it is what makes the rv32 nommu kernel actually boot today, and
+#     it is harmless on QEMU). Flagged as an open question: a minimal
+#     "FROST/generic nommu" machine may be cleaner than riding SOC_VIRT.
+# -----------------------------------------------------------------------------
+CONFIG_SOC_VIRT=y               # KEEP for now (boot glue + earlycon table); revisit once a FROST machine exists
+
+
+# -----------------------------------------------------------------------------
+# (d) Drop virtio / PCI / networking not needed for the MVP.
+#     These were only there for the virtio-net/blk virt target. FROST has none.
+#     Removing them shrinks the Image (helps the 256 KiB BRAM stub / DDR load).
+# -----------------------------------------------------------------------------
+# CONFIG_VIRTIO_MMIO is not set            # no virtio transport on FROST
+# CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES is not set
+# CONFIG_VIRTIO_NET is not set             # no NIC
+# CONFIG_VIRTIO_BLK is not set             # (also above) no disk
+# CONFIG_VIRTIO is not set                 # drop the virtio core
+# CONFIG_VIRTIO_MENU is not set
+# CONFIG_PCI is not set                    # FROST has no PCI host bridge
+# CONFIG_NET is not set                    # no networking stack for the MVP
+# CONFIG_PACKET is not set
+# CONFIG_INET is not set
+# CONFIG_UNIX is not set                   # (re-enable if userspace needs AF_UNIX later)
+# CONFIG_NETDEVICES is not set
+
+
+# -----------------------------------------------------------------------------
+# Interrupt controller: NO PLIC on FROST.
+#     The virt config pulls a PLIC (boot.log shows "riscv-plic ... mapped 95
+#     interrupts"). FROST has only the per-hart riscv,cpu-intc (MEIP/MTIP/MSIP
+#     direct lines). Disable PLIC so the kernel does not expect/probe one.
+#     (riscv,cpu-intc is always built for RISC-V; no symbol to set.)
+# -----------------------------------------------------------------------------
+# CONFIG_SIFIVE_PLIC is not set   # FROST has no PLIC; UART RX is poll-only, IRQs are direct cpu-intc lines
+
+
+# -----------------------------------------------------------------------------
+# Timer / clocksource.
+#     virt uses the SiFive CLINT timer (boot.log: "clint_clocksource"). FROST's
+#     mtime/mtimecmp are NOT at the SiFive-CLINT layout, so the riscv,clint0
+#     driver will mis-address them. Safest first-boot clocksource is the in-core
+#     rdtime/timebase (riscv,timer). Keep the CLINT driver available for the
+#     QEMU-virt validation pass; the FROST DTB simply must not present a
+#     mis-located clint node (use riscv,cpu-intc + timebase-frequency instead).
+# -----------------------------------------------------------------------------
+CONFIG_RISCV_TIMER=y            # in-core rdtime-based clocksource/clockevent (works in M-mode nommu)
+CONFIG_RISCV_M_MODE=y           # (restated) in M-mode, mtime/mtimecmp are accessed directly, not via SBI
+
+
+# -----------------------------------------------------------------------------
+# Quality-of-life (carry over from virt; cheap and useful for bring-up).
+# -----------------------------------------------------------------------------
+CONFIG_PRINTK_TIME=y            # timestamped boot log (matches the virt build, eases triage)
+CONFIG_SLOB=y                   # tiny allocator (carry from virt; appropriate for the small FROST footprint)
diff --git a/linux/buildroot-external/board/frost/patch_ret_from_exception.py b/linux/buildroot-external/board/frost/patch_ret_from_exception.py
new file mode 100644
index 00000000..191d34c8
--- /dev/null
+++ b/linux/buildroot-external/board/frost/patch_ret_from_exception.py
@@ -0,0 +1,1092 @@
+#!/usr/bin/env python3
+
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+"""Patch the temporary Linux bring-up image for current bring-up hazards.
+
+The external linux-mvp tree currently builds a debug kernel whose
+ret_from_exception sequence contains:
+
+    lw   a2, PT_EPC(sp)
+    sc.w zero, a2, (sp)
+    csrw mstatus, a0
+    csrw mepc, a2
+    ...
+    mret
+
+If the restored mstatus image has MIE set, the timer can preempt between the
+CSR write and MRET (an M-mode restore-window race). The trap then saves mepc at
+the MRET instruction itself, which later returns into MRET as user code and
+produces SIGILL at ret_from_exception+0x76. (The U-mode variant of that race is
+fixed in hardware -- cpu_ooo.sv seeds interrupt_resume_pc from csr_mepc on
+mret_taken -- but the M-mode restore-window variant is not yet, so this software
+crutch is still required: without it the unpatched kernel hangs at the CLINT
+clocksource switch once the periodic timer tick ramps up.)
+
+For bring-up, replace the reservation-clear SC with `andi a0, a0, -9`, clearing
+MIE in the value written to mstatus.  MRET still restores the final
+interrupt-enable state from MPIE, but the restore window is not interruptible.
+
+The target instruction is located by its unique machine-code word
+(`18c1202f`) rather than a fixed offset, so the patch survives kernel rebuilds
+that shift ret_from_exception. If the word is absent the image is assumed
+already patched (idempotent); if it occurs more than once the patch aborts
+rather than risk hitting the wrong site.
+
+Set FROST_LINUX_BOOTARGS to rewrite /chosen/bootargs in the generated DTB. This
+is useful for hardware-only boot triage such as forcing initramfs_async=0 without
+modifying the external linux-mvp artifact generator.
+
+Set FROST_LINUX_NOOP_FUNCTIONS to rewrite selected kernel functions to
+`li a0,0; ret` in the generated DDR images. This is a hardware bring-up escape
+hatch for narrow isolation runs; do not use it for correctness testing.
+
+Set FROST_LINUX_BUSYBOX to replace bin/busybox in the generated initramfs.
+This is a bring-up hook for testing BFLT header changes without rebuilding the
+external Buildroot tree.
+"""
+
+from __future__ import annotations
+
+import argparse
+import gzip
+import os
+import shutil
+import stat
+import struct
+import subprocess
+import tempfile
+from pathlib import Path
+
+
+OLD_WORD = "18c1202f"  # sc.w zero, a2, (sp) -- ret_from_exception reservation clear
+NEW_WORD = "ff757513"  # andi a0, a0, -9    -- clear mstatus.MIE in the restore value
+
+DTB_WORD = 0x200000
+INITRD_WORD = 0x204000
+KERNEL_ENTRY = 0x80000000
+FDT_MAGIC = 0xD00DFEED
+CPIO_NEWC_MAGIC = b"070701"
+CPIO_TRAILER = "TRAILER!!!"
+NOOP_INITCALL_PATCH = b"\x01\x45\x82\x80"  # c.li a0,0; c.ret
+CPU_RELAX_DIV_SYMBOL = "__delay"
+CPU_RELAX_DIV_OFFSET = 0x1C
+CPU_RELAX_DIV_OLD = b"\xb3\xc7\x07\x02"  # div a5,a5,zero
+CPU_RELAX_DIV_NEW = b"\x13\x00\x00\x00"  # nop
+CPU_RELAX_PAUSE_OFFSET = 0x20
+CPU_RELAX_PAUSE_OLD = b"\x0f\x00\x00\x01"  # pause / fence hint
+CPU_RELAX_PAUSE_NEW = b"\x13\x00\x00\x00"  # nop
+PROC_GET_INODE_MODE_RELOAD_OLD = b"\x83\xd7\x04\x00"  # lhu a5,0(s1)
+PROC_GET_INODE_MODE_RELOAD_NEW = b"\x83\x57\x09\x06"  # lhu a5,96(s2)
+PROC_GET_INODE_MODE_RELOAD_ADDRS = (0x001071B2, 0x00107220)
+PROC_GET_INODE_MODE_LOAD_ADDR = 0x0010718C
+PROC_GET_INODE_MODE_LOAD_OLD = b"\x83\x57\x09\x06"  # lhu a5,96(s2)
+PROC_GET_INODE_MODE_FORCE_REG = b"\xb7\x87\x00\x00"  # lui a5,0x8 (S_IFREG)
+PROC_LOOKUP_REF_AMO_ADDR = 0x0010BC82
+PROC_LOOKUP_REF_AMO_OLD = b"\x2f\x27\xb5\x00"  # amoadd.w a4,a1,(a0)
+PROC_LOOKUP_REF_AMO_CONST = b"\x13\x07\x10\x00"  # addi a4,zero,1
+PROC_LOOKUP_DE_ADJUST_ADDR = 0x0010BC7C
+PROC_LOOKUP_DE_ADJUST_OLD = b"\xaa\x87\x85\x45"  # mv a5,a0; li a1,1
+PROC_LOOKUP_DE_ADJUST_NEW = b"\x93\x07\x05\xfb"  # addi a5,a0,-80
+DEFAULT_SYSTEM_MAP = Path(
+    os.path.expanduser(
+        "~/bigger_l0/linux-mvp/buildroot/output/build/linux-6.18.7/System.map"
+    )
+)
+INITRD_DEVICES = {
+    "dev/console": (stat.S_IFCHR | 0o600, 5, 1),
+    "dev/null": (stat.S_IFCHR | 0o666, 1, 3),
+    "dev/random": (stat.S_IFCHR | 0o666, 1, 8),
+    "dev/ttyS0": (stat.S_IFCHR | 0o600, 4, 64),
+    "dev/urandom": (stat.S_IFCHR | 0o666, 1, 9),
+}
+DIAG_SHELL_INITTAB = """\
+console::sysinit:/bin/echo FROST_DIAG_INITTAB_START
+::sysinit:/bin/mount -t proc proc /proc
+::sysinit:/bin/mount -o remount,rw /
+::sysinit:/bin/mkdir -p /dev/pts /dev/shm /run/lock/subsys /tmp /sys
+::sysinit:/bin/mount -a
+console::sysinit:/bin/echo FROST_DIAG_INITTAB_AFTER_RCS
+console::respawn:/bin/sh
+::shutdown:/bin/umount -a -r
+"""
+SEEDRNG_NOOP = """\
+#!/bin/sh
+# FPGA bring-up has no hardware entropy source; seedrng can block PID 1 forever.
+exit 0
+"""
+
+
+def patch_ret_restore_window(path: Path) -> None:
+    """Patch the single OLD_WORD occurrence to NEW_WORD.
+
+    Works for both the dense FPGA-loader form (one word per line) and the
+    $readmemh form (skips '@<addr>' directives and blank lines).
+    """
+    lines = path.read_text().splitlines()
+    old_hits = []
+    new_hits = 0
+    for i, line in enumerate(lines):
+        s = line.strip().lower()
+        if not s or s.startswith("@"):
+            continue
+        if s == OLD_WORD:
+            old_hits.append(i)
+        elif s == NEW_WORD:
+            new_hits += 1
+    if not old_hits:
+        if new_hits:
+            return  # already patched
+        raise SystemExit(
+            f"{path}: target word {OLD_WORD} not found (and not already patched)"
+        )
+    if len(old_hits) > 1:
+        raise SystemExit(
+            f"{path}: {OLD_WORD} occurs {len(old_hits)}x; ambiguous, refusing to patch"
+        )
+    lines[old_hits[0]] = NEW_WORD
+    path.write_text("\n".join(lines) + "\n")
+
+
+def split_env_names(value: str) -> list[str]:
+    """Parse value (space/comma-separated) into a deduplicated ordered list of names."""
+    names: list[str] = []
+    seen: set[str] = set()
+    for raw_name in value.replace(",", " ").split():
+        name = raw_name.strip()
+        if not name or name in seen:
+            continue
+        names.append(name)
+        seen.add(name)
+    return names
+
+
+def resolve_system_map_symbols(system_map: Path, names: list[str]) -> dict[str, int]:
+    """Look up symbol names to byte addresses in a Linux System.map file."""
+    if not names:
+        return {}
+    if not system_map.exists():
+        raise SystemExit(f"System.map not found: {system_map}")
+
+    wanted = set(names)
+    resolved: dict[str, int] = {}
+    for line in system_map.read_text().splitlines():
+        parts = line.split()
+        if len(parts) < 3:
+            continue
+        addr, _kind, symbol = parts[:3]
+        if symbol in wanted:
+            resolved[symbol] = int(addr, 16)
+
+    missing = [name for name in names if name not in resolved]
+    if missing:
+        raise SystemExit(f"{system_map}: missing symbol(s): " + " ".join(missing))
+    return resolved
+
+
+def patch_word_byte(word: str, byte_offset: int, value: int) -> str:
+    """Patch one byte within a little-endian 4-byte hex word string and return the new word."""
+    data = bytearray(struct.pack("<I", int(word, 16)))
+    data[byte_offset] = value
+    return f"{struct.unpack('<I', data)[0]:08x}"
+
+
+def patch_dense_code_bytes(path: Path, patches: dict[int, bytes]) -> None:
+    """Apply byte-level patches to a dense (one-word-per-line) hex image file."""
+    words = [
+        line.strip().lower() for line in path.read_text().splitlines() if line.strip()
+    ]
+    for byte_addr, patch in patches.items():
+        for byte_idx, value in enumerate(patch):
+            absolute_byte = byte_addr + byte_idx
+            word_idx = absolute_byte // 4
+            byte_offset = absolute_byte % 4
+            if word_idx >= len(words):
+                raise SystemExit(
+                    f"{path}: patch address 0x{absolute_byte:x} is outside dense image"
+                )
+            words[word_idx] = patch_word_byte(words[word_idx], byte_offset, value)
+    path.write_text("\n".join(words) + "\n")
+
+
+def patch_sparse_code_bytes(path: Path, patches: dict[int, bytes]) -> None:
+    """Apply byte-level patches to a sparse (@addr-directive) hex image file."""
+    lines = path.read_text().splitlines()
+    word_line_by_addr: dict[int, int] = {}
+    current_word_addr = 0
+    for idx, line in enumerate(lines):
+        stripped = line.strip().lower()
+        if not stripped:
+            continue
+        if stripped.startswith("@"):
+            current_word_addr = int(stripped[1:], 16)
+            continue
+        word_line_by_addr[current_word_addr] = idx
+        current_word_addr += 1
+
+    for byte_addr, patch in patches.items():
+        for byte_idx, value in enumerate(patch):
+            absolute_byte = byte_addr + byte_idx
+            word_addr = absolute_byte // 4
+            byte_offset = absolute_byte % 4
+            line_idx = word_line_by_addr.get(word_addr)
+            if line_idx is None:
+                raise SystemExit(
+                    f"{path}: patch address 0x{absolute_byte:x} is outside sparse image"
+                )
+            lines[line_idx] = patch_word_byte(
+                lines[line_idx].strip().lower(), byte_offset, value
+            )
+    path.write_text("\n".join(lines) + "\n")
+
+
+def patch_code_bytes(path: Path, patches: dict[int, bytes]) -> None:
+    """Dispatch to dense or sparse patcher based on image format and apply patches."""
+    if not patches:
+        return
+    for line in path.read_text().splitlines():
+        stripped = line.strip()
+        if not stripped:
+            continue
+        if stripped.startswith("@"):
+            patch_sparse_code_bytes(path, patches)
+        else:
+            patch_dense_code_bytes(path, patches)
+        return
+    raise SystemExit(f"{path}: empty Linux DDR image")
+
+
+def patch_noop_return_zero(path: Path, symbols: dict[str, int]) -> None:
+    """Patch each symbol address with the NOOP_INITCALL_PATCH byte sequence."""
+    patch_code_bytes(path, {addr: NOOP_INITCALL_PATCH for addr in symbols.values()})
+
+
+def read_dense_code_bytes(path: Path, byte_addr: int, size: int) -> bytes:
+    """Read size bytes at byte_addr from a dense hex image file."""
+    words = [
+        line.strip().lower() for line in path.read_text().splitlines() if line.strip()
+    ]
+    data = bytearray()
+    for byte_idx in range(size):
+        absolute_byte = byte_addr + byte_idx
+        word_idx = absolute_byte // 4
+        byte_offset = absolute_byte % 4
+        if word_idx >= len(words):
+            raise SystemExit(
+                f"{path}: read address 0x{absolute_byte:x} is outside dense image"
+            )
+        data.append(struct.pack("<I", int(words[word_idx], 16))[byte_offset])
+    return bytes(data)
+
+
+def read_sparse_code_bytes(path: Path, byte_addr: int, size: int) -> bytes:
+    """Read size bytes at byte_addr from a sparse (@addr-directive) hex image file."""
+    lines = path.read_text().splitlines()
+    word_by_addr: dict[int, str] = {}
+    current_word_addr = 0
+    for line in lines:
+        stripped = line.strip().lower()
+        if not stripped:
+            continue
+        if stripped.startswith("@"):
+            current_word_addr = int(stripped[1:], 16)
+            continue
+        word_by_addr[current_word_addr] = stripped
+        current_word_addr += 1
+
+    data = bytearray()
+    for byte_idx in range(size):
+        absolute_byte = byte_addr + byte_idx
+        word_addr = absolute_byte // 4
+        byte_offset = absolute_byte % 4
+        word = word_by_addr.get(word_addr)
+        if word is None:
+            raise SystemExit(
+                f"{path}: read address 0x{absolute_byte:x} is outside sparse image"
+            )
+        data.append(struct.pack("<I", int(word, 16))[byte_offset])
+    return bytes(data)
+
+
+def read_code_bytes(path: Path, byte_addr: int, size: int) -> bytes:
+    """Dispatch to dense or sparse reader based on image format."""
+    for line in path.read_text().splitlines():
+        stripped = line.strip()
+        if not stripped:
+            continue
+        if stripped.startswith("@"):
+            return read_sparse_code_bytes(path, byte_addr, size)
+        return read_dense_code_bytes(path, byte_addr, size)
+    raise SystemExit(f"{path}: empty Linux DDR image")
+
+
+def patch_cpu_relax_div(path: Path, delay_addr: int) -> None:
+    """Patch the div-by-zero instruction inside cpu_relax (__delay+0x1C) to a NOP."""
+    patch_addr = delay_addr + CPU_RELAX_DIV_OFFSET
+    current = read_code_bytes(path, patch_addr, len(CPU_RELAX_DIV_OLD))
+    if current not in (CPU_RELAX_DIV_OLD, CPU_RELAX_DIV_NEW):
+        raise SystemExit(
+            f"{path}: {CPU_RELAX_DIV_SYMBOL}+0x{CPU_RELAX_DIV_OFFSET:x} "
+            f"at 0x{patch_addr:08x} has {current.hex()}, expected "
+            f"{CPU_RELAX_DIV_OLD.hex()}"
+        )
+    patch_code_bytes(path, {patch_addr: CPU_RELAX_DIV_NEW})
+
+
+def patch_cpu_relax_pause(path: Path, delay_addr: int) -> None:
+    """Patch the pause fence hint inside cpu_relax (__delay+0x20) to a NOP."""
+    patch_addr = delay_addr + CPU_RELAX_PAUSE_OFFSET
+    current = read_code_bytes(path, patch_addr, len(CPU_RELAX_PAUSE_OLD))
+    if current not in (CPU_RELAX_PAUSE_OLD, CPU_RELAX_PAUSE_NEW):
+        raise SystemExit(
+            f"{path}: {CPU_RELAX_DIV_SYMBOL}+0x{CPU_RELAX_PAUSE_OFFSET:x} "
+            f"at 0x{patch_addr:08x} has {current.hex()}, expected "
+            f"{CPU_RELAX_PAUSE_OLD.hex()}"
+        )
+    patch_code_bytes(path, {patch_addr: CPU_RELAX_PAUSE_NEW})
+
+
+def patch_proc_get_inode_mode_reload(path: Path) -> None:
+    """Patch all proc_get_inode mode-reload instructions to the new encoding."""
+    patches: dict[int, bytes] = {}
+    for addr in PROC_GET_INODE_MODE_RELOAD_ADDRS:
+        current = read_code_bytes(path, addr, len(PROC_GET_INODE_MODE_RELOAD_OLD))
+        if current not in (
+            PROC_GET_INODE_MODE_RELOAD_OLD,
+            PROC_GET_INODE_MODE_RELOAD_NEW,
+        ):
+            raise SystemExit(
+                f"{path}: proc_get_inode mode reload at 0x{addr:08x} "
+                f"has {current.hex()}, expected {PROC_GET_INODE_MODE_RELOAD_OLD.hex()}"
+            )
+        patches[addr] = PROC_GET_INODE_MODE_RELOAD_NEW
+    patch_code_bytes(path, patches)
+
+
+def patch_proc_get_inode_force_mode_reg(path: Path) -> None:
+    """Patch proc_get_inode to force the mode load through a register."""
+    current = read_code_bytes(
+        path, PROC_GET_INODE_MODE_LOAD_ADDR, len(PROC_GET_INODE_MODE_LOAD_OLD)
+    )
+    if current not in (PROC_GET_INODE_MODE_LOAD_OLD, PROC_GET_INODE_MODE_FORCE_REG):
+        raise SystemExit(
+            f"{path}: proc_get_inode mode load at 0x{PROC_GET_INODE_MODE_LOAD_ADDR:08x} "
+            f"has {current.hex()}, expected {PROC_GET_INODE_MODE_LOAD_OLD.hex()}"
+        )
+    patch_code_bytes(
+        path, {PROC_GET_INODE_MODE_LOAD_ADDR: PROC_GET_INODE_MODE_FORCE_REG}
+    )
+
+
+def patch_proc_lookup_ref_const(path: Path) -> None:
+    """Replace the proc_lookup_de refcount AMO with a constant-store encoding."""
+    current = read_code_bytes(
+        path, PROC_LOOKUP_REF_AMO_ADDR, len(PROC_LOOKUP_REF_AMO_OLD)
+    )
+    if current not in (PROC_LOOKUP_REF_AMO_OLD, PROC_LOOKUP_REF_AMO_CONST):
+        raise SystemExit(
+            f"{path}: proc_lookup_de refcount AMO at 0x{PROC_LOOKUP_REF_AMO_ADDR:08x} "
+            f"has {current.hex()}, expected {PROC_LOOKUP_REF_AMO_OLD.hex()}"
+        )
+    patch_code_bytes(path, {PROC_LOOKUP_REF_AMO_ADDR: PROC_LOOKUP_REF_AMO_CONST})
+
+
+def patch_proc_lookup_de_adjust(path: Path) -> None:
+    """Patch the proc_lookup_de returned-de pointer-adjustment instruction."""
+    current = read_code_bytes(
+        path, PROC_LOOKUP_DE_ADJUST_ADDR, len(PROC_LOOKUP_DE_ADJUST_OLD)
+    )
+    if current not in (PROC_LOOKUP_DE_ADJUST_OLD, PROC_LOOKUP_DE_ADJUST_NEW):
+        raise SystemExit(
+            f"{path}: proc_lookup_de returned-de adjust at "
+            f"0x{PROC_LOOKUP_DE_ADJUST_ADDR:08x} has {current.hex()}, expected "
+            f"{PROC_LOOKUP_DE_ADJUST_OLD.hex()}"
+        )
+    patch_code_bytes(path, {PROC_LOOKUP_DE_ADJUST_ADDR: PROC_LOOKUP_DE_ADJUST_NEW})
+
+
+def words_to_bytes(words: list[str]) -> bytes:
+    """Pack a list of little-endian 8-hex-digit word strings into bytes."""
+    return b"".join(struct.pack("<I", int(word, 16)) for word in words)
+
+
+def bytes_to_words(data: bytes) -> list[str]:
+    """Unpack bytes into a list of little-endian 8-hex-digit word strings."""
+    if len(data) % 4:
+        data += b"\x00" * (4 - len(data) % 4)
+    return [
+        f"{struct.unpack_from('<I', data, i)[0]:08x}" for i in range(0, len(data), 4)
+    ]
+
+
+def fdt_total_size(data: bytes) -> int:
+    """Return the total_size field from a FDT blob after validating the magic."""
+    if len(data) < 8:
+        raise SystemExit("DTB slot is too small to contain an FDT header")
+    magic, total_size = struct.unpack_from(">II", data, 0)
+    if magic != FDT_MAGIC:
+        raise SystemExit(
+            f"DTB magic mismatch: got 0x{magic:08x}, expected 0x{FDT_MAGIC:08x}"
+        )
+    if total_size > len(data):
+        raise SystemExit(
+            f"DTB total size {total_size} exceeds extracted slot {len(data)}"
+        )
+    return total_size
+
+
+def padded_dtb_slot(words: list[str]) -> bytes:
+    """Extract and zero-pad a DTB from a word list to its declared total_size."""
+    data = words_to_bytes(words)
+    if len(data) < 8:
+        raise SystemExit("DTB slot is too small to contain an FDT header")
+    magic, total_size = struct.unpack_from(">II", data, 0)
+    if magic != FDT_MAGIC:
+        raise SystemExit(
+            f"DTB magic mismatch: got 0x{magic:08x}, expected 0x{FDT_MAGIC:08x}"
+        )
+    if total_size > len(data):
+        data += b"\x00" * (total_size - len(data))
+    return data
+
+
+def fdt_tool(name: str) -> str:
+    """Locate an FDT command-line tool on PATH or raise SystemExit if absent."""
+    tool = shutil.which(name)
+    if not tool:
+        raise SystemExit(f"{name} is required in PATH")
+    return tool
+
+
+def run_fdtget_u32(dtb_path: Path, prop: str) -> int:
+    """Read a single hex /chosen property from a DTB file using fdtget."""
+    result = subprocess.run(
+        [fdt_tool("fdtget"), "-t", "x", str(dtb_path), "/chosen", prop],
+        check=True,
+        capture_output=True,
+        text=True,
+    )
+    words = result.stdout.split()
+    if len(words) != 1:
+        raise SystemExit(f"{dtb_path}: expected one {prop} cell, got {result.stdout!r}")
+    return int(words[0], 16)
+
+
+def rewrite_dtb(dtb_slot: bytes, bootargs: str | None, initrd_end: int | None) -> bytes:
+    """Rewrite bootargs and linux,initrd-end in a DTB blob using fdtput."""
+    fdtput = shutil.which("fdtput")
+    if not fdtput:
+        raise SystemExit("DTB rewriting requires fdtput in PATH")
+
+    total_size = fdt_total_size(dtb_slot)
+    old_dtb = dtb_slot[:total_size]
+    with tempfile.TemporaryDirectory(prefix="frost_dtb_") as tmp:
+        dtb_path = Path(tmp) / "frost.dtb"
+        dtb_path.write_bytes(old_dtb)
+        if bootargs is not None:
+            subprocess.run(
+                [fdtput, "-t", "s", str(dtb_path), "/chosen", "bootargs", bootargs],
+                check=True,
+            )
+        if initrd_end is not None:
+            subprocess.run(
+                [
+                    fdtput,
+                    "-t",
+                    "x",
+                    str(dtb_path),
+                    "/chosen",
+                    "linux,initrd-end",
+                    f"0x{initrd_end:08x}",
+                ],
+                check=True,
+            )
+        serial_irq_mode = os.environ.get("FROST_LINUX_SERIAL_IRQ_MODE", "poll")
+        if serial_irq_mode == "poll":
+            subprocess.run(
+                [
+                    fdtput,
+                    "-d",
+                    str(dtb_path),
+                    "/soc/serial@40001000",
+                    "interrupts-extended",
+                ],
+                check=False,
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+            )
+        elif serial_irq_mode == "cpu-local-meip":
+            subprocess.run(
+                [
+                    fdtput,
+                    "-t",
+                    "x",
+                    str(dtb_path),
+                    "/soc/serial@40001000",
+                    "interrupts-extended",
+                    "0x00000001",
+                    "0x0000000b",
+                ],
+                check=True,
+            )
+        else:
+            raise SystemExit(f"unknown FROST_LINUX_SERIAL_IRQ_MODE={serial_irq_mode!r}")
+        new_dtb = dtb_path.read_bytes()
+
+    if len(new_dtb) > (INITRD_WORD - DTB_WORD) * 4:
+        raise SystemExit(
+            f"patched DTB is {len(new_dtb)} bytes; only "
+            f"{(INITRD_WORD - DTB_WORD) * 4} bytes available before initrd"
+        )
+    return new_dtb
+
+
+def get_initrd_bounds(dtb_slot: bytes) -> tuple[int, int]:
+    """Read the initrd start and end byte addresses from a DTB blob using fdtget."""
+    total_size = fdt_total_size(dtb_slot)
+    with tempfile.TemporaryDirectory(prefix="frost_dtb_") as tmp:
+        dtb_path = Path(tmp) / "frost.dtb"
+        dtb_path.write_bytes(dtb_slot[:total_size])
+        start = run_fdtget_u32(dtb_path, "linux,initrd-start")
+        end = run_fdtget_u32(dtb_path, "linux,initrd-end")
+    if end < start:
+        raise SystemExit(f"invalid initrd bounds: start=0x{start:08x}, end=0x{end:08x}")
+    if start < KERNEL_ENTRY or (start - KERNEL_ENTRY) % 4:
+        raise SystemExit(f"unsupported initrd start: 0x{start:08x}")
+    return start, end
+
+
+def newc_pad(n: int) -> int:
+    """Return the number of padding bytes to reach the next 4-byte CPIO alignment boundary."""
+    return (-n) & 3
+
+
+def parse_newc_entry(data: bytes, offset: int) -> tuple[str, list[int], int, int, int]:
+    """Parse one CPIO newc entry, returning name, fields, body_start, next_offset, and file_size."""
+    if offset + 110 > len(data) or data[offset : offset + 6] != CPIO_NEWC_MAGIC:
+        raise SystemExit(f"initramfs is not a valid newc archive at byte {offset}")
+    fields = [
+        int(data[offset + 6 + idx * 8 : offset + 14 + idx * 8], 16) for idx in range(13)
+    ]
+    file_size = fields[6]
+    name_size = fields[11]
+    name_start = offset + 110
+    name_end = name_start + name_size
+    if name_end > len(data):
+        raise SystemExit(f"initramfs newc entry at byte {offset} has truncated name")
+    name = data[name_start : name_end - 1].decode("utf-8")
+    body_start = name_end + newc_pad(name_end)
+    next_offset = body_start + file_size + newc_pad(body_start + file_size)
+    if next_offset > len(data):
+        raise SystemExit(f"initramfs newc entry {name!r} at byte {offset} is truncated")
+    return name, fields, body_start, next_offset, file_size
+
+
+def find_newc_trailer(data: bytes) -> tuple[int, set[str]]:
+    """Scan a CPIO newc archive for the TRAILER entry and return its offset and all filenames seen."""
+    offset = 0
+    names: set[str] = set()
+    while offset < len(data):
+        name, _fields, _body_start, next_offset, _file_size = parse_newc_entry(
+            data, offset
+        )
+        names.add(name)
+        if name == CPIO_TRAILER:
+            return offset, names
+        offset = next_offset
+    raise SystemExit("initramfs newc archive has no TRAILER!!! entry")
+
+
+def make_newc_entry(
+    name: str,
+    mode: int,
+    rdev_major: int,
+    rdev_minor: int,
+    ino: int,
+    data: bytes = b"",
+    uid: int = 0,
+    gid: int = 0,
+    nlink: int = 1,
+    mtime: int = 0,
+    dev_major: int = 0,
+    dev_minor: int = 0,
+) -> bytes:
+    """Build a complete CPIO newc archive entry from name, mode, device numbers, and data."""
+    encoded_name = name.encode("utf-8") + b"\x00"
+    fields = [
+        ino,
+        mode,
+        uid,
+        gid,
+        nlink,
+        mtime,
+        len(data),
+        dev_major,
+        dev_minor,
+        rdev_major,
+        rdev_minor,
+        len(encoded_name),
+        0,  # check
+    ]
+    header = CPIO_NEWC_MAGIC + b"".join(
+        f"{field:08x}".encode("ascii") for field in fields
+    )
+    name_block = (
+        header + encoded_name + (b"\x00" * newc_pad(len(header) + len(encoded_name)))
+    )
+    return name_block + data + (b"\x00" * newc_pad(len(name_block) + len(data)))
+
+
+def make_newc_replacement_entry(name: str, fields: list[int], data: bytes) -> bytes:
+    """Rebuild a CPIO newc entry preserving the original metadata with new data."""
+    return make_newc_entry(
+        name,
+        fields[1],
+        fields[9],
+        fields[10],
+        fields[0],
+        data=data,
+        uid=fields[2],
+        gid=fields[3],
+        nlink=fields[4],
+        mtime=fields[5],
+        dev_major=fields[7],
+        dev_minor=fields[8],
+    )
+
+
+def patch_initramfs(
+    initrd_gz: bytes,
+    replacements: dict[str, bytes],
+    additions: dict[str, tuple[int, bytes]],
+    deletions: set[str],
+) -> tuple[bytes, list[str], list[str], list[str], list[str]]:
+    """Patch, add, and delete entries in a gzip-compressed CPIO initramfs."""
+    conflicts = (set(replacements) | set(additions)) & deletions
+    if conflicts:
+        raise SystemExit(
+            "initramfs paths cannot be both patched/added and deleted: "
+            + " ".join(sorted(conflicts))
+        )
+
+    initrd = gzip.decompress(initrd_gz)
+    trailer_offset, names = find_newc_trailer(initrd)
+    missing = [name for name in INITRD_DEVICES if name not in names]
+    existing_additions = set(additions) & names
+
+    if not missing and not replacements and not additions and not deletions:
+        return initrd_gz, [], [], [], []
+
+    patched_entries: list[bytes] = []
+    replaced: list[str] = []
+    deleted: list[str] = []
+    offset = 0
+    while offset < trailer_offset:
+        name, fields, body_start, next_offset, file_size = parse_newc_entry(
+            initrd, offset
+        )
+        if name in deletions:
+            deleted.append(name)
+        elif name in replacements:
+            patched_entries.append(
+                make_newc_replacement_entry(name, fields, replacements[name])
+            )
+            replaced.append(name)
+        elif name in existing_additions:
+            _mode, data = additions[name]
+            patched_entries.append(make_newc_replacement_entry(name, fields, data))
+            replaced.append(name)
+        else:
+            patched_entries.append(initrd[offset:next_offset])
+        offset = next_offset
+
+    for idx, name in enumerate(missing, start=0xF005700):
+        mode, major, minor = INITRD_DEVICES[name]
+        patched_entries.append(make_newc_entry(name, mode, major, minor, idx))
+    added_files: list[str] = []
+    for idx, (name, (mode, data)) in enumerate(additions.items(), start=0xF006700):
+        if name in names:
+            continue
+        patched_entries.append(make_newc_entry(name, mode, 0, 0, idx, data=data))
+        added_files.append(name)
+    trailer = make_newc_entry(CPIO_TRAILER, 0, 0, 0, 0)
+    patched = b"".join(patched_entries) + trailer
+
+    missing_replacements = sorted(set(replacements) - set(replaced))
+    if missing_replacements:
+        raise SystemExit(
+            "initramfs replacement target(s) not found: "
+            + " ".join(missing_replacements)
+        )
+    missing_deletions = sorted(deletions - set(deleted))
+    if missing_deletions:
+        raise SystemExit(
+            "initramfs deletion target(s) not found: " + " ".join(missing_deletions)
+        )
+    return gzip.compress(patched, mtime=0), missing, replaced, added_files, deleted
+
+
+def get_initramfs_replacements() -> dict[str, bytes]:
+    """Build the initramfs file-replacement map from FROST_LINUX_* environment variables."""
+    replacements = {
+        "etc/init.d/S01seedrng": SEEDRNG_NOOP.encode("utf-8"),
+    }
+    busybox_replacement = os.environ.get("FROST_LINUX_BUSYBOX")
+    if busybox_replacement:
+        replacements["bin/busybox"] = Path(busybox_replacement).read_bytes()
+    preset = os.environ.get("FROST_LINUX_INITTAB_PRESET")
+    raw_inittab = os.environ.get("FROST_LINUX_INITTAB")
+    if raw_inittab and preset:
+        raise SystemExit(
+            "set either FROST_LINUX_INITTAB or FROST_LINUX_INITTAB_PRESET, not both"
+        )
+    if preset == "diag-shell":
+        replacements["etc/inittab"] = DIAG_SHELL_INITTAB.encode("utf-8")
+        return replacements
+    if preset:
+        raise SystemExit(f"unknown FROST_LINUX_INITTAB_PRESET={preset!r}")
+    if raw_inittab:
+        replacements["etc/inittab"] = raw_inittab.replace("\\n", "\n").encode("utf-8")
+    return replacements
+
+
+def get_initramfs_additions() -> dict[str, tuple[int, bytes]]:
+    """Build the initramfs file-addition map from FROST_LINUX_* environment variables."""
+    additions: dict[str, tuple[int, bytes]] = {}
+    diag_init = os.environ.get("FROST_LINUX_DIAG_INIT")
+    if diag_init:
+        additions["frost_diag_init"] = (
+            stat.S_IFREG | 0o755,
+            Path(diag_init).read_bytes(),
+        )
+    return additions
+
+
+def get_initramfs_deletions() -> set[str]:
+    """Build the set of initramfs paths to delete from FROST_LINUX_* environment variables."""
+    deletions = set(
+        split_env_names(os.environ.get("FROST_LINUX_DELETE_INITRAMFS_NAMES", ""))
+    )
+    if os.environ.get("FROST_LINUX_DELETE_INITTAB") == "1":
+        deletions.add("etc/inittab")
+    return deletions
+
+
+def patch_dense_image(
+    path: Path,
+    bootargs: str | None,
+    initramfs_replacements: dict[str, bytes],
+    initramfs_additions: dict[str, tuple[int, bytes]],
+    initramfs_deletions: set[str],
+) -> tuple[list[str], list[str], list[str], list[str]]:
+    """Patch DTB and initramfs embedded in a dense Linux DDR hex image."""
+    words = [
+        line.strip().lower() for line in path.read_text().splitlines() if line.strip()
+    ]
+    if len(words) < INITRD_WORD:
+        raise SystemExit(f"{path}: dense DDR image is too short for DTB/initrd slots")
+
+    dtb_slot_words = words[DTB_WORD:INITRD_WORD]
+    dtb_slot = words_to_bytes(dtb_slot_words)
+    initrd_start, initrd_end = get_initrd_bounds(dtb_slot)
+    initrd_word = (initrd_start - KERNEL_ENTRY) // 4
+    if initrd_word != INITRD_WORD:
+        raise SystemExit(f"{path}: unexpected initrd word offset 0x{initrd_word:x}")
+    initrd_size = initrd_end - initrd_start
+    initrd_word_count = (initrd_size + 3) // 4
+    initrd_gz = words_to_bytes(words[INITRD_WORD : INITRD_WORD + initrd_word_count])[
+        :initrd_size
+    ]
+    new_initrd_gz, added_devices, replaced_files, added_files, deleted_files = (
+        patch_initramfs(
+            initrd_gz, initramfs_replacements, initramfs_additions, initramfs_deletions
+        )
+    )
+    new_initrd_end = initrd_start + len(new_initrd_gz)
+
+    new_dtb_words = bytes_to_words(rewrite_dtb(dtb_slot, bootargs, new_initrd_end))
+    if DTB_WORD + len(new_dtb_words) > INITRD_WORD:
+        raise SystemExit(f"{path}: patched DTB overlaps initrd")
+    new_initrd_words = bytes_to_words(new_initrd_gz)
+
+    words[DTB_WORD : DTB_WORD + len(new_dtb_words)] = new_dtb_words
+    for i in range(DTB_WORD + len(new_dtb_words), INITRD_WORD):
+        words[i] = "00000000"
+    words[INITRD_WORD:] = new_initrd_words
+    path.write_text("\n".join(words) + "\n")
+    return added_devices, replaced_files, added_files, deleted_files
+
+
+def patch_sparse_image(
+    path: Path,
+    bootargs: str | None,
+    initramfs_replacements: dict[str, bytes],
+    initramfs_additions: dict[str, tuple[int, bytes]],
+    initramfs_deletions: set[str],
+) -> tuple[list[str], list[str], list[str], list[str]]:
+    """Patch DTB and initramfs embedded in a sparse Linux DDR hex image."""
+
+    def is_gzip_first_word(word: str) -> bool:
+        try:
+            return (int(word, 16) & 0x00FF_FFFF) == 0x0008_8B1F
+        except ValueError:
+            return False
+
+    lines = path.read_text().splitlines()
+    dtb_directive = f"@{DTB_WORD:08x}"
+    initrd_directive = f"@{INITRD_WORD:08x}"
+    try:
+        dtb_line = next(
+            i for i, line in enumerate(lines) if line.strip().lower() == dtb_directive
+        )
+    except StopIteration as exc:
+        raise SystemExit(f"{path}: missing DTB address directive") from exc
+    initrd_line = next(
+        (i for i, line in enumerate(lines) if line.strip().lower() == initrd_directive),
+        None,
+    )
+    if initrd_line is not None and initrd_line <= dtb_line:
+        raise SystemExit(f"{path}: initrd directive appears before DTB directive")
+
+    dtb_slot_words = INITRD_WORD - DTB_WORD
+    sparse_payload_initrd_word = dtb_slot_words
+    if initrd_line is None:
+        payload_words = [
+            line.strip().lower() for line in lines[dtb_line + 1 :] if line.strip()
+        ]
+        if len(payload_words) > dtb_slot_words and is_gzip_first_word(
+            payload_words[dtb_slot_words]
+        ):
+            sparse_payload_initrd_word = dtb_slot_words
+            dtb_words = payload_words[:dtb_slot_words]
+        else:
+            gzip_word = next(
+                (
+                    idx
+                    for idx, word in enumerate(payload_words)
+                    if is_gzip_first_word(word)
+                ),
+                None,
+            )
+            if gzip_word is None:
+                raise SystemExit(
+                    f"{path}: missing initrd directive and gzip initrd header"
+                )
+            sparse_payload_initrd_word = gzip_word
+            dtb_words = payload_words[:gzip_word]
+        initrd_words = payload_words[sparse_payload_initrd_word:]
+    else:
+        dtb_words = [
+            line.strip().lower()
+            for line in lines[dtb_line + 1 : initrd_line]
+            if line.strip()
+        ]
+        initrd_words = [
+            line.strip().lower() for line in lines[initrd_line + 1 :] if line.strip()
+        ]
+    dtb_slot = padded_dtb_slot(dtb_words)
+    initrd_start, initrd_end = get_initrd_bounds(dtb_slot)
+    initrd_word = (initrd_start - KERNEL_ENTRY) // 4
+    if initrd_word != INITRD_WORD:
+        raise SystemExit(f"{path}: unexpected initrd word offset 0x{initrd_word:x}")
+    initrd_size = initrd_end - initrd_start
+    initrd_gz = words_to_bytes(initrd_words)[:initrd_size]
+    new_initrd_gz, added_devices, replaced_files, added_files, deleted_files = (
+        patch_initramfs(
+            initrd_gz, initramfs_replacements, initramfs_additions, initramfs_deletions
+        )
+    )
+    new_initrd_end = initrd_start + len(new_initrd_gz)
+
+    new_dtb_words = bytes_to_words(rewrite_dtb(dtb_slot, bootargs, new_initrd_end))
+    if DTB_WORD + len(new_dtb_words) > INITRD_WORD:
+        raise SystemExit(f"{path}: patched DTB overlaps initrd")
+    new_initrd_words = bytes_to_words(new_initrd_gz)
+
+    lines[dtb_line + 1 :] = new_dtb_words + [initrd_directive] + new_initrd_words
+    path.write_text("\n".join(lines) + "\n")
+    return added_devices, replaced_files, added_files, deleted_files
+
+
+def patch_linux_image(
+    path: Path,
+    bootargs: str | None,
+    initramfs_replacements: dict[str, bytes],
+    initramfs_additions: dict[str, tuple[int, bytes]],
+    initramfs_deletions: set[str],
+) -> tuple[list[str], list[str], list[str], list[str]]:
+    """Patch a Linux DDR image, dispatching to dense or sparse handler by format."""
+    for line in path.read_text().splitlines():
+        stripped = line.strip()
+        if not stripped:
+            continue
+        if stripped.startswith("@"):
+            return patch_sparse_image(
+                path,
+                bootargs,
+                initramfs_replacements,
+                initramfs_additions,
+                initramfs_deletions,
+            )
+        return patch_dense_image(
+            path,
+            bootargs,
+            initramfs_replacements,
+            initramfs_additions,
+            initramfs_deletions,
+        )
+    raise SystemExit(f"{path}: empty Linux DDR image")
+
+
+def main() -> None:
+    """Entry point: patches the Linux DDR image with all FROST boot patches."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("sw_ddr_mem", type=Path)
+    parser.add_argument("sw_ddr_txt", type=Path)
+    args = parser.parse_args()
+
+    patch_ret_restore_window(args.sw_ddr_mem)
+    patch_ret_restore_window(args.sw_ddr_txt)
+    print(f"Patched Linux ret_from_exception restore window: {OLD_WORD}->{NEW_WORD}")
+
+    noop_initcall_names = split_env_names(
+        os.environ.get("FROST_LINUX_NOOP_INITCALLS", "")
+    )
+    noop_function_names = split_env_names(
+        os.environ.get("FROST_LINUX_NOOP_FUNCTIONS", "")
+    )
+    system_map = Path(
+        os.environ.get("FROST_LINUX_SYSTEM_MAP", DEFAULT_SYSTEM_MAP)
+    ).expanduser()
+    noop_initcall_symbols = resolve_system_map_symbols(system_map, noop_initcall_names)
+    patch_noop_return_zero(args.sw_ddr_mem, noop_initcall_symbols)
+    patch_noop_return_zero(args.sw_ddr_txt, noop_initcall_symbols)
+    if noop_initcall_symbols:
+        patched = " ".join(
+            f"{name}@0x{noop_initcall_symbols[name]:08x}"
+            for name in noop_initcall_names
+        )
+        print(f"Patched Linux initcalls to return 0: {patched}")
+
+    noop_function_symbols = resolve_system_map_symbols(system_map, noop_function_names)
+    patch_noop_return_zero(args.sw_ddr_mem, noop_function_symbols)
+    patch_noop_return_zero(args.sw_ddr_txt, noop_function_symbols)
+    if noop_function_symbols:
+        patched = " ".join(
+            f"{name}@0x{noop_function_symbols[name]:08x}"
+            for name in noop_function_names
+        )
+        print(f"Patched Linux functions to return 0: {patched}")
+
+    if os.environ.get("FROST_LINUX_NOP_CPU_RELAX_DIV") == "1":
+        delay_addr = resolve_system_map_symbols(system_map, [CPU_RELAX_DIV_SYMBOL])[
+            CPU_RELAX_DIV_SYMBOL
+        ]
+        patch_cpu_relax_div(args.sw_ddr_mem, delay_addr)
+        patch_cpu_relax_div(args.sw_ddr_txt, delay_addr)
+        print(
+            f"Patched Linux {CPU_RELAX_DIV_SYMBOL} cpu_relax DIV-by-zero to NOP: "
+            f"{CPU_RELAX_DIV_SYMBOL}+0x{CPU_RELAX_DIV_OFFSET:x}@"
+            f"0x{delay_addr + CPU_RELAX_DIV_OFFSET:08x}"
+        )
+
+    if os.environ.get("FROST_LINUX_NOP_CPU_RELAX_PAUSE") == "1":
+        delay_addr = resolve_system_map_symbols(system_map, [CPU_RELAX_DIV_SYMBOL])[
+            CPU_RELAX_DIV_SYMBOL
+        ]
+        patch_cpu_relax_pause(args.sw_ddr_mem, delay_addr)
+        patch_cpu_relax_pause(args.sw_ddr_txt, delay_addr)
+        print(
+            f"Patched Linux {CPU_RELAX_DIV_SYMBOL} cpu_relax PAUSE to NOP: "
+            f"{CPU_RELAX_DIV_SYMBOL}+0x{CPU_RELAX_PAUSE_OFFSET:x}@"
+            f"0x{delay_addr + CPU_RELAX_PAUSE_OFFSET:08x}"
+        )
+
+    if os.environ.get("FROST_LINUX_PATCH_PROC_GET_INODE_MODE_RELOAD") == "1":
+        patch_proc_get_inode_mode_reload(args.sw_ddr_mem)
+        patch_proc_get_inode_mode_reload(args.sw_ddr_txt)
+        print(
+            "Patched Linux proc_get_inode mode reload: "
+            f"{','.join(f'0x{addr:08x}' for addr in PROC_GET_INODE_MODE_RELOAD_ADDRS)} "
+            f"{PROC_GET_INODE_MODE_RELOAD_OLD.hex()}->"
+            f"{PROC_GET_INODE_MODE_RELOAD_NEW.hex()}"
+        )
+
+    if os.environ.get("FROST_LINUX_FORCE_PROC_GET_INODE_MODE_REG") == "1":
+        patch_proc_get_inode_force_mode_reg(args.sw_ddr_mem)
+        patch_proc_get_inode_force_mode_reg(args.sw_ddr_txt)
+        print(
+            "Patched Linux proc_get_inode mode load to S_IFREG: "
+            f"0x{PROC_GET_INODE_MODE_LOAD_ADDR:08x} "
+            f"{PROC_GET_INODE_MODE_LOAD_OLD.hex()}->"
+            f"{PROC_GET_INODE_MODE_FORCE_REG.hex()}"
+        )
+
+    if os.environ.get("FROST_LINUX_PATCH_PROC_LOOKUP_REF_CONST") == "1":
+        patch_proc_lookup_ref_const(args.sw_ddr_mem)
+        patch_proc_lookup_ref_const(args.sw_ddr_txt)
+        print(
+            "Patched Linux proc_lookup_de refcount AMO result to 1: "
+            f"0x{PROC_LOOKUP_REF_AMO_ADDR:08x} "
+            f"{PROC_LOOKUP_REF_AMO_OLD.hex()}->"
+            f"{PROC_LOOKUP_REF_AMO_CONST.hex()}"
+        )
+
+    if os.environ.get("FROST_LINUX_PATCH_PROC_LOOKUP_DE_ADJUST") == "1":
+        patch_proc_lookup_de_adjust(args.sw_ddr_mem)
+        patch_proc_lookup_de_adjust(args.sw_ddr_txt)
+        print(
+            "Patched Linux proc_lookup_de returned pointer adjust: "
+            f"0x{PROC_LOOKUP_DE_ADJUST_ADDR:08x} "
+            f"{PROC_LOOKUP_DE_ADJUST_OLD.hex()}->"
+            f"{PROC_LOOKUP_DE_ADJUST_NEW.hex()}"
+        )
+
+    bootargs = os.environ.get("FROST_LINUX_BOOTARGS")
+    initramfs_replacements = get_initramfs_replacements()
+    initramfs_additions = get_initramfs_additions()
+    initramfs_deletions = get_initramfs_deletions()
+    sparse_devices, sparse_replaced, sparse_added, sparse_deleted = patch_linux_image(
+        args.sw_ddr_mem,
+        bootargs,
+        initramfs_replacements,
+        initramfs_additions,
+        initramfs_deletions,
+    )
+    dense_devices, dense_replaced, dense_added, dense_deleted = patch_linux_image(
+        args.sw_ddr_txt,
+        bootargs,
+        initramfs_replacements,
+        initramfs_additions,
+        initramfs_deletions,
+    )
+    if bootargs:
+        print(f"Patched Linux DTB bootargs: {bootargs}")
+    added_devices = sorted(set(sparse_devices) | set(dense_devices))
+    if added_devices:
+        print(f"Patched Linux initramfs device nodes: {' '.join(added_devices)}")
+    replaced_files = sorted(set(sparse_replaced) | set(dense_replaced))
+    if replaced_files:
+        print(f"Patched Linux initramfs files: {' '.join(replaced_files)}")
+    added_files = sorted(set(sparse_added) | set(dense_added))
+    if added_files:
+        print(f"Patched Linux initramfs added files: {' '.join(added_files)}")
+    deleted_files = sorted(set(sparse_deleted) | set(dense_deleted))
+    if deleted_files:
+        print(f"Patched Linux initramfs deleted files: {' '.join(deleted_files)}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/linux/buildroot-external/board/frost/patches/linux/linux.hash b/linux/buildroot-external/board/frost/patches/linux/linux.hash
new file mode 100644
index 00000000..e47db06e
--- /dev/null
+++ b/linux/buildroot-external/board/frost/patches/linux/linux.hash
@@ -0,0 +1,7 @@
+# Locally-carried hash so BR2_DOWNLOAD_FORCE_CHECK_HASHES can verify the custom
+# kernel version (BR2_LINUX_KERNEL_CUSTOM_VERSION_VALUE = "6.18.7"). Copied from
+# the upstream buildroot board/qemu/patches/linux/linux.hash used by the current
+# FROST build. Keep this in sync if the pinned kernel version changes.
+#
+# From https://www.kernel.org/pub/linux/kernel/v6.x/sha256sums.asc
+sha256  b726a4d15cf9ae06219b56d87820776e34d89fbc137e55fb54a9b9c3015b8f1e  linux-6.18.7.tar.xz
diff --git a/linux/buildroot-external/board/frost/patches/uclibc/0001-nommu-default-dl_pagesize-to-PAGE_SIZE.patch b/linux/buildroot-external/board/frost/patches/uclibc/0001-nommu-default-dl_pagesize-to-PAGE_SIZE.patch
new file mode 100644
index 00000000..7e3da257
--- /dev/null
+++ b/linux/buildroot-external/board/frost/patches/uclibc/0001-nommu-default-dl_pagesize-to-PAGE_SIZE.patch
@@ -0,0 +1,36 @@
+FROST no-MMU: default _dl_pagesize to PAGE_SIZE when the auxv lacks AT_PAGESZ
+
+The kernel binfmt_flat loader (no-MMU / bFLT) does not push an ELF auxiliary
+vector onto the initial process stack, so AT_PAGESZ is absent.  The static-libc
+dynamic-linker support (_dl_aux_init) therefore leaves _dl_pagesize == 0, and
+getpagesize() / sysconf(_SC_PAGESIZE) return 0.
+
+uClibc's malloc then rounds its first heap-extend block up to a multiple of the
+page size:
+
+    block_size = MALLOC_ROUND_UP_TO_PAGE_SIZE(size) = round_up(size, 0) = 0
+
+so the heap mmap() becomes mmap(NULL, 0, ...), which the kernel rejects with
+EINVAL.  The very first malloc() fails, busybox (init or sh) aborts with
+"out of memory", PID 1 exits, and the kernel panics ("Attempted to kill init") --
+the system never reaches a shell.
+
+Fall back to the compile-time PAGE_SIZE (which the per-arch bits/uClibc_page.h
+comments already document as the intended behaviour, "?: PAGE_SIZE") when
+AT_PAGESZ is missing.  This matches the dynamic-loader path in ldso.c and fixes
+no-MMU userspace (verified: busybox reaches an interactive shell on RV32 no-MMU
+under both QEMU and the FROST FPGA).
+
+Signed-off-by: FROST <frost@twosigma.com>
+
+--- a/libc/misc/elf/dl-support.c
++++ b/libc/misc/elf/dl-support.c
+@@ -53,7 +53,7 @@ void internal_function _dl_aux_init (ElfW(auxv_t) *av)
+    _dl_phnum = (size_t) _dl_auxvt[AT_PHNUM].a_un.a_val;
+
+    /* Get the pagesize from the aux vect */
+-   _dl_pagesize = (_dl_auxvt[AT_PAGESZ].a_un.a_val) ? (size_t) _dl_auxvt[AT_PAGESZ].a_un.a_val : 0;
++   _dl_pagesize = (_dl_auxvt[AT_PAGESZ].a_un.a_val) ? (size_t) _dl_auxvt[AT_PAGESZ].a_un.a_val : PAGE_SIZE;
+ }
+
+ #if defined(USE_TLS) && USE_TLS
diff --git a/linux/buildroot-external/board/frost/post-image.sh b/linux/buildroot-external/board/frost/post-image.sh
new file mode 100755
index 00000000..2c8f7cf6
--- /dev/null
+++ b/linux/buildroot-external/board/frost/post-image.sh
@@ -0,0 +1,88 @@
+#!/usr/bin/env bash
+
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+# Buildroot post-image hook for the FROST RV32 no-MMU Linux MVP.
+#
+# Buildroot runs this after the rootfs/image stage with BINARIES_DIR, HOST_DIR,
+# BUILD_DIR (and BASE_DIR) exported. It locates the toolchain Buildroot just
+# built and the device-tree compiler, then invokes build_fpga_boot.py to emit
+# the FROST FPGA/sim memory images:
+#
+#   $BINARIES_DIR/sw.{mem,txt}       low-BRAM boot shim
+#   $BINARIES_DIR/sw_ddr.{mem,txt}   kernel Image + DTB + initramfs in DDR
+#
+# CI then stages sw.mem / sw_ddr.mem into sw/apps/linux_boot/ for the cocotb
+# linux_boot test (see linux/buildroot-external/README.md).
+
+set -euo pipefail
+
+BOARD_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+: "${BINARIES_DIR:?BINARIES_DIR must be set (run me as a Buildroot post-image script)}"
+: "${HOST_DIR:?HOST_DIR must be set (run me as a Buildroot post-image script)}"
+
+# --- locate the rv32 cross toolchain Buildroot just produced ---
+gcc_path="$(ls "${HOST_DIR}"/bin/riscv32-*-gcc 2>/dev/null | head -n1 || true)"
+if [ -z "${gcc_path}" ]; then
+    echo "post-image.sh: no riscv32-*-gcc found in ${HOST_DIR}/bin" >&2
+    exit 1
+fi
+cross_compile="${gcc_path%gcc}"
+
+# --- locate dtc: prefer the host build, fall back to the kernel's scripts/dtc ---
+dtc_path="${HOST_DIR}/bin/dtc"
+if [ ! -x "${dtc_path}" ]; then
+    dtc_path="$(ls "${BUILD_DIR:-}"/linux-*/scripts/dtc/dtc 2>/dev/null | head -n1 || true)"
+fi
+if [ -z "${dtc_path}" ] || [ ! -x "${dtc_path}" ]; then
+    dtc_path="$(command -v dtc || true)"
+fi
+if [ -z "${dtc_path}" ]; then
+    echo "post-image.sh: no dtc found (HOST_DIR/bin, kernel scripts/dtc, or PATH)" >&2
+    exit 1
+fi
+
+export FROST_IMAGE="${BINARIES_DIR}/Image"
+export FROST_INITRD="${BINARIES_DIR}/rootfs.cpio.gz"
+export FROST_OUTDIR="${BINARIES_DIR}"
+export FROST_CROSS_COMPILE="${cross_compile}"
+export FROST_DTC="${dtc_path}"
+# The boot shim is pure rv32i integer code; use the Buildroot toolchain's own
+# default -march/-mabi to avoid an ilp32-vs-ilp32d ABI mismatch (the standalone
+# xPack default is rv32i_zicsr / ilp32, restored when these are unset).
+export FROST_SHIM_MARCH=""
+export FROST_SHIM_MABI=""
+export FPGA_CPU_CLK_FREQ="${FPGA_CPU_CLK_FREQ:-133333333}"
+
+echo "post-image.sh: packaging FROST boot image"
+echo "  Image  = ${FROST_IMAGE}"
+echo "  initrd = ${FROST_INITRD}"
+echo "  cross  = ${FROST_CROSS_COMPILE}"
+echo "  dtc    = ${FROST_DTC}"
+echo "  out    = ${FROST_OUTDIR}"
+
+python3 "${BOARD_DIR}/build_fpga_boot.py"
+
+# Apply the ret_from_exception M-mode restore-window software crutch to the
+# packed DDR image. Required for the FROST core (cocotb sim + FPGA) until the
+# RTL fix lands; idempotent (located by opcode, no-op if already patched).
+# Harmless/irrelevant for QEMU, which boots Image+rootfs directly and never
+# consumes sw_ddr.mem.
+if [ -f "${BINARIES_DIR}/sw_ddr.mem" ]; then
+    echo "post-image.sh: applying ret_from_exception M-mode-timer patch to sw_ddr"
+    python3 "${BOARD_DIR}/patch_ret_from_exception.py" \
+        "${BINARIES_DIR}/sw_ddr.mem" "${BINARIES_DIR}/sw_ddr.txt"
+fi
diff --git a/linux/buildroot-external/board/frost/rootfs-overlay/etc/inittab b/linux/buildroot-external/board/frost/rootfs-overlay/etc/inittab
new file mode 100644
index 00000000..9c7d41c8
--- /dev/null
+++ b/linux/buildroot-external/board/frost/rootfs-overlay/etc/inittab
@@ -0,0 +1,44 @@
+# /etc/inittab
+#
+# Copyright (C) 2001 Erik Andersen <andersen@codepoet.org>
+#
+# Note: BusyBox init doesn't support runlevels.  The runlevels field is
+# completely ignored by BusyBox init. If you want runlevels, use
+# sysvinit.
+#
+# Format for each entry: <id>:<runlevels>:<action>:<process>
+#
+# id        == tty to run on, or empty for /dev/console
+# runlevels == ignored
+# action    == one of sysinit, respawn, askfirst, wait, and once
+# process   == program to run
+
+# Startup the system
+# FROST no-MMU: mount devtmpfs first -- an initramfs boot does not auto-mount
+# it, and the rest of /dev is populated from it (static /dev/console bootstraps
+# init's own stdio before this runs).
+::sysinit:/bin/mount -t devtmpfs devtmpfs /dev
+::sysinit:/bin/mount -t proc proc /proc
+::sysinit:/bin/mount -o remount,rw /
+::sysinit:/bin/mkdir -p /dev/pts /dev/shm
+::sysinit:/bin/mount -a
+::sysinit:/bin/mkdir -p /run/lock/subsys
+#::sysinit:/sbin/swapon -a
+null::sysinit:/bin/ln -sf /proc/self/fd /dev/fd
+null::sysinit:/bin/ln -sf /proc/self/fd/0 /dev/stdin
+null::sysinit:/bin/ln -sf /proc/self/fd/1 /dev/stdout
+null::sysinit:/bin/ln -sf /proc/self/fd/2 /dev/stderr
+::sysinit:/bin/hostname -F /etc/hostname
+# now run any rc scripts
+::sysinit:/etc/init.d/rcS
+
+# Put a getty on the serial port
+console::respawn:/sbin/getty -L  console 0 vt100 # GENERIC_SERIAL
+
+# Stuff to do for the 3-finger salute
+#::ctrlaltdel:/sbin/reboot
+
+# Stuff to do before rebooting
+::shutdown:/etc/init.d/rcK
+#::shutdown:/sbin/swapoff -a
+::shutdown:/bin/umount -a -r
diff --git a/linux/buildroot-external/configs/frost_nommu_rv32_defconfig b/linux/buildroot-external/configs/frost_nommu_rv32_defconfig
new file mode 100644
index 00000000..2f929037
--- /dev/null
+++ b/linux/buildroot-external/configs/frost_nommu_rv32_defconfig
@@ -0,0 +1,56 @@
+# FROST RV32 no-MMU M-mode Linux -- Buildroot defconfig
+#
+# Provenance: derived from the upstream buildroot defconfig
+#   configs/qemu_riscv32_nommu_virt_defconfig
+# with the FROST retarget applied (initramfs instead of an ext2/virtio disk,
+# the FROST kernel CONFIG fragment, and a post-image step that packages the
+# Image + DTB + initramfs into the FROST FPGA/sim memory images).
+#
+# Build:
+#   make -C linux/buildroot O=$(pwd)/linux/build \
+#        BR2_EXTERNAL=$(pwd)/linux/buildroot-external frost_nommu_rv32_defconfig
+#   make -C linux/buildroot O=$(pwd)/linux/build
+# Output lands in linux/build/images/ (Image, rootfs.cpio.gz, sw_ddr.{mem,txt}).
+#
+# NOTE: gcc 15.2.0 / binutils 2.45.1 / uClibc and the rv32-nommu internal
+# toolchain are the *defaults* of the pinned Buildroot tree, so they are not
+# restated here -- pin Buildroot (see linux/buildroot-external/README.md) and
+# the toolchain is reproduced automatically.
+
+# --- Architecture: RISC-V 32-bit, no MMU ---
+BR2_riscv=y
+BR2_RISCV_32=y
+# BR2_RISCV_USE_MMU is not set
+
+# --- Host kernel headers must match the target kernel series (Linux 6.18) ---
+BR2_PACKAGE_HOST_LINUX_HEADERS_CUSTOM_6_18=y
+
+# --- Download integrity: carry the linux-6.18.7 hash in our own patch dir so
+#     BR2_DOWNLOAD_FORCE_CHECK_HASHES can verify the custom kernel tarball ---
+BR2_GLOBAL_PATCH_DIR="$(BR2_EXTERNAL_FROST_PATH)/board/frost/patches"
+BR2_DOWNLOAD_FORCE_CHECK_HASHES=y
+
+# --- Linux kernel: 6.18.7, virt-nommu base config + FROST fragment ---
+BR2_LINUX_KERNEL=y
+BR2_LINUX_KERNEL_CUSTOM_VERSION=y
+BR2_LINUX_KERNEL_CUSTOM_VERSION_VALUE="6.18.7"
+BR2_LINUX_KERNEL_USE_CUSTOM_CONFIG=y
+BR2_LINUX_KERNEL_CUSTOM_CONFIG_FILE="$(BR2_EXTERNAL_FROST_PATH)/board/frost/linux-nommu-base.config"
+BR2_LINUX_KERNEL_CONFIG_FRAGMENT_FILES="$(BR2_EXTERNAL_FROST_PATH)/board/frost/linux-nommu-frost.config.fragment"
+
+# --- Init system: busybox (nommu bFLT userspace) ---
+BR2_INIT_BUSYBOX=y
+
+# --- Root filesystem: initramfs (cpio.gz). FROST has no block device, so the
+#     rootfs is an initramfs loaded into DDR; the kernel fragment enables
+#     CONFIG_BLK_DEV_INITRD + CONFIG_RD_GZIP to consume it. ---
+BR2_TARGET_ROOTFS_CPIO=y
+BR2_TARGET_ROOTFS_CPIO_GZIP=y
+# (ext2 / virtio-blk rootfs intentionally omitted -- see the kernel fragment.)
+
+# --- Post-image: package Image + generated DTB + initramfs into the FROST
+#     FPGA/sim memory images sw.{mem,txt} and sw_ddr.{mem,txt}. ---
+BR2_ROOTFS_POST_IMAGE_SCRIPT="$(BR2_EXTERNAL_FROST_PATH)/board/frost/post-image.sh"
+BR2_PACKAGE_BUSYBOX_CONFIG="$(BR2_EXTERNAL_FROST_PATH)/board/frost/busybox.config"
+BR2_ROOTFS_DEVICE_TABLE="system/device_table.txt $(BR2_EXTERNAL_FROST_PATH)/board/frost/device_table.txt"
+BR2_ROOTFS_OVERLAY="$(BR2_EXTERNAL_FROST_PATH)/board/frost/rootfs-overlay"
diff --git a/linux/buildroot-external/external.desc b/linux/buildroot-external/external.desc
new file mode 100644
index 00000000..c7d31d16
--- /dev/null
+++ b/linux/buildroot-external/external.desc
@@ -0,0 +1,2 @@
+name: FROST
+desc: FROST RV32 no-MMU M-mode Linux board support (kernel + initramfs + FPGA boot-image packaging)
diff --git a/linux/buildroot-external/external.mk b/linux/buildroot-external/external.mk
new file mode 100644
index 00000000..fb0f3dd5
--- /dev/null
+++ b/linux/buildroot-external/external.mk
@@ -0,0 +1,22 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+# FROST BR2_EXTERNAL makefile.
+#
+# This external tree adds no custom Buildroot packages of its own: the FROST
+# Linux MVP is just an upstream kernel (6.18.7) + a busybox initramfs + a
+# post-image packaging step. The wildcard include below is the standard
+# BR2_EXTERNAL hook so that any future board/frost packages are picked up
+# automatically without editing this file.
+include $(sort $(wildcard $(BR2_EXTERNAL_FROST_PATH)/package/*/*.mk))
diff --git a/sw/README.md b/sw/README.md
index a88cc6cd..80208866 100644
--- a/sw/README.md
+++ b/sw/README.md
@@ -247,7 +247,7 @@ Control and Status Register access for performance counters and machine-mode con
 uint32_t cycles_lo = rdcycle();           // Low 32 bits of cycle counter
 uint32_t cycles_hi = rdcycleh();          // High 32 bits of cycle counter
 uint32_t instret_lo = rdinstret();        // Low 32 bits of instructions retired
-uint32_t time_lo = rdtime();              // Low 32 bits of time (aliased to cycle)
+uint32_t time_lo = rdtime();              // Low 32 bits of time (backed by CLINT mtime)
 
 // Read full 64-bit counters atomically
 uint64_t start = rdcycle64();
@@ -265,7 +265,7 @@ csr_clear(mstatus, MSTATUS_MIE);          // Clear bits in CSR
 
 **Available counters:**
 - `cycle`/`cycleh`: Clock cycles since reset (64-bit)
-- `time`/`timeh`: Wall-clock time (aliased to cycle on Frost)
+- `time`/`timeh`: Wall-clock time (backed by CLINT mtime, which ticks at the core clock on Frost)
 - `instret`/`instreth`: Instructions retired since reset (64-bit)
 
 **M-mode CSRs (for RTOS support):**
@@ -483,7 +483,7 @@ Defined in `common/link.ld`:
 |--------|--------------|---------|----------------------------------------------------|
 | ROM    | `0x00000000` | 96 KiB  | Code and small read-only data (fast BRAM, 1-cycle) |
 | RAM    | `0x00018000` | 160 KiB | Variables, BSS, and stack (fast BRAM, 1-cycle)     |
-| MMIO   | `0x40000000` | 44 B    | Memory-mapped I/O peripherals                      |
+| MMIO   | `0x40000000` | 44 B    | Memory-mapped I/O peripherals (legacy/linker window; the NS16550 UART at `0x40001000` and the SiFive CLINT alias at `0x40010000` sit above it) |
 | DDR    | `0x80000000` | 1 GiB   | Cached region: execute-from-DDR code, heap, large `.ddr_*` data |
 
 Within the DDR region, loaded `.ddr_rodata`/`.ddr_data` sections (e.g.
@@ -491,7 +491,7 @@ radix2's ~800 KiB FFT tables, routed there by per-object linker rules or an
 explicit `__attribute__((section(".ddr_rodata")))`) come first, then
 `.ddr_bss`, then the heap to the end of the gigabyte. The low-BRAM stack
 carries a 112 KiB reserve sized from measured per-workload high-water marks
-(parser's recursive XML cleanup is the deepest user at ~75 KiB), enforced by
+(parser's recursive XML cleanup is the deepest user at 112 KiB), enforced by
 a link-time assert against data+bss growth.
 
 Image delivery is split: `sw.mem`/`sw.txt` carry the low-BRAM image, and
@@ -518,6 +518,8 @@ same 256 KiB low-BRAM map.
 | MSIP           | `0x40000020` | Machine software interrupt pending       |
 | UART_RX_STATUS | `0x40000024` | UART RX status (bit 0 = data available)  |
 | UART_TX_STATUS | `0x40000028` | UART TX status (bit 0 = can accept byte) |
+| NS16550        | `0x40001000` | NS16550-compatible UART registers (`0x40001000`-`0x4000101C`) |
+| CLINT alias    | `0x40010000` | SiFive CLINT-compatible alias of MSIP/mtimecmp/mtime (for Linux) |
 
 **Notes:**
 - Simple timing uses Zicntr CSR counters (cycle, instret) via single-instruction reads. See `csr.h` and `timer.h`.
@@ -550,7 +552,7 @@ include ../../common/common.mk
 
 ## Architecture Notes
 
-Frost implements **RV32GCB** with full M-mode privilege support. See the [root README](../README.md) for the full ISA extension table and architecture details.
+Frost implements **RV32GCB** with Machine (M) and User (U) privilege modes. See the [root README](../README.md) for the full ISA extension table and architecture details.
 
 ### Test Result Markers
 
@@ -565,8 +567,8 @@ These markers are distinct from individual test output (like `PASS: test_name`)
 
 **Special cases:**
 - **hello_world**: Open-ended (loops forever); passes when "Hello, world!" is printed
-- **coremark**: Long-running benchmark; passes when "CoreMark" welcome message is printed
-- **freertos_demo**: Runs multiple iterations; passes when "PASS" is printed
+- **linux_boot**: Kernel boot; passes when the "Linux version" boot banner is printed (uses a boot-health checker in full CI runs)
+- **uart_echo**: Interactive; the harness injects UART input and passes when the prompt, echo, and response are observed (no `<<PASS>>` marker)
 
 ### Other Details
 
diff --git a/sw/apps/clint_test/Makefile b/sw/apps/clint_test/Makefile
new file mode 100644
index 00000000..8621641d
--- /dev/null
+++ b/sw/apps/clint_test/Makefile
@@ -0,0 +1,17 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+# Makefile for the SiFive CLINT alias directed test
+SRC_C := main.c
+include ../../common/common.mk
diff --git a/sw/apps/clint_test/main.c b/sw/apps/clint_test/main.c
new file mode 100644
index 00000000..0f7e24a9
--- /dev/null
+++ b/sw/apps/clint_test/main.c
@@ -0,0 +1,112 @@
+/*
+ *    Copyright 2026 Two Sigma Open Source, LLC
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+/*
+ * SiFive CLINT alias directed test (Increment 2 of the no-MMU Linux glue).
+ *
+ * FROST exposes a sifive,clint0-compatible window at 0x4001_0000 (msip @ +0,
+ * mtimecmp @ +0x4000, mtime @ +0xBFF8) that aliases the native FROST timer
+ * registers, so a stock Linux CLINT driver can deliver the timer tick. This
+ * test proves the alias two ways:
+ *   1. writes through the CLINT addresses are observable at the native timer
+ *      addresses (same physical registers);
+ *   2. an actual machine timer interrupt set up entirely through the CLINT
+ *      window fires with mcause = 0x8000_0007.
+ */
+
+#include <stdint.h>
+
+/* SiFive CLINT alias window. */
+#define CLINT_MSIP (*(volatile uint32_t *) 0x40010000u)
+#define CLINT_MTIMECMP_LO (*(volatile uint32_t *) 0x40014000u)
+#define CLINT_MTIMECMP_HI (*(volatile uint32_t *) 0x40014004u)
+#define CLINT_MTIME_LO (*(volatile uint32_t *) 0x4001BFF8u)
+
+/* Native FROST timer registers (the aliased physical registers). */
+#define NAT_MTIMECMP_LO (*(volatile uint32_t *) 0x40000018u)
+#define NAT_MTIMECMP_HI (*(volatile uint32_t *) 0x4000001Cu)
+#define NAT_MSIP (*(volatile uint32_t *) 0x40000020u)
+#define NAT_MTIME_LO (*(volatile uint32_t *) 0x40000010u)
+
+/* Native UART for the PASS/FAIL marker. */
+#define UTX (*(volatile uint32_t *) 0x40000000u)
+#define UTX_ST (*(volatile uint32_t *) 0x40000028u)
+static void putc_(char c)
+{
+    while (!(UTX_ST & 1u)) {
+    }
+    UTX = (uint8_t) c;
+}
+static void puts_(const char *s)
+{
+    while (*s)
+        putc_(*s++);
+}
+
+static volatile uint32_t g_cause;
+
+/* Machine trap handler. GCC's "interrupt" attribute emits the register
+ * save/restore and MRET, so it is safe as a normal C function. */
+__attribute__((interrupt("machine"), aligned(4))) static void mtrap(void)
+{
+    uint32_t mc;
+    __asm__ volatile("csrr %0, mcause" : "=r"(mc));
+    g_cause = mc;
+    /* Ack: push the compare (through the CLINT alias) to max so it cannot
+     * refire. */
+    CLINT_MTIMECMP_HI = 0xFFFFFFFFu;
+    CLINT_MTIMECMP_LO = 0xFFFFFFFFu;
+}
+
+int main(void)
+{
+    int ok = 1;
+
+    __asm__ volatile("csrw mtvec, %0" ::"r"(&mtrap)); /* direct mode */
+
+    /* 1a. mtimecmp written via CLINT is visible at the native address. */
+    CLINT_MTIMECMP_LO = 0x12345678u;
+    CLINT_MTIMECMP_HI = 0x9ABCDEF0u;
+    ok &= (NAT_MTIMECMP_LO == 0x12345678u);
+    ok &= (NAT_MTIMECMP_HI == 0x9ABCDEF0u);
+
+    /* 1b. msip written via CLINT is visible at the native address. */
+    CLINT_MSIP = 1u;
+    ok &= ((NAT_MSIP & 1u) == 1u);
+    CLINT_MSIP = 0u;
+    ok &= ((NAT_MSIP & 1u) == 0u);
+
+    /* 1c. CLINT mtime and native mtime read the same advancing counter. */
+    uint32_t t_clint = CLINT_MTIME_LO;
+    uint32_t t_nat = NAT_MTIME_LO; /* read after -> >= */
+    ok &= (t_nat >= t_clint);
+
+    /* 2. A machine timer interrupt set up entirely through the CLINT window. */
+    g_cause = 0u;
+    CLINT_MTIMECMP_HI = 0xFFFFFFFFu; /* block premature fire */
+    CLINT_MTIMECMP_LO = CLINT_MTIME_LO + 1000u;
+    CLINT_MTIMECMP_HI = 0u;
+    __asm__ volatile("csrs mie, %0" ::"r"(0x80));    /* MTIE */
+    __asm__ volatile("csrs mstatus, %0" ::"r"(0x8)); /* MIE */
+    for (volatile int i = 0; i < 1000000 && g_cause == 0u; i++) {
+    }
+    ok &= (g_cause == 0x80000007u);
+
+    puts_(ok ? "\r\n<<PASS>>\r\n" : "\r\n<<FAIL>>\r\n");
+    for (;;) {
+    }
+    return 0;
+}
diff --git a/sw/apps/compile_app.py b/sw/apps/compile_app.py
index 9f4b6b0c..ef308bb6 100755
--- a/sw/apps/compile_app.py
+++ b/sw/apps/compile_app.py
@@ -17,8 +17,9 @@
 """Compile a FROST software application.
 
 This module provides a function to compile applications in sw/apps/.
-Used by test_run_cocotb.py, test_run_yosys.py, load_software.py, and build.py
-to ensure binaries are always up-to-date before simulation, synthesis, or FPGA loading.
+Used by test_run_cocotb.py and test_run_yosys.py to ensure binaries are always
+up-to-date before simulation or synthesis. The FPGA flows (e.g. load_software.py)
+compile applications independently with their own board-specific paths.
 """
 
 import os
diff --git a/sw/apps/csr_rmw_test/Makefile b/sw/apps/csr_rmw_test/Makefile
new file mode 100644
index 00000000..f57fce6d
--- /dev/null
+++ b/sw/apps/csr_rmw_test/Makefile
@@ -0,0 +1,17 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+# Makefile for the CSR read-modify-write directed test
+SRC_C := ../../lib/src/uart.c main.c
+include ../../common/common.mk
diff --git a/sw/apps/csr_rmw_test/main.c b/sw/apps/csr_rmw_test/main.c
new file mode 100644
index 00000000..b1550440
--- /dev/null
+++ b/sw/apps/csr_rmw_test/main.c
@@ -0,0 +1,106 @@
+/*
+ *    Copyright 2026 Two Sigma Open Source, LLC
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+/*
+ * Directed CSR read-modify-write test.
+ *
+ * No-MMU M-mode Linux panics on the FIRST machine-timer interrupt with
+ * epc==ra==garbage (a `ret` through a clobbered return address). The kernel's
+ * trap entry swaps the thread pointer with `csrrw tp, mscratch, tp`, while the
+ * PASSING paths (umode_test, FreeRTOS) only ever use separate csrr/csrw -- so
+ * FROST's CSR read-modify-write instructions are an untested differentiator.
+ *
+ * This isolates whether csrrw/csrrs/csrrc correctly (a) return the OLD CSR
+ * value into rd AND (b) write the new value -- including the same-register swap
+ * idiom (`csrrw t0, mscratch, t0`) the kernel depends on. Self-checks over UART
+ * with <<PASS>> / <<FAIL>>.
+ */
+
+#include <stdint.h>
+
+#include "uart.h"
+
+static int g_ok = 1;
+
+static void check(const char *name, uint32_t got, uint32_t want)
+{
+    int ok = (got == want);
+    if (!ok)
+        g_ok = 0;
+    uart_printf("%s %s: got=%08x want=%08x\n", ok ? "[PASS]" : "[FAIL]", name, got, want);
+}
+
+static inline uint32_t rd_scratch(void)
+{
+    uint32_t v;
+    __asm__ volatile("csrr %0, mscratch" : "=r"(v));
+    return v;
+}
+
+static inline void wr_scratch(uint32_t v)
+{
+    __asm__ volatile("csrw mscratch, %0" : : "r"(v));
+}
+
+int main(void)
+{
+    uint32_t old, cur, swapped;
+
+    uart_printf("\n=== CSR read-modify-write directed test ===\n");
+
+    /* csrrw: rd <- old(CSR); CSR <- rs1 */
+    wr_scratch(0xAAAA1111u);
+    __asm__ volatile("li t0, 0xBBBB2222\n\tcsrrw %0, mscratch, t0" : "=r"(old) : : "t0");
+    cur = rd_scratch();
+    check("csrrw returns old", old, 0xAAAA1111u);
+    check("csrrw writes new", cur, 0xBBBB2222u);
+
+    /* csrrs: rd <- old; CSR <- old | rs1 */
+    wr_scratch(0xF0F0F0F0u);
+    __asm__ volatile("li t0, 0x0F0F0F0F\n\tcsrrs %0, mscratch, t0" : "=r"(old) : : "t0");
+    cur = rd_scratch();
+    check("csrrs returns old", old, 0xF0F0F0F0u);
+    check("csrrs sets bits", cur, 0xFFFFFFFFu);
+
+    /* csrrc: rd <- old; CSR <- old & ~rs1 */
+    wr_scratch(0xFFFFFFFFu);
+    __asm__ volatile("li t0, 0x0F0F0F0F\n\tcsrrc %0, mscratch, t0" : "=r"(old) : : "t0");
+    cur = rd_scratch();
+    check("csrrc returns old", old, 0xFFFFFFFFu);
+    check("csrrc clears bits", cur, 0xF0F0F0F0u);
+
+    /* csrrw with x0 destination must STILL write the CSR (== csrw). */
+    wr_scratch(0x12345678u);
+    __asm__ volatile("li t0, 0x9ABCDEF0\n\tcsrrw x0, mscratch, t0" : : : "t0");
+    cur = rd_scratch();
+    check("csrrw x0-dest still writes", cur, 0x9ABCDEF0u);
+
+    /* THE KERNEL PATTERN: `csrrw t0, mscratch, t0` (same reg as rd and rs1 =
+     * atomic swap). After: t0 <- old(CSR), CSR <- old(t0). */
+    wr_scratch(0xCAFEBABEu);
+    __asm__ volatile("li t0, 0xDEADBEEF\n\tcsrrw t0, mscratch, t0\n\tmv %0, t0"
+                     : "=r"(swapped)
+                     :
+                     : "t0");
+    cur = rd_scratch();
+    check("csrrw swap: reg<-old", swapped, 0xCAFEBABEu);
+    check("csrrw swap: CSR<-reg", cur, 0xDEADBEEFu);
+
+    uart_printf(g_ok ? "\n<<PASS>>\n" : "\n<<FAIL>>\n");
+    for (;;) {
+    }
+    return 0;
+}
diff --git a/sw/apps/ddr_atomic_test/Makefile b/sw/apps/ddr_atomic_test/Makefile
new file mode 100644
index 00000000..aa2cc97b
--- /dev/null
+++ b/sw/apps/ddr_atomic_test/Makefile
@@ -0,0 +1,17 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+# Makefile for the DDR (cached-tier) atomics reproducer
+SRC_C := main.c
+include ../../common/common.mk
diff --git a/sw/apps/ddr_atomic_test/main.c b/sw/apps/ddr_atomic_test/main.c
new file mode 100644
index 00000000..8e12286f
--- /dev/null
+++ b/sw/apps/ddr_atomic_test/main.c
@@ -0,0 +1,150 @@
+/*
+ *    Copyright 2026 Two Sigma Open Source, LLC
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+/*
+ * Directed reproducer for RV32-A atomics to the CACHED DDR region.
+ *
+ * A no-MMU Linux boot hangs on a store-conditional (sc.w.rl) to a printk
+ * ring-buffer descriptor in DDR -- i.e. LR/SC to the cached tier deadlocks,
+ * even though atomics to low BRAM work (FreeRTOS A-extension stress passes).
+ *
+ * This isolates it: the target variable lives in .ddr_data (DDR / cached
+ * tier). A progress letter is printed BEFORE each step so the last letter
+ * received over UART pinpoints which operation wedged:
+ *   "S"      started
+ *   "SL"     plain DDR store/load OK  (hang at AMO)
+ *   "SLA"    AMO (amoadd.w) to DDR OK (hang at LR/SC)
+ *   "SLAC"   LR/SC to DDR OK
+ *   "<<PASS>>" all DDR atomics work (then the kernel hang is elsewhere)
+ */
+
+#include <stdint.h>
+
+#define UTX (*(volatile uint32_t *) 0x40000000u)
+#define UTX_ST (*(volatile uint32_t *) 0x40000028u)
+static void putc_(char c)
+{
+    while (!(UTX_ST & 1u)) {
+    }
+    UTX = (uint8_t) c;
+}
+static void puts_(const char *s)
+{
+    while (*s)
+        putc_(*s++);
+}
+
+/* Lives in the cached DDR region. */
+__attribute__((section(".ddr_data"))) static volatile uint32_t ddr_var = 0x10;
+struct pde_like {
+    uint32_t in_use;
+    uint32_t refcnt;
+    uint8_t pad[88];
+    uint16_t mode;
+    uint8_t flags;
+    uint8_t namelen;
+    uint32_t tail;
+};
+__attribute__((section(".ddr_data"))) static volatile struct pde_like ddr_pde_like;
+
+int main(void)
+{
+    putc_('S');
+
+    /* 1. plain DDR store/load (should already work -- ddr_test passes). */
+    ddr_var = 0x20;
+    if (ddr_var != 0x20) {
+        puts_("\r\n<<FAIL>> ddr store/load\r\n");
+        for (;;) {
+        }
+    }
+    putc_('L');
+
+    /* 2. AMO to DDR (amoadd.w). Hangs here if AMO-to-cached deadlocks. */
+    uint32_t old_amo;
+    __asm__ volatile("amoadd.w %0, %2, (%1)" : "=r"(old_amo) : "r"(&ddr_var), "r"(1u) : "memory");
+    if (old_amo != 0x20) {
+        puts_("\r\n<<FAIL>> amo old value\r\n");
+        for (;;) {
+        }
+    }
+    if (ddr_var != 0x21) {
+        puts_("\r\n<<FAIL>> amo result\r\n");
+        for (;;) {
+        }
+    }
+    putc_('A');
+
+    /* 2b. Refcount-like repeated AMO increments: validate both old and new values. */
+    ddr_var = 1;
+    for (uint32_t i = 0; i < 256; i++) {
+        uint32_t old_loop;
+        __asm__ volatile("amoadd.w %0, %2, (%1)"
+                         : "=r"(old_loop)
+                         : "r"(&ddr_var), "r"(1u)
+                         : "memory");
+        if (old_loop != i + 1 || ddr_var != i + 2) {
+            puts_("\r\n<<FAIL>> amo loop value\r\n");
+            for (;;) {
+            }
+        }
+    }
+    putc_('R');
+
+    /* 2c. Proc-dir-entry-like layout: AMO at +4 must not corrupt mode at +96. */
+    ddr_pde_like.in_use = 0x11111111u;
+    ddr_pde_like.refcnt = 1u;
+    ddr_pde_like.mode = 0x8124u;
+    ddr_pde_like.flags = 0x5au;
+    ddr_pde_like.namelen = 7u;
+    ddr_pde_like.tail = 0xa5a55a5au;
+    for (uint32_t i = 0; i < 256; i++) {
+        uint32_t old_ref;
+        __asm__ volatile("amoadd.w %0, %2, (%1)"
+                         : "=r"(old_ref)
+                         : "r"(&ddr_pde_like.refcnt), "r"(1u)
+                         : "memory");
+        if (old_ref != i + 1 || ddr_pde_like.refcnt != i + 2 ||
+            ddr_pde_like.in_use != 0x11111111u || ddr_pde_like.mode != 0x8124u ||
+            ddr_pde_like.flags != 0x5au || ddr_pde_like.namelen != 7u ||
+            ddr_pde_like.tail != 0xa5a55a5au) {
+            puts_("\r\n<<FAIL>> amo struct corruption\r\n");
+            for (;;) {
+            }
+        }
+    }
+    putc_('P');
+
+    /* 3. LR/SC compare-exchange to DDR (matches the kernel's sc.w.rl). */
+    uint32_t prev;
+    __asm__ volatile("1: lr.w    %0, (%1)\n"
+                     "   sc.w.rl t0, %2, (%1)\n"
+                     "   bnez    t0, 1b\n"
+                     : "=&r"(prev)
+                     : "r"(&ddr_var), "r"(0xABCDu)
+                     : "t0", "memory");
+    if (ddr_var != 0xABCDu) {
+        puts_("\r\n<<FAIL>> lr/sc result\r\n");
+        for (;;) {
+        }
+    }
+    putc_('C');
+
+    puts_("\r\n<<PASS>>\r\n");
+    for (;;) {
+    }
+    return 0;
+}
diff --git a/sw/apps/drain_trapframe_test/Makefile b/sw/apps/drain_trapframe_test/Makefile
new file mode 100644
index 00000000..1150cd86
--- /dev/null
+++ b/sw/apps/drain_trapframe_test/Makefile
@@ -0,0 +1,20 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+# Trap-frame store-visibility-under-L1D-eviction directed test ("Bug B" relocated
+# to pt_regs s2). Force the whole program into cached DDR so the trap-frame save
+# store and the conflicting eviction accesses all traverse the L1D -> DDR path.
+override MEM_CONFIG := ddr
+SRC_C := ../../lib/src/uart.c main.c
+include ../../common/common.mk
diff --git a/sw/apps/drain_trapframe_test/main.c b/sw/apps/drain_trapframe_test/main.c
new file mode 100644
index 00000000..038ae846
--- /dev/null
+++ b/sw/apps/drain_trapframe_test/main.c
@@ -0,0 +1,511 @@
+/*
+ *    Copyright 2026 Two Sigma Open Source, LLC
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+/*
+ * Directed test for "Bug B" relocated to the kernel trap frame (pt_regs):
+ * TRAP-FRAME STORE-VISIBILITY UNDER L1D CACHE EVICTION.
+ *
+ * Real-world failure being reproduced: at a procfs panic the callee-saved
+ * register s2 came back as 0x19999998 (a value name_to_int materialises)
+ * instead of its proper pointer, after a machine-timer interrupt fired during
+ * active kernel code. Suspected mechanism (same class as the fence.i/SMC "store
+ * leaves the SQ when sent, not when landed" bug): the naked trap entry saves
+ * GPRs to the kernel stack ("sw s2, 72(sp)"); if that committed store is
+ * considered drained from the store queue BEFORE its data physically lands in
+ * the write-back L1D, and cache pressure during the handler EVICTS that stack
+ * line (writing STALE data back to DDR), then the trap exit "lw s2, 72(sp)"
+ * refills from DDR and restores s2 WRONG.
+ *
+ * Construction (full frost SoC, cached/DDR tier, MEM_CONFIG=ddr):
+ *   - A faithful Linux-style naked trap entry saves the full pt_regs frame to a
+ *     cached-DDR "kernel stack" at a FIXED line-aligned address (FRAME_BASE), so
+ *     the exact L1D set holding the saved s2 word is known and can be evicted
+ *     deterministically. s2 sits at offset 72, exactly as in the real handler.
+ *   - Before each interrupt the frame's s2 cache line is PRE-POISONED with
+ *     0x19999998 (the real failing value), so a non-landed save read-back yields
+ *     EXACTLY the real-world wrong value.
+ *   - A cold-miss DDR drain store is issued just before the IRQ window (like
+ *     wfi_drain_mepc_test) so a store is in flight / the memory subsystem is
+ *     busy when the trap is taken.
+ *   - AFTER saving the frame (s2 stored LAST, immediately before the eviction)
+ *     the handler AGGRESSIVELY EVICTS the saved s2 line by striding through
+ *     cached-DDR addresses that map to the SAME L1D set. The L1D is 128 KiB
+ *     DIRECT-MAPPED with 32-byte lines (hw/rtl/lib/cache/frost_cache.sv,
+ *     L1_CACHE_BYTES=128*1024), so address A and A + 0x20000 collide in one set.
+ *   - The handler then reads the s2 slot back (the load under test) and checks.
+ *
+ * Discriminator (the key result):
+ *   code=29 : the incoming ARCHITECTURAL s2 was already wrong (precise-state /
+ *             rename corruption) -- not the target bug.
+ *   code=30 : the SAVED frame value was already wrong BEFORE eviction
+ *             (the store never became visible at all).
+ *   code=31 : the saved frame was CORRECT before eviction but the post-eviction
+ *             read-back is WRONG  ==> the store/eviction memory-visibility bug.
+ *             THIS is the targeted reproduction.
+ *
+ * The timer margin is swept finely (0..255) so the IRQ lands at every offset
+ * across the drain+handler window; the per-margin "gap" (filler between the s2
+ * store and the eviction) is also swept (0..15) to sample the in-flight window.
+ * Resume is via a fixed continuation (the handler redirects mepc), so a wrong
+ * mepc is never fatal. Run with CACHED_HAS_L2=0 (Genesys2 / HW-faithful shape,
+ * where a cold write-back actually drains) and DDR_MODEL_LATENCY>=70.
+ *
+ * PASS  -> prints <<PASS>> (no margin ever corrupts a restored register).
+ * FAIL  -> prints <<FAIL>> with code + margin + expected/actual (e.g. s2).
+ */
+
+#include <stdint.h>
+
+#include "trap.h"
+#include "uart.h"
+
+/* ---- L1D geometry (frost_cache.sv: 128 KiB direct-mapped, 32 B lines) ---- */
+#define L1D_STRIDE 0x00020000u /* 128 KiB: A and A+stride share one set */
+#define N_EVICT 6u             /* conflicting lines touched per eviction */
+
+/* ---- Fixed cached-DDR "kernel stack" for the trap frame (line aligned) ---- */
+#define FRAME_BASE 0x82000000u
+#define FRAME_TOP (FRAME_BASE + 144u)   /* pt_regs is 144 bytes; sp on entry */
+#define S2_LINE_BASE (FRAME_BASE + 64u) /* 32 B line holding s2@72 (64..95) */
+
+/* Cold DDR region for the per-margin in-flight drain store. Chosen so its L1D
+ * sets (2048..) never collide with the frame's s2 set (2), and far from the
+ * program and the frame. */
+#define DRAIN_BASE 0x83010000u
+#define DRAIN_LINE 64u
+
+#define MARGIN_MIN 0u
+#define MARGIN_MAX 255u
+
+#define POISON_S2 0x19999998u /* the real name_to_int value */
+
+/* Globals referenced by name from the naked asm (kept non-static, used). */
+uint32_t g_s2_target; /* &g_s2_target is the pointer-like correct s2 value */
+
+volatile uint32_t g_ticks;
+volatile uint32_t g_irq_count;
+volatile uint32_t g_expected_s2;
+volatile uint32_t g_gap;
+volatile uint32_t g_timer_margin;
+volatile uint32_t g_drain_addr;
+volatile uint32_t g_cont;       /* fixed continuation PC for the handler */
+volatile uint32_t g_cret;       /* irq_window() return address into C */
+volatile uint32_t g_csp;        /* irq_window() caller stack pointer */
+volatile uint32_t g_save_s[12]; /* main's callee-saved s0..s11 spill */
+
+volatile uint32_t g_last_code;
+volatile uint32_t g_last_reg;
+volatile uint32_t g_last_expected;
+volatile uint32_t g_last_actual;
+
+/*
+ * Naked M-mode timer trap entry. Faithful Linux-style pt_regs save/restore to a
+ * cached-DDR "kernel stack" (sp == FRAME_TOP, set by irq_window), with s2 saved
+ * LAST (immediately before the eviction) and the saved s2 line then evicted from
+ * the direct-mapped L1D. Records the discriminator codes. Resumes via the fixed
+ * continuation in g_cont so a wrong mepc cannot wedge the sweep.
+ */
+__attribute__((naked, used, aligned(4))) static void trapframe_irq_entry(void)
+{
+    __asm__ volatile("addi sp, sp, -144\n"
+                     /* ---- save the frame (everything EXCEPT s2 first) ---- */
+                     "sw   ra, 4(sp)\n"
+                     "sw   gp, 12(sp)\n"
+                     "sw   tp, 16(sp)\n"
+                     "sw   t0, 20(sp)\n"
+                     "sw   t1, 24(sp)\n"
+                     "sw   t2, 28(sp)\n"
+                     "sw   s0, 32(sp)\n"
+                     "sw   s1, 36(sp)\n"
+                     "sw   a0, 40(sp)\n"
+                     "sw   a1, 44(sp)\n"
+                     "sw   a2, 48(sp)\n"
+                     "sw   a3, 52(sp)\n"
+                     "sw   a4, 56(sp)\n"
+                     "sw   a5, 60(sp)\n"
+                     "sw   a6, 64(sp)\n"
+                     "sw   a7, 68(sp)\n"
+                     "sw   s3, 76(sp)\n"
+                     "sw   s4, 80(sp)\n"
+                     "sw   s5, 84(sp)\n"
+                     "sw   s6, 88(sp)\n"
+                     "sw   s7, 92(sp)\n"
+                     "sw   s8, 96(sp)\n"
+                     "sw   s9, 100(sp)\n"
+                     "sw   s10, 104(sp)\n"
+                     "sw   s11, 108(sp)\n"
+                     "sw   t3, 112(sp)\n"
+                     "sw   t4, 116(sp)\n"
+                     "sw   t5, 120(sp)\n"
+                     "sw   t6, 124(sp)\n"
+                     "csrr t0, mepc\n"
+                     "sw   t0, 0(sp)\n"
+                     "csrr t0, mstatus\n"
+                     "sw   t0, 128(sp)\n"
+                     /* preload the gap count into a saved scratch (t4) so the s2-store ->
+                      * eviction distance is ALU-only and not perturbed by a memory read */
+                     "la   t4, g_gap\n"
+                     "lw   t4, 0(t4)\n"
+                     /* ---- code=29: incoming architectural s2 vs expected (precise state) */
+                     "la   t0, g_expected_s2\n"
+                     "lw   t0, 0(t0)\n"
+                     "beq  s2, t0, 1f\n"
+                     "la   t1, g_last_code\n"
+                     "lw   t2, 0(t1)\n"
+                     "bnez t2, 1f\n"
+                     "li   t2, 29\n"
+                     "sw   t2, 0(t1)\n"
+                     "la   t1, g_last_reg\n"
+                     "li   t2, 2\n"
+                     "sw   t2, 0(t1)\n"
+                     "la   t1, g_last_expected\n"
+                     "sw   t0, 0(t1)\n"
+                     "la   t1, g_last_actual\n"
+                     "sw   s2, 0(t1)\n"
+                     "1:\n"
+                     /* ================= STORE UNDER TEST: sw s2, 72(sp) ================= */
+                     "sw   s2, 72(sp)\n"
+                     /* ---- tunable gap (ALU only) ---- */
+                     "2:\n"
+                     "beqz t4, 3f\n"
+                     "addi t4, t4, -1\n"
+                     "j    2b\n"
+                     "3:\n"
+                     /* ---- code=30: saved value BEFORE eviction (forwards from SQ if the
+                      * store is still in flight; reads L1D otherwise) ---- */
+                     "lw   t0, 72(sp)\n"
+                     "la   t1, g_expected_s2\n"
+                     "lw   t1, 0(t1)\n"
+                     "beq  t0, t1, 4f\n"
+                     "la   t2, g_last_code\n"
+                     "lw   t3, 0(t2)\n"
+                     "bnez t3, 4f\n"
+                     "li   t3, 30\n"
+                     "sw   t3, 0(t2)\n"
+                     "la   t2, g_last_reg\n"
+                     "li   t3, 2\n"
+                     "sw   t3, 0(t2)\n"
+                     "la   t2, g_last_expected\n"
+                     "sw   t1, 0(t2)\n"
+                     "la   t2, g_last_actual\n"
+                     "sw   t0, 0(t2)\n"
+                     "4:\n"
+                     /* ---- EVICT the saved s2 line: stride by the L1D size so every access
+                      * maps to the SAME set with a different tag (direct-mapped), evicting
+                      * and writing back the just-stored dirty frame line ---- */
+                     "li   t1, 0x82000040\n" /* S2_LINE_BASE */
+                     "li   t2, 0x20000\n"    /* L1D_STRIDE  */
+                     "li   t3, 6\n"          /* N_EVICT     */
+                     "5:\n"
+                     "lw   t5, 0(t1)\n"
+                     "add  t1, t1, t2\n"
+                     "addi t3, t3, -1\n"
+                     "bnez t3, 5b\n"
+                     /* ============ LOAD UNDER TEST: lw s2, 72(sp) (post-evict) ==========
+                      * line was evicted -> this misses -> refills from DDR -> sees whatever
+                      * the eviction wrote back. code=31 if it differs (the targeted bug). */
+                     "lw   t0, 72(sp)\n"
+                     "la   t1, g_expected_s2\n"
+                     "lw   t1, 0(t1)\n"
+                     "beq  t0, t1, 6f\n"
+                     "la   t2, g_last_code\n"
+                     "lw   t3, 0(t2)\n"
+                     "bnez t3, 6f\n"
+                     "li   t3, 31\n"
+                     "sw   t3, 0(t2)\n"
+                     "la   t2, g_last_reg\n"
+                     "li   t3, 2\n"
+                     "sw   t3, 0(t2)\n"
+                     "la   t2, g_last_expected\n"
+                     "sw   t1, 0(t2)\n"
+                     "la   t2, g_last_actual\n"
+                     "sw   t0, 0(t2)\n"
+                     "6:\n"
+                     /* ---- supporting witnesses on the same line: s3@76, s4@80 ---- */
+                     "lw   t0, 76(sp)\n"
+                     "li   t1, 0x51000003\n"
+                     "beq  t0, t1, 7f\n"
+                     "la   t2, g_last_code\n"
+                     "lw   t3, 0(t2)\n"
+                     "bnez t3, 7f\n"
+                     "li   t3, 31\n"
+                     "sw   t3, 0(t2)\n"
+                     "la   t2, g_last_reg\n"
+                     "li   t3, 3\n"
+                     "sw   t3, 0(t2)\n"
+                     "la   t2, g_last_expected\n"
+                     "sw   t1, 0(t2)\n"
+                     "la   t2, g_last_actual\n"
+                     "sw   t0, 0(t2)\n"
+                     "7:\n"
+                     "lw   t0, 80(sp)\n"
+                     "li   t1, 0x51000004\n"
+                     "beq  t0, t1, 8f\n"
+                     "la   t2, g_last_code\n"
+                     "lw   t3, 0(t2)\n"
+                     "bnez t3, 8f\n"
+                     "li   t3, 31\n"
+                     "sw   t3, 0(t2)\n"
+                     "la   t2, g_last_reg\n"
+                     "li   t3, 4\n"
+                     "sw   t3, 0(t2)\n"
+                     "la   t2, g_last_expected\n"
+                     "sw   t1, 0(t2)\n"
+                     "la   t2, g_last_actual\n"
+                     "sw   t0, 0(t2)\n"
+                     "8:\n"
+                     /* ---- side effects (scratch t0..t2, restored below) ---- */
+                     "li   t1, 0x4000001C\n" /* MTIMECMP_HI := -1 : disarm so no refire */
+                     "li   t0, -1\n"
+                     "sw   t0, 0(t1)\n"
+                     "la   t1, g_ticks\n"
+                     "li   t0, 1\n"
+                     "sw   t0, 0(t1)\n"
+                     "la   t1, g_irq_count\n"
+                     "lw   t0, 0(t1)\n"
+                     "addi t0, t0, 1\n"
+                     "sw   t0, 0(t1)\n"
+                     "la   t1, g_cont\n" /* fixed continuation -> robust to a bad mepc */
+                     "lw   t0, 0(t1)\n"
+                     "csrw mepc, t0\n"
+                     "lw   t0, 128(sp)\n"
+                     "csrw mstatus, t0\n"
+                     /* ---- restore the frame (faithful trap exit) ---- */
+                     "lw   ra, 4(sp)\n"
+                     "lw   gp, 12(sp)\n"
+                     "lw   tp, 16(sp)\n"
+                     "lw   s0, 32(sp)\n"
+                     "lw   s1, 36(sp)\n"
+                     "lw   a0, 40(sp)\n"
+                     "lw   a1, 44(sp)\n"
+                     "lw   a2, 48(sp)\n"
+                     "lw   a3, 52(sp)\n"
+                     "lw   a4, 56(sp)\n"
+                     "lw   a5, 60(sp)\n"
+                     "lw   a6, 64(sp)\n"
+                     "lw   a7, 68(sp)\n"
+                     "lw   s2, 72(sp)\n"
+                     "lw   s3, 76(sp)\n"
+                     "lw   s4, 80(sp)\n"
+                     "lw   s5, 84(sp)\n"
+                     "lw   s6, 88(sp)\n"
+                     "lw   s7, 92(sp)\n"
+                     "lw   s8, 96(sp)\n"
+                     "lw   s9, 100(sp)\n"
+                     "lw   s10, 104(sp)\n"
+                     "lw   s11, 108(sp)\n"
+                     "lw   t3, 112(sp)\n"
+                     "lw   t4, 116(sp)\n"
+                     "lw   t5, 120(sp)\n"
+                     "lw   t6, 124(sp)\n"
+                     "lw   t0, 20(sp)\n"
+                     "lw   t1, 24(sp)\n"
+                     "lw   t2, 28(sp)\n"
+                     "addi sp, sp, 144\n"
+                     "mret\n");
+}
+
+/*
+ * Naked per-margin window. Preserves main's callee-saved registers, sets up the
+ * cached-DDR frame stack + poison + drain store, arms the timer, loads the s0..
+ * s11 sentinels, enables MIE, and spins until the handler fires. The handler
+ * redirects mepc to label 9 (the fixed continuation). Reads its per-margin
+ * inputs (g_timer_margin, g_gap, g_drain_addr, g_expected_s2) from globals set
+ * by C before the call.
+ */
+__attribute__((naked, used, noinline)) static void irq_window(void)
+{
+    __asm__ volatile(
+        /* preserve main's callee-saved s0..s11 (we clobber them with sentinels) */
+        "la   t0, g_save_s\n"
+        "sw   s0, 0(t0)\n"
+        "sw   s1, 4(t0)\n"
+        "sw   s2, 8(t0)\n"
+        "sw   s3, 12(t0)\n"
+        "sw   s4, 16(t0)\n"
+        "sw   s5, 20(t0)\n"
+        "sw   s6, 24(t0)\n"
+        "sw   s7, 28(t0)\n"
+        "sw   s8, 32(t0)\n"
+        "sw   s9, 36(t0)\n"
+        "sw   s10, 40(t0)\n"
+        "sw   s11, 44(t0)\n"
+        "la   t0, g_csp\n"
+        "sw   sp, 0(t0)\n"
+        "la   t0, g_cret\n"
+        "sw   ra, 0(t0)\n"
+        /* fixed continuation for the handler's mepc redirect */
+        "la   t0, g_cont\n"
+        "la   t1, 9f\n"
+        "sw   t1, 0(t0)\n"
+        "la   t0, g_ticks\n"
+        "sw   x0, 0(t0)\n"
+        /* faithful kernel stack pointer: handler does sw s2, 72(sp) */
+        "li   sp, 0x82000090\n" /* FRAME_TOP */
+        /* PRE-POISON the frame's s2 line so a non-landed save reads a stale
+         * value; s2 slot gets 0x19999998 (the real name_to_int value). */
+        "li   t0, 0x82000000\n" /* FRAME_BASE */
+        "li   t1, 0x19999998\n"
+        "sw   t1, 72(t0)\n"
+        "li   t1, 0x19999993\n"
+        "sw   t1, 76(t0)\n"
+        "li   t1, 0x19999994\n"
+        "sw   t1, 80(t0)\n"
+        "li   t1, 0x19999995\n"
+        "sw   t1, 84(t0)\n"
+        "li   t1, 0x19999996\n"
+        "sw   t1, 88(t0)\n"
+        "li   t1, 0x19999997\n"
+        "sw   t1, 92(t0)\n"
+        /* COLD-MISS DRAIN STORE: a fresh DDR line, in flight when the IRQ hits */
+        "la   t0, g_drain_addr\n"
+        "lw   t0, 0(t0)\n"
+        "li   t1, 0xD2A14000\n"
+        "sw   t1, 0(t0)\n"
+        /* ARM the timer: mtimecmp = mtime + margin */
+        "la   t0, g_timer_margin\n"
+        "lw   t0, 0(t0)\n"
+        "li   t2, 0x40000010\n" /* MTIME_LO base */
+        "lw   t3, 4(t2)\n"      /* mtime hi (0x14) */
+        "lw   t4, 0(t2)\n"      /* mtime lo (0x10) */
+        "add  t4, t4, t0\n"
+        "li   t1, 0x40000018\n" /* MTIMECMP_LO base */
+        "li   t5, -1\n"
+        "sw   t5, 4(t1)\n" /* MTIMECMP_HI = max (0x1C) */
+        "sw   t4, 0(t1)\n" /* MTIMECMP_LO (0x18)      */
+        "sw   t3, 4(t1)\n" /* MTIMECMP_HI = hi (0x1C) */
+        /* sentinels into s0..s11 (s2 = pointer-like expected) -- LAST */
+        "li   s0, 0x51000000\n"
+        "li   s1, 0x51000001\n"
+        "la   s2, g_s2_target\n"
+        "li   s3, 0x51000003\n"
+        "li   s4, 0x51000004\n"
+        "li   s5, 0x51000005\n"
+        "li   s6, 0x51000006\n"
+        "li   s7, 0x51000007\n"
+        "li   s8, 0x51000008\n"
+        "li   s9, 0x51000009\n"
+        "li   s10, 0x5100000a\n"
+        "li   s11, 0x5100000b\n"
+        "csrsi mstatus, 8\n" /* enable MIE -> armed timer fires into handler */
+        "li   t0, 0\n"
+        "10:\n"
+        "la   t1, g_ticks\n"
+        "lw   t1, 0(t1)\n"
+        "bnez t1, 9f\n"
+        "la   t1, g_last_code\n"
+        "lw   t1, 0(t1)\n"
+        "bnez t1, 9f\n"
+        "addi t0, t0, 1\n"
+        "li   t1, 200000\n"
+        "bltu t0, t1, 10b\n"
+        "9:\n" /* continuation (handler redirects mepc here) */
+        "csrci mstatus, 8\n"
+        /* restore main's s0..s11 */
+        "la   t0, g_save_s\n"
+        "lw   s0, 0(t0)\n"
+        "lw   s1, 4(t0)\n"
+        "lw   s2, 8(t0)\n"
+        "lw   s3, 12(t0)\n"
+        "lw   s4, 16(t0)\n"
+        "lw   s5, 20(t0)\n"
+        "lw   s6, 24(t0)\n"
+        "lw   s7, 28(t0)\n"
+        "lw   s8, 32(t0)\n"
+        "lw   s9, 36(t0)\n"
+        "lw   s10, 40(t0)\n"
+        "lw   s11, 44(t0)\n"
+        "la   t0, g_csp\n"
+        "lw   sp, 0(t0)\n"
+        "la   t0, g_cret\n"
+        "lw   ra, 0(t0)\n"
+        "ret\n");
+}
+
+int main(void)
+{
+    uint32_t n29 = 0, n30 = 0, n31 = 0, fired = 0, nofire = 0;
+    uint32_t first_margin = 0xFFFFFFFFu;
+    uint32_t first_code = 0, first_reg = 0, first_exp = 0, first_act = 0;
+
+    uart_printf("\n=== drain trap-frame eviction test (Bug B @ pt_regs s2) ===\n");
+    uart_printf("L1D=128KiB direct-mapped 32B lines; evict stride=0x%08x; frame@0x%08x s2@72\n",
+                L1D_STRIDE,
+                FRAME_BASE);
+
+    g_expected_s2 = (uint32_t) &g_s2_target;
+    set_trap_handler(&trapframe_irq_entry);
+    csr_set(mie, MIE_MTIE);
+    disable_interrupts();
+
+    for (uint32_t margin = MARGIN_MIN; margin <= MARGIN_MAX; margin++) {
+        g_timer_margin = margin;
+        g_gap = margin & 15u;
+        g_drain_addr = DRAIN_BASE + margin * DRAIN_LINE;
+        g_expected_s2 = (uint32_t) &g_s2_target;
+        g_last_code = 0;
+        g_last_reg = 0;
+        g_last_expected = 0;
+        g_last_actual = 0;
+        g_ticks = 0;
+
+        irq_window();
+
+        if (g_ticks == 0u) {
+            nofire++;
+            continue;
+        }
+        fired++;
+        if (g_last_code == 29u) {
+            n29++;
+        } else if (g_last_code == 30u) {
+            n30++;
+        } else if (g_last_code == 31u) {
+            n31++;
+        }
+        if (g_last_code != 0u && first_margin == 0xFFFFFFFFu) {
+            first_margin = margin;
+            first_code = g_last_code;
+            first_reg = g_last_reg;
+            first_exp = g_last_expected;
+            first_act = g_last_actual;
+        }
+    }
+
+    disable_timer_interrupt();
+    disable_interrupts();
+
+    uart_printf(
+        "sweep: fired=%u nofire=%u code29=%u code30=%u code31=%u\n", fired, nofire, n29, n30, n31);
+    uart_printf("expected_s2=%08x irq_count=%u\n", g_expected_s2, g_irq_count);
+
+    if (n29 == 0u && n30 == 0u && n31 == 0u && fired > 0u) {
+        uart_printf("<<PASS>>\n");
+    } else {
+        uart_printf("FAIL first_margin=%u code=%u reg=s%u expected=%08x actual=%08x\n",
+                    first_margin,
+                    first_code,
+                    first_reg,
+                    first_exp,
+                    first_act);
+        uart_printf("codes: 29=precise-state 30=save-not-visible 31=eviction/visibility\n");
+        uart_printf("<<FAIL>>\n");
+    }
+
+    for (;;) {
+    }
+    return 0;
+}
diff --git a/sw/apps/fetch_stall_repro/Makefile b/sw/apps/fetch_stall_repro/Makefile
new file mode 100644
index 00000000..130362ff
--- /dev/null
+++ b/sw/apps/fetch_stall_repro/Makefile
@@ -0,0 +1,84 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+# Makefile for fetch_stall_repro
+# Self-contained assembly directed test (defines its own _start, no crt0.S).
+
+ARCH = rv32imac_zicsr_zicntr_zifencei_zba_zbb_zbs_zicond_zbkb_zihintpause
+ABI = ilp32
+
+RISCV_PREFIX ?= riscv-none-elf-
+AS      = $(RISCV_PREFIX)as
+LD      = $(RISCV_PREFIX)ld
+CC      = $(RISCV_PREFIX)gcc
+OBJCOPY = $(RISCV_PREFIX)objcopy
+OBJDUMP = $(RISCV_PREFIX)objdump
+
+DDR_BOOT_STUB_SRC = ../../common/crt0_ddr_boot.S
+DDR_BOOT_STUB_OBJ = crt0_ddr_boot.o
+
+MEM_CONFIG ?= bram
+
+ifeq ($(MEM_CONFIG),bram)
+LINKER_SCRIPT = ../../common/link.ld
+BOOT_STUB_OBJ =
+DDR_SECTIONS  = .ddr_text .ddr_rodata .ddr_data
+else ifeq ($(MEM_CONFIG),ddr)
+LINKER_SCRIPT = ../../common/link_ddr.ld
+BOOT_STUB_OBJ = $(DDR_BOOT_STUB_OBJ)
+DDR_SECTIONS  = .text .rodata .data .sdata .ddr_text .ddr_rodata .ddr_data
+else
+$(error MEM_CONFIG must be one of: bram, ddr (got '$(MEM_CONFIG)'))
+endif
+
+EXECUTABLE_ELF_FILE  = sw.elf
+VERILOG_HEX_FILE     = sw.mem
+DDR_VERILOG_HEX_FILE = sw_ddr.mem
+RAW_BINARY_FILE      = sw.bin
+VIVADO_BRAM_FILE     = sw.txt
+DISASSEMBLY_FILE     = sw.S
+
+all: $(EXECUTABLE_ELF_FILE) $(VERILOG_HEX_FILE) $(DDR_VERILOG_HEX_FILE) $(RAW_BINARY_FILE) $(VIVADO_BRAM_FILE) $(DISASSEMBLY_FILE)
+
+$(DDR_BOOT_STUB_OBJ): $(DDR_BOOT_STUB_SRC)
+	$(CC) -march=$(ARCH) -mabi=$(ABI) -nostdlib -nostartfiles -c -o $@ $<
+
+$(EXECUTABLE_ELF_FILE): fetch_stall_repro.S $(BOOT_STUB_OBJ) $(LINKER_SCRIPT)
+	$(AS) -march=$(ARCH) -mabi=$(ABI) -o fetch_stall_repro.o fetch_stall_repro.S
+	$(LD) -m elf32lriscv -T $(LINKER_SCRIPT) -o $@ $(BOOT_STUB_OBJ) fetch_stall_repro.o
+	@rm -f fetch_stall_repro.o
+
+$(VERILOG_HEX_FILE): $(EXECUTABLE_ELF_FILE)
+	$(OBJCOPY) -O verilog --verilog-data-width 4 -R .comment -R .note.gnu.build-id \
+		$(addprefix -R ,$(DDR_SECTIONS)) $< $@
+
+$(DDR_VERILOG_HEX_FILE): $(EXECUTABLE_ELF_FILE)
+	-$(OBJCOPY) -O verilog --verilog-data-width 4 $(addprefix -j ,$(DDR_SECTIONS)) \
+		--change-addresses -0x80000000 $< $@ 2>/dev/null
+	@if [ ! -s $@ ]; then echo 00000000 > $@; fi
+
+$(RAW_BINARY_FILE): $(EXECUTABLE_ELF_FILE)
+	$(OBJCOPY) -O binary -R .comment -R .note.gnu.build-id \
+		$(addprefix -R ,$(DDR_SECTIONS)) $< $@
+
+$(VIVADO_BRAM_FILE): $(RAW_BINARY_FILE)
+	xxd -e -g4 -c4 $< | awk '{printf "%08x\n", strtonum("0x" $$2)}' > $@
+
+$(DISASSEMBLY_FILE): $(EXECUTABLE_ELF_FILE)
+	$(OBJDUMP) -d $< > $@
+
+clean:
+	rm -f $(EXECUTABLE_ELF_FILE) $(VERILOG_HEX_FILE) $(DDR_VERILOG_HEX_FILE) \
+		$(RAW_BINARY_FILE) $(VIVADO_BRAM_FILE) $(DISASSEMBLY_FILE) \
+		fetch_stall_repro.o $(DDR_BOOT_STUB_OBJ)
diff --git a/sw/apps/fetch_stall_repro/fetch_stall_repro.S b/sw/apps/fetch_stall_repro/fetch_stall_repro.S
new file mode 100644
index 00000000..7d01d9ea
--- /dev/null
+++ b/sw/apps/fetch_stall_repro/fetch_stall_repro.S
@@ -0,0 +1,125 @@
+# Directed cached-fetch PC-step repro (executes from .ddr_text through the L1I)
+#
+# Reproduces the HW front-end defect where the core steps PC +2 instead of +4 on
+# a 32-bit instruction (mis-decoding it as compressed), landing mid-instruction.
+# On genesys2 this fires at workqueue_init_early (epc=0x8038d7fa, 2 bytes into a
+# 32-bit `sw zero,4(s1)`), deterministically.
+#
+# The defect needs the cached L1I fetch path: a fetch stall (!fetch_progress ->
+# sel_nop) from a 32-byte line-fill coinciding with a 32-bit instruction near a
+# line boundary. A BRAM fetch-fuzz run did NOT trigger it, so this version runs
+# the stream from cached DDR (.ddr_text) COLD, so every line is a miss -> a
+# regular line-fill stall, like the boot.
+#
+# Stream = [compressed-nop run][32-bit nops] blocks; compressed run length 3..6
+# sweeps the 32-bit nops across every alignment vs the line boundaries. A 32-bit
+# nop is 0x00000013 (UPPER half 0x0000 = illegal compressed). Any +2 mis-step
+# onto one fetches 0x0000 -> illegal-instruction trap; the handler prints mepc
+# (the mid-instruction PC) and <<FAIL>>. Clean run -> <<PASS>>.
+
+    .section .init
+    .option push
+    .option norelax
+    .globl _start
+
+.macro CRUN n
+    .option rvc
+    .rept \n
+    c.nop
+    .endr
+    .option norvc
+.endm
+
+_start:
+    .option norvc
+    la      t0, trap_handler
+    csrw    mtvec, t0
+    lui     sp, %hi(_stack_top)
+    addi    sp, sp, %lo(_stack_top)
+    lui     s0, 0x40000             # UART base
+    la      a0, msg_header
+    jal     ra, print_string
+    # Call the cached-DDR fetch-stall stream (absolute address; .ddr_text @ DDR).
+    lui     t0, %hi(ddr_pattern)
+    addi    t0, t0, %lo(ddr_pattern)
+    jalr    ra, t0
+    la      a0, msg_pass            # survived clean -> PASS
+    jal     ra, print_string
+done:
+    .option rvc
+    c.j     done
+    .option norvc
+
+    .balign 4
+trap_handler:
+    csrr    s1, mepc                # faulting PC (mid-instruction if +2 bug)
+    la      a0, msg_trap
+    jal     ra, print_string
+    mv      a0, s1
+    jal     ra, print_hex
+    la      a0, msg_fail
+    jal     ra, print_string
+trap_done:
+    .option rvc
+    c.j     trap_done
+    .option norvc
+
+    .balign 4
+print_string:
+    mv      t2, a0
+1:  lb      t1, 0(t2)
+    beqz    t1, 2f
+    sb      t1, 0(s0)
+    addi    t2, t2, 1
+    j       1b
+2:  ret
+
+    .balign 4
+print_hex:
+    mv      t2, a0
+    li      t4, 28
+1:  srl     t1, t2, t4
+    andi    t1, t1, 0xf
+    li      t5, 10
+    blt     t1, t5, 2f
+    addi    t1, t1, 0x57            # 'a'-10
+    j       3f
+2:  addi    t1, t1, 0x30            # '0'
+3:  sb      t1, 0(s0)
+    addi    t4, t4, -4
+    bgez    t4, 1b
+    ret
+
+    .option pop
+
+# ===== cached-DDR fetch-stall stream (fetched through the L1I) =====
+    .section .ddr_text, "ax"
+    .option push
+    .option norelax
+    .balign 32
+ddr_pattern:
+    .rept 300
+    CRUN 3
+    nop ; nop ; nop ; nop
+    CRUN 4
+    nop ; nop ; nop ; nop
+    CRUN 5
+    nop ; nop ; nop ; nop
+    CRUN 6
+    nop ; nop ; nop ; nop
+    .endr
+    .option norvc
+    ret
+    .option pop
+
+    .section .rodata
+msg_header: .asciz "=== fetch_stall_repro (ddr) ===\n"
+msg_pass:   .asciz "\n<<PASS>>\n"
+msg_trap:   .asciz "\nILLEGAL TRAP mepc="
+msg_fail:   .asciz " <<FAIL>>\n"
+
+    .section .bss
+    .align 4
+stack_bottom:
+    .space 512
+_stack_top:
diff --git a/sw/apps/freertos_demo/FreeRTOSConfig.h b/sw/apps/freertos_demo/FreeRTOSConfig.h
index 59d83ad8..d69287bb 100644
--- a/sw/apps/freertos_demo/FreeRTOSConfig.h
+++ b/sw/apps/freertos_demo/FreeRTOSConfig.h
@@ -18,7 +18,7 @@
  * FreeRTOS Configuration for FROST RISC-V Processor
  *
  * This configuration is for a minimal FreeRTOS setup targeting:
- *   - RV32GCB with M-mode only
+ *   - RV32GCB with Machine (M) and User (U) privilege modes
  *   - Single core (mhartid = 0)
  *   - CLINT-style timer (mtime/mtimecmp)
  *   - 300 MHz clock frequency
diff --git a/sw/apps/irq_mie_window/Makefile b/sw/apps/irq_mie_window/Makefile
new file mode 100644
index 00000000..ecaa3b92
--- /dev/null
+++ b/sw/apps/irq_mie_window/Makefile
@@ -0,0 +1,19 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+#    Copyright 2026 Two Sigma Open Source, LLC
+#    SPDX-License-Identifier: Apache-2.0
+# Short-MIE-window lost-interrupt directed test (registered-pending erase race)
+SRC_C := main.c
+include ../../common/common.mk
diff --git a/sw/apps/irq_mie_window/main.c b/sw/apps/irq_mie_window/main.c
new file mode 100644
index 00000000..6a541b7d
--- /dev/null
+++ b/sw/apps/irq_mie_window/main.c
@@ -0,0 +1,127 @@
+/*
+ *    Copyright 2026 Two Sigma Open Source, LLC
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+/*
+ * Short-MIE-window lost-interrupt directed test.
+ *
+ * Root cause under test (trap_unit.sv): interrupt_pending is sampled from
+ * (mtip && mie.MTIE && mstatus.MIE) into a 1-cycle-late flop, and
+ * interrupt_pending_eligible then RE-CHECKS the LIVE mstatus.MIE/mie.MTIE when
+ * the sample matures. So a machine interrupt is only taken if the enable is high
+ * for TWO consecutive cycles (sample + service). A legal SHORT MIE-enable window
+ * -- e.g. `csrsi mstatus,8` immediately followed by `csrci mstatus,8` -- gets
+ * its already-qualified interrupt ERASED: the registered pending bit matures one
+ * cycle after csrsi, but the csrci's (delayed) side-effect has already driven
+ * mstatus.MIE back to 0, so interrupt_pending_eligible=0 and the pending bit is
+ * cleared without ever being serviced. Per RISC-V the interrupt MUST be taken at
+ * the instruction boundary right after the csrsi (before the csrci), so this is
+ * a dropped interrupt. On the real no-MMU kernel this is the lost machine-timer
+ * tick -> frozen jiffies -> boot hang (the same drop, usually opened by the trap
+ * being delayed a cycle by a draining store rather than a literal adjacent
+ * csrci).
+ *
+ * Setup: make the machine timer permanently pending (mtimecmp=0 => mtip high),
+ * enable mie.MTIE, leave mstatus.MIE=0. Then pulse MIE high for one cycle
+ * (csrsi; csrci) many times. A correct core takes the timer at the first pulse
+ * (the handler acks it); a buggy core erases it every pulse and never traps.
+ *
+ * PASS: g_taken >= 1 (the eligible timer was taken).
+ * FAIL: g_taken == 0 (the timer was eligible at every csrsi but never taken).
+ */
+
+#include <stdint.h>
+
+#include "trap.h"
+
+#define PULSES 256u
+
+volatile uint32_t g_taken; /* timer-trap count */
+
+static void uart_putc(char c)
+{
+    UART_TX = (uint8_t) c;
+}
+static void uart_puts(const char *s)
+{
+    while (*s)
+        uart_putc(*s++);
+}
+static void uart_hex(uint32_t v)
+{
+    static const char hex[] = "0123456789ABCDEF";
+    uart_puts("0x");
+    for (int i = 28; i >= 0; i -= 4)
+        uart_putc(hex[(v >> i) & 0xF]);
+}
+
+/* Naked handler: count the trap, ack the timer (push mtimecmp_hi to max so mtip
+ * drops and it cannot re-fire), MRET. */
+__attribute__((naked, aligned(4))) static void timer_handler(void)
+{
+    __asm__ volatile("addi sp, sp, -8\n"
+                     "sw   t0, 0(sp)\n"
+                     "sw   t1, 4(sp)\n"
+                     "lui  t0, %hi(g_taken)\n"
+                     "lw   t1, %lo(g_taken)(t0)\n"
+                     "addi t1, t1, 1\n"
+                     "sw   t1, %lo(g_taken)(t0)\n"
+                     "li   t0, 0x4000001C\n" /* MTIMECMP_HI */
+                     "li   t1, -1\n"
+                     "sw   t1, 0(t0)\n" /* mtimecmp = huge -> mtip low (ack) */
+                     "lw   t0, 0(sp)\n"
+                     "lw   t1, 4(sp)\n"
+                     "addi sp, sp, 8\n"
+                     "mret\n");
+}
+
+int main(void)
+{
+    uart_puts("\r\n=== short-MIE-window lost-interrupt test ===\r\n");
+    set_trap_handler(&timer_handler);
+    g_taken = 0;
+
+    /* Machine timer permanently pending (mtime >= 0 always), MTIE enabled,
+     * mstatus.MIE left 0 -- pending but masked. */
+    MTIMECMP_HI = 0;
+    MTIMECMP_LO = 0;
+    enable_timer_interrupt(); /* mie.MTIE = 1 */
+
+    /* Pulse mstatus.MIE high for a single cycle, repeatedly. Each csrsi makes the
+     * pending timer eligible at the very next instruction boundary; the adjacent
+     * csrci must NOT be able to retroactively cancel it. */
+    for (uint32_t i = 0; i < PULSES; i++) {
+        __asm__ volatile("csrsi mstatus, 8\n" /* mstatus.MIE = 1 (1-cycle window) */
+                         "csrci mstatus, 8\n" /* mstatus.MIE = 0 */
+                         ::
+                             : "memory");
+        if (g_taken)
+            break; /* taken once -> correct; acked, no point continuing */
+    }
+
+    disable_timer_interrupt();
+    uart_puts("taken=");
+    uart_hex(g_taken);
+    uart_puts("\r\n");
+    if (g_taken >= 1u) {
+        uart_puts("<<PASS>>\r\n");
+    } else {
+        uart_puts("[FAIL] eligible machine timer was erased by the adjacent MIE clear "
+                  "(never taken)\r\n<<FAIL>>\r\n");
+    }
+    for (;;) {
+    }
+    return 0;
+}
diff --git a/sw/apps/linux_boot/.gitignore b/sw/apps/linux_boot/.gitignore
new file mode 100644
index 00000000..902e8ed9
--- /dev/null
+++ b/sw/apps/linux_boot/.gitignore
@@ -0,0 +1,11 @@
+# All linux_boot images are build artifacts, produced by `make` here (the
+# Buildroot self-build + build_fpga_boot.py packer). None are committed.
+sw.mem
+sw.txt
+sw_ddr.mem
+sw_ddr.txt
+frost-nommu-fpga.dts
+frost-nommu-fpga.dtb
+frost_boot_shim.S
+shim.elf
+shim.bin
diff --git a/sw/apps/linux_boot/Makefile b/sw/apps/linux_boot/Makefile
new file mode 100644
index 00000000..778b083d
--- /dev/null
+++ b/sw/apps/linux_boot/Makefile
@@ -0,0 +1,133 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+# Builds the FROST memory images for the no-MMU Linux boot straight from source
+# in this repo: the kernel + busybox initramfs come from the vendored Buildroot
+# submodule (linux/buildroot) driven by the FROST BR2_EXTERNAL tree
+# (linux/buildroot-external), and build_fpga_boot.py packs them into the low-BRAM
+# boot shim (sw.{mem,txt}) plus the DDR image (sw_ddr.{mem,txt}) that cocotb's
+# linux_boot test and fpga/load_software consume. No hand-built artifacts and no
+# out-of-repo paths -- a fresh `git clone --recurse-submodules` can build it.
+#
+# Two stages, deliberately split so re-loading for a different board is cheap:
+#
+#   Stage 1 (slow, board-INDEPENDENT): Buildroot builds its own rv32 uClibc cross
+#     toolchain, the kernel Image and the rootfs.cpio.gz. ~30-60 min the first
+#     time; cached in linux/build afterwards. Only runs when the kernel Image is
+#     absent, so it does not re-run on every load.
+#
+#   Stage 2 (fast, board-DEPENDENT): build_fpga_boot.py regenerates the DTB
+#     (timebase + UART clock = FPGA_CPU_CLK_FREQ) and repacks the DDR image, then
+#     patch_ret_from_exception.py applies the M-mode timer restore-window patch.
+#     `load_software.py <board> linux_boot` sets FPGA_CPU_CLK_FREQ per board
+#     (genesys2 = 133.33 MHz, x3 = 300 MHz) and `make clean`s first, so the DDR
+#     image is always repacked for the target board without a kernel rebuild.
+#
+# The old footgun (sw_ddr.mem had no prerequisites, so it never regenerated when
+# the underlying image changed) is gone: sw_ddr.mem now depends on the kernel
+# Image and the packer, so a newer kernel or packer forces a repack.
+#
+# The patch clears mstatus.MIE in the MRET restore window: the M-mode variant of
+# the timer-interrupt-resume-PC race is not yet fixed in hardware, so an
+# unpatched kernel hangs at the CLINT clocksource switch once the periodic tick
+# ramps up. It locates its target by unique machine-code word, so it survives
+# kernel rebuilds. Drop it once the M-mode restore window is fixed in RTL.
+
+# Repo-root-relative paths (this Makefile lives in sw/apps/linux_boot).
+FROST_ROOT   := $(abspath $(CURDIR)/../../..)
+BR2_SRC      := $(FROST_ROOT)/linux/buildroot
+BR2_EXTERNAL := $(FROST_ROOT)/linux/buildroot-external
+BR2_OUT      := $(FROST_ROOT)/linux/build
+BOARD_DIR    := $(BR2_EXTERNAL)/board/frost
+IMAGES       := $(BR2_OUT)/images
+KIMAGE       := $(IMAGES)/Image
+INITRD_IMG   := $(IMAGES)/rootfs.cpio.gz
+
+# DTB timebase + UART clock. load_software.py exports this per board; default is
+# genesys2 (133.33 MHz) for a bare `make` on a dev box.
+FPGA_CPU_CLK_FREQ ?= 133333333
+
+# The FROST cross prefix for the tiny boot shim (build_fpga_boot.py's default is
+# riscv-none-elf-, the xPack bare-metal toolchain that load_software.py / the
+# Docker image provide). Honor RISCV_PREFIX if the environment set one.
+ifdef RISCV_PREFIX
+SHIM_CROSS := FROST_CROSS_COMPILE=$(RISCV_PREFIX)
+endif
+
+.PHONY: all clean distclean
+
+# --- Prebuilt-image mode (CI) -------------------------------------------------
+# The CI linux-boot-cocotb job downloads the images that build-frost-linux
+# produced and stages them here; the cocotb runner then does `make clean` +
+# `make` on this app (compile_app clean_first=True). Without a kernel build
+# tree in the clone, that clean would delete the staged images and the build
+# would kick off a full in-job Buildroot rebuild that overwrites them.
+# FROST_LINUX_PREBUILT=1 declares the staged images authoritative: `make`
+# verifies they exist and touches nothing; `make clean` keeps them and only
+# removes packer scratch files.
+ifeq ($(FROST_LINUX_PREBUILT),1)
+
+all:
+	@test -f sw.mem -a -f sw_ddr.mem || { \
+	  echo "ERROR: FROST_LINUX_PREBUILT=1 but sw.mem / sw_ddr.mem are not staged here."; \
+	  exit 1; }
+	@echo "linux_boot: FROST_LINUX_PREBUILT=1 -> using staged images as-is"
+
+clean:
+	rm -f shim.elf shim.bin frost_boot_shim.S \
+	      frost-nommu-fpga.dts frost-nommu-fpga.dtb
+
+distclean: clean
+
+else  # --- Normal self-build mode ---------------------------------------------
+
+# sw_ddr.mem's recipe emits all four images at once, so it is the canonical goal.
+all: sw_ddr.mem
+
+# Stage 1: kernel Image + rootfs via Buildroot. Skipped when already built.
+# Grouped target (&:): one Buildroot invocation produces both files (a plain
+# multi-target rule would run the recipe once per missing file, and in a -j
+# build could even run two Buildroot makes concurrently in the same O= dir).
+$(KIMAGE) $(INITRD_IMG) &:
+	@test -f "$(BR2_SRC)/Makefile" || { \
+	  echo "ERROR: buildroot submodule not initialized."; \
+	  echo "  Run: git submodule update --init linux/buildroot"; \
+	  exit 1; }
+	@echo ">>> linux_boot: building kernel + rootfs via Buildroot (first build ~30-60 min)..."
+	$(MAKE) -C "$(BR2_SRC)" O="$(BR2_OUT)" BR2_EXTERNAL="$(BR2_EXTERNAL)" frost_nommu_rv32_defconfig
+	$(MAKE) -C "$(BR2_SRC)" O="$(BR2_OUT)"
+
+# Stage 2: pack for this board's clock, then apply the M-mode timer patch.
+sw_ddr.mem sw.mem sw_ddr.txt sw.txt: $(KIMAGE) $(INITRD_IMG) $(BOARD_DIR)/build_fpga_boot.py patch_ret_from_exception.py
+	FPGA_CPU_CLK_FREQ=$(FPGA_CPU_CLK_FREQ) \
+	  FROST_IMAGE="$(KIMAGE)" \
+	  FROST_INITRD="$(INITRD_IMG)" \
+	  FROST_OUTDIR="$(CURDIR)" \
+	  $(SHIM_CROSS) \
+	  python3 "$(BOARD_DIR)/build_fpga_boot.py"
+	python3 ./patch_ret_from_exception.py ./sw_ddr.mem ./sw_ddr.txt
+
+# Light clean: drop only the board-dependent pack outputs (Stage 2). The cached
+# Buildroot kernel/rootfs (Stage 1) survive, so a re-pack for another board is
+# fast. This is what load_software.py runs before every load.
+clean:
+	rm -f sw.mem sw.txt sw_ddr.mem sw_ddr.txt \
+	      shim.elf shim.bin frost_boot_shim.S \
+	      frost-nommu-fpga.dts frost-nommu-fpga.dtb
+
+# Full clean: also wipe the (slow) Buildroot build tree, forcing a Stage 1 rebuild.
+distclean: clean
+	rm -rf "$(BR2_OUT)"
+
+endif  # FROST_LINUX_PREBUILT
diff --git a/sw/apps/linux_boot/patch_ret_from_exception.py b/sw/apps/linux_boot/patch_ret_from_exception.py
new file mode 100644
index 00000000..191d34c8
--- /dev/null
+++ b/sw/apps/linux_boot/patch_ret_from_exception.py
@@ -0,0 +1,1092 @@
+#!/usr/bin/env python3
+
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+"""Patch the temporary Linux bring-up image for current bring-up hazards.
+
+The external linux-mvp tree currently builds a debug kernel whose
+ret_from_exception sequence contains:
+
+    lw   a2, PT_EPC(sp)
+    sc.w zero, a2, (sp)
+    csrw mstatus, a0
+    csrw mepc, a2
+    ...
+    mret
+
+If the restored mstatus image has MIE set, the timer can preempt between the
+CSR write and MRET (an M-mode restore-window race). The trap then saves mepc at
+the MRET instruction itself, which later returns into MRET as user code and
+produces SIGILL at ret_from_exception+0x76. (The U-mode variant of that race is
+fixed in hardware -- cpu_ooo.sv seeds interrupt_resume_pc from csr_mepc on
+mret_taken -- but the M-mode restore-window variant is not yet, so this software
+crutch is still required: without it the unpatched kernel hangs at the CLINT
+clocksource switch once the periodic timer tick ramps up.)
+
+For bring-up, replace the reservation-clear SC with `andi a0, a0, -9`, clearing
+MIE in the value written to mstatus.  MRET still restores the final
+interrupt-enable state from MPIE, but the restore window is not interruptible.
+
+The target instruction is located by its unique machine-code word
+(`18c1202f`) rather than a fixed offset, so the patch survives kernel rebuilds
+that shift ret_from_exception. If the word is absent the image is assumed
+already patched (idempotent); if it occurs more than once the patch aborts
+rather than risk hitting the wrong site.
+
+Set FROST_LINUX_BOOTARGS to rewrite /chosen/bootargs in the generated DTB. This
+is useful for hardware-only boot triage such as forcing initramfs_async=0 without
+modifying the external linux-mvp artifact generator.
+
+Set FROST_LINUX_NOOP_FUNCTIONS to rewrite selected kernel functions to
+`li a0,0; ret` in the generated DDR images. This is a hardware bring-up escape
+hatch for narrow isolation runs; do not use it for correctness testing.
+
+Set FROST_LINUX_BUSYBOX to replace bin/busybox in the generated initramfs.
+This is a bring-up hook for testing BFLT header changes without rebuilding the
+external Buildroot tree.
+"""
+
+from __future__ import annotations
+
+import argparse
+import gzip
+import os
+import shutil
+import stat
+import struct
+import subprocess
+import tempfile
+from pathlib import Path
+
+
+OLD_WORD = "18c1202f"  # sc.w zero, a2, (sp) -- ret_from_exception reservation clear
+NEW_WORD = "ff757513"  # andi a0, a0, -9    -- clear mstatus.MIE in the restore value
+
+DTB_WORD = 0x200000
+INITRD_WORD = 0x204000
+KERNEL_ENTRY = 0x80000000
+FDT_MAGIC = 0xD00DFEED
+CPIO_NEWC_MAGIC = b"070701"
+CPIO_TRAILER = "TRAILER!!!"
+NOOP_INITCALL_PATCH = b"\x01\x45\x82\x80"  # c.li a0,0; c.ret
+CPU_RELAX_DIV_SYMBOL = "__delay"
+CPU_RELAX_DIV_OFFSET = 0x1C
+CPU_RELAX_DIV_OLD = b"\xb3\xc7\x07\x02"  # div a5,a5,zero
+CPU_RELAX_DIV_NEW = b"\x13\x00\x00\x00"  # nop
+CPU_RELAX_PAUSE_OFFSET = 0x20
+CPU_RELAX_PAUSE_OLD = b"\x0f\x00\x00\x01"  # pause / fence hint
+CPU_RELAX_PAUSE_NEW = b"\x13\x00\x00\x00"  # nop
+PROC_GET_INODE_MODE_RELOAD_OLD = b"\x83\xd7\x04\x00"  # lhu a5,0(s1)
+PROC_GET_INODE_MODE_RELOAD_NEW = b"\x83\x57\x09\x06"  # lhu a5,96(s2)
+PROC_GET_INODE_MODE_RELOAD_ADDRS = (0x001071B2, 0x00107220)
+PROC_GET_INODE_MODE_LOAD_ADDR = 0x0010718C
+PROC_GET_INODE_MODE_LOAD_OLD = b"\x83\x57\x09\x06"  # lhu a5,96(s2)
+PROC_GET_INODE_MODE_FORCE_REG = b"\xb7\x87\x00\x00"  # lui a5,0x8 (S_IFREG)
+PROC_LOOKUP_REF_AMO_ADDR = 0x0010BC82
+PROC_LOOKUP_REF_AMO_OLD = b"\x2f\x27\xb5\x00"  # amoadd.w a4,a1,(a0)
+PROC_LOOKUP_REF_AMO_CONST = b"\x13\x07\x10\x00"  # addi a4,zero,1
+PROC_LOOKUP_DE_ADJUST_ADDR = 0x0010BC7C
+PROC_LOOKUP_DE_ADJUST_OLD = b"\xaa\x87\x85\x45"  # mv a5,a0; li a1,1
+PROC_LOOKUP_DE_ADJUST_NEW = b"\x93\x07\x05\xfb"  # addi a5,a0,-80
+DEFAULT_SYSTEM_MAP = Path(
+    os.path.expanduser(
+        "~/bigger_l0/linux-mvp/buildroot/output/build/linux-6.18.7/System.map"
+    )
+)
+INITRD_DEVICES = {
+    "dev/console": (stat.S_IFCHR | 0o600, 5, 1),
+    "dev/null": (stat.S_IFCHR | 0o666, 1, 3),
+    "dev/random": (stat.S_IFCHR | 0o666, 1, 8),
+    "dev/ttyS0": (stat.S_IFCHR | 0o600, 4, 64),
+    "dev/urandom": (stat.S_IFCHR | 0o666, 1, 9),
+}
+DIAG_SHELL_INITTAB = """\
+console::sysinit:/bin/echo FROST_DIAG_INITTAB_START
+::sysinit:/bin/mount -t proc proc /proc
+::sysinit:/bin/mount -o remount,rw /
+::sysinit:/bin/mkdir -p /dev/pts /dev/shm /run/lock/subsys /tmp /sys
+::sysinit:/bin/mount -a
+console::sysinit:/bin/echo FROST_DIAG_INITTAB_AFTER_RCS
+console::respawn:/bin/sh
+::shutdown:/bin/umount -a -r
+"""
+SEEDRNG_NOOP = """\
+#!/bin/sh
+# FPGA bring-up has no hardware entropy source; seedrng can block PID 1 forever.
+exit 0
+"""
+
+
+def patch_ret_restore_window(path: Path) -> None:
+    """Patch the single OLD_WORD occurrence to NEW_WORD.
+
+    Works for both the dense FPGA-loader form (one word per line) and the
+    $readmemh form (skips '@<addr>' directives and blank lines).
+    """
+    lines = path.read_text().splitlines()
+    old_hits = []
+    new_hits = 0
+    for i, line in enumerate(lines):
+        s = line.strip().lower()
+        if not s or s.startswith("@"):
+            continue
+        if s == OLD_WORD:
+            old_hits.append(i)
+        elif s == NEW_WORD:
+            new_hits += 1
+    if not old_hits:
+        if new_hits:
+            return  # already patched
+        raise SystemExit(
+            f"{path}: target word {OLD_WORD} not found (and not already patched)"
+        )
+    if len(old_hits) > 1:
+        raise SystemExit(
+            f"{path}: {OLD_WORD} occurs {len(old_hits)}x; ambiguous, refusing to patch"
+        )
+    lines[old_hits[0]] = NEW_WORD
+    path.write_text("\n".join(lines) + "\n")
+
+
+def split_env_names(value: str) -> list[str]:
+    """Parse value (space/comma-separated) into a deduplicated ordered list of names."""
+    names: list[str] = []
+    seen: set[str] = set()
+    for raw_name in value.replace(",", " ").split():
+        name = raw_name.strip()
+        if not name or name in seen:
+            continue
+        names.append(name)
+        seen.add(name)
+    return names
+
+
+def resolve_system_map_symbols(system_map: Path, names: list[str]) -> dict[str, int]:
+    """Look up symbol names to byte addresses in a Linux System.map file."""
+    if not names:
+        return {}
+    if not system_map.exists():
+        raise SystemExit(f"System.map not found: {system_map}")
+
+    wanted = set(names)
+    resolved: dict[str, int] = {}
+    for line in system_map.read_text().splitlines():
+        parts = line.split()
+        if len(parts) < 3:
+            continue
+        addr, _kind, symbol = parts[:3]
+        if symbol in wanted:
+            resolved[symbol] = int(addr, 16)
+
+    missing = [name for name in names if name not in resolved]
+    if missing:
+        raise SystemExit(f"{system_map}: missing symbol(s): " + " ".join(missing))
+    return resolved
+
+
+def patch_word_byte(word: str, byte_offset: int, value: int) -> str:
+    """Patch one byte within a little-endian 4-byte hex word string and return the new word."""
+    data = bytearray(struct.pack("<I", int(word, 16)))
+    data[byte_offset] = value
+    return f"{struct.unpack('<I', data)[0]:08x}"
+
+
+def patch_dense_code_bytes(path: Path, patches: dict[int, bytes]) -> None:
+    """Apply byte-level patches to a dense (one-word-per-line) hex image file."""
+    words = [
+        line.strip().lower() for line in path.read_text().splitlines() if line.strip()
+    ]
+    for byte_addr, patch in patches.items():
+        for byte_idx, value in enumerate(patch):
+            absolute_byte = byte_addr + byte_idx
+            word_idx = absolute_byte // 4
+            byte_offset = absolute_byte % 4
+            if word_idx >= len(words):
+                raise SystemExit(
+                    f"{path}: patch address 0x{absolute_byte:x} is outside dense image"
+                )
+            words[word_idx] = patch_word_byte(words[word_idx], byte_offset, value)
+    path.write_text("\n".join(words) + "\n")
+
+
+def patch_sparse_code_bytes(path: Path, patches: dict[int, bytes]) -> None:
+    """Apply byte-level patches to a sparse (@addr-directive) hex image file."""
+    lines = path.read_text().splitlines()
+    word_line_by_addr: dict[int, int] = {}
+    current_word_addr = 0
+    for idx, line in enumerate(lines):
+        stripped = line.strip().lower()
+        if not stripped:
+            continue
+        if stripped.startswith("@"):
+            current_word_addr = int(stripped[1:], 16)
+            continue
+        word_line_by_addr[current_word_addr] = idx
+        current_word_addr += 1
+
+    for byte_addr, patch in patches.items():
+        for byte_idx, value in enumerate(patch):
+            absolute_byte = byte_addr + byte_idx
+            word_addr = absolute_byte // 4
+            byte_offset = absolute_byte % 4
+            line_idx = word_line_by_addr.get(word_addr)
+            if line_idx is None:
+                raise SystemExit(
+                    f"{path}: patch address 0x{absolute_byte:x} is outside sparse image"
+                )
+            lines[line_idx] = patch_word_byte(
+                lines[line_idx].strip().lower(), byte_offset, value
+            )
+    path.write_text("\n".join(lines) + "\n")
+
+
+def patch_code_bytes(path: Path, patches: dict[int, bytes]) -> None:
+    """Dispatch to dense or sparse patcher based on image format and apply patches."""
+    if not patches:
+        return
+    for line in path.read_text().splitlines():
+        stripped = line.strip()
+        if not stripped:
+            continue
+        if stripped.startswith("@"):
+            patch_sparse_code_bytes(path, patches)
+        else:
+            patch_dense_code_bytes(path, patches)
+        return
+    raise SystemExit(f"{path}: empty Linux DDR image")
+
+
+def patch_noop_return_zero(path: Path, symbols: dict[str, int]) -> None:
+    """Patch each symbol address with the NOOP_INITCALL_PATCH byte sequence."""
+    patch_code_bytes(path, {addr: NOOP_INITCALL_PATCH for addr in symbols.values()})
+
+
+def read_dense_code_bytes(path: Path, byte_addr: int, size: int) -> bytes:
+    """Read size bytes at byte_addr from a dense hex image file."""
+    words = [
+        line.strip().lower() for line in path.read_text().splitlines() if line.strip()
+    ]
+    data = bytearray()
+    for byte_idx in range(size):
+        absolute_byte = byte_addr + byte_idx
+        word_idx = absolute_byte // 4
+        byte_offset = absolute_byte % 4
+        if word_idx >= len(words):
+            raise SystemExit(
+                f"{path}: read address 0x{absolute_byte:x} is outside dense image"
+            )
+        data.append(struct.pack("<I", int(words[word_idx], 16))[byte_offset])
+    return bytes(data)
+
+
+def read_sparse_code_bytes(path: Path, byte_addr: int, size: int) -> bytes:
+    """Read size bytes at byte_addr from a sparse (@addr-directive) hex image file."""
+    lines = path.read_text().splitlines()
+    word_by_addr: dict[int, str] = {}
+    current_word_addr = 0
+    for line in lines:
+        stripped = line.strip().lower()
+        if not stripped:
+            continue
+        if stripped.startswith("@"):
+            current_word_addr = int(stripped[1:], 16)
+            continue
+        word_by_addr[current_word_addr] = stripped
+        current_word_addr += 1
+
+    data = bytearray()
+    for byte_idx in range(size):
+        absolute_byte = byte_addr + byte_idx
+        word_addr = absolute_byte // 4
+        byte_offset = absolute_byte % 4
+        word = word_by_addr.get(word_addr)
+        if word is None:
+            raise SystemExit(
+                f"{path}: read address 0x{absolute_byte:x} is outside sparse image"
+            )
+        data.append(struct.pack("<I", int(word, 16))[byte_offset])
+    return bytes(data)
+
+
+def read_code_bytes(path: Path, byte_addr: int, size: int) -> bytes:
+    """Dispatch to dense or sparse reader based on image format."""
+    for line in path.read_text().splitlines():
+        stripped = line.strip()
+        if not stripped:
+            continue
+        if stripped.startswith("@"):
+            return read_sparse_code_bytes(path, byte_addr, size)
+        return read_dense_code_bytes(path, byte_addr, size)
+    raise SystemExit(f"{path}: empty Linux DDR image")
+
+
+def patch_cpu_relax_div(path: Path, delay_addr: int) -> None:
+    """Patch the div-by-zero instruction inside cpu_relax (__delay+0x1C) to a NOP."""
+    patch_addr = delay_addr + CPU_RELAX_DIV_OFFSET
+    current = read_code_bytes(path, patch_addr, len(CPU_RELAX_DIV_OLD))
+    if current not in (CPU_RELAX_DIV_OLD, CPU_RELAX_DIV_NEW):
+        raise SystemExit(
+            f"{path}: {CPU_RELAX_DIV_SYMBOL}+0x{CPU_RELAX_DIV_OFFSET:x} "
+            f"at 0x{patch_addr:08x} has {current.hex()}, expected "
+            f"{CPU_RELAX_DIV_OLD.hex()}"
+        )
+    patch_code_bytes(path, {patch_addr: CPU_RELAX_DIV_NEW})
+
+
+def patch_cpu_relax_pause(path: Path, delay_addr: int) -> None:
+    """Patch the pause fence hint inside cpu_relax (__delay+0x20) to a NOP."""
+    patch_addr = delay_addr + CPU_RELAX_PAUSE_OFFSET
+    current = read_code_bytes(path, patch_addr, len(CPU_RELAX_PAUSE_OLD))
+    if current not in (CPU_RELAX_PAUSE_OLD, CPU_RELAX_PAUSE_NEW):
+        raise SystemExit(
+            f"{path}: {CPU_RELAX_DIV_SYMBOL}+0x{CPU_RELAX_PAUSE_OFFSET:x} "
+            f"at 0x{patch_addr:08x} has {current.hex()}, expected "
+            f"{CPU_RELAX_PAUSE_OLD.hex()}"
+        )
+    patch_code_bytes(path, {patch_addr: CPU_RELAX_PAUSE_NEW})
+
+
+def patch_proc_get_inode_mode_reload(path: Path) -> None:
+    """Patch all proc_get_inode mode-reload instructions to the new encoding."""
+    patches: dict[int, bytes] = {}
+    for addr in PROC_GET_INODE_MODE_RELOAD_ADDRS:
+        current = read_code_bytes(path, addr, len(PROC_GET_INODE_MODE_RELOAD_OLD))
+        if current not in (
+            PROC_GET_INODE_MODE_RELOAD_OLD,
+            PROC_GET_INODE_MODE_RELOAD_NEW,
+        ):
+            raise SystemExit(
+                f"{path}: proc_get_inode mode reload at 0x{addr:08x} "
+                f"has {current.hex()}, expected {PROC_GET_INODE_MODE_RELOAD_OLD.hex()}"
+            )
+        patches[addr] = PROC_GET_INODE_MODE_RELOAD_NEW
+    patch_code_bytes(path, patches)
+
+
+def patch_proc_get_inode_force_mode_reg(path: Path) -> None:
+    """Patch proc_get_inode to force the mode load through a register."""
+    current = read_code_bytes(
+        path, PROC_GET_INODE_MODE_LOAD_ADDR, len(PROC_GET_INODE_MODE_LOAD_OLD)
+    )
+    if current not in (PROC_GET_INODE_MODE_LOAD_OLD, PROC_GET_INODE_MODE_FORCE_REG):
+        raise SystemExit(
+            f"{path}: proc_get_inode mode load at 0x{PROC_GET_INODE_MODE_LOAD_ADDR:08x} "
+            f"has {current.hex()}, expected {PROC_GET_INODE_MODE_LOAD_OLD.hex()}"
+        )
+    patch_code_bytes(
+        path, {PROC_GET_INODE_MODE_LOAD_ADDR: PROC_GET_INODE_MODE_FORCE_REG}
+    )
+
+
+def patch_proc_lookup_ref_const(path: Path) -> None:
+    """Replace the proc_lookup_de refcount AMO with a constant-store encoding."""
+    current = read_code_bytes(
+        path, PROC_LOOKUP_REF_AMO_ADDR, len(PROC_LOOKUP_REF_AMO_OLD)
+    )
+    if current not in (PROC_LOOKUP_REF_AMO_OLD, PROC_LOOKUP_REF_AMO_CONST):
+        raise SystemExit(
+            f"{path}: proc_lookup_de refcount AMO at 0x{PROC_LOOKUP_REF_AMO_ADDR:08x} "
+            f"has {current.hex()}, expected {PROC_LOOKUP_REF_AMO_OLD.hex()}"
+        )
+    patch_code_bytes(path, {PROC_LOOKUP_REF_AMO_ADDR: PROC_LOOKUP_REF_AMO_CONST})
+
+
+def patch_proc_lookup_de_adjust(path: Path) -> None:
+    """Patch the proc_lookup_de returned-de pointer-adjustment instruction."""
+    current = read_code_bytes(
+        path, PROC_LOOKUP_DE_ADJUST_ADDR, len(PROC_LOOKUP_DE_ADJUST_OLD)
+    )
+    if current not in (PROC_LOOKUP_DE_ADJUST_OLD, PROC_LOOKUP_DE_ADJUST_NEW):
+        raise SystemExit(
+            f"{path}: proc_lookup_de returned-de adjust at "
+            f"0x{PROC_LOOKUP_DE_ADJUST_ADDR:08x} has {current.hex()}, expected "
+            f"{PROC_LOOKUP_DE_ADJUST_OLD.hex()}"
+        )
+    patch_code_bytes(path, {PROC_LOOKUP_DE_ADJUST_ADDR: PROC_LOOKUP_DE_ADJUST_NEW})
+
+
+def words_to_bytes(words: list[str]) -> bytes:
+    """Pack a list of little-endian 8-hex-digit word strings into bytes."""
+    return b"".join(struct.pack("<I", int(word, 16)) for word in words)
+
+
+def bytes_to_words(data: bytes) -> list[str]:
+    """Unpack bytes into a list of little-endian 8-hex-digit word strings."""
+    if len(data) % 4:
+        data += b"\x00" * (4 - len(data) % 4)
+    return [
+        f"{struct.unpack_from('<I', data, i)[0]:08x}" for i in range(0, len(data), 4)
+    ]
+
+
+def fdt_total_size(data: bytes) -> int:
+    """Return the total_size field from a FDT blob after validating the magic."""
+    if len(data) < 8:
+        raise SystemExit("DTB slot is too small to contain an FDT header")
+    magic, total_size = struct.unpack_from(">II", data, 0)
+    if magic != FDT_MAGIC:
+        raise SystemExit(
+            f"DTB magic mismatch: got 0x{magic:08x}, expected 0x{FDT_MAGIC:08x}"
+        )
+    if total_size > len(data):
+        raise SystemExit(
+            f"DTB total size {total_size} exceeds extracted slot {len(data)}"
+        )
+    return total_size
+
+
+def padded_dtb_slot(words: list[str]) -> bytes:
+    """Extract and zero-pad a DTB from a word list to its declared total_size."""
+    data = words_to_bytes(words)
+    if len(data) < 8:
+        raise SystemExit("DTB slot is too small to contain an FDT header")
+    magic, total_size = struct.unpack_from(">II", data, 0)
+    if magic != FDT_MAGIC:
+        raise SystemExit(
+            f"DTB magic mismatch: got 0x{magic:08x}, expected 0x{FDT_MAGIC:08x}"
+        )
+    if total_size > len(data):
+        data += b"\x00" * (total_size - len(data))
+    return data
+
+
+def fdt_tool(name: str) -> str:
+    """Locate an FDT command-line tool on PATH or raise SystemExit if absent."""
+    tool = shutil.which(name)
+    if not tool:
+        raise SystemExit(f"{name} is required in PATH")
+    return tool
+
+
+def run_fdtget_u32(dtb_path: Path, prop: str) -> int:
+    """Read a single hex /chosen property from a DTB file using fdtget."""
+    result = subprocess.run(
+        [fdt_tool("fdtget"), "-t", "x", str(dtb_path), "/chosen", prop],
+        check=True,
+        capture_output=True,
+        text=True,
+    )
+    words = result.stdout.split()
+    if len(words) != 1:
+        raise SystemExit(f"{dtb_path}: expected one {prop} cell, got {result.stdout!r}")
+    return int(words[0], 16)
+
+
+def rewrite_dtb(dtb_slot: bytes, bootargs: str | None, initrd_end: int | None) -> bytes:
+    """Rewrite bootargs and linux,initrd-end in a DTB blob using fdtput."""
+    fdtput = shutil.which("fdtput")
+    if not fdtput:
+        raise SystemExit("DTB rewriting requires fdtput in PATH")
+
+    total_size = fdt_total_size(dtb_slot)
+    old_dtb = dtb_slot[:total_size]
+    with tempfile.TemporaryDirectory(prefix="frost_dtb_") as tmp:
+        dtb_path = Path(tmp) / "frost.dtb"
+        dtb_path.write_bytes(old_dtb)
+        if bootargs is not None:
+            subprocess.run(
+                [fdtput, "-t", "s", str(dtb_path), "/chosen", "bootargs", bootargs],
+                check=True,
+            )
+        if initrd_end is not None:
+            subprocess.run(
+                [
+                    fdtput,
+                    "-t",
+                    "x",
+                    str(dtb_path),
+                    "/chosen",
+                    "linux,initrd-end",
+                    f"0x{initrd_end:08x}",
+                ],
+                check=True,
+            )
+        serial_irq_mode = os.environ.get("FROST_LINUX_SERIAL_IRQ_MODE", "poll")
+        if serial_irq_mode == "poll":
+            subprocess.run(
+                [
+                    fdtput,
+                    "-d",
+                    str(dtb_path),
+                    "/soc/serial@40001000",
+                    "interrupts-extended",
+                ],
+                check=False,
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+            )
+        elif serial_irq_mode == "cpu-local-meip":
+            subprocess.run(
+                [
+                    fdtput,
+                    "-t",
+                    "x",
+                    str(dtb_path),
+                    "/soc/serial@40001000",
+                    "interrupts-extended",
+                    "0x00000001",
+                    "0x0000000b",
+                ],
+                check=True,
+            )
+        else:
+            raise SystemExit(f"unknown FROST_LINUX_SERIAL_IRQ_MODE={serial_irq_mode!r}")
+        new_dtb = dtb_path.read_bytes()
+
+    if len(new_dtb) > (INITRD_WORD - DTB_WORD) * 4:
+        raise SystemExit(
+            f"patched DTB is {len(new_dtb)} bytes; only "
+            f"{(INITRD_WORD - DTB_WORD) * 4} bytes available before initrd"
+        )
+    return new_dtb
+
+
+def get_initrd_bounds(dtb_slot: bytes) -> tuple[int, int]:
+    """Read the initrd start and end byte addresses from a DTB blob using fdtget."""
+    total_size = fdt_total_size(dtb_slot)
+    with tempfile.TemporaryDirectory(prefix="frost_dtb_") as tmp:
+        dtb_path = Path(tmp) / "frost.dtb"
+        dtb_path.write_bytes(dtb_slot[:total_size])
+        start = run_fdtget_u32(dtb_path, "linux,initrd-start")
+        end = run_fdtget_u32(dtb_path, "linux,initrd-end")
+    if end < start:
+        raise SystemExit(f"invalid initrd bounds: start=0x{start:08x}, end=0x{end:08x}")
+    if start < KERNEL_ENTRY or (start - KERNEL_ENTRY) % 4:
+        raise SystemExit(f"unsupported initrd start: 0x{start:08x}")
+    return start, end
+
+
+def newc_pad(n: int) -> int:
+    """Return the number of padding bytes to reach the next 4-byte CPIO alignment boundary."""
+    return (-n) & 3
+
+
+def parse_newc_entry(data: bytes, offset: int) -> tuple[str, list[int], int, int, int]:
+    """Parse one CPIO newc entry, returning name, fields, body_start, next_offset, and file_size."""
+    if offset + 110 > len(data) or data[offset : offset + 6] != CPIO_NEWC_MAGIC:
+        raise SystemExit(f"initramfs is not a valid newc archive at byte {offset}")
+    fields = [
+        int(data[offset + 6 + idx * 8 : offset + 14 + idx * 8], 16) for idx in range(13)
+    ]
+    file_size = fields[6]
+    name_size = fields[11]
+    name_start = offset + 110
+    name_end = name_start + name_size
+    if name_end > len(data):
+        raise SystemExit(f"initramfs newc entry at byte {offset} has truncated name")
+    name = data[name_start : name_end - 1].decode("utf-8")
+    body_start = name_end + newc_pad(name_end)
+    next_offset = body_start + file_size + newc_pad(body_start + file_size)
+    if next_offset > len(data):
+        raise SystemExit(f"initramfs newc entry {name!r} at byte {offset} is truncated")
+    return name, fields, body_start, next_offset, file_size
+
+
+def find_newc_trailer(data: bytes) -> tuple[int, set[str]]:
+    """Scan a CPIO newc archive for the TRAILER entry and return its offset and all filenames seen."""
+    offset = 0
+    names: set[str] = set()
+    while offset < len(data):
+        name, _fields, _body_start, next_offset, _file_size = parse_newc_entry(
+            data, offset
+        )
+        names.add(name)
+        if name == CPIO_TRAILER:
+            return offset, names
+        offset = next_offset
+    raise SystemExit("initramfs newc archive has no TRAILER!!! entry")
+
+
+def make_newc_entry(
+    name: str,
+    mode: int,
+    rdev_major: int,
+    rdev_minor: int,
+    ino: int,
+    data: bytes = b"",
+    uid: int = 0,
+    gid: int = 0,
+    nlink: int = 1,
+    mtime: int = 0,
+    dev_major: int = 0,
+    dev_minor: int = 0,
+) -> bytes:
+    """Build a complete CPIO newc archive entry from name, mode, device numbers, and data."""
+    encoded_name = name.encode("utf-8") + b"\x00"
+    fields = [
+        ino,
+        mode,
+        uid,
+        gid,
+        nlink,
+        mtime,
+        len(data),
+        dev_major,
+        dev_minor,
+        rdev_major,
+        rdev_minor,
+        len(encoded_name),
+        0,  # check
+    ]
+    header = CPIO_NEWC_MAGIC + b"".join(
+        f"{field:08x}".encode("ascii") for field in fields
+    )
+    name_block = (
+        header + encoded_name + (b"\x00" * newc_pad(len(header) + len(encoded_name)))
+    )
+    return name_block + data + (b"\x00" * newc_pad(len(name_block) + len(data)))
+
+
+def make_newc_replacement_entry(name: str, fields: list[int], data: bytes) -> bytes:
+    """Rebuild a CPIO newc entry preserving the original metadata with new data."""
+    return make_newc_entry(
+        name,
+        fields[1],
+        fields[9],
+        fields[10],
+        fields[0],
+        data=data,
+        uid=fields[2],
+        gid=fields[3],
+        nlink=fields[4],
+        mtime=fields[5],
+        dev_major=fields[7],
+        dev_minor=fields[8],
+    )
+
+
+def patch_initramfs(
+    initrd_gz: bytes,
+    replacements: dict[str, bytes],
+    additions: dict[str, tuple[int, bytes]],
+    deletions: set[str],
+) -> tuple[bytes, list[str], list[str], list[str], list[str]]:
+    """Patch, add, and delete entries in a gzip-compressed CPIO initramfs."""
+    conflicts = (set(replacements) | set(additions)) & deletions
+    if conflicts:
+        raise SystemExit(
+            "initramfs paths cannot be both patched/added and deleted: "
+            + " ".join(sorted(conflicts))
+        )
+
+    initrd = gzip.decompress(initrd_gz)
+    trailer_offset, names = find_newc_trailer(initrd)
+    missing = [name for name in INITRD_DEVICES if name not in names]
+    existing_additions = set(additions) & names
+
+    if not missing and not replacements and not additions and not deletions:
+        return initrd_gz, [], [], [], []
+
+    patched_entries: list[bytes] = []
+    replaced: list[str] = []
+    deleted: list[str] = []
+    offset = 0
+    while offset < trailer_offset:
+        name, fields, body_start, next_offset, file_size = parse_newc_entry(
+            initrd, offset
+        )
+        if name in deletions:
+            deleted.append(name)
+        elif name in replacements:
+            patched_entries.append(
+                make_newc_replacement_entry(name, fields, replacements[name])
+            )
+            replaced.append(name)
+        elif name in existing_additions:
+            _mode, data = additions[name]
+            patched_entries.append(make_newc_replacement_entry(name, fields, data))
+            replaced.append(name)
+        else:
+            patched_entries.append(initrd[offset:next_offset])
+        offset = next_offset
+
+    for idx, name in enumerate(missing, start=0xF005700):
+        mode, major, minor = INITRD_DEVICES[name]
+        patched_entries.append(make_newc_entry(name, mode, major, minor, idx))
+    added_files: list[str] = []
+    for idx, (name, (mode, data)) in enumerate(additions.items(), start=0xF006700):
+        if name in names:
+            continue
+        patched_entries.append(make_newc_entry(name, mode, 0, 0, idx, data=data))
+        added_files.append(name)
+    trailer = make_newc_entry(CPIO_TRAILER, 0, 0, 0, 0)
+    patched = b"".join(patched_entries) + trailer
+
+    missing_replacements = sorted(set(replacements) - set(replaced))
+    if missing_replacements:
+        raise SystemExit(
+            "initramfs replacement target(s) not found: "
+            + " ".join(missing_replacements)
+        )
+    missing_deletions = sorted(deletions - set(deleted))
+    if missing_deletions:
+        raise SystemExit(
+            "initramfs deletion target(s) not found: " + " ".join(missing_deletions)
+        )
+    return gzip.compress(patched, mtime=0), missing, replaced, added_files, deleted
+
+
+def get_initramfs_replacements() -> dict[str, bytes]:
+    """Build the initramfs file-replacement map from FROST_LINUX_* environment variables."""
+    replacements = {
+        "etc/init.d/S01seedrng": SEEDRNG_NOOP.encode("utf-8"),
+    }
+    busybox_replacement = os.environ.get("FROST_LINUX_BUSYBOX")
+    if busybox_replacement:
+        replacements["bin/busybox"] = Path(busybox_replacement).read_bytes()
+    preset = os.environ.get("FROST_LINUX_INITTAB_PRESET")
+    raw_inittab = os.environ.get("FROST_LINUX_INITTAB")
+    if raw_inittab and preset:
+        raise SystemExit(
+            "set either FROST_LINUX_INITTAB or FROST_LINUX_INITTAB_PRESET, not both"
+        )
+    if preset == "diag-shell":
+        replacements["etc/inittab"] = DIAG_SHELL_INITTAB.encode("utf-8")
+        return replacements
+    if preset:
+        raise SystemExit(f"unknown FROST_LINUX_INITTAB_PRESET={preset!r}")
+    if raw_inittab:
+        replacements["etc/inittab"] = raw_inittab.replace("\\n", "\n").encode("utf-8")
+    return replacements
+
+
+def get_initramfs_additions() -> dict[str, tuple[int, bytes]]:
+    """Build the initramfs file-addition map from FROST_LINUX_* environment variables."""
+    additions: dict[str, tuple[int, bytes]] = {}
+    diag_init = os.environ.get("FROST_LINUX_DIAG_INIT")
+    if diag_init:
+        additions["frost_diag_init"] = (
+            stat.S_IFREG | 0o755,
+            Path(diag_init).read_bytes(),
+        )
+    return additions
+
+
+def get_initramfs_deletions() -> set[str]:
+    """Build the set of initramfs paths to delete from FROST_LINUX_* environment variables."""
+    deletions = set(
+        split_env_names(os.environ.get("FROST_LINUX_DELETE_INITRAMFS_NAMES", ""))
+    )
+    if os.environ.get("FROST_LINUX_DELETE_INITTAB") == "1":
+        deletions.add("etc/inittab")
+    return deletions
+
+
+def patch_dense_image(
+    path: Path,
+    bootargs: str | None,
+    initramfs_replacements: dict[str, bytes],
+    initramfs_additions: dict[str, tuple[int, bytes]],
+    initramfs_deletions: set[str],
+) -> tuple[list[str], list[str], list[str], list[str]]:
+    """Patch DTB and initramfs embedded in a dense Linux DDR hex image."""
+    words = [
+        line.strip().lower() for line in path.read_text().splitlines() if line.strip()
+    ]
+    if len(words) < INITRD_WORD:
+        raise SystemExit(f"{path}: dense DDR image is too short for DTB/initrd slots")
+
+    dtb_slot_words = words[DTB_WORD:INITRD_WORD]
+    dtb_slot = words_to_bytes(dtb_slot_words)
+    initrd_start, initrd_end = get_initrd_bounds(dtb_slot)
+    initrd_word = (initrd_start - KERNEL_ENTRY) // 4
+    if initrd_word != INITRD_WORD:
+        raise SystemExit(f"{path}: unexpected initrd word offset 0x{initrd_word:x}")
+    initrd_size = initrd_end - initrd_start
+    initrd_word_count = (initrd_size + 3) // 4
+    initrd_gz = words_to_bytes(words[INITRD_WORD : INITRD_WORD + initrd_word_count])[
+        :initrd_size
+    ]
+    new_initrd_gz, added_devices, replaced_files, added_files, deleted_files = (
+        patch_initramfs(
+            initrd_gz, initramfs_replacements, initramfs_additions, initramfs_deletions
+        )
+    )
+    new_initrd_end = initrd_start + len(new_initrd_gz)
+
+    new_dtb_words = bytes_to_words(rewrite_dtb(dtb_slot, bootargs, new_initrd_end))
+    if DTB_WORD + len(new_dtb_words) > INITRD_WORD:
+        raise SystemExit(f"{path}: patched DTB overlaps initrd")
+    new_initrd_words = bytes_to_words(new_initrd_gz)
+
+    words[DTB_WORD : DTB_WORD + len(new_dtb_words)] = new_dtb_words
+    for i in range(DTB_WORD + len(new_dtb_words), INITRD_WORD):
+        words[i] = "00000000"
+    words[INITRD_WORD:] = new_initrd_words
+    path.write_text("\n".join(words) + "\n")
+    return added_devices, replaced_files, added_files, deleted_files
+
+
+def patch_sparse_image(
+    path: Path,
+    bootargs: str | None,
+    initramfs_replacements: dict[str, bytes],
+    initramfs_additions: dict[str, tuple[int, bytes]],
+    initramfs_deletions: set[str],
+) -> tuple[list[str], list[str], list[str], list[str]]:
+    """Patch DTB and initramfs embedded in a sparse Linux DDR hex image."""
+
+    def is_gzip_first_word(word: str) -> bool:
+        try:
+            return (int(word, 16) & 0x00FF_FFFF) == 0x0008_8B1F
+        except ValueError:
+            return False
+
+    lines = path.read_text().splitlines()
+    dtb_directive = f"@{DTB_WORD:08x}"
+    initrd_directive = f"@{INITRD_WORD:08x}"
+    try:
+        dtb_line = next(
+            i for i, line in enumerate(lines) if line.strip().lower() == dtb_directive
+        )
+    except StopIteration as exc:
+        raise SystemExit(f"{path}: missing DTB address directive") from exc
+    initrd_line = next(
+        (i for i, line in enumerate(lines) if line.strip().lower() == initrd_directive),
+        None,
+    )
+    if initrd_line is not None and initrd_line <= dtb_line:
+        raise SystemExit(f"{path}: initrd directive appears before DTB directive")
+
+    dtb_slot_words = INITRD_WORD - DTB_WORD
+    sparse_payload_initrd_word = dtb_slot_words
+    if initrd_line is None:
+        payload_words = [
+            line.strip().lower() for line in lines[dtb_line + 1 :] if line.strip()
+        ]
+        if len(payload_words) > dtb_slot_words and is_gzip_first_word(
+            payload_words[dtb_slot_words]
+        ):
+            sparse_payload_initrd_word = dtb_slot_words
+            dtb_words = payload_words[:dtb_slot_words]
+        else:
+            gzip_word = next(
+                (
+                    idx
+                    for idx, word in enumerate(payload_words)
+                    if is_gzip_first_word(word)
+                ),
+                None,
+            )
+            if gzip_word is None:
+                raise SystemExit(
+                    f"{path}: missing initrd directive and gzip initrd header"
+                )
+            sparse_payload_initrd_word = gzip_word
+            dtb_words = payload_words[:gzip_word]
+        initrd_words = payload_words[sparse_payload_initrd_word:]
+    else:
+        dtb_words = [
+            line.strip().lower()
+            for line in lines[dtb_line + 1 : initrd_line]
+            if line.strip()
+        ]
+        initrd_words = [
+            line.strip().lower() for line in lines[initrd_line + 1 :] if line.strip()
+        ]
+    dtb_slot = padded_dtb_slot(dtb_words)
+    initrd_start, initrd_end = get_initrd_bounds(dtb_slot)
+    initrd_word = (initrd_start - KERNEL_ENTRY) // 4
+    if initrd_word != INITRD_WORD:
+        raise SystemExit(f"{path}: unexpected initrd word offset 0x{initrd_word:x}")
+    initrd_size = initrd_end - initrd_start
+    initrd_gz = words_to_bytes(initrd_words)[:initrd_size]
+    new_initrd_gz, added_devices, replaced_files, added_files, deleted_files = (
+        patch_initramfs(
+            initrd_gz, initramfs_replacements, initramfs_additions, initramfs_deletions
+        )
+    )
+    new_initrd_end = initrd_start + len(new_initrd_gz)
+
+    new_dtb_words = bytes_to_words(rewrite_dtb(dtb_slot, bootargs, new_initrd_end))
+    if DTB_WORD + len(new_dtb_words) > INITRD_WORD:
+        raise SystemExit(f"{path}: patched DTB overlaps initrd")
+    new_initrd_words = bytes_to_words(new_initrd_gz)
+
+    lines[dtb_line + 1 :] = new_dtb_words + [initrd_directive] + new_initrd_words
+    path.write_text("\n".join(lines) + "\n")
+    return added_devices, replaced_files, added_files, deleted_files
+
+
+def patch_linux_image(
+    path: Path,
+    bootargs: str | None,
+    initramfs_replacements: dict[str, bytes],
+    initramfs_additions: dict[str, tuple[int, bytes]],
+    initramfs_deletions: set[str],
+) -> tuple[list[str], list[str], list[str], list[str]]:
+    """Patch a Linux DDR image, dispatching to dense or sparse handler by format."""
+    for line in path.read_text().splitlines():
+        stripped = line.strip()
+        if not stripped:
+            continue
+        if stripped.startswith("@"):
+            return patch_sparse_image(
+                path,
+                bootargs,
+                initramfs_replacements,
+                initramfs_additions,
+                initramfs_deletions,
+            )
+        return patch_dense_image(
+            path,
+            bootargs,
+            initramfs_replacements,
+            initramfs_additions,
+            initramfs_deletions,
+        )
+    raise SystemExit(f"{path}: empty Linux DDR image")
+
+
+def main() -> None:
+    """Entry point: patches the Linux DDR image with all FROST boot patches."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("sw_ddr_mem", type=Path)
+    parser.add_argument("sw_ddr_txt", type=Path)
+    args = parser.parse_args()
+
+    patch_ret_restore_window(args.sw_ddr_mem)
+    patch_ret_restore_window(args.sw_ddr_txt)
+    print(f"Patched Linux ret_from_exception restore window: {OLD_WORD}->{NEW_WORD}")
+
+    noop_initcall_names = split_env_names(
+        os.environ.get("FROST_LINUX_NOOP_INITCALLS", "")
+    )
+    noop_function_names = split_env_names(
+        os.environ.get("FROST_LINUX_NOOP_FUNCTIONS", "")
+    )
+    system_map = Path(
+        os.environ.get("FROST_LINUX_SYSTEM_MAP", DEFAULT_SYSTEM_MAP)
+    ).expanduser()
+    noop_initcall_symbols = resolve_system_map_symbols(system_map, noop_initcall_names)
+    patch_noop_return_zero(args.sw_ddr_mem, noop_initcall_symbols)
+    patch_noop_return_zero(args.sw_ddr_txt, noop_initcall_symbols)
+    if noop_initcall_symbols:
+        patched = " ".join(
+            f"{name}@0x{noop_initcall_symbols[name]:08x}"
+            for name in noop_initcall_names
+        )
+        print(f"Patched Linux initcalls to return 0: {patched}")
+
+    noop_function_symbols = resolve_system_map_symbols(system_map, noop_function_names)
+    patch_noop_return_zero(args.sw_ddr_mem, noop_function_symbols)
+    patch_noop_return_zero(args.sw_ddr_txt, noop_function_symbols)
+    if noop_function_symbols:
+        patched = " ".join(
+            f"{name}@0x{noop_function_symbols[name]:08x}"
+            for name in noop_function_names
+        )
+        print(f"Patched Linux functions to return 0: {patched}")
+
+    if os.environ.get("FROST_LINUX_NOP_CPU_RELAX_DIV") == "1":
+        delay_addr = resolve_system_map_symbols(system_map, [CPU_RELAX_DIV_SYMBOL])[
+            CPU_RELAX_DIV_SYMBOL
+        ]
+        patch_cpu_relax_div(args.sw_ddr_mem, delay_addr)
+        patch_cpu_relax_div(args.sw_ddr_txt, delay_addr)
+        print(
+            f"Patched Linux {CPU_RELAX_DIV_SYMBOL} cpu_relax DIV-by-zero to NOP: "
+            f"{CPU_RELAX_DIV_SYMBOL}+0x{CPU_RELAX_DIV_OFFSET:x}@"
+            f"0x{delay_addr + CPU_RELAX_DIV_OFFSET:08x}"
+        )
+
+    if os.environ.get("FROST_LINUX_NOP_CPU_RELAX_PAUSE") == "1":
+        delay_addr = resolve_system_map_symbols(system_map, [CPU_RELAX_DIV_SYMBOL])[
+            CPU_RELAX_DIV_SYMBOL
+        ]
+        patch_cpu_relax_pause(args.sw_ddr_mem, delay_addr)
+        patch_cpu_relax_pause(args.sw_ddr_txt, delay_addr)
+        print(
+            f"Patched Linux {CPU_RELAX_DIV_SYMBOL} cpu_relax PAUSE to NOP: "
+            f"{CPU_RELAX_DIV_SYMBOL}+0x{CPU_RELAX_PAUSE_OFFSET:x}@"
+            f"0x{delay_addr + CPU_RELAX_PAUSE_OFFSET:08x}"
+        )
+
+    if os.environ.get("FROST_LINUX_PATCH_PROC_GET_INODE_MODE_RELOAD") == "1":
+        patch_proc_get_inode_mode_reload(args.sw_ddr_mem)
+        patch_proc_get_inode_mode_reload(args.sw_ddr_txt)
+        print(
+            "Patched Linux proc_get_inode mode reload: "
+            f"{','.join(f'0x{addr:08x}' for addr in PROC_GET_INODE_MODE_RELOAD_ADDRS)} "
+            f"{PROC_GET_INODE_MODE_RELOAD_OLD.hex()}->"
+            f"{PROC_GET_INODE_MODE_RELOAD_NEW.hex()}"
+        )
+
+    if os.environ.get("FROST_LINUX_FORCE_PROC_GET_INODE_MODE_REG") == "1":
+        patch_proc_get_inode_force_mode_reg(args.sw_ddr_mem)
+        patch_proc_get_inode_force_mode_reg(args.sw_ddr_txt)
+        print(
+            "Patched Linux proc_get_inode mode load to S_IFREG: "
+            f"0x{PROC_GET_INODE_MODE_LOAD_ADDR:08x} "
+            f"{PROC_GET_INODE_MODE_LOAD_OLD.hex()}->"
+            f"{PROC_GET_INODE_MODE_FORCE_REG.hex()}"
+        )
+
+    if os.environ.get("FROST_LINUX_PATCH_PROC_LOOKUP_REF_CONST") == "1":
+        patch_proc_lookup_ref_const(args.sw_ddr_mem)
+        patch_proc_lookup_ref_const(args.sw_ddr_txt)
+        print(
+            "Patched Linux proc_lookup_de refcount AMO result to 1: "
+            f"0x{PROC_LOOKUP_REF_AMO_ADDR:08x} "
+            f"{PROC_LOOKUP_REF_AMO_OLD.hex()}->"
+            f"{PROC_LOOKUP_REF_AMO_CONST.hex()}"
+        )
+
+    if os.environ.get("FROST_LINUX_PATCH_PROC_LOOKUP_DE_ADJUST") == "1":
+        patch_proc_lookup_de_adjust(args.sw_ddr_mem)
+        patch_proc_lookup_de_adjust(args.sw_ddr_txt)
+        print(
+            "Patched Linux proc_lookup_de returned pointer adjust: "
+            f"0x{PROC_LOOKUP_DE_ADJUST_ADDR:08x} "
+            f"{PROC_LOOKUP_DE_ADJUST_OLD.hex()}->"
+            f"{PROC_LOOKUP_DE_ADJUST_NEW.hex()}"
+        )
+
+    bootargs = os.environ.get("FROST_LINUX_BOOTARGS")
+    initramfs_replacements = get_initramfs_replacements()
+    initramfs_additions = get_initramfs_additions()
+    initramfs_deletions = get_initramfs_deletions()
+    sparse_devices, sparse_replaced, sparse_added, sparse_deleted = patch_linux_image(
+        args.sw_ddr_mem,
+        bootargs,
+        initramfs_replacements,
+        initramfs_additions,
+        initramfs_deletions,
+    )
+    dense_devices, dense_replaced, dense_added, dense_deleted = patch_linux_image(
+        args.sw_ddr_txt,
+        bootargs,
+        initramfs_replacements,
+        initramfs_additions,
+        initramfs_deletions,
+    )
+    if bootargs:
+        print(f"Patched Linux DTB bootargs: {bootargs}")
+    added_devices = sorted(set(sparse_devices) | set(dense_devices))
+    if added_devices:
+        print(f"Patched Linux initramfs device nodes: {' '.join(added_devices)}")
+    replaced_files = sorted(set(sparse_replaced) | set(dense_replaced))
+    if replaced_files:
+        print(f"Patched Linux initramfs files: {' '.join(replaced_files)}")
+    added_files = sorted(set(sparse_added) | set(dense_added))
+    if added_files:
+        print(f"Patched Linux initramfs added files: {' '.join(added_files)}")
+    deleted_files = sorted(set(sparse_deleted) | set(dense_deleted))
+    if deleted_files:
+        print(f"Patched Linux initramfs deleted files: {' '.join(deleted_files)}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sw/apps/linux_clksrc_faithful/Makefile b/sw/apps/linux_clksrc_faithful/Makefile
new file mode 100644
index 00000000..7a4cf72a
--- /dev/null
+++ b/sw/apps/linux_clksrc_faithful/Makefile
@@ -0,0 +1,19 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+# Faithful Linux clocksource-switch timer stressor. Force the whole program
+# into cached DDR (matches the kernel's DDR-resident code/data/stack).
+override MEM_CONFIG := ddr
+SRC_C := ../../lib/src/uart.c main.c
+include ../../common/common.mk
diff --git a/sw/apps/linux_clksrc_faithful/main.c b/sw/apps/linux_clksrc_faithful/main.c
new file mode 100644
index 00000000..9047d804
--- /dev/null
+++ b/sw/apps/linux_clksrc_faithful/main.c
@@ -0,0 +1,343 @@
+/*
+ *    Copyright 2026 Two Sigma Open Source, LLC
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+/*
+ * Faithful Linux clocksource-switch timer stressor (M-mode, DDR-resident).
+ *
+ * Mirrors what no-MMU Linux actually does at/after "Switched to clocksource
+ * clint_clocksource", which the existing linux_irq_*_ddr tests do NOT:
+ *
+ *   - clint_clock_next_event() ORDER: csr_set(MTIE) is done FIRST, THEN
+ *     mtimecmp is armed with a non-disabling 2-write lo-then-hi writeq
+ *     (io-64-nonatomic-lo-hi). So MTIE is enabled while the OLD (just-fired)
+ *     mtimecmp is still <= mtime, and the new deadline is written through a
+ *     torn {old_hi,new_lo} transient.
+ *   - clint_timer_interrupt() RE-ARMS: it acks with csr_clear(MTIE), then the
+ *     event_handler re-arms via clint_clock_next_event(). It never leaves the
+ *     timer disabled, so a tick taken "early" cannot strand a later wfi (the
+ *     failure mode of the other tests, which is a test artifact, not Linux).
+ *   - arch_cpu_idle() is a BARE wfi with mstatus.MIE left enabled throughout;
+ *     MTIE is what gets toggled, by the handler.
+ *   - concurrent cached-DDR churn so a machine-timer IRQ frequently lands while
+ *     cached (long-latency) loads/stores are still outstanding.
+ *
+ * Run at hardware-realistic DDR latency (DDR_MODEL_LATENCY>=70, CACHED_HAS_L2=0).
+ * PASS prints <<PASS>>; a frame-integrity violation prints <<FAIL>> with a code;
+ * a true deadlock is caught by the RTL no-retire watchdog.
+ */
+
+#include <stdint.h>
+
+#include "csr.h"
+#include "trap.h"
+#include "uart.h"
+
+#define CLINT_MTIMECMP_LO (*(volatile uint32_t *) 0x40014000u)
+#define CLINT_MTIMECMP_HI (*(volatile uint32_t *) 0x40014004u)
+#define CLINT_MTIME_LO (*(volatile uint32_t *) 0x4001BFF8u)
+#define CLINT_MTIME_HI (*(volatile uint32_t *) 0x4001BFFCu)
+
+#define TARGET_TICKS 64u
+#define DDR_STACK_SIZE 4096u
+#define CHURN_WORDS 4096 /* 16 KiB > L1: each idle sweep sustains DDR misses */
+
+struct linux_pt_regs {
+    uint32_t epc, ra, sp, gp, tp;
+    uint32_t t0, t1, t2, s0, s1;
+    uint32_t a0, a1, a2, a3, a4, a5, a6, a7;
+    uint32_t s2, s3, s4, s5, s6, s7, s8, s9, s10, s11;
+    uint32_t t3, t4, t5, t6;
+    uint32_t status, badaddr, cause, orig_a0;
+};
+
+struct fake_current {
+    uint32_t kernel_sp;
+    uint32_t user_sp;
+    uint32_t marker;
+};
+
+volatile struct fake_current g_fake_current = {0u, 0u, 0x5441534Bu};
+volatile uint32_t g_ticks;
+volatile uint32_t g_fail_code;
+volatile uint32_t g_fail_seen;
+volatile uint32_t g_last_mepc;
+volatile uint32_t g_last_ra;
+volatile uint32_t g_last_sp;
+volatile uint32_t g_last_tp;
+volatile uint32_t g_last_mscratch;
+volatile uint32_t g_churn[CHURN_WORDS];
+
+static uint8_t g_ddr_stack[DDR_STACK_SIZE] __attribute__((aligned(16)));
+
+static inline uint32_t read_tp(void)
+{
+    uint32_t v;
+    __asm__ volatile("mv %0, tp" : "=r"(v));
+    return v;
+}
+
+static inline void write_tp(uint32_t v)
+{
+    __asm__ volatile("mv tp, %0" : : "r"(v) : "memory");
+}
+
+static void record_failure(uint32_t code)
+{
+    if (!g_fail_seen) {
+        g_fail_seen = 1u;
+        g_fail_code = code;
+    }
+}
+
+static uint64_t clint_rdmtime(void)
+{
+    uint32_t hi, lo, hi2;
+    do {
+        hi = CLINT_MTIME_HI;
+        lo = CLINT_MTIME_LO;
+        hi2 = CLINT_MTIME_HI;
+    } while (hi != hi2);
+    return ((uint64_t) hi << 32) | lo;
+}
+
+/* Linux clint_clock_next_event(): enable MTIE FIRST, then non-disabling
+ * lo-then-hi writeq of the new deadline (io-64-nonatomic-lo-hi). */
+static void clint_clock_next_event(uint64_t cmp)
+{
+    csr_set(mie, MIE_MTIE);
+    CLINT_MTIMECMP_LO = (uint32_t) cmp;
+    CLINT_MTIMECMP_HI = (uint32_t) (cmp >> 32);
+}
+
+static uint32_t churn_ddr(uint32_t seed)
+{
+    uint32_t acc = seed;
+    for (int i = 0; i < CHURN_WORDS; i++) {
+        uint32_t v = g_churn[i];
+        acc ^= v + ((uint32_t) i << 3);
+        acc = (acc << 5) | (acc >> 27);
+        g_churn[i] = v ^ acc ^ (0x9E3779B9u + (uint32_t) i);
+    }
+    return acc;
+}
+
+/* Linux clint_timer_interrupt(): ack by clearing MTIE, then RE-ARM via the
+ * event_handler -> clint_clock_next_event() path. */
+__attribute__((noinline, used)) void faithful_irq_c(struct linux_pt_regs *frame)
+{
+    csr_clear(mie, MIE_MTIE);
+
+    g_last_mepc = frame->epc;
+    g_last_ra = frame->ra;
+    g_last_sp = frame->sp;
+    g_last_tp = frame->tp;
+    g_last_mscratch = csr_read(mscratch);
+
+    if (frame->cause != (MCAUSE_INTERRUPT_BIT | INT_MTI)) {
+        record_failure(1u);
+    }
+    /* Corrupted/garbage return PC is the hardware symptom (ra==epc==0xCC0). */
+    if (frame->epc < 0x80000000u || frame->epc == 0x00000CC0u) {
+        record_failure(2u);
+    }
+    if (frame->ra < 0x80000000u || frame->ra == 0x00000CC0u) {
+        record_failure(3u);
+    }
+    if (frame->sp < (uint32_t) &g_ddr_stack[0] ||
+        frame->sp > (uint32_t) &g_ddr_stack[DDR_STACK_SIZE]) {
+        record_failure(4u);
+    }
+    if (frame->tp != (uint32_t) &g_fake_current) {
+        record_failure(5u);
+    }
+    if (g_last_mscratch != 0u) {
+        record_failure(6u);
+    }
+
+    /* Light handler-side cached touch (rotating window) so the handler stays
+     * short; the sustained DDR traffic comes from the idle-loop sweep. */
+    {
+        uint32_t base = (g_ticks << 4) & (CHURN_WORDS - 1u);
+        uint32_t acc = frame->epc ^ frame->ra ^ g_ticks;
+        for (int i = 0; i < 8; i++) {
+            uint32_t idx = (base + (uint32_t) i) & (CHURN_WORDS - 1u);
+            acc ^= g_churn[idx];
+            g_churn[idx] = acc + (uint32_t) i;
+        }
+    }
+    g_ticks = g_ticks + 1u;
+
+    /* event_handler -> clint_clock_next_event(now + delta). Vary the delta so
+     * the IRQ phase relative to the idle churn/wfi sweeps across alignments. */
+    clint_clock_next_event(clint_rdmtime() + 256u + ((uint64_t) (g_ticks & 63u) << 3));
+}
+
+/* Linux-style naked trap entry: save/restore the GPR frame on the current
+ * (DDR) stack, csrrw tp,mscratch,tp swap idiom, sc.w in the return path. */
+__attribute__((naked, aligned(4))) static void faithful_irq_entry(void)
+{
+    __asm__ volatile("csrrw tp, mscratch, tp\n"
+                     "bnez tp, 1f\n"
+                     "csrr tp, mscratch\n"
+                     "1:\n"
+                     "addi sp, sp, -144\n"
+                     "sw   ra, 4(sp)\n"
+                     "sw   gp, 12(sp)\n"
+                     "sw   t0, 20(sp)\n"
+                     "sw   t1, 24(sp)\n"
+                     "sw   t2, 28(sp)\n"
+                     "sw   s0, 32(sp)\n"
+                     "sw   s1, 36(sp)\n"
+                     "sw   a0, 40(sp)\n"
+                     "sw   a1, 44(sp)\n"
+                     "sw   a2, 48(sp)\n"
+                     "sw   a3, 52(sp)\n"
+                     "sw   a4, 56(sp)\n"
+                     "sw   a5, 60(sp)\n"
+                     "sw   a6, 64(sp)\n"
+                     "sw   a7, 68(sp)\n"
+                     "sw   s2, 72(sp)\n"
+                     "sw   s3, 76(sp)\n"
+                     "sw   s4, 80(sp)\n"
+                     "sw   s5, 84(sp)\n"
+                     "sw   s6, 88(sp)\n"
+                     "sw   s7, 92(sp)\n"
+                     "sw   s8, 96(sp)\n"
+                     "sw   s9, 100(sp)\n"
+                     "sw   s10, 104(sp)\n"
+                     "sw   s11, 108(sp)\n"
+                     "sw   t3, 112(sp)\n"
+                     "sw   t4, 116(sp)\n"
+                     "sw   t5, 120(sp)\n"
+                     "sw   t6, 124(sp)\n"
+                     "sw   a0, 140(sp)\n"
+                     "addi t0, sp, 144\n"
+                     "sw   t0, 8(sp)\n"
+                     "csrr t0, mepc\n"
+                     "sw   t0, 0(sp)\n"
+                     "csrr t0, mstatus\n"
+                     "sw   t0, 128(sp)\n"
+                     "csrr t0, mtval\n"
+                     "sw   t0, 132(sp)\n"
+                     "csrr t0, mcause\n"
+                     "sw   t0, 136(sp)\n"
+                     "csrr t0, mscratch\n"
+                     "sw   t0, 16(sp)\n"
+                     "csrw mscratch, x0\n"
+                     "mv   a0, sp\n"
+                     "call faithful_irq_c\n"
+                     "lw   a0, 128(sp)\n"
+                     "lw   a2, 0(sp)\n"
+                     "sc.w x0, a2, 0(sp)\n"
+                     "csrw mstatus, a0\n"
+                     "csrw mepc, a2\n"
+                     "lw   ra, 4(sp)\n"
+                     "lw   gp, 12(sp)\n"
+                     "lw   tp, 16(sp)\n"
+                     "lw   t0, 20(sp)\n"
+                     "lw   t1, 24(sp)\n"
+                     "lw   t2, 28(sp)\n"
+                     "lw   s0, 32(sp)\n"
+                     "lw   s1, 36(sp)\n"
+                     "lw   a0, 40(sp)\n"
+                     "lw   a1, 44(sp)\n"
+                     "lw   a2, 48(sp)\n"
+                     "lw   a3, 52(sp)\n"
+                     "lw   a4, 56(sp)\n"
+                     "lw   a5, 60(sp)\n"
+                     "lw   a6, 64(sp)\n"
+                     "lw   a7, 68(sp)\n"
+                     "lw   s2, 72(sp)\n"
+                     "lw   s3, 76(sp)\n"
+                     "lw   s4, 80(sp)\n"
+                     "lw   s5, 84(sp)\n"
+                     "lw   s6, 88(sp)\n"
+                     "lw   s7, 92(sp)\n"
+                     "lw   s8, 96(sp)\n"
+                     "lw   s9, 100(sp)\n"
+                     "lw   s10, 104(sp)\n"
+                     "lw   s11, 108(sp)\n"
+                     "lw   t3, 112(sp)\n"
+                     "lw   t4, 116(sp)\n"
+                     "lw   t5, 120(sp)\n"
+                     "lw   t6, 124(sp)\n"
+                     "lw   sp, 8(sp)\n"
+                     "mret\n");
+}
+
+__attribute__((noreturn, noinline, used)) void main_on_ddr_stack(void)
+{
+    uart_printf("\n=== Linux faithful clocksource-switch timer test ===\n");
+
+    for (int i = 0; i < CHURN_WORDS; i++) {
+        g_churn[i] = 0x80000000u ^ ((uint32_t) i * 0x10204081u);
+    }
+    g_fake_current.kernel_sp = (uint32_t) &g_ddr_stack[DDR_STACK_SIZE];
+    g_fake_current.user_sp = 0u;
+
+    write_tp((uint32_t) &g_fake_current);
+    csr_write(mscratch, 0u);
+    set_trap_handler(&faithful_irq_entry);
+
+    /* Start the clockevent (clint_timer_starting_cpu -> first next_event), then
+     * enable MIE once and leave it on, exactly like the kernel after boot. */
+    clint_clock_next_event(clint_rdmtime() + 384u);
+    enable_interrupts();
+
+    /* arch_cpu_idle(): bare wfi with MIE on, interleaved with concurrent
+     * cached-DDR work so IRQs land while cached ops are outstanding. */
+    uint32_t spin = 0x2468ACE0u;
+    while (g_ticks < TARGET_TICKS && !g_fail_seen) {
+        spin = churn_ddr(spin ^ g_ticks);
+        __asm__ volatile("wfi" ::: "memory");
+    }
+
+    disable_timer_interrupt();
+    disable_interrupts();
+
+    if (!g_fail_seen && g_ticks >= TARGET_TICKS && spin != 0u) {
+        uart_printf("ticks=%u spin=%08x last_mepc=%08x last_ra=%08x\n",
+                    g_ticks,
+                    spin,
+                    g_last_mepc,
+                    g_last_ra);
+        uart_printf("<<PASS>>\n");
+    } else {
+        uart_printf("FAIL code=%u ticks=%u mepc=%08x ra=%08x sp=%08x tp=%08x mscratch=%08x\n",
+                    g_fail_code,
+                    g_ticks,
+                    g_last_mepc,
+                    g_last_ra,
+                    g_last_sp,
+                    g_last_tp,
+                    g_last_mscratch);
+        uart_printf("<<FAIL>>\n");
+    }
+
+    for (;;) {
+    }
+}
+
+int main(void)
+{
+    uint32_t stack_top = ((uint32_t) &g_ddr_stack[DDR_STACK_SIZE]) & ~0xFu;
+    __asm__ volatile("mv sp, %0\n"
+                     "j  main_on_ddr_stack\n"
+                     :
+                     : "r"(stack_top)
+                     : "memory");
+    __builtin_unreachable();
+}
diff --git a/sw/apps/linux_irq_active_ddr_test/Makefile b/sw/apps/linux_irq_active_ddr_test/Makefile
new file mode 100644
index 00000000..a15ba7f2
--- /dev/null
+++ b/sw/apps/linux_irq_active_ddr_test/Makefile
@@ -0,0 +1,19 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+# Linux-like active-code timer IRQ directed test. Force the whole program into
+# cached DDR even when the generic cocotb runner is in its default BRAM tier.
+override MEM_CONFIG := ddr
+SRC_C := ../../lib/src/uart.c main.c
+include ../../common/common.mk
diff --git a/sw/apps/linux_irq_active_ddr_test/main.c b/sw/apps/linux_irq_active_ddr_test/main.c
new file mode 100644
index 00000000..d7ed3288
--- /dev/null
+++ b/sw/apps/linux_irq_active_ddr_test/main.c
@@ -0,0 +1,927 @@
+/*
+ *    Copyright 2026 Two Sigma Open Source, LLC
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+/*
+ * Linux-like active-code timer IRQ test, linked and executed from cached DDR.
+ *
+ * The no-MMU Linux hardware failure is an illegal-instruction panic with
+ * ra == epc == 0x00000cc0 after the first machine timer interrupt from idle.
+ * This test keeps the loop much smaller than Linux while preserving the risky
+ * ingredients: DDR-resident code/data, an explicit DDR stack, WFI idle,
+ * active-code machine-timer IRQs, a Linux-style naked trap entry that
+ * saves/restores GPRs on the current stack, and the csrrw tp,mscratch,tp swap
+ * idiom. The active phase repeatedly creates a low-value temporary-register
+ * poison while nested call/return traffic is in flight; ra should remain a
+ * high DDR return address at every interrupt boundary.
+ */
+
+#include <stdint.h>
+
+#include "csr.h"
+#include "trap.h"
+#include "uart.h"
+
+#define ARRAY_LEN(a) ((int) (sizeof(a) / sizeof((a)[0])))
+#define CLINT_MTIMECMP_LO (*(volatile uint32_t *) 0x40014000u)
+#define CLINT_MTIMECMP_HI (*(volatile uint32_t *) 0x40014004u)
+#define CLINT_MTIME_LO (*(volatile uint32_t *) 0x4001BFF8u)
+#define CLINT_MTIME_HI (*(volatile uint32_t *) 0x4001BFFCu)
+#define NORMAL_IRQ_COUNT 16u
+#define POISON_IRQ_COUNT 16u
+#define ACTIVE_IRQ_COUNT 8u
+#define SENTINEL_IRQ_COUNT 32u
+#define IRQ_COUNT (NORMAL_IRQ_COUNT + POISON_IRQ_COUNT + ACTIVE_IRQ_COUNT + SENTINEL_IRQ_COUNT)
+#define FRAME_WORDS 36u
+#define DDR_STACK_SIZE 4096u
+
+#define FRAME_EPC 0u
+#define FRAME_RA 1u
+#define FRAME_SP 2u
+#define FRAME_GP 3u
+#define FRAME_TP 4u
+#define FRAME_T0 5u
+#define FRAME_T1 6u
+#define FRAME_T2 7u
+#define FRAME_S0 8u
+#define FRAME_S1 9u
+#define FRAME_A0 10u
+#define FRAME_A1 11u
+#define FRAME_A2 12u
+#define FRAME_A3 13u
+#define FRAME_A4 14u
+#define FRAME_A5 15u
+#define FRAME_A6 16u
+#define FRAME_A7 17u
+#define FRAME_S2 18u
+#define FRAME_S3 19u
+#define FRAME_S4 20u
+#define FRAME_S5 21u
+#define FRAME_S6 22u
+#define FRAME_S7 23u
+#define FRAME_S8 24u
+#define FRAME_S9 25u
+#define FRAME_S10 26u
+#define FRAME_S11 27u
+
+#define SENTINEL_S0 0x51000000u
+#define SENTINEL_S1 0x51000001u
+#define SENTINEL_S3 0x51000003u
+#define SENTINEL_S4 0x51000004u
+#define SENTINEL_S5 0x51000005u
+#define SENTINEL_S6 0x51000006u
+#define SENTINEL_S7 0x51000007u
+#define SENTINEL_S8 0x51000008u
+#define SENTINEL_S9 0x51000009u
+#define SENTINEL_S10 0x5100000Au
+#define SENTINEL_S11 0x5100000Bu
+
+struct linux_pt_regs {
+    uint32_t epc;
+    uint32_t ra;
+    uint32_t sp;
+    uint32_t gp;
+    uint32_t tp;
+    uint32_t t0;
+    uint32_t t1;
+    uint32_t t2;
+    uint32_t s0;
+    uint32_t s1;
+    uint32_t a0;
+    uint32_t a1;
+    uint32_t a2;
+    uint32_t a3;
+    uint32_t a4;
+    uint32_t a5;
+    uint32_t a6;
+    uint32_t a7;
+    uint32_t s2;
+    uint32_t s3;
+    uint32_t s4;
+    uint32_t s5;
+    uint32_t s6;
+    uint32_t s7;
+    uint32_t s8;
+    uint32_t s9;
+    uint32_t s10;
+    uint32_t s11;
+    uint32_t t3;
+    uint32_t t4;
+    uint32_t t5;
+    uint32_t t6;
+    uint32_t status;
+    uint32_t badaddr;
+    uint32_t cause;
+    uint32_t orig_a0;
+};
+
+struct fake_current {
+    uint32_t kernel_sp;
+    uint32_t user_sp;
+    uint32_t marker;
+};
+
+volatile uint32_t g_expected_mepc;
+volatile uint32_t g_expected_ra;
+volatile uint32_t g_expected_sp;
+volatile uint32_t g_expected_tp;
+volatile uint32_t g_exact_frame_check;
+volatile struct fake_current g_fake_current = {0u, 0u, 0x5441534Bu};
+volatile uint32_t g_ticks;
+volatile uint32_t g_fail_code;
+volatile uint32_t g_fail_seen;
+volatile uint32_t g_bad_cause;
+volatile uint32_t g_bad_epc;
+volatile uint32_t g_bad_ra;
+volatile uint32_t g_last_mepc;
+volatile uint32_t g_last_ra;
+volatile uint32_t g_last_sp;
+volatile uint32_t g_last_tp;
+volatile uint32_t g_last_mscratch_in_handler;
+volatile uint32_t g_context_checksum;
+volatile uint32_t g_context_words[64];
+volatile uint32_t g_frame_snapshots[IRQ_COUNT][FRAME_WORDS];
+volatile uint32_t g_frame_check_mask[FRAME_WORDS];
+volatile uint32_t g_expected_frame[FRAME_WORDS];
+volatile uint32_t g_bad_frame_index;
+volatile uint32_t g_bad_expected;
+volatile uint32_t g_bad_actual;
+volatile uint32_t g_bad_tick;
+
+static uint8_t g_ddr_stack[DDR_STACK_SIZE] __attribute__((aligned(16)));
+
+static inline uint32_t read_tp(void)
+{
+    uint32_t value;
+    __asm__ volatile("mv %0, tp" : "=r"(value));
+    return value;
+}
+
+static inline void write_tp(uint32_t value)
+{
+    __asm__ volatile("mv tp, %0" : : "r"(value) : "memory");
+}
+
+static void record_failure(uint32_t code)
+{
+    if (!g_fail_seen) {
+        g_fail_seen = 1;
+        g_fail_code = code;
+        g_bad_cause = csr_read(mcause);
+    }
+}
+
+__attribute__((noinline, used)) void
+record_frame_failure(uint32_t code, uint32_t index, uint32_t expected, uint32_t actual)
+{
+    if (!g_fail_seen) {
+        g_bad_frame_index = index;
+        g_bad_expected = expected;
+        g_bad_actual = actual;
+        g_bad_tick = g_ticks;
+        record_failure(code);
+    }
+}
+
+static void clear_frame_checks(void)
+{
+    for (uint32_t i = 0; i < FRAME_WORDS; i++) {
+        g_frame_check_mask[i] = 0u;
+        g_expected_frame[i] = 0u;
+    }
+}
+
+static void expect_frame_word(uint32_t index, uint32_t value)
+{
+    g_expected_frame[index] = value;
+    g_frame_check_mask[index] = 0xFFFFFFFFu;
+}
+
+static void check_frame_masked(struct linux_pt_regs *frame)
+{
+    volatile uint32_t *words = (volatile uint32_t *) frame;
+
+    for (uint32_t i = 0; i < FRAME_WORDS; i++) {
+        uint32_t mask = g_frame_check_mask[i];
+        uint32_t actual;
+        uint32_t expected;
+
+        if (!mask) {
+            continue;
+        }
+        actual = words[i];
+        expected = g_expected_frame[i];
+        if (((actual ^ expected) & mask) != 0u) {
+            record_frame_failure(30u, i, expected, actual);
+            break;
+        }
+    }
+}
+
+static void fill_context(void)
+{
+    for (int i = 0; i < ARRAY_LEN(g_context_words); i++) {
+        g_context_words[i] = 0x80000000u ^ ((uint32_t) i * 0x10204081u);
+    }
+    g_context_checksum = 0x13579BDFu;
+}
+
+static uint32_t churn_context(uint32_t seed)
+{
+    uint32_t acc = seed ^ g_context_checksum;
+
+    for (int i = 0; i < ARRAY_LEN(g_context_words); i++) {
+        uint32_t value = g_context_words[i];
+        acc ^= value + ((uint32_t) i << 16);
+        acc = (acc << 5) | (acc >> 27);
+        g_context_words[i] = value ^ acc ^ (0x9E3779B9u + (uint32_t) i);
+    }
+
+    g_context_checksum = acc;
+    return acc;
+}
+
+static uint64_t clint_rdmtime(void)
+{
+    uint32_t hi;
+    uint32_t lo;
+    uint32_t hi2;
+
+    do {
+        hi = CLINT_MTIME_HI;
+        lo = CLINT_MTIME_LO;
+        hi2 = CLINT_MTIME_HI;
+    } while (hi != hi2);
+
+    return ((uint64_t) hi << 32) | lo;
+}
+
+static void clint_set_timer_cmp(uint64_t cmp)
+{
+    CLINT_MTIMECMP_HI = 0xFFFFFFFFu;
+    CLINT_MTIMECMP_LO = (uint32_t) cmp;
+    CLINT_MTIMECMP_HI = (uint32_t) (cmp >> 32);
+}
+
+static void clint_ack_timer(void)
+{
+    CLINT_MTIMECMP_HI = 0xFFFFFFFFu;
+    CLINT_MTIMECMP_LO = 0xFFFFFFFFu;
+}
+
+__attribute__((noinline)) static uint32_t active_poison_window(uint32_t value)
+{
+    uint32_t out;
+
+    __asm__ volatile("lui  t5, 0x1\n"
+                     "addi t5, t5, -832\n"
+                     "xor  %[out], %[in], t5\n"
+                     "addi %[out], %[out], 37\n"
+                     : [out] "=&r"(out)
+                     : [in] "r"(value)
+                     : "t5", "memory");
+
+    return out;
+}
+
+__attribute__((noinline)) static uint32_t active_leaf(uint32_t seed)
+{
+    volatile uint32_t local[12];
+    uint32_t acc = seed ^ g_context_checksum;
+
+    for (uint32_t i = 0; i < ARRAY_LEN(local); i++) {
+        local[i] = active_poison_window(acc + i);
+        acc ^= local[i] + (i << 8);
+    }
+
+    return active_poison_window(acc);
+}
+
+__attribute__((noinline)) static uint32_t active_mid3(uint32_t seed)
+{
+    return active_leaf(seed + 0x11111111u) ^ active_leaf(seed + 0x22222222u);
+}
+
+__attribute__((noinline)) static uint32_t active_mid2(uint32_t seed)
+{
+    uint32_t a = active_mid3(seed ^ 0x33333333u);
+    uint32_t b = active_poison_window(seed ^ a);
+
+    return active_mid3(b) ^ a;
+}
+
+__attribute__((noinline)) static uint32_t active_mid1(uint32_t seed)
+{
+    return active_mid2(seed + 0x44444444u) ^ active_poison_window(seed);
+}
+
+__attribute__((noinline)) static uint32_t active_until_irq(uint32_t iter)
+{
+    uint32_t before = g_ticks;
+    uint32_t acc = iter ^ 0xA5A50000u;
+    uint32_t guard = 0;
+
+    write_tp((uint32_t) &g_fake_current);
+    csr_write(mscratch, 0u);
+    g_exact_frame_check = 0u;
+    clear_frame_checks();
+    clint_set_timer_cmp(clint_rdmtime() + 700u + (iter & 63u));
+    enable_interrupts();
+
+    while (g_ticks == before && !g_fail_seen) {
+        acc ^= active_mid1(acc + guard);
+        guard++;
+        if (guard > 20000u) {
+            record_failure(19u);
+            break;
+        }
+    }
+
+    disable_interrupts();
+
+    if (g_ticks != before + 1u) {
+        record_failure(20u);
+    }
+    if (read_tp() != (uint32_t) &g_fake_current) {
+        record_failure(21u);
+    }
+    if (csr_read(mscratch) != 0u) {
+        record_failure(22u);
+    }
+
+    return churn_context(acc ^ g_ticks);
+}
+
+static void setup_sentinel_frame_checks(void)
+{
+    clear_frame_checks();
+    expect_frame_word(FRAME_TP, (uint32_t) &g_fake_current);
+    expect_frame_word(FRAME_S0, SENTINEL_S0);
+    expect_frame_word(FRAME_S1, SENTINEL_S1);
+    expect_frame_word(FRAME_S2, (uint32_t) &g_fake_current);
+    expect_frame_word(FRAME_S3, SENTINEL_S3);
+    expect_frame_word(FRAME_S4, SENTINEL_S4);
+    expect_frame_word(FRAME_S5, SENTINEL_S5);
+    expect_frame_word(FRAME_S6, SENTINEL_S6);
+    expect_frame_word(FRAME_S7, SENTINEL_S7);
+    expect_frame_word(FRAME_S8, SENTINEL_S8);
+    expect_frame_word(FRAME_S9, SENTINEL_S9);
+    expect_frame_word(FRAME_S10, SENTINEL_S10);
+    expect_frame_word(FRAME_S11, SENTINEL_S11);
+}
+
+__attribute__((naked, noinline, used)) static uint32_t name_to_int_shape_asm(uint32_t seed)
+{
+    __asm__ volatile("li   a5, 0x19999998\n"
+                     "addi a4, a5, 9\n"
+                     "xor  a0, a0, a5\n"
+                     "add  a0, a0, a4\n"
+                     "ret\n");
+}
+
+__attribute__((naked, noinline, used)) static uint32_t sentinel_irq_window(uint32_t before)
+{
+    __asm__ volatile("addi sp, sp, -64\n"
+                     "sw   ra, 0(sp)\n"
+                     "sw   s0, 4(sp)\n"
+                     "sw   s1, 8(sp)\n"
+                     "sw   s2, 12(sp)\n"
+                     "sw   s3, 16(sp)\n"
+                     "sw   s4, 20(sp)\n"
+                     "sw   s5, 24(sp)\n"
+                     "sw   s6, 28(sp)\n"
+                     "sw   s7, 32(sp)\n"
+                     "sw   s8, 36(sp)\n"
+                     "sw   s9, 40(sp)\n"
+                     "sw   s10, 44(sp)\n"
+                     "sw   s11, 48(sp)\n"
+                     "sw   a0, 52(sp)\n"
+                     "li   s0, 0x51000000\n"
+                     "li   s1, 0x51000001\n"
+                     "la   s2, g_fake_current\n"
+                     "li   s3, 0x51000003\n"
+                     "li   s4, 0x51000004\n"
+                     "li   s5, 0x51000005\n"
+                     "li   s6, 0x51000006\n"
+                     "li   s7, 0x51000007\n"
+                     "li   s8, 0x51000008\n"
+                     "li   s9, 0x51000009\n"
+                     "li   s10, 0x5100000a\n"
+                     "li   s11, 0x5100000b\n"
+                     "li   t0, 8\n"
+                     "csrs mstatus, t0\n"
+                     "li   t6, 0\n"
+                     "1:\n"
+                     "lw   a0, 52(sp)\n"
+                     "call name_to_int_shape_asm\n"
+                     "la   t0, g_fail_seen\n"
+                     "lw   t1, 0(t0)\n"
+                     "bnez t1, 2f\n"
+                     "la   t0, g_ticks\n"
+                     "lw   t1, 0(t0)\n"
+                     "lw   t2, 52(sp)\n"
+                     "bne  t1, t2, 2f\n"
+                     "addi t6, t6, 1\n"
+                     "li   t3, 30000\n"
+                     "bltu t6, t3, 1b\n"
+                     "li   t0, 8\n"
+                     "csrc mstatus, t0\n"
+                     "li   a0, 41\n"
+                     "li   a1, 0xffffffff\n"
+                     "li   a2, 0\n"
+                     "mv   a3, t6\n"
+                     "call record_frame_failure\n"
+                     "j    3f\n"
+                     "2:\n"
+                     "li   t0, 8\n"
+                     "csrc mstatus, t0\n"
+                     "3:\n"
+                     "li   t0, 0x51000000\n"
+                     "beq  s0, t0, 4f\n"
+                     "li   a0, 31\n"
+                     "li   a1, 8\n"
+                     "li   a2, 0x51000000\n"
+                     "mv   a3, s0\n"
+                     "call record_frame_failure\n"
+                     "j    15f\n"
+                     "4:\n"
+                     "li   t0, 0x51000001\n"
+                     "beq  s1, t0, 5f\n"
+                     "li   a0, 31\n"
+                     "li   a1, 9\n"
+                     "li   a2, 0x51000001\n"
+                     "mv   a3, s1\n"
+                     "call record_frame_failure\n"
+                     "j    15f\n"
+                     "5:\n"
+                     "la   t0, g_fake_current\n"
+                     "beq  s2, t0, 6f\n"
+                     "li   a0, 31\n"
+                     "li   a1, 18\n"
+                     "la   a2, g_fake_current\n"
+                     "mv   a3, s2\n"
+                     "call record_frame_failure\n"
+                     "j    15f\n"
+                     "6:\n"
+                     "li   t0, 0x51000003\n"
+                     "beq  s3, t0, 7f\n"
+                     "li   a0, 31\n"
+                     "li   a1, 19\n"
+                     "li   a2, 0x51000003\n"
+                     "mv   a3, s3\n"
+                     "call record_frame_failure\n"
+                     "j    15f\n"
+                     "7:\n"
+                     "li   t0, 0x51000004\n"
+                     "beq  s4, t0, 8f\n"
+                     "li   a0, 31\n"
+                     "li   a1, 20\n"
+                     "li   a2, 0x51000004\n"
+                     "mv   a3, s4\n"
+                     "call record_frame_failure\n"
+                     "j    15f\n"
+                     "8:\n"
+                     "li   t0, 0x51000005\n"
+                     "beq  s5, t0, 9f\n"
+                     "li   a0, 31\n"
+                     "li   a1, 21\n"
+                     "li   a2, 0x51000005\n"
+                     "mv   a3, s5\n"
+                     "call record_frame_failure\n"
+                     "j    15f\n"
+                     "9:\n"
+                     "li   t0, 0x51000006\n"
+                     "beq  s6, t0, 10f\n"
+                     "li   a0, 31\n"
+                     "li   a1, 22\n"
+                     "li   a2, 0x51000006\n"
+                     "mv   a3, s6\n"
+                     "call record_frame_failure\n"
+                     "j    15f\n"
+                     "10:\n"
+                     "li   t0, 0x51000007\n"
+                     "beq  s7, t0, 11f\n"
+                     "li   a0, 31\n"
+                     "li   a1, 23\n"
+                     "li   a2, 0x51000007\n"
+                     "mv   a3, s7\n"
+                     "call record_frame_failure\n"
+                     "j    15f\n"
+                     "11:\n"
+                     "li   t0, 0x51000008\n"
+                     "beq  s8, t0, 12f\n"
+                     "li   a0, 31\n"
+                     "li   a1, 24\n"
+                     "li   a2, 0x51000008\n"
+                     "mv   a3, s8\n"
+                     "call record_frame_failure\n"
+                     "j    15f\n"
+                     "12:\n"
+                     "li   t0, 0x51000009\n"
+                     "beq  s9, t0, 13f\n"
+                     "li   a0, 31\n"
+                     "li   a1, 25\n"
+                     "li   a2, 0x51000009\n"
+                     "mv   a3, s9\n"
+                     "call record_frame_failure\n"
+                     "j    15f\n"
+                     "13:\n"
+                     "li   t0, 0x5100000a\n"
+                     "beq  s10, t0, 14f\n"
+                     "li   a0, 31\n"
+                     "li   a1, 26\n"
+                     "li   a2, 0x5100000a\n"
+                     "mv   a3, s10\n"
+                     "call record_frame_failure\n"
+                     "j    15f\n"
+                     "14:\n"
+                     "li   t0, 0x5100000b\n"
+                     "beq  s11, t0, 15f\n"
+                     "li   a0, 31\n"
+                     "li   a1, 27\n"
+                     "li   a2, 0x5100000b\n"
+                     "mv   a3, s11\n"
+                     "call record_frame_failure\n"
+                     "15:\n"
+                     "lw   ra, 0(sp)\n"
+                     "lw   s0, 4(sp)\n"
+                     "lw   s1, 8(sp)\n"
+                     "lw   s2, 12(sp)\n"
+                     "lw   s3, 16(sp)\n"
+                     "lw   s4, 20(sp)\n"
+                     "lw   s5, 24(sp)\n"
+                     "lw   s6, 28(sp)\n"
+                     "lw   s7, 32(sp)\n"
+                     "lw   s8, 36(sp)\n"
+                     "lw   s9, 40(sp)\n"
+                     "lw   s10, 44(sp)\n"
+                     "lw   s11, 48(sp)\n"
+                     "addi sp, sp, 64\n"
+                     "ret\n");
+}
+
+__attribute__((noinline)) static uint32_t sentinel_until_irq(uint32_t iter)
+{
+    uint32_t before = g_ticks;
+
+    write_tp((uint32_t) &g_fake_current);
+    csr_write(mscratch, 0u);
+    g_exact_frame_check = 0u;
+    setup_sentinel_frame_checks();
+    clint_set_timer_cmp(clint_rdmtime() + 180u + ((iter * 37u) & 255u));
+    sentinel_irq_window(before);
+    disable_interrupts();
+    clear_frame_checks();
+
+    if (g_ticks != before + 1u) {
+        record_failure(32u);
+    }
+    if (read_tp() != (uint32_t) &g_fake_current) {
+        record_failure(33u);
+    }
+    if (csr_read(mscratch) != 0u) {
+        record_failure(34u);
+    }
+
+    return churn_context(0x19999998u ^ iter ^ g_ticks);
+}
+
+__attribute__((noinline, used)) void linux_like_irq_c(struct linux_pt_regs *frame)
+{
+    uint32_t tick = g_ticks;
+
+    g_last_mepc = frame->epc;
+    g_last_ra = frame->ra;
+    g_last_sp = frame->sp;
+    g_last_tp = frame->tp;
+    g_last_mscratch_in_handler = csr_read(mscratch);
+
+    if (tick < IRQ_COUNT) {
+        for (uint32_t i = 0; i < FRAME_WORDS; i++) {
+            g_frame_snapshots[tick][i] = ((volatile uint32_t *) frame)[i];
+        }
+    }
+
+    if (frame->cause != (MCAUSE_INTERRUPT_BIT | INT_MTI)) {
+        g_bad_epc = frame->epc;
+        g_bad_ra = frame->ra;
+        record_failure(1u);
+        uart_printf("FAIL code=%u ticks=%u cause=%08x mepc=%08x ra=%08x\n",
+                    g_fail_code,
+                    g_ticks,
+                    frame->cause,
+                    frame->epc,
+                    frame->ra);
+        uart_printf("<<FAIL>>\n");
+        for (;;) {
+        }
+    }
+    check_frame_masked(frame);
+    if (g_exact_frame_check) {
+        if (frame->epc != g_expected_mepc) {
+            record_failure(2u);
+        }
+        if (frame->ra != g_expected_ra) {
+            record_failure(3u);
+        }
+        if (frame->ra < 0x80000000u || frame->ra == 0x00000CC0u || frame->ra < 0x00001000u) {
+            record_failure(14u);
+        }
+        if (frame->sp != g_expected_sp) {
+            record_failure(4u);
+        }
+        if (frame->tp != g_expected_tp) {
+            record_failure(5u);
+        }
+    } else {
+        if (frame->epc < 0x80000000u || frame->epc == 0x00000CC0u) {
+            record_failure(15u);
+        }
+        if (frame->ra < 0x80000000u || frame->ra == 0x00000CC0u) {
+            record_failure(16u);
+        }
+        if (frame->sp < 0x80000000u) {
+            record_failure(17u);
+        }
+        if (frame->tp != (uint32_t) &g_fake_current) {
+            record_failure(18u);
+        }
+    }
+    if (g_last_mscratch_in_handler != 0u) {
+        record_failure(6u);
+    }
+
+    churn_context(frame->epc ^ frame->ra ^ tick);
+
+    clint_ack_timer();
+    g_ticks = tick + 1u;
+}
+
+__attribute__((naked, aligned(4))) static void linux_like_irq_entry(void)
+{
+    __asm__ volatile("csrrw tp, mscratch, tp\n"
+                     "bnez tp, 1f\n"
+                     "csrr tp, mscratch\n"
+                     "1:\n"
+                     "addi sp, sp, -144\n"
+                     "sw   ra, 4(sp)\n"
+                     "sw   gp, 12(sp)\n"
+                     "sw   t0, 20(sp)\n"
+                     "sw   t1, 24(sp)\n"
+                     "sw   t2, 28(sp)\n"
+                     "sw   s0, 32(sp)\n"
+                     "sw   s1, 36(sp)\n"
+                     "sw   a0, 40(sp)\n"
+                     "sw   a1, 44(sp)\n"
+                     "sw   a2, 48(sp)\n"
+                     "sw   a3, 52(sp)\n"
+                     "sw   a4, 56(sp)\n"
+                     "sw   a5, 60(sp)\n"
+                     "sw   a6, 64(sp)\n"
+                     "sw   a7, 68(sp)\n"
+                     "sw   s2, 72(sp)\n"
+                     "sw   s3, 76(sp)\n"
+                     "sw   s4, 80(sp)\n"
+                     "sw   s5, 84(sp)\n"
+                     "sw   s6, 88(sp)\n"
+                     "sw   s7, 92(sp)\n"
+                     "sw   s8, 96(sp)\n"
+                     "sw   s9, 100(sp)\n"
+                     "sw   s10, 104(sp)\n"
+                     "sw   s11, 108(sp)\n"
+                     "sw   t3, 112(sp)\n"
+                     "sw   t4, 116(sp)\n"
+                     "sw   t5, 120(sp)\n"
+                     "sw   t6, 124(sp)\n"
+                     "sw   a0, 140(sp)\n"
+                     "addi t0, sp, 144\n"
+                     "sw   t0, 8(sp)\n"
+                     "csrr t0, mepc\n"
+                     "sw   t0, 0(sp)\n"
+                     "csrr t0, mstatus\n"
+                     "sw   t0, 128(sp)\n"
+                     "csrr t0, mtval\n"
+                     "sw   t0, 132(sp)\n"
+                     "csrr t0, mcause\n"
+                     "sw   t0, 136(sp)\n"
+                     "csrr t0, mscratch\n"
+                     "sw   t0, 16(sp)\n"
+                     "csrw mscratch, x0\n"
+                     "mv   a0, sp\n"
+                     "call linux_like_irq_c\n"
+                     "lw   a0, 128(sp)\n"
+                     "lw   a2, 0(sp)\n"
+                     "sc.w x0, a2, 0(sp)\n"
+                     "csrw mstatus, a0\n"
+                     "csrw mepc, a2\n"
+                     "lw   ra, 4(sp)\n"
+                     "lw   gp, 12(sp)\n"
+                     "lw   tp, 16(sp)\n"
+                     "lw   t0, 20(sp)\n"
+                     "lw   t1, 24(sp)\n"
+                     "lw   t2, 28(sp)\n"
+                     "lw   s0, 32(sp)\n"
+                     "lw   s1, 36(sp)\n"
+                     "lw   a0, 40(sp)\n"
+                     "lw   a1, 44(sp)\n"
+                     "lw   a2, 48(sp)\n"
+                     "lw   a3, 52(sp)\n"
+                     "lw   a4, 56(sp)\n"
+                     "lw   a5, 60(sp)\n"
+                     "lw   a6, 64(sp)\n"
+                     "lw   a7, 68(sp)\n"
+                     "lw   s2, 72(sp)\n"
+                     "lw   s3, 76(sp)\n"
+                     "lw   s4, 80(sp)\n"
+                     "lw   s5, 84(sp)\n"
+                     "lw   s6, 88(sp)\n"
+                     "lw   s7, 92(sp)\n"
+                     "lw   s8, 96(sp)\n"
+                     "lw   s9, 100(sp)\n"
+                     "lw   s10, 104(sp)\n"
+                     "lw   s11, 108(sp)\n"
+                     "lw   t3, 112(sp)\n"
+                     "lw   t4, 116(sp)\n"
+                     "lw   t5, 120(sp)\n"
+                     "lw   t6, 124(sp)\n"
+                     "lw   sp, 8(sp)\n"
+                     "mret\n");
+}
+
+__attribute__((noinline)) static uint32_t idle_once(uint32_t iter)
+{
+    uint32_t before = g_ticks;
+
+    write_tp((uint32_t) &g_fake_current);
+    csr_write(mscratch, 0u);
+    g_exact_frame_check = 1u;
+    clear_frame_checks();
+    clint_set_timer_cmp(clint_rdmtime() + 300u + (iter & 31u));
+    enable_interrupts();
+
+    __asm__ volatile("mv   t2, ra\n"
+                     "mv   t3, sp\n"
+                     "mv   t4, tp\n"
+                     "la   t0, 1f\n"
+                     "la   t1, g_expected_mepc\n"
+                     "sw   t0, 0(t1)\n"
+                     "la   t1, g_expected_ra\n"
+                     "sw   t2, 0(t1)\n"
+                     "la   t1, g_expected_sp\n"
+                     "sw   t3, 0(t1)\n"
+                     "la   t1, g_expected_tp\n"
+                     "sw   t4, 0(t1)\n"
+                     "wfi\n"
+                     "1:\n"
+                     :
+                     :
+                     : "t0", "t1", "t2", "t3", "t4", "memory");
+
+    disable_interrupts();
+
+    if (g_ticks != before + 1u) {
+        record_failure(8u);
+    }
+    if (read_tp() != (uint32_t) &g_fake_current) {
+        record_failure(9u);
+    }
+    if (csr_read(mscratch) != 0u) {
+        record_failure(10u);
+    }
+
+    return churn_context(iter ^ g_ticks);
+}
+
+__attribute__((noinline)) static uint32_t idle_then_poison_ra_once(uint32_t iter)
+{
+    uint32_t before = g_ticks;
+
+    write_tp((uint32_t) &g_fake_current);
+    csr_write(mscratch, 0u);
+    g_exact_frame_check = 1u;
+    clear_frame_checks();
+    clint_set_timer_cmp(clint_rdmtime() + 300u + (iter & 31u));
+    enable_interrupts();
+
+    __asm__ volatile("mv   t2, ra\n"
+                     "mv   t3, sp\n"
+                     "mv   t4, tp\n"
+                     "la   t0, 1f\n"
+                     "la   t1, g_expected_mepc\n"
+                     "sw   t0, 0(t1)\n"
+                     "la   t1, g_expected_ra\n"
+                     "sw   t2, 0(t1)\n"
+                     "la   t1, g_expected_sp\n"
+                     "sw   t3, 0(t1)\n"
+                     "la   t1, g_expected_tp\n"
+                     "sw   t4, 0(t1)\n"
+                     "wfi\n"
+                     "1:\n"
+                     "lui  ra, 0x1\n"
+                     "addi ra, ra, -832\n"
+                     "mv   ra, t2\n"
+                     :
+                     :
+                     : "t0", "t1", "t2", "t3", "t4", "memory");
+
+    disable_interrupts();
+
+    if (g_ticks != before + 1u) {
+        record_failure(11u);
+    }
+    if (read_tp() != (uint32_t) &g_fake_current) {
+        record_failure(12u);
+    }
+    if (csr_read(mscratch) != 0u) {
+        record_failure(13u);
+    }
+
+    return churn_context(0xCC0u ^ iter ^ g_ticks);
+}
+
+__attribute__((noreturn, noinline, used)) void main_on_ddr_stack(void)
+{
+    uint32_t aggregate = 0x2468ACE0u;
+
+    uart_printf("\n=== Linux-like active DDR timer IRQ test ===\n");
+    fill_context();
+    clear_frame_checks();
+    g_fake_current.kernel_sp = (uint32_t) &g_ddr_stack[DDR_STACK_SIZE];
+    g_fake_current.user_sp = 0u;
+    set_trap_handler(&linux_like_irq_entry);
+    disable_interrupts();
+    enable_timer_interrupt();
+
+    for (uint32_t i = 0; i < NORMAL_IRQ_COUNT; i++) {
+        aggregate ^= idle_once(i);
+        if (g_fail_seen) {
+            break;
+        }
+    }
+    for (uint32_t i = 0; i < POISON_IRQ_COUNT && !g_fail_seen; i++) {
+        aggregate ^= idle_then_poison_ra_once(i);
+    }
+    for (uint32_t i = 0; i < ACTIVE_IRQ_COUNT && !g_fail_seen; i++) {
+        aggregate ^= active_until_irq(i);
+    }
+    for (uint32_t i = 0; i < SENTINEL_IRQ_COUNT && !g_fail_seen; i++) {
+        aggregate ^= sentinel_until_irq(i);
+    }
+
+    disable_timer_interrupt();
+    disable_interrupts();
+    clint_ack_timer();
+
+    if (!g_fail_seen && g_ticks == IRQ_COUNT && aggregate != 0u) {
+        uart_printf("ticks=%u checksum=%08x last_mepc=%08x last_ra=%08x\n",
+                    g_ticks,
+                    g_context_checksum,
+                    g_last_mepc,
+                    g_last_ra);
+        uart_printf("<<PASS>>\n");
+    } else {
+        uart_printf(
+            "FAIL code=%u ticks=%u cause=%08x mepc=%08x ra=%08x sp=%08x tp=%08x mscratch=%08x\n",
+            g_fail_code,
+            g_ticks,
+            g_bad_cause,
+            g_last_mepc,
+            g_last_ra,
+            g_last_sp,
+            g_last_tp,
+            g_last_mscratch_in_handler);
+        uart_printf("bad_frame idx=%u tick=%u expected=%08x actual=%08x\n",
+                    g_bad_frame_index,
+                    g_bad_tick,
+                    g_bad_expected,
+                    g_bad_actual);
+        uart_printf("<<FAIL>>\n");
+    }
+
+    for (;;) {
+    }
+}
+
+int main(void)
+{
+    uint32_t stack_top = ((uint32_t) &g_ddr_stack[DDR_STACK_SIZE]) & ~0xFu;
+
+    __asm__ volatile("mv sp, %0\n"
+                     "j  main_on_ddr_stack\n"
+                     :
+                     : "r"(stack_top)
+                     : "memory");
+    __builtin_unreachable();
+}
diff --git a/sw/apps/linux_irq_ddr_test/Makefile b/sw/apps/linux_irq_ddr_test/Makefile
new file mode 100644
index 00000000..765051ad
--- /dev/null
+++ b/sw/apps/linux_irq_ddr_test/Makefile
@@ -0,0 +1,19 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+# Linux-like timer IRQ directed test. Force the whole program into cached DDR
+# even when the generic cocotb runner is in its default BRAM tier.
+override MEM_CONFIG := ddr
+SRC_C := ../../lib/src/uart.c main.c
+include ../../common/common.mk
diff --git a/sw/apps/linux_irq_ddr_test/main.c b/sw/apps/linux_irq_ddr_test/main.c
new file mode 100644
index 00000000..cf1c67fb
--- /dev/null
+++ b/sw/apps/linux_irq_ddr_test/main.c
@@ -0,0 +1,461 @@
+/*
+ *    Copyright 2026 Two Sigma Open Source, LLC
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+/*
+ * Linux-like timer IRQ test, linked and executed from cached DDR.
+ *
+ * The no-MMU Linux hardware failure is an illegal-instruction panic with
+ * ra == epc == 0x00000cc0 after the first machine timer interrupt from idle.
+ * This test keeps the loop much smaller than Linux while preserving the risky
+ * ingredients: DDR-resident code/data, an explicit DDR stack, WFI idle, a
+ * machine-timer IRQ, a Linux-style naked trap entry that saves/restores GPRs on
+ * the current stack, and the csrrw tp,mscratch,tp swap idiom.
+ */
+
+#include <stdint.h>
+
+#include "csr.h"
+#include "trap.h"
+#include "uart.h"
+
+#define ARRAY_LEN(a) ((int) (sizeof(a) / sizeof((a)[0])))
+#define CLINT_MTIMECMP_LO (*(volatile uint32_t *) 0x40014000u)
+#define CLINT_MTIMECMP_HI (*(volatile uint32_t *) 0x40014004u)
+#define CLINT_MTIME_LO (*(volatile uint32_t *) 0x4001BFF8u)
+#define CLINT_MTIME_HI (*(volatile uint32_t *) 0x4001BFFCu)
+#define NORMAL_IRQ_COUNT 16u
+#define POISON_IRQ_COUNT 16u
+#define IRQ_COUNT (NORMAL_IRQ_COUNT + POISON_IRQ_COUNT)
+#define FRAME_WORDS 36u
+#define DDR_STACK_SIZE 4096u
+
+struct linux_pt_regs {
+    uint32_t epc;
+    uint32_t ra;
+    uint32_t sp;
+    uint32_t gp;
+    uint32_t tp;
+    uint32_t t0;
+    uint32_t t1;
+    uint32_t t2;
+    uint32_t s0;
+    uint32_t s1;
+    uint32_t a0;
+    uint32_t a1;
+    uint32_t a2;
+    uint32_t a3;
+    uint32_t a4;
+    uint32_t a5;
+    uint32_t a6;
+    uint32_t a7;
+    uint32_t s2;
+    uint32_t s3;
+    uint32_t s4;
+    uint32_t s5;
+    uint32_t s6;
+    uint32_t s7;
+    uint32_t s8;
+    uint32_t s9;
+    uint32_t s10;
+    uint32_t s11;
+    uint32_t t3;
+    uint32_t t4;
+    uint32_t t5;
+    uint32_t t6;
+    uint32_t status;
+    uint32_t badaddr;
+    uint32_t cause;
+    uint32_t orig_a0;
+};
+
+struct fake_current {
+    uint32_t kernel_sp;
+    uint32_t user_sp;
+    uint32_t marker;
+};
+
+volatile uint32_t g_expected_mepc;
+volatile uint32_t g_expected_ra;
+volatile uint32_t g_expected_sp;
+volatile uint32_t g_expected_tp;
+volatile struct fake_current g_fake_current = {0u, 0u, 0x5441534Bu};
+volatile uint32_t g_ticks;
+volatile uint32_t g_fail_code;
+volatile uint32_t g_fail_seen;
+volatile uint32_t g_last_mepc;
+volatile uint32_t g_last_ra;
+volatile uint32_t g_last_sp;
+volatile uint32_t g_last_tp;
+volatile uint32_t g_last_mscratch_in_handler;
+volatile uint32_t g_context_checksum;
+volatile uint32_t g_context_words[64];
+volatile uint32_t g_frame_snapshots[IRQ_COUNT][FRAME_WORDS];
+
+static uint8_t g_ddr_stack[DDR_STACK_SIZE] __attribute__((aligned(16)));
+
+static inline uint32_t read_tp(void)
+{
+    uint32_t value;
+    __asm__ volatile("mv %0, tp" : "=r"(value));
+    return value;
+}
+
+static inline void write_tp(uint32_t value)
+{
+    __asm__ volatile("mv tp, %0" : : "r"(value) : "memory");
+}
+
+static void record_failure(uint32_t code)
+{
+    if (!g_fail_seen) {
+        g_fail_seen = 1;
+        g_fail_code = code;
+    }
+}
+
+static void fill_context(void)
+{
+    for (int i = 0; i < ARRAY_LEN(g_context_words); i++) {
+        g_context_words[i] = 0x80000000u ^ ((uint32_t) i * 0x10204081u);
+    }
+    g_context_checksum = 0x13579BDFu;
+}
+
+static uint32_t churn_context(uint32_t seed)
+{
+    uint32_t acc = seed ^ g_context_checksum;
+
+    for (int i = 0; i < ARRAY_LEN(g_context_words); i++) {
+        uint32_t value = g_context_words[i];
+        acc ^= value + ((uint32_t) i << 16);
+        acc = (acc << 5) | (acc >> 27);
+        g_context_words[i] = value ^ acc ^ (0x9E3779B9u + (uint32_t) i);
+    }
+
+    g_context_checksum = acc;
+    return acc;
+}
+
+static uint64_t clint_rdmtime(void)
+{
+    uint32_t hi;
+    uint32_t lo;
+    uint32_t hi2;
+
+    do {
+        hi = CLINT_MTIME_HI;
+        lo = CLINT_MTIME_LO;
+        hi2 = CLINT_MTIME_HI;
+    } while (hi != hi2);
+
+    return ((uint64_t) hi << 32) | lo;
+}
+
+static void clint_set_timer_cmp(uint64_t cmp)
+{
+    CLINT_MTIMECMP_HI = 0xFFFFFFFFu;
+    CLINT_MTIMECMP_LO = (uint32_t) cmp;
+    CLINT_MTIMECMP_HI = (uint32_t) (cmp >> 32);
+}
+
+static void clint_ack_timer(void)
+{
+    CLINT_MTIMECMP_HI = 0xFFFFFFFFu;
+    CLINT_MTIMECMP_LO = 0xFFFFFFFFu;
+}
+
+__attribute__((noinline, used)) void linux_like_irq_c(struct linux_pt_regs *frame)
+{
+    uint32_t tick = g_ticks;
+
+    g_last_mepc = frame->epc;
+    g_last_ra = frame->ra;
+    g_last_sp = frame->sp;
+    g_last_tp = frame->tp;
+    g_last_mscratch_in_handler = csr_read(mscratch);
+
+    if (tick < IRQ_COUNT) {
+        for (uint32_t i = 0; i < FRAME_WORDS; i++) {
+            g_frame_snapshots[tick][i] = ((volatile uint32_t *) frame)[i];
+        }
+    }
+
+    if (frame->cause != (MCAUSE_INTERRUPT_BIT | INT_MTI)) {
+        record_failure(1u);
+    }
+    if (frame->epc != g_expected_mepc) {
+        record_failure(2u);
+    }
+    if (frame->ra != g_expected_ra) {
+        record_failure(3u);
+    }
+    if (frame->ra < 0x80000000u || frame->ra == 0x00000CC0u || frame->ra < 0x00001000u) {
+        record_failure(14u);
+    }
+    if (frame->sp != g_expected_sp) {
+        record_failure(4u);
+    }
+    if (frame->tp != g_expected_tp) {
+        record_failure(5u);
+    }
+    if (g_last_mscratch_in_handler != 0u) {
+        record_failure(6u);
+    }
+
+    churn_context(frame->epc ^ frame->ra ^ tick);
+
+    clint_ack_timer();
+    g_ticks = tick + 1u;
+}
+
+__attribute__((naked, aligned(4))) static void linux_like_irq_entry(void)
+{
+    __asm__ volatile("csrrw tp, mscratch, tp\n"
+                     "bnez tp, 1f\n"
+                     "csrr tp, mscratch\n"
+                     "1:\n"
+                     "addi sp, sp, -144\n"
+                     "sw   ra, 4(sp)\n"
+                     "sw   gp, 12(sp)\n"
+                     "sw   t0, 20(sp)\n"
+                     "sw   t1, 24(sp)\n"
+                     "sw   t2, 28(sp)\n"
+                     "sw   s0, 32(sp)\n"
+                     "sw   s1, 36(sp)\n"
+                     "sw   a0, 40(sp)\n"
+                     "sw   a1, 44(sp)\n"
+                     "sw   a2, 48(sp)\n"
+                     "sw   a3, 52(sp)\n"
+                     "sw   a4, 56(sp)\n"
+                     "sw   a5, 60(sp)\n"
+                     "sw   a6, 64(sp)\n"
+                     "sw   a7, 68(sp)\n"
+                     "sw   s2, 72(sp)\n"
+                     "sw   s3, 76(sp)\n"
+                     "sw   s4, 80(sp)\n"
+                     "sw   s5, 84(sp)\n"
+                     "sw   s6, 88(sp)\n"
+                     "sw   s7, 92(sp)\n"
+                     "sw   s8, 96(sp)\n"
+                     "sw   s9, 100(sp)\n"
+                     "sw   s10, 104(sp)\n"
+                     "sw   s11, 108(sp)\n"
+                     "sw   t3, 112(sp)\n"
+                     "sw   t4, 116(sp)\n"
+                     "sw   t5, 120(sp)\n"
+                     "sw   t6, 124(sp)\n"
+                     "sw   a0, 140(sp)\n"
+                     "addi t0, sp, 144\n"
+                     "sw   t0, 8(sp)\n"
+                     "csrr t0, mepc\n"
+                     "sw   t0, 0(sp)\n"
+                     "csrr t0, mstatus\n"
+                     "sw   t0, 128(sp)\n"
+                     "csrr t0, mtval\n"
+                     "sw   t0, 132(sp)\n"
+                     "csrr t0, mcause\n"
+                     "sw   t0, 136(sp)\n"
+                     "csrr t0, mscratch\n"
+                     "sw   t0, 16(sp)\n"
+                     "csrw mscratch, x0\n"
+                     "mv   a0, sp\n"
+                     "call linux_like_irq_c\n"
+                     "lw   a0, 128(sp)\n"
+                     "lw   a2, 0(sp)\n"
+                     "sc.w x0, a2, 0(sp)\n"
+                     "csrw mstatus, a0\n"
+                     "csrw mepc, a2\n"
+                     "lw   ra, 4(sp)\n"
+                     "lw   gp, 12(sp)\n"
+                     "lw   tp, 16(sp)\n"
+                     "lw   t0, 20(sp)\n"
+                     "lw   t1, 24(sp)\n"
+                     "lw   t2, 28(sp)\n"
+                     "lw   s0, 32(sp)\n"
+                     "lw   s1, 36(sp)\n"
+                     "lw   a0, 40(sp)\n"
+                     "lw   a1, 44(sp)\n"
+                     "lw   a2, 48(sp)\n"
+                     "lw   a3, 52(sp)\n"
+                     "lw   a4, 56(sp)\n"
+                     "lw   a5, 60(sp)\n"
+                     "lw   a6, 64(sp)\n"
+                     "lw   a7, 68(sp)\n"
+                     "lw   s2, 72(sp)\n"
+                     "lw   s3, 76(sp)\n"
+                     "lw   s4, 80(sp)\n"
+                     "lw   s5, 84(sp)\n"
+                     "lw   s6, 88(sp)\n"
+                     "lw   s7, 92(sp)\n"
+                     "lw   s8, 96(sp)\n"
+                     "lw   s9, 100(sp)\n"
+                     "lw   s10, 104(sp)\n"
+                     "lw   s11, 108(sp)\n"
+                     "lw   t3, 112(sp)\n"
+                     "lw   t4, 116(sp)\n"
+                     "lw   t5, 120(sp)\n"
+                     "lw   t6, 124(sp)\n"
+                     "lw   sp, 8(sp)\n"
+                     "mret\n");
+}
+
+__attribute__((noinline)) static uint32_t idle_once(uint32_t iter)
+{
+    uint32_t before = g_ticks;
+
+    write_tp((uint32_t) &g_fake_current);
+    csr_write(mscratch, 0u);
+    clint_set_timer_cmp(clint_rdmtime() + 300u + (iter & 31u));
+    enable_interrupts();
+
+    __asm__ volatile("mv   t2, ra\n"
+                     "mv   t3, sp\n"
+                     "mv   t4, tp\n"
+                     "la   t0, 1f\n"
+                     "la   t1, g_expected_mepc\n"
+                     "sw   t0, 0(t1)\n"
+                     "la   t1, g_expected_ra\n"
+                     "sw   t2, 0(t1)\n"
+                     "la   t1, g_expected_sp\n"
+                     "sw   t3, 0(t1)\n"
+                     "la   t1, g_expected_tp\n"
+                     "sw   t4, 0(t1)\n"
+                     "wfi\n"
+                     "1:\n"
+                     :
+                     :
+                     : "t0", "t1", "t2", "t3", "t4", "memory");
+
+    disable_interrupts();
+
+    if (g_ticks != before + 1u) {
+        record_failure(8u);
+    }
+    if (read_tp() != (uint32_t) &g_fake_current) {
+        record_failure(9u);
+    }
+    if (csr_read(mscratch) != 0u) {
+        record_failure(10u);
+    }
+
+    return churn_context(iter ^ g_ticks);
+}
+
+__attribute__((noinline)) static uint32_t idle_then_poison_ra_once(uint32_t iter)
+{
+    uint32_t before = g_ticks;
+
+    write_tp((uint32_t) &g_fake_current);
+    csr_write(mscratch, 0u);
+    clint_set_timer_cmp(clint_rdmtime() + 300u + (iter & 31u));
+    enable_interrupts();
+
+    __asm__ volatile("mv   t2, ra\n"
+                     "mv   t3, sp\n"
+                     "mv   t4, tp\n"
+                     "la   t0, 1f\n"
+                     "la   t1, g_expected_mepc\n"
+                     "sw   t0, 0(t1)\n"
+                     "la   t1, g_expected_ra\n"
+                     "sw   t2, 0(t1)\n"
+                     "la   t1, g_expected_sp\n"
+                     "sw   t3, 0(t1)\n"
+                     "la   t1, g_expected_tp\n"
+                     "sw   t4, 0(t1)\n"
+                     "wfi\n"
+                     "1:\n"
+                     "lui  ra, 0x1\n"
+                     "addi ra, ra, -832\n"
+                     "mv   ra, t2\n"
+                     :
+                     :
+                     : "t0", "t1", "t2", "t3", "t4", "memory");
+
+    disable_interrupts();
+
+    if (g_ticks != before + 1u) {
+        record_failure(11u);
+    }
+    if (read_tp() != (uint32_t) &g_fake_current) {
+        record_failure(12u);
+    }
+    if (csr_read(mscratch) != 0u) {
+        record_failure(13u);
+    }
+
+    return churn_context(0xCC0u ^ iter ^ g_ticks);
+}
+
+__attribute__((noreturn, noinline, used)) void main_on_ddr_stack(void)
+{
+    uint32_t aggregate = 0x2468ACE0u;
+
+    uart_printf("\n=== Linux-like DDR timer IRQ test ===\n");
+    fill_context();
+    g_fake_current.kernel_sp = (uint32_t) &g_ddr_stack[DDR_STACK_SIZE];
+    g_fake_current.user_sp = 0u;
+    set_trap_handler(&linux_like_irq_entry);
+    disable_interrupts();
+    enable_timer_interrupt();
+
+    for (uint32_t i = 0; i < NORMAL_IRQ_COUNT; i++) {
+        aggregate ^= idle_once(i);
+        if (g_fail_seen) {
+            break;
+        }
+    }
+    for (uint32_t i = 0; i < POISON_IRQ_COUNT && !g_fail_seen; i++) {
+        aggregate ^= idle_then_poison_ra_once(i);
+    }
+
+    disable_timer_interrupt();
+    disable_interrupts();
+    clint_ack_timer();
+
+    if (!g_fail_seen && g_ticks == IRQ_COUNT && aggregate != 0u) {
+        uart_printf("ticks=%u checksum=%08x last_mepc=%08x last_ra=%08x\n",
+                    g_ticks,
+                    g_context_checksum,
+                    g_last_mepc,
+                    g_last_ra);
+        uart_printf("<<PASS>>\n");
+    } else {
+        uart_printf("FAIL code=%u ticks=%u mepc=%08x ra=%08x sp=%08x tp=%08x mscratch=%08x\n",
+                    g_fail_code,
+                    g_ticks,
+                    g_last_mepc,
+                    g_last_ra,
+                    g_last_sp,
+                    g_last_tp,
+                    g_last_mscratch_in_handler);
+        uart_printf("<<FAIL>>\n");
+    }
+
+    for (;;) {
+    }
+}
+
+int main(void)
+{
+    uint32_t stack_top = ((uint32_t) &g_ddr_stack[DDR_STACK_SIZE]) & ~0xFu;
+
+    __asm__ volatile("mv sp, %0\n"
+                     "j  main_on_ddr_stack\n"
+                     :
+                     : "r"(stack_top)
+                     : "memory");
+    __builtin_unreachable();
+}
diff --git a/sw/apps/linux_irq_find_next_slot_test/Makefile b/sw/apps/linux_irq_find_next_slot_test/Makefile
new file mode 100644
index 00000000..dd4526e5
--- /dev/null
+++ b/sw/apps/linux_irq_find_next_slot_test/Makefile
@@ -0,0 +1,19 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+# Directed Linux IRQ stack-slot repro. Force the whole program into cached DDR
+# so the callee save slot exercises the same D-side path as the kernel stack.
+override MEM_CONFIG := ddr
+SRC_C := ../../lib/src/uart.c main.c
+include ../../common/common.mk
diff --git a/sw/apps/linux_irq_find_next_slot_test/main.c b/sw/apps/linux_irq_find_next_slot_test/main.c
new file mode 100644
index 00000000..c29f277f
--- /dev/null
+++ b/sw/apps/linux_irq_find_next_slot_test/main.c
@@ -0,0 +1,922 @@
+/*
+ *    Copyright 2026 Two Sigma Open Source, LLC
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+/*
+ * Directed repro for the Linux timer-IRQ failure where _find_next_bit()
+ * returned through ra == 0x00000cc0 after an IRQ. The test poisons the exact
+ * future callee save slot with 0xcc0, enters a callee whose prologue matches
+ * the Linux helper:
+ *
+ *     addi sp, sp, -16
+ *     sw   s0, 8(sp)
+ *     sw   ra, 12(sp)
+ *     addi s0, sp, 16
+ *
+ * The callee then loops in a Linux-shaped find-bit ctz/byte-test block while a
+ * timer phase sweep forces IRQs at many active-code retire boundaries.
+ */
+
+#include <stdint.h>
+
+#include "csr.h"
+#include "trap.h"
+#include "uart.h"
+
+#define CLINT_MTIMECMP_LO (*(volatile uint32_t *) 0x40014000u)
+#define CLINT_MTIMECMP_HI (*(volatile uint32_t *) 0x40014004u)
+#define CLINT_MTIME_LO (*(volatile uint32_t *) 0x4001BFF8u)
+#define CLINT_MTIME_HI (*(volatile uint32_t *) 0x4001BFFCu)
+#define DDR_STACK_SIZE 4096u
+#define FIND_SWEEP_ITERATIONS 64u
+#define TOTAL_ITERATIONS FIND_SWEEP_ITERATIONS
+#define FIND_BITMAP_WORDS 256u
+#define POISON_RA 0x00000CC0u
+
+struct linux_pt_regs {
+    uint32_t epc;
+    uint32_t ra;
+    uint32_t sp;
+    uint32_t gp;
+    uint32_t tp;
+    uint32_t t0;
+    uint32_t t1;
+    uint32_t t2;
+    uint32_t s0;
+    uint32_t s1;
+    uint32_t a0;
+    uint32_t a1;
+    uint32_t a2;
+    uint32_t a3;
+    uint32_t a4;
+    uint32_t a5;
+    uint32_t a6;
+    uint32_t a7;
+    uint32_t s2;
+    uint32_t s3;
+    uint32_t s4;
+    uint32_t s5;
+    uint32_t s6;
+    uint32_t s7;
+    uint32_t s8;
+    uint32_t s9;
+    uint32_t s10;
+    uint32_t s11;
+    uint32_t t3;
+    uint32_t t4;
+    uint32_t t5;
+    uint32_t t6;
+    uint32_t status;
+    uint32_t badaddr;
+    uint32_t cause;
+    uint32_t orig_a0;
+};
+
+struct fake_current {
+    uint32_t kernel_sp;
+    uint32_t user_sp;
+    uint32_t marker;
+};
+
+volatile struct fake_current g_fake_current = {0u, 0u, 0x5354414Bu};
+volatile uint32_t g_ticks;
+volatile uint32_t g_target_tick;
+volatile uint32_t g_current_iter;
+volatile uint32_t g_read_slot_in_handler;
+
+volatile uint32_t g_fail_seen;
+volatile uint32_t g_fail_code;
+volatile uint32_t g_bad_cause;
+volatile uint32_t g_bad_epc;
+volatile uint32_t g_bad_ra;
+
+volatile uint32_t g_expected_slot_addr;
+volatile uint32_t g_expected_caller_slot_addr;
+volatile uint32_t g_expected_saved_ra;
+volatile uint32_t g_poison_readback;
+volatile uint32_t g_caller_poison_readback;
+volatile uint32_t g_caller_sp;
+volatile uint32_t g_callee_sp;
+volatile uint32_t g_callee_ra_saved;
+volatile uint32_t g_slot_during_irq;
+volatile uint32_t g_slot_before_return;
+
+volatile uint32_t g_irq_in_callee;
+volatile uint32_t g_last_mepc;
+volatile uint32_t g_last_ra;
+volatile uint32_t g_last_sp;
+volatile uint32_t g_last_tp;
+volatile uint32_t g_last_mscratch_in_handler;
+volatile uint32_t g_last_slot_addr;
+volatile uint32_t g_irq_in_ctz;
+volatile uint32_t g_seen_ctz_irq;
+volatile uint32_t g_seen_exact_ctz_irq;
+volatile uint32_t g_find_result;
+volatile uint32_t g_exact_result;
+volatile uint32_t g_find_bitmap[FIND_BITMAP_WORDS] __attribute__((aligned(16)));
+
+static uint8_t g_ddr_stack[DDR_STACK_SIZE] __attribute__((aligned(16)));
+
+extern char irq_find_next_ctz_start[];
+extern char irq_find_next_ctz_end[];
+extern char irq_find_next_exact_ctz_start[];
+extern char irq_find_next_exact_ctz_end[];
+
+__attribute__((naked, aligned(4), noinline, used)) void irq_find_next_bit_callee(void);
+__attribute__((naked, aligned(4), noinline, used)) void irq_find_next_bit_exact_callee(void);
+__attribute__((naked, aligned(4), noinline, used)) uint32_t run_find_next_call_window(void);
+__attribute__((naked, aligned(4), noinline, used)) uint32_t run_find_next_exact_call_window(void);
+__attribute__((noreturn, noinline, used)) void stack_slot_bad_return(uint32_t observed);
+__attribute__((noreturn, noinline, used)) void stack_slot_timeout(uint32_t code);
+
+static inline uint32_t read_tp(void)
+{
+    uint32_t value;
+
+    __asm__ volatile("mv %0, tp" : "=r"(value));
+    return value;
+}
+
+static inline void write_tp(uint32_t value)
+{
+    __asm__ volatile("mv tp, %0" : : "r"(value) : "memory");
+}
+
+static uint64_t clint_rdmtime(void)
+{
+    uint32_t hi;
+    uint32_t lo;
+    uint32_t hi2;
+
+    do {
+        hi = CLINT_MTIME_HI;
+        lo = CLINT_MTIME_LO;
+        hi2 = CLINT_MTIME_HI;
+    } while (hi != hi2);
+
+    return ((uint64_t) hi << 32) | lo;
+}
+
+static void clint_set_timer_cmp(uint64_t cmp)
+{
+    CLINT_MTIMECMP_HI = 0xFFFFFFFFu;
+    CLINT_MTIMECMP_LO = (uint32_t) cmp;
+    CLINT_MTIMECMP_HI = (uint32_t) (cmp >> 32);
+}
+
+static void clint_ack_timer(void)
+{
+    CLINT_MTIMECMP_HI = 0xFFFFFFFFu;
+    CLINT_MTIMECMP_LO = 0xFFFFFFFFu;
+}
+
+static void record_failure(uint32_t code)
+{
+    if (!g_fail_seen) {
+        g_fail_seen = 1u;
+        g_fail_code = code;
+        g_bad_cause = csr_read(mcause);
+        g_bad_epc = csr_read(mepc);
+        g_bad_ra = g_last_ra;
+    }
+}
+
+__attribute__((noreturn, noinline)) static void finish_fail(const char *tag)
+{
+    disable_timer_interrupt();
+    disable_external_interrupt();
+    disable_interrupts();
+    clint_ack_timer();
+
+    uart_printf("FAIL %s code=%u iter=%u ticks=%u target=%u cause=%08x\n",
+                tag,
+                g_fail_code,
+                g_current_iter,
+                g_ticks,
+                g_target_tick,
+                g_bad_cause);
+    uart_printf("pc epc=%08x ra=%08x sp=%08x tp=%08x mscratch=%08x\n",
+                g_last_mepc,
+                g_last_ra,
+                g_last_sp,
+                g_last_tp,
+                g_last_mscratch_in_handler);
+    uart_printf("slot addr=%08x irq_addr=%08x poison=%08x irq_slot=%08x before_ret=%08x\n",
+                g_expected_slot_addr,
+                g_last_slot_addr,
+                g_poison_readback,
+                g_slot_during_irq,
+                g_slot_before_return);
+    uart_printf("caller_slot=%08x caller_poison=%08x caller_sp=%08x expected_ra=%08x\n",
+                g_expected_caller_slot_addr,
+                g_caller_poison_readback,
+                g_caller_sp,
+                g_expected_saved_ra);
+    uart_printf("callee_ra=%08x callee_sp=%08x bad_epc=%08x bad_ra=%08x\n",
+                g_callee_ra_saved,
+                g_callee_sp,
+                g_bad_epc,
+                g_bad_ra);
+    uart_printf("<<FAIL>>\n");
+
+    for (;;) {
+    }
+}
+
+__attribute__((noreturn, noinline, used)) void stack_slot_bad_return(uint32_t observed)
+{
+    g_slot_before_return = observed;
+    record_failure(30u);
+    finish_fail("bad_return_slot");
+}
+
+__attribute__((noreturn, noinline, used)) void stack_slot_timeout(uint32_t code)
+{
+    record_failure(code);
+    finish_fail("callee_timeout");
+}
+
+__attribute__((noinline, used)) void linux_like_irq_c(struct linux_pt_regs *frame)
+{
+    uint32_t tick = g_ticks;
+
+    g_last_mepc = frame->epc;
+    g_last_ra = frame->ra;
+    g_last_sp = frame->sp;
+    g_last_tp = frame->tp;
+    g_last_mscratch_in_handler = csr_read(mscratch);
+
+    uint32_t cause_code = frame->cause & ~MCAUSE_INTERRUPT_BIT;
+    if ((frame->cause & MCAUSE_INTERRUPT_BIT) == 0u ||
+        (cause_code != INT_MTI && cause_code != INT_MEI)) {
+        g_bad_epc = frame->epc;
+        g_bad_ra = frame->ra;
+        record_failure(1u);
+        finish_fail("unexpected_trap");
+    }
+
+    if (g_callee_sp != 0u && frame->sp == g_callee_sp) {
+        g_irq_in_callee = 1u;
+        g_last_slot_addr = frame->sp + 12u;
+        if (g_expected_saved_ra != 0u && frame->ra != g_expected_saved_ra) {
+            record_failure(8u);
+        }
+        if (frame->epc >= (uint32_t) irq_find_next_ctz_start &&
+            frame->epc < (uint32_t) irq_find_next_ctz_end) {
+            g_irq_in_ctz = 1u;
+            g_seen_ctz_irq = 1u;
+        }
+        if (frame->epc >= (uint32_t) irq_find_next_exact_ctz_start &&
+            frame->epc < (uint32_t) irq_find_next_exact_ctz_end) {
+            g_seen_exact_ctz_irq = 1u;
+        }
+        if (g_read_slot_in_handler) {
+            g_slot_during_irq = *(volatile uint32_t *) (frame->sp + 12u);
+            if (g_slot_during_irq != g_expected_saved_ra) {
+                record_failure(6u);
+            }
+        }
+    } else if (g_caller_sp != 0u && frame->sp == g_caller_sp) {
+        g_last_slot_addr = frame->sp + 12u;
+        if (g_read_slot_in_handler) {
+            g_slot_during_irq = *(volatile uint32_t *) (frame->sp + 12u);
+        }
+        record_failure(9u);
+    } else {
+        record_failure(2u);
+    }
+
+    if (frame->ra < 0x80000000u || frame->ra == POISON_RA) {
+        record_failure(3u);
+    }
+    if (frame->tp != (uint32_t) &g_fake_current) {
+        record_failure(4u);
+    }
+    if (g_last_mscratch_in_handler != 0u) {
+        record_failure(7u);
+    }
+
+    clint_ack_timer();
+    g_ticks = tick + 1u;
+}
+
+__attribute__((naked, aligned(4))) static void linux_like_irq_entry(void)
+{
+    __asm__ volatile("csrrw tp, mscratch, tp\n"
+                     "bnez tp, 1f\n"
+                     "csrr tp, mscratch\n"
+                     "1:\n"
+                     "addi sp, sp, -144\n"
+                     "sw   ra, 4(sp)\n"
+                     "sw   gp, 12(sp)\n"
+                     "sw   t0, 20(sp)\n"
+                     "sw   t1, 24(sp)\n"
+                     "sw   t2, 28(sp)\n"
+                     "sw   s0, 32(sp)\n"
+                     "sw   s1, 36(sp)\n"
+                     "sw   a0, 40(sp)\n"
+                     "sw   a1, 44(sp)\n"
+                     "sw   a2, 48(sp)\n"
+                     "sw   a3, 52(sp)\n"
+                     "sw   a4, 56(sp)\n"
+                     "sw   a5, 60(sp)\n"
+                     "sw   a6, 64(sp)\n"
+                     "sw   a7, 68(sp)\n"
+                     "sw   s2, 72(sp)\n"
+                     "sw   s3, 76(sp)\n"
+                     "sw   s4, 80(sp)\n"
+                     "sw   s5, 84(sp)\n"
+                     "sw   s6, 88(sp)\n"
+                     "sw   s7, 92(sp)\n"
+                     "sw   s8, 96(sp)\n"
+                     "sw   s9, 100(sp)\n"
+                     "sw   s10, 104(sp)\n"
+                     "sw   s11, 108(sp)\n"
+                     "sw   t3, 112(sp)\n"
+                     "sw   t4, 116(sp)\n"
+                     "sw   t5, 120(sp)\n"
+                     "sw   t6, 124(sp)\n"
+                     "sw   a0, 140(sp)\n"
+                     "addi t0, sp, 144\n"
+                     "sw   t0, 8(sp)\n"
+                     "csrr t0, mepc\n"
+                     "sw   t0, 0(sp)\n"
+                     "csrr t0, mstatus\n"
+                     "sw   t0, 128(sp)\n"
+                     "csrr t0, mtval\n"
+                     "sw   t0, 132(sp)\n"
+                     "csrr t0, mcause\n"
+                     "sw   t0, 136(sp)\n"
+                     "csrr t0, mscratch\n"
+                     "sw   t0, 16(sp)\n"
+                     "csrw mscratch, x0\n"
+                     "mv   a0, sp\n"
+                     "call linux_like_irq_c\n"
+                     "lw   a0, 128(sp)\n"
+                     "lw   a2, 0(sp)\n"
+                     "sc.w x0, a2, 0(sp)\n"
+                     "csrw mstatus, a0\n"
+                     "csrw mepc, a2\n"
+                     "lw   ra, 4(sp)\n"
+                     "lw   gp, 12(sp)\n"
+                     "lw   tp, 16(sp)\n"
+                     "lw   t0, 20(sp)\n"
+                     "lw   t1, 24(sp)\n"
+                     "lw   t2, 28(sp)\n"
+                     "lw   s0, 32(sp)\n"
+                     "lw   s1, 36(sp)\n"
+                     "lw   a0, 40(sp)\n"
+                     "lw   a1, 44(sp)\n"
+                     "lw   a2, 48(sp)\n"
+                     "lw   a3, 52(sp)\n"
+                     "lw   a4, 56(sp)\n"
+                     "lw   a5, 60(sp)\n"
+                     "lw   a6, 64(sp)\n"
+                     "lw   a7, 68(sp)\n"
+                     "lw   s2, 72(sp)\n"
+                     "lw   s3, 76(sp)\n"
+                     "lw   s4, 80(sp)\n"
+                     "lw   s5, 84(sp)\n"
+                     "lw   s6, 88(sp)\n"
+                     "lw   s7, 92(sp)\n"
+                     "lw   s8, 96(sp)\n"
+                     "lw   s9, 100(sp)\n"
+                     "lw   s10, 104(sp)\n"
+                     "lw   s11, 108(sp)\n"
+                     "lw   t3, 112(sp)\n"
+                     "lw   t4, 116(sp)\n"
+                     "lw   t5, 120(sp)\n"
+                     "lw   t6, 124(sp)\n"
+                     "lw   sp, 8(sp)\n"
+                     "mret\n");
+}
+
+__attribute__((naked, aligned(4), noinline, used)) void irq_find_next_bit_callee(void)
+{
+    __asm__ volatile(".option push\n"
+                     ".option rvc\n"
+                     "addi sp, sp, -16\n"
+                     "sw   s0, 8(sp)\n"
+                     "sw   ra, 12(sp)\n"
+                     "addi s0, sp, 16\n"
+                     "bgeu a2, a1, 4f\n"
+                     "srli a4, a2, 5\n"
+                     "slli a5, a4, 2\n"
+                     "add  a0, a0, a5\n"
+                     "lw   a5, 0(a0)\n"
+                     "li   a3, -1\n"
+                     "sll  a3, a3, a2\n"
+                     "not  a5, a5\n"
+                     "and  a5, a5, a3\n"
+                     "bnez a5, 1f\n"
+                     "li   a5, 0x00010000\n"
+                     "1:\n"
+                     ".global irq_find_next_ctz_start\n"
+                     "irq_find_next_ctz_start:\n"
+                     "li   t4, 200000\n"
+                     "2:\n"
+                     "li   a5, 0x00010000\n"
+                     "slli a2, a5, 16\n"
+                     "srli a2, a2, 16\n"
+                     "li   a3, 0\n"
+                     "bnez a2, 3f\n"
+                     "srli a5, a5, 16\n"
+                     "li   a3, 16\n"
+                     "3:\n"
+                     "zext.b a2, a5\n"
+                     "bnez a2, 5f\n"
+                     "addi a3, a3, 8\n"
+                     "srli a5, a5, 8\n"
+                     "5:\n"
+                     "andi a2, a5, 0xf\n"
+                     "bnez a2, 6f\n"
+                     "addi a3, a3, 4\n"
+                     "srli a5, a5, 4\n"
+                     "6:\n"
+                     "andi a2, a5, 0x3\n"
+                     "bnez a2, 7f\n"
+                     "addi a3, a3, 2\n"
+                     "srli a5, a5, 2\n"
+                     "7:\n"
+                     "andi a2, a5, 0x1\n"
+                     "bnez a2, 8f\n"
+                     "addi a3, a3, 1\n"
+                     "8:\n"
+                     "la   t0, g_find_result\n"
+                     "sw   a3, 0(t0)\n"
+                     "la   t0, g_ticks\n"
+                     "lw   t1, 0(t0)\n"
+                     "la   t0, g_target_tick\n"
+                     "lw   t2, 0(t0)\n"
+                     "beq  t1, t2, 9f\n"
+                     "la   t0, g_fail_seen\n"
+                     "lw   t1, 0(t0)\n"
+                     "bnez t1, 9f\n"
+                     "addi t4, t4, -1\n"
+                     "bnez t4, 2b\n"
+                     "li   a0, 31\n"
+                     "lw   s0, 8(sp)\n"
+                     "addi sp, sp, 16\n"
+                     "j    stack_slot_timeout\n"
+                     "4:\n"
+                     "li   a3, 0\n"
+                     "9:\n"
+                     ".global irq_find_next_ctz_end\n"
+                     "irq_find_next_ctz_end:\n"
+                     "lw   ra, 12(sp)\n"
+                     "la   t0, g_slot_before_return\n"
+                     "sw   ra, 0(t0)\n"
+                     "li   t2, 0x80000000\n"
+                     "bltu ra, t2, irq_find_next_bad_return\n"
+                     "lw   s0, 8(sp)\n"
+                     "addi sp, sp, 16\n"
+                     "ret\n"
+                     "irq_find_next_bad_return:\n"
+                     "mv   a0, ra\n"
+                     "lw   s0, 8(sp)\n"
+                     "addi sp, sp, 16\n"
+                     "j    stack_slot_bad_return\n"
+                     ".option pop\n");
+}
+
+__attribute__((naked, aligned(4), noinline, used)) void irq_find_next_bit_exact_callee(void)
+{
+    __asm__ volatile(".option push\n"
+                     ".option rvc\n"
+                     "addi sp, sp, -16\n"
+                     "sw   s0, 8(sp)\n"
+                     "sw   ra, 12(sp)\n"
+                     "addi s0, sp, 16\n"
+                     "bgeu a2, a1, 4f\n"
+                     "srli a5, a2, 5\n"
+                     "slli a4, a5, 2\n"
+                     "add  a0, a0, a4\n"
+                     "lw   a3, 0(a0)\n"
+                     "li   a4, -1\n"
+                     "sll  a4, a4, a2\n"
+                     "and  a4, a4, a3\n"
+                     "bnez a4, 1f\n"
+                     "addi a5, a5, 1\n"
+                     "slli a5, a5, 5\n"
+                     "bgeu a5, a1, 4f\n"
+                     "2:\n"
+                     "lw   a4, 4(a0)\n"
+                     "addi a0, a0, 4\n"
+                     "bnez a4, 3f\n"
+                     "addi a5, a5, 32\n"
+                     "bltu a5, a1, 2b\n"
+                     "4:\n"
+                     "lw   ra, 12(sp)\n"
+                     "la   t0, g_slot_before_return\n"
+                     "sw   ra, 0(t0)\n"
+                     "li   t2, 0x80000000\n"
+                     "bltu ra, t2, irq_find_next_exact_bad_return\n"
+                     "lw   s0, 8(sp)\n"
+                     "mv   a0, a1\n"
+                     "addi sp, sp, 16\n"
+                     "ret\n"
+                     "1:\n"
+                     "slli a5, a5, 5\n"
+                     "3:\n"
+                     ".global irq_find_next_exact_ctz_start\n"
+                     "irq_find_next_exact_ctz_start:\n"
+                     "slli a2, a4, 16\n"
+                     "srli a2, a2, 16\n"
+                     "li   a3, 0\n"
+                     "bnez a2, 5f\n"
+                     "srli a4, a4, 16\n"
+                     "li   a3, 16\n"
+                     "5:\n"
+                     "zext.b a2, a4\n"
+                     "bnez a2, 6f\n"
+                     "addi a3, a3, 8\n"
+                     "srli a4, a4, 8\n"
+                     "6:\n"
+                     "andi a2, a4, 0xf\n"
+                     "bnez a2, 7f\n"
+                     "addi a3, a3, 4\n"
+                     "srli a4, a4, 4\n"
+                     "7:\n"
+                     "andi a2, a4, 0x3\n"
+                     "bnez a2, 8f\n"
+                     "addi a3, a3, 2\n"
+                     "srli a4, a4, 2\n"
+                     "8:\n"
+                     "andi a4, a4, 1\n"
+                     "seqz a4, a4\n"
+                     "add  a3, a3, a4\n"
+                     ".global irq_find_next_exact_ctz_end\n"
+                     "irq_find_next_exact_ctz_end:\n"
+                     "add  a3, a3, a5\n"
+                     "bgeu a3, a1, 4b\n"
+                     "lw   ra, 12(sp)\n"
+                     "la   t0, g_slot_before_return\n"
+                     "sw   ra, 0(t0)\n"
+                     "li   t2, 0x80000000\n"
+                     "bltu ra, t2, irq_find_next_exact_bad_return\n"
+                     "lw   s0, 8(sp)\n"
+                     "mv   a0, a3\n"
+                     "addi sp, sp, 16\n"
+                     "ret\n"
+                     "irq_find_next_exact_bad_return:\n"
+                     "mv   a0, ra\n"
+                     "lw   s0, 8(sp)\n"
+                     "addi sp, sp, 16\n"
+                     "j    stack_slot_bad_return\n"
+                     ".option pop\n");
+}
+
+__attribute__((naked, aligned(4), noinline, used)) uint32_t run_find_next_call_window(void)
+{
+    __asm__ volatile(".option push\n"
+                     ".option rvc\n"
+                     "addi sp, sp, -16\n"
+                     "sw   ra, 0(sp)\n"
+                     "la   t1, g_caller_sp\n"
+                     "sw   sp, 0(t1)\n"
+                     "addi t0, sp, 12\n"
+                     "la   t1, g_expected_caller_slot_addr\n"
+                     "sw   t0, 0(t1)\n"
+                     "li   t2, 0x00000cc0\n"
+                     "sw   t2, 0(t0)\n"
+                     "lw   t3, 0(t0)\n"
+                     "la   t1, g_caller_poison_readback\n"
+                     "sw   t3, 0(t1)\n"
+                     "addi t0, sp, -16\n"
+                     "la   t1, g_callee_sp\n"
+                     "sw   t0, 0(t1)\n"
+                     "addi t0, t0, 12\n"
+                     "la   t1, g_expected_slot_addr\n"
+                     "sw   t0, 0(t1)\n"
+                     "li   t2, 0x00000cc0\n"
+                     "sw   t2, 0(t0)\n"
+                     "lw   t3, 0(t0)\n"
+                     "la   t1, g_poison_readback\n"
+                     "sw   t3, 0(t1)\n"
+                     "la   t1, 1f\n"
+                     "la   t0, g_expected_saved_ra\n"
+                     "sw   t1, 0(t0)\n"
+                     "la   t0, g_callee_ra_saved\n"
+                     "sw   t1, 0(t0)\n"
+                     "la   a0, g_find_bitmap\n"
+                     "li   a1, 8192\n"
+                     "li   a2, 16\n"
+                     "call irq_find_next_bit_callee\n"
+                     "1:\n"
+                     "li   a0, 1\n"
+                     "lw   ra, 0(sp)\n"
+                     "addi sp, sp, 16\n"
+                     "ret\n"
+                     ".option pop\n");
+}
+
+__attribute__((naked, aligned(4), noinline, used)) uint32_t run_find_next_exact_call_window(void)
+{
+    __asm__ volatile(".option push\n"
+                     ".option rvc\n"
+                     "addi sp, sp, -16\n"
+                     "sw   ra, 0(sp)\n"
+                     "la   t1, g_caller_sp\n"
+                     "sw   sp, 0(t1)\n"
+                     "addi t0, sp, 12\n"
+                     "la   t1, g_expected_caller_slot_addr\n"
+                     "sw   t0, 0(t1)\n"
+                     "li   t2, 0x00000cc0\n"
+                     "sw   t2, 0(t0)\n"
+                     "lw   t3, 0(t0)\n"
+                     "la   t1, g_caller_poison_readback\n"
+                     "sw   t3, 0(t1)\n"
+                     "addi t0, sp, -16\n"
+                     "la   t1, g_callee_sp\n"
+                     "sw   t0, 0(t1)\n"
+                     "addi t0, t0, 12\n"
+                     "la   t1, g_expected_slot_addr\n"
+                     "sw   t0, 0(t1)\n"
+                     "li   t2, 0x00000cc0\n"
+                     "sw   t2, 0(t0)\n"
+                     "lw   t3, 0(t0)\n"
+                     "la   t1, g_poison_readback\n"
+                     "sw   t3, 0(t1)\n"
+                     "la   t1, 1f\n"
+                     "la   t0, g_expected_saved_ra\n"
+                     "sw   t1, 0(t0)\n"
+                     "la   t0, g_callee_ra_saved\n"
+                     "sw   t1, 0(t0)\n"
+                     "la   a0, g_find_bitmap\n"
+                     "li   a1, 8192\n"
+                     "li   a2, 16\n"
+                     "call irq_find_next_bit_exact_callee\n"
+                     "1:\n"
+                     "la   t0, g_exact_result\n"
+                     "sw   a0, 0(t0)\n"
+                     "li   a0, 1\n"
+                     "lw   ra, 0(sp)\n"
+                     "addi sp, sp, 16\n"
+                     "ret\n"
+                     ".option pop\n");
+}
+
+static void init_find_bitmap(void)
+{
+    for (uint32_t i = 0; i < FIND_BITMAP_WORDS; i++) {
+        g_find_bitmap[i] = 0xFFFFFFFFu;
+    }
+    g_find_bitmap[0] = 0xFFFEFFFFu;
+}
+
+static void init_find_bitmap_exact(void)
+{
+    for (uint32_t i = 0; i < FIND_BITMAP_WORDS; i++) {
+        g_find_bitmap[i] = 0u;
+    }
+    g_find_bitmap[0] = 0x00010000u;
+}
+
+static void prepare_exact_probe(void)
+{
+    disable_interrupts();
+    disable_timer_interrupt();
+    clint_ack_timer();
+
+    g_current_iter = 0xE0000000u;
+    g_target_tick = g_ticks;
+    g_read_slot_in_handler = 1u;
+    g_expected_slot_addr = 0u;
+    g_expected_caller_slot_addr = 0u;
+    g_expected_saved_ra = 0u;
+    g_poison_readback = 0u;
+    g_caller_poison_readback = 0u;
+    g_caller_sp = 0u;
+    g_callee_sp = 0u;
+    g_callee_ra_saved = 0u;
+    g_slot_during_irq = 0xFFFFFFFFu;
+    g_slot_before_return = 0u;
+    g_irq_in_callee = 0u;
+    g_last_slot_addr = 0u;
+    g_irq_in_ctz = 0u;
+    g_exact_result = 0xFFFFFFFFu;
+
+    write_tp((uint32_t) &g_fake_current);
+    csr_write(mscratch, 0u);
+    enable_external_interrupt();
+    enable_interrupts();
+}
+
+static uint32_t run_exact_probe(void)
+{
+    uint32_t checksum = 0x13579BDFu;
+
+    init_find_bitmap_exact();
+    g_seen_exact_ctz_irq = 0u;
+
+    for (uint32_t i = 0; i < 128u && !g_fail_seen; i++) {
+        prepare_exact_probe();
+        if (run_find_next_exact_call_window() != 1u) {
+            record_failure(70u);
+        }
+        disable_interrupts();
+        clint_ack_timer();
+
+        if (g_exact_result != 16u) {
+            record_failure(71u);
+        }
+        if (g_callee_sp == 0u || g_expected_slot_addr != g_callee_sp + 12u) {
+            record_failure(72u);
+        }
+        if (g_poison_readback != POISON_RA) {
+            record_failure(73u);
+        }
+        if (g_caller_sp == 0u || g_expected_caller_slot_addr != g_caller_sp + 12u) {
+            record_failure(76u);
+        }
+        if (g_caller_poison_readback != POISON_RA) {
+            record_failure(77u);
+        }
+        if (g_slot_before_return != g_expected_saved_ra) {
+            record_failure(74u);
+        }
+        if (g_seen_exact_ctz_irq && g_slot_during_irq != g_expected_saved_ra) {
+            record_failure(75u);
+        }
+
+        checksum ^= g_exact_result ^ g_slot_before_return ^ g_expected_slot_addr;
+        checksum ^= (i << 24) ^ g_ticks;
+        if (g_seen_exact_ctz_irq) {
+            break;
+        }
+    }
+
+    disable_interrupts();
+    disable_external_interrupt();
+    clint_ack_timer();
+    return checksum;
+}
+
+static void prepare_window(uint32_t iter, uint32_t read_slot_in_handler)
+{
+    disable_interrupts();
+    clint_ack_timer();
+
+    g_current_iter = iter;
+    g_target_tick = g_ticks + 1u;
+    g_read_slot_in_handler = read_slot_in_handler;
+    g_expected_slot_addr = 0u;
+    g_expected_caller_slot_addr = 0u;
+    g_expected_saved_ra = 0u;
+    g_poison_readback = 0u;
+    g_caller_poison_readback = 0u;
+    g_caller_sp = 0u;
+    g_callee_sp = 0u;
+    g_callee_ra_saved = 0u;
+    g_slot_during_irq = 0xFFFFFFFFu;
+    g_slot_before_return = 0u;
+    g_irq_in_callee = 0u;
+    g_last_slot_addr = 0u;
+    g_irq_in_ctz = 0u;
+    g_find_result = 0xFFFFFFFFu;
+
+    write_tp((uint32_t) &g_fake_current);
+    csr_write(mscratch, 0u);
+    clint_set_timer_cmp(clint_rdmtime() + 3000u + ((iter * 211u) & 2047u));
+    enable_interrupts();
+}
+
+static uint32_t run_one_window(uint32_t iter, uint32_t read_slot_in_handler)
+{
+    uint32_t returned;
+    uint32_t checksum;
+
+    prepare_window(iter, read_slot_in_handler);
+    returned = run_find_next_call_window();
+    disable_interrupts();
+    clint_ack_timer();
+
+    if (returned != 1u) {
+        record_failure(40u);
+    }
+    if (g_ticks != g_target_tick) {
+        record_failure(41u);
+    }
+    if (!g_irq_in_callee) {
+        record_failure(42u);
+    }
+    if (!g_irq_in_ctz) {
+        record_failure(51u);
+    }
+    if (g_callee_sp == 0u || g_expected_slot_addr != g_callee_sp + 12u) {
+        record_failure(43u);
+    }
+    if (g_poison_readback != POISON_RA) {
+        record_failure(44u);
+    }
+    if (g_caller_sp == 0u || g_expected_caller_slot_addr != g_caller_sp + 12u) {
+        record_failure(53u);
+    }
+    if (g_caller_poison_readback != POISON_RA) {
+        record_failure(54u);
+    }
+    if (g_expected_saved_ra < 0x80000000u || g_expected_saved_ra == POISON_RA) {
+        record_failure(45u);
+    }
+    if (g_callee_ra_saved != g_expected_saved_ra) {
+        record_failure(46u);
+    }
+    if (g_slot_before_return != g_expected_saved_ra) {
+        record_failure(47u);
+    }
+    if (read_slot_in_handler && g_slot_during_irq != g_expected_saved_ra) {
+        record_failure(48u);
+    }
+    if (g_find_result != 16u) {
+        record_failure(52u);
+    }
+    if (read_tp() != (uint32_t) &g_fake_current) {
+        record_failure(49u);
+    }
+    if (csr_read(mscratch) != 0u) {
+        record_failure(50u);
+    }
+
+    checksum = g_slot_before_return ^ g_expected_slot_addr ^ g_last_mepc;
+    checksum ^= (g_current_iter << 16) ^ g_ticks ^ (g_find_result << 8);
+    return checksum;
+}
+
+__attribute__((noreturn, noinline, used)) void main_on_ddr_stack(void)
+{
+    uint32_t checksum = 0xA51C05E0u;
+    uint32_t sweep_start_ticks;
+
+    uart_printf("\n=== Linux IRQ find-next-slot DDR test ===\n");
+
+    g_seen_ctz_irq = 0u;
+    g_fake_current.kernel_sp = (uint32_t) &g_ddr_stack[DDR_STACK_SIZE];
+    g_fake_current.user_sp = 0u;
+    set_trap_handler(&linux_like_irq_entry);
+    disable_interrupts();
+    clint_ack_timer();
+
+    checksum ^= run_exact_probe();
+    if (g_fail_seen) {
+        finish_fail("exact_probe");
+    }
+
+    init_find_bitmap();
+    sweep_start_ticks = g_ticks;
+    enable_timer_interrupt();
+    enable_external_interrupt();
+
+    for (uint32_t i = 0; i < TOTAL_ITERATIONS && !g_fail_seen; i++) {
+        checksum ^= run_one_window(i, 1u);
+    }
+
+    disable_timer_interrupt();
+    disable_external_interrupt();
+    disable_interrupts();
+    clint_ack_timer();
+
+    if (g_fail_seen) {
+        finish_fail("post_check");
+    }
+
+    if (g_ticks == sweep_start_ticks + TOTAL_ITERATIONS && g_seen_ctz_irq && checksum != 0u) {
+        uart_printf("ticks=%u checksum=%08x last_mepc=%08x last_ra=%08x slot=%08x exact_irq=%u\n",
+                    g_ticks,
+                    checksum,
+                    g_last_mepc,
+                    g_last_ra,
+                    g_slot_before_return,
+                    g_seen_exact_ctz_irq);
+        uart_printf("<<PASS>>\n");
+    } else {
+        record_failure(60u);
+        finish_fail("final_count");
+    }
+
+    for (;;) {
+    }
+}
+
+int main(void)
+{
+    uint32_t stack_top = ((uint32_t) &g_ddr_stack[DDR_STACK_SIZE]) & ~0xFu;
+
+    __asm__ volatile("mv sp, %0\n"
+                     "j  main_on_ddr_stack\n"
+                     :
+                     : "r"(stack_top)
+                     : "memory");
+    __builtin_unreachable();
+}
diff --git a/sw/apps/linux_irq_stack_slot_test/Makefile b/sw/apps/linux_irq_stack_slot_test/Makefile
new file mode 100644
index 00000000..dd4526e5
--- /dev/null
+++ b/sw/apps/linux_irq_stack_slot_test/Makefile
@@ -0,0 +1,19 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+# Directed Linux IRQ stack-slot repro. Force the whole program into cached DDR
+# so the callee save slot exercises the same D-side path as the kernel stack.
+override MEM_CONFIG := ddr
+SRC_C := ../../lib/src/uart.c main.c
+include ../../common/common.mk
diff --git a/sw/apps/linux_irq_stack_slot_test/main.c b/sw/apps/linux_irq_stack_slot_test/main.c
new file mode 100644
index 00000000..5101dbd0
--- /dev/null
+++ b/sw/apps/linux_irq_stack_slot_test/main.c
@@ -0,0 +1,565 @@
+/*
+ *    Copyright 2026 Two Sigma Open Source, LLC
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+/*
+ * Directed repro for the Linux timer-IRQ failure where _find_next_zero_bit()
+ * returned through ra == 0x00000cc0 after an IRQ.  The test poisons the exact
+ * future callee save slot with 0xcc0, enters a callee whose prologue matches:
+ *
+ *     addi sp, sp, -16
+ *     sw   s0, 8(sp)
+ *     sw   ra, 12(sp)
+ *     addi s0, sp, 16
+ *
+ * It takes a Linux-like machine timer IRQ while the callee is active, then
+ * checks the later load from 12(sp) before using it as a return address.
+ */
+
+#include <stdint.h>
+
+#include "csr.h"
+#include "trap.h"
+#include "uart.h"
+
+#define CLINT_MTIMECMP_LO (*(volatile uint32_t *) 0x40014000u)
+#define CLINT_MTIMECMP_HI (*(volatile uint32_t *) 0x40014004u)
+#define CLINT_MTIME_LO (*(volatile uint32_t *) 0x4001BFF8u)
+#define CLINT_MTIME_HI (*(volatile uint32_t *) 0x4001BFFCu)
+#define DDR_STACK_SIZE 4096u
+#define NONINTRUSIVE_ITERATIONS 24u
+#define INTRUSIVE_ITERATIONS 8u
+#define TOTAL_ITERATIONS (NONINTRUSIVE_ITERATIONS + INTRUSIVE_ITERATIONS)
+#define POISON_RA 0x00000CC0u
+
+struct linux_pt_regs {
+    uint32_t epc;
+    uint32_t ra;
+    uint32_t sp;
+    uint32_t gp;
+    uint32_t tp;
+    uint32_t t0;
+    uint32_t t1;
+    uint32_t t2;
+    uint32_t s0;
+    uint32_t s1;
+    uint32_t a0;
+    uint32_t a1;
+    uint32_t a2;
+    uint32_t a3;
+    uint32_t a4;
+    uint32_t a5;
+    uint32_t a6;
+    uint32_t a7;
+    uint32_t s2;
+    uint32_t s3;
+    uint32_t s4;
+    uint32_t s5;
+    uint32_t s6;
+    uint32_t s7;
+    uint32_t s8;
+    uint32_t s9;
+    uint32_t s10;
+    uint32_t s11;
+    uint32_t t3;
+    uint32_t t4;
+    uint32_t t5;
+    uint32_t t6;
+    uint32_t status;
+    uint32_t badaddr;
+    uint32_t cause;
+    uint32_t orig_a0;
+};
+
+struct fake_current {
+    uint32_t kernel_sp;
+    uint32_t user_sp;
+    uint32_t marker;
+};
+
+volatile struct fake_current g_fake_current = {0u, 0u, 0x5354414Bu};
+volatile uint32_t g_ticks;
+volatile uint32_t g_target_tick;
+volatile uint32_t g_current_iter;
+volatile uint32_t g_read_slot_in_handler;
+
+volatile uint32_t g_fail_seen;
+volatile uint32_t g_fail_code;
+volatile uint32_t g_bad_cause;
+volatile uint32_t g_bad_epc;
+volatile uint32_t g_bad_ra;
+
+volatile uint32_t g_expected_slot_addr;
+volatile uint32_t g_expected_saved_ra;
+volatile uint32_t g_poison_readback;
+volatile uint32_t g_callee_sp;
+volatile uint32_t g_callee_ra_saved;
+volatile uint32_t g_slot_during_irq;
+volatile uint32_t g_slot_before_return;
+
+volatile uint32_t g_irq_in_callee;
+volatile uint32_t g_last_mepc;
+volatile uint32_t g_last_ra;
+volatile uint32_t g_last_sp;
+volatile uint32_t g_last_tp;
+volatile uint32_t g_last_mscratch_in_handler;
+volatile uint32_t g_last_slot_addr;
+
+static uint8_t g_ddr_stack[DDR_STACK_SIZE] __attribute__((aligned(16)));
+
+__attribute__((naked, aligned(4), noinline, used)) void irq_stack_slot_callee(void);
+__attribute__((naked, aligned(4), noinline, used)) uint32_t run_stack_slot_call_window(void);
+__attribute__((noreturn, noinline, used)) void stack_slot_bad_return(uint32_t observed);
+__attribute__((noreturn, noinline, used)) void stack_slot_timeout(uint32_t code);
+
+static inline uint32_t read_tp(void)
+{
+    uint32_t value;
+
+    __asm__ volatile("mv %0, tp" : "=r"(value));
+    return value;
+}
+
+static inline void write_tp(uint32_t value)
+{
+    __asm__ volatile("mv tp, %0" : : "r"(value) : "memory");
+}
+
+static uint64_t clint_rdmtime(void)
+{
+    uint32_t hi;
+    uint32_t lo;
+    uint32_t hi2;
+
+    do {
+        hi = CLINT_MTIME_HI;
+        lo = CLINT_MTIME_LO;
+        hi2 = CLINT_MTIME_HI;
+    } while (hi != hi2);
+
+    return ((uint64_t) hi << 32) | lo;
+}
+
+static void clint_set_timer_cmp(uint64_t cmp)
+{
+    CLINT_MTIMECMP_HI = 0xFFFFFFFFu;
+    CLINT_MTIMECMP_LO = (uint32_t) cmp;
+    CLINT_MTIMECMP_HI = (uint32_t) (cmp >> 32);
+}
+
+static void clint_ack_timer(void)
+{
+    CLINT_MTIMECMP_HI = 0xFFFFFFFFu;
+    CLINT_MTIMECMP_LO = 0xFFFFFFFFu;
+}
+
+static void record_failure(uint32_t code)
+{
+    if (!g_fail_seen) {
+        g_fail_seen = 1u;
+        g_fail_code = code;
+        g_bad_cause = csr_read(mcause);
+        g_bad_epc = csr_read(mepc);
+        g_bad_ra = g_last_ra;
+    }
+}
+
+__attribute__((noreturn, noinline)) static void finish_fail(const char *tag)
+{
+    disable_timer_interrupt();
+    disable_external_interrupt();
+    disable_interrupts();
+    clint_ack_timer();
+
+    uart_printf("FAIL %s code=%u iter=%u ticks=%u target=%u cause=%08x\n",
+                tag,
+                g_fail_code,
+                g_current_iter,
+                g_ticks,
+                g_target_tick,
+                g_bad_cause);
+    uart_printf("pc epc=%08x ra=%08x sp=%08x tp=%08x mscratch=%08x\n",
+                g_last_mepc,
+                g_last_ra,
+                g_last_sp,
+                g_last_tp,
+                g_last_mscratch_in_handler);
+    uart_printf("slot addr=%08x irq_addr=%08x poison=%08x irq_slot=%08x before_ret=%08x\n",
+                g_expected_slot_addr,
+                g_last_slot_addr,
+                g_poison_readback,
+                g_slot_during_irq,
+                g_slot_before_return);
+    uart_printf("expected_ra=%08x callee_ra=%08x callee_sp=%08x bad_epc=%08x bad_ra=%08x\n",
+                g_expected_saved_ra,
+                g_callee_ra_saved,
+                g_callee_sp,
+                g_bad_epc,
+                g_bad_ra);
+    uart_printf("<<FAIL>>\n");
+
+    for (;;) {
+    }
+}
+
+__attribute__((noreturn, noinline, used)) void stack_slot_bad_return(uint32_t observed)
+{
+    g_slot_before_return = observed;
+    record_failure(30u);
+    finish_fail("bad_return_slot");
+}
+
+__attribute__((noreturn, noinline, used)) void stack_slot_timeout(uint32_t code)
+{
+    record_failure(code);
+    finish_fail("callee_timeout");
+}
+
+__attribute__((noinline, used)) void linux_like_irq_c(struct linux_pt_regs *frame)
+{
+    uint32_t tick = g_ticks;
+
+    g_last_mepc = frame->epc;
+    g_last_ra = frame->ra;
+    g_last_sp = frame->sp;
+    g_last_tp = frame->tp;
+    g_last_mscratch_in_handler = csr_read(mscratch);
+
+    uint32_t cause_code = frame->cause & ~MCAUSE_INTERRUPT_BIT;
+    if ((frame->cause & MCAUSE_INTERRUPT_BIT) == 0u ||
+        (cause_code != INT_MTI && cause_code != INT_MEI)) {
+        g_bad_epc = frame->epc;
+        g_bad_ra = frame->ra;
+        record_failure(1u);
+        finish_fail("unexpected_trap");
+    }
+
+    if (g_callee_sp != 0u && frame->sp == g_callee_sp) {
+        g_irq_in_callee = 1u;
+        g_last_slot_addr = frame->sp + 12u;
+        if (g_read_slot_in_handler) {
+            g_slot_during_irq = *(volatile uint32_t *) (frame->sp + 12u);
+        }
+    } else {
+        record_failure(2u);
+    }
+
+    if (frame->ra < 0x80000000u || frame->ra == POISON_RA) {
+        record_failure(3u);
+    }
+    if (frame->tp != (uint32_t) &g_fake_current) {
+        record_failure(4u);
+    }
+    if (g_last_mscratch_in_handler != 0u) {
+        record_failure(5u);
+    }
+
+    clint_ack_timer();
+    g_ticks = tick + 1u;
+}
+
+__attribute__((naked, aligned(4))) static void linux_like_irq_entry(void)
+{
+    __asm__ volatile("csrrw tp, mscratch, tp\n"
+                     "bnez tp, 1f\n"
+                     "csrr tp, mscratch\n"
+                     "1:\n"
+                     "addi sp, sp, -144\n"
+                     "sw   ra, 4(sp)\n"
+                     "sw   gp, 12(sp)\n"
+                     "sw   t0, 20(sp)\n"
+                     "sw   t1, 24(sp)\n"
+                     "sw   t2, 28(sp)\n"
+                     "sw   s0, 32(sp)\n"
+                     "sw   s1, 36(sp)\n"
+                     "sw   a0, 40(sp)\n"
+                     "sw   a1, 44(sp)\n"
+                     "sw   a2, 48(sp)\n"
+                     "sw   a3, 52(sp)\n"
+                     "sw   a4, 56(sp)\n"
+                     "sw   a5, 60(sp)\n"
+                     "sw   a6, 64(sp)\n"
+                     "sw   a7, 68(sp)\n"
+                     "sw   s2, 72(sp)\n"
+                     "sw   s3, 76(sp)\n"
+                     "sw   s4, 80(sp)\n"
+                     "sw   s5, 84(sp)\n"
+                     "sw   s6, 88(sp)\n"
+                     "sw   s7, 92(sp)\n"
+                     "sw   s8, 96(sp)\n"
+                     "sw   s9, 100(sp)\n"
+                     "sw   s10, 104(sp)\n"
+                     "sw   s11, 108(sp)\n"
+                     "sw   t3, 112(sp)\n"
+                     "sw   t4, 116(sp)\n"
+                     "sw   t5, 120(sp)\n"
+                     "sw   t6, 124(sp)\n"
+                     "sw   a0, 140(sp)\n"
+                     "addi t0, sp, 144\n"
+                     "sw   t0, 8(sp)\n"
+                     "csrr t0, mepc\n"
+                     "sw   t0, 0(sp)\n"
+                     "csrr t0, mstatus\n"
+                     "sw   t0, 128(sp)\n"
+                     "csrr t0, mtval\n"
+                     "sw   t0, 132(sp)\n"
+                     "csrr t0, mcause\n"
+                     "sw   t0, 136(sp)\n"
+                     "csrr t0, mscratch\n"
+                     "sw   t0, 16(sp)\n"
+                     "csrw mscratch, x0\n"
+                     "mv   a0, sp\n"
+                     "call linux_like_irq_c\n"
+                     "lw   a0, 128(sp)\n"
+                     "lw   a2, 0(sp)\n"
+                     "sc.w x0, a2, 0(sp)\n"
+                     "csrw mstatus, a0\n"
+                     "csrw mepc, a2\n"
+                     "lw   ra, 4(sp)\n"
+                     "lw   gp, 12(sp)\n"
+                     "lw   tp, 16(sp)\n"
+                     "lw   t0, 20(sp)\n"
+                     "lw   t1, 24(sp)\n"
+                     "lw   t2, 28(sp)\n"
+                     "lw   s0, 32(sp)\n"
+                     "lw   s1, 36(sp)\n"
+                     "lw   a0, 40(sp)\n"
+                     "lw   a1, 44(sp)\n"
+                     "lw   a2, 48(sp)\n"
+                     "lw   a3, 52(sp)\n"
+                     "lw   a4, 56(sp)\n"
+                     "lw   a5, 60(sp)\n"
+                     "lw   a6, 64(sp)\n"
+                     "lw   a7, 68(sp)\n"
+                     "lw   s2, 72(sp)\n"
+                     "lw   s3, 76(sp)\n"
+                     "lw   s4, 80(sp)\n"
+                     "lw   s5, 84(sp)\n"
+                     "lw   s6, 88(sp)\n"
+                     "lw   s7, 92(sp)\n"
+                     "lw   s8, 96(sp)\n"
+                     "lw   s9, 100(sp)\n"
+                     "lw   s10, 104(sp)\n"
+                     "lw   s11, 108(sp)\n"
+                     "lw   t3, 112(sp)\n"
+                     "lw   t4, 116(sp)\n"
+                     "lw   t5, 120(sp)\n"
+                     "lw   t6, 124(sp)\n"
+                     "lw   sp, 8(sp)\n"
+                     "mret\n");
+}
+
+__attribute__((naked, aligned(4), noinline, used)) void irq_stack_slot_callee(void)
+{
+    __asm__ volatile(".option push\n"
+                     ".option rvc\n"
+                     "addi sp, sp, -16\n"
+                     "sw   s0, 8(sp)\n"
+                     "sw   ra, 12(sp)\n"
+                     "addi s0, sp, 16\n"
+                     "la   t0, g_callee_sp\n"
+                     "sw   sp, 0(t0)\n"
+                     "la   t0, g_callee_ra_saved\n"
+                     "sw   ra, 0(t0)\n"
+                     "li   t4, 200000\n"
+                     "1:\n"
+                     "la   t0, g_ticks\n"
+                     "lw   t1, 0(t0)\n"
+                     "la   t0, g_target_tick\n"
+                     "lw   t2, 0(t0)\n"
+                     "beq  t1, t2, 3f\n"
+                     "la   t0, g_fail_seen\n"
+                     "lw   t1, 0(t0)\n"
+                     "bnez t1, 3f\n"
+                     "addi t4, t4, -1\n"
+                     "bnez t4, 1b\n"
+                     "li   a0, 31\n"
+                     "lw   s0, 8(sp)\n"
+                     "addi sp, sp, 16\n"
+                     "j    stack_slot_timeout\n"
+                     "3:\n"
+                     "lw   ra, 12(sp)\n"
+                     "la   t0, g_slot_before_return\n"
+                     "sw   ra, 0(t0)\n"
+                     "li   t2, 0x80000000\n"
+                     "bltu ra, t2, 2f\n"
+                     "lw   s0, 8(sp)\n"
+                     "addi sp, sp, 16\n"
+                     "ret\n"
+                     "2:\n"
+                     "mv   a0, ra\n"
+                     "lw   s0, 8(sp)\n"
+                     "addi sp, sp, 16\n"
+                     "j    stack_slot_bad_return\n"
+                     ".option pop\n");
+}
+
+__attribute__((naked, aligned(4), noinline, used)) uint32_t run_stack_slot_call_window(void)
+{
+    __asm__ volatile(".option push\n"
+                     ".option rvc\n"
+                     "addi sp, sp, -16\n"
+                     "sw   ra, 0(sp)\n"
+                     "addi t0, sp, -4\n"
+                     "la   t1, g_expected_slot_addr\n"
+                     "sw   t0, 0(t1)\n"
+                     "li   t2, 0x00000cc0\n"
+                     "sw   t2, 0(t0)\n"
+                     "lw   t3, 0(t0)\n"
+                     "la   t1, g_poison_readback\n"
+                     "sw   t3, 0(t1)\n"
+                     "la   t1, 1f\n"
+                     "la   t0, g_expected_saved_ra\n"
+                     "sw   t1, 0(t0)\n"
+                     "call irq_stack_slot_callee\n"
+                     "1:\n"
+                     "li   a0, 1\n"
+                     "lw   ra, 0(sp)\n"
+                     "addi sp, sp, 16\n"
+                     "ret\n"
+                     ".option pop\n");
+}
+
+static void prepare_window(uint32_t iter, uint32_t read_slot_in_handler)
+{
+    disable_interrupts();
+    clint_ack_timer();
+
+    g_current_iter = iter;
+    g_target_tick = g_ticks + 1u;
+    g_read_slot_in_handler = read_slot_in_handler;
+    g_expected_slot_addr = 0u;
+    g_expected_saved_ra = 0u;
+    g_poison_readback = 0u;
+    g_callee_sp = 0u;
+    g_callee_ra_saved = 0u;
+    g_slot_during_irq = 0xFFFFFFFFu;
+    g_slot_before_return = 0u;
+    g_irq_in_callee = 0u;
+    g_last_slot_addr = 0u;
+
+    write_tp((uint32_t) &g_fake_current);
+    csr_write(mscratch, 0u);
+    clint_set_timer_cmp(clint_rdmtime() + 3000u + ((iter * 211u) & 1023u));
+    enable_interrupts();
+}
+
+static uint32_t run_one_window(uint32_t iter, uint32_t read_slot_in_handler)
+{
+    uint32_t returned;
+    uint32_t checksum;
+
+    prepare_window(iter, read_slot_in_handler);
+    returned = run_stack_slot_call_window();
+    disable_interrupts();
+    clint_ack_timer();
+
+    if (returned != 1u) {
+        record_failure(40u);
+    }
+    if (g_ticks != g_target_tick) {
+        record_failure(41u);
+    }
+    if (!g_irq_in_callee) {
+        record_failure(42u);
+    }
+    if (g_callee_sp == 0u || g_expected_slot_addr != g_callee_sp + 12u) {
+        record_failure(43u);
+    }
+    if (g_poison_readback != POISON_RA) {
+        record_failure(44u);
+    }
+    if (g_expected_saved_ra < 0x80000000u || g_expected_saved_ra == POISON_RA) {
+        record_failure(45u);
+    }
+    if (g_callee_ra_saved != g_expected_saved_ra) {
+        record_failure(46u);
+    }
+    if (g_slot_before_return != g_expected_saved_ra) {
+        record_failure(47u);
+    }
+    if (read_slot_in_handler && g_slot_during_irq != g_expected_saved_ra) {
+        record_failure(48u);
+    }
+    if (read_tp() != (uint32_t) &g_fake_current) {
+        record_failure(49u);
+    }
+    if (csr_read(mscratch) != 0u) {
+        record_failure(50u);
+    }
+
+    checksum = g_slot_before_return ^ g_expected_slot_addr ^ g_last_mepc;
+    checksum ^= (g_current_iter << 16) ^ g_ticks ^ (read_slot_in_handler << 31);
+    return checksum;
+}
+
+__attribute__((noreturn, noinline, used)) void main_on_ddr_stack(void)
+{
+    uint32_t checksum = 0xA51C05E0u;
+
+    uart_printf("\n=== Linux IRQ stack-slot DDR test ===\n");
+
+    g_fake_current.kernel_sp = (uint32_t) &g_ddr_stack[DDR_STACK_SIZE];
+    g_fake_current.user_sp = 0u;
+    set_trap_handler(&linux_like_irq_entry);
+    disable_interrupts();
+    clint_ack_timer();
+    enable_timer_interrupt();
+    enable_external_interrupt();
+
+    for (uint32_t i = 0; i < NONINTRUSIVE_ITERATIONS && !g_fail_seen; i++) {
+        checksum ^= run_one_window(i, 0u);
+    }
+    for (uint32_t i = 0; i < INTRUSIVE_ITERATIONS && !g_fail_seen; i++) {
+        checksum ^= run_one_window(NONINTRUSIVE_ITERATIONS + i, 1u);
+    }
+
+    disable_timer_interrupt();
+    disable_external_interrupt();
+    disable_interrupts();
+    clint_ack_timer();
+
+    if (g_fail_seen) {
+        finish_fail("post_check");
+    }
+
+    if (g_ticks == TOTAL_ITERATIONS && checksum != 0u) {
+        uart_printf("ticks=%u checksum=%08x last_mepc=%08x last_ra=%08x slot=%08x\n",
+                    g_ticks,
+                    checksum,
+                    g_last_mepc,
+                    g_last_ra,
+                    g_slot_before_return);
+        uart_printf("<<PASS>>\n");
+    } else {
+        record_failure(60u);
+        finish_fail("final_count");
+    }
+
+    for (;;) {
+    }
+}
+
+int main(void)
+{
+    uint32_t stack_top = ((uint32_t) &g_ddr_stack[DDR_STACK_SIZE]) & ~0xFu;
+
+    __asm__ volatile("mv sp, %0\n"
+                     "j  main_on_ddr_stack\n"
+                     :
+                     : "r"(stack_top)
+                     : "memory");
+    __builtin_unreachable();
+}
diff --git a/sw/apps/mret_drain_deadlock/Makefile b/sw/apps/mret_drain_deadlock/Makefile
new file mode 100644
index 00000000..21271c3f
--- /dev/null
+++ b/sw/apps/mret_drain_deadlock/Makefile
@@ -0,0 +1,22 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+#    Copyright 2026 Two Sigma Open Source, LLC
+#    SPDX-License-Identifier: Apache-2.0
+# MRET-drain deadlock directed test (one-shot o_mret_start vs draining store).
+# Program runs from BRAM (simple boot); only the store buffer lives in the cached
+# DDR region (a loaded .ddr_data section) so the behavioral DDR model serves its
+# lines and cached stores actually drain (sq_committed_empty toggles).
+SRC_C := main.c
+include ../../common/common.mk
diff --git a/sw/apps/mret_drain_deadlock/main.c b/sw/apps/mret_drain_deadlock/main.c
new file mode 100644
index 00000000..6e5451c3
--- /dev/null
+++ b/sw/apps/mret_drain_deadlock/main.c
@@ -0,0 +1,171 @@
+/*
+ *    Copyright 2026 Two Sigma Open Source, LLC
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+/*
+ * MRET-drain deadlock directed test (deterministic).
+ *
+ * Reproduces the residual flaky HANG seen booting no-MMU Linux on Genesys2:
+ * the kernel intermittently wedges at the first idle/clocksource machine-timer
+ * activity. Proven root cause (FROST RTL):
+ *
+ *   o_mret_start (reorder_buffer.sv) is a strict ONE-CYCLE pulse asserted only
+ *   on the SERIAL_IDLE->SERIAL_MRET_EXEC cycle (unlike o_trap_pending, it has no
+ *   SERIAL_*_WAIT sustaining term). trap_unit.sv take_mret requires
+ *   i_sq_committed_empty IN THAT SAME CYCLE and has no retry. So if a committed
+ *   store is still draining when an MRET reaches the ROB head, take_mret misses
+ *   its only shot: mret_taken/mret_done never assert and the serializer wedges
+ *   in SERIAL_MRET_EXEC forever (commit_stall=1 freezes the core). There is no
+ *   escape -- the stuck MRET never restores MIE, so no later interrupt can flush
+ *   the pipeline back to SERIAL_IDLE.
+ *
+ * Why the existing tests miss it: mtimer_stress / wfi_mepc_test /
+ * mret_timer_resume_test all keep the handler stack in low BRAM (drains in ~1
+ * cycle, so sq_committed_empty is already 1 when the MRET arrives) and never
+ * create the "MRET reaches head while a committed CACHED/DDR store is mid-drain"
+ * window. The real kernel saves/restores its trap frame on the cached DDR kernel
+ * stack and idles (WFI) so the ROB empties and the restore MRET reaches head
+ * almost immediately -- exactly this window.
+ *
+ * This test makes the window DETERMINISTIC and timer-independent: an M-mode loop
+ * commits a backlog of distinct-line stores into the cached/DDR region (slow,
+ * serialized write-back drains => sq_committed_empty held 0 for many cycles),
+ * then immediately executes an MRET back to the loop top. On buggy RTL the very
+ * first MRET wedges in SERIAL_MRET_EXEC and the loop never prints <<PASS>> (the
+ * cocotb harness times out, and the optional FROST_MRET_DEADLOCK_PROBE asserts
+ * on serial_state stuck in SERIAL_MRET_EXEC). On fixed RTL every MRET waits out
+ * the drain, completes, and the loop prints <<PASS>>.
+ *
+ * The cocotb registration bakes in the Genesys2 cached shape (-GCACHED_HAS_L2=0,
+ * L1 -> DDR direct), which is where the bug manifests on hardware and where a
+ * cold cached-store write-back actually drains in sim, so the standard flow just
+ * works:
+ *   cd frost/tests; make clean; ./test_run_cocotb.py mret_drain_deadlock
+ * BEFORE the cpu_ooo/reorder_buffer o_mret_start fix: the first MRET wedges in
+ * SERIAL_MRET_EXEC and the harness times out. AFTER the fix: <<PASS>>.
+ */
+
+#include <stdint.h>
+
+#include "trap.h"
+
+/* Store buffer in the cached/DDR region (CACHED_BASE = 0x8000_0000). Placed in
+ * a loaded .ddr_data section (like ddr_atomic_test): it lives in the behavioral
+ * DDR model and -- unlike .bss -- is NOT touched by crt0, so the loop's first
+ * stores to it MISS the L1 and take the full ~DDR_MODEL_LATENCY write-back drain
+ * (fill from valid DDR, then write), reliably holding sq_committed_empty low
+ * while the MRET reaches the ROB head. Non-zero initializer forces it loaded. */
+__attribute__((section(".ddr_data"), aligned(64))) static volatile uint32_t g_ddr_buf[256] = {1};
+
+static void uart_putc(char c)
+{
+    UART_TX = (uint8_t) c;
+}
+static void uart_puts(const char *s)
+{
+    while (*s)
+        uart_putc(*s++);
+}
+static void uart_hex(uint32_t v)
+{
+    static const char hex[] = "0123456789ABCDEF";
+    uart_puts("0x");
+    for (int i = 28; i >= 0; i -= 4)
+        uart_putc(hex[(v >> i) & 0xF]);
+}
+
+/*
+ * Unexpected-trap canary. Nothing in this test should trap (no interrupts are
+ * enabled and every access is legal); if one does (e.g. an unexpected fault),
+ * spin emitting 'T' so the failure is visible over UART instead of a silent
+ * wild jump. Naked: entered as a raw trap handler.
+ */
+__attribute__((naked, aligned(4))) static void trap_canary(void)
+{
+    __asm__ volatile("li   t0, 0x40000000\n" /* UART_TX */
+                     "li   t1, 'T'\n"
+                     "1:\n"
+                     "sb   t1, 0(t0)\n"
+                     "j    1b\n");
+}
+
+/*
+ * Commit a backlog of distinct-line cached/DDR stores, then MRET back to the top
+ * of the loop -- `iters` times. The MRET is the loop back-edge, reached a handful
+ * of cycles after the youngest store commits, while that store (and the rest of
+ * the backlog) is still draining => the one-shot o_mret_start pulse coincides
+ * with sq_committed_empty==0.
+ *
+ * a0 = cached/DDR buffer base, a1 = iteration count. Naked: hand-written control
+ * flow (the MRET is the loop branch). Uses only caller-saved temporaries, so the
+ * final `ret` returns to C with ra intact.
+ */
+__attribute__((naked)) static void mret_drain_loop(volatile uint32_t *ddr, uint32_t iters)
+{
+    (void) ddr;
+    (void) iters;
+    __asm__ volatile(
+        /* MRET return target = loop top. Constant, so set mepc ONCE; MRET reads
+         * mepc but never writes it. */
+        "la   t1, 1f\n"
+        "csrw mepc, t1\n"
+        "li   t2, 0x1800\n" /* mstatus.MPP = M (0b11 << 11) mask */
+        "1:\n"
+        "beqz a1, 3f\n" /* done after `iters` MRETs */
+        "addi a1, a1, -1\n"
+        /* MPP=M re-set here (BEFORE the backlog), since MRET pops MPP to U. Kept
+         * off the youngest-store->MRET critical path so NO instruction sits
+         * between the last store and the MRET. */
+        "csrs mstatus, t2\n"
+        /* A few stores to distinct 32 B lines (64 B apart). Enough that the
+         * youngest committed store is still in its (cached/DDR) write-back drain
+         * when the MRET reaches the ROB head, but few enough not to overflow the
+         * store queue (which would wedge on backpressure, not on the MRET). */
+        "sw   a1, 0(a0)\n"
+        "sw   a1, 64(a0)\n"
+        "sw   a1, 128(a0)\n"
+        "sw   a1, 192(a0)\n" /* youngest committed store; still draining at MRET */
+        /* MRET immediately follows the youngest store: it reaches the ROB head a
+         * couple cycles later, while that store (and the backlog) is still
+         * draining => the one-shot o_mret_start pulse coincides with
+         * sq_committed_empty==0. */
+        "mret\n"
+        "3:\n"
+        "ret\n" ::
+            : "t0", "t1", "t2", "a0", "a1", "memory");
+}
+
+int main(void)
+{
+    uart_puts("\r\n=== MRET drain-deadlock repro ===\r\n");
+
+    /* Any unexpected trap becomes visible rather than a silent wild jump. */
+    set_trap_handler(&trap_canary);
+
+    /* No interrupts: this deadlock is purely the MRET<->store-drain handshake. */
+    (void) disable_interrupts();
+
+    uart_puts("running MRET/drain loop...\r\n");
+    mret_drain_loop(g_ddr_buf, 16u);
+
+    /* Only reached if every MRET completed (fixed RTL). On buggy RTL the first
+     * MRET wedges the serializer and we never get here. */
+    uart_puts("survived all MRETs: iters=");
+    uart_hex(16u);
+    uart_puts("\r\n<<PASS>>\r\n");
+    for (;;) {
+    }
+    return 0;
+}
diff --git a/sw/apps/mret_timer_resume_test/Makefile b/sw/apps/mret_timer_resume_test/Makefile
new file mode 100644
index 00000000..bd2f906b
--- /dev/null
+++ b/sw/apps/mret_timer_resume_test/Makefile
@@ -0,0 +1,17 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+# Makefile for the MRET-to-U-mode + pending-timer interrupt-resume-PC test
+SRC_C := main.c
+include ../../common/common.mk
diff --git a/sw/apps/mret_timer_resume_test/main.c b/sw/apps/mret_timer_resume_test/main.c
new file mode 100644
index 00000000..f07086a9
--- /dev/null
+++ b/sw/apps/mret_timer_resume_test/main.c
@@ -0,0 +1,193 @@
+/*
+ *    Copyright 2026 Two Sigma Open Source, LLC
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+/*
+ * MRET-to-U-mode + already-pending machine timer: interrupt-resume-PC (mepc)
+ * directed test.
+ *
+ * Reproduces the Linux no-MMU boot panic where a U-mode context illegally
+ * executes the kernel's M-mode MRET (ret_from_exception). Root cause under
+ * test: when an MRET returns to U-mode it retires via the trap/MRET full
+ * flush, NOT via the normal commit path, so the core's `interrupt_resume_pc`
+ * register is never updated to the MRET target. It keeps holding the
+ * architectural next-PC of the instruction before the MRET -- i.e. the MRET
+ * instruction's own PC. The trap unit only inhibits interrupts for the two
+ * cycles around the MRET (i_mret_start, mret_taken_prev). If a machine timer
+ * is pending, it becomes eligible the moment privilege drops below M and is
+ * taken a few cycles later, BEFORE the first U-mode instruction commits and
+ * refreshes interrupt_resume_pc. The trap therefore saves
+ *   mepc = interrupt_resume_pc = <MRET instruction PC>.
+ * Linux later restores that trap frame and MRETs to the kernel MRET PC while
+ * in U-mode -> illegal instruction (signal 4) -> "Attempted to kill init".
+ *
+ * Test shape (mirrors umode_test's timer-preempts-U case, but with the timer
+ * ALREADY pending at MRET time so it fires in the vulnerable post-MRET
+ * window):
+ *
+ *   1. M-mode installs a naked handler at mtvec that records, for the FIRST
+ *      trap only, mcause, mepc (the saved resume PC) and mstatus.MPP.
+ *   2. Make the machine timer permanently pending (mtimecmp = 0) while in
+ *      M-mode with MIE=0 (so it cannot fire in M-mode).
+ *   3. MRET into a tiny U-mode spin (`u_spin: j .`). Machine interrupts are
+ *      taken below M regardless of MIE, so the pending timer preempts U
+ *      immediately.
+ *   4. The handler runs; we then assert the saved resume PC points at u_spin
+ *      (the MRET target) and is NOT the MRET instruction's own PC.
+ *
+ * PASS: mcause == 0x8000_0007 (machine timer), trapped-from-priv == U, and
+ *       mepc == &u_spin.
+ * FAIL (the bug): mepc == <MRET PC in run_in_umode_pending_timer> != &u_spin.
+ */
+
+#include <stdint.h>
+
+#include "trap.h"
+
+/* ---- minimal UART (UART_TX is provided by mmio.h via trap.h) ---- */
+static void uart_putc(char c)
+{
+    UART_TX = (uint8_t) c;
+}
+
+static void uart_puts(const char *s)
+{
+    while (*s)
+        uart_putc(*s++);
+}
+
+static void uart_hex(uint32_t v)
+{
+    static const char hex[] = "0123456789ABCDEF";
+    uart_puts("0x");
+    for (int i = 28; i >= 0; i -= 4)
+        uart_putc(hex[(v >> i) & 0xF]);
+}
+
+/* ---- trap state shared with the naked handler ---- */
+static volatile uint32_t g_cause;
+static volatile uint32_t g_mepc;      /* saved resume PC of the FIRST trap     */
+static volatile uint32_t g_from_priv; /* mstatus.MPP at trap entry = prev priv */
+
+/*
+ * Naked M-mode trap handler. For the first trap only, records mcause, mepc and
+ * the trapping privilege (mstatus.MPP). Then pushes mtimecmp to max (acks the
+ * timer so it cannot refire), and returns to M-mode at the continuation
+ * address stashed in mscratch with MPP=M. Bouncing to a fixed continuation
+ * (rather than resuming U-mode) means clobbering temporaries here is safe.
+ */
+__attribute__((naked, aligned(4))) static void mret_timer_trap_handler(void)
+{
+    __asm__ volatile("csrr t0, mcause\n"
+                     "lui  t1, %hi(g_cause)\n"
+                     "lw   t2, %lo(g_cause)(t1)\n"
+                     "li   t3, -1\n" /* sentinel: only the FIRST trap records */
+                     "bne  t2, t3, 2f\n"
+                     "sw   t0, %lo(g_cause)(t1)\n"
+                     "csrr t0, mepc\n" /* saved resume PC of this trap */
+                     "lui  t1, %hi(g_mepc)\n"
+                     "sw   t0, %lo(g_mepc)(t1)\n"
+                     "csrr t0, mstatus\n"
+                     "srli t0, t0, 11\n"
+                     "andi t0, t0, 0x3\n" /* mstatus.MPP */
+                     "lui  t1, %hi(g_from_priv)\n"
+                     "sw   t0, %lo(g_from_priv)(t1)\n"
+                     "2:\n"
+                     "li   t1, 0x4000001C\n" /* MTIMECMP_HI: push compare to max to ack timer */
+                     "li   t0, -1\n"
+                     "sw   t0, 0(t1)\n"
+                     "csrr t0, mscratch\n" /* M-mode continuation set by run_in_umode */
+                     "csrw mepc, t0\n"
+                     "li   t0, 0x1800\n" /* MPP = M (0b11 << 11) */
+                     "csrs mstatus, t0\n"
+                     "mret\n");
+}
+
+/*
+ * Enter U-mode at ufn with the machine timer ALREADY pending; the handler
+ * returns control to the instruction after the MRET. The MRET here is the
+ * instruction whose PC must NOT leak into the timer trap's mepc.
+ */
+static uint32_t run_in_umode_pending_timer(void (*ufn)(void))
+{
+    g_cause = 0xFFFFFFFFu;
+    g_mepc = 0u;
+    g_from_priv = 0xFFFFFFFFu;
+    __asm__ volatile("la   t0, 1f\n"
+                     "csrw mscratch, t0\n" /* where the handler returns */
+                     "li   t0, 0x1800\n"
+                     "csrc mstatus, t0\n" /* MPP = U (00) */
+                     "csrw mepc, %0\n"
+                     "mret\n" /* -> U-mode at ufn; pending timer preempts here */
+                     "1:\n"
+                     :
+                     : "r"(ufn)
+                     : "t0", "t1", "t2", "memory");
+    return g_cause;
+}
+
+/* U-mode body: spin in place. naked so its first (and only) instruction is the
+ * jump, making the architectural resume PC of any preempting interrupt exactly
+ * &u_spin. */
+__attribute__((naked)) static void u_spin(void)
+{
+    __asm__ volatile("j .");
+}
+
+int main(void)
+{
+    uart_puts("\r\n=== MRET->U timer-resume mepc test ===\r\n");
+    set_trap_handler(&mret_timer_trap_handler);
+
+    /* Machine interrupts off in M (MIE=0), and MPIE=0 so U also runs with
+     * MIE=0. The machine timer still preempts U-mode (priv != M). */
+    (void) disable_interrupts();
+    csr_clear(mstatus, MSTATUS_MPIE);
+    enable_timer_interrupt(); /* mie.MTIE = 1 */
+
+    /* Make the machine timer permanently pending BEFORE the MRET-to-U so it
+     * preempts at the first eligible cycle after privilege drops to U -- the
+     * window in which interrupt_resume_pc may still hold the MRET's own PC. */
+    set_timer_cmp(0); /* mtime >= 0 always => MTIP asserted */
+
+    uint32_t cause = run_in_umode_pending_timer(&u_spin);
+    disable_timer_interrupt();
+
+    uint32_t mepc = g_mepc;
+    uint32_t want_pc = (uint32_t) &u_spin;
+    int ok = (cause == 0x80000007u) && (g_from_priv == 0u) && (mepc == want_pc);
+
+    uart_puts("cause=");
+    uart_hex(cause);
+    uart_puts(" from_priv=");
+    uart_hex(g_from_priv);
+    uart_puts(" resume_mepc=");
+    uart_hex(mepc);
+    uart_puts(" want_pc(u_spin)=");
+    uart_hex(want_pc);
+    uart_puts("\r\n");
+
+    if (!ok) {
+        uart_puts("[FAIL] timer trap saved a wrong resume PC "
+                  "(stale interrupt_resume_pc around MRET-to-U)\r\n");
+    } else {
+        uart_puts("[PASS] timer trap resumed at the U-mode target\r\n");
+    }
+
+    uart_puts(ok ? "\r\n<<PASS>>\r\n" : "\r\n<<FAIL>>\r\n");
+    for (;;) {
+    }
+    return 0;
+}
diff --git a/sw/apps/mtimer_stress/Makefile b/sw/apps/mtimer_stress/Makefile
new file mode 100644
index 00000000..46e0a7a3
--- /dev/null
+++ b/sw/apps/mtimer_stress/Makefile
@@ -0,0 +1,19 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+#    Copyright 2026 Two Sigma Open Source, LLC
+#    SPDX-License-Identifier: Apache-2.0
+# Machine-timer + MRET deadlock stress test
+SRC_C := main.c
+include ../../common/common.mk
diff --git a/sw/apps/mtimer_stress/main.c b/sw/apps/mtimer_stress/main.c
new file mode 100644
index 00000000..753f00ad
--- /dev/null
+++ b/sw/apps/mtimer_stress/main.c
@@ -0,0 +1,140 @@
+/*
+ *    Copyright 2026 Two Sigma Open Source, LLC
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+/*
+ * Machine-timer + MRET deadlock stress test.
+ *
+ * Reproduce target: the residual flaky hang seen booting no-MMU Linux on
+ * hardware. It is memory-size- and board-state-independent, ~50% of boots, and
+ * frequently hangs at the first periodic machine-timer interrupts (right after
+ * the kernel switches to the CLINT clocksource). The U-mode interrupt-resume-PC
+ * fix (cpu_ooo.sv) and the kernel MIE-clear patch made it flaky instead of
+ * deterministic but did not close it -> a residual machine-timer trap-return
+ * race in the FROST trap/MRET/flush machinery.
+ *
+ * This is the full linux_boot in miniature: an M-mode loop preempted by a
+ * machine timer firing very frequently, the handler doing a real MRET back to
+ * the loop, with the timer PHASE swept (period re-armed to mtime + 512..575 each
+ * tick) so the timer lands at every cycle offset around the MRET / in the loop
+ * across many hundreds of ticks. If a timer landing at a bad cycle deadlocks
+ * the pipeline, the loop counter stops advancing and `<<PASS>>` is never
+ * printed -> the cocotb harness times out (reproduced). If it survives all
+ * phases for the whole run, it prints `<<PASS>>`.
+ */
+
+#include <stdint.h>
+
+#include "trap.h"
+
+static void uart_putc(char c)
+{
+    UART_TX = (uint8_t) c;
+}
+static void uart_puts(const char *s)
+{
+    while (*s)
+        uart_putc(*s++);
+}
+static void uart_hex(uint32_t v)
+{
+    static const char hex[] = "0123456789ABCDEF";
+    uart_puts("0x");
+    for (int i = 28; i >= 0; i -= 4)
+        uart_putc(hex[(v >> i) & 0xF]);
+}
+
+volatile uint32_t g_irq;  /* timer-interrupt count (also drives the phase sweep) */
+volatile uint32_t g_loop; /* loop progress marker */
+static volatile uint32_t buf[64];
+
+/*
+ * Naked M-mode timer handler: re-arm the timer to fire again in 512..575 cycles
+ * (period = 512 + (g_irq & 0x3f), so the phase relative to the loop/MRET drifts
+ * every tick and sweeps the whole window), bump g_irq, and MRET back to the
+ * interrupted loop. Trap entry cleared MIE; the MRET restores it from MPIE, so
+ * the next timer fires back in the loop -- exactly the kernel's pattern with
+ * the MIE-clear patch applied. Saves only the regs it uses; everything else is
+ * preserved by not touching it.
+ *
+ * NOTE: the period MUST exceed the trap->handler->MRET->resume round-trip
+ * (~90 cycles here: two full flush_all pipeline wipes per tick + store-drain).
+ * An earlier 24..87-cycle period was SHORTER than the round-trip, so the timer
+ * was perpetually overdue and the handler saturated the core -- main never
+ * advanced and <<PASS>> never printed. That was an interrupt-saturation
+ * livelock in the TEST, not an RTL deadlock (the pipeline kept retiring and the
+ * timer was serviced every tick); 512 gives main net forward progress while
+ * still preempting it at every swept phase across ~hundreds of ticks.
+ */
+__attribute__((naked, aligned(4))) static void mtimer_handler(void)
+{
+    __asm__ volatile("addi sp, sp, -16\n"
+                     "sw   t0, 0(sp)\n"
+                     "sw   t1, 4(sp)\n"
+                     "sw   t2, 8(sp)\n"
+                     "lui  t0, %hi(g_irq)\n"
+                     "lw   t1, %lo(g_irq)(t0)\n"
+                     "andi t2, t1, 0x3f\n"
+                     "addi t2, t2, 512\n" /* period = 512 + (g_irq & 0x3f); see note below */
+                     "addi t1, t1, 1\n"
+                     "sw   t1, %lo(g_irq)(t0)\n" /* g_irq++ */
+                     "li   t0, 0x40000010\n"     /* MTIME_LO */
+                     "lw   t1, 0(t0)\n"
+                     "add  t1, t1, t2\n"
+                     "li   t0, 0x40000018\n" /* MTIMECMP_LO (HI stays 0, set in main) */
+                     "sw   t1, 0(t0)\n"
+                     "lw   t0, 0(sp)\n"
+                     "lw   t1, 4(sp)\n"
+                     "lw   t2, 8(sp)\n"
+                     "addi sp, sp, 16\n"
+                     "mret\n");
+}
+
+int main(void)
+{
+    uart_puts("\r\n=== mtimer MRET deadlock stress ===\r\n");
+    set_trap_handler(&mtimer_handler);
+    for (int i = 0; i < 64; i++)
+        buf[i] = (uint32_t) i;
+
+    /* Arm a frequent machine timer; handler re-arms each tick (phase sweep). */
+    MTIMECMP_HI = 0;
+    MTIMECMP_LO = (uint32_t) rdmtime() + 40;
+    enable_timer_interrupt(); /* mie.MTIE */
+    enable_interrupts();      /* mstatus.MIE */
+
+    /* Loop with loads/stores/ALU so the timer preempts varied pipeline state
+     * (in-flight memory ops, branches) at every swept phase. */
+    uint32_t acc = 0;
+    for (uint32_t i = 0; i < 20000u; i++) {
+        g_loop = i;
+        uint32_t k = i & 63u;
+        acc += buf[k];
+        acc ^= (acc << 1) | (acc >> 3);
+        buf[k] = acc + i;
+    }
+
+    disable_timer_interrupt();
+    uart_puts("survived: loop=");
+    uart_hex(g_loop);
+    uart_puts(" irqs=");
+    uart_hex(g_irq);
+    uart_puts(" acc=");
+    uart_hex(acc);
+    uart_puts("\r\n<<PASS>>\r\n");
+    for (;;) {
+    }
+    return 0;
+}
diff --git a/sw/apps/ns16550_test/Makefile b/sw/apps/ns16550_test/Makefile
new file mode 100644
index 00000000..6e3b306c
--- /dev/null
+++ b/sw/apps/ns16550_test/Makefile
@@ -0,0 +1,17 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+# Makefile for the ns16550a UART face directed test
+SRC_C := main.c
+include ../../common/common.mk
diff --git a/sw/apps/ns16550_test/main.c b/sw/apps/ns16550_test/main.c
new file mode 100644
index 00000000..becc0021
--- /dev/null
+++ b/sw/apps/ns16550_test/main.c
@@ -0,0 +1,100 @@
+/*
+ *    Copyright 2026 Two Sigma Open Source, LLC
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+/*
+ * ns16550a UART face directed test (Increment 1 of the no-MMU Linux glue).
+ *
+ * FROST presents a word-stride 16550 register face at 0x4000_1000 (DTB
+ * reg-shift=2, reg-io-width=4) that aliases the native UART TX/RX, so a stock
+ * Linux 8250 console driver can drive it. This test runs the 8250 init dance
+ * (DLAB/baud, 8N1, FIFO, MCR), checks the register file and TX-ready status,
+ * and transmits a banner THROUGH the face (which must appear on the UART TX
+ * line). PASS/FAIL is emitted over the known-good native UART so the verdict
+ * is independent of the face under test.
+ */
+
+#include <stdint.h>
+
+/* Native FROST UART (known-good) -- used only for the PASS/FAIL marker. */
+#define NATIVE_TX (*(volatile uint32_t *) 0x40000000u)
+#define NATIVE_TX_ST (*(volatile uint32_t *) 0x40000028u)
+static void n_putc(char c)
+{
+    while (!(NATIVE_TX_ST & 1u)) {
+    }
+    NATIVE_TX = (uint8_t) c;
+}
+static void n_puts(const char *s)
+{
+    while (*s)
+        n_putc(*s++);
+}
+
+/* ns16550a face @ 0x4000_1000, word stride. */
+#define NS(off) (*(volatile uint32_t *) (uintptr_t) (0x40001000u + (off)))
+#define NS_THR NS(0x00)
+#define NS_IER NS(0x04)
+#define NS_IIR NS(0x08)
+#define NS_FCR NS(0x08)
+#define NS_LCR NS(0x0C)
+#define NS_MCR NS(0x10)
+#define NS_LSR NS(0x14)
+#define NS_SCR NS(0x1C)
+
+static void ns_init(void)
+{
+    NS_IER = 0x00u; /* polled (no interrupts wired) */
+    NS_LCR = 0x80u; /* DLAB = 1 */
+    NS_THR = 0x01u; /* DLL (baud divisor low) -- FROST ignores the divisor */
+    NS_IER = 0x00u; /* DLM (baud divisor high) */
+    NS_LCR = 0x03u; /* DLAB = 0, 8N1 */
+    NS_FCR = 0x07u; /* enable + clear RX/TX FIFOs */
+    NS_MCR = 0x03u; /* DTR | RTS */
+}
+static void ns_putc(char c)
+{
+    while (!(NS_LSR & 0x20u)) { /* wait for THRE */
+    }
+    NS_THR = (uint8_t) c;
+}
+static void ns_puts(const char *s)
+{
+    while (*s)
+        ns_putc(*s++);
+}
+
+int main(void)
+{
+    int ok = 1;
+
+    ns_init();
+    ok &= ((NS_LCR & 0xFFu) == 0x03u); /* LCR readback: 8N1, DLAB clear */
+    ok &= ((NS_LSR & 0x60u) == 0x60u); /* THRE | TEMT set (TX ready) */
+    ok &= ((NS_IIR & 0x01u) == 0x01u); /* no interrupt pending */
+
+    NS_SCR = 0xA5u; /* scratch register is read/write */
+    ok &= ((NS_SCR & 0xFFu) == 0xA5u);
+    NS_SCR = 0x5Au;
+    ok &= ((NS_SCR & 0xFFu) == 0x5Au);
+
+    /* Transmit a banner THROUGH the ns16550 face; it must reach the UART TX. */
+    ns_puts("[ns16550 face: TX path OK]\r\n");
+
+    n_puts(ok ? "\r\n<<PASS>>\r\n" : "\r\n<<FAIL>>\r\n");
+    for (;;) {
+    }
+    return 0;
+}
diff --git a/sw/apps/pde_return_hazard/Makefile b/sw/apps/pde_return_hazard/Makefile
new file mode 100644
index 00000000..b521507f
--- /dev/null
+++ b/sw/apps/pde_return_hazard/Makefile
@@ -0,0 +1,21 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+
+override MEM_CONFIG := ddr
+SRC_C := ../../lib/src/uart.c main.c
+include ../../common/common.mk
diff --git a/sw/apps/pde_return_hazard/main.c b/sw/apps/pde_return_hazard/main.c
new file mode 100644
index 00000000..b1960737
--- /dev/null
+++ b/sw/apps/pde_return_hazard/main.c
@@ -0,0 +1,851 @@
+/*
+ *    Copyright 2026 Two Sigma Open Source, LLC
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+/*
+ * Directed reproducer for the procfs /proc lookup failure seen on hardware.
+ *
+ * The Linux failure shows proc_get_inode() receiving a pointer that looks like
+ * a proc_dir_entry.subdir_node, not the proc_dir_entry base. The hot epilogue in
+ * pde_subdir_find() subtracts the rb_node offset from s1, then returns it via
+ * a0 shortly before restoring the caller's s1.
+ */
+
+#include <stdint.h>
+
+#include "uart.h"
+
+#define ITERATIONS 64u
+#define PDE_VIS_ITERATIONS 16u
+#define PDE_VIS_CHURN_BYTES (16u * 1024u)
+#define PDE_SUBDIR_NODE_OFFSET 80u
+#define PDE_SUBDIR_ROOT_OFFSET 76u
+#define PDE_REFCOUNT_OFFSET 4u
+#define PDE_NLINK_OFFSET 48u
+#define PDE_UID_OFFSET 52u
+#define PDE_GID_OFFSET 56u
+#define PDE_NAME_OFFSET 92u
+#define PDE_MODE_OFFSET 96u
+#define PDE_FLAGS_OFFSET 98u
+#define PDE_NAMELEN_OFFSET 99u
+#define PDE_INLINE_NAME_OFFSET 100u
+#define RB_RIGHT_OFFSET 4u
+#define RB_LEFT_OFFSET 8u
+#define RB_NAME_OFFSET 12u
+#define RB_NAMELEN_OFFSET 19u
+#define MULTI_PDE_COUNT 5u
+#define PDE_MODE_REG_0444 0x8124u
+
+static uint8_t root_pde[128] __attribute__((aligned(64)));
+static uint8_t entry_pde[128] __attribute__((aligned(64)));
+static uint8_t multi_pdes[MULTI_PDE_COUNT][128] __attribute__((aligned(64)));
+static uint8_t fake_dir[32] __attribute__((aligned(64)));
+static uint8_t fake_dentry[40] __attribute__((aligned(64)));
+static uint8_t s2l_area[64 * 1024] __attribute__((aligned(64)));
+static const char version_name[] = "version";
+static const char cmdline_name[] = "cmdline";
+static const char loadavg_name[] = "loadavg";
+static const char maps_name[] = "maps";
+static const char meminfo_name[] = "meminfo";
+
+static volatile uintptr_t observed_de;
+static volatile uintptr_t observed_sb;
+static volatile uint32_t observed_ref_old;
+static volatile uint32_t observed_mode;
+static volatile uint32_t observed_namelen;
+
+static void churn_cache(uint32_t seed);
+
+__attribute__((noinline, naked, used, aligned(4))) static uintptr_t
+epilogue_repro(uintptr_t node, uintptr_t salt2, uintptr_t salt3)
+{
+    __asm__ volatile("addi sp, sp, -32\n"
+                     "sw   s0, 24(sp)\n"
+                     "sw   ra, 28(sp)\n"
+                     "sw   s1, 20(sp)\n"
+                     "sw   s2, 16(sp)\n"
+                     "sw   s3, 12(sp)\n"
+                     "addi s0, sp, 32\n"
+                     "mv   s1, a0\n"
+                     "mv   s2, a1\n"
+                     "mv   s3, a2\n"
+                     "xor  a5, s2, s3\n"
+                     "andi a5, a5, 1\n"
+                     "beqz a5, 1f\n"
+                     "addi s1, s1, 0\n"
+                     "1:\n"
+                     "lw   ra, 28(sp)\n"
+                     "lw   s0, 24(sp)\n"
+                     "addi s1, s1, -80\n"
+                     "lw   s2, 16(sp)\n"
+                     "lw   s3, 12(sp)\n"
+                     "mv   a0, s1\n"
+                     "lw   s1, 20(sp)\n"
+                     "addi sp, sp, 32\n"
+                     "ret\n");
+}
+
+__attribute__((noinline, naked, used, aligned(4))) static uintptr_t
+epilogue_direct_a0(uintptr_t node, uintptr_t salt2, uintptr_t salt3)
+{
+    __asm__ volatile("addi sp, sp, -32\n"
+                     "sw   s0, 24(sp)\n"
+                     "sw   ra, 28(sp)\n"
+                     "sw   s1, 20(sp)\n"
+                     "sw   s2, 16(sp)\n"
+                     "sw   s3, 12(sp)\n"
+                     "addi s0, sp, 32\n"
+                     "mv   s1, a0\n"
+                     "mv   s2, a1\n"
+                     "mv   s3, a2\n"
+                     "xor  a5, s2, s3\n"
+                     "andi a5, a5, 1\n"
+                     "beqz a5, 1f\n"
+                     "addi s1, s1, 0\n"
+                     "1:\n"
+                     "lw   ra, 28(sp)\n"
+                     "lw   s0, 24(sp)\n"
+                     "addi a0, s1, -80\n"
+                     "lw   s2, 16(sp)\n"
+                     "lw   s3, 12(sp)\n"
+                     "lw   s1, 20(sp)\n"
+                     "addi sp, sp, 32\n"
+                     "ret\n");
+}
+
+static volatile uintptr_t sink;
+static volatile uint32_t s2l_sink;
+
+__attribute__((noinline, used)) static uint32_t halfword_s2l(uint8_t *ptr, uint32_t value)
+{
+    uint32_t out;
+
+    __asm__ volatile("sh  %[value], 0(%[ptr])\n"
+                     "lhu %[out], 0(%[ptr])\n"
+                     : [out] "=r"(out)
+                     : [ptr] "r"(ptr), [value] "r"(value)
+                     : "memory");
+    return out;
+}
+
+__attribute__((noinline, used)) static uint32_t amo_halfword_s2l(uint8_t *ptr, uint32_t value)
+{
+    uint32_t out;
+
+    __asm__ volatile("li t0, 1\n"
+                     "addi t1, %[ptr], 4\n"
+                     "amoadd.w zero, t0, (t1)\n"
+                     "sh  %[value], 0(%[ptr])\n"
+                     "lhu %[out], 0(%[ptr])\n"
+                     : [out] "=r"(out)
+                     : [ptr] "r"(ptr), [value] "r"(value)
+                     : "t0", "t1", "memory");
+    return out;
+}
+
+__attribute__((noinline, used)) static uint32_t word_s2l(uint8_t *ptr, uint32_t value)
+{
+    uint32_t out;
+
+    __asm__ volatile("sw %[value], 0(%[ptr])\n"
+                     "lw %[out], 0(%[ptr])\n"
+                     : [out] "=r"(out)
+                     : [ptr] "r"(ptr), [value] "r"(value)
+                     : "memory");
+    return out;
+}
+
+__attribute__((noinline, used)) static int
+hazard_memcmp(const void *lhs, const void *rhs, uint32_t len)
+{
+    const uint8_t *a = (const uint8_t *) lhs;
+    const uint8_t *b = (const uint8_t *) rhs;
+
+    for (uint32_t i = 0; i < len; i++) {
+        if (a[i] != b[i]) {
+            return (int) a[i] - (int) b[i];
+        }
+    }
+    return 0;
+}
+
+__attribute__((noinline, used)) static uintptr_t fake_proc_get_inode(uintptr_t sb, uintptr_t de)
+{
+    uint32_t mode;
+
+    __asm__ volatile("lhu %0, 96(%1)" : "=r"(mode) : "r"(de) : "memory");
+    observed_sb = sb;
+    observed_de = de;
+    observed_mode = mode;
+    observed_namelen = *(volatile uint8_t *) (uintptr_t) (de + PDE_NAMELEN_OFFSET);
+    return 0x12345678u;
+}
+
+__attribute__((noinline, naked, used, aligned(4))) static void pde_init_version_asm(uintptr_t de)
+{
+    __asm__ volatile("addi t0, a0, 100\n"
+                     "sw   t0, 92(a0)\n"
+                     "li   t1, 0x73726576\n" /* "vers" */
+                     "sw   t1, 100(a0)\n"
+                     "li   t1, 0x006e6f69\n" /* "ion\\0" */
+                     "sw   t1, 104(a0)\n"
+                     "li   t1, 1\n"
+                     "sw   t1, 4(a0)\n"
+                     "addi t2, a0, 8\n"
+                     "sw   t2, 8(a0)\n"
+                     "li   t3, 0x8124\n"
+                     "sh   t3, 96(a0)\n"
+                     "sw   t1, 48(a0)\n"
+                     "sw   zero, 76(a0)\n"
+                     "li   t4, 7\n"
+                     "sb   t4, 99(a0)\n"
+                     "sw   t2, 12(a0)\n"
+                     "sw   zero, 52(a0)\n"
+                     "sw   zero, 56(a0)\n"
+                     "ret\n");
+}
+
+__attribute__((noinline, naked, used, aligned(4))) static uintptr_t
+pde_subdir_find_asm(uintptr_t de, const char *name, uint32_t len)
+{
+    __asm__ volatile("addi sp, sp, -32\n"
+                     "sw   s0, 24(sp)\n"
+                     "sw   ra, 28(sp)\n"
+                     "sw   s1, 20(sp)\n"
+                     "addi s0, sp, 32\n"
+                     "lw   s1, 76(a0)\n"
+                     "beqz s1, 4f\n"
+                     "sw   s2, 16(sp)\n"
+                     "sw   s3, 12(sp)\n"
+                     "mv   s2, a2\n"
+                     "mv   s3, a1\n"
+                     "1:\n"
+                     "lbu  a5, 19(s1)\n"
+                     "mv   a2, s2\n"
+                     "mv   a0, s3\n"
+                     "bltu s2, a5, 5f\n"
+                     "bltu a5, s2, 2f\n"
+                     "lw   a1, 12(s1)\n"
+                     "call hazard_memcmp\n"
+                     "bltz a0, 5f\n"
+                     "beqz a0, 6f\n"
+                     "2:\n"
+                     "lw   s1, 4(s1)\n"
+                     "bnez s1, 1b\n"
+                     "3:\n"
+                     "lw   s2, 16(sp)\n"
+                     "lw   s3, 12(sp)\n"
+                     "4:\n"
+                     "lw   ra, 28(sp)\n"
+                     "lw   s0, 24(sp)\n"
+                     "mv   a0, s1\n"
+                     "lw   s1, 20(sp)\n"
+                     "addi sp, sp, 32\n"
+                     "ret\n"
+                     "5:\n"
+                     "lw   s1, 8(s1)\n"
+                     "bnez s1, 1b\n"
+                     "j    3b\n"
+                     "6:\n"
+                     "lw   ra, 28(sp)\n"
+                     "lw   s0, 24(sp)\n"
+                     "addi s1, s1, -80\n"
+                     "lw   s2, 16(sp)\n"
+                     "lw   s3, 12(sp)\n"
+                     "mv   a0, s1\n"
+                     "lw   s1, 20(sp)\n"
+                     "addi sp, sp, 32\n"
+                     "ret\n");
+}
+
+__attribute__((noinline, naked, used, aligned(4))) static uintptr_t
+proc_lookup_de_asm(uintptr_t dir, uintptr_t dentry, uintptr_t de)
+{
+    __asm__ volatile("addi sp, sp, -32\n"
+                     "sw   s0, 24(sp)\n"
+                     "sw   s1, 20(sp)\n"
+                     "sw   s2, 16(sp)\n"
+                     "sw   ra, 28(sp)\n"
+                     "addi s0, sp, 32\n"
+                     "mv   s2, a0\n"
+                     "mv   s1, a1\n"
+                     "mv   a0, a2\n"
+                     "lw   a2, 28(a1)\n"
+                     "lw   a1, 32(a1)\n"
+                     "call pde_subdir_find_asm\n"
+                     "beqz a0, 1f\n"
+                     "mv   a5, a0\n"
+                     "li   a1, 1\n"
+                     "addi a0, a0, 4\n"
+                     "amoadd.w a4, a1, (a0)\n"
+                     "la   t0, observed_ref_old\n"
+                     "sw   a4, 0(t0)\n"
+                     "lw   a0, 20(s2)\n"
+                     "mv   a1, a5\n"
+                     "sw   a5, -20(s0)\n"
+                     "call fake_proc_get_inode\n"
+                     "j    2f\n"
+                     "1:\n"
+                     "li   a0, -2\n"
+                     "2:\n"
+                     "lw   ra, 28(sp)\n"
+                     "lw   s0, 24(sp)\n"
+                     "lw   s1, 20(sp)\n"
+                     "lw   s2, 16(sp)\n"
+                     "addi sp, sp, 32\n"
+                     "ret\n");
+}
+
+static int run_one(const char *name, uintptr_t (*fn)(uintptr_t, uintptr_t, uintptr_t))
+{
+    for (uint32_t i = 0; i < ITERATIONS; i++) {
+        uintptr_t node = 0x80c60050u + ((uintptr_t) i << 6);
+        uintptr_t expected = node - 80u;
+        uintptr_t got = fn(node, 0x13572468u + i, 0x24681357u ^ i);
+        sink ^= got;
+        if (got != expected) {
+            uart_printf("%s FAIL i=%u node=0x%08lx got=0x%08lx expected=0x%08lx\n",
+                        name,
+                        (unsigned) i,
+                        (unsigned long) node,
+                        (unsigned long) got,
+                        (unsigned long) expected);
+            return -1;
+        }
+    }
+    uart_printf("%s PASS\n", name);
+    return 0;
+}
+
+static void write32(uint8_t *base, uint32_t offset, uintptr_t value)
+{
+    *(volatile uintptr_t *) (void *) (base + offset) = value;
+}
+
+static uintptr_t read32(uint8_t *base, uint32_t offset)
+{
+    return *(volatile uintptr_t *) (void *) (base + offset);
+}
+
+static void clear_bytes(uint8_t *base, uint32_t size)
+{
+    for (uint32_t i = 0; i < size; i++) {
+        base[i] = 0;
+    }
+}
+
+static uint32_t small_strlen(const char *name)
+{
+    uint32_t len = 0;
+
+    while (name[len] != '\0') {
+        len++;
+    }
+    return len;
+}
+
+static uintptr_t multi_base(uint32_t idx)
+{
+    return (uintptr_t) multi_pdes[idx];
+}
+
+static uintptr_t multi_node(uint32_t idx)
+{
+    return multi_base(idx) + PDE_SUBDIR_NODE_OFFSET;
+}
+
+static const char *multi_inline_name(uint32_t idx)
+{
+    return (const char *) (const void *) (multi_pdes[idx] + PDE_INLINE_NAME_OFFSET);
+}
+
+static const char *known_pde_name(uintptr_t de)
+{
+    for (uint32_t i = 0; i < MULTI_PDE_COUNT; i++) {
+        if (de == multi_base(i)) {
+            return multi_inline_name(i);
+        }
+        if (de == multi_node(i)) {
+            return "NODE_PTR";
+        }
+    }
+    return "UNKNOWN";
+}
+
+static void init_multi_pde(uint32_t idx, const char *name)
+{
+    uint8_t *de = multi_pdes[idx];
+    uint32_t len = small_strlen(name);
+
+    clear_bytes(de, sizeof(multi_pdes[idx]));
+    write32(de, PDE_REFCOUNT_OFFSET, 1u);
+    write32(de, PDE_NAME_OFFSET, multi_base(idx) + PDE_INLINE_NAME_OFFSET);
+    for (uint32_t i = 0; i <= len; i++) {
+        de[PDE_INLINE_NAME_OFFSET + i] = (uint8_t) name[i];
+    }
+    *(volatile uint16_t *) (void *) (de + PDE_MODE_OFFSET) = PDE_MODE_REG_0444;
+    de[PDE_NAMELEN_OFFSET] = (uint8_t) len;
+    write32(de, PDE_NLINK_OFFSET, 1u);
+}
+
+static void set_rb_links(uint32_t idx, int32_t right_idx, int32_t left_idx)
+{
+    uint8_t *de = multi_pdes[idx];
+
+    write32(de,
+            PDE_SUBDIR_NODE_OFFSET + RB_RIGHT_OFFSET,
+            right_idx >= 0 ? multi_node((uint32_t) right_idx) : 0u);
+    write32(de,
+            PDE_SUBDIR_NODE_OFFSET + RB_LEFT_OFFSET,
+            left_idx >= 0 ? multi_node((uint32_t) left_idx) : 0u);
+}
+
+enum {
+    MULTI_CMDLINE = 0,
+    MULTI_LOADAVG = 1,
+    MULTI_MAPS = 2,
+    MULTI_MEMINFO = 3,
+    MULTI_VERSION = 4,
+};
+
+struct multi_lookup_case {
+    const char *name;
+    uint32_t idx;
+};
+
+static const struct multi_lookup_case multi_lookup_cases[] = {
+    {version_name, MULTI_VERSION},
+    {cmdline_name, MULTI_CMDLINE},
+    {loadavg_name, MULTI_LOADAVG},
+    {maps_name, MULTI_MAPS},
+    {meminfo_name, MULTI_MEMINFO},
+};
+
+static void setup_multi_proc_tree(void)
+{
+    clear_bytes(root_pde, sizeof(root_pde));
+    clear_bytes(fake_dir, sizeof(fake_dir));
+    clear_bytes(fake_dentry, sizeof(fake_dentry));
+
+    init_multi_pde(MULTI_CMDLINE, cmdline_name);
+    init_multi_pde(MULTI_LOADAVG, loadavg_name);
+    init_multi_pde(MULTI_MAPS, maps_name);
+    init_multi_pde(MULTI_MEMINFO, meminfo_name);
+    init_multi_pde(MULTI_VERSION, version_name);
+
+    /*
+     * A small rb-tree keyed (namelen, then name) like /proc root.  set_rb_links
+     * takes (node, RIGHT, LEFT).  The lookup walk (pde_subdir_find_asm) is
+     * LENGTH-FIRST: a search name shorter than the node descends LEFT, longer
+     * descends RIGHT.  "maps" (len 4) is shorter than every other node (len 7),
+     * so it must live on the left spine to be reachable: loadavg.left=cmdline,
+     * cmdline.left=maps.  (It used to be meminfo.left — i.e. inside loadavg's
+     * RIGHT subtree — which a len-4 query can NEVER reach, because the len-7
+     * root sends every len-4 query LEFT into the cmdline subtree, hits
+     * cmdline.left=NULL, and returns 0.  That made the "maps" lookup assert fail
+     * by tree construction, not by any RTL fault.)
+     */
+    write32(root_pde, PDE_SUBDIR_ROOT_OFFSET, multi_node(MULTI_LOADAVG));
+    set_rb_links(MULTI_LOADAVG, MULTI_MEMINFO, MULTI_CMDLINE);
+    set_rb_links(MULTI_MEMINFO, MULTI_VERSION, -1);
+    set_rb_links(MULTI_CMDLINE, -1, MULTI_MAPS);
+    set_rb_links(MULTI_MAPS, -1, -1);
+    set_rb_links(MULTI_VERSION, -1, -1);
+
+    write32(fake_dir, 20u, 0xcafef00du);
+}
+
+static int
+run_multi_lookup_case(const char *test_name, uint32_t iter, const struct multi_lookup_case *lookup)
+{
+    uint32_t len = small_strlen(lookup->name);
+    uintptr_t expected = multi_base(lookup->idx);
+    uintptr_t expected_node = multi_node(lookup->idx);
+    uint8_t *expected_pde = multi_pdes[lookup->idx];
+
+    observed_de = 0;
+    observed_sb = 0;
+    observed_ref_old = 0xdeadbeefu;
+    observed_mode = 0xdeadbeefu;
+    observed_namelen = 0xdeadbeefu;
+    write32(fake_dentry, 28u, len);
+    write32(fake_dentry, 32u, (uintptr_t) lookup->name);
+
+    uintptr_t direct = pde_subdir_find_asm((uintptr_t) root_pde, lookup->name, len);
+    uintptr_t ret =
+        proc_lookup_de_asm((uintptr_t) fake_dir, (uintptr_t) fake_dentry, (uintptr_t) root_pde);
+    uint32_t ref_now = (uint32_t) read32(expected_pde, PDE_REFCOUNT_OFFSET);
+
+    sink ^= direct ^ ret;
+    if (direct != expected || observed_de != expected || observed_sb != 0xcafef00du ||
+        observed_ref_old != 1u || ref_now != 2u || observed_mode != PDE_MODE_REG_0444 ||
+        observed_namelen != len) {
+        uart_printf("%s FAIL i=%u query=%s direct=0x%08lx expected=0x%08lx node=0x%08lx\n",
+                    test_name,
+                    (unsigned) iter,
+                    lookup->name,
+                    (unsigned long) direct,
+                    (unsigned long) expected,
+                    (unsigned long) expected_node);
+        uart_printf("%s obs_de=0x%08lx obs_name=%s sb=0x%08lx mode=0x%04lx len=%lu old=0x%08lx "
+                    "ref=0x%08lx ret=0x%08lx\n",
+                    test_name,
+                    (unsigned long) observed_de,
+                    known_pde_name(observed_de),
+                    (unsigned long) observed_sb,
+                    (unsigned long) observed_mode,
+                    (unsigned long) observed_namelen,
+                    (unsigned long) observed_ref_old,
+                    (unsigned long) ref_now,
+                    (unsigned long) ret);
+        return -1;
+    }
+    return 0;
+}
+
+static int run_multi_lookup_repro_variant(const char *name, int churn)
+{
+    const uint32_t case_count =
+        (uint32_t) (sizeof(multi_lookup_cases) / sizeof(multi_lookup_cases[0]));
+
+    for (uint32_t i = 0; i < PDE_VIS_ITERATIONS; i++) {
+        if (churn) {
+            churn_cache(i + 0x200u);
+        }
+        for (uint32_t c = 0; c < case_count; c++) {
+            setup_multi_proc_tree();
+            if (run_multi_lookup_case(name, i, &multi_lookup_cases[c]) != 0) {
+                return -1;
+            }
+        }
+    }
+
+    uart_printf("%s PASS\n", name);
+    return 0;
+}
+
+static int run_multi_lookup_repro(void)
+{
+    if (run_multi_lookup_repro_variant("multi_lookup_immediate", 0) != 0) {
+        return -1;
+    }
+    if (run_multi_lookup_repro_variant("multi_lookup_churn", 1) != 0) {
+        return -1;
+    }
+    return 0;
+}
+
+static void setup_fake_proc_tree(void)
+{
+    for (uint32_t i = 0; i < sizeof(root_pde); i++) {
+        root_pde[i] = 0;
+        entry_pde[i] = 0;
+    }
+    for (uint32_t i = 0; i < sizeof(fake_dir); i++) {
+        fake_dir[i] = 0;
+    }
+    for (uint32_t i = 0; i < sizeof(fake_dentry); i++) {
+        fake_dentry[i] = 0;
+    }
+
+    uintptr_t entry_base = (uintptr_t) entry_pde;
+    uintptr_t entry_node = entry_base + PDE_SUBDIR_NODE_OFFSET;
+
+    write32(root_pde, PDE_SUBDIR_ROOT_OFFSET, entry_node);
+    write32(entry_pde, PDE_REFCOUNT_OFFSET, 1u);
+    write32(entry_pde, PDE_SUBDIR_NODE_OFFSET + RB_RIGHT_OFFSET, 0u);
+    write32(entry_pde, PDE_SUBDIR_NODE_OFFSET + RB_LEFT_OFFSET, 0u);
+    write32(entry_pde, PDE_SUBDIR_NODE_OFFSET + RB_NAME_OFFSET, (uintptr_t) version_name);
+    entry_pde[PDE_SUBDIR_NODE_OFFSET + RB_NAMELEN_OFFSET] = 7u;
+
+    write32(fake_dir, 20u, 0xcafef00du);
+    write32(fake_dentry, 28u, 7u);
+    write32(fake_dentry, 32u, (uintptr_t) version_name);
+}
+
+static int run_lookup_repro(void)
+{
+    uintptr_t entry_base = (uintptr_t) entry_pde;
+    uintptr_t entry_node = entry_base + PDE_SUBDIR_NODE_OFFSET;
+
+    for (uint32_t i = 0; i < ITERATIONS; i++) {
+        setup_fake_proc_tree();
+        observed_de = 0;
+        observed_sb = 0;
+
+        uintptr_t direct = pde_subdir_find_asm((uintptr_t) root_pde, version_name, 7u);
+        if (direct != entry_base) {
+            uart_printf("pde_subdir_find_asm FAIL i=%u got=0x%08lx expected=0x%08lx node=0x%08lx\n",
+                        (unsigned) i,
+                        (unsigned long) direct,
+                        (unsigned long) entry_base,
+                        (unsigned long) entry_node);
+            return -1;
+        }
+
+        uintptr_t ret =
+            proc_lookup_de_asm((uintptr_t) fake_dir, (uintptr_t) fake_dentry, (uintptr_t) root_pde);
+        sink ^= ret;
+        if (observed_de != entry_base) {
+            uart_printf("proc_lookup_de_asm FAIL i=%u observed_de=0x%08lx expected=0x%08lx "
+                        "node=0x%08lx ret=0x%08lx\n",
+                        (unsigned) i,
+                        (unsigned long) observed_de,
+                        (unsigned long) entry_base,
+                        (unsigned long) entry_node,
+                        (unsigned long) ret);
+            return -1;
+        }
+        if (observed_sb != 0xcafef00du) {
+            uart_printf("proc_lookup_de_asm SB FAIL i=%u observed_sb=0x%08lx\n",
+                        (unsigned) i,
+                        (unsigned long) observed_sb);
+            return -1;
+        }
+        if (read32(entry_pde, PDE_REFCOUNT_OFFSET) != 2u) {
+            uart_printf(
+                "proc_lookup_de_asm REF FAIL i=%u ref=0x%08lx node_right=0x%08lx\n",
+                (unsigned) i,
+                (unsigned long) read32(entry_pde, PDE_REFCOUNT_OFFSET),
+                (unsigned long) read32(entry_pde, PDE_SUBDIR_NODE_OFFSET + RB_RIGHT_OFFSET));
+            return -1;
+        }
+    }
+
+    uart_printf("proc_lookup_de_asm PASS\n");
+    return 0;
+}
+
+static void setup_pde_visibility_tree(void)
+{
+    for (uint32_t i = 0; i < sizeof(root_pde); i++) {
+        root_pde[i] = 0;
+        entry_pde[i] = 0;
+    }
+    for (uint32_t i = 0; i < sizeof(fake_dir); i++) {
+        fake_dir[i] = 0;
+    }
+    for (uint32_t i = 0; i < sizeof(fake_dentry); i++) {
+        fake_dentry[i] = 0;
+    }
+
+    uintptr_t entry_base = (uintptr_t) entry_pde;
+    uintptr_t entry_node = entry_base + PDE_SUBDIR_NODE_OFFSET;
+
+    pde_init_version_asm(entry_base);
+    write32(root_pde, PDE_SUBDIR_ROOT_OFFSET, entry_node);
+    write32(fake_dir, 20u, 0xcafef00du);
+    write32(fake_dentry, 28u, 7u);
+    write32(fake_dentry, 32u, (uintptr_t) version_name);
+}
+
+static void churn_cache(uint32_t seed)
+{
+    for (uint32_t i = 0; i < PDE_VIS_CHURN_BYTES; i += 64u) {
+        volatile uint32_t *word = (volatile uint32_t *) (void *) (s2l_area + i);
+        uint32_t value = *word ^ (seed + i + 0x9e3779b9u);
+        *word = value;
+        seed ^= *word + (seed << 5) + (seed >> 2);
+    }
+    s2l_sink ^= seed;
+}
+
+static int
+check_pde_visibility_result(const char *name, uint32_t i, uintptr_t direct, uintptr_t ret)
+{
+    uintptr_t entry_base = (uintptr_t) entry_pde;
+    uintptr_t entry_node = entry_base + PDE_SUBDIR_NODE_OFFSET;
+    uint32_t ref_now = (uint32_t) read32(entry_pde, PDE_REFCOUNT_OFFSET);
+
+    if (direct != entry_base) {
+        uart_printf("%s FIND FAIL i=%u got=0x%08lx expected=0x%08lx node=0x%08lx namelen=%u\n",
+                    name,
+                    (unsigned) i,
+                    (unsigned long) direct,
+                    (unsigned long) entry_base,
+                    (unsigned long) entry_node,
+                    (unsigned) entry_pde[PDE_NAMELEN_OFFSET]);
+        return -1;
+    }
+    if (observed_de != entry_base) {
+        uart_printf("%s DE FAIL i=%u observed_de=0x%08lx expected=0x%08lx ret=0x%08lx\n",
+                    name,
+                    (unsigned) i,
+                    (unsigned long) observed_de,
+                    (unsigned long) entry_base,
+                    (unsigned long) ret);
+        return -1;
+    }
+    if (observed_sb != 0xcafef00du) {
+        uart_printf("%s SB FAIL i=%u observed_sb=0x%08lx\n",
+                    name,
+                    (unsigned) i,
+                    (unsigned long) observed_sb);
+        return -1;
+    }
+    if (observed_ref_old != 1u || ref_now != 2u) {
+        uart_printf("%s REF FAIL i=%u old=0x%08lx now=0x%08lx mode_mem=0x%04x namelen=%u\n",
+                    name,
+                    (unsigned) i,
+                    (unsigned long) observed_ref_old,
+                    (unsigned long) ref_now,
+                    (unsigned) (*(volatile uint16_t *) (void *) (entry_pde + PDE_MODE_OFFSET)),
+                    (unsigned) entry_pde[PDE_NAMELEN_OFFSET]);
+        return -1;
+    }
+    if (observed_mode != 0x8124u || observed_namelen != 7u) {
+        uart_printf("%s MODE FAIL i=%u mode=0x%04lx namelen=%lu ref_old=0x%08lx ref_now=0x%08lx "
+                    "word96=0x%08lx\n",
+                    name,
+                    (unsigned) i,
+                    (unsigned long) observed_mode,
+                    (unsigned long) observed_namelen,
+                    (unsigned long) observed_ref_old,
+                    (unsigned long) ref_now,
+                    (unsigned long) read32(entry_pde, PDE_MODE_OFFSET));
+        return -1;
+    }
+    return 0;
+}
+
+static int run_pde_visibility_repro_variant(const char *name, int churn)
+{
+    for (uint32_t i = 0; i < PDE_VIS_ITERATIONS; i++) {
+        setup_pde_visibility_tree();
+        observed_de = 0;
+        observed_sb = 0;
+        observed_ref_old = 0xdeadbeefu;
+        observed_mode = 0xdeadbeefu;
+        observed_namelen = 0xdeadbeefu;
+
+        if (churn) {
+            churn_cache(i);
+        }
+
+        uintptr_t direct = pde_subdir_find_asm((uintptr_t) root_pde, version_name, 7u);
+        uintptr_t ret =
+            proc_lookup_de_asm((uintptr_t) fake_dir, (uintptr_t) fake_dentry, (uintptr_t) root_pde);
+        sink ^= direct ^ ret;
+        if (check_pde_visibility_result(name, i, direct, ret) != 0) {
+            return -1;
+        }
+    }
+
+    uart_printf("%s PASS\n", name);
+    return 0;
+}
+
+static int run_pde_visibility_repro(void)
+{
+    if (run_pde_visibility_repro_variant("pde_visibility_immediate", 0) != 0) {
+        return -1;
+    }
+    if (run_pde_visibility_repro_variant("pde_visibility_churn", 1) != 0) {
+        return -1;
+    }
+    return 0;
+}
+
+static int run_store_load_repro(void)
+{
+    for (uint32_t i = 0; i < ITERATIONS; i++) {
+        uint8_t *ptr = s2l_area + (i * 256u);
+        uint32_t value = 0x8000u | ((i * 37u + 0x16du) & 0x7fffu);
+        uint32_t got = halfword_s2l(ptr, value);
+        s2l_sink ^= got;
+        if (got != (value & 0xffffu)) {
+            uart_printf("halfword_s2l FAIL i=%u ptr=0x%08lx got=0x%08lx expected=0x%08lx\n",
+                        (unsigned) i,
+                        (unsigned long) (uintptr_t) ptr,
+                        (unsigned long) got,
+                        (unsigned long) (value & 0xffffu));
+            return -1;
+        }
+    }
+    uart_printf("halfword_s2l PASS\n");
+
+    for (uint32_t i = 0; i < ITERATIONS; i++) {
+        uint8_t *ptr = s2l_area + 0x4000u + (i * 256u);
+        uint32_t value = 0x40000000u | (i * 0x10203u) | 0x5a5u;
+        uint32_t got = word_s2l(ptr, value);
+        s2l_sink ^= got;
+        if (got != value) {
+            uart_printf("word_s2l FAIL i=%u ptr=0x%08lx got=0x%08lx expected=0x%08lx\n",
+                        (unsigned) i,
+                        (unsigned long) (uintptr_t) ptr,
+                        (unsigned long) got,
+                        (unsigned long) value);
+            return -1;
+        }
+    }
+    uart_printf("word_s2l PASS\n");
+
+    for (uint32_t i = 0; i < ITERATIONS; i++) {
+        uint8_t *ptr = s2l_area + 0x8000u + (i * 256u);
+        uint32_t value = 0x9000u | ((i * 53u + 0x55u) & 0x6fffu);
+        uint32_t got = amo_halfword_s2l(ptr, value);
+        s2l_sink ^= got;
+        if (got != (value & 0xffffu)) {
+            uart_printf("amo_halfword_s2l FAIL i=%u ptr=0x%08lx got=0x%08lx expected=0x%08lx\n",
+                        (unsigned) i,
+                        (unsigned long) (uintptr_t) ptr,
+                        (unsigned long) got,
+                        (unsigned long) (value & 0xffffu));
+            return -1;
+        }
+    }
+    uart_printf("amo_halfword_s2l PASS\n");
+    return 0;
+}
+
+int main(void)
+{
+    uart_printf("\n=== pde_return_hazard ===\n");
+    if (run_one("epilogue_repro", epilogue_repro) != 0) {
+        uart_printf("<<FAIL>>\n");
+        for (;;) {
+        }
+    }
+    if (run_one("epilogue_direct_a0", epilogue_direct_a0) != 0) {
+        uart_printf("<<FAIL>>\n");
+        for (;;) {
+        }
+    }
+    if (run_lookup_repro() != 0) {
+        uart_printf("<<FAIL>>\n");
+        for (;;) {
+        }
+    }
+    if (run_multi_lookup_repro() != 0) {
+        uart_printf("<<FAIL>>\n");
+        for (;;) {
+        }
+    }
+    if (run_pde_visibility_repro() != 0) {
+        uart_printf("<<FAIL>>\n");
+        for (;;) {
+        }
+    }
+    if (run_store_load_repro() != 0) {
+        uart_printf("<<FAIL>>\n");
+        for (;;) {
+        }
+    }
+    uart_printf("sink=0x%08lx\n", (unsigned long) sink);
+    uart_printf("s2l_sink=0x%08lx\n", (unsigned long) s2l_sink);
+    uart_printf("<<PASS>>\n");
+    for (;;) {
+    }
+}
diff --git a/sw/apps/smc_fencei_test/Makefile b/sw/apps/smc_fencei_test/Makefile
new file mode 100644
index 00000000..557494b2
--- /dev/null
+++ b/sw/apps/smc_fencei_test/Makefile
@@ -0,0 +1,19 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+# Makefile for the hardened self-modifying-code (fence.i) reproducer.
+# Sweeps the store->fence.i timing/layout knobs (gap, warm/cold L1D,
+# write-allocate miss, tight self-modify loops) that the boot hang implicates.
+SRC_C   := ../../lib/src/uart.c main.c
+include ../../common/common.mk
diff --git a/sw/apps/smc_fencei_test/main.c b/sw/apps/smc_fencei_test/main.c
new file mode 100644
index 00000000..6108b2c6
--- /dev/null
+++ b/sw/apps/smc_fencei_test/main.c
@@ -0,0 +1,172 @@
+/*
+ *    Copyright 2026 Two Sigma Open Source, LLC
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+/**
+ * Hardened self-modifying-code / fence.i directed reproducer.
+ *
+ * Models the kernel's runtime code-patching contract (patch_insn_write +
+ * fence.i): store a new instruction word into cached-DDR code, fence.i to
+ * sync, then fetch/execute it. The fence.i must:
+ *   store -> SQ -> L1D (dirty) ... new code invisible to fetch
+ *   fence.i: drain committed SQ -> L1D writeback-all -> L1I invalidate-all
+ *            -> fetch-buffer invalidate
+ *   call -> L1I miss -> fill returns the freshly written code
+ *
+ * The gentle ddr_smc_test passes; this sweeps the timing/layout knobs that the
+ * boot hang implicates so a transient becomes a deterministic, waveform-able
+ * failure:
+ *   - store->fence.i freshness GAP (0/1/2/3/4/8 nops): how fresh the committed
+ *     store is when fence.i drains the store queue.
+ *   - WARM L1D (write-hit) vs COLD L1D (write-allocate miss): the L1D is
+ *     128 KiB direct-mapped with 32 B lines, so a single read +128 KiB shares
+ *     the index but not the tag and conflict-evicts the ddr_code line, forcing
+ *     the next patch store to miss and race the fence.i writeback walk.
+ *   - tight alternating self-modify loops (a stale/previous read is always a
+ *     detectable mismatch).
+ *
+ * Prints "<<PASS>>" if every post-fence.i call returns its freshly written
+ * value; "<<FAIL>>" with detail otherwise. A wedge (stale garbage executed)
+ * shows up as a simulation/UART timeout.
+ */
+
+#include <stdint.h>
+
+#include "../../lib/include/uart.h"
+
+#define ADDI_A0(imm) (0x00000513u | (((uint32_t) (imm) & 0xfffu) << 20)) /* addi a0,x0,imm */
+#define RET_INSN 0x00008067u                                             /* jalr x0,0(ra)  */
+
+/* Executable + writable patch target in the cached DDR region, line aligned
+ * (LINE_BYTES = 32). ddr_code[0] is the entry (patched); [1] is `ret`. */
+__attribute__((section(".ddr_data"), aligned(32))) static volatile uint32_t ddr_code[8];
+
+/* Direct-mapped L1D = 128 KiB. */
+#define L1D_BYTES (128u * 1024u)
+
+typedef int (*fn_t)(void);
+
+/* Patch word[0] with `addi a0,x0,imm`, then GAP nops, then fence.i. The single
+ * 32-bit store mirrors patch_insn_write; GAP varies how fresh the committed
+ * store is when the fence.i serializer drains the SQ. */
+#define MK_PATCH(name, nops)                                                                       \
+    static inline void name(uint32_t imm)                                                          \
+    {                                                                                              \
+        __asm__ volatile("sw %1, 0(%0)\n\t" nops "fence.i\n\t"                                     \
+                         :                                                                         \
+                         : "r"(&ddr_code[0]), "r"(ADDI_A0(imm))                                    \
+                         : "memory");                                                              \
+    }
+MK_PATCH(patch_g0, "")
+MK_PATCH(patch_g1, "nop\n\t")
+MK_PATCH(patch_g2, "nop\n\tnop\n\t")
+MK_PATCH(patch_g3, "nop\n\tnop\n\tnop\n\t")
+MK_PATCH(patch_g4, "nop\n\tnop\n\tnop\n\tnop\n\t")
+MK_PATCH(patch_g8, "nop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\t")
+
+typedef void (*patch_fn_t)(uint32_t);
+static patch_fn_t const patchers[] = {patch_g0, patch_g1, patch_g2, patch_g3, patch_g4, patch_g8};
+static const int gaps[] = {0, 1, 2, 3, 4, 8};
+#define NGAPS ((int) (sizeof(gaps) / sizeof(gaps[0])))
+
+/* Conflict-evict the ddr_code line from a direct-mapped L1D (read several
+ * +N*128 KiB aliases; one suffices for direct-mapped, extras cover any
+ * set-assoc surprise). */
+static inline void evict_code_line(void)
+{
+    uintptr_t base = (uintptr_t) &ddr_code[0];
+    volatile uint32_t *a1 = (volatile uint32_t *) (base + 1u * L1D_BYTES);
+    volatile uint32_t *a2 = (volatile uint32_t *) (base + 2u * L1D_BYTES);
+    volatile uint32_t *a3 = (volatile uint32_t *) (base + 3u * L1D_BYTES);
+    volatile uint32_t *a4 = (volatile uint32_t *) (base + 4u * L1D_BYTES);
+    volatile uint32_t s = *a1 + *a2 + *a3 + *a4;
+    (void) s;
+}
+
+static int g_fail;
+static int g_reported;
+
+static void check(int tag, int gap, uint32_t want, int cold)
+{
+    fn_t fn = (fn_t) (uintptr_t) &ddr_code[0];
+    int got = fn();
+    if (got != (int) want) {
+        g_fail++;
+        if (g_reported < 16) {
+            uart_printf("FAIL tag=%x gap=%d cold=%d got=0x%x want=0x%x\n",
+                        (unsigned) tag,
+                        gap,
+                        cold,
+                        (unsigned) got,
+                        (unsigned) want);
+            g_reported++;
+        }
+    }
+}
+
+int main(void)
+{
+    /* Establish word[1] = ret once and sync it in. */
+    ddr_code[1] = RET_INSN;
+    __asm__ volatile("fence.i" ::: "memory");
+
+    /* Phase A: gap sweep, WARM L1D (write-hit). */
+    uart_printf("A");
+    for (int rep = 0; rep < 4; rep++) {
+        for (int g = 0; g < NGAPS; g++) {
+            uint32_t want = ((rep + g) & 1) ? 0x2Au : 0x355u;
+            patchers[g](want);
+            check(0xA, gaps[g], want, 0);
+        }
+    }
+
+    /* Phase B: gap sweep, COLD L1D (write-allocate miss). */
+    uart_printf("B");
+    for (int rep = 0; rep < 4; rep++) {
+        for (int g = 0; g < NGAPS; g++) {
+            uint32_t want = ((rep + g) & 1) ? 0x111u : 0x222u;
+            evict_code_line();
+            patchers[g](want);
+            check(0xB, gaps[g], want, 1);
+        }
+    }
+
+    /* Phase C: tight alternating self-modify loop, gap 0, warm. */
+    uart_printf("C");
+    for (int i = 0; i < 96; i++) {
+        uint32_t want = (i & 1) ? 0x123u : 0x456u;
+        patch_g0(want);
+        check(0xC, 0, want, 0);
+    }
+
+    /* Phase D: tight alternating self-modify loop, gap 0, cold (miss each time). */
+    uart_printf("D");
+    for (int i = 0; i < 48; i++) {
+        uint32_t want = (i & 1) ? 0x0AAu : 0x055u;
+        evict_code_line();
+        patch_g0(want);
+        check(0xD, 0, want, 1);
+    }
+
+    if (g_fail == 0) {
+        uart_printf("\n<<PASS>>\n");
+    } else {
+        uart_printf("\n<<FAIL>> (%d failures)\n", g_fail);
+    }
+
+    for (;;) {
+    }
+    return 0;
+}
diff --git a/sw/apps/trap_s2l_fwd/Makefile b/sw/apps/trap_s2l_fwd/Makefile
new file mode 100644
index 00000000..f79b59ea
--- /dev/null
+++ b/sw/apps/trap_s2l_fwd/Makefile
@@ -0,0 +1,18 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+# Directed trap store->load forwarding repro. DDR-resident (cached tier).
+override MEM_CONFIG := ddr
+SRC_C := ../../lib/src/uart.c main.c
+include ../../common/common.mk
diff --git a/sw/apps/trap_s2l_fwd/main.c b/sw/apps/trap_s2l_fwd/main.c
new file mode 100644
index 00000000..21e4e6b2
--- /dev/null
+++ b/sw/apps/trap_s2l_fwd/main.c
@@ -0,0 +1,158 @@
+/*
+ *    Copyright 2026 Two Sigma Open Source, LLC
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+/*
+ * Deterministic repro for the boot-hang root cause: cached store->load
+ * visibility across the trap path.
+ *
+ * The handler increments a CACHED counter g_ctr every trap; the main loop spins
+ * until it observes g_ctr reach a target. If a store of g_ctr is not visible to
+ * a later load of g_ctr (the store->load bug), g_ctr never advances from the
+ * observer's view and the loop hangs -- the exact livelock signature of the
+ * real boot hang. A wall-clock (mtime) watchdog prints the stuck g_ctr instead
+ * of hanging forever, so the failure is observable.
+ *
+ * Run at hardware-realistic latency: DDR_MODEL_LATENCY>=70, CACHED_HAS_L2=0.
+ */
+
+#include <stdint.h>
+
+#include "csr.h"
+#include "trap.h"
+#include "uart.h"
+
+#define CLINT_MTIMECMP_LO (*(volatile uint32_t *) 0x40014000u)
+#define CLINT_MTIMECMP_HI (*(volatile uint32_t *) 0x40014004u)
+#define CLINT_MTIME_LO (*(volatile uint32_t *) 0x4001BFF8u)
+#define CLINT_MTIME_HI (*(volatile uint32_t *) 0x4001BFFCu)
+
+#define TARGET 200u
+#define DDR_STACK_SIZE 4096u
+
+volatile uint32_t g_ctr;        /* cached counter, written by handler, read by main */
+volatile uint32_t g_percpu[16]; /* DDR per-cpu-like scratch (tp base) */
+static uint8_t g_ddr_stack[DDR_STACK_SIZE] __attribute__((aligned(16)));
+
+static inline uint64_t clint_rdmtime(void)
+{
+    uint32_t hi, lo, hi2;
+    do {
+        hi = CLINT_MTIME_HI;
+        lo = CLINT_MTIME_LO;
+        hi2 = CLINT_MTIME_HI;
+    } while (hi != hi2);
+    return ((uint64_t) hi << 32) | lo;
+}
+
+static void clint_arm(uint64_t cmp)
+{
+    CLINT_MTIMECMP_HI = 0xFFFFFFFFu;
+    CLINT_MTIMECMP_LO = (uint32_t) cmp;
+    CLINT_MTIMECMP_HI = (uint32_t) (cmp >> 32);
+}
+
+/* Trap handler, faithful to a real kernel handler: saves/restores the GPRs it
+ * uses on the (cached DDR) stack -- which IS the handle_exception store->load
+ * pattern -- and explicitly checks a store->load with a VARYING value so a
+ * forward-miss is always caught. 'X' on the raw UART if the reload is wrong. */
+/* FULLY FAITHFUL to handle_exception's kernel-trap entry: the tp/mscratch swap,
+ * then sw sp,8(tp); sw sp,12(tp); lw sp,8(tp) -- loading the trap-time sp back
+ * INTO sp via the cached scratch slot -- then GPR saves to that reloaded sp.
+ * If the cached store->load (lw sp,8(tp)) drops the just-stored sp, sp becomes
+ * garbage and the GPR saves fault -> re-trap -> hang, exactly like the kernel. */
+__attribute__((naked, aligned(4))) static void ctr_entry(void)
+{
+    __asm__ volatile("csrrw tp, mscratch, tp\n" /* kernel: tp=0, mscratch=old tp(&g_percpu) */
+                     "bnez  tp, 1f\n"
+                     "csrr  tp, mscratch\n" /* tp = &g_percpu */
+                     "sw    sp, 8(tp)\n"    /* *(tp+8) = sp */
+                     "1:\n"
+                     "sw    sp, 12(tp)\n"
+                     "lw    sp, 8(tp)\n" /* sp = *(tp+8)  <-- cached store->load INTO sp */
+                     "addi  sp, sp, -64\n"
+                     "sw    ra, 0(sp)\n" /* GPR saves to the reloaded sp (fault if sp bad) */
+                     "sw    t0, 4(sp)\n"
+                     "sw    t1, 8(sp)\n"
+                     "sw    t2, 12(sp)\n"
+                     /* work: g_ctr++ */
+                     "la    t1, g_ctr\n"
+                     "lw    t2, 0(t1)\n"
+                     "addi  t2, t2, 1\n"
+                     "sw    t2, 0(t1)\n"
+                     /* ack timer */
+                     "li    t1, 0x40014004\n"
+                     "li    t2, -1\n"
+                     "sw    t2, 0(t1)\n"
+                     "li    t1, 0x40014000\n"
+                     "sw    t2, 0(t1)\n"
+                     /* restore */
+                     "lw    ra, 0(sp)\n"
+                     "lw    t0, 4(sp)\n"
+                     "lw    t1, 8(sp)\n"
+                     "lw    t2, 12(sp)\n"
+                     "addi  sp, sp, 64\n" /* sp back to trap-time value */
+                     "csrw  mscratch, x0\n"
+                     "mret\n");
+}
+
+__attribute__((noreturn, noinline, used)) void main_on_ddr_stack(void)
+{
+    uart_printf("\n=== faithful handle_exception sw/lw-into-sp repro ===\n");
+    g_ctr = 0u;
+    for (int i = 0; i < 16; i++)
+        g_percpu[i] = 0xB6B60000u + (uint32_t) i;
+    /* kernel convention: tp = per-cpu ptr, mscratch = 0 */
+    __asm__ volatile("mv tp, %0" : : "r"((uint32_t) &g_percpu[0]) : "memory");
+    csr_write(mscratch, 0u);
+    set_trap_handler(&ctr_entry);
+    enable_timer_interrupt();
+
+    uint64_t deadline = clint_rdmtime() + 1500000u;
+    uint32_t observed = 0u;
+    while (g_ctr < TARGET) {
+        clint_arm(clint_rdmtime() + 200u);
+        enable_interrupts();
+        for (volatile int s = 0; s < 32; s++) {
+        }
+        disable_interrupts();
+        observed = g_ctr;
+        if (clint_rdmtime() > deadline) {
+            break;
+        }
+    }
+
+    if (g_ctr >= TARGET) {
+        uart_printf("g_ctr=%u reached target -- store->load OK\n", g_ctr);
+        uart_printf("<<PASS>>\n");
+    } else {
+        uart_printf(
+            "HANG: g_ctr stuck at %u (last observed %u) -- store->load broken\n", g_ctr, observed);
+        uart_printf("<<FAIL>>\n");
+    }
+    for (;;) {
+    }
+}
+
+int main(void)
+{
+    uint32_t stack_top = ((uint32_t) &g_ddr_stack[DDR_STACK_SIZE]) & ~0xFu;
+    __asm__ volatile("mv sp, %0\n"
+                     "j  main_on_ddr_stack\n"
+                     :
+                     : "r"(stack_top)
+                     : "memory");
+    __builtin_unreachable();
+}
diff --git a/sw/apps/umode_test/Makefile b/sw/apps/umode_test/Makefile
new file mode 100644
index 00000000..c51e51ad
--- /dev/null
+++ b/sw/apps/umode_test/Makefile
@@ -0,0 +1,17 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+# Makefile for the U-mode (User privilege) directed test
+SRC_C := main.c
+include ../../common/common.mk
diff --git a/sw/apps/umode_test/main.c b/sw/apps/umode_test/main.c
new file mode 100644
index 00000000..eac0c492
--- /dev/null
+++ b/sw/apps/umode_test/main.c
@@ -0,0 +1,198 @@
+/*
+ *    Copyright 2026 Two Sigma Open Source, LLC
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+/*
+ * U-mode (User privilege) directed test.
+ *
+ * Exercises the Machine+User privilege support end-to-end on the real core and
+ * self-checks over UART (<<PASS>> / <<FAIL>>):
+ *
+ *   A. ECALL from U-mode            -> mcause = 8  (ExcEcallUmode; 11 is M-mode)
+ *   B. Machine timer interrupt while in U-mode with mstatus.MIE = 0
+ *                                   -> trap taken, mcause = 0x8000_0007.
+ *      Proves machine interrupts fire while running below M regardless of MIE
+ *      (so the timer can preempt user code) AND that the interrupt mcause
+ *      carries the interrupt bit + code.
+ *   C. Reading an M-mode CSR from U -> illegal instruction (mcause = 2).
+ *      Requires the U-mode CSR-permission check. If that check is absent the
+ *      trailing ECALL traps instead (mcause = 8), so the test FAILs cleanly
+ *      rather than hanging.
+ *   D. Executing MRET from U-mode   -> illegal instruction (mcause = 2).
+ *      MRET is an M-mode-only instruction; the trailing ECALL is the cause-8
+ *      fallback so the test FAILs (not hangs) if the check is absent.
+ *
+ * Mechanism: each case drops to U-mode via MRET (mstatus.MPP = U) into a small
+ * naked U-mode function that triggers the trap. A naked M-mode handler records
+ * mcause and the privilege the trap came from (mstatus.MPP), pushes mtimecmp to
+ * max so a timer interrupt cannot refire, and returns to M-mode at a fixed
+ * continuation address stashed in mscratch (forcing MPP=M for its MRET).
+ */
+
+#include <stdint.h>
+
+#include "trap.h"
+
+/* ---- minimal UART (UART_TX is provided by mmio.h via trap.h) ---- */
+static void uart_putc(char c)
+{
+    UART_TX = (uint8_t) c;
+}
+
+static void uart_puts(const char *s)
+{
+    while (*s)
+        uart_putc(*s++);
+}
+
+static void uart_hex(uint32_t v)
+{
+    static const char hex[] = "0123456789ABCDEF";
+    uart_puts("0x");
+    for (int i = 28; i >= 0; i -= 4)
+        uart_putc(hex[(v >> i) & 0xF]);
+}
+
+/* ---- trap state shared with the naked handler ---- */
+static volatile uint32_t g_cause;
+static volatile uint32_t g_from_priv; /* mstatus.MPP at trap entry = prev priv */
+
+/*
+ * Naked M-mode trap handler. Records mcause and the trapping privilege, pushes
+ * mtimecmp to max (so a timer interrupt cannot refire), then returns to M-mode
+ * at the continuation address run_in_umode stashed in mscratch. Forces MPP=M so
+ * the MRET lands back in M-mode. Bouncing to a fixed continuation (rather than
+ * resuming the U-mode code) means clobbering temporaries here is safe.
+ */
+__attribute__((naked, aligned(4))) static void umode_trap_handler(void)
+{
+    __asm__ volatile("csrr t0, mcause\n"
+                     "lui  t1, %hi(g_cause)\n"
+                     "lw   t2, %lo(g_cause)(t1)\n"
+                     "li   t3, -1\n" /* sentinel: only the FIRST trap of each test records */
+                     "bne  t2, t3, 2f\n"
+                     "sw   t0, %lo(g_cause)(t1)\n"
+                     "csrr t0, mstatus\n"
+                     "srli t0, t0, 11\n"
+                     "andi t0, t0, 0x3\n" /* mstatus.MPP */
+                     "lui  t1, %hi(g_from_priv)\n"
+                     "sw   t0, %lo(g_from_priv)(t1)\n"
+                     "2:\n"
+                     "li   t1, 0x4000001C\n" /* MTIMECMP_HI: push compare to max to ack timer */
+                     "li   t0, -1\n"
+                     "sw   t0, 0(t1)\n"
+                     "csrr t0, mscratch\n" /* M-mode continuation set by run_in_umode */
+                     "csrw mepc, t0\n"
+                     "li   t0, 0x1800\n" /* MPP = M (0b11 << 11) */
+                     "csrs mstatus, t0\n"
+                     "mret\n");
+}
+
+/*
+ * Enter U-mode at ufn; the handler returns control to the instruction after the
+ * MRET. Returns the mcause of the trap that ended U-mode execution.
+ */
+static uint32_t run_in_umode(void (*ufn)(void))
+{
+    g_cause = 0xFFFFFFFFu;
+    g_from_priv = 0xFFFFFFFFu;
+    __asm__ volatile("la   t0, 1f\n"
+                     "csrw mscratch, t0\n" /* where the handler returns */
+                     "li   t0, 0x1800\n"
+                     "csrc mstatus, t0\n" /* MPP = U (00) */
+                     "csrw mepc, %0\n"
+                     "mret\n" /* -> U-mode at ufn */
+                     "1:\n"
+                     :
+                     : "r"(ufn)
+                     : "t0", "t1", "t2", "memory");
+    return g_cause;
+}
+
+/* ---- U-mode test bodies (naked: no prologue, so a mid-loop trap leaves the
+ *      M-mode stack frame intact). Each spins after its trapping instruction. */
+__attribute__((naked)) static void u_ecall(void)
+{
+    __asm__ volatile("ecall\n j .");
+}
+
+__attribute__((naked)) static void u_spin(void)
+{
+    __asm__ volatile("j .");
+}
+
+__attribute__((naked)) static void u_read_mcsr(void)
+{
+    /* csrr of an M-CSR is illegal from U (cause 2); the ecall is the
+     * cause-8 fallback so the test FAILs (not hangs) if the check is absent. */
+    __asm__ volatile("csrr t0, mstatus\n ecall\n j .");
+}
+
+__attribute__((naked)) static void u_mret_umode(void)
+{
+    /* MRET is an M-mode-only instruction; executing it from U is illegal
+     * (cause 2). The ecall is the cause-8 fallback so the test FAILs (not
+     * hangs) if the check is absent. */
+    __asm__ volatile("mret\n ecall\n j .");
+}
+
+static int report(const char *name, uint32_t got, uint32_t want, uint32_t from_priv)
+{
+    int ok = (got == want) && (from_priv == 0u /* U */);
+    uart_puts(ok ? "[PASS] " : "[FAIL] ");
+    uart_puts(name);
+    uart_puts(" mcause=");
+    uart_hex(got);
+    uart_puts(" from_priv=");
+    uart_hex(from_priv);
+    uart_puts("\r\n");
+    return ok;
+}
+
+int main(void)
+{
+    int all_ok = 1;
+    uint32_t cause;
+
+    uart_puts("\r\n=== U-mode privilege test ===\r\n");
+    set_trap_handler(&umode_trap_handler);
+
+    /* A: ECALL from U-mode -> mcause 8 */
+    cause = run_in_umode(&u_ecall);
+    all_ok &= report("A ecall-from-U (want mcause=8)", cause, 8u, g_from_priv);
+
+    /* B: timer preempts U-mode with MIE=0 -> mcause 0x8000_0007 */
+    (void) disable_interrupts();      /* MIE = 0 */
+    csr_clear(mstatus, MSTATUS_MPIE); /* so U runs with MIE=0 as well */
+    enable_timer_interrupt();         /* mie.MTIE = 1 */
+    set_timer_cmp(rdmtime() + 300);
+    cause = run_in_umode(&u_spin);
+    all_ok &=
+        report("B timer-preempts-U (want mcause=0x80000007)", cause, 0x80000007u, g_from_priv);
+    disable_timer_interrupt();
+
+    /* C: M-mode CSR read from U -> illegal (mcause 2) */
+    cause = run_in_umode(&u_read_mcsr);
+    all_ok &= report("C M-CSR-from-U (want mcause=2)", cause, 2u, g_from_priv);
+
+    /* D: MRET from U -> illegal (mcause 2) */
+    cause = run_in_umode(&u_mret_umode);
+    all_ok &= report("D mret-from-U (want mcause=2)", cause, 2u, g_from_priv);
+
+    uart_puts(all_ok ? "\r\n<<PASS>>\r\n" : "\r\n<<FAIL>>\r\n");
+    for (;;) {
+    }
+    return 0;
+}
diff --git a/sw/apps/wfi_drain_mepc_test/Makefile b/sw/apps/wfi_drain_mepc_test/Makefile
new file mode 100644
index 00000000..aa44fc98
--- /dev/null
+++ b/sw/apps/wfi_drain_mepc_test/Makefile
@@ -0,0 +1,20 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+# Drain-gated WFI mepc directed test. Force the whole program into cached DDR so
+# the pre-WFI store is a slow cached drain (the committed entry that must still be
+# draining when the timer IRQ is taken at the WFI).
+override MEM_CONFIG := ddr
+SRC_C := ../../lib/src/uart.c main.c
+include ../../common/common.mk
diff --git a/sw/apps/wfi_drain_mepc_test/main.c b/sw/apps/wfi_drain_mepc_test/main.c
new file mode 100644
index 00000000..2419eb79
--- /dev/null
+++ b/sw/apps/wfi_drain_mepc_test/main.c
@@ -0,0 +1,167 @@
+/*
+ *    Copyright 2026 Two Sigma Open Source, LLC
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+/*
+ * Directed test for the *drain-gated* WFI mepc spec deviation.
+ *
+ * wfi_mepc_test covers the simple case (timer IRQ at a WFI with an empty ROB ->
+ * mepc must be the post-WFI PC). This test targets a narrower, analysis-derived
+ * window: a machine-timer interrupt that becomes eligible while a WFI is at the
+ * ROB head AND a committed CACHED (DDR) store is still draining.
+ *
+ * Mechanism under test (cpu_ooo.sv interrupt_resume_pc / trap_unit.sv take_trap
+ * gated on sq_committed_empty / the *registered* trap_mret_commit_hold_q): when
+ * the store drain finishes, take_trap fires combinationally that cycle while
+ * commit_hold still lags one cycle, so the WFI is flushed before it commits and
+ * mepc is saved as the WFI's own PC instead of wfi_pc+4. RISC-V priv spec: an
+ * interrupt taken at WFI resumes at the *following* instruction (mepc=wfi_pc+4).
+ *
+ * Construction: DDR-resident (MEM_CONFIG=ddr); immediately before the WFI, store
+ * to a FRESH cold DDR cache line (a different line each margin, in a region the
+ * program never otherwise touches) so the store reliably misses and drains the
+ * full DDR latency -- regardless of L1 write policy / warmth. Sweep the timer
+ * margin so the IRQ lands at every offset across that drain window.
+ *
+ * Robustness fixes vs the first cut:
+ *  - mscratch (the handler's fixed continuation) is armed BEFORE interrupts are
+ *    enabled, inside the asm, so a tiny margin cannot take the trap with a stale
+ *    mscratch and crash. Enable/disable MIE is done in-asm around the WFI.
+ *  - The handler is register-preserving and resumes via mscratch (never the
+ *    recorded mepc), so a wrong mepc is detected, not fatal.
+ *
+ * PASS iff no margin ever produces mepc==wfi_pc. Run at DDR_MODEL_LATENCY>=70.
+ */
+
+#include <stdint.h>
+
+#include "trap.h"
+#include "uart.h"
+
+#define MARGIN_MIN 0u
+#define MARGIN_MAX 200u
+
+/* Cold DDR region the program never otherwise touches (well inside the 64 MiB
+ * model, far from the app's own code/data/stack). Each margin stores to its own
+ * 64 B line here, so every pre-WFI store is a cold miss -> full DDR-latency drain. */
+#define DRAIN_BASE 0x82000000u
+#define DRAIN_LINE 64u
+
+static volatile uint32_t g_mepc;  /* mepc the trap saved, last fire */
+static volatile uint32_t g_taken; /* running count of timer traps taken */
+
+/*
+ * Naked M-mode timer handler. Register-preserving (saves/restores t0,t1 on the
+ * current stack) so the WFI/resume addresses the caller holds in registers across
+ * the WFI are not corrupted. Records the saved mepc, counts the trap, disarms the
+ * timer (mtimecmp_hi := -1) so it cannot refire, then resumes at the fixed
+ * continuation in mscratch -- never at the recorded mepc, so a wrong mepc cannot
+ * send us back into the WFI and hang.
+ */
+__attribute__((naked, aligned(4))) static void wfi_drain_trap_handler(void)
+{
+    __asm__ volatile("addi sp, sp, -16\n"
+                     "sw   t0, 0(sp)\n"
+                     "sw   t1, 4(sp)\n"
+                     "csrr t0, mepc\n"
+                     "lui  t1, %hi(g_mepc)\n"
+                     "sw   t0, %lo(g_mepc)(t1)\n"
+                     "lui  t1, %hi(g_taken)\n"
+                     "lw   t0, %lo(g_taken)(t1)\n"
+                     "addi t0, t0, 1\n"
+                     "sw   t0, %lo(g_taken)(t1)\n"
+                     "li   t1, 0x4000001C\n" /* MTIMECMP_HI: disarm */
+                     "li   t0, -1\n"
+                     "sw   t0, 0(t1)\n"
+                     "csrr t0, mscratch\n" /* fixed continuation after the WFI */
+                     "csrw mepc, t0\n"
+                     "lw   t0, 0(sp)\n"
+                     "lw   t1, 4(sp)\n"
+                     "addi sp, sp, 16\n"
+                     "mret\n");
+}
+
+int main(void)
+{
+    uint32_t bug = 0, correct = 0, early = 0, nofire = 0;
+    uint32_t bug_margin = 0, bug_mepc = 0, bug_wfi = 0;
+
+    uart_printf("\n=== drain-gated WFI mepc test ===\n");
+    set_trap_handler(&wfi_drain_trap_handler);
+    enable_timer_interrupt();
+
+    for (uint32_t margin = MARGIN_MIN; margin <= MARGIN_MAX; margin++) {
+        volatile uint32_t *sink = (volatile uint32_t *) (DRAIN_BASE + margin * DRAIN_LINE);
+        uint32_t wfi_addr = 0;
+        uint32_t resume_addr = 0;
+        uint32_t before = g_taken;
+
+        g_mepc = 0;
+        set_timer_cmp(rdmtime() + margin); /* armed; MIE still 0 until the asm */
+
+        /*
+         * Arm mscratch (handler continuation) BEFORE enabling interrupts, then
+         * enable MIE in-asm; capture the WFI/resume PCs; issue one cold-miss DDR
+         * store IMMEDIATELY before the WFI (the committed entry that must still be
+         * draining when the IRQ is taken); WFI; then disable MIE. The handler
+         * bounces us to label 2 regardless of mepc.
+         */
+        __asm__ volatile("la    %[res], 2f\n"
+                         "csrw  mscratch, %[res]\n"
+                         "csrsi mstatus, 8\n" /* enable MIE (interrupts) after mscratch is valid */
+                         "la    %[wfi], 1f\n"
+                         "sw    %[res], 0(%[sink])\n"
+                         "1:\n"
+                         "wfi\n"
+                         "2:\n"
+                         "csrci mstatus, 8\n" /* disable MIE */
+                         : [res] "=&r"(resume_addr), [wfi] "=&r"(wfi_addr)
+                         : [sink] "r"(sink)
+                         : "memory");
+
+        if (g_taken == before) {
+            nofire++;
+            continue;
+        }
+        if (g_mepc == wfi_addr) {
+            bug++;
+            bug_margin = margin;
+            bug_mepc = g_mepc;
+            bug_wfi = wfi_addr;
+        } else if (g_mepc == resume_addr) {
+            correct++;
+        } else {
+            early++;
+        }
+    }
+
+    disable_timer_interrupt();
+    disable_interrupts();
+
+    uart_printf("sweep: bug=%u correct=%u early=%u nofire=%u\n", bug, correct, early, nofire);
+    if (bug) {
+        uart_printf("drain-gated WFI saved mepc==wfi_pc: margin=%u mepc=%08x wfi=%08x\n",
+                    bug_margin,
+                    bug_mepc,
+                    bug_wfi);
+        uart_printf("<<FAIL>>\n");
+    } else {
+        uart_printf("<<PASS>>\n");
+    }
+
+    for (;;) {
+    }
+    return 0;
+}
diff --git a/sw/apps/wfi_lost_tick/Makefile b/sw/apps/wfi_lost_tick/Makefile
new file mode 100644
index 00000000..32d1f147
--- /dev/null
+++ b/sw/apps/wfi_lost_tick/Makefile
@@ -0,0 +1,19 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+#    Copyright 2026 Two Sigma Open Source, LLC
+#    SPDX-License-Identifier: Apache-2.0
+# WFI-idle lost-machine-timer-tick directed test (deferred-eligibility MIE edge)
+SRC_C := main.c
+include ../../common/common.mk
diff --git a/sw/apps/wfi_lost_tick/main.c b/sw/apps/wfi_lost_tick/main.c
new file mode 100644
index 00000000..22abbe19
--- /dev/null
+++ b/sw/apps/wfi_lost_tick/main.c
@@ -0,0 +1,146 @@
+/*
+ *    Copyright 2026 Two Sigma Open Source, LLC
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+/*
+ * WFI-idle lost-machine-timer-tick directed test.
+ *
+ * Reproduce target: the residual flaky HANG booting no-MMU Linux on Genesys2.
+ * After fixing the MRET-drain deadlock the boot is STILL ~50% flaky, hanging at
+ * VARYING points after the first timer activity with no panic -- the signature
+ * of a LOST machine-timer tick -> frozen jiffies. The machine-timer trap is
+ * occasionally NOT TAKEN and timekeeping stops.
+ *
+ * This faithfully mirrors the kernel's idle + CLINT-timer flow, which the
+ * existing mtimer_stress (no WFI, MIE always 1) and linux_irq_ddr_test miss:
+ *   - idle loop: csrci mstatus,8 (MIE:=0); fence; wfi; csrsi mstatus,8 (MIE:=1).
+ *     The whole kernel is M-mode, so the machine-timer trap is eligible ONLY
+ *     when mstatus.MIE=1 -- it is DEFERRED from the WFI-wake (raw mtip level) to
+ *     the later csrsi MIE 0->1 edge.
+ *   - handler = the CLINT pattern: csr_clear mie.MTIE on entry (clint_timer_
+ *     interrupt), then csr_set mie.MTIE + write a fresh future mtimecmp
+ *     (clint_clock_next_event), then MRET (restores MIE from MPIE).
+ * The re-arm period is phase-swept (mtime + 24..87 per tick) so the deadline
+ * crossing lands at every cycle offset around the wfi / csrsi / MRET-recovery
+ * window across thousands of ticks.
+ *
+ * Invariant: each idle iteration arms exactly one future deadline and must take
+ * exactly one trap, so g_jiffies must equal the iteration count. If any trap is
+ * dropped (and especially if mie.MTIE sticks low so timekeeping freezes),
+ * g_jiffies falls behind -> <<FAIL>>. If every tick is taken -> <<PASS>>.
+ */
+
+#include <stdint.h>
+
+#include "trap.h"
+
+#define MIE_MTIE_BIT 0x80u /* mie.MTIE = bit 7 */
+#define ITERS 3000u
+
+volatile uint32_t g_jiffies; /* incremented once per timer trap (the "tick") */
+
+static void uart_putc(char c)
+{
+    UART_TX = (uint8_t) c;
+}
+static void uart_puts(const char *s)
+{
+    while (*s)
+        uart_putc(*s++);
+}
+static void uart_hex(uint32_t v)
+{
+    static const char hex[] = "0123456789ABCDEF";
+    uart_puts("0x");
+    for (int i = 28; i >= 0; i -= 4)
+        uart_putc(hex[(v >> i) & 0xF]);
+}
+
+/*
+ * Naked M-mode timer handler mirroring the CLINT driver:
+ *   clint_timer_interrupt:  csr_clear(mie, MTIE)               [mask on entry]
+ *   clint_clock_next_event: csr_set(mie, MTIE); write mtimecmp [re-arm]
+ * then MRET (MIE restored from MPIE). The phase-sweep period = 24 + (jiffies&63).
+ */
+__attribute__((naked, aligned(4))) static void clint_like_handler(void)
+{
+    __asm__ volatile("addi sp, sp, -16\n"
+                     "sw   t0, 0(sp)\n"
+                     "sw   t1, 4(sp)\n"
+                     "sw   t2, 8(sp)\n"
+                     "li   t0, 0x80\n"     /* mie.MTIE */
+                     "csrrc x0, mie, t0\n" /* csr_clear(mie, MTIE) -- handler entry */
+                     "lui  t0, %hi(g_jiffies)\n"
+                     "lw   t1, %lo(g_jiffies)(t0)\n"
+                     "addi t1, t1, 1\n"
+                     "sw   t1, %lo(g_jiffies)(t0)\n" /* g_jiffies++  (the tick) */
+                     "andi t2, t1, 0x3f\n"
+                     "addi t2, t2, 24\n" /* period = 24 + (jiffies & 63): phase sweep */
+                     "li   t0, 0x80\n"
+                     "csrrs x0, mie, t0\n"   /* csr_set(mie, MTIE) -- re-arm enable */
+                     "li   t0, 0x40000010\n" /* MTIME_LO */
+                     "lw   t1, 0(t0)\n"
+                     "add  t1, t1, t2\n"
+                     "li   t0, 0x40000018\n" /* MTIMECMP_LO (HI stays 0, set in main) */
+                     "sw   t1, 0(t0)\n"      /* write fresh future deadline -> mtip low */
+                     "lw   t0, 0(sp)\n"
+                     "lw   t1, 4(sp)\n"
+                     "lw   t2, 8(sp)\n"
+                     "addi sp, sp, 16\n"
+                     "mret\n");
+}
+
+int main(void)
+{
+    uart_puts("\r\n=== WFI-idle lost-timer-tick test ===\r\n");
+    set_trap_handler(&clint_like_handler);
+
+    /* Arm the first deadline, enable the machine timer, then run the idle loop. */
+    MTIMECMP_HI = 0;
+    MTIMECMP_LO = (uint32_t) rdmtime() + 40;
+    enable_timer_interrupt(); /* mie.MTIE = 1 */
+    enable_interrupts();      /* mstatus.MIE = 1 (idle loop toggles it) */
+
+    /* Kernel idle pattern: MIE off, WFI (wake on raw mtip), MIE on (deferred
+     * timer trap taken here). Exactly one tick must be taken per iteration. */
+    for (uint32_t i = 0; i < ITERS; i++) {
+        __asm__ volatile("csrci mstatus, 8\n" /* mstatus.MIE = 0 */
+                         "fence\n"
+                         "wfi\n"
+                         "csrsi mstatus, 8\n" /* mstatus.MIE = 1 -> take deferred timer */
+                         ::
+                             : "memory");
+    }
+
+    disable_timer_interrupt();
+    uint32_t jiffies = g_jiffies;
+    uart_puts("iters=");
+    uart_hex(ITERS);
+    uart_puts(" jiffies=");
+    uart_hex(jiffies);
+    uart_puts("\r\n");
+
+    /* Every WFI-wake must produce exactly one tick. A shortfall means a
+     * machine-timer trap was dropped (lost tick / frozen timekeeping). */
+    if (jiffies + 4u >= ITERS) {
+        uart_puts("<<PASS>>\r\n");
+    } else {
+        uart_puts("[FAIL] lost timer tick(s): jiffies fell behind idle iterations\r\n");
+        uart_puts("<<FAIL>>\r\n");
+    }
+    for (;;) {
+    }
+    return 0;
+}
diff --git a/sw/apps/wfi_mepc_test/Makefile b/sw/apps/wfi_mepc_test/Makefile
new file mode 100644
index 00000000..bbb9e7da
--- /dev/null
+++ b/sw/apps/wfi_mepc_test/Makefile
@@ -0,0 +1,17 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+# Makefile for the timer-interrupt-at-WFI mepc directed test
+SRC_C := ../../lib/src/uart.c main.c
+include ../../common/common.mk
diff --git a/sw/apps/wfi_mepc_test/main.c b/sw/apps/wfi_mepc_test/main.c
new file mode 100644
index 00000000..5a361910
--- /dev/null
+++ b/sw/apps/wfi_mepc_test/main.c
@@ -0,0 +1,108 @@
+/*
+ *    Copyright 2026 Two Sigma Open Source, LLC
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+/*
+ * Timer-interrupt-at-WFI mepc directed test.
+ *
+ * No-MMU M-mode Linux dies on the FIRST machine-timer interrupt taken from the
+ * idle loop (which executes WFI). FROST sources the interrupt resume PC (mepc)
+ * from the ROB head_pc UNCONDITIONALLY (reorder_buffer.sv o_trap_pc = head_pc,
+ * trap_unit.sv interrupt o_trap_pc = i_exception_pc), with no head_valid check.
+ * WFI drains the ROB, so when the timer fires at WFI the ROB is EMPTY and the
+ * saved mepc can be a stale head_pc instead of the instruction after the WFI.
+ *
+ * umode_test's timer-preempt never hit this: its U-code spins (ROB busy) and it
+ * never checks mepc. This test fires a timer interrupt while the core is in WFI
+ * (empty ROB) and checks that the saved mepc == the resume point after the WFI.
+ * Self-checks over UART (<<PASS>>/<<FAIL>>).
+ */
+
+#include <stdint.h>
+
+#include "trap.h"
+#include "uart.h"
+
+static volatile uint32_t g_mepc;
+static volatile uint32_t g_taken;
+
+/*
+ * Naked M-mode handler: record mepc (the saved resume PC) + the taken flag,
+ * ack the timer (push mtimecmp_hi to max so it cannot refire), then resume at
+ * the safe continuation stashed in mscratch (NOT the recorded mepc -- if mepc
+ * is wrong we must still land somewhere valid to report the result). Clobbering
+ * temporaries is fine because we bounce to a fixed continuation.
+ */
+__attribute__((naked, aligned(4))) static void wfi_trap_handler(void)
+{
+    __asm__ volatile("csrr t0, mepc\n"
+                     "lui  t1, %hi(g_mepc)\n"
+                     "sw   t0, %lo(g_mepc)(t1)\n"
+                     "li   t0, 1\n"
+                     "lui  t1, %hi(g_taken)\n"
+                     "sw   t0, %lo(g_taken)(t1)\n"
+                     "li   t1, 0x4000001C\n" /* MTIMECMP_HI: ack timer */
+                     "li   t0, -1\n"
+                     "sw   t0, 0(t1)\n"
+                     "csrr t0, mscratch\n" /* safe continuation after the WFI */
+                     "csrw mepc, t0\n"
+                     "mret\n");
+}
+
+int main(void)
+{
+    uint32_t resume_pc;
+
+    uart_printf("\n=== timer-interrupt-at-WFI mepc test ===\n");
+    set_trap_handler(&wfi_trap_handler);
+    g_mepc = 0;
+    g_taken = 0;
+
+    enable_timer_interrupt();
+    set_timer_cmp(rdmtime() + 300); /* fire ~300 cycles out: lands during WFI */
+    enable_interrupts();
+
+    /* Stash the post-WFI continuation in mscratch, capture its address as the
+     * expected resume PC, then WFI (drains the ROB). The timer fires here. */
+    __asm__ volatile("la   t0, 1f\n"
+                     "csrw mscratch, t0\n"
+                     "la   %0, 1f\n"
+                     "wfi\n"
+                     "1:\n"
+                     : "=r"(resume_pc)
+                     :
+                     /* The timer interrupt fires DURING the wfi, and the naked
+                      * handler clobbers BOTH t0 and t1 (it uses t1 to address
+                      * g_mepc/g_taken and then to ack MTIMECMP_HI).  Both must be
+                      * listed so the compiler does not keep a live value (e.g.
+                      * g_taken's base) pinned in t1 across the wfi -- otherwise the
+                      * post-wfi `while(!g_taken)` reads a stale clobbered address
+                      * (DDR layout: 2008(t1=0x4000001C)=0x400007f4) and spins. */
+                     : "t0", "t1", "memory");
+
+    while (!g_taken) {
+    }
+
+    uart_printf("mepc=%08x  expected(after WFI)=%08x  taken=%u\n", g_mepc, resume_pc, g_taken);
+    if (g_mepc == resume_pc) {
+        uart_printf("<<PASS>>\n");
+    } else {
+        uart_printf(
+            "<<FAIL>> interrupt-from-empty-ROB saved a stale mepc (not the WFI resume PC)\n");
+    }
+    for (;;) {
+    }
+    return 0;
+}
diff --git a/sw/lib/include/csr.h b/sw/lib/include/csr.h
index 0acda286..6d7f83b7 100644
--- a/sw/lib/include/csr.h
+++ b/sw/lib/include/csr.h
@@ -89,7 +89,7 @@
 /* ========================================================================== */
 #define MSTATUS_MIE (1U << 3)  /* Machine Interrupt Enable */
 #define MSTATUS_MPIE (1U << 7) /* Machine Previous Interrupt Enable */
-#define MSTATUS_MPP (3U << 11) /* Machine Previous Privilege (2 bits) */
+#define MSTATUS_MPP (3U << 11) /* Machine Previous Privilege (2 bits, WARL {M,U}) */
 
 /* ========================================================================== */
 /* mie/mip bit definitions (interrupt enable/pending)                         */
diff --git a/sw/lib/include/trap.h b/sw/lib/include/trap.h
index bc222157..650b782c 100644
--- a/sw/lib/include/trap.h
+++ b/sw/lib/include/trap.h
@@ -30,9 +30,9 @@
  *   - Privileged instructions (WFI, ECALL, EBREAK)
  *   - Timer interrupt configuration
  *
- * Frost implements machine-mode only (no S-mode or U-mode), so all code
- * runs with full privilege. Traps jump to the address in mtvec, saving
- * the return address in mepc and the cause in mcause.
+ * Frost implements Machine (M) and User (U) privilege modes (no S-mode).
+ * Traps from both M and U are taken in M-mode: they jump to the address in
+ * mtvec, saving the return address in mepc and the cause in mcause.
  *
  * Usage:
  *   // Set up trap handler
@@ -74,7 +74,7 @@ static inline __attribute__((always_inline)) void wfi(void)
 /**
  * ECALL - Environment Call
  *
- * Generates a synchronous exception (mcause = 11 for M-mode).
+ * Generates a synchronous exception (mcause = 8 from U-mode, 11 from M-mode).
  * Used for system calls in OS environments.
  */
 static inline __attribute__((always_inline)) void ecall(void)
diff --git a/tests/Makefile b/tests/Makefile
index 73166812..33369bd5 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -31,9 +31,9 @@ WAVES           ?= 0
 # Export variables for Cocotb
 export TOPLEVEL COCOTB_TEST_MODULES TOPLEVEL_LANG SIM ROOT
 
-# Timer speedup for simulation (1000x faster than real-time)
-# This makes FreeRTOS and timer-based tests complete much faster in simulation
-# Default is 1000 for simulation; set to 1 for synthesis
+# Timer speedup for simulation (intended to make FreeRTOS/timer-based tests faster)
+# NOTE: currently NOT plumbed to Verilator (no -G override is passed), so sims
+# run at the RTL parameter default of 1; this variable has no effect
 SIM_TIMER_SPEEDUP ?= 1000
 
 # Compilation arguments - RTL file list (varies by toplevel)
@@ -145,6 +145,12 @@ COMPILE_ARGS := \
         $(ROOT)/hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv \
         $(ROOT)/hw/rtl/cpu_and_mem/cpu/cpu_ooo/memory_if/data_mem_request_router.sv
 VERILOG_SOURCES :=
+else ifeq ($(TOPLEVEL),trap_unit)
+# Trap unit interrupt/MRET arbitration unit test
+COMPILE_ARGS := \
+        $(ROOT)/hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv \
+        $(ROOT)/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv
+VERILOG_SOURCES :=
 else ifeq ($(TOPLEVEL),frontend_validity_tracker)
 # CPU OOO frontend validity/control-flow tracker unit test
 COMPILE_ARGS := \
@@ -358,8 +364,14 @@ ENABLE_CACHED_TIER ?= 1
 CACHED_HAS_L2 ?= 1
 DDR_MODEL_BYTES ?= 67108864
 DDR_MODEL_LATENCY ?= 30
+# Fast cache maintenance (fence.i) for simulation: completes invalidate-all /
+# writeback-all in drastically fewer cycles while preserving the exact
+# functional effect, so the real Linux kernel's fence.i-heavy patching boots
+# tractably. Defaults ON for sim; the FPGA default in the RTL stays 0 and the
+# board builds never set it.
+SIM_FAST_MAINT ?= 1
 ifeq ($(TOPLEVEL),frost)
-	EXTRA_ARGS += -GMEM_SIZE_BYTES=$(SIM_MEM_SIZE_BYTES) -GENABLE_CACHED_TIER=$(ENABLE_CACHED_TIER) -GCACHED_HAS_L2=$(CACHED_HAS_L2) -GDDR_MODEL_BYTES=$(DDR_MODEL_BYTES) -GDDR_MODEL_LATENCY=$(DDR_MODEL_LATENCY)
+	EXTRA_ARGS += -GMEM_SIZE_BYTES=$(SIM_MEM_SIZE_BYTES) -GENABLE_CACHED_TIER=$(ENABLE_CACHED_TIER) -GCACHED_HAS_L2=$(CACHED_HAS_L2) -GDDR_MODEL_BYTES=$(DDR_MODEL_BYTES) -GDDR_MODEL_LATENCY=$(DDR_MODEL_LATENCY) -GSIM_FAST_MAINT=$(SIM_FAST_MAINT)
 endif
 
 # Include Cocotb simulation makefile rules
diff --git a/tests/README.md b/tests/README.md
index ba852118..821a1d9a 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -110,7 +110,7 @@ FROST_COCOTB_MEM_CONFIG=ddr ./test_run_cocotb.py hello_world
 FROST_COCOTB_MEM_CONFIG=ddr pytest test_run_cocotb.py -k test_real_program
 ```
 
-Tests in `DDR_TIER_EXCLUDE` self-skip in the `ddr` tier: the `*_fetch_fuzz` fetch fuzzers, and the already-DDR-focused `ddr_*` programs (`ddr_test`, `ddr_exec_test`, `ddr_smc_test`, `ddr_heap_test`) whose fixed-address writes a whole-program relocation would clobber. Unit benches are tier-independent and run only once (in the `bram` job).
+Tests in `DDR_TIER_EXCLUDE` self-skip in the `ddr` tier: the `*_fetch_fuzz` fetch fuzzers, and the already-DDR-focused `ddr_*` programs (`ddr_test`, `ddr_exec_test`, `ddr_smc_test`, `ddr_heap_test`, `ddr_atomic_test`) whose fixed-address writes a whole-program relocation would clobber. Unit benches are tier-independent and run only once (in the `bram` job).
 
 ### `test_arch_compliance.py`
 
diff --git a/tests/check_linux_boot_regression.py b/tests/check_linux_boot_regression.py
new file mode 100755
index 00000000..b67b900a
--- /dev/null
+++ b/tests/check_linux_boot_regression.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+"""Assert FROST no-MMU Linux boot health from a cocotb linux_boot capture log.
+
+The CI ``linux-boot-cocotb`` job boots the freshly built image on the FROST RTL
+for ~22M cycles in ``FROST_LINUX_RUN_FULL`` capture mode (with
+``COCOTB_PROGRESS_INTERVAL`` set so the run emits per-interval retire + CLINT
+lines). That window is *silent* ``mem_init`` after ``devtmpfs: initialized`` --
+the next console line is millions of cycles further on -- so there is no deep
+boot marker to match on. Instead this checker asserts the signals that actually
+separate a healthy boot from the regressions the job guards:
+
+  * the "gremlin": a timer-IRQ hang that froze the boot at the periodic CLINT
+    tick (retire count stops advancing; mtimecmp stops being re-armed), and
+  * fence.i / instruction-fetch breakage that derails a long real-code boot.
+
+Health criteria (all must hold):
+  1. the kernel banner printed (the core booted Linux at all),
+  2. early init was reached (``devtmpfs: initialized``),
+  3. no kernel panic,
+  4. the run reached at least ``--min-cycle`` (past the historical gremlin tick
+     at ~cycle 20.96M),
+  5. the core was still retiring instructions in the final progress window
+     (``delta_retired`` >= ``--min-end-delta`` -- i.e. it did not hang), and
+  6. the periodic CLINT timer tick was serviced: mtimecmp was re-armed to at
+     least ``--min-timer-arms`` distinct non-disabled values (the gremlin hung
+     here, freezing the tick).
+
+Usage: ``check_linux_boot_regression.py <cocotb-boot-log>``
+"""
+
+import argparse
+import re
+import sys
+
+BANNER = "Linux version"
+EARLY_INIT = "devtmpfs: initialized"
+PANIC = "Kernel panic"
+MTIMECMP_DISABLED = 0xFFFFFFFFFFFFFFFF
+
+# "... progress: cycle=<n> retired=<r> delta_retired=<d> ..."
+PROGRESS_RE = re.compile(r"progress: cycle=(\d+) retired=\d+ delta_retired=(\d+)")
+# "... CLINT/serial: ... mtimecmp=0x<hex> ..."
+MTIMECMP_RE = re.compile(r"mtimecmp=0x([0-9a-fA-F]+)")
+
+
+def main() -> int:
+    """Read the capture log, assert boot health, return 0 (healthy) else 1."""
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("logfile", help="cocotb linux_boot capture log")
+    ap.add_argument(
+        "--min-cycle",
+        type=int,
+        default=21_000_000,
+        help="require the run reached at least this sim cycle (default: 21e6, "
+        "past the historical gremlin tick at ~20.96e6)",
+    )
+    ap.add_argument(
+        "--min-end-delta",
+        type=int,
+        default=1000,
+        help="require this many retired instructions in the final progress "
+        "window (default: 1000 -- a hung boot retires ~0)",
+    )
+    ap.add_argument(
+        "--min-timer-arms",
+        type=int,
+        default=2,
+        help="require this many distinct armed mtimecmp values, i.e. periodic "
+        "timer ticks serviced (default: 2)",
+    )
+    args = ap.parse_args()
+
+    try:
+        with open(args.logfile, errors="replace") as f:
+            text = f.read()
+    except OSError as exc:
+        print(f"error: cannot read {args.logfile}: {exc}", file=sys.stderr)
+        return 2
+
+    failures = []
+
+    if BANNER not in text:
+        failures.append(
+            f"kernel banner ({BANNER!r}) not found -- core did not boot Linux"
+        )
+    if EARLY_INIT not in text:
+        failures.append(f"early-init marker ({EARLY_INIT!r}) not reached")
+    if PANIC in text:
+        failures.append(f"kernel panic detected ({PANIC!r})")
+
+    progress = [(int(c), int(d)) for c, d in PROGRESS_RE.findall(text)]
+    max_cycle = 0
+    if not progress:
+        failures.append(
+            "no progress lines found -- set COCOTB_PROGRESS_INTERVAL so the run "
+            "emits retire/CLINT progress, or the sim did not start"
+        )
+    else:
+        max_cycle = max(c for c, _ in progress)
+        if max_cycle < args.min_cycle:
+            failures.append(
+                f"boot stopped early: reached cycle {max_cycle:,} < {args.min_cycle:,}"
+            )
+        _, last_delta = max(progress, key=lambda cd: cd[0])
+        if last_delta < args.min_end_delta:
+            failures.append(
+                f"no forward progress at the cap: delta_retired={last_delta} at "
+                f"cycle {max_cycle:,} < {args.min_end_delta} (boot hung?)"
+            )
+
+    armed = {
+        int(v, 16) for v in MTIMECMP_RE.findall(text) if int(v, 16) != MTIMECMP_DISABLED
+    }
+    if len(armed) < args.min_timer_arms:
+        failures.append(
+            f"periodic timer tick not serviced: {len(armed)} distinct armed "
+            f"mtimecmp value(s) < {args.min_timer_arms} (gremlin timer-IRQ hang?)"
+        )
+
+    if failures:
+        print("FROST linux_boot regression FAILED:", file=sys.stderr)
+        for msg in failures:
+            print(f"  - {msg}", file=sys.stderr)
+        return 1
+
+    print(
+        "FROST linux_boot regression PASSED: banner + devtmpfs reached, no panic, "
+        f"timer serviced ({len(armed)} distinct arms), forward progress to "
+        f"cycle {max_cycle:,}."
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/test_arch_compliance.py b/tests/test_arch_compliance.py
index 17b26a9b..359d3205 100755
--- a/tests/test_arch_compliance.py
+++ b/tests/test_arch_compliance.py
@@ -73,8 +73,12 @@
 # Extensions not listed here run all their tests.
 # Frost implements Zbkb (pack, packh, brev8, zip, unzip) from the K extension
 # but not Zbkx (xperm4/xperm8), Zkn (AES/SHA256/SHA512), or Zks (SM3/SM4).
-# Frost is M-mode only (no S/U mode), so privilege tests are filtered
-# to exclude supervisor, user, and hypervisor tests.
+# Frost implements Machine and User privilege (no Supervisor/Hypervisor). The
+# privilege suite's U-mode tests (menvcfg/senvcfg/henvcfg *_illegal_u) drive an
+# S-mode trap routine and require S/H ISA extensions (Ssdtso/Sstc/...), so they
+# cannot run on M+U-only Frost and stay filtered out. Frost's U-mode -- including
+# illegal M-CSR/MRET access from U -- is covered by the directed sw/apps/umode_test
+# instead. Supervisor and hypervisor tests are likewise excluded.
 EXTENSION_TEST_FILTERS: dict[str, set[str]] = {
     "K": {"pack", "packh", "brev8", "zip", "unzip"},
     "privilege": {"ebreak", "ecall", "misalign", "menvcfg_m"},
diff --git a/tests/test_run_cocotb.py b/tests/test_run_cocotb.py
index 11d6c305..41b83416 100755
--- a/tests/test_run_cocotb.py
+++ b/tests/test_run_cocotb.py
@@ -73,7 +73,8 @@ class CocotbRunConfig:
     verilator_extra_args: tuple[str, ...] = ()
 
 
-# CPU testbench tests (multiple modules combined)
+# CPU testbench modules (currently unreferenced: no combined "cpu" registry
+# entry consumes this; each module runs via its own cpu_tb registry entry)
 CPU_TEST_MODULES = ",".join(
     [
         "cocotb_tests.test_cpu",
@@ -90,10 +91,10 @@ class CocotbRunConfig:
         hdl_toplevel_module="frost",
         app_name=program.app_name,
         description=program.description,
-        # All nine workloads run CRC-verified minimal-preset simulations,
-        # including the three whose OFFICIAL datasets exceed the platform's
-        # memory limits (loops/radix2/zip, hardware_supported=False): the
-        # sim presets are small enough to fit and are validated green.
+        # All nine workloads run CRC-verified minimal-preset simulations.
+        # They are also all hardware-supported: the DDR-backed heap and
+        # calibrated hardware_iterations cover the larger datasets
+        # (loops/radix2/zip included); sim keeps the small verified presets.
     )
     for program in COREMARK_PRO_PROGRAMS
 }
@@ -151,6 +152,12 @@ class CocotbRunConfig:
         app_name="ddr_smc_test",
         description="Self-modifying code test (stores + fence.i + execute, full sync chain)",
     ),
+    "smc_fencei_test": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_real_program",
+        hdl_toplevel_module="frost",
+        app_name="smc_fencei_test",
+        description="Hardened SMC/fence.i reproducer (gap sweep, warm/cold L1D, write-miss, tight loop)",
+    ),
     "ddr_heap_test": CocotbRunConfig(
         python_test_module="cocotb_tests.test_real_program",
         hdl_toplevel_module="frost",
@@ -163,6 +170,153 @@ class CocotbRunConfig:
         app_name="csr_test",
         description="CSR test",
     ),
+    "umode_test": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_real_program",
+        hdl_toplevel_module="frost",
+        app_name="umode_test",
+        description="U-mode (User privilege) directed test",
+    ),
+    "csr_rmw_test": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_real_program",
+        hdl_toplevel_module="frost",
+        app_name="csr_rmw_test",
+        description="CSR read-modify-write directed test (csrrw/csrrs/csrrc; kernel trap path)",
+    ),
+    "wfi_mepc_test": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_real_program",
+        hdl_toplevel_module="frost",
+        app_name="wfi_mepc_test",
+        description="Timer-interrupt-at-WFI mepc directed test (empty-ROB interrupt resume PC)",
+    ),
+    "wfi_drain_mepc_test": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_real_program",
+        hdl_toplevel_module="frost",
+        app_name="wfi_drain_mepc_test",
+        description="Drain-gated WFI mepc directed test (timer IRQ at WFI with a draining DDR store)",
+    ),
+    "drain_trapframe_test": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_real_program",
+        hdl_toplevel_module="frost",
+        app_name="drain_trapframe_test",
+        description="Trap-frame store-visibility under L1D eviction (Bug B relocated to pt_regs s2)",
+        # Genesys2-faithful shape: no L2 (L1 -> DDR direct, where a cold write-back
+        # actually drains) + high DDR latency, so the save-store / eviction race is
+        # not masked. The default (L2 on, latency 30) gives a false PASS.
+        verilator_extra_args=("-GCACHED_HAS_L2=0", "-GDDR_MODEL_LATENCY=70"),
+    ),
+    "mret_timer_resume_test": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_real_program",
+        hdl_toplevel_module="frost",
+        app_name="mret_timer_resume_test",
+        description="MRET-to-U + pending-timer mepc directed test (stale interrupt resume PC)",
+    ),
+    "mtimer_stress": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_real_program",
+        hdl_toplevel_module="frost",
+        app_name="mtimer_stress",
+        description="M-mode machine-timer + MRET deadlock stress (phase-swept; flaky-hang repro)",
+    ),
+    "mret_drain_deadlock": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_real_program",
+        hdl_toplevel_module="frost",
+        app_name="mret_drain_deadlock",
+        description="MRET-vs-draining-cached-store deadlock (one-shot o_mret_start; deterministic hang repro)",
+        # Genesys2 cache shape (L1 -> DDR direct), where the bug manifests on
+        # hardware and where a cold cached-store write-back actually drains in sim
+        # (the L2-enabled shape leaves the cold tier undrained, masking the race).
+        verilator_extra_args=("-GCACHED_HAS_L2=0",),
+    ),
+    "wfi_lost_tick": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_real_program",
+        hdl_toplevel_module="frost",
+        app_name="wfi_lost_tick",
+        description="WFI-idle + MIE-toggle + CLINT-rearm lost-timer-tick repro (deferred-eligibility; frozen jiffies)",
+    ),
+    "irq_mie_window": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_real_program",
+        hdl_toplevel_module="frost",
+        app_name="irq_mie_window",
+        description="Short-MIE-window lost-interrupt repro (registered interrupt_pending erased by adjacent MIE clear)",
+    ),
+    "ns16550_test": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_real_program",
+        hdl_toplevel_module="frost",
+        app_name="ns16550_test",
+        description="ns16550a UART face directed test (Linux glue)",
+    ),
+    "clint_test": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_real_program",
+        hdl_toplevel_module="frost",
+        app_name="clint_test",
+        description="SiFive CLINT alias directed test (Linux glue)",
+    ),
+    "linux_boot": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_real_program",
+        hdl_toplevel_module="frost",
+        app_name="linux_boot",
+        description="No-MMU Linux boot (kernel Image in DDR)",
+        include_in_pytest=False,
+    ),
+    # Same boot image, but 128 KiB L1I (the genesys2 HW config the handoff says
+    # wedges at SLUB). Pair with CACHED_HAS_L2=0 to match genesys2. Debug only.
+    "linux_boot_128k": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_real_program",
+        hdl_toplevel_module="frost",
+        app_name="linux_boot",
+        description="No-MMU Linux boot with 128 KiB L1I (genesys2 wedge-repro config)",
+        include_in_pytest=False,
+        verilator_extra_args=("-GL1I_CACHE_BYTES=131072", "-GCACHED_HAS_L2=0"),
+    ),
+    "linux_irq_ddr_test": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_real_program",
+        hdl_toplevel_module="frost",
+        app_name="linux_irq_ddr_test",
+        description="Linux-like machine-timer IRQ path with DDR code/data/stack",
+    ),
+    "linux_irq_active_ddr_test": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_real_program",
+        hdl_toplevel_module="frost",
+        app_name="linux_irq_active_ddr_test",
+        description="Linux-like active-code machine-timer IRQ path with DDR call/return traffic",
+    ),
+    "linux_clksrc_faithful": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_real_program",
+        hdl_toplevel_module="frost",
+        app_name="linux_clksrc_faithful",
+        description="Faithful Linux clocksource-switch: enable-MTIE-then-arm, re-arming handler, bare-wfi idle, concurrent DDR",
+    ),
+    "trap_s2l_fwd": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_real_program",
+        hdl_toplevel_module="frost",
+        app_name="trap_s2l_fwd",
+        description="handle_exception-pattern trap store->load forwarding repro (sw sp,8(tp); lw ,8(tp))",
+    ),
+    "linux_irq_stack_slot_test": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_real_program",
+        hdl_toplevel_module="frost",
+        app_name="linux_irq_stack_slot_test",
+        description="Linux-like timer IRQ over a poisoned DDR callee return-address stack slot",
+    ),
+    "linux_irq_find_next_slot_test": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_real_program",
+        hdl_toplevel_module="frost",
+        app_name="linux_irq_find_next_slot_test",
+        description="Linux _find_next_bit-shaped IRQ over a poisoned DDR return slot",
+    ),
+    "ddr_atomic_test": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_real_program",
+        hdl_toplevel_module="frost",
+        app_name="ddr_atomic_test",
+        description="RV32-A atomics to the cached DDR region (LR/SC, AMO)",
+        include_in_pytest=True,
+    ),
+    "pde_return_hazard": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_real_program",
+        hdl_toplevel_module="frost",
+        app_name="pde_return_hazard",
+        description="pde_subdir_find epilogue return-value hazard reproducer",
+        verilator_extra_args=("-GCACHED_HAS_L2=0",),
+    ),
     "freertos_demo": CocotbRunConfig(
         python_test_module="cocotb_tests.test_real_program",
         hdl_toplevel_module="frost",
@@ -291,6 +445,19 @@ class CocotbRunConfig:
         description="RAS call/return stress under randomized fetch-latency fuzz",
         verilator_extra_args=("-GFETCH_VALID_FUZZ=1",),
     ),
+    "fetch_stall_repro": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_real_program",
+        hdl_toplevel_module="frost",
+        app_name="fetch_stall_repro",
+        description="Directed 32-bit-insn PC+2 mis-step repro (no fuzz; sanity = PASS)",
+    ),
+    "fetch_stall_repro_128k": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_real_program",
+        hdl_toplevel_module="frost",
+        app_name="fetch_stall_repro",
+        description="Directed PC+2 mis-step repro, cached .ddr_text, 128KiB L1I (genesys2)",
+        verilator_extra_args=("-GL1I_CACHE_BYTES=131072",),
+    ),
     # Tomasulo unit tests
     "reorder_buffer": CocotbRunConfig(
         python_test_module="cocotb_tests.tomasulo.reorder_buffer.test_reorder_buffer",
@@ -392,6 +559,11 @@ class CocotbRunConfig:
         hdl_toplevel_module="data_mem_request_router",
         description="CPU OOO data-memory request router tests",
     ),
+    "trap_unit": CocotbRunConfig(
+        python_test_module="cocotb_tests.control.test_trap_unit",
+        hdl_toplevel_module="trap_unit",
+        description="Trap unit tests (interrupt/MRET arbitration)",
+    ),
     "frost_cache": CocotbRunConfig(
         python_test_module="cocotb_tests.cache.test_frost_cache",
         hdl_toplevel_module="frost_cache_test_harness",
@@ -404,6 +576,48 @@ class CocotbRunConfig:
         description="Cache hierarchy unit tests (L1 -> DDR, Genesys2 shape)",
         verilator_extra_args=("-GHAS_L2=0",),
     ),
+    # Same functional suite, but with the sim-only fast maintenance path
+    # (SIM_FAST_MAINT=1) enabled: proves invalidate-all / writeback-all stay
+    # functionally identical when the fence.i fast path is active.
+    "frost_cache_fast": CocotbRunConfig(
+        python_test_module="cocotb_tests.cache.test_frost_cache",
+        hdl_toplevel_module="frost_cache_test_harness",
+        description="Cache hierarchy unit tests, fast fence.i maintenance (L1 -> L2 -> DDR)",
+        verilator_extra_args=("-GHAS_L2=1", "-GSIM_FAST_MAINT=1"),
+    ),
+    "frost_cache_l1_only_fast": CocotbRunConfig(
+        python_test_module="cocotb_tests.cache.test_frost_cache",
+        hdl_toplevel_module="frost_cache_test_harness",
+        description="Cache hierarchy unit tests, fast fence.i maintenance (L1 -> DDR)",
+        verilator_extra_args=("-GHAS_L2=0", "-GSIM_FAST_MAINT=1"),
+    ),
+    # fence.i maintenance cycle-count measurement at the real L1 geometry
+    # (128 KiB D / 16 KiB I). Two builds, slow (FPGA-path FSM) vs fast, so the
+    # speedup is directly observable in the logs. Not part of the pytest sweep.
+    "fence_speed_slow": CocotbRunConfig(
+        python_test_module="cocotb_tests.cache.test_fence_speed",
+        hdl_toplevel_module="frost_cache_test_harness",
+        description="fence.i maintenance cost, FPGA-path FSM (SIM_FAST_MAINT=0)",
+        verilator_extra_args=(
+            "-GHAS_L2=0",
+            "-GL1_CACHE_BYTES=131072",
+            "-GL1I_CACHE_BYTES=16384",
+            "-GSIM_FAST_MAINT=0",
+        ),
+        include_in_pytest=False,
+    ),
+    "fence_speed_fast": CocotbRunConfig(
+        python_test_module="cocotb_tests.cache.test_fence_speed",
+        hdl_toplevel_module="frost_cache_test_harness",
+        description="fence.i maintenance cost, fast sim path (SIM_FAST_MAINT=1)",
+        verilator_extra_args=(
+            "-GHAS_L2=0",
+            "-GL1_CACHE_BYTES=131072",
+            "-GL1I_CACHE_BYTES=16384",
+            "-GSIM_FAST_MAINT=1",
+        ),
+        include_in_pytest=False,
+    ),
     "line_port_arbiter": CocotbRunConfig(
         python_test_module="cocotb_tests.cache.test_line_port_arbiter",
         hdl_toplevel_module="line_port_arbiter_test_harness",
@@ -523,6 +737,49 @@ class CocotbRunConfig:
         description="Tomasulo wrapper tests with CPU production split-RS dispatch",
         verilator_extra_args=("-GSPLIT_RS_DISPATCH=1",),
     ),
+    # Directed machine-mode trap/interrupt tests run on the cpu_tb harness
+    # (one instruction fed per ready cycle into the cpu_ooo core). Collected by
+    # pytest so the cpu_tb suites cannot rot invisibly again (the harness once
+    # sat broken -- missing i_served_addr -- with nothing in CI noticing);
+    # filter to a single function with --testcase when running by hand.
+    "directed_traps": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_directed_traps",
+        hdl_toplevel_module="cpu_tb",
+        description="Directed M-mode trap/interrupt tests (cpu_tb directed suite)",
+    ),
+    # The remaining cpu_tb suites below predate the OOO integration and have
+    # rotted: they probe pre-rename hierarchy (regfile_inst / fp_regfile_inst)
+    # and in-order fixed latencies, so they fail on the current core until
+    # ported to the maintained DUTInterface helpers (as test_directed_traps
+    # was). Registered CLI-only so they are visible and invokable instead of
+    # silently orphaned; their ISA coverage is meanwhile gated in CI by the
+    # rv32ua/rv32uc/rv32um riscv-tests, the arch-compliance matrix, and the
+    # ddr_atomic_test/c_ext_test real programs. Flip include_in_pytest after
+    # porting.
+    "directed_atomics": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_directed_atomics",
+        hdl_toplevel_module="cpu_tb",
+        description="Directed LR.W/SC.W atomic tests (cpu_tb; NEEDS PORTING to OOO)",
+        include_in_pytest=False,
+    ),
+    "directed_multicycle": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_directed_multicycle",
+        hdl_toplevel_module="cpu_tb",
+        description="Directed back-to-back multi-cycle op tests (cpu_tb; NEEDS PORTING to OOO)",
+        include_in_pytest=False,
+    ),
+    "compressed": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_compressed",
+        hdl_toplevel_module="cpu_tb",
+        description="RISC-V C-extension directed tests (cpu_tb; NEEDS PORTING to OOO)",
+        include_in_pytest=False,
+    ),
+    "cpu_random": CocotbRunConfig(
+        python_test_module="cocotb_tests.test_cpu",
+        hdl_toplevel_module="cpu_tb",
+        description="Constrained-random instruction regression (cpu_tb; NEEDS PORTING to OOO)",
+        include_in_pytest=False,
+    ),
 }
 
 # List of real program test names (excludes 'cpu' which uses different toplevel)
diff --git a/verif/README.md b/verif/README.md
index 7a5c07c2..ded3c91d 100644
--- a/verif/README.md
+++ b/verif/README.md
@@ -37,7 +37,7 @@ This directory contains a comprehensive Python-based verification framework for
 
 ### Design Under Test (DUT)
 
-The Frost CPU implements **RV32GCB** (G = IMAFD, plus C and B) with full M-mode privilege support. See the [root README](../README.md) for the full ISA extension table.
+The Frost CPU implements **RV32GCB** (G = IMAFD, plus C and B) with M and U privilege modes. See the [root README](../README.md) for the full ISA extension table.
 
 Additional features:
 - 32 general-purpose registers plus a separate FP register file
@@ -82,6 +82,7 @@ verif/
 │   ├── cache/             # Cache hierarchy + line-port arbiter block tests
 │   ├── cpu_ooo/           # OOO block tests (commit, recovery, memory router,
 │   │                      #   register files, perf counters, pipeline control)
+│   ├── control/           # Control-block tests (trap_unit interrupt/MRET arbitration)
 │   └── tomasulo/          # Block-level cocotb tests for Tomasulo submodules
 │                          #   (ROB, RAT, RS, dispatch, CDB arbiter, LQ/SQ, FU shims)
 ├── models/                # Reference models for verification
@@ -257,26 +258,30 @@ program / unit tests are launched with `./test_run_cocotb.py <name>`; use
 `./test_run_cocotb.py --list-tests` for the canonical target list (the single
 source of truth is `TEST_REGISTRY` in `tests/test_run_cocotb.py`).
 
-The random-regression and directed CPU tests (`test_cpu`,
-`test_directed_atomics`, `test_directed_traps`, `test_compressed`,
-`test_directed_multicycle`) all run on the `cpu_tb` testbench, which is the
-`tests/Makefile` default (`TOPLEVEL=cpu_tb`, `COCOTB_TEST_MODULES=cocotb_tests.test_cpu`).
-They are not `test_run_cocotb.py` registry targets; run them directly:
-
-Run the default random instruction test (all cpu_tb modules):
+The random-regression and directed CPU tests all run on the `cpu_tb`
+testbench and are `test_run_cocotb.py` registry targets: `directed_traps`
+(pytest-collected, in CI) plus `directed_atomics`, `directed_multicycle`,
+`compressed`, and `cpu_random` (registered CLI-only -- these four predate the
+OOO integration and currently fail on the OOO core until ported to the
+maintained `DUTInterface` helpers; their ISA coverage is meanwhile gated by
+the riscv-tests / arch-compliance / real-program suites). Note that a bare
+`make` in `tests/` builds the `Makefile` default (`TOPLEVEL=cpu_tb`,
+`COCOTB_TEST_MODULES=cocotb_tests.test_cpu`), which loads only the unported
+`cpu_random` module -- prefer the registry targets:
+
+Run a cpu_tb suite via the registry:
 ```bash
-make
+# Trap handling (ECALL, EBREAK, MRET) -- ported, runs in CI
+./test_run_cocotb.py directed_traps
+# LR.W/SC.W atomic instructions -- NEEDS PORTING, expected to fail
+./test_run_cocotb.py directed_atomics
 ```
 
-Run a single cpu_tb test function by setting cocotb's `COCOTB_TEST_FILTER`
-(a regex; anchor with `$` for an exact match):
+Run a single test function with `--testcase` (sets cocotb's
+`COCOTB_TEST_FILTER` to an exact match):
 ```bash
-# Forced single address (stress memory hazards)
-COCOTB_TEST_FILTER='test_random_riscv_regression_force_one_address$' make
-# LR.W/SC.W atomic instructions
-COCOTB_TEST_FILTER='test_directed_lr_sc$' make
-# Trap handling (ECALL, EBREAK, MRET)
-COCOTB_TEST_FILTER='test_directed_trap_handling$' make
+./test_run_cocotb.py directed_traps --testcase test_directed_trap_handling
+./test_run_cocotb.py directed_atomics --testcase test_directed_lr_sc
 ```
 
 Run integration tests with real programs (registry targets):
diff --git a/verif/cocotb_tests/cache/test_fence_speed.py b/verif/cocotb_tests/cache/test_fence_speed.py
new file mode 100644
index 00000000..af70b403
--- /dev/null
+++ b/verif/cocotb_tests/cache/test_fence_speed.py
@@ -0,0 +1,141 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+"""fence.i maintenance cycle-count measurement (frost_cache_test_harness DUT).
+
+Drives the cache hierarchy at the real L1 geometry (128 KiB D-side / 16 KiB
+I-side, set via -G in the registry), dirties a handful of D-side lines, then
+issues one fence.i cache-sync handshake and counts the cycles from sync-assert
+to done. Run the two registry builds to see the speedup directly:
+
+    ./test_run_cocotb.py fence_speed_slow   # SIM_FAST_MAINT=0 (FPGA-path FSM)
+    ./test_run_cocotb.py fence_speed_fast   # SIM_FAST_MAINT=1 (fast sim path)
+
+The slow build walks every line (writeback-all over 4096 lines + invalidate-all
+over 512 lines, ~thousands of cycles); the fast build touches only the dirty
+lines and bulk-clears the tags (low hundreds or fewer). The measured count is
+logged as `FENCE_I_MAINT_CYCLES=<n>` for easy comparison.
+"""
+
+from typing import Any
+
+import cocotb
+from cocotb.clock import Clock
+from cocotb.triggers import FallingEdge, RisingEdge
+
+CLOCK_PERIOD_NS = 10
+LINE_BYTES = 32
+BASE_ADDR = 0x8000_0000
+
+# Generous: the slow reset sweep walks every L1 line (4096) before ready.
+READY_TIMEOUT_CYCLES = 100_000
+RESP_TIMEOUT_CYCLES = 20_000
+FENCE_TIMEOUT_CYCLES = 200_000
+
+# Number of distinct dirty D-side lines to publish before the fence.
+NUM_DIRTY_LINES = 16
+
+
+def _clear_inputs(dut: Any) -> None:
+    dut.i_up_req_valid.value = 0
+    dut.i_up_req_write.value = 0
+    dut.i_up_req_addr.value = 0
+    dut.i_up_req_wdata.value = 0
+    dut.i_up_req_wstrb.value = 0
+    dut.i_iup_req_valid.value = 0
+    dut.i_iup_req_write.value = 0
+    dut.i_iup_req_addr.value = 0
+    dut.i_iup_req_wdata.value = 0
+    dut.i_iup_req_wstrb.value = 0
+    dut.i_fence_sync.value = 0
+
+
+async def _setup(dut: Any) -> None:
+    """Start the clock, reset, and wait out the tag-invalidate sweep."""
+    cocotb.start_soon(Clock(dut.i_clk, CLOCK_PERIOD_NS, unit="ns").start())
+    _clear_inputs(dut)
+    dut.i_rst.value = 1
+    for _ in range(4):
+        await RisingEdge(dut.i_clk)
+    await FallingEdge(dut.i_clk)
+    dut.i_rst.value = 0
+    for _ in range(READY_TIMEOUT_CYCLES):
+        await FallingEdge(dut.i_clk)
+        if int(dut.o_up_req_ready.value) == 1 and int(dut.o_iup_req_ready.value) == 1:
+            return
+    raise AssertionError("cache never became ready after reset (sweep stuck?)")
+
+
+async def _write_line(dut: Any, addr: int, wdata: int) -> None:
+    """Whole-line D-side write (dirties the line in L1)."""
+    full = (1 << LINE_BYTES) - 1
+    await FallingEdge(dut.i_clk)
+    dut.i_up_req_valid.value = 1
+    dut.i_up_req_write.value = 1
+    dut.i_up_req_addr.value = addr
+    dut.i_up_req_wdata.value = wdata
+    dut.i_up_req_wstrb.value = full
+    for _ in range(RESP_TIMEOUT_CYCLES):
+        if int(dut.o_up_req_ready.value) == 1:
+            break
+        await FallingEdge(dut.i_clk)
+    else:
+        raise AssertionError(f"write never accepted (addr=0x{addr:08x})")
+    await FallingEdge(dut.i_clk)
+    dut.i_up_req_valid.value = 0
+    dut.i_up_req_write.value = 0
+    for _ in range(RESP_TIMEOUT_CYCLES):
+        if int(dut.o_up_resp_valid.value) == 1:
+            return
+        await FallingEdge(dut.i_clk)
+    raise AssertionError(f"no write response (addr=0x{addr:08x})")
+
+
+async def _measure_fence_cycles(dut: Any) -> int:
+    """Assert i_fence_sync and count cycles until o_fence_done rises."""
+    await FallingEdge(dut.i_clk)
+    dut.i_fence_sync.value = 1
+    cycles = 0
+    for _ in range(FENCE_TIMEOUT_CYCLES):
+        await RisingEdge(dut.i_clk)
+        cycles += 1
+        if int(dut.o_fence_done.value) == 1:
+            break
+    else:
+        raise AssertionError("fence sync never completed")
+    await FallingEdge(dut.i_clk)
+    dut.i_fence_sync.value = 0
+    await FallingEdge(dut.i_clk)
+    return cycles
+
+
+@cocotb.test()
+async def test_fence_i_maintenance_cycles(dut: Any) -> None:
+    """Dirty several lines, fence, and report the maintenance cycle count."""
+    await _setup(dut)
+
+    for i in range(NUM_DIRTY_LINES):
+        addr = BASE_ADDR + i * LINE_BYTES
+        wdata = int.from_bytes(bytes([(i * 7 + b) & 0xFF for b in range(32)]), "little")
+        await _write_line(dut, addr, wdata)
+
+    cycles = await _measure_fence_cycles(dut)
+    dut._log.info(
+        f"FENCE_I_MAINT_CYCLES={cycles} (dirty_lines={NUM_DIRTY_LINES}, "
+        f"L1=128KiB/4096 lines, L1I=16KiB/512 lines)"
+    )
+
+    # Sanity only: completion within the timeout. The slow vs fast comparison is
+    # read from the logged FENCE_I_MAINT_CYCLES line across the two builds.
+    assert cycles < FENCE_TIMEOUT_CYCLES, "fence.i maintenance did not complete"
diff --git a/verif/cocotb_tests/control/test_trap_unit.py b/verif/cocotb_tests/control/test_trap_unit.py
new file mode 100644
index 00000000..feb26146
--- /dev/null
+++ b/verif/cocotb_tests/control/test_trap_unit.py
@@ -0,0 +1,166 @@
+#    Copyright 2026 Two Sigma Open Source, LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+"""Unit tests for trap_unit interrupt/MRET arbitration."""
+
+from typing import Any
+
+import cocotb
+from cocotb.clock import Clock
+from cocotb.triggers import RisingEdge, Timer
+
+
+MSTATUS_MIE = 1 << 3
+MIE_MTIE = 1 << 7
+INTERRUPT_MTIP = 0b010
+PRIV_U = 0
+PRIV_M = 3
+
+
+def _drive_defaults(dut: Any) -> None:
+    dut.i_pipeline_stall.value = 0
+    dut.i_sq_committed_empty.value = 1
+    dut.i_mstatus.value = 0
+    dut.i_mie.value = 0
+    dut.i_mtvec.value = 0x1000
+    dut.i_mepc.value = 0x2000
+    dut.i_mstatus_mie_direct.value = 0
+    dut.i_priv.value = PRIV_M
+    dut.i_interrupts.value = 0
+    dut.i_exception_valid.value = 0
+    dut.i_exception_cause.value = 0
+    dut.i_exception_tval.value = 0
+    dut.i_exception_pc.value = 0x3000
+    dut.i_interrupt_pc.value = 0x4000
+    dut.i_mret_start.value = 0
+    dut.i_wfi_start.value = 0
+
+
+async def _reset(dut: Any) -> None:
+    _drive_defaults(dut)
+    dut.i_rst.value = 1
+    await RisingEdge(dut.i_clk)
+    await RisingEdge(dut.i_clk)
+    dut.i_rst.value = 0
+    await RisingEdge(dut.i_clk)
+
+
+@cocotb.test()
+async def test_mret_defers_registered_timer_interrupt(dut: Any) -> None:
+    """Verify that a pending timer interrupt is deferred while MRET is in flight."""
+    cocotb.start_soon(Clock(dut.i_clk, 10, unit="ns").start())
+    await _reset(dut)
+
+    dut.i_mstatus.value = MSTATUS_MIE
+    dut.i_mstatus_mie_direct.value = 1
+    dut.i_mie.value = MIE_MTIE
+    dut.i_interrupts.value = INTERRUPT_MTIP
+
+    # Latch a timer interrupt while the trap unit is stalled. This creates the
+    # exact bad state from hardware: interrupt_pending is already registered
+    # when MRET reaches the trap unit.
+    dut.i_pipeline_stall.value = 1
+    await RisingEdge(dut.i_clk)
+
+    dut.i_pipeline_stall.value = 0
+    dut.i_mret_start.value = 1
+    await Timer(1, unit="ns")
+
+    assert int(dut.o_trap_taken.value) == 0
+    assert int(dut.o_mret_taken.value) == 1
+    assert int(dut.o_trap_target.value) == 0x2000
+
+    await RisingEdge(dut.i_clk)
+    dut.i_mret_start.value = 0
+    dut.i_priv.value = PRIV_U
+    dut.i_mstatus_mie_direct.value = 0
+    await Timer(1, unit="ns")
+    assert int(dut.o_trap_taken.value) == 0
+
+    await RisingEdge(dut.i_clk)
+    await Timer(1, unit="ns")
+    # Once the MRET-recovery inhibit lifts, the still-live machine timer -- HELD
+    # across the inhibit rather than force-cleared -- is taken at the first
+    # eligible boundary (U-mode here, where a machine interrupt preempts regardless
+    # of MIE). Holding a live source avoids LOSING a real timer tick; the 0x80388bba
+    # panic stays guarded by cpu_ooo's interrupt_resume_pc seed on mret_taken, not
+    # by this latch (commit 718f8cc).
+    assert int(dut.o_trap_taken.value) == 1
+    assert int(dut.o_trap_cause.value) == 0x80000007
+
+
+@cocotb.test()
+async def test_timer_interrupt_still_traps_without_mret(dut: Any) -> None:
+    """Verify that a latched timer interrupt is taken immediately when no MRET is in flight."""
+    cocotb.start_soon(Clock(dut.i_clk, 10, unit="ns").start())
+    await _reset(dut)
+
+    dut.i_mstatus.value = MSTATUS_MIE
+    dut.i_mstatus_mie_direct.value = 1
+    dut.i_mie.value = MIE_MTIE
+    dut.i_interrupts.value = INTERRUPT_MTIP
+
+    dut.i_pipeline_stall.value = 1
+    await RisingEdge(dut.i_clk)
+    dut.i_pipeline_stall.value = 0
+    await Timer(1, unit="ns")
+
+    assert int(dut.o_trap_taken.value) == 1
+    assert int(dut.o_mret_taken.value) == 0
+    assert int(dut.o_trap_cause.value) == 0x80000007
+    assert int(dut.o_trap_target.value) == 0x1000
+
+
+@cocotb.test()
+async def test_registered_interrupt_requires_current_mie(dut: Any) -> None:
+    """Verify that a held interrupt is only taken when current MIE is asserted."""
+    cocotb.start_soon(Clock(dut.i_clk, 10, unit="ns").start())
+    await _reset(dut)
+
+    dut.i_mstatus.value = MSTATUS_MIE
+    dut.i_mstatus_mie_direct.value = 1
+    dut.i_mie.value = MIE_MTIE
+    dut.i_interrupts.value = INTERRUPT_MTIP
+
+    # Latch a pending timer interrupt, then model the Linux return path clearing
+    # mstatus.MIE before that registered pending bit reaches take_trap.
+    dut.i_pipeline_stall.value = 1
+    await RisingEdge(dut.i_clk)
+
+    dut.i_pipeline_stall.value = 0
+    dut.i_mstatus.value = 0
+    dut.i_mstatus_mie_direct.value = 0
+    await Timer(1, unit="ns")
+    assert int(dut.o_trap_taken.value) == 0
+
+    await RisingEdge(dut.i_clk)
+    await Timer(1, unit="ns")
+    assert int(dut.o_trap_taken.value) == 0
+
+    # Once MIE is restored, the timer interrupt was HELD across the MIE-low window
+    # (not erased), so it is eligible and taken IMMEDIATELY on the restore cycle --
+    # one cycle earlier than the old clear-then-re-latch path, which could LOSE the
+    # tick if MIE never stayed high long enough (the no-MMU boot lost-tick hang). It
+    # still requires CURRENT MIE to be taken (eligible gates on live
+    # m_int_globally_enabled), so the name still holds.
+    dut.i_mstatus.value = MSTATUS_MIE
+    dut.i_mstatus_mie_direct.value = 1
+    await Timer(1, unit="ns")
+    assert int(dut.o_trap_taken.value) == 1
+    assert int(dut.o_trap_cause.value) == 0x80000007
+
+    # Cleared on take (trap_taken_prev gates re-entry); does not re-fire next cycle.
+    await RisingEdge(dut.i_clk)
+    await Timer(1, unit="ns")
+    assert int(dut.o_trap_taken.value) == 0
diff --git a/verif/cocotb_tests/cpu_ooo/recovery/test_early_misprediction_recovery.py b/verif/cocotb_tests/cpu_ooo/recovery/test_early_misprediction_recovery.py
index 0894b8ad..b7b5d495 100644
--- a/verif/cocotb_tests/cpu_ooo/recovery/test_early_misprediction_recovery.py
+++ b/verif/cocotb_tests/cpu_ooo/recovery/test_early_misprediction_recovery.py
@@ -274,6 +274,53 @@ async def test_unqualified_mispredictions_do_not_fire(dut: Any) -> None:
         _assert_idle(dut)
 
 
+@cocotb.test()
+async def test_commit_recovery_next_cycle_drops_coincident_fire(dut: Any) -> None:
+    """A fire coinciding with a head-mispredict commit is dropped one cycle later.
+
+    The one-cycle collision the fire-time gates cannot see: a younger branch
+    fires (capture succeeds, i_mispredict_recovery_pending still 0) in the same
+    cycle an older head-mispredict commits.  The commit-time launch registers
+    into mispredict_recovery_pending on the NEXT cycle, and the
+    !i_mispredict_recovery_pending term in early_mispredict_active must drop
+    the early pulse there -- before any redirect / RAT restore /
+    rob_early_recovered write / backend flush.  This is the load-bearing guard
+    that replaced the removed fire-time candidate gate (see the NOTE in
+    branch_resolution.sv); no other test or formal property pins it.
+    """
+    await _setup_test(dut)
+
+    # Cycle N: qualified fire with no recovery pending -- capture succeeds.
+    _drive_mispredict(dut, tag=6, checkpoint_id=1)
+    await _settle_after_edge(dut)
+
+    # Cycle N+1: the coincident commit-time recovery launch is now registered.
+    _clear_inputs(dut)
+    dut.i_mispredict_recovery_pending.value = 1
+    await Timer(1, unit="ns")
+
+    # The pulse is dropped: no active phase, no RAT restore enable.  Only the
+    # benign one-cycle dispatch hold remains (dispatch is being flushed by the
+    # commit-time recovery in this cycle anyway).
+    assert not dut.o_early_mispredict_active.value
+    assert not dut.o_early_recovery_en.value
+    assert dut.o_early_backend_recovery_hold.value
+
+    await _settle_after_edge(dut)
+
+    # Cycle N+2: recovery_pending was a one-cycle pulse; the dropped fire must
+    # leave no residue -- in particular no phantom backend flush.
+    _clear_inputs(dut)
+    await Timer(1, unit="ns")
+
+    assert not dut.o_early_backend_recovery_pending.value
+    _assert_idle(dut)
+
+    await _settle_after_edge(dut)
+
+    _assert_idle(dut)
+
+
 @cocotb.test()
 async def test_backend_phase_blocks_new_capture(dut: Any) -> None:
     """A second misprediction cannot start while the backend phase is pending."""
diff --git a/verif/cocotb_tests/if_stage/test_if_stage.py b/verif/cocotb_tests/if_stage/test_if_stage.py
index 256ed7be..e01f2511 100644
--- a/verif/cocotb_tests/if_stage/test_if_stage.py
+++ b/verif/cocotb_tests/if_stage/test_if_stage.py
@@ -268,6 +268,7 @@ def _clear_inputs(dut: Any) -> None:
     _drive_from_ex(dut, {})
     _drive_fetch(dut, current_word=NOP_INSTR, next_word=NOP_INSTR)
     dut.i_instr_valid.value = 1
+    dut.i_served_addr.value = 0
     _drive_pipeline_ctrl(dut, {})
     _drive_trap_ctrl(dut, {})
     dut.i_frontend_state_flush.value = 0
@@ -278,7 +279,35 @@ def _clear_inputs(dut: Any) -> None:
     dut.i_pd_redirect_target.value = 0
 
 
-async def _setup_test(dut: Any) -> None:
+def _start_served_addr_tracker(dut: Any, *, word_offset: int = 0) -> None:
+    """Model the unit-test fetch provider's served-window tag (i_served_addr).
+
+    if_stage's served-window guard (window_cannot_serve_pc_reg, if_stage.sv:766)
+    squashes the IF output and holds pc_reg whenever the served 64-bit fetch
+    window {word(i_served_addr), word(i_served_addr)+1} does not cover pc_reg's
+    word (delta 0 or -1).  The guard only arms in the cached region
+    (pc_reg[XLEN-1]); the directed tests use cached PCs (BASE_PC=0x80001000), so
+    it is live.  In the real SoC i_served_addr is the registered fetch address
+    (cpu_and_mem.sv:585), which tracks pc_reg for the always-valid 1-cycle
+    provider these tests model.  Mirror that here so the guard stays inert during
+    normal fetch.  pc_reg only changes on a clock edge, so refreshing once per
+    edge keeps i_served_addr aligned with pc_reg for every read in between.
+
+    word_offset>0 deliberately leads the served window ahead of pc_reg (e.g. the
+    F=W+1 case) to exercise the guard instead of suppressing it.
+    """
+    mask = (1 << XLEN) - 1
+
+    async def _tracker() -> None:
+        while True:
+            dut.i_served_addr.value = (int(dut.pc_reg.value) + 4 * word_offset) & mask
+            await RisingEdge(dut.i_clk)
+            await Timer(1, unit="step")
+
+    cocotb.start_soon(_tracker())
+
+
+async def _setup_test(dut: Any, *, served_word_offset: int = 0) -> None:
     """Start the clock, reset the IF stage, and clear inputs."""
     cocotb.start_soon(Clock(dut.i_clk, CLOCK_PERIOD_NS, unit="ns").start())
     _clear_inputs(dut)
@@ -286,6 +315,7 @@ async def _setup_test(dut: Any) -> None:
     await RisingEdge(dut.i_clk)
     await FallingEdge(dut.i_clk)
     _drive_pipeline_ctrl(dut, {})
+    _start_served_addr_tracker(dut, word_offset=served_word_offset)
     await _settle()
 
 
@@ -768,3 +798,111 @@ async def test_fetch_invalid_compressed_pair_resume(dut: Any) -> None:
         effective=COMPRESSED_HINT,
         compressed=True,
     )
+
+
+@cocotb.test()
+async def test_pd_redirect_stall_32bit_target_no_plus2_desync(dut: Any) -> None:
+    """PD-redirect+BTB-collision+stall must not advance pc_reg by +2 onto a 32-bit instruction.
+
+    Same race as test_pd_redirect_with_stall_kills_registered_prediction_handoff
+    but the wrong-ADVANCE (+2) variant rather than wrong-TARGET: on genesys2 the
+    HW lands pc_reg 2 bytes into a 32-bit insn (epc=0x8038d7fa, mid sw zero,4(s1))
+    at workqueue_init_early -> illegal-instruction Oops. Drive a 32-bit stream at
+    the PD target; every dispatched PC must be 4-byte aligned.
+    """
+    await _setup_test(dut)
+    dut.i_disable_branch_prediction.value = 0
+
+    branch_pc = BASE_PC + 8
+    stale_pred_target = 0x80005000
+    pd_target = 0x80006000
+
+    _drive_from_ex(
+        dut,
+        {
+            "btb_update": True,
+            "btb_update_pc": branch_pc,
+            "btb_update_target": stale_pred_target,
+            "btb_update_taken": True,
+            "btb_update_compressed": False,
+            "btb_update_requires_pc_reg_handoff": True,
+        },
+    )
+    await _advance_cycle(dut)
+    _drive_from_ex(dut, {})
+
+    await _redirect_to(dut, BASE_PC)
+    prediction_cycle_found = False
+    for _ in range(20):
+        if int(dut.branch_prediction_controller_inst.o_prediction_used.value):
+            prediction_cycle_found = True
+            break
+        await _advance_cycle(dut)
+    assert prediction_cycle_found, "BTB prediction never fired; test misconfigured"
+
+    dut.i_pd_redirect.value = 1
+    dut.i_pd_redirect_target.value = pd_target
+    await _advance_cycle(dut)
+    dut.i_pd_redirect.value = 0
+    dut.i_pd_redirect_target.value = 0
+
+    _drive_pipeline_ctrl(dut, {"stall": True})
+    await _advance_cycle(dut)
+    for _ in range(3):
+        _drive_pipeline_ctrl(dut, {"stall": True, "stall_registered": True})
+        await _advance_cycle(dut)
+    _drive_pipeline_ctrl(dut, {})
+
+    bad: list[int] = []
+    for _ in range(8):
+        _drive_fetch(dut, current_word=ADD_INSTR_A, next_word=ADD_INSTR_B)
+        await _settle()
+        packet = _read_if_packet(dut)
+        if not packet["sel_nop"]:
+            pc = packet["program_counter"]
+            if pc & 0x2:
+                bad.append(pc)
+        await _advance_cycle(dut)
+    assert not bad, (
+        "pc_reg landed mid-32-bit-instruction (+2 desync) after PD-redirect+stall: "
+        f"{[hex(x) for x in bad]}"
+    )
+
+
+@cocotb.test()
+async def test_fetch_window_lead_parity_plus2_desync(dut: Any) -> None:
+    """Fetch window leading pc_reg by one word (F=W+1) -> is_compressed_fast reads word(W+2)'s size bit.
+
+    If that word's low parcel predecodes compressed, a
+    word-aligned 32-bit insn at pc_reg advances +2 (mid-instruction). This is the
+    workqueue_init_early HW Oops shape (epc 2 bytes into a word-aligned 32-bit sw).
+    fetch_word_swapped = i_instr_bank_sel_r ^ pc_reg[2] is a 1-bit parity that
+    cannot represent F=W+1 (instruction_aligner.sv:141-147,235-240).
+    """
+    # served_word_offset=1 models the served window leading pc_reg by one word
+    # (F=W+1): the case the served-window guard must catch (hold pc_reg, stay
+    # 4-aligned) rather than letting the 1-bit aligner parity advance pc_reg +2.
+    await _setup_test(dut, served_word_offset=1)
+    await _redirect_to(
+        dut, BASE_PC
+    )  # pc_reg -> 0x80001000 (bit1=0, bit2=0); 32-bit insn here
+
+    _drive_fetch(
+        dut,
+        current_word=ADD_INSTR_A,  # i_instr[31:0]
+        next_word=0x00000004,  # i_instr[63:32] = word(W+2); lo parcel 0x0004 -> "compressed"
+        current_sb=_sideband(),  # 32-bit at pc_reg
+        next_sb=_sideband(compressed_lo=True, compressed_hi=False),
+        bank_sel=1,  # = ~pc_reg[2]; models served window one word AHEAD (F=W+1)
+    )
+    await _settle()
+    assert int(_read_if_packet(dut)["program_counter"]) == BASE_PC
+
+    await _advance_cycle(dut)
+    _drive_fetch(dut, current_word=ADD_INSTR_B, next_word=ADD_INSTR_C, bank_sel=1)
+    await _settle()
+    pc2 = int(_read_if_packet(dut)["program_counter"])
+    assert (pc2 & 0x2) == 0, (
+        f"pc_reg landed mid-32-bit-instruction at {pc2:#x} "
+        "(F=W+1 fetch-window-lead parity hole; is_compressed_fast read the wrong word)"
+    )
diff --git a/verif/cocotb_tests/test_common.py b/verif/cocotb_tests/test_common.py
index accd88c6..96f3bf26 100644
--- a/verif/cocotb_tests/test_common.py
+++ b/verif/cocotb_tests/test_common.py
@@ -129,8 +129,10 @@ def handle_branch_flush(
         │ Cycle 3: All flags cleared             → Resume normal ops  │
         └─────────────────────────────────────────────────────────────┘
 
-    All branches and jumps (JAL, JALR, conditional branches) are resolved in
-    EX stage and cause a 3-cycle flush.
+    This reference model treats all branches and jumps (JAL, JALR,
+    conditional branches) as resolving at EX with a 3-cycle flush; the
+    actual CPU predicts branches and its flush timing varies, but the
+    monitors' expected-value queues line up with this simplified model.
 
     Args:
         state: Test state to update branch tracking
diff --git a/verif/cocotb_tests/test_directed_traps.py b/verif/cocotb_tests/test_directed_traps.py
index a87f1f0a..cce7f2e5 100644
--- a/verif/cocotb_tests/test_directed_traps.py
+++ b/verif/cocotb_tests/test_directed_traps.py
@@ -51,10 +51,7 @@
     └────────────────────────────────────────────────────────────────┘
 
 Usage:
-    make test TEST=test_directed_trap_handling
-    make test TEST=test_directed_interrupt_trap_mstatus
-    make test TEST=test_directed_mret_interrupt_race
-    make test TEST=test_directed_csrsi_enable_mie
+    cd tests && make clean && ./test_run_cocotb.py directed_traps
 """
 
 import cocotb
@@ -1004,34 +1001,45 @@ async def run_directed_csrsi_enable_mie_test(
     await FallingEdge(dut_if.clock)
     await dut_if.wait_ready()
     dut_if.instruction = instr_csrsi_mstatus
-
-    # Log cycle-by-cycle what happens
-    cocotb.log.info("=== Cycle-by-cycle monitoring ===")
-    for cycle in range(PIPELINE_DEPTH):
+    await RisingEdge(dut_if.clock)
+    # Park a NOP so exactly one CSRSI enters the pipe (the harness keeps
+    # presenting dut_if.instruction every fetch; without this the trap handler
+    # would fetch CSRSIs and re-enable MIE in a trap loop).
+    dut_if.instruction = 0x00000013
+
+    # Wait for the trap, event-based. On the OOO core a CSR op is serialized:
+    # drain to the ROB head, csr_done handshake, commit, then the CSR write
+    # lands off the registered commit bus -- roughly 8-11 cycles from fetch,
+    # not the old in-order PIPELINE_DEPTH. Then the (registered) pending
+    # interrupt is taken. Poll with a generous budget instead of guessing.
+    cocotb.log.info("=== Waiting for CSRSI commit + interrupt trap ===")
+    trap_seen_cycle = -1
+    for cycle in range(100):
         await RisingEdge(dut_if.clock)
         try:
             trap_taken = int(dut.device_under_test.trap_unit_inst.o_trap_taken.value)
             mstatus = int(dut.device_under_test.csr_file_inst.mstatus.value)
-            mie = (mstatus >> 3) & 1
-            mpie = (mstatus >> 7) & 1
-            csr_write_en = int(dut.device_under_test.csr_write_enable.value)
-            cocotb.log.info(
-                f"Cycle {cycle}: trap_taken={trap_taken}, csr_write_en={csr_write_en}, "
-                f"mstatus=0x{mstatus:08X}, MIE={mie}, MPIE={mpie}"
-            )
-
-            # If trap was taken, check mstatus
+            csr_fire = int(dut.device_under_test.csr_commit_fire.value)
+            if csr_fire or trap_taken:
+                cocotb.log.info(
+                    f"Cycle {cycle}: trap_taken={trap_taken}, "
+                    f"csr_commit_fire={csr_fire}, mstatus=0x{mstatus:08X}"
+                )
             if trap_taken:
+                trap_seen_cycle = cycle
                 cocotb.log.info(f">>> Trap detected at cycle {cycle}")
-                # The trap is being taken NOW, mstatus update happens next edge
+                break
         except Exception as e:
             cocotb.log.warning(f"Cycle {cycle}: Could not read signals: {e}")
 
-    # Wait one additional cycle for mstatus to be updated after trap_taken.
-    # This is needed because:
-    # 1. interrupt_pending is registered in trap_unit for timing optimization (1 cycle latency)
-    # 2. mstatus update happens on the clock edge after trap_taken is asserted
-    # Total: 2 cycles from MIE enable to mstatus update
+    assert trap_seen_cycle >= 0, (
+        "CSRSI+TRAP BUG: no trap taken within 100 cycles of the CSRSI "
+        "(MIE enable never took effect or pending interrupt not delivered)"
+    )
+
+    # mstatus updates on the edge after trap_taken asserts; give it two edges
+    # so the registered trap state settles before the final check.
+    await RisingEdge(dut_if.clock)
     await RisingEdge(dut_if.clock)
 
     # Final check
@@ -1329,3 +1337,434 @@ async def run_directed_illegal_instruction_test(
 async def test_directed_illegal_instruction(dut: Any) -> None:
     """Directed test for illegal instruction trapping (mcause=2)."""
     await run_directed_illegal_instruction_test(dut)
+
+
+# ============================================================================
+# Directed Test for Precise-Interrupt / Commit Race (mepc off-by-one detector)
+# ============================================================================
+#
+# Bug this test was written to catch (since fixed):
+#   When an async machine-timer interrupt was recognized in the SAME cycle an
+#   ordinary instruction committed, precise state was mis-handled.
+#     * commit_en (reorder_buffer.sv) was gated only by the REGISTERED
+#       trap_mret_commit_hold_q (cpu_ooo.sv), which for an async interrupt stays
+#       low (it tracks trap_pending/mret/drain, none of which an async timer IRQ
+#       asserts). So a normal commit could fire in the cycle o_trap_taken asserted.
+#     * interrupt_resume_pc (cpu_ooo.sv), the source of mepc for async
+#       interrupts, was updated from the COMBINATIONAL rob_commit_valid_raw, so a
+#       commit in the trap cycle advanced it to that instruction's next-PC.
+#     * The registered ROB commit (reorder_buffer.sv o_commit.valid) and the
+#       regfile write (commit_actions.sv) were NOT gated by the coincident flush /
+#       trap, so the racing instruction's architectural write still landed.
+#   Net effect: mepc and the set of architecturally-retired instructions could
+#   disagree by one -- a precise-state violation (on Linux this surfaced as a
+#   lost callee-saved restore, s2 = 0x19999998).
+#   Fixed by the commit_ready_early gating in reorder_buffer.sv, which blocks
+#   commit_en on the coincident i_flush_en / i_flush_all / i_commit_hold; this
+#   test now asserts zero violations as a regression check.
+#
+# Detector (prefix invariant): at trap entry the architectural regfile must
+# reflect EXACTLY the instructions with PC < mepc -- every such instruction's
+# destination register holds its marker, and no instruction with PC >= mepc has
+# its marker visible.  This sweeps the interrupt fire-cycle across a stream of
+# distinct register-writing ALU ops in a SINGLE simulation and flags any offset
+# where the invariant breaks.
+#
+# Regfile note: the architectural integer regfile is a multi-write distributed
+# RAM (generic_regfile -> mwp_dist_ram) with a per-address live-value table, so
+# a register's committed value is g_banks[lvt[r]].u_bank.ram[r] (read port 0).
+
+
+async def run_directed_interrupt_commit_race_test(
+    dut: Any, config: TestConfig | None = None, mode: str = "alu"
+) -> None:
+    """Sweep an async timer interrupt cycle-by-cycle over a register-writing stream.
+
+    Assert the trap-entry precise-state prefix invariant.
+
+    mode="alu":  stream is `addi xK, x0, marker` (result produced in EX).
+    mode="load": stream is `lw  xK, off(x4)`     (result produced via the load
+                 queue / data memory) -- mirrors the Linux symptom, which was a
+                 lost callee-saved *load* restore (s2 = 0x19999998).
+    """
+    from encoders.op_tables import I_ALU, CSRS, LOADS
+    from encoders.instruction_encode import CSRAddress
+
+    if config is None:
+        config = TestConfig(num_loops=100)
+
+    # ---- parameters --------------------------------------------------------
+    nop = 0x00000013
+    base_reg = 5  # stream writes x5..x{4+n_stream}
+    n_stream = 27  # x5..x31
+    warmup = 6
+    gap = 4  # NOPs between serialized CSR writes
+    obs = 56  # stream + observation cycles per offset
+    post_trap = 8  # cycles to keep observing after o_trap_taken
+    fire_lo, fire_hi = 0, 40
+    mem_base = 0x400  # byte base of the load region (x4); BRAM, non-cached
+    word_base = mem_base >> 2
+
+    enc_addi = I_ALU["addi"][0]
+    enc_slli = I_ALU["slli"][0]
+    enc_csrrw = CSRS["csrrw"]
+    enc_lw = LOADS["lw"][0]
+
+    # Iteration-unique expected destination value: distinct per (stream index,
+    # generation) and never 0, so a leftover value from a prior sweep iteration
+    # can never masquerade as a commit in the current one (neither the regfile
+    # RAM nor the data BRAM is reset between runs).
+    def expected_val(i: int, gen: int) -> int:
+        if mode == "load":
+            # 32-bit memory word loaded into the dest register.
+            return (0x19990000 | ((gen & 0xFF) << 8) | (i & 0xFF)) & MASK32
+        return 0x40 + gen * 48 + i  # 12-bit addi immediate (<= 1914)
+
+    def stream_instr(c: int, gen: int) -> int:
+        if mode == "load":
+            return enc_lw(base_reg + c, 4, c * 4)  # lw x{5+c}, (c*4)(x4)
+        return enc_addi(base_reg + c, 0, expected_val(c, gen))
+
+    dut_if = DUTInterface(dut)
+    clk = dut_if.clock
+    d = dut.device_under_test
+
+    def ri(handle: Any) -> int | None:
+        try:
+            return int(handle.value)
+        except Exception:
+            return None
+
+    # Read port 0 of the architectural integer regfile (multi-write banked RAM).
+    def _read_port0() -> Any:
+        return d.ooo_register_files_inst.regfile_inst.gen_read_port[
+            0
+        ].gen_multi_write.read_port_ram
+
+    def read_reg(r: int) -> int | None:
+        if r == 0:
+            return 0
+        try:
+            rp = _read_port0()
+            sel = int(rp.lvt[r].value)
+            return int(rp.g_banks[sel].u_bank.ram[r].value) & MASK32
+        except Exception as e:  # pragma: no cover - surfaced as a clear failure
+            raise AssertionError(
+                f"regfile read path failed for x{r}: {e}. "
+                f"Expected ooo_register_files_inst.regfile_inst."
+                f"gen_read_port[0].gen_multi_write.read_port_ram.{{lvt,g_banks[*].u_bank.ram}}"
+            ) from e
+
+    # one clock for the whole sweep
+    cocotb.start_soon(Clock(clk, config.clock_period_ns, unit="ns").start())
+
+    async def feed(instr: int) -> None:
+        await FallingEdge(clk)
+        await dut_if.wait_ready()
+        dut_if.instruction = instr
+        await RisingEdge(clk)
+
+    gen_counter = {"g": 0}
+
+    async def setup_phase() -> int:
+        """Reset and rebuild mtvec/mie/mstatus via fed instructions.
+
+        Enables the machine-timer interrupt; i_interrupts remains 0 so nothing fires yet.
+        """
+        gen = gen_counter["g"]
+        gen_counter["g"] += 1
+        dut.i_interrupts_reg.value = 0
+        dut_if.instruction = nop
+        await dut_if.reset_dut(config.reset_cycles)
+        for _ in range(6):
+            await feed(nop)
+        # Preload the load region with this generation's expected values (the
+        # data BRAM persists across reset, so refresh it every iteration).
+        if mode == "load":
+            for i in range(n_stream):
+                dut.data_memory_for_simulation.memory[
+                    word_base + i
+                ].value = expected_val(i, gen)
+        # Construct CSR operands (no deposits needed): x1=mtvec(0x1000),
+        # x2=mie.MTIE(0x80), x3=mstatus.MIE(0x08), x4=load base.
+        await feed(enc_addi(1, 0, 1))  # x1 = 1
+        await feed(enc_slli(1, 1, 12))  # x1 = 0x1000
+        await feed(enc_addi(2, 0, 0x80))  # x2 = MTIE
+        await feed(enc_addi(3, 0, 0x08))  # x3 = MIE
+        await feed(enc_addi(4, 0, mem_base))  # x4 = load base address
+        for _ in range(warmup):
+            await feed(nop)
+        await feed(enc_csrrw(0, CSRAddress.MTVEC, 1))
+        for _ in range(gap):
+            await feed(nop)
+        await feed(enc_csrrw(0, CSRAddress.MIE, 2))
+        for _ in range(gap):
+            await feed(nop)
+        await feed(enc_csrrw(0, CSRAddress.MSTATUS, 3))
+        for _ in range(gap):
+            await feed(nop)
+        return gen
+
+    async def calibrate() -> list[int]:
+        """Run the stream with no interrupt to learn each stream instruction's PC.
+
+        Captures PCs from regfile write ports and confirms a clean run commits
+        every marker in order.
+        """
+        gen = await setup_phase()
+        reg_pc: dict[int, int] = {}
+        for c in range(obs):
+            await FallingEdge(clk)
+            await dut_if.wait_ready()
+            dut_if.instruction = stream_instr(c, gen) if c < n_stream else nop
+            await RisingEdge(clk)
+            we0, a0, pc0 = (
+                ri(d.dbg_port0_int_we),
+                ri(d.dbg_port0_int_addr),
+                ri(d.dbg_rob_commit_reg_pc),
+            )
+            we1, a1, pc1 = (
+                ri(d.dbg_port1_int_we),
+                ri(d.dbg_port1_int_addr),
+                ri(d.dbg_rob_commit_2_reg_pc),
+            )
+            if (
+                we0
+                and a0 is not None
+                and pc0 is not None
+                and base_reg <= a0 < base_reg + n_stream
+            ):
+                reg_pc.setdefault(a0, pc0)
+            if (
+                we1
+                and a1 is not None
+                and pc1 is not None
+                and base_reg <= a1 < base_reg + n_stream
+            ):
+                reg_pc.setdefault(a1, pc1)
+        missing = [
+            base_reg + i for i in range(n_stream) if (base_reg + i) not in reg_pc
+        ]
+        assert not missing, f"calibration missed regfile writes for {missing}: {reg_pc}"
+        stream_pcs = [reg_pc[base_reg + i] for i in range(n_stream)]
+        for i in range(1, n_stream):
+            assert (
+                stream_pcs[i] == stream_pcs[0] + 4 * i
+            ), f"stream PCs not contiguous: {[hex(p) for p in stream_pcs]}"
+        for i in range(n_stream):
+            v = read_reg(base_reg + i)
+            assert v == expected_val(i, gen), (
+                f"clean-run marker mismatch x{base_reg + i}: "
+                f"got {v:#x} want {expected_val(i, gen):#x}"
+            )
+        cocotb.log.info(
+            f"Calibrated stream PCs x{base_reg}..x{base_reg + n_stream - 1}: "
+            f"{stream_pcs[0]:#x}..{stream_pcs[-1]:#x} (step 4); clean run committed "
+            f"all {n_stream} markers."
+        )
+        return stream_pcs
+
+    async def run_offset(fire_offset: int, stream_pcs: list[int]) -> dict[str, Any]:
+        gen = await setup_phase()
+        trap_c: int | None = None
+        racer: dict[str, Any] | None = None
+        resume_at_trap: int | None = None
+        last_mepc: int | None = None
+        for c in range(obs):
+            await FallingEdge(clk)
+            await dut_if.wait_ready()
+            # Stop injecting new stream writes once the trap is taken so the
+            # post-trap handler (NOPs) cannot perturb the x5..x31 snapshot.
+            if c < n_stream and trap_c is None:
+                dut_if.instruction = stream_instr(c, gen)
+            else:
+                dut_if.instruction = nop
+            # Cycle-exact injection: assert mtip for the cycle ending at this edge.
+            if c == fire_offset:
+                dut.i_interrupts_reg.value = 0b010
+            await RisingEdge(clk)
+            ttr = ri(d.dbg_trap_taken_raw)
+            mepc = ri(d.csr_file_inst.mepc)
+            if mepc is not None:
+                last_mepc = mepc
+            if trap_c is None and ttr == 1:
+                trap_c = c
+                resume_at_trap = ri(d.dbg_interrupt_resume_pc)
+                racer = dict(
+                    valid=ri(d.dbg_commit_valid),
+                    pc=ri(d.dbg_commit_pc),
+                    dest_valid=ri(d.dbg_commit_dest_valid),
+                    dest_reg=ri(d.dbg_commit_dest_reg),
+                    value=ri(d.dbg_commit_value),
+                    c2_valid=ri(d.dbg_commit_2_valid),
+                    c2_pc=ri(d.dbg_commit_2_pc),
+                )
+            if trap_c is not None and c >= trap_c + post_trap:
+                break
+        mepc_final = ri(d.csr_file_inst.mepc)
+        if mepc_final is None:
+            mepc_final = last_mepc
+        regs = {base_reg + i: read_reg(base_reg + i) for i in range(n_stream)}
+        dut.i_interrupts_reg.value = 0
+        return dict(
+            fire_offset=fire_offset,
+            gen=gen,
+            trap_c=trap_c,
+            mepc=mepc_final,
+            resume_at_trap=resume_at_trap,
+            racer=racer,
+            regs=regs,
+        )
+
+    def analyze(res: dict[str, Any], stream_pcs: list[int]) -> dict[str, Any]:
+        gen = res["gen"]
+        mepc = res["mepc"]
+        regs = res["regs"]
+        committed = [
+            regs[base_reg + i] == expected_val(i, gen) for i in range(n_stream)
+        ]
+        ncommit = sum(committed)
+        longest_prefix = 0
+        while longest_prefix < n_stream and committed[longest_prefix]:
+            longest_prefix += 1
+        lost: list[int] = []
+        leaked: list[int] = []
+        r: int | None = None
+        no_trap = res["trap_c"] is None
+        if mepc is not None and not no_trap:
+            # Expected #committed stream instrs == those with PC < mepc.
+            r = sum(1 for pc in stream_pcs if pc < mepc)
+            for i in range(n_stream):
+                if stream_pcs[i] < mepc and not committed[i]:
+                    lost.append(i)  # mepc skipped it, but its write is missing
+                elif stream_pcs[i] >= mepc and committed[i]:
+                    leaked.append(i)  # committed though mepc resumes at/before it
+        violation = bool(lost or leaked) and not no_trap
+        return dict(
+            committed=committed,
+            ncommit=ncommit,
+            longest_prefix=longest_prefix,
+            R=r,
+            lost=lost,
+            leaked=leaked,
+            violation=violation,
+            no_trap=no_trap,
+        )
+
+    # ---- sweep -------------------------------------------------------------
+    cocotb.log.info(f"=== Precise-interrupt sweep: stream mode={mode} ===")
+    cocotb.log.info("=== Calibrating clean stream PCs (no interrupt) ===")
+    stream_pcs = await calibrate()
+
+    cocotb.log.info("=== Sweeping interrupt fire-cycle (single simulation) ===")
+    results: list[dict[str, Any]] = []
+    for fire_offset in range(fire_lo, fire_hi):
+        res = await run_offset(fire_offset, stream_pcs)
+        an = analyze(res, stream_pcs)
+        res["an"] = an
+        results.append(res)
+
+        def _h(x: int | None) -> str:
+            return "None" if x is None else f"0x{x:08x}"
+
+        racer = res["racer"] or {}
+        tag = (
+            "  <<< VIOLATION"
+            if an["violation"]
+            else ("  (no trap)" if an["no_trap"] else "")
+        )
+        cocotb.log.info(
+            f"offset={fire_offset:2d} trap_c={res['trap_c']} mepc={_h(res['mepc'])} "
+            f"resume@trap={_h(res['resume_at_trap'])} R={an['R']} "
+            f"committed={an['ncommit']} prefix={an['longest_prefix']} "
+            f"lost={an['lost']} leaked={an['leaked']} "
+            f"racer[pc={_h(racer.get('pc'))} x{racer.get('dest_reg')}={_h(racer.get('value'))} "
+            f"v={racer.get('valid')}]{tag}"
+        )
+
+    violations = [r for r in results if r["an"]["violation"]]
+
+    # ---- detailed evidence for each violation ------------------------------
+    for r in violations[:8]:
+        an = r["an"]
+        gen = r["gen"]
+        fo = r["fire_offset"]
+        cocotb.log.error(
+            f"--- VIOLATION fire_offset={fo} mepc=0x{r['mepc']:08x} "
+            f"resume_pc@trap="
+            f"{f'0x{r['resume_at_trap']:08x}' if r['resume_at_trap'] is not None else None} ---"
+        )
+        for i in an["lost"]:
+            reg = base_reg + i
+            cocotb.log.error(
+                f"   LOST  x{reg} (stream #{i}, pc=0x{stream_pcs[i]:08x} < mepc): "
+                f"expected marker 0x{expected_val(i, gen):08x}, regfile=0x{r['regs'][reg]:08x} "
+                f"-- mepc advanced past this instruction but its write is missing"
+            )
+        for i in an["leaked"]:
+            reg = base_reg + i
+            cocotb.log.error(
+                f"   LEAK  x{reg} (stream #{i}, pc=0x{stream_pcs[i]:08x} >= mepc): "
+                f"regfile=0x{r['regs'][reg]:08x} == marker 0x{expected_val(i, gen):08x} "
+                f"-- committed although mepc resumes at/before it (re-execution)"
+            )
+        rc = r["racer"]
+        if rc and rc.get("valid"):
+            cocotb.log.error(
+                f"   trap-cycle committer: pc=0x{rc['pc']:08x} "
+                f"x{rc['dest_reg']}<=0x{(rc['value'] or 0):08x} -- this combinational "
+                f"commit advanced interrupt_resume_pc in the o_trap_taken cycle"
+            )
+
+    # ---- per-offset mepc table (visibility, incl. negative results) --------
+    cocotb.log.info("=== Per-offset mepc / commit summary ===")
+    for r in results:
+        an = r["an"]
+        cocotb.log.info(
+            f"  offset={r['fire_offset']:2d} "
+            f"mepc={f'0x{r['mepc']:08x}' if r['mepc'] is not None else None} "
+            f"committed={an['ncommit']} prefix={an['longest_prefix']} "
+            f"violation={an['violation']}"
+        )
+
+    n_trapped = sum(1 for r in results if not r["an"]["no_trap"])
+    cocotb.log.info(
+        f"Swept {len(results)} offsets ({n_trapped} took the trap); "
+        f"{len(violations)} violated the prefix invariant."
+    )
+
+    assert not violations, (
+        f"PRECISE-INTERRUPT BUG REPRODUCED (mode={mode}): {len(violations)}/{len(results)} "
+        f"interrupt fire-offsets violate the trap-entry prefix invariant (architectural "
+        f"regfile != instructions with PC < mepc). First failing "
+        f"offset={violations[0]['fire_offset']}, mepc=0x{violations[0]['mepc']:08x}, "
+        f"lost={violations[0]['an']['lost']}, leaked={violations[0]['an']['leaked']}. "
+        f"See per-offset log above for the exact lost/leaked register (expected vs "
+        f"actual value) and the trap-cycle committer that advanced interrupt_resume_pc."
+    )
+    cocotb.log.info(
+        f"=== mode={mode}: no violations across all fire offsets; "
+        f"trap-entry prefix invariant holds. ==="
+    )
+
+
+@cocotb.test()
+async def test_directed_interrupt_commit_race(dut: Any) -> None:
+    """Deterministic precise-interrupt repro (ALU stream): sweep an async M-timer interrupt.
+
+    Sweep cycle-by-cycle across a register-writing ALU stream and check that,
+    at trap entry, the architectural regfile reflects exactly the instructions
+    with PC < mepc (precise-state prefix invariant).
+    """
+    await run_directed_interrupt_commit_race_test(dut, mode="alu")
+
+
+@cocotb.test()
+async def test_directed_interrupt_commit_race_loads(dut: Any) -> None:
+    """Deterministic precise-interrupt repro (LOAD stream): same cycle-exact interrupt sweep.
+
+    The stream is `lw` instructions whose results come from
+    the load queue / data memory -- mirroring the Linux symptom (a lost
+    callee-saved load restore, s2 = 0x19999998).
+    """
+    await run_directed_interrupt_commit_race_test(dut, mode="load")
diff --git a/verif/cocotb_tests/test_helpers.py b/verif/cocotb_tests/test_helpers.py
index 8b737667..1248d214 100644
--- a/verif/cocotb_tests/test_helpers.py
+++ b/verif/cocotb_tests/test_helpers.py
@@ -228,12 +228,39 @@ def _get_regfile_ram(self, ram_index: int = 0) -> Any:
             path = self.paths.regfile_ram_rs2_path
         return self._navigate_signal_path(path)
 
+    # Number of read ports on the architectural integer register file
+    # (generic_regfile in the current cpu_ooo).
+    _INT_RF_READ_PORTS = 8
+
+    def _int_regfile_inst(self) -> Any | None:
+        """Return the architectural integer register-file instance for the cpu_ooo DUT.
+
+        Returns None when the hierarchy does not expose it (other toplevels).
+        """
+        try:
+            return self.dut.device_under_test.ooo_register_files_inst.regfile_inst
+        except Exception:
+            return None
+
+    def _int_read_port_ram(self, regfile_inst: Any, port: int) -> tuple[Any, bool]:
+        """Return (read_port_ram_handle, is_multi_write) for one read port.
+
+        generic_regfile gives each read port its own RAM: a multi-write banked
+        RAM (mwp_dist_ram, under gen_multi_write) when there are 2+ write ports,
+        otherwise a single-write sdp_dist_ram (under gen_single_write).
+        """
+        rp = regfile_inst.gen_read_port[port]
+        try:
+            return rp.gen_multi_write.read_port_ram, True
+        except Exception:
+            return rp.gen_single_write.read_port_ram, False
+
     def read_register(self, reg: int, ram_index: int = 0) -> int:
-        """Read register value from hardware.
+        """Read an architectural integer register value from hardware.
 
         Args:
             reg: Register index (0-31)
-            ram_index: Which RAM instance to read from (0=rs1, 1=rs2)
+            ram_index: Legacy RAM-instance selector (only used by the fallback)
 
         Returns:
             Register value
@@ -241,23 +268,50 @@ def read_register(self, reg: int, ram_index: int = 0) -> int:
         HardwareAssertions.assert_register_valid(reg)
         if reg == 0:
             return 0
+        regfile_inst = self._int_regfile_inst()
+        if regfile_inst is not None:
+            ram, multi = self._int_read_port_ram(regfile_inst, 0)
+            if multi:
+                # Committed value = the bank chosen by the live-value table.
+                sel = int(ram.lvt[reg].value)
+                return int(ram.g_banks[sel].u_bank.ram[reg].value)
+            return int(ram.ram[reg].value)
+        # Fallback: legacy flat regfile RAM via the configured signal path.
         ram = self._get_regfile_ram(ram_index)
         return int(ram[reg].value)
 
     def write_register(self, reg: int, value: int) -> None:
-        """Write register value to hardware (both RAM instances).
+        """Deposit an architectural integer register value into hardware.
 
         Args:
             reg: Register index (0-31)
             value: Value to write
         """
         HardwareAssertions.assert_register_valid(reg)
-        if reg > 0:  # x0 is always zero
-            # Write to both RAM instances for consistency
-            ram_rs1 = self._get_regfile_ram(0)
-            ram_rs2 = self._get_regfile_ram(1)
-            ram_rs1[reg].value = value
-            ram_rs2[reg].value = value
+        if reg == 0:  # x0 is always zero
+            return
+        regfile_inst = self._int_regfile_inst()
+        if regfile_inst is not None:
+            # Deposit into every read port (and, for the banked RAM, both banks
+            # with the live-value table cleared) so all dispatch read ports and
+            # the snapshot read return the deposited value.
+            for port in range(self._INT_RF_READ_PORTS):
+                try:
+                    ram, multi = self._int_read_port_ram(regfile_inst, port)
+                except Exception:
+                    break
+                if multi:
+                    ram.g_banks[0].u_bank.ram[reg].value = value
+                    ram.g_banks[1].u_bank.ram[reg].value = value
+                    ram.lvt[reg].value = 0
+                else:
+                    ram.ram[reg].value = value
+            return
+        # Fallback: legacy flat regfile RAM (rs1 + rs2 instances).
+        ram_rs1 = self._get_regfile_ram(0)
+        ram_rs2 = self._get_regfile_ram(1)
+        ram_rs1[reg].value = value
+        ram_rs2[reg].value = value
 
     def initialize_registers(self, seed_value: int | None = None) -> list[int]:
         """Initialize all registers randomly and return the values."""
diff --git a/verif/cocotb_tests/test_real_program.py b/verif/cocotb_tests/test_real_program.py
index dd2611aa..b7294816 100644
--- a/verif/cocotb_tests/test_real_program.py
+++ b/verif/cocotb_tests/test_real_program.py
@@ -32,7 +32,7 @@
 from collections import Counter
 import cocotb
 from cocotb.clock import Clock
-from cocotb.triggers import RisingEdge, Timer
+from cocotb.triggers import FallingEdge, RisingEdge, Timer
 from typing import Any
 
 CLK_PERIOD_NS = 3
@@ -184,6 +184,23 @@ async def generate_divided_clock(dut: Any) -> None:
 # sprintf_test needs more cycles due to ~200 test cases with heavy FP formatting on RV32
 SPRINTF_TEST_MAX_CYCLES = 2000000
 
+# pde_return_hazard runs PDE_VIS_ITERATIONS(16) x 5 lookups x 2 variants (one with
+# cache churn) plus the s2l sub-tests; the full pass takes ~1.12M cycles (it used to
+# bail early on the unreachable-"maps" tree bug, masking the real budget).
+PDE_RETURN_HAZARD_MAX_CYCLES = 2000000
+
+# wfi_lost_tick sweeps ITERS(3000) idle/WFI iterations, each taking exactly one
+# deferred timer trap. On the bram axis the whole sweep finishes in ~345k cycles,
+# but on the ddr axis (FROST_COCOTB_MEM_CONFIG=ddr) the .text + g_jiffies live in
+# DDR: a ~70k-cycle cold-boot I-cache fill plus a slightly slower per-tick round
+# trip push the 3000-tick sweep just past the 500k default cap (timeout, not a
+# lost tick -- the tick rate stays flat to the end). Give it room like the other
+# legitimately-long tests rather than shrinking the phase-sweep coverage.
+WFI_LOST_TICK_MAX_CYCLES = 800000
+
+# No-MMU Linux boot: reaching the kernel banner takes millions of cycles.
+LINUX_BOOT_MAX_CYCLES = int(os.environ.get("COCOTB_LINUX_MAX_CYCLES", 20000000))
+
 # Number of clock cycles to hold reset between runs
 RESET_CYCLES = 10
 
@@ -280,6 +297,215 @@ def _read_bool(signal: Any) -> bool | None:
     return bool(value)
 
 
+async def wedge_monitor(dut: Any, uart_monitor: "UartMonitor | None") -> None:
+    """Observe a trap/MRET deadlock wedge by sampling ground-truth signals.
+
+    Enabled with FROST_WEDGE_MONITOR=1. Samples the trap/MRET/flush/IRQ/store-
+    drain state every clock and emits an aggregated snapshot every
+    FROST_WEDGE_DUMP_INTERVAL cycles (default 2000). It also raises a one-shot
+    "STALL DETECTED" banner once UART output stops advancing for
+    FROST_WEDGE_STALL_CYCLES cycles (default 20000) and then emits up to
+    FROST_WEDGE_POST_STALL_DUMPS (default 16) full snapshots before it stops
+    logging (the simulation keeps running to the cycle cap).
+
+    Every tap is None-safe: signals that do not resolve are reported once in the
+    "missing_taps" list and counted as 0. This is pure instrumentation -- it
+    drives no signals and changes no behaviour.
+    """
+    dump_interval = int(os.environ.get("FROST_WEDGE_DUMP_INTERVAL", "2000"))
+    stall_cycles = int(os.environ.get("FROST_WEDGE_STALL_CYCLES", "20000"))
+    post_stall_dump_limit = int(os.environ.get("FROST_WEDGE_POST_STALL_DUMPS", "16"))
+
+    def g(path: str) -> Any:
+        return _get_signal(dut, path)
+
+    cpu = "cpu_and_memory_subsystem.cpu_inst"
+    mem = "cpu_and_memory_subsystem"
+
+    # 1-bit signals: aggregated as cycles-high + 0->1 edge counts per interval.
+    bool_sig = {
+        "trap_taken": g(f"{cpu}.trap_taken"),
+        "trap_taken_reg": _first_signal(
+            dut, [f"{cpu}.trap_taken_reg", f"{cpu}.dbg_trap_taken_q"]
+        ),
+        "mret_taken": g(f"{cpu}.mret_taken"),
+        "mret_taken_reg": g(f"{cpu}.mret_taken_reg"),
+        "flush_all": g(f"{cpu}.flush_all"),
+        "flush_en": g(f"{cpu}.flush_en"),
+        "trap_pending": g(f"{cpu}.trap_pending"),
+        "mret_start": g(f"{cpu}.mret_start"),
+        "trap_drain_wait": g(f"{cpu}.trap_drain_wait"),
+        "sq_committed_empty": g(f"{cpu}.sq_committed_empty"),
+        "commit_hold_q": g(f"{cpu}.trap_mret_commit_hold_q"),
+        "mispredict_recovery_pending": g(f"{cpu}.mispredict_recovery_pending"),
+        "interrupt_pending": g(f"{cpu}.interrupt_pending"),
+        "csr_mstatus_mie_direct": g(f"{cpu}.csr_mstatus_mie_direct"),
+        "rob_head_is_wfi": g(f"{cpu}.rob_head_is_wfi"),
+        "head_valid": g(f"{cpu}.head_valid"),
+        "dbg_commit_valid": g(f"{cpu}.dbg_commit_valid"),
+        # CLINT / store-drain taps live in the cpu_and_mem parent scope.
+        "mtip_registered": g(f"{mem}.mtip_registered"),
+        "mtip_comparison": g(f"{mem}.mtip_comparison"),
+        "mtimecmp_write_pulse": g(f"{mem}.mtimecmp_write_pulse"),
+        "cached_write_inflight": g(f"{mem}.data_memory_cached_write_inflight"),
+    }
+    # Load-address taps: prove which address the spin-loop load targets
+    # (decisive for distinguishing a clobbered base register from a lost store).
+    mem_addr_sig = g(f"{cpu}.o_data_mem_addr")
+    mem_rd_en_sig = g(f"{cpu}.o_data_mem_read_enable")
+    mem_cached_rd_en_sig = g(f"{cpu}.o_data_mem_cached_read_enable")
+
+    # Multi-bit signals: sampled at dump time (steady-state snapshot value).
+    val_sig = {
+        "dbg_commit_pc": g(f"{cpu}.dbg_commit_pc"),
+        "head_pc": g(f"{cpu}.u_tomasulo.u_rob.head_pc"),
+        "head_idx": g(f"{cpu}.u_tomasulo.u_rob.head_idx"),
+        "sq_count": g(f"{cpu}.sq_count"),
+        "rob_count": g(f"{cpu}.rob_count"),
+        "csr_priv": g(f"{cpu}.csr_priv"),
+        "csr_mstatus": g(f"{cpu}.csr_mstatus"),
+        "csr_mie": g(f"{cpu}.csr_mie"),
+        "csr_mepc": g(f"{cpu}.csr_mepc"),
+        "resume_pc": _first_signal(
+            dut,
+            [f"{cpu}.dbg_interrupt_resume_pc", f"{cpu}.interrupt_resume_pc"],
+        ),
+        "o_pc": g(f"{cpu}.o_pc"),
+        "mtime": g(f"{mem}.mtime"),
+        "mtimecmp": g(f"{mem}.mtimecmp"),
+    }
+
+    missing = sorted(k for k, v in {**bool_sig, **val_sig}.items() if v is None)
+    cocotb.log.info(
+        f"WEDGE monitor armed: dump_interval={dump_interval} "
+        f"stall_cycles={stall_cycles} post_stall_dumps={post_stall_dump_limit} "
+        f"missing_taps={missing}"
+    )
+
+    bool_keys = list(bool_sig.keys())
+    hi = {k: 0 for k in bool_keys}
+    edges = {k: 0 for k in bool_keys}
+    prev = {k: 0 for k in bool_keys}
+    head_pc_ctr: Counter[int] = Counter()
+    commit_pc_ctr: Counter[int] = Counter()
+    read_addr_ctr: Counter[int] = Counter()
+    commit_count = 0
+    interval_start = 0
+
+    def hx(name: str) -> str:
+        v = _read_int(val_sig.get(name))
+        return "None" if v is None else f"0x{v:08x}"
+
+    def hx64(name: str) -> str:
+        v = _read_int(val_sig.get(name))
+        return "None" if v is None else f"0x{v:016x}"
+
+    def iv(name: str) -> str:
+        v = _read_int(val_sig.get(name))
+        return "None" if v is None else str(v)
+
+    def topn(ctr: Counter[int], n: int = 5) -> str:
+        if not ctr:
+            return "{}"
+        return "{" + ", ".join(f"0x{pc:08x}:{c}" for pc, c in ctr.most_common(n)) + "}"
+
+    def emit(mc: int, length: int, stalled: bool, uart_len: int) -> None:
+        committed_pending = length - hi["sq_committed_empty"]
+        cocotb.log.info(
+            f"WEDGE mc={mc} ilen={length} stalled={stalled} uart_len={uart_len}\n"
+            f"  COMMIT: valid_cyc={hi['dbg_commit_valid']} commits={commit_count} "
+            f"distinct_pc={len(commit_pc_ctr)} top={topn(commit_pc_ctr)} o_pc={hx('o_pc')}\n"
+            f"  HEAD: head_valid_cyc={hi['head_valid']} distinct_head_pc={len(head_pc_ctr)} "
+            f"top={topn(head_pc_ctr)} head_idx={iv('head_idx')} wfi_cyc={hi['rob_head_is_wfi']} "
+            f"rob_count={iv('rob_count')} sq_count={iv('sq_count')}\n"
+            f"  LOAD: read_addrs={topn(read_addr_ctr)}\n"
+            f"  FLUSH: flush_all_hi={hi['flush_all']} flush_en_hi={hi['flush_en']} "
+            f"trap_taken(hi={hi['trap_taken']},edges={edges['trap_taken']}) "
+            f"trap_taken_reg_hi={hi['trap_taken_reg']} "
+            f"mret_taken(hi={hi['mret_taken']},edges={edges['mret_taken']}) "
+            f"mret_taken_reg_hi={hi['mret_taken_reg']} "
+            f"mispred_recov_hi={hi['mispredict_recovery_pending']}\n"
+            f"  GATE: trap_pending_hi={hi['trap_pending']} mret_start_hi={hi['mret_start']} "
+            f"drain_wait_hi={hi['trap_drain_wait']} commit_hold_hi={hi['commit_hold_q']} "
+            f"sq_committed_pending_cyc={committed_pending} "
+            f"cached_inflight_hi={hi['cached_write_inflight']}\n"
+            f"  IRQ: int_pending_hi={hi['interrupt_pending']} mtip_reg_hi={hi['mtip_registered']} "
+            f"mtip_cmp_hi={hi['mtip_comparison']} mtimecmp_writes={hi['mtimecmp_write_pulse']} "
+            f"mtime={hx64('mtime')} mtimecmp={hx64('mtimecmp')}\n"
+            f"  CSR: priv={iv('csr_priv')} mstatus={hx('csr_mstatus')} mie={hx('csr_mie')} "
+            f"mstatus_mie_hi={hi['csr_mstatus_mie_direct']} mepc={hx('csr_mepc')} "
+            f"resume_pc={hx('resume_pc')}"
+        )
+
+    def reset_interval(mc: int) -> None:
+        nonlocal commit_count, interval_start
+        for k in bool_keys:
+            hi[k] = 0
+            edges[k] = 0
+        head_pc_ctr.clear()
+        commit_pc_ctr.clear()
+        read_addr_ctr.clear()
+        commit_count = 0
+        interval_start = mc
+
+    mc = 0
+    last_uart_len = 0
+    last_uart_change = 0
+    stall_announced = False
+    post_stall_dumps = 0
+
+    while True:
+        await RisingEdge(dut.i_clk)
+        mc += 1
+        for k in bool_keys:
+            raw = _read_int(bool_sig[k])
+            v = 1 if raw else 0
+            hi[k] += v
+            if v and not prev[k]:
+                edges[k] += 1
+            prev[k] = v
+        if prev["head_valid"] and len(head_pc_ctr) < 256:
+            hp = _read_int(val_sig["head_pc"])
+            if hp is not None:
+                head_pc_ctr[hp] += 1
+        if prev["dbg_commit_valid"]:
+            cp = _read_int(val_sig["dbg_commit_pc"])
+            if cp is not None:
+                commit_count += 1
+                if len(commit_pc_ctr) < 256:
+                    commit_pc_ctr[cp] += 1
+        rd = _read_int(mem_rd_en_sig)
+        crd = _read_int(mem_cached_rd_en_sig)
+        if (rd or crd) and len(read_addr_ctr) < 256:
+            ra = _read_int(mem_addr_sig)
+            if ra is not None:
+                read_addr_ctr[ra] += 1
+
+        uart_len = len(uart_monitor.get_output()) if uart_monitor is not None else 0
+        if uart_len != last_uart_len:
+            last_uart_len = uart_len
+            last_uart_change = mc
+        stalled = (mc - last_uart_change) >= stall_cycles
+        if stalled and not stall_announced:
+            cocotb.log.info(
+                f"WEDGE STALL DETECTED at mc={mc}: no UART progress for "
+                f"{stall_cycles} cycles (uart_len={uart_len})"
+            )
+            stall_announced = True
+
+        if mc % dump_interval == 0:
+            emit(mc, mc - interval_start, stalled, uart_len)
+            if stalled:
+                post_stall_dumps += 1
+                if post_stall_dumps >= post_stall_dump_limit:
+                    cocotb.log.info(
+                        "WEDGE monitor: post-stall snapshot budget reached; "
+                        "stopping further logging (sim continues to cap)."
+                    )
+                    return
+            reset_interval(mc)
+
+
 class UartRxDriver:
     """Drive UART RX serial input to the DUT (8N1)."""
 
@@ -298,18 +524,16 @@ def _compute_bit_cycles(self) -> int:
         """Match uart_rx.sv prescaler math to compute cycles per bit.
 
         uart_rx uses CLK_FREQ_HZ/4 (since it runs on clk_div4) and computes:
-        ClockCyclesPerBit = (CLK_FREQ_HZ/4) / (BAUD_RATE * DATA_WIDTH)
-        Then it waits ClockCyclesPerBit * DATA_WIDTH cycles per bit.
+        ClockCyclesPerBit = (CLK_FREQ_HZ/4) / BAUD_RATE.
         """
         clk_freq = _read_u64(getattr(self.dut, "CLK_FREQ_HZ", None))
         if clk_freq is None:
             clk_freq = UART_CLK_FREQ_HZ_DEFAULT
         uart_clk_freq = clk_freq // 4
-        base = uart_clk_freq // (UART_BAUD_RATE * UART_DATA_BITS)
-        bit_cycles = base * UART_DATA_BITS
+        bit_cycles = uart_clk_freq // UART_BAUD_RATE
         cocotb.log.info(
             f"UartRxDriver: clk_freq={clk_freq}, uart_clk_freq={uart_clk_freq}, "
-            f"base={base}, bit_cycles={bit_cycles}"
+            f"bit_cycles={bit_cycles}"
         )
         return max(1, bit_cycles)
 
@@ -318,20 +542,26 @@ async def _wait_cycles(self, cycles: int) -> None:
         for _ in range(cycles):
             await RisingEdge(self.dut.i_clk_div4)
 
+    async def _wait_bit_edges(self, cycles: int) -> None:
+        """Wait bit-time cycles using the non-sampling edge for UART transitions."""
+        for _ in range(cycles):
+            await FallingEdge(self.dut.i_clk_div4)
+
     async def send_byte(self, value: int) -> None:
         """Send a single byte over UART RX (LSB first)."""
+        await FallingEdge(self.dut.i_clk_div4)
         # Start bit
         self.dut.i_uart_rx.value = 0
-        await self._wait_cycles(self.bit_cycles)
+        await self._wait_bit_edges(self.bit_cycles)
         # Data bits
         for bit in range(UART_DATA_BITS):
             self.dut.i_uart_rx.value = (value >> bit) & 0x1
-            await self._wait_cycles(self.bit_cycles)
+            await self._wait_bit_edges(self.bit_cycles)
         # Stop bit
         self.dut.i_uart_rx.value = 1
-        await self._wait_cycles(self.bit_cycles)
+        await self._wait_bit_edges(self.bit_cycles)
 
-    async def send(self, data: bytes) -> None:
+    async def send(self, data: bytes, inter_byte_cycles: int = 0) -> None:
         """Send a byte string over UART RX."""
         # Ensure line is idle for multiple bit times before starting
         # This gives the receiver time to sync after any glitches
@@ -339,8 +569,8 @@ async def send(self, data: bytes) -> None:
         await self._wait_cycles(self.bit_cycles * 4)
         for byte in data:
             await self.send_byte(byte)
-            # Extra idle time between characters for receiver to process
-            await self._wait_cycles(self.bit_cycles)
+            if inter_byte_cycles > 0:
+                await self._wait_cycles(inter_byte_cycles)
 
 
 async def wait_for_uart_text(
@@ -647,6 +877,21 @@ def get_expected_behavior() -> tuple[str | None, str | None, bool, str | None]:
                 if app_name == "hello_world":
                     # Just needs to print the first hello message
                     return (None, "Hello, world!", False, app_name)
+                if app_name == "linux_boot":
+                    if os.environ.get("FROST_LINUX_RUN_FULL") == "1":
+                        # Diagnostic / CI regression capture: never matches -> run
+                        # the full COCOTB_LINUX_MAX_CYCLES capturing all UART +
+                        # CLINT/retire progress. The CI linux-boot-cocotb job runs
+                        # in this mode and asserts boot health afterwards with
+                        # tests/check_linux_boot_regression.py (the ~22M window is
+                        # silent mem_init after devtmpfs, so there is no deep
+                        # console marker to match on -- progress + a serviced timer
+                        # tick are the real gremlin-regression signals).
+                        return ("<<__never_matches__>>", None, True, app_name)
+                    # Passes once the kernel reaches its boot banner. (Interim
+                    # bring-up criterion; tighten to a userspace/shell marker
+                    # once no-MMU Linux boots that far.)
+                    return (None, "Linux version", False, app_name)
                 if app_name == "uart_echo":
                     # Interactive test handled separately (UART input injection)
                     return (None, None, False, app_name)
@@ -695,6 +940,20 @@ async def run_until_complete(
         progress_interval = int(
             os.environ.get("COCOTB_COREMARK_PROGRESS_INTERVAL", 500_000)
         )
+    irq_precision_check = os.environ.get("FROST_IRQ_PRECISION_CHECK") == "1"
+    irq_precision_strict = os.environ.get("FROST_IRQ_PRECISION_STRICT") == "1"
+    irq_low_ra_assert = os.environ.get("FROST_IRQ_LOW_RA_ASSERT") == "1"
+    irq_precision_event_limit = int(
+        os.environ.get("FROST_IRQ_PRECISION_EVENT_LIMIT", "64")
+    )
+    irq_precision_events: list[str] = []
+    external_irq_symbol = os.environ.get("FROST_EXTERNAL_IRQ_SYMBOL")
+    external_irq_enabled = bool(external_irq_symbol)
+    external_irq_offset = int(os.environ.get("FROST_EXTERNAL_IRQ_OFFSET", "0"), 0)
+    external_irq_max_pulses = int(os.environ.get("FROST_EXTERNAL_IRQ_MAX_PULSES", "1"))
+    external_irq_hold_cycles = int(
+        os.environ.get("FROST_EXTERNAL_IRQ_HOLD_CYCLES", "1")
+    )
     retire_sig = None
     pc_sig = None
     pc_vld_sig = None
@@ -780,6 +1039,16 @@ async def run_until_complete(
     ras_pop_after_restore_live_sig = None
     commit_valid_live_sig = None
     commit_pc_live_sig = None
+    commit0_dest_valid_sig = None
+    commit0_dest_rf_sig = None
+    commit0_dest_reg_sig = None
+    commit0_value_sig = None
+    commit1_valid_sig = None
+    commit1_pc_sig = None
+    commit1_dest_valid_sig = None
+    commit1_dest_rf_sig = None
+    commit1_dest_reg_sig = None
+    commit1_value_sig = None
     commit_is_return_live_sig = None
     commit_is_call_live_sig = None
     commit_checkpoint_id_live_sig = None
@@ -850,6 +1119,34 @@ async def run_until_complete(
     rob_alloc_is_csr_live_sig = None
     rob_alloc_is_mret_live_sig = None
     id_instruction_live_sig = None
+    trap_taken_live_sig = None
+    trap_taken_reg_dbg_sig = None
+    trap_cause_internal_live_sig = None
+    mret_taken_live_sig = None
+    trap_target_live_sig = None
+    trap_pending_live_sig = None
+    rob_trap_pc_live_sig = None
+    trap_pc_internal_live_sig = None
+    interrupt_resume_pc_live_sig = None
+    csr_commit_fire_live_sig = None
+    csr_mepc_live_sig = None
+    flush_all_live_sig = None
+    port0_int_we_sig = None
+    port0_int_addr_sig = None
+    port0_int_data_sig = None
+    port1_int_we_sig = None
+    port1_int_addr_sig = None
+    port1_int_data_sig = None
+    rob_commit0_reg_valid_sig = None
+    rob_commit0_reg_pc_sig = None
+    rob_commit0_reg_dest_valid_sig = None
+    rob_commit0_reg_dest_rf_sig = None
+    rob_commit0_reg_dest_reg_sig = None
+    rob_commit1_reg_valid_sig = None
+    rob_commit1_reg_pc_sig = None
+    rob_commit1_reg_dest_valid_sig = None
+    rob_commit1_reg_dest_rf_sig = None
+    rob_commit1_reg_dest_reg_sig = None
     coremark_cf_debug_enabled = (
         is_coremark_like and os.environ.get("FROST_COREMARK_CF_DEBUG") == "1"
     )
@@ -888,7 +1185,7 @@ async def run_until_complete(
         control_flow_trace_label = os.environ.get(
             "FROST_CONTROL_FLOW_TRACE_LABEL", f"{app_name or 'program'} trace"
         )
-    if progress_interval:
+    if progress_interval or irq_precision_check or external_irq_enabled:
         retire_sig = _get_signal(
             dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_valid"
         )
@@ -1447,6 +1744,36 @@ async def run_until_complete(
         commit_pc_live_sig = _get_signal(
             dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_pc"
         )
+        commit0_dest_valid_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_dest_valid"
+        )
+        commit0_dest_rf_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_dest_rf"
+        )
+        commit0_dest_reg_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_dest_reg"
+        )
+        commit0_value_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_value"
+        )
+        commit1_valid_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_2_valid"
+        )
+        commit1_pc_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_2_pc"
+        )
+        commit1_dest_valid_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_2_dest_valid"
+        )
+        commit1_dest_rf_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_2_dest_rf"
+        )
+        commit1_dest_reg_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_2_dest_reg"
+        )
+        commit1_value_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_2_value"
+        )
         commit_is_return_live_sig = _get_signal(
             dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_is_return"
         )
@@ -1523,6 +1850,20 @@ async def run_until_complete(
         trap_taken_live_sig = _get_signal(
             dut, "cpu_and_memory_subsystem.cpu_inst.trap_taken"
         )
+        trap_taken_reg_dbg_sig = _first_signal(
+            dut,
+            [
+                "cpu_and_memory_subsystem.cpu_inst.dbg_trap_taken_q",
+                "cpu_and_memory_subsystem.cpu_inst.trap_taken_reg",
+            ],
+        )
+        trap_cause_internal_live_sig = _first_signal(
+            dut,
+            [
+                "cpu_and_memory_subsystem.cpu_inst.dbg_trap_cause_internal",
+                "cpu_and_memory_subsystem.cpu_inst.trap_cause_internal",
+            ],
+        )
         mret_taken_live_sig = _get_signal(
             dut, "cpu_and_memory_subsystem.cpu_inst.mret_taken"
         )
@@ -1535,6 +1876,16 @@ async def run_until_complete(
         rob_trap_pc_live_sig = _get_signal(
             dut, "cpu_and_memory_subsystem.cpu_inst.rob_trap_pc"
         )
+        trap_pc_internal_live_sig = _first_signal(
+            dut,
+            [
+                "cpu_and_memory_subsystem.cpu_inst.dbg_trap_pc_internal",
+                "cpu_and_memory_subsystem.cpu_inst.rob_trap_pc",
+            ],
+        )
+        interrupt_resume_pc_live_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_interrupt_resume_pc"
+        )
         rob_trap_cause_live_sig = _get_signal(
             dut, "cpu_and_memory_subsystem.cpu_inst.rob_trap_cause"
         )
@@ -1550,6 +1901,81 @@ async def run_until_complete(
         csr_mepc_live_sig = _get_signal(
             dut, "cpu_and_memory_subsystem.cpu_inst.csr_mepc"
         )
+        flush_all_live_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.flush_all"
+        )
+        port0_int_we_sig = _first_signal(
+            dut,
+            [
+                "cpu_and_memory_subsystem.cpu_inst.dbg_port0_int_we",
+                "cpu_and_memory_subsystem.cpu_inst.port0_int_we",
+            ],
+        )
+        port0_int_addr_sig = _first_signal(
+            dut,
+            [
+                "cpu_and_memory_subsystem.cpu_inst.dbg_port0_int_addr",
+                "cpu_and_memory_subsystem.cpu_inst.port0_int_addr",
+            ],
+        )
+        port0_int_data_sig = _first_signal(
+            dut,
+            [
+                "cpu_and_memory_subsystem.cpu_inst.dbg_port0_int_data",
+                "cpu_and_memory_subsystem.cpu_inst.port0_int_data",
+            ],
+        )
+        port1_int_we_sig = _first_signal(
+            dut,
+            [
+                "cpu_and_memory_subsystem.cpu_inst.dbg_port1_int_we",
+                "cpu_and_memory_subsystem.cpu_inst.port1_int_we",
+            ],
+        )
+        port1_int_addr_sig = _first_signal(
+            dut,
+            [
+                "cpu_and_memory_subsystem.cpu_inst.dbg_port1_int_addr",
+                "cpu_and_memory_subsystem.cpu_inst.port1_int_addr",
+            ],
+        )
+        port1_int_data_sig = _first_signal(
+            dut,
+            [
+                "cpu_and_memory_subsystem.cpu_inst.dbg_port1_int_data",
+                "cpu_and_memory_subsystem.cpu_inst.port1_int_data",
+            ],
+        )
+        rob_commit0_reg_valid_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_valid"
+        )
+        rob_commit0_reg_pc_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_pc"
+        )
+        rob_commit0_reg_dest_valid_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_dest_valid"
+        )
+        rob_commit0_reg_dest_rf_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_dest_rf"
+        )
+        rob_commit0_reg_dest_reg_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_dest_reg"
+        )
+        rob_commit1_reg_valid_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_valid"
+        )
+        rob_commit1_reg_pc_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_pc"
+        )
+        rob_commit1_reg_dest_valid_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_dest_valid"
+        )
+        rob_commit1_reg_dest_rf_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_dest_rf"
+        )
+        rob_commit1_reg_dest_reg_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_dest_reg"
+        )
         flush_pipeline_live_sig = _get_signal(
             dut, "cpu_and_memory_subsystem.cpu_inst.flush_pipeline"
         )
@@ -1645,6 +2071,112 @@ async def run_until_complete(
                     (0x353C, 0x35DC),
                 ]
 
+    if irq_precision_check:
+        trap_taken_live_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.trap_taken"
+        )
+        trap_cause_internal_live_sig = _first_signal(
+            dut,
+            [
+                "cpu_and_memory_subsystem.cpu_inst.dbg_trap_cause_internal",
+                "cpu_and_memory_subsystem.cpu_inst.trap_cause_internal",
+            ],
+        )
+        rob_trap_pc_live_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.rob_trap_pc"
+        )
+        trap_pc_internal_live_sig = _first_signal(
+            dut,
+            [
+                "cpu_and_memory_subsystem.cpu_inst.dbg_trap_pc_internal",
+                "cpu_and_memory_subsystem.cpu_inst.rob_trap_pc",
+            ],
+        )
+        interrupt_resume_pc_live_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_interrupt_resume_pc"
+        )
+        csr_commit_fire_live_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.csr_commit_fire"
+        )
+        csr_mepc_live_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.csr_mepc"
+        )
+        flush_all_live_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.flush_all"
+        )
+        port0_int_we_sig = _first_signal(
+            dut,
+            [
+                "cpu_and_memory_subsystem.cpu_inst.dbg_port0_int_we",
+                "cpu_and_memory_subsystem.cpu_inst.port0_int_we",
+            ],
+        )
+        port0_int_addr_sig = _first_signal(
+            dut,
+            [
+                "cpu_and_memory_subsystem.cpu_inst.dbg_port0_int_addr",
+                "cpu_and_memory_subsystem.cpu_inst.port0_int_addr",
+            ],
+        )
+        port0_int_data_sig = _first_signal(
+            dut,
+            [
+                "cpu_and_memory_subsystem.cpu_inst.dbg_port0_int_data",
+                "cpu_and_memory_subsystem.cpu_inst.port0_int_data",
+            ],
+        )
+        port1_int_we_sig = _first_signal(
+            dut,
+            [
+                "cpu_and_memory_subsystem.cpu_inst.dbg_port1_int_we",
+                "cpu_and_memory_subsystem.cpu_inst.port1_int_we",
+            ],
+        )
+        port1_int_addr_sig = _first_signal(
+            dut,
+            [
+                "cpu_and_memory_subsystem.cpu_inst.dbg_port1_int_addr",
+                "cpu_and_memory_subsystem.cpu_inst.port1_int_addr",
+            ],
+        )
+        port1_int_data_sig = _first_signal(
+            dut,
+            [
+                "cpu_and_memory_subsystem.cpu_inst.dbg_port1_int_data",
+                "cpu_and_memory_subsystem.cpu_inst.port1_int_data",
+            ],
+        )
+        rob_commit0_reg_valid_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_valid"
+        )
+        rob_commit0_reg_pc_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_pc"
+        )
+        rob_commit0_reg_dest_valid_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_dest_valid"
+        )
+        rob_commit0_reg_dest_rf_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_dest_rf"
+        )
+        rob_commit0_reg_dest_reg_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_dest_reg"
+        )
+        rob_commit1_reg_valid_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_valid"
+        )
+        rob_commit1_reg_pc_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_pc"
+        )
+        rob_commit1_reg_dest_valid_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_dest_valid"
+        )
+        rob_commit1_reg_dest_rf_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_dest_rf"
+        )
+        rob_commit1_reg_dest_reg_sig = _get_signal(
+            dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_dest_reg"
+        )
+
     retired_pc_hist: Counter[int] = Counter()
     retired_mispredicts = 0
     last_progress_mispredicts = 0
@@ -1675,6 +2207,9 @@ async def run_until_complete(
     last_checkpoint_in_use = None
     last_rat_a0_state = None
     last_x2_commit = None
+    last_x2_commit_pc = None
+    last_x2_raw_commit = None
+    last_x2_raw_commit_pc = None
     last_x5_commit = None
     last_x8_commit = None
     last_x9_commit = None
@@ -1685,6 +2220,37 @@ async def run_until_complete(
     control_flow_trace_enabled = True
     retire_only_trace = os.environ.get("FROST_CONTROL_FLOW_RETIRE_ONLY") == "1"
     ras_transition_trace_active = True
+    irq_precision_callee_range: tuple[int, int] | None = None
+    external_irq_range: tuple[int, int] | None = None
+    external_irq_active = False
+    external_irq_hold_remaining = 0
+    external_irq_pulses = 0
+    external_irq_armed = True
+    if irq_precision_check:
+        irq_callee_symbol = os.environ.get(
+            "FROST_IRQ_CALLEE_SYMBOL", "irq_stack_slot_callee"
+        )
+        irq_symbol_ranges = _load_symbol_ranges([irq_callee_symbol], app_name)
+        irq_precision_callee_range = irq_symbol_ranges.get(irq_callee_symbol)
+        if irq_precision_callee_range is not None:
+            lo, hi = irq_precision_callee_range
+            cocotb.log.info(
+                f"IRQ precision callee window {irq_callee_symbol}: "
+                f"[0x{lo:08x}, 0x{hi:08x})"
+            )
+    if external_irq_enabled and external_irq_symbol is not None:
+        external_symbol_ranges = _load_symbol_ranges([external_irq_symbol], app_name)
+        external_irq_range = external_symbol_ranges.get(external_irq_symbol)
+        if external_irq_range is None:
+            raise AssertionError(
+                f"FROST_EXTERNAL_IRQ_SYMBOL={external_irq_symbol!r} not found"
+            )
+        lo, hi = external_irq_range
+        cocotb.log.info(
+            f"External IRQ injector armed for {external_irq_symbol}: "
+            f"[0x{lo:08x}, 0x{hi:08x}) offset=0x{external_irq_offset:x} "
+            f"max_pulses={external_irq_max_pulses}"
+        )
 
     def in_trace_window(pc: int | None) -> bool:
         if pc is None or control_flow_trace_ranges is None:
@@ -1704,6 +2270,25 @@ def coremark_symbol_name_for_pc(pc: int | None) -> str:
                 return symbol_name
         return "-"
 
+    def commit_writes_x1_x2_at_pc(
+        valid_sig: Any | None,
+        pc_sig: Any | None,
+        dest_valid_sig: Any | None,
+        dest_rf_sig: Any | None,
+        dest_reg_sig: Any | None,
+        trap_pc: int | None,
+    ) -> bool:
+        if trap_pc is None:
+            return False
+        dest_reg = _read_int(dest_reg_sig)
+        return (
+            bool(_read_bool(valid_sig))
+            and bool(_read_bool(dest_valid_sig))
+            and not bool(_read_bool(dest_rf_sig))
+            and dest_reg in {1, 2}
+            and _read_int(pc_sig) == trap_pc
+        )
+
     def format_coremark_if_mismatch(
         *,
         stage: str,
@@ -1744,6 +2329,263 @@ def dump_coremark_retire_trace() -> None:
 
     for cycle in range(max_cycles):
         await RisingEdge(dut.i_clk)
+        if external_irq_enabled and hasattr(dut, "i_external_interrupt"):
+            if external_irq_active:
+                if external_irq_hold_remaining > 0:
+                    external_irq_hold_remaining -= 1
+                if trap_taken_live_sig is not None and bool(
+                    _read_bool(trap_taken_live_sig)
+                ):
+                    external_irq_hold_remaining = 0
+                if external_irq_hold_remaining == 0:
+                    dut.i_external_interrupt.value = 0
+                    external_irq_active = False
+                    external_irq_armed = False
+
+            if (
+                not external_irq_active
+                and external_irq_armed
+                and external_irq_range is not None
+                and external_irq_pulses < external_irq_max_pulses
+            ):
+                retire_valid = bool(_read_bool(retire_sig))
+                retire_pc = _read_int(retire_pc_sig)
+                lo, hi = external_irq_range
+                trigger_pc = lo + external_irq_offset
+                if (
+                    retire_valid
+                    and retire_pc is not None
+                    and trigger_pc <= retire_pc < hi
+                ):
+                    dut.i_external_interrupt.value = 1
+                    external_irq_active = True
+                    external_irq_hold_remaining = max(1, external_irq_hold_cycles)
+                    external_irq_pulses += 1
+                    cocotb.log.info(
+                        f"External IRQ pulse {external_irq_pulses} at "
+                        f"cycle={cycle + 1} retire_pc=0x{retire_pc:08x}"
+                    )
+
+            if (
+                not external_irq_armed
+                and external_irq_range is not None
+                and external_irq_pulses < external_irq_max_pulses
+            ):
+                retire_pc = _read_int(retire_pc_sig)
+                lo, hi = external_irq_range
+                if retire_pc is None or not (lo <= retire_pc < hi):
+                    external_irq_armed = True
+
+        if irq_precision_check:
+            raw_x2_events = []
+            for (
+                valid_sig,
+                pc_sig,
+                dest_valid_sig,
+                dest_rf_sig,
+                dest_reg_sig,
+                value_sig,
+            ) in (
+                (
+                    commit_valid_live_sig,
+                    commit_pc_live_sig,
+                    commit0_dest_valid_sig,
+                    commit0_dest_rf_sig,
+                    commit0_dest_reg_sig,
+                    commit0_value_sig,
+                ),
+                (
+                    commit1_valid_sig,
+                    commit1_pc_sig,
+                    commit1_dest_valid_sig,
+                    commit1_dest_rf_sig,
+                    commit1_dest_reg_sig,
+                    commit1_value_sig,
+                ),
+            ):
+                if (
+                    bool(_read_bool(valid_sig))
+                    and bool(_read_bool(dest_valid_sig))
+                    and not bool(_read_bool(dest_rf_sig))
+                    and _read_int(dest_reg_sig) == 2
+                ):
+                    value = _read_int(value_sig)
+                    pc = _read_int(pc_sig)
+                    last_x2_raw_commit = value
+                    last_x2_raw_commit_pc = pc
+                    raw_x2_events.append(f"0x{(value or 0):08x}@0x{(pc or 0):08x}")
+
+            current_x2_commit = last_x2_commit
+            current_x2_commit_pc = last_x2_commit_pc
+            wb_x2_events = []
+            for port_name, we_sig, addr_sig, data_sig, pc_sig in (
+                (
+                    "p0",
+                    port0_int_we_sig,
+                    port0_int_addr_sig,
+                    port0_int_data_sig,
+                    rob_commit0_reg_pc_sig,
+                ),
+                (
+                    "p1",
+                    port1_int_we_sig,
+                    port1_int_addr_sig,
+                    port1_int_data_sig,
+                    rob_commit1_reg_pc_sig,
+                ),
+            ):
+                if bool(_read_bool(we_sig)) and _read_int(addr_sig) == 2:
+                    value = _read_int(data_sig)
+                    pc = _read_int(pc_sig)
+                    current_x2_commit = value
+                    current_x2_commit_pc = pc
+                    wb_x2_events.append(
+                        f"{port_name}=0x{(value or 0):08x}@0x{(pc or 0):08x}"
+                    )
+
+            trap = bool(_read_bool(trap_taken_live_sig))
+            trap_q = bool(_read_bool(trap_taken_reg_dbg_sig))
+            flush_all = bool(_read_bool(flush_all_live_sig))
+            trap_cause = _read_int(trap_cause_internal_live_sig)
+            is_irq = bool((trap_cause or 0) & 0x8000_0000)
+            trap_pc = _read_int(trap_pc_internal_live_sig)
+            rob_trap_pc = _read_int(rob_trap_pc_live_sig)
+            interrupt_resume_pc = _read_int(interrupt_resume_pc_live_sig)
+            c0_valid = bool(_read_bool(commit_valid_live_sig))
+            c1_valid = bool(_read_bool(commit1_valid_sig))
+            c0_pc = _read_int(commit_pc_live_sig)
+            c1_pc = _read_int(commit1_pc_sig)
+            reg0_sensitive = commit_writes_x1_x2_at_pc(
+                rob_commit0_reg_valid_sig,
+                rob_commit0_reg_pc_sig,
+                rob_commit0_reg_dest_valid_sig,
+                rob_commit0_reg_dest_rf_sig,
+                rob_commit0_reg_dest_reg_sig,
+                trap_pc,
+            )
+            reg1_sensitive = commit_writes_x1_x2_at_pc(
+                rob_commit1_reg_valid_sig,
+                rob_commit1_reg_pc_sig,
+                rob_commit1_reg_dest_valid_sig,
+                rob_commit1_reg_dest_rf_sig,
+                rob_commit1_reg_dest_reg_sig,
+                trap_pc,
+            )
+            raw0_sensitive = commit_writes_x1_x2_at_pc(
+                commit_valid_live_sig,
+                commit_pc_live_sig,
+                commit0_dest_valid_sig,
+                commit0_dest_rf_sig,
+                commit0_dest_reg_sig,
+                trap_pc,
+            )
+            raw1_sensitive = commit_writes_x1_x2_at_pc(
+                commit1_valid_sig,
+                commit1_pc_sig,
+                commit1_dest_valid_sig,
+                commit1_dest_rf_sig,
+                commit1_dest_reg_sig,
+                trap_pc,
+            )
+
+            stale_sp_body = False
+            if trap and is_irq and trap_pc is not None and irq_precision_callee_range:
+                callee_lo, callee_hi = irq_precision_callee_range
+                x2_from_callee = (
+                    current_x2_commit_pc is not None
+                    and callee_lo <= current_x2_commit_pc < callee_hi
+                )
+                stale_sp_body = (
+                    callee_lo + 4 <= trap_pc < callee_hi and not x2_from_callee
+                )
+
+            if trap and is_irq:
+                event = (
+                    f"IRQ precision event cycle={cycle + 1} "
+                    f"cause=0x{(trap_cause or 0):08x} trap_pc=0x{(trap_pc or 0):08x} "
+                    f"rob_pc=0x{(rob_trap_pc or 0):08x} "
+                    f"resume_pc=0x{(interrupt_resume_pc or 0):08x} "
+                    f"c0={int(c0_valid)} pc0=0x{(c0_pc or 0):08x} "
+                    f"rd0={_read_int(commit0_dest_reg_sig)} "
+                    f"c1={int(c1_valid)} pc1=0x{(c1_pc or 0):08x} "
+                    f"rd1={_read_int(commit1_dest_reg_sig)} "
+                    f"p0we={int(bool(_read_bool(port0_int_we_sig)))} "
+                    f"p0a={_read_int(port0_int_addr_sig)} "
+                    f"p0d=0x{(_read_int(port0_int_data_sig) or 0):08x} "
+                    f"p0pc=0x{(_read_int(rob_commit0_reg_pc_sig) or 0):08x} "
+                    f"p1we={int(bool(_read_bool(port1_int_we_sig)))} "
+                    f"p1a={_read_int(port1_int_addr_sig)} "
+                    f"p1d=0x{(_read_int(port1_int_data_sig) or 0):08x} "
+                    f"p1pc=0x{(_read_int(rob_commit1_reg_pc_sig) or 0):08x} "
+                    f"csr_fire={int(bool(_read_bool(csr_commit_fire_live_sig)))} "
+                    f"trap_q={int(trap_q)} flush_all={int(flush_all)} "
+                    f"mepc=0x{(_read_int(csr_mepc_live_sig) or 0):08x} "
+                    f"last_x2_arch=0x{(current_x2_commit or 0):08x} "
+                    f"last_x2_arch_pc=0x{(current_x2_commit_pc or 0):08x} "
+                    f"last_x2_raw=0x{(last_x2_raw_commit or 0):08x} "
+                    f"last_x2_raw_pc=0x{(last_x2_raw_commit_pc or 0):08x} "
+                    f"raw_x2_now={','.join(raw_x2_events) or '-'} "
+                    f"wb_x2_now={','.join(wb_x2_events) or '-'}"
+                )
+                if len(irq_precision_events) < irq_precision_event_limit:
+                    irq_precision_events.append(event)
+                    cocotb.log.info(event)
+
+                raw_commit_collision = c0_valid or c1_valid
+                sensitive_pc_write = (
+                    raw0_sensitive or raw1_sensitive or reg0_sensitive or reg1_sensitive
+                )
+                if irq_precision_strict and (
+                    raw_commit_collision or sensitive_pc_write or stale_sp_body
+                ):
+                    raise AssertionError(
+                        "IRQ precision violation: "
+                        f"raw_commit={raw_commit_collision} "
+                        f"x1_x2_same_pc={sensitive_pc_write} "
+                        f"stale_sp_body={stale_sp_body}; {event}"
+                    )
+
+            low_ra_events = []
+            for port_name, we_sig, addr_sig, data_sig in (
+                ("p0", port0_int_we_sig, port0_int_addr_sig, port0_int_data_sig),
+                ("p1", port1_int_we_sig, port1_int_addr_sig, port1_int_data_sig),
+            ):
+                data_value = _read_int(data_sig)
+                if (
+                    bool(_read_bool(we_sig))
+                    and _read_int(addr_sig) == 1
+                    and data_value is not None
+                    and data_value < 0x1000
+                ):
+                    low_ra_events.append(f"{port_name}=0x{data_value:08x}")
+            if irq_low_ra_assert and low_ra_events:
+                raise AssertionError(
+                    "Low RA writeback under IRQ monitor: "
+                    f"cycle={cycle + 1} {' '.join(low_ra_events)} "
+                    f"trap={int(trap)} irq={int(is_irq)} "
+                    f"cause=0x{(trap_cause or 0):08x} "
+                    f"trap_pc=0x{(trap_pc or 0):08x} "
+                    f"mepc=0x{(_read_int(csr_mepc_live_sig) or 0):08x}"
+                )
+
+        for we_sig, addr_sig, data_sig, pc_sig in (
+            (
+                port0_int_we_sig,
+                port0_int_addr_sig,
+                port0_int_data_sig,
+                rob_commit0_reg_pc_sig,
+            ),
+            (
+                port1_int_we_sig,
+                port1_int_addr_sig,
+                port1_int_data_sig,
+                rob_commit1_reg_pc_sig,
+            ),
+        ):
+            if bool(_read_bool(we_sig)) and _read_int(addr_sig) == 2:
+                last_x2_commit = _read_int(data_sig)
+                last_x2_commit_pc = _read_int(pc_sig)
+
         if _read_bool(int_rf_write_enable_sig):
             commit_addr = _read_int(int_rf_write_addr_sig)
             commit_data = _read_int(int_rf_write_data_sig)
@@ -2457,6 +3299,14 @@ def dump_coremark_retire_trace() -> None:
                 f"lq_mem_outstanding={lq_mem_outstanding}"
                 f"{cf_debug_suffix}"
             )
+            cocotb.log.info(
+                f"Run {run_number} CLINT/serial: cycle={cycle + 1} "
+                f"mtime=0x{(_read_u64(_get_signal(dut, 'cpu_and_memory_subsystem.mtime')) or 0):016x} "
+                f"mtimecmp=0x{(_read_u64(_get_signal(dut, 'cpu_and_memory_subsystem.mtimecmp')) or 0):016x} "
+                f"mtip={_read_bool(_get_signal(dut, 'cpu_and_memory_subsystem.mtip_registered'))} "
+                f"priv={_read_int(_get_signal(dut, 'cpu_and_memory_subsystem.cpu_inst.csr_priv'))} "
+                f"mstatus=0x{(_read_int(_get_signal(dut, 'cpu_and_memory_subsystem.cpu_inst.csr_mstatus')) or 0):08x}"
+            )
             last_progress_retired = retired_count
             last_progress_mispredicts = retired_mispredicts
 
@@ -2684,6 +3534,12 @@ async def test_real_program(dut: Any) -> None:
         max_cycles = COREMARK_MAX_CYCLES
     elif app_name == "sprintf_test":
         max_cycles = SPRINTF_TEST_MAX_CYCLES
+    elif app_name == "pde_return_hazard":
+        max_cycles = PDE_RETURN_HAZARD_MAX_CYCLES
+    elif app_name == "wfi_lost_tick":
+        max_cycles = WFI_LOST_TICK_MAX_CYCLES
+    elif app_name == "linux_boot":
+        max_cycles = LINUX_BOOT_MAX_CYCLES
     else:
         max_cycles = MAX_CYCLES
 
@@ -2696,6 +3552,11 @@ async def test_real_program(dut: Any) -> None:
     # Start UART monitor (runs throughout both program executions)
     uart_monitor = UartMonitor(dut)
     await uart_monitor.start()
+
+    # Optional trap/MRET deadlock wedge observer (pure instrumentation).
+    if os.environ.get("FROST_WEDGE_MONITOR") == "1":
+        cocotb.start_soon(wedge_monitor(dut, uart_monitor))
+
     uart_driver = None
     debug_monitor = None
     if app_name == "uart_echo":
@@ -2711,6 +3572,8 @@ async def test_real_program(dut: Any) -> None:
             dut.i_rst_n.value = 0
             if hasattr(dut, "i_uart_rx"):
                 dut.i_uart_rx.value = 1
+            if hasattr(dut, "i_external_interrupt"):
+                dut.i_external_interrupt.value = 0
             for _ in range(RESET_CYCLES):
                 await RisingEdge(dut.i_clk)
             dut.i_rst_n.value = 1
@@ -2720,6 +3583,8 @@ async def test_real_program(dut: Any) -> None:
             dut.i_rst_n.value = 0
             if hasattr(dut, "i_uart_rx"):
                 dut.i_uart_rx.value = 1
+            if hasattr(dut, "i_external_interrupt"):
+                dut.i_external_interrupt.value = 0
             await Timer(2 * CLK_PERIOD_NS, unit="ns")
             await RisingEdge(dut.i_clk)
             dut.i_rst_n.value = 1
diff --git a/verif/cocotb_tests/tomasulo/load_queue/test_load_queue.py b/verif/cocotb_tests/tomasulo/load_queue/test_load_queue.py
index 4afd39b9..93920605 100644
--- a/verif/cocotb_tests/tomasulo/load_queue/test_load_queue.py
+++ b/verif/cocotb_tests/tomasulo/load_queue/test_load_queue.py
@@ -40,6 +40,7 @@
 
 CLOCK_PERIOD_NS = 10
 LQ_DEPTH = 8
+AMO_RESCUE_THRESHOLD = 16384
 
 
 async def setup(dut: Any) -> tuple[LQInterface, LQModel]:
@@ -1309,6 +1310,68 @@ async def test_cached_response_after_invalidate_does_not_refill_l0(dut: Any) ->
     await accept_fu_complete(dut_if)
 
 
+@cocotb.test()
+async def test_cached_response_during_flush_all_does_not_refill_l0(dut: Any) -> None:
+    """A cached-tier response coincident with flush_all must be drained only."""
+    dut_if, model = await setup(dut)
+
+    addr = 0x8000_0300
+    stale_word = 0x0000_0CC0
+    fresh_word = 0xA5A5_5A5A
+
+    dut_if.drive_sq_empty(True)
+
+    # Launch a cached-region load and delay its response until full flush.
+    await alloc_and_addr(dut_if, model, rob_tag=1, address=addr)
+    dut_if.drive_sq_all_older_known(True)
+    dut_if.drive_sq_forward(match=False, can_forward=False)
+
+    mem_req = await wait_for_mem_request(dut_if)
+    assert mem_req["en"], "Expected cached load to issue"
+    assert mem_req["addr"] == addr
+    await dut_if.step()
+
+    # The response arrives in the same cycle as trap/MRET-style full flush.
+    # It must not complete the killed load and must not refill the persistent L0.
+    dut_if.drive_flush_all()
+    model.flush_all()
+    dut_if.drive_mem_response(stale_word)
+    await Timer(1, unit="ns")
+    assert not bool(dut.o_l0_fill.value), "Full-flush response filled L0"
+    await dut_if.step()
+    dut_if.clear_flush_all()
+    dut_if.clear_mem_response()
+
+    assert dut_if.empty, "Full flush should clear the LQ"
+    assert not (await wait_for_fu_complete(dut_if, max_cycles=1)).valid
+
+    # A later load to the same word must miss L0 and fetch the fresh value.
+    await alloc_and_addr(dut_if, model, rob_tag=2, address=addr)
+    dut_if.drive_sq_all_older_known(True)
+    dut_if.drive_sq_forward(match=False, can_forward=False)
+    await Timer(1, unit="ns")
+
+    assert not bool(dut.o_l0_hit.value), "Flushed response left a stale L0 hit"
+    mem_req = await wait_for_mem_request(dut_if, max_cycles=4)
+    assert mem_req["en"], "Later load should miss L0 and issue to memory"
+    assert mem_req["addr"] == addr
+    await dut_if.step()
+
+    dut_if.drive_mem_response(fresh_word)
+    model.mem_response(fresh_word)
+    await dut_if.step()
+    dut_if.clear_mem_response()
+
+    result = await wait_for_fu_complete(dut_if)
+    assert result.valid, "Later load should complete from memory"
+    assert result.tag == 2
+    assert result.value == fresh_word
+
+    dut_if.drive_sq_all_older_known(False)
+    dut_if.clear_sq_forward()
+    await accept_fu_complete(dut_if)
+
+
 # ============================================================================
 # Test 26: Cache miss fills cache, subsequent hit
 # ============================================================================
@@ -1854,6 +1917,159 @@ async def test_amo_waits_for_rob_head_and_sq_committed_empty(dut: Any) -> None:
     assert mem_req["en"], "AMO should issue when at ROB head and sq_committed_empty"
 
 
+# ============================================================================
+# Test 35b: ROB-head AMO rescue from physical older-AMO block
+# ============================================================================
+@cocotb.test()
+async def test_blocked_head_amo_rescues_when_issue_would_idle(dut: Any) -> None:
+    """A physically blocked ROB-head AMO issues when no normal candidate exists."""
+    dut_if, model = await setup(dut)
+
+    from .lq_interface import AMOSWAP_W
+
+    # Physical order: younger pending AMO, then the true ROB-head AMO.  The
+    # older-AMO prefix is physical-order based, so the head AMO is masked unless
+    # the idle rescue path re-adds it.
+    dut_if.drive_alloc(rob_tag=1, size=MEM_SIZE_WORD, is_amo=True, amo_op=AMOSWAP_W)
+    model.alloc(1, False, MEM_SIZE_WORD, False, is_amo=True, amo_op=AMOSWAP_W)
+    await dut_if.step()
+    dut_if.clear_alloc()
+
+    dut_if.drive_alloc(rob_tag=0, size=MEM_SIZE_WORD, is_amo=True, amo_op=AMOSWAP_W)
+    model.alloc(0, False, MEM_SIZE_WORD, False, is_amo=True, amo_op=AMOSWAP_W)
+    await dut_if.step()
+    dut_if.clear_alloc()
+
+    dut_if.drive_addr_update(rob_tag=1, address=0x9000, amo_rs2=0x11)
+    model.addr_update(1, 0x9000, amo_rs2=0x11)
+    await dut_if.step()
+    dut_if.clear_addr_update()
+
+    dut_if.drive_addr_update(rob_tag=0, address=0x9004, amo_rs2=0x22)
+    model.addr_update(0, 0x9004, amo_rs2=0x22)
+    await dut_if.step()
+    dut_if.clear_addr_update()
+
+    dut_if.drive_rob_head_tag(0)
+    dut_if.drive_sq_empty(True)
+    dut_if.drive_sq_committed_empty(True)
+
+    mem_req = await wait_for_mem_request(dut_if, max_cycles=AMO_RESCUE_THRESHOLD + 8)
+    assert mem_req["en"], "Blocked ROB-head AMO should be rescued"
+    assert (
+        mem_req["addr"] == 0x9004
+    ), f"Expected rescued head AMO addr=0x9004, got 0x{mem_req['addr']:x}"
+
+
+# ============================================================================
+# Test 35c: ROB-head AMO rescue stays dormant while normal progress exists
+# ============================================================================
+@cocotb.test()
+async def test_blocked_head_amo_does_not_preempt_normal_candidate(dut: Any) -> None:
+    """A blocked ROB-head AMO does not jump ahead of a normal eligible load."""
+    dut_if, model = await setup(dut)
+
+    from .lq_interface import AMOSWAP_W
+
+    # Physical order: normal younger load, younger pending AMO, ROB-head AMO.
+    # The head AMO is physically blocked, but the load is a normal candidate, so
+    # the rescue path must stay dormant and preserve speculative load progress.
+    dut_if.drive_alloc(rob_tag=2, size=MEM_SIZE_WORD)
+    model.alloc(2, False, MEM_SIZE_WORD, False)
+    await dut_if.step()
+    dut_if.clear_alloc()
+
+    dut_if.drive_alloc(rob_tag=1, size=MEM_SIZE_WORD, is_amo=True, amo_op=AMOSWAP_W)
+    model.alloc(1, False, MEM_SIZE_WORD, False, is_amo=True, amo_op=AMOSWAP_W)
+    await dut_if.step()
+    dut_if.clear_alloc()
+
+    dut_if.drive_alloc(rob_tag=0, size=MEM_SIZE_WORD, is_amo=True, amo_op=AMOSWAP_W)
+    model.alloc(0, False, MEM_SIZE_WORD, False, is_amo=True, amo_op=AMOSWAP_W)
+    await dut_if.step()
+    dut_if.clear_alloc()
+
+    dut_if.drive_addr_update(rob_tag=2, address=0xA000)
+    model.addr_update(2, 0xA000)
+    await dut_if.step()
+    dut_if.clear_addr_update()
+
+    dut_if.drive_addr_update(rob_tag=1, address=0xA004, amo_rs2=0x11)
+    model.addr_update(1, 0xA004, amo_rs2=0x11)
+    await dut_if.step()
+    dut_if.clear_addr_update()
+
+    dut_if.drive_addr_update(rob_tag=0, address=0xA008, amo_rs2=0x22)
+    model.addr_update(0, 0xA008, amo_rs2=0x22)
+    await dut_if.step()
+    dut_if.clear_addr_update()
+
+    dut_if.drive_rob_head_tag(0)
+    dut_if.drive_sq_empty(True)
+    dut_if.drive_sq_committed_empty(True)
+
+    mem_req = await wait_for_mem_request(dut_if, max_cycles=8)
+    assert mem_req["en"], "Normal load candidate should still issue"
+    assert (
+        mem_req["addr"] == 0xA000
+    ), f"Expected normal load addr=0xA000, got 0x{mem_req['addr']:x}"
+
+
+# ============================================================================
+# Test 35d: ROB-head AMO idle rescue must not replace busy SQ-check
+# ============================================================================
+@cocotb.test()
+async def test_blocked_head_amo_does_not_replace_busy_sq_check(dut: Any) -> None:
+    """Idle rescue stays off while a younger load is already in SQ-check."""
+    dut_if, model = await setup(dut)
+
+    from .lq_interface import AMOSWAP_W
+
+    dut_if.drive_rob_head_tag(0)
+    dut_if.drive_sq_empty(False)
+    dut_if.drive_sq_all_older_known(False)
+    dut_if.drive_sq_forward(match=False, can_forward=False)
+    dut_if.drive_sq_committed_empty(True)
+
+    await alloc_and_addr(dut_if, model, rob_tag=2, address=0xB000)
+
+    sq_check = await wait_for_sq_check(dut_if, max_cycles=4)
+    assert sq_check["valid"], "Younger load should occupy SQ-check"
+    assert sq_check["rob_tag"] == 2
+
+    # Physical order after the staged load: younger pending AMO, then the true
+    # ROB-head AMO.  The head AMO is eligible but physically blocked by the
+    # younger AMO.  The idle rescue must not evict the existing SQ-check entry.
+    dut_if.drive_alloc(rob_tag=1, size=MEM_SIZE_WORD, is_amo=True, amo_op=AMOSWAP_W)
+    model.alloc(1, False, MEM_SIZE_WORD, False, is_amo=True, amo_op=AMOSWAP_W)
+    await dut_if.step()
+    dut_if.clear_alloc()
+
+    dut_if.drive_alloc(rob_tag=0, size=MEM_SIZE_WORD, is_amo=True, amo_op=AMOSWAP_W)
+    model.alloc(0, False, MEM_SIZE_WORD, False, is_amo=True, amo_op=AMOSWAP_W)
+    await dut_if.step()
+    dut_if.clear_alloc()
+
+    dut_if.drive_addr_update(rob_tag=1, address=0xB004, amo_rs2=0x11)
+    model.addr_update(1, 0xB004, amo_rs2=0x11)
+    await dut_if.step()
+    dut_if.clear_addr_update()
+
+    dut_if.drive_addr_update(rob_tag=0, address=0xB008, amo_rs2=0x22)
+    model.addr_update(0, 0xB008, amo_rs2=0x22)
+    await dut_if.step()
+    dut_if.clear_addr_update()
+
+    for _ in range(6):
+        await Timer(1, unit="ns")
+        mem_req = dut_if.read_mem_request()
+        assert not mem_req["en"], "Blocked head AMO must not replace busy SQ-check"
+        sq_check = dut_if.read_sq_check()
+        assert sq_check["valid"], "Original SQ-check entry should remain staged"
+        assert sq_check["rob_tag"] == 2
+        await dut_if.step()
+
+
 # ============================================================================
 # Test 36: AMO SWAP
 # ============================================================================
diff --git a/verif/cocotb_tests/tomasulo/reorder_buffer/reorder_buffer_interface.py b/verif/cocotb_tests/tomasulo/reorder_buffer/reorder_buffer_interface.py
index d405210b..ecfe047d 100644
--- a/verif/cocotb_tests/tomasulo/reorder_buffer/reorder_buffer_interface.py
+++ b/verif/cocotb_tests/tomasulo/reorder_buffer/reorder_buffer_interface.py
@@ -383,6 +383,9 @@ def _init_inputs(self) -> None:
         self.dut.i_trap_taken.value = 0
         self.dut.i_mret_done.value = 0
         self.dut.i_mepc.value = 0
+        self.dut.i_priv.value = (
+            0b11  # PrivM: MRET/privileged CSR tests run in machine mode.
+        )
         self.dut.i_interrupt_pending.value = 0
         self.dut.i_flush_en.value = 0
         self.dut.i_flush_tag.value = 0
diff --git a/verif/cocotb_tests/tomasulo/tomasulo_wrapper/test_tomasulo_wrapper.py b/verif/cocotb_tests/tomasulo/tomasulo_wrapper/test_tomasulo_wrapper.py
index cf75edaa..16e0f1f8 100644
--- a/verif/cocotb_tests/tomasulo/tomasulo_wrapper/test_tomasulo_wrapper.py
+++ b/verif/cocotb_tests/tomasulo/tomasulo_wrapper/test_tomasulo_wrapper.py
@@ -367,6 +367,43 @@ async def mark_done_via_cdb(dut_if: TomasuloInterface, tag: int, value: int) ->
     dut_if.clear_cdb_write()
 
 
+async def issue_sw_via_mem_rs(
+    dut_if: TomasuloInterface,
+    tag: int,
+    base_addr: int,
+    store_data: int,
+    imm: int = 0,
+    max_cycles: int = 6,
+) -> dict:
+    """Dispatch an SW to MEM_RS and wait until its issue is captured."""
+    dut_if.drive_rs_dispatch(
+        rs_type=RS_MEM,
+        rob_tag=tag,
+        op=OP_SW,
+        src1_ready=True,
+        src1_value=base_addr,
+        src2_ready=True,
+        src2_value=store_data,
+        src3_ready=True,
+        imm=imm,
+        use_imm=True,
+        mem_size=2,
+        mem_signed=False,
+    )
+    await dut_if.step()
+    dut_if.clear_rs_dispatch()
+
+    for _ in range(max_cycles):
+        await Timer(1, unit="ps")
+        issue = dut_if.read_rs_issue_for(RS_MEM)
+        if issue["valid"] and issue["rob_tag"] == tag:
+            await dut_if.step()
+            return dict(issue)
+        await dut_if.step()
+
+    raise TimeoutError("SW did not issue from MEM_RS")
+
+
 async def wait_for_commit_pair(
     dut_if: TomasuloInterface, max_cycles: int = 10
 ) -> tuple[dict, dict, bool]:
@@ -874,6 +911,58 @@ async def test_widen_commit_slot2_clears_rat_through_wrapper(dut: Any) -> None:
     cocotb.log.info("=== Test Passed ===")
 
 
+@cocotb.test()
+async def test_slot2_store_raw_commit_blocks_sq_committed_empty(dut: Any) -> None:
+    """A raw slot-2 store commit must immediately hold SQ committed_empty low."""
+    cocotb.log.info("=== Test: Slot-2 Store Raw Commit Blocks SQ committed_empty ===")
+    dut_if, _ = await setup_test(dut)
+
+    dut_if.set_all_fu_ready(True)
+    dut_if.set_commit_hold(True)
+
+    tag_1, tag_2 = await drive_dual_alloc(
+        dut_if,
+        make_int_req(pc=0x4380, rd=11),
+        make_store_req(pc=0x4384),
+    )
+
+    await mark_done_via_cdb(dut_if, tag_1, 0x5151)
+    issue = await issue_sw_via_mem_rs(
+        dut_if,
+        tag=tag_2,
+        base_addr=0x2400,
+        store_data=0xA5A5_5A5A,
+        imm=4,
+    )
+    assert issue["rob_tag"] == tag_2
+    assert not dut_if.sq_empty, "Store should have an SQ entry before commit"
+    assert dut_if.sq_committed_empty, "Uncommitted store should not block traps yet"
+
+    dut_if.set_widen_commit_ok(True)
+    dut_if.set_commit_hold(False)
+
+    await Timer(1, unit="ps")
+    commit_1 = dut_if.read_commit()
+    commit_2 = dut_if.read_commit_2()
+    commit_2_valid_raw = dut_if.commit_2_valid_raw
+    commit_2_store_like_raw = dut_if.commit_2_store_like_raw
+
+    await RisingEdge(dut_if.clock)
+    await Timer(1, unit="ps")
+    sq_committed_empty = bool(dut_if.sq_committed_empty)
+    await FallingEdge(dut_if.clock)
+
+    assert commit_1["valid"] and commit_1["tag"] == tag_1
+    assert commit_2["valid"] and commit_2["tag"] == tag_2
+    assert commit_2_valid_raw, "Slot-2 raw commit should be visible"
+    assert commit_2_store_like_raw, "Slot-2 raw commit should be store-like"
+    assert (
+        not sq_committed_empty
+    ), "Slot-2 raw store commit must feed SQ's same-cycle committed_empty guard"
+
+    cocotb.log.info("=== Test Passed ===")
+
+
 @cocotb.test()
 async def test_widen_commit_ok_blocks_slot2_through_wrapper(dut: Any) -> None:
     """Wrapper forwards slot-2 widen-commit back-pressure to the ROB."""
@@ -4387,16 +4476,13 @@ async def test_sc_pending_does_not_block_older_load(dut: Any) -> None:
 
 @cocotb.test()
 async def test_partial_flush_preserves_older_sc_pending(dut: Any) -> None:
-    """Partial flush clears sc_pending even if SC is older than flush tag.
-
-    speculative_flush_all treats any i_flush_en as a full flush for timing
-    closure, so sc_pending is always cleared on partial flush regardless of
-    age.  This test verifies the conservative (timing-safe) behaviour.
+    """Partial flush preserves sc_pending when SC is older than flush tag.
 
-    Scenario: SC (tag 1) issues → sc_pending set. Branch (tag 2) mispredicts →
-    partial flush with flush_tag=2. Conservative flush clears sc_pending.
+    Scenario: SC (tag 1) issues -> sc_pending set. Branch (tag 2) mispredicts
+    -> partial flush with flush_tag=2. The SC is older than the flush boundary,
+    so the table entry must survive.
     """
-    cocotb.log.info("=== Test: Partial Flush Clears SC Pending (Conservative) ===")
+    cocotb.log.info("=== Test: Partial Flush Preserves Older SC Pending ===")
     dut_if, model = await setup_test(dut)
 
     addr = 0x1000
@@ -4518,17 +4604,14 @@ async def test_partial_flush_preserves_older_sc_pending(dut: Any) -> None:
     dut_if.set_fu_ready(RS_MEM, False)
 
     assert int(dut.sc_pending.value), "sc_pending should be set"
-    assert int(dut.sc_pending_unit_inst.sc_pending_rob_tag.value) == tag_sc
 
     # --- Phase 6: Partial flush with tag=branch (younger than SC) ---
-    # speculative_flush_all = i_flush_all || i_flush_en, so any partial flush
-    # conservatively clears sc_pending regardless of age comparison.
     dut_if.drive_flush_en(tag_branch)
     await dut_if.step()
     dut_if.clear_flush_en()
 
-    assert not int(dut.sc_pending.value), (
-        f"sc_pending should be cleared by conservative flush: SC tag={tag_sc}, "
+    assert int(dut.sc_pending.value), (
+        f"sc_pending should survive partial flush: SC tag={tag_sc}, "
         f"flush tag={tag_branch}"
     )
 
diff --git a/verif/cocotb_tests/tomasulo/tomasulo_wrapper/tomasulo_interface.py b/verif/cocotb_tests/tomasulo/tomasulo_wrapper/tomasulo_interface.py
index 011e31c2..d238d5fd 100644
--- a/verif/cocotb_tests/tomasulo/tomasulo_wrapper/tomasulo_interface.py
+++ b/verif/cocotb_tests/tomasulo/tomasulo_wrapper/tomasulo_interface.py
@@ -601,6 +601,11 @@ def commit_2_valid_raw(self) -> bool:
         """Return unregistered widen-commit slot-2 valid."""
         return bool(self.dut.o_commit_2_valid_raw.value)
 
+    @property
+    def commit_2_store_like_raw(self) -> bool:
+        """Return unregistered widen-commit slot-2 store-like marker."""
+        return bool(self.dut.o_commit_2_store_like_raw.value)
+
     # =========================================================================
     # ROB Status
     # =========================================================================
@@ -1091,6 +1096,11 @@ def sq_empty(self) -> bool:
         """Return whether SQ is empty."""
         return bool(self.dut.o_sq_empty.value)
 
+    @property
+    def sq_committed_empty(self) -> bool:
+        """Return whether SQ has no committed stores waiting to drain."""
+        return bool(self.dut.o_sq_committed_empty.value)
+
     @property
     def sq_count(self) -> int:
         """Return number of valid SQ entries."""
diff --git a/verif/config.py b/verif/config.py
index d7f8ed32..c81e0620 100644
--- a/verif/config.py
+++ b/verif/config.py
@@ -82,7 +82,9 @@
 """Base address of MMIO peripheral range (UART, CLINT timer, etc.)."""
 
 MMIO_SIZE_BYTES: Final[int] = 0x2C
-"""Size of MMIO peripheral range in bytes (44 bytes: 0x40000000-0x4000002B)."""
+"""Legacy MMIO range size; currently unused. The RTL window is 0x1_C000 bytes
+(see cpu_and_mem.sv MmioSizeBytes) with the ns16550 UART at +0x1000 and the
+CLINT at +0x10000."""
 
 # ============================================================================
 # Register File Configuration
@@ -179,8 +181,14 @@ class DUTSignalPaths:
         dut.device_under_test.regfile_inst.ram
 
     Default Paths:
-        These paths match the Frost CPU's default hierarchy. If your DUT
-        has different module names or hierarchy, create a custom instance:
+        The defaults below are legacy fallbacks from the pre-rename hierarchy
+        and no longer resolve. The regfiles now live under
+        ``ooo_register_files_inst`` and each read_port_ram sits inside a
+        ``gen_single_write``/``gen_multi_write`` scope, e.g.
+        ``device_under_test.ooo_register_files_inst.regfile_inst.``
+        ``gen_read_port[0].gen_multi_write.read_port_ram`` (test_helpers.py
+        navigates this modern path directly). If your DUT has different
+        module names or hierarchy, create a custom instance:
 
         >>> custom_paths = DUTSignalPaths(
         ...     regfile_ram_rs1_path="cpu_core.registers.port_a.data",
diff --git a/verif/encoders/__init__.py b/verif/encoders/__init__.py
index 8ae73513..dcf4301c 100644
--- a/verif/encoders/__init__.py
+++ b/verif/encoders/__init__.py
@@ -55,7 +55,7 @@
     binary = enc_add(rd=1, rs1=2, rs2=3)  # add x1, x2, x3
 
     # Get encoder for 'lw' instruction
-    enc_lw = LOADS["lw"]
+    enc_lw, eval_lw = LOADS["lw"]
     binary = enc_lw(rd=5, rs1=10, imm=16)  # lw x5, 16(x10)
 """
 
diff --git a/verif/encoders/compressed_encode.py b/verif/encoders/compressed_encode.py
index 4f0decbd..69fa68ce 100644
--- a/verif/encoders/compressed_encode.py
+++ b/verif/encoders/compressed_encode.py
@@ -32,7 +32,7 @@
 
 Example Usage:
     >>> # Encode C.ADDI x10, 5
-    >>> instr = enc_c_addi(rd=10, imm=5)
+    >>> instr = enc_c_addi(rd=10, nzimm=5)
     >>> hex(instr)
     '0x0515'  # 16-bit compressed instruction
 
diff --git a/verif/encoders/instruction_encode.py b/verif/encoders/instruction_encode.py
index f0f57bfc..ace3654d 100644
--- a/verif/encoders/instruction_encode.py
+++ b/verif/encoders/instruction_encode.py
@@ -49,7 +49,7 @@
     ...     opcode=0x33                 # ALU register-register
     ... )
     >>> hex(instruction)
-    '0x004181b3'
+    '0x004182b3'
 """
 
 from dataclasses import dataclass