diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index baa769a3..2a8c0b48 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -342,3 +342,215 @@ jobs: run: | docker run --rm -v ${{ github.workspace }}:/workspace frost-dev:latest \ pre-commit run --all-files + + # --------------------------------------------------------------------------- + # FROST no-MMU M-mode Linux: build the image from source, then boot it two + # ways off the SAME artifact -- on the FROST RTL in cocotb (the gremlin + # regression) and under QEMU (a fast full boot-to-shell reference). + # --------------------------------------------------------------------------- + + # Build the kernel + busybox initramfs + FROST memory images from the vendored + # Buildroot submodule (linux/buildroot @ pinned SHA) driven by the FROST + # BR2_EXTERNAL tree. Runs inside the frost-dev image, which ships Buildroot's + # host deps + QEMU, so the host deps are single-sourced in the Dockerfile. The + # first build compiles a full rv32 uClibc cross toolchain from source; the dl/ + # and ccache caches make later runs much faster. + build-frost-linux: + name: Build FROST Linux Image (Buildroot) + runs-on: ubuntu-24.04 + needs: build-docker + timeout-minutes: 120 + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + # Recursive so the pinned linux/buildroot submodule is fetched. + submodules: recursive + + - name: Download Docker image + uses: actions/download-artifact@v4 + with: + name: frost-docker-image + path: /tmp + + - name: Load Docker image + run: | + docker load --input /tmp/frost-dev.tar + # The in-workspace Buildroot build needs ~10 GB; free the image tar. + rm -f /tmp/frost-dev.tar + + - name: Cache Buildroot downloads (source tarballs) + uses: actions/cache@v4 + with: + path: linux/dl + # dl/ only changes when a package version (kernel/toolchain) changes. 
+ key: br2-dl-${{ hashFiles('linux/buildroot-external/configs/frost_nommu_rv32_defconfig', 'linux/buildroot-external/board/frost/**') }} + restore-keys: | + br2-dl- + + - name: Cache Buildroot ccache (toolchain + kernel object cache) + uses: actions/cache@v4 + with: + path: linux/ccache + key: br2-ccache-${{ github.sha }} + restore-keys: | + br2-ccache- + + - name: Build kernel + initramfs + FROST memory images + run: | + docker run --rm \ + -e BR2_DL_DIR=/workspace/linux/dl \ + -e BR2_CCACHE_DIR=/workspace/linux/ccache \ + -v ${{ github.workspace }}:/workspace frost-dev:latest \ + bash -c ' + set -euo pipefail + make -C linux/buildroot O=/workspace/linux/build \ + BR2_EXTERNAL=/workspace/linux/buildroot-external \ + frost_nommu_rv32_defconfig + # Enable ccache for CI only (kept out of the committed defconfig). + echo "BR2_CCACHE=y" >> /workspace/linux/build/.config + make -C linux/buildroot O=/workspace/linux/build olddefconfig + make -C linux/buildroot O=/workspace/linux/build + # Stage the memory images where the cocotb linux_boot test resolves + # them (sw/apps/linux_boot/{sw,sw_ddr}.mem). + mkdir -p sw/apps/linux_boot + cp linux/build/images/sw.mem sw/apps/linux_boot/sw.mem + cp linux/build/images/sw_ddr.mem sw/apps/linux_boot/sw_ddr.mem + ls -l linux/build/images/ + ' + + - name: Upload FROST Linux boot images + uses: actions/upload-artifact@v4 + with: + name: frost-linux-boot-images + path: | + linux/build/images/Image + linux/build/images/rootfs.cpio.gz + linux/build/images/frost-nommu-fpga.dtb + linux/build/images/sw.mem + linux/build/images/sw_ddr.mem + retention-days: 7 + if-no-files-found: error + + # Boot the freshly built image on the FROST RTL in cocotb, bounded to ~22M + # cycles in the genesys2-faithful cache shape (128 KiB L1I, no L2). 
This is the + # "gremlin" regression: that window is silent mem_init after devtmpfs (no deep + # console marker), so the run captures the full boot (FROST_LINUX_RUN_FULL) and + # check_linux_boot_regression.py then asserts boot health -- banner + devtmpfs, + # no panic, the periodic CLINT timer tick serviced (mtimecmp re-armed, the + # thing the gremlin hung), and forward progress (retire) all the way to the + # cap. CACHED_HAS_L2=0 must be an env/make var (the test's own -GCACHED_HAS_L2=0 + # is otherwise overridden by the tests/Makefile default). + linux-boot-cocotb: + name: Cocotb Linux Boot (22M, genesys2 shape) + runs-on: ubuntu-24.04 + needs: [build-docker, build-frost-linux] + # Verilator compile of the core plus the bounded 22M-cycle sim; GitHub + # runners sim this at roughly 4-8k cycles/s, so leave generous headroom. + timeout-minutes: 240 + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Download Docker image + uses: actions/download-artifact@v4 + with: + name: frost-docker-image + path: /tmp + + - name: Load Docker image + run: | + docker load --input /tmp/frost-dev.tar + # Long Verilator compile + 22M-cycle sim in this job; free the tar. + rm -f /tmp/frost-dev.tar + + - name: Download FROST Linux boot images + uses: actions/download-artifact@v4 + with: + name: frost-linux-boot-images + path: /tmp/frost-linux-images + + - name: Stage boot images for the cocotb linux_boot test + run: | + mkdir -p sw/apps/linux_boot + cp /tmp/frost-linux-images/sw.mem sw/apps/linux_boot/sw.mem + cp /tmp/frost-linux-images/sw_ddr.mem sw/apps/linux_boot/sw_ddr.mem + + - name: Run cocotb linux_boot (22M-cycle gremlin regression) + run: | + # FROST_LINUX_PREBUILT=1: the cocotb runner cleans + re-makes the app + # before simulating; this tells the linux_boot Makefile the staged + # images are authoritative (no in-job Buildroot rebuild, which would + # take ~1h uncached and overwrite the artifact under test). 
+ docker run --rm \ + -e CACHED_HAS_L2=0 \ + -e FROST_LINUX_RUN_FULL=1 \ + -e FROST_LINUX_PREBUILT=1 \ + -e COCOTB_NUM_RUNS=1 \ + -e COCOTB_LINUX_MAX_CYCLES=22000000 \ + -e COCOTB_PROGRESS_INTERVAL=500000 \ + -v ${{ github.workspace }}:/workspace frost-dev:latest \ + bash -c ' + set -o pipefail + cd tests && make clean + # RUN_FULL capture always trips the cocotb never-match assertion by + # design; the boot-health verdict comes from the checker below. + ./test_run_cocotb.py linux_boot_128k 2>&1 \ + | tee /workspace/linux_boot_cocotb.log || true + python3 check_linux_boot_regression.py /workspace/linux_boot_cocotb.log + ' + + - name: Upload cocotb boot log + if: always() + uses: actions/upload-artifact@v4 + with: + name: linux-boot-cocotb-log + path: linux_boot_cocotb.log + if-no-files-found: ignore + + # Boot the SAME kernel + rootfs to a login prompt under QEMU: a fast (seconds) + # full-userspace reference that the RTL boot is bounded well short of. Confirms + # the built image itself reaches a shell, independent of the FROST core. + linux-boot-qemu: + name: QEMU Linux Boot to Shell + runs-on: ubuntu-24.04 + needs: [build-docker, build-frost-linux] + timeout-minutes: 20 + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Download Docker image + uses: actions/download-artifact@v4 + with: + name: frost-docker-image + path: /tmp + + - name: Load Docker image + run: docker load --input /tmp/frost-dev.tar + + - name: Download FROST Linux boot images + uses: actions/download-artifact@v4 + with: + name: frost-linux-boot-images + path: /tmp/frost-linux-images + + - name: Boot to a shell under QEMU (assert login prompt) + run: | + docker run --rm -v /tmp/frost-linux-images:/img frost-dev:latest \ + bash -c ' + set -euo pipefail + # QEMU sits at the login prompt forever, so time-box it and assert + # the marker from the captured log (file redirect, never a pipe -- + # piping QEMUs stdout to grep deadlocks). 
+ timeout -k5 120 qemu-system-riscv32 -M virt -bios none \ + -kernel /img/Image \ + -append "earlycon=sbi console=ttyS0 rdinit=/sbin/init" \ + -initrd /img/rootfs.cpio.gz -nographic -cpu rv32,mmu=off \ + > /tmp/qemu-boot.log 2>&1 || true + echo "===== QEMU boot log (tail) =====" + tail -n 20 /tmp/qemu-boot.log + grep -q "buildroot login:" /tmp/qemu-boot.log + ' diff --git a/.gitignore b/.gitignore index 9c763545..e73378ca 100644 --- a/.gitignore +++ b/.gitignore @@ -65,6 +65,13 @@ sw_ddr.bin sw_imem_*.mem sw/apps/*/sw.S +# FROST no-MMU Linux: out-of-tree Buildroot build output + download/ccache +# caches. Regenerated from the linux/buildroot submodule + linux/buildroot-external +# tree by `make -C sw/apps/linux_boot` (and the CI build-frost-linux job). +/linux/build/ +/linux/dl/ +/linux/ccache/ + # mypy .mypy_cache/ diff --git a/.gitmodules b/.gitmodules index 2f663032..e2e8282c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -15,3 +15,6 @@ [submodule "sw/apps/coremark_pro/coremark-pro"] path = sw/apps/coremark_pro/coremark-pro url = https://github.com/eembc/coremark-pro.git +[submodule "linux/buildroot"] + path = linux/buildroot + url = https://github.com/buildroot/buildroot.git diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d93c99da..a6f71a84 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -20,7 +20,7 @@ This document provides guidelines for contributors. The detailed style sections ## Project Overview -FROST is an out-of-order RISC-V processor implementing **RV32GCB** (G = IMAFD) with a Tomasulo back-end and full machine-mode privilege support. Understanding the architecture helps you contribute effectively: +FROST is an out-of-order RISC-V processor implementing **RV32GCB** (G = IMAFD) with a Tomasulo back-end and Machine + User (M/U) privilege modes.
Understanding the architecture helps you contribute effectively: ### Architecture Outline @@ -446,8 +446,11 @@ The project uses pytest markers to categorize tests: Run the full CPU test suite: ```bash -# Full random instruction test (16,000+ instructions) -pytest tests/test_run_cocotb.py::TestCPU -s +# Full cocotb test suite +pytest tests/test_run_cocotb.py -s + +# Directed trap/exception tests +./tests/test_run_cocotb.py directed_traps # ISA compliance tests pytest "tests/test_run_cocotb.py::TestRealPrograms::test_real_program[isa_test]" -s @@ -577,7 +580,7 @@ We welcome contributions in these areas: |------|----------| | Bug fixes | OOO ordering, instruction encoding, timing issues | | ISA extensions | Additional standard or custom extensions | -| Privilege modes | S-mode (supervisor), U-mode (user) support | +| Privilege modes | S-mode (supervisor), PMP, virtual memory (M and U modes already supported) | | Board support | New FPGA boards, SoC integrations | | Performance | Branch predictor, scheduler, memory-system, or cache improvements | | Peripherals | SPI, I2C, GPIO, timers | diff --git a/Dockerfile b/Dockerfile index 410b3721..026380c7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -151,6 +151,33 @@ ENV RISCV_PREFIX=riscv-none-elf- # Fix git "dubious ownership" error when mounting repo as volume RUN git config --global --add safe.directory /workspace +# Buildroot host dependencies + QEMU. Used by the FROST no-MMU Linux CI jobs: +# * build-frost-linux - builds the kernel + initramfs + FROST memory images +# from the linux/buildroot-external tree (Buildroot compiles its own rv32 +# uClibc cross toolchain from source, so it needs a full host build env). +# * linux-boot-qemu - boots the same Image + rootfs to a shell under +# qemu-system-riscv32 (qemu-system-misc provides the riscv32 target). +# `load_software.py linux_boot` self-builds via the same path, so these +# are the single source of truth for the Linux build's host deps.
Kept as a late +# layer so the expensive Verilator/Yosys/SMT source builds above stay cached. +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + patch \ + cpio \ + rsync \ + bc \ + file \ + unzip \ + wget \ + bzip2 \ + ccache \ + libssl-dev \ + libelf-dev \ + libncurses-dev \ + device-tree-compiler \ + qemu-system-misc \ + && rm -rf /var/lib/apt/lists/* + # Install Python dependencies (cocotb, pytest, pre-commit, etc.) RUN pip install --no-cache-dir --break-system-packages \ "cocotb==2.0.1" \ diff --git a/README.md b/README.md index 553a607e..7e194353 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ **F**PGA **R**ISC-V **O**pen-sourced in **S**ystemVerilog by **T**woSigma -An out-of-order RISC-V processor implementing **RV32GCB** (G = IMAFD) with a Tomasulo back-end and full machine-mode privilege support for RTOS operation. Achieves 300 MHz on UltraScale+. Designed for FPGA deployment with clean, portable SystemVerilog. +An out-of-order RISC-V processor implementing **RV32GCB** (G = IMAFD) with a Tomasulo back-end and Machine + User (M/U) privilege modes for RTOS operation. Achieves 300 MHz on UltraScale+. Designed for FPGA deployment with clean, portable SystemVerilog. ## Why FROST? @@ -13,6 +13,7 @@ There are many RISC-V cores. Here's what makes FROST different: - **Solid performance** — 3.08 CoreMark/MHz (924 CoreMark at 300 MHz on UltraScale+) from a Tomasulo out-of-order back-end with 2-wide dispatch/rename, 2-wide commit, branch prediction (BTB + bimodal direction predictor + RAS), an L0 cache, and a fast two-cycle conditional-branch misprediction recovery path. 
- **Layered verification** — constrained-random tests, directed tests, real C programs, the official [riscv-arch-test](https://github.com/riscv-non-isa/riscv-arch-test) compliance suite, [riscv-tests](https://github.com/riscv-software-src/riscv-tests) ISA tests, and random instruction torture tests all run in Cocotb simulation, along with formal verification. - **Real workloads included** — all nine official EEMBC CoreMark-PRO workloads (on both supported boards, backed by the DDR cache hierarchy), FreeRTOS demo, CoreMark benchmark, ISA compliance suite, and 400+ architecture compliance tests all run in simulation and on hardware. +- **Boots no-MMU Linux** — an in-tree Buildroot flow (`linux/`) builds a no-MMU M-mode Linux image; CI builds it from source (`build-frost-linux`) and boots it in both cocotb RTL simulation (`linux-boot-cocotb`) and QEMU (`linux-boot-qemu`). - **Portable core RTL** — the CPU core avoids vendor primitives and is checked with generic Yosys coarse synthesis. Full open-source Yosys synthesis is also tested for Xilinx 7-series, UltraScale, and UltraScale+ targets; board wrappers are provided for Kintex-7 and UltraScale+. - **Apache 2.0 licensed** — permissive license suitable for commercial and academic use. @@ -55,8 +56,8 @@ There are many RISC-V cores. Here's what makes FROST different: │ │ │ ┌──────────────────────────┐ ┌─────────────────────────────────────┐ │ │ │ Trap Unit │ │ Peripherals │ │ -│ │ (M-mode, mret, wfi, │ │ UART, mtime/mtimecmp, FIFO0/1 │ │ -│ │ interrupts, exceptions) │ │ │ │ +│ │ (M/U traps, mret, wfi, │ │ UART (+ ns16550a face), FIFO0/1 │ │ +│ │ interrupts, exceptions) │ │ CLINT timer (mtime/mtimecmp, msip) │ │ │ └──────────────────────────┘ └─────────────────────────────────────┘ │ │ │ └──────────────────────────────────────────────────────────────────────────────┘ @@ -82,6 +83,7 @@ There are many RISC-V cores. 
Here's what makes FROST different: | **Zbkb** | Bit manipulation for crypto | | **Zihintpause** | Pause hint for spin-wait loops | | **Machine Mode** | M-mode privilege (mret, wfi, ecall, ebreak) | +| **User Mode** | U-mode privilege (ecall traps to M-mode) | ### Architecture Highlights @@ -95,11 +97,11 @@ There are many RISC-V cores. Here's what makes FROST different: - **Conservative memory disambiguation** — loads gated until older store addresses known, with store-to-load forwarding from the SQ - **Two-tier branch recovery** — conditional-branch mispredictions use a fast ~2-cycle path (front-end redirect + RAT restore in the same cycle); JALR and exceptions take the slower commit-time path - **Branch prediction** with a 256-entry 2-bit BTB (trained for conditional branches and JAL, with slot-2 lookup support), 1024-entry bimodal direction predictor, 8-entry return address stack, and PD-stage computed-target redirects for conditional BTB misses predicted taken -- **L0 cache** in front of the load queue reduces load-use latency (direct-mapped, write-through) -- **M-mode trap handling** for RTOS support (interrupts and exceptions) +- **L0 cache** in front of the load queue reduces load-use latency (direct-mapped, read-fill; stores invalidate matching lines) +- **Machine + User (M/U) privilege modes** for RTOS support — traps from both modes are taken in M-mode (interrupts and exceptions) - **CLINT-compatible timer** (mtime/mtimecmp) for preemptive scheduling - **Harvard architecture** with separate instruction and data memory ports -- **Write-back cache hierarchy over DDR** — a 1 GiB cached region at `0x8000_0000` served by recursive line-port caches (`frost_cache`: direct-mapped, 32 B lines, write-back/write-allocate). 
Both instruction fetch (a 16 KiB read-only L1I) and data (a 128 KiB L1D) run through it on every board — so code can execute from DDR, not just from low BRAM — sharing a 2:1 line-port arbiter (data-side priority), plus a 2 MiB UltraRAM L2 spliced in on UltraScale+, over the board's DDR (DDR3 on Genesys2, DDR4 on X3) through a single-beat AXI bridge +- **Write-back cache hierarchy over DDR** — a 1 GiB cached region at `0x8000_0000` served by recursive line-port caches (`frost_cache`: direct-mapped, 32 B lines, write-back/write-allocate). Both instruction fetch (a read-only L1I — 16 KiB on X3, 128 KiB on Genesys2) and data (a 128 KiB L1D) run through it on every board — so code can execute from DDR, not just from low BRAM — sharing a 2:1 line-port arbiter (data-side priority), plus a 2 MiB UltraRAM L2 spliced in on UltraScale+, over the board's DDR (DDR3 on Genesys2, DDR4 on X3) through a single-beat AXI bridge - **One memory map everywhere** — software sees the same layout on every board and in simulation: a 256 KiB fast, uncached BRAM region (code/data/stack, 1-cycle) plus the 1 GiB cached region (execute-from-DDR code, heap, and large data); the hierarchy shape behind it is opaque to software - **Portable core RTL** — written in generic SystemVerilog with no vendor-specific primitives in the CPU core; CI checks vendor-agnostic elaboration and coarse synthesis, while full FPGA builds are currently Xilinx-focused @@ -172,10 +174,11 @@ You should see "Hello, world!" in the output. ### Run the CPU Verification Suite ```bash -make -C tests # constrained-random regression on the cpu_tb testbench +pytest tests/ # full regression (riscv-tests, arch compliance, C programs, …) +./tests/test_run_cocotb.py directed_traps # directed M-mode trap/interrupt tests (cpu_tb) ``` -This runs constrained-random instructions through the CPU, verifying each against a software reference model. 
(The random regression runs on the `cpu_tb` testbench — the `tests/` Makefile default — rather than as a `test_run_cocotb.py` target.) +The pytest run validates the CPU against the riscv-tests ISA suites, the riscv-arch-test compliance suite, and real C programs. The legacy constrained-random `cpu_tb` regression is registered as the CLI-only `cpu_random` target; it predates the OOO integration and needs porting before it passes on the current core. ## Directory Structure @@ -203,6 +206,7 @@ frost/ │ ├── coremark_pro/ # EEMBC CoreMark-PRO suite (DDR-backed heap) │ ├── freertos_demo/ # FreeRTOS RTOS demo │ └── ... # Other applications +├── linux/ # Buildroot no-MMU Linux image build (submodule + external tree) ├── verif/ # Verification infrastructure │ ├── cocotb_tests/ # Cocotb test cases │ ├── models/ # Software reference models @@ -244,7 +248,7 @@ git submodule update --init pytest tests/ # Run all tests pytest tests/ -s # With live output # Standalone test runner -make -C tests # CPU constrained-random verification (cpu_tb) +./tests/test_run_cocotb.py directed_traps # Directed trap/interrupt tests (cpu_tb) ./tests/test_run_cocotb.py hello_world # Hello World program ./tests/test_run_cocotb.py isa_test # ISA compliance ./tests/test_run_cocotb.py coremark # CoreMark benchmark @@ -256,7 +260,8 @@ make -C tests # CPU constrained-random verification ./tests/test_run_cocotb.py frost_cache # Cache-hierarchy unit bench (X3 shape) ./tests/test_run_cocotb.py freertos_demo # FreeRTOS demo -# With waveform output +# With waveform output (cpu_tb Makefile flow; note the default constrained-random +# suite is the CLI-only `cpu_random` target and needs porting to the OOO core) WAVES=1 make -C tests ``` @@ -275,8 +280,7 @@ WAVES=1 make -C tests Running `pytest tests/` exercises: -- **CPU verification** — constrained-random instruction sequences validated against Python reference models -- **Directed tests** — atomic operations (LR/SC), trap handling, compressed instructions +- 
**Directed tests** — M-mode trap/interrupt handling (`directed_traps` on the cpu_tb harness); LR/SC and compressed-instruction coverage is carried by the rv32ua/rv32uc riscv-tests, the arch-compliance suite, and the ddr_atomic_test/c_ext_test programs (the remaining cpu_tb directed suites and the constrained-random regression are CLI-only pending a port to the OOO core) - **Architecture compliance** — 400+ tests from the official [riscv-arch-test](https://github.com/riscv-non-isa/riscv-arch-test) suite across I, M, A, F, D, C, B, K, Zicond, and Zifencei extensions, with signature comparison against Spike golden references (Verilator only, parallelized by extension in CI) - **ISA pipeline tests** — 126 self-checking tests from [riscv-tests](https://github.com/riscv-software-src/riscv-tests) across rv32ui, rv32um, rv32ua, rv32uf, rv32ud, rv32uc, rv32mi, and B-extension suites, exercising rename, wakeup, CDB arbitration, and OOO commit (Verilator only) - **Random instruction torture tests** — 20 randomly generated RV32IMAFDC instruction sequences (ALU, multiply/divide, memory, branch, FP, AMO) verified against Spike golden register signatures (Verilator only) @@ -317,7 +321,7 @@ Use a serial terminal configured for 115200 baud, 8 data bits, no parity, and | Board | FPGA | CPU Clock | Cache hierarchy → main memory | |--------------------|----------------------|-----------|---------------------------------------------| | Alveo X3522PV | UltraScale+ (xcux35) | 300 MHz | 128 KiB L1D + 16 KiB L1I → 2 MiB URAM L2 → 1 GiB DDR4 | -| Digilent Genesys2 | Kintex-7 (xc7k325t) | 133 MHz | 128 KiB L1D + 16 KiB L1I → 1 GiB DDR3 | +| Digilent Genesys2 | Kintex-7 (xc7k325t) | 133 MHz | 128 KiB L1D + 128 KiB L1I → 1 GiB DDR3 | Both boards also carry the 256 KiB fast (uncached, 1-cycle) low BRAM region and present the identical software-visible memory map: `[0, 256 KiB)` fast BRAM, @@ -333,15 +337,15 @@ controller calibrates, so software never observes an uninitialized main memory. 
| Resource | Used | Available | Util% | |----------|-----:|----------:|------:| -| CLB LUTs | 148,337 | 1,029,600 | 14.4% | -| LUT as Logic | 138,133 | 1,029,600 | 13.4% | -| LUT as Distributed RAM | 9,034 | — | — | -| LUT as Shift Register | 1,170 | — | — | -| CLB Registers | 113,144 | 2,059,200 | 5.5% | +| CLB LUTs | 149,121 | 1,029,600 | 14.5% | +| LUT as Logic | 138,878 | 1,029,600 | 13.5% | +| LUT as Distributed RAM | 9,074 | — | — | +| LUT as Shift Register | 1,169 | — | — | +| CLB Registers | 113,334 | 2,059,200 | 5.5% | | Block RAM Tile | 240 | 2,112 | 11.4% | | URAM | 64 | 352 | 18.2% | | DSPs | 35 | 1,320 | 2.6% | -| CARRY8 | 4,415 | 128,700 | 3.4% | +| CARRY8 | 4,436 | 128,700 | 3.5% | | F7 Muxes | 208 | 514,800 | 0.0% | | F8 Muxes | 49 | 257,400 | 0.0% | | Bonded IOB | 132 | 364 | 36.3% | @@ -352,12 +356,12 @@ controller calibrates, so software never observes an uninitialized main memory. | Resource | Used | Available | Util% | |----------|-----:|----------:|------:| -| Slice LUTs | 129,281 | 203,800 | 63.4% | -| LUT as Logic | 120,714 | 203,800 | 59.2% | -| LUT as Distributed RAM | 7,722 | — | — | +| Slice LUTs | 130,622 | 203,800 | 64.1% | +| LUT as Logic | 122,015 | 203,800 | 59.9% | +| LUT as Distributed RAM | 7,762 | — | — | | LUT as Shift Register | 845 | — | — | -| Slice Registers | 86,734 | 407,600 | 21.3% | -| Block RAM Tile | 189.5 | 445 | 42.6% | +| Slice Registers | 87,375 | 407,600 | 21.4% | +| Block RAM Tile | 219 | 445 | 49.2% | | DSPs | 36 | 840 | 4.3% | | F7 Muxes | 98 | 101,900 | 0.1% | | F8 Muxes | 33 | 50,950 | 0.1% | @@ -403,7 +407,7 @@ queue, store queue, CDB arbiter, FU shims) has its own README under | **CDB** | Common Data Bus (2-lane result broadcast) | | **FU** | Functional Unit (ALU, MUL/DIV, FPU, …) | | **L0 Cache** | Level-0 cache for load-use bypass | -| **L1I / L1D** | Split write-back line caches (16 KiB instruction, 128 KiB data) over the cached DDR region, through a shared 2:1 line-port arbiter | +| **L1I / L1D** | 
Split write-back line caches (16 KiB instruction on X3 / 128 KiB on Genesys2, 128 KiB data) over the cached DDR region, through a shared 2:1 line-port arbiter | | **L2 Cache** | 2 MiB UltraRAM line cache below the L1s (UltraScale+ only) | | **Cached region** | `[0x8000_0000, +1 GiB)` — code (execute-from-DDR), heap, and large data, behind L1[/L2]→DDR | | **BTB** | Branch Target Buffer (256-entry target predictor) | diff --git a/__init__.py b/__init__.py index 094cb92f..55a30449 100644 --- a/__init__.py +++ b/__init__.py @@ -15,7 +15,7 @@ """FROST - RISC-V processor package. This package contains a complete RV32GCB (G = IMAFD) RISC-V processor -implementation with full machine-mode support and additional extensions +implementation with Machine (M) and User (U) privilege modes and additional extensions (Zicsr, Zicntr, Zifencei, Zicond, Zbkb, and Zihintpause), along with verification infrastructure, build tools, and software libraries. diff --git a/boards/README.md b/boards/README.md index b42b1abe..cfbc35e8 100644 --- a/boards/README.md +++ b/boards/README.md @@ -6,7 +6,7 @@ This directory contains board-specific wrappers that enable the FROST RISC-V pro | Board | FPGA | CPU Clock | Cache hierarchy → main memory | Features | |------------------------|------------------------------------|------------|-------------------------------------------------------|--------------------------| -| [Genesys2](genesys2/) | Xilinx Kintex-7 (xc7k325t) | 133.33 MHz | 128 KiB L1D + 16 KiB L1I → 1 GiB DDR3 | Entry-level development | +| [Genesys2](genesys2/) | Xilinx Kintex-7 (xc7k325t) | 133.33 MHz | 128 KiB L1D + 128 KiB L1I → 1 GiB DDR3 | Entry-level development | | [X3](x3/) | Xilinx Alveo X3522PV (UltraScale+) | 300 MHz | 128 KiB L1D + 16 KiB L1I → 2 MiB URAM L2 → 1 GiB DDR4 | High-performance target | Both boards expose the identical software-visible memory map (256 KiB fast diff --git a/boards/genesys2/genesys2_frost.sv b/boards/genesys2/genesys2_frost.sv index e47c9218..0e519104 
100644 --- a/boards/genesys2/genesys2_frost.sv +++ b/boards/genesys2/genesys2_frost.sv @@ -196,7 +196,10 @@ module genesys2_frost ( // backed by the DDR3 controller through the AXI port below. .ENABLE_CACHED_TIER(1), .CACHED_HAS_L2(0), - .USE_BEHAVIORAL_DDR(0) + .USE_BEHAVIORAL_DDR(0), + // Bump L1I 16 KiB -> 128 KiB: hold the kernel tick/softirq/scheduler + // working set to defeat the periodic-tick catch-up livelock (no L2 here). + .L1I_CACHE_BYTES(128 * 1024) ) subsystem ( .i_clk(main_clock), .i_clk_div4(divided_clock_by_4), diff --git a/boards/xilinx_frost_subsystem.sv b/boards/xilinx_frost_subsystem.sv index 756dc332..7e824f09 100644 --- a/boards/xilinx_frost_subsystem.sv +++ b/boards/xilinx_frost_subsystem.sv @@ -32,7 +32,13 @@ module xilinx_frost_subsystem #( // 1 = the cached tier ends in the simulation-only behavioral DDR model; // 0 = it ends at the o_ddr_axi_*/i_ddr_axi_* ports below, wired to the // board's DDR controller subsystem (both boards drive 0). - parameter int unsigned USE_BEHAVIORAL_DDR = 1 + parameter int unsigned USE_BEHAVIORAL_DDR = 1, + // L1 instruction-cache size in bytes. genesys2 (L1-only, no L2) bumps this + // above the 16 KiB default so the kernel periodic-tick/softirq/scheduler + // working set stays resident, addressing the tick-livelock I$ thrash. + parameter int unsigned L1I_CACHE_BYTES = 16 * 1024, + // Optional boot-hang UART classifier. Leave off for interactive testing. 
+ parameter int unsigned ENABLE_HANG_TRIAGE = 0 ) ( input logic i_clk, // Main CPU clock input logic i_clk_div4, // Divided clock for JTAG/UART (1/4 of main clock) @@ -217,7 +223,9 @@ module xilinx_frost_subsystem #( .CLK_FREQ_HZ(CLK_FREQ_HZ), .ENABLE_CACHED_TIER(ENABLE_CACHED_TIER), .CACHED_HAS_L2(CACHED_HAS_L2), - .USE_BEHAVIORAL_DDR(USE_BEHAVIORAL_DDR) + .USE_BEHAVIORAL_DDR(USE_BEHAVIORAL_DDR), + .L1I_CACHE_BYTES(L1I_CACHE_BYTES), + .ENABLE_HANG_TRIAGE(ENABLE_HANG_TRIAGE) ) frost_processor ( .i_clk(i_clk), .i_clk_div4(i_clk_div4), diff --git a/formal/reorder_buffer.sby b/formal/reorder_buffer.sby index 7f10d441..cd51a06a 100644 --- a/formal/reorder_buffer.sby +++ b/formal/reorder_buffer.sby @@ -21,6 +21,7 @@ cover: smtbmc boolector read -formal -sv riscv_pkg.sv read -formal -sv sdp_dist_ram.sv read -formal -sv mwp_dist_ram.sv +read -formal -sv mwp_dist_ram_ohread.sv read -formal -sv rob_serializer.sv read -formal -sv reorder_buffer.sv prep -top reorder_buffer @@ -29,5 +30,6 @@ prep -top reorder_buffer ../hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv ../hw/rtl/lib/ram/sdp_dist_ram.sv ../hw/rtl/lib/ram/mwp_dist_ram.sv +../hw/rtl/lib/ram/mwp_dist_ram_ohread.sv ../hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/rob_serializer.sv ../hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.sv diff --git a/formal/store_queue.sby b/formal/store_queue.sby index af51e708..9fbca9c2 100644 --- a/formal/store_queue.sby +++ b/formal/store_queue.sby @@ -14,7 +14,7 @@ smtbmc boolector [script] read -formal -sv riscv_pkg.sv read -sv sdp_dist_ram.sv -read -sv sq_forwarding_unit.sv +read -formal -sv sq_forwarding_unit.sv read -formal -sv store_queue.sv prep -top store_queue diff --git a/formal/tomasulo_wrapper.sby b/formal/tomasulo_wrapper.sby index 8ccd2631..d1de30db 100644 --- a/formal/tomasulo_wrapper.sby +++ b/formal/tomasulo_wrapper.sby @@ -16,6 +16,7 @@ smtbmc boolector read -formal -sv riscv_pkg.sv read -formal -sv sdp_dist_ram.sv read -formal -sv mwp_dist_ram.sv +read -formal 
-sv mwp_dist_ram_ohread.sv read -sv rob_serializer.sv read -sv reorder_buffer.sv read -sv register_alias_table.sv @@ -72,6 +73,7 @@ prep -top tomasulo_wrapper ../hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv ../hw/rtl/lib/ram/sdp_dist_ram.sv ../hw/rtl/lib/ram/mwp_dist_ram.sv +../hw/rtl/lib/ram/mwp_dist_ram_ohread.sv ../hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/rob_serializer.sv ../hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.sv ../hw/rtl/cpu_and_mem/cpu/tomasulo/register_alias_table/register_alias_table.sv diff --git a/fpga/load_software/file_to_bram.tcl b/fpga/load_software/file_to_bram.tcl index 20647787..7ff2727a 100644 --- a/fpga/load_software/file_to_bram.tcl +++ b/fpga/load_software/file_to_bram.tcl @@ -18,27 +18,62 @@ # Reads hex file (one 32-bit word per line) and writes to BRAM through # JTAG-to-AXI bridge. Used for loading software without reprogramming FPGA. -proc file2bram {base_memory_address firmware_filename {axi_interface_name hw_axi_1}} { +proc _file2bram_rearm_image_load_reset {axi_interface_name base_memory_address rearm_word} { + set old_txn [get_hw_axi_txns -quiet bramrstkeep] + if {[llength $old_txn] > 0} { + delete_hw_axi_txn $old_txn + } + create_hw_axi_txn bramrstkeep [get_hw_axis $axi_interface_name] \ + -type write -address [format 0x%08x $base_memory_address] -len 1 -data $rearm_word + run_hw_axi [get_hw_axi_txns bramrstkeep] + delete_hw_axi_txn [get_hw_axi_txns bramrstkeep] +} + +proc file2bram {base_memory_address firmware_filename {axi_interface_name hw_axi_1} {batch_limit 64}} { # Open firmware file (text format: 8 hex digits per line) set file_descriptor [open $firmware_filename r] set current_address $base_memory_address set transaction_number 0 + set batch_word_count 0 + set total_words 0 + set first_word "" - # Read file line by line - each line is one 32-bit word in hexadecimal + # Read file line by line - each line is one 32-bit word in hexadecimal. 
+ # Run bounded batches so the hardware image-load reset one-shot cannot + # expire while Vivado is blocked inside one very large run_hw_axi call. while {[gets $file_descriptor word_hex_value] >= 0} { + set word_hex_value [string trim $word_hex_value] + if {$word_hex_value eq ""} { + continue + } + if {$first_word eq ""} { + set first_word $word_hex_value + } + set formatted_address [format 0x%08x $current_address] - # Create AXI write transaction for this word - create_hw_axi_txn wr$transaction_number [get_hw_axis $axi_interface_name] \ + create_hw_axi_txn bramwr$batch_word_count [get_hw_axis $axi_interface_name] \ -type write -address $formatted_address -len 1 -data $word_hex_value + incr batch_word_count incr transaction_number - # Move to next word (4 bytes) + incr total_words incr current_address 4 + + if {$batch_word_count >= $batch_limit} { + run_hw_axi [get_hw_axi_txns bramwr*] + delete_hw_axi_txn [get_hw_axi_txns bramwr*] + set batch_word_count 0 + if {$first_word ne ""} { + _file2bram_rearm_image_load_reset $axi_interface_name $base_memory_address $first_word + } + } } close $file_descriptor - # Execute all queued AXI transactions - run_hw_axi [get_hw_axi_txns] + if {$batch_word_count > 0} { + run_hw_axi [get_hw_axi_txns bramwr*] + delete_hw_axi_txn [get_hw_axi_txns bramwr*] + } - puts "Loaded $transaction_number words starting at [format 0x%08x $base_memory_address]" + puts "Loaded $total_words words starting at [format 0x%08x $base_memory_address] in bounded batches" } diff --git a/fpga/load_software/file_to_ddr.tcl b/fpga/load_software/file_to_ddr.tcl index 1a3ee52f..40c0f852 100644 --- a/fpga/load_software/file_to_ddr.tcl +++ b/fpga/load_software/file_to_ddr.tcl @@ -21,45 +21,90 @@ # Addresses are REGION-RELATIVE: offset 0 = the base of the 1 GiB cached # region (0x8000_0000 in the CPU's address map). 
The CPU must be held in # reset while this runs (the image-load reset in xilinx_frost_subsystem -# asserts on low-BRAM writes, which the loader always performs afterwards; -# the caches re-invalidate on that reset, so the freshly written DDR contents -# are never shadowed by stale lines). +# asserts on low-BRAM writes; the caches re-invalidate on that reset, so the +# freshly written DDR contents are never shadowed by stale lines). +# +# CRITICAL: the image_load_reset is a ~4 s one-shot counter re-armed by each +# low-BRAM write. A multi-MB DDR image takes much longer than 4 s to burst in, +# so a single pre-load BRAM write is NOT enough -- the counter expires +# mid-load, the CPU comes out of reset, and free-runs against the half-written +# DDR image (nondeterministic -> flaky boot hangs). When bram_axi_name is +# given we re-arm the reset with a dummy low-BRAM write before every batch of +# batch_limit bursts (sub-second << 4 s), holding the CPU in reset for the ENTIRE load. +# The DDR loader (S01) is a separate AXI master and keeps running while the CPU +# is held, so the load still completes. + +# Re-arm the image-load CPU reset with a single low-BRAM write (restarts the +# subsystem's ~4 s reset counter). Called right before every blocking DDR batch +# run so the counter can never expire mid-load and let the CPU free-run. 
+proc _rearm_image_load_reset {bram_axi_name rearm_word} { + if {$bram_axi_name eq ""} return + create_hw_axi_txn rstkeep [get_hw_axis $bram_axi_name] \ + -type write -address 0x00000000 -len 1 -data $rearm_word + run_hw_axi [get_hw_axi_txns rstkeep] + delete_hw_axi_txn [get_hw_axi_txns rstkeep] +} -proc file2ddr {firmware_filename {axi_interface_name hw_axi_2} {burst_words 256}} { +proc file2ddr {firmware_filename {axi_interface_name hw_axi_2} {burst_words 256} {bram_axi_name ""} {rearm_word "00000000"}} { set file_descriptor [open $firmware_filename r] - set words [list] - while {[gets $file_descriptor word_hex_value] >= 0} { - set trimmed [string trim $word_hex_value] - if {$trimmed ne ""} { - lappend words $trimmed - } - } - close $file_descriptor - set total_words [llength $words] + # Stream the image in burst-sized chunks. Reading the whole file into one + # giant Tcl list and indexing it per word (lindex on a multi-MB list) is + # pathologically slow in the Vivado tcl interpreter -- THAT, not the JTAG, + # is what turned a ~6 MB Linux image into a ~17 min load (the actual + # create/run/delete of all ~8.8k bursts is only ~15 s). Reading burst_words + # lines at a time keeps every list tiny, so the data-prep is ~linear and + # negligible. run+delete in batches so the live hw_axi_txn set stays bounded. + set axi [get_hw_axis $axi_interface_name] set current_address 0 set transaction_number 0 - set index 0 + set total_words 0 + set batch 0 + set batch_limit 128 ;# small batches so each blocking run_hw_axi stays well under the ~4 s reset counter + + while {1} { + # Collect up to burst_words words for this burst (skipping blank lines, + # so non-blank word N still lands at DDR offset N -- matches the old + # read-all-then-index behaviour). 
+ set chunk [list] + for {set i 0} {$i < $burst_words} {incr i} { + if {[gets $file_descriptor word_hex_value] < 0} { break } + set trimmed [string trim $word_hex_value] + if {$trimmed ne ""} { lappend chunk $trimmed } + } + set beats [llength $chunk] + if {$beats == 0} { break } - while {$index < $total_words} { - set beats [expr {min($burst_words, $total_words - $index)}] # hw_axi burst data is one bit-vector with beat 0 in the least # significant word: concatenate this burst's words last-to-first. set data "" for {set b [expr {$beats - 1}]} {$b >= 0} {incr b -1} { - append data [lindex $words [expr {$index + $b}]] + append data [lindex $chunk $b] } - set formatted_address [format 0x%08x $current_address] - create_hw_axi_txn ddrwr$transaction_number [get_hw_axis $axi_interface_name] \ - -type write -address $formatted_address -len $beats -data $data + create_hw_axi_txn ddrwr$batch $axi \ + -type write -address [format 0x%08x $current_address] -len $beats -data $data + incr batch incr transaction_number - incr index $beats + incr total_words $beats incr current_address [expr {4 * $beats}] + if {$batch >= $batch_limit} { + # Re-arm the reset IMMEDIATELY before the blocking batch run (the only + # loop step long enough to risk the ~4 s counter expiring mid-load). 
+ _rearm_image_load_reset $bram_axi_name $rearm_word + run_hw_axi [get_hw_axi_txns ddrwr*] + delete_hw_axi_txn [get_hw_axi_txns ddrwr*] + set batch 0 + puts " DDR load progress: $total_words words" + flush stdout + } } + close $file_descriptor - if {$transaction_number > 0} { + if {$batch > 0} { + _rearm_image_load_reset $bram_axi_name $rearm_word run_hw_axi [get_hw_axi_txns ddrwr*] + delete_hw_axi_txn [get_hw_axi_txns ddrwr*] } puts "Loaded $total_words DDR words in $transaction_number burst transaction(s)" diff --git a/fpga/load_software/load_software.py b/fpga/load_software/load_software.py index b835cb07..88feda8c 100755 --- a/fpga/load_software/load_software.py +++ b/fpga/load_software/load_software.py @@ -18,6 +18,7 @@ import argparse import os +import shutil import subprocess import sys from pathlib import Path @@ -47,6 +48,7 @@ *COREMARK_PRO_APP_NAMES, "csr_test", "ddr_exec_test", + "ddr_atomic_test", "ddr_heap_test", "ddr_smc_test", "ddr_test", @@ -55,8 +57,13 @@ "fpu_test", "hello_world", "isa_test", + "linux_irq_active_ddr_test", + "linux_boot", + "linux_irq_ddr_test", + "linux_irq_stack_slot_test", "memory_test", "packet_parser", + "pde_return_hazard", "print_clock_speed", "ras_stress_test", "ras_test", @@ -89,18 +96,68 @@ # that address range reads back zero. Rejected below until then. DDR_APPS = frozenset(COREMARK_PRO_APP_NAMES) | { "ddr_exec_test", + "ddr_atomic_test", "ddr_heap_test", "ddr_smc_test", "ddr_test", + "linux_irq_active_ddr_test", + "linux_boot", + "linux_irq_ddr_test", + "linux_irq_stack_slot_test", + "pde_return_hazard", } +def _linux_boot_preflight() -> None: + """Fail fast (with actionable guidance) before the long linux_boot self-build. + + linux_boot is the only app that builds a whole Linux system from source via + the Buildroot submodule, so check its prerequisites up front rather than + dying deep inside a 30-60 min build (or after prompting for a hardware + target). 
Also warn on the first, from-scratch build so the runtime is not a + surprise. + """ + buildroot_makefile = PROJECT_ROOT / "linux" / "buildroot" / "Makefile" + if not buildroot_makefile.exists(): + print( + "Error: the Buildroot submodule (linux/buildroot) is not initialized.\n" + " Run: git submodule update --init linux/buildroot", + file=sys.stderr, + ) + sys.exit(1) + + missing = [tool for tool in ("make", "dtc") if shutil.which(tool) is None] + if missing: + print( + "Error: missing host tools required to build the Linux image: " + f"{', '.join(missing)}.\n" + " Install Buildroot's host dependencies (see " + "linux/buildroot-external/README.md) or run inside the\n" + " frost-dev Docker image, which ships them.", + file=sys.stderr, + ) + sys.exit(1) + + kimage = PROJECT_ROOT / "linux" / "build" / "images" / "Image" + if not kimage.exists(): + print( + "Note: no cached kernel image found -- linux_boot will build the " + "kernel + rootfs from source now.\n" + " The FIRST build compiles a full rv32 cross toolchain and can take " + "30-60 min; later loads reuse\n" + " the cached build and only re-pack the DDR image for this board " + "(seconds).", + file=sys.stderr, + ) + + def compile_app_for_board( app_name: str, app_dir: Path, clock_freq: int, coremark_iterations: int, make_vars: dict[str, str] | None = None, + mem_config: str | None = None, ) -> bool: """Compile the application with board-specific settings. @@ -110,6 +167,7 @@ def compile_app_for_board( clock_freq: CPU clock frequency for this board coremark_iterations: Number of iterations for CoreMark make_vars: Extra make variable overrides + mem_config: If set, exported as MEM_CONFIG to relink the app (e.g. 
"ddr") Returns: True if compilation succeeded, False otherwise @@ -123,6 +181,20 @@ def compile_app_for_board( env["FPGA_CPU_CLK_FREQ"] = str(clock_freq) if app_name == "coremark": env["ITERATIONS"] = str(coremark_iterations) + # MEM_CONFIG=ddr relinks the app's code into the cached DDR region (the app + # Makefiles default to bram); this lets an arbitrary app run from DDR like + # the dedicated ddr_* apps. The Makefile's `?=` honors this env override. + if mem_config: + env["MEM_CONFIG"] = mem_config + + # linux_boot self-builds the kernel + rootfs from the Buildroot submodule on + # a clean checkout, which can take ~30-60 min the first time (a full cross + # toolchain build); every other app is a quick cross-compile. `make clean` + # for linux_boot only drops the board-dependent pack outputs (the cached + # kernel/rootfs survive), so the re-pack after clean is fast either way. + is_linux_boot = app_name == "linux_boot" + clean_timeout = 300 if is_linux_boot else 30 + build_timeout = 5400 if is_linux_boot else 120 try: # Clean first to force recompilation with new settings @@ -132,7 +204,7 @@ def compile_app_for_board( env=env, capture_output=True, text=True, - timeout=30, + timeout=clean_timeout, ) # Build with board-specific settings @@ -147,7 +219,7 @@ def compile_app_for_board( env=env, capture_output=False, # Show output text=True, - timeout=120, + timeout=build_timeout, ) if result.returncode != 0: @@ -204,6 +276,15 @@ def main() -> None: default="vivado", help="Path to Vivado executable (default: vivado from PATH)", ) + parser.add_argument( + "--ddr", + action="store_true", + help=( + "Build the app to execute from the cached DDR region (passes " + "MEM_CONFIG=ddr to the app Makefile), so an otherwise BRAM-resident " + "app runs its code from DDR. Requires a board with has_ddr." 
+ ), + ) coremark_pro_mode = parser.add_mutually_exclusive_group() coremark_pro_mode.add_argument( "-v0", @@ -349,6 +430,12 @@ def main() -> None: f"CoreMark-PRO hardware flow: {coremark_pro_error}." ) + # linux_boot builds a full Linux system from source; check its build + # prerequisites (and warn about the first-build runtime) before we prompt for + # a hardware target or kick off a long compile. + if args.software_app == "linux_boot": + _linux_boot_preflight() + # Select hardware target (may prompt user if multiple targets) # Auto-filters by vendor based on board (e.g., genesys2 -> Digilent, x3 -> Xilinx) selected_target = select_target( @@ -427,7 +514,12 @@ def main() -> None: elif args.coremark_pro_mode == "validation": print(" CoreMark-PRO run type: validation (-v1)") if not compile_app_for_board( - args.software_app, app_dir, clock_freq, coremark_iterations, make_vars + args.software_app, + app_dir, + clock_freq, + coremark_iterations, + make_vars, + mem_config="ddr" if args.ddr else None, ): print(f"Error: Failed to compile {args.software_app}", file=sys.stderr) sys.exit(1) diff --git a/fpga/load_software/load_software.tcl b/fpga/load_software/load_software.tcl index 1d812293..edfbb336 100644 --- a/fpga/load_software/load_software.tcl +++ b/fpga/load_software/load_software.tcl @@ -39,10 +39,10 @@ set coremark_pro_apps [list coremark_pro_core coremark_pro_cjpeg \ # Valid software applications (mirrors load_software.py VALID_APPS) set valid_apps [list branch_pred_test c_ext_test call_stress cf_ext_test coremark \ - {*}$coremark_pro_apps csr_test ddr_exec_test ddr_heap_test \ + {*}$coremark_pro_apps csr_test ddr_atomic_test ddr_exec_test ddr_heap_test \ ddr_smc_test ddr_test freertos_demo fpu_assembly_test fpu_test \ - hello_world isa_test memory_test \ - packet_parser print_clock_speed ras_stress_test ras_test \ + hello_world isa_test linux_irq_active_ddr_test linux_boot linux_irq_ddr_test linux_irq_stack_slot_test memory_test \ + packet_parser 
pde_return_hazard print_clock_speed ras_stress_test ras_test \ spanning_test sprintf_test strings_test tomasulo_perf \ tomasulo_test uart_echo] @@ -165,10 +165,14 @@ set bram_base_address 0x00000000 set ddr_text_file ${project_root}/sw/apps/${firmware_application_name}/sw_ddr.txt # DDR image first (when present): assert the image-load CPU reset with a -# single low-BRAM write, then burst the DDR image through hw_axi_2. The CPU -# stays in reset until well after the subsequent full BRAM load, and the -# caches re-invalidate on release, so the fresh DDR contents are never -# shadowed by stale lines or racing writebacks. +# low-BRAM write, then burst the DDR image through hw_axi_2 while RE-ARMING +# that reset periodically (file2ddr pokes bram_axi before each batch of bursts). +# The image_load_reset is only a ~4 s one-shot, far shorter than a multi-MB DDR +# load, so without the periodic re-arm the CPU would leave reset mid-load and +# free-run against the half-written DDR image (nondeterministic boot hangs). +# With it the CPU stays in reset until well after the subsequent full BRAM +# load, and the caches re-invalidate on release, so the fresh DDR contents are +# never shadowed by stale lines or racing writebacks. if { $has_ddr && $ddr_axi ne "" && [file exists $ddr_text_file] && [file size $ddr_text_file] > 12 } { set first_word_fd [open $firmware_text_file r] gets $first_word_fd first_word @@ -177,8 +181,8 @@ if { $has_ddr && $ddr_axi ne "" && [file exists $ddr_text_file] && [file size $d -type write -address 0x00000000 -len 1 -data $first_word run_hw_axi [get_hw_axi_txns rst_assert] set ddr_word_count [expr {[file size $ddr_text_file] / 9}] - puts "Loading ~${ddr_word_count} words into DDR via ${ddr_axi} (bursts)..." - file2ddr $ddr_text_file $ddr_axi + puts "Loading ~${ddr_word_count} words into DDR via ${ddr_axi} (bursts, CPU held in reset)..." + file2ddr $ddr_text_file $ddr_axi 256 $bram_axi $first_word } # Write software to low BRAM starting at address 0. 
diff --git a/fpga/sweep_coremark_pro.py b/fpga/sweep_coremark_pro.py index a8e4d64e..4cad3afb 100755 --- a/fpga/sweep_coremark_pro.py +++ b/fpga/sweep_coremark_pro.py @@ -20,8 +20,19 @@ (clean rebuild with the official registry args + JTAG load) on the selected board while holding the board UART open, then applies the strict pass rule to the captured output: ``<>`` present, no ``ERROR``/``<>``/``<>``, -and every ``:fails=N`` counter zero. Each workload's ``time(secs)`` is extracted -for the summary table. Exits 0 only if every app passes. +and every ``:fails=N`` counter zero. Each workload's ``time(secs)`` and +``iterations`` are extracted and reduced to iter/s for the summary table. +Exits 0 only if every app passes. + +A full passing -v0 sweep also reports the official CoreMark-PRO score: each +workload's iter/s is multiplied by its scale factor and divided by its +reference-platform score, and the mark is 1000 x the geometric mean of the +nine normalized results (EEMBC Symmetric Multicore Benchmark User Guide 2.1.4 +sec. 4.4 p.12, identical to coremark-pro's util/perl/cert_mark.pl). FROST is +single-core, so the single-context result is both the SingleCore and MultiCore +mark. -v1 sweeps print iter/s but no score (verification runs are not +score-eligible), and -v0 workloads finishing under the ~10s score-rule minimum +get a warning to recalibrate their registry iteration count. The target board is chosen with the required ``--board`` flag (``x3`` or ``genesys2``); both expose all nine hardware-supported workloads. With no app @@ -30,6 +41,9 @@ The UART device (``--serial``) and JTAG target (``--target``) default per board (X3: /dev/ttyUSB2; genesys2: /dev/ttyUSB0); override either with its flag. +The sweep refuses to start while another process holds the UART open, and +holds the port in exclusive mode (TIOCEXCL) while running -- a second reader +(e.g. a forgotten minicom) would silently steal chunks of the capture. 
Examples (from the repo root): @@ -45,6 +59,9 @@ import argparse import collections +import fcntl +import glob +import math import os import re import select @@ -59,7 +76,10 @@ REPO_DEFAULT = SCRIPT_DIR.parent sys.path.insert(0, str(REPO_DEFAULT / "sw" / "apps")) -from software_registry import COREMARK_PRO_PROGRAMS # noqa: E402 +from software_registry import ( # noqa: E402 + COREMARK_PRO_PROGRAM_BY_APP, + COREMARK_PRO_PROGRAMS, +) HW_APPS = tuple(p.app_name for p in COREMARK_PRO_PROGRAMS if p.hardware_supported) @@ -96,10 +116,109 @@ # avoid misreading a prior run's <>/time as this run's result. LOAD_COMPLETE_SENTINEL = "FROST_LOAD_COMPLETE" +# Official CoreMark-PRO scoring constants: workload -> (scale factor, +# reference-platform score), from the EEMBC Symmetric Multicore Benchmark User +# Guide 2.1.4 sec. 4.4 Figure 10 and coremark-pro util/perl/cert_mark.pl (the +# two agree). A workload's normalized result is iter/s * scale / reference, +# and the mark is 1000 x the geometric mean of the nine normalized results. +COREMARK_PRO_REFERENCE = { + "cjpeg-rose7-preset": (1.0, 40.3438), + "core": (10000.0, 2855.0), + "linear_alg-mid-100x100-sp": (1.0, 38.5624), + "loops-all-mid-10k-sp": (1.0, 0.87959), + "nnet_test": (1.0, 1.45853), + "parser-125k": (1.0, 4.81116), + "radix2-big-64k": (1.0, 99.6587), + "sha-test": (1.0, 48.5201), + "zip-test": (1.0, 21.3618), +} + +# Minimum -v0 workload runtime for an official score run; the registry +# calibrates each workload's iteration count to clear this. +SCORE_RULE_MIN_SECS = 10.0 + +# mith prints time(secs) with %8g: usually plain decimal, but accept the +# exponent form %g falls back to for extreme values. +MITH_NUMBER = r"([0-9]+(?:\.[0-9]*)?(?:[eE][+-]?[0-9]+)?)" + + +def parse_workload_perf(serial_buf: str, workload: str) -> dict[str, Any]: + """Extract the workload-level iterations/time(secs) pair and derive iter/s. + + Mirrors coremark-pro's util/perl/results_parser.pl (iter/s = iterations / + time(secs)). 
Anchoring on the official workload name keeps -v1 per-item + lines out of the match: mith prints the workload-level block first, and + only that block has an ``iterations=`` line. + """ + name = re.escape(workload) + iters_match = re.search(rf"-- {name}:iterations=([0-9]+)", serial_buf) + secs_match = re.search(rf"-- {name}:time\(secs\)=\s*{MITH_NUMBER}", serial_buf) + iterations = int(iters_match.group(1)) if iters_match else None + secs = float(secs_match.group(1)) if secs_match else None + ips = None + if iterations and secs and secs > 0: + ips = iterations / secs + return {"iterations": iterations, "secs": secs, "ips": ips} + + +def coremark_pro_mark( + ips_by_workload: dict[str, float], +) -> tuple[float | None, list[str]]: + """Compute the official CoreMark-PRO mark from per-workload iter/s. + + Returns (mark, []) when every official workload has a positive iter/s, + else (None, sorted missing workload names) -- the mark is only defined + over the full set of nine. + """ + missing = sorted( + workload + for workload in COREMARK_PRO_REFERENCE + if not ips_by_workload.get(workload) + ) + if missing: + return None, missing + log_sum = 0.0 + for workload, (scale, reference) in COREMARK_PRO_REFERENCE.items(): + log_sum += math.log(ips_by_workload[workload] * scale / reference) + return 1000.0 * math.exp(log_sum / len(COREMARK_PRO_REFERENCE)), [] + + +def serial_holders(path: str) -> list[str]: + """Return 'pid: cmdline' for other processes holding the serial device. + + The tty layer delivers each received byte to exactly one reader, so a + second attached process (a forgotten minicom, an old capture script) + steals random chunks of the UART stream and silently corrupts the + sweep's capture. Scans /proc, so it only sees same-user processes. 
+ """ + try: + target = os.stat(path).st_rdev + except OSError: + return [] + holders = set() + for fd_link in glob.glob("/proc/[0-9]*/fd/*"): + pid = fd_link.split("/")[2] + if pid == str(os.getpid()): + continue + try: + if os.stat(fd_link).st_rdev != target: + continue + with open(f"/proc/{pid}/cmdline", "rb") as f: + cmdline = f.read().replace(b"\0", b" ").decode().strip() + except OSError: + continue + holders.add(f"pid {pid}: {cmdline or ''}") + return sorted(holders) + def configure_serial(path: str) -> int: - """Open the UART raw/non-blocking at 115200 8N1 and flush stale bytes.""" + """Open the UART raw/non-blocking at 115200 8N1 and flush stale bytes. + + The port is put in exclusive mode (TIOCEXCL) so a terminal opened + mid-sweep gets EBUSY instead of silently stealing capture bytes. + """ fd = os.open(path, os.O_RDWR | os.O_NOCTTY | os.O_NONBLOCK) + fcntl.ioctl(fd, termios.TIOCEXCL) attrs = termios.tcgetattr(fd) attrs[0] = 0 attrs[1] = 0 @@ -154,6 +273,8 @@ def run_one( target: str, ) -> dict[str, Any]: """Load one app on the given board and watch the UART until a marker/timeout.""" + program = COREMARK_PRO_PROGRAM_BY_APP.get(app) + workload = program.workload if program else None drain(serial_fd) cmd = [ "./fpga/load_software/load_software.py", @@ -241,9 +362,13 @@ def consume_loader(text: str) -> None: if proc.returncode != 0: return { "app": app, + "workload": workload, "mode": mode, "status": "LOAD_FAIL", "elapsed": None, + "iterations": None, + "secs": None, + "ips": None, "serial": serial_buf, "loader_tail": list(loader_tail), } @@ -283,16 +408,89 @@ def consume_loader(text: str) -> None: if match: workload_time = float(match.group(1)) + perf = ( + parse_workload_perf(serial_buf, workload) + if workload + else {"iterations": None, "secs": None, "ips": None} + ) + return { "app": app, + "workload": workload, "mode": mode, "status": status, "elapsed": workload_time, + **perf, "serial": serial_buf, "loader_tail": list(loader_tail), } +def 
print_score_report(results: list[dict[str, Any]], mode: str) -> None: + """Print the per-workload iter/s table and, for a -v0 sweep, the mark.""" + rows = [r for r in results if r["workload"]] + if not rows: + return + + print("\nCoreMark-PRO WORKLOAD RESULTS (single context)") + print( + f"{'Workload Name':<27} {'Status':>9} {'iters':>6} " + f"{'time(s)':>10} {'iter/s':>12} {'weighted':>10}" + ) + print(f"{'-' * 27} {'-' * 9} {'-' * 6} {'-' * 10} {'-' * 12} {'-' * 10}") + for r in rows: + scale_ref = COREMARK_PRO_REFERENCE.get(r["workload"]) + iters_text = "n/a" if r["iterations"] is None else str(r["iterations"]) + secs_text = "n/a" if r["secs"] is None else f"{r['secs']:.4f}" + ips_text = "n/a" if r["ips"] is None else f"{r['ips']:.6g}" + weighted_text = "n/a" + if r["ips"] is not None and scale_ref is not None: + weighted_text = f"{r['ips'] * scale_ref[0] / scale_ref[1]:.6g}" + print( + f"{r['workload']:<27} {r['status']:>9} {iters_text:>6} " + f"{secs_text:>10} {ips_text:>12} {weighted_text:>10}" + ) + print( + "weighted = iter/s x scale / reference-platform score " + "(EEMBC guide 2.1.4 sec. 4.4 Fig. 10)" + ) + + if mode == "-v1": + print( + "\nCoreMark-PRO score: n/a for -v1 validation sweeps (verification " + "runs are not score-eligible); rerun with -v0." 
+ ) + return + + for r in rows: + if ( + r["status"] == "PASS" + and r["secs"] is not None + and r["secs"] < SCORE_RULE_MIN_SECS + ): + print( + f"warning: {r['workload']} ran {r['secs']:.1f}s, under the " + f"~{SCORE_RULE_MIN_SECS:.0f}s score-rule minimum; recalibrate " + "its iteration count in sw/apps/software_registry.py" + ) + + ips_by_workload = { + r["workload"]: r["ips"] for r in rows if r["status"] == "PASS" and r["ips"] + } + score, missing = coremark_pro_mark(ips_by_workload) + if score is None: + print( + "\nCoreMark-PRO score: n/a -- the official mark needs a passing " + f"iter/s from all 9 workloads; missing: {', '.join(missing)}" + ) + else: + print(f"\nCoreMark-PRO score (single context): {score:.2f}") + print( + " 1000 x geomean of the 9 weighted results; single core, so " + "SingleCore == MultiCore" + ) + + def main() -> int: """Run the sweep and print the summary table.""" parser = argparse.ArgumentParser( @@ -307,7 +505,7 @@ def main() -> int: const="-v0", help=( "longer performance/score sweep: runs the registry-calibrated " - "iteration counts" + "iteration counts and computes the official CoreMark-PRO score" ), ) mode_group.add_argument( @@ -387,6 +585,18 @@ def main() -> int: serial = args.serial if args.serial else DEFAULT_SERIALS[args.board] timeout = args.timeout if args.timeout is not None else DEFAULT_TIMEOUTS[args.board] + holders = serial_holders(serial) + if holders: + print( + f"ERROR: {serial} is already open in another process, which would " + "steal chunks of the UART capture:", + file=sys.stderr, + ) + for holder in holders: + print(f" {holder}", file=sys.stderr) + print("Close it (or pass another --serial) and re-run.", file=sys.stderr) + return 1 + fd = configure_serial(serial) results = [] try: @@ -408,6 +618,12 @@ def main() -> int: f"{result['status']} time={result['elapsed']}", flush=True, ) + if result["status"] == "PASS" and result["ips"] is None: + print( + "warning: PASS but iterations/time(secs) missing from the " + 
"capture -- UART bytes lost?", + flush=True, + ) if result["status"] == "LOAD_FAIL": print("loader tail:", flush=True) print("\n".join(result["loader_tail"]), flush=True) @@ -417,7 +633,11 @@ def main() -> int: bad = [r for r in results if r["status"] != "PASS"] print(f"\nSUMMARY ({args.board})") for r in results: - print(f"{args.board} {r['app']} {r['mode']} {r['status']} time={r['elapsed']}") + line = f"{args.board} {r['app']} {r['mode']} {r['status']} time={r['elapsed']}" + if r["ips"] is not None: + line += f" iter/s={r['ips']:.6g}" + print(line) + print_score_report(results, args.mode) return 1 if bad else 0 diff --git a/hw/rtl/README.md b/hw/rtl/README.md index ddb83852..bcfeba10 100644 --- a/hw/rtl/README.md +++ b/hw/rtl/README.md @@ -4,7 +4,7 @@ This directory contains the synthesizable SystemVerilog for FROST. The current CPU is an **out-of-order RV32GCB implementation with a 2-wide front-end and 2-wide commit**: a 2-wide in-order IF/PD/ID front-end, Tomasulo register renaming and dynamic scheduling, out-of-order execution across six function units, and -precise 2-wide in-order commit, with machine-mode traps and separate +precise 2-wide in-order commit, with M/U-mode traps and separate instruction/data memory ports. The pipeline width is **asymmetric**. Fetch, decode, rename, ROB allocation, @@ -88,7 +88,7 @@ backend notes. 
| `cpu_and_mem/cpu/csr/` | In use | Zicsr/Zicntr/fcsr support | | `cpu_and_mem/cpu/wb_stage/generic_regfile.sv` | In use | Parameterized INT/FP regfiles for OOO commit | | `cpu_and_mem/cpu/ex_stage/` | In use | Shared ALU, multiplier/divider, FPU, and `branch_jump_unit.sv` used by the OOO core and FU shims | -| `cpu_and_mem/cpu/control/trap_unit.sv` | In use | Machine-mode exception/interrupt handling | +| `cpu_and_mem/cpu/control/trap_unit.sv` | In use | M- and U-mode exception/interrupt handling (traps taken in M-mode) | | `lib/` | In use | Portable RAM/FIFO/stall helper primitives, plus `lib/cache/` (the `frost_cache` hierarchy, AXI bridge, and behavioral DDR model) and `lib/ram/sdp_ram_byte_en.sv` (row-granular byte-enable RAM with a selectable block/ultra primitive backing the cache data arrays) | | `peripherals/` | In use | UART TX/RX blocks | @@ -102,7 +102,7 @@ served by the cache hierarchy: |--------|---------|------|-------------| | ROM | `0x0000_0000` | 96 KiB | Code and read-only data (fast BRAM) | | RAM | `0x0001_8000` | 160 KiB | Data, BSS, stack (fast BRAM) | -| MMIO | `0x4000_0000` | 44 B | UART, FIFOs, CLINT-style timer, software interrupt | +| MMIO | `0x4000_0000` | 112 KiB | UART/FIFOs/timer; plus Linux-facing ns16550a UART (`0x4000_1000`) and SiFive CLINT (`0x4001_0000`) | | DDR | `0x8000_0000` | 1 GiB | Cached region: code (`.ddr_text`), heap and large data (see below) | The cached tier serves both sides of the core: loads/stores through the @@ -146,10 +146,22 @@ MMIO registers: | `0x4000_0020` | MSIP | Machine software interrupt pending | | `0x4000_0024` | UART_RX_STATUS | Bit 0 is data available | | `0x4000_0028` | UART_TX_STATUS | Bit 0 is can accept byte | +| `0x4000_1000`–`101C` | ns16550a UART face | 16550 register file (word stride) aliasing UART_TX/RX for the Linux 8250 driver | +| `0x4001_0000` | CLINT MSIP | SiFive CLINT alias of MSIP | +| `0x4001_4000`/`4004` | CLINT MTIMECMP_LO/HI | SiFive CLINT alias of MTIMECMP | +| 
`0x4001_BFF8`/`BFFC` | CLINT MTIME_LO/HI | SiFive CLINT alias of MTIME | The hardware UART console is configured for 115200 baud, 8 data bits, no parity, and 1 stop bit (8N1). +For no-MMU Linux, the same UART is also reachable through a standard +ns16550a register face at `0x4000_1000` (word stride; device-tree +`reg-shift=2`, `reg-io-width=4`; `earlycon=uart8250,mmio32`), and the timer +through a SiFive-CLINT-compatible window at `0x4001_0000` (`mtimecmp` at +`+0x4000`, `mtime` at `+0xBFF8`). Both alias the native registers listed +above onto the same hardware, so the in-tree Linux 8250 console and CLINT +timer drivers work without a board-specific driver. + If these addresses change, update `cpu_and_mem.sv`, `cpu_ooo.sv` parameters, `sw/common/link.ld`, `sw/lib/include/mmio.h`, and the verification constants in `verif/config.py`. @@ -161,7 +173,8 @@ From the repo root: ```bash # Cocotb/Verilator simulation ./tests/test_run_cocotb.py hello_world -./tests/test_run_cocotb.py cpu +./tests/test_run_cocotb.py tomasulo_test +./tests/test_run_cocotb.py --list-tests # show all registered tests # Open-source RTL synthesis checks ./tests/test_run_yosys.py @@ -180,7 +193,7 @@ sed -n '1,200p' hw/rtl/frost.f The CPU build file list is: ```bash -sed -n '1,200p' hw/rtl/cpu_and_mem/cpu/cpu_ooo.f +sed -n '1,200p' hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.f ``` ## Parameters @@ -199,7 +212,7 @@ sed -n '1,200p' hw/rtl/cpu_and_mem/cpu/cpu_ooo.f | `frost.sv` | `DDR_MODEL_BYTES` / `DDR_MODEL_LATENCY` | `64 MiB` / `30` | Behavioral DDR model size and access latency (simulation) | | `frost.sv` | `FETCH_VALID_FUZZ` | `0` | Simulation-only: 1 wraps the low BRAM in a variable-latency fetch model (LFSR fetch-valid gaps) that mirrors the L1I provider's fetch contract; hardware keeps 0 | | `cpu_ooo.sv` | `MMIO_ADDR` | `32'h4000_0000` | MMIO base | -| `cpu_ooo.sv` | `MMIO_SIZE_BYTES` | `32'h2C` | MMIO range size | +| `cpu_ooo.sv` | `MMIO_SIZE_BYTES` | `32'h2C` | MMIO range size; `cpu_and_mem.sv` 
overrides to `32'h1_C000` (covers the ns16550a face + CLINT alias) | Simulation overrides parameters through Verilator generics (`-G`): the test Makefile enables the cached tier with the X3 hierarchy shape by default diff --git a/hw/rtl/cpu_and_mem/cpu/README.md b/hw/rtl/cpu_and_mem/cpu/README.md index 4f3e8166..2532d352 100644 --- a/hw/rtl/cpu_and_mem/cpu/README.md +++ b/hw/rtl/cpu_and_mem/cpu/README.md @@ -104,7 +104,7 @@ instruction size. | `if_stage/`, `pd_stage/`, `id_stage/` | **In use** | Reused front-end stages, including BTB/direction/RAS prediction, PD BTB-miss redirects, and RVC handling. IF now drives a stall-capable, variable-latency fetch seam (NOP bubbles + a 1-deep owed-ask while unserved) so code can run from the cached DDR region as well as low BRAM; the seam's `fetch_provider` (low-BRAM fast path vs. a two-line L1I fetch buffer with predecode-on-fill) lives one level up in `cpu_and_mem/`. | | `wb_stage/` | **In use** | Only the parameterized regfile is in the OOO build (instantiated twice for INT / FP). | | `csr/` | **In use** | Zicsr / Zicntr / fcsr. CSR ops are decoded in ID but read and write the CSR at commit through the ROB serializing FSM. | -| `control/trap_unit.sv` | **In use** | Machine-mode exception/interrupt handling used by `cpu_ooo.sv`. | +| `control/trap_unit.sv` | **In use** | M- and U-mode exception/interrupt handling (traps taken in M-mode) used by `cpu_ooo.sv`. | | `ex_stage/` | **In use** | `branch_jump_unit.sv` is instantiated directly at top level. ALU/MUL/DIV/FPU are used via the FU shims in `tomasulo/fu_shims/`. | `cpu_ooo.f` is the authoritative filelist for what actually gets compiled. 
diff --git a/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv b/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv index 089e4583..446fe9e1 100644 --- a/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv +++ b/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv @@ -15,18 +15,21 @@ */ /* - * Trap Unit - Machine-mode exception and interrupt handling + * Trap Unit - exception and interrupt handling * * This module implements the RISC-V privileged architecture trap mechanism, * supporting both synchronous exceptions and asynchronous interrupts. + * Traps originate from M-mode or U-mode and are always taken in M-mode (mtvec). + * Machine interrupts are taken while running in U-mode regardless of mstatus.MIE, + * so the timer can preempt user code. * * Responsibilities: * ================= - * - Exception detection from EX stage (ECALL, EBREAK, misaligned access) + * - Exception handling from ROB commit (ECALL, EBREAK, misaligned access) * - Interrupt prioritization and masking * - Trap entry: save state, redirect to mtvec * - Trap exit (MRET): restore state, return to mepc - * - WFI: stall until interrupt pending + * - WFI state machine (unused in cpu_ooo; see WFI Behavior below) * * Trap Priority (highest to lowest): * ================================== @@ -62,6 +65,8 @@ * - Stall pipeline until any interrupt is pending * - Resume at next instruction if interrupt not taken * - Take trap if interrupt is both pending and enabled + * - NOTE: unused in cpu_ooo -- i_wfi_start is tied to 0 and o_stall_for_wfi + * is unconnected; WFI stalling is handled by ROB serialization at the head * * Related Modules: * - csr_file.sv: Provides mstatus/mie/mtvec/mepc, receives trap updates @@ -96,6 +101,10 @@ module trap_unit #( // Direct MIE bit input keeps mstatus bit extraction out of this path. input logic i_mstatus_mie_direct, + // Current privilege mode. Machine interrupts are taken whenever running + // below M (priv != PrivM) regardless of mstatus.MIE (RISC-V privileged spec). 
+ input logic [1:0] i_priv, + // Interrupt pending inputs input riscv_pkg::interrupt_t i_interrupts, @@ -104,6 +113,7 @@ module trap_unit #( input logic [XLEN-1:0] i_exception_cause, input logic [XLEN-1:0] i_exception_tval, input logic [XLEN-1:0] i_exception_pc, + input logic [XLEN-1:0] i_interrupt_pc, // MRET trap-return request input logic i_mret_start, @@ -136,18 +146,39 @@ module trap_unit #( assign mie_msie = i_mie[riscv_pkg::MieMsiBit]; // Register trap_taken for one cycle to prevent it from re-asserting immediately - // after CSR update (breaks combinational loop with mstatus_mie) + // after CSR update (breaks combinational loop with mstatus_mie). Also keep + // a one-cycle MRET recovery marker: CSR privilege/MIE state changes on the + // raw MRET pulse, while the OOO front/back-end flush is registered one cycle + // later. During that handoff, an old registered interrupt must not trap with + // mepc equal to the MRET instruction itself. logic trap_taken_prev; + logic mret_taken_prev; always_ff @(posedge i_clk) begin - if (i_rst) trap_taken_prev <= 1'b0; - else trap_taken_prev <= o_trap_taken; + if (i_rst) begin + trap_taken_prev <= 1'b0; + mret_taken_prev <= 1'b0; + end else begin + trap_taken_prev <= o_trap_taken; + mret_taken_prev <= o_mret_taken; + end end - // Interrupt pending and enabled (gate by !trap_taken_prev to prevent re-entry) + logic mret_interrupt_inhibit; + assign mret_interrupt_inhibit = i_mret_start || mret_taken_prev; + + // Interrupt pending and enabled (gate by !trap_taken_prev to prevent re-entry). + // Global M-interrupt enable: mstatus.MIE while in M, but ALWAYS enabled while + // running below M (priv != PrivM) so a machine timer/SW/ext interrupt can + // preempt U-mode even with MIE=0 (RISC-V privileged spec). 
+ logic m_int_globally_enabled; + assign m_int_globally_enabled = mstatus_mie || (i_priv != riscv_pkg::PrivM); logic meip_enabled, mtip_enabled, msip_enabled; - assign meip_enabled = i_interrupts.meip && mie_meie && mstatus_mie && !trap_taken_prev; - assign mtip_enabled = i_interrupts.mtip && mie_mtie && mstatus_mie && !trap_taken_prev; - assign msip_enabled = i_interrupts.msip && mie_msie && mstatus_mie && !trap_taken_prev; + assign meip_enabled = i_interrupts.meip && mie_meie && m_int_globally_enabled && + !trap_taken_prev && !mret_interrupt_inhibit; + assign mtip_enabled = i_interrupts.mtip && mie_mtie && m_int_globally_enabled && + !trap_taken_prev && !mret_interrupt_inhibit; + assign msip_enabled = i_interrupts.msip && mie_msie && m_int_globally_enabled && + !trap_taken_prev && !mret_interrupt_inhibit; // TIMING OPTIMIZATION: Register interrupt_pending to break critical path. // The combinational path from msip -> interrupt_pending -> take_trap -> stall -> cache @@ -156,11 +187,53 @@ module trap_unit #( // Note: mtip is already registered in cpu_and_mem.sv for similar timing reasons. logic interrupt_pending_comb; logic interrupt_pending; - assign interrupt_pending_comb = meip_enabled || mtip_enabled || msip_enabled; + // Gate with !o_trap_taken so a still-pending interrupt is NOT re-latched on + // the cycle its own trap is taken. interrupt_pending is registered, so + // otherwise the latched value fires a second, spurious trap entry the next + // cycle (re-saving mstatus.MPP=M and corrupting a U-mode trap). NOT a comb + // loop: o_trap_taken derives from the REGISTERED interrupt_pending, so the + // feedback path passes through a flop. + assign interrupt_pending_comb = (meip_enabled || mtip_enabled || msip_enabled) && !o_trap_taken; + + // Source-level qualification: pending AND locally enabled (mie.x) and not in the + // cycle after a trap entry (!trap_taken_prev) -- but NOT gated by the live global mstatus.MIE. 
+ // + // Once interrupt_pending has been LATCHED (while fully eligible, MIE=1), a + // YOUNGER csr clear of mstatus.MIE (e.g. the kernel idle `csrsi; ...; csrci`) + // must not retroactively erase it: the interrupt was eligible at an instruction + // boundary the csr-clear is younger than, so per the spec it is taken (the + // csr-clear is squashed by the trap). interrupt_pending is registered (1-cycle + // late) and interrupt_pending_eligible re-checks the LIVE global enable, so + // without a hold a csr-clear's delayed mstatus.MIE side-effect lands in the + // sample-to-service gap, drops interrupt_pending_comb, and clears the + // already-qualified bit -> the interrupt is LOST. On the no-MMU kernel that + // dropped machine-timer tick freezes jiffies and hangs the boot. (Usually the + // service is delayed one cycle by a draining store via i_sq_committed_empty, + // widening the window.) Hold across a global-MIE drop; still release when the + // source itself de-qualifies (mtip/meip/msip drops or mie.x cleared) or the + // trap is taken, so masking and acks behave normally. + // interrupt_source_live: a REAL, current interrupt source exists -- pending AND + // locally enabled (mie.x), gated ONLY by !trap_taken_prev. NOT gated by the live + // global mstatus.MIE and NOT by mret_interrupt_inhibit, so a persistent timer is + // HELD across both a global-MIE drop AND the MRET-recovery window rather than + // erased. It is still never TAKEN there (interrupt_pending_eligible keeps + // !mret_interrupt_inhibit + live m_int_globally_enabled), and the 0x80388bba + // panic stays guarded by the cpu_ooo interrupt_resume_pc seed on mret_taken (not + // by this latch) -- per commit 718f8cc the seed is THE panic fix and the old + // trap_unit MRET/interrupt cancel was incidental bring-up timing. A stale sample + // whose source has dropped (source_live=0) is still cleared, preserving the + // "cancel a stale one-cycle sample before MRET" property. 
+ logic interrupt_source_live; + assign interrupt_source_live = + ((i_interrupts.meip && mie_meie) || (i_interrupts.mtip && mie_mtie) || + (i_interrupts.msip && mie_msie)) && !trap_taken_prev; always_ff @(posedge i_clk) begin if (i_rst) interrupt_pending <= 1'b0; - else interrupt_pending <= interrupt_pending_comb; + else if (interrupt_pending_comb) interrupt_pending <= 1'b1; // latch when fully eligible + else if (interrupt_pending && interrupt_source_live && !o_trap_taken) + interrupt_pending <= 1'b1; // hold a live source across a global-MIE drop AND MRET inhibit + else interrupt_pending <= 1'b0; // clear stale (no live source) / on take end // Register synchronous exceptions from the ROB head before trap entry. @@ -177,6 +250,12 @@ module trap_unit #( exception_pending <= 1'b0; end else if (o_trap_taken) begin exception_pending <= 1'b0; + end else if (trap_taken_prev) begin + // Hold cleared one extra cycle: i_exception_valid (the ROB's trap_pending) + // stays high until the trap is acked (~1 cycle after o_trap_taken), so + // without this the exception re-arms and the trap is taken a second time + // (now in M, corrupting mstatus.MPP / mcause for a U-mode trap). + exception_pending <= 1'b0; end else if (i_exception_valid) begin exception_pending <= 1'b1; exception_cause_q <= i_exception_cause; @@ -241,23 +320,53 @@ module trap_unit #( end always_ff @(posedge i_clk) begin - interrupt_cause <= interrupt_cause_comb; + // Hold the cause while interrupt_pending is held (across a global-MIE drop or + // the MRET inhibit); interrupt_cause_comb is built from the gated *_enabled so + // it decays to 0 there, which would default interrupt_latched_source_enabled + // false and leave the held interrupt ineligible when it can finally trap. 
+ if (interrupt_cause_comb != '0) interrupt_cause <= interrupt_cause_comb; + else if (interrupt_pending && interrupt_source_live) interrupt_cause <= interrupt_cause; + else interrupt_cause <= '0; end + // A registered interrupt request must still be enabled when it reaches the + // trap decision. This keeps raw interrupt inputs out of the take_trap cone, + // while allowing CSR writes such as Linux's ret_from_exception mstatus + // restore to cancel a stale one-cycle interrupt sample before MRET. + logic interrupt_latched_source_enabled; + always_comb begin + unique case (interrupt_cause) + riscv_pkg::IntMachineExternal: interrupt_latched_source_enabled = mie_meie; + riscv_pkg::IntMachineSoftware: interrupt_latched_source_enabled = mie_msie; + riscv_pkg::IntMachineTimer: interrupt_latched_source_enabled = mie_mtie; + default: interrupt_latched_source_enabled = 1'b0; + endcase + end + + logic interrupt_pending_eligible; + assign interrupt_pending_eligible = interrupt_pending && + interrupt_latched_source_enabled && + m_int_globally_enabled && + !trap_taken_prev && + !mret_interrupt_inhibit; + // Trap taken: either interrupt or exception, the pipeline not stalled // (except for WFI stall, which should be broken by interrupt), and no // committed store still draining (see i_sq_committed_empty). logic take_trap; - assign take_trap = (interrupt_pending || exception_pending) && !i_pipeline_stall && + assign take_trap = (interrupt_pending_eligible || exception_pending) && + !i_pipeline_stall && i_sq_committed_empty; - // MRET execution (trap has priority: if interrupt/exception fires same cycle, trap wins) + // MRET execution. Synchronous exceptions are structurally impossible with + // MRET at the ROB head; pending interrupts are deferred across the MRET + // recovery window above so the return redirect stays precise. 
logic take_mret; assign take_mret = i_mret_start && !i_pipeline_stall && !take_trap && i_sq_committed_empty; // Hold commit while a trap/MRET waits out the store drain, so the // committed set shrinks monotonically and the wait is bounded. - assign o_trap_drain_wait = (interrupt_pending || exception_pending || i_mret_start) && + assign o_trap_drain_wait = (interrupt_pending_eligible || exception_pending || i_mret_start) && !i_sq_committed_empty; // Output trap signals @@ -272,7 +381,7 @@ module trap_unit #( o_trap_target = i_mepc; end else if (take_trap) begin // Check mtvec mode - if (i_mtvec[1:0] == 2'b01 && interrupt_pending) begin + if (i_mtvec[1:0] == 2'b01 && interrupt_pending_eligible) begin // Vectored mode for interrupts: BASE + 4*cause_code // Use pre-computed small offset (6 bits) for faster timing than // extracting from full interrupt_cause which synthesis can't optimize @@ -289,11 +398,13 @@ module trap_unit #( // Trap entry information for CSR file // Interrupts have priority over synchronous exceptions always_comb begin - if (interrupt_pending) begin + if (interrupt_pending_eligible) begin o_trap_cause = interrupt_cause; o_trap_value = '0; // Interrupts have mtval = 0 - // For interrupts, save PC of next instruction (the one that will be interrupted) - o_trap_pc = i_exception_pc; + // For interrupts, save the precise architectural resume PC. The live + // ROB head PC can be transient or stale while an async interrupt drains + // through the registered commit path. + o_trap_pc = i_interrupt_pc; end else begin o_trap_cause = exception_cause_q; o_trap_value = exception_tval_q; @@ -318,9 +429,9 @@ module trap_unit #( assume (!(i_mret_start && i_exception_valid)); assume (!(i_wfi_start && i_mret_start)); assume (!(i_wfi_start && i_exception_valid)); - // Note: MRET + interrupt_pending is NOT assumed away. 
The RTL handles it - // by giving trap priority (!take_trap gate on take_mret), and the - // p_trap_mret_mutex assertion proves this without over-constraining. + // Note: MRET + interrupt_pending is NOT assumed away. MRET wins that race; + // the pending interrupt is re-sampled after the return redirect has had + // time to retire the MRET precisely. end always @(posedge i_clk) begin @@ -329,7 +440,8 @@ module trap_unit #( p_trap_mret_mutex : assert (!(o_trap_taken && o_mret_taken)); // Trap needs source: trap_taken requires interrupt or exception. - p_trap_needs_source : assert (!o_trap_taken || (interrupt_pending || exception_pending)); + p_trap_needs_source : + assert (!o_trap_taken || (interrupt_pending_eligible || exception_pending)); // Trap not during stall: traps only fire when pipeline not stalled. p_trap_not_stalled : assert (!o_trap_taken || !i_pipeline_stall); @@ -344,6 +456,11 @@ module trap_unit #( // MRET target is mepc: when MRET fires, target must be mepc. p_mret_target : assert (!o_mret_taken || (o_trap_target == i_mepc)); + // A pending interrupt must not preempt the MRET instruction itself. + if (i_mret_start && !exception_pending) begin + p_mret_defers_interrupt : assert (!o_trap_taken); + end + // WFI stall contract: if stall_for_wfi_comb, wfi must be active. 
p_wfi_stall_needs_active : assert (!stall_for_wfi_comb || wfi_active); end @@ -403,8 +520,8 @@ module trap_unit #( cover_wfi_stall : cover (stall_for_wfi_comb); cover_wfi_wakeup : cover (f_past_valid && !wfi_active && $past(wfi_active)); cover_external_interrupt : - cover (interrupt_pending && interrupt_cause == riscv_pkg::IntMachineExternal); - cover_exception : cover (o_trap_taken && i_exception_valid && !interrupt_pending); + cover (interrupt_pending_eligible && interrupt_cause == riscv_pkg::IntMachineExternal); + cover_exception : cover (o_trap_taken && i_exception_valid && !interrupt_pending_eligible); cover_trap_after_drain : cover (f_past_valid && o_trap_taken && $past(o_trap_drain_wait)); end end diff --git a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/branch_recovery/branch_resolution.sv b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/branch_recovery/branch_resolution.sv index 298a03e4..4f065d9d 100644 --- a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/branch_recovery/branch_resolution.sv +++ b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/branch_recovery/branch_resolution.sv @@ -86,15 +86,27 @@ module branch_resolution #( logic [riscv_pkg::ReorderBufferTagWidth:0] branch_issue_age; logic [riscv_pkg::ReorderBufferTagWidth:0] early_flush_age; logic [riscv_pkg::ReorderBufferTagWidth:0] commit_flush_age; + // TIMING: compare-then-mux instead of mux-then-compare. The original form + // muxed the 5-bit owner tag by checkpoint_id and THEN compared against + // rob_tag (8:1 x 5b mux + 5b compare in series). Computing the per- + // checkpoint live bit first lets all eight in_use+owner-tag compares run in + // parallel straight out of the checkpoint registers, leaving only a 1-bit + // 8:1 select behind checkpoint_id. Pure boolean identity — for every + // checkpoint_id value the selected bit is exactly the original expression. 
+ logic [riscv_pkg::NumCheckpoints-1:0] checkpoint_live_per_id; always_comb begin - branch_issue_checkpoint_live = 1'b1; - if (rs_issue_int.has_checkpoint) begin + for (int i = 0; i < riscv_pkg::NumCheckpoints; i++) begin // Use the registered checkpoint state here to avoid a feedback loop // through execute-time checkpoint free. The owner-tag check still // filters out stale/reused checkpoint IDs. - branch_issue_checkpoint_live = - checkpoint_in_use[rs_issue_int.checkpoint_id] && - (checkpoint_owner_tag[rs_issue_int.checkpoint_id] == rs_issue_int.rob_tag); + checkpoint_live_per_id[i] = + checkpoint_in_use[i] && (checkpoint_owner_tag[i] == rs_issue_int.rob_tag); + end + end + always_comb begin + branch_issue_checkpoint_live = 1'b1; + if (rs_issue_int.has_checkpoint) begin + branch_issue_checkpoint_live = checkpoint_live_per_id[rs_issue_int.checkpoint_id]; end end @@ -133,10 +145,22 @@ module branch_resolution #( // suppress_branch_resolution → is_branch_issue → branch comparison (CARRY8) // → branch_update → commit_en created a 16-level combinational chain that // was the WNS critical path (-0.739 ns). Removing it is safe because: - // (a) commit_en already has a direct branch_update collision guard that - // delays commit when the same branch resolves and commits in one cycle; - // (b) resolution writes to entries that will be flushed are harmless; - // (c) early_mispredict_fire still gates on the candidate directly. 
+ // (a) a resolving branch can never BE the committing head: branches have + // no CDB done-bypass (reorder_buffer head_cdb_bypass excludes + // head_is_branch), so a branch's done bit is registered and it can + // only be head_ready the cycle AFTER its branch_update; + // (b) resolution writes to entries that will be flushed are harmless -- + // flush-after-head invalidates them next cycle, allocation re-inits + // the branch bits, and the unresolved-branch counter resets on + // flush_pipeline; + // (c) an early_mispredict_fire coinciding with a head-mispredict commit + // is DROPPED one cycle later: early_mispredict_active gates on + // !mispredict_recovery_pending (early_misprediction_recovery.sv), + // which registers the commit-time recovery launch, so the early + // pulse dies before any redirect / RAT restore / rob_early_recovered + // write / backend flush. (The former fire-time candidate gate was + // removed for timing; o_head_commit_misprediction_candidate is now + // an unconsumed observation output.) end assign suppress_branch_resolution = branch_issue_is_flushed; diff --git a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/branch_recovery/early_misprediction_recovery.sv b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/branch_recovery/early_misprediction_recovery.sv index 1740e9fb..bac1f730 100644 --- a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/branch_recovery/early_misprediction_recovery.sv +++ b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/branch_recovery/early_misprediction_recovery.sv @@ -97,8 +97,14 @@ module early_misprediction_recovery #( logic early_mispredict_fire; logic early_mispredict_pending; logic early_mispredict_active; - logic early_backend_recovery_pending; - logic [riscv_pkg::ReorderBufferTagWidth-1:0] early_backend_flush_tag; + // TIMING: this single FF broadcast into ~1300 failing endpoints post-opt + // (flush_en -> RS/LQ/SQ/ROB kill and capture gating). Cap the fanout so + // synthesis replicates the register per consumer region. 
Replication only + // — D input, resets, and the sacred recovery conditions are untouched. + (* max_fanout = 48 *) logic early_backend_recovery_pending; + // TIMING: flush-tag broadcast feeding per-entry age compares across the + // backend (CDB kill, LQ/RS squash). Same register-replication treatment. + (* max_fanout = 48 *) logic [riscv_pkg::ReorderBufferTagWidth-1:0] early_backend_flush_tag; // Captured data from the mispredicting branch logic [riscv_pkg::ReorderBufferTagWidth-1:0] early_mispredict_tag; diff --git a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/branch_recovery/misprediction_flush_controller.sv b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/branch_recovery/misprediction_flush_controller.sv index bdc6c5cd..080ddbdf 100644 --- a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/branch_recovery/misprediction_flush_controller.sv +++ b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/branch_recovery/misprediction_flush_controller.sv @@ -121,9 +121,14 @@ module misprediction_flush_controller #( logic dispatch_flush; logic full_flush_side_effect_kill; logic frontend_state_flush; - logic flush_en; - logic [riscv_pkg::ReorderBufferTagWidth-1:0] flush_tag; - logic flush_all; + // TIMING: flush_en / flush_tag / flush_all broadcast into the whole backend + // (ROB commit gate, RS/LQ/SQ kills, RAT). They are shallow functions of + // registered recovery state, so cap the fanout and let synthesis replicate + // the driver LUTs per consumer region. Pure fanout splitting — the + // priority structure below is untouched. 
+ (* max_fanout = 64 *) logic flush_en; + (* max_fanout = 64 *) logic [riscv_pkg::ReorderBufferTagWidth-1:0] flush_tag; + (* max_fanout = 64 *) logic flush_all; logic commit_recovery_flush_after_head; logic checkpoint_restore; logic [riscv_pkg::CheckpointIdWidth-1:0] checkpoint_restore_id; diff --git a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv index 20fb096e..fe8d7648 100644 --- a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv +++ b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv @@ -46,6 +46,7 @@ module cpu_ooo #( input logic [63:0] i_instr, // 64-bit fetch: {next_word, current_word} input logic [riscv_pkg::ImemFetchSidebandWidth-1:0] i_instr_sideband, input logic i_instr_bank_sel_r, // Fetch-word parity (for spanning select) + input logic [31:0] i_served_addr, // Served fetch-window tag (served-window guard) // Fetch window valid (see if_stage). Tie 1 for fixed 1-cycle providers. input logic i_instr_valid, // Stall-replay bundle consumed this cycle (see if_stage) -- the fetch @@ -104,6 +105,10 @@ module cpu_ooo #( // Interrupts input riscv_pkg::interrupt_t i_interrupts, input logic [63:0] i_mtime, + output logic [5:0] o_debug_irq_status, + output logic [XLEN-1:0] o_debug_commit_pc, + output logic [XLEN-1:0] o_debug_commit_2_pc, + output logic [1:0] o_debug_commit_valid, // Debug input logic i_disable_branch_prediction ); @@ -298,6 +303,39 @@ module cpu_ooo #( logic [riscv_pkg::ReorderBufferTagWidth-1:0] dbg_rat_alloc_rob_tag /* verilator public_flat_rd */; logic [XLEN-1:0] dbg_last_a0_alloc_pc /* verilator public_flat_rd */; logic [riscv_pkg::ReorderBufferTagWidth-1:0] dbg_last_a0_alloc_tag /* verilator public_flat_rd */; + logic dbg_trap_taken_raw /* verilator public_flat_rd */; + logic dbg_trap_taken_q /* verilator public_flat_rd */; + logic [XLEN-1:0] dbg_trap_cause_internal /* verilator public_flat_rd */; + logic [XLEN-1:0] dbg_trap_pc_internal /* verilator public_flat_rd */; + logic [XLEN-1:0] dbg_interrupt_resume_pc /* 
verilator public_flat_rd */; + logic dbg_port0_int_we /* verilator public_flat_rd */; + logic [riscv_pkg::RegAddrWidth-1:0] dbg_port0_int_addr /* verilator public_flat_rd */; + logic [XLEN-1:0] dbg_port0_int_data /* verilator public_flat_rd */; + logic dbg_port1_int_we /* verilator public_flat_rd */; + logic [riscv_pkg::RegAddrWidth-1:0] dbg_port1_int_addr /* verilator public_flat_rd */; + logic [XLEN-1:0] dbg_port1_int_data /* verilator public_flat_rd */; + logic dbg_commit_dest_valid /* verilator public_flat_rd */; + logic dbg_commit_dest_rf /* verilator public_flat_rd */; + logic [riscv_pkg::RegAddrWidth-1:0] dbg_commit_dest_reg /* verilator public_flat_rd */; + logic [XLEN-1:0] dbg_commit_value /* verilator public_flat_rd */; + logic dbg_commit_2_valid /* verilator public_flat_rd */; + logic [XLEN-1:0] dbg_commit_2_pc /* verilator public_flat_rd */; + logic dbg_commit_2_dest_valid /* verilator public_flat_rd */; + logic dbg_commit_2_dest_rf /* verilator public_flat_rd */; + logic [riscv_pkg::RegAddrWidth-1:0] dbg_commit_2_dest_reg /* verilator public_flat_rd */; + logic [XLEN-1:0] dbg_commit_2_value /* verilator public_flat_rd */; + logic dbg_rob_commit_reg_valid /* verilator public_flat_rd */; + logic [XLEN-1:0] dbg_rob_commit_reg_pc /* verilator public_flat_rd */; + logic dbg_rob_commit_reg_dest_valid /* verilator public_flat_rd */; + logic dbg_rob_commit_reg_dest_rf /* verilator public_flat_rd */; + logic [riscv_pkg::RegAddrWidth-1:0] dbg_rob_commit_reg_dest_reg /* verilator public_flat_rd */; + logic [XLEN-1:0] dbg_rob_commit_reg_value /* verilator public_flat_rd */; + logic dbg_rob_commit_2_reg_valid /* verilator public_flat_rd */; + logic [XLEN-1:0] dbg_rob_commit_2_reg_pc /* verilator public_flat_rd */; + logic dbg_rob_commit_2_reg_dest_valid /* verilator public_flat_rd */; + logic dbg_rob_commit_2_reg_dest_rf /* verilator public_flat_rd */; + logic [riscv_pkg::RegAddrWidth-1:0] dbg_rob_commit_2_reg_dest_reg /* verilator public_flat_rd */; + logic 
[XLEN-1:0] dbg_rob_commit_2_reg_value /* verilator public_flat_rd */; // verilog_lint: waive-stop line-length `endif @@ -407,6 +445,7 @@ module cpu_ooo #( .i_instr, .i_instr_sideband, .i_instr_bank_sel_r, + .i_served_addr, .i_instr_valid, .o_fetch_replay_consume, .i_from_ex_comb(from_ex_comb_synth), @@ -636,6 +675,11 @@ module cpu_ooo #( logic rob_commit_2_store_like_raw; logic rob_commit_2_valid; assign rob_commit_2_valid = rob_commit_2.valid; + logic rob_commit_store_like_raw; + logic sq_committed_empty_for_trap; + assign rob_commit_store_like_raw = + rob_commit_valid_raw && + (rob_commit_comb.is_store || rob_commit_comb.is_fp_store || rob_commit_comb.is_sc); logic widen_commit_ok; assign widen_commit_ok = 1'b1; logic [riscv_pkg::ReorderBufferDepth-1:0] rob_entry_epoch; @@ -839,7 +883,20 @@ module cpu_ooo #( logic trap_pending; logic trap_mret_commit_hold_q; logic [XLEN-1:0] rob_trap_pc; + logic rob_head_is_wfi; // ROB head decodes as WFI (drives the WFI interrupt-resume-PC seed) + // Retired-next-PC precompute from the ROB (TIMING): equals + // retired_next_pc(rob_commit_comb) / (rob_commit_comb_2) whenever the + // corresponding commit valid is high, but computed from ungated head fields + // so the RAM read + adder are off the late commit_en cone. + logic [XLEN-1:0] rob_head_retired_next_pc; + logic [XLEN-1:0] rob_head_next_retired_next_pc; riscv_pkg::exc_cause_t rob_trap_cause; + riscv_pkg::exc_cause_t rob_trap_cause_remapped; + logic [1:0] csr_priv; // current privilege from csr_file (PrivM/PrivU) + // Arbitrated trap cause from trap_unit (interrupt cause with bit 31, or the + // remapped synchronous-exception cause) -> csr_file mcause. Declared here so + // it is visible above the trap_unit instantiation that drives it. 
+ logic [XLEN-1:0] trap_cause_internal; logic [XLEN-1:0] rob_trap_value; logic rob_trap_taken_ack; logic mret_start, mret_done_ack; @@ -1007,6 +1064,9 @@ module cpu_ooo #( .i_alloc_req_2(rob_alloc_req_2), .o_alloc_resp_2(rob_alloc_resp_2), + // Current privilege (PrivM/PrivU) for U-mode CSR/MRET illegal checks + .i_priv(csr_priv), + .o_cdb_grant(cdb_grant), .o_cdb(cdb_out), @@ -1042,6 +1102,9 @@ module cpu_ooo #( .i_csr_done(csr_done_ack), .o_trap_pending(trap_pending), .o_trap_pc(rob_trap_pc), + .o_head_is_wfi(rob_head_is_wfi), + .o_head_retired_next_pc(rob_head_retired_next_pc), + .o_head_next_retired_next_pc(rob_head_next_retired_next_pc), .o_trap_cause(rob_trap_cause), .o_trap_value(rob_trap_value), .i_trap_taken(rob_trap_taken_ack), @@ -1630,6 +1693,42 @@ module cpu_ooo #( // The wrapper already provides a registered observation port for commit. assign rob_commit_valid = rob_commit.valid; +`ifndef SYNTHESIS + assign dbg_trap_taken_raw = trap_taken; + assign dbg_trap_taken_q = trap_taken_reg; + assign dbg_trap_cause_internal = trap_cause_internal; + assign dbg_trap_pc_internal = trap_pc_internal; + assign dbg_interrupt_resume_pc = interrupt_resume_pc; + assign dbg_port0_int_we = port0_int_we; + assign dbg_port0_int_addr = port0_int_addr; + assign dbg_port0_int_data = port0_int_data; + assign dbg_port1_int_we = port1_int_we; + assign dbg_port1_int_addr = port1_int_addr; + assign dbg_port1_int_data = port1_int_data; + assign dbg_commit_dest_valid = rob_commit_comb.dest_valid; + assign dbg_commit_dest_rf = rob_commit_comb.dest_rf; + assign dbg_commit_dest_reg = rob_commit_comb.dest_reg; + assign dbg_commit_value = rob_commit_comb.value[XLEN-1:0]; + assign dbg_commit_2_valid = rob_commit_comb_2.valid; + assign dbg_commit_2_pc = rob_commit_comb_2.pc; + assign dbg_commit_2_dest_valid = rob_commit_comb_2.dest_valid; + assign dbg_commit_2_dest_rf = rob_commit_comb_2.dest_rf; + assign dbg_commit_2_dest_reg = rob_commit_comb_2.dest_reg; + assign dbg_commit_2_value = 
rob_commit_comb_2.value[XLEN-1:0]; + assign dbg_rob_commit_reg_valid = rob_commit.valid; + assign dbg_rob_commit_reg_pc = rob_commit.pc; + assign dbg_rob_commit_reg_dest_valid = rob_commit.dest_valid; + assign dbg_rob_commit_reg_dest_rf = rob_commit.dest_rf; + assign dbg_rob_commit_reg_dest_reg = rob_commit.dest_reg; + assign dbg_rob_commit_reg_value = rob_commit.value[XLEN-1:0]; + assign dbg_rob_commit_2_reg_valid = rob_commit_2.valid; + assign dbg_rob_commit_2_reg_pc = rob_commit_2.pc; + assign dbg_rob_commit_2_reg_dest_valid = rob_commit_2.dest_valid; + assign dbg_rob_commit_2_reg_dest_rf = rob_commit_2.dest_rf; + assign dbg_rob_commit_2_reg_dest_reg = rob_commit_2.dest_reg; + assign dbg_rob_commit_2_reg_value = rob_commit_2.value[XLEN-1:0]; +`endif + // DEBUG: verify early recovery redirect_pc matches commit-time redirect_pc // (Disabled for performance — re-enable for debugging.) // always @(posedge i_clk) begin @@ -1858,6 +1957,25 @@ module cpu_ooo #( endcase end + // ECALL cause is privilege-dependent (U-mode = 8, M-mode = 11). The FU shim + // tags every ECALL as ExcEcallMmode (it has no architectural privilege), so + // remap at commit using the current privilege. The remapped cause is fed to + // trap_unit.i_exception_cause; trap_unit's arbitrated cause + // (trap_cause_internal) is what csr_file writes to mcause -- the load-bearing + // path. FROST vectors mtvec on interrupts only, never on exception causes. The + // csr_trap_value (mtval) mux above intentionally keeps the ORIGINAL cause + // (ECALL mtval is 0 either way). + // + // SAFE against the cause==11 / IntMachineExternal (0x8000_000B) low-bit + // collision: rob_trap_cause carries synchronous-exception causes ONLY (ROB + // o_trap_cause = head_exc_cause; the ROB's i_interrupt_pending is WFI-wakeup + // only, never a cause source), so a value of 11 here is unambiguously an + // M-mode ECALL. 
+ assign rob_trap_cause_remapped = + ((rob_trap_cause == riscv_pkg::ExcEcallMmode[riscv_pkg::ExcCauseWidth-1:0]) && + (csr_priv == riscv_pkg::PrivU)) ? + riscv_pkg::ExcEcallUmode[riscv_pkg::ExcCauseWidth-1:0] : rob_trap_cause; + csr_file #( .XLEN(XLEN) ) csr_file_inst ( @@ -1874,8 +1992,11 @@ module cpu_ooo #( .i_interrupts(i_interrupts), .i_mtime(i_mtime), .i_trap_taken(trap_taken), - .i_trap_pc(rob_trap_pc), - .i_trap_cause({{(XLEN - $bits(rob_trap_cause)) {1'b0}}, rob_trap_cause}), + .i_trap_pc(trap_pc_internal), + // mcause from trap_unit's arbitrated cause: interrupt cause (with the + // interrupt bit) for interrupts, or the remapped exception cause (which + // carries the U-mode ECALL remap via trap_unit.i_exception_cause below). + .i_trap_cause(trap_cause_internal), .i_trap_value(csr_trap_value), .i_mret_taken(mret_taken), .o_mstatus(csr_mstatus), @@ -1883,6 +2004,7 @@ module cpu_ooo #( .o_mtvec(csr_mtvec), .o_mepc(csr_mepc), .o_mstatus_mie_direct(csr_mstatus_mie_direct), + .o_priv(csr_priv), // FP flags: accumulated from ROB commit .i_fp_flags(rob_commit_fp_flags_merged), .i_fp_flags_valid(rob_commit_any_fp_flags_valid), @@ -1920,7 +2042,93 @@ module cpu_ooo #( assign interrupt_pending = i_interrupts.meip || i_interrupts.mtip || i_interrupts.msip; logic [XLEN-1:0] trap_target_internal, trap_pc_internal; - logic [XLEN-1:0] trap_cause_internal, trap_value_internal; + logic [XLEN-1:0] trap_value_internal; + logic [XLEN-1:0] interrupt_resume_pc; + + function automatic logic [XLEN-1:0] retired_next_pc( + input riscv_pkg::reorder_buffer_commit_t commit); + logic [XLEN-1:0] step; + begin + step = commit.is_compressed ? 
{{(XLEN - 2) {1'b0}}, 2'b10} : {{(XLEN - 3) {1'b0}}, 3'b100}; + if (commit.is_branch || commit.is_mret) begin + retired_next_pc = commit.redirect_pc; + end else begin + retired_next_pc = commit.pc + step; + end + end + endfunction + + always_ff @(posedge i_clk) begin + if (i_rst) begin + interrupt_resume_pc <= '0; + end else if (mret_taken) begin + // An MRET retires through the trap/MRET full flush, NOT the normal commit + // path: the cycle after o_mret_taken, flush_all (from mret_taken_reg) + // wipes the ROB head and gates commit_en, so the MRET never appears on + // rob_commit_valid_raw and never updates interrupt_resume_pc via the + // branches below. Without this seed, interrupt_resume_pc keeps the + // architectural next-PC of the instruction *before* the MRET -- which is + // the MRET instruction's own PC -- for the entire MRET-to-U window (until + // the first post-MRET instruction commits). A machine interrupt taken + // after privilege drops below M (eligible once the trap_unit inhibit + // lifts, ~2 cycles later, long before that first commit) would then save + // mepc = the MRET's own PC, an M-mode handler address, which Linux later restores + // and MRETs to illegally in U-mode (the ret_from_exception 0x80388bba + // panic). Seed the resume PC from the MRET target (mepc, == the MRET + // redirect target) now so it is already correct before the inhibit + // window closes. csr_mepc is stable here: MRET does not write mepc and + // cannot coincide with a trap entry that would. + interrupt_resume_pc <= csr_mepc; + end else if (rob_commit_2_valid_raw) begin + // TIMING: identical value to retired_next_pc(rob_commit_comb_2) in every + // cycle this arm is taken (checked below in simulation), but the ROB + // precomputes it from ungated head+1 fields so the PC RAM read + 32-bit + // add do not sit behind the late commit gating. 
+ interrupt_resume_pc <= rob_head_next_retired_next_pc; + end else if (rob_commit_valid_raw) begin + // TIMING: identical value to retired_next_pc(rob_commit_comb); see above. + interrupt_resume_pc <= rob_head_retired_next_pc; + end else if (rob_head_is_wfi && head_valid) begin + // Bug#2 (drain-gated WFI mepc): while a WFI stalls at the ROB head, the + // architectural resume PC is always wfi_pc+4 (WFI never redirects). Seed it + // here so that if a machine interrupt is taken at the WFI -- including the + // narrow window where a committed store finishes draining and take_trap + // fires the same cycle, before the WFI's own commit can advance + // interrupt_resume_pc -- mepc is the spec-required wfi_pc+4 rather than the + // pre-WFI instruction's next-PC (== wfi_pc). Lowest priority: a real commit + // (incl. a dual-commit retiring the WFI and its successor) always wins, and + // WFI is never compressed so +4 is exact. Mirrors the mret_taken seed above. + interrupt_resume_pc <= rob_trap_pc + 32'd4; + end + end + +`ifndef SYNTHESIS + // Equivalence check for the ROB retired-next-PC precompute: whenever a + // commit fires, the precomputed value must match the original + // retired_next_pc() derivation from the (gated) commit payload. + always @(posedge i_clk) begin + if (!i_rst) begin + if (rob_commit_valid_raw && rob_head_retired_next_pc != retired_next_pc( + rob_commit_comb + )) begin + $error("cpu_ooo: rob_head_retired_next_pc %08x != retired_next_pc(commit) %08x", + rob_head_retired_next_pc, retired_next_pc(rob_commit_comb)); + end + if (rob_commit_2_valid_raw && rob_head_next_retired_next_pc != retired_next_pc( + rob_commit_comb_2 + )) begin + $error("cpu_ooo: rob_head_next_retired_next_pc %08x != retired_next_pc(commit_2) %08x", + rob_head_next_retired_next_pc, retired_next_pc(rob_commit_comb_2)); + end + end + end +`endif + + // A same-cycle store-like ROB commit is not yet in the SQ committed set. 
+ // If a trap full-flushes here, the registered commit can be masked before + // SQ observes it. Delay trap/MRET one cycle so SQ can own and drain it. + assign sq_committed_empty_for_trap = + sq_committed_empty && !rob_commit_store_like_raw && !rob_commit_2_store_like_raw; trap_unit #( .XLEN(XLEN) @@ -1928,19 +2136,23 @@ module cpu_ooo #( .i_clk, .i_rst, .i_pipeline_stall(1'b0), // OOO: no stall for trap check - .i_sq_committed_empty(sq_committed_empty), + .i_sq_committed_empty(sq_committed_empty_for_trap), .o_trap_drain_wait(trap_drain_wait), .i_mstatus(csr_mstatus), .i_mie(csr_mie), .i_mtvec(csr_mtvec), .i_mepc(csr_mepc), .i_mstatus_mie_direct(csr_mstatus_mie_direct), + .i_priv(csr_priv), .i_interrupts(i_interrupts), // Exception from ROB commit .i_exception_valid(trap_pending), - .i_exception_cause({{(XLEN - $bits(rob_trap_cause)) {1'b0}}, rob_trap_cause}), + .i_exception_cause({ + {(XLEN - $bits(rob_trap_cause_remapped)) {1'b0}}, rob_trap_cause_remapped + }), .i_exception_tval('0), .i_exception_pc(rob_trap_pc), + .i_interrupt_pc(interrupt_resume_pc), .i_mret_start(mret_start), .i_wfi_start(1'b0), // WFI handled by ROB serialization .o_trap_taken(trap_taken), @@ -1972,6 +2184,15 @@ module cpu_ooo #( assign rob_trap_taken_ack = trap_taken_reg; assign mret_done_ack = mret_taken_reg; + // Passive on-silicon debug tap for the top-level hang triage UART. Packed as: + // [5]=mret, [4]=trap, [3:2]=priv, [1]=mstatus.MIE, [0]=mie.MTIE. 
+ assign o_debug_irq_status = { + mret_taken, trap_taken, csr_priv, csr_mstatus_mie_direct, csr_mie[riscv_pkg::MieMtiBit] + }; + assign o_debug_commit_pc = rob_commit.pc; + assign o_debug_commit_2_pc = rob_commit_2.pc; + assign o_debug_commit_valid = {rob_commit_2.valid, rob_commit.valid}; + // =========================================================================== // Profiling Counter Aggregation // =========================================================================== diff --git a/hw/rtl/cpu_and_mem/cpu/csr/csr_file.sv b/hw/rtl/cpu_and_mem/cpu/csr/csr_file.sv index 73acfdc8..15c783e6 100644 --- a/hw/rtl/cpu_and_mem/cpu/csr/csr_file.sv +++ b/hw/rtl/cpu_and_mem/cpu/csr/csr_file.sv @@ -15,7 +15,7 @@ */ /* - CSR (Control and Status Register) File for RISC-V Zicsr + Zicntr + Machine-mode + F extensions. + CSR (Control and Status Register) File for RISC-V Zicsr + Zicntr + Machine/User-mode + F extensions. This module implements: @@ -31,9 +31,9 @@ - instret/instreth (0xC02/0xC82): Instructions retired counter (64-bit) - minstret/minstreth (0xB02/0xB82): Machine-mode alias for instret counter - Machine-mode CSRs (for trap/interrupt handling): - - mstatus (0x300): Machine status (MIE, MPIE bits) - - misa (0x301): Machine ISA (read-only, reports RV32IMAFB) + Machine-mode CSRs (for trap/interrupt handling; M and U privilege modes): + - mstatus (0x300): Machine status (MIE, MPIE bits; MPP WARL field {M, U}; MPRV bit, inert) + - misa (0x301): Machine ISA (read-only, reports RV32GCB + U: 0x4010_112F) - mie (0x304): Machine interrupt enable (MEIE, MTIE, MSIE) - mtvec (0x305): Machine trap vector base address - mscratch (0x340): Machine scratch register @@ -67,7 +67,7 @@ module csr_file #( // counter must increment by the retire count. 
input logic [1:0] i_instruction_retired_count, - // Interrupt pending inputs (directly from peripherals) + // Interrupt pending inputs (meip/mtip registered upstream in cpu_and_mem; msip direct) input riscv_pkg::interrupt_t i_interrupts, // mtime input (from memory-mapped timer) @@ -91,6 +91,11 @@ module csr_file #( // Direct output of mstatus MIE bit for timing and simpler consumers. output logic o_mstatus_mie_direct, + // Current privilege mode (PrivM/PrivU): consumed by trap_unit (interrupt + // enable while in U) and the commit-time ECALL cause select. Changes only + // on trap entry and MRET. + output logic [1:0] o_priv, + // F extension: FP exception flags from FPU (to accumulate in fflags) input riscv_pkg::fp_flags_t i_fp_flags, input logic i_fp_flags_valid, // Valid when FP instruction retires (gated by o_vld) @@ -140,8 +145,14 @@ module csr_file #( // do not require read/modify/write of the full CSR word. logic mstatus_mie; // Machine Interrupt Enable (bit 3) logic mstatus_mpie; // Machine Previous Interrupt Enable (bit 7) - logic [XLEN-1:0] mstatus; // Constructed from mie and mpie - assign mstatus = {19'b0, 2'b11, 3'b0, mstatus_mpie, 3'b0, mstatus_mie, 3'b0}; + logic [ 1:0] mstatus_mpp; // Previous Privilege [12:11]; WARL {PrivM,PrivU} + logic mstatus_mprv; // Modify PRiV (bit 17); stored but inert (no PMP/MMU) + logic [ 1:0] priv_q; // Current privilege mode (resets to PrivM) + logic [XLEN-1:0] mstatus; // Constructed from the fields above + assign mstatus = { + 14'b0, mstatus_mprv, 4'b0, mstatus_mpp, 3'b0, mstatus_mpie, 3'b0, mstatus_mie, 3'b0 + }; + assign o_priv = priv_q; // mie CSR: store each interrupt enable as separate register logic mie_msie; // Machine Software Interrupt Enable (bit 3) @@ -153,6 +164,9 @@ module csr_file #( // Next-state signals for mstatus bits (computed combinationally) logic next_mstatus_mie; logic next_mstatus_mpie; + logic [1:0] next_mstatus_mpp; + logic next_mstatus_mprv; + logic [1:0] next_priv; // Next-state signals for 
mie bits logic next_mie_msie; logic next_mie_mtie; @@ -169,10 +183,11 @@ module csr_file #( logic [XLEN-1:0] mip; assign mip = {20'b0, i_interrupts.meip, 3'b0, i_interrupts.mtip, 3'b0, i_interrupts.msip, 3'b0}; - // misa is read-only: RV32IMAFB - // Bit 0 (A), Bit 1 (B), Bit 5 (F), Bit 8 (I), Bit 12 (M) = 0x0000_1123 + // misa is read-only: RV32IMAFDC + B + U (= RV32GCB with User mode) + // Bit 0 (A), Bit 1 (B), Bit 2 (C), Bit 3 (D), Bit 5 (F), Bit 8 (I), Bit 12 (M), + // Bit 20 (U) = 0x0010_112F // MXL = 1 (32-bit) in bits [31:30] - localparam logic [XLEN-1:0] MisaValue = 32'h4000_1123; + localparam logic [XLEN-1:0] MisaValue = 32'h4010_112F; // Output CSRs for trap unit assign o_mstatus = mstatus; @@ -237,12 +252,46 @@ module csr_file #( // ========================================================================== // Instructions Retired Counter // ========================================================================== + // TIMING RETIME (+1 cycle, architecturally invisible — analysis below): + // the per-cycle retire count arrives late (its !trap_taken suppression sits + // at the end of the commit/trap serialization cone) and previously entered + // the LSB of a 64-bit carry chain, making instret[63]/D the post-opt WNS + // (-0.94 ns at 300 MHz). Stage the FULLY-GATED count through + // instruction_retired_count_q so the late cone terminates at a 2-bit + // register; the 64-bit add then runs register-to-register. + // + // Invariant: instret_counter at cycle T equals the total retire count + // through cycle T-2 (one staging cycle) instead of T-1. 
Architecturally + // invisible because the ONLY observation of instret is a CSR read of + // instret/instreth/minstret{,h}, and CSR reads are commit-serialized: + // cycle C: the youngest instruction OLDER than the CSR read commits + // (commit_en); its count is computed at C+1 from the + // REGISTERED commit bus (commit_actions), staged into + // instruction_retired_count_q at the C+1->C+2 edge, and + // accumulated into instret_counter at the C+2->C+3 edge; + // cycle C+1: the CSR reaches the ROB head; rob_serializer asserts + // commit_stall and requests CSR execution (o_csr_start); + // cycle C+2: earliest csr_done_ack (1-cycle handshake in cpu_ooo) -> + // earliest CSR commit_en; + // cycle C+3: csr_commit_fire (registered commit) performs the actual + // csr_file read -> observes a counter that already includes + // cycle C's commits. + // Every stall (head not ready, commit_hold, later csr_done) only adds + // margin, and the reading instruction itself is never included — exactly + // as in the un-retimed design, whose own count also landed after the read. + // The staged count preserves the !trap_taken suppression bit-for-bit (the + // gated count is registered as-is: the same instructions are counted, one + // cycle later). Proven in the FORMAL section (p_instret_stage_follows / + // p_instret_applies_staged_count). 
+ logic [1:0] instruction_retired_count_q; always_ff @(posedge i_clk) begin if (i_rst) begin + instruction_retired_count_q <= 2'd0; instret_counter <= 64'd0; end else begin - instret_counter <= instret_counter + 64'(i_instruction_retired_count); + instruction_retired_count_q <= i_instruction_retired_count; + instret_counter <= instret_counter + 64'(instruction_retired_count_q); end end @@ -318,22 +367,35 @@ module csr_file #( // Default: keep current values next_mstatus_mie = mstatus_mie; next_mstatus_mpie = mstatus_mpie; + next_mstatus_mpp = mstatus_mpp; + next_mstatus_mprv = mstatus_mprv; + next_priv = priv_q; next_mie_msie = mie_msie; next_mie_mtie = mie_mtie; next_mie_meie = mie_meie; if (i_trap_taken) begin - // Trap entry: save MIE to MPIE, clear MIE + // Trap entry: save MIE->MPIE, clear MIE, save priv->MPP, enter M-mode. next_mstatus_mpie = mstatus_mie; next_mstatus_mie = 1'b0; + next_mstatus_mpp = priv_q; + next_priv = riscv_pkg::PrivM; end else if (i_mret_taken) begin - // MRET: restore MIE from MPIE, set MPIE to 1 + // MRET: restore MIE<-MPIE, MPIE=1, return to MPP's privilege, set MPP=U, + // and clear MPRV if returning below M (per the privileged spec). next_mstatus_mie = mstatus_mpie; next_mstatus_mpie = 1'b1; + next_priv = mstatus_mpp; + if (mstatus_mpp != riscv_pkg::PrivM) next_mstatus_mprv = 1'b0; + next_mstatus_mpp = riscv_pkg::PrivU; end else if (i_csr_write_enable && i_csr_read_enable) begin if (i_csr_address == riscv_pkg::CsrMstatus) begin - next_mstatus_mie = csr_new_value[3]; + next_mstatus_mie = csr_new_value[3]; next_mstatus_mpie = csr_new_value[7]; + // MPP is WARL: FROST implements only M and U, so fold S/reserved -> U. + next_mstatus_mpp = (csr_new_value[12:11] == riscv_pkg::PrivM) ? 
riscv_pkg::PrivM : riscv_pkg::PrivU; + next_mstatus_mprv = csr_new_value[17]; end else if (i_csr_address == riscv_pkg::CsrMie) begin next_mie_msie = csr_new_value[3]; next_mie_mtie = csr_new_value[7]; @@ -348,12 +410,18 @@ module csr_file #( if (i_rst) begin mstatus_mie <= 1'b0; mstatus_mpie <= 1'b0; + mstatus_mpp <= riscv_pkg::PrivU; + mstatus_mprv <= 1'b0; + priv_q <= riscv_pkg::PrivM; mie_msie <= 1'b0; mie_mtie <= 1'b0; mie_meie <= 1'b0; end else begin mstatus_mie <= next_mstatus_mie; mstatus_mpie <= next_mstatus_mpie; + mstatus_mpp <= next_mstatus_mpp; + mstatus_mprv <= next_mstatus_mprv; + priv_q <= next_priv; mie_msie <= next_mie_msie; mie_mtie <= next_mie_mtie; mie_meie <= next_mie_meie; @@ -525,9 +593,18 @@ module csr_file #( // Cycle counter increments every cycle (not in reset). p_cycle_increments : assert (cycle_counter == $past(cycle_counter) + 64'd1); - // Instret increments by the retire count (0, 1, or 2 per cycle). - p_instret_increments : - assert (instret_counter == $past(instret_counter) + 64'($past(i_instruction_retired_count))); + // Instret retime invariants (see the Instructions Retired Counter + // comment): the staging register follows the input by one cycle, and + // the accumulator applies the staged count. Composed: + // instret_counter(T) == instret_counter(T-1) + retired_count(T-2) + // i.e. instret equals the running total of retired instructions delayed + // by exactly one staging cycle; the delay is architecturally invisible + // because commit-serialized CSR reads sample the counter no earlier + // than 3 cycles after the youngest older instruction commits. + p_instret_stage_follows : + assert (instruction_retired_count_q == $past(i_instruction_retired_count)); + p_instret_applies_staged_count : + assert (instret_counter == $past(instret_counter) + 64'($past(instruction_retired_count_q))); // fflags sticky: when no CSR write to fflags/fcsr and no effective fp_flags_valid, // fflags does not shrink. 
@@ -542,6 +619,7 @@ module csr_file #( if ($past(i_rst)) begin p_reset_cycle : assert (cycle_counter == 64'd0); p_reset_instret : assert (instret_counter == 64'd0); + p_reset_instret_stage : assert (instruction_retired_count_q == 2'd0); p_reset_mie : assert (!mstatus_mie); p_reset_mpie : assert (!mstatus_mpie); p_reset_fflags : assert (fflags == 5'b0); diff --git a/hw/rtl/cpu_and_mem/cpu/ex_stage/alu/alu.sv b/hw/rtl/cpu_and_mem/cpu/ex_stage/alu/alu.sv index fc482584..bfcc3631 100644 --- a/hw/rtl/cpu_and_mem/cpu/ex_stage/alu/alu.sv +++ b/hw/rtl/cpu_and_mem/cpu/ex_stage/alu/alu.sv @@ -164,7 +164,7 @@ module alu #( // Use pre-computed link address from IF stage (PC+2 for compressed, PC+4 for 32-bit) riscv_pkg::JAL: o_result = i_link_address; riscv_pkg::JALR: o_result = i_link_address; - // M-extension multiply operations (1-cycle registered, requires stall) + // M-extension multiply operations (4-cycle pipelined multiplier, requires stall until o_valid_output) riscv_pkg::MUL: begin // Start multiply if not already in progress; use lower 32 bits of result multiplier_valid_input = ~multiplier_valid_input_registered; diff --git a/hw/rtl/cpu_and_mem/cpu/ex_stage/alu/divider.sv b/hw/rtl/cpu_and_mem/cpu/ex_stage/alu/divider.sv index 8a615221..a25182d0 100644 --- a/hw/rtl/cpu_and_mem/cpu/ex_stage/alu/divider.sv +++ b/hw/rtl/cpu_and_mem/cpu/ex_stage/alu/divider.sv @@ -102,7 +102,7 @@ module divider #( remainder_should_be_negative = dividend_is_negative; end - // 2x-folded radix-2 division requires one pipeline stage per 2 bits (32 stages for 32-bit) + // 2x-folded radix-2 division requires one pipeline stage per 2 bits (16 stages for 32-bit) localparam int unsigned NumPipelineStages = WIDTH / 2; // Pipeline arrays for each stage - carry values through division process diff --git a/hw/rtl/cpu_and_mem/cpu/id_stage/instr_decoder.sv b/hw/rtl/cpu_and_mem/cpu/id_stage/instr_decoder.sv index 34be8e3f..68351bcc 100644 --- a/hw/rtl/cpu_and_mem/cpu/id_stage/instr_decoder.sv +++ 
b/hw/rtl/cpu_and_mem/cpu/id_stage/instr_decoder.sv @@ -15,7 +15,7 @@ */ /* - Instruction decoder for RISC-V RV32IMAFB + Zicsr + Machine-mode privileged. + Instruction decoder for RISC-V RV32GCB + Zicsr + M/U-mode privileged. B extension = Zba + Zbb + Zbs (full bit manipulation). F extension = Single-precision floating-point. This combinational module decodes 32-bit RISC-V instructions into control signals diff --git a/hw/rtl/cpu_and_mem/cpu/if_stage/if_stage.sv b/hw/rtl/cpu_and_mem/cpu/if_stage/if_stage.sv index 4edf58a4..43aac51f 100644 --- a/hw/rtl/cpu_and_mem/cpu/if_stage/if_stage.sv +++ b/hw/rtl/cpu_and_mem/cpu/if_stage/if_stage.sv @@ -26,7 +26,7 @@ * ├── pc_controller PC management, next-PC selection * │ └── control_flow_tracker Holdoff signal generation * ├── branch_prediction/ Branch prediction subsystem - * │ ├── branch_predictor 32-entry BTB (combinational lookup) + * │ ├── branch_predictor 256-entry BTB (combinational lookup) * │ ├── branch_prediction_controller Prediction gating and registration * │ └── prediction_metadata_tracker Stall/spanning metadata handling * └── c_extension/ Compressed instruction subsystem @@ -66,7 +66,7 @@ * ========= * - RISC-V C extension support (compressed 16-bit instructions) * - Handles 32-bit instructions spanning two memory words (PC[1]=1) - * - Branch prediction with 32-entry BTB + * - Branch prediction with 256-entry BTB * - Outputs raw parcel + selection signals for PD stage decompression * * TIMING OPTIMIZATION: @@ -86,6 +86,7 @@ module if_stage #( input logic [63:0] i_instr, // 64-bit fetch: {next_word, current_word} input logic [riscv_pkg::ImemFetchSidebandWidth-1:0] i_instr_sideband, input logic i_instr_bank_sel_r, // Fetch-word parity (PC[2] from fetch cycle) + input logic [XLEN-1:0] i_served_addr, // Served fetch-window tag (full address) // Fetch window valid: the {i_instr, i_instr_sideband, i_instr_bank_sel_r} // window corresponds to the fetch address presented last cycle. 
When low // (variable-latency provider: L1I miss / fuzz), IF emits NOP bubbles, @@ -506,6 +507,8 @@ module if_stage #( .i_pd_redirect(i_pd_redirect), .i_pd_redirect_target(i_pd_redirect_target), + .i_window_cannot_serve(window_resteer_pc_reg), + .i_window_cannot_serve_raw(window_cannot_serve_pc_reg), .i_trap_taken (i_trap_ctrl.trap_taken), .i_mret_taken (i_trap_ctrl.mret_taken), @@ -742,13 +745,55 @@ module if_stage #( // stall-held, so a stall covering the bubble cycle let the bubble // present-and-dispatch on release alongside the realigned repeat. Fixed // by stall-gating pd_redirect_q (see its always_ff above). - assign sel_nop = i_pipeline_ctrl.flush || flush_for_c_ext_safe || !fetch_progress || + // Served-window invariant: the fetched 64-bit window covers exactly the two + // words {word(i_served_addr), word(i_served_addr)+1}. pc_reg must lie in that + // window or the 1-bit bank-sel parity in instruction_aligner silently selects + // the wrong word -> wrong instruction-size sample -> pc_reg advances onto a + // mid-instruction byte (the workqueue_init_early epc 0x8038d7fa boot Oops). + // A fetch stall (L1I line-fill) can leave the served window >1 word from + // pc_reg, which the single parity bit cannot represent. Detect it from the + // full served address; pc_controller squashes (sel_nop below), holds pc_reg, + // and resteers fetch onto pc_reg's word until the correct window is served. + logic signed [XLEN-1:0] served_word_delta; + assign served_word_delta = $signed( + {2'b00, i_served_addr[XLEN-1:2]} + ) - $signed( + {2'b00, pc_reg[XLEN-1:2]} + ); + logic window_cannot_serve_pc_reg; + // Gated to the cached region (pc_reg[XLEN-1], i.e. >= CACHED_BASE): the low BRAM + // fetch path is fixed 1-cycle/always-valid and never desyncs, and its served-addr + // tracking is approximate -- firing there only causes spurious squashes. 
+ assign window_cannot_serve_pc_reg = i_instr_valid && pc_reg[XLEN-1] && + (served_word_delta != $signed( + 0 + )) && (served_word_delta != -$signed( + 1 + )) && !((served_word_delta == $signed( + 1 + )) && use_instr_buffer); + + // The existing (pre-served-window-guard) squash conditions. + logic sel_nop_existing; + assign sel_nop_existing = i_pipeline_ctrl.flush || + flush_for_c_ext_safe || !fetch_progress || sel_nop_align || reset_holdoff || pending_prediction_target_holdoff || (pending_prediction_fetch_holdoff && !prediction_holdoff) || (control_flow_holdoff && (!prediction_holdoff || pd_redirect_q || slot2_redirect_q)); + // Resteer fetch onto pc_reg's word + hold pc_reg ONLY at a real consume cycle + // (not during an existing holdoff, where pc_reg is already managed and a resteer + // would thrash the front end -- the cause of the earlier isa_test/boot regression). + // At a holdoff release with the served window still stale (fetch ran ahead during + // the redirect bubble), this fires the cycle the wrong-word decode would otherwise + // advance pc_reg onto a mid-instruction byte. + logic window_resteer_pc_reg; + assign window_resteer_pc_reg = window_cannot_serve_pc_reg && !sel_nop_existing; + + assign sel_nop = sel_nop_existing || window_cannot_serve_pc_reg; + // =========================================================================== // Stall State Registers // =========================================================================== @@ -951,11 +996,35 @@ module if_stage #( logic [XLEN-1:0] instruction_pc; logic [XLEN-1:0] link_address; - // Use the same stall-safe compressed selection metadata that PD consumes. - // This keeps link_address aligned with the actual instruction that will be - // seen downstream, including prediction/stall replay cases. + // link_address (the fall-through PC) must reflect the TRUE size of the slot-1 + // instruction held across a stall. 
The shared sel_compressed_sc is flush-zeroed + // by its stall_capture_reg (stall_capture_reg.sv: `if (i_flush) saved <= '0`), + // so on a flush-inside-stall a *compressed* branch held at fetch reads + // is_compressed_for_link = 0 -> link_address = pc_reg + 4 (one halfword too far). + // That stale fall-through link is then consumed as the not-taken redirect target + // (early_misprediction_recovery: `... : rs_issue_int.link_addr`), making fetch + // skip the branch's successor parcel. This is the no-MMU-Linux timer-IRQ + // "gremlin": the revmap_size load (`lw a5,80(a0)`) right after a not-taken + // `c.beqz` is dropped, so the dependent `bgeu a1,a5` reads a stale a5 and takes + // the wrong IRQ-dispatch path. Capture sel_compressed for the link WITHOUT the + // flush-zero so the held size matches the actual held instruction (pc_reg+2/+4 + // correctly). sel_compressed_sc's other consumers (o_from_if_to_pd.sel_compressed, + // slot2_pc_sc) are replay-gated by sel_nop_saved=1 after a flush, so they are + // unaffected; only this link path reads the captured bit in the post-flush window. logic is_compressed_for_link; - assign is_compressed_for_link = sel_compressed_sc; + logic sel_compressed_for_link_sc; + stall_capture_reg #( + .WIDTH(1) + ) u_sel_compressed_for_link_sc ( + .i_clk, + .i_reset(1'b0), + .i_flush(1'b0), + .i_stall(if_stage_stall), + .i_stall_registered(if_stage_stall_registered), + .i_data(sel_compressed), + .o_data(sel_compressed_for_link_sc) + ); + assign is_compressed_for_link = sel_compressed_for_link_sc; assign instruction_pc = pc_reg; assign link_address = instruction_pc + (is_compressed_for_link ? @@ -1153,9 +1222,8 @@ module if_stage #( // Slot-2 IF→PD Packet (2-wide dispatch — Session F) // =========================================================================== // Slot-2 follows slot-1 sequentially in program order: PC and link address - // are simply slot-1's plus the slot-1 / slot-2 sizes. 
No BTB lookup is - // performed for slot-2 (decision #3, single-port BTB on slot-1 PC) and no - // RAS prediction is consumed for slot-2 (decision #1: slot-2 is invalid + // are simply slot-1's plus the slot-1 / slot-2 sizes. No RAS prediction + // is consumed for slot-2 (decision #1: slot-2 is invalid // when slot-1 is a branch, so slot-1 cannot have pushed/popped RAS in the // same cycle). // diff --git a/hw/rtl/cpu_and_mem/cpu/if_stage/pc_controller.sv b/hw/rtl/cpu_and_mem/cpu/if_stage/pc_controller.sv index 7882b477..2f03ef6e 100644 --- a/hw/rtl/cpu_and_mem/cpu/if_stage/pc_controller.sv +++ b/hw/rtl/cpu_and_mem/cpu/if_stage/pc_controller.sv @@ -51,8 +51,8 @@ | v | | +-----------------------------------------+ | | | Final PC Mux (Priority Encoded) | | - | | reset > trap > stall > branch > |------------------> o_pc | - | | prediction > sequential |------------------> o_pc_reg| + | | reset > trap > fence.i > branch > |------------------> o_pc | + | | PD redirect > hold > prediction > seq. |------------------> o_pc_reg| | +-----------------------------------------+ | | | +-------------------------------------------------------------------------+ @@ -61,11 +61,13 @@ ============== 1. Control flow tracking - Detect stale instruction cycles after redirects 2. PC increment calculation - C-extension aware (+0, +2, or +4) [submodule] - 3. Mid-32bit correction - Handle landing in middle of 32-bit instruction + 3. Mid-32bit correction - DISABLED with 64-bit fetch (output tied to 0) 4. Final PC selection - Priority mux with timing-optimized flat structure - All branches and jumps (JAL, JALR, conditional branches) are resolved in the EX - stage and come through the i_branch_taken/i_branch_target interface. + Branch/jump redirects (JAL, JALR, conditional branches) arrive on the + i_branch_taken/i_branch_target interface only on misprediction recovery + (early or commit-time, synthesized by ex_comb_synthesizer); correctly + predicted branches commit without a redirect here. 
*/ module pc_controller #( parameter int unsigned XLEN = 32 @@ -91,8 +93,13 @@ module pc_controller #( input logic [XLEN-1:0] i_branch_target, // PD backward-branch heuristic redirect (from pd_stage) - input logic i_pd_redirect, + input logic i_pd_redirect, input logic [XLEN-1:0] i_pd_redirect_target, + input logic i_window_cannot_serve, // Served window cannot hold pc_reg -> resteer+hold + // Raw window-cannot-serve (UNGATED by sel_nop) -- the exact gremlin DROP condition. + // Narrows the immediate-predecessor carve-out to fire ONLY when the load would + // actually be dropped (wcs=1), not at the ~50k benign wcs=0 dual-issue sites. + input logic i_window_cannot_serve_raw, // Trap control input logic i_trap_taken, @@ -323,6 +330,9 @@ module pc_controller #( logic [XLEN-1:0] pending_prediction_pc; logic [XLEN-1:0] pending_prediction_target; logic pending_prediction_effective; + logic pending_imm_pred_emit; + logic pim_base; // immediate-predecessor + pending (pre-narrowing) + logic carve_out_engaged_q; // latched: raw wcs=1 seen this episode logic pending_prediction_from_buffer; logic prediction_needs_pending; logic use_pending_prediction_for_pc_reg; @@ -407,7 +417,19 @@ module pc_controller #( pc_reg_next_bit1_for_prediction = o_pc_reg[1] ^ i_is_compressed; end end - assign pc_reg_next_misses_fetch_pc_for_prediction = pc_reg_next_bit1_for_prediction != o_pc[1]; + // BOOT-HANG FIX (verification form): the bit1-only fast predictor + // (pc_reg_next_bit1_for_prediction != o_pc[1]) diverges from the full result + // when pc_reg is >=2 words behind the word-aligned fetch PC -- both are + // word-aligned so bit 1 matches, but the words differ. 
There the fast value + // is 0 ("no miss") while the truth (seq_next_pc_reg != o_pc) is 1, so + // prediction_needs_pending is wrongly false, the prediction is applied without + // the pc_reg handoff, and fetch redirects to the wrong PC (silent on HW where + // the assert below is compiled out -> the no-MMU Linux boot hang at pid_max). + // Use the full compare; conservative-safe (only ever pends MORE, exactly in + // the cases the bit1 proxy missed). NOTE: this reintroduces the + // seq_next_pc_reg compare on the prediction cone that the bit1 proxy existed + // to avoid -- a timing-friendly correct form is a follow-up if WNS regresses. + assign pc_reg_next_misses_fetch_pc_for_prediction = (seq_next_pc_reg != o_pc); assign prediction_needs_pending = i_prediction_used && !i_ras_predicted && !i_slot2_prediction_used && @@ -456,9 +478,63 @@ module pc_controller #( assign stale_pending_prediction = pending_prediction_effective && !use_pending_prediction_for_pc_reg && (pc_reg_hw > pending_prediction_pc_hw); + // GREMLIN fix (Option 1b): immediate-predecessor carve-out. When a pending BTB + // prediction is in flight for a branch that is the COMPRESSED parcel immediately + // after pc_reg (pending_prediction_pc == o_pc_reg + 2) and pc_reg has NOT yet + // reached it (!use_pending, !stale), the parcel currently at pc_reg is a + // correct-path OLDER instruction that MUST execute (e.g. the no-MMU IRQ revmap_size + // load at 0x8005a19a sitting between the fetch point and the predicted bgeu at + // 0x8005a19c). Without this, hold_pending_prediction_fetch squashes it (-> + // o_pending_prediction_fetch_holdoff -> if_stage sel_nop) and the land-on-branch arm + // jumps pc_reg straight to pending_prediction_pc, DROPPING it. pending_imm_pred_emit + // suppresses the fetch-holdoff squash + the land-on-branch jump so the parcel emits + // and pc_reg advances SEQUENTIALLY onto the branch. 
pending_prediction_valid stays + // live, so the prediction still applies (metadata-replay path unchanged) once pc_reg + // reaches the branch. This is the documented design intent of + // prediction_metadata_tracker ("IF keeps walking older instructions after a BTB + // redirect"). + // + // LOOP-BREAK: the predicate uses ONLY registered state -- o_pc_reg and + // pending_prediction_pc + a constant. An earlier form used seq_next_pc_reg, which + // depends on pc_reg_advance_sel -> sel_nop; combined with gate (a) feeding + // pending_imm_pred_emit BACK into sel_nop (via o_pending_prediction_fetch_holdoff) + // that closed a combinational cycle (Verilator "Active region did not converge" at + // ~16.6M, masked by -Wno-UNOPTFLAT). o_pc_reg + PcIncrementCompressed is exactly the + // value seq_next_pc_reg held while the parcel was squashed (pc_reg_advance_sel_live + // DEFAULTS to +2 when sel_nop=1, if_stage.sv ~1297), so behaviour is preserved for + // the compressed immediate-predecessor (the observed gremlin) while the cycle is + // broken. A 32-bit predecessor is intentionally NOT covered: it cannot be + // identified sel_nop-free here (the served instruction-size signals are unreliable + // under the coincident served-window guard) and the prior form did not cover it + // either (it too saw +2 during the squash), so the scope is unchanged. + // NARROWING: the base condition (pim_base, below) by itself fires ~50k times/boot, at + // wcs=0 dual-issue load+branch bundles where the load already emits -- and there, the + // carve-out clearing sel_nop makes pc_reg_advance_sel_live pick +4 (slot-2) so pc_reg + // jumps PAST the branch, mishandling the pending prediction -> stale-ra wild ret + // (of_prop_next_string 0x8021fcae). 
+ assign pim_base = + pending_prediction_effective && !use_pending_prediction_for_pc_reg && + !stale_pending_prediction && + (pending_prediction_pc == (o_pc_reg + riscv_pkg::PcIncrementCompressed)); + // NARROW to the true gremlin: the load is only DROPPED when the served window cannot + // deliver it (raw wcs=1). But the load can only EMIT on the wcs=0 cycle (one after the + // resteer), so a plain "&& wcs" would drop pim exactly then and re-NOP the load. Instead + // LATCH the engagement once wcs=1 is seen during the episode, and hold it until the + // episode ends (pc_reg reaches the branch -> pim_base falls) or any redirect. This is + // NOT a pc_reg hold -- pim still advances pc_reg via the carve-out -- so it cannot + // deadlock. At wcs=0 sites it never engages. Acyclic: raw wcs is independent of sel_nop. + assign pending_imm_pred_emit = pim_base && (i_window_cannot_serve_raw || carve_out_engaged_q); + always_ff @(posedge i_clk) begin + if (i_reset || i_flush || i_trap_taken || i_mret_taken || i_branch_taken || + i_pd_redirect || i_fence_i_flush || !pim_base) begin + carve_out_engaged_q <= 1'b0; + end else if (!fetch_stall && i_window_cannot_serve_raw) begin + carve_out_engaged_q <= 1'b1; + end + end assign hold_pending_prediction_fetch = pending_prediction_effective && !use_pending_prediction_for_pc_reg && - !stale_pending_prediction; + !stale_pending_prediction && !pending_imm_pred_emit; assign hold_pending_prediction_consume_fetch = pending_prediction_effective && use_pending_prediction_for_pc_reg; // Keep a PC-mux-local copy of the pending-handoff cone so synthesis can @@ -485,7 +561,7 @@ module pc_controller #( assign hold_pending_prediction_fetch_pc_mux = pending_prediction_effective && !use_pending_prediction_for_pc_reg_pc_mux && - !stale_pending_prediction_pc_mux; + !stale_pending_prediction_pc_mux && !pending_imm_pred_emit; assign hold_pending_prediction_consume_fetch_pc_mux = pending_prediction_effective && use_pending_prediction_for_pc_reg_pc_mux; 
@@ -607,6 +683,7 @@ module pc_controller #( else if (i_fence_i_flush) next_pc = i_fence_i_target; else if (i_branch_taken) next_pc = i_branch_target; else if (i_pd_redirect) next_pc = i_pd_redirect_target; + else if (i_window_cannot_serve) next_pc = {o_pc_reg[XLEN-1:2], 2'b00}; // No fetch progress: hold the fetch address so the provider can keep // working on the owed ask. Sits above the prediction/pending arms // (their state is frozen and predictions are suppressed while invalid) @@ -670,6 +747,7 @@ module pc_controller #( else if (i_fence_i_flush) next_pc_reg = i_fence_i_target; else if (i_branch_taken) next_pc_reg = i_branch_target; else if (i_pd_redirect) next_pc_reg = i_pd_redirect_target; + else if (i_window_cannot_serve) next_pc_reg = o_pc_reg; // No fetch progress: hold the instruction address (nothing is being // delivered). Same placement rationale as the next_pc hold arm above. else if (!i_fetch_progress) next_pc_reg = o_pc_reg; @@ -684,8 +762,13 @@ module pc_controller #( // that bubble; advancing here pairs the arriving target word with the next // halfword PC and corrupts C-extension alignment on loop back-edges. else if (o_pending_prediction_target_holdoff) next_pc_reg = o_pc_reg; + // GREMLIN fix (Option 1b): suppress the land-on-branch JUMP in the immediate- + // predecessor carve-out so pc_reg advances SEQUENTIALLY (seq_next_pc_reg, which + // equals pending_prediction_pc here) and the intervening older parcel emits first + // instead of being skipped. pending_prediction_valid stays live -> the target + // handoff (below) still fires when pc_reg actually reaches the branch. 
else if (pending_prediction_effective && !pending_prediction_allow_cross_pc_mux_q && - !use_pending_prediction_for_pc_reg_pc_mux) + !use_pending_prediction_for_pc_reg_pc_mux && !pending_imm_pred_emit) next_pc_reg = pending_prediction_pc; else if (pending_prediction_cross_handoff_pc_mux) next_pc_reg = pending_prediction_pc; else if (pending_prediction_target_handoff_pc_mux) next_pc_reg = pending_prediction_target; diff --git a/hw/rtl/cpu_and_mem/cpu/if_stage/pc_increment_calculator.sv b/hw/rtl/cpu_and_mem/cpu/if_stage/pc_increment_calculator.sv index 24ee5f39..13934c9f 100644 --- a/hw/rtl/cpu_and_mem/cpu/if_stage/pc_increment_calculator.sv +++ b/hw/rtl/cpu_and_mem/cpu/if_stage/pc_increment_calculator.sv @@ -18,7 +18,7 @@ PC Increment Calculator Computes the next sequential PC values using parallel adders for timing optimization. - This module pre-computes fetch PC increment results (pc+2, pc+4) in parallel, + This module pre-computes fetch PC increment results (pc+2 through pc+8) in parallel, then selects the correct result based on instruction type and state. Key Timing Optimization: diff --git a/hw/rtl/cpu_and_mem/cpu/if_stage/pc_reg_precompute.sv b/hw/rtl/cpu_and_mem/cpu/if_stage/pc_reg_precompute.sv index fb190938..c37b1ab2 100644 --- a/hw/rtl/cpu_and_mem/cpu/if_stage/pc_reg_precompute.sv +++ b/hw/rtl/cpu_and_mem/cpu/if_stage/pc_reg_precompute.sv @@ -17,7 +17,7 @@ /* * PC Register Pre-computation * - * Computes pc_reg + 0/2/4 in parallel and selects the result for both + * Computes pc_reg + 0/2/4/6 in parallel and selects the result for both * the "instruction is compressed" and "instruction is 32-bit" cases using * ONLY registered select signals. 
* diff --git a/hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv b/hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv index eb8b0893..6d09eaea 100644 --- a/hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv +++ b/hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv @@ -381,7 +381,7 @@ package riscv_pkg; // Section 3: CSR Definitions // =========================================================================== // Control and Status Register addresses, bit positions, and cause codes. - // Includes Zicsr instruction encodings and M-mode trap support. + // Includes Zicsr instruction encodings and M/U-mode trap support. // CSR instruction funct3 encoding typedef enum bit [2:0] { @@ -475,6 +475,13 @@ package riscv_pkg; // mstatus bit positions (RV32) localparam int unsigned MstatusMieBit = 3; // Machine Interrupt Enable localparam int unsigned MstatusMpieBit = 7; // Machine Previous Interrupt Enable + // mstatus.MPP occupies [12:11]; mstatus.MPRV is bit 17 (RV32). + localparam int unsigned MstatusMppLo = 11; + localparam int unsigned MstatusMprvBit = 17; + + // Privilege modes (RISC-V encoding). FROST implements Machine and User only. + localparam logic [1:0] PrivU = 2'b00; + localparam logic [1:0] PrivM = 2'b11; // mie/mip bit positions localparam int unsigned MieMsiBit = 3; // Machine Software Interrupt @@ -486,6 +493,7 @@ package riscv_pkg; localparam bit [31:0] ExcBreakpoint = 32'd3; localparam bit [31:0] ExcLoadAddrMisalign = 32'd4; localparam bit [31:0] ExcStoreAddrMisalign = 32'd6; + localparam bit [31:0] ExcEcallUmode = 32'd8; localparam bit [31:0] ExcEcallMmode = 32'd11; // Interrupt cause codes (mcause values when interrupt bit = 1) @@ -836,7 +844,7 @@ package riscv_pkg; // Section 9: Trap/Exception Handling // =========================================================================== // Structures for trap control. - // Used by trap_unit.sv for M-mode exception/interrupt handling. + // Used by trap_unit.sv for M/U-mode exception/interrupt handling. 
// Trap control signals (from trap unit to pipeline) typedef struct packed { logic trap_taken; // Trap is being taken this cycle @@ -1075,7 +1083,7 @@ package riscv_pkg; localparam int unsigned FLEN = FpWidth; // 64 bits for D extension // CDB parameters - localparam int unsigned NumCdbLanes = 1; // Single CDB (future expansion) + localparam int unsigned NumCdbLanes = 1; // unused: the CDB is 2-lane today (o_cdb + o_cdb_2) localparam int unsigned NumFus = 7; // ALU, MUL, DIV, MEM, FP_ADD, FP_MUL, FP_DIV // --------------------------------------------------------------------------- diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/README.md b/hw/rtl/cpu_and_mem/cpu/tomasulo/README.md index 7206be89..9fc75936 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/README.md +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/README.md @@ -66,11 +66,12 @@ moves, no functional change): `store_queue/sq_forwarding_unit`, `serial_state_e` enum lives in `riscv_pkg` so the ROB and submodule share it). Each is documented in its parent module's README. -The CPU top-level (`../cpu_ooo.sv`) instantiates `tomasulo_wrapper` -plus `dispatch` and the front-end stages, and contains a few large -inline blocks that straddle the front-end / back-end boundary -(early misprediction recovery, commit flush controller, memory port -arbitration, …). See [`../README.md`](../README.md). +The CPU top-level (`../cpu_ooo/cpu_ooo.sv`) instantiates +`tomasulo_wrapper` plus `dispatch` and the front-end stages; the logic +that straddles the front-end / back-end boundary (early misprediction +recovery, the misprediction flush controller, memory port arbitration, +…) lives in its glue submodules under `../cpu_ooo/branch_recovery/` +and `../cpu_ooo/memory_if/`. See [`../README.md`](../README.md). ## Cross-cutting design notes @@ -97,7 +98,8 @@ Branches and JALRs reserve a RAT checkpoint at dispatch (full INT + FP RAT snapshot + RAS top + valid count, 8 slots). 
Conditional-branch mispredictions resolve in `branch_jump_unit` and -trigger a fast two-phase recovery directly from `cpu_ooo.sv`: the +trigger a fast two-phase recovery in the `early_misprediction_recovery` +submodule (under `cpu_ooo/branch_recovery/`): the front-end redirects and the RAT restores in the same cycle, then the OOO back-end's partial flush fires one cycle later. This cuts the typical penalty from ~15 cycles to ~2. diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/cdb_arbiter/cdb_arbiter.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/cdb_arbiter/cdb_arbiter.sv index 3955a167..16a615c2 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/cdb_arbiter/cdb_arbiter.sv +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/cdb_arbiter/cdb_arbiter.sv @@ -17,8 +17,9 @@ /* * CDB Arbiter * - * Priority-based multiplexer that selects one functional unit result per cycle - * for broadcast on the Common Data Bus (CDB). Ties FU completions back to: + * Priority-based multiplexer that selects up to two functional unit results + * per cycle (2-wide CDB: primary o_cdb + secondary o_cdb_2) for broadcast on + * the Common Data Bus (CDB). Ties FU completions back to: * - ROB (mark done + store value) * - All RS instances (operand wakeup) * diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/dispatch/dispatch.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/dispatch/dispatch.sv index e19ea796..3c0870a2 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/dispatch/dispatch.sv +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/dispatch/dispatch.sv @@ -26,8 +26,10 @@ * 5. Allocates a checkpoint for branches/jumps * 6. Generates back-pressure (stall) when resources are exhausted * - * The dispatch is combinational: all outputs are derived from the registered - * from_id_to_ex pipeline register in the same cycle. 
+ * The dispatch is mostly combinational: outputs are derived from the + * registered from_id_to_ex pipeline register in the same cycle, except the + * done-repair bypass valid/tag channels, which are registered and appear one + * cycle after the dispatch fire. * * Stall conditions (any one stalls the front-end): * - ROB full diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/README.md b/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/README.md index 08aef01f..f860ad32 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/README.md +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/README.md @@ -52,6 +52,10 @@ Two things the cache intentionally *doesn't* do: so there's nothing speculative to throw away. Leaving cached lines hot across mispredict recovery roughly doubles the steady-state hit rate on CoreMark (36.5% → 72.4%). +- **No fill from a full-flush-cycle response.** Trap/MRET/FENCE.I full + flushes keep existing L0 lines hot, but a memory response that arrives + on the flush cycle is treated as a drained response for a killed load + and is not allowed to install a new L0 line. - **No same-cycle fill → lookup bypass.** Forwarding the in-flight fill into a same-cycle lookup dragged the back-end flush cone (`i_flush_en` → `accept_mem_response` → fill → bypass → hit → diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/load_queue.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/load_queue.sv index 1263ed49..cfb12594 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/load_queue.sv +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/load_queue.sv @@ -18,10 +18,10 @@ * Load Queue - Tracks in-flight load instructions * * Circular buffer of DEPTH entries (8), allocated in program order at - * dispatch time, freed when the load result is broadcast on the CDB. + * dispatch time, freed the cycle the result is captured into cdb_stage. 
* * Features: - * - Parameterized depth (8 entries, FF-based) + * - Parameterized depth (8 entries; LUTRAM payload, FF control state) * - CAM-style tag search for address update (all entries in parallel) * - Oldest-first priority scan for issue selection * - Two-phase FLD support (64-bit double on 32-bit bus) @@ -169,7 +169,7 @@ module load_queue #( input logic i_early_recovery_flush, // ========================================================================= - // L0 Cache Invalidation (from SQ, future) + // L0 Cache Invalidation (from SQ store-write launch) // ========================================================================= input logic i_cache_invalidate_valid, input logic [riscv_pkg::XLEN-1:0] i_cache_invalidate_addr, @@ -360,8 +360,8 @@ module load_queue #( // =========================================================================== // lq_data payload is only read at issue_cdb_idx (CDB broadcast). // Writes come from two independent sources that can overlap: - // Port 0 (primary): cache hit / store forward / memory response - // Port 1 (AMO): AMO write completion + // Port 0 (mem resp): memory response (dedicated) + // Port 1 (local): cache hit / SQ forward / AMO write completion // Split into 32-bit lo and hi halves so FLD can write each phase // independently without read-modify-write. @@ -525,6 +525,7 @@ module load_queue #( // Response acceptance/drain control logic flush_all_entries; logic issued_entry_flushed; + logic full_flush_response_drain; logic accept_mem_response; logic drop_mem_response_now; @@ -747,8 +748,17 @@ module load_queue #( // Issue Selection -> lq_issue_selector.sv (pure boundary move). issue_cdb_idx // still drives the LQ data LUTRAM read below; that RAM stays here. 
// =========================================================================== - logic [DEPTH-1:0] mem_issue_stored_mask; - logic [DEPTH-1:0] mem_issue_update_mask; + logic stored_scan_found; + logic [IdxWidth-1:0] stored_scan_idx; + logic [IdxWidth-1:0] stored_scan_pos; + logic [DEPTH-1:0] stored_scan_onehot; + logic [ReorderBufferTagWidth-1:0] stored_scan_rob_tag; + + logic update_scan_found; + logic [IdxWidth-1:0] update_scan_idx; + logic [IdxWidth-1:0] update_scan_pos; + logic [DEPTH-1:0] update_scan_onehot; + logic [ReorderBufferTagWidth-1:0] update_scan_rob_tag; logic head_mem_stored_found; logic [IdxWidth-1:0] head_mem_stored_idx; logic [ReorderBufferTagWidth-1:0] head_mem_stored_rob_tag; @@ -756,6 +766,7 @@ module load_queue #( logic [IdxWidth-1:0] head_mem_update_idx; logic [ReorderBufferTagWidth-1:0] head_mem_update_rob_tag; logic [DEPTH*ReorderBufferTagWidth-1:0] lq_rob_tag_flat; + logic force_head_amo; for (genvar g_lq_tag = 0; g_lq_tag < DEPTH; g_lq_tag++) begin : gen_lq_rob_tag_flat assign lq_rob_tag_flat[g_lq_tag*ReorderBufferTagWidth +: ReorderBufferTagWidth] = @@ -778,10 +789,19 @@ module load_queue #( .lq_rob_tag_flat(lq_rob_tag_flat), .head_idx(head_idx), .i_sq_committed_empty(i_sq_committed_empty), + .i_force_head_amo(force_head_amo), .o_issue_cdb_found(issue_cdb_found), .o_issue_cdb_idx(issue_cdb_idx), - .o_mem_issue_stored_mask(mem_issue_stored_mask), - .o_mem_issue_update_mask(mem_issue_update_mask), + .o_stored_scan_found(stored_scan_found), + .o_stored_scan_idx(stored_scan_idx), + .o_stored_scan_pos(stored_scan_pos), + .o_stored_scan_onehot(stored_scan_onehot), + .o_stored_scan_rob_tag(stored_scan_rob_tag), + .o_update_scan_found(update_scan_found), + .o_update_scan_idx(update_scan_idx), + .o_update_scan_pos(update_scan_pos), + .o_update_scan_onehot(update_scan_onehot), + .o_update_scan_rob_tag(update_scan_rob_tag), .o_head_mem_stored_found(head_mem_stored_found), .o_head_mem_stored_idx(head_mem_stored_idx), 
.o_head_mem_stored_rob_tag(head_mem_stored_rob_tag), @@ -790,15 +810,6 @@ module load_queue #( .o_head_mem_update_rob_tag(head_mem_update_rob_tag) ); - // scan_idx recomputed locally for the head-load diagnostics below; the - // selector computes its own identical copy internally (head-relative idx). - logic [IdxWidth-1:0] scan_idx[DEPTH]; - always_comb begin - for (int unsigned j = 0; j < DEPTH; j++) begin - scan_idx[j] = IdxWidth'(head_idx + IdxWidth'(j)); - end - end - // =========================================================================== // Head-load sub-bucket diagnostics // =========================================================================== @@ -914,49 +925,6 @@ module load_queue #( // a post-encoder 8-to-1 MUX on lq_rob_tag[issue_mem_idx]) logic [ReorderBufferTagWidth-1:0] issue_mem_rob_tag; - logic stored_scan_found; - logic [IdxWidth-1:0] stored_scan_idx; - logic [IdxWidth-1:0] stored_scan_pos; - logic [DEPTH-1:0] stored_scan_onehot; - logic [ReorderBufferTagWidth-1:0] stored_scan_rob_tag; - - logic update_scan_found; - logic [IdxWidth-1:0] update_scan_idx; - logic [IdxWidth-1:0] update_scan_pos; - logic [DEPTH-1:0] update_scan_onehot; - logic [ReorderBufferTagWidth-1:0] update_scan_rob_tag; - - always_comb begin - stored_scan_found = 1'b0; - stored_scan_idx = '0; - stored_scan_pos = '0; - stored_scan_onehot = '0; - stored_scan_rob_tag = '0; - update_scan_found = 1'b0; - update_scan_idx = '0; - update_scan_pos = '0; - update_scan_onehot = '0; - update_scan_rob_tag = '0; - - for (int unsigned i = 0; i < DEPTH; i++) begin - if (mem_issue_stored_mask[i] && !stored_scan_found) begin - stored_scan_found = 1'b1; - stored_scan_idx = scan_idx[i]; - stored_scan_pos = IdxWidth'(i); - stored_scan_onehot[scan_idx[i]] = 1'b1; - stored_scan_rob_tag = lq_rob_tag[scan_idx[i]]; - end - - if (mem_issue_update_mask[i] && !update_scan_found) begin - update_scan_found = 1'b1; - update_scan_idx = scan_idx[i]; - update_scan_pos = IdxWidth'(i); - 
update_scan_onehot[scan_idx[i]] = 1'b1; - update_scan_rob_tag = lq_rob_tag[scan_idx[i]]; - end - end - end - logic [IdxWidth-1:0] stored_issue_idx; logic [ReorderBufferTagWidth-1:0] stored_issue_rob_tag; logic [ReorderBufferTagWidth-1:0] update_issue_rob_tag; @@ -1128,21 +1096,83 @@ module load_queue #( !sq_commit_interlock && i_sq_forward.can_forward && !sq_check_is_mmio_q && !sq_check_is_lr_q && !sq_check_is_amo_q; + + // Break the rare ROB-head AMO deadlock without changing steady-state AMO + // order. The normal selector remains pristine until a head AMO is eligible + // for issue and the machine has made no useful LQ/SQ progress for a sustained + // window. Once saturated, force_head_amo lets the head-priority path choose + // that AMO for one capture/replace cycle. + localparam int unsigned AmoDeadlockThresh = 512; + localparam int unsigned AmoDeadlockCntW = $clog2(AmoDeadlockThresh + 1); + + logic head_amo_eligible_waiting; + logic sq_check_waiting_older_store; + logic head_amo_no_issue_deadlock; + logic head_amo_sq_deadlock; + logic head_amo_deadlock_wait; + logic [AmoDeadlockCntW-1:0] amo_deadlock_cnt_q; + + always_comb begin + head_amo_eligible_waiting = 1'b0; + for (int unsigned i = 0; i < DEPTH; i++) begin + if (rob_head_match_q[i] && + lq_valid[i] && + lq_is_amo[i] && + entry_addr_valid_now[i] && + !lq_issued[i] && + !lq_data_valid[i] && + !sq_check_in_flight_mask[i] && + i_sq_committed_empty) begin + head_amo_eligible_waiting = 1'b1; + end + end + end + + assign sq_check_waiting_older_store = + sq_check_pending && sq_check_phase2 && sq_check_entry_issueable && + !sq_check_misaligned && !sq_commit_interlock && !sq_no_older_store && + (!i_sq_all_older_addrs_known || (i_sq_forward.match && !i_sq_forward.can_forward)) && + !i_mem_bus_busy && !drop_mem_response_pending && !i_flush_all && !i_flush_en; + + assign head_amo_no_issue_deadlock = + head_amo_eligible_waiting && !issue_mem_found && !sq_check_pending; + assign head_amo_sq_deadlock = + 
head_amo_eligible_waiting && sq_check_waiting_older_store && + (sq_check_rob_tag_q != i_rob_head_tag); + assign head_amo_deadlock_wait = + !mem_outstanding && (amo_state == AMO_IDLE) && + (head_amo_no_issue_deadlock || head_amo_sq_deadlock); + + always_ff @(posedge i_clk) begin + if (!i_rst_n || i_flush_all || i_flush_en || !head_amo_deadlock_wait) begin + amo_deadlock_cnt_q <= '0; + end else if (amo_deadlock_cnt_q < AmoDeadlockCntW'(AmoDeadlockThresh)) begin + amo_deadlock_cnt_q <= amo_deadlock_cnt_q + 1'b1; + end + end + + assign force_head_amo = (amo_deadlock_cnt_q >= AmoDeadlockCntW'(AmoDeadlockThresh)); + assign flush_all_entries = i_flush_en && !i_early_recovery_flush && (i_rob_head_tag == (i_flush_tag + ReorderBufferTagWidth'(1))); // Data memory has fixed 1-cycle latency in this design. If a partial flush // kills the outstanding load, drop that next response explicitly so the slot - // can be safely reused before the stale data returns. + // can be safely reused before the stale data returns. A full flush clears all + // entries at the edge; a same-cycle response is therefore drained here rather + // than accepted, so it cannot complete a killed load or refill the persistent + // L0 cache from a flushed context. 
assign issued_entry_flushed = i_flush_en && mem_outstanding && lq_valid[issued_idx] && (flush_all_entries || is_younger( issued_rob_tag, i_flush_tag, i_rob_head_tag )); + assign full_flush_response_drain = i_flush_all && i_mem_read_valid && mem_outstanding; assign accept_mem_response = i_mem_read_valid && mem_outstanding && - !drop_mem_response_pending && !issued_entry_flushed && - lq_valid[issued_idx]; + !i_flush_all && !drop_mem_response_pending && + !issued_entry_flushed && lq_valid[issued_idx]; assign drop_mem_response_now = i_mem_read_valid && - (drop_mem_response_pending || issued_entry_flushed || + (full_flush_response_drain || + drop_mem_response_pending || issued_entry_flushed || (mem_outstanding && !lq_valid[issued_idx])); // =========================================================================== @@ -2344,6 +2374,9 @@ module load_queue #( $warning("LQ: slot-2 alloc attempted when full_for_2 (and slot-1 firing)"); if (i_alloc_2.valid && !i_alloc.valid && full) $warning("LQ: slot-2 alloc attempted alone when full"); + if (i_flush_all && accept_mem_response) + $error("LQ: accepted memory response during full flush"); + if (i_flush_all && cache_fill_valid) $error("LQ: filled L0 cache during full flush"); // Slot-1 and slot-2 must never target the same physical entry. if (slot1_alloc_en && slot2_alloc_en && (alloc_target[IdxWidth-1:0] == slot2_alloc_idx)) $error("LQ: slot-1 and slot-2 alloc collide on entry %0d", alloc_target[IdxWidth-1:0]); @@ -2539,6 +2572,15 @@ module load_queue #( end end + // Full-flush-cycle responses are drains only. They must not perform any + // architectural or persistent-cache side effect. 
+ always_comb begin + if (i_rst_n && i_flush_all) begin + p_no_accept_during_full_flush : assert (!accept_mem_response); + p_no_l0_fill_during_full_flush : assert (!cache_fill_valid); + end + end + // ------------------------------------------------------------------------- // Sequential assertions // ------------------------------------------------------------------------- diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/lq_issue_selector.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/lq_issue_selector.sv index 8e8a887e..15067a08 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/lq_issue_selector.sv +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/lq_issue_selector.sv @@ -18,7 +18,8 @@ // lq_issue_selector // ============================================================================= // Extracted verbatim from load_queue.sv (pure RTL boundary move, zero functional -// change). Parallel issue selection: Phase A (oldest CDB-ready entry), Phase B +// change, except for the optional registered deadlock break input). Parallel +// issue selection: Phase A (oldest CDB-ready entry), Phase B // (memory-issue eligibility masks with MMIO/LR/AMO head gating + older-AMO // blocking), and the explicit ROB-head priority result. Replaces the old serial // 16-level scan with per-entry masks + tree encoders. 
issue_cdb_idx is exported @@ -42,11 +43,20 @@ module lq_issue_selector #( input logic [(DEPTH*riscv_pkg::ReorderBufferTagWidth)-1:0] lq_rob_tag_flat, input logic [$clog2(DEPTH)-1:0] head_idx, input logic i_sq_committed_empty, + input logic i_force_head_amo, output logic o_issue_cdb_found, output logic [$clog2(DEPTH)-1:0] o_issue_cdb_idx, - output logic [DEPTH-1:0] o_mem_issue_stored_mask, - output logic [DEPTH-1:0] o_mem_issue_update_mask, + output logic o_stored_scan_found, + output logic [$clog2(DEPTH)-1:0] o_stored_scan_idx, + output logic [$clog2(DEPTH)-1:0] o_stored_scan_pos, + output logic [DEPTH-1:0] o_stored_scan_onehot, + output logic [riscv_pkg::ReorderBufferTagWidth-1:0] o_stored_scan_rob_tag, + output logic o_update_scan_found, + output logic [$clog2(DEPTH)-1:0] o_update_scan_idx, + output logic [$clog2(DEPTH)-1:0] o_update_scan_pos, + output logic [DEPTH-1:0] o_update_scan_onehot, + output logic [riscv_pkg::ReorderBufferTagWidth-1:0] o_update_scan_rob_tag, output logic o_head_mem_stored_found, output logic [$clog2(DEPTH)-1:0] o_head_mem_stored_idx, output logic [riscv_pkg::ReorderBufferTagWidth-1:0] o_head_mem_stored_rob_tag, @@ -173,6 +183,54 @@ module lq_issue_selector #( assign mem_issue_stored_mask = mem_eligible_stored_mask & ~blocked_by_amo; assign mem_issue_update_mask = mem_eligible_update_mask & ~blocked_by_amo; + // Encode the oldest normal stored-address and current-update candidates here + // while scan_idx is already local. Exporting encoded candidates avoids + // re-scanning the masks in load_queue on the SQ-check payload enable path. 
+ logic stored_scan_found; + logic [IdxWidth-1:0] stored_scan_idx; + logic [IdxWidth-1:0] stored_scan_pos; + logic [DEPTH-1:0] stored_scan_onehot; + logic [ReorderBufferTagWidth-1:0] stored_scan_rob_tag; + + logic update_scan_found; + logic [IdxWidth-1:0] update_scan_idx; + logic [IdxWidth-1:0] update_scan_pos; + logic [DEPTH-1:0] update_scan_onehot; + logic [ReorderBufferTagWidth-1:0] update_scan_rob_tag; + + always_comb begin + stored_scan_found = 1'b0; + stored_scan_idx = '0; + stored_scan_pos = '0; + stored_scan_onehot = '0; + stored_scan_rob_tag = '0; + update_scan_found = 1'b0; + update_scan_idx = '0; + update_scan_pos = '0; + update_scan_onehot = '0; + update_scan_rob_tag = '0; + + for (int unsigned i = 0; i < DEPTH; i++) begin + if (mem_issue_stored_mask[i] && !stored_scan_found) begin + stored_scan_found = 1'b1; + stored_scan_idx = scan_idx[i]; + stored_scan_pos = IdxWidth'(i); + stored_scan_onehot[scan_idx[i]] = 1'b1; + stored_scan_rob_tag = + lq_rob_tag_flat[scan_idx[i]*ReorderBufferTagWidth+:ReorderBufferTagWidth]; + end + + if (mem_issue_update_mask[i] && !update_scan_found) begin + update_scan_found = 1'b1; + update_scan_idx = scan_idx[i]; + update_scan_pos = IdxWidth'(i); + update_scan_onehot[scan_idx[i]] = 1'b1; + update_scan_rob_tag = + lq_rob_tag_flat[scan_idx[i]*ReorderBufferTagWidth+:ReorderBufferTagWidth]; + end + end + end + // The sparse queue can reuse reclaimed holes after flushes, so physical // queue order is not always identical to ROB age. 
To avoid starving the // oldest architectural load behind a younger blocked entry, explicitly @@ -200,7 +258,7 @@ module lq_issue_selector #( !in_flight_mask[i] && !lq_is_mmio[i] && !lq_is_lr[i] && - !lq_is_amo[i]) begin + (!lq_is_amo[i] || (i_force_head_amo && i_sq_committed_empty))) begin head_mem_stored_found = 1'b1; head_mem_stored_idx = IdxWidth'(i); head_mem_stored_rob_tag = lq_rob_tag_flat[i*ReorderBufferTagWidth+:ReorderBufferTagWidth]; @@ -214,7 +272,7 @@ module lq_issue_selector #( !lq_data_valid[i] && !in_flight_mask[i] && !lq_is_lr[i] && - !lq_is_amo[i]) begin + (!lq_is_amo[i] || (i_force_head_amo && i_sq_committed_empty))) begin head_mem_update_found = 1'b1; head_mem_update_idx = IdxWidth'(i); head_mem_update_rob_tag = lq_rob_tag_flat[i*ReorderBufferTagWidth+:ReorderBufferTagWidth]; @@ -224,8 +282,16 @@ module lq_issue_selector #( assign o_issue_cdb_found = issue_cdb_found; assign o_issue_cdb_idx = issue_cdb_idx; - assign o_mem_issue_stored_mask = mem_issue_stored_mask; - assign o_mem_issue_update_mask = mem_issue_update_mask; + assign o_stored_scan_found = stored_scan_found; + assign o_stored_scan_idx = stored_scan_idx; + assign o_stored_scan_pos = stored_scan_pos; + assign o_stored_scan_onehot = stored_scan_onehot; + assign o_stored_scan_rob_tag = stored_scan_rob_tag; + assign o_update_scan_found = update_scan_found; + assign o_update_scan_idx = update_scan_idx; + assign o_update_scan_pos = update_scan_pos; + assign o_update_scan_onehot = update_scan_onehot; + assign o_update_scan_rob_tag = update_scan_rob_tag; assign o_head_mem_stored_found = head_mem_stored_found; assign o_head_mem_stored_idx = head_mem_stored_idx; assign o_head_mem_stored_rob_tag = head_mem_stored_rob_tag; diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/lq_l0_cache.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/lq_l0_cache.sv index fd439d93..7188e4cd 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/lq_l0_cache.sv +++ 
b/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/lq_l0_cache.sv @@ -26,9 +26,11 @@ * Features: * - Combinational lookup (hit in same cycle as address) * - Fill on memory response - * - MMIO addresses always miss (>= MMIO_ADDR) + * - MMIO addresses always miss (addr[31:30] == 2'b01 quadrant; DDR at + * 0x8000_0000+ is cacheable) * - Flush all valid bits on pipeline flush - * - Per-address invalidation port (for future SQ integration) + * - Per-address invalidation port (driven by SQ store-write launch and + * AMO completion) */ module lq_l0_cache #( diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/register_alias_table/register_alias_table.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/register_alias_table/register_alias_table.sv index 3ad2c087..23722832 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/register_alias_table/register_alias_table.sv +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/register_alias_table/register_alias_table.sv @@ -271,7 +271,7 @@ module register_alias_table ( logic [ NumCheckpoints-1:0] checkpoint_valid; // Checkpoint RAT snapshots — distributed RAM - // Combined INT + FP snapshot (384 bits wide, 2-bit address) + // Combined INT + FP snapshot (448 bits wide, 3-bit address) logic ckpt_rat_wr_en; logic [CheckpointIdWidth-1:0] ckpt_rat_wr_addr; logic [ RatSnapshotWidth-1:0] ckpt_rat_wr_data; diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/README.md b/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/README.md index c9cd6bd7..39c3685c 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/README.md +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/README.md @@ -165,7 +165,9 @@ The exceptions: ## Performance counters The ROB drives several of the wrapper's performance counters -directly: `head_and_next_done` (widen-commit actually fired) and +directly: `head_and_next_done` (commit fired while head+1 was also +done — a widen-commit upper bound; the actual fire count is +`commit_2_fire_actual`) and `head_plus_one_done` (ungated head+1 ready, for the 
drain-backlog bucket) come from here, along with `commit_2_opportunity` / `commit_2_fire_actual` — the gap between those two measures how diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.f b/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.f index 5ee5f997..f6cd36d8 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.f +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.f @@ -7,6 +7,7 @@ # RAM primitives (distributed RAM used for multi-bit ROB fields) $(ROOT)/hw/rtl/lib/ram/sdp_dist_ram.sv $(ROOT)/hw/rtl/lib/ram/mwp_dist_ram.sv +$(ROOT)/hw/rtl/lib/ram/mwp_dist_ram_ohread.sv # Reorder Buffer module $(ROOT)/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/rob_serializer.sv diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.sv index d75a83ed..5348b8e5 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.sv +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.sv @@ -39,10 +39,12 @@ * * Storage: * Multi-bit fields use distributed RAM (LUTRAM) to reduce FF usage. - * Single-write-port fields (written only at allocation) use sdp_dist_ram. - * Multi-write-port fields (allocation + CDB/branch) use mwp_dist_ram - * with a Live Value Table. 1-bit packed vectors that need per-entry - * flush/reset remain in flip-flops. + * Alloc-written fields use mwp_dist_ram_ohread with 2 write ports + * (slot-1 + slot-2 alloc); fields also written by the CDB use 4 write + * ports (+ 2 CDB lanes) via mwp_dist_ram / mwp_dist_ram_ohread with a + * Live Value Table. The branch-update-written resolved-target field is + * the only remaining sdp_dist_ram. 1-bit packed vectors that need + * per-entry flush/reset remain in flip-flops. 
* * External Coordination: * The Reorder Buffer coordinates with several external units via handshake signals: @@ -120,12 +122,11 @@ module reorder_buffer ( output logic o_commit_2_valid_raw, output logic o_commit_2_store_like_raw, - // Back-pressure from the cpu_ooo pending-write FIFO. Asserted when - // there is room for a slot-2 regfile write this cycle; deasserted when - // the pending register holds a prior slot-2 write that has not yet - // drained AND rob_commit (slot 1) also wants the port this cycle. - // Driven from a registered cpu_ooo signal, so the feedback path - // closes at a flop (no combinational loop). + // Slot-2 accept indication from cpu_ooo. Asserted when the second + // retiring entry can write the regfile this cycle. With the dedicated + // second regfile write port cpu_ooo ties this permanently high (1'b1); + // the gate plumbing is kept so the signal path stays symmetric with + // the earlier back-pressure approach. input logic i_widen_commit_ok, input logic i_commit_hold, @@ -152,6 +153,18 @@ module reorder_buffer ( // Exception detected at head - signal trap unit output logic o_trap_pending, // Exception needs handling output logic [riscv_pkg::XLEN-1:0] o_trap_pc, // PC of excepting instruction + // Head decodes as WFI (drives WFI interrupt-resume-PC seed in cpu_ooo) + output logic o_head_is_wfi, + // TIMING precompute of the architectural next-PC of the head / head+1 + // entry, for cpu_ooo's interrupt_resume_pc capture. Contract: whenever + // o_commit_valid_raw (resp. o_commit_2_valid_raw) is high, + // o_head_retired_next_pc (resp. o_head_next_retired_next_pc) equals + // retired_next_pc(o_commit_comb) (resp. (o_commit_comb_2)) as computed in + // cpu_ooo. Computed from UNGATED head fields so the RAM read + 32-bit add + // run in parallel with (not after) the late commit_en gating; in cycles + // without a commit the value is unused (checked in cpu_ooo simulation). 
+ output logic [riscv_pkg::XLEN-1:0] o_head_retired_next_pc, + output logic [riscv_pkg::XLEN-1:0] o_head_next_retired_next_pc, output riscv_pkg::exc_cause_t o_trap_cause, // Exception cause // Head entry's CDB value at trap time. For a misaligned load/store the // load_queue/SQ path parks the faulting address here (the value slot is @@ -169,6 +182,10 @@ module reorder_buffer ( // ========================================================================= input logic i_interrupt_pending, // Interrupt is pending (wake from WFI) + // Current privilege (PrivM/PrivU). A U-mode access to MRET or to a CSR that + // requires more privilege is an illegal instruction, detected at the head. + input logic [1:0] i_priv, + // ========================================================================= // Pipeline Flush Control // ========================================================================= @@ -266,8 +283,8 @@ module reorder_buffer ( // consumer sees slot 2 even though the plumbing exists). The // commit_2_opportunity perf counter is still updated so we can keep // measuring the upper bound across incremental steps. Flipped to 1 - // after all downstream consumers (RAT, SQ, cpu_ooo FIFO, instret) are - // in place. + // after all downstream consumers (RAT, SQ, cpu_ooo second regfile write + // port, instret) were in place. localparam bit EnableWidenCommit = 1'b1; // =========================================================================== @@ -295,8 +312,23 @@ module reorder_buffer ( end endfunction + // TIMING helper: read one bit of a per-entry packed FF vector using a + // registered ONE-HOT select instead of a binary index. Given the invariant + // onehot == (1 << idx), |(vec & onehot) === vec[idx] bit-for-bit; the win is + // physical only: the select bits come pre-decoded out of registers (no + // 5-bit high-fanout head_idx net feeding a 32:1 mux tree on the commit + // critical path). 
+ function automatic logic onehot_read(input logic [ReorderBufferDepth-1:0] vec, + input logic [ReorderBufferDepth-1:0] onehot); + onehot_read = |(vec & onehot); + endfunction + // Forward declarations (used in debug assigns before main decl) - logic [ReorderBufferTagWidth:0] head_ptr; + // TIMING: head_ptr (via head_idx) drives every head RAM read address plus + // pointer arithmetic — post-synth fanout was ~650 with only 4 tool-chosen + // replicas. Cap the per-replica load so each copy can be placed next to its + // RAM/consumer cluster. Pure register replication; semantics unchanged. + (* max_fanout = 96 *) logic [ReorderBufferTagWidth:0] head_ptr; logic [ReorderBufferTagWidth:0] tail_ptr; logic full; logic full_for_2; @@ -338,8 +370,19 @@ module reorder_buffer ( logic [ReorderBufferTagWidth-1:0] tail_idx; // Slot-2 alloc target, wraps within ReorderBufferTagWidth modulus. logic [ReorderBufferTagWidth-1:0] tail_idx_2; - logic [ReorderBufferDepth-1:0] head_clear_mask; - logic [ReorderBufferDepth-1:0] head_next_clear_mask; + // Registered ONE-HOT images of head_idx / head_next_idx. Invariant (by + // construction, checked by assertions below): + // head_clear_mask == ReorderBufferDepth'(1) << head_idx + // head_next_clear_mask == ReorderBufferDepth'(1) << head_next_idx + // Both are written ONLY in the Head Pointer Management block, in lockstep + // with head_ptr: reset loads {1, 2} while head_ptr loads 0; commit rotates + // them by the same 1/2 steps head_ptr advances; flushes touch neither + // (flushes only move the tail). TIMING: besides gating the rob_valid + // commit-clear, they now also replace the binary head_idx as the select of + // every head-side 32:1 read (packed FF vectors + LVT bank selects), turning + // a high-fanout 5-bit select into per-entry registered one-hot bits. 
+ (* max_fanout = 16 *) logic [ReorderBufferDepth-1:0] head_clear_mask; + (* max_fanout = 16 *) logic [ReorderBufferDepth-1:0] head_next_clear_mask; // Status signals (full and empty declared above for forward ref) logic [ReorderBufferTagWidth:0] count; @@ -349,7 +392,10 @@ module reorder_buffer ( logic head_valid; logic head_done; logic head_exception; - riscv_pkg::exc_cause_t head_exc_cause; // from RAM + logic head_exception_raw; // stored ROB exception flag (before U-mode priv fault) + logic head_priv_fault; // U-mode access to MRET / an M-CSR -> illegal instruction + riscv_pkg::exc_cause_t head_exc_cause; // effective cause (includes priv fault) + riscv_pkg::exc_cause_t head_exc_cause_raw; // from RAM logic [XLEN-1:0] head_pc; // from RAM logic head_dest_rf; logic [RegAddrWidth-1:0] head_dest_reg; // from RAM @@ -443,16 +489,47 @@ module reorder_buffer ( // Commit control signals logic head_ready; // Head is valid and done + // NOTE: deliberately NO synthesis attributes on commit_stall or the + // *_early aggregates below. Three measured rounds on this spine: every + // attribute-based constraint made it worse — round-1 (* max_fanout *) on + // commit_en/commit_2_fire fragmented the interrupt arc (WNS -1.17); + // round-3 (* keep *) on commit_stall + the early aggregates pinned fusion + // boundaries in the MIDDLE of the true critical cone (commit_stall is NOT + // a late external input — its serializer cone itself reads the head + // metadata through the one-hot masks, so mask -> is_csr/store-like -> + // FSM stall -> take_trap is one deep register-to-register cone; WNS + // -0.938). Every real structural change (one-hot head reads, ohread LVT + // select, meip register, compare-then-mux) helped. The two-term + // factoring below stays as plain RTL only — synthesis is free to re-fuse
logic commit_stall; // Stall commit for serializing instructions + // Early/late factoring of the commit gates (pure AND re-association, + // bit-identical conjunct sets — see Commit Enable Logic). + logic commit_ready_early; + logic commit_2_ready_early; + logic commit_store_like_early; + logic commit_mispredict_early; + logic commit_correct_branch_early; + logic head_mispredict_candidate_early; + logic commit_2_store_like_early; + // NOTE: no max_fanout on commit_en. A (* max_fanout = 96 *) was tried and + // measured WORSE overall: the attribute forces the commit_en net to keep its + // identity, which blocks opt_design from collapsing the serialization spine + // (interrupt_pending -> commit_stall -> commit_en -> store-like -> + // sq_committed_empty_for_trap -> trap_taken) into shared LUTs, adding + // levels to the late UART/interrupt-pending arc (933 new failing paths, + // WNS -1.17). With the one-hot head reads the head-side arrival is early + // enough that the un-split ~655-load net is no longer the limiter. logic commit_en; // Actually commit this cycle // Widen-commit ("2-wide") gate. Asserted when commit_en is high this // cycle AND the entry immediately behind head is also retirable AND // neither slot hits a hazard that forces 1-wide commit (serial ops, // head mispredict, head+1 branch, FENCE.I, exceptions, AMO/LR/SC). - // Step 1 uses this only as a perf-counter input — it does NOT yet - // change head_ptr advancement, rob_valid clearing, or the commit - // output struct. + // commit_2_gate is the ungated opportunity signal (perf-counter input); + // commit_2_fire (gate && EnableWidenCommit && i_widen_commit_ok) drives + // the actual 2-wide retire: head_ptr advances by 2, rob_valid clears at + // head+1, and o_commit_comb_2 carries the second entry. 
logic head_ok_2wide; logic head_next_ok_2wide; logic commit_2_gate; @@ -489,13 +566,27 @@ module reorder_buffer ( // Count of valid entries assign count = tail_ptr - head_ptr; - // Head entry fields from FF-backed packed vectors / distributed RAM - assign head_valid = rob_valid[head_idx]; - assign head_done = rob_done[head_idx]; - assign head_exception = rob_exception[head_idx]; - assign head_branch_taken = rob_branch_taken[head_idx]; - assign head_mispredicted = rob_mispredicted[head_idx]; - assign head_early_recovered = rob_early_recovered[head_idx]; + // Head entry fields from FF-backed packed vectors / distributed RAM. + // TIMING: indexed with the registered one-hot head image (see onehot_read); + // identical value to rob_*[head_idx] under the head_clear_mask invariant. + assign head_valid = onehot_read(rob_valid, head_clear_mask); + assign head_done = onehot_read(rob_done, head_clear_mask); + assign head_exception_raw = onehot_read(rob_exception, head_clear_mask); + // U-mode privilege fault: MRET, or a CSR access requiring more privilege than + // the current mode (csr_addr[9:8] > priv), is an illegal instruction. Folding + // it into head_exception/head_exc_cause makes every consumer (commit_en, + // o_csr_start/o_mret_start, o_trap_pending, the serial FSM, the commit record) + // treat it as a precise exception, so the faulting op never executes or + // retires. The faulting op rides the same single-cycle exception path, so the + // double-trap guard in trap_unit already covers it. + assign head_priv_fault = (head_is_mret && (i_priv != riscv_pkg::PrivM)) || + (head_is_csr && (head_csr_addr[9:8] > i_priv)); + assign head_exception = head_exception_raw || head_priv_fault; + assign head_exc_cause = (head_priv_fault && !head_exception_raw) ? 
+ riscv_pkg::exc_cause_t'(riscv_pkg::ExcIllegalInstr) : head_exc_cause_raw; + assign head_branch_taken = onehot_read(rob_branch_taken, head_clear_mask); + assign head_mispredicted = onehot_read(rob_mispredicted, head_clear_mask); + assign head_early_recovered = onehot_read(rob_early_recovered, head_clear_mask); assign { head_dest_rf, head_dest_valid, @@ -533,12 +624,14 @@ module reorder_buffer ( // RAMs below. 1-bit packed-vector fields share the existing FF storage // and are indexed at head_next_idx for free. assign head_next_idx = head_idx + 1'b1; - assign head_next_valid = rob_valid[head_next_idx]; - assign head_next_done = rob_done[head_next_idx]; - assign head_next_exception = rob_exception[head_next_idx]; - assign head_next_branch_taken = rob_branch_taken[head_next_idx]; - assign head_next_mispredicted = rob_mispredicted[head_next_idx]; - assign head_next_early_recovered = rob_early_recovered[head_next_idx]; + // TIMING: same one-hot substitution as the head fields, using the + // registered head_next_clear_mask (== 1 << head_next_idx by construction). + assign head_next_valid = onehot_read(rob_valid, head_next_clear_mask); + assign head_next_done = onehot_read(rob_done, head_next_clear_mask); + assign head_next_exception = onehot_read(rob_exception, head_next_clear_mask); + assign head_next_branch_taken = onehot_read(rob_branch_taken, head_next_clear_mask); + assign head_next_mispredicted = onehot_read(rob_mispredicted, head_next_clear_mask); + assign head_next_early_recovered = onehot_read(rob_early_recovered, head_next_clear_mask); assign { head_next_dest_rf, head_next_dest_valid, @@ -657,13 +750,21 @@ module reorder_buffer ( // 2-wide commit gate. commit_2_gate is the "opportunity" signal — it // fires whenever the ROB could theoretically retire two entries this - // cycle, independent of the master enable and the FIFO back-pressure. + // cycle, independent of the master enable and the slot-2 accept input. 
// This feeds the perf counter so we can keep measuring upper bound // even when widen-commit is gated off. commit_2_fire is what the // output / retire logic actually acts on — it ANDs the opportunity with - // the master enable and the cpu_ooo pending-write FIFO back-pressure. - assign commit_2_gate = commit_en && head_next_valid && head_next_done_eff && - head_ok_2wide && head_next_ok_2wide; + // the master enable and the cpu_ooo slot-2 accept signal + // (i_widen_commit_ok, currently tied high). + // TIMING (late-side factoring, see Commit Enable Logic): commit_en && X + // == (commit_ready_early && X) && !commit_stall — same conjunct set, + // re-associated so the late commit_stall enters one final LUT. + assign commit_2_ready_early = commit_ready_early && head_next_valid && head_next_done_eff && + head_ok_2wide && head_next_ok_2wide; + assign commit_2_gate = commit_2_ready_early && !commit_stall; + // NOTE: no max_fanout on commit_2_fire — a forced net boundary here sat + // mid-spine on the late UART/interrupt-pending -> trap_taken arc (it + // appeared as a distinct fo=40 level in the round-1 -1.17 post-opt path). logic commit_2_fire; assign commit_2_fire = commit_2_gate && EnableWidenCommit && i_widen_commit_ok; @@ -802,13 +903,13 @@ module reorder_buffer ( // =========================================================================== // Distributed RAM Instances // =========================================================================== - // Single-write-port fields (written only at allocation, read at head). - // These use sdp_dist_ram — one write port, one async read port. + // Alloc-written fields (read at head / head+1). Since 2-wide dispatch + // these use mwp_dist_ram_ohread with 2 write ports (slot-1 + slot-2 alloc). // --------------------------------------------------------------------------- // 2-write port: slot-1 alloc (port 0) + slot-2 alloc (port 1). Port 1 // writes when slot-2 allocates its ROB entry in the same cycle as slot-1. 
- mwp_dist_ram #( + mwp_dist_ram_ohread #( .ADDR_WIDTH (ReorderBufferTagWidth), .DATA_WIDTH (XLEN), .NUM_WRITE_PORTS(2) @@ -818,11 +919,12 @@ module reorder_buffer ( .i_write_address({tail_idx_2, tail_idx}), .i_write_data ({i_alloc_req_2.pc, i_alloc_req.pc}), .i_read_address (head_idx), + .i_read_onehot (head_clear_mask), .o_read_data (head_pc) ); // Widen-commit replica: head+1 read port for pc. - mwp_dist_ram #( + mwp_dist_ram_ohread #( .ADDR_WIDTH (ReorderBufferTagWidth), .DATA_WIDTH (XLEN), .NUM_WRITE_PORTS(2) @@ -832,10 +934,11 @@ module reorder_buffer ( .i_write_address({tail_idx_2, tail_idx}), .i_write_data ({i_alloc_req_2.pc, i_alloc_req.pc}), .i_read_address (head_next_idx), + .i_read_onehot (head_next_clear_mask), .o_read_data (head_next_pc) ); - mwp_dist_ram #( + mwp_dist_ram_ohread #( .ADDR_WIDTH (ReorderBufferTagWidth), .DATA_WIDTH (RegAddrWidth), .NUM_WRITE_PORTS(2) @@ -845,11 +948,12 @@ module reorder_buffer ( .i_write_address({tail_idx_2, tail_idx}), .i_write_data ({i_alloc_req_2.dest_reg, i_alloc_req.dest_reg}), .i_read_address (head_idx), + .i_read_onehot (head_clear_mask), .o_read_data (head_dest_reg) ); // Widen-commit replica: head+1 read port for dest_reg. 
- mwp_dist_ram #( + mwp_dist_ram_ohread #( .ADDR_WIDTH (ReorderBufferTagWidth), .DATA_WIDTH (RegAddrWidth), .NUM_WRITE_PORTS(2) @@ -859,10 +963,11 @@ module reorder_buffer ( .i_write_address({tail_idx_2, tail_idx}), .i_write_data ({i_alloc_req_2.dest_reg, i_alloc_req.dest_reg}), .i_read_address (head_next_idx), + .i_read_onehot (head_next_clear_mask), .o_read_data (head_next_dest_reg) ); - mwp_dist_ram #( + mwp_dist_ram_ohread #( .ADDR_WIDTH (ReorderBufferTagWidth), .DATA_WIDTH (XLEN), .NUM_WRITE_PORTS(2) @@ -872,11 +977,12 @@ module reorder_buffer ( .i_write_address({tail_idx_2, tail_idx}), .i_write_data ({i_alloc_req_2.predicted_target, i_alloc_req.predicted_target}), .i_read_address (head_idx), + .i_read_onehot (head_clear_mask), .o_read_data (head_predicted_target) ); // Widen-commit replica: head+1 read port for predicted_target. - mwp_dist_ram #( + mwp_dist_ram_ohread #( .ADDR_WIDTH (ReorderBufferTagWidth), .DATA_WIDTH (XLEN), .NUM_WRITE_PORTS(2) @@ -886,10 +992,11 @@ module reorder_buffer ( .i_write_address({tail_idx_2, tail_idx}), .i_write_data ({i_alloc_req_2.predicted_target, i_alloc_req.predicted_target}), .i_read_address (head_next_idx), + .i_read_onehot (head_next_clear_mask), .o_read_data (head_next_predicted_target) ); - mwp_dist_ram #( + mwp_dist_ram_ohread #( .ADDR_WIDTH (ReorderBufferTagWidth), .DATA_WIDTH (CheckpointIdWidth), .NUM_WRITE_PORTS(2) @@ -899,11 +1006,12 @@ module reorder_buffer ( .i_write_address({tail_idx_2, tail_idx}), .i_write_data ({alloc_checkpoint_id_data_2, alloc_checkpoint_id_data}), .i_read_address (head_idx), + .i_read_onehot (head_clear_mask), .o_read_data (head_checkpoint_id) ); // Widen-commit replica: head+1 read port for checkpoint_id. 
- mwp_dist_ram #( + mwp_dist_ram_ohread #( .ADDR_WIDTH (ReorderBufferTagWidth), .DATA_WIDTH (CheckpointIdWidth), .NUM_WRITE_PORTS(2) @@ -913,10 +1021,11 @@ module reorder_buffer ( .i_write_address({tail_idx_2, tail_idx}), .i_write_data ({alloc_checkpoint_id_data_2, alloc_checkpoint_id_data}), .i_read_address (head_next_idx), + .i_read_onehot (head_next_clear_mask), .o_read_data (head_next_checkpoint_id) ); - mwp_dist_ram #( + mwp_dist_ram_ohread #( .ADDR_WIDTH (ReorderBufferTagWidth), .DATA_WIDTH (HeadMetaWidth), .NUM_WRITE_PORTS(2) @@ -926,12 +1035,13 @@ module reorder_buffer ( .i_write_address({tail_idx_2, tail_idx}), .i_write_data ({alloc_head_meta_data_2, alloc_head_meta_data}), .i_read_address (head_idx), + .i_read_onehot (head_clear_mask), .o_read_data (head_meta_rd_data) ); // Widen-commit replica: head+1 read port for head_meta. This feeds the // head_next_* hazard flags consumed by the 2-wide commit gate. - mwp_dist_ram #( + mwp_dist_ram_ohread #( .ADDR_WIDTH (ReorderBufferTagWidth), .DATA_WIDTH (HeadMetaWidth), .NUM_WRITE_PORTS(2) @@ -941,18 +1051,22 @@ module reorder_buffer ( .i_write_address({tail_idx_2, tail_idx}), .i_write_data ({alloc_head_meta_data_2, alloc_head_meta_data}), .i_read_address (head_next_idx), + .i_read_onehot (head_next_clear_mask), .o_read_data (head_next_meta_rd_data) ); // --------------------------------------------------------------------------- - // Multi-write-port fields (allocation + CDB or branch update). - // These use mwp_dist_ram with 3 write ports for 2-wide alloc support. - // Port 0 = slot-1 alloc, Port 1 = slot-2 alloc, Port 2 = CDB (highest pri). + // Multi-write-port fields (allocation + CDB). + // These use mwp_dist_ram (mwp_dist_ram_ohread for head-side reads) with + // 4 write ports: Port 0 = slot-1 alloc, Port 1 = slot-2 alloc, + // Port 2 = CDB lane 0, Port 3 = CDB lane 1 (highest pri; the arbiter + // guarantees the two CDB lanes never collide on an address). 
// --------------------------------------------------------------------------- - // rob_value: 3 write ports (alloc1 + alloc2 + CDB), 2 read ports (head + RAT bypass). - // Two instances with identical writes, different read addresses. - mwp_dist_ram #( + // rob_value: 4 write ports (alloc1 + alloc2 + CDB lane 0 + CDB lane 1). + // Twelve instances with identical writes, different read addresses + // (head, head+1, RAT, dispatch bypass x6, fmul-pending x3). + mwp_dist_ram_ohread #( .ADDR_WIDTH (ReorderBufferTagWidth), .DATA_WIDTH (FLEN), .NUM_WRITE_PORTS(4) @@ -962,11 +1076,12 @@ module reorder_buffer ( .i_write_address({i_cdb_write_2.tag, i_cdb_write.tag, tail_idx_2, tail_idx}), .i_write_data({i_cdb_write_2.value, i_cdb_write.value, alloc_value_data_2, alloc_value_data}), .i_read_address(head_idx), + .i_read_onehot(head_clear_mask), .o_read_data(head_value) ); // Widen-commit replica: head+1 read port for value. - mwp_dist_ram #( + mwp_dist_ram_ohread #( .ADDR_WIDTH (ReorderBufferTagWidth), .DATA_WIDTH (FLEN), .NUM_WRITE_PORTS(4) @@ -976,6 +1091,7 @@ module reorder_buffer ( .i_write_address({i_cdb_write_2.tag, i_cdb_write.tag, tail_idx_2, tail_idx}), .i_write_data({i_cdb_write_2.value, i_cdb_write.value, alloc_value_data_2, alloc_value_data}), .i_read_address(head_next_idx), + .i_read_onehot(head_next_clear_mask), .o_read_data(head_next_value) ); @@ -1111,8 +1227,8 @@ module reorder_buffer ( .o_read_data(o_fmul_pending_bypass_value_3) ); - // rob_exc_cause: 3 write ports (alloc1='0 + alloc2='0 + CDB), 1 read port (head) - mwp_dist_ram #( + // rob_exc_cause: 4 write ports (alloc1='0 + alloc2='0 + CDB lanes 0/1), 1 read port (head) + mwp_dist_ram_ohread #( .ADDR_WIDTH (ReorderBufferTagWidth), .DATA_WIDTH (ExcCauseWidth), .NUM_WRITE_PORTS(4) @@ -1124,11 +1240,12 @@ module reorder_buffer ( i_cdb_write_2.exc_cause, i_cdb_write.exc_cause, ExcCauseWidth'(0), ExcCauseWidth'(0) }), .i_read_address(head_idx), - .o_read_data(head_exc_cause) + .i_read_onehot(head_clear_mask), 
+ .o_read_data(head_exc_cause_raw) ); // Widen-commit replica: head+1 read port for exc_cause. - mwp_dist_ram #( + mwp_dist_ram_ohread #( .ADDR_WIDTH (ReorderBufferTagWidth), .DATA_WIDTH (ExcCauseWidth), .NUM_WRITE_PORTS(4) @@ -1140,11 +1257,12 @@ module reorder_buffer ( i_cdb_write_2.exc_cause, i_cdb_write.exc_cause, ExcCauseWidth'(0), ExcCauseWidth'(0) }), .i_read_address(head_next_idx), + .i_read_onehot(head_next_clear_mask), .o_read_data(head_next_exc_cause) ); - // rob_fp_flags: 3 write ports (alloc1='0 + alloc2='0 + CDB), 1 read port (head) - mwp_dist_ram #( + // rob_fp_flags: 4 write ports (alloc1='0 + alloc2='0 + CDB lanes 0/1), 1 read port (head) + mwp_dist_ram_ohread #( .ADDR_WIDTH (ReorderBufferTagWidth), .DATA_WIDTH (FpFlagsWidth), .NUM_WRITE_PORTS(4) @@ -1156,11 +1274,12 @@ module reorder_buffer ( i_cdb_write_2.fp_flags, i_cdb_write.fp_flags, FpFlagsWidth'(0), FpFlagsWidth'(0) }), .i_read_address(head_idx), + .i_read_onehot(head_clear_mask), .o_read_data(head_fp_flags) ); // Widen-commit replica: head+1 read port for fp_flags. - mwp_dist_ram #( + mwp_dist_ram_ohread #( .ADDR_WIDTH (ReorderBufferTagWidth), .DATA_WIDTH (FpFlagsWidth), .NUM_WRITE_PORTS(4) @@ -1172,6 +1291,7 @@ module reorder_buffer ( i_cdb_write_2.fp_flags, i_cdb_write.fp_flags, FpFlagsWidth'(0), FpFlagsWidth'(0) }), .i_read_address(head_next_idx), + .i_read_onehot(head_next_clear_mask), .o_read_data(head_next_fp_flags) ); @@ -1180,7 +1300,7 @@ module reorder_buffer ( // branches/JALR write their resolved target on branch update. Split the // field across two single-write memories and select at the head instead of // paying the timing cost of a 2-write-port LVT RAM here. 
- mwp_dist_ram #( + mwp_dist_ram_ohread #( .ADDR_WIDTH (ReorderBufferTagWidth), .DATA_WIDTH (XLEN), .NUM_WRITE_PORTS(2) @@ -1190,11 +1310,12 @@ module reorder_buffer ( .i_write_address({tail_idx_2, tail_idx}), .i_write_data ({alloc_branch_target_data_2, alloc_branch_target_data}), .i_read_address (head_idx), + .i_read_onehot (head_clear_mask), .o_read_data (head_branch_target_jal) ); // Widen-commit replica: head+1 read port for branch_target_jal. - mwp_dist_ram #( + mwp_dist_ram_ohread #( .ADDR_WIDTH (ReorderBufferTagWidth), .DATA_WIDTH (XLEN), .NUM_WRITE_PORTS(2) @@ -1204,6 +1325,7 @@ module reorder_buffer ( .i_write_address({tail_idx_2, tail_idx}), .i_write_data ({alloc_branch_target_data_2, alloc_branch_target_data}), .i_read_address (head_next_idx), + .i_read_onehot (head_next_clear_mask), .o_read_data (head_next_branch_target_jal) ); @@ -1233,7 +1355,7 @@ module reorder_buffer ( ); // CSR address RAM (12-bit, written at allocation) - mwp_dist_ram #( + mwp_dist_ram_ohread #( .ADDR_WIDTH (ReorderBufferTagWidth), .DATA_WIDTH (12), .NUM_WRITE_PORTS(2) @@ -1243,11 +1365,12 @@ module reorder_buffer ( .i_write_address({tail_idx_2, tail_idx}), .i_write_data ({i_alloc_req_2.csr_addr, i_alloc_req.csr_addr}), .i_read_address (head_idx), + .i_read_onehot (head_clear_mask), .o_read_data (head_csr_addr) ); // Widen-commit replica: head+1 read port for csr_addr. 
- mwp_dist_ram #( + mwp_dist_ram_ohread #( .ADDR_WIDTH (ReorderBufferTagWidth), .DATA_WIDTH (12), .NUM_WRITE_PORTS(2) @@ -1257,11 +1380,12 @@ module reorder_buffer ( .i_write_address({tail_idx_2, tail_idx}), .i_write_data ({i_alloc_req_2.csr_addr, i_alloc_req.csr_addr}), .i_read_address (head_next_idx), + .i_read_onehot (head_next_clear_mask), .o_read_data (head_next_csr_addr) ); // CSR op RAM (3-bit funct3, written at allocation) - mwp_dist_ram #( + mwp_dist_ram_ohread #( .ADDR_WIDTH (ReorderBufferTagWidth), .DATA_WIDTH (3), .NUM_WRITE_PORTS(2) @@ -1271,11 +1395,12 @@ module reorder_buffer ( .i_write_address({tail_idx_2, tail_idx}), .i_write_data ({i_alloc_req_2.csr_op, i_alloc_req.csr_op}), .i_read_address (head_idx), + .i_read_onehot (head_clear_mask), .o_read_data (head_csr_op) ); // Widen-commit replica: head+1 read port for csr_op. - mwp_dist_ram #( + mwp_dist_ram_ohread #( .ADDR_WIDTH (ReorderBufferTagWidth), .DATA_WIDTH (3), .NUM_WRITE_PORTS(2) @@ -1285,11 +1410,12 @@ module reorder_buffer ( .i_write_address({tail_idx_2, tail_idx}), .i_write_data ({i_alloc_req_2.csr_op, i_alloc_req.csr_op}), .i_read_address (head_next_idx), + .i_read_onehot (head_next_clear_mask), .o_read_data (head_next_csr_op) ); // CSR write data RAM (32-bit, written at allocation) - mwp_dist_ram #( + mwp_dist_ram_ohread #( .ADDR_WIDTH (ReorderBufferTagWidth), .DATA_WIDTH (XLEN), .NUM_WRITE_PORTS(2) @@ -1299,11 +1425,12 @@ module reorder_buffer ( .i_write_address({tail_idx_2, tail_idx}), .i_write_data ({i_alloc_req_2.csr_write_data, i_alloc_req.csr_write_data}), .i_read_address (head_idx), + .i_read_onehot (head_clear_mask), .o_read_data (head_csr_write_data) ); // Widen-commit replica: head+1 read port for csr_write_data. 
- mwp_dist_ram #( + mwp_dist_ram_ohread #( .ADDR_WIDTH (ReorderBufferTagWidth), .DATA_WIDTH (XLEN), .NUM_WRITE_PORTS(2) @@ -1313,6 +1440,7 @@ module reorder_buffer ( .i_write_address({tail_idx_2, tail_idx}), .i_write_data ({i_alloc_req_2.csr_write_data, i_alloc_req.csr_write_data}), .i_read_address (head_next_idx), + .i_read_onehot (head_next_clear_mask), .o_read_data (head_next_csr_write_data) ); @@ -1659,28 +1787,63 @@ module reorder_buffer ( // The old branch_update collision guard (which delayed commit when a // mispredicted branch resolved via CDB in the same cycle as commit) is // removed: (a) JAL — the stated motivation — never produces branch_update - // (is_jal_issue is excluded); (b) for conditional branches, the - // rob_head_commit_misprediction_candidate check in early_mispredict_fire - // already blocks the early-recovery race; (c) removing the guard breaks + // (is_jal_issue is excluded); (b) a conditional branch cannot resolve and + // commit in the same cycle (head_cdb_bypass excludes branches, so its done + // bit trails branch_update by one cycle), and an early_mispredict_fire + // coinciding with a head-mispredict commit is dropped one cycle later by + // the !mispredict_recovery_pending term in early_mispredict_active + // (early_misprediction_recovery.sv) — the fire-time candidate gate this + // comment used to cite no longer exists; (c) removing the guard breaks // the commit_en ↔ branch_update critical path (19 LUT levels through the // CARRY8 branch-target comparison). - assign commit_en = head_ready && !head_exception && !commit_stall && !i_commit_hold && - !i_early_recovery_en && !i_flush_all && !flush_after_head_commit; + // !i_flush_en is REQUIRED for serializing correctness, not just a flush guard. + // rob_serializer only recognizes a serial head (CSR/FENCE/FENCE.I/WFI/MRET) + // while !i_flush_en (rob_serializer.sv SERIAL_IDLE guard). 
During an + // early-backend-recovery / mispredict-recovery bubble (i_flush_en=1) the + // serializer therefore leaves commit_stall=0 for a head FENCE.I, so without + // this term commit_en would RETIRE the FENCE.I unserialized -- skipping the + // cache sync (L1D writeback-all + L1I invalidate-all) entirely and letting a + // post-fence fetch read pre-fence code (the SMC bug). Gating commit on + // !i_flush_en keeps commit_en a subset of the serializer's guard, so a serial + // head can never RETIRE during the bubble; it commits (and is serialized) + // after the bubble clears. The bubble is a fixed hold (early-backend / + // mispredict recovery), never waiting on the head committing -> no deadlock. + // TIMING (late-side factoring): commit_en and every commit_stall-qualified + // derivative are written as (early conjuncts) && !commit_stall. The + // conjunct SETS are identical to the flat originals (pure AND + // re-association; AND is associative/commutative, so the value is + // bit-identical for every input combination). All early conjuncts are + // register-sourced and settle well before commit_stall's interrupt arc, so + // the late arc traverses exactly one LUT per gate — restoring (and slightly + // beating) the baseline netlist's shape, where commit_stall entered the + // second-to-last commit_en LUT and the derivatives chained behind the + // commit_en broadcast.
+ assign commit_ready_early = head_ready && !head_exception && !i_commit_hold && + !i_early_recovery_en && !i_flush_en && !i_flush_all && + !flush_after_head_commit; + assign commit_en = commit_ready_early && !commit_stall; // Raw misprediction at commit (early_recovered handled externally by cpu_ooo) assign commit_misprediction = head_is_branch && head_mispredicted; assign o_commit_valid_raw = commit_en; - assign o_commit_store_like_raw = commit_en && (head_is_store || head_is_fp_store || head_is_sc); - assign o_commit_misprediction_raw = commit_en && commit_misprediction && !head_early_recovered; - assign o_commit_correct_branch_raw = commit_en && head_has_checkpoint && + assign commit_store_like_early = + commit_ready_early && (head_is_store || head_is_fp_store || head_is_sc); + assign o_commit_store_like_raw = commit_store_like_early && !commit_stall; + assign commit_mispredict_early = + commit_ready_early && commit_misprediction && !head_early_recovered; + assign o_commit_misprediction_raw = commit_mispredict_early && !commit_stall; + assign commit_correct_branch_early = commit_ready_early && head_has_checkpoint && !commit_misprediction && !head_early_recovered; + assign o_commit_correct_branch_raw = commit_correct_branch_early && !commit_stall; // Same-cycle head-mispredict indicator without the branch_update collision // term. Outer control logic uses this to suppress younger branch resolution // without feeding branch_update back into commit_en. - assign o_head_commit_misprediction_candidate = - head_ready && !commit_stall && !i_commit_hold && !i_early_recovery_en && - !i_flush_all && !flush_after_head_commit && + // (Same factoring; note the original conjunct set has no !head_exception.) 
+ assign head_mispredict_candidate_early = + head_ready && !i_commit_hold && !i_early_recovery_en && + !i_flush_en && !i_flush_all && !flush_after_head_commit && commit_misprediction && !head_early_recovered; + assign o_head_commit_misprediction_candidate = head_mispredict_candidate_early && !commit_stall; // =========================================================================== // External Coordination Outputs @@ -1693,14 +1856,35 @@ module reorder_buffer ( head_is_csr && !head_exception && !i_flush_en && !i_flush_all; - // MRET execution signal - asserted when entering MRET_EXEC state. + // MRET execution signal - asserted when entering MRET_EXEC and SUSTAINED while + // waiting there for committed stores to drain. + // + // take_mret (trap_unit) only fires when i_sq_committed_empty is high IN THE + // SAME CYCLE as o_mret_start, and it has no retry. Without the + // SERIAL_MRET_EXEC sustaining term o_mret_start is a one-cycle pulse on the + // IDLE->MRET_EXEC cycle: if a committed store is still draining then, take_mret + // misses its only chance and the serializer wedges in SERIAL_MRET_EXEC forever + // (no later flush can rescue it -- the stuck MRET never restores MIE, so no + // interrupt becomes eligible to flush it). The sustaining term mirrors + // o_trap_pending (below) and lets take_mret retry every cycle until the SQ + // drains. + // + // The i_sq_committed_empty gate keeps o_mret_start (hence i_mret_start -> + // trap_drain_wait -> i_commit_hold) low during the drain wait, which (a) + // prevents a commit-hold/o_mret_start f/2 oscillation and (b) keeps mret_taken + // a single-cycle pulse so flush_all fires exactly once. It is free on the + // common path: a retiring MRET normally finds the committed SQ already empty. + // // Note: !i_flush_en/!i_flush_all intentionally omitted — flush signals are // derived from mret_taken which is derived from o_mret_start, so gating // by them creates an oscillating combinational loop. 
- assign o_mret_start = (serial_state == riscv_pkg::SERIAL_IDLE) && head_ready && + assign o_mret_start = ((serial_state == riscv_pkg::SERIAL_IDLE) || + (serial_state == riscv_pkg::SERIAL_MRET_EXEC)) && + head_ready && !i_commit_hold && !i_early_recovery_en && - head_is_mret && !head_exception; + head_is_mret && !head_exception && + i_sq_committed_empty; // Trap pending signal - asserted when exception at head. // Note: during the IDLE->TRAP_WAIT transition, both the state check and the @@ -1716,9 +1900,34 @@ module reorder_buffer ( (serial_state == riscv_pkg::SERIAL_TRAP_WAIT) || (head_ready && !i_commit_hold && !i_early_recovery_en && head_exception); assign o_trap_pc = head_pc; + // WFI interrupt-resume-PC seed (Bug#2): expose that the ROB head is a WFI so + // cpu_ooo can seed interrupt_resume_pc = wfi_pc+4 while the WFI stalls at the + // head. A machine interrupt taken at a *drain-gated* WFI (a committed store + // still draining) otherwise flushes the WFI before it commits, leaving + // interrupt_resume_pc at the pre-WFI instruction's next-PC (== the WFI's own + // PC) -> mepc=wfi_pc instead of the spec-required wfi_pc+4. + assign o_head_is_wfi = head_is_wfi; assign o_trap_cause = head_exc_cause; assign o_trap_value = head_value[XLEN-1:0]; + // TIMING: retired-next-PC precompute (see port comment). Equivalence with + // cpu_ooo's retired_next_pc(o_commit_comb) whenever o_commit_comb.valid: + // - head MRET: retired_next_pc returns redirect_pc, and the o_commit_comb + // redirect chain puts i_mepc there for MRET (highest priority); + // - head branch: retired_next_pc returns redirect_pc = taken ? + // head_branch_target : head_fallthrough_pc; + // - otherwise: retired_next_pc returns pc + (is_compressed ? 2 : 4) with + // is_compressed == head_is_compressed (the head_link_is_compressed arm + // only applies to branches, which take the redirect arm above) + // == head_fallthrough_pc. 
+ // Slot 2 is never a branch/MRET by the 2-wide gate, and o_commit_comb_2 + // zeroes is_branch/is_mret, so its next-PC is always the sequential one. + assign o_head_retired_next_pc = + head_is_mret ? i_mepc : + (head_is_branch && head_branch_taken) ? head_branch_target : + head_fallthrough_pc; + assign o_head_next_retired_next_pc = head_next_pc + (head_next_is_compressed ? 32'd2 : 32'd4); + // FENCE.I flush signal - pulse when FENCE.I commits always_ff @(posedge i_clk) begin if (!i_rst_n) begin @@ -1902,7 +2111,14 @@ module reorder_buffer ( end assign o_commit_2_valid_raw = commit_2_fire; - assign o_commit_2_store_like_raw = commit_2_fire && (head_next_is_store || head_next_is_fp_store); + // TIMING (late-side factoring): commit_2_fire && X == (commit_2_ready_early + // && EnableWidenCommit && i_widen_commit_ok && X) && !commit_stall — same + // conjunct set, one late LUT. This output feeds sq_committed_empty_for_trap + // (the trap arc of the uart spine) and the SQ same-cycle commit guard. + assign commit_2_store_like_early = + commit_2_ready_early && EnableWidenCommit && i_widen_commit_ok && + (head_next_is_store || head_next_is_fp_store); + assign o_commit_2_store_like_raw = commit_2_store_like_early && !commit_stall; // Registered copy of slot 2 commit so external observers can sample it // after the head pointer advances. Mirrors the o_commit register. @@ -2026,8 +2242,8 @@ module reorder_buffer ( // because the hazard gate (serial ops, head+1 branches, FENCE.I, // exceptions, AMO/LR/SC, head-mispredicting-branches) is already // applied. commit_2_fire_actual additionally folds in the master - // enable and the cpu_ooo pending-write FIFO back-pressure term - // (i_widen_commit_ok) — this is what the head_ptr increment and + // enable and the cpu_ooo slot-2 accept term (i_widen_commit_ok, + // currently tied high) — this is what the head_ptr increment and // rob_valid clear actually use. 
o_perf_events.commit_2_opportunity = commit_2_gate; o_perf_events.commit_2_fire_actual = commit_2_fire; @@ -2063,6 +2279,30 @@ module reorder_buffer ( `ifndef SYNTHESIS `ifndef FORMAL + // One-hot head-image invariant (load-bearing for TIMING reads): the + // registered masks must mirror the binary pointers every cycle, since + // onehot_read() and the mwp_dist_ram_ohread LVT selects substitute them for + // binary head_idx / head_next_idx indexing. Only check once reset has been + // observed asserted at least once: at sim time 0 the full-chip bench can + // present i_rst_n=1 before the reset synchronizer fires, while the mask FFs + // still hold their uninitialized all-zero value (which reads identically to + // the pre-fix binary indexing of the equally-uninitialized state). + logic dbg_mask_seen_reset; + initial dbg_mask_seen_reset = 1'b0; + always @(posedge i_clk) begin + if (!i_rst_n) dbg_mask_seen_reset <= 1'b1; + if (i_rst_n && dbg_mask_seen_reset) begin + if (head_clear_mask != (ReorderBufferDepth'(1) << head_idx)) begin + $error("Reorder Buffer: head_clear_mask (0x%08x) != 1 << head_idx (%0d)", head_clear_mask, + head_idx); + end + if (head_next_clear_mask != (ReorderBufferDepth'(1) << head_next_idx)) begin + $error("Reorder Buffer: head_next_clear_mask (0x%08x) != 1 << head_next_idx (%0d)", + head_next_clear_mask, head_next_idx); + end + end + end + // Retire trace: log every committed instruction (for debugging) integer retire_trace_fd; initial begin @@ -2214,6 +2454,12 @@ module reorder_buffer ( // empty iff pointers exactly equal p_empty_matches_ptrs : assert (empty == (head_ptr == tail_ptr)); + // Registered one-hot head images track the binary pointers exactly. + // The one-hot reads (onehot_read / mwp_dist_ram_ohread) rely on this. 
+ p_head_mask_onehot : assert (head_clear_mask == (ReorderBufferDepth'(1) << head_idx)); + p_head_next_mask_onehot : + assert (head_next_clear_mask == (ReorderBufferDepth'(1) << head_next_idx)); + // alloc_en implies !full p_alloc_not_when_full : assert (!alloc_en || !full); @@ -2257,10 +2503,19 @@ module reorder_buffer ( assert ($past(serial_state) == riscv_pkg::SERIAL_IDLE && $past(head_is_csr)); end - // o_mret_start only in IDLE with MRET at head + // o_mret_start is asserted when MRET first reaches the ready head and is + // sustained in MRET_EXEC so trap_unit can retry after committed SQ drain. if ($past(o_mret_start)) begin p_mret_start_contract : - assert ($past(serial_state) == riscv_pkg::SERIAL_IDLE && $past(head_is_mret)); + assert (($past( + serial_state + ) == riscv_pkg::SERIAL_IDLE || $past( + serial_state + ) == riscv_pkg::SERIAL_MRET_EXEC) && $past( + head_is_mret + ) && $past( + i_sq_committed_empty + )); end // o_fence_i_flush is registered (one cycle after commit of FENCE.I) diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/README.md b/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/README.md index 8b74a169..5b53bef7 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/README.md +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/README.md @@ -101,15 +101,21 @@ Back-pressure is therefore only ever conservatively long, never short. ## Widen-commit slot 2 The SQ accepts a parallel slot-2 commit port -(`i_commit_valid_2`, `i_commit_rob_tag_2`, plus combinational twin -for the same-cycle partial-flush guard). Slot 2 only ever retires +(`i_commit_valid_2`, `i_commit_rob_tag_2`, plus a combinational twin +for the same-cycle flush guard). Slot 2 only ever retires plain stores — SC / AMO are forced onto slot 1 by the ROB's widen-commit hazard gate — so there's no SC-discard path sharing. Forwarding scans both slot 1 and slot 2 commits in the same cycle. 
+The wrapper now actually drives the combinational twin +(`i_commit_valid_comb_2` / `i_commit_rob_tag_comb_2`, previously tied to +`1'b0`); without it a full-flush trap (e.g. a machine-timer IRQ) could +observe committed-empty and drop a head+1 store the SQ has not yet seen on +the registered commit path. ## Same-cycle commit hazard -When a partial flush and a ROB commit fire on the same cycle, the +When any same-cycle flush races a registered ROB commit — partial-flush +misprediction recovery and full-flush trap / MRET / FENCE.I drains alike — the registered commit signal is one cycle behind the flush, which means the flush could otherwise wipe out a store that's being committed right then. The SQ takes a combinational commit guard from the ROB diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/sq_forwarding_unit.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/sq_forwarding_unit.sv index 06c11b61..f4349357 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/sq_forwarding_unit.sv +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/sq_forwarding_unit.sv @@ -73,6 +73,29 @@ module sq_forwarding_unit #( localparam int unsigned WordAddrWidth = XLEN - 2; localparam int unsigned IdxWidth = $clog2(DEPTH); + typedef struct packed { + logic valid; + logic [ReorderBufferTagWidth:0] age; + logic can_forward; + logic [IdxWidth-1:0] idx; + logic [1:0] extract_type; + } fwd_winner_t; + + function automatic fwd_winner_t choose_newer_winner(input fwd_winner_t lhs, + input fwd_winner_t rhs); + begin + if (!lhs.valid) begin + choose_newer_winner = rhs; + end else if (!rhs.valid) begin + choose_newer_winner = lhs; + end else if (rhs.age >= lhs.age) begin + choose_newer_winner = rhs; + end else begin + choose_newer_winner = lhs; + end + end + endfunction + // Forwarding scan result index (drives the SQ data-RAM read address in parent) logic [IdxWidth-1:0] fwd_match_idx; @@ -184,6 +207,12 @@ module sq_forwarding_unit #( logic [ReorderBufferTagWidth:0] fwd_load_age; logic 
[ReorderBufferTagWidth:0] fwd_entry_age[DEPTH]; logic [1:0] fwd_entry_extract_type[DEPTH]; +`ifndef FORMAL + fwd_winner_t fwd_leaf[DEPTH]; + fwd_winner_t fwd_pair[4]; + fwd_winner_t fwd_quad[2]; + fwd_winner_t fwd_winner; +`endif assign fwd_load_byte_mask = gen_byte_en(i_sq_check_addr[1:0], i_sq_check_size); assign fwd_load_age = {1'b0, i_sq_check_rob_tag} - {1'b0, i_rob_head_tag}; @@ -305,25 +334,60 @@ module sq_forwarding_unit #( // Block 2: newest conflicting store wins for data/extract selection. The // heavy address/age qualification is already parallelized above, so this // block only prioritizes 1-bit match results and their precomputed metadata. +`ifdef FORMAL + // Yosys's formal frontend currently mishandles the balanced tree's unpacked + // array of packed structs, treating fields such as fwd_leaf[i].can_forward + // as implicit wires. Use an equivalent linear selector for formal only; the + // synthesized implementation below remains the timing-optimized tree. + logic fwd_formal_winner_valid; + logic [ReorderBufferTagWidth:0] fwd_formal_winner_age; + always_comb begin - logic have_winner; - logic [ReorderBufferTagWidth:0] winner_age; - - have_winner = 1'b0; - winner_age = '0; - fwd_can_fwd = 1'b0; - fwd_match_idx = '0; - fwd_extract_type = 2'd0; + fwd_formal_winner_valid = 1'b0; + fwd_formal_winner_age = '0; + fwd_can_fwd = 1'b0; + fwd_match_idx = '0; + fwd_extract_type = 2'd0; + for (int unsigned i = 0; i < DEPTH; i++) begin - if (fwd_conflict_mask[i] && (!have_winner || (fwd_entry_age[i] >= winner_age))) begin - have_winner = 1'b1; - winner_age = fwd_entry_age[i]; - fwd_can_fwd = fwd_can_forward_mask[i]; - fwd_match_idx = IdxWidth'(i); - fwd_extract_type = fwd_entry_extract_type[i]; + if (fwd_conflict_mask[i] && + (!fwd_formal_winner_valid || (fwd_entry_age[i] >= fwd_formal_winner_age))) begin + fwd_formal_winner_valid = 1'b1; + fwd_formal_winner_age = fwd_entry_age[i]; + fwd_can_fwd = fwd_can_forward_mask[i]; + fwd_match_idx = IdxWidth'(i); + 
fwd_extract_type = fwd_entry_extract_type[i]; end end end +`else + // Keep this as a balanced tree: the old serial loop let an SQ-check address + // bit feed each entry's conflict logic and then walk an 8-entry winner chain + // before reaching o_sq_forward.can_forward. + always_comb begin + for (int unsigned i = 0; i < DEPTH; i++) begin + fwd_leaf[i].valid = fwd_conflict_mask[i]; + fwd_leaf[i].age = fwd_entry_age[i]; + fwd_leaf[i].can_forward = fwd_can_forward_mask[i]; + fwd_leaf[i].idx = IdxWidth'(i); + fwd_leaf[i].extract_type = fwd_entry_extract_type[i]; + end + + fwd_pair[0] = choose_newer_winner(fwd_leaf[0], fwd_leaf[1]); + fwd_pair[1] = choose_newer_winner(fwd_leaf[2], fwd_leaf[3]); + fwd_pair[2] = choose_newer_winner(fwd_leaf[4], fwd_leaf[5]); + fwd_pair[3] = choose_newer_winner(fwd_leaf[6], fwd_leaf[7]); + + fwd_quad[0] = choose_newer_winner(fwd_pair[0], fwd_pair[1]); + fwd_quad[1] = choose_newer_winner(fwd_pair[2], fwd_pair[3]); + + fwd_winner = choose_newer_winner(fwd_quad[0], fwd_quad[1]); + + fwd_can_fwd = fwd_winner.valid && fwd_winner.can_forward; + fwd_match_idx = fwd_winner.idx; + fwd_extract_type = fwd_winner.extract_type; + end +`endif // Block 3: Registered forwarding outputs. // Keep the SQ compare/forwarding result behind a register so the LQ sees it @@ -336,7 +400,7 @@ module sq_forwarding_unit #( end else begin o_sq_all_older_addrs_known <= i_sq_check_valid ? fwd_all_older_known : 1'b0; o_sq_forward.match <= i_sq_check_valid ? fwd_found_match : 1'b0; - o_sq_forward.can_forward <= i_sq_check_valid ? (fwd_found_match && fwd_can_fwd) : 1'b0; + o_sq_forward.can_forward <= i_sq_check_valid ? 
fwd_can_fwd : 1'b0; end case (fwd_extract_type) diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/store_queue.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/store_queue.sv index ffdc8b85..d599d050 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/store_queue.sv +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/store_queue.sv @@ -22,7 +22,7 @@ * them (non-speculative writes). Supports store-to-load forwarding. * * Features: - * - Parameterized depth (8 entries, FF-based) + * - Parameterized depth (8 entries, hybrid FF + LUTRAM; see Storage Strategy) * - CAM-style tag search for address/data update (all entries in parallel) * - In-order commit: head entry writes to memory when committed + ready * - Store-to-load forwarding: combinational scan for LQ disambiguation @@ -1174,14 +1174,22 @@ module store_queue #( always @(posedge i_clk) begin if (i_rst_n) begin if (i_alloc.valid && full) $warning("SQ: allocation attempted when full"); - if (i_alloc.valid && (i_flush_all || i_flush_en)) - $warning("SQ: allocation attempted during flush"); + // Only PARTIAL flush (i_flush_en) is dangerous: there the alloc block in + // the !flush_all else-branch actually LANDS (sets sq_valid, line ~1060). + // i_flush_all is intentionally excluded — its priority else-if branches + // (lines ~859, ~1027) structurally squash the alloc (sq_valid <= '0), a + // documented-safe, formally-proven (p_alloc_slot_free) handshake that the + // RS issues un-flush-gated for timing closure (see note ~line 1263). The + // old (i_flush_all||i_flush_en) form fired ~1178x/run on the benign + // flush_all handshake, burying the genuinely-unsafe flush_en case. 
+ if (i_alloc.valid && i_flush_en && !i_flush_all) + $warning("SQ: allocation attempted during partial flush"); if (i_alloc_2.valid && i_alloc.valid && full_for_2) $warning("SQ: slot-2 alloc attempted when full_for_2 (and slot-1 firing)"); if (i_alloc_2.valid && !i_alloc.valid && full) $warning("SQ: slot-2 alloc attempted alone when full"); - if (i_alloc_2.valid && (i_flush_all || i_flush_en)) - $warning("SQ: slot-2 alloc attempted during flush"); + if (i_alloc_2.valid && i_flush_en && !i_flush_all) + $warning("SQ: slot-2 alloc attempted during partial flush"); if (slot1_alloc_en && slot2_alloc_en && (alloc_target[IdxWidth-1:0] == slot2_alloc_idx)) $error("SQ: slot-1 and slot-2 alloc collide on entry %0d", alloc_target[IdxWidth-1:0]); end @@ -1356,7 +1364,8 @@ module store_queue #( end end - // Forwarding outputs are registered, so they reflect the previous check. + // Forwarding outputs are driven from staged SQ CAM results, so they reflect + // the previous check. always @(posedge i_clk) begin if (f_past_valid && i_rst_n && $past( i_rst_n diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo.f b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo.f index e0718221..fbac7638 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo.f +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo.f @@ -9,6 +9,7 @@ $(ROOT)/hw/rtl/lib/ram/sdp_dist_ram_2r.sv $(ROOT)/hw/rtl/lib/ram/mwp_dist_ram.sv $(ROOT)/hw/rtl/lib/ram/mwp_dist_ram_2r.sv +$(ROOT)/hw/rtl/lib/ram/mwp_dist_ram_ohread.sv # Reorder Buffer $(ROOT)/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/rob_serializer.sv diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/README.md b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/README.md index db64dcab..2b646992 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/README.md +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/README.md @@ -17,7 +17,7 @@ verbatim, so the flattened design is unchanged: | `commit_bus_pipeline` | `commit_bus/` | The four `always_ff` that 
register the combinational ROB commit bus into `commit_bus_q` / `commit_bus_2_q` plus the decomposed `commit_q_*` fields. | | `sq_early_addr_pipeline` | `store_addr/` | The dual-ported early store-address stage (register dispatch base+imm, add the next cycle off the dispatch critical path) that produces the two SQ early-address update packets. | | `dispatch_rs_router` | `dispatch_routing/` | Combinational decode of the dispatch packet(s) into per-RS dispatch-valid signals (slot 1 + slot 2) and the fast slot-1 "intent" signals. | -| `sc_pending_unit` | `atomics/` | Store-conditional resolution: the SC pending-register FSM (set at MEM_RS SC issue, cleared on fire / flush / age), its rob_tag+addr capture, the fire/success decode, and the `sc_fu_complete` packet. | +| `sc_pending_unit` | `atomics/` | Store-conditional resolution: a per-ROB-tag table of in-flight SCs (allocated at MEM_RS SC issue, freed on fire / flush), the head-match fire/success decode, and the `sc_fu_complete` packet. | The per-RS dispatch-valid nets in `dispatch_rs_router` carry `(* max_fanout = 32 *)`; the attribute is preserved both in the submodule and on the wrapper-side @@ -48,7 +48,7 @@ while the entry was queued gets a fresh value. ### SC state machine -The SC pending FSM and its fire/success decode live in +The SC tracking table and its fire/success decode live in `atomics/sc_pending_unit.sv`; the surrounding store-misalign path and MEM-adapter mux described below stay in the wrapper. @@ -61,6 +61,20 @@ result is just `~reservation_valid`. On failure, the wrapper sends a discard signal to the SQ to drop the SC's entry without writing memory. +Several SCs can be in flight at once: a branch-speculated LR/SC retry +loop issues one SC per speculated iteration, and the MEM_RS may issue +them out of program order. 
`sc_pending_unit` therefore tracks every +in-flight SC in a small table keyed by ROB tag (depth `NumCheckpoints ++ 1`) and fires the entry whose tag matches the ROB head; a flush drops +only entries younger than the flush boundary, so a surviving older SC +is never lost. This replaced a single pending register plus a +`!(sc_pending && mem_rs_next_is_sc)` issue-serialization gate in +`mem_rs_fu_ready_base`: under speculation a younger SC could take the +register and the gate would then block the older head SC from issuing +at all, so it never fired and `sc_pending` never cleared — Linux +printk's `_prb_commit` cmpxchg on the cached DDR tier deadlocked +exactly that way. The gate is gone; the table makes concurrent SCs safe. + The `sc_fu_complete` output is registered (`sc_fu_complete_reg`) before feeding the MEM adapter. The combinational path from the full-flush term `speculative_flush_all` (driven by `i_flush_all` / @@ -96,6 +110,20 @@ misprediction-detect path in `cpu_ooo.sv`, and the CDB grants remain combinational so FU adapters can clear their hold registers on the same cycle as a grant. +The registered valid outputs (`o_commit_bus_q_valid`, `o_commit_bus_2_q_valid`) +are additionally masked combinationally with `!i_flush_all`. The valid flops +clear on the flush edge, but downstream consumers still observe the previous +valid value during that same cycle; masking immediately prevents a commit that +overlaps a trap / MRET / FENCE.I full flush from performing one more +architectural side effect while the back-end is being squashed. + +The wrapper also drives the SQ slot-2 combinational commit guard from the raw +head+1 store-commit pulse (`i_commit_valid_comb_2 = commit_2_store_like_raw`, +`i_commit_rob_tag_comb_2 = commit_bus_2.tag`; previously tied to `1'b0`/`'0`). +Slot 2 has the same raw-commit race as slot 1: `commit_bus_2_q_valid` reaches the +SQ one cycle late, so without this a full-flush trap (e.g. 
a machine-timer IRQ) +could observe `sq_committed_empty` and squash a store the SQ has not yet owned. + ### Dispatch routing Dispatch now emits already-routed per-RS packets for slot 1 and slot 2. The diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/atomics/sc_pending_unit.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/atomics/sc_pending_unit.sv index 170ade76..4ebe9536 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/atomics/sc_pending_unit.sv +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/atomics/sc_pending_unit.sv @@ -17,19 +17,32 @@ // ============================================================================= // sc_pending_unit // ============================================================================= -// Extracted verbatim from tomasulo_wrapper.sv (pure RTL boundary move, zero -// functional change). Store-conditional (SC.W) resolution: -// * the SC pending register FSM (set at MEM_RS SC issue, cleared on fire / -// flush / age) and its data capture (rob_tag + address), -// * the combinational fire/success decode, and -// * the sc_fu_complete result packet. -// The store-misalign exception path, the MEM-adapter input mux, and -// lq_result_accepted remain in the wrapper; this unit consumes store_misalign_* -// as inputs and produces sc_pending (visible to dispatch) and sc_fu_complete -// (registered by the wrapper before the MEM adapter). +// Store-conditional (SC.W) resolution. // -// is_younger is duplicated here (it is also used elsewhere in the wrapper, and -// the wrapper comment notes it is identical to the load_queue / RS copies). +// In-flight SCs are tracked in a small table keyed by ROB tag, so the SC that +// reaches the ROB head can ALWAYS fire -- even when an LR/SC retry loop is +// branch-speculated and the core issues several SCs (one per speculated +// iteration) before the oldest resolves. 
A single pending-SC register failed +// here: under speculation the MEM_RS issues SCs out of program order, so a +// younger SC took the one register, and the wrapper's former issue- +// serialization gate (!(sc_pending && mem_rs_next_is_sc)) then blocked the +// OLDER head SC from issuing at all -- so it never fired and the core +// deadlocked. Observed on Linux printk's _prb_commit cmpxchg loop (11 SCs +// issued, 8-deep speculation; head=tag15 never issued, the register held +// tag19). This table pairs with removing that gate (see tomasulo_wrapper.sv). +// BRAM LR/SC resolves before a second SC issues, so BRAM/FreeRTOS were +// unaffected; the longer cached-tier (DDR) latency exposes the overlap. +// +// Two flush rules matter and were both bugs in the single-register version: +// * an SC fires when head_tag matches a VALID entry and the SQ is drained; +// * an entry is cleared on a flush ONLY if it is younger than the flush +// boundary (is_younger) -- NOT unconditionally on partial flush, which +// would drop a surviving older SC. +// Depth = NumCheckpoints + 1 (branch speculation depth bounds concurrent SCs). +// +// The store-misalign exception path, MEM-adapter input mux, and +// lq_result_accepted remain in the wrapper. is_younger is duplicated here +// (identical to the load_queue / RS copies). // ============================================================================= module sc_pending_unit ( input logic i_clk, @@ -56,8 +69,7 @@ module sc_pending_unit ( ); // --------------------------------------------------------------------------- - // Alias input ports back to the wrapper's local names so the bodies below are - // byte-identical to the original tomasulo_wrapper logic. + // Alias input ports back to the wrapper's local names. 
// --------------------------------------------------------------------------- logic [riscv_pkg::ReorderBufferTagWidth-1:0] head_tag; logic sq_committed_empty; @@ -86,12 +98,13 @@ module sc_pending_unit ( assign speculative_flush_en = i_speculative_flush_en; assign speculative_partial_flush = i_speculative_partial_flush; - // SC pending state (rob_tag / addr are internal; sc_pending is also output) - logic sc_pending; - logic [riscv_pkg::ReorderBufferTagWidth-1:0] sc_pending_rob_tag; - logic [riscv_pkg::XLEN-1:0] sc_pending_addr; + // SC tracking table: one entry per in-flight SC, keyed by ROB tag. + localparam int unsigned ScTableDepth = riscv_pkg::NumCheckpoints + 1; + logic [ScTableDepth-1:0] sct_valid; + logic [riscv_pkg::ReorderBufferTagWidth-1:0] sct_tag[ScTableDepth]; + logic [riscv_pkg::XLEN-1:0] sct_addr[ScTableDepth]; - // Age comparison for SC flush guard (identical to load_queue/reservation_station) + // Age comparison for the SC flush guard (identical to load_queue / RS). function automatic logic is_younger(input logic [riscv_pkg::ReorderBufferTagWidth-1:0] entry_tag, input logic [riscv_pkg::ReorderBufferTagWidth-1:0] flush_tag, input logic [riscv_pkg::ReorderBufferTagWidth-1:0] head); @@ -104,70 +117,111 @@ module sc_pending_unit ( end endfunction + // Head match: an in-flight SC sits at the ROB head. + logic sct_hit; + logic [riscv_pkg::XLEN-1:0] sct_hit_addr; + logic [ ScTableDepth-1:0] sct_hit_oh; + always_comb begin + sct_hit = 1'b0; + sct_hit_addr = '0; + sct_hit_oh = '0; + for (int i = 0; i < ScTableDepth; i++) begin + if (sct_valid[i] && (sct_tag[i] == head_tag)) begin + sct_hit = 1'b1; + sct_hit_addr = sct_addr[i]; + sct_hit_oh[i] = 1'b1; + end + end + end + + // First free slot for a newly-issued SC. 
+ logic sct_has_free; + logic [ScTableDepth-1:0] sct_free_oh; + always_comb begin + sct_has_free = 1'b0; + sct_free_oh = '0; + for (int i = 0; i < ScTableDepth; i++) begin + if (!sct_valid[i] && !sct_has_free) begin + sct_has_free = 1'b1; + sct_free_oh[i] = 1'b1; + end + end + end + // Capture an issuing SC. Reject a phantom SC only when it is younger than the + // flush boundary (it is being killed); a real SC that survives the flush must + // be captured even if its issue coincides with the flush window. + logic sct_alloc; + assign sct_alloc = o_mem_rs_issue.valid && !speculative_flush_all && + (o_mem_rs_issue.op == riscv_pkg::SC_W) && + !(speculative_flush_en && is_younger( + o_mem_rs_issue.rob_tag, i_flush_tag, head_tag + )); + logic sc_can_fire; logic sc_success; logic sc_fire_now; - assign sc_can_fire = sc_pending && (sc_pending_rob_tag == head_tag) && sq_committed_empty; + assign sc_can_fire = sct_hit && sq_committed_empty; assign sc_success = lq_reservation_valid - && (lq_reservation_addr[riscv_pkg::XLEN-1:2] == sc_pending_addr[riscv_pkg::XLEN-1:2]); - // Arm SC only when the MEM adapter has no competing same-cycle producer. - // This keeps the rare SC head-tag compare local to the SC register D path; - // the registered completion below owns the MEM adapter on the next cycle. + && (lq_reservation_addr[riscv_pkg::XLEN-1:2] == sct_hit_addr[riscv_pkg::XLEN-1:2]); + // Arm SC only when the MEM adapter has no competing same-cycle producer; the + // registered completion below owns the MEM adapter on the next cycle. assign sc_fire_now = sc_can_fire && !mem_adapter_result_pending && !lq_fu_complete.valid && !store_misalign_issue && !store_misalign_fu_complete_reg.valid; - // SC fu_complete generation + // SC fu_complete generation. The firing SC's tag IS head_tag (it matched). 
riscv_pkg::fu_complete_t sc_fu_complete; always_comb begin sc_fu_complete = '0; sc_fu_complete.valid = sc_fire_now; - sc_fu_complete.tag = sc_pending_rob_tag; + sc_fu_complete.tag = head_tag; sc_fu_complete.value = {{(riscv_pkg::FLEN - 1) {1'b0}}, ~sc_success}; end + // Table valid bits: allocate on SC issue, free on fire, flush younger entries. always_ff @(posedge i_clk) begin - if (!i_rst_n) begin - sc_pending <= 1'b0; - end else if (speculative_flush_all) begin - sc_pending <= 1'b0; + if (!i_rst_n || speculative_flush_all) begin + sct_valid <= '0; end else begin - // Set when MEM_RS issues SC. Gate with flush signals because - // the RS output valid is no longer suppressed during flush for - // timing closure — a phantom SC set during partial flush would - // leave sc_pending stuck (the flushed tag never reaches head). - if (o_mem_rs_issue.valid && !speculative_flush_all && !speculative_flush_en - && (o_mem_rs_issue.op == riscv_pkg::SC_W)) begin - sc_pending <= 1'b1; + // Clear ONLY entries younger than the flush boundary (i_flush_tag) -- i.e. + // actually being flushed. Do NOT clear on speculative_partial_flush alone: + // an SC older than the mispredicted branch (e.g. one still waiting for the + // head to reach it on the slow cached tier) must survive. + if (i_flush_en) begin + for (int i = 0; i < ScTableDepth; i++) begin + if (sct_valid[i] && is_younger(sct_tag[i], i_flush_tag, head_tag)) begin + sct_valid[i] <= 1'b0; + end + end end - // Clear when SC fu_complete is armed for the registered MEM path. + // Free the firing entry. if (sc_fire_now) begin - sc_pending <= 1'b0; + for (int i = 0; i < ScTableDepth; i++) if (sct_hit_oh[i]) sct_valid[i] <= 1'b0; end - // A pending SC is speculative if it is younger than the flush boundary, - // or if recovery is draining everything younger than the current/just- - // retired head. 
- if (i_flush_en && sc_pending && (speculative_partial_flush || is_younger( - sc_pending_rob_tag, i_flush_tag, head_tag - ))) begin - sc_pending <= 1'b0; + // Allocate a newly-issued SC into the first free slot. (Alloc targets a + // free slot; fire/flush clear valid slots, so the indices never collide.) + if (sct_alloc && sct_has_free) begin + for (int i = 0; i < ScTableDepth; i++) if (sct_free_oh[i]) sct_valid[i] <= 1'b1; end end end - // SC data capture (no reset - gated by sc_pending) + // SC tag/addr capture (no reset; gated by the alloc one-hot). always_ff @(posedge i_clk) begin - if (o_mem_rs_issue.valid && !speculative_flush_all && !speculative_flush_en - && (o_mem_rs_issue.op == riscv_pkg::SC_W)) begin - sc_pending_rob_tag <= o_mem_rs_issue.rob_tag; - sc_pending_addr <= sq_effective_addr; + if (sct_alloc && sct_has_free) begin + for (int i = 0; i < ScTableDepth; i++) begin + if (sct_free_oh[i]) begin + sct_tag[i] <= o_mem_rs_issue.rob_tag; + sct_addr[i] <= sq_effective_addr; + end + end end end - assign o_sc_pending = sc_pending; + assign o_sc_pending = |sct_valid; assign o_sc_fu_complete = sc_fu_complete; endmodule diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/commit_bus/commit_bus_pipeline.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/commit_bus/commit_bus_pipeline.sv index 094e27aa..0b900402 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/commit_bus/commit_bus_pipeline.sv +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/commit_bus/commit_bus_pipeline.sv @@ -117,9 +117,13 @@ module commit_bus_pipeline ( commit_q_2_is_store_like <= commit_bus_2.is_store || commit_bus_2.is_fp_store; end - // Drive the output ports from the registered locals. + // Drive the output ports from the registered locals. The flops above clear + // valid on the flush edge, but consumers see the previous valid value during + // that same cycle. 
Mask the qualified valid outputs immediately so a + // commit that overlaps a trap/MRET/FENCE.I full flush cannot perform one + // more architectural side effect while the backend is being squashed. assign o_commit_bus_q = commit_bus_q; - assign o_commit_bus_q_valid = commit_bus_q_valid; + assign o_commit_bus_q_valid = commit_bus_q_valid && !i_flush_all; assign o_commit_q_dest_valid = commit_q_dest_valid; assign o_commit_q_dest_rf = commit_q_dest_rf; assign o_commit_q_dest_reg = commit_q_dest_reg; @@ -128,7 +132,7 @@ module commit_bus_pipeline ( assign o_commit_q_is_store_like = commit_q_is_store_like; assign o_commit_q_sc_failed = commit_q_sc_failed; assign o_commit_bus_2_q = commit_bus_2_q; - assign o_commit_bus_2_q_valid = commit_bus_2_q_valid; + assign o_commit_bus_2_q_valid = commit_bus_2_q_valid && !i_flush_all; assign o_commit_q_2_dest_valid = commit_q_2_dest_valid; assign o_commit_q_2_dest_rf = commit_q_2_dest_rf; assign o_commit_q_2_dest_reg = commit_q_2_dest_reg; diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.f b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.f index 23834886..31b808fc 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.f +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.f @@ -9,6 +9,7 @@ $(ROOT)/hw/rtl/lib/ram/sdp_dist_ram_2r.sv $(ROOT)/hw/rtl/lib/ram/mwp_dist_ram.sv $(ROOT)/hw/rtl/lib/ram/mwp_dist_ram_2r.sv +$(ROOT)/hw/rtl/lib/ram/mwp_dist_ram_ohread.sv # Submodules $(ROOT)/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/rob_serializer.sv diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv index b6980bbe..2bcb4c0a 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv @@ -17,7 +17,7 @@ /* * Tomasulo Integration Wrapper * - * Verification 
wrapper that instantiates ROB + RAT + six RS instances + * Wrapper (instantiated by cpu_ooo) that instantiates ROB + RAT + six RS instances * (INT_RS, MUL_RS, MEM_RS, FP_RS, FMUL_RS, FDIV_RS), LQ, SQ, CDB arbiter, * FU shims, and hardwires the internal commit bus, dispatch routing, * SQ↔LQ forwarding, and shared CDB/flush signals. @@ -29,7 +29,7 @@ * * Internal wiring: * ROB.o_commit_comb --> commit_bus --> cpu_ooo same-cycle mispredict detect - * ROB.o_commit --> o_commit (registered testbench observation) + * ROB.o_commit_comb --> commit_bus --> commit_bus_pipeline --> o_commit * commit_bus_q --> RAT commit-clear signals * FU adapters --> cdb_arbiter --> cdb_bus --> ROB.i_cdb_write (derived) * --> all RS .i_cdb (broadcast for wakeup) @@ -129,6 +129,11 @@ module tomasulo_wrapper #( input logic i_csr_done, output logic o_trap_pending, output logic [riscv_pkg::XLEN-1:0] o_trap_pc, + output logic o_head_is_wfi, + // Retired-next-PC precompute for cpu_ooo's interrupt_resume_pc (see + // reorder_buffer port comment; pure timing restructure). + output logic [riscv_pkg::XLEN-1:0] o_head_retired_next_pc, + output logic [riscv_pkg::XLEN-1:0] o_head_next_retired_next_pc, output riscv_pkg::exc_cause_t o_trap_cause, output logic [riscv_pkg::XLEN-1:0] o_trap_value, input logic i_trap_taken, @@ -136,7 +141,11 @@ module tomasulo_wrapper #( input logic i_mret_done, input logic [riscv_pkg::XLEN-1:0] i_mepc, input logic i_interrupt_pending, - input logic i_trap_misaligned_accesses, + + // Current privilege (PrivM/PrivU), forwarded to the ROB for U-mode + // CSR/MRET illegal-instruction checks. + input logic [1:0] i_priv, + input logic i_trap_misaligned_accesses, // Widen-commit back-pressure: asserted when the downstream slot-2 // retire path can accept a second commit this cycle. 
cpu_ooo ties this @@ -460,10 +469,11 @@ module tomasulo_wrapper #( // // commit_bus_q is a one-cycle pipeline register that breaks the critical // timing path from ROB head_ready/commit_en through SQ/RAT to LQ. - // All internal consumers (RAT, SQ commit, SC logic) use the registered - // version. The valid bit is cleared on full flush for safety — although - // overlapping pipelined commits with flush_all only occurs for non-store - // instructions (traps, MRET, FENCE.I), so SQ/SC are unaffected. + // Internal consumers (RAT, SQ commit, SC logic) use the registered + // version, except the SQ same-cycle flush-race guard, which taps the raw + // ROB commit pulses. The valid bit is cleared on full flush for safety — + // although overlapping pipelined commits with flush_all only occurs for + // non-store instructions (traps, MRET, FENCE.I), so SQ/SC are unaffected. riscv_pkg::reorder_buffer_commit_t commit_bus; // Split commit_bus_q into separate valid + data to prevent Vivado from // dragging the reset net onto payload register bits. 
@@ -632,9 +642,15 @@ module tomasulo_wrapper #( // CDB Arbiter: FU completions → single CDB broadcast // =========================================================================== riscv_pkg::cdb_broadcast_t cdb_bus_comb; // combinational from arbiter - riscv_pkg::cdb_broadcast_t cdb_bus; // registered — feeds RS/ROB wakeup + // registered — feeds RS/ROB wakeup + (* equivalent_register_removal = "no" *)riscv_pkg::cdb_broadcast_t cdb_bus; + // same-cycle INT_RS-local copy + (* equivalent_register_removal = "no" *)riscv_pkg::cdb_broadcast_t cdb_bus_int_rs; riscv_pkg::cdb_broadcast_t cdb_bus_2_comb; // 2-wide CDB lane-1, combinational - riscv_pkg::cdb_broadcast_t cdb_bus_2; // registered lane-1 — feeds RS/ROB wakeup + // registered lane-1 — feeds RS/ROB wakeup + (* equivalent_register_removal = "no" *)riscv_pkg::cdb_broadcast_t cdb_bus_2; + // same-cycle INT_RS-local copy + (* equivalent_register_removal = "no" *)riscv_pkg::cdb_broadcast_t cdb_bus_2_int_rs; // Forward declarations: adapter→arbiter signals (used here, defined below) riscv_pkg::fu_complete_t alu_adapter_to_arbiter; @@ -689,15 +705,22 @@ module tomasulo_wrapper #( // max_fanout forces replication across the RS snoop / ROB-write consumers — // the high-fanout report (609 loads) showed this net being one of the top // drivers into the flush-recovery cone that failed timing at -0.947 ns. 
- (* max_fanout = 32 *) logic cdb_bus_valid; + (* max_fanout = 32 *)logic cdb_bus_valid; + (* equivalent_register_removal = "no", max_fanout = 32 *)logic cdb_bus_int_rs_valid; always_ff @(posedge i_clk) begin if (!i_rst_n) cdb_bus_valid <= 1'b0; else cdb_bus_valid <= cdb_bus_comb.valid; end + always_ff @(posedge i_clk) begin + if (!i_rst_n) cdb_bus_int_rs_valid <= 1'b0; + else cdb_bus_int_rs_valid <= cdb_bus_comb.valid; + end + always_ff @(posedge i_clk) begin cdb_bus <= cdb_bus_comb; + cdb_bus_int_rs <= cdb_bus_comb; end // Expose combinational CDB for testbench observation (grant timing matches) @@ -710,6 +733,16 @@ module tomasulo_wrapper #( cdb_bus_qualified.valid = cdb_bus_valid; end + // INT_RS is physically far from the shared CDB register on Genesys2 and + // snoops many value bits in parallel. Give it an equivalent same-cycle CDB + // register so placement can keep that high-fanout payload local without + // changing wakeup latency. + riscv_pkg::cdb_broadcast_t cdb_bus_int_rs_qualified; + always_comb begin + cdb_bus_int_rs_qualified = cdb_bus_int_rs; + cdb_bus_int_rs_qualified.valid = cdb_bus_int_rs_valid; + end + // Derive ROB CDB write from CDB broadcast riscv_pkg::reorder_buffer_cdb_write_t cdb_write_from_arbiter; always_comb begin @@ -722,19 +755,30 @@ module tomasulo_wrapper #( end // ---- 2-wide CDB lane-1: registered mirror of the lane-0 pipeline above. 
- (* max_fanout = 32 *) logic cdb_bus_2_valid; + (* max_fanout = 32 *)logic cdb_bus_2_valid; + (* equivalent_register_removal = "no", max_fanout = 32 *)logic cdb_bus_2_int_rs_valid; always_ff @(posedge i_clk) begin if (!i_rst_n) cdb_bus_2_valid <= 1'b0; else cdb_bus_2_valid <= cdb_bus_2_comb.valid; end + always_ff @(posedge i_clk) begin + if (!i_rst_n) cdb_bus_2_int_rs_valid <= 1'b0; + else cdb_bus_2_int_rs_valid <= cdb_bus_2_comb.valid; + end always_ff @(posedge i_clk) begin cdb_bus_2 <= cdb_bus_2_comb; + cdb_bus_2_int_rs <= cdb_bus_2_comb; end riscv_pkg::cdb_broadcast_t cdb_bus_2_qualified; always_comb begin cdb_bus_2_qualified = cdb_bus_2; cdb_bus_2_qualified.valid = cdb_bus_2_valid; end + riscv_pkg::cdb_broadcast_t cdb_bus_2_int_rs_qualified; + always_comb begin + cdb_bus_2_int_rs_qualified = cdb_bus_2_int_rs; + cdb_bus_2_int_rs_qualified.valid = cdb_bus_2_int_rs_valid; + end riscv_pkg::reorder_buffer_cdb_write_t cdb_write_from_arbiter_2; always_comb begin cdb_write_from_arbiter_2.valid = cdb_bus_2_valid; @@ -1327,8 +1371,13 @@ module tomasulo_wrapper #( logic mem_rs_fu_ready_base; logic mem_rs_fu_ready; + // Do NOT gate SC issue on (sc_pending && next_is_sc). That single-SC + // serialization deadlocked Linux: under speculation a YOUNGER SC issues + // out-of-order, sets sc_pending, and then this gate blocked the OLDER head SC + // from ever issuing -- so it never fired, sc_pending never cleared, and the + // core hung at _prb_commit. sc_pending_unit now tracks multiple in-flight SCs + // (a table keyed by ROB tag), so several SCs may legitimately be in flight. 
assign mem_rs_fu_ready_base = i_mem_rs_fu_ready && - !(sc_pending && mem_rs_next_is_sc) && !sc_fu_complete_reg.valid && !mem_adapter_result_pending && !i_backend_recovery_hold; @@ -1406,22 +1455,26 @@ module tomasulo_wrapper #( .i_widen_commit_ok (i_widen_commit_ok), // External coordination - .i_sq_empty (o_sq_empty), - .i_sq_committed_empty(sq_committed_empty), - .i_fence_i_sync_done (i_fence_i_sync_done), - .o_fence_i_sync_req (o_fence_i_sync_req), - .o_csr_start (o_csr_start), - .i_csr_done (i_csr_done), - .o_trap_pending (o_trap_pending), - .o_trap_pc (o_trap_pc), - .o_trap_cause (o_trap_cause), - .o_trap_value (o_trap_value), - .i_trap_taken (i_trap_taken), - .o_mret_start (o_mret_start), - .i_mret_done (i_mret_done), - .i_mepc (i_mepc), - .i_interrupt_pending (i_interrupt_pending), - .i_commit_hold (i_commit_hold), + .i_sq_empty (o_sq_empty), + .i_sq_committed_empty (sq_committed_empty), + .i_fence_i_sync_done (i_fence_i_sync_done), + .o_fence_i_sync_req (o_fence_i_sync_req), + .o_csr_start (o_csr_start), + .i_csr_done (i_csr_done), + .o_trap_pending (o_trap_pending), + .o_trap_pc (o_trap_pc), + .o_head_is_wfi (o_head_is_wfi), + .o_head_retired_next_pc (o_head_retired_next_pc), + .o_head_next_retired_next_pc(o_head_next_retired_next_pc), + .o_trap_cause (o_trap_cause), + .o_trap_value (o_trap_value), + .i_trap_taken (i_trap_taken), + .o_mret_start (o_mret_start), + .i_mret_done (i_mret_done), + .i_mepc (i_mepc), + .i_interrupt_pending (i_interrupt_pending), + .i_priv (i_priv), + .i_commit_hold (i_commit_hold), // Flush .i_flush_en(i_flush_en), @@ -1648,8 +1701,8 @@ module tomasulo_wrapper #( .o_full_for_2(int_rs_full_for_2_w), // CDB snoop (from arbiter) - .i_cdb(cdb_bus_qualified), - .i_cdb_2(cdb_bus_2_qualified), + .i_cdb(cdb_bus_int_rs_qualified), + .i_cdb_2(cdb_bus_2_int_rs_qualified), .i_repair_valid_1(int_done_repair_valid_1), .i_repair_tag_1(i_bypass_tag_1), .i_repair_value_1(bypass_value_1), @@ -2652,7 +2705,6 @@ module tomasulo_wrapper #( // 
Effective address: base (src1) + immediate (declared above near SC pending) assign sq_effective_addr = o_mem_rs_issue.src1_value[riscv_pkg::XLEN-1:0] + o_mem_rs_issue.imm; - // MMIO detection: address >= MMIO base logic sq_addr_is_mmio; // MMIO quadrant test; see lq_addr_is_mmio above. assign sq_addr_is_mmio = (sq_effective_addr[31:30] == 2'b01); @@ -2721,12 +2773,12 @@ module tomasulo_wrapper #( .i_commit_valid_comb (commit_store_like_raw), .i_commit_rob_tag_comb(head_tag), - // Slot 2 is always older than any ordinary partial-flush boundary that - // can overlap commit_2_fire, and delayed recovery sees it through the - // registered commit path. Keep the raw head+1 ROB metadata cone out of - // the SQ valid flops. - .i_commit_valid_comb_2 (1'b0), - .i_commit_rob_tag_comb_2('0), + // Slot 2 has the same raw commit race as slot 1 for full-trap drains: + // commit_bus_2_q_valid is still one cycle away from SQ, so a timer IRQ + // must not observe committed-empty and full-flush the entry before SQ + // sees the registered commit. 
+ .i_commit_valid_comb_2 (commit_2_store_like_raw), + .i_commit_rob_tag_comb_2(commit_bus_2.tag), // Store-to-load forwarding (from LQ) .i_sq_check_valid (sq_check_valid), diff --git a/hw/rtl/cpu_and_mem/cpu_and_mem.f b/hw/rtl/cpu_and_mem/cpu_and_mem.f index 5e8abaaa..95cabf80 100644 --- a/hw/rtl/cpu_and_mem/cpu_and_mem.f +++ b/hw/rtl/cpu_and_mem/cpu_and_mem.f @@ -25,5 +25,8 @@ # High-address fetch window provider (two-line L1I buffer) $(ROOT)/hw/rtl/cpu_and_mem/fetch_provider.sv +# On-silicon hang triage (synthesizable boot-hang classifier over UART) +$(ROOT)/hw/rtl/cpu_and_mem/hang_triage.sv + # CPU and memory integration module $(ROOT)/hw/rtl/cpu_and_mem/cpu_and_mem.sv diff --git a/hw/rtl/cpu_and_mem/cpu_and_mem.sv b/hw/rtl/cpu_and_mem/cpu_and_mem.sv index 2578d6f8..9f7ba994 100644 --- a/hw/rtl/cpu_and_mem/cpu_and_mem.sv +++ b/hw/rtl/cpu_and_mem/cpu_and_mem.sv @@ -49,6 +49,9 @@ module cpu_and_mem #( parameter int unsigned L1_CACHE_BYTES = 128 * 1024, parameter int unsigned L1I_CACHE_BYTES = 16 * 1024, parameter int unsigned L2_CACHE_BYTES = 2 * 1024 * 1024, + // Simulation-only fast cache maintenance for fence.i (see frost_cache). + // 0 = FPGA (cycle-accurate maintenance FSM); non-zero = sim fast path. + parameter int unsigned SIM_FAST_MAINT = 0, // Behavioral main-memory model (simulation only; hardware integration // replaces it with the DDR controller behind the same AXI port). parameter int unsigned DDR_MODEL_BYTES = 64 * 1024 * 1024, @@ -60,7 +63,10 @@ module cpu_and_mem #( // provider over the 1-cycle instruction BRAM (LFSR-gated i_instr_valid + // owed-ask tracking). Exercises the core's fetch-invalid machinery // before a real I-cache sits behind it; hardware keeps 0. - parameter int unsigned FETCH_VALID_FUZZ = 0 + parameter int unsigned FETCH_VALID_FUZZ = 0, + // On-silicon boot-hang classifier that can take over the console UART. + // Keep it default-off for normal interactive software and Linux bring-up. 
+ parameter int unsigned ENABLE_HANG_TRIAGE = 0 ) ( input logic i_clk, input logic i_clk_div4, // Divided clock for instruction memory programming @@ -95,7 +101,8 @@ module cpu_and_mem #( input logic i_fifo1_empty, output logic o_fifo1_rd_en, - // External interrupt input (directly triggers MEIP when high) + // External interrupt input (registered +1 cycle and ORed with the + // ns16550 UART IRQ before driving MEIP) input logic i_external_interrupt, // DDR AXI master (cache-hierarchy bridge). Quiescent when @@ -129,7 +136,7 @@ module cpu_and_mem #( // Memory addressing parameters localparam int unsigned MemByteAddrWidth = $clog2(MEM_SIZE_BYTES); - // ((128 KiB total memory)/(4 bytes per word)) = 32k words = 2^15 word address bits + // (MEM_SIZE_BYTES/(4 bytes per word)) words; e.g. 256 KiB -> 64k words = 16 word address bits localparam int unsigned MemWordAddrWidth = MemByteAddrWidth - 2; // Memory-mapped I/O addresses for peripherals @@ -137,7 +144,7 @@ module cpu_and_mem #( // - sw/common/link.ld (MMIO memory region and PROVIDE statements) // - cpu module parameters localparam int unsigned MmioAddr = 32'h4000_0000; - localparam int unsigned MmioSizeBytes = 32'h2C; + localparam int unsigned MmioSizeBytes = 32'h1_C000; // ns16550 @ +0x1000, CLINT @ +0x10000 localparam int unsigned UartMmioAddr = 32'h4000_0000; // UART TX (write-only) localparam int unsigned UartRxDataMmioAddr = 32'h4000_0004; // UART RX data (read consumes byte) localparam int unsigned UartRxStatusMmioAddr = 32'h4000_0024; // RX status (bit0: data available) @@ -152,17 +159,46 @@ module cpu_and_mem #( // Software interrupt register localparam int unsigned MsipMmioAddr = 32'h4000_0020; + // ns16550a UART face for Linux (word-stride; DTB reg-shift=2, reg-io-width=4). + // Aliases the native UART TX/RX. DLAB (LCR[7]) remaps offsets 0/4 to DLL/DLM. 
+ localparam int unsigned Ns16550ThrRbr = 32'h4000_1000; // THR(w)/RBR(r) | DLL when DLAB + localparam int unsigned Ns16550IerDlm = 32'h4000_1004; // IER | DLM when DLAB + localparam int unsigned Ns16550IirFcr = 32'h4000_1008; // IIR(r) / FCR(w) + localparam int unsigned Ns16550Lcr = 32'h4000_100C; + localparam int unsigned Ns16550Mcr = 32'h4000_1010; + localparam int unsigned Ns16550Lsr = 32'h4000_1014; // read-only line status + localparam int unsigned Ns16550Msr = 32'h4000_1018; // read-only modem status + localparam int unsigned Ns16550Scr = 32'h4000_101C; // scratch + + // SiFive CLINT alias for Linux (compatible "sifive,clint0") @ 0x4001_0000. + // These map onto the SAME msip/mtimecmp/mtime registers as the native FROST + // timer block; the kernel reaches the timer through the CLINT layout via DTB. + localparam int unsigned ClintMsip = 32'h4001_0000; // hart-0 software interrupt + localparam int unsigned ClintMtimecmpLo = 32'h4001_4000; // mtimecmp[31:0] + localparam int unsigned ClintMtimecmpHi = 32'h4001_4004; // mtimecmp[63:32] + localparam int unsigned ClintMtimeLo = 32'h4001_BFF8; // mtime[31:0] + localparam int unsigned ClintMtimeHi = 32'h4001_BFFC; // mtime[63:32] + // Timer register defaults // Default mtimecmp to max value so no timer interrupt fires until software configures it localparam logic [63:0] MtimecmpDefault = 64'hFFFF_FFFF_FFFF_FFFF; // CPU interface signals logic [31:0] program_counter; + logic commit_vld; // instruction-retire pulse (hang-triage tap) + // CPU-side UART write, muxed against the hang-triage byte stream further down. 
+ logic cpu_uart_wr_en; + logic [7:0] cpu_uart_wr_data; logic [31:0] fetch_address; // imem port B address (the presented fetch ask) logic [63:0] instruction; // 64-bit fetch: {next_word, current_word} logic [riscv_pkg::ImemFetchSidebandWidth-1:0] instruction_sideband; logic instruction_bank_sel_r; // Fetch-word parity (for spanning select) logic instruction_valid; // Fetch window valid + // Served-window tag for the muxed fetch (drives the if_stage served-window + // guard) and the low-BRAM served address (fetch_address delayed one cycle to + // match the 1-cycle imem read latency). + logic [31:0] instruction_served_addr; + logic [31:0] bram_fetch_served_addr_q; logic fetch_replay_consume; // CPU consumed the stall-replay bundle this cycle logic pipeline_stall; // front-end pipeline stall (gates fetch publish-valid) logic fence_i_sync_req; // ROB serializer holding commit for a fence.i cache sync @@ -229,15 +265,50 @@ module cpu_and_mem #( `endif // Timer registers (CLINT-style) - logic [63:0] mtime; // Machine time counter - logic [63:0] mtimecmp; // Machine timer compare register - logic msip; // Machine software interrupt pending + logic [63:0] mtime; // Machine time counter + logic [63:0] mtimecmp; // Machine timer compare register + logic msip; // Machine software interrupt pending + + // ns16550a UART face register file (8-bit). DLAB = ns_lcr[7]. + logic [7:0] ns_dll, ns_dlm, ns_ier, ns_fcr, ns_lcr, ns_mcr, ns_scr; + logic ns_rx_irq_pending; + logic ns_tx_irq_pending; + logic ns_irq_pending; + logic [7:0] ns_iir; + assign ns_rx_irq_pending = ns_ier[0] && i_uart_rx_valid; + assign ns_tx_irq_pending = ns_ier[1] && i_uart_tx_ready; + assign ns_irq_pending = ns_rx_irq_pending || ns_tx_irq_pending; + always_comb begin + if (ns_rx_irq_pending) ns_iir = 8'hC4; // FIFO enabled, received data available. + else if (ns_tx_irq_pending) ns_iir = 8'hC2; // FIFO enabled, THR empty. + else ns_iir = 8'hC1; // FIFO enabled, no interrupt pending. 
+ end // Interrupt signals to CPU - riscv_pkg::interrupt_t interrupts; - // Clamp unknown external interrupt values to 0 for simulation stability. - // This avoids X-propagation into mip when the top-level input is left un-driven. - assign interrupts.meip = (i_external_interrupt === 1'b1); + riscv_pkg::interrupt_t interrupts; + // External/UART interrupt: REGISTER the aggregate to break the dominant + // post-opt timing spine (uart TX-FIFO CDC read-pointer -> occupancy CARRY + // compare -> i_uart_tx_ready -> ns16550 THRE irq -> meip -> trap_unit / + // ROB-serializer WFI-wake -> commit_en -> retire/trap/SQ endpoints; ~1256 + // failing paths, WNS -1.09 at 300 MHz). The whole combinational compare + // cone now terminates at this flop's D. Mirrors mtip_registered below. + // + // DELIBERATE +1-cycle interrupt-delivery latency (user-approved + // 2026-07-01): meip/THRE/RX are level conditions and a 1-cycle-delayed + // level is architecturally benign; interrupt delivery is not on the + // CoreMark-scored path. Only the interrupt VIEW is registered — the MMIO + // store-drain handshake on i_uart_tx_ready is untouched, and the ns_iir + // register readback stays combinational (matches how a real 8250's IIR + // reflects current conditions when the handler reads it). + // + // The === clamp keeps unknown external-interrupt values from propagating + // into mip when the top-level input is left un-driven in simulation. + logic meip_registered; + always_ff @(posedge i_clk) begin + if (i_rst) meip_registered <= 1'b0; + else meip_registered <= (i_external_interrupt === 1'b1) || ns_irq_pending; + end + assign interrupts.meip = meip_registered; assign interrupts.msip = msip; // Timer interrupt: register the 64-bit comparison result to break critical timing path. 
@@ -251,7 +322,20 @@ module cpu_and_mem #( end assign interrupts.mtip = mtip_registered; - // RISC-V OOO CPU core - Tomasulo out-of-order with RV32IMACBFD + Zicsr + Machine-mode + // mtimecmp MMIO write pulse: a kernel/handler timer re-arm. Used by the hang + // triage as a "timer tick serviced" event tap. + logic mtimecmp_write_pulse; + assign mtimecmp_write_pulse = |data_memory_byte_write_enable_registered && + ((data_memory_address_registered == MtimecmpLowMmioAddr) || + (data_memory_address_registered == MtimecmpHighMmioAddr) || + (data_memory_address_registered == ClintMtimecmpLo) || + (data_memory_address_registered == ClintMtimecmpHi)); + logic [ 5:0] cpu_debug_irq_status; + logic [31:0] cpu_debug_commit_pc; + logic [31:0] cpu_debug_commit_2_pc; + logic [ 1:0] cpu_debug_commit_valid; + + // RISC-V OOO CPU core - Tomasulo out-of-order with RV32IMACBFD + Zicsr + Machine/User-mode cpu_ooo #( .MEM_BYTE_ADDR_WIDTH(MemByteAddrWidth), .MMIO_ADDR(MmioAddr), @@ -265,6 +349,7 @@ module cpu_and_mem #( .i_instr(instruction), .i_instr_sideband(instruction_sideband), .i_instr_bank_sel_r(instruction_bank_sel_r), + .i_served_addr(instruction_served_addr), .i_instr_valid(instruction_valid), .o_fetch_replay_consume(fetch_replay_consume), .o_pipeline_stall(pipeline_stall), @@ -292,11 +377,15 @@ module cpu_and_mem #( .o_mmio_uart_rx_ready_pulse(mmio_uart_rx_ready_pulse), .i_data_mem_rd_data(data_memory_or_peripheral_read_data), .o_rst_done(/*not connected*/), - .o_vld (/*not connected*/), + .o_vld (commit_vld), .o_pc_vld(/*not connected*/), // Interrupt and timer interface .i_interrupts(interrupts), .i_mtime(mtime), + .o_debug_irq_status(cpu_debug_irq_status), + .o_debug_commit_pc(cpu_debug_commit_pc), + .o_debug_commit_2_pc(cpu_debug_commit_2_pc), + .o_debug_commit_valid(cpu_debug_commit_valid), // Branch prediction enabled by default in production .i_disable_branch_prediction(1'b0) ); @@ -344,6 +433,7 @@ module cpu_and_mem #( // still carries valid (preserving the IF 
first-cycle capture); the real // provider's registered stall produces the same 1-cycle lag. assign instruction_valid = fuzz_ok && fuzz_window_ready && !pipeline_stall_q; + assign instruction_served_addr = served_addr_q; assign fuzz_accepted = instruction_valid && !pipeline_stall; // The BRAM chases the owed ask while unserved and the live PC once // serving (the 1-cycle BRAM then keeps the window contract-aligned). @@ -406,6 +496,7 @@ module cpu_and_mem #( logic [63:0] cached_fetch_instr; logic [riscv_pkg::ImemFetchSidebandWidth-1:0] cached_fetch_sideband; logic cached_fetch_bank_sel_r; + logic [31:0] cached_fetch_served_addr; logic cached_fetch_valid; assign fetch_address = program_counter; @@ -434,6 +525,8 @@ module cpu_and_mem #( bram_fetch_sideband; assign instruction_bank_sel_r = fetch_high_valid_q ? cached_fetch_bank_sel_r : bram_fetch_bank_sel_cpu_r; + assign instruction_served_addr = fetch_high_valid_q ? cached_fetch_served_addr : + bram_fetch_served_addr_q; // High-address provider: two-line L1I fetch buffer for cached/DDR code. 
// It no longer drives the low-BRAM address pins; that path stays direct @@ -449,6 +542,7 @@ module cpu_and_mem #( .o_instr(cached_fetch_instr), .o_instr_sideband(cached_fetch_sideband), .o_instr_bank_sel_r(cached_fetch_bank_sel_r), + .o_served_addr(cached_fetch_served_addr), .o_instr_valid(cached_fetch_valid), .o_line_req_valid(iup_req_valid), .i_line_req_ready(iup_req_ready), @@ -464,6 +558,7 @@ module cpu_and_mem #( ); end else begin : gen_fetch_direct assign instruction_valid = 1'b1; + assign instruction_served_addr = bram_fetch_served_addr_q; assign fetch_address = program_counter; assign instruction = bram_fetch_instr; assign instruction_sideband = bram_fetch_sideband; @@ -491,7 +586,7 @@ module cpu_and_mem #( // Port A: Instruction programming (div4 clock, write only) .i_port_a_byte_address(i_instr_mem_addr), .i_port_a_write_data(i_instr_mem_wrdata), - .i_port_a_write_enable(i_instr_mem_en), + .i_port_a_write_enable(i_instr_mem_en && (|i_instr_mem_we)), .o_port_a_read_data( /* unused - write only */), // Port B: Instruction fetch (main clock, read only) .i_port_b_clk(i_clk), @@ -508,6 +603,7 @@ module cpu_and_mem #( // control net. 
always_ff @(posedge i_clk) begin bram_fetch_bank_sel_cpu_r <= fetch_address[2]; + bram_fetch_served_addr_q <= fetch_address; end `ifndef SYNTHESIS @@ -603,7 +699,8 @@ module cpu_and_mem #( .HAS_L2(CACHED_HAS_L2), .L1_CACHE_BYTES(L1_CACHE_BYTES), .L1I_CACHE_BYTES(L1I_CACHE_BYTES), - .L2_CACHE_BYTES(L2_CACHE_BYTES) + .L2_CACHE_BYTES(L2_CACHE_BYTES), + .SIM_FAST_MAINT(SIM_FAST_MAINT) ) cache_hierarchy ( .i_clk(i_clk), .i_rst(i_rst), @@ -824,19 +921,36 @@ module cpu_and_mem #( // Use MA-stage address captured from CPU for MMIO reads unique case (mmio_load_addr) // UART RX data - returns received byte in lower 8 bits (reading consumes byte) - UartRxDataMmioAddr: mmio_read_data_comb = {24'b0, i_uart_rx_data}; + UartRxDataMmioAddr: mmio_read_data_comb = {24'b0, i_uart_rx_data}; // UART RX status - bit 0 indicates data available (non-destructive read) UartRxStatusMmioAddr: mmio_read_data_comb = {31'b0, i_uart_rx_valid}; // UART TX status - bit 0 indicates the TX FIFO can accept at least one byte. UartTxStatusMmioAddr: mmio_read_data_comb = {31'b0, i_uart_tx_ready}; - Fifo0MmioAddr: mmio_read_data_comb = i_fifo0_rd_data; - Fifo1MmioAddr: mmio_read_data_comb = i_fifo1_rd_data; - MtimeLowMmioAddr: mmio_read_data_comb = mtime[31:0]; - MtimeHighMmioAddr: mmio_read_data_comb = mtime[63:32]; - MtimecmpLowMmioAddr: mmio_read_data_comb = mtimecmp[31:0]; + Fifo0MmioAddr: mmio_read_data_comb = i_fifo0_rd_data; + Fifo1MmioAddr: mmio_read_data_comb = i_fifo1_rd_data; + MtimeLowMmioAddr: mmio_read_data_comb = mtime[31:0]; + MtimeHighMmioAddr: mmio_read_data_comb = mtime[63:32]; + MtimecmpLowMmioAddr: mmio_read_data_comb = mtimecmp[31:0]; MtimecmpHighMmioAddr: mmio_read_data_comb = mtimecmp[63:32]; - MsipMmioAddr: mmio_read_data_comb = {31'b0, msip}; - default: ; + MsipMmioAddr: mmio_read_data_comb = {31'b0, msip}; + // ns16550a UART face (aliases native UART TX/RX). DLAB selects DLL/DLM. + Ns16550ThrRbr: mmio_read_data_comb = ns_lcr[7] ? 
{24'b0, ns_dll} : {24'b0, i_uart_rx_data}; + Ns16550IerDlm: mmio_read_data_comb = ns_lcr[7] ? {24'b0, ns_dlm} : {24'b0, ns_ier}; + Ns16550IirFcr: mmio_read_data_comb = {24'b0, ns_iir}; + Ns16550Lcr: mmio_read_data_comb = {24'b0, ns_lcr}; + Ns16550Mcr: mmio_read_data_comb = {24'b0, ns_mcr}; + // LSR: TEMT|THRE from TX-ready (bits 6,5); DR from RX-valid (bit 0). + Ns16550Lsr: + mmio_read_data_comb = {24'b0, 1'b0, i_uart_tx_ready, i_uart_tx_ready, 4'b0, i_uart_rx_valid}; + Ns16550Msr: mmio_read_data_comb = {24'b0, 8'hB0}; // DCD|DSR|CTS asserted + Ns16550Scr: mmio_read_data_comb = {24'b0, ns_scr}; + // SiFive CLINT alias (same registers as the native timer block). + ClintMsip: mmio_read_data_comb = {31'b0, msip}; + ClintMtimecmpLo: mmio_read_data_comb = mtimecmp[31:0]; + ClintMtimecmpHi: mmio_read_data_comb = mtimecmp[63:32]; + ClintMtimeLo: mmio_read_data_comb = mtime[31:0]; + ClintMtimeHi: mmio_read_data_comb = mtime[63:32]; + default: ; endcase end @@ -887,11 +1001,114 @@ module cpu_and_mem #( if (mmio_read_data_valid) data_memory_or_peripheral_read_data = mmio_read_data_reg; end - // write to UART + // write to UART (native 0x4000_0000 TX, or the ns16550 THR at 0x4000_1000 + // when DLAB is clear -- both funnel into the same TX byte stream). + always_ff @(posedge i_clk) begin + cpu_uart_wr_data <= data_memory_write_data_registered[7:0]; // UART uses only lower byte + cpu_uart_wr_en <= |data_memory_byte_write_enable_registered && + ((data_memory_address_registered == UartMmioAddr) || + (data_memory_address_registered == Ns16550ThrRbr && !ns_lcr[7])); + end + + generate + if (ENABLE_HANG_TRIAGE != 0) begin : gen_hang_triage + // On-silicon hang triage: classify a silent boot hang over UART. This is + // intentionally opt-in because it periodically takes over the console. 
+ logic triage_active; + logic triage_wr_en; + logic [ 7:0] triage_wr_data; + logic [31:0] triage_mtime_lo; + logic [31:0] triage_mtime_hi; + logic [31:0] triage_mtimecmp_lo; + logic [31:0] triage_mtimecmp_hi; + logic [31:0] triage_mtimecmp_delta_lo; + logic [31:0] triage_irq_status; + always_ff @(posedge i_clk) begin + if (i_rst) begin + triage_mtime_lo <= 32'd0; + triage_mtime_hi <= 32'd0; + triage_mtimecmp_lo <= 32'd0; + triage_mtimecmp_hi <= 32'd0; + triage_mtimecmp_delta_lo <= 32'd0; + triage_irq_status <= 32'd0; + end else begin + triage_mtime_lo <= mtime[31:0]; + triage_mtime_hi <= mtime[63:32]; + triage_mtimecmp_lo <= mtimecmp[31:0]; + triage_mtimecmp_hi <= mtimecmp[63:32]; + triage_mtimecmp_delta_lo <= mtimecmp[31:0] - mtime[31:0]; + triage_irq_status <= { + 22'd0, + cpu_debug_irq_status[5], + cpu_debug_irq_status[4], + cpu_debug_irq_status[3:2], + cpu_debug_irq_status[1], + cpu_debug_irq_status[0], + interrupts.meip, + interrupts.msip, + interrupts.mtip, + mtip_comparison + }; + end + end + hang_triage u_hang_triage ( + .i_clk (i_clk), + .i_rst (i_rst), + .i_commit (commit_vld), + .i_timer_event (mtimecmp_write_pulse), + .i_cread_req (data_memory_cached_read_enable), + .i_cread_resp (data_memory_cached_read_valid), + .i_cwrite_req (|data_memory_cached_byte_write_enable), + .i_cwrite_done (data_memory_cached_write_done), + .i_pc (program_counter), + .i_commit0_valid (cpu_debug_commit_valid[0]), + .i_commit0_pc (cpu_debug_commit_pc), + .i_commit1_valid (cpu_debug_commit_valid[1]), + .i_commit1_pc (cpu_debug_commit_2_pc), + .i_mtime_lo (triage_mtime_lo), + .i_mtime_hi (triage_mtime_hi), + .i_mtimecmp_lo (triage_mtimecmp_lo), + .i_mtimecmp_hi (triage_mtimecmp_hi), + .i_mtimecmp_delta_lo(triage_mtimecmp_delta_lo), + .i_irq_status (triage_irq_status), + .i_uart_busy (cpu_uart_wr_en), + .i_uart_ready (i_uart_tx_ready), + .o_active (triage_active), + .o_wr_en (triage_wr_en), + .o_wr_data (triage_wr_data) + ); + assign o_uart_wr_en = triage_active ? 
triage_wr_en : cpu_uart_wr_en; + assign o_uart_wr_data = triage_active ? triage_wr_data : cpu_uart_wr_data; + end else begin : gen_no_hang_triage + assign o_uart_wr_en = cpu_uart_wr_en; + assign o_uart_wr_data = cpu_uart_wr_data; + end + endgenerate + + // ns16550a register-file writes. DLAB (LCR[7]) routes offsets 0/4 to the + // baud divisor (DLL/DLM); the THR write itself transmits via o_uart_wr_en. always_ff @(posedge i_clk) begin - o_uart_wr_data <= data_memory_write_data_registered[7:0]; // UART uses only lower byte - o_uart_wr_en <= |data_memory_byte_write_enable_registered && - data_memory_address_registered == UartMmioAddr; + if (i_rst) begin + ns_dll <= 8'h01; + ns_dlm <= 8'h00; + ns_ier <= 8'h00; + ns_fcr <= 8'h00; + ns_lcr <= 8'h00; + ns_mcr <= 8'h00; + ns_scr <= 8'h00; + end else if (|data_memory_byte_write_enable_registered) begin + unique case (data_memory_address_registered) + Ns16550ThrRbr: if (ns_lcr[7]) ns_dll <= data_memory_write_data_registered[7:0]; + Ns16550IerDlm: + if (ns_lcr[7]) ns_dlm <= data_memory_write_data_registered[7:0]; + else ns_ier <= data_memory_write_data_registered[7:0]; + Ns16550IirFcr: ns_fcr <= data_memory_write_data_registered[7:0]; + Ns16550Lcr: ns_lcr <= data_memory_write_data_registered[7:0]; + Ns16550Mcr: ns_mcr <= data_memory_write_data_registered[7:0]; + Ns16550Scr: ns_scr <= data_memory_write_data_registered[7:0]; + default: ; + endcase + end end // FIFO write logic - write to FIFOs when CPU writes to FIFO MMIO addresses @@ -902,11 +1119,23 @@ module cpu_and_mem #( assign o_fifo1_wr_en = |data_memory_byte_write_enable_registered && data_memory_address_registered == Fifo1MmioAddr; + // Linux reads received bytes through the ns16550 RBR alias. That read must + // consume the shared UART RX FIFO just like the native FROST RX-data address, + // but only when DLAB is clear; with DLAB set, offset 0 is DLL. 
+ logic ns16550_rbr_read_pulse; + always_ff @(posedge i_clk) begin + if (i_rst) begin + ns16550_rbr_read_pulse <= 1'b0; + end else begin + ns16550_rbr_read_pulse <= mmio_read_pulse && (mmio_load_addr == Ns16550ThrRbr) && !ns_lcr[7]; + end + end + // FIFO/UART consume pulses fire one cycle after the MMIO read request is // accepted. The response data itself was already captured above. - assign o_fifo0_rd_en = mmio_fifo0_read_pulse; - assign o_fifo1_rd_en = mmio_fifo1_read_pulse; - assign o_uart_rx_ready = mmio_uart_rx_ready_pulse; + assign o_fifo0_rd_en = mmio_fifo0_read_pulse; + assign o_fifo1_rd_en = mmio_fifo1_read_pulse; + assign o_uart_rx_ready = mmio_uart_rx_ready_pulse || ns16550_rbr_read_pulse; // Timer register updates // mtime increments every clock cycle (provides wall-clock time) @@ -918,9 +1147,11 @@ module cpu_and_mem #( // This would cause the non-written half to increment during a write, which is wrong. logic writing_mtime_low, writing_mtime_high; assign writing_mtime_low = |data_memory_byte_write_enable_registered && - (data_memory_address_registered == MtimeLowMmioAddr); + ((data_memory_address_registered == MtimeLowMmioAddr) || + (data_memory_address_registered == ClintMtimeLo)); assign writing_mtime_high = |data_memory_byte_write_enable_registered && - (data_memory_address_registered == MtimeHighMmioAddr); + ((data_memory_address_registered == MtimeHighMmioAddr) || + (data_memory_address_registered == ClintMtimeHi)); always_ff @(posedge i_clk) begin if (i_rst) begin @@ -944,11 +1175,12 @@ module cpu_and_mem #( if (|data_memory_byte_write_enable_registered) begin unique case (data_memory_address_registered) // mtimecmp controls timer interrupt threshold - MtimecmpLowMmioAddr: mtimecmp[31:0] <= data_memory_write_data_registered; - MtimecmpHighMmioAddr: mtimecmp[63:32] <= data_memory_write_data_registered; + MtimecmpLowMmioAddr, ClintMtimecmpLo: mtimecmp[31:0] <= data_memory_write_data_registered; + MtimecmpHighMmioAddr, ClintMtimecmpHi: + 
mtimecmp[63:32] <= data_memory_write_data_registered; // msip controls software interrupt (only bit 0 is writable) - MsipMmioAddr: msip <= data_memory_write_data_registered[0]; - default: ; + MsipMmioAddr, ClintMsip: msip <= data_memory_write_data_registered[0]; + default: ; endcase end end diff --git a/hw/rtl/cpu_and_mem/fetch_provider.sv b/hw/rtl/cpu_and_mem/fetch_provider.sv index 535af9c5..52416061 100644 --- a/hw/rtl/cpu_and_mem/fetch_provider.sv +++ b/hw/rtl/cpu_and_mem/fetch_provider.sv @@ -18,7 +18,7 @@ * fetch_provider -- the variable-latency fetch window provider. * * Serves the high-address side of the core's fetch seam - * ({instr64, sideband16, bank_sel_r} + valid) from a two-line fetch buffer + * ({instr64, sideband24, bank_sel_r} + valid) from a two-line fetch buffer * over the L1I line port. The low instruction BRAM fast path is selected in * cpu_and_mem and drives imem_predecode directly from o_pc; this block never * drives the low-BRAM address pins. Each filled line carries per-word @@ -70,6 +70,11 @@ module fetch_provider #( output logic [63:0] o_instr, output logic [riscv_pkg::ImemFetchSidebandWidth-1:0] o_instr_sideband, output logic o_instr_bank_sel_r, + // Full served-window address (its tag). if_stage uses this to detect a fetch + // stall that left pc_reg outside the served window (>1 word away), which the + // 1-bit bank_sel parity cannot represent -> wrong-word size sample / mid-insn + // pc_reg drift. Observe-only output; does not change fetch behaviour here. + output logic [31:0] o_served_addr, output logic o_instr_valid, // L1I line port (master; read-only -- write/wdata/wstrb tied inactive). 
@@ -221,6 +226,7 @@ module fetch_provider #( assign o_instr = ddr_instr_q; assign o_instr_sideband = ddr_sb_pair_q; assign o_instr_bank_sel_r = bank_sel_q; + assign o_served_addr = served_addr_q; // =========================================================================== // Miss engine: single-outstanding line fills + next-line prefetch diff --git a/hw/rtl/cpu_and_mem/hang_triage.sv b/hw/rtl/cpu_and_mem/hang_triage.sv new file mode 100644 index 00000000..b4396f5a --- /dev/null +++ b/hw/rtl/cpu_and_mem/hang_triage.sv @@ -0,0 +1,364 @@ +/* + * Copyright 2026 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * hang_triage — on-silicon classifier for the silent boot hang. + * + * Trigger: the console UART goes quiet (every hang flavor stops the kernel + * printing). On a quiet stretch it streams ASCII over the UART and re-emits + * periodically so the trajectory is visible: + * + * "\n!!HANG c= t= q= v= w=" + * " l= h= r= s= m=" + * " n= x=" + * " y= d= p=" + * "\nH ...
\n" + * + * c committed instructions climbing => busy-loop; frozen => wedge + * t mtimecmp writes (timer) frozen => timer service stopped + * q/v cached read req/resp q>v frozen => a DDR read never returned + * w cached write {req:done} req>done => a DDR write never landed + * l/h pc_lo..pc_hi PC range executed since last console output + * r/s last retired PCs slot-1 / slot-2 commit PCs + * m/n mtime lo/hi CLINT time at snapshot + * x/y mtimecmp lo/hi CLINT compare at snapshot + * d mtimecmp-mtime low word high bit set usually means compare is overdue + * p irq/status bits: + * [0]=raw mtime>=mtimecmp, [1]=registered MTIP, [2]=MSIP, [3]=MEIP, + * [4]=mie.MTIE, [5]=mstatus.MIE, [7:6]=priv, [8]=trap, [9]=mret + * H PC histogram, 64 buckets of 64 KiB keyed on pc[21:16] (kernel pc[31]=1) + * => cycle-weighted hot region of the livelock (bucket k = 0x8000_0000 + + * k*0x10000). The hottest bucket localizes the spin to a 64 KiB window. + * Buckets saturate at 0xFFFF_FFFF instead of wrapping, and each entry is + * latched before it is printed so its eight hex digits are one coherent value. + * + * Non-latching: any console write resets the quiet timer + PC window.
+ */ +module hang_triage #( + parameter logic [31:0] QUIET_CYCLES = 32'd400_000_000, // ~3 s @133 MHz + parameter logic [31:0] REEMIT_CYCLES = 32'd134_000_000 // ~1 s +) ( + input logic i_clk, + input logic i_rst, + + input logic i_commit, + input logic i_timer_event, + input logic i_cread_req, + input logic i_cread_resp, + input logic i_cwrite_req, + input logic i_cwrite_done, + input logic [31:0] i_pc, + input logic i_commit0_valid, + input logic [31:0] i_commit0_pc, + input logic i_commit1_valid, + input logic [31:0] i_commit1_pc, + input logic [31:0] i_mtime_lo, + input logic [31:0] i_mtime_hi, + input logic [31:0] i_mtimecmp_lo, + input logic [31:0] i_mtimecmp_hi, + input logic [31:0] i_mtimecmp_delta_lo, + input logic [31:0] i_irq_status, + input logic i_uart_busy, + + input logic i_uart_ready, + output logic o_active, + output logic o_wr_en, + output logic [7:0] o_wr_data +); + + // ---- Free-running event counters ------------------------------------------ + logic [31:0] cnt_commit, cnt_timer, cnt_cread_req, cnt_cread_resp; + logic [31:0] cnt_cwrite_req, cnt_cwrite_done; + always_ff @(posedge i_clk) begin + if (i_rst) begin + cnt_commit <= 32'd0; + cnt_timer <= 32'd0; + cnt_cread_req <= 32'd0; + cnt_cread_resp <= 32'd0; + cnt_cwrite_req <= 32'd0; + cnt_cwrite_done <= 32'd0; + end else begin + if (i_commit) cnt_commit <= cnt_commit + 32'd1; + if (i_timer_event) cnt_timer <= cnt_timer + 32'd1; + if (i_cread_req) cnt_cread_req <= cnt_cread_req + 32'd1; + if (i_cread_resp) cnt_cread_resp <= cnt_cread_resp + 32'd1; + if (i_cwrite_req) cnt_cwrite_req <= cnt_cwrite_req + 32'd1; + if (i_cwrite_done) cnt_cwrite_done <= cnt_cwrite_done + 32'd1; + end + end + + // ---- PC histogram: 64 x 64 KiB buckets, kernel PCs only ------------------- + logic [31:0] hist[64]; + logic [5:0] pc_bucket; + assign pc_bucket = i_pc[21:16]; + always_ff @(posedge i_clk) begin + if (i_rst || i_uart_busy) begin + // Clear while the console is active so the histogram reflects ONLY the + //
quiet (hang) window, not the pre-hang boot execution. + for (int b = 0; b < 64; b++) hist[b] <= 32'd0; + end else if (i_pc[31] && (hist[pc_bucket] != 32'hFFFFFFFF)) begin + // Count only kernel-range PCs. Saturate rather than wrap so a bucket that + // stays hot for more than 2^32 cycles cannot roll over and look cold. + hist[pc_bucket] <= hist[pc_bucket] + 32'd1; + end + end + + // ---- Console-idle timer + PC window --------------------------------------- + logic [31:0] quiet_cnt; + logic [31:0] pc_lo, pc_hi; + logic [31:0] last_commit0_pc, last_commit1_pc; + logic win_reset; + always_ff @(posedge i_clk) begin + if (i_rst) begin + quiet_cnt <= 32'd0; + pc_lo <= 32'hFFFFFFFF; + pc_hi <= 32'h00000000; + last_commit0_pc <= 32'd0; + last_commit1_pc <= 32'd0; + end else if (i_uart_busy) begin + quiet_cnt <= 32'd0; + pc_lo <= i_pc; + pc_hi <= i_pc; + if (i_commit0_valid) last_commit0_pc <= i_commit0_pc; + if (i_commit1_valid) last_commit1_pc <= i_commit1_pc; + end else begin + if (quiet_cnt != 32'hFFFFFFFF) quiet_cnt <= quiet_cnt + 32'd1; + if (i_commit0_valid) last_commit0_pc <= i_commit0_pc; + if (i_commit1_valid) last_commit1_pc <= i_commit1_pc; + if (win_reset) begin + pc_lo <= i_pc; + pc_hi <= i_pc; + end else begin + if (i_pc < pc_lo) pc_lo <= i_pc; + if (i_pc > pc_hi) pc_hi <= i_pc; + end + end + end + + // ---- Snapshot ------------------------------------------------------------- + logic [31:0] snap_c, snap_t, snap_q, snap_v, snap_w, snap_l, snap_h, snap_r, snap_s; + logic [31:0] snap_m, snap_n, snap_x, snap_y, snap_d, snap_p; + + // ---- ASCII emit FSM ------------------------------------------------------- + typedef enum logic [2:0] { + EM_IDLE, + EM_PREFIX, + EM_FIELD, + EM_HPRE, + EM_HIST, + EM_GAP + } em_state_e; + em_state_e em_state; + logic [3:0] pcnt; + localparam logic [3:0] FieldLast = 4'd14; + logic [ 3:0] fld; + logic [ 3:0] fpos; + logic [ 5:0] hidx; + logic [ 3:0] hpos; // 0..8 within a hist entry + logic [31:0] reemit_cnt; + // Copy of hist[hidx], latched as each entry starts printing: hist keeps counting + // during the slow UART dump, so a live nibble-by-nibble read could tear on a carry. + logic [31:0] hist_snap; + + assign win_reset = (em_state == EM_IDLE) && (quiet_cnt >= QUIET_CYCLES); + + function automatic logic [7:0] hex4(input logic [3:0] n); + hex4 = (n < 4'd10) ?
(8'h30 + {4'b0, n}) : (8'h41 + {4'b0, n} - 8'd10); + endfunction + + function automatic logic [7:0] prefix_byte(input logic [3:0] i); + case (i) + 4'd0: prefix_byte = 8'h0A; + 4'd1: prefix_byte = "!"; + 4'd2: prefix_byte = "!"; + 4'd3: prefix_byte = "H"; + 4'd4: prefix_byte = "A"; + 4'd5: prefix_byte = "N"; + 4'd6: prefix_byte = "G"; + default: prefix_byte = " "; + endcase + endfunction + + function automatic logic [7:0] label_byte(input logic [3:0] f); + case (f) + 4'd0: label_byte = "c"; + 4'd1: label_byte = "t"; + 4'd2: label_byte = "q"; + 4'd3: label_byte = "v"; + 4'd4: label_byte = "w"; + 4'd5: label_byte = "l"; + 4'd6: label_byte = "h"; + 4'd7: label_byte = "r"; + 4'd8: label_byte = "s"; + 4'd9: label_byte = "m"; + 4'd10: label_byte = "n"; + 4'd11: label_byte = "x"; + 4'd12: label_byte = "y"; + 4'd13: label_byte = "d"; + default: label_byte = "p"; + endcase + endfunction + + logic [31:0] fld_val; + always_comb begin + case (fld) + 4'd0: fld_val = snap_c; + 4'd1: fld_val = snap_t; + 4'd2: fld_val = snap_q; + 4'd3: fld_val = snap_v; + 4'd4: fld_val = snap_w; + 4'd5: fld_val = snap_l; + 4'd6: fld_val = snap_h; + 4'd7: fld_val = snap_r; + 4'd8: fld_val = snap_s; + 4'd9: fld_val = snap_m; + 4'd10: fld_val = snap_n; + 4'd11: fld_val = snap_x; + 4'd12: fld_val = snap_y; + 4'd13: fld_val = snap_d; + default: fld_val = snap_p; + endcase + end + + logic [3:0] nib_idx; + always_comb begin + nib_idx = 4'd0; + if (fpos >= 4'd2 && fpos <= 4'd9) nib_idx = 4'd9 - fpos; + end + + logic [3:0] hnib_idx; + always_comb begin + hnib_idx = 4'd0; + if (hpos <= 4'd7) hnib_idx = 4'd7 - hpos; + end + + logic [7:0] emit_byte; + always_comb begin + emit_byte = 8'h20; + unique case (em_state) + EM_PREFIX: emit_byte = prefix_byte(pcnt); + EM_FIELD: begin + if (fpos == 4'd0) emit_byte = label_byte(fld); + else if (fpos == 4'd1) emit_byte = "="; + else if (fpos == 4'd10) emit_byte = 8'h20; + else emit_byte = hex4(fld_val[nib_idx*4+:4]); + end + EM_HPRE: emit_byte = (pcnt == 4'd0) ?
8'h0A : ((pcnt == 4'd1) ? "H" : " "); + EM_HIST: + emit_byte = (hpos == 4'd8) ? ((hidx == 6'd63) ? 8'h0A : 8'h20) : + hex4(hist_snap[hnib_idx*4+:4]); + default: emit_byte = 8'h20; + endcase + end + + always_ff @(posedge i_clk) begin + if (i_rst) begin + em_state <= EM_IDLE; + pcnt <= 4'd0; + fld <= 4'd0; + fpos <= 4'd0; + hidx <= 6'd0; + hpos <= 4'd0; + reemit_cnt <= 32'd0; + o_active <= 1'b0; + o_wr_en <= 1'b0; + o_wr_data <= 8'd0; + end else begin + o_wr_en <= 1'b0; + case (em_state) + EM_IDLE: begin + if (quiet_cnt >= QUIET_CYCLES) begin + snap_c <= cnt_commit; + snap_t <= cnt_timer; + snap_q <= cnt_cread_req; + snap_v <= cnt_cread_resp; + snap_w <= {cnt_cwrite_req[15:0], cnt_cwrite_done[15:0]}; + snap_l <= pc_lo; + snap_h <= pc_hi; + snap_r <= last_commit0_pc; + snap_s <= last_commit1_pc; + snap_m <= i_mtime_lo; + snap_n <= i_mtime_hi; + snap_x <= i_mtimecmp_lo; + snap_y <= i_mtimecmp_hi; + snap_d <= i_mtimecmp_delta_lo; + snap_p <= i_irq_status; + o_active <= 1'b1; + pcnt <= 4'd0; + em_state <= EM_PREFIX; + end + end + EM_PREFIX: + if (i_uart_ready) begin + o_wr_en <= 1'b1; + o_wr_data <= emit_byte; + if (pcnt == 4'd7) begin + fld <= 4'd0; + fpos <= 4'd0; + em_state <= EM_FIELD; + end else pcnt <= pcnt + 4'd1; + end + EM_FIELD: + if (i_uart_ready) begin + o_wr_en <= 1'b1; + o_wr_data <= emit_byte; + if (fpos == 4'd10) begin + if (fld == FieldLast) begin + pcnt <= 4'd0; + em_state <= EM_HPRE; + end else begin + fld <= fld + 4'd1; + fpos <= 4'd0; + end + end else fpos <= fpos + 4'd1; + end + EM_HPRE: + if (i_uart_ready) begin + o_wr_en <= 1'b1; + o_wr_data <= emit_byte; + if (pcnt == 4'd2) begin + hidx <= 6'd0; + hpos <= 4'd0; + hist_snap <= hist[0]; + em_state <= EM_HIST; + end else pcnt <= pcnt + 4'd1; + end + EM_HIST: + if (i_uart_ready) begin + o_wr_en <= 1'b1; + o_wr_data <= emit_byte; + if (hpos == 4'd8) begin + if (hidx == 6'd63) begin + em_state <= EM_GAP; + reemit_cnt <= REEMIT_CYCLES; + end else begin + hidx <= hidx + 6'd1; + hpos <= 4'd0; + hist_snap <= hist[hidx+6'd1]; + end + end else hpos <= hpos +
4'd1; + end + EM_GAP: begin + o_active <= 1'b0; + if (reemit_cnt <= 32'd1) em_state <= EM_IDLE; + else reemit_cnt <= reemit_cnt - 32'd1; + end + default: em_state <= EM_IDLE; + endcase + end + end + +endmodule : hang_triage diff --git a/hw/rtl/frost.sv b/hw/rtl/frost.sv index 8bb657e3..ffde4255 100644 --- a/hw/rtl/frost.sv +++ b/hw/rtl/frost.sv @@ -54,6 +54,10 @@ module frost #( parameter int unsigned L1_CACHE_BYTES = 128 * 1024, parameter int unsigned L1I_CACHE_BYTES = 16 * 1024, parameter int unsigned L2_CACHE_BYTES = 2 * 1024 * 1024, + // Simulation-only fast cache maintenance for fence.i: 0 = FPGA (cycle- + // accurate maintenance FSM, unchanged); non-zero = sim fast path (see + // frost_cache). Set to 1 only by the cocotb sim build, never for boards. + parameter int unsigned SIM_FAST_MAINT = 0, // Behavioral main-memory model knobs (simulation only). parameter int unsigned DDR_MODEL_BYTES = 64 * 1024 * 1024, parameter int unsigned DDR_MODEL_LATENCY = 30, @@ -62,7 +66,9 @@ module frost #( // them to their DDR controller subsystem). parameter int unsigned USE_BEHAVIORAL_DDR = 1, // Simulation-only fetch-latency fuzz (see cpu_and_mem). Hardware keeps 0. - parameter int unsigned FETCH_VALID_FUZZ = 0 + parameter int unsigned FETCH_VALID_FUZZ = 0, + // Optional on-silicon boot-hang classifier that can emit over UART.
+ parameter int unsigned ENABLE_HANG_TRIAGE = 0 ) ( input logic i_clk, input logic i_clk_div4, @@ -193,10 +199,12 @@ module frost #( .L1_CACHE_BYTES(L1_CACHE_BYTES), .L1I_CACHE_BYTES(L1I_CACHE_BYTES), .L2_CACHE_BYTES(L2_CACHE_BYTES), + .SIM_FAST_MAINT(SIM_FAST_MAINT), .DDR_MODEL_BYTES(DDR_MODEL_BYTES), .DDR_MODEL_LATENCY(DDR_MODEL_LATENCY), .USE_BEHAVIORAL_DDR(USE_BEHAVIORAL_DDR), - .FETCH_VALID_FUZZ(FETCH_VALID_FUZZ) + .FETCH_VALID_FUZZ(FETCH_VALID_FUZZ), + .ENABLE_HANG_TRIAGE(ENABLE_HANG_TRIAGE) ) cpu_and_memory_subsystem ( .i_clk, .i_clk_div4, diff --git a/hw/rtl/lib/cache/axi_behavioral_memory.sv b/hw/rtl/lib/cache/axi_behavioral_memory.sv index 637aed1e..627baeea 100644 --- a/hw/rtl/lib/cache/axi_behavioral_memory.sv +++ b/hw/rtl/lib/cache/axi_behavioral_memory.sv @@ -16,8 +16,8 @@ /* * axi_behavioral_memory -- SIMULATION-ONLY main-memory model (stands in for - * the DDR controller in Phase 1; replaced by the MIG + SmartConnect on - * hardware). AXI4 slave, single-beat 256-bit transactions (asserts on + * the DDR controller in Phase 1; replaced by the board's DDR controller + * (MIG DDR3 / DDR4 IP) + SmartConnect on hardware). AXI4 slave, single-beat 256-bit transactions (asserts on * anything else), parameterized response latency to mimic DDR access time. * * The array is dense and parameter-sized (default 64 MiB) while the DECODED diff --git a/hw/rtl/lib/cache/frost_cache.sv b/hw/rtl/lib/cache/frost_cache.sv index d67bb1b0..b413e7a1 100644 --- a/hw/rtl/lib/cache/frost_cache.sv +++ b/hw/rtl/lib/cache/frost_cache.sv @@ -72,7 +72,17 @@ module frost_cache #( // verilog_lint: waive explicit-parameter-storage-type parameter DATA_MEMORY_PRIMITIVE = "block", parameter int unsigned DATA_READ_LATENCY = 2, - parameter int unsigned DATA_WRITE_LATENCY = 1 + parameter int unsigned DATA_WRITE_LATENCY = 1, + // Simulation-only fast cache maintenance (fence.i). 0 = FPGA: the + // cycle-accurate maintenance FSM below is byte-for-byte unchanged. 
Non-zero + // = simulation: invalidate-all completes in a single cycle (a tag bulk + // clear) and writeback-all iterates only the dirty lines -- O(dirty) rather + // than O(NumLines) -- guided by a sim-only shadow of the dirty bits. The + // functional effect is identical to the slow path: every line is left + // invalid after invalidate-all, and every valid+dirty line is still written + // downstream and marked clean by writeback-all. Threaded in only for the + // cocotb sim build; never set for board/synthesis builds. + parameter int unsigned SIM_FAST_MAINT = 0 ) ( input logic i_clk, input logic i_rst, @@ -156,12 +166,31 @@ module frost_cache #( logic [IndexBits-1:0] flush_idx_q; logic [ TagBits-1:0] flush_tag_q; + // Real-FSM (FPGA) writeback-all acceleration: bound the index walk to the + // [wb_lo_q, wb_hi_q] span of lines made dirty since the last writeback-all, + // instead of scanning all NumLines on every fence.i. Cheap and synthesizable + // (two index regs + a 1-bit "any dirty" flag), unlike the SIM_FAST_MAINT + // shadow's NumLines-bit priority encoder. wb_any_q == 0 means no dirty lines. + logic [IndexBits-1:0] wb_lo_q, wb_hi_q; + logic wb_any_q; + + // Fast maintenance (SIM_FAST_MAINT, simulation only). + // tag_bulk_clear: one-cycle invalidate-all of the whole tag array. + // any_dirty_*/first_dirty_*: lowest dirty line index from the sim-only dirty + // shadow, used to walk only dirty lines during writeback-all. All driven to + // constants when the feature is off, so the FPGA build carries none of it. + logic tag_bulk_clear; + logic any_dirty_full, any_dirty_excl; + logic [IndexBits-1:0] first_dirty_full, first_dirty_excl; + // Writeback-all walk states (data/tag addressing + busy). 
- logic flush_active; + logic flush_active; assign flush_active = (state_q == S_FLUSH_SCAN) || (state_q == S_FLUSH_CHECK) || (state_q == S_FLUSH_DATA) || (state_q == S_FLUSH_WB_REQ) || (state_q == S_FLUSH_WB_WAIT); assign o_maint_busy = flush_active || (state_q == S_SWEEP); + // Fast invalidate-all: hold the tag bulk clear for the (now one-cycle) sweep. + assign tag_bulk_clear = (SIM_FAST_MAINT != 0) && (state_q == S_SWEEP); logic [ 7:0] wait_cnt_q; // data-array latency countdown (latencies are small) logic [ TagBits-1:0] victim_tag_q; logic [ LineBits-1:0] victim_line_q; @@ -197,16 +226,91 @@ module frost_cache #( sdp_block_ram #( .ADDR_WIDTH(IndexBits), - .DATA_WIDTH(TagEntryBits) + .DATA_WIDTH(TagEntryBits), + .SUPPORT_BULK_CLEAR(SIM_FAST_MAINT) ) tag_array ( .i_clk(i_clk), .i_write_enable(tag_we), + .i_bulk_clear(tag_bulk_clear), .i_write_address(tag_waddr), .i_read_address(tag_raddr), .i_write_data(tag_wdata), .o_read_data(tag_rdata) ); + // ---- Fast maintenance dirty shadow (SIM_FAST_MAINT, simulation only) ------ + // A shadow of the tag array's dirty bits, updated by the exact same writes + // that update the tag RAM, so writeback-all can jump straight to dirty lines + // instead of scanning every index. Elaborated only when the feature is on: + // FPGA/synthesis builds carry none of this logic and read the constant + // outputs below. + if (SIM_FAST_MAINT != 0) begin : gen_fast_maint + logic [NumLines-1:0] dirty_shadow_q; + always_ff @(posedge i_clk) begin + if (i_rst) dirty_shadow_q <= '0; + else if (tag_bulk_clear) dirty_shadow_q <= '0; // invalidate-all / reset + // tag_wdata = {valid, dirty, tag}: bit TagBits is the dirty bit. + else if (tag_we) dirty_shadow_q[tag_waddr] <= tag_wdata[TagBits]; + end + + // Lowest set dirty index over the whole shadow (first_dirty_full) and + // excluding the line being written back this cycle (first_dirty_excl). 
The + // scan is gated to the writeback-all states, so ordinary traffic never pays + // for it -- a dirty store just toggles one shadow bit above. + always_comb begin + any_dirty_full = 1'b0; + first_dirty_full = '0; + any_dirty_excl = 1'b0; + first_dirty_excl = '0; + if ((state_q == S_IDLE && i_writeback_all) || flush_active) begin + for (int idx = int'(NumLines) - 1; idx >= 0; idx--) begin + if (dirty_shadow_q[idx]) begin + any_dirty_full = 1'b1; + first_dirty_full = IndexBits'(idx); + if (IndexBits'(idx) != flush_idx_q) begin + any_dirty_excl = 1'b1; + first_dirty_excl = IndexBits'(idx); + end + end + end + end + end + end else begin : gen_no_fast_maint + assign any_dirty_full = 1'b0; + assign first_dirty_full = '0; + assign any_dirty_excl = 1'b0; + assign first_dirty_excl = '0; + end + + // ---- Real-FSM writeback-all dirty-range tracker --------------------------- + // Mirror the dirty-bit writes (tag_we with the dirty bit set, at tag_waddr -- + // i.e. the S_TAG_CHECK write-hit and the S_ALLOC write-allocate) into the + // lowest/highest dirty index. The real (FPGA) writeback-all walk then scans + // only [wb_lo_q, wb_hi_q]. No upstream request is accepted while a walk runs + // (o_up_req_ready is low for the duration), so the span is stable across it. + // real_wb_done is exactly the cycle the real walk returns to S_IDLE; clearing + // the span there is safe because every dirty line in the span has been written + // back and lines outside it were never dirty -> wb_any_q==0 iff no dirty line. 
+ logic dirty_set; + assign dirty_set = tag_we && tag_wdata[TagBits]; + logic real_wb_done; + assign real_wb_done = (SIM_FAST_MAINT == 0) && + ((state_q == S_FLUSH_CHECK && !(tag_rdata_valid && tag_rdata_dirty) && + (!wb_any_q || (flush_idx_q == wb_hi_q))) || + (state_q == S_FLUSH_WB_WAIT && i_down_resp_valid && (flush_idx_q == wb_hi_q))); + + always_ff @(posedge i_clk) begin + if (i_rst || real_wb_done) begin + wb_lo_q <= {IndexBits{1'b1}}; + wb_hi_q <= '0; + wb_any_q <= 1'b0; + end else if (dirty_set) begin + wb_lo_q <= (!wb_any_q || (tag_waddr < wb_lo_q)) ? tag_waddr : wb_lo_q; + wb_hi_q <= (!wb_any_q || (tag_waddr > wb_hi_q)) ? tag_waddr : wb_hi_q; + wb_any_q <= 1'b1; + end + end + // Tag read address: the incoming request's index, sampled at the fire so // the entry is readable in S_TAG_CHECK; the walk index during the // writeback-all scan. Don't-care in every other state. @@ -273,9 +377,14 @@ module frost_cache #( unique case (state_q) S_SWEEP: begin - tag_we = 1'b1; - tag_waddr = sweep_idx_q; - tag_wdata = '0; // valid=0, dirty=0 + // FPGA: clear one tag entry per cycle. Fast (sim): the tag bulk clear + // (tag_bulk_clear -> tag_array.i_bulk_clear) zeroes every entry this + // single cycle, so no per-index write is issued here. + if (SIM_FAST_MAINT == 0) begin + tag_we = 1'b1; + tag_waddr = sweep_idx_q; + tag_wdata = '0; // valid=0, dirty=0 + end end S_TAG_CHECK: begin @@ -353,8 +462,13 @@ module frost_cache #( end else begin unique case (state_q) S_SWEEP: begin - sweep_idx_q <= sweep_idx_q + 1'b1; - if (sweep_idx_q == {IndexBits{1'b1}}) state_q <= S_IDLE; + if (SIM_FAST_MAINT != 0) begin + // Fast: tag_bulk_clear zeroed every entry this cycle -- done. 
+ state_q <= S_IDLE; + end else begin + sweep_idx_q <= sweep_idx_q + 1'b1; + if (sweep_idx_q == {IndexBits{1'b1}}) state_q <= S_IDLE; + end end S_IDLE: begin @@ -364,7 +478,10 @@ module frost_cache #( sweep_idx_q <= '0; state_q <= S_SWEEP; end else if (i_writeback_all) begin - flush_idx_q <= '0; + // Fast: jump straight to the first dirty line (O(dirty) walk). + // FPGA: start the walk at the bottom of the dirty span (0 if the + // cache holds no dirty line -- a single scan cycle then finishes). + flush_idx_q <= (SIM_FAST_MAINT != 0) ? first_dirty_full : (wb_any_q ? wb_lo_q : '0); state_q <= S_FLUSH_SCAN; end else if (up_req_fire) begin req_write_q <= i_up_req_write; @@ -446,7 +563,12 @@ module frost_cache #( wait_cnt_q <= 8'(DATA_READ_LATENCY); flush_tag_q <= tag_rdata_tag; state_q <= S_FLUSH_DATA; - end else if (flush_idx_q == {IndexBits{1'b1}}) begin + end else if (SIM_FAST_MAINT != 0) begin + // Fast: a non-dirty line is only reached when the shadow is empty + // (no dirty lines to start with), so the writeback-all is done. + state_q <= S_IDLE; + end else if (!wb_any_q || (flush_idx_q == wb_hi_q)) begin + // Real FSM: scanned the whole dirty span (or nothing was dirty). state_q <= S_IDLE; end else begin flush_idx_q <= flush_idx_q + 1'b1; @@ -466,7 +588,19 @@ module frost_cache #( S_FLUSH_WB_WAIT: begin if (i_down_resp_valid) begin - if (flush_idx_q == {IndexBits{1'b1}}) begin + // This line's dirty bit is cleared this cycle (combinational tag + // write above), and the sim-only shadow mirrors that clear. + if (SIM_FAST_MAINT != 0) begin + // Fast: jump to the next still-dirty line (excluding this one); + // when none remain the writeback-all is complete. + if (any_dirty_excl) begin + flush_idx_q <= first_dirty_excl; + state_q <= S_FLUSH_SCAN; + end else begin + state_q <= S_IDLE; + end + end else if (flush_idx_q == wb_hi_q) begin + // Real FSM: just wrote back the top dirty line of the span -- done. 
state_q <= S_IDLE; end else begin flush_idx_q <= flush_idx_q + 1'b1; diff --git a/hw/rtl/lib/cache/frost_cache_hierarchy.sv b/hw/rtl/lib/cache/frost_cache_hierarchy.sv index bee1d18a..46f40584 100644 --- a/hw/rtl/lib/cache/frost_cache_hierarchy.sv +++ b/hw/rtl/lib/cache/frost_cache_hierarchy.sv @@ -48,7 +48,12 @@ module frost_cache_hierarchy #( parameter int unsigned L1I_DATA_READ_LATENCY = 2, parameter int unsigned L2_CACHE_BYTES = 2 * 1024 * 1024, parameter int unsigned L2_DATA_READ_LATENCY = 6, - parameter int unsigned L2_DATA_WRITE_LATENCY = 2 + parameter int unsigned L2_DATA_WRITE_LATENCY = 2, + // Simulation-only fast cache maintenance for fence.i (see frost_cache). + // 0 = FPGA cycle-accurate FSM; non-zero = sim fast path. Applied to the two + // L1s -- the only caches that run fence.i maintenance; the L2 sits below the + // arbiter and needs none, so it keeps the default. + parameter int unsigned SIM_FAST_MAINT = 0 ) ( input logic i_clk, input logic i_rst, @@ -134,7 +139,8 @@ module frost_cache_hierarchy #( .LINE_BYTES(LINE_BYTES), .DATA_MEMORY_PRIMITIVE("block"), .DATA_READ_LATENCY(L1_DATA_READ_LATENCY), - .DATA_WRITE_LATENCY(L1_DATA_WRITE_LATENCY) + .DATA_WRITE_LATENCY(L1_DATA_WRITE_LATENCY), + .SIM_FAST_MAINT(SIM_FAST_MAINT) ) l1_cache ( .i_clk(i_clk), .i_rst(i_rst), @@ -164,7 +170,8 @@ module frost_cache_hierarchy #( .CACHE_SIZE_BYTES(L1I_CACHE_BYTES), .LINE_BYTES(LINE_BYTES), .DATA_MEMORY_PRIMITIVE("block"), - .DATA_READ_LATENCY(L1I_DATA_READ_LATENCY) + .DATA_READ_LATENCY(L1I_DATA_READ_LATENCY), + .SIM_FAST_MAINT(SIM_FAST_MAINT) ) l1i_cache ( .i_clk(i_clk), .i_rst(i_rst), diff --git a/hw/rtl/lib/cache/frost_cache_test_harness.sv b/hw/rtl/lib/cache/frost_cache_test_harness.sv index 96973797..ae189128 100644 --- a/hw/rtl/lib/cache/frost_cache_test_harness.sv +++ b/hw/rtl/lib/cache/frost_cache_test_harness.sv @@ -36,7 +36,10 @@ module frost_cache_test_harness #( parameter int unsigned L2_DATA_WRITE_LATENCY = 2, parameter logic [31:0] BASE_ADDR = 
32'h8000_0000, parameter int unsigned MEM_BYTES = 4 * 1024 * 1024, - parameter int unsigned MEM_LATENCY = 12 + parameter int unsigned MEM_LATENCY = 12, + // Simulation-only fast cache maintenance for fence.i (see frost_cache). The + // cocotb cache registry runs this bench with it both off (default) and on. + parameter int unsigned SIM_FAST_MAINT = 0 ) ( input logic i_clk, input logic i_rst, @@ -76,7 +79,8 @@ module frost_cache_test_harness #( .L1I_CACHE_BYTES(L1I_CACHE_BYTES), .L2_CACHE_BYTES(L2_CACHE_BYTES), .L2_DATA_READ_LATENCY(L2_DATA_READ_LATENCY), - .L2_DATA_WRITE_LATENCY(L2_DATA_WRITE_LATENCY) + .L2_DATA_WRITE_LATENCY(L2_DATA_WRITE_LATENCY), + .SIM_FAST_MAINT(SIM_FAST_MAINT) ) cache_hierarchy ( .i_clk(i_clk), .i_rst(i_rst), diff --git a/hw/rtl/lib/ram/mwp_dist_ram_ohread.sv b/hw/rtl/lib/ram/mwp_dist_ram_ohread.sv new file mode 100644 index 00000000..8381edee --- /dev/null +++ b/hw/rtl/lib/ram/mwp_dist_ram_ohread.sv @@ -0,0 +1,134 @@ +/* + * Copyright 2026 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * mwp_dist_ram with a ONE-HOT read select for the Live Value Table. + * + * Identical storage/write semantics to mwp_dist_ram (one sdp_dist_ram bank per + * write port + register LVT, highest-numbered port wins on same-address + * writes).
The difference is purely a TIMING restructure of the read path: + * the caller supplies BOTH the binary read address (still used for the banks' + * LUTRAM address pins, which require binary) AND a registered one-hot image of + * the same address (i_read_onehot). The LVT bank-select lookup — a 32:1 mux + * of registered LVT bits behind a high-fanout binary select in the base + * module — becomes an AND-OR reduction over per-entry one-hot bits: + * + * lvt_read_sel = OR_i (i_read_onehot[i] ? lvt[i] : '0) + * + * CONTRACT (caller invariant): i_read_onehot == (1 << i_read_address) in + * every cycle where o_read_data is consumed. Under that invariant the + * reduction equals lvt[i_read_address] exactly, so o_read_data is + * bit-identical to the base module's. A simulation-only check below fires if + * the invariant is ever violated. + * + * Intended use: the reorder buffer head / head+1 read ports, whose one-hot + * images (head_clear_mask / head_next_clear_mask) are already maintained as + * registers that move in lockstep with head_ptr. + */ +module mwp_dist_ram_ohread #( + parameter int unsigned ADDR_WIDTH = 5, // Address width in bits + parameter int unsigned DATA_WIDTH = 32, // Data width in bits + parameter int unsigned NUM_WRITE_PORTS = 2 // Number of write ports (>= 2) +) ( + input logic i_clk, + + // Write ports (active-high enables, independent addresses and data) + input logic [NUM_WRITE_PORTS-1:0] i_write_enable, + input logic [NUM_WRITE_PORTS-1:0][ADDR_WIDTH-1:0] i_write_address, + input logic [NUM_WRITE_PORTS-1:0][DATA_WIDTH-1:0] i_write_data, + + // Read port (asynchronous / combinational). + // i_read_address feeds the LUTRAM banks (binary); i_read_onehot must be a + // registered one-hot image of the SAME address and steers the LVT select.
+ input logic [ ADDR_WIDTH-1:0] i_read_address, + input logic [2**ADDR_WIDTH-1:0] i_read_onehot, + output logic [ DATA_WIDTH-1:0] o_read_data +); + + localparam int unsigned RamDepth = 2 ** ADDR_WIDTH; + localparam int unsigned SelWidth = $clog2(NUM_WRITE_PORTS); // bits needed to name one write port + + // --------------------------------------------------------------------------- + // RAM bank per write port (identical to mwp_dist_ram) + // --------------------------------------------------------------------------- + logic [NUM_WRITE_PORTS-1:0][DATA_WIDTH-1:0] bank_read_data; + + for (genvar wp = 0; wp < NUM_WRITE_PORTS; wp++) begin : g_banks + sdp_dist_ram #( + .ADDR_WIDTH(ADDR_WIDTH), + .DATA_WIDTH(DATA_WIDTH) + ) u_bank ( + .i_clk, + .i_write_enable (i_write_enable[wp]), + .i_write_address(i_write_address[wp]), + .i_read_address (i_read_address), + .i_write_data (i_write_data[wp]), + .o_read_data (bank_read_data[wp]) + ); + end : g_banks + + // --------------------------------------------------------------------------- + // Live Value Table (register-based, identical write behavior) + // --------------------------------------------------------------------------- + logic [SelWidth-1:0] lvt[RamDepth]; + + initial for (int i = 0; i < RamDepth; ++i) lvt[i] = '0; + + always_ff @(posedge i_clk) begin + for (int wp = 0; wp < NUM_WRITE_PORTS; wp++) begin // ascending: highest port assigns last + if (i_write_enable[wp]) lvt[i_write_address[wp]] <= SelWidth'(wp); + end + end + + // --------------------------------------------------------------------------- + // Read mux — LVT selected via the one-hot AND-OR instead of a binary mux + // --------------------------------------------------------------------------- + logic [SelWidth-1:0] lvt_read_sel; + always_comb begin + lvt_read_sel = '0; // start from the OR identity + for (int i = 0; i < RamDepth; i++) begin + if (i_read_onehot[i]) lvt_read_sel |= lvt[i]; + end + end + + assign o_read_data = bank_read_data[lvt_read_sel]; // bank of the port the LVT recorded + +`ifndef SYNTHESIS +`ifndef FORMAL + // Simulation-only contract check: the one-hot select must mirror
the binary + // read address whenever both are known. A mismatch would silently return + // the wrong bank's data, so treat it as an error. The all-zero case is + // tolerated: it only occurs before the caller's reset has loaded the mask + // register (2-state sims read uninitialized FFs as 0), where it selects + // bank 0 exactly like the base module's initial lvt='0 read would. + // (FORMAL builds exclude this block — yosys cannot elaborate $error in a + // clocked process; the equivalent invariant is proven as + // p_head_mask_onehot / p_head_next_mask_onehot in the reorder_buffer's + // FORMAL section instead.) + always @(posedge i_clk) begin + if (!$isunknown( + i_read_address + ) && !$isunknown( + i_read_onehot + ) && (i_read_onehot != '0) && (i_read_onehot != (RamDepth'(1) << i_read_address))) begin + $error("mwp_dist_ram_ohread: i_read_onehot (0x%0h) != 1 << i_read_address (%0d)", + i_read_onehot, i_read_address); + end + end +`endif +`endif + +endmodule : mwp_dist_ram_ohread diff --git a/hw/rtl/lib/ram/ram.f b/hw/rtl/lib/ram/ram.f index bbf43814..3cd92a74 100644 --- a/hw/rtl/lib/ram/ram.f +++ b/hw/rtl/lib/ram/ram.f @@ -13,6 +13,9 @@ # Two-read-port variant of mwp_dist_ram (shared LVT + banks, two async reads) $(ROOT)/hw/rtl/lib/ram/mwp_dist_ram_2r.sv +# One-hot-read-select variant of mwp_dist_ram (LVT select via registered one-hot) +$(ROOT)/hw/rtl/lib/ram/mwp_dist_ram_ohread.sv + # Simple dual-port block RAM (sync read, sync write) $(ROOT)/hw/rtl/lib/ram/sdp_block_ram.sv diff --git a/hw/rtl/lib/ram/sdp_block_ram.sv b/hw/rtl/lib/ram/sdp_block_ram.sv index 81851060..b50b3d93 100644 --- a/hw/rtl/lib/ram/sdp_block_ram.sv +++ b/hw/rtl/lib/ram/sdp_block_ram.sv @@ -26,10 +26,21 @@ */ module sdp_block_ram #( parameter int unsigned ADDR_WIDTH = 5, // Address width in bits - parameter int unsigned DATA_WIDTH = 32 // Data width in bits + parameter int unsigned DATA_WIDTH = 32, // Data width in bits + // Simulation-only bulk-clear support.
0 (FPGA/synthesis): this module is + // byte-for-byte the plain single-write block RAM -- the clear path is not + // elaborated, so inference is unchanged. Non-zero: a sim-only path lets + // i_bulk_clear zero every entry in one cycle (frost_cache's fast + // invalidate-all). The clear branch lives in a generate that is elaborated + // only when this is set, so no synthesis flow ever sees the array-wide + // reset. + parameter int unsigned SUPPORT_BULK_CLEAR = 0 ) ( input logic i_clk, input logic i_write_enable, + // Sim-only one-cycle clear of every entry (see SUPPORT_BULK_CLEAR). Tied + // low / unused on FPGA builds (SUPPORT_BULK_CLEAR = 0). + input logic i_bulk_clear, input logic [ADDR_WIDTH-1:0] i_write_address, input logic [ADDR_WIDTH-1:0] i_read_address, input logic [DATA_WIDTH-1:0] i_write_data, @@ -42,8 +53,18 @@ module sdp_block_ram #( // Initialize all memory locations to zero initial for (int i = 0; i < RamDepth; ++i) ram[i] = '0; - // Synchronous write operation - always_ff @(posedge i_clk) if (i_write_enable) ram[i_write_address] <= i_write_data; + // Synchronous write. SUPPORT_BULK_CLEAR picks the write block at elaboration: + // the FPGA path is exactly the original single-port write (so block-RAM + // inference is unchanged); the sim-only path adds a one-cycle clear-all that + // takes priority over a write. Only one branch ever exists in a build. 
+ if (SUPPORT_BULK_CLEAR != 0) begin : gen_clearable_write + always_ff @(posedge i_clk) begin + if (i_bulk_clear) for (int i = 0; i < int'(RamDepth); ++i) ram[i] <= '0; + else if (i_write_enable) ram[i_write_address] <= i_write_data; + end + end else begin : gen_plain_write + always_ff @(posedge i_clk) if (i_write_enable) ram[i_write_address] <= i_write_data; + end // Synchronous read - output registered for block RAM inference and timing always_ff @(posedge i_clk) o_read_data <= ram[i_read_address]; diff --git a/hw/rtl/lib/ram/sdp_block_ram_dc.sv b/hw/rtl/lib/ram/sdp_block_ram_dc.sv index b17a6995..efcb7584 100644 --- a/hw/rtl/lib/ram/sdp_block_ram_dc.sv +++ b/hw/rtl/lib/ram/sdp_block_ram_dc.sv @@ -18,7 +18,8 @@ * Dual-clock simple dual-port block RAM for clock domain crossing. * This module implements a block RAM with separate clocks for read and write ports, * enabling safe data transfer between different clock domains. The write port operates - * on i_wr_clk while the read port operates on i_rd_clk, with the block RAM providing + * on i_write_clock while the read port operates on i_read_clock, with the block RAM + * providing * inherent synchronization. Both ports have registered (single-cycle latency) access * to ensure clean timing and proper block RAM inference. 
This module is specifically * designed for use in asynchronous FIFOs where write and read operations occur in diff --git a/hw/rtl/peripherals/uart_rx.sv b/hw/rtl/peripherals/uart_rx.sv index bb62bed8..8f646e72 100644 --- a/hw/rtl/peripherals/uart_rx.sv +++ b/hw/rtl/peripherals/uart_rx.sv @@ -113,7 +113,7 @@ module uart_rx #( STATE_DATA_BITS: begin // Move to stop bit after all 8 data bits received - if (baud_rate_prescaler_counter == 0 && bits_remaining_counter == 0) begin + if (baud_rate_prescaler_counter == 0 && bits_remaining_counter == 1) begin next_state = STATE_STOP_BIT; end end diff --git a/hw/sim/cpu_tb.sv b/hw/sim/cpu_tb.sv index d360a0da..3c474531 100644 --- a/hw/sim/cpu_tb.sv +++ b/hw/sim/cpu_tb.sv @@ -48,16 +48,52 @@ module cpu_tb ); // Internal signals (names match CPU port names for wildcard connection) - logic [31:0] i_instr; // Registered instruction fed to CPU (raw 32-bit for C extension) - logic [1:0] i_instr_sideband; // Predecode: {is_compressed_hi, is_compressed_lo} + // 64-bit fetch window {next_word, current_word} (the CPU fetches a word pair). + logic [63:0] i_instr; + // Per-32-bit-word predecode sideband (ImemSidebandWidth bits each half). 
+ logic [riscv_pkg::ImemFetchSidebandWidth-1:0] i_instr_sideband; + logic i_instr_bank_sel_r; // Fetch-word parity (pc_reg[2]) for the window + logic i_instr_valid; // Fetch window valid (tie 1: fixed 1-cycle provider) + logic [31:0] i_served_addr; // Served fetch-window tag (address fetched last cycle) logic [31:0] i_data_mem_rd_data; // Data memory read data to CPU logic pipeline_stall_from_cpu; // Stall signal monitoring (registered, 1-cycle delay) logic pipeline_stall_comb; // Stall signal (combinational, immediate) logic reset_to_cpu; // Reset signal monitoring - logic o_mmio_read_pulse; // Unused in testbench; required for CPU .* connection - logic [31:0] o_mmio_load_addr; // Unused in testbench; required for CPU .* connection - logic o_mmio_load_valid; // Unused in testbench; required for CPU .* connection - logic o_pipeline_stall; // Unused in testbench; required for CPU .* connection + + // Registered 1-cycle fetch state (mimics block-RAM instruction memory latency) + logic [31:0] tb_cur_word; // current fetch word presented to the CPU + logic tb_bank_sel_q; // parity (PC[2]) of the fetched address + logic [31:0] tb_served_addr_q; // address whose window is presented (o_pc, 1 cycle back) + localparam logic [31:0] TbNop = 32'h0000_0013; // addi x0,x0,0 + + // Ports below are unused by this instruction-feed testbench but must exist as + // local signals so the wildcard (.*) connection to cpu_ooo resolves. 
+ logic o_mmio_read_pulse; + logic [31:0] o_mmio_load_addr; + logic o_mmio_load_valid; + logic o_mmio_fifo0_read_pulse; + logic o_mmio_fifo1_read_pulse; + logic o_mmio_uart_rx_ready_pulse; + logic o_pipeline_stall; + logic o_fetch_replay_consume; + // FENCE.I cache-sync handshake (no I-cache here; completed immediately below) + logic o_fence_i_sync_req; + logic i_fence_i_sync_done; + logic o_fence_i_flush; + // Cached (high-address) tier request outputs + response inputs (tied idle: + // the directed programs touch only the low BRAM range, never CACHED_BASE). + logic [3:0] o_data_mem_cached_byte_wr_en; + logic [31:0] o_data_mem_cached_wr_data; + logic o_data_mem_cached_read_enable; + logic [31:0] i_cached_read_data; + logic i_cached_read_valid; + logic i_cached_write_done; + logic i_cached_write_inflight; + // Debug taps (read from cocotb via device_under_test.*; also exposed here). + logic [5:0] o_debug_irq_status; + logic [31:0] o_debug_commit_pc; + logic [31:0] o_debug_commit_2_pc; + logic [1:0] o_debug_commit_valid; // Interrupt and timer signals for CPU (controllable from testbench) // Use reg type to allow testbench to drive values via force/deposit @@ -81,14 +117,42 @@ module cpu_tb always_ff @(posedge i_clk) begin // Stall signal from CPU observed on next rising edge pipeline_stall_from_cpu <= device_under_test.pipeline_ctrl.stall; - // Mimic one cycle read latency of block RAM instruction memory port - i_instr <= instruction_from_testbench; - // Compute sideband: {is_compressed_hi, is_compressed_lo} - // A halfword is compressed when its low 2 bits != 2'b11 - i_instr_sideband[0] <= (instruction_from_testbench[1:0] != 2'b11); - i_instr_sideband[1] <= (instruction_from_testbench[17:16] != 2'b11); + // Mimic one cycle read latency of block RAM instruction memory port: the + // word for the address requested on o_pc this cycle is presented next cycle. 
+ tb_cur_word <= instruction_from_testbench; + tb_bank_sel_q <= o_pc[2]; // parity of the fetched address + tb_served_addr_q <= o_pc; // served-window tag: the address fetched last cycle end + // 64-bit fetch window {next_word, current_word}. The testbench feeds only + // 32-bit, 4-byte-aligned instructions (no compressed, no halfword spanning), + // so the "next word" half is never consumed (spanning only fires at pc[1]); + // drive a NOP there. + assign i_instr = {TbNop, tb_cur_word}; + // Per-word predecode sideband, computed by the same pure function the RTL + // fetch path uses (riscv_pkg::imem_make_sideband; no lookahead). + assign i_instr_sideband = { + riscv_pkg::imem_make_sideband(TbNop), riscv_pkg::imem_make_sideband(tb_cur_word) + }; + // bank_sel_r == pc_reg[2] => aligned: current word taken from i_instr[31:0]. + assign i_instr_bank_sel_r = tb_bank_sel_q; + // Served-window tag: this fixed 1-cycle provider always presents the window + // for last cycle's o_pc, so the tag is exactly that registered address (the + // if_stage served-window guard sees a window that always covers pc_reg). + assign i_served_addr = tb_served_addr_q; + // Fixed 1-cycle provider: the fetch window is always valid. + assign i_instr_valid = 1'b1; + + // FENCE.I cache-sync handshake completes immediately (no I-cache here; the + // directed programs never issue FENCE.I, so o_fence_i_sync_req stays low). + assign i_fence_i_sync_done = o_fence_i_sync_req; + + // Cached (high-address) tier response inputs tied inactive (tier unused). 
+ assign i_cached_read_data = '0; + assign i_cached_read_valid = 1'b0; + assign i_cached_write_done = 1'b0; + assign i_cached_write_inflight = 1'b0; + // Memory addressing parameters localparam int unsigned MemByteAddrWidth = $clog2(MEM_SIZE_BYTES); localparam int unsigned MemWordAddrWidth = MemByteAddrWidth - 2; diff --git a/linux/buildroot b/linux/buildroot new file mode 160000 index 00000000..67449130 --- /dev/null +++ b/linux/buildroot @@ -0,0 +1 @@ +Subproject commit 67449130e9fdd71a38ca26539dddfa8c882b1977 diff --git a/linux/buildroot-external/Config.in b/linux/buildroot-external/Config.in new file mode 100644 index 00000000..4de49724 --- /dev/null +++ b/linux/buildroot-external/Config.in @@ -0,0 +1,18 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The FROST external tree provides no extra target packages today; the kernel, +# toolchain and rootfs are all selected by configs/frost_nommu_rv32_defconfig. +# Add `source "$BR2_EXTERNAL_FROST_PATH/package//Config.in"` lines here if +# FROST-specific Buildroot packages are introduced later. 
diff --git a/linux/buildroot-external/README.md b/linux/buildroot-external/README.md new file mode 100644 index 00000000..341970c8 --- /dev/null +++ b/linux/buildroot-external/README.md @@ -0,0 +1,157 @@ + + +# FROST Buildroot external tree (`BR2_EXTERNAL`) + +Reproducibly builds the FROST **RV32 / no-MMU / M-mode Linux** kernel (6.18.7), +a busybox initramfs, and packages them into the memory images the FROST cocotb +`linux_boot` simulation (and the FPGA JTAG loader) consume. + +This is a standard Buildroot [`BR2_EXTERNAL`](https://buildroot.org/downloads/manual/manual.html#outside-br-custom) +tree. It carries **no** Buildroot source itself — point an out-of-tree build at +a pinned upstream Buildroot checkout (see *Buildroot pin* below). + +## Layout + +``` +linux/buildroot-external/ +├── external.desc # BR2_EXTERNAL manifest (name: FROST) +├── external.mk # package include hook (no packages today) +├── Config.in # package menu hook (empty today) +├── configs/ +│ └── frost_nommu_rv32_defconfig # the FROST Buildroot defconfig +└── board/frost/ + ├── linux-nommu-base.config # base kernel config (from buildroot board/qemu/riscv32-virt) + ├── linux-nommu-frost.config.fragment # FROST kernel CONFIG delta, merged on top of the base + ├── frost-nommu-fpga.dts # reference DTB source (the packer regenerates it per build) + ├── build_fpga_boot.py # packer: Image + DTB + initramfs -> sw.{mem,txt}, sw_ddr.{mem,txt} + ├── post-image.sh # Buildroot post-image hook -> runs the packer + └── patches/linux/linux.hash # sha256 for the custom linux-6.18.7 tarball +``` + +## Buildroot pin + +Buildroot is vendored as a submodule at `linux/buildroot`, pinned to the exact +commit **`67449130`** (a `2026.08-git` snapshot). That commit provides the +defaults this defconfig relies on: **gcc 15.2.0**, **binutils 2.45.1**, the +internal rv32-nommu **uClibc** toolchain, and the **Linux 6.18** host-headers +option. 
The pin is the exact commit rather than a release tag so the build is +reproducible regardless of tag movement. + +A fresh checkout only needs the submodule initialized: + +```bash +git submodule update --init linux/buildroot +``` + +To bump the pin, check out the new commit in the submodule and commit the +updated gitlink: + +```bash +git -C linux/buildroot checkout <new-commit> +git add linux/buildroot +git commit -m "linux: bump vendored buildroot to <new-commit>" +``` + +> Re-verify a bump ships `BR2_GCC_VERSION_15_X` (15.2.0), +> `BR2_BINUTILS_VERSION_2_45_X` (2.45.1) and +> `BR2_PACKAGE_HOST_LINUX_HEADERS_CUSTOM_6_18`, which this defconfig relies on. + +## Build + +Out-of-tree build (keeps the Buildroot submodule pristine): + +```bash +# from the repo root +make -C linux/buildroot O="$(pwd)/linux/build" \ + BR2_EXTERNAL="$(pwd)/linux/buildroot-external" frost_nommu_rv32_defconfig +make -C linux/buildroot O="$(pwd)/linux/build" +``` + +First build is ~30–60 min (it builds the cross toolchain from source). Outputs +land in `linux/build/images/`: + +| File | Purpose | +|---|---| +| `Image` | rv32 no-MMU kernel (flat, uncompressed) | +| `rootfs.cpio.gz` | busybox initramfs | +| `frost-nommu-fpga.dtb` | generated FROST device tree (UART/CLINT @ 0x4000_xxxx, 133.333 MHz) | +| `sw.mem` / `sw.txt` | low-BRAM boot shim (`a0=0`, `a1=DTB`, jump to kernel) | +| `sw_ddr.mem` / `sw_ddr.txt` | DDR image: kernel @ 0x8000_0000, DTB @ 0x8080_0000, initramfs @ 0x8081_0000 | + +## Feeding the cocotb `linux_boot` test + +`tests/test_run_cocotb.py` resolves an app's images at +`sw/apps/<app>/sw.mem` (+ `sw_ddr.mem`).
Stage the build outputs there: + +```bash +mkdir -p sw/apps/linux_boot +cp linux/build/images/sw.mem sw/apps/linux_boot/sw.mem +cp linux/build/images/sw_ddr.mem sw/apps/linux_boot/sw_ddr.mem +# then, per the repo CLAUDE.md test flow: +cd tests && make clean && ./test_run_cocotb.py linux_boot +``` + +Or let the app Makefile self-build straight from this tree (it runs the whole +Buildroot build if `linux/build/images/Image` is absent, then packs for the +board clock) -- this is what `fpga/load_software/load_software.py <board> +linux_boot` and the +CI `build-frost-linux` job drive: + +```bash +make -C sw/apps/linux_boot # genesys2 clock (133.33 MHz) by default +make -C sw/apps/linux_boot FPGA_CPU_CLK_FREQ=300000000 # x3 clock +``` + +The `linux_boot` cocotb registry entry (`linux_boot` / `linux_boot_128k`) and +its `build-frost-linux` + `linux-boot-cocotb` + `linux-boot-qemu` CI jobs live +on this branch (`nommu_linux`); they reach `main` when the branch merges. + +## How the kernel config is assembled + +`BR2_LINUX_KERNEL_USE_CUSTOM_CONFIG` uses `board/frost/linux-nommu-base.config` +as the base, and `BR2_LINUX_KERNEL_CONFIG_FRAGMENT_FILES` merges +`board/frost/linux-nommu-frost.config.fragment` on top (kconfig +`merge_config.sh` semantics). The fragment retargets the known-good QEMU-virt +nommu kernel at FROST: it keeps M-mode / rv32 / no-MMU / bFLT, switches the +rootfs to an initramfs (`BLK_DEV_INITRD` + `RD_GZIP`), and drops +virtio / PCI / net / ext2 / PLIC. See the header of the fragment for the full, +per-symbol rationale and the hardware caveats. + +## Notes, assumptions and gaps + +- **Rootfs reproduction.** `rootfs.cpio.gz` is reproduced from Buildroot's + default busybox (`busybox-minimal.config`) + `BR2_TARGET_ROOTFS_CPIO[_GZIP]`, + not vendored. It is functionally equivalent to the hand-made + `frost-artifacts/rootfs.cpio.gz` but **not** byte-identical. Add a + `rootfs-overlay/` + `BR2_ROOTFS_OVERLAY` here if a specific userspace is + required.
+- **Fragment vs. the latest hand-built Image.** This defconfig *applies* the + FROST fragment (per the build notes' "Option A"). The most recent artifact + `Image` checked on the dev box was actually built from the **stock** + `qemu_riscv32_nommu_virt_defconfig` *without* the fragment (it still had + `CONFIG_NET` / `CONFIG_VIRTIO_BLK` / `CONFIG_SIFIVE_PLIC` / `CONFIG_EXT2_FS` + set). Decide whether the fragment-applied kernel here is the intended target + (it should be — it is strictly closer to FROST and the generated DTB has no + PLIC/virtio nodes) or whether to drop the fragment to match that artifact + bit-for-bit. +- **Boot shim toolchain.** Standalone, the packer uses the xPack + `riscv-none-elf-*` bare-metal toolchain (`rv32i_zicsr` / `ilp32`). In CI + `post-image.sh` instead uses the Buildroot-built `riscv32-*-` toolchain with + its own default `-march`/`-mabi` (the shim is ABI-agnostic integer code). +- **`dtc`.** `post-image.sh` prefers `$HOST_DIR/bin/dtc`, then the kernel's + `scripts/dtc/dtc`, then `$PATH`. Enable `BR2_PACKAGE_HOST_DTC=y` if you want + to guarantee a host `dtc` independent of the kernel build. diff --git a/linux/buildroot-external/board/frost/.gitignore b/linux/buildroot-external/board/frost/.gitignore new file mode 100644 index 00000000..d3be67cb --- /dev/null +++ b/linux/buildroot-external/board/frost/.gitignore @@ -0,0 +1,8 @@ +# Generated by build_fpga_boot.py (standalone runs default FROST_OUTDIR here). +# The Buildroot/CI flow writes these into $BINARIES_DIR instead, not the tree. 
+/frost-nommu-fpga.dts +/frost-nommu-fpga.dtb +/sw.mem +/sw.txt +/sw_ddr.mem +/sw_ddr.txt diff --git a/linux/buildroot-external/board/frost/build_fpga_boot.py b/linux/buildroot-external/board/frost/build_fpga_boot.py new file mode 100755 index 00000000..064575e1 --- /dev/null +++ b/linux/buildroot-external/board/frost/build_fpga_boot.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python3 + +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Vendored from frost-artifacts/build_fpga_boot.py; style carve-outs pending a refactor. +# ruff: noqa: D103, UP031 + +"""Build a FROST FPGA / sim no-MMU Linux boot image. + +Derived from frost-artifacts/build_fpga_boot.py. The packing logic (memory +layout, word format, DTB template and boot shim) is unchanged; the only +additions are environment overrides so the script runs both: + + * standalone on a dev box (xPack riscv-none-elf toolchain, original paths), and + * as a Buildroot post-image hook in CI (board/frost/post-image.sh sets the + env to point at Buildroot's $BINARIES_DIR and its just-built toolchain). + +Emits BOTH forms of each image: + sw.{mem,txt} low BRAM: boot shim (a0=0, a1=DTB, jr kernel entry). + sw_ddr.{mem,txt} DDR (offset 0 == 0x8000_0000): kernel Image @ 0, + DTB @ 0x80_0000, initramfs (cpio.gz) @ 0x81_0000. + + .mem = $readmemh form (sim): "@" directives + word values. 
+ .txt = FPGA-loader form: dense, one little-endian word value per line from + offset 0 (file_to_bram.tcl / file_to_ddr.tcl burst it sequentially). +Both carry identical little-endian word values. + +Environment overrides (all optional; defaults reproduce the standalone build): + FROST_IMAGE kernel Image path (default: ~/bigger_l0/linux-mvp/buildroot/output/images/Image) + FROST_INITRD rootfs.cpio.gz path (default: