From e07b9c6a09ea31e6bc04317a2a9a9f42b953e0fd Mon Sep 17 00:00:00 2001 From: Adam Bagley Date: Thu, 18 Jun 2026 21:41:46 -0400 Subject: [PATCH 01/43] update csr_file misa to advertise C and D extension support --- hw/rtl/cpu_and_mem/cpu/csr/csr_file.sv | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hw/rtl/cpu_and_mem/cpu/csr/csr_file.sv b/hw/rtl/cpu_and_mem/cpu/csr/csr_file.sv index 73acfdc8..b92f3da7 100644 --- a/hw/rtl/cpu_and_mem/cpu/csr/csr_file.sv +++ b/hw/rtl/cpu_and_mem/cpu/csr/csr_file.sv @@ -169,10 +169,10 @@ module csr_file #( logic [XLEN-1:0] mip; assign mip = {20'b0, i_interrupts.meip, 3'b0, i_interrupts.mtip, 3'b0, i_interrupts.msip, 3'b0}; - // misa is read-only: RV32IMAFB - // Bit 0 (A), Bit 1 (B), Bit 5 (F), Bit 8 (I), Bit 12 (M) = 0x0000_1123 + // misa is read-only: RV32IMAFDC + B (= RV32GCB) + // Bit 0 (A), Bit 1 (B), Bit 2 (C), Bit 3 (D), Bit 5 (F), Bit 8 (I), Bit 12 (M) = 0x0000_112F // MXL = 1 (32-bit) in bits [31:30] - localparam logic [XLEN-1:0] MisaValue = 32'h4000_1123; + localparam logic [XLEN-1:0] MisaValue = 32'h4000_112F; // Output CSRs for trap unit assign o_mstatus = mstatus; From 6b80304cbfad6dfab72d36e865b96cb899bf077b Mon Sep 17 00:00:00 2001 From: Adam Bagley Date: Thu, 18 Jun 2026 22:44:54 -0400 Subject: [PATCH 02/43] Add RISC-V User mode (U-mode) to the core The core was Machine-mode only. Add User privilege (M+U; no Supervisor, MMU, or PMP) so an M-mode kernel can run user processes in U-mode. - riscv_pkg: ExcEcallUmode (cause 8); PrivU/PrivM encodings; mstatus MPP/MPRV field positions. - csr_file: current-privilege register (resets to M); mstatus.MPP as a live WARL field {M,U} (was hardwired M) plus mstatus.MPRV. Trap entry saves the privilege to MPP and enters M; MRET returns to MPP, sets MPP=U, and clears MPRV when returning below M. misa advertises U (0x4010_112F). New o_priv output. 
- trap_unit: machine interrupts are taken while in U-mode regardless of mstatus.MIE, so the timer can preempt user code. - cpu_ooo: commit-time ECALL cause select (8 from U, 11 from M) into mcause. MPRV is intentionally inert (no PMP/MMU); trapping illegal CSR/MRET access from U-mode is intentionally deferred. M-mode behavior is unchanged (the interrupt-enable reduces to mstatus.MIE in M). Verified: hello_world cocotb smoke passes. --- hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv | 17 +++++-- hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv | 29 +++++++++++- hw/rtl/cpu_and_mem/cpu/csr/csr_file.sv | 50 +++++++++++++++++---- hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv | 8 ++++ 4 files changed, 90 insertions(+), 14 deletions(-) diff --git a/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv b/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv index 089e4583..d67b1038 100644 --- a/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv +++ b/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv @@ -96,6 +96,10 @@ module trap_unit #( // Direct MIE bit input keeps mstatus bit extraction out of this path. input logic i_mstatus_mie_direct, + // Current privilege mode. Machine interrupts are taken whenever running + // below M (priv != PrivM) regardless of mstatus.MIE (RISC-V privileged spec). + input logic [1:0] i_priv, + // Interrupt pending inputs input riscv_pkg::interrupt_t i_interrupts, @@ -143,11 +147,16 @@ module trap_unit #( else trap_taken_prev <= o_trap_taken; end - // Interrupt pending and enabled (gate by !trap_taken_prev to prevent re-entry) + // Interrupt pending and enabled (gate by !trap_taken_prev to prevent re-entry). + // Global M-interrupt enable: mstatus.MIE while in M, but ALWAYS enabled while + // running below M (priv != PrivM) so a machine timer/SW/ext interrupt can + // preempt U-mode even with MIE=0 (RISC-V privileged spec). 
+ logic m_int_globally_enabled; + assign m_int_globally_enabled = mstatus_mie || (i_priv != riscv_pkg::PrivM); logic meip_enabled, mtip_enabled, msip_enabled; - assign meip_enabled = i_interrupts.meip && mie_meie && mstatus_mie && !trap_taken_prev; - assign mtip_enabled = i_interrupts.mtip && mie_mtie && mstatus_mie && !trap_taken_prev; - assign msip_enabled = i_interrupts.msip && mie_msie && mstatus_mie && !trap_taken_prev; + assign meip_enabled = i_interrupts.meip && mie_meie && m_int_globally_enabled && !trap_taken_prev; + assign mtip_enabled = i_interrupts.mtip && mie_mtie && m_int_globally_enabled && !trap_taken_prev; + assign msip_enabled = i_interrupts.msip && mie_msie && m_int_globally_enabled && !trap_taken_prev; // TIMING OPTIMIZATION: Register interrupt_pending to break critical path. // The combinational path from msip -> interrupt_pending -> take_trap -> stall -> cache diff --git a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv index 20fb096e..1a472549 100644 --- a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv +++ b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv @@ -840,6 +840,8 @@ module cpu_ooo #( logic trap_mret_commit_hold_q; logic [XLEN-1:0] rob_trap_pc; riscv_pkg::exc_cause_t rob_trap_cause; + riscv_pkg::exc_cause_t rob_trap_cause_remapped; + logic [1:0] csr_priv; // current privilege from csr_file (PrivM/PrivU) logic [XLEN-1:0] rob_trap_value; logic rob_trap_taken_ack; logic mret_start, mret_done_ack; @@ -1858,6 +1860,25 @@ module cpu_ooo #( endcase end + // ECALL cause is privilege-dependent (U-mode = 8, M-mode = 11). The FU shim + // tags every ECALL as ExcEcallMmode (it has no architectural privilege), so + // remap at commit using the current privilege. csr_file writes this to mcause + // -- the load-bearing path. 
It is also fed to trap_unit.i_exception_cause for + // symmetry, though FROST does not vector mtvec on synchronous-exception causes + // (only interrupts vector) and trap_unit's own o_trap_cause is unused. The + // csr_trap_value (mtval) mux above intentionally keeps the ORIGINAL cause + // (ECALL mtval is 0 either way). + // + // SAFE against the cause==11 / IntMachineExternal (0x8000_000B) low-bit + // collision: rob_trap_cause carries synchronous-exception causes ONLY (ROB + // o_trap_cause = head_exc_cause; the ROB's i_interrupt_pending is WFI-wakeup + // only, never a cause source), so a value of 11 here is unambiguously an + // M-mode ECALL. + assign rob_trap_cause_remapped = + ((rob_trap_cause == riscv_pkg::ExcEcallMmode[riscv_pkg::ExcCauseWidth-1:0]) && + (csr_priv == riscv_pkg::PrivU)) ? + riscv_pkg::ExcEcallUmode[riscv_pkg::ExcCauseWidth-1:0] : rob_trap_cause; + csr_file #( .XLEN(XLEN) ) csr_file_inst ( @@ -1875,7 +1896,7 @@ module cpu_ooo #( .i_mtime(i_mtime), .i_trap_taken(trap_taken), .i_trap_pc(rob_trap_pc), - .i_trap_cause({{(XLEN - $bits(rob_trap_cause)) {1'b0}}, rob_trap_cause}), + .i_trap_cause({{(XLEN - $bits(rob_trap_cause_remapped)) {1'b0}}, rob_trap_cause_remapped}), .i_trap_value(csr_trap_value), .i_mret_taken(mret_taken), .o_mstatus(csr_mstatus), @@ -1883,6 +1904,7 @@ module cpu_ooo #( .o_mtvec(csr_mtvec), .o_mepc(csr_mepc), .o_mstatus_mie_direct(csr_mstatus_mie_direct), + .o_priv(csr_priv), // FP flags: accumulated from ROB commit .i_fp_flags(rob_commit_fp_flags_merged), .i_fp_flags_valid(rob_commit_any_fp_flags_valid), @@ -1935,10 +1957,13 @@ module cpu_ooo #( .i_mtvec(csr_mtvec), .i_mepc(csr_mepc), .i_mstatus_mie_direct(csr_mstatus_mie_direct), + .i_priv(csr_priv), .i_interrupts(i_interrupts), // Exception from ROB commit .i_exception_valid(trap_pending), - .i_exception_cause({{(XLEN - $bits(rob_trap_cause)) {1'b0}}, rob_trap_cause}), + .i_exception_cause({ + {(XLEN - $bits(rob_trap_cause_remapped)) {1'b0}}, rob_trap_cause_remapped + }), 
.i_exception_tval('0), .i_exception_pc(rob_trap_pc), .i_mret_start(mret_start), diff --git a/hw/rtl/cpu_and_mem/cpu/csr/csr_file.sv b/hw/rtl/cpu_and_mem/cpu/csr/csr_file.sv index b92f3da7..6734859f 100644 --- a/hw/rtl/cpu_and_mem/cpu/csr/csr_file.sv +++ b/hw/rtl/cpu_and_mem/cpu/csr/csr_file.sv @@ -91,6 +91,11 @@ module csr_file #( // Direct output of mstatus MIE bit for timing and simpler consumers. output logic o_mstatus_mie_direct, + // Current privilege mode (PrivM/PrivU): consumed by trap_unit (interrupt + // enable while in U) and the commit-time ECALL cause select. Changes only + // on trap entry and MRET. + output logic [1:0] o_priv, + // F extension: FP exception flags from FPU (to accumulate in fflags) input riscv_pkg::fp_flags_t i_fp_flags, input logic i_fp_flags_valid, // Valid when FP instruction retires (gated by o_vld) @@ -140,8 +145,14 @@ module csr_file #( // do not require read/modify/write of the full CSR word. logic mstatus_mie; // Machine Interrupt Enable (bit 3) logic mstatus_mpie; // Machine Previous Interrupt Enable (bit 7) - logic [XLEN-1:0] mstatus; // Constructed from mie and mpie - assign mstatus = {19'b0, 2'b11, 3'b0, mstatus_mpie, 3'b0, mstatus_mie, 3'b0}; + logic [ 1:0] mstatus_mpp; // Previous Privilege [12:11]; WARL {PrivM,PrivU} + logic mstatus_mprv; // Modify PRiV (bit 17); stored but inert (no PMP/MMU) + logic [ 1:0] priv_q; // Current privilege mode (resets to PrivM) + logic [XLEN-1:0] mstatus; // Constructed from the fields above + assign mstatus = { + 14'b0, mstatus_mprv, 4'b0, mstatus_mpp, 3'b0, mstatus_mpie, 3'b0, mstatus_mie, 3'b0 + }; + assign o_priv = priv_q; // mie CSR: store each interrupt enable as separate register logic mie_msie; // Machine Software Interrupt Enable (bit 3) @@ -153,6 +164,9 @@ module csr_file #( // Next-state signals for mstatus bits (computed combinationally) logic next_mstatus_mie; logic next_mstatus_mpie; + logic [1:0] next_mstatus_mpp; + logic next_mstatus_mprv; + logic [1:0] next_priv; // 
Next-state signals for mie bits logic next_mie_msie; logic next_mie_mtie; @@ -169,10 +183,11 @@ module csr_file #( logic [XLEN-1:0] mip; assign mip = {20'b0, i_interrupts.meip, 3'b0, i_interrupts.mtip, 3'b0, i_interrupts.msip, 3'b0}; - // misa is read-only: RV32IMAFDC + B (= RV32GCB) - // Bit 0 (A), Bit 1 (B), Bit 2 (C), Bit 3 (D), Bit 5 (F), Bit 8 (I), Bit 12 (M) = 0x0000_112F + // misa is read-only: RV32IMAFDC + B + U (= RV32GCB with User mode) + // Bit 0 (A), Bit 1 (B), Bit 2 (C), Bit 3 (D), Bit 5 (F), Bit 8 (I), Bit 12 (M), + // Bit 20 (U) = 0x0010_112F // MXL = 1 (32-bit) in bits [31:30] - localparam logic [XLEN-1:0] MisaValue = 32'h4000_112F; + localparam logic [XLEN-1:0] MisaValue = 32'h4010_112F; // Output CSRs for trap unit assign o_mstatus = mstatus; @@ -318,22 +333,35 @@ module csr_file #( // Default: keep current values next_mstatus_mie = mstatus_mie; next_mstatus_mpie = mstatus_mpie; + next_mstatus_mpp = mstatus_mpp; + next_mstatus_mprv = mstatus_mprv; + next_priv = priv_q; next_mie_msie = mie_msie; next_mie_mtie = mie_mtie; next_mie_meie = mie_meie; if (i_trap_taken) begin - // Trap entry: save MIE to MPIE, clear MIE + // Trap entry: save MIE->MPIE, clear MIE, save priv->MPP, enter M-mode. next_mstatus_mpie = mstatus_mie; next_mstatus_mie = 1'b0; + next_mstatus_mpp = priv_q; + next_priv = riscv_pkg::PrivM; end else if (i_mret_taken) begin - // MRET: restore MIE from MPIE, set MPIE to 1 + // MRET: restore MIE<-MPIE, MPIE=1, return to MPP's privilege, set MPP=U, + // and clear MPRV if returning below M (per the privileged spec). 
next_mstatus_mie = mstatus_mpie; next_mstatus_mpie = 1'b1; + next_priv = mstatus_mpp; + if (mstatus_mpp != riscv_pkg::PrivM) next_mstatus_mprv = 1'b0; + next_mstatus_mpp = riscv_pkg::PrivU; end else if (i_csr_write_enable && i_csr_read_enable) begin if (i_csr_address == riscv_pkg::CsrMstatus) begin - next_mstatus_mie = csr_new_value[3]; + next_mstatus_mie = csr_new_value[3]; next_mstatus_mpie = csr_new_value[7]; + // MPP is WARL: FROST implements only M and U, so fold S/reserved -> U. + next_mstatus_mpp = (csr_new_value[12:11] == riscv_pkg::PrivM) ? + riscv_pkg::PrivM : riscv_pkg::PrivU; + next_mstatus_mprv = csr_new_value[17]; end else if (i_csr_address == riscv_pkg::CsrMie) begin next_mie_msie = csr_new_value[3]; next_mie_mtie = csr_new_value[7]; @@ -348,12 +376,18 @@ module csr_file #( if (i_rst) begin mstatus_mie <= 1'b0; mstatus_mpie <= 1'b0; + mstatus_mpp <= riscv_pkg::PrivU; + mstatus_mprv <= 1'b0; + priv_q <= riscv_pkg::PrivM; mie_msie <= 1'b0; mie_mtie <= 1'b0; mie_meie <= 1'b0; end else begin mstatus_mie <= next_mstatus_mie; mstatus_mpie <= next_mstatus_mpie; + mstatus_mpp <= next_mstatus_mpp; + mstatus_mprv <= next_mstatus_mprv; + priv_q <= next_priv; mie_msie <= next_mie_msie; mie_mtie <= next_mie_mtie; mie_meie <= next_mie_meie; diff --git a/hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv b/hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv index eb8b0893..863bb158 100644 --- a/hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv +++ b/hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv @@ -475,6 +475,13 @@ package riscv_pkg; // mstatus bit positions (RV32) localparam int unsigned MstatusMieBit = 3; // Machine Interrupt Enable localparam int unsigned MstatusMpieBit = 7; // Machine Previous Interrupt Enable + // mstatus.MPP occupies [12:11]; mstatus.MPRV is bit 17 (RV32). + localparam int unsigned MstatusMppLo = 11; + localparam int unsigned MstatusMprvBit = 17; + + // Privilege modes (RISC-V encoding). FROST implements Machine and User only. 
+ localparam logic [1:0] PrivU = 2'b00; + localparam logic [1:0] PrivM = 2'b11; // mie/mip bit positions localparam int unsigned MieMsiBit = 3; // Machine Software Interrupt @@ -486,6 +493,7 @@ package riscv_pkg; localparam bit [31:0] ExcBreakpoint = 32'd3; localparam bit [31:0] ExcLoadAddrMisalign = 32'd4; localparam bit [31:0] ExcStoreAddrMisalign = 32'd6; + localparam bit [31:0] ExcEcallUmode = 32'd8; localparam bit [31:0] ExcEcallMmode = 32'd11; // Interrupt cause codes (mcause values when interrupt bit = 1) From 1838c02064a6aa8d3a65c042e56a4f211c587939 Mon Sep 17 00:00:00 2001 From: Adam Bagley Date: Thu, 18 Jun 2026 23:05:26 -0400 Subject: [PATCH 03/43] docs: reflect M+U privilege model (U-mode addition) The core now implements Machine (M) and User (U) privilege modes (was M-mode only; still no S-mode/MMU/PMP). Sync docs and descriptive comments to match the U-mode commit. - READMEs (root, hw/rtl, cpu, verif, sw) + CONTRIBUTING: M-mode -> M/U privilege wording; root extensions table gains a User Mode row; CONTRIBUTING future-work list no longer implies S-mode is supported. - RTL header/doc-block comments (csr_file, trap_unit, riscv_pkg, cpu_and_mem, instr_decoder): M/U privilege; csr_file notes mstatus.MPP WARL {M,U}, inert MPRV, and misa 0x4010_112F. - sw headers (trap.h, csr.h), FreeRTOSConfig.h, __init__.py: M/U wording; trap.h notes ECALL cause 8 (U) / 11 (M); csr.h notes MSTATUS_MPP WARL. Comments/docs only; no functional, test, or test-skip changes. 
--- CONTRIBUTING.md | 4 ++-- README.md | 7 ++++--- __init__.py | 2 +- hw/rtl/README.md | 4 ++-- hw/rtl/cpu_and_mem/cpu/README.md | 2 +- hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv | 5 ++++- hw/rtl/cpu_and_mem/cpu/csr/csr_file.sv | 8 ++++---- hw/rtl/cpu_and_mem/cpu/id_stage/instr_decoder.sv | 2 +- hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv | 4 ++-- hw/rtl/cpu_and_mem/cpu_and_mem.sv | 2 +- sw/README.md | 2 +- sw/apps/freertos_demo/FreeRTOSConfig.h | 2 +- sw/lib/include/csr.h | 2 +- sw/lib/include/trap.h | 8 ++++---- verif/README.md | 2 +- 15 files changed, 30 insertions(+), 26 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d93c99da..36a48e62 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -20,7 +20,7 @@ This document provides guidelines for contributors. The detailed style sections ## Project Overview -FROST is an out-of-order RISC-V processor implementing **RV32GCB** (G = IMAFD) with a Tomasulo back-end and full machine-mode privilege support. Understanding the architecture helps you contribute effectively: +FROST is an out-of-order RISC-V processor implementing **RV32GCB** (G = IMAFD) with a Tomasulo back-end and Machine + User (M/U) privilege modes. 
Understanding the architecture helps you contribute effectively: ### Architecture Outline @@ -577,7 +577,7 @@ We welcome contributions in these areas: |------|----------| | Bug fixes | OOO ordering, instruction encoding, timing issues | | ISA extensions | Additional standard or custom extensions | -| Privilege modes | S-mode (supervisor), U-mode (user) support | +| Privilege modes | S-mode (supervisor), PMP, virtual memory (M and U modes already supported) | | Board support | New FPGA boards, SoC integrations | | Performance | Branch predictor, scheduler, memory-system, or cache improvements | | Peripherals | SPI, I2C, GPIO, timers | diff --git a/README.md b/README.md index 553a607e..6b01ff01 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ **F**PGA **R**ISC-V **O**pen-sourced in **S**ystemVerilog by **T**woSigma -An out-of-order RISC-V processor implementing **RV32GCB** (G = IMAFD) with a Tomasulo back-end and full machine-mode privilege support for RTOS operation. Achieves 300 MHz on UltraScale+. Designed for FPGA deployment with clean, portable SystemVerilog. +An out-of-order RISC-V processor implementing **RV32GCB** (G = IMAFD) with a Tomasulo back-end and Machine + User (M/U) privilege modes for RTOS operation. Achieves 300 MHz on UltraScale+. Designed for FPGA deployment with clean, portable SystemVerilog. ## Why FROST? @@ -55,7 +55,7 @@ There are many RISC-V cores. Here's what makes FROST different: │ │ │ ┌──────────────────────────┐ ┌─────────────────────────────────────┐ │ │ │ Trap Unit │ │ Peripherals │ │ -│ │ (M-mode, mret, wfi, │ │ UART, mtime/mtimecmp, FIFO0/1 │ │ +│ │ (M/U traps, mret, wfi, │ │ UART, mtime/mtimecmp, FIFO0/1 │ │ │ │ interrupts, exceptions) │ │ │ │ │ └──────────────────────────┘ └─────────────────────────────────────┘ │ │ │ @@ -82,6 +82,7 @@ There are many RISC-V cores. 
Here's what makes FROST different: | **Zbkb** | Bit manipulation for crypto | | **Zihintpause** | Pause hint for spin-wait loops | | **Machine Mode** | M-mode privilege (mret, wfi, ecall, ebreak) | +| **User Mode** | U-mode privilege (ecall traps to M-mode) | ### Architecture Highlights @@ -96,7 +97,7 @@ There are many RISC-V cores. Here's what makes FROST different: - **Two-tier branch recovery** — conditional-branch mispredictions use a fast ~2-cycle path (front-end redirect + RAT restore in the same cycle); JALR and exceptions take the slower commit-time path - **Branch prediction** with a 256-entry 2-bit BTB (trained for conditional branches and JAL, with slot-2 lookup support), 1024-entry bimodal direction predictor, 8-entry return address stack, and PD-stage computed-target redirects for conditional BTB misses predicted taken - **L0 cache** in front of the load queue reduces load-use latency (direct-mapped, write-through) -- **M-mode trap handling** for RTOS support (interrupts and exceptions) +- **Machine + User (M/U) privilege modes** for RTOS support — traps from both modes are taken in M-mode (interrupts and exceptions) - **CLINT-compatible timer** (mtime/mtimecmp) for preemptive scheduling - **Harvard architecture** with separate instruction and data memory ports - **Write-back cache hierarchy over DDR** — a 1 GiB cached region at `0x8000_0000` served by recursive line-port caches (`frost_cache`: direct-mapped, 32 B lines, write-back/write-allocate). Both instruction fetch (a 16 KiB read-only L1I) and data (a 128 KiB L1D) run through it on every board — so code can execute from DDR, not just from low BRAM — sharing a 2:1 line-port arbiter (data-side priority), plus a 2 MiB UltraRAM L2 spliced in on UltraScale+, over the board's DDR (DDR3 on Genesys2, DDR4 on X3) through a single-beat AXI bridge diff --git a/__init__.py b/__init__.py index 094cb92f..55a30449 100644 --- a/__init__.py +++ b/__init__.py @@ -15,7 +15,7 @@ """FROST - RISC-V processor package. 
This package contains a complete RV32GCB (G = IMAFD) RISC-V processor -implementation with full machine-mode support and additional extensions +implementation with Machine (M) and User (U) privilege modes and additional extensions (Zicsr, Zicntr, Zifencei, Zicond, Zbkb, and Zihintpause), along with verification infrastructure, build tools, and software libraries. diff --git a/hw/rtl/README.md b/hw/rtl/README.md index ddb83852..8ba90844 100644 --- a/hw/rtl/README.md +++ b/hw/rtl/README.md @@ -4,7 +4,7 @@ This directory contains the synthesizable SystemVerilog for FROST. The current CPU is an **out-of-order RV32GCB implementation with a 2-wide front-end and 2-wide commit**: a 2-wide in-order IF/PD/ID front-end, Tomasulo register renaming and dynamic scheduling, out-of-order execution across six function units, and -precise 2-wide in-order commit, with machine-mode traps and separate +precise 2-wide in-order commit, with M/U-mode traps and separate instruction/data memory ports. The pipeline width is **asymmetric**. Fetch, decode, rename, ROB allocation, @@ -88,7 +88,7 @@ backend notes. 
| `cpu_and_mem/cpu/csr/` | In use | Zicsr/Zicntr/fcsr support | | `cpu_and_mem/cpu/wb_stage/generic_regfile.sv` | In use | Parameterized INT/FP regfiles for OOO commit | | `cpu_and_mem/cpu/ex_stage/` | In use | Shared ALU, multiplier/divider, FPU, and `branch_jump_unit.sv` used by the OOO core and FU shims | -| `cpu_and_mem/cpu/control/trap_unit.sv` | In use | Machine-mode exception/interrupt handling | +| `cpu_and_mem/cpu/control/trap_unit.sv` | In use | M- and U-mode exception/interrupt handling (traps taken in M-mode) | | `lib/` | In use | Portable RAM/FIFO/stall helper primitives, plus `lib/cache/` (the `frost_cache` hierarchy, AXI bridge, and behavioral DDR model) and `lib/ram/sdp_ram_byte_en.sv` (row-granular byte-enable RAM with a selectable block/ultra primitive backing the cache data arrays) | | `peripherals/` | In use | UART TX/RX blocks | diff --git a/hw/rtl/cpu_and_mem/cpu/README.md b/hw/rtl/cpu_and_mem/cpu/README.md index 4f3e8166..2532d352 100644 --- a/hw/rtl/cpu_and_mem/cpu/README.md +++ b/hw/rtl/cpu_and_mem/cpu/README.md @@ -104,7 +104,7 @@ instruction size. | `if_stage/`, `pd_stage/`, `id_stage/` | **In use** | Reused front-end stages, including BTB/direction/RAS prediction, PD BTB-miss redirects, and RVC handling. IF now drives a stall-capable, variable-latency fetch seam (NOP bubbles + a 1-deep owed-ask while unserved) so code can run from the cached DDR region as well as low BRAM; the seam's `fetch_provider` (low-BRAM fast path vs. a two-line L1I fetch buffer with predecode-on-fill) lives one level up in `cpu_and_mem/`. | | `wb_stage/` | **In use** | Only the parameterized regfile is in the OOO build (instantiated twice for INT / FP). | | `csr/` | **In use** | Zicsr / Zicntr / fcsr. CSR ops are decoded in ID but read and write the CSR at commit through the ROB serializing FSM. | -| `control/trap_unit.sv` | **In use** | Machine-mode exception/interrupt handling used by `cpu_ooo.sv`. 
| +| `control/trap_unit.sv` | **In use** | M- and U-mode exception/interrupt handling (traps taken in M-mode) used by `cpu_ooo.sv`. | | `ex_stage/` | **In use** | `branch_jump_unit.sv` is instantiated directly at top level. ALU/MUL/DIV/FPU are used via the FU shims in `tomasulo/fu_shims/`. | `cpu_ooo.f` is the authoritative filelist for what actually gets compiled. diff --git a/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv b/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv index d67b1038..ae6014de 100644 --- a/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv +++ b/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv @@ -15,10 +15,13 @@ */ /* - * Trap Unit - Machine-mode exception and interrupt handling + * Trap Unit - exception and interrupt handling * * This module implements the RISC-V privileged architecture trap mechanism, * supporting both synchronous exceptions and asynchronous interrupts. + * Traps originate from M-mode or U-mode and are always taken in M-mode (mtvec). + * Machine interrupts are taken while running in U-mode regardless of mstatus.MIE, + * so the timer can preempt user code. * * Responsibilities: * ================= diff --git a/hw/rtl/cpu_and_mem/cpu/csr/csr_file.sv b/hw/rtl/cpu_and_mem/cpu/csr/csr_file.sv index 6734859f..4c2d8690 100644 --- a/hw/rtl/cpu_and_mem/cpu/csr/csr_file.sv +++ b/hw/rtl/cpu_and_mem/cpu/csr/csr_file.sv @@ -15,7 +15,7 @@ */ /* - CSR (Control and Status Register) File for RISC-V Zicsr + Zicntr + Machine-mode + F extensions. + CSR (Control and Status Register) File for RISC-V Zicsr + Zicntr + Machine/User-mode + F extensions. 
This module implements: @@ -31,9 +31,9 @@ - instret/instreth (0xC02/0xC82): Instructions retired counter (64-bit) - minstret/minstreth (0xB02/0xB82): Machine-mode alias for instret counter - Machine-mode CSRs (for trap/interrupt handling): - - mstatus (0x300): Machine status (MIE, MPIE bits) - - misa (0x301): Machine ISA (read-only, reports RV32IMAFB) + Machine-mode CSRs (for trap/interrupt handling; M and U privilege modes): + - mstatus (0x300): Machine status (MIE, MPIE bits; MPP WARL field {M, U}; MPRV bit, inert) + - misa (0x301): Machine ISA (read-only, reports RV32GCB + U: 0x4010_112F) - mie (0x304): Machine interrupt enable (MEIE, MTIE, MSIE) - mtvec (0x305): Machine trap vector base address - mscratch (0x340): Machine scratch register diff --git a/hw/rtl/cpu_and_mem/cpu/id_stage/instr_decoder.sv b/hw/rtl/cpu_and_mem/cpu/id_stage/instr_decoder.sv index 34be8e3f..68351bcc 100644 --- a/hw/rtl/cpu_and_mem/cpu/id_stage/instr_decoder.sv +++ b/hw/rtl/cpu_and_mem/cpu/id_stage/instr_decoder.sv @@ -15,7 +15,7 @@ */ /* - Instruction decoder for RISC-V RV32IMAFB + Zicsr + Machine-mode privileged. + Instruction decoder for RISC-V RV32GCB + Zicsr + M/U-mode privileged. B extension = Zba + Zbb + Zbs (full bit manipulation). F extension = Single-precision floating-point. This combinational module decodes 32-bit RISC-V instructions into control signals diff --git a/hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv b/hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv index 863bb158..9efa84aa 100644 --- a/hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv +++ b/hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv @@ -381,7 +381,7 @@ package riscv_pkg; // Section 3: CSR Definitions // =========================================================================== // Control and Status Register addresses, bit positions, and cause codes. - // Includes Zicsr instruction encodings and M-mode trap support. + // Includes Zicsr instruction encodings and M/U-mode trap support. 
// CSR instruction funct3 encoding typedef enum bit [2:0] { @@ -844,7 +844,7 @@ package riscv_pkg; // Section 9: Trap/Exception Handling // =========================================================================== // Structures for trap control. - // Used by trap_unit.sv for M-mode exception/interrupt handling. + // Used by trap_unit.sv for M/U-mode exception/interrupt handling. // Trap control signals (from trap unit to pipeline) typedef struct packed { logic trap_taken; // Trap is being taken this cycle diff --git a/hw/rtl/cpu_and_mem/cpu_and_mem.sv b/hw/rtl/cpu_and_mem/cpu_and_mem.sv index 2578d6f8..655b4d3f 100644 --- a/hw/rtl/cpu_and_mem/cpu_and_mem.sv +++ b/hw/rtl/cpu_and_mem/cpu_and_mem.sv @@ -251,7 +251,7 @@ module cpu_and_mem #( end assign interrupts.mtip = mtip_registered; - // RISC-V OOO CPU core - Tomasulo out-of-order with RV32IMACBFD + Zicsr + Machine-mode + // RISC-V OOO CPU core - Tomasulo out-of-order with RV32IMACBFD + Zicsr + Machine/User-mode cpu_ooo #( .MEM_BYTE_ADDR_WIDTH(MemByteAddrWidth), .MMIO_ADDR(MmioAddr), diff --git a/sw/README.md b/sw/README.md index a88cc6cd..c692fdda 100644 --- a/sw/README.md +++ b/sw/README.md @@ -550,7 +550,7 @@ include ../../common/common.mk ## Architecture Notes -Frost implements **RV32GCB** with full M-mode privilege support. See the [root README](../README.md) for the full ISA extension table and architecture details. +Frost implements **RV32GCB** with Machine (M) and User (U) privilege modes. See the [root README](../README.md) for the full ISA extension table and architecture details. 
### Test Result Markers diff --git a/sw/apps/freertos_demo/FreeRTOSConfig.h b/sw/apps/freertos_demo/FreeRTOSConfig.h index 59d83ad8..d69287bb 100644 --- a/sw/apps/freertos_demo/FreeRTOSConfig.h +++ b/sw/apps/freertos_demo/FreeRTOSConfig.h @@ -18,7 +18,7 @@ * FreeRTOS Configuration for FROST RISC-V Processor * * This configuration is for a minimal FreeRTOS setup targeting: - * - RV32GCB with M-mode only + * - RV32GCB with Machine (M) and User (U) privilege modes * - Single core (mhartid = 0) * - CLINT-style timer (mtime/mtimecmp) * - 300 MHz clock frequency diff --git a/sw/lib/include/csr.h b/sw/lib/include/csr.h index 0acda286..6d7f83b7 100644 --- a/sw/lib/include/csr.h +++ b/sw/lib/include/csr.h @@ -89,7 +89,7 @@ /* ========================================================================== */ #define MSTATUS_MIE (1U << 3) /* Machine Interrupt Enable */ #define MSTATUS_MPIE (1U << 7) /* Machine Previous Interrupt Enable */ -#define MSTATUS_MPP (3U << 11) /* Machine Previous Privilege (2 bits) */ +#define MSTATUS_MPP (3U << 11) /* Machine Previous Privilege (2 bits, WARL {M,U}) */ /* ========================================================================== */ /* mie/mip bit definitions (interrupt enable/pending) */ diff --git a/sw/lib/include/trap.h b/sw/lib/include/trap.h index bc222157..650b782c 100644 --- a/sw/lib/include/trap.h +++ b/sw/lib/include/trap.h @@ -30,9 +30,9 @@ * - Privileged instructions (WFI, ECALL, EBREAK) * - Timer interrupt configuration * - * Frost implements machine-mode only (no S-mode or U-mode), so all code - * runs with full privilege. Traps jump to the address in mtvec, saving - * the return address in mepc and the cause in mcause. + * Frost implements Machine (M) and User (U) privilege modes (no S-mode). + * Traps from both M and U are taken in M-mode: they jump to the address in + * mtvec, saving the return address in mepc and the cause in mcause. 
* * Usage: * // Set up trap handler @@ -74,7 +74,7 @@ static inline __attribute__((always_inline)) void wfi(void) /** * ECALL - Environment Call * - * Generates a synchronous exception (mcause = 11 for M-mode). + * Generates a synchronous exception (mcause = 8 from U-mode, 11 from M-mode). * Used for system calls in OS environments. */ static inline __attribute__((always_inline)) void ecall(void) diff --git a/verif/README.md b/verif/README.md index 7a5c07c2..876849d3 100644 --- a/verif/README.md +++ b/verif/README.md @@ -37,7 +37,7 @@ This directory contains a comprehensive Python-based verification framework for ### Design Under Test (DUT) -The Frost CPU implements **RV32GCB** (G = IMAFD, plus C and B) with full M-mode privilege support. See the [root README](../README.md) for the full ISA extension table. +The Frost CPU implements **RV32GCB** (G = IMAFD, plus C and B) with M and U privilege modes. See the [root README](../README.md) for the full ISA extension table. Additional features: - 32 general-purpose registers plus a separate FP register file From 3b61c2c9a56504420535bc5356c0b9a6e26cb23e Mon Sep 17 00:00:00 2001 From: Adam Bagley Date: Fri, 19 Jun 2026 00:13:10 -0400 Subject: [PATCH 04/43] Fix interrupt mcause delivery and double trap-entry Two trap-path bugs, surfaced by a U-mode directed test (they also affected M-mode interrupts): - Interrupt mcause was 0. csr_file's mcause was driven from the ROB's synchronous cause, while trap_unit's arbitrated cause (the interrupt cause with the interrupt bit set, or the remapped exception cause) was left unconnected. Route trap_unit.o_trap_cause into csr_file.i_trap_cause. This is why FreeRTOS's preemptive tick never fired -- its mcause==0x8000_0007 check could not match; it now works. - Double trap-entry. A single trap was applied twice, the second time in M-mode, corrupting mstatus.MPP/mcause. 
The exception path re-armed while the ROB's trap_pending was still asserted, and the registered interrupt_pending re-fired. Hold exception_pending cleared one extra cycle (via the existing trap_taken_prev) and gate interrupt_pending with !o_trap_taken. Both feedback paths pass through a flop (no combinational loop) and stay off the take_trap->stall->cache critical cone. Verified: freertos_demo passes (now with real preemption); the U-mode directed test's ecall-from-U (mcause=8) and timer-preempt-from-U (mcause=0x8000_0007, taken in U-mode) cases pass. --- hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv | 14 +++++++++++++- hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv | 11 +++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv b/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv index ae6014de..edd61149 100644 --- a/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv +++ b/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv @@ -168,7 +168,13 @@ module trap_unit #( // Note: mtip is already registered in cpu_and_mem.sv for similar timing reasons. logic interrupt_pending_comb; logic interrupt_pending; - assign interrupt_pending_comb = meip_enabled || mtip_enabled || msip_enabled; + // Gate with !o_trap_taken so a still-pending interrupt is NOT re-latched on + // the cycle its own trap is taken. interrupt_pending is registered, so + // otherwise the latched value fires a second, spurious trap entry the next + // cycle (re-saving mstatus.MPP=M and corrupting a U-mode trap). NOT a comb + // loop: o_trap_taken derives from the REGISTERED interrupt_pending, so the + // feedback path passes through a flop. 
+ assign interrupt_pending_comb = (meip_enabled || mtip_enabled || msip_enabled) && !o_trap_taken; always_ff @(posedge i_clk) begin if (i_rst) interrupt_pending <= 1'b0; @@ -189,6 +195,12 @@ module trap_unit #( exception_pending <= 1'b0; end else if (o_trap_taken) begin exception_pending <= 1'b0; + end else if (trap_taken_prev) begin + // Hold cleared one extra cycle: i_exception_valid (the ROB's trap_pending) + // stays high until the trap is acked (~1 cycle after o_trap_taken), so + // without this the exception re-arms and the trap is taken a second time + // (now in M, corrupting mstatus.MPP / mcause for a U-mode trap). + exception_pending <= 1'b0; end else if (i_exception_valid) begin exception_pending <= 1'b1; exception_cause_q <= i_exception_cause; diff --git a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv index 1a472549..9523fd90 100644 --- a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv +++ b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv @@ -842,6 +842,10 @@ module cpu_ooo #( riscv_pkg::exc_cause_t rob_trap_cause; riscv_pkg::exc_cause_t rob_trap_cause_remapped; logic [1:0] csr_priv; // current privilege from csr_file (PrivM/PrivU) + // Arbitrated trap cause from trap_unit (interrupt cause with bit 31, or the + // remapped synchronous-exception cause) -> csr_file mcause. Declared here so + // it is visible above the trap_unit instantiation that drives it. + logic [XLEN-1:0] trap_cause_internal; logic [XLEN-1:0] rob_trap_value; logic rob_trap_taken_ack; logic mret_start, mret_done_ack; @@ -1896,7 +1900,10 @@ module cpu_ooo #( .i_mtime(i_mtime), .i_trap_taken(trap_taken), .i_trap_pc(rob_trap_pc), - .i_trap_cause({{(XLEN - $bits(rob_trap_cause_remapped)) {1'b0}}, rob_trap_cause_remapped}), + // mcause from trap_unit's arbitrated cause: interrupt cause (with the + // interrupt bit) for interrupts, or the remapped exception cause (which + // carries the U-mode ECALL remap via trap_unit.i_exception_cause below). 
+ .i_trap_cause(trap_cause_internal), .i_trap_value(csr_trap_value), .i_mret_taken(mret_taken), .o_mstatus(csr_mstatus), @@ -1942,7 +1949,7 @@ module cpu_ooo #( assign interrupt_pending = i_interrupts.meip || i_interrupts.mtip || i_interrupts.msip; logic [XLEN-1:0] trap_target_internal, trap_pc_internal; - logic [XLEN-1:0] trap_cause_internal, trap_value_internal; + logic [XLEN-1:0] trap_value_internal; trap_unit #( .XLEN(XLEN) From 747f77359f8e8ecab15258484d8525d9f4176160 Mon Sep 17 00:00:00 2001 From: Adam Bagley Date: Fri, 19 Jun 2026 00:43:52 -0400 Subject: [PATCH 05/43] Trap CSR/MRET-from-U-mode as illegal instruction Enforce the U-mode privilege check at the ROB head: a U-mode access to an M-mode CSR (csr_addr[9:8] > priv) or an MRET is an illegal instruction (mcause=2). It is computed from existing head signals (head_is_csr, head_is_mret, head_csr_addr) plus a new i_priv input fed from csr_file.o_priv -- so no decode/dispatch/HeadMetaWidth changes are needed. Folded into head_exception/head_exc_cause at the source so every consumer (commit_en, o_csr_start/o_mret_start, o_trap_pending, the serial FSM, the commit record) treats it as a precise exception; the faulting op never executes or retires. It rides the same single-cycle exception path, so the trap_unit double-trap guard already covers it. M-mode-inert by construction (head_priv_fault is 0 when priv==M). i_priv is bridged cpu_ooo -> tomasulo_wrapper -> reorder_buffer. Verified: umode_test C (M-CSR-from-U) and D (MRET-from-U) trap mcause=2 from U-mode; freertos_demo still passes (M-mode unchanged). Note: head_priv_fault lands on the commit_en cone -- needs an X3 post-opt WNS check before merge. 
--- hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv | 3 +++ .../tomasulo/reorder_buffer/reorder_buffer.sv | 25 ++++++++++++++++--- .../tomasulo_wrapper/tomasulo_wrapper.sv | 7 +++++- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv index 9523fd90..f7cec147 100644 --- a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv +++ b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv @@ -1013,6 +1013,9 @@ module cpu_ooo #( .i_alloc_req_2(rob_alloc_req_2), .o_alloc_resp_2(rob_alloc_resp_2), + // Current privilege (PrivM/PrivU) for U-mode CSR/MRET illegal checks + .i_priv(csr_priv), + .o_cdb_grant(cdb_grant), .o_cdb(cdb_out), diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.sv index d75a83ed..f466683d 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.sv +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.sv @@ -169,6 +169,10 @@ module reorder_buffer ( // ========================================================================= input logic i_interrupt_pending, // Interrupt is pending (wake from WFI) + // Current privilege (PrivM/PrivU). A U-mode access to MRET or to a CSR that + // requires more privilege is an illegal instruction, detected at the head. 
+ input logic [1:0] i_priv, + // ========================================================================= // Pipeline Flush Control // ========================================================================= @@ -349,7 +353,10 @@ module reorder_buffer ( logic head_valid; logic head_done; logic head_exception; - riscv_pkg::exc_cause_t head_exc_cause; // from RAM + logic head_exception_raw; // stored ROB exception flag (before U-mode priv fault) + logic head_priv_fault; // U-mode access to MRET / an M-CSR -> illegal instruction + riscv_pkg::exc_cause_t head_exc_cause; // effective cause (includes priv fault) + riscv_pkg::exc_cause_t head_exc_cause_raw; // from RAM logic [XLEN-1:0] head_pc; // from RAM logic head_dest_rf; logic [RegAddrWidth-1:0] head_dest_reg; // from RAM @@ -492,7 +499,19 @@ module reorder_buffer ( // Head entry fields from FF-backed packed vectors / distributed RAM assign head_valid = rob_valid[head_idx]; assign head_done = rob_done[head_idx]; - assign head_exception = rob_exception[head_idx]; + assign head_exception_raw = rob_exception[head_idx]; + // U-mode privilege fault: MRET, or a CSR access requiring more privilege than + // the current mode (csr_addr[9:8] > priv), is an illegal instruction. Folding + // it into head_exception/head_exc_cause makes every consumer (commit_en, + // o_csr_start/o_mret_start, o_trap_pending, the serial FSM, the commit record) + // treat it as a precise exception, so the faulting op never executes or + // retires. The faulting op rides the same single-cycle exception path, so the + // double-trap guard in trap_unit already covers it. + assign head_priv_fault = (head_is_mret && (i_priv != riscv_pkg::PrivM)) || + (head_is_csr && (head_csr_addr[9:8] > i_priv)); + assign head_exception = head_exception_raw || head_priv_fault; + assign head_exc_cause = (head_priv_fault && !head_exception_raw) ? 
+ riscv_pkg::exc_cause_t'(riscv_pkg::ExcIllegalInstr) : head_exc_cause_raw; assign head_branch_taken = rob_branch_taken[head_idx]; assign head_mispredicted = rob_mispredicted[head_idx]; assign head_early_recovered = rob_early_recovered[head_idx]; @@ -1124,7 +1143,7 @@ module reorder_buffer ( i_cdb_write_2.exc_cause, i_cdb_write.exc_cause, ExcCauseWidth'(0), ExcCauseWidth'(0) }), .i_read_address(head_idx), - .o_read_data(head_exc_cause) + .o_read_data(head_exc_cause_raw) ); // Widen-commit replica: head+1 read port for exc_cause. diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv index b6980bbe..7aa7d4a3 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv @@ -136,7 +136,11 @@ module tomasulo_wrapper #( input logic i_mret_done, input logic [riscv_pkg::XLEN-1:0] i_mepc, input logic i_interrupt_pending, - input logic i_trap_misaligned_accesses, + + // Current privilege (PrivM/PrivU), forwarded to the ROB for U-mode + // CSR/MRET illegal-instruction checks. + input logic [1:0] i_priv, + input logic i_trap_misaligned_accesses, // Widen-commit back-pressure: asserted when the downstream slot-2 // retire path can accept a second commit this cycle. 
cpu_ooo ties this @@ -1421,6 +1425,7 @@ module tomasulo_wrapper #( .i_mret_done (i_mret_done), .i_mepc (i_mepc), .i_interrupt_pending (i_interrupt_pending), + .i_priv (i_priv), .i_commit_hold (i_commit_hold), // Flush From 81a2aed4d0f1c93f621eec5f81cc339dbf981701 Mon Sep 17 00:00:00 2001 From: Adam Bagley Date: Fri, 19 Jun 2026 00:44:33 -0400 Subject: [PATCH 06/43] Add U-mode directed test (umode_test) Self-checking app on the real core (<<PASS>>/<<FAIL>>) covering the M+U privilege support end-to-end: A ecall-from-U -> mcause=8 (ExcEcallUmode) B timer preempts U (MIE=0) -> mcause=0x8000_0007, taken in U-mode C M-CSR read from U -> illegal instruction (mcause=2) D MRET from U -> illegal instruction (mcause=2) The naked M-mode handler records the first trap's mcause + originating privilege (mstatus.MPP) and bounces to a fixed continuation, so each case self-checks both the cause and that it was taken from U-mode. Registered in test_run_cocotb.py. Also corrects the now-stale test_arch_compliance.py comment: Frost is M+U (not M-only); the privilege suite's U-mode tests drive an S-mode trap routine and need S/H extensions, so they remain filtered out, with U-mode covered by this directed test. --- sw/apps/umode_test/Makefile | 17 +++ sw/apps/umode_test/main.c | 198 ++++++++++++++++++++++++++++++++++ tests/test_arch_compliance.py | 8 +- tests/test_run_cocotb.py | 6 ++ 4 files changed, 227 insertions(+), 2 deletions(-) create mode 100644 sw/apps/umode_test/Makefile create mode 100644 sw/apps/umode_test/main.c diff --git a/sw/apps/umode_test/Makefile b/sw/apps/umode_test/Makefile new file mode 100644 index 00000000..c51e51ad --- /dev/null +++ b/sw/apps/umode_test/Makefile @@ -0,0 +1,17 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Makefile for the U-mode (User privilege) directed test +SRC_C := main.c +include ../../common/common.mk diff --git a/sw/apps/umode_test/main.c b/sw/apps/umode_test/main.c new file mode 100644 index 00000000..eac0c492 --- /dev/null +++ b/sw/apps/umode_test/main.c @@ -0,0 +1,198 @@ +/* + * Copyright 2026 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * U-mode (User privilege) directed test. + * + * Exercises the Machine+User privilege support end-to-end on the real core and + * self-checks over UART (<<PASS>> / <<FAIL>>): + * + * A. ECALL from U-mode -> mcause = 8 (ExcEcallUmode; 11 is M-mode) + * B. Machine timer interrupt while in U-mode with mstatus.MIE = 0 + * -> trap taken, mcause = 0x8000_0007. + * Proves machine interrupts fire while running below M regardless of MIE + * (so the timer can preempt user code) AND that the interrupt mcause + * carries the interrupt bit + code. + * C. Reading an M-mode CSR from U -> illegal instruction (mcause = 2). + * Requires the U-mode CSR-permission check.
If that check is absent the + * trailing ECALL traps instead (mcause = 8), so the test FAILs cleanly + * rather than hanging. + * D. Executing MRET from U-mode -> illegal instruction (mcause = 2). + * MRET is an M-mode-only instruction; the trailing ECALL is the cause-8 + * fallback so the test FAILs (not hangs) if the check is absent. + * + * Mechanism: each case drops to U-mode via MRET (mstatus.MPP = U) into a small + * naked U-mode function that triggers the trap. A naked M-mode handler records + * mcause and the privilege the trap came from (mstatus.MPP), pushes mtimecmp to + * max so a timer interrupt cannot refire, and returns to M-mode at a fixed + * continuation address stashed in mscratch (forcing MPP=M for its MRET). + */ + +#include <stdint.h> + +#include "trap.h" + +/* ---- minimal UART (UART_TX is provided by mmio.h via trap.h) ---- */ +static void uart_putc(char c) +{ + UART_TX = (uint8_t) c; +} + +static void uart_puts(const char *s) +{ + while (*s) + uart_putc(*s++); +} + +static void uart_hex(uint32_t v) +{ + static const char hex[] = "0123456789ABCDEF"; + uart_puts("0x"); + for (int i = 28; i >= 0; i -= 4) + uart_putc(hex[(v >> i) & 0xF]); +} + +/* ---- trap state shared with the naked handler ---- */ +static volatile uint32_t g_cause; +static volatile uint32_t g_from_priv; /* mstatus.MPP at trap entry = prev priv */ + +/* + * Naked M-mode trap handler. Records mcause and the trapping privilege, pushes + * mtimecmp to max (so a timer interrupt cannot refire), then returns to M-mode + * at the continuation address run_in_umode stashed in mscratch. Forces MPP=M so + * the MRET lands back in M-mode. Bouncing to a fixed continuation (rather than + * resuming the U-mode code) means clobbering temporaries here is safe.
+ */ +__attribute__((naked, aligned(4))) static void umode_trap_handler(void) +{ + __asm__ volatile("csrr t0, mcause\n" + "lui t1, %hi(g_cause)\n" + "lw t2, %lo(g_cause)(t1)\n" + "li t3, -1\n" /* sentinel: only the FIRST trap of each test records */ + "bne t2, t3, 2f\n" + "sw t0, %lo(g_cause)(t1)\n" + "csrr t0, mstatus\n" + "srli t0, t0, 11\n" + "andi t0, t0, 0x3\n" /* mstatus.MPP */ + "lui t1, %hi(g_from_priv)\n" + "sw t0, %lo(g_from_priv)(t1)\n" + "2:\n" + "li t1, 0x4000001C\n" /* MTIMECMP_HI: push compare to max to ack timer */ + "li t0, -1\n" + "sw t0, 0(t1)\n" + "csrr t0, mscratch\n" /* M-mode continuation set by run_in_umode */ + "csrw mepc, t0\n" + "li t0, 0x1800\n" /* MPP = M (0b11 << 11) */ + "csrs mstatus, t0\n" + "mret\n"); +} + +/* + * Enter U-mode at ufn; the handler returns control to the instruction after the + * MRET. Returns the mcause of the trap that ended U-mode execution. + */ +static uint32_t run_in_umode(void (*ufn)(void)) +{ + g_cause = 0xFFFFFFFFu; + g_from_priv = 0xFFFFFFFFu; + __asm__ volatile("la t0, 1f\n" + "csrw mscratch, t0\n" /* where the handler returns */ + "li t0, 0x1800\n" + "csrc mstatus, t0\n" /* MPP = U (00) */ + "csrw mepc, %0\n" + "mret\n" /* -> U-mode at ufn */ + "1:\n" + : + : "r"(ufn) + : "t0", "t1", "t2", "memory"); + return g_cause; +} + +/* ---- U-mode test bodies (naked: no prologue, so a mid-loop trap leaves the + * M-mode stack frame intact). Each spins after its trapping instruction. */ +__attribute__((naked)) static void u_ecall(void) +{ + __asm__ volatile("ecall\n j ."); +} + +__attribute__((naked)) static void u_spin(void) +{ + __asm__ volatile("j ."); +} + +__attribute__((naked)) static void u_read_mcsr(void) +{ + /* csrr of an M-CSR is illegal from U (cause 2); the ecall is the + * cause-8 fallback so the test FAILs (not hangs) if the check is absent. 
*/ + __asm__ volatile("csrr t0, mstatus\n ecall\n j ."); +} + +__attribute__((naked)) static void u_mret_umode(void) +{ + /* MRET is an M-mode-only instruction; executing it from U is illegal + * (cause 2). The ecall is the cause-8 fallback so the test FAILs (not + * hangs) if the check is absent. */ + __asm__ volatile("mret\n ecall\n j ."); +} + +static int report(const char *name, uint32_t got, uint32_t want, uint32_t from_priv) +{ + int ok = (got == want) && (from_priv == 0u /* U */); + uart_puts(ok ? "[PASS] " : "[FAIL] "); + uart_puts(name); + uart_puts(" mcause="); + uart_hex(got); + uart_puts(" from_priv="); + uart_hex(from_priv); + uart_puts("\r\n"); + return ok; +} + +int main(void) +{ + int all_ok = 1; + uint32_t cause; + + uart_puts("\r\n=== U-mode privilege test ===\r\n"); + set_trap_handler(&umode_trap_handler); + + /* A: ECALL from U-mode -> mcause 8 */ + cause = run_in_umode(&u_ecall); + all_ok &= report("A ecall-from-U (want mcause=8)", cause, 8u, g_from_priv); + + /* B: timer preempts U-mode with MIE=0 -> mcause 0x8000_0007 */ + (void) disable_interrupts(); /* MIE = 0 */ + csr_clear(mstatus, MSTATUS_MPIE); /* so U runs with MIE=0 as well */ + enable_timer_interrupt(); /* mie.MTIE = 1 */ + set_timer_cmp(rdmtime() + 300); + cause = run_in_umode(&u_spin); + all_ok &= + report("B timer-preempts-U (want mcause=0x80000007)", cause, 0x80000007u, g_from_priv); + disable_timer_interrupt(); + + /* C: M-mode CSR read from U -> illegal (mcause 2) */ + cause = run_in_umode(&u_read_mcsr); + all_ok &= report("C M-CSR-from-U (want mcause=2)", cause, 2u, g_from_priv); + + /* D: MRET from U -> illegal (mcause 2) */ + cause = run_in_umode(&u_mret_umode); + all_ok &= report("D mret-from-U (want mcause=2)", cause, 2u, g_from_priv); + + uart_puts(all_ok ?
"\r\n<<PASS>>\r\n" : "\r\n<<FAIL>>\r\n"); + for (;;) { + } + return 0; +} diff --git a/tests/test_arch_compliance.py b/tests/test_arch_compliance.py index 17b26a9b..359d3205 100755 --- a/tests/test_arch_compliance.py +++ b/tests/test_arch_compliance.py @@ -73,8 +73,12 @@ # Extensions not listed here run all their tests. # Frost implements Zbkb (pack, packh, brev8, zip, unzip) from the K extension # but not Zbkx (xperm4/xperm8), Zkn (AES/SHA256/SHA512), or Zks (SM3/SM4). -# Frost is M-mode only (no S/U mode), so privilege tests are filtered -# to exclude supervisor, user, and hypervisor tests. +# Frost implements Machine and User privilege (no Supervisor/Hypervisor). The +# privilege suite's U-mode tests (menvcfg/senvcfg/henvcfg *_illegal_u) drive an +# S-mode trap routine and require S/H ISA extensions (Ssdtso/Sstc/...), so they +# cannot run on M+U-only Frost and stay filtered out. Frost's U-mode -- including +# illegal M-CSR/MRET access from U -- is covered by the directed sw/apps/umode_test +# instead. Supervisor and hypervisor tests are likewise excluded.
EXTENSION_TEST_FILTERS: dict[str, set[str]] = { "K": {"pack", "packh", "brev8", "zip", "unzip"}, "privilege": {"ebreak", "ecall", "misalign", "menvcfg_m"}, diff --git a/tests/test_run_cocotb.py b/tests/test_run_cocotb.py index 11d6c305..1d0322ba 100755 --- a/tests/test_run_cocotb.py +++ b/tests/test_run_cocotb.py @@ -163,6 +163,12 @@ class CocotbRunConfig: app_name="csr_test", description="CSR test", ), + "umode_test": CocotbRunConfig( + python_test_module="cocotb_tests.test_real_program", + hdl_toplevel_module="frost", + app_name="umode_test", + description="U-mode (User privilege) directed test", + ), "freertos_demo": CocotbRunConfig( python_test_module="cocotb_tests.test_real_program", hdl_toplevel_module="frost", From f4e742b90281bea5e1c89e2b03f102070c12dbae Mon Sep 17 00:00:00 2001 From: Adam Bagley Date: Fri, 19 Jun 2026 01:08:35 -0400 Subject: [PATCH 07/43] Add ns16550a UART face for the Linux console Present a word-stride 16550 register face at 0x4000_1000 (DTB reg-shift=2, reg-io-width=4) that aliases the native UART TX/RX, so a stock Linux 8250 console driver (earlycon=uart8250,mmio32) can drive FROST's UART. THR transmits when DLAB is clear; LSR reports THRE/TEMT from TX-ready and DR from RX-valid; IER/FCR/LCR/MCR/SCR plus the DLL/DLM baud divisor form a small register file (the divisor is accepted but ignored -- FROST's UART runs at a fixed baud). Widen MmioSizeBytes 0x2C -> 0x1_C000 so the new face (and the CLINT alias to follow) fall in the MMIO-decoded range. The native UART TX/RX, FIFO, and timer paths are unchanged: the THR alias only adds an inert OR term to o_uart_wr_en, and the register-file writes live in a separate block whose decode misses every existing address. Validated by sw/apps/ns16550_test: the 8250 init dance plus register-file and TX-ready checks pass, and a banner transmitted through the face appears on the UART TX line. 
--- hw/rtl/cpu_and_mem/cpu_and_mem.sv | 83 ++++++++++++++++++++----- sw/apps/ns16550_test/Makefile | 17 +++++ sw/apps/ns16550_test/main.c | 100 ++++++++++++++++++++++++++++++ tests/test_run_cocotb.py | 6 ++ 4 files changed, 191 insertions(+), 15 deletions(-) create mode 100644 sw/apps/ns16550_test/Makefile create mode 100644 sw/apps/ns16550_test/main.c diff --git a/hw/rtl/cpu_and_mem/cpu_and_mem.sv b/hw/rtl/cpu_and_mem/cpu_and_mem.sv index 655b4d3f..5dfe53ad 100644 --- a/hw/rtl/cpu_and_mem/cpu_and_mem.sv +++ b/hw/rtl/cpu_and_mem/cpu_and_mem.sv @@ -137,7 +137,7 @@ module cpu_and_mem #( // - sw/common/link.ld (MMIO memory region and PROVIDE statements) // - cpu module parameters localparam int unsigned MmioAddr = 32'h4000_0000; - localparam int unsigned MmioSizeBytes = 32'h2C; + localparam int unsigned MmioSizeBytes = 32'h1_C000; // ns16550 @ +0x1000, CLINT @ +0x10000 localparam int unsigned UartMmioAddr = 32'h4000_0000; // UART TX (write-only) localparam int unsigned UartRxDataMmioAddr = 32'h4000_0004; // UART RX data (read consumes byte) localparam int unsigned UartRxStatusMmioAddr = 32'h4000_0024; // RX status (bit0: data available) @@ -152,6 +152,17 @@ module cpu_and_mem #( // Software interrupt register localparam int unsigned MsipMmioAddr = 32'h4000_0020; + // ns16550a UART face for Linux (word-stride; DTB reg-shift=2, reg-io-width=4). + // Aliases the native UART TX/RX. DLAB (LCR[7]) remaps offsets 0/4 to DLL/DLM. 
+ localparam int unsigned Ns16550ThrRbr = 32'h4000_1000; // THR(w)/RBR(r) | DLL when DLAB + localparam int unsigned Ns16550IerDlm = 32'h4000_1004; // IER | DLM when DLAB + localparam int unsigned Ns16550IirFcr = 32'h4000_1008; // IIR(r) / FCR(w) + localparam int unsigned Ns16550Lcr = 32'h4000_100C; + localparam int unsigned Ns16550Mcr = 32'h4000_1010; + localparam int unsigned Ns16550Lsr = 32'h4000_1014; // read-only line status + localparam int unsigned Ns16550Msr = 32'h4000_1018; // read-only modem status + localparam int unsigned Ns16550Scr = 32'h4000_101C; // scratch + // Timer register defaults // Default mtimecmp to max value so no timer interrupt fires until software configures it localparam logic [63:0] MtimecmpDefault = 64'hFFFF_FFFF_FFFF_FFFF; @@ -229,12 +240,15 @@ module cpu_and_mem #( `endif // Timer registers (CLINT-style) - logic [63:0] mtime; // Machine time counter - logic [63:0] mtimecmp; // Machine timer compare register - logic msip; // Machine software interrupt pending + logic [63:0] mtime; // Machine time counter + logic [63:0] mtimecmp; // Machine timer compare register + logic msip; // Machine software interrupt pending + + // ns16550a UART face register file (8-bit). DLAB = ns_lcr[7]. + logic [7:0] ns_dll, ns_dlm, ns_ier, ns_fcr, ns_lcr, ns_mcr, ns_scr; // Interrupt signals to CPU - riscv_pkg::interrupt_t interrupts; + riscv_pkg::interrupt_t interrupts; // Clamp unknown external interrupt values to 0 for simulation stability. // This avoids X-propagation into mip when the top-level input is left un-driven. 
assign interrupts.meip = (i_external_interrupt === 1'b1); @@ -824,19 +838,30 @@ module cpu_and_mem #( // Use MA-stage address captured from CPU for MMIO reads unique case (mmio_load_addr) // UART RX data - returns received byte in lower 8 bits (reading consumes byte) - UartRxDataMmioAddr: mmio_read_data_comb = {24'b0, i_uart_rx_data}; + UartRxDataMmioAddr: mmio_read_data_comb = {24'b0, i_uart_rx_data}; // UART RX status - bit 0 indicates data available (non-destructive read) UartRxStatusMmioAddr: mmio_read_data_comb = {31'b0, i_uart_rx_valid}; // UART TX status - bit 0 indicates the TX FIFO can accept at least one byte. UartTxStatusMmioAddr: mmio_read_data_comb = {31'b0, i_uart_tx_ready}; - Fifo0MmioAddr: mmio_read_data_comb = i_fifo0_rd_data; - Fifo1MmioAddr: mmio_read_data_comb = i_fifo1_rd_data; - MtimeLowMmioAddr: mmio_read_data_comb = mtime[31:0]; - MtimeHighMmioAddr: mmio_read_data_comb = mtime[63:32]; - MtimecmpLowMmioAddr: mmio_read_data_comb = mtimecmp[31:0]; + Fifo0MmioAddr: mmio_read_data_comb = i_fifo0_rd_data; + Fifo1MmioAddr: mmio_read_data_comb = i_fifo1_rd_data; + MtimeLowMmioAddr: mmio_read_data_comb = mtime[31:0]; + MtimeHighMmioAddr: mmio_read_data_comb = mtime[63:32]; + MtimecmpLowMmioAddr: mmio_read_data_comb = mtimecmp[31:0]; MtimecmpHighMmioAddr: mmio_read_data_comb = mtimecmp[63:32]; - MsipMmioAddr: mmio_read_data_comb = {31'b0, msip}; - default: ; + MsipMmioAddr: mmio_read_data_comb = {31'b0, msip}; + // ns16550a UART face (aliases native UART TX/RX). DLAB selects DLL/DLM. + Ns16550ThrRbr: mmio_read_data_comb = ns_lcr[7] ? {24'b0, ns_dll} : {24'b0, i_uart_rx_data}; + Ns16550IerDlm: mmio_read_data_comb = ns_lcr[7] ? {24'b0, ns_dlm} : {24'b0, ns_ier}; + Ns16550IirFcr: mmio_read_data_comb = {24'b0, 8'hC1}; // FIFO enabled, no int pending + Ns16550Lcr: mmio_read_data_comb = {24'b0, ns_lcr}; + Ns16550Mcr: mmio_read_data_comb = {24'b0, ns_mcr}; + // LSR: TEMT|THRE from TX-ready (bits 6,5); DR from RX-valid (bit 0). 
+ Ns16550Lsr: + mmio_read_data_comb = {24'b0, 1'b0, i_uart_tx_ready, i_uart_tx_ready, 4'b0, i_uart_rx_valid}; + Ns16550Msr: mmio_read_data_comb = {24'b0, 8'hB0}; // DCD|DSR|CTS asserted + Ns16550Scr: mmio_read_data_comb = {24'b0, ns_scr}; + default: ; endcase end @@ -887,11 +912,39 @@ module cpu_and_mem #( if (mmio_read_data_valid) data_memory_or_peripheral_read_data = mmio_read_data_reg; end - // write to UART + // write to UART (native 0x4000_0000 TX, or the ns16550 THR at 0x4000_1000 + // when DLAB is clear -- both funnel into the same TX byte stream). always_ff @(posedge i_clk) begin o_uart_wr_data <= data_memory_write_data_registered[7:0]; // UART uses only lower byte o_uart_wr_en <= |data_memory_byte_write_enable_registered && - data_memory_address_registered == UartMmioAddr; + ((data_memory_address_registered == UartMmioAddr) || + (data_memory_address_registered == Ns16550ThrRbr && !ns_lcr[7])); + end + + // ns16550a register-file writes. DLAB (LCR[7]) routes offsets 0/4 to the + // baud divisor (DLL/DLM); the THR write itself transmits via o_uart_wr_en. 
+ always_ff @(posedge i_clk) begin + if (i_rst) begin + ns_dll <= 8'h01; + ns_dlm <= 8'h00; + ns_ier <= 8'h00; + ns_fcr <= 8'h00; + ns_lcr <= 8'h00; + ns_mcr <= 8'h00; + ns_scr <= 8'h00; + end else if (|data_memory_byte_write_enable_registered) begin + unique case (data_memory_address_registered) + Ns16550ThrRbr: if (ns_lcr[7]) ns_dll <= data_memory_write_data_registered[7:0]; + Ns16550IerDlm: + if (ns_lcr[7]) ns_dlm <= data_memory_write_data_registered[7:0]; + else ns_ier <= data_memory_write_data_registered[7:0]; + Ns16550IirFcr: ns_fcr <= data_memory_write_data_registered[7:0]; + Ns16550Lcr: ns_lcr <= data_memory_write_data_registered[7:0]; + Ns16550Mcr: ns_mcr <= data_memory_write_data_registered[7:0]; + Ns16550Scr: ns_scr <= data_memory_write_data_registered[7:0]; + default: ; + endcase + end end // FIFO write logic - write to FIFOs when CPU writes to FIFO MMIO addresses diff --git a/sw/apps/ns16550_test/Makefile b/sw/apps/ns16550_test/Makefile new file mode 100644 index 00000000..6e3b306c --- /dev/null +++ b/sw/apps/ns16550_test/Makefile @@ -0,0 +1,17 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Makefile for the ns16550a UART face directed test +SRC_C := main.c +include ../../common/common.mk diff --git a/sw/apps/ns16550_test/main.c b/sw/apps/ns16550_test/main.c new file mode 100644 index 00000000..becc0021 --- /dev/null +++ b/sw/apps/ns16550_test/main.c @@ -0,0 +1,100 @@ +/* + * Copyright 2026 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * ns16550a UART face directed test (Increment 1 of the no-MMU Linux glue). + * + * FROST presents a word-stride 16550 register face at 0x4000_1000 (DTB + * reg-shift=2, reg-io-width=4) that aliases the native UART TX/RX, so a stock + * Linux 8250 console driver can drive it. This test runs the 8250 init dance + * (DLAB/baud, 8N1, FIFO, MCR), checks the register file and TX-ready status, + * and transmits a banner THROUGH the face (which must appear on the UART TX + * line). PASS/FAIL is emitted over the known-good native UART so the verdict + * is independent of the face under test. + */ + +#include <stdint.h> + +/* Native FROST UART (known-good) -- used only for the PASS/FAIL marker. */ +#define NATIVE_TX (*(volatile uint32_t *) 0x40000000u) +#define NATIVE_TX_ST (*(volatile uint32_t *) 0x40000028u) +static void n_putc(char c) +{ + while (!(NATIVE_TX_ST & 1u)) { + } + NATIVE_TX = (uint8_t) c; +} +static void n_puts(const char *s) +{ + while (*s) + n_putc(*s++); +} + +/* ns16550a face @ 0x4000_1000, word stride.
*/ +#define NS(off) (*(volatile uint32_t *) (uintptr_t) (0x40001000u + (off))) +#define NS_THR NS(0x00) +#define NS_IER NS(0x04) +#define NS_IIR NS(0x08) +#define NS_FCR NS(0x08) +#define NS_LCR NS(0x0C) +#define NS_MCR NS(0x10) +#define NS_LSR NS(0x14) +#define NS_SCR NS(0x1C) + +static void ns_init(void) +{ + NS_IER = 0x00u; /* polled (no interrupts wired) */ + NS_LCR = 0x80u; /* DLAB = 1 */ + NS_THR = 0x01u; /* DLL (baud divisor low) -- FROST ignores the divisor */ + NS_IER = 0x00u; /* DLM (baud divisor high) */ + NS_LCR = 0x03u; /* DLAB = 0, 8N1 */ + NS_FCR = 0x07u; /* enable + clear RX/TX FIFOs */ + NS_MCR = 0x03u; /* DTR | RTS */ +} +static void ns_putc(char c) +{ + while (!(NS_LSR & 0x20u)) { /* wait for THRE */ + } + NS_THR = (uint8_t) c; +} +static void ns_puts(const char *s) +{ + while (*s) + ns_putc(*s++); +} + +int main(void) +{ + int ok = 1; + + ns_init(); + ok &= ((NS_LCR & 0xFFu) == 0x03u); /* LCR readback: 8N1, DLAB clear */ + ok &= ((NS_LSR & 0x60u) == 0x60u); /* THRE | TEMT set (TX ready) */ + ok &= ((NS_IIR & 0x01u) == 0x01u); /* no interrupt pending */ + + NS_SCR = 0xA5u; /* scratch register is read/write */ + ok &= ((NS_SCR & 0xFFu) == 0xA5u); + NS_SCR = 0x5Au; + ok &= ((NS_SCR & 0xFFu) == 0x5Au); + + /* Transmit a banner THROUGH the ns16550 face; it must reach the UART TX. */ + ns_puts("[ns16550 face: TX path OK]\r\n"); + + n_puts(ok ?
"\r\n<<PASS>>\r\n" : "\r\n<<FAIL>>\r\n"); + for (;;) { + } + return 0; +} diff --git a/tests/test_run_cocotb.py b/tests/test_run_cocotb.py index 1d0322ba..4eb2bfe6 100755 --- a/tests/test_run_cocotb.py +++ b/tests/test_run_cocotb.py @@ -169,6 +169,12 @@ class CocotbRunConfig: app_name="umode_test", description="U-mode (User privilege) directed test", ), + "ns16550_test": CocotbRunConfig( + python_test_module="cocotb_tests.test_real_program", + hdl_toplevel_module="frost", + app_name="ns16550_test", + description="ns16550a UART face directed test (Linux glue)", + ), "freertos_demo": CocotbRunConfig( python_test_module="cocotb_tests.test_real_program", hdl_toplevel_module="frost", From 36eae452943adb69fa9983d1b585a7e470bd5e95 Mon Sep 17 00:00:00 2001 From: Adam Bagley Date: Fri, 19 Jun 2026 01:16:52 -0400 Subject: [PATCH 08/43] Add SiFive CLINT alias for the Linux timer Expose a sifive,clint0-compatible window at 0x4001_0000 (msip @ +0, mtimecmp @ +0x4000, mtime @ +0xBFF8) that aliases the native FROST timer registers, so a stock Linux CLINT driver delivers the machine timer tick. The aliases read and write the same msip/mtimecmp/mtime as the native block (no new state); native-timer behavior is unchanged -- the CLINT addresses are added as extra read-mux cases and extra labels on the existing mtime/mtimecmp/msip write paths. Validated by sw/apps/clint_test: writes through the CLINT window are observable at the native timer addresses, and a machine timer interrupt set up entirely through the CLINT window fires with mcause=0x8000_0007. freertos_demo still passes (M-mode timer/UART unchanged).
--- hw/rtl/cpu_and_mem/cpu_and_mem.sv | 30 ++++++-- sw/apps/clint_test/Makefile | 17 +++++ sw/apps/clint_test/main.c | 112 ++++++++++++++++++++++++++++++ tests/test_run_cocotb.py | 6 ++ 4 files changed, 159 insertions(+), 6 deletions(-) create mode 100644 sw/apps/clint_test/Makefile create mode 100644 sw/apps/clint_test/main.c diff --git a/hw/rtl/cpu_and_mem/cpu_and_mem.sv b/hw/rtl/cpu_and_mem/cpu_and_mem.sv index 5dfe53ad..e1dfebef 100644 --- a/hw/rtl/cpu_and_mem/cpu_and_mem.sv +++ b/hw/rtl/cpu_and_mem/cpu_and_mem.sv @@ -163,6 +163,15 @@ module cpu_and_mem #( localparam int unsigned Ns16550Msr = 32'h4000_1018; // read-only modem status localparam int unsigned Ns16550Scr = 32'h4000_101C; // scratch + // SiFive CLINT alias for Linux (compatible "sifive,clint0") @ 0x4001_0000. + // These map onto the SAME msip/mtimecmp/mtime registers as the native FROST + // timer block; the kernel reaches the timer through the CLINT layout via DTB. + localparam int unsigned ClintMsip = 32'h4001_0000; // hart-0 software interrupt + localparam int unsigned ClintMtimecmpLo = 32'h4001_4000; // mtimecmp[31:0] + localparam int unsigned ClintMtimecmpHi = 32'h4001_4004; // mtimecmp[63:32] + localparam int unsigned ClintMtimeLo = 32'h4001_BFF8; // mtime[31:0] + localparam int unsigned ClintMtimeHi = 32'h4001_BFFC; // mtime[63:32] + // Timer register defaults // Default mtimecmp to max value so no timer interrupt fires until software configures it localparam logic [63:0] MtimecmpDefault = 64'hFFFF_FFFF_FFFF_FFFF; @@ -861,6 +870,12 @@ module cpu_and_mem #( mmio_read_data_comb = {24'b0, 1'b0, i_uart_tx_ready, i_uart_tx_ready, 4'b0, i_uart_rx_valid}; Ns16550Msr: mmio_read_data_comb = {24'b0, 8'hB0}; // DCD|DSR|CTS asserted Ns16550Scr: mmio_read_data_comb = {24'b0, ns_scr}; + // SiFive CLINT alias (same registers as the native timer block). 
+ ClintMsip: mmio_read_data_comb = {31'b0, msip}; + ClintMtimecmpLo: mmio_read_data_comb = mtimecmp[31:0]; + ClintMtimecmpHi: mmio_read_data_comb = mtimecmp[63:32]; + ClintMtimeLo: mmio_read_data_comb = mtime[31:0]; + ClintMtimeHi: mmio_read_data_comb = mtime[63:32]; default: ; endcase end @@ -971,9 +986,11 @@ module cpu_and_mem #( // This would cause the non-written half to increment during a write, which is wrong. logic writing_mtime_low, writing_mtime_high; assign writing_mtime_low = |data_memory_byte_write_enable_registered && - (data_memory_address_registered == MtimeLowMmioAddr); + ((data_memory_address_registered == MtimeLowMmioAddr) || + (data_memory_address_registered == ClintMtimeLo)); assign writing_mtime_high = |data_memory_byte_write_enable_registered && - (data_memory_address_registered == MtimeHighMmioAddr); + ((data_memory_address_registered == MtimeHighMmioAddr) || + (data_memory_address_registered == ClintMtimeHi)); always_ff @(posedge i_clk) begin if (i_rst) begin @@ -997,11 +1014,12 @@ module cpu_and_mem #( if (|data_memory_byte_write_enable_registered) begin unique case (data_memory_address_registered) // mtimecmp controls timer interrupt threshold - MtimecmpLowMmioAddr: mtimecmp[31:0] <= data_memory_write_data_registered; - MtimecmpHighMmioAddr: mtimecmp[63:32] <= data_memory_write_data_registered; + MtimecmpLowMmioAddr, ClintMtimecmpLo: mtimecmp[31:0] <= data_memory_write_data_registered; + MtimecmpHighMmioAddr, ClintMtimecmpHi: + mtimecmp[63:32] <= data_memory_write_data_registered; // msip controls software interrupt (only bit 0 is writable) - MsipMmioAddr: msip <= data_memory_write_data_registered[0]; - default: ; + MsipMmioAddr, ClintMsip: msip <= data_memory_write_data_registered[0]; + default: ; endcase end end diff --git a/sw/apps/clint_test/Makefile b/sw/apps/clint_test/Makefile new file mode 100644 index 00000000..8621641d --- /dev/null +++ b/sw/apps/clint_test/Makefile @@ -0,0 +1,17 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Makefile for the SiFive CLINT alias directed test +SRC_C := main.c +include ../../common/common.mk diff --git a/sw/apps/clint_test/main.c b/sw/apps/clint_test/main.c new file mode 100644 index 00000000..0f7e24a9 --- /dev/null +++ b/sw/apps/clint_test/main.c @@ -0,0 +1,112 @@ +/* + * Copyright 2026 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * SiFive CLINT alias directed test (Increment 2 of the no-MMU Linux glue). + * + * FROST exposes a sifive,clint0-compatible window at 0x4001_0000 (msip @ +0, + * mtimecmp @ +0x4000, mtime @ +0xBFF8) that aliases the native FROST timer + * registers, so a stock Linux CLINT driver can deliver the timer tick. This + * test proves the alias two ways: + * 1. writes through the CLINT addresses are observable at the native timer + * addresses (same physical registers); + * 2. 
an actual machine timer interrupt set up entirely through the CLINT + * window fires with mcause = 0x8000_0007. + */ + +#include <stdint.h> + +/* SiFive CLINT alias window. */ +#define CLINT_MSIP (*(volatile uint32_t *) 0x40010000u) +#define CLINT_MTIMECMP_LO (*(volatile uint32_t *) 0x40014000u) +#define CLINT_MTIMECMP_HI (*(volatile uint32_t *) 0x40014004u) +#define CLINT_MTIME_LO (*(volatile uint32_t *) 0x4001BFF8u) + +/* Native FROST timer registers (the aliased physical registers). */ +#define NAT_MTIMECMP_LO (*(volatile uint32_t *) 0x40000018u) +#define NAT_MTIMECMP_HI (*(volatile uint32_t *) 0x4000001Cu) +#define NAT_MSIP (*(volatile uint32_t *) 0x40000020u) +#define NAT_MTIME_LO (*(volatile uint32_t *) 0x40000010u) + +/* Native UART for the PASS/FAIL marker. */ +#define UTX (*(volatile uint32_t *) 0x40000000u) +#define UTX_ST (*(volatile uint32_t *) 0x40000028u) +static void putc_(char c) +{ + while (!(UTX_ST & 1u)) { + } + UTX = (uint8_t) c; +} +static void puts_(const char *s) +{ + while (*s) + putc_(*s++); +} + +static volatile uint32_t g_cause; + +/* Machine trap handler. GCC's "interrupt" attribute emits the register + * save/restore and MRET, so it is safe as a normal C function. */ +__attribute__((interrupt("machine"), aligned(4))) static void mtrap(void) +{ + uint32_t mc; + __asm__ volatile("csrr %0, mcause" : "=r"(mc)); + g_cause = mc; + /* Ack: push the compare (through the CLINT alias) to max so it cannot + * refire. */ + CLINT_MTIMECMP_HI = 0xFFFFFFFFu; + CLINT_MTIMECMP_LO = 0xFFFFFFFFu; +} + +int main(void) +{ + int ok = 1; + + __asm__ volatile("csrw mtvec, %0" ::"r"(&mtrap)); /* direct mode */ + + /* 1a. mtimecmp written via CLINT is visible at the native address. */ + CLINT_MTIMECMP_LO = 0x12345678u; + CLINT_MTIMECMP_HI = 0x9ABCDEF0u; + ok &= (NAT_MTIMECMP_LO == 0x12345678u); + ok &= (NAT_MTIMECMP_HI == 0x9ABCDEF0u); + + /* 1b. msip written via CLINT is visible at the native address. 
*/ + CLINT_MSIP = 1u; + ok &= ((NAT_MSIP & 1u) == 1u); + CLINT_MSIP = 0u; + ok &= ((NAT_MSIP & 1u) == 0u); + + /* 1c. CLINT mtime and native mtime read the same advancing counter. */ + uint32_t t_clint = CLINT_MTIME_LO; + uint32_t t_nat = NAT_MTIME_LO; /* read after -> >= */ + ok &= (t_nat >= t_clint); + + /* 2. A machine timer interrupt set up entirely through the CLINT window. */ + g_cause = 0u; + CLINT_MTIMECMP_HI = 0xFFFFFFFFu; /* block premature fire */ + CLINT_MTIMECMP_LO = CLINT_MTIME_LO + 1000u; + CLINT_MTIMECMP_HI = 0u; + __asm__ volatile("csrs mie, %0" ::"r"(0x80)); /* MTIE */ + __asm__ volatile("csrs mstatus, %0" ::"r"(0x8)); /* MIE */ + for (volatile int i = 0; i < 1000000 && g_cause == 0u; i++) { + } + ok &= (g_cause == 0x80000007u); + + puts_(ok ? "\r\n<<PASS>>\r\n" : "\r\n<<FAIL>>\r\n"); + for (;;) { + } + return 0; +} diff --git a/tests/test_run_cocotb.py b/tests/test_run_cocotb.py index 4eb2bfe6..b3edbaf3 100755 --- a/tests/test_run_cocotb.py +++ b/tests/test_run_cocotb.py @@ -175,6 +175,12 @@ class CocotbRunConfig: app_name="ns16550_test", description="ns16550a UART face directed test (Linux glue)", ), + "clint_test": CocotbRunConfig( + python_test_module="cocotb_tests.test_real_program", + hdl_toplevel_module="frost", + app_name="clint_test", + description="SiFive CLINT alias directed test (Linux glue)", + ), "freertos_demo": CocotbRunConfig( python_test_module="cocotb_tests.test_real_program", hdl_toplevel_module="frost", From 56474c2db281fe46c4281bb1d2633ed95a7fd4cf Mon Sep 17 00:00:00 2001 From: Adam Bagley Date: Fri, 19 Jun 2026 01:34:31 -0400 Subject: [PATCH 09/43] docs: document the ns16550a UART face and CLINT alias Reflect the Linux glue (commits "Add ns16550a UART face for the Linux console" and "Add SiFive CLINT alias for the Linux timer") in hw/rtl/README.md: widen the MMIO region size (44 B -> 112 KiB), add the ns16550a face (0x4000_1000) and the SiFive CLINT alias (0x4001_0000) to the MMIO register table with a note on the 
device-tree binding, 
and correct the MMIO_SIZE_BYTES parameter (0x2C -> 0x1_C000). --- hw/rtl/README.md | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/hw/rtl/README.md b/hw/rtl/README.md index 8ba90844..eded590e 100644 --- a/hw/rtl/README.md +++ b/hw/rtl/README.md @@ -102,7 +102,7 @@ served by the cache hierarchy: |--------|---------|------|-------------| | ROM | `0x0000_0000` | 96 KiB | Code and read-only data (fast BRAM) | | RAM | `0x0001_8000` | 160 KiB | Data, BSS, stack (fast BRAM) | -| MMIO | `0x4000_0000` | 44 B | UART, FIFOs, CLINT-style timer, software interrupt | +| MMIO | `0x4000_0000` | 112 KiB | UART/FIFOs/timer; plus Linux-facing ns16550a UART (`0x4000_1000`) and SiFive CLINT (`0x4001_0000`) | | DDR | `0x8000_0000` | 1 GiB | Cached region: code (`.ddr_text`), heap and large data (see below) | The cached tier serves both sides of the core: loads/stores through the @@ -146,10 +146,22 @@ MMIO registers: | `0x4000_0020` | MSIP | Machine software interrupt pending | | `0x4000_0024` | UART_RX_STATUS | Bit 0 is data available | | `0x4000_0028` | UART_TX_STATUS | Bit 0 is can accept byte | +| `0x4000_1000`–`101C` | ns16550a UART face | 16550 register file (word stride) aliasing UART_TX/RX for the Linux 8250 driver | +| `0x4001_0000` | CLINT MSIP | SiFive CLINT alias of MSIP | +| `0x4001_4000`/`4004` | CLINT MTIMECMP_LO/HI | SiFive CLINT alias of MTIMECMP | +| `0x4001_BFF8`/`BFFC` | CLINT MTIME_LO/HI | SiFive CLINT alias of MTIME | The hardware UART console is configured for 115200 baud, 8 data bits, no parity, and 1 stop bit (8N1). +For no-MMU Linux, the same UART is also reachable through a standard +ns16550a register face at `0x4000_1000` (word stride; device-tree +`reg-shift=2`, `reg-io-width=4`; `earlycon=uart8250,mmio32`), and the timer +through a SiFive-CLINT-compatible window at `0x4001_0000` (`mtimecmp` at +`+0x4000`, `mtime` at `+0xBFF8`). 
Both alias the native registers listed +above onto the same hardware, so the in-tree Linux 8250 console and CLINT +timer drivers work without a board-specific driver. + If these addresses change, update `cpu_and_mem.sv`, `cpu_ooo.sv` parameters, `sw/common/link.ld`, `sw/lib/include/mmio.h`, and the verification constants in `verif/config.py`. @@ -199,7 +211,7 @@ sed -n '1,200p' hw/rtl/cpu_and_mem/cpu/cpu_ooo.f | `frost.sv` | `DDR_MODEL_BYTES` / `DDR_MODEL_LATENCY` | `64 MiB` / `30` | Behavioral DDR model size and access latency (simulation) | | `frost.sv` | `FETCH_VALID_FUZZ` | `0` | Simulation-only: 1 wraps the low BRAM in a variable-latency fetch model (LFSR fetch-valid gaps) that mirrors the L1I provider's fetch contract; hardware keeps 0 | | `cpu_ooo.sv` | `MMIO_ADDR` | `32'h4000_0000` | MMIO base | -| `cpu_ooo.sv` | `MMIO_SIZE_BYTES` | `32'h2C` | MMIO range size | +| `cpu_ooo.sv` | `MMIO_SIZE_BYTES` | `32'h1_C000` | MMIO range size (covers the ns16550a face + CLINT alias) | Simulation overrides parameters through Verilator generics (`-G`): the test Makefile enables the cached tier with the X3 hierarchy shape by default From 828457cb6838c12dc99afc8368149333a7ddc7c2 Mon Sep 17 00:00:00 2001 From: Adam Bagley Date: Fri, 19 Jun 2026 15:02:55 -0400 Subject: [PATCH 10/43] Fix LR/SC store-conditional deadlock under speculation; boot nommu Linux to banner The store-conditional resolution path deadlocked when several SCs are in flight under branch speculation (e.g. an LR/SC retry loop). Two bugs: 1. sc_pending was cleared on any partial flush ((speculative_partial_flush || is_younger(...))), which dropped a *surviving* older SC. Clear only when the pending SC is younger than the flush boundary (is_younger). 2. The mem-RS ready gate (!(sc_pending && mem_rs_next_is_sc)) blocked the *older* head SC from issuing whenever a *younger* speculative SC had already issued and set sc_pending. The head SC then never issued, never fired, and sc_pending never cleared -> deadlock. 
sc_pending_unit now tracks multiple in-flight SCs in a small per-ROB-tag table (depth NumCheckpoints+1) and fires the SC whose tag matches the ROB head; the serialization gate is removed. BRAM LR/SC was unaffected (it resolves before a second SC issues); the longer cached-DDR latency exposed it (Linux printk _prb_commit). Also adds the ddr_atomic_test directed reproducer and the linux_boot bring-up. With the fix the kernel now reaches "[ 0.000000] Linux version". --- .../atomics/sc_pending_unit.sv | 155 ++++++++++++------ .../tomasulo_wrapper/tomasulo_wrapper.sv | 7 +- sw/apps/ddr_atomic_test/Makefile | 17 ++ sw/apps/ddr_atomic_test/main.c | 95 +++++++++++ sw/apps/linux_boot/Makefile | 31 ++++ tests/test_run_cocotb.py | 14 ++ verif/cocotb_tests/test_real_program.py | 8 + 7 files changed, 274 insertions(+), 53 deletions(-) create mode 100644 sw/apps/ddr_atomic_test/Makefile create mode 100644 sw/apps/ddr_atomic_test/main.c create mode 100644 sw/apps/linux_boot/Makefile diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/atomics/sc_pending_unit.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/atomics/sc_pending_unit.sv index 170ade76..b8b22252 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/atomics/sc_pending_unit.sv +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/atomics/sc_pending_unit.sv @@ -17,19 +17,29 @@ // ============================================================================= // sc_pending_unit // ============================================================================= -// Extracted verbatim from tomasulo_wrapper.sv (pure RTL boundary move, zero -// functional change). Store-conditional (SC.W) resolution: -// * the SC pending register FSM (set at MEM_RS SC issue, cleared on fire / -// flush / age) and its data capture (rob_tag + address), -// * the combinational fire/success decode, and -// * the sc_fu_complete result packet. 
-// The store-misalign exception path, the MEM-adapter input mux, and -// lq_result_accepted remain in the wrapper; this unit consumes store_misalign_* -// as inputs and produces sc_pending (visible to dispatch) and sc_fu_complete -// (registered by the wrapper before the MEM adapter). +// Store-conditional (SC.W) resolution. // -// is_younger is duplicated here (it is also used elsewhere in the wrapper, and -// the wrapper comment notes it is identical to the load_queue / RS copies). +// In-flight SCs are tracked in a small table keyed by ROB tag, so the SC that +// reaches the ROB head can ALWAYS fire -- even when an LR/SC retry loop is +// branch-speculated and the core issues several SCs (one per speculated +// iteration) before the oldest resolves. A single pending-SC register failed +// here: a younger speculative SC overwrote the head SC's rob_tag, so the head +// SC's tag never matched, it never fired, the branch never resolved, and the +// core deadlocked. Observed on Linux printk's _prb_commit cmpxchg loop (11 SCs +// issued, 8-deep speculation; head=tag15 but the register held tag19). BRAM +// LR/SC resolves before a second SC issues, so BRAM/FreeRTOS were unaffected; +// the longer cached-tier (DDR) latency exposes the overlap. +// +// Two flush rules matter and were both bugs in the single-register version: +// * an SC fires when head_tag matches a VALID entry and the SQ is drained; +// * an entry is cleared on a flush ONLY if it is younger than the flush +// boundary (is_younger) -- NOT unconditionally on partial flush, which +// would drop a surviving older SC. +// Depth = NumCheckpoints + 1 (branch speculation depth bounds concurrent SCs). +// +// The store-misalign exception path, MEM-adapter input mux, and +// lq_result_accepted remain in the wrapper. is_younger is duplicated here +// (identical to the load_queue / RS copies). 
// ============================================================================= module sc_pending_unit ( input logic i_clk, @@ -56,8 +66,7 @@ module sc_pending_unit ( ); // --------------------------------------------------------------------------- - // Alias input ports back to the wrapper's local names so the bodies below are - // byte-identical to the original tomasulo_wrapper logic. + // Alias input ports back to the wrapper's local names. // --------------------------------------------------------------------------- logic [riscv_pkg::ReorderBufferTagWidth-1:0] head_tag; logic sq_committed_empty; @@ -86,12 +95,13 @@ module sc_pending_unit ( assign speculative_flush_en = i_speculative_flush_en; assign speculative_partial_flush = i_speculative_partial_flush; - // SC pending state (rob_tag / addr are internal; sc_pending is also output) - logic sc_pending; - logic [riscv_pkg::ReorderBufferTagWidth-1:0] sc_pending_rob_tag; - logic [riscv_pkg::XLEN-1:0] sc_pending_addr; + // SC tracking table: one entry per in-flight SC, keyed by ROB tag. + localparam int unsigned ScTableDepth = riscv_pkg::NumCheckpoints + 1; + logic [ScTableDepth-1:0] sct_valid; + logic [riscv_pkg::ReorderBufferTagWidth-1:0] sct_tag[ScTableDepth]; + logic [riscv_pkg::XLEN-1:0] sct_addr[ScTableDepth]; - // Age comparison for SC flush guard (identical to load_queue/reservation_station) + // Age comparison for the SC flush guard (identical to load_queue / RS). function automatic logic is_younger(input logic [riscv_pkg::ReorderBufferTagWidth-1:0] entry_tag, input logic [riscv_pkg::ReorderBufferTagWidth-1:0] flush_tag, input logic [riscv_pkg::ReorderBufferTagWidth-1:0] head); @@ -104,70 +114,111 @@ module sc_pending_unit ( end endfunction + // Head match: an in-flight SC sits at the ROB head. 
+ logic sct_hit; + logic [riscv_pkg::XLEN-1:0] sct_hit_addr; + logic [ ScTableDepth-1:0] sct_hit_oh; + always_comb begin + sct_hit = 1'b0; + sct_hit_addr = '0; + sct_hit_oh = '0; + for (int i = 0; i < ScTableDepth; i++) begin + if (sct_valid[i] && (sct_tag[i] == head_tag)) begin + sct_hit = 1'b1; + sct_hit_addr = sct_addr[i]; + sct_hit_oh[i] = 1'b1; + end + end + end + + // First free slot for a newly-issued SC. + logic sct_has_free; + logic [ScTableDepth-1:0] sct_free_oh; + always_comb begin + sct_has_free = 1'b0; + sct_free_oh = '0; + for (int i = 0; i < ScTableDepth; i++) begin + if (!sct_valid[i] && !sct_has_free) begin + sct_has_free = 1'b1; + sct_free_oh[i] = 1'b1; + end + end + end + // Capture an issuing SC. Reject a phantom SC only when it is younger than the + // flush boundary (it is being killed); a real SC that survives the flush must + // be captured even if its issue coincides with the flush window. + logic sct_alloc; + assign sct_alloc = o_mem_rs_issue.valid && !speculative_flush_all && + (o_mem_rs_issue.op == riscv_pkg::SC_W) && + !(speculative_flush_en && is_younger( + o_mem_rs_issue.rob_tag, i_flush_tag, head_tag + )); + logic sc_can_fire; logic sc_success; logic sc_fire_now; - assign sc_can_fire = sc_pending && (sc_pending_rob_tag == head_tag) && sq_committed_empty; + assign sc_can_fire = sct_hit && sq_committed_empty; assign sc_success = lq_reservation_valid - && (lq_reservation_addr[riscv_pkg::XLEN-1:2] == sc_pending_addr[riscv_pkg::XLEN-1:2]); - // Arm SC only when the MEM adapter has no competing same-cycle producer. - // This keeps the rare SC head-tag compare local to the SC register D path; - // the registered completion below owns the MEM adapter on the next cycle. + && (lq_reservation_addr[riscv_pkg::XLEN-1:2] == sct_hit_addr[riscv_pkg::XLEN-1:2]); + // Arm SC only when the MEM adapter has no competing same-cycle producer; the + // registered completion below owns the MEM adapter on the next cycle. 
assign sc_fire_now = sc_can_fire && !mem_adapter_result_pending && !lq_fu_complete.valid && !store_misalign_issue && !store_misalign_fu_complete_reg.valid; - // SC fu_complete generation + // SC fu_complete generation. The firing SC's tag IS head_tag (it matched). riscv_pkg::fu_complete_t sc_fu_complete; always_comb begin sc_fu_complete = '0; sc_fu_complete.valid = sc_fire_now; - sc_fu_complete.tag = sc_pending_rob_tag; + sc_fu_complete.tag = head_tag; sc_fu_complete.value = {{(riscv_pkg::FLEN - 1) {1'b0}}, ~sc_success}; end + // Table valid bits: allocate on SC issue, free on fire, flush younger entries. always_ff @(posedge i_clk) begin - if (!i_rst_n) begin - sc_pending <= 1'b0; - end else if (speculative_flush_all) begin - sc_pending <= 1'b0; + if (!i_rst_n || speculative_flush_all) begin + sct_valid <= '0; end else begin - // Set when MEM_RS issues SC. Gate with flush signals because - // the RS output valid is no longer suppressed during flush for - // timing closure — a phantom SC set during partial flush would - // leave sc_pending stuck (the flushed tag never reaches head). - if (o_mem_rs_issue.valid && !speculative_flush_all && !speculative_flush_en - && (o_mem_rs_issue.op == riscv_pkg::SC_W)) begin - sc_pending <= 1'b1; + // Clear ONLY entries younger than the flush boundary (i_flush_tag) -- i.e. + // actually being flushed. Do NOT clear on speculative_partial_flush alone: + // an SC older than the mispredicted branch (e.g. one still waiting for the + // head to reach it on the slow cached tier) must survive. + if (i_flush_en) begin + for (int i = 0; i < ScTableDepth; i++) begin + if (sct_valid[i] && is_younger(sct_tag[i], i_flush_tag, head_tag)) begin + sct_valid[i] <= 1'b0; + end + end end - // Clear when SC fu_complete is armed for the registered MEM path. + // Free the firing entry. 
if (sc_fire_now) begin - sc_pending <= 1'b0; + for (int i = 0; i < ScTableDepth; i++) if (sct_hit_oh[i]) sct_valid[i] <= 1'b0; end - // A pending SC is speculative if it is younger than the flush boundary, - // or if recovery is draining everything younger than the current/just- - // retired head. - if (i_flush_en && sc_pending && (speculative_partial_flush || is_younger( - sc_pending_rob_tag, i_flush_tag, head_tag - ))) begin - sc_pending <= 1'b0; + // Allocate a newly-issued SC into the first free slot. (Alloc targets a + // free slot; fire/flush clear valid slots, so the indices never collide.) + if (sct_alloc && sct_has_free) begin + for (int i = 0; i < ScTableDepth; i++) if (sct_free_oh[i]) sct_valid[i] <= 1'b1; end end end - // SC data capture (no reset - gated by sc_pending) + // SC tag/addr capture (no reset; gated by the alloc one-hot). always_ff @(posedge i_clk) begin - if (o_mem_rs_issue.valid && !speculative_flush_all && !speculative_flush_en - && (o_mem_rs_issue.op == riscv_pkg::SC_W)) begin - sc_pending_rob_tag <= o_mem_rs_issue.rob_tag; - sc_pending_addr <= sq_effective_addr; + if (sct_alloc && sct_has_free) begin + for (int i = 0; i < ScTableDepth; i++) begin + if (sct_free_oh[i]) begin + sct_tag[i] <= o_mem_rs_issue.rob_tag; + sct_addr[i] <= sq_effective_addr; + end + end end end - assign o_sc_pending = sc_pending; + assign o_sc_pending = |sct_valid; assign o_sc_fu_complete = sc_fu_complete; endmodule diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv index 7aa7d4a3..739e12dd 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv @@ -1331,8 +1331,13 @@ module tomasulo_wrapper #( logic mem_rs_fu_ready_base; logic mem_rs_fu_ready; + // Do NOT gate SC issue on (sc_pending && next_is_sc). 
That single-SC + // serialization deadlocked Linux: under speculation a YOUNGER SC issues + // out-of-order, sets sc_pending, and then this gate blocked the OLDER head SC + // from ever issuing -- so it never fired, sc_pending never cleared, and the + // core hung at _prb_commit. sc_pending_unit now tracks multiple in-flight SCs + // (a table keyed by ROB tag), so several SCs may legitimately be in flight. assign mem_rs_fu_ready_base = i_mem_rs_fu_ready && - !(sc_pending && mem_rs_next_is_sc) && !sc_fu_complete_reg.valid && !mem_adapter_result_pending && !i_backend_recovery_hold; diff --git a/sw/apps/ddr_atomic_test/Makefile b/sw/apps/ddr_atomic_test/Makefile new file mode 100644 index 00000000..aa2cc97b --- /dev/null +++ b/sw/apps/ddr_atomic_test/Makefile @@ -0,0 +1,17 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Makefile for the DDR (cached-tier) atomics reproducer +SRC_C := main.c +include ../../common/common.mk diff --git a/sw/apps/ddr_atomic_test/main.c b/sw/apps/ddr_atomic_test/main.c new file mode 100644 index 00000000..dd39f8ba --- /dev/null +++ b/sw/apps/ddr_atomic_test/main.c @@ -0,0 +1,95 @@ +/* + * Copyright 2026 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Directed reproducer for RV32-A atomics to the CACHED DDR region. + * + * A no-MMU Linux boot hangs on a store-conditional (sc.w.rl) to a printk + * ring-buffer descriptor in DDR -- i.e. LR/SC to the cached tier deadlocks, + * even though atomics to low BRAM work (FreeRTOS A-extension stress passes). + * + * This isolates it: the target variable lives in .ddr_data (DDR / cached + * tier). A progress letter is printed BEFORE each step so the last letter + * received over UART pinpoints which operation wedged: + * "S" started + * "SL" plain DDR store/load OK (hang at AMO) + * "SLA" AMO (amoadd.w) to DDR OK (hang at LR/SC) + * "SLAC" LR/SC to DDR OK + * "<<PASS>>" all DDR atomics work (then the kernel hang is elsewhere) + */ + +#include <stdint.h> + +#define UTX (*(volatile uint32_t *) 0x40000000u) +#define UTX_ST (*(volatile uint32_t *) 0x40000028u) +static void putc_(char c) +{ + while (!(UTX_ST & 1u)) { + } + UTX = (uint8_t) c; +} +static void puts_(const char *s) +{ + while (*s) + putc_(*s++); +} + +/* Lives in the cached DDR region. */ +__attribute__((section(".ddr_data"))) static volatile uint32_t ddr_var = 0x10; + +int main(void) +{ + putc_('S'); + + /* 1. plain DDR store/load (should already work -- ddr_test passes). */ + ddr_var = 0x20; + if (ddr_var != 0x20) { + puts_("\r\n<<FAIL>> ddr store/load\r\n"); + for (;;) { + } + } + putc_('L'); + + /* 2. AMO to DDR (amoadd.w). Hangs here if AMO-to-cached deadlocks. 
*/ + uint32_t old_amo; + __asm__ volatile("amoadd.w %0, %2, (%1)" : "=r"(old_amo) : "r"(&ddr_var), "r"(1u) : "memory"); + if (ddr_var != 0x21) { + puts_("\r\n<<FAIL>> amo result\r\n"); + for (;;) { + } + } + putc_('A'); + + /* 3. LR/SC compare-exchange to DDR (matches the kernel's sc.w.rl). */ + uint32_t prev; + __asm__ volatile("1: lr.w %0, (%1)\n" + " sc.w.rl t0, %2, (%1)\n" + " bnez t0, 1b\n" + : "=&r"(prev) + : "r"(&ddr_var), "r"(0xABCDu) + : "t0", "memory"); + if (ddr_var != 0xABCDu) { + puts_("\r\n<<FAIL>> lr/sc result\r\n"); + for (;;) { + } + } + putc_('C'); + + puts_("\r\n<<PASS>>\r\n"); + for (;;) { + } + return 0; +} diff --git a/sw/apps/linux_boot/Makefile b/sw/apps/linux_boot/Makefile new file mode 100644 index 00000000..171eecf1 --- /dev/null +++ b/sw/apps/linux_boot/Makefile @@ -0,0 +1,31 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Builds the cocotb memory images for the no-MMU Linux boot: a low-BRAM boot +# shim (sw.mem) plus the kernel Image + DTB in DDR (sw_ddr.mem). +# +# NOTE (bring-up): this pulls the kernel Image and DTB from the external +# linux-mvp tree via build_linux_mem.py. It is the integration scaffolding for +# first boot and will be replaced by an in-repo build once Linux boots. 
+ARTIFACTS ?= $(HOME)/bigger_l0/linux-mvp/frost-artifacts + +all: sw.mem sw_ddr.mem + +sw.mem sw_ddr.mem: + python3 $(ARTIFACTS)/build_linux_mem.py + cp -f $(ARTIFACTS)/sw.mem ./sw.mem + cp -f $(ARTIFACTS)/sw_ddr.mem ./sw_ddr.mem + +clean: + rm -f sw.mem sw_ddr.mem shim.elf shim.bin diff --git a/tests/test_run_cocotb.py b/tests/test_run_cocotb.py index b3edbaf3..f35dbe36 100755 --- a/tests/test_run_cocotb.py +++ b/tests/test_run_cocotb.py @@ -181,6 +181,20 @@ class CocotbRunConfig: app_name="clint_test", description="SiFive CLINT alias directed test (Linux glue)", ), + "linux_boot": CocotbRunConfig( + python_test_module="cocotb_tests.test_real_program", + hdl_toplevel_module="frost", + app_name="linux_boot", + description="No-MMU Linux boot (kernel Image in DDR)", + include_in_pytest=False, + ), + "ddr_atomic_test": CocotbRunConfig( + python_test_module="cocotb_tests.test_real_program", + hdl_toplevel_module="frost", + app_name="ddr_atomic_test", + description="RV32-A atomics to the cached DDR region (LR/SC, AMO)", + include_in_pytest=False, + ), "freertos_demo": CocotbRunConfig( python_test_module="cocotb_tests.test_real_program", hdl_toplevel_module="frost", diff --git a/verif/cocotb_tests/test_real_program.py b/verif/cocotb_tests/test_real_program.py index dd2611aa..7b3f3d95 100644 --- a/verif/cocotb_tests/test_real_program.py +++ b/verif/cocotb_tests/test_real_program.py @@ -184,6 +184,9 @@ async def generate_divided_clock(dut: Any) -> None: # sprintf_test needs more cycles due to ~200 test cases with heavy FP formatting on RV32 SPRINTF_TEST_MAX_CYCLES = 2000000 +# No-MMU Linux boot: reaching the kernel banner takes millions of cycles. 
+LINUX_BOOT_MAX_CYCLES = int(os.environ.get("COCOTB_LINUX_MAX_CYCLES", 20000000)) + # Number of clock cycles to hold reset between runs RESET_CYCLES = 10 @@ -647,6 +650,9 @@ def get_expected_behavior() -> tuple[str | None, str | None, bool, str | None]: if app_name == "hello_world": # Just needs to print the first hello message return (None, "Hello, world!", False, app_name) + if app_name == "linux_boot": + # No-MMU Linux boot: pass when the kernel banner appears. + return (None, "Linux version", False, app_name) if app_name == "uart_echo": # Interactive test handled separately (UART input injection) return (None, None, False, app_name) @@ -2684,6 +2690,8 @@ async def test_real_program(dut: Any) -> None: max_cycles = COREMARK_MAX_CYCLES elif app_name == "sprintf_test": max_cycles = SPRINTF_TEST_MAX_CYCLES + elif app_name == "linux_boot": + max_cycles = LINUX_BOOT_MAX_CYCLES else: max_cycles = MAX_CYCLES From a0cfdf7ce3cff32d0d84ba2f9c466ccecf706c47 Mon Sep 17 00:00:00 2001 From: Adam Bagley Date: Fri, 19 Jun 2026 15:15:29 -0400 Subject: [PATCH 11/43] Docs + CI for the LR/SC store-conditional fix - ddr_atomic_test: include_in_pytest=True so CI runs it (bram tier; it self-skips the ddr relink tier via its ddr_ name in DDR_TIER_EXCLUDE). linux_boot stays excluded from pytest (needs external kernel artifacts). - tomasulo_wrapper/README.md: rewrite the SC-state-machine section and the sc_pending_unit row for the multi-SC table and the removed issue-serialization gate. - sc_pending_unit.sv: correct the header comment (under speculation the head SC was blocked from issuing by the gate, not overwritten in the register). - tests/README.md: list ddr_atomic_test among the DDR-tier-excluded programs. 
--- .../cpu/tomasulo/tomasulo_wrapper/README.md | 18 ++++++++++++++++-- .../atomics/sc_pending_unit.sv | 15 +++++++++------ tests/README.md | 2 +- tests/test_run_cocotb.py | 2 +- 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/README.md b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/README.md index db64dcab..25714efe 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/README.md +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/README.md @@ -17,7 +17,7 @@ verbatim, so the flattened design is unchanged: | `commit_bus_pipeline` | `commit_bus/` | The four `always_ff` that register the combinational ROB commit bus into `commit_bus_q` / `commit_bus_2_q` plus the decomposed `commit_q_*` fields. | | `sq_early_addr_pipeline` | `store_addr/` | The dual-ported early store-address stage (register dispatch base+imm, add the next cycle off the dispatch critical path) that produces the two SQ early-address update packets. | | `dispatch_rs_router` | `dispatch_routing/` | Combinational decode of the dispatch packet(s) into per-RS dispatch-valid signals (slot 1 + slot 2) and the fast slot-1 "intent" signals. | -| `sc_pending_unit` | `atomics/` | Store-conditional resolution: the SC pending-register FSM (set at MEM_RS SC issue, cleared on fire / flush / age), its rob_tag+addr capture, the fire/success decode, and the `sc_fu_complete` packet. | +| `sc_pending_unit` | `atomics/` | Store-conditional resolution: a per-ROB-tag table of in-flight SCs (allocated at MEM_RS SC issue, freed on fire / flush), the head-match fire/success decode, and the `sc_fu_complete` packet. | The per-RS dispatch-valid nets in `dispatch_rs_router` carry `(* max_fanout = 32 *)`; the attribute is preserved both in the submodule and on the wrapper-side @@ -48,7 +48,7 @@ while the entry was queued gets a fresh value. 
### SC state machine -The SC pending FSM and its fire/success decode live in +The SC tracking table and its fire/success decode live in `atomics/sc_pending_unit.sv`; the surrounding store-misalign path and MEM-adapter mux described below stay in the wrapper. @@ -61,6 +61,20 @@ result is just `~reservation_valid`. On failure, the wrapper sends a discard signal to the SQ to drop the SC's entry without writing memory. +Several SCs can be in flight at once: a branch-speculated LR/SC retry +loop issues one SC per speculated iteration, and the MEM_RS may issue +them out of program order. `sc_pending_unit` therefore tracks every +in-flight SC in a small table keyed by ROB tag (depth `NumCheckpoints ++ 1`) and fires the entry whose tag matches the ROB head; a flush drops +only entries younger than the flush boundary, so a surviving older SC +is never lost. This replaced a single pending register plus a +`!(sc_pending && mem_rs_next_is_sc)` issue-serialization gate in +`mem_rs_fu_ready_base`: under speculation a younger SC could take the +register and the gate would then block the older head SC from issuing +at all, so it never fired and `sc_pending` never cleared — Linux +printk's `_prb_commit` cmpxchg on the cached DDR tier deadlocked +exactly that way. The gate is gone; the table makes concurrent SCs safe. + The `sc_fu_complete` output is registered (`sc_fu_complete_reg`) before feeding the MEM adapter. 
The combinational path from the full-flush term `speculative_flush_all` (driven by `i_flush_all` / diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/atomics/sc_pending_unit.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/atomics/sc_pending_unit.sv index b8b22252..4ebe9536 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/atomics/sc_pending_unit.sv +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/atomics/sc_pending_unit.sv @@ -23,12 +23,15 @@ // reaches the ROB head can ALWAYS fire -- even when an LR/SC retry loop is // branch-speculated and the core issues several SCs (one per speculated // iteration) before the oldest resolves. A single pending-SC register failed -// here: a younger speculative SC overwrote the head SC's rob_tag, so the head -// SC's tag never matched, it never fired, the branch never resolved, and the -// core deadlocked. Observed on Linux printk's _prb_commit cmpxchg loop (11 SCs -// issued, 8-deep speculation; head=tag15 but the register held tag19). BRAM -// LR/SC resolves before a second SC issues, so BRAM/FreeRTOS were unaffected; -// the longer cached-tier (DDR) latency exposes the overlap. +// here: under speculation the MEM_RS issues SCs out of program order, so a +// younger SC took the one register, and the wrapper's former issue- +// serialization gate (!(sc_pending && mem_rs_next_is_sc)) then blocked the +// OLDER head SC from issuing at all -- so it never fired and the core +// deadlocked. Observed on Linux printk's _prb_commit cmpxchg loop (11 SCs +// issued, 8-deep speculation; head=tag15 never issued, the register held +// tag19). This table pairs with removing that gate (see tomasulo_wrapper.sv). +// BRAM LR/SC resolves before a second SC issues, so BRAM/FreeRTOS were +// unaffected; the longer cached-tier (DDR) latency exposes the overlap. 
// // Two flush rules matter and were both bugs in the single-register version: // * an SC fires when head_tag matches a VALID entry and the SQ is drained; diff --git a/tests/README.md b/tests/README.md index ba852118..821a1d9a 100644 --- a/tests/README.md +++ b/tests/README.md @@ -110,7 +110,7 @@ FROST_COCOTB_MEM_CONFIG=ddr ./test_run_cocotb.py hello_world FROST_COCOTB_MEM_CONFIG=ddr pytest test_run_cocotb.py -k test_real_program ``` -Tests in `DDR_TIER_EXCLUDE` self-skip in the `ddr` tier: the `*_fetch_fuzz` fetch fuzzers, and the already-DDR-focused `ddr_*` programs (`ddr_test`, `ddr_exec_test`, `ddr_smc_test`, `ddr_heap_test`) whose fixed-address writes a whole-program relocation would clobber. Unit benches are tier-independent and run only once (in the `bram` job). +Tests in `DDR_TIER_EXCLUDE` self-skip in the `ddr` tier: the `*_fetch_fuzz` fetch fuzzers, and the already-DDR-focused `ddr_*` programs (`ddr_test`, `ddr_exec_test`, `ddr_smc_test`, `ddr_heap_test`, `ddr_atomic_test`) whose fixed-address writes a whole-program relocation would clobber. Unit benches are tier-independent and run only once (in the `bram` job). 
### `test_arch_compliance.py` diff --git a/tests/test_run_cocotb.py b/tests/test_run_cocotb.py index f35dbe36..cbc06a3d 100755 --- a/tests/test_run_cocotb.py +++ b/tests/test_run_cocotb.py @@ -193,7 +193,7 @@ class CocotbRunConfig: hdl_toplevel_module="frost", app_name="ddr_atomic_test", description="RV32-A atomics to the cached DDR region (LR/SC, AMO)", - include_in_pytest=False, + include_in_pytest=True, ), "freertos_demo": CocotbRunConfig( python_test_module="cocotb_tests.test_real_program", From 718f8cc576e02f50f245dbc0a0f9afce257cbe88 Mon Sep 17 00:00:00 2001 From: Adam Bagley Date: Sun, 21 Jun 2026 02:16:09 -0400 Subject: [PATCH 12/43] nommu Linux: fix MRET->U-mode timer interrupt-resume-PC (0x80388bba panic) An MRET returning below M-mode retires via the trap/MRET full flush, not the commit path, so cpu_ooo's interrupt_resume_pc was never refreshed to the MRET target and held the MRET instruction's own PC across the whole MRET-to-U window. A machine timer taken after privilege dropped to U (once the trap_unit inhibit lifted, before the first U-mode commit) then saved mepc = <MRET PC>; Linux later MRET'd to that kernel address in U-mode -> SIGILL at ret_from_exception+0x76 -> "Attempted to kill init" panic. Fix (cpu_ooo.sv): seed interrupt_resume_pc <= csr_mepc when mret_taken fires, so the U-target is in place before the inhibit window closes. Proven by a new directed test (sw/apps/mret_timer_resume_test): timer already pending across an MRET-to-U; asserts the saved resume PC is the U-mode target, not the MRET PC. FAIL (resume_mepc=MRET PC) before, PASS after. Directed regression green (umode, wfi_mepc, trap_unit, linux_irq stack/find/ddr/ active_ddr). On Genesys2 the boot clears the 0x80388bba panic and advances from ~0.85s to past initramfs unpacking. 
Also captures the entangled in-flight bring-up work in the same files (trap_unit MRET/interrupt inhibit window, slot-2 store-commit SQ guard, LR/SC load_queue), the linux_boot ret_from_exception image patch, and the fpga/load_software JTAG DDR-loader updates. --- README.md | 10 +- fpga/load_software/file_to_ddr.tcl | 57 +- fpga/load_software/load_software.py | 27 +- fpga/load_software/load_software.tcl | 2 +- handoff_linux_timer_irq_panic.md | 483 ++++++++++ hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv | 88 +- hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv | 160 ++- .../cpu/tomasulo/load_queue/README.md | 4 + .../cpu/tomasulo/load_queue/load_queue.sv | 25 +- .../commit_bus/commit_bus_pipeline.sv | 10 +- .../tomasulo_wrapper/tomasulo_wrapper.sv | 12 +- sw/apps/csr_rmw_test/Makefile | 17 + sw/apps/csr_rmw_test/main.c | 106 ++ sw/apps/linux_boot/Makefile | 29 +- .../linux_boot/patch_ret_from_exception.py | 92 ++ sw/apps/linux_irq_active_ddr_test/Makefile | 19 + sw/apps/linux_irq_active_ddr_test/main.c | 576 +++++++++++ sw/apps/linux_irq_ddr_test/Makefile | 19 + sw/apps/linux_irq_ddr_test/main.c | 457 +++++++++ .../linux_irq_find_next_slot_test/Makefile | 19 + sw/apps/linux_irq_find_next_slot_test/main.c | 907 ++++++++++++++++++ sw/apps/linux_irq_stack_slot_test/Makefile | 19 + sw/apps/linux_irq_stack_slot_test/main.c | 549 +++++++++++ sw/apps/mret_timer_resume_test/Makefile | 17 + sw/apps/mret_timer_resume_test/main.c | 193 ++++ sw/apps/wfi_mepc_test/Makefile | 17 + sw/apps/wfi_mepc_test/main.c | 100 ++ tests/Makefile | 6 + tests/test_run_cocotb.py | 47 + verif/cocotb_tests/control/test_trap_unit.py | 137 +++ verif/cocotb_tests/test_real_program.py | 596 +++++++++++- .../tomasulo/load_queue/test_load_queue.py | 62 ++ .../tomasulo_wrapper/test_tomasulo_wrapper.py | 109 ++- .../tomasulo_wrapper/tomasulo_interface.py | 10 + 34 files changed, 4898 insertions(+), 83 deletions(-) create mode 100644 handoff_linux_timer_irq_panic.md create mode 100644 sw/apps/csr_rmw_test/Makefile 
create mode 100644 sw/apps/csr_rmw_test/main.c create mode 100644 sw/apps/linux_boot/patch_ret_from_exception.py create mode 100644 sw/apps/linux_irq_active_ddr_test/Makefile create mode 100644 sw/apps/linux_irq_active_ddr_test/main.c create mode 100644 sw/apps/linux_irq_ddr_test/Makefile create mode 100644 sw/apps/linux_irq_ddr_test/main.c create mode 100644 sw/apps/linux_irq_find_next_slot_test/Makefile create mode 100644 sw/apps/linux_irq_find_next_slot_test/main.c create mode 100644 sw/apps/linux_irq_stack_slot_test/Makefile create mode 100644 sw/apps/linux_irq_stack_slot_test/main.c create mode 100644 sw/apps/mret_timer_resume_test/Makefile create mode 100644 sw/apps/mret_timer_resume_test/main.c create mode 100644 sw/apps/wfi_mepc_test/Makefile create mode 100644 sw/apps/wfi_mepc_test/main.c create mode 100644 verif/cocotb_tests/control/test_trap_unit.py diff --git a/README.md b/README.md index 6b01ff01..62f42e1f 100644 --- a/README.md +++ b/README.md @@ -353,11 +353,11 @@ controller calibrates, so software never observes an uninitialized main memory. 
| Resource | Used | Available | Util% | |----------|-----:|----------:|------:| -| Slice LUTs | 129,281 | 203,800 | 63.4% | -| LUT as Logic | 120,714 | 203,800 | 59.2% | -| LUT as Distributed RAM | 7,722 | — | — | -| LUT as Shift Register | 845 | — | — | -| Slice Registers | 86,734 | 407,600 | 21.3% | +| Slice LUTs | 129,853 | 203,800 | 63.7% | +| LUT as Logic | 121,241 | 203,800 | 59.5% | +| LUT as Distributed RAM | 7,768 | — | — | +| LUT as Shift Register | 844 | — | — | +| Slice Registers | 87,128 | 407,600 | 21.4% | | Block RAM Tile | 189.5 | 445 | 42.6% | | DSPs | 36 | 840 | 4.3% | | F7 Muxes | 98 | 101,900 | 0.1% | diff --git a/fpga/load_software/file_to_ddr.tcl b/fpga/load_software/file_to_ddr.tcl index 1a3ee52f..217cbec1 100644 --- a/fpga/load_software/file_to_ddr.tcl +++ b/fpga/load_software/file_to_ddr.tcl @@ -28,38 +28,59 @@ proc file2ddr {firmware_filename {axi_interface_name hw_axi_2} {burst_words 256}} { set file_descriptor [open $firmware_filename r] - set words [list] - while {[gets $file_descriptor word_hex_value] >= 0} { - set trimmed [string trim $word_hex_value] - if {$trimmed ne ""} { - lappend words $trimmed - } - } - close $file_descriptor - set total_words [llength $words] + # Stream the image in burst-sized chunks. Reading the whole file into one + # giant Tcl list and indexing it per word (lindex on a multi-MB list) is + # pathologically slow in the Vivado tcl interpreter -- THAT, not the JTAG, + # is what turned a ~6 MB Linux image into a ~17 min load (the actual + # create/run/delete of all ~8.8k bursts is only ~15 s). Reading burst_words + # lines at a time keeps every list tiny, so the data-prep is ~linear and + # negligible. run+delete in batches so the live hw_axi_txn set stays bounded. 
+ set axi [get_hw_axis $axi_interface_name] set current_address 0 set transaction_number 0 - set index 0 + set total_words 0 + set batch 0 + set batch_limit 512 + + while {1} { + # Collect up to burst_words words for this burst (skipping blank lines, + # so non-blank word N still lands at DDR offset N -- matches the old + # read-all-then-index behaviour). + set chunk [list] + for {set i 0} {$i < $burst_words} {incr i} { + if {[gets $file_descriptor word_hex_value] < 0} { break } + set trimmed [string trim $word_hex_value] + if {$trimmed ne ""} { lappend chunk $trimmed } + } + set beats [llength $chunk] + if {$beats == 0} { break } - while {$index < $total_words} { - set beats [expr {min($burst_words, $total_words - $index)}] # hw_axi burst data is one bit-vector with beat 0 in the least # significant word: concatenate this burst's words last-to-first. set data "" for {set b [expr {$beats - 1}]} {$b >= 0} {incr b -1} { - append data [lindex $words [expr {$index + $b}]] + append data [lindex $chunk $b] } - set formatted_address [format 0x%08x $current_address] - create_hw_axi_txn ddrwr$transaction_number [get_hw_axis $axi_interface_name] \ - -type write -address $formatted_address -len $beats -data $data + create_hw_axi_txn ddrwr$batch $axi \ + -type write -address [format 0x%08x $current_address] -len $beats -data $data + incr batch incr transaction_number - incr index $beats + incr total_words $beats incr current_address [expr {4 * $beats}] + if {$batch >= $batch_limit} { + run_hw_axi [get_hw_axi_txns ddrwr*] + delete_hw_axi_txn [get_hw_axi_txns ddrwr*] + set batch 0 + puts " DDR load progress: $total_words words" + flush stdout + } } + close $file_descriptor - if {$transaction_number > 0} { + if {$batch > 0} { run_hw_axi [get_hw_axi_txns ddrwr*] + delete_hw_axi_txn [get_hw_axi_txns ddrwr*] } puts "Loaded $total_words DDR words in $transaction_number burst transaction(s)" diff --git a/fpga/load_software/load_software.py b/fpga/load_software/load_software.py index 
b835cb07..0f3dec56 100755 --- a/fpga/load_software/load_software.py +++ b/fpga/load_software/load_software.py @@ -55,6 +55,10 @@ "fpu_test", "hello_world", "isa_test", + "linux_irq_active_ddr_test", + "linux_boot", + "linux_irq_ddr_test", + "linux_irq_stack_slot_test", "memory_test", "packet_parser", "print_clock_speed", @@ -92,6 +96,10 @@ "ddr_heap_test", "ddr_smc_test", "ddr_test", + "linux_irq_active_ddr_test", + "linux_boot", + "linux_irq_ddr_test", + "linux_irq_stack_slot_test", } @@ -101,6 +109,7 @@ def compile_app_for_board( clock_freq: int, coremark_iterations: int, make_vars: dict[str, str] | None = None, + mem_config: str | None = None, ) -> bool: """Compile the application with board-specific settings. @@ -110,6 +119,7 @@ def compile_app_for_board( clock_freq: CPU clock frequency for this board coremark_iterations: Number of iterations for CoreMark make_vars: Extra make variable overrides + mem_config: If set, exported as MEM_CONFIG to relink the app (e.g. "ddr") Returns: True if compilation succeeded, False otherwise @@ -123,6 +133,11 @@ def compile_app_for_board( env["FPGA_CPU_CLK_FREQ"] = str(clock_freq) if app_name == "coremark": env["ITERATIONS"] = str(coremark_iterations) + # MEM_CONFIG=ddr relinks the app's code into the cached DDR region (the app + # Makefiles default to bram); this lets an arbitrary app run from DDR like + # the dedicated ddr_* apps. The Makefile's `?=` honors this env override. + if mem_config: + env["MEM_CONFIG"] = mem_config try: # Clean first to force recompilation with new settings @@ -204,6 +219,15 @@ def main() -> None: default="vivado", help="Path to Vivado executable (default: vivado from PATH)", ) + parser.add_argument( + "--ddr", + action="store_true", + help=( + "Build the app to execute from the cached DDR region (passes " + "MEM_CONFIG=ddr to the app Makefile), so an otherwise BRAM-resident " + "app runs its code from DDR. Requires a board with has_ddr." 
+ ), + ) coremark_pro_mode = parser.add_mutually_exclusive_group() coremark_pro_mode.add_argument( "-v0", @@ -427,7 +451,8 @@ def main() -> None: elif args.coremark_pro_mode == "validation": print(" CoreMark-PRO run type: validation (-v1)") if not compile_app_for_board( - args.software_app, app_dir, clock_freq, coremark_iterations, make_vars + args.software_app, app_dir, clock_freq, coremark_iterations, make_vars, + mem_config="ddr" if args.ddr else None, ): print(f"Error: Failed to compile {args.software_app}", file=sys.stderr) sys.exit(1) diff --git a/fpga/load_software/load_software.tcl b/fpga/load_software/load_software.tcl index 1d812293..3a2fba0c 100644 --- a/fpga/load_software/load_software.tcl +++ b/fpga/load_software/load_software.tcl @@ -41,7 +41,7 @@ set coremark_pro_apps [list coremark_pro_core coremark_pro_cjpeg \ set valid_apps [list branch_pred_test c_ext_test call_stress cf_ext_test coremark \ {*}$coremark_pro_apps csr_test ddr_exec_test ddr_heap_test \ ddr_smc_test ddr_test freertos_demo fpu_assembly_test fpu_test \ - hello_world isa_test memory_test \ + hello_world isa_test linux_irq_active_ddr_test linux_boot linux_irq_ddr_test linux_irq_stack_slot_test memory_test \ packet_parser print_clock_speed ras_stress_test ras_test \ spanning_test sprintf_test strings_test tomasulo_perf \ tomasulo_test uart_echo] diff --git a/handoff_linux_timer_irq_panic.md b/handoff_linux_timer_irq_panic.md new file mode 100644 index 00000000..050accc9 --- /dev/null +++ b/handoff_linux_timer_irq_panic.md @@ -0,0 +1,483 @@ +# Fresh handoff - FROST no-MMU Linux boot on Genesys2 + +Last updated by Codex: 2026-06-21. Latest hardware run described here was on +2026-06-20. This file is meant to be self-contained for a fresh agent. + +## Mission + +Boot no-MMU M-mode Linux on real Genesys2 hardware with the FROST RV32 out-of-order +core. Do not treat this as only a single bug fix. The larger goal is real hardware +Linux bring-up. 
+ +Current state (updated 2026-06-21 by Claude): the `0x80388bba` panic root cause is +now PROVEN in directed simulation and FIXED in RTL. The fix is verified in sim but +NOT yet on hardware. The next step is a Genesys2 bitstream rebuild with the +`cpu_ooo.sv` change and a hardware Linux boot re-test. See +"## RESOLVED 2026-06-21: stale interrupt_resume_pc across MRET-to-U (proven + fixed)" +below for the full proof, the one-line-class RTL fix, and the new directed test. + +## RESOLVED 2026-06-21: stale interrupt_resume_pc across MRET-to-U (proven + fixed) + +### Proven root cause + +An MRET that returns below M-mode retires through the trap/MRET **full flush**, NOT +through the normal commit path: + +- `o_mret_taken` asserts combinationally on the `o_mret_start` cycle (call it T). +- One cycle later (T+1) `mret_taken_reg` is high, and `misprediction_flush_controller` + drives `flush_all` combinationally from it. `flush_all` wipes the ROB head and gates + `commit_en` (reorder_buffer.sv), so the MRET is squashed and **never appears on + `rob_commit_valid_raw`**. +- `interrupt_resume_pc` (cpu_ooo.sv) only updates on a valid ROB commit, so the MRET + never refreshes it. It keeps the architectural next-PC of the instruction *before* + the MRET — which equals the MRET instruction's own PC (in Linux, the `c.lwsp + sp,8(sp)` at `0x80388bb8` makes that exactly `0x80388bba`). +- `trap_unit` only inhibits interrupts at T and T+1 (`i_mret_start || mret_taken_prev`). + From T+3 onward (priv = U, inhibit dropped, registered timer re-eligible) until the + first post-MRET instruction commits, a machine timer is taken and saves + `mepc = interrupt_resume_pc = <MRET PC>`. +- Linux later restores that trap frame and `mret`s to the kernel MRET PC while in + U-mode → illegal instruction (signal 4) → "Attempted to kill init". 
+ +This was confirmed two ways: independent static trace across +rob_serializer / reorder_buffer / misprediction_flush_controller / trap_unit / csr_file, +and a directed cocotb sim (below). The kernel disassembly was verified: runtime +`0x80388bba` is `mret` (0x30200073) at `ret_from_exception+0x76`, preceded by a 2-byte +non-branch `c.lwsp sp,8(sp)` — so `interrupt_resume_pc == 0x80388bba` pre-MRET is +legitimate, and `ret_from_exception` is the unified (U and M) return path. + +### Directed test (new) + +`sw/apps/mret_timer_resume_test/` (registered in `tests/test_run_cocotb.py`). It is a +focused variant of `umode_test`'s timer-preempts-U case: it makes the machine timer +*already pending* (`mtimecmp = 0`) before an MRET-to-U, and the naked M-mode handler +additionally records `mepc`. It asserts the saved resume PC is the U-mode target +(`&u_spin`), never the MRET's own PC. + +Run it the standard way (`frost/tests`, `make clean`, `./test_run_cocotb.py +mret_timer_resume_test`). At low addresses the analog of `0x80388bba` is the inlined +`run_in_umode_pending_timer` MRET at `0x1c6`; the correct resume PC is `u_spin` at +`0xea`. + +- BEFORE fix: `cause=0x80000007 from_priv=0x0 resume_mepc=0x000001C6` → `<<FAIL>>` + (the bug: mepc = MRET PC). +- AFTER fix: `cause=0x80000007 from_priv=0x0 resume_mepc=0x000000EA` → `<<PASS>>`. + +The `FROST_DBG ... TRAP` probe shows the same timer trap (cycle 1095000) flipping +`resume_pc` from the stale `1c6` to `ea`, while the live `rob_pc` stays `1c6` — i.e. +the resume PC is now correctly decoupled from the squashed MRET head. 
+ +### The fix (RTL) + +`hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv`, the `interrupt_resume_pc` always_ff: add a +highest-priority branch that seeds it from the MRET target the cycle `mret_taken` +fires, so the U-target is in place before the inhibit window closes: + +```systemverilog +end else if (mret_taken) begin + interrupt_resume_pc <= csr_mepc; // MRET retires via flush, never via commit; + // seed the resume PC from the MRET target now +end else if (rob_commit_2_valid_raw) begin + ... +``` + +`csr_mepc` is stable at that cycle (MRET does not write mepc; a trap entry that would +cannot coincide with `mret_taken`), and it equals the MRET redirect target. No +regression to the normal precise-interrupt-resume path: for non-MRET interrupts and +the WFI/empty-ROB case the commit branches are unchanged; nothing commits on the +`mret_taken` cycle (serializer has `commit_stall=1`), so the new branch never steals a +real commit's update. It is a narrow 1-bit select on a non-critical register (feeds +only `trap_unit.i_interrupt_pc`), so it should be timing-benign. + +### Next step (hardware) + +Rebuild the Genesys2 bitstream with this `cpu_ooo.sv` change and re-run the hardware +Linux boot (`python3 /tmp/linux_boot_watch.py`). The `0x80388bba` user-mode-MRET +illegal-instruction panic should no longer occur. If a new/different failure appears, +treat it as a fresh symptom — this specific stale-resume-PC mechanism is now closed. + +## Environment + +FROST repo: + +```text +/home/adam-bagley/bigger_l0/frost +``` + +Relevant external Linux tree: + +```text +/home/adam-bagley/bigger_l0/linux-mvp/buildroot/output/build/linux-6.18.7 +``` + +Hardware: + +- Genesys2 / Kintex-7. +- UART is `/dev/ttyUSB0`, 115200 8N1. +- The user programs FPGA bitstreams manually and tells the agent when the FPGA is ready. +- Use the boot-watch script rather than minicom for capture. 
+ +Hardware boot command: + +```sh +python3 /tmp/linux_boot_watch.py +``` + +Latest synchronized UART log: + +```text +/tmp/genesys2_linux_boot_synchronized.log +``` + +The worktree is dirty and contains intentional changes plus unrelated older bring-up +changes. Do not revert wholesale. Start with `git status --short` and inspect before +editing. + +## Latest hardware result + +The user programmed a Genesys2 bitstream containing the newest `trap_unit.sv` changes +and the current `cpu_ooo.sv` interrupt-resume plumbing. Running: + +```sh +python3 /tmp/linux_boot_watch.py +``` + +rebuilt and loaded `sw/apps/linux_boot`, patched the local DDR image, loaded the FPGA, +and captured UART. The boot got past the original `_find_next_bit` / `ra=0xcc0` +panic and reached later initcall/pty territory, but still died: + +```text +[ 0.847064] swapper/0[1]: unhandled signal 4 code 0x1 at 0x80388bba +... +[ 1.095342] Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000004 +... +[<80388bba>] ret_from_exception+0x76/0x7a +``` + +`0x80388bba` is Linux `ret_from_exception`'s final `mret` instruction: + +```text +00388b74: csrw mepc,a2 +... +00388bb8: lw sp,8(sp) +00388bba: mret +``` + +The current bad symptom is therefore: user context eventually tries to execute the +kernel's `mret` instruction at `0x80388bba`, which is illegal outside M-mode. + +## Important image-patch detail + +`sw/apps/linux_boot/patch_ret_from_exception.py` patches the local FPGA-loadable DDR +image after copying from external Linux artifacts: + +```text +word 0xe22dc: 18c1202f -> ff757513 +``` + +The patch applies to: + +```text +sw/apps/linux_boot/sw_ddr.mem +sw/apps/linux_boot/sw_ddr.txt +``` + +Do not use `vmlinux` objdump alone to decide whether the image was patched. The +external `vmlinux` and `linux-mvp/frost-artifacts/sw_ddr.txt` remain unpatched. The +loaded local dense image was patched in the latest run. 
Current verification command: + +```sh +rg -n "18c1202f|ff757513" sw/apps/linux_boot/sw_ddr.txt \ + /home/adam-bagley/bigger_l0/linux-mvp/frost-artifacts/sw_ddr.txt +``` + +Expected current output: + +```text +sw/apps/linux_boot/sw_ddr.txt:926429:ff757513 +/home/adam-bagley/bigger_l0/linux-mvp/frost-artifacts/sw_ddr.txt:926429:18c1202f +``` + +## What is already fixed or ruled out + +### 1. Original `_find_next_bit` / `ra=0xcc0` panic + +Original hardware failure: + +```text +FROST_IRQ_ENTER epc=801657ae ra=80094556 sp=804c3e40 cause=80000007 slot12=00000cc0 +FROST_IRQ_RETURN epc=801657ae ra=80094556 + +Oops - illegal instruction +epc : 00000cc0 +ra : 00000cc0 +sp : 804c3e50 +``` + +The UART probe proved the trap frame itself still had sane `epc`/`ra`; the interrupted +callee's own saved return-address slot at `12(sp)` was already stale `0x00000cc0` at +IRQ entry. That pointed to a lost stack store, not trap-frame corruption. + +Root cause found: a same-cycle slot-2 store-like ROB commit could be missed by the +store queue's committed-empty guard during a full trap flush. `store_queue.sv` had +raw guard ports for a second commit slot, but `tomasulo_wrapper.sv` had tied them off. +This could let a timer IRQ full-flush while slot 2's store commit was still one cycle +away from the SQ; the registered commit then got masked, losing stores like +`sw ra,12(sp)`. + +Fixes/checks now present in the worktree: + +- `hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv` + connects raw slot-2 store-like commit information into the SQ guard. +- `hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv` computes + `sq_committed_empty_for_trap = sq_committed_empty && !rob_commit_store_like_raw && + !rob_commit_2_store_like_raw`. +- Directed tests were added for the Linux IRQ stack slot and the wrapper slot-2 guard. + +After these changes, the old `_find_next_bit` / `slot12=0xcc0` signature did not +reproduce in the next hardware runs. Treat it as fixed unless it reappears. 
+ +### 2. MRET/interrupt race at `ret_from_exception::mret` + +After the slot-2 store fix, a hardware run failed with a cleaner signature: + +```text +FROST_IRQ_ENTER epc=80388bba ra=80094556 sp=804c3dc0 cause=80000007 +FROST_IRQ_RETURN epc=80388bba ra=80094556 +swapper/0[0]: unhandled signal 4 code 0x1 at 0x80388bba +``` + +This showed a timer IRQ could be taken with `mepc` equal to the M-mode `mret` +instruction itself. + +Fixes/checks now present in `hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv`: + +- one-cycle `mret_taken_prev` recovery marker, +- interrupt latch inhibited/cleared during `i_mret_start || mret_taken_prev`, +- registered pending interrupts re-qualified against current CSR interrupt + eligibility, +- registered pending interrupt loses to MRET during the MRET recovery window, +- interrupt trap PC comes from `i_interrupt_pc`, not raw live ROB trap PC. + +These changes improved or changed the failure mode, but they did not finish the boot. +Latest hardware still reaches a user illegal instruction at `0x80388bba`. + +### 3. CSR privilege write theory + +Checked `hw/rtl/cpu_and_mem/cpu/csr/csr_file.sv`: plain `csrw mstatus` updates +`mstatus_mpp` and related fields but does not change current privilege `priv_q`. +`priv_q` changes on trap entry and actual `i_mret_taken`. + +So the tempting explanation "Linux writes `mstatus.MPP=U` before `mret`, therefore +trap_unit already thinks it is in U-mode" does not match the CSR implementation. + +### 4. UART drops + +The apparent UART output drops were caused by a stale capture process. The user +confirmed the drops disappeared after killing that process. Do not chase UART output +drops as an RTL issue unless a new independent symptom appears. 
+ +## Current RTL areas to read first + +`hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv` + +```systemverilog +logic [XLEN-1:0] interrupt_resume_pc; + +function automatic logic [XLEN-1:0] retired_next_pc( + input riscv_pkg::reorder_buffer_commit_t commit +); + logic [XLEN-1:0] step; + begin + step = commit.is_compressed ? {{(XLEN - 2){1'b0}}, 2'b10} : + {{(XLEN - 3){1'b0}}, 3'b100}; + if (commit.is_branch || commit.is_mret) begin + retired_next_pc = commit.redirect_pc; + end else begin + retired_next_pc = commit.pc + step; + end + end +endfunction + +always_ff @(posedge i_clk) begin + if (i_rst) begin + interrupt_resume_pc <= '0; + end else if (rob_commit_2_valid_raw) begin + interrupt_resume_pc <= retired_next_pc(rob_commit_comb_2); + end else if (rob_commit_valid_raw) begin + interrupt_resume_pc <= retired_next_pc(rob_commit_comb); + end +end +``` + +`trap_unit` saves this as `mepc` for interrupts: + +```systemverilog +o_trap_pc = i_interrupt_pc; +``` + +`hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.sv` should also be +read around MRET commit handling. It currently sets MRET commit `redirect_pc` from +`i_mepc`. + +## Current best hypothesis, not proven + +The next thing to prove or disprove is whether `interrupt_resume_pc` can be stale or +wrong around an MRET return to U-mode. + +Possible bad sequence: + +1. Linux returns to user through `ret_from_exception`. +2. MRET redirect targets the user PC, but `interrupt_resume_pc` is still or becomes + `0x80388bba`, the M-mode `mret` instruction address. +3. A machine timer interrupt becomes eligible just after return below M. This is legal: + machine interrupts can preempt U-mode even when `mstatus.MIE` is 0. +4. Trap entry saves `mepc = i_interrupt_pc = 0x80388bba`. +5. Linux later restores that trap frame and executes `mret` to `0x80388bba` as user + context. +6. U-mode executing MRET raises illegal instruction at `0x80388bba`. 
+ +This fits the latest user-visible failure, but it is still only a hypothesis. The +critical question is: what exact value is on `i_interrupt_pc` when the timer trap that +eventually leads to the `0x80388bba` signal is taken? + +## Recommended next step + +Do a directed simulation before any more hardware rebuilds. + +Add or extend a small app, likely `sw/apps/umode_test/main.c` or a new +`sw/apps/mret_timer_resume_test/main.c`, to exercise: + +1. M-mode sets `mtvec` to a handler that records `mcause`, `mepc`, `mstatus`, and a + small progress marker. +2. M-mode sets up a U-mode return target label in `mepc`. +3. M-mode sets `mstatus.MPP=U` and enables the machine timer interrupt in `mie`. +4. Arrange a timer pending condition before MRET and/or immediately after MRET. +5. Execute MRET. +6. In the trap handler, assert: + - `mcause == 0x80000007`, + - previous privilege was U, + - saved `mepc` is the U-mode target or U-mode fallthrough, + - saved `mepc` is never the M-mode MRET instruction PC. + +Add temporary cocotb visibility/assertions around: + +- `mret_start`, +- `mret_taken`, +- `mret_taken_reg`, +- `csr_mepc`, +- `csr_priv`, +- `rob_commit_comb.valid`, +- `rob_commit_comb.is_mret`, +- `rob_commit_comb.pc`, +- `rob_commit_comb.redirect_pc`, +- `rob_commit_2_*` equivalents, +- `interrupt_resume_pc`, +- `trap_taken`, +- `trap_pc_internal`. + +If the directed sim reproduces the bad `mepc`, the likely RTL fix is in +`cpu_ooo.sv`: seed or hold `interrupt_resume_pc` from the MRET target (`csr_mepc` / +MRET `redirect_pc`) across the MRET recovery window, and prevent MRET/invalid/old ROB +state from leaving it at the M-mode MRET PC. Do not apply that blindly; prove the +failure first. 
+ +If the directed sim does not reproduce, add better Linux restore-path instrumentation +before another bitstream: + +- print a compact `FROST_RET_RESTORE` line immediately before `csrw mstatus`, + `csrw mepc`, and `mret`, +- include `PT_EPC`, `PT_STATUS`, live `mstatus`, live `mepc`, and maybe `PT_RA`. + +The Linux tree already has temporary raw UART probes in +`arch/riscv/kernel/entry.S` for `FROST_IRQ_ENTER`, `FROST_IRQ_RETURN`, and +`FROST_BAD_RET`, but it does not currently print every normal restore state. + +## Tests that passed recently + +```sh +./tests/test_run_cocotb.py trap_unit +COCOTB_NUM_RUNS=1 ./tests/test_run_cocotb.py umode_test +env FROST_IRQ_PRECISION_CHECK=1 FROST_IRQ_LOW_RA_ASSERT=1 \ + FROST_EXTERNAL_IRQ_SYMBOL=irq_find_next_bit_exact_callee \ + FROST_EXTERNAL_IRQ_OFFSET=0x52 FROST_EXTERNAL_IRQ_MAX_PULSES=1 \ + FROST_IRQ_CALLEE_SYMBOL=irq_find_next_bit_exact_callee \ + FROST_IRQ_PRECISION_EVENT_LIMIT=16 COCOTB_NUM_RUNS=1 \ + ./tests/test_run_cocotb.py linux_irq_find_next_slot_test +COCOTB_NUM_RUNS=1 ./tests/test_run_cocotb.py wfi_mepc_test +python3 -m py_compile sw/apps/linux_boot/patch_ret_from_exception.py +make -C sw/apps/linux_boot +``` + +Earlier slot-store regressions that passed: + +```sh +env COCOTB_NUM_RUNS=1 ./tests/test_run_cocotb.py linux_irq_stack_slot_test +env COCOTB_NUM_RUNS=1 ./tests/test_run_cocotb.py linux_irq_active_ddr_test +env COCOTB_NUM_RUNS=1 ./tests/test_run_cocotb.py linux_irq_ddr_test +./tests/test_run_cocotb.py tomasulo_wrapper --testcase test_slot2_store_raw_commit_blocks_sq_committed_empty +./tests/test_run_cocotb.py tomasulo_wrapper --random-seed 1781982550 +``` + +## Current dirty files that matter + +At the time of this handoff, relevant intentional edits include: + +- `hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv` +- `hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv` +- `hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv` +- `sw/apps/linux_boot/Makefile` +- 
`sw/apps/linux_boot/patch_ret_from_exception.py` +- `tests/Makefile` +- `tests/test_run_cocotb.py` +- `verif/cocotb_tests/control/test_trap_unit.py` +- `verif/cocotb_tests/test_real_program.py` +- `verif/cocotb_tests/tomasulo/tomasulo_wrapper/test_tomasulo_wrapper.py` +- `verif/cocotb_tests/tomasulo/tomasulo_wrapper/tomasulo_interface.py` +- `sw/apps/linux_irq_stack_slot_test/` +- `sw/apps/linux_irq_active_ddr_test/` +- `sw/apps/linux_irq_ddr_test/` +- `sw/apps/linux_irq_find_next_slot_test/` +- `sw/apps/wfi_mepc_test/` + +There are also unrelated dirty/untracked files in the worktree. Inspect before +touching and do not clean the tree unless the user explicitly asks. + +## Timing caution + +The user reported that recent RTL instrumentation made post-opt timing worse, although +one later implementation recovered during placement and closed. If more synthesizable +instrumentation is needed, keep it narrow: a few registered values or counters, no wide +debug muxes on already bad paths. Prefer directed simulation and Linux UART probes +before adding more FPGA-visible RTL debug. + +## How to inspect the latest hardware log + +Useful command: + +```sh +rg -a -n "80388bba|80388bb|ret_from_exception|unhandled signal|Kernel panic|FROST_IRQ_(ENTER|RETURN)|FROST_BAD_RET|FROST_RET" \ + /tmp/genesys2_linux_boot_synchronized.log +``` + +Expected important lines include: + +```text +[ 0.847064] swapper/0[1]: unhandled signal 4 code 0x1 at 0x80388bba +[ 1.095342] Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000004 +[<80388bba>] ret_from_exception+0x76/0x7a +``` + +Some UART around the recursive failure is garbled. Treat garbled `FROST_IRQ_ENTER`-like +lines as hints only; rely on clean kernel lines and directed sim for proof. + +## Claude starting checklist + +1. Read this file fully. +2. Run `git status --short`. +3. Read `cpu_ooo.sv` around `interrupt_resume_pc` and `trap_unit` instantiation. +4. 
Read `trap_unit.sv` around interrupt registration, MRET inhibit, and `o_trap_pc`. +5. Read ROB MRET commit/redirect handling. +6. Build the directed MRET-to-U plus timer-pending sim. +7. Only after the sim proves or disproves the current hypothesis, decide whether to + patch RTL, add Linux restore instrumentation, or ask the user for another bitstream. diff --git a/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv b/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv index edd61149..fc10a670 100644 --- a/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv +++ b/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv @@ -111,6 +111,7 @@ module trap_unit #( input logic [XLEN-1:0] i_exception_cause, input logic [XLEN-1:0] i_exception_tval, input logic [XLEN-1:0] i_exception_pc, + input logic [XLEN-1:0] i_interrupt_pc, // MRET trap-return request input logic i_mret_start, @@ -143,13 +144,26 @@ module trap_unit #( assign mie_msie = i_mie[riscv_pkg::MieMsiBit]; // Register trap_taken for one cycle to prevent it from re-asserting immediately - // after CSR update (breaks combinational loop with mstatus_mie) + // after CSR update (breaks combinational loop with mstatus_mie). Also keep + // a one-cycle MRET recovery marker: CSR privilege/MIE state changes on the + // raw MRET pulse, while the OOO front/back-end flush is registered one cycle + // later. During that handoff, an old registered interrupt must not trap with + // mepc equal to the MRET instruction itself. logic trap_taken_prev; + logic mret_taken_prev; always_ff @(posedge i_clk) begin - if (i_rst) trap_taken_prev <= 1'b0; - else trap_taken_prev <= o_trap_taken; + if (i_rst) begin + trap_taken_prev <= 1'b0; + mret_taken_prev <= 1'b0; + end else begin + trap_taken_prev <= o_trap_taken; + mret_taken_prev <= o_mret_taken; + end end + logic mret_interrupt_inhibit; + assign mret_interrupt_inhibit = i_mret_start || mret_taken_prev; + // Interrupt pending and enabled (gate by !trap_taken_prev to prevent re-entry). 
// Global M-interrupt enable: mstatus.MIE while in M, but ALWAYS enabled while // running below M (priv != PrivM) so a machine timer/SW/ext interrupt can @@ -157,9 +171,12 @@ module trap_unit #( logic m_int_globally_enabled; assign m_int_globally_enabled = mstatus_mie || (i_priv != riscv_pkg::PrivM); logic meip_enabled, mtip_enabled, msip_enabled; - assign meip_enabled = i_interrupts.meip && mie_meie && m_int_globally_enabled && !trap_taken_prev; - assign mtip_enabled = i_interrupts.mtip && mie_mtie && m_int_globally_enabled && !trap_taken_prev; - assign msip_enabled = i_interrupts.msip && mie_msie && m_int_globally_enabled && !trap_taken_prev; + assign meip_enabled = i_interrupts.meip && mie_meie && m_int_globally_enabled && + !trap_taken_prev && !mret_interrupt_inhibit; + assign mtip_enabled = i_interrupts.mtip && mie_mtie && m_int_globally_enabled && + !trap_taken_prev && !mret_interrupt_inhibit; + assign msip_enabled = i_interrupts.msip && mie_msie && m_int_globally_enabled && + !trap_taken_prev && !mret_interrupt_inhibit; // TIMING OPTIMIZATION: Register interrupt_pending to break critical path. // The combinational path from msip -> interrupt_pending -> take_trap -> stall -> cache @@ -178,6 +195,7 @@ module trap_unit #( always_ff @(posedge i_clk) begin if (i_rst) interrupt_pending <= 1'b0; + else if (mret_interrupt_inhibit) interrupt_pending <= 1'b0; else interrupt_pending <= interrupt_pending_comb; end @@ -268,20 +286,44 @@ module trap_unit #( interrupt_cause <= interrupt_cause_comb; end + // A registered interrupt request must still be enabled when it reaches the + // trap decision. This keeps raw interrupt inputs out of the take_trap cone, + // while allowing CSR writes such as Linux's ret_from_exception mstatus + // restore to cancel a stale one-cycle interrupt sample before MRET. 
+ logic interrupt_latched_source_enabled; + always_comb begin + unique case (interrupt_cause) + riscv_pkg::IntMachineExternal: interrupt_latched_source_enabled = mie_meie; + riscv_pkg::IntMachineSoftware: interrupt_latched_source_enabled = mie_msie; + riscv_pkg::IntMachineTimer: interrupt_latched_source_enabled = mie_mtie; + default: interrupt_latched_source_enabled = 1'b0; + endcase + end + + logic interrupt_pending_eligible; + assign interrupt_pending_eligible = interrupt_pending && + interrupt_latched_source_enabled && + m_int_globally_enabled && + !trap_taken_prev && + !mret_interrupt_inhibit; + // Trap taken: either interrupt or exception, the pipeline not stalled // (except for WFI stall, which should be broken by interrupt), and no // committed store still draining (see i_sq_committed_empty). logic take_trap; - assign take_trap = (interrupt_pending || exception_pending) && !i_pipeline_stall && + assign take_trap = (interrupt_pending_eligible || exception_pending) && + !i_pipeline_stall && i_sq_committed_empty; - // MRET execution (trap has priority: if interrupt/exception fires same cycle, trap wins) + // MRET execution. Synchronous exceptions are structurally impossible with + // MRET at the ROB head; pending interrupts are deferred across the MRET + // recovery window above so the return redirect stays precise. logic take_mret; assign take_mret = i_mret_start && !i_pipeline_stall && !take_trap && i_sq_committed_empty; // Hold commit while a trap/MRET waits out the store drain, so the // committed set shrinks monotonically and the wait is bounded. 
- assign o_trap_drain_wait = (interrupt_pending || exception_pending || i_mret_start) && + assign o_trap_drain_wait = (interrupt_pending_eligible || exception_pending || i_mret_start) && !i_sq_committed_empty; // Output trap signals @@ -296,7 +338,7 @@ module trap_unit #( o_trap_target = i_mepc; end else if (take_trap) begin // Check mtvec mode - if (i_mtvec[1:0] == 2'b01 && interrupt_pending) begin + if (i_mtvec[1:0] == 2'b01 && interrupt_pending_eligible) begin // Vectored mode for interrupts: BASE + 4*cause_code // Use pre-computed small offset (6 bits) for faster timing than // extracting from full interrupt_cause which synthesis can't optimize @@ -313,11 +355,13 @@ module trap_unit #( // Trap entry information for CSR file // Interrupts have priority over synchronous exceptions always_comb begin - if (interrupt_pending) begin + if (interrupt_pending_eligible) begin o_trap_cause = interrupt_cause; o_trap_value = '0; // Interrupts have mtval = 0 - // For interrupts, save PC of next instruction (the one that will be interrupted) - o_trap_pc = i_exception_pc; + // For interrupts, save the precise architectural resume PC. The live + // ROB head PC can be transient or stale while an async interrupt drains + // through the registered commit path. + o_trap_pc = i_interrupt_pc; end else begin o_trap_cause = exception_cause_q; o_trap_value = exception_tval_q; @@ -342,9 +386,9 @@ module trap_unit #( assume (!(i_mret_start && i_exception_valid)); assume (!(i_wfi_start && i_mret_start)); assume (!(i_wfi_start && i_exception_valid)); - // Note: MRET + interrupt_pending is NOT assumed away. The RTL handles it - // by giving trap priority (!take_trap gate on take_mret), and the - // p_trap_mret_mutex assertion proves this without over-constraining. + // Note: MRET + interrupt_pending is NOT assumed away. MRET wins that race; + // the pending interrupt is re-sampled after the return redirect has had + // time to retire the MRET precisely. 
end always @(posedge i_clk) begin @@ -353,7 +397,8 @@ module trap_unit #( p_trap_mret_mutex : assert (!(o_trap_taken && o_mret_taken)); // Trap needs source: trap_taken requires interrupt or exception. - p_trap_needs_source : assert (!o_trap_taken || (interrupt_pending || exception_pending)); + p_trap_needs_source : assert (!o_trap_taken || (interrupt_pending_eligible || + exception_pending)); // Trap not during stall: traps only fire when pipeline not stalled. p_trap_not_stalled : assert (!o_trap_taken || !i_pipeline_stall); @@ -368,6 +413,11 @@ module trap_unit #( // MRET target is mepc: when MRET fires, target must be mepc. p_mret_target : assert (!o_mret_taken || (o_trap_target == i_mepc)); + // A pending interrupt must not preempt the MRET instruction itself. + if (i_mret_start && !exception_pending) begin + p_mret_defers_interrupt : assert (!o_trap_taken); + end + // WFI stall contract: if stall_for_wfi_comb, wfi must be active. p_wfi_stall_needs_active : assert (!stall_for_wfi_comb || wfi_active); end @@ -427,8 +477,8 @@ module trap_unit #( cover_wfi_stall : cover (stall_for_wfi_comb); cover_wfi_wakeup : cover (f_past_valid && !wfi_active && $past(wfi_active)); cover_external_interrupt : - cover (interrupt_pending && interrupt_cause == riscv_pkg::IntMachineExternal); - cover_exception : cover (o_trap_taken && i_exception_valid && !interrupt_pending); + cover (interrupt_pending_eligible && interrupt_cause == riscv_pkg::IntMachineExternal); + cover_exception : cover (o_trap_taken && i_exception_valid && !interrupt_pending_eligible); cover_trap_after_drain : cover (f_past_valid && o_trap_taken && $past(o_trap_drain_wait)); end end diff --git a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv index f7cec147..8ea18537 100644 --- a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv +++ b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv @@ -298,6 +298,39 @@ module cpu_ooo #( logic [riscv_pkg::ReorderBufferTagWidth-1:0] dbg_rat_alloc_rob_tag /* 
verilator public_flat_rd */; logic [XLEN-1:0] dbg_last_a0_alloc_pc /* verilator public_flat_rd */; logic [riscv_pkg::ReorderBufferTagWidth-1:0] dbg_last_a0_alloc_tag /* verilator public_flat_rd */; + logic dbg_trap_taken_raw /* verilator public_flat_rd */; + logic dbg_trap_taken_q /* verilator public_flat_rd */; + logic [XLEN-1:0] dbg_trap_cause_internal /* verilator public_flat_rd */; + logic [XLEN-1:0] dbg_trap_pc_internal /* verilator public_flat_rd */; + logic [XLEN-1:0] dbg_interrupt_resume_pc /* verilator public_flat_rd */; + logic dbg_port0_int_we /* verilator public_flat_rd */; + logic [riscv_pkg::RegAddrWidth-1:0] dbg_port0_int_addr /* verilator public_flat_rd */; + logic [XLEN-1:0] dbg_port0_int_data /* verilator public_flat_rd */; + logic dbg_port1_int_we /* verilator public_flat_rd */; + logic [riscv_pkg::RegAddrWidth-1:0] dbg_port1_int_addr /* verilator public_flat_rd */; + logic [XLEN-1:0] dbg_port1_int_data /* verilator public_flat_rd */; + logic dbg_commit_dest_valid /* verilator public_flat_rd */; + logic dbg_commit_dest_rf /* verilator public_flat_rd */; + logic [riscv_pkg::RegAddrWidth-1:0] dbg_commit_dest_reg /* verilator public_flat_rd */; + logic [XLEN-1:0] dbg_commit_value /* verilator public_flat_rd */; + logic dbg_commit_2_valid /* verilator public_flat_rd */; + logic [XLEN-1:0] dbg_commit_2_pc /* verilator public_flat_rd */; + logic dbg_commit_2_dest_valid /* verilator public_flat_rd */; + logic dbg_commit_2_dest_rf /* verilator public_flat_rd */; + logic [riscv_pkg::RegAddrWidth-1:0] dbg_commit_2_dest_reg /* verilator public_flat_rd */; + logic [XLEN-1:0] dbg_commit_2_value /* verilator public_flat_rd */; + logic dbg_rob_commit_reg_valid /* verilator public_flat_rd */; + logic [XLEN-1:0] dbg_rob_commit_reg_pc /* verilator public_flat_rd */; + logic dbg_rob_commit_reg_dest_valid /* verilator public_flat_rd */; + logic dbg_rob_commit_reg_dest_rf /* verilator public_flat_rd */; + logic [riscv_pkg::RegAddrWidth-1:0] 
dbg_rob_commit_reg_dest_reg /* verilator public_flat_rd */; + logic [XLEN-1:0] dbg_rob_commit_reg_value /* verilator public_flat_rd */; + logic dbg_rob_commit_2_reg_valid /* verilator public_flat_rd */; + logic [XLEN-1:0] dbg_rob_commit_2_reg_pc /* verilator public_flat_rd */; + logic dbg_rob_commit_2_reg_dest_valid /* verilator public_flat_rd */; + logic dbg_rob_commit_2_reg_dest_rf /* verilator public_flat_rd */; + logic [riscv_pkg::RegAddrWidth-1:0] dbg_rob_commit_2_reg_dest_reg /* verilator public_flat_rd */; + logic [XLEN-1:0] dbg_rob_commit_2_reg_value /* verilator public_flat_rd */; // verilog_lint: waive-stop line-length `endif @@ -636,6 +669,11 @@ module cpu_ooo #( logic rob_commit_2_store_like_raw; logic rob_commit_2_valid; assign rob_commit_2_valid = rob_commit_2.valid; + logic rob_commit_store_like_raw; + logic sq_committed_empty_for_trap; + assign rob_commit_store_like_raw = + rob_commit_valid_raw && + (rob_commit_comb.is_store || rob_commit_comb.is_fp_store || rob_commit_comb.is_sc); logic widen_commit_ok; assign widen_commit_ok = 1'b1; logic [riscv_pkg::ReorderBufferDepth-1:0] rob_entry_epoch; @@ -1639,6 +1677,42 @@ module cpu_ooo #( // The wrapper already provides a registered observation port for commit. 
assign rob_commit_valid = rob_commit.valid; +`ifndef SYNTHESIS + assign dbg_trap_taken_raw = trap_taken; + assign dbg_trap_taken_q = trap_taken_reg; + assign dbg_trap_cause_internal = trap_cause_internal; + assign dbg_trap_pc_internal = trap_pc_internal; + assign dbg_interrupt_resume_pc = interrupt_resume_pc; + assign dbg_port0_int_we = port0_int_we; + assign dbg_port0_int_addr = port0_int_addr; + assign dbg_port0_int_data = port0_int_data; + assign dbg_port1_int_we = port1_int_we; + assign dbg_port1_int_addr = port1_int_addr; + assign dbg_port1_int_data = port1_int_data; + assign dbg_commit_dest_valid = rob_commit_comb.dest_valid; + assign dbg_commit_dest_rf = rob_commit_comb.dest_rf; + assign dbg_commit_dest_reg = rob_commit_comb.dest_reg; + assign dbg_commit_value = rob_commit_comb.value[XLEN-1:0]; + assign dbg_commit_2_valid = rob_commit_comb_2.valid; + assign dbg_commit_2_pc = rob_commit_comb_2.pc; + assign dbg_commit_2_dest_valid = rob_commit_comb_2.dest_valid; + assign dbg_commit_2_dest_rf = rob_commit_comb_2.dest_rf; + assign dbg_commit_2_dest_reg = rob_commit_comb_2.dest_reg; + assign dbg_commit_2_value = rob_commit_comb_2.value[XLEN-1:0]; + assign dbg_rob_commit_reg_valid = rob_commit.valid; + assign dbg_rob_commit_reg_pc = rob_commit.pc; + assign dbg_rob_commit_reg_dest_valid = rob_commit.dest_valid; + assign dbg_rob_commit_reg_dest_rf = rob_commit.dest_rf; + assign dbg_rob_commit_reg_dest_reg = rob_commit.dest_reg; + assign dbg_rob_commit_reg_value = rob_commit.value[XLEN-1:0]; + assign dbg_rob_commit_2_reg_valid = rob_commit_2.valid; + assign dbg_rob_commit_2_reg_pc = rob_commit_2.pc; + assign dbg_rob_commit_2_reg_dest_valid = rob_commit_2.dest_valid; + assign dbg_rob_commit_2_reg_dest_rf = rob_commit_2.dest_rf; + assign dbg_rob_commit_2_reg_dest_reg = rob_commit_2.dest_reg; + assign dbg_rob_commit_2_reg_value = rob_commit_2.value[XLEN-1:0]; +`endif + // DEBUG: verify early recovery redirect_pc matches commit-time redirect_pc // (Disabled for 
performance — re-enable for debugging.) // always @(posedge i_clk) begin @@ -1902,7 +1976,7 @@ module cpu_ooo #( .i_interrupts(i_interrupts), .i_mtime(i_mtime), .i_trap_taken(trap_taken), - .i_trap_pc(rob_trap_pc), + .i_trap_pc(trap_pc_internal), // mcause from trap_unit's arbitrated cause: interrupt cause (with the // interrupt bit) for interrupts, or the remapped exception cause (which // carries the U-mode ECALL remap via trap_unit.i_exception_cause below). @@ -1953,6 +2027,56 @@ module cpu_ooo #( logic [XLEN-1:0] trap_target_internal, trap_pc_internal; logic [XLEN-1:0] trap_value_internal; + logic [XLEN-1:0] interrupt_resume_pc; + + function automatic logic [XLEN-1:0] retired_next_pc( + input riscv_pkg::reorder_buffer_commit_t commit + ); + logic [XLEN-1:0] step; + begin + step = commit.is_compressed ? {{(XLEN - 2){1'b0}}, 2'b10} : + {{(XLEN - 3){1'b0}}, 3'b100}; + if (commit.is_branch || commit.is_mret) begin + retired_next_pc = commit.redirect_pc; + end else begin + retired_next_pc = commit.pc + step; + end + end + endfunction + + always_ff @(posedge i_clk) begin + if (i_rst) begin + interrupt_resume_pc <= '0; + end else if (mret_taken) begin + // An MRET retires through the trap/MRET full flush, NOT the normal commit + // path: the cycle after o_mret_taken, flush_all (from mret_taken_reg) + // wipes the ROB head and gates commit_en, so the MRET never appears on + // rob_commit_valid_raw and never updates interrupt_resume_pc via the + // branches below. Without this seed, interrupt_resume_pc keeps the + // architectural next-PC of the instruction *before* the MRET -- which is + // the MRET instruction's own PC -- for the entire MRET-to-U window (until + // the first post-MRET instruction commits). 
A machine interrupt taken + // after privilege drops below M (eligible once the trap_unit inhibit + // lifts, ~2 cycles later, long before that first commit) would then save + // mepc = the MRET PC, an M-mode handler address, which Linux later restores + // and MRETs to illegally in U-mode (the ret_from_exception 0x80388bba + // panic). Seed the resume PC from the MRET target (mepc, == the MRET + // redirect target) now so it is already correct before the inhibit + // window closes. csr_mepc is stable here: MRET does not write mepc and + // cannot coincide with a trap entry that would. + interrupt_resume_pc <= csr_mepc; + end else if (rob_commit_2_valid_raw) begin + interrupt_resume_pc <= retired_next_pc(rob_commit_comb_2); + end else if (rob_commit_valid_raw) begin + interrupt_resume_pc <= retired_next_pc(rob_commit_comb); + end + end + + // A same-cycle store-like ROB commit is not yet in the SQ committed set. + // If a trap full-flushes here, the registered commit can be masked before + // SQ observes it. Delay trap/MRET one cycle so SQ can own and drain it.
+ assign sq_committed_empty_for_trap = + sq_committed_empty && !rob_commit_store_like_raw && !rob_commit_2_store_like_raw; trap_unit #( .XLEN(XLEN) @@ -1960,7 +2084,7 @@ module cpu_ooo #( .i_clk, .i_rst, .i_pipeline_stall(1'b0), // OOO: no stall for trap check - .i_sq_committed_empty(sq_committed_empty), + .i_sq_committed_empty(sq_committed_empty_for_trap), .o_trap_drain_wait(trap_drain_wait), .i_mstatus(csr_mstatus), .i_mie(csr_mie), @@ -1976,6 +2100,7 @@ module cpu_ooo #( }), .i_exception_tval('0), .i_exception_pc(rob_trap_pc), + .i_interrupt_pc(interrupt_resume_pc), .i_mret_start(mret_start), .i_wfi_start(1'b0), // WFI handled by ROB serialization .o_trap_taken(trap_taken), @@ -1987,6 +2112,37 @@ module cpu_ooo #( .o_stall_for_wfi() // WFI stall handled at ROB head ); +`ifndef SYNTHESIS + // FROST DEBUG (TEMP -- remove after the first-timer-IRQ ra-corruption hunt): + // (1) every trap taken -- does a commit write-port collide this cycle (codex + // #3: async interrupt + same-cycle commit corrupting x1/ra + saved PC)? + // (2/3) any commit that writes x1/ra a bogus LOW value (the 0xcc0 panic ra). + // (4) heartbeat every 500k cycles: is the CPU progressing (head_pc advancing) + // or stuck (e.g. a DDR load that never returns) before the first IRQ? 
+ int unsigned frost_cyc = 0; + always_ff @(posedge i_clk) begin + if (!i_rst) begin + frost_cyc <= frost_cyc + 1; + if ((frost_cyc % 500000) == 0) + $display("FROST_HB cyc=%0d head_pc=%08x trap_taken=%b cmt_vld=%b p0we=%b p0a=%0d", + frost_cyc, rob_trap_pc, trap_taken, rob_commit_valid, port0_int_we, + port0_int_addr); + if (trap_taken) + $display("FROST_DBG %0t TRAP cause=%08x csr_pc=%08x rob_pc=%08x resume_pc=%08x mepc=%08x | p0we=%b p0a=%0d p0d=%08x p1we=%b p1a=%0d p1d=%08x cmt_vld=%b", + $time, trap_cause_internal, trap_pc_internal, rob_trap_pc, + interrupt_resume_pc, csr_mepc, port0_int_we, port0_int_addr, + port0_int_data, port1_int_we, port1_int_addr, port1_int_data, + rob_commit_valid); + if (port0_int_we && (port0_int_addr == 5'd1) && (port0_int_data < 32'h0000_1000)) + $display("FROST_DBG %0t *** RA<-%08x PORT0 trap_taken=%b rob_trap_pc=%08x mepc=%08x", + $time, port0_int_data, trap_taken, rob_trap_pc, csr_mepc); + if (port1_int_we && (port1_int_addr == 5'd1) && (port1_int_data < 32'h0000_1000)) + $display("FROST_DBG %0t *** RA<-%08x PORT1 trap_taken=%b rob_trap_pc=%08x mepc=%08x", + $time, port1_int_data, trap_taken, rob_trap_pc, csr_mepc); + end + end +`endif + // Use the registered trap/mret pulses when driving the front-end flush so // flush_pipeline no longer rides on the combinational // rob_valid[head_idx] → commit_en → trap_unit → trap_taken diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/README.md b/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/README.md index 08aef01f..f860ad32 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/README.md +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/README.md @@ -52,6 +52,10 @@ Two things the cache intentionally *doesn't* do: so there's nothing speculative to throw away. Leaving cached lines hot across mispredict recovery roughly doubles the steady-state hit rate on CoreMark (36.5% → 72.4%). 
+- **No fill from a full-flush-cycle response.** Trap/MRET/FENCE.I full + flushes keep existing L0 lines hot, but a memory response that arrives + on the flush cycle is treated as a drained response for a killed load + and is not allowed to install a new L0 line. - **No same-cycle fill → lookup bypass.** Forwarding the in-flight fill into a same-cycle lookup dragged the back-end flush cone (`i_flush_en` → `accept_mem_response` → fill → bypass → hit → diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/load_queue.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/load_queue.sv index 1263ed49..5a1ce7bf 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/load_queue.sv +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/load_queue.sv @@ -525,6 +525,7 @@ module load_queue #( // Response acceptance/drain control logic flush_all_entries; logic issued_entry_flushed; + logic full_flush_response_drain; logic accept_mem_response; logic drop_mem_response_now; @@ -1133,16 +1134,21 @@ module load_queue #( // Data memory has fixed 1-cycle latency in this design. If a partial flush // kills the outstanding load, drop that next response explicitly so the slot - // can be safely reused before the stale data returns. + // can be safely reused before the stale data returns. A full flush clears all + // entries at the edge; a same-cycle response is therefore drained here rather + // than accepted, so it cannot complete a killed load or refill the persistent + // L0 cache from a flushed context. 
assign issued_entry_flushed = i_flush_en && mem_outstanding && lq_valid[issued_idx] && (flush_all_entries || is_younger( issued_rob_tag, i_flush_tag, i_rob_head_tag )); + assign full_flush_response_drain = i_flush_all && i_mem_read_valid && mem_outstanding; assign accept_mem_response = i_mem_read_valid && mem_outstanding && - !drop_mem_response_pending && !issued_entry_flushed && + !i_flush_all && !drop_mem_response_pending && !issued_entry_flushed && lq_valid[issued_idx]; assign drop_mem_response_now = i_mem_read_valid && - (drop_mem_response_pending || issued_entry_flushed || + (full_flush_response_drain || + drop_mem_response_pending || issued_entry_flushed || (mem_outstanding && !lq_valid[issued_idx])); // =========================================================================== @@ -2344,6 +2350,10 @@ module load_queue #( $warning("LQ: slot-2 alloc attempted when full_for_2 (and slot-1 firing)"); if (i_alloc_2.valid && !i_alloc.valid && full) $warning("LQ: slot-2 alloc attempted alone when full"); + if (i_flush_all && accept_mem_response) + $error("LQ: accepted memory response during full flush"); + if (i_flush_all && cache_fill_valid) + $error("LQ: filled L0 cache during full flush"); // Slot-1 and slot-2 must never target the same physical entry. if (slot1_alloc_en && slot2_alloc_en && (alloc_target[IdxWidth-1:0] == slot2_alloc_idx)) $error("LQ: slot-1 and slot-2 alloc collide on entry %0d", alloc_target[IdxWidth-1:0]); @@ -2539,6 +2549,15 @@ module load_queue #( end end + // Full-flush-cycle responses are drains only. They must not perform any + // architectural or persistent-cache side effect. 
+ always_comb begin + if (i_rst_n && i_flush_all) begin + p_no_accept_during_full_flush : assert (!accept_mem_response); + p_no_l0_fill_during_full_flush : assert (!cache_fill_valid); + end + end + // ------------------------------------------------------------------------- // Sequential assertions // ------------------------------------------------------------------------- diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/commit_bus/commit_bus_pipeline.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/commit_bus/commit_bus_pipeline.sv index 094e27aa..0b900402 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/commit_bus/commit_bus_pipeline.sv +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/commit_bus/commit_bus_pipeline.sv @@ -117,9 +117,13 @@ module commit_bus_pipeline ( commit_q_2_is_store_like <= commit_bus_2.is_store || commit_bus_2.is_fp_store; end - // Drive the output ports from the registered locals. + // Drive the output ports from the registered locals. The flops above clear + // valid on the flush edge, but consumers see the previous valid value during + // that same cycle. Mask the qualified valid outputs immediately so a + // commit that overlaps a trap/MRET/FENCE.I full flush cannot perform one + // more architectural side effect while the backend is being squashed. 
assign o_commit_bus_q = commit_bus_q; - assign o_commit_bus_q_valid = commit_bus_q_valid; + assign o_commit_bus_q_valid = commit_bus_q_valid && !i_flush_all; assign o_commit_q_dest_valid = commit_q_dest_valid; assign o_commit_q_dest_rf = commit_q_dest_rf; assign o_commit_q_dest_reg = commit_q_dest_reg; @@ -128,7 +132,7 @@ module commit_bus_pipeline ( assign o_commit_q_is_store_like = commit_q_is_store_like; assign o_commit_q_sc_failed = commit_q_sc_failed; assign o_commit_bus_2_q = commit_bus_2_q; - assign o_commit_bus_2_q_valid = commit_bus_2_q_valid; + assign o_commit_bus_2_q_valid = commit_bus_2_q_valid && !i_flush_all; assign o_commit_q_2_dest_valid = commit_q_2_dest_valid; assign o_commit_q_2_dest_rf = commit_q_2_dest_rf; assign o_commit_q_2_dest_reg = commit_q_2_dest_reg; diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv index 739e12dd..b1b15c4e 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv @@ -2731,12 +2731,12 @@ module tomasulo_wrapper #( .i_commit_valid_comb (commit_store_like_raw), .i_commit_rob_tag_comb(head_tag), - // Slot 2 is always older than any ordinary partial-flush boundary that - // can overlap commit_2_fire, and delayed recovery sees it through the - // registered commit path. Keep the raw head+1 ROB metadata cone out of - // the SQ valid flops. - .i_commit_valid_comb_2 (1'b0), - .i_commit_rob_tag_comb_2('0), + // Slot 2 has the same raw commit race as slot 1 for full-trap drains: + // commit_bus_2_q_valid is still one cycle away from SQ, so a timer IRQ + // must not observe committed-empty and full-flush the entry before SQ + // sees the registered commit. 
+ .i_commit_valid_comb_2 (commit_2_store_like_raw), + .i_commit_rob_tag_comb_2(commit_bus_2.tag), // Store-to-load forwarding (from LQ) .i_sq_check_valid (sq_check_valid), diff --git a/sw/apps/csr_rmw_test/Makefile b/sw/apps/csr_rmw_test/Makefile new file mode 100644 index 00000000..f57fce6d --- /dev/null +++ b/sw/apps/csr_rmw_test/Makefile @@ -0,0 +1,17 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Makefile for the CSR read-modify-write directed test +SRC_C := ../../lib/src/uart.c main.c +include ../../common/common.mk diff --git a/sw/apps/csr_rmw_test/main.c b/sw/apps/csr_rmw_test/main.c new file mode 100644 index 00000000..b1550440 --- /dev/null +++ b/sw/apps/csr_rmw_test/main.c @@ -0,0 +1,106 @@ +/* + * Copyright 2026 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Directed CSR read-modify-write test. 
+ * + * No-MMU M-mode Linux panics on the FIRST machine-timer interrupt with + * epc==ra==garbage (a `ret` through a clobbered return address). The kernel's + * trap entry swaps the thread pointer with `csrrw tp, mscratch, tp`, while the + * PASSING paths (umode_test, FreeRTOS) only ever use separate csrr/csrw -- so + * FROST's CSR read-modify-write instructions are an untested differentiator. + * + * This isolates whether csrrw/csrrs/csrrc correctly (a) return the OLD CSR + * value into rd AND (b) write the new value -- including the same-register swap + * idiom (`csrrw t0, mscratch, t0`) the kernel depends on. Self-checks over UART + * with <<PASS>> / <<FAIL>>. + */ + +#include <stdint.h> + +#include "uart.h" + +static int g_ok = 1; + +static void check(const char *name, uint32_t got, uint32_t want) +{ + int ok = (got == want); + if (!ok) + g_ok = 0; + uart_printf("%s %s: got=%08x want=%08x\n", ok ? "[PASS]" : "[FAIL]", name, got, want); +} + +static inline uint32_t rd_scratch(void) +{ + uint32_t v; + __asm__ volatile("csrr %0, mscratch" : "=r"(v)); + return v; +} + +static inline void wr_scratch(uint32_t v) +{ + __asm__ volatile("csrw mscratch, %0" : : "r"(v)); +} + +int main(void) +{ + uint32_t old, cur, swapped; + + uart_printf("\n=== CSR read-modify-write directed test ===\n"); + + /* csrrw: rd <- old(CSR); CSR <- rs1 */ + wr_scratch(0xAAAA1111u); + __asm__ volatile("li t0, 0xBBBB2222\n\tcsrrw %0, mscratch, t0" : "=r"(old) : : "t0"); + cur = rd_scratch(); + check("csrrw returns old", old, 0xAAAA1111u); + check("csrrw writes new", cur, 0xBBBB2222u); + + /* csrrs: rd <- old; CSR <- old | rs1 */ + wr_scratch(0xF0F0F0F0u); + __asm__ volatile("li t0, 0x0F0F0F0F\n\tcsrrs %0, mscratch, t0" : "=r"(old) : : "t0"); + cur = rd_scratch(); + check("csrrs returns old", old, 0xF0F0F0F0u); + check("csrrs sets bits", cur, 0xFFFFFFFFu); + + /* csrrc: rd <- old; CSR <- old & ~rs1 */ + wr_scratch(0xFFFFFFFFu); + __asm__ volatile("li t0, 0x0F0F0F0F\n\tcsrrc %0, mscratch, t0" : "=r"(old) : : "t0"); +
cur = rd_scratch(); + check("csrrc returns old", old, 0xFFFFFFFFu); + check("csrrc clears bits", cur, 0xF0F0F0F0u); + + /* csrrw with x0 destination must STILL write the CSR (== csrw). */ + wr_scratch(0x12345678u); + __asm__ volatile("li t0, 0x9ABCDEF0\n\tcsrrw x0, mscratch, t0" : : : "t0"); + cur = rd_scratch(); + check("csrrw x0-dest still writes", cur, 0x9ABCDEF0u); + + /* THE KERNEL PATTERN: `csrrw t0, mscratch, t0` (same reg as rd and rs1 = + * atomic swap). After: t0 <- old(CSR), CSR <- old(t0). */ + wr_scratch(0xCAFEBABEu); + __asm__ volatile("li t0, 0xDEADBEEF\n\tcsrrw t0, mscratch, t0\n\tmv %0, t0" + : "=r"(swapped) + : + : "t0"); + cur = rd_scratch(); + check("csrrw swap: reg<-old", swapped, 0xCAFEBABEu); + check("csrrw swap: CSR<-reg", cur, 0xDEADBEEFu); + + uart_printf(g_ok ? "\n<<PASS>>\n" : "\n<<FAIL>>\n"); + for (;;) { + } + return 0; +} diff --git a/sw/apps/linux_boot/Makefile b/sw/apps/linux_boot/Makefile index 171eecf1..8374d248 100644 --- a/sw/apps/linux_boot/Makefile +++ b/sw/apps/linux_boot/Makefile @@ -12,20 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Builds the cocotb memory images for the no-MMU Linux boot: a low-BRAM boot -# shim (sw.mem) plus the kernel Image + DTB in DDR (sw_ddr.mem). +# Builds the FROST memory images for the no-MMU Linux boot: a low-BRAM boot +# shim plus the kernel Image + DTB + initramfs in DDR. Emits both the cocotb +# $readmemh form (sw.mem / sw_ddr.mem) and the FPGA JTAG-loader form (sw.txt / +# sw_ddr.txt, dense one 32-bit word per line) consumed by fpga/load_software. # -# NOTE (bring-up): this pulls the kernel Image and DTB from the external -# linux-mvp tree via build_linux_mem.py. It is the integration scaffolding for -# first boot and will be replaced by an in-repo build once Linux boots. +# NOTE (bring-up): this pulls the kernel Image + initramfs and generates the +# board DTB from the external linux-mvp tree via build_fpga_boot.py.
The DTB's +# timebase / UART clock-frequency follow FPGA_CPU_CLK_FREQ (set by the FPGA +# loader per board: genesys2 = 133.33 MHz). Integration scaffolding for first +# boot; to be replaced by an in-repo build once Linux boots to a shell. ARTIFACTS ?= $(HOME)/bigger_l0/linux-mvp/frost-artifacts -all: sw.mem sw_ddr.mem +all: patch_linux_image -sw.mem sw_ddr.mem: - python3 $(ARTIFACTS)/build_linux_mem.py +.PHONY: patch_linux_image + +patch_linux_image: sw.mem sw.txt sw_ddr.mem sw_ddr.txt + python3 ./patch_ret_from_exception.py ./sw_ddr.mem ./sw_ddr.txt + +sw.mem sw.txt sw_ddr.mem sw_ddr.txt: + python3 $(ARTIFACTS)/build_fpga_boot.py cp -f $(ARTIFACTS)/sw.mem ./sw.mem + cp -f $(ARTIFACTS)/sw.txt ./sw.txt cp -f $(ARTIFACTS)/sw_ddr.mem ./sw_ddr.mem + cp -f $(ARTIFACTS)/sw_ddr.txt ./sw_ddr.txt clean: - rm -f sw.mem sw_ddr.mem shim.elf shim.bin + rm -f sw.mem sw.txt sw_ddr.mem sw_ddr.txt shim.elf shim.bin diff --git a/sw/apps/linux_boot/patch_ret_from_exception.py b/sw/apps/linux_boot/patch_ret_from_exception.py new file mode 100644 index 00000000..4200bcbe --- /dev/null +++ b/sw/apps/linux_boot/patch_ret_from_exception.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 + +"""Patch the temporary Linux bring-up image for the MRET restore window. + +The external linux-mvp tree currently builds a debug kernel whose +ret_from_exception sequence contains: + + lw a2, PT_EPC(sp) + sc.w zero, a2, (sp) + csrw mstatus, a0 + csrw mepc, a2 + ... + mret + +If the restored mstatus image has MIE set, the timer can preempt between the +CSR write and MRET. The trap then saves mepc at the MRET instruction itself, +which later returns into MRET as user code and produces SIGILL at +ret_from_exception+0x76. + +For bring-up, replace the non-essential reservation-clear SC with +`andi a0, a0, -9`, clearing MIE in the value written to mstatus. MRET still +restores the final interrupt-enable state from MPIE, but the restore window is +not interruptible. 
+""" + +from __future__ import annotations + +import argparse +from pathlib import Path + + +TARGET_WORD_INDEX = 0x00388B70 // 4 +OLD_WORD = "18c1202f" +NEW_WORD = "ff757513" + + +def patch_dense(path: Path) -> None: + lines = path.read_text().splitlines() + if TARGET_WORD_INDEX >= len(lines): + raise SystemExit(f"{path}: target word index 0x{TARGET_WORD_INDEX:x} is out of range") + old = lines[TARGET_WORD_INDEX].strip().lower() + if old == NEW_WORD: + return + if old != OLD_WORD: + raise SystemExit( + f"{path}: expected {OLD_WORD} at word 0x{TARGET_WORD_INDEX:x}, found {old}" + ) + lines[TARGET_WORD_INDEX] = NEW_WORD + path.write_text("\n".join(lines) + "\n") + + +def patch_mem(path: Path) -> None: + lines = path.read_text().splitlines() + word_index = 0 + for line_no, line in enumerate(lines): + stripped = line.strip() + if not stripped: + continue + if stripped.startswith("@"): + word_index = int(stripped[1:], 16) + continue + if word_index == TARGET_WORD_INDEX: + old = stripped.lower() + if old == NEW_WORD: + return + if old != OLD_WORD: + raise SystemExit( + f"{path}: expected {OLD_WORD} at word 0x{TARGET_WORD_INDEX:x}, found {old}" + ) + lines[line_no] = NEW_WORD + path.write_text("\n".join(lines) + "\n") + return + word_index += 1 + raise SystemExit(f"{path}: target word index 0x{TARGET_WORD_INDEX:x} not found") + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("sw_ddr_mem", type=Path) + parser.add_argument("sw_ddr_txt", type=Path) + args = parser.parse_args() + + patch_mem(args.sw_ddr_mem) + patch_dense(args.sw_ddr_txt) + print( + "Patched Linux ret_from_exception restore window: " + f"word 0x{TARGET_WORD_INDEX:x} {OLD_WORD}->{NEW_WORD}" + ) + + +if __name__ == "__main__": + main() diff --git a/sw/apps/linux_irq_active_ddr_test/Makefile b/sw/apps/linux_irq_active_ddr_test/Makefile new file mode 100644 index 00000000..a15ba7f2 --- /dev/null +++ b/sw/apps/linux_irq_active_ddr_test/Makefile @@ -0,0 +1,19 @@ +# Copyright 2026 
Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Linux-like active-code timer IRQ directed test. Force the whole program into +# cached DDR even when the generic cocotb runner is in its default BRAM tier. +override MEM_CONFIG := ddr +SRC_C := ../../lib/src/uart.c main.c +include ../../common/common.mk diff --git a/sw/apps/linux_irq_active_ddr_test/main.c b/sw/apps/linux_irq_active_ddr_test/main.c new file mode 100644 index 00000000..1d479aa4 --- /dev/null +++ b/sw/apps/linux_irq_active_ddr_test/main.c @@ -0,0 +1,576 @@ +/* + * Copyright 2026 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Linux-like active-code timer IRQ test, linked and executed from cached DDR. + * + * The no-MMU Linux hardware failure is an illegal-instruction panic with + * ra == epc == 0x00000cc0 after the first machine timer interrupt from idle. 
+ * This test keeps the loop much smaller than Linux while preserving the risky + * ingredients: DDR-resident code/data, an explicit DDR stack, WFI idle, + * active-code machine-timer IRQs, a Linux-style naked trap entry that + * saves/restores GPRs on the current stack, and the csrrw tp,mscratch,tp swap + * idiom. The active phase repeatedly creates a low-value temporary-register + * poison while nested call/return traffic is in flight; ra should remain a + * high DDR return address at every interrupt boundary. + */ + +#include <stdint.h> + +#include "csr.h" +#include "trap.h" +#include "uart.h" + +#define ARRAY_LEN(a) ((int) (sizeof(a) / sizeof((a)[0]))) +#define CLINT_MTIMECMP_LO (*(volatile uint32_t *) 0x40014000u) +#define CLINT_MTIMECMP_HI (*(volatile uint32_t *) 0x40014004u) +#define CLINT_MTIME_LO (*(volatile uint32_t *) 0x4001BFF8u) +#define CLINT_MTIME_HI (*(volatile uint32_t *) 0x4001BFFCu) +#define NORMAL_IRQ_COUNT 16u +#define POISON_IRQ_COUNT 16u +#define ACTIVE_IRQ_COUNT 8u +#define IRQ_COUNT (NORMAL_IRQ_COUNT + POISON_IRQ_COUNT + ACTIVE_IRQ_COUNT) +#define FRAME_WORDS 36u +#define DDR_STACK_SIZE 4096u + +struct linux_pt_regs { + uint32_t epc; + uint32_t ra; + uint32_t sp; + uint32_t gp; + uint32_t tp; + uint32_t t0; + uint32_t t1; + uint32_t t2; + uint32_t s0; + uint32_t s1; + uint32_t a0; + uint32_t a1; + uint32_t a2; + uint32_t a3; + uint32_t a4; + uint32_t a5; + uint32_t a6; + uint32_t a7; + uint32_t s2; + uint32_t s3; + uint32_t s4; + uint32_t s5; + uint32_t s6; + uint32_t s7; + uint32_t s8; + uint32_t s9; + uint32_t s10; + uint32_t s11; + uint32_t t3; + uint32_t t4; + uint32_t t5; + uint32_t t6; + uint32_t status; + uint32_t badaddr; + uint32_t cause; + uint32_t orig_a0; +}; + +struct fake_current { + uint32_t kernel_sp; + uint32_t user_sp; + uint32_t marker; +}; + +volatile uint32_t g_expected_mepc; +volatile uint32_t g_expected_ra; +volatile uint32_t g_expected_sp; +volatile uint32_t g_expected_tp; +volatile uint32_t g_exact_frame_check; +volatile
struct fake_current g_fake_current = {0u, 0u, 0x5441534Bu}; +volatile uint32_t g_ticks; +volatile uint32_t g_fail_code; +volatile uint32_t g_fail_seen; +volatile uint32_t g_bad_cause; +volatile uint32_t g_bad_epc; +volatile uint32_t g_bad_ra; +volatile uint32_t g_last_mepc; +volatile uint32_t g_last_ra; +volatile uint32_t g_last_sp; +volatile uint32_t g_last_tp; +volatile uint32_t g_last_mscratch_in_handler; +volatile uint32_t g_context_checksum; +volatile uint32_t g_context_words[64]; +volatile uint32_t g_frame_snapshots[IRQ_COUNT][FRAME_WORDS]; + +static uint8_t g_ddr_stack[DDR_STACK_SIZE] __attribute__((aligned(16))); + +static inline uint32_t read_tp(void) +{ + uint32_t value; + __asm__ volatile("mv %0, tp" : "=r"(value)); + return value; +} + +static inline void write_tp(uint32_t value) +{ + __asm__ volatile("mv tp, %0" : : "r"(value) : "memory"); +} + +static void record_failure(uint32_t code) +{ + if (!g_fail_seen) { + g_fail_seen = 1; + g_fail_code = code; + g_bad_cause = csr_read(mcause); + } +} + +static void fill_context(void) +{ + for (int i = 0; i < ARRAY_LEN(g_context_words); i++) { + g_context_words[i] = 0x80000000u ^ ((uint32_t) i * 0x10204081u); + } + g_context_checksum = 0x13579BDFu; +} + +static uint32_t churn_context(uint32_t seed) +{ + uint32_t acc = seed ^ g_context_checksum; + + for (int i = 0; i < ARRAY_LEN(g_context_words); i++) { + uint32_t value = g_context_words[i]; + acc ^= value + ((uint32_t) i << 16); + acc = (acc << 5) | (acc >> 27); + g_context_words[i] = value ^ acc ^ (0x9E3779B9u + (uint32_t) i); + } + + g_context_checksum = acc; + return acc; +} + +static uint64_t clint_rdmtime(void) +{ + uint32_t hi; + uint32_t lo; + uint32_t hi2; + + do { + hi = CLINT_MTIME_HI; + lo = CLINT_MTIME_LO; + hi2 = CLINT_MTIME_HI; + } while (hi != hi2); + + return ((uint64_t) hi << 32) | lo; +} + +static void clint_set_timer_cmp(uint64_t cmp) +{ + CLINT_MTIMECMP_HI = 0xFFFFFFFFu; + CLINT_MTIMECMP_LO = (uint32_t) cmp; + CLINT_MTIMECMP_HI = (uint32_t) 
(cmp >> 32); +} + +static void clint_ack_timer(void) +{ + CLINT_MTIMECMP_HI = 0xFFFFFFFFu; + CLINT_MTIMECMP_LO = 0xFFFFFFFFu; +} + +__attribute__((noinline)) static uint32_t active_poison_window(uint32_t value) +{ + uint32_t out; + + __asm__ volatile( + "lui t5, 0x1\n" + "addi t5, t5, -832\n" + "xor %[out], %[in], t5\n" + "addi %[out], %[out], 37\n" + : [out] "=&r"(out) + : [in] "r"(value) + : "t5", "memory"); + + return out; +} + +__attribute__((noinline)) static uint32_t active_leaf(uint32_t seed) +{ + volatile uint32_t local[12]; + uint32_t acc = seed ^ g_context_checksum; + + for (uint32_t i = 0; i < ARRAY_LEN(local); i++) { + local[i] = active_poison_window(acc + i); + acc ^= local[i] + (i << 8); + } + + return active_poison_window(acc); +} + +__attribute__((noinline)) static uint32_t active_mid3(uint32_t seed) +{ + return active_leaf(seed + 0x11111111u) ^ active_leaf(seed + 0x22222222u); +} + +__attribute__((noinline)) static uint32_t active_mid2(uint32_t seed) +{ + uint32_t a = active_mid3(seed ^ 0x33333333u); + uint32_t b = active_poison_window(seed ^ a); + + return active_mid3(b) ^ a; +} + +__attribute__((noinline)) static uint32_t active_mid1(uint32_t seed) +{ + return active_mid2(seed + 0x44444444u) ^ active_poison_window(seed); +} + +__attribute__((noinline)) static uint32_t active_until_irq(uint32_t iter) +{ + uint32_t before = g_ticks; + uint32_t acc = iter ^ 0xA5A50000u; + uint32_t guard = 0; + + write_tp((uint32_t) &g_fake_current); + csr_write(mscratch, 0u); + g_exact_frame_check = 0u; + clint_set_timer_cmp(clint_rdmtime() + 700u + (iter & 63u)); + enable_interrupts(); + + while (g_ticks == before && !g_fail_seen) { + acc ^= active_mid1(acc + guard); + guard++; + if (guard > 20000u) { + record_failure(19u); + break; + } + } + + disable_interrupts(); + + if (g_ticks != before + 1u) { + record_failure(20u); + } + if (read_tp() != (uint32_t) &g_fake_current) { + record_failure(21u); + } + if (csr_read(mscratch) != 0u) { + record_failure(22u); + } + + 
return churn_context(acc ^ g_ticks); +} + +__attribute__((noinline, used)) void linux_like_irq_c(struct linux_pt_regs *frame) +{ + uint32_t tick = g_ticks; + + g_last_mepc = frame->epc; + g_last_ra = frame->ra; + g_last_sp = frame->sp; + g_last_tp = frame->tp; + g_last_mscratch_in_handler = csr_read(mscratch); + + if (tick < IRQ_COUNT) { + for (uint32_t i = 0; i < FRAME_WORDS; i++) { + g_frame_snapshots[tick][i] = ((volatile uint32_t *) frame)[i]; + } + } + + if (frame->cause != (MCAUSE_INTERRUPT_BIT | INT_MTI)) { + g_bad_epc = frame->epc; + g_bad_ra = frame->ra; + record_failure(1u); + uart_printf("FAIL code=%u ticks=%u cause=%08x mepc=%08x ra=%08x\n", + g_fail_code, g_ticks, frame->cause, frame->epc, frame->ra); + uart_printf("<<FAIL>>\n"); + for (;;) { + } + } + if (g_exact_frame_check) { + if (frame->epc != g_expected_mepc) { + record_failure(2u); + } + if (frame->ra != g_expected_ra) { + record_failure(3u); + } + if (frame->ra < 0x80000000u || frame->ra == 0x00000CC0u || frame->ra < 0x00001000u) { + record_failure(14u); + } + if (frame->sp != g_expected_sp) { + record_failure(4u); + } + if (frame->tp != g_expected_tp) { + record_failure(5u); + } + } else { + if (frame->epc < 0x80000000u || frame->epc == 0x00000CC0u) { + record_failure(15u); + } + if (frame->ra < 0x80000000u || frame->ra == 0x00000CC0u) { + record_failure(16u); + } + if (frame->sp < 0x80000000u) { + record_failure(17u); + } + if (frame->tp != (uint32_t) &g_fake_current) { + record_failure(18u); + } + } + if (g_last_mscratch_in_handler != 0u) { + record_failure(6u); + } + + churn_context(frame->epc ^ frame->ra ^ tick); + + clint_ack_timer(); + g_ticks = tick + 1u; +} + +__attribute__((naked, aligned(4))) static void linux_like_irq_entry(void) +{ + __asm__ volatile( + "csrrw tp, mscratch, tp\n" + "bnez tp, 1f\n" + "csrr tp, mscratch\n" + "1:\n" + "addi sp, sp, -144\n" + "sw ra, 4(sp)\n" + "sw gp, 12(sp)\n" + "sw t0, 20(sp)\n" + "sw t1, 24(sp)\n" + "sw t2, 28(sp)\n" + "sw s0, 32(sp)\n" + "sw s1,
36(sp)\n" + "sw a0, 40(sp)\n" + "sw a1, 44(sp)\n" + "sw a2, 48(sp)\n" + "sw a3, 52(sp)\n" + "sw a4, 56(sp)\n" + "sw a5, 60(sp)\n" + "sw a6, 64(sp)\n" + "sw a7, 68(sp)\n" + "sw s2, 72(sp)\n" + "sw s3, 76(sp)\n" + "sw s4, 80(sp)\n" + "sw s5, 84(sp)\n" + "sw s6, 88(sp)\n" + "sw s7, 92(sp)\n" + "sw s8, 96(sp)\n" + "sw s9, 100(sp)\n" + "sw s10, 104(sp)\n" + "sw s11, 108(sp)\n" + "sw t3, 112(sp)\n" + "sw t4, 116(sp)\n" + "sw t5, 120(sp)\n" + "sw t6, 124(sp)\n" + "sw a0, 140(sp)\n" + "addi t0, sp, 144\n" + "sw t0, 8(sp)\n" + "csrr t0, mepc\n" + "sw t0, 0(sp)\n" + "csrr t0, mstatus\n" + "sw t0, 128(sp)\n" + "csrr t0, mtval\n" + "sw t0, 132(sp)\n" + "csrr t0, mcause\n" + "sw t0, 136(sp)\n" + "csrr t0, mscratch\n" + "sw t0, 16(sp)\n" + "csrw mscratch, x0\n" + "mv a0, sp\n" + "call linux_like_irq_c\n" + "lw a0, 128(sp)\n" + "lw a2, 0(sp)\n" + "sc.w x0, a2, 0(sp)\n" + "csrw mstatus, a0\n" + "csrw mepc, a2\n" + "lw ra, 4(sp)\n" + "lw gp, 12(sp)\n" + "lw tp, 16(sp)\n" + "lw t0, 20(sp)\n" + "lw t1, 24(sp)\n" + "lw t2, 28(sp)\n" + "lw s0, 32(sp)\n" + "lw s1, 36(sp)\n" + "lw a0, 40(sp)\n" + "lw a1, 44(sp)\n" + "lw a2, 48(sp)\n" + "lw a3, 52(sp)\n" + "lw a4, 56(sp)\n" + "lw a5, 60(sp)\n" + "lw a6, 64(sp)\n" + "lw a7, 68(sp)\n" + "lw s2, 72(sp)\n" + "lw s3, 76(sp)\n" + "lw s4, 80(sp)\n" + "lw s5, 84(sp)\n" + "lw s6, 88(sp)\n" + "lw s7, 92(sp)\n" + "lw s8, 96(sp)\n" + "lw s9, 100(sp)\n" + "lw s10, 104(sp)\n" + "lw s11, 108(sp)\n" + "lw t3, 112(sp)\n" + "lw t4, 116(sp)\n" + "lw t5, 120(sp)\n" + "lw t6, 124(sp)\n" + "lw sp, 8(sp)\n" + "mret\n"); +} + +__attribute__((noinline)) static uint32_t idle_once(uint32_t iter) +{ + uint32_t before = g_ticks; + + write_tp((uint32_t) &g_fake_current); + csr_write(mscratch, 0u); + g_exact_frame_check = 1u; + clint_set_timer_cmp(clint_rdmtime() + 300u + (iter & 31u)); + enable_interrupts(); + + __asm__ volatile( + "mv t2, ra\n" + "mv t3, sp\n" + "mv t4, tp\n" + "la t0, 1f\n" + "la t1, g_expected_mepc\n" + "sw t0, 0(t1)\n" + "la t1, g_expected_ra\n" + 
"sw t2, 0(t1)\n" + "la t1, g_expected_sp\n" + "sw t3, 0(t1)\n" + "la t1, g_expected_tp\n" + "sw t4, 0(t1)\n" + "wfi\n" + "1:\n" + : + : + : "t0", "t1", "t2", "t3", "t4", "memory"); + + disable_interrupts(); + + if (g_ticks != before + 1u) { + record_failure(8u); + } + if (read_tp() != (uint32_t) &g_fake_current) { + record_failure(9u); + } + if (csr_read(mscratch) != 0u) { + record_failure(10u); + } + + return churn_context(iter ^ g_ticks); +} + +__attribute__((noinline)) static uint32_t idle_then_poison_ra_once(uint32_t iter) +{ + uint32_t before = g_ticks; + + write_tp((uint32_t) &g_fake_current); + csr_write(mscratch, 0u); + g_exact_frame_check = 1u; + clint_set_timer_cmp(clint_rdmtime() + 300u + (iter & 31u)); + enable_interrupts(); + + __asm__ volatile( + "mv t2, ra\n" + "mv t3, sp\n" + "mv t4, tp\n" + "la t0, 1f\n" + "la t1, g_expected_mepc\n" + "sw t0, 0(t1)\n" + "la t1, g_expected_ra\n" + "sw t2, 0(t1)\n" + "la t1, g_expected_sp\n" + "sw t3, 0(t1)\n" + "la t1, g_expected_tp\n" + "sw t4, 0(t1)\n" + "wfi\n" + "1:\n" + "lui ra, 0x1\n" + "addi ra, ra, -832\n" + "mv ra, t2\n" + : + : + : "t0", "t1", "t2", "t3", "t4", "memory"); + + disable_interrupts(); + + if (g_ticks != before + 1u) { + record_failure(11u); + } + if (read_tp() != (uint32_t) &g_fake_current) { + record_failure(12u); + } + if (csr_read(mscratch) != 0u) { + record_failure(13u); + } + + return churn_context(0xCC0u ^ iter ^ g_ticks); +} + +__attribute__((noreturn, noinline, used)) void main_on_ddr_stack(void) +{ + uint32_t aggregate = 0x2468ACE0u; + + uart_printf("\n=== Linux-like active DDR timer IRQ test ===\n"); + fill_context(); + g_fake_current.kernel_sp = (uint32_t) &g_ddr_stack[DDR_STACK_SIZE]; + g_fake_current.user_sp = 0u; + set_trap_handler(&linux_like_irq_entry); + disable_interrupts(); + enable_timer_interrupt(); + + for (uint32_t i = 0; i < NORMAL_IRQ_COUNT; i++) { + aggregate ^= idle_once(i); + if (g_fail_seen) { + break; + } + } + for (uint32_t i = 0; i < POISON_IRQ_COUNT && 
!g_fail_seen; i++) { + aggregate ^= idle_then_poison_ra_once(i); + } + for (uint32_t i = 0; i < ACTIVE_IRQ_COUNT && !g_fail_seen; i++) { + aggregate ^= active_until_irq(i); + } + + disable_timer_interrupt(); + disable_interrupts(); + clint_ack_timer(); + + if (!g_fail_seen && g_ticks == IRQ_COUNT && aggregate != 0u) { + uart_printf("ticks=%u checksum=%08x last_mepc=%08x last_ra=%08x\n", + g_ticks, g_context_checksum, g_last_mepc, g_last_ra); + uart_printf("<<PASS>>\n"); + } else { + uart_printf("FAIL code=%u ticks=%u cause=%08x mepc=%08x ra=%08x sp=%08x tp=%08x mscratch=%08x\n", + g_fail_code, g_ticks, g_bad_cause, g_last_mepc, g_last_ra, + g_last_sp, g_last_tp, g_last_mscratch_in_handler); + uart_printf("<<FAIL>>\n"); + } + + for (;;) { + } +} + +int main(void) +{ + uint32_t stack_top = ((uint32_t) &g_ddr_stack[DDR_STACK_SIZE]) & ~0xFu; + + __asm__ volatile( + "mv sp, %0\n" + "j main_on_ddr_stack\n" + : + : "r"(stack_top) + : "memory"); + __builtin_unreachable(); +} diff --git a/sw/apps/linux_irq_ddr_test/Makefile b/sw/apps/linux_irq_ddr_test/Makefile new file mode 100644 index 00000000..765051ad --- /dev/null +++ b/sw/apps/linux_irq_ddr_test/Makefile @@ -0,0 +1,19 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Linux-like timer IRQ directed test. Force the whole program into cached DDR +# even when the generic cocotb runner is in its default BRAM tier.
+override MEM_CONFIG := ddr +SRC_C := ../../lib/src/uart.c main.c +include ../../common/common.mk diff --git a/sw/apps/linux_irq_ddr_test/main.c b/sw/apps/linux_irq_ddr_test/main.c new file mode 100644 index 00000000..19769f03 --- /dev/null +++ b/sw/apps/linux_irq_ddr_test/main.c @@ -0,0 +1,457 @@ +/* + * Copyright 2026 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Linux-like timer IRQ test, linked and executed from cached DDR. + * + * The no-MMU Linux hardware failure is an illegal-instruction panic with + * ra == epc == 0x00000cc0 after the first machine timer interrupt from idle. + * This test keeps the loop much smaller than Linux while preserving the risky + * ingredients: DDR-resident code/data, an explicit DDR stack, WFI idle, a + * machine-timer IRQ, a Linux-style naked trap entry that saves/restores GPRs on + * the current stack, and the csrrw tp,mscratch,tp swap idiom. 
+ */ + +#include <stdint.h> + +#include "csr.h" +#include "trap.h" +#include "uart.h" + +#define ARRAY_LEN(a) ((int) (sizeof(a) / sizeof((a)[0]))) +#define CLINT_MTIMECMP_LO (*(volatile uint32_t *) 0x40014000u) +#define CLINT_MTIMECMP_HI (*(volatile uint32_t *) 0x40014004u) +#define CLINT_MTIME_LO (*(volatile uint32_t *) 0x4001BFF8u) +#define CLINT_MTIME_HI (*(volatile uint32_t *) 0x4001BFFCu) +#define NORMAL_IRQ_COUNT 16u +#define POISON_IRQ_COUNT 16u +#define IRQ_COUNT (NORMAL_IRQ_COUNT + POISON_IRQ_COUNT) +#define FRAME_WORDS 36u +#define DDR_STACK_SIZE 4096u + +struct linux_pt_regs { + uint32_t epc; + uint32_t ra; + uint32_t sp; + uint32_t gp; + uint32_t tp; + uint32_t t0; + uint32_t t1; + uint32_t t2; + uint32_t s0; + uint32_t s1; + uint32_t a0; + uint32_t a1; + uint32_t a2; + uint32_t a3; + uint32_t a4; + uint32_t a5; + uint32_t a6; + uint32_t a7; + uint32_t s2; + uint32_t s3; + uint32_t s4; + uint32_t s5; + uint32_t s6; + uint32_t s7; + uint32_t s8; + uint32_t s9; + uint32_t s10; + uint32_t s11; + uint32_t t3; + uint32_t t4; + uint32_t t5; + uint32_t t6; + uint32_t status; + uint32_t badaddr; + uint32_t cause; + uint32_t orig_a0; +}; + +struct fake_current { + uint32_t kernel_sp; + uint32_t user_sp; + uint32_t marker; +}; + +volatile uint32_t g_expected_mepc; +volatile uint32_t g_expected_ra; +volatile uint32_t g_expected_sp; +volatile uint32_t g_expected_tp; +volatile struct fake_current g_fake_current = {0u, 0u, 0x5441534Bu}; +volatile uint32_t g_ticks; +volatile uint32_t g_fail_code; +volatile uint32_t g_fail_seen; +volatile uint32_t g_last_mepc; +volatile uint32_t g_last_ra; +volatile uint32_t g_last_sp; +volatile uint32_t g_last_tp; +volatile uint32_t g_last_mscratch_in_handler; +volatile uint32_t g_context_checksum; +volatile uint32_t g_context_words[64]; +volatile uint32_t g_frame_snapshots[IRQ_COUNT][FRAME_WORDS]; + +static uint8_t g_ddr_stack[DDR_STACK_SIZE] __attribute__((aligned(16))); + +static inline uint32_t read_tp(void) +{ + uint32_t value; + __asm__
volatile("mv %0, tp" : "=r"(value)); + return value; +} + +static inline void write_tp(uint32_t value) +{ + __asm__ volatile("mv tp, %0" : : "r"(value) : "memory"); +} + +static void record_failure(uint32_t code) +{ + if (!g_fail_seen) { + g_fail_seen = 1; + g_fail_code = code; + } +} + +static void fill_context(void) +{ + for (int i = 0; i < ARRAY_LEN(g_context_words); i++) { + g_context_words[i] = 0x80000000u ^ ((uint32_t) i * 0x10204081u); + } + g_context_checksum = 0x13579BDFu; +} + +static uint32_t churn_context(uint32_t seed) +{ + uint32_t acc = seed ^ g_context_checksum; + + for (int i = 0; i < ARRAY_LEN(g_context_words); i++) { + uint32_t value = g_context_words[i]; + acc ^= value + ((uint32_t) i << 16); + acc = (acc << 5) | (acc >> 27); + g_context_words[i] = value ^ acc ^ (0x9E3779B9u + (uint32_t) i); + } + + g_context_checksum = acc; + return acc; +} + +static uint64_t clint_rdmtime(void) +{ + uint32_t hi; + uint32_t lo; + uint32_t hi2; + + do { + hi = CLINT_MTIME_HI; + lo = CLINT_MTIME_LO; + hi2 = CLINT_MTIME_HI; + } while (hi != hi2); + + return ((uint64_t) hi << 32) | lo; +} + +static void clint_set_timer_cmp(uint64_t cmp) +{ + CLINT_MTIMECMP_HI = 0xFFFFFFFFu; + CLINT_MTIMECMP_LO = (uint32_t) cmp; + CLINT_MTIMECMP_HI = (uint32_t) (cmp >> 32); +} + +static void clint_ack_timer(void) +{ + CLINT_MTIMECMP_HI = 0xFFFFFFFFu; + CLINT_MTIMECMP_LO = 0xFFFFFFFFu; +} + +__attribute__((noinline, used)) void linux_like_irq_c(struct linux_pt_regs *frame) +{ + uint32_t tick = g_ticks; + + g_last_mepc = frame->epc; + g_last_ra = frame->ra; + g_last_sp = frame->sp; + g_last_tp = frame->tp; + g_last_mscratch_in_handler = csr_read(mscratch); + + if (tick < IRQ_COUNT) { + for (uint32_t i = 0; i < FRAME_WORDS; i++) { + g_frame_snapshots[tick][i] = ((volatile uint32_t *) frame)[i]; + } + } + + if (frame->cause != (MCAUSE_INTERRUPT_BIT | INT_MTI)) { + record_failure(1u); + } + if (frame->epc != g_expected_mepc) { + record_failure(2u); + } + if (frame->ra != g_expected_ra) { 
+ record_failure(3u); + } + if (frame->ra < 0x80000000u || frame->ra == 0x00000CC0u || frame->ra < 0x00001000u) { + record_failure(14u); + } + if (frame->sp != g_expected_sp) { + record_failure(4u); + } + if (frame->tp != g_expected_tp) { + record_failure(5u); + } + if (g_last_mscratch_in_handler != 0u) { + record_failure(6u); + } + + churn_context(frame->epc ^ frame->ra ^ tick); + + clint_ack_timer(); + g_ticks = tick + 1u; +} + +__attribute__((naked, aligned(4))) static void linux_like_irq_entry(void) +{ + __asm__ volatile( + "csrrw tp, mscratch, tp\n" + "bnez tp, 1f\n" + "csrr tp, mscratch\n" + "1:\n" + "addi sp, sp, -144\n" + "sw ra, 4(sp)\n" + "sw gp, 12(sp)\n" + "sw t0, 20(sp)\n" + "sw t1, 24(sp)\n" + "sw t2, 28(sp)\n" + "sw s0, 32(sp)\n" + "sw s1, 36(sp)\n" + "sw a0, 40(sp)\n" + "sw a1, 44(sp)\n" + "sw a2, 48(sp)\n" + "sw a3, 52(sp)\n" + "sw a4, 56(sp)\n" + "sw a5, 60(sp)\n" + "sw a6, 64(sp)\n" + "sw a7, 68(sp)\n" + "sw s2, 72(sp)\n" + "sw s3, 76(sp)\n" + "sw s4, 80(sp)\n" + "sw s5, 84(sp)\n" + "sw s6, 88(sp)\n" + "sw s7, 92(sp)\n" + "sw s8, 96(sp)\n" + "sw s9, 100(sp)\n" + "sw s10, 104(sp)\n" + "sw s11, 108(sp)\n" + "sw t3, 112(sp)\n" + "sw t4, 116(sp)\n" + "sw t5, 120(sp)\n" + "sw t6, 124(sp)\n" + "sw a0, 140(sp)\n" + "addi t0, sp, 144\n" + "sw t0, 8(sp)\n" + "csrr t0, mepc\n" + "sw t0, 0(sp)\n" + "csrr t0, mstatus\n" + "sw t0, 128(sp)\n" + "csrr t0, mtval\n" + "sw t0, 132(sp)\n" + "csrr t0, mcause\n" + "sw t0, 136(sp)\n" + "csrr t0, mscratch\n" + "sw t0, 16(sp)\n" + "csrw mscratch, x0\n" + "mv a0, sp\n" + "call linux_like_irq_c\n" + "lw a0, 128(sp)\n" + "lw a2, 0(sp)\n" + "sc.w x0, a2, 0(sp)\n" + "csrw mstatus, a0\n" + "csrw mepc, a2\n" + "lw ra, 4(sp)\n" + "lw gp, 12(sp)\n" + "lw tp, 16(sp)\n" + "lw t0, 20(sp)\n" + "lw t1, 24(sp)\n" + "lw t2, 28(sp)\n" + "lw s0, 32(sp)\n" + "lw s1, 36(sp)\n" + "lw a0, 40(sp)\n" + "lw a1, 44(sp)\n" + "lw a2, 48(sp)\n" + "lw a3, 52(sp)\n" + "lw a4, 56(sp)\n" + "lw a5, 60(sp)\n" + "lw a6, 64(sp)\n" + "lw a7, 68(sp)\n" + "lw 
s2, 72(sp)\n" + "lw s3, 76(sp)\n" + "lw s4, 80(sp)\n" + "lw s5, 84(sp)\n" + "lw s6, 88(sp)\n" + "lw s7, 92(sp)\n" + "lw s8, 96(sp)\n" + "lw s9, 100(sp)\n" + "lw s10, 104(sp)\n" + "lw s11, 108(sp)\n" + "lw t3, 112(sp)\n" + "lw t4, 116(sp)\n" + "lw t5, 120(sp)\n" + "lw t6, 124(sp)\n" + "lw sp, 8(sp)\n" + "mret\n"); +} + +__attribute__((noinline)) static uint32_t idle_once(uint32_t iter) +{ + uint32_t before = g_ticks; + + write_tp((uint32_t) &g_fake_current); + csr_write(mscratch, 0u); + clint_set_timer_cmp(clint_rdmtime() + 300u + (iter & 31u)); + enable_interrupts(); + + __asm__ volatile( + "mv t2, ra\n" + "mv t3, sp\n" + "mv t4, tp\n" + "la t0, 1f\n" + "la t1, g_expected_mepc\n" + "sw t0, 0(t1)\n" + "la t1, g_expected_ra\n" + "sw t2, 0(t1)\n" + "la t1, g_expected_sp\n" + "sw t3, 0(t1)\n" + "la t1, g_expected_tp\n" + "sw t4, 0(t1)\n" + "wfi\n" + "1:\n" + : + : + : "t0", "t1", "t2", "t3", "t4", "memory"); + + disable_interrupts(); + + if (g_ticks != before + 1u) { + record_failure(8u); + } + if (read_tp() != (uint32_t) &g_fake_current) { + record_failure(9u); + } + if (csr_read(mscratch) != 0u) { + record_failure(10u); + } + + return churn_context(iter ^ g_ticks); +} + +__attribute__((noinline)) static uint32_t idle_then_poison_ra_once(uint32_t iter) +{ + uint32_t before = g_ticks; + + write_tp((uint32_t) &g_fake_current); + csr_write(mscratch, 0u); + clint_set_timer_cmp(clint_rdmtime() + 300u + (iter & 31u)); + enable_interrupts(); + + __asm__ volatile( + "mv t2, ra\n" + "mv t3, sp\n" + "mv t4, tp\n" + "la t0, 1f\n" + "la t1, g_expected_mepc\n" + "sw t0, 0(t1)\n" + "la t1, g_expected_ra\n" + "sw t2, 0(t1)\n" + "la t1, g_expected_sp\n" + "sw t3, 0(t1)\n" + "la t1, g_expected_tp\n" + "sw t4, 0(t1)\n" + "wfi\n" + "1:\n" + "lui ra, 0x1\n" + "addi ra, ra, -832\n" + "mv ra, t2\n" + : + : + : "t0", "t1", "t2", "t3", "t4", "memory"); + + disable_interrupts(); + + if (g_ticks != before + 1u) { + record_failure(11u); + } + if (read_tp() != (uint32_t) &g_fake_current) { + 
record_failure(12u); + } + if (csr_read(mscratch) != 0u) { + record_failure(13u); + } + + return churn_context(0xCC0u ^ iter ^ g_ticks); +} + +__attribute__((noreturn, noinline, used)) void main_on_ddr_stack(void) +{ + uint32_t aggregate = 0x2468ACE0u; + + uart_printf("\n=== Linux-like DDR timer IRQ test ===\n"); + fill_context(); + g_fake_current.kernel_sp = (uint32_t) &g_ddr_stack[DDR_STACK_SIZE]; + g_fake_current.user_sp = 0u; + set_trap_handler(&linux_like_irq_entry); + disable_interrupts(); + enable_timer_interrupt(); + + for (uint32_t i = 0; i < NORMAL_IRQ_COUNT; i++) { + aggregate ^= idle_once(i); + if (g_fail_seen) { + break; + } + } + for (uint32_t i = 0; i < POISON_IRQ_COUNT && !g_fail_seen; i++) { + aggregate ^= idle_then_poison_ra_once(i); + } + + disable_timer_interrupt(); + disable_interrupts(); + clint_ack_timer(); + + if (!g_fail_seen && g_ticks == IRQ_COUNT && aggregate != 0u) { + uart_printf("ticks=%u checksum=%08x last_mepc=%08x last_ra=%08x\n", + g_ticks, g_context_checksum, g_last_mepc, g_last_ra); + uart_printf("<<PASS>>\n"); + } else { + uart_printf("FAIL code=%u ticks=%u mepc=%08x ra=%08x sp=%08x tp=%08x mscratch=%08x\n", + g_fail_code, g_ticks, g_last_mepc, g_last_ra, g_last_sp, + g_last_tp, g_last_mscratch_in_handler); + uart_printf("<<FAIL>>\n"); + } + + for (;;) { + } +} + +int main(void) +{ + uint32_t stack_top = ((uint32_t) &g_ddr_stack[DDR_STACK_SIZE]) & ~0xFu; + + __asm__ volatile( + "mv sp, %0\n" + "j main_on_ddr_stack\n" + : + : "r"(stack_top) + : "memory"); + __builtin_unreachable(); +} diff --git a/sw/apps/linux_irq_find_next_slot_test/Makefile b/sw/apps/linux_irq_find_next_slot_test/Makefile new file mode 100644 index 00000000..dd4526e5 --- /dev/null +++ b/sw/apps/linux_irq_find_next_slot_test/Makefile @@ -0,0 +1,19 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Directed Linux IRQ stack-slot repro. Force the whole program into cached DDR +# so the callee save slot exercises the same D-side path as the kernel stack. +override MEM_CONFIG := ddr +SRC_C := ../../lib/src/uart.c main.c +include ../../common/common.mk diff --git a/sw/apps/linux_irq_find_next_slot_test/main.c b/sw/apps/linux_irq_find_next_slot_test/main.c new file mode 100644 index 00000000..832cdf39 --- /dev/null +++ b/sw/apps/linux_irq_find_next_slot_test/main.c @@ -0,0 +1,907 @@ +/* + * Copyright 2026 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Directed repro for the Linux timer-IRQ failure where _find_next_bit() + * returned through ra == 0x00000cc0 after an IRQ. 
The test poisons the exact + * future callee save slot with 0xcc0, enters a callee whose prologue matches + * the Linux helper: + * + * addi sp, sp, -16 + * sw s0, 8(sp) + * sw ra, 12(sp) + * addi s0, sp, 16 + * + * The callee then loops in a Linux-shaped find-bit ctz/byte-test block while a + * timer phase sweep forces IRQs at many active-code retire boundaries. + */ + +#include <stdint.h> + +#include "csr.h" +#include "trap.h" +#include "uart.h" + +#define CLINT_MTIMECMP_LO (*(volatile uint32_t *) 0x40014000u) +#define CLINT_MTIMECMP_HI (*(volatile uint32_t *) 0x40014004u) +#define CLINT_MTIME_LO (*(volatile uint32_t *) 0x4001BFF8u) +#define CLINT_MTIME_HI (*(volatile uint32_t *) 0x4001BFFCu) +#define DDR_STACK_SIZE 4096u +#define FIND_SWEEP_ITERATIONS 64u +#define TOTAL_ITERATIONS FIND_SWEEP_ITERATIONS +#define FIND_BITMAP_WORDS 256u +#define POISON_RA 0x00000CC0u + +struct linux_pt_regs { + uint32_t epc; + uint32_t ra; + uint32_t sp; + uint32_t gp; + uint32_t tp; + uint32_t t0; + uint32_t t1; + uint32_t t2; + uint32_t s0; + uint32_t s1; + uint32_t a0; + uint32_t a1; + uint32_t a2; + uint32_t a3; + uint32_t a4; + uint32_t a5; + uint32_t a6; + uint32_t a7; + uint32_t s2; + uint32_t s3; + uint32_t s4; + uint32_t s5; + uint32_t s6; + uint32_t s7; + uint32_t s8; + uint32_t s9; + uint32_t s10; + uint32_t s11; + uint32_t t3; + uint32_t t4; + uint32_t t5; + uint32_t t6; + uint32_t status; + uint32_t badaddr; + uint32_t cause; + uint32_t orig_a0; +}; + +struct fake_current { + uint32_t kernel_sp; + uint32_t user_sp; + uint32_t marker; +}; + +volatile struct fake_current g_fake_current = {0u, 0u, 0x5354414Bu}; +volatile uint32_t g_ticks; +volatile uint32_t g_target_tick; +volatile uint32_t g_current_iter; +volatile uint32_t g_read_slot_in_handler; + +volatile uint32_t g_fail_seen; +volatile uint32_t g_fail_code; +volatile uint32_t g_bad_cause; +volatile uint32_t g_bad_epc; +volatile uint32_t g_bad_ra; + +volatile uint32_t g_expected_slot_addr; +volatile uint32_t
g_expected_caller_slot_addr; +volatile uint32_t g_expected_saved_ra; +volatile uint32_t g_poison_readback; +volatile uint32_t g_caller_poison_readback; +volatile uint32_t g_caller_sp; +volatile uint32_t g_callee_sp; +volatile uint32_t g_callee_ra_saved; +volatile uint32_t g_slot_during_irq; +volatile uint32_t g_slot_before_return; + +volatile uint32_t g_irq_in_callee; +volatile uint32_t g_last_mepc; +volatile uint32_t g_last_ra; +volatile uint32_t g_last_sp; +volatile uint32_t g_last_tp; +volatile uint32_t g_last_mscratch_in_handler; +volatile uint32_t g_last_slot_addr; +volatile uint32_t g_irq_in_ctz; +volatile uint32_t g_seen_ctz_irq; +volatile uint32_t g_seen_exact_ctz_irq; +volatile uint32_t g_find_result; +volatile uint32_t g_exact_result; +volatile uint32_t g_find_bitmap[FIND_BITMAP_WORDS] __attribute__((aligned(16))); + +static uint8_t g_ddr_stack[DDR_STACK_SIZE] __attribute__((aligned(16))); + +extern char irq_find_next_ctz_start[]; +extern char irq_find_next_ctz_end[]; +extern char irq_find_next_exact_ctz_start[]; +extern char irq_find_next_exact_ctz_end[]; + +__attribute__((naked, aligned(4), noinline, used)) void irq_find_next_bit_callee(void); +__attribute__((naked, aligned(4), noinline, used)) void irq_find_next_bit_exact_callee(void); +__attribute__((naked, aligned(4), noinline, used)) uint32_t run_find_next_call_window(void); +__attribute__((naked, aligned(4), noinline, used)) uint32_t run_find_next_exact_call_window(void); +__attribute__((noreturn, noinline, used)) void stack_slot_bad_return(uint32_t observed); +__attribute__((noreturn, noinline, used)) void stack_slot_timeout(uint32_t code); + +static inline uint32_t read_tp(void) +{ + uint32_t value; + + __asm__ volatile("mv %0, tp" : "=r"(value)); + return value; +} + +static inline void write_tp(uint32_t value) +{ + __asm__ volatile("mv tp, %0" : : "r"(value) : "memory"); +} + +static uint64_t clint_rdmtime(void) +{ + uint32_t hi; + uint32_t lo; + uint32_t hi2; + + do { + hi = CLINT_MTIME_HI; + 
lo = CLINT_MTIME_LO; + hi2 = CLINT_MTIME_HI; + } while (hi != hi2); + + return ((uint64_t) hi << 32) | lo; +} + +static void clint_set_timer_cmp(uint64_t cmp) +{ + CLINT_MTIMECMP_HI = 0xFFFFFFFFu; + CLINT_MTIMECMP_LO = (uint32_t) cmp; + CLINT_MTIMECMP_HI = (uint32_t) (cmp >> 32); +} + +static void clint_ack_timer(void) +{ + CLINT_MTIMECMP_HI = 0xFFFFFFFFu; + CLINT_MTIMECMP_LO = 0xFFFFFFFFu; +} + +static void record_failure(uint32_t code) +{ + if (!g_fail_seen) { + g_fail_seen = 1u; + g_fail_code = code; + g_bad_cause = csr_read(mcause); + g_bad_epc = csr_read(mepc); + g_bad_ra = g_last_ra; + } +} + +__attribute__((noreturn, noinline)) static void finish_fail(const char *tag) +{ + disable_timer_interrupt(); + disable_external_interrupt(); + disable_interrupts(); + clint_ack_timer(); + + uart_printf("FAIL %s code=%u iter=%u ticks=%u target=%u cause=%08x\n", + tag, g_fail_code, g_current_iter, g_ticks, g_target_tick, g_bad_cause); + uart_printf("pc epc=%08x ra=%08x sp=%08x tp=%08x mscratch=%08x\n", + g_last_mepc, g_last_ra, g_last_sp, g_last_tp, g_last_mscratch_in_handler); + uart_printf("slot addr=%08x irq_addr=%08x poison=%08x irq_slot=%08x before_ret=%08x\n", + g_expected_slot_addr, g_last_slot_addr, g_poison_readback, + g_slot_during_irq, g_slot_before_return); + uart_printf("caller_slot=%08x caller_poison=%08x caller_sp=%08x expected_ra=%08x\n", + g_expected_caller_slot_addr, g_caller_poison_readback, + g_caller_sp, g_expected_saved_ra); + uart_printf("callee_ra=%08x callee_sp=%08x bad_epc=%08x bad_ra=%08x\n", + g_callee_ra_saved, g_callee_sp, g_bad_epc, g_bad_ra); + uart_printf("<<FAIL>>\n"); + + for (;;) { + } +} + +__attribute__((noreturn, noinline, used)) void stack_slot_bad_return(uint32_t observed) +{ + g_slot_before_return = observed; + record_failure(30u); + finish_fail("bad_return_slot"); +} + +__attribute__((noreturn, noinline, used)) void stack_slot_timeout(uint32_t code) +{ + record_failure(code); + finish_fail("callee_timeout"); +} +
+__attribute__((noinline, used)) void linux_like_irq_c(struct linux_pt_regs *frame) +{ + uint32_t tick = g_ticks; + + g_last_mepc = frame->epc; + g_last_ra = frame->ra; + g_last_sp = frame->sp; + g_last_tp = frame->tp; + g_last_mscratch_in_handler = csr_read(mscratch); + + uint32_t cause_code = frame->cause & ~MCAUSE_INTERRUPT_BIT; + if ((frame->cause & MCAUSE_INTERRUPT_BIT) == 0u || + (cause_code != INT_MTI && cause_code != INT_MEI)) { + g_bad_epc = frame->epc; + g_bad_ra = frame->ra; + record_failure(1u); + finish_fail("unexpected_trap"); + } + + if (g_callee_sp != 0u && frame->sp == g_callee_sp) { + g_irq_in_callee = 1u; + g_last_slot_addr = frame->sp + 12u; + if (g_expected_saved_ra != 0u && frame->ra != g_expected_saved_ra) { + record_failure(8u); + } + if (frame->epc >= (uint32_t) irq_find_next_ctz_start && + frame->epc < (uint32_t) irq_find_next_ctz_end) { + g_irq_in_ctz = 1u; + g_seen_ctz_irq = 1u; + } + if (frame->epc >= (uint32_t) irq_find_next_exact_ctz_start && + frame->epc < (uint32_t) irq_find_next_exact_ctz_end) { + g_seen_exact_ctz_irq = 1u; + } + if (g_read_slot_in_handler) { + g_slot_during_irq = *(volatile uint32_t *) (frame->sp + 12u); + if (g_slot_during_irq != g_expected_saved_ra) { + record_failure(6u); + } + } + } else if (g_caller_sp != 0u && frame->sp == g_caller_sp) { + g_last_slot_addr = frame->sp + 12u; + if (g_read_slot_in_handler) { + g_slot_during_irq = *(volatile uint32_t *) (frame->sp + 12u); + } + record_failure(9u); + } else { + record_failure(2u); + } + + if (frame->ra < 0x80000000u || frame->ra == POISON_RA) { + record_failure(3u); + } + if (frame->tp != (uint32_t) &g_fake_current) { + record_failure(4u); + } + if (g_last_mscratch_in_handler != 0u) { + record_failure(7u); + } + + clint_ack_timer(); + g_ticks = tick + 1u; +} + +__attribute__((naked, aligned(4))) static void linux_like_irq_entry(void) +{ + __asm__ volatile( + "csrrw tp, mscratch, tp\n" + "bnez tp, 1f\n" + "csrr tp, mscratch\n" + "1:\n" + "addi sp, sp, -144\n" + 
"sw ra, 4(sp)\n" + "sw gp, 12(sp)\n" + "sw t0, 20(sp)\n" + "sw t1, 24(sp)\n" + "sw t2, 28(sp)\n" + "sw s0, 32(sp)\n" + "sw s1, 36(sp)\n" + "sw a0, 40(sp)\n" + "sw a1, 44(sp)\n" + "sw a2, 48(sp)\n" + "sw a3, 52(sp)\n" + "sw a4, 56(sp)\n" + "sw a5, 60(sp)\n" + "sw a6, 64(sp)\n" + "sw a7, 68(sp)\n" + "sw s2, 72(sp)\n" + "sw s3, 76(sp)\n" + "sw s4, 80(sp)\n" + "sw s5, 84(sp)\n" + "sw s6, 88(sp)\n" + "sw s7, 92(sp)\n" + "sw s8, 96(sp)\n" + "sw s9, 100(sp)\n" + "sw s10, 104(sp)\n" + "sw s11, 108(sp)\n" + "sw t3, 112(sp)\n" + "sw t4, 116(sp)\n" + "sw t5, 120(sp)\n" + "sw t6, 124(sp)\n" + "sw a0, 140(sp)\n" + "addi t0, sp, 144\n" + "sw t0, 8(sp)\n" + "csrr t0, mepc\n" + "sw t0, 0(sp)\n" + "csrr t0, mstatus\n" + "sw t0, 128(sp)\n" + "csrr t0, mtval\n" + "sw t0, 132(sp)\n" + "csrr t0, mcause\n" + "sw t0, 136(sp)\n" + "csrr t0, mscratch\n" + "sw t0, 16(sp)\n" + "csrw mscratch, x0\n" + "mv a0, sp\n" + "call linux_like_irq_c\n" + "lw a0, 128(sp)\n" + "lw a2, 0(sp)\n" + "sc.w x0, a2, 0(sp)\n" + "csrw mstatus, a0\n" + "csrw mepc, a2\n" + "lw ra, 4(sp)\n" + "lw gp, 12(sp)\n" + "lw tp, 16(sp)\n" + "lw t0, 20(sp)\n" + "lw t1, 24(sp)\n" + "lw t2, 28(sp)\n" + "lw s0, 32(sp)\n" + "lw s1, 36(sp)\n" + "lw a0, 40(sp)\n" + "lw a1, 44(sp)\n" + "lw a2, 48(sp)\n" + "lw a3, 52(sp)\n" + "lw a4, 56(sp)\n" + "lw a5, 60(sp)\n" + "lw a6, 64(sp)\n" + "lw a7, 68(sp)\n" + "lw s2, 72(sp)\n" + "lw s3, 76(sp)\n" + "lw s4, 80(sp)\n" + "lw s5, 84(sp)\n" + "lw s6, 88(sp)\n" + "lw s7, 92(sp)\n" + "lw s8, 96(sp)\n" + "lw s9, 100(sp)\n" + "lw s10, 104(sp)\n" + "lw s11, 108(sp)\n" + "lw t3, 112(sp)\n" + "lw t4, 116(sp)\n" + "lw t5, 120(sp)\n" + "lw t6, 124(sp)\n" + "lw sp, 8(sp)\n" + "mret\n"); +} + +__attribute__((naked, aligned(4), noinline, used)) void irq_find_next_bit_callee(void) +{ + __asm__ volatile( + ".option push\n" + ".option rvc\n" + "addi sp, sp, -16\n" + "sw s0, 8(sp)\n" + "sw ra, 12(sp)\n" + "addi s0, sp, 16\n" + "bgeu a2, a1, 4f\n" + "srli a4, a2, 5\n" + "slli a5, a4, 2\n" + "add a0, a0, a5\n" 
+ "lw a5, 0(a0)\n" + "li a3, -1\n" + "sll a3, a3, a2\n" + "not a5, a5\n" + "and a5, a5, a3\n" + "bnez a5, 1f\n" + "li a5, 0x00010000\n" + "1:\n" + ".global irq_find_next_ctz_start\n" + "irq_find_next_ctz_start:\n" + "li t4, 200000\n" + "2:\n" + "li a5, 0x00010000\n" + "slli a2, a5, 16\n" + "srli a2, a2, 16\n" + "li a3, 0\n" + "bnez a2, 3f\n" + "srli a5, a5, 16\n" + "li a3, 16\n" + "3:\n" + "zext.b a2, a5\n" + "bnez a2, 5f\n" + "addi a3, a3, 8\n" + "srli a5, a5, 8\n" + "5:\n" + "andi a2, a5, 0xf\n" + "bnez a2, 6f\n" + "addi a3, a3, 4\n" + "srli a5, a5, 4\n" + "6:\n" + "andi a2, a5, 0x3\n" + "bnez a2, 7f\n" + "addi a3, a3, 2\n" + "srli a5, a5, 2\n" + "7:\n" + "andi a2, a5, 0x1\n" + "bnez a2, 8f\n" + "addi a3, a3, 1\n" + "8:\n" + "la t0, g_find_result\n" + "sw a3, 0(t0)\n" + "la t0, g_ticks\n" + "lw t1, 0(t0)\n" + "la t0, g_target_tick\n" + "lw t2, 0(t0)\n" + "beq t1, t2, 9f\n" + "la t0, g_fail_seen\n" + "lw t1, 0(t0)\n" + "bnez t1, 9f\n" + "addi t4, t4, -1\n" + "bnez t4, 2b\n" + "li a0, 31\n" + "lw s0, 8(sp)\n" + "addi sp, sp, 16\n" + "j stack_slot_timeout\n" + "4:\n" + "li a3, 0\n" + "9:\n" + ".global irq_find_next_ctz_end\n" + "irq_find_next_ctz_end:\n" + "lw ra, 12(sp)\n" + "la t0, g_slot_before_return\n" + "sw ra, 0(t0)\n" + "li t2, 0x80000000\n" + "bltu ra, t2, irq_find_next_bad_return\n" + "lw s0, 8(sp)\n" + "addi sp, sp, 16\n" + "ret\n" + "irq_find_next_bad_return:\n" + "mv a0, ra\n" + "lw s0, 8(sp)\n" + "addi sp, sp, 16\n" + "j stack_slot_bad_return\n" + ".option pop\n"); +} + +__attribute__((naked, aligned(4), noinline, used)) void irq_find_next_bit_exact_callee(void) +{ + __asm__ volatile( + ".option push\n" + ".option rvc\n" + "addi sp, sp, -16\n" + "sw s0, 8(sp)\n" + "sw ra, 12(sp)\n" + "addi s0, sp, 16\n" + "bgeu a2, a1, 4f\n" + "srli a5, a2, 5\n" + "slli a4, a5, 2\n" + "add a0, a0, a4\n" + "lw a3, 0(a0)\n" + "li a4, -1\n" + "sll a4, a4, a2\n" + "and a4, a4, a3\n" + "bnez a4, 1f\n" + "addi a5, a5, 1\n" + "slli a5, a5, 5\n" + "bgeu a5, a1, 4f\n" + "2:\n" 
+ "lw a4, 4(a0)\n" + "addi a0, a0, 4\n" + "bnez a4, 3f\n" + "addi a5, a5, 32\n" + "bltu a5, a1, 2b\n" + "4:\n" + "lw ra, 12(sp)\n" + "la t0, g_slot_before_return\n" + "sw ra, 0(t0)\n" + "li t2, 0x80000000\n" + "bltu ra, t2, irq_find_next_exact_bad_return\n" + "lw s0, 8(sp)\n" + "mv a0, a1\n" + "addi sp, sp, 16\n" + "ret\n" + "1:\n" + "slli a5, a5, 5\n" + "3:\n" + ".global irq_find_next_exact_ctz_start\n" + "irq_find_next_exact_ctz_start:\n" + "slli a2, a4, 16\n" + "srli a2, a2, 16\n" + "li a3, 0\n" + "bnez a2, 5f\n" + "srli a4, a4, 16\n" + "li a3, 16\n" + "5:\n" + "zext.b a2, a4\n" + "bnez a2, 6f\n" + "addi a3, a3, 8\n" + "srli a4, a4, 8\n" + "6:\n" + "andi a2, a4, 0xf\n" + "bnez a2, 7f\n" + "addi a3, a3, 4\n" + "srli a4, a4, 4\n" + "7:\n" + "andi a2, a4, 0x3\n" + "bnez a2, 8f\n" + "addi a3, a3, 2\n" + "srli a4, a4, 2\n" + "8:\n" + "andi a4, a4, 1\n" + "seqz a4, a4\n" + "add a3, a3, a4\n" + ".global irq_find_next_exact_ctz_end\n" + "irq_find_next_exact_ctz_end:\n" + "add a3, a3, a5\n" + "bgeu a3, a1, 4b\n" + "lw ra, 12(sp)\n" + "la t0, g_slot_before_return\n" + "sw ra, 0(t0)\n" + "li t2, 0x80000000\n" + "bltu ra, t2, irq_find_next_exact_bad_return\n" + "lw s0, 8(sp)\n" + "mv a0, a3\n" + "addi sp, sp, 16\n" + "ret\n" + "irq_find_next_exact_bad_return:\n" + "mv a0, ra\n" + "lw s0, 8(sp)\n" + "addi sp, sp, 16\n" + "j stack_slot_bad_return\n" + ".option pop\n"); +} + +__attribute__((naked, aligned(4), noinline, used)) uint32_t run_find_next_call_window(void) +{ + __asm__ volatile( + ".option push\n" + ".option rvc\n" + "addi sp, sp, -16\n" + "sw ra, 0(sp)\n" + "la t1, g_caller_sp\n" + "sw sp, 0(t1)\n" + "addi t0, sp, 12\n" + "la t1, g_expected_caller_slot_addr\n" + "sw t0, 0(t1)\n" + "li t2, 0x00000cc0\n" + "sw t2, 0(t0)\n" + "lw t3, 0(t0)\n" + "la t1, g_caller_poison_readback\n" + "sw t3, 0(t1)\n" + "addi t0, sp, -16\n" + "la t1, g_callee_sp\n" + "sw t0, 0(t1)\n" + "addi t0, t0, 12\n" + "la t1, g_expected_slot_addr\n" + "sw t0, 0(t1)\n" + "li t2, 0x00000cc0\n" + "sw 
t2, 0(t0)\n" + "lw t3, 0(t0)\n" + "la t1, g_poison_readback\n" + "sw t3, 0(t1)\n" + "la t1, 1f\n" + "la t0, g_expected_saved_ra\n" + "sw t1, 0(t0)\n" + "la t0, g_callee_ra_saved\n" + "sw t1, 0(t0)\n" + "la a0, g_find_bitmap\n" + "li a1, 8192\n" + "li a2, 16\n" + "call irq_find_next_bit_callee\n" + "1:\n" + "li a0, 1\n" + "lw ra, 0(sp)\n" + "addi sp, sp, 16\n" + "ret\n" + ".option pop\n"); +} + +__attribute__((naked, aligned(4), noinline, used)) uint32_t run_find_next_exact_call_window(void) +{ + __asm__ volatile( + ".option push\n" + ".option rvc\n" + "addi sp, sp, -16\n" + "sw ra, 0(sp)\n" + "la t1, g_caller_sp\n" + "sw sp, 0(t1)\n" + "addi t0, sp, 12\n" + "la t1, g_expected_caller_slot_addr\n" + "sw t0, 0(t1)\n" + "li t2, 0x00000cc0\n" + "sw t2, 0(t0)\n" + "lw t3, 0(t0)\n" + "la t1, g_caller_poison_readback\n" + "sw t3, 0(t1)\n" + "addi t0, sp, -16\n" + "la t1, g_callee_sp\n" + "sw t0, 0(t1)\n" + "addi t0, t0, 12\n" + "la t1, g_expected_slot_addr\n" + "sw t0, 0(t1)\n" + "li t2, 0x00000cc0\n" + "sw t2, 0(t0)\n" + "lw t3, 0(t0)\n" + "la t1, g_poison_readback\n" + "sw t3, 0(t1)\n" + "la t1, 1f\n" + "la t0, g_expected_saved_ra\n" + "sw t1, 0(t0)\n" + "la t0, g_callee_ra_saved\n" + "sw t1, 0(t0)\n" + "la a0, g_find_bitmap\n" + "li a1, 8192\n" + "li a2, 16\n" + "call irq_find_next_bit_exact_callee\n" + "1:\n" + "la t0, g_exact_result\n" + "sw a0, 0(t0)\n" + "li a0, 1\n" + "lw ra, 0(sp)\n" + "addi sp, sp, 16\n" + "ret\n" + ".option pop\n"); +} + +static void init_find_bitmap(void) +{ + for (uint32_t i = 0; i < FIND_BITMAP_WORDS; i++) { + g_find_bitmap[i] = 0xFFFFFFFFu; + } + g_find_bitmap[0] = 0xFFFEFFFFu; +} + +static void init_find_bitmap_exact(void) +{ + for (uint32_t i = 0; i < FIND_BITMAP_WORDS; i++) { + g_find_bitmap[i] = 0u; + } + g_find_bitmap[0] = 0x00010000u; +} + +static void prepare_exact_probe(void) +{ + disable_interrupts(); + disable_timer_interrupt(); + clint_ack_timer(); + + g_current_iter = 0xE0000000u; + g_target_tick = g_ticks; + 
g_read_slot_in_handler = 1u; + g_expected_slot_addr = 0u; + g_expected_caller_slot_addr = 0u; + g_expected_saved_ra = 0u; + g_poison_readback = 0u; + g_caller_poison_readback = 0u; + g_caller_sp = 0u; + g_callee_sp = 0u; + g_callee_ra_saved = 0u; + g_slot_during_irq = 0xFFFFFFFFu; + g_slot_before_return = 0u; + g_irq_in_callee = 0u; + g_last_slot_addr = 0u; + g_irq_in_ctz = 0u; + g_exact_result = 0xFFFFFFFFu; + + write_tp((uint32_t) &g_fake_current); + csr_write(mscratch, 0u); + enable_external_interrupt(); + enable_interrupts(); +} + +static uint32_t run_exact_probe(void) +{ + uint32_t checksum = 0x13579BDFu; + + init_find_bitmap_exact(); + g_seen_exact_ctz_irq = 0u; + + for (uint32_t i = 0; i < 128u && !g_fail_seen; i++) { + prepare_exact_probe(); + if (run_find_next_exact_call_window() != 1u) { + record_failure(70u); + } + disable_interrupts(); + clint_ack_timer(); + + if (g_exact_result != 16u) { + record_failure(71u); + } + if (g_callee_sp == 0u || g_expected_slot_addr != g_callee_sp + 12u) { + record_failure(72u); + } + if (g_poison_readback != POISON_RA) { + record_failure(73u); + } + if (g_caller_sp == 0u || g_expected_caller_slot_addr != g_caller_sp + 12u) { + record_failure(76u); + } + if (g_caller_poison_readback != POISON_RA) { + record_failure(77u); + } + if (g_slot_before_return != g_expected_saved_ra) { + record_failure(74u); + } + if (g_seen_exact_ctz_irq && g_slot_during_irq != g_expected_saved_ra) { + record_failure(75u); + } + + checksum ^= g_exact_result ^ g_slot_before_return ^ g_expected_slot_addr; + checksum ^= (i << 24) ^ g_ticks; + if (g_seen_exact_ctz_irq) { + break; + } + } + + disable_interrupts(); + disable_external_interrupt(); + clint_ack_timer(); + return checksum; +} + +static void prepare_window(uint32_t iter, uint32_t read_slot_in_handler) +{ + disable_interrupts(); + clint_ack_timer(); + + g_current_iter = iter; + g_target_tick = g_ticks + 1u; + g_read_slot_in_handler = read_slot_in_handler; + g_expected_slot_addr = 0u; + 
g_expected_caller_slot_addr = 0u; + g_expected_saved_ra = 0u; + g_poison_readback = 0u; + g_caller_poison_readback = 0u; + g_caller_sp = 0u; + g_callee_sp = 0u; + g_callee_ra_saved = 0u; + g_slot_during_irq = 0xFFFFFFFFu; + g_slot_before_return = 0u; + g_irq_in_callee = 0u; + g_last_slot_addr = 0u; + g_irq_in_ctz = 0u; + g_find_result = 0xFFFFFFFFu; + + write_tp((uint32_t) &g_fake_current); + csr_write(mscratch, 0u); + clint_set_timer_cmp(clint_rdmtime() + 3000u + ((iter * 211u) & 2047u)); + enable_interrupts(); +} + +static uint32_t run_one_window(uint32_t iter, uint32_t read_slot_in_handler) +{ + uint32_t returned; + uint32_t checksum; + + prepare_window(iter, read_slot_in_handler); + returned = run_find_next_call_window(); + disable_interrupts(); + clint_ack_timer(); + + if (returned != 1u) { + record_failure(40u); + } + if (g_ticks != g_target_tick) { + record_failure(41u); + } + if (!g_irq_in_callee) { + record_failure(42u); + } + if (!g_irq_in_ctz) { + record_failure(51u); + } + if (g_callee_sp == 0u || g_expected_slot_addr != g_callee_sp + 12u) { + record_failure(43u); + } + if (g_poison_readback != POISON_RA) { + record_failure(44u); + } + if (g_caller_sp == 0u || g_expected_caller_slot_addr != g_caller_sp + 12u) { + record_failure(53u); + } + if (g_caller_poison_readback != POISON_RA) { + record_failure(54u); + } + if (g_expected_saved_ra < 0x80000000u || g_expected_saved_ra == POISON_RA) { + record_failure(45u); + } + if (g_callee_ra_saved != g_expected_saved_ra) { + record_failure(46u); + } + if (g_slot_before_return != g_expected_saved_ra) { + record_failure(47u); + } + if (read_slot_in_handler && g_slot_during_irq != g_expected_saved_ra) { + record_failure(48u); + } + if (g_find_result != 16u) { + record_failure(52u); + } + if (read_tp() != (uint32_t) &g_fake_current) { + record_failure(49u); + } + if (csr_read(mscratch) != 0u) { + record_failure(50u); + } + + checksum = g_slot_before_return ^ g_expected_slot_addr ^ g_last_mepc; + checksum ^= 
(g_current_iter << 16) ^ g_ticks ^ (g_find_result << 8); + return checksum; +} + +__attribute__((noreturn, noinline, used)) void main_on_ddr_stack(void) +{ + uint32_t checksum = 0xA51C05E0u; + uint32_t sweep_start_ticks; + + uart_printf("\n=== Linux IRQ find-next-slot DDR test ===\n"); + + g_seen_ctz_irq = 0u; + g_fake_current.kernel_sp = (uint32_t) &g_ddr_stack[DDR_STACK_SIZE]; + g_fake_current.user_sp = 0u; + set_trap_handler(&linux_like_irq_entry); + disable_interrupts(); + clint_ack_timer(); + + checksum ^= run_exact_probe(); + if (g_fail_seen) { + finish_fail("exact_probe"); + } + + init_find_bitmap(); + sweep_start_ticks = g_ticks; + enable_timer_interrupt(); + enable_external_interrupt(); + + for (uint32_t i = 0; i < TOTAL_ITERATIONS && !g_fail_seen; i++) { + checksum ^= run_one_window(i, 1u); + } + + disable_timer_interrupt(); + disable_external_interrupt(); + disable_interrupts(); + clint_ack_timer(); + + if (g_fail_seen) { + finish_fail("post_check"); + } + + if (g_ticks == sweep_start_ticks + TOTAL_ITERATIONS && g_seen_ctz_irq && checksum != 0u) { + uart_printf("ticks=%u checksum=%08x last_mepc=%08x last_ra=%08x slot=%08x exact_irq=%u\n", + g_ticks, checksum, g_last_mepc, g_last_ra, g_slot_before_return, + g_seen_exact_ctz_irq); + uart_printf("<<PASS>>\n"); + } else { + record_failure(60u); + finish_fail("final_count"); + } + + for (;;) { + } +} + +int main(void) +{ + uint32_t stack_top = ((uint32_t) &g_ddr_stack[DDR_STACK_SIZE]) & ~0xFu; + + __asm__ volatile( + "mv sp, %0\n" + "j main_on_ddr_stack\n" + : + : "r"(stack_top) + : "memory"); + __builtin_unreachable(); +} diff --git a/sw/apps/linux_irq_stack_slot_test/Makefile b/sw/apps/linux_irq_stack_slot_test/Makefile new file mode 100644 index 00000000..dd4526e5 --- /dev/null +++ b/sw/apps/linux_irq_stack_slot_test/Makefile @@ -0,0 +1,19 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with
the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Directed Linux IRQ stack-slot repro. Force the whole program into cached DDR +# so the callee save slot exercises the same D-side path as the kernel stack. +override MEM_CONFIG := ddr +SRC_C := ../../lib/src/uart.c main.c +include ../../common/common.mk diff --git a/sw/apps/linux_irq_stack_slot_test/main.c b/sw/apps/linux_irq_stack_slot_test/main.c new file mode 100644 index 00000000..27f43b27 --- /dev/null +++ b/sw/apps/linux_irq_stack_slot_test/main.c @@ -0,0 +1,549 @@ +/* + * Copyright 2026 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Directed repro for the Linux timer-IRQ failure where _find_next_zero_bit() + * returned through ra == 0x00000cc0 after an IRQ. 
The test poisons the exact + * future callee save slot with 0xcc0, enters a callee whose prologue matches: + * + * addi sp, sp, -16 + * sw s0, 8(sp) + * sw ra, 12(sp) + * addi s0, sp, 16 + * + * It takes a Linux-like machine timer IRQ while the callee is active, then + * checks the later load from 12(sp) before using it as a return address. + */ + +#include <stdint.h> + +#include "csr.h" +#include "trap.h" +#include "uart.h" + +#define CLINT_MTIMECMP_LO (*(volatile uint32_t *) 0x40014000u) +#define CLINT_MTIMECMP_HI (*(volatile uint32_t *) 0x40014004u) +#define CLINT_MTIME_LO (*(volatile uint32_t *) 0x4001BFF8u) +#define CLINT_MTIME_HI (*(volatile uint32_t *) 0x4001BFFCu) +#define DDR_STACK_SIZE 4096u +#define NONINTRUSIVE_ITERATIONS 24u +#define INTRUSIVE_ITERATIONS 8u +#define TOTAL_ITERATIONS (NONINTRUSIVE_ITERATIONS + INTRUSIVE_ITERATIONS) +#define POISON_RA 0x00000CC0u + +struct linux_pt_regs { + uint32_t epc; + uint32_t ra; + uint32_t sp; + uint32_t gp; + uint32_t tp; + uint32_t t0; + uint32_t t1; + uint32_t t2; + uint32_t s0; + uint32_t s1; + uint32_t a0; + uint32_t a1; + uint32_t a2; + uint32_t a3; + uint32_t a4; + uint32_t a5; + uint32_t a6; + uint32_t a7; + uint32_t s2; + uint32_t s3; + uint32_t s4; + uint32_t s5; + uint32_t s6; + uint32_t s7; + uint32_t s8; + uint32_t s9; + uint32_t s10; + uint32_t s11; + uint32_t t3; + uint32_t t4; + uint32_t t5; + uint32_t t6; + uint32_t status; + uint32_t badaddr; + uint32_t cause; + uint32_t orig_a0; +}; + +struct fake_current { + uint32_t kernel_sp; + uint32_t user_sp; + uint32_t marker; +}; + +volatile struct fake_current g_fake_current = {0u, 0u, 0x5354414Bu}; +volatile uint32_t g_ticks; +volatile uint32_t g_target_tick; +volatile uint32_t g_current_iter; +volatile uint32_t g_read_slot_in_handler; + +volatile uint32_t g_fail_seen; +volatile uint32_t g_fail_code; +volatile uint32_t g_bad_cause; +volatile uint32_t g_bad_epc; +volatile uint32_t g_bad_ra; + +volatile uint32_t g_expected_slot_addr; +volatile uint32_t
g_expected_saved_ra; +volatile uint32_t g_poison_readback; +volatile uint32_t g_callee_sp; +volatile uint32_t g_callee_ra_saved; +volatile uint32_t g_slot_during_irq; +volatile uint32_t g_slot_before_return; + +volatile uint32_t g_irq_in_callee; +volatile uint32_t g_last_mepc; +volatile uint32_t g_last_ra; +volatile uint32_t g_last_sp; +volatile uint32_t g_last_tp; +volatile uint32_t g_last_mscratch_in_handler; +volatile uint32_t g_last_slot_addr; + +static uint8_t g_ddr_stack[DDR_STACK_SIZE] __attribute__((aligned(16))); + +__attribute__((naked, aligned(4), noinline, used)) void irq_stack_slot_callee(void); +__attribute__((naked, aligned(4), noinline, used)) uint32_t run_stack_slot_call_window(void); +__attribute__((noreturn, noinline, used)) void stack_slot_bad_return(uint32_t observed); +__attribute__((noreturn, noinline, used)) void stack_slot_timeout(uint32_t code); + +static inline uint32_t read_tp(void) +{ + uint32_t value; + + __asm__ volatile("mv %0, tp" : "=r"(value)); + return value; +} + +static inline void write_tp(uint32_t value) +{ + __asm__ volatile("mv tp, %0" : : "r"(value) : "memory"); +} + +static uint64_t clint_rdmtime(void) +{ + uint32_t hi; + uint32_t lo; + uint32_t hi2; + + do { + hi = CLINT_MTIME_HI; + lo = CLINT_MTIME_LO; + hi2 = CLINT_MTIME_HI; + } while (hi != hi2); + + return ((uint64_t) hi << 32) | lo; +} + +static void clint_set_timer_cmp(uint64_t cmp) +{ + CLINT_MTIMECMP_HI = 0xFFFFFFFFu; + CLINT_MTIMECMP_LO = (uint32_t) cmp; + CLINT_MTIMECMP_HI = (uint32_t) (cmp >> 32); +} + +static void clint_ack_timer(void) +{ + CLINT_MTIMECMP_HI = 0xFFFFFFFFu; + CLINT_MTIMECMP_LO = 0xFFFFFFFFu; +} + +static void record_failure(uint32_t code) +{ + if (!g_fail_seen) { + g_fail_seen = 1u; + g_fail_code = code; + g_bad_cause = csr_read(mcause); + g_bad_epc = csr_read(mepc); + g_bad_ra = g_last_ra; + } +} + +__attribute__((noreturn, noinline)) static void finish_fail(const char *tag) +{ + disable_timer_interrupt(); + disable_external_interrupt(); + 
disable_interrupts(); + clint_ack_timer(); + + uart_printf("FAIL %s code=%u iter=%u ticks=%u target=%u cause=%08x\n", + tag, g_fail_code, g_current_iter, g_ticks, g_target_tick, g_bad_cause); + uart_printf("pc epc=%08x ra=%08x sp=%08x tp=%08x mscratch=%08x\n", + g_last_mepc, g_last_ra, g_last_sp, g_last_tp, g_last_mscratch_in_handler); + uart_printf("slot addr=%08x irq_addr=%08x poison=%08x irq_slot=%08x before_ret=%08x\n", + g_expected_slot_addr, g_last_slot_addr, g_poison_readback, + g_slot_during_irq, g_slot_before_return); + uart_printf("expected_ra=%08x callee_ra=%08x callee_sp=%08x bad_epc=%08x bad_ra=%08x\n", + g_expected_saved_ra, g_callee_ra_saved, g_callee_sp, g_bad_epc, g_bad_ra); + uart_printf("<>\n"); + + for (;;) { + } +} + +__attribute__((noreturn, noinline, used)) void stack_slot_bad_return(uint32_t observed) +{ + g_slot_before_return = observed; + record_failure(30u); + finish_fail("bad_return_slot"); +} + +__attribute__((noreturn, noinline, used)) void stack_slot_timeout(uint32_t code) +{ + record_failure(code); + finish_fail("callee_timeout"); +} + +__attribute__((noinline, used)) void linux_like_irq_c(struct linux_pt_regs *frame) +{ + uint32_t tick = g_ticks; + + g_last_mepc = frame->epc; + g_last_ra = frame->ra; + g_last_sp = frame->sp; + g_last_tp = frame->tp; + g_last_mscratch_in_handler = csr_read(mscratch); + + uint32_t cause_code = frame->cause & ~MCAUSE_INTERRUPT_BIT; + if ((frame->cause & MCAUSE_INTERRUPT_BIT) == 0u || + (cause_code != INT_MTI && cause_code != INT_MEI)) { + g_bad_epc = frame->epc; + g_bad_ra = frame->ra; + record_failure(1u); + finish_fail("unexpected_trap"); + } + + if (g_callee_sp != 0u && frame->sp == g_callee_sp) { + g_irq_in_callee = 1u; + g_last_slot_addr = frame->sp + 12u; + if (g_read_slot_in_handler) { + g_slot_during_irq = *(volatile uint32_t *) (frame->sp + 12u); + } + } else { + record_failure(2u); + } + + if (frame->ra < 0x80000000u || frame->ra == POISON_RA) { + record_failure(3u); + } + if (frame->tp != 
(uint32_t) &g_fake_current) { + record_failure(4u); + } + if (g_last_mscratch_in_handler != 0u) { + record_failure(5u); + } + + clint_ack_timer(); + g_ticks = tick + 1u; +} + +__attribute__((naked, aligned(4))) static void linux_like_irq_entry(void) +{ + __asm__ volatile( + "csrrw tp, mscratch, tp\n" + "bnez tp, 1f\n" + "csrr tp, mscratch\n" + "1:\n" + "addi sp, sp, -144\n" + "sw ra, 4(sp)\n" + "sw gp, 12(sp)\n" + "sw t0, 20(sp)\n" + "sw t1, 24(sp)\n" + "sw t2, 28(sp)\n" + "sw s0, 32(sp)\n" + "sw s1, 36(sp)\n" + "sw a0, 40(sp)\n" + "sw a1, 44(sp)\n" + "sw a2, 48(sp)\n" + "sw a3, 52(sp)\n" + "sw a4, 56(sp)\n" + "sw a5, 60(sp)\n" + "sw a6, 64(sp)\n" + "sw a7, 68(sp)\n" + "sw s2, 72(sp)\n" + "sw s3, 76(sp)\n" + "sw s4, 80(sp)\n" + "sw s5, 84(sp)\n" + "sw s6, 88(sp)\n" + "sw s7, 92(sp)\n" + "sw s8, 96(sp)\n" + "sw s9, 100(sp)\n" + "sw s10, 104(sp)\n" + "sw s11, 108(sp)\n" + "sw t3, 112(sp)\n" + "sw t4, 116(sp)\n" + "sw t5, 120(sp)\n" + "sw t6, 124(sp)\n" + "sw a0, 140(sp)\n" + "addi t0, sp, 144\n" + "sw t0, 8(sp)\n" + "csrr t0, mepc\n" + "sw t0, 0(sp)\n" + "csrr t0, mstatus\n" + "sw t0, 128(sp)\n" + "csrr t0, mtval\n" + "sw t0, 132(sp)\n" + "csrr t0, mcause\n" + "sw t0, 136(sp)\n" + "csrr t0, mscratch\n" + "sw t0, 16(sp)\n" + "csrw mscratch, x0\n" + "mv a0, sp\n" + "call linux_like_irq_c\n" + "lw a0, 128(sp)\n" + "lw a2, 0(sp)\n" + "sc.w x0, a2, 0(sp)\n" + "csrw mstatus, a0\n" + "csrw mepc, a2\n" + "lw ra, 4(sp)\n" + "lw gp, 12(sp)\n" + "lw tp, 16(sp)\n" + "lw t0, 20(sp)\n" + "lw t1, 24(sp)\n" + "lw t2, 28(sp)\n" + "lw s0, 32(sp)\n" + "lw s1, 36(sp)\n" + "lw a0, 40(sp)\n" + "lw a1, 44(sp)\n" + "lw a2, 48(sp)\n" + "lw a3, 52(sp)\n" + "lw a4, 56(sp)\n" + "lw a5, 60(sp)\n" + "lw a6, 64(sp)\n" + "lw a7, 68(sp)\n" + "lw s2, 72(sp)\n" + "lw s3, 76(sp)\n" + "lw s4, 80(sp)\n" + "lw s5, 84(sp)\n" + "lw s6, 88(sp)\n" + "lw s7, 92(sp)\n" + "lw s8, 96(sp)\n" + "lw s9, 100(sp)\n" + "lw s10, 104(sp)\n" + "lw s11, 108(sp)\n" + "lw t3, 112(sp)\n" + "lw t4, 116(sp)\n" + "lw t5, 
120(sp)\n" + "lw t6, 124(sp)\n" + "lw sp, 8(sp)\n" + "mret\n"); +} + +__attribute__((naked, aligned(4), noinline, used)) void irq_stack_slot_callee(void) +{ + __asm__ volatile( + ".option push\n" + ".option rvc\n" + "addi sp, sp, -16\n" + "sw s0, 8(sp)\n" + "sw ra, 12(sp)\n" + "addi s0, sp, 16\n" + "la t0, g_callee_sp\n" + "sw sp, 0(t0)\n" + "la t0, g_callee_ra_saved\n" + "sw ra, 0(t0)\n" + "li t4, 200000\n" + "1:\n" + "la t0, g_ticks\n" + "lw t1, 0(t0)\n" + "la t0, g_target_tick\n" + "lw t2, 0(t0)\n" + "beq t1, t2, 3f\n" + "la t0, g_fail_seen\n" + "lw t1, 0(t0)\n" + "bnez t1, 3f\n" + "addi t4, t4, -1\n" + "bnez t4, 1b\n" + "li a0, 31\n" + "lw s0, 8(sp)\n" + "addi sp, sp, 16\n" + "j stack_slot_timeout\n" + "3:\n" + "lw ra, 12(sp)\n" + "la t0, g_slot_before_return\n" + "sw ra, 0(t0)\n" + "li t2, 0x80000000\n" + "bltu ra, t2, 2f\n" + "lw s0, 8(sp)\n" + "addi sp, sp, 16\n" + "ret\n" + "2:\n" + "mv a0, ra\n" + "lw s0, 8(sp)\n" + "addi sp, sp, 16\n" + "j stack_slot_bad_return\n" + ".option pop\n"); +} + +__attribute__((naked, aligned(4), noinline, used)) uint32_t run_stack_slot_call_window(void) +{ + __asm__ volatile( + ".option push\n" + ".option rvc\n" + "addi sp, sp, -16\n" + "sw ra, 0(sp)\n" + "addi t0, sp, -4\n" + "la t1, g_expected_slot_addr\n" + "sw t0, 0(t1)\n" + "li t2, 0x00000cc0\n" + "sw t2, 0(t0)\n" + "lw t3, 0(t0)\n" + "la t1, g_poison_readback\n" + "sw t3, 0(t1)\n" + "la t1, 1f\n" + "la t0, g_expected_saved_ra\n" + "sw t1, 0(t0)\n" + "call irq_stack_slot_callee\n" + "1:\n" + "li a0, 1\n" + "lw ra, 0(sp)\n" + "addi sp, sp, 16\n" + "ret\n" + ".option pop\n"); +} + +static void prepare_window(uint32_t iter, uint32_t read_slot_in_handler) +{ + disable_interrupts(); + clint_ack_timer(); + + g_current_iter = iter; + g_target_tick = g_ticks + 1u; + g_read_slot_in_handler = read_slot_in_handler; + g_expected_slot_addr = 0u; + g_expected_saved_ra = 0u; + g_poison_readback = 0u; + g_callee_sp = 0u; + g_callee_ra_saved = 0u; + g_slot_during_irq = 0xFFFFFFFFu; + 
g_slot_before_return = 0u; + g_irq_in_callee = 0u; + g_last_slot_addr = 0u; + + write_tp((uint32_t) &g_fake_current); + csr_write(mscratch, 0u); + clint_set_timer_cmp(clint_rdmtime() + 3000u + ((iter * 211u) & 1023u)); + enable_interrupts(); +} + +static uint32_t run_one_window(uint32_t iter, uint32_t read_slot_in_handler) +{ + uint32_t returned; + uint32_t checksum; + + prepare_window(iter, read_slot_in_handler); + returned = run_stack_slot_call_window(); + disable_interrupts(); + clint_ack_timer(); + + if (returned != 1u) { + record_failure(40u); + } + if (g_ticks != g_target_tick) { + record_failure(41u); + } + if (!g_irq_in_callee) { + record_failure(42u); + } + if (g_callee_sp == 0u || g_expected_slot_addr != g_callee_sp + 12u) { + record_failure(43u); + } + if (g_poison_readback != POISON_RA) { + record_failure(44u); + } + if (g_expected_saved_ra < 0x80000000u || g_expected_saved_ra == POISON_RA) { + record_failure(45u); + } + if (g_callee_ra_saved != g_expected_saved_ra) { + record_failure(46u); + } + if (g_slot_before_return != g_expected_saved_ra) { + record_failure(47u); + } + if (read_slot_in_handler && g_slot_during_irq != g_expected_saved_ra) { + record_failure(48u); + } + if (read_tp() != (uint32_t) &g_fake_current) { + record_failure(49u); + } + if (csr_read(mscratch) != 0u) { + record_failure(50u); + } + + checksum = g_slot_before_return ^ g_expected_slot_addr ^ g_last_mepc; + checksum ^= (g_current_iter << 16) ^ g_ticks ^ (read_slot_in_handler << 31); + return checksum; +} + +__attribute__((noreturn, noinline, used)) void main_on_ddr_stack(void) +{ + uint32_t checksum = 0xA51C05E0u; + + uart_printf("\n=== Linux IRQ stack-slot DDR test ===\n"); + + g_fake_current.kernel_sp = (uint32_t) &g_ddr_stack[DDR_STACK_SIZE]; + g_fake_current.user_sp = 0u; + set_trap_handler(&linux_like_irq_entry); + disable_interrupts(); + clint_ack_timer(); + enable_timer_interrupt(); + enable_external_interrupt(); + + for (uint32_t i = 0; i < NONINTRUSIVE_ITERATIONS && 
!g_fail_seen; i++) { + checksum ^= run_one_window(i, 0u); + } + for (uint32_t i = 0; i < INTRUSIVE_ITERATIONS && !g_fail_seen; i++) { + checksum ^= run_one_window(NONINTRUSIVE_ITERATIONS + i, 1u); + } + + disable_timer_interrupt(); + disable_external_interrupt(); + disable_interrupts(); + clint_ack_timer(); + + if (g_fail_seen) { + finish_fail("post_check"); + } + + if (g_ticks == TOTAL_ITERATIONS && checksum != 0u) { + uart_printf("ticks=%u checksum=%08x last_mepc=%08x last_ra=%08x slot=%08x\n", + g_ticks, checksum, g_last_mepc, g_last_ra, g_slot_before_return); + uart_printf("<>\n"); + } else { + record_failure(60u); + finish_fail("final_count"); + } + + for (;;) { + } +} + +int main(void) +{ + uint32_t stack_top = ((uint32_t) &g_ddr_stack[DDR_STACK_SIZE]) & ~0xFu; + + __asm__ volatile( + "mv sp, %0\n" + "j main_on_ddr_stack\n" + : + : "r"(stack_top) + : "memory"); + __builtin_unreachable(); +} diff --git a/sw/apps/mret_timer_resume_test/Makefile b/sw/apps/mret_timer_resume_test/Makefile new file mode 100644 index 00000000..bd2f906b --- /dev/null +++ b/sw/apps/mret_timer_resume_test/Makefile @@ -0,0 +1,17 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Makefile for the MRET-to-U-mode + pending-timer interrupt-resume-PC test +SRC_C := main.c +include ../../common/common.mk diff --git a/sw/apps/mret_timer_resume_test/main.c b/sw/apps/mret_timer_resume_test/main.c new file mode 100644 index 00000000..f07086a9 --- /dev/null +++ b/sw/apps/mret_timer_resume_test/main.c @@ -0,0 +1,193 @@ +/* + * Copyright 2026 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * MRET-to-U-mode + already-pending machine timer: interrupt-resume-PC (mepc) + * directed test. + * + * Reproduces the Linux no-MMU boot panic where a U-mode context illegally + * executes the kernel's M-mode MRET (ret_from_exception). Root cause under + * test: when an MRET returns to U-mode it retires via the trap/MRET full + * flush, NOT via the normal commit path, so the core's `interrupt_resume_pc` + * register is never updated to the MRET target. It keeps holding the + * architectural next-PC of the instruction before the MRET -- i.e. the MRET + * instruction's own PC. The trap unit only inhibits interrupts for the two + * cycles around the MRET (i_mret_start, mret_taken_prev). If a machine timer + * is pending, it becomes eligible the moment privilege drops below M and is + * taken a few cycles later, BEFORE the first U-mode instruction commits and + * refreshes interrupt_resume_pc. The trap therefore saves + * mepc = interrupt_resume_pc = the MRET instruction's own PC.
+ * Linux later restores that trap frame and MRETs to the kernel MRET PC while + * in U-mode -> illegal instruction (signal 4) -> "Attempted to kill init". + * + * Test shape (mirrors umode_test's timer-preempts-U case, but with the timer + * ALREADY pending at MRET time so it fires in the vulnerable post-MRET + * window): + * + * 1. M-mode installs a naked handler at mtvec that records, for the FIRST + * trap only, mcause, mepc (the saved resume PC) and mstatus.MPP. + * 2. Make the machine timer permanently pending (mtimecmp = 0) while in + * M-mode with MIE=0 (so it cannot fire in M-mode). + * 3. MRET into a tiny U-mode spin (`u_spin: j .`). Machine interrupts are + * taken below M regardless of MIE, so the pending timer preempts U + * immediately. + * 4. The handler runs; we then assert the saved resume PC points at u_spin + * (the MRET target) and is NOT the MRET instruction's own PC. + * + * PASS: mcause == 0x8000_0007 (machine timer), trapped-from-priv == U, and + * mepc == &u_spin. + * FAIL (the bug): mepc == the MRET instruction's own PC, != &u_spin. + */ + +#include <stdint.h> + +#include "trap.h" + +/* ---- minimal UART (UART_TX is provided by mmio.h via trap.h) ---- */ +static void uart_putc(char c) +{ + UART_TX = (uint8_t) c; +} + +static void uart_puts(const char *s) +{ + while (*s) + uart_putc(*s++); +} + +static void uart_hex(uint32_t v) +{ + static const char hex[] = "0123456789ABCDEF"; + uart_puts("0x"); + for (int i = 28; i >= 0; i -= 4) + uart_putc(hex[(v >> i) & 0xF]); +} + +/* ---- trap state shared with the naked handler ---- */ +static volatile uint32_t g_cause; +static volatile uint32_t g_mepc; /* saved resume PC of the FIRST trap */ +static volatile uint32_t g_from_priv; /* mstatus.MPP at trap entry = prev priv */ + +/* + * Naked M-mode trap handler. For the first trap only, records mcause, mepc and + * the trapping privilege (mstatus.MPP).
Then pushes mtimecmp to max (acks the + * timer so it cannot refire), and returns to M-mode at the continuation + * address stashed in mscratch with MPP=M. Bouncing to a fixed continuation + * (rather than resuming U-mode) means clobbering temporaries here is safe. + */ +__attribute__((naked, aligned(4))) static void mret_timer_trap_handler(void) +{ + __asm__ volatile("csrr t0, mcause\n" + "lui t1, %hi(g_cause)\n" + "lw t2, %lo(g_cause)(t1)\n" + "li t3, -1\n" /* sentinel: only the FIRST trap records */ + "bne t2, t3, 2f\n" + "sw t0, %lo(g_cause)(t1)\n" + "csrr t0, mepc\n" /* saved resume PC of this trap */ + "lui t1, %hi(g_mepc)\n" + "sw t0, %lo(g_mepc)(t1)\n" + "csrr t0, mstatus\n" + "srli t0, t0, 11\n" + "andi t0, t0, 0x3\n" /* mstatus.MPP */ + "lui t1, %hi(g_from_priv)\n" + "sw t0, %lo(g_from_priv)(t1)\n" + "2:\n" + "li t1, 0x4000001C\n" /* MTIMECMP_HI: push compare to max to ack timer */ + "li t0, -1\n" + "sw t0, 0(t1)\n" + "csrr t0, mscratch\n" /* M-mode continuation set by run_in_umode */ + "csrw mepc, t0\n" + "li t0, 0x1800\n" /* MPP = M (0b11 << 11) */ + "csrs mstatus, t0\n" + "mret\n"); +} + +/* + * Enter U-mode at ufn with the machine timer ALREADY pending; the handler + * returns control to the instruction after the MRET. The MRET here is the + * instruction whose PC must NOT leak into the timer trap's mepc. + */ +static uint32_t run_in_umode_pending_timer(void (*ufn)(void)) +{ + g_cause = 0xFFFFFFFFu; + g_mepc = 0u; + g_from_priv = 0xFFFFFFFFu; + __asm__ volatile("la t0, 1f\n" + "csrw mscratch, t0\n" /* where the handler returns */ + "li t0, 0x1800\n" + "csrc mstatus, t0\n" /* MPP = U (00) */ + "csrw mepc, %0\n" + "mret\n" /* -> U-mode at ufn; pending timer preempts here */ + "1:\n" + : + : "r"(ufn) + : "t0", "t1", "t2", "t3", "memory"); /* t0-t3: all overwritten by mret_timer_trap_handler before it bounces to 1: */ + return g_cause; +} + +/* U-mode body: spin in place. naked so its first (and only) instruction is the + * jump, making the architectural resume PC of any preempting interrupt exactly + * &u_spin.
*/ +__attribute__((naked)) static void u_spin(void) +{ + __asm__ volatile("j ."); +} + +int main(void) +{ + uart_puts("\r\n=== MRET->U timer-resume mepc test ===\r\n"); + set_trap_handler(&mret_timer_trap_handler); + + /* Machine interrupts off in M (MIE=0), and MPIE=0 so U also runs with + * MIE=0. The machine timer still preempts U-mode (priv != M). */ + (void) disable_interrupts(); + csr_clear(mstatus, MSTATUS_MPIE); + enable_timer_interrupt(); /* mie.MTIE = 1 */ + + /* Make the machine timer permanently pending BEFORE the MRET-to-U so it + * preempts at the first eligible cycle after privilege drops to U -- the + * window in which interrupt_resume_pc may still hold the MRET's own PC. */ + set_timer_cmp(0); /* mtime >= 0 always => MTIP asserted */ + + uint32_t cause = run_in_umode_pending_timer(&u_spin); + disable_timer_interrupt(); + + uint32_t mepc = g_mepc; + uint32_t want_pc = (uint32_t) &u_spin; + int ok = (cause == 0x80000007u) && (g_from_priv == 0u) && (mepc == want_pc); + + uart_puts("cause="); + uart_hex(cause); + uart_puts(" from_priv="); + uart_hex(g_from_priv); + uart_puts(" resume_mepc="); + uart_hex(mepc); + uart_puts(" want_pc(u_spin)="); + uart_hex(want_pc); + uart_puts("\r\n"); + + if (!ok) { + uart_puts("[FAIL] timer trap saved a wrong resume PC " + "(stale interrupt_resume_pc around MRET-to-U)\r\n"); + } else { + uart_puts("[PASS] timer trap resumed at the U-mode target\r\n"); + } + + uart_puts(ok ? "\r\n<<PASS>>\r\n" : "\r\n<<FAIL>>\r\n"); /* distinct end-of-test markers for pass vs fail */ + for (;;) { + } + return 0; +} diff --git a/sw/apps/wfi_mepc_test/Makefile b/sw/apps/wfi_mepc_test/Makefile new file mode 100644 index 00000000..bbb9e7da --- /dev/null +++ b/sw/apps/wfi_mepc_test/Makefile @@ -0,0 +1,17 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Makefile for the timer-interrupt-at-WFI mepc directed test +SRC_C := ../../lib/src/uart.c main.c +include ../../common/common.mk diff --git a/sw/apps/wfi_mepc_test/main.c b/sw/apps/wfi_mepc_test/main.c new file mode 100644 index 00000000..69da6def --- /dev/null +++ b/sw/apps/wfi_mepc_test/main.c @@ -0,0 +1,100 @@ +/* + * Copyright 2026 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Timer-interrupt-at-WFI mepc directed test. + * + * No-MMU M-mode Linux dies on the FIRST machine-timer interrupt taken from the + * idle loop (which executes WFI). FROST sources the interrupt resume PC (mepc) + * from the ROB head_pc UNCONDITIONALLY (reorder_buffer.sv o_trap_pc = head_pc, + * trap_unit.sv interrupt o_trap_pc = i_exception_pc), with no head_valid check. + * WFI drains the ROB, so when the timer fires at WFI the ROB is EMPTY and the + * saved mepc can be a stale head_pc instead of the instruction after the WFI. 
+ * + * umode_test's timer-preempt never hit this: its U-code spins (ROB busy) and it + * never checks mepc. This test fires a timer interrupt while the core is in WFI + * (empty ROB) and checks that the saved mepc == the resume point after the WFI. + * Self-checks over UART (<<PASS>>/<<FAIL>>). + */ + +#include <stdint.h> + +#include "trap.h" +#include "uart.h" + +static volatile uint32_t g_mepc; +static volatile uint32_t g_taken; + +/* + * Naked M-mode handler: record mepc (the saved resume PC) + the taken flag, + * ack the timer (push mtimecmp_hi to max so it cannot refire), then resume at + * the safe continuation stashed in mscratch (NOT the recorded mepc -- if mepc + * is wrong we must still land somewhere valid to report the result). Clobbering + * temporaries is fine because we bounce to a fixed continuation. + */ +__attribute__((naked, aligned(4))) static void wfi_trap_handler(void) +{ + __asm__ volatile("csrr t0, mepc\n" + "lui t1, %hi(g_mepc)\n" + "sw t0, %lo(g_mepc)(t1)\n" + "li t0, 1\n" + "lui t1, %hi(g_taken)\n" + "sw t0, %lo(g_taken)(t1)\n" + "li t1, 0x4000001C\n" /* MTIMECMP_HI: ack timer */ + "li t0, -1\n" + "sw t0, 0(t1)\n" + "csrr t0, mscratch\n" /* safe continuation after the WFI */ + "csrw mepc, t0\n" + "mret\n"); +} + +int main(void) +{ + uint32_t resume_pc; + + uart_printf("\n=== timer-interrupt-at-WFI mepc test ===\n"); + set_trap_handler(&wfi_trap_handler); + g_mepc = 0; + g_taken = 0; + + enable_timer_interrupt(); + set_timer_cmp(rdmtime() + 300); /* fire ~300 cycles out: lands during WFI */ + enable_interrupts(); + + /* Stash the post-WFI continuation in mscratch, capture its address as the + * expected resume PC, then WFI (drains the ROB). The timer fires here.
*/ + __asm__ volatile("la t0, 1f\n" + "csrw mscratch, t0\n" + "la %0, 1f\n" + "wfi\n" + "1:\n" + : "=r"(resume_pc) + : + : "t0", "t1", "memory"); /* t1: wfi_trap_handler overwrites it, so %0 must never live there */ + + while (!g_taken) { + } + + uart_printf("mepc=%08x expected(after WFI)=%08x taken=%u\n", g_mepc, resume_pc, g_taken); + if (g_mepc == resume_pc) { + uart_printf("<<PASS>>\n"); + } else { + uart_printf("<<FAIL>> interrupt-from-empty-ROB saved a stale mepc (not the WFI resume PC)\n"); + } + for (;;) { + } + return 0; +} diff --git a/tests/Makefile b/tests/Makefile index 73166812..628e8335 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -145,6 +145,12 @@ COMPILE_ARGS := \ $(ROOT)/hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv \ $(ROOT)/hw/rtl/cpu_and_mem/cpu/cpu_ooo/memory_if/data_mem_request_router.sv VERILOG_SOURCES := +else ifeq ($(TOPLEVEL),trap_unit) +# Trap unit interrupt/MRET arbitration unit test +COMPILE_ARGS := \ + $(ROOT)/hw/rtl/cpu_and_mem/cpu/riscv_pkg.sv \ + $(ROOT)/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv +VERILOG_SOURCES := else ifeq ($(TOPLEVEL),frontend_validity_tracker) # CPU OOO frontend validity/control-flow tracker unit test COMPILE_ARGS := \ diff --git a/tests/test_run_cocotb.py b/tests/test_run_cocotb.py index cbc06a3d..910890ff 100755 --- a/tests/test_run_cocotb.py +++ b/tests/test_run_cocotb.py @@ -169,6 +169,24 @@ class CocotbRunConfig: app_name="umode_test", description="U-mode (User privilege) directed test", ), + "csr_rmw_test": CocotbRunConfig( + python_test_module="cocotb_tests.test_real_program", + hdl_toplevel_module="frost", + app_name="csr_rmw_test", + description="CSR read-modify-write directed test (csrrw/csrrs/csrrc; kernel trap path)", + ), + "wfi_mepc_test": CocotbRunConfig( + python_test_module="cocotb_tests.test_real_program", + hdl_toplevel_module="frost", + app_name="wfi_mepc_test", + description="Timer-interrupt-at-WFI mepc directed test (empty-ROB interrupt resume PC)", + ), + "mret_timer_resume_test": CocotbRunConfig( + python_test_module="cocotb_tests.test_real_program",
hdl_toplevel_module="frost", + app_name="mret_timer_resume_test", + description="MRET-to-U + pending-timer mepc directed test (stale interrupt resume PC)", + ), "ns16550_test": CocotbRunConfig( python_test_module="cocotb_tests.test_real_program", hdl_toplevel_module="frost", @@ -188,6 +206,30 @@ class CocotbRunConfig: description="No-MMU Linux boot (kernel Image in DDR)", include_in_pytest=False, ), + "linux_irq_ddr_test": CocotbRunConfig( + python_test_module="cocotb_tests.test_real_program", + hdl_toplevel_module="frost", + app_name="linux_irq_ddr_test", + description="Linux-like machine-timer IRQ path with DDR code/data/stack", + ), + "linux_irq_active_ddr_test": CocotbRunConfig( + python_test_module="cocotb_tests.test_real_program", + hdl_toplevel_module="frost", + app_name="linux_irq_active_ddr_test", + description="Linux-like active-code machine-timer IRQ path with DDR call/return traffic", + ), + "linux_irq_stack_slot_test": CocotbRunConfig( + python_test_module="cocotb_tests.test_real_program", + hdl_toplevel_module="frost", + app_name="linux_irq_stack_slot_test", + description="Linux-like timer IRQ over a poisoned DDR callee return-address stack slot", + ), + "linux_irq_find_next_slot_test": CocotbRunConfig( + python_test_module="cocotb_tests.test_real_program", + hdl_toplevel_module="frost", + app_name="linux_irq_find_next_slot_test", + description="Linux _find_next_bit-shaped IRQ over a poisoned DDR return slot", + ), "ddr_atomic_test": CocotbRunConfig( python_test_module="cocotb_tests.test_real_program", hdl_toplevel_module="frost", @@ -424,6 +466,11 @@ class CocotbRunConfig: hdl_toplevel_module="data_mem_request_router", description="CPU OOO data-memory request router tests", ), + "trap_unit": CocotbRunConfig( + python_test_module="cocotb_tests.control.test_trap_unit", + hdl_toplevel_module="trap_unit", + description="Trap unit tests (interrupt/MRET arbitration)", + ), "frost_cache": CocotbRunConfig( 
python_test_module="cocotb_tests.cache.test_frost_cache", hdl_toplevel_module="frost_cache_test_harness", diff --git a/verif/cocotb_tests/control/test_trap_unit.py b/verif/cocotb_tests/control/test_trap_unit.py new file mode 100644 index 00000000..e653e8e1 --- /dev/null +++ b/verif/cocotb_tests/control/test_trap_unit.py @@ -0,0 +1,137 @@ +"""Unit tests for trap_unit interrupt/MRET arbitration.""" + +from typing import Any + +import cocotb +from cocotb.clock import Clock +from cocotb.triggers import RisingEdge, Timer + + +MSTATUS_MIE = 1 << 3 +MIE_MTIE = 1 << 7 +INTERRUPT_MTIP = 0b010 +PRIV_U = 0 +PRIV_M = 3 + + +def _drive_defaults(dut: Any) -> None: + dut.i_pipeline_stall.value = 0 + dut.i_sq_committed_empty.value = 1 + dut.i_mstatus.value = 0 + dut.i_mie.value = 0 + dut.i_mtvec.value = 0x1000 + dut.i_mepc.value = 0x2000 + dut.i_mstatus_mie_direct.value = 0 + dut.i_priv.value = PRIV_M + dut.i_interrupts.value = 0 + dut.i_exception_valid.value = 0 + dut.i_exception_cause.value = 0 + dut.i_exception_tval.value = 0 + dut.i_exception_pc.value = 0x3000 + dut.i_interrupt_pc.value = 0x4000 + dut.i_mret_start.value = 0 + dut.i_wfi_start.value = 0 + + +async def _reset(dut: Any) -> None: + _drive_defaults(dut) + dut.i_rst.value = 1 + await RisingEdge(dut.i_clk) + await RisingEdge(dut.i_clk) + dut.i_rst.value = 0 + await RisingEdge(dut.i_clk) + + +@cocotb.test() +async def test_mret_defers_registered_timer_interrupt(dut: Any) -> None: + cocotb.start_soon(Clock(dut.i_clk, 10, unit="ns").start()) + await _reset(dut) + + dut.i_mstatus.value = MSTATUS_MIE + dut.i_mstatus_mie_direct.value = 1 + dut.i_mie.value = MIE_MTIE + dut.i_interrupts.value = INTERRUPT_MTIP + + # Latch a timer interrupt while the trap unit is stalled. This creates the + # exact bad state from hardware: interrupt_pending is already registered + # when MRET reaches the trap unit. 
+ dut.i_pipeline_stall.value = 1 + await RisingEdge(dut.i_clk) + + dut.i_pipeline_stall.value = 0 + dut.i_mret_start.value = 1 + await Timer(1, unit="ns") + + assert int(dut.o_trap_taken.value) == 0 + assert int(dut.o_mret_taken.value) == 1 + assert int(dut.o_trap_target.value) == 0x2000 + + await RisingEdge(dut.i_clk) + dut.i_mret_start.value = 0 + dut.i_priv.value = PRIV_U + dut.i_mstatus_mie_direct.value = 0 + await Timer(1, unit="ns") + assert int(dut.o_trap_taken.value) == 0 + + await RisingEdge(dut.i_clk) + await Timer(1, unit="ns") + assert int(dut.o_trap_taken.value) == 0 + + +@cocotb.test() +async def test_timer_interrupt_still_traps_without_mret(dut: Any) -> None: + cocotb.start_soon(Clock(dut.i_clk, 10, unit="ns").start()) + await _reset(dut) + + dut.i_mstatus.value = MSTATUS_MIE + dut.i_mstatus_mie_direct.value = 1 + dut.i_mie.value = MIE_MTIE + dut.i_interrupts.value = INTERRUPT_MTIP + + dut.i_pipeline_stall.value = 1 + await RisingEdge(dut.i_clk) + dut.i_pipeline_stall.value = 0 + await Timer(1, unit="ns") + + assert int(dut.o_trap_taken.value) == 1 + assert int(dut.o_mret_taken.value) == 0 + assert int(dut.o_trap_cause.value) == 0x80000007 + assert int(dut.o_trap_target.value) == 0x1000 + + +@cocotb.test() +async def test_registered_interrupt_requires_current_mie(dut: Any) -> None: + cocotb.start_soon(Clock(dut.i_clk, 10, unit="ns").start()) + await _reset(dut) + + dut.i_mstatus.value = MSTATUS_MIE + dut.i_mstatus_mie_direct.value = 1 + dut.i_mie.value = MIE_MTIE + dut.i_interrupts.value = INTERRUPT_MTIP + + # Latch a pending timer interrupt, then model the Linux return path clearing + # mstatus.MIE before that registered pending bit reaches take_trap. 
+ dut.i_pipeline_stall.value = 1 + await RisingEdge(dut.i_clk) + + dut.i_pipeline_stall.value = 0 + dut.i_mstatus.value = 0 + dut.i_mstatus_mie_direct.value = 0 + await Timer(1, unit="ns") + assert int(dut.o_trap_taken.value) == 0 + + await RisingEdge(dut.i_clk) + await Timer(1, unit="ns") + assert int(dut.o_trap_taken.value) == 0 + + # Once MIE is restored, the still-asserted timer interrupt is sampled again + # and should trap on the registered path. + dut.i_mstatus.value = MSTATUS_MIE + dut.i_mstatus_mie_direct.value = 1 + await Timer(1, unit="ns") + assert int(dut.o_trap_taken.value) == 0 + + await RisingEdge(dut.i_clk) + await Timer(1, unit="ns") + assert int(dut.o_trap_taken.value) == 1 + assert int(dut.o_trap_cause.value) == 0x80000007 diff --git a/verif/cocotb_tests/test_real_program.py b/verif/cocotb_tests/test_real_program.py index 7b3f3d95..0c5bd0ce 100644 --- a/verif/cocotb_tests/test_real_program.py +++ b/verif/cocotb_tests/test_real_program.py @@ -651,8 +651,10 @@ def get_expected_behavior() -> tuple[str | None, str | None, bool, str | None]: # Just needs to print the first hello message return (None, "Hello, world!", False, app_name) if app_name == "linux_boot": - # No-MMU Linux boot: pass when the kernel banner appears. - return (None, "Linux version", False, app_name) + # FROST DEBUG (TEMP): run PAST the banner to the first-timer-IRQ + # panic so the RTL trace fires; stop at the panic (or time out + # at max_cycles if it hangs). Revert to "Linux version" after. 
+ return (None, "Kernel panic - not syncing", False, app_name) if app_name == "uart_echo": # Interactive test handled separately (UART input injection) return (None, None, False, app_name) @@ -701,6 +703,18 @@ async def run_until_complete( progress_interval = int( os.environ.get("COCOTB_COREMARK_PROGRESS_INTERVAL", 500_000) ) + irq_precision_check = os.environ.get("FROST_IRQ_PRECISION_CHECK") == "1" + irq_precision_strict = os.environ.get("FROST_IRQ_PRECISION_STRICT") == "1" + irq_low_ra_assert = os.environ.get("FROST_IRQ_LOW_RA_ASSERT") == "1" + irq_precision_event_limit = int( + os.environ.get("FROST_IRQ_PRECISION_EVENT_LIMIT", "64") + ) + irq_precision_events: list[str] = [] + external_irq_symbol = os.environ.get("FROST_EXTERNAL_IRQ_SYMBOL") + external_irq_enabled = bool(external_irq_symbol) + external_irq_offset = int(os.environ.get("FROST_EXTERNAL_IRQ_OFFSET", "0"), 0) + external_irq_max_pulses = int(os.environ.get("FROST_EXTERNAL_IRQ_MAX_PULSES", "1")) + external_irq_hold_cycles = int(os.environ.get("FROST_EXTERNAL_IRQ_HOLD_CYCLES", "1")) retire_sig = None pc_sig = None pc_vld_sig = None @@ -786,6 +800,16 @@ async def run_until_complete( ras_pop_after_restore_live_sig = None commit_valid_live_sig = None commit_pc_live_sig = None + commit0_dest_valid_sig = None + commit0_dest_rf_sig = None + commit0_dest_reg_sig = None + commit0_value_sig = None + commit1_valid_sig = None + commit1_pc_sig = None + commit1_dest_valid_sig = None + commit1_dest_rf_sig = None + commit1_dest_reg_sig = None + commit1_value_sig = None commit_is_return_live_sig = None commit_is_call_live_sig = None commit_checkpoint_id_live_sig = None @@ -856,6 +880,36 @@ async def run_until_complete( rob_alloc_is_csr_live_sig = None rob_alloc_is_mret_live_sig = None id_instruction_live_sig = None + trap_taken_live_sig = None + trap_taken_reg_dbg_sig = None + trap_cause_internal_live_sig = None + mret_taken_live_sig = None + trap_target_live_sig = None + trap_pending_live_sig = None + 
rob_trap_pc_live_sig = None + trap_pc_internal_live_sig = None + interrupt_resume_pc_live_sig = None + csr_commit_fire_live_sig = None + csr_mepc_live_sig = None + flush_all_live_sig = None + port0_int_we_sig = None + port0_int_addr_sig = None + port0_int_data_sig = None + port1_int_we_sig = None + port1_int_addr_sig = None + port1_int_data_sig = None + rob_commit0_reg_valid_sig = None + rob_commit0_reg_pc_sig = None + rob_commit0_reg_dest_valid_sig = None + rob_commit0_reg_dest_rf_sig = None + rob_commit0_reg_dest_reg_sig = None + rob_commit0_reg_value_sig = None + rob_commit1_reg_valid_sig = None + rob_commit1_reg_pc_sig = None + rob_commit1_reg_dest_valid_sig = None + rob_commit1_reg_dest_rf_sig = None + rob_commit1_reg_dest_reg_sig = None + rob_commit1_reg_value_sig = None coremark_cf_debug_enabled = ( is_coremark_like and os.environ.get("FROST_COREMARK_CF_DEBUG") == "1" ) @@ -894,7 +948,7 @@ async def run_until_complete( control_flow_trace_label = os.environ.get( "FROST_CONTROL_FLOW_TRACE_LABEL", f"{app_name or 'program'} trace" ) - if progress_interval: + if progress_interval or irq_precision_check or external_irq_enabled: retire_sig = _get_signal( dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_valid" ) @@ -1453,6 +1507,36 @@ async def run_until_complete( commit_pc_live_sig = _get_signal( dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_pc" ) + commit0_dest_valid_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_dest_valid" + ) + commit0_dest_rf_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_dest_rf" + ) + commit0_dest_reg_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_dest_reg" + ) + commit0_value_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_value" + ) + commit1_valid_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_2_valid" + ) + commit1_pc_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_2_pc" + ) + 
commit1_dest_valid_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_2_dest_valid" + ) + commit1_dest_rf_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_2_dest_rf" + ) + commit1_dest_reg_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_2_dest_reg" + ) + commit1_value_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_2_value" + ) commit_is_return_live_sig = _get_signal( dut, "cpu_and_memory_subsystem.cpu_inst.dbg_commit_is_return" ) @@ -1529,6 +1613,20 @@ async def run_until_complete( trap_taken_live_sig = _get_signal( dut, "cpu_and_memory_subsystem.cpu_inst.trap_taken" ) + trap_taken_reg_dbg_sig = _first_signal( + dut, + [ + "cpu_and_memory_subsystem.cpu_inst.dbg_trap_taken_q", + "cpu_and_memory_subsystem.cpu_inst.trap_taken_reg", + ], + ) + trap_cause_internal_live_sig = _first_signal( + dut, + [ + "cpu_and_memory_subsystem.cpu_inst.dbg_trap_cause_internal", + "cpu_and_memory_subsystem.cpu_inst.trap_cause_internal", + ], + ) mret_taken_live_sig = _get_signal( dut, "cpu_and_memory_subsystem.cpu_inst.mret_taken" ) @@ -1541,6 +1639,16 @@ async def run_until_complete( rob_trap_pc_live_sig = _get_signal( dut, "cpu_and_memory_subsystem.cpu_inst.rob_trap_pc" ) + trap_pc_internal_live_sig = _first_signal( + dut, + [ + "cpu_and_memory_subsystem.cpu_inst.dbg_trap_pc_internal", + "cpu_and_memory_subsystem.cpu_inst.rob_trap_pc", + ], + ) + interrupt_resume_pc_live_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_interrupt_resume_pc" + ) rob_trap_cause_live_sig = _get_signal( dut, "cpu_and_memory_subsystem.cpu_inst.rob_trap_cause" ) @@ -1556,6 +1664,87 @@ async def run_until_complete( csr_mepc_live_sig = _get_signal( dut, "cpu_and_memory_subsystem.cpu_inst.csr_mepc" ) + flush_all_live_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.flush_all" + ) + port0_int_we_sig = _first_signal( + dut, + [ + "cpu_and_memory_subsystem.cpu_inst.dbg_port0_int_we", + 
"cpu_and_memory_subsystem.cpu_inst.port0_int_we", + ], + ) + port0_int_addr_sig = _first_signal( + dut, + [ + "cpu_and_memory_subsystem.cpu_inst.dbg_port0_int_addr", + "cpu_and_memory_subsystem.cpu_inst.port0_int_addr", + ], + ) + port0_int_data_sig = _first_signal( + dut, + [ + "cpu_and_memory_subsystem.cpu_inst.dbg_port0_int_data", + "cpu_and_memory_subsystem.cpu_inst.port0_int_data", + ], + ) + port1_int_we_sig = _first_signal( + dut, + [ + "cpu_and_memory_subsystem.cpu_inst.dbg_port1_int_we", + "cpu_and_memory_subsystem.cpu_inst.port1_int_we", + ], + ) + port1_int_addr_sig = _first_signal( + dut, + [ + "cpu_and_memory_subsystem.cpu_inst.dbg_port1_int_addr", + "cpu_and_memory_subsystem.cpu_inst.port1_int_addr", + ], + ) + port1_int_data_sig = _first_signal( + dut, + [ + "cpu_and_memory_subsystem.cpu_inst.dbg_port1_int_data", + "cpu_and_memory_subsystem.cpu_inst.port1_int_data", + ], + ) + rob_commit0_reg_valid_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_valid" + ) + rob_commit0_reg_pc_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_pc" + ) + rob_commit0_reg_dest_valid_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_dest_valid" + ) + rob_commit0_reg_dest_rf_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_dest_rf" + ) + rob_commit0_reg_dest_reg_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_dest_reg" + ) + rob_commit0_reg_value_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_value" + ) + rob_commit1_reg_valid_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_valid" + ) + rob_commit1_reg_pc_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_pc" + ) + rob_commit1_reg_dest_valid_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_dest_valid" + ) + rob_commit1_reg_dest_rf_sig = 
_get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_dest_rf" + ) + rob_commit1_reg_dest_reg_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_dest_reg" + ) + rob_commit1_reg_value_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_value" + ) flush_pipeline_live_sig = _get_signal( dut, "cpu_and_memory_subsystem.cpu_inst.flush_pipeline" ) @@ -1651,6 +1840,118 @@ async def run_until_complete( (0x353C, 0x35DC), ] + if irq_precision_check: + trap_taken_live_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.trap_taken" + ) + trap_cause_internal_live_sig = _first_signal( + dut, + [ + "cpu_and_memory_subsystem.cpu_inst.dbg_trap_cause_internal", + "cpu_and_memory_subsystem.cpu_inst.trap_cause_internal", + ], + ) + rob_trap_pc_live_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.rob_trap_pc" + ) + trap_pc_internal_live_sig = _first_signal( + dut, + [ + "cpu_and_memory_subsystem.cpu_inst.dbg_trap_pc_internal", + "cpu_and_memory_subsystem.cpu_inst.rob_trap_pc", + ], + ) + interrupt_resume_pc_live_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_interrupt_resume_pc" + ) + csr_commit_fire_live_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.csr_commit_fire" + ) + csr_mepc_live_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.csr_mepc" + ) + flush_all_live_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.flush_all" + ) + port0_int_we_sig = _first_signal( + dut, + [ + "cpu_and_memory_subsystem.cpu_inst.dbg_port0_int_we", + "cpu_and_memory_subsystem.cpu_inst.port0_int_we", + ], + ) + port0_int_addr_sig = _first_signal( + dut, + [ + "cpu_and_memory_subsystem.cpu_inst.dbg_port0_int_addr", + "cpu_and_memory_subsystem.cpu_inst.port0_int_addr", + ], + ) + port0_int_data_sig = _first_signal( + dut, + [ + "cpu_and_memory_subsystem.cpu_inst.dbg_port0_int_data", + "cpu_and_memory_subsystem.cpu_inst.port0_int_data", + ], + ) + 
port1_int_we_sig = _first_signal( + dut, + [ + "cpu_and_memory_subsystem.cpu_inst.dbg_port1_int_we", + "cpu_and_memory_subsystem.cpu_inst.port1_int_we", + ], + ) + port1_int_addr_sig = _first_signal( + dut, + [ + "cpu_and_memory_subsystem.cpu_inst.dbg_port1_int_addr", + "cpu_and_memory_subsystem.cpu_inst.port1_int_addr", + ], + ) + port1_int_data_sig = _first_signal( + dut, + [ + "cpu_and_memory_subsystem.cpu_inst.dbg_port1_int_data", + "cpu_and_memory_subsystem.cpu_inst.port1_int_data", + ], + ) + rob_commit0_reg_valid_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_valid" + ) + rob_commit0_reg_pc_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_pc" + ) + rob_commit0_reg_dest_valid_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_dest_valid" + ) + rob_commit0_reg_dest_rf_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_dest_rf" + ) + rob_commit0_reg_dest_reg_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_dest_reg" + ) + rob_commit0_reg_value_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_value" + ) + rob_commit1_reg_valid_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_valid" + ) + rob_commit1_reg_pc_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_pc" + ) + rob_commit1_reg_dest_valid_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_dest_valid" + ) + rob_commit1_reg_dest_rf_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_dest_rf" + ) + rob_commit1_reg_dest_reg_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_dest_reg" + ) + rob_commit1_reg_value_sig = _get_signal( + dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_value" + ) + retired_pc_hist: Counter[int] = Counter() retired_mispredicts = 0 
last_progress_mispredicts = 0 @@ -1681,6 +1982,9 @@ async def run_until_complete( last_checkpoint_in_use = None last_rat_a0_state = None last_x2_commit = None + last_x2_commit_pc = None + last_x2_raw_commit = None + last_x2_raw_commit_pc = None last_x5_commit = None last_x8_commit = None last_x9_commit = None @@ -1691,6 +1995,37 @@ async def run_until_complete( control_flow_trace_enabled = True retire_only_trace = os.environ.get("FROST_CONTROL_FLOW_RETIRE_ONLY") == "1" ras_transition_trace_active = True + irq_precision_callee_range: tuple[int, int] | None = None + external_irq_range: tuple[int, int] | None = None + external_irq_active = False + external_irq_hold_remaining = 0 + external_irq_pulses = 0 + external_irq_armed = True + if irq_precision_check: + irq_callee_symbol = os.environ.get( + "FROST_IRQ_CALLEE_SYMBOL", "irq_stack_slot_callee" + ) + irq_symbol_ranges = _load_symbol_ranges([irq_callee_symbol], app_name) + irq_precision_callee_range = irq_symbol_ranges.get(irq_callee_symbol) + if irq_precision_callee_range is not None: + lo, hi = irq_precision_callee_range + cocotb.log.info( + f"IRQ precision callee window {irq_callee_symbol}: " + f"[0x{lo:08x}, 0x{hi:08x})" + ) + if external_irq_enabled and external_irq_symbol is not None: + external_symbol_ranges = _load_symbol_ranges([external_irq_symbol], app_name) + external_irq_range = external_symbol_ranges.get(external_irq_symbol) + if external_irq_range is None: + raise AssertionError( + f"FROST_EXTERNAL_IRQ_SYMBOL={external_irq_symbol!r} not found" + ) + lo, hi = external_irq_range + cocotb.log.info( + f"External IRQ injector armed for {external_irq_symbol}: " + f"[0x{lo:08x}, 0x{hi:08x}) offset=0x{external_irq_offset:x} " + f"max_pulses={external_irq_max_pulses}" + ) def in_trace_window(pc: int | None) -> bool: if pc is None or control_flow_trace_ranges is None: @@ -1710,6 +2045,25 @@ def coremark_symbol_name_for_pc(pc: int | None) -> str: return symbol_name return "-" + def commit_writes_x1_x2_at_pc( + 
valid_sig: Any | None, + pc_sig: Any | None, + dest_valid_sig: Any | None, + dest_rf_sig: Any | None, + dest_reg_sig: Any | None, + trap_pc: int | None, + ) -> bool: + if trap_pc is None: + return False + dest_reg = _read_int(dest_reg_sig) + return ( + bool(_read_bool(valid_sig)) + and bool(_read_bool(dest_valid_sig)) + and not bool(_read_bool(dest_rf_sig)) + and dest_reg in {1, 2} + and _read_int(pc_sig) == trap_pc + ) + def format_coremark_if_mismatch( *, stage: str, @@ -1750,6 +2104,238 @@ def dump_coremark_retire_trace() -> None: for cycle in range(max_cycles): await RisingEdge(dut.i_clk) + if external_irq_enabled and hasattr(dut, "i_external_interrupt"): + if external_irq_active: + if external_irq_hold_remaining > 0: + external_irq_hold_remaining -= 1 + if trap_taken_live_sig is not None and bool(_read_bool(trap_taken_live_sig)): + external_irq_hold_remaining = 0 + if external_irq_hold_remaining == 0: + dut.i_external_interrupt.value = 0 + external_irq_active = False + external_irq_armed = False + + if ( + not external_irq_active + and external_irq_armed + and external_irq_range is not None + and external_irq_pulses < external_irq_max_pulses + ): + retire_valid = bool(_read_bool(retire_sig)) + retire_pc = _read_int(retire_pc_sig) + lo, hi = external_irq_range + trigger_pc = lo + external_irq_offset + if retire_valid and retire_pc is not None and trigger_pc <= retire_pc < hi: + dut.i_external_interrupt.value = 1 + external_irq_active = True + external_irq_hold_remaining = max(1, external_irq_hold_cycles) + external_irq_pulses += 1 + cocotb.log.info( + f"External IRQ pulse {external_irq_pulses} at " + f"cycle={cycle + 1} retire_pc=0x{retire_pc:08x}" + ) + + if ( + not external_irq_armed + and external_irq_range is not None + and external_irq_pulses < external_irq_max_pulses + ): + retire_pc = _read_int(retire_pc_sig) + lo, hi = external_irq_range + if retire_pc is None or not (lo <= retire_pc < hi): + external_irq_armed = True + + if irq_precision_check: + 
raw_x2_events = [] + for valid_sig, pc_sig, dest_valid_sig, dest_rf_sig, dest_reg_sig, value_sig in ( + ( + commit_valid_live_sig, + commit_pc_live_sig, + commit0_dest_valid_sig, + commit0_dest_rf_sig, + commit0_dest_reg_sig, + commit0_value_sig, + ), + ( + commit1_valid_sig, + commit1_pc_sig, + commit1_dest_valid_sig, + commit1_dest_rf_sig, + commit1_dest_reg_sig, + commit1_value_sig, + ), + ): + if ( + bool(_read_bool(valid_sig)) + and bool(_read_bool(dest_valid_sig)) + and not bool(_read_bool(dest_rf_sig)) + and _read_int(dest_reg_sig) == 2 + ): + value = _read_int(value_sig) + pc = _read_int(pc_sig) + last_x2_raw_commit = value + last_x2_raw_commit_pc = pc + raw_x2_events.append(f"0x{(value or 0):08x}@0x{(pc or 0):08x}") + + current_x2_commit = last_x2_commit + current_x2_commit_pc = last_x2_commit_pc + wb_x2_events = [] + for port_name, we_sig, addr_sig, data_sig, pc_sig in ( + ( + "p0", + port0_int_we_sig, + port0_int_addr_sig, + port0_int_data_sig, + rob_commit0_reg_pc_sig, + ), + ( + "p1", + port1_int_we_sig, + port1_int_addr_sig, + port1_int_data_sig, + rob_commit1_reg_pc_sig, + ), + ): + if bool(_read_bool(we_sig)) and _read_int(addr_sig) == 2: + value = _read_int(data_sig) + pc = _read_int(pc_sig) + current_x2_commit = value + current_x2_commit_pc = pc + wb_x2_events.append( + f"{port_name}=0x{(value or 0):08x}@0x{(pc or 0):08x}" + ) + + trap = bool(_read_bool(trap_taken_live_sig)) + trap_q = bool(_read_bool(trap_taken_reg_dbg_sig)) + flush_all = bool(_read_bool(flush_all_live_sig)) + trap_cause = _read_int(trap_cause_internal_live_sig) + is_irq = bool((trap_cause or 0) & 0x8000_0000) + trap_pc = _read_int(trap_pc_internal_live_sig) + rob_trap_pc = _read_int(rob_trap_pc_live_sig) + interrupt_resume_pc = _read_int(interrupt_resume_pc_live_sig) + c0_valid = bool(_read_bool(commit_valid_live_sig)) + c1_valid = bool(_read_bool(commit1_valid_sig)) + c0_pc = _read_int(commit_pc_live_sig) + c1_pc = _read_int(commit1_pc_sig) + reg0_sensitive = 
commit_writes_x1_x2_at_pc( + rob_commit0_reg_valid_sig, + rob_commit0_reg_pc_sig, + rob_commit0_reg_dest_valid_sig, + rob_commit0_reg_dest_rf_sig, + rob_commit0_reg_dest_reg_sig, + trap_pc, + ) + reg1_sensitive = commit_writes_x1_x2_at_pc( + rob_commit1_reg_valid_sig, + rob_commit1_reg_pc_sig, + rob_commit1_reg_dest_valid_sig, + rob_commit1_reg_dest_rf_sig, + rob_commit1_reg_dest_reg_sig, + trap_pc, + ) + raw0_sensitive = commit_writes_x1_x2_at_pc( + commit_valid_live_sig, + commit_pc_live_sig, + commit0_dest_valid_sig, + commit0_dest_rf_sig, + commit0_dest_reg_sig, + trap_pc, + ) + raw1_sensitive = commit_writes_x1_x2_at_pc( + commit1_valid_sig, + commit1_pc_sig, + commit1_dest_valid_sig, + commit1_dest_rf_sig, + commit1_dest_reg_sig, + trap_pc, + ) + + stale_sp_body = False + if trap and is_irq and trap_pc is not None and irq_precision_callee_range: + callee_lo, callee_hi = irq_precision_callee_range + x2_from_callee = ( + current_x2_commit_pc is not None + and callee_lo <= current_x2_commit_pc < callee_hi + ) + stale_sp_body = callee_lo + 4 <= trap_pc < callee_hi and not x2_from_callee + + if trap and is_irq: + event = ( + f"IRQ precision event cycle={cycle + 1} " + f"cause=0x{(trap_cause or 0):08x} trap_pc=0x{(trap_pc or 0):08x} " + f"rob_pc=0x{(rob_trap_pc or 0):08x} " + f"resume_pc=0x{(interrupt_resume_pc or 0):08x} " + f"c0={int(c0_valid)} pc0=0x{(c0_pc or 0):08x} " + f"rd0={_read_int(commit0_dest_reg_sig)} " + f"c1={int(c1_valid)} pc1=0x{(c1_pc or 0):08x} " + f"rd1={_read_int(commit1_dest_reg_sig)} " + f"p0we={int(bool(_read_bool(port0_int_we_sig)))} " + f"p0a={_read_int(port0_int_addr_sig)} " + f"p0d=0x{(_read_int(port0_int_data_sig) or 0):08x} " + f"p0pc=0x{(_read_int(rob_commit0_reg_pc_sig) or 0):08x} " + f"p1we={int(bool(_read_bool(port1_int_we_sig)))} " + f"p1a={_read_int(port1_int_addr_sig)} " + f"p1d=0x{(_read_int(port1_int_data_sig) or 0):08x} " + f"p1pc=0x{(_read_int(rob_commit1_reg_pc_sig) or 0):08x} " + 
f"csr_fire={int(bool(_read_bool(csr_commit_fire_live_sig)))} " + f"trap_q={int(trap_q)} flush_all={int(flush_all)} " + f"mepc=0x{(_read_int(csr_mepc_live_sig) or 0):08x} " + f"last_x2_arch=0x{(current_x2_commit or 0):08x} " + f"last_x2_arch_pc=0x{(current_x2_commit_pc or 0):08x} " + f"last_x2_raw=0x{(last_x2_raw_commit or 0):08x} " + f"last_x2_raw_pc=0x{(last_x2_raw_commit_pc or 0):08x} " + f"raw_x2_now={','.join(raw_x2_events) or '-'} " + f"wb_x2_now={','.join(wb_x2_events) or '-'}" + ) + if len(irq_precision_events) < irq_precision_event_limit: + irq_precision_events.append(event) + cocotb.log.info(event) + + raw_commit_collision = c0_valid or c1_valid + sensitive_pc_write = ( + raw0_sensitive or raw1_sensitive or reg0_sensitive or reg1_sensitive + ) + if irq_precision_strict and ( + raw_commit_collision or sensitive_pc_write or stale_sp_body + ): + raise AssertionError( + "IRQ precision violation: " + f"raw_commit={raw_commit_collision} " + f"x1_x2_same_pc={sensitive_pc_write} " + f"stale_sp_body={stale_sp_body}; {event}" + ) + + low_ra_events = [] + for port_name, we_sig, addr_sig, data_sig in ( + ("p0", port0_int_we_sig, port0_int_addr_sig, port0_int_data_sig), + ("p1", port1_int_we_sig, port1_int_addr_sig, port1_int_data_sig), + ): + data_value = _read_int(data_sig) + if ( + bool(_read_bool(we_sig)) + and _read_int(addr_sig) == 1 + and data_value is not None + and data_value < 0x1000 + ): + low_ra_events.append(f"{port_name}=0x{data_value:08x}") + if irq_low_ra_assert and low_ra_events: + raise AssertionError( + "Low RA writeback under IRQ monitor: " + f"cycle={cycle + 1} {' '.join(low_ra_events)} " + f"trap={int(trap)} irq={int(is_irq)} " + f"cause=0x{(trap_cause or 0):08x} " + f"trap_pc=0x{(trap_pc or 0):08x} " + f"mepc=0x{(_read_int(csr_mepc_live_sig) or 0):08x}" + ) + + for we_sig, addr_sig, data_sig, pc_sig in ( + (port0_int_we_sig, port0_int_addr_sig, port0_int_data_sig, rob_commit0_reg_pc_sig), + (port1_int_we_sig, port1_int_addr_sig, 
port1_int_data_sig, rob_commit1_reg_pc_sig), + ): + if bool(_read_bool(we_sig)) and _read_int(addr_sig) == 2: + last_x2_commit = _read_int(data_sig) + last_x2_commit_pc = _read_int(pc_sig) + if _read_bool(int_rf_write_enable_sig): commit_addr = _read_int(int_rf_write_addr_sig) commit_data = _read_int(int_rf_write_data_sig) @@ -2719,6 +3305,8 @@ async def test_real_program(dut: Any) -> None: dut.i_rst_n.value = 0 if hasattr(dut, "i_uart_rx"): dut.i_uart_rx.value = 1 + if hasattr(dut, "i_external_interrupt"): + dut.i_external_interrupt.value = 0 for _ in range(RESET_CYCLES): await RisingEdge(dut.i_clk) dut.i_rst_n.value = 1 @@ -2728,6 +3316,8 @@ async def test_real_program(dut: Any) -> None: dut.i_rst_n.value = 0 if hasattr(dut, "i_uart_rx"): dut.i_uart_rx.value = 1 + if hasattr(dut, "i_external_interrupt"): + dut.i_external_interrupt.value = 0 await Timer(2 * CLK_PERIOD_NS, unit="ns") await RisingEdge(dut.i_clk) dut.i_rst_n.value = 1 diff --git a/verif/cocotb_tests/tomasulo/load_queue/test_load_queue.py b/verif/cocotb_tests/tomasulo/load_queue/test_load_queue.py index 4afd39b9..e0f9784a 100644 --- a/verif/cocotb_tests/tomasulo/load_queue/test_load_queue.py +++ b/verif/cocotb_tests/tomasulo/load_queue/test_load_queue.py @@ -1309,6 +1309,68 @@ async def test_cached_response_after_invalidate_does_not_refill_l0(dut: Any) -> await accept_fu_complete(dut_if) +@cocotb.test() +async def test_cached_response_during_flush_all_does_not_refill_l0(dut: Any) -> None: + """A cached-tier response coincident with flush_all must be drained only.""" + dut_if, model = await setup(dut) + + addr = 0x8000_0300 + stale_word = 0x0000_0CC0 + fresh_word = 0xA5A5_5A5A + + dut_if.drive_sq_empty(True) + + # Launch a cached-region load and delay its response until full flush. 
+ await alloc_and_addr(dut_if, model, rob_tag=1, address=addr) + dut_if.drive_sq_all_older_known(True) + dut_if.drive_sq_forward(match=False, can_forward=False) + + mem_req = await wait_for_mem_request(dut_if) + assert mem_req["en"], "Expected cached load to issue" + assert mem_req["addr"] == addr + await dut_if.step() + + # The response arrives in the same cycle as trap/MRET-style full flush. + # It must not complete the killed load and must not refill the persistent L0. + dut_if.drive_flush_all() + model.flush_all() + dut_if.drive_mem_response(stale_word) + await Timer(1, unit="ns") + assert not bool(dut.o_l0_fill.value), "Full-flush response filled L0" + await dut_if.step() + dut_if.clear_flush_all() + dut_if.clear_mem_response() + + assert dut_if.empty, "Full flush should clear the LQ" + assert not (await wait_for_fu_complete(dut_if, max_cycles=1)).valid + + # A later load to the same word must miss L0 and fetch the fresh value. + await alloc_and_addr(dut_if, model, rob_tag=2, address=addr) + dut_if.drive_sq_all_older_known(True) + dut_if.drive_sq_forward(match=False, can_forward=False) + await Timer(1, unit="ns") + + assert not bool(dut.o_l0_hit.value), "Flushed response left a stale L0 hit" + mem_req = await wait_for_mem_request(dut_if, max_cycles=4) + assert mem_req["en"], "Later load should miss L0 and issue to memory" + assert mem_req["addr"] == addr + await dut_if.step() + + dut_if.drive_mem_response(fresh_word) + model.mem_response(fresh_word) + await dut_if.step() + dut_if.clear_mem_response() + + result = await wait_for_fu_complete(dut_if) + assert result.valid, "Later load should complete from memory" + assert result.tag == 2 + assert result.value == fresh_word + + dut_if.drive_sq_all_older_known(False) + dut_if.clear_sq_forward() + await accept_fu_complete(dut_if) + + # ============================================================================ # Test 26: Cache miss fills cache, subsequent hit # 
============================================================================ diff --git a/verif/cocotb_tests/tomasulo/tomasulo_wrapper/test_tomasulo_wrapper.py b/verif/cocotb_tests/tomasulo/tomasulo_wrapper/test_tomasulo_wrapper.py index cf75edaa..cc3bb5a0 100644 --- a/verif/cocotb_tests/tomasulo/tomasulo_wrapper/test_tomasulo_wrapper.py +++ b/verif/cocotb_tests/tomasulo/tomasulo_wrapper/test_tomasulo_wrapper.py @@ -367,6 +367,43 @@ async def mark_done_via_cdb(dut_if: TomasuloInterface, tag: int, value: int) -> dut_if.clear_cdb_write() +async def issue_sw_via_mem_rs( + dut_if: TomasuloInterface, + tag: int, + base_addr: int, + store_data: int, + imm: int = 0, + max_cycles: int = 6, +) -> dict: + """Dispatch an SW to MEM_RS and wait until its issue is captured.""" + dut_if.drive_rs_dispatch( + rs_type=RS_MEM, + rob_tag=tag, + op=OP_SW, + src1_ready=True, + src1_value=base_addr, + src2_ready=True, + src2_value=store_data, + src3_ready=True, + imm=imm, + use_imm=True, + mem_size=2, + mem_signed=False, + ) + await dut_if.step() + dut_if.clear_rs_dispatch() + + for _ in range(max_cycles): + await Timer(1, unit="ps") + issue = dut_if.read_rs_issue_for(RS_MEM) + if issue["valid"] and issue["rob_tag"] == tag: + await dut_if.step() + return dict(issue) + await dut_if.step() + + raise TimeoutError("SW did not issue from MEM_RS") + + async def wait_for_commit_pair( dut_if: TomasuloInterface, max_cycles: int = 10 ) -> tuple[dict, dict, bool]: @@ -874,6 +911,58 @@ async def test_widen_commit_slot2_clears_rat_through_wrapper(dut: Any) -> None: cocotb.log.info("=== Test Passed ===") +@cocotb.test() +async def test_slot2_store_raw_commit_blocks_sq_committed_empty(dut: Any) -> None: + """A raw slot-2 store commit must immediately hold SQ committed_empty low.""" + cocotb.log.info("=== Test: Slot-2 Store Raw Commit Blocks SQ committed_empty ===") + dut_if, _ = await setup_test(dut) + + dut_if.set_all_fu_ready(True) + dut_if.set_commit_hold(True) + + tag_1, tag_2 = await 
drive_dual_alloc( + dut_if, + make_int_req(pc=0x4380, rd=11), + make_store_req(pc=0x4384), + ) + + await mark_done_via_cdb(dut_if, tag_1, 0x5151) + issue = await issue_sw_via_mem_rs( + dut_if, + tag=tag_2, + base_addr=0x2400, + store_data=0xA5A5_5A5A, + imm=4, + ) + assert issue["rob_tag"] == tag_2 + assert not dut_if.sq_empty, "Store should have an SQ entry before commit" + assert dut_if.sq_committed_empty, "Uncommitted store should not block traps yet" + + dut_if.set_widen_commit_ok(True) + dut_if.set_commit_hold(False) + + await Timer(1, unit="ps") + commit_1 = dut_if.read_commit() + commit_2 = dut_if.read_commit_2() + commit_2_valid_raw = dut_if.commit_2_valid_raw + commit_2_store_like_raw = dut_if.commit_2_store_like_raw + + await RisingEdge(dut_if.clock) + await Timer(1, unit="ps") + sq_committed_empty = dut_if.sq_committed_empty + await FallingEdge(dut_if.clock) + + assert commit_1["valid"] and commit_1["tag"] == tag_1 + assert commit_2["valid"] and commit_2["tag"] == tag_2 + assert commit_2_valid_raw, "Slot-2 raw commit should be visible" + assert commit_2_store_like_raw, "Slot-2 raw commit should be store-like" + assert not sq_committed_empty, ( + "Slot-2 raw store commit must feed SQ's same-cycle committed_empty guard" + ) + + cocotb.log.info("=== Test Passed ===") + + @cocotb.test() async def test_widen_commit_ok_blocks_slot2_through_wrapper(dut: Any) -> None: """Wrapper forwards slot-2 widen-commit back-pressure to the ROB.""" @@ -4387,16 +4476,13 @@ async def test_sc_pending_does_not_block_older_load(dut: Any) -> None: @cocotb.test() async def test_partial_flush_preserves_older_sc_pending(dut: Any) -> None: - """Partial flush clears sc_pending even if SC is older than flush tag. - - speculative_flush_all treats any i_flush_en as a full flush for timing - closure, so sc_pending is always cleared on partial flush regardless of - age. This test verifies the conservative (timing-safe) behaviour. 
+ """Partial flush preserves sc_pending when SC is older than flush tag. - Scenario: SC (tag 1) issues → sc_pending set. Branch (tag 2) mispredicts → - partial flush with flush_tag=2. Conservative flush clears sc_pending. + Scenario: SC (tag 1) issues -> sc_pending set. Branch (tag 2) mispredicts + -> partial flush with flush_tag=2. The SC is older than the flush boundary, + so the table entry must survive. """ - cocotb.log.info("=== Test: Partial Flush Clears SC Pending (Conservative) ===") + cocotb.log.info("=== Test: Partial Flush Preserves Older SC Pending ===") dut_if, model = await setup_test(dut) addr = 0x1000 @@ -4518,17 +4604,14 @@ async def test_partial_flush_preserves_older_sc_pending(dut: Any) -> None: dut_if.set_fu_ready(RS_MEM, False) assert int(dut.sc_pending.value), "sc_pending should be set" - assert int(dut.sc_pending_unit_inst.sc_pending_rob_tag.value) == tag_sc # --- Phase 6: Partial flush with tag=branch (younger than SC) --- - # speculative_flush_all = i_flush_all || i_flush_en, so any partial flush - # conservatively clears sc_pending regardless of age comparison. 
dut_if.drive_flush_en(tag_branch) await dut_if.step() dut_if.clear_flush_en() - assert not int(dut.sc_pending.value), ( - f"sc_pending should be cleared by conservative flush: SC tag={tag_sc}, " + assert int(dut.sc_pending.value), ( + f"sc_pending should survive partial flush: SC tag={tag_sc}, " f"flush tag={tag_branch}" ) diff --git a/verif/cocotb_tests/tomasulo/tomasulo_wrapper/tomasulo_interface.py b/verif/cocotb_tests/tomasulo/tomasulo_wrapper/tomasulo_interface.py index 011e31c2..d238d5fd 100644 --- a/verif/cocotb_tests/tomasulo/tomasulo_wrapper/tomasulo_interface.py +++ b/verif/cocotb_tests/tomasulo/tomasulo_wrapper/tomasulo_interface.py @@ -601,6 +601,11 @@ def commit_2_valid_raw(self) -> bool: """Return unregistered widen-commit slot-2 valid.""" return bool(self.dut.o_commit_2_valid_raw.value) + @property + def commit_2_store_like_raw(self) -> bool: + """Return unregistered widen-commit slot-2 store-like marker.""" + return bool(self.dut.o_commit_2_store_like_raw.value) + # ========================================================================= # ROB Status # ========================================================================= @@ -1091,6 +1096,11 @@ def sq_empty(self) -> bool: """Return whether SQ is empty.""" return bool(self.dut.o_sq_empty.value) + @property + def sq_committed_empty(self) -> bool: + """Return whether SQ has no committed stores waiting to drain.""" + return bool(self.dut.o_sq_committed_empty.value) + @property def sq_count(self) -> int: """Return number of valid SQ entries.""" From 3d7766c8ef2c28aee08020308249d89b859cbbf6 Mon Sep 17 00:00:00 2001 From: Adam Bagley Date: Sun, 21 Jun 2026 02:41:07 -0400 Subject: [PATCH 13/43] productionize 718f8cc bring-up changeset: drop debug harness, sync docs - cpu_ooo.sv: remove the temporary `ifndef SYNTHESIS FROST_HB/FROST_DBG $display debug harness (the self-labeled "first-timer-IRQ ra-corruption hunt" heartbeat + trap/RA traces). 
Pure deletion (31 lines, 0 added); the test-facing dbg_* /* verilator public_flat_rd */ signals the cocotb harness reads are untouched. Rebuilt clean; mret_timer_resume_test still PASSes (resume_mepc = u_spin U-target). - Docs synced to the committed RTL: - tomasulo_wrapper/README.md: document the !i_flush_all mask on the registered commit-valid outputs and the slot-2 raw store-commit SQ guard (i_commit_valid_comb_2, previously tied to 1'b0). - store_queue/README.md: the combinational commit guard covers any flush racing a registered commit (partial-flush recovery AND full-flush trap/MRET/ FENCE.I drains); the wrapper now actually drives the slot-2 twin. - verif/README.md: add the new cocotb_tests/control/ (trap_unit) entry. - test_real_program.py: revert the linux_boot pass marker from the temporary "Kernel panic" debug string (now obsolete -- the fix removed that panic) to the "Linux version" boot banner. linux_boot is include_in_pytest=False (not in CI); interim criterion pending boot-to-shell. CI already covers the new tests (pytest -m cocotb / -k test_real_program); runner + Makefile registration was already complete. 
--- hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv | 31 ------------------- .../cpu/tomasulo/store_queue/README.md | 12 +++++-- .../cpu/tomasulo/tomasulo_wrapper/README.md | 14 +++++++++ verif/README.md | 1 + verif/cocotb_tests/test_real_program.py | 8 ++--- 5 files changed, 28 insertions(+), 38 deletions(-) diff --git a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv index 8ea18537..dfc0d24d 100644 --- a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv +++ b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv @@ -2112,37 +2112,6 @@ module cpu_ooo #( .o_stall_for_wfi() // WFI stall handled at ROB head ); -`ifndef SYNTHESIS - // FROST DEBUG (TEMP -- remove after the first-timer-IRQ ra-corruption hunt): - // (1) every trap taken -- does a commit write-port collide this cycle (codex - // #3: async interrupt + same-cycle commit corrupting x1/ra + saved PC)? - // (2/3) any commit that writes x1/ra a bogus LOW value (the 0xcc0 panic ra). - // (4) heartbeat every 500k cycles: is the CPU progressing (head_pc advancing) - // or stuck (e.g. a DDR load that never returns) before the first IRQ? 
- int unsigned frost_cyc = 0; - always_ff @(posedge i_clk) begin - if (!i_rst) begin - frost_cyc <= frost_cyc + 1; - if ((frost_cyc % 500000) == 0) - $display("FROST_HB cyc=%0d head_pc=%08x trap_taken=%b cmt_vld=%b p0we=%b p0a=%0d", - frost_cyc, rob_trap_pc, trap_taken, rob_commit_valid, port0_int_we, - port0_int_addr); - if (trap_taken) - $display("FROST_DBG %0t TRAP cause=%08x csr_pc=%08x rob_pc=%08x resume_pc=%08x mepc=%08x | p0we=%b p0a=%0d p0d=%08x p1we=%b p1a=%0d p1d=%08x cmt_vld=%b", - $time, trap_cause_internal, trap_pc_internal, rob_trap_pc, - interrupt_resume_pc, csr_mepc, port0_int_we, port0_int_addr, - port0_int_data, port1_int_we, port1_int_addr, port1_int_data, - rob_commit_valid); - if (port0_int_we && (port0_int_addr == 5'd1) && (port0_int_data < 32'h0000_1000)) - $display("FROST_DBG %0t *** RA<-%08x PORT0 trap_taken=%b rob_trap_pc=%08x mepc=%08x", - $time, port0_int_data, trap_taken, rob_trap_pc, csr_mepc); - if (port1_int_we && (port1_int_addr == 5'd1) && (port1_int_data < 32'h0000_1000)) - $display("FROST_DBG %0t *** RA<-%08x PORT1 trap_taken=%b rob_trap_pc=%08x mepc=%08x", - $time, port1_int_data, trap_taken, rob_trap_pc, csr_mepc); - end - end -`endif - // Use the registered trap/mret pulses when driving the front-end flush so // flush_pipeline no longer rides on the combinational // rob_valid[head_idx] → commit_en → trap_unit → trap_taken diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/README.md b/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/README.md index 8b74a169..5b53bef7 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/README.md +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/README.md @@ -101,15 +101,21 @@ Back-pressure is therefore only ever conservatively long, never short. ## Widen-commit slot 2 The SQ accepts a parallel slot-2 commit port -(`i_commit_valid_2`, `i_commit_rob_tag_2`, plus combinational twin -for the same-cycle partial-flush guard). 
Slot 2 only ever retires +(`i_commit_valid_2`, `i_commit_rob_tag_2`, plus a combinational twin +for the same-cycle flush guard). Slot 2 only ever retires plain stores — SC / AMO are forced onto slot 1 by the ROB's widen-commit hazard gate — so there's no SC-discard path sharing. Forwarding scans both slot 1 and slot 2 commits in the same cycle. +The wrapper now actually drives the combinational twin +(`i_commit_valid_comb_2` / `i_commit_rob_tag_comb_2`, previously tied to +`1'b0`); without it a full-flush trap (e.g. a machine-timer IRQ) could +observe committed-empty and drop a head+1 store the SQ has not yet seen on +the registered commit path. ## Same-cycle commit hazard -When a partial flush and a ROB commit fire on the same cycle, the +When any same-cycle flush races a registered ROB commit — partial-flush +misprediction recovery and full-flush trap / MRET / FENCE.I drains alike — the registered commit signal is one cycle behind the flush, which means the flush could otherwise wipe out a store that's being committed right then. The SQ takes a combinational commit guard from the ROB diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/README.md b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/README.md index 25714efe..2b646992 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/README.md +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/README.md @@ -110,6 +110,20 @@ misprediction-detect path in `cpu_ooo.sv`, and the CDB grants remain combinational so FU adapters can clear their hold registers on the same cycle as a grant. +The registered valid outputs (`o_commit_bus_q_valid`, `o_commit_bus_2_q_valid`) +are additionally masked combinationally with `!i_flush_all`. 
The valid flops +clear on the flush edge, but downstream consumers still observe the previous +valid value during that same cycle; masking immediately prevents a commit that +overlaps a trap / MRET / FENCE.I full flush from performing one more +architectural side effect while the back-end is being squashed. + +The wrapper also drives the SQ slot-2 combinational commit guard from the raw +head+1 store-commit pulse (`i_commit_valid_comb_2 = commit_2_store_like_raw`, +`i_commit_rob_tag_comb_2 = commit_bus_2.tag`; previously tied to `1'b0`/`'0`). +Slot 2 has the same raw-commit race as slot 1: `commit_bus_2_q_valid` reaches the +SQ one cycle late, so without this a full-flush trap (e.g. a machine-timer IRQ) +could observe `sq_committed_empty` and squash a store the SQ has not yet owned. + ### Dispatch routing Dispatch now emits already-routed per-RS packets for slot 1 and slot 2. The diff --git a/verif/README.md b/verif/README.md index 876849d3..03a33f01 100644 --- a/verif/README.md +++ b/verif/README.md @@ -82,6 +82,7 @@ verif/ │ ├── cache/ # Cache hierarchy + line-port arbiter block tests │ ├── cpu_ooo/ # OOO block tests (commit, recovery, memory router, │ │ # register files, perf counters, pipeline control) +│ ├── control/ # Control-block tests (trap_unit interrupt/MRET arbitration) │ └── tomasulo/ # Block-level cocotb tests for Tomasulo submodules │ # (ROB, RAT, RS, dispatch, CDB arbiter, LQ/SQ, FU shims) ├── models/ # Reference models for verification diff --git a/verif/cocotb_tests/test_real_program.py b/verif/cocotb_tests/test_real_program.py index 0c5bd0ce..e1d3c4c8 100644 --- a/verif/cocotb_tests/test_real_program.py +++ b/verif/cocotb_tests/test_real_program.py @@ -651,10 +651,10 @@ def get_expected_behavior() -> tuple[str | None, str | None, bool, str | None]: # Just needs to print the first hello message return (None, "Hello, world!", False, app_name) if app_name == "linux_boot": - # FROST DEBUG (TEMP): run PAST the banner to the first-timer-IRQ - # panic 
so the RTL trace fires; stop at the panic (or time out - # at max_cycles if it hangs). Revert to "Linux version" after. - return (None, "Kernel panic - not syncing", False, app_name) + # Passes once the kernel reaches its boot banner. (Interim + # bring-up criterion; tighten to a userspace/shell marker + # once no-MMU Linux boots that far.) + return (None, "Linux version", False, app_name) if app_name == "uart_echo": # Interactive test handled separately (UART input injection) return (None, None, False, app_name) From 21d5af7a31be11bc9277cbb81013e32c5430768b Mon Sep 17 00:00:00 2001 From: Adam Bagley Date: Sun, 21 Jun 2026 08:23:17 -0400 Subject: [PATCH 14/43] linux_boot: make ret_from_exception patch rebuild-robust The MRET restore-window patch targeted a hardcoded image word offset, which shifts whenever the kernel is rebuilt (e.g. after editing entry.S to strip the bring-up IRQ probes). Locate the target by its unique machine-code word (sc.w == 18c1202f) instead, with an idempotency check (absent + NEW_WORD present => already patched) and an ambiguity guard (>1 occurrence => abort). The patch now survives kernel rebuilds. Context: the U-mode variant of the timer interrupt-resume-PC race is fixed in hardware (cpu_ooo.sv seeds interrupt_resume_pc from csr_mepc on mret_taken), but the M-mode restore-window variant is not yet -- an unpatched kernel hangs at the CLINT clocksource switch once the periodic timer tick ramps up. So this software crutch (clear mstatus.MIE in the restore window) is still required for now. Drop it once the M-mode window is fixed properly (RTL, or a clean kernel change that keeps the sc.w reservation-clear). Makefile comment updated to reflect this. 
--- sw/apps/linux_boot/Makefile | 9 ++ .../linux_boot/patch_ret_from_exception.py | 96 +++++++++---------- 2 files changed, 54 insertions(+), 51 deletions(-) diff --git a/sw/apps/linux_boot/Makefile b/sw/apps/linux_boot/Makefile index 8374d248..975fcee7 100644 --- a/sw/apps/linux_boot/Makefile +++ b/sw/apps/linux_boot/Makefile @@ -24,6 +24,15 @@ # boot; to be replaced by an in-repo build once Linux boots to a shell. ARTIFACTS ?= $(HOME)/bigger_l0/linux-mvp/frost-artifacts +# The ret_from_exception restore-window patch (patch_ret_from_exception.py) +# clears mstatus.MIE in the MRET restore window. The U-mode variant of that +# timer-interrupt-resume-PC race is now fixed in hardware (cpu_ooo.sv seeds +# interrupt_resume_pc from csr_mepc on mret_taken), but the M-mode restore-window +# variant is not yet: an unpatched kernel hangs at the CLINT clocksource switch +# once the periodic timer tick ramps up. So the patch is still required for now. +# The patch locates its target by unique machine-code word, so it survives kernel +# rebuilds. Drop it once the M-mode restore window is fixed properly (RTL or a +# clean kernel change that keeps the sc.w reservation-clear). all: patch_linux_image .PHONY: patch_linux_image diff --git a/sw/apps/linux_boot/patch_ret_from_exception.py b/sw/apps/linux_boot/patch_ret_from_exception.py index 4200bcbe..213f46f5 100644 --- a/sw/apps/linux_boot/patch_ret_from_exception.py +++ b/sw/apps/linux_boot/patch_ret_from_exception.py @@ -13,14 +13,23 @@ mret If the restored mstatus image has MIE set, the timer can preempt between the -CSR write and MRET. The trap then saves mepc at the MRET instruction itself, -which later returns into MRET as user code and produces SIGILL at -ret_from_exception+0x76. - -For bring-up, replace the non-essential reservation-clear SC with -`andi a0, a0, -9`, clearing MIE in the value written to mstatus. MRET still -restores the final interrupt-enable state from MPIE, but the restore window is -not interruptible. 
+CSR write and MRET (an M-mode restore-window race). The trap then saves mepc at +the MRET instruction itself, which later returns into MRET as user code and +produces SIGILL at ret_from_exception+0x76. (The U-mode variant of that race is +fixed in hardware -- cpu_ooo.sv seeds interrupt_resume_pc from csr_mepc on +mret_taken -- but the M-mode restore-window variant is not yet, so this software +crutch is still required: without it the unpatched kernel hangs at the CLINT +clocksource switch once the periodic timer tick ramps up.) + +For bring-up, replace the reservation-clear SC with `andi a0, a0, -9`, clearing +MIE in the value written to mstatus. MRET still restores the final +interrupt-enable state from MPIE, but the restore window is not interruptible. + +The target instruction is located by its unique machine-code word +(`18c1202f`) rather than a fixed offset, so the patch survives kernel rebuilds +that shift ret_from_exception. If the word is absent the image is assumed +already patched (idempotent); if it occurs more than once the patch aborts +rather than risk hitting the wrong site. """ from __future__ import annotations @@ -29,63 +38,48 @@ from pathlib import Path -TARGET_WORD_INDEX = 0x00388B70 // 4 -OLD_WORD = "18c1202f" -NEW_WORD = "ff757513" +OLD_WORD = "18c1202f" # sc.w zero, a2, (sp) -- ret_from_exception reservation clear +NEW_WORD = "ff757513" # andi a0, a0, -9 -- clear mstatus.MIE in the restore value -def patch_dense(path: Path) -> None: +def patch_words(path: Path) -> None: + """Patch the single OLD_WORD occurrence to NEW_WORD. + + Works for both the dense FPGA-loader form (one word per line) and the + $readmemh form (skips '@' directives and blank lines). 
+ """ lines = path.read_text().splitlines() - if TARGET_WORD_INDEX >= len(lines): - raise SystemExit(f"{path}: target word index 0x{TARGET_WORD_INDEX:x} is out of range") - old = lines[TARGET_WORD_INDEX].strip().lower() - if old == NEW_WORD: - return - if old != OLD_WORD: + old_hits = [] + new_hits = 0 + for i, line in enumerate(lines): + s = line.strip().lower() + if not s or s.startswith("@"): + continue + if s == OLD_WORD: + old_hits.append(i) + elif s == NEW_WORD: + new_hits += 1 + if not old_hits: + if new_hits: + return # already patched + raise SystemExit(f"{path}: target word {OLD_WORD} not found (and not already patched)") + if len(old_hits) > 1: raise SystemExit( - f"{path}: expected {OLD_WORD} at word 0x{TARGET_WORD_INDEX:x}, found {old}" + f"{path}: {OLD_WORD} occurs {len(old_hits)}x; ambiguous, refusing to patch" ) - lines[TARGET_WORD_INDEX] = NEW_WORD + lines[old_hits[0]] = NEW_WORD path.write_text("\n".join(lines) + "\n") -def patch_mem(path: Path) -> None: - lines = path.read_text().splitlines() - word_index = 0 - for line_no, line in enumerate(lines): - stripped = line.strip() - if not stripped: - continue - if stripped.startswith("@"): - word_index = int(stripped[1:], 16) - continue - if word_index == TARGET_WORD_INDEX: - old = stripped.lower() - if old == NEW_WORD: - return - if old != OLD_WORD: - raise SystemExit( - f"{path}: expected {OLD_WORD} at word 0x{TARGET_WORD_INDEX:x}, found {old}" - ) - lines[line_no] = NEW_WORD - path.write_text("\n".join(lines) + "\n") - return - word_index += 1 - raise SystemExit(f"{path}: target word index 0x{TARGET_WORD_INDEX:x} not found") - - def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("sw_ddr_mem", type=Path) parser.add_argument("sw_ddr_txt", type=Path) args = parser.parse_args() - patch_mem(args.sw_ddr_mem) - patch_dense(args.sw_ddr_txt) - print( - "Patched Linux ret_from_exception restore window: " - f"word 0x{TARGET_WORD_INDEX:x} {OLD_WORD}->{NEW_WORD}" - ) + 
patch_words(args.sw_ddr_mem) + patch_words(args.sw_ddr_txt) + print(f"Patched Linux ret_from_exception restore window: {OLD_WORD}->{NEW_WORD}") if __name__ == "__main__": From 4b148f471a75db1d13ebda1d6cf96893f915f9da Mon Sep 17 00:00:00 2001 From: Adam Bagley Date: Sun, 21 Jun 2026 09:47:54 -0400 Subject: [PATCH 15/43] handoff: nommu Linux boots to userspace; flaky M-mode timer race isolated Capture the post-0x80388bba-fix bring-up state: kernel boots fully to the /sbin/init handoff on hardware; userspace execution + syscalls + vfork/exec/wait proven working via minimal bFLT test inits; busybox blockers root-caused (bFLT stack 16KB too small -> Buildroot FLAT stack-size fix; 16MiB->64MiB RAM); and the remaining reliability blocker isolated to a residual M-mode machine-timer trap-return race (memory-size- and board-state-independent; ~33-67% flaky; often hangs at the clocksource switch where the unpatched kernel died). Next: directed sim of the M-mode ret_from_exception/MRET restore path to find and fix the residual race. Plus operational notes (autonomous bitstream programming, fast boot-watch, bFLT test technique). --- handoff_linux_timer_irq_panic.md | 72 ++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/handoff_linux_timer_irq_panic.md b/handoff_linux_timer_irq_panic.md index 050accc9..ced2ec22 100644 --- a/handoff_linux_timer_irq_panic.md +++ b/handoff_linux_timer_irq_panic.md @@ -481,3 +481,75 @@ lines as hints only; rely on clean kernel lines and directed sim for proof. 6. Build the directed MRET-to-U plus timer-pending sim. 7. Only after the sim proves or disproves the current hypothesis, decide whether to patch RTL, add Linux restore instrumentation, or ask the user for another bitstream. + +## UPDATE 2026-06-21 (Claude): boots to userspace; one flaky RTL race remains + +The `0x80388bba` panic is fixed (RTL, committed: `718f8cc` + productionization +`3d7766c` + rebuild-robust patch `21d5af7`).
Since then the bring-up went much +further on real Genesys2 hardware. Current state below. + +### Kernel boots fully to the userspace handoff + +No-MMU Linux 6.18.7 boots cleanly reset -> console (`ttyS0` 16550A) -> initramfs +-> `Run /sbin/init` at ~2.8s, clean log. The unpatched-kernel test proved the +hardware `interrupt_resume_pc` fix is the real cure for the U-mode MRET panic +(the `patch_ret_from_exception.py` MIE-clear is a *separate* partial mitigation +for the M-mode variant; see below). + +### Userspace execution is PROVEN working + +Built minimal bFLT test inits with the Buildroot toolchain +(`riscv32-buildroot-linux-uclibc-gcc -O2 -Wl,-elf2flt=-r`; no-MMU userspace is +bFLT `ram gotpic`, NOT plain ELF/FDPIC) and ran them as `/sbin/init`: +- single process `write()` + spin -> prints `USERSPACE_OK`. +- `vfork`+`exec` of a child, child + parent both run -> all markers. +- `vfork`+`exec`+child `_exit`+parent `waitpid` -> all markers, child reaped. + +So U-mode execution, the `ecall` syscall round-trip, process creation, child +MRET-to-U, exit and reap all work. FROST hardware is solid for Linux userspace. + +### Blocker 1 (software, root-caused): BusyBox bFLT stack + RAM + +- `init: out of mem` -> BusyBox's bFLT stack/heap was only `0x3e80` (16 KiB), + too small for no-MMU. `flthdr -s 0x100000 bin/busybox` clears it. PROPER FIX: + set the FLAT stack size in Buildroot (covers all applets, which are symlinks + to the one busybox binary). +- 16 MiB RAM was a temp sim-speed shrink; reverted `MEM_SIZE` to 64 MiB in + `linux-mvp/frost-artifacts/build_fpga_boot.py`. (Memory size does NOT affect + the flaky hang below.) 
+ +### Blocker 2 (RTL, isolated, NOT yet fixed): residual M-mode timer race + +After Blocker 1, the late-kernel boot is intermittently hung (~33-67% of boots), +non-deterministic, at varying points in the timer-active region — frequently at +the exact `[0.14] clocksource: Switched to clocksource clint_clocksource` where +the UNPATCHED kernel hung 100% deterministically. Cheap isolation proved it is: +- NOT memory size: 2/6 flaky at both 32 and 64 MiB. +- NOT DDR / board state: 4/6 flaky on a freshly reprogrammed bitstream. +- => a residual machine-timer-interrupt trap-return race. The U-mode RTL fix + + the MIE-clear patch reduced it from deterministic to flaky but did not close + it. This is the "proper M-mode-window fix" that was deferred — the real + reliability blocker, back in FROST-RTL territory. + +NEXT STEP: directed sim of an M-mode machine-timer interrupt taken through the +`ret_from_exception` restore / MRET path (sweep the timer-injection cycle to hit +the bad window, like the original IRQ-precision tests), find the residual race, +fix in RTL, re-verify on hardware (re-run the 6-boot flaky-rate measurement, +expect 0 hangs). + +### Operational notes for the next agent + +- Program the FPGA yourself: `./fpga/program_bitstream/program_bitstream.py + genesys2` (no need to ask the user). Reprogram to reset board state. +- Hardware boot/reload: `python3 /tmp/linux_boot_watch.py` (loads the patched + image over JTAG via `load_software.py`; no bitstream reprogram needed for + software/kernel changes). It now also breaks on `Run /sbin/init` / `out of + mem` with a 30s post-load window — handy for fast flaky-rate loops (see + `/tmp/flaky_iso.sh`). +- Kernel rebuild: edit `linux-mvp/.../linux-6.18.7/arch/riscv/kernel/entry.S` + then `make -C linux-mvp/buildroot linux-rebuild`; the FROST IRQ debug probes + and the false-triggering `FROST_BAD_RET` canary (it tripped on the legitimate + `RA=0` at first return-to-userspace) are already removed. 
+- `patch_ret_from_exception.py` now locates its target by unique machine-code + word (survives kernel rebuilds). It is STILL REQUIRED until the M-mode race is + fixed; drop it after. From a8f3098a38e72cd903b7e7da4074b07a64e1da28 Mon Sep 17 00:00:00 2001 From: Adam Bagley Date: Sun, 21 Jun 2026 12:32:23 -0400 Subject: [PATCH 16/43] test: add mtimer_stress (M-mode timer+MRET deadlock stress, flaky-hang repro WIP) Synthetic reproducer for the residual M-mode machine-timer trap-return race that intermittently hangs the no-MMU Linux boot. An M-mode loop (loads/stores/ ALU) is preempted by a frequent machine timer whose period is re-armed to a swept value (mtime + 24..87) each tick, so the timer lands at every cycle offset around the MRET across ~10k ticks; a deadlock would stall the loop and time out. Currently PASSES (survives 9851 IRQs) -- i.e. it does NOT yet reproduce the race, same as the existing linux_irq_* DDR tests. Kept as a regression + a starting point: the race needs more faithful conditions (full GPR save/restore to a DDR stack, WFI idle, or the exact clocksource-setup sequence). The full linux_boot sim is DDR-bound and too slow to reach the hang (25M cycles only reached early pre-timer init). 
--- sw/apps/mtimer_stress/Makefile | 5 ++ sw/apps/mtimer_stress/main.c | 125 +++++++++++++++++++++++++++++++++ tests/test_run_cocotb.py | 6 ++ 3 files changed, 136 insertions(+) create mode 100644 sw/apps/mtimer_stress/Makefile create mode 100644 sw/apps/mtimer_stress/main.c diff --git a/sw/apps/mtimer_stress/Makefile b/sw/apps/mtimer_stress/Makefile new file mode 100644 index 00000000..b9a9c7c2 --- /dev/null +++ b/sw/apps/mtimer_stress/Makefile @@ -0,0 +1,5 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# SPDX-License-Identifier: Apache-2.0 +# Machine-timer + MRET deadlock stress test +SRC_C := main.c +include ../../common/common.mk diff --git a/sw/apps/mtimer_stress/main.c b/sw/apps/mtimer_stress/main.c new file mode 100644 index 00000000..4e4b9619 --- /dev/null +++ b/sw/apps/mtimer_stress/main.c @@ -0,0 +1,125 @@ +/* + * Copyright 2026 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Machine-timer + MRET deadlock stress test. + * + * Reproduce target: the residual flaky hang seen booting no-MMU Linux on + * hardware. It is memory-size- and board-state-independent, ~50% of boots, and + * frequently hangs at the first periodic machine-timer interrupts (right after + * the kernel switches to the CLINT clocksource). 
The U-mode interrupt-resume-PC + * fix (cpu_ooo.sv) and the kernel MIE-clear patch made it flaky instead of + * deterministic but did not close it -> a residual machine-timer trap-return + * race in the FROST trap/MRET/flush machinery. + * + * This is the full linux_boot in miniature: an M-mode loop preempted by a + * machine timer firing very frequently, the handler doing a real MRET back to + * the loop, with the timer PHASE swept (period re-armed to mtime + 24..87 each + * tick) so the timer lands at every cycle offset around the MRET / in the loop + * across many thousands of ticks. If a timer landing at a bad cycle deadlocks + * the pipeline, the loop counter stops advancing and `<<PASS>>` is never + * printed -> the cocotb harness times out (reproduced). If it survives all + * phases for the whole run, it prints `<<PASS>>`. + */ + +#include <stdint.h> + +#include "trap.h" + +static void uart_putc(char c) { UART_TX = (uint8_t) c; } +static void uart_puts(const char *s) { while (*s) uart_putc(*s++); } +static void uart_hex(uint32_t v) +{ + static const char hex[] = "0123456789ABCDEF"; + uart_puts("0x"); + for (int i = 28; i >= 0; i -= 4) + uart_putc(hex[(v >> i) & 0xF]); +} + +volatile uint32_t g_irq; /* timer-interrupt count (also drives the phase sweep) */ +volatile uint32_t g_loop; /* loop progress marker */ +static volatile uint32_t buf[64]; + +/* + * Naked M-mode timer handler: re-arm the timer to fire again in 24..87 cycles + * (period = 24 + (g_irq & 0x3f), so the phase relative to the loop/MRET drifts + * every tick and sweeps the whole window), bump g_irq, and MRET back to the + * interrupted loop. Trap entry cleared MIE; the MRET restores it from MPIE, so + * the next timer fires back in the loop -- exactly the kernel's pattern with + * the MIE-clear patch applied. Saves only the regs it uses; everything else is + * preserved by not touching it.
+ */ +__attribute__((naked, aligned(4))) static void mtimer_handler(void) +{ + __asm__ volatile( + "addi sp, sp, -16\n" + "sw t0, 0(sp)\n" + "sw t1, 4(sp)\n" + "sw t2, 8(sp)\n" + "lui t0, %hi(g_irq)\n" + "lw t1, %lo(g_irq)(t0)\n" + "andi t2, t1, 0x3f\n" + "addi t2, t2, 24\n" /* period = 24 + (g_irq & 0x3f) */ + "addi t1, t1, 1\n" + "sw t1, %lo(g_irq)(t0)\n" /* g_irq++ */ + "li t0, 0x40000010\n" /* MTIME_LO */ + "lw t1, 0(t0)\n" + "add t1, t1, t2\n" + "li t0, 0x40000018\n" /* MTIMECMP_LO (HI stays 0, set in main) */ + "sw t1, 0(t0)\n" + "lw t0, 0(sp)\n" + "lw t1, 4(sp)\n" + "lw t2, 8(sp)\n" + "addi sp, sp, 16\n" + "mret\n"); +} + +int main(void) +{ + uart_puts("\r\n=== mtimer MRET deadlock stress ===\r\n"); + set_trap_handler(&mtimer_handler); + for (int i = 0; i < 64; i++) + buf[i] = (uint32_t) i; + + /* Arm a frequent machine timer; handler re-arms each tick (phase sweep). */ + MTIMECMP_HI = 0; + MTIMECMP_LO = (uint32_t) rdmtime() + 40; + enable_timer_interrupt(); /* mie.MTIE */ + enable_interrupts(); /* mstatus.MIE */ + + /* Loop with loads/stores/ALU so the timer preempts varied pipeline state + * (in-flight memory ops, branches) at every swept phase. 
*/ + uint32_t acc = 0; + for (uint32_t i = 0; i < 20000u; i++) { + g_loop = i; + uint32_t k = i & 63u; + acc += buf[k]; + acc ^= (acc << 1) | (acc >> 3); + buf[k] = acc + i; + } + + disable_timer_interrupt(); + uart_puts("survived: loop="); + uart_hex(g_loop); + uart_puts(" irqs="); + uart_hex(g_irq); + uart_puts(" acc="); + uart_hex(acc); + uart_puts("\r\n<<PASS>>\r\n"); + for (;;) { + } + return 0; +} diff --git a/tests/test_run_cocotb.py b/tests/test_run_cocotb.py index 910890ff..f6c3a4c5 100755 --- a/tests/test_run_cocotb.py +++ b/tests/test_run_cocotb.py @@ -187,6 +187,12 @@ class CocotbRunConfig: app_name="mret_timer_resume_test", description="MRET-to-U + pending-timer mepc directed test (stale interrupt resume PC)", ), + "mtimer_stress": CocotbRunConfig( + python_test_module="cocotb_tests.test_real_program", + hdl_toplevel_module="frost", + app_name="mtimer_stress", + description="M-mode machine-timer + MRET deadlock stress (phase-swept; flaky-hang repro)", + ), "ns16550_test": CocotbRunConfig( python_test_module="cocotb_tests.test_real_program", hdl_toplevel_module="frost", From 3bc3ecd82a3f971cacb5583e0074eec432a22bca Mon Sep 17 00:00:00 2001 From: Adam Bagley Date: Sun, 21 Jun 2026 12:32:54 -0400 Subject: [PATCH 17/43] handoff: M-mode race hunt status (repro approach scoped, not yet reproduced) --- handoff_linux_timer_irq_panic.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/handoff_linux_timer_irq_panic.md b/handoff_linux_timer_irq_panic.md index ced2ec22..ffd3faf1 100644 --- a/handoff_linux_timer_irq_panic.md +++ b/handoff_linux_timer_irq_panic.md @@ -553,3 +553,29 @@ expect 0 hangs). - `patch_ret_from_exception.py` now locates its target by unique machine-code word (survives kernel rebuilds). It is STILL REQUIRED until the M-mode race is fixed; drop it after.
+ +### M-mode race hunt status (in progress) + +Reproduction approach scoped; race not yet reproduced in sim: +- Full `linux_boot` in Verilator is NOT viable: it is DDR-latency-bound, so 25M + cycles only reached the early pre-timer `[0.000000] SLUB` line. Reaching the + clocksource switch (~18.6M *instructions*) would take hundreds of millions of + cycles / many hours. Don't retry full-kernel sim for this. +- New synthetic reproducer `sw/apps/mtimer_stress/` (registered; run from + `frost/tests` with `COCOTB_MAX_CYCLES=3000000`): M-mode loop preempted by a + frequent machine timer with the period swept (mtime + 24..87) each tick to hit + every cycle offset around the MRET. It PASSES (survived 9851 IRQs) -> does NOT + reproduce the race. The existing `linux_irq_*_test` DDR tests also pass. So the + trigger is more specific than "M-mode timer + MRET + phase sweep". +- The `SQ: allocation attempted during flush` $warning (store_queue.sv:1178) + fires during the stress but is a generic, handled condition (test passes) -- + likely benign, not the race. + +Next ideas to try (untried): (a) a handler that saves/restores ALL 31 GPRs to a +DDR stack like the kernel (heavy in-flight DDR mem-ops during the trap-return +flush); (b) a WFI-idle + timer-wake deadlock stress (the clocksource hang may be +the first idle WFI not waking); (c) deep RTL analysis of deadlock paths in the +SERIAL FSM (rob_serializer.sv: SERIAL_MRET_EXEC / SERIAL_TRAP_WAIT waiting +forever) and the SQ/LQ full-flush drain (load_queue.sv ~1140-1152) when a timer +flush races in-flight memory ops; (d) if reliability is needed sooner, a stronger +kernel-side interrupt mask around the critical windows as an interim mitigation. 
From 3c3084988119485fbbeab8ddbd406598bcd38416 Mon Sep 17 00:00:00 2001 From: Adam Bagley Date: Sun, 28 Jun 2026 02:11:13 -0400 Subject: [PATCH 18/43] =?UTF-8?q?checkpoint:=20no-MMU=20Linux=20bring-up?= =?UTF-8?q?=20=E2=80=94=20fetch=20fix=20+=20memory/IRQ=20WIP?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HW-verified on genesys2: Coremark Pro (9 workloads), Coremark, FreeRTOS demo, and isa_test all pass. No-MMU Linux 6.18.7 boots past the ___slab_alloc AMO wedge and the 0x38d7fa RVC fetch-misalign into the timer phase (furthest yet); next blocker is a timer-IRQ-dispatch data corruption, still open. - if_stage/pc_controller/fetch_provider/cpu_and_mem/cpu_ooo: served-window resteer fix for the RVC fetch/decode alignment desync (passes isa + all fetch regressions) - load_queue/lq_issue_selector: AMO-deadlock breaker (clears ___slab_alloc) - trap_unit/reorder_buffer/sq_forwarding/store_queue/tomasulo_wrapper: interrupt + memory-path WIP - new cocotb directed tests + bare-metal repros (sw/apps/*) + registry - fpga/load_software DDR loader + linux_boot kernel-image patch script WIP checkpoint: verified better than prior state, not final. 
--- README.md | 12 +- boards/genesys2/genesys2_frost.sv | 5 +- boards/xilinx_frost_subsystem.sv | 12 +- fpga/load_software/file_to_bram.tcl | 51 +- fpga/load_software/file_to_ddr.tcl | 34 +- fpga/load_software/load_software.py | 10 +- fpga/load_software/load_software.tcl | 20 +- handoff_linux_timer_irq_panic.md | 581 ---------- hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv | 51 +- hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv | 34 +- hw/rtl/cpu_and_mem/cpu/if_stage/if_stage.sv | 46 +- .../cpu_and_mem/cpu/if_stage/pc_controller.sv | 19 +- .../cpu/tomasulo/load_queue/load_queue.sv | 143 ++- .../tomasulo/load_queue/lq_issue_selector.sv | 80 +- .../tomasulo/reorder_buffer/reorder_buffer.sv | 36 +- .../store_queue/sq_forwarding_unit.sv | 66 +- .../cpu/tomasulo/store_queue/store_queue.sv | 3 +- .../tomasulo_wrapper/tomasulo_wrapper.sv | 48 +- hw/rtl/cpu_and_mem/cpu_and_mem.f | 3 + hw/rtl/cpu_and_mem/cpu_and_mem.sv | 156 ++- hw/rtl/cpu_and_mem/fetch_provider.sv | 6 + hw/rtl/cpu_and_mem/hang_triage.sv | 354 ++++++ hw/rtl/frost.sv | 7 +- hw/rtl/peripherals/uart_rx.sv | 2 +- hw/sim/cpu_tb.sv | 81 +- sw/apps/ddr_atomic_test/main.c | 55 + sw/apps/drain_trapframe_test/Makefile | 20 + sw/apps/drain_trapframe_test/main.c | 511 +++++++++ sw/apps/fetch_stall_repro/Makefile | 84 ++ sw/apps/fetch_stall_repro/fetch_stall_repro.S | 125 ++ sw/apps/irq_mie_window/Makefile | 19 + sw/apps/irq_mie_window/main.c | 127 +++ .../linux_boot/patch_ret_from_exception.py | 1016 ++++++++++++++++- sw/apps/linux_clksrc_faithful/Makefile | 19 + sw/apps/linux_clksrc_faithful/main.c | 343 ++++++ sw/apps/linux_irq_active_ddr_test/main.c | 645 ++++++++--- sw/apps/mret_drain_deadlock/Makefile | 22 + sw/apps/mret_drain_deadlock/main.c | 171 +++ sw/apps/pde_return_hazard/Makefile | 21 + sw/apps/pde_return_hazard/main.c | 843 ++++++++++++++ sw/apps/smc_fencei_test/Makefile | 19 + sw/apps/smc_fencei_test/main.c | 172 +++ sw/apps/trap_s2l_fwd/Makefile | 18 + sw/apps/trap_s2l_fwd/main.c | 158 +++ 
sw/apps/wfi_drain_mepc_test/Makefile | 20 + sw/apps/wfi_drain_mepc_test/main.c | 167 +++ sw/apps/wfi_lost_tick/Makefile | 19 + sw/apps/wfi_lost_tick/main.c | 146 +++ tests/test_run_cocotb.py | 97 ++ verif/cocotb_tests/control/test_trap_unit.py | 41 +- verif/cocotb_tests/if_stage/test_if_stage.py | 105 ++ verif/cocotb_tests/test_directed_traps.py | 428 +++++++ verif/cocotb_tests/test_helpers.py | 72 +- verif/cocotb_tests/test_real_program.py | 95 +- .../tomasulo/load_queue/test_load_queue.py | 154 +++ .../tomasulo_wrapper/test_tomasulo_wrapper.py | 8 +- 56 files changed, 6654 insertions(+), 946 deletions(-) delete mode 100644 handoff_linux_timer_irq_panic.md create mode 100644 hw/rtl/cpu_and_mem/hang_triage.sv create mode 100644 sw/apps/drain_trapframe_test/Makefile create mode 100644 sw/apps/drain_trapframe_test/main.c create mode 100644 sw/apps/fetch_stall_repro/Makefile create mode 100644 sw/apps/fetch_stall_repro/fetch_stall_repro.S create mode 100644 sw/apps/irq_mie_window/Makefile create mode 100644 sw/apps/irq_mie_window/main.c create mode 100644 sw/apps/linux_clksrc_faithful/Makefile create mode 100644 sw/apps/linux_clksrc_faithful/main.c create mode 100644 sw/apps/mret_drain_deadlock/Makefile create mode 100644 sw/apps/mret_drain_deadlock/main.c create mode 100644 sw/apps/pde_return_hazard/Makefile create mode 100644 sw/apps/pde_return_hazard/main.c create mode 100644 sw/apps/smc_fencei_test/Makefile create mode 100644 sw/apps/smc_fencei_test/main.c create mode 100644 sw/apps/trap_s2l_fwd/Makefile create mode 100644 sw/apps/trap_s2l_fwd/main.c create mode 100644 sw/apps/wfi_drain_mepc_test/Makefile create mode 100644 sw/apps/wfi_drain_mepc_test/main.c create mode 100644 sw/apps/wfi_lost_tick/Makefile create mode 100644 sw/apps/wfi_lost_tick/main.c diff --git a/README.md b/README.md index 62f42e1f..8d394af7 100644 --- a/README.md +++ b/README.md @@ -353,12 +353,12 @@ controller calibrates, so software never observes an uninitialized main memory. 
| Resource | Used | Available | Util% | |----------|-----:|----------:|------:| -| Slice LUTs | 129,853 | 203,800 | 63.7% | -| LUT as Logic | 121,241 | 203,800 | 59.5% | -| LUT as Distributed RAM | 7,768 | — | — | -| LUT as Shift Register | 844 | — | — | -| Slice Registers | 87,128 | 407,600 | 21.4% | -| Block RAM Tile | 189.5 | 445 | 42.6% | +| Slice LUTs | 130,263 | 203,800 | 63.9% | +| LUT as Logic | 121,656 | 203,800 | 59.7% | +| LUT as Distributed RAM | 7,762 | — | — | +| LUT as Shift Register | 845 | — | — | +| Slice Registers | 87,277 | 407,600 | 21.4% | +| Block RAM Tile | 219 | 445 | 49.2% | | DSPs | 36 | 840 | 4.3% | | F7 Muxes | 98 | 101,900 | 0.1% | | F8 Muxes | 33 | 50,950 | 0.1% | diff --git a/boards/genesys2/genesys2_frost.sv b/boards/genesys2/genesys2_frost.sv index e47c9218..0e519104 100644 --- a/boards/genesys2/genesys2_frost.sv +++ b/boards/genesys2/genesys2_frost.sv @@ -196,7 +196,10 @@ module genesys2_frost ( // backed by the DDR3 controller through the AXI port below. .ENABLE_CACHED_TIER(1), .CACHED_HAS_L2(0), - .USE_BEHAVIORAL_DDR(0) + .USE_BEHAVIORAL_DDR(0), + // Bump L1I 16 KiB -> 128 KiB: hold the kernel tick/softirq/scheduler + // working set to defeat the periodic-tick catch-up livelock (no L2 here). + .L1I_CACHE_BYTES(128 * 1024) ) subsystem ( .i_clk(main_clock), .i_clk_div4(divided_clock_by_4), diff --git a/boards/xilinx_frost_subsystem.sv b/boards/xilinx_frost_subsystem.sv index 756dc332..7e824f09 100644 --- a/boards/xilinx_frost_subsystem.sv +++ b/boards/xilinx_frost_subsystem.sv @@ -32,7 +32,13 @@ module xilinx_frost_subsystem #( // 1 = the cached tier ends in the simulation-only behavioral DDR model; // 0 = it ends at the o_ddr_axi_*/i_ddr_axi_* ports below, wired to the // board's DDR controller subsystem (both boards drive 0). - parameter int unsigned USE_BEHAVIORAL_DDR = 1 + parameter int unsigned USE_BEHAVIORAL_DDR = 1, + // L1 instruction-cache size in bytes. 
genesys2 (L1-only, no L2) bumps this + // above the 16 KiB default so the kernel periodic-tick/softirq/scheduler + // working set stays resident, addressing the tick-livelock I$ thrash. + parameter int unsigned L1I_CACHE_BYTES = 16 * 1024, + // Optional boot-hang UART classifier. Leave off for interactive testing. + parameter int unsigned ENABLE_HANG_TRIAGE = 0 ) ( input logic i_clk, // Main CPU clock input logic i_clk_div4, // Divided clock for JTAG/UART (1/4 of main clock) @@ -217,7 +223,9 @@ module xilinx_frost_subsystem #( .CLK_FREQ_HZ(CLK_FREQ_HZ), .ENABLE_CACHED_TIER(ENABLE_CACHED_TIER), .CACHED_HAS_L2(CACHED_HAS_L2), - .USE_BEHAVIORAL_DDR(USE_BEHAVIORAL_DDR) + .USE_BEHAVIORAL_DDR(USE_BEHAVIORAL_DDR), + .L1I_CACHE_BYTES(L1I_CACHE_BYTES), + .ENABLE_HANG_TRIAGE(ENABLE_HANG_TRIAGE) ) frost_processor ( .i_clk(i_clk), .i_clk_div4(i_clk_div4), diff --git a/fpga/load_software/file_to_bram.tcl b/fpga/load_software/file_to_bram.tcl index 20647787..7ff2727a 100644 --- a/fpga/load_software/file_to_bram.tcl +++ b/fpga/load_software/file_to_bram.tcl @@ -18,27 +18,62 @@ # Reads hex file (one 32-bit word per line) and writes to BRAM through # JTAG-to-AXI bridge. Used for loading software without reprogramming FPGA. 
-proc file2bram {base_memory_address firmware_filename {axi_interface_name hw_axi_1}} { +proc _file2bram_rearm_image_load_reset {axi_interface_name base_memory_address rearm_word} { + set old_txn [get_hw_axi_txns -quiet bramrstkeep] + if {[llength $old_txn] > 0} { + delete_hw_axi_txn $old_txn + } + create_hw_axi_txn bramrstkeep [get_hw_axis $axi_interface_name] \ + -type write -address [format 0x%08x $base_memory_address] -len 1 -data $rearm_word + run_hw_axi [get_hw_axi_txns bramrstkeep] + delete_hw_axi_txn [get_hw_axi_txns bramrstkeep] +} + +proc file2bram {base_memory_address firmware_filename {axi_interface_name hw_axi_1} {batch_limit 64}} { # Open firmware file (text format: 8 hex digits per line) set file_descriptor [open $firmware_filename r] set current_address $base_memory_address set transaction_number 0 + set batch_word_count 0 + set total_words 0 + set first_word "" - # Read file line by line - each line is one 32-bit word in hexadecimal + # Read file line by line - each line is one 32-bit word in hexadecimal. + # Run bounded batches so the hardware image-load reset one-shot cannot + # expire while Vivado is blocked inside one very large run_hw_axi call. 
while {[gets $file_descriptor word_hex_value] >= 0} { + set word_hex_value [string trim $word_hex_value] + if {$word_hex_value eq ""} { + continue + } + if {$first_word eq ""} { + set first_word $word_hex_value + } + set formatted_address [format 0x%08x $current_address] - # Create AXI write transaction for this word - create_hw_axi_txn wr$transaction_number [get_hw_axis $axi_interface_name] \ + create_hw_axi_txn bramwr$batch_word_count [get_hw_axis $axi_interface_name] \ -type write -address $formatted_address -len 1 -data $word_hex_value + incr batch_word_count incr transaction_number - # Move to next word (4 bytes) + incr total_words incr current_address 4 + + if {$batch_word_count >= $batch_limit} { + run_hw_axi [get_hw_axi_txns bramwr*] + delete_hw_axi_txn [get_hw_axi_txns bramwr*] + set batch_word_count 0 + if {$first_word ne ""} { + _file2bram_rearm_image_load_reset $axi_interface_name $base_memory_address $first_word + } + } } close $file_descriptor - # Execute all queued AXI transactions - run_hw_axi [get_hw_axi_txns] + if {$batch_word_count > 0} { + run_hw_axi [get_hw_axi_txns bramwr*] + delete_hw_axi_txn [get_hw_axi_txns bramwr*] + } - puts "Loaded $transaction_number words starting at [format 0x%08x $base_memory_address]" + puts "Loaded $total_words words starting at [format 0x%08x $base_memory_address] in bounded batches" } diff --git a/fpga/load_software/file_to_ddr.tcl b/fpga/load_software/file_to_ddr.tcl index 217cbec1..40c0f852 100644 --- a/fpga/load_software/file_to_ddr.tcl +++ b/fpga/load_software/file_to_ddr.tcl @@ -21,11 +21,31 @@ # Addresses are REGION-RELATIVE: offset 0 = the base of the 1 GiB cached # region (0x8000_0000 in the CPU's address map). The CPU must be held in # reset while this runs (the image-load reset in xilinx_frost_subsystem -# asserts on low-BRAM writes, which the loader always performs afterwards; -# the caches re-invalidate on that reset, so the freshly written DDR contents -# are never shadowed by stale lines). 
+# asserts on low-BRAM writes; the caches re-invalidate on that reset, so the +# freshly written DDR contents are never shadowed by stale lines). +# +# CRITICAL: the image_load_reset is a ~4 s one-shot counter re-armed by each +# low-BRAM write. A multi-MB DDR image takes much longer than 4 s to burst in, +# so a single pre-load BRAM write is NOT enough -- the counter expires +# mid-load, the CPU comes out of reset, and free-runs against the half-written +# DDR image (nondeterministic -> flaky boot hangs). When bram_axi_name is +# given we re-arm the reset with a dummy low-BRAM write before each batch of +# batch_limit bursts (sub-second << 4 s), holding the CPU in reset for the ENTIRE load. +# The DDR loader (S01) is a separate AXI master and keeps running while the CPU +# is held, so the load still completes. + +# Re-arm the image-load CPU reset with a single low-BRAM write (restarts the +# subsystem's ~4 s reset counter). Called right before every blocking DDR batch +# run so the counter can never expire mid-load and let the CPU free-run. 
+proc _rearm_image_load_reset {bram_axi_name rearm_word} { + if {$bram_axi_name eq ""} return + create_hw_axi_txn rstkeep [get_hw_axis $bram_axi_name] \ + -type write -address 0x00000000 -len 1 -data $rearm_word + run_hw_axi [get_hw_axi_txns rstkeep] + delete_hw_axi_txn [get_hw_axi_txns rstkeep] +} -proc file2ddr {firmware_filename {axi_interface_name hw_axi_2} {burst_words 256}} { +proc file2ddr {firmware_filename {axi_interface_name hw_axi_2} {burst_words 256} {bram_axi_name ""} {rearm_word "00000000"}} { set file_descriptor [open $firmware_filename r] @@ -41,7 +61,7 @@ proc file2ddr {firmware_filename {axi_interface_name hw_axi_2} {burst_words 256} set transaction_number 0 set total_words 0 set batch 0 - set batch_limit 512 + set batch_limit 128 ;# small batches so each blocking run_hw_axi stays well under the ~4 s reset counter while {1} { # Collect up to burst_words words for this burst (skipping blank lines, @@ -69,6 +89,9 @@ proc file2ddr {firmware_filename {axi_interface_name hw_axi_2} {burst_words 256} incr total_words $beats incr current_address [expr {4 * $beats}] if {$batch >= $batch_limit} { + # Re-arm the reset IMMEDIATELY before the blocking batch run (the only + # loop step long enough to risk the ~4 s counter expiring mid-load). 
+ _rearm_image_load_reset $bram_axi_name $rearm_word run_hw_axi [get_hw_axi_txns ddrwr*] delete_hw_axi_txn [get_hw_axi_txns ddrwr*] set batch 0 @@ -79,6 +102,7 @@ proc file2ddr {firmware_filename {axi_interface_name hw_axi_2} {burst_words 256} close $file_descriptor if {$batch > 0} { + _rearm_image_load_reset $bram_axi_name $rearm_word run_hw_axi [get_hw_axi_txns ddrwr*] delete_hw_axi_txn [get_hw_axi_txns ddrwr*] } diff --git a/fpga/load_software/load_software.py b/fpga/load_software/load_software.py index 0f3dec56..4353d60b 100755 --- a/fpga/load_software/load_software.py +++ b/fpga/load_software/load_software.py @@ -47,6 +47,7 @@ *COREMARK_PRO_APP_NAMES, "csr_test", "ddr_exec_test", + "ddr_atomic_test", "ddr_heap_test", "ddr_smc_test", "ddr_test", @@ -61,6 +62,7 @@ "linux_irq_stack_slot_test", "memory_test", "packet_parser", + "pde_return_hazard", "print_clock_speed", "ras_stress_test", "ras_test", @@ -93,6 +95,7 @@ # that address range reads back zero. Rejected below until then. DDR_APPS = frozenset(COREMARK_PRO_APP_NAMES) | { "ddr_exec_test", + "ddr_atomic_test", "ddr_heap_test", "ddr_smc_test", "ddr_test", @@ -100,6 +103,7 @@ "linux_boot", "linux_irq_ddr_test", "linux_irq_stack_slot_test", + "pde_return_hazard", } @@ -451,7 +455,11 @@ def main() -> None: elif args.coremark_pro_mode == "validation": print(" CoreMark-PRO run type: validation (-v1)") if not compile_app_for_board( - args.software_app, app_dir, clock_freq, coremark_iterations, make_vars, + args.software_app, + app_dir, + clock_freq, + coremark_iterations, + make_vars, mem_config="ddr" if args.ddr else None, ): print(f"Error: Failed to compile {args.software_app}", file=sys.stderr) diff --git a/fpga/load_software/load_software.tcl b/fpga/load_software/load_software.tcl index 3a2fba0c..edfbb336 100644 --- a/fpga/load_software/load_software.tcl +++ b/fpga/load_software/load_software.tcl @@ -39,10 +39,10 @@ set coremark_pro_apps [list coremark_pro_core coremark_pro_cjpeg \ # Valid software applications 
(mirrors load_software.py VALID_APPS) set valid_apps [list branch_pred_test c_ext_test call_stress cf_ext_test coremark \ - {*}$coremark_pro_apps csr_test ddr_exec_test ddr_heap_test \ + {*}$coremark_pro_apps csr_test ddr_atomic_test ddr_exec_test ddr_heap_test \ ddr_smc_test ddr_test freertos_demo fpu_assembly_test fpu_test \ hello_world isa_test linux_irq_active_ddr_test linux_boot linux_irq_ddr_test linux_irq_stack_slot_test memory_test \ - packet_parser print_clock_speed ras_stress_test ras_test \ + packet_parser pde_return_hazard print_clock_speed ras_stress_test ras_test \ spanning_test sprintf_test strings_test tomasulo_perf \ tomasulo_test uart_echo] @@ -165,10 +165,14 @@ set bram_base_address 0x00000000 set ddr_text_file ${project_root}/sw/apps/${firmware_application_name}/sw_ddr.txt # DDR image first (when present): assert the image-load CPU reset with a -# single low-BRAM write, then burst the DDR image through hw_axi_2. The CPU -# stays in reset until well after the subsequent full BRAM load, and the -# caches re-invalidate on release, so the fresh DDR contents are never -# shadowed by stale lines or racing writebacks. +# low-BRAM write, then burst the DDR image through hw_axi_2 while RE-ARMING +# that reset periodically (file2ddr re-arms via bram_axi before each batch of bursts). +# The image_load_reset is only a ~4 s one-shot, far shorter than a multi-MB DDR +# load, so without the periodic re-arm the CPU would leave reset mid-load and +# free-run against the half-written DDR image (nondeterministic boot hangs). +# With it the CPU stays in reset until well after the subsequent full BRAM +# load, and the caches re-invalidate on release, so the fresh DDR contents are +# never shadowed by stale lines or racing writebacks. 
if { $has_ddr && $ddr_axi ne "" && [file exists $ddr_text_file] && [file size $ddr_text_file] > 12 } { set first_word_fd [open $firmware_text_file r] gets $first_word_fd first_word @@ -177,8 +181,8 @@ if { $has_ddr && $ddr_axi ne "" && [file exists $ddr_text_file] && [file size $d -type write -address 0x00000000 -len 1 -data $first_word run_hw_axi [get_hw_axi_txns rst_assert] set ddr_word_count [expr {[file size $ddr_text_file] / 9}] - puts "Loading ~${ddr_word_count} words into DDR via ${ddr_axi} (bursts)..." - file2ddr $ddr_text_file $ddr_axi + puts "Loading ~${ddr_word_count} words into DDR via ${ddr_axi} (bursts, CPU held in reset)..." + file2ddr $ddr_text_file $ddr_axi 256 $bram_axi $first_word } # Write software to low BRAM starting at address 0. diff --git a/handoff_linux_timer_irq_panic.md b/handoff_linux_timer_irq_panic.md deleted file mode 100644 index ffd3faf1..00000000 --- a/handoff_linux_timer_irq_panic.md +++ /dev/null @@ -1,581 +0,0 @@ -# Fresh handoff - FROST no-MMU Linux boot on Genesys2 - -Last updated by Codex: 2026-06-21. Latest hardware run described here was on -2026-06-20. This file is meant to be self-contained for a fresh agent. - -## Mission - -Boot no-MMU M-mode Linux on real Genesys2 hardware with the FROST RV32 out-of-order -core. Do not treat this as only a single bug fix. The larger goal is real hardware -Linux bring-up. - -Current state (updated 2026-06-21 by Claude): the `0x80388bba` panic root cause is -now PROVEN in directed simulation and FIXED in RTL. The fix is verified in sim but -NOT yet on hardware. The next step is a Genesys2 bitstream rebuild with the -`cpu_ooo.sv` change and a hardware Linux boot re-test. See -"## RESOLVED 2026-06-21: stale interrupt_resume_pc across MRET-to-U (proven + fixed)" -below for the full proof, the one-line-class RTL fix, and the new directed test. 
- -## RESOLVED 2026-06-21: stale interrupt_resume_pc across MRET-to-U (proven + fixed) - -### Proven root cause - -An MRET that returns below M-mode retires through the trap/MRET **full flush**, NOT -through the normal commit path: - -- `o_mret_taken` asserts combinationally on the `o_mret_start` cycle (call it T). -- One cycle later (T+1) `mret_taken_reg` is high, and `misprediction_flush_controller` - drives `flush_all` combinationally from it. `flush_all` wipes the ROB head and gates - `commit_en` (reorder_buffer.sv), so the MRET is squashed and **never appears on - `rob_commit_valid_raw`**. -- `interrupt_resume_pc` (cpu_ooo.sv) only updates on a valid ROB commit, so the MRET - never refreshes it. It keeps the architectural next-PC of the instruction *before* - the MRET — which equals the MRET instruction's own PC (in Linux, the `c.lwsp - sp,8(sp)` at `0x80388bb8` makes that exactly `0x80388bba`). -- `trap_unit` only inhibits interrupts at T and T+1 (`i_mret_start || mret_taken_prev`). - From T+3 onward (priv = U, inhibit dropped, registered timer re-eligible) until the - first post-MRET instruction commits, a machine timer is taken and saves - `mepc = interrupt_resume_pc = <MRET PC>`. -- Linux later restores that trap frame and `mret`s to the kernel MRET PC while in - U-mode → illegal instruction (signal 4) → "Attempted to kill init". - -This was confirmed two ways: independent static trace across -rob_serializer / reorder_buffer / misprediction_flush_controller / trap_unit / csr_file, -and a directed cocotb sim (below). The kernel disassembly was verified: runtime -`0x80388bba` is `mret` (0x30200073) at `ret_from_exception+0x76`, preceded by a 2-byte -non-branch `c.lwsp sp,8(sp)` — so `interrupt_resume_pc == 0x80388bba` pre-MRET is -legitimate, and `ret_from_exception` is the unified (U and M) return path. - -### Directed test (new) - -`sw/apps/mret_timer_resume_test/` (registered in `tests/test_run_cocotb.py`). 
It is a -focused variant of `umode_test`'s timer-preempts-U case: it makes the machine timer -*already pending* (`mtimecmp = 0`) before an MRET-to-U, and the naked M-mode handler -additionally records `mepc`. It asserts the saved resume PC is the U-mode target -(`&u_spin`), never the MRET's own PC. - -Run it the standard way (`frost/tests`, `make clean`, `./test_run_cocotb.py -mret_timer_resume_test`). At low addresses the analog of `0x80388bba` is the inlined -`run_in_umode_pending_timer` MRET at `0x1c6`; the correct resume PC is `u_spin` at -`0xea`. - -- BEFORE fix: `cause=0x80000007 from_priv=0x0 resume_mepc=0x000001C6` → `<<FAIL>>` - (the bug: mepc = MRET PC). -- AFTER fix: `cause=0x80000007 from_priv=0x0 resume_mepc=0x000000EA` → `<<PASS>>`. - -The `FROST_DBG ... TRAP` probe shows the same timer trap (cycle 1095000) flipping -`resume_pc` from the stale `1c6` to `ea`, while the live `rob_pc` stays `1c6` — i.e. -the resume PC is now correctly decoupled from the squashed MRET head. - -### The fix (RTL) - -`hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv`, the `interrupt_resume_pc` always_ff: add a -highest-priority branch that seeds it from the MRET target the cycle `mret_taken` -fires, so the U-target is in place before the inhibit window closes: - -```systemverilog -end else if (mret_taken) begin - interrupt_resume_pc <= csr_mepc; // MRET retires via flush, never via commit; - // seed the resume PC from the MRET target now -end else if (rob_commit_2_valid_raw) begin - ... -``` - -`csr_mepc` is stable at that cycle (MRET does not write mepc; a trap entry that would -cannot coincide with `mret_taken`), and it equals the MRET redirect target. No -regression to the normal precise-interrupt-resume path: for non-MRET interrupts and -the WFI/empty-ROB case the commit branches are unchanged; nothing commits on the -`mret_taken` cycle (serializer has `commit_stall=1`), so the new branch never steals a -real commit's update. 
It is a narrow 1-bit select on a non-critical register (feeds -only `trap_unit.i_interrupt_pc`), so it should be timing-benign. - -### Next step (hardware) - -Rebuild the Genesys2 bitstream with this `cpu_ooo.sv` change and re-run the hardware -Linux boot (`python3 /tmp/linux_boot_watch.py`). The `0x80388bba` user-mode-MRET -illegal-instruction panic should no longer occur. If a new/different failure appears, -treat it as a fresh symptom — this specific stale-resume-PC mechanism is now closed. - -## Environment - -FROST repo: - -```text -/home/adam-bagley/bigger_l0/frost -``` - -Relevant external Linux tree: - -```text -/home/adam-bagley/bigger_l0/linux-mvp/buildroot/output/build/linux-6.18.7 -``` - -Hardware: - -- Genesys2 / Kintex-7. -- UART is `/dev/ttyUSB0`, 115200 8N1. -- The user programs FPGA bitstreams manually and tells the agent when the FPGA is ready. -- Use the boot-watch script rather than minicom for capture. - -Hardware boot command: - -```sh -python3 /tmp/linux_boot_watch.py -``` - -Latest synchronized UART log: - -```text -/tmp/genesys2_linux_boot_synchronized.log -``` - -The worktree is dirty and contains intentional changes plus unrelated older bring-up -changes. Do not revert wholesale. Start with `git status --short` and inspect before -editing. - -## Latest hardware result - -The user programmed a Genesys2 bitstream containing the newest `trap_unit.sv` changes -and the current `cpu_ooo.sv` interrupt-resume plumbing. Running: - -```sh -python3 /tmp/linux_boot_watch.py -``` - -rebuilt and loaded `sw/apps/linux_boot`, patched the local DDR image, loaded the FPGA, -and captured UART. The boot got past the original `_find_next_bit` / `ra=0xcc0` -panic and reached later initcall/pty territory, but still died: - -```text -[ 0.847064] swapper/0[1]: unhandled signal 4 code 0x1 at 0x80388bba -... -[ 1.095342] Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000004 -... 
-[<80388bba>] ret_from_exception+0x76/0x7a -``` - -`0x80388bba` is Linux `ret_from_exception`'s final `mret` instruction: - -```text -00388b74: csrw mepc,a2 -... -00388bb8: lw sp,8(sp) -00388bba: mret -``` - -The current bad symptom is therefore: user context eventually tries to execute the -kernel's `mret` instruction at `0x80388bba`, which is illegal outside M-mode. - -## Important image-patch detail - -`sw/apps/linux_boot/patch_ret_from_exception.py` patches the local FPGA-loadable DDR -image after copying from external Linux artifacts: - -```text -word 0xe22dc: 18c1202f -> ff757513 -``` - -The patch applies to: - -```text -sw/apps/linux_boot/sw_ddr.mem -sw/apps/linux_boot/sw_ddr.txt -``` - -Do not use `vmlinux` objdump alone to decide whether the image was patched. The -external `vmlinux` and `linux-mvp/frost-artifacts/sw_ddr.txt` remain unpatched. The -loaded local dense image was patched in the latest run. Current verification command: - -```sh -rg -n "18c1202f|ff757513" sw/apps/linux_boot/sw_ddr.txt \ - /home/adam-bagley/bigger_l0/linux-mvp/frost-artifacts/sw_ddr.txt -``` - -Expected current output: - -```text -sw/apps/linux_boot/sw_ddr.txt:926429:ff757513 -/home/adam-bagley/bigger_l0/linux-mvp/frost-artifacts/sw_ddr.txt:926429:18c1202f -``` - -## What is already fixed or ruled out - -### 1. Original `_find_next_bit` / `ra=0xcc0` panic - -Original hardware failure: - -```text -FROST_IRQ_ENTER epc=801657ae ra=80094556 sp=804c3e40 cause=80000007 slot12=00000cc0 -FROST_IRQ_RETURN epc=801657ae ra=80094556 - -Oops - illegal instruction -epc : 00000cc0 -ra : 00000cc0 -sp : 804c3e50 -``` - -The UART probe proved the trap frame itself still had sane `epc`/`ra`; the interrupted -callee's own saved return-address slot at `12(sp)` was already stale `0x00000cc0` at -IRQ entry. That pointed to a lost stack store, not trap-frame corruption. 
- -Root cause found: a same-cycle slot-2 store-like ROB commit could be missed by the -store queue's committed-empty guard during a full trap flush. `store_queue.sv` had -raw guard ports for a second commit slot, but `tomasulo_wrapper.sv` had tied them off. -This could let a timer IRQ full-flush while slot 2's store commit was still one cycle -away from the SQ; the registered commit then got masked, losing stores like -`sw ra,12(sp)`. - -Fixes/checks now present in the worktree: - -- `hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv` - connects raw slot-2 store-like commit information into the SQ guard. -- `hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv` computes - `sq_committed_empty_for_trap = sq_committed_empty && !rob_commit_store_like_raw && - !rob_commit_2_store_like_raw`. -- Directed tests were added for the Linux IRQ stack slot and the wrapper slot-2 guard. - -After these changes, the old `_find_next_bit` / `slot12=0xcc0` signature did not -reproduce in the next hardware runs. Treat it as fixed unless it reappears. - -### 2. MRET/interrupt race at `ret_from_exception::mret` - -After the slot-2 store fix, a hardware run failed with a cleaner signature: - -```text -FROST_IRQ_ENTER epc=80388bba ra=80094556 sp=804c3dc0 cause=80000007 -FROST_IRQ_RETURN epc=80388bba ra=80094556 -swapper/0[0]: unhandled signal 4 code 0x1 at 0x80388bba -``` - -This showed a timer IRQ could be taken with `mepc` equal to the M-mode `mret` -instruction itself. - -Fixes/checks now present in `hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv`: - -- one-cycle `mret_taken_prev` recovery marker, -- interrupt latch inhibited/cleared during `i_mret_start || mret_taken_prev`, -- registered pending interrupts re-qualified against current CSR interrupt - eligibility, -- registered pending interrupt loses to MRET during the MRET recovery window, -- interrupt trap PC comes from `i_interrupt_pc`, not raw live ROB trap PC. 
- -These changes improved or changed the failure mode, but they did not finish the boot. -Latest hardware still reaches a user illegal instruction at `0x80388bba`. - -### 3. CSR privilege write theory - -Checked `hw/rtl/cpu_and_mem/cpu/csr/csr_file.sv`: plain `csrw mstatus` updates -`mstatus_mpp` and related fields but does not change current privilege `priv_q`. -`priv_q` changes on trap entry and actual `i_mret_taken`. - -So the tempting explanation "Linux writes `mstatus.MPP=U` before `mret`, therefore -trap_unit already thinks it is in U-mode" does not match the CSR implementation. - -### 4. UART drops - -The apparent UART output drops were caused by a stale capture process. The user -confirmed the drops disappeared after killing that process. Do not chase UART output -drops as an RTL issue unless a new independent symptom appears. - -## Current RTL areas to read first - -`hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv` - -```systemverilog -logic [XLEN-1:0] interrupt_resume_pc; - -function automatic logic [XLEN-1:0] retired_next_pc( - input riscv_pkg::reorder_buffer_commit_t commit -); - logic [XLEN-1:0] step; - begin - step = commit.is_compressed ? {{(XLEN - 2){1'b0}}, 2'b10} : - {{(XLEN - 3){1'b0}}, 3'b100}; - if (commit.is_branch || commit.is_mret) begin - retired_next_pc = commit.redirect_pc; - end else begin - retired_next_pc = commit.pc + step; - end - end -endfunction - -always_ff @(posedge i_clk) begin - if (i_rst) begin - interrupt_resume_pc <= '0; - end else if (rob_commit_2_valid_raw) begin - interrupt_resume_pc <= retired_next_pc(rob_commit_comb_2); - end else if (rob_commit_valid_raw) begin - interrupt_resume_pc <= retired_next_pc(rob_commit_comb); - end -end -``` - -`trap_unit` saves this as `mepc` for interrupts: - -```systemverilog -o_trap_pc = i_interrupt_pc; -``` - -`hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.sv` should also be -read around MRET commit handling. It currently sets MRET commit `redirect_pc` from -`i_mepc`. 
- -## Current best hypothesis, not proven - -The next thing to prove or disprove is whether `interrupt_resume_pc` can be stale or -wrong around an MRET return to U-mode. - -Possible bad sequence: - -1. Linux returns to user through `ret_from_exception`. -2. MRET redirect targets the user PC, but `interrupt_resume_pc` is still or becomes - `0x80388bba`, the M-mode `mret` instruction address. -3. A machine timer interrupt becomes eligible just after return below M. This is legal: - machine interrupts can preempt U-mode even when `mstatus.MIE` is 0. -4. Trap entry saves `mepc = i_interrupt_pc = 0x80388bba`. -5. Linux later restores that trap frame and executes `mret` to `0x80388bba` as user - context. -6. U-mode executing MRET raises illegal instruction at `0x80388bba`. - -This fits the latest user-visible failure, but it is still only a hypothesis. The -critical question is: what exact value is on `i_interrupt_pc` when the timer trap that -eventually leads to the `0x80388bba` signal is taken? - -## Recommended next step - -Do a directed simulation before any more hardware rebuilds. - -Add or extend a small app, likely `sw/apps/umode_test/main.c` or a new -`sw/apps/mret_timer_resume_test/main.c`, to exercise: - -1. M-mode sets `mtvec` to a handler that records `mcause`, `mepc`, `mstatus`, and a - small progress marker. -2. M-mode sets up a U-mode return target label in `mepc`. -3. M-mode sets `mstatus.MPP=U` and enables the machine timer interrupt in `mie`. -4. Arrange a timer pending condition before MRET and/or immediately after MRET. -5. Execute MRET. -6. In the trap handler, assert: - - `mcause == 0x80000007`, - - previous privilege was U, - - saved `mepc` is the U-mode target or U-mode fallthrough, - - saved `mepc` is never the M-mode MRET instruction PC. 
- -Add temporary cocotb visibility/assertions around: - -- `mret_start`, -- `mret_taken`, -- `mret_taken_reg`, -- `csr_mepc`, -- `csr_priv`, -- `rob_commit_comb.valid`, -- `rob_commit_comb.is_mret`, -- `rob_commit_comb.pc`, -- `rob_commit_comb.redirect_pc`, -- `rob_commit_2_*` equivalents, -- `interrupt_resume_pc`, -- `trap_taken`, -- `trap_pc_internal`. - -If the directed sim reproduces the bad `mepc`, the likely RTL fix is in -`cpu_ooo.sv`: seed or hold `interrupt_resume_pc` from the MRET target (`csr_mepc` / -MRET `redirect_pc`) across the MRET recovery window, and prevent MRET/invalid/old ROB -state from leaving it at the M-mode MRET PC. Do not apply that blindly; prove the -failure first. - -If the directed sim does not reproduce, add better Linux restore-path instrumentation -before another bitstream: - -- print a compact `FROST_RET_RESTORE` line immediately before `csrw mstatus`, - `csrw mepc`, and `mret`, -- include `PT_EPC`, `PT_STATUS`, live `mstatus`, live `mepc`, and maybe `PT_RA`. - -The Linux tree already has temporary raw UART probes in -`arch/riscv/kernel/entry.S` for `FROST_IRQ_ENTER`, `FROST_IRQ_RETURN`, and -`FROST_BAD_RET`, but it does not currently print every normal restore state. 
- -## Tests that passed recently - -```sh -./tests/test_run_cocotb.py trap_unit -COCOTB_NUM_RUNS=1 ./tests/test_run_cocotb.py umode_test -env FROST_IRQ_PRECISION_CHECK=1 FROST_IRQ_LOW_RA_ASSERT=1 \ - FROST_EXTERNAL_IRQ_SYMBOL=irq_find_next_bit_exact_callee \ - FROST_EXTERNAL_IRQ_OFFSET=0x52 FROST_EXTERNAL_IRQ_MAX_PULSES=1 \ - FROST_IRQ_CALLEE_SYMBOL=irq_find_next_bit_exact_callee \ - FROST_IRQ_PRECISION_EVENT_LIMIT=16 COCOTB_NUM_RUNS=1 \ - ./tests/test_run_cocotb.py linux_irq_find_next_slot_test -COCOTB_NUM_RUNS=1 ./tests/test_run_cocotb.py wfi_mepc_test -python3 -m py_compile sw/apps/linux_boot/patch_ret_from_exception.py -make -C sw/apps/linux_boot -``` - -Earlier slot-store regressions that passed: - -```sh -env COCOTB_NUM_RUNS=1 ./tests/test_run_cocotb.py linux_irq_stack_slot_test -env COCOTB_NUM_RUNS=1 ./tests/test_run_cocotb.py linux_irq_active_ddr_test -env COCOTB_NUM_RUNS=1 ./tests/test_run_cocotb.py linux_irq_ddr_test -./tests/test_run_cocotb.py tomasulo_wrapper --testcase test_slot2_store_raw_commit_blocks_sq_committed_empty -./tests/test_run_cocotb.py tomasulo_wrapper --random-seed 1781982550 -``` - -## Current dirty files that matter - -At the time of this handoff, relevant intentional edits include: - -- `hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv` -- `hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv` -- `hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv` -- `sw/apps/linux_boot/Makefile` -- `sw/apps/linux_boot/patch_ret_from_exception.py` -- `tests/Makefile` -- `tests/test_run_cocotb.py` -- `verif/cocotb_tests/control/test_trap_unit.py` -- `verif/cocotb_tests/test_real_program.py` -- `verif/cocotb_tests/tomasulo/tomasulo_wrapper/test_tomasulo_wrapper.py` -- `verif/cocotb_tests/tomasulo/tomasulo_wrapper/tomasulo_interface.py` -- `sw/apps/linux_irq_stack_slot_test/` -- `sw/apps/linux_irq_active_ddr_test/` -- `sw/apps/linux_irq_ddr_test/` -- `sw/apps/linux_irq_find_next_slot_test/` -- `sw/apps/wfi_mepc_test/` - -There are also unrelated 
dirty/untracked files in the worktree. Inspect before -touching and do not clean the tree unless the user explicitly asks. - -## Timing caution - -The user reported that recent RTL instrumentation made post-opt timing worse, although -one later implementation recovered during placement and closed. If more synthesizable -instrumentation is needed, keep it narrow: a few registered values or counters, no wide -debug muxes on already bad paths. Prefer directed simulation and Linux UART probes -before adding more FPGA-visible RTL debug. - -## How to inspect the latest hardware log - -Useful command: - -```sh -rg -a -n "80388bba|80388bb|ret_from_exception|unhandled signal|Kernel panic|FROST_IRQ_(ENTER|RETURN)|FROST_BAD_RET|FROST_RET" \ - /tmp/genesys2_linux_boot_synchronized.log -``` - -Expected important lines include: - -```text -[ 0.847064] swapper/0[1]: unhandled signal 4 code 0x1 at 0x80388bba -[ 1.095342] Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000004 -[<80388bba>] ret_from_exception+0x76/0x7a -``` - -Some UART around the recursive failure is garbled. Treat garbled `FROST_IRQ_ENTER`-like -lines as hints only; rely on clean kernel lines and directed sim for proof. - -## Claude starting checklist - -1. Read this file fully. -2. Run `git status --short`. -3. Read `cpu_ooo.sv` around `interrupt_resume_pc` and `trap_unit` instantiation. -4. Read `trap_unit.sv` around interrupt registration, MRET inhibit, and `o_trap_pc`. -5. Read ROB MRET commit/redirect handling. -6. Build the directed MRET-to-U plus timer-pending sim. -7. Only after the sim proves or disproves the current hypothesis, decide whether to - patch RTL, add Linux restore instrumentation, or ask the user for another bitstream. - -## UPDATE 2026-06-21 (Claude): boots to userspace; one flaky RTL race remains - -The `0x80388bba` panic is fixed (RTL, committed: `718f8cc` + productionization -`3d7766c` + rebuild-robust patch `21d5af7`). 
Since then the bring-up went much -further on real Genesys2 hardware. Current state below. - -### Kernel boots fully to the userspace handoff - -No-MMU Linux 6.18.7 boots cleanly reset -> console (`ttyS0` 16550A) -> initramfs --> `Run /sbin/init` at ~2.8s, clean log. The unpatched-kernel test proved the -hardware `interrupt_resume_pc` fix is the real cure for the U-mode MRET panic -(the `patch_ret_from_exception.py` MIE-clear is a *separate* partial mitigation -for the M-mode variant; see below). - -### Userspace execution is PROVEN working - -Built minimal bFLT test inits with the Buildroot toolchain -(`riscv32-buildroot-linux-uclibc-gcc -O2 -Wl,-elf2flt=-r`; no-MMU userspace is -bFLT `ram gotpic`, NOT plain ELF/FDPIC) and ran them as `/sbin/init`: -- single process `write()` + spin -> prints `USERSPACE_OK`. -- `vfork`+`exec` of a child, child + parent both run -> all markers. -- `vfork`+`exec`+child `_exit`+parent `waitpid` -> all markers, child reaped. - -So U-mode execution, the `ecall` syscall round-trip, process creation, child -MRET-to-U, exit and reap all work. FROST hardware is solid for Linux userspace. - -### Blocker 1 (software, root-caused): BusyBox bFLT stack + RAM - -- `init: out of mem` -> BusyBox's bFLT stack/heap was only `0x3e80` (16 KiB), - too small for no-MMU. `flthdr -s 0x100000 bin/busybox` clears it. PROPER FIX: - set the FLAT stack size in Buildroot (covers all applets, which are symlinks - to the one busybox binary). -- 16 MiB RAM was a temp sim-speed shrink; reverted `MEM_SIZE` to 64 MiB in - `linux-mvp/frost-artifacts/build_fpga_boot.py`. (Memory size does NOT affect - the flaky hang below.) 
- -### Blocker 2 (RTL, isolated, NOT yet fixed): residual M-mode timer race - -After Blocker 1, the late-kernel boot is intermittently hung (~33-67% of boots), -non-deterministic, at varying points in the timer-active region — frequently at -the exact `[0.14] clocksource: Switched to clocksource clint_clocksource` where -the UNPATCHED kernel hung 100% deterministically. Cheap isolation proved it is: -- NOT memory size: 2/6 flaky at both 32 and 64 MiB. -- NOT DDR / board state: 4/6 flaky on a freshly reprogrammed bitstream. -- => a residual machine-timer-interrupt trap-return race. The U-mode RTL fix + - the MIE-clear patch reduced it from deterministic to flaky but did not close - it. This is the "proper M-mode-window fix" that was deferred — the real - reliability blocker, back in FROST-RTL territory. - -NEXT STEP: directed sim of an M-mode machine-timer interrupt taken through the -`ret_from_exception` restore / MRET path (sweep the timer-injection cycle to hit -the bad window, like the original IRQ-precision tests), find the residual race, -fix in RTL, re-verify on hardware (re-run the 6-boot flaky-rate measurement, -expect 0 hangs). - -### Operational notes for the next agent - -- Program the FPGA yourself: `./fpga/program_bitstream/program_bitstream.py - genesys2` (no need to ask the user). Reprogram to reset board state. -- Hardware boot/reload: `python3 /tmp/linux_boot_watch.py` (loads the patched - image over JTAG via `load_software.py`; no bitstream reprogram needed for - software/kernel changes). It now also breaks on `Run /sbin/init` / `out of - mem` with a 30s post-load window — handy for fast flaky-rate loops (see - `/tmp/flaky_iso.sh`). -- Kernel rebuild: edit `linux-mvp/.../linux-6.18.7/arch/riscv/kernel/entry.S` - then `make -C linux-mvp/buildroot linux-rebuild`; the FROST IRQ debug probes - and the false-triggering `FROST_BAD_RET` canary (it tripped on the legitimate - `RA=0` at first return-to-userspace) are already removed. 
-- `patch_ret_from_exception.py` now locates its target by unique machine-code - word (survives kernel rebuilds). It is STILL REQUIRED until the M-mode race is - fixed; drop it after. - -### M-mode race hunt status (in progress) - -Reproduction approach scoped; race not yet reproduced in sim: -- Full `linux_boot` in Verilator is NOT viable: it is DDR-latency-bound, so 25M - cycles only reached the early pre-timer `[0.000000] SLUB` line. Reaching the - clocksource switch (~18.6M *instructions*) would take hundreds of millions of - cycles / many hours. Don't retry full-kernel sim for this. -- New synthetic reproducer `sw/apps/mtimer_stress/` (registered; run from - `frost/tests` with `COCOTB_MAX_CYCLES=3000000`): M-mode loop preempted by a - frequent machine timer with the period swept (mtime + 24..87) each tick to hit - every cycle offset around the MRET. It PASSES (survived 9851 IRQs) -> does NOT - reproduce the race. The existing `linux_irq_*_test` DDR tests also pass. So the - trigger is more specific than "M-mode timer + MRET + phase sweep". -- The `SQ: allocation attempted during flush` $warning (store_queue.sv:1178) - fires during the stress but is a generic, handled condition (test passes) -- - likely benign, not the race. - -Next ideas to try (untried): (a) a handler that saves/restores ALL 31 GPRs to a -DDR stack like the kernel (heavy in-flight DDR mem-ops during the trap-return -flush); (b) a WFI-idle + timer-wake deadlock stress (the clocksource hang may be -the first idle WFI not waking); (c) deep RTL analysis of deadlock paths in the -SERIAL FSM (rob_serializer.sv: SERIAL_MRET_EXEC / SERIAL_TRAP_WAIT waiting -forever) and the SQ/LQ full-flush drain (load_queue.sv ~1140-1152) when a timer -flush races in-flight memory ops; (d) if reliability is needed sooner, a stronger -kernel-side interrupt mask around the critical windows as an interim mitigation. 
diff --git a/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv b/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv index fc10a670..e35d9da8 100644 --- a/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv +++ b/hw/rtl/cpu_and_mem/cpu/control/trap_unit.sv @@ -193,10 +193,45 @@ module trap_unit #( // feedback path passes through a flop. assign interrupt_pending_comb = (meip_enabled || mtip_enabled || msip_enabled) && !o_trap_taken; + // Source-level qualification: pending AND locally enabled (mie.x) and not in the + // post-trap cycle -- but NOT gated by the live global mstatus.MIE or MRET inhibit. + // + // Once interrupt_pending has been LATCHED (while fully eligible, MIE=1), a + // YOUNGER csr clear of mstatus.MIE (e.g. the kernel idle `csrsi; ...; csrci`) + // must not retroactively erase it: the interrupt was eligible at an instruction + // boundary the csr-clear is younger than, so per the spec it is taken (the + // csr-clear is squashed by the trap). interrupt_pending is registered (1-cycle + // late) and interrupt_pending_eligible re-checks the LIVE global enable, so + // without a hold a csr-clear's delayed mstatus.MIE side-effect lands in the + // sample-to-service gap, drops interrupt_pending_comb, and clears the + // already-qualified bit -> the interrupt is LOST. On the no-MMU kernel that + // dropped machine-timer tick freezes jiffies and hangs the boot. (Usually the + // service is delayed one cycle by a draining store via i_sq_committed_empty, + // widening the window.) Hold across a global-MIE drop; still release when the + // source itself de-qualifies (mtip/meip/msip drops or mie.x cleared) or the + // trap is taken, so masking and acks behave normally. + // interrupt_source_live: a REAL, current interrupt source exists -- pending AND + // locally enabled (mie.x), gated ONLY by !trap_taken_prev.
NOT gated by the live + // global mstatus.MIE and NOT by mret_interrupt_inhibit, so a persistent timer is + // HELD across both a global-MIE drop AND the MRET-recovery window rather than + // erased. It is still never TAKEN there (interrupt_pending_eligible keeps + // !mret_interrupt_inhibit + live m_int_globally_enabled), and the 0x80388bba + // panic stays guarded by the cpu_ooo interrupt_resume_pc seed on mret_taken (not + // by this latch) -- per commit 718f8cc the seed is THE panic fix and the old + // trap_unit MRET/interrupt cancel was incidental bring-up timing. A stale sample + // whose source has dropped (source_live=0) is still cleared, preserving the + // "cancel a stale one-cycle sample before MRET" property. + logic interrupt_source_live; + assign interrupt_source_live = + ((i_interrupts.meip && mie_meie) || (i_interrupts.mtip && mie_mtie) || + (i_interrupts.msip && mie_msie)) && !trap_taken_prev; + always_ff @(posedge i_clk) begin if (i_rst) interrupt_pending <= 1'b0; - else if (mret_interrupt_inhibit) interrupt_pending <= 1'b0; - else interrupt_pending <= interrupt_pending_comb; + else if (interrupt_pending_comb) interrupt_pending <= 1'b1; // latch when fully eligible + else if (interrupt_pending && interrupt_source_live && !o_trap_taken) + interrupt_pending <= 1'b1; // hold a live source across a global-MIE drop AND MRET inhibit + else interrupt_pending <= 1'b0; // clear stale (no live source) / on take end // Register synchronous exceptions from the ROB head before trap entry. @@ -283,7 +318,13 @@ module trap_unit #( end always_ff @(posedge i_clk) begin - interrupt_cause <= interrupt_cause_comb; + // Hold the cause while interrupt_pending is held (across a global-MIE drop or + // the MRET inhibit); interrupt_cause_comb is built from the gated *_enabled so + // it decays to 0 there, which would default interrupt_latched_source_enabled + // false and leave the held interrupt ineligible when it can finally trap. 
+ if (interrupt_cause_comb != '0) interrupt_cause <= interrupt_cause_comb; + else if (interrupt_pending && interrupt_source_live) interrupt_cause <= interrupt_cause; + else interrupt_cause <= '0; end // A registered interrupt request must still be enabled when it reaches the @@ -397,8 +438,8 @@ module trap_unit #( p_trap_mret_mutex : assert (!(o_trap_taken && o_mret_taken)); // Trap needs source: trap_taken requires interrupt or exception. - p_trap_needs_source : assert (!o_trap_taken || (interrupt_pending_eligible || - exception_pending)); + p_trap_needs_source : + assert (!o_trap_taken || (interrupt_pending_eligible || exception_pending)); // Trap not during stall: traps only fire when pipeline not stalled. p_trap_not_stalled : assert (!o_trap_taken || !i_pipeline_stall); diff --git a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv index dfc0d24d..c33d74b9 100644 --- a/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv +++ b/hw/rtl/cpu_and_mem/cpu/cpu_ooo/cpu_ooo.sv @@ -46,6 +46,7 @@ module cpu_ooo #( input logic [63:0] i_instr, // 64-bit fetch: {next_word, current_word} input logic [riscv_pkg::ImemFetchSidebandWidth-1:0] i_instr_sideband, input logic i_instr_bank_sel_r, // Fetch-word parity (for spanning select) + input logic [31:0] i_served_addr, // Served fetch-window tag (served-window guard) // Fetch window valid (see if_stage). Tie 1 for fixed 1-cycle providers. 
input logic i_instr_valid, // Stall-replay bundle consumed this cycle (see if_stage) -- the fetch @@ -104,6 +105,10 @@ module cpu_ooo #( // Interrupts input riscv_pkg::interrupt_t i_interrupts, input logic [63:0] i_mtime, + output logic [5:0] o_debug_irq_status, + output logic [XLEN-1:0] o_debug_commit_pc, + output logic [XLEN-1:0] o_debug_commit_2_pc, + output logic [1:0] o_debug_commit_valid, // Debug input logic i_disable_branch_prediction ); @@ -440,6 +445,7 @@ module cpu_ooo #( .i_instr, .i_instr_sideband, .i_instr_bank_sel_r, + .i_served_addr, .i_instr_valid, .o_fetch_replay_consume, .i_from_ex_comb(from_ex_comb_synth), @@ -877,6 +883,7 @@ module cpu_ooo #( logic trap_pending; logic trap_mret_commit_hold_q; logic [XLEN-1:0] rob_trap_pc; + logic rob_head_is_wfi; // ROB head decodes as WFI (drives the WFI interrupt-resume-PC seed) riscv_pkg::exc_cause_t rob_trap_cause; riscv_pkg::exc_cause_t rob_trap_cause_remapped; logic [1:0] csr_priv; // current privilege from csr_file (PrivM/PrivU) @@ -1089,6 +1096,7 @@ module cpu_ooo #( .i_csr_done(csr_done_ack), .o_trap_pending(trap_pending), .o_trap_pc(rob_trap_pc), + .o_head_is_wfi(rob_head_is_wfi), .o_trap_cause(rob_trap_cause), .o_trap_value(rob_trap_value), .i_trap_taken(rob_trap_taken_ack), @@ -2030,12 +2038,10 @@ module cpu_ooo #( logic [XLEN-1:0] interrupt_resume_pc; function automatic logic [XLEN-1:0] retired_next_pc( - input riscv_pkg::reorder_buffer_commit_t commit - ); + input riscv_pkg::reorder_buffer_commit_t commit); logic [XLEN-1:0] step; begin - step = commit.is_compressed ? {{(XLEN - 2){1'b0}}, 2'b10} : - {{(XLEN - 3){1'b0}}, 3'b100}; + step = commit.is_compressed ? 
{{(XLEN - 2) {1'b0}}, 2'b10} : {{(XLEN - 3) {1'b0}}, 3'b100}; if (commit.is_branch || commit.is_mret) begin retired_next_pc = commit.redirect_pc; end else begin @@ -2069,6 +2075,17 @@ module cpu_ooo #( interrupt_resume_pc <= retired_next_pc(rob_commit_comb_2); end else if (rob_commit_valid_raw) begin interrupt_resume_pc <= retired_next_pc(rob_commit_comb); + end else if (rob_head_is_wfi && head_valid) begin + // Bug#2 (drain-gated WFI mepc): while a WFI stalls at the ROB head, the + // architectural resume PC is always wfi_pc+4 (WFI never redirects). Seed it + // here so that if a machine interrupt is taken at the WFI -- including the + // narrow window where a committed store finishes draining and take_trap + // fires the same cycle, before the WFI's own commit can advance + // interrupt_resume_pc -- mepc is the spec-required wfi_pc+4 rather than the + // pre-WFI instruction's next-PC (== wfi_pc). Lowest priority: a real commit + // (incl. a dual-commit retiring the WFI and its successor) always wins, and + // WFI is never compressed so +4 is exact. Mirrors the mret_taken seed above. + interrupt_resume_pc <= rob_trap_pc + 32'd4; end end @@ -2132,6 +2149,15 @@ module cpu_ooo #( assign rob_trap_taken_ack = trap_taken_reg; assign mret_done_ack = mret_taken_reg; + // Passive on-silicon debug tap for the top-level hang triage UART. Packed as: + // [5]=mret, [4]=trap, [3:2]=priv, [1]=mstatus.MIE, [0]=mie.MTIE. 
+ assign o_debug_irq_status = { + mret_taken, trap_taken, csr_priv, csr_mstatus_mie_direct, csr_mie[riscv_pkg::MieMtiBit] + }; + assign o_debug_commit_pc = rob_commit.pc; + assign o_debug_commit_2_pc = rob_commit_2.pc; + assign o_debug_commit_valid = {rob_commit_2.valid, rob_commit.valid}; + // =========================================================================== // Profiling Counter Aggregation // =========================================================================== diff --git a/hw/rtl/cpu_and_mem/cpu/if_stage/if_stage.sv b/hw/rtl/cpu_and_mem/cpu/if_stage/if_stage.sv index 4edf58a4..cad270a6 100644 --- a/hw/rtl/cpu_and_mem/cpu/if_stage/if_stage.sv +++ b/hw/rtl/cpu_and_mem/cpu/if_stage/if_stage.sv @@ -86,6 +86,7 @@ module if_stage #( input logic [63:0] i_instr, // 64-bit fetch: {next_word, current_word} input logic [riscv_pkg::ImemFetchSidebandWidth-1:0] i_instr_sideband, input logic i_instr_bank_sel_r, // Fetch-word parity (PC[2] from fetch cycle) + input logic [XLEN-1:0] i_served_addr, // Served fetch-window tag (full address) // Fetch window valid: the {i_instr, i_instr_sideband, i_instr_bank_sel_r} // window corresponds to the fetch address presented last cycle. When low // (variable-latency provider: L1I miss / fuzz), IF emits NOP bubbles, @@ -506,6 +507,7 @@ module if_stage #( .i_pd_redirect(i_pd_redirect), .i_pd_redirect_target(i_pd_redirect_target), + .i_window_cannot_serve(window_resteer_pc_reg), .i_trap_taken (i_trap_ctrl.trap_taken), .i_mret_taken (i_trap_ctrl.mret_taken), @@ -742,13 +744,55 @@ module if_stage #( // stall-held, so a stall covering the bubble cycle let the bubble // present-and-dispatch on release alongside the realigned repeat. Fixed // by stall-gating pd_redirect_q (see its always_ff above). - assign sel_nop = i_pipeline_ctrl.flush || flush_for_c_ext_safe || !fetch_progress || + // Served-window invariant: the fetched 64-bit window covers exactly the two + // words {word(i_served_addr), word(i_served_addr)+1}. 
pc_reg must lie in that + // window or the 1-bit bank-sel parity in instruction_aligner silently selects + // the wrong word -> wrong instruction-size sample -> pc_reg advances onto a + // mid-instruction byte (the workqueue_init_early epc 0x8038d7fa boot Oops). + // A fetch stall (L1I line-fill) can leave the served window >1 word from + // pc_reg, which the single parity bit cannot represent. Detect it from the + // full served address; pc_controller squashes (sel_nop below), holds pc_reg, + // and resteers fetch onto pc_reg's word until the correct window is served. + logic signed [XLEN-1:0] served_word_delta; + assign served_word_delta = $signed( + {2'b00, i_served_addr[XLEN-1:2]} + ) - $signed( + {2'b00, pc_reg[XLEN-1:2]} + ); + logic window_cannot_serve_pc_reg; + // Gated to the cached region (pc_reg[XLEN-1], i.e. >= CACHED_BASE): the low BRAM + // fetch path is fixed 1-cycle/always-valid and never desyncs, and its served-addr + // tracking is approximate -- firing there only causes spurious squashes. + assign window_cannot_serve_pc_reg = i_instr_valid && pc_reg[XLEN-1] && + (served_word_delta != $signed( + 0 + )) && (served_word_delta != -$signed( + 1 + )) && !((served_word_delta == $signed( + 1 + )) && use_instr_buffer); + + // The existing (pre-served-window-guard) squash conditions. + logic sel_nop_existing; + assign sel_nop_existing = i_pipeline_ctrl.flush || + flush_for_c_ext_safe || !fetch_progress || sel_nop_align || reset_holdoff || pending_prediction_target_holdoff || (pending_prediction_fetch_holdoff && !prediction_holdoff) || (control_flow_holdoff && (!prediction_holdoff || pd_redirect_q || slot2_redirect_q)); + // Resteer fetch onto pc_reg's word + hold pc_reg ONLY at a real consume cycle + // (not during an existing holdoff, where pc_reg is already managed and a resteer + // would thrash the front end -- the cause of the earlier isa_test/boot regression). 
+ // At a holdoff release with the served window still stale (fetch ran ahead during + // the redirect bubble), this fires the cycle the wrong-word decode would otherwise + // advance pc_reg onto a mid-instruction byte. + logic window_resteer_pc_reg; + assign window_resteer_pc_reg = window_cannot_serve_pc_reg && !sel_nop_existing; + + assign sel_nop = sel_nop_existing || window_cannot_serve_pc_reg; + // =========================================================================== // Stall State Registers // =========================================================================== diff --git a/hw/rtl/cpu_and_mem/cpu/if_stage/pc_controller.sv b/hw/rtl/cpu_and_mem/cpu/if_stage/pc_controller.sv index 7882b477..a19a54a2 100644 --- a/hw/rtl/cpu_and_mem/cpu/if_stage/pc_controller.sv +++ b/hw/rtl/cpu_and_mem/cpu/if_stage/pc_controller.sv @@ -91,8 +91,9 @@ module pc_controller #( input logic [XLEN-1:0] i_branch_target, // PD backward-branch heuristic redirect (from pd_stage) - input logic i_pd_redirect, + input logic i_pd_redirect, input logic [XLEN-1:0] i_pd_redirect_target, + input logic i_window_cannot_serve, // Served window cannot hold pc_reg -> resteer+hold // Trap control input logic i_trap_taken, @@ -407,7 +408,19 @@ module pc_controller #( pc_reg_next_bit1_for_prediction = o_pc_reg[1] ^ i_is_compressed; end end - assign pc_reg_next_misses_fetch_pc_for_prediction = pc_reg_next_bit1_for_prediction != o_pc[1]; + // BOOT-HANG FIX (verification form): the bit1-only fast predictor + // (pc_reg_next_bit1_for_prediction != o_pc[1]) diverges from the full result + // when pc_reg is >=2 words behind the word-aligned fetch PC -- both are + // word-aligned so bit 1 matches, but the words differ. 
There the fast value + // is 0 ("no miss") while the truth (seq_next_pc_reg != o_pc) is 1, so + // prediction_needs_pending is wrongly false, the prediction is applied without + // the pc_reg handoff, and fetch redirects to the wrong PC (silent on HW where + // the assert below is compiled out -> the no-MMU Linux boot hang at pid_max). + // Use the full compare; conservative-safe (only ever pends MORE, exactly in + // the cases the bit1 proxy missed). NOTE: this reintroduces the + // seq_next_pc_reg compare on the prediction cone that the bit1 proxy existed + // to avoid -- a timing-friendly correct form is a follow-up if WNS regresses. + assign pc_reg_next_misses_fetch_pc_for_prediction = (seq_next_pc_reg != o_pc); assign prediction_needs_pending = i_prediction_used && !i_ras_predicted && !i_slot2_prediction_used && @@ -607,6 +620,7 @@ module pc_controller #( else if (i_fence_i_flush) next_pc = i_fence_i_target; else if (i_branch_taken) next_pc = i_branch_target; else if (i_pd_redirect) next_pc = i_pd_redirect_target; + else if (i_window_cannot_serve) next_pc = {o_pc_reg[XLEN-1:2], 2'b00}; // No fetch progress: hold the fetch address so the provider can keep // working on the owed ask. Sits above the prediction/pending arms // (their state is frozen and predictions are suppressed while invalid) @@ -670,6 +684,7 @@ module pc_controller #( else if (i_fence_i_flush) next_pc_reg = i_fence_i_target; else if (i_branch_taken) next_pc_reg = i_branch_target; else if (i_pd_redirect) next_pc_reg = i_pd_redirect_target; + else if (i_window_cannot_serve) next_pc_reg = o_pc_reg; // No fetch progress: hold the instruction address (nothing is being // delivered). Same placement rationale as the next_pc hold arm above. 
else if (!i_fetch_progress) next_pc_reg = o_pc_reg; diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/load_queue.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/load_queue.sv index 5a1ce7bf..75cf9f39 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/load_queue.sv +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/load_queue.sv @@ -748,8 +748,17 @@ module load_queue #( // Issue Selection -> lq_issue_selector.sv (pure boundary move). issue_cdb_idx // still drives the LQ data LUTRAM read below; that RAM stays here. // =========================================================================== - logic [DEPTH-1:0] mem_issue_stored_mask; - logic [DEPTH-1:0] mem_issue_update_mask; + logic stored_scan_found; + logic [IdxWidth-1:0] stored_scan_idx; + logic [IdxWidth-1:0] stored_scan_pos; + logic [DEPTH-1:0] stored_scan_onehot; + logic [ReorderBufferTagWidth-1:0] stored_scan_rob_tag; + + logic update_scan_found; + logic [IdxWidth-1:0] update_scan_idx; + logic [IdxWidth-1:0] update_scan_pos; + logic [DEPTH-1:0] update_scan_onehot; + logic [ReorderBufferTagWidth-1:0] update_scan_rob_tag; logic head_mem_stored_found; logic [IdxWidth-1:0] head_mem_stored_idx; logic [ReorderBufferTagWidth-1:0] head_mem_stored_rob_tag; @@ -757,6 +766,7 @@ module load_queue #( logic [IdxWidth-1:0] head_mem_update_idx; logic [ReorderBufferTagWidth-1:0] head_mem_update_rob_tag; logic [DEPTH*ReorderBufferTagWidth-1:0] lq_rob_tag_flat; + logic force_head_amo; for (genvar g_lq_tag = 0; g_lq_tag < DEPTH; g_lq_tag++) begin : gen_lq_rob_tag_flat assign lq_rob_tag_flat[g_lq_tag*ReorderBufferTagWidth +: ReorderBufferTagWidth] = @@ -779,10 +789,19 @@ module load_queue #( .lq_rob_tag_flat(lq_rob_tag_flat), .head_idx(head_idx), .i_sq_committed_empty(i_sq_committed_empty), + .i_force_head_amo(force_head_amo), .o_issue_cdb_found(issue_cdb_found), .o_issue_cdb_idx(issue_cdb_idx), - .o_mem_issue_stored_mask(mem_issue_stored_mask), - .o_mem_issue_update_mask(mem_issue_update_mask), + 
.o_stored_scan_found(stored_scan_found), + .o_stored_scan_idx(stored_scan_idx), + .o_stored_scan_pos(stored_scan_pos), + .o_stored_scan_onehot(stored_scan_onehot), + .o_stored_scan_rob_tag(stored_scan_rob_tag), + .o_update_scan_found(update_scan_found), + .o_update_scan_idx(update_scan_idx), + .o_update_scan_pos(update_scan_pos), + .o_update_scan_onehot(update_scan_onehot), + .o_update_scan_rob_tag(update_scan_rob_tag), .o_head_mem_stored_found(head_mem_stored_found), .o_head_mem_stored_idx(head_mem_stored_idx), .o_head_mem_stored_rob_tag(head_mem_stored_rob_tag), @@ -791,15 +810,6 @@ module load_queue #( .o_head_mem_update_rob_tag(head_mem_update_rob_tag) ); - // scan_idx recomputed locally for the head-load diagnostics below; the - // selector computes its own identical copy internally (head-relative idx). - logic [IdxWidth-1:0] scan_idx[DEPTH]; - always_comb begin - for (int unsigned j = 0; j < DEPTH; j++) begin - scan_idx[j] = IdxWidth'(head_idx + IdxWidth'(j)); - end - end - // =========================================================================== // Head-load sub-bucket diagnostics // =========================================================================== @@ -915,49 +925,6 @@ module load_queue #( // a post-encoder 8-to-1 MUX on lq_rob_tag[issue_mem_idx]) logic [ReorderBufferTagWidth-1:0] issue_mem_rob_tag; - logic stored_scan_found; - logic [IdxWidth-1:0] stored_scan_idx; - logic [IdxWidth-1:0] stored_scan_pos; - logic [DEPTH-1:0] stored_scan_onehot; - logic [ReorderBufferTagWidth-1:0] stored_scan_rob_tag; - - logic update_scan_found; - logic [IdxWidth-1:0] update_scan_idx; - logic [IdxWidth-1:0] update_scan_pos; - logic [DEPTH-1:0] update_scan_onehot; - logic [ReorderBufferTagWidth-1:0] update_scan_rob_tag; - - always_comb begin - stored_scan_found = 1'b0; - stored_scan_idx = '0; - stored_scan_pos = '0; - stored_scan_onehot = '0; - stored_scan_rob_tag = '0; - update_scan_found = 1'b0; - update_scan_idx = '0; - update_scan_pos = '0; - 
update_scan_onehot = '0; - update_scan_rob_tag = '0; - - for (int unsigned i = 0; i < DEPTH; i++) begin - if (mem_issue_stored_mask[i] && !stored_scan_found) begin - stored_scan_found = 1'b1; - stored_scan_idx = scan_idx[i]; - stored_scan_pos = IdxWidth'(i); - stored_scan_onehot[scan_idx[i]] = 1'b1; - stored_scan_rob_tag = lq_rob_tag[scan_idx[i]]; - end - - if (mem_issue_update_mask[i] && !update_scan_found) begin - update_scan_found = 1'b1; - update_scan_idx = scan_idx[i]; - update_scan_pos = IdxWidth'(i); - update_scan_onehot[scan_idx[i]] = 1'b1; - update_scan_rob_tag = lq_rob_tag[scan_idx[i]]; - end - end - end - logic [IdxWidth-1:0] stored_issue_idx; logic [ReorderBufferTagWidth-1:0] stored_issue_rob_tag; logic [ReorderBufferTagWidth-1:0] update_issue_rob_tag; @@ -1129,6 +1096,63 @@ module load_queue #( !sq_commit_interlock && i_sq_forward.can_forward && !sq_check_is_mmio_q && !sq_check_is_lr_q && !sq_check_is_amo_q; + + // Break the rare ROB-head AMO deadlock without changing steady-state AMO + // order. The normal selector remains pristine until a head AMO is eligible + // for issue and the machine has made no useful LQ/SQ progress for a sustained + // window. Once saturated, force_head_amo lets the head-priority path choose + // that AMO for one capture/replace cycle. 
+ localparam int unsigned AmoDeadlockThresh = 512; + localparam int unsigned AmoDeadlockCntW = $clog2(AmoDeadlockThresh + 1); + + logic head_amo_eligible_waiting; + logic sq_check_waiting_older_store; + logic head_amo_no_issue_deadlock; + logic head_amo_sq_deadlock; + logic head_amo_deadlock_wait; + logic [AmoDeadlockCntW-1:0] amo_deadlock_cnt_q; + + always_comb begin + head_amo_eligible_waiting = 1'b0; + for (int unsigned i = 0; i < DEPTH; i++) begin + if (rob_head_match_q[i] && + lq_valid[i] && + lq_is_amo[i] && + entry_addr_valid_now[i] && + !lq_issued[i] && + !lq_data_valid[i] && + !sq_check_in_flight_mask[i] && + i_sq_committed_empty) begin + head_amo_eligible_waiting = 1'b1; + end + end + end + + assign sq_check_waiting_older_store = + sq_check_pending && sq_check_phase2 && sq_check_entry_issueable && + !sq_check_misaligned && !sq_commit_interlock && !sq_no_older_store && + (!i_sq_all_older_addrs_known || (i_sq_forward.match && !i_sq_forward.can_forward)) && + !i_mem_bus_busy && !drop_mem_response_pending && !i_flush_all && !i_flush_en; + + assign head_amo_no_issue_deadlock = + head_amo_eligible_waiting && !issue_mem_found && !sq_check_pending; + assign head_amo_sq_deadlock = + head_amo_eligible_waiting && sq_check_waiting_older_store && + (sq_check_rob_tag_q != i_rob_head_tag); + assign head_amo_deadlock_wait = + !mem_outstanding && (amo_state == AMO_IDLE) && + (head_amo_no_issue_deadlock || head_amo_sq_deadlock); + + always_ff @(posedge i_clk) begin + if (!i_rst_n || i_flush_all || i_flush_en || !head_amo_deadlock_wait) begin + amo_deadlock_cnt_q <= '0; + end else if (amo_deadlock_cnt_q < AmoDeadlockCntW'(AmoDeadlockThresh)) begin + amo_deadlock_cnt_q <= amo_deadlock_cnt_q + 1'b1; + end + end + + assign force_head_amo = (amo_deadlock_cnt_q >= AmoDeadlockCntW'(AmoDeadlockThresh)); + assign flush_all_entries = i_flush_en && !i_early_recovery_flush && (i_rob_head_tag == (i_flush_tag + ReorderBufferTagWidth'(1))); @@ -1144,8 +1168,8 @@ module load_queue #( )); 
assign full_flush_response_drain = i_flush_all && i_mem_read_valid && mem_outstanding; assign accept_mem_response = i_mem_read_valid && mem_outstanding && - !i_flush_all && !drop_mem_response_pending && !issued_entry_flushed && - lq_valid[issued_idx]; + !i_flush_all && !drop_mem_response_pending && + !issued_entry_flushed && lq_valid[issued_idx]; assign drop_mem_response_now = i_mem_read_valid && (full_flush_response_drain || drop_mem_response_pending || issued_entry_flushed || @@ -2352,8 +2376,7 @@ module load_queue #( $warning("LQ: slot-2 alloc attempted alone when full"); if (i_flush_all && accept_mem_response) $error("LQ: accepted memory response during full flush"); - if (i_flush_all && cache_fill_valid) - $error("LQ: filled L0 cache during full flush"); + if (i_flush_all && cache_fill_valid) $error("LQ: filled L0 cache during full flush"); // Slot-1 and slot-2 must never target the same physical entry. if (slot1_alloc_en && slot2_alloc_en && (alloc_target[IdxWidth-1:0] == slot2_alloc_idx)) $error("LQ: slot-1 and slot-2 alloc collide on entry %0d", alloc_target[IdxWidth-1:0]); diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/lq_issue_selector.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/lq_issue_selector.sv index 8e8a887e..15067a08 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/lq_issue_selector.sv +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/load_queue/lq_issue_selector.sv @@ -18,7 +18,8 @@ // lq_issue_selector // ============================================================================= // Extracted verbatim from load_queue.sv (pure RTL boundary move, zero functional -// change). Parallel issue selection: Phase A (oldest CDB-ready entry), Phase B +// change, except for the optional registered deadlock break input). Parallel +// issue selection: Phase A (oldest CDB-ready entry), Phase B // (memory-issue eligibility masks with MMIO/LR/AMO head gating + older-AMO // blocking), and the explicit ROB-head priority result. 
Replaces the old serial // 16-level scan with per-entry masks + tree encoders. issue_cdb_idx is exported @@ -42,11 +43,20 @@ module lq_issue_selector #( input logic [(DEPTH*riscv_pkg::ReorderBufferTagWidth)-1:0] lq_rob_tag_flat, input logic [$clog2(DEPTH)-1:0] head_idx, input logic i_sq_committed_empty, + input logic i_force_head_amo, output logic o_issue_cdb_found, output logic [$clog2(DEPTH)-1:0] o_issue_cdb_idx, - output logic [DEPTH-1:0] o_mem_issue_stored_mask, - output logic [DEPTH-1:0] o_mem_issue_update_mask, + output logic o_stored_scan_found, + output logic [$clog2(DEPTH)-1:0] o_stored_scan_idx, + output logic [$clog2(DEPTH)-1:0] o_stored_scan_pos, + output logic [DEPTH-1:0] o_stored_scan_onehot, + output logic [riscv_pkg::ReorderBufferTagWidth-1:0] o_stored_scan_rob_tag, + output logic o_update_scan_found, + output logic [$clog2(DEPTH)-1:0] o_update_scan_idx, + output logic [$clog2(DEPTH)-1:0] o_update_scan_pos, + output logic [DEPTH-1:0] o_update_scan_onehot, + output logic [riscv_pkg::ReorderBufferTagWidth-1:0] o_update_scan_rob_tag, output logic o_head_mem_stored_found, output logic [$clog2(DEPTH)-1:0] o_head_mem_stored_idx, output logic [riscv_pkg::ReorderBufferTagWidth-1:0] o_head_mem_stored_rob_tag, @@ -173,6 +183,54 @@ module lq_issue_selector #( assign mem_issue_stored_mask = mem_eligible_stored_mask & ~blocked_by_amo; assign mem_issue_update_mask = mem_eligible_update_mask & ~blocked_by_amo; + // Encode the oldest normal stored-address and current-update candidates here + // while scan_idx is already local. Exporting encoded candidates avoids + // re-scanning the masks in load_queue on the SQ-check payload enable path. 
+ logic stored_scan_found; + logic [IdxWidth-1:0] stored_scan_idx; + logic [IdxWidth-1:0] stored_scan_pos; + logic [DEPTH-1:0] stored_scan_onehot; + logic [ReorderBufferTagWidth-1:0] stored_scan_rob_tag; + + logic update_scan_found; + logic [IdxWidth-1:0] update_scan_idx; + logic [IdxWidth-1:0] update_scan_pos; + logic [DEPTH-1:0] update_scan_onehot; + logic [ReorderBufferTagWidth-1:0] update_scan_rob_tag; + + always_comb begin + stored_scan_found = 1'b0; + stored_scan_idx = '0; + stored_scan_pos = '0; + stored_scan_onehot = '0; + stored_scan_rob_tag = '0; + update_scan_found = 1'b0; + update_scan_idx = '0; + update_scan_pos = '0; + update_scan_onehot = '0; + update_scan_rob_tag = '0; + + for (int unsigned i = 0; i < DEPTH; i++) begin + if (mem_issue_stored_mask[i] && !stored_scan_found) begin + stored_scan_found = 1'b1; + stored_scan_idx = scan_idx[i]; + stored_scan_pos = IdxWidth'(i); + stored_scan_onehot[scan_idx[i]] = 1'b1; + stored_scan_rob_tag = + lq_rob_tag_flat[scan_idx[i]*ReorderBufferTagWidth+:ReorderBufferTagWidth]; + end + + if (mem_issue_update_mask[i] && !update_scan_found) begin + update_scan_found = 1'b1; + update_scan_idx = scan_idx[i]; + update_scan_pos = IdxWidth'(i); + update_scan_onehot[scan_idx[i]] = 1'b1; + update_scan_rob_tag = + lq_rob_tag_flat[scan_idx[i]*ReorderBufferTagWidth+:ReorderBufferTagWidth]; + end + end + end + // The sparse queue can reuse reclaimed holes after flushes, so physical // queue order is not always identical to ROB age. 
To avoid starving the // oldest architectural load behind a younger blocked entry, explicitly @@ -200,7 +258,7 @@ module lq_issue_selector #( !in_flight_mask[i] && !lq_is_mmio[i] && !lq_is_lr[i] && - !lq_is_amo[i]) begin + (!lq_is_amo[i] || (i_force_head_amo && i_sq_committed_empty))) begin head_mem_stored_found = 1'b1; head_mem_stored_idx = IdxWidth'(i); head_mem_stored_rob_tag = lq_rob_tag_flat[i*ReorderBufferTagWidth+:ReorderBufferTagWidth]; @@ -214,7 +272,7 @@ module lq_issue_selector #( !lq_data_valid[i] && !in_flight_mask[i] && !lq_is_lr[i] && - !lq_is_amo[i]) begin + (!lq_is_amo[i] || (i_force_head_amo && i_sq_committed_empty))) begin head_mem_update_found = 1'b1; head_mem_update_idx = IdxWidth'(i); head_mem_update_rob_tag = lq_rob_tag_flat[i*ReorderBufferTagWidth+:ReorderBufferTagWidth]; @@ -224,8 +282,16 @@ module lq_issue_selector #( assign o_issue_cdb_found = issue_cdb_found; assign o_issue_cdb_idx = issue_cdb_idx; - assign o_mem_issue_stored_mask = mem_issue_stored_mask; - assign o_mem_issue_update_mask = mem_issue_update_mask; + assign o_stored_scan_found = stored_scan_found; + assign o_stored_scan_idx = stored_scan_idx; + assign o_stored_scan_pos = stored_scan_pos; + assign o_stored_scan_onehot = stored_scan_onehot; + assign o_stored_scan_rob_tag = stored_scan_rob_tag; + assign o_update_scan_found = update_scan_found; + assign o_update_scan_idx = update_scan_idx; + assign o_update_scan_pos = update_scan_pos; + assign o_update_scan_onehot = update_scan_onehot; + assign o_update_scan_rob_tag = update_scan_rob_tag; assign o_head_mem_stored_found = head_mem_stored_found; assign o_head_mem_stored_idx = head_mem_stored_idx; assign o_head_mem_stored_rob_tag = head_mem_stored_rob_tag; diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.sv index f466683d..f3f92e8d 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.sv +++ 
b/hw/rtl/cpu_and_mem/cpu/tomasulo/reorder_buffer/reorder_buffer.sv @@ -152,6 +152,8 @@ module reorder_buffer ( // Exception detected at head - signal trap unit output logic o_trap_pending, // Exception needs handling output logic [riscv_pkg::XLEN-1:0] o_trap_pc, // PC of excepting instruction + // Head decodes as WFI (drives WFI interrupt-resume-PC seed in cpu_ooo) + output logic o_head_is_wfi, output riscv_pkg::exc_cause_t o_trap_cause, // Exception cause // Head entry's CDB value at trap time. For a misaligned load/store the // load_queue/SQ path parks the faulting address here (the value slot is @@ -1712,14 +1714,35 @@ module reorder_buffer ( head_is_csr && !head_exception && !i_flush_en && !i_flush_all; - // MRET execution signal - asserted when entering MRET_EXEC state. + // MRET execution signal - asserted when entering MRET_EXEC and SUSTAINED while + // waiting there for committed stores to drain. + // + // take_mret (trap_unit) only fires when i_sq_committed_empty is high IN THE + // SAME CYCLE as o_mret_start, and it has no retry. Without the + // SERIAL_MRET_EXEC sustaining term o_mret_start is a one-cycle pulse on the + // IDLE->MRET_EXEC cycle: if a committed store is still draining then, take_mret + // misses its only chance and the serializer wedges in SERIAL_MRET_EXEC forever + // (no later flush can rescue it -- the stuck MRET never restores MIE, so no + // interrupt becomes eligible to flush it). The sustaining term mirrors + // o_trap_pending (below) and lets take_mret retry every cycle until the SQ + // drains. + // + // The i_sq_committed_empty gate keeps o_mret_start (hence i_mret_start -> + // trap_drain_wait -> i_commit_hold) low during the drain wait, which (a) + // prevents a commit-hold/o_mret_start f/2 oscillation and (b) keeps mret_taken + // a single-cycle pulse so flush_all fires exactly once. It is free on the + // common path: a retiring MRET normally finds the committed SQ already empty. 
+ // // Note: !i_flush_en/!i_flush_all intentionally omitted — flush signals are // derived from mret_taken which is derived from o_mret_start, so gating // by them creates an oscillating combinational loop. - assign o_mret_start = (serial_state == riscv_pkg::SERIAL_IDLE) && head_ready && + assign o_mret_start = ((serial_state == riscv_pkg::SERIAL_IDLE) || + (serial_state == riscv_pkg::SERIAL_MRET_EXEC)) && + head_ready && !i_commit_hold && !i_early_recovery_en && - head_is_mret && !head_exception; + head_is_mret && !head_exception && + i_sq_committed_empty; // Trap pending signal - asserted when exception at head. // Note: during the IDLE->TRAP_WAIT transition, both the state check and the @@ -1735,6 +1758,13 @@ module reorder_buffer ( (serial_state == riscv_pkg::SERIAL_TRAP_WAIT) || (head_ready && !i_commit_hold && !i_early_recovery_en && head_exception); assign o_trap_pc = head_pc; + // WFI interrupt-resume-PC seed (Bug#2): expose that the ROB head is a WFI so + // cpu_ooo can seed interrupt_resume_pc = wfi_pc+4 while the WFI stalls at the + // head. A machine interrupt taken at a *drain-gated* WFI (a committed store + // still draining) otherwise flushes the WFI before it commits, leaving + // interrupt_resume_pc at the pre-WFI instruction's next-PC (== the WFI's own + // PC) -> mepc=wfi_pc instead of the spec-required wfi_pc+4. 
+ assign o_head_is_wfi = head_is_wfi; assign o_trap_cause = head_exc_cause; assign o_trap_value = head_value[XLEN-1:0]; diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/sq_forwarding_unit.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/sq_forwarding_unit.sv index 06c11b61..1b1c55d8 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/sq_forwarding_unit.sv +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/sq_forwarding_unit.sv @@ -73,6 +73,29 @@ module sq_forwarding_unit #( localparam int unsigned WordAddrWidth = XLEN - 2; localparam int unsigned IdxWidth = $clog2(DEPTH); + typedef struct packed { + logic valid; + logic [ReorderBufferTagWidth:0] age; + logic can_forward; + logic [IdxWidth-1:0] idx; + logic [1:0] extract_type; + } fwd_winner_t; + + function automatic fwd_winner_t choose_newer_winner(input fwd_winner_t lhs, + input fwd_winner_t rhs); + begin + if (!lhs.valid) begin + choose_newer_winner = rhs; + end else if (!rhs.valid) begin + choose_newer_winner = lhs; + end else if (rhs.age >= lhs.age) begin + choose_newer_winner = rhs; + end else begin + choose_newer_winner = lhs; + end + end + endfunction + // Forwarding scan result index (drives the SQ data-RAM read address in parent) logic [IdxWidth-1:0] fwd_match_idx; @@ -184,6 +207,10 @@ module sq_forwarding_unit #( logic [ReorderBufferTagWidth:0] fwd_load_age; logic [ReorderBufferTagWidth:0] fwd_entry_age[DEPTH]; logic [1:0] fwd_entry_extract_type[DEPTH]; + fwd_winner_t fwd_leaf[DEPTH]; + fwd_winner_t fwd_pair[4]; + fwd_winner_t fwd_quad[2]; + fwd_winner_t fwd_winner; assign fwd_load_byte_mask = gen_byte_en(i_sq_check_addr[1:0], i_sq_check_size); assign fwd_load_age = {1'b0, i_sq_check_rob_tag} - {1'b0, i_rob_head_tag}; @@ -305,24 +332,31 @@ module sq_forwarding_unit #( // Block 2: newest conflicting store wins for data/extract selection. The // heavy address/age qualification is already parallelized above, so this // block only prioritizes 1-bit match results and their precomputed metadata. 
+ // Keep this as a balanced tree: the old serial loop let an SQ-check address + // bit feed each entry's conflict logic and then walk an 8-entry winner chain + // before reaching o_sq_forward.can_forward. always_comb begin - logic have_winner; - logic [ReorderBufferTagWidth:0] winner_age; - - have_winner = 1'b0; - winner_age = '0; - fwd_can_fwd = 1'b0; - fwd_match_idx = '0; - fwd_extract_type = 2'd0; for (int unsigned i = 0; i < DEPTH; i++) begin - if (fwd_conflict_mask[i] && (!have_winner || (fwd_entry_age[i] >= winner_age))) begin - have_winner = 1'b1; - winner_age = fwd_entry_age[i]; - fwd_can_fwd = fwd_can_forward_mask[i]; - fwd_match_idx = IdxWidth'(i); - fwd_extract_type = fwd_entry_extract_type[i]; - end + fwd_leaf[i].valid = fwd_conflict_mask[i]; + fwd_leaf[i].age = fwd_entry_age[i]; + fwd_leaf[i].can_forward = fwd_can_forward_mask[i]; + fwd_leaf[i].idx = IdxWidth'(i); + fwd_leaf[i].extract_type = fwd_entry_extract_type[i]; end + + fwd_pair[0] = choose_newer_winner(fwd_leaf[0], fwd_leaf[1]); + fwd_pair[1] = choose_newer_winner(fwd_leaf[2], fwd_leaf[3]); + fwd_pair[2] = choose_newer_winner(fwd_leaf[4], fwd_leaf[5]); + fwd_pair[3] = choose_newer_winner(fwd_leaf[6], fwd_leaf[7]); + + fwd_quad[0] = choose_newer_winner(fwd_pair[0], fwd_pair[1]); + fwd_quad[1] = choose_newer_winner(fwd_pair[2], fwd_pair[3]); + + fwd_winner = choose_newer_winner(fwd_quad[0], fwd_quad[1]); + + fwd_can_fwd = fwd_winner.valid && fwd_winner.can_forward; + fwd_match_idx = fwd_winner.idx; + fwd_extract_type = fwd_winner.extract_type; end // Block 3: Registered forwarding outputs. @@ -336,7 +370,7 @@ module sq_forwarding_unit #( end else begin o_sq_all_older_addrs_known <= i_sq_check_valid ? fwd_all_older_known : 1'b0; o_sq_forward.match <= i_sq_check_valid ? fwd_found_match : 1'b0; - o_sq_forward.can_forward <= i_sq_check_valid ? (fwd_found_match && fwd_can_fwd) : 1'b0; + o_sq_forward.can_forward <= i_sq_check_valid ? 
fwd_can_fwd : 1'b0; end case (fwd_extract_type) diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/store_queue.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/store_queue.sv index ffdc8b85..888441f0 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/store_queue.sv +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/store_queue/store_queue.sv @@ -1356,7 +1356,8 @@ module store_queue #( end end - // Forwarding outputs are registered, so they reflect the previous check. + // Forwarding outputs are driven from staged SQ CAM results, so they reflect + // the previous check. always @(posedge i_clk) begin if (f_past_valid && i_rst_n && $past( i_rst_n diff --git a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv index b1b15c4e..1bd61cea 100644 --- a/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv +++ b/hw/rtl/cpu_and_mem/cpu/tomasulo/tomasulo_wrapper/tomasulo_wrapper.sv @@ -129,6 +129,7 @@ module tomasulo_wrapper #( input logic i_csr_done, output logic o_trap_pending, output logic [riscv_pkg::XLEN-1:0] o_trap_pc, + output logic o_head_is_wfi, output riscv_pkg::exc_cause_t o_trap_cause, output logic [riscv_pkg::XLEN-1:0] o_trap_value, input logic i_trap_taken, @@ -636,9 +637,15 @@ module tomasulo_wrapper #( // CDB Arbiter: FU completions → single CDB broadcast // =========================================================================== riscv_pkg::cdb_broadcast_t cdb_bus_comb; // combinational from arbiter - riscv_pkg::cdb_broadcast_t cdb_bus; // registered — feeds RS/ROB wakeup + // registered — feeds RS/ROB wakeup + (* equivalent_register_removal = "no" *)riscv_pkg::cdb_broadcast_t cdb_bus; + // same-cycle INT_RS-local copy + (* equivalent_register_removal = "no" *)riscv_pkg::cdb_broadcast_t cdb_bus_int_rs; riscv_pkg::cdb_broadcast_t cdb_bus_2_comb; // 2-wide CDB lane-1, combinational - riscv_pkg::cdb_broadcast_t cdb_bus_2; // registered lane-1 — feeds RS/ROB 
wakeup + // registered lane-1 — feeds RS/ROB wakeup + (* equivalent_register_removal = "no" *)riscv_pkg::cdb_broadcast_t cdb_bus_2; + // same-cycle INT_RS-local copy + (* equivalent_register_removal = "no" *)riscv_pkg::cdb_broadcast_t cdb_bus_2_int_rs; // Forward declarations: adapter→arbiter signals (used here, defined below) riscv_pkg::fu_complete_t alu_adapter_to_arbiter; @@ -693,15 +700,22 @@ module tomasulo_wrapper #( // max_fanout forces replication across the RS snoop / ROB-write consumers — // the high-fanout report (609 loads) showed this net being one of the top // drivers into the flush-recovery cone that failed timing at -0.947 ns. - (* max_fanout = 32 *) logic cdb_bus_valid; + (* max_fanout = 32 *)logic cdb_bus_valid; + (* equivalent_register_removal = "no", max_fanout = 32 *)logic cdb_bus_int_rs_valid; always_ff @(posedge i_clk) begin if (!i_rst_n) cdb_bus_valid <= 1'b0; else cdb_bus_valid <= cdb_bus_comb.valid; end + always_ff @(posedge i_clk) begin + if (!i_rst_n) cdb_bus_int_rs_valid <= 1'b0; + else cdb_bus_int_rs_valid <= cdb_bus_comb.valid; + end + always_ff @(posedge i_clk) begin cdb_bus <= cdb_bus_comb; + cdb_bus_int_rs <= cdb_bus_comb; end // Expose combinational CDB for testbench observation (grant timing matches) @@ -714,6 +728,16 @@ module tomasulo_wrapper #( cdb_bus_qualified.valid = cdb_bus_valid; end + // INT_RS is physically far from the shared CDB register on Genesys2 and + // snoops many value bits in parallel. Give it an equivalent same-cycle CDB + // register so placement can keep that high-fanout payload local without + // changing wakeup latency. 
+ riscv_pkg::cdb_broadcast_t cdb_bus_int_rs_qualified; + always_comb begin + cdb_bus_int_rs_qualified = cdb_bus_int_rs; + cdb_bus_int_rs_qualified.valid = cdb_bus_int_rs_valid; + end + // Derive ROB CDB write from CDB broadcast riscv_pkg::reorder_buffer_cdb_write_t cdb_write_from_arbiter; always_comb begin @@ -726,19 +750,30 @@ module tomasulo_wrapper #( end // ---- 2-wide CDB lane-1: registered mirror of the lane-0 pipeline above. - (* max_fanout = 32 *) logic cdb_bus_2_valid; + (* max_fanout = 32 *)logic cdb_bus_2_valid; + (* equivalent_register_removal = "no", max_fanout = 32 *)logic cdb_bus_2_int_rs_valid; always_ff @(posedge i_clk) begin if (!i_rst_n) cdb_bus_2_valid <= 1'b0; else cdb_bus_2_valid <= cdb_bus_2_comb.valid; end + always_ff @(posedge i_clk) begin + if (!i_rst_n) cdb_bus_2_int_rs_valid <= 1'b0; + else cdb_bus_2_int_rs_valid <= cdb_bus_2_comb.valid; + end always_ff @(posedge i_clk) begin cdb_bus_2 <= cdb_bus_2_comb; + cdb_bus_2_int_rs <= cdb_bus_2_comb; end riscv_pkg::cdb_broadcast_t cdb_bus_2_qualified; always_comb begin cdb_bus_2_qualified = cdb_bus_2; cdb_bus_2_qualified.valid = cdb_bus_2_valid; end + riscv_pkg::cdb_broadcast_t cdb_bus_2_int_rs_qualified; + always_comb begin + cdb_bus_2_int_rs_qualified = cdb_bus_2_int_rs; + cdb_bus_2_int_rs_qualified.valid = cdb_bus_2_int_rs_valid; + end riscv_pkg::reorder_buffer_cdb_write_t cdb_write_from_arbiter_2; always_comb begin cdb_write_from_arbiter_2.valid = cdb_bus_2_valid; @@ -1423,6 +1458,7 @@ module tomasulo_wrapper #( .i_csr_done (i_csr_done), .o_trap_pending (o_trap_pending), .o_trap_pc (o_trap_pc), + .o_head_is_wfi (o_head_is_wfi), .o_trap_cause (o_trap_cause), .o_trap_value (o_trap_value), .i_trap_taken (i_trap_taken), @@ -1658,8 +1694,8 @@ module tomasulo_wrapper #( .o_full_for_2(int_rs_full_for_2_w), // CDB snoop (from arbiter) - .i_cdb(cdb_bus_qualified), - .i_cdb_2(cdb_bus_2_qualified), + .i_cdb(cdb_bus_int_rs_qualified), + .i_cdb_2(cdb_bus_2_int_rs_qualified), 
.i_repair_valid_1(int_done_repair_valid_1), .i_repair_tag_1(i_bypass_tag_1), .i_repair_value_1(bypass_value_1), diff --git a/hw/rtl/cpu_and_mem/cpu_and_mem.f b/hw/rtl/cpu_and_mem/cpu_and_mem.f index 5e8abaaa..95cabf80 100644 --- a/hw/rtl/cpu_and_mem/cpu_and_mem.f +++ b/hw/rtl/cpu_and_mem/cpu_and_mem.f @@ -25,5 +25,8 @@ # High-address fetch window provider (two-line L1I buffer) $(ROOT)/hw/rtl/cpu_and_mem/fetch_provider.sv +# On-silicon hang triage (synthesizable boot-hang classifier over UART) +$(ROOT)/hw/rtl/cpu_and_mem/hang_triage.sv + # CPU and memory integration module $(ROOT)/hw/rtl/cpu_and_mem/cpu_and_mem.sv diff --git a/hw/rtl/cpu_and_mem/cpu_and_mem.sv b/hw/rtl/cpu_and_mem/cpu_and_mem.sv index e1dfebef..b6d96b67 100644 --- a/hw/rtl/cpu_and_mem/cpu_and_mem.sv +++ b/hw/rtl/cpu_and_mem/cpu_and_mem.sv @@ -60,7 +60,10 @@ module cpu_and_mem #( // provider over the 1-cycle instruction BRAM (LFSR-gated i_instr_valid + // owed-ask tracking). Exercises the core's fetch-invalid machinery // before a real I-cache sits behind it; hardware keeps 0. - parameter int unsigned FETCH_VALID_FUZZ = 0 + parameter int unsigned FETCH_VALID_FUZZ = 0, + // On-silicon boot-hang classifier that can take over the console UART. + // Keep it default-off for normal interactive software and Linux bring-up. + parameter int unsigned ENABLE_HANG_TRIAGE = 0 ) ( input logic i_clk, input logic i_clk_div4, // Divided clock for instruction memory programming @@ -178,11 +181,20 @@ module cpu_and_mem #( // CPU interface signals logic [31:0] program_counter; + logic commit_vld; // instruction-retire pulse (hang-triage tap) + // CPU-side UART write, muxed against the hang-triage byte stream further down. 
+ logic cpu_uart_wr_en; + logic [7:0] cpu_uart_wr_data; logic [31:0] fetch_address; // imem port B address (the presented fetch ask) logic [63:0] instruction; // 64-bit fetch: {next_word, current_word} logic [riscv_pkg::ImemFetchSidebandWidth-1:0] instruction_sideband; logic instruction_bank_sel_r; // Fetch-word parity (for spanning select) logic instruction_valid; // Fetch window valid + // Served-window tag for the muxed fetch (drives the if_stage served-window + // guard) and the low-BRAM served address (fetch_address delayed one cycle to + // match the 1-cycle imem read latency). + logic [31:0] instruction_served_addr; + logic [31:0] bram_fetch_served_addr_q; logic fetch_replay_consume; // CPU consumed the stall-replay bundle this cycle logic pipeline_stall; // front-end pipeline stall (gates fetch publish-valid) logic fence_i_sync_req; // ROB serializer holding commit for a fence.i cache sync @@ -255,12 +267,24 @@ module cpu_and_mem #( // ns16550a UART face register file (8-bit). DLAB = ns_lcr[7]. logic [7:0] ns_dll, ns_dlm, ns_ier, ns_fcr, ns_lcr, ns_mcr, ns_scr; + logic ns_rx_irq_pending; + logic ns_tx_irq_pending; + logic ns_irq_pending; + logic [7:0] ns_iir; + assign ns_rx_irq_pending = ns_ier[0] && i_uart_rx_valid; + assign ns_tx_irq_pending = ns_ier[1] && i_uart_tx_ready; + assign ns_irq_pending = ns_rx_irq_pending || ns_tx_irq_pending; + always_comb begin + if (ns_rx_irq_pending) ns_iir = 8'hC4; // FIFO enabled, received data available. + else if (ns_tx_irq_pending) ns_iir = 8'hC2; // FIFO enabled, THR empty. + else ns_iir = 8'hC1; // FIFO enabled, no interrupt pending. + end // Interrupt signals to CPU riscv_pkg::interrupt_t interrupts; // Clamp unknown external interrupt values to 0 for simulation stability. // This avoids X-propagation into mip when the top-level input is left un-driven. 
- assign interrupts.meip = (i_external_interrupt === 1'b1); + assign interrupts.meip = (i_external_interrupt === 1'b1) || ns_irq_pending; assign interrupts.msip = msip; // Timer interrupt: register the 64-bit comparison result to break critical timing path. @@ -274,6 +298,19 @@ module cpu_and_mem #( end assign interrupts.mtip = mtip_registered; + // mtimecmp MMIO write pulse: a kernel/handler timer re-arm. Used by the hang + // triage as a "timer tick serviced" event tap. + logic mtimecmp_write_pulse; + assign mtimecmp_write_pulse = |data_memory_byte_write_enable_registered && + ((data_memory_address_registered == MtimecmpLowMmioAddr) || + (data_memory_address_registered == MtimecmpHighMmioAddr) || + (data_memory_address_registered == ClintMtimecmpLo) || + (data_memory_address_registered == ClintMtimecmpHi)); + logic [ 5:0] cpu_debug_irq_status; + logic [31:0] cpu_debug_commit_pc; + logic [31:0] cpu_debug_commit_2_pc; + logic [ 1:0] cpu_debug_commit_valid; + // RISC-V OOO CPU core - Tomasulo out-of-order with RV32IMACBFD + Zicsr + Machine/User-mode cpu_ooo #( .MEM_BYTE_ADDR_WIDTH(MemByteAddrWidth), @@ -288,6 +325,7 @@ module cpu_and_mem #( .i_instr(instruction), .i_instr_sideband(instruction_sideband), .i_instr_bank_sel_r(instruction_bank_sel_r), + .i_served_addr(instruction_served_addr), .i_instr_valid(instruction_valid), .o_fetch_replay_consume(fetch_replay_consume), .o_pipeline_stall(pipeline_stall), @@ -315,11 +353,15 @@ module cpu_and_mem #( .o_mmio_uart_rx_ready_pulse(mmio_uart_rx_ready_pulse), .i_data_mem_rd_data(data_memory_or_peripheral_read_data), .o_rst_done(/*not connected*/), - .o_vld (/*not connected*/), + .o_vld (commit_vld), .o_pc_vld(/*not connected*/), // Interrupt and timer interface .i_interrupts(interrupts), .i_mtime(mtime), + .o_debug_irq_status(cpu_debug_irq_status), + .o_debug_commit_pc(cpu_debug_commit_pc), + .o_debug_commit_2_pc(cpu_debug_commit_2_pc), + .o_debug_commit_valid(cpu_debug_commit_valid), // Branch prediction enabled by default 
in production .i_disable_branch_prediction(1'b0) ); @@ -367,6 +409,7 @@ module cpu_and_mem #( // still carries valid (preserving the IF first-cycle capture); the real // provider's registered stall produces the same 1-cycle lag. assign instruction_valid = fuzz_ok && fuzz_window_ready && !pipeline_stall_q; + assign instruction_served_addr = served_addr_q; assign fuzz_accepted = instruction_valid && !pipeline_stall; // The BRAM chases the owed ask while unserved and the live PC once // serving (the 1-cycle BRAM then keeps the window contract-aligned). @@ -429,6 +472,7 @@ module cpu_and_mem #( logic [63:0] cached_fetch_instr; logic [riscv_pkg::ImemFetchSidebandWidth-1:0] cached_fetch_sideband; logic cached_fetch_bank_sel_r; + logic [31:0] cached_fetch_served_addr; logic cached_fetch_valid; assign fetch_address = program_counter; @@ -457,6 +501,8 @@ module cpu_and_mem #( bram_fetch_sideband; assign instruction_bank_sel_r = fetch_high_valid_q ? cached_fetch_bank_sel_r : bram_fetch_bank_sel_cpu_r; + assign instruction_served_addr = fetch_high_valid_q ? cached_fetch_served_addr : + bram_fetch_served_addr_q; // High-address provider: two-line L1I fetch buffer for cached/DDR code. 
// It no longer drives the low-BRAM address pins; that path stays direct @@ -472,6 +518,7 @@ module cpu_and_mem #( .o_instr(cached_fetch_instr), .o_instr_sideband(cached_fetch_sideband), .o_instr_bank_sel_r(cached_fetch_bank_sel_r), + .o_served_addr(cached_fetch_served_addr), .o_instr_valid(cached_fetch_valid), .o_line_req_valid(iup_req_valid), .i_line_req_ready(iup_req_ready), @@ -487,6 +534,7 @@ module cpu_and_mem #( ); end else begin : gen_fetch_direct assign instruction_valid = 1'b1; + assign instruction_served_addr = bram_fetch_served_addr_q; assign fetch_address = program_counter; assign instruction = bram_fetch_instr; assign instruction_sideband = bram_fetch_sideband; @@ -514,7 +562,7 @@ module cpu_and_mem #( // Port A: Instruction programming (div4 clock, write only) .i_port_a_byte_address(i_instr_mem_addr), .i_port_a_write_data(i_instr_mem_wrdata), - .i_port_a_write_enable(i_instr_mem_en), + .i_port_a_write_enable(i_instr_mem_en && (|i_instr_mem_we)), .o_port_a_read_data( /* unused - write only */), // Port B: Instruction fetch (main clock, read only) .i_port_b_clk(i_clk), @@ -531,6 +579,7 @@ module cpu_and_mem #( // control net. always_ff @(posedge i_clk) begin bram_fetch_bank_sel_cpu_r <= fetch_address[2]; + bram_fetch_served_addr_q <= fetch_address; end `ifndef SYNTHESIS @@ -862,7 +911,7 @@ module cpu_and_mem #( // ns16550a UART face (aliases native UART TX/RX). DLAB selects DLL/DLM. Ns16550ThrRbr: mmio_read_data_comb = ns_lcr[7] ? {24'b0, ns_dll} : {24'b0, i_uart_rx_data}; Ns16550IerDlm: mmio_read_data_comb = ns_lcr[7] ? {24'b0, ns_dlm} : {24'b0, ns_ier}; - Ns16550IirFcr: mmio_read_data_comb = {24'b0, 8'hC1}; // FIFO enabled, no int pending + Ns16550IirFcr: mmio_read_data_comb = {24'b0, ns_iir}; Ns16550Lcr: mmio_read_data_comb = {24'b0, ns_lcr}; Ns16550Mcr: mmio_read_data_comb = {24'b0, ns_mcr}; // LSR: TEMT|THRE from TX-ready (bits 6,5); DR from RX-valid (bit 0). 
@@ -930,12 +979,87 @@ module cpu_and_mem #( // write to UART (native 0x4000_0000 TX, or the ns16550 THR at 0x4000_1000 // when DLAB is clear -- both funnel into the same TX byte stream). always_ff @(posedge i_clk) begin - o_uart_wr_data <= data_memory_write_data_registered[7:0]; // UART uses only lower byte - o_uart_wr_en <= |data_memory_byte_write_enable_registered && + cpu_uart_wr_data <= data_memory_write_data_registered[7:0]; // UART uses only lower byte + cpu_uart_wr_en <= |data_memory_byte_write_enable_registered && ((data_memory_address_registered == UartMmioAddr) || (data_memory_address_registered == Ns16550ThrRbr && !ns_lcr[7])); end + generate + if (ENABLE_HANG_TRIAGE != 0) begin : gen_hang_triage + // On-silicon hang triage: classify a silent boot hang over UART. This is + // intentionally opt-in because it periodically takes over the console. + logic triage_active; + logic triage_wr_en; + logic [ 7:0] triage_wr_data; + logic [31:0] triage_mtime_lo; + logic [31:0] triage_mtime_hi; + logic [31:0] triage_mtimecmp_lo; + logic [31:0] triage_mtimecmp_hi; + logic [31:0] triage_mtimecmp_delta_lo; + logic [31:0] triage_irq_status; + always_ff @(posedge i_clk) begin + if (i_rst) begin + triage_mtime_lo <= 32'd0; + triage_mtime_hi <= 32'd0; + triage_mtimecmp_lo <= 32'd0; + triage_mtimecmp_hi <= 32'd0; + triage_mtimecmp_delta_lo <= 32'd0; + triage_irq_status <= 32'd0; + end else begin + triage_mtime_lo <= mtime[31:0]; + triage_mtime_hi <= mtime[63:32]; + triage_mtimecmp_lo <= mtimecmp[31:0]; + triage_mtimecmp_hi <= mtimecmp[63:32]; + triage_mtimecmp_delta_lo <= mtimecmp[31:0] - mtime[31:0]; + triage_irq_status <= { + 22'd0, + cpu_debug_irq_status[5], + cpu_debug_irq_status[4], + cpu_debug_irq_status[3:2], + cpu_debug_irq_status[1], + cpu_debug_irq_status[0], + interrupts.meip, + interrupts.msip, + interrupts.mtip, + mtip_comparison + }; + end + end + hang_triage u_hang_triage ( + .i_clk (i_clk), + .i_rst (i_rst), + .i_commit (commit_vld), + .i_timer_event 
(mtimecmp_write_pulse), + .i_cread_req (data_memory_cached_read_enable), + .i_cread_resp (data_memory_cached_read_valid), + .i_cwrite_req (|data_memory_cached_byte_write_enable), + .i_cwrite_done (data_memory_cached_write_done), + .i_pc (program_counter), + .i_commit0_valid (cpu_debug_commit_valid[0]), + .i_commit0_pc (cpu_debug_commit_pc), + .i_commit1_valid (cpu_debug_commit_valid[1]), + .i_commit1_pc (cpu_debug_commit_2_pc), + .i_mtime_lo (triage_mtime_lo), + .i_mtime_hi (triage_mtime_hi), + .i_mtimecmp_lo (triage_mtimecmp_lo), + .i_mtimecmp_hi (triage_mtimecmp_hi), + .i_mtimecmp_delta_lo(triage_mtimecmp_delta_lo), + .i_irq_status (triage_irq_status), + .i_uart_busy (cpu_uart_wr_en), + .i_uart_ready (i_uart_tx_ready), + .o_active (triage_active), + .o_wr_en (triage_wr_en), + .o_wr_data (triage_wr_data) + ); + assign o_uart_wr_en = triage_active ? triage_wr_en : cpu_uart_wr_en; + assign o_uart_wr_data = triage_active ? triage_wr_data : cpu_uart_wr_data; + end else begin : gen_no_hang_triage + assign o_uart_wr_en = cpu_uart_wr_en; + assign o_uart_wr_data = cpu_uart_wr_data; + end + endgenerate + // ns16550a register-file writes. DLAB (LCR[7]) routes offsets 0/4 to the // baud divisor (DLL/DLM); the THR write itself transmits via o_uart_wr_en. always_ff @(posedge i_clk) begin @@ -970,11 +1094,23 @@ module cpu_and_mem #( assign o_fifo1_wr_en = |data_memory_byte_write_enable_registered && data_memory_address_registered == Fifo1MmioAddr; + // Linux reads received bytes through the ns16550 RBR alias. That read must + // consume the shared UART RX FIFO just like the native FROST RX-data address, + // but only when DLAB is clear; with DLAB set, offset 0 is DLL. 
+ logic ns16550_rbr_read_pulse; + always_ff @(posedge i_clk) begin + if (i_rst) begin + ns16550_rbr_read_pulse <= 1'b0; + end else begin + ns16550_rbr_read_pulse <= mmio_read_pulse && (mmio_load_addr == Ns16550ThrRbr) && !ns_lcr[7]; + end + end + // FIFO/UART consume pulses fire one cycle after the MMIO read request is // accepted. The response data itself was already captured above. - assign o_fifo0_rd_en = mmio_fifo0_read_pulse; - assign o_fifo1_rd_en = mmio_fifo1_read_pulse; - assign o_uart_rx_ready = mmio_uart_rx_ready_pulse; + assign o_fifo0_rd_en = mmio_fifo0_read_pulse; + assign o_fifo1_rd_en = mmio_fifo1_read_pulse; + assign o_uart_rx_ready = mmio_uart_rx_ready_pulse || ns16550_rbr_read_pulse; // Timer register updates // mtime increments every clock cycle (provides wall-clock time) diff --git a/hw/rtl/cpu_and_mem/fetch_provider.sv b/hw/rtl/cpu_and_mem/fetch_provider.sv index 535af9c5..651a505c 100644 --- a/hw/rtl/cpu_and_mem/fetch_provider.sv +++ b/hw/rtl/cpu_and_mem/fetch_provider.sv @@ -70,6 +70,11 @@ module fetch_provider #( output logic [63:0] o_instr, output logic [riscv_pkg::ImemFetchSidebandWidth-1:0] o_instr_sideband, output logic o_instr_bank_sel_r, + // Full served-window address (its tag). if_stage uses this to detect a fetch + // stall that left pc_reg outside the served window (>1 word away), which the + // 1-bit bank_sel parity cannot represent -> wrong-word size sample / mid-insn + // pc_reg drift. Observe-only output; does not change fetch behaviour here. + output logic [31:0] o_served_addr, output logic o_instr_valid, // L1I line port (master; read-only -- write/wdata/wstrb tied inactive). 
@@ -221,6 +226,7 @@ module fetch_provider #( assign o_instr = ddr_instr_q; assign o_instr_sideband = ddr_sb_pair_q; assign o_instr_bank_sel_r = bank_sel_q; + assign o_served_addr = served_addr_q; // =========================================================================== // Miss engine: single-outstanding line fills + next-line prefetch diff --git a/hw/rtl/cpu_and_mem/hang_triage.sv b/hw/rtl/cpu_and_mem/hang_triage.sv new file mode 100644 index 00000000..f995d6b1 --- /dev/null +++ b/hw/rtl/cpu_and_mem/hang_triage.sv @@ -0,0 +1,354 @@ +/* + * Copyright 2026 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * hang_triage — on-silicon classifier for the silent boot hang. + * + * Trigger: the console UART goes quiet (every hang flavor stops the kernel + * printing). On a quiet stretch it streams ASCII over the UART and re-emits + * periodically so the trajectory is visible: + * + * "\n!!HANG c= t= q= v= w=" + * " l= h= m= n= x=" + * " y= d= p=" + * "\nH ... 
\n" + * + * c committed instructions climbing => busy-loop; frozen => wedge + * t mtimecmp writes (timer) frozen => timer service stopped + * q/v cached read req/resp q>v frozen => a DDR read never returned + * w cached write {req:done} packed as {req[15:0], done[15:0]}; req>done => a DDR write never landed + * l/h pc_lo..pc_hi PC range seen since last console output or previous dump + * r/s last retired PCs slot-1 / slot-2 commit PCs + * m/n mtime lo/hi CLINT time at snapshot + * x/y mtimecmp lo/hi CLINT compare at snapshot + * d mtimecmp-mtime low word; high bit set usually means compare is overdue + * p irq/status bits: + * [0]=raw mtime>=mtimecmp, [1]=registered MTIP, [2]=MSIP, [3]=MEIP, + * [4]=mie.MTIE, [5]=mstatus.MIE, [7:6]=priv, [8]=trap, [9]=mret + * H PC histogram, 64 buckets of 64 KiB keyed on pc[21:16] (kernel pc[31]=1) + * => cycle-weighted hot region of the livelock (bucket k = 0x8000_0000 + + * k*0x10000, aliased every 4 MiB). The hottest bucket localizes the spin to a 64 KiB window. + * + * Non-latching: any console write resets the quiet timer, PC window and histogram.
+ */ +module hang_triage #( + parameter logic [31:0] QUIET_CYCLES = 32'd400_000_000, // ~3 s @133 MHz + parameter logic [31:0] REEMIT_CYCLES = 32'd134_000_000 // ~1 s +) ( + input logic i_clk, + input logic i_rst, + + input logic i_commit, // each high cycle increments cnt_commit (field c) + input logic i_timer_event, // each high cycle increments cnt_timer (field t) + input logic i_cread_req, // counted into field q + input logic i_cread_resp, // counted into field v + input logic i_cwrite_req, // low 16 bits of the count form the upper half of field w + input logic i_cwrite_done, // low 16 bits of the count form the lower half of field w + input logic [31:0] i_pc, // histogrammed and tracked as pc_lo/pc_hi (fields l, h) + input logic i_commit0_valid, + input logic [31:0] i_commit0_pc, // latched while i_commit0_valid (field r) + input logic i_commit1_valid, + input logic [31:0] i_commit1_pc, // latched while i_commit1_valid (field s) + input logic [31:0] i_mtime_lo, + input logic [31:0] i_mtime_hi, + input logic [31:0] i_mtimecmp_lo, + input logic [31:0] i_mtimecmp_hi, + input logic [31:0] i_mtimecmp_delta_lo, + input logic [31:0] i_irq_status, + input logic i_uart_busy, // while high: quiet_cnt is held at 0 and the histogram is cleared + + input logic i_uart_ready, // a byte is emitted only in cycles where this is high + output logic o_active, // set when a snapshot is taken, cleared in EM_GAP + output logic o_wr_en, // registered strobe raised with each byte placed on o_wr_data + output logic [7:0] o_wr_data // ASCII byte being written +); + + // ---- Free-running event counters ------------------------------------------ + logic [31:0] cnt_commit, cnt_timer, cnt_cread_req, cnt_cread_resp; + logic [31:0] cnt_cwrite_req, cnt_cwrite_done; + always_ff @(posedge i_clk) begin + if (i_rst) begin + cnt_commit <= 32'd0; + cnt_timer <= 32'd0; + cnt_cread_req <= 32'd0; + cnt_cread_resp <= 32'd0; + cnt_cwrite_req <= 32'd0; + cnt_cwrite_done <= 32'd0; + end else begin + if (i_commit) cnt_commit <= cnt_commit + 32'd1; + if (i_timer_event) cnt_timer <= cnt_timer + 32'd1; + if (i_cread_req) cnt_cread_req <= cnt_cread_req + 32'd1; + if (i_cread_resp) cnt_cread_resp <= cnt_cread_resp + 32'd1; + if (i_cwrite_req) cnt_cwrite_req <= cnt_cwrite_req + 32'd1; + if (i_cwrite_done) cnt_cwrite_done <= cnt_cwrite_done + 32'd1; + end + end + + // ---- PC histogram: 64 x 64 KiB buckets, kernel PCs only ------------------- + logic [31:0] hist[64]; + logic [5:0] pc_bucket; + assign pc_bucket = i_pc[21:16]; // PCs that differ only above bit 21 share a bucket + always_ff @(posedge i_clk) begin + if (i_rst || i_uart_busy) begin + // Clear while the console is active so the histogram reflects ONLY the + // 
quiet (hang) window, not the pre-hang boot execution. + for (int b = 0; b < 64; b++) hist[b] <= 32'd0; + end else if (i_pc[31]) begin // count only kernel-range PCs + hist[pc_bucket] <= hist[pc_bucket] + 32'd1; + end + end + + // ---- Console-idle timer + PC window --------------------------------------- + logic [31:0] quiet_cnt; + logic [31:0] pc_lo, pc_hi; + logic [31:0] last_commit0_pc, last_commit1_pc; + logic win_reset; + always_ff @(posedge i_clk) begin + if (i_rst) begin + quiet_cnt <= 32'd0; + pc_lo <= 32'hFFFFFFFF; + pc_hi <= 32'h00000000; + last_commit0_pc <= 32'd0; + last_commit1_pc <= 32'd0; + end else if (i_uart_busy) begin + quiet_cnt <= 32'd0; + pc_lo <= i_pc; + pc_hi <= i_pc; + if (i_commit0_valid) last_commit0_pc <= i_commit0_pc; + if (i_commit1_valid) last_commit1_pc <= i_commit1_pc; + end else begin + if (quiet_cnt != 32'hFFFFFFFF) quiet_cnt <= quiet_cnt + 32'd1; // saturates instead of wrapping + if (i_commit0_valid) last_commit0_pc <= i_commit0_pc; + if (i_commit1_valid) last_commit1_pc <= i_commit1_pc; + if (win_reset) begin + pc_lo <= i_pc; + pc_hi <= i_pc; + end else begin + if (i_pc < pc_lo) pc_lo <= i_pc; + if (i_pc > pc_hi) pc_hi <= i_pc; + end + end + end + + // ---- Snapshot: latched in EM_IDLE when quiet_cnt >= QUIET_CYCLES ---------- + logic [31:0] snap_c, snap_t, snap_q, snap_v, snap_w, snap_l, snap_h, snap_r, snap_s; + logic [31:0] snap_m, snap_n, snap_x, snap_y, snap_d, snap_p; + + // ---- ASCII emit FSM ------------------------------------------------------- + typedef enum logic [2:0] { + EM_IDLE, // waiting for quiet_cnt to reach QUIET_CYCLES + EM_PREFIX, // banner bytes from prefix_byte() + EM_FIELD, // one "label=XXXXXXXX " group per snapshot field + EM_HPRE, // newline, "H", space + EM_HIST, // the 64 histogram words as hex + EM_GAP // REEMIT_CYCLES pause, then back to EM_IDLE + } em_state_e; + em_state_e em_state; + logic [3:0] pcnt; // byte index within the EM_PREFIX / EM_HPRE sequences + localparam logic [3:0] FieldLast = 4'd14; // index of the final field (p) + logic [ 3:0] fld; // field index, 0..FieldLast + logic [ 3:0] fpos; // 0 = label, 1 = '=', 2..9 = hex digits, 10 = space + logic [ 5:0] hidx; // histogram bucket being printed + logic [ 3:0] hpos; // 0..8 within a hist entry + logic [31:0] reemit_cnt; // EM_GAP down-counter + + assign win_reset = (em_state == EM_IDLE) && (quiet_cnt >= QUIET_CYCLES); // same condition that latches the snapshot + + function automatic logic [7:0] hex4(input logic [3:0] n); + hex4 = (n < 4'd10) ? 
(8'h30 + {4'b0, n}) : (8'h41 + {4'b0, n} - 8'd10); // 0..9 -> '0'..'9', 10..15 -> 'A'..'F' + endfunction + + function automatic logic [7:0] prefix_byte(input logic [3:0] i); + case (i) + 4'd0: prefix_byte = 8'h0A; // line feed + 4'd1: prefix_byte = "!"; + 4'd2: prefix_byte = "!"; + 4'd3: prefix_byte = "H"; + 4'd4: prefix_byte = "A"; + 4'd5: prefix_byte = "N"; + 4'd6: prefix_byte = "G"; + default: prefix_byte = " "; // index 7 and above + endcase + endfunction + + function automatic logic [7:0] label_byte(input logic [3:0] f); + case (f) + 4'd0: label_byte = "c"; + 4'd1: label_byte = "t"; + 4'd2: label_byte = "q"; + 4'd3: label_byte = "v"; + 4'd4: label_byte = "w"; + 4'd5: label_byte = "l"; + 4'd6: label_byte = "h"; + 4'd7: label_byte = "r"; + 4'd8: label_byte = "s"; + 4'd9: label_byte = "m"; + 4'd10: label_byte = "n"; + 4'd11: label_byte = "x"; + 4'd12: label_byte = "y"; + 4'd13: label_byte = "d"; + default: label_byte = "p"; + endcase + endfunction + + logic [31:0] fld_val; // snapshot word selected by fld (same order as label_byte) + always_comb begin + case (fld) + 4'd0: fld_val = snap_c; + 4'd1: fld_val = snap_t; + 4'd2: fld_val = snap_q; + 4'd3: fld_val = snap_v; + 4'd4: fld_val = snap_w; + 4'd5: fld_val = snap_l; + 4'd6: fld_val = snap_h; + 4'd7: fld_val = snap_r; + 4'd8: fld_val = snap_s; + 4'd9: fld_val = snap_m; + 4'd10: fld_val = snap_n; + 4'd11: fld_val = snap_x; + 4'd12: fld_val = snap_y; + 4'd13: fld_val = snap_d; + default: fld_val = snap_p; + endcase + end + + logic [3:0] nib_idx; + always_comb begin + nib_idx = 4'd0; + if (fpos >= 4'd2 && fpos <= 4'd9) nib_idx = 4'd9 - fpos; // fpos 2 -> nibble 7: most significant first + end + + logic [3:0] hnib_idx; + always_comb begin + hnib_idx = 4'd0; + if (hpos <= 4'd7) hnib_idx = 4'd7 - hpos; // most significant nibble first + end + + logic [7:0] emit_byte; // byte that the current state would write + always_comb begin + emit_byte = 8'h20; + unique case (em_state) + EM_PREFIX: emit_byte = prefix_byte(pcnt); + EM_FIELD: begin + if (fpos == 4'd0) emit_byte = label_byte(fld); + else if (fpos == 4'd1) emit_byte = "="; + else if (fpos == 4'd10) emit_byte = 8'h20; + else emit_byte = hex4(fld_val[nib_idx*4+:4]); + end + EM_HPRE: emit_byte = (pcnt == 4'd0) ? 
8'h0A : ((pcnt == 4'd1) ? "H" : " "); + EM_HIST: + emit_byte = (hpos == 4'd8) ? ((hidx == 6'd63) ? 8'h0A : 8'h20) : + hex4(hist[hidx][hnib_idx*4+:4]); + default: emit_byte = 8'h20; + endcase + end + + always_ff @(posedge i_clk) begin + if (i_rst) begin + em_state <= EM_IDLE; + pcnt <= 4'd0; + fld <= 4'd0; + fpos <= 4'd0; + hidx <= 6'd0; + hpos <= 4'd0; + reemit_cnt <= 32'd0; + o_active <= 1'b0; + o_wr_en <= 1'b0; + o_wr_data <= 8'd0; + end else begin + o_wr_en <= 1'b0; // default low; overridden below whenever a byte is written + case (em_state) + EM_IDLE: begin + if (quiet_cnt >= QUIET_CYCLES) begin + snap_c <= cnt_commit; + snap_t <= cnt_timer; + snap_q <= cnt_cread_req; + snap_v <= cnt_cread_resp; + snap_w <= {cnt_cwrite_req[15:0], cnt_cwrite_done[15:0]}; + snap_l <= pc_lo; + snap_h <= pc_hi; + snap_r <= last_commit0_pc; + snap_s <= last_commit1_pc; + snap_m <= i_mtime_lo; + snap_n <= i_mtime_hi; + snap_x <= i_mtimecmp_lo; + snap_y <= i_mtimecmp_hi; + snap_d <= i_mtimecmp_delta_lo; + snap_p <= i_irq_status; + o_active <= 1'b1; + pcnt <= 4'd0; + em_state <= EM_PREFIX; + end + end + EM_PREFIX: + if (i_uart_ready) begin + o_wr_en <= 1'b1; + o_wr_data <= emit_byte; + if (pcnt == 4'd7) begin // eighth (last) banner byte + fld <= 4'd0; + fpos <= 4'd0; + em_state <= EM_FIELD; + end else pcnt <= pcnt + 4'd1; + end + EM_FIELD: + if (i_uart_ready) begin + o_wr_en <= 1'b1; + o_wr_data <= emit_byte; + if (fpos == 4'd10) begin // trailing space written: field complete + if (fld == FieldLast) begin + pcnt <= 4'd0; + em_state <= EM_HPRE; + end else begin + fld <= fld + 4'd1; + fpos <= 4'd0; + end + end else fpos <= fpos + 4'd1; + end + EM_HPRE: + if (i_uart_ready) begin + o_wr_en <= 1'b1; + o_wr_data <= emit_byte; + if (pcnt == 4'd2) begin + hidx <= 6'd0; + hpos <= 4'd0; + em_state <= EM_HIST; + end else pcnt <= pcnt + 4'd1; + end + EM_HIST: + if (i_uart_ready) begin + o_wr_en <= 1'b1; + o_wr_data <= emit_byte; + if (hpos == 4'd8) begin // separator written: bucket complete + if (hidx == 6'd63) begin + em_state <= EM_GAP; + reemit_cnt <= REEMIT_CYCLES; + end else begin + hidx <= hidx + 6'd1; + hpos <= 4'd0; + end + end else hpos <= hpos + 
4'd1; + end + EM_GAP: begin // pause between reports; nothing is written here + o_active <= 1'b0; + if (reemit_cnt <= 32'd1) em_state <= EM_IDLE; + else reemit_cnt <= reemit_cnt - 32'd1; + end + default: em_state <= EM_IDLE; + endcase + end + end + +endmodule : hang_triage diff --git a/hw/rtl/frost.sv b/hw/rtl/frost.sv index 8bb657e3..c102081f 100644 --- a/hw/rtl/frost.sv +++ b/hw/rtl/frost.sv @@ -62,7 +62,9 @@ module frost #( // them to their DDR controller subsystem). parameter int unsigned USE_BEHAVIORAL_DDR = 1, // Simulation-only fetch-latency fuzz (see cpu_and_mem). Hardware keeps 0. - parameter int unsigned FETCH_VALID_FUZZ = 0 + parameter int unsigned FETCH_VALID_FUZZ = 0, + // Optional on-silicon boot-hang classifier that can emit over UART. + parameter int unsigned ENABLE_HANG_TRIAGE = 0 ) ( input logic i_clk, input logic i_clk_div4, @@ -196,7 +198,8 @@ module frost #( .DDR_MODEL_BYTES(DDR_MODEL_BYTES), .DDR_MODEL_LATENCY(DDR_MODEL_LATENCY), .USE_BEHAVIORAL_DDR(USE_BEHAVIORAL_DDR), - .FETCH_VALID_FUZZ(FETCH_VALID_FUZZ) + .FETCH_VALID_FUZZ(FETCH_VALID_FUZZ), + .ENABLE_HANG_TRIAGE(ENABLE_HANG_TRIAGE) ) cpu_and_memory_subsystem ( .i_clk, .i_clk_div4, diff --git a/hw/rtl/peripherals/uart_rx.sv b/hw/rtl/peripherals/uart_rx.sv index bb62bed8..8f646e72 100644 --- a/hw/rtl/peripherals/uart_rx.sv +++ b/hw/rtl/peripherals/uart_rx.sv @@ -113,7 +113,7 @@ module uart_rx #( STATE_DATA_BITS: begin // Move to stop bit after all 8 data bits received - if (baud_rate_prescaler_counter == 0 && bits_remaining_counter == 0) begin + if (baud_rate_prescaler_counter == 0 && bits_remaining_counter == 1) begin next_state = STATE_STOP_BIT; end end diff --git a/hw/sim/cpu_tb.sv b/hw/sim/cpu_tb.sv index d360a0da..d1b5f133 100644 --- a/hw/sim/cpu_tb.sv +++ b/hw/sim/cpu_tb.sv @@ -48,16 +48,50 @@ module cpu_tb ); // Internal signals (names match CPU port names for wildcard connection) - logic [31:0] i_instr; // Registered instruction fed to CPU (raw 32-bit for C extension) - logic [1:0] i_instr_sideband; // Predecode: {is_compressed_hi, 
is_compressed_lo} + // 64-bit fetch window {next_word, current_word} (the CPU fetches a word pair). + logic [63:0] i_instr; + // Per-32-bit-word predecode sideband (ImemSidebandWidth bits each half). + logic [riscv_pkg::ImemFetchSidebandWidth-1:0] i_instr_sideband; + logic i_instr_bank_sel_r; // Fetch-word parity (pc_reg[2]) for the window + logic i_instr_valid; // Fetch window valid (tie 1: fixed 1-cycle provider) logic [31:0] i_data_mem_rd_data; // Data memory read data to CPU logic pipeline_stall_from_cpu; // Stall signal monitoring (registered, 1-cycle delay) logic pipeline_stall_comb; // Stall signal (combinational, immediate) logic reset_to_cpu; // Reset signal monitoring - logic o_mmio_read_pulse; // Unused in testbench; required for CPU .* connection - logic [31:0] o_mmio_load_addr; // Unused in testbench; required for CPU .* connection - logic o_mmio_load_valid; // Unused in testbench; required for CPU .* connection - logic o_pipeline_stall; // Unused in testbench; required for CPU .* connection + + // Registered 1-cycle fetch state (mimics block-RAM instruction memory latency) + logic [31:0] tb_cur_word; // current fetch word presented to the CPU + logic tb_bank_sel_q; // parity (PC[2]) of the fetched address + localparam logic [31:0] TbNop = 32'h0000_0013; // addi x0,x0,0 + + // Ports below are unused by this instruction-feed testbench but must exist as + // local signals so the wildcard (.*) connection to cpu_ooo resolves. 
+ logic o_mmio_read_pulse; + logic [31:0] o_mmio_load_addr; + logic o_mmio_load_valid; + logic o_mmio_fifo0_read_pulse; + logic o_mmio_fifo1_read_pulse; + logic o_mmio_uart_rx_ready_pulse; + logic o_pipeline_stall; + logic o_fetch_replay_consume; + // FENCE.I cache-sync handshake (no I-cache here; completed immediately below) + logic o_fence_i_sync_req; + logic i_fence_i_sync_done; + logic o_fence_i_flush; + // Cached (high-address) tier request outputs + response inputs (tied idle: + // the directed programs touch only the low BRAM range, never CACHED_BASE). + logic [3:0] o_data_mem_cached_byte_wr_en; + logic [31:0] o_data_mem_cached_wr_data; + logic o_data_mem_cached_read_enable; + logic [31:0] i_cached_read_data; + logic i_cached_read_valid; + logic i_cached_write_done; + logic i_cached_write_inflight; + // Debug taps (read from cocotb via device_under_test.*; also exposed here). + logic [5:0] o_debug_irq_status; + logic [31:0] o_debug_commit_pc; + logic [31:0] o_debug_commit_2_pc; + logic [1:0] o_debug_commit_valid; // Interrupt and timer signals for CPU (controllable from testbench) // Use reg type to allow testbench to drive values via force/deposit @@ -81,14 +115,37 @@ module cpu_tb always_ff @(posedge i_clk) begin // Stall signal from CPU observed on next rising edge pipeline_stall_from_cpu <= device_under_test.pipeline_ctrl.stall; - // Mimic one cycle read latency of block RAM instruction memory port - i_instr <= instruction_from_testbench; - // Compute sideband: {is_compressed_hi, is_compressed_lo} - // A halfword is compressed when its low 2 bits != 2'b11 - i_instr_sideband[0] <= (instruction_from_testbench[1:0] != 2'b11); - i_instr_sideband[1] <= (instruction_from_testbench[17:16] != 2'b11); + // Mimic one cycle read latency of block RAM instruction memory port: the + // word for the address requested on o_pc this cycle is presented next cycle. 
+ tb_cur_word <= instruction_from_testbench; + tb_bank_sel_q <= o_pc[2]; // parity of the fetched address end + // 64-bit fetch window {next_word, current_word}. The testbench feeds only + // 32-bit, 4-byte-aligned instructions (no compressed, no halfword spanning), + // so the "next word" half is never consumed (spanning only fires at pc[1]); + // drive a NOP there. + assign i_instr = {TbNop, tb_cur_word}; + // Per-word predecode sideband, computed by the same pure function the RTL + // fetch path uses (riscv_pkg::imem_make_sideband; no lookahead). + assign i_instr_sideband = { + riscv_pkg::imem_make_sideband(TbNop), riscv_pkg::imem_make_sideband(tb_cur_word) + }; + // bank_sel_r == pc_reg[2] => aligned: current word taken from i_instr[31:0]. + assign i_instr_bank_sel_r = tb_bank_sel_q; + // Fixed 1-cycle provider: the fetch window is always valid. + assign i_instr_valid = 1'b1; + + // FENCE.I cache-sync handshake completes immediately (no I-cache here; the + // directed programs never issue FENCE.I, so o_fence_i_sync_req stays low). + assign i_fence_i_sync_done = o_fence_i_sync_req; + + // Cached (high-address) tier response inputs tied inactive (tier unused). + assign i_cached_read_data = '0; + assign i_cached_read_valid = 1'b0; + assign i_cached_write_done = 1'b0; + assign i_cached_write_inflight = 1'b0; + // Memory addressing parameters localparam int unsigned MemByteAddrWidth = $clog2(MEM_SIZE_BYTES); localparam int unsigned MemWordAddrWidth = MemByteAddrWidth - 2; diff --git a/sw/apps/ddr_atomic_test/main.c b/sw/apps/ddr_atomic_test/main.c index dd39f8ba..8e12286f 100644 --- a/sw/apps/ddr_atomic_test/main.c +++ b/sw/apps/ddr_atomic_test/main.c @@ -49,6 +49,16 @@ static void puts_(const char *s) /* Lives in the cached DDR region. 
*/ __attribute__((section(".ddr_data"))) static volatile uint32_t ddr_var = 0x10; +struct pde_like { + uint32_t in_use; + uint32_t refcnt; + uint8_t pad[88]; + uint16_t mode; + uint8_t flags; + uint8_t namelen; + uint32_t tail; +}; +__attribute__((section(".ddr_data"))) static volatile struct pde_like ddr_pde_like; int main(void) { @@ -66,6 +76,11 @@ int main(void) /* 2. AMO to DDR (amoadd.w). Hangs here if AMO-to-cached deadlocks. */ uint32_t old_amo; __asm__ volatile("amoadd.w %0, %2, (%1)" : "=r"(old_amo) : "r"(&ddr_var), "r"(1u) : "memory"); + if (old_amo != 0x20) { + puts_("\r\n<> amo old value\r\n"); + for (;;) { + } + } if (ddr_var != 0x21) { puts_("\r\n<> amo result\r\n"); for (;;) { @@ -73,6 +88,46 @@ int main(void) } putc_('A'); + /* 2b. Refcount-like repeated AMO increments: validate both old and new values. */ + ddr_var = 1; + for (uint32_t i = 0; i < 256; i++) { + uint32_t old_loop; + __asm__ volatile("amoadd.w %0, %2, (%1)" + : "=r"(old_loop) + : "r"(&ddr_var), "r"(1u) + : "memory"); + if (old_loop != i + 1 || ddr_var != i + 2) { + puts_("\r\n<> amo loop value\r\n"); + for (;;) { + } + } + } + putc_('R'); + + /* 2c. Proc-dir-entry-like layout: AMO at +4 must not corrupt mode at +96. */ + ddr_pde_like.in_use = 0x11111111u; + ddr_pde_like.refcnt = 1u; + ddr_pde_like.mode = 0x8124u; + ddr_pde_like.flags = 0x5au; + ddr_pde_like.namelen = 7u; + ddr_pde_like.tail = 0xa5a55a5au; + for (uint32_t i = 0; i < 256; i++) { + uint32_t old_ref; + __asm__ volatile("amoadd.w %0, %2, (%1)" + : "=r"(old_ref) + : "r"(&ddr_pde_like.refcnt), "r"(1u) + : "memory"); + if (old_ref != i + 1 || ddr_pde_like.refcnt != i + 2 || + ddr_pde_like.in_use != 0x11111111u || ddr_pde_like.mode != 0x8124u || + ddr_pde_like.flags != 0x5au || ddr_pde_like.namelen != 7u || + ddr_pde_like.tail != 0xa5a55a5au) { + puts_("\r\n<> amo struct corruption\r\n"); + for (;;) { + } + } + } + putc_('P'); + /* 3. LR/SC compare-exchange to DDR (matches the kernel's sc.w.rl). 
*/ uint32_t prev; __asm__ volatile("1: lr.w %0, (%1)\n" diff --git a/sw/apps/drain_trapframe_test/Makefile b/sw/apps/drain_trapframe_test/Makefile new file mode 100644 index 00000000..1150cd86 --- /dev/null +++ b/sw/apps/drain_trapframe_test/Makefile @@ -0,0 +1,20 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Trap-frame store-visibility-under-L1D-eviction directed test ("Bug B" relocated +# to pt_regs s2). Force the whole program into cached DDR so the trap-frame save +# store and the conflicting eviction accesses all traverse the L1D -> DDR path. +override MEM_CONFIG := ddr +SRC_C := ../../lib/src/uart.c main.c +include ../../common/common.mk diff --git a/sw/apps/drain_trapframe_test/main.c b/sw/apps/drain_trapframe_test/main.c new file mode 100644 index 00000000..038ae846 --- /dev/null +++ b/sw/apps/drain_trapframe_test/main.c @@ -0,0 +1,511 @@ +/* + * Copyright 2026 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Directed test for "Bug B" relocated to the kernel trap frame (pt_regs): + * TRAP-FRAME STORE-VISIBILITY UNDER L1D CACHE EVICTION. + * + * Real-world failure being reproduced: at a procfs panic the callee-saved + * register s2 came back as 0x19999998 (a value name_to_int materialises) + * instead of its proper pointer, after a machine-timer interrupt fired during + * active kernel code. Suspected mechanism (same class as the fence.i/SMC "store + * leaves the SQ when sent, not when landed" bug): the naked trap entry saves + * GPRs to the kernel stack ("sw s2, 72(sp)"); if that committed store is + * considered drained from the store queue BEFORE its data physically lands in + * the write-back L1D, and cache pressure during the handler EVICTS that stack + * line (writing STALE data back to DDR), then the trap exit "lw s2, 72(sp)" + * refills from DDR and restores s2 WRONG. + * + * Construction (full frost SoC, cached/DDR tier, MEM_CONFIG=ddr): + * - A faithful Linux-style naked trap entry saves the full pt_regs frame to a + * cached-DDR "kernel stack" at a FIXED line-aligned address (FRAME_BASE), so + * the exact L1D set holding the saved s2 word is known and can be evicted + * deterministically. s2 sits at offset 72, exactly as in the real handler. + * - Before each interrupt the frame's s2 cache line is PRE-POISONED with + * 0x19999998 (the real failing value), so a non-landed save read-back yields + * EXACTLY the real-world wrong value. + * - A cold-miss DDR drain store is issued just before the IRQ window (like + * wfi_drain_mepc_test) so a store is in flight / the memory subsystem is + * busy when the trap is taken. + * - AFTER saving the frame (s2 stored LAST, immediately before the eviction) + * the handler AGGRESSIVELY EVICTS the saved s2 line by striding through + * cached-DDR addresses that map to the SAME L1D set. 
The L1D is 128 KiB + * DIRECT-MAPPED with 32-byte lines (hw/rtl/lib/cache/frost_cache.sv, + * L1_CACHE_BYTES=128*1024), so address A and A + 0x20000 collide in one set. + * - The handler then reads the s2 slot back (the load under test) and checks. + * + * Discriminator (the key result): + * code=29 : the incoming ARCHITECTURAL s2 was already wrong (precise-state / + * rename corruption) -- not the target bug. + * code=30 : the SAVED frame value was already wrong BEFORE eviction + * (the store never became visible at all). + * code=31 : the saved frame was CORRECT before eviction but the post-eviction + * read-back is WRONG ==> the store/eviction memory-visibility bug. + * THIS is the targeted reproduction. + * + * The timer margin is swept finely (0..255) so the IRQ lands at every offset + * across the drain+handler window; the per-margin "gap" (filler between the s2 + * store and the eviction) is also swept (0..15) to sample the in-flight window. + * Resume is via a fixed continuation (the handler redirects mepc), so a wrong + * mepc is never fatal. Run with CACHED_HAS_L2=0 (Genesys2 / HW-faithful shape, + * where a cold write-back actually drains) and DDR_MODEL_LATENCY>=70. + * + * PASS -> prints <<PASS>> (no margin ever corrupts a restored register). + * FAIL -> prints <<FAIL>> with code + margin + expected/actual (e.g. s2). + */ + +#include <stdint.h> + +#include "trap.h" +#include "uart.h" + +/* ---- L1D geometry (frost_cache.sv: 128 KiB direct-mapped, 32 B lines) ---- */ +#define L1D_STRIDE 0x00020000u /* 128 KiB: A and A+stride share one set */ +#define N_EVICT 6u /* conflicting lines touched per eviction */ + +/* ---- Fixed cached-DDR "kernel stack" for the trap frame (line aligned) ---- */ +#define FRAME_BASE 0x82000000u /* the naked asm hard-codes this and the derived addresses as literals */ +#define FRAME_TOP (FRAME_BASE + 144u) /* pt_regs is 144 bytes; sp on entry */ +#define S2_LINE_BASE (FRAME_BASE + 64u) /* 32 B line holding s2@72 (64..95) */ + +/* Cold DDR region for the per-margin in-flight drain store. Chosen so its L1D + * sets (2048..) 
never collide with the frame's s2 set (2), and far from the + * program and the frame. */ +#define DRAIN_BASE 0x83010000u +#define DRAIN_LINE 64u /* byte spacing between consecutive margins' drain addresses */ + +#define MARGIN_MIN 0u +#define MARGIN_MAX 255u + +#define POISON_S2 0x19999998u /* the real name_to_int value */ + +/* Globals referenced by name from the naked asm (kept non-static, used). */ +uint32_t g_s2_target; /* &g_s2_target is the pointer-like correct s2 value */ + +volatile uint32_t g_ticks; /* set to 1 by the handler; polled by irq_window */ +volatile uint32_t g_irq_count; /* incremented once per handler entry */ +volatile uint32_t g_expected_s2; /* value s2 must hold at trap entry and in the saved frame */ +volatile uint32_t g_gap; /* delay-loop iterations between the s2 store and the eviction */ +volatile uint32_t g_timer_margin; /* added to the mtime low word when arming mtimecmp */ +volatile uint32_t g_drain_addr; /* address written by this margin's drain store */ +volatile uint32_t g_cont; /* fixed continuation PC for the handler */ +volatile uint32_t g_cret; /* irq_window() return address into C */ +volatile uint32_t g_csp; /* irq_window() caller stack pointer */ +volatile uint32_t g_save_s[12]; /* main's callee-saved s0..s11 spill */ + +volatile uint32_t g_last_code; /* first discriminator code recorded by the handler (0 = none) */ +volatile uint32_t g_last_reg; /* s-register number that mismatched (2, 3 or 4) */ +volatile uint32_t g_last_expected; /* value the handler expected for that register */ +volatile uint32_t g_last_actual; /* value the handler actually observed */ + +/* + * Naked M-mode timer trap entry. Faithful Linux-style pt_regs save/restore to a + * cached-DDR "kernel stack" (sp == FRAME_TOP, set by irq_window), with s2 saved + * LAST (immediately before the eviction) and the saved s2 line then evicted from + * the direct-mapped L1D. Records the discriminator codes. Resumes via the fixed + * continuation in g_cont so a wrong mepc cannot wedge the sweep. 
+ */ +__attribute__((naked, used, aligned(4))) static void trapframe_irq_entry(void) +{ + __asm__ volatile("addi sp, sp, -144\n" /* allocate the 144-byte frame */ + /* ---- save the frame (everything EXCEPT s2 first) ---- */ + "sw ra, 4(sp)\n" + "sw gp, 12(sp)\n" + "sw tp, 16(sp)\n" + "sw t0, 20(sp)\n" + "sw t1, 24(sp)\n" + "sw t2, 28(sp)\n" + "sw s0, 32(sp)\n" + "sw s1, 36(sp)\n" + "sw a0, 40(sp)\n" + "sw a1, 44(sp)\n" + "sw a2, 48(sp)\n" + "sw a3, 52(sp)\n" + "sw a4, 56(sp)\n" + "sw a5, 60(sp)\n" + "sw a6, 64(sp)\n" + "sw a7, 68(sp)\n" + "sw s3, 76(sp)\n" + "sw s4, 80(sp)\n" + "sw s5, 84(sp)\n" + "sw s6, 88(sp)\n" + "sw s7, 92(sp)\n" + "sw s8, 96(sp)\n" + "sw s9, 100(sp)\n" + "sw s10, 104(sp)\n" + "sw s11, 108(sp)\n" + "sw t3, 112(sp)\n" + "sw t4, 116(sp)\n" + "sw t5, 120(sp)\n" + "sw t6, 124(sp)\n" + "csrr t0, mepc\n" + "sw t0, 0(sp)\n" /* mepc -> frame offset 0 */ + "csrr t0, mstatus\n" + "sw t0, 128(sp)\n" /* mstatus -> frame offset 128 */ + /* preload the gap count into a saved scratch (t4) so the s2-store -> + * eviction distance is ALU-only and not perturbed by a memory read */ + "la t4, g_gap\n" + "lw t4, 0(t4)\n" + /* ---- code=29: incoming architectural s2 vs expected (precise state) */ + "la t0, g_expected_s2\n" + "lw t0, 0(t0)\n" + "beq s2, t0, 1f\n" + "la t1, g_last_code\n" + "lw t2, 0(t1)\n" + "bnez t2, 1f\n" /* a code is already recorded: keep the first one */ + "li t2, 29\n" + "sw t2, 0(t1)\n" + "la t1, g_last_reg\n" + "li t2, 2\n" + "sw t2, 0(t1)\n" + "la t1, g_last_expected\n" + "sw t0, 0(t1)\n" + "la t1, g_last_actual\n" + "sw s2, 0(t1)\n" + "1:\n" + /* ================= STORE UNDER TEST: sw s2, 72(sp) ================= */ + "sw s2, 72(sp)\n" + /* ---- tunable gap (ALU only) ---- */ + "2:\n" + "beqz t4, 3f\n" + "addi t4, t4, -1\n" + "j 2b\n" + "3:\n" + /* ---- code=30: saved value BEFORE eviction (forwards from SQ if the + * store is still in flight; reads L1D otherwise) ---- */ + "lw t0, 72(sp)\n" + "la t1, g_expected_s2\n" + "lw t1, 0(t1)\n" + "beq t0, t1, 4f\n" + "la t2, g_last_code\n" + "lw t3, 0(t2)\n" + "bnez t3, 4f\n" + "li t3, 30\n" + "sw t3, 0(t2)\n" + "la t2, g_last_reg\n" + "li t3, 
2\n" + "sw t3, 0(t2)\n" + "la t2, g_last_expected\n" + "sw t1, 0(t2)\n" + "la t2, g_last_actual\n" + "sw t0, 0(t2)\n" + "4:\n" + /* ---- EVICT the saved s2 line: stride by the L1D size so every access + * maps to the SAME set with a different tag (direct-mapped), evicting + * and writing back the just-stored dirty frame line ---- */ + "li t1, 0x82000040\n" /* S2_LINE_BASE */ + "li t2, 0x20000\n" /* L1D_STRIDE */ + "li t3, 6\n" /* N_EVICT */ + "5:\n" + "lw t5, 0(t1)\n" /* first pass reads S2_LINE_BASE itself; the other five add one stride each */ + "add t1, t1, t2\n" + "addi t3, t3, -1\n" + "bnez t3, 5b\n" + /* ============ LOAD UNDER TEST: lw s2, 72(sp) (post-evict) ========== + * line was evicted -> this misses -> refills from DDR -> sees whatever + * the eviction wrote back. code=31 if it differs (the targeted bug). */ + "lw t0, 72(sp)\n" + "la t1, g_expected_s2\n" + "lw t1, 0(t1)\n" + "beq t0, t1, 6f\n" + "la t2, g_last_code\n" + "lw t3, 0(t2)\n" + "bnez t3, 6f\n" + "li t3, 31\n" + "sw t3, 0(t2)\n" + "la t2, g_last_reg\n" + "li t3, 2\n" + "sw t3, 0(t2)\n" + "la t2, g_last_expected\n" + "sw t1, 0(t2)\n" + "la t2, g_last_actual\n" + "sw t0, 0(t2)\n" + "6:\n" + /* ---- supporting witnesses on the same line: s3@76, s4@80 ---- */ + "lw t0, 76(sp)\n" + "li t1, 0x51000003\n" /* s3 sentinel loaded by irq_window */ + "beq t0, t1, 7f\n" + "la t2, g_last_code\n" + "lw t3, 0(t2)\n" + "bnez t3, 7f\n" + "li t3, 31\n" + "sw t3, 0(t2)\n" + "la t2, g_last_reg\n" + "li t3, 3\n" + "sw t3, 0(t2)\n" + "la t2, g_last_expected\n" + "sw t1, 0(t2)\n" + "la t2, g_last_actual\n" + "sw t0, 0(t2)\n" + "7:\n" + "lw t0, 80(sp)\n" + "li t1, 0x51000004\n" /* s4 sentinel loaded by irq_window */ + "beq t0, t1, 8f\n" + "la t2, g_last_code\n" + "lw t3, 0(t2)\n" + "bnez t3, 8f\n" + "li t3, 31\n" + "sw t3, 0(t2)\n" + "la t2, g_last_reg\n" + "li t3, 4\n" + "sw t3, 0(t2)\n" + "la t2, g_last_expected\n" + "sw t1, 0(t2)\n" + "la t2, g_last_actual\n" + "sw t0, 0(t2)\n" + "8:\n" + /* ---- side effects (scratch t0..t2, restored below) ---- */ + "li t1, 0x4000001C\n" /* MTIMECMP_HI := -1 : disarm so no refire */ + "li t0, -1\n" + "sw t0, 0(t1)\n" + "la t1, 
g_ticks\n" + "li t0, 1\n" + "sw t0, 0(t1)\n" /* g_ticks = 1: irq_window's poll loop exits on this */ + "la t1, g_irq_count\n" + "lw t0, 0(t1)\n" + "addi t0, t0, 1\n" + "sw t0, 0(t1)\n" + "la t1, g_cont\n" /* fixed continuation -> robust to a bad mepc */ + "lw t0, 0(t1)\n" + "csrw mepc, t0\n" + "lw t0, 128(sp)\n" /* mstatus value saved at entry */ + "csrw mstatus, t0\n" + /* ---- restore the frame (faithful trap exit) ---- */ + "lw ra, 4(sp)\n" + "lw gp, 12(sp)\n" + "lw tp, 16(sp)\n" + "lw s0, 32(sp)\n" + "lw s1, 36(sp)\n" + "lw a0, 40(sp)\n" + "lw a1, 44(sp)\n" + "lw a2, 48(sp)\n" + "lw a3, 52(sp)\n" + "lw a4, 56(sp)\n" + "lw a5, 60(sp)\n" + "lw a6, 64(sp)\n" + "lw a7, 68(sp)\n" + "lw s2, 72(sp)\n" + "lw s3, 76(sp)\n" + "lw s4, 80(sp)\n" + "lw s5, 84(sp)\n" + "lw s6, 88(sp)\n" + "lw s7, 92(sp)\n" + "lw s8, 96(sp)\n" + "lw s9, 100(sp)\n" + "lw s10, 104(sp)\n" + "lw s11, 108(sp)\n" + "lw t3, 112(sp)\n" + "lw t4, 116(sp)\n" + "lw t5, 120(sp)\n" + "lw t6, 124(sp)\n" + "lw t0, 20(sp)\n" /* t0..t2 restored last: they were scratch up to here */ + "lw t1, 24(sp)\n" + "lw t2, 28(sp)\n" + "addi sp, sp, 144\n" + "mret\n"); +} + +/* + * Naked per-margin window. Preserves main's callee-saved registers, sets up the + * cached-DDR frame stack + poison + drain store, arms the timer, loads the s0.. + * s11 sentinels, enables MIE, and spins until the handler fires. The handler + * redirects mepc to label 9 (the fixed continuation). Reads its per-margin + * inputs (g_timer_margin, g_gap, g_drain_addr, g_expected_s2) from globals set + * by C before the call. 
+ */ +__attribute__((naked, used, noinline)) static void irq_window(void) +{ + __asm__ volatile( + /* preserve main's callee-saved s0..s11 (we clobber them with sentinels) */ + "la t0, g_save_s\n" + "sw s0, 0(t0)\n" + "sw s1, 4(t0)\n" + "sw s2, 8(t0)\n" + "sw s3, 12(t0)\n" + "sw s4, 16(t0)\n" + "sw s5, 20(t0)\n" + "sw s6, 24(t0)\n" + "sw s7, 28(t0)\n" + "sw s8, 32(t0)\n" + "sw s9, 36(t0)\n" + "sw s10, 40(t0)\n" + "sw s11, 44(t0)\n" + "la t0, g_csp\n" + "sw sp, 0(t0)\n" + "la t0, g_cret\n" + "sw ra, 0(t0)\n" + /* fixed continuation for the handler's mepc redirect */ + "la t0, g_cont\n" + "la t1, 9f\n" + "sw t1, 0(t0)\n" + "la t0, g_ticks\n" + "sw x0, 0(t0)\n" /* g_ticks = 0 before arming */ + /* faithful kernel stack pointer: handler does sw s2, 72(sp) */ + "li sp, 0x82000090\n" /* FRAME_TOP */ + /* PRE-POISON the frame's s2 line so a non-landed save reads a stale + * value; s2 slot gets 0x19999998 (the real name_to_int value). */ + "li t0, 0x82000000\n" /* FRAME_BASE */ + "li t1, 0x19999998\n" + "sw t1, 72(t0)\n" + "li t1, 0x19999993\n" + "sw t1, 76(t0)\n" + "li t1, 0x19999994\n" + "sw t1, 80(t0)\n" + "li t1, 0x19999995\n" + "sw t1, 84(t0)\n" + "li t1, 0x19999996\n" + "sw t1, 88(t0)\n" + "li t1, 0x19999997\n" + "sw t1, 92(t0)\n" + /* COLD-MISS DRAIN STORE: a fresh DDR line, in flight when the IRQ hits */ + "la t0, g_drain_addr\n" + "lw t0, 0(t0)\n" + "li t1, 0xD2A14000\n" + "sw t1, 0(t0)\n" + /* ARM the timer: mtimecmp = mtime + margin */ + "la t0, g_timer_margin\n" + "lw t0, 0(t0)\n" + "li t2, 0x40000010\n" /* MTIME_LO base */ + "lw t3, 4(t2)\n" /* mtime hi (0x14) */ + "lw t4, 0(t2)\n" /* mtime lo (0x10) */ + "add t4, t4, t0\n" /* low word only: a carry out is not added to the high word */ + "li t1, 0x40000018\n" /* MTIMECMP_LO base */ + "li t5, -1\n" + "sw t5, 4(t1)\n" /* MTIMECMP_HI = max (0x1C) */ + "sw t4, 0(t1)\n" /* MTIMECMP_LO (0x18) */ + "sw t3, 4(t1)\n" /* MTIMECMP_HI = hi (0x1C) */ + /* sentinels into s0..s11 (s2 = pointer-like expected) -- LAST */ + "li s0, 0x51000000\n" + "li s1, 0x51000001\n" + "la s2, g_s2_target\n" + "li s3, 0x51000003\n" + 
"li s4, 0x51000004\n" + "li s5, 0x51000005\n" + "li s6, 0x51000006\n" + "li s7, 0x51000007\n" + "li s8, 0x51000008\n" + "li s9, 0x51000009\n" + "li s10, 0x5100000a\n" + "li s11, 0x5100000b\n" + "csrsi mstatus, 8\n" /* enable MIE -> armed timer fires into handler */ + "li t0, 0\n" + "10:\n" + "la t1, g_ticks\n" + "lw t1, 0(t1)\n" + "bnez t1, 9f\n" + "la t1, g_last_code\n" + "lw t1, 0(t1)\n" + "bnez t1, 9f\n" + "addi t0, t0, 1\n" + "li t1, 200000\n" + "bltu t0, t1, 10b\n" /* poll budget exhausted: fall through with g_ticks still 0 */ + "9:\n" /* continuation (handler redirects mepc here) */ + "csrci mstatus, 8\n" + /* restore main's s0..s11 */ + "la t0, g_save_s\n" + "lw s0, 0(t0)\n" + "lw s1, 4(t0)\n" + "lw s2, 8(t0)\n" + "lw s3, 12(t0)\n" + "lw s4, 16(t0)\n" + "lw s5, 20(t0)\n" + "lw s6, 24(t0)\n" + "lw s7, 28(t0)\n" + "lw s8, 32(t0)\n" + "lw s9, 36(t0)\n" + "lw s10, 40(t0)\n" + "lw s11, 44(t0)\n" + "la t0, g_csp\n" + "lw sp, 0(t0)\n" + "la t0, g_cret\n" + "lw ra, 0(t0)\n" + "ret\n"); +} + +int main(void) +{ + uint32_t n29 = 0, n30 = 0, n31 = 0, fired = 0, nofire = 0; + uint32_t first_margin = 0xFFFFFFFFu; + uint32_t first_code = 0, first_reg = 0, first_exp = 0, first_act = 0; + + uart_printf("\n=== drain trap-frame eviction test (Bug B @ pt_regs s2) ===\n"); + uart_printf("L1D=128KiB direct-mapped 32B lines; evict stride=0x%08x; frame@0x%08x s2@72\n", + L1D_STRIDE, + FRAME_BASE); + + g_expected_s2 = (uint32_t) &g_s2_target; + set_trap_handler(&trapframe_irq_entry); + csr_set(mie, MIE_MTIE); + disable_interrupts(); + + for (uint32_t margin = MARGIN_MIN; margin <= MARGIN_MAX; margin++) { + g_timer_margin = margin; + g_gap = margin & 15u; /* gap cycles 0..15 as the margin advances */ + g_drain_addr = DRAIN_BASE + margin * DRAIN_LINE; + g_expected_s2 = (uint32_t) &g_s2_target; + g_last_code = 0; + g_last_reg = 0; + g_last_expected = 0; + g_last_actual = 0; + g_ticks = 0; + + irq_window(); + + if (g_ticks == 0u) { /* handler never ran for this margin */ + nofire++; + continue; + } + fired++; + if (g_last_code == 29u) { + n29++; + } else if (g_last_code == 30u) { + n30++; + } else if (g_last_code == 31u) { + n31++; 
+ } + if (g_last_code != 0u && first_margin == 0xFFFFFFFFu) { /* remember only the first failing margin */ + first_margin = margin; + first_code = g_last_code; + first_reg = g_last_reg; + first_exp = g_last_expected; + first_act = g_last_actual; + } + } + + disable_timer_interrupt(); + disable_interrupts(); + + uart_printf( + "sweep: fired=%u nofire=%u code29=%u code30=%u code31=%u\n", fired, nofire, n29, n30, n31); + uart_printf("expected_s2=%08x irq_count=%u\n", g_expected_s2, g_irq_count); + + if (n29 == 0u && n30 == 0u && n31 == 0u && fired > 0u) { /* pass needs at least one IRQ taken */ + uart_printf("<<PASS>>\n"); + } else { + uart_printf("FAIL first_margin=%u code=%u reg=s%u expected=%08x actual=%08x\n", + first_margin, + first_code, + first_reg, + first_exp, + first_act); + uart_printf("codes: 29=precise-state 30=save-not-visible 31=eviction/visibility\n"); + uart_printf("<<FAIL>>\n"); + } + + for (;;) { + } + return 0; +} diff --git a/sw/apps/fetch_stall_repro/Makefile b/sw/apps/fetch_stall_repro/Makefile new file mode 100644 index 00000000..130362ff --- /dev/null +++ b/sw/apps/fetch_stall_repro/Makefile @@ -0,0 +1,84 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Makefile for fetch_stall_repro +# Self-contained assembly directed test (defines its own _start, no crt0.S). 
+ +ARCH = rv32imac_zicsr_zicntr_zifencei_zba_zbb_zbs_zicond_zbkb_zihintpause +ABI = ilp32 + +RISCV_PREFIX ?= riscv-none-elf- +AS = $(RISCV_PREFIX)as +LD = $(RISCV_PREFIX)ld +CC = $(RISCV_PREFIX)gcc +OBJCOPY = $(RISCV_PREFIX)objcopy +OBJDUMP = $(RISCV_PREFIX)objdump + +DDR_BOOT_STUB_SRC = ../../common/crt0_ddr_boot.S +DDR_BOOT_STUB_OBJ = crt0_ddr_boot.o + +MEM_CONFIG ?= bram + +ifeq ($(MEM_CONFIG),bram) +LINKER_SCRIPT = ../../common/link.ld +BOOT_STUB_OBJ = +DDR_SECTIONS = .ddr_text .ddr_rodata .ddr_data +else ifeq ($(MEM_CONFIG),ddr) +LINKER_SCRIPT = ../../common/link_ddr.ld +BOOT_STUB_OBJ = $(DDR_BOOT_STUB_OBJ) +DDR_SECTIONS = .text .rodata .data .sdata .ddr_text .ddr_rodata .ddr_data +else +$(error MEM_CONFIG must be one of: bram, ddr (got '$(MEM_CONFIG)')) +endif + +EXECUTABLE_ELF_FILE = sw.elf +VERILOG_HEX_FILE = sw.mem +DDR_VERILOG_HEX_FILE = sw_ddr.mem +RAW_BINARY_FILE = sw.bin +VIVADO_BRAM_FILE = sw.txt +DISASSEMBLY_FILE = sw.S + +all: $(EXECUTABLE_ELF_FILE) $(VERILOG_HEX_FILE) $(DDR_VERILOG_HEX_FILE) $(RAW_BINARY_FILE) $(VIVADO_BRAM_FILE) $(DISASSEMBLY_FILE) + +$(DDR_BOOT_STUB_OBJ): $(DDR_BOOT_STUB_SRC) + $(CC) -march=$(ARCH) -mabi=$(ABI) -nostdlib -nostartfiles -c -o $@ $< + +$(EXECUTABLE_ELF_FILE): fetch_stall_repro.S $(BOOT_STUB_OBJ) $(LINKER_SCRIPT) + $(AS) -march=$(ARCH) -mabi=$(ABI) -o fetch_stall_repro.o fetch_stall_repro.S + $(LD) -m elf32lriscv -T $(LINKER_SCRIPT) -o $@ $(BOOT_STUB_OBJ) fetch_stall_repro.o + @rm -f fetch_stall_repro.o + +$(VERILOG_HEX_FILE): $(EXECUTABLE_ELF_FILE) + $(OBJCOPY) -O verilog --verilog-data-width 4 -R .comment -R .note.gnu.build-id \ + $(addprefix -R ,$(DDR_SECTIONS)) $< $@ + +$(DDR_VERILOG_HEX_FILE): $(EXECUTABLE_ELF_FILE) + -$(OBJCOPY) -O verilog --verilog-data-width 4 $(addprefix -j ,$(DDR_SECTIONS)) \ + --change-addresses -0x80000000 $< $@ 2>/dev/null + @if [ ! 
-s $@ ]; then echo 00000000 > $@; fi + +$(RAW_BINARY_FILE): $(EXECUTABLE_ELF_FILE) + $(OBJCOPY) -O binary -R .comment -R .note.gnu.build-id \ + $(addprefix -R ,$(DDR_SECTIONS)) $< $@ + +$(VIVADO_BRAM_FILE): $(RAW_BINARY_FILE) + xxd -e -g4 -c4 $< | awk '{printf "%08x\n", strtonum("0x" $$2)}' > $@ + +$(DISASSEMBLY_FILE): $(EXECUTABLE_ELF_FILE) + $(OBJDUMP) -d $< > $@ + +clean: + rm -f $(EXECUTABLE_ELF_FILE) $(VERILOG_HEX_FILE) $(DDR_VERILOG_HEX_FILE) \ + $(RAW_BINARY_FILE) $(VIVADO_BRAM_FILE) $(DISASSEMBLY_FILE) \ + fetch_stall_repro.o $(DDR_BOOT_STUB_OBJ) diff --git a/sw/apps/fetch_stall_repro/fetch_stall_repro.S b/sw/apps/fetch_stall_repro/fetch_stall_repro.S new file mode 100644 index 00000000..7d01d9ea --- /dev/null +++ b/sw/apps/fetch_stall_repro/fetch_stall_repro.S @@ -0,0 +1,125 @@ +# Directed cached-fetch PC-step repro (executes from .ddr_text through the L1I) +# +# Reproduces the HW front-end defect where the core steps PC +2 instead of +4 on +# a 32-bit instruction (mis-decoding it as compressed), landing mid-instruction. +# On genesys2 this fires at workqueue_init_early (epc=0x8038d7fa, 2 bytes into a +# 32-bit `sw zero,4(s1)`), deterministically. +# +# The defect needs the cached L1I fetch path: a fetch stall (!fetch_progress -> +# sel_nop) from a 32-byte line-fill coinciding with a 32-bit instruction near a +# line boundary. A BRAM fetch-fuzz run did NOT trigger it, so this version runs +# the stream from cached DDR (.ddr_text) COLD, so every line is a miss -> a +# regular line-fill stall, like the boot. +# +# Stream = [compressed-nop run][32-bit nops] blocks; compressed run length 3..6 +# sweeps the 32-bit nops across every alignment vs the line boundaries. A 32-bit +# nop is 0x00000013 (UPPER half 0x0000 = illegal compressed). Any +2 mis-step +# onto one fetches 0x0000 -> illegal-instruction trap; the handler prints mepc +# (the mid-instruction PC) and <<FAIL>>. Clean run -> <<PASS>>.
+ + .section .init + .option push + .option norelax + .globl _start + +.macro CRUN n + .option rvc + .rept \n + c.nop + .endr + .option norvc +.endm + +_start: + .option norvc + la t0, trap_handler + csrw mtvec, t0 + lui sp, %hi(_stack_top) + addi sp, sp, %lo(_stack_top) + lui s0, 0x40000 # UART base + la a0, msg_header + jal ra, print_string + # Call the cached-DDR fetch-stall stream (absolute address; .ddr_text @ DDR). + lui t0, %hi(ddr_pattern) + addi t0, t0, %lo(ddr_pattern) + jalr ra, t0 + la a0, msg_pass # survived clean -> PASS + jal ra, print_string +done: + .option rvc + c.j done + .option norvc + + .balign 4 +trap_handler: + csrr s1, mepc # faulting PC (mid-instruction if +2 bug) + la a0, msg_trap + jal ra, print_string + mv a0, s1 + jal ra, print_hex + la a0, msg_fail + jal ra, print_string +trap_done: + .option rvc + c.j trap_done + .option norvc + + .balign 4 +print_string: + mv t2, a0 +1: lb t1, 0(t2) + beqz t1, 2f + sb t1, 0(s0) + addi t2, t2, 1 + j 1b +2: ret + + .balign 4 +print_hex: + mv t2, a0 + li t4, 28 +1: srl t1, t2, t4 + andi t1, t1, 0xf + li t5, 10 + blt t1, t5, 2f + addi t1, t1, 0x57 # 'a'-10 + j 3f +2: addi t1, t1, 0x30 # '0' +3: sb t1, 0(s0) + addi t4, t4, -4 + bgez t4, 1b + ret + + .option pop + +# ===== cached-DDR fetch-stall stream (fetched through the L1I) ===== + .section .ddr_text, "ax" + .option push + .option norelax + .balign 32 +ddr_pattern: + .rept 300 + CRUN 3 + nop ; nop ; nop ; nop + CRUN 4 + nop ; nop ; nop ; nop + CRUN 5 + nop ; nop ; nop ; nop + CRUN 6 + nop ; nop ; nop ; nop + .endr + .option norvc + ret + .option pop + + .section .rodata +msg_header: .asciz "=== fetch_stall_repro (ddr) ===\n" +msg_pass: .asciz "\n<>\n" +msg_trap: .asciz "\nILLEGAL TRAP mepc=" +msg_fail: .asciz " <>\n" + + .section .bss + .align 4 +stack_bottom: + .space 512 +_stack_top: diff --git a/sw/apps/irq_mie_window/Makefile b/sw/apps/irq_mie_window/Makefile new file mode 100644 index 00000000..ecaa3b92 --- /dev/null +++ 
b/sw/apps/irq_mie_window/Makefile @@ -0,0 +1,19 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2026 Two Sigma Open Source, LLC +# SPDX-License-Identifier: Apache-2.0 +# Short-MIE-window lost-interrupt directed test (registered-pending erase race) +SRC_C := main.c +include ../../common/common.mk diff --git a/sw/apps/irq_mie_window/main.c b/sw/apps/irq_mie_window/main.c new file mode 100644 index 00000000..6a541b7d --- /dev/null +++ b/sw/apps/irq_mie_window/main.c @@ -0,0 +1,127 @@ +/* + * Copyright 2026 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Short-MIE-window lost-interrupt directed test. + * + * Root cause under test (trap_unit.sv): interrupt_pending is sampled from + * (mtip && mie.MTIE && mstatus.MIE) into a 1-cycle-late flop, and + * interrupt_pending_eligible then RE-CHECKS the LIVE mstatus.MIE/mie.MTIE when + * the sample matures. 
So a machine interrupt is only taken if the enable is high + * for TWO consecutive cycles (sample + service). A legal SHORT MIE-enable window + * -- e.g. `csrsi mstatus,8` immediately followed by `csrci mstatus,8` -- gets + * its already-qualified interrupt ERASED: the registered pending bit matures one + * cycle after csrsi, but the csrci's (delayed) side-effect has already driven + * mstatus.MIE back to 0, so interrupt_pending_eligible=0 and the pending bit is + * cleared without ever being serviced. Per RISC-V the interrupt MUST be taken at + * the instruction boundary right after the csrsi (before the csrci), so this is + * a dropped interrupt. On the real no-MMU kernel this is the lost machine-timer + * tick -> frozen jiffies -> boot hang (the same drop, usually opened by the trap + * being delayed a cycle by a draining store rather than a literal adjacent + * csrci). + * + * Setup: make the machine timer permanently pending (mtimecmp=0 => mtip high), + * enable mie.MTIE, leave mstatus.MIE=0. Then pulse MIE high for one cycle + * (csrsi; csrci) many times. A correct core takes the timer at the first pulse + * (the handler acks it); a buggy core erases it every pulse and never traps. + * + * PASS: g_taken >= 1 (the eligible timer was taken). + * FAIL: g_taken == 0 (the timer was eligible at every csrsi but never taken). + */ + +#include + +#include "trap.h" + +#define PULSES 256u + +volatile uint32_t g_taken; /* timer-trap count */ + +static void uart_putc(char c) +{ + UART_TX = (uint8_t) c; +} +static void uart_puts(const char *s) +{ + while (*s) + uart_putc(*s++); +} +static void uart_hex(uint32_t v) +{ + static const char hex[] = "0123456789ABCDEF"; + uart_puts("0x"); + for (int i = 28; i >= 0; i -= 4) + uart_putc(hex[(v >> i) & 0xF]); +} + +/* Naked handler: count the trap, ack the timer (push mtimecmp_hi to max so mtip + * drops and it cannot re-fire), MRET. 
*/ +__attribute__((naked, aligned(4))) static void timer_handler(void) +{ + __asm__ volatile("addi sp, sp, -8\n" + "sw t0, 0(sp)\n" + "sw t1, 4(sp)\n" + "lui t0, %hi(g_taken)\n" + "lw t1, %lo(g_taken)(t0)\n" + "addi t1, t1, 1\n" + "sw t1, %lo(g_taken)(t0)\n" + "li t0, 0x4000001C\n" /* MTIMECMP_HI */ + "li t1, -1\n" + "sw t1, 0(t0)\n" /* mtimecmp = huge -> mtip low (ack) */ + "lw t0, 0(sp)\n" + "lw t1, 4(sp)\n" + "addi sp, sp, 8\n" + "mret\n"); +} + +int main(void) +{ + uart_puts("\r\n=== short-MIE-window lost-interrupt test ===\r\n"); + set_trap_handler(&timer_handler); + g_taken = 0; + + /* Machine timer permanently pending (mtime >= 0 always), MTIE enabled, + * mstatus.MIE left 0 -- pending but masked. */ + MTIMECMP_HI = 0; + MTIMECMP_LO = 0; + enable_timer_interrupt(); /* mie.MTIE = 1 */ + + /* Pulse mstatus.MIE high for a single cycle, repeatedly. Each csrsi makes the + * pending timer eligible at the very next instruction boundary; the adjacent + * csrci must NOT be able to retroactively cancel it. */ + for (uint32_t i = 0; i < PULSES; i++) { + __asm__ volatile("csrsi mstatus, 8\n" /* mstatus.MIE = 1 (1-cycle window) */ + "csrci mstatus, 8\n" /* mstatus.MIE = 0 */ + :: + : "memory"); + if (g_taken) + break; /* taken once -> correct; acked, no point continuing */ + } + + disable_timer_interrupt(); + uart_puts("taken="); + uart_hex(g_taken); + uart_puts("\r\n"); + if (g_taken >= 1u) { + uart_puts("<>\r\n"); + } else { + uart_puts("[FAIL] eligible machine timer was erased by the adjacent MIE clear " + "(never taken)\r\n<>\r\n"); + } + for (;;) { + } + return 0; +} diff --git a/sw/apps/linux_boot/patch_ret_from_exception.py b/sw/apps/linux_boot/patch_ret_from_exception.py index 213f46f5..191d34c8 100644 --- a/sw/apps/linux_boot/patch_ret_from_exception.py +++ b/sw/apps/linux_boot/patch_ret_from_exception.py @@ -1,6 +1,20 @@ #!/usr/bin/env python3 -"""Patch the temporary Linux bring-up image for the MRET restore window. 
+# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Patch the temporary Linux bring-up image for current bring-up hazards. The external linux-mvp tree currently builds a debug kernel whose ret_from_exception sequence contains: @@ -30,19 +44,92 @@ that shift ret_from_exception. If the word is absent the image is assumed already patched (idempotent); if it occurs more than once the patch aborts rather than risk hitting the wrong site. + +Set FROST_LINUX_BOOTARGS to rewrite /chosen/bootargs in the generated DTB. This +is useful for hardware-only boot triage such as forcing initramfs_async=0 without +modifying the external linux-mvp artifact generator. + +Set FROST_LINUX_NOOP_FUNCTIONS to rewrite selected kernel functions to +`li a0,0; ret` in the generated DDR images. This is a hardware bring-up escape +hatch for narrow isolation runs; do not use it for correctness testing. + +Set FROST_LINUX_BUSYBOX to replace bin/busybox in the generated initramfs. +This is a bring-up hook for testing BFLT header changes without rebuilding the +external Buildroot tree. 
""" from __future__ import annotations import argparse +import gzip +import os +import shutil +import stat +import struct +import subprocess +import tempfile from pathlib import Path OLD_WORD = "18c1202f" # sc.w zero, a2, (sp) -- ret_from_exception reservation clear NEW_WORD = "ff757513" # andi a0, a0, -9 -- clear mstatus.MIE in the restore value +DTB_WORD = 0x200000 +INITRD_WORD = 0x204000 +KERNEL_ENTRY = 0x80000000 +FDT_MAGIC = 0xD00DFEED +CPIO_NEWC_MAGIC = b"070701" +CPIO_TRAILER = "TRAILER!!!" +NOOP_INITCALL_PATCH = b"\x01\x45\x82\x80" # c.li a0,0; c.ret +CPU_RELAX_DIV_SYMBOL = "__delay" +CPU_RELAX_DIV_OFFSET = 0x1C +CPU_RELAX_DIV_OLD = b"\xb3\xc7\x07\x02" # div a5,a5,zero +CPU_RELAX_DIV_NEW = b"\x13\x00\x00\x00" # nop +CPU_RELAX_PAUSE_OFFSET = 0x20 +CPU_RELAX_PAUSE_OLD = b"\x0f\x00\x00\x01" # pause / fence hint +CPU_RELAX_PAUSE_NEW = b"\x13\x00\x00\x00" # nop +PROC_GET_INODE_MODE_RELOAD_OLD = b"\x83\xd7\x04\x00" # lhu a5,0(s1) +PROC_GET_INODE_MODE_RELOAD_NEW = b"\x83\x57\x09\x06" # lhu a5,96(s2) +PROC_GET_INODE_MODE_RELOAD_ADDRS = (0x001071B2, 0x00107220) +PROC_GET_INODE_MODE_LOAD_ADDR = 0x0010718C +PROC_GET_INODE_MODE_LOAD_OLD = b"\x83\x57\x09\x06" # lhu a5,96(s2) +PROC_GET_INODE_MODE_FORCE_REG = b"\xb7\x87\x00\x00" # lui a5,0x8 (S_IFREG) +PROC_LOOKUP_REF_AMO_ADDR = 0x0010BC82 +PROC_LOOKUP_REF_AMO_OLD = b"\x2f\x27\xb5\x00" # amoadd.w a4,a1,(a0) +PROC_LOOKUP_REF_AMO_CONST = b"\x13\x07\x10\x00" # addi a4,zero,1 +PROC_LOOKUP_DE_ADJUST_ADDR = 0x0010BC7C +PROC_LOOKUP_DE_ADJUST_OLD = b"\xaa\x87\x85\x45" # mv a5,a0; li a1,1 +PROC_LOOKUP_DE_ADJUST_NEW = b"\x93\x07\x05\xfb" # addi a5,a0,-80 +DEFAULT_SYSTEM_MAP = Path( + os.path.expanduser( + "~/bigger_l0/linux-mvp/buildroot/output/build/linux-6.18.7/System.map" + ) +) +INITRD_DEVICES = { + "dev/console": (stat.S_IFCHR | 0o600, 5, 1), + "dev/null": (stat.S_IFCHR | 0o666, 1, 3), + "dev/random": (stat.S_IFCHR | 0o666, 1, 8), + "dev/ttyS0": (stat.S_IFCHR | 0o600, 4, 64), + "dev/urandom": (stat.S_IFCHR | 0o666, 1, 9), +} 
+DIAG_SHELL_INITTAB = """\ +console::sysinit:/bin/echo FROST_DIAG_INITTAB_START +::sysinit:/bin/mount -t proc proc /proc +::sysinit:/bin/mount -o remount,rw / +::sysinit:/bin/mkdir -p /dev/pts /dev/shm /run/lock/subsys /tmp /sys +::sysinit:/bin/mount -a +console::sysinit:/bin/echo FROST_DIAG_INITTAB_AFTER_RCS +console::respawn:/bin/sh +::shutdown:/bin/umount -a -r +""" +SEEDRNG_NOOP = """\ +#!/bin/sh +# FPGA bring-up has no hardware entropy source; seedrng can block PID 1 forever. +exit 0 +""" -def patch_words(path: Path) -> None: + +def patch_ret_restore_window(path: Path) -> None: """Patch the single OLD_WORD occurrence to NEW_WORD. Works for both the dense FPGA-loader form (one word per line) and the @@ -62,7 +149,9 @@ def patch_words(path: Path) -> None: if not old_hits: if new_hits: return # already patched - raise SystemExit(f"{path}: target word {OLD_WORD} not found (and not already patched)") + raise SystemExit( + f"{path}: target word {OLD_WORD} not found (and not already patched)" + ) if len(old_hits) > 1: raise SystemExit( f"{path}: {OLD_WORD} occurs {len(old_hits)}x; ambiguous, refusing to patch" @@ -71,16 +160,933 @@ def patch_words(path: Path) -> None: path.write_text("\n".join(lines) + "\n") +def split_env_names(value: str) -> list[str]: + """Parse value (space/comma-separated) into a deduplicated ordered list of names.""" + names: list[str] = [] + seen: set[str] = set() + for raw_name in value.replace(",", " ").split(): + name = raw_name.strip() + if not name or name in seen: + continue + names.append(name) + seen.add(name) + return names + + +def resolve_system_map_symbols(system_map: Path, names: list[str]) -> dict[str, int]: + """Look up symbol names to byte addresses in a Linux System.map file.""" + if not names: + return {} + if not system_map.exists(): + raise SystemExit(f"System.map not found: {system_map}") + + wanted = set(names) + resolved: dict[str, int] = {} + for line in system_map.read_text().splitlines(): + parts = line.split() + if 
len(parts) < 3: + continue + addr, _kind, symbol = parts[:3] + if symbol in wanted: + resolved[symbol] = int(addr, 16) + + missing = [name for name in names if name not in resolved] + if missing: + raise SystemExit(f"{system_map}: missing symbol(s): " + " ".join(missing)) + return resolved + + +def patch_word_byte(word: str, byte_offset: int, value: int) -> str: + """Patch one byte within a little-endian 4-byte hex word string and return the new word.""" + data = bytearray(struct.pack(" None: + """Apply byte-level patches to a dense (one-word-per-line) hex image file.""" + words = [ + line.strip().lower() for line in path.read_text().splitlines() if line.strip() + ] + for byte_addr, patch in patches.items(): + for byte_idx, value in enumerate(patch): + absolute_byte = byte_addr + byte_idx + word_idx = absolute_byte // 4 + byte_offset = absolute_byte % 4 + if word_idx >= len(words): + raise SystemExit( + f"{path}: patch address 0x{absolute_byte:x} is outside dense image" + ) + words[word_idx] = patch_word_byte(words[word_idx], byte_offset, value) + path.write_text("\n".join(words) + "\n") + + +def patch_sparse_code_bytes(path: Path, patches: dict[int, bytes]) -> None: + """Apply byte-level patches to a sparse (@addr-directive) hex image file.""" + lines = path.read_text().splitlines() + word_line_by_addr: dict[int, int] = {} + current_word_addr = 0 + for idx, line in enumerate(lines): + stripped = line.strip().lower() + if not stripped: + continue + if stripped.startswith("@"): + current_word_addr = int(stripped[1:], 16) + continue + word_line_by_addr[current_word_addr] = idx + current_word_addr += 1 + + for byte_addr, patch in patches.items(): + for byte_idx, value in enumerate(patch): + absolute_byte = byte_addr + byte_idx + word_addr = absolute_byte // 4 + byte_offset = absolute_byte % 4 + line_idx = word_line_by_addr.get(word_addr) + if line_idx is None: + raise SystemExit( + f"{path}: patch address 0x{absolute_byte:x} is outside sparse image" + ) + 
lines[line_idx] = patch_word_byte( + lines[line_idx].strip().lower(), byte_offset, value + ) + path.write_text("\n".join(lines) + "\n") + + +def patch_code_bytes(path: Path, patches: dict[int, bytes]) -> None: + """Dispatch to dense or sparse patcher based on image format and apply patches.""" + if not patches: + return + for line in path.read_text().splitlines(): + stripped = line.strip() + if not stripped: + continue + if stripped.startswith("@"): + patch_sparse_code_bytes(path, patches) + else: + patch_dense_code_bytes(path, patches) + return + raise SystemExit(f"{path}: empty Linux DDR image") + + +def patch_noop_return_zero(path: Path, symbols: dict[str, int]) -> None: + """Patch each symbol address with the NOOP_INITCALL_PATCH byte sequence.""" + patch_code_bytes(path, {addr: NOOP_INITCALL_PATCH for addr in symbols.values()}) + + +def read_dense_code_bytes(path: Path, byte_addr: int, size: int) -> bytes: + """Read size bytes at byte_addr from a dense hex image file.""" + words = [ + line.strip().lower() for line in path.read_text().splitlines() if line.strip() + ] + data = bytearray() + for byte_idx in range(size): + absolute_byte = byte_addr + byte_idx + word_idx = absolute_byte // 4 + byte_offset = absolute_byte % 4 + if word_idx >= len(words): + raise SystemExit( + f"{path}: read address 0x{absolute_byte:x} is outside dense image" + ) + data.append(struct.pack(" bytes: + """Read size bytes at byte_addr from a sparse (@addr-directive) hex image file.""" + lines = path.read_text().splitlines() + word_by_addr: dict[int, str] = {} + current_word_addr = 0 + for line in lines: + stripped = line.strip().lower() + if not stripped: + continue + if stripped.startswith("@"): + current_word_addr = int(stripped[1:], 16) + continue + word_by_addr[current_word_addr] = stripped + current_word_addr += 1 + + data = bytearray() + for byte_idx in range(size): + absolute_byte = byte_addr + byte_idx + word_addr = absolute_byte // 4 + byte_offset = absolute_byte % 4 + word = 
word_by_addr.get(word_addr) + if word is None: + raise SystemExit( + f"{path}: read address 0x{absolute_byte:x} is outside sparse image" + ) + data.append(struct.pack(" bytes: + """Dispatch to dense or sparse reader based on image format.""" + for line in path.read_text().splitlines(): + stripped = line.strip() + if not stripped: + continue + if stripped.startswith("@"): + return read_sparse_code_bytes(path, byte_addr, size) + return read_dense_code_bytes(path, byte_addr, size) + raise SystemExit(f"{path}: empty Linux DDR image") + + +def patch_cpu_relax_div(path: Path, delay_addr: int) -> None: + """Patch the div-by-zero instruction inside cpu_relax (__delay+0x1C) to a NOP.""" + patch_addr = delay_addr + CPU_RELAX_DIV_OFFSET + current = read_code_bytes(path, patch_addr, len(CPU_RELAX_DIV_OLD)) + if current not in (CPU_RELAX_DIV_OLD, CPU_RELAX_DIV_NEW): + raise SystemExit( + f"{path}: {CPU_RELAX_DIV_SYMBOL}+0x{CPU_RELAX_DIV_OFFSET:x} " + f"at 0x{patch_addr:08x} has {current.hex()}, expected " + f"{CPU_RELAX_DIV_OLD.hex()}" + ) + patch_code_bytes(path, {patch_addr: CPU_RELAX_DIV_NEW}) + + +def patch_cpu_relax_pause(path: Path, delay_addr: int) -> None: + """Patch the pause fence hint inside cpu_relax (__delay+0x20) to a NOP.""" + patch_addr = delay_addr + CPU_RELAX_PAUSE_OFFSET + current = read_code_bytes(path, patch_addr, len(CPU_RELAX_PAUSE_OLD)) + if current not in (CPU_RELAX_PAUSE_OLD, CPU_RELAX_PAUSE_NEW): + raise SystemExit( + f"{path}: {CPU_RELAX_DIV_SYMBOL}+0x{CPU_RELAX_PAUSE_OFFSET:x} " + f"at 0x{patch_addr:08x} has {current.hex()}, expected " + f"{CPU_RELAX_PAUSE_OLD.hex()}" + ) + patch_code_bytes(path, {patch_addr: CPU_RELAX_PAUSE_NEW}) + + +def patch_proc_get_inode_mode_reload(path: Path) -> None: + """Patch all proc_get_inode mode-reload instructions to the new encoding.""" + patches: dict[int, bytes] = {} + for addr in PROC_GET_INODE_MODE_RELOAD_ADDRS: + current = read_code_bytes(path, addr, len(PROC_GET_INODE_MODE_RELOAD_OLD)) + if current not in ( + 
PROC_GET_INODE_MODE_RELOAD_OLD, + PROC_GET_INODE_MODE_RELOAD_NEW, + ): + raise SystemExit( + f"{path}: proc_get_inode mode reload at 0x{addr:08x} " + f"has {current.hex()}, expected {PROC_GET_INODE_MODE_RELOAD_OLD.hex()}" + ) + patches[addr] = PROC_GET_INODE_MODE_RELOAD_NEW + patch_code_bytes(path, patches) + + +def patch_proc_get_inode_force_mode_reg(path: Path) -> None: + """Patch proc_get_inode to force the mode load through a register.""" + current = read_code_bytes( + path, PROC_GET_INODE_MODE_LOAD_ADDR, len(PROC_GET_INODE_MODE_LOAD_OLD) + ) + if current not in (PROC_GET_INODE_MODE_LOAD_OLD, PROC_GET_INODE_MODE_FORCE_REG): + raise SystemExit( + f"{path}: proc_get_inode mode load at 0x{PROC_GET_INODE_MODE_LOAD_ADDR:08x} " + f"has {current.hex()}, expected {PROC_GET_INODE_MODE_LOAD_OLD.hex()}" + ) + patch_code_bytes( + path, {PROC_GET_INODE_MODE_LOAD_ADDR: PROC_GET_INODE_MODE_FORCE_REG} + ) + + +def patch_proc_lookup_ref_const(path: Path) -> None: + """Replace the proc_lookup_de refcount AMO with a constant-store encoding.""" + current = read_code_bytes( + path, PROC_LOOKUP_REF_AMO_ADDR, len(PROC_LOOKUP_REF_AMO_OLD) + ) + if current not in (PROC_LOOKUP_REF_AMO_OLD, PROC_LOOKUP_REF_AMO_CONST): + raise SystemExit( + f"{path}: proc_lookup_de refcount AMO at 0x{PROC_LOOKUP_REF_AMO_ADDR:08x} " + f"has {current.hex()}, expected {PROC_LOOKUP_REF_AMO_OLD.hex()}" + ) + patch_code_bytes(path, {PROC_LOOKUP_REF_AMO_ADDR: PROC_LOOKUP_REF_AMO_CONST}) + + +def patch_proc_lookup_de_adjust(path: Path) -> None: + """Patch the proc_lookup_de returned-de pointer-adjustment instruction.""" + current = read_code_bytes( + path, PROC_LOOKUP_DE_ADJUST_ADDR, len(PROC_LOOKUP_DE_ADJUST_OLD) + ) + if current not in (PROC_LOOKUP_DE_ADJUST_OLD, PROC_LOOKUP_DE_ADJUST_NEW): + raise SystemExit( + f"{path}: proc_lookup_de returned-de adjust at " + f"0x{PROC_LOOKUP_DE_ADJUST_ADDR:08x} has {current.hex()}, expected " + f"{PROC_LOOKUP_DE_ADJUST_OLD.hex()}" + ) + patch_code_bytes(path, 
{PROC_LOOKUP_DE_ADJUST_ADDR: PROC_LOOKUP_DE_ADJUST_NEW}) + + +def words_to_bytes(words: list[str]) -> bytes: + """Pack a list of little-endian 8-hex-digit word strings into bytes.""" + return b"".join(struct.pack(" list[str]: + """Unpack bytes into a list of little-endian 8-hex-digit word strings.""" + if len(data) % 4: + data += b"\x00" * (4 - len(data) % 4) + return [ + f"{struct.unpack_from(' int: + """Return the total_size field from a FDT blob after validating the magic.""" + if len(data) < 8: + raise SystemExit("DTB slot is too small to contain an FDT header") + magic, total_size = struct.unpack_from(">II", data, 0) + if magic != FDT_MAGIC: + raise SystemExit( + f"DTB magic mismatch: got 0x{magic:08x}, expected 0x{FDT_MAGIC:08x}" + ) + if total_size > len(data): + raise SystemExit( + f"DTB total size {total_size} exceeds extracted slot {len(data)}" + ) + return total_size + + +def padded_dtb_slot(words: list[str]) -> bytes: + """Extract and zero-pad a DTB from a word list to its declared total_size.""" + data = words_to_bytes(words) + if len(data) < 8: + raise SystemExit("DTB slot is too small to contain an FDT header") + magic, total_size = struct.unpack_from(">II", data, 0) + if magic != FDT_MAGIC: + raise SystemExit( + f"DTB magic mismatch: got 0x{magic:08x}, expected 0x{FDT_MAGIC:08x}" + ) + if total_size > len(data): + data += b"\x00" * (total_size - len(data)) + return data + + +def fdt_tool(name: str) -> str: + """Locate an FDT command-line tool on PATH or raise SystemExit if absent.""" + tool = shutil.which(name) + if not tool: + raise SystemExit(f"{name} is required in PATH") + return tool + + +def run_fdtget_u32(dtb_path: Path, prop: str) -> int: + """Read a single hex /chosen property from a DTB file using fdtget.""" + result = subprocess.run( + [fdt_tool("fdtget"), "-t", "x", str(dtb_path), "/chosen", prop], + check=True, + capture_output=True, + text=True, + ) + words = result.stdout.split() + if len(words) != 1: + raise SystemExit(f"{dtb_path}: 
expected one {prop} cell, got {result.stdout!r}") + return int(words[0], 16) + + +def rewrite_dtb(dtb_slot: bytes, bootargs: str | None, initrd_end: int | None) -> bytes: + """Rewrite bootargs and linux,initrd-end in a DTB blob using fdtput.""" + fdtput = shutil.which("fdtput") + if not fdtput: + raise SystemExit("DTB rewriting requires fdtput in PATH") + + total_size = fdt_total_size(dtb_slot) + old_dtb = dtb_slot[:total_size] + with tempfile.TemporaryDirectory(prefix="frost_dtb_") as tmp: + dtb_path = Path(tmp) / "frost.dtb" + dtb_path.write_bytes(old_dtb) + if bootargs is not None: + subprocess.run( + [fdtput, "-t", "s", str(dtb_path), "/chosen", "bootargs", bootargs], + check=True, + ) + if initrd_end is not None: + subprocess.run( + [ + fdtput, + "-t", + "x", + str(dtb_path), + "/chosen", + "linux,initrd-end", + f"0x{initrd_end:08x}", + ], + check=True, + ) + serial_irq_mode = os.environ.get("FROST_LINUX_SERIAL_IRQ_MODE", "poll") + if serial_irq_mode == "poll": + subprocess.run( + [ + fdtput, + "-d", + str(dtb_path), + "/soc/serial@40001000", + "interrupts-extended", + ], + check=False, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + elif serial_irq_mode == "cpu-local-meip": + subprocess.run( + [ + fdtput, + "-t", + "x", + str(dtb_path), + "/soc/serial@40001000", + "interrupts-extended", + "0x00000001", + "0x0000000b", + ], + check=True, + ) + else: + raise SystemExit(f"unknown FROST_LINUX_SERIAL_IRQ_MODE={serial_irq_mode!r}") + new_dtb = dtb_path.read_bytes() + + if len(new_dtb) > (INITRD_WORD - DTB_WORD) * 4: + raise SystemExit( + f"patched DTB is {len(new_dtb)} bytes; only " + f"{(INITRD_WORD - DTB_WORD) * 4} bytes available before initrd" + ) + return new_dtb + + +def get_initrd_bounds(dtb_slot: bytes) -> tuple[int, int]: + """Read the initrd start and end byte addresses from a DTB blob using fdtget.""" + total_size = fdt_total_size(dtb_slot) + with tempfile.TemporaryDirectory(prefix="frost_dtb_") as tmp: + dtb_path = Path(tmp) / 
"frost.dtb" + dtb_path.write_bytes(dtb_slot[:total_size]) + start = run_fdtget_u32(dtb_path, "linux,initrd-start") + end = run_fdtget_u32(dtb_path, "linux,initrd-end") + if end < start: + raise SystemExit(f"invalid initrd bounds: start=0x{start:08x}, end=0x{end:08x}") + if start < KERNEL_ENTRY or (start - KERNEL_ENTRY) % 4: + raise SystemExit(f"unsupported initrd start: 0x{start:08x}") + return start, end + + +def newc_pad(n: int) -> int: + """Return the number of padding bytes to reach the next 4-byte CPIO alignment boundary.""" + return (-n) & 3 + + +def parse_newc_entry(data: bytes, offset: int) -> tuple[str, list[int], int, int, int]: + """Parse one CPIO newc entry, returning name, fields, body_start, next_offset, and file_size.""" + if offset + 110 > len(data) or data[offset : offset + 6] != CPIO_NEWC_MAGIC: + raise SystemExit(f"initramfs is not a valid newc archive at byte {offset}") + fields = [ + int(data[offset + 6 + idx * 8 : offset + 14 + idx * 8], 16) for idx in range(13) + ] + file_size = fields[6] + name_size = fields[11] + name_start = offset + 110 + name_end = name_start + name_size + if name_end > len(data): + raise SystemExit(f"initramfs newc entry at byte {offset} has truncated name") + name = data[name_start : name_end - 1].decode("utf-8") + body_start = name_end + newc_pad(name_end) + next_offset = body_start + file_size + newc_pad(body_start + file_size) + if next_offset > len(data): + raise SystemExit(f"initramfs newc entry {name!r} at byte {offset} is truncated") + return name, fields, body_start, next_offset, file_size + + +def find_newc_trailer(data: bytes) -> tuple[int, set[str]]: + """Scan a CPIO newc archive for the TRAILER entry and return its offset and all filenames seen.""" + offset = 0 + names: set[str] = set() + while offset < len(data): + name, _fields, _body_start, next_offset, _file_size = parse_newc_entry( + data, offset + ) + names.add(name) + if name == CPIO_TRAILER: + return offset, names + offset = next_offset + raise 
SystemExit("initramfs newc archive has no TRAILER!!! entry") + + +def make_newc_entry( + name: str, + mode: int, + rdev_major: int, + rdev_minor: int, + ino: int, + data: bytes = b"", + uid: int = 0, + gid: int = 0, + nlink: int = 1, + mtime: int = 0, + dev_major: int = 0, + dev_minor: int = 0, +) -> bytes: + """Build a complete CPIO newc archive entry from name, mode, device numbers, and data.""" + encoded_name = name.encode("utf-8") + b"\x00" + fields = [ + ino, + mode, + uid, + gid, + nlink, + mtime, + len(data), + dev_major, + dev_minor, + rdev_major, + rdev_minor, + len(encoded_name), + 0, # check + ] + header = CPIO_NEWC_MAGIC + b"".join( + f"{field:08x}".encode("ascii") for field in fields + ) + name_block = ( + header + encoded_name + (b"\x00" * newc_pad(len(header) + len(encoded_name))) + ) + return name_block + data + (b"\x00" * newc_pad(len(name_block) + len(data))) + + +def make_newc_replacement_entry(name: str, fields: list[int], data: bytes) -> bytes: + """Rebuild a CPIO newc entry preserving the original metadata with new data.""" + return make_newc_entry( + name, + fields[1], + fields[9], + fields[10], + fields[0], + data=data, + uid=fields[2], + gid=fields[3], + nlink=fields[4], + mtime=fields[5], + dev_major=fields[7], + dev_minor=fields[8], + ) + + +def patch_initramfs( + initrd_gz: bytes, + replacements: dict[str, bytes], + additions: dict[str, tuple[int, bytes]], + deletions: set[str], +) -> tuple[bytes, list[str], list[str], list[str], list[str]]: + """Patch, add, and delete entries in a gzip-compressed CPIO initramfs.""" + conflicts = (set(replacements) | set(additions)) & deletions + if conflicts: + raise SystemExit( + "initramfs paths cannot be both patched/added and deleted: " + + " ".join(sorted(conflicts)) + ) + + initrd = gzip.decompress(initrd_gz) + trailer_offset, names = find_newc_trailer(initrd) + missing = [name for name in INITRD_DEVICES if name not in names] + existing_additions = set(additions) & names + + if not missing and not 
replacements and not additions and not deletions: + return initrd_gz, [], [], [], [] + + patched_entries: list[bytes] = [] + replaced: list[str] = [] + deleted: list[str] = [] + offset = 0 + while offset < trailer_offset: + name, fields, body_start, next_offset, file_size = parse_newc_entry( + initrd, offset + ) + if name in deletions: + deleted.append(name) + elif name in replacements: + patched_entries.append( + make_newc_replacement_entry(name, fields, replacements[name]) + ) + replaced.append(name) + elif name in existing_additions: + _mode, data = additions[name] + patched_entries.append(make_newc_replacement_entry(name, fields, data)) + replaced.append(name) + else: + patched_entries.append(initrd[offset:next_offset]) + offset = next_offset + + for idx, name in enumerate(missing, start=0xF005700): + mode, major, minor = INITRD_DEVICES[name] + patched_entries.append(make_newc_entry(name, mode, major, minor, idx)) + added_files: list[str] = [] + for idx, (name, (mode, data)) in enumerate(additions.items(), start=0xF006700): + if name in names: + continue + patched_entries.append(make_newc_entry(name, mode, 0, 0, idx, data=data)) + added_files.append(name) + trailer = make_newc_entry(CPIO_TRAILER, 0, 0, 0, 0) + patched = b"".join(patched_entries) + trailer + + missing_replacements = sorted(set(replacements) - set(replaced)) + if missing_replacements: + raise SystemExit( + "initramfs replacement target(s) not found: " + + " ".join(missing_replacements) + ) + missing_deletions = sorted(deletions - set(deleted)) + if missing_deletions: + raise SystemExit( + "initramfs deletion target(s) not found: " + " ".join(missing_deletions) + ) + return gzip.compress(patched, mtime=0), missing, replaced, added_files, deleted + + +def get_initramfs_replacements() -> dict[str, bytes]: + """Build the initramfs file-replacement map from FROST_LINUX_* environment variables.""" + replacements = { + "etc/init.d/S01seedrng": SEEDRNG_NOOP.encode("utf-8"), + } + busybox_replacement = 
os.environ.get("FROST_LINUX_BUSYBOX") + if busybox_replacement: + replacements["bin/busybox"] = Path(busybox_replacement).read_bytes() + preset = os.environ.get("FROST_LINUX_INITTAB_PRESET") + raw_inittab = os.environ.get("FROST_LINUX_INITTAB") + if raw_inittab and preset: + raise SystemExit( + "set either FROST_LINUX_INITTAB or FROST_LINUX_INITTAB_PRESET, not both" + ) + if preset == "diag-shell": + replacements["etc/inittab"] = DIAG_SHELL_INITTAB.encode("utf-8") + return replacements + if preset: + raise SystemExit(f"unknown FROST_LINUX_INITTAB_PRESET={preset!r}") + if raw_inittab: + replacements["etc/inittab"] = raw_inittab.replace("\\n", "\n").encode("utf-8") + return replacements + + +def get_initramfs_additions() -> dict[str, tuple[int, bytes]]: + """Build the initramfs file-addition map from FROST_LINUX_* environment variables.""" + additions: dict[str, tuple[int, bytes]] = {} + diag_init = os.environ.get("FROST_LINUX_DIAG_INIT") + if diag_init: + additions["frost_diag_init"] = ( + stat.S_IFREG | 0o755, + Path(diag_init).read_bytes(), + ) + return additions + + +def get_initramfs_deletions() -> set[str]: + """Build the set of initramfs paths to delete from FROST_LINUX_* environment variables.""" + deletions = set( + split_env_names(os.environ.get("FROST_LINUX_DELETE_INITRAMFS_NAMES", "")) + ) + if os.environ.get("FROST_LINUX_DELETE_INITTAB") == "1": + deletions.add("etc/inittab") + return deletions + + +def patch_dense_image( + path: Path, + bootargs: str | None, + initramfs_replacements: dict[str, bytes], + initramfs_additions: dict[str, tuple[int, bytes]], + initramfs_deletions: set[str], +) -> tuple[list[str], list[str], list[str], list[str]]: + """Patch DTB and initramfs embedded in a dense Linux DDR hex image.""" + words = [ + line.strip().lower() for line in path.read_text().splitlines() if line.strip() + ] + if len(words) < INITRD_WORD: + raise SystemExit(f"{path}: dense DDR image is too short for DTB/initrd slots") + + dtb_slot_words = 
words[DTB_WORD:INITRD_WORD] + dtb_slot = words_to_bytes(dtb_slot_words) + initrd_start, initrd_end = get_initrd_bounds(dtb_slot) + initrd_word = (initrd_start - KERNEL_ENTRY) // 4 + if initrd_word != INITRD_WORD: + raise SystemExit(f"{path}: unexpected initrd word offset 0x{initrd_word:x}") + initrd_size = initrd_end - initrd_start + initrd_word_count = (initrd_size + 3) // 4 + initrd_gz = words_to_bytes(words[INITRD_WORD : INITRD_WORD + initrd_word_count])[ + :initrd_size + ] + new_initrd_gz, added_devices, replaced_files, added_files, deleted_files = ( + patch_initramfs( + initrd_gz, initramfs_replacements, initramfs_additions, initramfs_deletions + ) + ) + new_initrd_end = initrd_start + len(new_initrd_gz) + + new_dtb_words = bytes_to_words(rewrite_dtb(dtb_slot, bootargs, new_initrd_end)) + if DTB_WORD + len(new_dtb_words) > INITRD_WORD: + raise SystemExit(f"{path}: patched DTB overlaps initrd") + new_initrd_words = bytes_to_words(new_initrd_gz) + + words[DTB_WORD : DTB_WORD + len(new_dtb_words)] = new_dtb_words + for i in range(DTB_WORD + len(new_dtb_words), INITRD_WORD): + words[i] = "00000000" + words[INITRD_WORD:] = new_initrd_words + path.write_text("\n".join(words) + "\n") + return added_devices, replaced_files, added_files, deleted_files + + +def patch_sparse_image( + path: Path, + bootargs: str | None, + initramfs_replacements: dict[str, bytes], + initramfs_additions: dict[str, tuple[int, bytes]], + initramfs_deletions: set[str], +) -> tuple[list[str], list[str], list[str], list[str]]: + """Patch DTB and initramfs embedded in a sparse Linux DDR hex image.""" + + def is_gzip_first_word(word: str) -> bool: + try: + return (int(word, 16) & 0x00FF_FFFF) == 0x0008_8B1F + except ValueError: + return False + + lines = path.read_text().splitlines() + dtb_directive = f"@{DTB_WORD:08x}" + initrd_directive = f"@{INITRD_WORD:08x}" + try: + dtb_line = next( + i for i, line in enumerate(lines) if line.strip().lower() == dtb_directive + ) + except StopIteration as exc: 
+ raise SystemExit(f"{path}: missing DTB address directive") from exc + initrd_line = next( + (i for i, line in enumerate(lines) if line.strip().lower() == initrd_directive), + None, + ) + if initrd_line is not None and initrd_line <= dtb_line: + raise SystemExit(f"{path}: initrd directive appears before DTB directive") + + dtb_slot_words = INITRD_WORD - DTB_WORD + sparse_payload_initrd_word = dtb_slot_words + if initrd_line is None: + payload_words = [ + line.strip().lower() for line in lines[dtb_line + 1 :] if line.strip() + ] + if len(payload_words) > dtb_slot_words and is_gzip_first_word( + payload_words[dtb_slot_words] + ): + sparse_payload_initrd_word = dtb_slot_words + dtb_words = payload_words[:dtb_slot_words] + else: + gzip_word = next( + ( + idx + for idx, word in enumerate(payload_words) + if is_gzip_first_word(word) + ), + None, + ) + if gzip_word is None: + raise SystemExit( + f"{path}: missing initrd directive and gzip initrd header" + ) + sparse_payload_initrd_word = gzip_word + dtb_words = payload_words[:gzip_word] + initrd_words = payload_words[sparse_payload_initrd_word:] + else: + dtb_words = [ + line.strip().lower() + for line in lines[dtb_line + 1 : initrd_line] + if line.strip() + ] + initrd_words = [ + line.strip().lower() for line in lines[initrd_line + 1 :] if line.strip() + ] + dtb_slot = padded_dtb_slot(dtb_words) + initrd_start, initrd_end = get_initrd_bounds(dtb_slot) + initrd_word = (initrd_start - KERNEL_ENTRY) // 4 + if initrd_word != INITRD_WORD: + raise SystemExit(f"{path}: unexpected initrd word offset 0x{initrd_word:x}") + initrd_size = initrd_end - initrd_start + initrd_gz = words_to_bytes(initrd_words)[:initrd_size] + new_initrd_gz, added_devices, replaced_files, added_files, deleted_files = ( + patch_initramfs( + initrd_gz, initramfs_replacements, initramfs_additions, initramfs_deletions + ) + ) + new_initrd_end = initrd_start + len(new_initrd_gz) + + new_dtb_words = bytes_to_words(rewrite_dtb(dtb_slot, bootargs, 
new_initrd_end)) + if DTB_WORD + len(new_dtb_words) > INITRD_WORD: + raise SystemExit(f"{path}: patched DTB overlaps initrd") + new_initrd_words = bytes_to_words(new_initrd_gz) + + lines[dtb_line + 1 :] = new_dtb_words + [initrd_directive] + new_initrd_words + path.write_text("\n".join(lines) + "\n") + return added_devices, replaced_files, added_files, deleted_files + + +def patch_linux_image( + path: Path, + bootargs: str | None, + initramfs_replacements: dict[str, bytes], + initramfs_additions: dict[str, tuple[int, bytes]], + initramfs_deletions: set[str], +) -> tuple[list[str], list[str], list[str], list[str]]: + """Patch a Linux DDR image, dispatching to dense or sparse handler by format.""" + for line in path.read_text().splitlines(): + stripped = line.strip() + if not stripped: + continue + if stripped.startswith("@"): + return patch_sparse_image( + path, + bootargs, + initramfs_replacements, + initramfs_additions, + initramfs_deletions, + ) + return patch_dense_image( + path, + bootargs, + initramfs_replacements, + initramfs_additions, + initramfs_deletions, + ) + raise SystemExit(f"{path}: empty Linux DDR image") + + def main() -> None: + """Entry point: patches the Linux DDR image with all FROST boot patches.""" parser = argparse.ArgumentParser() parser.add_argument("sw_ddr_mem", type=Path) parser.add_argument("sw_ddr_txt", type=Path) args = parser.parse_args() - patch_words(args.sw_ddr_mem) - patch_words(args.sw_ddr_txt) + patch_ret_restore_window(args.sw_ddr_mem) + patch_ret_restore_window(args.sw_ddr_txt) print(f"Patched Linux ret_from_exception restore window: {OLD_WORD}->{NEW_WORD}") + noop_initcall_names = split_env_names( + os.environ.get("FROST_LINUX_NOOP_INITCALLS", "") + ) + noop_function_names = split_env_names( + os.environ.get("FROST_LINUX_NOOP_FUNCTIONS", "") + ) + system_map = Path( + os.environ.get("FROST_LINUX_SYSTEM_MAP", DEFAULT_SYSTEM_MAP) + ).expanduser() + noop_initcall_symbols = resolve_system_map_symbols(system_map, 
noop_initcall_names) + patch_noop_return_zero(args.sw_ddr_mem, noop_initcall_symbols) + patch_noop_return_zero(args.sw_ddr_txt, noop_initcall_symbols) + if noop_initcall_symbols: + patched = " ".join( + f"{name}@0x{noop_initcall_symbols[name]:08x}" + for name in noop_initcall_names + ) + print(f"Patched Linux initcalls to return 0: {patched}") + + noop_function_symbols = resolve_system_map_symbols(system_map, noop_function_names) + patch_noop_return_zero(args.sw_ddr_mem, noop_function_symbols) + patch_noop_return_zero(args.sw_ddr_txt, noop_function_symbols) + if noop_function_symbols: + patched = " ".join( + f"{name}@0x{noop_function_symbols[name]:08x}" + for name in noop_function_names + ) + print(f"Patched Linux functions to return 0: {patched}") + + if os.environ.get("FROST_LINUX_NOP_CPU_RELAX_DIV") == "1": + delay_addr = resolve_system_map_symbols(system_map, [CPU_RELAX_DIV_SYMBOL])[ + CPU_RELAX_DIV_SYMBOL + ] + patch_cpu_relax_div(args.sw_ddr_mem, delay_addr) + patch_cpu_relax_div(args.sw_ddr_txt, delay_addr) + print( + f"Patched Linux {CPU_RELAX_DIV_SYMBOL} cpu_relax DIV-by-zero to NOP: " + f"{CPU_RELAX_DIV_SYMBOL}+0x{CPU_RELAX_DIV_OFFSET:x}@" + f"0x{delay_addr + CPU_RELAX_DIV_OFFSET:08x}" + ) + + if os.environ.get("FROST_LINUX_NOP_CPU_RELAX_PAUSE") == "1": + delay_addr = resolve_system_map_symbols(system_map, [CPU_RELAX_DIV_SYMBOL])[ + CPU_RELAX_DIV_SYMBOL + ] + patch_cpu_relax_pause(args.sw_ddr_mem, delay_addr) + patch_cpu_relax_pause(args.sw_ddr_txt, delay_addr) + print( + f"Patched Linux {CPU_RELAX_DIV_SYMBOL} cpu_relax PAUSE to NOP: " + f"{CPU_RELAX_DIV_SYMBOL}+0x{CPU_RELAX_PAUSE_OFFSET:x}@" + f"0x{delay_addr + CPU_RELAX_PAUSE_OFFSET:08x}" + ) + + if os.environ.get("FROST_LINUX_PATCH_PROC_GET_INODE_MODE_RELOAD") == "1": + patch_proc_get_inode_mode_reload(args.sw_ddr_mem) + patch_proc_get_inode_mode_reload(args.sw_ddr_txt) + print( + "Patched Linux proc_get_inode mode reload: " + f"{','.join(f'0x{addr:08x}' for addr in PROC_GET_INODE_MODE_RELOAD_ADDRS)} " 
+ f"{PROC_GET_INODE_MODE_RELOAD_OLD.hex()}->" + f"{PROC_GET_INODE_MODE_RELOAD_NEW.hex()}" + ) + + if os.environ.get("FROST_LINUX_FORCE_PROC_GET_INODE_MODE_REG") == "1": + patch_proc_get_inode_force_mode_reg(args.sw_ddr_mem) + patch_proc_get_inode_force_mode_reg(args.sw_ddr_txt) + print( + "Patched Linux proc_get_inode mode load to S_IFREG: " + f"0x{PROC_GET_INODE_MODE_LOAD_ADDR:08x} " + f"{PROC_GET_INODE_MODE_LOAD_OLD.hex()}->" + f"{PROC_GET_INODE_MODE_FORCE_REG.hex()}" + ) + + if os.environ.get("FROST_LINUX_PATCH_PROC_LOOKUP_REF_CONST") == "1": + patch_proc_lookup_ref_const(args.sw_ddr_mem) + patch_proc_lookup_ref_const(args.sw_ddr_txt) + print( + "Patched Linux proc_lookup_de refcount AMO result to 1: " + f"0x{PROC_LOOKUP_REF_AMO_ADDR:08x} " + f"{PROC_LOOKUP_REF_AMO_OLD.hex()}->" + f"{PROC_LOOKUP_REF_AMO_CONST.hex()}" + ) + + if os.environ.get("FROST_LINUX_PATCH_PROC_LOOKUP_DE_ADJUST") == "1": + patch_proc_lookup_de_adjust(args.sw_ddr_mem) + patch_proc_lookup_de_adjust(args.sw_ddr_txt) + print( + "Patched Linux proc_lookup_de returned pointer adjust: " + f"0x{PROC_LOOKUP_DE_ADJUST_ADDR:08x} " + f"{PROC_LOOKUP_DE_ADJUST_OLD.hex()}->" + f"{PROC_LOOKUP_DE_ADJUST_NEW.hex()}" + ) + + bootargs = os.environ.get("FROST_LINUX_BOOTARGS") + initramfs_replacements = get_initramfs_replacements() + initramfs_additions = get_initramfs_additions() + initramfs_deletions = get_initramfs_deletions() + sparse_devices, sparse_replaced, sparse_added, sparse_deleted = patch_linux_image( + args.sw_ddr_mem, + bootargs, + initramfs_replacements, + initramfs_additions, + initramfs_deletions, + ) + dense_devices, dense_replaced, dense_added, dense_deleted = patch_linux_image( + args.sw_ddr_txt, + bootargs, + initramfs_replacements, + initramfs_additions, + initramfs_deletions, + ) + if bootargs: + print(f"Patched Linux DTB bootargs: {bootargs}") + added_devices = sorted(set(sparse_devices) | set(dense_devices)) + if added_devices: + print(f"Patched Linux initramfs device nodes: {' 
'.join(added_devices)}") + replaced_files = sorted(set(sparse_replaced) | set(dense_replaced)) + if replaced_files: + print(f"Patched Linux initramfs files: {' '.join(replaced_files)}") + added_files = sorted(set(sparse_added) | set(dense_added)) + if added_files: + print(f"Patched Linux initramfs added files: {' '.join(added_files)}") + deleted_files = sorted(set(sparse_deleted) | set(dense_deleted)) + if deleted_files: + print(f"Patched Linux initramfs deleted files: {' '.join(deleted_files)}") + if __name__ == "__main__": main() diff --git a/sw/apps/linux_clksrc_faithful/Makefile b/sw/apps/linux_clksrc_faithful/Makefile new file mode 100644 index 00000000..7a4cf72a --- /dev/null +++ b/sw/apps/linux_clksrc_faithful/Makefile @@ -0,0 +1,19 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Faithful Linux clocksource-switch timer stressor. Force the whole program +# into cached DDR (matches the kernel's DDR-resident code/data/stack). 
+override MEM_CONFIG := ddr +SRC_C := ../../lib/src/uart.c main.c +include ../../common/common.mk diff --git a/sw/apps/linux_clksrc_faithful/main.c b/sw/apps/linux_clksrc_faithful/main.c new file mode 100644 index 00000000..9047d804 --- /dev/null +++ b/sw/apps/linux_clksrc_faithful/main.c @@ -0,0 +1,343 @@ +/* + * Copyright 2026 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Faithful Linux clocksource-switch timer stressor (M-mode, DDR-resident). + * + * Mirrors what no-MMU Linux actually does at/after "Switched to clocksource + * clint_clocksource", which the existing linux_irq_*_ddr tests do NOT: + * + * - clint_clock_next_event() ORDER: csr_set(MTIE) is done FIRST, THEN + * mtimecmp is armed with a non-disabling 2-write lo-then-hi writeq + * (io-64-nonatomic-lo-hi). So MTIE is enabled while the OLD (just-fired) + * mtimecmp is still <= mtime, and the new deadline is written through a + * torn {old_hi,new_lo} transient. + * - clint_timer_interrupt() RE-ARMS: it acks with csr_clear(MTIE), then the + * event_handler re-arms via clint_clock_next_event(). It never leaves the + * timer disabled, so a tick taken "early" cannot strand a later wfi (the + * failure mode of the other tests, which is a test artifact, not Linux). + * - arch_cpu_idle() is a BARE wfi with mstatus.MIE left enabled throughout; + * MTIE is what gets toggled, by the handler. 
+ * - concurrent cached-DDR churn so a machine-timer IRQ frequently lands while + * cached (long-latency) loads/stores are still outstanding. + * + * Run at hardware-realistic DDR latency (DDR_MODEL_LATENCY>=70, CACHED_HAS_L2=0). + * PASS prints <>; a frame-integrity violation prints <> with a code; + * a true deadlock is caught by the RTL no-retire watchdog. + */ + +#include + +#include "csr.h" +#include "trap.h" +#include "uart.h" + +#define CLINT_MTIMECMP_LO (*(volatile uint32_t *) 0x40014000u) +#define CLINT_MTIMECMP_HI (*(volatile uint32_t *) 0x40014004u) +#define CLINT_MTIME_LO (*(volatile uint32_t *) 0x4001BFF8u) +#define CLINT_MTIME_HI (*(volatile uint32_t *) 0x4001BFFCu) + +#define TARGET_TICKS 64u +#define DDR_STACK_SIZE 4096u +#define CHURN_WORDS 4096 /* 16 KiB > L1: each idle sweep sustains DDR misses */ + +struct linux_pt_regs { + uint32_t epc, ra, sp, gp, tp; + uint32_t t0, t1, t2, s0, s1; + uint32_t a0, a1, a2, a3, a4, a5, a6, a7; + uint32_t s2, s3, s4, s5, s6, s7, s8, s9, s10, s11; + uint32_t t3, t4, t5, t6; + uint32_t status, badaddr, cause, orig_a0; +}; + +struct fake_current { + uint32_t kernel_sp; + uint32_t user_sp; + uint32_t marker; +}; + +volatile struct fake_current g_fake_current = {0u, 0u, 0x5441534Bu}; +volatile uint32_t g_ticks; +volatile uint32_t g_fail_code; +volatile uint32_t g_fail_seen; +volatile uint32_t g_last_mepc; +volatile uint32_t g_last_ra; +volatile uint32_t g_last_sp; +volatile uint32_t g_last_tp; +volatile uint32_t g_last_mscratch; +volatile uint32_t g_churn[CHURN_WORDS]; + +static uint8_t g_ddr_stack[DDR_STACK_SIZE] __attribute__((aligned(16))); + +static inline uint32_t read_tp(void) +{ + uint32_t v; + __asm__ volatile("mv %0, tp" : "=r"(v)); + return v; +} + +static inline void write_tp(uint32_t v) +{ + __asm__ volatile("mv tp, %0" : : "r"(v) : "memory"); +} + +static void record_failure(uint32_t code) +{ + if (!g_fail_seen) { + g_fail_seen = 1u; + g_fail_code = code; + } +} + +static uint64_t clint_rdmtime(void) +{ + 
uint32_t hi, lo, hi2; + do { + hi = CLINT_MTIME_HI; + lo = CLINT_MTIME_LO; + hi2 = CLINT_MTIME_HI; + } while (hi != hi2); + return ((uint64_t) hi << 32) | lo; +} + +/* Linux clint_clock_next_event(): enable MTIE FIRST, then non-disabling + * lo-then-hi writeq of the new deadline (io-64-nonatomic-lo-hi). */ +static void clint_clock_next_event(uint64_t cmp) +{ + csr_set(mie, MIE_MTIE); + CLINT_MTIMECMP_LO = (uint32_t) cmp; + CLINT_MTIMECMP_HI = (uint32_t) (cmp >> 32); +} + +static uint32_t churn_ddr(uint32_t seed) +{ + uint32_t acc = seed; + for (int i = 0; i < CHURN_WORDS; i++) { + uint32_t v = g_churn[i]; + acc ^= v + ((uint32_t) i << 3); + acc = (acc << 5) | (acc >> 27); + g_churn[i] = v ^ acc ^ (0x9E3779B9u + (uint32_t) i); + } + return acc; +} + +/* Linux clint_timer_interrupt(): ack by clearing MTIE, then RE-ARM via the + * event_handler -> clint_clock_next_event() path. */ +__attribute__((noinline, used)) void faithful_irq_c(struct linux_pt_regs *frame) +{ + csr_clear(mie, MIE_MTIE); + + g_last_mepc = frame->epc; + g_last_ra = frame->ra; + g_last_sp = frame->sp; + g_last_tp = frame->tp; + g_last_mscratch = csr_read(mscratch); + + if (frame->cause != (MCAUSE_INTERRUPT_BIT | INT_MTI)) { + record_failure(1u); + } + /* Corrupted/garbage return PC is the hardware symptom (ra==epc==0xCC0). */ + if (frame->epc < 0x80000000u || frame->epc == 0x00000CC0u) { + record_failure(2u); + } + if (frame->ra < 0x80000000u || frame->ra == 0x00000CC0u) { + record_failure(3u); + } + if (frame->sp < (uint32_t) &g_ddr_stack[0] || + frame->sp > (uint32_t) &g_ddr_stack[DDR_STACK_SIZE]) { + record_failure(4u); + } + if (frame->tp != (uint32_t) &g_fake_current) { + record_failure(5u); + } + if (g_last_mscratch != 0u) { + record_failure(6u); + } + + /* Light handler-side cached touch (rotating window) so the handler stays + * short; the sustained DDR traffic comes from the idle-loop sweep. 
*/ + { + uint32_t base = (g_ticks << 4) & (CHURN_WORDS - 1u); + uint32_t acc = frame->epc ^ frame->ra ^ g_ticks; + for (int i = 0; i < 8; i++) { + uint32_t idx = (base + (uint32_t) i) & (CHURN_WORDS - 1u); + acc ^= g_churn[idx]; + g_churn[idx] = acc + (uint32_t) i; + } + } + g_ticks = g_ticks + 1u; + + /* event_handler -> clint_clock_next_event(now + delta). Vary the delta so + * the IRQ phase relative to the idle churn/wfi sweeps across alignments. */ + clint_clock_next_event(clint_rdmtime() + 256u + ((uint64_t) (g_ticks & 63u) << 3)); +} + +/* Linux-style naked trap entry: save/restore the GPR frame on the current + * (DDR) stack, csrrw tp,mscratch,tp swap idiom, sc.w in the return path. */ +__attribute__((naked, aligned(4))) static void faithful_irq_entry(void) +{ + __asm__ volatile("csrrw tp, mscratch, tp\n" + "bnez tp, 1f\n" + "csrr tp, mscratch\n" + "1:\n" + "addi sp, sp, -144\n" + "sw ra, 4(sp)\n" + "sw gp, 12(sp)\n" + "sw t0, 20(sp)\n" + "sw t1, 24(sp)\n" + "sw t2, 28(sp)\n" + "sw s0, 32(sp)\n" + "sw s1, 36(sp)\n" + "sw a0, 40(sp)\n" + "sw a1, 44(sp)\n" + "sw a2, 48(sp)\n" + "sw a3, 52(sp)\n" + "sw a4, 56(sp)\n" + "sw a5, 60(sp)\n" + "sw a6, 64(sp)\n" + "sw a7, 68(sp)\n" + "sw s2, 72(sp)\n" + "sw s3, 76(sp)\n" + "sw s4, 80(sp)\n" + "sw s5, 84(sp)\n" + "sw s6, 88(sp)\n" + "sw s7, 92(sp)\n" + "sw s8, 96(sp)\n" + "sw s9, 100(sp)\n" + "sw s10, 104(sp)\n" + "sw s11, 108(sp)\n" + "sw t3, 112(sp)\n" + "sw t4, 116(sp)\n" + "sw t5, 120(sp)\n" + "sw t6, 124(sp)\n" + "sw a0, 140(sp)\n" + "addi t0, sp, 144\n" + "sw t0, 8(sp)\n" + "csrr t0, mepc\n" + "sw t0, 0(sp)\n" + "csrr t0, mstatus\n" + "sw t0, 128(sp)\n" + "csrr t0, mtval\n" + "sw t0, 132(sp)\n" + "csrr t0, mcause\n" + "sw t0, 136(sp)\n" + "csrr t0, mscratch\n" + "sw t0, 16(sp)\n" + "csrw mscratch, x0\n" + "mv a0, sp\n" + "call faithful_irq_c\n" + "lw a0, 128(sp)\n" + "lw a2, 0(sp)\n" + "sc.w x0, a2, 0(sp)\n" + "csrw mstatus, a0\n" + "csrw mepc, a2\n" + "lw ra, 4(sp)\n" + "lw gp, 12(sp)\n" + "lw tp, 16(sp)\n" + 
"lw t0, 20(sp)\n" + "lw t1, 24(sp)\n" + "lw t2, 28(sp)\n" + "lw s0, 32(sp)\n" + "lw s1, 36(sp)\n" + "lw a0, 40(sp)\n" + "lw a1, 44(sp)\n" + "lw a2, 48(sp)\n" + "lw a3, 52(sp)\n" + "lw a4, 56(sp)\n" + "lw a5, 60(sp)\n" + "lw a6, 64(sp)\n" + "lw a7, 68(sp)\n" + "lw s2, 72(sp)\n" + "lw s3, 76(sp)\n" + "lw s4, 80(sp)\n" + "lw s5, 84(sp)\n" + "lw s6, 88(sp)\n" + "lw s7, 92(sp)\n" + "lw s8, 96(sp)\n" + "lw s9, 100(sp)\n" + "lw s10, 104(sp)\n" + "lw s11, 108(sp)\n" + "lw t3, 112(sp)\n" + "lw t4, 116(sp)\n" + "lw t5, 120(sp)\n" + "lw t6, 124(sp)\n" + "lw sp, 8(sp)\n" + "mret\n"); +} + +__attribute__((noreturn, noinline, used)) void main_on_ddr_stack(void) +{ + uart_printf("\n=== Linux faithful clocksource-switch timer test ===\n"); + + for (int i = 0; i < CHURN_WORDS; i++) { + g_churn[i] = 0x80000000u ^ ((uint32_t) i * 0x10204081u); + } + g_fake_current.kernel_sp = (uint32_t) &g_ddr_stack[DDR_STACK_SIZE]; + g_fake_current.user_sp = 0u; + + write_tp((uint32_t) &g_fake_current); + csr_write(mscratch, 0u); + set_trap_handler(&faithful_irq_entry); + + /* Start the clockevent (clint_timer_starting_cpu -> first next_event), then + * enable MIE once and leave it on, exactly like the kernel after boot. */ + clint_clock_next_event(clint_rdmtime() + 384u); + enable_interrupts(); + + /* arch_cpu_idle(): bare wfi with MIE on, interleaved with concurrent + * cached-DDR work so IRQs land while cached ops are outstanding. 
*/ + uint32_t spin = 0x2468ACE0u; + while (g_ticks < TARGET_TICKS && !g_fail_seen) { + spin = churn_ddr(spin ^ g_ticks); + __asm__ volatile("wfi" ::: "memory"); + } + + disable_timer_interrupt(); + disable_interrupts(); + + if (!g_fail_seen && g_ticks >= TARGET_TICKS && spin != 0u) { + uart_printf("ticks=%u spin=%08x last_mepc=%08x last_ra=%08x\n", + g_ticks, + spin, + g_last_mepc, + g_last_ra); + uart_printf("<>\n"); + } else { + uart_printf("FAIL code=%u ticks=%u mepc=%08x ra=%08x sp=%08x tp=%08x mscratch=%08x\n", + g_fail_code, + g_ticks, + g_last_mepc, + g_last_ra, + g_last_sp, + g_last_tp, + g_last_mscratch); + uart_printf("<>\n"); + } + + for (;;) { + } +} + +int main(void) +{ + uint32_t stack_top = ((uint32_t) &g_ddr_stack[DDR_STACK_SIZE]) & ~0xFu; + __asm__ volatile("mv sp, %0\n" + "j main_on_ddr_stack\n" + : + : "r"(stack_top) + : "memory"); + __builtin_unreachable(); +} diff --git a/sw/apps/linux_irq_active_ddr_test/main.c b/sw/apps/linux_irq_active_ddr_test/main.c index 1d479aa4..d7ed3288 100644 --- a/sw/apps/linux_irq_active_ddr_test/main.c +++ b/sw/apps/linux_irq_active_ddr_test/main.c @@ -42,10 +42,52 @@ #define NORMAL_IRQ_COUNT 16u #define POISON_IRQ_COUNT 16u #define ACTIVE_IRQ_COUNT 8u -#define IRQ_COUNT (NORMAL_IRQ_COUNT + POISON_IRQ_COUNT + ACTIVE_IRQ_COUNT) +#define SENTINEL_IRQ_COUNT 32u +#define IRQ_COUNT (NORMAL_IRQ_COUNT + POISON_IRQ_COUNT + ACTIVE_IRQ_COUNT + SENTINEL_IRQ_COUNT) #define FRAME_WORDS 36u #define DDR_STACK_SIZE 4096u +#define FRAME_EPC 0u +#define FRAME_RA 1u +#define FRAME_SP 2u +#define FRAME_GP 3u +#define FRAME_TP 4u +#define FRAME_T0 5u +#define FRAME_T1 6u +#define FRAME_T2 7u +#define FRAME_S0 8u +#define FRAME_S1 9u +#define FRAME_A0 10u +#define FRAME_A1 11u +#define FRAME_A2 12u +#define FRAME_A3 13u +#define FRAME_A4 14u +#define FRAME_A5 15u +#define FRAME_A6 16u +#define FRAME_A7 17u +#define FRAME_S2 18u +#define FRAME_S3 19u +#define FRAME_S4 20u +#define FRAME_S5 21u +#define FRAME_S6 22u +#define FRAME_S7 23u 
+#define FRAME_S8 24u +#define FRAME_S9 25u +#define FRAME_S10 26u +#define FRAME_S11 27u + +#define SENTINEL_S0 0x51000000u +#define SENTINEL_S1 0x51000001u +#define SENTINEL_S3 0x51000003u +#define SENTINEL_S4 0x51000004u +#define SENTINEL_S5 0x51000005u +#define SENTINEL_S6 0x51000006u +#define SENTINEL_S7 0x51000007u +#define SENTINEL_S8 0x51000008u +#define SENTINEL_S9 0x51000009u +#define SENTINEL_S10 0x5100000Au +#define SENTINEL_S11 0x5100000Bu + struct linux_pt_regs { uint32_t epc; uint32_t ra; @@ -111,6 +153,12 @@ volatile uint32_t g_last_mscratch_in_handler; volatile uint32_t g_context_checksum; volatile uint32_t g_context_words[64]; volatile uint32_t g_frame_snapshots[IRQ_COUNT][FRAME_WORDS]; +volatile uint32_t g_frame_check_mask[FRAME_WORDS]; +volatile uint32_t g_expected_frame[FRAME_WORDS]; +volatile uint32_t g_bad_frame_index; +volatile uint32_t g_bad_expected; +volatile uint32_t g_bad_actual; +volatile uint32_t g_bad_tick; static uint8_t g_ddr_stack[DDR_STACK_SIZE] __attribute__((aligned(16))); @@ -135,6 +183,53 @@ static void record_failure(uint32_t code) } } +__attribute__((noinline, used)) void +record_frame_failure(uint32_t code, uint32_t index, uint32_t expected, uint32_t actual) +{ + if (!g_fail_seen) { + g_bad_frame_index = index; + g_bad_expected = expected; + g_bad_actual = actual; + g_bad_tick = g_ticks; + record_failure(code); + } +} + +static void clear_frame_checks(void) +{ + for (uint32_t i = 0; i < FRAME_WORDS; i++) { + g_frame_check_mask[i] = 0u; + g_expected_frame[i] = 0u; + } +} + +static void expect_frame_word(uint32_t index, uint32_t value) +{ + g_expected_frame[index] = value; + g_frame_check_mask[index] = 0xFFFFFFFFu; +} + +static void check_frame_masked(struct linux_pt_regs *frame) +{ + volatile uint32_t *words = (volatile uint32_t *) frame; + + for (uint32_t i = 0; i < FRAME_WORDS; i++) { + uint32_t mask = g_frame_check_mask[i]; + uint32_t actual; + uint32_t expected; + + if (!mask) { + continue; + } + actual = words[i]; + 
expected = g_expected_frame[i]; + if (((actual ^ expected) & mask) != 0u) { + record_frame_failure(30u, i, expected, actual); + break; + } + } +} + static void fill_context(void) { for (int i = 0; i < ARRAY_LEN(g_context_words); i++) { @@ -190,14 +285,13 @@ __attribute__((noinline)) static uint32_t active_poison_window(uint32_t value) { uint32_t out; - __asm__ volatile( - "lui t5, 0x1\n" - "addi t5, t5, -832\n" - "xor %[out], %[in], t5\n" - "addi %[out], %[out], 37\n" - : [out] "=&r"(out) - : [in] "r"(value) - : "t5", "memory"); + __asm__ volatile("lui t5, 0x1\n" + "addi t5, t5, -832\n" + "xor %[out], %[in], t5\n" + "addi %[out], %[out], 37\n" + : [out] "=&r"(out) + : [in] "r"(value) + : "t5", "memory"); return out; } @@ -242,6 +336,7 @@ __attribute__((noinline)) static uint32_t active_until_irq(uint32_t iter) write_tp((uint32_t) &g_fake_current); csr_write(mscratch, 0u); g_exact_frame_check = 0u; + clear_frame_checks(); clint_set_timer_cmp(clint_rdmtime() + 700u + (iter & 63u)); enable_interrupts(); @@ -269,6 +364,240 @@ __attribute__((noinline)) static uint32_t active_until_irq(uint32_t iter) return churn_context(acc ^ g_ticks); } +static void setup_sentinel_frame_checks(void) +{ + clear_frame_checks(); + expect_frame_word(FRAME_TP, (uint32_t) &g_fake_current); + expect_frame_word(FRAME_S0, SENTINEL_S0); + expect_frame_word(FRAME_S1, SENTINEL_S1); + expect_frame_word(FRAME_S2, (uint32_t) &g_fake_current); + expect_frame_word(FRAME_S3, SENTINEL_S3); + expect_frame_word(FRAME_S4, SENTINEL_S4); + expect_frame_word(FRAME_S5, SENTINEL_S5); + expect_frame_word(FRAME_S6, SENTINEL_S6); + expect_frame_word(FRAME_S7, SENTINEL_S7); + expect_frame_word(FRAME_S8, SENTINEL_S8); + expect_frame_word(FRAME_S9, SENTINEL_S9); + expect_frame_word(FRAME_S10, SENTINEL_S10); + expect_frame_word(FRAME_S11, SENTINEL_S11); +} + +__attribute__((naked, noinline, used)) static uint32_t name_to_int_shape_asm(uint32_t seed) +{ + __asm__ volatile("li a5, 0x19999998\n" + "addi a4, a5, 9\n" + 
"xor a0, a0, a5\n" + "add a0, a0, a4\n" + "ret\n"); +} + +__attribute__((naked, noinline, used)) static uint32_t sentinel_irq_window(uint32_t before) +{ + __asm__ volatile("addi sp, sp, -64\n" + "sw ra, 0(sp)\n" + "sw s0, 4(sp)\n" + "sw s1, 8(sp)\n" + "sw s2, 12(sp)\n" + "sw s3, 16(sp)\n" + "sw s4, 20(sp)\n" + "sw s5, 24(sp)\n" + "sw s6, 28(sp)\n" + "sw s7, 32(sp)\n" + "sw s8, 36(sp)\n" + "sw s9, 40(sp)\n" + "sw s10, 44(sp)\n" + "sw s11, 48(sp)\n" + "sw a0, 52(sp)\n" + "li s0, 0x51000000\n" + "li s1, 0x51000001\n" + "la s2, g_fake_current\n" + "li s3, 0x51000003\n" + "li s4, 0x51000004\n" + "li s5, 0x51000005\n" + "li s6, 0x51000006\n" + "li s7, 0x51000007\n" + "li s8, 0x51000008\n" + "li s9, 0x51000009\n" + "li s10, 0x5100000a\n" + "li s11, 0x5100000b\n" + "li t0, 8\n" + "csrs mstatus, t0\n" + "li t6, 0\n" + "1:\n" + "lw a0, 52(sp)\n" + "call name_to_int_shape_asm\n" + "la t0, g_fail_seen\n" + "lw t1, 0(t0)\n" + "bnez t1, 2f\n" + "la t0, g_ticks\n" + "lw t1, 0(t0)\n" + "lw t2, 52(sp)\n" + "bne t1, t2, 2f\n" + "addi t6, t6, 1\n" + "li t3, 30000\n" + "bltu t6, t3, 1b\n" + "li t0, 8\n" + "csrc mstatus, t0\n" + "li a0, 41\n" + "li a1, 0xffffffff\n" + "li a2, 0\n" + "mv a3, t6\n" + "call record_frame_failure\n" + "j 3f\n" + "2:\n" + "li t0, 8\n" + "csrc mstatus, t0\n" + "3:\n" + "li t0, 0x51000000\n" + "beq s0, t0, 4f\n" + "li a0, 31\n" + "li a1, 8\n" + "li a2, 0x51000000\n" + "mv a3, s0\n" + "call record_frame_failure\n" + "j 15f\n" + "4:\n" + "li t0, 0x51000001\n" + "beq s1, t0, 5f\n" + "li a0, 31\n" + "li a1, 9\n" + "li a2, 0x51000001\n" + "mv a3, s1\n" + "call record_frame_failure\n" + "j 15f\n" + "5:\n" + "la t0, g_fake_current\n" + "beq s2, t0, 6f\n" + "li a0, 31\n" + "li a1, 18\n" + "la a2, g_fake_current\n" + "mv a3, s2\n" + "call record_frame_failure\n" + "j 15f\n" + "6:\n" + "li t0, 0x51000003\n" + "beq s3, t0, 7f\n" + "li a0, 31\n" + "li a1, 19\n" + "li a2, 0x51000003\n" + "mv a3, s3\n" + "call record_frame_failure\n" + "j 15f\n" + "7:\n" + "li t0, 
0x51000004\n" + "beq s4, t0, 8f\n" + "li a0, 31\n" + "li a1, 20\n" + "li a2, 0x51000004\n" + "mv a3, s4\n" + "call record_frame_failure\n" + "j 15f\n" + "8:\n" + "li t0, 0x51000005\n" + "beq s5, t0, 9f\n" + "li a0, 31\n" + "li a1, 21\n" + "li a2, 0x51000005\n" + "mv a3, s5\n" + "call record_frame_failure\n" + "j 15f\n" + "9:\n" + "li t0, 0x51000006\n" + "beq s6, t0, 10f\n" + "li a0, 31\n" + "li a1, 22\n" + "li a2, 0x51000006\n" + "mv a3, s6\n" + "call record_frame_failure\n" + "j 15f\n" + "10:\n" + "li t0, 0x51000007\n" + "beq s7, t0, 11f\n" + "li a0, 31\n" + "li a1, 23\n" + "li a2, 0x51000007\n" + "mv a3, s7\n" + "call record_frame_failure\n" + "j 15f\n" + "11:\n" + "li t0, 0x51000008\n" + "beq s8, t0, 12f\n" + "li a0, 31\n" + "li a1, 24\n" + "li a2, 0x51000008\n" + "mv a3, s8\n" + "call record_frame_failure\n" + "j 15f\n" + "12:\n" + "li t0, 0x51000009\n" + "beq s9, t0, 13f\n" + "li a0, 31\n" + "li a1, 25\n" + "li a2, 0x51000009\n" + "mv a3, s9\n" + "call record_frame_failure\n" + "j 15f\n" + "13:\n" + "li t0, 0x5100000a\n" + "beq s10, t0, 14f\n" + "li a0, 31\n" + "li a1, 26\n" + "li a2, 0x5100000a\n" + "mv a3, s10\n" + "call record_frame_failure\n" + "j 15f\n" + "14:\n" + "li t0, 0x5100000b\n" + "beq s11, t0, 15f\n" + "li a0, 31\n" + "li a1, 27\n" + "li a2, 0x5100000b\n" + "mv a3, s11\n" + "call record_frame_failure\n" + "15:\n" + "lw ra, 0(sp)\n" + "lw s0, 4(sp)\n" + "lw s1, 8(sp)\n" + "lw s2, 12(sp)\n" + "lw s3, 16(sp)\n" + "lw s4, 20(sp)\n" + "lw s5, 24(sp)\n" + "lw s6, 28(sp)\n" + "lw s7, 32(sp)\n" + "lw s8, 36(sp)\n" + "lw s9, 40(sp)\n" + "lw s10, 44(sp)\n" + "lw s11, 48(sp)\n" + "addi sp, sp, 64\n" + "ret\n"); +} + +__attribute__((noinline)) static uint32_t sentinel_until_irq(uint32_t iter) +{ + uint32_t before = g_ticks; + + write_tp((uint32_t) &g_fake_current); + csr_write(mscratch, 0u); + g_exact_frame_check = 0u; + setup_sentinel_frame_checks(); + clint_set_timer_cmp(clint_rdmtime() + 180u + ((iter * 37u) & 255u)); + sentinel_irq_window(before); + 
disable_interrupts(); + clear_frame_checks(); + + if (g_ticks != before + 1u) { + record_failure(32u); + } + if (read_tp() != (uint32_t) &g_fake_current) { + record_failure(33u); + } + if (csr_read(mscratch) != 0u) { + record_failure(34u); + } + + return churn_context(0x19999998u ^ iter ^ g_ticks); +} + __attribute__((noinline, used)) void linux_like_irq_c(struct linux_pt_regs *frame) { uint32_t tick = g_ticks; @@ -290,11 +619,16 @@ __attribute__((noinline, used)) void linux_like_irq_c(struct linux_pt_regs *fram g_bad_ra = frame->ra; record_failure(1u); uart_printf("FAIL code=%u ticks=%u cause=%08x mepc=%08x ra=%08x\n", - g_fail_code, g_ticks, frame->cause, frame->epc, frame->ra); + g_fail_code, + g_ticks, + frame->cause, + frame->epc, + frame->ra); uart_printf("<>\n"); for (;;) { } } + check_frame_masked(frame); if (g_exact_frame_check) { if (frame->epc != g_expected_mepc) { record_failure(2u); @@ -337,94 +671,93 @@ __attribute__((noinline, used)) void linux_like_irq_c(struct linux_pt_regs *fram __attribute__((naked, aligned(4))) static void linux_like_irq_entry(void) { - __asm__ volatile( - "csrrw tp, mscratch, tp\n" - "bnez tp, 1f\n" - "csrr tp, mscratch\n" - "1:\n" - "addi sp, sp, -144\n" - "sw ra, 4(sp)\n" - "sw gp, 12(sp)\n" - "sw t0, 20(sp)\n" - "sw t1, 24(sp)\n" - "sw t2, 28(sp)\n" - "sw s0, 32(sp)\n" - "sw s1, 36(sp)\n" - "sw a0, 40(sp)\n" - "sw a1, 44(sp)\n" - "sw a2, 48(sp)\n" - "sw a3, 52(sp)\n" - "sw a4, 56(sp)\n" - "sw a5, 60(sp)\n" - "sw a6, 64(sp)\n" - "sw a7, 68(sp)\n" - "sw s2, 72(sp)\n" - "sw s3, 76(sp)\n" - "sw s4, 80(sp)\n" - "sw s5, 84(sp)\n" - "sw s6, 88(sp)\n" - "sw s7, 92(sp)\n" - "sw s8, 96(sp)\n" - "sw s9, 100(sp)\n" - "sw s10, 104(sp)\n" - "sw s11, 108(sp)\n" - "sw t3, 112(sp)\n" - "sw t4, 116(sp)\n" - "sw t5, 120(sp)\n" - "sw t6, 124(sp)\n" - "sw a0, 140(sp)\n" - "addi t0, sp, 144\n" - "sw t0, 8(sp)\n" - "csrr t0, mepc\n" - "sw t0, 0(sp)\n" - "csrr t0, mstatus\n" - "sw t0, 128(sp)\n" - "csrr t0, mtval\n" - "sw t0, 132(sp)\n" - "csrr t0, 
mcause\n" - "sw t0, 136(sp)\n" - "csrr t0, mscratch\n" - "sw t0, 16(sp)\n" - "csrw mscratch, x0\n" - "mv a0, sp\n" - "call linux_like_irq_c\n" - "lw a0, 128(sp)\n" - "lw a2, 0(sp)\n" - "sc.w x0, a2, 0(sp)\n" - "csrw mstatus, a0\n" - "csrw mepc, a2\n" - "lw ra, 4(sp)\n" - "lw gp, 12(sp)\n" - "lw tp, 16(sp)\n" - "lw t0, 20(sp)\n" - "lw t1, 24(sp)\n" - "lw t2, 28(sp)\n" - "lw s0, 32(sp)\n" - "lw s1, 36(sp)\n" - "lw a0, 40(sp)\n" - "lw a1, 44(sp)\n" - "lw a2, 48(sp)\n" - "lw a3, 52(sp)\n" - "lw a4, 56(sp)\n" - "lw a5, 60(sp)\n" - "lw a6, 64(sp)\n" - "lw a7, 68(sp)\n" - "lw s2, 72(sp)\n" - "lw s3, 76(sp)\n" - "lw s4, 80(sp)\n" - "lw s5, 84(sp)\n" - "lw s6, 88(sp)\n" - "lw s7, 92(sp)\n" - "lw s8, 96(sp)\n" - "lw s9, 100(sp)\n" - "lw s10, 104(sp)\n" - "lw s11, 108(sp)\n" - "lw t3, 112(sp)\n" - "lw t4, 116(sp)\n" - "lw t5, 120(sp)\n" - "lw t6, 124(sp)\n" - "lw sp, 8(sp)\n" - "mret\n"); + __asm__ volatile("csrrw tp, mscratch, tp\n" + "bnez tp, 1f\n" + "csrr tp, mscratch\n" + "1:\n" + "addi sp, sp, -144\n" + "sw ra, 4(sp)\n" + "sw gp, 12(sp)\n" + "sw t0, 20(sp)\n" + "sw t1, 24(sp)\n" + "sw t2, 28(sp)\n" + "sw s0, 32(sp)\n" + "sw s1, 36(sp)\n" + "sw a0, 40(sp)\n" + "sw a1, 44(sp)\n" + "sw a2, 48(sp)\n" + "sw a3, 52(sp)\n" + "sw a4, 56(sp)\n" + "sw a5, 60(sp)\n" + "sw a6, 64(sp)\n" + "sw a7, 68(sp)\n" + "sw s2, 72(sp)\n" + "sw s3, 76(sp)\n" + "sw s4, 80(sp)\n" + "sw s5, 84(sp)\n" + "sw s6, 88(sp)\n" + "sw s7, 92(sp)\n" + "sw s8, 96(sp)\n" + "sw s9, 100(sp)\n" + "sw s10, 104(sp)\n" + "sw s11, 108(sp)\n" + "sw t3, 112(sp)\n" + "sw t4, 116(sp)\n" + "sw t5, 120(sp)\n" + "sw t6, 124(sp)\n" + "sw a0, 140(sp)\n" + "addi t0, sp, 144\n" + "sw t0, 8(sp)\n" + "csrr t0, mepc\n" + "sw t0, 0(sp)\n" + "csrr t0, mstatus\n" + "sw t0, 128(sp)\n" + "csrr t0, mtval\n" + "sw t0, 132(sp)\n" + "csrr t0, mcause\n" + "sw t0, 136(sp)\n" + "csrr t0, mscratch\n" + "sw t0, 16(sp)\n" + "csrw mscratch, x0\n" + "mv a0, sp\n" + "call linux_like_irq_c\n" + "lw a0, 128(sp)\n" + "lw a2, 0(sp)\n" + "sc.w x0, a2, 
0(sp)\n" + "csrw mstatus, a0\n" + "csrw mepc, a2\n" + "lw ra, 4(sp)\n" + "lw gp, 12(sp)\n" + "lw tp, 16(sp)\n" + "lw t0, 20(sp)\n" + "lw t1, 24(sp)\n" + "lw t2, 28(sp)\n" + "lw s0, 32(sp)\n" + "lw s1, 36(sp)\n" + "lw a0, 40(sp)\n" + "lw a1, 44(sp)\n" + "lw a2, 48(sp)\n" + "lw a3, 52(sp)\n" + "lw a4, 56(sp)\n" + "lw a5, 60(sp)\n" + "lw a6, 64(sp)\n" + "lw a7, 68(sp)\n" + "lw s2, 72(sp)\n" + "lw s3, 76(sp)\n" + "lw s4, 80(sp)\n" + "lw s5, 84(sp)\n" + "lw s6, 88(sp)\n" + "lw s7, 92(sp)\n" + "lw s8, 96(sp)\n" + "lw s9, 100(sp)\n" + "lw s10, 104(sp)\n" + "lw s11, 108(sp)\n" + "lw t3, 112(sp)\n" + "lw t4, 116(sp)\n" + "lw t5, 120(sp)\n" + "lw t6, 124(sp)\n" + "lw sp, 8(sp)\n" + "mret\n"); } __attribute__((noinline)) static uint32_t idle_once(uint32_t iter) @@ -434,27 +767,27 @@ __attribute__((noinline)) static uint32_t idle_once(uint32_t iter) write_tp((uint32_t) &g_fake_current); csr_write(mscratch, 0u); g_exact_frame_check = 1u; + clear_frame_checks(); clint_set_timer_cmp(clint_rdmtime() + 300u + (iter & 31u)); enable_interrupts(); - __asm__ volatile( - "mv t2, ra\n" - "mv t3, sp\n" - "mv t4, tp\n" - "la t0, 1f\n" - "la t1, g_expected_mepc\n" - "sw t0, 0(t1)\n" - "la t1, g_expected_ra\n" - "sw t2, 0(t1)\n" - "la t1, g_expected_sp\n" - "sw t3, 0(t1)\n" - "la t1, g_expected_tp\n" - "sw t4, 0(t1)\n" - "wfi\n" - "1:\n" - : - : - : "t0", "t1", "t2", "t3", "t4", "memory"); + __asm__ volatile("mv t2, ra\n" + "mv t3, sp\n" + "mv t4, tp\n" + "la t0, 1f\n" + "la t1, g_expected_mepc\n" + "sw t0, 0(t1)\n" + "la t1, g_expected_ra\n" + "sw t2, 0(t1)\n" + "la t1, g_expected_sp\n" + "sw t3, 0(t1)\n" + "la t1, g_expected_tp\n" + "sw t4, 0(t1)\n" + "wfi\n" + "1:\n" + : + : + : "t0", "t1", "t2", "t3", "t4", "memory"); disable_interrupts(); @@ -478,30 +811,30 @@ __attribute__((noinline)) static uint32_t idle_then_poison_ra_once(uint32_t iter write_tp((uint32_t) &g_fake_current); csr_write(mscratch, 0u); g_exact_frame_check = 1u; + clear_frame_checks(); clint_set_timer_cmp(clint_rdmtime() 
+ 300u + (iter & 31u)); enable_interrupts(); - __asm__ volatile( - "mv t2, ra\n" - "mv t3, sp\n" - "mv t4, tp\n" - "la t0, 1f\n" - "la t1, g_expected_mepc\n" - "sw t0, 0(t1)\n" - "la t1, g_expected_ra\n" - "sw t2, 0(t1)\n" - "la t1, g_expected_sp\n" - "sw t3, 0(t1)\n" - "la t1, g_expected_tp\n" - "sw t4, 0(t1)\n" - "wfi\n" - "1:\n" - "lui ra, 0x1\n" - "addi ra, ra, -832\n" - "mv ra, t2\n" - : - : - : "t0", "t1", "t2", "t3", "t4", "memory"); + __asm__ volatile("mv t2, ra\n" + "mv t3, sp\n" + "mv t4, tp\n" + "la t0, 1f\n" + "la t1, g_expected_mepc\n" + "sw t0, 0(t1)\n" + "la t1, g_expected_ra\n" + "sw t2, 0(t1)\n" + "la t1, g_expected_sp\n" + "sw t3, 0(t1)\n" + "la t1, g_expected_tp\n" + "sw t4, 0(t1)\n" + "wfi\n" + "1:\n" + "lui ra, 0x1\n" + "addi ra, ra, -832\n" + "mv ra, t2\n" + : + : + : "t0", "t1", "t2", "t3", "t4", "memory"); disable_interrupts(); @@ -524,6 +857,7 @@ __attribute__((noreturn, noinline, used)) void main_on_ddr_stack(void) uart_printf("\n=== Linux-like active DDR timer IRQ test ===\n"); fill_context(); + clear_frame_checks(); g_fake_current.kernel_sp = (uint32_t) &g_ddr_stack[DDR_STACK_SIZE]; g_fake_current.user_sp = 0u; set_trap_handler(&linux_like_irq_entry); @@ -542,6 +876,9 @@ __attribute__((noreturn, noinline, used)) void main_on_ddr_stack(void) for (uint32_t i = 0; i < ACTIVE_IRQ_COUNT && !g_fail_seen; i++) { aggregate ^= active_until_irq(i); } + for (uint32_t i = 0; i < SENTINEL_IRQ_COUNT && !g_fail_seen; i++) { + aggregate ^= sentinel_until_irq(i); + } disable_timer_interrupt(); disable_interrupts(); @@ -549,12 +886,27 @@ __attribute__((noreturn, noinline, used)) void main_on_ddr_stack(void) if (!g_fail_seen && g_ticks == IRQ_COUNT && aggregate != 0u) { uart_printf("ticks=%u checksum=%08x last_mepc=%08x last_ra=%08x\n", - g_ticks, g_context_checksum, g_last_mepc, g_last_ra); + g_ticks, + g_context_checksum, + g_last_mepc, + g_last_ra); uart_printf("<>\n"); } else { - uart_printf("FAIL code=%u ticks=%u cause=%08x mepc=%08x ra=%08x sp=%08x 
tp=%08x mscratch=%08x\n", - g_fail_code, g_ticks, g_bad_cause, g_last_mepc, g_last_ra, - g_last_sp, g_last_tp, g_last_mscratch_in_handler); + uart_printf( + "FAIL code=%u ticks=%u cause=%08x mepc=%08x ra=%08x sp=%08x tp=%08x mscratch=%08x\n", + g_fail_code, + g_ticks, + g_bad_cause, + g_last_mepc, + g_last_ra, + g_last_sp, + g_last_tp, + g_last_mscratch_in_handler); + uart_printf("bad_frame idx=%u tick=%u expected=%08x actual=%08x\n", + g_bad_frame_index, + g_bad_tick, + g_bad_expected, + g_bad_actual); uart_printf("<>\n"); } @@ -566,11 +918,10 @@ int main(void) { uint32_t stack_top = ((uint32_t) &g_ddr_stack[DDR_STACK_SIZE]) & ~0xFu; - __asm__ volatile( - "mv sp, %0\n" - "j main_on_ddr_stack\n" - : - : "r"(stack_top) - : "memory"); + __asm__ volatile("mv sp, %0\n" + "j main_on_ddr_stack\n" + : + : "r"(stack_top) + : "memory"); __builtin_unreachable(); } diff --git a/sw/apps/mret_drain_deadlock/Makefile b/sw/apps/mret_drain_deadlock/Makefile new file mode 100644 index 00000000..21271c3f --- /dev/null +++ b/sw/apps/mret_drain_deadlock/Makefile @@ -0,0 +1,22 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2026 Two Sigma Open Source, LLC +# SPDX-License-Identifier: Apache-2.0 +# MRET-drain deadlock directed test (one-shot o_mret_start vs draining store). 
+# Program runs from BRAM (simple boot); only the store buffer lives in the cached +# DDR region (a loaded .ddr_data section) so the behavioral DDR model serves its +# lines and cached stores actually drain (sq_committed_empty toggles). +SRC_C := main.c +include ../../common/common.mk diff --git a/sw/apps/mret_drain_deadlock/main.c b/sw/apps/mret_drain_deadlock/main.c new file mode 100644 index 00000000..6e5451c3 --- /dev/null +++ b/sw/apps/mret_drain_deadlock/main.c @@ -0,0 +1,171 @@ +/* + * Copyright 2026 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * MRET-drain deadlock directed test (deterministic). + * + * Reproduces the residual flaky HANG seen booting no-MMU Linux on Genesys2: + * the kernel intermittently wedges at the first idle/clocksource machine-timer + * activity. Proven root cause (FROST RTL): + * + * o_mret_start (reorder_buffer.sv) is a strict ONE-CYCLE pulse asserted only + * on the SERIAL_IDLE->SERIAL_MRET_EXEC cycle (unlike o_trap_pending, it has no + * SERIAL_*_WAIT sustaining term). trap_unit.sv take_mret requires + * i_sq_committed_empty IN THAT SAME CYCLE and has no retry. So if a committed + * store is still draining when an MRET reaches the ROB head, take_mret misses + * its only shot: mret_taken/mret_done never assert and the serializer wedges + * in SERIAL_MRET_EXEC forever (commit_stall=1 freezes the core). 
There is no + * escape -- the stuck MRET never restores MIE, so no later interrupt can flush + * the pipeline back to SERIAL_IDLE. + * + * Why the existing tests miss it: mtimer_stress / wfi_mepc_test / + * mret_timer_resume_test all keep the handler stack in low BRAM (drains in ~1 + * cycle, so sq_committed_empty is already 1 when the MRET arrives) and never + * create the "MRET reaches head while a committed CACHED/DDR store is mid-drain" + * window. The real kernel saves/restores its trap frame on the cached DDR kernel + * stack and idles (WFI) so the ROB empties and the restore MRET reaches head + * almost immediately -- exactly this window. + * + * This test makes the window DETERMINISTIC and timer-independent: an M-mode loop + * commits a backlog of distinct-line stores into the cached/DDR region (slow, + * serialized write-back drains => sq_committed_empty held 0 for many cycles), + * then immediately executes an MRET back to the loop top. On buggy RTL the very + * first MRET wedges in SERIAL_MRET_EXEC and the loop never prints <<PASS>> (the + * cocotb harness times out, and the optional FROST_MRET_DEADLOCK_PROBE asserts + * on serial_state stuck in SERIAL_MRET_EXEC). On fixed RTL every MRET waits out + * the drain, completes, and the loop prints <<PASS>>. + * + * The cocotb registration bakes in the Genesys2 cached shape (-GCACHED_HAS_L2=0, + * L1 -> DDR direct), which is where the bug manifests on hardware and where a + * cold cached-store write-back actually drains in sim, so the standard flow just + * works: + * cd frost/tests; make clean; ./test_run_cocotb.py mret_drain_deadlock + * BEFORE the cpu_ooo/reorder_buffer o_mret_start fix: the first MRET wedges in + * SERIAL_MRET_EXEC and the harness times out. AFTER the fix: <<PASS>>. + */ + +#include + +#include "trap.h" + +/* Store buffer in the cached/DDR region (CACHED_BASE = 0x8000_0000). 
Placed in + * a loaded .ddr_data section (like ddr_atomic_test): it lives in the behavioral + * DDR model and -- unlike .bss -- is NOT touched by crt0, so the loop's first + * stores to it MISS the L1 and take the full ~DDR_MODEL_LATENCY write-back drain + * (fill from valid DDR, then write), reliably holding sq_committed_empty low + * while the MRET reaches the ROB head. Non-zero initializer forces it loaded. */ +__attribute__((section(".ddr_data"), aligned(64))) static volatile uint32_t g_ddr_buf[256] = {1}; + +static void uart_putc(char c) +{ + UART_TX = (uint8_t) c; +} +static void uart_puts(const char *s) +{ + while (*s) + uart_putc(*s++); +} +static void uart_hex(uint32_t v) +{ + static const char hex[] = "0123456789ABCDEF"; + uart_puts("0x"); + for (int i = 28; i >= 0; i -= 4) + uart_putc(hex[(v >> i) & 0xF]); +} + +/* + * Unexpected-trap canary. Nothing in this test should trap (no interrupts are + * enabled and every access is legal); if one does (e.g. an unexpected fault), + * spin emitting 'T' so the failure is visible over UART instead of a silent + * wild jump. Naked: entered as a raw trap handler. + */ +__attribute__((naked, aligned(4))) static void trap_canary(void) +{ + __asm__ volatile("li t0, 0x40000000\n" /* UART_TX */ + "li t1, 'T'\n" + "1:\n" + "sb t1, 0(t0)\n" + "j 1b\n"); +} + +/* + * Commit a backlog of distinct-line cached/DDR stores, then MRET back to the top + * of the loop -- `iters` times. The MRET is the loop back-edge, reached a handful + * of cycles after the youngest store commits, while that store (and the rest of + * the backlog) is still draining => the one-shot o_mret_start pulse coincides + * with sq_committed_empty==0. + * + * a0 = cached/DDR buffer base, a1 = iteration count. Naked: hand-written control + * flow (the MRET is the loop branch). Uses only caller-saved temporaries, so the + * final `ret` returns to C with ra intact. 
+ */ +__attribute__((naked)) static void mret_drain_loop(volatile uint32_t *ddr, uint32_t iters) +{ + (void) ddr; + (void) iters; + __asm__ volatile( + /* MRET return target = loop top. Constant, so set mepc ONCE; MRET reads + * mepc but never writes it. */ + "la t1, 1f\n" + "csrw mepc, t1\n" + "li t2, 0x1800\n" /* mstatus.MPP = M (0b11 << 11) mask */ + "1:\n" + "beqz a1, 3f\n" /* done after `iters` MRETs */ + "addi a1, a1, -1\n" + /* MPP=M re-set here (BEFORE the backlog), since MRET pops MPP to U. Kept + * off the youngest-store->MRET critical path so NO instruction sits + * between the last store and the MRET. */ + "csrs mstatus, t2\n" + /* A few stores to distinct 32 B lines (64 B apart). Enough that the + * youngest committed store is still in its (cached/DDR) write-back drain + * when the MRET reaches the ROB head, but few enough not to overflow the + * store queue (which would wedge on backpressure, not on the MRET). */ + "sw a1, 0(a0)\n" + "sw a1, 64(a0)\n" + "sw a1, 128(a0)\n" + "sw a1, 192(a0)\n" /* youngest committed store; still draining at MRET */ + /* MRET immediately follows the youngest store: it reaches the ROB head a + * couple cycles later, while that store (and the backlog) is still + * draining => the one-shot o_mret_start pulse coincides with + * sq_committed_empty==0. */ + "mret\n" + "3:\n" + "ret\n" :: + : "t0", "t1", "t2", "a0", "a1", "memory"); +} + +int main(void) +{ + uart_puts("\r\n=== MRET drain-deadlock repro ===\r\n"); + + /* Any unexpected trap becomes visible rather than a silent wild jump. */ + set_trap_handler(&trap_canary); + + /* No interrupts: this deadlock is purely the MRET<->store-drain handshake. */ + (void) disable_interrupts(); + + uart_puts("running MRET/drain loop...\r\n"); + mret_drain_loop(g_ddr_buf, 16u); + + /* Only reached if every MRET completed (fixed RTL). On buggy RTL the first + * MRET wedges the serializer and we never get here. 
*/ + uart_puts("survived all MRETs: iters="); + uart_hex(16u); + uart_puts("\r\n<>\r\n"); + for (;;) { + } + return 0; +} diff --git a/sw/apps/pde_return_hazard/Makefile b/sw/apps/pde_return_hazard/Makefile new file mode 100644 index 00000000..b521507f --- /dev/null +++ b/sw/apps/pde_return_hazard/Makefile @@ -0,0 +1,21 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); + +override MEM_CONFIG := ddr +SRC_C := ../../lib/src/uart.c main.c +include ../../common/common.mk diff --git a/sw/apps/pde_return_hazard/main.c b/sw/apps/pde_return_hazard/main.c new file mode 100644 index 00000000..d47eaae8 --- /dev/null +++ b/sw/apps/pde_return_hazard/main.c @@ -0,0 +1,843 @@ +/* + * Copyright 2026 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * Directed reproducer for the procfs /proc lookup failure seen on hardware. + * + * The Linux failure shows proc_get_inode() receiving a pointer that looks like + * a proc_dir_entry.subdir_node, not the proc_dir_entry base. The hot epilogue in + * pde_subdir_find() subtracts the rb_node offset from s1, then returns it via + * a0 shortly before restoring the caller's s1. + */ + +#include + +#include "uart.h" + +#define ITERATIONS 64u +#define PDE_VIS_ITERATIONS 16u +#define PDE_VIS_CHURN_BYTES (16u * 1024u) +#define PDE_SUBDIR_NODE_OFFSET 80u +#define PDE_SUBDIR_ROOT_OFFSET 76u +#define PDE_REFCOUNT_OFFSET 4u +#define PDE_NLINK_OFFSET 48u +#define PDE_UID_OFFSET 52u +#define PDE_GID_OFFSET 56u +#define PDE_NAME_OFFSET 92u +#define PDE_MODE_OFFSET 96u +#define PDE_FLAGS_OFFSET 98u +#define PDE_NAMELEN_OFFSET 99u +#define PDE_INLINE_NAME_OFFSET 100u +#define RB_RIGHT_OFFSET 4u +#define RB_LEFT_OFFSET 8u +#define RB_NAME_OFFSET 12u +#define RB_NAMELEN_OFFSET 19u +#define MULTI_PDE_COUNT 5u +#define PDE_MODE_REG_0444 0x8124u + +static uint8_t root_pde[128] __attribute__((aligned(64))); +static uint8_t entry_pde[128] __attribute__((aligned(64))); +static uint8_t multi_pdes[MULTI_PDE_COUNT][128] __attribute__((aligned(64))); +static uint8_t fake_dir[32] __attribute__((aligned(64))); +static uint8_t fake_dentry[40] __attribute__((aligned(64))); +static uint8_t s2l_area[64 * 1024] __attribute__((aligned(64))); +static const char version_name[] = "version"; +static const char cmdline_name[] = "cmdline"; +static const char loadavg_name[] = "loadavg"; +static const char maps_name[] = "maps"; +static const char meminfo_name[] = "meminfo"; + +static volatile uintptr_t observed_de; +static volatile uintptr_t observed_sb; +static volatile uint32_t observed_ref_old; +static volatile uint32_t observed_mode; +static volatile uint32_t observed_namelen; + +static void churn_cache(uint32_t seed); + +__attribute__((noinline, naked, used, aligned(4))) static uintptr_t 
+epilogue_repro(uintptr_t node, uintptr_t salt2, uintptr_t salt3) +{ + __asm__ volatile("addi sp, sp, -32\n" + "sw s0, 24(sp)\n" + "sw ra, 28(sp)\n" + "sw s1, 20(sp)\n" + "sw s2, 16(sp)\n" + "sw s3, 12(sp)\n" + "addi s0, sp, 32\n" + "mv s1, a0\n" + "mv s2, a1\n" + "mv s3, a2\n" + "xor a5, s2, s3\n" + "andi a5, a5, 1\n" + "beqz a5, 1f\n" + "addi s1, s1, 0\n" + "1:\n" + "lw ra, 28(sp)\n" + "lw s0, 24(sp)\n" + "addi s1, s1, -80\n" + "lw s2, 16(sp)\n" + "lw s3, 12(sp)\n" + "mv a0, s1\n" + "lw s1, 20(sp)\n" + "addi sp, sp, 32\n" + "ret\n"); +} + +__attribute__((noinline, naked, used, aligned(4))) static uintptr_t +epilogue_direct_a0(uintptr_t node, uintptr_t salt2, uintptr_t salt3) +{ + __asm__ volatile("addi sp, sp, -32\n" + "sw s0, 24(sp)\n" + "sw ra, 28(sp)\n" + "sw s1, 20(sp)\n" + "sw s2, 16(sp)\n" + "sw s3, 12(sp)\n" + "addi s0, sp, 32\n" + "mv s1, a0\n" + "mv s2, a1\n" + "mv s3, a2\n" + "xor a5, s2, s3\n" + "andi a5, a5, 1\n" + "beqz a5, 1f\n" + "addi s1, s1, 0\n" + "1:\n" + "lw ra, 28(sp)\n" + "lw s0, 24(sp)\n" + "addi a0, s1, -80\n" + "lw s2, 16(sp)\n" + "lw s3, 12(sp)\n" + "lw s1, 20(sp)\n" + "addi sp, sp, 32\n" + "ret\n"); +} + +static volatile uintptr_t sink; +static volatile uint32_t s2l_sink; + +__attribute__((noinline, used)) static uint32_t halfword_s2l(uint8_t *ptr, uint32_t value) +{ + uint32_t out; + + __asm__ volatile("sh %[value], 0(%[ptr])\n" + "lhu %[out], 0(%[ptr])\n" + : [out] "=r"(out) + : [ptr] "r"(ptr), [value] "r"(value) + : "memory"); + return out; +} + +__attribute__((noinline, used)) static uint32_t amo_halfword_s2l(uint8_t *ptr, uint32_t value) +{ + uint32_t out; + + __asm__ volatile("li t0, 1\n" + "addi t1, %[ptr], 4\n" + "amoadd.w zero, t0, (t1)\n" + "sh %[value], 0(%[ptr])\n" + "lhu %[out], 0(%[ptr])\n" + : [out] "=r"(out) + : [ptr] "r"(ptr), [value] "r"(value) + : "t0", "t1", "memory"); + return out; +} + +__attribute__((noinline, used)) static uint32_t word_s2l(uint8_t *ptr, uint32_t value) +{ + uint32_t out; + + __asm__ volatile("sw 
%[value], 0(%[ptr])\n" + "lw %[out], 0(%[ptr])\n" + : [out] "=r"(out) + : [ptr] "r"(ptr), [value] "r"(value) + : "memory"); + return out; +} + +__attribute__((noinline, used)) static int +hazard_memcmp(const void *lhs, const void *rhs, uint32_t len) +{ + const uint8_t *a = (const uint8_t *) lhs; + const uint8_t *b = (const uint8_t *) rhs; + + for (uint32_t i = 0; i < len; i++) { + if (a[i] != b[i]) { + return (int) a[i] - (int) b[i]; + } + } + return 0; +} + +__attribute__((noinline, used)) static uintptr_t fake_proc_get_inode(uintptr_t sb, uintptr_t de) +{ + uint32_t mode; + + __asm__ volatile("lhu %0, 96(%1)" : "=r"(mode) : "r"(de) : "memory"); + observed_sb = sb; + observed_de = de; + observed_mode = mode; + observed_namelen = *(volatile uint8_t *) (uintptr_t) (de + PDE_NAMELEN_OFFSET); + return 0x12345678u; +} + +__attribute__((noinline, naked, used, aligned(4))) static void pde_init_version_asm(uintptr_t de) +{ + __asm__ volatile("addi t0, a0, 100\n" + "sw t0, 92(a0)\n" + "li t1, 0x73726576\n" /* "vers" */ + "sw t1, 100(a0)\n" + "li t1, 0x006e6f69\n" /* "ion\\0" */ + "sw t1, 104(a0)\n" + "li t1, 1\n" + "sw t1, 4(a0)\n" + "addi t2, a0, 8\n" + "sw t2, 8(a0)\n" + "li t3, 0x8124\n" + "sh t3, 96(a0)\n" + "sw t1, 48(a0)\n" + "sw zero, 76(a0)\n" + "li t4, 7\n" + "sb t4, 99(a0)\n" + "sw t2, 12(a0)\n" + "sw zero, 52(a0)\n" + "sw zero, 56(a0)\n" + "ret\n"); +} + +__attribute__((noinline, naked, used, aligned(4))) static uintptr_t +pde_subdir_find_asm(uintptr_t de, const char *name, uint32_t len) +{ + __asm__ volatile("addi sp, sp, -32\n" + "sw s0, 24(sp)\n" + "sw ra, 28(sp)\n" + "sw s1, 20(sp)\n" + "addi s0, sp, 32\n" + "lw s1, 76(a0)\n" + "beqz s1, 4f\n" + "sw s2, 16(sp)\n" + "sw s3, 12(sp)\n" + "mv s2, a2\n" + "mv s3, a1\n" + "1:\n" + "lbu a5, 19(s1)\n" + "mv a2, s2\n" + "mv a0, s3\n" + "bltu s2, a5, 5f\n" + "bltu a5, s2, 2f\n" + "lw a1, 12(s1)\n" + "call hazard_memcmp\n" + "bltz a0, 5f\n" + "beqz a0, 6f\n" + "2:\n" + "lw s1, 4(s1)\n" + "bnez s1, 1b\n" + "3:\n" + "lw 
s2, 16(sp)\n" + "lw s3, 12(sp)\n" + "4:\n" + "lw ra, 28(sp)\n" + "lw s0, 24(sp)\n" + "mv a0, s1\n" + "lw s1, 20(sp)\n" + "addi sp, sp, 32\n" + "ret\n" + "5:\n" + "lw s1, 8(s1)\n" + "bnez s1, 1b\n" + "j 3b\n" + "6:\n" + "lw ra, 28(sp)\n" + "lw s0, 24(sp)\n" + "addi s1, s1, -80\n" + "lw s2, 16(sp)\n" + "lw s3, 12(sp)\n" + "mv a0, s1\n" + "lw s1, 20(sp)\n" + "addi sp, sp, 32\n" + "ret\n"); +} + +__attribute__((noinline, naked, used, aligned(4))) static uintptr_t +proc_lookup_de_asm(uintptr_t dir, uintptr_t dentry, uintptr_t de) +{ + __asm__ volatile("addi sp, sp, -32\n" + "sw s0, 24(sp)\n" + "sw s1, 20(sp)\n" + "sw s2, 16(sp)\n" + "sw ra, 28(sp)\n" + "addi s0, sp, 32\n" + "mv s2, a0\n" + "mv s1, a1\n" + "mv a0, a2\n" + "lw a2, 28(a1)\n" + "lw a1, 32(a1)\n" + "call pde_subdir_find_asm\n" + "beqz a0, 1f\n" + "mv a5, a0\n" + "li a1, 1\n" + "addi a0, a0, 4\n" + "amoadd.w a4, a1, (a0)\n" + "la t0, observed_ref_old\n" + "sw a4, 0(t0)\n" + "lw a0, 20(s2)\n" + "mv a1, a5\n" + "sw a5, -20(s0)\n" + "call fake_proc_get_inode\n" + "j 2f\n" + "1:\n" + "li a0, -2\n" + "2:\n" + "lw ra, 28(sp)\n" + "lw s0, 24(sp)\n" + "lw s1, 20(sp)\n" + "lw s2, 16(sp)\n" + "addi sp, sp, 32\n" + "ret\n"); +} + +static int run_one(const char *name, uintptr_t (*fn)(uintptr_t, uintptr_t, uintptr_t)) +{ + for (uint32_t i = 0; i < ITERATIONS; i++) { + uintptr_t node = 0x80c60050u + ((uintptr_t) i << 6); + uintptr_t expected = node - 80u; + uintptr_t got = fn(node, 0x13572468u + i, 0x24681357u ^ i); + sink ^= got; + if (got != expected) { + uart_printf("%s FAIL i=%u node=0x%08lx got=0x%08lx expected=0x%08lx\n", + name, + (unsigned) i, + (unsigned long) node, + (unsigned long) got, + (unsigned long) expected); + return -1; + } + } + uart_printf("%s PASS\n", name); + return 0; +} + +static void write32(uint8_t *base, uint32_t offset, uintptr_t value) +{ + *(volatile uintptr_t *) (void *) (base + offset) = value; +} + +static uintptr_t read32(uint8_t *base, uint32_t offset) +{ + return *(volatile uintptr_t *) 
(void *) (base + offset); +} + +static void clear_bytes(uint8_t *base, uint32_t size) +{ + for (uint32_t i = 0; i < size; i++) { + base[i] = 0; + } +} + +static uint32_t small_strlen(const char *name) +{ + uint32_t len = 0; + + while (name[len] != '\0') { + len++; + } + return len; +} + +static uintptr_t multi_base(uint32_t idx) +{ + return (uintptr_t) multi_pdes[idx]; +} + +static uintptr_t multi_node(uint32_t idx) +{ + return multi_base(idx) + PDE_SUBDIR_NODE_OFFSET; +} + +static const char *multi_inline_name(uint32_t idx) +{ + return (const char *) (const void *) (multi_pdes[idx] + PDE_INLINE_NAME_OFFSET); +} + +static const char *known_pde_name(uintptr_t de) +{ + for (uint32_t i = 0; i < MULTI_PDE_COUNT; i++) { + if (de == multi_base(i)) { + return multi_inline_name(i); + } + if (de == multi_node(i)) { + return "NODE_PTR"; + } + } + return "UNKNOWN"; +} + +static void init_multi_pde(uint32_t idx, const char *name) +{ + uint8_t *de = multi_pdes[idx]; + uint32_t len = small_strlen(name); + + clear_bytes(de, sizeof(multi_pdes[idx])); + write32(de, PDE_REFCOUNT_OFFSET, 1u); + write32(de, PDE_NAME_OFFSET, multi_base(idx) + PDE_INLINE_NAME_OFFSET); + for (uint32_t i = 0; i <= len; i++) { + de[PDE_INLINE_NAME_OFFSET + i] = (uint8_t) name[i]; + } + *(volatile uint16_t *) (void *) (de + PDE_MODE_OFFSET) = PDE_MODE_REG_0444; + de[PDE_NAMELEN_OFFSET] = (uint8_t) len; + write32(de, PDE_NLINK_OFFSET, 1u); +} + +static void set_rb_links(uint32_t idx, int32_t right_idx, int32_t left_idx) +{ + uint8_t *de = multi_pdes[idx]; + + write32(de, + PDE_SUBDIR_NODE_OFFSET + RB_RIGHT_OFFSET, + right_idx >= 0 ? multi_node((uint32_t) right_idx) : 0u); + write32(de, + PDE_SUBDIR_NODE_OFFSET + RB_LEFT_OFFSET, + left_idx >= 0 ? 
multi_node((uint32_t) left_idx) : 0u); +} + +enum { + MULTI_CMDLINE = 0, + MULTI_LOADAVG = 1, + MULTI_MAPS = 2, + MULTI_MEMINFO = 3, + MULTI_VERSION = 4, +}; + +struct multi_lookup_case { + const char *name; + uint32_t idx; +}; + +static const struct multi_lookup_case multi_lookup_cases[] = { + {version_name, MULTI_VERSION}, + {cmdline_name, MULTI_CMDLINE}, + {loadavg_name, MULTI_LOADAVG}, + {maps_name, MULTI_MAPS}, + {meminfo_name, MULTI_MEMINFO}, +}; + +static void setup_multi_proc_tree(void) +{ + clear_bytes(root_pde, sizeof(root_pde)); + clear_bytes(fake_dir, sizeof(fake_dir)); + clear_bytes(fake_dentry, sizeof(fake_dentry)); + + init_multi_pde(MULTI_CMDLINE, cmdline_name); + init_multi_pde(MULTI_LOADAVG, loadavg_name); + init_multi_pde(MULTI_MAPS, maps_name); + init_multi_pde(MULTI_MEMINFO, meminfo_name); + init_multi_pde(MULTI_VERSION, version_name); + + /* + * A small rb-tree with the same comparison properties as /proc root: + * loadavg -> left cmdline, right meminfo; meminfo -> left maps, right version. 
+ */ + write32(root_pde, PDE_SUBDIR_ROOT_OFFSET, multi_node(MULTI_LOADAVG)); + set_rb_links(MULTI_LOADAVG, MULTI_MEMINFO, MULTI_CMDLINE); + set_rb_links(MULTI_MEMINFO, MULTI_VERSION, MULTI_MAPS); + set_rb_links(MULTI_CMDLINE, -1, -1); + set_rb_links(MULTI_MAPS, -1, -1); + set_rb_links(MULTI_VERSION, -1, -1); + + write32(fake_dir, 20u, 0xcafef00du); +} + +static int +run_multi_lookup_case(const char *test_name, uint32_t iter, const struct multi_lookup_case *lookup) +{ + uint32_t len = small_strlen(lookup->name); + uintptr_t expected = multi_base(lookup->idx); + uintptr_t expected_node = multi_node(lookup->idx); + uint8_t *expected_pde = multi_pdes[lookup->idx]; + + observed_de = 0; + observed_sb = 0; + observed_ref_old = 0xdeadbeefu; + observed_mode = 0xdeadbeefu; + observed_namelen = 0xdeadbeefu; + write32(fake_dentry, 28u, len); + write32(fake_dentry, 32u, (uintptr_t) lookup->name); + + uintptr_t direct = pde_subdir_find_asm((uintptr_t) root_pde, lookup->name, len); + uintptr_t ret = + proc_lookup_de_asm((uintptr_t) fake_dir, (uintptr_t) fake_dentry, (uintptr_t) root_pde); + uint32_t ref_now = (uint32_t) read32(expected_pde, PDE_REFCOUNT_OFFSET); + + sink ^= direct ^ ret; + if (direct != expected || observed_de != expected || observed_sb != 0xcafef00du || + observed_ref_old != 1u || ref_now != 2u || observed_mode != PDE_MODE_REG_0444 || + observed_namelen != len) { + uart_printf("%s FAIL i=%u query=%s direct=0x%08lx expected=0x%08lx node=0x%08lx\n", + test_name, + (unsigned) iter, + lookup->name, + (unsigned long) direct, + (unsigned long) expected, + (unsigned long) expected_node); + uart_printf("%s obs_de=0x%08lx obs_name=%s sb=0x%08lx mode=0x%04lx len=%lu old=0x%08lx " + "ref=0x%08lx ret=0x%08lx\n", + test_name, + (unsigned long) observed_de, + known_pde_name(observed_de), + (unsigned long) observed_sb, + (unsigned long) observed_mode, + (unsigned long) observed_namelen, + (unsigned long) observed_ref_old, + (unsigned long) ref_now, + (unsigned long) ret); + 
return -1; + } + return 0; +} + +static int run_multi_lookup_repro_variant(const char *name, int churn) +{ + const uint32_t case_count = + (uint32_t) (sizeof(multi_lookup_cases) / sizeof(multi_lookup_cases[0])); + + for (uint32_t i = 0; i < PDE_VIS_ITERATIONS; i++) { + if (churn) { + churn_cache(i + 0x200u); + } + for (uint32_t c = 0; c < case_count; c++) { + setup_multi_proc_tree(); + if (run_multi_lookup_case(name, i, &multi_lookup_cases[c]) != 0) { + return -1; + } + } + } + + uart_printf("%s PASS\n", name); + return 0; +} + +static int run_multi_lookup_repro(void) +{ + if (run_multi_lookup_repro_variant("multi_lookup_immediate", 0) != 0) { + return -1; + } + if (run_multi_lookup_repro_variant("multi_lookup_churn", 1) != 0) { + return -1; + } + return 0; +} + +static void setup_fake_proc_tree(void) +{ + for (uint32_t i = 0; i < sizeof(root_pde); i++) { + root_pde[i] = 0; + entry_pde[i] = 0; + } + for (uint32_t i = 0; i < sizeof(fake_dir); i++) { + fake_dir[i] = 0; + } + for (uint32_t i = 0; i < sizeof(fake_dentry); i++) { + fake_dentry[i] = 0; + } + + uintptr_t entry_base = (uintptr_t) entry_pde; + uintptr_t entry_node = entry_base + PDE_SUBDIR_NODE_OFFSET; + + write32(root_pde, PDE_SUBDIR_ROOT_OFFSET, entry_node); + write32(entry_pde, PDE_REFCOUNT_OFFSET, 1u); + write32(entry_pde, PDE_SUBDIR_NODE_OFFSET + RB_RIGHT_OFFSET, 0u); + write32(entry_pde, PDE_SUBDIR_NODE_OFFSET + RB_LEFT_OFFSET, 0u); + write32(entry_pde, PDE_SUBDIR_NODE_OFFSET + RB_NAME_OFFSET, (uintptr_t) version_name); + entry_pde[PDE_SUBDIR_NODE_OFFSET + RB_NAMELEN_OFFSET] = 7u; + + write32(fake_dir, 20u, 0xcafef00du); + write32(fake_dentry, 28u, 7u); + write32(fake_dentry, 32u, (uintptr_t) version_name); +} + +static int run_lookup_repro(void) +{ + uintptr_t entry_base = (uintptr_t) entry_pde; + uintptr_t entry_node = entry_base + PDE_SUBDIR_NODE_OFFSET; + + for (uint32_t i = 0; i < ITERATIONS; i++) { + setup_fake_proc_tree(); + observed_de = 0; + observed_sb = 0; + + uintptr_t direct = 
pde_subdir_find_asm((uintptr_t) root_pde, version_name, 7u); + if (direct != entry_base) { + uart_printf("pde_subdir_find_asm FAIL i=%u got=0x%08lx expected=0x%08lx node=0x%08lx\n", + (unsigned) i, + (unsigned long) direct, + (unsigned long) entry_base, + (unsigned long) entry_node); + return -1; + } + + uintptr_t ret = + proc_lookup_de_asm((uintptr_t) fake_dir, (uintptr_t) fake_dentry, (uintptr_t) root_pde); + sink ^= ret; + if (observed_de != entry_base) { + uart_printf("proc_lookup_de_asm FAIL i=%u observed_de=0x%08lx expected=0x%08lx " + "node=0x%08lx ret=0x%08lx\n", + (unsigned) i, + (unsigned long) observed_de, + (unsigned long) entry_base, + (unsigned long) entry_node, + (unsigned long) ret); + return -1; + } + if (observed_sb != 0xcafef00du) { + uart_printf("proc_lookup_de_asm SB FAIL i=%u observed_sb=0x%08lx\n", + (unsigned) i, + (unsigned long) observed_sb); + return -1; + } + if (read32(entry_pde, PDE_REFCOUNT_OFFSET) != 2u) { + uart_printf( + "proc_lookup_de_asm REF FAIL i=%u ref=0x%08lx node_right=0x%08lx\n", + (unsigned) i, + (unsigned long) read32(entry_pde, PDE_REFCOUNT_OFFSET), + (unsigned long) read32(entry_pde, PDE_SUBDIR_NODE_OFFSET + RB_RIGHT_OFFSET)); + return -1; + } + } + + uart_printf("proc_lookup_de_asm PASS\n"); + return 0; +} + +static void setup_pde_visibility_tree(void) +{ + for (uint32_t i = 0; i < sizeof(root_pde); i++) { + root_pde[i] = 0; + entry_pde[i] = 0; + } + for (uint32_t i = 0; i < sizeof(fake_dir); i++) { + fake_dir[i] = 0; + } + for (uint32_t i = 0; i < sizeof(fake_dentry); i++) { + fake_dentry[i] = 0; + } + + uintptr_t entry_base = (uintptr_t) entry_pde; + uintptr_t entry_node = entry_base + PDE_SUBDIR_NODE_OFFSET; + + pde_init_version_asm(entry_base); + write32(root_pde, PDE_SUBDIR_ROOT_OFFSET, entry_node); + write32(fake_dir, 20u, 0xcafef00du); + write32(fake_dentry, 28u, 7u); + write32(fake_dentry, 32u, (uintptr_t) version_name); +} + +static void churn_cache(uint32_t seed) +{ + for (uint32_t i = 0; i < 
PDE_VIS_CHURN_BYTES; i += 64u) { + volatile uint32_t *word = (volatile uint32_t *) (void *) (s2l_area + i); + uint32_t value = *word ^ (seed + i + 0x9e3779b9u); + *word = value; + seed ^= *word + (seed << 5) + (seed >> 2); + } + s2l_sink ^= seed; +} + +static int +check_pde_visibility_result(const char *name, uint32_t i, uintptr_t direct, uintptr_t ret) +{ + uintptr_t entry_base = (uintptr_t) entry_pde; + uintptr_t entry_node = entry_base + PDE_SUBDIR_NODE_OFFSET; + uint32_t ref_now = (uint32_t) read32(entry_pde, PDE_REFCOUNT_OFFSET); + + if (direct != entry_base) { + uart_printf("%s FIND FAIL i=%u got=0x%08lx expected=0x%08lx node=0x%08lx namelen=%u\n", + name, + (unsigned) i, + (unsigned long) direct, + (unsigned long) entry_base, + (unsigned long) entry_node, + (unsigned) entry_pde[PDE_NAMELEN_OFFSET]); + return -1; + } + if (observed_de != entry_base) { + uart_printf("%s DE FAIL i=%u observed_de=0x%08lx expected=0x%08lx ret=0x%08lx\n", + name, + (unsigned) i, + (unsigned long) observed_de, + (unsigned long) entry_base, + (unsigned long) ret); + return -1; + } + if (observed_sb != 0xcafef00du) { + uart_printf("%s SB FAIL i=%u observed_sb=0x%08lx\n", + name, + (unsigned) i, + (unsigned long) observed_sb); + return -1; + } + if (observed_ref_old != 1u || ref_now != 2u) { + uart_printf("%s REF FAIL i=%u old=0x%08lx now=0x%08lx mode_mem=0x%04x namelen=%u\n", + name, + (unsigned) i, + (unsigned long) observed_ref_old, + (unsigned long) ref_now, + (unsigned) (*(volatile uint16_t *) (void *) (entry_pde + PDE_MODE_OFFSET)), + (unsigned) entry_pde[PDE_NAMELEN_OFFSET]); + return -1; + } + if (observed_mode != 0x8124u || observed_namelen != 7u) { + uart_printf("%s MODE FAIL i=%u mode=0x%04lx namelen=%lu ref_old=0x%08lx ref_now=0x%08lx " + "word96=0x%08lx\n", + name, + (unsigned) i, + (unsigned long) observed_mode, + (unsigned long) observed_namelen, + (unsigned long) observed_ref_old, + (unsigned long) ref_now, + (unsigned long) read32(entry_pde, PDE_MODE_OFFSET)); + 
return -1; + } + return 0; +} + +static int run_pde_visibility_repro_variant(const char *name, int churn) +{ + for (uint32_t i = 0; i < PDE_VIS_ITERATIONS; i++) { + setup_pde_visibility_tree(); + observed_de = 0; + observed_sb = 0; + observed_ref_old = 0xdeadbeefu; + observed_mode = 0xdeadbeefu; + observed_namelen = 0xdeadbeefu; + + if (churn) { + churn_cache(i); + } + + uintptr_t direct = pde_subdir_find_asm((uintptr_t) root_pde, version_name, 7u); + uintptr_t ret = + proc_lookup_de_asm((uintptr_t) fake_dir, (uintptr_t) fake_dentry, (uintptr_t) root_pde); + sink ^= direct ^ ret; + if (check_pde_visibility_result(name, i, direct, ret) != 0) { + return -1; + } + } + + uart_printf("%s PASS\n", name); + return 0; +} + +static int run_pde_visibility_repro(void) +{ + if (run_pde_visibility_repro_variant("pde_visibility_immediate", 0) != 0) { + return -1; + } + if (run_pde_visibility_repro_variant("pde_visibility_churn", 1) != 0) { + return -1; + } + return 0; +} + +static int run_store_load_repro(void) +{ + for (uint32_t i = 0; i < ITERATIONS; i++) { + uint8_t *ptr = s2l_area + (i * 256u); + uint32_t value = 0x8000u | ((i * 37u + 0x16du) & 0x7fffu); + uint32_t got = halfword_s2l(ptr, value); + s2l_sink ^= got; + if (got != (value & 0xffffu)) { + uart_printf("halfword_s2l FAIL i=%u ptr=0x%08lx got=0x%08lx expected=0x%08lx\n", + (unsigned) i, + (unsigned long) (uintptr_t) ptr, + (unsigned long) got, + (unsigned long) (value & 0xffffu)); + return -1; + } + } + uart_printf("halfword_s2l PASS\n"); + + for (uint32_t i = 0; i < ITERATIONS; i++) { + uint8_t *ptr = s2l_area + 0x4000u + (i * 256u); + uint32_t value = 0x40000000u | (i * 0x10203u) | 0x5a5u; + uint32_t got = word_s2l(ptr, value); + s2l_sink ^= got; + if (got != value) { + uart_printf("word_s2l FAIL i=%u ptr=0x%08lx got=0x%08lx expected=0x%08lx\n", + (unsigned) i, + (unsigned long) (uintptr_t) ptr, + (unsigned long) got, + (unsigned long) value); + return -1; + } + } + uart_printf("word_s2l PASS\n"); + + for 
(uint32_t i = 0; i < ITERATIONS; i++) { + uint8_t *ptr = s2l_area + 0x8000u + (i * 256u); + uint32_t value = 0x9000u | ((i * 53u + 0x55u) & 0x6fffu); + uint32_t got = amo_halfword_s2l(ptr, value); + s2l_sink ^= got; + if (got != (value & 0xffffu)) { + uart_printf("amo_halfword_s2l FAIL i=%u ptr=0x%08lx got=0x%08lx expected=0x%08lx\n", + (unsigned) i, + (unsigned long) (uintptr_t) ptr, + (unsigned long) got, + (unsigned long) (value & 0xffffu)); + return -1; + } + } + uart_printf("amo_halfword_s2l PASS\n"); + return 0; +} + +int main(void) +{ + uart_printf("\n=== pde_return_hazard ===\n"); + if (run_one("epilogue_repro", epilogue_repro) != 0) { + uart_printf("<<FAIL>>\n"); + for (;;) { + } + } + if (run_one("epilogue_direct_a0", epilogue_direct_a0) != 0) { + uart_printf("<<FAIL>>\n"); + for (;;) { + } + } + if (run_lookup_repro() != 0) { + uart_printf("<<FAIL>>\n"); + for (;;) { + } + } + if (run_multi_lookup_repro() != 0) { + uart_printf("<<FAIL>>\n"); + for (;;) { + } + } + if (run_pde_visibility_repro() != 0) { + uart_printf("<<FAIL>>\n"); + for (;;) { + } + } + if (run_store_load_repro() != 0) { + uart_printf("<<FAIL>>\n"); + for (;;) { + } + } + uart_printf("sink=0x%08lx\n", (unsigned long) sink); + uart_printf("s2l_sink=0x%08lx\n", (unsigned long) s2l_sink); + uart_printf("<<PASS>>\n"); + for (;;) { + } +} diff --git a/sw/apps/smc_fencei_test/Makefile b/sw/apps/smc_fencei_test/Makefile new file mode 100644 index 00000000..557494b2 --- /dev/null +++ b/sw/apps/smc_fencei_test/Makefile @@ -0,0 +1,19 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Makefile for the hardened self-modifying-code (fence.i) reproducer. +# Sweeps the store->fence.i timing/layout knobs (gap, warm/cold L1D, +# write-allocate miss, tight self-modify loops) that the boot hang implicates. +SRC_C := ../../lib/src/uart.c main.c +include ../../common/common.mk diff --git a/sw/apps/smc_fencei_test/main.c b/sw/apps/smc_fencei_test/main.c new file mode 100644 index 00000000..6108b2c6 --- /dev/null +++ b/sw/apps/smc_fencei_test/main.c @@ -0,0 +1,172 @@ +/* + * Copyright 2026 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Hardened self-modifying-code / fence.i directed reproducer. + * + * Models the kernel's runtime code-patching contract (patch_insn_write + + * fence.i): store a new instruction word into cached-DDR code, fence.i to + * sync, then fetch/execute it. The fence.i must: + * store -> SQ -> L1D (dirty) ... 
new code invisible to fetch + * fence.i: drain committed SQ -> L1D writeback-all -> L1I invalidate-all + * -> fetch-buffer invalidate + * call -> L1I miss -> fill returns the freshly written code + * + * The gentle ddr_smc_test passes; this sweeps the timing/layout knobs that the + * boot hang implicates so a transient becomes a deterministic, waveform-able + * failure: + * - store->fence.i freshness GAP (0/1/2/3/4/8 nops): how fresh the committed + * store is when fence.i drains the store queue. + * - WARM L1D (write-hit) vs COLD L1D (write-allocate miss): the L1D is + * 128 KiB direct-mapped with 32 B lines, so a single read +128 KiB shares + * the index but not the tag and conflict-evicts the ddr_code line, forcing + * the next patch store to miss and race the fence.i writeback walk. + * - tight alternating self-modify loops (a stale/previous read is always a + * detectable mismatch). + * + * Prints "<<PASS>>" if every post-fence.i call returns its freshly written + * value; "<<FAIL>>" with detail otherwise. A wedge (stale garbage executed) + * shows up as a simulation/UART timeout. + */ + +#include <stdint.h> + +#include "../../lib/include/uart.h" + +#define ADDI_A0(imm) (0x00000513u | (((uint32_t) (imm) & 0xfffu) << 20)) /* addi a0,x0,imm */ +#define RET_INSN 0x00008067u /* jalr x0,0(ra) */ + +/* Executable + writable patch target in the cached DDR region, line aligned + * (LINE_BYTES = 32). ddr_code[0] is the entry (patched); [1] is `ret`. */ +__attribute__((section(".ddr_data"), aligned(32))) static volatile uint32_t ddr_code[8]; + +/* Direct-mapped L1D = 128 KiB. */ +#define L1D_BYTES (128u * 1024u) + +typedef int (*fn_t)(void); + +/* Patch word[0] with `addi a0,x0,imm`, then GAP nops, then fence.i. The single + * 32-bit store mirrors patch_insn_write; GAP varies how fresh the committed + * store is when the fence.i serializer drains the SQ. 
*/ +#define MK_PATCH(name, nops) \ + static inline void name(uint32_t imm) \ + { \ + __asm__ volatile("sw %1, 0(%0)\n\t" nops "fence.i\n\t" \ + : \ + : "r"(&ddr_code[0]), "r"(ADDI_A0(imm)) \ + : "memory"); \ + } +MK_PATCH(patch_g0, "") +MK_PATCH(patch_g1, "nop\n\t") +MK_PATCH(patch_g2, "nop\n\tnop\n\t") +MK_PATCH(patch_g3, "nop\n\tnop\n\tnop\n\t") +MK_PATCH(patch_g4, "nop\n\tnop\n\tnop\n\tnop\n\t") +MK_PATCH(patch_g8, "nop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\t") + +typedef void (*patch_fn_t)(uint32_t); +static patch_fn_t const patchers[] = {patch_g0, patch_g1, patch_g2, patch_g3, patch_g4, patch_g8}; +static const int gaps[] = {0, 1, 2, 3, 4, 8}; +#define NGAPS ((int) (sizeof(gaps) / sizeof(gaps[0]))) + +/* Conflict-evict the ddr_code line from a direct-mapped L1D (read several + * +N*128 KiB aliases; one suffices for direct-mapped, extras cover any + * set-assoc surprise). */ +static inline void evict_code_line(void) +{ + uintptr_t base = (uintptr_t) &ddr_code[0]; + volatile uint32_t *a1 = (volatile uint32_t *) (base + 1u * L1D_BYTES); + volatile uint32_t *a2 = (volatile uint32_t *) (base + 2u * L1D_BYTES); + volatile uint32_t *a3 = (volatile uint32_t *) (base + 3u * L1D_BYTES); + volatile uint32_t *a4 = (volatile uint32_t *) (base + 4u * L1D_BYTES); + volatile uint32_t s = *a1 + *a2 + *a3 + *a4; + (void) s; +} + +static int g_fail; +static int g_reported; + +static void check(int tag, int gap, uint32_t want, int cold) +{ + fn_t fn = (fn_t) (uintptr_t) &ddr_code[0]; + int got = fn(); + if (got != (int) want) { + g_fail++; + if (g_reported < 16) { + uart_printf("FAIL tag=%x gap=%d cold=%d got=0x%x want=0x%x\n", + (unsigned) tag, + gap, + cold, + (unsigned) got, + (unsigned) want); + g_reported++; + } + } +} + +int main(void) +{ + /* Establish word[1] = ret once and sync it in. */ + ddr_code[1] = RET_INSN; + __asm__ volatile("fence.i" ::: "memory"); + + /* Phase A: gap sweep, WARM L1D (write-hit). 
*/ + uart_printf("A"); + for (int rep = 0; rep < 4; rep++) { + for (int g = 0; g < NGAPS; g++) { + uint32_t want = ((rep + g) & 1) ? 0x2Au : 0x355u; + patchers[g](want); + check(0xA, gaps[g], want, 0); + } + } + + /* Phase B: gap sweep, COLD L1D (write-allocate miss). */ + uart_printf("B"); + for (int rep = 0; rep < 4; rep++) { + for (int g = 0; g < NGAPS; g++) { + uint32_t want = ((rep + g) & 1) ? 0x111u : 0x222u; + evict_code_line(); + patchers[g](want); + check(0xB, gaps[g], want, 1); + } + } + + /* Phase C: tight alternating self-modify loop, gap 0, warm. */ + uart_printf("C"); + for (int i = 0; i < 96; i++) { + uint32_t want = (i & 1) ? 0x123u : 0x456u; + patch_g0(want); + check(0xC, 0, want, 0); + } + + /* Phase D: tight alternating self-modify loop, gap 0, cold (miss each time). */ + uart_printf("D"); + for (int i = 0; i < 48; i++) { + uint32_t want = (i & 1) ? 0x0AAu : 0x055u; + evict_code_line(); + patch_g0(want); + check(0xD, 0, want, 1); + } + + if (g_fail == 0) { + uart_printf("\n<<PASS>>\n"); + } else { + uart_printf("\n<<FAIL>> (%d failures)\n", g_fail); + } + + for (;;) { + } + return 0; +} diff --git a/sw/apps/trap_s2l_fwd/Makefile b/sw/apps/trap_s2l_fwd/Makefile new file mode 100644 index 00000000..f79b59ea --- /dev/null +++ b/sw/apps/trap_s2l_fwd/Makefile @@ -0,0 +1,18 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Directed trap store->load forwarding repro. DDR-resident (cached tier). 
+override MEM_CONFIG := ddr +SRC_C := ../../lib/src/uart.c main.c +include ../../common/common.mk diff --git a/sw/apps/trap_s2l_fwd/main.c b/sw/apps/trap_s2l_fwd/main.c new file mode 100644 index 00000000..21e4e6b2 --- /dev/null +++ b/sw/apps/trap_s2l_fwd/main.c @@ -0,0 +1,158 @@ +/* + * Copyright 2026 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Deterministic repro for the boot-hang root cause: cached store->load + * visibility across the trap path. + * + * The handler increments a CACHED counter g_ctr every trap; the main loop spins + * until it observes g_ctr reach a target. If a store of g_ctr is not visible to + * a later load of g_ctr (the store->load bug), g_ctr never advances from the + * observer's view and the loop hangs -- the exact livelock signature of the + * real boot hang. A wall-clock (mtime) watchdog prints the stuck g_ctr instead + * of hanging forever, so the failure is observable. + * + * Run at hardware-realistic latency: DDR_MODEL_LATENCY>=70, CACHED_HAS_L2=0. 
+ */ + +#include <stdint.h> + +#include "csr.h" +#include "trap.h" +#include "uart.h" + +#define CLINT_MTIMECMP_LO (*(volatile uint32_t *) 0x40014000u) +#define CLINT_MTIMECMP_HI (*(volatile uint32_t *) 0x40014004u) +#define CLINT_MTIME_LO (*(volatile uint32_t *) 0x4001BFF8u) +#define CLINT_MTIME_HI (*(volatile uint32_t *) 0x4001BFFCu) + +#define TARGET 200u +#define DDR_STACK_SIZE 4096u + +volatile uint32_t g_ctr; /* cached counter, written by handler, read by main */ +volatile uint32_t g_percpu[16]; /* DDR per-cpu-like scratch (tp base) */ +static uint8_t g_ddr_stack[DDR_STACK_SIZE] __attribute__((aligned(16))); + +static inline uint64_t clint_rdmtime(void) +{ + uint32_t hi, lo, hi2; + do { + hi = CLINT_MTIME_HI; + lo = CLINT_MTIME_LO; + hi2 = CLINT_MTIME_HI; + } while (hi != hi2); + return ((uint64_t) hi << 32) | lo; +} + +static void clint_arm(uint64_t cmp) +{ + CLINT_MTIMECMP_HI = 0xFFFFFFFFu; + CLINT_MTIMECMP_LO = (uint32_t) cmp; + CLINT_MTIMECMP_HI = (uint32_t) (cmp >> 32); +} + +/* Trap handler, faithful to a real kernel handler: saves/restores the GPRs it + * uses on the (cached DDR) stack -- which IS the handle_exception store->load + * pattern. There is no explicit in-handler check or UART output: a dropped + * store->load surfaces as a bad sp (fault -> re-trap) or a stuck g_ctr. */ +/* FULLY FAITHFUL to handle_exception's kernel-trap entry: the tp/mscratch swap, + * then sw sp,8(tp); sw sp,12(tp); lw sp,8(tp) -- loading the trap-time sp back + * INTO sp via the cached scratch slot -- then GPR saves to that reloaded sp. + * If the cached store->load (lw sp,8(tp)) drops the just-stored sp, sp becomes + * garbage and the GPR saves fault -> re-trap -> hang, exactly like the kernel. 
*/ +__attribute__((naked, aligned(4))) static void ctr_entry(void) +{ + __asm__ volatile("csrrw tp, mscratch, tp\n" /* kernel: tp=0, mscratch=old tp(&g_percpu) */ + "bnez tp, 1f\n" + "csrr tp, mscratch\n" /* tp = &g_percpu */ + "sw sp, 8(tp)\n" /* *(tp+8) = sp */ + "1:\n" + "sw sp, 12(tp)\n" + "lw sp, 8(tp)\n" /* sp = *(tp+8) <-- cached store->load INTO sp */ + "addi sp, sp, -64\n" + "sw ra, 0(sp)\n" /* GPR saves to the reloaded sp (fault if sp bad) */ + "sw t0, 4(sp)\n" + "sw t1, 8(sp)\n" + "sw t2, 12(sp)\n" + /* work: g_ctr++ */ + "la t1, g_ctr\n" + "lw t2, 0(t1)\n" + "addi t2, t2, 1\n" + "sw t2, 0(t1)\n" + /* ack timer */ + "li t1, 0x40014004\n" + "li t2, -1\n" + "sw t2, 0(t1)\n" + "li t1, 0x40014000\n" + "sw t2, 0(t1)\n" + /* restore */ + "lw ra, 0(sp)\n" + "lw t0, 4(sp)\n" + "lw t1, 8(sp)\n" + "lw t2, 12(sp)\n" + "addi sp, sp, 64\n" /* sp back to trap-time value */ + "csrw mscratch, x0\n" + "mret\n"); +} + +__attribute__((noreturn, noinline, used)) void main_on_ddr_stack(void) +{ + uart_printf("\n=== faithful handle_exception sw/lw-into-sp repro ===\n"); + g_ctr = 0u; + for (int i = 0; i < 16; i++) + g_percpu[i] = 0xB6B60000u + (uint32_t) i; + /* kernel convention: tp = per-cpu ptr, mscratch = 0 */ + __asm__ volatile("mv tp, %0" : : "r"((uint32_t) &g_percpu[0]) : "memory"); + csr_write(mscratch, 0u); + set_trap_handler(&ctr_entry); + enable_timer_interrupt(); + + uint64_t deadline = clint_rdmtime() + 1500000u; + uint32_t observed = 0u; + while (g_ctr < TARGET) { + clint_arm(clint_rdmtime() + 200u); + enable_interrupts(); + for (volatile int s = 0; s < 32; s++) { + } + disable_interrupts(); + observed = g_ctr; + if (clint_rdmtime() > deadline) { + break; + } + } + + if (g_ctr >= TARGET) { + uart_printf("g_ctr=%u reached target -- store->load OK\n", g_ctr); + uart_printf("<<PASS>>\n"); + } else { + uart_printf( + "HANG: g_ctr stuck at %u (last observed %u) -- store->load broken\n", g_ctr, observed); + uart_printf("<<FAIL>>\n"); + } + for (;;) { + } +} + 
+int main(void) +{ + 
uint32_t stack_top = ((uint32_t) &g_ddr_stack[DDR_STACK_SIZE]) & ~0xFu; + __asm__ volatile("mv sp, %0\n" + "j main_on_ddr_stack\n" + : + : "r"(stack_top) + : "memory"); + __builtin_unreachable(); +} diff --git a/sw/apps/wfi_drain_mepc_test/Makefile b/sw/apps/wfi_drain_mepc_test/Makefile new file mode 100644 index 00000000..aa44fc98 --- /dev/null +++ b/sw/apps/wfi_drain_mepc_test/Makefile @@ -0,0 +1,20 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Drain-gated WFI mepc directed test. Force the whole program into cached DDR so +# the pre-WFI store is a slow cached drain (the committed entry that must still be +# draining when the timer IRQ is taken at the WFI). +override MEM_CONFIG := ddr +SRC_C := ../../lib/src/uart.c main.c +include ../../common/common.mk diff --git a/sw/apps/wfi_drain_mepc_test/main.c b/sw/apps/wfi_drain_mepc_test/main.c new file mode 100644 index 00000000..2419eb79 --- /dev/null +++ b/sw/apps/wfi_drain_mepc_test/main.c @@ -0,0 +1,167 @@ +/* + * Copyright 2026 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Directed test for the *drain-gated* WFI mepc spec deviation. + * + * wfi_mepc_test covers the simple case (timer IRQ at a WFI with an empty ROB -> + * mepc must be the post-WFI PC). This test targets a narrower, analysis-derived + * window: a machine-timer interrupt that becomes eligible while a WFI is at the + * ROB head AND a committed CACHED (DDR) store is still draining. + * + * Mechanism under test (cpu_ooo.sv interrupt_resume_pc / trap_unit.sv take_trap + * gated on sq_committed_empty / the *registered* trap_mret_commit_hold_q): when + * the store drain finishes, take_trap fires combinationally that cycle while + * commit_hold still lags one cycle, so the WFI is flushed before it commits and + * mepc is saved as the WFI's own PC instead of wfi_pc+4. RISC-V priv spec: an + * interrupt taken at WFI resumes at the *following* instruction (mepc=wfi_pc+4). + * + * Construction: DDR-resident (MEM_CONFIG=ddr); immediately before the WFI, store + * to a FRESH cold DDR cache line (a different line each margin, in a region the + * program never otherwise touches) so the store reliably misses and drains the + * full DDR latency -- regardless of L1 write policy / warmth. Sweep the timer + * margin so the IRQ lands at every offset across that drain window. + * + * Robustness fixes vs the first cut: + * - mscratch (the handler's fixed continuation) is armed BEFORE interrupts are + * enabled, inside the asm, so a tiny margin cannot take the trap with a stale + * mscratch and crash. Enable/disable MIE is done in-asm around the WFI. 
+ * - The handler is register-preserving and resumes via mscratch (never the + * recorded mepc), so a wrong mepc is detected, not fatal. + * + * PASS iff no margin ever produces mepc==wfi_pc. Run at DDR_MODEL_LATENCY>=70. + */ + +#include <stdint.h> + +#include "trap.h" +#include "uart.h" + +#define MARGIN_MIN 0u +#define MARGIN_MAX 200u + +/* Cold DDR region the program never otherwise touches (well inside the 64 MiB + * model, far from the app's own code/data/stack). Each margin stores to its own + * 64 B line here, so every pre-WFI store is a cold miss -> full DDR-latency drain. */ +#define DRAIN_BASE 0x82000000u +#define DRAIN_LINE 64u + +static volatile uint32_t g_mepc; /* mepc the trap saved, last fire */ +static volatile uint32_t g_taken; /* running count of timer traps taken */ + +/* + * Naked M-mode timer handler. Register-preserving (saves/restores t0,t1 on the + * current stack) so the WFI/resume addresses the caller holds in registers across + * the WFI are not corrupted. Records the saved mepc, counts the trap, disarms the + * timer (mtimecmp_hi := -1) so it cannot refire, then resumes at the fixed + * continuation in mscratch -- never at the recorded mepc, so a wrong mepc cannot + * send us back into the WFI and hang. 
+ */ +__attribute__((naked, aligned(4))) static void wfi_drain_trap_handler(void) +{ + __asm__ volatile("addi sp, sp, -16\n" + "sw t0, 0(sp)\n" + "sw t1, 4(sp)\n" + "csrr t0, mepc\n" + "lui t1, %hi(g_mepc)\n" + "sw t0, %lo(g_mepc)(t1)\n" + "lui t1, %hi(g_taken)\n" + "lw t0, %lo(g_taken)(t1)\n" + "addi t0, t0, 1\n" + "sw t0, %lo(g_taken)(t1)\n" + "li t1, 0x4000001C\n" /* MTIMECMP_HI: disarm */ + "li t0, -1\n" + "sw t0, 0(t1)\n" + "csrr t0, mscratch\n" /* fixed continuation after the WFI */ + "csrw mepc, t0\n" + "lw t0, 0(sp)\n" + "lw t1, 4(sp)\n" + "addi sp, sp, 16\n" + "mret\n"); +} + +int main(void) +{ + uint32_t bug = 0, correct = 0, early = 0, nofire = 0; + uint32_t bug_margin = 0, bug_mepc = 0, bug_wfi = 0; + + uart_printf("\n=== drain-gated WFI mepc test ===\n"); + set_trap_handler(&wfi_drain_trap_handler); + enable_timer_interrupt(); + + for (uint32_t margin = MARGIN_MIN; margin <= MARGIN_MAX; margin++) { + volatile uint32_t *sink = (volatile uint32_t *) (DRAIN_BASE + margin * DRAIN_LINE); + uint32_t wfi_addr = 0; + uint32_t resume_addr = 0; + uint32_t before = g_taken; + + g_mepc = 0; + set_timer_cmp(rdmtime() + margin); /* armed; MIE still 0 until the asm */ + + /* + * Arm mscratch (handler continuation) BEFORE enabling interrupts, then + * enable MIE in-asm; capture the WFI/resume PCs; issue one cold-miss DDR + * store IMMEDIATELY before the WFI (the committed entry that must still be + * draining when the IRQ is taken); WFI; then disable MIE. The handler + * bounces us to label 2 regardless of mepc. 
+ */ + __asm__ volatile("la %[res], 2f\n" + "csrw mscratch, %[res]\n" + "csrsi mstatus, 8\n" /* enable MIE (interrupts) after mscratch is valid */ + "la %[wfi], 1f\n" + "sw %[res], 0(%[sink])\n" + "1:\n" + "wfi\n" + "2:\n" + "csrci mstatus, 8\n" /* disable MIE */ + : [res] "=&r"(resume_addr), [wfi] "=&r"(wfi_addr) + : [sink] "r"(sink) + : "memory"); + + if (g_taken == before) { + nofire++; + continue; + } + if (g_mepc == wfi_addr) { + bug++; + bug_margin = margin; + bug_mepc = g_mepc; + bug_wfi = wfi_addr; + } else if (g_mepc == resume_addr) { + correct++; + } else { + early++; + } + } + + disable_timer_interrupt(); + disable_interrupts(); + + uart_printf("sweep: bug=%u correct=%u early=%u nofire=%u\n", bug, correct, early, nofire); + if (bug) { + uart_printf("drain-gated WFI saved mepc==wfi_pc: margin=%u mepc=%08x wfi=%08x\n", + bug_margin, + bug_mepc, + bug_wfi); + uart_printf("<<FAIL>>\n"); + } else { + uart_printf("<<PASS>>\n"); + } + + for (;;) { + } + return 0; +} diff --git a/sw/apps/wfi_lost_tick/Makefile b/sw/apps/wfi_lost_tick/Makefile new file mode 100644 index 00000000..32d1f147 --- /dev/null +++ b/sw/apps/wfi_lost_tick/Makefile @@ -0,0 +1,19 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Copyright 2026 Two Sigma Open Source, LLC +# SPDX-License-Identifier: Apache-2.0 +# WFI-idle lost-machine-timer-tick directed test (deferred-eligibility MIE edge) +SRC_C := main.c +include ../../common/common.mk diff --git a/sw/apps/wfi_lost_tick/main.c b/sw/apps/wfi_lost_tick/main.c new file mode 100644 index 00000000..22abbe19 --- /dev/null +++ b/sw/apps/wfi_lost_tick/main.c @@ -0,0 +1,146 @@ +/* + * Copyright 2026 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * WFI-idle lost-machine-timer-tick directed test. + * + * Reproduce target: the residual flaky HANG booting no-MMU Linux on Genesys2. + * After fixing the MRET-drain deadlock the boot is STILL ~50% flaky, hanging at + * VARYING points after the first timer activity with no panic -- the signature + * of a LOST machine-timer tick -> frozen jiffies. The machine-timer trap is + * occasionally NOT TAKEN and timekeeping stops. + * + * This faithfully mirrors the kernel's idle + CLINT-timer flow, which the + * existing mtimer_stress (no WFI, MIE always 1) and linux_irq_ddr_test miss: + * - idle loop: csrci mstatus,8 (MIE:=0); fence; wfi; csrsi mstatus,8 (MIE:=1). + * The whole kernel is M-mode, so the machine-timer trap is eligible ONLY + * when mstatus.MIE=1 -- it is DEFERRED from the WFI-wake (raw mtip level) to + * the later csrsi MIE 0->1 edge. 
+ * - handler = the CLINT pattern: csr_clear mie.MTIE on entry (clint_timer_ + * interrupt), then csr_set mie.MTIE + write a fresh future mtimecmp + * (clint_clock_next_event), then MRET (restores MIE from MPIE). + * The re-arm period is phase-swept (mtime + 24..87 per tick) so the deadline + * crossing lands at every cycle offset around the wfi / csrsi / MRET-recovery + * window across thousands of ticks. + * + * Invariant: each idle iteration arms exactly one future deadline and must take + * exactly one trap, so g_jiffies must equal the iteration count. If any trap is + * dropped (and especially if mie.MTIE sticks low so timekeeping freezes), + * g_jiffies falls behind -> <<FAIL>>. If every tick is taken -> <<PASS>>. + */ + +#include <stdint.h> + +#include "trap.h" + +#define MIE_MTIE_BIT 0x80u /* mie.MTIE = bit 7 */ +#define ITERS 3000u + +volatile uint32_t g_jiffies; /* incremented once per timer trap (the "tick") */ + +static void uart_putc(char c) +{ + UART_TX = (uint8_t) c; +} +static void uart_puts(const char *s) +{ + while (*s) + uart_putc(*s++); +} +static void uart_hex(uint32_t v) +{ + static const char hex[] = "0123456789ABCDEF"; + uart_puts("0x"); + for (int i = 28; i >= 0; i -= 4) + uart_putc(hex[(v >> i) & 0xF]); +} + +/* + * Naked M-mode timer handler mirroring the CLINT driver: + * clint_timer_interrupt: csr_clear(mie, MTIE) [mask on entry] + * clint_clock_next_event: csr_set(mie, MTIE); write mtimecmp [re-arm] + * then MRET (MIE restored from MPIE). The phase-sweep period = 24 + (jiffies&63).
+ */ +__attribute__((naked, aligned(4))) static void clint_like_handler(void) +{ + __asm__ volatile("addi sp, sp, -16\n" + "sw t0, 0(sp)\n" + "sw t1, 4(sp)\n" + "sw t2, 8(sp)\n" + "li t0, 0x80\n" /* mie.MTIE */ + "csrrc x0, mie, t0\n" /* csr_clear(mie, MTIE) -- handler entry */ + "lui t0, %hi(g_jiffies)\n" + "lw t1, %lo(g_jiffies)(t0)\n" + "addi t1, t1, 1\n" + "sw t1, %lo(g_jiffies)(t0)\n" /* g_jiffies++ (the tick) */ + "andi t2, t1, 0x3f\n" + "addi t2, t2, 24\n" /* period = 24 + (jiffies & 63): phase sweep */ + "li t0, 0x80\n" + "csrrs x0, mie, t0\n" /* csr_set(mie, MTIE) -- re-arm enable */ + "li t0, 0x40000010\n" /* MTIME_LO */ + "lw t1, 0(t0)\n" + "add t1, t1, t2\n" + "li t0, 0x40000018\n" /* MTIMECMP_LO (HI stays 0, set in main) */ + "sw t1, 0(t0)\n" /* write fresh future deadline -> mtip low */ + "lw t0, 0(sp)\n" + "lw t1, 4(sp)\n" + "lw t2, 8(sp)\n" + "addi sp, sp, 16\n" + "mret\n"); +} + +int main(void) +{ + uart_puts("\r\n=== WFI-idle lost-timer-tick test ===\r\n"); + set_trap_handler(&clint_like_handler); + + /* Arm the first deadline, enable the machine timer, then run the idle loop. */ + MTIMECMP_HI = 0; + MTIMECMP_LO = (uint32_t) rdmtime() + 40; + enable_timer_interrupt(); /* mie.MTIE = 1 */ + enable_interrupts(); /* mstatus.MIE = 1 (idle loop toggles it) */ + + /* Kernel idle pattern: MIE off, WFI (wake on raw mtip), MIE on (deferred + * timer trap taken here). Exactly one tick must be taken per iteration. */ + for (uint32_t i = 0; i < ITERS; i++) { + __asm__ volatile("csrci mstatus, 8\n" /* mstatus.MIE = 0 */ + "fence\n" + "wfi\n" + "csrsi mstatus, 8\n" /* mstatus.MIE = 1 -> take deferred timer */ + :: + : "memory"); + } + + disable_timer_interrupt(); + uint32_t jiffies = g_jiffies; + uart_puts("iters="); + uart_hex(ITERS); + uart_puts(" jiffies="); + uart_hex(jiffies); + uart_puts("\r\n"); + + /* Every WFI-wake must produce exactly one tick. A shortfall means a + * machine-timer trap was dropped (lost tick / frozen timekeeping).
+ */ + if (jiffies + 4u >= ITERS) { + uart_puts("<<PASS>>\r\n"); + } else { + uart_puts("[FAIL] lost timer tick(s): jiffies fell behind idle iterations\r\n"); + uart_puts("<<FAIL>>\r\n"); + } + for (;;) { + } + return 0; +} diff --git a/tests/test_run_cocotb.py b/tests/test_run_cocotb.py index f6c3a4c5..7be4f8f2 100755 --- a/tests/test_run_cocotb.py +++ b/tests/test_run_cocotb.py @@ -151,6 +151,12 @@ class CocotbRunConfig: app_name="ddr_smc_test", description="Self-modifying code test (stores + fence.i + execute, full sync chain)", ), + "smc_fencei_test": CocotbRunConfig( + python_test_module="cocotb_tests.test_real_program", + hdl_toplevel_module="frost", + app_name="smc_fencei_test", + description="Hardened SMC/fence.i reproducer (gap sweep, warm/cold L1D, write-miss, tight loop)", + ), "ddr_heap_test": CocotbRunConfig( python_test_module="cocotb_tests.test_real_program", hdl_toplevel_module="frost", @@ -181,6 +187,22 @@ class CocotbRunConfig: app_name="wfi_mepc_test", description="Timer-interrupt-at-WFI mepc directed test (empty-ROB interrupt resume PC)", ), + "wfi_drain_mepc_test": CocotbRunConfig( + python_test_module="cocotb_tests.test_real_program", + hdl_toplevel_module="frost", + app_name="wfi_drain_mepc_test", + description="Drain-gated WFI mepc directed test (timer IRQ at WFI with a draining DDR store)", + ), + "drain_trapframe_test": CocotbRunConfig( + python_test_module="cocotb_tests.test_real_program", + hdl_toplevel_module="frost", + app_name="drain_trapframe_test", + description="Trap-frame store-visibility under L1D eviction (Bug B relocated to pt_regs s2)", + # Genesys2-faithful shape: no L2 (L1 -> DDR direct, where a cold write-back + # actually drains) + high DDR latency, so the save-store / eviction race is + # not masked. The default (L2 on, latency 30) gives a false PASS.
+ verilator_extra_args=("-GCACHED_HAS_L2=0", "-GDDR_MODEL_LATENCY=70"), + ), "mret_timer_resume_test": CocotbRunConfig( python_test_module="cocotb_tests.test_real_program", hdl_toplevel_module="frost", @@ -193,6 +215,28 @@ class CocotbRunConfig: app_name="mtimer_stress", description="M-mode machine-timer + MRET deadlock stress (phase-swept; flaky-hang repro)", ), + "mret_drain_deadlock": CocotbRunConfig( + python_test_module="cocotb_tests.test_real_program", + hdl_toplevel_module="frost", + app_name="mret_drain_deadlock", + description="MRET-vs-draining-cached-store deadlock (one-shot o_mret_start; deterministic hang repro)", + # Genesys2 cache shape (L1 -> DDR direct), where the bug manifests on + # hardware and where a cold cached-store write-back actually drains in sim + # (the L2-enabled shape leaves the cold tier undrained, masking the race). + verilator_extra_args=("-GCACHED_HAS_L2=0",), + ), + "wfi_lost_tick": CocotbRunConfig( + python_test_module="cocotb_tests.test_real_program", + hdl_toplevel_module="frost", + app_name="wfi_lost_tick", + description="WFI-idle + MIE-toggle + CLINT-rearm lost-timer-tick repro (deferred-eligibility; frozen jiffies)", + ), + "irq_mie_window": CocotbRunConfig( + python_test_module="cocotb_tests.test_real_program", + hdl_toplevel_module="frost", + app_name="irq_mie_window", + description="Short-MIE-window lost-interrupt repro (registered interrupt_pending erased by adjacent MIE clear)", + ), "ns16550_test": CocotbRunConfig( python_test_module="cocotb_tests.test_real_program", hdl_toplevel_module="frost", @@ -212,6 +256,16 @@ class CocotbRunConfig: description="No-MMU Linux boot (kernel Image in DDR)", include_in_pytest=False, ), + # Same boot image, but 128 KiB L1I (the genesys2 HW config the handoff says + # wedges at SLUB). Pair with CACHED_HAS_L2=0 to match genesys2. Debug only. 
+ "linux_boot_128k": CocotbRunConfig( + python_test_module="cocotb_tests.test_real_program", + hdl_toplevel_module="frost", + app_name="linux_boot", + description="No-MMU Linux boot with 128 KiB L1I (genesys2 wedge-repro config)", + include_in_pytest=False, + verilator_extra_args=("-GL1I_CACHE_BYTES=131072", "-GCACHED_HAS_L2=0"), + ), "linux_irq_ddr_test": CocotbRunConfig( python_test_module="cocotb_tests.test_real_program", hdl_toplevel_module="frost", @@ -224,6 +278,18 @@ class CocotbRunConfig: app_name="linux_irq_active_ddr_test", description="Linux-like active-code machine-timer IRQ path with DDR call/return traffic", ), + "linux_clksrc_faithful": CocotbRunConfig( + python_test_module="cocotb_tests.test_real_program", + hdl_toplevel_module="frost", + app_name="linux_clksrc_faithful", + description="Faithful Linux clocksource-switch: enable-MTIE-then-arm, re-arming handler, bare-wfi idle, concurrent DDR", + ), + "trap_s2l_fwd": CocotbRunConfig( + python_test_module="cocotb_tests.test_real_program", + hdl_toplevel_module="frost", + app_name="trap_s2l_fwd", + description="handle_exception-pattern trap store->load forwarding repro (sw sp,8(tp); lw ,8(tp))", + ), "linux_irq_stack_slot_test": CocotbRunConfig( python_test_module="cocotb_tests.test_real_program", hdl_toplevel_module="frost", @@ -243,6 +309,13 @@ class CocotbRunConfig: description="RV32-A atomics to the cached DDR region (LR/SC, AMO)", include_in_pytest=True, ), + "pde_return_hazard": CocotbRunConfig( + python_test_module="cocotb_tests.test_real_program", + hdl_toplevel_module="frost", + app_name="pde_return_hazard", + description="pde_subdir_find epilogue return-value hazard reproducer", + verilator_extra_args=("-GCACHED_HAS_L2=0",), + ), "freertos_demo": CocotbRunConfig( python_test_module="cocotb_tests.test_real_program", hdl_toplevel_module="frost", @@ -371,6 +444,19 @@ class CocotbRunConfig: description="RAS call/return stress under randomized fetch-latency fuzz", 
verilator_extra_args=("-GFETCH_VALID_FUZZ=1",), ), + "fetch_stall_repro": CocotbRunConfig( + python_test_module="cocotb_tests.test_real_program", + hdl_toplevel_module="frost", + app_name="fetch_stall_repro", + description="Directed 32-bit-insn PC+2 mis-step repro (no fuzz; sanity = PASS)", + ), + "fetch_stall_repro_128k": CocotbRunConfig( + python_test_module="cocotb_tests.test_real_program", + hdl_toplevel_module="frost", + app_name="fetch_stall_repro", + description="Directed PC+2 mis-step repro, cached .ddr_text, 128KiB L1I (genesys2)", + verilator_extra_args=("-GL1I_CACHE_BYTES=131072",), + ), # Tomasulo unit tests "reorder_buffer": CocotbRunConfig( python_test_module="cocotb_tests.tomasulo.reorder_buffer.test_reorder_buffer", @@ -608,6 +694,17 @@ class CocotbRunConfig: description="Tomasulo wrapper tests with CPU production split-RS dispatch", verilator_extra_args=("-GSPLIT_RS_DISPATCH=1",), ), + # Directed machine-mode trap/interrupt tests run on the cpu_tb harness + # (one instruction fed per ready cycle into the cpu_ooo core). CLI-only + # entry so the cpu_tb directed-trap suite is invokable through the mandated + # runner; filter to a single function with --testcase. Not auto-collected by + # pytest (no app_name; include_in_pytest=False). 
+ "directed_traps": CocotbRunConfig( + python_test_module="cocotb_tests.test_directed_traps", + hdl_toplevel_module="cpu_tb", + description="Directed M-mode trap/interrupt tests (cpu_tb directed suite)", + include_in_pytest=False, + ), } # List of real program test names (excludes 'cpu' which uses different toplevel) diff --git a/verif/cocotb_tests/control/test_trap_unit.py b/verif/cocotb_tests/control/test_trap_unit.py index e653e8e1..feb26146 100644 --- a/verif/cocotb_tests/control/test_trap_unit.py +++ b/verif/cocotb_tests/control/test_trap_unit.py @@ -1,3 +1,17 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Unit tests for trap_unit interrupt/MRET arbitration.""" from typing import Any @@ -44,6 +58,7 @@ async def _reset(dut: Any) -> None: @cocotb.test() async def test_mret_defers_registered_timer_interrupt(dut: Any) -> None: + """Verify that a pending timer interrupt is deferred while MRET is in flight.""" cocotb.start_soon(Clock(dut.i_clk, 10, unit="ns").start()) await _reset(dut) @@ -75,11 +90,19 @@ async def test_mret_defers_registered_timer_interrupt(dut: Any) -> None: await RisingEdge(dut.i_clk) await Timer(1, unit="ns") - assert int(dut.o_trap_taken.value) == 0 + # Once the MRET-recovery inhibit lifts, the still-live machine timer -- HELD + # across the inhibit rather than force-cleared -- is taken at the first + # eligible boundary (U-mode here, where a machine interrupt preempts regardless + # of MIE). 
Holding a live source avoids LOSING a real timer tick; the 0x80388bba + # panic stays guarded by cpu_ooo's interrupt_resume_pc seed on mret_taken, not + # by this latch (commit 718f8cc). + assert int(dut.o_trap_taken.value) == 1 + assert int(dut.o_trap_cause.value) == 0x80000007 @cocotb.test() async def test_timer_interrupt_still_traps_without_mret(dut: Any) -> None: + """Verify that a latched timer interrupt is taken immediately when no MRET is in flight.""" cocotb.start_soon(Clock(dut.i_clk, 10, unit="ns").start()) await _reset(dut) @@ -101,6 +124,7 @@ async def test_timer_interrupt_still_traps_without_mret(dut: Any) -> None: @cocotb.test() async def test_registered_interrupt_requires_current_mie(dut: Any) -> None: + """Verify that a held interrupt is only taken when current MIE is asserted.""" cocotb.start_soon(Clock(dut.i_clk, 10, unit="ns").start()) await _reset(dut) @@ -124,14 +148,19 @@ async def test_registered_interrupt_requires_current_mie(dut: Any) -> None: await Timer(1, unit="ns") assert int(dut.o_trap_taken.value) == 0 - # Once MIE is restored, the still-asserted timer interrupt is sampled again - # and should trap on the registered path. + # Once MIE is restored, the timer interrupt was HELD across the MIE-low window + # (not erased), so it is eligible and taken IMMEDIATELY on the restore cycle -- + # one cycle earlier than the old clear-then-re-latch path, which could LOSE the + # tick if MIE never stayed high long enough (the no-MMU boot lost-tick hang). It + # still requires CURRENT MIE to be taken (eligible gates on live + # m_int_globally_enabled), so the name still holds. dut.i_mstatus.value = MSTATUS_MIE dut.i_mstatus_mie_direct.value = 1 await Timer(1, unit="ns") - assert int(dut.o_trap_taken.value) == 0 + assert int(dut.o_trap_taken.value) == 1 + assert int(dut.o_trap_cause.value) == 0x80000007 + # Cleared on take (trap_taken_prev gates re-entry); does not re-fire next cycle. 
await RisingEdge(dut.i_clk) await Timer(1, unit="ns") - assert int(dut.o_trap_taken.value) == 1 - assert int(dut.o_trap_cause.value) == 0x80000007 + assert int(dut.o_trap_taken.value) == 0 diff --git a/verif/cocotb_tests/if_stage/test_if_stage.py b/verif/cocotb_tests/if_stage/test_if_stage.py index 256ed7be..808f1f5d 100644 --- a/verif/cocotb_tests/if_stage/test_if_stage.py +++ b/verif/cocotb_tests/if_stage/test_if_stage.py @@ -768,3 +768,108 @@ async def test_fetch_invalid_compressed_pair_resume(dut: Any) -> None: effective=COMPRESSED_HINT, compressed=True, ) + + +@cocotb.test() +async def test_pd_redirect_stall_32bit_target_no_plus2_desync(dut: Any) -> None: + """PD-redirect+BTB-collision+stall must not advance pc_reg by +2 onto a 32-bit instruction. + + Same race as test_pd_redirect_with_stall_kills_registered_prediction_handoff + but the wrong-ADVANCE (+2) variant rather than wrong-TARGET: on genesys2 the + HW lands pc_reg 2 bytes into a 32-bit insn (epc=0x8038d7fa, mid sw zero,4(s1)) + at workqueue_init_early -> illegal-instruction Oops. Drive a 32-bit stream at + the PD target; every dispatched PC must be 4-byte aligned. 
+ """ + await _setup_test(dut) + dut.i_disable_branch_prediction.value = 0 + + branch_pc = BASE_PC + 8 + stale_pred_target = 0x80005000 + pd_target = 0x80006000 + + _drive_from_ex( + dut, + { + "btb_update": True, + "btb_update_pc": branch_pc, + "btb_update_target": stale_pred_target, + "btb_update_taken": True, + "btb_update_compressed": False, + "btb_update_requires_pc_reg_handoff": True, + }, + ) + await _advance_cycle(dut) + _drive_from_ex(dut, {}) + + await _redirect_to(dut, BASE_PC) + prediction_cycle_found = False + for _ in range(20): + if int(dut.branch_prediction_controller_inst.o_prediction_used.value): + prediction_cycle_found = True + break + await _advance_cycle(dut) + assert prediction_cycle_found, "BTB prediction never fired; test misconfigured" + + dut.i_pd_redirect.value = 1 + dut.i_pd_redirect_target.value = pd_target + await _advance_cycle(dut) + dut.i_pd_redirect.value = 0 + dut.i_pd_redirect_target.value = 0 + + _drive_pipeline_ctrl(dut, {"stall": True}) + await _advance_cycle(dut) + for _ in range(3): + _drive_pipeline_ctrl(dut, {"stall": True, "stall_registered": True}) + await _advance_cycle(dut) + _drive_pipeline_ctrl(dut, {}) + + bad: list[int] = [] + for _ in range(8): + _drive_fetch(dut, current_word=ADD_INSTR_A, next_word=ADD_INSTR_B) + await _settle() + packet = _read_if_packet(dut) + if not packet["sel_nop"]: + pc = packet["program_counter"] + if pc & 0x2: + bad.append(pc) + await _advance_cycle(dut) + assert not bad, ( + "pc_reg landed mid-32-bit-instruction (+2 desync) after PD-redirect+stall: " + f"{[hex(x) for x in bad]}" + ) + + +@cocotb.test() +async def test_fetch_window_lead_parity_plus2_desync(dut: Any) -> None: + """Fetch window leading pc_reg by one word (F=W+1) -> is_compressed_fast reads word(W+2)'s size bit. + + If that word's low parcel predecodes compressed, a + word-aligned 32-bit insn at pc_reg advances +2 (mid-instruction). 
This is the + workqueue_init_early HW Oops shape (epc 2 bytes into a word-aligned 32-bit sw). + fetch_word_swapped = i_instr_bank_sel_r ^ pc_reg[2] is a 1-bit parity that + cannot represent F=W+1 (instruction_aligner.sv:141-147,235-240). + """ + await _setup_test(dut) + await _redirect_to( + dut, BASE_PC + ) # pc_reg -> 0x80001000 (bit1=0, bit2=0); 32-bit insn here + + _drive_fetch( + dut, + current_word=ADD_INSTR_A, # i_instr[31:0] + next_word=0x00000004, # i_instr[63:32] = word(W+2); lo parcel 0x0004 -> "compressed" + current_sb=_sideband(), # 32-bit at pc_reg + next_sb=_sideband(compressed_lo=True, compressed_hi=False), + bank_sel=1, # = ~pc_reg[2]; models served window one word AHEAD (F=W+1) + ) + await _settle() + assert int(_read_if_packet(dut)["program_counter"]) == BASE_PC + + await _advance_cycle(dut) + _drive_fetch(dut, current_word=ADD_INSTR_B, next_word=ADD_INSTR_C, bank_sel=1) + await _settle() + pc2 = int(_read_if_packet(dut)["program_counter"]) + assert (pc2 & 0x2) == 0, ( + f"pc_reg landed mid-32-bit-instruction at {pc2:#x} " + "(F=W+1 fetch-window-lead parity hole; is_compressed_fast read the wrong word)" + ) diff --git a/verif/cocotb_tests/test_directed_traps.py b/verif/cocotb_tests/test_directed_traps.py index a87f1f0a..22af1f48 100644 --- a/verif/cocotb_tests/test_directed_traps.py +++ b/verif/cocotb_tests/test_directed_traps.py @@ -1329,3 +1329,431 @@ async def run_directed_illegal_instruction_test( async def test_directed_illegal_instruction(dut: Any) -> None: """Directed test for illegal instruction trapping (mcause=2).""" await run_directed_illegal_instruction_test(dut) + + +# ============================================================================ +# Directed Test for Precise-Interrupt / Commit Race (mepc off-by-one detector) +# ============================================================================ +# +# Confirmed-by-RTL bug under test: +# When an async machine-timer interrupt is recognized in the SAME cycle an +# ordinary 
instruction commits, precise state is mis-handled. +# * commit_en (reorder_buffer.sv) is gated only by the REGISTERED +# trap_mret_commit_hold_q (cpu_ooo.sv), which for an async interrupt stays +# low (it tracks trap_pending/mret/drain, none of which an async timer IRQ +# asserts). So a normal commit can fire in the cycle o_trap_taken asserts. +# * interrupt_resume_pc (cpu_ooo.sv), the source of mepc for async +# interrupts, is updated from the COMBINATIONAL rob_commit_valid_raw, so a +# commit in the trap cycle advances it to that instruction's next-PC. +# * The registered ROB commit (reorder_buffer.sv o_commit.valid) and the +# regfile write (commit_actions.sv) are NOT gated by the coincident flush / +# trap, so the racing instruction's architectural write still lands. +# Net effect: mepc and the set of architecturally-retired instructions can +# disagree by one -- a precise-state violation (on Linux this surfaced as a +# lost callee-saved restore, s2 = 0x19999998). +# +# Detector (prefix invariant): at trap entry the architectural regfile must +# reflect EXACTLY the instructions with PC < mepc -- every such instruction's +# destination register holds its marker, and no instruction with PC >= mepc has +# its marker visible. This sweeps the interrupt fire-cycle across a stream of +# distinct register-writing ALU ops in a SINGLE simulation and flags any offset +# where the invariant breaks. +# +# Regfile note: the architectural integer regfile is a multi-write distributed +# RAM (generic_regfile -> mwp_dist_ram) with a per-address live-value table, so +# a register's committed value is g_banks[lvt[r]].u_bank.ram[r] (read port 0). + + +async def run_directed_interrupt_commit_race_test( + dut: Any, config: TestConfig | None = None, mode: str = "alu" +) -> None: + """Sweep an async timer interrupt cycle-by-cycle over a register-writing stream. + + Assert the trap-entry precise-state prefix invariant. 
+ + mode="alu": stream is `addi xK, x0, marker` (result produced in EX). + mode="load": stream is `lw xK, off(x4)` (result produced via the load + queue / data memory) -- mirrors the Linux symptom, which was a + lost callee-saved *load* restore (s2 = 0x19999998). + """ + from encoders.op_tables import I_ALU, CSRS, LOADS + from encoders.instruction_encode import CSRAddress + + if config is None: + config = TestConfig(num_loops=100) + + # ---- parameters -------------------------------------------------------- + nop = 0x00000013 + base_reg = 5 # stream writes x5..x{4+n_stream} + n_stream = 27 # x5..x31 + warmup = 6 + gap = 4 # NOPs between serialized CSR writes + obs = 56 # stream + observation cycles per offset + post_trap = 8 # cycles to keep observing after o_trap_taken + fire_lo, fire_hi = 0, 40 + mem_base = 0x400 # byte base of the load region (x4); BRAM, non-cached + word_base = mem_base >> 2 + + enc_addi = I_ALU["addi"][0] + enc_slli = I_ALU["slli"][0] + enc_csrrw = CSRS["csrrw"] + enc_lw = LOADS["lw"][0] + + # Iteration-unique expected destination value: distinct per (stream index, + # generation) and never 0, so a leftover value from a prior sweep iteration + # can never masquerade as a commit in the current one (neither the regfile + # RAM nor the data BRAM is reset between runs). + def expected_val(i: int, gen: int) -> int: + if mode == "load": + # 32-bit memory word loaded into the dest register. 
+ return (0x19990000 | ((gen & 0xFF) << 8) | (i & 0xFF)) & MASK32 + return 0x40 + gen * 48 + i # 12-bit addi immediate (<= 1914) + + def stream_instr(c: int, gen: int) -> int: + if mode == "load": + return enc_lw(base_reg + c, 4, c * 4) # lw x{5+c}, (c*4)(x4) + return enc_addi(base_reg + c, 0, expected_val(c, gen)) + + dut_if = DUTInterface(dut) + clk = dut_if.clock + d = dut.device_under_test + + def ri(handle: Any) -> int | None: + try: + return int(handle.value) + except Exception: + return None + + # Read port 0 of the architectural integer regfile (multi-write banked RAM). + def _read_port0() -> Any: + return d.ooo_register_files_inst.regfile_inst.gen_read_port[ + 0 + ].gen_multi_write.read_port_ram + + def read_reg(r: int) -> int | None: + if r == 0: + return 0 + try: + rp = _read_port0() + sel = int(rp.lvt[r].value) + return int(rp.g_banks[sel].u_bank.ram[r].value) & MASK32 + except Exception as e: # pragma: no cover - surfaced as a clear failure + raise AssertionError( + f"regfile read path failed for x{r}: {e}. " + f"Expected ooo_register_files_inst.regfile_inst." + f"gen_read_port[0].gen_multi_write.read_port_ram.{{lvt,g_banks[*].u_bank.ram}}" + ) from e + + # one clock for the whole sweep + cocotb.start_soon(Clock(clk, config.clock_period_ns, unit="ns").start()) + + async def feed(instr: int) -> None: + await FallingEdge(clk) + await dut_if.wait_ready() + dut_if.instruction = instr + await RisingEdge(clk) + + gen_counter = {"g": 0} + + async def setup_phase() -> int: + """Reset and rebuild mtvec/mie/mstatus via fed instructions. + + Enables the machine-timer interrupt; i_interrupts remains 0 so nothing fires yet. + """ + gen = gen_counter["g"] + gen_counter["g"] += 1 + dut.i_interrupts_reg.value = 0 + dut_if.instruction = nop + await dut_if.reset_dut(config.reset_cycles) + for _ in range(6): + await feed(nop) + # Preload the load region with this generation's expected values (the + # data BRAM persists across reset, so refresh it every iteration). 
+ if mode == "load": + for i in range(n_stream): + dut.data_memory_for_simulation.memory[ + word_base + i + ].value = expected_val(i, gen) + # Construct CSR operands (no deposits needed): x1=mtvec(0x1000), + # x2=mie.MTIE(0x80), x3=mstatus.MIE(0x08), x4=load base. + await feed(enc_addi(1, 0, 1)) # x1 = 1 + await feed(enc_slli(1, 1, 12)) # x1 = 0x1000 + await feed(enc_addi(2, 0, 0x80)) # x2 = MTIE + await feed(enc_addi(3, 0, 0x08)) # x3 = MIE + await feed(enc_addi(4, 0, mem_base)) # x4 = load base address + for _ in range(warmup): + await feed(nop) + await feed(enc_csrrw(0, CSRAddress.MTVEC, 1)) + for _ in range(gap): + await feed(nop) + await feed(enc_csrrw(0, CSRAddress.MIE, 2)) + for _ in range(gap): + await feed(nop) + await feed(enc_csrrw(0, CSRAddress.MSTATUS, 3)) + for _ in range(gap): + await feed(nop) + return gen + + async def calibrate() -> list[int]: + """Run the stream with no interrupt to learn each stream instruction's PC. + + Captures PCs from regfile write ports and confirms a clean run commits + every marker in order. 
+ """ + gen = await setup_phase() + reg_pc: dict[int, int] = {} + for c in range(obs): + await FallingEdge(clk) + await dut_if.wait_ready() + dut_if.instruction = stream_instr(c, gen) if c < n_stream else nop + await RisingEdge(clk) + we0, a0, pc0 = ( + ri(d.dbg_port0_int_we), + ri(d.dbg_port0_int_addr), + ri(d.dbg_rob_commit_reg_pc), + ) + we1, a1, pc1 = ( + ri(d.dbg_port1_int_we), + ri(d.dbg_port1_int_addr), + ri(d.dbg_rob_commit_2_reg_pc), + ) + if ( + we0 + and a0 is not None + and pc0 is not None + and base_reg <= a0 < base_reg + n_stream + ): + reg_pc.setdefault(a0, pc0) + if ( + we1 + and a1 is not None + and pc1 is not None + and base_reg <= a1 < base_reg + n_stream + ): + reg_pc.setdefault(a1, pc1) + missing = [ + base_reg + i for i in range(n_stream) if (base_reg + i) not in reg_pc + ] + assert not missing, f"calibration missed regfile writes for {missing}: {reg_pc}" + stream_pcs = [reg_pc[base_reg + i] for i in range(n_stream)] + for i in range(1, n_stream): + assert ( + stream_pcs[i] == stream_pcs[0] + 4 * i + ), f"stream PCs not contiguous: {[hex(p) for p in stream_pcs]}" + for i in range(n_stream): + v = read_reg(base_reg + i) + assert v == expected_val(i, gen), ( + f"clean-run marker mismatch x{base_reg + i}: " + f"got {v:#x} want {expected_val(i, gen):#x}" + ) + cocotb.log.info( + f"Calibrated stream PCs x{base_reg}..x{base_reg + n_stream - 1}: " + f"{stream_pcs[0]:#x}..{stream_pcs[-1]:#x} (step 4); clean run committed " + f"all {n_stream} markers." + ) + return stream_pcs + + async def run_offset(fire_offset: int, stream_pcs: list[int]) -> dict[str, Any]: + gen = await setup_phase() + trap_c: int | None = None + racer: dict[str, Any] | None = None + resume_at_trap: int | None = None + last_mepc: int | None = None + for c in range(obs): + await FallingEdge(clk) + await dut_if.wait_ready() + # Stop injecting new stream writes once the trap is taken so the + # post-trap handler (NOPs) cannot perturb the x5..x31 snapshot. 
+ if c < n_stream and trap_c is None: + dut_if.instruction = stream_instr(c, gen) + else: + dut_if.instruction = nop + # Cycle-exact injection: assert mtip for the cycle ending at this edge. + if c == fire_offset: + dut.i_interrupts_reg.value = 0b010 + await RisingEdge(clk) + ttr = ri(d.dbg_trap_taken_raw) + mepc = ri(d.csr_file_inst.mepc) + if mepc is not None: + last_mepc = mepc + if trap_c is None and ttr == 1: + trap_c = c + resume_at_trap = ri(d.dbg_interrupt_resume_pc) + racer = dict( + valid=ri(d.dbg_commit_valid), + pc=ri(d.dbg_commit_pc), + dest_valid=ri(d.dbg_commit_dest_valid), + dest_reg=ri(d.dbg_commit_dest_reg), + value=ri(d.dbg_commit_value), + c2_valid=ri(d.dbg_commit_2_valid), + c2_pc=ri(d.dbg_commit_2_pc), + ) + if trap_c is not None and c >= trap_c + post_trap: + break + mepc_final = ri(d.csr_file_inst.mepc) + if mepc_final is None: + mepc_final = last_mepc + regs = {base_reg + i: read_reg(base_reg + i) for i in range(n_stream)} + dut.i_interrupts_reg.value = 0 + return dict( + fire_offset=fire_offset, + gen=gen, + trap_c=trap_c, + mepc=mepc_final, + resume_at_trap=resume_at_trap, + racer=racer, + regs=regs, + ) + + def analyze(res: dict[str, Any], stream_pcs: list[int]) -> dict[str, Any]: + gen = res["gen"] + mepc = res["mepc"] + regs = res["regs"] + committed = [ + regs[base_reg + i] == expected_val(i, gen) for i in range(n_stream) + ] + ncommit = sum(committed) + longest_prefix = 0 + while longest_prefix < n_stream and committed[longest_prefix]: + longest_prefix += 1 + lost: list[int] = [] + leaked: list[int] = [] + r: int | None = None + no_trap = res["trap_c"] is None + if mepc is not None and not no_trap: + # Expected #committed stream instrs == those with PC < mepc. 
+ r = sum(1 for pc in stream_pcs if pc < mepc) + for i in range(n_stream): + if stream_pcs[i] < mepc and not committed[i]: + lost.append(i) # mepc skipped it, but its write is missing + elif stream_pcs[i] >= mepc and committed[i]: + leaked.append(i) # committed though mepc resumes at/before it + violation = bool(lost or leaked) and not no_trap + return dict( + committed=committed, + ncommit=ncommit, + longest_prefix=longest_prefix, + R=r, + lost=lost, + leaked=leaked, + violation=violation, + no_trap=no_trap, + ) + + # ---- sweep ------------------------------------------------------------- + cocotb.log.info(f"=== Precise-interrupt sweep: stream mode={mode} ===") + cocotb.log.info("=== Calibrating clean stream PCs (no interrupt) ===") + stream_pcs = await calibrate() + + cocotb.log.info("=== Sweeping interrupt fire-cycle (single simulation) ===") + results: list[dict[str, Any]] = [] + for fire_offset in range(fire_lo, fire_hi): + res = await run_offset(fire_offset, stream_pcs) + an = analyze(res, stream_pcs) + res["an"] = an + results.append(res) + + def _h(x: int | None) -> str: + return "None" if x is None else f"0x{x:08x}" + + racer = res["racer"] or {} + tag = ( + " <<< VIOLATION" + if an["violation"] + else (" (no trap)" if an["no_trap"] else "") + ) + cocotb.log.info( + f"offset={fire_offset:2d} trap_c={res['trap_c']} mepc={_h(res['mepc'])} " + f"resume@trap={_h(res['resume_at_trap'])} R={an['R']} " + f"committed={an['ncommit']} prefix={an['longest_prefix']} " + f"lost={an['lost']} leaked={an['leaked']} " + f"racer[pc={_h(racer.get('pc'))} x{racer.get('dest_reg')}={_h(racer.get('value'))} " + f"v={racer.get('valid')}]{tag}" + ) + + violations = [r for r in results if r["an"]["violation"]] + + # ---- detailed evidence for each violation ------------------------------ + for r in violations[:8]: + an = r["an"] + gen = r["gen"] + fo = r["fire_offset"] + cocotb.log.error( + f"--- VIOLATION fire_offset={fo} mepc=0x{r['mepc']:08x} " + f"resume_pc@trap=" + 
f"{f'0x{r['resume_at_trap']:08x}' if r['resume_at_trap'] is not None else None} ---" + ) + for i in an["lost"]: + reg = base_reg + i + cocotb.log.error( + f" LOST x{reg} (stream #{i}, pc=0x{stream_pcs[i]:08x} < mepc): " + f"expected marker 0x{expected_val(i, gen):08x}, regfile=0x{r['regs'][reg]:08x} " + f"-- mepc advanced past this instruction but its write is missing" + ) + for i in an["leaked"]: + reg = base_reg + i + cocotb.log.error( + f" LEAK x{reg} (stream #{i}, pc=0x{stream_pcs[i]:08x} >= mepc): " + f"regfile=0x{r['regs'][reg]:08x} == marker 0x{expected_val(i, gen):08x} " + f"-- committed although mepc resumes at/before it (re-execution)" + ) + rc = r["racer"] + if rc and rc.get("valid"): + cocotb.log.error( + f" trap-cycle committer: pc=0x{rc['pc']:08x} " + f"x{rc['dest_reg']}<=0x{(rc['value'] or 0):08x} -- this combinational " + f"commit advanced interrupt_resume_pc in the o_trap_taken cycle" + ) + + # ---- per-offset mepc table (visibility, incl. negative results) -------- + cocotb.log.info("=== Per-offset mepc / commit summary ===") + for r in results: + an = r["an"] + cocotb.log.info( + f" offset={r['fire_offset']:2d} " + f"mepc={f'0x{r['mepc']:08x}' if r['mepc'] is not None else None} " + f"committed={an['ncommit']} prefix={an['longest_prefix']} " + f"violation={an['violation']}" + ) + + n_trapped = sum(1 for r in results if not r["an"]["no_trap"]) + cocotb.log.info( + f"Swept {len(results)} offsets ({n_trapped} took the trap); " + f"{len(violations)} violated the prefix invariant." + ) + + assert not violations, ( + f"PRECISE-INTERRUPT BUG REPRODUCED (mode={mode}): {len(violations)}/{len(results)} " + f"interrupt fire-offsets violate the trap-entry prefix invariant (architectural " + f"regfile != instructions with PC < mepc). First failing " + f"offset={violations[0]['fire_offset']}, mepc=0x{violations[0]['mepc']:08x}, " + f"lost={violations[0]['an']['lost']}, leaked={violations[0]['an']['leaked']}. 
" + f"See per-offset log above for the exact lost/leaked register (expected vs " + f"actual value) and the trap-cycle committer that advanced interrupt_resume_pc." + ) + cocotb.log.info( + f"=== mode={mode}: no violations across all fire offsets; " + f"trap-entry prefix invariant holds. ===" + ) + + +@cocotb.test() +async def test_directed_interrupt_commit_race(dut: Any) -> None: + """Deterministic precise-interrupt repro (ALU stream): sweep an async M-timer interrupt. + + Sweep cycle-by-cycle across a register-writing ALU stream and check that, + at trap entry, the architectural regfile reflects exactly the instructions + with PC < mepc (precise-state prefix invariant). + """ + await run_directed_interrupt_commit_race_test(dut, mode="alu") + + +@cocotb.test() +async def test_directed_interrupt_commit_race_loads(dut: Any) -> None: + """Deterministic precise-interrupt repro (LOAD stream): same cycle-exact interrupt sweep. + + The stream is `lw` instructions whose results come from + the load queue / data memory -- mirroring the Linux symptom (a lost + callee-saved load restore, s2 = 0x19999998). + """ + await run_directed_interrupt_commit_race_test(dut, mode="load") diff --git a/verif/cocotb_tests/test_helpers.py b/verif/cocotb_tests/test_helpers.py index 8b737667..1248d214 100644 --- a/verif/cocotb_tests/test_helpers.py +++ b/verif/cocotb_tests/test_helpers.py @@ -228,12 +228,39 @@ def _get_regfile_ram(self, ram_index: int = 0) -> Any: path = self.paths.regfile_ram_rs2_path return self._navigate_signal_path(path) + # Number of read ports on the architectural integer register file + # (generic_regfile in the current cpu_ooo). + _INT_RF_READ_PORTS = 8 + + def _int_regfile_inst(self) -> Any | None: + """Return the architectural integer register-file instance for the cpu_ooo DUT. + + Returns None when the hierarchy does not expose it (other toplevels). 
+ """ + try: + return self.dut.device_under_test.ooo_register_files_inst.regfile_inst + except Exception: + return None + + def _int_read_port_ram(self, regfile_inst: Any, port: int) -> tuple[Any, bool]: + """Return (read_port_ram_handle, is_multi_write) for one read port. + + generic_regfile gives each read port its own RAM: a multi-write banked + RAM (mwp_dist_ram, under gen_multi_write) when there are 2+ write ports, + otherwise a single-write sdp_dist_ram (under gen_single_write). + """ + rp = regfile_inst.gen_read_port[port] + try: + return rp.gen_multi_write.read_port_ram, True + except Exception: + return rp.gen_single_write.read_port_ram, False + def read_register(self, reg: int, ram_index: int = 0) -> int: - """Read register value from hardware. + """Read an architectural integer register value from hardware. Args: reg: Register index (0-31) - ram_index: Which RAM instance to read from (0=rs1, 1=rs2) + ram_index: Legacy RAM-instance selector (only used by the fallback) Returns: Register value @@ -241,23 +268,50 @@ def read_register(self, reg: int, ram_index: int = 0) -> int: HardwareAssertions.assert_register_valid(reg) if reg == 0: return 0 + regfile_inst = self._int_regfile_inst() + if regfile_inst is not None: + ram, multi = self._int_read_port_ram(regfile_inst, 0) + if multi: + # Committed value = the bank chosen by the live-value table. + sel = int(ram.lvt[reg].value) + return int(ram.g_banks[sel].u_bank.ram[reg].value) + return int(ram.ram[reg].value) + # Fallback: legacy flat regfile RAM via the configured signal path. ram = self._get_regfile_ram(ram_index) return int(ram[reg].value) def write_register(self, reg: int, value: int) -> None: - """Write register value to hardware (both RAM instances). + """Deposit an architectural integer register value into hardware. 
Args: reg: Register index (0-31) value: Value to write """ HardwareAssertions.assert_register_valid(reg) - if reg > 0: # x0 is always zero - # Write to both RAM instances for consistency - ram_rs1 = self._get_regfile_ram(0) - ram_rs2 = self._get_regfile_ram(1) - ram_rs1[reg].value = value - ram_rs2[reg].value = value + if reg == 0: # x0 is always zero + return + regfile_inst = self._int_regfile_inst() + if regfile_inst is not None: + # Deposit into every read port (and, for the banked RAM, both banks + # with the live-value table cleared) so all dispatch read ports and + # the snapshot read return the deposited value. + for port in range(self._INT_RF_READ_PORTS): + try: + ram, multi = self._int_read_port_ram(regfile_inst, port) + except Exception: + break + if multi: + ram.g_banks[0].u_bank.ram[reg].value = value + ram.g_banks[1].u_bank.ram[reg].value = value + ram.lvt[reg].value = 0 + else: + ram.ram[reg].value = value + return + # Fallback: legacy flat regfile RAM (rs1 + rs2 instances). + ram_rs1 = self._get_regfile_ram(0) + ram_rs2 = self._get_regfile_ram(1) + ram_rs1[reg].value = value + ram_rs2[reg].value = value def initialize_registers(self, seed_value: int | None = None) -> list[int]: """Initialize all registers randomly and return the values.""" diff --git a/verif/cocotb_tests/test_real_program.py b/verif/cocotb_tests/test_real_program.py index e1d3c4c8..cc3cf857 100644 --- a/verif/cocotb_tests/test_real_program.py +++ b/verif/cocotb_tests/test_real_program.py @@ -32,7 +32,7 @@ from collections import Counter import cocotb from cocotb.clock import Clock -from cocotb.triggers import RisingEdge, Timer +from cocotb.triggers import FallingEdge, RisingEdge, Timer from typing import Any CLK_PERIOD_NS = 3 @@ -301,18 +301,16 @@ def _compute_bit_cycles(self) -> int: """Match uart_rx.sv prescaler math to compute cycles per bit. 
uart_rx uses CLK_FREQ_HZ/4 (since it runs on clk_div4) and computes: - ClockCyclesPerBit = (CLK_FREQ_HZ/4) / (BAUD_RATE * DATA_WIDTH) - Then it waits ClockCyclesPerBit * DATA_WIDTH cycles per bit. + ClockCyclesPerBit = (CLK_FREQ_HZ/4) / BAUD_RATE. """ clk_freq = _read_u64(getattr(self.dut, "CLK_FREQ_HZ", None)) if clk_freq is None: clk_freq = UART_CLK_FREQ_HZ_DEFAULT uart_clk_freq = clk_freq // 4 - base = uart_clk_freq // (UART_BAUD_RATE * UART_DATA_BITS) - bit_cycles = base * UART_DATA_BITS + bit_cycles = uart_clk_freq // UART_BAUD_RATE cocotb.log.info( f"UartRxDriver: clk_freq={clk_freq}, uart_clk_freq={uart_clk_freq}, " - f"base={base}, bit_cycles={bit_cycles}" + f"bit_cycles={bit_cycles}" ) return max(1, bit_cycles) @@ -321,20 +319,26 @@ async def _wait_cycles(self, cycles: int) -> None: for _ in range(cycles): await RisingEdge(self.dut.i_clk_div4) + async def _wait_bit_edges(self, cycles: int) -> None: + """Wait bit-time cycles using the non-sampling edge for UART transitions.""" + for _ in range(cycles): + await FallingEdge(self.dut.i_clk_div4) + async def send_byte(self, value: int) -> None: """Send a single byte over UART RX (LSB first).""" + await FallingEdge(self.dut.i_clk_div4) # Start bit self.dut.i_uart_rx.value = 0 - await self._wait_cycles(self.bit_cycles) + await self._wait_bit_edges(self.bit_cycles) # Data bits for bit in range(UART_DATA_BITS): self.dut.i_uart_rx.value = (value >> bit) & 0x1 - await self._wait_cycles(self.bit_cycles) + await self._wait_bit_edges(self.bit_cycles) # Stop bit self.dut.i_uart_rx.value = 1 - await self._wait_cycles(self.bit_cycles) + await self._wait_bit_edges(self.bit_cycles) - async def send(self, data: bytes) -> None: + async def send(self, data: bytes, inter_byte_cycles: int = 0) -> None: """Send a byte string over UART RX.""" # Ensure line is idle for multiple bit times before starting # This gives the receiver time to sync after any glitches @@ -342,8 +346,8 @@ async def send(self, data: bytes) -> None: await 
self._wait_cycles(self.bit_cycles * 4) for byte in data: await self.send_byte(byte) - # Extra idle time between characters for receiver to process - await self._wait_cycles(self.bit_cycles) + if inter_byte_cycles > 0: + await self._wait_cycles(inter_byte_cycles) async def wait_for_uart_text( @@ -651,6 +655,10 @@ def get_expected_behavior() -> tuple[str | None, str | None, bool, str | None]: # Just needs to print the first hello message return (None, "Hello, world!", False, app_name) if app_name == "linux_boot": + if os.environ.get("FROST_LINUX_RUN_FULL") == "1": + # Diagnostic: never matches -> run to COCOTB_LINUX_MAX_CYCLES + # capturing all UART, to observe post-banner behavior. + return ("<<__never_matches__>>", None, True, app_name) # Passes once the kernel reaches its boot banner. (Interim # bring-up criterion; tighten to a userspace/shell marker # once no-MMU Linux boots that far.) @@ -714,7 +722,9 @@ async def run_until_complete( external_irq_enabled = bool(external_irq_symbol) external_irq_offset = int(os.environ.get("FROST_EXTERNAL_IRQ_OFFSET", "0"), 0) external_irq_max_pulses = int(os.environ.get("FROST_EXTERNAL_IRQ_MAX_PULSES", "1")) - external_irq_hold_cycles = int(os.environ.get("FROST_EXTERNAL_IRQ_HOLD_CYCLES", "1")) + external_irq_hold_cycles = int( + os.environ.get("FROST_EXTERNAL_IRQ_HOLD_CYCLES", "1") + ) retire_sig = None pc_sig = None pc_vld_sig = None @@ -903,13 +913,11 @@ async def run_until_complete( rob_commit0_reg_dest_valid_sig = None rob_commit0_reg_dest_rf_sig = None rob_commit0_reg_dest_reg_sig = None - rob_commit0_reg_value_sig = None rob_commit1_reg_valid_sig = None rob_commit1_reg_pc_sig = None rob_commit1_reg_dest_valid_sig = None rob_commit1_reg_dest_rf_sig = None rob_commit1_reg_dest_reg_sig = None - rob_commit1_reg_value_sig = None coremark_cf_debug_enabled = ( is_coremark_like and os.environ.get("FROST_COREMARK_CF_DEBUG") == "1" ) @@ -1724,9 +1732,6 @@ async def run_until_complete( rob_commit0_reg_dest_reg_sig = _get_signal( 
dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_dest_reg" ) - rob_commit0_reg_value_sig = _get_signal( - dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_value" - ) rob_commit1_reg_valid_sig = _get_signal( dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_valid" ) @@ -1742,9 +1747,6 @@ async def run_until_complete( rob_commit1_reg_dest_reg_sig = _get_signal( dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_dest_reg" ) - rob_commit1_reg_value_sig = _get_signal( - dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_value" - ) flush_pipeline_live_sig = _get_signal( dut, "cpu_and_memory_subsystem.cpu_inst.flush_pipeline" ) @@ -1930,9 +1932,6 @@ async def run_until_complete( rob_commit0_reg_dest_reg_sig = _get_signal( dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_dest_reg" ) - rob_commit0_reg_value_sig = _get_signal( - dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_reg_value" - ) rob_commit1_reg_valid_sig = _get_signal( dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_valid" ) @@ -1948,9 +1947,6 @@ async def run_until_complete( rob_commit1_reg_dest_reg_sig = _get_signal( dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_dest_reg" ) - rob_commit1_reg_value_sig = _get_signal( - dut, "cpu_and_memory_subsystem.cpu_inst.dbg_rob_commit_2_reg_value" - ) retired_pc_hist: Counter[int] = Counter() retired_mispredicts = 0 @@ -2108,7 +2104,9 @@ def dump_coremark_retire_trace() -> None: if external_irq_active: if external_irq_hold_remaining > 0: external_irq_hold_remaining -= 1 - if trap_taken_live_sig is not None and bool(_read_bool(trap_taken_live_sig)): + if trap_taken_live_sig is not None and bool( + _read_bool(trap_taken_live_sig) + ): external_irq_hold_remaining = 0 if external_irq_hold_remaining == 0: dut.i_external_interrupt.value = 0 @@ -2125,7 +2123,11 @@ def dump_coremark_retire_trace() -> None: retire_pc = _read_int(retire_pc_sig) lo, hi = external_irq_range trigger_pc = lo + 
external_irq_offset - if retire_valid and retire_pc is not None and trigger_pc <= retire_pc < hi: + if ( + retire_valid + and retire_pc is not None + and trigger_pc <= retire_pc < hi + ): dut.i_external_interrupt.value = 1 external_irq_active = True external_irq_hold_remaining = max(1, external_irq_hold_cycles) @@ -2147,7 +2149,14 @@ def dump_coremark_retire_trace() -> None: if irq_precision_check: raw_x2_events = [] - for valid_sig, pc_sig, dest_valid_sig, dest_rf_sig, dest_reg_sig, value_sig in ( + for ( + valid_sig, + pc_sig, + dest_valid_sig, + dest_rf_sig, + dest_reg_sig, + value_sig, + ) in ( ( commit_valid_live_sig, commit_pc_live_sig, @@ -2257,7 +2266,9 @@ def dump_coremark_retire_trace() -> None: current_x2_commit_pc is not None and callee_lo <= current_x2_commit_pc < callee_hi ) - stale_sp_body = callee_lo + 4 <= trap_pc < callee_hi and not x2_from_callee + stale_sp_body = ( + callee_lo + 4 <= trap_pc < callee_hi and not x2_from_callee + ) if trap and is_irq: event = ( @@ -2329,8 +2340,18 @@ def dump_coremark_retire_trace() -> None: ) for we_sig, addr_sig, data_sig, pc_sig in ( - (port0_int_we_sig, port0_int_addr_sig, port0_int_data_sig, rob_commit0_reg_pc_sig), - (port1_int_we_sig, port1_int_addr_sig, port1_int_data_sig, rob_commit1_reg_pc_sig), + ( + port0_int_we_sig, + port0_int_addr_sig, + port0_int_data_sig, + rob_commit0_reg_pc_sig, + ), + ( + port1_int_we_sig, + port1_int_addr_sig, + port1_int_data_sig, + rob_commit1_reg_pc_sig, + ), ): if bool(_read_bool(we_sig)) and _read_int(addr_sig) == 2: last_x2_commit = _read_int(data_sig) @@ -3049,6 +3070,14 @@ def dump_coremark_retire_trace() -> None: f"lq_mem_outstanding={lq_mem_outstanding}" f"{cf_debug_suffix}" ) + cocotb.log.info( + f"Run {run_number} CLINT/serial: cycle={cycle + 1} " + f"mtime=0x{(_read_u64(_get_signal(dut, 'cpu_and_memory_subsystem.mtime')) or 0):016x} " + f"mtimecmp=0x{(_read_u64(_get_signal(dut, 'cpu_and_memory_subsystem.mtimecmp')) or 0):016x} " + 
f"mtip={_read_bool(_get_signal(dut, 'cpu_and_memory_subsystem.mtip_registered'))} " + f"priv={_read_int(_get_signal(dut, 'cpu_and_memory_subsystem.cpu_inst.csr_priv'))} " + f"mstatus=0x{(_read_int(_get_signal(dut, 'cpu_and_memory_subsystem.cpu_inst.csr_mstatus')) or 0):08x}" + ) last_progress_retired = retired_count last_progress_mispredicts = retired_mispredicts diff --git a/verif/cocotb_tests/tomasulo/load_queue/test_load_queue.py b/verif/cocotb_tests/tomasulo/load_queue/test_load_queue.py index e0f9784a..93920605 100644 --- a/verif/cocotb_tests/tomasulo/load_queue/test_load_queue.py +++ b/verif/cocotb_tests/tomasulo/load_queue/test_load_queue.py @@ -40,6 +40,7 @@ CLOCK_PERIOD_NS = 10 LQ_DEPTH = 8 +AMO_RESCUE_THRESHOLD = 16384 async def setup(dut: Any) -> tuple[LQInterface, LQModel]: @@ -1916,6 +1917,159 @@ async def test_amo_waits_for_rob_head_and_sq_committed_empty(dut: Any) -> None: assert mem_req["en"], "AMO should issue when at ROB head and sq_committed_empty" +# ============================================================================ +# Test 35b: ROB-head AMO rescue from physical older-AMO block +# ============================================================================ +@cocotb.test() +async def test_blocked_head_amo_rescues_when_issue_would_idle(dut: Any) -> None: + """A physically blocked ROB-head AMO issues when no normal candidate exists.""" + dut_if, model = await setup(dut) + + from .lq_interface import AMOSWAP_W + + # Physical order: younger pending AMO, then the true ROB-head AMO. The + # older-AMO prefix is physical-order based, so the head AMO is masked unless + # the idle rescue path re-adds it. 
+ dut_if.drive_alloc(rob_tag=1, size=MEM_SIZE_WORD, is_amo=True, amo_op=AMOSWAP_W) + model.alloc(1, False, MEM_SIZE_WORD, False, is_amo=True, amo_op=AMOSWAP_W) + await dut_if.step() + dut_if.clear_alloc() + + dut_if.drive_alloc(rob_tag=0, size=MEM_SIZE_WORD, is_amo=True, amo_op=AMOSWAP_W) + model.alloc(0, False, MEM_SIZE_WORD, False, is_amo=True, amo_op=AMOSWAP_W) + await dut_if.step() + dut_if.clear_alloc() + + dut_if.drive_addr_update(rob_tag=1, address=0x9000, amo_rs2=0x11) + model.addr_update(1, 0x9000, amo_rs2=0x11) + await dut_if.step() + dut_if.clear_addr_update() + + dut_if.drive_addr_update(rob_tag=0, address=0x9004, amo_rs2=0x22) + model.addr_update(0, 0x9004, amo_rs2=0x22) + await dut_if.step() + dut_if.clear_addr_update() + + dut_if.drive_rob_head_tag(0) + dut_if.drive_sq_empty(True) + dut_if.drive_sq_committed_empty(True) + + mem_req = await wait_for_mem_request(dut_if, max_cycles=AMO_RESCUE_THRESHOLD + 8) + assert mem_req["en"], "Blocked ROB-head AMO should be rescued" + assert ( + mem_req["addr"] == 0x9004 + ), f"Expected rescued head AMO addr=0x9004, got 0x{mem_req['addr']:x}" + + +# ============================================================================ +# Test 35c: ROB-head AMO rescue stays dormant while normal progress exists +# ============================================================================ +@cocotb.test() +async def test_blocked_head_amo_does_not_preempt_normal_candidate(dut: Any) -> None: + """A blocked ROB-head AMO does not jump ahead of a normal eligible load.""" + dut_if, model = await setup(dut) + + from .lq_interface import AMOSWAP_W + + # Physical order: normal younger load, younger pending AMO, ROB-head AMO. + # The head AMO is physically blocked, but the load is a normal candidate, so + # the rescue path must stay dormant and preserve speculative load progress. 
+ dut_if.drive_alloc(rob_tag=2, size=MEM_SIZE_WORD) + model.alloc(2, False, MEM_SIZE_WORD, False) + await dut_if.step() + dut_if.clear_alloc() + + dut_if.drive_alloc(rob_tag=1, size=MEM_SIZE_WORD, is_amo=True, amo_op=AMOSWAP_W) + model.alloc(1, False, MEM_SIZE_WORD, False, is_amo=True, amo_op=AMOSWAP_W) + await dut_if.step() + dut_if.clear_alloc() + + dut_if.drive_alloc(rob_tag=0, size=MEM_SIZE_WORD, is_amo=True, amo_op=AMOSWAP_W) + model.alloc(0, False, MEM_SIZE_WORD, False, is_amo=True, amo_op=AMOSWAP_W) + await dut_if.step() + dut_if.clear_alloc() + + dut_if.drive_addr_update(rob_tag=2, address=0xA000) + model.addr_update(2, 0xA000) + await dut_if.step() + dut_if.clear_addr_update() + + dut_if.drive_addr_update(rob_tag=1, address=0xA004, amo_rs2=0x11) + model.addr_update(1, 0xA004, amo_rs2=0x11) + await dut_if.step() + dut_if.clear_addr_update() + + dut_if.drive_addr_update(rob_tag=0, address=0xA008, amo_rs2=0x22) + model.addr_update(0, 0xA008, amo_rs2=0x22) + await dut_if.step() + dut_if.clear_addr_update() + + dut_if.drive_rob_head_tag(0) + dut_if.drive_sq_empty(True) + dut_if.drive_sq_committed_empty(True) + + mem_req = await wait_for_mem_request(dut_if, max_cycles=8) + assert mem_req["en"], "Normal load candidate should still issue" + assert ( + mem_req["addr"] == 0xA000 + ), f"Expected normal load addr=0xA000, got 0x{mem_req['addr']:x}" + + +# ============================================================================ +# Test 35d: ROB-head AMO idle rescue must not replace busy SQ-check +# ============================================================================ +@cocotb.test() +async def test_blocked_head_amo_does_not_replace_busy_sq_check(dut: Any) -> None: + """Idle rescue stays off while a younger load is already in SQ-check.""" + dut_if, model = await setup(dut) + + from .lq_interface import AMOSWAP_W + + dut_if.drive_rob_head_tag(0) + dut_if.drive_sq_empty(False) + dut_if.drive_sq_all_older_known(False) + dut_if.drive_sq_forward(match=False, 
can_forward=False) + dut_if.drive_sq_committed_empty(True) + + await alloc_and_addr(dut_if, model, rob_tag=2, address=0xB000) + + sq_check = await wait_for_sq_check(dut_if, max_cycles=4) + assert sq_check["valid"], "Younger load should occupy SQ-check" + assert sq_check["rob_tag"] == 2 + + # Physical order after the staged load: younger pending AMO, then the true + # ROB-head AMO. The head AMO is eligible but physically blocked by the + # younger AMO. The idle rescue must not evict the existing SQ-check entry. + dut_if.drive_alloc(rob_tag=1, size=MEM_SIZE_WORD, is_amo=True, amo_op=AMOSWAP_W) + model.alloc(1, False, MEM_SIZE_WORD, False, is_amo=True, amo_op=AMOSWAP_W) + await dut_if.step() + dut_if.clear_alloc() + + dut_if.drive_alloc(rob_tag=0, size=MEM_SIZE_WORD, is_amo=True, amo_op=AMOSWAP_W) + model.alloc(0, False, MEM_SIZE_WORD, False, is_amo=True, amo_op=AMOSWAP_W) + await dut_if.step() + dut_if.clear_alloc() + + dut_if.drive_addr_update(rob_tag=1, address=0xB004, amo_rs2=0x11) + model.addr_update(1, 0xB004, amo_rs2=0x11) + await dut_if.step() + dut_if.clear_addr_update() + + dut_if.drive_addr_update(rob_tag=0, address=0xB008, amo_rs2=0x22) + model.addr_update(0, 0xB008, amo_rs2=0x22) + await dut_if.step() + dut_if.clear_addr_update() + + for _ in range(6): + await Timer(1, unit="ns") + mem_req = dut_if.read_mem_request() + assert not mem_req["en"], "Blocked head AMO must not replace busy SQ-check" + sq_check = dut_if.read_sq_check() + assert sq_check["valid"], "Original SQ-check entry should remain staged" + assert sq_check["rob_tag"] == 2 + await dut_if.step() + + # ============================================================================ # Test 36: AMO SWAP # ============================================================================ diff --git a/verif/cocotb_tests/tomasulo/tomasulo_wrapper/test_tomasulo_wrapper.py b/verif/cocotb_tests/tomasulo/tomasulo_wrapper/test_tomasulo_wrapper.py index cc3bb5a0..16e0f1f8 100644 --- 
a/verif/cocotb_tests/tomasulo/tomasulo_wrapper/test_tomasulo_wrapper.py +++ b/verif/cocotb_tests/tomasulo/tomasulo_wrapper/test_tomasulo_wrapper.py @@ -949,16 +949,16 @@ async def test_slot2_store_raw_commit_blocks_sq_committed_empty(dut: Any) -> Non await RisingEdge(dut_if.clock) await Timer(1, unit="ps") - sq_committed_empty = dut_if.sq_committed_empty + sq_committed_empty = bool(dut_if.sq_committed_empty) await FallingEdge(dut_if.clock) assert commit_1["valid"] and commit_1["tag"] == tag_1 assert commit_2["valid"] and commit_2["tag"] == tag_2 assert commit_2_valid_raw, "Slot-2 raw commit should be visible" assert commit_2_store_like_raw, "Slot-2 raw commit should be store-like" - assert not sq_committed_empty, ( - "Slot-2 raw store commit must feed SQ's same-cycle committed_empty guard" - ) + assert ( + not sq_committed_empty + ), "Slot-2 raw store commit must feed SQ's same-cycle committed_empty guard" cocotb.log.info("=== Test Passed ===") From 98ef43d2eafb6b8f8dcf38c761052d0ae8f7d241 Mon Sep 17 00:00:00 2001 From: Adam Bagley Date: Sun, 28 Jun 2026 13:01:41 -0400 Subject: [PATCH 19/43] =?UTF-8?q?sim:=20SIM=5FFAST=5FMAINT=20=E2=80=94=20f?= =?UTF-8?q?ast=20fence.i=20cache=20maintenance=20in=20simulation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sim-only parameter (default 0 = FPGA, set 1 for cocotb): completes fence.i L1 invalidate-all in O(1) (sdp_block_ram bulk-clear) and writeback-all in O(dirty) instead of O(NumLines), cutting fence.i from ~8998 to ~327 cycles (~27x). The default-0 path is byte-for-byte the original FSM; Yosys confirms the fast logic does not elaborate at param=0 (zero FPGA impact). Unblocks booting the real (un-noop'd) no-MMU Linux kernel in sim. Verified: frost_cache unit (on+off), ddr_smc_test (SMC via fence.i), isa_test all pass; verible clean. 
--- hw/rtl/cpu_and_mem/cpu_and_mem.sv | 6 +- hw/rtl/frost.sv | 5 + hw/rtl/lib/cache/frost_cache.sv | 114 +++++++++++++-- hw/rtl/lib/cache/frost_cache_hierarchy.sv | 13 +- hw/rtl/lib/cache/frost_cache_test_harness.sv | 8 +- hw/rtl/lib/ram/sdp_block_ram.sv | 27 +++- tests/Makefile | 8 +- tests/test_run_cocotb.py | 42 ++++++ verif/cocotb_tests/cache/test_fence_speed.py | 141 +++++++++++++++++++ 9 files changed, 344 insertions(+), 20 deletions(-) create mode 100644 verif/cocotb_tests/cache/test_fence_speed.py diff --git a/hw/rtl/cpu_and_mem/cpu_and_mem.sv b/hw/rtl/cpu_and_mem/cpu_and_mem.sv index b6d96b67..7d18cbd4 100644 --- a/hw/rtl/cpu_and_mem/cpu_and_mem.sv +++ b/hw/rtl/cpu_and_mem/cpu_and_mem.sv @@ -49,6 +49,9 @@ module cpu_and_mem #( parameter int unsigned L1_CACHE_BYTES = 128 * 1024, parameter int unsigned L1I_CACHE_BYTES = 16 * 1024, parameter int unsigned L2_CACHE_BYTES = 2 * 1024 * 1024, + // Simulation-only fast cache maintenance for fence.i (see frost_cache). + // 0 = FPGA (cycle-accurate maintenance FSM); non-zero = sim fast path. + parameter int unsigned SIM_FAST_MAINT = 0, // Behavioral main-memory model (simulation only; hardware integration // replaces it with the DDR controller behind the same AXI port). 
parameter int unsigned DDR_MODEL_BYTES = 64 * 1024 * 1024, @@ -675,7 +678,8 @@ module cpu_and_mem #( .HAS_L2(CACHED_HAS_L2), .L1_CACHE_BYTES(L1_CACHE_BYTES), .L1I_CACHE_BYTES(L1I_CACHE_BYTES), - .L2_CACHE_BYTES(L2_CACHE_BYTES) + .L2_CACHE_BYTES(L2_CACHE_BYTES), + .SIM_FAST_MAINT(SIM_FAST_MAINT) ) cache_hierarchy ( .i_clk(i_clk), .i_rst(i_rst), diff --git a/hw/rtl/frost.sv b/hw/rtl/frost.sv index c102081f..ffde4255 100644 --- a/hw/rtl/frost.sv +++ b/hw/rtl/frost.sv @@ -54,6 +54,10 @@ module frost #( parameter int unsigned L1_CACHE_BYTES = 128 * 1024, parameter int unsigned L1I_CACHE_BYTES = 16 * 1024, parameter int unsigned L2_CACHE_BYTES = 2 * 1024 * 1024, + // Simulation-only fast cache maintenance for fence.i: 0 = FPGA (cycle- + // accurate maintenance FSM, unchanged); non-zero = sim fast path (see + // frost_cache). Set to 1 only by the cocotb sim build, never for boards. + parameter int unsigned SIM_FAST_MAINT = 0, // Behavioral main-memory model knobs (simulation only). parameter int unsigned DDR_MODEL_BYTES = 64 * 1024 * 1024, parameter int unsigned DDR_MODEL_LATENCY = 30, @@ -195,6 +199,7 @@ module frost #( .L1_CACHE_BYTES(L1_CACHE_BYTES), .L1I_CACHE_BYTES(L1I_CACHE_BYTES), .L2_CACHE_BYTES(L2_CACHE_BYTES), + .SIM_FAST_MAINT(SIM_FAST_MAINT), .DDR_MODEL_BYTES(DDR_MODEL_BYTES), .DDR_MODEL_LATENCY(DDR_MODEL_LATENCY), .USE_BEHAVIORAL_DDR(USE_BEHAVIORAL_DDR), diff --git a/hw/rtl/lib/cache/frost_cache.sv b/hw/rtl/lib/cache/frost_cache.sv index d67bb1b0..54c5058c 100644 --- a/hw/rtl/lib/cache/frost_cache.sv +++ b/hw/rtl/lib/cache/frost_cache.sv @@ -72,7 +72,17 @@ module frost_cache #( // verilog_lint: waive explicit-parameter-storage-type parameter DATA_MEMORY_PRIMITIVE = "block", parameter int unsigned DATA_READ_LATENCY = 2, - parameter int unsigned DATA_WRITE_LATENCY = 1 + parameter int unsigned DATA_WRITE_LATENCY = 1, + // Simulation-only fast cache maintenance (fence.i). 0 = FPGA: the + // cycle-accurate maintenance FSM below is byte-for-byte unchanged. 
Non-zero + // = simulation: invalidate-all completes in a single cycle (a tag bulk + // clear) and writeback-all iterates only the dirty lines -- O(dirty) rather + // than O(NumLines) -- guided by a sim-only shadow of the dirty bits. The + // functional effect is identical to the slow path: every line is left + // invalid after invalidate-all, and every valid+dirty line is still written + // downstream and marked clean by writeback-all. Threaded in only for the + // cocotb sim build; never set for board/synthesis builds. + parameter int unsigned SIM_FAST_MAINT = 0 ) ( input logic i_clk, input logic i_rst, @@ -156,12 +166,23 @@ module frost_cache #( logic [IndexBits-1:0] flush_idx_q; logic [ TagBits-1:0] flush_tag_q; + // Fast maintenance (SIM_FAST_MAINT, simulation only). + // tag_bulk_clear: one-cycle invalidate-all of the whole tag array. + // any_dirty_*/first_dirty_*: lowest dirty line index from the sim-only dirty + // shadow, used to walk only dirty lines during writeback-all. All driven to + // constants when the feature is off, so the FPGA build carries none of it. + logic tag_bulk_clear; + logic any_dirty_full, any_dirty_excl; + logic [IndexBits-1:0] first_dirty_full, first_dirty_excl; + // Writeback-all walk states (data/tag addressing + busy). - logic flush_active; + logic flush_active; assign flush_active = (state_q == S_FLUSH_SCAN) || (state_q == S_FLUSH_CHECK) || (state_q == S_FLUSH_DATA) || (state_q == S_FLUSH_WB_REQ) || (state_q == S_FLUSH_WB_WAIT); assign o_maint_busy = flush_active || (state_q == S_SWEEP); + // Fast invalidate-all: hold the tag bulk clear for the (now one-cycle) sweep. 
+ assign tag_bulk_clear = (SIM_FAST_MAINT != 0) && (state_q == S_SWEEP); logic [ 7:0] wait_cnt_q; // data-array latency countdown (latencies are small) logic [ TagBits-1:0] victim_tag_q; logic [ LineBits-1:0] victim_line_q; @@ -197,16 +218,62 @@ module frost_cache #( sdp_block_ram #( .ADDR_WIDTH(IndexBits), - .DATA_WIDTH(TagEntryBits) + .DATA_WIDTH(TagEntryBits), + .SUPPORT_BULK_CLEAR(SIM_FAST_MAINT) ) tag_array ( .i_clk(i_clk), .i_write_enable(tag_we), + .i_bulk_clear(tag_bulk_clear), .i_write_address(tag_waddr), .i_read_address(tag_raddr), .i_write_data(tag_wdata), .o_read_data(tag_rdata) ); + // ---- Fast maintenance dirty shadow (SIM_FAST_MAINT, simulation only) ------ + // A shadow of the tag array's dirty bits, updated by the exact same writes + // that update the tag RAM, so writeback-all can jump straight to dirty lines + // instead of scanning every index. Elaborated only when the feature is on: + // FPGA/synthesis builds carry none of this logic and read the constant + // outputs below. + if (SIM_FAST_MAINT != 0) begin : gen_fast_maint + logic [NumLines-1:0] dirty_shadow_q; + always_ff @(posedge i_clk) begin + if (i_rst) dirty_shadow_q <= '0; + else if (tag_bulk_clear) dirty_shadow_q <= '0; // invalidate-all / reset + // tag_wdata = {valid, dirty, tag}: bit TagBits is the dirty bit. + else if (tag_we) dirty_shadow_q[tag_waddr] <= tag_wdata[TagBits]; + end + + // Lowest set dirty index over the whole shadow (first_dirty_full) and + // excluding the line being written back this cycle (first_dirty_excl). The + // scan is gated to the writeback-all states, so ordinary traffic never pays + // for it -- a dirty store just toggles one shadow bit above. 
+ always_comb begin + any_dirty_full = 1'b0; + first_dirty_full = '0; + any_dirty_excl = 1'b0; + first_dirty_excl = '0; + if ((state_q == S_IDLE && i_writeback_all) || flush_active) begin + for (int idx = int'(NumLines) - 1; idx >= 0; idx--) begin + if (dirty_shadow_q[idx]) begin + any_dirty_full = 1'b1; + first_dirty_full = IndexBits'(idx); + if (IndexBits'(idx) != flush_idx_q) begin + any_dirty_excl = 1'b1; + first_dirty_excl = IndexBits'(idx); + end + end + end + end + end + end else begin : gen_no_fast_maint + assign any_dirty_full = 1'b0; + assign first_dirty_full = '0; + assign any_dirty_excl = 1'b0; + assign first_dirty_excl = '0; + end + // Tag read address: the incoming request's index, sampled at the fire so // the entry is readable in S_TAG_CHECK; the walk index during the // writeback-all scan. Don't-care in every other state. @@ -273,9 +340,14 @@ module frost_cache #( unique case (state_q) S_SWEEP: begin - tag_we = 1'b1; - tag_waddr = sweep_idx_q; - tag_wdata = '0; // valid=0, dirty=0 + // FPGA: clear one tag entry per cycle. Fast (sim): the tag bulk clear + // (tag_bulk_clear -> tag_array.i_bulk_clear) zeroes every entry this + // single cycle, so no per-index write is issued here. + if (SIM_FAST_MAINT == 0) begin + tag_we = 1'b1; + tag_waddr = sweep_idx_q; + tag_wdata = '0; // valid=0, dirty=0 + end end S_TAG_CHECK: begin @@ -353,8 +425,13 @@ module frost_cache #( end else begin unique case (state_q) S_SWEEP: begin - sweep_idx_q <= sweep_idx_q + 1'b1; - if (sweep_idx_q == {IndexBits{1'b1}}) state_q <= S_IDLE; + if (SIM_FAST_MAINT != 0) begin + // Fast: tag_bulk_clear zeroed every entry this cycle -- done. 
+ state_q <= S_IDLE; + end else begin + sweep_idx_q <= sweep_idx_q + 1'b1; + if (sweep_idx_q == {IndexBits{1'b1}}) state_q <= S_IDLE; + end end S_IDLE: begin @@ -364,7 +441,9 @@ module frost_cache #( sweep_idx_q <= '0; state_q <= S_SWEEP; end else if (i_writeback_all) begin - flush_idx_q <= '0; + // Fast: jump straight to the first dirty line (O(dirty) walk). + // FPGA: start the full index walk at 0. + flush_idx_q <= (SIM_FAST_MAINT != 0) ? first_dirty_full : '0; state_q <= S_FLUSH_SCAN; end else if (up_req_fire) begin req_write_q <= i_up_req_write; @@ -446,6 +525,10 @@ module frost_cache #( wait_cnt_q <= 8'(DATA_READ_LATENCY); flush_tag_q <= tag_rdata_tag; state_q <= S_FLUSH_DATA; + end else if (SIM_FAST_MAINT != 0) begin + // Fast: a non-dirty line is only reached when the shadow is empty + // (no dirty lines to start with), so the writeback-all is done. + state_q <= S_IDLE; end else if (flush_idx_q == {IndexBits{1'b1}}) begin state_q <= S_IDLE; end else begin @@ -466,7 +549,18 @@ module frost_cache #( S_FLUSH_WB_WAIT: begin if (i_down_resp_valid) begin - if (flush_idx_q == {IndexBits{1'b1}}) begin + // This line's dirty bit is cleared this cycle (combinational tag + // write above), and the sim-only shadow mirrors that clear. + if (SIM_FAST_MAINT != 0) begin + // Fast: jump to the next still-dirty line (excluding this one); + // when none remain the writeback-all is complete. 
+ if (any_dirty_excl) begin + flush_idx_q <= first_dirty_excl; + state_q <= S_FLUSH_SCAN; + end else begin + state_q <= S_IDLE; + end + end else if (flush_idx_q == {IndexBits{1'b1}}) begin state_q <= S_IDLE; end else begin flush_idx_q <= flush_idx_q + 1'b1; diff --git a/hw/rtl/lib/cache/frost_cache_hierarchy.sv b/hw/rtl/lib/cache/frost_cache_hierarchy.sv index bee1d18a..46f40584 100644 --- a/hw/rtl/lib/cache/frost_cache_hierarchy.sv +++ b/hw/rtl/lib/cache/frost_cache_hierarchy.sv @@ -48,7 +48,12 @@ module frost_cache_hierarchy #( parameter int unsigned L1I_DATA_READ_LATENCY = 2, parameter int unsigned L2_CACHE_BYTES = 2 * 1024 * 1024, parameter int unsigned L2_DATA_READ_LATENCY = 6, - parameter int unsigned L2_DATA_WRITE_LATENCY = 2 + parameter int unsigned L2_DATA_WRITE_LATENCY = 2, + // Simulation-only fast cache maintenance for fence.i (see frost_cache). + // 0 = FPGA cycle-accurate FSM; non-zero = sim fast path. Applied to the two + // L1s -- the only caches that run fence.i maintenance; the L2 sits below the + // arbiter and needs none, so it keeps the default. 
+ parameter int unsigned SIM_FAST_MAINT = 0 ) ( input logic i_clk, input logic i_rst, @@ -134,7 +139,8 @@ module frost_cache_hierarchy #( .LINE_BYTES(LINE_BYTES), .DATA_MEMORY_PRIMITIVE("block"), .DATA_READ_LATENCY(L1_DATA_READ_LATENCY), - .DATA_WRITE_LATENCY(L1_DATA_WRITE_LATENCY) + .DATA_WRITE_LATENCY(L1_DATA_WRITE_LATENCY), + .SIM_FAST_MAINT(SIM_FAST_MAINT) ) l1_cache ( .i_clk(i_clk), .i_rst(i_rst), @@ -164,7 +170,8 @@ module frost_cache_hierarchy #( .CACHE_SIZE_BYTES(L1I_CACHE_BYTES), .LINE_BYTES(LINE_BYTES), .DATA_MEMORY_PRIMITIVE("block"), - .DATA_READ_LATENCY(L1I_DATA_READ_LATENCY) + .DATA_READ_LATENCY(L1I_DATA_READ_LATENCY), + .SIM_FAST_MAINT(SIM_FAST_MAINT) ) l1i_cache ( .i_clk(i_clk), .i_rst(i_rst), diff --git a/hw/rtl/lib/cache/frost_cache_test_harness.sv b/hw/rtl/lib/cache/frost_cache_test_harness.sv index 96973797..ae189128 100644 --- a/hw/rtl/lib/cache/frost_cache_test_harness.sv +++ b/hw/rtl/lib/cache/frost_cache_test_harness.sv @@ -36,7 +36,10 @@ module frost_cache_test_harness #( parameter int unsigned L2_DATA_WRITE_LATENCY = 2, parameter logic [31:0] BASE_ADDR = 32'h8000_0000, parameter int unsigned MEM_BYTES = 4 * 1024 * 1024, - parameter int unsigned MEM_LATENCY = 12 + parameter int unsigned MEM_LATENCY = 12, + // Simulation-only fast cache maintenance for fence.i (see frost_cache). The + // cocotb cache registry runs this bench with it both off (default) and on. 
+ parameter int unsigned SIM_FAST_MAINT = 0 ) ( input logic i_clk, input logic i_rst, @@ -76,7 +79,8 @@ module frost_cache_test_harness #( .L1I_CACHE_BYTES(L1I_CACHE_BYTES), .L2_CACHE_BYTES(L2_CACHE_BYTES), .L2_DATA_READ_LATENCY(L2_DATA_READ_LATENCY), - .L2_DATA_WRITE_LATENCY(L2_DATA_WRITE_LATENCY) + .L2_DATA_WRITE_LATENCY(L2_DATA_WRITE_LATENCY), + .SIM_FAST_MAINT(SIM_FAST_MAINT) ) cache_hierarchy ( .i_clk(i_clk), .i_rst(i_rst), diff --git a/hw/rtl/lib/ram/sdp_block_ram.sv b/hw/rtl/lib/ram/sdp_block_ram.sv index 81851060..b50b3d93 100644 --- a/hw/rtl/lib/ram/sdp_block_ram.sv +++ b/hw/rtl/lib/ram/sdp_block_ram.sv @@ -26,10 +26,21 @@ */ module sdp_block_ram #( parameter int unsigned ADDR_WIDTH = 5, // Address width in bits - parameter int unsigned DATA_WIDTH = 32 // Data width in bits + parameter int unsigned DATA_WIDTH = 32, // Data width in bits + // Simulation-only bulk-clear support. 0 (FPGA/synthesis): this module is + // byte-for-byte the plain single-write block RAM -- the clear path is not + // elaborated, so inference is unchanged. Non-zero: a sim-only path lets + // i_bulk_clear zero every entry in one cycle (frost_cache's fast + // invalidate-all). The clear branch lives in a generate that is elaborated + // only when this is set, so no synthesis flow ever sees the array-wide + // reset. + parameter int unsigned SUPPORT_BULK_CLEAR = 0 ) ( input logic i_clk, input logic i_write_enable, + // Sim-only one-cycle clear of every entry (see SUPPORT_BULK_CLEAR). Tied + // low / unused on FPGA builds (SUPPORT_BULK_CLEAR = 0). + input logic i_bulk_clear, input logic [ADDR_WIDTH-1:0] i_write_address, input logic [ADDR_WIDTH-1:0] i_read_address, input logic [DATA_WIDTH-1:0] i_write_data, @@ -42,8 +53,18 @@ module sdp_block_ram #( // Initialize all memory locations to zero initial for (int i = 0; i < RamDepth; ++i) ram[i] = '0; - // Synchronous write operation - always_ff @(posedge i_clk) if (i_write_enable) ram[i_write_address] <= i_write_data; + // Synchronous write. 
SUPPORT_BULK_CLEAR picks the write block at elaboration: + // the FPGA path is exactly the original single-port write (so block-RAM + // inference is unchanged); the sim-only path adds a one-cycle clear-all that + // takes priority over a write. Only one branch ever exists in a build. + if (SUPPORT_BULK_CLEAR != 0) begin : gen_clearable_write + always_ff @(posedge i_clk) begin + if (i_bulk_clear) for (int i = 0; i < int'(RamDepth); ++i) ram[i] <= '0; + else if (i_write_enable) ram[i_write_address] <= i_write_data; + end + end else begin : gen_plain_write + always_ff @(posedge i_clk) if (i_write_enable) ram[i_write_address] <= i_write_data; + end // Synchronous read - output registered for block RAM inference and timing always_ff @(posedge i_clk) o_read_data <= ram[i_read_address]; diff --git a/tests/Makefile b/tests/Makefile index 628e8335..03368610 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -364,8 +364,14 @@ ENABLE_CACHED_TIER ?= 1 CACHED_HAS_L2 ?= 1 DDR_MODEL_BYTES ?= 67108864 DDR_MODEL_LATENCY ?= 30 +# Fast cache maintenance (fence.i) for simulation: completes invalidate-all / +# writeback-all in drastically fewer cycles while preserving the exact +# functional effect, so the real Linux kernel's fence.i-heavy patching boots +# tractably. Defaults ON for sim; the FPGA default in the RTL stays 0 and the +# board builds never set it. 
+SIM_FAST_MAINT ?= 1 ifeq ($(TOPLEVEL),frost) - EXTRA_ARGS += -GMEM_SIZE_BYTES=$(SIM_MEM_SIZE_BYTES) -GENABLE_CACHED_TIER=$(ENABLE_CACHED_TIER) -GCACHED_HAS_L2=$(CACHED_HAS_L2) -GDDR_MODEL_BYTES=$(DDR_MODEL_BYTES) -GDDR_MODEL_LATENCY=$(DDR_MODEL_LATENCY) + EXTRA_ARGS += -GMEM_SIZE_BYTES=$(SIM_MEM_SIZE_BYTES) -GENABLE_CACHED_TIER=$(ENABLE_CACHED_TIER) -GCACHED_HAS_L2=$(CACHED_HAS_L2) -GDDR_MODEL_BYTES=$(DDR_MODEL_BYTES) -GDDR_MODEL_LATENCY=$(DDR_MODEL_LATENCY) -GSIM_FAST_MAINT=$(SIM_FAST_MAINT) endif # Include Cocotb simulation makefile rules diff --git a/tests/test_run_cocotb.py b/tests/test_run_cocotb.py index 7be4f8f2..bffb99df 100755 --- a/tests/test_run_cocotb.py +++ b/tests/test_run_cocotb.py @@ -575,6 +575,48 @@ class CocotbRunConfig: description="Cache hierarchy unit tests (L1 -> DDR, Genesys2 shape)", verilator_extra_args=("-GHAS_L2=0",), ), + # Same functional suite, but with the sim-only fast maintenance path + # (SIM_FAST_MAINT=1) enabled: proves invalidate-all / writeback-all stay + # functionally identical when the fence.i fast path is active. + "frost_cache_fast": CocotbRunConfig( + python_test_module="cocotb_tests.cache.test_frost_cache", + hdl_toplevel_module="frost_cache_test_harness", + description="Cache hierarchy unit tests, fast fence.i maintenance (L1 -> L2 -> DDR)", + verilator_extra_args=("-GHAS_L2=1", "-GSIM_FAST_MAINT=1"), + ), + "frost_cache_l1_only_fast": CocotbRunConfig( + python_test_module="cocotb_tests.cache.test_frost_cache", + hdl_toplevel_module="frost_cache_test_harness", + description="Cache hierarchy unit tests, fast fence.i maintenance (L1 -> DDR)", + verilator_extra_args=("-GHAS_L2=0", "-GSIM_FAST_MAINT=1"), + ), + # fence.i maintenance cycle-count measurement at the real L1 geometry + # (128 KiB D / 16 KiB I). Two builds, slow (FPGA-path FSM) vs fast, so the + # speedup is directly observable in the logs. Not part of the pytest sweep. 
+ "fence_speed_slow": CocotbRunConfig( + python_test_module="cocotb_tests.cache.test_fence_speed", + hdl_toplevel_module="frost_cache_test_harness", + description="fence.i maintenance cost, FPGA-path FSM (SIM_FAST_MAINT=0)", + verilator_extra_args=( + "-GHAS_L2=0", + "-GL1_CACHE_BYTES=131072", + "-GL1I_CACHE_BYTES=16384", + "-GSIM_FAST_MAINT=0", + ), + include_in_pytest=False, + ), + "fence_speed_fast": CocotbRunConfig( + python_test_module="cocotb_tests.cache.test_fence_speed", + hdl_toplevel_module="frost_cache_test_harness", + description="fence.i maintenance cost, fast sim path (SIM_FAST_MAINT=1)", + verilator_extra_args=( + "-GHAS_L2=0", + "-GL1_CACHE_BYTES=131072", + "-GL1I_CACHE_BYTES=16384", + "-GSIM_FAST_MAINT=1", + ), + include_in_pytest=False, + ), "line_port_arbiter": CocotbRunConfig( python_test_module="cocotb_tests.cache.test_line_port_arbiter", hdl_toplevel_module="line_port_arbiter_test_harness", diff --git a/verif/cocotb_tests/cache/test_fence_speed.py b/verif/cocotb_tests/cache/test_fence_speed.py new file mode 100644 index 00000000..af70b403 --- /dev/null +++ b/verif/cocotb_tests/cache/test_fence_speed.py @@ -0,0 +1,141 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""fence.i maintenance cycle-count measurement (frost_cache_test_harness DUT). 
+ +Drives the cache hierarchy at the real L1 geometry (128 KiB D-side / 16 KiB +I-side, set via -G in the registry), dirties a handful of D-side lines, then +issues one fence.i cache-sync handshake and counts the cycles from sync-assert +to done. Run the two registry builds to see the speedup directly: + + ./test_run_cocotb.py fence_speed_slow # SIM_FAST_MAINT=0 (FPGA-path FSM) + ./test_run_cocotb.py fence_speed_fast # SIM_FAST_MAINT=1 (fast sim path) + +The slow build walks every line (writeback-all over 4096 lines + invalidate-all +over 512 lines, ~thousands of cycles); the fast build touches only the dirty +lines and bulk-clears the tags (low hundreds or fewer). The measured count is +logged as `FENCE_I_MAINT_CYCLES=` for easy comparison. +""" + +from typing import Any + +import cocotb +from cocotb.clock import Clock +from cocotb.triggers import FallingEdge, RisingEdge + +CLOCK_PERIOD_NS = 10 +LINE_BYTES = 32 +BASE_ADDR = 0x8000_0000 + +# Generous: the slow reset sweep walks every L1 line (4096) before ready. +READY_TIMEOUT_CYCLES = 100_000 +RESP_TIMEOUT_CYCLES = 20_000 +FENCE_TIMEOUT_CYCLES = 200_000 + +# Number of distinct dirty D-side lines to publish before the fence. 
+NUM_DIRTY_LINES = 16 + + +def _clear_inputs(dut: Any) -> None: + dut.i_up_req_valid.value = 0 + dut.i_up_req_write.value = 0 + dut.i_up_req_addr.value = 0 + dut.i_up_req_wdata.value = 0 + dut.i_up_req_wstrb.value = 0 + dut.i_iup_req_valid.value = 0 + dut.i_iup_req_write.value = 0 + dut.i_iup_req_addr.value = 0 + dut.i_iup_req_wdata.value = 0 + dut.i_iup_req_wstrb.value = 0 + dut.i_fence_sync.value = 0 + + +async def _setup(dut: Any) -> None: + """Start the clock, reset, and wait out the tag-invalidate sweep.""" + cocotb.start_soon(Clock(dut.i_clk, CLOCK_PERIOD_NS, unit="ns").start()) + _clear_inputs(dut) + dut.i_rst.value = 1 + for _ in range(4): + await RisingEdge(dut.i_clk) + await FallingEdge(dut.i_clk) + dut.i_rst.value = 0 + for _ in range(READY_TIMEOUT_CYCLES): + await FallingEdge(dut.i_clk) + if int(dut.o_up_req_ready.value) == 1 and int(dut.o_iup_req_ready.value) == 1: + return + raise AssertionError("cache never became ready after reset (sweep stuck?)") + + +async def _write_line(dut: Any, addr: int, wdata: int) -> None: + """Whole-line D-side write (dirties the line in L1).""" + full = (1 << LINE_BYTES) - 1 + await FallingEdge(dut.i_clk) + dut.i_up_req_valid.value = 1 + dut.i_up_req_write.value = 1 + dut.i_up_req_addr.value = addr + dut.i_up_req_wdata.value = wdata + dut.i_up_req_wstrb.value = full + for _ in range(RESP_TIMEOUT_CYCLES): + if int(dut.o_up_req_ready.value) == 1: + break + await FallingEdge(dut.i_clk) + else: + raise AssertionError(f"write never accepted (addr=0x{addr:08x})") + await FallingEdge(dut.i_clk) + dut.i_up_req_valid.value = 0 + dut.i_up_req_write.value = 0 + for _ in range(RESP_TIMEOUT_CYCLES): + if int(dut.o_up_resp_valid.value) == 1: + return + await FallingEdge(dut.i_clk) + raise AssertionError(f"no write response (addr=0x{addr:08x})") + + +async def _measure_fence_cycles(dut: Any) -> int: + """Assert i_fence_sync and count cycles until o_fence_done rises.""" + await FallingEdge(dut.i_clk) + dut.i_fence_sync.value = 1 + cycles 
= 0 + for _ in range(FENCE_TIMEOUT_CYCLES): + await RisingEdge(dut.i_clk) + cycles += 1 + if int(dut.o_fence_done.value) == 1: + break + else: + raise AssertionError("fence sync never completed") + await FallingEdge(dut.i_clk) + dut.i_fence_sync.value = 0 + await FallingEdge(dut.i_clk) + return cycles + + +@cocotb.test() +async def test_fence_i_maintenance_cycles(dut: Any) -> None: + """Dirty several lines, fence, and report the maintenance cycle count.""" + await _setup(dut) + + for i in range(NUM_DIRTY_LINES): + addr = BASE_ADDR + i * LINE_BYTES + wdata = int.from_bytes(bytes([(i * 7 + b) & 0xFF for b in range(32)]), "little") + await _write_line(dut, addr, wdata) + + cycles = await _measure_fence_cycles(dut) + dut._log.info( + f"FENCE_I_MAINT_CYCLES={cycles} (dirty_lines={NUM_DIRTY_LINES}, " + f"L1=128KiB/4096 lines, L1I=16KiB/512 lines)" + ) + + # Sanity only: completion within the timeout. The slow vs fast comparison is + # read from the logged FENCE_I_MAINT_CYCLES line across the two builds. + assert cycles < FENCE_TIMEOUT_CYCLES, "fence.i maintenance did not complete" From f535fc821f3031e143db6c5b83b72accc3ee7274 Mon Sep 17 00:00:00 2001 From: Adam Bagley Date: Sun, 28 Jun 2026 13:05:18 -0400 Subject: [PATCH 20/43] ci: Buildroot BR2_EXTERNAL for reproducible no-MMU Linux kernel builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit linux/buildroot-external/ — BR2_EXTERNAL board support to build the FROST no-MMU RV32 kernel (Linux 6.18.7) reproducibly: defconfig, kernel config fragment, DTS, post-image packer (build_fpga_boot.py), rootfs. Plus .github/workflows/linux-boot-sim.yml — CI that builds via the external tree, stages sw*.mem, and runs the cocotb linux_boot sim. Packer validated byte-identical to the existing hand-built artifacts. NOTE (follow-up): add the Buildroot submodule pin — local tree is 2026.08-git @67449130 (not the tagged 2026.05 the README assumes); pick one before CI runs. 
The vendored build_fpga_boot.py carries a ruff D103/UP031 noqa pending refactor. --- .github/workflows/linux-boot-sim.yml | 174 +++++++++++++ linux/buildroot-external/Config.in | 18 ++ linux/buildroot-external/README.md | 142 ++++++++++ .../board/frost/build_fpga_boot.py | 243 ++++++++++++++++++ .../board/frost/frost-nommu-fpga.dts | 67 +++++ .../board/frost/linux-nommu-base.config | 21 ++ .../frost/linux-nommu-frost.config.fragment | 188 ++++++++++++++ .../board/frost/patches/linux/linux.hash | 7 + .../board/frost/post-image.sh | 77 ++++++ .../configs/frost_nommu_rv32_defconfig | 53 ++++ linux/buildroot-external/external.desc | 2 + linux/buildroot-external/external.mk | 22 ++ 12 files changed, 1014 insertions(+) create mode 100644 .github/workflows/linux-boot-sim.yml create mode 100644 linux/buildroot-external/Config.in create mode 100644 linux/buildroot-external/README.md create mode 100755 linux/buildroot-external/board/frost/build_fpga_boot.py create mode 100644 linux/buildroot-external/board/frost/frost-nommu-fpga.dts create mode 100644 linux/buildroot-external/board/frost/linux-nommu-base.config create mode 100644 linux/buildroot-external/board/frost/linux-nommu-frost.config.fragment create mode 100644 linux/buildroot-external/board/frost/patches/linux/linux.hash create mode 100755 linux/buildroot-external/board/frost/post-image.sh create mode 100644 linux/buildroot-external/configs/frost_nommu_rv32_defconfig create mode 100644 linux/buildroot-external/external.desc create mode 100644 linux/buildroot-external/external.mk diff --git a/.github/workflows/linux-boot-sim.yml b/.github/workflows/linux-boot-sim.yml new file mode 100644 index 00000000..8a518c0d --- /dev/null +++ b/.github/workflows/linux-boot-sim.yml @@ -0,0 +1,174 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Build the FROST RV32 no-MMU M-mode Linux kernel + initramfs with Buildroot +# (via the linux/buildroot-external BR2_EXTERNAL tree), package it into the +# FROST sim memory images, and run the cocotb `linux_boot` simulation. +# +# Prerequisites (one-time, not done by this workflow): +# git submodule add https://github.com/buildroot/buildroot.git linux/buildroot +# git -C linux/buildroot checkout 2026.05 # see README pin caveat +# +# This is a DRAFT. TODOs mark the spots that need real runner details. +name: Linux Boot Sim + +on: + workflow_dispatch: + pull_request: + branches: [main] + paths: + - "linux/buildroot-external/**" + - ".github/workflows/linux-boot-sim.yml" + +env: + # The kernel build inputs were captured from buildroot 2026.08-git (67449130). + # 2026.05 is the requested tag; verify it has gcc 15.2.0 / binutils 2.45.1 / + # linux-6.18 headers (README pin caveat) or change this to the commit SHA. + BUILDROOT_REF: "2026.05" + # Out-of-tree Buildroot build dir + a stable download cache dir. + BR2_OUT: "linux/build" + BR2_DL_DIR: "linux/dl" + +jobs: + # --------------------------------------------------------------------------- + # Build the kernel + initramfs and package the FROST sim memory images. + # --------------------------------------------------------------------------- + build-kernel: + name: Build FROST Linux (Buildroot) + runs-on: ubuntu-24.04 + # First build compiles the cross toolchain from source (~30-60 min). With + # the dl/ + ccache caches warm, subsequent runs are ~10-20 min. 
+ timeout-minutes: 120 + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + # Recursive so the pinned linux/buildroot submodule is fetched. + submodules: recursive + + # TODO: if the Buildroot submodule is not yet committed, add it here: + # - run: | + # git submodule add https://github.com/buildroot/buildroot.git linux/buildroot || true + # git -C linux/buildroot fetch --depth 1 origin ${{ env.BUILDROOT_REF }} + # git -C linux/buildroot checkout FETCH_HEAD + + - name: Install Buildroot host dependencies + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends \ + build-essential gcc g++ make file cpio unzip rsync bc wget \ + python3 git ccache flex bison libssl-dev libelf-dev \ + libncurses-dev device-tree-compiler + + - name: Cache Buildroot downloads (source tarballs) + uses: actions/cache@v4 + with: + path: ${{ env.BR2_DL_DIR }} + # dl/ only changes when a package version (kernel/toolchain) changes. + key: br2-dl-${{ env.BUILDROOT_REF }}-${{ hashFiles('linux/buildroot-external/configs/frost_nommu_rv32_defconfig', 'linux/buildroot-external/board/frost/**') }} + restore-keys: | + br2-dl-${{ env.BUILDROOT_REF }}- + + - name: Cache ccache (toolchain + kernel object cache) + uses: actions/cache@v4 + with: + path: ~/.buildroot-ccache + key: br2-ccache-${{ env.BUILDROOT_REF }}-${{ github.sha }} + restore-keys: | + br2-ccache-${{ env.BUILDROOT_REF }}- + + - name: Configure (frost_nommu_rv32_defconfig) + run: | + make -C linux/buildroot O="${{ github.workspace }}/${{ env.BR2_OUT }}" \ + BR2_EXTERNAL="${{ github.workspace }}/linux/buildroot-external" \ + BR2_DL_DIR="${{ github.workspace }}/${{ env.BR2_DL_DIR }}" \ + frost_nommu_rv32_defconfig + # Enable ccache for CI only (kept out of the committed defconfig). 
+ echo 'BR2_CCACHE=y' >> "${{ github.workspace }}/${{ env.BR2_OUT }}/.config" + make -C linux/buildroot O="${{ github.workspace }}/${{ env.BR2_OUT }}" olddefconfig + + - name: Build kernel + initramfs + FROST memory images + run: | + make -C linux/buildroot O="${{ github.workspace }}/${{ env.BR2_OUT }}" \ + BR2_DL_DIR="${{ github.workspace }}/${{ env.BR2_DL_DIR }}" + + - name: Stage memory images for the cocotb linux_boot test + run: | + mkdir -p sw/apps/linux_boot + cp "${{ env.BR2_OUT }}/images/sw.mem" sw/apps/linux_boot/sw.mem + cp "${{ env.BR2_OUT }}/images/sw_ddr.mem" sw/apps/linux_boot/sw_ddr.mem + ls -l "${{ env.BR2_OUT }}/images/" + + - name: Upload FROST boot images + uses: actions/upload-artifact@v4 + with: + name: frost-linux-boot-images + path: | + ${{ env.BR2_OUT }}/images/Image + ${{ env.BR2_OUT }}/images/rootfs.cpio.gz + ${{ env.BR2_OUT }}/images/frost-nommu-fpga.dtb + ${{ env.BR2_OUT }}/images/sw.mem + ${{ env.BR2_OUT }}/images/sw_ddr.mem + retention-days: 7 + + # --------------------------------------------------------------------------- + # Run the cocotb linux_boot simulation against the packaged images. + # Reuses the same Dockerized toolchain as ci.yml (Verilator + cocotb). + # --------------------------------------------------------------------------- + linux-boot-sim: + name: Cocotb linux_boot (Verilator) + runs-on: ubuntu-24.04 + needs: build-kernel + timeout-minutes: 120 + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Download FROST boot images + uses: actions/download-artifact@v4 + with: + name: frost-linux-boot-images + path: /tmp/frost-images + + - name: Stage images into sw/apps/linux_boot + run: | + mkdir -p sw/apps/linux_boot + cp /tmp/frost-images/sw.mem sw/apps/linux_boot/sw.mem + cp /tmp/frost-images/sw_ddr.mem sw/apps/linux_boot/sw_ddr.mem + + # Build the frost-dev image (same Dockerfile/pattern as ci.yml). 
To avoid + # rebuilding it here, this could instead reuse ci.yml's uploaded + # frost-docker-image artifact. TODO: share the image across workflows. + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build frost-dev Docker image + uses: docker/build-push-action@v6 + with: + context: . + tags: frost-dev:latest + load: true + cache-from: type=gha + cache-to: type=gha,mode=max + + # TODO: the `linux_boot` TEST_REGISTRY entry / cocotb harness lives on the + # FROST Linux feature branch, not on main. Once it lands, this runs the + # boot sim against the staged sw_ddr.mem. It is a long-running sim, so + # cap it with the pytest/cocotb timeout the harness defines. + - name: Run cocotb linux_boot simulation + run: | + docker run --rm -v ${{ github.workspace }}:/workspace frost-dev:latest \ + pytest tests/test_run_cocotb.py -m cocotb -k linux_boot -v diff --git a/linux/buildroot-external/Config.in b/linux/buildroot-external/Config.in new file mode 100644 index 00000000..4de49724 --- /dev/null +++ b/linux/buildroot-external/Config.in @@ -0,0 +1,18 @@ +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The FROST external tree provides no extra target packages today; the kernel, +# toolchain and rootfs are all selected by configs/frost_nommu_rv32_defconfig. +# Add `source "$BR2_EXTERNAL_FROST_PATH/package/<pkg>/Config.in"` lines here if +# FROST-specific Buildroot packages are introduced later.
diff --git a/linux/buildroot-external/README.md b/linux/buildroot-external/README.md new file mode 100644 index 00000000..056d9d16 --- /dev/null +++ b/linux/buildroot-external/README.md @@ -0,0 +1,142 @@ + + +# FROST Buildroot external tree (`BR2_EXTERNAL`) + +Reproducibly builds the FROST **RV32 / no-MMU / M-mode Linux** kernel (6.18.7), +a busybox initramfs, and packages them into the memory images the FROST cocotb +`linux_boot` simulation (and the FPGA JTAG loader) consume. + +This is a standard Buildroot [`BR2_EXTERNAL`](https://buildroot.org/downloads/manual/manual.html#outside-br-custom) +tree. It carries **no** Buildroot source itself — point an out-of-tree build at +a pinned upstream Buildroot checkout (see *Buildroot pin* below). + +## Layout + +``` +linux/buildroot-external/ +├── external.desc # BR2_EXTERNAL manifest (name: FROST) +├── external.mk # package include hook (no packages today) +├── Config.in # package menu hook (empty today) +├── configs/ +│ └── frost_nommu_rv32_defconfig # the FROST Buildroot defconfig +└── board/frost/ + ├── linux-nommu-base.config # base kernel config (from buildroot board/qemu/riscv32-virt) + ├── linux-nommu-frost.config.fragment # FROST kernel CONFIG delta, merged on top of the base + ├── frost-nommu-fpga.dts # reference DTB source (the packer regenerates it per build) + ├── build_fpga_boot.py # packer: Image + DTB + initramfs -> sw.{mem,txt}, sw_ddr.{mem,txt} + ├── post-image.sh # Buildroot post-image hook -> runs the packer + └── patches/linux/linux.hash # sha256 for the custom linux-6.18.7 tarball +``` + +## Buildroot pin + +The FROST artifacts this tree reproduces were captured from a Buildroot +**`2026.08-git`** snapshot (commit `67449130`), which provides the defaults this +defconfig relies on: **gcc 15.2.0**, **binutils 2.45.1**, the internal rv32-nommu +**uClibc** toolchain, and the **Linux 6.18** host-headers option. 
+ +Add Buildroot as a submodule next to this tree: + +```bash +git submodule add https://github.com/buildroot/buildroot.git linux/buildroot +git -C linux/buildroot checkout 2026.05 # tag requested for CI +git add .gitmodules linux/buildroot +git commit -m "linux: vendor buildroot 2026.05 as a submodule" +``` + +> **Pin caveat (needs a human decision).** The task specifies the **2026.05** +> tag, but the local artifacts were actually built from `2026.08-git` +> (`67449130`). The local Buildroot is a shallow clone with no tags, so 2026.05 +> could not be verified offline. Before trusting 2026.05, confirm it ships +> `BR2_GCC_VERSION_15_X` (15.2.0), `BR2_BINUTILS_VERSION_2_45_X` (2.45.1) and +> `BR2_PACKAGE_HOST_LINUX_HEADERS_CUSTOM_6_18`. If it doesn't, pin the exact +> commit instead: +> `git -C linux/buildroot checkout 67449130`. + +## Build + +Out-of-tree build (keeps the Buildroot submodule pristine): + +```bash +# from the repo root +make -C linux/buildroot O="$(pwd)/linux/build" \ + BR2_EXTERNAL="$(pwd)/linux/buildroot-external" frost_nommu_rv32_defconfig +make -C linux/buildroot O="$(pwd)/linux/build" +``` + +First build is ~30–60 min (it builds the cross toolchain from source). Outputs +land in `linux/build/images/`: + +| File | Purpose | +|---|---| +| `Image` | rv32 no-MMU kernel (flat, uncompressed) | +| `rootfs.cpio.gz` | busybox initramfs | +| `frost-nommu-fpga.dtb` | generated FROST device tree (UART/CLINT @ 0x4000_xxxx, 133.333 MHz) | +| `sw.mem` / `sw.txt` | low-BRAM boot shim (`a0=0`, `a1=DTB`, jump to kernel) | +| `sw_ddr.mem` / `sw_ddr.txt` | DDR image: kernel @ 0x8000_0000, DTB @ 0x8080_0000, initramfs @ 0x8081_0000 | + +## Feeding the cocotb `linux_boot` test + +`tests/test_run_cocotb.py` resolves an app's images at +`sw/apps/<app>/sw.mem` (+ `sw_ddr.mem`).
Stage the build outputs there: + +```bash +mkdir -p sw/apps/linux_boot +cp linux/build/images/sw.mem sw/apps/linux_boot/sw.mem +cp linux/build/images/sw_ddr.mem sw/apps/linux_boot/sw_ddr.mem +# then, per the repo CLAUDE.md test flow: +cd tests && make clean && ./test_run_cocotb.py linux_boot +``` + +The `linux_boot` registry entry / harness is **not** part of this external tree +(it lives with the cocotb tests on the FROST Linux feature branch). + +## How the kernel config is assembled + +`BR2_LINUX_KERNEL_USE_CUSTOM_CONFIG` uses `board/frost/linux-nommu-base.config` +as the base, and `BR2_LINUX_KERNEL_CONFIG_FRAGMENT_FILES` merges +`board/frost/linux-nommu-frost.config.fragment` on top (kconfig +`merge_config.sh` semantics). The fragment retargets the known-good QEMU-virt +nommu kernel at FROST: it keeps M-mode / rv32 / no-MMU / bFLT, switches the +rootfs to an initramfs (`BLK_DEV_INITRD` + `RD_GZIP`), and drops +virtio / PCI / net / ext2 / PLIC. See the header of the fragment for the full, +per-symbol rationale and the hardware caveats. + +## Notes, assumptions and gaps + +- **Rootfs reproduction.** `rootfs.cpio.gz` is reproduced from Buildroot's + default busybox (`busybox-minimal.config`) + `BR2_TARGET_ROOTFS_CPIO[_GZIP]`, + not vendored. It is functionally equivalent to the hand-made + `frost-artifacts/rootfs.cpio.gz` but **not** byte-identical. Add a + `rootfs-overlay/` + `BR2_ROOTFS_OVERLAY` here if a specific userspace is + required. +- **Fragment vs. the latest hand-built Image.** This defconfig *applies* the + FROST fragment (per the build notes' "Option A"). The most recent artifact + `Image` checked on the dev box was actually built from the **stock** + `qemu_riscv32_nommu_virt_defconfig` *without* the fragment (it still had + `CONFIG_NET` / `CONFIG_VIRTIO_BLK` / `CONFIG_SIFIVE_PLIC` / `CONFIG_EXT2_FS` + set). 
Decide whether the fragment-applied kernel here is the intended target + (it should be — it is strictly closer to FROST and the generated DTB has no + PLIC/virtio nodes) or whether to drop the fragment to match that artifact + bit-for-bit. +- **Boot shim toolchain.** Standalone, the packer uses the xPack + `riscv-none-elf-*` bare-metal toolchain (`rv32i_zicsr` / `ilp32`). In CI + `post-image.sh` instead uses the Buildroot-built `riscv32-*-` toolchain with + its own default `-march`/`-mabi` (the shim is ABI-agnostic integer code). +- **`dtc`.** `post-image.sh` prefers `$HOST_DIR/bin/dtc`, then the kernel's + `scripts/dtc/dtc`, then `$PATH`. Enable `BR2_PACKAGE_HOST_DTC=y` if you want + to guarantee a host `dtc` independent of the kernel build. diff --git a/linux/buildroot-external/board/frost/build_fpga_boot.py b/linux/buildroot-external/board/frost/build_fpga_boot.py new file mode 100755 index 00000000..064575e1 --- /dev/null +++ b/linux/buildroot-external/board/frost/build_fpga_boot.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python3 + +# Copyright 2026 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Vendored from frost-artifacts/build_fpga_boot.py; style carve-outs pending a refactor. +# ruff: noqa: D103, UP031 + +"""Build a FROST FPGA / sim no-MMU Linux boot image. + +Derived from frost-artifacts/build_fpga_boot.py. 
The packing logic (memory +layout, word format, DTB template and boot shim) is unchanged; the only +additions are environment overrides so the script runs both: + + * standalone on a dev box (xPack riscv-none-elf toolchain, original paths), and + * as a Buildroot post-image hook in CI (board/frost/post-image.sh sets the + env to point at Buildroot's $BINARIES_DIR and its just-built toolchain). + +Emits BOTH forms of each image: + sw.{mem,txt} low BRAM: boot shim (a0=0, a1=DTB, jr kernel entry). + sw_ddr.{mem,txt} DDR (offset 0 == 0x8000_0000): kernel Image @ 0, + DTB @ 0x80_0000, initramfs (cpio.gz) @ 0x81_0000. + + .mem = $readmemh form (sim): "@" directives + word values. + .txt = FPGA-loader form: dense, one little-endian word value per line from + offset 0 (file_to_bram.tcl / file_to_ddr.tcl burst it sequentially). +Both carry identical little-endian word values. + +Environment overrides (all optional; defaults reproduce the standalone build): + FROST_IMAGE kernel Image path (default: ~/bigger_l0/linux-mvp/buildroot/output/images/Image) + FROST_INITRD rootfs.cpio.gz path (default: