From 8057dd5b918fe14824a7697985f04868a97e7d1d Mon Sep 17 00:00:00 2001 From: hwirys <121537293+hwirys@users.noreply.github.com> Date: Thu, 30 Apr 2026 23:24:17 +0900 Subject: [PATCH] =?UTF-8?q?hw:=20VX=5Faxi=5Fadapter=20=E2=80=94=20fix=20tw?= =?UTF-8?q?o=20AXI4=20master=20correctness=20bugs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1) AW-vs-AR ordering. AW and AR fired with different IDs are not ordered by AXI4 (A5), so on real DDR4 controllers a load issued right after a writethrough store on the same line can return the pre-store value. This silently corrupted any function-call-boundary stack spill/fill on Alveo U250 (e.g. dogfood Test21-trig sinf, regression `diverge` ~9/10 fail) while simx and rtlsim — whose memory models commit writes synchronously — passed. Fix: per-bank counter of outstanding writes (AW handshakes not yet matched by B) gates AR on the same bank only; cross-bank parallelism is preserved. New parameter WRITE_BEFORE_READ defaults to 1; verification environments that already serialize writes can set it to 0. 2) Multi-bank-out address generation. With NUM_BANKS_OUT > 1 (separate AXI master per DDR channel, e.g. U200/U250 4-bank), the previous formula `byte_addr = bank_addr * NUM_BANKS * LINE_SIZE + i * LINE_SIZE` only matched a single-master HBM-style fabric. With per-channel masters each DDR controller has its own [0, capacity) space and the host BANK_INTERLEAVE math at runtime/xrt/vortex.cpp:get_bank_info places data at compact offset `(block_addr / NUM_BANKS) * LINE_SIZE` in bo[bank] — the AFU and host disagreed by a stride that grew with VA, the long-standing 4-bank corruption. Fix: emit `byte_addr = bank_addr * LINE_SIZE` per channel. With NUM_BANKS_OUT = 1 this is bit-identical to the previous behaviour, so HBM single-master configurations (U280/U55C/U50) and VCK5000 single-channel are unchanged. Verified on real Alveo U250 (XRT 2.19, xdma_shell_4_1) single-bank build with default DSP FPU at 200 MHz: regression `diverge` 10/10 PASS (was ~1/10), dogfood Test0–Test20, vecadd, sgemm, dotproduct, demo, dropout, conv3, io_addr, fence, plus OpenCL (saxpy, vecadd, sgemm, sgemm2, sgemm3, stencil, sfilter, spmv, psort, oclprintf) and single-rank MPI (mpi_vecadd, mpi_dotproduct, mpi_diverge, mpi_put_dotproduct) — all pass. Final WNS = +0.057 ns; the patch adds ~7 LUT/FF per bank. Refs: #202, #262, #263, #278, #333. --- hw/rtl/libs/VX_axi_adapter.sv | 44 ++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/hw/rtl/libs/VX_axi_adapter.sv b/hw/rtl/libs/VX_axi_adapter.sv index 4215128543..930208f2d9 100644 --- a/hw/rtl/libs/VX_axi_adapter.sv +++ b/hw/rtl/libs/VX_axi_adapter.sv @@ -27,6 +27,10 @@ module VX_axi_adapter #( parameter ARBITER = "R", parameter REQ_OUT_BUF = 0, parameter RSP_OUT_BUF = 0, + // Per-bank store-before-load AXI ordering. AW and AR with different IDs + // are not ordered by AXI4 (A5); set to 0 in verif environments whose + // memory model commits writes synchronously. + parameter WRITE_BEFORE_READ = 1, parameter DATA_SIZE = DATA_WIDTH/8 ) ( input wire clk, @@ -246,17 +250,36 @@ module VX_axi_adapter #( `UNUSED_PIN (tx_ack) ); - assign req_xbar_ready_out[i] = xbar_rw_out ? axi_write_ready : m_axi_arready[i]; + // Outstanding-write tracker: gate AR while AW is unmatched by B. + wire writes_pending; + if (WRITE_BEFORE_READ) begin : g_write_tracker + reg [7:0] pending_writes_r; + wire aw_fire = m_axi_awvalid[i] && m_axi_awready[i]; + wire b_fire = m_axi_bvalid[i] && m_axi_bready[i]; + always @(posedge clk) begin + if (reset) begin + pending_writes_r <= '0; + end else if (aw_fire && ~b_fire) begin + pending_writes_r <= pending_writes_r + 8'd1; + end else if (~aw_fire && b_fire) begin + pending_writes_r <= pending_writes_r - 8'd1; + end + end + assign writes_pending = (pending_writes_r != '0); + end else begin : g_no_write_tracker + assign writes_pending = 1'b0; + end + + assign req_xbar_ready_out[i] = xbar_rw_out ? axi_write_ready + : (m_axi_arready[i] && ~writes_pending); // AXI write address channel assign m_axi_awvalid[i] = req_xbar_valid_out[i] && xbar_rw_out && ~m_axi_aw_ack; - if (INTERLEAVE) begin : g_m_axi_awaddr_i - assign m_axi_awaddr[i] = (ADDR_WIDTH_OUT'(xbar_addr_out) << (BANK_SEL_BITS + LOG2_DATA_SIZE)) | (ADDR_WIDTH_OUT'(i) << LOG2_DATA_SIZE); - end else begin : g_m_axi_awaddr_ni - assign m_axi_awaddr[i] = (ADDR_WIDTH_OUT'(xbar_addr_out) << LOG2_DATA_SIZE) | (ADDR_WIDTH_OUT'(i) << (BANK_ADDR_WIDTH + LOG2_DATA_SIZE)); - end + // Each m_axi_mem_ drives its own DDR controller, so the byte address + // is the compact per-bank cache-line index (bank_sel routed via i). + assign m_axi_awaddr[i] = ADDR_WIDTH_OUT'(xbar_addr_out) << LOG2_DATA_SIZE; assign m_axi_awid[i] = TAG_WIDTH_OUT'(xbar_tag_out); assign m_axi_awlen[i] = 8'b00000000; @@ -285,14 +308,9 @@ module VX_axi_adapter #( assign xbar_tag_r_out = READ_TAG_WIDTH'(xbar_tag_out); end - assign m_axi_arvalid[i] = req_xbar_valid_out[i] && ~xbar_rw_out; + assign m_axi_arvalid[i] = req_xbar_valid_out[i] && ~xbar_rw_out && ~writes_pending; - // convert address to byte-addressable space - if (INTERLEAVE) begin : g_m_axi_araddr_i - assign m_axi_araddr[i] = (ADDR_WIDTH_OUT'(xbar_addr_out) << (BANK_SEL_BITS + LOG2_DATA_SIZE)) | (ADDR_WIDTH_OUT'(i) << LOG2_DATA_SIZE); - end else begin : g_m_axi_araddr_ni - assign m_axi_araddr[i] = (ADDR_WIDTH_OUT'(xbar_addr_out) << LOG2_DATA_SIZE) | (ADDR_WIDTH_OUT'(i) << (BANK_ADDR_WIDTH + LOG2_DATA_SIZE)); - end + assign m_axi_araddr[i] = ADDR_WIDTH_OUT'(xbar_addr_out) << LOG2_DATA_SIZE; assign m_axi_arid[i] = TAG_WIDTH_OUT'(xbar_tag_r_out); assign m_axi_arlen[i] = 8'b00000000; assign m_axi_arsize[i] = 3'(LOG2_DATA_SIZE);