From a08b8d727adceb1cf468db1b8289d6d7cbd22b8f Mon Sep 17 00:00:00 2001 From: hwirys <121537293+hwirys@users.noreply.github.com> Date: Fri, 1 May 2026 08:42:03 +0900 Subject: [PATCH] =?UTF-8?q?hw:=20per-bank=20PLATFORM=5FMEMORY=5FOFFSET=20?= =?UTF-8?q?=E2=80=94=20fix=20U250=20vx=5Fbusy=20hang?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When Vortex is built for an XRT-based Xilinx platform that allocates each memory bank as a separate xrt::bo, XRT picks a different virtual address per bank (e.g. on U250 bank 0 lands at 0x40_00000000) — which is far above Vortex's compile-time absolute addresses STARTUP_ADDR = 0x180000000 and STACK_BASE_ADDR = 0x1FFFF0000. AXI requests from Vortex therefore fail to decode at the slave, vx_busy stays high forever, and the kernel never starts. This is the underlying cause of the long-running U250 hang reports (#262, #263, #278) — Vortex 2.3 added the U250 build path but did not address this BAR mismatch, so the produced xclbin has never actually booted on real silicon. Fix - vortex_afu.vh: introduce per-bank PLATFORM_MEMORY_OFFSET_<i> (i = 0..3) macros, each defaulting to the legacy global PLATFORM_MEMORY_OFFSET so HBM platforms (U280/U55C/U50) and VCK5000 single-channel are byte-for-byte unchanged. - VX_afu_wrap.sv: build a 4-entry platform_memory_offsets array from those macros and add the bank-i offset to each outgoing m_axi_mem_<i> AW/AR address. - platforms.mk U250: switch to single-bank (NUM_BANKS=1, DDR[0]) and set PLATFORM_MEMORY_OFFSET_0=40'h4000000000 so the build works end-to-end out of the box. Multi-bank (full 64 GB) deployment needs a runtime mechanism to push each bo's actual XRT VA into the AFU and will follow as a separate PR. Verified end-to-end on real Alveo U250 (XRT 2.19.194, shell xilinx_u250_gen3x16_xdma_4_1_202210_1) at 200 MHz with the default DSP FPU. 
Without this patch the kernel hangs immediately at ap_start (vx_busy stuck high, CTL register reads back 0x1 indefinitely). With the patch the kernel boots and: - regression `vecadd` (n=16..16384), `sgemm`, `dotproduct`, `demo`, `dropout`, `conv3`, `io_addr`, `fence`, `diverge` — all pass - `dogfood` Test0..Test20 — pass - OpenCL: `saxpy`, `vecadd`, `sgemm`, `sgemm2`, `sgemm3`, `stencil`, `sfilter`, `spmv`, `psort`, `oclprintf` — pass - single-rank MPI: `mpi_vecadd`, `mpi_dotproduct`, `mpi_diverge`, `mpi_put_dotproduct` — pass Final WNS = +0.057 ns at 200 MHz; the patch is purely combinational addressing logic (3 extra adds in the AFU) so area and timing impact are negligible. Refs: #262, #263, #278. --- hw/rtl/afu/xrt/VX_afu_wrap.sv | 16 ++++++++++++++-- hw/rtl/afu/xrt/vortex_afu.vh | 18 ++++++++++++++++++ hw/syn/xilinx/xrt/platforms.mk | 8 ++++++-- 3 files changed, 38 insertions(+), 4 deletions(-) diff --git a/hw/rtl/afu/xrt/VX_afu_wrap.sv b/hw/rtl/afu/xrt/VX_afu_wrap.sv index 39cc264924..0928fb9c2c 100644 --- a/hw/rtl/afu/xrt/VX_afu_wrap.sv +++ b/hw/rtl/afu/xrt/VX_afu_wrap.sv @@ -287,9 +287,21 @@ module VX_afu_wrap import VX_gpu_pkg::*; #( wire [M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr_u [C_M_AXI_MEM_NUM_BANKS]; wire [M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_araddr_u [C_M_AXI_MEM_NUM_BANKS]; + // Per-bank XRT BO base offsets. Each m_axi_mem_<i> port goes to a different + // xrt::bo (one per DDR/HBM channel) which XRT places at a different virtual + // base address, so a single global PLATFORM_MEMORY_OFFSET cannot cover all + // banks. PLATFORM_MEMORY_OFFSET_<i> overrides per bank; each defaults to + // PLATFORM_MEMORY_OFFSET so existing single-bank platforms (HBM, VCK5000) + // are unchanged. 
+ wire [C_M_AXI_MEM_ADDR_WIDTH-1:0] platform_memory_offsets [4]; + assign platform_memory_offsets[0] = C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET_0); + assign platform_memory_offsets[1] = C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET_1); + assign platform_memory_offsets[2] = C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET_2); + assign platform_memory_offsets[3] = C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET_3); + for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_addressing - assign m_axi_mem_awaddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_awaddr_u[i]) + C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET); - assign m_axi_mem_araddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_araddr_u[i]) + C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET); + assign m_axi_mem_awaddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_awaddr_u[i]) + platform_memory_offsets[i]; + assign m_axi_mem_araddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_araddr_u[i]) + platform_memory_offsets[i]; end `SCOPE_IO_SWITCH (2); diff --git a/hw/rtl/afu/xrt/vortex_afu.vh b/hw/rtl/afu/xrt/vortex_afu.vh index c66ede2b71..f1c6af810d 100644 --- a/hw/rtl/afu/xrt/vortex_afu.vh +++ b/hw/rtl/afu/xrt/vortex_afu.vh @@ -18,6 +18,24 @@ `define PLATFORM_MEMORY_OFFSET 0 `endif +// Per-bank XRT BO base address. Each m_axi_mem_<i> port's outgoing AXI byte +// address gets this offset added so that Vortex's compile-time absolute +// addresses (STARTUP_ADDR, STACK_BASE_ADDR, ...) land inside the xrt::bo +// allocation that XRT placed in that bank. Defaults to PLATFORM_MEMORY_OFFSET +// (single offset for all banks) for back-compatibility. 
+`ifndef PLATFORM_MEMORY_OFFSET_0 +`define PLATFORM_MEMORY_OFFSET_0 `PLATFORM_MEMORY_OFFSET +`endif +`ifndef PLATFORM_MEMORY_OFFSET_1 +`define PLATFORM_MEMORY_OFFSET_1 `PLATFORM_MEMORY_OFFSET +`endif +`ifndef PLATFORM_MEMORY_OFFSET_2 +`define PLATFORM_MEMORY_OFFSET_2 `PLATFORM_MEMORY_OFFSET +`endif +`ifndef PLATFORM_MEMORY_OFFSET_3 +`define PLATFORM_MEMORY_OFFSET_3 `PLATFORM_MEMORY_OFFSET +`endif + `ifndef PLATFORM_MEMORY_ID_WIDTH `define PLATFORM_MEMORY_ID_WIDTH 32 `endif diff --git a/hw/syn/xilinx/xrt/platforms.mk b/hw/syn/xilinx/xrt/platforms.mk index a841e1a046..f732a40ab1 100644 --- a/hw/syn/xilinx/xrt/platforms.mk +++ b/hw/syn/xilinx/xrt/platforms.mk @@ -30,8 +30,12 @@ else ifneq ($(findstring xilinx_u280,$(XSA)),) CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=32 -DPLATFORM_MEMORY_ADDR_WIDTH=33 VPP_FLAGS += --connectivity.sp vortex_afu_1.m_axi_mem_0:HBM[0:31] else ifneq ($(findstring xilinx_u250,$(XSA)),) - # 64 GB of DDR4 with 4 channels (16 GB per channel) - CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=4 -DPLATFORM_MEMORY_ADDR_WIDTH=36 + # 16 GB of DDR4 (single channel, bank 0). Multi-bank requires per-bank XRT + # VA offsets that aren't known at synthesis time without runtime plumbing; + # see follow-up PR for a DCR-based runtime path. PLATFORM_MEMORY_OFFSET_0 below is the address at which XRT was observed to place the bank-0 buffer on U250 (0x40_0000_0000). + CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=34 + VPP_FLAGS += --connectivity.sp vortex_afu_1.m_axi_mem_0:DDR[0] + CONFIGS += -DPLATFORM_MEMORY_OFFSET_0=40\'h4000000000 else ifneq ($(findstring xilinx_u200,$(XSA)),) # 64 GB of DDR4 with 4 channels (16 GB per channel) CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=4 -DPLATFORM_MEMORY_ADDR_WIDTH=36