From fb21be69156ba1a191b2e85838f1b00b56f74ff5 Mon Sep 17 00:00:00 2001 From: bowwang Date: Sat, 10 May 2025 11:43:19 +0200 Subject: [PATCH 01/34] [das] init open source --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index cca8165cf..deaa6b6f8 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,10 @@ [![ci](https://github.com/pulp-platform/mempool/actions/workflows/ci.yml/badge.svg)](https://github.com/pulp-platform/mempool/actions/workflows/ci.yml) [![lint](https://github.com/pulp-platform/mempool/actions/workflows/lint.yml/badge.svg)](https://github.com/pulp-platform/mempool/actions/workflows/lint.yml) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) +# MemPool Dynamic Allocation Scheme +The Dynamic Allocation Scheme (DAS) is a flexible, adaptable, runtime-configurable address-mapping technique. DAS remaps contiguous address spaces to physically adjacent memory banks based on the workload’s memory access patterns, placing the data physically close to the processing elements (PEs). + +This branch of the repository contains the DAS extensions built on top of MemPool. 
# MemPool From 1989369fcb97ddf4cc5283230412d4d5b10760c3 Mon Sep 17 00:00:00 2001 From: bowwang Date: Sun, 11 May 2025 13:31:00 +0200 Subject: [PATCH 02/34] [config] add terapool-das configurations --- config/terapool.mk | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/config/terapool.mk b/config/terapool.mk index 0f1c264f8..42a5d73a6 100644 --- a/config/terapool.mk +++ b/config/terapool.mk @@ -10,6 +10,7 @@ # Global Control terapool ?= 1 +flex_terapool ?= 1 # Number of cores num_cores ?= 1024 @@ -45,4 +46,8 @@ dmas_per_group ?= 4 # Brust Length = 16 # L2 Banks/Channels l2_banks = 16 -l2_size ?= 16777216 # 1000000 \ No newline at end of file +l2_size ?= 16777216 # 1000000 + +# TeraPool w/ DAS +# Default size, in bytes per core, of the memory impacted by DAS +heap_seq_mem_size ?= 2048 \ No newline at end of file From 2b5cc299de5e231b2ebe13c310233bf81f08b61e Mon Sep 17 00:00:00 2001 From: bowwang Date: Sun, 11 May 2025 13:40:24 +0200 Subject: [PATCH 03/34] [traffic generator] bandwidth benchmark in selected number of tiles --- hardware/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/hardware/Makefile b/hardware/Makefile index 1a78620c7..2bab80a79 100644 --- a/hardware/Makefile +++ b/hardware/Makefile @@ -151,6 +151,14 @@ ifdef tg cpp_defs += -DTG_SEQ_PROB=$(tg_seqprob) cpp_defs += -DTG_NCYCLES=$(tg_ncycles) cpp_defs += -DNUM_CORES=$(num_cores) + # DAS benchmark related + cpp_defs += -DPARTITION=$(partition) + cpp_defs += -DTG_PA=$(tg_pa) + cpp_defs += -DTG_PB=$(tg_pb) + cpp_defs += -DTG_PC=$(tg_pc) + cpp_defs += -DTG_PA_PROB=$(tg_pa_prob) + cpp_defs += -DTG_PB_PROB=$(tg_pb_prob) + cpp_defs += -DTG_PC_PROB=$(tg_pc_prob) # How many cycles should we execute? 
veril_flags := --term-after-cycles=$(tg_ncycles) From 320854a9eecbec96523e5aaa190054505f9586e9 Mon Sep 17 00:00:00 2001 From: bowwang Date: Sun, 11 May 2025 13:53:46 +0200 Subject: [PATCH 04/34] [hw] add DAS support in idma --- Bender.yml | 1 + hardware/deps/idma/Bender.yml | 4 + .../src/midends/idma_address_scrambler.sv | 182 +++++++ .../src/midends/idma_distributed_midend_v2.sv | 260 ++++++++++ .../idma/src/midends/idma_split_midend_v2.sv | 468 ++++++++++++++++++ 5 files changed, 915 insertions(+) create mode 100644 hardware/deps/idma/src/midends/idma_address_scrambler.sv create mode 100644 hardware/deps/idma/src/midends/idma_distributed_midend_v2.sv create mode 100644 hardware/deps/idma/src/midends/idma_split_midend_v2.sv diff --git a/Bender.yml b/Bender.yml index 8d6913a8e..959621e9a 100644 --- a/Bender.yml +++ b/Bender.yml @@ -44,6 +44,7 @@ sources: # Level 3 - hardware/src/mempool_group.sv # Level 4 + - hardware/src/idma_partition_midend.sv - hardware/src/mempool_cluster.sv # Level 5 - hardware/src/ctrl_registers.sv diff --git a/hardware/deps/idma/Bender.yml b/hardware/deps/idma/Bender.yml index 0064ee877..406e4969f 100644 --- a/hardware/deps/idma/Bender.yml +++ b/hardware/deps/idma/Bender.yml @@ -15,6 +15,7 @@ sources: # levels 1 and 0, etc. Files within a level are ordered alphabetically. 
# Level 0 - src/axi_dma_data_path.sv + - src/midends/idma_address_scrambler.sv # Level 1 - src/axi_dma_data_mover.sv - src/axi_dma_burst_reshaper.sv @@ -23,6 +24,9 @@ sources: # Level 3: MemPool - src/midends/idma_split_midend.sv - src/midends/idma_distributed_midend.sv + # If enabled DAS + - src/midends/idma_split_midend_v2.sv + - src/midends/idma_distributed_midend_v2.sv - src/frontends/mempool/mempool_dma_frontend_reg_pkg.sv - src/frontends/mempool/mempool_dma_frontend_reg_top.sv - src/frontends/mempool/mempool_dma.sv diff --git a/hardware/deps/idma/src/midends/idma_address_scrambler.sv b/hardware/deps/idma/src/midends/idma_address_scrambler.sv new file mode 100644 index 000000000..9c7473ce7 --- /dev/null +++ b/hardware/deps/idma/src/midends/idma_address_scrambler.sv @@ -0,0 +1,182 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Description: Address scrambler for iDMA Midend, scramble scheme is determined +// by group_factor +// Current constraints: + +// Author: Bowen Wang + +module idma_address_scrambler #( + parameter int unsigned AddrWidth = 32, + parameter int unsigned DataWidth = 32, + parameter int unsigned ByteOffset = 2, + parameter int unsigned NumTiles = 128, + parameter int unsigned NumBanksPerTile = 32, + parameter bit Bypass = 0, + parameter int unsigned SeqMemSizePerTile = 4*1024, + parameter int unsigned HeapSeqMemSizePerTile = 8*2048, + parameter int unsigned MemSizePerTile = 8*4*1024, + parameter int unsigned MemSizePerRow = 4*4*1024, // 4bytes * 4096 banks + parameter int unsigned TCDMSize = 1024*1024 +) ( + input logic [AddrWidth-1:0] address_i, + input logic [31:0] num_bytes_i, + input logic [3:0][7:0] group_factor_i, + // For each allocation, the maximum number of rows assigned can be 128 rows + input logic [3:0][7:0] allocated_size_i, + input logic [3:0][DataWidth-1:0] start_addr_scheme_i, + output logic [7:0] 
group_factor_o, + output logic [7:0] allocated_size_o, + output logic [AddrWidth-1:0] address_o +); + // Basic Settings + localparam int unsigned BankOffsetBits = $clog2(NumBanksPerTile); + localparam int unsigned TileIdBits = $clog2(NumTiles); + localparam int unsigned ConstantBitsLSB = ByteOffset + BankOffsetBits; + + // Heap Sequential Settings + localparam int unsigned HeapSeqPerTileBits = $clog2(MemSizePerTile); // log2(8*4096) = 15 | RowIndexBits + ConstBits + localparam int unsigned HeapSeqTotalBits = HeapSeqPerTileBits+TileIdBits; // 15+7=22 | used for address_o assignment + localparam int unsigned RowIndexBits = HeapSeqPerTileBits-ConstantBitsLSB; // 15-7=8 | RowIndex + + if (Bypass || NumTiles < 2) begin + assign address_o = address_i; + end else begin + // ------ Heap Sequential Signals ------ // + // `shift_index` : how many bits to shift for TileID bits in each partition + // `shift_index_sc`: how many bits need to swap within Row Index + logic [3:0][2:0] shift_index; + logic [3:0][2:0] shift_index_sc; + for (genvar i = 0; i < 4; i++) begin : gen_shift_index + always_comb begin + case(group_factor_i[i]) + 128: shift_index[i] = 7; + 64: shift_index[i] = 6; + 32: shift_index[i] = 5; + 16: shift_index[i] = 4; + 8: shift_index[i] = 3; + 4: shift_index[i] = 2; + 2: shift_index[i] = 1; + default: shift_index[i] = 0; + endcase + + case(allocated_size_i[i]) + 128: shift_index_sc[i] = 7; + 64: shift_index_sc[i] = 6; + 32: shift_index_sc[i] = 5; + 16: shift_index_sc[i] = 4; + 8: shift_index_sc[i] = 3; + 4: shift_index_sc[i] = 2; + 2: shift_index_sc[i] = 1; + default: shift_index_sc[i] = 0; + endcase + end + end + + // post-scramble row index + logic [RowIndexBits-1:0] post_scramble_row_index; + logic [TileIdBits-1:0] post_scramble_tile_id; + + logic [3:0][RowIndexBits-1:0] mask_row_index, mask_row_index_n; + logic [3:0][TileIdBits-1:0] mask_tile_id, mask_tile_id_n; + + logic [TileIdBits-1:0] heap_tile_id; + + for (genvar j = 0; j < 4; j++) begin : gen_mask + 
assign mask_row_index[j] = (shift_index_sc[j] == 0) ? {RowIndexBits{1'b0}} : ({RowIndexBits{1'b1}} >> (RowIndexBits-shift_index_sc[j])); + assign mask_tile_id[j] = (shift_index[j] == 0) ? {TileIdBits{1'b0}} : ({TileIdBits{1'b1}} >> (TileIdBits -shift_index[j])); + + assign mask_row_index_n[j] = ~mask_row_index[j]; + assign mask_tile_id_n[j] = ~mask_tile_id[j]; + end + + assign heap_tile_id = address_i[(TileIdBits+ConstantBitsLSB-1):ConstantBitsLSB]; + + always_comb begin + // Default: Unscrambled + address_o = address_i; + group_factor_o = '0; + allocated_size_o = '0; + post_scramble_row_index = 'b0; + post_scramble_tile_id = 'b0; + // Need one more logic for interleaved heap region + // group_factor_o = {7{1'b1}}; + // Sequential Heap Region + // TODO (bowwang): add a new register to indicate the start addr of sequential heap region, currently hard coded + // if (address_i < start_addr_scheme_i[0]) begin + if (address_i < 32'h00120000) begin + group_factor_o = 128; // fully interleaved + allocated_size_o = num_bytes_i/(4*4096); + end else if ( (address_i >= start_addr_scheme_i[0]) && (address_i < start_addr_scheme_i[0]+MemSizePerRow*allocated_size_i[0]) ) begin + + post_scramble_row_index = 'b0; + post_scramble_tile_id = 'b0; + // 1. `post_scramble_row_index` generation + post_scramble_row_index |= (address_i >> (ConstantBitsLSB + shift_index[0])) & mask_row_index[0]; + post_scramble_row_index |= (address_i >> (ConstantBitsLSB + TileIdBits )) & mask_row_index_n[0]; + + // 2. 
`post_scramble_tile_id` generation + post_scramble_tile_id |= heap_tile_id & mask_tile_id[0]; + post_scramble_tile_id |= (address_i >> (ConstantBitsLSB + shift_index_sc[0])) & mask_tile_id_n[0]; + + address_o[HeapSeqTotalBits-1:ConstantBitsLSB] = {post_scramble_row_index, post_scramble_tile_id}; + group_factor_o = group_factor_i[0]; + allocated_size_o = allocated_size_i[0]; + end else if ( (address_i >= start_addr_scheme_i[1]) && (address_i < start_addr_scheme_i[1]+MemSizePerRow*allocated_size_i[1]) ) begin + + post_scramble_row_index = 'b0; + post_scramble_tile_id = 'b0; + // 1. `post_scramble_row_index` generation + post_scramble_row_index |= (address_i >> (ConstantBitsLSB + shift_index[1])) & mask_row_index[1]; + post_scramble_row_index |= (address_i >> (ConstantBitsLSB + TileIdBits )) & mask_row_index_n[1]; + + // 2. `post_scramble_tile_id` generation + post_scramble_tile_id |= heap_tile_id & mask_tile_id[1]; + post_scramble_tile_id |= (address_i >> (ConstantBitsLSB + shift_index_sc[1])) & mask_tile_id_n[1]; + + address_o[HeapSeqTotalBits-1:ConstantBitsLSB] = {post_scramble_row_index, post_scramble_tile_id}; + group_factor_o = group_factor_i[1]; + allocated_size_o = allocated_size_i[1]; + end else if ( (address_i >= start_addr_scheme_i[2]) && (address_i < start_addr_scheme_i[2]+MemSizePerRow*allocated_size_i[2]) ) begin + + post_scramble_row_index = 'b0; + post_scramble_tile_id = 'b0; + // 1. `post_scramble_row_index` generation + post_scramble_row_index |= (address_i >> (ConstantBitsLSB + shift_index[2])) & mask_row_index[2]; + post_scramble_row_index |= (address_i >> (ConstantBitsLSB + TileIdBits )) & mask_row_index_n[2]; + + // 2. 
`post_scramble_tile_id` generation + post_scramble_tile_id |= heap_tile_id & mask_tile_id[2]; + post_scramble_tile_id |= (address_i >> (ConstantBitsLSB + shift_index_sc[2])) & mask_tile_id_n[2]; + + address_o[HeapSeqTotalBits-1:ConstantBitsLSB] = {post_scramble_row_index, post_scramble_tile_id}; + group_factor_o = group_factor_i[2]; + allocated_size_o = allocated_size_i[2]; + end else if ( (address_i >= start_addr_scheme_i[3]) && (address_i < start_addr_scheme_i[3]+MemSizePerRow*allocated_size_i[3]) ) begin + + post_scramble_row_index = 'b0; + post_scramble_tile_id = 'b0; + // 1. `post_scramble_row_index` generation + post_scramble_row_index |= (address_i >> (ConstantBitsLSB + shift_index[3])) & mask_row_index[3]; + post_scramble_row_index |= (address_i >> (ConstantBitsLSB + TileIdBits )) & mask_row_index_n[3]; + + // 2. `post_scramble_tile_id` generation + post_scramble_tile_id |= heap_tile_id & mask_tile_id[3]; + post_scramble_tile_id |= (address_i >> (ConstantBitsLSB + shift_index_sc[3])) & mask_tile_id_n[3]; + + address_o[HeapSeqTotalBits-1:ConstantBitsLSB] = {post_scramble_row_index, post_scramble_tile_id}; + group_factor_o = group_factor_i[3]; + allocated_size_o = allocated_size_i[3]; + end + end + + end + + // Check for unsupported configurations + if (NumBanksPerTile < 2) + $fatal(1, "NumBanksPerTile must be greater than 2. The special case '1' is currently not supported!"); + if (HeapSeqMemSizePerTile % (2**ByteOffset*NumBanksPerTile) != 0) + $fatal(1, "HeapSeqMemSizePerTile must be a multiple of BankWidth*NumBanksPerTile!"); +endmodule : idma_address_scrambler diff --git a/hardware/deps/idma/src/midends/idma_distributed_midend_v2.sv b/hardware/deps/idma/src/midends/idma_distributed_midend_v2.sv new file mode 100644 index 000000000..902c0257a --- /dev/null +++ b/hardware/deps/idma/src/midends/idma_distributed_midend_v2.sv @@ -0,0 +1,260 @@ +// Copyright 2022 ETH Zurich and University of Bologna. 
+// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Samuel Riedel +// Bowen Wang + +// Mode selection: [1:0] dma_mode_i +// 2'b00: safe mode, no modificaion will be added to the transfer +// 2'b01: fast mode, only apply to L1-aligned address +// 2'b10: dupl mode, only apply to partition-aligned address +// 2'b11: NOP + +`include "common_cells/registers.svh" + +module idma_distributed_midend_v2 #( + /// Number of backends to distribute the requests to + parameter int unsigned NoMstPorts = 1, + /// Bytes covered by each port + parameter int unsigned DmaRegionWidth = 1, // [B] Region that one port covers in bytes + /// Start of the distributed memory region + parameter int unsigned DmaRegionStart = 32'h0000_0000, + /// End of the distributed memory region + parameter int unsigned DmaRegionEnd = 32'h1000_0000, + /// Number of generic 1D requests that can be buffered + parameter int unsigned TransFifoDepth = 1, + /// Arbitrary 1D burst request definition + parameter type burst_req_t = logic, + /// Meta data response definition + parameter type meta_t = logic +) ( + input logic clk_i, + input logic rst_ni, + // Slave + input burst_req_t burst_req_i, + input logic valid_i, + output logic ready_o, + output meta_t meta_o, + // partition related signals + input logic [1:0] dma_mode_i, + input logic [7:0] allocated_size_i, + // Master + output burst_req_t [NoMstPorts-1:0] burst_req_o, + output logic [NoMstPorts-1:0] valid_o, + input logic [NoMstPorts-1:0] ready_i, + input meta_t [NoMstPorts-1:0] meta_i +); + + localparam DmaRegionAddressBits = $clog2(DmaRegionWidth); + localparam FullRegionAddressBits = $clog2(DmaRegionWidth*NoMstPorts); + localparam FullDmaRegionWidth = DmaRegionWidth*NoMstPorts; + + typedef logic [FullRegionAddressBits:0] full_addr_t; + + // Handle the ready signal + logic fork_ready, fifo_ready; + logic [NoMstPorts-1:0] fifo_full; + // Handle Metadata + logic [NoMstPorts-1:0] trans_complete_d, 
trans_complete_q; + logic [NoMstPorts-1:0] tie_off_trans_complete_d, tie_off_trans_complete_q; + logic [NoMstPorts-1:0] backend_idle_d, backend_idle_q; + assign meta_o.trans_complete = &trans_complete_q; + assign meta_o.backend_idle = &backend_idle_q; + assign fifo_ready = !(|fifo_full); + assign ready_o = fork_ready && fifo_ready; + + for (genvar i = 0; unsigned'(i) < NoMstPorts; i++) begin: gen_trans_complete_fifo + // Collect the `trans_complete` signals and reduce them once we have all of them + logic empty; + logic data; + logic conflict_push; + fifo_v3 #( + .FALL_THROUGH (0 ), + .DATA_WIDTH (1 ), + .DEPTH (TransFifoDepth) + ) i_trans_complete_fifo ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .flush_i ('0 ), + .testmode_i ('0 ), + .full_o (fifo_full[i] ), + .empty_o (empty ), + .usage_o (/*unused*/ ), + .data_i (1'b1 ), + .push_i ( (trans_complete_d[i] | conflict_push) & (fifo_full[i]==0) ), + .data_o (data ), + .pop_i (meta_o.trans_complete) + ); + assign trans_complete_d[i] = meta_i[i].trans_complete | tie_off_trans_complete_q[i]; + assign trans_complete_q[i] = data && !empty; + + // handle two complete signals arrive at the same time + logic [3:0] conflict_complete_d, conflict_complete_q; + `FF(conflict_complete_q, conflict_complete_d, '0, clk_i, rst_ni) + + always_comb begin + conflict_complete_d = conflict_complete_q; + conflict_push = 0; + if (meta_i[i].trans_complete & tie_off_trans_complete_q[i] & (fifo_full[i]==0)) begin // FIFO is not full + conflict_complete_d = conflict_complete_q+1; + end + if (meta_i[i].trans_complete & tie_off_trans_complete_q[i] & (fifo_full[i]!=0)) begin // FIFO is full + conflict_complete_d = conflict_complete_q+2; + end + if (!meta_i[i].trans_complete & tie_off_trans_complete_q[i] & (fifo_full[i]!=0)) begin + conflict_complete_d = conflict_complete_q+1; + end + if (meta_i[i].trans_complete & !tie_off_trans_complete_q[i] & (fifo_full[i]!=0)) begin + conflict_complete_d = conflict_complete_q+1; + end + + if ( 
(conflict_complete_q!=0) & (trans_complete_d[i]==0) & (fifo_full[i]==0) ) begin // FIFO is not full, safe to push + conflict_push = 1; + conflict_complete_d = conflict_complete_q-1; + end + end + + end + + always_comb begin + backend_idle_d = backend_idle_q; + for (int unsigned i = 0; i < NoMstPorts; i++) begin + backend_idle_d[i] = meta_i[i].backend_idle; + end + end + `FF(tie_off_trans_complete_q, tie_off_trans_complete_d, '0, clk_i, rst_ni) + `FF(backend_idle_q, backend_idle_d, '1, clk_i, rst_ni) + + // Fork + logic [NoMstPorts-1:0] valid, ready; + stream_fork #( + .N_OUP (NoMstPorts) + ) i_stream_fork ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .valid_i (valid_i & fifo_ready), + .ready_o (fork_ready ), + .valid_o (valid ), + .ready_i (ready ) + ); + + full_addr_t src_addr, dst_addr, start_addr, end_addr; + + assign src_addr = burst_req_i.src[FullRegionAddressBits-1:0]; + assign dst_addr = burst_req_i.dst[FullRegionAddressBits-1:0]; + + logic [1:0] num_split, split_offset; + // logic num_split, split_offset; + + always_comb begin + num_split = burst_req_i.num_bytes / DmaRegionWidth; + if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin + start_addr = src_addr; + end else begin + start_addr = dst_addr; + end + end_addr = start_addr+burst_req_i.num_bytes; + split_offset = start_addr[DmaRegionAddressBits+1:DmaRegionAddressBits]; + // split_offset = start_addr[DmaRegionAddressBits]; + // Connect valid ready by default + valid_o = valid; + ready = ready_i; + // Do not interfere with metadata per default + tie_off_trans_complete_d = '0; + for (int i = 0; i < NoMstPorts; i++) begin + tie_off_trans_complete_d[i] = tie_off_trans_complete_q[i] && meta_i[i].trans_complete; + // Feed metadata through directly + burst_req_o[i] = burst_req_i; + // Feed through the address bits + burst_req_o[i].src = burst_req_i.src; + burst_req_o[i].dst = burst_req_i.dst; + // Modify lower addresses bits and size + if 
(($unsigned(start_addr) >= (i+1)*DmaRegionWidth) || ($unsigned(end_addr) <= i*DmaRegionWidth)) begin + // We are not involved in the transfer + if ( (dma_mode_i == 2'b00) || (dma_mode_i == 2'b11) ) begin // safe mode + burst_req_o[i].src = '0; + burst_req_o[i].dst = '0; + burst_req_o[i].num_bytes = 1; + // Make handshake ourselves + valid_o[i] = 1'b0; + ready[i] = 1'b1; + // Inject trans complete + if (valid[i]) begin + tie_off_trans_complete_d[i] = 1'b1; + end + end else if (dma_mode_i == 2'b01) begin // fast mode + burst_req_o[i].num_bytes = (burst_req_i.num_bytes= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin + burst_req_o[i].src = burst_req_i.src+i*DmaRegionWidth; + burst_req_o[i].dst = burst_req_i.dst+i*allocated_size_i*DmaRegionWidth; + end else begin + // L2 --> L1 + if (burst_req_i.num_bytes<=DmaRegionWidth )begin + burst_req_o[i].src = burst_req_i.src+i*allocated_size_i*DmaRegionWidth; + end else if (i==2) begin + burst_req_o[i].src = burst_req_i.src+i*allocated_size_i*DmaRegionWidth; + end else if (i==3) begin + burst_req_o[i].src = burst_req_i.src+(i-1)*allocated_size_i*DmaRegionWidth + DmaRegionWidth; + end + burst_req_o[i].dst = burst_req_i.dst+i*DmaRegionWidth; + end + end else if (dma_mode_i == 2'b10) begin // duplication mode: only consider L2 --> L1 + if (($unsigned(burst_req_i.dst) >= DmaRegionStart) && ($unsigned(burst_req_i.dst) < DmaRegionEnd)) begin + // L2 ------> L1 + burst_req_o[i].num_bytes = (burst_req_i.num_bytes= i*DmaRegionWidth)) begin + // First (and potentially only) slice + // Leave address as is + if ($unsigned(end_addr) <= (i+1)*DmaRegionWidth) begin + burst_req_o[i].num_bytes = burst_req_i.num_bytes; + end else begin + burst_req_o[i].num_bytes = DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; + end + // end else if (($unsigned(start_addr) < i*DmaRegionWidth)) begin + + end else begin + // Round up the address to the next DMA boundary + if (($unsigned(burst_req_i.src) >= DmaRegionStart) && 
($unsigned(burst_req_i.src) < DmaRegionEnd)) begin + burst_req_o[i].src[FullRegionAddressBits-1:0] = i*DmaRegionWidth; + burst_req_o[i].dst = burst_req_i.dst+i*DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; + end else begin + burst_req_o[i].src = burst_req_i.src+(i-split_offset)*DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; + burst_req_o[i].dst[FullRegionAddressBits-1:0] = i*DmaRegionWidth; + end + if ($unsigned(end_addr) >= (i+1)*DmaRegionWidth) begin + // Middle slice + // Emit a full-sized transfer + burst_req_o[i].num_bytes = DmaRegionWidth; + end else begin + // Last slice + burst_req_o[i].num_bytes = end_addr[DmaRegionAddressBits-1:0]; + end + end + end + end + + // pragma translate_off + int f; + always_ff @(posedge clk_i or negedge rst_ni) begin + automatic string str; + if (rst_ni && valid_i && ready_o) begin + str = "\n[idma_distributed_midend_v2] Got request\n"; + str = $sformatf("%sRequest in: From: 0x%8x To: 0x%8x with size %d\n", str, burst_req_i.src, burst_req_i.dst, burst_req_i.num_bytes); + for (int i = 0; i < NoMstPorts; i++) begin + str = $sformatf("%sOut %6d: From: 0x%8x To: 0x%8x with size %d\n", str, i, burst_req_o[i].src, burst_req_o[i].dst, burst_req_o[i].num_bytes); + end + f = $fopen("dma.log", "a"); + $fwrite(f, str); + $fclose(f); + end + end + // pragma translate_on + +endmodule diff --git a/hardware/deps/idma/src/midends/idma_split_midend_v2.sv b/hardware/deps/idma/src/midends/idma_split_midend_v2.sv new file mode 100644 index 000000000..3dc04b088 --- /dev/null +++ b/hardware/deps/idma/src/midends/idma_split_midend_v2.sv @@ -0,0 +1,468 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Samuel Riedel +// Bowen Wang + + +// The Split Midend (v2) slice one burst request aligned to partition boundary, instead of +// L1 SPM boundary as in v1. 
+ +`include "common_cells/registers.svh" + +module idma_split_midend_v2 #( + parameter int unsigned DmaRegionWidth = 1, // [B] Region that one port covers in bytes + parameter int unsigned DmaRegionStart = 32'h0000_0000, + parameter int unsigned DmaRegionEnd = 32'h1000_0000, + parameter int unsigned AddrWidth = 32, + parameter type burst_req_t = logic, + parameter type meta_t = logic +) ( + input logic clk_i, + input logic rst_ni, + // Slave + input burst_req_t burst_req_i, + input logic valid_i, + output logic ready_o, + output meta_t meta_o, + + // Partition related signals + input logic [1:0] dma_mode_i, + input logic [3:0][7:0] group_factor_i, + input logic [3:0][7:0] allocated_size_i, + input logic [3:0][AddrWidth-1:0] start_addr_scheme_i, + output logic [7:0] allocated_size_o, + + // Master + output burst_req_t burst_req_o, + output logic valid_o, + input logic ready_i, + input meta_t meta_i +); + + // ------ Parameter Settings ------ // + typedef logic [AddrWidth-1:0] addr_t; + localparam DmaRegionAddressBits = $clog2(DmaRegionWidth); + localparam TileDmaRegionWidth = DmaRegionWidth / 128; + logic [AddrWidth-1:0] PartitionDmaRegionWidth; + + localparam DmaBackendWidth = 32*8*4; // 32banks*8Tiles*4bytes + + // ------ Address translation ------ // + // Only the address in L1 SPM will be scrambled + logic [AddrWidth-1:0] post_scramble_src; + logic [AddrWidth-1:0] post_scramble_dst; + logic [7:0] group_factor_src, group_factor_dst, group_factor_sel; + logic [7:0] allocated_size_src, allocated_size_dst, allocated_size_sel; + + assign group_factor_sel = group_factor_src | group_factor_dst; + assign allocated_size_sel = allocated_size_src | allocated_size_dst; + assign PartitionDmaRegionWidth = TileDmaRegionWidth * group_factor_sel; + + idma_address_scrambler i_idma_address_scrambler_src ( + .address_i (burst_req_i.src), + .num_bytes_i (burst_req_i.num_bytes), + .group_factor_i (group_factor_i), + .allocated_size_i (allocated_size_i), + 
.start_addr_scheme_i(start_addr_scheme_i), + .group_factor_o (group_factor_src), + .allocated_size_o (allocated_size_src), + .address_o (post_scramble_src) + ); + + idma_address_scrambler i_idma_address_scrambler_dst ( + .address_i (burst_req_i.dst), + .num_bytes_i (burst_req_i.num_bytes), + .group_factor_i (group_factor_i), + .allocated_size_i (allocated_size_i), + .start_addr_scheme_i(start_addr_scheme_i), + .group_factor_o (group_factor_dst), + .allocated_size_o (allocated_size_dst), + .address_o (post_scramble_dst) + ); + + // ------ Filter out address in L1 SPM ------ // + addr_t start_addr, end_addr; + logic spm2dram; + + always_comb begin + spm2dram = 0; + if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin + start_addr = post_scramble_src; + spm2dram = 1; + end else begin + start_addr = post_scramble_dst; + spm2dram = 0; + end + // not used + end_addr = start_addr + burst_req_i.num_bytes; + end + + // ------ Considering Partition Scheme ------ // + logic [2:0] shift_index; + logic [AddrWidth-1:0] partition_mask; + addr_t masked_start_addr; + + always_comb begin + case(group_factor_sel) + 128: shift_index = 0; + 64: shift_index = 1; + 32: shift_index = 2; + 16: shift_index = 3; + 8: shift_index = 4; + 4: shift_index = 5; + 2: shift_index = 6; + default: shift_index = 7; + endcase + end + + assign partition_mask = {DmaRegionAddressBits{1'b1}} >> shift_index; + assign masked_start_addr = start_addr & partition_mask; + + // ------ Handle Metadata ------ // + // Forward idle signal and count the trans_comlete signal + logic req_valid; + logic [31:0] num_trans_d, num_trans_q; + + assign meta_o.backend_idle = meta_i.backend_idle; + always_comb begin + num_trans_d = num_trans_q; + meta_o.trans_complete = 1'b0; + + if (req_valid) begin + num_trans_d += 1; + end + if (meta_i.trans_complete) begin + num_trans_d -= 1; + end + if (num_trans_q == 1 && num_trans_d == 0) begin + meta_o.trans_complete = 1'b1; + end + end 
+ `FF(num_trans_q, num_trans_d, '0, clk_i, rst_ni) + + // ------ Beat Counter and Shifter Handler ------ // + logic [7:0] beat_cnt_d, beat_cnt_q; + `FFARN(beat_cnt_q, beat_cnt_d, '0, clk_i, rst_ni) + + logic [7:0] shift_row, shift_partition; + logic [2:0] shift_index_sc; + logic [7:0] mask_shift_row; + + always_comb begin + case(allocated_size_sel) + 128: shift_index_sc = 7; + 64: shift_index_sc = 6; + 32: shift_index_sc = 5; + 16: shift_index_sc = 4; + 8: shift_index_sc = 3; + 4: shift_index_sc = 2; + 2: shift_index_sc = 1; + default: shift_index_sc = 0; + endcase + end + + assign shift_partition = beat_cnt_q >> shift_index_sc; + assign mask_shift_row = ~( {8{1'b1}}<= burst_req_i.num_bytes) begin + burst_req_o = burst_req_i; + valid_o = 1'b1; + ready_o = ready_i; + req_valid = ready_i; + end else begin + // Store and acknowledge + req_d = burst_req_i; + ready_o = 1'b1; + burst_req_o = burst_req_i; + // Calculate the size for the 1st burst + burst_req_o.num_bytes = DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; + valid_o = 1'b1; + + // Modify the stored info after first beat sent + if (ready_i) begin + req_d.num_bytes -= DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; + req_d.src += DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; + req_d.dst += DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; + req_valid = 1'b1; + end + state_d = Busy; + end + end + + // ------ Fast Mode ------ // + 2'b01: begin + if ( (PartitionDmaRegionWidth-masked_start_addr) >= burst_req_i.num_bytes ) begin + burst_req_o = burst_req_i; + // Address in SPM need to be translated back to physical address + if (spm2dram) begin + burst_req_o.src = post_scramble_src; + end else begin + burst_req_o.dst = post_scramble_dst; + end + valid_o = 1'b1; + ready_o = ready_i; + req_valid = ready_i; + end else begin + // Store and acknowledge + req_d = burst_req_i; + ready_o = 1'b1; + burst_req_o = burst_req_i; + // Calculate the size for the 1st burst + burst_req_o.num_bytes = 
PartitionDmaRegionWidth-masked_start_addr; + // TODO (bowwang): parameterize + req_d.num_bytes = (group_factor_sel <= 8) ? (allocated_size_sel*DmaBackendWidth) : (allocated_size_sel*PartitionDmaRegionWidth); + if (spm2dram) begin + burst_req_o.src = post_scramble_src; + req_d.src = post_scramble_src; + end else begin + burst_req_o.dst = post_scramble_dst; + req_d.dst = post_scramble_dst; + end + valid_o = 1'b1; + + // Modify the stored info after first beat sent + if (ready_i) begin + // TODO (bowwang): May not be mecessary to consider alignment + req_d.num_bytes -= PartitionDmaRegionWidth-masked_start_addr; + if (spm2dram) begin + req_d.src += DmaRegionWidth-masked_start_addr; + req_d.dst += PartitionDmaRegionWidth-masked_start_addr; + end else begin + req_d.src += PartitionDmaRegionWidth-masked_start_addr; + req_d.dst += DmaRegionWidth-masked_start_addr; + end + req_valid = 1'b1; + beat_cnt_d = 1; + end + state_d = Busy; + end + end + + // ------ Duplicate Mode ------ // + 2'b10: begin + if ( (PartitionDmaRegionWidth-masked_start_addr) >= burst_req_i.num_bytes ) begin + burst_req_o = burst_req_i; + // Address in SPM need to be translated back to physical address + if (spm2dram) begin + burst_req_o.src = post_scramble_src; + end else begin + burst_req_o.dst = post_scramble_dst; + end + valid_o = 1'b1; + ready_o = ready_i; + req_valid = ready_i; + end else begin + // Store and acknowledge + req_d = burst_req_i; + ready_o = 1'b1; + burst_req_o = burst_req_i; + // Calculate the size for the 1st burst + burst_req_o.num_bytes = PartitionDmaRegionWidth-masked_start_addr; + // TODO (bowwang): parameterize + req_d.num_bytes = (group_factor_sel <= 8) ? 
(allocated_size_sel*DmaBackendWidth) : (allocated_size_sel*PartitionDmaRegionWidth); + dup_start_addr_d = burst_req_i.src; + if (spm2dram) begin + burst_req_o.src = post_scramble_src; + req_d.src = post_scramble_src; + end else begin + burst_req_o.dst = post_scramble_dst; + req_d.dst = post_scramble_dst; + end + valid_o = 1'b1; + + // Modify the stored info after first beat sent + if (ready_i) begin + // TODO (bowwang): May not be mecessary to consider alignment + req_d.num_bytes -= PartitionDmaRegionWidth-masked_start_addr; + if (spm2dram) begin + req_d.src += DmaRegionWidth-masked_start_addr; + req_d.dst += PartitionDmaRegionWidth-masked_start_addr; + end else begin + req_d.src += PartitionDmaRegionWidth-masked_start_addr; + req_d.dst += DmaRegionWidth-masked_start_addr; + end + req_valid = 1'b1; + beat_cnt_d = 1; + end + state_d = Busy; + end + end + + 2'b11: begin // Partition_Std Mode + if ( (PartitionDmaRegionWidth-masked_start_addr) >= burst_req_i.num_bytes ) begin + burst_req_o = burst_req_i; + // Address in SPM need to be translated back to physical address + if (spm2dram) begin + burst_req_o.src = post_scramble_src; + end else begin + burst_req_o.dst = post_scramble_dst; + end + valid_o = 1'b1; + ready_o = ready_i; + req_valid = ready_i; + end else begin + // Store and acknowledge + req_d = burst_req_i; + ready_o = 1'b1; + burst_req_o = burst_req_i; + // Calculate the size for the 1st burst + burst_req_o.num_bytes = PartitionDmaRegionWidth-masked_start_addr; + if (spm2dram) begin + burst_req_o.src = post_scramble_src; + req_d.src = post_scramble_src; + end else begin + burst_req_o.dst = post_scramble_dst; + req_d.dst = post_scramble_dst; + end + valid_o = 1'b1; + + // Modify the stored info after first beat sent + if (ready_i) begin + // TODO (bowwang): May not be mecessary to consider alignment + req_d.num_bytes -= PartitionDmaRegionWidth-masked_start_addr; + if (spm2dram) begin + req_d.src += DmaRegionWidth-masked_start_addr; + req_d.dst += 
PartitionDmaRegionWidth-masked_start_addr; + end else begin + req_d.src += PartitionDmaRegionWidth-masked_start_addr; + req_d.dst += DmaRegionWidth-masked_start_addr; + end + req_valid = 1'b1; + beat_cnt_d = 1; + end + state_d = Busy; + end + end + + default: /*do nothing*/; + endcase + end + end // Idle + + Busy: begin + // Sent next burst from split. + burst_req_o = req_q; + valid_o = 1'b1; + req_valid = ready_i; + + unique case (dma_mode_i) + // ------ Std Mode ------ // + 2'b00: begin + if ($unsigned(req_q.num_bytes) <= $unsigned(DmaRegionWidth)) begin + if (ready_i) begin + state_d = Idle; + end + end else begin + burst_req_o.num_bytes = DmaRegionWidth; + if (ready_i) begin + req_d.num_bytes = req_q.num_bytes - DmaRegionWidth; + req_d.src = req_q.src + DmaRegionWidth; + req_d.dst = req_q.dst + DmaRegionWidth; + end + end + end + + 2'b01, + 2'b10, + 2'b11: begin + if ($unsigned(req_q.num_bytes) <= $unsigned(PartitionDmaRegionWidth)) begin + // Last split + if (ready_i) begin + state_d = Idle; + beat_cnt_d = beat_cnt_q + 1; + end + end else begin + burst_req_o.num_bytes = PartitionDmaRegionWidth; + if (ready_i) begin + req_d.num_bytes = req_q.num_bytes - PartitionDmaRegionWidth; + beat_cnt_d = beat_cnt_q + 1; + + if (spm2dram) begin + if (shift_row == allocated_size_sel-1) begin + req_d.src = req_q.src + PartitionDmaRegionWidth - shift_row*DmaRegionWidth; + end else begin + req_d.src = req_q.src + DmaRegionWidth; + end + req_d.dst = req_q.dst + PartitionDmaRegionWidth; + end else begin + req_d.src = req_q.src + PartitionDmaRegionWidth; + if (shift_row == allocated_size_sel-1) begin + req_d.dst = req_q.dst + PartitionDmaRegionWidth - shift_row*DmaRegionWidth; + if (dma_mode_i == 2'b10) begin // duplication mode: recover start adddr + req_d.src = dup_start_addr_q; + end + end else begin + req_d.dst = req_q.dst + DmaRegionWidth; + end + end// spm2dram + end // ready_i + end + end // case {01, 10, 11} + + default: /*do nothing*/; + endcase + + end // Busy + default: 
/*do nothing*/; + endcase + end + + // pragma translate_off + int f; + always_ff @(posedge clk_i or negedge rst_ni) begin + automatic string str; + if (rst_ni && valid_i && ready_o) begin + str = "\n\n[idma_split_midend_v2] Got request\n"; + str = $sformatf("%sSplit: Request in: From: 0x%8x To: 0x%8x with size %d\n", str, burst_req_i.src, burst_req_i.dst, burst_req_i.num_bytes); + f = $fopen("dma.log", "a"); + $fwrite(f, str); + $fclose(f); + end + if (rst_ni && valid_o && ready_i) begin + str = $sformatf("Split: Out %6d: From: 0x%8x To: 0x%8x with size %d, start_addr 0x%8x.\n", num_trans_q, burst_req_o.src, burst_req_o.dst, burst_req_o.num_bytes, start_addr); + f = $fopen("dma.log", "a"); + $fwrite(f, str); + $fclose(f); + end + end + // pragma translate_on + +endmodule From afad72f4c1e3d028882db274a1d9ff9ab22cd88a Mon Sep 17 00:00:00 2001 From: bowwang Date: Sun, 11 May 2025 14:18:12 +0200 Subject: [PATCH 05/34] [hw] add DAS control logic to idma --- hardware/src/idma_partition_midend.sv | 333 ++++++++++++++++++++++++++ 1 file changed, 333 insertions(+) create mode 100644 hardware/src/idma_partition_midend.sv diff --git a/hardware/src/idma_partition_midend.sv b/hardware/src/idma_partition_midend.sv new file mode 100644 index 000000000..8f1332674 --- /dev/null +++ b/hardware/src/idma_partition_midend.sv @@ -0,0 +1,333 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. 
+// SPDX-License-Identifier: SHL-0.51 + +// Bowen Wang +// This module split one burst request to several according to partition scheme selected +// This module is inserted between `idma_split_midend` and `idma_distribute_midend` in Terapool Cluster + +`include "common_cells/registers.svh" + +module idma_partition_midend + import mempool_pkg::SeqMemSizePerTile; + import mempool_pkg::HeapSeqMemSizePerTile; + import mempool_pkg::TCDMSize; + #( + parameter int unsigned DmaRegionWidth = 1, // [B] Region that one port covers in bytes + parameter int unsigned DmaRegionStart = 32'h0000_0000, + parameter int unsigned DmaRegionEnd = 32'h1000_0000, + parameter int unsigned AddrWidth = 32, + parameter type burst_req_t = logic, + parameter type meta_t = logic +) ( + input logic clk_i, + input logic rst_ni, + // Slave + input burst_req_t burst_req_i, + input logic [7:0] beat_cnt_i, + input logic valid_i, + output logic ready_o, + output meta_t meta_o, + // Partition + input logic [7:0] group_factor_i, + input logic [7:0] allocated_size_i, + output logic partition_req_valid_o, + input logic part_beat_cnt_rst_i, + // Master + output burst_req_t burst_req_o, + output logic valid_o, + input logic ready_i, + input meta_t meta_i +); + + // DmaRegionWidth covered by each Tile in [bytes] + // DmaRegionWidth = #banks*4 = 4096*4 [bytes] + // TileDmaRegionWidth = 32*4 [bytes] + typedef logic [AddrWidth-1:0] addr_t; + // log2(4096*4)= 14 = TileIdBits + ConstBits = 7 + 7 + localparam DmaRegionAddressBits = $clog2(DmaRegionWidth); + localparam TileDmaRegionWidth = DmaRegionWidth / 128; + + // ------ Considering Partition Scheme ------ // + // How many bits more need to consider in each partition + logic [2:0] shift_index; + + logic [AddrWidth-1:0] PartitionDmaRegionWidth; + logic [AddrWidth-1:0] partition_mask; + + assign shift_index = (group_factor_i == 128) ? 0 : + (group_factor_i == 64) ? 1 : + (group_factor_i == 32) ? 2 : + (group_factor_i == 16) ? 3 : + (group_factor_i == 8 ) ? 
4 : + (group_factor_i == 4 ) ? 5 : + (group_factor_i == 2 ) ? 6 : 7; + // #bytes covered in each partition per row + assign PartitionDmaRegionWidth = TileDmaRegionWidth * group_factor_i; + // |--- 14 bits ---| Lower 14 bits in address + // 1111111_1111111 GF=128 + // 0111111_1111111 GF=64 + // 0011111_1111111 GF=32 + assign partition_mask = {DmaRegionAddressBits{1'b1}} >> shift_index; + + // start_addr: address in L1 of the current input burst + // masked_start_addr: address bits within partition region + addr_t start_addr, masked_start_addr; + always_comb begin + if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin + // L1 ------> L2 + start_addr = burst_req_i.src; + end else begin + // L2 ------> L1 + start_addr = burst_req_i.dst; + end + end + + assign masked_start_addr = start_addr & partition_mask; + + // ------ Handle Meta Data ------ // + logic req_valid; + // Forward IDLE signal + assign meta_o.backend_idle = meta_i.backend_idle; + // Forward trans_complete signal as well + assign meta_o.trans_complete = meta_i.trans_complete; + // Send the req_valid signal back to split_midend + assign partition_req_valid_o = req_valid; + + // ------ Split One request aligned to partition scheme ------ // + enum logic {Idle, Busy} state_d, state_q; + burst_req_t req_d, req_q; + + `FFARN(state_q, state_d, Idle, clk_i, rst_ni) + `FFARN(req_q, req_d, '0, clk_i, rst_ni) + + + // ------ Beat Counter Handler ------ // + // When detecting `negedge` on beat_cnt_i, meaning a new DMA request starts, + // beat counter of partition need to reset + // beat_cnt_i: how many beats has been sent from split midend + logic [7:0] beat_cnt_q; + `FFARN(beat_cnt_q, beat_cnt_i, '0, clk_i, rst_ni) + logic [7:0] rst_part_beat_cnt; + assign rst_part_beat_cnt = {8{~( ~(|beat_cnt_i) & (|beat_cnt_q) )}}; // fall edge detect, negative reset + + logic [7:0] part_beat_cnt_d, part_beat_cnt_q, part_beat_cnt_pre_q; + `FFARN(part_beat_cnt_pre_q, 
part_beat_cnt_d, '0, clk_i, rst_ni) + assign part_beat_cnt_q = part_beat_cnt_pre_q & rst_part_beat_cnt; + + // figure out which partition targeting + // only update if beat_cnt_i == 0 (first beat) + logic [2:0] pid_shift_index; + assign pid_shift_index = (group_factor_i == 128) ? 7 : + (group_factor_i == 64) ? 6 : + (group_factor_i == 32) ? 5 : + (group_factor_i == 16) ? 4 : + (group_factor_i == 8 ) ? 3 : + (group_factor_i == 4 ) ? 2 : + (group_factor_i == 2 ) ? 1 : 0; // TODO + + logic [6:0] part_id_d, part_id_q, part_id_mask; + `FFARN(part_id_q, part_id_d, '0, clk_i, rst_ni) + always_comb begin + part_id_d = part_id_q; + part_id_mask = {7{1'b1}}; + if (|beat_cnt_i == 0) begin + part_id_d = (group_factor_i == 128) ? 0 : (start_addr >> (pid_shift_index + 7)) & (part_id_mask>>pid_shift_index); + end + end + + // ------ Shifter from new partition layout ------ // + // maximum rows in each partition: 128 + // maximum number of partitions: 128 + + logic [7:0] shift_row, shift_partition; + logic [2:0] shift_index_sc; + logic [7:0] mask_shift_row; + always_comb begin + case(allocated_size_i) + 128: shift_index_sc = 7; + 64: shift_index_sc = 6; + 32: shift_index_sc = 5; + 16: shift_index_sc = 4; + 8: shift_index_sc = 3; + 4: shift_index_sc = 2; + 2: shift_index_sc = 1; + default: shift_index_sc = 0; + endcase + end + + assign shift_partition = part_beat_cnt_q >> shift_index_sc; + assign mask_shift_row = ~( {8{1'b1}}<= burst_req_i.num_bytes)begin + // increase part_beat_cnt + part_beat_cnt_d = part_beat_cnt_q + ready_i; + burst_req_o = burst_req_i; + if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin + // L1 ------> L2 + // correct addr = base addr + row offset + partition offset + if(beat_cnt_i == 0)begin + burst_req_o.src = burst_req_i.src + (part_beat_cnt_q-beat_cnt_i)*DmaRegionWidth; + end else begin + burst_req_o.src = burst_req_i.src + (part_beat_cnt_q-beat_cnt_i)*DmaRegionWidth + 
part_id_q*PartitionDmaRegionWidth; + end + end else begin + // L2 ------> L1 + if (beat_cnt_i == 0) begin + // handle 1.1 + // burst_req_o.dst = burst_req_i.dst + (part_beat_cnt_q-beat_cnt_i)*DmaRegionWidth; + burst_req_o.dst = burst_req_i.dst + part_beat_cnt_q*DmaRegionWidth; + end else begin + // handle 1.2 + burst_req_o.dst = burst_req_i.dst + (part_beat_cnt_q-beat_cnt_i)*DmaRegionWidth + part_id_q*PartitionDmaRegionWidth; + end + end + valid_o = 1'b1; + ready_o = ready_i; + req_valid = ready_i; + // 2. prepare split one beat into several + end else begin + // store and acknowledge + req_d = burst_req_i; + ready_o = 1'b1; + // keep: [src] [dest], modify: [num_bytes] + burst_req_o = burst_req_i; + burst_req_o.num_bytes = PartitionDmaRegionWidth-masked_start_addr; + if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin + // L1 ------> L2 + if (beat_cnt_i == 0) begin + req_d.src = burst_req_i.src + (part_beat_cnt_q-beat_cnt_i)*DmaRegionWidth; + burst_req_o.src = burst_req_i.src + (part_beat_cnt_q-beat_cnt_i)*DmaRegionWidth; + end else begin + // correct old version + req_d.src = burst_req_i.src + (part_beat_cnt_q-beat_cnt_i)*DmaRegionWidth + part_id_q*PartitionDmaRegionWidth; + burst_req_o.src = burst_req_i.src + (part_beat_cnt_q-beat_cnt_i)*DmaRegionWidth + part_id_q*PartitionDmaRegionWidth; + end + end else begin + // L2 ------> L1 + if (beat_cnt_i == 0) begin + // req_d.dst = burst_req_i.dst + (part_beat_cnt_q-beat_cnt_i)*DmaRegionWidth; + // burst_req_o.dst = burst_req_i.dst + (part_beat_cnt_q-beat_cnt_i)*DmaRegionWidth; + req_d.dst = burst_req_i.dst + shift_row*DmaRegionWidth + shift_partition*PartitionDmaRegionWidth; + burst_req_o.dst = burst_req_i.dst + shift_row*DmaRegionWidth + shift_partition*PartitionDmaRegionWidth; + end else begin + req_d.dst = burst_req_i.dst + (part_beat_cnt_q-beat_cnt_i)*DmaRegionWidth + part_id_q*PartitionDmaRegionWidth; + burst_req_o.dst = burst_req_i.dst + 
(part_beat_cnt_q-beat_cnt_i)*DmaRegionWidth + part_id_q*PartitionDmaRegionWidth; + end + end + // noftify downstream + valid_o = 1'b1; + if (ready_i) begin + // increase partition beat cnt + part_beat_cnt_d = part_beat_cnt_q + 1; + // downstream is ready to receive, modify the stored req + req_d.num_bytes -= PartitionDmaRegionWidth-masked_start_addr; + if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin + // L1 ------> L2 + req_d.src += DmaRegionWidth-masked_start_addr; // folded to second row + req_d.dst += PartitionDmaRegionWidth-masked_start_addr; + end else begin + // L2 ------> L1 + req_d.src += PartitionDmaRegionWidth-masked_start_addr; + // req_d.dst += DmaRegionWidth-masked_start_addr; + req_d.dst += DmaRegionWidth-masked_start_addr; // modification needed + end + req_valid = 1'b1; // one request sent, counter increment + end + state_d = Busy; + end + end + end + + Busy: begin + // get burst request from the stored one + burst_req_o = req_q; + valid_o = 1'b1; + req_valid = ready_i; // counter increment whenever one req sent to downstream + if ($unsigned(req_q.num_bytes) <= $unsigned(PartitionDmaRegionWidth)) begin + // last burst + if (ready_i) begin + state_d = Idle; + // burst_req does not need to change + // increase partition beat cnt + part_beat_cnt_d = part_beat_cnt_q + 1; + end + end else begin + // middle bursts + burst_req_o.num_bytes = PartitionDmaRegionWidth; + if (ready_i) begin + // increase partition beat cnt + part_beat_cnt_d = part_beat_cnt_q + 1; + req_d.num_bytes = req_q.num_bytes - PartitionDmaRegionWidth; + if (($unsigned(req_q.src) >= DmaRegionStart) && ($unsigned(req_q.src) < DmaRegionEnd)) begin + // L1 ------> L2 + req_d.src = req_q.src + DmaRegionWidth; // folded to second row + req_d.dst = req_q.dst + PartitionDmaRegionWidth; // addr in L2 increases as usual + end else begin + // L2 ------> L1 + req_d.src = req_q.src + PartitionDmaRegionWidth; + // req_d.dst = req_q.dst + 
DmaRegionWidth; + if (shift_row == allocated_size_i-1) begin + req_d.dst = req_q.dst + PartitionDmaRegionWidth - shift_row*DmaRegionWidth; + end else begin + req_d.dst = req_q.dst + DmaRegionWidth; + end + end + end + end + end + + default: /*do nothing*/; + endcase + + + end + + // pragma translate_off + int f; + always_ff @(posedge clk_i or negedge rst_ni) begin + automatic string str; + if (rst_ni && valid_i && ready_o) begin + str = "\n[Partition] Got request\n"; + str = $sformatf("%sPartition: Request in: From: 0x%8x To: 0x%8x with size %d. Beat count: %d\n", str, burst_req_i.src, burst_req_i.dst, burst_req_i.num_bytes, beat_cnt_i); + f = $fopen("dma.log", "a"); + $fwrite(f, str); + $fclose(f); + end + if (rst_ni && valid_o && ready_i) begin + str = $sformatf("Partition: From: 0x%8x To: 0x%8x with size %d. Partition beat count: %d. [part_id] %d\n", burst_req_o.src, burst_req_o.dst, burst_req_o.num_bytes, part_beat_cnt_q, part_id_q); + // str = $sformatf("Debug Rst: [rst_part_beat_cnt] %d [beat_cnt_q] %d [beat_cnt_i] %d \n",rst_part_beat_cnt, beat_cnt_q, beat_cnt_i); + f = $fopen("dma.log", "a"); + $fwrite(f, str); + // str = $sformatf("Debug: [start_addr] %8x [GF] %d \n",start_addr, group_factor_i); + // $fwrite(f, str); + $fclose(f); + end + end + // pragma translate_on + +endmodule From 4087bf4a4d87819b8906dd3aa3123c42cceaa5ce Mon Sep 17 00:00:00 2001 From: bowwang Date: Sun, 11 May 2025 14:21:20 +0200 Subject: [PATCH 06/34] [hw] extend scrambler for das --- hardware/src/address_scrambler.sv | 184 ++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) diff --git a/hardware/src/address_scrambler.sv b/hardware/src/address_scrambler.sv index d2c790a65..b7e3dae27 100644 --- a/hardware/src/address_scrambler.sv +++ b/hardware/src/address_scrambler.sv @@ -19,6 +19,38 @@ module address_scrambler #( input logic [AddrWidth-1:0] address_i, output logic [AddrWidth-1:0] address_o ); + localparam int unsigned BankOffsetBits = $clog2(NumBanksPerTile); + 
localparam int unsigned TileIdBits = $clog2(NumTiles);// Copyright 2021 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Description: Scrambles the address in such a way, that part of the memory is accessed +// sequentially and part is interleaved. +// Current constraints: + +// Author: Samuel Riedel + +module address_scrambler #( + parameter int unsigned AddrWidth = 32, + parameter int unsigned DataWidth = 32, + parameter int unsigned ByteOffset = 2, + parameter int unsigned NumTiles = 2, + parameter int unsigned NumBanksPerTile = 2, + parameter bit Bypass = 0, + parameter int unsigned SeqMemSizePerTile = 4*1024, + parameter int unsigned HeapSeqMemSizePerTile = 8*2048, + parameter int unsigned MemSizePerTile = 8*4*1024, + parameter int unsigned MemSizePerRow = 4*4*1024, // 4bytes * 4096 banks + parameter int unsigned TCDMSize = 1024*1024 +) ( + input logic [AddrWidth-1:0] address_i, + input logic [3:0][7:0] group_factor_i, + // For each allocation, the maximum number of rows assigned can be 128 rows + input logic [3:0][7:0] allocated_size_i, + input logic [3:0][DataWidth-1:0] start_addr_scheme_i, + output logic [AddrWidth-1:0] address_o +); + // Stack Sequential Settings localparam int unsigned BankOffsetBits = $clog2(NumBanksPerTile); localparam int unsigned TileIdBits = $clog2(NumTiles); localparam int unsigned SeqPerTileBits = $clog2(SeqMemSizePerTile); @@ -26,6 +58,158 @@ module address_scrambler #( localparam int unsigned ConstantBitsLSB = ByteOffset + BankOffsetBits; localparam int unsigned ScrambleBits = SeqPerTileBits-ConstantBitsLSB; + // Heap Sequential Settings + localparam int unsigned HeapSeqPerTileBits = $clog2(MemSizePerTile); // log2(8*4096) = 15 | RowIndexBits + ConstBits + localparam int unsigned HeapSeqTotalBits = HeapSeqPerTileBits+TileIdBits; // 15+7=22 | used for address_o assignment + localparam int unsigned RowIndexBits = 
HeapSeqPerTileBits-ConstantBitsLSB; // 15-7=8 | RowIndex + + + if (Bypass || NumTiles < 2) begin + assign address_o = address_i; + end else begin + // ------ Stack Region Logic ------ // + logic [ScrambleBits-1:0] scramble; // Address bits that have to be shuffled around + logic [TileIdBits-1:0] tile_id; // Which tile does this address region belong to + + // Scramble the middle part + // Bits that would have gone to different tiles but now go to increasing lines in the same tile + assign scramble = address_i[SeqPerTileBits-1:ConstantBitsLSB]; // Bits that would + // Bits that would have gone to increasing lines in the same tile but now go to different tiles + assign tile_id = address_i[SeqTotalBits-1:SeqPerTileBits]; + + // ------ Heap Sequential Signals ------ // + + // `shift_index` : how many bits to shift for TileID bits in each partition + // `shift_index_sc`: how many bits need to swap within Row Index + logic [3:0][2:0] shift_index; + logic [3:0][2:0] shift_index_sc; + for (genvar i = 0; i < 4; i++) begin : gen_shift_index + always_comb begin + case(group_factor_i[i]) + 128: shift_index[i] = 7; + 64: shift_index[i] = 6; + 32: shift_index[i] = 5; + 16: shift_index[i] = 4; + 8: shift_index[i] = 3; + 4: shift_index[i] = 2; + 2: shift_index[i] = 1; + default: shift_index[i] = 0; + endcase + + case(allocated_size_i[i]) + 128: shift_index_sc[i] = 7; + 64: shift_index_sc[i] = 6; + 32: shift_index_sc[i] = 5; + 16: shift_index_sc[i] = 4; + 8: shift_index_sc[i] = 3; + 4: shift_index_sc[i] = 2; + 2: shift_index_sc[i] = 1; + default: shift_index_sc[i] = 0; + endcase + end + end + + + // post-scramble row index + logic [RowIndexBits-1:0] post_scramble_row_index; + logic [TileIdBits-1:0] post_scramble_tile_id; + + logic [3:0][RowIndexBits-1:0] mask_row_index, mask_row_index_n; + logic [3:0][TileIdBits-1:0] mask_tile_id, mask_tile_id_n; + + logic [TileIdBits-1:0] heap_tile_id; + + for (genvar j = 0; j < 4; j++) begin : gen_mask + assign mask_row_index[j] = 
(shift_index_sc[j] == 0) ? {RowIndexBits{1'b0}} : ({RowIndexBits{1'b1}} >> (RowIndexBits-shift_index_sc[j])); + assign mask_tile_id[j] = (shift_index[j] == 0) ? {TileIdBits{1'b0}} : ({TileIdBits{1'b1}} >> (TileIdBits -shift_index[j])); + + assign mask_row_index_n[j] = ~mask_row_index[j]; + assign mask_tile_id_n[j] = ~mask_tile_id[j]; + end + + assign heap_tile_id = address_i[(TileIdBits+ConstantBitsLSB-1):ConstantBitsLSB]; + + always_comb begin + // Default: Unscrambled + address_o[ConstantBitsLSB-1:0] = address_i[ConstantBitsLSB-1:0]; + address_o[SeqTotalBits-1:ConstantBitsLSB] = {tile_id, scramble}; + address_o[AddrWidth-1:SeqTotalBits] = address_i[AddrWidth-1:SeqTotalBits]; + post_scramble_row_index = 'b0; + post_scramble_tile_id = 'b0; + + // Stack Region + if (address_i < (NumTiles * SeqMemSizePerTile)) begin + address_o[SeqTotalBits-1:ConstantBitsLSB] = {scramble, tile_id}; + + // Sequential Heap Region + end else if ( (address_i >= start_addr_scheme_i[0]) && (address_i < start_addr_scheme_i[0]+MemSizePerRow*allocated_size_i[0]) ) begin + + post_scramble_row_index = 'b0; + post_scramble_tile_id = 'b0; + // 1. `post_scramble_row_index` generation + post_scramble_row_index |= (address_i >> (ConstantBitsLSB + shift_index[0])) & mask_row_index[0]; + post_scramble_row_index |= (address_i >> (ConstantBitsLSB + TileIdBits )) & mask_row_index_n[0]; + + // 2. `post_scramble_tile_id` generation + post_scramble_tile_id |= heap_tile_id & mask_tile_id[0]; + post_scramble_tile_id |= (address_i >> (ConstantBitsLSB + shift_index_sc[0])) & mask_tile_id_n[0]; + + address_o[HeapSeqTotalBits-1:ConstantBitsLSB] = {post_scramble_row_index, post_scramble_tile_id}; + end else if ( (address_i >= start_addr_scheme_i[1]) && (address_i < start_addr_scheme_i[1]+MemSizePerRow*allocated_size_i[1]) ) begin + + post_scramble_row_index = 'b0; + post_scramble_tile_id = 'b0; + // 1. 
`post_scramble_row_index` generation + post_scramble_row_index |= (address_i >> (ConstantBitsLSB + shift_index[1])) & mask_row_index[1]; + post_scramble_row_index |= (address_i >> (ConstantBitsLSB + TileIdBits )) & mask_row_index_n[1]; + + // 2. `post_scramble_tile_id` generation + post_scramble_tile_id |= heap_tile_id & mask_tile_id[1]; + post_scramble_tile_id |= (address_i >> (ConstantBitsLSB + shift_index_sc[1])) & mask_tile_id_n[1]; + + address_o[HeapSeqTotalBits-1:ConstantBitsLSB] = {post_scramble_row_index, post_scramble_tile_id}; + end else if ( (address_i >= start_addr_scheme_i[2]) && (address_i < start_addr_scheme_i[2]+MemSizePerRow*allocated_size_i[2]) ) begin + + post_scramble_row_index = 'b0; + post_scramble_tile_id = 'b0; + // 1. `post_scramble_row_index` generation + post_scramble_row_index |= (address_i >> (ConstantBitsLSB + shift_index[2])) & mask_row_index[2]; + post_scramble_row_index |= (address_i >> (ConstantBitsLSB + TileIdBits )) & mask_row_index_n[2]; + + // 2. `post_scramble_tile_id` generation + post_scramble_tile_id |= heap_tile_id & mask_tile_id[2]; + post_scramble_tile_id |= (address_i >> (ConstantBitsLSB + shift_index_sc[2])) & mask_tile_id_n[2]; + + address_o[HeapSeqTotalBits-1:ConstantBitsLSB] = {post_scramble_row_index, post_scramble_tile_id}; + end else if ( (address_i >= start_addr_scheme_i[3]) && (address_i < start_addr_scheme_i[3]+MemSizePerRow*allocated_size_i[3]) ) begin + + post_scramble_row_index = 'b0; + post_scramble_tile_id = 'b0; + // 1. `post_scramble_row_index` generation + post_scramble_row_index |= (address_i >> (ConstantBitsLSB + shift_index[3])) & mask_row_index[3]; + post_scramble_row_index |= (address_i >> (ConstantBitsLSB + TileIdBits )) & mask_row_index_n[3]; + + // 2. 
`post_scramble_tile_id` generation + post_scramble_tile_id |= heap_tile_id & mask_tile_id[3]; + post_scramble_tile_id |= (address_i >> (ConstantBitsLSB + shift_index_sc[3])) & mask_tile_id_n[3]; + + address_o[HeapSeqTotalBits-1:ConstantBitsLSB] = {post_scramble_row_index, post_scramble_tile_id}; + end + end + end + + // Check for unsupported configurations (elaboration-time; NumBanksPerTile == 2 is legal) + if (NumBanksPerTile < 2) + $fatal(1, "NumBanksPerTile must be at least 2. The special case '1' is currently not supported!"); + if (SeqMemSizePerTile % (2**ByteOffset*NumBanksPerTile) != 0) + $fatal(1, "SeqMemSizePerTile must be a multiple of BankWidth*NumBanksPerTile!"); +endmodule : address_scrambler + + localparam int unsigned SeqPerTileBits = $clog2(SeqMemSizePerTile); + localparam int unsigned SeqTotalBits = SeqPerTileBits+TileIdBits; + localparam int unsigned ConstantBitsLSB = ByteOffset + BankOffsetBits; + localparam int unsigned ScrambleBits = SeqPerTileBits-ConstantBitsLSB; + if (Bypass || NumTiles < 2) begin assign address_o = address_i; end else begin From 6cfabd860600d8f9d75c39ac5fe9c85b3c94e003 Mon Sep 17 00:00:00 2001 From: bowwang Date: Sun, 11 May 2025 16:30:10 +0200 Subject: [PATCH 07/34] [hw] add das to terapool --- hardware/src/mempool_cluster.sv | 78 +++++++++++++++++++++++++------ hardware/src/mempool_group.sv | 56 ++++++++++++++++------ hardware/src/mempool_pkg.sv | 5 ++ hardware/src/mempool_sub_group.sv | 12 ++++- hardware/src/mempool_system.sv | 11 +++++ hardware/src/mempool_tile.sv | 13 +++++- 6 files changed, 141 insertions(+), 34 deletions(-) diff --git a/hardware/src/mempool_cluster.sv b/hardware/src/mempool_cluster.sv index 561e0d369..8f1017848 100644 --- a/hardware/src/mempool_cluster.sv +++ b/hardware/src/mempool_cluster.sv @@ -26,6 +26,12 @@ module mempool_cluster output logic scan_data_o, // Wake up signal input logic [NumCores-1:0] wake_up_i, + // Partition Selection + input logic [3:0][PartitionDataWidth-1:0] partition_sel_i, + input logic [3:0][PartitionDataWidth-1:0] 
allocated_size_i, + input logic [3:0][DataWidth-1:0] start_addr_scheme_i, + // DMA Mode Selection + input logic [1:0] dma_mode_i, // RO-Cache configuration input ro_cache_ctrl_t ro_cache_ctrl_i, // DMA request @@ -81,30 +87,48 @@ module mempool_cluster logic [NumGroups-1:0] dma_req_group_valid, dma_req_group_q_valid; logic [NumGroups-1:0] dma_req_group_ready, dma_req_group_q_ready; dma_meta_t [NumGroups-1:0] dma_meta, dma_meta_q; + // dma after partition + dma_req_t dma_req_partition; + logic dma_req_partition_valid; + logic dma_req_partition_ready; + dma_meta_t dma_meta_partition; + logic [7:0] dma_beat_cnt; + logic partition_req_valid; + logic [7:0] group_factor_sel, allocated_size_sel; `FF(dma_meta_q, dma_meta, '0, clk_i, rst_ni); + logic part_beat_cnt_rst; - idma_split_midend #( - .DmaRegionWidth (NumBanksPerGroup*NumGroups*4), - .DmaRegionStart (TCDMBaseAddr ), - .DmaRegionEnd (TCDMBaseAddr+TCDMSize ), + idma_split_midend_v2 #( + .DmaRegionWidth (NumBanksPerGroup*NumGroups*4), // #DmaBytes = #banks*4 = 4096*4 // size per row + .DmaRegionStart (TCDMBaseAddr ), // 0x0000_0000, defined in tb + .DmaRegionEnd (TCDMBaseAddr+TCDMSize ), // TCDMSize = #banks*l1banksize = 4096*1024 // size of DMA region .AddrWidth (AddrWidth ), .burst_req_t (dma_req_t ), .meta_t (dma_meta_t ) - ) i_idma_split_midend ( + ) i_idma_split_midend_v2 ( .clk_i (clk_i ), .rst_ni (rst_ni ), + // slave .burst_req_i(dma_req_cut ), .valid_i (dma_req_cut_valid ), .ready_o (dma_req_cut_ready ), .meta_o (dma_meta_cut ), - .burst_req_o(dma_req_split ), - .valid_o (dma_req_split_valid), - .ready_i (dma_req_split_ready), - .meta_i (dma_meta_split ) + // master + .dma_mode_i (dma_mode_i), + .burst_req_o(dma_req_partition ), + .valid_o (dma_req_partition_valid), + .ready_i (dma_req_partition_ready), + .meta_i (dma_meta_partition ), + + // partition information + .group_factor_i (partition_sel_i ), + .allocated_size_i (allocated_size_i ), + .start_addr_scheme_i (start_addr_scheme_i ), + .allocated_size_o 
(allocated_size_sel) ); - idma_distributed_midend #( + idma_distributed_midend_v2 #( .NoMstPorts (NumGroups ), .DmaRegionWidth (NumBanksPerGroup*4 ), .DmaRegionStart (TCDMBaseAddr ), @@ -112,13 +136,18 @@ module mempool_cluster .TransFifoDepth (16 ), .burst_req_t (dma_req_t ), .meta_t (dma_meta_t ) - ) i_idma_distributed_midend ( + ) i_idma_distributed_midend_v2 ( .clk_i (clk_i ), .rst_ni (rst_ni ), - .burst_req_i (dma_req_split ), - .valid_i (dma_req_split_valid), - .ready_o (dma_req_split_ready), - .meta_o (dma_meta_split ), + // slave + .burst_req_i (dma_req_partition ), + .valid_i (dma_req_partition_valid), + .ready_o (dma_req_partition_ready), + .meta_o (dma_meta_partition ), + // partition info + .allocated_size_i(allocated_size_sel), + .dma_mode_i (dma_mode_i), + // master .burst_req_o (dma_req_group ), .valid_o (dma_req_group_valid), .ready_i (dma_req_group_ready), @@ -295,11 +324,16 @@ module mempool_cluster .tcdm_slave_resp_valid_o (tcdm_slave_resp_valid[g] ), .tcdm_slave_resp_ready_i (tcdm_slave_resp_ready[g] ), .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), + .partition_sel_i (partition_sel_i ), + .start_addr_scheme_i (start_addr_scheme_i ), + .allocated_size_i (allocated_size_i ), + .dma_mode_i (dma_mode_i ), .ro_cache_ctrl_i (ro_cache_ctrl_q[g] ), // DMA request .dma_req_i (dma_req_group_q[g] ), .dma_req_valid_i (dma_req_group_q_valid[g] ), .dma_req_ready_o (dma_req_group_q_ready[g] ), + .dma_allocated_size_sel_i(allocated_size_sel ), // DMA status .dma_meta_o_backend_idle_ (dma_meta[g][1] ), .dma_meta_o_trans_complete_ (dma_meta[g][0] ), @@ -336,11 +370,16 @@ module mempool_cluster .tcdm_slave_resp_valid_o (tcdm_slave_resp_valid[g] ), .tcdm_slave_resp_ready_i (tcdm_slave_resp_ready[g] ), .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), + .partition_sel_i (partition_sel_i ), + .start_addr_scheme_i (start_addr_scheme_i ), + .allocated_size_i (allocated_size_i ), + .dma_mode_i (dma_mode_i ), .ro_cache_ctrl_i 
(ro_cache_ctrl_q[g] ), // DMA request .dma_req_i (dma_req_group_q[g] ), .dma_req_valid_i (dma_req_group_q_valid[g] ), .dma_req_ready_o (dma_req_group_q_ready[g] ), + .dma_allocated_size_sel_i(allocated_size_sel ), // DMA status .dma_meta_o (dma_meta[g] ), // AXI interface @@ -374,11 +413,16 @@ module mempool_cluster .tcdm_slave_resp_valid_o (tcdm_slave_resp_valid[g] ), .tcdm_slave_resp_ready_i (tcdm_slave_resp_ready[g] ), .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), + .partition_sel_i (partition_sel_i ), + .start_addr_scheme_i (start_addr_scheme_i ), + .allocated_size_i (allocated_size_i ), + .dma_mode_i (dma_mode_i ), .ro_cache_ctrl_i (ro_cache_ctrl_q[g] ), // DMA request .dma_req_i (dma_req_group_q[g] ), .dma_req_valid_i (dma_req_group_q_valid[g] ), .dma_req_ready_o (dma_req_group_q_ready[g] ), + .dma_allocated_size_sel_i(allocated_size_sel ), // DMA status .dma_meta_o (dma_meta[g] ), // AXI interface @@ -457,11 +501,15 @@ module mempool_cluster .tcdm_slave_resp_valid_o (tcdm_slave_resp_valid[g] ), .tcdm_slave_resp_ready_i (tcdm_slave_resp_ready[g] ), .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), + .partition_sel_i (partition_sel_i ), + .allocated_size_i (allocated_size_i ), + .start_addr_scheme_i (start_addr_scheme_i ), .ro_cache_ctrl_i (ro_cache_ctrl_q[g] ), // DMA request .dma_req_i (dma_req_group_q[g] ), .dma_req_valid_i (dma_req_group_q_valid[g] ), .dma_req_ready_o (dma_req_group_q_ready[g] ), + .dma_allocated_size_sel_i(allocated_size_sel ), // DMA status .dma_meta_o (dma_meta[g] ), // AXI interface diff --git a/hardware/src/mempool_group.sv b/hardware/src/mempool_group.sv index 733f98b9c..3853fc97a 100644 --- a/hardware/src/mempool_group.sv +++ b/hardware/src/mempool_group.sv @@ -60,12 +60,18 @@ module mempool_group `endif // Wake up interface input logic [NumCoresPerGroup-1:0] wake_up_i, + // Partition selection + input logic [3:0][PartitionDataWidth-1:0] partition_sel_i, + input logic [3:0][PartitionDataWidth-1:0] 
allocated_size_i, + input logic [3:0][DataWidth-1:0] start_addr_scheme_i, + input logic [1:0] dma_mode_i, // RO-Cache configuration input `STRUCT_PORT(ro_cache_ctrl_t) ro_cache_ctrl_i, // DMA request input `STRUCT_PORT(dma_req_t) dma_req_i, input logic dma_req_valid_i, output logic dma_req_ready_o, + input logic [7:0] dma_allocated_size_sel_i, // DMA status output `STRUCT_PORT(dma_meta_t) dma_meta_o, // AXI Interface @@ -333,7 +339,11 @@ module mempool_group // RO-Cache configuration .ro_cache_ctrl_i (ro_cache_ctrl_q ), // Wake up interface - .wake_up_i (wake_up_q[sg*NumCoresPerSubGroup +: NumCoresPerSubGroup] ) + .wake_up_i (wake_up_q[sg*NumCoresPerSubGroup +: NumCoresPerSubGroup] ), + // Partition selection + .start_addr_scheme_i (start_addr_scheme_i ), + .allocated_size_i (allocated_size_i ), + .partition_sel_i (partition_sel_i) ); end else begin: gen_rtl_sg mempool_sub_group #( @@ -385,7 +395,10 @@ module mempool_group // RO-Cache configuration .ro_cache_ctrl_i (ro_cache_ctrl_q ), // Wake up interface - .wake_up_i (wake_up_q[sg*NumCoresPerSubGroup +: NumCoresPerSubGroup] ) + .wake_up_i (wake_up_q[sg*NumCoresPerSubGroup +: NumCoresPerSubGroup] ), + .start_addr_scheme_i (start_addr_scheme_i ), + .allocated_size_i (allocated_size_i ), + .partition_sel_i (partition_sel_i) ); end // Transpose the group requests @@ -560,21 +573,26 @@ module mempool_group `FF(dma_meta_o, dma_meta_cut, '0, clk_i, rst_ni); - idma_distributed_midend #( + idma_distributed_midend_v2 #( .NoMstPorts (NumDmasPerGroup ), .DmaRegionWidth (NumBanksPerGroup*4/NumDmasPerGroup ), .DmaRegionStart (TCDMBaseAddr ), .DmaRegionEnd (TCDMBaseAddr+TCDMSize ), - .TransFifoDepth (16 ), + .TransFifoDepth (8 ), .burst_req_t (dma_req_t ), .meta_t (dma_meta_t ) - ) i_idma_distributed_midend ( + ) i_idma_distributed_midend_v2 ( .clk_i (clk_i ), .rst_ni (rst_ni ), + // slave .burst_req_i (dma_req_cut ), .valid_i (dma_req_cut_valid), .ready_o (dma_req_cut_ready), .meta_o (dma_meta_cut ), + // partition + 
.allocated_size_i(dma_allocated_size_sel_i), + .dma_mode_i (dma_mode_i), + // master .burst_req_o (dma_req ), .valid_o (dma_req_valid ), .ready_i (dma_req_ready ), @@ -684,7 +702,10 @@ module mempool_group .axi_mst_req_o (axi_tile_req[t] ), .axi_mst_resp_i (axi_tile_resp[t] ), // Wake up interface - .wake_up_i (wake_up_q[t*NumCoresPerTile +: NumCoresPerTile]) + .wake_up_i (wake_up_q[t*NumCoresPerTile +: NumCoresPerTile]), + .start_addr_scheme_i (start_addr_scheme_i ), + .allocated_size_i (allocated_size_i ), + .partition_sel_i (partition_sel_i) ); // Transpose the group requests @@ -970,21 +991,26 @@ module mempool_group logic [NumDmasPerGroup-1:0] dma_req_ready; dma_meta_t [NumDmasPerGroup-1:0] dma_meta; - idma_distributed_midend #( - .NoMstPorts (NumDmasPerGroup ), - .DmaRegionWidth (NumBanksPerGroup*4/NumDmasPerGroup), - .DmaRegionStart (TCDMBaseAddr ), - .DmaRegionEnd (TCDMBaseAddr+TCDMSize ), - .TransFifoDepth (16 ), - .burst_req_t (dma_req_t ), - .meta_t (dma_meta_t ) - ) i_idma_distributed_midend ( + idma_distributed_midend_v2 #( + .NoMstPorts (NumDmasPerGroup ), + .DmaRegionWidth (NumBanksPerGroup*4/NumDmasPerGroup ), + .DmaRegionStart (TCDMBaseAddr ), + .DmaRegionEnd (TCDMBaseAddr+TCDMSize ), + .TransFifoDepth (8 ), + .burst_req_t (dma_req_t ), + .meta_t (dma_meta_t ) + ) i_idma_distributed_midend_v2 ( .clk_i (clk_i ), .rst_ni (rst_ni ), + // slave .burst_req_i (dma_req_cut ), .valid_i (dma_req_cut_valid), .ready_o (dma_req_cut_ready), .meta_o (dma_meta_cut ), + // partition + .allocated_size_i(dma_allocated_size_sel_i), + .dma_mode_i (dma_mode_i), + // master .burst_req_o (dma_req ), .valid_o (dma_req_valid ), .ready_i (dma_req_ready ), diff --git a/hardware/src/mempool_pkg.sv b/hardware/src/mempool_pkg.sv index 5d427feec..ea3962cd0 100644 --- a/hardware/src/mempool_pkg.sv +++ b/hardware/src/mempool_pkg.sv @@ -28,6 +28,8 @@ package mempool_pkg; localparam integer unsigned AxiDataWidth = `ifdef AXI_DATA_WIDTH `AXI_DATA_WIDTH `else 0 `endif; localparam 
integer unsigned AxiLiteDataWidth = 32;
 
+  localparam integer unsigned PartitionDataWidth = 8; // only support group_factor={128, 64, 32, 16, 8, 4, 2, 1}
+
   /***********************
    * MEMORY PARAMETERS *
    ***********************/
@@ -310,6 +312,9 @@ package mempool_pkg;
   localparam int unsigned SeqMemSizePerCore = `ifdef SEQ_MEM_SIZE `SEQ_MEM_SIZE `else 0 `endif;
   localparam int unsigned SeqMemSizePerTile = NumCoresPerTile*SeqMemSizePerCore;
 
+  localparam int unsigned HeapSeqMemSizePerCore = `ifdef HEAP_SEQ_MEM_SIZE `HEAP_SEQ_MEM_SIZE `else 2048 `endif; // bytes per core; 2048 when HEAP_SEQ_MEM_SIZE is undefined
+  localparam int unsigned HeapSeqMemSizePerTile = NumCoresPerTile*HeapSeqMemSizePerCore;
+
   typedef struct packed {
     int unsigned slave_idx;
     addr_t mask;
diff --git a/hardware/src/mempool_sub_group.sv b/hardware/src/mempool_sub_group.sv
index a3577450f..012cd77b8 100644
--- a/hardware/src/mempool_sub_group.sv
+++ b/hardware/src/mempool_sub_group.sv
@@ -63,7 +63,11 @@ module mempool_sub_group
   // RO-Cache configuration
   input `STRUCT_PORT(ro_cache_ctrl_t) ro_cache_ctrl_i,
   // Wake up interface
-  input logic [NumCoresPerSubGroup-1:0] wake_up_i
+  input logic [NumCoresPerSubGroup-1:0] wake_up_i,
+  // Partition Selection
+  input logic [3:0][PartitionDataWidth-1:0] partition_sel_i,
+  input logic [3:0][PartitionDataWidth-1:0] allocated_size_i,
+  input logic [3:0][DataWidth-1:0] start_addr_scheme_i
 );
 
   /*****************
@@ -199,7 +203,11 @@ module mempool_sub_group
       .axi_mst_req_o (axi_tile_req[t] ),
       .axi_mst_resp_i (axi_tile_resp[t] ),
       // Wake up interface
-      .wake_up_i (wake_up_q[t*NumCoresPerTile +: NumCoresPerTile])
+      .wake_up_i (wake_up_q[t*NumCoresPerTile +: NumCoresPerTile]),
+      // Partition selection
+      .start_addr_scheme_i (start_addr_scheme_i ),
+      .allocated_size_i (allocated_size_i ),
+      .partition_sel_i (partition_sel_i)
     );
 
     // Transpose the sub_group requests
diff --git a/hardware/src/mempool_system.sv b/hardware/src/mempool_system.sv
index 98c6fde07..3b75ebaef 100644
--- a/hardware/src/mempool_system.sv
+++ 
b/hardware/src/mempool_system.sv @@ -91,6 +91,13 @@ module mempool_system logic [DataWidth-1:0] eoc; ro_cache_ctrl_t ro_cache_ctrl; + // For dynamic partitioning + logic [3:0][PartitionDataWidth-1:0] partition_sel; + logic [3:0][PartitionDataWidth-1:0] allocated_size; + logic [3:0][DataWidth-1:0] start_addr_scheme; + // For DMA Mode Selection + logic [DataWidth-1:0] dma_mode; + dma_req_t dma_req; logic dma_req_valid; logic dma_req_ready; @@ -140,6 +147,10 @@ module mempool_system .clk_i (clk_i ), .rst_ni (rst_ni ), .wake_up_i (wake_up ), + .partition_sel_i(partition_sel ), + .allocated_size_i (allocated_size), + .start_addr_scheme_i(start_addr_scheme ), + .dma_mode_i (dma_mode[1:0]), .testmode_i (1'b0 ), .scan_enable_i (1'b0 ), .scan_data_i (1'b0 ), diff --git a/hardware/src/mempool_tile.sv b/hardware/src/mempool_tile.sv index 49c8ddaea..0b77db180 100644 --- a/hardware/src/mempool_tile.sv +++ b/hardware/src/mempool_tile.sv @@ -52,7 +52,11 @@ module mempool_tile output `STRUCT_PORT(axi_tile_req_t) axi_mst_req_o, input `STRUCT_PORT(axi_tile_resp_t) axi_mst_resp_i, // Wake up interface - input logic [NumCoresPerTile-1:0] wake_up_i + input logic [NumCoresPerTile-1:0] wake_up_i, + // Partition selection + input logic [3:0][DataWidth-1:0] start_addr_scheme_i, + input logic [3:0][PartitionDataWidth-1:0] allocated_size_i, + input logic [3:0][PartitionDataWidth-1:0] partition_sel_i ); /**************** @@ -896,9 +900,14 @@ module mempool_tile .NumTiles (NumTiles ), .NumBanksPerTile (NumBanksPerTile ), .Bypass (0 ), - .SeqMemSizePerTile (SeqMemSizePerTile) + .SeqMemSizePerTile (SeqMemSizePerTile), + .HeapSeqMemSizePerTile (HeapSeqMemSizePerTile), + .TCDMSize (TCDMSize) ) i_address_scrambler ( .address_i (snitch_data_qaddr[c] ), + .group_factor_i(partition_sel_i), + .allocated_size_i (allocated_size_i), + .start_addr_scheme_i(start_addr_scheme_i), .address_o (snitch_data_qaddr_scrambled) ); From bd0386ba6ace4568600fa5bd270593fe0172a1e7 Mon Sep 17 00:00:00 2001 From: bowwang 
Date: Sun, 11 May 2025 17:02:30 +0200 Subject: [PATCH 08/34] [sw] add das runtime support --- software/runtime/alloc.c | 462 +++++++++++++++++++++++++++++ software/runtime/alloc.h | 17 +- software/runtime/alloc_partition.c | 86 ++++++ software/runtime/alloc_partition.h | 58 ++++ software/runtime/arch.ld.c | 3 + software/runtime/dma.h | 6 + software/runtime/runtime.h | 132 ++++++++- software/runtime/runtime.mk | 5 + 8 files changed, 766 insertions(+), 3 deletions(-) create mode 100644 software/runtime/alloc_partition.c create mode 100644 software/runtime/alloc_partition.h diff --git a/software/runtime/alloc.c b/software/runtime/alloc.c index 519bd8e32..401ddc116 100644 --- a/software/runtime/alloc.c +++ b/software/runtime/alloc.c @@ -34,6 +34,18 @@ alloc_t alloc_l1; // Allocators for L1 local sequential heap memory alloc_t alloc_tile[NUM_CORES / NUM_CORES_PER_TILE]; + +// ---------------------------------------------------------------------------- +// Dynamic Heap Allocator +// ---------------------------------------------------------------------------- +alloc_t* dynamic_heap_alloc = NULL; +void init_dynamic_heap_alloc(uint32_t num_partition){ // how many parts to devide the whole system + dynamic_heap_alloc = (alloc_t *)simple_malloc(num_partition * sizeof(alloc_t)); +} +void free_dynamic_heap_alloc(void){ + simple_free(dynamic_heap_alloc); +} + // ---------------------------------------------------------------------------- // Canary System based on LSBs of block pointer // ---------------------------------------------------------------------------- @@ -55,18 +67,32 @@ static inline canary_and_size_t canary_decode(const uint32_t value) { return (canary_and_size_t){.canary = value & 0xFF, .size = value >> 8}; } +typedef struct canary_chain_s{ + uint32_t canary_and_size; + uint32_t *data_address; + struct canary_chain_s *next_canary; +} canary_chain_t; + +// init as a NULL, assign this pointer when the first canary is allocated +// It is a pointer pointing to the 
canary chain +// canary_start_t first_canary; +canary_chain_t *first_canary = (canary_chain_t *)0x1000; + // ---------------------------------------------------------------------------- // Initialization // ---------------------------------------------------------------------------- void alloc_init(alloc_t *alloc, void *base, const uint32_t size) { // Create first block at base address aligned up uint32_t aligned_base = ALIGN_UP((uint32_t)base, MIN_BLOCK_SIZE); + // printf("base - %p - aligned_base %p\n", base, (alloc_block_t *)aligned_base); alloc_block_t *block_ptr = (alloc_block_t *)aligned_base; // Calculate block size aligned down uint32_t block_size = size - ((uint32_t)block_ptr - (uint32_t)base); block_size = ALIGN_DOWN(block_size, MIN_BLOCK_SIZE); + // printf("block_ptr: %p, block_ptr->size: %p, block_ptr->next: %p\n", block_ptr, &(block_ptr->size), &(block_ptr->next)); + // Setup allocator block_ptr->size = block_size; block_ptr->next = NULL; @@ -116,6 +142,121 @@ static void *allocate_memory(alloc_t *alloc, const uint32_t size) { } } +// ------ Function to calculate the aligned size ------ // +static uint32_t calc_aligned_size (uint32_t* addr, const uint32_t allocated_size) { + // interpret the addr + uint32_t tmp = allocated_size; + uint32_t log = 0; // log2 of 0 is undefined, handled as special case if needed + while (tmp >>= 1) { // Shift right until value is 0 + ++log; + } + uint32_t mask = (uint32_t)(( 1 << log )-1); + uint32_t row_id, tile_id, offset; + offset = ((uint32_t)addr) & 0x7F; + tile_id = ((uint32_t)addr >> 7 ) & 0x7F; + row_id = ((uint32_t)addr >> 14) & 0xFF; + row_id &= mask; + + uint32_t shift_size=0; + if ( (offset==0) && (row_id==0) && (tile_id==0) ){ + shift_size = 0; + } + else{ + uint32_t aligned_boundary = 4096*4*allocated_size; + uint32_t modified_curr = (row_id<<14) | (tile_id<<7) | offset; + shift_size = aligned_boundary - modified_curr; + } + + return shift_size; +} +// ------ Parameters ------ // +// size: Size of the data 
block need to be allocated
+// allocated_size: How many rows the current partition scheme occupied
+static void *allocate_memory_aligned(alloc_t *alloc, const uint32_t size, const uint32_t allocated_size) {
+  // Get first block of linked list of free blocks
+  alloc_block_t *curr = alloc->first_block;
+  alloc_block_t *prev = 0;
+
+  // First-fit search: a block must hold `size` plus the padding (shift_size)
+  // that moves its start up to the next partition boundary
+  uint32_t shift_size = 0;
+  shift_size = calc_aligned_size( (uint32_t*)curr, allocated_size);
+  uint32_t aligned_size = size + shift_size;
+
+  // The padding depends on the block address, so it is recomputed per block
+  while (curr && (curr->size < aligned_size)) {
+    prev = curr;
+    curr = curr->next;
+    shift_size = calc_aligned_size( (uint32_t*)curr, allocated_size);
+    aligned_size = size + shift_size;
+  }
+  printf("Dynamic Allocator >> size [%d] --- shift size [%d] --- aligned size [%d] \n", size, shift_size, aligned_size); // NOTE(review): printed even when no block fits
+
+  if (curr) {
+    // Update allocator
+    if (size == aligned_size){
+      // address is already aligned to the partition boundary
+      printf("Dynamic Allocator >> No alignment needed\n");
+      if (curr->size == size) {
+        // Special case: Whole block taken
+        if (prev) {
+          prev->next = curr->next;
+        } else {
+          alloc->first_block = curr->next;
+        }
+      } else {
+        // Regular case: Split off block
+        alloc_block_t *new_block = (alloc_block_t *)((char *)curr + size);
+        new_block->size = curr->size - size;
+        new_block->next = curr->next;
+        if (prev) {
+          prev->next = new_block;
+        } else {
+          alloc->first_block = new_block;
+        }
+      }
+    }
+    else{
+      printf("Dynamic Allocator >> Alignment needed\n");
+      if (curr->size == aligned_size) {
+        // Special case: Whole block taken, first part of the block is still empty
+        // The padding in front of the aligned address stays on the free list as
+        // a block of shift_size bytes that takes over curr's place in the list
+        struct alloc_block_s *tmp_next = curr->next;
+        alloc_block_t *shift_block = (alloc_block_t *)((char *)curr);
+        shift_block->size = shift_size;
+        shift_block->next = tmp_next;
+        if (prev) {
+          prev->next = shift_block;
+        } else {
+          alloc->first_block = shift_block;
+        }
+      }
+      else{
+        // Regular case: Split off block
+        alloc_block_t *new_block = (alloc_block_t *)((char *)curr + aligned_size); // free remainder behind the allocation
+        new_block->size = curr->size - aligned_size;
+        new_block->next = curr->next;
+
+        alloc_block_t *shift_block = (alloc_block_t *)((char *)curr); // padding in front of the allocation
+        shift_block->size = shift_size;
+        shift_block->next = new_block;
+        if (prev) {
+          prev->next = shift_block;
+        } else {
+          alloc->first_block = shift_block;
+        }
+      }
+    }
+
+    // Return the aligned address inside the chosen block
+    return (void *)((char *)curr+shift_size);
+  } else {
+    // There is no free block large enough
+    return NULL;
+  }
+}
+
 void *domain_malloc(alloc_t *alloc, const uint32_t size) {
   // Calculate actually required block size
   uint32_t data_size = size + sizeof(uint32_t); // add size/metadata
@@ -147,6 +288,232 @@ void *simple_malloc(const uint32_t size) {
   return domain_malloc(&alloc_l1, size);
 }
 
+// Bytes to skip from `addr` so that (addr & 0x3fff) == 0x3ffc, i.e. 4 bytes below a 0x4000 boundary
+static uint32_t calc_aligned_l1_size (uint32_t* addr) {
+  uint32_t shift_size = 0;
+  uint32_t l1_aligned_mask = 0x3fff;
+  uint32_t masked_addr = (uint32_t)addr & l1_aligned_mask;
+  if (masked_addr==0x3ffc){
+    shift_size = 0;
+  }
+  else{
+    shift_size = 0x3ffc - masked_addr; // NOTE(review): wraps if masked_addr > 0x3ffc; a result of 4 is presumably too small for an alloc_block_t header - confirm
+  }
+  return shift_size;
+}
+
+// First-fit allocation of `size` bytes (block size: data_size + meta_size) starting 4 bytes below a 0x4000 boundary
+static void *allocate_memory_l1_aligned(alloc_t *alloc, const uint32_t size) {
+  // Get first block of linked list of free blocks
+  alloc_block_t *curr = alloc->first_block;
+  alloc_block_t *prev = 0;
+
+  uint32_t shift_size = 0;
+  shift_size = calc_aligned_l1_size( (uint32_t*)curr);
+  uint32_t aligned_size = size + shift_size;
+
+  // Search first block large enough in linked list
+  while (curr && (curr->size < aligned_size)) {
+    prev = curr;
+    curr = curr->next;
+
+    shift_size = calc_aligned_l1_size( (uint32_t*)curr);
+    aligned_size = size + shift_size;
+  }
+
+  if (curr) {
+    // Update allocator
+    if (shift_size==0){
+      printf("[L1 Alloc] No Alignment.\n");
+      if (curr->size == size) {
+        // Special case: Whole block taken
+        if (prev) {
+          prev->next = curr->next;
+        } else {
+          alloc->first_block = curr->next;
+        }
+      } else {
+        // Regular case: Split off block
+        alloc_block_t *new_block = (alloc_block_t *)((char *)curr + size);
+        new_block->size = curr->size - size;
+        new_block->next = curr->next;
+        if (prev) {
+          prev->next = new_block;
+        } else {
+          alloc->first_block = new_block;
+        }
+      }
+    }
+    else{
+      printf("[L1 Alloc] Alignment Needed.\n");
+      if (curr->size == aligned_size) {
+        // Special case: Whole block taken, first part of the block is still empty
+        // The padding in front of the aligned address stays on the free list as
+        // a block of shift_size bytes that takes over curr's place in the list
+        struct alloc_block_s *tmp_next = curr->next;
+        alloc_block_t *shift_block = (alloc_block_t *)((char *)curr);
+        shift_block->size = shift_size;
+        shift_block->next = tmp_next;
+        if (prev) {
+          prev->next = shift_block;
+        } else {
+          alloc->first_block = shift_block;
+        }
+      }
+      else{
+        // Regular case: Split off block
+        alloc_block_t *new_block = (alloc_block_t *)((char *)curr + aligned_size); // free remainder behind the allocation
+        new_block->size = curr->size - aligned_size;
+        new_block->next = curr->next;
+
+        alloc_block_t *shift_block = (alloc_block_t *)((char *)curr); // padding in front of the allocation
+        shift_block->size = shift_size;
+        shift_block->next = new_block;
+        if (prev) {
+          prev->next = shift_block;
+        } else {
+          alloc->first_block = shift_block;
+        }
+      }
+    }
+
+    // Return the aligned address inside the chosen block
+    return (void *)((char *)curr+shift_size);
+  } else {
+    // There is no free block large enough
+    return NULL;
+  }
+}
+
+void *domain_malloc_aligned(alloc_t *alloc, const uint32_t size) {
+  // Calculate actually required block size
+  uint32_t data_size = size + sizeof(uint32_t); // add size/metadata
+  uint32_t block_size = ALIGN_UP(data_size, MIN_BLOCK_SIZE); // add alignment
+
+  // 32-bit metadata = 8-bit canary + 24-bit size
+  // i.e. 
max allowed block_size == (2^24 - 1) bytes
+  if (block_size >= (1 << (sizeof(uint32_t) * 8 - sizeof(uint8_t) * 8))) {
+    printf("Memory allocator: Requested memory exceeds max block size\n");
+    return NULL;
+  }
+
+  // Allocate memory
+  void *block_ptr = allocate_memory_l1_aligned(alloc, block_size);
+  if (!block_ptr) {
+    printf("Memory allocator: No large enough block found (%d)\n", block_size);
+    return NULL;
+  }
+
+  // Store canary and size into first four bytes
+  *((uint32_t *)block_ptr) = canary_encode(block_ptr, block_size);
+
+  // Return data pointer
+  void *data_ptr = (void *)((uint32_t *)block_ptr + 1);
+  printf("[Aligned malloc] addr: %p - size: %d\n", data_ptr, size);
+  return data_ptr;
+}
+
+void *simple_aligned_malloc(const uint32_t size){
+  return domain_malloc_aligned(&alloc_l1, size);
+}
+
+// Allocate `size` bytes from the dynamic heap region managed by `alloc`.
+// The canary is not stored in front of the data as in domain_malloc(): it is
+// kept in a separate linked list (the canary chain) that lives in the L1 heap
+// and is sorted by block address. An `allocated_size` of 2 or more requests a
+// partition-aligned block. Returns NULL when either allocation fails.
+void *partition_malloc(alloc_t *alloc, const uint32_t size, const uint32_t allocated_size){
+  uint32_t data_size = size;
+  uint32_t block_size = ALIGN_UP(data_size, MIN_BLOCK_SIZE); // add alignment
+
+  // Check if exceed maximum allowed size
+  if (block_size >= (1 << (sizeof(uint32_t) * 8 - sizeof(uint8_t) * 8))) {
+    printf("Memory allocator: Requested memory exceeds max block size\n");
+    return NULL;
+  }
+
+  // Allocate a region in L1 heap for canary. This is done before the data
+  // block is taken so that a failure on either side cannot leak memory.
+  canary_chain_t *canary = (canary_chain_t *)simple_malloc(sizeof(canary_chain_t));
+  if (!canary) {
+    printf("Memory allocator: No memory left for canary\n");
+    return NULL;
+  }
+
+  // allocate
+  void *block_ptr = NULL;
+  if (allocated_size<2){
+    block_ptr = allocate_memory(alloc, block_size);
+  }
+  else{
+    block_ptr = allocate_memory_aligned(alloc, block_size, allocated_size);
+  }
+  if (!block_ptr) {
+    // Give the unused canary node back before reporting the failure
+    simple_free(canary);
+    printf("Memory allocator: No large enough block found (%d)\n", block_size);
+    return NULL;
+  }
+
+  // Init the canary
+  canary->data_address = (uint32_t *)block_ptr;
+  canary->canary_and_size = canary_encode(block_ptr, block_size);
+  canary->next_canary = NULL;
+
+  // link the canary into the list
+  canary_chain_t *curr = first_canary;
+  canary_chain_t *prev = 0;
+
+  // Fit the canary into the chain, depending on data_address
+  // | prev | ------> | canary | ------> | curr |
+  uint32_t *data_addr = 0;
+  if (curr != (canary_chain_t *)0x1000){
+    // only access struct when init
+    data_addr = curr->data_address;
+  }
+
+  // Advance to the first node whose block address is not below block_ptr
+  while((curr!=(canary_chain_t *)0x1000) && (curr!=NULL) && ((uint32_t *)data_addr < (uint32_t *)block_ptr)){
+    prev = curr;
+    curr = curr->next_canary;
+    if (curr!=NULL){
+      data_addr = curr->data_address;
+    }
+  }
+
+  if ((curr==(canary_chain_t *)0x1000) && !prev) {
+    // special case: first canary block
+    first_canary = canary;
+    printf("| First | ------> [ New ]\n");
+  }
+  else{
+    if (!curr){
+      // reach to the last of the chain
+      // | prev | ------> | canary | ------> NULL
+      prev->next_canary = canary;
+      canary->next_canary = NULL;
+      printf("| Other | ------> [ New ] ------> NULL\n");
+    }
+    else if (!prev){
+      // canary need to insert at the beginning of the chain
+      // first_canary ------> | canary | ------> | curr |
+      first_canary = canary;
+      canary->next_canary = curr;
+      printf("| First | ------> [ New ] ------> | Other |\n");
+    }
+    else{
+      // normal case
+      // | prev | ------> | canary | ------> | curr |
+      canary->next_canary = prev->next_canary;
+      prev->next_canary = canary;
+      printf("| Other | ------> [ New ] ------> | Other |\n");
+    }
+
+  }
+  // return the block pointer directly
+  return block_ptr;
+}
+
 // 
---------------------------------------------------------------------------- // Free Memory // ---------------------------------------------------------------------------- @@ -208,6 +575,78 @@ void domain_free(alloc_t *alloc, void *const ptr) { void simple_free(void *const ptr) { domain_free(&alloc_l1, ptr); } +void partition_free(alloc_t *alloc, void *const ptr){ + // block pointer is the input pointer + void *block_ptr = ptr; + + canary_and_size_t canary_and_size = (canary_and_size_t){.canary = 0, .size = 0}; + // find the canary block in the chain + canary_chain_t *curr = first_canary; + canary_chain_t *prev = 0; + + // While loop suppose to stop when curr->data_address == block_ptr + // | prev | ------> | curr | + uint32_t *data_addr = 0; + if (curr){ + data_addr = curr->data_address; + } + printf("data_addr - %p - block_ptr - %p - curr->data_address - %p \n", data_addr, block_ptr, curr->data_address); + while((curr!=(canary_chain_t *)0x1000) && (curr!=NULL) && (data_addr < (uint32_t *)block_ptr)){ + prev = curr; + // data_addr = curr->data_address; + curr = curr->next_canary; + if(curr!=NULL){ + data_addr = curr->data_address; + } + } + + if ((curr==(canary_chain_t *)0x1000) && !prev){ + // nothing in the chain + printf("CANARY: Empty canary chain!\n"); + } + else if (!curr){ + // reach to the end of the chain + printf("CANARY: Chain depleted. No info found for %p\n", block_ptr); + } + else if (curr->data_address != block_ptr){ + // no information for the current free + printf("CANARY: Unmatch! 
%p - %p\n", curr->data_address, block_ptr); + } + else if (!prev){ + // normal case 1: curr is the first canary + // first_canary ------> | curr | ------> next + canary_and_size = canary_decode(curr->canary_and_size); + if (curr->next_canary == NULL){ + first_canary = (canary_chain_t *)0x1000; + } + else{ + first_canary = curr->next_canary; + } + + simple_free((void *)curr); + } + else{ + // normal case 2: relink the chain, free the curr canary + // | prev | ------> | curr | ------> something + canary_and_size = canary_decode(curr->canary_and_size); + prev->next_canary = curr->next_canary; + simple_free((void *)curr); + } + + // Check for memory overflow + if (canary_and_size.canary != canary(block_ptr)) { + if (!canary_and_size.canary) { + printf("Empty canary.\n"); + } + printf("Memory Overflow at %p\n", block_ptr); + return; + } + + // Free memory + free_memory(alloc, block_ptr, canary_and_size.size); + +} + // ---------------------------------------------------------------------------- // Debugging Functions // ---------------------------------------------------------------------------- @@ -233,9 +672,32 @@ void alloc_dump(alloc_t *alloc) { } } +void canary_dump(void){ + printf(" ------ Canary Chain Dump ------ \n"); + canary_chain_t *curr = first_canary; + if (curr == (canary_chain_t *)0x1000){ + // empty list + printf("Empty Canary list.\n"); + } + else{ + uint32_t cnt = 0; + while(curr!=NULL){ + printf("[%d] - [%p] - [%p] - [%p]\n", cnt, curr, curr->data_address, curr->next_canary); + cnt += 1; + curr = curr->next_canary; + } + } + printf(" ------ Canary Dump END ------ \n"); +} + + // ---------------------------------------------------------------------------- // Get Allocators // ---------------------------------------------------------------------------- +// Get the address of global variable `alloc_l1` alloc_t *get_alloc_l1() { return &alloc_l1; } alloc_t *get_alloc_tile(const uint32_t tile_id) { return &alloc_tile[tile_id]; } + +// Dynamic Heap 
Allocator +alloc_t *get_dynamic_heap_alloc(const uint32_t part_id) {return &dynamic_heap_alloc[part_id];} \ No newline at end of file diff --git a/software/runtime/alloc.h b/software/runtime/alloc.h index f6db489a2..8527edbf6 100644 --- a/software/runtime/alloc.h +++ b/software/runtime/alloc.h @@ -27,28 +27,43 @@ typedef struct { alloc_block_t *first_block; } alloc_t; + // Initialization void alloc_init(alloc_t *alloc, void *base, const uint32_t size); // Malloc in L1 memory void *simple_malloc(const uint32_t size); +void *simple_aligned_malloc(const uint32_t size); + +// Dynamic heap allocation with Canary Chain +void *partition_malloc(alloc_t *alloc, const uint32_t size, const uint32_t allocated_size); + // Malloc with specified allocator void *domain_malloc(alloc_t *alloc, const uint32_t size); // Free in L1 memory void simple_free(void *const ptr); +// Free dynamic heap allocation with Canary chain +void partition_free(alloc_t *alloc, void *const ptr); + // Free with specified allocator void domain_free(alloc_t *alloc, void *const ptr); // Print out linked list of free blocks void alloc_dump(alloc_t *alloc); - +void canary_dump(void); // Get allocator for L1 interleaved heap memory alloc_t *get_alloc_l1(); // Get allocator for L1 local sequential heap memory alloc_t *get_alloc_tile(const uint32_t tile_id); +// ----- Dynamic Heap Allocator ----- // +extern alloc_t* dynamic_heap_alloc; +void init_dynamic_heap_alloc(uint32_t num_partition); +void free_dynamic_heap_alloc(void); +alloc_t *get_dynamic_heap_alloc(const uint32_t part_id); + #endif diff --git a/software/runtime/alloc_partition.c b/software/runtime/alloc_partition.c new file mode 100644 index 000000000..a572f9380 --- /dev/null +++ b/software/runtime/alloc_partition.c @@ -0,0 +1,86 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 +// +// Author: Bowen Wang + +#include "alloc.h" +#include "alloc_partition.h" +#include "runtime.h" +#include "printf.h" + +extern partition_status_t volatile partition_status[NUM_PART_REGION]; + +// ======================================================================================== +// Allocate a region in L1 for a single or several matrices +// @inp: (uint32_t) size --- size of the single allocated matrix +// @inp: (uint32_t) num_matrix --- How many mtrices in this region +// @inp: (int32_t* volitile *) target --- Where to store this pointer +// @inp: (uint32_t) group_factor --- GF_A/B/C +// ======================================================================================== +void alloc_matrix (float *volatile * target, uint32_t size, uint32_t group_factor, uint32_t num_matrix){ + + // 1. Get allocator for sequential Heap region + uint32_t total_size = size*num_matrix; + alloc_t* alloc_heap = get_dynamic_heap_alloc(0); + + // 2. alloc a space, store the return address to the target + *target = (float *)partition_malloc(alloc_heap, total_size*sizeof(float), total_size/NUM_ELEMENTS_PER_ROW); + // 3. find which partition in free + uint32_t pid=0; + uint32_t avail=0; + while( (pid> pid[%d], start_addr[%p].\n", pid, *target); + } + else{ + pid++; + } + } + if ( (pid==NUM_PART_REGION) && (avail==0) ){ + printf("Dynamic Allocator >> WARNING: No available partition region.\n"); + } + + // 4. Config the hardware + printf("Dynamic Allocator >> pid[%d] parallel_sections[%d] elements_per_section[%d]\n", pid, NUM_TILES/group_factor, size); + partition_config(pid, group_factor); + start_addr_scheme_config(pid, (uint32_t)(*target), total_size); + + // 5. 
Handle multi-matrices + if (num_matrix > 1){ + for (uint32_t ii=1; ii> pid[%d] is freed.\n", pid); + pid++; + } + else{ + pid++; + } + } + } +} + + +void free_alloc(uint32_t core_id){ + if (core_id == 0){ + free_dynamic_heap_alloc(); + } +} \ No newline at end of file diff --git a/software/runtime/alloc_partition.h b/software/runtime/alloc_partition.h new file mode 100644 index 000000000..61fc2a753 --- /dev/null +++ b/software/runtime/alloc_partition.h @@ -0,0 +1,58 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Bowen Wang + +#ifndef _ALLOC_PARTITION_H_ +#define _ALLOC_PARTITION_H_ + +// ============================================================ +// Dynamic Data Pointers +// ============================================================ +#define NUM_TILES (128) +#ifdef FLOAT_APP +float* volatile Region_A[NUM_TILES] __attribute__((section(".l1"))); +float* volatile Region_B[NUM_TILES] __attribute__((section(".l1"))); +float* volatile Region_C[NUM_TILES] __attribute__((section(".l1"))); +float* volatile Region_D[NUM_TILES] __attribute__((section(".l1"))); +#endif + +#ifdef INT_APP +int32_t* volatile Region_A[NUM_TILES] __attribute__((section(".l1"))); +int32_t* volatile Region_B[NUM_TILES] __attribute__((section(".l1"))); +int32_t* volatile Region_C[NUM_TILES] __attribute__((section(".l1"))); +int32_t* volatile Region_D[NUM_TILES] __attribute__((section(".l1"))); +#endif + +// ============================================================ +// Group Factor +// ============================================================ +#ifndef _GF +#define _GF +#define GF_TILE (1) +#define GF_SUBG (8) +#define GF_GROUP (32) +#define GF_CLUSTER (128) +#endif + +// ============================================================ +// Dynamic Heap Region Status +// ============================================================ +#define 
NUM_ELEMENTS_PER_ROW (4096) +#define NUM_PART_REGION (4) +typedef struct { + float *data_addr; // trace which matrix belong to this partition + uint32_t status; // set to 1 if used +} partition_status_t; + +// ============================================================ +// Helper Functions +// ============================================================ +void alloc_matrix(float *volatile * target, uint32_t size, uint32_t group_factor, uint32_t num_matrix); + +void free_matrix(float *__restrict__ heap_matrix, uint32_t part_id, uint32_t core_id); + +void free_alloc(uint32_t core_id); + +#endif \ No newline at end of file diff --git a/software/runtime/arch.ld.c b/software/runtime/arch.ld.c index 1d8de5e57..43bc68dab 100644 --- a/software/runtime/arch.ld.c +++ b/software/runtime/arch.ld.c @@ -31,5 +31,8 @@ SECTIONS { __heap_start = __l1_start; __heap_end = __l1_end; + // DAS related, default impacted region size + __heap_seq_start = __l1_start + (NUM_CORES * 2 * L1_BANK_SIZE); + fake_uart = 0xC0000000; } diff --git a/software/runtime/dma.h b/software/runtime/dma.h index 4aa7f6cec..7c33b2588 100644 --- a/software/runtime/dma.h +++ b/software/runtime/dma.h @@ -73,4 +73,10 @@ void dma_memcpy_blocking(void *dest, const void *src, size_t len) { dma_memcpy_nonblocking(dest, src, len); dma_wait(); } + +void dma_memcpy_ModeSel(void *dest, const void *src, size_t len, uint32_t mode_sel){ + dma_mode_reg = mode_sel; + dma_memcpy_nonblocking(dest, src, len); +} + #endif // _DMA_H_ diff --git a/software/runtime/runtime.h b/software/runtime/runtime.h index 7ec0aa8d2..ca1395c4a 100644 --- a/software/runtime/runtime.h +++ b/software/runtime/runtime.h @@ -54,6 +54,26 @@ static uint32_t volatile *wake_up_offset_reg = (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + CONTROL_REGISTERS_WAKE_UP_OFFST_REG_OFFSET); +/* DAS-related regs + +extern volatile uint32_t partition_reg; +extern volatile uint32_t partition1_reg; +extern volatile uint32_t partition2_reg; +extern volatile uint32_t 
partition3_reg; + +extern volatile uint32_t start_addr_scheme0_reg; +extern volatile uint32_t start_addr_scheme1_reg; +extern volatile uint32_t start_addr_scheme2_reg; +extern volatile uint32_t start_addr_scheme3_reg; + +extern volatile uint32_t allocated_size0_reg; +extern volatile uint32_t allocated_size1_reg; +extern volatile uint32_t allocated_size2_reg; +extern volatile uint32_t allocated_size3_reg; + +extern volatile uint32_t dma_mode_reg; +*/ + typedef uint32_t mempool_id_t; typedef uint32_t mempool_timer_t; @@ -99,8 +119,10 @@ static inline uint32_t mempool_get_core_count_per_group() { static inline void mempool_init(const uint32_t core_id) { if (core_id == 0) { // Initialize L1 Interleaved Heap Allocator - extern uint32_t __heap_start, __heap_end; - uint32_t heap_size = (uint32_t)&__heap_end - (uint32_t)&__heap_start; + extern uint32_t __heap_start; + extern uint32_t __heap_seq_start; + // Heap Region + uint32_t heap_size = (uint32_t)&__heap_seq_start - (uint32_t)&__heap_start; // Downscale interleaved heap size alloc_init(get_alloc_l1(), &__heap_start, heap_size); // Initialize L1 Sequential Heap Allocator per Tile @@ -123,6 +145,59 @@ static inline void mempool_init(const uint32_t core_id) { } } +// Reconfigure Interleaved Heap region, with explicit 'Dynamic Heap' start address +// Programmer API for flexible Dynamic Heap region configuration +static inline void mempool_reset_heap(const uint32_t core_id, uint32_t heap_seq_start) { + if (core_id == 0) { + // Initialize L1 Interleaved Heap Allocator + extern uint32_t __heap_start; + uint32_t heap_size = (uint32_t)heap_seq_start - (uint32_t)&__heap_start; // Downscale interleaved heap size + alloc_init(get_alloc_l1(), &__heap_start, heap_size); + } +} + + +// Initialize Dynamic Heap Allocator, as default specified in the linker script +// @inp (uint32_t) group_factor: Number of Tiles per partition +static inline void mempool_dynamic_heap_alloc_init(const uint32_t core_id, const uint32_t group_factor){ + if 
(core_id == 0){ + extern uint32_t __heap_seq_start; + uint32_t num_tiles_per_partition = group_factor; + + + // Dynamic allocator base and size + uint32_t seq_heap_base = (uint32_t)&__heap_seq_start; + uint32_t seq_heap_size = NUM_CORES_PER_TILE * num_tiles_per_partition * HEAP_SEQ_MEM_SIZE; + uint32_t num_partition = mempool_get_tile_count() / group_factor; + // Dynamically allocate the space for allocators + init_dynamic_heap_alloc(num_partition); + for (uint32_t part_id=0; part_id Date: Tue, 21 Oct 2025 18:51:13 +0200 Subject: [PATCH 09/34] [hardware] Add DAS registers and keep only one DMA transfer option --- .../src/midends/idma_address_scrambler.sv | 10 +- .../src/midends/idma_distributed_midend_v2.sv | 71 +-- .../idma/src/midends/idma_split_midend_v2.sv | 345 ++++------- .../control_registers/control_registers.hjson | 94 +++ .../control_registers_reg_pkg.sv | 202 +++++-- .../control_registers_reg_top.sv | 544 ++++++++++++++++-- hardware/src/ctrl_registers.sv | 35 +- hardware/src/mempool_cluster.sv | 87 ++- hardware/src/mempool_group.sv | 43 +- hardware/src/mempool_system.sv | 40 +- software/runtime/control_registers.h | 62 +- software/runtime/runtime.h | 52 +- 12 files changed, 1083 insertions(+), 502 deletions(-) diff --git a/hardware/deps/idma/src/midends/idma_address_scrambler.sv b/hardware/deps/idma/src/midends/idma_address_scrambler.sv index 9c7473ce7..58096d464 100644 --- a/hardware/deps/idma/src/midends/idma_address_scrambler.sv +++ b/hardware/deps/idma/src/midends/idma_address_scrambler.sv @@ -21,15 +21,15 @@ module idma_address_scrambler #( parameter int unsigned MemSizePerRow = 4*4*1024, // 4bytes * 4096 banks parameter int unsigned TCDMSize = 1024*1024 ) ( - input logic [AddrWidth-1:0] address_i, - input logic [31:0] num_bytes_i, + input logic [AddrWidth-1:0] address_i, + input logic [31:0] num_bytes_i, input logic [3:0][7:0] group_factor_i, // For each allocation, the maximum number of rows assigned can be 128 rows input logic [3:0][7:0] 
allocated_size_i, input logic [3:0][DataWidth-1:0] start_addr_scheme_i, - output logic [7:0] group_factor_o, - output logic [7:0] allocated_size_o, - output logic [AddrWidth-1:0] address_o + output logic [7:0] group_factor_o, + output logic [7:0] allocated_size_o, + output logic [AddrWidth-1:0] address_o ); // Basic Settings localparam int unsigned BankOffsetBits = $clog2(NumBanksPerTile); diff --git a/hardware/deps/idma/src/midends/idma_distributed_midend_v2.sv b/hardware/deps/idma/src/midends/idma_distributed_midend_v2.sv index 902c0257a..8300ec55f 100644 --- a/hardware/deps/idma/src/midends/idma_distributed_midend_v2.sv +++ b/hardware/deps/idma/src/midends/idma_distributed_midend_v2.sv @@ -5,12 +5,6 @@ // Samuel Riedel // Bowen Wang -// Mode selection: [1:0] dma_mode_i -// 2'b00: safe mode, no modificaion will be added to the transfer -// 2'b01: fast mode, only apply to L1-aligned address -// 2'b10: dupl mode, only apply to partition-aligned address -// 2'b11: NOP - `include "common_cells/registers.svh" module idma_distributed_midend_v2 #( @@ -37,7 +31,6 @@ module idma_distributed_midend_v2 #( output logic ready_o, output meta_t meta_o, // partition related signals - input logic [1:0] dma_mode_i, input logic [7:0] allocated_size_i, // Master output burst_req_t [NoMstPorts-1:0] burst_req_o, @@ -169,45 +162,38 @@ module idma_distributed_midend_v2 #( // Feed through the address bits burst_req_o[i].src = burst_req_i.src; burst_req_o[i].dst = burst_req_i.dst; + // Modify lower addresses bits and size if (($unsigned(start_addr) >= (i+1)*DmaRegionWidth) || ($unsigned(end_addr) <= i*DmaRegionWidth)) begin - // We are not involved in the transfer - if ( (dma_mode_i == 2'b00) || (dma_mode_i == 2'b11) ) begin // safe mode - burst_req_o[i].src = '0; - burst_req_o[i].dst = '0; - burst_req_o[i].num_bytes = 1; - // Make handshake ourselves - valid_o[i] = 1'b0; - ready[i] = 1'b1; - // Inject trans complete - if (valid[i]) begin - tie_off_trans_complete_d[i] = 1'b1; - end - end 
else if (dma_mode_i == 2'b01) begin // fast mode - burst_req_o[i].num_bytes = (burst_req_i.num_bytes= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin - burst_req_o[i].src = burst_req_i.src+i*DmaRegionWidth; - burst_req_o[i].dst = burst_req_i.dst+i*allocated_size_i*DmaRegionWidth; - end else begin - // L2 --> L1 - if (burst_req_i.num_bytes<=DmaRegionWidth )begin - burst_req_o[i].src = burst_req_i.src+i*allocated_size_i*DmaRegionWidth; - end else if (i==2) begin - burst_req_o[i].src = burst_req_i.src+i*allocated_size_i*DmaRegionWidth; - end else if (i==3) begin - burst_req_o[i].src = burst_req_i.src+(i-1)*allocated_size_i*DmaRegionWidth + DmaRegionWidth; - end - burst_req_o[i].dst = burst_req_i.dst+i*DmaRegionWidth; +`ifdef DAS + burst_req_o[i].num_bytes = (burst_req_i.num_bytes= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin + burst_req_o[i].src = burst_req_i.src+i*DmaRegionWidth; + burst_req_o[i].dst = burst_req_i.dst+i*allocated_size_i*DmaRegionWidth; + end else begin + // L2 --> L1 + if (burst_req_i.num_bytes<=DmaRegionWidth )begin + burst_req_o[i].src = burst_req_i.src+i*allocated_size_i*DmaRegionWidth; + end else if (i==2) begin + burst_req_o[i].src = burst_req_i.src+i*allocated_size_i*DmaRegionWidth; + end else if (i==3) begin + burst_req_o[i].src = burst_req_i.src+(i-1)*allocated_size_i*DmaRegionWidth + DmaRegionWidth; end - end else if (dma_mode_i == 2'b10) begin // duplication mode: only consider L2 --> L1 - if (($unsigned(burst_req_i.dst) >= DmaRegionStart) && ($unsigned(burst_req_i.dst) < DmaRegionEnd)) begin - // L2 ------> L1 - burst_req_o[i].num_bytes = (burst_req_i.num_bytes= i*DmaRegionWidth)) begin // First (and potentially only) slice @@ -217,7 +203,6 @@ module idma_distributed_midend_v2 #( end else begin burst_req_o[i].num_bytes = DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; end - // end else if (($unsigned(start_addr) < i*DmaRegionWidth)) begin end else begin // Round up the address to the next 
DMA boundary diff --git a/hardware/deps/idma/src/midends/idma_split_midend_v2.sv b/hardware/deps/idma/src/midends/idma_split_midend_v2.sv index 3dc04b088..24c44d926 100644 --- a/hardware/deps/idma/src/midends/idma_split_midend_v2.sv +++ b/hardware/deps/idma/src/midends/idma_split_midend_v2.sv @@ -28,17 +28,16 @@ module idma_split_midend_v2 #( output meta_t meta_o, // Partition related signals - input logic [1:0] dma_mode_i, input logic [3:0][7:0] group_factor_i, input logic [3:0][7:0] allocated_size_i, input logic [3:0][AddrWidth-1:0] start_addr_scheme_i, output logic [7:0] allocated_size_o, // Master - output burst_req_t burst_req_o, - output logic valid_o, - input logic ready_i, - input meta_t meta_i + output burst_req_t burst_req_o, + output logic valid_o, + input logic ready_i, + input meta_t meta_i ); // ------ Parameter Settings ------ // @@ -198,182 +197,77 @@ module idma_split_midend_v2 #( unique case (state_q) Idle: begin - if (valid_i) begin - unique case (dma_mode_i) - // ------ Std Mode ------ // - 2'b00: begin - if ( (DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]) >= burst_req_i.num_bytes) begin - burst_req_o = burst_req_i; - valid_o = 1'b1; - ready_o = ready_i; - req_valid = ready_i; - end else begin - // Store and acknowledge - req_d = burst_req_i; - ready_o = 1'b1; - burst_req_o = burst_req_i; - // Calculate the size for the 1st burst - burst_req_o.num_bytes = DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; - valid_o = 1'b1; - - // Modify the stored info after first beat sent - if (ready_i) begin - req_d.num_bytes -= DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; - req_d.src += DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; - req_d.dst += DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; - req_valid = 1'b1; - end - state_d = Busy; - end + if (valid_i) begin + +`ifdef DAS + if ( (PartitionDmaRegionWidth-masked_start_addr) >= burst_req_i.num_bytes ) begin + burst_req_o = burst_req_i; + // Address in SPM need to be translated 
back to physical address + if (spm2dram) begin + burst_req_o.src = post_scramble_src; + end else begin + burst_req_o.dst = post_scramble_dst; end - - // ------ Fast Mode ------ // - 2'b01: begin - if ( (PartitionDmaRegionWidth-masked_start_addr) >= burst_req_i.num_bytes ) begin - burst_req_o = burst_req_i; - // Address in SPM need to be translated back to physical address - if (spm2dram) begin - burst_req_o.src = post_scramble_src; - end else begin - burst_req_o.dst = post_scramble_dst; - end - valid_o = 1'b1; - ready_o = ready_i; - req_valid = ready_i; - end else begin - // Store and acknowledge - req_d = burst_req_i; - ready_o = 1'b1; - burst_req_o = burst_req_i; - // Calculate the size for the 1st burst - burst_req_o.num_bytes = PartitionDmaRegionWidth-masked_start_addr; - // TODO (bowwang): parameterize - req_d.num_bytes = (group_factor_sel <= 8) ? (allocated_size_sel*DmaBackendWidth) : (allocated_size_sel*PartitionDmaRegionWidth); - if (spm2dram) begin - burst_req_o.src = post_scramble_src; - req_d.src = post_scramble_src; - end else begin - burst_req_o.dst = post_scramble_dst; - req_d.dst = post_scramble_dst; - end - valid_o = 1'b1; - - // Modify the stored info after first beat sent - if (ready_i) begin - // TODO (bowwang): May not be mecessary to consider alignment - req_d.num_bytes -= PartitionDmaRegionWidth-masked_start_addr; - if (spm2dram) begin - req_d.src += DmaRegionWidth-masked_start_addr; - req_d.dst += PartitionDmaRegionWidth-masked_start_addr; - end else begin - req_d.src += PartitionDmaRegionWidth-masked_start_addr; - req_d.dst += DmaRegionWidth-masked_start_addr; - end - req_valid = 1'b1; - beat_cnt_d = 1; - end - state_d = Busy; - end - end - - // ------ Duplicate Mode ------ // - 2'b10: begin - if ( (PartitionDmaRegionWidth-masked_start_addr) >= burst_req_i.num_bytes ) begin - burst_req_o = burst_req_i; - // Address in SPM need to be translated back to physical address - if (spm2dram) begin - burst_req_o.src = post_scramble_src; - end else 
begin - burst_req_o.dst = post_scramble_dst; - end - valid_o = 1'b1; - ready_o = ready_i; - req_valid = ready_i; - end else begin - // Store and acknowledge - req_d = burst_req_i; - ready_o = 1'b1; - burst_req_o = burst_req_i; - // Calculate the size for the 1st burst - burst_req_o.num_bytes = PartitionDmaRegionWidth-masked_start_addr; - // TODO (bowwang): parameterize - req_d.num_bytes = (group_factor_sel <= 8) ? (allocated_size_sel*DmaBackendWidth) : (allocated_size_sel*PartitionDmaRegionWidth); - dup_start_addr_d = burst_req_i.src; - if (spm2dram) begin - burst_req_o.src = post_scramble_src; - req_d.src = post_scramble_src; - end else begin - burst_req_o.dst = post_scramble_dst; - req_d.dst = post_scramble_dst; - end - valid_o = 1'b1; - - // Modify the stored info after first beat sent - if (ready_i) begin - // TODO (bowwang): May not be mecessary to consider alignment - req_d.num_bytes -= PartitionDmaRegionWidth-masked_start_addr; - if (spm2dram) begin - req_d.src += DmaRegionWidth-masked_start_addr; - req_d.dst += PartitionDmaRegionWidth-masked_start_addr; - end else begin - req_d.src += PartitionDmaRegionWidth-masked_start_addr; - req_d.dst += DmaRegionWidth-masked_start_addr; - end - req_valid = 1'b1; - beat_cnt_d = 1; - end - state_d = Busy; - end - end - - 2'b11: begin // Partition_Std Mode - if ( (PartitionDmaRegionWidth-masked_start_addr) >= burst_req_i.num_bytes ) begin - burst_req_o = burst_req_i; - // Address in SPM need to be translated back to physical address - if (spm2dram) begin - burst_req_o.src = post_scramble_src; - end else begin - burst_req_o.dst = post_scramble_dst; - end - valid_o = 1'b1; - ready_o = ready_i; - req_valid = ready_i; + valid_o = 1'b1; + ready_o = ready_i; + req_valid = ready_i; + end else begin + // Store and acknowledge + req_d = burst_req_i; + ready_o = 1'b1; + burst_req_o = burst_req_i; + // Calculate the size for the 1st burst + burst_req_o.num_bytes = PartitionDmaRegionWidth-masked_start_addr; + // TODO (bowwang): 
parameterize + req_d.num_bytes = (group_factor_sel <= 8) ? (allocated_size_sel*DmaBackendWidth) : (allocated_size_sel*PartitionDmaRegionWidth); + if (spm2dram) begin + burst_req_o.src = post_scramble_src; + req_d.src = post_scramble_src; + end else begin + burst_req_o.dst = post_scramble_dst; + req_d.dst = post_scramble_dst; + end + valid_o = 1'b1; + // Modify the stored info after first beat sent + if (ready_i) begin + // TODO (bowwang): May not be necessary to consider alignment + req_d.num_bytes -= PartitionDmaRegionWidth-masked_start_addr; + if (spm2dram) begin + req_d.src += DmaRegionWidth-masked_start_addr; + req_d.dst += PartitionDmaRegionWidth-masked_start_addr; end else begin - // Store and acknowledge - req_d = burst_req_i; - ready_o = 1'b1; - burst_req_o = burst_req_i; - // Calculate the size for the 1st burst - burst_req_o.num_bytes = PartitionDmaRegionWidth-masked_start_addr; - if (spm2dram) begin - burst_req_o.src = post_scramble_src; - req_d.src = post_scramble_src; - end else begin - burst_req_o.dst = post_scramble_dst; - req_d.dst = post_scramble_dst; - end - valid_o = 1'b1; - - // Modify the stored info after first beat sent - if (ready_i) begin - // TODO (bowwang): May not be mecessary to consider alignment - req_d.num_bytes -= PartitionDmaRegionWidth-masked_start_addr; - if (spm2dram) begin - req_d.src += DmaRegionWidth-masked_start_addr; - req_d.dst += PartitionDmaRegionWidth-masked_start_addr; - end else begin - req_d.src += PartitionDmaRegionWidth-masked_start_addr; - req_d.dst += DmaRegionWidth-masked_start_addr; - end - req_valid = 1'b1; - beat_cnt_d = 1; - end - state_d = Busy; + req_d.src += PartitionDmaRegionWidth-masked_start_addr; + req_d.dst += DmaRegionWidth-masked_start_addr; end - end - - default: /*do nothing*/; - endcase + req_valid = 1'b1; + beat_cnt_d = 1; + end + state_d = Busy; + end +`else + if ( (DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]) >= burst_req_i.num_bytes) begin + burst_req_o = burst_req_i; + valid_o =
1'b1; + ready_o = ready_i; + req_valid = ready_i; + end else begin + // Store and acknowledge + req_d = burst_req_i; + ready_o = 1'b1; + burst_req_o = burst_req_i; + // Calculate the size for the 1st burst + burst_req_o.num_bytes = DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; + valid_o = 1'b1; + // Modify the stored info after first beat sent + if (ready_i) begin + req_d.num_bytes -= DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; + req_d.src += DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; + req_d.dst += DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; + req_valid = 1'b1; + end + state_d = Busy; + end +`endif end end // Idle @@ -383,63 +277,52 @@ module idma_split_midend_v2 #( valid_o = 1'b1; req_valid = ready_i; - unique case (dma_mode_i) - // ------ Std Mode ------ // - 2'b00: begin - if ($unsigned(req_q.num_bytes) <= $unsigned(DmaRegionWidth)) begin - if (ready_i) begin - state_d = Idle; +`ifdef PARTITION + if ($unsigned(req_q.num_bytes) <= $unsigned(PartitionDmaRegionWidth)) begin + // Last split + if (ready_i) begin + state_d = Idle; + beat_cnt_d = beat_cnt_q + 1; + end + end else begin + burst_req_o.num_bytes = PartitionDmaRegionWidth; + if (ready_i) begin + req_d.num_bytes = req_q.num_bytes - PartitionDmaRegionWidth; + beat_cnt_d = beat_cnt_q + 1; + if (spm2dram) begin + if (shift_row == allocated_size_sel-1) begin + req_d.src = req_q.src + PartitionDmaRegionWidth - shift_row*DmaRegionWidth; + end else begin + req_d.src = req_q.src + DmaRegionWidth; end + req_d.dst = req_q.dst + PartitionDmaRegionWidth; end else begin - burst_req_o.num_bytes = DmaRegionWidth; - if (ready_i) begin - req_d.num_bytes = req_q.num_bytes - DmaRegionWidth; - req_d.src = req_q.src + DmaRegionWidth; + req_d.src = req_q.src + PartitionDmaRegionWidth; + if (shift_row == allocated_size_sel-1) begin + req_d.dst = req_q.dst + PartitionDmaRegionWidth - shift_row*DmaRegionWidth; + if (dma_mode_i == 2'b10) begin // duplication mode: recover start adddr + req_d.src = 
dup_start_addr_q; + end + end else begin req_d.dst = req_q.dst + DmaRegionWidth; end - end - end - - 2'b01, - 2'b10, - 2'b11: begin - if ($unsigned(req_q.num_bytes) <= $unsigned(PartitionDmaRegionWidth)) begin - // Last split - if (ready_i) begin - state_d = Idle; - beat_cnt_d = beat_cnt_q + 1; - end - end else begin - burst_req_o.num_bytes = PartitionDmaRegionWidth; - if (ready_i) begin - req_d.num_bytes = req_q.num_bytes - PartitionDmaRegionWidth; - beat_cnt_d = beat_cnt_q + 1; - - if (spm2dram) begin - if (shift_row == allocated_size_sel-1) begin - req_d.src = req_q.src + PartitionDmaRegionWidth - shift_row*DmaRegionWidth; - end else begin - req_d.src = req_q.src + DmaRegionWidth; - end - req_d.dst = req_q.dst + PartitionDmaRegionWidth; - end else begin - req_d.src = req_q.src + PartitionDmaRegionWidth; - if (shift_row == allocated_size_sel-1) begin - req_d.dst = req_q.dst + PartitionDmaRegionWidth - shift_row*DmaRegionWidth; - if (dma_mode_i == 2'b10) begin // duplication mode: recover start adddr - req_d.src = dup_start_addr_q; - end - end else begin - req_d.dst = req_q.dst + DmaRegionWidth; - end - end// spm2dram - end // ready_i - end - end // case {01, 10, 11} - - default: /*do nothing*/; - endcase - + end// spm2dram + end // ready_i + end +`else + if ($unsigned(req_q.num_bytes) <= $unsigned(DmaRegionWidth)) begin + if (ready_i) begin + state_d = Idle; + end + end else begin + burst_req_o.num_bytes = DmaRegionWidth; + if (ready_i) begin + req_d.num_bytes = req_q.num_bytes - DmaRegionWidth; + req_d.src = req_q.src + DmaRegionWidth; + req_d.dst = req_q.dst + DmaRegionWidth; + end + end +`endif end // Busy default: /*do nothing*/; endcase diff --git a/hardware/src/control_registers/control_registers.hjson b/hardware/src/control_registers/control_registers.hjson index 1ef33e86e..24a4fb363 100644 --- a/hardware/src/control_registers/control_registers.hjson +++ b/hardware/src/control_registers/control_registers.hjson @@ -71,6 +71,100 @@ hwqe: "true" fields: [{ 
bits: "31:0" }] }, + + { name: "partition_sel_0" + desc: "Tile grouping for partition 0" + swaccess: "wo" + hwaccess: "hro" + hwqe: "true" + resval: 128 // Number of tiles + fields: [{ bits: "31:0" }] + }, + { name: "partition_sel_1" + desc: "Tile grouping for partition 1" + swaccess: "wo" + hwaccess: "hro" + hwqe: "true" + resval: 128 // Number of tiles + fields: [{ bits: "31:0" }] + }, + { name: "partition_sel_2" + desc: "Tile grouping for partition 2" + swaccess: "wo" + hwaccess: "hro" + hwqe: "true" + resval: 128 // Number of tiles + fields: [{ bits: "31:0" }] + }, + { name: "partition_sel_3" + desc: "Tile grouping for partition 3" + swaccess: "wo" + hwaccess: "hro" + hwqe: "true" + resval: 128 // Number of tiles + fields: [{ bits: "31:0" }] + }, + { name: "allocated_size_0" + desc: "Allocated size on partition 0" + swaccess: "wo" + hwaccess: "hro" + hwqe: "true" + fields: [{ bits: "31:0" }] + }, + { name: "allocated_size_1" + desc: "Allocated size on partition 1" + swaccess: "wo" + hwaccess: "hro" + hwqe: "true" + fields: [{ bits: "31:0" }] + }, + { name: "allocated_size_2" + desc: "Allocated size on partition 2" + swaccess: "wo" + hwaccess: "hro" + hwqe: "true" + fields: [{ bits: "31:0" }] + }, + { name: "allocated_size_3" + desc: "Allocated size on partition 3" + swaccess: "wo" + hwaccess: "hro" + hwqe: "true" + fields: [{ bits: "31:0" }] + }, + { name: "start_addr_scheme_0" + desc: "Start address of scheme 0" + swaccess: "wo" + hwaccess: "hro" + hwqe: "true" + resval: 4194304 // 0x400000 starting point of the L1 + fields: [{ bits: "31:0" }] + }, + { name: "start_addr_scheme_1" + desc: "Start address of scheme 1" + swaccess: "wo" + hwaccess: "hro" + hwqe: "true" + resval: 4194304 // 0x400000 starting point of the L1 + fields: [{ bits: "31:0" }] + }, + { name: "start_addr_scheme_2" + desc: "Start address of scheme 2" + swaccess: "wo" + hwaccess: "hro" + hwqe: "true" + resval: 4194304 // 0x400000 starting point of the L1 + fields: [{ bits: "31:0"
}] + }, + { name: "start_addr_scheme_3" + desc: "Allocated size on partition 3" + swaccess: "wo" + hwaccess: "hro" + hwqe: "true" + resval: 4194304 // 0x400000 starting point of the L1 + fields: [{ bits: "31:0" }] + }, + { name: "tcdm_start_address" desc: "TCDM Start Address Register" swaccess: "ro" diff --git a/hardware/src/control_registers/control_registers_reg_pkg.sv b/hardware/src/control_registers/control_registers_reg_pkg.sv index 0291dc527..d7d6cc373 100644 --- a/hardware/src/control_registers/control_registers_reg_pkg.sv +++ b/hardware/src/control_registers/control_registers_reg_pkg.sv @@ -11,7 +11,7 @@ package control_registers_reg_pkg; parameter int MAX_NumGroups = 8; // Address widths within the block - parameter int BlockAw = 7; + parameter int BlockAw = 8; //////////////////////////// // Typedefs for registers // @@ -46,6 +46,66 @@ package control_registers_reg_pkg; logic qe; } control_registers_reg2hw_wake_up_offst_reg_t; + typedef struct packed { + logic [31:0] q; + logic qe; + } control_registers_reg2hw_partition_sel_0_reg_t; + + typedef struct packed { + logic [31:0] q; + logic qe; + } control_registers_reg2hw_partition_sel_1_reg_t; + + typedef struct packed { + logic [31:0] q; + logic qe; + } control_registers_reg2hw_partition_sel_2_reg_t; + + typedef struct packed { + logic [31:0] q; + logic qe; + } control_registers_reg2hw_partition_sel_3_reg_t; + + typedef struct packed { + logic [31:0] q; + logic qe; + } control_registers_reg2hw_allocated_size_0_reg_t; + + typedef struct packed { + logic [31:0] q; + logic qe; + } control_registers_reg2hw_allocated_size_1_reg_t; + + typedef struct packed { + logic [31:0] q; + logic qe; + } control_registers_reg2hw_allocated_size_2_reg_t; + + typedef struct packed { + logic [31:0] q; + logic qe; + } control_registers_reg2hw_allocated_size_3_reg_t; + + typedef struct packed { + logic [31:0] q; + logic qe; + } control_registers_reg2hw_start_addr_scheme_0_reg_t; + + typedef struct packed { + logic [31:0] q; + 
logic qe; + } control_registers_reg2hw_start_addr_scheme_1_reg_t; + + typedef struct packed { + logic [31:0] q; + logic qe; + } control_registers_reg2hw_start_addr_scheme_2_reg_t; + + typedef struct packed { + logic [31:0] q; + logic qe; + } control_registers_reg2hw_start_addr_scheme_3_reg_t; + typedef struct packed { logic [31:0] q; } control_registers_reg2hw_ro_cache_enable_reg_t; @@ -86,12 +146,24 @@ package control_registers_reg_pkg; // Register -> HW type typedef struct packed { - control_registers_reg2hw_eoc_reg_t eoc; // [755:724] - control_registers_reg2hw_wake_up_reg_t wake_up; // [723:691] - control_registers_reg2hw_wake_up_tile_mreg_t [7:0] wake_up_tile; // [690:427] - control_registers_reg2hw_wake_up_group_reg_t wake_up_group; // [426:394] - control_registers_reg2hw_wake_up_strd_reg_t wake_up_strd; // [393:361] - control_registers_reg2hw_wake_up_offst_reg_t wake_up_offst; // [360:328] + control_registers_reg2hw_eoc_reg_t eoc; // [1151:1120] + control_registers_reg2hw_wake_up_reg_t wake_up; // [1119:1087] + control_registers_reg2hw_wake_up_tile_mreg_t [7:0] wake_up_tile; // [1086:823] + control_registers_reg2hw_wake_up_group_reg_t wake_up_group; // [822:790] + control_registers_reg2hw_wake_up_strd_reg_t wake_up_strd; // [789:757] + control_registers_reg2hw_wake_up_offst_reg_t wake_up_offst; // [756:724] + control_registers_reg2hw_partition_sel_0_reg_t partition_sel_0; // [723:691] + control_registers_reg2hw_partition_sel_1_reg_t partition_sel_1; // [690:658] + control_registers_reg2hw_partition_sel_2_reg_t partition_sel_2; // [657:625] + control_registers_reg2hw_partition_sel_3_reg_t partition_sel_3; // [624:592] + control_registers_reg2hw_allocated_size_0_reg_t allocated_size_0; // [591:559] + control_registers_reg2hw_allocated_size_1_reg_t allocated_size_1; // [558:526] + control_registers_reg2hw_allocated_size_2_reg_t allocated_size_2; // [525:493] + control_registers_reg2hw_allocated_size_3_reg_t allocated_size_3; // [492:460] + 
control_registers_reg2hw_start_addr_scheme_0_reg_t start_addr_scheme_0; // [459:427] + control_registers_reg2hw_start_addr_scheme_1_reg_t start_addr_scheme_1; // [426:394] + control_registers_reg2hw_start_addr_scheme_2_reg_t start_addr_scheme_2; // [393:361] + control_registers_reg2hw_start_addr_scheme_3_reg_t start_addr_scheme_3; // [360:328] control_registers_reg2hw_ro_cache_enable_reg_t ro_cache_enable; // [327:296] control_registers_reg2hw_ro_cache_flush_reg_t ro_cache_flush; // [295:264] control_registers_reg2hw_ro_cache_start_mreg_t [3:0] ro_cache_start; // [263:132] @@ -108,32 +180,44 @@ package control_registers_reg_pkg; } control_registers_hw2reg_t; // Register offsets - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_EOC_OFFSET = 7'h 0; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_OFFSET = 7'h 4; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_0_OFFSET = 7'h 8; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_1_OFFSET = 7'h c; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_2_OFFSET = 7'h 10; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_3_OFFSET = 7'h 14; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_4_OFFSET = 7'h 18; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_5_OFFSET = 7'h 1c; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_6_OFFSET = 7'h 20; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_7_OFFSET = 7'h 24; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_GROUP_OFFSET = 7'h 28; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_STRD_OFFSET = 7'h 2c; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_OFFST_OFFSET = 7'h 30; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_TCDM_START_ADDRESS_OFFSET = 7'h 34; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_TCDM_END_ADDRESS_OFFSET = 7'h 38; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_NR_CORES_REG_OFFSET = 7'h 3c; - parameter logic 
[BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_ENABLE_OFFSET = 7'h 40; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_FLUSH_OFFSET = 7'h 44; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_START_0_OFFSET = 7'h 48; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_START_1_OFFSET = 7'h 4c; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_START_2_OFFSET = 7'h 50; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_START_3_OFFSET = 7'h 54; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_END_0_OFFSET = 7'h 58; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_END_1_OFFSET = 7'h 5c; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_END_2_OFFSET = 7'h 60; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_END_3_OFFSET = 7'h 64; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_EOC_OFFSET = 8'h 0; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_OFFSET = 8'h 4; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_0_OFFSET = 8'h 8; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_1_OFFSET = 8'h c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_2_OFFSET = 8'h 10; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_3_OFFSET = 8'h 14; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_4_OFFSET = 8'h 18; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_5_OFFSET = 8'h 1c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_6_OFFSET = 8'h 20; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_7_OFFSET = 8'h 24; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_GROUP_OFFSET = 8'h 28; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_STRD_OFFSET = 8'h 2c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_OFFST_OFFSET = 8'h 30; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_PARTITION_SEL_0_OFFSET = 8'h 34; + parameter logic [BlockAw-1:0] 
CONTROL_REGISTERS_PARTITION_SEL_1_OFFSET = 8'h 38; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_PARTITION_SEL_2_OFFSET = 8'h 3c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_PARTITION_SEL_3_OFFSET = 8'h 40; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_ALLOCATED_SIZE_0_OFFSET = 8'h 44; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_ALLOCATED_SIZE_1_OFFSET = 8'h 48; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_ALLOCATED_SIZE_2_OFFSET = 8'h 4c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_ALLOCATED_SIZE_3_OFFSET = 8'h 50; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_START_ADDR_SCHEME_0_OFFSET = 8'h 54; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_START_ADDR_SCHEME_1_OFFSET = 8'h 58; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_START_ADDR_SCHEME_2_OFFSET = 8'h 5c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_START_ADDR_SCHEME_3_OFFSET = 8'h 60; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_TCDM_START_ADDRESS_OFFSET = 8'h 64; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_TCDM_END_ADDRESS_OFFSET = 8'h 68; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_NR_CORES_REG_OFFSET = 8'h 6c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_ENABLE_OFFSET = 8'h 70; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_FLUSH_OFFSET = 8'h 74; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_START_0_OFFSET = 8'h 78; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_START_1_OFFSET = 8'h 7c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_START_2_OFFSET = 8'h 80; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_START_3_OFFSET = 8'h 84; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_END_0_OFFSET = 8'h 88; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_END_1_OFFSET = 8'h 8c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_END_2_OFFSET = 8'h 90; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_END_3_OFFSET = 8'h 94; // Reset values for 
hwext registers and their fields parameter logic [31:0] CONTROL_REGISTERS_TCDM_START_ADDRESS_RESVAL = 32'h 0; @@ -163,6 +247,18 @@ package control_registers_reg_pkg; CONTROL_REGISTERS_WAKE_UP_GROUP, CONTROL_REGISTERS_WAKE_UP_STRD, CONTROL_REGISTERS_WAKE_UP_OFFST, + CONTROL_REGISTERS_PARTITION_SEL_0, + CONTROL_REGISTERS_PARTITION_SEL_1, + CONTROL_REGISTERS_PARTITION_SEL_2, + CONTROL_REGISTERS_PARTITION_SEL_3, + CONTROL_REGISTERS_ALLOCATED_SIZE_0, + CONTROL_REGISTERS_ALLOCATED_SIZE_1, + CONTROL_REGISTERS_ALLOCATED_SIZE_2, + CONTROL_REGISTERS_ALLOCATED_SIZE_3, + CONTROL_REGISTERS_START_ADDR_SCHEME_0, + CONTROL_REGISTERS_START_ADDR_SCHEME_1, + CONTROL_REGISTERS_START_ADDR_SCHEME_2, + CONTROL_REGISTERS_START_ADDR_SCHEME_3, CONTROL_REGISTERS_TCDM_START_ADDRESS, CONTROL_REGISTERS_TCDM_END_ADDRESS, CONTROL_REGISTERS_NR_CORES_REG, @@ -179,7 +275,7 @@ package control_registers_reg_pkg; } control_registers_id_e; // Register width information to check illegal writes - parameter logic [3:0] CONTROL_REGISTERS_PERMIT [26] = '{ + parameter logic [3:0] CONTROL_REGISTERS_PERMIT [38] = '{ 4'b 1111, // index[ 0] CONTROL_REGISTERS_EOC 4'b 1111, // index[ 1] CONTROL_REGISTERS_WAKE_UP 4'b 1111, // index[ 2] CONTROL_REGISTERS_WAKE_UP_TILE_0 @@ -193,19 +289,31 @@ package control_registers_reg_pkg; 4'b 1111, // index[10] CONTROL_REGISTERS_WAKE_UP_GROUP 4'b 1111, // index[11] CONTROL_REGISTERS_WAKE_UP_STRD 4'b 1111, // index[12] CONTROL_REGISTERS_WAKE_UP_OFFST - 4'b 1111, // index[13] CONTROL_REGISTERS_TCDM_START_ADDRESS - 4'b 1111, // index[14] CONTROL_REGISTERS_TCDM_END_ADDRESS - 4'b 1111, // index[15] CONTROL_REGISTERS_NR_CORES_REG - 4'b 1111, // index[16] CONTROL_REGISTERS_RO_CACHE_ENABLE - 4'b 1111, // index[17] CONTROL_REGISTERS_RO_CACHE_FLUSH - 4'b 1111, // index[18] CONTROL_REGISTERS_RO_CACHE_START_0 - 4'b 1111, // index[19] CONTROL_REGISTERS_RO_CACHE_START_1 - 4'b 1111, // index[20] CONTROL_REGISTERS_RO_CACHE_START_2 - 4'b 1111, // index[21] CONTROL_REGISTERS_RO_CACHE_START_3 - 4'b 
1111, // index[22] CONTROL_REGISTERS_RO_CACHE_END_0 - 4'b 1111, // index[23] CONTROL_REGISTERS_RO_CACHE_END_1 - 4'b 1111, // index[24] CONTROL_REGISTERS_RO_CACHE_END_2 - 4'b 1111 // index[25] CONTROL_REGISTERS_RO_CACHE_END_3 + 4'b 1111, // index[13] CONTROL_REGISTERS_PARTITION_SEL_0 + 4'b 1111, // index[14] CONTROL_REGISTERS_PARTITION_SEL_1 + 4'b 1111, // index[15] CONTROL_REGISTERS_PARTITION_SEL_2 + 4'b 1111, // index[16] CONTROL_REGISTERS_PARTITION_SEL_3 + 4'b 1111, // index[17] CONTROL_REGISTERS_ALLOCATED_SIZE_0 + 4'b 1111, // index[18] CONTROL_REGISTERS_ALLOCATED_SIZE_1 + 4'b 1111, // index[19] CONTROL_REGISTERS_ALLOCATED_SIZE_2 + 4'b 1111, // index[20] CONTROL_REGISTERS_ALLOCATED_SIZE_3 + 4'b 1111, // index[21] CONTROL_REGISTERS_START_ADDR_SCHEME_0 + 4'b 1111, // index[22] CONTROL_REGISTERS_START_ADDR_SCHEME_1 + 4'b 1111, // index[23] CONTROL_REGISTERS_START_ADDR_SCHEME_2 + 4'b 1111, // index[24] CONTROL_REGISTERS_START_ADDR_SCHEME_3 + 4'b 1111, // index[25] CONTROL_REGISTERS_TCDM_START_ADDRESS + 4'b 1111, // index[26] CONTROL_REGISTERS_TCDM_END_ADDRESS + 4'b 1111, // index[27] CONTROL_REGISTERS_NR_CORES_REG + 4'b 1111, // index[28] CONTROL_REGISTERS_RO_CACHE_ENABLE + 4'b 1111, // index[29] CONTROL_REGISTERS_RO_CACHE_FLUSH + 4'b 1111, // index[30] CONTROL_REGISTERS_RO_CACHE_START_0 + 4'b 1111, // index[31] CONTROL_REGISTERS_RO_CACHE_START_1 + 4'b 1111, // index[32] CONTROL_REGISTERS_RO_CACHE_START_2 + 4'b 1111, // index[33] CONTROL_REGISTERS_RO_CACHE_START_3 + 4'b 1111, // index[34] CONTROL_REGISTERS_RO_CACHE_END_0 + 4'b 1111, // index[35] CONTROL_REGISTERS_RO_CACHE_END_1 + 4'b 1111, // index[36] CONTROL_REGISTERS_RO_CACHE_END_2 + 4'b 1111 // index[37] CONTROL_REGISTERS_RO_CACHE_END_3 }; endpackage diff --git a/hardware/src/control_registers/control_registers_reg_top.sv b/hardware/src/control_registers/control_registers_reg_top.sv index 9258d111d..a4ec1d7c3 100644 --- a/hardware/src/control_registers/control_registers_reg_top.sv +++ 
b/hardware/src/control_registers/control_registers_reg_top.sv @@ -10,7 +10,7 @@ module control_registers_reg_top #( parameter type reg_req_t = logic, parameter type reg_rsp_t = logic, - parameter int AW = 7 + parameter int AW = 8 ) ( input logic clk_i, input logic rst_ni, @@ -95,6 +95,30 @@ module control_registers_reg_top #( logic wake_up_strd_we; logic [31:0] wake_up_offst_wd; logic wake_up_offst_we; + logic [31:0] partition_sel_0_wd; + logic partition_sel_0_we; + logic [31:0] partition_sel_1_wd; + logic partition_sel_1_we; + logic [31:0] partition_sel_2_wd; + logic partition_sel_2_we; + logic [31:0] partition_sel_3_wd; + logic partition_sel_3_we; + logic [31:0] allocated_size_0_wd; + logic allocated_size_0_we; + logic [31:0] allocated_size_1_wd; + logic allocated_size_1_we; + logic [31:0] allocated_size_2_wd; + logic allocated_size_2_we; + logic [31:0] allocated_size_3_wd; + logic allocated_size_3_we; + logic [31:0] start_addr_scheme_0_wd; + logic start_addr_scheme_0_we; + logic [31:0] start_addr_scheme_1_wd; + logic start_addr_scheme_1_we; + logic [31:0] start_addr_scheme_2_wd; + logic start_addr_scheme_2_we; + logic [31:0] start_addr_scheme_3_wd; + logic start_addr_scheme_3_we; logic [31:0] tcdm_start_address_qs; logic tcdm_start_address_re; logic [31:0] tcdm_end_address_qs; @@ -482,6 +506,318 @@ module control_registers_reg_top #( ); + // R[partition_sel_0]: V(False) + + prim_subreg #( + .DW (32), + .SWACCESS("WO"), + .RESVAL (32'h80) + ) u_partition_sel_0 ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + + // from register interface + .we (partition_sel_0_we), + .wd (partition_sel_0_wd), + + // from internal hardware + .de (1'b0), + .d ('0 ), + + // to internal hardware + .qe (reg2hw.partition_sel_0.qe), + .q (reg2hw.partition_sel_0.q ), + + .qs () + ); + + + // R[partition_sel_1]: V(False) + + prim_subreg #( + .DW (32), + .SWACCESS("WO"), + .RESVAL (32'h80) + ) u_partition_sel_1 ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + + // from register interface + .we 
(partition_sel_1_we), + .wd (partition_sel_1_wd), + + // from internal hardware + .de (1'b0), + .d ('0 ), + + // to internal hardware + .qe (reg2hw.partition_sel_1.qe), + .q (reg2hw.partition_sel_1.q ), + + .qs () + ); + + + // R[partition_sel_2]: V(False) + + prim_subreg #( + .DW (32), + .SWACCESS("WO"), + .RESVAL (32'h80) + ) u_partition_sel_2 ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + + // from register interface + .we (partition_sel_2_we), + .wd (partition_sel_2_wd), + + // from internal hardware + .de (1'b0), + .d ('0 ), + + // to internal hardware + .qe (reg2hw.partition_sel_2.qe), + .q (reg2hw.partition_sel_2.q ), + + .qs () + ); + + + // R[partition_sel_3]: V(False) + + prim_subreg #( + .DW (32), + .SWACCESS("WO"), + .RESVAL (32'h80) + ) u_partition_sel_3 ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + + // from register interface + .we (partition_sel_3_we), + .wd (partition_sel_3_wd), + + // from internal hardware + .de (1'b0), + .d ('0 ), + + // to internal hardware + .qe (reg2hw.partition_sel_3.qe), + .q (reg2hw.partition_sel_3.q ), + + .qs () + ); + + + // R[allocated_size_0]: V(False) + + prim_subreg #( + .DW (32), + .SWACCESS("WO"), + .RESVAL (32'h0) + ) u_allocated_size_0 ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + + // from register interface + .we (allocated_size_0_we), + .wd (allocated_size_0_wd), + + // from internal hardware + .de (1'b0), + .d ('0 ), + + // to internal hardware + .qe (reg2hw.allocated_size_0.qe), + .q (reg2hw.allocated_size_0.q ), + + .qs () + ); + + + // R[allocated_size_1]: V(False) + + prim_subreg #( + .DW (32), + .SWACCESS("WO"), + .RESVAL (32'h0) + ) u_allocated_size_1 ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + + // from register interface + .we (allocated_size_1_we), + .wd (allocated_size_1_wd), + + // from internal hardware + .de (1'b0), + .d ('0 ), + + // to internal hardware + .qe (reg2hw.allocated_size_1.qe), + .q (reg2hw.allocated_size_1.q ), + + .qs () + ); + + + // R[allocated_size_2]: V(False) + + prim_subreg #( + 
.DW (32), + .SWACCESS("WO"), + .RESVAL (32'h0) + ) u_allocated_size_2 ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + + // from register interface + .we (allocated_size_2_we), + .wd (allocated_size_2_wd), + + // from internal hardware + .de (1'b0), + .d ('0 ), + + // to internal hardware + .qe (reg2hw.allocated_size_2.qe), + .q (reg2hw.allocated_size_2.q ), + + .qs () + ); + + + // R[allocated_size_3]: V(False) + + prim_subreg #( + .DW (32), + .SWACCESS("WO"), + .RESVAL (32'h0) + ) u_allocated_size_3 ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + + // from register interface + .we (allocated_size_3_we), + .wd (allocated_size_3_wd), + + // from internal hardware + .de (1'b0), + .d ('0 ), + + // to internal hardware + .qe (reg2hw.allocated_size_3.qe), + .q (reg2hw.allocated_size_3.q ), + + .qs () + ); + + + // R[start_addr_scheme_0]: V(False) + + prim_subreg #( + .DW (32), + .SWACCESS("WO"), + .RESVAL (32'h400000) + ) u_start_addr_scheme_0 ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + + // from register interface + .we (start_addr_scheme_0_we), + .wd (start_addr_scheme_0_wd), + + // from internal hardware + .de (1'b0), + .d ('0 ), + + // to internal hardware + .qe (reg2hw.start_addr_scheme_0.qe), + .q (reg2hw.start_addr_scheme_0.q ), + + .qs () + ); + + + // R[start_addr_scheme_1]: V(False) + + prim_subreg #( + .DW (32), + .SWACCESS("WO"), + .RESVAL (32'h400000) + ) u_start_addr_scheme_1 ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + + // from register interface + .we (start_addr_scheme_1_we), + .wd (start_addr_scheme_1_wd), + + // from internal hardware + .de (1'b0), + .d ('0 ), + + // to internal hardware + .qe (reg2hw.start_addr_scheme_1.qe), + .q (reg2hw.start_addr_scheme_1.q ), + + .qs () + ); + + + // R[start_addr_scheme_2]: V(False) + + prim_subreg #( + .DW (32), + .SWACCESS("WO"), + .RESVAL (32'h400000) + ) u_start_addr_scheme_2 ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + + // from register interface + .we (start_addr_scheme_2_we), + .wd (start_addr_scheme_2_wd), + + // 
from internal hardware + .de (1'b0), + .d ('0 ), + + // to internal hardware + .qe (reg2hw.start_addr_scheme_2.qe), + .q (reg2hw.start_addr_scheme_2.q ), + + .qs () + ); + + + // R[start_addr_scheme_3]: V(False) + + prim_subreg #( + .DW (32), + .SWACCESS("WO"), + .RESVAL (32'h400000) + ) u_start_addr_scheme_3 ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + + // from register interface + .we (start_addr_scheme_3_we), + .wd (start_addr_scheme_3_wd), + + // from internal hardware + .de (1'b0), + .d ('0 ), + + // to internal hardware + .qe (reg2hw.start_addr_scheme_3.qe), + .q (reg2hw.start_addr_scheme_3.q ), + + .qs () + ); + + // R[tcdm_start_address]: V(True) prim_subreg_ext #( @@ -718,7 +1054,7 @@ module control_registers_reg_top #( - logic [25:0] addr_hit; + logic [37:0] addr_hit; always_comb begin addr_hit = '0; addr_hit[ 0] = (reg_addr == CONTROL_REGISTERS_EOC_OFFSET); @@ -734,19 +1070,31 @@ module control_registers_reg_top #( addr_hit[10] = (reg_addr == CONTROL_REGISTERS_WAKE_UP_GROUP_OFFSET); addr_hit[11] = (reg_addr == CONTROL_REGISTERS_WAKE_UP_STRD_OFFSET); addr_hit[12] = (reg_addr == CONTROL_REGISTERS_WAKE_UP_OFFST_OFFSET); - addr_hit[13] = (reg_addr == CONTROL_REGISTERS_TCDM_START_ADDRESS_OFFSET); - addr_hit[14] = (reg_addr == CONTROL_REGISTERS_TCDM_END_ADDRESS_OFFSET); - addr_hit[15] = (reg_addr == CONTROL_REGISTERS_NR_CORES_REG_OFFSET); - addr_hit[16] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_ENABLE_OFFSET); - addr_hit[17] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_FLUSH_OFFSET); - addr_hit[18] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_START_0_OFFSET); - addr_hit[19] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_START_1_OFFSET); - addr_hit[20] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_START_2_OFFSET); - addr_hit[21] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_START_3_OFFSET); - addr_hit[22] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_END_0_OFFSET); - addr_hit[23] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_END_1_OFFSET); - addr_hit[24] = (reg_addr == 
CONTROL_REGISTERS_RO_CACHE_END_2_OFFSET); - addr_hit[25] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_END_3_OFFSET); + addr_hit[13] = (reg_addr == CONTROL_REGISTERS_PARTITION_SEL_0_OFFSET); + addr_hit[14] = (reg_addr == CONTROL_REGISTERS_PARTITION_SEL_1_OFFSET); + addr_hit[15] = (reg_addr == CONTROL_REGISTERS_PARTITION_SEL_2_OFFSET); + addr_hit[16] = (reg_addr == CONTROL_REGISTERS_PARTITION_SEL_3_OFFSET); + addr_hit[17] = (reg_addr == CONTROL_REGISTERS_ALLOCATED_SIZE_0_OFFSET); + addr_hit[18] = (reg_addr == CONTROL_REGISTERS_ALLOCATED_SIZE_1_OFFSET); + addr_hit[19] = (reg_addr == CONTROL_REGISTERS_ALLOCATED_SIZE_2_OFFSET); + addr_hit[20] = (reg_addr == CONTROL_REGISTERS_ALLOCATED_SIZE_3_OFFSET); + addr_hit[21] = (reg_addr == CONTROL_REGISTERS_START_ADDR_SCHEME_0_OFFSET); + addr_hit[22] = (reg_addr == CONTROL_REGISTERS_START_ADDR_SCHEME_1_OFFSET); + addr_hit[23] = (reg_addr == CONTROL_REGISTERS_START_ADDR_SCHEME_2_OFFSET); + addr_hit[24] = (reg_addr == CONTROL_REGISTERS_START_ADDR_SCHEME_3_OFFSET); + addr_hit[25] = (reg_addr == CONTROL_REGISTERS_TCDM_START_ADDRESS_OFFSET); + addr_hit[26] = (reg_addr == CONTROL_REGISTERS_TCDM_END_ADDRESS_OFFSET); + addr_hit[27] = (reg_addr == CONTROL_REGISTERS_NR_CORES_REG_OFFSET); + addr_hit[28] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_ENABLE_OFFSET); + addr_hit[29] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_FLUSH_OFFSET); + addr_hit[30] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_START_0_OFFSET); + addr_hit[31] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_START_1_OFFSET); + addr_hit[32] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_START_2_OFFSET); + addr_hit[33] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_START_3_OFFSET); + addr_hit[34] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_END_0_OFFSET); + addr_hit[35] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_END_1_OFFSET); + addr_hit[36] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_END_2_OFFSET); + addr_hit[37] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_END_3_OFFSET); end assign addrmiss = (reg_re || reg_we) 
? ~|addr_hit : 1'b0 ; @@ -779,7 +1127,19 @@ module control_registers_reg_top #( (addr_hit[22] & (|(CONTROL_REGISTERS_PERMIT[22] & ~reg_be))) | (addr_hit[23] & (|(CONTROL_REGISTERS_PERMIT[23] & ~reg_be))) | (addr_hit[24] & (|(CONTROL_REGISTERS_PERMIT[24] & ~reg_be))) | - (addr_hit[25] & (|(CONTROL_REGISTERS_PERMIT[25] & ~reg_be))))); + (addr_hit[25] & (|(CONTROL_REGISTERS_PERMIT[25] & ~reg_be))) | + (addr_hit[26] & (|(CONTROL_REGISTERS_PERMIT[26] & ~reg_be))) | + (addr_hit[27] & (|(CONTROL_REGISTERS_PERMIT[27] & ~reg_be))) | + (addr_hit[28] & (|(CONTROL_REGISTERS_PERMIT[28] & ~reg_be))) | + (addr_hit[29] & (|(CONTROL_REGISTERS_PERMIT[29] & ~reg_be))) | + (addr_hit[30] & (|(CONTROL_REGISTERS_PERMIT[30] & ~reg_be))) | + (addr_hit[31] & (|(CONTROL_REGISTERS_PERMIT[31] & ~reg_be))) | + (addr_hit[32] & (|(CONTROL_REGISTERS_PERMIT[32] & ~reg_be))) | + (addr_hit[33] & (|(CONTROL_REGISTERS_PERMIT[33] & ~reg_be))) | + (addr_hit[34] & (|(CONTROL_REGISTERS_PERMIT[34] & ~reg_be))) | + (addr_hit[35] & (|(CONTROL_REGISTERS_PERMIT[35] & ~reg_be))) | + (addr_hit[36] & (|(CONTROL_REGISTERS_PERMIT[36] & ~reg_be))) | + (addr_hit[37] & (|(CONTROL_REGISTERS_PERMIT[37] & ~reg_be))))); end assign eoc_we = addr_hit[0] & reg_we & !reg_error; @@ -821,49 +1181,85 @@ module control_registers_reg_top #( assign wake_up_offst_we = addr_hit[12] & reg_we & !reg_error; assign wake_up_offst_wd = reg_wdata[31:0]; - assign tcdm_start_address_re = addr_hit[13] & reg_re & !reg_error; + assign partition_sel_0_we = addr_hit[13] & reg_we & !reg_error; + assign partition_sel_0_wd = reg_wdata[31:0]; + + assign partition_sel_1_we = addr_hit[14] & reg_we & !reg_error; + assign partition_sel_1_wd = reg_wdata[31:0]; + + assign partition_sel_2_we = addr_hit[15] & reg_we & !reg_error; + assign partition_sel_2_wd = reg_wdata[31:0]; + + assign partition_sel_3_we = addr_hit[16] & reg_we & !reg_error; + assign partition_sel_3_wd = reg_wdata[31:0]; + + assign allocated_size_0_we = addr_hit[17] & reg_we & !reg_error; + 
assign allocated_size_0_wd = reg_wdata[31:0]; + + assign allocated_size_1_we = addr_hit[18] & reg_we & !reg_error; + assign allocated_size_1_wd = reg_wdata[31:0]; + + assign allocated_size_2_we = addr_hit[19] & reg_we & !reg_error; + assign allocated_size_2_wd = reg_wdata[31:0]; - assign tcdm_end_address_re = addr_hit[14] & reg_re & !reg_error; + assign allocated_size_3_we = addr_hit[20] & reg_we & !reg_error; + assign allocated_size_3_wd = reg_wdata[31:0]; - assign nr_cores_reg_re = addr_hit[15] & reg_re & !reg_error; + assign start_addr_scheme_0_we = addr_hit[21] & reg_we & !reg_error; + assign start_addr_scheme_0_wd = reg_wdata[31:0]; - assign ro_cache_enable_we = addr_hit[16] & reg_we & !reg_error; + assign start_addr_scheme_1_we = addr_hit[22] & reg_we & !reg_error; + assign start_addr_scheme_1_wd = reg_wdata[31:0]; + + assign start_addr_scheme_2_we = addr_hit[23] & reg_we & !reg_error; + assign start_addr_scheme_2_wd = reg_wdata[31:0]; + + assign start_addr_scheme_3_we = addr_hit[24] & reg_we & !reg_error; + assign start_addr_scheme_3_wd = reg_wdata[31:0]; + + assign tcdm_start_address_re = addr_hit[25] & reg_re & !reg_error; + + assign tcdm_end_address_re = addr_hit[26] & reg_re & !reg_error; + + assign nr_cores_reg_re = addr_hit[27] & reg_re & !reg_error; + + assign ro_cache_enable_we = addr_hit[28] & reg_we & !reg_error; assign ro_cache_enable_wd = reg_wdata[31:0]; - assign ro_cache_flush_we = addr_hit[17] & reg_we & !reg_error; + assign ro_cache_flush_we = addr_hit[29] & reg_we & !reg_error; assign ro_cache_flush_wd = reg_wdata[31:0]; - assign ro_cache_start_0_we = addr_hit[18] & reg_we & !reg_error; + assign ro_cache_start_0_we = addr_hit[30] & reg_we & !reg_error; assign ro_cache_start_0_wd = reg_wdata[31:0]; - assign ro_cache_start_0_re = addr_hit[18] & reg_re & !reg_error; + assign ro_cache_start_0_re = addr_hit[30] & reg_re & !reg_error; - assign ro_cache_start_1_we = addr_hit[19] & reg_we & !reg_error; + assign ro_cache_start_1_we = addr_hit[31] & 
reg_we & !reg_error; assign ro_cache_start_1_wd = reg_wdata[31:0]; - assign ro_cache_start_1_re = addr_hit[19] & reg_re & !reg_error; + assign ro_cache_start_1_re = addr_hit[31] & reg_re & !reg_error; - assign ro_cache_start_2_we = addr_hit[20] & reg_we & !reg_error; + assign ro_cache_start_2_we = addr_hit[32] & reg_we & !reg_error; assign ro_cache_start_2_wd = reg_wdata[31:0]; - assign ro_cache_start_2_re = addr_hit[20] & reg_re & !reg_error; + assign ro_cache_start_2_re = addr_hit[32] & reg_re & !reg_error; - assign ro_cache_start_3_we = addr_hit[21] & reg_we & !reg_error; + assign ro_cache_start_3_we = addr_hit[33] & reg_we & !reg_error; assign ro_cache_start_3_wd = reg_wdata[31:0]; - assign ro_cache_start_3_re = addr_hit[21] & reg_re & !reg_error; + assign ro_cache_start_3_re = addr_hit[33] & reg_re & !reg_error; - assign ro_cache_end_0_we = addr_hit[22] & reg_we & !reg_error; + assign ro_cache_end_0_we = addr_hit[34] & reg_we & !reg_error; assign ro_cache_end_0_wd = reg_wdata[31:0]; - assign ro_cache_end_0_re = addr_hit[22] & reg_re & !reg_error; + assign ro_cache_end_0_re = addr_hit[34] & reg_re & !reg_error; - assign ro_cache_end_1_we = addr_hit[23] & reg_we & !reg_error; + assign ro_cache_end_1_we = addr_hit[35] & reg_we & !reg_error; assign ro_cache_end_1_wd = reg_wdata[31:0]; - assign ro_cache_end_1_re = addr_hit[23] & reg_re & !reg_error; + assign ro_cache_end_1_re = addr_hit[35] & reg_re & !reg_error; - assign ro_cache_end_2_we = addr_hit[24] & reg_we & !reg_error; + assign ro_cache_end_2_we = addr_hit[36] & reg_we & !reg_error; assign ro_cache_end_2_wd = reg_wdata[31:0]; - assign ro_cache_end_2_re = addr_hit[24] & reg_re & !reg_error; + assign ro_cache_end_2_re = addr_hit[36] & reg_re & !reg_error; - assign ro_cache_end_3_we = addr_hit[25] & reg_we & !reg_error; + assign ro_cache_end_3_we = addr_hit[37] & reg_we & !reg_error; assign ro_cache_end_3_wd = reg_wdata[31:0]; - assign ro_cache_end_3_re = addr_hit[25] & reg_re & !reg_error; + assign 
ro_cache_end_3_re = addr_hit[37] & reg_re & !reg_error; // Read data return always_comb begin @@ -922,54 +1318,102 @@ module control_registers_reg_top #( end addr_hit[13]: begin - reg_rdata_next[31:0] = tcdm_start_address_qs; + reg_rdata_next[31:0] = '0; end addr_hit[14]: begin - reg_rdata_next[31:0] = tcdm_end_address_qs; + reg_rdata_next[31:0] = '0; end addr_hit[15]: begin - reg_rdata_next[31:0] = nr_cores_reg_qs; + reg_rdata_next[31:0] = '0; end addr_hit[16]: begin - reg_rdata_next[31:0] = ro_cache_enable_qs; + reg_rdata_next[31:0] = '0; end addr_hit[17]: begin - reg_rdata_next[31:0] = ro_cache_flush_qs; + reg_rdata_next[31:0] = '0; end addr_hit[18]: begin - reg_rdata_next[31:0] = ro_cache_start_0_qs; + reg_rdata_next[31:0] = '0; end addr_hit[19]: begin - reg_rdata_next[31:0] = ro_cache_start_1_qs; + reg_rdata_next[31:0] = '0; end addr_hit[20]: begin - reg_rdata_next[31:0] = ro_cache_start_2_qs; + reg_rdata_next[31:0] = '0; end addr_hit[21]: begin - reg_rdata_next[31:0] = ro_cache_start_3_qs; + reg_rdata_next[31:0] = '0; end addr_hit[22]: begin - reg_rdata_next[31:0] = ro_cache_end_0_qs; + reg_rdata_next[31:0] = '0; end addr_hit[23]: begin - reg_rdata_next[31:0] = ro_cache_end_1_qs; + reg_rdata_next[31:0] = '0; end addr_hit[24]: begin - reg_rdata_next[31:0] = ro_cache_end_2_qs; + reg_rdata_next[31:0] = '0; end addr_hit[25]: begin + reg_rdata_next[31:0] = tcdm_start_address_qs; + end + + addr_hit[26]: begin + reg_rdata_next[31:0] = tcdm_end_address_qs; + end + + addr_hit[27]: begin + reg_rdata_next[31:0] = nr_cores_reg_qs; + end + + addr_hit[28]: begin + reg_rdata_next[31:0] = ro_cache_enable_qs; + end + + addr_hit[29]: begin + reg_rdata_next[31:0] = ro_cache_flush_qs; + end + + addr_hit[30]: begin + reg_rdata_next[31:0] = ro_cache_start_0_qs; + end + + addr_hit[31]: begin + reg_rdata_next[31:0] = ro_cache_start_1_qs; + end + + addr_hit[32]: begin + reg_rdata_next[31:0] = ro_cache_start_2_qs; + end + + addr_hit[33]: begin + reg_rdata_next[31:0] = 
ro_cache_start_3_qs; + end + + addr_hit[34]: begin + reg_rdata_next[31:0] = ro_cache_end_0_qs; + end + + addr_hit[35]: begin + reg_rdata_next[31:0] = ro_cache_end_1_qs; + end + + addr_hit[36]: begin + reg_rdata_next[31:0] = ro_cache_end_2_qs; + end + + addr_hit[37]: begin reg_rdata_next[31:0] = ro_cache_end_3_qs; end @@ -996,7 +1440,7 @@ endmodule /* verilator lint_off DECLFILENAME */ module control_registers_reg_top_intf #( - parameter int AW = 7, + parameter int AW = 8, localparam int DW = 32 ) ( input logic clk_i, diff --git a/hardware/src/ctrl_registers.sv b/hardware/src/ctrl_registers.sv index 03cbe1bbb..06c10ec62 100644 --- a/hardware/src/ctrl_registers.sv +++ b/hardware/src/ctrl_registers.sv @@ -7,6 +7,7 @@ module ctrl_registers import mempool_pkg::ro_cache_ctrl_t; + import mempool_pkg::PartitionDataWidth; #( parameter int DataWidth = 32, // Parameters @@ -17,16 +18,19 @@ module ctrl_registers parameter type axi_lite_req_t = logic, parameter type axi_lite_resp_t = logic ) ( - input logic clk_i, - input logic rst_ni, + input logic clk_i, + input logic rst_ni, // AXI Bus - input axi_lite_req_t axi_lite_slave_req_i, - output axi_lite_resp_t axi_lite_slave_resp_o, + input axi_lite_req_t axi_lite_slave_req_i, + output axi_lite_resp_t axi_lite_slave_resp_o, // Control registers - output logic [DataWidth-1:0] eoc_o, - output logic eoc_valid_o, - output logic [NumCores-1:0] wake_up_o, - output ro_cache_ctrl_t ro_cache_ctrl_o + output logic [DataWidth-1:0] eoc_o, + output logic eoc_valid_o, + output logic [NumCores-1:0] wake_up_o, + output ro_cache_ctrl_t ro_cache_ctrl_o, + output logic [3:0][PartitionDataWidth-1:0] partition_sel_o, + output logic [3:0][PartitionDataWidth-1:0] allocated_size_o, + output logic [3:0][DataWidth-1:0] start_addr_scheme_o ); import mempool_pkg::AddrWidth; @@ -98,6 +102,21 @@ module ctrl_registers `FFL(ctrl_hw2reg.ro_cache_end[i].d, ctrl_reg2hw.ro_cache_end[i].q, ctrl_reg2hw.ro_cache_end[i].qe, ro_cache_regions[i].end_addr, clk_i, rst_ni) 
end + + assign partition_sel_o[0] = ctrl_reg2hw.partition_sel_0.q[PartitionDataWidth-1:0]; + assign partition_sel_o[1] = ctrl_reg2hw.partition_sel_1.q[PartitionDataWidth-1:0]; + assign partition_sel_o[2] = ctrl_reg2hw.partition_sel_2.q[PartitionDataWidth-1:0]; + assign partition_sel_o[3] = ctrl_reg2hw.partition_sel_3.q[PartitionDataWidth-1:0]; + assign start_addr_scheme_o[0] = ctrl_reg2hw.start_addr_scheme_0.q; + assign start_addr_scheme_o[1] = ctrl_reg2hw.start_addr_scheme_1.q; + assign start_addr_scheme_o[2] = ctrl_reg2hw.start_addr_scheme_2.q; + assign start_addr_scheme_o[3] = ctrl_reg2hw.start_addr_scheme_3.q; + assign allocated_size_o[0] = ctrl_reg2hw.allocated_size_0.q[PartitionDataWidth-1:0]; + assign allocated_size_o[1] = ctrl_reg2hw.allocated_size_1.q[PartitionDataWidth-1:0]; + assign allocated_size_o[2] = ctrl_reg2hw.allocated_size_2.q[PartitionDataWidth-1:0]; + assign allocated_size_o[3] = ctrl_reg2hw.allocated_size_3.q[PartitionDataWidth-1:0]; + + /************************ * Wakeup Pulse Logic * ************************/ diff --git a/hardware/src/mempool_cluster.sv b/hardware/src/mempool_cluster.sv index 8f1017848..a733102ed 100644 --- a/hardware/src/mempool_cluster.sv +++ b/hardware/src/mempool_cluster.sv @@ -17,32 +17,30 @@ module mempool_cluster parameter int unsigned NumAXIMasters = NumGroups * NumAXIMastersPerGroup ) ( // Clock and reset - input logic clk_i, - input logic rst_ni, - input logic testmode_i, + input logic clk_i, + input logic rst_ni, + input logic testmode_i, // Scan chain - input logic scan_enable_i, - input logic scan_data_i, - output logic scan_data_o, + input logic scan_enable_i, + input logic scan_data_i, + output logic scan_data_o, // Wake up signal - input logic [NumCores-1:0] wake_up_i, + input logic [NumCores-1:0] wake_up_i, // Partition Selection input logic [3:0][PartitionDataWidth-1:0] partition_sel_i, input logic [3:0][PartitionDataWidth-1:0] allocated_size_i, input logic [3:0][DataWidth-1:0] start_addr_scheme_i, - // DMA 
Mode Selection - input logic [1:0] dma_mode_i, // RO-Cache configuration - input ro_cache_ctrl_t ro_cache_ctrl_i, + input ro_cache_ctrl_t ro_cache_ctrl_i, // DMA request - input dma_req_t dma_req_i, - input logic dma_req_valid_i, - output logic dma_req_ready_o, + input dma_req_t dma_req_i, + input logic dma_req_valid_i, + output logic dma_req_ready_o, // DMA status - output dma_meta_t dma_meta_o, + output dma_meta_t dma_meta_o, // AXI Interface - output axi_tile_req_t [NumAXIMasters-1:0] axi_mst_req_o, - input axi_tile_resp_t [NumAXIMasters-1:0] axi_mst_resp_i + output axi_tile_req_t [NumAXIMasters-1:0] axi_mst_req_o, + input axi_tile_resp_t [NumAXIMasters-1:0] axi_mst_resp_i ); /********************* @@ -107,25 +105,22 @@ module mempool_cluster .burst_req_t (dma_req_t ), .meta_t (dma_meta_t ) ) i_idma_split_midend_v2 ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), + .clk_i (clk_i ), + .rst_ni (rst_ni ), // slave - .burst_req_i(dma_req_cut ), - .valid_i (dma_req_cut_valid ), - .ready_o (dma_req_cut_ready ), - .meta_o (dma_meta_cut ), - // master - .dma_mode_i (dma_mode_i), - .burst_req_o(dma_req_partition ), - .valid_o (dma_req_partition_valid), - .ready_i (dma_req_partition_ready), - .meta_i (dma_meta_partition ), - + .burst_req_i (dma_req_cut ), + .valid_i (dma_req_cut_valid ), + .ready_o (dma_req_cut_ready ), + .meta_o (dma_meta_cut ), + .burst_req_o (dma_req_partition ), + .valid_o (dma_req_partition_valid), + .ready_i (dma_req_partition_ready), + .meta_i (dma_meta_partition ), // partition information - .group_factor_i (partition_sel_i ), - .allocated_size_i (allocated_size_i ), - .start_addr_scheme_i (start_addr_scheme_i ), - .allocated_size_o (allocated_size_sel) + .group_factor_i (partition_sel_i ), + .allocated_size_i (allocated_size_i ), + .start_addr_scheme_i(start_addr_scheme_i ), + .allocated_size_o (allocated_size_sel ) ); idma_distributed_midend_v2 #( @@ -137,21 +132,20 @@ module mempool_cluster .burst_req_t (dma_req_t ), .meta_t (dma_meta_t ) ) 
i_idma_distributed_midend_v2 ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), + .clk_i (clk_i ), + .rst_ni (rst_ni ), // slave - .burst_req_i (dma_req_partition ), - .valid_i (dma_req_partition_valid), - .ready_o (dma_req_partition_ready), - .meta_o (dma_meta_partition ), + .burst_req_i (dma_req_partition ), + .valid_i (dma_req_partition_valid), + .ready_o (dma_req_partition_ready), + .meta_o (dma_meta_partition ), // partition info - .allocated_size_i(allocated_size_sel), - .dma_mode_i (dma_mode_i), + .allocated_size_i(allocated_size_sel ), // master - .burst_req_o (dma_req_group ), - .valid_o (dma_req_group_valid), - .ready_i (dma_req_group_ready), - .meta_i (dma_meta_q ) + .burst_req_o (dma_req_group ), + .valid_o (dma_req_group_valid ), + .ready_i (dma_req_group_ready ), + .meta_i (dma_meta_q ) ); for (genvar g = 0; unsigned'(g) < NumGroups; g++) begin: gen_dma_req_group_register @@ -327,7 +321,6 @@ module mempool_cluster .partition_sel_i (partition_sel_i ), .start_addr_scheme_i (start_addr_scheme_i ), .allocated_size_i (allocated_size_i ), - .dma_mode_i (dma_mode_i ), .ro_cache_ctrl_i (ro_cache_ctrl_q[g] ), // DMA request .dma_req_i (dma_req_group_q[g] ), @@ -373,7 +366,6 @@ module mempool_cluster .partition_sel_i (partition_sel_i ), .start_addr_scheme_i (start_addr_scheme_i ), .allocated_size_i (allocated_size_i ), - .dma_mode_i (dma_mode_i ), .ro_cache_ctrl_i (ro_cache_ctrl_q[g] ), // DMA request .dma_req_i (dma_req_group_q[g] ), @@ -416,7 +408,6 @@ module mempool_cluster .partition_sel_i (partition_sel_i ), .start_addr_scheme_i (start_addr_scheme_i ), .allocated_size_i (allocated_size_i ), - .dma_mode_i (dma_mode_i ), .ro_cache_ctrl_i (ro_cache_ctrl_q[g] ), // DMA request .dma_req_i (dma_req_group_q[g] ), diff --git a/hardware/src/mempool_group.sv b/hardware/src/mempool_group.sv index 3853fc97a..7e87a5f7c 100644 --- a/hardware/src/mempool_group.sv +++ b/hardware/src/mempool_group.sv @@ -64,7 +64,6 @@ module mempool_group input logic [3:0][PartitionDataWidth-1:0] 
partition_sel_i, input logic [3:0][PartitionDataWidth-1:0] allocated_size_i, input logic [3:0][DataWidth-1:0] start_addr_scheme_i, - input logic [1:0] dma_mode_i, // RO-Cache configuration input `STRUCT_PORT(ro_cache_ctrl_t) ro_cache_ctrl_i, // DMA request @@ -582,21 +581,20 @@ module mempool_group .burst_req_t (dma_req_t ), .meta_t (dma_meta_t ) ) i_idma_distributed_midend_v2 ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), + .clk_i (clk_i ), + .rst_ni (rst_ni ), // slave - .burst_req_i (dma_req_cut ), - .valid_i (dma_req_cut_valid), - .ready_o (dma_req_cut_ready), - .meta_o (dma_meta_cut ), + .burst_req_i (dma_req_cut ), + .valid_i (dma_req_cut_valid ), + .ready_o (dma_req_cut_ready ), + .meta_o (dma_meta_cut ), // partition .allocated_size_i(dma_allocated_size_sel_i), - .dma_mode_i (dma_mode_i), // master - .burst_req_o (dma_req ), - .valid_o (dma_req_valid ), - .ready_i (dma_req_ready ), - .meta_i (dma_meta ) + .burst_req_o (dma_req ), + .valid_o (dma_req_valid ), + .ready_i (dma_req_ready ), + .meta_i (dma_meta ) ); `else @@ -1000,21 +998,20 @@ module mempool_group .burst_req_t (dma_req_t ), .meta_t (dma_meta_t ) ) i_idma_distributed_midend_v2 ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), + .clk_i (clk_i ), + .rst_ni (rst_ni ), // slave - .burst_req_i (dma_req_cut ), - .valid_i (dma_req_cut_valid), - .ready_o (dma_req_cut_ready), - .meta_o (dma_meta_cut ), + .burst_req_i (dma_req_cut ), + .valid_i (dma_req_cut_valid ), + .ready_o (dma_req_cut_ready ), + .meta_o (dma_meta_cut ), // partition .allocated_size_i(dma_allocated_size_sel_i), - .dma_mode_i (dma_mode_i), // master - .burst_req_o (dma_req ), - .valid_o (dma_req_valid ), - .ready_i (dma_req_ready ), - .meta_i (dma_meta ) + .burst_req_o (dma_req ), + .valid_o (dma_req_valid ), + .ready_i (dma_req_ready ), + .meta_i (dma_meta ) ); // xbar diff --git a/hardware/src/mempool_system.sv b/hardware/src/mempool_system.sv index 3b75ebaef..9d11d4484 100644 --- a/hardware/src/mempool_system.sv +++ 
b/hardware/src/mempool_system.sv @@ -144,24 +144,23 @@ module mempool_system .TCDMBaseAddr(TCDMBaseAddr), .BootAddr (BootAddr ) ) i_mempool_cluster ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .wake_up_i (wake_up ), - .partition_sel_i(partition_sel ), - .allocated_size_i (allocated_size), - .start_addr_scheme_i(start_addr_scheme ), - .dma_mode_i (dma_mode[1:0]), - .testmode_i (1'b0 ), - .scan_enable_i (1'b0 ), - .scan_data_i (1'b0 ), - .scan_data_o (/* Unused */ ), - .ro_cache_ctrl_i(ro_cache_ctrl ), - .dma_req_i (dma_req ), - .dma_req_valid_i(dma_req_valid ), - .dma_req_ready_o(dma_req_ready ), - .dma_meta_o (dma_meta ), - .axi_mst_req_o (axi_mst_req[NumAXIMasters-2:0] ), - .axi_mst_resp_i (axi_mst_resp[NumAXIMasters-2:0]) + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .wake_up_i (wake_up ), + .partition_sel_i (partition_sel ), + .allocated_size_i (allocated_size ), + .start_addr_scheme_i(start_addr_scheme ), + .testmode_i (1'b0 ), + .scan_enable_i (1'b0 ), + .scan_data_i (1'b0 ), + .scan_data_o (/* Unused */ ), + .ro_cache_ctrl_i (ro_cache_ctrl ), + .dma_req_i (dma_req ), + .dma_req_valid_i (dma_req_valid ), + .dma_req_ready_o (dma_req_ready ), + .dma_meta_o (dma_meta ), + .axi_mst_req_o (axi_mst_req[NumAXIMasters-2:0] ), + .axi_mst_resp_i (axi_mst_resp[NumAXIMasters-2:0]) ); /********************** @@ -813,7 +812,10 @@ module mempool_system .eoc_o (/* Unused */ ), .eoc_valid_o (eoc_valid_o ), .wake_up_o (wake_up ), - .ro_cache_ctrl_o (ro_cache_ctrl ) + .ro_cache_ctrl_o (ro_cache_ctrl ), + .partition_sel_o (partition_sel ), + .start_addr_scheme_o (start_addr_scheme ), + .allocated_size_o (allocated_size ) ); mempool_dma #( diff --git a/software/runtime/control_registers.h b/software/runtime/control_registers.h index f2a467af2..8f509c9b1 100644 --- a/software/runtime/control_registers.h +++ b/software/runtime/control_registers.h @@ -66,20 +66,56 @@ extern "C" { // Wake Up Offst Register #define CONTROL_REGISTERS_WAKE_UP_OFFST_REG_OFFSET 0x30 +// Tile grouping for 
partition 1 +#define CONTROL_REGISTERS_PARTITION_SEL_0_REG_OFFSET 0x34 + +// Tile grouping for partition 2 +#define CONTROL_REGISTERS_PARTITION_SEL_1_REG_OFFSET 0x38 + +// Tile grouping for partition 3 +#define CONTROL_REGISTERS_PARTITION_SEL_2_REG_OFFSET 0x3c + +// Tile grouping for partition 4 +#define CONTROL_REGISTERS_PARTITION_SEL_3_REG_OFFSET 0x40 + +// Allocated size on partition 0 +#define CONTROL_REGISTERS_ALLOCATED_SIZE_0_REG_OFFSET 0x44 + +// Allocated size on partition 1 +#define CONTROL_REGISTERS_ALLOCATED_SIZE_1_REG_OFFSET 0x48 + +// Allocated size on partition 2 +#define CONTROL_REGISTERS_ALLOCATED_SIZE_2_REG_OFFSET 0x4c + +// Allocated size on partition 3 +#define CONTROL_REGISTERS_ALLOCATED_SIZE_3_REG_OFFSET 0x50 + +// Start address of the addressing scheme on partition 0 +#define CONTROL_REGISTERS_START_ADDR_SCHEME_0_REG_OFFSET 0x54 + +// Start address of the addressing scheme on partition 1 +#define CONTROL_REGISTERS_START_ADDR_SCHEME_1_REG_OFFSET 0x58 + +// Start address of the addressing scheme on partition 2 +#define CONTROL_REGISTERS_START_ADDR_SCHEME_2_REG_OFFSET 0x5c + +// Start address of the addressing scheme on partition 3 +#define CONTROL_REGISTERS_START_ADDR_SCHEME_3_REG_OFFSET 0x60 + // TCDM Start Address Register -#define CONTROL_REGISTERS_TCDM_START_ADDRESS_REG_OFFSET 0x34 +#define CONTROL_REGISTERS_TCDM_START_ADDRESS_REG_OFFSET 0x64 // TCDM End Address Register -#define CONTROL_REGISTERS_TCDM_END_ADDRESS_REG_OFFSET 0x38 +#define CONTROL_REGISTERS_TCDM_END_ADDRESS_REG_OFFSET 0x68 // Number of Cores Register -#define CONTROL_REGISTERS_NR_CORES_REG_REG_OFFSET 0x3c +#define CONTROL_REGISTERS_NR_CORES_REG_REG_OFFSET 0x6c // Read-only cache Enable -#define CONTROL_REGISTERS_RO_CACHE_ENABLE_REG_OFFSET 0x40 +#define CONTROL_REGISTERS_RO_CACHE_ENABLE_REG_OFFSET 0x70 // Read-only cache Flush -#define CONTROL_REGISTERS_RO_CACHE_FLUSH_REG_OFFSET 0x44 +#define CONTROL_REGISTERS_RO_CACHE_FLUSH_REG_OFFSET 0x74 // Read-only cache Region Start (common parameters) #define CONTROL_REGISTERS_RO_CACHE_START_RO_CACHE_START_FIELD_WIDTH 32 @@ -87,16
+123,16 @@ extern "C" { #define CONTROL_REGISTERS_RO_CACHE_START_MULTIREG_COUNT 4 // Read-only cache Region Start -#define CONTROL_REGISTERS_RO_CACHE_START_0_REG_OFFSET 0x48 +#define CONTROL_REGISTERS_RO_CACHE_START_0_REG_OFFSET 0x78 // Read-only cache Region Start -#define CONTROL_REGISTERS_RO_CACHE_START_1_REG_OFFSET 0x4c +#define CONTROL_REGISTERS_RO_CACHE_START_1_REG_OFFSET 0x7c // Read-only cache Region Start -#define CONTROL_REGISTERS_RO_CACHE_START_2_REG_OFFSET 0x50 +#define CONTROL_REGISTERS_RO_CACHE_START_2_REG_OFFSET 0x80 // Read-only cache Region Start -#define CONTROL_REGISTERS_RO_CACHE_START_3_REG_OFFSET 0x54 +#define CONTROL_REGISTERS_RO_CACHE_START_3_REG_OFFSET 0x84 // Read-only cache Region End (common parameters) #define CONTROL_REGISTERS_RO_CACHE_END_RO_CACHE_END_FIELD_WIDTH 32 @@ -104,16 +140,16 @@ extern "C" { #define CONTROL_REGISTERS_RO_CACHE_END_MULTIREG_COUNT 4 // Read-only cache Region End -#define CONTROL_REGISTERS_RO_CACHE_END_0_REG_OFFSET 0x58 +#define CONTROL_REGISTERS_RO_CACHE_END_0_REG_OFFSET 0x88 // Read-only cache Region End -#define CONTROL_REGISTERS_RO_CACHE_END_1_REG_OFFSET 0x5c +#define CONTROL_REGISTERS_RO_CACHE_END_1_REG_OFFSET 0x8c // Read-only cache Region End -#define CONTROL_REGISTERS_RO_CACHE_END_2_REG_OFFSET 0x60 +#define CONTROL_REGISTERS_RO_CACHE_END_2_REG_OFFSET 0x90 // Read-only cache Region End -#define CONTROL_REGISTERS_RO_CACHE_END_3_REG_OFFSET 0x64 +#define CONTROL_REGISTERS_RO_CACHE_END_3_REG_OFFSET 0x94 #ifdef __cplusplus } // extern "C" diff --git a/software/runtime/runtime.h b/software/runtime/runtime.h index ca1395c4a..995896393 100644 --- a/software/runtime/runtime.h +++ b/software/runtime/runtime.h @@ -54,25 +54,47 @@ static uint32_t volatile *wake_up_offset_reg = (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + CONTROL_REGISTERS_WAKE_UP_OFFST_REG_OFFSET); -/* DAS-related regs +/* DAS-related regs */ -extern volatile uint32_t partition_reg; -extern volatile uint32_t partition1_reg; -extern volatile uint32_t 
partition2_reg; -extern volatile uint32_t partition3_reg; +static uint32_t volatile *partition_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_PARTITION_SEL_0_REG_OFFSET); +static uint32_t volatile *partition1_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_PARTITION_SEL_1_REG_OFFSET); +static uint32_t volatile *partition2_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_PARTITION_SEL_2_REG_OFFSET); +static uint32_t volatile *partition3_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_PARTITION_SEL_3_REG_OFFSET); -extern volatile uint32_t start_addr_scheme0_reg; -extern volatile uint32_t start_addr_scheme1_reg; -extern volatile uint32_t start_addr_scheme2_reg; -extern volatile uint32_t start_addr_scheme3_reg; +static uint32_t volatile *start_addr_scheme0_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_START_ADDR_SCHEME_0_REG_OFFSET); +static uint32_t volatile *start_addr_scheme1_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_START_ADDR_SCHEME_1_REG_OFFSET); +static uint32_t volatile *start_addr_scheme2_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_START_ADDR_SCHEME_2_REG_OFFSET); +static uint32_t volatile *start_addr_scheme3_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_START_ADDR_SCHEME_3_REG_OFFSET); -extern volatile uint32_t allocated_size0_reg; -extern volatile uint32_t allocated_size1_reg; -extern volatile uint32_t allocated_size2_reg; -extern volatile uint32_t allocated_size3_reg; +static uint32_t volatile *allocated_size0_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_ALLOCATED_SIZE_0_REG_OFFSET); +static uint32_t volatile *allocated_size1_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_ALLOCATED_SIZE_1_REG_OFFSET); +static uint32_t volatile *allocated_size2_reg = + (uint32_t 
volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_ALLOCATED_SIZE_2_REG_OFFSET); +static uint32_t volatile *allocated_size3_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_ALLOCATED_SIZE_3_REG_OFFSET); -extern volatile uint32_t dma_mode_reg; -*/ typedef uint32_t mempool_id_t; typedef uint32_t mempool_timer_t; From 91999366220d7bf4b5ac83ff442c48d7d1a020be Mon Sep 17 00:00:00 2001 From: mbertuletti Date: Tue, 21 Oct 2025 18:52:25 +0200 Subject: [PATCH 10/34] [hardware] Correct address scrambler copy-pasted code --- hardware/src/address_scrambler.sv | 68 +------------------------------ 1 file changed, 2 insertions(+), 66 deletions(-) diff --git a/hardware/src/address_scrambler.sv b/hardware/src/address_scrambler.sv index b7e3dae27..493819ee0 100644 --- a/hardware/src/address_scrambler.sv +++ b/hardware/src/address_scrambler.sv @@ -8,28 +8,6 @@ // Author: Samuel Riedel -module address_scrambler #( - parameter int unsigned AddrWidth = 32, - parameter int unsigned ByteOffset = 2, - parameter int unsigned NumTiles = 2, - parameter int unsigned NumBanksPerTile = 2, - parameter bit Bypass = 0, - parameter int unsigned SeqMemSizePerTile = 4*1024 -) ( - input logic [AddrWidth-1:0] address_i, - output logic [AddrWidth-1:0] address_o -); - localparam int unsigned BankOffsetBits = $clog2(NumBanksPerTile); - localparam int unsigned TileIdBits = $clog2(NumTiles);// Copyright 2021 ETH Zurich and University of Bologna. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 - -// Description: Scrambles the address in such a way, that part of the memory is accessed -// sequentially and part is interleaved. 
-// Current constraints: - -// Author: Samuel Riedel - module address_scrambler #( parameter int unsigned AddrWidth = 32, parameter int unsigned DataWidth = 32, @@ -43,12 +21,12 @@ module address_scrambler #( parameter int unsigned MemSizePerRow = 4*4*1024, // 4bytes * 4096 banks parameter int unsigned TCDMSize = 1024*1024 ) ( - input logic [AddrWidth-1:0] address_i, + input logic [AddrWidth-1:0] address_i, input logic [3:0][7:0] group_factor_i, // For each allocation, the maximum number of rows assigned can be 128 rows input logic [3:0][7:0] allocated_size_i, input logic [3:0][DataWidth-1:0] start_addr_scheme_i, - output logic [AddrWidth-1:0] address_o + output logic [AddrWidth-1:0] address_o ); // Stack Sequential Settings localparam int unsigned BankOffsetBits = $clog2(NumBanksPerTile); @@ -204,45 +182,3 @@ module address_scrambler #( if (SeqMemSizePerTile % (2**ByteOffset*NumBanksPerTile) != 0) $fatal(1, "SeqMemSizePerTile must be a multiple of BankWidth*NumBanksPerTile!"); endmodule : address_scrambler - - localparam int unsigned SeqPerTileBits = $clog2(SeqMemSizePerTile); - localparam int unsigned SeqTotalBits = SeqPerTileBits+TileIdBits; - localparam int unsigned ConstantBitsLSB = ByteOffset + BankOffsetBits; - localparam int unsigned ScrambleBits = SeqPerTileBits-ConstantBitsLSB; - - if (Bypass || NumTiles < 2) begin - assign address_o = address_i; - end else begin - logic [ScrambleBits-1:0] scramble; // Address bits that have to be shuffled around - logic [TileIdBits-1:0] tile_id; // Which tile does this address region belong to - - // Leave this part of the address unchanged - // The LSBs that correspond to the offset inside a tile. These are the byte offset (bank width) - // and the Bank offset (Number of Banks in tile) - assign address_o[ConstantBitsLSB-1:0] = address_i[ConstantBitsLSB-1:0]; - // The MSBs that are outside of the sequential memory size. Currently the sequential memory size - // always starts at 0. 
These are all the MSBs up to SeqMemSizePerTile*NumTiles - assign address_o[AddrWidth-1:SeqTotalBits] = address_i[AddrWidth-1:SeqTotalBits]; - - // Scramble the middle part - // Bits that would have gone to different tiles but now go to increasing lines in the same tile - assign scramble = address_i[SeqPerTileBits-1:ConstantBitsLSB]; // Bits that would - // Bits that would have gone to increasing lines in the same tile but now go to different tiles - assign tile_id = address_i[SeqTotalBits-1:SeqPerTileBits]; - - always_comb begin - // Default: Unscrambled - address_o[SeqTotalBits-1:ConstantBitsLSB] = {tile_id, scramble}; - // If not in bypass mode and address is in sequential region and more than one tile - if (address_i < (NumTiles * SeqMemSizePerTile)) begin - address_o[SeqTotalBits-1:ConstantBitsLSB] = {scramble, tile_id}; - end - end - end - - // Check for unsupported configurations - if (NumBanksPerTile < 2) - $fatal(1, "NumBanksPerTile must be greater than 2. The special case '1' is currently not supported!"); - if (SeqMemSizePerTile % (2**ByteOffset*NumBanksPerTile) != 0) - $fatal(1, "SeqMemSizePerTile must be a multiple of BankWidth*NumBanksPerTile!"); -endmodule : address_scrambler From c155c4d5dfd21eb6c8116bddf8c50616e41546a0 Mon Sep 17 00:00:00 2001 From: mbertuletti Date: Tue, 21 Oct 2025 18:54:12 +0200 Subject: [PATCH 11/34] [config] Parametrize feature with meaningful define --- config/config.mk | 5 +++++ config/terapool.mk | 5 ----- hardware/Makefile | 1 + software/runtime/runtime.mk | 6 +++--- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/config/config.mk b/config/config.mk index 9ea9a0fd0..6db1ac3b7 100644 --- a/config/config.mk +++ b/config/config.mk @@ -73,6 +73,11 @@ zquarterinx ?= 0 # DivSqrt deactivated by default xDivSqrt ?= 0 +# Enable configurable addressing scheme in the heap +das ?= 1 +# Size for configurable addressing scheme heap +das_mem_size ?= 2048 + # This parameter is only used for TeraPool configurations 
num_sub_groups_per_group ?= 1 remote_group_latency_cycles ?= 7 diff --git a/config/terapool.mk b/config/terapool.mk index 42a5d73a6..6bdd329e9 100644 --- a/config/terapool.mk +++ b/config/terapool.mk @@ -10,7 +10,6 @@ # Global Control terapool ?= 1 -flex_terapool ?= 1 # Number of cores num_cores ?= 1024 @@ -47,7 +46,3 @@ dmas_per_group ?= 4 # Brust Length = 16 # L2 Banks/Channels l2_banks = 16 l2_size ?= 16777216 # 1000000 - -# TeraPool w/ DAS -# Impacted memory size in byte per core by default -heap_seq_mem_size ?= 2048 \ No newline at end of file diff --git a/hardware/Makefile b/hardware/Makefile index 2bab80a79..361a8de03 100644 --- a/hardware/Makefile +++ b/hardware/Makefile @@ -119,6 +119,7 @@ vlog_defs += -DL2_SIZE=32\'d$(l2_size) vlog_defs += -DL2_BANKS=$(l2_banks) vlog_defs += -DL1_BANK_SIZE=$(l1_bank_size) vlog_defs += -DBOOT_ADDR=32\'d$(boot_addr) +vlog_defs += -DDAS=$(das) # Snitch ISA vlog_defs += -DXPULPIMG=$(xpulpimg) vlog_defs += -DZFINX=$(zfinx) diff --git a/software/runtime/runtime.mk b/software/runtime/runtime.mk index 10d356d59..70f6e0b33 100644 --- a/software/runtime/runtime.mk +++ b/software/runtime/runtime.mk @@ -110,9 +110,9 @@ ifdef terapool DEFINES += -DNUM_CORES_PER_SUB_GROUP=$(shell awk 'BEGIN{print ($(num_cores)/$(num_groups))/$(num_sub_groups_per_group)}') DEFINES += -DNUM_TILES_PER_SUB_GROUP=$(shell awk 'BEGIN{print ($(num_cores)/$(num_groups))/$(num_cores_per_tile)/$(num_sub_groups_per_group)}') endif -ifdef flex_terapool - DEFINES += -DHEAP_SEQ_MEM_SIZE=$(heap_seq_mem_size) - DEFINES += -DLOG2_HEAP_SEQ_MEM_SIZE=$(shell awk 'BEGIN{print log($(heap_seq_mem_size))/log(2)}') +ifdef das + DEFINES += -DDAS_MEM_SIZE=$(das_mem_size) + DEFINES += -DLOG2_DAS_MEM_SIZE=$(shell awk 'BEGIN{print log($(das_mem_size))/log(2)}') endif # Specify cross compilation target. 
This can be omitted if LLVM is built with riscv as default target From a92a71fd0909b22689336c66ba9d9614fb25dd4e Mon Sep 17 00:00:00 2001 From: mbertuletti Date: Tue, 21 Oct 2025 18:55:24 +0200 Subject: [PATCH 12/34] [software] Rename "group_factor" with meaningful tile grouping --- software/runtime/runtime.h | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/software/runtime/runtime.h b/software/runtime/runtime.h index 995896393..3cc5be60c 100644 --- a/software/runtime/runtime.h +++ b/software/runtime/runtime.h @@ -180,17 +180,13 @@ static inline void mempool_reset_heap(const uint32_t core_id, uint32_t heap_seq_ // Initialize Dynamic Heap Allocator, as default specified in the linker script -// @inp (uint32_t) group_factor: Number of Tiles per partition -static inline void mempool_dynamic_heap_alloc_init(const uint32_t core_id, const uint32_t group_factor){ +static inline void mempool_dynamic_heap_alloc_init(const uint32_t core_id, const uint32_t tiles_per_partition){ if (core_id == 0){ extern uint32_t __heap_seq_start; - uint32_t num_tiles_per_partition = group_factor; - - // Dynamic allocator base and size uint32_t seq_heap_base = (uint32_t)&__heap_seq_start; - uint32_t seq_heap_size = NUM_CORES_PER_TILE * num_tiles_per_partition * HEAP_SEQ_MEM_SIZE; - uint32_t num_partition = mempool_get_tile_count() / group_factor; + uint32_t seq_heap_size = (NUM_CORES_PER_TILE * tiles_per_partition) * DAS_MEM_SIZE; + uint32_t num_partition = mempool_get_tile_count() / tiles_per_partition; // Dynamically allocate the space for allocators init_dynamic_heap_alloc(num_partition); for (uint32_t part_id=0; part_id Date: Tue, 21 Oct 2025 18:56:17 +0200 Subject: [PATCH 13/34] [software] Add partition test --- .../tests/baremetal/das_malloc_test/main.c | 100 ++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 software/tests/baremetal/das_malloc_test/main.c diff --git a/software/tests/baremetal/das_malloc_test/main.c 
b/software/tests/baremetal/das_malloc_test/main.c new file mode 100644 index 000000000..a7bfd3b63 --- /dev/null +++ b/software/tests/baremetal/das_malloc_test/main.c @@ -0,0 +1,100 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Note: This test is only for Terapool dynamic heap allocation +// Author: Bowen Wang + +#include +#include + +#include "alloc.h" +#include "encoding.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" + +#define TILES_PER_PARTITION (2) +#define ARRAY_SIZE (2 * TILES_PER_PARTITION * BANKING_FACTOR * NUM_CORES_PER_TILE) + + +int main() { + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + + // Initialization + mempool_init(core_id); + mempool_barrier_init(core_id); + + // -------------------------------------------- + // Runtime Partition Selection + // -------------------------------------------- + + if (core_id == 0) { + printf("Initialize\n"); + // 1. Init dynamic heap allocator + partition_config(0, TILES_PER_PARTITION); + mempool_dynamic_heap_alloc_init(core_id, TILES_PER_PARTITION); + // 2. Set which partition write to. + uint32_t part_id = 0; // set to allocate in the first partition + // 3. Get the allocator and starting address to this region + alloc_t *dynamic_heap_alloc = get_dynamic_heap_alloc(part_id); + alloc_dump(dynamic_heap_alloc); + // 4. Allocate memory + uint32_t *array = (uint32_t *)domain_malloc(dynamic_heap_alloc, ARRAY_SIZE); + // 5. Move data + for (uint32_t i = 0; i < ARRAY_SIZE; ++i) { + array[i] = i; + } + // 6. Free array + domain_free(dynamic_heap_alloc, array); + // 7. 
Free dynamic allocator + free_dynamic_heap_alloc(); + printf("Done!\n"); + } + + mempool_barrier(num_cores); + + // -------------------------------------------- + // Verify partition + // -------------------------------------------- + + if (core_id == 0) { + printf("Verify partition\n"); + // 1. Init dynamic heap allocator + partition_config(0, TILES_PER_PARTITION); + mempool_dynamic_heap_alloc_init(core_id, TILES_PER_PARTITION); + // 2. Set which partition write to. + uint32_t num_partition = mempool_get_tile_count() / TILES_PER_PARTITION; + uint32_t part_id = 0; // set to allocate in the penultimate partition + // 3. Get the allocator and starting address to this region + alloc_t *dynamic_heap_alloc = get_dynamic_heap_alloc(part_id); + alloc_dump(dynamic_heap_alloc); + // 4. Allocate memory + uint32_t *array = (uint32_t *)domain_malloc(dynamic_heap_alloc, ARRAY_SIZE * TILES_PER_PARTITION); + // 5. Move data + for (uint32_t i = 0; i < ARRAY_SIZE; i++) { + array[i] = i; + } + // 6. Change addressing scheme + partition_config(0, NUM_CORES / NUM_CORES_PER_TILE); + for (uint32_t i = 0; i < ARRAY_SIZE; i++) { + uint32_t *fetch_address = &array[0] + \ + (i % (TILES_PER_PARTITION * NUM_CORES_PER_TILE * BANKING_FACTOR)) + \ + (i / (TILES_PER_PARTITION * NUM_CORES_PER_TILE * BANKING_FACTOR)) * NUM_BANKS; + if (i != *fetch_address) { + printf("%4d != %4d at address %8X.\n", i, *fetch_address, fetch_address); + } + } + // 7. Free array + domain_free(dynamic_heap_alloc, array); + // 8. 
Free dynamic allocator + free_dynamic_heap_alloc(); + printf("Done!\n"); + } + + mempool_barrier(num_cores); + + + return 0; +} From 699eb47b41167ab750791e91e41dce86a9677d03 Mon Sep 17 00:00:00 2001 From: mbertuletti Date: Wed, 22 Oct 2025 17:36:29 +0200 Subject: [PATCH 14/34] [hardware] Group DAS registers and assign external reset --- .../control_registers/control_registers.hjson | 132 +++------- .../control_registers_reg_pkg.sv | 85 ++---- .../control_registers_reg_top.sv | 248 ++++++------------ hardware/src/ctrl_registers.sv | 21 +- software/runtime/control_registers.h | 42 ++- 5 files changed, 191 insertions(+), 337 deletions(-) diff --git a/hardware/src/control_registers/control_registers.hjson b/hardware/src/control_registers/control_registers.hjson index 24a4fb363..414682f52 100644 --- a/hardware/src/control_registers/control_registers.hjson +++ b/hardware/src/control_registers/control_registers.hjson @@ -22,6 +22,11 @@ type: "int", default: "8" } + { name: "NumDASPartitions", + desc: "Supported number of DAS partitions", + type: "int", + default: "4" + } ], regwidth: 32 registers: [ @@ -72,99 +77,46 @@ fields: [{ bits: "31:0" }] }, - { name: "partition_sel_0" - desc: "Tile grouping for partition 1" - swaccess: "wo" - hwaccess: "hro" - hwqe: "true" - resval: 128 // Number of tiles - fields: [{ bits: "31:0" }] - }, - { name: "partition_sel_1" - desc: "Tile grouping for partition 2" - swaccess: "wo" - hwaccess: "hro" - hwqe: "true" - resval: 128 // Number of tiles - fields: [{ bits: "31:0" }] - }, - { name: "partition_sel_2" - desc: "Tile grouping for partition 3" - swaccess: "wo" - hwaccess: "hro" - hwqe: "true" - resval: 128 // Number of tiles - fields: [{ bits: "31:0" }] - }, - { name: "partition_sel_3" - desc: "Tile grouping for partition 4" - swaccess: "wo" - hwaccess: "hro" - hwqe: "true" - resval: 128 // Number of tiles - fields: [{ bits: "31:0" }] - }, - { name: "allocated_size_0" - desc: "Allocated size on partition 0" - swaccess: "wo" - hwaccess: 
"hro" - hwqe: "true" - fields: [{ bits: "31:0" }] - }, - { name: "allocated_size_1" - desc: "Allocated size on partition 1" - swaccess: "wo" - hwaccess: "hro" - hwqe: "true" - fields: [{ bits: "31:0" }] - }, - { name: "allocated_size_2" - desc: "Allocated size on partition 2" - swaccess: "wo" - hwaccess: "hro" - hwqe: "true" - fields: [{ bits: "31:0" }] - }, - { name: "allocated_size_3" - desc: "Allocated size on partition 3" - swaccess: "wo" - hwaccess: "hro" - hwqe: "true" - fields: [{ bits: "31:0" }] - }, - { name: "start_addr_scheme_0" - desc: "Allocated size on partition 0" - swaccess: "wo" - hwaccess: "hro" - hwqe: "true" - resval: 4194304 // 0x400000 starting point of the L1 - fields: [{ bits: "31:0" }] - }, - { name: "start_addr_scheme_1" - desc: "Allocated size on partition 1" - swaccess: "wo" - hwaccess: "hro" - hwqe: "true" - resval: 4194304 // 0x400000 starting point of the L1 - fields: [{ bits: "31:0" }] + { multireg: + { + name: "partition_sel" + desc: "Tile grouping for DAS partition" + swaccess: "wo" + hwaccess: "hrw" + hwqe: "true" + // External because we want to define the reset from a parameter + hwext: "true" + count: "NumDASPartitions" + cname: "partition_sel" + fields: [{ bits: "31:0" }] + }, }, - { name: "start_addr_scheme_2" - desc: "Allocated size on partition 2" - swaccess: "wo" - hwaccess: "hro" - hwqe: "true" - resval: 4194304 // 0x400000 starting point of the L1 - fields: [{ bits: "31:0" }] + { multireg: + { + name: "allocated_size" + desc: "Allocated size on DAS partition" + swaccess: "wo" + hwaccess: "hro" + hwqe: "true" + count: "NumDASPartitions" + cname: "allocated_size" + fields: [{ bits: "31:0" }] + }, }, - { name: "start_addr_scheme_3" - desc: "Allocated size on partition 3" - swaccess: "wo" - hwaccess: "hro" - hwqe: "true" - resval: 4194304 // 0x400000 starting point of the L1 - fields: [{ bits: "31:0" }] + { multireg: + { + name: "start_addr_scheme" + desc: "Start address of DAS partition" + swaccess: "wo" + hwaccess: "hrw" + 
hwqe: "true" + // External because we want to define the reset from a parameter + hwext: "true" + count: "NumDASPartitions" + cname: "start_addr_scheme" + fields: [{ bits: "31:0" }] + }, }, - { name: "tcdm_start_address" desc: "TCDM Start Address Register" swaccess: "ro" diff --git a/hardware/src/control_registers/control_registers_reg_pkg.sv b/hardware/src/control_registers/control_registers_reg_pkg.sv index d7d6cc373..95a5d8541 100644 --- a/hardware/src/control_registers/control_registers_reg_pkg.sv +++ b/hardware/src/control_registers/control_registers_reg_pkg.sv @@ -9,6 +9,7 @@ package control_registers_reg_pkg; // Param list parameter int ROCacheNumAddrRules = 4; parameter int MAX_NumGroups = 8; + parameter int NumDASPartitions = 4; // Address widths within the block parameter int BlockAw = 8; @@ -49,62 +50,17 @@ package control_registers_reg_pkg; typedef struct packed { logic [31:0] q; logic qe; - } control_registers_reg2hw_partition_sel_0_reg_t; + } control_registers_reg2hw_partition_sel_mreg_t; typedef struct packed { logic [31:0] q; logic qe; - } control_registers_reg2hw_partition_sel_1_reg_t; + } control_registers_reg2hw_allocated_size_mreg_t; typedef struct packed { logic [31:0] q; logic qe; - } control_registers_reg2hw_partition_sel_2_reg_t; - - typedef struct packed { - logic [31:0] q; - logic qe; - } control_registers_reg2hw_partition_sel_3_reg_t; - - typedef struct packed { - logic [31:0] q; - logic qe; - } control_registers_reg2hw_allocated_size_0_reg_t; - - typedef struct packed { - logic [31:0] q; - logic qe; - } control_registers_reg2hw_allocated_size_1_reg_t; - - typedef struct packed { - logic [31:0] q; - logic qe; - } control_registers_reg2hw_allocated_size_2_reg_t; - - typedef struct packed { - logic [31:0] q; - logic qe; - } control_registers_reg2hw_allocated_size_3_reg_t; - - typedef struct packed { - logic [31:0] q; - logic qe; - } control_registers_reg2hw_start_addr_scheme_0_reg_t; - - typedef struct packed { - logic [31:0] q; - logic qe; 
- } control_registers_reg2hw_start_addr_scheme_1_reg_t; - - typedef struct packed { - logic [31:0] q; - logic qe; - } control_registers_reg2hw_start_addr_scheme_2_reg_t; - - typedef struct packed { - logic [31:0] q; - logic qe; - } control_registers_reg2hw_start_addr_scheme_3_reg_t; + } control_registers_reg2hw_start_addr_scheme_mreg_t; typedef struct packed { logic [31:0] q; @@ -124,6 +80,14 @@ package control_registers_reg_pkg; logic qe; } control_registers_reg2hw_ro_cache_end_mreg_t; + typedef struct packed { + logic [31:0] d; + } control_registers_hw2reg_partition_sel_mreg_t; + + typedef struct packed { + logic [31:0] d; + } control_registers_hw2reg_start_addr_scheme_mreg_t; + typedef struct packed { logic [31:0] d; } control_registers_hw2reg_tcdm_start_address_reg_t; @@ -152,18 +116,9 @@ package control_registers_reg_pkg; control_registers_reg2hw_wake_up_group_reg_t wake_up_group; // [822:790] control_registers_reg2hw_wake_up_strd_reg_t wake_up_strd; // [789:757] control_registers_reg2hw_wake_up_offst_reg_t wake_up_offst; // [756:724] - control_registers_reg2hw_partition_sel_0_reg_t partition_sel_0; // [723:691] - control_registers_reg2hw_partition_sel_1_reg_t partition_sel_1; // [690:658] - control_registers_reg2hw_partition_sel_2_reg_t partition_sel_2; // [657:625] - control_registers_reg2hw_partition_sel_3_reg_t partition_sel_3; // [624:592] - control_registers_reg2hw_allocated_size_0_reg_t allocated_size_0; // [591:559] - control_registers_reg2hw_allocated_size_1_reg_t allocated_size_1; // [558:526] - control_registers_reg2hw_allocated_size_2_reg_t allocated_size_2; // [525:493] - control_registers_reg2hw_allocated_size_3_reg_t allocated_size_3; // [492:460] - control_registers_reg2hw_start_addr_scheme_0_reg_t start_addr_scheme_0; // [459:427] - control_registers_reg2hw_start_addr_scheme_1_reg_t start_addr_scheme_1; // [426:394] - control_registers_reg2hw_start_addr_scheme_2_reg_t start_addr_scheme_2; // [393:361] - 
control_registers_reg2hw_start_addr_scheme_3_reg_t start_addr_scheme_3; // [360:328] + control_registers_reg2hw_partition_sel_mreg_t [3:0] partition_sel; // [723:592] + control_registers_reg2hw_allocated_size_mreg_t [3:0] allocated_size; // [591:460] + control_registers_reg2hw_start_addr_scheme_mreg_t [3:0] start_addr_scheme; // [459:328] control_registers_reg2hw_ro_cache_enable_reg_t ro_cache_enable; // [327:296] control_registers_reg2hw_ro_cache_flush_reg_t ro_cache_flush; // [295:264] control_registers_reg2hw_ro_cache_start_mreg_t [3:0] ro_cache_start; // [263:132] @@ -172,6 +127,8 @@ package control_registers_reg_pkg; // HW -> register type typedef struct packed { + control_registers_hw2reg_partition_sel_mreg_t [3:0] partition_sel; // [607:480] + control_registers_hw2reg_start_addr_scheme_mreg_t [3:0] start_addr_scheme; // [479:352] control_registers_hw2reg_tcdm_start_address_reg_t tcdm_start_address; // [351:320] control_registers_hw2reg_tcdm_end_address_reg_t tcdm_end_address; // [319:288] control_registers_hw2reg_nr_cores_reg_reg_t nr_cores_reg; // [287:256] @@ -220,6 +177,14 @@ package control_registers_reg_pkg; parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_END_3_OFFSET = 8'h 94; // Reset values for hwext registers and their fields + parameter logic [31:0] CONTROL_REGISTERS_PARTITION_SEL_0_RESVAL = 32'h 0; + parameter logic [31:0] CONTROL_REGISTERS_PARTITION_SEL_1_RESVAL = 32'h 0; + parameter logic [31:0] CONTROL_REGISTERS_PARTITION_SEL_2_RESVAL = 32'h 0; + parameter logic [31:0] CONTROL_REGISTERS_PARTITION_SEL_3_RESVAL = 32'h 0; + parameter logic [31:0] CONTROL_REGISTERS_START_ADDR_SCHEME_0_RESVAL = 32'h 0; + parameter logic [31:0] CONTROL_REGISTERS_START_ADDR_SCHEME_1_RESVAL = 32'h 0; + parameter logic [31:0] CONTROL_REGISTERS_START_ADDR_SCHEME_2_RESVAL = 32'h 0; + parameter logic [31:0] CONTROL_REGISTERS_START_ADDR_SCHEME_3_RESVAL = 32'h 0; parameter logic [31:0] CONTROL_REGISTERS_TCDM_START_ADDRESS_RESVAL = 32'h 0; parameter logic [31:0] 
CONTROL_REGISTERS_TCDM_END_ADDRESS_RESVAL = 32'h 0; parameter logic [31:0] CONTROL_REGISTERS_NR_CORES_REG_RESVAL = 32'h 0; diff --git a/hardware/src/control_registers/control_registers_reg_top.sv b/hardware/src/control_registers/control_registers_reg_top.sv index a4ec1d7c3..c80a4f0dd 100644 --- a/hardware/src/control_registers/control_registers_reg_top.sv +++ b/hardware/src/control_registers/control_registers_reg_top.sv @@ -506,110 +506,74 @@ module control_registers_reg_top #( ); - // R[partition_sel_0]: V(False) - prim_subreg #( - .DW (32), - .SWACCESS("WO"), - .RESVAL (32'h80) - ) u_partition_sel_0 ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), + // Subregister 0 of Multireg partition_sel + // R[partition_sel_0]: V(True) - // from register interface + prim_subreg_ext #( + .DW (32) + ) u_partition_sel_0 ( + .re (1'b0), .we (partition_sel_0_we), .wd (partition_sel_0_wd), - - // from internal hardware - .de (1'b0), - .d ('0 ), - - // to internal hardware - .qe (reg2hw.partition_sel_0.qe), - .q (reg2hw.partition_sel_0.q ), - + .d (hw2reg.partition_sel[0].d), + .qre (), + .qe (reg2hw.partition_sel[0].qe), + .q (reg2hw.partition_sel[0].q ), .qs () ); + // Subregister 1 of Multireg partition_sel + // R[partition_sel_1]: V(True) - // R[partition_sel_1]: V(False) - - prim_subreg #( - .DW (32), - .SWACCESS("WO"), - .RESVAL (32'h80) + prim_subreg_ext #( + .DW (32) ) u_partition_sel_1 ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - - // from register interface + .re (1'b0), .we (partition_sel_1_we), .wd (partition_sel_1_wd), - - // from internal hardware - .de (1'b0), - .d ('0 ), - - // to internal hardware - .qe (reg2hw.partition_sel_1.qe), - .q (reg2hw.partition_sel_1.q ), - + .d (hw2reg.partition_sel[1].d), + .qre (), + .qe (reg2hw.partition_sel[1].qe), + .q (reg2hw.partition_sel[1].q ), .qs () ); + // Subregister 2 of Multireg partition_sel + // R[partition_sel_2]: V(True) - // R[partition_sel_2]: V(False) - - prim_subreg #( - .DW (32), - .SWACCESS("WO"), - .RESVAL (32'h80) + 
prim_subreg_ext #( + .DW (32) ) u_partition_sel_2 ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - - // from register interface + .re (1'b0), .we (partition_sel_2_we), .wd (partition_sel_2_wd), - - // from internal hardware - .de (1'b0), - .d ('0 ), - - // to internal hardware - .qe (reg2hw.partition_sel_2.qe), - .q (reg2hw.partition_sel_2.q ), - + .d (hw2reg.partition_sel[2].d), + .qre (), + .qe (reg2hw.partition_sel[2].qe), + .q (reg2hw.partition_sel[2].q ), .qs () ); + // Subregister 3 of Multireg partition_sel + // R[partition_sel_3]: V(True) - // R[partition_sel_3]: V(False) - - prim_subreg #( - .DW (32), - .SWACCESS("WO"), - .RESVAL (32'h80) + prim_subreg_ext #( + .DW (32) ) u_partition_sel_3 ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - - // from register interface + .re (1'b0), .we (partition_sel_3_we), .wd (partition_sel_3_wd), - - // from internal hardware - .de (1'b0), - .d ('0 ), - - // to internal hardware - .qe (reg2hw.partition_sel_3.qe), - .q (reg2hw.partition_sel_3.q ), - + .d (hw2reg.partition_sel[3].d), + .qre (), + .qe (reg2hw.partition_sel[3].qe), + .q (reg2hw.partition_sel[3].q ), .qs () ); + + // Subregister 0 of Multireg allocated_size // R[allocated_size_0]: V(False) prim_subreg #( @@ -629,13 +593,13 @@ module control_registers_reg_top #( .d ('0 ), // to internal hardware - .qe (reg2hw.allocated_size_0.qe), - .q (reg2hw.allocated_size_0.q ), + .qe (reg2hw.allocated_size[0].qe), + .q (reg2hw.allocated_size[0].q ), .qs () ); - + // Subregister 1 of Multireg allocated_size // R[allocated_size_1]: V(False) prim_subreg #( @@ -655,13 +619,13 @@ module control_registers_reg_top #( .d ('0 ), // to internal hardware - .qe (reg2hw.allocated_size_1.qe), - .q (reg2hw.allocated_size_1.q ), + .qe (reg2hw.allocated_size[1].qe), + .q (reg2hw.allocated_size[1].q ), .qs () ); - + // Subregister 2 of Multireg allocated_size // R[allocated_size_2]: V(False) prim_subreg #( @@ -681,13 +645,13 @@ module control_registers_reg_top #( .d ('0 ), // to internal hardware - 
.qe (reg2hw.allocated_size_2.qe), - .q (reg2hw.allocated_size_2.q ), + .qe (reg2hw.allocated_size[2].qe), + .q (reg2hw.allocated_size[2].q ), .qs () ); - + // Subregister 3 of Multireg allocated_size // R[allocated_size_3]: V(False) prim_subreg #( @@ -707,113 +671,75 @@ module control_registers_reg_top #( .d ('0 ), // to internal hardware - .qe (reg2hw.allocated_size_3.qe), - .q (reg2hw.allocated_size_3.q ), + .qe (reg2hw.allocated_size[3].qe), + .q (reg2hw.allocated_size[3].q ), .qs () ); - // R[start_addr_scheme_0]: V(False) - prim_subreg #( - .DW (32), - .SWACCESS("WO"), - .RESVAL (32'h400000) - ) u_start_addr_scheme_0 ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), + // Subregister 0 of Multireg start_addr_scheme + // R[start_addr_scheme_0]: V(True) - // from register interface + prim_subreg_ext #( + .DW (32) + ) u_start_addr_scheme_0 ( + .re (1'b0), .we (start_addr_scheme_0_we), .wd (start_addr_scheme_0_wd), - - // from internal hardware - .de (1'b0), - .d ('0 ), - - // to internal hardware - .qe (reg2hw.start_addr_scheme_0.qe), - .q (reg2hw.start_addr_scheme_0.q ), - + .d (hw2reg.start_addr_scheme[0].d), + .qre (), + .qe (reg2hw.start_addr_scheme[0].qe), + .q (reg2hw.start_addr_scheme[0].q ), .qs () ); + // Subregister 1 of Multireg start_addr_scheme + // R[start_addr_scheme_1]: V(True) - // R[start_addr_scheme_1]: V(False) - - prim_subreg #( - .DW (32), - .SWACCESS("WO"), - .RESVAL (32'h400000) + prim_subreg_ext #( + .DW (32) ) u_start_addr_scheme_1 ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - - // from register interface + .re (1'b0), .we (start_addr_scheme_1_we), .wd (start_addr_scheme_1_wd), - - // from internal hardware - .de (1'b0), - .d ('0 ), - - // to internal hardware - .qe (reg2hw.start_addr_scheme_1.qe), - .q (reg2hw.start_addr_scheme_1.q ), - + .d (hw2reg.start_addr_scheme[1].d), + .qre (), + .qe (reg2hw.start_addr_scheme[1].qe), + .q (reg2hw.start_addr_scheme[1].q ), .qs () ); + // Subregister 2 of Multireg start_addr_scheme + // 
R[start_addr_scheme_2]: V(True) - // R[start_addr_scheme_2]: V(False) - - prim_subreg #( - .DW (32), - .SWACCESS("WO"), - .RESVAL (32'h400000) + prim_subreg_ext #( + .DW (32) ) u_start_addr_scheme_2 ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - - // from register interface + .re (1'b0), .we (start_addr_scheme_2_we), .wd (start_addr_scheme_2_wd), - - // from internal hardware - .de (1'b0), - .d ('0 ), - - // to internal hardware - .qe (reg2hw.start_addr_scheme_2.qe), - .q (reg2hw.start_addr_scheme_2.q ), - + .d (hw2reg.start_addr_scheme[2].d), + .qre (), + .qe (reg2hw.start_addr_scheme[2].qe), + .q (reg2hw.start_addr_scheme[2].q ), .qs () ); + // Subregister 3 of Multireg start_addr_scheme + // R[start_addr_scheme_3]: V(True) - // R[start_addr_scheme_3]: V(False) - - prim_subreg #( - .DW (32), - .SWACCESS("WO"), - .RESVAL (32'h400000) + prim_subreg_ext #( + .DW (32) ) u_start_addr_scheme_3 ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - - // from register interface + .re (1'b0), .we (start_addr_scheme_3_we), .wd (start_addr_scheme_3_wd), - - // from internal hardware - .de (1'b0), - .d ('0 ), - - // to internal hardware - .qe (reg2hw.start_addr_scheme_3.qe), - .q (reg2hw.start_addr_scheme_3.q ), - + .d (hw2reg.start_addr_scheme[3].d), + .qre (), + .qe (reg2hw.start_addr_scheme[3].qe), + .q (reg2hw.start_addr_scheme[3].q ), .qs () ); diff --git a/hardware/src/ctrl_registers.sv b/hardware/src/ctrl_registers.sv index 06c10ec62..507df9e5c 100644 --- a/hardware/src/ctrl_registers.sv +++ b/hardware/src/ctrl_registers.sv @@ -102,20 +102,13 @@ module ctrl_registers `FFL(ctrl_hw2reg.ro_cache_end[i].d, ctrl_reg2hw.ro_cache_end[i].q, ctrl_reg2hw.ro_cache_end[i].qe, ro_cache_regions[i].end_addr, clk_i, rst_ni) end - - assign partition_sel_o[0] = ctrl_reg2hw.partition_sel_0.q[PartitionDataWidth-1:0]; - assign partition_sel_o[1] = ctrl_reg2hw.partition_sel_1.q[PartitionDataWidth-1:0]; - assign partition_sel_o[2] = ctrl_reg2hw.partition_sel_2.q[PartitionDataWidth-1:0]; - assign 
partition_sel_o[3] = ctrl_reg2hw.partition_sel_3.q[PartitionDataWidth-1:0]; - assign start_addr_scheme_o[0] = ctrl_reg2hw.start_addr_scheme_0.q; - assign start_addr_scheme_o[1] = ctrl_reg2hw.start_addr_scheme_1.q; - assign start_addr_scheme_o[2] = ctrl_reg2hw.start_addr_scheme_2.q; - assign start_addr_scheme_o[3] = ctrl_reg2hw.start_addr_scheme_3.q; - assign allocated_size_o[0] = ctrl_reg2hw.allocated_size_0.q[PartitionDataWidth-1:0]; - assign allocated_size_o[1] = ctrl_reg2hw.allocated_size_1.q[PartitionDataWidth-1:0]; - assign allocated_size_o[2] = ctrl_reg2hw.allocated_size_2.q[PartitionDataWidth-1:0]; - assign allocated_size_o[3] = ctrl_reg2hw.allocated_size_3.q[PartitionDataWidth-1:0]; - + for (genvar i = 0; i < mempool_pkg::NumDASPartitions; i++) begin: gen_das_regs + `FFL(ctrl_hw2reg.partition_sel[i].d, ctrl_reg2hw.partition_sel[i].q, ctrl_reg2hw.partition_sel[i].qe, mempool_pkg::NumTiles); + `FFL(ctrl_hw2reg.start_addr_scheme[i].d, ctrl_reg2hw.start_addr_scheme[i].q, ctrl_reg2hw.start_addr_scheme[i].qe, mempool_pkg::DASStartAddr); + assign partition_sel_o[i] = ctrl_hw2reg.partition_sel[i].d[PartitionDataWidth-1:0]; + assign start_addr_scheme_o[i] = ctrl_hw2reg.start_addr_scheme[i].d; + assign allocated_size_o[i] = ctrl_reg2hw.allocated_size[i].q[PartitionDataWidth-1:0]; + end /************************ * Wakeup Pulse Logic * diff --git a/software/runtime/control_registers.h b/software/runtime/control_registers.h index 8f509c9b1..88b4f96bd 100644 --- a/software/runtime/control_registers.h +++ b/software/runtime/control_registers.h @@ -19,6 +19,9 @@ extern "C" { // Maximum number of groups that we support in any configuration #define CONTROL_REGISTERS_PARAM_MAX_NUMGROUPS 8 +// Supported number of DAS partitions +#define CONTROL_REGISTERS_PARAM_NUM_D_A_S_PARTITIONS 4 + // Register width #define CONTROL_REGISTERS_PARAM_REG_WIDTH 32 @@ -66,40 +69,55 @@ extern "C" { // Wake Up Offst Register #define CONTROL_REGISTERS_WAKE_UP_OFFST_REG_OFFSET 0x30 -// Tile grouping 
for partition 1 +// Tile grouping for DAS partition (common parameters) +#define CONTROL_REGISTERS_PARTITION_SEL_PARTITION_SEL_FIELD_WIDTH 32 +#define CONTROL_REGISTERS_PARTITION_SEL_PARTITION_SEL_FIELDS_PER_REG 1 +#define CONTROL_REGISTERS_PARTITION_SEL_MULTIREG_COUNT 4 + +// Tile grouping for DAS partition #define CONTROL_REGISTERS_PARTITION_SEL_0_REG_OFFSET 0x34 -// Tile grouping for partition 2 +// Tile grouping for DAS partition #define CONTROL_REGISTERS_PARTITION_SEL_1_REG_OFFSET 0x38 -// Tile grouping for partition 3 +// Tile grouping for DAS partition #define CONTROL_REGISTERS_PARTITION_SEL_2_REG_OFFSET 0x3c -// Tile grouping for partition 4 +// Tile grouping for DAS partition #define CONTROL_REGISTERS_PARTITION_SEL_3_REG_OFFSET 0x40 -// Allocated size on partition 0 +// Allocated size on DAS partition (common parameters) +#define CONTROL_REGISTERS_ALLOCATED_SIZE_ALLOCATED_SIZE_FIELD_WIDTH 32 +#define CONTROL_REGISTERS_ALLOCATED_SIZE_ALLOCATED_SIZE_FIELDS_PER_REG 1 +#define CONTROL_REGISTERS_ALLOCATED_SIZE_MULTIREG_COUNT 4 + +// Allocated size on DAS partition #define CONTROL_REGISTERS_ALLOCATED_SIZE_0_REG_OFFSET 0x44 -// Allocated size on partition 1 +// Allocated size on DAS partition #define CONTROL_REGISTERS_ALLOCATED_SIZE_1_REG_OFFSET 0x48 -// Allocated size on partition 2 +// Allocated size on DAS partition #define CONTROL_REGISTERS_ALLOCATED_SIZE_2_REG_OFFSET 0x4c -// Allocated size on partition 3 +// Allocated size on DAS partition #define CONTROL_REGISTERS_ALLOCATED_SIZE_3_REG_OFFSET 0x50 -// Allocated size on partition 0 +// Start address of DAS partition (common parameters) +#define CONTROL_REGISTERS_START_ADDR_SCHEME_START_ADDR_SCHEME_FIELD_WIDTH 32 +#define CONTROL_REGISTERS_START_ADDR_SCHEME_START_ADDR_SCHEME_FIELDS_PER_REG 1 +#define CONTROL_REGISTERS_START_ADDR_SCHEME_MULTIREG_COUNT 4 + +// Start address of DAS partition #define CONTROL_REGISTERS_START_ADDR_SCHEME_0_REG_OFFSET 0x54 -// Allocated size on partition 1 +// Start address of DAS 
partition #define CONTROL_REGISTERS_START_ADDR_SCHEME_1_REG_OFFSET 0x58 -// Allocated size on partition 2 +// Start address of DAS partition #define CONTROL_REGISTERS_START_ADDR_SCHEME_2_REG_OFFSET 0x5c -// Allocated size on partition 3 +// Start address of DAS partition #define CONTROL_REGISTERS_START_ADDR_SCHEME_3_REG_OFFSET 0x60 // TCDM Start Address Register From bbf4280445ba11af43e76442ba6721788a931000 Mon Sep 17 00:00:00 2001 From: mbertuletti Date: Wed, 22 Oct 2025 17:38:38 +0200 Subject: [PATCH 15/34] [software] Streamline allocations in dynamic address regions --- software/runtime/alloc.c | 145 +----------------- software/runtime/alloc.h | 17 +- software/runtime/alloc_partition.c | 86 ----------- software/runtime/alloc_partition.h | 58 ------- software/runtime/arch.ld.c | 2 +- software/runtime/runtime.h | 47 +++--- software/runtime/runtime.mk | 1 - .../tests/baremetal/das_malloc_test/main.c | 75 ++++----- 8 files changed, 61 insertions(+), 370 deletions(-) delete mode 100644 software/runtime/alloc_partition.c delete mode 100644 software/runtime/alloc_partition.h diff --git a/software/runtime/alloc.c b/software/runtime/alloc.c index 401ddc116..877fdbd40 100644 --- a/software/runtime/alloc.c +++ b/software/runtime/alloc.c @@ -38,13 +38,7 @@ alloc_t alloc_tile[NUM_CORES / NUM_CORES_PER_TILE]; // ---------------------------------------------------------------------------- // Dynamic Heap Allocator // ---------------------------------------------------------------------------- -alloc_t* dynamic_heap_alloc = NULL; -void init_dynamic_heap_alloc(uint32_t num_partition){ // how many parts to devide the whole system - dynamic_heap_alloc = (alloc_t *)simple_malloc(num_partition * sizeof(alloc_t)); -} -void free_dynamic_heap_alloc(void){ - simple_free(dynamic_heap_alloc); -} +alloc_t dynamic_heap_alloc; // ---------------------------------------------------------------------------- // Canary System based on LSBs of block pointer @@ -288,140 +282,13 @@ void 
*simple_malloc(const uint32_t size) { return domain_malloc(&alloc_l1, size); } -// ------ Allocate a space aligned with L1 boundary ------ // -static uint32_t calc_aligned_l1_size (uint32_t* addr) { - uint32_t shift_size = 0; - uint32_t l1_aligned_mask = 0x3fff; - uint32_t masked_addr = (uint32_t)addr & l1_aligned_mask; - if (masked_addr==0x3ffc){ - shift_size = 0; - } - else{ - shift_size = 0x3ffc - masked_addr; - } - return shift_size; -} - -// Input size is block size: [data_size + meta_size] -static void *allocate_memory_l1_aligned(alloc_t *alloc, const uint32_t size) { - // Get first block of linked list of free blocks - alloc_block_t *curr = alloc->first_block; - alloc_block_t *prev = 0; - - uint32_t shift_size = 0; - shift_size = calc_aligned_l1_size( (uint32_t*)curr); - uint32_t aligned_size = size + shift_size; - - // Search first block large enough in linked list - while (curr && (curr->size < aligned_size)) { - prev = curr; - curr = curr->next; - - shift_size = calc_aligned_l1_size( (uint32_t*)curr); - aligned_size = size + shift_size; - } - - if (curr) { - // Update allocator - if (shift_size==0){ - printf("[L1 Alloc] No Alignment.\n"); - if (curr->size == size) { - // Special case: Whole block taken - if (prev) { - prev->next = curr->next; - } else { - alloc->first_block = curr->next; - } - } else { - // Regular case: Split off block - alloc_block_t *new_block = (alloc_block_t *)((char *)curr + size); - new_block->size = curr->size - size; - new_block->next = curr->next; - if (prev) { - prev->next = new_block; - } else { - alloc->first_block = new_block; - } - } - } - else{ - printf("[L1 Alloc] Alignment Needed.\n"); - if (curr->size == aligned_size) { - // Special case: Whole block taken, first part of the block is still empty - // store the curr info in tmp - // uint32_t tmp_size = curr->size; - struct alloc_block_s *tmp_next = curr->next; - alloc_block_t *shift_block = (alloc_block_t *)((char *)curr); - shift_block->size = shift_size; - 
shift_block->next = tmp_next; - if (prev) { - prev->next = shift_block; - } else { - alloc->first_block = shift_block; - } - } - else{ - // Regular case: Split off block - alloc_block_t *new_block = (alloc_block_t *)((char *)curr + aligned_size); - new_block->size = curr->size - aligned_size; - new_block->next = curr->next; - - alloc_block_t *shift_block = (alloc_block_t *)((char *)curr); - shift_block->size = shift_size; - shift_block->next = new_block; - if (prev) { - prev->next = shift_block; - } else { - alloc->first_block = shift_block; - } - } - } - - // Return block pointer - return (void *)((char *)curr+shift_size); - } else { - // There is no free block large enough - return NULL; - } -} - -void *domain_malloc_aligned(alloc_t *alloc, const uint32_t size) { - // Calculate actually required block size - uint32_t data_size = size + sizeof(uint32_t); // add size/metadata - uint32_t block_size = ALIGN_UP(data_size, MIN_BLOCK_SIZE); // add alignment - - // 32-bit metadata = 8-bit canary + 24-bit size - // i.e. 
max allowed block_size == (2^24 - 1) bytes - if (block_size >= (1 << (sizeof(uint32_t) * 8 - sizeof(uint8_t) * 8))) { - printf("Memory allocator: Requested memory exceeds max block size\n"); - return NULL; - } - - // Allocate memory - void *block_ptr = allocate_memory_l1_aligned(alloc, block_size); - if (!block_ptr) { - printf("Memory allocator: No large enough block found (%d)\n", block_size); - return NULL; - } - - // Store canary and size into first four bytes - *((uint32_t *)block_ptr) = canary_encode(block_ptr, block_size); - - // Return data pointer - void *data_ptr = (void *)((uint32_t *)block_ptr + 1); - printf("[Aligned malloc] addr: %p - size: %d\n", data_ptr, size); - return data_ptr; -} - -void *simple_aligned_malloc(const uint32_t size){ - return domain_malloc_aligned(&alloc_l1, size); -} - // ------ This function allocate data in Sequential Heap region ------ // // Canary system is stored in a seperate linked list // void *partition_malloc(alloc_t *alloc, const uint32_t size){ -void *partition_malloc(alloc_t *alloc, const uint32_t size, const uint32_t allocated_size){ - uint32_t data_size = size; +void *partition_malloc(alloc_t *alloc, const uint32_t size){ + + uint32_t data_size = size > 2*NUM_BANKS*sizeof(uint32_t) ? 
size : 2*NUM_BANKS*sizeof(uint32_t); + uint32_t allocated_size = data_size / (NUM_BANKS * sizeof(uint32_t)); uint32_t block_size = ALIGN_UP(data_size, MIN_BLOCK_SIZE); // add alignment // Check if exceed maximum allowed size @@ -700,4 +567,4 @@ alloc_t *get_alloc_l1() { return &alloc_l1; } alloc_t *get_alloc_tile(const uint32_t tile_id) { return &alloc_tile[tile_id]; } // Dynamic Heap Allocator -alloc_t *get_dynamic_heap_alloc(const uint32_t part_id) {return &dynamic_heap_alloc[part_id];} \ No newline at end of file +alloc_t *get_dynamic_heap_alloc() {return &dynamic_heap_alloc;} diff --git a/software/runtime/alloc.h b/software/runtime/alloc.h index 8527edbf6..39a9577a0 100644 --- a/software/runtime/alloc.h +++ b/software/runtime/alloc.h @@ -34,11 +34,6 @@ void alloc_init(alloc_t *alloc, void *base, const uint32_t size); // Malloc in L1 memory void *simple_malloc(const uint32_t size); -void *simple_aligned_malloc(const uint32_t size); - -// Dynamic heap allocation with Canary Chain -void *partition_malloc(alloc_t *alloc, const uint32_t size, const uint32_t allocated_size); - // Malloc with specified allocator void *domain_malloc(alloc_t *alloc, const uint32_t size); @@ -54,6 +49,7 @@ void domain_free(alloc_t *alloc, void *const ptr); // Print out linked list of free blocks void alloc_dump(alloc_t *alloc); void canary_dump(void); + // Get allocator for L1 interleaved heap memory alloc_t *get_alloc_l1(); @@ -61,9 +57,12 @@ alloc_t *get_alloc_l1(); alloc_t *get_alloc_tile(const uint32_t tile_id); // ----- Dynamic Heap Allocator ----- // -extern alloc_t* dynamic_heap_alloc; -void init_dynamic_heap_alloc(uint32_t num_partition); -void free_dynamic_heap_alloc(void); -alloc_t *get_dynamic_heap_alloc(const uint32_t part_id); +alloc_t *get_dynamic_heap_alloc(); + +// Dynamic heap allocation with Canary Chain +void *partition_malloc(alloc_t *alloc, const uint32_t size); + +// Free dynamic heap allocation with Canary chain +void partition_free(alloc_t *alloc, void *const 
ptr); #endif diff --git a/software/runtime/alloc_partition.c b/software/runtime/alloc_partition.c deleted file mode 100644 index a572f9380..000000000 --- a/software/runtime/alloc_partition.c +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 -// -// Author: Bowen Wang - -#include "alloc.h" -#include "alloc_partition.h" -#include "runtime.h" -#include "printf.h" - -extern partition_status_t volatile partition_status[NUM_PART_REGION]; - -// ======================================================================================== -// Allocate a region in L1 for a single or several matrices -// @inp: (uint32_t) size --- size of the single allocated matrix -// @inp: (uint32_t) num_matrix --- How many mtrices in this region -// @inp: (int32_t* volitile *) target --- Where to store this pointer -// @inp: (uint32_t) group_factor --- GF_A/B/C -// ======================================================================================== -void alloc_matrix (float *volatile * target, uint32_t size, uint32_t group_factor, uint32_t num_matrix){ - - // 1. Get allocator for sequential Heap region - uint32_t total_size = size*num_matrix; - alloc_t* alloc_heap = get_dynamic_heap_alloc(0); - - // 2. alloc a space, store the return address to the target - *target = (float *)partition_malloc(alloc_heap, total_size*sizeof(float), total_size/NUM_ELEMENTS_PER_ROW); - // 3. find which partition in free - uint32_t pid=0; - uint32_t avail=0; - while( (pid> pid[%d], start_addr[%p].\n", pid, *target); - } - else{ - pid++; - } - } - if ( (pid==NUM_PART_REGION) && (avail==0) ){ - printf("Dynamic Allocator >> WARNING: No available partition region.\n"); - } - - // 4. 
Config the hardware - printf("Dynamic Allocator >> pid[%d] parallel_sections[%d] elements_per_section[%d]\n", pid, NUM_TILES/group_factor, size); - partition_config(pid, group_factor); - start_addr_scheme_config(pid, (uint32_t)(*target), total_size); - - // 5. Handle multi-matrices - if (num_matrix > 1){ - for (uint32_t ii=1; ii> pid[%d] is freed.\n", pid); - pid++; - } - else{ - pid++; - } - } - } -} - - -void free_alloc(uint32_t core_id){ - if (core_id == 0){ - free_dynamic_heap_alloc(); - } -} \ No newline at end of file diff --git a/software/runtime/alloc_partition.h b/software/runtime/alloc_partition.h deleted file mode 100644 index 61fc2a753..000000000 --- a/software/runtime/alloc_partition.h +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Author: Bowen Wang - -#ifndef _ALLOC_PARTITION_H_ -#define _ALLOC_PARTITION_H_ - -// ============================================================ -// Dynamic Data Pointers -// ============================================================ -#define NUM_TILES (128) -#ifdef FLOAT_APP -float* volatile Region_A[NUM_TILES] __attribute__((section(".l1"))); -float* volatile Region_B[NUM_TILES] __attribute__((section(".l1"))); -float* volatile Region_C[NUM_TILES] __attribute__((section(".l1"))); -float* volatile Region_D[NUM_TILES] __attribute__((section(".l1"))); -#endif - -#ifdef INT_APP -int32_t* volatile Region_A[NUM_TILES] __attribute__((section(".l1"))); -int32_t* volatile Region_B[NUM_TILES] __attribute__((section(".l1"))); -int32_t* volatile Region_C[NUM_TILES] __attribute__((section(".l1"))); -int32_t* volatile Region_D[NUM_TILES] __attribute__((section(".l1"))); -#endif - -// ============================================================ -// Group Factor -// ============================================================ -#ifndef _GF -#define _GF -#define GF_TILE (1) 
-#define GF_SUBG (8) -#define GF_GROUP (32) -#define GF_CLUSTER (128) -#endif - -// ============================================================ -// Dynamic Heap Region Status -// ============================================================ -#define NUM_ELEMENTS_PER_ROW (4096) -#define NUM_PART_REGION (4) -typedef struct { - float *data_addr; // trace which matrix belong to this partition - uint32_t status; // set to 1 if used -} partition_status_t; - -// ============================================================ -// Helper Functions -// ============================================================ -void alloc_matrix(float *volatile * target, uint32_t size, uint32_t group_factor, uint32_t num_matrix); - -void free_matrix(float *__restrict__ heap_matrix, uint32_t part_id, uint32_t core_id); - -void free_alloc(uint32_t core_id); - -#endif \ No newline at end of file diff --git a/software/runtime/arch.ld.c b/software/runtime/arch.ld.c index 43bc68dab..ba1f66457 100644 --- a/software/runtime/arch.ld.c +++ b/software/runtime/arch.ld.c @@ -32,7 +32,7 @@ SECTIONS { __heap_end = __l1_end; // DAS related, default impacted region size - __heap_seq_start = __l1_start + (NUM_CORES * 2 * L1_BANK_SIZE); + __heap_seq_start = __l1_start + (NUM_CORES * BANKING_FACTOR * L1_BANK_SIZE) - NUM_CORES * DAS_MEM_SIZE; fake_uart = 0xC0000000; } diff --git a/software/runtime/runtime.h b/software/runtime/runtime.h index 3cc5be60c..103984bb5 100644 --- a/software/runtime/runtime.h +++ b/software/runtime/runtime.h @@ -56,7 +56,7 @@ static uint32_t volatile *wake_up_offset_reg = /* DAS-related regs */ -static uint32_t volatile *partition_reg = +static uint32_t volatile *partition0_reg = (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + CONTROL_REGISTERS_PARTITION_SEL_0_REG_OFFSET); static uint32_t volatile *partition1_reg = @@ -178,41 +178,30 @@ static inline void mempool_reset_heap(const uint32_t core_id, uint32_t heap_seq_ } } - // Initialize Dynamic Heap Allocator, as default specified in the 
linker script -static inline void mempool_dynamic_heap_alloc_init(const uint32_t core_id, const uint32_t tiles_per_partition){ +static inline void mempool_dynamic_heap_alloc_init(const uint32_t core_id){ if (core_id == 0){ extern uint32_t __heap_seq_start; // Dynamic allocator base and size uint32_t seq_heap_base = (uint32_t)&__heap_seq_start; - uint32_t seq_heap_size = (NUM_CORES_PER_TILE * tiles_per_partition) * DAS_MEM_SIZE; - uint32_t num_partition = mempool_get_tile_count() / tiles_per_partition; - // Dynamically allocate the space for allocators - init_dynamic_heap_alloc(num_partition); - for (uint32_t part_id=0; part_id 2*NUM_BANKS*sizeof(uint32_t) ? size : 2*NUM_BANKS*sizeof(uint32_t); + uint32_t allocated_size = data_size / (NUM_BANKS * sizeof(uint32_t)); switch (reg_sel){ case 0: *start_addr_scheme0_reg = addr; - *allocated_size0_reg = size / 4096; + *allocated_size0_reg = allocated_size; break; case 1: *start_addr_scheme1_reg = addr; - *allocated_size1_reg = size / 4096; + *allocated_size1_reg = allocated_size; break; case 2: *start_addr_scheme2_reg = addr; - *allocated_size2_reg = size / 4096; + *allocated_size2_reg = allocated_size; break; case 3: *start_addr_scheme3_reg = addr; - *allocated_size3_reg = size / 4096; + *allocated_size3_reg = allocated_size; break; default: *start_addr_scheme0_reg = addr; - *allocated_size0_reg = size / 4096; + *allocated_size0_reg = allocated_size; break; } asm volatile("" ::: "memory"); diff --git a/software/runtime/runtime.mk b/software/runtime/runtime.mk index 70f6e0b33..5fa5846c2 100644 --- a/software/runtime/runtime.mk +++ b/software/runtime/runtime.mk @@ -160,7 +160,6 @@ endif LINKER_SCRIPT ?= $(ROOT_DIR)/arch.ld RUNTIME += $(ROOT_DIR)/alloc.c.o -RUNTIME += $(ROOT_DIR)/alloc_partition.c.o RUNTIME += $(ROOT_DIR)/crt0.S.o RUNTIME += $(ROOT_DIR)/printf.c.o RUNTIME += $(ROOT_DIR)/serial.c.o diff --git a/software/tests/baremetal/das_malloc_test/main.c b/software/tests/baremetal/das_malloc_test/main.c index 
a7bfd3b63..ac5677377 100644 --- a/software/tests/baremetal/das_malloc_test/main.c +++ b/software/tests/baremetal/das_malloc_test/main.c @@ -14,9 +14,9 @@ #include "runtime.h" #include "synchronization.h" -#define TILES_PER_PARTITION (2) -#define ARRAY_SIZE (2 * TILES_PER_PARTITION * BANKING_FACTOR * NUM_CORES_PER_TILE) - +#define NUM_TILES (NUM_CORES / NUM_CORES_PER_TILE) +#define NUM_TILES_PER_PARTITION (4) +#define ARRAY_SIZE (2 * NUM_TILES_PER_PARTITION * BANKING_FACTOR * NUM_CORES_PER_TILE) int main() { uint32_t core_id = mempool_get_core_id(); @@ -26,71 +26,50 @@ int main() { mempool_init(core_id); mempool_barrier_init(core_id); - // -------------------------------------------- - // Runtime Partition Selection - // -------------------------------------------- - - if (core_id == 0) { - printf("Initialize\n"); - // 1. Init dynamic heap allocator - partition_config(0, TILES_PER_PARTITION); - mempool_dynamic_heap_alloc_init(core_id, TILES_PER_PARTITION); - // 2. Set which partition write to. - uint32_t part_id = 0; // set to allocate in the first partition - // 3. Get the allocator and starting address to this region - alloc_t *dynamic_heap_alloc = get_dynamic_heap_alloc(part_id); - alloc_dump(dynamic_heap_alloc); - // 4. Allocate memory - uint32_t *array = (uint32_t *)domain_malloc(dynamic_heap_alloc, ARRAY_SIZE); - // 5. Move data - for (uint32_t i = 0; i < ARRAY_SIZE; ++i) { - array[i] = i; - } - // 6. Free array - domain_free(dynamic_heap_alloc, array); - // 7. Free dynamic allocator - free_dynamic_heap_alloc(); - printf("Done!\n"); - } - - mempool_barrier(num_cores); - // -------------------------------------------- // Verify partition // -------------------------------------------- if (core_id == 0) { printf("Verify partition\n"); + // 1. Init dynamic heap allocator - partition_config(0, TILES_PER_PARTITION); - mempool_dynamic_heap_alloc_init(core_id, TILES_PER_PARTITION); + mempool_dynamic_heap_alloc_init(core_id); + // 2. Set which partition write to. 
- uint32_t num_partition = mempool_get_tile_count() / TILES_PER_PARTITION; uint32_t part_id = 0; // set to allocate in the penultimate partition - // 3. Get the allocator and starting address to this region - alloc_t *dynamic_heap_alloc = get_dynamic_heap_alloc(part_id); + + // 3. Get the allocator + alloc_t *dynamic_heap_alloc = get_dynamic_heap_alloc(); alloc_dump(dynamic_heap_alloc); // 4. Allocate memory - uint32_t *array = (uint32_t *)domain_malloc(dynamic_heap_alloc, ARRAY_SIZE * TILES_PER_PARTITION); - // 5. Move data + uint32_t *array = (uint32_t *)partition_malloc(dynamic_heap_alloc, ARRAY_SIZE*sizeof(uint32_t)); + + // 5. Configure the hardware registers with the base address of the allocation (the pointer itself, not the value stored at array[0]) + partition_config(part_id, NUM_TILES_PER_PARTITION); + start_addr_scheme_config(part_id, (uint32_t)array, ARRAY_SIZE*sizeof(uint32_t)); + + // 6. Move data for (uint32_t i = 0; i < ARRAY_SIZE; i++) { array[i] = i; } - // 6. Change addressing scheme - partition_config(0, NUM_CORES / NUM_CORES_PER_TILE); + + // 7. Change addressing scheme (to fully interleaved) + partition_config(part_id, NUM_TILES); + + // 8. check for (uint32_t i = 0; i < ARRAY_SIZE; i++) { uint32_t *fetch_address = &array[0] + \ - (i % (TILES_PER_PARTITION * NUM_CORES_PER_TILE * BANKING_FACTOR)) + \ - (i / (TILES_PER_PARTITION * NUM_CORES_PER_TILE * BANKING_FACTOR)) * NUM_BANKS; + (i % (NUM_TILES_PER_PARTITION * NUM_CORES_PER_TILE * BANKING_FACTOR)) + \ + (i / (NUM_TILES_PER_PARTITION * NUM_CORES_PER_TILE * BANKING_FACTOR)) * NUM_BANKS; if (i != *fetch_address) { printf("%4d != %4d at address %8X.\n", i, *fetch_address, fetch_address); } } - // 7. Free array - domain_free(dynamic_heap_alloc, array); - // 8. Free dynamic allocator - free_dynamic_heap_alloc(); - printf("Done!\n"); + + // 9. 
Free array + partition_free(dynamic_heap_alloc, array); + printf("All correct!\n"); } mempool_barrier(num_cores); From 56654a50cd3a289bfd333c088d9d60761fdb31ce Mon Sep 17 00:00:00 2001 From: mbertuletti Date: Wed, 22 Oct 2025 17:39:04 +0200 Subject: [PATCH 16/34] [hardware] Correct waves display --- hardware/scripts/questa/wave_core.tcl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hardware/scripts/questa/wave_core.tcl b/hardware/scripts/questa/wave_core.tcl index 85340078d..bcc97e0a6 100644 --- a/hardware/scripts/questa/wave_core.tcl +++ b/hardware/scripts/questa/wave_core.tcl @@ -13,7 +13,7 @@ if {$config == {terapool}} { add wave -noupdate -group core[$1][$2][$3][$4] -group Params /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/gen_rtl_group/i_group/gen_sub_groups[$2]/gen_rtl_sg/i_sub_group/gen_tiles[$3]/i_tile/gen_cores[$4]/gen_mempool_cc/riscv_core/RegisterTCDMReq add wave -noupdate -group core[$1][$2][$3][$4] -group Params /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/gen_rtl_group/i_group/gen_sub_groups[$2]/gen_rtl_sg/i_sub_group/gen_tiles[$3]/i_tile/gen_cores[$4]/gen_mempool_cc/riscv_core/RegisterTCDMResp add wave -noupdate -group core[$1][$2][$3][$4] /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/gen_rtl_group/i_group/gen_sub_groups[$2]/gen_rtl_sg/i_sub_group/gen_tiles[$3]/i_tile/gen_cores[$4]/gen_mempool_cc/riscv_core/i_snitch/clk_i - add wave -noupdate -group core[$1][$2][$3][$4] /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/gen_rtl_group/i_group/gen_sub_groups[$2]/gen_rtl_sg/i_sub_group/gen_tiles[$3]/i_tile/gen_cores[$4]/gen_mempool_cc/riscv_core/i_snitch/rst_i + add wave -noupdate -group core[$1][$2][$3][$4] /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/gen_rtl_group/i_group/gen_sub_groups[$2]/gen_rtl_sg/i_sub_group/gen_tiles[$3]/i_tile/gen_cores[$4]/gen_mempool_cc/riscv_core/i_snitch/rst_ni add wave -noupdate -group core[$1][$2][$3][$4] -radix unsigned 
/mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/gen_rtl_group/i_group/gen_sub_groups[$2]/gen_rtl_sg/i_sub_group/gen_tiles[$3]/i_tile/gen_cores[$4]/gen_mempool_cc/riscv_core/i_snitch/hart_id_i add wave -noupdate -group core[$1][$2][$3][$4] -divider Instructions @@ -182,7 +182,7 @@ if {$config == {terapool}} { add wave -noupdate -group core[$1][$2][$3] -group Params /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/i_group/gen_tiles[$2]/i_tile/gen_cores[$3]/gen_mempool_cc/riscv_core/RegisterTCDMReq add wave -noupdate -group core[$1][$2][$3] -group Params /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/i_group/gen_tiles[$2]/i_tile/gen_cores[$3]/gen_mempool_cc/riscv_core/RegisterTCDMResp add wave -noupdate -group core[$1][$2][$3] /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/i_group/gen_tiles[$2]/i_tile/gen_cores[$3]/gen_mempool_cc/riscv_core/i_snitch/clk_i - add wave -noupdate -group core[$1][$2][$3] /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/i_group/gen_tiles[$2]/i_tile/gen_cores[$3]/gen_mempool_cc/riscv_core/i_snitch/rst_i + add wave -noupdate -group core[$1][$2][$3] /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/i_group/gen_tiles[$2]/i_tile/gen_cores[$3]/gen_mempool_cc/riscv_core/i_snitch/rst_ni add wave -noupdate -group core[$1][$2][$3] -radix unsigned /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/i_group/gen_tiles[$2]/i_tile/gen_cores[$3]/gen_mempool_cc/riscv_core/i_snitch/hart_id_i add wave -noupdate -group core[$1][$2][$3] -divider Instructions From 9c021728a2a7cf65f312b2f7debdfadd2a5e8831 Mon Sep 17 00:00:00 2001 From: mbertuletti Date: Wed, 22 Oct 2025 17:40:01 +0200 Subject: [PATCH 17/34] [hardware] Streamline address scrambler [hardware] Correct address scrambler parametrization --- config/config.mk | 3 +- hardware/Makefile | 2 + hardware/src/address_scrambler.sv | 163 ++++++++---------------------- hardware/src/mempool_pkg.sv | 4 + hardware/src/mempool_tile.sv | 14 +-- 5 files changed, 59 insertions(+), 127 deletions(-) diff --git 
a/config/config.mk b/config/config.mk index 6db1ac3b7..3a4fd33f8 100644 --- a/config/config.mk +++ b/config/config.mk @@ -75,7 +75,8 @@ xDivSqrt ?= 0 # Enable configurable addressing scheme in the heap das ?= 1 -# Size for configurable addressing scheme heap +num_das_partitions ?= 4 +# Size of DAS-heap per core das_mem_size ?= 2048 # This parameter is only used for TeraPool configurations diff --git a/hardware/Makefile b/hardware/Makefile index 361a8de03..cb17f11d5 100644 --- a/hardware/Makefile +++ b/hardware/Makefile @@ -120,6 +120,8 @@ vlog_defs += -DL2_BANKS=$(l2_banks) vlog_defs += -DL1_BANK_SIZE=$(l1_bank_size) vlog_defs += -DBOOT_ADDR=32\'d$(boot_addr) vlog_defs += -DDAS=$(das) +vlog_defs += -DNUM_DAS_PARTITIONS=$(num_das_partitions) +vlog_defs += -DDAS_MEM_SIZE=$(das_mem_size) # Snitch ISA vlog_defs += -DXPULPIMG=$(xpulpimg) vlog_defs += -DZFINX=$(zfinx) diff --git a/hardware/src/address_scrambler.sv b/hardware/src/address_scrambler.sv index 493819ee0..028be3360 100644 --- a/hardware/src/address_scrambler.sv +++ b/hardware/src/address_scrambler.sv @@ -6,7 +6,7 @@ // sequentially and part is interleaved. 
// Current constraints: -// Author: Samuel Riedel +// Author: Marco Bertuletti module address_scrambler #( parameter int unsigned AddrWidth = 32, @@ -16,17 +16,16 @@ module address_scrambler #( parameter int unsigned NumBanksPerTile = 2, parameter bit Bypass = 0, parameter int unsigned SeqMemSizePerTile = 4*1024, - parameter int unsigned HeapSeqMemSizePerTile = 8*2048, - parameter int unsigned MemSizePerTile = 8*4*1024, - parameter int unsigned MemSizePerRow = 4*4*1024, // 4bytes * 4096 banks - parameter int unsigned TCDMSize = 1024*1024 + parameter int unsigned TCDMSizePerBank = 1024, + parameter int unsigned NumDASPartitions = 4, + parameter int unsigned MemSizePerTile = NumBanksPerTile*TCDMSizePerBank, + parameter int unsigned MemSizePerRow = (1 << ByteOffset)*NumBanksPerTile*NumTiles ) ( - input logic [AddrWidth-1:0] address_i, - input logic [3:0][7:0] group_factor_i, - // For each allocation, the maximum number of rows assigned can be 128 rows - input logic [3:0][7:0] allocated_size_i, - input logic [3:0][DataWidth-1:0] start_addr_scheme_i, - output logic [AddrWidth-1:0] address_o + input logic [AddrWidth-1:0] address_i, + input logic [NumDASPartitions-1:0][$clog2(NumTiles):0] group_factor_i, + input logic [NumDASPartitions-1:0][$clog2(NumTiles):0] allocated_size_i, + input logic [NumDASPartitions-1:0][DataWidth-1:0] start_addr_scheme_i, + output logic [AddrWidth-1:0] address_o ); // Stack Sequential Settings localparam int unsigned BankOffsetBits = $clog2(NumBanksPerTile); @@ -36,12 +35,6 @@ module address_scrambler #( localparam int unsigned ConstantBitsLSB = ByteOffset + BankOffsetBits; localparam int unsigned ScrambleBits = SeqPerTileBits-ConstantBitsLSB; - // Heap Sequential Settings - localparam int unsigned HeapSeqPerTileBits = $clog2(MemSizePerTile); // log2(8*4096) = 15 | RowIndexBits + ConstBits - localparam int unsigned HeapSeqTotalBits = HeapSeqPerTileBits+TileIdBits; // 15+7=22 | used for address_o assignment - localparam int unsigned RowIndexBits 
= HeapSeqPerTileBits-ConstantBitsLSB; // 15-7=8 | RowIndex - - if (Bypass || NumTiles < 2) begin assign address_o = address_i; end else begin @@ -57,121 +50,53 @@ module address_scrambler #( // ------ Heap Sequential Signals ------ // - // `shift_index` : how many bits to shift for TileID bits in each partition - // `shift_index_sc`: how many bits need to swap within Row Index - logic [3:0][2:0] shift_index; - logic [3:0][2:0] shift_index_sc; - for (genvar i = 0; i < 4; i++) begin : gen_shift_index - always_comb begin - case(group_factor_i[i]) - 128: shift_index[i] = 7; - 64: shift_index[i] = 6; - 32: shift_index[i] = 5; - 16: shift_index[i] = 4; - 8: shift_index[i] = 3; - 4: shift_index[i] = 2; - 2: shift_index[i] = 1; - default: shift_index[i] = 0; - endcase - - case(allocated_size_i[i]) - 128: shift_index_sc[i] = 7; - 64: shift_index_sc[i] = 6; - 32: shift_index_sc[i] = 5; - 16: shift_index_sc[i] = 4; - 8: shift_index_sc[i] = 3; - 4: shift_index_sc[i] = 2; - 2: shift_index_sc[i] = 1; - default: shift_index_sc[i] = 0; - endcase - end + // `tile_index` : how many bits to shift for TileID bits in each partition + // `row_index`: how many bits need to swap within Row Index + logic [NumDASPartitions-1:0][$clog2(NumTiles):0] tile_index; + logic [NumDASPartitions-1:0][$clog2(NumTiles):0] row_index; + + for (genvar i = 0; i < NumDASPartitions; i++) begin : gen_shift_index + lzc #( + .WIDTH ($clog2(NumTiles)+1), + .MODE (1'b0 ) + ) i_log_tile_index ( + .in_i (group_factor_i[i]), + .cnt_o (tile_index[i] ), + .empty_o (/* Unused */ ) + ); + lzc #( + .WIDTH ($clog2(NumTiles)), + .MODE (1'b0 ) + ) i_log_row_index ( + .in_i (allocated_size_i[i]), + .cnt_o (row_index[i] ), + .empty_o (/* Unused */ ) + ); end - - // post-scramble row index - logic [RowIndexBits-1:0] post_scramble_row_index; - logic [TileIdBits-1:0] post_scramble_tile_id; - - logic [3:0][RowIndexBits-1:0] mask_row_index, mask_row_index_n; - logic [3:0][TileIdBits-1:0] mask_tile_id, mask_tile_id_n; - - logic 
[TileIdBits-1:0] heap_tile_id; - - for (genvar j = 0; j < 4; j++) begin : gen_mask - assign mask_row_index[j] = (shift_index_sc[j] == 0) ? {RowIndexBits{1'b0}} : ({RowIndexBits{1'b1}} >> (RowIndexBits-shift_index_sc[j])); - assign mask_tile_id[j] = (shift_index[j] == 0) ? {TileIdBits{1'b0}} : ({TileIdBits{1'b1}} >> (TileIdBits -shift_index[j])); - - assign mask_row_index_n[j] = ~mask_row_index[j]; - assign mask_tile_id_n[j] = ~mask_tile_id[j]; - end - - assign heap_tile_id = address_i[(TileIdBits+ConstantBitsLSB-1):ConstantBitsLSB]; - always_comb begin // Default: Unscrambled address_o[ConstantBitsLSB-1:0] = address_i[ConstantBitsLSB-1:0]; address_o[SeqTotalBits-1:ConstantBitsLSB] = {tile_id, scramble}; address_o[AddrWidth-1:SeqTotalBits] = address_i[AddrWidth-1:SeqTotalBits]; - post_scramble_row_index = 'b0; - post_scramble_tile_id = 'b0; // Stack Region if (address_i < (NumTiles * SeqMemSizePerTile)) begin address_o[SeqTotalBits-1:ConstantBitsLSB] = {scramble, tile_id}; - // Sequential Heap Region - end else if ( (address_i >= start_addr_scheme_i[0]) && (address_i < start_addr_scheme_i[0]+MemSizePerRow*allocated_size_i[0]) ) begin - - post_scramble_row_index = 'b0; - post_scramble_tile_id = 'b0; - // 1. `post_scramble_row_index` generation - post_scramble_row_index |= (address_i >> (ConstantBitsLSB + shift_index[0])) & mask_row_index[0]; - post_scramble_row_index |= (address_i >> (ConstantBitsLSB + TileIdBits )) & mask_row_index_n[0]; - - // 2. `post_scramble_tile_id` generation - post_scramble_tile_id |= heap_tile_id & mask_tile_id[0]; - post_scramble_tile_id |= (address_i >> (ConstantBitsLSB + shift_index_sc[0])) & mask_tile_id_n[0]; - - address_o[HeapSeqTotalBits-1:ConstantBitsLSB] = {post_scramble_row_index, post_scramble_tile_id}; - end else if ( (address_i >= start_addr_scheme_i[1]) && (address_i < start_addr_scheme_i[1]+MemSizePerRow*allocated_size_i[1]) ) begin - - post_scramble_row_index = 'b0; - post_scramble_tile_id = 'b0; - // 1. 
`post_scramble_row_index` generation - post_scramble_row_index |= (address_i >> (ConstantBitsLSB + shift_index[1])) & mask_row_index[1]; - post_scramble_row_index |= (address_i >> (ConstantBitsLSB + TileIdBits )) & mask_row_index_n[1]; - - // 2. `post_scramble_tile_id` generation - post_scramble_tile_id |= heap_tile_id & mask_tile_id[1]; - post_scramble_tile_id |= (address_i >> (ConstantBitsLSB + shift_index_sc[1])) & mask_tile_id_n[1]; - - address_o[HeapSeqTotalBits-1:ConstantBitsLSB] = {post_scramble_row_index, post_scramble_tile_id}; - end else if ( (address_i >= start_addr_scheme_i[2]) && (address_i < start_addr_scheme_i[2]+MemSizePerRow*allocated_size_i[2]) ) begin - - post_scramble_row_index = 'b0; - post_scramble_tile_id = 'b0; - // 1. `post_scramble_row_index` generation - post_scramble_row_index |= (address_i >> (ConstantBitsLSB + shift_index[2])) & mask_row_index[2]; - post_scramble_row_index |= (address_i >> (ConstantBitsLSB + TileIdBits )) & mask_row_index_n[2]; - - // 2. `post_scramble_tile_id` generation - post_scramble_tile_id |= heap_tile_id & mask_tile_id[2]; - post_scramble_tile_id |= (address_i >> (ConstantBitsLSB + shift_index_sc[2])) & mask_tile_id_n[2]; - - address_o[HeapSeqTotalBits-1:ConstantBitsLSB] = {post_scramble_row_index, post_scramble_tile_id}; - end else if ( (address_i >= start_addr_scheme_i[3]) && (address_i < start_addr_scheme_i[3]+MemSizePerRow*allocated_size_i[3]) ) begin - - post_scramble_row_index = 'b0; - post_scramble_tile_id = 'b0; - // 1. `post_scramble_row_index` generation - post_scramble_row_index |= (address_i >> (ConstantBitsLSB + shift_index[3])) & mask_row_index[3]; - post_scramble_row_index |= (address_i >> (ConstantBitsLSB + TileIdBits )) & mask_row_index_n[3]; - - // 2. 
`post_scramble_tile_id` generation - post_scramble_tile_id |= heap_tile_id & mask_tile_id[3]; - post_scramble_tile_id |= (address_i >> (ConstantBitsLSB + shift_index_sc[3])) & mask_tile_id_n[3]; + // DAS address scrambling + end else begin + + for (int p = 0; p < NumDASPartitions; p++) begin + if ( (address_i >= start_addr_scheme_i[0]) && (address_i < start_addr_scheme_i[0]+MemSizePerRow*allocated_size_i[0]) ) begin + address_o = '0; + address_o |= address_i & ((1 << (tile_index[0]+ConstantBitsLSB)) - 1); + address_o |= ((address_i >> (row_index[0]+tile_index[0]+ConstantBitsLSB)) << (tile_index[0]+ConstantBitsLSB)) & ((1 << (TileIdBits+ConstantBitsLSB)) - 1); + address_o |= ((address_i >> (tile_index[0]+ConstantBitsLSB)) << (TileIdBits + ConstantBitsLSB)) & ((1 << (row_index[0]+TileIdBits+ConstantBitsLSB)) - 1); + address_o |= address_i & ~((1 << (row_index[0]+TileIdBits+ConstantBitsLSB)) - 1); + end + end - address_o[HeapSeqTotalBits-1:ConstantBitsLSB] = {post_scramble_row_index, post_scramble_tile_id}; end end end diff --git a/hardware/src/mempool_pkg.sv b/hardware/src/mempool_pkg.sv index ea3962cd0..7c2e8efb1 100644 --- a/hardware/src/mempool_pkg.sv +++ b/hardware/src/mempool_pkg.sv @@ -46,6 +46,10 @@ package mempool_pkg; localparam integer unsigned NumBanksPerGroup = NumBanks / NumGroups; localparam integer unsigned TCDMAddrMemWidth = $clog2(TCDMSizePerBank / mempool_pkg::BeWidth); localparam integer unsigned TCDMAddrWidth = TCDMAddrMemWidth + idx_width(NumBanksPerGroup); + // DAS parameters + localparam integer unsigned NumDASPartitions = `ifdef NUM_DAS_PARTITIONS `NUM_DAS_PARTITIONS `else 0 `endif; + localparam integer unsigned DASMemSize = `ifdef DAS_MEM_SIZE `DAS_MEM_SIZE `else 0 `endif; + localparam integer unsigned DASStartAddr = (NumBanks * TCDMSizePerBank) - NumCores * DASMemSize; // L2 localparam integer unsigned L2Size = `ifdef L2_SIZE `L2_SIZE `else 0 `endif; // [B] diff --git a/hardware/src/mempool_tile.sv b/hardware/src/mempool_tile.sv index 
0b77db180..241848f62 100644 --- a/hardware/src/mempool_tile.sv +++ b/hardware/src/mempool_tile.sv @@ -901,14 +901,14 @@ module mempool_tile .NumBanksPerTile (NumBanksPerTile ), .Bypass (0 ), .SeqMemSizePerTile (SeqMemSizePerTile), - .HeapSeqMemSizePerTile (HeapSeqMemSizePerTile), - .TCDMSize (TCDMSize) + .NumDASPartitions (NumDASPartitions ), + .TCDMSizePerBank (TCDMSizePerBank ) ) i_address_scrambler ( - .address_i (snitch_data_qaddr[c] ), - .group_factor_i(partition_sel_i), - .allocated_size_i (allocated_size_i), - .start_addr_scheme_i(start_addr_scheme_i), - .address_o (snitch_data_qaddr_scrambled) + .address_i (snitch_data_qaddr[c]), + .group_factor_i (partition_sel_i ), + .allocated_size_i (allocated_size_i ), + .start_addr_scheme_i(start_addr_scheme_i ), + .address_o (snitch_data_qaddr_scrambled) ); if (!TrafficGeneration) begin: gen_tcdm_shim From d26b44ad6e02e4bd3e5a7e864600fa4f8310ccec Mon Sep 17 00:00:00 2001 From: mbertuletti Date: Wed, 22 Oct 2025 18:42:28 +0200 Subject: [PATCH 18/34] [software] Allocation stress-test over multiple partitions & Tile-groups --- software/runtime/runtime.mk | 1 + .../tests/baremetal/das_malloc_test/main.c | 105 ++++++++++++------ 2 files changed, 71 insertions(+), 35 deletions(-) diff --git a/software/runtime/runtime.mk b/software/runtime/runtime.mk index 5fa5846c2..4d485dd22 100644 --- a/software/runtime/runtime.mk +++ b/software/runtime/runtime.mk @@ -111,6 +111,7 @@ ifdef terapool DEFINES += -DNUM_TILES_PER_SUB_GROUP=$(shell awk 'BEGIN{print ($(num_cores)/$(num_groups))/$(num_cores_per_tile)/$(num_sub_groups_per_group)}') endif ifdef das + DEFINES += -DNUM_DAS_PARTITIONS=$(num_das_partitions) DEFINES += -DDAS_MEM_SIZE=$(das_mem_size) DEFINES += -DLOG2_DAS_MEM_SIZE=$(shell awk 'BEGIN{print log($(das_mem_size))/log(2)}') endif diff --git a/software/tests/baremetal/das_malloc_test/main.c b/software/tests/baremetal/das_malloc_test/main.c index ac5677377..bee4a9db8 100644 --- a/software/tests/baremetal/das_malloc_test/main.c 
+++ b/software/tests/baremetal/das_malloc_test/main.c @@ -15,8 +15,6 @@ #include "synchronization.h" #define NUM_TILES (NUM_CORES / NUM_CORES_PER_TILE) -#define NUM_TILES_PER_PARTITION (4) -#define ARRAY_SIZE (2 * NUM_TILES_PER_PARTITION * BANKING_FACTOR * NUM_CORES_PER_TILE) int main() { uint32_t core_id = mempool_get_core_id(); @@ -26,54 +24,91 @@ int main() { mempool_init(core_id); mempool_barrier_init(core_id); - // -------------------------------------------- - // Verify partition - // -------------------------------------------- - if (core_id == 0) { - printf("Verify partition\n"); + + // -------------------------------------------- + // Verify DAS partitions + // -------------------------------------------- + printf("Verify DAS partitions\n\n"); + + uint32_t num_tiles_per_partition = 4; + uint32_t array_size = 2 * num_tiles_per_partition * BANKING_FACTOR * NUM_CORES_PER_TILE; // 1. Init dynamic heap allocator mempool_dynamic_heap_alloc_init(core_id); // 2. Set which partition write to. - uint32_t part_id = 0; // set to allocate in the penultimate partition - - // 3. Get the allocator - alloc_t *dynamic_heap_alloc = get_dynamic_heap_alloc(); - alloc_dump(dynamic_heap_alloc); - // 4. Allocate memory - uint32_t *array = (uint32_t *)partition_malloc(dynamic_heap_alloc, ARRAY_SIZE*sizeof(uint32_t)); - - // 5. Config the hardware registers - partition_config(part_id, NUM_TILES_PER_PARTITION); - start_addr_scheme_config(part_id, (uint32_t)(*array), ARRAY_SIZE*sizeof(uint32_t)); - - // 6. Move data - for (uint32_t i = 0; i < ARRAY_SIZE; i++) { - array[i] = i; + for (uint32_t part_id = 0; part_id < NUM_DAS_PARTITIONS; part_id++) { + // 3. Get the allocator + alloc_t *dynamic_heap_alloc = get_dynamic_heap_alloc(); + alloc_dump(dynamic_heap_alloc); + // 4. Allocate memory + uint32_t *array = (uint32_t *)partition_malloc(dynamic_heap_alloc, array_size*sizeof(uint32_t)); + // 5. 
Config the hardware registers + partition_config(part_id, num_tiles_per_partition); + start_addr_scheme_config(part_id, (uint32_t)(*array), array_size*sizeof(uint32_t)); + // 6. Move data + for (uint32_t i = 0; i < array_size; i++) { + array[i] = i; + } + // 7. Change addressing scheme (to fully interleaved) + partition_config(part_id, NUM_TILES); + // 8. check + for (uint32_t i = 0; i < array_size; i++) { + uint32_t *fetch_address = &array[0] + \ + (i % (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) + \ + (i / (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) * NUM_BANKS; + if (i != *fetch_address) { + printf("%4d != %4d at address %8X.\n", i, *fetch_address, fetch_address); + return 1; + } + } + // 9. Free array + partition_free(dynamic_heap_alloc, array); + printf("SUCCESS on partition %d \n\n", part_id); } - // 7. Change addressing scheme (to fully interleaved) - partition_config(part_id, NUM_TILES); + // -------------------------------------------- + // Verify DAS per Tile groups + // -------------------------------------------- + printf("Verify DAS per Tile-groups\n\n"); - // 8. check - for (uint32_t i = 0; i < ARRAY_SIZE; i++) { - uint32_t *fetch_address = &array[0] + \ - (i % (NUM_TILES_PER_PARTITION * NUM_CORES_PER_TILE * BANKING_FACTOR)) + \ - (i / (NUM_TILES_PER_PARTITION * NUM_CORES_PER_TILE * BANKING_FACTOR)) * NUM_BANKS; - if (i != *fetch_address) { - printf("%4d != %4d at address %8X.\n", i, *fetch_address, fetch_address); + // 2. Set which partition write to. + uint32_t part_id = 0; + for (num_tiles_per_partition = 1; num_tiles_per_partition < NUM_TILES; num_tiles_per_partition *= 2) { + array_size = 2 * num_tiles_per_partition * BANKING_FACTOR * NUM_CORES_PER_TILE; + // 3. Get the allocator + alloc_t *dynamic_heap_alloc = get_dynamic_heap_alloc(); + alloc_dump(dynamic_heap_alloc); + // 4. Allocate memory + uint32_t *array = (uint32_t *)partition_malloc(dynamic_heap_alloc, array_size*sizeof(uint32_t)); + // 5. 
Config the hardware registers + partition_config(part_id, num_tiles_per_partition); + start_addr_scheme_config(part_id, (uint32_t)(*array), array_size*sizeof(uint32_t)); + // 6. Move data + for (uint32_t i = 0; i < array_size; i++) { + array[i] = i; } + // 7. Change addressing scheme (to fully interleaved) + partition_config(part_id, NUM_TILES); + // 8. check + for (uint32_t i = 0; i < array_size; i++) { + uint32_t *fetch_address = &array[0] + \ + (i % (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) + \ + (i / (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) * NUM_BANKS; + if (i != *fetch_address) { + printf("%4d != %4d at address %8X.\n", i, *fetch_address, fetch_address); + return 1; + } + } + // 9. Free array + partition_free(dynamic_heap_alloc, array); + printf("SUCCESS for groups of %d tiles over the partition \n\n", num_tiles_per_partition); } - // 9. Free array - partition_free(dynamic_heap_alloc, array); printf("All correct!\n"); } mempool_barrier(num_cores); - - return 0; } From 92242b75acf8bd7a4575556f096550bd8945f207 Mon Sep 17 00:00:00 2001 From: mbertuletti Date: Wed, 22 Oct 2025 18:45:17 +0200 Subject: [PATCH 19/34] [hardware] Trash redundant file --- hardware/src/idma_partition_midend.sv | 333 -------------------------- 1 file changed, 333 deletions(-) delete mode 100644 hardware/src/idma_partition_midend.sv diff --git a/hardware/src/idma_partition_midend.sv b/hardware/src/idma_partition_midend.sv deleted file mode 100644 index 8f1332674..000000000 --- a/hardware/src/idma_partition_midend.sv +++ /dev/null @@ -1,333 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. 
-// SPDX-License-Identifier: SHL-0.51 - -// Bowen Wang -// This module split one burst request to several according to partition scheme selected -// This module is inserted between `idma_split_midend` and `idma_distribute_midend` in Terapool Cluster - -`include "common_cells/registers.svh" - -module idma_partition_midend - import mempool_pkg::SeqMemSizePerTile; - import mempool_pkg::HeapSeqMemSizePerTile; - import mempool_pkg::TCDMSize; - #( - parameter int unsigned DmaRegionWidth = 1, // [B] Region that one port covers in bytes - parameter int unsigned DmaRegionStart = 32'h0000_0000, - parameter int unsigned DmaRegionEnd = 32'h1000_0000, - parameter int unsigned AddrWidth = 32, - parameter type burst_req_t = logic, - parameter type meta_t = logic -) ( - input logic clk_i, - input logic rst_ni, - // Slave - input burst_req_t burst_req_i, - input logic [7:0] beat_cnt_i, - input logic valid_i, - output logic ready_o, - output meta_t meta_o, - // Partition - input logic [7:0] group_factor_i, - input logic [7:0] allocated_size_i, - output logic partition_req_valid_o, - input logic part_beat_cnt_rst_i, - // Master - output burst_req_t burst_req_o, - output logic valid_o, - input logic ready_i, - input meta_t meta_i -); - - // DmaRegionWidth covered by each Tile in [bytes] - // DmaRegionWidth = #banks*4 = 4096*4 [bytes] - // TileDmaRegionWidth = 32*4 [bytes] - typedef logic [AddrWidth-1:0] addr_t; - // log2(4096*4)= 14 = TileIdBits + ConstBits = 7 + 7 - localparam DmaRegionAddressBits = $clog2(DmaRegionWidth); - localparam TileDmaRegionWidth = DmaRegionWidth / 128; - - // ------ Considering Partition Scheme ------ // - // How many bits more need to consider in each partition - logic [2:0] shift_index; - - logic [AddrWidth-1:0] PartitionDmaRegionWidth; - logic [AddrWidth-1:0] partition_mask; - - assign shift_index = (group_factor_i == 128) ? 0 : - (group_factor_i == 64) ? 1 : - (group_factor_i == 32) ? 2 : - (group_factor_i == 16) ? 3 : - (group_factor_i == 8 ) ? 
4 : - (group_factor_i == 4 ) ? 5 : - (group_factor_i == 2 ) ? 6 : 7; - // #bytes covered in each partition per row - assign PartitionDmaRegionWidth = TileDmaRegionWidth * group_factor_i; - // |--- 14 bits ---| Lower 14 bits in address - // 1111111_1111111 GF=128 - // 0111111_1111111 GF=64 - // 0011111_1111111 GF=32 - assign partition_mask = {DmaRegionAddressBits{1'b1}} >> shift_index; - - // start_addr: address in L1 of the current input burst - // masked_start_addr: address bits within partition region - addr_t start_addr, masked_start_addr; - always_comb begin - if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin - // L1 ------> L2 - start_addr = burst_req_i.src; - end else begin - // L2 ------> L1 - start_addr = burst_req_i.dst; - end - end - - assign masked_start_addr = start_addr & partition_mask; - - // ------ Handle Meta Data ------ // - logic req_valid; - // Forward IDLE signal - assign meta_o.backend_idle = meta_i.backend_idle; - // Forward trans_complete signal as well - assign meta_o.trans_complete = meta_i.trans_complete; - // Send the req_valid signal back to split_midend - assign partition_req_valid_o = req_valid; - - // ------ Split One request aligned to partition scheme ------ // - enum logic {Idle, Busy} state_d, state_q; - burst_req_t req_d, req_q; - - `FFARN(state_q, state_d, Idle, clk_i, rst_ni) - `FFARN(req_q, req_d, '0, clk_i, rst_ni) - - - // ------ Beat Counter Handler ------ // - // When detecting `negedge` on beat_cnt_i, meaning a new DMA request starts, - // beat counter of partition need to reset - // beat_cnt_i: how many beats has been sent from split midend - logic [7:0] beat_cnt_q; - `FFARN(beat_cnt_q, beat_cnt_i, '0, clk_i, rst_ni) - logic [7:0] rst_part_beat_cnt; - assign rst_part_beat_cnt = {8{~( ~(|beat_cnt_i) & (|beat_cnt_q) )}}; // fall edge detect, negative reset - - logic [7:0] part_beat_cnt_d, part_beat_cnt_q, part_beat_cnt_pre_q; - `FFARN(part_beat_cnt_pre_q, 
part_beat_cnt_d, '0, clk_i, rst_ni) - assign part_beat_cnt_q = part_beat_cnt_pre_q & rst_part_beat_cnt; - - // figure out which partition targeting - // only update if beat_cnt_i == 0 (first beat) - logic [2:0] pid_shift_index; - assign pid_shift_index = (group_factor_i == 128) ? 7 : - (group_factor_i == 64) ? 6 : - (group_factor_i == 32) ? 5 : - (group_factor_i == 16) ? 4 : - (group_factor_i == 8 ) ? 3 : - (group_factor_i == 4 ) ? 2 : - (group_factor_i == 2 ) ? 1 : 0; // TODO - - logic [6:0] part_id_d, part_id_q, part_id_mask; - `FFARN(part_id_q, part_id_d, '0, clk_i, rst_ni) - always_comb begin - part_id_d = part_id_q; - part_id_mask = {7{1'b1}}; - if (|beat_cnt_i == 0) begin - part_id_d = (group_factor_i == 128) ? 0 : (start_addr >> (pid_shift_index + 7)) & (part_id_mask>>pid_shift_index); - end - end - - // ------ Shifter from new partition layout ------ // - // maximum rows in each partition: 128 - // maximum number of partitions: 128 - - logic [7:0] shift_row, shift_partition; - logic [2:0] shift_index_sc; - logic [7:0] mask_shift_row; - always_comb begin - case(allocated_size_i) - 128: shift_index_sc = 7; - 64: shift_index_sc = 6; - 32: shift_index_sc = 5; - 16: shift_index_sc = 4; - 8: shift_index_sc = 3; - 4: shift_index_sc = 2; - 2: shift_index_sc = 1; - default: shift_index_sc = 0; - endcase - end - - assign shift_partition = part_beat_cnt_q >> shift_index_sc; - assign mask_shift_row = ~( {8{1'b1}}<= burst_req_i.num_bytes)begin - // increase part_beat_cnt - part_beat_cnt_d = part_beat_cnt_q + ready_i; - burst_req_o = burst_req_i; - if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin - // L1 ------> L2 - // correct addr = base addr + row offset + partition offset - if(beat_cnt_i == 0)begin - burst_req_o.src = burst_req_i.src + (part_beat_cnt_q-beat_cnt_i)*DmaRegionWidth; - end else begin - burst_req_o.src = burst_req_i.src + (part_beat_cnt_q-beat_cnt_i)*DmaRegionWidth + 
part_id_q*PartitionDmaRegionWidth; - end - end else begin - // L2 ------> L1 - if (beat_cnt_i == 0) begin - // handle 1.1 - // burst_req_o.dst = burst_req_i.dst + (part_beat_cnt_q-beat_cnt_i)*DmaRegionWidth; - burst_req_o.dst = burst_req_i.dst + part_beat_cnt_q*DmaRegionWidth; - end else begin - // handle 1.2 - burst_req_o.dst = burst_req_i.dst + (part_beat_cnt_q-beat_cnt_i)*DmaRegionWidth + part_id_q*PartitionDmaRegionWidth; - end - end - valid_o = 1'b1; - ready_o = ready_i; - req_valid = ready_i; - // 2. prepare split one beat into several - end else begin - // store and acknowledge - req_d = burst_req_i; - ready_o = 1'b1; - // keep: [src] [dest], modify: [num_bytes] - burst_req_o = burst_req_i; - burst_req_o.num_bytes = PartitionDmaRegionWidth-masked_start_addr; - if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin - // L1 ------> L2 - if (beat_cnt_i == 0) begin - req_d.src = burst_req_i.src + (part_beat_cnt_q-beat_cnt_i)*DmaRegionWidth; - burst_req_o.src = burst_req_i.src + (part_beat_cnt_q-beat_cnt_i)*DmaRegionWidth; - end else begin - // correct old version - req_d.src = burst_req_i.src + (part_beat_cnt_q-beat_cnt_i)*DmaRegionWidth + part_id_q*PartitionDmaRegionWidth; - burst_req_o.src = burst_req_i.src + (part_beat_cnt_q-beat_cnt_i)*DmaRegionWidth + part_id_q*PartitionDmaRegionWidth; - end - end else begin - // L2 ------> L1 - if (beat_cnt_i == 0) begin - // req_d.dst = burst_req_i.dst + (part_beat_cnt_q-beat_cnt_i)*DmaRegionWidth; - // burst_req_o.dst = burst_req_i.dst + (part_beat_cnt_q-beat_cnt_i)*DmaRegionWidth; - req_d.dst = burst_req_i.dst + shift_row*DmaRegionWidth + shift_partition*PartitionDmaRegionWidth; - burst_req_o.dst = burst_req_i.dst + shift_row*DmaRegionWidth + shift_partition*PartitionDmaRegionWidth; - end else begin - req_d.dst = burst_req_i.dst + (part_beat_cnt_q-beat_cnt_i)*DmaRegionWidth + part_id_q*PartitionDmaRegionWidth; - burst_req_o.dst = burst_req_i.dst + 
(part_beat_cnt_q-beat_cnt_i)*DmaRegionWidth + part_id_q*PartitionDmaRegionWidth; - end - end - // noftify downstream - valid_o = 1'b1; - if (ready_i) begin - // increase partition beat cnt - part_beat_cnt_d = part_beat_cnt_q + 1; - // downstream is ready to receive, modify the stored req - req_d.num_bytes -= PartitionDmaRegionWidth-masked_start_addr; - if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin - // L1 ------> L2 - req_d.src += DmaRegionWidth-masked_start_addr; // folded to second row - req_d.dst += PartitionDmaRegionWidth-masked_start_addr; - end else begin - // L2 ------> L1 - req_d.src += PartitionDmaRegionWidth-masked_start_addr; - // req_d.dst += DmaRegionWidth-masked_start_addr; - req_d.dst += DmaRegionWidth-masked_start_addr; // modification needed - end - req_valid = 1'b1; // one request sent, counter increment - end - state_d = Busy; - end - end - end - - Busy: begin - // get burst request from the stored one - burst_req_o = req_q; - valid_o = 1'b1; - req_valid = ready_i; // counter increment whenever one req sent to downstream - if ($unsigned(req_q.num_bytes) <= $unsigned(PartitionDmaRegionWidth)) begin - // last burst - if (ready_i) begin - state_d = Idle; - // burst_req does not need to change - // increase partition beat cnt - part_beat_cnt_d = part_beat_cnt_q + 1; - end - end else begin - // middle bursts - burst_req_o.num_bytes = PartitionDmaRegionWidth; - if (ready_i) begin - // increase partition beat cnt - part_beat_cnt_d = part_beat_cnt_q + 1; - req_d.num_bytes = req_q.num_bytes - PartitionDmaRegionWidth; - if (($unsigned(req_q.src) >= DmaRegionStart) && ($unsigned(req_q.src) < DmaRegionEnd)) begin - // L1 ------> L2 - req_d.src = req_q.src + DmaRegionWidth; // folded to second row - req_d.dst = req_q.dst + PartitionDmaRegionWidth; // addr in L2 increases as usual - end else begin - // L2 ------> L1 - req_d.src = req_q.src + PartitionDmaRegionWidth; - // req_d.dst = req_q.dst + 
DmaRegionWidth; - if (shift_row == allocated_size_i-1) begin - req_d.dst = req_q.dst + PartitionDmaRegionWidth - shift_row*DmaRegionWidth; - end else begin - req_d.dst = req_q.dst + DmaRegionWidth; - end - end - end - end - end - - default: /*do nothing*/; - endcase - - - end - - // pragma translate_off - int f; - always_ff @(posedge clk_i or negedge rst_ni) begin - automatic string str; - if (rst_ni && valid_i && ready_o) begin - str = "\n[Partition] Got request\n"; - str = $sformatf("%sPartition: Request in: From: 0x%8x To: 0x%8x with size %d. Beat count: %d\n", str, burst_req_i.src, burst_req_i.dst, burst_req_i.num_bytes, beat_cnt_i); - f = $fopen("dma.log", "a"); - $fwrite(f, str); - $fclose(f); - end - if (rst_ni && valid_o && ready_i) begin - str = $sformatf("Partition: From: 0x%8x To: 0x%8x with size %d. Partition beat count: %d. [part_id] %d\n", burst_req_o.src, burst_req_o.dst, burst_req_o.num_bytes, part_beat_cnt_q, part_id_q); - // str = $sformatf("Debug Rst: [rst_part_beat_cnt] %d [beat_cnt_q] %d [beat_cnt_i] %d \n",rst_part_beat_cnt, beat_cnt_q, beat_cnt_i); - f = $fopen("dma.log", "a"); - $fwrite(f, str); - // str = $sformatf("Debug: [start_addr] %8x [GF] %d \n",start_addr, group_factor_i); - // $fwrite(f, str); - $fclose(f); - end - end - // pragma translate_on - -endmodule From 543027bf5d2999e24c6adb5462ecf2e571c271d7 Mon Sep 17 00:00:00 2001 From: mbertuletti Date: Wed, 22 Oct 2025 18:46:08 +0200 Subject: [PATCH 20/34] [software] Correct format --- software/runtime/alloc.c | 169 +++++++++--------- software/runtime/alloc.h | 1 - software/runtime/dma.h | 3 +- software/runtime/runtime.h | 117 ++++++------ .../tests/baremetal/das_malloc_test/main.c | 50 ++++-- 5 files changed, 183 insertions(+), 157 deletions(-) diff --git a/software/runtime/alloc.c b/software/runtime/alloc.c index 877fdbd40..2b9924b66 100644 --- a/software/runtime/alloc.c +++ b/software/runtime/alloc.c @@ -34,9 +34,8 @@ alloc_t alloc_l1; // Allocators for L1 local sequential heap 
memory alloc_t alloc_tile[NUM_CORES / NUM_CORES_PER_TILE]; - // ---------------------------------------------------------------------------- -// Dynamic Heap Allocator +// Dynamic Heap Allocator // ---------------------------------------------------------------------------- alloc_t dynamic_heap_alloc; @@ -61,10 +60,10 @@ static inline canary_and_size_t canary_decode(const uint32_t value) { return (canary_and_size_t){.canary = value & 0xFF, .size = value >> 8}; } -typedef struct canary_chain_s{ - uint32_t canary_and_size; - uint32_t *data_address; - struct canary_chain_s *next_canary; +typedef struct canary_chain_s { + uint32_t canary_and_size; + uint32_t *data_address; + struct canary_chain_s *next_canary; } canary_chain_t; // init as a NULL, assign this pointer when the first canary is allocated @@ -78,14 +77,16 @@ canary_chain_t *first_canary = (canary_chain_t *)0x1000; void alloc_init(alloc_t *alloc, void *base, const uint32_t size) { // Create first block at base address aligned up uint32_t aligned_base = ALIGN_UP((uint32_t)base, MIN_BLOCK_SIZE); - // printf("base - %p - aligned_base %p\n", base, (alloc_block_t *)aligned_base); + // printf("base - %p - aligned_base %p\n", base, (alloc_block_t + // *)aligned_base); alloc_block_t *block_ptr = (alloc_block_t *)aligned_base; // Calculate block size aligned down uint32_t block_size = size - ((uint32_t)block_ptr - (uint32_t)base); block_size = ALIGN_DOWN(block_size, MIN_BLOCK_SIZE); - // printf("block_ptr: %p, block_ptr->size: %p, block_ptr->next: %p\n", block_ptr, &(block_ptr->size), &(block_ptr->next)); + // printf("block_ptr: %p, block_ptr->size: %p, block_ptr->next: %p\n", + // block_ptr, &(block_ptr->size), &(block_ptr->next)); // Setup allocator block_ptr->size = block_size; @@ -137,27 +138,27 @@ static void *allocate_memory(alloc_t *alloc, const uint32_t size) { } // ------ Function to calculate the aligned size ------ // -static uint32_t calc_aligned_size (uint32_t* addr, const uint32_t allocated_size) { 
+static uint32_t calc_aligned_size(uint32_t *addr, + const uint32_t allocated_size) { // interpret the addr uint32_t tmp = allocated_size; uint32_t log = 0; // log2 of 0 is undefined, handled as special case if needed while (tmp >>= 1) { // Shift right until value is 0 - ++log; + ++log; } - uint32_t mask = (uint32_t)(( 1 << log )-1); + uint32_t mask = (uint32_t)((1 << log) - 1); uint32_t row_id, tile_id, offset; - offset = ((uint32_t)addr) & 0x7F; - tile_id = ((uint32_t)addr >> 7 ) & 0x7F; - row_id = ((uint32_t)addr >> 14) & 0xFF; + offset = ((uint32_t)addr) & 0x7F; + tile_id = ((uint32_t)addr >> 7) & 0x7F; + row_id = ((uint32_t)addr >> 14) & 0xFF; row_id &= mask; - uint32_t shift_size=0; - if ( (offset==0) && (row_id==0) && (tile_id==0) ){ + uint32_t shift_size = 0; + if ((offset == 0) && (row_id == 0) && (tile_id == 0)) { shift_size = 0; - } - else{ - uint32_t aligned_boundary = 4096*4*allocated_size; - uint32_t modified_curr = (row_id<<14) | (tile_id<<7) | offset; + } else { + uint32_t aligned_boundary = 4096 * 4 * allocated_size; + uint32_t modified_curr = (row_id << 14) | (tile_id << 7) | offset; shift_size = aligned_boundary - modified_curr; } @@ -166,7 +167,8 @@ static uint32_t calc_aligned_size (uint32_t* addr, const uint32_t allocated_size // ------ Parameters ------ // // size: Size of the data block need to be allocated // allocated_size: How many rows the current partition scheme occupied -static void *allocate_memory_aligned(alloc_t *alloc, const uint32_t size, const uint32_t allocated_size) { +static void *allocate_memory_aligned(alloc_t *alloc, const uint32_t size, + const uint32_t allocated_size) { // Get first block of linked list of free blocks alloc_block_t *curr = alloc->first_block; alloc_block_t *prev = 0; @@ -174,21 +176,23 @@ static void *allocate_memory_aligned(alloc_t *alloc, const uint32_t size, const // Search first block large enough in linked list // 1. 
calculate the size aligned to the partition boundary uint32_t shift_size = 0; - shift_size = calc_aligned_size( (uint32_t*)curr, allocated_size); + shift_size = calc_aligned_size((uint32_t *)curr, allocated_size); uint32_t aligned_size = size + shift_size; // while (curr && (curr->size < size)) { while (curr && (curr->size < aligned_size)) { prev = curr; curr = curr->next; - shift_size = calc_aligned_size( (uint32_t*)curr, allocated_size); + shift_size = calc_aligned_size((uint32_t *)curr, allocated_size); aligned_size = size + shift_size; } - printf("Dynamic Allocator >> size [%d] --- shift size [%d] --- aligned size [%d] \n", size, shift_size, aligned_size); + printf("Dynamic Allocator >> size [%d] --- shift size [%d] --- aligned size " + "[%d] \n", + size, shift_size, aligned_size); if (curr) { // Update allocator - if (size == aligned_size){ + if (size == aligned_size) { // address is already aligned to the partition boundary printf("Dynamic Allocator >> No alignment needed\n"); if (curr->size == size) { @@ -209,13 +213,11 @@ static void *allocate_memory_aligned(alloc_t *alloc, const uint32_t size, const alloc->first_block = new_block; } } - } - else{ + } else { printf("Dynamic Allocator >> Alignment needed\n"); if (curr->size == aligned_size) { - // Special case: Whole block taken, first part of the block is still empty - // store the curr info in tmp - // uint32_t tmp_size = curr->size; + // Special case: Whole block taken, first part of the block is still + // empty store the curr info in tmp uint32_t tmp_size = curr->size; struct alloc_block_s *tmp_next = curr->next; alloc_block_t *shift_block = (alloc_block_t *)((char *)curr); shift_block->size = shift_size; @@ -225,10 +227,10 @@ static void *allocate_memory_aligned(alloc_t *alloc, const uint32_t size, const } else { alloc->first_block = shift_block; } - } - else{ + } else { // Regular case: Split off block - alloc_block_t *new_block = (alloc_block_t *)((char *)curr + aligned_size); + alloc_block_t 
*new_block = + (alloc_block_t *)((char *)curr + aligned_size); new_block->size = curr->size - aligned_size; new_block->next = curr->next; @@ -244,7 +246,7 @@ static void *allocate_memory_aligned(alloc_t *alloc, const uint32_t size, const } // Return block pointer - return (void *)((char *)curr+shift_size); + return (void *)((char *)curr + shift_size); } else { // There is no free block large enough return NULL; @@ -285,9 +287,11 @@ void *simple_malloc(const uint32_t size) { // ------ This function allocate data in Sequential Heap region ------ // // Canary system is stored in a seperate linked list // void *partition_malloc(alloc_t *alloc, const uint32_t size){ -void *partition_malloc(alloc_t *alloc, const uint32_t size){ +void *partition_malloc(alloc_t *alloc, const uint32_t size) { - uint32_t data_size = size > 2*NUM_BANKS*sizeof(uint32_t) ? size : 2*NUM_BANKS*sizeof(uint32_t); + uint32_t data_size = size > 2 * NUM_BANKS * sizeof(uint32_t) + ? size + : 2 * NUM_BANKS * sizeof(uint32_t); uint32_t allocated_size = data_size / (NUM_BANKS * sizeof(uint32_t)); uint32_t block_size = ALIGN_UP(data_size, MIN_BLOCK_SIZE); // add alignment @@ -297,16 +301,16 @@ void *partition_malloc(alloc_t *alloc, const uint32_t size){ return NULL; } - // allocate + // allocate void *block_ptr = NULL; - if (allocated_size<2){ + if (allocated_size < 2) { block_ptr = allocate_memory(alloc, block_size); - } - else{ + } else { block_ptr = allocate_memory_aligned(alloc, block_size, allocated_size); } // void *block_ptr = allocate_memory(alloc, block_size); - // void *block_ptr = allocate_memory_aligned(alloc, block_size, allocated_size); + // void *block_ptr = allocate_memory_aligned(alloc, block_size, + // allocated_size); if (!block_ptr) { printf("Memory allocator: No large enough block found (%d)\n", block_size); return NULL; @@ -314,7 +318,8 @@ void *partition_malloc(alloc_t *alloc, const uint32_t size){ // Allocate a region in L1 heap for canary // printf("p1\n"); - canary_chain_t *canary 
= (canary_chain_t *)simple_malloc(sizeof(canary_chain_t)); + canary_chain_t *canary = + (canary_chain_t *)simple_malloc(sizeof(canary_chain_t)); // printf("p2\n"); // Init the canary canary->data_address = (uint32_t *)block_ptr; @@ -325,56 +330,52 @@ void *partition_malloc(alloc_t *alloc, const uint32_t size){ // canary_chain_t *curr = first_canary->first_block; canary_chain_t *curr = first_canary; canary_chain_t *prev = 0; - // Fit the canary into the chain, depending on data_address // | prev | ------> | canary | ------> | curr | uint32_t *data_addr = 0; - if (curr != (canary_chain_t *)0x1000){ + if (curr != (canary_chain_t *)0x1000) { // only access struct when init data_addr = curr->data_address; } - while((curr!=(canary_chain_t *)0x1000) && (curr!=NULL) && ((uint32_t *)data_addr < (uint32_t *)block_ptr)){ + while ((curr != (canary_chain_t *)0x1000) && (curr != NULL) && + ((uint32_t *)data_addr < (uint32_t *)block_ptr)) { prev = curr; // data_addr = curr->data_address; curr = curr->next_canary; - if (curr!=NULL){ + if (curr != NULL) { data_addr = curr->data_address; } // data_addr = curr->data_address; } // printf("post: %p - %p \n", curr, prev); - if ((curr==(canary_chain_t *)0x1000) && !prev) { + if ((curr == (canary_chain_t *)0x1000) && !prev) { // special case: first canary block first_canary = canary; printf("| First | ------> [ New ]\n"); // printf("first_canary: %p\n", first_canary); - } - else{ - if (!curr){ + } else { + if (!curr) { // reach to the last of the chain // | prev | ------> | canary | ------> NULL - prev->next_canary = canary; + prev->next_canary = canary; canary->next_canary = NULL; printf("| Other | ------> [ New ] ------> NULL\n"); - } - else if (!prev){ + } else if (!prev) { // canary need to insert at the beginning of the chain // first_canary ------> | canary | ------> | curr | first_canary = canary; canary->next_canary = curr; printf("| First | ------> [ New ] ------> | Other |\n"); - } - else{ + } else { // normal case // | prev | 
------> | canary | ------> | curr | canary->next_canary = prev->next_canary; - prev->next_canary = canary; + prev->next_canary = canary; printf("| Other | ------> [ New ] ------> | Other |\n"); } - } // return the block pointer directly // printf("%p\n", block_ptr); @@ -442,11 +443,12 @@ void domain_free(alloc_t *alloc, void *const ptr) { void simple_free(void *const ptr) { domain_free(&alloc_l1, ptr); } -void partition_free(alloc_t *alloc, void *const ptr){ +void partition_free(alloc_t *alloc, void *const ptr) { // block pointer is the input pointer void *block_ptr = ptr; - canary_and_size_t canary_and_size = (canary_and_size_t){.canary = 0, .size = 0}; + canary_and_size_t canary_and_size = + (canary_and_size_t){.canary = 0, .size = 0}; // find the canary block in the chain canary_chain_t *curr = first_canary; canary_chain_t *prev = 0; @@ -454,45 +456,42 @@ void partition_free(alloc_t *alloc, void *const ptr){ // While loop suppose to stop when curr->data_address == block_ptr // | prev | ------> | curr | uint32_t *data_addr = 0; - if (curr){ + if (curr) { data_addr = curr->data_address; } - printf("data_addr - %p - block_ptr - %p - curr->data_address - %p \n", data_addr, block_ptr, curr->data_address); - while((curr!=(canary_chain_t *)0x1000) && (curr!=NULL) && (data_addr < (uint32_t *)block_ptr)){ + printf("data_addr - %p - block_ptr - %p - curr->data_address - %p \n", + data_addr, block_ptr, curr->data_address); + while ((curr != (canary_chain_t *)0x1000) && (curr != NULL) && + (data_addr < (uint32_t *)block_ptr)) { prev = curr; // data_addr = curr->data_address; curr = curr->next_canary; - if(curr!=NULL){ + if (curr != NULL) { data_addr = curr->data_address; } } - if ((curr==(canary_chain_t *)0x1000) && !prev){ + if ((curr == (canary_chain_t *)0x1000) && !prev) { // nothing in the chain printf("CANARY: Empty canary chain!\n"); - } - else if (!curr){ + } else if (!curr) { // reach to the end of the chain printf("CANARY: Chain depleted. 
No info found for %p\n", block_ptr); - } - else if (curr->data_address != block_ptr){ + } else if (curr->data_address != block_ptr) { // no information for the current free printf("CANARY: Unmatch! %p - %p\n", curr->data_address, block_ptr); - } - else if (!prev){ + } else if (!prev) { // normal case 1: curr is the first canary // first_canary ------> | curr | ------> next canary_and_size = canary_decode(curr->canary_and_size); - if (curr->next_canary == NULL){ + if (curr->next_canary == NULL) { first_canary = (canary_chain_t *)0x1000; - } - else{ + } else { first_canary = curr->next_canary; } simple_free((void *)curr); - } - else{ + } else { // normal case 2: relink the chain, free the curr canary // | prev | ------> | curr | ------> something canary_and_size = canary_decode(curr->canary_and_size); @@ -511,7 +510,6 @@ void partition_free(alloc_t *alloc, void *const ptr){ // Free memory free_memory(alloc, block_ptr, canary_and_size.size); - } // ---------------------------------------------------------------------------- @@ -539,17 +537,17 @@ void alloc_dump(alloc_t *alloc) { } } -void canary_dump(void){ +void canary_dump(void) { printf(" ------ Canary Chain Dump ------ \n"); canary_chain_t *curr = first_canary; - if (curr == (canary_chain_t *)0x1000){ + if (curr == (canary_chain_t *)0x1000) { // empty list printf("Empty Canary list.\n"); - } - else{ + } else { uint32_t cnt = 0; - while(curr!=NULL){ - printf("[%d] - [%p] - [%p] - [%p]\n", cnt, curr, curr->data_address, curr->next_canary); + while (curr != NULL) { + printf("[%d] - [%p] - [%p] - [%p]\n", cnt, curr, curr->data_address, + curr->next_canary); cnt += 1; curr = curr->next_canary; } @@ -557,14 +555,13 @@ void canary_dump(void){ printf(" ------ Canary Dump END ------ \n"); } - // ---------------------------------------------------------------------------- // Get Allocators // ---------------------------------------------------------------------------- -// Get the address of global variable `alloc_l1` +// 
Get the address of global variable `alloc_l1` alloc_t *get_alloc_l1() { return &alloc_l1; } alloc_t *get_alloc_tile(const uint32_t tile_id) { return &alloc_tile[tile_id]; } -// Dynamic Heap Allocator -alloc_t *get_dynamic_heap_alloc() {return &dynamic_heap_alloc;} +// Dynamic Heap Allocator +alloc_t *get_dynamic_heap_alloc() { return &dynamic_heap_alloc; } diff --git a/software/runtime/alloc.h b/software/runtime/alloc.h index 39a9577a0..0533e767a 100644 --- a/software/runtime/alloc.h +++ b/software/runtime/alloc.h @@ -27,7 +27,6 @@ typedef struct { alloc_block_t *first_block; } alloc_t; - // Initialization void alloc_init(alloc_t *alloc, void *base, const uint32_t size); diff --git a/software/runtime/dma.h b/software/runtime/dma.h index 7c33b2588..81d6ebc72 100644 --- a/software/runtime/dma.h +++ b/software/runtime/dma.h @@ -74,7 +74,8 @@ void dma_memcpy_blocking(void *dest, const void *src, size_t len) { dma_wait(); } -void dma_memcpy_ModeSel(void *dest, const void *src, size_t len, uint32_t mode_sel){ +void dma_memcpy_ModeSel(void *dest, const void *src, size_t len, + uint32_t mode_sel) { dma_mode_reg = mode_sel; dma_memcpy_nonblocking(dest, src, len); } diff --git a/software/runtime/runtime.h b/software/runtime/runtime.h index 103984bb5..133d6c072 100644 --- a/software/runtime/runtime.h +++ b/software/runtime/runtime.h @@ -95,7 +95,6 @@ static uint32_t volatile *allocated_size3_reg = (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + CONTROL_REGISTERS_ALLOCATED_SIZE_3_REG_OFFSET); - typedef uint32_t mempool_id_t; typedef uint32_t mempool_timer_t; @@ -144,7 +143,9 @@ static inline void mempool_init(const uint32_t core_id) { extern uint32_t __heap_start; extern uint32_t __heap_seq_start; // Heap Region - uint32_t heap_size = (uint32_t)&__heap_seq_start - (uint32_t)&__heap_start; // Downscale interleaved heap size + uint32_t heap_size = + (uint32_t)&__heap_seq_start - + (uint32_t)&__heap_start; // Downscale interleaved heap size alloc_init(get_alloc_l1(), 
&__heap_start, heap_size); // Initialize L1 Sequential Heap Allocator per Tile @@ -167,41 +168,47 @@ static inline void mempool_init(const uint32_t core_id) { } } -// Reconfigure Interleaved Heap region, with explicit 'Dynamic Heap' start address -// Programmer API for flexible Dynamic Heap region configuration -static inline void mempool_reset_heap(const uint32_t core_id, uint32_t heap_seq_start) { +// Reconfigure Interleaved Heap region, with explicit 'Dynamic Heap' start +// address Programmer API for flexible Dynamic Heap region configuration +static inline void mempool_reset_heap(const uint32_t core_id, + uint32_t heap_seq_start) { if (core_id == 0) { // Initialize L1 Interleaved Heap Allocator extern uint32_t __heap_start; - uint32_t heap_size = (uint32_t)heap_seq_start - (uint32_t)&__heap_start; // Downscale interleaved heap size + uint32_t heap_size = + (uint32_t)heap_seq_start - + (uint32_t)&__heap_start; // Downscale interleaved heap size alloc_init(get_alloc_l1(), &__heap_start, heap_size); } } // Initialize Dynamic Heap Allocator, as default specified in the linker script -static inline void mempool_dynamic_heap_alloc_init(const uint32_t core_id){ - if (core_id == 0){ +static inline void mempool_dynamic_heap_alloc_init(const uint32_t core_id) { + if (core_id == 0) { extern uint32_t __heap_seq_start; // Dynamic allocator base and size uint32_t seq_heap_base = (uint32_t)&__heap_seq_start; uint32_t seq_heap_size = NUM_CORES * DAS_MEM_SIZE; // Dynamically allocate the space for allocators alloc_t *dynamic_heap_allocator = get_dynamic_heap_alloc(); - alloc_init(dynamic_heap_allocator, (uint32_t *)seq_heap_base, seq_heap_size); + alloc_init(dynamic_heap_allocator, (uint32_t *)seq_heap_base, + seq_heap_size); } } // Reset Dynamic Heap region with explicit start address specification // A UNIFIED allocator will be used -static inline void mempool_dynamic_heap_alloc_reset(const uint32_t core_id, uint32_t heap_seq_start){ - if (core_id == 0){ +static inline void 
mempool_dynamic_heap_alloc_reset(const uint32_t core_id, + uint32_t heap_seq_start) { + if (core_id == 0) { extern uint32_t __heap_end; // Dynamic allocator base and size uint32_t seq_heap_base = heap_seq_start; uint32_t seq_heap_size = (uint32_t)&__heap_end - heap_seq_start; // Reset the space for allocators alloc_t *dynamic_heap_allocator = get_dynamic_heap_alloc(); - alloc_init(dynamic_heap_allocator, (uint32_t *)seq_heap_base, seq_heap_size); + alloc_init(dynamic_heap_allocator, (uint32_t *)seq_heap_base, + seq_heap_size); } } @@ -284,54 +291,58 @@ static inline void set_wake_up_offset(uint32_t offset) { } // Partition Configuration -static inline void partition_config (uint32_t reg_sel, uint32_t tiles_per_partition){ +static inline void partition_config(uint32_t reg_sel, + uint32_t tiles_per_partition) { asm volatile("" ::: "memory"); - switch (reg_sel){ - case 0: - *partition0_reg = tiles_per_partition; - break; - case 1: - *partition1_reg = tiles_per_partition; - break; - case 2: - *partition2_reg = tiles_per_partition; - break; - case 3: - *partition3_reg = tiles_per_partition; - break; - default: - *partition0_reg = tiles_per_partition; - break; + switch (reg_sel) { + case 0: + *partition0_reg = tiles_per_partition; + break; + case 1: + *partition1_reg = tiles_per_partition; + break; + case 2: + *partition2_reg = tiles_per_partition; + break; + case 3: + *partition3_reg = tiles_per_partition; + break; + default: + *partition0_reg = tiles_per_partition; + break; } asm volatile("" ::: "memory"); -} +} // reg_sel = {3, 2, 1, 0} -static inline void start_addr_scheme_config (uint32_t reg_sel, uint32_t addr, uint32_t size){ +static inline void start_addr_scheme_config(uint32_t reg_sel, uint32_t addr, + uint32_t size) { asm volatile("" ::: "memory"); - uint32_t data_size = size > 2*NUM_BANKS*sizeof(uint32_t) ? size : 2*NUM_BANKS*sizeof(uint32_t); + uint32_t data_size = size > 2 * NUM_BANKS * sizeof(uint32_t) + ? 
size + : 2 * NUM_BANKS * sizeof(uint32_t); uint32_t allocated_size = data_size / (NUM_BANKS * sizeof(uint32_t)); - switch (reg_sel){ - case 0: - *start_addr_scheme0_reg = addr; - *allocated_size0_reg = allocated_size; - break; - case 1: - *start_addr_scheme1_reg = addr; - *allocated_size1_reg = allocated_size; - break; - case 2: - *start_addr_scheme2_reg = addr; - *allocated_size2_reg = allocated_size; - break; - case 3: - *start_addr_scheme3_reg = addr; - *allocated_size3_reg = allocated_size; - break; - default: - *start_addr_scheme0_reg = addr; - *allocated_size0_reg = allocated_size; - break; + switch (reg_sel) { + case 0: + *start_addr_scheme0_reg = addr; + *allocated_size0_reg = allocated_size; + break; + case 1: + *start_addr_scheme1_reg = addr; + *allocated_size1_reg = allocated_size; + break; + case 2: + *start_addr_scheme2_reg = addr; + *allocated_size2_reg = allocated_size; + break; + case 3: + *start_addr_scheme3_reg = addr; + *allocated_size3_reg = allocated_size; + break; + default: + *start_addr_scheme0_reg = addr; + *allocated_size0_reg = allocated_size; + break; } asm volatile("" ::: "memory"); } diff --git a/software/tests/baremetal/das_malloc_test/main.c b/software/tests/baremetal/das_malloc_test/main.c index bee4a9db8..c55708233 100644 --- a/software/tests/baremetal/das_malloc_test/main.c +++ b/software/tests/baremetal/das_malloc_test/main.c @@ -32,7 +32,8 @@ int main() { printf("Verify DAS partitions\n\n"); uint32_t num_tiles_per_partition = 4; - uint32_t array_size = 2 * num_tiles_per_partition * BANKING_FACTOR * NUM_CORES_PER_TILE; + uint32_t array_size = + 2 * num_tiles_per_partition * BANKING_FACTOR * NUM_CORES_PER_TILE; // 1. Init dynamic heap allocator mempool_dynamic_heap_alloc_init(core_id); @@ -43,10 +44,12 @@ int main() { alloc_t *dynamic_heap_alloc = get_dynamic_heap_alloc(); alloc_dump(dynamic_heap_alloc); // 4. 
Allocate memory - uint32_t *array = (uint32_t *)partition_malloc(dynamic_heap_alloc, array_size*sizeof(uint32_t)); + uint32_t *array = (uint32_t *)partition_malloc( + dynamic_heap_alloc, array_size * sizeof(uint32_t)); // 5. Config the hardware registers partition_config(part_id, num_tiles_per_partition); - start_addr_scheme_config(part_id, (uint32_t)(*array), array_size*sizeof(uint32_t)); + start_addr_scheme_config(part_id, (uint32_t)(*array), + array_size * sizeof(uint32_t)); // 6. Move data for (uint32_t i = 0; i < array_size; i++) { array[i] = i; @@ -55,11 +58,16 @@ int main() { partition_config(part_id, NUM_TILES); // 8. check for (uint32_t i = 0; i < array_size; i++) { - uint32_t *fetch_address = &array[0] + \ - (i % (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) + \ - (i / (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) * NUM_BANKS; + uint32_t *fetch_address = + &array[0] + + (i % + (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) + + (i / + (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) * + NUM_BANKS; if (i != *fetch_address) { - printf("%4d != %4d at address %8X.\n", i, *fetch_address, fetch_address); + printf("%4d != %4d at address %8X.\n", i, *fetch_address, + fetch_address); return 1; } } @@ -75,16 +83,20 @@ int main() { // 2. Set which partition write to. uint32_t part_id = 0; - for (num_tiles_per_partition = 1; num_tiles_per_partition < NUM_TILES; num_tiles_per_partition *= 2) { - array_size = 2 * num_tiles_per_partition * BANKING_FACTOR * NUM_CORES_PER_TILE; + for (num_tiles_per_partition = 1; num_tiles_per_partition < NUM_TILES; + num_tiles_per_partition *= 2) { + array_size = + 2 * num_tiles_per_partition * BANKING_FACTOR * NUM_CORES_PER_TILE; // 3. Get the allocator alloc_t *dynamic_heap_alloc = get_dynamic_heap_alloc(); alloc_dump(dynamic_heap_alloc); // 4. 
Allocate memory - uint32_t *array = (uint32_t *)partition_malloc(dynamic_heap_alloc, array_size*sizeof(uint32_t)); + uint32_t *array = (uint32_t *)partition_malloc( + dynamic_heap_alloc, array_size * sizeof(uint32_t)); // 5. Config the hardware registers partition_config(part_id, num_tiles_per_partition); - start_addr_scheme_config(part_id, (uint32_t)(*array), array_size*sizeof(uint32_t)); + start_addr_scheme_config(part_id, (uint32_t)(*array), + array_size * sizeof(uint32_t)); // 6. Move data for (uint32_t i = 0; i < array_size; i++) { array[i] = i; @@ -93,17 +105,23 @@ int main() { partition_config(part_id, NUM_TILES); // 8. check for (uint32_t i = 0; i < array_size; i++) { - uint32_t *fetch_address = &array[0] + \ - (i % (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) + \ - (i / (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) * NUM_BANKS; + uint32_t *fetch_address = + &array[0] + + (i % + (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) + + (i / + (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) * + NUM_BANKS; if (i != *fetch_address) { - printf("%4d != %4d at address %8X.\n", i, *fetch_address, fetch_address); + printf("%4d != %4d at address %8X.\n", i, *fetch_address, + fetch_address); return 1; } } // 9. 
Free array partition_free(dynamic_heap_alloc, array); - printf("SUCCESS for groups of %d tiles over the partition \n\n", num_tiles_per_partition); + printf("SUCCESS for groups of %d tiles over the partition \n\n", + num_tiles_per_partition); } printf("All correct!\n"); From 473bd0e95a24bba678bf76b0fb1606ebee4d6bfd Mon Sep 17 00:00:00 2001 From: mbertuletti Date: Wed, 22 Oct 2025 18:48:32 +0200 Subject: [PATCH 21/34] [software] Remove DMA with mode selection --- software/runtime/dma.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/software/runtime/dma.h b/software/runtime/dma.h index 81d6ebc72..cab318d28 100644 --- a/software/runtime/dma.h +++ b/software/runtime/dma.h @@ -74,10 +74,4 @@ void dma_memcpy_blocking(void *dest, const void *src, size_t len) { dma_wait(); } -void dma_memcpy_ModeSel(void *dest, const void *src, size_t len, - uint32_t mode_sel) { - dma_mode_reg = mode_sel; - dma_memcpy_nonblocking(dest, src, len); -} - #endif // _DMA_H_ From 737d106da5bbbd555b83bdd812b07d99075b937b Mon Sep 17 00:00:00 2001 From: mbertuletti Date: Thu, 23 Oct 2025 10:19:34 +0200 Subject: [PATCH 22/34] [hardware] Correct scrambler parametrization --- hardware/src/address_scrambler.sv | 18 ++++++++++-------- hardware/src/mempool_tile.sv | 6 +++--- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/hardware/src/address_scrambler.sv b/hardware/src/address_scrambler.sv index 028be3360..960a2665a 100644 --- a/hardware/src/address_scrambler.sv +++ b/hardware/src/address_scrambler.sv @@ -6,18 +6,20 @@ // sequentially and part is interleaved. 
// Current constraints: +// Author: Samuel Riedel // Author: Marco Bertuletti module address_scrambler #( parameter int unsigned AddrWidth = 32, parameter int unsigned DataWidth = 32, parameter int unsigned ByteOffset = 2, + parameter bit Bypass = 0, parameter int unsigned NumTiles = 2, parameter int unsigned NumBanksPerTile = 2, - parameter bit Bypass = 0, - parameter int unsigned SeqMemSizePerTile = 4*1024, parameter int unsigned TCDMSizePerBank = 1024, + parameter int unsigned SeqMemSizePerTile = 4096, parameter int unsigned NumDASPartitions = 4, + // Dependant parameters, do not change parameter int unsigned MemSizePerTile = NumBanksPerTile*TCDMSizePerBank, parameter int unsigned MemSizePerRow = (1 << ByteOffset)*NumBanksPerTile*NumTiles ) ( @@ -52,8 +54,8 @@ module address_scrambler #( // `tile_index` : how many bits to shift for TileID bits in each partition // `row_index`: how many bits need to swap within Row Index - logic [NumDASPartitions-1:0][$clog2(NumTiles):0] tile_index; - logic [NumDASPartitions-1:0][$clog2(NumTiles):0] row_index; + logic [NumDASPartitions-1:0][$clog2($clog2(NumTiles)+1)-1:0] tile_index; + logic [NumDASPartitions-1:0][$clog2($clog2(NumTiles)+1)-1:0] row_index; for (genvar i = 0; i < NumDASPartitions; i++) begin : gen_shift_index lzc #( @@ -65,12 +67,12 @@ module address_scrambler #( .empty_o (/* Unused */ ) ); lzc #( - .WIDTH ($clog2(NumTiles)), + .WIDTH ($clog2(NumTiles)+1), .MODE (1'b0 ) ) i_log_row_index ( - .in_i (allocated_size_i[i]), - .cnt_o (row_index[i] ), - .empty_o (/* Unused */ ) + .in_i (allocated_size_i[i][$clog2(NumTiles):0]), + .cnt_o (row_index[i] ), + .empty_o (/* Unused */ ) ); end diff --git a/hardware/src/mempool_tile.sv b/hardware/src/mempool_tile.sv index 241848f62..8b18b784b 100644 --- a/hardware/src/mempool_tile.sv +++ b/hardware/src/mempool_tile.sv @@ -897,12 +897,12 @@ module mempool_tile address_scrambler #( .AddrWidth (AddrWidth ), .ByteOffset (ByteOffset ), + .Bypass (0 ), .NumTiles (NumTiles ), 
.NumBanksPerTile (NumBanksPerTile ), - .Bypass (0 ), + .TCDMSizePerBank (TCDMSizePerBank ), .SeqMemSizePerTile (SeqMemSizePerTile), - .NumDASPartitions (NumDASPartitions ), - .TCDMSizePerBank (TCDMSizePerBank ) + .NumDASPartitions (NumDASPartitions ) ) i_address_scrambler ( .address_i (snitch_data_qaddr[c]), .group_factor_i (partition_sel_i ), From ee0ff5403d233f556ce0815acdc79f296c02a3e5 Mon Sep 17 00:00:00 2001 From: mbertuletti Date: Thu, 23 Oct 2025 10:39:45 +0200 Subject: [PATCH 23/34] [hardware] Parametrize DMA for DAS [software] Add DMA with DAS test --- Bender.yml | 1 - config/config.mk | 2 +- hardware/deps/idma/Bender.yml | 2 - .../src/midends/idma_address_scrambler.sv | 197 +++------- .../src/midends/idma_distributed_midend.sv | 95 ++++- .../src/midends/idma_distributed_midend_v2.sv | 245 ------------ .../idma/src/midends/idma_split_midend.sv | 237 +++++++++++- .../idma/src/midends/idma_split_midend_v2.sv | 351 ------------------ hardware/src/mempool_cluster.sv | 183 ++++----- hardware/src/mempool_group.sv | 79 ++-- hardware/src/mempool_pkg.sv | 6 +- hardware/src/mempool_sub_group.sv | 23 +- hardware/src/mempool_system.sv | 15 +- hardware/src/mempool_tile.sv | 14 +- software/tests/baremetal/das_dma/main.c | 92 +++++ 15 files changed, 635 insertions(+), 907 deletions(-) delete mode 100644 hardware/deps/idma/src/midends/idma_distributed_midend_v2.sv delete mode 100644 hardware/deps/idma/src/midends/idma_split_midend_v2.sv create mode 100644 software/tests/baremetal/das_dma/main.c diff --git a/Bender.yml b/Bender.yml index 959621e9a..8d6913a8e 100644 --- a/Bender.yml +++ b/Bender.yml @@ -44,7 +44,6 @@ sources: # Level 3 - hardware/src/mempool_group.sv # Level 4 - - hardware/src/idma_partition_midend.sv - hardware/src/mempool_cluster.sv # Level 5 - hardware/src/ctrl_registers.sv diff --git a/config/config.mk b/config/config.mk index 3a4fd33f8..9bef70d80 100644 --- a/config/config.mk +++ b/config/config.mk @@ -74,7 +74,7 @@ zquarterinx ?= 0 xDivSqrt ?= 0 # 
Enable configurable addressing scheme in the heap -das ?= 1 +das ?= 0 num_das_partitions ?= 4 # Size of DAS-heap per core das_mem_size ?= 2048 diff --git a/hardware/deps/idma/Bender.yml b/hardware/deps/idma/Bender.yml index 406e4969f..0ad4a786d 100644 --- a/hardware/deps/idma/Bender.yml +++ b/hardware/deps/idma/Bender.yml @@ -25,8 +25,6 @@ sources: - src/midends/idma_split_midend.sv - src/midends/idma_distributed_midend.sv # If enabled DAS - - src/midends/idma_split_midend_v2.sv - - src/midends/idma_distributed_midend_v2.sv - src/frontends/mempool/mempool_dma_frontend_reg_pkg.sv - src/frontends/mempool/mempool_dma_frontend_reg_top.sv - src/frontends/mempool/mempool_dma.sv diff --git a/hardware/deps/idma/src/midends/idma_address_scrambler.sv b/hardware/deps/idma/src/midends/idma_address_scrambler.sv index 58096d464..345ffdba8 100644 --- a/hardware/deps/idma/src/midends/idma_address_scrambler.sv +++ b/hardware/deps/idma/src/midends/idma_address_scrambler.sv @@ -7,168 +7,92 @@ // Current constraints: // Author: Bowen Wang +// Author: Marco Bertuletti module idma_address_scrambler #( parameter int unsigned AddrWidth = 32, parameter int unsigned DataWidth = 32, parameter int unsigned ByteOffset = 2, + parameter bit Bypass = 0, parameter int unsigned NumTiles = 128, parameter int unsigned NumBanksPerTile = 32, - parameter bit Bypass = 0, - parameter int unsigned SeqMemSizePerTile = 4*1024, - parameter int unsigned HeapSeqMemSizePerTile = 8*2048, - parameter int unsigned MemSizePerTile = 8*4*1024, - parameter int unsigned MemSizePerRow = 4*4*1024, // 4bytes * 4096 banks - parameter int unsigned TCDMSize = 1024*1024 + parameter int unsigned TCDMSizePerBank = 1024, + parameter int unsigned NumDASPartitions = 4, + parameter int unsigned DASStartAddr = 1024, + parameter int unsigned MemSizePerTile = NumBanksPerTile*TCDMSizePerBank, + parameter int unsigned MemSizePerRow = (1 << ByteOffset)*NumBanksPerTile*NumTiles ) ( - input logic [AddrWidth-1:0] address_i, - input logic 
[31:0] num_bytes_i, - input logic [3:0][7:0] group_factor_i, - // For each allocation, the maximum number of rows assigned can be 128 rows - input logic [3:0][7:0] allocated_size_i, - input logic [3:0][DataWidth-1:0] start_addr_scheme_i, - output logic [7:0] group_factor_o, - output logic [7:0] allocated_size_o, - output logic [AddrWidth-1:0] address_o + input logic [AddrWidth-1:0] address_i, + input logic [31:0] num_bytes_i, + input logic [NumDASPartitions-1:0][$clog2(NumTiles):0] group_factor_i, + input logic [NumDASPartitions-1:0][$clog2(NumTiles):0] allocated_size_i, + input logic [NumDASPartitions-1:0][DataWidth-1:0] start_addr_scheme_i, + output logic [$clog2(NumTiles):0] group_factor_o, + output logic [$clog2(NumTiles):0] allocated_size_o, + output logic [AddrWidth-1:0] address_o ); // Basic Settings localparam int unsigned BankOffsetBits = $clog2(NumBanksPerTile); localparam int unsigned TileIdBits = $clog2(NumTiles); localparam int unsigned ConstantBitsLSB = ByteOffset + BankOffsetBits; - // Heap Sequential Settings - localparam int unsigned HeapSeqPerTileBits = $clog2(MemSizePerTile); // log2(8*4096) = 15 | RowIndexBits + ConstBits - localparam int unsigned HeapSeqTotalBits = HeapSeqPerTileBits+TileIdBits; // 15+7=22 | used for address_o assignment - localparam int unsigned RowIndexBits = HeapSeqPerTileBits-ConstantBitsLSB; // 15-7=8 | RowIndex - if (Bypass || NumTiles < 2) begin assign address_o = address_i; end else begin - // ------ Heap Sequential Signals ------ // - // `shift_index` : how many bits to shift for TileID bits in each partition - // `shift_index_sc`: how many bits need to swap within Row Index - logic [3:0][2:0] shift_index; - logic [3:0][2:0] shift_index_sc; - for (genvar i = 0; i < 4; i++) begin : gen_shift_index - always_comb begin - case(group_factor_i[i]) - 128: shift_index[i] = 7; - 64: shift_index[i] = 6; - 32: shift_index[i] = 5; - 16: shift_index[i] = 4; - 8: shift_index[i] = 3; - 4: shift_index[i] = 2; - 2: shift_index[i] = 1; 
- default: shift_index[i] = 0; - endcase - - case(allocated_size_i[i]) - 128: shift_index_sc[i] = 7; - 64: shift_index_sc[i] = 6; - 32: shift_index_sc[i] = 5; - 16: shift_index_sc[i] = 4; - 8: shift_index_sc[i] = 3; - 4: shift_index_sc[i] = 2; - 2: shift_index_sc[i] = 1; - default: shift_index_sc[i] = 0; - endcase - end - end - - // post-scramble row index - logic [RowIndexBits-1:0] post_scramble_row_index; - logic [TileIdBits-1:0] post_scramble_tile_id; - - logic [3:0][RowIndexBits-1:0] mask_row_index, mask_row_index_n; - logic [3:0][TileIdBits-1:0] mask_tile_id, mask_tile_id_n; - logic [TileIdBits-1:0] heap_tile_id; + // ------ Heap Sequential Signals ------ // - for (genvar j = 0; j < 4; j++) begin : gen_mask - assign mask_row_index[j] = (shift_index_sc[j] == 0) ? {RowIndexBits{1'b0}} : ({RowIndexBits{1'b1}} >> (RowIndexBits-shift_index_sc[j])); - assign mask_tile_id[j] = (shift_index[j] == 0) ? {TileIdBits{1'b0}} : ({TileIdBits{1'b1}} >> (TileIdBits -shift_index[j])); - - assign mask_row_index_n[j] = ~mask_row_index[j]; - assign mask_tile_id_n[j] = ~mask_tile_id[j]; + // `tile_index` : how many bits to shift for TileID bits in each partition + // `row_index`: how many bits need to swap within Row Index + logic [NumDASPartitions-1:0][$clog2($clog2(NumTiles)+1)-1:0] tile_index; + logic [NumDASPartitions-1:0][$clog2($clog2(NumTiles)+1)-1:0] row_index; + + for (genvar i = 0; i < NumDASPartitions; i++) begin : gen_shift_index + lzc #( + .WIDTH ($clog2(NumTiles)+1), + .MODE (1'b0 ) + ) i_log_tile_index ( + .in_i (group_factor_i[i]), + .cnt_o (tile_index[i] ), + .empty_o (/* Unused */ ) + ); + lzc #( + .WIDTH ($clog2(NumTiles)+1), + .MODE (1'b0 ) + ) i_log_row_index ( + .in_i (allocated_size_i[i][$clog2(NumTiles):0]), + .cnt_o (row_index[i] ), + .empty_o (/* Unused */ ) + ); end - assign heap_tile_id = address_i[(TileIdBits+ConstantBitsLSB-1):ConstantBitsLSB]; - always_comb begin + // Default: Unscrambled address_o = address_i; group_factor_o = '0; allocated_size_o = 
'0; - post_scramble_row_index = 'b0; - post_scramble_tile_id = 'b0; - // Need one more logic for interleaved heap region - // group_factor_o = {7{1'b1}}; - // Sequential Heap Region + // TODO (bowwang): add a new register to indicate the start addr of sequential heap region, currently hard coded - // if (address_i < start_addr_scheme_i[0]) begin - if (address_i < 32'h00120000) begin - group_factor_o = 128; // fully interleaved - allocated_size_o = num_bytes_i/(4*4096); - end else if ( (address_i >= start_addr_scheme_i[0]) && (address_i < start_addr_scheme_i[0]+MemSizePerRow*allocated_size_i[0]) ) begin - - post_scramble_row_index = 'b0; - post_scramble_tile_id = 'b0; - // 1. `post_scramble_row_index` generation - post_scramble_row_index |= (address_i >> (ConstantBitsLSB + shift_index[0])) & mask_row_index[0]; - post_scramble_row_index |= (address_i >> (ConstantBitsLSB + TileIdBits )) & mask_row_index_n[0]; - - // 2. `post_scramble_tile_id` generation - post_scramble_tile_id |= heap_tile_id & mask_tile_id[0]; - post_scramble_tile_id |= (address_i >> (ConstantBitsLSB + shift_index_sc[0])) & mask_tile_id_n[0]; - - address_o[HeapSeqTotalBits-1:ConstantBitsLSB] = {post_scramble_row_index, post_scramble_tile_id}; - group_factor_o = group_factor_i[0]; - allocated_size_o = allocated_size_i[0]; - end else if ( (address_i >= start_addr_scheme_i[1]) && (address_i < start_addr_scheme_i[1]+MemSizePerRow*allocated_size_i[1]) ) begin - - post_scramble_row_index = 'b0; - post_scramble_tile_id = 'b0; - // 1. `post_scramble_row_index` generation - post_scramble_row_index |= (address_i >> (ConstantBitsLSB + shift_index[1])) & mask_row_index[1]; - post_scramble_row_index |= (address_i >> (ConstantBitsLSB + TileIdBits )) & mask_row_index_n[1]; - - // 2. 
`post_scramble_tile_id` generation - post_scramble_tile_id |= heap_tile_id & mask_tile_id[1]; - post_scramble_tile_id |= (address_i >> (ConstantBitsLSB + shift_index_sc[1])) & mask_tile_id_n[1]; - - address_o[HeapSeqTotalBits-1:ConstantBitsLSB] = {post_scramble_row_index, post_scramble_tile_id}; - group_factor_o = group_factor_i[1]; - allocated_size_o = allocated_size_i[1]; - end else if ( (address_i >= start_addr_scheme_i[2]) && (address_i < start_addr_scheme_i[2]+MemSizePerRow*allocated_size_i[2]) ) begin - - post_scramble_row_index = 'b0; - post_scramble_tile_id = 'b0; - // 1. `post_scramble_row_index` generation - post_scramble_row_index |= (address_i >> (ConstantBitsLSB + shift_index[2])) & mask_row_index[2]; - post_scramble_row_index |= (address_i >> (ConstantBitsLSB + TileIdBits )) & mask_row_index_n[2]; - - // 2. `post_scramble_tile_id` generation - post_scramble_tile_id |= heap_tile_id & mask_tile_id[2]; - post_scramble_tile_id |= (address_i >> (ConstantBitsLSB + shift_index_sc[2])) & mask_tile_id_n[2]; - - address_o[HeapSeqTotalBits-1:ConstantBitsLSB] = {post_scramble_row_index, post_scramble_tile_id}; - group_factor_o = group_factor_i[2]; - allocated_size_o = allocated_size_i[2]; - end else if ( (address_i >= start_addr_scheme_i[3]) && (address_i < start_addr_scheme_i[3]+MemSizePerRow*allocated_size_i[3]) ) begin - - post_scramble_row_index = 'b0; - post_scramble_tile_id = 'b0; - // 1. `post_scramble_row_index` generation - post_scramble_row_index |= (address_i >> (ConstantBitsLSB + shift_index[3])) & mask_row_index[3]; - post_scramble_row_index |= (address_i >> (ConstantBitsLSB + TileIdBits )) & mask_row_index_n[3]; - - // 2. 
`post_scramble_tile_id` generation - post_scramble_tile_id |= heap_tile_id & mask_tile_id[3]; - post_scramble_tile_id |= (address_i >> (ConstantBitsLSB + shift_index_sc[3])) & mask_tile_id_n[3]; - - address_o[HeapSeqTotalBits-1:ConstantBitsLSB] = {post_scramble_row_index, post_scramble_tile_id}; - group_factor_o = group_factor_i[3]; - allocated_size_o = allocated_size_i[3]; + if (address_i < DASStartAddr) begin + group_factor_o = NumTiles; // fully interleaved + allocated_size_o = num_bytes_i / MemSizePerRow; + + // DAS address scrambling + end else begin + + for (int p = 0; p < NumDASPartitions; p++) begin + if ( (address_i >= start_addr_scheme_i[p]) && (address_i < start_addr_scheme_i[p]+MemSizePerRow*allocated_size_i[p]) ) begin + address_o = '0; + address_o |= address_i & ((1 << (tile_index[p]+ConstantBitsLSB)) - 1); + address_o |= ((address_i >> (row_index[p]+tile_index[p]+ConstantBitsLSB)) << (tile_index[p]+ConstantBitsLSB)) & ((1 << (TileIdBits+ConstantBitsLSB)) - 1); + address_o |= ((address_i >> (tile_index[p]+ConstantBitsLSB)) << (TileIdBits + ConstantBitsLSB)) & ((1 << (row_index[p]+TileIdBits+ConstantBitsLSB)) - 1); + address_o |= address_i & ~((1 << (row_index[p]+TileIdBits+ConstantBitsLSB)) - 1); + group_factor_o = group_factor_i[p]; + allocated_size_o = allocated_size_i[p]; + end + end + end end @@ -177,6 +101,5 @@ module idma_address_scrambler #( // Check for unsupported configurations if (NumBanksPerTile < 2) $fatal(1, "NumBanksPerTile must be greater than 2. 
The special case '1' is currently not supported!"); - if (HeapSeqMemSizePerTile % (2**ByteOffset*NumBanksPerTile) != 0) - $fatal(1, "HeapSeqMemSizePerTile must be a multiple of BankWidth*NumBanksPerTile!"); + endmodule : idma_address_scrambler diff --git a/hardware/deps/idma/src/midends/idma_distributed_midend.sv b/hardware/deps/idma/src/midends/idma_distributed_midend.sv index e1cd96e10..1744f60b5 100644 --- a/hardware/deps/idma/src/midends/idma_distributed_midend.sv +++ b/hardware/deps/idma/src/midends/idma_distributed_midend.sv @@ -3,6 +3,8 @@ // SPDX-License-Identifier: SHL-0.51 // Samuel Riedel +// Bowen Wang +// Marco Bertuletti `include "common_cells/registers.svh" @@ -17,23 +19,31 @@ module idma_distributed_midend #( parameter int unsigned DmaRegionEnd = 32'h1000_0000, /// Number of generic 1D requests that can be buffered parameter int unsigned TransFifoDepth = 1, +`ifdef DAS + parameter int unsigned NumTiles = 64, + parameter int unsigned NumDASPartitions = 4, +`endif /// Arbitrary 1D burst request definition parameter type burst_req_t = logic, /// Meta data response definition parameter type meta_t = logic ) ( - input logic clk_i, - input logic rst_ni, + input logic clk_i, + input logic rst_ni, +`ifdef DAS + // DAS signals + input logic [$clog2(NumTiles):0] allocated_size_i, +`endif // Slave - input burst_req_t burst_req_i, - input logic valid_i, - output logic ready_o, - output meta_t meta_o, + input burst_req_t burst_req_i, + input logic valid_i, + output logic ready_o, + output meta_t meta_o, // Master - output burst_req_t [NoMstPorts-1:0] burst_req_o, - output logic [NoMstPorts-1:0] valid_o, - input logic [NoMstPorts-1:0] ready_i, - input meta_t [NoMstPorts-1:0] meta_i + output burst_req_t [NoMstPorts-1:0] burst_req_o, + output logic [NoMstPorts-1:0] valid_o, + input logic [NoMstPorts-1:0] ready_i, + input meta_t [NoMstPorts-1:0] meta_i ); localparam DmaRegionAddressBits = $clog2(DmaRegionWidth); @@ -57,6 +67,7 @@ module idma_distributed_midend #( // 
Collect the `trans_complete` signals and reduce them once we have all of them logic empty; logic data; + logic push; fifo_v3 #( .FALL_THROUGH (0 ), .DATA_WIDTH (1 ), @@ -70,12 +81,44 @@ module idma_distributed_midend #( .empty_o (empty ), .usage_o (/*unused*/ ), .data_i (1'b1 ), - .push_i (trans_complete_d[i] ), + .push_i (push ), .data_o (data ), .pop_i (meta_o.trans_complete) ); assign trans_complete_d[i] = meta_i[i].trans_complete | tie_off_trans_complete_q[i]; assign trans_complete_q[i] = data && !empty; + +`ifdef DAS + // Handle two complete signals arrive at the same time + logic [NumDASPartitions-1:0] conflict_counter_d, conflict_counter_q; + `FF(conflict_counter_q, conflict_counter_d, '0, clk_i, rst_ni) + always_comb begin + push = trans_complete_d[i] && !fifo_full[i]; + conflict_counter_d = conflict_counter_q; + // FIFO is not full + if (meta_i[i].trans_complete && tie_off_trans_complete_q[i] && !fifo_full[i]) begin + conflict_counter_d = conflict_counter_q+1; + end + // FIFO is full + if (meta_i[i].trans_complete && tie_off_trans_complete_q[i] && fifo_full[i]) begin + conflict_counter_d = conflict_counter_q+2; + end + if (!meta_i[i].trans_complete && tie_off_trans_complete_q[i] && fifo_full[i]) begin + conflict_counter_d = conflict_counter_q+1; + end + if (meta_i[i].trans_complete && !tie_off_trans_complete_q[i] && fifo_full[i]) begin + conflict_counter_d = conflict_counter_q+1; + end + // FIFO is not full, safe to push + if (|conflict_counter_q && !trans_complete_d[i] && !fifo_full[i] ) begin + push = 1'b1; + conflict_counter_d = conflict_counter_q-1; + end + end +`else + assign push = trans_complete_d[i] +`endif + end always_comb begin @@ -106,6 +149,7 @@ module idma_distributed_midend #( assign dst_addr = burst_req_i.dst[FullRegionAddressBits-1:0]; always_comb begin + if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin start_addr = src_addr; end else begin @@ -126,6 +170,23 @@ module 
idma_distributed_midend #( burst_req_o[i].dst = burst_req_i.dst; // Modify lower addresses bits and size if (($unsigned(start_addr) >= (i+1)*DmaRegionWidth) || ($unsigned(end_addr) <= i*DmaRegionWidth)) begin +`ifdef DAS + burst_req_o[i].num_bytes = (burst_req_i.num_bytes= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin + burst_req_o[i].src = burst_req_i.src+i*DmaRegionWidth; + burst_req_o[i].dst = burst_req_i.dst+i*allocated_size_i*DmaRegionWidth; + end else begin + // L2 --> L1 + if (burst_req_i.num_bytes<=DmaRegionWidth )begin + burst_req_o[i].src = burst_req_i.src+i*allocated_size_i*DmaRegionWidth; + end else if (i==2) begin + burst_req_o[i].src = burst_req_i.src+i*allocated_size_i*DmaRegionWidth; + end else if (i==3) begin + burst_req_o[i].src = burst_req_i.src+(i-1)*allocated_size_i*DmaRegionWidth + DmaRegionWidth; + end + burst_req_o[i].dst = burst_req_i.dst+i*DmaRegionWidth; + end +`else // We are not involved in the transfer burst_req_o[i].src = '0; burst_req_o[i].dst = '0; @@ -137,6 +198,7 @@ module idma_distributed_midend #( if (valid[i]) begin tie_off_trans_complete_d[i] = 1'b1; end +`endif end else if (($unsigned(start_addr) >= i*DmaRegionWidth)) begin // First (and potentially only) slice // Leave address as is @@ -146,6 +208,16 @@ module idma_distributed_midend #( burst_req_o[i].num_bytes = DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; end end else begin +`ifdef DAS + // Round up the address to the next DMA boundary + if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin + burst_req_o[i].src[FullRegionAddressBits-1:0] = i*DmaRegionWidth; + burst_req_o[i].dst = burst_req_i.dst+i*DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; + end else begin + burst_req_o[i].src = burst_req_i.src+(i-start_addr[DmaRegionAddressBits+1:DmaRegionAddressBits])*DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; + burst_req_o[i].dst[FullRegionAddressBits-1:0] = i*DmaRegionWidth; + end 
+`else // Round up the address to the next DMA boundary if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin burst_req_o[i].src[FullRegionAddressBits-1:0] = i*DmaRegionWidth; @@ -154,6 +226,7 @@ module idma_distributed_midend #( burst_req_o[i].src = burst_req_i.src+i*DmaRegionWidth-start_addr; burst_req_o[i].dst[FullRegionAddressBits-1:0] = i*DmaRegionWidth; end +`endif if ($unsigned(end_addr) >= (i+1)*DmaRegionWidth) begin // Middle slice // Emit a full-sized transfer diff --git a/hardware/deps/idma/src/midends/idma_distributed_midend_v2.sv b/hardware/deps/idma/src/midends/idma_distributed_midend_v2.sv deleted file mode 100644 index 8300ec55f..000000000 --- a/hardware/deps/idma/src/midends/idma_distributed_midend_v2.sv +++ /dev/null @@ -1,245 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 - -// Samuel Riedel -// Bowen Wang - -`include "common_cells/registers.svh" - -module idma_distributed_midend_v2 #( - /// Number of backends to distribute the requests to - parameter int unsigned NoMstPorts = 1, - /// Bytes covered by each port - parameter int unsigned DmaRegionWidth = 1, // [B] Region that one port covers in bytes - /// Start of the distributed memory region - parameter int unsigned DmaRegionStart = 32'h0000_0000, - /// End of the distributed memory region - parameter int unsigned DmaRegionEnd = 32'h1000_0000, - /// Number of generic 1D requests that can be buffered - parameter int unsigned TransFifoDepth = 1, - /// Arbitrary 1D burst request definition - parameter type burst_req_t = logic, - /// Meta data response definition - parameter type meta_t = logic -) ( - input logic clk_i, - input logic rst_ni, - // Slave - input burst_req_t burst_req_i, - input logic valid_i, - output logic ready_o, - output meta_t meta_o, - // partition related signals - input logic [7:0] allocated_size_i, - 
// Master - output burst_req_t [NoMstPorts-1:0] burst_req_o, - output logic [NoMstPorts-1:0] valid_o, - input logic [NoMstPorts-1:0] ready_i, - input meta_t [NoMstPorts-1:0] meta_i -); - - localparam DmaRegionAddressBits = $clog2(DmaRegionWidth); - localparam FullRegionAddressBits = $clog2(DmaRegionWidth*NoMstPorts); - localparam FullDmaRegionWidth = DmaRegionWidth*NoMstPorts; - - typedef logic [FullRegionAddressBits:0] full_addr_t; - - // Handle the ready signal - logic fork_ready, fifo_ready; - logic [NoMstPorts-1:0] fifo_full; - // Handle Metadata - logic [NoMstPorts-1:0] trans_complete_d, trans_complete_q; - logic [NoMstPorts-1:0] tie_off_trans_complete_d, tie_off_trans_complete_q; - logic [NoMstPorts-1:0] backend_idle_d, backend_idle_q; - assign meta_o.trans_complete = &trans_complete_q; - assign meta_o.backend_idle = &backend_idle_q; - assign fifo_ready = !(|fifo_full); - assign ready_o = fork_ready && fifo_ready; - - for (genvar i = 0; unsigned'(i) < NoMstPorts; i++) begin: gen_trans_complete_fifo - // Collect the `trans_complete` signals and reduce them once we have all of them - logic empty; - logic data; - logic conflict_push; - fifo_v3 #( - .FALL_THROUGH (0 ), - .DATA_WIDTH (1 ), - .DEPTH (TransFifoDepth) - ) i_trans_complete_fifo ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .flush_i ('0 ), - .testmode_i ('0 ), - .full_o (fifo_full[i] ), - .empty_o (empty ), - .usage_o (/*unused*/ ), - .data_i (1'b1 ), - .push_i ( (trans_complete_d[i] | conflict_push) & (fifo_full[i]==0) ), - .data_o (data ), - .pop_i (meta_o.trans_complete) - ); - assign trans_complete_d[i] = meta_i[i].trans_complete | tie_off_trans_complete_q[i]; - assign trans_complete_q[i] = data && !empty; - - // handle two complete signals arrive at the same time - logic [3:0] conflict_complete_d, conflict_complete_q; - `FF(conflict_complete_q, conflict_complete_d, '0, clk_i, rst_ni) - - always_comb begin - conflict_complete_d = conflict_complete_q; - conflict_push = 0; - if 
(meta_i[i].trans_complete & tie_off_trans_complete_q[i] & (fifo_full[i]==0)) begin // FIFO is not full - conflict_complete_d = conflict_complete_q+1; - end - if (meta_i[i].trans_complete & tie_off_trans_complete_q[i] & (fifo_full[i]!=0)) begin // FIFO is full - conflict_complete_d = conflict_complete_q+2; - end - if (!meta_i[i].trans_complete & tie_off_trans_complete_q[i] & (fifo_full[i]!=0)) begin - conflict_complete_d = conflict_complete_q+1; - end - if (meta_i[i].trans_complete & !tie_off_trans_complete_q[i] & (fifo_full[i]!=0)) begin - conflict_complete_d = conflict_complete_q+1; - end - - if ( (conflict_complete_q!=0) & (trans_complete_d[i]==0) & (fifo_full[i]==0) ) begin // FIFO is not full, safe to push - conflict_push = 1; - conflict_complete_d = conflict_complete_q-1; - end - end - - end - - always_comb begin - backend_idle_d = backend_idle_q; - for (int unsigned i = 0; i < NoMstPorts; i++) begin - backend_idle_d[i] = meta_i[i].backend_idle; - end - end - `FF(tie_off_trans_complete_q, tie_off_trans_complete_d, '0, clk_i, rst_ni) - `FF(backend_idle_q, backend_idle_d, '1, clk_i, rst_ni) - - // Fork - logic [NoMstPorts-1:0] valid, ready; - stream_fork #( - .N_OUP (NoMstPorts) - ) i_stream_fork ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .valid_i (valid_i & fifo_ready), - .ready_o (fork_ready ), - .valid_o (valid ), - .ready_i (ready ) - ); - - full_addr_t src_addr, dst_addr, start_addr, end_addr; - - assign src_addr = burst_req_i.src[FullRegionAddressBits-1:0]; - assign dst_addr = burst_req_i.dst[FullRegionAddressBits-1:0]; - - logic [1:0] num_split, split_offset; - // logic num_split, split_offset; - - always_comb begin - num_split = burst_req_i.num_bytes / DmaRegionWidth; - if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin - start_addr = src_addr; - end else begin - start_addr = dst_addr; - end - end_addr = start_addr+burst_req_i.num_bytes; - split_offset = 
start_addr[DmaRegionAddressBits+1:DmaRegionAddressBits]; - // split_offset = start_addr[DmaRegionAddressBits]; - // Connect valid ready by default - valid_o = valid; - ready = ready_i; - // Do not interfere with metadata per default - tie_off_trans_complete_d = '0; - for (int i = 0; i < NoMstPorts; i++) begin - tie_off_trans_complete_d[i] = tie_off_trans_complete_q[i] && meta_i[i].trans_complete; - // Feed metadata through directly - burst_req_o[i] = burst_req_i; - // Feed through the address bits - burst_req_o[i].src = burst_req_i.src; - burst_req_o[i].dst = burst_req_i.dst; - - // Modify lower addresses bits and size - if (($unsigned(start_addr) >= (i+1)*DmaRegionWidth) || ($unsigned(end_addr) <= i*DmaRegionWidth)) begin -`ifdef DAS - burst_req_o[i].num_bytes = (burst_req_i.num_bytes= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin - burst_req_o[i].src = burst_req_i.src+i*DmaRegionWidth; - burst_req_o[i].dst = burst_req_i.dst+i*allocated_size_i*DmaRegionWidth; - end else begin - // L2 --> L1 - if (burst_req_i.num_bytes<=DmaRegionWidth )begin - burst_req_o[i].src = burst_req_i.src+i*allocated_size_i*DmaRegionWidth; - end else if (i==2) begin - burst_req_o[i].src = burst_req_i.src+i*allocated_size_i*DmaRegionWidth; - end else if (i==3) begin - burst_req_o[i].src = burst_req_i.src+(i-1)*allocated_size_i*DmaRegionWidth + DmaRegionWidth; - end - burst_req_o[i].dst = burst_req_i.dst+i*DmaRegionWidth; - end -`else - // We are not involved in the transfer - burst_req_o[i].src = '0; - burst_req_o[i].dst = '0; - burst_req_o[i].num_bytes = 1; - // Make handshake ourselves - valid_o[i] = 1'b0; - ready[i] = 1'b1; - // Inject trans complete - if (valid[i]) begin - tie_off_trans_complete_d[i] = 1'b1; - end -`endif - - end else if (($unsigned(start_addr) >= i*DmaRegionWidth)) begin - // First (and potentially only) slice - // Leave address as is - if ($unsigned(end_addr) <= (i+1)*DmaRegionWidth) begin - burst_req_o[i].num_bytes = burst_req_i.num_bytes; - 
end else begin - burst_req_o[i].num_bytes = DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; - end - - end else begin - // Round up the address to the next DMA boundary - if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin - burst_req_o[i].src[FullRegionAddressBits-1:0] = i*DmaRegionWidth; - burst_req_o[i].dst = burst_req_i.dst+i*DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; - end else begin - burst_req_o[i].src = burst_req_i.src+(i-split_offset)*DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; - burst_req_o[i].dst[FullRegionAddressBits-1:0] = i*DmaRegionWidth; - end - if ($unsigned(end_addr) >= (i+1)*DmaRegionWidth) begin - // Middle slice - // Emit a full-sized transfer - burst_req_o[i].num_bytes = DmaRegionWidth; - end else begin - // Last slice - burst_req_o[i].num_bytes = end_addr[DmaRegionAddressBits-1:0]; - end - end - end - end - - // pragma translate_off - int f; - always_ff @(posedge clk_i or negedge rst_ni) begin - automatic string str; - if (rst_ni && valid_i && ready_o) begin - str = "\n[idma_distributed_midend_v2] Got request\n"; - str = $sformatf("%sRequest in: From: 0x%8x To: 0x%8x with size %d\n", str, burst_req_i.src, burst_req_i.dst, burst_req_i.num_bytes); - for (int i = 0; i < NoMstPorts; i++) begin - str = $sformatf("%sOut %6d: From: 0x%8x To: 0x%8x with size %d\n", str, i, burst_req_o[i].src, burst_req_o[i].dst, burst_req_o[i].num_bytes); - end - f = $fopen("dma.log", "a"); - $fwrite(f, str); - $fclose(f); - end - end - // pragma translate_on - -endmodule diff --git a/hardware/deps/idma/src/midends/idma_split_midend.sv b/hardware/deps/idma/src/midends/idma_split_midend.sv index 42a21e2d2..05b83a80a 100644 --- a/hardware/deps/idma/src/midends/idma_split_midend.sv +++ b/hardware/deps/idma/src/midends/idma_split_midend.sv @@ -3,6 +3,8 @@ // SPDX-License-Identifier: SHL-0.51 // Samuel Riedel +// Bowen Wang +// Marco Bertuletti `include "common_cells/registers.svh" @@ -11,11 
+13,25 @@ module idma_split_midend #( parameter int unsigned DmaRegionStart = 32'h0000_0000, parameter int unsigned DmaRegionEnd = 32'h1000_0000, parameter int unsigned AddrWidth = 32, +`ifdef DAS + parameter int unsigned NumTiles = 64, + parameter int unsigned NumBanksPerTile = 32, + parameter int unsigned TCDMSizePerBank = 1024, + parameter int unsigned NumDASPartitions = 4, + parameter int unsigned DASStartAddr = 1024, +`endif parameter type burst_req_t = logic, parameter type meta_t = logic ) ( input logic clk_i, input logic rst_ni, +`ifdef DAS + // DAS signals + input logic [NumDASPartitions-1:0][$clog2(NumTiles):0] group_factor_i, + input logic [NumDASPartitions-1:0][$clog2(NumTiles):0] allocated_size_i, + input logic [NumDASPartitions-1:0][AddrWidth-1:0] start_addr_scheme_i, + output logic [$clog2(NumTiles):0] allocated_size_o, +`endif // Slave input burst_req_t burst_req_i, input logic valid_i, @@ -28,16 +44,13 @@ module idma_split_midend #( input meta_t meta_i ); + // ------ Parameter Settings ------ // localparam DmaRegionAddressBits = $clog2(DmaRegionWidth); - typedef logic [AddrWidth-1:0] addr_t; - addr_t start_addr, end_addr; - logic req_valid; - - - // Handle Metadata + // ------ Handle Metadata ------ // // Forward idle signal and count the trans_comlete signal + logic req_valid; logic [31:0] num_trans_d, num_trans_q; assign meta_o.backend_idle = meta_i.backend_idle; @@ -56,16 +69,130 @@ module idma_split_midend #( end `FF(num_trans_q, num_trans_d, '0, clk_i, rst_ni) - // Split requests +`ifdef DAS + localparam TileDmaRegionWidth = DmaRegionWidth / NumTiles; + logic [AddrWidth-1:0] PartitionDmaRegionWidth; + localparam DmaBackendWidth = NumBanksPerTile*NumTiles*4; // 32banks*8Tiles*4bytes + + // ------ Address translation ------ // + // Only the address in L1 SPM will be scrambled + logic [AddrWidth-1:0] post_scramble_src; + logic [AddrWidth-1:0] post_scramble_dst; + logic [$clog2(NumTiles):0] group_factor_src, group_factor_dst, group_factor_sel; + 
logic [$clog2(NumTiles):0] allocated_size_src, allocated_size_dst, allocated_size_sel; + + assign group_factor_sel = group_factor_src | group_factor_dst; + assign allocated_size_sel = allocated_size_src | allocated_size_dst; + assign PartitionDmaRegionWidth = TileDmaRegionWidth * group_factor_sel; + + idma_address_scrambler #( + .AddrWidth (AddrWidth ), + .NumTiles (NumTiles ), + .NumBanksPerTile (NumBanksPerTile ), + .Bypass (0 ), + .NumDASPartitions (NumDASPartitions), + .TCDMSizePerBank (TCDMSizePerBank ) + ) i_idma_address_scrambler_src ( + .address_i (burst_req_i.src), + .num_bytes_i (burst_req_i.num_bytes), + .group_factor_i (group_factor_i), + .allocated_size_i (allocated_size_i), + .start_addr_scheme_i(start_addr_scheme_i), + .group_factor_o (group_factor_src), + .allocated_size_o (allocated_size_src), + .address_o (post_scramble_src) + ); + + idma_address_scrambler #( + .AddrWidth (AddrWidth ), + .NumTiles (NumTiles ), + .NumBanksPerTile (NumBanksPerTile ), + .Bypass (0 ), + .NumDASPartitions (NumDASPartitions), + .TCDMSizePerBank (TCDMSizePerBank ) + ) i_idma_address_scrambler_dst ( + .address_i (burst_req_i.dst), + .num_bytes_i (burst_req_i.num_bytes), + .group_factor_i (group_factor_i), + .allocated_size_i (allocated_size_i), + .start_addr_scheme_i(start_addr_scheme_i), + .group_factor_o (group_factor_dst), + .allocated_size_o (allocated_size_dst), + .address_o (post_scramble_dst) + ); + + // ------ Filter out address in L1 SPM ------ // + addr_t start_addr; + logic spm2dram; + always_comb begin + spm2dram = 0; + if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin + start_addr = post_scramble_src; + spm2dram = 1; + end else begin + start_addr = post_scramble_dst; + spm2dram = 0; + end + end + + // ------ Considering Partition Scheme ------ // + logic [$clog2(NumTiles):0] shift_index; + logic [AddrWidth-1:0] partition_mask; + addr_t masked_start_addr; + + always_comb begin + case(group_factor_sel) + 
128: shift_index = 7; + 64: shift_index = 6; + 32: shift_index = 5; + 16: shift_index = 4; + 8: shift_index = 3; + 4: shift_index = 2; + 2: shift_index = 1; + default: shift_index = 0; + endcase + end + + assign partition_mask = {DmaRegionAddressBits{1'b1}} >> ($clog2(NumTiles) - shift_index); + assign masked_start_addr = start_addr & partition_mask; + + // ------ Beat Counter and Shifter Handler ------ // + logic [$clog2(NumTiles):0] beat_cnt_d, beat_cnt_q; + `FFARN(beat_cnt_q, beat_cnt_d, '0, clk_i, rst_ni) + + logic [$clog2(NumTiles):0] shift_row, shift_partition; + logic [$clog2(NumTiles):0] shift_index_sc; + logic [$clog2(NumTiles):0] mask_shift_row; + + always_comb begin + case(allocated_size_sel) + 128: shift_index_sc = 7; + 64: shift_index_sc = 6; + 32: shift_index_sc = 5; + 16: shift_index_sc = 4; + 8: shift_index_sc = 3; + 4: shift_index_sc = 2; + 2: shift_index_sc = 1; + default: shift_index_sc = 0; + endcase + end + + assign shift_partition = beat_cnt_q >> shift_index_sc; + assign mask_shift_row = ~( {($clog2(NumTiles) + 1){1'b1}} << shift_index_sc ); + assign shift_row = beat_cnt_q & mask_shift_row; +`else + // ------ Filter out address in L1 SPM ------ // + addr_t start_addr; always_comb begin if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin start_addr = burst_req_i.src; end else begin start_addr = burst_req_i.dst; end - end_addr = start_addr + burst_req_i.num_bytes; end +`endif + // ------ Split requests ------ // enum logic {Idle, Busy} state_d, state_q; burst_req_t req_d, req_q; @@ -80,9 +207,63 @@ module idma_split_midend #( ready_o = 1'b0; req_valid = 1'b0; +`ifdef DAS + allocated_size_o = allocated_size_sel; + beat_cnt_d = beat_cnt_q; + if (num_trans_q == 1 && num_trans_d == 0) begin + beat_cnt_d = 0; + end +`endif + unique case (state_q) Idle: begin - if (valid_i) begin // Splitting required. 
+ if (valid_i) begin // Splitting required +`ifdef DAS + if ((PartitionDmaRegionWidth-masked_start_addr) >= burst_req_i.num_bytes) begin + burst_req_o = burst_req_i; + // Address in SPM need to be translated back to physical address + if (spm2dram) begin + burst_req_o.src = post_scramble_src; + end else begin + burst_req_o.dst = post_scramble_dst; + end + valid_o = 1'b1; + ready_o = ready_i; + req_valid = ready_i; + end else begin + // Store and acknowledge + req_d = burst_req_i; + ready_o = 1'b1; + burst_req_o = burst_req_i; + // Calculate the size for the 1st burst + burst_req_o.num_bytes = PartitionDmaRegionWidth-masked_start_addr; + // TODO (bowwang): parameterize + req_d.num_bytes = (group_factor_sel <= $clog2(NumTiles) + 1) ? (allocated_size_sel*DmaBackendWidth) : (allocated_size_sel*PartitionDmaRegionWidth); + if (spm2dram) begin + burst_req_o.src = post_scramble_src; + req_d.src = post_scramble_src; + end else begin + burst_req_o.dst = post_scramble_dst; + req_d.dst = post_scramble_dst; + end + valid_o = 1'b1; + // Modify the stored info after first beat sent + if (ready_i) begin + // TODO (bowwang): May not be mecessary to consider alignment + req_d.num_bytes -= PartitionDmaRegionWidth-masked_start_addr; + if (spm2dram) begin + req_d.src += DmaRegionWidth-masked_start_addr; + req_d.dst += PartitionDmaRegionWidth-masked_start_addr; + end else begin + req_d.src += PartitionDmaRegionWidth-masked_start_addr; + req_d.dst += DmaRegionWidth-masked_start_addr; + end + req_valid = 1'b1; + beat_cnt_d = 1; + end + state_d = Busy; + end +`else if (DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0] >= burst_req_i.num_bytes) begin // No splitting required, just forward burst_req_o = burst_req_i; @@ -108,6 +289,7 @@ module idma_split_midend #( end state_d = Busy; end +`endif end end Busy: begin @@ -115,7 +297,37 @@ module idma_split_midend #( burst_req_o = req_q; valid_o = 1'b1; req_valid = ready_i; - if (req_q.num_bytes <= DmaRegionWidth) begin +`ifdef DAS + if 
($unsigned(req_q.num_bytes) <= $unsigned(PartitionDmaRegionWidth)) begin + // Last split + if (ready_i) begin + state_d = Idle; + beat_cnt_d = beat_cnt_q + 1; + end + end else begin + burst_req_o.num_bytes = PartitionDmaRegionWidth; + if (ready_i) begin + req_d.num_bytes = req_q.num_bytes - PartitionDmaRegionWidth; + beat_cnt_d = beat_cnt_q + 1; + if (spm2dram) begin + if (shift_row == allocated_size_sel-1) begin + req_d.src = req_q.src + PartitionDmaRegionWidth - shift_row*DmaRegionWidth; + end else begin + req_d.src = req_q.src + DmaRegionWidth; + end + req_d.dst = req_q.dst + PartitionDmaRegionWidth; + end else begin + req_d.src = req_q.src + PartitionDmaRegionWidth; + if (shift_row == allocated_size_sel-1) begin + req_d.dst = req_q.dst + PartitionDmaRegionWidth - shift_row*DmaRegionWidth; + end else begin + req_d.dst = req_q.dst + DmaRegionWidth; + end + end// spm2dram + end // ready_i + end +`else + if ($unsigned(req_q.num_bytes) <= $unsigned(DmaRegionWidth)) begin // Last split if (ready_i) begin state_d = Idle; @@ -129,6 +341,7 @@ module idma_split_midend #( req_d.dst = req_q.dst + DmaRegionWidth; end end +`endif end default: /*do nothing*/; endcase @@ -139,14 +352,14 @@ module idma_split_midend #( always_ff @(posedge clk_i or negedge rst_ni) begin automatic string str; if (rst_ni && valid_i && ready_o) begin - str = "[idma_split_midend] Got request\n"; + str = "\n\n[idma_split_midend] Got request\n"; str = $sformatf("%sSplit: Request in: From: 0x%8x To: 0x%8x with size %d\n", str, burst_req_i.src, burst_req_i.dst, burst_req_i.num_bytes); f = $fopen("dma.log", "a"); $fwrite(f, str); $fclose(f); end if (rst_ni && valid_o && ready_i) begin - str = $sformatf("Split: Out %6d: From: 0x%8x To: 0x%8x with size %d\n", num_trans_q, burst_req_o.src, burst_req_o.dst, burst_req_o.num_bytes); + str = $sformatf("Split: Out %6d: From: 0x%8x To: 0x%8x with size %d, start_addr 0x%8x.\n", num_trans_q, burst_req_o.src, burst_req_o.dst, burst_req_o.num_bytes, start_addr); f = 
$fopen("dma.log", "a"); $fwrite(f, str); $fclose(f); diff --git a/hardware/deps/idma/src/midends/idma_split_midend_v2.sv b/hardware/deps/idma/src/midends/idma_split_midend_v2.sv deleted file mode 100644 index 24c44d926..000000000 --- a/hardware/deps/idma/src/midends/idma_split_midend_v2.sv +++ /dev/null @@ -1,351 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 - -// Samuel Riedel -// Bowen Wang - - -// The Split Midend (v2) slice one burst request aligned to partition boundary, instead of -// L1 SPM boundary as in v1. - -`include "common_cells/registers.svh" - -module idma_split_midend_v2 #( - parameter int unsigned DmaRegionWidth = 1, // [B] Region that one port covers in bytes - parameter int unsigned DmaRegionStart = 32'h0000_0000, - parameter int unsigned DmaRegionEnd = 32'h1000_0000, - parameter int unsigned AddrWidth = 32, - parameter type burst_req_t = logic, - parameter type meta_t = logic -) ( - input logic clk_i, - input logic rst_ni, - // Slave - input burst_req_t burst_req_i, - input logic valid_i, - output logic ready_o, - output meta_t meta_o, - - // Partition related signals - input logic [3:0][7:0] group_factor_i, - input logic [3:0][7:0] allocated_size_i, - input logic [3:0][AddrWidth-1:0] start_addr_scheme_i, - output logic [7:0] allocated_size_o, - - // Master - output burst_req_t burst_req_o, - output logic valid_o, - input logic ready_i, - input meta_t meta_i -); - - // ------ Parameter Settings ------ // - typedef logic [AddrWidth-1:0] addr_t; - localparam DmaRegionAddressBits = $clog2(DmaRegionWidth); - localparam TileDmaRegionWidth = DmaRegionWidth / 128; - logic [AddrWidth-1:0] PartitionDmaRegionWidth; - - localparam DmaBackendWidth = 32*8*4; // 32banks*8Tiles*4bytes - - // ------ Address translation ------ // - // Only the address in L1 SPM will be scrambled - logic [AddrWidth-1:0] post_scramble_src; - logic 
[AddrWidth-1:0] post_scramble_dst; - logic [7:0] group_factor_src, group_factor_dst, group_factor_sel; - logic [7:0] allocated_size_src, allocated_size_dst, allocated_size_sel; - - assign group_factor_sel = group_factor_src | group_factor_dst; - assign allocated_size_sel = allocated_size_src | allocated_size_dst; - assign PartitionDmaRegionWidth = TileDmaRegionWidth * group_factor_sel; - - idma_address_scrambler i_idma_address_scrambler_src ( - .address_i (burst_req_i.src), - .num_bytes_i (burst_req_i.num_bytes), - .group_factor_i (group_factor_i), - .allocated_size_i (allocated_size_i), - .start_addr_scheme_i(start_addr_scheme_i), - .group_factor_o (group_factor_src), - .allocated_size_o (allocated_size_src), - .address_o (post_scramble_src) - ); - - idma_address_scrambler i_idma_address_scrambler_dst ( - .address_i (burst_req_i.dst), - .num_bytes_i (burst_req_i.num_bytes), - .group_factor_i (group_factor_i), - .allocated_size_i (allocated_size_i), - .start_addr_scheme_i(start_addr_scheme_i), - .group_factor_o (group_factor_dst), - .allocated_size_o (allocated_size_dst), - .address_o (post_scramble_dst) - ); - - // ------ Filter out address in L1 SPM ------ // - addr_t start_addr, end_addr; - logic spm2dram; - - always_comb begin - spm2dram = 0; - if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin - start_addr = post_scramble_src; - spm2dram = 1; - end else begin - start_addr = post_scramble_dst; - spm2dram = 0; - end - // not used - end_addr = start_addr + burst_req_i.num_bytes; - end - - // ------ Considering Partition Scheme ------ // - logic [2:0] shift_index; - logic [AddrWidth-1:0] partition_mask; - addr_t masked_start_addr; - - always_comb begin - case(group_factor_sel) - 128: shift_index = 0; - 64: shift_index = 1; - 32: shift_index = 2; - 16: shift_index = 3; - 8: shift_index = 4; - 4: shift_index = 5; - 2: shift_index = 6; - default: shift_index = 7; - endcase - end - - assign partition_mask = 
{DmaRegionAddressBits{1'b1}} >> shift_index; - assign masked_start_addr = start_addr & partition_mask; - - // ------ Handle Metadata ------ // - // Forward idle signal and count the trans_comlete signal - logic req_valid; - logic [31:0] num_trans_d, num_trans_q; - - assign meta_o.backend_idle = meta_i.backend_idle; - always_comb begin - num_trans_d = num_trans_q; - meta_o.trans_complete = 1'b0; - - if (req_valid) begin - num_trans_d += 1; - end - if (meta_i.trans_complete) begin - num_trans_d -= 1; - end - if (num_trans_q == 1 && num_trans_d == 0) begin - meta_o.trans_complete = 1'b1; - end - end - `FF(num_trans_q, num_trans_d, '0, clk_i, rst_ni) - - // ------ Beat Counter and Shifter Handler ------ // - logic [7:0] beat_cnt_d, beat_cnt_q; - `FFARN(beat_cnt_q, beat_cnt_d, '0, clk_i, rst_ni) - - logic [7:0] shift_row, shift_partition; - logic [2:0] shift_index_sc; - logic [7:0] mask_shift_row; - - always_comb begin - case(allocated_size_sel) - 128: shift_index_sc = 7; - 64: shift_index_sc = 6; - 32: shift_index_sc = 5; - 16: shift_index_sc = 4; - 8: shift_index_sc = 3; - 4: shift_index_sc = 2; - 2: shift_index_sc = 1; - default: shift_index_sc = 0; - endcase - end - - assign shift_partition = beat_cnt_q >> shift_index_sc; - assign mask_shift_row = ~( {8{1'b1}}<= burst_req_i.num_bytes ) begin - burst_req_o = burst_req_i; - // Address in SPM need to be translated back to physical address - if (spm2dram) begin - burst_req_o.src = post_scramble_src; - end else begin - burst_req_o.dst = post_scramble_dst; - end - valid_o = 1'b1; - ready_o = ready_i; - req_valid = ready_i; - end else begin - // Store and acknowledge - req_d = burst_req_i; - ready_o = 1'b1; - burst_req_o = burst_req_i; - // Calculate the size for the 1st burst - burst_req_o.num_bytes = PartitionDmaRegionWidth-masked_start_addr; - // TODO (bowwang): parameterize - req_d.num_bytes = (group_factor_sel <= 8) ? 
(allocated_size_sel*DmaBackendWidth) : (allocated_size_sel*PartitionDmaRegionWidth); - if (spm2dram) begin - burst_req_o.src = post_scramble_src; - req_d.src = post_scramble_src; - end else begin - burst_req_o.dst = post_scramble_dst; - req_d.dst = post_scramble_dst; - end - valid_o = 1'b1; - // Modify the stored info after first beat sent - if (ready_i) begin - // TODO (bowwang): May not be mecessary to consider alignment - req_d.num_bytes -= PartitionDmaRegionWidth-masked_start_addr; - if (spm2dram) begin - req_d.src += DmaRegionWidth-masked_start_addr; - req_d.dst += PartitionDmaRegionWidth-masked_start_addr; - end else begin - req_d.src += PartitionDmaRegionWidth-masked_start_addr; - req_d.dst += DmaRegionWidth-masked_start_addr; - end - req_valid = 1'b1; - beat_cnt_d = 1; - end - state_d = Busy; - end -`else - if ( (DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]) >= burst_req_i.num_bytes) begin - burst_req_o = burst_req_i; - valid_o = 1'b1; - ready_o = ready_i; - req_valid = ready_i; - end else begin - // Store and acknowledge - req_d = burst_req_i; - ready_o = 1'b1; - burst_req_o = burst_req_i; - // Calculate the size for the 1st burst - burst_req_o.num_bytes = DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; - valid_o = 1'b1; - // Modify the stored info after first beat sent - if (ready_i) begin - req_d.num_bytes -= DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; - req_d.src += DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; - req_d.dst += DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; - req_valid = 1'b1; - end - state_d = Busy; - end -`endif - end - end // Idle - - Busy: begin - // Sent next burst from split. 
- burst_req_o = req_q; - valid_o = 1'b1; - req_valid = ready_i; - -`ifdef PARTITION - if ($unsigned(req_q.num_bytes) <= $unsigned(PartitionDmaRegionWidth)) begin - // Last split - if (ready_i) begin - state_d = Idle; - beat_cnt_d = beat_cnt_q + 1; - end - end else begin - burst_req_o.num_bytes = PartitionDmaRegionWidth; - if (ready_i) begin - req_d.num_bytes = req_q.num_bytes - PartitionDmaRegionWidth; - beat_cnt_d = beat_cnt_q + 1; - if (spm2dram) begin - if (shift_row == allocated_size_sel-1) begin - req_d.src = req_q.src + PartitionDmaRegionWidth - shift_row*DmaRegionWidth; - end else begin - req_d.src = req_q.src + DmaRegionWidth; - end - req_d.dst = req_q.dst + PartitionDmaRegionWidth; - end else begin - req_d.src = req_q.src + PartitionDmaRegionWidth; - if (shift_row == allocated_size_sel-1) begin - req_d.dst = req_q.dst + PartitionDmaRegionWidth - shift_row*DmaRegionWidth; - if (dma_mode_i == 2'b10) begin // duplication mode: recover start adddr - req_d.src = dup_start_addr_q; - end - end else begin - req_d.dst = req_q.dst + DmaRegionWidth; - end - end// spm2dram - end // ready_i - end -`else - if ($unsigned(req_q.num_bytes) <= $unsigned(DmaRegionWidth)) begin - if (ready_i) begin - state_d = Idle; - end - end else begin - burst_req_o.num_bytes = DmaRegionWidth; - if (ready_i) begin - req_d.num_bytes = req_q.num_bytes - DmaRegionWidth; - req_d.src = req_q.src + DmaRegionWidth; - req_d.dst = req_q.dst + DmaRegionWidth; - end - end -`endif - end // Busy - default: /*do nothing*/; - endcase - end - - // pragma translate_off - int f; - always_ff @(posedge clk_i or negedge rst_ni) begin - automatic string str; - if (rst_ni && valid_i && ready_o) begin - str = "\n\n[idma_split_midend_v2] Got request\n"; - str = $sformatf("%sSplit: Request in: From: 0x%8x To: 0x%8x with size %d\n", str, burst_req_i.src, burst_req_i.dst, burst_req_i.num_bytes); - f = $fopen("dma.log", "a"); - $fwrite(f, str); - $fclose(f); - end - if (rst_ni && valid_o && ready_i) begin - str = 
$sformatf("Split: Out %6d: From: 0x%8x To: 0x%8x with size %d, start_addr 0x%8x.\n", num_trans_q, burst_req_o.src, burst_req_o.dst, burst_req_o.num_bytes, start_addr); - f = $fopen("dma.log", "a"); - $fwrite(f, str); - $fclose(f); - end - end - // pragma translate_on - -endmodule diff --git a/hardware/src/mempool_cluster.sv b/hardware/src/mempool_cluster.sv index a733102ed..4164bc5aa 100644 --- a/hardware/src/mempool_cluster.sv +++ b/hardware/src/mempool_cluster.sv @@ -24,12 +24,14 @@ module mempool_cluster input logic scan_enable_i, input logic scan_data_i, output logic scan_data_o, - // Wake up signal - input logic [NumCores-1:0] wake_up_i, +`ifdef DAS // Partition Selection input logic [3:0][PartitionDataWidth-1:0] partition_sel_i, input logic [3:0][PartitionDataWidth-1:0] allocated_size_i, input logic [3:0][DataWidth-1:0] start_addr_scheme_i, +`endif + // Wake up signal + input logic [NumCores-1:0] wake_up_i, // RO-Cache configuration input ro_cache_ctrl_t ro_cache_ctrl_i, // DMA request @@ -77,6 +79,7 @@ module mempool_cluster `FF(dma_meta_o, dma_meta_cut, '0, clk_i, rst_ni); + dma_req_t dma_req_split; logic dma_req_split_valid; logic dma_req_split_ready; @@ -85,45 +88,43 @@ module mempool_cluster logic [NumGroups-1:0] dma_req_group_valid, dma_req_group_q_valid; logic [NumGroups-1:0] dma_req_group_ready, dma_req_group_q_ready; dma_meta_t [NumGroups-1:0] dma_meta, dma_meta_q; - // dma after partition - dma_req_t dma_req_partition; - logic dma_req_partition_valid; - logic dma_req_partition_ready; - dma_meta_t dma_meta_partition; - logic [7:0] dma_beat_cnt; - logic partition_req_valid; - logic [7:0] group_factor_sel, allocated_size_sel; + logic [PartitionDataWidth-1:0] allocated_size_sel; `FF(dma_meta_q, dma_meta, '0, clk_i, rst_ni); - logic part_beat_cnt_rst; - - idma_split_midend_v2 #( - .DmaRegionWidth (NumBanksPerGroup*NumGroups*4), // #DmaBytes = #banks*4 = 4096*4 // size per row - .DmaRegionStart (TCDMBaseAddr ), // 0x0000_0000, defined in tb - 
.DmaRegionEnd (TCDMBaseAddr+TCDMSize ), // TCDMSize = #banks*l1banksize = 4096*1024 // size of DMA region - .AddrWidth (AddrWidth ), - .burst_req_t (dma_req_t ), - .meta_t (dma_meta_t ) - ) i_idma_split_midend_v2 ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - // slave - .burst_req_i (dma_req_cut ), - .valid_i (dma_req_cut_valid ), - .ready_o (dma_req_cut_ready ), - .meta_o (dma_meta_cut ), - .burst_req_o (dma_req_partition ), - .valid_o (dma_req_partition_valid), - .ready_i (dma_req_partition_ready), - .meta_i (dma_meta_partition ), + + idma_split_midend #( + .DmaRegionWidth (NumBanksPerGroup*NumGroups*4), // #DmaBytes = #banks*4 = 4096*4 // size per row + .DmaRegionStart (TCDMBaseAddr ), // 0x0000_0000, defined in tb + .DmaRegionEnd (TCDMBaseAddr+TCDMSize ), // TCDMSize = #banks*l1banksize = 4096*1024 // size of DMA region + .AddrWidth (AddrWidth ), + .burst_req_t (dma_req_t ), + .meta_t (dma_meta_t ), + .NumTiles (NumTiles ), + .NumBanksPerTile (NumBanksPerTile ), + .TCDMSizePerBank (TCDMSizePerBank ), + .NumDASPartitions (NumDASPartitions ), + .DASStartAddr (DASStartAddr ) + ) i_idma_split_midend ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), +`ifdef DAS // partition information - .group_factor_i (partition_sel_i ), - .allocated_size_i (allocated_size_i ), - .start_addr_scheme_i(start_addr_scheme_i ), - .allocated_size_o (allocated_size_sel ) + .group_factor_i (partition_sel_i ), + .allocated_size_i (allocated_size_i ), + .start_addr_scheme_i(start_addr_scheme_i), + .allocated_size_o (allocated_size_sel ), +`endif + .burst_req_i(dma_req_cut ), + .valid_i (dma_req_cut_valid ), + .ready_o (dma_req_cut_ready ), + .meta_o (dma_meta_cut ), + .burst_req_o(dma_req_split ), + .valid_o (dma_req_split_valid), + .ready_i (dma_req_split_ready), + .meta_i (dma_meta_split ) ); - idma_distributed_midend_v2 #( + idma_distributed_midend #( .NoMstPorts (NumGroups ), .DmaRegionWidth (NumBanksPerGroup*4 ), .DmaRegionStart (TCDMBaseAddr ), @@ -131,21 +132,21 @@ module mempool_cluster 
.TransFifoDepth (16 ), .burst_req_t (dma_req_t ), .meta_t (dma_meta_t ) - ) i_idma_distributed_midend_v2 ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - // slave - .burst_req_i (dma_req_partition ), - .valid_i (dma_req_partition_valid), - .ready_o (dma_req_partition_ready), - .meta_o (dma_meta_partition ), + ) i_idma_distributed_midend ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), +`ifdef DAS // partition info - .allocated_size_i(allocated_size_sel ), - // master - .burst_req_o (dma_req_group ), - .valid_o (dma_req_group_valid ), - .ready_i (dma_req_group_ready ), - .meta_i (dma_meta_q ) + .allocated_size_i(allocated_size_sel), +`endif + .burst_req_i (dma_req_split ), + .valid_i (dma_req_split_valid), + .ready_o (dma_req_split_ready), + .meta_o (dma_meta_split ), + .burst_req_o (dma_req_group ), + .valid_o (dma_req_group_valid), + .ready_i (dma_req_group_ready), + .meta_i (dma_meta_q ) ); for (genvar g = 0; unsigned'(g) < NumGroups; g++) begin: gen_dma_req_group_register @@ -317,16 +318,18 @@ module mempool_cluster .tcdm_slave_resp_o (tcdm_slave_resp[g] ), .tcdm_slave_resp_valid_o (tcdm_slave_resp_valid[g] ), .tcdm_slave_resp_ready_i (tcdm_slave_resp_ready[g] ), - .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), +`ifdef DAS .partition_sel_i (partition_sel_i ), .start_addr_scheme_i (start_addr_scheme_i ), .allocated_size_i (allocated_size_i ), + .dma_allocated_size_sel_i(allocated_size_sel ), +`endif + .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), .ro_cache_ctrl_i (ro_cache_ctrl_q[g] ), // DMA request .dma_req_i (dma_req_group_q[g] ), .dma_req_valid_i (dma_req_group_q_valid[g] ), .dma_req_ready_o (dma_req_group_q_ready[g] ), - .dma_allocated_size_sel_i(allocated_size_sel ), // DMA status .dma_meta_o_backend_idle_ (dma_meta[g][1] ), .dma_meta_o_trans_complete_ (dma_meta[g][0] ), @@ -362,16 +365,18 @@ module mempool_cluster .tcdm_slave_resp_o (tcdm_slave_resp[g] ), .tcdm_slave_resp_valid_o (tcdm_slave_resp_valid[g] ), 
.tcdm_slave_resp_ready_i (tcdm_slave_resp_ready[g] ), - .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), +`ifdef DAS .partition_sel_i (partition_sel_i ), .start_addr_scheme_i (start_addr_scheme_i ), .allocated_size_i (allocated_size_i ), + .dma_allocated_size_sel_i(allocated_size_sel ), +`endif + .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), .ro_cache_ctrl_i (ro_cache_ctrl_q[g] ), // DMA request .dma_req_i (dma_req_group_q[g] ), .dma_req_valid_i (dma_req_group_q_valid[g] ), .dma_req_ready_o (dma_req_group_q_ready[g] ), - .dma_allocated_size_sel_i(allocated_size_sel ), // DMA status .dma_meta_o (dma_meta[g] ), // AXI interface @@ -404,16 +409,18 @@ module mempool_cluster .tcdm_slave_resp_o (tcdm_slave_resp[g] ), .tcdm_slave_resp_valid_o (tcdm_slave_resp_valid[g] ), .tcdm_slave_resp_ready_i (tcdm_slave_resp_ready[g] ), - .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), +`ifdef DAS .partition_sel_i (partition_sel_i ), .start_addr_scheme_i (start_addr_scheme_i ), .allocated_size_i (allocated_size_i ), + .dma_allocated_size_sel_i(allocated_size_sel ), +`endif + .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), .ro_cache_ctrl_i (ro_cache_ctrl_q[g] ), // DMA request .dma_req_i (dma_req_group_q[g] ), .dma_req_valid_i (dma_req_group_q_valid[g] ), .dma_req_ready_o (dma_req_group_q_ready[g] ), - .dma_allocated_size_sel_i(allocated_size_sel ), // DMA status .dma_meta_o (dma_meta[g] ), // AXI interface @@ -470,42 +477,44 @@ module mempool_cluster .TCDMBaseAddr (TCDMBaseAddr ), .BootAddr (BootAddr ) ) i_group ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .testmode_i (testmode_i ), - .scan_enable_i (scan_enable_i ), - .scan_data_i (/* Unconnected */ ), - .scan_data_o (/* Unconnected */ ), - .group_id_i (g[idx_width(NumGroups)-1:0] ), + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .testmode_i (testmode_i ), + .scan_enable_i (scan_enable_i ), + .scan_data_i (/* Unconnected */ ), + .scan_data_o (/* Unconnected */ ), + 
.group_id_i (g[idx_width(NumGroups)-1:0] ), // TCDM Master interfaces - .tcdm_master_req_o (tcdm_master_req[g] ), - .tcdm_master_req_valid_o (tcdm_master_req_valid[g] ), - .tcdm_master_req_ready_i (tcdm_master_req_ready[g] ), - .tcdm_master_resp_i (tcdm_master_resp[g] ), - .tcdm_master_resp_valid_i(tcdm_master_resp_valid[g] ), - .tcdm_master_resp_ready_o(tcdm_master_resp_ready[g] ), + .tcdm_master_req_o (tcdm_master_req[g] ), + .tcdm_master_req_valid_o (tcdm_master_req_valid[g] ), + .tcdm_master_req_ready_i (tcdm_master_req_ready[g] ), + .tcdm_master_resp_i (tcdm_master_resp[g] ), + .tcdm_master_resp_valid_i (tcdm_master_resp_valid[g] ), + .tcdm_master_resp_ready_o (tcdm_master_resp_ready[g] ), // TCDM banks interface - .tcdm_slave_req_i (tcdm_slave_req[g] ), - .tcdm_slave_req_valid_i (tcdm_slave_req_valid[g] ), - .tcdm_slave_req_ready_o (tcdm_slave_req_ready[g] ), - .tcdm_slave_resp_o (tcdm_slave_resp[g] ), - .tcdm_slave_resp_valid_o (tcdm_slave_resp_valid[g] ), - .tcdm_slave_resp_ready_i (tcdm_slave_resp_ready[g] ), - .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), - .partition_sel_i (partition_sel_i ), - .allocated_size_i (allocated_size_i ), - .start_addr_scheme_i (start_addr_scheme_i ), - .ro_cache_ctrl_i (ro_cache_ctrl_q[g] ), + .tcdm_slave_req_i (tcdm_slave_req[g] ), + .tcdm_slave_req_valid_i (tcdm_slave_req_valid[g] ), + .tcdm_slave_req_ready_o (tcdm_slave_req_ready[g] ), + .tcdm_slave_resp_o (tcdm_slave_resp[g] ), + .tcdm_slave_resp_valid_o (tcdm_slave_resp_valid[g] ), + .tcdm_slave_resp_ready_i (tcdm_slave_resp_ready[g] ), +`ifdef DAS + .partition_sel_i (partition_sel_i ), + .start_addr_scheme_i (start_addr_scheme_i ), + .allocated_size_i (allocated_size_i ), + .dma_allocated_size_sel_i(allocated_size_sel ), +`endif + .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), + .ro_cache_ctrl_i (ro_cache_ctrl_q[g] ), // DMA request - .dma_req_i (dma_req_group_q[g] ), - .dma_req_valid_i (dma_req_group_q_valid[g] ), - .dma_req_ready_o 
(dma_req_group_q_ready[g] ), - .dma_allocated_size_sel_i(allocated_size_sel ), + .dma_req_i (dma_req_group_q[g] ), + .dma_req_valid_i (dma_req_group_q_valid[g] ), + .dma_req_ready_o (dma_req_group_q_ready[g] ), // DMA status - .dma_meta_o (dma_meta[g] ), + .dma_meta_o (dma_meta[g] ), // AXI interface - .axi_mst_req_o (axi_mst_req_o[g*NumAXIMastersPerGroup +: NumAXIMastersPerGroup] ), - .axi_mst_resp_i (axi_mst_resp_i[g*NumAXIMastersPerGroup +: NumAXIMastersPerGroup]) + .axi_mst_req_o (axi_mst_req_o[g*NumAXIMastersPerGroup +: NumAXIMastersPerGroup] ), + .axi_mst_resp_i (axi_mst_resp_i[g*NumAXIMastersPerGroup +: NumAXIMastersPerGroup]) ); end : gen_groups diff --git a/hardware/src/mempool_group.sv b/hardware/src/mempool_group.sv index 7e87a5f7c..7b9027d2d 100644 --- a/hardware/src/mempool_group.sv +++ b/hardware/src/mempool_group.sv @@ -58,24 +58,26 @@ module mempool_group output logic [NumGroups-1:1][NumTilesPerGroup-1:0] tcdm_slave_resp_valid_o, input logic [NumGroups-1:1][NumTilesPerGroup-1:0] tcdm_slave_resp_ready_i, `endif - // Wake up interface - input logic [NumCoresPerGroup-1:0] wake_up_i, +`ifdef DAS // Partition selection - input logic [3:0][PartitionDataWidth-1:0] partition_sel_i, - input logic [3:0][PartitionDataWidth-1:0] allocated_size_i, - input logic [3:0][DataWidth-1:0] start_addr_scheme_i, + input logic [NumDASPartitions-1:0][DataWidth-1:0] start_addr_scheme_i, + input logic [NumDASPartitions-1:0][PartitionDataWidth-1:0] allocated_size_i, + input logic [NumDASPartitions-1:0][PartitionDataWidth-1:0] partition_sel_i, + input logic [PartitionDataWidth-1:0] dma_allocated_size_sel_i, +`endif + // Wake up interface + input logic [NumCoresPerGroup-1:0] wake_up_i, // RO-Cache configuration - input `STRUCT_PORT(ro_cache_ctrl_t) ro_cache_ctrl_i, + input `STRUCT_PORT(ro_cache_ctrl_t) ro_cache_ctrl_i, // DMA request - input `STRUCT_PORT(dma_req_t) dma_req_i, - input logic dma_req_valid_i, - output logic dma_req_ready_o, - input logic [7:0] 
dma_allocated_size_sel_i, + input `STRUCT_PORT(dma_req_t) dma_req_i, + input logic dma_req_valid_i, + output logic dma_req_ready_o, // DMA status - output `STRUCT_PORT(dma_meta_t) dma_meta_o, + output `STRUCT_PORT(dma_meta_t) dma_meta_o, // AXI Interface - output `STRUCT_VECT(axi_tile_req_t, [NumAXIMastersPerGroup-1:0]) axi_mst_req_o, - input `STRUCT_VECT(axi_tile_resp_t, [NumAXIMastersPerGroup-1:0]) axi_mst_resp_i + output `STRUCT_VECT(axi_tile_req_t, [NumAXIMastersPerGroup-1:0]) axi_mst_req_o, + input `STRUCT_VECT(axi_tile_resp_t, [NumAXIMastersPerGroup-1:0]) axi_mst_resp_i ); /***************** @@ -337,12 +339,14 @@ module mempool_group .axi_mst_resp_i (axi_mst_resp[sg*NumAXIMastersPerSubGroup +: NumAXIMastersPerSubGroup] ), // RO-Cache configuration .ro_cache_ctrl_i (ro_cache_ctrl_q ), - // Wake up interface - .wake_up_i (wake_up_q[sg*NumCoresPerSubGroup +: NumCoresPerSubGroup] ), +`ifdef DAS // Partition selection .start_addr_scheme_i (start_addr_scheme_i ), .allocated_size_i (allocated_size_i ), - .partition_sel_i (partition_sel_i) + .partition_sel_i (partition_sel_i ), +`endif + // Wake up interface + .wake_up_i (wake_up_q[sg*NumCoresPerSubGroup +: NumCoresPerSubGroup] ) ); end else begin: gen_rtl_sg mempool_sub_group #( @@ -393,11 +397,14 @@ module mempool_group .axi_mst_resp_i (axi_mst_resp[sg*NumAXIMastersPerSubGroup +: NumAXIMastersPerSubGroup] ), // RO-Cache configuration .ro_cache_ctrl_i (ro_cache_ctrl_q ), +`ifdef DAS + // Partition selection + .start_addr_scheme_i (start_addr_scheme_i ), + .allocated_size_i (allocated_size_i ), + .partition_sel_i (partition_sel_i ), +`endif // Wake up interface - .wake_up_i (wake_up_q[sg*NumCoresPerSubGroup +: NumCoresPerSubGroup] ), - .start_addr_scheme_i (start_addr_scheme_i ), - .allocated_size_i (allocated_size_i ), - .partition_sel_i (partition_sel_i) + .wake_up_i (wake_up_q[sg*NumCoresPerSubGroup +: NumCoresPerSubGroup] ) ); end // Transpose the group requests @@ -572,7 +579,7 @@ module mempool_group 
`FF(dma_meta_o, dma_meta_cut, '0, clk_i, rst_ni); - idma_distributed_midend_v2 #( + idma_distributed_midend #( .NoMstPorts (NumDmasPerGroup ), .DmaRegionWidth (NumBanksPerGroup*4/NumDmasPerGroup ), .DmaRegionStart (TCDMBaseAddr ), @@ -580,17 +587,17 @@ module mempool_group .TransFifoDepth (8 ), .burst_req_t (dma_req_t ), .meta_t (dma_meta_t ) - ) i_idma_distributed_midend_v2 ( + ) i_idma_distributed_midend ( .clk_i (clk_i ), .rst_ni (rst_ni ), - // slave +`ifdef DAS + // partition + .allocated_size_i(dma_allocated_size_sel_i), +`endif .burst_req_i (dma_req_cut ), .valid_i (dma_req_cut_valid ), .ready_o (dma_req_cut_ready ), .meta_o (dma_meta_cut ), - // partition - .allocated_size_i(dma_allocated_size_sel_i), - // master .burst_req_o (dma_req ), .valid_o (dma_req_valid ), .ready_i (dma_req_ready ), @@ -699,11 +706,13 @@ module mempool_group // AXI interface .axi_mst_req_o (axi_tile_req[t] ), .axi_mst_resp_i (axi_tile_resp[t] ), - // Wake up interface - .wake_up_i (wake_up_q[t*NumCoresPerTile +: NumCoresPerTile]), +`ifdef DAS .start_addr_scheme_i (start_addr_scheme_i ), .allocated_size_i (allocated_size_i ), - .partition_sel_i (partition_sel_i) + .partition_sel_i (partition_sel_i ), +`endif + // Wake up interface + .wake_up_i (wake_up_q[t*NumCoresPerTile +: NumCoresPerTile]) ); // Transpose the group requests @@ -989,7 +998,7 @@ module mempool_group logic [NumDmasPerGroup-1:0] dma_req_ready; dma_meta_t [NumDmasPerGroup-1:0] dma_meta; - idma_distributed_midend_v2 #( + idma_distributed_midend #( .NoMstPorts (NumDmasPerGroup ), .DmaRegionWidth (NumBanksPerGroup*4/NumDmasPerGroup ), .DmaRegionStart (TCDMBaseAddr ), @@ -997,17 +1006,17 @@ module mempool_group .TransFifoDepth (8 ), .burst_req_t (dma_req_t ), .meta_t (dma_meta_t ) - ) i_idma_distributed_midend_v2 ( + ) i_idma_distributed_midend ( .clk_i (clk_i ), .rst_ni (rst_ni ), - // slave +`ifdef DAS + // partition + .allocated_size_i(dma_allocated_size_sel_i), +`endif .burst_req_i (dma_req_cut ), .valid_i 
(dma_req_cut_valid ), .ready_o (dma_req_cut_ready ), .meta_o (dma_meta_cut ), - // partition - .allocated_size_i(dma_allocated_size_sel_i), - // master .burst_req_o (dma_req ), .valid_o (dma_req_valid ), .ready_i (dma_req_ready ), diff --git a/hardware/src/mempool_pkg.sv b/hardware/src/mempool_pkg.sv index 7c2e8efb1..f320b9e9a 100644 --- a/hardware/src/mempool_pkg.sv +++ b/hardware/src/mempool_pkg.sv @@ -28,8 +28,6 @@ package mempool_pkg; localparam integer unsigned AxiDataWidth = `ifdef AXI_DATA_WIDTH `AXI_DATA_WIDTH `else 0 `endif; localparam integer unsigned AxiLiteDataWidth = 32; - localparam integer unsigned PartitionDataWidth = 8; // only support group_factor={128, 64, 32, 16, 8, 4, 2, 1} - /*********************** * MEMORY PARAMETERS * ***********************/ @@ -50,6 +48,7 @@ package mempool_pkg; localparam integer unsigned NumDASPartitions = `ifdef NUM_DAS_PARTITIONS `NUM_DAS_PARTITIONS `else 0 `endif; localparam integer unsigned DASMemSize = `ifdef DAS_MEM_SIZE `DAS_MEM_SIZE `else 0 `endif; localparam integer unsigned DASStartAddr = (NumBanks * TCDMSizePerBank) - NumCores * DASMemSize; + localparam integer unsigned PartitionDataWidth = $clog2(NumTiles)+1; // only support group_factor={128, 64, 32, 16, 8, 4, 2, 1} // L2 localparam integer unsigned L2Size = `ifdef L2_SIZE `L2_SIZE `else 0 `endif; // [B] @@ -316,9 +315,6 @@ package mempool_pkg; localparam int unsigned SeqMemSizePerCore = `ifdef SEQ_MEM_SIZE `SEQ_MEM_SIZE `else 0 `endif; localparam int unsigned SeqMemSizePerTile = NumCoresPerTile*SeqMemSizePerCore; - localparam int unsigned HeapSeqMemSizePerCore = `ifdef HEAP_SEQ_MEM_SIZE `SEQ_MEM_SIZE `else 2048 `endif; - localparam int unsigned HeapSeqMemSizePerTile = NumCoresPerTile*HeapSeqMemSizePerCore; - typedef struct packed { int unsigned slave_idx; addr_t mask; diff --git a/hardware/src/mempool_sub_group.sv b/hardware/src/mempool_sub_group.sv index 012cd77b8..e53f2f60c 100644 --- a/hardware/src/mempool_sub_group.sv +++ 
b/hardware/src/mempool_sub_group.sv @@ -62,12 +62,14 @@ module mempool_sub_group input `STRUCT_VECT(axi_tile_resp_t, [NumAXIMastersPerSubGroup-1:0]) axi_mst_resp_i, // RO-Cache configuration input `STRUCT_PORT(ro_cache_ctrl_t) ro_cache_ctrl_i, - // Wake up interface - input logic [NumCoresPerSubGroup-1:0] wake_up_i, +`ifdef DAS // Partition Selection - input logic [3:0][PartitionDataWidth-1:0] partition_sel_i, - input logic [3:0][PartitionDataWidth-1:0] allocated_size_i, - input logic [3:0][DataWidth-1:0] start_addr_scheme_i + input logic [NumDASPartitions-1:0][DataWidth-1:0] start_addr_scheme_i, + input logic [NumDASPartitions-1:0][PartitionDataWidth-1:0] allocated_size_i, + input logic [NumDASPartitions-1:0][PartitionDataWidth-1:0] partition_sel_i, +`endif + // Wake up interface + input logic [NumCoresPerSubGroup-1:0] wake_up_i ); /***************** @@ -202,12 +204,13 @@ module mempool_sub_group // AXI interface .axi_mst_req_o (axi_tile_req[t] ), .axi_mst_resp_i (axi_tile_resp[t] ), +`ifdef DAS + .start_addr_scheme_i (start_addr_scheme_i ), + .allocated_size_i (allocated_size_i ), + .partition_sel_i (partition_sel_i ), +`endif // Wake up interface - .wake_up_i (wake_up_q[t*NumCoresPerTile +: NumCoresPerTile]), - // Partition selection - .start_addr_scheme_i (start_addr_scheme_i ), - .allocated_size_i (allocated_size_i ), - .partition_sel_i (partition_sel_i) + .wake_up_i (wake_up_q[t*NumCoresPerTile +: NumCoresPerTile]) ); // Transpose the sub_group requests diff --git a/hardware/src/mempool_system.sv b/hardware/src/mempool_system.sv index 9d11d4484..3118c18a5 100644 --- a/hardware/src/mempool_system.sv +++ b/hardware/src/mempool_system.sv @@ -90,13 +90,12 @@ module mempool_system logic [NumCores-1:0] wake_up; logic [DataWidth-1:0] eoc; ro_cache_ctrl_t ro_cache_ctrl; - +`ifdef DAS // For dynamic partitioning logic [3:0][PartitionDataWidth-1:0] partition_sel; logic [3:0][PartitionDataWidth-1:0] allocated_size; logic [3:0][DataWidth-1:0] start_addr_scheme; - // For 
DMA Mode Selection - logic [DataWidth-1:0] dma_mode; +`endif dma_req_t dma_req; logic dma_req_valid; @@ -147,9 +146,11 @@ module mempool_system .clk_i (clk_i ), .rst_ni (rst_ni ), .wake_up_i (wake_up ), +`ifdef DAS .partition_sel_i (partition_sel ), .allocated_size_i (allocated_size ), .start_addr_scheme_i(start_addr_scheme ), +`endif .testmode_i (1'b0 ), .scan_enable_i (1'b0 ), .scan_data_i (1'b0 ), @@ -811,11 +812,13 @@ module mempool_system .axi_lite_slave_resp_o(axi_lite_slv_resp[CtrlRegisters]), .eoc_o (/* Unused */ ), .eoc_valid_o (eoc_valid_o ), - .wake_up_o (wake_up ), - .ro_cache_ctrl_o (ro_cache_ctrl ), +`ifdef DAS .partition_sel_o (partition_sel ), .start_addr_scheme_o (start_addr_scheme ), - .allocated_size_o (allocated_size ) + .allocated_size_o (allocated_size ), +`endif + .wake_up_o (wake_up ), + .ro_cache_ctrl_o (ro_cache_ctrl ) ); mempool_dma #( diff --git a/hardware/src/mempool_tile.sv b/hardware/src/mempool_tile.sv index 8b18b784b..dd5b41aa4 100644 --- a/hardware/src/mempool_tile.sv +++ b/hardware/src/mempool_tile.sv @@ -54,9 +54,9 @@ module mempool_tile // Wake up interface input logic [NumCoresPerTile-1:0] wake_up_i, // Partition selection - input logic [3:0][DataWidth-1:0] start_addr_scheme_i, - input logic [3:0][PartitionDataWidth-1:0] allocated_size_i, - input logic [3:0][PartitionDataWidth-1:0] partition_sel_i + input logic [NumDASPartitions-1:0][DataWidth-1:0] start_addr_scheme_i, + input logic [NumDASPartitions-1:0][PartitionDataWidth-1:0] allocated_size_i, + input logic [NumDASPartitions-1:0][PartitionDataWidth-1:0] partition_sel_i ); /**************** @@ -904,10 +904,16 @@ module mempool_tile .SeqMemSizePerTile (SeqMemSizePerTile), .NumDASPartitions (NumDASPartitions ) ) i_address_scrambler ( - .address_i (snitch_data_qaddr[c]), +`ifdef DAS .group_factor_i (partition_sel_i ), .allocated_size_i (allocated_size_i ), .start_addr_scheme_i(start_addr_scheme_i ), +`else + .group_factor_i (NumTiles ), + .allocated_size_i ('0 ), + 
.start_addr_scheme_i('0 ), +`endif + .address_i (snitch_data_qaddr[c]), .address_o (snitch_data_qaddr_scrambled) ); diff --git a/software/tests/baremetal/das_dma/main.c b/software/tests/baremetal/das_dma/main.c new file mode 100644 index 000000000..444aa821c --- /dev/null +++ b/software/tests/baremetal/das_dma/main.c @@ -0,0 +1,92 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Note: This test is only for Terapool dynamic heap allocation +// Author: Bowen Wang + +#include <stdint.h> +#include <string.h> + +#include "alloc.h" +#include "dma.h" +#include "encoding.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" + +#define NUM_TILES (NUM_CORES / NUM_CORES_PER_TILE) +uint32_t l2_array[2 * NUM_BANKS] __attribute__((section(".l2"))); + +int main() { + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + + // Initialization + mempool_init(core_id); + mempool_barrier_init(core_id); + + if (core_id == 0) { + + // -------------------------------------------- + // Initialize + // -------------------------------------------- + uint32_t num_tiles_per_partition = 4; + uint32_t array_size = + 2 * num_tiles_per_partition * BANKING_FACTOR * NUM_CORES_PER_TILE; + // Initialize L2 array + for (uint32_t i = 0; i < array_size; i++) { + l2_array[i] = i; + } + + // -------------------------------------------- + // Verify DMA transfers in DAS region + // -------------------------------------------- + printf("Verify DMA transfers in DAS region\n\n"); + + // 1. Init dynamic heap allocator + mempool_dynamic_heap_alloc_init(core_id); + + // 2. Select the partition to write to. + uint32_t part_id = 0; + + // 3. Get the allocator + alloc_t *dynamic_heap_alloc = get_dynamic_heap_alloc(); + alloc_dump(dynamic_heap_alloc); + + // 4. 
Allocate memory + uint32_t *array = (uint32_t *)partition_malloc( + dynamic_heap_alloc, array_size * sizeof(uint32_t)); + + // 5. Config the hardware registers (partition size, base address and byte size of the array) + partition_config(part_id, num_tiles_per_partition); + start_addr_scheme_config(part_id, (uint32_t)array, + array_size * sizeof(uint32_t)); + + // 6. Move data + dma_memcpy_blocking(array, l2_array, array_size * sizeof(uint32_t)); + + // 7. Change addressing scheme (to fully interleaved) + partition_config(part_id, NUM_TILES); + + // 8. check + for (uint32_t i = 0; i < array_size; i++) { + uint32_t *fetch_address = &array[0] + + (i % (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) + + (i / (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) * NUM_BANKS; +// if (l2_array[i] != *fetch_address) { +// printf("%4d != %4d at address %8X.\n", i, *fetch_address, +// fetch_address); +// } + } + + // 9. Free array + partition_free(dynamic_heap_alloc, array); + printf("SUCCESS on partition %d \n\n", part_id); + + printf("All correct!\n"); + } + + mempool_barrier(num_cores); + return 0; +} From 58fdcd4f1744814dc919a8dd633aa8da154a39eb Mon Sep 17 00:00:00 2001 From: bowwang Date: Fri, 24 Oct 2025 18:27:47 +0200 Subject: [PATCH 24/34] [hardware] fixed the DMA Midend bug, roll back to standard mode --- hardware/deps/idma/src/midends/idma_split_midend.sv | 2 +- software/tests/baremetal/das_dma/main.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/hardware/deps/idma/src/midends/idma_split_midend.sv b/hardware/deps/idma/src/midends/idma_split_midend.sv index 05b83a80a..f5c17b938 100644 --- a/hardware/deps/idma/src/midends/idma_split_midend.sv +++ b/hardware/deps/idma/src/midends/idma_split_midend.sv @@ -238,7 +238,7 @@ module idma_split_midend #( // Calculate the size for the 1st burst burst_req_o.num_bytes = PartitionDmaRegionWidth-masked_start_addr; // TODO (bowwang): parameterize - req_d.num_bytes = (group_factor_sel <= 
$clog2(NumTiles) + 1) ?
(allocated_size_sel*DmaBackendWidth) : (allocated_size_sel*PartitionDmaRegionWidth); + // req_d.num_bytes = (group_factor_sel <= $clog2(NumTiles) + 1) ? (allocated_size_sel*DmaBackendWidth) : (allocated_size_sel*PartitionDmaRegionWidth); if (spm2dram) begin burst_req_o.src = post_scramble_src; req_d.src = post_scramble_src; diff --git a/software/tests/baremetal/das_dma/main.c b/software/tests/baremetal/das_dma/main.c index 444aa821c..2b0d70689 100644 --- a/software/tests/baremetal/das_dma/main.c +++ b/software/tests/baremetal/das_dma/main.c @@ -74,10 +74,10 @@ int main() { uint32_t *fetch_address = &array[0] + (i % (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) + (i / (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) * NUM_BANKS; -// if (l2_array[i] != *fetch_address) { -// printf("%4d != %4d at address %8X.\n", i, *fetch_address, -// fetch_address); -// } + if (l2_array[i] != *fetch_address) { + printf("%4d != %4d at address %8X.\n", i, *fetch_address, + fetch_address); + } } // 9. 
Free array From cb090bfa6e729b00dde579ef659568f3dcce3c1e Mon Sep 17 00:00:00 2001 From: mbertuletti Date: Thu, 30 Oct 2025 12:24:10 +0100 Subject: [PATCH 25/34] [hardware] Change names of DAS signals --- .../src/midends/idma_distributed_midend.sv | 10 +- .../idma/src/midends/idma_split_midend.sv | 22 +- .../control_registers/control_registers.hjson | 22 +- .../control_registers_reg_pkg.sv | 83 +++--- .../control_registers_reg_top.sv | 268 +++++++++--------- hardware/src/ctrl_registers.sv | 27 +- hardware/src/mempool_cluster.sv | 96 +++---- hardware/src/mempool_group.sv | 33 ++- hardware/src/mempool_sub_group.sv | 13 +- hardware/src/mempool_system.sv | 44 +-- hardware/src/mempool_tile.sv | 23 +- software/runtime/control_registers.h | 48 ++-- 12 files changed, 345 insertions(+), 344 deletions(-) diff --git a/hardware/deps/idma/src/midends/idma_distributed_midend.sv b/hardware/deps/idma/src/midends/idma_distributed_midend.sv index 1744f60b5..3d53e2e15 100644 --- a/hardware/deps/idma/src/midends/idma_distributed_midend.sv +++ b/hardware/deps/idma/src/midends/idma_distributed_midend.sv @@ -32,7 +32,7 @@ module idma_distributed_midend #( input logic rst_ni, `ifdef DAS // DAS signals - input logic [$clog2(NumTiles):0] allocated_size_i, + input logic [$clog2(NumTiles):0] rows_das_i, `endif // Slave input burst_req_t burst_req_i, @@ -174,15 +174,15 @@ module idma_distributed_midend #( burst_req_o[i].num_bytes = (burst_req_i.num_bytes= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin burst_req_o[i].src = burst_req_i.src+i*DmaRegionWidth; - burst_req_o[i].dst = burst_req_i.dst+i*allocated_size_i*DmaRegionWidth; + burst_req_o[i].dst = burst_req_i.dst+i*rows_das_i*DmaRegionWidth; end else begin // L2 --> L1 if (burst_req_i.num_bytes<=DmaRegionWidth )begin - burst_req_o[i].src = burst_req_i.src+i*allocated_size_i*DmaRegionWidth; + burst_req_o[i].src = burst_req_i.src+i*rows_das_i*DmaRegionWidth; end else if (i==2) begin - burst_req_o[i].src = 
burst_req_i.src+i*allocated_size_i*DmaRegionWidth; + burst_req_o[i].src = burst_req_i.src+i*rows_das_i*DmaRegionWidth; end else if (i==3) begin - burst_req_o[i].src = burst_req_i.src+(i-1)*allocated_size_i*DmaRegionWidth + DmaRegionWidth; + burst_req_o[i].src = burst_req_i.src+(i-1)*rows_das_i*DmaRegionWidth + DmaRegionWidth; end burst_req_o[i].dst = burst_req_i.dst+i*DmaRegionWidth; end diff --git a/hardware/deps/idma/src/midends/idma_split_midend.sv b/hardware/deps/idma/src/midends/idma_split_midend.sv index f5c17b938..e3ba092e8 100644 --- a/hardware/deps/idma/src/midends/idma_split_midend.sv +++ b/hardware/deps/idma/src/midends/idma_split_midend.sv @@ -27,10 +27,10 @@ module idma_split_midend #( input logic rst_ni, `ifdef DAS // DAS signals - input logic [NumDASPartitions-1:0][$clog2(NumTiles):0] group_factor_i, - input logic [NumDASPartitions-1:0][$clog2(NumTiles):0] allocated_size_i, - input logic [NumDASPartitions-1:0][AddrWidth-1:0] start_addr_scheme_i, - output logic [$clog2(NumTiles):0] allocated_size_o, + input logic [NumDASPartitions-1:0][$clog2(NumTiles):0] partition_sel_i, + input logic [NumDASPartitions-1:0][AddrWidth-1:0] start_das_i, + input logic [NumDASPartitions-1:0][$clog2(NumTiles):0] rows_das_i, + output logic [$clog2(NumTiles):0] rows_das_o, `endif // Slave input burst_req_t burst_req_i, @@ -95,9 +95,9 @@ module idma_split_midend #( ) i_idma_address_scrambler_src ( .address_i (burst_req_i.src), .num_bytes_i (burst_req_i.num_bytes), - .group_factor_i (group_factor_i), - .allocated_size_i (allocated_size_i), - .start_addr_scheme_i(start_addr_scheme_i), + .group_factor_i (partition_sel_i), + .allocated_size_i (rows_das_i), + .start_addr_scheme_i(start_das_i), .group_factor_o (group_factor_src), .allocated_size_o (allocated_size_src), .address_o (post_scramble_src) @@ -113,9 +113,9 @@ module idma_split_midend #( ) i_idma_address_scrambler_dst ( .address_i (burst_req_i.dst), .num_bytes_i (burst_req_i.num_bytes), - .group_factor_i (group_factor_i), 
- .allocated_size_i (allocated_size_i), - .start_addr_scheme_i(start_addr_scheme_i), + .group_factor_i (partition_sel_i), + .allocated_size_i (rows_das_i), + .start_addr_scheme_i(start_das_i), .group_factor_o (group_factor_dst), .allocated_size_o (allocated_size_dst), .address_o (post_scramble_dst) @@ -208,7 +208,7 @@ module idma_split_midend #( req_valid = 1'b0; `ifdef DAS - allocated_size_o = allocated_size_sel; + rows_das_o = allocated_size_sel; beat_cnt_d = beat_cnt_q; if (num_trans_q == 1 && num_trans_d == 0) begin beat_cnt_d = 0; diff --git a/hardware/src/control_registers/control_registers.hjson b/hardware/src/control_registers/control_registers.hjson index 414682f52..69809f6f1 100644 --- a/hardware/src/control_registers/control_registers.hjson +++ b/hardware/src/control_registers/control_registers.hjson @@ -93,27 +93,27 @@ }, { multireg: { - name: "allocated_size" - desc: "Allocated size on DAS partition" + name: "start_das" + desc: "Start address of DAS partition" swaccess: "wo" - hwaccess: "hro" + hwaccess: "hrw" hwqe: "true" + // External because we want to define the reset from a parameter + hwext: "true" count: "NumDASPartitions" - cname: "allocated_size" + cname: "start_das" fields: [{ bits: "31:0" }] }, }, { multireg: { - name: "start_addr_scheme" - desc: "Start address of DAS partition" + name: "rows_das" + desc: "End address of DAS partition" swaccess: "wo" - hwaccess: "hrw" - hwqe: "true" - // External because we want to define the reset from a parameter - hwext: "true" + hwaccess: "hro" + hwqe: "false" count: "NumDASPartitions" - cname: "start_addr_scheme" + cname: "rows_das" fields: [{ bits: "31:0" }] }, }, diff --git a/hardware/src/control_registers/control_registers_reg_pkg.sv b/hardware/src/control_registers/control_registers_reg_pkg.sv index 95a5d8541..c380805b3 100644 --- a/hardware/src/control_registers/control_registers_reg_pkg.sv +++ b/hardware/src/control_registers/control_registers_reg_pkg.sv @@ -55,12 +55,11 @@ package 
control_registers_reg_pkg; typedef struct packed { logic [31:0] q; logic qe; - } control_registers_reg2hw_allocated_size_mreg_t; + } control_registers_reg2hw_start_das_mreg_t; typedef struct packed { logic [31:0] q; - logic qe; - } control_registers_reg2hw_start_addr_scheme_mreg_t; + } control_registers_reg2hw_rows_das_mreg_t; typedef struct packed { logic [31:0] q; @@ -86,7 +85,7 @@ package control_registers_reg_pkg; typedef struct packed { logic [31:0] d; - } control_registers_hw2reg_start_addr_scheme_mreg_t; + } control_registers_hw2reg_start_das_mreg_t; typedef struct packed { logic [31:0] d; @@ -110,15 +109,15 @@ package control_registers_reg_pkg; // Register -> HW type typedef struct packed { - control_registers_reg2hw_eoc_reg_t eoc; // [1151:1120] - control_registers_reg2hw_wake_up_reg_t wake_up; // [1119:1087] - control_registers_reg2hw_wake_up_tile_mreg_t [7:0] wake_up_tile; // [1086:823] - control_registers_reg2hw_wake_up_group_reg_t wake_up_group; // [822:790] - control_registers_reg2hw_wake_up_strd_reg_t wake_up_strd; // [789:757] - control_registers_reg2hw_wake_up_offst_reg_t wake_up_offst; // [756:724] - control_registers_reg2hw_partition_sel_mreg_t [3:0] partition_sel; // [723:592] - control_registers_reg2hw_allocated_size_mreg_t [3:0] allocated_size; // [591:460] - control_registers_reg2hw_start_addr_scheme_mreg_t [3:0] start_addr_scheme; // [459:328] + control_registers_reg2hw_eoc_reg_t eoc; // [1147:1116] + control_registers_reg2hw_wake_up_reg_t wake_up; // [1115:1083] + control_registers_reg2hw_wake_up_tile_mreg_t [7:0] wake_up_tile; // [1082:819] + control_registers_reg2hw_wake_up_group_reg_t wake_up_group; // [818:786] + control_registers_reg2hw_wake_up_strd_reg_t wake_up_strd; // [785:753] + control_registers_reg2hw_wake_up_offst_reg_t wake_up_offst; // [752:720] + control_registers_reg2hw_partition_sel_mreg_t [3:0] partition_sel; // [719:588] + control_registers_reg2hw_start_das_mreg_t [3:0] start_das; // [587:456] + 
control_registers_reg2hw_rows_das_mreg_t [3:0] rows_das; // [455:328] control_registers_reg2hw_ro_cache_enable_reg_t ro_cache_enable; // [327:296] control_registers_reg2hw_ro_cache_flush_reg_t ro_cache_flush; // [295:264] control_registers_reg2hw_ro_cache_start_mreg_t [3:0] ro_cache_start; // [263:132] @@ -128,7 +127,7 @@ package control_registers_reg_pkg; // HW -> register type typedef struct packed { control_registers_hw2reg_partition_sel_mreg_t [3:0] partition_sel; // [607:480] - control_registers_hw2reg_start_addr_scheme_mreg_t [3:0] start_addr_scheme; // [479:352] + control_registers_hw2reg_start_das_mreg_t [3:0] start_das; // [479:352] control_registers_hw2reg_tcdm_start_address_reg_t tcdm_start_address; // [351:320] control_registers_hw2reg_tcdm_end_address_reg_t tcdm_end_address; // [319:288] control_registers_hw2reg_nr_cores_reg_reg_t nr_cores_reg; // [287:256] @@ -154,14 +153,14 @@ package control_registers_reg_pkg; parameter logic [BlockAw-1:0] CONTROL_REGISTERS_PARTITION_SEL_1_OFFSET = 8'h 38; parameter logic [BlockAw-1:0] CONTROL_REGISTERS_PARTITION_SEL_2_OFFSET = 8'h 3c; parameter logic [BlockAw-1:0] CONTROL_REGISTERS_PARTITION_SEL_3_OFFSET = 8'h 40; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_ALLOCATED_SIZE_0_OFFSET = 8'h 44; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_ALLOCATED_SIZE_1_OFFSET = 8'h 48; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_ALLOCATED_SIZE_2_OFFSET = 8'h 4c; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_ALLOCATED_SIZE_3_OFFSET = 8'h 50; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_START_ADDR_SCHEME_0_OFFSET = 8'h 54; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_START_ADDR_SCHEME_1_OFFSET = 8'h 58; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_START_ADDR_SCHEME_2_OFFSET = 8'h 5c; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_START_ADDR_SCHEME_3_OFFSET = 8'h 60; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_START_DAS_0_OFFSET = 8'h 44; + parameter logic [BlockAw-1:0] 
CONTROL_REGISTERS_START_DAS_1_OFFSET = 8'h 48; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_START_DAS_2_OFFSET = 8'h 4c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_START_DAS_3_OFFSET = 8'h 50; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_ROWS_DAS_0_OFFSET = 8'h 54; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_ROWS_DAS_1_OFFSET = 8'h 58; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_ROWS_DAS_2_OFFSET = 8'h 5c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_ROWS_DAS_3_OFFSET = 8'h 60; parameter logic [BlockAw-1:0] CONTROL_REGISTERS_TCDM_START_ADDRESS_OFFSET = 8'h 64; parameter logic [BlockAw-1:0] CONTROL_REGISTERS_TCDM_END_ADDRESS_OFFSET = 8'h 68; parameter logic [BlockAw-1:0] CONTROL_REGISTERS_NR_CORES_REG_OFFSET = 8'h 6c; @@ -181,10 +180,10 @@ package control_registers_reg_pkg; parameter logic [31:0] CONTROL_REGISTERS_PARTITION_SEL_1_RESVAL = 32'h 0; parameter logic [31:0] CONTROL_REGISTERS_PARTITION_SEL_2_RESVAL = 32'h 0; parameter logic [31:0] CONTROL_REGISTERS_PARTITION_SEL_3_RESVAL = 32'h 0; - parameter logic [31:0] CONTROL_REGISTERS_START_ADDR_SCHEME_0_RESVAL = 32'h 0; - parameter logic [31:0] CONTROL_REGISTERS_START_ADDR_SCHEME_1_RESVAL = 32'h 0; - parameter logic [31:0] CONTROL_REGISTERS_START_ADDR_SCHEME_2_RESVAL = 32'h 0; - parameter logic [31:0] CONTROL_REGISTERS_START_ADDR_SCHEME_3_RESVAL = 32'h 0; + parameter logic [31:0] CONTROL_REGISTERS_START_DAS_0_RESVAL = 32'h 0; + parameter logic [31:0] CONTROL_REGISTERS_START_DAS_1_RESVAL = 32'h 0; + parameter logic [31:0] CONTROL_REGISTERS_START_DAS_2_RESVAL = 32'h 0; + parameter logic [31:0] CONTROL_REGISTERS_START_DAS_3_RESVAL = 32'h 0; parameter logic [31:0] CONTROL_REGISTERS_TCDM_START_ADDRESS_RESVAL = 32'h 0; parameter logic [31:0] CONTROL_REGISTERS_TCDM_END_ADDRESS_RESVAL = 32'h 0; parameter logic [31:0] CONTROL_REGISTERS_NR_CORES_REG_RESVAL = 32'h 0; @@ -216,14 +215,14 @@ package control_registers_reg_pkg; CONTROL_REGISTERS_PARTITION_SEL_1, CONTROL_REGISTERS_PARTITION_SEL_2, 
CONTROL_REGISTERS_PARTITION_SEL_3, - CONTROL_REGISTERS_ALLOCATED_SIZE_0, - CONTROL_REGISTERS_ALLOCATED_SIZE_1, - CONTROL_REGISTERS_ALLOCATED_SIZE_2, - CONTROL_REGISTERS_ALLOCATED_SIZE_3, - CONTROL_REGISTERS_START_ADDR_SCHEME_0, - CONTROL_REGISTERS_START_ADDR_SCHEME_1, - CONTROL_REGISTERS_START_ADDR_SCHEME_2, - CONTROL_REGISTERS_START_ADDR_SCHEME_3, + CONTROL_REGISTERS_START_DAS_0, + CONTROL_REGISTERS_START_DAS_1, + CONTROL_REGISTERS_START_DAS_2, + CONTROL_REGISTERS_START_DAS_3, + CONTROL_REGISTERS_ROWS_DAS_0, + CONTROL_REGISTERS_ROWS_DAS_1, + CONTROL_REGISTERS_ROWS_DAS_2, + CONTROL_REGISTERS_ROWS_DAS_3, CONTROL_REGISTERS_TCDM_START_ADDRESS, CONTROL_REGISTERS_TCDM_END_ADDRESS, CONTROL_REGISTERS_NR_CORES_REG, @@ -258,14 +257,14 @@ package control_registers_reg_pkg; 4'b 1111, // index[14] CONTROL_REGISTERS_PARTITION_SEL_1 4'b 1111, // index[15] CONTROL_REGISTERS_PARTITION_SEL_2 4'b 1111, // index[16] CONTROL_REGISTERS_PARTITION_SEL_3 - 4'b 1111, // index[17] CONTROL_REGISTERS_ALLOCATED_SIZE_0 - 4'b 1111, // index[18] CONTROL_REGISTERS_ALLOCATED_SIZE_1 - 4'b 1111, // index[19] CONTROL_REGISTERS_ALLOCATED_SIZE_2 - 4'b 1111, // index[20] CONTROL_REGISTERS_ALLOCATED_SIZE_3 - 4'b 1111, // index[21] CONTROL_REGISTERS_START_ADDR_SCHEME_0 - 4'b 1111, // index[22] CONTROL_REGISTERS_START_ADDR_SCHEME_1 - 4'b 1111, // index[23] CONTROL_REGISTERS_START_ADDR_SCHEME_2 - 4'b 1111, // index[24] CONTROL_REGISTERS_START_ADDR_SCHEME_3 + 4'b 1111, // index[17] CONTROL_REGISTERS_START_DAS_0 + 4'b 1111, // index[18] CONTROL_REGISTERS_START_DAS_1 + 4'b 1111, // index[19] CONTROL_REGISTERS_START_DAS_2 + 4'b 1111, // index[20] CONTROL_REGISTERS_START_DAS_3 + 4'b 1111, // index[21] CONTROL_REGISTERS_ROWS_DAS_0 + 4'b 1111, // index[22] CONTROL_REGISTERS_ROWS_DAS_1 + 4'b 1111, // index[23] CONTROL_REGISTERS_ROWS_DAS_2 + 4'b 1111, // index[24] CONTROL_REGISTERS_ROWS_DAS_3 4'b 1111, // index[25] CONTROL_REGISTERS_TCDM_START_ADDRESS 4'b 1111, // index[26] CONTROL_REGISTERS_TCDM_END_ADDRESS 4'b 
1111, // index[27] CONTROL_REGISTERS_NR_CORES_REG diff --git a/hardware/src/control_registers/control_registers_reg_top.sv b/hardware/src/control_registers/control_registers_reg_top.sv index c80a4f0dd..ccd3cce5b 100644 --- a/hardware/src/control_registers/control_registers_reg_top.sv +++ b/hardware/src/control_registers/control_registers_reg_top.sv @@ -103,22 +103,22 @@ module control_registers_reg_top #( logic partition_sel_2_we; logic [31:0] partition_sel_3_wd; logic partition_sel_3_we; - logic [31:0] allocated_size_0_wd; - logic allocated_size_0_we; - logic [31:0] allocated_size_1_wd; - logic allocated_size_1_we; - logic [31:0] allocated_size_2_wd; - logic allocated_size_2_we; - logic [31:0] allocated_size_3_wd; - logic allocated_size_3_we; - logic [31:0] start_addr_scheme_0_wd; - logic start_addr_scheme_0_we; - logic [31:0] start_addr_scheme_1_wd; - logic start_addr_scheme_1_we; - logic [31:0] start_addr_scheme_2_wd; - logic start_addr_scheme_2_we; - logic [31:0] start_addr_scheme_3_wd; - logic start_addr_scheme_3_we; + logic [31:0] start_das_0_wd; + logic start_das_0_we; + logic [31:0] start_das_1_wd; + logic start_das_1_we; + logic [31:0] start_das_2_wd; + logic start_das_2_we; + logic [31:0] start_das_3_wd; + logic start_das_3_we; + logic [31:0] rows_das_0_wd; + logic rows_das_0_we; + logic [31:0] rows_das_1_wd; + logic rows_das_1_we; + logic [31:0] rows_das_2_wd; + logic rows_das_2_we; + logic [31:0] rows_das_3_wd; + logic rows_das_3_we; logic [31:0] tcdm_start_address_qs; logic tcdm_start_address_re; logic [31:0] tcdm_end_address_qs; @@ -573,173 +573,173 @@ module control_registers_reg_top #( - // Subregister 0 of Multireg allocated_size - // R[allocated_size_0]: V(False) + // Subregister 0 of Multireg start_das + // R[start_das_0]: V(True) + + prim_subreg_ext #( + .DW (32) + ) u_start_das_0 ( + .re (1'b0), + .we (start_das_0_we), + .wd (start_das_0_wd), + .d (hw2reg.start_das[0].d), + .qre (), + .qe (reg2hw.start_das[0].qe), + .q (reg2hw.start_das[0].q ), 
+ .qs () + ); + + // Subregister 1 of Multireg start_das + // R[start_das_1]: V(True) + + prim_subreg_ext #( + .DW (32) + ) u_start_das_1 ( + .re (1'b0), + .we (start_das_1_we), + .wd (start_das_1_wd), + .d (hw2reg.start_das[1].d), + .qre (), + .qe (reg2hw.start_das[1].qe), + .q (reg2hw.start_das[1].q ), + .qs () + ); + + // Subregister 2 of Multireg start_das + // R[start_das_2]: V(True) + + prim_subreg_ext #( + .DW (32) + ) u_start_das_2 ( + .re (1'b0), + .we (start_das_2_we), + .wd (start_das_2_wd), + .d (hw2reg.start_das[2].d), + .qre (), + .qe (reg2hw.start_das[2].qe), + .q (reg2hw.start_das[2].q ), + .qs () + ); + + // Subregister 3 of Multireg start_das + // R[start_das_3]: V(True) + + prim_subreg_ext #( + .DW (32) + ) u_start_das_3 ( + .re (1'b0), + .we (start_das_3_we), + .wd (start_das_3_wd), + .d (hw2reg.start_das[3].d), + .qre (), + .qe (reg2hw.start_das[3].qe), + .q (reg2hw.start_das[3].q ), + .qs () + ); + + + + // Subregister 0 of Multireg rows_das + // R[rows_das_0]: V(False) prim_subreg #( .DW (32), .SWACCESS("WO"), .RESVAL (32'h0) - ) u_allocated_size_0 ( + ) u_rows_das_0 ( .clk_i (clk_i ), .rst_ni (rst_ni ), // from register interface - .we (allocated_size_0_we), - .wd (allocated_size_0_wd), + .we (rows_das_0_we), + .wd (rows_das_0_wd), // from internal hardware .de (1'b0), .d ('0 ), // to internal hardware - .qe (reg2hw.allocated_size[0].qe), - .q (reg2hw.allocated_size[0].q ), + .qe (), + .q (reg2hw.rows_das[0].q ), .qs () ); - // Subregister 1 of Multireg allocated_size - // R[allocated_size_1]: V(False) + // Subregister 1 of Multireg rows_das + // R[rows_das_1]: V(False) prim_subreg #( .DW (32), .SWACCESS("WO"), .RESVAL (32'h0) - ) u_allocated_size_1 ( + ) u_rows_das_1 ( .clk_i (clk_i ), .rst_ni (rst_ni ), // from register interface - .we (allocated_size_1_we), - .wd (allocated_size_1_wd), + .we (rows_das_1_we), + .wd (rows_das_1_wd), // from internal hardware .de (1'b0), .d ('0 ), // to internal hardware - .qe (reg2hw.allocated_size[1].qe), 
- .q (reg2hw.allocated_size[1].q ), + .qe (), + .q (reg2hw.rows_das[1].q ), .qs () ); - // Subregister 2 of Multireg allocated_size - // R[allocated_size_2]: V(False) + // Subregister 2 of Multireg rows_das + // R[rows_das_2]: V(False) prim_subreg #( .DW (32), .SWACCESS("WO"), .RESVAL (32'h0) - ) u_allocated_size_2 ( + ) u_rows_das_2 ( .clk_i (clk_i ), .rst_ni (rst_ni ), // from register interface - .we (allocated_size_2_we), - .wd (allocated_size_2_wd), + .we (rows_das_2_we), + .wd (rows_das_2_wd), // from internal hardware .de (1'b0), .d ('0 ), // to internal hardware - .qe (reg2hw.allocated_size[2].qe), - .q (reg2hw.allocated_size[2].q ), + .qe (), + .q (reg2hw.rows_das[2].q ), .qs () ); - // Subregister 3 of Multireg allocated_size - // R[allocated_size_3]: V(False) + // Subregister 3 of Multireg rows_das + // R[rows_das_3]: V(False) prim_subreg #( .DW (32), .SWACCESS("WO"), .RESVAL (32'h0) - ) u_allocated_size_3 ( + ) u_rows_das_3 ( .clk_i (clk_i ), .rst_ni (rst_ni ), // from register interface - .we (allocated_size_3_we), - .wd (allocated_size_3_wd), + .we (rows_das_3_we), + .wd (rows_das_3_wd), // from internal hardware .de (1'b0), .d ('0 ), // to internal hardware - .qe (reg2hw.allocated_size[3].qe), - .q (reg2hw.allocated_size[3].q ), - - .qs () - ); - - - - // Subregister 0 of Multireg start_addr_scheme - // R[start_addr_scheme_0]: V(True) - - prim_subreg_ext #( - .DW (32) - ) u_start_addr_scheme_0 ( - .re (1'b0), - .we (start_addr_scheme_0_we), - .wd (start_addr_scheme_0_wd), - .d (hw2reg.start_addr_scheme[0].d), - .qre (), - .qe (reg2hw.start_addr_scheme[0].qe), - .q (reg2hw.start_addr_scheme[0].q ), - .qs () - ); - - // Subregister 1 of Multireg start_addr_scheme - // R[start_addr_scheme_1]: V(True) - - prim_subreg_ext #( - .DW (32) - ) u_start_addr_scheme_1 ( - .re (1'b0), - .we (start_addr_scheme_1_we), - .wd (start_addr_scheme_1_wd), - .d (hw2reg.start_addr_scheme[1].d), - .qre (), - .qe (reg2hw.start_addr_scheme[1].qe), - .q 
(reg2hw.start_addr_scheme[1].q ), - .qs () - ); - - // Subregister 2 of Multireg start_addr_scheme - // R[start_addr_scheme_2]: V(True) - - prim_subreg_ext #( - .DW (32) - ) u_start_addr_scheme_2 ( - .re (1'b0), - .we (start_addr_scheme_2_we), - .wd (start_addr_scheme_2_wd), - .d (hw2reg.start_addr_scheme[2].d), - .qre (), - .qe (reg2hw.start_addr_scheme[2].qe), - .q (reg2hw.start_addr_scheme[2].q ), - .qs () - ); - - // Subregister 3 of Multireg start_addr_scheme - // R[start_addr_scheme_3]: V(True) + .qe (), + .q (reg2hw.rows_das[3].q ), - prim_subreg_ext #( - .DW (32) - ) u_start_addr_scheme_3 ( - .re (1'b0), - .we (start_addr_scheme_3_we), - .wd (start_addr_scheme_3_wd), - .d (hw2reg.start_addr_scheme[3].d), - .qre (), - .qe (reg2hw.start_addr_scheme[3].qe), - .q (reg2hw.start_addr_scheme[3].q ), .qs () ); @@ -1000,14 +1000,14 @@ module control_registers_reg_top #( addr_hit[14] = (reg_addr == CONTROL_REGISTERS_PARTITION_SEL_1_OFFSET); addr_hit[15] = (reg_addr == CONTROL_REGISTERS_PARTITION_SEL_2_OFFSET); addr_hit[16] = (reg_addr == CONTROL_REGISTERS_PARTITION_SEL_3_OFFSET); - addr_hit[17] = (reg_addr == CONTROL_REGISTERS_ALLOCATED_SIZE_0_OFFSET); - addr_hit[18] = (reg_addr == CONTROL_REGISTERS_ALLOCATED_SIZE_1_OFFSET); - addr_hit[19] = (reg_addr == CONTROL_REGISTERS_ALLOCATED_SIZE_2_OFFSET); - addr_hit[20] = (reg_addr == CONTROL_REGISTERS_ALLOCATED_SIZE_3_OFFSET); - addr_hit[21] = (reg_addr == CONTROL_REGISTERS_START_ADDR_SCHEME_0_OFFSET); - addr_hit[22] = (reg_addr == CONTROL_REGISTERS_START_ADDR_SCHEME_1_OFFSET); - addr_hit[23] = (reg_addr == CONTROL_REGISTERS_START_ADDR_SCHEME_2_OFFSET); - addr_hit[24] = (reg_addr == CONTROL_REGISTERS_START_ADDR_SCHEME_3_OFFSET); + addr_hit[17] = (reg_addr == CONTROL_REGISTERS_START_DAS_0_OFFSET); + addr_hit[18] = (reg_addr == CONTROL_REGISTERS_START_DAS_1_OFFSET); + addr_hit[19] = (reg_addr == CONTROL_REGISTERS_START_DAS_2_OFFSET); + addr_hit[20] = (reg_addr == CONTROL_REGISTERS_START_DAS_3_OFFSET); + addr_hit[21] = 
(reg_addr == CONTROL_REGISTERS_ROWS_DAS_0_OFFSET); + addr_hit[22] = (reg_addr == CONTROL_REGISTERS_ROWS_DAS_1_OFFSET); + addr_hit[23] = (reg_addr == CONTROL_REGISTERS_ROWS_DAS_2_OFFSET); + addr_hit[24] = (reg_addr == CONTROL_REGISTERS_ROWS_DAS_3_OFFSET); addr_hit[25] = (reg_addr == CONTROL_REGISTERS_TCDM_START_ADDRESS_OFFSET); addr_hit[26] = (reg_addr == CONTROL_REGISTERS_TCDM_END_ADDRESS_OFFSET); addr_hit[27] = (reg_addr == CONTROL_REGISTERS_NR_CORES_REG_OFFSET); @@ -1119,29 +1119,29 @@ module control_registers_reg_top #( assign partition_sel_3_we = addr_hit[16] & reg_we & !reg_error; assign partition_sel_3_wd = reg_wdata[31:0]; - assign allocated_size_0_we = addr_hit[17] & reg_we & !reg_error; - assign allocated_size_0_wd = reg_wdata[31:0]; + assign start_das_0_we = addr_hit[17] & reg_we & !reg_error; + assign start_das_0_wd = reg_wdata[31:0]; - assign allocated_size_1_we = addr_hit[18] & reg_we & !reg_error; - assign allocated_size_1_wd = reg_wdata[31:0]; + assign start_das_1_we = addr_hit[18] & reg_we & !reg_error; + assign start_das_1_wd = reg_wdata[31:0]; - assign allocated_size_2_we = addr_hit[19] & reg_we & !reg_error; - assign allocated_size_2_wd = reg_wdata[31:0]; + assign start_das_2_we = addr_hit[19] & reg_we & !reg_error; + assign start_das_2_wd = reg_wdata[31:0]; - assign allocated_size_3_we = addr_hit[20] & reg_we & !reg_error; - assign allocated_size_3_wd = reg_wdata[31:0]; + assign start_das_3_we = addr_hit[20] & reg_we & !reg_error; + assign start_das_3_wd = reg_wdata[31:0]; - assign start_addr_scheme_0_we = addr_hit[21] & reg_we & !reg_error; - assign start_addr_scheme_0_wd = reg_wdata[31:0]; + assign rows_das_0_we = addr_hit[21] & reg_we & !reg_error; + assign rows_das_0_wd = reg_wdata[31:0]; - assign start_addr_scheme_1_we = addr_hit[22] & reg_we & !reg_error; - assign start_addr_scheme_1_wd = reg_wdata[31:0]; + assign rows_das_1_we = addr_hit[22] & reg_we & !reg_error; + assign rows_das_1_wd = reg_wdata[31:0]; - assign start_addr_scheme_2_we = 
addr_hit[23] & reg_we & !reg_error; - assign start_addr_scheme_2_wd = reg_wdata[31:0]; + assign rows_das_2_we = addr_hit[23] & reg_we & !reg_error; + assign rows_das_2_wd = reg_wdata[31:0]; - assign start_addr_scheme_3_we = addr_hit[24] & reg_we & !reg_error; - assign start_addr_scheme_3_wd = reg_wdata[31:0]; + assign rows_das_3_we = addr_hit[24] & reg_we & !reg_error; + assign rows_das_3_wd = reg_wdata[31:0]; assign tcdm_start_address_re = addr_hit[25] & reg_re & !reg_error; diff --git a/hardware/src/ctrl_registers.sv b/hardware/src/ctrl_registers.sv index 507df9e5c..ce0b5f7ce 100644 --- a/hardware/src/ctrl_registers.sv +++ b/hardware/src/ctrl_registers.sv @@ -7,7 +7,10 @@ module ctrl_registers import mempool_pkg::ro_cache_ctrl_t; - import mempool_pkg::PartitionDataWidth; + import mempool_pkg::NumDASPartitions; + import mempool_pkg::TileInterleavingWidth; + import mempool_pkg::RowsInterleavingWidth; + import mempool_pkg::AddrWidth; #( parameter int DataWidth = 32, // Parameters @@ -24,13 +27,13 @@ module ctrl_registers input axi_lite_req_t axi_lite_slave_req_i, output axi_lite_resp_t axi_lite_slave_resp_o, // Control registers - output logic [DataWidth-1:0] eoc_o, - output logic eoc_valid_o, - output logic [NumCores-1:0] wake_up_o, - output ro_cache_ctrl_t ro_cache_ctrl_o, - output logic [3:0][PartitionDataWidth-1:0] partition_sel_o, - output logic [3:0][PartitionDataWidth-1:0] allocated_size_o, - output logic [3:0][DataWidth-1:0] start_addr_scheme_o + output logic [NumDASPartitions-1:0][TileInterleavingWidth-1:0] partition_sel_o, + output logic [NumDASPartitions-1:0][AddrWidth-1:0] start_das_o, + output logic [NumDASPartitions-1:0][RowsInterleavingWidth-1:0] rows_das_o, + output logic [DataWidth-1:0] eoc_o, + output logic eoc_valid_o, + output logic [NumCores-1:0] wake_up_o, + output ro_cache_ctrl_t ro_cache_ctrl_o ); import mempool_pkg::AddrWidth; @@ -104,10 +107,10 @@ module ctrl_registers for (genvar i = 0; i < mempool_pkg::NumDASPartitions; i++) begin: 
gen_das_regs `FFL(ctrl_hw2reg.partition_sel[i].d, ctrl_reg2hw.partition_sel[i].q, ctrl_reg2hw.partition_sel[i].qe, mempool_pkg::NumTiles); - `FFL(ctrl_hw2reg.start_addr_scheme[i].d, ctrl_reg2hw.start_addr_scheme[i].q, ctrl_reg2hw.start_addr_scheme[i].qe, mempool_pkg::DASStartAddr); - assign partition_sel_o[i] = ctrl_hw2reg.partition_sel[i].d[PartitionDataWidth-1:0]; - assign start_addr_scheme_o[i] = ctrl_hw2reg.start_addr_scheme[i].d; - assign allocated_size_o[i] = ctrl_reg2hw.allocated_size[i].q[PartitionDataWidth-1:0]; + `FFL(ctrl_hw2reg.start_das[i].d, ctrl_reg2hw.start_das[i].q, ctrl_reg2hw.start_das[i].qe, mempool_pkg::DASStartAddr); + assign partition_sel_o[i] = ctrl_hw2reg.partition_sel[i].d[TileInterleavingWidth-1:0]; + assign start_das_o[i] = ctrl_hw2reg.start_das[i].d; + assign rows_das_o[i] = ctrl_reg2hw.rows_das[i].q[RowsInterleavingWidth-1:0]; end /************************ diff --git a/hardware/src/mempool_cluster.sv b/hardware/src/mempool_cluster.sv index 4164bc5aa..bcf95c19d 100644 --- a/hardware/src/mempool_cluster.sv +++ b/hardware/src/mempool_cluster.sv @@ -17,32 +17,32 @@ module mempool_cluster parameter int unsigned NumAXIMasters = NumGroups * NumAXIMastersPerGroup ) ( // Clock and reset - input logic clk_i, - input logic rst_ni, - input logic testmode_i, + input logic clk_i, + input logic rst_ni, + input logic testmode_i, // Scan chain - input logic scan_enable_i, - input logic scan_data_i, - output logic scan_data_o, + input logic scan_enable_i, + input logic scan_data_i, + output logic scan_data_o, `ifdef DAS // Partition Selection - input logic [3:0][PartitionDataWidth-1:0] partition_sel_i, - input logic [3:0][PartitionDataWidth-1:0] allocated_size_i, - input logic [3:0][DataWidth-1:0] start_addr_scheme_i, + input logic [NumDASPartitions-1:0][TileInterleavingWidth-1:0] partition_sel_i, + input logic [NumDASPartitions-1:0][AddrWidth-1:0] start_das_i, + input logic [NumDASPartitions-1:0][RowsInterleavingWidth-1:0] rows_das_i, `endif // Wake up 
signal - input logic [NumCores-1:0] wake_up_i, + input logic [NumCores-1:0] wake_up_i, // RO-Cache configuration - input ro_cache_ctrl_t ro_cache_ctrl_i, + input ro_cache_ctrl_t ro_cache_ctrl_i, // DMA request - input dma_req_t dma_req_i, - input logic dma_req_valid_i, - output logic dma_req_ready_o, + input dma_req_t dma_req_i, + input logic dma_req_valid_i, + output logic dma_req_ready_o, // DMA status - output dma_meta_t dma_meta_o, + output dma_meta_t dma_meta_o, // AXI Interface - output axi_tile_req_t [NumAXIMasters-1:0] axi_mst_req_o, - input axi_tile_resp_t [NumAXIMasters-1:0] axi_mst_resp_i + output axi_tile_req_t [NumAXIMasters-1:0] axi_mst_req_o, + input axi_tile_resp_t [NumAXIMasters-1:0] axi_mst_resp_i ); /********************* @@ -88,7 +88,7 @@ module mempool_cluster logic [NumGroups-1:0] dma_req_group_valid, dma_req_group_q_valid; logic [NumGroups-1:0] dma_req_group_ready, dma_req_group_q_ready; dma_meta_t [NumGroups-1:0] dma_meta, dma_meta_q; - logic [PartitionDataWidth-1:0] allocated_size_sel; + logic [RowsInterleavingWidth-1:0] dma_rows_das; `FF(dma_meta_q, dma_meta, '0, clk_i, rst_ni); @@ -105,23 +105,22 @@ module mempool_cluster .NumDASPartitions (NumDASPartitions ), .DASStartAddr (DASStartAddr ) ) i_idma_split_midend ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), + .clk_i (clk_i ), + .rst_ni (rst_ni ), `ifdef DAS - // partition information - .group_factor_i (partition_sel_i ), - .allocated_size_i (allocated_size_i ), - .start_addr_scheme_i(start_addr_scheme_i), - .allocated_size_o (allocated_size_sel ), + .partition_sel_i (partition_sel_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), + .rows_das_o (dma_rows_das ), `endif - .burst_req_i(dma_req_cut ), - .valid_i (dma_req_cut_valid ), - .ready_o (dma_req_cut_ready ), - .meta_o (dma_meta_cut ), - .burst_req_o(dma_req_split ), - .valid_o (dma_req_split_valid), - .ready_i (dma_req_split_ready), - .meta_i (dma_meta_split ) + .burst_req_i (dma_req_cut ), + .valid_i (dma_req_cut_valid ), + 
.ready_o (dma_req_cut_ready ), + .meta_o (dma_meta_cut ), + .burst_req_o (dma_req_split ), + .valid_o (dma_req_split_valid), + .ready_i (dma_req_split_ready), + .meta_i (dma_meta_split ) ); idma_distributed_midend #( @@ -130,14 +129,15 @@ module mempool_cluster .DmaRegionStart (TCDMBaseAddr ), .DmaRegionEnd (TCDMBaseAddr+TCDMSize), .TransFifoDepth (16 ), + .NumTiles (NumTiles ), + .NumDASPartitions (NumDASPartitions ), .burst_req_t (dma_req_t ), .meta_t (dma_meta_t ) ) i_idma_distributed_midend ( .clk_i (clk_i ), .rst_ni (rst_ni ), `ifdef DAS - // partition info - .allocated_size_i(allocated_size_sel), + .rows_das_i (dma_rows_das ), `endif .burst_req_i (dma_req_split ), .valid_i (dma_req_split_valid), @@ -320,9 +320,9 @@ module mempool_cluster .tcdm_slave_resp_ready_i (tcdm_slave_resp_ready[g] ), `ifdef DAS .partition_sel_i (partition_sel_i ), - .start_addr_scheme_i (start_addr_scheme_i ), - .allocated_size_i (allocated_size_i ), - .dma_allocated_size_sel_i(allocated_size_sel ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), + .dma_rows_das_i (dma_rows_das ), `endif .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), .ro_cache_ctrl_i (ro_cache_ctrl_q[g] ), @@ -367,9 +367,9 @@ module mempool_cluster .tcdm_slave_resp_ready_i (tcdm_slave_resp_ready[g] ), `ifdef DAS .partition_sel_i (partition_sel_i ), - .start_addr_scheme_i (start_addr_scheme_i ), - .allocated_size_i (allocated_size_i ), - .dma_allocated_size_sel_i(allocated_size_sel ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), + .dma_rows_das_i (dma_rows_das ), `endif .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), .ro_cache_ctrl_i (ro_cache_ctrl_q[g] ), @@ -411,9 +411,9 @@ module mempool_cluster .tcdm_slave_resp_ready_i (tcdm_slave_resp_ready[g] ), `ifdef DAS .partition_sel_i (partition_sel_i ), - .start_addr_scheme_i (start_addr_scheme_i ), - .allocated_size_i (allocated_size_i ), - .dma_allocated_size_sel_i(allocated_size_sel ), + .start_das_i 
(start_das_i ), + .rows_das_i (rows_das_i ), + .dma_rows_das_i (dma_rows_das ), `endif .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), .ro_cache_ctrl_i (ro_cache_ctrl_q[g] ), @@ -499,10 +499,10 @@ module mempool_cluster .tcdm_slave_resp_valid_o (tcdm_slave_resp_valid[g] ), .tcdm_slave_resp_ready_i (tcdm_slave_resp_ready[g] ), `ifdef DAS - .partition_sel_i (partition_sel_i ), - .start_addr_scheme_i (start_addr_scheme_i ), - .allocated_size_i (allocated_size_i ), - .dma_allocated_size_sel_i(allocated_size_sel ), + .partition_sel_i (partition_sel_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), + .dma_rows_das_i (dma_rows_das ), `endif .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), .ro_cache_ctrl_i (ro_cache_ctrl_q[g] ), diff --git a/hardware/src/mempool_group.sv b/hardware/src/mempool_group.sv index 7b9027d2d..5c1d1f9e4 100644 --- a/hardware/src/mempool_group.sv +++ b/hardware/src/mempool_group.sv @@ -59,11 +59,10 @@ module mempool_group input logic [NumGroups-1:1][NumTilesPerGroup-1:0] tcdm_slave_resp_ready_i, `endif `ifdef DAS - // Partition selection - input logic [NumDASPartitions-1:0][DataWidth-1:0] start_addr_scheme_i, - input logic [NumDASPartitions-1:0][PartitionDataWidth-1:0] allocated_size_i, - input logic [NumDASPartitions-1:0][PartitionDataWidth-1:0] partition_sel_i, - input logic [PartitionDataWidth-1:0] dma_allocated_size_sel_i, + input logic [NumDASPartitions-1:0][TileInterleavingWidth-1:0] partition_sel_i, + input logic [NumDASPartitions-1:0][AddrWidth-1:0] start_das_i, + input logic [NumDASPartitions-1:0][RowsInterleavingWidth-1:0] rows_das_i, + input logic [RowsInterleavingWidth-1:0] dma_rows_das_i, `endif // Wake up interface input logic [NumCoresPerGroup-1:0] wake_up_i, @@ -340,10 +339,9 @@ module mempool_group // RO-Cache configuration .ro_cache_ctrl_i (ro_cache_ctrl_q ), `ifdef DAS - // Partition selection - .start_addr_scheme_i (start_addr_scheme_i ), - .allocated_size_i (allocated_size_i ), 
.partition_sel_i (partition_sel_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), `endif // Wake up interface .wake_up_i (wake_up_q[sg*NumCoresPerSubGroup +: NumCoresPerSubGroup] ) @@ -398,10 +396,9 @@ module mempool_group // RO-Cache configuration .ro_cache_ctrl_i (ro_cache_ctrl_q ), `ifdef DAS - // Partition selection - .start_addr_scheme_i (start_addr_scheme_i ), - .allocated_size_i (allocated_size_i ), .partition_sel_i (partition_sel_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), `endif // Wake up interface .wake_up_i (wake_up_q[sg*NumCoresPerSubGroup +: NumCoresPerSubGroup] ) @@ -585,14 +582,15 @@ module mempool_group .DmaRegionStart (TCDMBaseAddr ), .DmaRegionEnd (TCDMBaseAddr+TCDMSize ), .TransFifoDepth (8 ), + .NumTiles (NumTiles ), + .NumDASPartitions (NumDASPartitions ), .burst_req_t (dma_req_t ), .meta_t (dma_meta_t ) ) i_idma_distributed_midend ( .clk_i (clk_i ), .rst_ni (rst_ni ), `ifdef DAS - // partition - .allocated_size_i(dma_allocated_size_sel_i), + .rows_das_i (dma_rows_das_i ), `endif .burst_req_i (dma_req_cut ), .valid_i (dma_req_cut_valid ), @@ -707,9 +705,9 @@ module mempool_group .axi_mst_req_o (axi_tile_req[t] ), .axi_mst_resp_i (axi_tile_resp[t] ), `ifdef DAS - .start_addr_scheme_i (start_addr_scheme_i ), - .allocated_size_i (allocated_size_i ), .partition_sel_i (partition_sel_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), `endif // Wake up interface .wake_up_i (wake_up_q[t*NumCoresPerTile +: NumCoresPerTile]) @@ -1004,14 +1002,15 @@ module mempool_group .DmaRegionStart (TCDMBaseAddr ), .DmaRegionEnd (TCDMBaseAddr+TCDMSize ), .TransFifoDepth (8 ), + .NumTiles (NumTiles ), + .NumDASPartitions (NumDASPartitions ), .burst_req_t (dma_req_t ), .meta_t (dma_meta_t ) ) i_idma_distributed_midend ( .clk_i (clk_i ), .rst_ni (rst_ni ), `ifdef DAS - // partition - .allocated_size_i(dma_allocated_size_sel_i), + .rows_das_i (dma_rows_das_i ), `endif .burst_req_i (dma_req_cut ), .valid_i 
(dma_req_cut_valid ), diff --git a/hardware/src/mempool_sub_group.sv b/hardware/src/mempool_sub_group.sv index e53f2f60c..b88338ade 100644 --- a/hardware/src/mempool_sub_group.sv +++ b/hardware/src/mempool_sub_group.sv @@ -63,10 +63,9 @@ module mempool_sub_group // RO-Cache configuration input `STRUCT_PORT(ro_cache_ctrl_t) ro_cache_ctrl_i, `ifdef DAS - // Partition Selection - input logic [NumDASPartitions-1:0][DataWidth-1:0] start_addr_scheme_i, - input logic [NumDASPartitions-1:0][PartitionDataWidth-1:0] allocated_size_i, - input logic [NumDASPartitions-1:0][PartitionDataWidth-1:0] partition_sel_i, + input logic [NumDASPartitions-1:0][TileInterleavingWidth-1:0] partition_sel_i, + input logic [NumDASPartitions-1:0][AddrWidth-1:0] start_das_i, + input logic [NumDASPartitions-1:0][RowsInterleavingWidth-1:0] rows_das_i, `endif // Wake up interface input logic [NumCoresPerSubGroup-1:0] wake_up_i @@ -205,9 +204,9 @@ module mempool_sub_group .axi_mst_req_o (axi_tile_req[t] ), .axi_mst_resp_i (axi_tile_resp[t] ), `ifdef DAS - .start_addr_scheme_i (start_addr_scheme_i ), - .allocated_size_i (allocated_size_i ), - .partition_sel_i (partition_sel_i ), + .partition_sel_i (partition_sel_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), `endif // Wake up interface .wake_up_i (wake_up_q[t*NumCoresPerTile +: NumCoresPerTile]) diff --git a/hardware/src/mempool_system.sv b/hardware/src/mempool_system.sv index 3118c18a5..ea6931f95 100644 --- a/hardware/src/mempool_system.sv +++ b/hardware/src/mempool_system.sv @@ -92,9 +92,9 @@ module mempool_system ro_cache_ctrl_t ro_cache_ctrl; `ifdef DAS // For dynamic partitioning - logic [3:0][PartitionDataWidth-1:0] partition_sel; - logic [3:0][PartitionDataWidth-1:0] allocated_size; - logic [3:0][DataWidth-1:0] start_addr_scheme; + logic [NumDASPartitions-1:0][TileInterleavingWidth-1:0] partition_sel; + logic [NumDASPartitions-1:0][AddrWidth-1:0] start_das; + logic [NumDASPartitions-1:0][RowsInterleavingWidth-1:0] rows_das; 
`endif dma_req_t dma_req; @@ -143,25 +143,25 @@ module mempool_system .TCDMBaseAddr(TCDMBaseAddr), .BootAddr (BootAddr ) ) i_mempool_cluster ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .wake_up_i (wake_up ), + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .wake_up_i (wake_up ), `ifdef DAS - .partition_sel_i (partition_sel ), - .allocated_size_i (allocated_size ), - .start_addr_scheme_i(start_addr_scheme ), + .partition_sel_i (partition_sel ), + .start_das_i (start_das ), + .rows_das_i (rows_das ), `endif - .testmode_i (1'b0 ), - .scan_enable_i (1'b0 ), - .scan_data_i (1'b0 ), - .scan_data_o (/* Unused */ ), - .ro_cache_ctrl_i (ro_cache_ctrl ), - .dma_req_i (dma_req ), - .dma_req_valid_i (dma_req_valid ), - .dma_req_ready_o (dma_req_ready ), - .dma_meta_o (dma_meta ), - .axi_mst_req_o (axi_mst_req[NumAXIMasters-2:0] ), - .axi_mst_resp_i (axi_mst_resp[NumAXIMasters-2:0]) + .testmode_i (1'b0 ), + .scan_enable_i (1'b0 ), + .scan_data_i (1'b0 ), + .scan_data_o (/* Unused */ ), + .ro_cache_ctrl_i (ro_cache_ctrl ), + .dma_req_i (dma_req ), + .dma_req_valid_i (dma_req_valid ), + .dma_req_ready_o (dma_req_ready ), + .dma_meta_o (dma_meta ), + .axi_mst_req_o (axi_mst_req[NumAXIMasters-2:0] ), + .axi_mst_resp_i (axi_mst_resp[NumAXIMasters-2:0]) ); /********************** @@ -814,8 +814,8 @@ module mempool_system .eoc_valid_o (eoc_valid_o ), `ifdef DAS .partition_sel_o (partition_sel ), - .start_addr_scheme_o (start_addr_scheme ), - .allocated_size_o (allocated_size ), + .start_das_o (start_das ), + .rows_das_o (rows_das ), `endif .wake_up_o (wake_up ), .ro_cache_ctrl_o (ro_cache_ctrl ) diff --git a/hardware/src/mempool_tile.sv b/hardware/src/mempool_tile.sv index dd5b41aa4..ac856a5e9 100644 --- a/hardware/src/mempool_tile.sv +++ b/hardware/src/mempool_tile.sv @@ -51,12 +51,13 @@ module mempool_tile // AXI Interface output `STRUCT_PORT(axi_tile_req_t) axi_mst_req_o, input `STRUCT_PORT(axi_tile_resp_t) axi_mst_resp_i, +`ifdef DAS + input logic 
[NumDASPartitions-1:0][TileInterleavingWidth-1:0] partition_sel_i, + input logic [NumDASPartitions-1:0][AddrWidth-1:0] start_das_i, + input logic [NumDASPartitions-1:0][RowsInterleavingWidth-1:0] rows_das_i, +`endif // Wake up interface - input logic [NumCoresPerTile-1:0] wake_up_i, - // Partition selection - input logic [NumDASPartitions-1:0][DataWidth-1:0] start_addr_scheme_i, - input logic [NumDASPartitions-1:0][PartitionDataWidth-1:0] allocated_size_i, - input logic [NumDASPartitions-1:0][PartitionDataWidth-1:0] partition_sel_i + input logic [NumCoresPerTile-1:0] wake_up_i ); /**************** @@ -905,13 +906,13 @@ module mempool_tile .NumDASPartitions (NumDASPartitions ) ) i_address_scrambler ( `ifdef DAS - .group_factor_i (partition_sel_i ), - .allocated_size_i (allocated_size_i ), - .start_addr_scheme_i(start_addr_scheme_i ), + .partition_sel_i (partition_sel_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), `else - .group_factor_i (NumTiles ), - .allocated_size_i ('0 ), - .start_addr_scheme_i('0 ), + .partition_sel_i (NumTiles ), + .start_das_i ('0 ), + .rows_das_i ('0 ), `endif .address_i (snitch_data_qaddr[c]), .address_o (snitch_data_qaddr_scrambled) diff --git a/software/runtime/control_registers.h b/software/runtime/control_registers.h index 88b4f96bd..88109fb9c 100644 --- a/software/runtime/control_registers.h +++ b/software/runtime/control_registers.h @@ -86,39 +86,39 @@ extern "C" { // Tile grouping for DAS partition #define CONTROL_REGISTERS_PARTITION_SEL_3_REG_OFFSET 0x40 -// Allocated size on DAS partition (common parameters) -#define CONTROL_REGISTERS_ALLOCATED_SIZE_ALLOCATED_SIZE_FIELD_WIDTH 32 -#define CONTROL_REGISTERS_ALLOCATED_SIZE_ALLOCATED_SIZE_FIELDS_PER_REG 1 -#define CONTROL_REGISTERS_ALLOCATED_SIZE_MULTIREG_COUNT 4 - -// Allocated size on DAS partition -#define CONTROL_REGISTERS_ALLOCATED_SIZE_0_REG_OFFSET 0x44 - -// Allocated size on DAS partition -#define CONTROL_REGISTERS_ALLOCATED_SIZE_1_REG_OFFSET 0x48 - -// 
Allocated size on DAS partition -#define CONTROL_REGISTERS_ALLOCATED_SIZE_2_REG_OFFSET 0x4c - -// Allocated size on DAS partition -#define CONTROL_REGISTERS_ALLOCATED_SIZE_3_REG_OFFSET 0x50 - // Start address of DAS partition (common parameters) -#define CONTROL_REGISTERS_START_ADDR_SCHEME_START_ADDR_SCHEME_FIELD_WIDTH 32 -#define CONTROL_REGISTERS_START_ADDR_SCHEME_START_ADDR_SCHEME_FIELDS_PER_REG 1 -#define CONTROL_REGISTERS_START_ADDR_SCHEME_MULTIREG_COUNT 4 +#define CONTROL_REGISTERS_START_DAS_START_DAS_FIELD_WIDTH 32 +#define CONTROL_REGISTERS_START_DAS_START_DAS_FIELDS_PER_REG 1 +#define CONTROL_REGISTERS_START_DAS_MULTIREG_COUNT 4 // Start address of DAS partition -#define CONTROL_REGISTERS_START_ADDR_SCHEME_0_REG_OFFSET 0x54 +#define CONTROL_REGISTERS_START_DAS_0_REG_OFFSET 0x44 // Start address of DAS partition -#define CONTROL_REGISTERS_START_ADDR_SCHEME_1_REG_OFFSET 0x58 +#define CONTROL_REGISTERS_START_DAS_1_REG_OFFSET 0x48 // Start address of DAS partition -#define CONTROL_REGISTERS_START_ADDR_SCHEME_2_REG_OFFSET 0x5c +#define CONTROL_REGISTERS_START_DAS_2_REG_OFFSET 0x4c // Start address of DAS partition -#define CONTROL_REGISTERS_START_ADDR_SCHEME_3_REG_OFFSET 0x60 +#define CONTROL_REGISTERS_START_DAS_3_REG_OFFSET 0x50 + +// End address of DAS partition (common parameters) +#define CONTROL_REGISTERS_ROWS_DAS_ROWS_DAS_FIELD_WIDTH 32 +#define CONTROL_REGISTERS_ROWS_DAS_ROWS_DAS_FIELDS_PER_REG 1 +#define CONTROL_REGISTERS_ROWS_DAS_MULTIREG_COUNT 4 + +// End address of DAS partition +#define CONTROL_REGISTERS_ROWS_DAS_0_REG_OFFSET 0x54 + +// End address of DAS partition +#define CONTROL_REGISTERS_ROWS_DAS_1_REG_OFFSET 0x58 + +// End address of DAS partition +#define CONTROL_REGISTERS_ROWS_DAS_2_REG_OFFSET 0x5c + +// End address of DAS partition +#define CONTROL_REGISTERS_ROWS_DAS_3_REG_OFFSET 0x60 // TCDM Start Address Register #define CONTROL_REGISTERS_TCDM_START_ADDRESS_REG_OFFSET 0x64 From 1982a1274f6f75efb1e154fbee25d75dccf13553 Mon Sep 17 00:00:00 
2001 From: mbertuletti Date: Thu, 30 Oct 2025 12:24:50 +0100 Subject: [PATCH 26/34] [hardware] Modify address scrambler for non-aligned scrambling --- hardware/src/address_scrambler.sv | 74 +++++++++++++++++++------------ hardware/src/mempool_pkg.sv | 3 +- 2 files changed, 47 insertions(+), 30 deletions(-) diff --git a/hardware/src/address_scrambler.sv b/hardware/src/address_scrambler.sv index 960a2665a..bc087675b 100644 --- a/hardware/src/address_scrambler.sv +++ b/hardware/src/address_scrambler.sv @@ -20,13 +20,14 @@ module address_scrambler #( parameter int unsigned SeqMemSizePerTile = 4096, parameter int unsigned NumDASPartitions = 4, // Dependant parameters, do not change + parameter int unsigned RowsWidth = $clog2(TCDMSizePerBank) - ByteOffset + 1, parameter int unsigned MemSizePerTile = NumBanksPerTile*TCDMSizePerBank, parameter int unsigned MemSizePerRow = (1 << ByteOffset)*NumBanksPerTile*NumTiles ) ( input logic [AddrWidth-1:0] address_i, - input logic [NumDASPartitions-1:0][$clog2(NumTiles):0] group_factor_i, - input logic [NumDASPartitions-1:0][$clog2(NumTiles):0] allocated_size_i, - input logic [NumDASPartitions-1:0][DataWidth-1:0] start_addr_scheme_i, + input logic [NumDASPartitions-1:0][$clog2(NumTiles):0] partition_sel_i, + input logic [NumDASPartitions-1:0][AddrWidth-1:0] start_das_i, + input logic [NumDASPartitions-1:0][RowsWidth-1:0] rows_das_i, output logic [AddrWidth-1:0] address_o ); // Stack Sequential Settings @@ -37,9 +38,10 @@ module address_scrambler #( localparam int unsigned ConstantBitsLSB = ByteOffset + BankOffsetBits; localparam int unsigned ScrambleBits = SeqPerTileBits-ConstantBitsLSB; - if (Bypass || NumTiles < 2) begin + if (Bypass || NumTiles < 2) begin: gen_bypass assign address_o = address_i; - end else begin + + end else begin: gen_scrambling // ------ Stack Region Logic ------ // logic [ScrambleBits-1:0] scramble; // Address bits that have to be shuffled around logic [TileIdBits-1:0] tile_id; // Which tile does this address 
region belong to @@ -52,50 +54,64 @@ module address_scrambler #( // ------ Heap Sequential Signals ------ // - // `tile_index` : how many bits to shift for TileID bits in each partition - // `row_index`: how many bits need to swap within Row Index - logic [NumDASPartitions-1:0][$clog2($clog2(NumTiles)+1)-1:0] tile_index; - logic [NumDASPartitions-1:0][$clog2($clog2(NumTiles)+1)-1:0] row_index; + // `tile_bits` : how many bits to shift for TileID bits in each partition + // `row_bits`: how many bits need to swap within Row Index + logic [NumDASPartitions-1:0][$clog2($clog2(NumTiles)+1)-1:0] tile_bits; + logic [NumDASPartitions-1:0][$clog2(RowsWidth)-1:0] row_bits; for (genvar i = 0; i < NumDASPartitions; i++) begin : gen_shift_index lzc #( .WIDTH ($clog2(NumTiles)+1), .MODE (1'b0 ) - ) i_log_tile_index ( - .in_i (group_factor_i[i]), - .cnt_o (tile_index[i] ), - .empty_o (/* Unused */ ) + ) i_log_tile_bits ( + .in_i (partition_sel_i[i]), + .cnt_o (tile_bits[i] ), + .empty_o (/* Unused */ ) ); lzc #( - .WIDTH ($clog2(NumTiles)+1), + .WIDTH (RowsWidth ), .MODE (1'b0 ) - ) i_log_row_index ( - .in_i (allocated_size_i[i][$clog2(NumTiles):0]), - .cnt_o (row_index[i] ), - .empty_o (/* Unused */ ) + ) i_log_row_bits ( + .in_i (rows_das_i[i][RowsWidth-1:0]), + .cnt_o (row_bits[i] ), + .empty_o (/* Unused */ ) ); end + logic [NumDASPartitions-1:0][AddrWidth-1:0] lsb_addr; + logic [NumDASPartitions-1:0][AddrWidth-1:0] start_row_addr; + logic [NumDASPartitions-1:0][AddrWidth-1:0] row_addr; + logic [NumDASPartitions-1:0][AddrWidth-1:0] prt_addr; + logic [NumDASPartitions-1:0][AddrWidth-1:0] msb_addr; + logic [NumDASPartitions-1:0][AddrWidth-1:0] aligned_addr; + always_comb begin + // Default: Unscrambled - address_o[ConstantBitsLSB-1:0] = address_i[ConstantBitsLSB-1:0]; - address_o[SeqTotalBits-1:ConstantBitsLSB] = {tile_id, scramble}; - address_o[AddrWidth-1:SeqTotalBits] = address_i[AddrWidth-1:SeqTotalBits]; + address_o = address_i; // Stack Region - if (address_i < (NumTiles 
* SeqMemSizePerTile)) begin + if (address_i < (NumTiles * SeqMemSizePerTile)) begin: gen_stack_scrambling + address_o[ConstantBitsLSB-1:0] = address_i[ConstantBitsLSB-1:0]; address_o[SeqTotalBits-1:ConstantBitsLSB] = {scramble, tile_id}; + address_o[AddrWidth-1:SeqTotalBits] = address_i[AddrWidth-1:SeqTotalBits]; // DAS address scrambling - end else begin + end else begin: gen_das_scrambling for (int p = 0; p < NumDASPartitions; p++) begin - if ( (address_i >= start_addr_scheme_i[0]) && (address_i < start_addr_scheme_i[0]+MemSizePerRow*allocated_size_i[0]) ) begin - address_o = '0; - address_o |= address_i & ((1 << (tile_index[0]+ConstantBitsLSB)) - 1); - address_o |= ((address_i >> (row_index[0]+tile_index[0]+ConstantBitsLSB)) << (tile_index[0]+ConstantBitsLSB)) & ((1 << (TileIdBits+ConstantBitsLSB)) - 1); - address_o |= ((address_i >> (tile_index[0]+ConstantBitsLSB)) << (TileIdBits + ConstantBitsLSB)) & ((1 << (row_index[0]+TileIdBits+ConstantBitsLSB)) - 1); - address_o |= address_i & ~((1 << (row_index[0]+TileIdBits+ConstantBitsLSB)) - 1); + if ( (address_i >= start_das_i[p]) && (address_i < start_das_i[p]+MemSizePerRow*rows_das_i[p]) ) begin + + lsb_addr[p] = address_i & ((1 << (tile_bits[p]+ConstantBitsLSB)) - 1); + msb_addr[p] = address_i & ~((1 << (row_bits[p]+TileIdBits+ConstantBitsLSB)) - 1); + start_row_addr[p] = start_das_i[p] & (((1 << row_bits[p]) - 1) << (TileIdBits + ConstantBitsLSB)); + aligned_addr[p] = address_i - start_row_addr[p]; + + prt_addr[p] = (aligned_addr[p] >> row_bits[p] ) & (((1 << (TileIdBits - tile_bits[p])) - 1) << (ConstantBitsLSB + tile_bits[p])); + row_addr[p] = (aligned_addr[p] << (TileIdBits - tile_bits[p])) & (((1 << (row_bits[p]) ) - 1) << (TileIdBits + ConstantBitsLSB )); + address_o = msb_addr[p] | row_addr[p] | prt_addr[p] | lsb_addr[p]; + address_o = address_o + row_addr[p]; + end end diff --git a/hardware/src/mempool_pkg.sv b/hardware/src/mempool_pkg.sv index f320b9e9a..d014cda6b 100644 --- a/hardware/src/mempool_pkg.sv 
+++ b/hardware/src/mempool_pkg.sv @@ -48,7 +48,8 @@ package mempool_pkg; localparam integer unsigned NumDASPartitions = `ifdef NUM_DAS_PARTITIONS `NUM_DAS_PARTITIONS `else 0 `endif; localparam integer unsigned DASMemSize = `ifdef DAS_MEM_SIZE `DAS_MEM_SIZE `else 0 `endif; localparam integer unsigned DASStartAddr = (NumBanks * TCDMSizePerBank) - NumCores * DASMemSize; - localparam integer unsigned PartitionDataWidth = $clog2(NumTiles)+1; // only support group_factor={128, 64, 32, 16, 8, 4, 2, 1} + localparam integer unsigned TileInterleavingWidth = idx_width(NumTiles) + 1; // only support {128, 64, 32, 16, 8, 4, 2, 1}; + localparam integer unsigned RowsInterleavingWidth = idx_width(TCDMSizePerBank) - ByteOffset + 1; // L2 localparam integer unsigned L2Size = `ifdef L2_SIZE `L2_SIZE `else 0 `endif; // [B] From 12cdd27adfcf38b65ba5b76fd7a21cadc0b52e4c Mon Sep 17 00:00:00 2001 From: mbertuletti Date: Thu, 30 Oct 2025 12:26:21 +0100 Subject: [PATCH 27/34] [software] Add test for non-interleaved scrambling --- software/runtime/runtime.h | 100 +++++++----------- software/runtime/runtime.mk | 1 + software/runtime/synchronization.c | 8 -- .../tests/baremetal/das_static_test/main.c | 64 +++++++++++ 4 files changed, 105 insertions(+), 68 deletions(-) create mode 100644 software/tests/baremetal/das_static_test/main.c diff --git a/software/runtime/runtime.h b/software/runtime/runtime.h index 133d6c072..d5d977038 100644 --- a/software/runtime/runtime.h +++ b/software/runtime/runtime.h @@ -56,44 +56,44 @@ static uint32_t volatile *wake_up_offset_reg = /* DAS-related regs */ -static uint32_t volatile *partition0_reg = +static uint32_t volatile *partition_0_reg = (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + CONTROL_REGISTERS_PARTITION_SEL_0_REG_OFFSET); -static uint32_t volatile *partition1_reg = +static uint32_t volatile *partition_1_reg = (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + CONTROL_REGISTERS_PARTITION_SEL_1_REG_OFFSET); -static uint32_t volatile *partition2_reg = 
+static uint32_t volatile *partition_2_reg = (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + CONTROL_REGISTERS_PARTITION_SEL_2_REG_OFFSET); -static uint32_t volatile *partition3_reg = +static uint32_t volatile *partition_3_reg = (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + CONTROL_REGISTERS_PARTITION_SEL_3_REG_OFFSET); -static uint32_t volatile *start_addr_scheme0_reg = +static uint32_t volatile *start_das_0_reg = (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + - CONTROL_REGISTERS_START_ADDR_SCHEME_0_REG_OFFSET); -static uint32_t volatile *start_addr_scheme1_reg = + CONTROL_REGISTERS_START_DAS_0_REG_OFFSET); +static uint32_t volatile *start_das_1_reg = (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + - CONTROL_REGISTERS_START_ADDR_SCHEME_1_REG_OFFSET); -static uint32_t volatile *start_addr_scheme2_reg = + CONTROL_REGISTERS_START_DAS_1_REG_OFFSET); +static uint32_t volatile *start_das_2_reg = (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + - CONTROL_REGISTERS_START_ADDR_SCHEME_2_REG_OFFSET); -static uint32_t volatile *start_addr_scheme3_reg = + CONTROL_REGISTERS_START_DAS_2_REG_OFFSET); +static uint32_t volatile *start_das_3_reg = (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + - CONTROL_REGISTERS_START_ADDR_SCHEME_3_REG_OFFSET); + CONTROL_REGISTERS_START_DAS_3_REG_OFFSET); -static uint32_t volatile *allocated_size0_reg = +static uint32_t volatile *rows_das_0_reg = (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + - CONTROL_REGISTERS_ALLOCATED_SIZE_0_REG_OFFSET); -static uint32_t volatile *allocated_size1_reg = + CONTROL_REGISTERS_ROWS_DAS_0_REG_OFFSET); +static uint32_t volatile *rows_das_1_reg = (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + - CONTROL_REGISTERS_ALLOCATED_SIZE_1_REG_OFFSET); -static uint32_t volatile *allocated_size2_reg = + CONTROL_REGISTERS_ROWS_DAS_1_REG_OFFSET); +static uint32_t volatile *rows_das_2_reg = (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + - CONTROL_REGISTERS_ALLOCATED_SIZE_2_REG_OFFSET); -static uint32_t volatile 
*allocated_size3_reg = + CONTROL_REGISTERS_ROWS_DAS_2_REG_OFFSET); +static uint32_t volatile *rows_das_3_reg = (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + - CONTROL_REGISTERS_ALLOCATED_SIZE_3_REG_OFFSET); + CONTROL_REGISTERS_ROWS_DAS_3_REG_OFFSET); typedef uint32_t mempool_id_t; typedef uint32_t mempool_timer_t; @@ -291,57 +291,37 @@ static inline void set_wake_up_offset(uint32_t offset) { } // Partition Configuration -static inline void partition_config(uint32_t reg_sel, - uint32_t tiles_per_partition) { +static inline void das_config(uint32_t reg_sel, uint32_t tiles_per_partition, uint32_t addr, uint32_t size) { asm volatile("" ::: "memory"); + // Compute number of rows + uint32_t row_bytes = NUM_BANKS * sizeof(uint32_t); + uint32_t rows_das = (size + (row_bytes - 1)) / row_bytes; + // Program DAS registers switch (reg_sel) { case 0: - *partition0_reg = tiles_per_partition; + *partition_0_reg = tiles_per_partition; + *start_das_0_reg = addr; + *rows_das_0_reg = rows_das; break; case 1: - *partition1_reg = tiles_per_partition; + *partition_1_reg = tiles_per_partition; + *start_das_1_reg = addr; + *rows_das_1_reg = rows_das; break; case 2: - *partition2_reg = tiles_per_partition; + *partition_2_reg = tiles_per_partition; + *start_das_2_reg = addr; + *rows_das_2_reg = rows_das; break; case 3: - *partition3_reg = tiles_per_partition; + *partition_3_reg = tiles_per_partition; + *start_das_3_reg = addr; + *rows_das_3_reg = rows_das; break; default: - *partition0_reg = tiles_per_partition; - break; - } - asm volatile("" ::: "memory"); -} - -// reg_sel = {3, 2, 1, 0} -static inline void start_addr_scheme_config(uint32_t reg_sel, uint32_t addr, - uint32_t size) { - asm volatile("" ::: "memory"); - uint32_t data_size = size > 2 * NUM_BANKS * sizeof(uint32_t) - ? 
size - : 2 * NUM_BANKS * sizeof(uint32_t); - uint32_t allocated_size = data_size / (NUM_BANKS * sizeof(uint32_t)); - switch (reg_sel) { - case 0: - *start_addr_scheme0_reg = addr; - *allocated_size0_reg = allocated_size; - break; - case 1: - *start_addr_scheme1_reg = addr; - *allocated_size1_reg = allocated_size; - break; - case 2: - *start_addr_scheme2_reg = addr; - *allocated_size2_reg = allocated_size; - break; - case 3: - *start_addr_scheme3_reg = addr; - *allocated_size3_reg = allocated_size; - break; - default: - *start_addr_scheme0_reg = addr; - *allocated_size0_reg = allocated_size; + *partition_0_reg = tiles_per_partition; + *start_das_0_reg = addr; + *rows_das_0_reg = rows_das; break; } asm volatile("" ::: "memory"); diff --git a/software/runtime/runtime.mk b/software/runtime/runtime.mk index 4d485dd22..039473a6e 100644 --- a/software/runtime/runtime.mk +++ b/software/runtime/runtime.mk @@ -89,6 +89,7 @@ RISCV_STRIP ?= $(RISCV_PREFIX)strip # Defines DEFINES += -DPRINTF_DISABLE_SUPPORT_FLOAT -DPRINTF_DISABLE_SUPPORT_LONG_LONG -DPRINTF_DISABLE_SUPPORT_PTRDIFF_T DEFINES += -DNUM_CORES=$(num_cores) +DEFINES += -DLOG2_NUM_CORES=$(shell awk 'BEGIN{print log($(num_cores))/log(2)}') DEFINES += -DNUM_GROUPS=$(num_groups) DEFINES += -DNUM_CORES_PER_TILE=$(num_cores_per_tile) DEFINES += -DBANKING_FACTOR=$(banking_factor) diff --git a/software/runtime/synchronization.c b/software/runtime/synchronization.c index c3c3846f8..93fac8321 100644 --- a/software/runtime/synchronization.c +++ b/software/runtime/synchronization.c @@ -10,14 +10,6 @@ #include "runtime.h" #include "synchronization.h" -#if NUM_CORES == (16) -#define LOG2_NUM_CORES (4) -#elif NUM_CORES == (256) -#define LOG2_NUM_CORES (8) -#elif NUM_CORES == (1024) -#define LOG2_NUM_CORES (10) -#endif - uint32_t volatile barrier __attribute__((section(".l1"))); uint32_t volatile log_barrier[NUM_CORES * 4] __attribute__((aligned(NUM_CORES * 4), section(".l1"))); diff --git 
a/software/tests/baremetal/das_static_test/main.c b/software/tests/baremetal/das_static_test/main.c new file mode 100644 index 000000000..3d38995a1 --- /dev/null +++ b/software/tests/baremetal/das_static_test/main.c @@ -0,0 +1,64 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Note: This test is only for Terapool dynamic heap allocation +// Author: Bowen Wang + +#include +#include + +#include "alloc.h" +#include "encoding.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" + +#define NUM_TILES (NUM_CORES / NUM_CORES_PER_TILE) +#define ARRAY_SIZE (4096) + +uint32_t array[ARRAY_SIZE] __attribute__((aligned(NUM_BANKS*sizeof(int32_t)), section(".l1_prio"))); + +int main() { + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + mempool_barrier_init(core_id); + + if (core_id == 0) { + + // -------------------------------------------- + // Verify DAS partitions + // -------------------------------------------- + printf("Verify DAS partitions\n\n"); + + uint32_t num_tiles_per_partition = 4; + uint32_t part_id = 0; + + uint32_t num_partitions = NUM_TILES / num_tiles_per_partition; + uint32_t size_partition = ARRAY_SIZE / num_partitions; + + das_config(part_id, num_tiles_per_partition, (uint32_t)(array), ARRAY_SIZE * sizeof(uint32_t)); + for (uint32_t i = 0; i < ARRAY_SIZE; i++) { + array[i] = i; + } + + das_config(part_id, NUM_TILES, (uint32_t)(array), ARRAY_SIZE * sizeof(uint32_t)); + for (uint32_t j = 0; j < num_partitions; j++) { + for (uint32_t i = 0; i < size_partition; i++) { + + uint32_t *fetch_address = &array[0] + + j * (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR) + + (i % (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) + + (i / (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) * NUM_BANKS; + if (i + j * size_partition 
!= *fetch_address) { + printf("%4d != %4d at address %8X.\n", i + j * size_partition, *fetch_address, fetch_address); + return 1; + } + } + } + printf("SUCCESS on partition %d\n", part_id); + } + + mempool_barrier(num_cores); + return 0; +} From 50f43265214fa2d1f753524470b6461450541e02 Mon Sep 17 00:00:00 2001 From: bowwang Date: Fri, 7 Nov 2025 14:24:40 +0100 Subject: [PATCH 28/34] [hardware] fix hardware alignment calculation --- hardware/src/address_scrambler.sv | 47 ++++++++++++++++--------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/hardware/src/address_scrambler.sv b/hardware/src/address_scrambler.sv index bc087675b..9ad4c7116 100644 --- a/hardware/src/address_scrambler.sv +++ b/hardware/src/address_scrambler.sv @@ -20,15 +20,16 @@ module address_scrambler #( parameter int unsigned SeqMemSizePerTile = 4096, parameter int unsigned NumDASPartitions = 4, // Dependant parameters, do not change - parameter int unsigned RowsWidth = $clog2(TCDMSizePerBank) - ByteOffset + 1, - parameter int unsigned MemSizePerTile = NumBanksPerTile*TCDMSizePerBank, - parameter int unsigned MemSizePerRow = (1 << ByteOffset)*NumBanksPerTile*NumTiles + parameter int unsigned RowsWidth = $clog2(TCDMSizePerBank) - ByteOffset + 1, + parameter int unsigned MaxPartitionRowWidth = $clog2(TCDMSizePerBank) - ByteOffset, // maximum half of L1 + parameter int unsigned MemSizePerTile = NumBanksPerTile*TCDMSizePerBank, + parameter int unsigned MemSizePerRow = (1 << ByteOffset)*NumBanksPerTile*NumTiles ) ( - input logic [AddrWidth-1:0] address_i, - input logic [NumDASPartitions-1:0][$clog2(NumTiles):0] partition_sel_i, - input logic [NumDASPartitions-1:0][AddrWidth-1:0] start_das_i, - input logic [NumDASPartitions-1:0][RowsWidth-1:0] rows_das_i, - output logic [AddrWidth-1:0] address_o + input logic [AddrWidth-1:0] address_i, + input logic [NumDASPartitions-1:0][$clog2(NumTiles):0] partition_sel_i, + input logic [NumDASPartitions-1:0][AddrWidth-1:0] start_das_i, + input 
logic [NumDASPartitions-1:0][MaxPartitionRowWidth-1:0] rows_das_i, + output logic [AddrWidth-1:0] address_o ); // Stack Sequential Settings localparam int unsigned BankOffsetBits = $clog2(NumBanksPerTile); @@ -54,27 +55,27 @@ module address_scrambler #( // ------ Heap Sequential Signals ------ // - // `tile_bits` : how many bits to shift for TileID bits in each partition - // `row_bits`: how many bits need to swap within Row Index - logic [NumDASPartitions-1:0][$clog2($clog2(NumTiles)+1)-1:0] tile_bits; - logic [NumDASPartitions-1:0][$clog2(RowsWidth)-1:0] row_bits; + // `tile_bits` : how many fixed TileID bits + // `row_bits` : how many bits need to swap to the start of Row Index + logic [NumDASPartitions-1:0][$clog2($clog2(NumTiles)+1)-1:0] tile_bits; + logic [NumDASPartitions-1:0][$clog2(MaxPartitionRowWidth)-1:0] row_bits; for (genvar i = 0; i < NumDASPartitions; i++) begin : gen_shift_index lzc #( - .WIDTH ($clog2(NumTiles)+1), - .MODE (1'b0 ) + .WIDTH ($clog2(NumTiles)+1 ), + .MODE (1'b0 ) ) i_log_tile_bits ( - .in_i (partition_sel_i[i]), - .cnt_o (tile_bits[i] ), - .empty_o (/* Unused */ ) + .in_i (partition_sel_i[i] ), + .cnt_o (tile_bits[i] ), + .empty_o (/* Unused */ ) ); lzc #( - .WIDTH (RowsWidth ), - .MODE (1'b0 ) + .WIDTH (MaxPartitionRowWidth ), + .MODE (1'b0 ) ) i_log_row_bits ( - .in_i (rows_das_i[i][RowsWidth-1:0]), - .cnt_o (row_bits[i] ), - .empty_o (/* Unused */ ) + .in_i (rows_das_i[i] ), + .cnt_o (row_bits[i] ), + .empty_o (/* Unused */ ) ); end @@ -110,7 +111,7 @@ module address_scrambler #( prt_addr[p] = (aligned_addr[p] >> row_bits[p] ) & (((1 << (TileIdBits - tile_bits[p])) - 1) << (ConstantBitsLSB + tile_bits[p])); row_addr[p] = (aligned_addr[p] << (TileIdBits - tile_bits[p])) & (((1 << (row_bits[p]) ) - 1) << (TileIdBits + ConstantBitsLSB )); address_o = msb_addr[p] | row_addr[p] | prt_addr[p] | lsb_addr[p]; - address_o = address_o + row_addr[p]; + address_o = address_o + start_row_addr[p]; end end From 
6929c05b904cef8341b0c9de98cf2f9899fd4174 Mon Sep 17 00:00:00 2001 From: bowwang Date: Fri, 7 Nov 2025 14:25:18 +0100 Subject: [PATCH 29/34] [software] adapt das_config to the new API --- software/tests/baremetal/das_malloc_test/main.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/software/tests/baremetal/das_malloc_test/main.c b/software/tests/baremetal/das_malloc_test/main.c index c55708233..876075f29 100644 --- a/software/tests/baremetal/das_malloc_test/main.c +++ b/software/tests/baremetal/das_malloc_test/main.c @@ -47,15 +47,13 @@ int main() { uint32_t *array = (uint32_t *)partition_malloc( dynamic_heap_alloc, array_size * sizeof(uint32_t)); // 5. Config the hardware registers - partition_config(part_id, num_tiles_per_partition); - start_addr_scheme_config(part_id, (uint32_t)(*array), - array_size * sizeof(uint32_t)); + das_config(part_id, num_tiles_per_partition, (uint32_t)(*array), array_size * sizeof(uint32_t)); // 6. Move data for (uint32_t i = 0; i < array_size; i++) { array[i] = i; } // 7. Change addressing scheme (to fully interleaved) - partition_config(part_id, NUM_TILES); + das_config(part_id, NUM_TILES, (uint32_t)(*array), array_size * sizeof(uint32_t)); // 8. check for (uint32_t i = 0; i < array_size; i++) { uint32_t *fetch_address = @@ -94,15 +92,14 @@ int main() { uint32_t *array = (uint32_t *)partition_malloc( dynamic_heap_alloc, array_size * sizeof(uint32_t)); // 5. Config the hardware registers - partition_config(part_id, num_tiles_per_partition); - start_addr_scheme_config(part_id, (uint32_t)(*array), - array_size * sizeof(uint32_t)); + das_config(part_id, num_tiles_per_partition, (uint32_t)(*array), array_size * sizeof(uint32_t)); // 6. Move data for (uint32_t i = 0; i < array_size; i++) { array[i] = i; } // 7. 
Change addressing scheme (to fully interleaved) - partition_config(part_id, NUM_TILES); + das_config(part_id, NUM_TILES, (uint32_t)(*array), array_size * sizeof(uint32_t)); + // partition_config(part_id, NUM_TILES); // 8. check for (uint32_t i = 0; i < array_size; i++) { uint32_t *fetch_address = From 595e0055eeb2d4b825f0d3ebd794b8f517e86112 Mon Sep 17 00:00:00 2001 From: bowwang Date: Fri, 7 Nov 2025 14:36:01 +0100 Subject: [PATCH 30/34] [hardware] remove unnecessary bit for RowsInterleavingWidth --- hardware/src/mempool_pkg.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hardware/src/mempool_pkg.sv b/hardware/src/mempool_pkg.sv index d014cda6b..91b21ba2a 100644 --- a/hardware/src/mempool_pkg.sv +++ b/hardware/src/mempool_pkg.sv @@ -49,7 +49,7 @@ package mempool_pkg; localparam integer unsigned DASMemSize = `ifdef DAS_MEM_SIZE `DAS_MEM_SIZE `else 0 `endif; localparam integer unsigned DASStartAddr = (NumBanks * TCDMSizePerBank) - NumCores * DASMemSize; localparam integer unsigned TileInterleavingWidth = idx_width(NumTiles) + 1; // only support {128, 64, 32, 16, 8, 4, 2, 1}; - localparam integer unsigned RowsInterleavingWidth = idx_width(TCDMSizePerBank) - ByteOffset + 1; + localparam integer unsigned RowsInterleavingWidth = idx_width(TCDMSizePerBank) - ByteOffset; // L2 localparam integer unsigned L2Size = `ifdef L2_SIZE `L2_SIZE `else 0 `endif; // [B] From 4b65f4e3044932ea764dfe968c2602e66926530e Mon Sep 17 00:00:00 2001 From: bowwang Date: Fri, 7 Nov 2025 14:36:44 +0100 Subject: [PATCH 31/34] [software] enforce minimum 2 rows per partition --- software/runtime/runtime.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/software/runtime/runtime.h b/software/runtime/runtime.h index d5d977038..7516a0aeb 100644 --- a/software/runtime/runtime.h +++ b/software/runtime/runtime.h @@ -295,7 +295,12 @@ static inline void das_config(uint32_t reg_sel, uint32_t tiles_per_partition, ui asm volatile("" ::: "memory"); // Compute number 
of rows uint32_t row_bytes = NUM_BANKS * sizeof(uint32_t); - uint32_t rows_das = (size + (row_bytes - 1)) / row_bytes; + uint32_t rows_das = (size + (row_bytes-1)) / row_bytes; + + // enforce minimum 2 rows per partition + // TODO (bowwang): should add protection to enforce `rows_das` is power of 2 + if (rows_das < 2) rows_das = 2; + // Program DAS registers switch (reg_sel) { case 0: From 3090712d57adf00aaaca3a0c9411c3f036048cff Mon Sep 17 00:00:00 2001 From: bowwang Date: Fri, 7 Nov 2025 18:05:57 +0100 Subject: [PATCH 32/34] [hardware] adapt to per-row alignment requirement --- hardware/src/address_scrambler.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hardware/src/address_scrambler.sv b/hardware/src/address_scrambler.sv index 9ad4c7116..0910eea1a 100644 --- a/hardware/src/address_scrambler.sv +++ b/hardware/src/address_scrambler.sv @@ -101,7 +101,7 @@ module address_scrambler #( end else begin: gen_das_scrambling for (int p = 0; p < NumDASPartitions; p++) begin - if ( (address_i >= start_das_i[p]) && (address_i < start_das_i[p]+MemSizePerRow*rows_das_i[p]) ) begin + if ( (address_i >= start_das_i[p]) && (address_i < start_das_i[p]+MemSizePerRow*rows_das_i[p]) && (partition_sel_i[p] != NumTiles) ) begin lsb_addr[p] = address_i & ((1 << (tile_bits[p]+ConstantBitsLSB)) - 1); msb_addr[p] = address_i & ~((1 << (row_bits[p]+TileIdBits+ConstantBitsLSB)) - 1); From 2f2521a3834cd806afddc34fdc12c408f8213205 Mon Sep 17 00:00:00 2001 From: bowwang Date: Fri, 7 Nov 2025 18:06:30 +0100 Subject: [PATCH 33/34] [software] adapt alignment calculation to per-row requirement --- software/runtime/alloc.c | 51 ++++++++++------------------------------ 1 file changed, 13 insertions(+), 38 deletions(-) diff --git a/software/runtime/alloc.c b/software/runtime/alloc.c index 2b9924b66..eed230dcb 100644 --- a/software/runtime/alloc.c +++ b/software/runtime/alloc.c @@ -138,37 +138,19 @@ static void *allocate_memory(alloc_t *alloc, const uint32_t size) { } // ------ 
Function to calculate the aligned size ------ // -static uint32_t calc_aligned_size(uint32_t *addr, - const uint32_t allocated_size) { - // interpret the addr - uint32_t tmp = allocated_size; - uint32_t log = 0; // log2 of 0 is undefined, handled as special case if needed - while (tmp >>= 1) { // Shift right until value is 0 - ++log; - } - uint32_t mask = (uint32_t)((1 << log) - 1); - uint32_t row_id, tile_id, offset; - offset = ((uint32_t)addr) & 0x7F; - tile_id = ((uint32_t)addr >> 7) & 0x7F; - row_id = ((uint32_t)addr >> 14) & 0xFF; - row_id &= mask; - - uint32_t shift_size = 0; - if ((offset == 0) && (row_id == 0) && (tile_id == 0)) { - shift_size = 0; - } else { - uint32_t aligned_boundary = 4096 * 4 * allocated_size; - uint32_t modified_curr = (row_id << 14) | (tile_id << 7) | offset; - shift_size = aligned_boundary - modified_curr; - } +static uint32_t calc_aligned_row_size(uint32_t *addr) { + + const uint32_t row_bytes = NUM_BANKS * sizeof(uint32_t); + const uint32_t mask = (uint32_t)(row_bytes - 1); + uint32_t offset = ((uint32_t)addr) & mask; - return shift_size; + return (row_bytes - offset) & mask; } + // ------ Parameters ------ // // size: Size of the data block need to be allocated // allocated_size: How many rows the current partition scheme occupied -static void *allocate_memory_aligned(alloc_t *alloc, const uint32_t size, - const uint32_t allocated_size) { +static void *allocate_memory_aligned(alloc_t *alloc, const uint32_t size) { // Get first block of linked list of free blocks alloc_block_t *curr = alloc->first_block; alloc_block_t *prev = 0; @@ -176,14 +158,13 @@ static void *allocate_memory_aligned(alloc_t *alloc, const uint32_t size, // Search first block large enough in linked list // 1. 
calculate the size aligned to the partition boundary uint32_t shift_size = 0; - shift_size = calc_aligned_size((uint32_t *)curr, allocated_size); + shift_size = calc_aligned_row_size((uint32_t *)curr); uint32_t aligned_size = size + shift_size; - // while (curr && (curr->size < size)) { while (curr && (curr->size < aligned_size)) { prev = curr; curr = curr->next; - shift_size = calc_aligned_size((uint32_t *)curr, allocated_size); + shift_size = calc_aligned_row_size((uint32_t *)curr); aligned_size = size + shift_size; } printf("Dynamic Allocator >> size [%d] --- shift size [%d] --- aligned size " @@ -292,7 +273,6 @@ void *partition_malloc(alloc_t *alloc, const uint32_t size) { uint32_t data_size = size > 2 * NUM_BANKS * sizeof(uint32_t) ? size : 2 * NUM_BANKS * sizeof(uint32_t); - uint32_t allocated_size = data_size / (NUM_BANKS * sizeof(uint32_t)); uint32_t block_size = ALIGN_UP(data_size, MIN_BLOCK_SIZE); // add alignment // Check if exceed maximum allowed size @@ -303,14 +283,9 @@ void *partition_malloc(alloc_t *alloc, const uint32_t size) { // allocate void *block_ptr = NULL; - if (allocated_size < 2) { - block_ptr = allocate_memory(alloc, block_size); - } else { - block_ptr = allocate_memory_aligned(alloc, block_size, allocated_size); - } - // void *block_ptr = allocate_memory(alloc, block_size); - // void *block_ptr = allocate_memory_aligned(alloc, block_size, - // allocated_size); + block_ptr = allocate_memory_aligned(alloc, block_size); + + if (!block_ptr) { printf("Memory allocator: No large enough block found (%d)\n", block_size); return NULL; From f52591ea29eb2755f634b4038cdb9f6606d84003 Mon Sep 17 00:00:00 2001 From: bowwang Date: Fri, 7 Nov 2025 18:06:49 +0100 Subject: [PATCH 34/34] [test] add misaligned malloc cases --- .../tests/baremetal/das_malloc_test/main.c | 62 +++++++++++++++++-- 1 file changed, 58 insertions(+), 4 deletions(-) diff --git a/software/tests/baremetal/das_malloc_test/main.c b/software/tests/baremetal/das_malloc_test/main.c index 
876075f29..61e58c4db 100644 --- a/software/tests/baremetal/das_malloc_test/main.c +++ b/software/tests/baremetal/das_malloc_test/main.c @@ -43,17 +43,70 @@ int main() { // 3. Get the allocator alloc_t *dynamic_heap_alloc = get_dynamic_heap_alloc(); alloc_dump(dynamic_heap_alloc); + + // 4. Allocate memory + uint32_t *array = (uint32_t *)partition_malloc( + dynamic_heap_alloc, array_size * sizeof(uint32_t)); + printf("start_addr at 0x%8x\n", array); + + // 5. Config the hardware registers + das_config(part_id, num_tiles_per_partition, (uint32_t)(array), array_size * sizeof(uint32_t)); + // 6. Move data + for (uint32_t i = 0; i < array_size; i++) { + array[i] = i; + } + // 7. Change addressing scheme (to fully interleaved) + das_config(part_id, NUM_TILES, (uint32_t)(array), array_size * sizeof(uint32_t)); + // 8. check + for (uint32_t i = 0; i < array_size; i++) { + uint32_t *fetch_address = + &array[0] + + (i % + (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) + + (i / + (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) * + NUM_BANKS; + if (i != *fetch_address) { + printf("%4d != %4d at address %8X.\n", i, *fetch_address, + fetch_address); + return 1; + } + } + // 9. Free array + partition_free(dynamic_heap_alloc, array); + printf("SUCCESS on partition %d \n\n", part_id); + } + + // -------------------------------------------- + // Verify DAS partitions with misalignment + // -------------------------------------------- + printf("Verify DAS partitions with misalignemnt\n\n"); + + // 2. Set which partition write to. + for (uint32_t part_id = 0; part_id < NUM_DAS_PARTITIONS; part_id++) { + // 3. 
Get the allocator + alloc_t *dynamic_heap_alloc = get_dynamic_heap_alloc(); + alloc_dump(dynamic_heap_alloc); + + // 4.0 inject misalignment + uint32_t offset = 32 * (1+part_id); + uint32_t *misalign = (uint32_t *)partition_malloc( + dynamic_heap_alloc, (2*NUM_BANKS + offset) * sizeof(uint32_t)); + printf("Inject misalignment at 0x%8x with size 0x%8x in byte\n", misalign, offset*part_id); + // 4. Allocate memory uint32_t *array = (uint32_t *)partition_malloc( dynamic_heap_alloc, array_size * sizeof(uint32_t)); + printf("Aligned start_addr at 0x%8x\n", array); + // 5. Config the hardware registers - das_config(part_id, num_tiles_per_partition, (uint32_t)(*array), array_size * sizeof(uint32_t)); + das_config(part_id, num_tiles_per_partition, (uint32_t)(array), array_size * sizeof(uint32_t)); // 6. Move data for (uint32_t i = 0; i < array_size; i++) { array[i] = i; } // 7. Change addressing scheme (to fully interleaved) - das_config(part_id, NUM_TILES, (uint32_t)(*array), array_size * sizeof(uint32_t)); + das_config(part_id, NUM_TILES, (uint32_t)(array), array_size * sizeof(uint32_t)); // 8. check for (uint32_t i = 0; i < array_size; i++) { uint32_t *fetch_address = @@ -71,6 +124,7 @@ int main() { } // 9. Free array partition_free(dynamic_heap_alloc, array); + partition_free(dynamic_heap_alloc, misalign); printf("SUCCESS on partition %d \n\n", part_id); } @@ -92,13 +146,13 @@ int main() { uint32_t *array = (uint32_t *)partition_malloc( dynamic_heap_alloc, array_size * sizeof(uint32_t)); // 5. Config the hardware registers - das_config(part_id, num_tiles_per_partition, (uint32_t)(*array), array_size * sizeof(uint32_t)); + das_config(part_id, num_tiles_per_partition, (uint32_t)(array), array_size * sizeof(uint32_t)); // 6. Move data for (uint32_t i = 0; i < array_size; i++) { array[i] = i; } // 7. 
Change addressing scheme (to fully interleaved) - das_config(part_id, NUM_TILES, (uint32_t)(*array), array_size * sizeof(uint32_t)); + das_config(part_id, NUM_TILES, (uint32_t)(array), array_size * sizeof(uint32_t)); // partition_config(part_id, NUM_TILES); // 8. check for (uint32_t i = 0; i < array_size; i++) {