Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
fb21be6
[das] init open source
May 10, 2025
1989369
[config] add terapool-das configurations
May 11, 2025
2b5cc29
[traffic generator] bandwidth benchmark in selected number of tiles
May 11, 2025
320854a
[hw] add DAS support in idma
May 11, 2025
afad72f
[hw] add DAS control logic to idma
May 11, 2025
4087bf4
[hw] extend scrambler for das
May 11, 2025
6cfabd8
[hw] add das to terapool
May 11, 2025
bd0386b
[sw] add das runtime support
May 11, 2025
6ef5845
[hardware] Add DAS registers and keep only one DMA transfer option
mbertuletti Oct 21, 2025
9199936
[hardware] Correct address scrambler copy-pasted code
mbertuletti Oct 21, 2025
c155c4d
[config] Parametrize feature with meaningful define
mbertuletti Oct 21, 2025
a92a71f
[software] Rename "group_factor" with meaningful tile grouping
mbertuletti Oct 21, 2025
833ae78
[software] Add partition test
mbertuletti Oct 21, 2025
699eb47
[hardware] Group DAS registers and assign external reset
mbertuletti Oct 22, 2025
bbf4280
[software] Streamline allocations in dynamic address regions
mbertuletti Oct 22, 2025
56654a5
[hardware] Correct waves display
mbertuletti Oct 22, 2025
9c02172
[hardware] Streamline address scrambler
mbertuletti Oct 22, 2025
d26b44a
[software] Allocation stress-test over multiple partitions & Tile-groups
mbertuletti Oct 22, 2025
92242b7
[hardware] Trash redundant file
mbertuletti Oct 22, 2025
543027b
[software] Correct format
mbertuletti Oct 22, 2025
473bd0e
[software] Remove DMA with mode selection
mbertuletti Oct 22, 2025
737d106
[hardware] Correct scrambler parametrization
mbertuletti Oct 23, 2025
ee0ff54
[hardware] Parametrize DMA for DAS
mbertuletti Oct 23, 2025
58fdcd4
[hardware] fixed the DMA Midend bug, roll back to standard mode
Oct 24, 2025
cb090bf
[hardware] Change names of DAS signals
mbertuletti Oct 30, 2025
1982a12
[hardware] Modify address scrambler for non-aligned scrambling
mbertuletti Oct 30, 2025
12cdd27
[software] Add test for non-interleaved scrambling
mbertuletti Oct 30, 2025
50f4326
[hardware] fix hardware alignment calculation
Nov 7, 2025
6929c05
[software] adapt das_config to the new API
Nov 7, 2025
595e005
[hardware] remove unnecessary bit for RowsInterleavingWidth
Nov 7, 2025
4b65f4e
[software] enforce minimum 2 rows per partition
Nov 7, 2025
3090712
[hardware] adapt to per-row alignment requirement
Nov 7, 2025
2f2521a
[software] adapt alignment calculation to per-row requirement
Nov 7, 2025
f52591e
[test] add misaligned malloc cases
Nov 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
[![ci](https://github.com/pulp-platform/mempool/actions/workflows/ci.yml/badge.svg)](https://github.com/pulp-platform/mempool/actions/workflows/ci.yml)
[![lint](https://github.com/pulp-platform/mempool/actions/workflows/lint.yml/badge.svg)](https://github.com/pulp-platform/mempool/actions/workflows/lint.yml)
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
# MemPool Dynamic Allocation Scheme
The Dynamic Allocation Scheme (DAS) is a flexible, adaptable, runtime-configurable address mapping technique. DAS remaps contiguous address spaces to physically adjacent memory banks based on the workload’s memory access patterns, placing the data physically close to the processing elements (PEs).

This repository branch contains DAS extensions based on MemPool.

# MemPool

Expand Down
6 changes: 6 additions & 0 deletions config/config.mk
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,12 @@ zquarterinx ?= 0
# DivSqrt deactivated by default
xDivSqrt ?= 0

# Enable configurable addressing scheme in the heap
das ?= 0
num_das_partitions ?= 4
# Size of DAS-heap per core
das_mem_size ?= 2048

# This parameter is only used for TeraPool configurations
num_sub_groups_per_group ?= 1
remote_group_latency_cycles ?= 7
Expand Down
2 changes: 1 addition & 1 deletion config/terapool.mk
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,4 @@ dmas_per_group ?= 4 # Burst Length = 16

# L2 Banks/Channels
l2_banks = 16
l2_size ?= 16777216 # 1000000
l2_size ?= 16777216 # 1000000
11 changes: 11 additions & 0 deletions hardware/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@ vlog_defs += -DL2_SIZE=32\'d$(l2_size)
vlog_defs += -DL2_BANKS=$(l2_banks)
vlog_defs += -DL1_BANK_SIZE=$(l1_bank_size)
vlog_defs += -DBOOT_ADDR=32\'d$(boot_addr)
vlog_defs += -DDAS=$(das)
vlog_defs += -DNUM_DAS_PARTITIONS=$(num_das_partitions)
vlog_defs += -DDAS_MEM_SIZE=$(das_mem_size)
# Snitch ISA
vlog_defs += -DXPULPIMG=$(xpulpimg)
vlog_defs += -DZFINX=$(zfinx)
Expand Down Expand Up @@ -151,6 +154,14 @@ ifdef tg
cpp_defs += -DTG_SEQ_PROB=$(tg_seqprob)
cpp_defs += -DTG_NCYCLES=$(tg_ncycles)
cpp_defs += -DNUM_CORES=$(num_cores)
# DAS benchmark related
cpp_defs += -DPARTITION=$(partition)
cpp_defs += -DTG_PA=$(tg_pa)
cpp_defs += -DTG_PB=$(tg_pb)
cpp_defs += -DTG_PC=$(tg_pc)
cpp_defs += -DTG_PA_PROB=$(tg_pa_prob)
cpp_defs += -DTG_PB_PROB=$(tg_pb_prob)
cpp_defs += -DTG_PC_PROB=$(tg_pc_prob)

# How many cycles should we execute?
veril_flags := --term-after-cycles=$(tg_ncycles)
Expand Down
2 changes: 2 additions & 0 deletions hardware/deps/idma/Bender.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ sources:
# levels 1 and 0, etc. Files within a level are ordered alphabetically.
# Level 0
- src/axi_dma_data_path.sv
- src/midends/idma_address_scrambler.sv
# Level 1
- src/axi_dma_data_mover.sv
- src/axi_dma_burst_reshaper.sv
Expand All @@ -23,6 +24,7 @@ sources:
# Level 3: MemPool
- src/midends/idma_split_midend.sv
- src/midends/idma_distributed_midend.sv
# If enabled DAS
- src/frontends/mempool/mempool_dma_frontend_reg_pkg.sv
- src/frontends/mempool/mempool_dma_frontend_reg_top.sv
- src/frontends/mempool/mempool_dma.sv
Expand Down
105 changes: 105 additions & 0 deletions hardware/deps/idma/src/midends/idma_address_scrambler.sv
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
// Copyright 2021 ETH Zurich and University of Bologna.
// Solderpad Hardware License, Version 0.51, see LICENSE for details.
// SPDX-License-Identifier: SHL-0.51

// Description: Address scrambler for the iDMA midend. The scrambling scheme of
// each DAS partition is determined by its group factor (`group_factor_i`).
// Current constraints: NumBanksPerTile must be at least 2.

// Author: Bowen Wang <bowwang@student.ethz.ch>
// Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>

// Address scrambler for the iDMA midend.
//
// Addresses below `DASStartAddr` are passed through unchanged and reported as
// fully interleaved (`group_factor_o = NumTiles`). Addresses that fall inside
// one of the `NumDASPartitions` DAS partitions are remapped by swapping
// bit-fields of the address; the field widths are derived per partition from
// `group_factor_i[p]` and `allocated_size_i[p]`. Addresses that match no
// partition are passed through with `group_factor_o`/`allocated_size_o` = 0.
//
// Parameters:
//   AddrWidth        - width of `address_i` / `address_o`
//   DataWidth        - width of each `start_addr_scheme_i` entry
//   ByteOffset       - number of byte-offset address bits
//   Bypass           - when set, the address is forwarded unscrambled
//   NumTiles         - number of tiles; sets the width of the tile-ID field
//   NumBanksPerTile  - banks per tile; sets the width of the bank-offset field
//   TCDMSizePerBank  - only used for the default of `MemSizePerTile`
//   NumDASPartitions - number of independently configured DAS partitions
//   DASStartAddr     - addresses below this value are never scrambled
//   MemSizePerTile   - not referenced inside this module
//   MemSizePerRow    - bytes in one row across all banks of all tiles
//
// Ports:
//   address_i            - address to translate
//   num_bytes_i          - transfer size; used to derive `allocated_size_o`
//                          for addresses below `DASStartAddr`
//   group_factor_i       - per-partition group factor
//   allocated_size_i     - per-partition size, in rows of `MemSizePerRow` bytes
//   start_addr_scheme_i  - per-partition start address
//   group_factor_o       - group factor of the region `address_i` falls in
//   allocated_size_o     - allocated size of the region `address_i` falls in
//   address_o            - translated address
module idma_address_scrambler #(
  parameter int unsigned AddrWidth        = 32,
  parameter int unsigned DataWidth        = 32,
  parameter int unsigned ByteOffset       = 2,
  parameter bit          Bypass           = 0,
  parameter int unsigned NumTiles         = 128,
  parameter int unsigned NumBanksPerTile  = 32,
  parameter int unsigned TCDMSizePerBank  = 1024,
  parameter int unsigned NumDASPartitions = 4,
  parameter int unsigned DASStartAddr     = 1024,
  parameter int unsigned MemSizePerTile   = NumBanksPerTile*TCDMSizePerBank,
  parameter int unsigned MemSizePerRow    = (1 << ByteOffset)*NumBanksPerTile*NumTiles
) (
  input  logic [AddrWidth-1:0]                              address_i,
  input  logic [31:0]                                       num_bytes_i,
  input  logic [NumDASPartitions-1:0][$clog2(NumTiles):0]   group_factor_i,
  input  logic [NumDASPartitions-1:0][$clog2(NumTiles):0]   allocated_size_i,
  input  logic [NumDASPartitions-1:0][DataWidth-1:0]        start_addr_scheme_i,
  output logic [$clog2(NumTiles):0]                         group_factor_o,
  output logic [$clog2(NumTiles):0]                         allocated_size_o,
  output logic [AddrWidth-1:0]                              address_o
);
  // Basic Settings
  localparam int unsigned BankOffsetBits  = $clog2(NumBanksPerTile);
  localparam int unsigned TileIdBits      = $clog2(NumTiles);
  // Byte-offset and bank-offset bits are never moved by the scrambler
  localparam int unsigned ConstantBitsLSB = ByteOffset + BankOffsetBits;

  if (Bypass || NumTiles < 2) begin
    // Pass-through. All outputs must be driven in this branch as well;
    // the side-band outputs use the same '0 default that the scrambling
    // branch applies to unscrambled addresses.
    assign address_o        = address_i;
    assign group_factor_o   = '0;
    assign allocated_size_o = '0;
  end else begin

    // ------ Heap Sequential Signals ------ //

    // `tile_index` : how many bits to shift for TileID bits in each partition
    // `row_index`: how many bits need to swap within Row Index
    logic [NumDASPartitions-1:0][$clog2($clog2(NumTiles)+1)-1:0] tile_index;
    logic [NumDASPartitions-1:0][$clog2($clog2(NumTiles)+1)-1:0] row_index;

    // One zero-counter pair per partition. With MODE = 0 the common_cells
    // `lzc` counts trailing zeros, which equals log2 of the input when the
    // input is a power of two.
    // NOTE(review): presumably group_factor_i / allocated_size_i are required
    // to be powers of two -- confirm against the software configuration API.
    for (genvar i = 0; i < NumDASPartitions; i++) begin : gen_shift_index
      lzc #(
        .WIDTH ($clog2(NumTiles)+1),
        .MODE  (1'b0              )
      ) i_log_tile_index (
        .in_i    (group_factor_i[i]),
        .cnt_o   (tile_index[i]    ),
        .empty_o (/* Unused */     )
      );
      lzc #(
        .WIDTH ($clog2(NumTiles)+1),
        .MODE  (1'b0              )
      ) i_log_row_index (
        .in_i    (allocated_size_i[i][$clog2(NumTiles):0]),
        .cnt_o   (row_index[i]                           ),
        .empty_o (/* Unused */                           )
      );
    end

    always_comb begin

      // Default: Unscrambled
      address_o        = address_i;
      group_factor_o   = '0;
      allocated_size_o = '0;

      // TODO (bowwang): add a new register to indicate the start addr of sequential heap region, currently hard coded
      if (address_i < DASStartAddr) begin
        group_factor_o   = NumTiles; // fully interleaved
        allocated_size_o = num_bytes_i / MemSizePerRow;

      // DAS address scrambling
      end else begin

        // If several partitions match, the highest-numbered one wins because
        // later loop iterations overwrite earlier assignments.
        for (int p = 0; p < NumDASPartitions; p++) begin
          if ( (address_i >= start_addr_scheme_i[p]) && (address_i < start_addr_scheme_i[p]+MemSizePerRow*allocated_size_i[p]) ) begin
            address_o = '0;
            // Bits below (tile_index + ConstantBitsLSB) stay in place
            address_o |= address_i & ((1 << (tile_index[p]+ConstantBitsLSB)) - 1);
            // Bits from (row_index + tile_index + ConstantBitsLSB) upwards move
            // down to fill the remaining upper part of the tile-ID field
            address_o |= ((address_i >> (row_index[p]+tile_index[p]+ConstantBitsLSB)) << (tile_index[p]+ConstantBitsLSB)) & ((1 << (TileIdBits+ConstantBitsLSB)) - 1);
            // The `row_index` bits starting at (tile_index + ConstantBitsLSB)
            // move up to sit directly above the tile-ID field
            address_o |= ((address_i >> (tile_index[p]+ConstantBitsLSB)) << (TileIdBits + ConstantBitsLSB)) & ((1 << (row_index[p]+TileIdBits+ConstantBitsLSB)) - 1);
            // Everything above the swapped fields is unchanged
            address_o |= address_i & ~((1 << (row_index[p]+TileIdBits+ConstantBitsLSB)) - 1);
            group_factor_o   = group_factor_i[p];
            allocated_size_o = allocated_size_i[p];
          end
        end

      end
    end

  end

  // Check for unsupported configurations
  if (NumBanksPerTile < 2)
    $fatal(1, "NumBanksPerTile must be at least 2. The special case '1' is currently not supported!");

endmodule : idma_address_scrambler
95 changes: 84 additions & 11 deletions hardware/deps/idma/src/midends/idma_distributed_midend.sv
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
// SPDX-License-Identifier: SHL-0.51

// Samuel Riedel <sriedel@iis.ee.ethz.ch>
// Bowen Wang <bowwang@student.ethz.ch>
// Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>

`include "common_cells/registers.svh"

Expand All @@ -17,23 +19,31 @@ module idma_distributed_midend #(
parameter int unsigned DmaRegionEnd = 32'h1000_0000,
/// Number of generic 1D requests that can be buffered
parameter int unsigned TransFifoDepth = 1,
`ifdef DAS
parameter int unsigned NumTiles = 64,
parameter int unsigned NumDASPartitions = 4,
`endif
/// Arbitrary 1D burst request definition
parameter type burst_req_t = logic,
/// Meta data response definition
parameter type meta_t = logic
) (
input logic clk_i,
input logic rst_ni,
input logic clk_i,
input logic rst_ni,
`ifdef DAS
// DAS signals
input logic [$clog2(NumTiles):0] rows_das_i,
`endif
// Slave
input burst_req_t burst_req_i,
input logic valid_i,
output logic ready_o,
output meta_t meta_o,
input burst_req_t burst_req_i,
input logic valid_i,
output logic ready_o,
output meta_t meta_o,
// Master
output burst_req_t [NoMstPorts-1:0] burst_req_o,
output logic [NoMstPorts-1:0] valid_o,
input logic [NoMstPorts-1:0] ready_i,
input meta_t [NoMstPorts-1:0] meta_i
output burst_req_t [NoMstPorts-1:0] burst_req_o,
output logic [NoMstPorts-1:0] valid_o,
input logic [NoMstPorts-1:0] ready_i,
input meta_t [NoMstPorts-1:0] meta_i
);

localparam DmaRegionAddressBits = $clog2(DmaRegionWidth);
Expand All @@ -57,6 +67,7 @@ module idma_distributed_midend #(
// Collect the `trans_complete` signals and reduce them once we have all of them
logic empty;
logic data;
logic push;
fifo_v3 #(
.FALL_THROUGH (0 ),
.DATA_WIDTH (1 ),
Expand All @@ -70,12 +81,44 @@ module idma_distributed_midend #(
.empty_o (empty ),
.usage_o (/*unused*/ ),
.data_i (1'b1 ),
.push_i (trans_complete_d[i] ),
.push_i (push ),
.data_o (data ),
.pop_i (meta_o.trans_complete)
);
assign trans_complete_d[i] = meta_i[i].trans_complete | tie_off_trans_complete_q[i];
assign trans_complete_q[i] = data && !empty;

`ifdef DAS
    // Handle the case where two completion signals (`meta_i[i].trans_complete`
    // and `tie_off_trans_complete_q[i]`) arrive in the same cycle, or where a
    // completion arrives while the FIFO is full. Completions that cannot be
    // pushed immediately are counted in `conflict_counter` and pushed later,
    // one per cycle, whenever no new completion arrives and the FIFO has room.
    //
    // NOTE(review): the hardware Makefile passes `-DDAS=$(das)` with
    // `das ?= 0`, which would define the DAS macro even when das=0, so this
    // `ifdef would always select the DAS branch -- confirm whether the guard
    // should test the macro's value instead.
    logic [NumDASPartitions-1:0] conflict_counter_d, conflict_counter_q;
    `FF(conflict_counter_q, conflict_counter_d, '0, clk_i, rst_ni)
    always_comb begin
      // By default push a single entry for any completion if there is room
      push = trans_complete_d[i] && !fifo_full[i];
      conflict_counter_d = conflict_counter_q;
      // FIFO is not full: one completion is pushed now, remember the other
      if (meta_i[i].trans_complete && tie_off_trans_complete_q[i] && !fifo_full[i]) begin
        conflict_counter_d = conflict_counter_q+1;
      end
      // FIFO is full: nothing can be pushed, remember every completion
      if (meta_i[i].trans_complete && tie_off_trans_complete_q[i] && fifo_full[i]) begin
        conflict_counter_d = conflict_counter_q+2;
      end
      if (!meta_i[i].trans_complete && tie_off_trans_complete_q[i] && fifo_full[i]) begin
        conflict_counter_d = conflict_counter_q+1;
      end
      if (meta_i[i].trans_complete && !tie_off_trans_complete_q[i] && fifo_full[i]) begin
        conflict_counter_d = conflict_counter_q+1;
      end
      // FIFO is not full and no new completion this cycle: safe to push one
      // of the remembered completions
      if (|conflict_counter_q && !trans_complete_d[i] && !fifo_full[i] ) begin
        push = 1'b1;
        conflict_counter_d = conflict_counter_q-1;
      end
    end
`else
    // Without DAS every completion is pushed directly
    assign push = trans_complete_d[i];
`endif

end

always_comb begin
Expand Down Expand Up @@ -106,6 +149,7 @@ module idma_distributed_midend #(
assign dst_addr = burst_req_i.dst[FullRegionAddressBits-1:0];

always_comb begin

if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin
start_addr = src_addr;
end else begin
Expand All @@ -126,6 +170,23 @@ module idma_distributed_midend #(
burst_req_o[i].dst = burst_req_i.dst;
// Modify lower addresses bits and size
if (($unsigned(start_addr) >= (i+1)*DmaRegionWidth) || ($unsigned(end_addr) <= i*DmaRegionWidth)) begin
`ifdef DAS
burst_req_o[i].num_bytes = (burst_req_i.num_bytes<DmaRegionWidth) ? burst_req_i.num_bytes : DmaRegionWidth;
if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin
burst_req_o[i].src = burst_req_i.src+i*DmaRegionWidth;
burst_req_o[i].dst = burst_req_i.dst+i*rows_das_i*DmaRegionWidth;
end else begin
// L2 --> L1
if (burst_req_i.num_bytes<=DmaRegionWidth )begin
burst_req_o[i].src = burst_req_i.src+i*rows_das_i*DmaRegionWidth;
end else if (i==2) begin
burst_req_o[i].src = burst_req_i.src+i*rows_das_i*DmaRegionWidth;
end else if (i==3) begin
burst_req_o[i].src = burst_req_i.src+(i-1)*rows_das_i*DmaRegionWidth + DmaRegionWidth;
end
burst_req_o[i].dst = burst_req_i.dst+i*DmaRegionWidth;
end
`else
// We are not involved in the transfer
burst_req_o[i].src = '0;
burst_req_o[i].dst = '0;
Expand All @@ -137,6 +198,7 @@ module idma_distributed_midend #(
if (valid[i]) begin
tie_off_trans_complete_d[i] = 1'b1;
end
`endif
end else if (($unsigned(start_addr) >= i*DmaRegionWidth)) begin
// First (and potentially only) slice
// Leave address as is
Expand All @@ -146,6 +208,16 @@ module idma_distributed_midend #(
burst_req_o[i].num_bytes = DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0];
end
end else begin
`ifdef DAS
// Round up the address to the next DMA boundary
if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin
burst_req_o[i].src[FullRegionAddressBits-1:0] = i*DmaRegionWidth;
burst_req_o[i].dst = burst_req_i.dst+i*DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0];
end else begin
burst_req_o[i].src = burst_req_i.src+(i-start_addr[DmaRegionAddressBits+1:DmaRegionAddressBits])*DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0];
burst_req_o[i].dst[FullRegionAddressBits-1:0] = i*DmaRegionWidth;
end
`else
// Round up the address to the next DMA boundary
if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin
burst_req_o[i].src[FullRegionAddressBits-1:0] = i*DmaRegionWidth;
Expand All @@ -154,6 +226,7 @@ module idma_distributed_midend #(
burst_req_o[i].src = burst_req_i.src+i*DmaRegionWidth-start_addr;
burst_req_o[i].dst[FullRegionAddressBits-1:0] = i*DmaRegionWidth;
end
`endif
if ($unsigned(end_addr) >= (i+1)*DmaRegionWidth) begin
// Middle slice
// Emit a full-sized transfer
Expand Down
Loading
Loading