diff --git a/README.md b/README.md index cca8165cf..deaa6b6f8 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,10 @@ [![ci](https://github.com/pulp-platform/mempool/actions/workflows/ci.yml/badge.svg)](https://github.com/pulp-platform/mempool/actions/workflows/ci.yml) [![lint](https://github.com/pulp-platform/mempool/actions/workflows/lint.yml/badge.svg)](https://github.com/pulp-platform/mempool/actions/workflows/lint.yml) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) +# MemPool Dynamic Allocation Scheme +The Dynamic Allocation Scheme (DAS) is a flexible, adaptable, runtime-configurable address mapping technique. DAS remaps contiguous address spaces to physically adjacent memory banks based on the workload’s memory access patterns, placing the data physically close to the processing elements (PEs). + +This branch of the repository contains the DAS extensions built on top of MemPool. # MemPool diff --git a/config/config.mk b/config/config.mk index 9ea9a0fd0..9bef70d80 100644 --- a/config/config.mk +++ b/config/config.mk @@ -73,6 +73,12 @@ zquarterinx ?= 0 # DivSqrt deactivated by default xDivSqrt ?= 0 +# Enable configurable addressing scheme in the heap +das ?= 0 +num_das_partitions ?= 4 +# Size of DAS-heap per core +das_mem_size ?= 2048 + # This parameter is only used for TeraPool configurations num_sub_groups_per_group ?= 1 remote_group_latency_cycles ?= 7 diff --git a/config/terapool.mk b/config/terapool.mk index 0f1c264f8..6bdd329e9 100644 --- a/config/terapool.mk +++ b/config/terapool.mk @@ -45,4 +45,4 @@ dmas_per_group ?= 4 # Brust Length = 16 # L2 Banks/Channels l2_banks = 16 -l2_size ?= 16777216 # 1000000 \ No newline at end of file +l2_size ?= 16777216 # 1000000 diff --git a/hardware/Makefile b/hardware/Makefile index 1a78620c7..cb17f11d5 100644 --- a/hardware/Makefile +++ b/hardware/Makefile @@ -119,6 +119,9 @@ vlog_defs += -DL2_SIZE=32\'d$(l2_size) vlog_defs += -DL2_BANKS=$(l2_banks) vlog_defs += -DL1_BANK_SIZE=$(l1_bank_size) 
vlog_defs += -DBOOT_ADDR=32\'d$(boot_addr) +vlog_defs += -DDAS=$(das) +vlog_defs += -DNUM_DAS_PARTITIONS=$(num_das_partitions) +vlog_defs += -DDAS_MEM_SIZE=$(das_mem_size) # Snitch ISA vlog_defs += -DXPULPIMG=$(xpulpimg) vlog_defs += -DZFINX=$(zfinx) @@ -151,6 +154,14 @@ ifdef tg cpp_defs += -DTG_SEQ_PROB=$(tg_seqprob) cpp_defs += -DTG_NCYCLES=$(tg_ncycles) cpp_defs += -DNUM_CORES=$(num_cores) + # DAS benchmark related + cpp_defs += -DPARTITION=$(partition) + cpp_defs += -DTG_PA=$(tg_pa) + cpp_defs += -DTG_PB=$(tg_pb) + cpp_defs += -DTG_PC=$(tg_pc) + cpp_defs += -DTG_PA_PROB=$(tg_pa_prob) + cpp_defs += -DTG_PB_PROB=$(tg_pb_prob) + cpp_defs += -DTG_PC_PROB=$(tg_pc_prob) # How many cycles should we execute? veril_flags := --term-after-cycles=$(tg_ncycles) diff --git a/hardware/deps/idma/Bender.yml b/hardware/deps/idma/Bender.yml index 0064ee877..0ad4a786d 100644 --- a/hardware/deps/idma/Bender.yml +++ b/hardware/deps/idma/Bender.yml @@ -15,6 +15,7 @@ sources: # levels 1 and 0, etc. Files within a level are ordered alphabetically. # Level 0 - src/axi_dma_data_path.sv + - src/midends/idma_address_scrambler.sv # Level 1 - src/axi_dma_data_mover.sv - src/axi_dma_burst_reshaper.sv @@ -23,6 +24,7 @@ sources: # Level 3: MemPool - src/midends/idma_split_midend.sv - src/midends/idma_distributed_midend.sv + # If enabled DAS - src/frontends/mempool/mempool_dma_frontend_reg_pkg.sv - src/frontends/mempool/mempool_dma_frontend_reg_top.sv - src/frontends/mempool/mempool_dma.sv diff --git a/hardware/deps/idma/src/midends/idma_address_scrambler.sv b/hardware/deps/idma/src/midends/idma_address_scrambler.sv new file mode 100644 index 000000000..345ffdba8 --- /dev/null +++ b/hardware/deps/idma/src/midends/idma_address_scrambler.sv @@ -0,0 +1,105 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. 
+// SPDX-License-Identifier: SHL-0.51 + +// Description: Address scrambler for iDMA Midend, scramble scheme is determined +// by group_factor +// Current constraints: + +// Author: Bowen Wang +// Author: Marco Bertuletti + +module idma_address_scrambler #( + parameter int unsigned AddrWidth = 32, + parameter int unsigned DataWidth = 32, + parameter int unsigned ByteOffset = 2, + parameter bit Bypass = 0, + parameter int unsigned NumTiles = 128, + parameter int unsigned NumBanksPerTile = 32, + parameter int unsigned TCDMSizePerBank = 1024, + parameter int unsigned NumDASPartitions = 4, + parameter int unsigned DASStartAddr = 1024, + parameter int unsigned MemSizePerTile = NumBanksPerTile*TCDMSizePerBank, + parameter int unsigned MemSizePerRow = (1 << ByteOffset)*NumBanksPerTile*NumTiles +) ( + input logic [AddrWidth-1:0] address_i, + input logic [31:0] num_bytes_i, + input logic [NumDASPartitions-1:0][$clog2(NumTiles):0] group_factor_i, + input logic [NumDASPartitions-1:0][$clog2(NumTiles):0] allocated_size_i, + input logic [NumDASPartitions-1:0][DataWidth-1:0] start_addr_scheme_i, + output logic [$clog2(NumTiles):0] group_factor_o, + output logic [$clog2(NumTiles):0] allocated_size_o, + output logic [AddrWidth-1:0] address_o +); + // Basic Settings + localparam int unsigned BankOffsetBits = $clog2(NumBanksPerTile); + localparam int unsigned TileIdBits = $clog2(NumTiles); + localparam int unsigned ConstantBitsLSB = ByteOffset + BankOffsetBits; + + if (Bypass || NumTiles < 2) begin + assign address_o = address_i; + end else begin + + // ------ Heap Sequential Signals ------ // + + // `tile_index` : how many bits to shift for TileID bits in each partition + // `row_index`: how many bits need to swap within Row Index + logic [NumDASPartitions-1:0][$clog2($clog2(NumTiles)+1)-1:0] tile_index; + logic [NumDASPartitions-1:0][$clog2($clog2(NumTiles)+1)-1:0] row_index; + + for (genvar i = 0; i < NumDASPartitions; i++) begin : gen_shift_index + lzc #( + .WIDTH 
($clog2(NumTiles)+1), + .MODE (1'b0 ) + ) i_log_tile_index ( + .in_i (group_factor_i[i]), + .cnt_o (tile_index[i] ), + .empty_o (/* Unused */ ) + ); + lzc #( + .WIDTH ($clog2(NumTiles)+1), + .MODE (1'b0 ) + ) i_log_row_index ( + .in_i (allocated_size_i[i][$clog2(NumTiles):0]), + .cnt_o (row_index[i] ), + .empty_o (/* Unused */ ) + ); + end + + always_comb begin + + // Default: Unscrambled + address_o = address_i; + group_factor_o = '0; + allocated_size_o = '0; + + // TODO (bowwang): add a new register to indicate the start addr of sequential heap region, currently hard coded + if (address_i < DASStartAddr) begin + group_factor_o = NumTiles; // fully interleaved + allocated_size_o = num_bytes_i / MemSizePerRow; + + // DAS address scrambling + end else begin + + for (int p = 0; p < NumDASPartitions; p++) begin + if ( (address_i >= start_addr_scheme_i[p]) && (address_i < start_addr_scheme_i[p]+MemSizePerRow*allocated_size_i[p]) ) begin + address_o = '0; + address_o |= address_i & ((1 << (tile_index[p]+ConstantBitsLSB)) - 1); + address_o |= ((address_i >> (row_index[p]+tile_index[p]+ConstantBitsLSB)) << (tile_index[p]+ConstantBitsLSB)) & ((1 << (TileIdBits+ConstantBitsLSB)) - 1); + address_o |= ((address_i >> (tile_index[p]+ConstantBitsLSB)) << (TileIdBits + ConstantBitsLSB)) & ((1 << (row_index[p]+TileIdBits+ConstantBitsLSB)) - 1); + address_o |= address_i & ~((1 << (row_index[p]+TileIdBits+ConstantBitsLSB)) - 1); + group_factor_o = group_factor_i[p]; + allocated_size_o = allocated_size_i[p]; + end + end + + end + end + + end + + // Check for unsupported configurations (anything below two banks per tile) + if (NumBanksPerTile < 2) + $fatal(1, "NumBanksPerTile must be at least 2. 
The special case '1' is currently not supported!"); + +endmodule : idma_address_scrambler diff --git a/hardware/deps/idma/src/midends/idma_distributed_midend.sv b/hardware/deps/idma/src/midends/idma_distributed_midend.sv index e1cd96e10..3d53e2e15 100644 --- a/hardware/deps/idma/src/midends/idma_distributed_midend.sv +++ b/hardware/deps/idma/src/midends/idma_distributed_midend.sv @@ -3,6 +3,8 @@ // SPDX-License-Identifier: SHL-0.51 // Samuel Riedel +// Bowen Wang +// Marco Bertuletti `include "common_cells/registers.svh" @@ -17,23 +19,31 @@ module idma_distributed_midend #( parameter int unsigned DmaRegionEnd = 32'h1000_0000, /// Number of generic 1D requests that can be buffered parameter int unsigned TransFifoDepth = 1, +`ifdef DAS + parameter int unsigned NumTiles = 64, + parameter int unsigned NumDASPartitions = 4, +`endif /// Arbitrary 1D burst request definition parameter type burst_req_t = logic, /// Meta data response definition parameter type meta_t = logic ) ( - input logic clk_i, - input logic rst_ni, + input logic clk_i, + input logic rst_ni, +`ifdef DAS + // DAS signals + input logic [$clog2(NumTiles):0] rows_das_i, +`endif // Slave - input burst_req_t burst_req_i, - input logic valid_i, - output logic ready_o, - output meta_t meta_o, + input burst_req_t burst_req_i, + input logic valid_i, + output logic ready_o, + output meta_t meta_o, // Master - output burst_req_t [NoMstPorts-1:0] burst_req_o, - output logic [NoMstPorts-1:0] valid_o, - input logic [NoMstPorts-1:0] ready_i, - input meta_t [NoMstPorts-1:0] meta_i + output burst_req_t [NoMstPorts-1:0] burst_req_o, + output logic [NoMstPorts-1:0] valid_o, + input logic [NoMstPorts-1:0] ready_i, + input meta_t [NoMstPorts-1:0] meta_i ); localparam DmaRegionAddressBits = $clog2(DmaRegionWidth); @@ -57,6 +67,7 @@ module idma_distributed_midend #( // Collect the `trans_complete` signals and reduce them once we have all of them logic empty; logic data; + logic push; fifo_v3 #( .FALL_THROUGH (0 ), .DATA_WIDTH 
(1 ), @@ -70,12 +81,44 @@ module idma_distributed_midend #( .empty_o (empty ), .usage_o (/*unused*/ ), .data_i (1'b1 ), - .push_i (trans_complete_d[i] ), + .push_i (push ), .data_o (data ), .pop_i (meta_o.trans_complete) ); assign trans_complete_d[i] = meta_i[i].trans_complete | tie_off_trans_complete_q[i]; assign trans_complete_q[i] = data && !empty; + +`ifdef DAS + // Handle two complete signals arriving at the same time: count the pushes that could not be issued and replay them later + logic [NumDASPartitions-1:0] conflict_counter_d, conflict_counter_q; + `FF(conflict_counter_q, conflict_counter_d, '0, clk_i, rst_ni) + always_comb begin + push = trans_complete_d[i] && !fifo_full[i]; + conflict_counter_d = conflict_counter_q; + // FIFO is not full + if (meta_i[i].trans_complete && tie_off_trans_complete_q[i] && !fifo_full[i]) begin + conflict_counter_d = conflict_counter_q+1; + end + // FIFO is full + if (meta_i[i].trans_complete && tie_off_trans_complete_q[i] && fifo_full[i]) begin + conflict_counter_d = conflict_counter_q+2; + end + if (!meta_i[i].trans_complete && tie_off_trans_complete_q[i] && fifo_full[i]) begin + conflict_counter_d = conflict_counter_q+1; + end + if (meta_i[i].trans_complete && !tie_off_trans_complete_q[i] && fifo_full[i]) begin + conflict_counter_d = conflict_counter_q+1; + end + // FIFO is not full, safe to push + if (|conflict_counter_q && !trans_complete_d[i] && !fifo_full[i] ) begin + push = 1'b1; + conflict_counter_d = conflict_counter_q-1; + end + end +`else + assign push = trans_complete_d[i]; +`endif + end always_comb begin @@ -106,6 +149,7 @@ module idma_distributed_midend #( assign dst_addr = burst_req_i.dst[FullRegionAddressBits-1:0]; always_comb begin + if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin start_addr = src_addr; end else begin @@ -126,6 +170,23 @@ module idma_distributed_midend #( burst_req_o[i].dst = burst_req_i.dst; // Modify lower addresses bits and size if (($unsigned(start_addr) >= (i+1)*DmaRegionWidth) || 
($unsigned(end_addr) <= i*DmaRegionWidth)) begin +`ifdef DAS + burst_req_o[i].num_bytes = (burst_req_i.num_bytes= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin + burst_req_o[i].src = burst_req_i.src+i*DmaRegionWidth; + burst_req_o[i].dst = burst_req_i.dst+i*rows_das_i*DmaRegionWidth; + end else begin + // L2 --> L1 + if (burst_req_i.num_bytes<=DmaRegionWidth )begin + burst_req_o[i].src = burst_req_i.src+i*rows_das_i*DmaRegionWidth; + end else if (i==2) begin + burst_req_o[i].src = burst_req_i.src+i*rows_das_i*DmaRegionWidth; + end else if (i==3) begin + burst_req_o[i].src = burst_req_i.src+(i-1)*rows_das_i*DmaRegionWidth + DmaRegionWidth; + end + burst_req_o[i].dst = burst_req_i.dst+i*DmaRegionWidth; + end +`else // We are not involved in the transfer burst_req_o[i].src = '0; burst_req_o[i].dst = '0; @@ -137,6 +198,7 @@ module idma_distributed_midend #( if (valid[i]) begin tie_off_trans_complete_d[i] = 1'b1; end +`endif end else if (($unsigned(start_addr) >= i*DmaRegionWidth)) begin // First (and potentially only) slice // Leave address as is @@ -146,6 +208,16 @@ module idma_distributed_midend #( burst_req_o[i].num_bytes = DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; end end else begin +`ifdef DAS + // Round up the address to the next DMA boundary + if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin + burst_req_o[i].src[FullRegionAddressBits-1:0] = i*DmaRegionWidth; + burst_req_o[i].dst = burst_req_i.dst+i*DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; + end else begin + burst_req_o[i].src = burst_req_i.src+(i-start_addr[DmaRegionAddressBits+1:DmaRegionAddressBits])*DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; + burst_req_o[i].dst[FullRegionAddressBits-1:0] = i*DmaRegionWidth; + end +`else // Round up the address to the next DMA boundary if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin 
burst_req_o[i].src[FullRegionAddressBits-1:0] = i*DmaRegionWidth; @@ -154,6 +226,7 @@ module idma_distributed_midend #( burst_req_o[i].src = burst_req_i.src+i*DmaRegionWidth-start_addr; burst_req_o[i].dst[FullRegionAddressBits-1:0] = i*DmaRegionWidth; end +`endif if ($unsigned(end_addr) >= (i+1)*DmaRegionWidth) begin // Middle slice // Emit a full-sized transfer diff --git a/hardware/deps/idma/src/midends/idma_split_midend.sv b/hardware/deps/idma/src/midends/idma_split_midend.sv index 42a21e2d2..e3ba092e8 100644 --- a/hardware/deps/idma/src/midends/idma_split_midend.sv +++ b/hardware/deps/idma/src/midends/idma_split_midend.sv @@ -3,6 +3,8 @@ // SPDX-License-Identifier: SHL-0.51 // Samuel Riedel +// Bowen Wang +// Marco Bertuletti `include "common_cells/registers.svh" @@ -11,11 +13,25 @@ module idma_split_midend #( parameter int unsigned DmaRegionStart = 32'h0000_0000, parameter int unsigned DmaRegionEnd = 32'h1000_0000, parameter int unsigned AddrWidth = 32, +`ifdef DAS + parameter int unsigned NumTiles = 64, + parameter int unsigned NumBanksPerTile = 32, + parameter int unsigned TCDMSizePerBank = 1024, + parameter int unsigned NumDASPartitions = 4, + parameter int unsigned DASStartAddr = 1024, +`endif parameter type burst_req_t = logic, parameter type meta_t = logic ) ( input logic clk_i, input logic rst_ni, +`ifdef DAS + // DAS signals + input logic [NumDASPartitions-1:0][$clog2(NumTiles):0] partition_sel_i, + input logic [NumDASPartitions-1:0][AddrWidth-1:0] start_das_i, + input logic [NumDASPartitions-1:0][$clog2(NumTiles):0] rows_das_i, + output logic [$clog2(NumTiles):0] rows_das_o, +`endif // Slave input burst_req_t burst_req_i, input logic valid_i, @@ -28,16 +44,13 @@ module idma_split_midend #( input meta_t meta_i ); + // ------ Parameter Settings ------ // localparam DmaRegionAddressBits = $clog2(DmaRegionWidth); - typedef logic [AddrWidth-1:0] addr_t; - addr_t start_addr, end_addr; - logic req_valid; - - - // Handle Metadata + // ------ Handle Metadata 
------ // // Forward idle signal and count the trans_comlete signal + logic req_valid; logic [31:0] num_trans_d, num_trans_q; assign meta_o.backend_idle = meta_i.backend_idle; @@ -56,16 +69,130 @@ module idma_split_midend #( end `FF(num_trans_q, num_trans_d, '0, clk_i, rst_ni) - // Split requests +`ifdef DAS + localparam TileDmaRegionWidth = DmaRegionWidth / NumTiles; + logic [AddrWidth-1:0] PartitionDmaRegionWidth; + localparam DmaBackendWidth = NumBanksPerTile*NumTiles*4; // 32banks*8Tiles*4bytes + + // ------ Address translation ------ // + // Only the address in L1 SPM will be scrambled + logic [AddrWidth-1:0] post_scramble_src; + logic [AddrWidth-1:0] post_scramble_dst; + logic [$clog2(NumTiles):0] group_factor_src, group_factor_dst, group_factor_sel; + logic [$clog2(NumTiles):0] allocated_size_src, allocated_size_dst, allocated_size_sel; + + assign group_factor_sel = group_factor_src | group_factor_dst; + assign allocated_size_sel = allocated_size_src | allocated_size_dst; + assign PartitionDmaRegionWidth = TileDmaRegionWidth * group_factor_sel; + + idma_address_scrambler #( + .AddrWidth (AddrWidth ), + .NumTiles (NumTiles ), + .NumBanksPerTile (NumBanksPerTile ), + .Bypass (0 ), + .NumDASPartitions (NumDASPartitions), + .TCDMSizePerBank (TCDMSizePerBank ) + ) i_idma_address_scrambler_src ( + .address_i (burst_req_i.src), + .num_bytes_i (burst_req_i.num_bytes), + .group_factor_i (partition_sel_i), + .allocated_size_i (rows_das_i), + .start_addr_scheme_i(start_das_i), + .group_factor_o (group_factor_src), + .allocated_size_o (allocated_size_src), + .address_o (post_scramble_src) + ); + + idma_address_scrambler #( + .AddrWidth (AddrWidth ), + .NumTiles (NumTiles ), + .NumBanksPerTile (NumBanksPerTile ), + .Bypass (0 ), + .NumDASPartitions (NumDASPartitions), + .TCDMSizePerBank (TCDMSizePerBank ) + ) i_idma_address_scrambler_dst ( + .address_i (burst_req_i.dst), + .num_bytes_i (burst_req_i.num_bytes), + .group_factor_i (partition_sel_i), + .allocated_size_i 
(rows_das_i), + .start_addr_scheme_i(start_das_i), + .group_factor_o (group_factor_dst), + .allocated_size_o (allocated_size_dst), + .address_o (post_scramble_dst) + ); + + // ------ Filter out address in L1 SPM ------ // + addr_t start_addr; + logic spm2dram; + always_comb begin + spm2dram = 0; + if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin + start_addr = post_scramble_src; + spm2dram = 1; + end else begin + start_addr = post_scramble_dst; + spm2dram = 0; + end + end + + // ------ Considering Partition Scheme ------ // + logic [$clog2(NumTiles):0] shift_index; + logic [AddrWidth-1:0] partition_mask; + addr_t masked_start_addr; + + always_comb begin + case(group_factor_sel) + 128: shift_index = 7; + 64: shift_index = 6; + 32: shift_index = 5; + 16: shift_index = 4; + 8: shift_index = 3; + 4: shift_index = 2; + 2: shift_index = 1; + default: shift_index = 0; + endcase + end + + assign partition_mask = {DmaRegionAddressBits{1'b1}} >> ($clog2(NumTiles) - shift_index); + assign masked_start_addr = start_addr & partition_mask; + + // ------ Beat Counter and Shifter Handler ------ // + logic [$clog2(NumTiles):0] beat_cnt_d, beat_cnt_q; + `FFARN(beat_cnt_q, beat_cnt_d, '0, clk_i, rst_ni) + + logic [$clog2(NumTiles):0] shift_row, shift_partition; + logic [$clog2(NumTiles):0] shift_index_sc; + logic [$clog2(NumTiles):0] mask_shift_row; + + always_comb begin + case(allocated_size_sel) + 128: shift_index_sc = 7; + 64: shift_index_sc = 6; + 32: shift_index_sc = 5; + 16: shift_index_sc = 4; + 8: shift_index_sc = 3; + 4: shift_index_sc = 2; + 2: shift_index_sc = 1; + default: shift_index_sc = 0; + endcase + end + + assign shift_partition = beat_cnt_q >> shift_index_sc; + assign mask_shift_row = ~( {($clog2(NumTiles) + 1){1'b1}} << shift_index_sc ); + assign shift_row = beat_cnt_q & mask_shift_row; +`else + // ------ Filter out address in L1 SPM ------ // + addr_t start_addr; always_comb begin if 
(($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin start_addr = burst_req_i.src; end else begin start_addr = burst_req_i.dst; end - end_addr = start_addr + burst_req_i.num_bytes; end +`endif + // ------ Split requests ------ // enum logic {Idle, Busy} state_d, state_q; burst_req_t req_d, req_q; @@ -80,9 +207,63 @@ module idma_split_midend #( ready_o = 1'b0; req_valid = 1'b0; +`ifdef DAS + rows_das_o = allocated_size_sel; + beat_cnt_d = beat_cnt_q; + if (num_trans_q == 1 && num_trans_d == 0) begin + beat_cnt_d = 0; + end +`endif + unique case (state_q) Idle: begin - if (valid_i) begin // Splitting required. + if (valid_i) begin // Splitting required +`ifdef DAS + if ((PartitionDmaRegionWidth-masked_start_addr) >= burst_req_i.num_bytes) begin + burst_req_o = burst_req_i; + // Address in SPM need to be translated back to physical address + if (spm2dram) begin + burst_req_o.src = post_scramble_src; + end else begin + burst_req_o.dst = post_scramble_dst; + end + valid_o = 1'b1; + ready_o = ready_i; + req_valid = ready_i; + end else begin + // Store and acknowledge + req_d = burst_req_i; + ready_o = 1'b1; + burst_req_o = burst_req_i; + // Calculate the size for the 1st burst + burst_req_o.num_bytes = PartitionDmaRegionWidth-masked_start_addr; + // TODO (bowwang): parameterize + // req_d.num_bytes = (group_factor_sel <= $clog2(NumTiles) + 1) ? 
(allocated_size_sel*DmaBackendWidth) : (allocated_size_sel*PartitionDmaRegionWidth); + if (spm2dram) begin + burst_req_o.src = post_scramble_src; + req_d.src = post_scramble_src; + end else begin + burst_req_o.dst = post_scramble_dst; + req_d.dst = post_scramble_dst; + end + valid_o = 1'b1; + // Modify the stored info after the first beat is sent + if (ready_i) begin + // TODO (bowwang): May not be necessary to consider alignment + req_d.num_bytes -= PartitionDmaRegionWidth-masked_start_addr; + if (spm2dram) begin + req_d.src += DmaRegionWidth-masked_start_addr; + req_d.dst += PartitionDmaRegionWidth-masked_start_addr; + end else begin + req_d.src += PartitionDmaRegionWidth-masked_start_addr; + req_d.dst += DmaRegionWidth-masked_start_addr; + end + req_valid = 1'b1; + beat_cnt_d = 1; + end + state_d = Busy; + end +`else if (DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0] >= burst_req_i.num_bytes) begin // No splitting required, just forward burst_req_o = burst_req_i; @@ -108,6 +289,7 @@ module idma_split_midend #( end state_d = Busy; end +`endif end end Busy: begin @@ -115,7 +297,37 @@ module idma_split_midend #( burst_req_o = req_q; valid_o = 1'b1; req_valid = ready_i; - if (req_q.num_bytes <= DmaRegionWidth) begin +`ifdef DAS + if ($unsigned(req_q.num_bytes) <= $unsigned(PartitionDmaRegionWidth)) begin + // Last split + if (ready_i) begin + state_d = Idle; + beat_cnt_d = beat_cnt_q + 1; + end + end else begin + burst_req_o.num_bytes = PartitionDmaRegionWidth; + if (ready_i) begin + req_d.num_bytes = req_q.num_bytes - PartitionDmaRegionWidth; + beat_cnt_d = beat_cnt_q + 1; + if (spm2dram) begin + if (shift_row == allocated_size_sel-1) begin + req_d.src = req_q.src + PartitionDmaRegionWidth - shift_row*DmaRegionWidth; + end else begin + req_d.src = req_q.src + DmaRegionWidth; + end + req_d.dst = req_q.dst + PartitionDmaRegionWidth; + end else begin + req_d.src = req_q.src + PartitionDmaRegionWidth; + if (shift_row == allocated_size_sel-1) begin + req_d.dst = req_q.dst 
+ PartitionDmaRegionWidth - shift_row*DmaRegionWidth; + end else begin + req_d.dst = req_q.dst + DmaRegionWidth; + end + end// spm2dram + end // ready_i + end +`else + if ($unsigned(req_q.num_bytes) <= $unsigned(DmaRegionWidth)) begin // Last split if (ready_i) begin state_d = Idle; @@ -129,6 +341,7 @@ module idma_split_midend #( req_d.dst = req_q.dst + DmaRegionWidth; end end +`endif end default: /*do nothing*/; endcase @@ -139,14 +352,14 @@ module idma_split_midend #( always_ff @(posedge clk_i or negedge rst_ni) begin automatic string str; if (rst_ni && valid_i && ready_o) begin - str = "[idma_split_midend] Got request\n"; + str = "\n\n[idma_split_midend] Got request\n"; str = $sformatf("%sSplit: Request in: From: 0x%8x To: 0x%8x with size %d\n", str, burst_req_i.src, burst_req_i.dst, burst_req_i.num_bytes); f = $fopen("dma.log", "a"); $fwrite(f, str); $fclose(f); end if (rst_ni && valid_o && ready_i) begin - str = $sformatf("Split: Out %6d: From: 0x%8x To: 0x%8x with size %d\n", num_trans_q, burst_req_o.src, burst_req_o.dst, burst_req_o.num_bytes); + str = $sformatf("Split: Out %6d: From: 0x%8x To: 0x%8x with size %d, start_addr 0x%8x.\n", num_trans_q, burst_req_o.src, burst_req_o.dst, burst_req_o.num_bytes, start_addr); f = $fopen("dma.log", "a"); $fwrite(f, str); $fclose(f); diff --git a/hardware/scripts/questa/wave_core.tcl b/hardware/scripts/questa/wave_core.tcl index 85340078d..bcc97e0a6 100644 --- a/hardware/scripts/questa/wave_core.tcl +++ b/hardware/scripts/questa/wave_core.tcl @@ -13,7 +13,7 @@ if {$config == {terapool}} { add wave -noupdate -group core[$1][$2][$3][$4] -group Params /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/gen_rtl_group/i_group/gen_sub_groups[$2]/gen_rtl_sg/i_sub_group/gen_tiles[$3]/i_tile/gen_cores[$4]/gen_mempool_cc/riscv_core/RegisterTCDMReq add wave -noupdate -group core[$1][$2][$3][$4] -group Params 
/mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/gen_rtl_group/i_group/gen_sub_groups[$2]/gen_rtl_sg/i_sub_group/gen_tiles[$3]/i_tile/gen_cores[$4]/gen_mempool_cc/riscv_core/RegisterTCDMResp add wave -noupdate -group core[$1][$2][$3][$4] /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/gen_rtl_group/i_group/gen_sub_groups[$2]/gen_rtl_sg/i_sub_group/gen_tiles[$3]/i_tile/gen_cores[$4]/gen_mempool_cc/riscv_core/i_snitch/clk_i - add wave -noupdate -group core[$1][$2][$3][$4] /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/gen_rtl_group/i_group/gen_sub_groups[$2]/gen_rtl_sg/i_sub_group/gen_tiles[$3]/i_tile/gen_cores[$4]/gen_mempool_cc/riscv_core/i_snitch/rst_i + add wave -noupdate -group core[$1][$2][$3][$4] /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/gen_rtl_group/i_group/gen_sub_groups[$2]/gen_rtl_sg/i_sub_group/gen_tiles[$3]/i_tile/gen_cores[$4]/gen_mempool_cc/riscv_core/i_snitch/rst_ni add wave -noupdate -group core[$1][$2][$3][$4] -radix unsigned /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/gen_rtl_group/i_group/gen_sub_groups[$2]/gen_rtl_sg/i_sub_group/gen_tiles[$3]/i_tile/gen_cores[$4]/gen_mempool_cc/riscv_core/i_snitch/hart_id_i add wave -noupdate -group core[$1][$2][$3][$4] -divider Instructions @@ -182,7 +182,7 @@ if {$config == {terapool}} { add wave -noupdate -group core[$1][$2][$3] -group Params /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/i_group/gen_tiles[$2]/i_tile/gen_cores[$3]/gen_mempool_cc/riscv_core/RegisterTCDMReq add wave -noupdate -group core[$1][$2][$3] -group Params /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/i_group/gen_tiles[$2]/i_tile/gen_cores[$3]/gen_mempool_cc/riscv_core/RegisterTCDMResp add wave -noupdate -group core[$1][$2][$3] /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/i_group/gen_tiles[$2]/i_tile/gen_cores[$3]/gen_mempool_cc/riscv_core/i_snitch/clk_i - add wave -noupdate -group core[$1][$2][$3] 
/mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/i_group/gen_tiles[$2]/i_tile/gen_cores[$3]/gen_mempool_cc/riscv_core/i_snitch/rst_i + add wave -noupdate -group core[$1][$2][$3] /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/i_group/gen_tiles[$2]/i_tile/gen_cores[$3]/gen_mempool_cc/riscv_core/i_snitch/rst_ni add wave -noupdate -group core[$1][$2][$3] -radix unsigned /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/i_group/gen_tiles[$2]/i_tile/gen_cores[$3]/gen_mempool_cc/riscv_core/i_snitch/hart_id_i add wave -noupdate -group core[$1][$2][$3] -divider Instructions diff --git a/hardware/src/address_scrambler.sv b/hardware/src/address_scrambler.sv index d2c790a65..0910eea1a 100644 --- a/hardware/src/address_scrambler.sv +++ b/hardware/src/address_scrambler.sv @@ -7,18 +7,31 @@ // Current constraints: // Author: Samuel Riedel +// Author: Marco Bertuletti module address_scrambler #( parameter int unsigned AddrWidth = 32, + parameter int unsigned DataWidth = 32, parameter int unsigned ByteOffset = 2, + parameter bit Bypass = 0, parameter int unsigned NumTiles = 2, parameter int unsigned NumBanksPerTile = 2, - parameter bit Bypass = 0, - parameter int unsigned SeqMemSizePerTile = 4*1024 + parameter int unsigned TCDMSizePerBank = 1024, + parameter int unsigned SeqMemSizePerTile = 4096, + parameter int unsigned NumDASPartitions = 4, + // Dependant parameters, do not change + parameter int unsigned RowsWidth = $clog2(TCDMSizePerBank) - ByteOffset + 1, + parameter int unsigned MaxPartitionRowWidth = $clog2(TCDMSizePerBank) - ByteOffset, // maximum half of L1 + parameter int unsigned MemSizePerTile = NumBanksPerTile*TCDMSizePerBank, + parameter int unsigned MemSizePerRow = (1 << ByteOffset)*NumBanksPerTile*NumTiles ) ( - input logic [AddrWidth-1:0] address_i, - output logic [AddrWidth-1:0] address_o + input logic [AddrWidth-1:0] address_i, + input logic [NumDASPartitions-1:0][$clog2(NumTiles):0] partition_sel_i, + input logic [NumDASPartitions-1:0][AddrWidth-1:0] 
start_das_i, + input logic [NumDASPartitions-1:0][MaxPartitionRowWidth-1:0] rows_das_i, + output logic [AddrWidth-1:0] address_o ); + // Stack Sequential Settings localparam int unsigned BankOffsetBits = $clog2(NumBanksPerTile); localparam int unsigned TileIdBits = $clog2(NumTiles); localparam int unsigned SeqPerTileBits = $clog2(SeqMemSizePerTile); @@ -26,33 +39,84 @@ module address_scrambler #( localparam int unsigned ConstantBitsLSB = ByteOffset + BankOffsetBits; localparam int unsigned ScrambleBits = SeqPerTileBits-ConstantBitsLSB; - if (Bypass || NumTiles < 2) begin + if (Bypass || NumTiles < 2) begin: gen_bypass assign address_o = address_i; - end else begin + + end else begin: gen_scrambling + // ------ Stack Region Logic ------ // logic [ScrambleBits-1:0] scramble; // Address bits that have to be shuffled around logic [TileIdBits-1:0] tile_id; // Which tile does this address region belong to - // Leave this part of the address unchanged - // The LSBs that correspond to the offset inside a tile. These are the byte offset (bank width) - // and the Bank offset (Number of Banks in tile) - assign address_o[ConstantBitsLSB-1:0] = address_i[ConstantBitsLSB-1:0]; - // The MSBs that are outside of the sequential memory size. Currently the sequential memory size - // always starts at 0. 
These are all the MSBs up to SeqMemSizePerTile*NumTiles - assign address_o[AddrWidth-1:SeqTotalBits] = address_i[AddrWidth-1:SeqTotalBits]; - // Scramble the middle part // Bits that would have gone to different tiles but now go to increasing lines in the same tile assign scramble = address_i[SeqPerTileBits-1:ConstantBitsLSB]; // Bits that would // Bits that would have gone to increasing lines in the same tile but now go to different tiles assign tile_id = address_i[SeqTotalBits-1:SeqPerTileBits]; + // ------ Heap Sequential Signals ------ // + + // `tile_bits` : how many fixed TileID bits + // `row_bits` : how many bits need to swap to the start of Row Index + logic [NumDASPartitions-1:0][$clog2($clog2(NumTiles)+1)-1:0] tile_bits; + logic [NumDASPartitions-1:0][$clog2(MaxPartitionRowWidth)-1:0] row_bits; + + for (genvar i = 0; i < NumDASPartitions; i++) begin : gen_shift_index + lzc #( + .WIDTH ($clog2(NumTiles)+1 ), + .MODE (1'b0 ) + ) i_log_tile_bits ( + .in_i (partition_sel_i[i] ), + .cnt_o (tile_bits[i] ), + .empty_o (/* Unused */ ) + ); + lzc #( + .WIDTH (MaxPartitionRowWidth ), + .MODE (1'b0 ) + ) i_log_row_bits ( + .in_i (rows_das_i[i] ), + .cnt_o (row_bits[i] ), + .empty_o (/* Unused */ ) + ); + end + + logic [NumDASPartitions-1:0][AddrWidth-1:0] lsb_addr; + logic [NumDASPartitions-1:0][AddrWidth-1:0] start_row_addr; + logic [NumDASPartitions-1:0][AddrWidth-1:0] row_addr; + logic [NumDASPartitions-1:0][AddrWidth-1:0] prt_addr; + logic [NumDASPartitions-1:0][AddrWidth-1:0] msb_addr; + logic [NumDASPartitions-1:0][AddrWidth-1:0] aligned_addr; + always_comb begin + // Default: Unscrambled - address_o[SeqTotalBits-1:ConstantBitsLSB] = {tile_id, scramble}; - // If not in bypass mode and address is in sequential region and more than one tile - if (address_i < (NumTiles * SeqMemSizePerTile)) begin + address_o = address_i; + + // Stack Region + if (address_i < (NumTiles * SeqMemSizePerTile)) begin: gen_stack_scrambling + address_o[ConstantBitsLSB-1:0] = 
address_i[ConstantBitsLSB-1:0]; address_o[SeqTotalBits-1:ConstantBitsLSB] = {scramble, tile_id}; - end + address_o[AddrWidth-1:SeqTotalBits] = address_i[AddrWidth-1:SeqTotalBits]; + + // DAS address scrambling + end else begin: gen_das_scrambling + + for (int p = 0; p < NumDASPartitions; p++) begin + if ( (address_i >= start_das_i[p]) && (address_i < start_das_i[p]+MemSizePerRow*rows_das_i[p]) && (partition_sel_i[p] != NumTiles) ) begin + + lsb_addr[p] = address_i & ((1 << (tile_bits[p]+ConstantBitsLSB)) - 1); + msb_addr[p] = address_i & ~((1 << (row_bits[p]+TileIdBits+ConstantBitsLSB)) - 1); + start_row_addr[p] = start_das_i[p] & (((1 << row_bits[p]) - 1) << (TileIdBits + ConstantBitsLSB)); + aligned_addr[p] = address_i - start_row_addr[p]; + + prt_addr[p] = (aligned_addr[p] >> row_bits[p] ) & (((1 << (TileIdBits - tile_bits[p])) - 1) << (ConstantBitsLSB + tile_bits[p])); + row_addr[p] = (aligned_addr[p] << (TileIdBits - tile_bits[p])) & (((1 << (row_bits[p]) ) - 1) << (TileIdBits + ConstantBitsLSB )); + address_o = msb_addr[p] | row_addr[p] | prt_addr[p] | lsb_addr[p]; + address_o = address_o + start_row_addr[p]; + + end + end + + end end end diff --git a/hardware/src/control_registers/control_registers.hjson b/hardware/src/control_registers/control_registers.hjson index 1ef33e86e..69809f6f1 100644 --- a/hardware/src/control_registers/control_registers.hjson +++ b/hardware/src/control_registers/control_registers.hjson @@ -22,6 +22,11 @@ type: "int", default: "8" } + { name: "NumDASPartitions", + desc: "Supported number of DAS partitions", + type: "int", + default: "4" + } ], regwidth: 32 registers: [ @@ -71,6 +76,47 @@ hwqe: "true" fields: [{ bits: "31:0" }] }, + + { multireg: + { + name: "partition_sel" + desc: "Tile grouping for DAS partition" + swaccess: "wo" + hwaccess: "hrw" + hwqe: "true" + // External because we want to define the reset from a parameter + hwext: "true" + count: "NumDASPartitions" + cname: "partition_sel" + fields: [{ bits: "31:0" }] + }, + 
}, + { multireg: + { + name: "start_das" + desc: "Start address of DAS partition" + swaccess: "wo" + hwaccess: "hrw" + hwqe: "true" + // External because we want to define the reset from a parameter + hwext: "true" + count: "NumDASPartitions" + cname: "start_das" + fields: [{ bits: "31:0" }] + }, + }, + { multireg: + { + name: "rows_das" + desc: "Number of rows of DAS partition" + swaccess: "wo" + hwaccess: "hro" + hwqe: "false" + count: "NumDASPartitions" + cname: "rows_das" + fields: [{ bits: "31:0" }] + }, + }, { name: "tcdm_start_address" desc: "TCDM Start Address Register" swaccess: "ro" diff --git a/hardware/src/control_registers/control_registers_reg_pkg.sv b/hardware/src/control_registers/control_registers_reg_pkg.sv index 0291dc527..c380805b3 100644 --- a/hardware/src/control_registers/control_registers_reg_pkg.sv +++ b/hardware/src/control_registers/control_registers_reg_pkg.sv @@ -9,9 +9,10 @@ package control_registers_reg_pkg; // Param list parameter int ROCacheNumAddrRules = 4; parameter int MAX_NumGroups = 8; + parameter int NumDASPartitions = 4; // Address widths within the block - parameter int BlockAw = 7; + parameter int BlockAw = 8; //////////////////////////// // Typedefs for registers // @@ -46,6 +47,20 @@ package control_registers_reg_pkg; logic qe; } control_registers_reg2hw_wake_up_offst_reg_t; + typedef struct packed { + logic [31:0] q; + logic qe; + } control_registers_reg2hw_partition_sel_mreg_t; + + typedef struct packed { + logic [31:0] q; + logic qe; + } control_registers_reg2hw_start_das_mreg_t; + + typedef struct packed { + logic [31:0] q; + } control_registers_reg2hw_rows_das_mreg_t; + typedef struct packed { logic [31:0] q; } control_registers_reg2hw_ro_cache_enable_reg_t; @@ -64,6 +79,14 @@ package control_registers_reg_pkg; logic qe; } control_registers_reg2hw_ro_cache_end_mreg_t; + typedef struct packed { + logic [31:0] d; + } control_registers_hw2reg_partition_sel_mreg_t; + + typedef struct packed { + logic [31:0] d; + } 
control_registers_hw2reg_start_das_mreg_t; + typedef struct packed { logic [31:0] d; } control_registers_hw2reg_tcdm_start_address_reg_t; @@ -86,12 +109,15 @@ package control_registers_reg_pkg; // Register -> HW type typedef struct packed { - control_registers_reg2hw_eoc_reg_t eoc; // [755:724] - control_registers_reg2hw_wake_up_reg_t wake_up; // [723:691] - control_registers_reg2hw_wake_up_tile_mreg_t [7:0] wake_up_tile; // [690:427] - control_registers_reg2hw_wake_up_group_reg_t wake_up_group; // [426:394] - control_registers_reg2hw_wake_up_strd_reg_t wake_up_strd; // [393:361] - control_registers_reg2hw_wake_up_offst_reg_t wake_up_offst; // [360:328] + control_registers_reg2hw_eoc_reg_t eoc; // [1147:1116] + control_registers_reg2hw_wake_up_reg_t wake_up; // [1115:1083] + control_registers_reg2hw_wake_up_tile_mreg_t [7:0] wake_up_tile; // [1082:819] + control_registers_reg2hw_wake_up_group_reg_t wake_up_group; // [818:786] + control_registers_reg2hw_wake_up_strd_reg_t wake_up_strd; // [785:753] + control_registers_reg2hw_wake_up_offst_reg_t wake_up_offst; // [752:720] + control_registers_reg2hw_partition_sel_mreg_t [3:0] partition_sel; // [719:588] + control_registers_reg2hw_start_das_mreg_t [3:0] start_das; // [587:456] + control_registers_reg2hw_rows_das_mreg_t [3:0] rows_das; // [455:328] control_registers_reg2hw_ro_cache_enable_reg_t ro_cache_enable; // [327:296] control_registers_reg2hw_ro_cache_flush_reg_t ro_cache_flush; // [295:264] control_registers_reg2hw_ro_cache_start_mreg_t [3:0] ro_cache_start; // [263:132] @@ -100,6 +126,8 @@ package control_registers_reg_pkg; // HW -> register type typedef struct packed { + control_registers_hw2reg_partition_sel_mreg_t [3:0] partition_sel; // [607:480] + control_registers_hw2reg_start_das_mreg_t [3:0] start_das; // [479:352] control_registers_hw2reg_tcdm_start_address_reg_t tcdm_start_address; // [351:320] control_registers_hw2reg_tcdm_end_address_reg_t tcdm_end_address; // [319:288] 
control_registers_hw2reg_nr_cores_reg_reg_t nr_cores_reg; // [287:256] @@ -108,34 +136,54 @@ package control_registers_reg_pkg; } control_registers_hw2reg_t; // Register offsets - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_EOC_OFFSET = 7'h 0; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_OFFSET = 7'h 4; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_0_OFFSET = 7'h 8; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_1_OFFSET = 7'h c; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_2_OFFSET = 7'h 10; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_3_OFFSET = 7'h 14; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_4_OFFSET = 7'h 18; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_5_OFFSET = 7'h 1c; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_6_OFFSET = 7'h 20; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_7_OFFSET = 7'h 24; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_GROUP_OFFSET = 7'h 28; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_STRD_OFFSET = 7'h 2c; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_OFFST_OFFSET = 7'h 30; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_TCDM_START_ADDRESS_OFFSET = 7'h 34; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_TCDM_END_ADDRESS_OFFSET = 7'h 38; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_NR_CORES_REG_OFFSET = 7'h 3c; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_ENABLE_OFFSET = 7'h 40; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_FLUSH_OFFSET = 7'h 44; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_START_0_OFFSET = 7'h 48; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_START_1_OFFSET = 7'h 4c; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_START_2_OFFSET = 7'h 50; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_START_3_OFFSET = 7'h 54; - parameter logic 
[BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_END_0_OFFSET = 7'h 58; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_END_1_OFFSET = 7'h 5c; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_END_2_OFFSET = 7'h 60; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_END_3_OFFSET = 7'h 64; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_EOC_OFFSET = 8'h 0; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_OFFSET = 8'h 4; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_0_OFFSET = 8'h 8; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_1_OFFSET = 8'h c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_2_OFFSET = 8'h 10; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_3_OFFSET = 8'h 14; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_4_OFFSET = 8'h 18; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_5_OFFSET = 8'h 1c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_6_OFFSET = 8'h 20; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_7_OFFSET = 8'h 24; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_GROUP_OFFSET = 8'h 28; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_STRD_OFFSET = 8'h 2c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_OFFST_OFFSET = 8'h 30; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_PARTITION_SEL_0_OFFSET = 8'h 34; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_PARTITION_SEL_1_OFFSET = 8'h 38; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_PARTITION_SEL_2_OFFSET = 8'h 3c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_PARTITION_SEL_3_OFFSET = 8'h 40; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_START_DAS_0_OFFSET = 8'h 44; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_START_DAS_1_OFFSET = 8'h 48; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_START_DAS_2_OFFSET = 8'h 4c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_START_DAS_3_OFFSET = 8'h 50; + 
parameter logic [BlockAw-1:0] CONTROL_REGISTERS_ROWS_DAS_0_OFFSET = 8'h 54; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_ROWS_DAS_1_OFFSET = 8'h 58; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_ROWS_DAS_2_OFFSET = 8'h 5c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_ROWS_DAS_3_OFFSET = 8'h 60; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_TCDM_START_ADDRESS_OFFSET = 8'h 64; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_TCDM_END_ADDRESS_OFFSET = 8'h 68; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_NR_CORES_REG_OFFSET = 8'h 6c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_ENABLE_OFFSET = 8'h 70; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_FLUSH_OFFSET = 8'h 74; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_START_0_OFFSET = 8'h 78; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_START_1_OFFSET = 8'h 7c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_START_2_OFFSET = 8'h 80; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_START_3_OFFSET = 8'h 84; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_END_0_OFFSET = 8'h 88; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_END_1_OFFSET = 8'h 8c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_END_2_OFFSET = 8'h 90; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_END_3_OFFSET = 8'h 94; // Reset values for hwext registers and their fields + parameter logic [31:0] CONTROL_REGISTERS_PARTITION_SEL_0_RESVAL = 32'h 0; + parameter logic [31:0] CONTROL_REGISTERS_PARTITION_SEL_1_RESVAL = 32'h 0; + parameter logic [31:0] CONTROL_REGISTERS_PARTITION_SEL_2_RESVAL = 32'h 0; + parameter logic [31:0] CONTROL_REGISTERS_PARTITION_SEL_3_RESVAL = 32'h 0; + parameter logic [31:0] CONTROL_REGISTERS_START_DAS_0_RESVAL = 32'h 0; + parameter logic [31:0] CONTROL_REGISTERS_START_DAS_1_RESVAL = 32'h 0; + parameter logic [31:0] CONTROL_REGISTERS_START_DAS_2_RESVAL = 32'h 0; + parameter logic [31:0] 
CONTROL_REGISTERS_START_DAS_3_RESVAL = 32'h 0; parameter logic [31:0] CONTROL_REGISTERS_TCDM_START_ADDRESS_RESVAL = 32'h 0; parameter logic [31:0] CONTROL_REGISTERS_TCDM_END_ADDRESS_RESVAL = 32'h 0; parameter logic [31:0] CONTROL_REGISTERS_NR_CORES_REG_RESVAL = 32'h 0; @@ -163,6 +211,18 @@ package control_registers_reg_pkg; CONTROL_REGISTERS_WAKE_UP_GROUP, CONTROL_REGISTERS_WAKE_UP_STRD, CONTROL_REGISTERS_WAKE_UP_OFFST, + CONTROL_REGISTERS_PARTITION_SEL_0, + CONTROL_REGISTERS_PARTITION_SEL_1, + CONTROL_REGISTERS_PARTITION_SEL_2, + CONTROL_REGISTERS_PARTITION_SEL_3, + CONTROL_REGISTERS_START_DAS_0, + CONTROL_REGISTERS_START_DAS_1, + CONTROL_REGISTERS_START_DAS_2, + CONTROL_REGISTERS_START_DAS_3, + CONTROL_REGISTERS_ROWS_DAS_0, + CONTROL_REGISTERS_ROWS_DAS_1, + CONTROL_REGISTERS_ROWS_DAS_2, + CONTROL_REGISTERS_ROWS_DAS_3, CONTROL_REGISTERS_TCDM_START_ADDRESS, CONTROL_REGISTERS_TCDM_END_ADDRESS, CONTROL_REGISTERS_NR_CORES_REG, @@ -179,7 +239,7 @@ package control_registers_reg_pkg; } control_registers_id_e; // Register width information to check illegal writes - parameter logic [3:0] CONTROL_REGISTERS_PERMIT [26] = '{ + parameter logic [3:0] CONTROL_REGISTERS_PERMIT [38] = '{ 4'b 1111, // index[ 0] CONTROL_REGISTERS_EOC 4'b 1111, // index[ 1] CONTROL_REGISTERS_WAKE_UP 4'b 1111, // index[ 2] CONTROL_REGISTERS_WAKE_UP_TILE_0 @@ -193,19 +253,31 @@ package control_registers_reg_pkg; 4'b 1111, // index[10] CONTROL_REGISTERS_WAKE_UP_GROUP 4'b 1111, // index[11] CONTROL_REGISTERS_WAKE_UP_STRD 4'b 1111, // index[12] CONTROL_REGISTERS_WAKE_UP_OFFST - 4'b 1111, // index[13] CONTROL_REGISTERS_TCDM_START_ADDRESS - 4'b 1111, // index[14] CONTROL_REGISTERS_TCDM_END_ADDRESS - 4'b 1111, // index[15] CONTROL_REGISTERS_NR_CORES_REG - 4'b 1111, // index[16] CONTROL_REGISTERS_RO_CACHE_ENABLE - 4'b 1111, // index[17] CONTROL_REGISTERS_RO_CACHE_FLUSH - 4'b 1111, // index[18] CONTROL_REGISTERS_RO_CACHE_START_0 - 4'b 1111, // index[19] CONTROL_REGISTERS_RO_CACHE_START_1 - 4'b 1111, // 
index[20] CONTROL_REGISTERS_RO_CACHE_START_2 - 4'b 1111, // index[21] CONTROL_REGISTERS_RO_CACHE_START_3 - 4'b 1111, // index[22] CONTROL_REGISTERS_RO_CACHE_END_0 - 4'b 1111, // index[23] CONTROL_REGISTERS_RO_CACHE_END_1 - 4'b 1111, // index[24] CONTROL_REGISTERS_RO_CACHE_END_2 - 4'b 1111 // index[25] CONTROL_REGISTERS_RO_CACHE_END_3 + 4'b 1111, // index[13] CONTROL_REGISTERS_PARTITION_SEL_0 + 4'b 1111, // index[14] CONTROL_REGISTERS_PARTITION_SEL_1 + 4'b 1111, // index[15] CONTROL_REGISTERS_PARTITION_SEL_2 + 4'b 1111, // index[16] CONTROL_REGISTERS_PARTITION_SEL_3 + 4'b 1111, // index[17] CONTROL_REGISTERS_START_DAS_0 + 4'b 1111, // index[18] CONTROL_REGISTERS_START_DAS_1 + 4'b 1111, // index[19] CONTROL_REGISTERS_START_DAS_2 + 4'b 1111, // index[20] CONTROL_REGISTERS_START_DAS_3 + 4'b 1111, // index[21] CONTROL_REGISTERS_ROWS_DAS_0 + 4'b 1111, // index[22] CONTROL_REGISTERS_ROWS_DAS_1 + 4'b 1111, // index[23] CONTROL_REGISTERS_ROWS_DAS_2 + 4'b 1111, // index[24] CONTROL_REGISTERS_ROWS_DAS_3 + 4'b 1111, // index[25] CONTROL_REGISTERS_TCDM_START_ADDRESS + 4'b 1111, // index[26] CONTROL_REGISTERS_TCDM_END_ADDRESS + 4'b 1111, // index[27] CONTROL_REGISTERS_NR_CORES_REG + 4'b 1111, // index[28] CONTROL_REGISTERS_RO_CACHE_ENABLE + 4'b 1111, // index[29] CONTROL_REGISTERS_RO_CACHE_FLUSH + 4'b 1111, // index[30] CONTROL_REGISTERS_RO_CACHE_START_0 + 4'b 1111, // index[31] CONTROL_REGISTERS_RO_CACHE_START_1 + 4'b 1111, // index[32] CONTROL_REGISTERS_RO_CACHE_START_2 + 4'b 1111, // index[33] CONTROL_REGISTERS_RO_CACHE_START_3 + 4'b 1111, // index[34] CONTROL_REGISTERS_RO_CACHE_END_0 + 4'b 1111, // index[35] CONTROL_REGISTERS_RO_CACHE_END_1 + 4'b 1111, // index[36] CONTROL_REGISTERS_RO_CACHE_END_2 + 4'b 1111 // index[37] CONTROL_REGISTERS_RO_CACHE_END_3 }; endpackage diff --git a/hardware/src/control_registers/control_registers_reg_top.sv b/hardware/src/control_registers/control_registers_reg_top.sv index 9258d111d..ccd3cce5b 100644 --- 
a/hardware/src/control_registers/control_registers_reg_top.sv +++ b/hardware/src/control_registers/control_registers_reg_top.sv @@ -10,7 +10,7 @@ module control_registers_reg_top #( parameter type reg_req_t = logic, parameter type reg_rsp_t = logic, - parameter int AW = 7 + parameter int AW = 8 ) ( input logic clk_i, input logic rst_ni, @@ -95,6 +95,30 @@ module control_registers_reg_top #( logic wake_up_strd_we; logic [31:0] wake_up_offst_wd; logic wake_up_offst_we; + logic [31:0] partition_sel_0_wd; + logic partition_sel_0_we; + logic [31:0] partition_sel_1_wd; + logic partition_sel_1_we; + logic [31:0] partition_sel_2_wd; + logic partition_sel_2_we; + logic [31:0] partition_sel_3_wd; + logic partition_sel_3_we; + logic [31:0] start_das_0_wd; + logic start_das_0_we; + logic [31:0] start_das_1_wd; + logic start_das_1_we; + logic [31:0] start_das_2_wd; + logic start_das_2_we; + logic [31:0] start_das_3_wd; + logic start_das_3_we; + logic [31:0] rows_das_0_wd; + logic rows_das_0_we; + logic [31:0] rows_das_1_wd; + logic rows_das_1_we; + logic [31:0] rows_das_2_wd; + logic rows_das_2_we; + logic [31:0] rows_das_3_wd; + logic rows_das_3_we; logic [31:0] tcdm_start_address_qs; logic tcdm_start_address_re; logic [31:0] tcdm_end_address_qs; @@ -482,6 +506,244 @@ module control_registers_reg_top #( ); + + // Subregister 0 of Multireg partition_sel + // R[partition_sel_0]: V(True) + + prim_subreg_ext #( + .DW (32) + ) u_partition_sel_0 ( + .re (1'b0), + .we (partition_sel_0_we), + .wd (partition_sel_0_wd), + .d (hw2reg.partition_sel[0].d), + .qre (), + .qe (reg2hw.partition_sel[0].qe), + .q (reg2hw.partition_sel[0].q ), + .qs () + ); + + // Subregister 1 of Multireg partition_sel + // R[partition_sel_1]: V(True) + + prim_subreg_ext #( + .DW (32) + ) u_partition_sel_1 ( + .re (1'b0), + .we (partition_sel_1_we), + .wd (partition_sel_1_wd), + .d (hw2reg.partition_sel[1].d), + .qre (), + .qe (reg2hw.partition_sel[1].qe), + .q (reg2hw.partition_sel[1].q ), + .qs () + ); + + // 
Subregister 2 of Multireg partition_sel + // R[partition_sel_2]: V(True) + + prim_subreg_ext #( + .DW (32) + ) u_partition_sel_2 ( + .re (1'b0), + .we (partition_sel_2_we), + .wd (partition_sel_2_wd), + .d (hw2reg.partition_sel[2].d), + .qre (), + .qe (reg2hw.partition_sel[2].qe), + .q (reg2hw.partition_sel[2].q ), + .qs () + ); + + // Subregister 3 of Multireg partition_sel + // R[partition_sel_3]: V(True) + + prim_subreg_ext #( + .DW (32) + ) u_partition_sel_3 ( + .re (1'b0), + .we (partition_sel_3_we), + .wd (partition_sel_3_wd), + .d (hw2reg.partition_sel[3].d), + .qre (), + .qe (reg2hw.partition_sel[3].qe), + .q (reg2hw.partition_sel[3].q ), + .qs () + ); + + + + // Subregister 0 of Multireg start_das + // R[start_das_0]: V(True) + + prim_subreg_ext #( + .DW (32) + ) u_start_das_0 ( + .re (1'b0), + .we (start_das_0_we), + .wd (start_das_0_wd), + .d (hw2reg.start_das[0].d), + .qre (), + .qe (reg2hw.start_das[0].qe), + .q (reg2hw.start_das[0].q ), + .qs () + ); + + // Subregister 1 of Multireg start_das + // R[start_das_1]: V(True) + + prim_subreg_ext #( + .DW (32) + ) u_start_das_1 ( + .re (1'b0), + .we (start_das_1_we), + .wd (start_das_1_wd), + .d (hw2reg.start_das[1].d), + .qre (), + .qe (reg2hw.start_das[1].qe), + .q (reg2hw.start_das[1].q ), + .qs () + ); + + // Subregister 2 of Multireg start_das + // R[start_das_2]: V(True) + + prim_subreg_ext #( + .DW (32) + ) u_start_das_2 ( + .re (1'b0), + .we (start_das_2_we), + .wd (start_das_2_wd), + .d (hw2reg.start_das[2].d), + .qre (), + .qe (reg2hw.start_das[2].qe), + .q (reg2hw.start_das[2].q ), + .qs () + ); + + // Subregister 3 of Multireg start_das + // R[start_das_3]: V(True) + + prim_subreg_ext #( + .DW (32) + ) u_start_das_3 ( + .re (1'b0), + .we (start_das_3_we), + .wd (start_das_3_wd), + .d (hw2reg.start_das[3].d), + .qre (), + .qe (reg2hw.start_das[3].qe), + .q (reg2hw.start_das[3].q ), + .qs () + ); + + + + // Subregister 0 of Multireg rows_das + // R[rows_das_0]: V(False) + + prim_subreg #( + .DW 
(32), + .SWACCESS("WO"), + .RESVAL (32'h0) + ) u_rows_das_0 ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + + // from register interface + .we (rows_das_0_we), + .wd (rows_das_0_wd), + + // from internal hardware + .de (1'b0), + .d ('0 ), + + // to internal hardware + .qe (), + .q (reg2hw.rows_das[0].q ), + + .qs () + ); + + // Subregister 1 of Multireg rows_das + // R[rows_das_1]: V(False) + + prim_subreg #( + .DW (32), + .SWACCESS("WO"), + .RESVAL (32'h0) + ) u_rows_das_1 ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + + // from register interface + .we (rows_das_1_we), + .wd (rows_das_1_wd), + + // from internal hardware + .de (1'b0), + .d ('0 ), + + // to internal hardware + .qe (), + .q (reg2hw.rows_das[1].q ), + + .qs () + ); + + // Subregister 2 of Multireg rows_das + // R[rows_das_2]: V(False) + + prim_subreg #( + .DW (32), + .SWACCESS("WO"), + .RESVAL (32'h0) + ) u_rows_das_2 ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + + // from register interface + .we (rows_das_2_we), + .wd (rows_das_2_wd), + + // from internal hardware + .de (1'b0), + .d ('0 ), + + // to internal hardware + .qe (), + .q (reg2hw.rows_das[2].q ), + + .qs () + ); + + // Subregister 3 of Multireg rows_das + // R[rows_das_3]: V(False) + + prim_subreg #( + .DW (32), + .SWACCESS("WO"), + .RESVAL (32'h0) + ) u_rows_das_3 ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + + // from register interface + .we (rows_das_3_we), + .wd (rows_das_3_wd), + + // from internal hardware + .de (1'b0), + .d ('0 ), + + // to internal hardware + .qe (), + .q (reg2hw.rows_das[3].q ), + + .qs () + ); + + // R[tcdm_start_address]: V(True) prim_subreg_ext #( @@ -718,7 +980,7 @@ module control_registers_reg_top #( - logic [25:0] addr_hit; + logic [37:0] addr_hit; always_comb begin addr_hit = '0; addr_hit[ 0] = (reg_addr == CONTROL_REGISTERS_EOC_OFFSET); @@ -734,19 +996,31 @@ module control_registers_reg_top #( addr_hit[10] = (reg_addr == CONTROL_REGISTERS_WAKE_UP_GROUP_OFFSET); addr_hit[11] = (reg_addr == 
CONTROL_REGISTERS_WAKE_UP_STRD_OFFSET); addr_hit[12] = (reg_addr == CONTROL_REGISTERS_WAKE_UP_OFFST_OFFSET); - addr_hit[13] = (reg_addr == CONTROL_REGISTERS_TCDM_START_ADDRESS_OFFSET); - addr_hit[14] = (reg_addr == CONTROL_REGISTERS_TCDM_END_ADDRESS_OFFSET); - addr_hit[15] = (reg_addr == CONTROL_REGISTERS_NR_CORES_REG_OFFSET); - addr_hit[16] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_ENABLE_OFFSET); - addr_hit[17] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_FLUSH_OFFSET); - addr_hit[18] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_START_0_OFFSET); - addr_hit[19] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_START_1_OFFSET); - addr_hit[20] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_START_2_OFFSET); - addr_hit[21] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_START_3_OFFSET); - addr_hit[22] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_END_0_OFFSET); - addr_hit[23] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_END_1_OFFSET); - addr_hit[24] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_END_2_OFFSET); - addr_hit[25] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_END_3_OFFSET); + addr_hit[13] = (reg_addr == CONTROL_REGISTERS_PARTITION_SEL_0_OFFSET); + addr_hit[14] = (reg_addr == CONTROL_REGISTERS_PARTITION_SEL_1_OFFSET); + addr_hit[15] = (reg_addr == CONTROL_REGISTERS_PARTITION_SEL_2_OFFSET); + addr_hit[16] = (reg_addr == CONTROL_REGISTERS_PARTITION_SEL_3_OFFSET); + addr_hit[17] = (reg_addr == CONTROL_REGISTERS_START_DAS_0_OFFSET); + addr_hit[18] = (reg_addr == CONTROL_REGISTERS_START_DAS_1_OFFSET); + addr_hit[19] = (reg_addr == CONTROL_REGISTERS_START_DAS_2_OFFSET); + addr_hit[20] = (reg_addr == CONTROL_REGISTERS_START_DAS_3_OFFSET); + addr_hit[21] = (reg_addr == CONTROL_REGISTERS_ROWS_DAS_0_OFFSET); + addr_hit[22] = (reg_addr == CONTROL_REGISTERS_ROWS_DAS_1_OFFSET); + addr_hit[23] = (reg_addr == CONTROL_REGISTERS_ROWS_DAS_2_OFFSET); + addr_hit[24] = (reg_addr == CONTROL_REGISTERS_ROWS_DAS_3_OFFSET); + addr_hit[25] = (reg_addr == CONTROL_REGISTERS_TCDM_START_ADDRESS_OFFSET); + addr_hit[26] = 
(reg_addr == CONTROL_REGISTERS_TCDM_END_ADDRESS_OFFSET); + addr_hit[27] = (reg_addr == CONTROL_REGISTERS_NR_CORES_REG_OFFSET); + addr_hit[28] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_ENABLE_OFFSET); + addr_hit[29] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_FLUSH_OFFSET); + addr_hit[30] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_START_0_OFFSET); + addr_hit[31] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_START_1_OFFSET); + addr_hit[32] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_START_2_OFFSET); + addr_hit[33] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_START_3_OFFSET); + addr_hit[34] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_END_0_OFFSET); + addr_hit[35] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_END_1_OFFSET); + addr_hit[36] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_END_2_OFFSET); + addr_hit[37] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_END_3_OFFSET); end assign addrmiss = (reg_re || reg_we) ? ~|addr_hit : 1'b0 ; @@ -779,7 +1053,19 @@ module control_registers_reg_top #( (addr_hit[22] & (|(CONTROL_REGISTERS_PERMIT[22] & ~reg_be))) | (addr_hit[23] & (|(CONTROL_REGISTERS_PERMIT[23] & ~reg_be))) | (addr_hit[24] & (|(CONTROL_REGISTERS_PERMIT[24] & ~reg_be))) | - (addr_hit[25] & (|(CONTROL_REGISTERS_PERMIT[25] & ~reg_be))))); + (addr_hit[25] & (|(CONTROL_REGISTERS_PERMIT[25] & ~reg_be))) | + (addr_hit[26] & (|(CONTROL_REGISTERS_PERMIT[26] & ~reg_be))) | + (addr_hit[27] & (|(CONTROL_REGISTERS_PERMIT[27] & ~reg_be))) | + (addr_hit[28] & (|(CONTROL_REGISTERS_PERMIT[28] & ~reg_be))) | + (addr_hit[29] & (|(CONTROL_REGISTERS_PERMIT[29] & ~reg_be))) | + (addr_hit[30] & (|(CONTROL_REGISTERS_PERMIT[30] & ~reg_be))) | + (addr_hit[31] & (|(CONTROL_REGISTERS_PERMIT[31] & ~reg_be))) | + (addr_hit[32] & (|(CONTROL_REGISTERS_PERMIT[32] & ~reg_be))) | + (addr_hit[33] & (|(CONTROL_REGISTERS_PERMIT[33] & ~reg_be))) | + (addr_hit[34] & (|(CONTROL_REGISTERS_PERMIT[34] & ~reg_be))) | + (addr_hit[35] & (|(CONTROL_REGISTERS_PERMIT[35] & ~reg_be))) | + (addr_hit[36] & (|(CONTROL_REGISTERS_PERMIT[36] & 
~reg_be))) | + (addr_hit[37] & (|(CONTROL_REGISTERS_PERMIT[37] & ~reg_be))))); end assign eoc_we = addr_hit[0] & reg_we & !reg_error; @@ -821,49 +1107,85 @@ module control_registers_reg_top #( assign wake_up_offst_we = addr_hit[12] & reg_we & !reg_error; assign wake_up_offst_wd = reg_wdata[31:0]; - assign tcdm_start_address_re = addr_hit[13] & reg_re & !reg_error; + assign partition_sel_0_we = addr_hit[13] & reg_we & !reg_error; + assign partition_sel_0_wd = reg_wdata[31:0]; + + assign partition_sel_1_we = addr_hit[14] & reg_we & !reg_error; + assign partition_sel_1_wd = reg_wdata[31:0]; + + assign partition_sel_2_we = addr_hit[15] & reg_we & !reg_error; + assign partition_sel_2_wd = reg_wdata[31:0]; + + assign partition_sel_3_we = addr_hit[16] & reg_we & !reg_error; + assign partition_sel_3_wd = reg_wdata[31:0]; + + assign start_das_0_we = addr_hit[17] & reg_we & !reg_error; + assign start_das_0_wd = reg_wdata[31:0]; + + assign start_das_1_we = addr_hit[18] & reg_we & !reg_error; + assign start_das_1_wd = reg_wdata[31:0]; + + assign start_das_2_we = addr_hit[19] & reg_we & !reg_error; + assign start_das_2_wd = reg_wdata[31:0]; - assign tcdm_end_address_re = addr_hit[14] & reg_re & !reg_error; + assign start_das_3_we = addr_hit[20] & reg_we & !reg_error; + assign start_das_3_wd = reg_wdata[31:0]; - assign nr_cores_reg_re = addr_hit[15] & reg_re & !reg_error; + assign rows_das_0_we = addr_hit[21] & reg_we & !reg_error; + assign rows_das_0_wd = reg_wdata[31:0]; - assign ro_cache_enable_we = addr_hit[16] & reg_we & !reg_error; + assign rows_das_1_we = addr_hit[22] & reg_we & !reg_error; + assign rows_das_1_wd = reg_wdata[31:0]; + + assign rows_das_2_we = addr_hit[23] & reg_we & !reg_error; + assign rows_das_2_wd = reg_wdata[31:0]; + + assign rows_das_3_we = addr_hit[24] & reg_we & !reg_error; + assign rows_das_3_wd = reg_wdata[31:0]; + + assign tcdm_start_address_re = addr_hit[25] & reg_re & !reg_error; + + assign tcdm_end_address_re = addr_hit[26] & reg_re & 
!reg_error; + + assign nr_cores_reg_re = addr_hit[27] & reg_re & !reg_error; + + assign ro_cache_enable_we = addr_hit[28] & reg_we & !reg_error; assign ro_cache_enable_wd = reg_wdata[31:0]; - assign ro_cache_flush_we = addr_hit[17] & reg_we & !reg_error; + assign ro_cache_flush_we = addr_hit[29] & reg_we & !reg_error; assign ro_cache_flush_wd = reg_wdata[31:0]; - assign ro_cache_start_0_we = addr_hit[18] & reg_we & !reg_error; + assign ro_cache_start_0_we = addr_hit[30] & reg_we & !reg_error; assign ro_cache_start_0_wd = reg_wdata[31:0]; - assign ro_cache_start_0_re = addr_hit[18] & reg_re & !reg_error; + assign ro_cache_start_0_re = addr_hit[30] & reg_re & !reg_error; - assign ro_cache_start_1_we = addr_hit[19] & reg_we & !reg_error; + assign ro_cache_start_1_we = addr_hit[31] & reg_we & !reg_error; assign ro_cache_start_1_wd = reg_wdata[31:0]; - assign ro_cache_start_1_re = addr_hit[19] & reg_re & !reg_error; + assign ro_cache_start_1_re = addr_hit[31] & reg_re & !reg_error; - assign ro_cache_start_2_we = addr_hit[20] & reg_we & !reg_error; + assign ro_cache_start_2_we = addr_hit[32] & reg_we & !reg_error; assign ro_cache_start_2_wd = reg_wdata[31:0]; - assign ro_cache_start_2_re = addr_hit[20] & reg_re & !reg_error; + assign ro_cache_start_2_re = addr_hit[32] & reg_re & !reg_error; - assign ro_cache_start_3_we = addr_hit[21] & reg_we & !reg_error; + assign ro_cache_start_3_we = addr_hit[33] & reg_we & !reg_error; assign ro_cache_start_3_wd = reg_wdata[31:0]; - assign ro_cache_start_3_re = addr_hit[21] & reg_re & !reg_error; + assign ro_cache_start_3_re = addr_hit[33] & reg_re & !reg_error; - assign ro_cache_end_0_we = addr_hit[22] & reg_we & !reg_error; + assign ro_cache_end_0_we = addr_hit[34] & reg_we & !reg_error; assign ro_cache_end_0_wd = reg_wdata[31:0]; - assign ro_cache_end_0_re = addr_hit[22] & reg_re & !reg_error; + assign ro_cache_end_0_re = addr_hit[34] & reg_re & !reg_error; - assign ro_cache_end_1_we = addr_hit[23] & reg_we & !reg_error; + assign 
ro_cache_end_1_we = addr_hit[35] & reg_we & !reg_error; assign ro_cache_end_1_wd = reg_wdata[31:0]; - assign ro_cache_end_1_re = addr_hit[23] & reg_re & !reg_error; + assign ro_cache_end_1_re = addr_hit[35] & reg_re & !reg_error; - assign ro_cache_end_2_we = addr_hit[24] & reg_we & !reg_error; + assign ro_cache_end_2_we = addr_hit[36] & reg_we & !reg_error; assign ro_cache_end_2_wd = reg_wdata[31:0]; - assign ro_cache_end_2_re = addr_hit[24] & reg_re & !reg_error; + assign ro_cache_end_2_re = addr_hit[36] & reg_re & !reg_error; - assign ro_cache_end_3_we = addr_hit[25] & reg_we & !reg_error; + assign ro_cache_end_3_we = addr_hit[37] & reg_we & !reg_error; assign ro_cache_end_3_wd = reg_wdata[31:0]; - assign ro_cache_end_3_re = addr_hit[25] & reg_re & !reg_error; + assign ro_cache_end_3_re = addr_hit[37] & reg_re & !reg_error; // Read data return always_comb begin @@ -922,54 +1244,102 @@ module control_registers_reg_top #( end addr_hit[13]: begin - reg_rdata_next[31:0] = tcdm_start_address_qs; + reg_rdata_next[31:0] = '0; end addr_hit[14]: begin - reg_rdata_next[31:0] = tcdm_end_address_qs; + reg_rdata_next[31:0] = '0; end addr_hit[15]: begin - reg_rdata_next[31:0] = nr_cores_reg_qs; + reg_rdata_next[31:0] = '0; end addr_hit[16]: begin - reg_rdata_next[31:0] = ro_cache_enable_qs; + reg_rdata_next[31:0] = '0; end addr_hit[17]: begin - reg_rdata_next[31:0] = ro_cache_flush_qs; + reg_rdata_next[31:0] = '0; end addr_hit[18]: begin - reg_rdata_next[31:0] = ro_cache_start_0_qs; + reg_rdata_next[31:0] = '0; end addr_hit[19]: begin - reg_rdata_next[31:0] = ro_cache_start_1_qs; + reg_rdata_next[31:0] = '0; end addr_hit[20]: begin - reg_rdata_next[31:0] = ro_cache_start_2_qs; + reg_rdata_next[31:0] = '0; end addr_hit[21]: begin - reg_rdata_next[31:0] = ro_cache_start_3_qs; + reg_rdata_next[31:0] = '0; end addr_hit[22]: begin - reg_rdata_next[31:0] = ro_cache_end_0_qs; + reg_rdata_next[31:0] = '0; end addr_hit[23]: begin - reg_rdata_next[31:0] = ro_cache_end_1_qs; + 
reg_rdata_next[31:0] = '0; end addr_hit[24]: begin - reg_rdata_next[31:0] = ro_cache_end_2_qs; + reg_rdata_next[31:0] = '0; end addr_hit[25]: begin + reg_rdata_next[31:0] = tcdm_start_address_qs; + end + + addr_hit[26]: begin + reg_rdata_next[31:0] = tcdm_end_address_qs; + end + + addr_hit[27]: begin + reg_rdata_next[31:0] = nr_cores_reg_qs; + end + + addr_hit[28]: begin + reg_rdata_next[31:0] = ro_cache_enable_qs; + end + + addr_hit[29]: begin + reg_rdata_next[31:0] = ro_cache_flush_qs; + end + + addr_hit[30]: begin + reg_rdata_next[31:0] = ro_cache_start_0_qs; + end + + addr_hit[31]: begin + reg_rdata_next[31:0] = ro_cache_start_1_qs; + end + + addr_hit[32]: begin + reg_rdata_next[31:0] = ro_cache_start_2_qs; + end + + addr_hit[33]: begin + reg_rdata_next[31:0] = ro_cache_start_3_qs; + end + + addr_hit[34]: begin + reg_rdata_next[31:0] = ro_cache_end_0_qs; + end + + addr_hit[35]: begin + reg_rdata_next[31:0] = ro_cache_end_1_qs; + end + + addr_hit[36]: begin + reg_rdata_next[31:0] = ro_cache_end_2_qs; + end + + addr_hit[37]: begin reg_rdata_next[31:0] = ro_cache_end_3_qs; end @@ -996,7 +1366,7 @@ endmodule /* verilator lint_off DECLFILENAME */ module control_registers_reg_top_intf #( - parameter int AW = 7, + parameter int AW = 8, localparam int DW = 32 ) ( input logic clk_i, diff --git a/hardware/src/ctrl_registers.sv b/hardware/src/ctrl_registers.sv index 03cbe1bbb..ce0b5f7ce 100644 --- a/hardware/src/ctrl_registers.sv +++ b/hardware/src/ctrl_registers.sv @@ -7,6 +7,10 @@ module ctrl_registers import mempool_pkg::ro_cache_ctrl_t; + import mempool_pkg::NumDASPartitions; + import mempool_pkg::TileInterleavingWidth; + import mempool_pkg::RowsInterleavingWidth; + import mempool_pkg::AddrWidth; #( parameter int DataWidth = 32, // Parameters @@ -17,16 +21,19 @@ module ctrl_registers parameter type axi_lite_req_t = logic, parameter type axi_lite_resp_t = logic ) ( - input logic clk_i, - input logic rst_ni, + input logic clk_i, + input logic rst_ni, // AXI Bus - input 
axi_lite_req_t axi_lite_slave_req_i, - output axi_lite_resp_t axi_lite_slave_resp_o, + input axi_lite_req_t axi_lite_slave_req_i, + output axi_lite_resp_t axi_lite_slave_resp_o, // Control registers - output logic [DataWidth-1:0] eoc_o, - output logic eoc_valid_o, - output logic [NumCores-1:0] wake_up_o, - output ro_cache_ctrl_t ro_cache_ctrl_o + output logic [NumDASPartitions-1:0][TileInterleavingWidth-1:0] partition_sel_o, + output logic [NumDASPartitions-1:0][AddrWidth-1:0] start_das_o, + output logic [NumDASPartitions-1:0][RowsInterleavingWidth-1:0] rows_das_o, + output logic [DataWidth-1:0] eoc_o, + output logic eoc_valid_o, + output logic [NumCores-1:0] wake_up_o, + output ro_cache_ctrl_t ro_cache_ctrl_o ); import mempool_pkg::AddrWidth; @@ -98,6 +105,14 @@ module ctrl_registers `FFL(ctrl_hw2reg.ro_cache_end[i].d, ctrl_reg2hw.ro_cache_end[i].q, ctrl_reg2hw.ro_cache_end[i].qe, ro_cache_regions[i].end_addr, clk_i, rst_ni) end + for (genvar i = 0; i < mempool_pkg::NumDASPartitions; i++) begin: gen_das_regs + `FFL(ctrl_hw2reg.partition_sel[i].d, ctrl_reg2hw.partition_sel[i].q, ctrl_reg2hw.partition_sel[i].qe, mempool_pkg::NumTiles); + `FFL(ctrl_hw2reg.start_das[i].d, ctrl_reg2hw.start_das[i].q, ctrl_reg2hw.start_das[i].qe, mempool_pkg::DASStartAddr); + assign partition_sel_o[i] = ctrl_hw2reg.partition_sel[i].d[TileInterleavingWidth-1:0]; + assign start_das_o[i] = ctrl_hw2reg.start_das[i].d; + assign rows_das_o[i] = ctrl_reg2hw.rows_das[i].q[RowsInterleavingWidth-1:0]; + end + /************************ * Wakeup Pulse Logic * ************************/ diff --git a/hardware/src/mempool_cluster.sv b/hardware/src/mempool_cluster.sv index 561e0d369..bcf95c19d 100644 --- a/hardware/src/mempool_cluster.sv +++ b/hardware/src/mempool_cluster.sv @@ -24,6 +24,12 @@ module mempool_cluster input logic scan_enable_i, input logic scan_data_i, output logic scan_data_o, +`ifdef DAS + // Partition Selection + input logic [NumDASPartitions-1:0][TileInterleavingWidth-1:0] 
partition_sel_i, + input logic [NumDASPartitions-1:0][AddrWidth-1:0] start_das_i, + input logic [NumDASPartitions-1:0][RowsInterleavingWidth-1:0] rows_das_i, +`endif // Wake up signal input logic [NumCores-1:0] wake_up_i, // RO-Cache configuration @@ -73,6 +79,7 @@ module mempool_cluster `FF(dma_meta_o, dma_meta_cut, '0, clk_i, rst_ni); + dma_req_t dma_req_split; logic dma_req_split_valid; logic dma_req_split_ready; @@ -81,27 +88,39 @@ module mempool_cluster logic [NumGroups-1:0] dma_req_group_valid, dma_req_group_q_valid; logic [NumGroups-1:0] dma_req_group_ready, dma_req_group_q_ready; dma_meta_t [NumGroups-1:0] dma_meta, dma_meta_q; + logic [RowsInterleavingWidth-1:0] dma_rows_das; `FF(dma_meta_q, dma_meta, '0, clk_i, rst_ni); idma_split_midend #( - .DmaRegionWidth (NumBanksPerGroup*NumGroups*4), - .DmaRegionStart (TCDMBaseAddr ), - .DmaRegionEnd (TCDMBaseAddr+TCDMSize ), - .AddrWidth (AddrWidth ), - .burst_req_t (dma_req_t ), - .meta_t (dma_meta_t ) + .DmaRegionWidth (NumBanksPerGroup*NumGroups*4), // #DmaBytes = #banks*4 = 4096*4 // size per row + .DmaRegionStart (TCDMBaseAddr ), // 0x0000_0000, defined in tb + .DmaRegionEnd (TCDMBaseAddr+TCDMSize ), // TCDMSize = #banks*l1banksize = 4096*1024 // size of DMA region + .AddrWidth (AddrWidth ), + .burst_req_t (dma_req_t ), + .meta_t (dma_meta_t ), + .NumTiles (NumTiles ), + .NumBanksPerTile (NumBanksPerTile ), + .TCDMSizePerBank (TCDMSizePerBank ), + .NumDASPartitions (NumDASPartitions ), + .DASStartAddr (DASStartAddr ) ) i_idma_split_midend ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .burst_req_i(dma_req_cut ), - .valid_i (dma_req_cut_valid ), - .ready_o (dma_req_cut_ready ), - .meta_o (dma_meta_cut ), - .burst_req_o(dma_req_split ), - .valid_o (dma_req_split_valid), - .ready_i (dma_req_split_ready), - .meta_i (dma_meta_split ) + .clk_i (clk_i ), + .rst_ni (rst_ni ), +`ifdef DAS + .partition_sel_i (partition_sel_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), + .rows_das_o (dma_rows_das ), 
+`endif + .burst_req_i (dma_req_cut ), + .valid_i (dma_req_cut_valid ), + .ready_o (dma_req_cut_ready ), + .meta_o (dma_meta_cut ), + .burst_req_o (dma_req_split ), + .valid_o (dma_req_split_valid), + .ready_i (dma_req_split_ready), + .meta_i (dma_meta_split ) ); idma_distributed_midend #( @@ -110,11 +129,16 @@ module mempool_cluster .DmaRegionStart (TCDMBaseAddr ), .DmaRegionEnd (TCDMBaseAddr+TCDMSize), .TransFifoDepth (16 ), + .NumTiles (NumTiles ), + .NumDASPartitions (NumDASPartitions ), .burst_req_t (dma_req_t ), .meta_t (dma_meta_t ) ) i_idma_distributed_midend ( .clk_i (clk_i ), .rst_ni (rst_ni ), +`ifdef DAS + .rows_das_i (dma_rows_das ), +`endif .burst_req_i (dma_req_split ), .valid_i (dma_req_split_valid), .ready_o (dma_req_split_ready), @@ -294,6 +318,12 @@ module mempool_cluster .tcdm_slave_resp_o (tcdm_slave_resp[g] ), .tcdm_slave_resp_valid_o (tcdm_slave_resp_valid[g] ), .tcdm_slave_resp_ready_i (tcdm_slave_resp_ready[g] ), +`ifdef DAS + .partition_sel_i (partition_sel_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), + .dma_rows_das_i (dma_rows_das ), +`endif .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), .ro_cache_ctrl_i (ro_cache_ctrl_q[g] ), // DMA request @@ -335,6 +365,12 @@ module mempool_cluster .tcdm_slave_resp_o (tcdm_slave_resp[g] ), .tcdm_slave_resp_valid_o (tcdm_slave_resp_valid[g] ), .tcdm_slave_resp_ready_i (tcdm_slave_resp_ready[g] ), +`ifdef DAS + .partition_sel_i (partition_sel_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), + .dma_rows_das_i (dma_rows_das ), +`endif .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), .ro_cache_ctrl_i (ro_cache_ctrl_q[g] ), // DMA request @@ -373,6 +409,12 @@ module mempool_cluster .tcdm_slave_resp_o (tcdm_slave_resp[g] ), .tcdm_slave_resp_valid_o (tcdm_slave_resp_valid[g] ), .tcdm_slave_resp_ready_i (tcdm_slave_resp_ready[g] ), +`ifdef DAS + .partition_sel_i (partition_sel_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), 
+ .dma_rows_das_i (dma_rows_das ), +`endif .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), .ro_cache_ctrl_i (ro_cache_ctrl_q[g] ), // DMA request @@ -435,38 +477,44 @@ module mempool_cluster .TCDMBaseAddr (TCDMBaseAddr ), .BootAddr (BootAddr ) ) i_group ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .testmode_i (testmode_i ), - .scan_enable_i (scan_enable_i ), - .scan_data_i (/* Unconnected */ ), - .scan_data_o (/* Unconnected */ ), - .group_id_i (g[idx_width(NumGroups)-1:0] ), + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .testmode_i (testmode_i ), + .scan_enable_i (scan_enable_i ), + .scan_data_i (/* Unconnected */ ), + .scan_data_o (/* Unconnected */ ), + .group_id_i (g[idx_width(NumGroups)-1:0] ), // TCDM Master interfaces - .tcdm_master_req_o (tcdm_master_req[g] ), - .tcdm_master_req_valid_o (tcdm_master_req_valid[g] ), - .tcdm_master_req_ready_i (tcdm_master_req_ready[g] ), - .tcdm_master_resp_i (tcdm_master_resp[g] ), - .tcdm_master_resp_valid_i(tcdm_master_resp_valid[g] ), - .tcdm_master_resp_ready_o(tcdm_master_resp_ready[g] ), + .tcdm_master_req_o (tcdm_master_req[g] ), + .tcdm_master_req_valid_o (tcdm_master_req_valid[g] ), + .tcdm_master_req_ready_i (tcdm_master_req_ready[g] ), + .tcdm_master_resp_i (tcdm_master_resp[g] ), + .tcdm_master_resp_valid_i (tcdm_master_resp_valid[g] ), + .tcdm_master_resp_ready_o (tcdm_master_resp_ready[g] ), // TCDM banks interface - .tcdm_slave_req_i (tcdm_slave_req[g] ), - .tcdm_slave_req_valid_i (tcdm_slave_req_valid[g] ), - .tcdm_slave_req_ready_o (tcdm_slave_req_ready[g] ), - .tcdm_slave_resp_o (tcdm_slave_resp[g] ), - .tcdm_slave_resp_valid_o (tcdm_slave_resp_valid[g] ), - .tcdm_slave_resp_ready_i (tcdm_slave_resp_ready[g] ), - .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), - .ro_cache_ctrl_i (ro_cache_ctrl_q[g] ), + .tcdm_slave_req_i (tcdm_slave_req[g] ), + .tcdm_slave_req_valid_i (tcdm_slave_req_valid[g] ), + .tcdm_slave_req_ready_o (tcdm_slave_req_ready[g] ), + .tcdm_slave_resp_o 
(tcdm_slave_resp[g] ), + .tcdm_slave_resp_valid_o (tcdm_slave_resp_valid[g] ), + .tcdm_slave_resp_ready_i (tcdm_slave_resp_ready[g] ), +`ifdef DAS + .partition_sel_i (partition_sel_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), + .dma_rows_das_i (dma_rows_das ), +`endif + .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), + .ro_cache_ctrl_i (ro_cache_ctrl_q[g] ), // DMA request - .dma_req_i (dma_req_group_q[g] ), - .dma_req_valid_i (dma_req_group_q_valid[g] ), - .dma_req_ready_o (dma_req_group_q_ready[g] ), + .dma_req_i (dma_req_group_q[g] ), + .dma_req_valid_i (dma_req_group_q_valid[g] ), + .dma_req_ready_o (dma_req_group_q_ready[g] ), // DMA status - .dma_meta_o (dma_meta[g] ), + .dma_meta_o (dma_meta[g] ), // AXI interface - .axi_mst_req_o (axi_mst_req_o[g*NumAXIMastersPerGroup +: NumAXIMastersPerGroup] ), - .axi_mst_resp_i (axi_mst_resp_i[g*NumAXIMastersPerGroup +: NumAXIMastersPerGroup]) + .axi_mst_req_o (axi_mst_req_o[g*NumAXIMastersPerGroup +: NumAXIMastersPerGroup] ), + .axi_mst_resp_i (axi_mst_resp_i[g*NumAXIMastersPerGroup +: NumAXIMastersPerGroup]) ); end : gen_groups diff --git a/hardware/src/mempool_group.sv b/hardware/src/mempool_group.sv index 733f98b9c..5c1d1f9e4 100644 --- a/hardware/src/mempool_group.sv +++ b/hardware/src/mempool_group.sv @@ -58,19 +58,25 @@ module mempool_group output logic [NumGroups-1:1][NumTilesPerGroup-1:0] tcdm_slave_resp_valid_o, input logic [NumGroups-1:1][NumTilesPerGroup-1:0] tcdm_slave_resp_ready_i, `endif +`ifdef DAS + input logic [NumDASPartitions-1:0][TileInterleavingWidth-1:0] partition_sel_i, + input logic [NumDASPartitions-1:0][AddrWidth-1:0] start_das_i, + input logic [NumDASPartitions-1:0][RowsInterleavingWidth-1:0] rows_das_i, + input logic [RowsInterleavingWidth-1:0] dma_rows_das_i, +`endif // Wake up interface - input logic [NumCoresPerGroup-1:0] wake_up_i, + input logic [NumCoresPerGroup-1:0] wake_up_i, // RO-Cache configuration - input `STRUCT_PORT(ro_cache_ctrl_t) 
ro_cache_ctrl_i, + input `STRUCT_PORT(ro_cache_ctrl_t) ro_cache_ctrl_i, // DMA request - input `STRUCT_PORT(dma_req_t) dma_req_i, - input logic dma_req_valid_i, - output logic dma_req_ready_o, + input `STRUCT_PORT(dma_req_t) dma_req_i, + input logic dma_req_valid_i, + output logic dma_req_ready_o, // DMA status - output `STRUCT_PORT(dma_meta_t) dma_meta_o, + output `STRUCT_PORT(dma_meta_t) dma_meta_o, // AXI Interface - output `STRUCT_VECT(axi_tile_req_t, [NumAXIMastersPerGroup-1:0]) axi_mst_req_o, - input `STRUCT_VECT(axi_tile_resp_t, [NumAXIMastersPerGroup-1:0]) axi_mst_resp_i + output `STRUCT_VECT(axi_tile_req_t, [NumAXIMastersPerGroup-1:0]) axi_mst_req_o, + input `STRUCT_VECT(axi_tile_resp_t, [NumAXIMastersPerGroup-1:0]) axi_mst_resp_i ); /***************** @@ -332,6 +338,11 @@ module mempool_group .axi_mst_resp_i (axi_mst_resp[sg*NumAXIMastersPerSubGroup +: NumAXIMastersPerSubGroup] ), // RO-Cache configuration .ro_cache_ctrl_i (ro_cache_ctrl_q ), +`ifdef DAS + .partition_sel_i (partition_sel_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), +`endif // Wake up interface .wake_up_i (wake_up_q[sg*NumCoresPerSubGroup +: NumCoresPerSubGroup] ) ); @@ -384,6 +395,11 @@ module mempool_group .axi_mst_resp_i (axi_mst_resp[sg*NumAXIMastersPerSubGroup +: NumAXIMastersPerSubGroup] ), // RO-Cache configuration .ro_cache_ctrl_i (ro_cache_ctrl_q ), +`ifdef DAS + .partition_sel_i (partition_sel_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), +`endif // Wake up interface .wake_up_i (wake_up_q[sg*NumCoresPerSubGroup +: NumCoresPerSubGroup] ) ); @@ -565,20 +581,25 @@ module mempool_group .DmaRegionWidth (NumBanksPerGroup*4/NumDmasPerGroup ), .DmaRegionStart (TCDMBaseAddr ), .DmaRegionEnd (TCDMBaseAddr+TCDMSize ), - .TransFifoDepth (16 ), + .TransFifoDepth (8 ), + .NumTiles (NumTiles ), + .NumDASPartitions (NumDASPartitions ), .burst_req_t (dma_req_t ), .meta_t (dma_meta_t ) ) i_idma_distributed_midend ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - 
.burst_req_i (dma_req_cut ), - .valid_i (dma_req_cut_valid), - .ready_o (dma_req_cut_ready), - .meta_o (dma_meta_cut ), - .burst_req_o (dma_req ), - .valid_o (dma_req_valid ), - .ready_i (dma_req_ready ), - .meta_i (dma_meta ) + .clk_i (clk_i ), + .rst_ni (rst_ni ), +`ifdef DAS + .rows_das_i (dma_rows_das_i ), +`endif + .burst_req_i (dma_req_cut ), + .valid_i (dma_req_cut_valid ), + .ready_o (dma_req_cut_ready ), + .meta_o (dma_meta_cut ), + .burst_req_o (dma_req ), + .valid_o (dma_req_valid ), + .ready_i (dma_req_ready ), + .meta_i (dma_meta ) ); `else @@ -683,6 +704,11 @@ module mempool_group // AXI interface .axi_mst_req_o (axi_tile_req[t] ), .axi_mst_resp_i (axi_tile_resp[t] ), +`ifdef DAS + .partition_sel_i (partition_sel_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), +`endif // Wake up interface .wake_up_i (wake_up_q[t*NumCoresPerTile +: NumCoresPerTile]) ); @@ -971,24 +997,29 @@ module mempool_group dma_meta_t [NumDmasPerGroup-1:0] dma_meta; idma_distributed_midend #( - .NoMstPorts (NumDmasPerGroup ), - .DmaRegionWidth (NumBanksPerGroup*4/NumDmasPerGroup), - .DmaRegionStart (TCDMBaseAddr ), - .DmaRegionEnd (TCDMBaseAddr+TCDMSize ), - .TransFifoDepth (16 ), - .burst_req_t (dma_req_t ), - .meta_t (dma_meta_t ) + .NoMstPorts (NumDmasPerGroup ), + .DmaRegionWidth (NumBanksPerGroup*4/NumDmasPerGroup ), + .DmaRegionStart (TCDMBaseAddr ), + .DmaRegionEnd (TCDMBaseAddr+TCDMSize ), + .TransFifoDepth (8 ), + .NumTiles (NumTiles ), + .NumDASPartitions (NumDASPartitions ), + .burst_req_t (dma_req_t ), + .meta_t (dma_meta_t ) ) i_idma_distributed_midend ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .burst_req_i (dma_req_cut ), - .valid_i (dma_req_cut_valid), - .ready_o (dma_req_cut_ready), - .meta_o (dma_meta_cut ), - .burst_req_o (dma_req ), - .valid_o (dma_req_valid ), - .ready_i (dma_req_ready ), - .meta_i (dma_meta ) + .clk_i (clk_i ), + .rst_ni (rst_ni ), +`ifdef DAS + .rows_das_i (dma_rows_das_i ), +`endif + .burst_req_i (dma_req_cut ), + .valid_i 
(dma_req_cut_valid ), + .ready_o (dma_req_cut_ready ), + .meta_o (dma_meta_cut ), + .burst_req_o (dma_req ), + .valid_o (dma_req_valid ), + .ready_i (dma_req_ready ), + .meta_i (dma_meta ) ); // xbar diff --git a/hardware/src/mempool_pkg.sv b/hardware/src/mempool_pkg.sv index 5d427feec..91b21ba2a 100644 --- a/hardware/src/mempool_pkg.sv +++ b/hardware/src/mempool_pkg.sv @@ -44,6 +44,12 @@ package mempool_pkg; localparam integer unsigned NumBanksPerGroup = NumBanks / NumGroups; localparam integer unsigned TCDMAddrMemWidth = $clog2(TCDMSizePerBank / mempool_pkg::BeWidth); localparam integer unsigned TCDMAddrWidth = TCDMAddrMemWidth + idx_width(NumBanksPerGroup); + // DAS parameters + localparam integer unsigned NumDASPartitions = `ifdef NUM_DAS_PARTITIONS `NUM_DAS_PARTITIONS `else 0 `endif; + localparam integer unsigned DASMemSize = `ifdef DAS_MEM_SIZE `DAS_MEM_SIZE `else 0 `endif; + localparam integer unsigned DASStartAddr = (NumBanks * TCDMSizePerBank) - NumCores * DASMemSize; + localparam integer unsigned TileInterleavingWidth = idx_width(NumTiles) + 1; // only support {128, 64, 32, 16, 8, 4, 2, 1}; + localparam integer unsigned RowsInterleavingWidth = idx_width(TCDMSizePerBank) - ByteOffset; // L2 localparam integer unsigned L2Size = `ifdef L2_SIZE `L2_SIZE `else 0 `endif; // [B] diff --git a/hardware/src/mempool_sub_group.sv b/hardware/src/mempool_sub_group.sv index a3577450f..b88338ade 100644 --- a/hardware/src/mempool_sub_group.sv +++ b/hardware/src/mempool_sub_group.sv @@ -62,6 +62,11 @@ module mempool_sub_group input `STRUCT_VECT(axi_tile_resp_t, [NumAXIMastersPerSubGroup-1:0]) axi_mst_resp_i, // RO-Cache configuration input `STRUCT_PORT(ro_cache_ctrl_t) ro_cache_ctrl_i, +`ifdef DAS + input logic [NumDASPartitions-1:0][TileInterleavingWidth-1:0] partition_sel_i, + input logic [NumDASPartitions-1:0][AddrWidth-1:0] start_das_i, + input logic [NumDASPartitions-1:0][RowsInterleavingWidth-1:0] rows_das_i, +`endif // Wake up interface input logic 
[NumCoresPerSubGroup-1:0] wake_up_i ); @@ -198,6 +203,11 @@ module mempool_sub_group // AXI interface .axi_mst_req_o (axi_tile_req[t] ), .axi_mst_resp_i (axi_tile_resp[t] ), +`ifdef DAS + .partition_sel_i (partition_sel_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), +`endif // Wake up interface .wake_up_i (wake_up_q[t*NumCoresPerTile +: NumCoresPerTile]) ); diff --git a/hardware/src/mempool_system.sv b/hardware/src/mempool_system.sv index 98c6fde07..ea6931f95 100644 --- a/hardware/src/mempool_system.sv +++ b/hardware/src/mempool_system.sv @@ -90,6 +90,12 @@ module mempool_system logic [NumCores-1:0] wake_up; logic [DataWidth-1:0] eoc; ro_cache_ctrl_t ro_cache_ctrl; +`ifdef DAS + // For dynamic partitioning + logic [NumDASPartitions-1:0][TileInterleavingWidth-1:0] partition_sel; + logic [NumDASPartitions-1:0][AddrWidth-1:0] start_das; + logic [NumDASPartitions-1:0][RowsInterleavingWidth-1:0] rows_das; +`endif dma_req_t dma_req; logic dma_req_valid; @@ -137,20 +143,25 @@ module mempool_system .TCDMBaseAddr(TCDMBaseAddr), .BootAddr (BootAddr ) ) i_mempool_cluster ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .wake_up_i (wake_up ), - .testmode_i (1'b0 ), - .scan_enable_i (1'b0 ), - .scan_data_i (1'b0 ), - .scan_data_o (/* Unused */ ), - .ro_cache_ctrl_i(ro_cache_ctrl ), - .dma_req_i (dma_req ), - .dma_req_valid_i(dma_req_valid ), - .dma_req_ready_o(dma_req_ready ), - .dma_meta_o (dma_meta ), - .axi_mst_req_o (axi_mst_req[NumAXIMasters-2:0] ), - .axi_mst_resp_i (axi_mst_resp[NumAXIMasters-2:0]) + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .wake_up_i (wake_up ), +`ifdef DAS + .partition_sel_i (partition_sel ), + .start_das_i (start_das ), + .rows_das_i (rows_das ), +`endif + .testmode_i (1'b0 ), + .scan_enable_i (1'b0 ), + .scan_data_i (1'b0 ), + .scan_data_o (/* Unused */ ), + .ro_cache_ctrl_i (ro_cache_ctrl ), + .dma_req_i (dma_req ), + .dma_req_valid_i (dma_req_valid ), + .dma_req_ready_o (dma_req_ready ), + .dma_meta_o (dma_meta ), + .axi_mst_req_o 
(axi_mst_req[NumAXIMasters-2:0] ), + .axi_mst_resp_i (axi_mst_resp[NumAXIMasters-2:0]) ); /********************** @@ -801,6 +812,11 @@ module mempool_system .axi_lite_slave_resp_o(axi_lite_slv_resp[CtrlRegisters]), .eoc_o (/* Unused */ ), .eoc_valid_o (eoc_valid_o ), +`ifdef DAS + .partition_sel_o (partition_sel ), + .start_das_o (start_das ), + .rows_das_o (rows_das ), +`endif .wake_up_o (wake_up ), .ro_cache_ctrl_o (ro_cache_ctrl ) ); diff --git a/hardware/src/mempool_tile.sv b/hardware/src/mempool_tile.sv index 49c8ddaea..ac856a5e9 100644 --- a/hardware/src/mempool_tile.sv +++ b/hardware/src/mempool_tile.sv @@ -51,6 +51,11 @@ module mempool_tile // AXI Interface output `STRUCT_PORT(axi_tile_req_t) axi_mst_req_o, input `STRUCT_PORT(axi_tile_resp_t) axi_mst_resp_i, +`ifdef DAS + input logic [NumDASPartitions-1:0][TileInterleavingWidth-1:0] partition_sel_i, + input logic [NumDASPartitions-1:0][AddrWidth-1:0] start_das_i, + input logic [NumDASPartitions-1:0][RowsInterleavingWidth-1:0] rows_das_i, +`endif // Wake up interface input logic [NumCoresPerTile-1:0] wake_up_i ); @@ -893,13 +898,24 @@ module mempool_tile address_scrambler #( .AddrWidth (AddrWidth ), .ByteOffset (ByteOffset ), + .Bypass (0 ), .NumTiles (NumTiles ), .NumBanksPerTile (NumBanksPerTile ), - .Bypass (0 ), - .SeqMemSizePerTile (SeqMemSizePerTile) + .TCDMSizePerBank (TCDMSizePerBank ), + .SeqMemSizePerTile (SeqMemSizePerTile), + .NumDASPartitions (NumDASPartitions ) ) i_address_scrambler ( - .address_i (snitch_data_qaddr[c] ), - .address_o (snitch_data_qaddr_scrambled) +`ifdef DAS + .partition_sel_i (partition_sel_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), +`else + .partition_sel_i (NumTiles ), + .start_das_i ('0 ), + .rows_das_i ('0 ), +`endif + .address_i (snitch_data_qaddr[c]), + .address_o (snitch_data_qaddr_scrambled) ); if (!TrafficGeneration) begin: gen_tcdm_shim diff --git a/software/runtime/alloc.c b/software/runtime/alloc.c index 519bd8e32..eed230dcb 100644 --- 
a/software/runtime/alloc.c +++ b/software/runtime/alloc.c @@ -34,6 +34,11 @@ alloc_t alloc_l1; // Allocators for L1 local sequential heap memory alloc_t alloc_tile[NUM_CORES / NUM_CORES_PER_TILE]; +// ---------------------------------------------------------------------------- +// Dynamic Heap Allocator +// ---------------------------------------------------------------------------- +alloc_t dynamic_heap_alloc; + // ---------------------------------------------------------------------------- // Canary System based on LSBs of block pointer // ---------------------------------------------------------------------------- @@ -55,18 +60,34 @@ static inline canary_and_size_t canary_decode(const uint32_t value) { return (canary_and_size_t){.canary = value & 0xFF, .size = value >> 8}; } +typedef struct canary_chain_s { + uint32_t canary_and_size; + uint32_t *data_address; + struct canary_chain_s *next_canary; +} canary_chain_t; + +// init as a NULL, assign this pointer when the first canary is allocated +// It is a pointer pointing to the canary chain +// canary_start_t first_canary; +canary_chain_t *first_canary = (canary_chain_t *)0x1000; + // ---------------------------------------------------------------------------- // Initialization // ---------------------------------------------------------------------------- void alloc_init(alloc_t *alloc, void *base, const uint32_t size) { // Create first block at base address aligned up uint32_t aligned_base = ALIGN_UP((uint32_t)base, MIN_BLOCK_SIZE); + // printf("base - %p - aligned_base %p\n", base, (alloc_block_t + // *)aligned_base); alloc_block_t *block_ptr = (alloc_block_t *)aligned_base; // Calculate block size aligned down uint32_t block_size = size - ((uint32_t)block_ptr - (uint32_t)base); block_size = ALIGN_DOWN(block_size, MIN_BLOCK_SIZE); + // printf("block_ptr: %p, block_ptr->size: %p, block_ptr->next: %p\n", + // block_ptr, &(block_ptr->size), &(block_ptr->next)); + // Setup allocator block_ptr->size = block_size; 
block_ptr->next = NULL; @@ -116,6 +137,103 @@ static void *allocate_memory(alloc_t *alloc, const uint32_t size) { } } +// ------ Function to calculate the aligned size ------ // +static uint32_t calc_aligned_row_size(uint32_t *addr) { + + const uint32_t row_bytes = NUM_BANKS * sizeof(uint32_t); + const uint32_t mask = (uint32_t)(row_bytes - 1); + uint32_t offset = ((uint32_t)addr) & mask; + + return (row_bytes - offset) & mask; +} + +// ------ Parameters ------ // +// size: Size of the data block need to be allocated +// allocated_size: How many rows the current partition scheme occupied +static void *allocate_memory_aligned(alloc_t *alloc, const uint32_t size) { + // Get first block of linked list of free blocks + alloc_block_t *curr = alloc->first_block; + alloc_block_t *prev = 0; + + // Search first block large enough in linked list + // 1. calculate the size aligned to the partition boundary + uint32_t shift_size = 0; + shift_size = calc_aligned_row_size((uint32_t *)curr); + uint32_t aligned_size = size + shift_size; + + while (curr && (curr->size < aligned_size)) { + prev = curr; + curr = curr->next; + shift_size = calc_aligned_row_size((uint32_t *)curr); + aligned_size = size + shift_size; + } + printf("Dynamic Allocator >> size [%d] --- shift size [%d] --- aligned size " + "[%d] \n", + size, shift_size, aligned_size); + + if (curr) { + // Update allocator + if (size == aligned_size) { + // address is already aligned to the partition boundary + printf("Dynamic Allocator >> No alignment needed\n"); + if (curr->size == size) { + // Special case: Whole block taken + if (prev) { + prev->next = curr->next; + } else { + alloc->first_block = curr->next; + } + } else { + // Regular case: Split off block + alloc_block_t *new_block = (alloc_block_t *)((char *)curr + size); + new_block->size = curr->size - size; + new_block->next = curr->next; + if (prev) { + prev->next = new_block; + } else { + alloc->first_block = new_block; + } + } + } else { + printf("Dynamic 
Allocator >> Alignment needed\n"); + if (curr->size == aligned_size) { + // Special case: Whole block taken, first part of the block is still + // empty store the curr info in tmp uint32_t tmp_size = curr->size; + struct alloc_block_s *tmp_next = curr->next; + alloc_block_t *shift_block = (alloc_block_t *)((char *)curr); + shift_block->size = shift_size; + shift_block->next = tmp_next; + if (prev) { + prev->next = shift_block; + } else { + alloc->first_block = shift_block; + } + } else { + // Regular case: Split off block + alloc_block_t *new_block = + (alloc_block_t *)((char *)curr + aligned_size); + new_block->size = curr->size - aligned_size; + new_block->next = curr->next; + + alloc_block_t *shift_block = (alloc_block_t *)((char *)curr); + shift_block->size = shift_size; + shift_block->next = new_block; + if (prev) { + prev->next = shift_block; + } else { + alloc->first_block = shift_block; + } + } + } + + // Return block pointer + return (void *)((char *)curr + shift_size); + } else { + // There is no free block large enough + return NULL; + } +} + void *domain_malloc(alloc_t *alloc, const uint32_t size) { // Calculate actually required block size uint32_t data_size = size + sizeof(uint32_t); // add size/metadata @@ -147,6 +265,98 @@ void *simple_malloc(const uint32_t size) { return domain_malloc(&alloc_l1, size); } +// ------ This function allocate data in Sequential Heap region ------ // +// Canary system is stored in a seperate linked list +// void *partition_malloc(alloc_t *alloc, const uint32_t size){ +void *partition_malloc(alloc_t *alloc, const uint32_t size) { + + uint32_t data_size = size > 2 * NUM_BANKS * sizeof(uint32_t) + ? 
size + : 2 * NUM_BANKS * sizeof(uint32_t); + uint32_t block_size = ALIGN_UP(data_size, MIN_BLOCK_SIZE); // add alignment + + // Check if exceed maximum allowed size + if (block_size >= (1 << (sizeof(uint32_t) * 8 - sizeof(uint8_t) * 8))) { + printf("Memory allocator: Requested memory exceeds max block size\n"); + return NULL; + } + + // allocate + void *block_ptr = NULL; + block_ptr = allocate_memory_aligned(alloc, block_size); + + + if (!block_ptr) { + printf("Memory allocator: No large enough block found (%d)\n", block_size); + return NULL; + } + + // Allocate a region in L1 heap for canary + // printf("p1\n"); + canary_chain_t *canary = + (canary_chain_t *)simple_malloc(sizeof(canary_chain_t)); + // printf("p2\n"); + // Init the canary + canary->data_address = (uint32_t *)block_ptr; + canary->canary_and_size = canary_encode(block_ptr, block_size); + canary->next_canary = NULL; + + // link the canary into the list + // canary_chain_t *curr = first_canary->first_block; + canary_chain_t *curr = first_canary; + canary_chain_t *prev = 0; + + // Fit the canary into the chain, depending on data_address + // | prev | ------> | canary | ------> | curr | + uint32_t *data_addr = 0; + if (curr != (canary_chain_t *)0x1000) { + // only access struct when init + data_addr = curr->data_address; + } + + while ((curr != (canary_chain_t *)0x1000) && (curr != NULL) && + ((uint32_t *)data_addr < (uint32_t *)block_ptr)) { + prev = curr; + // data_addr = curr->data_address; + curr = curr->next_canary; + if (curr != NULL) { + data_addr = curr->data_address; + } + // data_addr = curr->data_address; + } + + // printf("post: %p - %p \n", curr, prev); + if ((curr == (canary_chain_t *)0x1000) && !prev) { + // special case: first canary block + first_canary = canary; + printf("| First | ------> [ New ]\n"); + // printf("first_canary: %p\n", first_canary); + } else { + if (!curr) { + // reach to the last of the chain + // | prev | ------> | canary | ------> NULL + prev->next_canary = canary; 
+ canary->next_canary = NULL; + printf("| Other | ------> [ New ] ------> NULL\n"); + } else if (!prev) { + // canary need to insert at the beginning of the chain + // first_canary ------> | canary | ------> | curr | + first_canary = canary; + canary->next_canary = curr; + printf("| First | ------> [ New ] ------> | Other |\n"); + } else { + // normal case + // | prev | ------> | canary | ------> | curr | + canary->next_canary = prev->next_canary; + prev->next_canary = canary; + printf("| Other | ------> [ New ] ------> | Other |\n"); + } + } + // return the block pointer directly + // printf("%p\n", block_ptr); + return block_ptr; +} + // ---------------------------------------------------------------------------- // Free Memory // ---------------------------------------------------------------------------- @@ -208,6 +418,75 @@ void domain_free(alloc_t *alloc, void *const ptr) { void simple_free(void *const ptr) { domain_free(&alloc_l1, ptr); } +void partition_free(alloc_t *alloc, void *const ptr) { + // block pointer is the input pointer + void *block_ptr = ptr; + + canary_and_size_t canary_and_size = + (canary_and_size_t){.canary = 0, .size = 0}; + // find the canary block in the chain + canary_chain_t *curr = first_canary; + canary_chain_t *prev = 0; + + // While loop suppose to stop when curr->data_address == block_ptr + // | prev | ------> | curr | + uint32_t *data_addr = 0; + if (curr) { + data_addr = curr->data_address; + } + printf("data_addr - %p - block_ptr - %p - curr->data_address - %p \n", + data_addr, block_ptr, curr->data_address); + while ((curr != (canary_chain_t *)0x1000) && (curr != NULL) && + (data_addr < (uint32_t *)block_ptr)) { + prev = curr; + // data_addr = curr->data_address; + curr = curr->next_canary; + if (curr != NULL) { + data_addr = curr->data_address; + } + } + + if ((curr == (canary_chain_t *)0x1000) && !prev) { + // nothing in the chain + printf("CANARY: Empty canary chain!\n"); + } else if (!curr) { + // reach to the end of the 
chain + printf("CANARY: Chain depleted. No info found for %p\n", block_ptr); + } else if (curr->data_address != block_ptr) { + // no information for the current free + printf("CANARY: Unmatch! %p - %p\n", curr->data_address, block_ptr); + } else if (!prev) { + // normal case 1: curr is the first canary + // first_canary ------> | curr | ------> next + canary_and_size = canary_decode(curr->canary_and_size); + if (curr->next_canary == NULL) { + first_canary = (canary_chain_t *)0x1000; + } else { + first_canary = curr->next_canary; + } + + simple_free((void *)curr); + } else { + // normal case 2: relink the chain, free the curr canary + // | prev | ------> | curr | ------> something + canary_and_size = canary_decode(curr->canary_and_size); + prev->next_canary = curr->next_canary; + simple_free((void *)curr); + } + + // Check for memory overflow + if (canary_and_size.canary != canary(block_ptr)) { + if (!canary_and_size.canary) { + printf("Empty canary.\n"); + } + printf("Memory Overflow at %p\n", block_ptr); + return; + } + + // Free memory + free_memory(alloc, block_ptr, canary_and_size.size); +} + // ---------------------------------------------------------------------------- // Debugging Functions // ---------------------------------------------------------------------------- @@ -233,9 +512,31 @@ void alloc_dump(alloc_t *alloc) { } } +void canary_dump(void) { + printf(" ------ Canary Chain Dump ------ \n"); + canary_chain_t *curr = first_canary; + if (curr == (canary_chain_t *)0x1000) { + // empty list + printf("Empty Canary list.\n"); + } else { + uint32_t cnt = 0; + while (curr != NULL) { + printf("[%d] - [%p] - [%p] - [%p]\n", cnt, curr, curr->data_address, + curr->next_canary); + cnt += 1; + curr = curr->next_canary; + } + } + printf(" ------ Canary Dump END ------ \n"); +} + // ---------------------------------------------------------------------------- // Get Allocators // ---------------------------------------------------------------------------- +// Get 
the address of global variable `alloc_l1` alloc_t *get_alloc_l1() { return &alloc_l1; } alloc_t *get_alloc_tile(const uint32_t tile_id) { return &alloc_tile[tile_id]; } + +// Dynamic Heap Allocator +alloc_t *get_dynamic_heap_alloc() { return &dynamic_heap_alloc; } diff --git a/software/runtime/alloc.h b/software/runtime/alloc.h index f6db489a2..0533e767a 100644 --- a/software/runtime/alloc.h +++ b/software/runtime/alloc.h @@ -39,11 +39,15 @@ void *domain_malloc(alloc_t *alloc, const uint32_t size); // Free in L1 memory void simple_free(void *const ptr); +// Free dynamic heap allocation with Canary chain +void partition_free(alloc_t *alloc, void *const ptr); + // Free with specified allocator void domain_free(alloc_t *alloc, void *const ptr); // Print out linked list of free blocks void alloc_dump(alloc_t *alloc); +void canary_dump(void); // Get allocator for L1 interleaved heap memory alloc_t *get_alloc_l1(); @@ -51,4 +55,13 @@ alloc_t *get_alloc_l1(); // Get allocator for L1 local sequential heap memory alloc_t *get_alloc_tile(const uint32_t tile_id); +// ----- Dynamic Heap Allocator ----- // +alloc_t *get_dynamic_heap_alloc(); + +// Dynamic heap allocation with Canary Chain +void *partition_malloc(alloc_t *alloc, const uint32_t size); + +// Free dynamic heap allocation with Canary chain +void partition_free(alloc_t *alloc, void *const ptr); + #endif diff --git a/software/runtime/arch.ld.c b/software/runtime/arch.ld.c index 1d8de5e57..ba1f66457 100644 --- a/software/runtime/arch.ld.c +++ b/software/runtime/arch.ld.c @@ -31,5 +31,8 @@ SECTIONS { __heap_start = __l1_start; __heap_end = __l1_end; + // DAS related, default impacted region size + __heap_seq_start = __l1_start + (NUM_CORES * BANKING_FACTOR * L1_BANK_SIZE) - NUM_CORES * DAS_MEM_SIZE; + fake_uart = 0xC0000000; } diff --git a/software/runtime/control_registers.h b/software/runtime/control_registers.h index f2a467af2..88109fb9c 100644 --- a/software/runtime/control_registers.h +++ 
b/software/runtime/control_registers.h @@ -19,6 +19,9 @@ extern "C" { // Maximum number of groups that we support in any configuration #define CONTROL_REGISTERS_PARAM_MAX_NUMGROUPS 8 +// Supported number of DAS partitions +#define CONTROL_REGISTERS_PARAM_NUM_D_A_S_PARTITIONS 4 + // Register width #define CONTROL_REGISTERS_PARAM_REG_WIDTH 32 @@ -66,20 +69,71 @@ extern "C" { // Wake Up Offst Register #define CONTROL_REGISTERS_WAKE_UP_OFFST_REG_OFFSET 0x30 +// Tile grouping for DAS partition (common parameters) +#define CONTROL_REGISTERS_PARTITION_SEL_PARTITION_SEL_FIELD_WIDTH 32 +#define CONTROL_REGISTERS_PARTITION_SEL_PARTITION_SEL_FIELDS_PER_REG 1 +#define CONTROL_REGISTERS_PARTITION_SEL_MULTIREG_COUNT 4 + +// Tile grouping for DAS partition +#define CONTROL_REGISTERS_PARTITION_SEL_0_REG_OFFSET 0x34 + +// Tile grouping for DAS partition +#define CONTROL_REGISTERS_PARTITION_SEL_1_REG_OFFSET 0x38 + +// Tile grouping for DAS partition +#define CONTROL_REGISTERS_PARTITION_SEL_2_REG_OFFSET 0x3c + +// Tile grouping for DAS partition +#define CONTROL_REGISTERS_PARTITION_SEL_3_REG_OFFSET 0x40 + +// Start address of DAS partition (common parameters) +#define CONTROL_REGISTERS_START_DAS_START_DAS_FIELD_WIDTH 32 +#define CONTROL_REGISTERS_START_DAS_START_DAS_FIELDS_PER_REG 1 +#define CONTROL_REGISTERS_START_DAS_MULTIREG_COUNT 4 + +// Start address of DAS partition +#define CONTROL_REGISTERS_START_DAS_0_REG_OFFSET 0x44 + +// Start address of DAS partition +#define CONTROL_REGISTERS_START_DAS_1_REG_OFFSET 0x48 + +// Start address of DAS partition +#define CONTROL_REGISTERS_START_DAS_2_REG_OFFSET 0x4c + +// Start address of DAS partition +#define CONTROL_REGISTERS_START_DAS_3_REG_OFFSET 0x50 + +// End address of DAS partition (common parameters) +#define CONTROL_REGISTERS_ROWS_DAS_ROWS_DAS_FIELD_WIDTH 32 +#define CONTROL_REGISTERS_ROWS_DAS_ROWS_DAS_FIELDS_PER_REG 1 +#define CONTROL_REGISTERS_ROWS_DAS_MULTIREG_COUNT 4 + +// End address of DAS partition +#define 
CONTROL_REGISTERS_ROWS_DAS_0_REG_OFFSET 0x54 + +// End address of DAS partition +#define CONTROL_REGISTERS_ROWS_DAS_1_REG_OFFSET 0x58 + +// End address of DAS partition +#define CONTROL_REGISTERS_ROWS_DAS_2_REG_OFFSET 0x5c + +// End address of DAS partition +#define CONTROL_REGISTERS_ROWS_DAS_3_REG_OFFSET 0x60 + // TCDM Start Address Register -#define CONTROL_REGISTERS_TCDM_START_ADDRESS_REG_OFFSET 0x34 +#define CONTROL_REGISTERS_TCDM_START_ADDRESS_REG_OFFSET 0x64 // TCDM End Address Register -#define CONTROL_REGISTERS_TCDM_END_ADDRESS_REG_OFFSET 0x38 +#define CONTROL_REGISTERS_TCDM_END_ADDRESS_REG_OFFSET 0x68 // Number of Cores Register -#define CONTROL_REGISTERS_NR_CORES_REG_REG_OFFSET 0x3c +#define CONTROL_REGISTERS_NR_CORES_REG_REG_OFFSET 0x6c // Read-only cache Enable -#define CONTROL_REGISTERS_RO_CACHE_ENABLE_REG_OFFSET 0x40 +#define CONTROL_REGISTERS_RO_CACHE_ENABLE_REG_OFFSET 0x70 // Read-only cache Flush -#define CONTROL_REGISTERS_RO_CACHE_FLUSH_REG_OFFSET 0x44 +#define CONTROL_REGISTERS_RO_CACHE_FLUSH_REG_OFFSET 0x74 // Read-only cache Region Start (common parameters) #define CONTROL_REGISTERS_RO_CACHE_START_RO_CACHE_START_FIELD_WIDTH 32 @@ -87,16 +141,16 @@ extern "C" { #define CONTROL_REGISTERS_RO_CACHE_START_MULTIREG_COUNT 4 // Read-only cache Region Start -#define CONTROL_REGISTERS_RO_CACHE_START_0_REG_OFFSET 0x48 +#define CONTROL_REGISTERS_RO_CACHE_START_0_REG_OFFSET 0x78 // Read-only cache Region Start -#define CONTROL_REGISTERS_RO_CACHE_START_1_REG_OFFSET 0x4c +#define CONTROL_REGISTERS_RO_CACHE_START_1_REG_OFFSET 0x7c // Read-only cache Region Start -#define CONTROL_REGISTERS_RO_CACHE_START_2_REG_OFFSET 0x50 +#define CONTROL_REGISTERS_RO_CACHE_START_2_REG_OFFSET 0x80 // Read-only cache Region Start -#define CONTROL_REGISTERS_RO_CACHE_START_3_REG_OFFSET 0x54 +#define CONTROL_REGISTERS_RO_CACHE_START_3_REG_OFFSET 0x84 // Read-only cache Region End (common parameters) #define CONTROL_REGISTERS_RO_CACHE_END_RO_CACHE_END_FIELD_WIDTH 32 @@ -104,16 
+158,16 @@ extern "C" { #define CONTROL_REGISTERS_RO_CACHE_END_MULTIREG_COUNT 4 // Read-only cache Region End -#define CONTROL_REGISTERS_RO_CACHE_END_0_REG_OFFSET 0x58 +#define CONTROL_REGISTERS_RO_CACHE_END_0_REG_OFFSET 0x88 // Read-only cache Region End -#define CONTROL_REGISTERS_RO_CACHE_END_1_REG_OFFSET 0x5c +#define CONTROL_REGISTERS_RO_CACHE_END_1_REG_OFFSET 0x8c // Read-only cache Region End -#define CONTROL_REGISTERS_RO_CACHE_END_2_REG_OFFSET 0x60 +#define CONTROL_REGISTERS_RO_CACHE_END_2_REG_OFFSET 0x90 // Read-only cache Region End -#define CONTROL_REGISTERS_RO_CACHE_END_3_REG_OFFSET 0x64 +#define CONTROL_REGISTERS_RO_CACHE_END_3_REG_OFFSET 0x94 #ifdef __cplusplus } // extern "C" diff --git a/software/runtime/dma.h b/software/runtime/dma.h index 4aa7f6cec..cab318d28 100644 --- a/software/runtime/dma.h +++ b/software/runtime/dma.h @@ -73,4 +73,5 @@ void dma_memcpy_blocking(void *dest, const void *src, size_t len) { dma_memcpy_nonblocking(dest, src, len); dma_wait(); } + #endif // _DMA_H_ diff --git a/software/runtime/runtime.h b/software/runtime/runtime.h index 7ec0aa8d2..7516a0aeb 100644 --- a/software/runtime/runtime.h +++ b/software/runtime/runtime.h @@ -54,6 +54,47 @@ static uint32_t volatile *wake_up_offset_reg = (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + CONTROL_REGISTERS_WAKE_UP_OFFST_REG_OFFSET); +/* DAS-related regs */ + +static uint32_t volatile *partition_0_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_PARTITION_SEL_0_REG_OFFSET); +static uint32_t volatile *partition_1_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_PARTITION_SEL_1_REG_OFFSET); +static uint32_t volatile *partition_2_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_PARTITION_SEL_2_REG_OFFSET); +static uint32_t volatile *partition_3_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_PARTITION_SEL_3_REG_OFFSET); + +static uint32_t volatile *start_das_0_reg = + 
(uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_START_DAS_0_REG_OFFSET); +static uint32_t volatile *start_das_1_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_START_DAS_1_REG_OFFSET); +static uint32_t volatile *start_das_2_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_START_DAS_2_REG_OFFSET); +static uint32_t volatile *start_das_3_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_START_DAS_3_REG_OFFSET); + +static uint32_t volatile *rows_das_0_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_ROWS_DAS_0_REG_OFFSET); +static uint32_t volatile *rows_das_1_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_ROWS_DAS_1_REG_OFFSET); +static uint32_t volatile *rows_das_2_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_ROWS_DAS_2_REG_OFFSET); +static uint32_t volatile *rows_das_3_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_ROWS_DAS_3_REG_OFFSET); + typedef uint32_t mempool_id_t; typedef uint32_t mempool_timer_t; @@ -99,8 +140,12 @@ static inline uint32_t mempool_get_core_count_per_group() { static inline void mempool_init(const uint32_t core_id) { if (core_id == 0) { // Initialize L1 Interleaved Heap Allocator - extern uint32_t __heap_start, __heap_end; - uint32_t heap_size = (uint32_t)&__heap_end - (uint32_t)&__heap_start; + extern uint32_t __heap_start; + extern uint32_t __heap_seq_start; + // Heap Region + uint32_t heap_size = + (uint32_t)&__heap_seq_start - + (uint32_t)&__heap_start; // Downscale interleaved heap size alloc_init(get_alloc_l1(), &__heap_start, heap_size); // Initialize L1 Sequential Heap Allocator per Tile @@ -123,6 +168,50 @@ static inline void mempool_init(const uint32_t core_id) { } } +// Reconfigure Interleaved Heap region, with explicit 'Dynamic Heap' start +// address Programmer API for flexible Dynamic Heap region configuration 
+static inline void mempool_reset_heap(const uint32_t core_id, + uint32_t heap_seq_start) { + if (core_id == 0) { + // Re-initialize the L1 interleaved heap allocator over [__heap_start, heap_seq_start) + extern uint32_t __heap_start; + uint32_t heap_size = + (uint32_t)heap_seq_start - + (uint32_t)&__heap_start; // Downscale interleaved heap size + alloc_init(get_alloc_l1(), &__heap_start, heap_size); + } +} + +// Initialize the Dynamic Heap allocator over the default region set by the linker script: it starts at __heap_seq_start and spans NUM_CORES * DAS_MEM_SIZE; only core 0 does the work +static inline void mempool_dynamic_heap_alloc_init(const uint32_t core_id) { + if (core_id == 0) { + extern uint32_t __heap_seq_start; + // Dynamic allocator base and size + uint32_t seq_heap_base = (uint32_t)&__heap_seq_start; + uint32_t seq_heap_size = NUM_CORES * DAS_MEM_SIZE; + // Fetch the global dynamic-heap allocator and initialize it over that region + alloc_t *dynamic_heap_allocator = get_dynamic_heap_alloc(); + alloc_init(dynamic_heap_allocator, (uint32_t *)seq_heap_base, + seq_heap_size); + } +} + +// Reset the Dynamic Heap region using an explicit start address (only core 0 does the work) +// A single allocator then covers [heap_seq_start, __heap_end) +static inline void mempool_dynamic_heap_alloc_reset(const uint32_t core_id, + uint32_t heap_seq_start) { + if (core_id == 0) { + extern uint32_t __heap_end; + // Dynamic allocator base and size + uint32_t seq_heap_base = heap_seq_start; + uint32_t seq_heap_size = (uint32_t)&__heap_end - heap_seq_start; + // Re-initialize the global dynamic-heap allocator over the new region + alloc_t *dynamic_heap_allocator = get_dynamic_heap_alloc(); + alloc_init(dynamic_heap_allocator, (uint32_t *)seq_heap_base, + seq_heap_size); + } +} + /// Reset a monotonically increasing cycle count.
static inline void mempool_start_benchmark() { asm volatile("" ::: "memory"); @@ -200,6 +289,49 @@ static inline void set_wake_up_stride(uint32_t stride) { static inline void set_wake_up_offset(uint32_t offset) { *wake_up_offset_reg = offset; } + +// Configure DAS partition `reg_sel`: writes tiles per partition, region start address `addr`, and the region length in rows derived from `size` (one row = NUM_BANKS 32-bit words) +static inline void das_config(uint32_t reg_sel, uint32_t tiles_per_partition, uint32_t addr, uint32_t size) { + asm volatile("" ::: "memory"); + // Round `size` (bytes) up to a whole number of rows + uint32_t row_bytes = NUM_BANKS * sizeof(uint32_t); + uint32_t rows_das = (size + (row_bytes-1)) / row_bytes; + + // enforce minimum 2 rows per partition + // TODO (bowwang): should add protection to enforce `rows_das` is power of 2 + if (rows_das < 2) rows_das = 2; + + // Program the three registers of the selected partition + switch (reg_sel) { + case 0: + *partition_0_reg = tiles_per_partition; + *start_das_0_reg = addr; + *rows_das_0_reg = rows_das; + break; + case 1: + *partition_1_reg = tiles_per_partition; + *start_das_1_reg = addr; + *rows_das_1_reg = rows_das; + break; + case 2: + *partition_2_reg = tiles_per_partition; + *start_das_2_reg = addr; + *rows_das_2_reg = rows_das; + break; + case 3: + *partition_3_reg = tiles_per_partition; + *start_das_3_reg = addr; + *rows_das_3_reg = rows_das; + break; + default: // NOTE(review): any other selector silently reprograms partition 0 - confirm this fallback is intended + *partition_0_reg = tiles_per_partition; + *start_das_0_reg = addr; + *rows_das_0_reg = rows_das; + break; + } + asm volatile("" ::: "memory"); +} + // Dump a value via CSR // This is only supported in simulation and an experimental feature. All writes // to unimplemented CSR registers will be dumped by Snitch.
This can be diff --git a/software/runtime/runtime.mk b/software/runtime/runtime.mk index 94f822ddc..039473a6e 100644 --- a/software/runtime/runtime.mk +++ b/software/runtime/runtime.mk @@ -89,6 +89,7 @@ RISCV_STRIP ?= $(RISCV_PREFIX)strip # Defines DEFINES += -DPRINTF_DISABLE_SUPPORT_FLOAT -DPRINTF_DISABLE_SUPPORT_LONG_LONG -DPRINTF_DISABLE_SUPPORT_PTRDIFF_T DEFINES += -DNUM_CORES=$(num_cores) +DEFINES += -DLOG2_NUM_CORES=$(shell awk 'BEGIN{print log($(num_cores))/log(2)}') DEFINES += -DNUM_GROUPS=$(num_groups) DEFINES += -DNUM_CORES_PER_TILE=$(num_cores_per_tile) DEFINES += -DBANKING_FACTOR=$(banking_factor) @@ -110,6 +111,11 @@ ifdef terapool DEFINES += -DNUM_CORES_PER_SUB_GROUP=$(shell awk 'BEGIN{print ($(num_cores)/$(num_groups))/$(num_sub_groups_per_group)}') DEFINES += -DNUM_TILES_PER_SUB_GROUP=$(shell awk 'BEGIN{print ($(num_cores)/$(num_groups))/$(num_cores_per_tile)/$(num_sub_groups_per_group)}') endif +ifdef das + DEFINES += -DNUM_DAS_PARTITIONS=$(num_das_partitions) + DEFINES += -DDAS_MEM_SIZE=$(das_mem_size) + DEFINES += -DLOG2_DAS_MEM_SIZE=$(shell awk 'BEGIN{print log($(das_mem_size))/log(2)}') +endif # Specify cross compilation target. 
This can be omitted if LLVM is built with riscv as default target RISCV_LLVM_TARGET ?= --target=$(RISCV_TARGET) --sysroot=$(GCC_INSTALL_DIR)/$(RISCV_TARGET) --gcc-toolchain=$(GCC_INSTALL_DIR) diff --git a/software/runtime/synchronization.c b/software/runtime/synchronization.c index c3c3846f8..93fac8321 100644 --- a/software/runtime/synchronization.c +++ b/software/runtime/synchronization.c @@ -10,14 +10,6 @@ #include "runtime.h" #include "synchronization.h" -#if NUM_CORES == (16) -#define LOG2_NUM_CORES (4) -#elif NUM_CORES == (256) -#define LOG2_NUM_CORES (8) -#elif NUM_CORES == (1024) -#define LOG2_NUM_CORES (10) -#endif - uint32_t volatile barrier __attribute__((section(".l1"))); uint32_t volatile log_barrier[NUM_CORES * 4] __attribute__((aligned(NUM_CORES * 4), section(".l1"))); diff --git a/software/tests/baremetal/das_dma/main.c b/software/tests/baremetal/das_dma/main.c new file mode 100644 index 000000000..2b0d70689 --- /dev/null +++ b/software/tests/baremetal/das_dma/main.c @@ -0,0 +1,92 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +// Note: This test is only for Terapool dynamic heap allocation +// Author: Bowen Wang + +#include +#include + +#include "alloc.h" +#include "dma.h" +#include "encoding.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" + +#define NUM_TILES (NUM_CORES / NUM_CORES_PER_TILE) +uint32_t l2_array[2 * NUM_BANKS] __attribute__((section(".l2"))); + +int main() { + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + + // Initialization + mempool_init(core_id); + mempool_barrier_init(core_id); + + if (core_id == 0) { + + // -------------------------------------------- + // Initialize + // -------------------------------------------- + uint32_t num_tiles_per_partition = 4; + uint32_t array_size = + 2 * num_tiles_per_partition * BANKING_FACTOR * NUM_CORES_PER_TILE; + // Initialize L2 array + for (uint32_t i = 0; i < array_size; i++) { + l2_array[i] = i; + } + + // -------------------------------------------- + // Verify DMA transfers in DAS region + // -------------------------------------------- + printf("Verify DMA transfers in DAS region\n\n"); + + // 1. Init dynamic heap allocator + mempool_dynamic_heap_alloc_init(core_id); + + // 2. Set which partition write to. + uint32_t part_id = 0; + + // 3. Get the allocator + alloc_t *dynamic_heap_alloc = get_dynamic_heap_alloc(); + alloc_dump(dynamic_heap_alloc); + + // 4. Allocate memory + uint32_t *array = (uint32_t *)partition_malloc( + dynamic_heap_alloc, array_size * sizeof(uint32_t)); + + // 5. Config the hardware registers: pass the array's address (not its first element) and its size in bytes + das_config(part_id, num_tiles_per_partition, + (uint32_t)(array), + array_size * sizeof(uint32_t)); + + // 6. Move data + dma_memcpy_blocking(array, l2_array, array_size * sizeof(uint32_t)); + + // 7. Change addressing scheme (to fully interleaved) + das_config(part_id, NUM_TILES, (uint32_t)(array), array_size * sizeof(uint32_t)); + + // 8.
check + for (uint32_t i = 0; i < array_size; i++) { + uint32_t *fetch_address = &array[0] + + (i % (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) + + (i / (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) * NUM_BANKS; + if (l2_array[i] != *fetch_address) { + printf("%4d != %4d at address %8X.\n", i, *fetch_address, fetch_address); + return 1; + } + } + + // 9. Free array + partition_free(dynamic_heap_alloc, array); + printf("SUCCESS on partition %d \n\n", part_id); + + printf("All correct!\n"); + } + + mempool_barrier(num_cores); + return 0; +} diff --git a/software/tests/baremetal/das_malloc_test/main.c b/software/tests/baremetal/das_malloc_test/main.c new file mode 100644 index 000000000..61e58c4db --- /dev/null +++ b/software/tests/baremetal/das_malloc_test/main.c @@ -0,0 +1,183 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details.
Get the allocator + alloc_t *dynamic_heap_alloc = get_dynamic_heap_alloc(); + alloc_dump(dynamic_heap_alloc); + + // 4. Allocate memory + uint32_t *array = (uint32_t *)partition_malloc( + dynamic_heap_alloc, array_size * sizeof(uint32_t)); + printf("start_addr at 0x%8x\n", array); + + // 5. Config the hardware registers + das_config(part_id, num_tiles_per_partition, (uint32_t)(array), array_size * sizeof(uint32_t)); + // 6. Move data + for (uint32_t i = 0; i < array_size; i++) { + array[i] = i; + } + // 7. Change addressing scheme (to fully interleaved) + das_config(part_id, NUM_TILES, (uint32_t)(array), array_size * sizeof(uint32_t)); + // 8. check + for (uint32_t i = 0; i < array_size; i++) { + uint32_t *fetch_address = + &array[0] + + (i % + (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) + + (i / + (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) * + NUM_BANKS; + if (i != *fetch_address) { + printf("%4d != %4d at address %8X.\n", i, *fetch_address, + fetch_address); + return 1; + } + } + // 9. Free array + partition_free(dynamic_heap_alloc, array); + printf("SUCCESS on partition %d \n\n", part_id); + } + + // -------------------------------------------- + // Verify DAS partitions with misalignment + // -------------------------------------------- + printf("Verify DAS partitions with misalignemnt\n\n"); + + // 2. Set which partition write to. + for (uint32_t part_id = 0; part_id < NUM_DAS_PARTITIONS; part_id++) { + // 3. Get the allocator + alloc_t *dynamic_heap_alloc = get_dynamic_heap_alloc(); + alloc_dump(dynamic_heap_alloc); + + // 4.0 inject misalignment + uint32_t offset = 32 * (1+part_id); + uint32_t *misalign = (uint32_t *)partition_malloc( + dynamic_heap_alloc, (2*NUM_BANKS + offset) * sizeof(uint32_t)); + printf("Inject misalignment at 0x%8x with size 0x%8x in byte\n", misalign, offset*part_id); + + // 4. 
Allocate memory + uint32_t *array = (uint32_t *)partition_malloc( + dynamic_heap_alloc, array_size * sizeof(uint32_t)); + printf("Aligned start_addr at 0x%8x\n", array); + + // 5. Config the hardware registers + das_config(part_id, num_tiles_per_partition, (uint32_t)(array), array_size * sizeof(uint32_t)); + // 6. Move data + for (uint32_t i = 0; i < array_size; i++) { + array[i] = i; + } + // 7. Change addressing scheme (to fully interleaved) + das_config(part_id, NUM_TILES, (uint32_t)(array), array_size * sizeof(uint32_t)); + // 8. check + for (uint32_t i = 0; i < array_size; i++) { + uint32_t *fetch_address = + &array[0] + + (i % + (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) + + (i / + (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) * + NUM_BANKS; + if (i != *fetch_address) { + printf("%4d != %4d at address %8X.\n", i, *fetch_address, + fetch_address); + return 1; + } + } + // 9. Free array + partition_free(dynamic_heap_alloc, array); + partition_free(dynamic_heap_alloc, misalign); + printf("SUCCESS on partition %d \n\n", part_id); + } + + // -------------------------------------------- + // Verify DAS per Tile groups + // -------------------------------------------- + printf("Verify DAS per Tile-groups\n\n"); + + // 2. Set which partition write to. + uint32_t part_id = 0; + for (num_tiles_per_partition = 1; num_tiles_per_partition < NUM_TILES; + num_tiles_per_partition *= 2) { + array_size = + 2 * num_tiles_per_partition * BANKING_FACTOR * NUM_CORES_PER_TILE; + // 3. Get the allocator + alloc_t *dynamic_heap_alloc = get_dynamic_heap_alloc(); + alloc_dump(dynamic_heap_alloc); + // 4. Allocate memory + uint32_t *array = (uint32_t *)partition_malloc( + dynamic_heap_alloc, array_size * sizeof(uint32_t)); + // 5. Config the hardware registers + das_config(part_id, num_tiles_per_partition, (uint32_t)(array), array_size * sizeof(uint32_t)); + // 6. Move data + for (uint32_t i = 0; i < array_size; i++) { + array[i] = i; + } + // 7. 
Change addressing scheme (to fully interleaved) + das_config(part_id, NUM_TILES, (uint32_t)(array), array_size * sizeof(uint32_t)); + // partition_config(part_id, NUM_TILES); + // 8. check + for (uint32_t i = 0; i < array_size; i++) { + uint32_t *fetch_address = + &array[0] + + (i % + (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) + + (i / + (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) * + NUM_BANKS; + if (i != *fetch_address) { + printf("%4d != %4d at address %8X.\n", i, *fetch_address, + fetch_address); + return 1; + } + } + // 9. Free array + partition_free(dynamic_heap_alloc, array); + printf("SUCCESS for groups of %d tiles over the partition \n\n", + num_tiles_per_partition); + } + + printf("All correct!\n"); + } + + mempool_barrier(num_cores); + return 0; +} diff --git a/software/tests/baremetal/das_static_test/main.c b/software/tests/baremetal/das_static_test/main.c new file mode 100644 index 000000000..3d38995a1 --- /dev/null +++ b/software/tests/baremetal/das_static_test/main.c @@ -0,0 +1,64 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +// Note: This test is only for Terapool dynamic heap allocation +// Author: Bowen Wang + +#include +#include + +#include "alloc.h" +#include "encoding.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" + +#define NUM_TILES (NUM_CORES / NUM_CORES_PER_TILE) +#define ARRAY_SIZE (4096) + +uint32_t array[ARRAY_SIZE] __attribute__((aligned(NUM_BANKS*sizeof(int32_t)), section(".l1_prio"))); + +int main() { + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + mempool_barrier_init(core_id); + + if (core_id == 0) { + + // -------------------------------------------- + // Verify DAS partitions + // -------------------------------------------- + printf("Verify DAS partitions\n\n"); + + uint32_t num_tiles_per_partition = 4; + uint32_t part_id = 0; + + uint32_t num_partitions = NUM_TILES / num_tiles_per_partition; + uint32_t size_partition = ARRAY_SIZE / num_partitions; + + das_config(part_id, num_tiles_per_partition, (uint32_t)(array), ARRAY_SIZE * sizeof(uint32_t)); + for (uint32_t i = 0; i < ARRAY_SIZE; i++) { + array[i] = i; + } + + das_config(part_id, NUM_TILES, (uint32_t)(array), ARRAY_SIZE * sizeof(uint32_t)); + for (uint32_t j = 0; j < num_partitions; j++) { + for (uint32_t i = 0; i < size_partition; i++) { + + uint32_t *fetch_address = &array[0] + + j * (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR) + + (i % (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) + + (i / (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) * NUM_BANKS; + if (i + j * size_partition != *fetch_address) { + printf("%4d != %4d at address %8X.\n", i + j * size_partition, *fetch_address, fetch_address); + return 1; + } + } + } + printf("SUCCESS on partition %d\n", part_id); + } + + mempool_barrier(num_cores); + return 0; +}