diff --git a/.gitignore b/.gitignore index 305e025f..b41e10c2 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ build __pycache__ .hypothesis .vscode +*.log \ No newline at end of file diff --git a/cgra/CgraDmaRTL.py b/cgra/CgraDmaRTL.py new file mode 100644 index 00000000..27d04985 --- /dev/null +++ b/cgra/CgraDmaRTL.py @@ -0,0 +1,217 @@ +""" +========================================================================= +CgraDmaRTL.py +========================================================================= + +Wrapper that composes a CGRA template with a DMA engine attached to the +CGRA data SPM. +""" + +from pymtl3 import * + +from .CgraTemplateRTL import CgraTemplateRTL +from ..lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL +from ..lib.basic.val_rdy.ifcs import ValRdySendIfcRTL as SendIfcRTL +from ..lib.messages import * +from ..lib.util.data_struct_attr import * +from ..mem.dma.DmaEngineRTL import DmaEngineRTL + + +class CgraDmaRTL( Component ): + """ + CgraDmaRTL is a top-level wrapper that integrates a CGRA instance with a + DMA engine. + + Architectural Design: + - It instantiates a standard CGRA template (`CgraTemplateRTL`) and a + DMA engine (`DmaEngineRTL`). + - The DMA engine is connected to the CGRA's internal data SPM through a + dedicated master port on the `DataMemControllerRTL`. + - CPU control packets are passed through to the CGRA's controller. + - External memory requests from the DMA engine are exposed at the top level + to be connected to a DRAM model or an AXI adapter. + - Boundary data ports for multi-CGRA configurations are also passed through + if enabled. 
+ """ + + def construct(s, CgraPayloadType, + multi_cgra_rows, + multi_cgra_columns, + per_cgra_rows, per_cgra_columns, + ctrl_mem_size, data_mem_size_global, + data_mem_size_per_bank, num_banks_per_cgra, + num_registers_per_reg_bank, num_ctrl, + total_steps, mem_access_is_combinational, + FunctionUnit, FuList, TileList, LinkList, + dataSPM, controller2addr_map, idTo2d_map, + is_multi_cgra = True, cgra_id = 0, + # For heterogeneous multi-cgra support.(maybe remove it in CgraDmaRTL for simplicity?) + provided_max_per_cgra_rows = None, + provided_max_per_cgra_cols = None, + provided_max_num_rd_tiles = None, + provided_max_num_wr_tiles = None): + + DataType = CgraPayloadType.get_field_type(kAttrData) + data_bitwidth = DataType.get_field_type(kAttrPayload).nbits + assert data_bitwidth == 32 + + max_per_cgra_rows = provided_max_per_cgra_rows if provided_max_per_cgra_rows is not None else per_cgra_rows + max_per_cgra_cols = provided_max_per_cgra_cols if provided_max_per_cgra_cols is not None else per_cgra_columns + max_num_tiles = max_per_cgra_rows * max_per_cgra_cols + max_num_rd_tiles = provided_max_num_rd_tiles if provided_max_num_rd_tiles is not None else dataSPM.getNumOfValidReadPorts() + + CtrlPktType = mk_intra_cgra_pkt(multi_cgra_columns, multi_cgra_rows, + max_num_tiles, CgraPayloadType) + NocPktType = mk_inter_cgra_pkt(multi_cgra_columns, multi_cgra_rows, + max_num_tiles, max_num_rd_tiles, + CgraPayloadType) + + CgraIdType = mk_cgra_id_type(multi_cgra_columns, multi_cgra_rows) + DataAddrType = mk_bits(clog2(data_mem_size_global)) + DmaOpcodeType = mk_bits(3) #DMA_MVIN: 0, DMA_MVOUT: 1 + DmaDramAddrType = mk_bits(64) + DmaBytesType = mk_bits(32) + DmaTagType = mk_bits(8) + DmaMemDataType = mk_bits(128) # Write/Read 128 bits data per beat from/to DRAM + DmaMemMaskType = mk_bits(16) + + # Existing CGRA-facing interfaces. 
+ # CGRA <-> CPU + s.recv_from_cpu_pkt = RecvIfcRTL(CtrlPktType) + s.send_to_cpu_pkt = SendIfcRTL(CtrlPktType) + + if is_multi_cgra: + s.recv_from_inter_cgra_noc = RecvIfcRTL(NocPktType) + s.send_to_inter_cgra_noc = SendIfcRTL(NocPktType) + + s.recv_data_on_boundary_north = [RecvIfcRTL(DataType) for _ in range(max_per_cgra_cols)] + s.send_data_on_boundary_north = [SendIfcRTL(DataType) for _ in range(max_per_cgra_cols)] + s.recv_data_on_boundary_south = [RecvIfcRTL(DataType) for _ in range(max_per_cgra_cols)] + s.send_data_on_boundary_south = [SendIfcRTL(DataType) for _ in range(max_per_cgra_cols)] + s.recv_data_on_boundary_west = [RecvIfcRTL(DataType) for _ in range(max_per_cgra_rows)] + s.send_data_on_boundary_west = [SendIfcRTL(DataType) for _ in range(max_per_cgra_rows)] + s.recv_data_on_boundary_east = [RecvIfcRTL(DataType) for _ in range(max_per_cgra_rows)] + s.send_data_on_boundary_east = [SendIfcRTL(DataType) for _ in range(max_per_cgra_rows)] + + s.cgra_id = InPort(CgraIdType) + # The local address range of current CGRA. + # Any address out of this range will be assumed as remote address. + s.address_lower = InPort(DataAddrType) + s.address_upper = InPort(DataAddrType) + + # DMA command/done and abstract external memory interfaces. + + s.dma_cmd_val = InPort() # dma_command_valid + s.dma_cmd_rdy = OutPort() # dma_command_ready + s.dma_cmd_opcode = InPort(DmaOpcodeType) + s.dma_cmd_dram_addr = InPort(DmaDramAddrType) + s.dma_cmd_spm_addr = InPort(DataAddrType) + s.dma_cmd_bytes = InPort(DmaBytesType) # The number of bytes to transfer. + s.dma_cmd_tag = InPort(DmaTagType) # Doesn't use it now, but keep it for future use(e.g., distinguish different DMA commands). 
+ + s.dma_done_val = OutPort() + s.dma_done_rdy = InPort() + s.dma_done_tag = OutPort(DmaTagType) # Must be same as the input `dma_cmd_tag` + + s.dram_rd_req = SendIfcRTL(DmaDramAddrType) + s.dram_rd_resp = RecvIfcRTL(DmaMemDataType) + + s.dram_wr_req_val = OutPort() + s.dram_wr_req_rdy = InPort() + s.dram_wr_req_addr = OutPort(DmaDramAddrType) + s.dram_wr_req_data = OutPort(DmaMemDataType) + s.dram_wr_req_mask = OutPort(DmaMemMaskType) # Masks for writing DRAM + + # Components. + + s.cgra = CgraTemplateRTL(CgraPayloadType, + multi_cgra_rows, + multi_cgra_columns, + per_cgra_rows, per_cgra_columns, + ctrl_mem_size, data_mem_size_global, + data_mem_size_per_bank, num_banks_per_cgra, + num_registers_per_reg_bank, num_ctrl, + total_steps, mem_access_is_combinational, + FunctionUnit, FuList, TileList, LinkList, + dataSPM, controller2addr_map, idTo2d_map, + is_multi_cgra, cgra_id, + provided_max_per_cgra_rows, + provided_max_per_cgra_cols, + provided_max_num_rd_tiles, + provided_max_num_wr_tiles, + has_dma_ports = True) + + s.dma = DmaEngineRTL(spm_data_nbits = data_bitwidth, + spm_addr_nbits = clog2(data_mem_size_global)) + + # CGRA passthrough connections. 
+ + s.recv_from_cpu_pkt //= s.cgra.recv_from_cpu_pkt + s.send_to_cpu_pkt //= s.cgra.send_to_cpu_pkt + + if is_multi_cgra: + s.recv_from_inter_cgra_noc //= s.cgra.recv_from_inter_cgra_noc + s.send_to_inter_cgra_noc //= s.cgra.send_to_inter_cgra_noc + + for i in range(max_per_cgra_cols): + s.recv_data_on_boundary_north[i] //= s.cgra.recv_data_on_boundary_north[i] + s.send_data_on_boundary_north[i] //= s.cgra.send_data_on_boundary_north[i] + s.recv_data_on_boundary_south[i] //= s.cgra.recv_data_on_boundary_south[i] + s.send_data_on_boundary_south[i] //= s.cgra.send_data_on_boundary_south[i] + + for i in range(max_per_cgra_rows): + s.recv_data_on_boundary_west[i] //= s.cgra.recv_data_on_boundary_west[i] + s.send_data_on_boundary_west[i] //= s.cgra.send_data_on_boundary_west[i] + s.recv_data_on_boundary_east[i] //= s.cgra.recv_data_on_boundary_east[i] + s.send_data_on_boundary_east[i] //= s.cgra.send_data_on_boundary_east[i] + + s.cgra_id //= s.cgra.cgra_id + s.address_lower //= s.cgra.address_lower + s.address_upper //= s.cgra.address_upper + + # DMA top-level connections. + + s.dma_cmd_val //= s.dma.dma_cmd_val + s.dma_cmd_rdy //= s.dma.dma_cmd_rdy + s.dma_cmd_opcode //= s.dma.dma_cmd_opcode + s.dma_cmd_dram_addr //= s.dma.dma_cmd_dram_addr + s.dma_cmd_spm_addr //= s.dma.dma_cmd_spm_addr + s.dma_cmd_bytes //= s.dma.dma_cmd_bytes + s.dma_cmd_tag //= s.dma.dma_cmd_tag + + s.dma_done_val //= s.dma.dma_done_val + s.dma_done_rdy //= s.dma.dma_done_rdy + s.dma_done_tag //= s.dma.dma_done_tag + + s.dram_rd_req //= s.dma.dram_rd_req + s.dram_rd_resp //= s.dma.dram_rd_resp + + s.dram_wr_req_val //= s.dma.dram_wr_req_val + s.dram_wr_req_rdy //= s.dma.dram_wr_req_rdy + s.dram_wr_req_addr //= s.dma.dram_wr_req_addr + s.dram_wr_req_data //= s.dma.dram_wr_req_data + s.dram_wr_req_mask //= s.dma.dram_wr_req_mask + + s.dram_wr_resp_val //= s.dma.dram_wr_resp_val + s.dram_wr_resp_rdy //= s.dma.dram_wr_resp_rdy + + # DMA to SPM connections. 
+ + s.dma.spm_dma_wval //= s.cgra.spm_dma_wval + s.dma.spm_dma_wrdy //= s.cgra.spm_dma_wrdy + s.dma.spm_dma_waddr //= s.cgra.spm_dma_waddr + s.dma.spm_dma_wdata //= s.cgra.spm_dma_wdata + s.dma.spm_dma_wmask //= s.cgra.spm_dma_wmask + + s.dma.spm_dma_rval //= s.cgra.spm_dma_rval + s.dma.spm_dma_rrdy //= s.cgra.spm_dma_rrdy + s.dma.spm_dma_raddr //= s.cgra.spm_dma_raddr + s.dma.spm_dma_rresp_val //= s.cgra.spm_dma_rresp_val + s.dma.spm_dma_rresp_rdy //= s.cgra.spm_dma_rresp_rdy + s.dma.spm_dma_rresp_data //= s.cgra.spm_dma_rresp_data + + def line_trace(s): + return f"{s.dma.line_trace()} || {s.cgra.line_trace()}" diff --git a/cgra/CgraTemplateRTL.py b/cgra/CgraTemplateRTL.py index 300d7832..5d57cb1e 100644 --- a/cgra/CgraTemplateRTL.py +++ b/cgra/CgraTemplateRTL.py @@ -83,7 +83,8 @@ def construct(s, CgraPayloadType, provided_max_per_cgra_rows = None, provided_max_per_cgra_cols = None, provided_max_num_rd_tiles = None, - provided_max_num_wr_tiles = None): + provided_max_num_wr_tiles = None, + has_dma_ports = False): """ provided_max_per_cgra_rows: the row number of the largest cgra in the multi heterogeneous cgra architecture. None for single cgra arch or Homogeneous multi-cgra arch. provided_max_per_cgra_cols: the column number of the largest cgra in the multi heterogeneous cgra architecture. None for single cgra arch or Homogeneous multi-cgra arch. @@ -126,6 +127,8 @@ def construct(s, CgraPayloadType, CtrlRingPos = mk_ring_pos(max_num_tiles + 1) CtrlAddrType = mk_bits(clog2(ctrl_mem_size)) DataAddrType = mk_bits(clog2(data_mem_size_global)) + DmaDataType = DataType.get_field_type(kAttrPayload) + DmaMaskType = mk_bits(max(1, DmaDataType.nbits // CHAR_BIT)) assert(data_mem_size_per_bank * num_banks_per_cgra <= \ data_mem_size_global) @@ -135,6 +138,25 @@ def construct(s, CgraPayloadType, s.recv_from_inter_cgra_noc = RecvIfcRTL(NocPktType) s.send_to_inter_cgra_noc = SendIfcRTL(NocPktType) + # Optional DMA interface ports. 
These are exposed at the template level + # to allow a top-level wrapper (like CgraDmaRTL) to connect a DMA engine + # directly to the internal DataMemController. + if has_dma_ports: + # DMA write request interface. + s.spm_dma_wval = InPort() # dma write request valid(write data into SPM) + s.spm_dma_wrdy = OutPort() + s.spm_dma_waddr = InPort(DataAddrType) + s.spm_dma_wdata = InPort(DmaDataType) + s.spm_dma_wmask = InPort(DmaMaskType) + + # DMA read response interface. + s.spm_dma_rval = InPort() + s.spm_dma_rrdy = OutPort() + s.spm_dma_raddr = InPort(DataAddrType) + s.spm_dma_rresp_val = OutPort() + s.spm_dma_rresp_rdy = InPort() + s.spm_dma_rresp_data = OutPort(DmaDataType) + if is_multi_cgra: # Use the largest CGRA shape to set the boundary ports for compatibility in the case of heterogeneous multi-cgra. # Remember to ground the remaining boundary ports of the current CGRA when the current CGRA has fewer rows or columns than the largest CGRA. @@ -168,7 +190,8 @@ def construct(s, CgraPayloadType, multi_cgra_columns, max_num_tiles, mem_access_is_combinational, - idTo2d_map) + idTo2d_map, + has_dma_ports) s.cgra_id = InPort(CgraIdType) s.controller = ControllerRTL(NocPktType, multi_cgra_rows, multi_cgra_columns, @@ -190,6 +213,22 @@ def construct(s, CgraPayloadType, s.data_mem.address_lower //= s.address_lower s.data_mem.address_upper //= s.address_upper + if has_dma_ports: + # DMA_MVIN: dram -> dma -> spm + s.data_mem.spm_dma_wval //= s.spm_dma_wval + s.data_mem.spm_dma_wrdy //= s.spm_dma_wrdy + s.data_mem.spm_dma_waddr //= s.spm_dma_waddr + s.data_mem.spm_dma_wdata //= s.spm_dma_wdata + s.data_mem.spm_dma_wmask //= s.spm_dma_wmask + + # DMA_MVOUT: spm -> dma -> dram + s.data_mem.spm_dma_rval //= s.spm_dma_rval + s.data_mem.spm_dma_rrdy //= s.spm_dma_rrdy + s.data_mem.spm_dma_raddr //= s.spm_dma_raddr + s.data_mem.spm_dma_rresp_val //= s.spm_dma_rresp_val + s.data_mem.spm_dma_rresp_rdy //= s.spm_dma_rresp_rdy + s.data_mem.spm_dma_rresp_data //= 
s.spm_dma_rresp_data + # Connects data memory with controller. s.data_mem.recv_from_noc_load_request //= s.controller.send_to_mem_load_request s.data_mem.recv_from_noc_store_request //= s.controller.send_to_mem_store_request diff --git a/cgra/test/CgraDmaRTL_test.py b/cgra/test/CgraDmaRTL_test.py new file mode 100644 index 00000000..c9d61e41 --- /dev/null +++ b/cgra/test/CgraDmaRTL_test.py @@ -0,0 +1,232 @@ +""" +========================================================================== +CgraDmaRTL_test.py +========================================================================== +""" + +from pymtl3 import * + +from ..CgraDmaRTL import CgraDmaRTL +from ...fu.single.AdderRTL import AdderRTL +from ...fu.single.MemUnitRTL import MemUnitRTL +from ...fu.single.RetRTL import RetRTL +from ...lib.messages import * +from ...lib.opt_type import * +from ...lib.util.cgra.DataSPM import DataSPM +from ...lib.util.cgra.Tile import Tile +from ...lib.util.cgra.cgra_helper import get_links +from ...mem.dma.DmaEngineRTL import DMA_MVIN, DMA_MVOUT + + +def test_cgra_dma_mvin_to_local_spm(): + """ + Integration test for the CgraDmaRTL wrapper. + It simulates a DMA MVIN command that moves data from external DRAM into + the CGRA's dataSPM. It then checks the SPM contents to ensure the + transfer was successful. 
+ """ + ctrl_mem_size = 8 + data_mem_size_global = 64 + data_mem_size_per_bank = 16 + num_banks_per_cgra = 4 + num_registers_per_reg_bank = 16 + num_ctrl = 1 + total_steps = 1 + + DataType = mk_data(32, 1) + WordType = mk_bits(32) + DataAddrType = mk_bits(clog2(data_mem_size_global)) + CtrlAddrType = mk_bits(clog2(ctrl_mem_size)) + CtrlType = mk_ctrl(4, 2, 8, 8, num_registers_per_reg_bank) + CgraPayloadType = mk_cgra_payload(DataType, DataAddrType, CtrlType, CtrlAddrType) + CtrlPktType = mk_intra_cgra_pkt(1, 1, 4, CgraPayloadType) + + # 2x2 tiles + tiles_2d = [[Tile(x, y, num_registers_per_reg_bank, ["add", "mem", "return"]) + for x in range(2)] for y in range(2)] + TileList = [t for row in tiles_2d for t in row] + LinkList = get_links(tiles_2d) + # The first row and the first column of the 2x2 tiles are connected to the data SPM. + dataSPM = DataSPM(3, 3) + + dut = CgraDmaRTL(CgraPayloadType, + 1, 1, # multi_cgra_rows, multi_cgra_columns + 2, 2, # per_cgra_rows, per_cgra_columns + ctrl_mem_size, data_mem_size_global, + data_mem_size_per_bank, num_banks_per_cgra, + num_registers_per_reg_bank, num_ctrl, + total_steps, True, + None, [AdderRTL, MemUnitRTL, RetRTL], + TileList, LinkList, dataSPM, + {0: [0, 15]}, # controller to address map + {0: [0, 0]}, # cgra id to 2D coordinate + is_multi_cgra = False) + + dut.apply(DefaultPassGroup()) + dut.sim_reset() + + dut.cgra_id @= 0 + # Address range: [0:15] + dut.address_lower @= DataAddrType(0) + dut.address_upper @= DataAddrType(15) + + dut.recv_from_cpu_pkt.val @= 0 + dut.recv_from_cpu_pkt.msg @= CtrlPktType() + dut.send_to_cpu_pkt.rdy @= 1 + dut.dram_rd_req.rdy @= 1 + dut.dram_rd_resp.val @= 0 + dut.dram_rd_resp.msg @= 0 + dut.dram_wr_req_rdy @= 1 + dut.dram_wr_resp_val @= 0 + dut.dma_done_rdy @= 1 + + dut.dma_cmd_val @= 1 + dut.dma_cmd_opcode @= DMA_MVIN + # Read the data of DRAM from address 0x1000(16 bytes in total), + # then write the data to SPM from address 0x0 to 0x3. 
+ dut.dma_cmd_dram_addr @= 0x1000 + dut.dma_cmd_spm_addr @= DataAddrType(0) + dut.dma_cmd_bytes @= 16 + dut.dma_cmd_tag @= 0x33 + dut.sim_eval_combinational() + assert dut.dma_cmd_rdy + dut.sim_tick() + dut.dma_cmd_val @= 0 + + beat = concat(WordType(0x44444444), WordType(0x33333333), + WordType(0x22222222), WordType(0x11111111)) + pending_resp = False + + for _ in range(40): + dut.dram_rd_resp.val @= 0 + if pending_resp: + dut.dram_rd_resp.val @= 1 + # Simulate the read response from DRAM. + dut.dram_rd_resp.msg @= beat + + dut.sim_eval_combinational() + + pending_resp = bool(dut.dram_rd_req.val & dut.dram_rd_req.rdy) + + if dut.dma_done_val: + # Transfer finished, check the tag. + assert int(dut.dma_done_tag) == 0x33 + break + + dut.sim_tick() + + assert dut.dma_done_val + # Check the data in the dataSPM. + assert dut.cgra.data_mem.memory_wrapper[0].memory.regs[0] == DataType(0x11111111, 1, 0, 0) + assert dut.cgra.data_mem.memory_wrapper[0].memory.regs[1] == DataType(0x22222222, 1, 0, 0) + assert dut.cgra.data_mem.memory_wrapper[0].memory.regs[2] == DataType(0x33333333, 1, 0, 0) + assert dut.cgra.data_mem.memory_wrapper[0].memory.regs[3] == DataType(0x44444444, 1, 0, 0) + + +def test_cgra_dma_mvout_from_local_spm(): + """ + Integration test for the CgraDmaRTL wrapper. + It simulates a DMA MVOUT command that moves data from the local SPM + into external DRAM. 
+ """ + ctrl_mem_size = 8 + data_mem_size_global = 64 + data_mem_size_per_bank = 16 + num_banks_per_cgra = 4 + num_registers_per_reg_bank = 16 + num_ctrl = 1 + total_steps = 1 + + DataType = mk_data(32, 1) + WordType = mk_bits(32) + DataAddrType = mk_bits(clog2(data_mem_size_global)) + CtrlAddrType = mk_bits(clog2(ctrl_mem_size)) + CtrlType = mk_ctrl(4, 2, 8, 8, num_registers_per_reg_bank) + CgraPayloadType = mk_cgra_payload(DataType, DataAddrType, CtrlType, CtrlAddrType) + CtrlPktType = mk_intra_cgra_pkt(1, 1, 4, CgraPayloadType) + + tiles_2d = [[Tile(x, y, num_registers_per_reg_bank, ["add", "mem", "return"]) + for x in range(2)] for y in range(2)] + TileList = [t for row in tiles_2d for t in row] + LinkList = get_links(tiles_2d) + dataSPM = DataSPM(3, 3) + + dut = CgraDmaRTL(CgraPayloadType, + 1, 1, # multi_cgra_rows, multi_cgra_columns + 2, 2, # per_cgra_rows, per_cgra_columns + ctrl_mem_size, data_mem_size_global, + data_mem_size_per_bank, num_banks_per_cgra, + num_registers_per_reg_bank, num_ctrl, + total_steps, True, + None, [AdderRTL, MemUnitRTL, RetRTL], + TileList, LinkList, dataSPM, + {0: [0, 15]}, # controller to address map + {0: [0, 0]}, # cgra id to 2D coordinate + is_multi_cgra = False) + + dut.apply(DefaultPassGroup()) + dut.sim_reset() + + # Pre-load SPM with data + dut.cgra.data_mem.memory_wrapper[0].memory.regs[0] <<= DataType(0x11111111, 1, 0, 0) + dut.cgra.data_mem.memory_wrapper[0].memory.regs[1] <<= DataType(0x22222222, 1, 0, 0) + dut.cgra.data_mem.memory_wrapper[0].memory.regs[2] <<= DataType(0x33333333, 1, 0, 0) + dut.cgra.data_mem.memory_wrapper[0].memory.regs[3] <<= DataType(0x44444444, 1, 0, 0) + dut.sim_tick() + + dut.cgra_id @= 0 + # Address range: [0:15] + dut.address_lower @= DataAddrType(0) + dut.address_upper @= DataAddrType(15) + + dut.recv_from_cpu_pkt.val @= 0 + dut.recv_from_cpu_pkt.msg @= CtrlPktType() + dut.send_to_cpu_pkt.rdy @= 1 + dut.dram_rd_req.rdy @= 1 + dut.dram_rd_resp.val @= 0 + dut.dram_rd_resp.msg @= 0 + 
dut.dram_wr_req_rdy @= 1 + dut.dram_wr_resp_val @= 0 + dut.dma_done_rdy @= 1 + + # Issue DMA MVOUT command + dut.dma_cmd_val @= 1 + dut.dma_cmd_opcode @= DMA_MVOUT + # Read the data of SPM from address 0x0 to 0x3(16 bytes in total), + # then write the data to DRAM address 0x2000. + dut.dma_cmd_dram_addr @= 0x2000 + dut.dma_cmd_spm_addr @= DataAddrType(0) + dut.dma_cmd_bytes @= 16 + dut.dma_cmd_tag @= 0x44 + dut.sim_eval_combinational() + assert dut.dma_cmd_rdy + dut.sim_tick() + dut.dma_cmd_val @= 0 + + # Expected 128-bit beat + expected_beat = concat(WordType(0x44444444), WordType(0x33333333), + WordType(0x22222222), WordType(0x11111111)) + + done = False + pending_wr_resp = False + for _ in range(40): + dut.dram_wr_resp_val @= 0 + if pending_wr_resp: + dut.dram_wr_resp_val @= 1 + pending_wr_resp = False + + if dut.dram_wr_req_val: + assert dut.dram_wr_req_addr == 0x2000 + assert dut.dram_wr_req_data == expected_beat + pending_wr_resp = True + + dut.sim_eval_combinational() + + if dut.dma_done_val: + assert int(dut.dma_done_tag) == 0x44 + done = True + break + + dut.sim_tick() + + assert done diff --git a/lib/util/common.py b/lib/util/common.py index 51650d67..eedb056e 100644 --- a/lib/util/common.py +++ b/lib/util/common.py @@ -65,3 +65,28 @@ READ_TOWARDS_FU = 1 READ_TOWARDS_ROUTING_XBAR = 2 READ_TOWARDS_BOTH = 3 + +############################ +# Constants for DMA engine. +############################ +# DMA Move In and Out +# DMA_MVIN : DRAM -> DMA Engine -> SPM +# DMA_MVOUT : SPM -> DMA Engine -> DRAM +DMA_MVIN = 0 +DMA_MVOUT = 1 + +# 1 byte = 8 bits +CHAR_BIT = 8 + +# State machine definitions of DMA engine. 
+from pymtl3 import mk_bits +StateType = mk_bits( 4 ) +STATE_IDLE = StateType( 0 ) # Waiting for a new DMA command +STATE_MVIN_REQ = StateType( 1 ) # MVIN: Issuing DRAM read request +STATE_MVIN_RESP = StateType( 2 ) # MVIN: Waiting for DRAM read response +STATE_MVIN_WRITE = StateType( 3 ) # MVIN: Writing unpacked words to SPM +STATE_MVOUT_READ = StateType( 4 ) # MVOUT: Issuing SPM read request +STATE_MVOUT_RESP = StateType( 5 ) # MVOUT: Receiving SPM read response and packing +STATE_MVOUT_WRITE = StateType( 6 ) # MVOUT: Issuing DRAM write request +STATE_MVOUT_WAIT = StateType( 7 ) # MVOUT: Waiting for DRAM write response +STATE_DONE = StateType( 8 ) # Signaling command completion diff --git a/local_CI.py b/local_CI.py new file mode 100644 index 00000000..f35198f8 --- /dev/null +++ b/local_CI.py @@ -0,0 +1,77 @@ +""" +local_CI.py is a script that runs the CI tests locally. +Usage: +```shell +cd /path/to/VectorCGRA/ +mkdir -p build && cd build +python3 ../local_CI.py +``` +The log is saved as `local_CI.log` next to this script (the repository root). 
+""" +import subprocess +import os +import sys + +def run_tests(): + current_dir = os.path.dirname(os.path.abspath(__file__)) + log_file = os.path.join(current_dir, "local_CI.log") + + commands = [ + ["pytest", "..", "-v", "--tb=short"], + ["pytest", "../mem/ctrl/test/CtrlMemDynamicRTL_test.py", "-xvs"], + ["pytest", "../tile/test/TileRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"], + ["pytest", "../controller/test/ControllerRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"], + ["pytest", "../cgra/test/CgraTemplateRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"], + ["pytest", "../cgra/test/CgraRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"], + ["pytest", "../noc/PyOCN/pymtl3_net/ringnet/test/RingNetworkRTL_test.py"], + ["pytest", "../multi_cgra/test/RingMultiCgraRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"], + ["pytest", "../multi_cgra/test/MeshMultiCgraRTL_test.py::test_verilog_homo_2x2_4x4", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"], + ["pytest", "../mem/const/test/ConstQueueDynamicRTL_test.py", "-xvs"], + ["pytest", "../mem/data/test/DataMemControllerRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"], + ["pytest", "../multi_cgra/test/MeshMultiCgraTemplateRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"], + ["pytest", "../multi_cgra/test/MeshMultiCgraRTL_test.py::test_multi_CGRA_fir_scalar_translation", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"], + ["pytest", "../multi_cgra/test/MeshMultiCgraRTL_test.py::test_multi_CGRA_fir_vector_global_reduce_translation", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"], + ["pytest", "../multi_cgra/test/MeshMultiCgraRTL_test.py::test_multi_CGRA_systolic_2x2_2x2_translation", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"] + ] + + with open(log_file, "w", encoding="utf-8") as f: + for cmd in commands: + cmd_str = " ".join(cmd) + header = f"\n{'='*80}\nExecuting: 
{cmd_str}\n{'='*80}\n" + + print(header) + f.write(header) + f.flush() + + try: + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1 + ) + + for line in process.stdout: + print(line, end="") + f.write(line) + + process.wait() + + if process.returncode == 0: + status = f"\nSUCCESS: {cmd_str}\n" + else: + status = f"\nFAILED (Exit Code {process.returncode}): {cmd_str}\n" + + print(status) + f.write(status) + + except Exception as e: + error_msg = f"\nERROR executing {cmd_str}: {str(e)}\n" + print(error_msg) + f.write(error_msg) + + print(f"\n\nAll tests completed. Log saved to: {os.path.abspath(log_file)}") + +if __name__ == "__main__": + run_tests() \ No newline at end of file diff --git a/mem/data/DataMemControllerRTL.py b/mem/data/DataMemControllerRTL.py index 2e80af35..e7897422 100644 --- a/mem/data/DataMemControllerRTL.py +++ b/mem/data/DataMemControllerRTL.py @@ -1,451 +1,581 @@ -""" -========================================================================== -DataMemControllerRTL.py -========================================================================== -Data memory for CGRA. It has addtional port to connect to controller, -which can be used for multi-CGRA fabric. - - Send/recv data request/response to/from other CGRA controllers. - - Based on whether the target data address is within the local space. - - Coherence is not targeted for now; protyping in static memory space. - - Send/recv cmd request/response to/from other CGRA controllers. - - E.g., dynamic rescheduling. - - The cmd can be originally derived from a runtime scheduler. - -In addition, it contains a crossbar to handle multi-bank conflicts. - - Crossbar contains an arbitor, i.e., stall may happen on certain port. - - Therefore, bypass queue is leveraged on the input port. - - [ ] https://github.com/tancheng/VectorCGRA/issues/26: - Blocking vs. non-blocking should be configured/propagated here. 
- - Non-blocking: - - Immediate return data though it is not ready: - - Bank conflicted lower priority access. - - Remote accessed data. - - Blocking and non-blocking might be configurabled in a dynamic way. - -Author : Cheng Tan - Date : Aug 28, 2025 -""" - -from .DataMemWrapperRTL import DataMemWrapperRTL -from ...lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL -from ...lib.basic.val_rdy.ifcs import ValRdySendIfcRTL as SendIfcRTL -from ...lib.messages import * -from ...noc.PyOCN.pymtl3_net.xbar.XbarBypassQueueRTL import XbarBypassQueueRTL -from ...lib.util.data_struct_attr import * - -class DataMemControllerRTL(Component): - def construct(s, - NocPktType, - data_mem_size_global, - data_mem_size_per_bank, - num_banks_per_cgra = 4, - num_rd_tiles = 4, - num_wr_tiles = 4, - multi_cgra_rows = 2, - multi_cgra_columns = 2, - num_tiles = 16, - mem_access_is_combinational = True, - idTo2d_map = {0: [0, 0]}): - - CgraPayloadType = NocPktType.get_field_type(kAttrPayload) - DataType = CgraPayloadType.get_field_type(kAttrData) - # Constants. - global_addr_nbits = clog2(data_mem_size_global) - per_bank_addr_nbits = clog2(data_mem_size_per_bank) - assert(2 ** global_addr_nbits == data_mem_size_global) - assert(2 ** per_bank_addr_nbits == data_mem_size_per_bank) - XType = mk_bits(max(clog2(multi_cgra_columns), 1)) - YType = mk_bits(max(clog2(multi_cgra_rows), 1)) - AddrType = mk_bits(global_addr_nbits) - PerBankAddrType = mk_bits(per_bank_addr_nbits) - s.num_banks_per_cgra = num_banks_per_cgra - LocalBankIndexType = mk_bits(clog2(num_banks_per_cgra)) - s.num_rd_tiles = num_rd_tiles - s.num_wr_tiles = num_wr_tiles - RdTileIdType = mk_bits(clog2(num_rd_tiles)) - # The additional port is for the request from inter-cgra NoC via controller. 
- num_xbar_in_rd_ports = num_rd_tiles + 1 - num_xbar_in_wr_ports = num_wr_tiles + 1 - num_xbar_out_rd_ports = num_banks_per_cgra + 1 - num_xbar_out_wr_ports = num_banks_per_cgra + 1 - num_cgras = multi_cgra_rows * multi_cgra_columns - XbarOutRdType = mk_bits(clog2(num_xbar_out_rd_ports)) - XbarOutWrType = mk_bits(clog2(num_xbar_out_wr_ports)) - MemReadPktType = \ - mk_mem_access_pkt(DataType, - num_xbar_in_rd_ports, - num_xbar_out_rd_ports, - data_mem_size_global, - num_cgras, - num_tiles, - num_rd_tiles) - MemWritePktType = \ - mk_mem_access_pkt(DataType, - num_xbar_in_wr_ports, - num_xbar_out_wr_ports, - data_mem_size_global, - num_cgras, - num_tiles, - num_rd_tiles) - - # Reverses the source and destination for response packet. - MemResponsePktType = \ - mk_mem_access_pkt(DataType, - num_xbar_out_rd_ports, - num_xbar_in_rd_ports, - data_mem_size_global, - num_cgras, - num_tiles, - num_rd_tiles) - - # Interfaces. - # [num_rd_tiles] indicates the request from the NoC. ---> Add separate recv port for NoC. - s.recv_from_noc_load_request = RecvIfcRTL(NocPktType) - s.recv_from_noc_store_request = RecvIfcRTL(NocPktType) - - # [0, ..., num_rd_tiles - 1] indicate the requests from/to the tiles, - s.recv_raddr = [RecvIfcRTL(AddrType) for _ in range(num_rd_tiles)] - s.recv_waddr = [RecvIfcRTL(AddrType) for _ in range(num_wr_tiles)] - s.recv_wdata = [RecvIfcRTL(DataType) for _ in range(num_wr_tiles)] - - - s.send_rdata = [SendIfcRTL(DataType) for _ in range(num_rd_tiles)] - - s.send_to_noc_load_response_pkt = SendIfcRTL(NocPktType) - - # Response that is from a remote SRAM. - s.recv_from_noc_load_response_pkt = RecvIfcRTL(NocPktType) - - # Requests that targets remote SRAMs. - s.send_to_noc_load_request_pkt = SendIfcRTL(NocPktType) - s.send_to_noc_store_pkt = SendIfcRTL(NocPktType) - - # Components. 
- s.memory_wrapper = [DataMemWrapperRTL(DataType, MemReadPktType, MemWritePktType, MemResponsePktType, - data_mem_size_global, data_mem_size_per_bank, mem_access_is_combinational) - for _ in range(num_banks_per_cgra)] - # The additional 1 on inports indicates the read/write from NoC. - # The additional 1 on outports indicates the request out of bound of - # local memory space that would be forwarded to NoC. - s.read_crossbar = XbarBypassQueueRTL(MemReadPktType, num_xbar_in_rd_ports, - num_xbar_out_rd_ports) - s.write_crossbar = XbarBypassQueueRTL(MemWritePktType, num_xbar_in_wr_ports, - num_xbar_out_wr_ports) - s.response_crossbar = XbarBypassQueueRTL(MemResponsePktType, num_xbar_out_rd_ports, - num_xbar_in_rd_ports) - - s.rd_pkt = [Wire(MemReadPktType) for _ in range(num_xbar_in_rd_ports)] - s.wr_pkt = [Wire(MemWritePktType) for _ in range(num_xbar_in_wr_ports)] - - s.cgra_id = InPort(mk_bits(max(1, clog2(num_cgras)))) - - s.address_lower = InPort(AddrType) - s.address_upper = InPort(AddrType) - - # Constructs the idTo2d lut. - s.idTo2d_x_lut= [Wire(XType) for _ in range(multi_cgra_columns * multi_cgra_rows)] - s.idTo2d_y_lut= [Wire(YType) for _ in range(multi_cgra_columns * multi_cgra_rows)] - for cgra_id in idTo2d_map: - xy = idTo2d_map[cgra_id] - s.idTo2d_x_lut[cgra_id] //= XType(xy[0]) - s.idTo2d_y_lut[cgra_id] //= YType(xy[1]) - - # Connections. - for i in range(num_banks_per_cgra): - s.read_crossbar.send[i] //= s.memory_wrapper[i].recv_rd - s.write_crossbar.send[i] //= s.memory_wrapper[i].recv_wr - s.memory_wrapper[i].send //= s.response_crossbar.recv[i] - - @update - def assemble_xbar_pkt(): - for i in range(num_xbar_in_rd_ports): - s.rd_pkt[i] @= MemReadPktType(i, 0, 0, DataType(0, 0, 0, 0), 0, 0, i) - - for i in range(num_xbar_in_wr_ports): - s.wr_pkt[i] @= MemWritePktType(i, 0, 0, DataType(0, 0, 0, 0), 0, 0, i) - - for i in range(num_rd_tiles): - recv_raddr = s.recv_raddr[i].msg - # Calculates the target bank index for load. 
- if (recv_raddr >= s.address_lower) & (recv_raddr <= s.address_upper): - bank_index_load_local = trunc((recv_raddr - s.address_lower) >> per_bank_addr_nbits, XbarOutRdType) - else: - bank_index_load_local = XbarOutRdType(num_banks_per_cgra) - # FIXME: change to exact tile id. - s.rd_pkt[i] @= MemReadPktType(i, # src - bank_index_load_local, # dst - recv_raddr, # addr - DataType(0, 0, 0, 0), # data - s.cgra_id, # src_cgra - 0, # src_tile - i) # remote_src_port - - recv_raddr_from_noc = s.recv_from_noc_load_request.msg.payload.data_addr - # Calculates the target bank index. - if (recv_raddr_from_noc >= s.address_lower) & (recv_raddr_from_noc <= s.address_upper): - bank_index_load_from_noc = trunc((recv_raddr_from_noc - s.address_lower) >> per_bank_addr_nbits, XbarOutRdType) - else: - bank_index_load_from_noc = XbarOutRdType(num_banks_per_cgra) - s.rd_pkt[num_rd_tiles] @= MemReadPktType(num_rd_tiles, # src - bank_index_load_from_noc, # dst - recv_raddr_from_noc, # addr - DataType(0, 0, 0, 0), # data - s.recv_from_noc_load_request.msg.src, # src_cgra - s.recv_from_noc_load_request.msg.src_tile_id, # src_tile - s.recv_from_noc_load_request.msg.remote_src_port) # remote_src_port - - for i in range(num_wr_tiles): - recv_waddr = s.recv_waddr[i].msg - # Calculates the target bank index for store. 
- if (recv_waddr >= s.address_lower) & (recv_waddr <= s.address_upper): - bank_index_store_local = trunc((recv_waddr - s.address_lower) >> per_bank_addr_nbits, XbarOutWrType) - else: - bank_index_store_local = XbarOutWrType(num_banks_per_cgra) - s.wr_pkt[i] @= MemWritePktType(i, # src - bank_index_store_local, # dst - recv_waddr, # addr - s.recv_wdata[i].msg, # data - 0, # src_cgra - 0, # src_tile - i) # remote_src_port - - recv_waddr_from_noc = s.recv_from_noc_store_request.msg.payload.data_addr - recv_wdata_from_noc = s.recv_from_noc_store_request.msg.payload.data - if (recv_waddr_from_noc >= s.address_lower) & (recv_waddr_from_noc <= s.address_upper): - bank_index_store_from_noc = trunc((recv_waddr_from_noc - s.address_lower) >> per_bank_addr_nbits, XbarOutWrType) - else: - bank_index_store_from_noc = XbarOutWrType(num_banks_per_cgra) - s.wr_pkt[num_wr_tiles] @= MemWritePktType(num_wr_tiles, # src - bank_index_store_from_noc, # dst - recv_waddr_from_noc, # addr - recv_wdata_from_noc, # data - 0, # src_cgra - 0, # src_tile - num_wr_tiles) # remote_src_port - - # Connects xbar with the memory wrapper. - @update - def update_all(): - # Initializes the signals. 
- for i in range(num_rd_tiles): - s.recv_raddr[i].rdy @= 0 - s.recv_from_noc_load_request.rdy @= 0 - - for i in range(num_wr_tiles): - s.recv_waddr[i].rdy @= 0 - # s.recv_wdata_bypass_q[i].send.rdy @= 0 - s.recv_from_noc_store_request.rdy @= 0 - # s.recv_wdata_bypass_q[num_wr_tiles].send.rdy @= 0 - - for i in range(num_rd_tiles): - s.send_rdata[i].val @= 0 - s.send_rdata[i].msg @= DataType() - s.send_to_noc_load_response_pkt.val @= 0 - - s.send_to_noc_load_response_pkt.msg @= \ - NocPktType(0, # src - 0, # dst - 0, # src_x - 0, # src_y - 0, # dst_x - 0, # dst_y - 0, # src_tile_id - 0, # dst_tile_id - 0, # remote_src_port - 0, # opaque - 0, # vc_id - CgraPayloadType(0, 0, 0, 0, 0)) - - - for i in range(num_wr_tiles): - s.recv_wdata[i].rdy @= 0 - - s.send_to_noc_store_pkt.msg @= \ - NocPktType(0, # src - 0, # dst - 0, # src_x - 0, # src_y - 0, # dst_x - 0, # dst_y - 0, # src_tile_id - 0, # dst_tile_id - 0, # remote_src_port - 0, # opaque - 0, # vc_id - CgraPayloadType(0, 0, 0, 0, 0)) - - s.send_to_noc_store_pkt.val @= 0 - - for i in range(num_xbar_in_rd_ports): - s.read_crossbar.recv[i].val @= 0 - s.read_crossbar.recv[i].msg @= MemReadPktType(0, 0, 0, DataType(0, 0, 0, 0), 0, 0, 0) - - s.recv_from_noc_load_response_pkt.rdy @= 0 - - for i in range(num_xbar_in_wr_ports): - s.write_crossbar.recv[i].val @= 0 - s.write_crossbar.recv[i].msg @= MemWritePktType(0, 0, 0, DataType(0, 0, 0, 0), 0, 0, 0) - - s.send_to_noc_load_request_pkt.msg @= \ - NocPktType(0, # src - 0, # dst - 0, # src_x - 0, # src_y - 0, # dst_x - 0, # dst_y - 0, # src_tile_id - 0, # dst_tile_id - 0, # remote_src_port - 0, # opaque - 0, # vc_id - CgraPayloadType(0, 0, 0, 0, 0)) - - s.send_to_noc_load_request_pkt.val @= 0 - - # Connects the load request ports (from tiles and NoC) to the xbar targetting memory and NoC. 
- for i in range(num_rd_tiles): - s.read_crossbar.recv[i].val @= s.recv_raddr[i].val - s.read_crossbar.recv[i].msg @= s.rd_pkt[i] - s.recv_raddr[i].rdy @= s.read_crossbar.recv[i].rdy - s.read_crossbar.recv[num_rd_tiles].val @= s.recv_from_noc_load_request.val - s.read_crossbar.recv[num_rd_tiles].msg @= s.rd_pkt[num_rd_tiles] - s.recv_from_noc_load_request.rdy @= s.read_crossbar.recv[num_rd_tiles].rdy - - # Connects the store request ports (from tiles and NoC) to the xbar targetting memory and NoC. - for i in range(num_wr_tiles): - s.write_crossbar.recv[i].val @= s.recv_waddr[i].val - s.write_crossbar.recv[i].msg @= s.wr_pkt[i] - s.recv_waddr[i].rdy @= s.write_crossbar.recv[i].rdy - s.recv_wdata[i].rdy @= s.write_crossbar.recv[i].rdy - s.write_crossbar.recv[num_wr_tiles].val @= s.recv_from_noc_store_request.val - s.write_crossbar.recv[num_wr_tiles].msg @= s.wr_pkt[num_wr_tiles] - s.recv_from_noc_store_request.rdy @= s.write_crossbar.recv[num_wr_tiles].rdy - - # Connects the response ports to tiles and NoC from the xbar. - # Number of load responses is expected to be the same as the number of load requests. - for i in range(num_xbar_in_rd_ports): - if i < num_rd_tiles: - s.send_rdata[RdTileIdType(i)].msg @= s.response_crossbar.send[i].msg.data - s.send_rdata[RdTileIdType(i)].val @= s.response_crossbar.send[i].val - s.response_crossbar.send[i].rdy @= s.send_rdata[RdTileIdType(i)].rdy - else: - from_cgra_id = s.response_crossbar.send[i].msg.src_cgra - from_tile_id = s.response_crossbar.send[i].msg.src_tile - s.send_to_noc_load_response_pkt.msg @= \ - NocPktType( - s.cgra_id, # src_cgra_id - from_cgra_id, # dst_cgra_id - s.idTo2d_x_lut[s.cgra_id], # src_cgra_x - s.idTo2d_y_lut[s.cgra_id], # src_cgra_y - s.idTo2d_x_lut[from_cgra_id], # dst_cgra_x - s.idTo2d_y_lut[from_cgra_id], # dst_cgra_y - 0, # src_tile_id set as 0 as it is from memory rather than a specific tile. 
- from_tile_id, # dst_tile_id - s.response_crossbar.send[i].msg.remote_src_port, # remote_src_port, carries the original source port id towards the src. - 0, # opaque - 0, # vc_id - CgraPayloadType( - CMD_LOAD_RESPONSE, - s.response_crossbar.send[i].msg.data, - s.response_crossbar.send[i].msg.addr, 0, 0)) - - s.send_to_noc_load_response_pkt.val @= s.response_crossbar.send[i].val - s.response_crossbar.send[i].rdy @= s.send_to_noc_load_response_pkt.rdy - - # Handles the request (not response) towards the others via the NoC. The dst would be - # updated in the controller. - s.send_to_noc_load_request_pkt.msg @= \ - NocPktType(s.cgra_id, # src - 0, # dst - s.idTo2d_x_lut[s.cgra_id], # src_x - s.idTo2d_y_lut[s.cgra_id], # src_y - 0, # dst_x - 0, # dst_y - 0, # src_tile_id - 0, # dst_tile_id - s.read_crossbar.send[num_banks_per_cgra].msg.src, # remote_src_port - 0, # opaque - 0, # vc_id - CgraPayloadType( - CMD_LOAD_REQUEST, - 0, - s.read_crossbar.send[num_banks_per_cgra].msg.addr, 0, 0)) - - s.send_to_noc_load_request_pkt.val @= s.read_crossbar.send[num_banks_per_cgra].val - # TODO: https://github.com/tancheng/VectorCGRA/issues/26 -- Modify this part for non-blocking access. - # 'val` indicates the data is arbitrated successfully. - s.recv_from_noc_load_response_pkt.rdy @= s.response_crossbar.recv[num_banks_per_cgra].rdy - s.response_crossbar.recv[num_banks_per_cgra].val @= s.recv_from_noc_load_response_pkt.val - s.response_crossbar.recv[num_banks_per_cgra].msg @= \ - MemResponsePktType(num_banks_per_cgra, - s.recv_from_noc_load_response_pkt.msg.remote_src_port, - s.recv_from_noc_load_response_pkt.msg.payload.data_addr, - s.recv_from_noc_load_response_pkt.msg.payload.data, - s.recv_from_noc_load_response_pkt.msg.src, - s.recv_from_noc_load_response_pkt.msg.src_tile_id, - 0) - - # Allows other load request towards NoC when the previous one is not responded. There - # could be out-of-order load response, i.e., potential consistency issue. 
- s.read_crossbar.send[num_banks_per_cgra].rdy @= s.send_to_noc_load_request_pkt.rdy - - # Handles the write port towards the NoC. - s.send_to_noc_store_pkt.msg @= \ - NocPktType(s.cgra_id, # src - 0, # dst - s.idTo2d_x_lut[s.cgra_id], # src_x - s.idTo2d_y_lut[s.cgra_id], # src_y - 0, # dst_x - 0, # dst_y - 0, # src_tile_id - 0, # dst_tile_id - s.write_crossbar.send[num_banks_per_cgra].msg.src, # remote_src_port - 0, # opaque - 0, # vc_id - CgraPayloadType( - CMD_STORE_REQUEST, - s.write_crossbar.send[num_banks_per_cgra].msg.data, - s.write_crossbar.send[num_banks_per_cgra].msg.addr, 0, 0)) - - s.send_to_noc_store_pkt.val @= s.write_crossbar.send[num_banks_per_cgra].val - s.write_crossbar.send[num_banks_per_cgra].rdy @= s.send_to_noc_store_pkt.rdy - - def line_trace(s): - recv_raddr_str = "recv_from_tile_read_addr: {" - recv_waddr_str = "recv_from_tile_write_addr: {" - recv_wdata_str = "recv_from_tile_write_data: {" - content_str = "content: {" - send_rdata_str = "send_to_tile_read_data: {" - - send_to_noc_load_request_pkt_str = "send_to_noc_load_request_pkt: {" - send_to_noc_load_response_pkt_str = "send_to_noc_load_response_pkt: {" - recv_from_noc_load_response_pkt_str = "recv_from_noc_load_response_pkt: {" - send_to_noc_store_pkt_str = "send_to_noc_store_pkt: {" - - - for b in range(s.num_banks_per_cgra): - recv_raddr_str += " bank[" + str(b) + "]: " + "|".join([str(data.msg) for data in s.recv_raddr]) + ";" - recv_waddr_str += " bank[" + str(b) + "]: " + "|".join([str(data.msg) for data in s.recv_waddr]) + ";" - recv_wdata_str += " bank[" + str(b) + "]: " + "|".join([str(data.msg) for data in s.recv_wdata]) + ";" - content_str += " bank[" + str(b) + "]: " + "|".join([str(data) for data in s.memory_wrapper[b].memory.regs]) + ";" - send_rdata_str += " bank[" + str(b) + "]: " + "|".join([str(data.msg) for data in s.send_rdata]) + ";" - - send_to_noc_load_request_pkt_str += str(s.send_to_noc_load_request_pkt.msg) + ";" - send_to_noc_load_response_pkt_str += " " + 
str(s.send_to_noc_load_response_pkt.msg) + " " - recv_from_noc_load_response_pkt_str += str(s.recv_from_noc_load_response_pkt.msg) + ";" - send_to_noc_store_pkt_str += str(s.send_to_noc_store_pkt.msg) + ", val: " + str(s.send_to_noc_store_pkt.val) + ";" - - recv_raddr_str += "}" - send_rdata_str += "}" - recv_waddr_str += "}" - recv_wdata_str += "}" - send_to_noc_load_request_pkt_str += "}" - send_to_noc_load_response_pkt_str += "}" - recv_from_noc_load_response_pkt_str += "}" - send_to_noc_store_pkt_str += "}" - read_crossbar_str = "read_crossbar: " + s.read_crossbar.line_trace() - write_crossbar_str = "write_crossbar: " + s.write_crossbar.line_trace() - content_str += "}" - - return f'{recv_raddr_str} || {recv_waddr_str} || {recv_wdata_str} || {send_rdata_str} || {send_to_noc_load_request_pkt_str} || {send_to_noc_load_response_pkt_str} || {recv_from_noc_load_response_pkt_str} || {send_to_noc_store_pkt_str} || {read_crossbar_str} || {write_crossbar_str} || [{content_str}]' - +""" +========================================================================== +DataMemControllerRTL.py +========================================================================== +Data memory for CGRA. It has an additional port to connect to the controller, +which can be used for multi-CGRA fabric. + - Send/recv data request/response to/from other CGRA controllers. + - Based on whether the target data address is within the local space. + - Coherence is not targeted for now; prototyping in static memory space. + - Send/recv cmd request/response to/from other CGRA controllers. + - E.g., dynamic rescheduling. + - The cmd can be originally derived from a runtime scheduler. + +In addition, it contains a crossbar to handle multi-bank conflicts. + - Crossbar contains an arbiter, i.e., stalls may happen on certain ports. + - Therefore, bypass queue is leveraged on the input port. + - [ ] https://github.com/tancheng/VectorCGRA/issues/26: + Blocking vs. non-blocking should be configured/propagated here. 
+ - Non-blocking: + - Immediately return data even though it is not ready: + - Bank conflicted lower priority access. + - Remote accessed data. + - Blocking and non-blocking might be configurable in a dynamic way. + +Author : Cheng Tan + Date : Aug 28, 2025 +""" + +from .DataMemWrapperRTL import DataMemWrapperRTL +from ...lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL +from ...lib.basic.val_rdy.ifcs import ValRdySendIfcRTL as SendIfcRTL +from ...lib.messages import * +from ...noc.PyOCN.pymtl3_net.xbar.XbarBypassQueueRTL import XbarBypassQueueRTL +from ...lib.util.data_struct_attr import * +from ...lib.util.common import CHAR_BIT + +class DataMemControllerRTL(Component): + """ + DataMemControllerRTL manages access to the multi-banked data SPM. + It arbitrates between multiple request sources: + 1. Local tiles (via `recv_raddr`, `recv_waddr`, `recv_wdata`) + 2. Inter-CGRA NoC (via `recv_from_noc_load_request`, etc.) + 3. Optional DMA engine (via `spm_dma_wval`, `spm_dma_rval`, etc.) + + Architectural Design: + - Uses crossbars to route requests to the correct memory bank based on the + address. + - Supports an optional DMA interface. When `has_dma_ports` is True, extra + ports are added to the read and write crossbars. + - DMA requests are treated as another master on the memory bus, competing + with tiles and NoC traffic. + """ + def construct(s, + NocPktType, + data_mem_size_global, + data_mem_size_per_bank, + num_banks_per_cgra = 4, + num_rd_tiles = 4, + num_wr_tiles = 4, + multi_cgra_rows = 2, + multi_cgra_columns = 2, + num_tiles = 16, + mem_access_is_combinational = True, + idTo2d_map = {0: [0, 0]}, + has_dma_ports = False): + + CgraPayloadType = NocPktType.get_field_type(kAttrPayload) + DataType = CgraPayloadType.get_field_type(kAttrData) + # Constants. 
+ global_addr_nbits = clog2(data_mem_size_global) + per_bank_addr_nbits = clog2(data_mem_size_per_bank) + assert(2 ** global_addr_nbits == data_mem_size_global) + assert(2 ** per_bank_addr_nbits == data_mem_size_per_bank) + XType = mk_bits(max(clog2(multi_cgra_columns), 1)) + YType = mk_bits(max(clog2(multi_cgra_rows), 1)) + AddrType = mk_bits(global_addr_nbits) + PerBankAddrType = mk_bits(per_bank_addr_nbits) + DmaDataType = DataType.get_field_type(kAttrPayload) + DmaMaskType = mk_bits(max(1, DmaDataType.nbits // CHAR_BIT)) + NocRemoteSrcPortType = NocPktType.get_field_type(kAttrRemoteSrcPort) + s.num_banks_per_cgra = num_banks_per_cgra + s.has_dma_ports = has_dma_ports + LocalBankIndexType = mk_bits(max(1, clog2(num_banks_per_cgra))) + s.num_rd_tiles = num_rd_tiles + s.num_wr_tiles = num_wr_tiles + RdTileIdType = mk_bits(max(1, clog2(num_rd_tiles))) + # The additional port is for the request from inter-cgra NoC via controller. + # If DMA is enabled, we add one more port for the DMA engine. + dma_port_offset = 1 if has_dma_ports else 0 + num_xbar_in_rd_ports = num_rd_tiles + 1 + dma_port_offset + num_xbar_in_wr_ports = num_wr_tiles + 1 + dma_port_offset + num_xbar_out_rd_ports = num_banks_per_cgra + 1 + num_xbar_out_wr_ports = num_banks_per_cgra + 1 + num_cgras = multi_cgra_rows * multi_cgra_columns + XbarOutRdType = mk_bits(clog2(num_xbar_out_rd_ports)) + XbarOutWrType = mk_bits(clog2(num_xbar_out_wr_ports)) + XbarInRdType = mk_bits(clog2(num_xbar_in_rd_ports)) + XbarInWrType = mk_bits(clog2(num_xbar_in_wr_ports)) + MemReadPktType = \ + mk_mem_access_pkt(DataType, + num_xbar_in_rd_ports, + num_xbar_out_rd_ports, + data_mem_size_global, + num_cgras, + num_tiles, + num_rd_tiles) + MemWritePktType = \ + mk_mem_access_pkt(DataType, + num_xbar_in_wr_ports, + num_xbar_out_wr_ports, + data_mem_size_global, + num_cgras, + num_tiles, + num_rd_tiles) + + # Reverses the source and destination for response packet. 
+ MemResponsePktType = \ + mk_mem_access_pkt(DataType, + num_xbar_out_rd_ports, + num_xbar_in_rd_ports, + data_mem_size_global, + num_cgras, + num_tiles, + num_rd_tiles) + + # Interfaces. + # [num_rd_tiles] indicates the request from the NoC. ---> Add separate recv port for NoC. + s.recv_from_noc_load_request = RecvIfcRTL(NocPktType) + s.recv_from_noc_store_request = RecvIfcRTL(NocPktType) + + # [0, ..., num_rd_tiles - 1] indicate the requests from/to the tiles, + s.recv_raddr = [RecvIfcRTL(AddrType) for _ in range(num_rd_tiles)] + s.recv_waddr = [RecvIfcRTL(AddrType) for _ in range(num_wr_tiles)] + s.recv_wdata = [RecvIfcRTL(DataType) for _ in range(num_wr_tiles)] + + + s.send_rdata = [SendIfcRTL(DataType) for _ in range(num_rd_tiles)] + + s.send_to_noc_load_response_pkt = SendIfcRTL(NocPktType) + + # Response that is from a remote SRAM. + s.recv_from_noc_load_response_pkt = RecvIfcRTL(NocPktType) + + # Requests that targets remote SRAMs. + s.send_to_noc_load_request_pkt = SendIfcRTL(NocPktType) + s.send_to_noc_store_pkt = SendIfcRTL(NocPktType) + + if has_dma_ports: + # DMA writes SPM: used by DMA_MVIN. + s.spm_dma_wval = InPort() + s.spm_dma_wrdy = OutPort() + s.spm_dma_waddr = InPort(AddrType) + s.spm_dma_wdata = InPort(DmaDataType) + s.spm_dma_wmask = InPort(DmaMaskType) + + # DMA reads SPM: used by DMA_MVOUT. + s.spm_dma_rval = InPort() + s.spm_dma_rrdy = OutPort() + s.spm_dma_raddr = InPort(AddrType) + s.spm_dma_rresp_val = OutPort() + s.spm_dma_rresp_rdy = InPort() + s.spm_dma_rresp_data = OutPort(DmaDataType) + else: + # Keep these as internal wires so PyMTL's static update-block analysis + # can see declared objects even when the optional DMA interface is off. 
+ s.spm_dma_wval = Wire() + s.spm_dma_wrdy = Wire() + s.spm_dma_waddr = Wire(AddrType) + s.spm_dma_wdata = Wire(DmaDataType) + s.spm_dma_wmask = Wire(DmaMaskType) + + s.spm_dma_rval = Wire() + s.spm_dma_rrdy = Wire() + s.spm_dma_raddr = Wire(AddrType) + s.spm_dma_rresp_val = Wire() + s.spm_dma_rresp_rdy = Wire() + s.spm_dma_rresp_data = Wire(DmaDataType) + + s.spm_dma_wval //= 0 + s.spm_dma_waddr //= AddrType(0) + s.spm_dma_wdata //= DmaDataType(0) + s.spm_dma_wmask //= DmaMaskType(0) + s.spm_dma_rval //= 0 + s.spm_dma_raddr //= AddrType(0) + s.spm_dma_rresp_rdy //= 0 + + # Components. + # A list of DataMemWrapperRTL instances. Each one is a single memory bank. + s.memory_wrapper = [DataMemWrapperRTL(DataType, MemReadPktType, MemWritePktType, MemResponsePktType, + data_mem_size_global, data_mem_size_per_bank, mem_access_is_combinational) + for _ in range(num_banks_per_cgra)] + # The additional 1 on inports indicates the read/write from NoC. + # The additional 1 on outports indicates the request out of bound of + # local memory space that would be forwarded to NoC. + s.read_crossbar = XbarBypassQueueRTL(MemReadPktType, num_xbar_in_rd_ports, + num_xbar_out_rd_ports) + s.write_crossbar = XbarBypassQueueRTL(MemWritePktType, num_xbar_in_wr_ports, + num_xbar_out_wr_ports) + s.response_crossbar = XbarBypassQueueRTL(MemResponsePktType, num_xbar_out_rd_ports, + num_xbar_in_rd_ports) + + s.rd_pkt = [Wire(MemReadPktType) for _ in range(num_xbar_in_rd_ports)] + s.wr_pkt = [Wire(MemWritePktType) for _ in range(num_xbar_in_wr_ports)] + + s.cgra_id = InPort(mk_bits(max(1, clog2(num_cgras)))) + + s.address_lower = InPort(AddrType) + s.address_upper = InPort(AddrType) + + # Constructs the idTo2d lut. 
+ s.idTo2d_x_lut= [Wire(XType) for _ in range(multi_cgra_columns * multi_cgra_rows)] + s.idTo2d_y_lut= [Wire(YType) for _ in range(multi_cgra_columns * multi_cgra_rows)] + for cgra_id in idTo2d_map: + xy = idTo2d_map[cgra_id] + s.idTo2d_x_lut[cgra_id] //= XType(xy[0]) + s.idTo2d_y_lut[cgra_id] //= YType(xy[1]) + + # Connections. + for i in range(num_banks_per_cgra): + s.read_crossbar.send[i] //= s.memory_wrapper[i].recv_rd + s.write_crossbar.send[i] //= s.memory_wrapper[i].recv_wr + s.memory_wrapper[i].send //= s.response_crossbar.recv[i] + + @update + def assemble_xbar_pkt(): + for i in range(num_xbar_in_rd_ports): + s.rd_pkt[i] @= MemReadPktType(i, 0, 0, DataType(0, 0, 0, 0), 0, 0, 0) + + for i in range(num_xbar_in_wr_ports): + s.wr_pkt[i] @= MemWritePktType(i, 0, 0, DataType(0, 0, 0, 0), 0, 0, 0) + + for i in range(num_rd_tiles): + recv_raddr = s.recv_raddr[i].msg + # Calculates the target bank index for load. + if (recv_raddr >= s.address_lower) & (recv_raddr <= s.address_upper): + bank_index_load_local = trunc((recv_raddr - s.address_lower) >> per_bank_addr_nbits, XbarOutRdType) + else: + bank_index_load_local = XbarOutRdType(num_banks_per_cgra) + # FIXME: change to exact tile id. + s.rd_pkt[i] @= MemReadPktType(i, # src + bank_index_load_local, # dst + recv_raddr, # addr + DataType(0, 0, 0, 0), # data + s.cgra_id, # src_cgra + 0, # src_tile + i) # remote_src_port + + recv_raddr_from_noc = s.recv_from_noc_load_request.msg.payload.data_addr + # Calculates the target bank index. 
+ if (recv_raddr_from_noc >= s.address_lower) & (recv_raddr_from_noc <= s.address_upper): + bank_index_load_from_noc = trunc((recv_raddr_from_noc - s.address_lower) >> per_bank_addr_nbits, XbarOutRdType) + else: + bank_index_load_from_noc = XbarOutRdType(num_banks_per_cgra) + s.rd_pkt[num_rd_tiles] @= MemReadPktType(num_rd_tiles, # src + bank_index_load_from_noc, # dst + recv_raddr_from_noc, # addr + DataType(0, 0, 0, 0), # data + s.recv_from_noc_load_request.msg.src, # src_cgra + s.recv_from_noc_load_request.msg.src_tile_id, # src_tile + s.recv_from_noc_load_request.msg.remote_src_port) # remote_src_port + + for i in range(num_wr_tiles): + recv_waddr = s.recv_waddr[i].msg + # Calculates the target bank index for store. + if (recv_waddr >= s.address_lower) & (recv_waddr <= s.address_upper): + bank_index_store_local = trunc((recv_waddr - s.address_lower) >> per_bank_addr_nbits, XbarOutWrType) + else: + bank_index_store_local = XbarOutWrType(num_banks_per_cgra) + s.wr_pkt[i] @= MemWritePktType(i, # src + bank_index_store_local, # dst + recv_waddr, # addr + s.recv_wdata[i].msg, # data + 0, # src_cgra + 0, # src_tile + i) # remote_src_port + + recv_waddr_from_noc = s.recv_from_noc_store_request.msg.payload.data_addr + recv_wdata_from_noc = s.recv_from_noc_store_request.msg.payload.data + if (recv_waddr_from_noc >= s.address_lower) & (recv_waddr_from_noc <= s.address_upper): + bank_index_store_from_noc = trunc((recv_waddr_from_noc - s.address_lower) >> per_bank_addr_nbits, XbarOutWrType) + else: + bank_index_store_from_noc = XbarOutWrType(num_banks_per_cgra) + s.wr_pkt[num_wr_tiles] @= MemWritePktType(num_wr_tiles, # src + bank_index_store_from_noc, # dst + recv_waddr_from_noc, # addr + recv_wdata_from_noc, # data + 0, # src_cgra + 0, # src_tile + num_wr_tiles) # remote_src_port + + if has_dma_ports: + + # When `has_dma_ports` is True, num_xbar_in_wr_ports = num_wr_tiles + 1 + 1(dma_port_offset). 
+ # Use dma_wr_idx = num_wr_tiles + 1 = num_xbar_in_wr_ports - 1 + # NOTE Don't use `dma_wr_idx = num_wr_tiles + 1` here since it will cause the bit mismatch error + # between `dma_wr_idx` and `num_xbar_in_wr_ports`. + dma_rd_idx = XbarInRdType(num_xbar_in_rd_ports - 1) + dma_wr_idx = XbarInWrType(num_xbar_in_wr_ports - 1) + + recv_raddr_from_dma = s.spm_dma_raddr + if (recv_raddr_from_dma >= s.address_lower) & (recv_raddr_from_dma <= s.address_upper): + bank_index_load_from_dma = trunc((recv_raddr_from_dma - s.address_lower) >> per_bank_addr_nbits, XbarOutRdType) + else: + bank_index_load_from_dma = XbarOutRdType(num_banks_per_cgra) + s.rd_pkt[dma_rd_idx] @= MemReadPktType(dma_rd_idx, # src + bank_index_load_from_dma, # dst + recv_raddr_from_dma, # addr + DataType(0, 0, 0, 0), # data + s.cgra_id, # src_cgra + 0, # src_tile + 0) # remote_src_port + + recv_waddr_from_dma = s.spm_dma_waddr + if (recv_waddr_from_dma >= s.address_lower) & (recv_waddr_from_dma <= s.address_upper): + bank_index_store_from_dma = trunc((recv_waddr_from_dma - s.address_lower) >> per_bank_addr_nbits, XbarOutWrType) + else: + bank_index_store_from_dma = XbarOutWrType(num_banks_per_cgra) + s.wr_pkt[dma_wr_idx] @= MemWritePktType(dma_wr_idx, # src + bank_index_store_from_dma, # dst + recv_waddr_from_dma, # addr + DataType(s.spm_dma_wdata, 1, 0, 0), + 0, # src_cgra + 0, # src_tile + 0) # remote_src_port + + # Connects xbar with the memory wrapper. + @update + def update_all(): + # Initializes the signals. 
+ for i in range(num_rd_tiles): + s.recv_raddr[i].rdy @= 0 + s.recv_from_noc_load_request.rdy @= 0 + + for i in range(num_wr_tiles): + s.recv_waddr[i].rdy @= 0 + # s.recv_wdata_bypass_q[i].send.rdy @= 0 + s.recv_from_noc_store_request.rdy @= 0 + # s.recv_wdata_bypass_q[num_wr_tiles].send.rdy @= 0 + + for i in range(num_rd_tiles): + s.send_rdata[i].val @= 0 + s.send_rdata[i].msg @= DataType() + s.send_to_noc_load_response_pkt.val @= 0 + + s.send_to_noc_load_response_pkt.msg @= \ + NocPktType(0, # src + 0, # dst + 0, # src_x + 0, # src_y + 0, # dst_x + 0, # dst_y + 0, # src_tile_id + 0, # dst_tile_id + 0, # remote_src_port + 0, # opaque + 0, # vc_id + CgraPayloadType(0, 0, 0, 0, 0)) + + + for i in range(num_wr_tiles): + s.recv_wdata[i].rdy @= 0 + + s.send_to_noc_store_pkt.msg @= \ + NocPktType(0, # src + 0, # dst + 0, # src_x + 0, # src_y + 0, # dst_x + 0, # dst_y + 0, # src_tile_id + 0, # dst_tile_id + 0, # remote_src_port + 0, # opaque + 0, # vc_id + CgraPayloadType(0, 0, 0, 0, 0)) + + s.send_to_noc_store_pkt.val @= 0 + + for i in range(num_xbar_in_rd_ports): + s.read_crossbar.recv[i].val @= 0 + s.read_crossbar.recv[i].msg @= MemReadPktType(0, 0, 0, DataType(0, 0, 0, 0), 0, 0, 0) + + s.recv_from_noc_load_response_pkt.rdy @= 0 + + for i in range(num_xbar_in_wr_ports): + s.write_crossbar.recv[i].val @= 0 + s.write_crossbar.recv[i].msg @= MemWritePktType(0, 0, 0, DataType(0, 0, 0, 0), 0, 0, 0) + + if has_dma_ports: + s.spm_dma_wrdy @= 0 + s.spm_dma_rrdy @= 0 + s.spm_dma_rresp_val @= 0 + s.spm_dma_rresp_data @= DmaDataType(0) + + s.send_to_noc_load_request_pkt.msg @= \ + NocPktType(0, # src + 0, # dst + 0, # src_x + 0, # src_y + 0, # dst_x + 0, # dst_y + 0, # src_tile_id + 0, # dst_tile_id + 0, # remote_src_port + 0, # opaque + 0, # vc_id + CgraPayloadType(0, 0, 0, 0, 0)) + + s.send_to_noc_load_request_pkt.val @= 0 + + # Connects the load request ports (from tiles and NoC) to the xbar targetting memory and NoC. 
+ for i in range(num_rd_tiles): + s.read_crossbar.recv[i].val @= s.recv_raddr[i].val + s.read_crossbar.recv[i].msg @= s.rd_pkt[i] + s.recv_raddr[i].rdy @= s.read_crossbar.recv[i].rdy + s.read_crossbar.recv[num_rd_tiles].val @= s.recv_from_noc_load_request.val + s.read_crossbar.recv[num_rd_tiles].msg @= s.rd_pkt[num_rd_tiles] + s.recv_from_noc_load_request.rdy @= s.read_crossbar.recv[num_rd_tiles].rdy + + if has_dma_ports: + # When `has_dma_ports` is True, num_xbar_in_rd_ports = num_rd_tiles + 1 + 1(dma_port_offset). + # Use dma_rd_idx = num_rd_tiles + 1 = num_xbar_in_rd_ports - 1 + # NOTE Don't use `dma_rd_idx = num_rd_tiles + 1` here since it will cause the bit mismatch error + # between `dma_rd_idx` and `num_xbar_in_rd_ports`. + dma_rd_idx = XbarInRdType(num_xbar_in_rd_ports - 1) + s.read_crossbar.recv[dma_rd_idx].val @= s.spm_dma_rval + s.read_crossbar.recv[dma_rd_idx].msg @= s.rd_pkt[dma_rd_idx] + s.spm_dma_rrdy @= s.read_crossbar.recv[dma_rd_idx].rdy + + # Connects the store request ports (from tiles and NoC) to the xbar targetting memory and NoC. + for i in range(num_wr_tiles): + s.write_crossbar.recv[i].val @= s.recv_waddr[i].val + s.write_crossbar.recv[i].msg @= s.wr_pkt[i] + s.recv_waddr[i].rdy @= s.write_crossbar.recv[i].rdy + s.recv_wdata[i].rdy @= s.write_crossbar.recv[i].rdy + s.write_crossbar.recv[num_wr_tiles].val @= s.recv_from_noc_store_request.val + s.write_crossbar.recv[num_wr_tiles].msg @= s.wr_pkt[num_wr_tiles] + s.recv_from_noc_store_request.rdy @= s.write_crossbar.recv[num_wr_tiles].rdy + + if has_dma_ports: + # When `has_dma_ports` is True, num_xbar_in_wr_ports = num_wr_tiles + 1 + 1(dma_port_offset). + # Use dma_wr_idx = num_wr_tiles + 1 = num_xbar_in_wr_ports - 1 + # NOTE Don't use `dma_wr_idx = num_wr_tiles + 1` here since it will cause the bit mismatch error + # between `dma_wr_idx` and `num_xbar_in_wr_ports`. 
+ dma_wr_idx = XbarInWrType(num_xbar_in_wr_ports - 1) + s.write_crossbar.recv[dma_wr_idx].val @= s.spm_dma_wval + s.write_crossbar.recv[dma_wr_idx].msg @= s.wr_pkt[dma_wr_idx] + s.spm_dma_wrdy @= s.write_crossbar.recv[dma_wr_idx].rdy + + # Connects the response ports to tiles and NoC from the xbar. + # Number of load responses is expected to be the same as the number of load requests. + for i in range(num_xbar_in_rd_ports): + if i < num_rd_tiles: + s.send_rdata[RdTileIdType(i)].msg @= s.response_crossbar.send[i].msg.data + s.send_rdata[RdTileIdType(i)].val @= s.response_crossbar.send[i].val + s.response_crossbar.send[i].rdy @= s.send_rdata[RdTileIdType(i)].rdy + elif i == num_rd_tiles: + from_cgra_id = s.response_crossbar.send[i].msg.src_cgra + from_tile_id = s.response_crossbar.send[i].msg.src_tile + s.send_to_noc_load_response_pkt.msg @= \ + NocPktType( + s.cgra_id, # src_cgra_id + from_cgra_id, # dst_cgra_id + s.idTo2d_x_lut[s.cgra_id], # src_cgra_x + s.idTo2d_y_lut[s.cgra_id], # src_cgra_y + s.idTo2d_x_lut[from_cgra_id], # dst_cgra_x + s.idTo2d_y_lut[from_cgra_id], # dst_cgra_y + 0, # src_tile_id set as 0 as it is from memory rather than a specific tile. + from_tile_id, # dst_tile_id + s.response_crossbar.send[i].msg.remote_src_port, # remote_src_port, carries the original source port id towards the src. + 0, # opaque + 0, # vc_id + CgraPayloadType( + CMD_LOAD_RESPONSE, + s.response_crossbar.send[i].msg.data, + s.response_crossbar.send[i].msg.addr, 0, 0)) + + s.send_to_noc_load_response_pkt.val @= s.response_crossbar.send[i].val + s.response_crossbar.send[i].rdy @= s.send_to_noc_load_response_pkt.rdy + elif has_dma_ports: + s.spm_dma_rresp_data @= s.response_crossbar.send[i].msg.data.payload + s.spm_dma_rresp_val @= s.response_crossbar.send[i].val + s.response_crossbar.send[i].rdy @= s.spm_dma_rresp_rdy + + # Handles the request (not response) towards the others via the NoC. The dst would be + # updated in the controller. 
+ s.send_to_noc_load_request_pkt.msg @= \ + NocPktType(s.cgra_id, # src + 0, # dst + s.idTo2d_x_lut[s.cgra_id], # src_x + s.idTo2d_y_lut[s.cgra_id], # src_y + 0, # dst_x + 0, # dst_y + 0, # src_tile_id + 0, # dst_tile_id + trunc(s.read_crossbar.send[num_banks_per_cgra].msg.src, NocRemoteSrcPortType), # remote_src_port + 0, # opaque + 0, # vc_id + CgraPayloadType( + CMD_LOAD_REQUEST, + 0, + s.read_crossbar.send[num_banks_per_cgra].msg.addr, 0, 0)) + + s.send_to_noc_load_request_pkt.val @= s.read_crossbar.send[num_banks_per_cgra].val + # TODO: https://github.com/tancheng/VectorCGRA/issues/26 -- Modify this part for non-blocking access. + # 'val` indicates the data is arbitrated successfully. + s.recv_from_noc_load_response_pkt.rdy @= s.response_crossbar.recv[num_banks_per_cgra].rdy + s.response_crossbar.recv[num_banks_per_cgra].val @= s.recv_from_noc_load_response_pkt.val + s.response_crossbar.recv[num_banks_per_cgra].msg @= \ + MemResponsePktType(num_banks_per_cgra, + zext(s.recv_from_noc_load_response_pkt.msg.remote_src_port, XbarInRdType), + s.recv_from_noc_load_response_pkt.msg.payload.data_addr, + s.recv_from_noc_load_response_pkt.msg.payload.data, + s.recv_from_noc_load_response_pkt.msg.src, + s.recv_from_noc_load_response_pkt.msg.src_tile_id, + 0) + + # Allows other load request towards NoC when the previous one is not responded. There + # could be out-of-order load response, i.e., potential consistency issue. + s.read_crossbar.send[num_banks_per_cgra].rdy @= s.send_to_noc_load_request_pkt.rdy + + # Handles the write port towards the NoC. 
+ s.send_to_noc_store_pkt.msg @= \ + NocPktType(s.cgra_id, # src + 0, # dst + s.idTo2d_x_lut[s.cgra_id], # src_x + s.idTo2d_y_lut[s.cgra_id], # src_y + 0, # dst_x + 0, # dst_y + 0, # src_tile_id + 0, # dst_tile_id + trunc(s.write_crossbar.send[num_banks_per_cgra].msg.src, NocRemoteSrcPortType), # remote_src_port + 0, # opaque + 0, # vc_id + CgraPayloadType( + CMD_STORE_REQUEST, + s.write_crossbar.send[num_banks_per_cgra].msg.data, + s.write_crossbar.send[num_banks_per_cgra].msg.addr, 0, 0)) + + s.send_to_noc_store_pkt.val @= s.write_crossbar.send[num_banks_per_cgra].val + s.write_crossbar.send[num_banks_per_cgra].rdy @= s.send_to_noc_store_pkt.rdy + + def line_trace(s): + recv_raddr_str = "recv_from_tile_read_addr: {" + recv_waddr_str = "recv_from_tile_write_addr: {" + recv_wdata_str = "recv_from_tile_write_data: {" + content_str = "content: {" + send_rdata_str = "send_to_tile_read_data: {" + + send_to_noc_load_request_pkt_str = "send_to_noc_load_request_pkt: {" + send_to_noc_load_response_pkt_str = "send_to_noc_load_response_pkt: {" + recv_from_noc_load_response_pkt_str = "recv_from_noc_load_response_pkt: {" + send_to_noc_store_pkt_str = "send_to_noc_store_pkt: {" + + + for b in range(s.num_banks_per_cgra): + recv_raddr_str += " bank[" + str(b) + "]: " + "|".join([str(data.msg) for data in s.recv_raddr]) + ";" + recv_waddr_str += " bank[" + str(b) + "]: " + "|".join([str(data.msg) for data in s.recv_waddr]) + ";" + recv_wdata_str += " bank[" + str(b) + "]: " + "|".join([str(data.msg) for data in s.recv_wdata]) + ";" + content_str += " bank[" + str(b) + "]: " + "|".join([str(data) for data in s.memory_wrapper[b].memory.regs]) + ";" + send_rdata_str += " bank[" + str(b) + "]: " + "|".join([str(data.msg) for data in s.send_rdata]) + ";" + + send_to_noc_load_request_pkt_str += str(s.send_to_noc_load_request_pkt.msg) + ";" + send_to_noc_load_response_pkt_str += " " + str(s.send_to_noc_load_response_pkt.msg) + " " + recv_from_noc_load_response_pkt_str += 
def make_types(data_mem_size_global, ctrl_mem_size, num_tiles, num_rd_tiles):
  """
  Creates the message types needed to instantiate and drive the
  DataMemControllerRTL in this test file.

  Args:
    data_mem_size_global: Number of entries in the global data memory;
      the data address width is clog2 of this value.
    ctrl_mem_size: Number of entries in the control memory; the control
      address width is clog2 of this value.
    num_tiles: Tile count forwarded to mk_inter_cgra_pkt.
    num_rd_tiles: Read-tile count forwarded to mk_inter_cgra_pkt.

  Returns:
    A (DataType, DataAddrType, NocPktType) tuple.
  """
  # Address types are sized to cover their respective memories.
  data_addr_t = mk_bits(clog2(data_mem_size_global))
  ctrl_addr_t = mk_bits(clog2(ctrl_mem_size))

  data_t = mk_data(32, 1)
  ctrl_t = mk_ctrl(4, 2, 4, 4, 16)

  payload_t = mk_cgra_payload(data_t, data_addr_t, ctrl_t, ctrl_addr_t)
  noc_pkt_t = mk_inter_cgra_pkt(1, 1, num_tiles, num_rd_tiles, payload_t)

  return data_t, data_addr_t, noc_pkt_t
def drive_defaults(dut, DataAddrType, DataType, NocPktType, num_rd_tiles, num_wr_tiles):
  """
  Drives every input of the DUT to an idle default.

  All request-valid inputs are de-asserted, all ready inputs on the
  DUT's outgoing interfaces are asserted, and the address window is set
  to [0, 15] for CGRA 0.

  Args:
    dut: The DataMemControllerRTL instance (built with DMA ports).
    DataAddrType: Bits type used for data memory addresses.
    DataType: Data message type of the tile write-data ports.
    NocPktType: Packet type of the NoC-facing interfaces.
    num_rd_tiles: Number of tile read ports to initialize.
    num_wr_tiles: Number of tile write ports to initialize.
  """
  # Tile read side: no pending request, read data is always accepted.
  for rd in range(num_rd_tiles):
    dut.recv_raddr[rd].msg @= DataAddrType(0)
    dut.recv_raddr[rd].val @= 0
    dut.send_rdata[rd].rdy @= 1

  # Tile write side: neither address nor data is offered.
  for wr in range(num_wr_tiles):
    dut.recv_waddr[wr].msg @= DataAddrType(0)
    dut.recv_waddr[wr].val @= 0
    dut.recv_wdata[wr].msg @= DataType(0, 0, 0, 0)
    dut.recv_wdata[wr].val @= 0

  # NoC -> controller: nothing arrives.
  dut.recv_from_noc_load_request.msg @= NocPktType()
  dut.recv_from_noc_load_request.val @= 0
  dut.recv_from_noc_store_request.msg @= NocPktType()
  dut.recv_from_noc_store_request.val @= 0
  dut.recv_from_noc_load_response_pkt.msg @= NocPktType()
  dut.recv_from_noc_load_response_pkt.val @= 0

  # Controller -> NoC: always ready to take packets.
  dut.send_to_noc_load_request_pkt.rdy @= 1
  dut.send_to_noc_load_response_pkt.rdy @= 1
  dut.send_to_noc_store_pkt.rdy @= 1

  # DMA write port: idle.
  dut.spm_dma_waddr @= DataAddrType(0)
  dut.spm_dma_wdata @= 0
  dut.spm_dma_wmask @= 0
  dut.spm_dma_wval @= 0

  # DMA read port: idle, but ready to accept a read response.
  dut.spm_dma_raddr @= DataAddrType(0)
  dut.spm_dma_rval @= 0
  dut.spm_dma_rresp_rdy @= 1

  # Identity and address window of this controller.
  dut.cgra_id @= 0
  dut.address_lower @= DataAddrType(0)
  dut.address_upper @= DataAddrType(15)
def test_dma_ports_write_then_read():
  """
  Checks the DMA ports of the DataMemControllerRTL: a word written
  through the DMA write port at address 3 must be returned by a
  subsequent DMA read of address 3.
  """
  # Memory geometry and tile configuration of the DUT.
  data_mem_size_global   = 64
  data_mem_size_per_bank = 16
  num_banks              = 4
  num_rd_tiles           = 2
  num_wr_tiles           = 2
  num_tiles              = 4
  ctrl_mem_size          = 16

  target_addr = 3
  written_word = 0xaaaabbbb

  DataType, DataAddrType, NocPktType = make_types(
      data_mem_size_global, ctrl_mem_size, num_tiles, num_rd_tiles)

  dut = DataMemControllerRTL(NocPktType,
                             data_mem_size_global,
                             data_mem_size_per_bank,
                             num_banks,
                             num_rd_tiles,
                             num_wr_tiles,
                             1,
                             1,
                             num_tiles,
                             True,
                             {0: [0, 0]},
                             has_dma_ports = True)
  dut.apply(DefaultPassGroup())
  dut.sim_reset()
  drive_defaults(dut, DataAddrType, DataType, NocPktType, num_rd_tiles, num_wr_tiles)

  # Phase 1: a single DMA write with all four byte lanes enabled.
  dut.spm_dma_wval @= 1
  dut.spm_dma_waddr @= DataAddrType(target_addr)
  dut.spm_dma_wdata @= written_word
  dut.spm_dma_wmask @= 0xf
  dut.sim_eval_combinational()
  assert dut.spm_dma_wrdy
  dut.sim_tick()
  dut.spm_dma_wval @= 0

  # Phase 2: DMA read of the same address.
  dut.spm_dma_rval @= 1
  dut.spm_dma_raddr @= DataAddrType(target_addr)

  for _ in range(10):
    dut.sim_eval_combinational()
    # Withdraw the request as soon as it has been accepted.
    if dut.spm_dma_rval & dut.spm_dma_rrdy:
      dut.spm_dma_rval @= 0
    if dut.spm_dma_rresp_val:
      assert int(dut.spm_dma_rresp_data) == 0xaaaabbbb
      break
    dut.sim_tick()
  else:
    # No read response was observed within the cycle budget.
    assert False
class DmaEngineRTL( Component ):
  """
  The DmaEngineRTL module is responsible for bulk data movement between an
  external DRAM-like memory and the on-chip Scratchpad Memory (dataSPM).

  It supports two main operations:
  - DMA_MVIN:  DRAM -> DMA Engine -> SPM
  - DMA_MVOUT: SPM -> DMA Engine -> DRAM

  Architectural Design:
  - 1 word = 4 bytes = 32 bits in this system.
  - DRAM is byte-addressed which means each unique address points to a byte(8 bits).
  - SPM is word-addressed which means each unique address points to a word(32 bits).
  - The engine uses a 128-bit interface to external memory (4 words per beat)
    and a 32-bit interface to the dataSPM (1 word per cycle).
  - A finite state machine (FSM) manages the command execution flow, including
    requesting memory, waiting for responses, and performing SPM accesses.
  - MVIN logic: Requests 128-bit beats from DRAM, then unpacks them into four
    sequential 32-bit SPM writes.
  - MVOUT logic: Reads four 32-bit words from SPM, packs them into a 128-bit
    beat, and issues a single write request to DRAM.

  Transfer size:
  - The byte count of a command is floored to whole words (bytes >> 2).
    A command shorter than one word transfers nothing and completes
    immediately with its tag.
  """

  def construct( s,
                 spm_data_nbits  = 32,   # Bitwidth of a single SPM word
                 mem_data_nbits  = 128,  # Bitwidth of an external memory beat
                 dram_addr_nbits = 64,   # Bitwidth of DRAM addresses
                 spm_addr_nbits  = 32,   # Bitwidth of SPM addresses
                 bytes_nbits     = 32,   # Bitwidth for transfer size in bytes
                 tag_nbits       = 8 ):  # Bitwidth for command tracking tags

    # The FSM below always packs/unpacks exactly four SPM words per beat.
    # NOTE: the byte->word conversion (>> 2) and the DRAM write masks
    # (0x000f/0x00ff/0x0fff/0xffff) additionally assume 4-byte SPM words.
    assert mem_data_nbits == spm_data_nbits * 4

    OpcodeType   = mk_bits( 3 )
    DramAddrType = mk_bits( dram_addr_nbits )
    SpmAddrType  = mk_bits( spm_addr_nbits )
    BytesType    = mk_bits( bytes_nbits )
    TagType      = mk_bits( tag_nbits )
    SpmDataType  = mk_bits( spm_data_nbits )
    MemDataType  = mk_bits( mem_data_nbits )
    # Byte masks: one bit per byte of an SPM word / a DRAM beat.
    SpmMaskType  = mk_bits( spm_data_nbits // CHAR_BIT )
    MemMaskType  = mk_bits( mem_data_nbits // CHAR_BIT )

    # Command interface
    s.dma_cmd_val       = InPort()
    s.dma_cmd_rdy       = OutPort()
    s.dma_cmd_opcode    = InPort( OpcodeType )
    s.dma_cmd_dram_addr = InPort( DramAddrType )
    s.dma_cmd_spm_addr  = InPort( SpmAddrType )
    # An input signal that specifies the number of bytes to transfer.
    s.dma_cmd_bytes     = InPort( BytesType )
    s.dma_cmd_tag       = InPort( TagType )

    # Completion interface: reports the tag of the finished command.
    s.dma_done_val = OutPort()
    s.dma_done_rdy = InPort()
    s.dma_done_tag = OutPort( TagType )

    # Abstract external memory interface
    # Request to read from DRAM
    s.dram_rd_req  = SendIfcRTL( DramAddrType )
    # Response from DRAM
    s.dram_rd_resp = RecvIfcRTL( MemDataType )

    # Request to write to DRAM
    s.dram_wr_req_val  = OutPort()
    s.dram_wr_req_rdy  = InPort()
    s.dram_wr_req_addr = OutPort( DramAddrType )
    s.dram_wr_req_data = OutPort( MemDataType )
    s.dram_wr_req_mask = OutPort( MemMaskType )
    s.dram_wr_resp_val = InPort()
    s.dram_wr_resp_rdy = OutPort()

    # SPM interface
    # Request to write to SPM
    s.spm_dma_wval  = OutPort()
    s.spm_dma_wrdy  = InPort()
    s.spm_dma_waddr = OutPort( SpmAddrType )
    s.spm_dma_wdata = OutPort( SpmDataType )
    s.spm_dma_wmask = OutPort( SpmMaskType )

    # Request to read from SPM
    s.spm_dma_rval  = OutPort()
    s.spm_dma_rrdy  = InPort()
    s.spm_dma_raddr = OutPort( SpmAddrType )

    # Response from SPM
    s.spm_dma_rresp_val  = InPort()
    s.spm_dma_rresp_rdy  = OutPort()
    s.spm_dma_rresp_data = InPort( SpmDataType )

    # State machine definitions

    s.state      = Wire( StateType )
    s.state_next = Wire( StateType ) # Declared but not driven or read in this component.

    # Read-side views of the registers. Each one is connected (//=) to the
    # matching *_ff register below and is what the logic reads.
    s.opcode_reg     = Wire( OpcodeType )   # Current operation (MVIN/MVOUT)
    s.dram_addr_reg  = Wire( DramAddrType ) # Current DRAM byte address
    s.spm_addr_reg   = Wire( SpmAddrType )  # Current SPM word address
    s.words_left_reg = Wire( BytesType )    # Number of 32-bit words remaining to transfer
    s.tag_reg        = Wire( TagType )      # Tag of the active command
    s.beat_reg       = Wire( MemDataType )  # Buffer for 128-bit DRAM beat
    s.word_idx_reg   = Wire( Bits2 )        # Index (0-3) of the word within a beat
    s.wr_mask_reg    = Wire( MemMaskType )  # Byte mask for DRAM write

    # Registers; written only in the seq_state update_ff block.
    s.state_ff      = Wire( StateType )
    s.opcode_ff     = Wire( OpcodeType )
    s.dram_addr_ff  = Wire( DramAddrType )
    s.spm_addr_ff   = Wire( SpmAddrType )
    s.words_left_ff = Wire( BytesType )
    s.tag_ff        = Wire( TagType )
    s.beat_ff       = Wire( MemDataType )
    s.word_idx_ff   = Wire( Bits2 )
    s.wr_mask_ff    = Wire( MemMaskType )

    # Connections
    s.state          //= s.state_ff
    s.opcode_reg     //= s.opcode_ff
    s.dram_addr_reg  //= s.dram_addr_ff
    s.spm_addr_reg   //= s.spm_addr_ff
    s.words_left_reg //= s.words_left_ff
    s.tag_reg        //= s.tag_ff
    s.beat_reg       //= s.beat_ff
    s.word_idx_reg   //= s.word_idx_ff
    s.wr_mask_reg    //= s.wr_mask_ff

    @update
    def comb_outputs():
      # Every valid/ready output is a pure decode of the current state.
      s.dma_cmd_rdy  @= s.state == STATE_IDLE
      s.dma_done_val @= s.state == STATE_DONE
      s.dma_done_tag @= s.tag_reg

      s.dram_rd_req.val  @= s.state == STATE_MVIN_REQ
      s.dram_rd_req.msg  @= s.dram_addr_reg
      s.dram_rd_resp.rdy @= s.state == STATE_MVIN_RESP

      s.dram_wr_req_val  @= s.state == STATE_MVOUT_WRITE
      s.dram_wr_req_addr @= s.dram_addr_reg
      s.dram_wr_req_data @= s.beat_reg
      s.dram_wr_req_mask @= s.wr_mask_reg
      s.dram_wr_resp_rdy @= s.state == STATE_MVOUT_WAIT

      s.spm_dma_wval  @= s.state == STATE_MVIN_WRITE
      s.spm_dma_waddr @= s.spm_addr_reg
      s.spm_dma_wmask @= SpmMaskType( (1 << (spm_data_nbits // CHAR_BIT)) - 1 ) # Write mask for SPM write; always be 0b1111

      # Select the word of the buffered beat that is written to SPM next.
      if s.word_idx_reg == b2( 0 ):   # Writes the first word of the beat to SPM
        s.spm_dma_wdata @= s.beat_reg[0:spm_data_nbits]
      elif s.word_idx_reg == b2( 1 ): # Writes the second word of the beat to SPM
        s.spm_dma_wdata @= s.beat_reg[spm_data_nbits:spm_data_nbits*2]
      elif s.word_idx_reg == b2( 2 ): # 3rd word
        s.spm_dma_wdata @= s.beat_reg[spm_data_nbits*2:spm_data_nbits*3]
      else:                           # 4th word
        s.spm_dma_wdata @= s.beat_reg[spm_data_nbits*3:spm_data_nbits*4]

      s.spm_dma_rval      @= s.state == STATE_MVOUT_READ
      s.spm_dma_raddr     @= s.spm_addr_reg
      s.spm_dma_rresp_rdy @= s.state == STATE_MVOUT_RESP

    @update_ff
    def seq_state():
      if s.reset:
        s.state_ff      <<= STATE_IDLE
        s.opcode_ff     <<= OpcodeType( 0 )
        s.dram_addr_ff  <<= DramAddrType( 0 )
        s.spm_addr_ff   <<= SpmAddrType( 0 )
        s.words_left_ff <<= BytesType( 0 )
        s.tag_ff        <<= TagType( 0 )
        s.beat_ff       <<= MemDataType( 0 )
        s.word_idx_ff   <<= b2( 0 )
        s.wr_mask_ff    <<= MemMaskType( 0 )
      else:
        if s.state == STATE_IDLE:
          if s.dma_cmd_val & s.dma_cmd_rdy: # Receives a new DMA command.
            s.opcode_ff     <<= s.dma_cmd_opcode
            s.dram_addr_ff  <<= s.dma_cmd_dram_addr
            s.spm_addr_ff   <<= s.dma_cmd_spm_addr
            s.words_left_ff <<= s.dma_cmd_bytes >> 2 # Converts the transfer size from bytes to words.
            s.tag_ff        <<= s.dma_cmd_tag
            s.beat_ff       <<= MemDataType( 0 )
            s.word_idx_ff   <<= b2( 0 )
            s.wr_mask_ff    <<= MemMaskType( 0 )

            # The size is floored to whole words, so anything shorter than
            # one word (0-3 bytes) transfers nothing. Such a command must
            # complete right away: entering a transfer state with
            # words_left == 0 would wrap the counter on its first decrement
            # and the "== 1" termination check would not fire.
            if s.dma_cmd_bytes < BytesType( 4 ):
              s.state_ff <<= STATE_DONE
            # Still has at least one word to transfer.
            elif s.dma_cmd_opcode == OpcodeType( DMA_MVIN ):
              s.state_ff <<= STATE_MVIN_REQ   # Move to the next state: to issue a read request to DRAM.
            else: # DMA_MVOUT
              s.state_ff <<= STATE_MVOUT_READ # Move to the next state: to issue a read request to SPM.

        elif s.state == STATE_MVIN_REQ: # Issues a read request to DRAM.
          if s.dram_rd_req.val & s.dram_rd_req.rdy:
            # Advance to the next beat (byte address) for a later request.
            s.dram_addr_ff <<= s.dram_addr_reg + DramAddrType( mem_data_nbits // CHAR_BIT )
            s.state_ff     <<= STATE_MVIN_RESP

        elif s.state == STATE_MVIN_RESP: # Receives a response from DRAM.
          if s.dram_rd_resp.val & s.dram_rd_resp.rdy:
            s.beat_ff     <<= s.dram_rd_resp.msg
            s.word_idx_ff <<= b2( 0 )
            s.state_ff    <<= STATE_MVIN_WRITE # Move to the next state: to write to SPM.

        elif s.state == STATE_MVIN_WRITE: # Writes to SPM.
          if s.spm_dma_wval & s.spm_dma_wrdy:
            # Update the SPM address where write next cycle(+1)
            s.spm_addr_ff   <<= s.spm_addr_reg + SpmAddrType( 1 )
            # Update the number of words remaining to write to SPM.
            s.words_left_ff <<= s.words_left_reg - BytesType( 1 )

            if s.words_left_reg == BytesType( 1 ):
              # The word just written was the last one.
              s.state_ff <<= STATE_DONE
            elif s.word_idx_reg == b2( 3 ):
              # Beat exhausted: fetch the next one from DRAM.
              s.word_idx_ff <<= b2( 0 )
              s.state_ff    <<= STATE_MVIN_REQ
            else:
              s.word_idx_ff <<= s.word_idx_reg + b2( 1 )

        elif s.state == STATE_MVOUT_READ: # Issues a read request to SPM.
          if s.spm_dma_rval & s.spm_dma_rrdy:
            s.state_ff <<= STATE_MVOUT_RESP # Move to the next state: to receive a response from SPM.

        elif s.state == STATE_MVOUT_RESP: # Receives a response from SPM.
          if s.spm_dma_rresp_val & s.spm_dma_rresp_rdy:
            # Insert the SPM word into slot word_idx of the 128-bit beat
            # (slot 0 is the least significant word); the other slots keep
            # their current contents.
            if s.word_idx_reg == b2( 0 ): # 1st word
              s.beat_ff <<= concat( s.beat_reg[spm_data_nbits:spm_data_nbits*4],
                                    s.spm_dma_rresp_data )
            elif s.word_idx_reg == b2( 1 ):
              s.beat_ff <<= concat( s.beat_reg[spm_data_nbits*2:spm_data_nbits*4],
                                    s.spm_dma_rresp_data,
                                    s.beat_reg[0:spm_data_nbits] )
            elif s.word_idx_reg == b2( 2 ):
              s.beat_ff <<= concat( s.beat_reg[spm_data_nbits*3:spm_data_nbits*4],
                                    s.spm_dma_rresp_data,
                                    s.beat_reg[0:spm_data_nbits*2] )
            else:
              s.beat_ff <<= concat( s.spm_dma_rresp_data,
                                    s.beat_reg[0:spm_data_nbits*3] )

            s.spm_addr_ff   <<= s.spm_addr_reg + SpmAddrType( 1 )
            s.words_left_ff <<= s.words_left_reg - BytesType( 1 )

            if s.words_left_reg == BytesType( 1 ):
              # Last word of the transfer: write a (possibly partial) beat
              # whose mask enables only the bytes of the collected words.
              if s.word_idx_reg == b2( 0 ):
                s.wr_mask_ff <<= MemMaskType( 0x000f )
              elif s.word_idx_reg == b2( 1 ):
                s.wr_mask_ff <<= MemMaskType( 0x00ff )
              elif s.word_idx_reg == b2( 2 ):
                s.wr_mask_ff <<= MemMaskType( 0x0fff )
              else:
                s.wr_mask_ff <<= MemMaskType( 0xffff )
              s.state_ff <<= STATE_MVOUT_WRITE
            elif s.word_idx_reg == b2( 3 ):
              # Beat is full: write all 16 bytes.
              s.wr_mask_ff <<= MemMaskType( 0xffff )
              s.state_ff   <<= STATE_MVOUT_WRITE
            else:
              s.word_idx_ff <<= s.word_idx_reg + b2( 1 )
              s.state_ff    <<= STATE_MVOUT_READ

        elif s.state == STATE_MVOUT_WRITE: # Issues a write request to DRAM.
          if s.dram_wr_req_val & s.dram_wr_req_rdy:
            s.state_ff <<= STATE_MVOUT_WAIT

        elif s.state == STATE_MVOUT_WAIT: # Waits for the DRAM write response.
          if s.dram_wr_resp_val & s.dram_wr_resp_rdy:
            # Turn to the +16 address after writing 16 bytes data.
            s.dram_addr_ff <<= s.dram_addr_reg + DramAddrType( mem_data_nbits // CHAR_BIT )
            s.beat_ff      <<= MemDataType( 0 )
            s.word_idx_ff  <<= b2( 0 )
            s.wr_mask_ff   <<= MemMaskType( 0 )

            if s.words_left_reg == BytesType( 0 ):
              s.state_ff <<= STATE_DONE
            else:
              s.state_ff <<= STATE_MVOUT_READ

        elif s.state == STATE_DONE: # Holds the done signal until it is accepted.
          if s.dma_done_val & s.dma_done_rdy:
            s.state_ff <<= STATE_IDLE

  def line_trace( s ):
    # Compact trace: FSM state, active tag, and remaining word count.
    return f"dma(state={int(s.state)},tag={int(s.tag_reg)},left={int(s.words_left_reg)})"
def issue_cmd(dut, opcode, dram_addr, spm_addr, nbytes, tag):
  """
  Presents one DMA command on the command interface and clocks it in.

  The command fields are driven together with dma_cmd_val, the DUT is
  required to be ready in that same cycle, and the valid signal is
  de-asserted again after one clock tick.

  Args:
    dut: The DUT instance.
    opcode: Operation to perform, DMA_MVIN or DMA_MVOUT.
    dram_addr: DRAM address of the transfer.
    spm_addr: SPM address of the transfer.
    nbytes: Number of bytes to transfer.
    tag: Tag that identifies the command.
  """
  # Command payload.
  dut.dma_cmd_tag @= tag
  dut.dma_cmd_bytes @= nbytes
  dut.dma_cmd_spm_addr @= spm_addr
  dut.dma_cmd_dram_addr @= dram_addr
  dut.dma_cmd_opcode @= opcode
  dut.dma_cmd_val @= 1

  # The engine must accept the command in this very cycle.
  dut.sim_eval_combinational()
  assert dut.dma_cmd_rdy

  dut.sim_tick()
  dut.dma_cmd_val @= 0
def test_dma_mvin_one_beat():
  """
  Tests the DMA_MVIN operation.

  A 32-byte transfer is issued, i.e. two 128-bit DRAM beats (the test
  name is historical). Each beat must be unpacked, lowest word first,
  into four SPM writes at consecutive word addresses, and the engine
  must then report completion with the command's tag.
  """
  dut = make_dut()
  issue_cmd(dut, DMA_MVIN,
            0x1000, # dram_addr
            4,      # spm_addr
            32,     # nbytes(number of bytes to transfer)
            0x5a)   # tag

  dram = {
    0x1000: concat(Bits32(0x44444444), Bits32(0x33333333),
                   Bits32(0x22222222), Bits32(0x11111111)), # 4 x 4 bytes = 16 bytes in total.

    # Address bias: +16, since DRAM is byte-addressed(each address points to a byte).
    0x1010: concat(Bits32(0x88888888), Bits32(0x77777777),
                   Bits32(0x66666666), Bits32(0x55555555)),
  }
  # Simple DRAM model: a request accepted in one cycle is answered in the
  # next one.
  pending_resp = None
  spm_writes = []

  for _ in range(20):
    dut.dram_rd_resp.val @= 0
    if pending_resp is not None:
      dut.dram_rd_resp.val @= 1
      dut.dram_rd_resp.msg @= pending_resp

    dut.sim_eval_combinational()

    if dut.dram_rd_req.val & dut.dram_rd_req.rdy:
      pending_resp = dram[int(dut.dram_rd_req.msg)]
    else:
      pending_resp = None

    if dut.spm_dma_wval & dut.spm_dma_wrdy:
      spm_writes.append((int(dut.spm_dma_waddr), int(dut.spm_dma_wdata)))

    if dut.dma_done_val:
      assert int(dut.dma_done_tag) == 0x5a
      break

    dut.sim_tick()
  else:
    # Without this, a hung engine would silently skip the tag check.
    raise AssertionError("DMA_MVIN did not signal completion within 20 cycles")

  assert spm_writes == [
    (4, 0x11111111),
    (5, 0x22222222),
    (6, 0x33333333),
    (7, 0x44444444),

    (8, 0x55555555),
    (9, 0x66666666),
    (10, 0x77777777),
    (11, 0x88888888),
  ]
def test_dma_mvout_partial_beat():
  """
  Tests a partial beat MVOUT operation (12 bytes / 3 words).

  The DMA should read three words from SPM, pack them into the three
  lowest words of a 128-bit beat, write it to DRAM with a byte mask that
  enables only those 12 bytes, and then report completion with the
  command's tag.
  """
  dut = make_dut()
  issue_cmd(dut, DMA_MVOUT,
            0x2000, # dram_addr
            8,      # spm_addr
            12,     # nbytes(number of bytes to transfer)
            0xa5)   # tag

  spm = {
    8:  0xaaaabbbb,
    9:  0xccccdddd,
    10: 0xeeeeffff,
  }
  # Simple SPM model: a read accepted in one cycle is answered in the
  # next one.
  pending_rresp = None
  mem_writes = []

  for _ in range(30):
    dut.spm_dma_rresp_val @= 0
    if pending_rresp is not None:
      dut.spm_dma_rresp_val @= 1
      dut.spm_dma_rresp_data @= pending_rresp

    dut.sim_eval_combinational()

    if dut.spm_dma_rval & dut.spm_dma_rrdy:
      pending_rresp = spm[int(dut.spm_dma_raddr)]
    else:
      pending_rresp = None

    if dut.dram_wr_req_val & dut.dram_wr_req_rdy:
      mem_writes.append((int(dut.dram_wr_req_addr),
                         int(dut.dram_wr_req_data),
                         int(dut.dram_wr_req_mask)))

    if dut.dma_done_val:
      assert int(dut.dma_done_tag) == 0xa5
      break

    dut.sim_tick()
  else:
    # The DRAM write alone is not enough: the engine must also finish.
    raise AssertionError("DMA_MVOUT did not signal completion within 30 cycles")

  assert mem_writes == [
    (0x2000,
     int(concat(Bits32(0), Bits32(0xeeeeffff),
                Bits32(0xccccdddd), Bits32(0xaaaabbbb))),
     0x0fff), # mask
  ]
def test_dma_mvout_full_beat():
  """
  Tests an MVOUT operation made of full beats (32 bytes / 8 words).

  The DMA should read eight words from SPM, pack them into two 128-bit
  beats, write them to consecutive 16-byte DRAM addresses with all mask
  bits set, and then report completion with the command's tag.
  """
  dut = make_dut()
  issue_cmd(dut, DMA_MVOUT,
            0x2000, # dram_addr
            8,      # spm_addr
            32,     # nbytes(number of bytes to transfer)
            0xa5)   # tag

  spm = {
    8 : 0x11112222,
    9 : 0x33334444,
    10: 0x55556666,
    11: 0x77778888,
    12: 0x9999aaaa,
    13: 0xbbbbcccc,
    14: 0xddddeeee,
    15: 0xffff0000,
  }
  # Simple SPM model: a read accepted in one cycle is answered in the
  # next one.
  pending_rresp = None
  mem_writes = []

  for _ in range(30):
    dut.spm_dma_rresp_val @= 0
    if pending_rresp is not None:
      dut.spm_dma_rresp_val @= 1
      dut.spm_dma_rresp_data @= pending_rresp

    dut.sim_eval_combinational()

    if dut.spm_dma_rval & dut.spm_dma_rrdy:
      pending_rresp = spm[int(dut.spm_dma_raddr)]
    else:
      pending_rresp = None

    if dut.dram_wr_req_val & dut.dram_wr_req_rdy:
      mem_writes.append((int(dut.dram_wr_req_addr),
                         int(dut.dram_wr_req_data),
                         int(dut.dram_wr_req_mask)))

    if dut.dma_done_val:
      assert int(dut.dma_done_tag) == 0xa5
      break

    dut.sim_tick()
  else:
    # The DRAM writes alone are not enough: the engine must also finish.
    raise AssertionError("DMA_MVOUT did not signal completion within 30 cycles")

  assert mem_writes == [
    (0x2000,
     int(concat(Bits32(0x77778888), Bits32(0x55556666),
                Bits32(0x33334444), Bits32(0x11112222))),
     0xffff), # mask

    (0x2010,
     int(concat(Bits32(0xffff0000), Bits32(0xddddeeee),
                Bits32(0xbbbbcccc), Bits32(0x9999aaaa))),
     0xffff),
  ]