diff --git a/fu/flexible/FlexibleFuRTL.py b/fu/flexible/FlexibleFuRTL.py index c4c991de..fbe2eff7 100644 --- a/fu/flexible/FlexibleFuRTL.py +++ b/fu/flexible/FlexibleFuRTL.py @@ -116,10 +116,6 @@ def comb_logic(): # opt connection. s.fu[i].recv_opt.msg @= s.recv_opt.msg - # Sets each FU's op code as NAH when prologue execution is not completed. - # As they are supposed to do nothing during that prologue cycles. - if s.prologue_count_inport != 0: - s.fu[i].recv_opt.msg.operation @= OPT_NAH s.fu[i].recv_opt.val @= s.recv_opt.val s.fu_recv_opt_rdy_vector[i] @= s.fu[i].recv_opt.rdy diff --git a/mem/ctrl/CtrlMemDynamicRTL.py b/mem/ctrl/CtrlMemDynamicRTL.py index 67f96ded..37f48a63 100644 --- a/mem/ctrl/CtrlMemDynamicRTL.py +++ b/mem/ctrl/CtrlMemDynamicRTL.py @@ -88,7 +88,6 @@ def construct(s, IntraCgraPktType, [[Wire(PrologueCountType) for _ in range(num_routing_xbar_inports)] for _ in range(ctrl_mem_size)] # Connections. - s.send_ctrl.msg //= s.reg_file.rdata[0] s.recv_pkt_from_controller //= s.recv_pkt_from_controller_queue.recv s.recv_from_element //= s.recv_from_element_queue.recv @@ -193,7 +192,7 @@ def update_send_pkt_to_controller(): s.send_pkt_to_controller.val @= 1 @update - def update_send_ctrl(): + def update_send_ctrl_val(): s.send_ctrl.val @= 0 if s.start_iterate_ctrl == b1(1): if s.sent_complete: @@ -207,6 +206,26 @@ def update_send_ctrl(): (s.recv_pkt_from_controller_queue.send.msg.payload.cmd == CMD_TERMINATE): s.send_ctrl.val @= b1(0) + @update + def update_send_ctrl_msg(): + for i in range(num_fu_inports): + s.send_ctrl.msg.fu_in[i] @= s.reg_file.rdata[0].fu_in[i] + s.send_ctrl.msg.write_reg_from[i] @= s.reg_file.rdata[0].write_reg_from[i] + s.send_ctrl.msg.write_reg_idx[i] @= s.reg_file.rdata[0].write_reg_idx[i] + s.send_ctrl.msg.read_reg_towards[i] @= s.reg_file.rdata[0].read_reg_towards[i] + s.send_ctrl.msg.read_reg_idx[i] @= s.reg_file.rdata[0].read_reg_idx[i] + for i in range(num_routing_outports): + s.send_ctrl.msg.routing_xbar_outport[i] @= 
s.reg_file.rdata[0].routing_xbar_outport[i] + s.send_ctrl.msg.fu_xbar_outport[i] @= s.reg_file.rdata[0].fu_xbar_outport[i] + s.send_ctrl.msg.vector_factor_power @= s.reg_file.rdata[0].vector_factor_power + s.send_ctrl.msg.is_last_ctrl @= s.reg_file.rdata[0].is_last_ctrl + # Sets each FU's op code as NAH when prologue execution has not completed. + # As FU is supposed to do nothing during prologue. + if s.prologue_count_outport_fu != 0: + s.send_ctrl.msg.operation @= OPT_NAH + else: + s.send_ctrl.msg.operation @= s.reg_file.rdata[0].operation + @update_ff def update_whether_we_can_iterate_ctrl(): if s.reset: diff --git a/noc/CrossbarRTL.py b/noc/CrossbarRTL.py index 3bbb3b58..3a6fc145 100644 --- a/noc/CrossbarRTL.py +++ b/noc/CrossbarRTL.py @@ -89,8 +89,8 @@ def construct(s, # Prologue-related wires and registers, which are used to indicate # whether the prologue steps have already been satisfied. - s.prologue_allowing_vector = Wire(num_outports) - s.recv_valid_or_prologue_allowing_vector = Wire(num_outports) + s.during_prologue_allowing_vector = Wire(num_outports) + s.recv_valid_or_during_prologue_allowing_vector = Wire(num_outports) s.prologue_counter = [[Wire(PrologueCountType) for _ in range(num_inports)] for _ in range(ctrl_mem_size)] s.prologue_counter_next = [[Wire(PrologueCountType) for _ in range(num_inports)] for _ in range(ctrl_mem_size)] s.prologue_count_inport = [[InPort(PrologueCountType) for _ in range(num_inports)] for _ in range(ctrl_mem_size)] @@ -144,7 +144,7 @@ def update_signal(): s.send_data[i].msg.predicate @= s.recv_data_msg[s.in_dir_local[i]].predicate s.recv_opt.rdy @= s.all_send_accepted & \ - reduce_and(s.recv_valid_or_prologue_allowing_vector) + reduce_and(s.recv_valid_or_during_prologue_allowing_vector) @update_ff def update_prologue_counter(): @@ -199,22 +199,22 @@ def update_send_accepted_next(): @update def update_prologue_allowing_vector(): - s.prologue_allowing_vector @= 0 + s.during_prologue_allowing_vector @= 0 for i in 
range(num_outports): if s.in_dir[i] > 0: # Records whether the prologue steps have already been satisfied. - s.prologue_allowing_vector[i] @= \ + s.during_prologue_allowing_vector[i] @= \ (s.prologue_counter[s.ctrl_addr_inport][s.in_dir_local[i]] < \ s.prologue_count_wire[s.ctrl_addr_inport][s.in_dir_local[i]]) else: - s.prologue_allowing_vector[i] @= 1 + s.during_prologue_allowing_vector[i] @= 0 @update def update_prologue_or_valid_vector(): - s.recv_valid_or_prologue_allowing_vector @= 0 + s.recv_valid_or_during_prologue_allowing_vector @= 0 for i in range(num_outports): - s.recv_valid_or_prologue_allowing_vector[i] @= \ - s.recv_valid_vector[i] | s.prologue_allowing_vector[i] + s.recv_valid_or_during_prologue_allowing_vector[i] @= \ + s.recv_valid_vector[i] | s.during_prologue_allowing_vector[i] @update def update_in_dir_vector(): @@ -232,9 +232,9 @@ def update_in_dir_vector(): def update_rdy_vector(): s.send_rdy_vector @= 0 for i in range(num_outports): - # The `num_inports` indicates the number of outports that go to other tiles. + # The `outport_towards_local_base_id` indicates the number of outports that go to other tiles. # Specifically, if the compute already done, we shouldn't care the ones - # (i.e., i >= num_inports) go to the FU's inports. In other words, we skip + # (i.e., i >= outport_towards_local_base_id) go to the FU's inports. In other words, we skip # the rdy checking on the FU's inports (connecting from crossbar_outport) if # the compute is already completed. if (s.in_dir[i] > 0) & \ @@ -259,7 +259,8 @@ def update_recv_required_vector(): for i in range(num_outports): if s.in_dir[i] > 0: - s.recv_required_vector[s.in_dir_local[i]] @= 1 + # Prevents the crossbar from mistakenly consuming data during prologue. 
+ s.recv_required_vector[s.in_dir_local[i]] @= ~s.during_prologue_allowing_vector[i] @update def update_send_required_vector(): diff --git a/tile/test/TileRTL_test.py b/tile/test/TileRTL_test.py index b8193354..d91154cf 100644 --- a/tile/test/TileRTL_test.py +++ b/tile/test/TileRTL_test.py @@ -35,6 +35,7 @@ from ...lib.cmd_type import * from ...lib.messages import * from ...lib.opt_type import * +from ...lib.util.common import * #------------------------------------------------------------------------- # Test harness @@ -47,7 +48,7 @@ def construct(s, DUT, FunctionUnit, FuList, ctrl_mem_size, data_mem_size, num_fu_inports, num_fu_outports, num_tile_inports, num_tile_outports, num_registers_per_reg_bank, src_data, - src_ctrl_pkt, sink_out, num_tiles, complete_signal_sink_out): + src_ctrl_pkt, sink_out, num_tiles, complete_signal_sink_out, num_ctrl, total_steps): CgraPayloadType = IntraCgraPktType.get_field_type(kAttrPayload) DataType = CgraPayloadType.get_field_type(kAttrData) @@ -61,7 +62,7 @@ def construct(s, DUT, FunctionUnit, FuList, for i in range(num_tile_outports)] s.complete_signal_sink_out = TestSinkRTL(IntraCgraPktType, complete_signal_sink_out) - s.dut = DUT(IntraCgraPktType, ctrl_mem_size, data_mem_size, 3, 2, # 2 opts + s.dut = DUT(IntraCgraPktType, ctrl_mem_size, data_mem_size, num_ctrl, total_steps, num_fu_inports, num_fu_outports, num_tile_inports, num_tile_outports, 1, num_tiles, num_registers_per_reg_bank, @@ -212,7 +213,8 @@ def test_tile_alu(cmdline_opts): data_mem_size_global, num_fu_inports, num_fu_outports, num_tile_inports, num_tile_outports, num_registers_per_reg_bank, src_data, - src_ctrl_pkt, sink_out, num_tiles, complete_signal_sink_out) + src_ctrl_pkt, sink_out, num_tiles, complete_signal_sink_out, + num_ctrl = 2, total_steps = 2) th.elaborate() th.dut.set_metadata(VerilogVerilatorImportPass.vl_Wno_list, ['UNSIGNED', 'UNOPTFLAT', 'WIDTH', 'WIDTHCONCAT', @@ -327,7 +329,8 @@ def test_tile_multicycle_exclusive(cmdline_opts): 
data_mem_size_global, num_fu_inports, num_fu_outports, num_tile_inports, num_tile_outports, num_registers_per_reg_bank, src_data, - src_ctrl_pkt, sink_out, num_tiles, complete_signal_sink_out) + src_ctrl_pkt, sink_out, num_tiles, complete_signal_sink_out, + num_ctrl = 2, total_steps = 2) th.elaborate() th.dut.set_metadata(VerilogVerilatorImportPass.vl_Wno_list, ['UNSIGNED', 'UNOPTFLAT', 'WIDTH', 'WIDTHCONCAT', @@ -440,7 +443,8 @@ def test_tile_multicycle_inclusive(cmdline_opts): data_mem_size_global, num_fu_inports, num_fu_outports, num_tile_inports, num_tile_outports, num_registers_per_reg_bank, src_data, - src_ctrl_pkt, sink_out, num_tiles, complete_signal_sink_out) + src_ctrl_pkt, sink_out, num_tiles, complete_signal_sink_out, + num_ctrl = 2, total_steps = 2) th.elaborate() th.dut.set_metadata(VerilogVerilatorImportPass.vl_Wno_list, ['UNSIGNED', 'UNOPTFLAT', 'WIDTH', 'WIDTHCONCAT', @@ -552,10 +556,180 @@ def test_readReg_routing_priority(cmdline_opts): data_mem_size_global, num_fu_inports, num_fu_outports, num_tile_inports, num_tile_outports, num_registers_per_reg_bank, src_data, - src_ctrl_pkt, sink_out, num_tiles, complete_signal_sink_out) + src_ctrl_pkt, sink_out, num_tiles, complete_signal_sink_out, + num_ctrl = 2, total_steps = 2) th.elaborate() th.dut.set_metadata(VerilogVerilatorImportPass.vl_Wno_list, ['UNSIGNED', 'UNOPTFLAT', 'WIDTH', 'WIDTHCONCAT', 'ALWCOMBORDER']) th = config_model_with_cmdline_opts(th, cmdline_opts, duts = ['dut']) run_sim(th) + +def test_prologue_nah(cmdline_opts): + num_tile_inports = 4 + num_tile_outports = 4 + num_fu_inports = 4 + num_fu_outports = 2 + num_routing_outports = num_fu_inports + num_tile_outports + ctrl_mem_size = 8 + data_mem_size_global = 16 + num_cgra_rows = 1 + num_cgra_columns = 1 + num_cgras = num_cgra_rows * num_cgra_columns + num_tiles = 4 + num_commands = NUM_CMDS + num_ctrl_operations = NUM_OPTS + num_registers_per_reg_bank = 16 + TileInType = mk_bits(clog2(num_tile_inports + num_fu_inports + 1)) + FuInType 
= mk_bits(clog2(num_fu_inports + 1)) + FuOutType = mk_bits(clog2(num_fu_outports + 1)) + pick_register0 = [FuInType(0) for x in range(num_fu_inports)] + pick_register1 = [FuInType(1), FuInType(2), FuInType(3), FuInType(4)] + DUT = TileRTL + FunctionUnit = FlexibleFuRTL + # FuList = [AdderRTL, MulRTL, MemUnitRTL] + FuList = [GrantRTL] + # 64-bit to satisfy the default bitwidth of vector FUs. + data_nbits = 64 + DataType = mk_data(data_nbits, 1) + PredicateType = mk_predicate(1, 1) + cgra_id_nbits = 1 + addr_nbits = clog2(data_mem_size_global) + predicate_nbits = 1 + + CtrlType = mk_ctrl(num_fu_inports, + num_fu_outports, + num_tile_inports, + num_tile_outports, + num_registers_per_reg_bank) + + CtrlAddrType = mk_bits(clog2(ctrl_mem_size)) + DataAddrType = mk_bits(addr_nbits) + + CgraPayloadType = mk_cgra_payload(DataType, + DataAddrType, + CtrlType, + CtrlAddrType) + + IntraCgraPktType = mk_intra_cgra_pkt(num_cgra_columns, + num_cgra_rows, + num_tiles, + CgraPayloadType) + + src_ctrl_pkt = [ + # NAH without routing, done in 1 cycle. + IntraCgraPktType(0, 0, 0, 0, 0, 0, 0, 0, + payload = CgraPayloadType(CMD_CONFIG, ctrl_addr = 0, + ctrl = CtrlType(OPT_NAH, + pick_register1, + [TileInType(0), TileInType(0), TileInType(0), TileInType(0), + TileInType(0), TileInType(0), TileInType(0), TileInType(0)], + [FuOutType(0), FuOutType(0), FuOutType(0), FuOutType(0), + FuOutType(0), FuOutType(0), FuOutType(0), FuOutType(0)]))), + + # NAH with routing, done in 1 cycle after PR#282. 
+ IntraCgraPktType(0, 0, 0, 0, 0, 0, 0, 0, + payload = CgraPayloadType(CMD_CONFIG, ctrl_addr = 1, + ctrl = CtrlType(OPT_NAH, + pick_register1, + [TileInType(PORT_NORTH), TileInType(0), TileInType(0), TileInType(0), + TileInType(0), TileInType(0), TileInType(0), TileInType(0)], + [FuOutType(0), FuOutType(0), FuOutType(0), FuOutType(0), + FuOutType(0), FuOutType(0), FuOutType(0), FuOutType(0)]))), + + # Any operation, e.g., OPT_GRT_PRED, will be replaced with NAH during fu prologue, done in 1 cycle after PR#282. + # Scenario 1: Both routing and fu have a prologue; OPT_GRT_PRED takes inputs from the routing crossbar and directly executes in the current iteration. + IntraCgraPktType(0, 0, 0, 0, 0, 0, 0, 0, + payload = CgraPayloadType(CMD_CONFIG, ctrl_addr = 2, + ctrl = CtrlType(OPT_GRT_PRED, + pick_register1, + [TileInType(0), TileInType(0), TileInType(0), TileInType(0), + TileInType(PORT_SOUTH), TileInType(PORT_WEST), TileInType(0), TileInType(0)], + [FuOutType(0), FuOutType(1), FuOutType(0), FuOutType(0), + FuOutType(0), FuOutType(0), FuOutType(0), FuOutType(0)]))), + + # Prologue settings for Scenario 1. 
+ IntraCgraPktType(0, 0, + payload = CgraPayloadType(CMD_CONFIG_PROLOGUE_FU, ctrl_addr = 2, + data = DataType(1, 1))), + IntraCgraPktType(0, 0, + payload = CgraPayloadType(CMD_CONFIG_PROLOGUE_ROUTING_CROSSBAR, ctrl_addr = 2, + ctrl = CtrlType(routing_xbar_outport = [ + TileInType(PORT_SOUTH), TileInType(0), TileInType(0), TileInType(0), + TileInType(0), TileInType(0), TileInType(0), TileInType(0)]), + data = DataType(1, 1))), + IntraCgraPktType(0, 0, + payload = CgraPayloadType(CMD_CONFIG_PROLOGUE_ROUTING_CROSSBAR, ctrl_addr = 2, + ctrl = CtrlType(routing_xbar_outport = [ + TileInType(PORT_WEST), TileInType(0), TileInType(0), TileInType(0), + TileInType(0), TileInType(0), TileInType(0), TileInType(0)]), + data = DataType(1, 1))), + IntraCgraPktType(0, 0, + payload = CgraPayloadType(CMD_CONFIG_PROLOGUE_FU_CROSSBAR, ctrl_addr = 2, + ctrl = CtrlType(fu_xbar_outport = [ + FuOutType(0), FuOutType(0), FuOutType(0), FuOutType(0), + FuOutType(0), FuOutType(0), FuOutType(0), FuOutType(0)]), + data = DataType(1, 1))), + + # Scenario 2: Only the fu has a prologue, so the routing value must be written to the register cluster in the current iteration, + # so that OPT_GRT_PRED can read its inputs from the register cluster and execute correctly in its first iteration. + IntraCgraPktType(0, 0, 0, 0, 0, 0, 0, 0, + payload = CgraPayloadType(CMD_CONFIG, ctrl_addr = 3, + ctrl = CtrlType(OPT_GRT_PRED, + pick_register1, + [TileInType(0), TileInType(0), TileInType(0), TileInType(0), + TileInType(PORT_SOUTH), TileInType(PORT_WEST), TileInType(0), TileInType(0)], + [FuOutType(0), FuOutType(1), FuOutType(0), FuOutType(0), + FuOutType(0), FuOutType(0), FuOutType(0), FuOutType(0)], + write_reg_from = [b2(1), b2(1), b2(0), b2(0)], + read_reg_towards = [b2(1), b2(1), b2(0), b2(0)]))), + + # Prologue settings for Scenario 2. 
+ IntraCgraPktType(0, 0, + payload = CgraPayloadType(CMD_CONFIG_PROLOGUE_FU, ctrl_addr = 3, + data = DataType(1, 1))), + IntraCgraPktType(0, 0, + payload = CgraPayloadType(CMD_CONFIG_PROLOGUE_FU_CROSSBAR, ctrl_addr = 3, + ctrl = CtrlType(fu_xbar_outport = [ + FuOutType(0), FuOutType(0), FuOutType(0), FuOutType(0), + FuOutType(0), FuOutType(0), FuOutType(0), FuOutType(0)]), + data = DataType(1, 1))), + + # Launches the tile. + IntraCgraPktType(0, 0, 0, 0, 0, 0, 0, 0, payload = CgraPayloadType(CMD_LAUNCH))] + + # Input for NAH (ctrl_addr = 1) in the 1st iter, input for NAH (ctrl_addr = 1) in the 2nd iter. + src_data = [[DataType(3, 1), DataType(3, 1)], + # Input for OPT_GRT_PRED (ctrl_addr = 3) in the 1st iter, input for OPT_GRT_PRED (ctrl_addr = 2) in the 2nd iter, unused input for OPT_GRT_PRED (ctrl_addr = 3) in the 2nd iter (For CMD_COMPLETE). + [DataType(4, 1), DataType(5, 1), DataType(6, 1)], + [DataType(0, 1), DataType(0, 1), DataType(0, 1)], + []] + + sink_out = [ + # Output for NAH (ctrl_addr = 1) in the 1st iter, output for NAH (ctrl_addr = 1) in the 2nd iter. + [DataType(3, 1), DataType(3, 1)], + # OPT_GRT_PRED (ctrl_addr = 2) does not take any inputs in the 1st iter because of prologue for both routing and fu. + # OPT_GRT_PRED (ctrl_addr = 3) can write inputs DataType(4, 1) and DataType(0, 1) into registers because of prologue for only fu. + # Output for OPT_GRT_PRED (ctrl_addr = 2) in the 2nd iter, output for OPT_GRT_PRED (ctrl_addr = 3) in the 2nd iter. 
+ [DataType(5, 0), DataType(4, 0)], + [], + []] + # src dst src/dst cgra x/y + complete_signal_sink_out = [IntraCgraPktType(0, num_tiles, 0, 0, 0, 0, 0, 0, payload = CgraPayloadType(CMD_COMPLETE))] +# IntraCgraPktType(0, 0, num_tiles, 0, 0, ctrl_action = CMD_COMPLETE)] + + th = TestHarness(DUT, FunctionUnit, FuList, + IntraCgraPktType, + ctrl_mem_size, + data_mem_size_global, num_fu_inports, num_fu_outports, + num_tile_inports, num_tile_outports, + num_registers_per_reg_bank, src_data, + src_ctrl_pkt, sink_out, num_tiles, complete_signal_sink_out, + num_ctrl = 4, total_steps = 8) + th.elaborate() + th.dut.set_metadata(VerilogVerilatorImportPass.vl_Wno_list, + ['UNSIGNED', 'UNOPTFLAT', 'WIDTH', 'WIDTHCONCAT', + 'ALWCOMBORDER']) + th = config_model_with_cmdline_opts(th, cmdline_opts, duts = ['dut']) + run_sim(th) +