From c60a7b270de6db8f9c6c4d90959bc6f9ce0aad7a Mon Sep 17 00:00:00 2001 From: Navaneeth-Kunhi Purayil Date: Thu, 4 Dec 2025 14:11:30 +0100 Subject: [PATCH 1/2] hw: add 2x VLSU bandwidth support for Spatz --- Bender.yml | 1 + docs/schema/spatz_cluster.schema.json | 5 + hw/ip/spatz/src/generated/spatz_pkg.sv | 14 +- hw/ip/spatz/src/spatz.sv | 128 +- hw/ip/spatz/src/spatz_controller.sv | 124 +- hw/ip/spatz/src/spatz_doublebw_vlsu.sv | 1134 +++++++++++++++++ hw/ip/spatz/src/spatz_pkg.sv.tpl | 37 +- hw/ip/spatz/src/spatz_vrf.sv | 108 +- hw/system/spatz_cluster/Makefile | 7 + .../cfg/spatz_cluster.carfield.dram.hjson | 5 +- .../cfg/spatz_cluster.carfield.l2.hjson | 2 + .../cfg/spatz_cluster.default.dram.hjson | 5 +- .../cfg/spatz_cluster.doublebw.dram.hjson | 101 ++ .../cfg/spatz_cluster.mempool.dram.hjson | 3 + .../cfg/spatz_cluster.smallvrf.dram.hjson | 5 +- hw/system/spatz_cluster/src/spatz_cluster.sv | 12 +- .../src/spatz_cluster_wrapper.sv.tpl | 9 +- .../src/spatz_tcdm_interconnect.sv | 35 +- sw/snRuntime/CMakeLists.txt | 4 +- sw/snRuntime/src/alloc.c | 2 +- 20 files changed, 1709 insertions(+), 32 deletions(-) create mode 100644 hw/ip/spatz/src/spatz_doublebw_vlsu.sv create mode 100644 hw/system/spatz_cluster/cfg/spatz_cluster.doublebw.dram.hjson diff --git a/Bender.yml b/Bender.yml index 36608314..7b67bc58 100644 --- a/Bender.yml +++ b/Bender.yml @@ -139,6 +139,7 @@ sources: - hw/ip/spatz/src/spatz_ipu.sv - hw/ip/spatz/src/spatz_vfu.sv - hw/ip/spatz/src/spatz_vlsu.sv + - hw/ip/spatz/src/spatz_doublebw_vlsu.sv - hw/ip/spatz/src/spatz_vrf.sv - hw/ip/spatz/src/spatz_vsldu.sv # Level 4 diff --git a/docs/schema/spatz_cluster.schema.json b/docs/schema/spatz_cluster.schema.json index 8379048f..f39915e8 100644 --- a/docs/schema/spatz_cluster.schema.json +++ b/docs/schema/spatz_cluster.schema.json @@ -165,6 +165,11 @@ "description": "Activate floating point support in Spatz", "default": false }, + "spatz_nports": { + "type": "number", + "description": "Number of TCDM ports 
per Spatz instance", + "default": 4 + }, "timing": { "type": "object", "title": "Timing and Latency Tuning Parameter", diff --git a/hw/ip/spatz/src/generated/spatz_pkg.sv b/hw/ip/spatz/src/generated/spatz_pkg.sv index 8f4bb228..b7715ee4 100644 --- a/hw/ip/spatz/src/generated/spatz_pkg.sv +++ b/hw/ip/spatz/src/generated/spatz_pkg.sv @@ -7,6 +7,7 @@ package spatz_pkg; import rvv_pkg::*; + import cf_math_pkg::idx_width; ////////////////// // Parameters // @@ -58,6 +59,9 @@ package spatz_pkg; // Number of elements per VRF Bank localparam int unsigned NrWordsPerBank = NrVRFWords / NrVRFBanks; + // Number of VLSU interfaces + localparam int unsigned NumVLSUInterfaces = 1; + // Width of scalar register file adresses // Depends on whether we have a FP regfile or not localparam int GPRWidth = FPU ? 6 : 5; @@ -286,8 +290,12 @@ package spatz_pkg; // Did the memory request trigger an exception logic exc; + + // Interface that is committing + logic intf_id; } vlsu_rsp_t; + //////////////////// // VSLDU Response // //////////////////// @@ -301,7 +309,7 @@ package spatz_pkg; // VRF/SB Ports // ////////////////// - typedef enum logic [2:0] { + typedef enum logic [idx_width(4 + 2 * 1):0] { VFU_VS2_RD, VFU_VS1_RD, VFU_VD_RD, @@ -310,13 +318,13 @@ package spatz_pkg; VSLDU_VS2_RD } vreg_port_rd_e; - typedef enum logic [1:0] { + typedef enum logic [idx_width(2 + 1):0] { VFU_VD_WD, VLSU_VD_WD, VSLDU_VD_WD } vreg_port_wd_e; - typedef enum logic [3:0] { + typedef enum logic [idx_width(6 + 3 * 1):0] { SB_VFU_VS2_RD, SB_VFU_VS1_RD, SB_VFU_VD_RD, diff --git a/hw/ip/spatz/src/spatz.sv b/hw/ip/spatz/src/spatz.sv index c1843599..cf7de201 100644 --- a/hw/ip/spatz/src/spatz.sv +++ b/hw/ip/spatz/src/spatz.sv @@ -72,11 +72,12 @@ module spatz import spatz_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; #( //////////////// // Number of ports of the vector register file - localparam int unsigned NrWritePorts = 3; - localparam int unsigned NrReadPorts = 6; + localparam int unsigned NrWritePorts = 2 + 
NumVLSUInterfaces; // 1 for VFU and SLDU each and 1 for each VLSU + localparam int unsigned NrReadPorts = 4 + 2*NumVLSUInterfaces; // 3 for VFU, 1 for SLDU and 2 for each VLSU interface // FPU buffer size (need atleast depth of 2 to hide conflicts) localparam int unsigned FpuBufDepth = 4; + localparam int unsigned VlsuBufDepth = 2; ///////////// // Signals // @@ -92,8 +93,8 @@ module spatz import spatz_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; #( vfu_rsp_t vfu_rsp, vfu_rsp_buf; logic vlsu_req_ready; - logic vlsu_rsp_valid; - vlsu_rsp_t vlsu_rsp; + logic vlsu_rsp_valid, vlsu_rsp_buf_valid; + vlsu_rsp_t vlsu_rsp, vlsu_rsp_buf; logic vsldu_req_ready; logic vsldu_rsp_valid; @@ -104,6 +105,7 @@ module spatz import spatz_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; #( // buffers store the metadata to commit to the VRF in later cycles logic [$clog2(FpuBufDepth)-1:0] vfu_buf_usage; + logic [$clog2(VlsuBufDepth)-1:0] vlsu_buf_usage; typedef struct packed { vrf_data_t wdata; @@ -116,6 +118,17 @@ module spatz import spatz_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; #( vfu_buf_t vfu_buf_data; + typedef struct packed { + vrf_data_t wdata; + vrf_addr_t waddr; + vrf_be_t wbe; + spatz_id_t wid; + vlsu_rsp_t rsp; + logic rsp_valid; + } vlsu_buf_t; + + vlsu_buf_t vlsu_buf_data; + ///////////////////// // FPU sequencer // ///////////////////// @@ -218,7 +231,8 @@ module spatz import spatz_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; #( spatz_vrf #( .NrReadPorts (NrReadPorts ), - .NrWritePorts(NrWritePorts) + .NrWritePorts(NrWritePorts), + .FpuBufDepth (FpuBufDepth ) ) i_vrf ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -278,9 +292,9 @@ module spatz import spatz_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; #( .vfu_rsp_ready_o (vfu_rsp_ready ), .vfu_rsp_i (vfu_rsp_buf ), // VLSU - .vlsu_req_ready_i (vlsu_req_ready ), - .vlsu_rsp_valid_i (vlsu_rsp_valid ), - .vlsu_rsp_i (vlsu_rsp ), + .vlsu_req_ready_i (vlsu_req_ready ), + .vlsu_rsp_valid_i (vlsu_rsp_buf_valid ), + 
.vlsu_rsp_i (vlsu_rsp_buf ), // VLSD .vsldu_req_ready_i(vsldu_req_ready ), .vsldu_rsp_valid_i(vsldu_rsp_valid ), @@ -331,6 +345,40 @@ module spatz import spatz_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; #( .data_o (vfu_buf_data ), .pop_i (vfu_buf_pop ) ); + +`ifdef DOUBLE_BW + // Buffering of VLSU1 when conflicting with VLSU0 + logic vlsu_buf_en, vlsu_buf_push, vlsu_buf_pop, vrf_vlsu_wvalid, vlsu_buf_full, vlsu_buf_empty; + + assign vlsu_buf_en = sb_we[VLSU_VD_WD1] && (!vrf_wvalid[VLSU_VD_WD1] || (vrf_wvalid[VLSU_VD_WD1] && !vlsu_buf_empty)); + assign vlsu_buf_push = vlsu_buf_en && !vlsu_buf_full; + assign vlsu_buf_pop = vrf_wvalid[VLSU_VD_WD1] && !vlsu_buf_empty; + assign vrf_vlsu_wvalid = sb_we[VLSU_VD_WD1] && !vlsu_buf_full; + + fifo_v3 #( + .FALL_THROUGH (1'b0 ), + .dtype (vlsu_buf_t ), + .DEPTH (VlsuBufDepth ) + ) i_vlsu_buf ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .flush_i (1'b0 ), + .testmode_i (1'b0 ), + .full_o (vlsu_buf_full ), + .empty_o (vlsu_buf_empty ), + .usage_o (vlsu_buf_usage ), + .data_i ({vrf_wdata[VLSU_VD_WD1], + vrf_waddr[VLSU_VD_WD1], + vrf_wbe [VLSU_VD_WD1], + sb_id [SB_VLSU_VD_WD1], + vlsu_rsp, + vlsu_rsp_valid} ), + .push_i (vlsu_buf_push ), + .data_o (vlsu_buf_data ), + .pop_i (vlsu_buf_pop ) + ); + +`endif `endif always_comb begin @@ -343,6 +391,8 @@ module spatz import spatz_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; #( // Responses vfu_rsp_buf = vfu_rsp; vfu_rsp_buf_valid = vfu_rsp_valid; + vlsu_rsp_buf = vlsu_rsp; + vlsu_rsp_buf_valid = vlsu_rsp_valid; // If the buffering feature is used for the FPU or VLSU, // Use the metadata to commit the data to the VRF @@ -361,6 +411,30 @@ module spatz import spatz_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; #( vfu_rsp_buf_valid = 1'b0; end end + +`ifdef DOUBLE_BW + // VLSU1 buffering + // Check if interface 1 response is being buffered, if so do not send response now + if (vlsu_rsp_valid && (vlsu_rsp.intf_id == 1'b1) && vlsu_buf_push) + vlsu_rsp_buf_valid = 1'b0; + + if 
(!vlsu_buf_empty) begin + sb_we_buf [VLSU_VD_WD1] = 1'b1; + vrf_wdata_buf[VLSU_VD_WD1] = vlsu_buf_data.wdata; + vrf_waddr_buf[VLSU_VD_WD1] = vlsu_buf_data.waddr; + vrf_wbe_buf [VLSU_VD_WD1] = vlsu_buf_data.wbe; + sb_buf_id [SB_VLSU_VD_WD1] = vlsu_buf_data.wid; + if (vlsu_buf_data.rsp_valid) begin + vlsu_rsp_buf = vlsu_buf_data.rsp; + vlsu_rsp_buf_valid = vrf_wvalid[VLSU_VD_WD1]; + end + end else begin + // If the buffer is being enabled in this cycle, don't send the response now + if (vlsu_buf_en) begin + vlsu_rsp_buf_valid = 1'b0; + end + end +`endif `endif end // always_comb @@ -405,6 +479,43 @@ module spatz import spatz_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; #( // VLSU // ////////// +`ifdef DOUBLE_BW + spatz_doublebw_vlsu #( + .NrMemPorts (NrMemPorts ), + .spatz_mem_req_t (spatz_mem_req_t ), + .spatz_mem_rsp_t (spatz_mem_rsp_t ) + ) i_vlsu ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + // Request + .spatz_req_i (spatz_req ), + .spatz_req_valid_i (spatz_req_valid ), + .spatz_req_ready_o (vlsu_req_ready ), + // Response + .vlsu_rsp_valid_o (vlsu_rsp_valid ), + .vlsu_rsp_o (vlsu_rsp ), + // VRF + .vrf_wvalid_i ({vrf_vlsu_wvalid, vrf_wvalid[VLSU_VD_WD0]} ), + .vrf_waddr_o (vrf_waddr[VLSU_VD_WD1:VLSU_VD_WD0] ), + .vrf_wdata_o (vrf_wdata[VLSU_VD_WD1:VLSU_VD_WD0] ), + .vrf_we_o (sb_we[VLSU_VD_WD1:VLSU_VD_WD0] ), + .vrf_wbe_o (vrf_wbe[VLSU_VD_WD1:VLSU_VD_WD0] ), + .vrf_raddr_o (vrf_raddr[VLSU_VD_RD1:VLSU_VS2_RD0] ), + .vrf_re_o (sb_re[VLSU_VD_RD1:VLSU_VS2_RD0] ), + .vrf_rdata_i (vrf_rdata[VLSU_VD_RD1:VLSU_VS2_RD0] ), + .vrf_rvalid_i (vrf_rvalid[VLSU_VD_RD1:VLSU_VS2_RD0] ), + .vrf_id_o ({sb_id[SB_VLSU_VD_WD1], sb_id[VLSU_VD_RD1], sb_id[VLSU_VS2_RD1], // VLSU Interface-1 + sb_id[SB_VLSU_VD_WD0], sb_id[VLSU_VD_RD0], sb_id[VLSU_VS2_RD0]}), // VLSU Interface-0 + // Interface Memory + .spatz_mem_req_o (spatz_mem_req_o ), + .spatz_mem_req_valid_o (spatz_mem_req_valid_o ), + .spatz_mem_req_ready_i (spatz_mem_req_ready_i ), + .spatz_mem_rsp_i (spatz_mem_rsp_i ), + 
.spatz_mem_rsp_valid_i (spatz_mem_rsp_valid_i ), + .spatz_mem_finished_o (spatz_mem_finished ), + .spatz_mem_str_finished_o(spatz_mem_str_finished ) + ); +`else spatz_vlsu #( .NrMemPorts (NrMemPorts ), .spatz_mem_req_t (spatz_mem_req_t ), @@ -439,6 +550,7 @@ module spatz import spatz_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; #( .spatz_mem_finished_o (spatz_mem_finished ), .spatz_mem_str_finished_o(spatz_mem_str_finished ) ); +`endif /////////// // VSLDU // diff --git a/hw/ip/spatz/src/spatz_controller.sv b/hw/ip/spatz/src/spatz_controller.sv index 2e8837e0..cc1fe208 100644 --- a/hw/ip/spatz/src/spatz_controller.sv +++ b/hw/ip/spatz/src/spatz_controller.sv @@ -238,7 +238,26 @@ module spatz_controller `FF(scoreboard_q, scoreboard_d, '0) // Did the instruction write to the VRF in the previous cycle? +`ifdef DOUBLE_BW + logic [NumVLSUInterfaces-1:0] [NrParallelInstructions-1:0] wrote_result_q, wrote_result_d; + + // Following counters are used only by DOUBLE_BW for tracking + logic [NumVLSUInterfaces-1:0] [NrParallelInstructions-1:0] done_result_q, done_result_d; + + // Counter to track the vlen completed for each instruction + vlen_t [NrParallelInstructions-1:0] vl_cnt_d, vl_cnt_q, vl_max_d, vl_max_q; + + // Is this instruction a narrowing instruction? + logic [NrParallelInstructions-1:0] narrow_q, narrow_d; + + `FF(done_result_q, done_result_d, '0) + `FF(vl_cnt_q, vl_cnt_d, '0) + `FF(vl_max_q, vl_max_d, '0) + `FF(narrow_q, narrow_d, '0) +`else logic [NrParallelInstructions-1:0] wrote_result_q, wrote_result_d; +`endif + `FF(wrote_result_q, wrote_result_d, '0) // Is this instruction a narrowing or widening instruction? 
@@ -261,10 +280,79 @@ module spatz_controller wrote_result_d = '0; sb_enable_o = '0; - for (int unsigned port = 0; port < NrVregfilePorts; port++) +`ifdef DOUBLE_BW + done_result_d = done_result_q; + narrow_d = narrow_q; + vl_cnt_d = vl_cnt_q; + vl_max_d = vl_max_q; +`endif + + for (int unsigned port = 0; port < NrVregfilePorts; port++) begin +`ifdef DOUBLE_BW + // Calculate the load-store interface id to use here for chaining + automatic logic intID; + + // For vlsu ports use the write status of the corresponding interface + if (port inside {SB_VLSU_VS2_RD0, SB_VLSU_VD_RD0, SB_VLSU_VD_WD0}) begin + intID = 0; + end else if (port inside {SB_VLSU_VS2_RD1, SB_VLSU_VD_RD1, SB_VLSU_VD_WD1}) begin + intID = 1; + // For non vlsu ports, use the vector length to find the interface id for write status checks + end else begin + intID = (vl_cnt_q[sb_id_i[port]] < vl_max_d[sb_id_i[port]]) ? 0 : 1; + end + // Enable the VRF port if the dependant instructions wrote in the previous cycle + sb_enable_o[port] = sb_enable_i[port] && &(~scoreboard_q[sb_id_i[port]].deps | wrote_result_q[intID] | done_result_q[intID]) && (!(|scoreboard_q[sb_id_i[port]].deps) || !scoreboard_q[sb_id_i[port]].prevent_chaining); +`else sb_enable_o[port] = sb_enable_i[port] && &(~scoreboard_q[sb_id_i[port]].deps | wrote_result_q) && (!(|scoreboard_q[sb_id_i[port]].deps) || !scoreboard_q[sb_id_i[port]].prevent_chaining); +`endif + end +`ifdef DOUBLE_BW + // Store the decisions + if (sb_enable_o[SB_VFU_VD_WD]) begin + // Calculate the load-store interface id to use here for chaining + automatic logic intID = (vl_cnt_q[sb_id_i[SB_VFU_VD_WD]] < vl_max_d[sb_id_i[SB_VFU_VD_WD]]) ? 0 : 1; + automatic int VRFWriteSize = narrow_q[sb_id_i[SB_VFU_VD_WD]] ? 
VRFWordBWidth >> 1 : VRFWordBWidth; + + // Update vl_cnt if actually written into the VRF + if (sb_wrote_result_i[SB_VFU_VD_WD - SB_VFU_VD_WD]) + vl_cnt_d[sb_id_i[SB_VFU_VD_WD]] += VRFWriteSize; + + if (vl_cnt_q[sb_id_i[SB_VFU_VD_WD]] >= (vl_max_d[sb_id_i[SB_VFU_VD_WD]] * (intID + 1) - VRFWriteSize)) begin + done_result_d[intID][sb_id_i[SB_VFU_VD_WD]] = 1'b1; + end + + wrote_result_narrowing_d[sb_id_i[SB_VFU_VD_WD]] = sb_wrote_result_i[SB_VFU_VD_WD - SB_VFU_VD_WD] ^ narrow_wide_q[sb_id_i[SB_VFU_VD_WD]]; + wrote_result_d[intID][sb_id_i[SB_VFU_VD_WD]] = sb_wrote_result_i[SB_VFU_VD_WD - SB_VFU_VD_WD] && (!narrow_wide_q[sb_id_i[SB_VFU_VD_WD]] || wrote_result_narrowing_q[sb_id_i[SB_VFU_VD_WD]]); + end + if (sb_enable_o[SB_VLSU_VD_WD0]) begin + // Calculate the load-store interface id to use here for chaining + wrote_result_narrowing_d[sb_id_i[SB_VLSU_VD_WD0]] = sb_wrote_result_i[SB_VLSU_VD_WD0 - SB_VFU_VD_WD] ^ narrow_wide_q[sb_id_i[SB_VLSU_VD_WD0]]; + wrote_result_d[0][sb_id_i[SB_VLSU_VD_WD0]] = sb_wrote_result_i[SB_VLSU_VD_WD0 - SB_VFU_VD_WD] && (!narrow_wide_q[sb_id_i[SB_VLSU_VD_WD0]] || wrote_result_narrowing_q[sb_id_i[SB_VLSU_VD_WD0]]); + end + if (sb_enable_o[SB_VLSU_VD_WD1]) begin + wrote_result_narrowing_d[sb_id_i[SB_VLSU_VD_WD1]] = sb_wrote_result_i[SB_VLSU_VD_WD1 - SB_VFU_VD_WD] ^ narrow_wide_q[sb_id_i[SB_VLSU_VD_WD1]]; + wrote_result_d[1][sb_id_i[SB_VLSU_VD_WD1]] = sb_wrote_result_i[SB_VLSU_VD_WD1 - SB_VFU_VD_WD] && (!narrow_wide_q[sb_id_i[SB_VLSU_VD_WD1]] || wrote_result_narrowing_q[sb_id_i[SB_VLSU_VD_WD1]]); + end + if (sb_enable_o[SB_VSLDU_VD_WD]) begin + // Calculate the load-store interface id to use here for chaining + automatic logic intID = (vl_cnt_q[sb_id_i[SB_VSLDU_VD_WD]] < vl_max_d[sb_id_i[SB_VSLDU_VD_WD]]) ? 0 : 1; + automatic int VRFWriteSize = narrow_q[sb_id_i[SB_VSLDU_VD_WD]] ? 
VRFWordBWidth >> 1 : VRFWordBWidth; + + // Update vl_cnt if actually written into the VRF + if (sb_wrote_result_i[SB_VSLDU_VD_WD - SB_VFU_VD_WD]) + vl_cnt_d[sb_id_i[SB_VSLDU_VD_WD]] += VRFWriteSize; + + if (vl_cnt_q[sb_id_i[SB_VSLDU_VD_WD]] >= (vl_max_d[sb_id_i[SB_VSLDU_VD_WD]] * (intID + 1) - VRFWriteSize)) begin + done_result_d[intID][sb_id_i[SB_VSLDU_VD_WD]] = 1'b1; + end + + wrote_result_narrowing_d[sb_id_i[SB_VSLDU_VD_WD]] = sb_wrote_result_i[SB_VSLDU_VD_WD - SB_VFU_VD_WD] ^ narrow_wide_q[sb_id_i[SB_VSLDU_VD_WD]]; + wrote_result_d[intID][sb_id_i[SB_VSLDU_VD_WD]] = sb_wrote_result_i[SB_VSLDU_VD_WD - SB_VFU_VD_WD] && (!narrow_wide_q[sb_id_i[SB_VSLDU_VD_WD]] || wrote_result_narrowing_q[sb_id_i[SB_VSLDU_VD_WD]]); + end +`else // Store the decisions if (sb_enable_o[SB_VFU_VD_WD]) begin wrote_result_narrowing_d[sb_id_i[SB_VFU_VD_WD]] = sb_wrote_result_i[SB_VFU_VD_WD - SB_VFU_VD_WD] ^ narrow_wide_q[sb_id_i[SB_VFU_VD_WD]]; @@ -278,6 +366,7 @@ module spatz_controller wrote_result_narrowing_d[sb_id_i[SB_VSLDU_VD_WD]] = sb_wrote_result_i[SB_VSLDU_VD_WD - SB_VFU_VD_WD] ^ narrow_wide_q[sb_id_i[SB_VSLDU_VD_WD]]; wrote_result_d[sb_id_i[SB_VSLDU_VD_WD]] = sb_wrote_result_i[SB_VSLDU_VD_WD - SB_VFU_VD_WD] && (!narrow_wide_q[sb_id_i[SB_VSLDU_VD_WD]] || wrote_result_narrowing_q[sb_id_i[SB_VSLDU_VD_WD]]); end +`endif // A unit has finished its VRF access. Reset the scoreboard. For each instruction, check // if a dependency existed. If so, invalidate it. 
@@ -292,6 +381,14 @@ module spatz_controller scoreboard_d[vfu_rsp_i.id] = '0; narrow_wide_d[vfu_rsp_i.id] = 1'b0; wrote_result_narrowing_d[vfu_rsp_i.id] = 1'b0; +`ifdef DOUBLE_BW + narrow_d[vfu_rsp_i.id] = 1'b0; + wrote_result_d[0][vfu_rsp_i.id] = 1'b0; + wrote_result_d[1][vfu_rsp_i.id] = 1'b0; + done_result_d[0][vfu_rsp_i.id] = 1'b0; + done_result_d[1][vfu_rsp_i.id] = 1'b0; + vl_cnt_d[vfu_rsp_i.id] = '0; +`endif for (int unsigned insn = 0; insn < NrParallelInstructions; insn++) scoreboard_d[insn].deps[vfu_rsp_i.id] = 1'b0; end @@ -306,6 +403,14 @@ module spatz_controller scoreboard_d[vlsu_rsp_i.id] = '0; narrow_wide_d[vlsu_rsp_i.id] = 1'b0; wrote_result_narrowing_d[vlsu_rsp_i.id] = 1'b0; +`ifdef DOUBLE_BW + narrow_d[vlsu_rsp_i.id] = 1'b0; + wrote_result_d[0][vlsu_rsp_i.id] = 1'b0; + wrote_result_d[1][vlsu_rsp_i.id] = 1'b0; + done_result_d[0][vlsu_rsp_i.id] = 1'b0; + done_result_d[1][vlsu_rsp_i.id] = 1'b0; + vl_cnt_d[vlsu_rsp_i.id] = '0; +`endif for (int unsigned insn = 0; insn < NrParallelInstructions; insn++) scoreboard_d[insn].deps[vlsu_rsp_i.id] = 1'b0; end @@ -320,6 +425,14 @@ module spatz_controller scoreboard_d[vsldu_rsp_i.id] = '0; narrow_wide_d[vsldu_rsp_i.id] = 1'b0; wrote_result_narrowing_d[vsldu_rsp_i.id] = 1'b0; +`ifdef DOUBLE_BW + narrow_d[vsldu_rsp_i.id] = 1'b0; + wrote_result_d[0][vsldu_rsp_i.id] = 1'b0; + wrote_result_d[1][vsldu_rsp_i.id] = 1'b0; + done_result_d[0][vsldu_rsp_i.id] = 1'b0; + done_result_d[1][vsldu_rsp_i.id] = 1'b0; + vl_cnt_d[vsldu_rsp_i.id] = '0; +`endif for (int unsigned insn = 0; insn < NrParallelInstructions; insn++) scoreboard_d[insn].deps[vsldu_rsp_i.id] = 1'b0; end @@ -354,6 +467,15 @@ module spatz_controller // Is this a narrowing or widening instruction? 
if (spatz_req.op_arith.is_narrowing || spatz_req.op_arith.widen_vs1 || spatz_req.op_arith.widen_vs2) narrow_wide_d[spatz_req.id] = 1'b1; + +`ifdef DOUBLE_BW + narrow_d[spatz_req.id] = spatz_req.op_arith.is_narrowing; + + // Track request vl for vector chaining, used only for DOUBLE_BW + // Default spatz uses 1-bit credit counter using wrote_result_q for chaining + vl_max_d[spatz_req.id] = (spatz_req.vl >> 1) << spatz_req.vtype.vsew; + vl_cnt_d[spatz_req.id] = '0; +`endif end // An instruction never depends on itself diff --git a/hw/ip/spatz/src/spatz_doublebw_vlsu.sv b/hw/ip/spatz/src/spatz_doublebw_vlsu.sv new file mode 100644 index 00000000..186e793f --- /dev/null +++ b/hw/ip/spatz/src/spatz_doublebw_vlsu.sv @@ -0,0 +1,1134 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Author: Matheus Cavalcante, ETH Zurich +// Matteo Perotti, ETH Zurich +// Navaneeth Kunhi Purayil, ETH Zurich +// +// The vector load/store unit is used to load vectors from memory +// and to the vector register file and store them back again. +// Parametric on the number of interfaces to the TCDM + +module spatz_doublebw_vlsu + import spatz_pkg::*; + import rvv_pkg::*; + import cf_math_pkg::idx_width; #( + parameter int unsigned NrMemPorts = 1, + parameter int unsigned NrOutstandingLoads = 16, + // Memory request + parameter type spatz_mem_req_t = logic, + parameter type spatz_mem_rsp_t = logic, + // Dependant parameters. DO NOT CHANGE! 
+ localparam int unsigned NrInterfaces = NrMemPorts / spatz_pkg::N_FU, + localparam int unsigned IdWidth = idx_width(NrOutstandingLoads) + ) ( + input logic clk_i, + input logic rst_ni, + // Spatz request + input spatz_req_t spatz_req_i, + input logic spatz_req_valid_i, + output logic spatz_req_ready_o, + // VLSU response + output logic vlsu_rsp_valid_o, + output vlsu_rsp_t vlsu_rsp_o, + // Interface with the VRF + output vrf_addr_t [NrInterfaces-1:0] vrf_waddr_o, + output vrf_data_t [NrInterfaces-1:0] vrf_wdata_o, + output logic [NrInterfaces-1:0] vrf_we_o, + output vrf_be_t [NrInterfaces-1:0] vrf_wbe_o, + input logic [NrInterfaces-1:0] vrf_wvalid_i, + + output spatz_id_t [NrInterfaces-1:0] [2:0] vrf_id_o, + output vrf_addr_t [NrInterfaces-1:0] [1:0] vrf_raddr_o, + output logic [NrInterfaces-1:0] [1:0] vrf_re_o, + input vrf_data_t [NrInterfaces-1:0] [1:0] vrf_rdata_i, + input logic [NrInterfaces-1:0] [1:0] vrf_rvalid_i, + // Memory Request + output spatz_mem_req_t [NrMemPorts-1:0] spatz_mem_req_o, + output logic [NrMemPorts-1:0] spatz_mem_req_valid_o, + input logic [NrMemPorts-1:0] spatz_mem_req_ready_i, + // Memory Response + input spatz_mem_rsp_t [NrMemPorts-1:0] spatz_mem_rsp_i, + input logic [NrMemPorts-1:0] spatz_mem_rsp_valid_i, + // Memory Finished + output logic spatz_mem_finished_o, + output logic spatz_mem_str_finished_o + ); + +// Include FF +`include "common_cells/registers.svh" + + + //////////////// + // Parameters // + //////////////// + + localparam int unsigned MemDataWidth = ELEN; + localparam int unsigned MemDataWidthB = MemDataWidth/8; + + ////////////// + // Typedefs // + ////////////// + + typedef logic [IdWidth-1:0] id_t; + typedef logic [$clog2(NrWordsPerVector*8)-1:0] vreg_elem_t; + + /////////////////////// + // Operation queue // + /////////////////////// + + spatz_req_t spatz_req_d; + + spatz_req_t mem_spatz_req; + logic mem_spatz_req_valid; + logic mem_spatz_req_ready; + + spill_register #( + .T(spatz_req_t) + ) i_operation_queue ( + 
.clk_i (clk_i ), + .rst_ni (rst_ni ), + .data_i (spatz_req_d ), + .valid_i(spatz_req_valid_i && spatz_req_i.ex_unit == LSU), + .ready_o(spatz_req_ready_o ), + .data_o (mem_spatz_req ), + .valid_o(mem_spatz_req_valid ), + .ready_i(mem_spatz_req_ready ) + ); + + // Convert the vl to number of bytes for all element widths + always_comb begin: proc_spatz_req + spatz_req_d = spatz_req_i; + + unique case (spatz_req_i.vtype.vsew) + EW_8: begin + spatz_req_d.vl = spatz_req_i.vl; + spatz_req_d.vstart = spatz_req_i.vstart; + end + EW_16: begin + spatz_req_d.vl = spatz_req_i.vl << 1; + spatz_req_d.vstart = spatz_req_i.vstart << 1; + end + EW_32: begin + spatz_req_d.vl = spatz_req_i.vl << 2; + spatz_req_d.vstart = spatz_req_i.vstart << 2; + end + default: begin + spatz_req_d.vl = spatz_req_i.vl << MAXEW; + spatz_req_d.vstart = spatz_req_i.vstart << MAXEW; + end + endcase + end: proc_spatz_req + + // Do we have a strided memory access + logic mem_is_strided; + assign mem_is_strided = (mem_spatz_req.op == VLSE) || (mem_spatz_req.op == VSSE); + + // Do we have an indexed memory access + logic mem_is_indexed; + assign mem_is_indexed = (mem_spatz_req.op == VLXE) || (mem_spatz_req.op == VSXE); + + ///////////// + // State // + ///////////// + + typedef enum logic { + VLSU_RunningLoad, VLSU_RunningStore + } state_t; + state_t state_d, state_q; + `FF(state_q, state_d, VLSU_RunningLoad) + + + id_t [NrInterfaces-1:0] [N_FU-1:0] store_count_q; + id_t [NrInterfaces-1:0] [N_FU-1:0] store_count_d; + + for (genvar intf = 0; intf < NrInterfaces; intf++) begin : gen_store_count_q_intf + for (genvar fu = 0; fu < N_FU; fu++) begin : gen_store_count_q_intf_fu + `FF(store_count_q[intf][fu], store_count_d[intf][fu], '0) + end: gen_store_count_q_intf_fu + end: gen_store_count_q_intf + + always_comb begin: proc_store_count + // Maintain state + store_count_d = store_count_q; + + for (int intf = 0; intf < NrInterfaces; intf++) begin + for (int fu = 0; fu < N_FU; fu++) begin + automatic int unsigned 
port = intf * N_FU + fu; + + if (spatz_mem_req_o[port].write && spatz_mem_req_valid_o[port] && spatz_mem_req_ready_i[port]) + // Did we send a store? + store_count_d[intf][fu]++; + + // Did we get the ack of a store? +`ifdef MEMPOOL_SPATZ + if (store_count_q[intf][fu] != '0 && spatz_mem_rsp_valid_i[port] && spatz_mem_rsp_i[port].write) + store_count_d[intf][fu]--; +`else + if (store_count_q[intf][fu] != '0 && spatz_mem_rsp_valid_i[port]) + store_count_d[intf][fu]--; +`endif + end + end + end: proc_store_count + + ////////////////////// + // Reorder Buffer // + ////////////////////// + + typedef logic [int'(MAXEW)-1:0] addr_offset_t; + + elen_t [NrInterfaces-1:0] [N_FU-1:0] rob_wdata; + id_t [NrInterfaces-1:0] [N_FU-1:0] rob_wid; + logic [NrInterfaces-1:0] [N_FU-1:0] rob_push; + logic [NrInterfaces-1:0] [N_FU-1:0] rob_rvalid; + elen_t [NrInterfaces-1:0] [N_FU-1:0] rob_rdata; + logic [NrInterfaces-1:0] [N_FU-1:0] rob_pop; + id_t [NrInterfaces-1:0] [N_FU-1:0] rob_rid; + logic [NrInterfaces-1:0] [N_FU-1:0] rob_req_id; + id_t [NrInterfaces-1:0] [N_FU-1:0] rob_id; + logic [NrInterfaces-1:0] [N_FU-1:0] rob_full; + logic [NrInterfaces-1:0] [N_FU-1:0] rob_empty; + + // The reorder buffer decouples the memory side from the register file side. + // All elements from one side to the other go through it. + // Each interface works independently from the others. 
+ for (genvar intf = 0; intf < NrInterfaces; intf++) begin : gen_rob_intf + for (genvar fu = 0; fu < N_FU; fu++) begin : gen_rob_intf_fu +`ifdef MEMPOOL_SPATZ + reorder_buffer #( + .DataWidth(ELEN ), + .NumWords (NrOutstandingLoads) + ) i_reorder_buffer ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .data_i (rob_wdata[intf][fu] ), + .id_i (rob_wid[intf][fu] ), + .push_i (rob_push[intf][fu] ), + .data_o (rob_rdata[intf][fu] ), + .valid_o (rob_rvalid[intf][fu]), + .id_read_o(rob_rid[intf][fu] ), + .pop_i (rob_pop[intf][fu] ), + .id_req_i (rob_req_id[intf][fu]), + .id_o (rob_id[intf][fu] ), + .full_o (rob_full[intf][fu] ), + .empty_o (rob_empty[intf][fu] ) + ); +`else + fifo_v3 #( + .DATA_WIDTH(ELEN ), + .DEPTH (NrOutstandingLoads) + ) i_reorder_buffer ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .flush_i (1'b0 ), + .testmode_i(1'b0 ), + .data_i (rob_wdata[intf][fu] ), + .push_i (rob_push[intf][fu] ), + .data_o (rob_rdata[intf][fu] ), + .pop_i (rob_pop[intf][fu] ), + .full_o (rob_full[intf][fu] ), + .empty_o (rob_empty[intf][fu] ), + .usage_o (/* Unused */ ) + ); + assign rob_rvalid[intf][fu] = !rob_empty[intf][fu]; +`endif + end: gen_rob_intf_fu + end: gen_rob_intf + + ////////////////////// + // Memory request // + ////////////////////// + + // Is the memory operation valid and are we at the last one? + logic [NrInterfaces-1:0] [N_FU-1:0] mem_operation_valid; + logic [NrInterfaces-1:0] [N_FU-1:0] mem_operation_last; + + // For each memory port we count how many bytes we have already loaded/stored (VLSU <-> MEM). + // Multiple counters are needed all memory ports can work independent of each other. 
+ vlen_t [NrInterfaces-1:0] [N_FU-1:0] mem_counter_max; + logic [NrInterfaces-1:0] [N_FU-1:0] mem_counter_en; + logic [NrInterfaces-1:0] [N_FU-1:0] mem_counter_load; + vlen_t [NrInterfaces-1:0] [N_FU-1:0] mem_counter_delta; + vlen_t [NrInterfaces-1:0] [N_FU-1:0] mem_counter_d; + vlen_t [NrInterfaces-1:0] [N_FU-1:0] mem_counter_q; + logic [NrInterfaces-1:0] [N_FU-1:0] mem_port_finished_d, mem_port_finished_q; + + vlen_t [NrInterfaces-1:0] [N_FU-1:0] mem_idx_counter_delta; + vlen_t [NrInterfaces-1:0] [N_FU-1:0] mem_idx_counter_d; + vlen_t [NrInterfaces-1:0] [N_FU-1:0] mem_idx_counter_q; + + for (genvar intf = 0; intf < NrInterfaces; intf++) begin: gen_mem_counters_intf + for (genvar fu = 0; fu < N_FU; fu++) begin: gen_mem_counters_intf_fu + delta_counter #( + .WIDTH($bits(vlen_t)) + ) i_delta_counter_mem ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .clear_i (1'b0 ), + .en_i (mem_counter_en[intf][fu] ), + .load_i (mem_counter_load[intf][fu] ), + .down_i (1'b0 ), // We always count up + .delta_i (mem_counter_delta[intf][fu]), + .d_i (mem_counter_d[intf][fu] ), + .q_o (mem_counter_q[intf][fu] ), + .overflow_o(/* Unused */ ) + ); + + delta_counter #( + .WIDTH($bits(vlen_t)) + ) i_delta_counter_mem_idx ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .clear_i (1'b0 ), + .en_i (mem_counter_en[intf][fu] ), + .load_i (mem_counter_load[intf][fu] ), + .down_i (1'b0 ), // We always count up + .delta_i (mem_idx_counter_delta[intf][fu]), + .d_i (mem_idx_counter_d[intf][fu] ), + .q_o (mem_idx_counter_q[intf][fu] ), + .overflow_o(/* Unused */ ) + ); + + assign mem_port_finished_d[intf][fu] = mem_spatz_req_valid && (mem_counter_q[intf][fu] == mem_counter_max[intf][fu] - mem_counter_delta[intf][fu]); + assign mem_port_finished_q[intf][fu] = mem_spatz_req_valid && (mem_counter_q[intf][fu] == mem_counter_max[intf][fu]); + end: gen_mem_counters_intf_fu + end: gen_mem_counters_intf + + // Did the current instruction finished the memory requests? 
+ logic [NrParallelInstructions-1:0] mem_insn_finished_q, mem_insn_finished_d; + `FF(mem_insn_finished_q, mem_insn_finished_d, '0) + + // Is the current instruction pending? + logic [NrParallelInstructions-1:0] mem_insn_pending_q, mem_insn_pending_d; + `FF(mem_insn_pending_q, mem_insn_pending_d, '0) + + /////////////////// + // VRF request // + /////////////////// + + typedef struct packed { + spatz_id_t id; + + vreg_t vd; + vew_e vsew; + + vlen_t vl; + vlen_t vstart; + logic [2:0] rs1; + + logic is_load; + logic is_strided; + logic is_indexed; + } commit_metadata_t; + + commit_metadata_t commit_insn_d; + logic commit_insn_push; + commit_metadata_t commit_insn_q; + logic commit_insn_pop; + logic commit_insn_empty; + logic commit_insn_valid; + + fifo_v3 #( + .DEPTH (3 ), + .FALL_THROUGH(1'b1 ), + .dtype (commit_metadata_t) + ) i_fifo_commit_insn ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .flush_i (1'b0 ), + .testmode_i(1'b0 ), + .data_i (commit_insn_d ), + .push_i (commit_insn_push ), + .full_o (/* Unused */ ), + .data_o (commit_insn_q ), + .empty_o (commit_insn_empty), + .pop_i (commit_insn_pop ), + .usage_o (/* Unused */ ) + ); + + assign commit_insn_valid = !commit_insn_empty; + assign commit_insn_d = '{ + id : mem_spatz_req.id, + vd : mem_spatz_req.vd, + vsew : mem_spatz_req.vtype.vsew, + vl : mem_spatz_req.vl, + vstart : mem_spatz_req.vstart, + rs1 : mem_spatz_req.rs1[2:0], + is_load : mem_spatz_req.op_mem.is_load, + is_strided: mem_is_strided, + is_indexed: mem_is_indexed + }; + + always_comb begin: queue_control + // Maintain state + mem_insn_finished_d = mem_insn_finished_q; + mem_insn_pending_d = mem_insn_pending_q; + + // Do not ack anything + mem_spatz_req_ready = 1'b0; + + // Do not push anything to the metadata queue + commit_insn_push = 1'b0; + + // Did we start a new instruction? 
+ if (mem_spatz_req_valid && !mem_insn_pending_q[mem_spatz_req.id]) begin + mem_insn_pending_d[mem_spatz_req.id] = 1'b1; + commit_insn_push = 1'b1; + end + + // Did an instruction finished its requests? + if (&(mem_port_finished_q | (mem_port_finished_d & mem_counter_en))) begin + mem_insn_finished_d[mem_spatz_req.id] = 1'b1; + mem_spatz_req_ready = 1'b1; + end + // Did we acknowledge the end of an instruction? + if (vlsu_rsp_valid_o) begin + mem_insn_finished_d[vlsu_rsp_o.id] = 1'b0; + mem_insn_pending_d[vlsu_rsp_o.id] = 1'b0; + end + end + + // For each memory port that we have, count how many elements we have already loaded/stored (VRF <-> VLSU). + // Multiple counters are necessary for the case where not every single port will + // receive the same number of elements to work through. + vlen_t [NrInterfaces-1:0] [N_FU-1:0] commit_counter_max; + logic [NrInterfaces-1:0] [N_FU-1:0] commit_counter_en; + logic [NrInterfaces-1:0] [N_FU-1:0] commit_counter_load; + vlen_t [NrInterfaces-1:0] [N_FU-1:0] commit_counter_delta; + vlen_t [NrInterfaces-1:0] [N_FU-1:0] commit_counter_d; + vlen_t [NrInterfaces-1:0] [N_FU-1:0] commit_counter_q; + logic [NrInterfaces-1:0] [N_FU-1:0] commit_finished_q; + logic [NrInterfaces-1:0] [N_FU-1:0] commit_finished_d; + + for (genvar intf = 0; intf < NrInterfaces; intf++) begin : gen_vreg_counters_intf + for (genvar fu = 0; fu < N_FU; fu++) begin : gen_vreg_counters_intf_fu + delta_counter #( + .WIDTH($bits(vlen_t)) + ) i_delta_counter_vreg ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .clear_i (1'b0 ), + .en_i (commit_counter_en[intf][fu] ), + .load_i (commit_counter_load[intf][fu] ), + .down_i (1'b0 ), // We always count up + .delta_i (commit_counter_delta[intf][fu]), + .d_i (commit_counter_d[intf][fu] ), + .q_o (commit_counter_q[intf][fu] ), + .overflow_o(/* Unused */ ) + ); + + assign commit_finished_q[intf][fu] = commit_insn_valid && (commit_counter_q[intf][fu] == commit_counter_max[intf][fu]); + assign commit_finished_d[intf][fu] = 
commit_insn_valid && + ((commit_counter_q[intf][fu] + commit_counter_delta[intf][fu]) == commit_counter_max[intf][fu]); + end: gen_vreg_counters_intf_fu + end: gen_vreg_counters_intf + + //////////////////////// + // Address Generation // + //////////////////////// + + elen_t [NrInterfaces-1:0] [N_FU-1:0] mem_req_addr; + + vrf_addr_t [NrInterfaces-1:0] vd_vreg_addr; + vrf_addr_t [NrInterfaces-1:0] vs2_vreg_addr, vs2_vreg_idx_addr; + + // Current element index and byte index that are being accessed at the register file + vreg_elem_t [NrInterfaces-1:0] vd_elem_id; + vreg_elem_t [NrInterfaces-1:0] vs2_elem_id_d, vs2_elem_id_q; + `FF(vs2_elem_id_q, vs2_elem_id_d, '0) + + // Pending indexes + logic [NrInterfaces-1:0] [N_FU-1:0] pending_index; + + // Calculate the memory address for each memory port + addr_offset_t [NrInterfaces-1:0] [N_FU-1:0] mem_req_addr_offset; + for (genvar intf = 0; intf < NrInterfaces; intf++) begin: gen_mem_req_addr_intf + for (genvar fu = 0; fu < N_FU; fu++) begin: gen_mem_req_addr_intf_fu + localparam int unsigned port = intf * N_FU + fu; + + logic [31:0] addr; + logic [31:0] stride; + logic [31:0] offset; + + // Pre-shuffling index offset + typedef logic [int'(MAXEW)-1:0] maxew_t; + maxew_t idx_offset; + assign idx_offset = mem_idx_counter_q[intf][fu]; + + always_comb begin + stride = mem_is_strided ? mem_spatz_req.rs2 >> mem_spatz_req.vtype.vsew : 'd1; + + if (mem_is_indexed) begin + // What is the relationship between data and index width? 
+ automatic logic [1:0] data_index_width_diff = int'(mem_spatz_req.vtype.vsew) - int'(mem_spatz_req.op_mem.ew); + + // Pointer to index + automatic logic [idx_width(N_FU*ELENB)-1:0] word_index = (fu << (MAXEW - data_index_width_diff)) + + (maxew_t'(idx_offset >> (MAXEW - data_index_width_diff)) << (MAXEW - data_index_width_diff)) * N_FU + + (maxew_t'(idx_offset << data_index_width_diff) >> data_index_width_diff); + + // Index + unique case (mem_spatz_req.op_mem.ew) + EW_8 : offset = $signed(vrf_rdata_i[intf][1][8 * word_index +: 8]); + EW_16: offset = $signed(vrf_rdata_i[intf][1][8 * word_index +: 16]); + default: offset = $signed(vrf_rdata_i[intf][1][8 * word_index +: 32]); + endcase + end else begin + offset = ({mem_counter_q[intf][fu][$bits(vlen_t)-1:MAXEW] << $clog2(N_FU), mem_counter_q[intf][fu][int'(MAXEW)-1:0]} + (fu << MAXEW)) * stride; + end + + // The second interface starts from half of the vector to straighten the write-back VRF access pattern + // To ensure that the 2 interfaces do not also conflict at the TCDM, there is HW scrambling of addresses to TCDM + // such that they access different superbanks. 
+ if (!mem_is_indexed && !mem_is_strided && intf == 1) offset += (mem_spatz_req.vl / 2); + + addr = mem_spatz_req.rs1 + offset; + mem_req_addr[intf][fu] = (addr >> MAXEW) << MAXEW; + mem_req_addr_offset[intf][fu] = addr[int'(MAXEW)-1:0]; + + pending_index[intf][fu] = (mem_idx_counter_q[intf][fu][$clog2(NrWordsPerVector*ELENB)-1:0] >> MAXEW) != vs2_vreg_addr[intf][$clog2(NrWordsPerVector)-1:0]; + end + end: gen_mem_req_addr_intf_fu + end: gen_mem_req_addr_intf + + // Calculate the register file addresses + always_comb begin : gen_vreg_addr + for (int intf = 0; intf < NrInterfaces; intf++) begin : gen_vreg_addr_intf + vd_vreg_addr[intf] = (commit_insn_q.vd << $clog2(NrWordsPerVector)) + $unsigned(vd_elem_id[intf]); + + // For indices for indexed operations + vs2_vreg_addr[intf] = (mem_spatz_req.vs2 << $clog2(NrWordsPerVector)) + $unsigned(vs2_elem_id_q[intf]); + vs2_vreg_idx_addr[intf] = vs2_vreg_addr[intf]; + + // The second interface starts from half of the vector to straighten the write-back VRF access pattern + if (intf == 1) begin + vd_vreg_addr[intf] += commit_insn_q.vl / (2 * N_FU * ELENB); + vs2_vreg_idx_addr[intf] += ((mem_spatz_req.vl >> (mem_spatz_req.vtype.vsew - int'(mem_spatz_req.op_mem.ew))) / (2 * N_FU * ELENB)); + end + + end + end + + /////////////// + // Control // + /////////////// + + // Are we busy? + logic busy_q, busy_d; + `FF(busy_q, busy_d, 1'b0) + + // Did we finish an instruction? + logic vlsu_finished_req; + + // Memory requests + spatz_mem_req_t [NrInterfaces-1:0] [N_FU-1:0] spatz_mem_req; + logic [NrInterfaces-1:0] [N_FU-1:0] spatz_mem_req_valid; + logic [NrInterfaces-1:0] [N_FU-1:0] spatz_mem_req_ready; + + always_comb begin: control_proc + // Maintain state + busy_d = busy_q; + + // Do not pop anything + commit_insn_pop = 1'b0; + + // Do not ack anything + vlsu_finished_req = 1'b0; + + // Finished the execution! 
+ if (commit_insn_valid && &(commit_finished_q | (commit_finished_d & commit_counter_en)) && mem_insn_finished_q[commit_insn_q.id]) begin
+ commit_insn_pop = 1'b1;
+ busy_d = 1'b0;
+
+ // Acknowledge response when the last load commits to the VRF, or when the store finishes
+ vlsu_finished_req = 1'b1;
+ end
+ // Do we have a new instruction?
+ else if (commit_insn_valid && !busy_d)
+ busy_d = 1'b1;
+ end: control_proc
+
+ // Is the VRF operation valid and are we at the last one?
+ logic [NrInterfaces-1:0] [N_FU-1:0] commit_operation_valid;
+ logic [NrInterfaces-1:0] [N_FU-1:0] commit_operation_last;
+
+ // Is instruction a load?
+ logic mem_is_load;
+ assign mem_is_load = mem_spatz_req.op_mem.is_load;
+
+ // Signal when we are finished with accessing the memory (necessary
+ // for the case with more than one memory port)
+ assign spatz_mem_finished_o = commit_insn_valid && &(commit_finished_q | (commit_finished_d & commit_counter_en)) && mem_insn_finished_q[commit_insn_q.id];
+ assign spatz_mem_str_finished_o = commit_insn_valid && &(commit_finished_q | (commit_finished_d & commit_counter_en)) && mem_insn_finished_q[commit_insn_q.id] && !commit_insn_q.is_load;
+
+ // Do we start at the very first element
+ logic mem_is_vstart_zero;
+ assign mem_is_vstart_zero = mem_spatz_req.vstart == 'd0;
+
+ // Is the memory address unaligned
+ logic mem_is_addr_unaligned;
+ assign mem_is_addr_unaligned = mem_spatz_req.rs1[int'(MAXEW)-1:0] != '0;
+
+ // Do we have to access every single element on its own
+ logic mem_is_single_element_operation;
+ assign mem_is_single_element_operation = mem_is_addr_unaligned || mem_is_strided || mem_is_indexed || !mem_is_vstart_zero;
+
+ // How large is a single element (in bytes)
+ logic [3:0] mem_single_element_size;
+ assign mem_single_element_size = 1'b1 << mem_spatz_req.vtype.vsew;
+
+ // How large is an index element (in bytes)
+ logic [3:0] mem_idx_single_element_size;
+ assign mem_idx_single_element_size = 1'b1 << 
mem_spatz_req.op_mem.ew; + + // Is the memory address unaligned + logic commit_is_addr_unaligned; + assign commit_is_addr_unaligned = commit_insn_q.rs1[int'(MAXEW)-1:0] != '0; + + // Do we have to access every single element on its own + logic commit_is_single_element_operation; + assign commit_is_single_element_operation = commit_is_addr_unaligned || commit_insn_q.is_strided || commit_insn_q.is_indexed || (commit_insn_q.vstart != '0); + + // Size of an element in the VRF + logic [3:0] commit_single_element_size; + assign commit_single_element_size = 1'b1 << commit_insn_q.vsew; + + //////////////////// + // Offset Queue // + //////////////////// + + // Store the offsets of all loads, for realigning + addr_offset_t [NrInterfaces-1:0] [N_FU-1:0] vreg_addr_offset; + logic [NrInterfaces-1:0] [N_FU-1:0] offset_queue_full; + for (genvar intf = 0; intf < NrInterfaces; intf++) begin : gen_offset_queue_intf + for (genvar fu = 0; fu < N_FU; fu++) begin : gen_offset_queue_intf_fu + fifo_v3 #( + .DATA_WIDTH(int'(MAXEW) ), + .DEPTH (NrOutstandingLoads) + ) i_offset_queue ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .flush_i (1'b0 ), + .testmode_i(1'b0 ), + .empty_o (/* Unused */ ), + .full_o (offset_queue_full[intf][fu] ), + .push_i (spatz_mem_req_valid[intf][fu] && spatz_mem_req_ready[intf][fu] && mem_is_load), + .data_i (mem_req_addr_offset[intf][fu] ), + .data_o (vreg_addr_offset[intf][fu] ), + .pop_i (rob_pop[intf][fu] && commit_insn_q.is_load ), + .usage_o (/* Unused */ ) + ); + end: gen_offset_queue_intf_fu + end: gen_offset_queue_intf + + /////////////////////// + // Output Register // + /////////////////////// + + typedef struct packed { + vrf_addr_t waddr; + vrf_data_t wdata; + vrf_be_t wbe; + + vlsu_rsp_t rsp; + logic rsp_valid; + } vrf_req_t; + + vrf_req_t [NrInterfaces-1:0] vrf_req_d, vrf_req_q; + logic [NrInterfaces-1:0] vrf_req_valid_d, vrf_req_ready_d; + logic [NrInterfaces-1:0] vrf_req_valid_q, vrf_req_ready_q; + logic [NrInterfaces-1:0] vrf_valid_rsp_d, 
vrf_valid_rsp_q, vrf_valid_rsp; + logic [NrInterfaces-1:0] vrf_commit_intf_valid, vrf_commit_intf_valid_q; + logic [NrInterfaces-1:0] resp_overlap; + + for (genvar intf = 0; intf < NrInterfaces; intf++) begin : gen_vrf_req_register_intf + spill_register #( + .T(vrf_req_t) + ) i_vrf_req_register ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .data_i (vrf_req_d[intf] ), + .valid_i(vrf_req_valid_d[intf]), + .ready_o(vrf_req_ready_d[intf]), + .data_o (vrf_req_q[intf] ), + .valid_o(vrf_req_valid_q[intf]), + .ready_i(vrf_req_ready_q[intf]) + ); + + assign vrf_waddr_o[intf] = vrf_req_q[intf].waddr; + assign vrf_wdata_o[intf] = vrf_req_q[intf].wdata; + assign vrf_wbe_o[intf] = vrf_req_q[intf].wbe; + assign vrf_we_o[intf] = vrf_req_valid_q[intf]; + assign vrf_id_o[intf] = {vrf_req_q[intf].rsp.id, mem_spatz_req.id, commit_insn_q.id}; + assign vrf_req_ready_q[intf] = vrf_wvalid_i[intf]; + + // Remember if the interfaces finished writing back to the VRF. + // Clear this notebook once the memory instruction is over. + + // To check if one interface is ahead and has another response to be written to the VRF + // In this case check if there is a response overlap + // If there is a response overlap, do not clear the FF + assign vrf_valid_rsp[intf] = vrf_req_valid_q[intf] & vrf_req_q[intf].rsp_valid; + assign resp_overlap[intf] = vrf_valid_rsp[intf] & vrf_valid_rsp_q[intf]; + + // To track a valid response on an interface until both interfaces finish and can send to the VRF + // When this happens the FF is cleared + // `FFLARNC(vrf_valid_rsp_q[intf], 1'b1, vrf_valid_rsp[intf], vlsu_rsp_valid_o & ~resp_overlap[intf], 1'b0, clk_i, rst_ni) + assign vrf_valid_rsp_d[intf] = (vlsu_rsp_valid_o & ~resp_overlap[intf]) ? 1'b0 : (vrf_valid_rsp[intf] ? 
1'b1 : vrf_valid_rsp_q[intf]);
+ `FF(vrf_valid_rsp_q[intf], vrf_valid_rsp_d[intf], 1'b0)
+
+ // Check if either a previously tracked response or there is a response in the current cycle
+ assign vrf_commit_intf_valid[intf] = vrf_valid_rsp[intf] | vrf_valid_rsp_q[intf];
+ `FF(vrf_commit_intf_valid_q[intf], vrf_commit_intf_valid[intf], 1'b0);
+ end
+
+ ////////////////////////////
+ // Response to Controller //
+ ////////////////////////////
+
+ // Ack when the vector store finishes, or when the vector load commits to the VRF.
+ // With more than one interface, we need to wait until all the interfaces commit to the VRF.
+
+ // Check if interface 1 is the interface trying to commit, if so take resp information from interface 1
+ // If both interfaces in sync, interface 1 is given priority
+ assign resp_intf = vrf_commit_intf_valid_q [1] == 1'b0 ? 1'b1 : 1'b0;
+ assign vlsu_rsp_o = &vrf_commit_intf_valid && |vrf_req_valid_q ? vrf_req_q[resp_intf].rsp : '{id: commit_insn_q.id, default: '0};
+
+ // Send response back to the controller to indicate end of request
+ // Check if both the interfaces have completed request and have a valid response to send
+ // Check if at least one interface has a valid response (interfaces can send responses asynchronously, but they finish the request together)
+ // Set response high if one of the interfaces has a ready indicating the response has been written to the VRF
+ assign vlsu_rsp_valid_o = &vrf_commit_intf_valid && |vrf_req_valid_q ? |vrf_req_ready_q : vlsu_finished_req && !commit_insn_q.is_load;
+
+ //////////////
+ // Counters //
+ //////////////
+
+ // Do we need to catch up to reach element idx parity? 
(Because of non-zero vstart) + vlen_t vreg_start_0; + assign vreg_start_0 = vlen_t'(commit_insn_q.vstart[$clog2(ELENB)-1:0]); + logic [NrInterfaces-1:0] [N_FU-1:0] catchup; + for (genvar intf = 0; intf < NrInterfaces; intf++) begin: gen_catchup_intf + for (genvar fu = 0; fu < N_FU; fu++) begin: gen_catchup_intf_fu + assign catchup[intf][fu] = (commit_counter_q[intf][fu] < vreg_start_0) & (commit_counter_max[intf][fu] != commit_counter_q[intf][fu]); + end: gen_catchup_intf_fu + end: gen_catchup_intf + + for (genvar intf = 0; intf < NrInterfaces; intf++) begin: gen_vreg_counter_proc + for (genvar fu = 0; fu < N_FU; fu++) begin: gen_vreg_counter_proc + localparam int unsigned port = intf * N_FU + fu; + + // The total amount of vector bytes we have to work through + vlen_t max_bytes; + + always_comb begin + // Default value + max_bytes = (commit_insn_q.vl >> $clog2(NrMemPorts*ELENB)) << $clog2(ELENB); + + // Full transfer + if (commit_insn_q.vl[$clog2(ELENB) +: $clog2(NrMemPorts)] > port) + max_bytes += ELENB; + else if (commit_insn_q.vl[$clog2(NrMemPorts*ELENB)-1:$clog2(ELENB)] == port) + max_bytes += commit_insn_q.vl[$clog2(ELENB)-1:0]; + + commit_counter_load[intf][fu] = commit_insn_pop; + commit_counter_d[intf][fu] = (commit_insn_q.vstart >> $clog2(NrMemPorts*ELENB)) << $clog2(ELENB); + if (commit_insn_q.vstart[$clog2(NrMemPorts*ELENB)-1:$clog2(ELENB)] > port) + commit_counter_d[intf][fu] += ELENB; + else if (commit_insn_q.vstart[idx_width(NrMemPorts*ELENB)-1:$clog2(ELENB)] == port) + commit_counter_d[intf][fu] += commit_insn_q.vstart[$clog2(ELENB)-1:0]; + commit_operation_valid[intf][fu] = commit_insn_valid && (commit_counter_q[intf][fu] != max_bytes) && (catchup[intf][fu] || (!catchup[intf][fu] && ~|catchup)); + commit_operation_last[intf][fu] = commit_operation_valid[intf][fu] && ((max_bytes - commit_counter_q[intf][fu]) <= (commit_is_single_element_operation ? 
commit_single_element_size : ELENB)); + commit_counter_delta[intf][fu] = !commit_operation_valid[intf][fu] ? vlen_t'('d0) : commit_is_single_element_operation ? vlen_t'(commit_single_element_size) : commit_operation_last[intf][fu] ? (max_bytes - commit_counter_q[intf][fu]) : vlen_t'(ELENB); + commit_counter_en[intf][fu] = commit_operation_valid[intf][fu] && (commit_insn_q.is_load && vrf_req_valid_d[intf] && vrf_req_ready_d[intf]) || (!commit_insn_q.is_load && vrf_rvalid_i[intf][0] && vrf_re_o[intf][0] && (!mem_is_indexed || vrf_rvalid_i[intf][1])); + commit_counter_max[intf][fu] = max_bytes; + end + end + end + + for (genvar intf = 0; intf < NrInterfaces; intf++) begin: gen_vd_elem_id + assign vd_elem_id[intf] = (commit_counter_q[intf][0] > vreg_start_0) + ? commit_counter_q[intf][0] >> $clog2(ELENB) + : commit_counter_q[intf][3] >> $clog2(ELENB); + end + + for (genvar intf = 0; intf < NrInterfaces; intf++) begin: gen_mem_counter_proc_intf + for (genvar fu = 0; fu < N_FU; fu++) begin: gen_mem_counter_proc_intf_fu + localparam int unsigned port = intf * N_FU + fu; + + // The total amount of vector bytes we have to work through + vlen_t max_bytes; + + always_comb begin + // Default value + max_bytes = (mem_spatz_req.vl >> $clog2(NrMemPorts*MemDataWidthB)) << $clog2(MemDataWidthB); + + if (NrMemPorts == 1) + max_bytes = mem_spatz_req.vl; + else + if (mem_spatz_req.vl[$clog2(MemDataWidthB) +: $clog2(NrMemPorts)] > port) + max_bytes += MemDataWidthB; + else if (mem_spatz_req.vl[$clog2(MemDataWidthB) +: $clog2(NrMemPorts)] == port) + max_bytes += mem_spatz_req.vl[$clog2(MemDataWidthB)-1:0]; + + mem_operation_valid[intf][fu] = mem_spatz_req_valid && (max_bytes != mem_counter_q[intf][fu]); + mem_operation_last[intf][fu] = mem_operation_valid[intf][fu] && ((max_bytes - mem_counter_q[intf][fu]) <= (mem_is_single_element_operation ? 
mem_single_element_size : MemDataWidthB)); + mem_counter_load[intf][fu] = mem_spatz_req_ready; + mem_counter_d[intf][fu] = (mem_spatz_req.vstart >> $clog2(NrMemPorts*MemDataWidthB)) << $clog2(MemDataWidthB); + if (NrMemPorts == 1) + mem_counter_d[intf][fu] = mem_spatz_req.vstart; + else + if (mem_spatz_req.vstart[$clog2(MemDataWidthB) +: $clog2(NrMemPorts)] > port) + mem_counter_d[intf][fu] += MemDataWidthB; + else if (mem_spatz_req.vstart[$clog2(MemDataWidthB) +: $clog2(NrMemPorts)] == port) + mem_counter_d[intf][fu] += mem_spatz_req.vstart[$clog2(MemDataWidthB)-1:0]; + mem_counter_delta[intf][fu] = !mem_operation_valid[intf][fu] ? 'd0 : mem_is_single_element_operation ? mem_single_element_size : mem_operation_last[intf][fu] ? (max_bytes - mem_counter_q[intf][fu]) : MemDataWidthB; + mem_counter_en[intf][fu] = spatz_mem_req_ready[intf][fu] && spatz_mem_req_valid[intf][fu]; + mem_counter_max[intf][fu] = max_bytes; + + // Index counter + mem_idx_counter_d[intf][fu] = mem_counter_d[intf][fu]; + mem_idx_counter_delta[intf][fu] = !mem_operation_valid[intf][fu] ? 
'd0 : mem_idx_single_element_size; + end + end + end + + /////////// + // State // + /////////// + + always_comb begin: p_state + // Maintain state + state_d = state_q; + + unique case (state_q) + VLSU_RunningLoad: begin + if (commit_insn_valid && !commit_insn_q.is_load) + if (&rob_empty) + state_d = VLSU_RunningStore; + end + + VLSU_RunningStore: begin + if (commit_insn_valid && commit_insn_q.is_load) + if (&rob_empty) + state_d = VLSU_RunningLoad; + end + + default:; + endcase + end: p_state + + ////////////////////////// + // Memory/VRF Interface // + ////////////////////////// + + // Memory request signals + id_t [NrInterfaces-1:0] [N_FU-1:0] mem_req_id; + logic [NrInterfaces-1:0] [N_FU-1:0][MemDataWidth-1:0] mem_req_data; + logic [NrInterfaces-1:0] [N_FU-1:0] mem_req_svalid; + logic [NrInterfaces-1:0] [N_FU-1:0][ELEN/8-1:0] mem_req_strb; + logic [NrInterfaces-1:0] [N_FU-1:0] mem_req_lvalid; + logic [NrInterfaces-1:0] [N_FU-1:0] mem_req_last; + + // Number of pending requests + logic [NrInterfaces-1:0] [N_FU-1:0][idx_width(NrOutstandingLoads):0] mem_pending_d, mem_pending_q; + logic [NrInterfaces-1:0] [N_FU-1:0] mem_pending; + `FF(mem_pending_q, mem_pending_d, '{default: '0}) + always_comb begin + // Maintain state + mem_pending_d = mem_pending_q; + + for (int intf = 0; intf < NrInterfaces; intf++) begin + for (int fu = 0; fu < N_FU; fu++) begin + mem_pending[intf][fu] = mem_pending_q[intf][fu] != '0; + + // New request sent + if (mem_is_load && spatz_mem_req_valid[intf][fu] && spatz_mem_req_ready[intf][fu]) + mem_pending_d[intf][fu]++; + + // Response used + if (commit_insn_q.is_load && rob_rvalid[intf][fu] && rob_pop[intf][fu]) + mem_pending_d[intf][fu]--; + end + end + end + + // verilator lint_off LATCH + always_comb begin + for (int intf = 0; intf < NrInterfaces; intf++) begin + vrf_raddr_o[intf] = {vs2_vreg_idx_addr[intf], vd_vreg_addr[intf]}; + vrf_re_o[intf] = '0; + vrf_req_d[intf] = '0; + vrf_req_valid_d[intf] = '0; + + rob_wdata[intf] = '0; + 
rob_wid[intf] = '0; + rob_push[intf] = '0; + rob_pop[intf] = '0; + rob_req_id[intf] = '0; + + mem_req_id[intf] = '0; + mem_req_data[intf] = '0; + mem_req_strb[intf] = '0; + mem_req_svalid[intf] = '0; + mem_req_lvalid[intf] = '0; + mem_req_last[intf] = '0; + + // Propagate request ID + vrf_req_d[intf].rsp.id = commit_insn_q.id; + vrf_req_d[intf].rsp.intf_id = intf; + vrf_req_d[intf].rsp_valid = commit_insn_valid && &commit_finished_d[intf] && mem_insn_finished_d[commit_insn_q.id]; + + // Request indexes + vrf_re_o[intf][1] = mem_is_indexed; + + // Count which vs2 element we should load (indexed loads) + vs2_elem_id_d = vs2_elem_id_q; + for (int intf = 0; intf < NrInterfaces; intf++) begin + if (&(pending_index[intf] ^ ~mem_operation_valid[intf]) && mem_is_indexed) + vs2_elem_id_d[intf] = vs2_elem_id_q[intf] + 1; + end + if (mem_spatz_req_ready) + vs2_elem_id_d = '0; + + if (commit_insn_valid && commit_insn_q.is_load) begin + // If we have a valid element in the buffer, store it back to the register file + if (state_q == VLSU_RunningLoad && |commit_operation_valid[intf]) begin + // Enable write back from an interface to the VRF if we have a valid element in all + // the interface buffers that still have to write something back. + vrf_req_d[intf].waddr = vd_vreg_addr[intf]; + vrf_req_valid_d[intf] = &(rob_rvalid[intf] | ~mem_pending[intf]) && |mem_pending[intf]; + + for (int unsigned fu = 0; fu < N_FU; fu++) begin + automatic int unsigned port = intf * N_FU + fu; + + automatic logic [63:0] data = rob_rdata[intf][fu]; + + // Shift data to correct position if we have an unaligned memory request + if (MAXEW == EW_32) + unique case ((commit_insn_q.is_strided || commit_insn_q.is_indexed) ? 
vreg_addr_offset[intf][fu] : commit_insn_q.rs1[1:0]) + 2'b01: data = {data[7:0], data[31:8]}; + 2'b10: data = {data[15:0], data[31:16]}; + 2'b11: data = {data[23:0], data[31:24]}; + default: data = data; + endcase + else + unique case ((commit_insn_q.is_strided || commit_insn_q.is_indexed) ? vreg_addr_offset[intf][fu] : commit_insn_q.rs1[2:0]) + 3'b001: data = {data[7:0], data[63:8]}; + 3'b010: data = {data[15:0], data[63:16]}; + 3'b011: data = {data[23:0], data[63:24]}; + 3'b100: data = {data[31:0], data[63:32]}; + 3'b101: data = {data[39:0], data[63:40]}; + 3'b110: data = {data[47:0], data[63:48]}; + 3'b111: data = {data[55:0], data[63:56]}; + default: data = data; + endcase + + // Pop stored element and free space in buffer + rob_pop[intf][fu] = rob_rvalid[intf][fu] && vrf_req_valid_d[intf] && vrf_req_ready_d[intf] && commit_counter_en[intf][fu]; + + // Shift data to correct position if we have a strided memory access + if (commit_insn_q.is_strided || commit_insn_q.is_indexed) + if (MAXEW == EW_32) + unique case (commit_counter_q[intf][fu][1:0]) + 2'b01: data = {data[23:0], data[31:24]}; + 2'b10: data = {data[15:0], data[31:16]}; + 2'b11: data = {data[7:0], data[31:8]}; + default: data = data; + endcase + else + unique case (commit_counter_q[intf][fu][2:0]) + 3'b001: data = {data[55:0], data[63:56]}; + 3'b010: data = {data[47:0], data[63:48]}; + 3'b011: data = {data[39:0], data[63:40]}; + 3'b100: data = {data[31:0], data[63:32]}; + 3'b101: data = {data[23:0], data[63:24]}; + 3'b110: data = {data[15:0], data[63:16]}; + 3'b111: data = {data[7:0], data[63:8]}; + default: data = data; + endcase + vrf_req_d[intf].wdata[ELEN*fu +: ELEN] = data; + + // Create write byte enable mask for register file + if (commit_counter_en[intf][fu]) + if (commit_is_single_element_operation) begin + automatic logic [$clog2(ELENB)-1:0] shift = commit_counter_q[intf][fu][$clog2(ELENB)-1:0]; + automatic logic [ELENB-1:0] mask = '1; + case (commit_insn_q.vsew) + EW_8 : mask = 1; + EW_16: 
mask = 3; + EW_32: mask = 15; + default: mask = '1; + endcase + vrf_req_d[intf].wbe[ELENB*fu +: ELENB] = mask << shift; + end else + for (int unsigned k = 0; k < ELENB; k++) + vrf_req_d[intf].wbe[ELENB*fu+k] = k < commit_counter_delta[intf][fu]; + end + end + + for (int unsigned fu = 0; fu < N_FU; fu++) begin + automatic int unsigned port = intf * N_FU + fu; + + // Write the load result to the buffer + rob_wdata[intf][fu] = spatz_mem_rsp_i[port].data; +`ifdef MEMPOOL_SPATZ + rob_wid[intf][fu] = spatz_mem_rsp_i[port].id; + // Need to consider out-of-order memory response + rob_push[intf][fu] = spatz_mem_rsp_valid_i[port] && (state_q == VLSU_RunningLoad) && spatz_mem_rsp_i[port].write == '0; +`else + rob_push[intf][fu] = spatz_mem_rsp_valid_i[port] && (state_q == VLSU_RunningLoad) && store_count_q[intf][fu] == '0; +`endif + if (!rob_full[intf][fu] && !offset_queue_full[intf][fu] && mem_operation_valid[intf][fu]) begin + rob_req_id[intf][fu] = spatz_mem_req_ready[intf][fu] & spatz_mem_req_valid[intf][fu]; + mem_req_lvalid[intf][fu] = (!mem_is_indexed || (vrf_rvalid_i[intf][1] && !pending_index[intf][fu])) && mem_spatz_req.op_mem.is_load; + mem_req_id[intf][fu] = rob_id[intf][fu]; + mem_req_last[intf][fu] = mem_operation_last[intf][fu]; + end + end + // Store operation + end else begin + // Read new element from the register file and store it to the buffer + if (state_q == VLSU_RunningStore && !(|rob_full) && |commit_operation_valid[intf]) begin + vrf_re_o[intf][0] = 1'b1; + + for (int unsigned fu = 0; fu < N_FU; fu++) begin + automatic int unsigned port = intf * N_FU + fu; + + rob_wdata[intf][fu] = vrf_rdata_i[intf][0][ELEN*fu +: ELEN]; + rob_wid[intf][fu] = rob_id[intf][fu]; + rob_req_id[intf][fu] = vrf_rvalid_i[intf][0] && (!mem_is_indexed || vrf_rvalid_i[intf][1]); + rob_push[intf][fu] = rob_req_id[intf][fu]; + end + end + + for (int unsigned fu = 0; fu < N_FU; fu++) begin + // Read element from buffer and execute memory request + if (mem_operation_valid[intf][fu]) 
begin + automatic logic [63:0] data = rob_rdata[intf][fu]; + + // Shift data to lsb if we have a strided or indexed memory access + if (mem_is_strided || mem_is_indexed) + if (MAXEW == EW_32) + unique case (mem_counter_q[intf][fu][1:0]) + 2'b01: data = {data[7:0], data[31:8]}; + 2'b10: data = {data[15:0], data[31:16]}; + 2'b11: data = {data[23:0], data[31:24]}; + default:; // Do nothing + endcase + else + unique case (mem_counter_q[intf][fu][2:0]) + 3'b001: data = {data[7:0], data[63:8]}; + 3'b010: data = {data[15:0], data[63:16]}; + 3'b011: data = {data[23:0], data[63:24]}; + 3'b100: data = {data[31:0], data[63:32]}; + 3'b101: data = {data[39:0], data[63:40]}; + 3'b110: data = {data[47:0], data[63:48]}; + 3'b111: data = {data[55:0], data[63:56]}; + default:; // Do nothing + endcase + + // Shift data to correct position if we have an unaligned memory request + if (MAXEW == EW_32) + unique case ((mem_is_strided || mem_is_indexed) ? mem_req_addr_offset[intf][fu] : mem_spatz_req.rs1[1:0]) + 2'b01: mem_req_data[intf][fu] = {data[23:0], data[31:24]}; + 2'b10: mem_req_data[intf][fu] = {data[15:0], data[31:16]}; + 2'b11: mem_req_data[intf][fu] = {data[7:0], data[31:8]}; + default: mem_req_data[intf][fu] = data; + endcase + else + unique case ((mem_is_strided || mem_is_indexed) ? 
mem_req_addr_offset[intf][fu] : mem_spatz_req.rs1[2:0]) + 3'b001: mem_req_data[intf][fu] = {data[55:0], data[63:56]}; + 3'b010: mem_req_data[intf][fu] = {data[47:0], data[63:48]}; + 3'b011: mem_req_data[intf][fu] = {data[39:0], data[63:40]}; + 3'b100: mem_req_data[intf][fu] = {data[31:0], data[63:32]}; + 3'b101: mem_req_data[intf][fu] = {data[23:0], data[63:24]}; + 3'b110: mem_req_data[intf][fu] = {data[15:0], data[63:16]}; + 3'b111: mem_req_data[intf][fu] = {data[7:0], data[63:8]}; + default: mem_req_data[intf][fu] = data; + endcase + + mem_req_svalid[intf][fu] = rob_rvalid[intf][fu] && (!mem_is_indexed || (vrf_rvalid_i[intf][1] && !pending_index[intf][fu])) && !mem_spatz_req.op_mem.is_load; + mem_req_id[intf][fu] = rob_rid[intf][fu]; + mem_req_last[intf][fu] = mem_operation_last[intf][fu]; + rob_pop[intf][fu] = spatz_mem_req_valid[intf][fu] && spatz_mem_req_ready[intf][fu]; + + // Create byte enable signal for memory request + if (mem_is_single_element_operation) begin + automatic logic [$clog2(ELENB)-1:0] shift = (mem_is_strided || mem_is_indexed) ? 
mem_req_addr_offset[intf][fu] : mem_counter_q[intf][fu][$clog2(ELENB)-1:0] + commit_insn_q.rs1[int'(MAXEW)-1:0]; + automatic logic [MemDataWidthB-1:0] mask = '1; + case (mem_spatz_req.vtype.vsew) + EW_8 : mask = 1; + EW_16: mask = 3; + EW_32: mask = 15; + default: mask = '1; + endcase + mem_req_strb[intf][fu] = mask << shift; + end else + for (int unsigned k = 0; k < ELENB; k++) + mem_req_strb[intf][fu][k] = k < mem_counter_delta[intf][fu]; + end else begin + // Clear empty buffer id requests + if (!rob_empty[intf][fu]) + rob_pop[intf][fu] = 1'b1; + end + end + end + end + end + // verilator lint_on LATCH + + // Create memory requests + for (genvar intf = 0; intf < NrInterfaces; intf++) begin : gen_mem_req + for (genvar fu = 0; fu < N_FU; fu++) begin : gen_mem_req + localparam int unsigned port = intf * N_FU + fu; + + spill_register #( + .T(spatz_mem_req_t) + ) i_spatz_mem_req_register ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .data_i (spatz_mem_req[intf][fu] ), + .valid_i (spatz_mem_req_valid[intf][fu] ), + .ready_o (spatz_mem_req_ready[intf][fu] ), + .data_o (spatz_mem_req_o[port] ), + .valid_o (spatz_mem_req_valid_o[port] ), + .ready_i (spatz_mem_req_ready_i[port] ) + ); +`ifdef MEMPOOL_SPATZ + // ID is required in Mempool-Spatz + assign spatz_mem_req[intf][fu].id = mem_req_id[intf][fu]; + assign spatz_mem_req[intf][fu].addr = mem_req_addr[intf][fu]; + assign spatz_mem_req[intf][fu].mode = '0; // Request always uses user privilege level + assign spatz_mem_req[intf][fu].size = mem_spatz_req.vtype.vsew[1:0]; + assign spatz_mem_req[intf][fu].write = !mem_is_load; + assign spatz_mem_req[intf][fu].strb = mem_req_strb[intf][fu]; + assign spatz_mem_req[intf][fu].data = mem_req_data[intf][fu]; + assign spatz_mem_req[intf][fu].last = mem_req_last[intf][fu]; + assign spatz_mem_req[intf][fu].spec = 1'b0; // Request is never speculative + assign spatz_mem_req_valid[intf][fu] = mem_req_svalid[intf][fu] || mem_req_lvalid[intf][fu]; +`else + assign 
spatz_mem_req[intf][fu].addr = mem_req_addr[intf][fu]; + assign spatz_mem_req[intf][fu].write = !mem_is_load; + assign spatz_mem_req[intf][fu].amo = reqrsp_pkg::AMONone; + assign spatz_mem_req[intf][fu].data = mem_req_data[intf][fu]; + assign spatz_mem_req[intf][fu].strb = mem_req_strb[intf][fu]; + assign spatz_mem_req[intf][fu].user = '0; + assign spatz_mem_req_valid[intf][fu] = mem_req_svalid[intf][fu] || mem_req_lvalid[intf][fu]; +`endif + end + end + + //////////////// + // Assertions // + //////////////// + + if (MemDataWidth != ELEN) + $error("[spatz_vlsu] The memory data width needs to be equal to %d.", ELEN); + + if (NrMemPorts != 2**$clog2(NrMemPorts)) + $error("[spatz_vlsu] The NrMemPorts parameter needs to be a power of two"); + +endmodule : spatz_doublebw_vlsu diff --git a/hw/ip/spatz/src/spatz_pkg.sv.tpl b/hw/ip/spatz/src/spatz_pkg.sv.tpl index 98fd5e2e..4ebf1559 100644 --- a/hw/ip/spatz/src/spatz_pkg.sv.tpl +++ b/hw/ip/spatz/src/spatz_pkg.sv.tpl @@ -7,6 +7,7 @@ package spatz_pkg; import rvv_pkg::*; + import cf_math_pkg::idx_width; ////////////////// // Parameters // @@ -81,6 +82,9 @@ package spatz_pkg; // Number of elements per VRF Bank localparam int unsigned NrWordsPerBank = NrVRFWords / NrVRFBanks; + // Number of VLSU interfaces + localparam int unsigned NumVLSUInterfaces = ${int(cfg['spatz_nports'] / cfg['n_fpu'])}; + // Width of scalar register file adresses // Depends on whether we have a FP regfile or not localparam int GPRWidth = FPU ? 
6 : 5; @@ -309,6 +313,9 @@ package spatz_pkg; // Did the memory request trigger an exception logic exc; + + // Interface that is committing + logic intf_id; } vlsu_rsp_t; % if cfg['mempool']: @@ -330,8 +337,8 @@ package spatz_pkg; logic err; logic write; } spatz_mem_rsp_t; - %endif + //////////////////// // VSLDU Response // //////////////////// @@ -345,30 +352,52 @@ package spatz_pkg; // VRF/SB Ports // ////////////////// - typedef enum logic [2:0] { + typedef enum logic [idx_width(4 + 2 * ${int(cfg['spatz_nports'] / cfg['n_fpu'])}):0] { VFU_VS2_RD, VFU_VS1_RD, VFU_VD_RD, +% if cfg['double_bw']: + VLSU_VS2_RD0, + VLSU_VD_RD0, + VLSU_VS2_RD1, + VLSU_VD_RD1, +%else: VLSU_VS2_RD, VLSU_VD_RD, +%endif VSLDU_VS2_RD } vreg_port_rd_e; - typedef enum logic [1:0] { + typedef enum logic [idx_width(2 + ${int(cfg['spatz_nports'] / cfg['n_fpu'])}):0] { VFU_VD_WD, +% if cfg['double_bw']: + VLSU_VD_WD[${int(cfg['spatz_nports'] / cfg['n_fpu'])}], +%else: VLSU_VD_WD, +%endif VSLDU_VD_WD } vreg_port_wd_e; - typedef enum logic [3:0] { + typedef enum logic [idx_width(6 + 3 * ${int(cfg['spatz_nports'] / cfg['n_fpu'])}):0] { SB_VFU_VS2_RD, SB_VFU_VS1_RD, SB_VFU_VD_RD, +% if cfg['double_bw']: + SB_VLSU_VS2_RD0, + SB_VLSU_VD_RD0, + SB_VLSU_VS2_RD1, + SB_VLSU_VD_RD1, +%else: SB_VLSU_VS2_RD, SB_VLSU_VD_RD, +%endif SB_VSLDU_VS2_RD, SB_VFU_VD_WD, +% if cfg['double_bw']: + SB_VLSU_VD_WD[${int(cfg['spatz_nports'] / cfg['n_fpu'])}], +%else: SB_VLSU_VD_WD, +%endif SB_VSLDU_VD_WD } sb_port_e; diff --git a/hw/ip/spatz/src/spatz_vrf.sv b/hw/ip/spatz/src/spatz_vrf.sv index ea590195..9af612d4 100644 --- a/hw/ip/spatz/src/spatz_vrf.sv +++ b/hw/ip/spatz/src/spatz_vrf.sv @@ -52,11 +52,17 @@ module spatz_vrf endfunction: f_vreg function automatic logic [$clog2(NrVRFBanks)-1:0] f_bank(vrf_addr_t addr); +`ifdef DOUBLE_BW + // Use a simple vector register to bank mapping + // No particular performance benefit from barber pole layout since 3R ports per bank is already available + f_bank = 
addr[$clog2(NrVRFBanks)-1:0]; +`else // Is this vreg divisible by eight? automatic logic [1:0] vreg8 = addr[$clog2(8*NrWordsPerVector) +: 2]; // Barber's pole. Advance the starting bank of each vector by one every eight vector registers. f_bank = addr[$clog2(NrVRFBanks)-1:0] + vreg8; +`endif endfunction: f_bank ///////////// @@ -101,7 +107,11 @@ module spatz_vrf // second priority has the LSU, and third priority has the slide unit. for (int unsigned bank = 0; bank < NrVRFBanks; bank++) begin `ifdef BUF_FPU +`ifdef DOUBLE_BW + automatic logic write_request_vlsu = write_request[bank][VLSU_VD_WD0] | write_request[bank][VLSU_VD_WD1]; +`else automatic logic write_request_vlsu = write_request[bank][VLSU_VD_WD]; +`endif w_vlsu_vfu_conflict[bank] = write_request_vlsu & write_request[bank][VFU_VD_WD]; // Prioritize VFU when VFU buffer usage is high // Otherwise VLSU gets the priority @@ -110,6 +120,64 @@ module spatz_vrf // If no buffering is done, prioritize VFU always w_vfu[bank] = 1'b1; `endif + +`ifdef DOUBLE_BW + if (~w_vfu[bank]) begin + // Prioritize VLSU interfaces + if (write_request[bank][VLSU_VD_WD0]) begin + waddr[bank] = f_vreg(waddr_i[VLSU_VD_WD0]); + wdata[bank] = wdata_i[VLSU_VD_WD0]; + we[bank] = 1'b1; + wbe[bank] = wbe_i[VLSU_VD_WD0]; + wvalid_o[VLSU_VD_WD0] = 1'b1; + end else if (write_request[bank][VLSU_VD_WD1]) begin + waddr[bank] = f_vreg(waddr_i[VLSU_VD_WD1]); + wdata[bank] = wdata_i[VLSU_VD_WD1]; + we[bank] = 1'b1; + wbe[bank] = wbe_i[VLSU_VD_WD1]; + wvalid_o[VLSU_VD_WD1] = 1'b1; + end else if (write_request[bank][VFU_VD_WD]) begin + waddr[bank] = f_vreg(waddr_i[VFU_VD_WD]); + wdata[bank] = wdata_i[VFU_VD_WD]; + we[bank] = 1'b1; + wbe[bank] = wbe_i[VFU_VD_WD]; + wvalid_o[VFU_VD_WD] = 1'b1; + end else if (write_request[bank][VSLDU_VD_WD]) begin + waddr[bank] = f_vreg(waddr_i[VSLDU_VD_WD]); + wdata[bank] = wdata_i[VSLDU_VD_WD]; + we[bank] = 1'b1; + wbe[bank] = wbe_i[VSLDU_VD_WD]; + wvalid_o[VSLDU_VD_WD] = 1'b1; + end + end else begin + // Prioritize 
VFU + if (write_request[bank][VFU_VD_WD]) begin + waddr[bank] = f_vreg(waddr_i[VFU_VD_WD]); + wdata[bank] = wdata_i[VFU_VD_WD]; + we[bank] = 1'b1; + wbe[bank] = wbe_i[VFU_VD_WD]; + wvalid_o[VFU_VD_WD] = 1'b1; + end else if (write_request[bank][VLSU_VD_WD0]) begin + waddr[bank] = f_vreg(waddr_i[VLSU_VD_WD0]); + wdata[bank] = wdata_i[VLSU_VD_WD0]; + we[bank] = 1'b1; + wbe[bank] = wbe_i[VLSU_VD_WD0]; + wvalid_o[VLSU_VD_WD0] = 1'b1; + end else if (write_request[bank][VLSU_VD_WD1]) begin + waddr[bank] = f_vreg(waddr_i[VLSU_VD_WD1]); + wdata[bank] = wdata_i[VLSU_VD_WD1]; + we[bank] = 1'b1; + wbe[bank] = wbe_i[VLSU_VD_WD1]; + wvalid_o[VLSU_VD_WD1] = 1'b1; + end else if (write_request[bank][VSLDU_VD_WD]) begin + waddr[bank] = f_vreg(waddr_i[VSLDU_VD_WD]); + wdata[bank] = wdata_i[VSLDU_VD_WD]; + we[bank] = 1'b1; + wbe[bank] = wbe_i[VSLDU_VD_WD]; + wvalid_o[VSLDU_VD_WD] = 1'b1; + end + end +`else if (~w_vfu[bank]) begin // Prioritize VLSU interfaces if (write_request[bank][VLSU_VD_WD]) begin @@ -153,6 +221,7 @@ module spatz_vrf wvalid_o[VSLDU_VD_WD] = 1'b1; end end +`endif end end @@ -184,18 +253,35 @@ module spatz_vrf raddr[bank][0] = f_vreg(raddr_i[VFU_VS2_RD]); rdata_o[VFU_VS2_RD] = rdata[bank][0]; rvalid_o[VFU_VS2_RD] = 1'b1; - end else if (read_request[bank][VLSU_VS2_RD]) begin + end +`ifdef DOUBLE_BW + else if (read_request[bank][VLSU_VS2_RD0]) begin + raddr[bank][0] = f_vreg(raddr_i[VLSU_VS2_RD0]); + rdata_o[VLSU_VS2_RD0] = rdata[bank][0]; + rvalid_o[VLSU_VS2_RD0] = 1'b1; + end +`else + else if (read_request[bank][VLSU_VS2_RD]) begin raddr[bank][0] = f_vreg(raddr_i[VLSU_VS2_RD]); rdata_o[VLSU_VS2_RD] = rdata[bank][0]; rvalid_o[VLSU_VS2_RD] = 1'b1; end +`endif - // Bank read port 1 - Priority: VFU (1) -> VSLDU + // Bank read port 1 - Priority: VFU (1) -> VLSU -> VSLDU if (read_request[bank][VFU_VS1_RD]) begin raddr[bank][1] = f_vreg(raddr_i[VFU_VS1_RD]); rdata_o[VFU_VS1_RD] = rdata[bank][1]; rvalid_o[VFU_VS1_RD] = 1'b1; - end else if (read_request[bank][VSLDU_VS2_RD]) 
begin + end +`ifdef DOUBLE_BW + else if (read_request[bank][VLSU_VD_RD0]) begin + raddr[bank][1] = f_vreg(raddr_i[VLSU_VD_RD0]); + rdata_o[VLSU_VD_RD0] = rdata[bank][1]; + rvalid_o[VLSU_VD_RD0] = 1'b1; + end +`endif + else if (read_request[bank][VSLDU_VS2_RD]) begin raddr[bank][1] = f_vreg(raddr_i[VSLDU_VS2_RD]); rdata_o[VSLDU_VS2_RD] = rdata[bank][1]; rvalid_o[VSLDU_VS2_RD] = 1'b1; @@ -206,11 +292,25 @@ module spatz_vrf raddr[bank][2] = f_vreg(raddr_i[VFU_VD_RD]); rdata_o[VFU_VD_RD] = rdata[bank][2]; rvalid_o[VFU_VD_RD] = 1'b1; - end else if (read_request[bank][VLSU_VD_RD]) begin + end +`ifdef DOUBLE_BW + // VLSU indices + else if (read_request[bank][VLSU_VS2_RD1]) begin + raddr[bank][2] = f_vreg(raddr_i[VLSU_VS2_RD1]); + rdata_o[VLSU_VS2_RD1] = rdata[bank][2]; + rvalid_o[VLSU_VS2_RD1] = 1'b1; + end else if (read_request[bank][VLSU_VD_RD1]) begin + raddr[bank][2] = f_vreg(raddr_i[VLSU_VD_RD1]); + rdata_o[VLSU_VD_RD1] = rdata[bank][2]; + rvalid_o[VLSU_VD_RD1] = 1'b1; + end +`else + else if (read_request[bank][VLSU_VD_RD]) begin raddr[bank][2] = f_vreg(raddr_i[VLSU_VD_RD]); rdata_o[VLSU_VD_RD] = rdata[bank][2]; rvalid_o[VLSU_VD_RD] = 1'b1; end +`endif end end diff --git a/hw/system/spatz_cluster/Makefile b/hw/system/spatz_cluster/Makefile index e673bf01..892c60f0 100644 --- a/hw/system/spatz_cluster/Makefile +++ b/hw/system/spatz_cluster/Makefile @@ -23,7 +23,14 @@ SPATZ_CLUSTER_CFG_DEFINES += -DSNRT_TCDM_SIZE=$(shell python3 -c "import jstyles SPATZ_CLUSTER_CFG_DEFINES += -DSNRT_NFPU_PER_CORE=$(shell python3 -c "import jstyleson; f = open('$(SPATZ_CLUSTER_CFG_PATH)'); print(jstyleson.load(f)['cluster']['n_fpu'])") # Enable additional uA configurations for the spatz core +DOUBLE_BW := $(shell python3 -c "import jstyleson; print(jstyleson.load(open('$(SPATZ_CLUSTER_CFG_PATH)'))['cluster'].get('double_bw', 0))") BUF_FPU := $(shell python3 -c "import jstyleson; print(jstyleson.load(open('$(SPATZ_CLUSTER_CFG_PATH)'))['cluster'].get('buf_fpu', 0))") + +ifeq 
($(DOUBLE_BW),1) + DEFS += -DDOUBLE_BW + SPATZ_CLUSTER_CFG_DEFINES += -DUNROLL=1 +endif + ifeq ($(BUF_FPU),1) DEFS += -DBUF_FPU endif diff --git a/hw/system/spatz_cluster/cfg/spatz_cluster.carfield.dram.hjson b/hw/system/spatz_cluster/cfg/spatz_cluster.carfield.dram.hjson index 26fbefb4..5cacc3a1 100644 --- a/hw/system/spatz_cluster/cfg/spatz_cluster.carfield.dram.hjson +++ b/hw/system/spatz_cluster/cfg/spatz_cluster.carfield.dram.hjson @@ -21,7 +21,8 @@ "axi_isolate_enable": true, "tcdm": { "size": 128, - "banks": 16 + "banks": 16, + "misalign": false }, "cluster_periph_size": 64, // kB "dma_data_width": 64, @@ -32,6 +33,8 @@ "n_fpu": 4, "n_ipu": 1, "spatz_fpu": true, + "spatz_nports": 4, + "double_bw": 0, "buf_fpu": 0, // Timing parameters "timing": { diff --git a/hw/system/spatz_cluster/cfg/spatz_cluster.carfield.l2.hjson b/hw/system/spatz_cluster/cfg/spatz_cluster.carfield.l2.hjson index 489cf69d..308a17ec 100644 --- a/hw/system/spatz_cluster/cfg/spatz_cluster.carfield.l2.hjson +++ b/hw/system/spatz_cluster/cfg/spatz_cluster.carfield.l2.hjson @@ -32,6 +32,8 @@ "n_fpu": 4, "n_ipu": 1, "spatz_fpu": true, + "spatz_nports": 4, + "double_bw": 0, "buf_fpu": 0, // Timing parameters "timing": { diff --git a/hw/system/spatz_cluster/cfg/spatz_cluster.default.dram.hjson b/hw/system/spatz_cluster/cfg/spatz_cluster.default.dram.hjson index 9ea2a270..55df3d75 100644 --- a/hw/system/spatz_cluster/cfg/spatz_cluster.default.dram.hjson +++ b/hw/system/spatz_cluster/cfg/spatz_cluster.default.dram.hjson @@ -19,7 +19,8 @@ "axi_cdc_enable": false, "tcdm": { "size": 128, - "banks": 16 + "banks": 16, + "misalign": false }, "cluster_periph_size": 64, // kB "dma_data_width": 512, @@ -30,6 +31,8 @@ "n_fpu": 4, "n_ipu": 1, "spatz_fpu": true, + "spatz_nports": 4, + "double_bw": 0, "buf_fpu": 1, // Timing parameters "timing": { diff --git a/hw/system/spatz_cluster/cfg/spatz_cluster.doublebw.dram.hjson b/hw/system/spatz_cluster/cfg/spatz_cluster.doublebw.dram.hjson new file mode 100644 index 
00000000..480dca88 --- /dev/null +++ b/hw/system/spatz_cluster/cfg/spatz_cluster.doublebw.dram.hjson @@ -0,0 +1,101 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Cluster configuration for a simple system. +{ + "cluster": { + "mempool": 0, + "boot_addr": 4096, // 0x1000 + "cluster_base_addr": 1048576, // 0x100000 + "cluster_base_offset": 0, // 0x0 + "cluster_base_hartid": 0, + "addr_width": 32, + "data_width": 64, + "id_width_in": 2, + "id_width_out": 4, + "user_width": 2, + "cluster_default_axi_user": 1, + "axi_cdc_enable": false, + "tcdm": { + "size": 128, + "banks": 16, + "misalign": true + }, + "cluster_periph_size": 64, // kB + "dma_data_width": 512, + "dma_axi_req_fifo_depth": 3, + "dma_req_fifo_depth": 3, + // Spatz parameters + "vlen": 512, + "n_fpu": 4, + "n_ipu": 1, + "spatz_fpu": true, + "spatz_nports": 8, + "double_bw": 1, + "buf_fpu": 1, + // Timing parameters + "timing": { + "lat_comp_fp32": 1, + "lat_comp_fp64": 2, + "lat_comp_fp16": 0, + "lat_comp_fp16_alt": 0, + "lat_comp_fp8": 0, + "lat_comp_fp8_alt": 0, + "lat_noncomp": 1, + "lat_conv": 2, + "lat_sdotp": 2, + "fpu_pipe_config": "BEFORE", + "xbar_latency": "CUT_ALL_PORTS", + + "register_core_req": true, + "register_core_rsp": true, + "register_offload_rsp": true + }, + "cores": [ + // DMA core + { + "isa": "rv32imafd", + "xdma": true, + "xf16": true, + "xf8": true, + "xfdotp": true, + "num_int_outstanding_loads": 1, + "num_int_outstanding_mem": 4, + "num_spatz_outstanding_loads": 4, + "num_dtlb_entries": 1, + "num_itlb_entries": 1 + }, + + // Compute core + { + "isa": "rv32imafd", + "xf16": true, + "xf8": true, + "xfdotp": true, + "xdma": false, + "num_int_outstanding_loads": 1, + "num_int_outstanding_mem": 4, + "num_spatz_outstanding_loads": 4, + "num_dtlb_entries": 1, + "num_itlb_entries": 1 + } + ], + "icache": { + "size": 4, // total instruction cache size in 
kByte + "ways": 2, // number of ways + "cacheline": 256 // word size in bits + } + }, + + "dram": { + // 0x8000_0000 + "address": 2147483648, + // 0x8000_0000 + "length": 2147483648 + }, + + "peripherals": { + + } +} diff --git a/hw/system/spatz_cluster/cfg/spatz_cluster.mempool.dram.hjson b/hw/system/spatz_cluster/cfg/spatz_cluster.mempool.dram.hjson index cc4ab0ef..2cf51ef5 100644 --- a/hw/system/spatz_cluster/cfg/spatz_cluster.mempool.dram.hjson +++ b/hw/system/spatz_cluster/cfg/spatz_cluster.mempool.dram.hjson @@ -18,6 +18,7 @@ tcdm: { size: 128, banks: 16, + misalign: false }, cluster_periph_size: 64, // kB dma_data_width: 512, @@ -28,6 +29,8 @@ n_fpu: 4, n_ipu: 1, spatz_fpu: true, + spatz_nports: 4, + double_bw: 0, buf_fpu: 0, // Timing parameters timing: { diff --git a/hw/system/spatz_cluster/cfg/spatz_cluster.smallvrf.dram.hjson b/hw/system/spatz_cluster/cfg/spatz_cluster.smallvrf.dram.hjson index aa4063c5..7de2cb26 100644 --- a/hw/system/spatz_cluster/cfg/spatz_cluster.smallvrf.dram.hjson +++ b/hw/system/spatz_cluster/cfg/spatz_cluster.smallvrf.dram.hjson @@ -19,7 +19,8 @@ "axi_cdc_enable": false, "tcdm": { "size": 128, - "banks": 16 + "banks": 16, + "misalign": false }, "cluster_periph_size": 64, // kB "dma_data_width": 512, @@ -30,6 +31,8 @@ "n_fpu": 4, "n_ipu": 1, "spatz_fpu": true, + "spatz_nports": 4, + "double_bw": 0, "buf_fpu": 1, // Timing parameters "timing": { diff --git a/hw/system/spatz_cluster/src/spatz_cluster.sv b/hw/system/spatz_cluster/src/spatz_cluster.sv index cbef66ab..caf9a0de 100644 --- a/hw/system/spatz_cluster/src/spatz_cluster.sv +++ b/hw/system/spatz_cluster/src/spatz_cluster.sv @@ -72,6 +72,9 @@ module spatz_cluster // Spatz parameters parameter int unsigned NumSpatzFPUs [NrCores] = '{default: '0}, parameter int unsigned NumSpatzIPUs [NrCores] = '{default: '0}, + parameter int unsigned NumSpatzTCDMPorts [NrCores] = '{default: '0}, + // Misalign rows of the TCDM + parameter logic AddrMisalign = 1'b0, /// ## Timing Tuning 
Parameters /// Insert Pipeline registers into off-loading path (response) parameter bit RegisterOffloadRsp = 1'b0, @@ -155,7 +158,7 @@ module spatz_cluster localparam int unsigned NrSuperBanks = NrBanks / BanksPerSuperBank; function automatic int unsigned get_tcdm_ports(int unsigned core); - return spatz_pkg::N_FU + 1; + return NumSpatzTCDMPorts[core] + 1; endfunction function automatic int unsigned get_tcdm_port_offs(int unsigned core_idx); @@ -545,7 +548,8 @@ module spatz_cluster .user_t (logic ), .MemAddrWidth (TCDMMemAddrWidth ), .DataWidth (AxiDataWidth ), - .MemoryResponseLatency (MemoryMacroLatency) + .MemoryResponseLatency (MemoryMacroLatency), + .AddrMisalign (AddrMisalign ) ) i_dma_interconnect ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -663,7 +667,8 @@ module spatz_cluster .MemAddrWidth (TCDMMemAddrWidth ), .DataWidth (DataWidth ), .user_t (tcdm_user_t ), - .MemoryResponseLatency (1 + RegisterTCDMCuts) + .MemoryResponseLatency (1 + RegisterTCDMCuts), + .AddrMisalign (AddrMisalign ) ) i_tcdm_interconnect ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -740,6 +745,7 @@ module spatz_cluster .IsoCrossing (1'b0 ), .NumSpatzFPUs (NumSpatzFPUs[i] ), .NumSpatzIPUs (NumSpatzIPUs[i] ), + .NumMemPortsPerSpatz (NumSpatzTCDMPorts[i] ), .NumIntOutstandingLoads (NumIntOutstandingLoads[i] ), .NumIntOutstandingMem (NumIntOutstandingMem[i] ), .NumSpatzOutstandingLoads(NumSpatzOutstandingLoads[i]), diff --git a/hw/system/spatz_cluster/src/spatz_cluster_wrapper.sv.tpl b/hw/system/spatz_cluster/src/spatz_cluster_wrapper.sv.tpl index 88a42eea..3ca4f10e 100644 --- a/hw/system/spatz_cluster/src/spatz_cluster_wrapper.sv.tpl +++ b/hw/system/spatz_cluster/src/spatz_cluster_wrapper.sv.tpl @@ -79,7 +79,11 @@ package ${cfg['pkg_name']}; localparam int unsigned TCDMStartAddr = ${to_sv_hex(cfg['cluster_base_addr'], cfg['addr_width'])}; localparam int unsigned TCDMSize = ${to_sv_hex(cfg['tcdm']['size'] * 1024, cfg['addr_width'])}; - +% if cfg['tcdm']['misalign']: + localparam logic AddrMisalign 
= 1'b1; // 0: aligned rows, 1: misaligned rows +%else: + localparam logic AddrMisalign = 1'b0; // 0: aligned rows, 1: misaligned rows
// This generates a bank interleaved addressing scheme, where consecutive // addresses are routed to individual banks. + + // Misalignment logic for TCDM to achieve no conflicts within 2x Spatz VLSU interfaces within a core + // Misalignment pattern : + // Superbank-0 Superbank-1 + // row1 : Block-0 Block-1 + // row2 : Block-3 Block-2 + // row3 : Block-5 Block-4 + // row4 : Block-6 Block-7 + + localparam int unsigned ROWSIZE = $clog2(DataWidth * NumOut / 8); + localparam int unsigned ADDRWIDTH = $bits(req_i[0].q.addr); + addr_t [NumInp-1:0] row; + logic [NumInp-1:0] [ROWSIZE-1:0] addr_shift; + logic [NumInp-1:0] [ADDRWIDTH-1:0] addr_misaligned; + + always_comb begin : gen_addr_misalign + row = '0; + addr_shift = '0; + addr_misaligned = '0; + for (int i = 0; i < NumInp; i++) begin + row[i] = req_i[i].q.addr[ADDRWIDTH-1 : ROWSIZE]; + addr_shift[i] = (row[i][1:0] == 2'b00) || (row[i][1:0] == 2'b11) ? 0 : (DataWidth * NumOut / 8) / 2; + addr_misaligned[i] = req_i[i].q.addr + addr_shift[i]; + addr_misaligned[i][ADDRWIDTH-1 : ROWSIZE] = req_i[i].q.addr[ADDRWIDTH-1 : ROWSIZE]; + end + end + for (genvar i = 0; i < NumInp; i++) begin : gen_bank_select - assign bank_select[i] = req_i[i].q.addr[ByteOffset+:SelWidth]; + assign bank_select[i] = AddrMisalign ? addr_misaligned[i][ByteOffset+:SelWidth] : req_i[i].q.addr[ByteOffset+:SelWidth]; end mem_req_chan_t [NumInp-1:0] in_req; @@ -89,7 +118,7 @@ module spatz_tcdm_interconnect #( assign req_q_valid_flat[i] = req_i[i].q_valid; assign rsp_o[i].q_ready = rsp_q_ready_flat[i]; assign in_req[i] = '{ - addr: req_i[i].q.addr[ByteOffset+SelWidth+:MemAddrWidth], + addr: AddrMisalign ? 
addr_misaligned[i][ByteOffset+SelWidth+:MemAddrWidth] : req_i[i].q.addr[ByteOffset+SelWidth+:MemAddrWidth], write: req_i[i].q.write, amo: req_i[i].q.amo, data: req_i[i].q.data, diff --git a/sw/snRuntime/CMakeLists.txt b/sw/snRuntime/CMakeLists.txt index 6c9dd8c8..0af8a251 100644 --- a/sw/snRuntime/CMakeLists.txt +++ b/sw/snRuntime/CMakeLists.txt @@ -25,7 +25,7 @@ endif() add_compile_options(-O3 -g -ffunction-sections) # Platform sources -if(SPATZ_CLUSTER_CFG MATCHES "^(spatz_cluster\.(default|mempool|smallvrf)\.dram)\.hjson$") +if(SPATZ_CLUSTER_CFG MATCHES "^(spatz_cluster\.(default|mempool|smallvrf|doublebw)\.dram)\.hjson$") set(_plat_folder "standalone") elseif("${SPATZ_CLUSTER_CFG}" MATCHES "^spatz_cluster.carfield\\.(l2|dram)\\.hjson$") set(_plat_folder "cheshire") @@ -41,6 +41,8 @@ set(MEM_SPATZ_CLUSTER_DEFAULT_DRAM_HJSON_ORIGIN 0x80000000) set(MEM_SPATZ_CLUSTER_DEFAULT_DRAM_HJSON_SIZE 0x80000000) set(MEM_SPATZ_CLUSTER_SMALLVRF_DRAM_HJSON_ORIGIN 0x80000000) set(MEM_SPATZ_CLUSTER_SMALLVRF_DRAM_HJSON_SIZE 0x80000000) +set(MEM_SPATZ_CLUSTER_DOUBLEBW_DRAM_HJSON_ORIGIN 0x80000000) +set(MEM_SPATZ_CLUSTER_DOUBLEBW_DRAM_HJSON_SIZE 0x80000000) set(MEM_SPATZ_CLUSTER_CARFIELD_L2_HJSON_ORIGIN 0x78000000) set(MEM_SPATZ_CLUSTER_CARFIELD_L2_HJSON_SIZE 0x00400000) set(MEM_SPATZ_CLUSTER_CARFIELD_DRAM_HJSON_ORIGIN 0x80000000) diff --git a/sw/snRuntime/src/alloc.c b/sw/snRuntime/src/alloc.c index 8fb1c01d..35aee7b4 100644 --- a/sw/snRuntime/src/alloc.c +++ b/sw/snRuntime/src/alloc.c @@ -8,7 +8,7 @@ #define ALIGN_UP(addr, size) (((addr) + (size)-1) & ~((size)-1)) #define ALIGN_DOWN(addr, size) ((addr) & ~((size)-1)) -#define MIN_CHUNK_SIZE 8 +#define MIN_CHUNK_SIZE 256 // Alignment needed when using double VLSU bandwidth /** * @brief Allocate a chunk of memory in the L1 memory From 94dd681fda5a97b8cbf406049ac0db724b2d8375 Mon Sep 17 00:00:00 2001 From: Navaneeth-Kunhi Purayil Date: Fri, 12 Dec 2025 17:20:52 +0100 Subject: [PATCH 2/2] sw: add unrolled kernels for dp-fdotp and 
dp-faxpy --- sw/spatzBenchmarks/CMakeLists.txt | 4 ++ sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.c | 48 ++++++++++++++ sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.h | 2 + sw/spatzBenchmarks/dp-faxpy/main.c | 7 +- sw/spatzBenchmarks/dp-fdotp/kernel/fdotp.c | 76 ++++++++++++++++++++++ sw/spatzBenchmarks/dp-fdotp/kernel/fdotp.h | 3 + sw/spatzBenchmarks/dp-fdotp/main.c | 4 ++ 7 files changed, 143 insertions(+), 1 deletion(-) diff --git a/sw/spatzBenchmarks/CMakeLists.txt b/sw/spatzBenchmarks/CMakeLists.txt index dd96cb0d..44814a4a 100644 --- a/sw/spatzBenchmarks/CMakeLists.txt +++ b/sw/spatzBenchmarks/CMakeLists.txt @@ -24,6 +24,10 @@ include_directories(${SNRUNTIME_INCLUDE_DIRS}) add_compile_options(-O3 -g -ffunction-sections) +# Use unrolled spatzBenchmarks +if (UNROLL) +add_definitions(-DUNROLL) +endif() # Macro to regenerate the golden values and compile a module macro(add_spatz_test_oneParam name file param1) diff --git a/sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.c b/sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.c index a49ef62a..2fd0b598 100644 --- a/sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.c +++ b/sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.c @@ -45,6 +45,54 @@ void faxpy_v64b(const double a, const double *x, const double *y, } while (avl > 0); } +// Unrolled 64-bit AXPY: y = a * x + y +void faxpy_v64b_unrl(const double a, const double *x, const double *y, + unsigned int avl) { + unsigned int vl; + double *y2; + + // Stripmine and accumulate a partial vector + do { + // Set the vl + asm volatile("vsetvli %0, %1, e64, m8, ta, ma" : "=r"(vl) : "r"(avl)); + + // Load vectors + asm volatile("vle64.v v0, (%0)" ::"r"(x)); + asm volatile("vle64.v v8, (%0)" ::"r"(y)); + + // Multiply-accumulate + asm volatile("vfmacc.vf v8, %0, v0" ::"f"(a)); + avl -= vl; + if (avl > 0) { + // Set the vl + asm volatile("vsetvli %0, %1, e64, m8, ta, ma" : "=r"(vl) : "r"(avl)); + + // Load vectors + x += vl; + asm volatile("vle64.v v16, (%0)" ::"r"(x)); + y2 = y + vl; + asm volatile("vle64.v v24, (%0)" 
::"r"(y2)); + + // Multiply-accumulate + asm volatile("vfmacc.vf v24, %0, v16" ::"f"(a)); + } + + // Store results + asm volatile("vse64.v v8, (%0)" ::"r"(y)); + if (avl > 0) { + // Store results + y += vl; + asm volatile("vse64.v v24, (%0)" ::"r"(y)); + avl -= vl; + } + + // Bump pointers + x += vl; + y += vl; + + } while (avl > 0); +} + // 32-bit AXPY: y = a * x + y void faxpy_v32b(const float a, const float *x, const float *y, unsigned int avl) { diff --git a/sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.h b/sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.h index e5625878..a5166afa 100644 --- a/sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.h +++ b/sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.h @@ -21,6 +21,8 @@ inline void faxpy_v64b(const double a, const double *x, const double *y, unsigned int avl) __attribute__((always_inline)); +inline void faxpy_v64b_unrl(const double a, const double *x, const double *y, + unsigned int avl) __attribute__((always_inline)); inline void faxpy_v32b(const float a, const float *x, const float *y, unsigned int avl) __attribute__((always_inline)); inline void faxpy_v16b(const _Float16 a, const _Float16 *x, const _Float16 *y, diff --git a/sw/spatzBenchmarks/dp-faxpy/main.c b/sw/spatzBenchmarks/dp-faxpy/main.c index ff99b4d1..ed5f11b2 100644 --- a/sw/spatzBenchmarks/dp-faxpy/main.c +++ b/sw/spatzBenchmarks/dp-faxpy/main.c @@ -62,6 +62,7 @@ int main() { snrt_dma_start_1d(x, axpy_X_dram, dim * sizeof(double)); snrt_dma_start_1d(y, axpy_Y_dram, dim * sizeof(double)); + snrt_dma_wait_all(); } // Wait for all cores to finish @@ -82,8 +83,12 @@ int main() { if (cid == 0) timer = benchmark_get_cycle(); - // Call AXPY + // Call AXPY +#ifdef UNROLL + faxpy_v64b_unrl(*a, x_int, y_int, dim_core); +#else faxpy_v64b(*a, x_int, y_int, dim_core); +#endif // Wait for all cores to finish snrt_cluster_hw_barrier(); diff --git a/sw/spatzBenchmarks/dp-fdotp/kernel/fdotp.c b/sw/spatzBenchmarks/dp-fdotp/kernel/fdotp.c index d8cf9bf1..8a5adbab 100644 --- 
a/sw/spatzBenchmarks/dp-fdotp/kernel/fdotp.c +++ b/sw/spatzBenchmarks/dp-fdotp/kernel/fdotp.c @@ -59,6 +59,82 @@ double fdotp_v64b(const double *a, const double *b, unsigned int avl) { return red; } +// 64-bit dot-product: a * b +// m8 allows only for partial register re-allocation with factor-2 unrolling +double fdotp_v64b_m8_unrl(const double *a, const double *b, unsigned int avl) { + const unsigned int orig_avl = avl; + unsigned int vl; + + double red; + + // Stripmine and accumulate a partial reduced vector + do { + // Set the vl + asm volatile("vsetvli %0, %1, e64, m8, ta, ma" : "=r"(vl) : "r"(avl)); + + // Load chunk a and b + asm volatile("vle64.v v8, (%0)" ::"r"(a)); + asm volatile("vle64.v v16, (%0)" ::"r"(b)); + + // Multiply and accumulate + if (avl == orig_avl) { + asm volatile("vfmul.vv v24, v8, v16"); + } else { + asm volatile("vfmacc.vv v24, v8, v16"); + } + + // Bump pointers + a += vl; + b += vl; + avl -= vl; + + if (avl <= 0) + break; + + // Set the vl + asm volatile("vsetvli %0, %1, e64, m8, ta, ma" : "=r"(vl) : "r"(avl)); + + // Load chunk a and b + asm volatile("vle64.v v0, (%0)" ::"r"(a)); + asm volatile("vle64.v v8, (%0)" ::"r"(b)); + + // Multiply and accumulate + asm volatile("vfmacc.vv v24, v0, v8"); + + // Bump pointers + a += vl; + b += vl; + avl -= vl; + + if (avl <= 0) + break; + + // Set the vl + asm volatile("vsetvli %0, %1, e64, m8, ta, ma" : "=r"(vl) : "r"(avl)); + + // Load chunk a and b + asm volatile("vle64.v v16, (%0)" ::"r"(a)); + asm volatile("vle64.v v0, (%0)" ::"r"(b)); + + // Multiply and accumulate + asm volatile("vfmacc.vv v24, v0, v16"); + + // Bump pointers + a += vl; + b += vl; + avl -= vl; + } while (avl > 0); + + // Clean the accumulator + asm volatile("vmv.s.x v0, zero"); + + // Reduce and return + asm volatile("vfredusum.vs v0, v24, v0"); + asm volatile("vfmv.f.s %0, v0" : "=f"(red)); + + return red; +} + // 32-bit dot-product: a * b float fdotp_v32b(const float *a, const float *b, unsigned int avl) { const 
unsigned int orig_avl = avl; diff --git a/sw/spatzBenchmarks/dp-fdotp/kernel/fdotp.h b/sw/spatzBenchmarks/dp-fdotp/kernel/fdotp.h index 4b5ac88d..b3b1f1d6 100644 --- a/sw/spatzBenchmarks/dp-fdotp/kernel/fdotp.h +++ b/sw/spatzBenchmarks/dp-fdotp/kernel/fdotp.h @@ -21,6 +21,9 @@ inline double fdotp_v64b(const double *a, const double *b, unsigned int avl) __attribute__((always_inline)); +inline double fdotp_v64b_m8_unrl(const double *a, const double *b, + unsigned int avl) + __attribute__((always_inline)); inline float fdotp_v32b(const float *a, const float *b, unsigned int avl) __attribute__((always_inline)); inline _Float16 fdotp_v16b(const _Float16 *a, const _Float16 *b, diff --git a/sw/spatzBenchmarks/dp-fdotp/main.c b/sw/spatzBenchmarks/dp-fdotp/main.c index ae756478..f4a0524d 100644 --- a/sw/spatzBenchmarks/dp-fdotp/main.c +++ b/sw/spatzBenchmarks/dp-fdotp/main.c @@ -82,7 +82,11 @@ int main() { // Calculate dotp double acc; +#ifdef UNROLL + acc = fdotp_v64b_m8_unrl(a_int, b_int, dim); +#else acc = fdotp_v64b(a_int, b_int, dim); +#endif result[cid] = acc; // Wait for all cores to finish