diff --git a/rtl/common/ternip_dual_port_mem.sv b/rtl/common/ternip_dual_port_mem.sv new file mode 100644 index 0000000..b75ba8b --- /dev/null +++ b/rtl/common/ternip_dual_port_mem.sv @@ -0,0 +1,285 @@ + +`define SAFE_CLOG2(x) ( (((x)==1) || ((x)==0)) ? 1 : $clog2(x) ) + +module ternip_dual_port_mem #( // two request/response ports (a_*, b_*) backed by one shared MEM array + parameter int DATA_WIDTH = 8, + parameter int NUM_ENTRIES = 256, + parameter bit UNCOUPLED_READY = 0 // 1: read responses pass through a ternip_pipelined_interconnect buffer; 0: a registered output stage back-pressures the pipeline directly +) ( + input logic clk_i, + input logic rst_ni, + + output logic a_request_ready_o, + input logic a_request_valid_i, + input logic a_request_write_not_read_i, + input logic [`SAFE_CLOG2(NUM_ENTRIES)-1:0] a_request_addr_i, + input logic [DATA_WIDTH-1:0] a_request_w_data_i, + + input logic a_read_ready_i, + output logic a_read_valid_o, + output logic [`SAFE_CLOG2(NUM_ENTRIES)-1:0] a_read_addr_o, + output logic [DATA_WIDTH-1:0] a_read_data_o, + + output logic b_request_ready_o, + input logic b_request_valid_i, + input logic b_request_write_not_read_i, + input logic [`SAFE_CLOG2(NUM_ENTRIES)-1:0] b_request_addr_i, + input logic [DATA_WIDTH-1:0] b_request_w_data_i, + + input logic b_read_ready_i, + output logic b_read_valid_o, + output logic [`SAFE_CLOG2(NUM_ENTRIES)-1:0] b_read_addr_o, + output logic [DATA_WIDTH-1:0] b_read_data_o +); + +localparam int ADDR_WIDTH = `SAFE_CLOG2(NUM_ENTRIES); + +logic [DATA_WIDTH-1:0] MEM [NUM_ENTRIES]; + +// port A +logic a_read_valid_d, a_read_valid_q1, a_read_valid_q2; +logic a_write_valid_d, a_write_valid_q1, a_write_valid_q2; +logic [ADDR_WIDTH-1:0] a_request_addr_q1, a_request_addr_q2; +logic [DATA_WIDTH-1:0] a_request_w_data_q1; +logic [DATA_WIDTH-1:0] a_read_data_q2; + +logic a_buffer_in_ready; +logic a_buffer_in_valid; +logic [$bits({a_read_addr_o, a_read_data_o})-1:0] a_buffer_in_data; +logic a_buffer_out_ready; +logic a_buffer_out_valid; +logic [$bits({a_read_addr_o, a_read_data_o})-1:0] a_buffer_out_data; + +assign a_read_valid_d = a_request_valid_i && !a_request_write_not_read_i; +assign a_write_valid_d = a_request_valid_i && 
a_request_write_not_read_i; + +logic a_stall1, a_stall2, a_stall3; // stallN freezes pipeline stage N; a_stall3 is only driven when UNCOUPLED_READY == 0 +if (UNCOUPLED_READY) begin + assign a_stall2 = !a_buffer_in_ready && a_read_valid_q2; +end else begin + assign a_stall3 = !a_read_ready_i && a_read_valid_o; + assign a_stall2 = a_stall3 && (a_read_valid_q2 || a_write_valid_q2); +end +assign a_stall1 = a_stall2 && (a_read_valid_q1 || a_write_valid_q1); +assign a_request_ready_o = !a_stall1; + +always_ff @(posedge clk_i) begin + if (!rst_ni) begin + a_read_valid_q1 <= 0; + a_write_valid_q1 <= 0; + end else if (!a_stall1) begin + a_read_valid_q1 <= a_read_valid_d; + a_write_valid_q1 <= a_write_valid_d; + end +end +always_ff @(posedge clk_i) begin + if (!a_stall1) begin + a_request_addr_q1 <= a_request_addr_i; + a_request_w_data_q1 <= a_request_w_data_i; + end + `ifndef SYNTHESIS + if (!rst_ni) begin + a_request_addr_q1 <= 'x; + a_request_w_data_q1 <= 'x; + end + `endif +end +always_ff @(posedge clk_i) begin + if (!rst_ni) begin + a_read_valid_q2 <= 0; + a_write_valid_q2 <= 0; + end else if (!a_stall2) begin + a_read_valid_q2 <= a_read_valid_q1; + a_write_valid_q2 <= a_write_valid_q1; + end +end +always_ff @(posedge clk_i) begin + if (!a_stall2) begin + a_request_addr_q2 <= a_request_addr_q1; + end + `ifndef SYNTHESIS + if (!rst_ni) begin + a_request_addr_q2 <= 'x; + end + `endif +end + +// port B +logic b_read_valid_d, b_read_valid_q1, b_read_valid_q2; +logic b_write_valid_d, b_write_valid_q1, b_write_valid_q2; +logic [ADDR_WIDTH-1:0] b_request_addr_q1, b_request_addr_q2; +logic [DATA_WIDTH-1:0] b_request_w_data_q1; +logic [DATA_WIDTH-1:0] b_read_data_q2; + +logic b_buffer_in_ready; +logic b_buffer_in_valid; +logic [$bits({b_read_addr_o, b_read_data_o})-1:0] b_buffer_in_data; +logic b_buffer_out_ready; +logic b_buffer_out_valid; +logic [$bits({b_read_addr_o, b_read_data_o})-1:0] b_buffer_out_data; + +assign b_read_valid_d = b_request_valid_i && !b_request_write_not_read_i; +assign b_write_valid_d = b_request_valid_i && b_request_write_not_read_i; + 
+logic b_stall1, b_stall2, b_stall3; +if (UNCOUPLED_READY) begin + assign b_stall2 = !b_buffer_in_ready && b_read_valid_q2; +end else begin + assign b_stall3 = !b_read_ready_i && b_read_valid_o; + assign b_stall2 = b_stall3 && (b_read_valid_q2 || b_write_valid_q2); +end +assign b_stall1 = b_stall2 && (b_read_valid_q1 || b_write_valid_q1); +assign b_request_ready_o = !b_stall1; + +always_ff @(posedge clk_i) begin + if (!rst_ni) begin + b_read_valid_q1 <= 0; + b_write_valid_q1 <= 0; + end else if (!b_stall1) begin + b_read_valid_q1 <= b_read_valid_d; + b_write_valid_q1 <= b_write_valid_d; + end +end +always_ff @(posedge clk_i) begin + if (!b_stall1) begin + b_request_addr_q1 <= b_request_addr_i; + b_request_w_data_q1 <= b_request_w_data_i; + end + `ifndef SYNTHESIS + if (!rst_ni) begin + b_request_addr_q1 <= 'x; + b_request_w_data_q1 <= 'x; + end + `endif +end +always_ff @(posedge clk_i) begin + if (!rst_ni) begin + b_read_valid_q2 <= 0; + b_write_valid_q2 <= 0; + end else if (!b_stall2) begin + b_read_valid_q2 <= b_read_valid_q1; + b_write_valid_q2 <= b_write_valid_q1; + end +end +always_ff @(posedge clk_i) begin + if (!b_stall2) begin + b_request_addr_q2 <= b_request_addr_q1; + end + `ifndef SYNTHESIS + if (!rst_ni) begin + b_request_addr_q2 <= 'x; + end + `endif +end + +// shared MEM: both port processes write MEM, so they use plain `always` (always_ff forbids a second writer); same-address writes from both ports in one cycle are not arbitrated +always @(posedge clk_i) begin + if (!a_stall2) begin + if (a_write_valid_q1) begin + MEM[a_request_addr_q1] <= a_request_w_data_q1; + end else if (a_read_valid_q1) begin + a_read_data_q2 <= MEM[a_request_addr_q1]; + end + end +end +always @(posedge clk_i) begin + if (!b_stall2) begin + if (b_write_valid_q1) begin + MEM[b_request_addr_q1] <= b_request_w_data_q1; + end else if (b_read_valid_q1) begin + b_read_data_q2 <= MEM[b_request_addr_q1]; + end + end +end + +// output buffer A +if (UNCOUPLED_READY) begin : a_uncoupled + assign a_buffer_in_valid = a_read_valid_q2; + assign a_buffer_in_data = {a_request_addr_q2, a_read_data_q2}; + + assign a_buffer_out_ready = a_read_ready_i; 
+ assign a_read_valid_o = a_buffer_out_valid; + assign {a_read_addr_o, a_read_data_o} = a_buffer_out_data; + + ternip_pipelined_interconnect #( + .DataWidth($bits(a_buffer_in_data)), + .NumStages(1) + ) a_buffer ( + .clk_i, + .rst_ni, + .in_ready_o(a_buffer_in_ready), + .in_valid_i(a_buffer_in_valid), + .in_data_i (a_buffer_in_data), + .out_ready_i(a_buffer_out_ready), + .out_valid_o(a_buffer_out_valid), + .out_data_o (a_buffer_out_data) + ); +end else begin : a_coupled + always_ff @(posedge clk_i) begin + if (!rst_ni) begin + a_read_valid_o <= 1'b0; + end else if (!a_stall3) begin + a_read_valid_o <= a_read_valid_q2; + end + end + always_ff @(posedge clk_i) begin + if (!a_stall3) begin + a_read_addr_o <= a_read_valid_q2 ? a_request_addr_q2 : 'x; + a_read_data_o <= a_read_valid_q2 ? a_read_data_q2 : 'x; + end + `ifndef SYNTHESIS + if (!rst_ni) begin + a_read_addr_o <= 'x; + a_read_data_o <= 'x; + end + `endif + end +end + +// output buffer B +if (UNCOUPLED_READY) begin : b_uncoupled + assign b_buffer_in_valid = b_read_valid_q2; + assign b_buffer_in_data = {b_request_addr_q2, b_read_data_q2}; + + assign b_buffer_out_ready = b_read_ready_i; + assign b_read_valid_o = b_buffer_out_valid; + assign {b_read_addr_o, b_read_data_o} = b_buffer_out_data; + + ternip_pipelined_interconnect #( + .DataWidth($bits(b_buffer_in_data)), + .NumStages(1) + ) b_buffer ( + .clk_i, + .rst_ni, + .in_ready_o(b_buffer_in_ready), + .in_valid_i(b_buffer_in_valid), + .in_data_i (b_buffer_in_data), + .out_ready_i(b_buffer_out_ready), + .out_valid_o(b_buffer_out_valid), + .out_data_o (b_buffer_out_data) + ); +end else begin : b_coupled + always_ff @(posedge clk_i) begin + if (!rst_ni) begin + b_read_valid_o <= 1'b0; + end else if (!b_stall3) begin + b_read_valid_o <= b_read_valid_q2; + end + end + always_ff @(posedge clk_i) begin + if (!b_stall3) begin + b_read_addr_o <= b_read_valid_q2 ? b_request_addr_q2 : 'x; + b_read_data_o <= b_read_valid_q2 ? 
b_read_data_q2 : 'x; + end + `ifndef SYNTHESIS + if (!rst_ni) begin + b_read_addr_o <= 'x; + b_read_data_o <= 'x; + end + `endif + end +end + +endmodule + +`undef SAFE_CLOG2 diff --git a/rtl/fus/ternip_rms.sv b/rtl/fus/ternip_rms.sv index 4e52172..c7d1440 100644 --- a/rtl/fus/ternip_rms.sv +++ b/rtl/fus/ternip_rms.sv @@ -36,7 +36,9 @@ // Use CLEAR before a new RMS group. Then issue one or more ACCUMULATE commands, // one FINISH_ACCUMULATE command, and one or more NORM commands. The unit accepts // one command on in_* and sequences the vector-register and math handshakes -// until that command finishes. +// until that command finishes. NORM uses the primary vector port for reads and +// vector_request2_* for writes so reads and writes can overlap when the register +// file is dual-ported. module ternip_rms #( parameter int FixedPointPrecision = ternip_pkg::FixedPointPrecision, @@ -87,6 +89,13 @@ module ternip_rms #( input vector_offset_t vector_read_addr_i, input vector_chunk_t vector_read_data_i, + input logic vector_request2_ready_i, + output logic vector_request2_valid_o, + output logic vector_request2_write_not_read_o, + output vector_select_t vector_request2_vector_select_o, + output vector_offset_t vector_request2_vector_addr_o, + output vector_chunk_t vector_request2_w_data_o, + // debug ports output logic accumulator_out_valid_o, output rms_accumulator_t accumulator_out_result_o, @@ -349,6 +358,12 @@ always_comb begin vector_request_w_data_o = 'x; vector_read_ready_o = 0; + vector_request2_valid_o = 0; + vector_request2_write_not_read_o = 'x; + vector_request2_vector_addr_o = 'x; + vector_request2_vector_select_o = 'x; + vector_request2_w_data_o = 'x; + vector_read_counter_d = vector_read_counter_q; vector_processed_counter_d = vector_processed_counter_q; @@ -489,14 +504,25 @@ always_comb begin norm_mul_in_a = vector_read_data_i; norm_mul_in_b = rms_value_reciprocal_q; - // buffer -> write request - if (norm_mul_out_result_buffer_valid_q) begin + // Port 1: 
read request stream + if (vector_read_counter_q < NumChunksPerVector) begin vector_request_valid_o = 1; - vector_request_write_not_read_o = 1; - vector_request_vector_addr_o = vector_processed_counter_q; - vector_request_w_data_o = norm_mul_out_result_buffer_q; - vector_request_vector_select_o = in_vector2_select_q; + vector_request_write_not_read_o = 0; + vector_request_vector_addr_o = vector_read_counter_q; + vector_request_vector_select_o = in_vector1_select_q; if (vector_request_ready_i) begin + vector_read_counter_d++; + end + end + + // Port 2: write request stream (drains multiplier output buffer) + if (norm_mul_out_result_buffer_valid_q) begin + vector_request2_valid_o = 1; + vector_request2_write_not_read_o = 1; + vector_request2_vector_addr_o = vector_processed_counter_q; + vector_request2_w_data_o = norm_mul_out_result_buffer_q; + vector_request2_vector_select_o = in_vector2_select_q; + if (vector_request2_ready_i) begin norm_mul_out_result_buffer_d = 'x; norm_mul_out_result_buffer_valid_d = 0; vector_processed_counter_d++; @@ -510,19 +536,10 @@ always_comb begin vector_processed_counter_d = 'x; end end - end else if (vector_read_counter_q < NumChunksPerVector) begin // read request - // if a read was just received, do not do another read - vector_request_valid_o = 1; - vector_request_write_not_read_o = 0; - vector_request_vector_addr_o = vector_read_counter_q; - vector_request_vector_select_o = in_vector1_select_q; - if (vector_request_ready_i && vector_request_valid_o) begin - vector_read_counter_d++; - end end // multiplier -> buffer - norm_mul_out_ready = {VectorParallelism{ !norm_mul_out_result_buffer_valid_q || vector_request_ready_i }}; + norm_mul_out_ready = {VectorParallelism{ !norm_mul_out_result_buffer_valid_q || vector_request2_ready_i }}; if (norm_mul_out_ready[0] && norm_mul_out_valid[0]) begin norm_mul_out_result_buffer_d = norm_mul_out_result; norm_mul_out_result_buffer_valid_d = 1; diff --git a/rtl/fus/ternip_rowwise_operation.sv 
b/rtl/fus/ternip_rowwise_operation.sv index 620fae8..173478a 100644 --- a/rtl/fus/ternip_rowwise_operation.sv +++ b/rtl/fus/ternip_rowwise_operation.sv @@ -76,7 +76,14 @@ module ternip_rowwise_operation #( output logic vector_read_ready_o, input logic vector_read_valid_i, - input vector_chunk_t vector_read_data_i + input vector_chunk_t vector_read_data_i, + + input logic vector_request2_ready_i, + output logic vector_request2_valid_o, + output logic vector_request2_write_not_read_o, + output vector_select_t vector_request2_vector_select_o, + output vector_offset_t vector_request2_vector_addr_o, + output vector_chunk_t vector_request2_w_data_o ); logic [$clog2(D+1):0] read_request_counter_d, read_request_counter_q; @@ -169,6 +176,11 @@ always_comb begin vector_request_vector_select_o = 0; vector_request_vector_addr_o = 'x; + vector_request2_valid_o = 0; + vector_request2_write_not_read_o = 1; + vector_request2_vector_select_o = 'x; + vector_request2_vector_addr_o = 'x; + vector_read_ready_o = 0; vector1_r_data_d = vector1_r_data_q; @@ -217,13 +229,24 @@ always_comb begin ternip_pkg::SILU: rowwise_silu_in_valid = multicycle_in_valid; endcase - // buffer -> write request - if (multicycle_result_buffer_valid_q) begin + // Port 1: read request stream + if (read_request_counter_q < NumChunksPerVector) begin vector_request_valid_o = 1; - vector_request_write_not_read_o = 1; - vector_request_vector_select_o = vector3_select_q; - vector_request_vector_addr_o = write_request_counter_q; + vector_request_write_not_read_o = 0; + vector_request_vector_select_o = vector1_select_q; + vector_request_vector_addr_o = read_request_counter_q; if (vector_request_ready_i) begin + read_request_counter_d++; + end + end + + // Port 2: write request stream (drains activation buffer) + if (multicycle_result_buffer_valid_q) begin + vector_request2_valid_o = 1; + vector_request2_write_not_read_o = 1; + vector_request2_vector_select_o = vector3_select_q; + vector_request2_vector_addr_o = 
write_request_counter_q; + if (vector_request2_ready_i) begin multicycle_result_buffer_d = 'x; multicycle_result_buffer_valid_d = 0; write_request_counter_d++; @@ -231,19 +254,10 @@ always_comb begin state_d = WAITING_FOR_IN; end end - end else if (read_request_counter_q < NumChunksPerVector) begin // read request - // if a read was just received, do not do another read - vector_request_valid_o = 1; - vector_request_write_not_read_o = 0; - vector_request_vector_select_o = vector1_select_q; - vector_request_vector_addr_o = read_request_counter_q; - if (vector_request_ready_i && vector_request_valid_o) begin - read_request_counter_d++; - end end // sig, csig, silu -> buffer - multicycle_out_ready = !multicycle_result_buffer_valid_q || vector_request_ready_i; + multicycle_out_ready = !multicycle_result_buffer_valid_q || vector_request2_ready_i; case (vector_operation_q) ternip_pkg::SIG: multicycle_out_valid = rowwise_sig_out_valid; ternip_pkg::CSIG: multicycle_out_valid = rowwise_csig_out_valid; @@ -267,28 +281,28 @@ always_comb begin end else if (operation_is_multioperand && !operation_is_multicycle) begin // ADD / SUB + // Port 1: continuously issue read requests (alternating vec1/vec2) + // Port 1 read response: latch vec1, or send (vec1+vec2)->result on port 2 + // Port 2: write result (combinationally produced from current vec2 read response) - vector_read_ready_o = 1; - if (vector_read_valid_i) begin + vector_read_ready_o = (read_response_counter_q % 2 == 0) || vector_request2_ready_i; // hold a vec2 response until port 2 accepts its write, so the result is never dropped + if (vector_read_valid_i && vector_read_ready_o) begin read_response_counter_d++; if (read_response_counter_q % 2 == 0) begin // received vec1 read vector1_r_data_d = vector_read_data_i; - end - end - - // Request issuance: write takes priority over read on the cycle - // an odd response arrives; otherwise issue the next read. 
- if (vector_read_valid_i && (read_response_counter_q % 2 == 1)) begin // received vec2 read -> write - vector_request_valid_o = 1; - vector_request_write_not_read_o = 1; - vector_request_vector_select_o = vector3_select_q; - vector_request_vector_addr_o = write_request_counter_q; - if (vector_request_ready_i) begin - write_request_counter_d++; - if (write_request_counter_q >= NumChunksPerVector-1) begin - state_d = WAITING_FOR_IN; + end else if (read_response_counter_q % 2 == 1) begin // received vec2 read + vector_request2_valid_o = 1; + vector_request2_write_not_read_o = 1; + vector_request2_vector_select_o = vector3_select_q; + vector_request2_vector_addr_o = write_request_counter_q; + if (vector_request2_ready_i) begin + write_request_counter_d++; + if (write_request_counter_q >= NumChunksPerVector-1) begin + state_d = WAITING_FOR_IN; + end end end - end else if (read_request_counter_q < 2*NumChunksPerVector) begin + end + if (read_request_counter_q < 2*NumChunksPerVector) begin vector_request_valid_o = 1; vector_request_write_not_read_o = 0; if (read_request_counter_q % 2 == 0) @@ -324,21 +338,8 @@ always_comb begin ternip_pkg::DIV: for (int i = 0; i < VectorParallelism; i++) rowwise_div_in_valid[i] = multicycle_in_valid; endcase - // buffer -> write request - if (multicycle_result_buffer_valid_q) begin - vector_request_valid_o = 1; - vector_request_write_not_read_o = 1; - vector_request_vector_select_o = vector3_select_q; - vector_request_vector_addr_o = write_request_counter_q; - if (vector_request_ready_i) begin - multicycle_result_buffer_d = 'x; - multicycle_result_buffer_valid_d = 0; - write_request_counter_d++; - if (write_request_counter_q >= NumChunksPerVector-1) begin - state_d = WAITING_FOR_IN; - end - end - end else if (read_request_counter_q < 2*NumChunksPerVector) begin // read request + // Port 1: read request stream (alternating vec1/vec2) + if (read_request_counter_q < 2*NumChunksPerVector) begin vector_request_valid_o = 1; 
vector_request_write_not_read_o = 0; if (read_request_counter_q % 2 == 0) begin // request vec1 read @@ -352,8 +353,24 @@ always_comb begin end end + // Port 2: write request stream (drains multiplier/divider output buffer) + if (multicycle_result_buffer_valid_q) begin + vector_request2_valid_o = 1; + vector_request2_write_not_read_o = 1; + vector_request2_vector_select_o = vector3_select_q; + vector_request2_vector_addr_o = write_request_counter_q; + if (vector_request2_ready_i) begin + multicycle_result_buffer_d = 'x; + multicycle_result_buffer_valid_d = 0; + write_request_counter_d++; + if (write_request_counter_q >= NumChunksPerVector-1) begin + state_d = WAITING_FOR_IN; + end + end + end + // multioperand output -> buffer - multicycle_out_ready = !multicycle_result_buffer_valid_q || vector_request_ready_i; + multicycle_out_ready = !multicycle_result_buffer_valid_q || vector_request2_ready_i; if (vector_operation_q == ternip_pkg::MUL) begin for (int i = 0; i < VectorParallelism; i++) rowwise_mul_out_ready[i] = multicycle_out_ready; multicycle_out_valid = all_mul_out_valid; @@ -532,16 +549,18 @@ ternip_silu_parallelized #( .vector_data_o(rowwise_silu_result) ); +assign vector_request_w_data_o = 'x; + always_comb begin unique case (vector_operation_q) - ternip_pkg::ADD: vector_request_w_data_o = rowwise_add_result; - ternip_pkg::SUB: vector_request_w_data_o = rowwise_sub_result; - ternip_pkg::MUL: vector_request_w_data_o = multicycle_result_buffer_q; - ternip_pkg::DIV: vector_request_w_data_o = multicycle_result_buffer_q; - ternip_pkg::SIG: vector_request_w_data_o = multicycle_result_buffer_q; - ternip_pkg::CSIG: vector_request_w_data_o = multicycle_result_buffer_q; - ternip_pkg::SILU: vector_request_w_data_o = multicycle_result_buffer_q; - default: vector_request_w_data_o = 'x; + ternip_pkg::ADD: vector_request2_w_data_o = rowwise_add_result; + ternip_pkg::SUB: vector_request2_w_data_o = rowwise_sub_result; + ternip_pkg::MUL: vector_request2_w_data_o = 
multicycle_result_buffer_q; + ternip_pkg::DIV: vector_request2_w_data_o = multicycle_result_buffer_q; + ternip_pkg::SIG: vector_request2_w_data_o = multicycle_result_buffer_q; + ternip_pkg::CSIG: vector_request2_w_data_o = multicycle_result_buffer_q; + ternip_pkg::SILU: vector_request2_w_data_o = multicycle_result_buffer_q; + default: vector_request2_w_data_o = 'x; endcase end diff --git a/rtl/ternip/ternip_core.sv b/rtl/ternip/ternip_core.sv index ea0bac5..9d4660b 100644 --- a/rtl/ternip/ternip_core.sv +++ b/rtl/ternip/ternip_core.sv @@ -118,6 +118,25 @@ vector_select_t vector_request_vector_select; vector_offset_t vector_request_vector_addr; vector_chunk_t vector_request_w_data; +logic vector_request2_ready; +logic vector_request2_valid; +logic vector_request2_write_not_read; +vector_select_t vector_request2_vector_select; +vector_offset_t vector_request2_vector_addr; +vector_chunk_t vector_request2_w_data; + +logic rms_vector_request2_valid; +logic rms_vector_request2_write_not_read; +vector_select_t rms_vector_request2_vector_select; +vector_offset_t rms_vector_request2_vector_addr; +vector_chunk_t rms_vector_request2_w_data; + +logic rowwise_operation_vector_request2_valid; +logic rowwise_operation_vector_request2_write_not_read; +vector_select_t rowwise_operation_vector_request2_vector_select; +vector_offset_t rowwise_operation_vector_request2_vector_addr; +vector_chunk_t rowwise_operation_vector_request2_w_data; + logic vector_read_ready; logic vector_read_valid; vector_offset_t vector_read_addr; @@ -140,11 +159,24 @@ ternip_vector_registers #( .request_vector_addr_i(vector_request_vector_addr), .request_w_data_i(vector_request_w_data), + .request2_ready_o(vector_request2_ready), + .request2_valid_i(vector_request2_valid), + .request2_write_not_read_i(vector_request2_write_not_read), + .request2_vector_select_i(vector_request2_vector_select), + .request2_vector_addr_i(vector_request2_vector_addr), + .request2_w_data_i(vector_request2_w_data), + 
.read_ready_i(vector_read_ready), .read_valid_o(vector_read_valid), .read_vector_select_o(), .read_addr_o(vector_read_addr), - .read_data_o(vector_read_data) + .read_data_o(vector_read_data), + + .read2_ready_i(1'b1), // second-port read responses are left unconnected below, so always accept them + .read2_valid_o(), + .read2_vector_select_o(), + .read2_addr_o(), + .read2_data_o() ); logic loadstore_in_ready; @@ -255,7 +287,14 @@ ternip_rowwise_operation #( .vector_read_ready_o(rowwise_operation_vector_read_ready), .vector_read_valid_i(vector_read_valid), - .vector_read_data_i(vector_read_data) + .vector_read_data_i(vector_read_data), + + .vector_request2_ready_i(vector_request2_ready), + .vector_request2_valid_o(rowwise_operation_vector_request2_valid), + .vector_request2_write_not_read_o(rowwise_operation_vector_request2_write_not_read), + .vector_request2_vector_select_o(rowwise_operation_vector_request2_vector_select), + .vector_request2_vector_addr_o(rowwise_operation_vector_request2_vector_addr), + .vector_request2_w_data_o(rowwise_operation_vector_request2_w_data) ); logic rms_in_ready; @@ -311,6 +350,13 @@ ternip_rms #( .vector_read_addr_i(vector_read_addr), .vector_read_data_i(vector_read_data), + .vector_request2_ready_i(vector_request2_ready), + .vector_request2_valid_o(rms_vector_request2_valid), + .vector_request2_write_not_read_o(rms_vector_request2_write_not_read), + .vector_request2_vector_select_o(rms_vector_request2_vector_select), + .vector_request2_vector_addr_o(rms_vector_request2_vector_addr), + .vector_request2_w_data_o(rms_vector_request2_w_data), + .accumulator_out_valid_o(), .accumulator_out_result_o(), .rms_value_reciprocal_o(), @@ -441,6 +487,32 @@ assign vector_read_ready = (loadstore_vector_read_ready | rowwise_operation_vector_read_ready | tmatmul_vector_read_ready); +always_comb begin + vector_request2_valid = 0; + vector_request2_write_not_read = 'x; + vector_request2_vector_select = 'x; + vector_request2_vector_addr = 'x; + vector_request2_w_data = 'x; + + unique case (1) // select on the request2 valids of rms and rowwise_operation; with neither valid the defaults above stand. NOTE(review): assumes the two are never valid together - confirm + rms_vector_request2_valid: begin + 
vector_request2_valid = 1; + vector_request2_write_not_read = rms_vector_request2_write_not_read; + vector_request2_vector_select = rms_vector_request2_vector_select; + vector_request2_vector_addr = rms_vector_request2_vector_addr; + vector_request2_w_data = rms_vector_request2_w_data; + end + rowwise_operation_vector_request2_valid: begin + vector_request2_valid = 1; + vector_request2_write_not_read = rowwise_operation_vector_request2_write_not_read; + vector_request2_vector_select = rowwise_operation_vector_request2_vector_select; + vector_request2_vector_addr = rowwise_operation_vector_request2_vector_addr; + vector_request2_w_data = rowwise_operation_vector_request2_w_data; + end + default: ; + endcase +end + `ifndef SYNTHESIS always @(posedge clk_i) if (rst_ni) begin assert final (1 >= $countones({ diff --git a/rtl/ternip_vector_registers.sv b/rtl/ternip_vector_registers.sv index f0dda94..4b5d755 100644 --- a/rtl/ternip_vector_registers.sv +++ b/rtl/ternip_vector_registers.sv @@ -32,6 +32,8 @@ // vector register and one chunk address, then either writes that chunk or starts // a read for that chunk. // +// The request2_* and read2_* interfaces are a second port to the same storage, with their own ready/valid handshakes. +// // Use request_ready_o/request_valid_i for both reads and writes. Reads return // later on read_valid_o/read_data_o, along with the vector and chunk address // that were read. 
@@ -62,33 +64,60 @@ module ternip_vector_registers #( output logic read_valid_o, output vector_select_t read_vector_select_o, output vector_offset_t read_addr_o, - output vector_chunk_t read_data_o + output vector_chunk_t read_data_o, + + output logic request2_ready_o, + input logic request2_valid_i, + input logic request2_write_not_read_i, + input vector_select_t request2_vector_select_i, + input vector_offset_t request2_vector_addr_i, + input vector_chunk_t request2_w_data_i, + + input logic read2_ready_i, + output logic read2_valid_o, + output vector_select_t read2_vector_select_o, + output vector_offset_t read2_addr_o, + output vector_chunk_t read2_data_o ); logic [$bits(vector_select_t)+$bits(vector_offset_t)-1:0] request_mem_addr, read_mem_addr; +logic [$bits(vector_select_t)+$bits(vector_offset_t)-1:0] request2_mem_addr, read2_mem_addr; // port-2 flat memory addresses: {vector select, chunk offset} assign request_mem_addr = {request_vector_select_i, request_vector_addr_i}; +assign request2_mem_addr = {request2_vector_select_i, request2_vector_addr_i}; assign {read_vector_select_o, read_addr_o} = read_mem_addr; +assign {read2_vector_select_o, read2_addr_o} = read2_mem_addr; localparam int D_rounded_up = 2**$clog2(D); -ternip_pipelined_mem #( - .DATA_WIDTH(FixedPointPrecision * VectorParallelism), +ternip_dual_port_mem #( + .DATA_WIDTH($bits(read_data_o)), .NUM_ENTRIES(NumVectorRegisters * D_rounded_up / VectorParallelism), - .DECOUPLED_READY(1) -) pipelined_mem ( + .UNCOUPLED_READY(1) +) dual_port_mem ( .clk_i, .rst_ni, - .request_ready_o, - .request_valid_i, - .request_write_not_read_i, - .request_addr_i(request_mem_addr), - .request_w_data_i, + .a_request_ready_o(request_ready_o), + .a_request_valid_i(request_valid_i), + .a_request_write_not_read_i(request_write_not_read_i), + .a_request_addr_i(request_mem_addr), + .a_request_w_data_i(request_w_data_i), + + .a_read_ready_i(read_ready_i), + .a_read_valid_o(read_valid_o), + .a_read_addr_o(read_mem_addr), + .a_read_data_o(read_data_o), + + 
.b_request_ready_o(request2_ready_o), + .b_request_valid_i(request2_valid_i), + .b_request_write_not_read_i(request2_write_not_read_i), + .b_request_addr_i(request2_mem_addr), + .b_request_w_data_i(request2_w_data_i), - .read_ready_i, - .read_valid_o, - .read_addr_o(read_mem_addr), - .read_data_o + .b_read_ready_i(read2_ready_i), + .b_read_valid_o(read2_valid_o), + .b_read_addr_o(read2_mem_addr), + .b_read_data_o(read2_data_o) ); endmodule diff --git a/ternip.core b/ternip.core index 37939c1..09cee82 100644 --- a/ternip.core +++ b/ternip.core @@ -19,6 +19,7 @@ filesets: - rtl/common/ternip_multioperand_accumulator.sv - rtl/common/ternip_pipelined_interconnect.sv - rtl/common/ternip_pipelined_mem.sv + - rtl/common/ternip_dual_port_mem.sv - rtl/math/ternip_add.sv - rtl/math/ternip_sub.sv - rtl/math/ternip_mul.sv