Skip to content

Commit e28dc3a

Browse files
committed
- Add support for the load-bytes packing extension
- Turn off TCU sparsity by default
- Optimize UOP logic
- Add byte-selection support to the commit interface
1 parent c7113c5 commit e28dc3a

26 files changed

Lines changed: 713 additions & 181 deletions

hw/VX_config.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ TCU_BF16_ENABLE = true
228228
TCU_FP8_ENABLE = true
229229
TCU_INT_ENABLE = true
230230
TCU_MXI8_ENABLE = false
231-
TCU_SPARSE_ENABLE = true
231+
TCU_SPARSE_ENABLE = false
232232

233233
[isa_signatures]
234234

hw/rtl/VX_define.vh

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,15 @@
1616

1717
`include "VX_platform.vh"
1818
`include "VX_config.vh"
19-
`include "VX_types.vh"
19+
`include "VX_types.vh"
2020

21-
`ifdef SV_DPI
22-
`include "dpi_util.vh"
23-
`endif
24-
25-
`ifdef ICACHE_ENABLE
26-
`define L1_ENABLE
27-
`endif
21+
`ifdef SV_DPI
22+
`include "dpi_util.vh"
23+
`endif
24+
25+
`ifdef ICACHE_ENABLE
26+
`define L1_ENABLE
27+
`endif
2828

2929
`ifdef DCACHE_ENABLE
3030
`define L1_ENABLE
@@ -471,6 +471,7 @@
471471
logic wb; \
472472
logic [NUM_XREGS-1:0] wr_xregs; \
473473
logic [NUM_REGS_BITS-1:0] rd; \
474+
logic [BYTESEL_BITS-1:0] bytesel; \
474475
} __name__``_header_t; \
475476
typedef struct packed { \
476477
__name__``_header_t header; \

hw/rtl/VX_gpu_pkg.sv

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@ package VX_gpu_pkg;
3131
localparam NB_WIDTH = `UP(NB_BITS);
3232

3333
localparam XLENB = `XLEN / 8;
34+
localparam XLENB_W = `CLOG2(XLENB);
35+
localparam BYTESEL_BITS = (XLENB_W + XLENB_W);
36+
localparam [BYTESEL_BITS-1:0] BYTESEL_DEFAULT = {XLENB_W'(XLENB-1), XLENB_W'(0)};
3437

3538
localparam RV_REGS = 32;
3639
localparam RV_REGS_BITS = 5;
@@ -76,7 +79,8 @@ package VX_gpu_pkg;
7679

7780
localparam BAR_SIZE_W = `MAX(NW_WIDTH, NC_WIDTH);
7881

79-
localparam UOP_TCU = 0;
82+
localparam UOP_PACKLD = 0;
83+
localparam UOP_TCU = UOP_PACKLD + 1;
8084
localparam UOP_DXA = UOP_TCU + `EXT_TCU_ENABLED;
8185
localparam UOP_MAX = UOP_DXA + `EXT_DXA_ENABLED;
8286
localparam UOP_CTR_W = 8;
@@ -556,7 +560,8 @@ package VX_gpu_pkg;
556560
`PACKAGE_ASSERT($bits(fpu_args_t) == INST_ARGS_BITS)
557561

558562
typedef struct packed {
559-
logic [(INST_ARGS_BITS-1-1-12)-1:0] __padding;
563+
logic [(INST_ARGS_BITS-1-1-12-2)-1:0] __padding; // 9 bits
564+
logic [1:0] pack; // 0=normal, 1=PACKLB (4×byte), 2=PACKLH (2×halfword)
560565
logic is_store;
561566
logic is_float;
562567
logic [11:0] offset;
@@ -637,6 +642,7 @@ package VX_gpu_pkg;
637642
logic [NUM_XREGS-1:0] wr_xregs;
638643
logic [NUM_SRC_OPDS-1:0] used_rs;
639644
logic [NUM_REGS_BITS-1:0] rd;
645+
logic [BYTESEL_BITS-1:0] bytesel;
640646
logic [NUM_REGS_BITS-1:0] rs1;
641647
logic [NUM_REGS_BITS-1:0] rs2;
642648
logic [NUM_REGS_BITS-1:0] rs3;
@@ -654,6 +660,7 @@ package VX_gpu_pkg;
654660
logic [NUM_XREGS-1:0] wr_xregs;
655661
logic [NUM_SRC_OPDS-1:0] used_rs;
656662
logic [NUM_REGS_BITS-1:0] rd;
663+
logic [BYTESEL_BITS-1:0] bytesel;
657664
logic [NUM_REGS_BITS-1:0] rs1;
658665
logic [NUM_REGS_BITS-1:0] rs2;
659666
logic [NUM_REGS_BITS-1:0] rs3;
@@ -671,6 +678,7 @@ package VX_gpu_pkg;
671678
logic [NUM_XREGS-1:0] wr_xregs;
672679
logic [NUM_SRC_OPDS-1:0] used_rs;
673680
logic [NUM_REGS_BITS-1:0] rd;
681+
logic [BYTESEL_BITS-1:0] bytesel;
674682
logic [NUM_REGS_BITS-1:0] rs1;
675683
logic [NUM_REGS_BITS-1:0] rs2;
676684
logic [NUM_REGS_BITS-1:0] rs3;
@@ -688,6 +696,7 @@ package VX_gpu_pkg;
688696
logic wb;
689697
logic [NUM_XREGS-1:0] wr_xregs;
690698
logic [NUM_REGS_BITS-1:0] rd;
699+
logic [BYTESEL_BITS-1:0] bytesel;
691700
logic [`SIMD_WIDTH-1:0][`XLEN-1:0] rs1_data;
692701
logic [`SIMD_WIDTH-1:0][`XLEN-1:0] rs2_data;
693702
logic [`SIMD_WIDTH-1:0][`XLEN-1:0] rs3_data;
@@ -705,6 +714,7 @@ package VX_gpu_pkg;
705714
logic wb;
706715
logic [NUM_XREGS-1:0] wr_xregs;
707716
logic [NUM_REGS_BITS-1:0] rd;
717+
logic [BYTESEL_BITS-1:0] bytesel;
708718
logic [INST_OP_BITS-1:0] op_type;
709719
op_args_t op_args;
710720
logic [`SIMD_WIDTH-1:0][`XLEN-1:0] rs1_data;
@@ -723,6 +733,7 @@ package VX_gpu_pkg;
723733
logic wb;
724734
logic [NUM_XREGS-1:0] wr_xregs;
725735
logic [NUM_REGS_BITS-1:0] rd;
736+
logic [BYTESEL_BITS-1:0] bytesel;
726737
logic [`SIMD_WIDTH-1:0][`XLEN-1:0] data;
727738
logic sop;
728739
logic eop;
@@ -737,6 +748,7 @@ package VX_gpu_pkg;
737748
logic wb;
738749
logic [NUM_XREGS-1:0] wr_xregs;
739750
logic [NUM_REGS_BITS-1:0] rd;
751+
logic [`SIMD_WIDTH-1:0][XLENB-1:0] byteen;
740752
logic [`SIMD_WIDTH-1:0][`XLEN-1:0] data;
741753
logic sop;
742754
logic eop;

hw/rtl/VX_trace_pkg.sv

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,16 @@ package VX_trace_pkg;
169169
endcase
170170
end
171171
EX_LSU: begin
172-
if (op_args.lsu.is_float) begin
172+
if (op_args.lsu.pack != 0) begin
173+
case (op_args.lsu.pack)
174+
2'd1: begin
175+
`TRACE(level, ("PACKLB.F"))
176+
end
177+
default: begin
178+
`TRACE(level, ("PACKLH.F"))
179+
end
180+
endcase
181+
end else if (op_args.lsu.is_float) begin
173182
case (INST_LSU_BITS'(op_type))
174183
INST_LSU_LW: `TRACE(level, ("FLW"))
175184
INST_LSU_LD: `TRACE(level, ("FLD"))

hw/rtl/core/VX_commit.sv

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,25 @@ module VX_commit import VX_gpu_pkg::*; #(
7676
// Writeback
7777

7878
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_writeback
79+
wire [XLENB_W-1:0] bytesel_size = commit_arb_if[i].data.bytesel[BYTESEL_BITS-1 -: XLENB_W];
80+
wire [XLENB_W-1:0] bytesel_off = commit_arb_if[i].data.bytesel[0 +: XLENB_W];
81+
wire [`SIMD_WIDTH-1:0][`XLEN-1:0] writeback_data;
82+
wire [`SIMD_WIDTH-1:0][XLENB-1:0] writeback_byteen;
83+
84+
// size_mask: the size field encodes (byte count - 1), so set its low (size + 1) bits.
85+
// Built by shifting an all-ones vector rather than comparing against
86+
// XLENB_W'(7)..XLENB_W'(1): when XLENB_W == 2 (XLEN=32) those casts truncate
87+
// (e.g. XLENB_W'(4) == 0), so the early arms of a constant ternary chain shadow
88+
// the later ones and every size resolves to the full-width mask.
89+
// No underflow below, assuming XLENB is a power of two (bytesel_size <= XLENB-1).
90+
wire [XLENB-1:0] size_mask = {XLENB{1'b1}} >> (XLENB_W'(XLENB-1) - bytesel_size);
91+
wire [XLENB-1:0] base_byteen = size_mask << bytesel_off;
92+
93+
for (genvar lane = 0; lane < `SIMD_WIDTH; ++lane) begin : g_bytesel
94+
assign writeback_data[lane] = commit_arb_if[i].data.data[lane] << (8 * bytesel_off);
95+
assign writeback_byteen[lane] = commit_arb_if[i].data.tmask[lane] ? base_byteen : '0;
96+
end
97+
7998
assign writeback_if[i].valid = commit_arb_if[i].valid;
8099
assign writeback_if[i].data.uuid = commit_arb_if[i].data.uuid;
81100
assign writeback_if[i].data.wis = wid_to_wis(commit_arb_if[i].data.wid);
@@ -85,7 +104,8 @@ module VX_commit import VX_gpu_pkg::*; #(
85104
assign writeback_if[i].data.wb = commit_arb_if[i].data.wb;
86105
assign writeback_if[i].data.wr_xregs = commit_arb_if[i].data.wr_xregs;
87106
assign writeback_if[i].data.rd = commit_arb_if[i].data.rd;
88-
assign writeback_if[i].data.data = commit_arb_if[i].data.data;
107+
assign writeback_if[i].data.byteen = writeback_byteen;
108+
assign writeback_if[i].data.data = writeback_data;
89109
assign writeback_if[i].data.sop = commit_arb_if[i].data.sop;
90110
assign writeback_if[i].data.eop = commit_arb_if[i].data.eop;
91111
assign commit_arb_if[i].ready = 1;

hw/rtl/core/VX_decode.sv

Lines changed: 50 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ module VX_decode import VX_gpu_pkg::*; #(
4040
reg [NUM_SRC_OPDS:0] use_regs;
4141
reg [NUM_XREGS-1:0] rd_xregs;
4242
reg [NUM_XREGS-1:0] wr_xregs;
43+
reg [BYTESEL_BITS-1:0] bytesel;
4344
reg is_wstall;
4445

4546
wire [31:0] instr = fetch_if.data.instr;
@@ -157,6 +158,7 @@ module VX_decode import VX_gpu_pkg::*; #(
157158
use_regs = '0;
158159
rd_xregs = '0;
159160
wr_xregs = '0;
161+
bytesel = BYTESEL_DEFAULT;
160162
is_wstall = 0;
161163

162164
case (opcode)
@@ -297,7 +299,8 @@ module VX_decode import VX_gpu_pkg::*; #(
297299
op_type = INST_LSU_FENCE;
298300
op_args.lsu.is_store = 0;
299301
op_args.lsu.is_float = 0;
300-
op_args.lsu.offset = 0;
302+
op_args.lsu.pack = 0;
303+
op_args.lsu.offset = 0;
301304
end
302305
INST_SYS : begin
303306
if (funct3[1:0] != 0) begin
@@ -332,7 +335,8 @@ module VX_decode import VX_gpu_pkg::*; #(
332335
op_type = INST_OP_BITS'({1'b0, funct3});
333336
op_args.lsu.is_store = 0;
334337
op_args.lsu.is_float = opcode[2];
335-
op_args.lsu.offset = u_12;
338+
op_args.lsu.pack = 0;
339+
op_args.lsu.offset = u_12;
336340
`USED_IREG (rs1);
337341
`ifdef EXT_F_ENABLE
338342
`USED_REG (opcode[2], rd, 1'b1);
@@ -348,7 +352,8 @@ module VX_decode import VX_gpu_pkg::*; #(
348352
op_type = INST_OP_BITS'({1'b1, funct3});
349353
op_args.lsu.is_store = 1;
350354
op_args.lsu.is_float = opcode[2];
351-
op_args.lsu.offset = s_imm;
355+
op_args.lsu.pack = 0;
356+
op_args.lsu.offset = s_imm;
352357
`USED_IREG (rs1);
353358
`ifdef EXT_F_ENABLE
354359
`USED_REG (opcode[2], rs2, 1'b1);
@@ -544,19 +549,6 @@ module VX_decode import VX_gpu_pkg::*; #(
544549
end
545550
op_type = INST_OP_BITS'(funct3);
546551
end
547-
`ifdef EXT_DXA_ENABLE
548-
7'h03: begin // DXA issue (dimension-specific)
549-
// funct3 encodes dimensionality: 0=1D .. 4=5D.
550-
// Expanded into micro-ops by VX_dxa_uops.
551-
if (funct3 <= 3'd4) begin
552-
ex_type = EX_SFU;
553-
op_type = INST_OP_BITS'(INST_SFU_DXA);
554-
op_args.dxa.op = funct3;
555-
`USED_IREG (rs1);
556-
`USED_IREG (rs2);
557-
end
558-
end
559-
`endif
560552
`ifdef EXT_TCU_ENABLE
561553
7'h02: begin
562554
if (funct3 == 3'h0) begin
@@ -592,6 +584,46 @@ module VX_decode import VX_gpu_pkg::*; #(
592584
`endif
593585
end
594586
`endif
587+
`ifdef EXT_DXA_ENABLE
588+
7'h03: begin // DXA issue (dimension-specific)
589+
// funct3 encodes dimensionality: 0=1D .. 4=5D.
590+
// Expanded into micro-ops by VX_dxa_uops.
591+
if (funct3 <= 3'd4) begin
592+
ex_type = EX_SFU;
593+
op_type = INST_OP_BITS'(INST_SFU_DXA);
594+
op_args.dxa.op = funct3;
595+
`USED_IREG (rs1);
596+
`USED_IREG (rs2);
597+
end
598+
end
599+
`endif
600+
7'h04: begin // Load packing: vx_packlb_f / vx_packlh_f
601+
case (funct3)
602+
3'h1: begin // vx_packlb_f — pack 4 strided bytes into float
603+
ex_type = EX_LSU;
604+
op_type = INST_OP_BITS'(INST_LSU_LBU);
605+
op_args.lsu.is_store = 0;
606+
op_args.lsu.is_float = 1;
607+
op_args.lsu.pack = 2'b01;
608+
op_args.lsu.offset = '0;
609+
`USED_FREG (rd);
610+
`USED_IREG (rs1);
611+
`USED_IREG (rs2);
612+
end
613+
3'h2: begin // vx_packlh_f — pack 2 strided halfwords into float
614+
ex_type = EX_LSU;
615+
op_type = INST_OP_BITS'(INST_LSU_LHU);
616+
op_args.lsu.is_store = 0;
617+
op_args.lsu.is_float = 1;
618+
op_args.lsu.pack = 2'b10;
619+
op_args.lsu.offset = '0;
620+
`USED_FREG (rd);
621+
`USED_IREG (rs1);
622+
`USED_IREG (rs2);
623+
end
624+
default:;
625+
endcase
626+
end
595627
default:;
596628
endcase
597629
end
@@ -610,8 +642,8 @@ module VX_decode import VX_gpu_pkg::*; #(
610642
.reset (reset),
611643
.valid_in (fetch_if.valid),
612644
.ready_in (fetch_if.ready),
613-
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_args, wb, rd_xregs, wr_xregs, use_regs[3:1], reg_ids[RV_RD], reg_ids[RV_RS1], reg_ids[RV_RS2], reg_ids[RV_RS3]}),
614-
.data_out ({decode_if.data.uuid, decode_if.data.wid, decode_if.data.tmask, decode_if.data.PC, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args, decode_if.data.wb, decode_if.data.rd_xregs, decode_if.data.wr_xregs, decode_if.data.used_rs, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3}),
645+
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_args, wb, rd_xregs, wr_xregs, use_regs[3:1], reg_ids[RV_RD], bytesel, reg_ids[RV_RS1], reg_ids[RV_RS2], reg_ids[RV_RS3]}),
646+
.data_out ({decode_if.data.uuid, decode_if.data.wid, decode_if.data.tmask, decode_if.data.PC, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args, decode_if.data.wb, decode_if.data.rd_xregs, decode_if.data.wr_xregs, decode_if.data.used_rs, decode_if.data.rd, decode_if.data.bytesel, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3}),
615647
.valid_out (decode_if.valid),
616648
.ready_out (decode_if.ready)
617649
);

0 commit comments

Comments
 (0)