Skip to content

Commit c338a03

Browse files
committed
SIMX: add sparsity support for NT=16
Port RTLSim NT=16 sparse WMMA behavior into SIMX. This updates SIMX decode to generate the RTL-style NT=16 sparse uops, adds instruction-local thread masks for sparse MMA execution, and enables the tensor-unit sparse data path for NT=16. Sparse B indexing now matches the RTL column-pair wrap behavior. Also add SIMX regression coverage for NT=16 sparse TCU cases.
1 parent 54fa441 commit c338a03

6 files changed

Lines changed: 119 additions & 46 deletions

File tree

ci/regression.sh.in

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -623,6 +623,31 @@ tensor_sp()
623623
make -C tests/regression/sgemm_tcu_sp clean && CONFIGS="-DNUM_THREADS=8 -DITYPE=bf8 -DOTYPE=fp32" make -C tests/regression/sgemm_tcu_sp
624624
CONFIGS="-DNUM_THREADS=8 -DEXT_TCU_ENABLE -DTCU_SPARSE_ENABLE" ./ci/blackbox.sh --driver=simx --app=sgemm_tcu_sp --args="-m8 -n8 -k32"
625625

626+
# simx tests (NT=16)
627+
make -C tests/regression/sgemm_tcu_sp clean && CONFIGS="-DNUM_THREADS=16 -DITYPE=int8 -DOTYPE=int32" make -C tests/regression/sgemm_tcu_sp
628+
CONFIGS="-DNUM_THREADS=16 -DEXT_TCU_ENABLE -DTCU_SPARSE_ENABLE" ./ci/blackbox.sh --driver=simx --app=sgemm_tcu_sp --args="-m16 -n8 -k32"
629+
630+
make -C tests/regression/sgemm_tcu_sp clean && CONFIGS="-DNUM_THREADS=16 -DITYPE=uint8 -DOTYPE=int32" make -C tests/regression/sgemm_tcu_sp
631+
CONFIGS="-DNUM_THREADS=16 -DEXT_TCU_ENABLE -DTCU_SPARSE_ENABLE" ./ci/blackbox.sh --driver=simx --app=sgemm_tcu_sp --args="-m16 -n8 -k32"
632+
633+
make -C tests/regression/sgemm_tcu_sp clean && CONFIGS="-DNUM_THREADS=16 -DITYPE=int4 -DOTYPE=int32" make -C tests/regression/sgemm_tcu_sp
634+
CONFIGS="-DNUM_THREADS=16 -DEXT_TCU_ENABLE -DTCU_SPARSE_ENABLE" ./ci/blackbox.sh --driver=simx --app=sgemm_tcu_sp --args="-m16 -n8 -k64"
635+
636+
make -C tests/regression/sgemm_tcu_sp clean && CONFIGS="-DNUM_THREADS=16 -DITYPE=uint4 -DOTYPE=int32" make -C tests/regression/sgemm_tcu_sp
637+
CONFIGS="-DNUM_THREADS=16 -DEXT_TCU_ENABLE -DTCU_SPARSE_ENABLE" ./ci/blackbox.sh --driver=simx --app=sgemm_tcu_sp --args="-m16 -n8 -k64"
638+
639+
make -C tests/regression/sgemm_tcu_sp clean && CONFIGS="-DNUM_THREADS=16 -DITYPE=fp16 -DOTYPE=fp32" make -C tests/regression/sgemm_tcu_sp
640+
CONFIGS="-DNUM_THREADS=16 -DEXT_TCU_ENABLE -DTCU_SPARSE_ENABLE" ./ci/blackbox.sh --driver=simx --app=sgemm_tcu_sp --args="-m16 -n8 -k16"
641+
642+
make -C tests/regression/sgemm_tcu_sp clean && CONFIGS="-DNUM_THREADS=16 -DITYPE=bf16 -DOTYPE=fp32" make -C tests/regression/sgemm_tcu_sp
643+
CONFIGS="-DNUM_THREADS=16 -DEXT_TCU_ENABLE -DTCU_SPARSE_ENABLE" ./ci/blackbox.sh --driver=simx --app=sgemm_tcu_sp --args="-m16 -n8 -k16"
644+
645+
make -C tests/regression/sgemm_tcu_sp clean && CONFIGS="-DNUM_THREADS=16 -DITYPE=fp8 -DOTYPE=fp32" make -C tests/regression/sgemm_tcu_sp
646+
CONFIGS="-DNUM_THREADS=16 -DEXT_TCU_ENABLE -DTCU_SPARSE_ENABLE" ./ci/blackbox.sh --driver=simx --app=sgemm_tcu_sp --args="-m16 -n8 -k32"
647+
648+
make -C tests/regression/sgemm_tcu_sp clean && CONFIGS="-DNUM_THREADS=16 -DITYPE=bf8 -DOTYPE=fp32" make -C tests/regression/sgemm_tcu_sp
649+
CONFIGS="-DNUM_THREADS=16 -DEXT_TCU_ENABLE -DTCU_SPARSE_ENABLE" ./ci/blackbox.sh --driver=simx --app=sgemm_tcu_sp --args="-m16 -n8 -k32"
650+
626651
# simx tests (NT=32)
627652
make -C tests/regression/sgemm_tcu_sp clean && CONFIGS="-DNUM_THREADS=32 -DITYPE=int8 -DOTYPE=int32" make -C tests/regression/sgemm_tcu_sp
628653
CONFIGS="-DNUM_THREADS=32 -DEXT_TCU_ENABLE -DTCU_SPARSE_ENABLE" ./ci/blackbox.sh --driver=simx --app=sgemm_tcu_sp --args="-m16 -n16 -k64"

sim/simx/decode.cpp

Lines changed: 52 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1191,7 +1191,7 @@ void Emulator::decode(uint32_t code, uint32_t wid, uint64_t uuid) {
11911191
if (is_sparse) {
11921192
// Sparse mode uses the packed sparse-A register layout from vx_tensor.h
11931193
// and a synthesized metadata phase, matching the RTL uop expansion.
1194-
#if (NUM_THREADS != 8) && (NUM_THREADS != 32)
1194+
#if (NUM_THREADS != 8) && (NUM_THREADS != 16) && (NUM_THREADS != 32)
11951195
std::abort();
11961196
#else
11971197
constexpr uint32_t sparse_k_steps = cfg::k_steps / 2;
@@ -1203,9 +1203,6 @@ void Emulator::decode(uint32_t code, uint32_t wid, uint64_t uuid) {
12031203
if ((cfg::k_steps % 2) != 0) {
12041204
std::abort();
12051205
}
1206-
if (cfg::nt16_sparse) {
1207-
std::abort();
1208-
}
12091206
if ((cfg::b_block_size_sp == 0) || (NUM_THREADS % cfg::b_block_size_sp) != 0) {
12101207
std::abort();
12111208
}
@@ -1215,7 +1212,10 @@ void Emulator::decode(uint32_t code, uint32_t wid, uint64_t uuid) {
12151212
}
12161213

12171214
uint32_t steps = 0;
1218-
uint32_t steps_count = num_meta_cols + (cfg::m_steps * cfg::n_steps * sparse_k_steps);
1215+
uint32_t sparse_mma_steps = cfg::nt16_sparse
1216+
? (cfg::m_steps * cfg::n_steps * cfg::k_steps)
1217+
: (cfg::m_steps * cfg::n_steps * sparse_k_steps);
1218+
uint32_t steps_count = num_meta_cols + sparse_mma_steps;
12191219
uint32_t steps_shift = (steps_count > 1) ? (32 - log2ceil(steps_count)) : 0;
12201220
uint32_t uuid_hi = (uuid >> 32) & 0xffffffff;
12211221
uint32_t uuid_lo = uuid & 0xffffffff;
@@ -1233,24 +1233,53 @@ void Emulator::decode(uint32_t code, uint32_t wid, uint64_t uuid) {
12331233
ibuffer.push_back(instr);
12341234
}
12351235

1236-
for (uint32_t k = 0; k < sparse_k_steps; ++k) {
1237-
for (uint32_t m = 0; m < cfg::m_steps; ++m) {
1238-
for (uint32_t n = 0; n < cfg::n_steps; ++n) {
1239-
uint32_t reg_rs1 = ra_base + (m / cfg::a_sub_blocks) * sparse_k_steps + k;
1240-
uint32_t reg_rs2 = rb_base + (k * cfg::n_steps + n) / cfg::b_sub_blocks_sp;
1241-
uint32_t reg_rs3 = rc_base + m * cfg::n_steps + n;
1242-
uint32_t uuid_lo_x = (steps << steps_shift) | uuid_lo;
1243-
uint64_t uuid_x = (static_cast<uint64_t>(uuid_hi) << 32) | uuid_lo_x;
1244-
++steps;
1245-
auto instr = std::allocate_shared<Instr>(instr_pool_, uuid_x, FUType::TCU);
1246-
instr->setOpType(tcu_type);
1247-
instr->setArgs(IntrTcuArgs{fmt_s, fmt_d, m, n, k});
1248-
instr->setDestReg(reg_rs3, RegType::Float);
1249-
instr->setSrcReg(0, reg_rs1, RegType::Float);
1250-
instr->setSrcReg(1, reg_rs2, RegType::Float);
1251-
instr->setSrcReg(2, reg_rs3, RegType::Float);
1252-
instr->setParentUUID(uuid);
1253-
ibuffer.push_back(instr);
1236+
if (cfg::nt16_sparse) {
1237+
constexpr uint32_t lg_n = (cfg::n_steps > 1) ? log2ceil(cfg::n_steps) : 0;
1238+
constexpr uint32_t lg_k = (cfg::k_steps > 1) ? log2ceil(cfg::k_steps) : 0;
1239+
constexpr uint32_t sparse_step_bits = lg_n + lg_k;
1240+
constexpr uint32_t sparse_step_mask = (sparse_step_bits != 0) ? ((1u << sparse_step_bits) - 1) : 0;
1241+
constexpr uint32_t tmask_even = 0x3333;
1242+
constexpr uint32_t tmask_odd = 0xCCCC;
1243+
for (uint32_t eff_ctr = 0; eff_ctr < sparse_mma_steps; ++eff_ctr) {
1244+
uint32_t n_sp = (sparse_step_bits != 0) ? (eff_ctr & sparse_step_mask) : 0;
1245+
uint32_t m_sp = eff_ctr >> sparse_step_bits;
1246+
uint32_t reg_rs1 = ra_base + m_sp;
1247+
uint32_t reg_rs2 = rb_base + n_sp;
1248+
uint32_t reg_rs3 = rc_base + (eff_ctr >> 1);
1249+
uint32_t uuid_lo_x = (steps << steps_shift) | uuid_lo;
1250+
uint64_t uuid_x = (static_cast<uint64_t>(uuid_hi) << 32) | uuid_lo_x;
1251+
++steps;
1252+
auto instr = std::allocate_shared<Instr>(instr_pool_, uuid_x, FUType::TCU);
1253+
instr->setOpType(tcu_type);
1254+
instr->setArgs(IntrTcuArgs{fmt_s, fmt_d, m_sp, n_sp, 0});
1255+
instr->setDestReg(reg_rs3, RegType::Float);
1256+
instr->setSrcReg(0, reg_rs1, RegType::Float);
1257+
instr->setSrcReg(1, reg_rs2, RegType::Float);
1258+
instr->setSrcReg(2, reg_rs3, RegType::Float);
1259+
instr->setTmask(ThreadMask(NUM_THREADS, (eff_ctr & 1) ? tmask_odd : tmask_even));
1260+
instr->setParentUUID(uuid);
1261+
ibuffer.push_back(instr);
1262+
}
1263+
} else {
1264+
for (uint32_t k = 0; k < sparse_k_steps; ++k) {
1265+
for (uint32_t m = 0; m < cfg::m_steps; ++m) {
1266+
for (uint32_t n = 0; n < cfg::n_steps; ++n) {
1267+
uint32_t reg_rs1 = ra_base + (m / cfg::a_sub_blocks) * sparse_k_steps + k;
1268+
uint32_t reg_rs2 = rb_base + (k * cfg::n_steps + n) / cfg::b_sub_blocks_sp;
1269+
uint32_t reg_rs3 = rc_base + m * cfg::n_steps + n;
1270+
uint32_t uuid_lo_x = (steps << steps_shift) | uuid_lo;
1271+
uint64_t uuid_x = (static_cast<uint64_t>(uuid_hi) << 32) | uuid_lo_x;
1272+
++steps;
1273+
auto instr = std::allocate_shared<Instr>(instr_pool_, uuid_x, FUType::TCU);
1274+
instr->setOpType(tcu_type);
1275+
instr->setArgs(IntrTcuArgs{fmt_s, fmt_d, m, n, k});
1276+
instr->setDestReg(reg_rs3, RegType::Float);
1277+
instr->setSrcReg(0, reg_rs1, RegType::Float);
1278+
instr->setSrcReg(1, reg_rs2, RegType::Float);
1279+
instr->setSrcReg(2, reg_rs3, RegType::Float);
1280+
instr->setParentUUID(uuid);
1281+
ibuffer.push_back(instr);
1282+
}
12541283
}
12551284
}
12561285
}

sim/simx/emulator.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ class Emulator {
132132

133133
instr_trace_t* execute(const Instr &instr, uint32_t wid);
134134

135-
void fetch_registers(std::vector<reg_data_t>& out, uint32_t wid, uint32_t src_index, const RegOpd& reg);
135+
void fetch_registers(std::vector<reg_data_t>& out, uint32_t wid, uint32_t src_index, const RegOpd& reg, const ThreadMask& tmask);
136136

137137
void icache_read(void* data, uint64_t addr, uint32_t size);
138138

sim/simx/execute.cpp

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,10 @@ inline int64_t check_boxing(int64_t a) {
4848
return nan_box(0x7fc00000); // NaN
4949
}
5050

51-
void Emulator::fetch_registers(std::vector<reg_data_t>& out, uint32_t wid, uint32_t src_index, const RegOpd& reg) {
51+
void Emulator::fetch_registers(std::vector<reg_data_t>& out, uint32_t wid, uint32_t src_index, const RegOpd& reg, const ThreadMask& tmask) {
5252
__unused(src_index);
5353
auto& warp = warps_.at(wid);
54-
uint32_t num_threads = warp.tmask.size();
54+
uint32_t num_threads = tmask.size();
5555
out.resize(num_threads);
5656
switch (reg.type) {
5757
case RegType::None:
@@ -60,7 +60,7 @@ void Emulator::fetch_registers(std::vector<reg_data_t>& out, uint32_t wid, uint3
6060
DPH(2, "Src" << src_index << " Reg: " << reg << "={");
6161
for (uint32_t t = 0; t < num_threads; ++t) {
6262
if (t) DPN(2, ", ");
63-
if (!warp.tmask.test(t)) {
63+
if (!tmask.test(t)) {
6464
DPN(2, "-");
6565
continue;
6666
}
@@ -74,7 +74,7 @@ void Emulator::fetch_registers(std::vector<reg_data_t>& out, uint32_t wid, uint3
7474
auto& reg_data = warp.ireg_file.at(reg.idx);
7575
for (uint32_t t = 0; t < num_threads; ++t) {
7676
if (t) DPN(2, ", ");
77-
if (!warp.tmask.test(t)) {
77+
if (!tmask.test(t)) {
7878
DPN(2, "-");
7979
continue;
8080
}
@@ -89,7 +89,7 @@ void Emulator::fetch_registers(std::vector<reg_data_t>& out, uint32_t wid, uint3
8989
auto& reg_data = warp.freg_file.at(reg.idx);
9090
for (uint32_t t = 0; t < num_threads; ++t) {
9191
if (t) DPN(2, ", ");
92-
if (!warp.tmask.test(t)) {
92+
if (!tmask.test(t)) {
9393
DPN(2, "-");
9494
continue;
9595
}
@@ -124,6 +124,8 @@ instr_trace_t* Emulator::execute(const Instr &instr, uint32_t wid) {
124124
auto rsrc2 = instr.getSrcReg(2);
125125

126126
auto num_threads = arch_.num_threads();
127+
auto exec_tmask = instr.hasTmask() ? (warp.tmask & instr.getTmask()) : warp.tmask;
128+
auto operand_tmask = warp.tmask;
127129

128130
// create instruction trace
129131
auto trace_alloc = core_->trace_pool().allocate(1);
@@ -133,7 +135,7 @@ instr_trace_t* Emulator::execute(const Instr &instr, uint32_t wid) {
133135
trace->cid = core_->id();
134136
trace->wid = wid;
135137
trace->PC = warp.PC;
136-
trace->tmask = warp.tmask;
138+
trace->tmask = exec_tmask;
137139
trace->dst_reg = rdest;
138140
trace->src_regs = {rsrc0, rsrc1, rsrc2};
139141

@@ -143,27 +145,27 @@ instr_trace_t* Emulator::execute(const Instr &instr, uint32_t wid) {
143145
std::vector<reg_data_t> rs3_data;
144146

145147
if (instr.is_uop()) {
146-
DP(1, "Instr: " << instr << ", cid=" << core_->id() << ", wid=" << wid << ", tmask=" << warp.tmask
148+
DP(1, "Instr: " << instr << ", cid=" << core_->id() << ", wid=" << wid << ", tmask=" << exec_tmask
147149
<< ", PC=0x" << std::hex << warp.PC << std::dec << ", parent=#" << instr.getParentUUID() << " (#" << instr.getUUID() << ")");
148150
} else {
149-
DP(1, "Instr: " << instr << ", cid=" << core_->id() << ", wid=" << wid << ", tmask=" << warp.tmask
151+
DP(1, "Instr: " << instr << ", cid=" << core_->id() << ", wid=" << wid << ", tmask=" << exec_tmask
150152
<< ", PC=0x" << std::hex << warp.PC << std::dec << " (#" << instr.getUUID() << ")");
151153
}
152154

153155
// fetch register values
154-
if (rsrc0.type != RegType::None) fetch_registers(rs1_data, wid, 0, rsrc0);
155-
if (rsrc1.type != RegType::None) fetch_registers(rs2_data, wid, 1, rsrc1);
156-
if (rsrc2.type != RegType::None) fetch_registers(rs3_data, wid, 2, rsrc2);
156+
if (rsrc0.type != RegType::None) fetch_registers(rs1_data, wid, 0, rsrc0, operand_tmask);
157+
if (rsrc1.type != RegType::None) fetch_registers(rs2_data, wid, 1, rsrc1, operand_tmask);
158+
if (rsrc2.type != RegType::None) fetch_registers(rs3_data, wid, 2, rsrc2, operand_tmask);
157159

158160
uint32_t thread_start = 0;
159161
for (; thread_start < num_threads; ++thread_start) {
160-
if (warp.tmask.test(thread_start))
162+
if (exec_tmask.test(thread_start))
161163
break;
162164
}
163165

164166
int32_t thread_last = num_threads - 1;
165167
for (; thread_last >= 0; --thread_last) {
166-
if (warp.tmask.test(thread_last))
168+
if (exec_tmask.test(thread_last))
167169
break;
168170
}
169171

@@ -1601,21 +1603,22 @@ instr_trace_t* Emulator::execute(const Instr &instr, uint32_t wid) {
16011603
case TcuType::WMMA: {
16021604
auto trace_data = std::make_shared<TensorUnit::ExeTraceData>();
16031605
trace->data = trace_data;
1604-
assert(warp.tmask.count() == num_threads);
1606+
assert(operand_tmask.count() == num_threads);
16051607
core_->tensor_unit()->wmma(wid, tpuArgs.fmt_s, tpuArgs.fmt_d, tpuArgs.step_m, tpuArgs.step_n, tpuArgs.step_k, rs1_data, rs2_data, rs3_data, rd_data, trace_data.get());
16061608
rd_write = true;
16071609
} break;
16081610
case TcuType::WMMA_SP: {
16091611
auto trace_data = std::make_shared<TensorUnit::ExeTraceData>();
16101612
trace->data = trace_data;
1611-
assert(warp.tmask.count() == num_threads);
1613+
assert(operand_tmask.count() == num_threads);
1614+
assert(exec_tmask.any());
16121615
core_->tensor_unit()->wmma_sp(wid, tpuArgs.fmt_s, tpuArgs.fmt_d, tpuArgs.step_m, tpuArgs.step_n, tpuArgs.step_k, rs1_data, rs2_data, rs3_data, rd_data, trace_data.get());
16131616
rd_write = true;
16141617
} break;
16151618
case TcuType::META_STORE: {
16161619
auto trace_data = std::make_shared<TensorUnit::ExeTraceData>();
16171620
trace->data = trace_data;
1618-
assert(warp.tmask.count() == num_threads);
1621+
assert(operand_tmask.count() == num_threads);
16191622
core_->tensor_unit()->meta_store(wid, tpuArgs.fmt_s, tpuArgs.fmt_d, rs1_data, trace_data.get());
16201623
} break;
16211624
default:
@@ -1635,7 +1638,7 @@ instr_trace_t* Emulator::execute(const Instr &instr, uint32_t wid) {
16351638
DPH(2, "Dest Reg: " << rdest << "={");
16361639
for (uint32_t t = 0; t < num_threads; ++t) {
16371640
if (t) DPN(2, ", ");
1638-
if (!warp.tmask.test(t)) {
1641+
if (!exec_tmask.test(t)) {
16391642
DPN(2, "-");
16401643
continue;
16411644
}
@@ -1652,7 +1655,7 @@ instr_trace_t* Emulator::execute(const Instr &instr, uint32_t wid) {
16521655
DPH(2, "Dest Reg: " << rdest << "={");
16531656
for (uint32_t t = 0; t < num_threads; ++t) {
16541657
if (t) DPN(2, ", ");
1655-
if (!warp.tmask.test(t)) {
1658+
if (!exec_tmask.test(t)) {
16561659
DPN(2, "-");
16571660
continue;
16581661
}

sim/simx/instr.h

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,11 @@ class Instr {
155155
args_ = static_cast<T>(args);
156156
}
157157

158+
void setTmask(const ThreadMask& tmask) {
159+
tmask_ = tmask;
160+
has_tmask_ = true;
161+
}
162+
158163
void setDestReg(uint32_t destReg, RegType type) {
159164
rdest_ = {type, destReg };
160165
}
@@ -186,6 +191,14 @@ class Instr {
186191
return is_uop_;
187192
}
188193

194+
bool hasTmask() const {
195+
return has_tmask_;
196+
}
197+
198+
const ThreadMask& getTmask() const {
199+
return tmask_;
200+
}
201+
189202
private:
190203

191204
uint64_t uuid_;
@@ -196,8 +209,10 @@ class Instr {
196209
RegOpd rsrc_[MAX_REG_SOURCES];
197210
RegOpd rdest_;
198211
bool is_uop_;
212+
ThreadMask tmask_;
213+
bool has_tmask_ = false;
199214

200215
friend std::ostream &operator<<(std::ostream &, const Instr &);
201216
};
202217

203-
}
218+
}

sim/simx/tensor_unit.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -681,7 +681,7 @@ class TensorUnit::Impl {
681681

682682
auto fedp = select_FEDP(fmt_s, fmt_d);
683683

684-
if (cfg::nt16_sparse || (this->arch_.num_threads() != 8 && this->arch_.num_threads() != 32)) {
684+
if (this->arch_.num_threads() != 8 && this->arch_.num_threads() != 16 && this->arch_.num_threads() != 32) {
685685
std::cout << "Error: WMMA_SP unsupported for NUM_THREADS=" << this->arch_.num_threads() << std::endl;
686686
std::abort();
687687
}
@@ -732,8 +732,9 @@ class TensorUnit::Impl {
732732
uint32_t off = bit_idx % 32;
733733
return (sparse_meta_.at(wid).at(bank * kMaxMetaCols + col) >> off) & 1u;
734734
};
735-
auto bword1 = rs2_data.at(b_off + j * cfg::tcK * kCompression + z * kCompression + 0).u32;
736-
auto bword2 = rs2_data.at(b_off + j * cfg::tcK * kCompression + z * kCompression + 1).u32;
735+
uint32_t j_sp = cfg::nt16_sparse ? (j % (cfg::tcN / 2)) : j;
736+
auto bword1 = rs2_data.at(b_off + j_sp * cfg::tcK * kCompression + z * kCompression + 0).u32;
737+
auto bword2 = rs2_data.at(b_off + j_sp * cfg::tcK * kCompression + z * kCompression + 1).u32;
737738
uint32_t b_gathered = 0;
738739
if (is_16bit_sparse_fmt) {
739740
uint8_t mask_lo = 0;

0 commit comments

Comments
 (0)