Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 65 additions & 5 deletions src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -281,14 +281,22 @@ void kernel_and_merge<false, false, DequantizeFloat>::run(
offset_bias = bias + n_0;
}

// When b_offset != 0, row sums of A are packed at the end of the A panel
// (appended by the quantized PrepareA transform with multiplier=1). Read them
// to pass to dequantize_block_32 for per-row offset correction.
const int32_t *row_sum = nullptr;
if (dq.b_offset != 0) {
row_sum = reinterpret_cast<const int32_t *>(a_ptr + strategy::out_height() * kern_k);
}

strat.kernel(// A and B pointers are just the packed panels.
a_ptr, b_panel,
// Provide relevant part of output array and row stride.
c_ptr ? (c_ptr + m_0 * ldc + n_0) : nullptr, ldc,
// M, N, K sizes
m_max-m_0, n_max - n_0, kern_k,
// Bias, activation, accumulation. Need to offset the bias as needed.
offset_col_bias, dq, offset_bias, act, accumulate, acc_buff);
offset_col_bias, dq, offset_bias, act, accumulate, acc_buff, row_sum, kern_k);
}

template<>
Expand All @@ -300,7 +308,7 @@ void kernel_and_merge<true, false, DequantizeFloat>::run(
strategy &strat, const Tlo *a_ptr, const Tro *b_panel, size_t, Tri *c_panel,
Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *bias,
const Activation &act, bool not_first_pass, const DequantizeFloat &qp, const int32_t *,
const Activation &act, bool not_first_pass, const DequantizeFloat &qp, const int32_t *col_bias,
Tab *)
{
const int bblocks = iceildiv(n_max - n_0, strategy::out_width());
Expand All @@ -317,14 +325,20 @@ void kernel_and_merge<true, false, DequantizeFloat>::run(
#ifdef CYCLE_PROFILING
auto p=prof.ScopedProfiler(PROFILE_QUANTIZE, ((m_max-m_0) * bblocks * strategy::out_width() * sizeof(Tr)));
#endif
// When b_offset != 0, row sums are packed after the A panel data
const int32_t *row_sum = (qp.b_offset != 0)
? reinterpret_cast<const int32_t *>(a_ptr + strategy::out_height() * kern_k)
: nullptr;

for (int i=0; i<bblocks; i++) {
unsigned int n_start = n_0 + (strategy::out_width() * i);
unsigned int n_end = std::min(n_start + strategy::out_width(), n_max);

dequantize_block_32(qp, (n_end - n_start), (m_max - m_0),
c_panel + (i * strategy::out_width() * strategy::out_height()), strategy::out_width(),
c_ptr + m_0 * ldc + n_start, ldc,
bias != nullptr ? bias + n_start : nullptr, not_first_pass, act);
bias != nullptr ? bias + n_start : nullptr, not_first_pass, act,
col_bias != nullptr ? col_bias + n_start : nullptr, row_sum, kern_k);

}
}
Expand Down Expand Up @@ -475,6 +489,13 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
return _Nsize * _nmulti * sizeof(int32_t);
}

if (std::is_same<OutputStage, DequantizeFloat>::value) {
const DequantizeFloat *dq = reinterpret_cast<const DequantizeFloat *>(&_os);
if (dq->a_offset != 0) {
return _Nsize * _nmulti * sizeof(int32_t);
}
}

return 0;
}

Expand Down Expand Up @@ -557,6 +578,12 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
k_depth += sizeof(int32_t) / sizeof(Tloi);
}

if (std::is_same<OutputStage, DequantizeFloat>::value && MergeStep) {
// transforms_quantized always packs row sum slots (zeros when multiplier=0, actual
// sums when b_offset != 0). Reserve space unconditionally when MergeStep is enabled.
k_depth += sizeof(int32_t) / sizeof(Tloi);
}

return k_depth;
}

Expand Down Expand Up @@ -647,6 +674,13 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
return -qp->b_offset;
}

if (std::is_same<OutputStage, DequantizeFloat>::value) {
const DequantizeFloat *dq = reinterpret_cast<const DequantizeFloat *>(&_os);
// Pack row sums into the A panel when b_offset is non-zero so that the
// merge step can apply the b_offset correction per output position.
return (dq->b_offset != 0) ? 1 : 0;
}

return 0;
}

Expand Down Expand Up @@ -693,6 +727,14 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
return get_ktotal(args);
}

// K blocking is not supported for DequantizeFloat with MergeStep when b_offset != 0,
// because row sums of A must cover the full K depth. We cannot check b_offset here
// (static function), so we conservatively disable K-blocking for all DequantizeFloat
// MergeStep cases. The working-memory cost is minimal and correctness is guaranteed.
if (std::is_same<OutputStage, DequantizeFloat>::value && MergeStep) {
return get_ktotal(args);
}

// We can't K block non-fast FP16 cases without an accumulation buffer.
#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(ARM_COMPUTE_ENABLE_FP16))
if (std::is_same<Tlo, __fp16>::value && std::is_same<Tr, __fp16>::value && !args._fast_mode && MergeStep) {
Expand Down Expand Up @@ -937,7 +979,7 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
#endif
// See comment above on transform_type<> class: this extracts either 'transforms' or
// 'transforms_quantized' as appropriate.
typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>::type transforms;
typename transform_type<strategy, MergeStep && (std::is_same<OutputStage, Requantize32>::value || std::is_same<OutputStage, DequantizeFloat>::value)>::type transforms;

if (_indirect_buf != nullptr) {
transforms.PrepareA_indirect(a_panel,
Expand Down Expand Up @@ -1027,7 +1069,7 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
#endif
// See comment above on transform_type<> class: this extracts either 'transforms' or
// 'transforms_quantized' as appropriate.
typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>::type transforms;
typename transform_type<strategy, MergeStep && (std::is_same<OutputStage, Requantize32>::value || std::is_same<OutputStage, DequantizeFloat>::value)>::type transforms;

for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
unsigned int first_m = (batch == batch_0) ? m_0 : 0;
Expand Down Expand Up @@ -1060,6 +1102,10 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {

if(std::is_same<OutputStage, Requantize32>::value) {
a_panel_stride = kern_k + (sizeof(int32_t) / sizeof(Tloi));
} else if (std::is_same<OutputStage, DequantizeFloat>::value) {
// transforms_quantized always packs row-sum slots (zeros when b_offset=0,
// actual sums when b_offset != 0), so the stride must include the slot.
a_panel_stride = kern_k + (sizeof(int32_t) / sizeof(Tloi));
} else {
a_panel_stride = kern_k;
}
Expand Down Expand Up @@ -1212,6 +1258,20 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
compute_col_sums(*qp_ptr, _Nsize, _Ksize * _Ksections, B + (i * B_multi_stride), ldb, col_bias + (i * _Nsize), _Ksize * _Ksections, i, 0);
}
}

if (std::is_same<OutputStage, DequantizeFloat>::value) {
const DequantizeFloat *dq = reinterpret_cast<const DequantizeFloat *>(&_os);
if (dq->a_offset != 0) {
// Compute raw column sums of B (weight matrix) for use in a_offset correction.
// dequantize_block_32 applies: -a_offset * col_sums[n] * scale per output channel.
col_bias = reinterpret_cast<int32_t *>(in_buffer);
for (unsigned int i = 0; i < _nmulti; ++i) {
compute_raw_col_sums(_Nsize, _Ksize * _Ksections,
B + (i * B_multi_stride), ldb,
col_bias + (i * _Nsize));
}
}
}
}

// Support for transposed B is a property of the strategy::transpose type
Expand Down
3 changes: 2 additions & 1 deletion src/core/NEON/kernels/arm_gemm/quantized-fp16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ namespace arm_gemm {
template<>
void dequantize_block_32<__fp16>(const DequantizeFloat &qp, unsigned int width, unsigned int height,
const int32_t * in_ptr, unsigned int in_stride, __fp16 *out_ptr, unsigned int out_stride,
const __fp16 * bias_ptr, bool not_first_pass, const Activation &act)
const __fp16 * bias_ptr, bool not_first_pass, const Activation &act,
const int32_t * /*col_bias*/, const int32_t * /*row_sum*/, int32_t /*k_total*/)
{
const float32x4_t vscale = vdupq_n_f32(qp.scale);
float maxval = std::numeric_limits<float>::infinity();
Expand Down
56 changes: 52 additions & 4 deletions src/core/NEON/kernels/arm_gemm/quantized.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -973,10 +973,28 @@ void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int h
template void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int height, const int8_t *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col);
template void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int height, const uint8_t *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col);

template<typename T>
void compute_raw_col_sums(unsigned int width, unsigned int height,
const T *input, unsigned int in_stride, int32_t *col_sums)
{
memset(reinterpret_cast<void *>(col_sums), 0, width * sizeof(int32_t));
for (unsigned int row = 0; row < height; ++row)
{
for (unsigned int col = 0; col < width; ++col)
{
col_sums[col] += static_cast<int32_t>(input[row * in_stride + col]);
}
}
}

template void compute_raw_col_sums(unsigned int width, unsigned int height, const int8_t *input, unsigned int in_stride, int32_t *col_sums);
template void compute_raw_col_sums(unsigned int width, unsigned int height, const uint8_t *input, unsigned int in_stride, int32_t *col_sums);

template<>
void dequantize_block_32<float>(const DequantizeFloat &qp, unsigned int width, unsigned int height,
const int32_t* in_ptr, unsigned int in_stride, float *out_ptr, unsigned int out_stride,
const float* bias_ptr, bool accumulate, const Activation &act)
const float* bias_ptr, bool accumulate, const Activation &act,
const int32_t *col_bias, const int32_t *row_sum, int32_t k_total)
{
const float32x4_t vscale = vdupq_n_f32(qp.scale);
float maxval = std::numeric_limits<float>::infinity();
Expand All @@ -1000,14 +1018,38 @@ void dequantize_block_32<float>(const DequantizeFloat &qp, unsigned int width, u
for(unsigned int row=0; row<height; row++) {
auto row_in_ptr = in_ptr + (row * in_stride);
auto row_out_ptr = out_ptr + (row * out_stride);

// Per-row addend from b_offset correction: -b_offset * sum_a_row[m] * scale
// row_sum values are packed with multiplier=1, so entry is plain sum(a_row[k]).
// Also add the cross-term: +a_offset * b_offset * K * scale (constant per tile).
float row_offset = 0.0f;
if (row_sum != nullptr) {
row_offset += static_cast<float>(-qp.b_offset * row_sum[row]) * qp.scale;
}
if (col_bias != nullptr && row_sum != nullptr && k_total != 0) {
// Cross-term: +a_offset * b_offset * K * scale
row_offset += static_cast<float>(qp.a_offset) * static_cast<float>(qp.b_offset)
* static_cast<float>(k_total) * qp.scale;
}
const float32x4_t vrow_offset = vdupq_n_f32(row_offset);

unsigned int col=0;
if (width >= 4) {
for(; col <= (width - 4); col+= 4) {
const int32x4_t vin = vld1q_s32(row_in_ptr + col);
float32x4_t vdeq = vmulq_f32(vcvtq_f32_s32(vin), vscale);
if(bias_ptr) {
const float32x4_t bin = vld1q_f32(bias_ptr + col);
vdeq = vaddq_f32(vdeq, bin);
vdeq = vaddq_f32(vdeq, vld1q_f32(bias_ptr + col));
}
if(col_bias) {
// a_offset correction: -a_offset * sum_b_col[n] * scale
const float32x4_t vcol_corr = vmulq_f32(
vcvtq_f32_s32(vld1q_s32(col_bias + col)),
vdupq_n_f32(static_cast<float>(-qp.a_offset) * qp.scale));
vdeq = vaddq_f32(vdeq, vcol_corr);
}
if(row_sum) {
vdeq = vaddq_f32(vdeq, vrow_offset);
}
if(accumulate) {
vdeq = vaddq_f32(vdeq, vld1q_f32(row_out_ptr + col));
Expand All @@ -1019,10 +1061,16 @@ void dequantize_block_32<float>(const DequantizeFloat &qp, unsigned int width, u
// left-over elements
for(; col < width; ++col) {
const int32_t val = *(row_in_ptr + col);
float res = static_cast<float>(val * qp.scale);
float res = static_cast<float>(val) * qp.scale;
if(bias_ptr) {
res += static_cast<float>(*(bias_ptr + col));
}
if(col_bias) {
res += static_cast<float>(-qp.a_offset * col_bias[col]) * qp.scale;
}
if(row_sum) {
res += row_offset;
}
if(accumulate) {
res += *(row_out_ptr + col);
}
Expand Down
12 changes: 11 additions & 1 deletion src/cpu/kernels/assembly/arm_common/internal/quantized.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,23 @@ void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int h
const T *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth,
unsigned int multi, unsigned int first_col);

/** Compute raw column sums of a matrix: col_sums[n] = sum_{k} input[k * in_stride + n].
* Unlike compute_col_sums(), this does not apply any quantization offsets or scaling —
* it stores the plain integer sums for use as weight column reductions in the
* DequantizeFloat a_offset correction path. */
template<typename T>
void compute_raw_col_sums(unsigned int width, unsigned int height,
const T *input, unsigned int in_stride, int32_t *col_sums);

template<typename T>
void row_sums_indirect(size_t num_strings, const unsigned int *string_lengths, IndirectInputArg<T> A_arg,
size_t M, int32_t *output_ptr, const Requantize32 *qp);

template<typename T>
void dequantize_block_32(const DequantizeFloat &qp, unsigned int width, unsigned int height,
const int32_t* input, unsigned int in_stride, T *output, unsigned int out_stride,
const T *row_bias, bool not_first_pass, const Activation &act);
const T *row_bias, bool not_first_pass, const Activation &act,
const int32_t *col_bias = nullptr, const int32_t *row_sum = nullptr,
int32_t k_total = 0);

} // namespace arm_gemm
14 changes: 11 additions & 3 deletions src/cpu/kernels/assembly/arm_gemm/arm_gemm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -240,12 +240,20 @@ struct Requantize32
struct DequantizeFloat
{
public:
float scale = 0;
float scale = 0;
int32_t a_offset = 0; // input quantization zero-point (subtract from each input value)
int32_t b_offset = 0; // weight quantization zero-point (subtract from each weight value)

DequantizeFloat() = default;

// Constructor
DequantizeFloat(const float scale) : scale(scale)
// Constructor without offset (symmetric quantization)
DequantizeFloat(const float scale) : scale(scale), a_offset(0), b_offset(0)
{
}

// Constructor with asymmetric quantization offsets
DequantizeFloat(const float scale, int32_t a_offset, int32_t b_offset)
: scale(scale), a_offset(a_offset), b_offset(b_offset)
{
}
};
Expand Down
24 changes: 18 additions & 6 deletions src/cpu/operators/CpuConv2d.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017-2021, 2023-2025 Arm Limited.
* Copyright (c) 2017-2021, 2023-2026 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
Expand Down Expand Up @@ -59,13 +59,14 @@ void CpuConv2d::configure(ITensorInfo *input,
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_UNUSED(num_groups);
ARM_COMPUTE_ERROR_THROW_ON(CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation,
act_info, enable_fast_math, num_groups));
ARM_COMPUTE_ERROR_THROW_ON(
CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
enable_fast_math, num_groups));

ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
enable_fast_math, num_groups);

const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups, weights_info);
switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info,
enable_fast_math))
{
Expand Down Expand Up @@ -119,7 +120,8 @@ Status CpuConv2d::validate(const ITensorInfo *input,
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuConv2d::validate");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on Neon");

const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups, weights_info);

switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info,
enable_fast_math))
{
Expand Down Expand Up @@ -155,7 +157,17 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *i
bool enable_fast_math)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights);
ARM_COMPUTE_UNUSED(weights_info);

// For QASYMM8_SIGNED→F32 with NHWC and no dilation, automatically select the single-kernel
// CpuGemmDirectConv2d path when it validates successfully.
if (input->data_type() == DataType::QASYMM8_SIGNED && output->data_type() == DataType::F32)
{
const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, 1, weights_info);
if (bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info)))
{
return ConvolutionMethod::GEMM_CONV2D;
}
}

const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
Expand Down
Loading