ARM-software · alvoron · Jun 17, 2026
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -281,14 +281,22 @@ void kernel_and_merge<false, false, DequantizeFloat>::run(
         offset_bias = bias + n_0;
     }
 
+    // When b_offset != 0, row sums of A are packed at the end of the A panel
+    // (appended by the quantized PrepareA transform with multiplier=1).  Read them
+    // to pass to dequantize_block_32 for per-row offset correction.
+    const int32_t *row_sum = nullptr;
+    if (dq.b_offset != 0) {
+        row_sum = reinterpret_cast<const int32_t *>(a_ptr + strategy::out_height() * kern_k);
+    }
+
     strat.kernel(// A and B pointers are just the packed panels.
                  a_ptr, b_panel,
                  // Provide relevant part of output array and row stride.
                  c_ptr ? (c_ptr + m_0 * ldc + n_0) : nullptr, ldc,
                  // M, N, K sizes
                  m_max-m_0, n_max - n_0, kern_k,
                  // Bias, activation, accumulation.  Need to offset the bias as needed.
-                 offset_col_bias, dq, offset_bias, act, accumulate, acc_buff);
+                 offset_col_bias, dq, offset_bias, act, accumulate, acc_buff, row_sum, kern_k);
 }
 
 template<>
@@ -300,7 +308,7 @@ void kernel_and_merge<true, false, DequantizeFloat>::run(
         strategy &strat, const Tlo *a_ptr, const Tro *b_panel, size_t, Tri *c_panel,
         Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
         unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *bias,
-        const Activation &act, bool not_first_pass, const DequantizeFloat &qp, const int32_t *,
+        const Activation &act, bool not_first_pass, const DequantizeFloat &qp, const int32_t *col_bias,
         Tab *)
 {
     const int bblocks = iceildiv(n_max - n_0, strategy::out_width());
@@ -317,14 +325,20 @@ void kernel_and_merge<true, false, DequantizeFloat>::run(
 #ifdef CYCLE_PROFILING
         auto p=prof.ScopedProfiler(PROFILE_QUANTIZE, ((m_max-m_0) * bblocks * strategy::out_width() * sizeof(Tr)));
 #endif
+        // When b_offset != 0, row sums are packed after the A panel data
+        const int32_t *row_sum = (qp.b_offset != 0)
+            ? reinterpret_cast<const int32_t *>(a_ptr + strategy::out_height() * kern_k)
+            : nullptr;
+
         for (int i=0; i<bblocks; i++) {
             unsigned int n_start = n_0 + (strategy::out_width() * i);
             unsigned int n_end = std::min(n_start + strategy::out_width(), n_max);
 
             dequantize_block_32(qp, (n_end - n_start), (m_max - m_0),
                             c_panel + (i * strategy::out_width() * strategy::out_height()), strategy::out_width(),
                             c_ptr + m_0 * ldc + n_start, ldc,
-                            bias != nullptr ? bias + n_start : nullptr, not_first_pass, act);
+                            bias != nullptr ? bias + n_start : nullptr, not_first_pass, act,
+                            col_bias != nullptr ? col_bias + n_start : nullptr, row_sum, kern_k);
 
         }
     }
@@ -475,6 +489,13 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
             return _Nsize * _nmulti * sizeof(int32_t);
         }
 
+        if (std::is_same<OutputStage, DequantizeFloat>::value) {
+            const DequantizeFloat *dq = reinterpret_cast<const DequantizeFloat *>(&_os);
+            if (dq->a_offset != 0) {
+                return _Nsize * _nmulti * sizeof(int32_t);
+            }
+        }
+
         return 0;
     }
 
@@ -557,6 +578,12 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
             k_depth += sizeof(int32_t) / sizeof(Tloi);
         }
 
+        if (std::is_same<OutputStage, DequantizeFloat>::value && MergeStep) {
+            // transforms_quantized always packs row sum slots (zeros when multiplier=0, actual
+            // sums when b_offset != 0).  Reserve space unconditionally when MergeStep is enabled.
+            k_depth += sizeof(int32_t) / sizeof(Tloi);
+        }
+
         return k_depth;
     }
 
@@ -647,6 +674,13 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
             return -qp->b_offset;
         }
 
+        if (std::is_same<OutputStage, DequantizeFloat>::value) {
+            const DequantizeFloat *dq = reinterpret_cast<const DequantizeFloat *>(&_os);
+            // Pack row sums into the A panel when b_offset is non-zero so that the
+            // merge step can apply the b_offset correction per output position.
+            return (dq->b_offset != 0) ? 1 : 0;
+        }
+
         return 0;
     }
 
@@ -693,6 +727,14 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
             return get_ktotal(args);
         }
 
+        // K blocking is not supported for DequantizeFloat with MergeStep when b_offset != 0,
+        // because row sums of A must cover the full K depth.  We cannot check b_offset here
+        // (static function), so we conservatively disable K-blocking for all DequantizeFloat
+        // MergeStep cases.  The working-memory cost is minimal and correctness is guaranteed.
+        if (std::is_same<OutputStage, DequantizeFloat>::value && MergeStep) {
+            return get_ktotal(args);
+        }
+
         // We can't K block non-fast FP16 cases without an accumulation buffer.
 #if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(ARM_COMPUTE_ENABLE_FP16))
         if (std::is_same<Tlo, __fp16>::value && std::is_same<Tr, __fp16>::value && !args._fast_mode && MergeStep) {
@@ -937,7 +979,7 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
 #endif
                             // See comment above on transform_type<> class: this extracts either 'transforms' or
                             // 'transforms_quantized' as appropriate.
-                            typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>::type transforms;
+                            typename transform_type<strategy, MergeStep && (std::is_same<OutputStage, Requantize32>::value || std::is_same<OutputStage, DequantizeFloat>::value)>::type transforms;
 
                             if (_indirect_buf != nullptr) {
                                 transforms.PrepareA_indirect(a_panel,
@@ -1027,7 +1069,7 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
 #endif
                     // See comment above on transform_type<> class: this extracts either 'transforms' or
                     // 'transforms_quantized' as appropriate.
-                    typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>::type transforms;
+                    typename transform_type<strategy, MergeStep && (std::is_same<OutputStage, Requantize32>::value || std::is_same<OutputStage, DequantizeFloat>::value)>::type transforms;
 
                     for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
                         unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
@@ -1060,6 +1102,10 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
 
                     if(std::is_same<OutputStage, Requantize32>::value) {
                         a_panel_stride = kern_k + (sizeof(int32_t) / sizeof(Tloi));
+                    } else if (std::is_same<OutputStage, DequantizeFloat>::value) {
+                        // transforms_quantized always packs row-sum slots (zeros when b_offset=0,
+                        // actual sums when b_offset != 0), so the stride must include the slot.
+                        a_panel_stride = kern_k + (sizeof(int32_t) / sizeof(Tloi));
                     } else {
                         a_panel_stride = kern_k;
                     }
@@ -1212,6 +1258,20 @@ class GemmInterleaved : public GemmCommon<Tlo, Tro, Tr> {
                 compute_col_sums(*qp_ptr, _Nsize, _Ksize * _Ksections, B + (i * B_multi_stride), ldb, col_bias + (i * _Nsize), _Ksize * _Ksections, i, 0);
             }
         }
+
+        if (std::is_same<OutputStage, DequantizeFloat>::value) {
+            const DequantizeFloat *dq = reinterpret_cast<const DequantizeFloat *>(&_os);
+            if (dq->a_offset != 0) {
+                // Compute raw column sums of B (weight matrix) for use in a_offset correction.
+                // dequantize_block_32 applies: -a_offset * col_sums[n] * scale per output channel.
+                col_bias = reinterpret_cast<int32_t *>(in_buffer);
+                for (unsigned int i = 0; i < _nmulti; ++i) {
+                    compute_raw_col_sums(_Nsize, _Ksize * _Ksections,
+                                         B + (i * B_multi_stride), ldb,
+                                         col_bias + (i * _Nsize));
+                }
+            }
+        }
     }
 
     // Support for transposed B is a property of the strategy::transpose type

diff --git a/src/core/NEON/kernels/arm_gemm/quantized-fp16.cpp b/src/core/NEON/kernels/arm_gemm/quantized-fp16.cpp
@@ -35,7 +35,8 @@ namespace arm_gemm {
 template<>
 void dequantize_block_32<__fp16>(const DequantizeFloat &qp, unsigned int width, unsigned int height,
                                  const int32_t * in_ptr, unsigned int in_stride, __fp16 *out_ptr, unsigned int out_stride,
-                                 const __fp16 * bias_ptr, bool not_first_pass, const Activation &act)
+                                 const __fp16 * bias_ptr, bool not_first_pass, const Activation &act,
+                                 const int32_t * /*col_bias*/, const int32_t * /*row_sum*/, int32_t /*k_total*/)
 {
     const float32x4_t vscale = vdupq_n_f32(qp.scale);
     float maxval = std::numeric_limits<float>::infinity();

diff --git a/src/core/NEON/kernels/arm_gemm/quantized.cpp b/src/core/NEON/kernels/arm_gemm/quantized.cpp
@@ -973,10 +973,28 @@ void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int h
 template void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int height, const int8_t *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col);
 template void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int height, const uint8_t *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col);
 
+template<typename T>
+void compute_raw_col_sums(unsigned int width, unsigned int height,
+                          const T *input, unsigned int in_stride, int32_t *col_sums)
+{
+    memset(reinterpret_cast<void *>(col_sums), 0, width * sizeof(int32_t));
+    for (unsigned int row = 0; row < height; ++row)
+    {
+        for (unsigned int col = 0; col < width; ++col)
+        {
+            col_sums[col] += static_cast<int32_t>(input[row * in_stride + col]);
+        }
+    }
+}
+
+template void compute_raw_col_sums(unsigned int width, unsigned int height, const int8_t *input, unsigned int in_stride, int32_t *col_sums);
+template void compute_raw_col_sums(unsigned int width, unsigned int height, const uint8_t *input, unsigned int in_stride, int32_t *col_sums);
+
 template<>
 void dequantize_block_32<float>(const DequantizeFloat &qp, unsigned int width, unsigned int height,
                          const int32_t* in_ptr, unsigned int in_stride, float *out_ptr, unsigned int out_stride,
-                         const float* bias_ptr, bool accumulate, const Activation &act)
+                         const float* bias_ptr, bool accumulate, const Activation &act,
+                         const int32_t *col_bias, const int32_t *row_sum, int32_t k_total)
 {
     const float32x4_t vscale = vdupq_n_f32(qp.scale);
     float maxval = std::numeric_limits<float>::infinity();
@@ -1000,14 +1018,38 @@ void dequantize_block_32<float>(const DequantizeFloat &qp, unsigned int width, u
     for(unsigned int row=0; row<height; row++) {
         auto row_in_ptr = in_ptr + (row * in_stride);
         auto row_out_ptr = out_ptr + (row * out_stride);
+
+        // Per-row addend from b_offset correction: -b_offset * sum_a_row[m] * scale
+        // row_sum values are packed with multiplier=1, so entry is plain sum(a_row[k]).
+        // Also add the cross-term: +a_offset * b_offset * K * scale (constant per tile).
+        float row_offset = 0.0f;
+        if (row_sum != nullptr) {
+            row_offset += static_cast<float>(-qp.b_offset * row_sum[row]) * qp.scale;
+        }
+        if (col_bias != nullptr && row_sum != nullptr && k_total != 0) {
+            // Cross-term: +a_offset * b_offset * K * scale
+            row_offset += static_cast<float>(qp.a_offset) * static_cast<float>(qp.b_offset)
+                          * static_cast<float>(k_total) * qp.scale;
+        }
+        const float32x4_t vrow_offset = vdupq_n_f32(row_offset);
+
         unsigned int col=0;
         if (width >= 4) {
             for(; col <= (width - 4); col+= 4) {
                 const int32x4_t vin = vld1q_s32(row_in_ptr + col);
                 float32x4_t vdeq = vmulq_f32(vcvtq_f32_s32(vin), vscale);
                 if(bias_ptr) {
-                    const float32x4_t bin = vld1q_f32(bias_ptr + col);
-                    vdeq = vaddq_f32(vdeq, bin);
+                    vdeq = vaddq_f32(vdeq, vld1q_f32(bias_ptr + col));
+                }
+                if(col_bias) {
+                    // a_offset correction: -a_offset * sum_b_col[n] * scale
+                    const float32x4_t vcol_corr = vmulq_f32(
+                        vcvtq_f32_s32(vld1q_s32(col_bias + col)),
+                        vdupq_n_f32(static_cast<float>(-qp.a_offset) * qp.scale));
+                    vdeq = vaddq_f32(vdeq, vcol_corr);
+                }
+                if(row_sum) {
+                    vdeq = vaddq_f32(vdeq, vrow_offset);
                 }
                 if(accumulate) {
                     vdeq = vaddq_f32(vdeq, vld1q_f32(row_out_ptr + col));
@@ -1019,10 +1061,16 @@ void dequantize_block_32<float>(const DequantizeFloat &qp, unsigned int width, u
         // left-over elements
         for(; col < width; ++col) {
             const int32_t val = *(row_in_ptr + col);
-            float res = static_cast<float>(val * qp.scale);
+            float res = static_cast<float>(val) * qp.scale;
             if(bias_ptr) {
                 res += static_cast<float>(*(bias_ptr + col));
             }
+            if(col_bias) {
+                res += static_cast<float>(-qp.a_offset * col_bias[col]) * qp.scale;
+            }
+            if(row_sum) {
+                res += row_offset;
+            }
             if(accumulate) {
                 res += *(row_out_ptr + col);
             }

diff --git a/src/cpu/kernels/assembly/arm_common/internal/quantized.hpp b/src/cpu/kernels/assembly/arm_common/internal/quantized.hpp
@@ -41,13 +41,23 @@ void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int h
                       const T *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth,
                       unsigned int multi, unsigned int first_col);
 
+/** Compute raw column sums of a matrix: col_sums[n] = sum_{k} input[k * in_stride + n].
+ *  Unlike compute_col_sums(), this does not apply any quantization offsets or scaling —
+ *  it stores the plain integer sums for use as weight column reductions in the
+ *  DequantizeFloat a_offset correction path. */
+template<typename T>
+void compute_raw_col_sums(unsigned int width, unsigned int height,
+                          const T *input, unsigned int in_stride, int32_t *col_sums);
+
 template<typename T>
 void row_sums_indirect(size_t num_strings, const unsigned int *string_lengths, IndirectInputArg<T> A_arg,
                        size_t M, int32_t *output_ptr, const Requantize32 *qp);
 
 template<typename T>
 void dequantize_block_32(const DequantizeFloat &qp, unsigned int width, unsigned int height,
                          const int32_t* input, unsigned int in_stride, T *output, unsigned int out_stride,
-                         const T *row_bias, bool not_first_pass, const Activation &act);
+                         const T *row_bias, bool not_first_pass, const Activation &act,
+                         const int32_t *col_bias = nullptr, const int32_t *row_sum = nullptr,
+                         int32_t k_total = 0);
 
 } // namespace arm_gemm
diff --git a/src/cpu/kernels/assembly/arm_gemm/arm_gemm.hpp b/src/cpu/kernels/assembly/arm_gemm/arm_gemm.hpp
@@ -240,12 +240,20 @@ struct Requantize32
 struct DequantizeFloat
 {
 public:
-    float scale = 0;
+    float   scale    = 0;
+    int32_t a_offset = 0; // input quantization zero-point  (subtract from each input value)
+    int32_t b_offset = 0; // weight quantization zero-point (subtract from each weight value)
 
     DequantizeFloat() = default;
 
-    // Constructor
-    DequantizeFloat(const float scale) : scale(scale)
+    // Constructor without offset (symmetric quantization)
+    DequantizeFloat(const float scale) : scale(scale), a_offset(0), b_offset(0)
+    {
+    }
+
+    // Constructor with asymmetric quantization offsets
+    DequantizeFloat(const float scale, int32_t a_offset, int32_t b_offset)
+        : scale(scale), a_offset(a_offset), b_offset(b_offset)
     {
     }
 };

diff --git a/src/cpu/operators/CpuConv2d.cpp b/src/cpu/operators/CpuConv2d.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021, 2023-2025 Arm Limited.
+ * Copyright (c) 2017-2021, 2023-2026 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -59,13 +59,14 @@ void CpuConv2d::configure(ITensorInfo               *input,
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_UNUSED(num_groups);
-    ARM_COMPUTE_ERROR_THROW_ON(CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation,
-                                                   act_info, enable_fast_math, num_groups));
+    ARM_COMPUTE_ERROR_THROW_ON(
+        CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
+                            enable_fast_math, num_groups));
 
     ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
                            enable_fast_math, num_groups);
 
-    const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
+    const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups, weights_info);
     switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info,
                                               enable_fast_math))
     {
@@ -119,7 +120,8 @@ Status CpuConv2d::validate(const ITensorInfo         *input,
     ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuConv2d::validate");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on Neon");
 
-    const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
+    const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups, weights_info);
+
     switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info,
                                               enable_fast_math))
     {
@@ -155,7 +157,17 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo         *i
                                                     bool                       enable_fast_math)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights);
-    ARM_COMPUTE_UNUSED(weights_info);
+
+    // For QASYMM8_SIGNED→F32 with NHWC and no dilation, automatically select the single-kernel
+    // CpuGemmDirectConv2d path when it validates successfully.
+    if (input->data_type() == DataType::QASYMM8_SIGNED && output->data_type() == DataType::F32)
+    {
+        const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, 1, weights_info);
+        if (bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info)))
+        {
+            return ConvolutionMethod::GEMM_CONV2D;
+        }
+    }
 
     const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);