From c7f4f74a4ab4d1f6ff97c0a0e39e120998e987de Mon Sep 17 00:00:00 2001
From: Scott Roy <scroy@meta.com>
Date: Tue, 9 Jun 2026 12:13:42 -0700
Subject: [PATCH] Switch to neon for interleave (#20137)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:

The BGRA/RGB → planar-CHW-float deinterleave + normalization step was implemented twice and sub-optimally: the Apple backend used a strided vDSP gather (vDSP_vfltu8 ×3 + vDSP_vsmsa, ~6 passes over the input), and the portable/Android backend used a scalar triple-nested loop. This replaces both with a single hand-vectorized kernel in a new shared translation unit.

image_processor_simd.{h,cpp} provides deinterleave_to_chw():

* One vld4q_u8 (BGRA/RGBA) or vld3q_u8 (RGB) read, widen uint8→float in-register, fused per-channel affine out = in*(scale/std) + (-mean/std) via vfmaq_f32, single write per plane.
* NEON on ARM (all shipping iOS/Apple-silicon targets and Android arm64), scalar fallback elsewhere.
* Handles the fast (contiguous) path plus a row-by-row slow path for stride padding and letterbox offsets.

Both backends now call it.

Per the benchmark script, GPU rows are unchanged within noise (0.87x to 1.41x, median 1.11x), while CPU rows range from on par (0.98x) to 1.86x faster (median 1.19x).

```
CPU rows       n=120  median=1.19x  min=0.98x  max=1.86x
GPU rows       n=36   median=1.11x  min=0.87x  max=1.41x
Default rows   n=24   median=1.29x  min=0.99x  max=1.71x

```

See D102373165 for inspiration.

Differential Revision: D107958353
---
 extension/image/CMakeLists.txt            |   9 +-
 extension/image/image_processor.cpp       |  37 +++--
 extension/image/image_processor_apple.cpp |  88 +---------
 extension/image/image_processor_simd.cpp  | 186 ++++++++++++++++++++++
 extension/image/image_processor_simd.h    |  55 +++++++
 extension/image/targets.bzl               |   6 +-
 6 files changed, 278 insertions(+), 103 deletions(-)
 create mode 100644 extension/image/image_processor_simd.cpp
 create mode 100644 extension/image/image_processor_simd.h
diff --git a/extension/image/CMakeLists.txt b/extension/image/CMakeLists.txt
index 7525fe7de44..0c233ffc796 100644
--- a/extension/image/CMakeLists.txt
+++ b/extension/image/CMakeLists.txt
@@ -9,8 +9,8 @@ cmake_minimum_required(VERSION 3.19)
 if(APPLE)
   enable_language(OBJCXX)
   add_library(
-    extension_image image_processor_common.cpp image_processor_apple.cpp
-                    image_processor_apple_gpu.mm
+    extension_image image_processor_common.cpp image_processor_simd.cpp
+                    image_processor_apple.cpp image_processor_apple_gpu.mm
   )
   set_source_files_properties(
     image_processor_apple_gpu.mm PROPERTIES COMPILE_FLAGS "-fobjc-arc"
@@ -39,7 +39,10 @@ else()
   )
   FetchContent_MakeAvailable(stb)
 
-  add_library(extension_image image_processor_common.cpp image_processor.cpp)
+  add_library(
+    extension_image image_processor_common.cpp image_processor_simd.cpp
+                    image_processor.cpp
+  )
 
   # stb_image_resize.h lives under deprecated/ in current stb. Private: only the
   # .cpp uses it, not the installed public headers.
diff --git a/extension/image/image_processor.cpp b/extension/image/image_processor.cpp
index 0f1b8f4f7de..4605f8004c0 100644
--- a/extension/image/image_processor.cpp
+++ b/extension/image/image_processor.cpp
@@ -7,6 +7,7 @@
  */
 
 #include <executorch/extension/image/image_processor.h>
+#include <executorch/extension/image/image_processor_simd.h>
 
 #include <algorithm>
 #include <cstring>
@@ -420,25 +421,23 @@ Error ImageProcessor::process_into(
         InvalidArgument,
         "normalization std_dev must be nonzero");
   }
-  // Source (resized RGB) carries input_channels; the output tensor carries
-  // output_channels. They are equal today, so channels map 1:1; a future
-  // divergence (e.g. grayscale) would need an explicit channel map here.
-  for (int32_t y = 0; y < resize_h; ++y) {
-    for (int32_t x = 0; x < resize_w; ++x) {
-      const int32_t src_idx = (y * resize_w + x) * input_channels;
-      const int32_t dst_y = y + offset_y;
-      const int32_t dst_x = x + offset_x;
-      for (int32_t c = 0; c < output_channels; ++c) {
-        const float val =
-            (resized_buf[src_idx + c] * norm.scale_factor - norm.mean[c]) /
-            norm.std_dev[c];
-        const size_t out_idx = static_cast<size_t>(c) * final_w * final_h +
-            static_cast<size_t>(dst_y) * final_w + dst_x;
-        output[out_idx] = val;
-      }
-    }
-  }
-  return Error::Ok;
+  // Deinterleave + normalize the resized interleaved RGB (R/G/B at byte
+  // offsets 0/1/2) into the CHW output.
+  return deinterleave_to_chw(
+      resized_buf.data(),
+      resize_w,
+      resize_h,
+      resize_w * input_channels,
+      input_channels,
+      /*r_off=*/0,
+      /*g_off=*/1,
+      /*b_off=*/2,
+      output,
+      final_w,
+      final_h,
+      offset_x,
+      offset_y,
+      norm);
 }
 
 Error ImageProcessor::process_yuv_into(
diff --git a/extension/image/image_processor_apple.cpp b/extension/image/image_processor_apple.cpp
index 44e6d2c083e..04c599ab0ff 100644
--- a/extension/image/image_processor_apple.cpp
+++ b/extension/image/image_processor_apple.cpp
@@ -20,6 +20,7 @@
 
 #include <executorch/extension/image/image_processor.h>
 #include <executorch/extension/image/image_processor_apple.h>
+#include <executorch/extension/image/image_processor_simd.h>
 
 #include <algorithm>
 #include <cstring>
@@ -391,85 +392,6 @@ size_t compute_scale_temp_size(
   return temp_size > 0 ? static_cast<size_t>(temp_size) : 0;
 }
 
-// Deinterleave BGRA uint8 → planar RGB float with fused normalization.
-// Handles offset for letterbox padding.
-//
-// Per channel (R, G, B): vDSP_vfltu8 reads the matching byte from BGRA via
-// stride=4 and converts uint8→float, then vDSP_vsmsa applies the fused
-// affine `out = in * (scale_factor / std_dev) + (-mean / std_dev)` in-place.
-Error deinterleave_bgra_to_chw(
-    const uint8_t* bgra_data,
-    int32_t src_w,
-    int32_t src_h,
-    int32_t src_stride,
-    float* output,
-    int32_t final_w,
-    int32_t final_h,
-    int32_t offset_x,
-    int32_t offset_y,
-    const Normalization& norm) {
-  const size_t spatial = static_cast<size_t>(final_w) * final_h;
-
-  // Per-channel affine coefficients for `out = in * a + b`.
-  // BGRA byte layout: byte 0 = B, byte 1 = G, byte 2 = R; norm.{mean,std_dev}
-  // are indexed in RGB order (channel 0 = R, 1 = G, 2 = B).
-  const float a_r = norm.scale_factor / norm.std_dev[0];
-  const float a_g = norm.scale_factor / norm.std_dev[1];
-  const float a_b = norm.scale_factor / norm.std_dev[2];
-  const float b_r = -norm.mean[0] / norm.std_dev[0];
-  const float b_g = -norm.mean[1] / norm.std_dev[1];
-  const float b_b = -norm.mean[2] / norm.std_dev[2];
-
-  // When the bias is zero (e.g. zeroToOne / mean=0), a plain scale (vsmul) is
-  // cheaper than the fused scale+add (vsmsa).
-  const bool no_offset = (b_r == 0.0f && b_g == 0.0f && b_b == 0.0f);
-  auto scale_bias =
-      [no_offset](float* p, const float* a, const float* b, vDSP_Length n) {
-        if (no_offset) {
-          vDSP_vsmul(p, 1, a, p, 1, n);
-        } else {
-          vDSP_vsmsa(p, 1, a, b, p, 1, n);
-        }
-      };
-
-  // Output planes in CHW order: R, G, B. Each plane is final_w × final_h
-  // floats; we write a src_h × src_w region starting at (offset_y, offset_x).
-  float* r_plane = output + 0 * spatial;
-  float* g_plane = output + 1 * spatial;
-  float* b_plane = output + 2 * spatial;
-
-  // Fast path: source is contiguous and destination region is the entire
-  // plane (offsets 0, src dims == final dims).
-  if (src_stride == src_w * 4 && offset_x == 0 && offset_y == 0 &&
-      src_w == final_w && src_h == final_h) {
-    const vDSP_Length n = static_cast<vDSP_Length>(src_w) * src_h;
-    vDSP_vfltu8(bgra_data + 2, 4, r_plane, 1, n);
-    scale_bias(r_plane, &a_r, &b_r, n);
-    vDSP_vfltu8(bgra_data + 1, 4, g_plane, 1, n);
-    scale_bias(g_plane, &a_g, &b_g, n);
-    vDSP_vfltu8(bgra_data + 0, 4, b_plane, 1, n);
-    scale_bias(b_plane, &a_b, &b_b, n);
-    return Error::Ok;
-  }
-
-  // Slow path: row-by-row to handle stride padding and/or letterbox offsets.
-  for (int32_t y = 0; y < src_h; ++y) {
-    const uint8_t* src_row = bgra_data + y * src_stride;
-    const ptrdiff_t dst_off = (y + offset_y) * final_w + offset_x;
-    float* r_dst = r_plane + dst_off;
-    float* g_dst = g_plane + dst_off;
-    float* b_dst = b_plane + dst_off;
-    const vDSP_Length n = static_cast<vDSP_Length>(src_w);
-    vDSP_vfltu8(src_row + 2, 4, r_dst, 1, n);
-    scale_bias(r_dst, &a_r, &b_r, n);
-    vDSP_vfltu8(src_row + 1, 4, g_dst, 1, n);
-    scale_bias(g_dst, &a_g, &b_g, n);
-    vDSP_vfltu8(src_row + 0, 4, b_dst, 1, n);
-    scale_bias(b_dst, &a_b, &b_b, n);
-  }
-  return Error::Ok;
-}
-
 // Rotate an interleaved BGRA (ARGB8888 layout) buffer by `orientation` using
 // vImage's SIMD/cache-aware 90-degree rotation, writing a tightly-packed result
 // into `scratch`. UP is handled by the caller (no rotation). out_data/out_w/
@@ -590,11 +512,16 @@ Error normalize_bgra_into(
     offset_y = offset.second;
   }
 
-  return deinterleave_bgra_to_chw(
+  // BGRA byte layout: B=0, G=1, R=2 (alpha dropped); norm is RGB-indexed.
+  return deinterleave_to_chw(
       bgra_data,
       width,
       height,
       stride,
+      /*in_channels=*/4,
+      /*r_off=*/2,
+      /*g_off=*/1,
+      /*b_off=*/0,
       out,
       final_w,
       final_h,
@@ -1380,6 +1307,7 @@ Error process_pixelbuffer_into(
 
 // Allocate a CHW float tensor sized to the configured target and fill it via
 // process_pixelbuffer_into.
+// cppcheck-suppress unusedFunction
 Result<TensorPtr> process_pixelbuffer(
     const ImageProcessor& processor,
     CVPixelBufferRef pixelBuffer,
diff --git a/extension/image/image_processor_simd.cpp b/extension/image/image_processor_simd.cpp
new file mode 100644
index 00000000000..57b1cc32e08
--- /dev/null
+++ b/extension/image/image_processor_simd.cpp
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/extension/image/image_processor_simd.h>
+
+#include <cstddef>
+
+#include <executorch/runtime/platform/assert.h>
+
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#include <arm_neon.h>
+#define ET_IMAGE_USE_NEON 1
+#else
+#define ET_IMAGE_USE_NEON 0
+#endif
+
+namespace executorch {
+namespace extension {
+namespace image {
+
+using runtime::Error;
+
+namespace {
+
+#if ET_IMAGE_USE_NEON
+// Converts 16 uint8 lanes to float (u8 -> u16 -> u32 -> f32), applies the
+// affine out = in * a + b as a fused multiply-add, and writes dst[0..15].
+__attribute__((always_inline)) inline void
+widen_fma_store(uint8x16_t ch, float* dst, float32x4_t a, float32x4_t b) {
+  const uint16x8_t half_lo = vmovl_u8(vget_low_u8(ch)); // lanes 0..7
+  const uint16x8_t half_hi = vmovl_u8(vget_high_u8(ch)); // lanes 8..15
+  const float32x4_t f0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(half_lo)));
+  const float32x4_t f1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(half_lo)));
+  const float32x4_t f2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(half_hi)));
+  const float32x4_t f3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(half_hi)));
+  vst1q_f32(dst + 0, vfmaq_f32(b, f0, a));
+  vst1q_f32(dst + 4, vfmaq_f32(b, f1, a));
+  vst1q_f32(dst + 8, vfmaq_f32(b, f2, a));
+  vst1q_f32(dst + 12, vfmaq_f32(b, f3, a));
+}
+#endif // ET_IMAGE_USE_NEON
+
+// Deinterleave + normalize `n` packed pixels (in_channels bytes each) from src
+// into r/g/b_out[0..n) as out = in * a + b per channel. NEON converts 16-pixel
+// groups for 3/4 channels; the scalar loop covers the tail and anything else.
+void deinterleave_run(
+    const uint8_t* __restrict src,
+    size_t n,
+    int32_t in_channels,
+    int32_t r_off,
+    int32_t g_off,
+    int32_t b_off,
+    float* __restrict r_out,
+    float* __restrict g_out,
+    float* __restrict b_out,
+    float a_r,
+    float b_r,
+    float a_g,
+    float b_g,
+    float a_b,
+    float b_b) {
+  size_t i = 0;
+#if ET_IMAGE_USE_NEON
+  const float32x4_t va_r = vdupq_n_f32(a_r);
+  const float32x4_t vb_r = vdupq_n_f32(b_r);
+  const float32x4_t va_g = vdupq_n_f32(a_g);
+  const float32x4_t vb_g = vdupq_n_f32(b_g);
+  const float32x4_t va_b = vdupq_n_f32(a_b);
+  const float32x4_t vb_b = vdupq_n_f32(b_b);
+  if (in_channels == 4) {
+    for (; i + 16 <= n; i += 16) {
+      const uint8x16x4_t px = vld4q_u8(src + i * 4);
+      widen_fma_store(px.val[r_off], r_out + i, va_r, vb_r);
+      widen_fma_store(px.val[g_off], g_out + i, va_g, vb_g);
+      widen_fma_store(px.val[b_off], b_out + i, va_b, vb_b);
+    }
+  } else if (in_channels == 3) {
+    for (; i + 16 <= n; i += 16) {
+      const uint8x16x3_t px = vld3q_u8(src + i * 3);
+      widen_fma_store(px.val[r_off], r_out + i, va_r, vb_r);
+      widen_fma_store(px.val[g_off], g_out + i, va_g, vb_g);
+      widen_fma_store(px.val[b_off], b_out + i, va_b, vb_b);
+    }
+  } // Other channel counts skip NEON and use the strided scalar loop below.
+#endif // ET_IMAGE_USE_NEON
+  for (; i < n; ++i) {
+    const uint8_t* p = src + i * in_channels;
+    r_out[i] = static_cast<float>(p[r_off]) * a_r + b_r;
+    g_out[i] = static_cast<float>(p[g_off]) * a_g + b_g;
+    b_out[i] = static_cast<float>(p[b_off]) * a_b + b_b;
+  }
+}
+
+} // namespace
+
+// Validates the pixel layout, derives the per-channel affine coefficients, and
+// deinterleaves `src` into the R/G/B planes of `output`. Returns
+// InvalidArgument for an unsupported channel count or offset, otherwise Ok.
+Error deinterleave_to_chw(
+    const uint8_t* src,
+    int32_t src_w,
+    int32_t src_h,
+    int32_t src_stride,
+    int32_t in_channels,
+    int32_t r_off,
+    int32_t g_off,
+    int32_t b_off,
+    float* output,
+    int32_t final_w,
+    int32_t final_h,
+    int32_t offset_x,
+    int32_t offset_y,
+    const Normalization& norm) {
+  // Checked at runtime rather than with ET_DCHECK_MSG: a debug-only check
+  // would let a bad channel count/offset index past the deinterleaved NEON
+  // registers (px.val[]) or read outside the pixel in optimized builds.
+  ET_CHECK_OR_RETURN_ERROR(
+      in_channels == 3 || in_channels == 4,
+      InvalidArgument,
+      "in_channels must be 3 or 4");
+  const auto valid_off = [in_channels](int32_t off) {
+    return off >= 0 && off < in_channels;
+  };
+  ET_CHECK_OR_RETURN_ERROR(
+      valid_off(r_off) && valid_off(g_off) && valid_off(b_off),
+      InvalidArgument,
+      "channel offsets must be in [0, in_channels)");
+  if (src_w <= 0 || src_h <= 0) {
+    return Error::Ok; // Empty region: nothing to convert.
+  }
+  const size_t spatial = static_cast<size_t>(final_w) * final_h;
+
+  // Per-channel affine coefficients for `out = in * a + b`, in RGB order.
+  const float a_r = norm.scale_factor / norm.std_dev[0];
+  const float a_g = norm.scale_factor / norm.std_dev[1];
+  const float a_b = norm.scale_factor / norm.std_dev[2];
+  const float b_r = -norm.mean[0] / norm.std_dev[0];
+  const float b_g = -norm.mean[1] / norm.std_dev[1];
+  const float b_b = -norm.mean[2] / norm.std_dev[2];
+
+  // Output planes in CHW order: R, G, B.
+  float* r_plane = output + 0 * spatial;
+  float* g_plane = output + 1 * spatial;
+  float* b_plane = output + 2 * spatial;
+
+  // A contiguous source covering the entire plane (no stride padding, no
+  // letterbox offset, src dims == final dims) is converted as one run over all
+  // pixels; otherwise go row by row to honor the stride and/or offset.
+  const bool contiguous = src_stride == src_w * in_channels && offset_x == 0 &&
+      offset_y == 0 && src_w == final_w && src_h == final_h;
+  const int32_t runs = contiguous ? 1 : src_h;
+  const size_t run_len = contiguous ? static_cast<size_t>(src_w) * src_h
+                                    : static_cast<size_t>(src_w);
+  for (int32_t y = 0; y < runs; ++y) {
+    // Signed offsets keep the pointer arithmetic well-defined.
+    const uint8_t* src_row = src + static_cast<ptrdiff_t>(y) * src_stride;
+    const ptrdiff_t dst_off =
+        (static_cast<ptrdiff_t>(y) + offset_y) * final_w + offset_x;
+    deinterleave_run(
+        src_row,
+        run_len,
+        in_channels,
+        r_off,
+        g_off,
+        b_off,
+        r_plane + dst_off,
+        g_plane + dst_off,
+        b_plane + dst_off,
+        a_r,
+        b_r,
+        a_g,
+        b_g,
+        a_b,
+        b_b);
+  }
+  return Error::Ok;
+}
+
+} // namespace image
+} // namespace extension
+} // namespace executorch
diff --git a/extension/image/image_processor_simd.h b/extension/image/image_processor_simd.h
new file mode 100644
index 00000000000..ad7cd0191e2
--- /dev/null
+++ b/extension/image/image_processor_simd.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+#include <executorch/extension/image/image_processor_config.h>
+#include <executorch/runtime/core/error.h>
+
+namespace executorch {
+namespace extension {
+namespace image {
+
+// SIMD-accelerated image-processing kernels (NEON on ARM, scalar fallback
+// elsewhere), shared by the Apple and portable ImageProcessor backends.
+
+// Deinterleave an 8-bit interleaved image into planar CHW float (planes in
+// R, G, B order) with a fused per-channel affine normalize:
+//   out = pixel * (scale_factor / std_dev[c]) + (-mean[c] / std_dev[c]).
+// Uses NEON (vld4q_u8 / vld3q_u8 + FMA) on ARM, scalar elsewhere.
+//
+// in_channels is 3 (RGB) or 4 (BGRA/RGBA; the alpha byte is ignored).
+// r_off/g_off/b_off are the byte offsets of R, G, B within a pixel
+// (BGRA -> {2, 1, 0}, RGB/RGBA -> {0, 1, 2}); they also index the deinterleaved
+// channels, so each must be < in_channels. norm.{mean,std_dev} are in RGB
+// order; std_dev is used as a divisor unchecked, so callers must reject zero.
+//
+// Writes a src_w x src_h region at (offset_x, offset_y) into the final_w x
+// final_h planes of `output` (3 * final_w * final_h floats); the rest is left
+// untouched, so letterboxing callers must pre-fill it. src_stride is in bytes.
+runtime::Error deinterleave_to_chw(
+    const uint8_t* src,
+    int32_t src_w,
+    int32_t src_h,
+    int32_t src_stride,
+    int32_t in_channels,
+    int32_t r_off,
+    int32_t g_off,
+    int32_t b_off,
+    float* output,
+    int32_t final_w,
+    int32_t final_h,
+    int32_t offset_x,
+    int32_t offset_y,
+    const Normalization& norm);
+
+} // namespace image
+} // namespace extension
+} // namespace executorch
diff --git a/extension/image/targets.bzl b/extension/image/targets.bzl
index f25e0e6bfe5..c857b8d9b07 100644
--- a/extension/image/targets.bzl
+++ b/extension/image/targets.bzl
@@ -29,7 +29,10 @@ def define_common_targets():
 
         runtime.cxx_library(
             name = "image_processor" + aten_suffix,
-            srcs = ["image_processor_common.cpp"] + select({
+            srcs = [
+                "image_processor_common.cpp",
+                "image_processor_simd.cpp",
+            ] + select({
                 "DEFAULT": ["image_processor.cpp"],
                 "ovr_config//os:iphoneos": [
                     "image_processor_apple.cpp",
@@ -42,6 +45,7 @@ def define_common_targets():
             }),
             headers = [
                 "image_processor_apple_gpu.h",
+                "image_processor_simd.h",
             ],
             exported_headers = [
                 "image_processor.h",