diff --git a/extension/image/CMakeLists.txt b/extension/image/CMakeLists.txt
index 7525fe7de44..0c233ffc796 100644
--- a/extension/image/CMakeLists.txt
+++ b/extension/image/CMakeLists.txt
@@ -9,8 +9,8 @@ cmake_minimum_required(VERSION 3.19)
 if(APPLE)
   enable_language(OBJCXX)
   add_library(
-    extension_image image_processor_common.cpp image_processor_apple.cpp
-                    image_processor_apple_gpu.mm
+    extension_image image_processor_common.cpp image_processor_simd.cpp
+                    image_processor_apple.cpp image_processor_apple_gpu.mm
   )
   set_source_files_properties(
     image_processor_apple_gpu.mm PROPERTIES COMPILE_FLAGS "-fobjc-arc"
@@ -39,7 +39,10 @@ else()
   )
   FetchContent_MakeAvailable(stb)
 
-  add_library(extension_image image_processor_common.cpp image_processor.cpp)
+  add_library(
+    extension_image image_processor_common.cpp image_processor_simd.cpp
+                    image_processor.cpp
+  )
 
   # stb_image_resize.h lives under deprecated/ in current stb. Private: only the
   # .cpp uses it, not the installed public headers.
diff --git a/extension/image/image_processor.cpp b/extension/image/image_processor.cpp
index 0f1b8f4f7de..4605f8004c0 100644
--- a/extension/image/image_processor.cpp
+++ b/extension/image/image_processor.cpp
@@ -7,6 +7,7 @@
  */
 
 #include <executorch/extension/image/image_processor.h>
+#include <executorch/extension/image/image_processor_simd.h>
 
 #include <algorithm>
 #include <cstring>
@@ -420,25 +421,23 @@ Error ImageProcessor::process_into(
         InvalidArgument,
         "normalization std_dev must be nonzero");
   }
-  // Source (resized RGB) carries input_channels; the output tensor carries
-  // output_channels. They are equal today, so channels map 1:1; a future
-  // divergence (e.g. grayscale) would need an explicit channel map here.
-  for (int32_t y = 0; y < resize_h; ++y) {
-    for (int32_t x = 0; x < resize_w; ++x) {
-      const int32_t src_idx = (y * resize_w + x) * input_channels;
-      const int32_t dst_y = y + offset_y;
-      const int32_t dst_x = x + offset_x;
-      for (int32_t c = 0; c < output_channels; ++c) {
-        const float val =
-            (resized_buf[src_idx + c] * norm.scale_factor - norm.mean[c]) /
-            norm.std_dev[c];
-        const size_t out_idx = static_cast<size_t>(c) * final_w * final_h +
-            static_cast<size_t>(dst_y) * final_w + dst_x;
-        output[out_idx] = val;
-      }
-    }
-  }
-  return Error::Ok;
+  // Deinterleave + normalize the resized interleaved RGB (R/G/B at byte
+  // offsets 0/1/2) into the CHW output.
+  return deinterleave_to_chw(
+      resized_buf.data(),
+      resize_w,
+      resize_h,
+      resize_w * input_channels,
+      input_channels,
+      /*r_off=*/0,
+      /*g_off=*/1,
+      /*b_off=*/2,
+      output,
+      final_w,
+      final_h,
+      offset_x,
+      offset_y,
+      norm);
 }
 
 Error ImageProcessor::process_yuv_into(
diff --git a/extension/image/image_processor_apple.cpp b/extension/image/image_processor_apple.cpp
index 44e6d2c083e..04c599ab0ff 100644
--- a/extension/image/image_processor_apple.cpp
+++ b/extension/image/image_processor_apple.cpp
@@ -20,6 +20,7 @@
 
 #include <executorch/extension/image/image_processor.h>
 #include <executorch/extension/image/image_processor_apple.h>
+#include <executorch/extension/image/image_processor_simd.h>
 
 #include <algorithm>
 #include <cstring>
@@ -391,85 +392,6 @@ size_t compute_scale_temp_size(
   return temp_size > 0 ? static_cast<size_t>(temp_size) : 0;
 }
 
-// Deinterleave BGRA uint8 → planar RGB float with fused normalization.
-// Handles offset for letterbox padding.
-//
-// Per channel (R, G, B): vDSP_vfltu8 reads the matching byte from BGRA via
-// stride=4 and converts uint8→float, then vDSP_vsmsa applies the fused
-// affine `out = in * (scale_factor / std_dev) + (-mean / std_dev)` in-place.
-Error deinterleave_bgra_to_chw(
-    const uint8_t* bgra_data,
-    int32_t src_w,
-    int32_t src_h,
-    int32_t src_stride,
-    float* output,
-    int32_t final_w,
-    int32_t final_h,
-    int32_t offset_x,
-    int32_t offset_y,
-    const Normalization& norm) {
-  const size_t spatial = static_cast<size_t>(final_w) * final_h;
-
-  // Per-channel affine coefficients for `out = in * a + b`.
-  // BGRA byte layout: byte 0 = B, byte 1 = G, byte 2 = R; norm.{mean,std_dev}
-  // are indexed in RGB order (channel 0 = R, 1 = G, 2 = B).
-  const float a_r = norm.scale_factor / norm.std_dev[0];
-  const float a_g = norm.scale_factor / norm.std_dev[1];
-  const float a_b = norm.scale_factor / norm.std_dev[2];
-  const float b_r = -norm.mean[0] / norm.std_dev[0];
-  const float b_g = -norm.mean[1] / norm.std_dev[1];
-  const float b_b = -norm.mean[2] / norm.std_dev[2];
-
-  // When the bias is zero (e.g. zeroToOne / mean=0), a plain scale (vsmul) is
-  // cheaper than the fused scale+add (vsmsa).
-  const bool no_offset = (b_r == 0.0f && b_g == 0.0f && b_b == 0.0f);
-  auto scale_bias =
-      [no_offset](float* p, const float* a, const float* b, vDSP_Length n) {
-        if (no_offset) {
-          vDSP_vsmul(p, 1, a, p, 1, n);
-        } else {
-          vDSP_vsmsa(p, 1, a, b, p, 1, n);
-        }
-      };
-
-  // Output planes in CHW order: R, G, B. Each plane is final_w × final_h
-  // floats; we write a src_h × src_w region starting at (offset_y, offset_x).
-  float* r_plane = output + 0 * spatial;
-  float* g_plane = output + 1 * spatial;
-  float* b_plane = output + 2 * spatial;
-
-  // Fast path: source is contiguous and destination region is the entire
-  // plane (offsets 0, src dims == final dims).
-  if (src_stride == src_w * 4 && offset_x == 0 && offset_y == 0 &&
-      src_w == final_w && src_h == final_h) {
-    const vDSP_Length n = static_cast<vDSP_Length>(src_w) * src_h;
-    vDSP_vfltu8(bgra_data + 2, 4, r_plane, 1, n);
-    scale_bias(r_plane, &a_r, &b_r, n);
-    vDSP_vfltu8(bgra_data + 1, 4, g_plane, 1, n);
-    scale_bias(g_plane, &a_g, &b_g, n);
-    vDSP_vfltu8(bgra_data + 0, 4, b_plane, 1, n);
-    scale_bias(b_plane, &a_b, &b_b, n);
-    return Error::Ok;
-  }
-
-  // Slow path: row-by-row to handle stride padding and/or letterbox offsets.
-  for (int32_t y = 0; y < src_h; ++y) {
-    const uint8_t* src_row = bgra_data + y * src_stride;
-    const ptrdiff_t dst_off = (y + offset_y) * final_w + offset_x;
-    float* r_dst = r_plane + dst_off;
-    float* g_dst = g_plane + dst_off;
-    float* b_dst = b_plane + dst_off;
-    const vDSP_Length n = static_cast<vDSP_Length>(src_w);
-    vDSP_vfltu8(src_row + 2, 4, r_dst, 1, n);
-    scale_bias(r_dst, &a_r, &b_r, n);
-    vDSP_vfltu8(src_row + 1, 4, g_dst, 1, n);
-    scale_bias(g_dst, &a_g, &b_g, n);
-    vDSP_vfltu8(src_row + 0, 4, b_dst, 1, n);
-    scale_bias(b_dst, &a_b, &b_b, n);
-  }
-  return Error::Ok;
-}
-
 // Rotate an interleaved BGRA (ARGB8888 layout) buffer by `orientation` using
 // vImage's SIMD/cache-aware 90-degree rotation, writing a tightly-packed result
 // into `scratch`. UP is handled by the caller (no rotation). out_data/out_w/
@@ -590,11 +512,16 @@ Error normalize_bgra_into(
     offset_y = offset.second;
   }
 
-  return deinterleave_bgra_to_chw(
+  // BGRA byte layout: B=0, G=1, R=2 (alpha dropped); norm is RGB-indexed.
+  return deinterleave_to_chw(
       bgra_data,
       width,
       height,
       stride,
+      /*in_channels=*/4,
+      /*r_off=*/2,
+      /*g_off=*/1,
+      /*b_off=*/0,
       out,
       final_w,
       final_h,
@@ -1380,6 +1307,7 @@ Error process_pixelbuffer_into(
 
 // Allocate a CHW float tensor sized to the configured target and fill it via
 // process_pixelbuffer_into.
+// cppcheck-suppress unusedFunction
 Result<TensorPtr> process_pixelbuffer(
     const ImageProcessor& processor,
     CVPixelBufferRef pixelBuffer,
diff --git a/extension/image/image_processor_simd.cpp b/extension/image/image_processor_simd.cpp
new file mode 100644
index 00000000000..57b1cc32e08
--- /dev/null
+++ b/extension/image/image_processor_simd.cpp
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/extension/image/image_processor_simd.h>
+
+#include <cstddef>
+
+#include <executorch/runtime/platform/assert.h>
+
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#include <arm_neon.h>
+#define ET_IMAGE_USE_NEON 1
+#else
+#define ET_IMAGE_USE_NEON 0
+#endif
+
+namespace executorch {
+namespace extension {
+namespace image {
+
+using runtime::Error;
+
+namespace {
+
+#if ET_IMAGE_USE_NEON
+// Widen 16 uint8 -> 4x float32x4, apply out = in * a + b (single-rounding FMA),
+// and store the 16 resulting floats.
+__attribute__((always_inline)) inline void
+widen_fma_store(uint8x16_t ch, float* dst, float32x4_t a, float32x4_t b) {
+  uint16x8_t lo = vmovl_u8(vget_low_u8(ch));
+  uint16x8_t hi = vmovl_u8(vget_high_u8(ch));
+  vst1q_f32(
+      dst + 0, vfmaq_f32(b, vcvtq_f32_u32(vmovl_u16(vget_low_u16(lo))), a));
+  vst1q_f32(
+      dst + 4, vfmaq_f32(b, vcvtq_f32_u32(vmovl_u16(vget_high_u16(lo))), a));
+  vst1q_f32(
+      dst + 8, vfmaq_f32(b, vcvtq_f32_u32(vmovl_u16(vget_low_u16(hi))), a));
+  vst1q_f32(
+      dst + 12, vfmaq_f32(b, vcvtq_f32_u32(vmovl_u16(vget_high_u16(hi))), a));
+}
+#endif // ET_IMAGE_USE_NEON
+
+// Deinterleave + normalize one contiguous run of `n` pixels (stride
+// in_channels bytes/pixel) into the r/g/b float planes. NEON when available,
+// scalar otherwise; the scalar tail also finishes the final (<16) pixels.
+void deinterleave_run(
+    const uint8_t* __restrict src,
+    size_t n,
+    int32_t in_channels,
+    int32_t r_off,
+    int32_t g_off,
+    int32_t b_off,
+    float* __restrict r_out,
+    float* __restrict g_out,
+    float* __restrict b_out,
+    float a_r,
+    float b_r,
+    float a_g,
+    float b_g,
+    float a_b,
+    float b_b) {
+  size_t i = 0;
+#if ET_IMAGE_USE_NEON
+  const float32x4_t va_r = vdupq_n_f32(a_r);
+  const float32x4_t vb_r = vdupq_n_f32(b_r);
+  const float32x4_t va_g = vdupq_n_f32(a_g);
+  const float32x4_t vb_g = vdupq_n_f32(b_g);
+  const float32x4_t va_b = vdupq_n_f32(a_b);
+  const float32x4_t vb_b = vdupq_n_f32(b_b);
+  if (in_channels == 4) {
+    for (; i + 16 <= n; i += 16) {
+      uint8x16x4_t px = vld4q_u8(src + i * 4);
+      widen_fma_store(px.val[r_off], r_out + i, va_r, vb_r);
+      widen_fma_store(px.val[g_off], g_out + i, va_g, vb_g);
+      widen_fma_store(px.val[b_off], b_out + i, va_b, vb_b);
+    }
+  } else { // in_channels == 3
+    for (; i + 16 <= n; i += 16) {
+      uint8x16x3_t px = vld3q_u8(src + i * 3);
+      widen_fma_store(px.val[r_off], r_out + i, va_r, vb_r);
+      widen_fma_store(px.val[g_off], g_out + i, va_g, vb_g);
+      widen_fma_store(px.val[b_off], b_out + i, va_b, vb_b);
+    }
+  }
+#endif // ET_IMAGE_USE_NEON
+  for (; i < n; ++i) {
+    const uint8_t* p = src + i * in_channels;
+    r_out[i] = static_cast<float>(p[r_off]) * a_r + b_r;
+    g_out[i] = static_cast<float>(p[g_off]) * a_g + b_g;
+    b_out[i] = static_cast<float>(p[b_off]) * a_b + b_b;
+  }
+}
+
+} // namespace
+
+Error deinterleave_to_chw(
+    const uint8_t* src,
+    int32_t src_w,
+    int32_t src_h,
+    int32_t src_stride,
+    int32_t in_channels,
+    int32_t r_off,
+    int32_t g_off,
+    int32_t b_off,
+    float* output,
+    int32_t final_w,
+    int32_t final_h,
+    int32_t offset_x,
+    int32_t offset_y,
+    const Normalization& norm) {
+  ET_DCHECK_MSG(
+      in_channels == 3 || in_channels == 4, "in_channels must be 3 or 4");
+  ET_DCHECK_MSG(
+      r_off < in_channels && g_off < in_channels && b_off < in_channels,
+      "channel offsets must be < in_channels");
+  const size_t spatial = static_cast<size_t>(final_w) * final_h;
+
+  // Per-channel affine coefficients for `out = in * a + b`, in RGB order.
+  const float a_r = norm.scale_factor / norm.std_dev[0];
+  const float a_g = norm.scale_factor / norm.std_dev[1];
+  const float a_b = norm.scale_factor / norm.std_dev[2];
+  const float b_r = -norm.mean[0] / norm.std_dev[0];
+  const float b_g = -norm.mean[1] / norm.std_dev[1];
+  const float b_b = -norm.mean[2] / norm.std_dev[2];
+
+  // Output planes in CHW order: R, G, B.
+  float* r_plane = output + 0 * spatial;
+  float* g_plane = output + 1 * spatial;
+  float* b_plane = output + 2 * spatial;
+
+  // Fast path: contiguous source covering the entire plane (no stride padding,
+  // no letterbox offset, src dims == final dims) -> one run over all pixels.
+  if (src_stride == src_w * in_channels && offset_x == 0 && offset_y == 0 &&
+      src_w == final_w && src_h == final_h) {
+    deinterleave_run(
+        src,
+        static_cast<size_t>(src_w) * src_h,
+        in_channels,
+        r_off,
+        g_off,
+        b_off,
+        r_plane,
+        g_plane,
+        b_plane,
+        a_r,
+        b_r,
+        a_g,
+        b_g,
+        a_b,
+        b_b);
+    return Error::Ok;
+  }
+
+  // Slow path: row by row to honor stride padding and/or a letterbox offset.
+  for (int32_t y = 0; y < src_h; ++y) {
+    const uint8_t* src_row = src + static_cast<size_t>(y) * src_stride;
+    const size_t dst_off =
+        static_cast<size_t>(y + offset_y) * final_w + offset_x;
+    deinterleave_run(
+        src_row,
+        src_w,
+        in_channels,
+        r_off,
+        g_off,
+        b_off,
+        r_plane + dst_off,
+        g_plane + dst_off,
+        b_plane + dst_off,
+        a_r,
+        b_r,
+        a_g,
+        b_g,
+        a_b,
+        b_b);
+  }
+  return Error::Ok;
+}
+
+} // namespace image
+} // namespace extension
+} // namespace executorch
diff --git a/extension/image/image_processor_simd.h b/extension/image/image_processor_simd.h
new file mode 100644
index 00000000000..ad7cd0191e2
--- /dev/null
+++ b/extension/image/image_processor_simd.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+#include <executorch/extension/image/image_processor_config.h>
+#include <executorch/runtime/core/error.h>
+
+namespace executorch {
+namespace extension {
+namespace image {
+
+// SIMD-accelerated image-processing kernels (NEON on ARM, scalar fallback
+// elsewhere), shared by the Apple and portable ImageProcessor backends.
+
+// Deinterleave an 8-bit interleaved image into planar CHW float with a fused
+// per-channel affine normalize:
+//   out = pixel * (scale_factor / std_dev[c]) + (-mean[c] / std_dev[c]).
+// Uses NEON (vld4q_u8 / vld3q_u8 + FMA) on ARM, scalar elsewhere.
+//
+// in_channels is 3 (RGB) or 4 (BGRA/RGBA; the alpha byte is ignored).
+// r_off/g_off/b_off are the byte offsets of R, G, B within a pixel
+// (BGRA -> {2, 1, 0}, RGB/RGBA -> {0, 1, 2}); they also index the deinterleaved
+// channels, so each must be < in_channels. norm.{mean,std_dev} are in RGB
+// order.
+//
+// Writes a src_w x src_h region at (offset_x, offset_y) into the final_w x
+// final_h planes; pixels outside that region are left untouched, so callers
+// that letterbox must pre-fill the padding. src_stride is in bytes.
+runtime::Error deinterleave_to_chw(
+    const uint8_t* src,
+    int32_t src_w,
+    int32_t src_h,
+    int32_t src_stride,
+    int32_t in_channels,
+    int32_t r_off,
+    int32_t g_off,
+    int32_t b_off,
+    float* output,
+    int32_t final_w,
+    int32_t final_h,
+    int32_t offset_x,
+    int32_t offset_y,
+    const Normalization& norm);
+
+} // namespace image
+} // namespace extension
+} // namespace executorch
diff --git a/extension/image/targets.bzl b/extension/image/targets.bzl
index f25e0e6bfe5..c857b8d9b07 100644
--- a/extension/image/targets.bzl
+++ b/extension/image/targets.bzl
@@ -29,7 +29,10 @@ def define_common_targets():
 
         runtime.cxx_library(
             name = "image_processor" + aten_suffix,
-            srcs = ["image_processor_common.cpp"] + select({
+            srcs = [
+                "image_processor_common.cpp",
+                "image_processor_simd.cpp",
+            ] + select({
                 "DEFAULT": ["image_processor.cpp"],
                 "ovr_config//os:iphoneos": [
                     "image_processor_apple.cpp",
@@ -42,6 +45,7 @@ def define_common_targets():
             }),
             headers = [
                 "image_processor_apple_gpu.h",
+                "image_processor_simd.h",
             ],
             exported_headers = [
                 "image_processor.h",