diff --git a/extension/image/CMakeLists.txt b/extension/image/CMakeLists.txt index 7525fe7de44..0c233ffc796 100644 --- a/extension/image/CMakeLists.txt +++ b/extension/image/CMakeLists.txt @@ -9,8 +9,8 @@ cmake_minimum_required(VERSION 3.19) if(APPLE) enable_language(OBJCXX) add_library( - extension_image image_processor_common.cpp image_processor_apple.cpp - image_processor_apple_gpu.mm + extension_image image_processor_common.cpp image_processor_simd.cpp + image_processor_apple.cpp image_processor_apple_gpu.mm ) set_source_files_properties( image_processor_apple_gpu.mm PROPERTIES COMPILE_FLAGS "-fobjc-arc" @@ -39,7 +39,10 @@ else() ) FetchContent_MakeAvailable(stb) - add_library(extension_image image_processor_common.cpp image_processor.cpp) + add_library( + extension_image image_processor_common.cpp image_processor_simd.cpp + image_processor.cpp + ) # stb_image_resize.h lives under deprecated/ in current stb. Private: only the # .cpp uses it, not the installed public headers. diff --git a/extension/image/image_processor.cpp b/extension/image/image_processor.cpp index 0f1b8f4f7de..4605f8004c0 100644 --- a/extension/image/image_processor.cpp +++ b/extension/image/image_processor.cpp @@ -7,6 +7,7 @@ */ #include +#include #include #include @@ -420,25 +421,23 @@ Error ImageProcessor::process_into( InvalidArgument, "normalization std_dev must be nonzero"); } - // Source (resized RGB) carries input_channels; the output tensor carries - // output_channels. They are equal today, so channels map 1:1; a future - // divergence (e.g. grayscale) would need an explicit channel map here. - for (int32_t y = 0; y < resize_h; ++y) { - for (int32_t x = 0; x < resize_w; ++x) { - const int32_t src_idx = (y * resize_w + x) * input_channels; - const int32_t dst_y = y + offset_y; - const int32_t dst_x = x + offset_x; - for (int32_t c = 0; c < output_channels; ++c) { - const float val = - (resized_buf[src_idx + c] * norm.scale_factor - norm.mean[c]) / - norm.std_dev[c]; - const size_t out_idx = static_cast(c) * final_w * final_h + - static_cast(dst_y) * final_w + dst_x; - output[out_idx] = val; - } - } - } - return Error::Ok; + // Deinterleave + normalize the resized interleaved RGB (R/G/B at byte + // offsets 0/1/2) into the CHW output. + return deinterleave_to_chw( + resized_buf.data(), + resize_w, + resize_h, + resize_w * input_channels, + input_channels, + /*r_off=*/0, + /*g_off=*/1, + /*b_off=*/2, + output, + final_w, + final_h, + offset_x, + offset_y, + norm); } Error ImageProcessor::process_yuv_into( diff --git a/extension/image/image_processor_apple.cpp b/extension/image/image_processor_apple.cpp index 44e6d2c083e..04c599ab0ff 100644 --- a/extension/image/image_processor_apple.cpp +++ b/extension/image/image_processor_apple.cpp @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -391,85 +392,6 @@ size_t compute_scale_temp_size( return temp_size > 0 ? static_cast(temp_size) : 0; } -// Deinterleave BGRA uint8 → planar RGB float with fused normalization. -// Handles offset for letterbox padding. -// -// Per channel (R, G, B): vDSP_vfltu8 reads the matching byte from BGRA via -// stride=4 and converts uint8→float, then vDSP_vsmsa applies the fused -// affine `out = in * (scale_factor / std_dev) + (-mean / std_dev)` in-place. -Error deinterleave_bgra_to_chw( - const uint8_t* bgra_data, - int32_t src_w, - int32_t src_h, - int32_t src_stride, - float* output, - int32_t final_w, - int32_t final_h, - int32_t offset_x, - int32_t offset_y, - const Normalization& norm) { - const size_t spatial = static_cast(final_w) * final_h; - - // Per-channel affine coefficients for `out = in * a + b`. - // BGRA byte layout: byte 0 = B, byte 1 = G, byte 2 = R; norm.{mean,std_dev} - // are indexed in RGB order (channel 0 = R, 1 = G, 2 = B). - const float a_r = norm.scale_factor / norm.std_dev[0]; - const float a_g = norm.scale_factor / norm.std_dev[1]; - const float a_b = norm.scale_factor / norm.std_dev[2]; - const float b_r = -norm.mean[0] / norm.std_dev[0]; - const float b_g = -norm.mean[1] / norm.std_dev[1]; - const float b_b = -norm.mean[2] / norm.std_dev[2]; - - // When the bias is zero (e.g. zeroToOne / mean=0), a plain scale (vsmul) is - // cheaper than the fused scale+add (vsmsa). - const bool no_offset = (b_r == 0.0f && b_g == 0.0f && b_b == 0.0f); - auto scale_bias = - [no_offset](float* p, const float* a, const float* b, vDSP_Length n) { - if (no_offset) { - vDSP_vsmul(p, 1, a, p, 1, n); - } else { - vDSP_vsmsa(p, 1, a, b, p, 1, n); - } - }; - - // Output planes in CHW order: R, G, B. Each plane is final_w × final_h - // floats; we write a src_h × src_w region starting at (offset_y, offset_x). - float* r_plane = output + 0 * spatial; - float* g_plane = output + 1 * spatial; - float* b_plane = output + 2 * spatial; - - // Fast path: source is contiguous and destination region is the entire - // plane (offsets 0, src dims == final dims). - if (src_stride == src_w * 4 && offset_x == 0 && offset_y == 0 && - src_w == final_w && src_h == final_h) { - const vDSP_Length n = static_cast(src_w) * src_h; - vDSP_vfltu8(bgra_data + 2, 4, r_plane, 1, n); - scale_bias(r_plane, &a_r, &b_r, n); - vDSP_vfltu8(bgra_data + 1, 4, g_plane, 1, n); - scale_bias(g_plane, &a_g, &b_g, n); - vDSP_vfltu8(bgra_data + 0, 4, b_plane, 1, n); - scale_bias(b_plane, &a_b, &b_b, n); - return Error::Ok; - } - - // Slow path: row-by-row to handle stride padding and/or letterbox offsets. - for (int32_t y = 0; y < src_h; ++y) { - const uint8_t* src_row = bgra_data + y * src_stride; - const ptrdiff_t dst_off = (y + offset_y) * final_w + offset_x; - float* r_dst = r_plane + dst_off; - float* g_dst = g_plane + dst_off; - float* b_dst = b_plane + dst_off; - const vDSP_Length n = static_cast(src_w); - vDSP_vfltu8(src_row + 2, 4, r_dst, 1, n); - scale_bias(r_dst, &a_r, &b_r, n); - vDSP_vfltu8(src_row + 1, 4, g_dst, 1, n); - scale_bias(g_dst, &a_g, &b_g, n); - vDSP_vfltu8(src_row + 0, 4, b_dst, 1, n); - scale_bias(b_dst, &a_b, &b_b, n); - } - return Error::Ok; -} - // Rotate an interleaved BGRA (ARGB8888 layout) buffer by `orientation` using // vImage's SIMD/cache-aware 90-degree rotation, writing a tightly-packed result // into `scratch`. UP is handled by the caller (no rotation). out_data/out_w/ @@ -590,11 +512,16 @@ Error normalize_bgra_into( offset_y = offset.second; } - return deinterleave_bgra_to_chw( + // BGRA byte layout: B=0, G=1, R=2 (alpha dropped); norm is RGB-indexed. + return deinterleave_to_chw( bgra_data, width, height, stride, + /*in_channels=*/4, + /*r_off=*/2, + /*g_off=*/1, + /*b_off=*/0, out, final_w, final_h, @@ -1380,6 +1307,7 @@ Error process_pixelbuffer_into( // Allocate a CHW float tensor sized to the configured target and fill it via // process_pixelbuffer_into. +// cppcheck-suppress unusedFunction Result process_pixelbuffer( const ImageProcessor& processor, CVPixelBufferRef pixelBuffer, diff --git a/extension/image/image_processor_simd.cpp b/extension/image/image_processor_simd.cpp new file mode 100644 index 00000000000..57b1cc32e08 --- /dev/null +++ b/extension/image/image_processor_simd.cpp @@ -0,0 +1,186 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include + +#if defined(__ARM_NEON) || defined(__ARM_NEON__) +#include +#define ET_IMAGE_USE_NEON 1 +#else +#define ET_IMAGE_USE_NEON 0 +#endif + +namespace executorch { +namespace extension { +namespace image { + +using runtime::Error; + +namespace { + +#if ET_IMAGE_USE_NEON +// Widen 16 uint8 -> 4x float32x4, apply out = in * a + b (single-rounding FMA), +// and store the 16 resulting floats. +__attribute__((always_inline)) inline void +widen_fma_store(uint8x16_t ch, float* dst, float32x4_t a, float32x4_t b) { + uint16x8_t lo = vmovl_u8(vget_low_u8(ch)); + uint16x8_t hi = vmovl_u8(vget_high_u8(ch)); + vst1q_f32( + dst + 0, vfmaq_f32(b, vcvtq_f32_u32(vmovl_u16(vget_low_u16(lo))), a)); + vst1q_f32( + dst + 4, vfmaq_f32(b, vcvtq_f32_u32(vmovl_u16(vget_high_u16(lo))), a)); + vst1q_f32( + dst + 8, vfmaq_f32(b, vcvtq_f32_u32(vmovl_u16(vget_low_u16(hi))), a)); + vst1q_f32( + dst + 12, vfmaq_f32(b, vcvtq_f32_u32(vmovl_u16(vget_high_u16(hi))), a)); +} +#endif // ET_IMAGE_USE_NEON + +// Deinterleave + normalize one contiguous run of `n` pixels (stride +// in_channels bytes/pixel) into the r/g/b float planes. NEON when available, +// scalar otherwise; the scalar tail also finishes the final (<16) pixels. +void deinterleave_run( + const uint8_t* __restrict src, + size_t n, + int32_t in_channels, + int32_t r_off, + int32_t g_off, + int32_t b_off, + float* __restrict r_out, + float* __restrict g_out, + float* __restrict b_out, + float a_r, + float b_r, + float a_g, + float b_g, + float a_b, + float b_b) { + size_t i = 0; +#if ET_IMAGE_USE_NEON + const float32x4_t va_r = vdupq_n_f32(a_r); + const float32x4_t vb_r = vdupq_n_f32(b_r); + const float32x4_t va_g = vdupq_n_f32(a_g); + const float32x4_t vb_g = vdupq_n_f32(b_g); + const float32x4_t va_b = vdupq_n_f32(a_b); + const float32x4_t vb_b = vdupq_n_f32(b_b); + if (in_channels == 4) { + for (; i + 16 <= n; i += 16) { + uint8x16x4_t px = vld4q_u8(src + i * 4); + widen_fma_store(px.val[r_off], r_out + i, va_r, vb_r); + widen_fma_store(px.val[g_off], g_out + i, va_g, vb_g); + widen_fma_store(px.val[b_off], b_out + i, va_b, vb_b); + } + } else { // in_channels == 3 + for (; i + 16 <= n; i += 16) { + uint8x16x3_t px = vld3q_u8(src + i * 3); + widen_fma_store(px.val[r_off], r_out + i, va_r, vb_r); + widen_fma_store(px.val[g_off], g_out + i, va_g, vb_g); + widen_fma_store(px.val[b_off], b_out + i, va_b, vb_b); + } + } +#endif // ET_IMAGE_USE_NEON + for (; i < n; ++i) { + const uint8_t* p = src + i * in_channels; + r_out[i] = static_cast(p[r_off]) * a_r + b_r; + g_out[i] = static_cast(p[g_off]) * a_g + b_g; + b_out[i] = static_cast(p[b_off]) * a_b + b_b; + } +} + +} // namespace + +Error deinterleave_to_chw( + const uint8_t* src, + int32_t src_w, + int32_t src_h, + int32_t src_stride, + int32_t in_channels, + int32_t r_off, + int32_t g_off, + int32_t b_off, + float* output, + int32_t final_w, + int32_t final_h, + int32_t offset_x, + int32_t offset_y, + const Normalization& norm) { + ET_DCHECK_MSG( + in_channels == 3 || in_channels == 4, "in_channels must be 3 or 4"); + ET_DCHECK_MSG( + r_off < in_channels && g_off < in_channels && b_off < in_channels, + "channel offsets must be < in_channels"); + const size_t spatial = static_cast(final_w) * final_h; + + // Per-channel affine coefficients for `out = in * a + b`, in RGB order. + const float a_r = norm.scale_factor / norm.std_dev[0]; + const float a_g = norm.scale_factor / norm.std_dev[1]; + const float a_b = norm.scale_factor / norm.std_dev[2]; + const float b_r = -norm.mean[0] / norm.std_dev[0]; + const float b_g = -norm.mean[1] / norm.std_dev[1]; + const float b_b = -norm.mean[2] / norm.std_dev[2]; + + // Output planes in CHW order: R, G, B. + float* r_plane = output + 0 * spatial; + float* g_plane = output + 1 * spatial; + float* b_plane = output + 2 * spatial; + + // Fast path: contiguous source covering the entire plane (no stride padding, + // no letterbox offset, src dims == final dims) -> one run over all pixels. + if (src_stride == src_w * in_channels && offset_x == 0 && offset_y == 0 && + src_w == final_w && src_h == final_h) { + deinterleave_run( + src, + static_cast(src_w) * src_h, + in_channels, + r_off, + g_off, + b_off, + r_plane, + g_plane, + b_plane, + a_r, + b_r, + a_g, + b_g, + a_b, + b_b); + return Error::Ok; + } + + // Slow path: row by row to honor stride padding and/or a letterbox offset. + for (int32_t y = 0; y < src_h; ++y) { + const uint8_t* src_row = src + static_cast(y) * src_stride; + const size_t dst_off = + static_cast(y + offset_y) * final_w + offset_x; + deinterleave_run( + src_row, + src_w, + in_channels, + r_off, + g_off, + b_off, + r_plane + dst_off, + g_plane + dst_off, + b_plane + dst_off, + a_r, + b_r, + a_g, + b_g, + a_b, + b_b); + } + return Error::Ok; +} + +} // namespace image +} // namespace extension +} // namespace executorch diff --git a/extension/image/image_processor_simd.h b/extension/image/image_processor_simd.h new file mode 100644 index 00000000000..ad7cd0191e2 --- /dev/null +++ b/extension/image/image_processor_simd.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include +#include + +namespace executorch { +namespace extension { +namespace image { + +// SIMD-accelerated image-processing kernels (NEON on ARM, scalar fallback +// elsewhere), shared by the Apple and portable ImageProcessor backends. + +// Deinterleave an 8-bit interleaved image into planar CHW float with a fused +// per-channel affine normalize: +// out = pixel * (scale_factor / std_dev[c]) + (-mean[c] / std_dev[c]). +// Uses NEON (vld4q_u8 / vld3q_u8 + FMA) on ARM, scalar elsewhere. +// +// in_channels is 3 (RGB) or 4 (BGRA/RGBA; the alpha byte is ignored). +// r_off/g_off/b_off are the byte offsets of R, G, B within a pixel +// (BGRA -> {2, 1, 0}, RGB/RGBA -> {0, 1, 2}); they also index the deinterleaved +// channels, so each must be < in_channels. norm.{mean,std_dev} are in RGB +// order. +// +// Writes a src_w x src_h region at (offset_x, offset_y) into the final_w x +// final_h planes; pixels outside that region are left untouched, so callers +// that letterbox must pre-fill the padding. src_stride is in bytes. +runtime::Error deinterleave_to_chw( + const uint8_t* src, + int32_t src_w, + int32_t src_h, + int32_t src_stride, + int32_t in_channels, + int32_t r_off, + int32_t g_off, + int32_t b_off, + float* output, + int32_t final_w, + int32_t final_h, + int32_t offset_x, + int32_t offset_y, + const Normalization& norm); + +} // namespace image +} // namespace extension +} // namespace executorch diff --git a/extension/image/targets.bzl b/extension/image/targets.bzl index f25e0e6bfe5..c857b8d9b07 100644 --- a/extension/image/targets.bzl +++ b/extension/image/targets.bzl @@ -29,7 +29,10 @@ def define_common_targets(): runtime.cxx_library( name = "image_processor" + aten_suffix, - srcs = ["image_processor_common.cpp"] + select({ + srcs = [ + "image_processor_common.cpp", + "image_processor_simd.cpp", + ] + select({ "DEFAULT": ["image_processor.cpp"], "ovr_config//os:iphoneos": [ "image_processor_apple.cpp", @@ -42,6 +45,7 @@ def define_common_targets(): }), headers = [ "image_processor_apple_gpu.h", + "image_processor_simd.h", ], exported_headers = [ "image_processor.h",