Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions extension/image/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ cmake_minimum_required(VERSION 3.19)
if(APPLE)
enable_language(OBJCXX)
add_library(
extension_image image_processor_common.cpp image_processor_apple.cpp
image_processor_apple_gpu.mm
extension_image image_processor_common.cpp image_processor_simd.cpp
image_processor_apple.cpp image_processor_apple_gpu.mm
)
set_source_files_properties(
image_processor_apple_gpu.mm PROPERTIES COMPILE_FLAGS "-fobjc-arc"
Expand Down Expand Up @@ -39,7 +39,10 @@ else()
)
FetchContent_MakeAvailable(stb)

add_library(extension_image image_processor_common.cpp image_processor.cpp)
add_library(
extension_image image_processor_common.cpp image_processor_simd.cpp
image_processor.cpp
)

# stb_image_resize.h lives under deprecated/ in current stb. Private: only the
# .cpp uses it, not the installed public headers.
Expand Down
37 changes: 18 additions & 19 deletions extension/image/image_processor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
*/

#include <executorch/extension/image/image_processor.h>
#include <executorch/extension/image/image_processor_simd.h>

#include <algorithm>
#include <cstring>
Expand Down Expand Up @@ -420,25 +421,23 @@ Error ImageProcessor::process_into(
InvalidArgument,
"normalization std_dev must be nonzero");
}
// Source (resized RGB) carries input_channels; the output tensor carries
// output_channels. They are equal today, so channels map 1:1; a future
// divergence (e.g. grayscale) would need an explicit channel map here.
for (int32_t y = 0; y < resize_h; ++y) {
for (int32_t x = 0; x < resize_w; ++x) {
const int32_t src_idx = (y * resize_w + x) * input_channels;
const int32_t dst_y = y + offset_y;
const int32_t dst_x = x + offset_x;
for (int32_t c = 0; c < output_channels; ++c) {
const float val =
(resized_buf[src_idx + c] * norm.scale_factor - norm.mean[c]) /
norm.std_dev[c];
const size_t out_idx = static_cast<size_t>(c) * final_w * final_h +
static_cast<size_t>(dst_y) * final_w + dst_x;
output[out_idx] = val;
}
}
}
return Error::Ok;
// Deinterleave + normalize the resized interleaved RGB (R/G/B at byte
// offsets 0/1/2) into the CHW output.
return deinterleave_to_chw(
resized_buf.data(),
resize_w,
resize_h,
resize_w * input_channels,
input_channels,
/*r_off=*/0,
/*g_off=*/1,
/*b_off=*/2,
output,
final_w,
final_h,
offset_x,
offset_y,
norm);
}

Error ImageProcessor::process_yuv_into(
Expand Down
88 changes: 8 additions & 80 deletions extension/image/image_processor_apple.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#include <executorch/extension/image/image_processor.h>
#include <executorch/extension/image/image_processor_apple.h>
#include <executorch/extension/image/image_processor_simd.h>

#include <algorithm>
#include <cstring>
Expand Down Expand Up @@ -391,85 +392,6 @@ size_t compute_scale_temp_size(
return temp_size > 0 ? static_cast<size_t>(temp_size) : 0;
}

// Deinterleave BGRA uint8 → planar RGB float with fused normalization.
// Handles offset for letterbox padding.
//
// Per channel (R, G, B): vDSP_vfltu8 reads the matching byte from BGRA via
// stride=4 and converts uint8→float, then vDSP_vsmsa applies the fused
// affine `out = in * (scale_factor / std_dev) + (-mean / std_dev)` in-place.
Error deinterleave_bgra_to_chw(
const uint8_t* bgra_data,
int32_t src_w,
int32_t src_h,
int32_t src_stride,
float* output,
int32_t final_w,
int32_t final_h,
int32_t offset_x,
int32_t offset_y,
const Normalization& norm) {
const size_t spatial = static_cast<size_t>(final_w) * final_h;

// Per-channel affine coefficients for `out = in * a + b`.
// BGRA byte layout: byte 0 = B, byte 1 = G, byte 2 = R; norm.{mean,std_dev}
// are indexed in RGB order (channel 0 = R, 1 = G, 2 = B).
const float a_r = norm.scale_factor / norm.std_dev[0];
const float a_g = norm.scale_factor / norm.std_dev[1];
const float a_b = norm.scale_factor / norm.std_dev[2];
const float b_r = -norm.mean[0] / norm.std_dev[0];
const float b_g = -norm.mean[1] / norm.std_dev[1];
const float b_b = -norm.mean[2] / norm.std_dev[2];

// When the bias is zero (e.g. zeroToOne / mean=0), a plain scale (vsmul) is
// cheaper than the fused scale+add (vsmsa).
const bool no_offset = (b_r == 0.0f && b_g == 0.0f && b_b == 0.0f);
auto scale_bias =
[no_offset](float* p, const float* a, const float* b, vDSP_Length n) {
if (no_offset) {
vDSP_vsmul(p, 1, a, p, 1, n);
} else {
vDSP_vsmsa(p, 1, a, b, p, 1, n);
}
};

// Output planes in CHW order: R, G, B. Each plane is final_w × final_h
// floats; we write a src_h × src_w region starting at (offset_y, offset_x).
float* r_plane = output + 0 * spatial;
float* g_plane = output + 1 * spatial;
float* b_plane = output + 2 * spatial;

// Fast path: source is contiguous and destination region is the entire
// plane (offsets 0, src dims == final dims).
if (src_stride == src_w * 4 && offset_x == 0 && offset_y == 0 &&
src_w == final_w && src_h == final_h) {
const vDSP_Length n = static_cast<vDSP_Length>(src_w) * src_h;
vDSP_vfltu8(bgra_data + 2, 4, r_plane, 1, n);
scale_bias(r_plane, &a_r, &b_r, n);
vDSP_vfltu8(bgra_data + 1, 4, g_plane, 1, n);
scale_bias(g_plane, &a_g, &b_g, n);
vDSP_vfltu8(bgra_data + 0, 4, b_plane, 1, n);
scale_bias(b_plane, &a_b, &b_b, n);
return Error::Ok;
}

// Slow path: row-by-row to handle stride padding and/or letterbox offsets.
for (int32_t y = 0; y < src_h; ++y) {
const uint8_t* src_row = bgra_data + y * src_stride;
const ptrdiff_t dst_off = (y + offset_y) * final_w + offset_x;
float* r_dst = r_plane + dst_off;
float* g_dst = g_plane + dst_off;
float* b_dst = b_plane + dst_off;
const vDSP_Length n = static_cast<vDSP_Length>(src_w);
vDSP_vfltu8(src_row + 2, 4, r_dst, 1, n);
scale_bias(r_dst, &a_r, &b_r, n);
vDSP_vfltu8(src_row + 1, 4, g_dst, 1, n);
scale_bias(g_dst, &a_g, &b_g, n);
vDSP_vfltu8(src_row + 0, 4, b_dst, 1, n);
scale_bias(b_dst, &a_b, &b_b, n);
}
return Error::Ok;
}

// Rotate an interleaved BGRA (ARGB8888 layout) buffer by `orientation` using
// vImage's SIMD/cache-aware 90-degree rotation, writing a tightly-packed result
// into `scratch`. UP is handled by the caller (no rotation). out_data/out_w/
Expand Down Expand Up @@ -590,11 +512,16 @@ Error normalize_bgra_into(
offset_y = offset.second;
}

return deinterleave_bgra_to_chw(
// BGRA byte layout: B=0, G=1, R=2 (alpha dropped); norm is RGB-indexed.
return deinterleave_to_chw(
bgra_data,
width,
height,
stride,
/*in_channels=*/4,
/*r_off=*/2,
/*g_off=*/1,
/*b_off=*/0,
out,
final_w,
final_h,
Expand Down Expand Up @@ -1380,6 +1307,7 @@ Error process_pixelbuffer_into(

// Allocate a CHW float tensor sized to the configured target and fill it via
// process_pixelbuffer_into.
// cppcheck-suppress unusedFunction
Result<TensorPtr> process_pixelbuffer(
const ImageProcessor& processor,
CVPixelBufferRef pixelBuffer,
Expand Down
186 changes: 186 additions & 0 deletions extension/image/image_processor_simd.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/extension/image/image_processor_simd.h>

#include <cstddef>

#include <executorch/runtime/platform/assert.h>

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#define ET_IMAGE_USE_NEON 1
#else
#define ET_IMAGE_USE_NEON 0
#endif

namespace executorch {
namespace extension {
namespace image {

using runtime::Error;

namespace {

#if ET_IMAGE_USE_NEON
// Widen 16 uint8 -> 4x float32x4, apply out = in * a + b (single-rounding FMA),
// and store the 16 resulting floats.
__attribute__((always_inline)) inline void
widen_fma_store(uint8x16_t ch, float* dst, float32x4_t a, float32x4_t b) {
uint16x8_t lo = vmovl_u8(vget_low_u8(ch));
uint16x8_t hi = vmovl_u8(vget_high_u8(ch));
vst1q_f32(
dst + 0, vfmaq_f32(b, vcvtq_f32_u32(vmovl_u16(vget_low_u16(lo))), a));
vst1q_f32(
dst + 4, vfmaq_f32(b, vcvtq_f32_u32(vmovl_u16(vget_high_u16(lo))), a));
vst1q_f32(
dst + 8, vfmaq_f32(b, vcvtq_f32_u32(vmovl_u16(vget_low_u16(hi))), a));
vst1q_f32(
dst + 12, vfmaq_f32(b, vcvtq_f32_u32(vmovl_u16(vget_high_u16(hi))), a));
}
#endif // ET_IMAGE_USE_NEON

// Deinterleave + normalize one contiguous run of `n` pixels (stride
// in_channels bytes/pixel) into the r/g/b float planes. NEON when available,
// scalar otherwise; the scalar tail also finishes the final (<16) pixels.
void deinterleave_run(
const uint8_t* __restrict src,
size_t n,
int32_t in_channels,
int32_t r_off,
int32_t g_off,
int32_t b_off,
float* __restrict r_out,
float* __restrict g_out,
float* __restrict b_out,
float a_r,
float b_r,
float a_g,
float b_g,
float a_b,
float b_b) {
size_t i = 0;
#if ET_IMAGE_USE_NEON
const float32x4_t va_r = vdupq_n_f32(a_r);
const float32x4_t vb_r = vdupq_n_f32(b_r);
const float32x4_t va_g = vdupq_n_f32(a_g);
const float32x4_t vb_g = vdupq_n_f32(b_g);
const float32x4_t va_b = vdupq_n_f32(a_b);
const float32x4_t vb_b = vdupq_n_f32(b_b);
if (in_channels == 4) {
for (; i + 16 <= n; i += 16) {
uint8x16x4_t px = vld4q_u8(src + i * 4);
widen_fma_store(px.val[r_off], r_out + i, va_r, vb_r);
widen_fma_store(px.val[g_off], g_out + i, va_g, vb_g);
widen_fma_store(px.val[b_off], b_out + i, va_b, vb_b);
}
} else { // in_channels == 3
for (; i + 16 <= n; i += 16) {
uint8x16x3_t px = vld3q_u8(src + i * 3);
widen_fma_store(px.val[r_off], r_out + i, va_r, vb_r);
widen_fma_store(px.val[g_off], g_out + i, va_g, vb_g);
widen_fma_store(px.val[b_off], b_out + i, va_b, vb_b);
}
}
#endif // ET_IMAGE_USE_NEON
for (; i < n; ++i) {
const uint8_t* p = src + i * in_channels;
r_out[i] = static_cast<float>(p[r_off]) * a_r + b_r;
g_out[i] = static_cast<float>(p[g_off]) * a_g + b_g;
b_out[i] = static_cast<float>(p[b_off]) * a_b + b_b;
}
}

} // namespace

Error deinterleave_to_chw(
const uint8_t* src,
int32_t src_w,
int32_t src_h,
int32_t src_stride,
int32_t in_channels,
int32_t r_off,
int32_t g_off,
int32_t b_off,
float* output,
int32_t final_w,
int32_t final_h,
int32_t offset_x,
int32_t offset_y,
const Normalization& norm) {
ET_DCHECK_MSG(
in_channels == 3 || in_channels == 4, "in_channels must be 3 or 4");
ET_DCHECK_MSG(
r_off < in_channels && g_off < in_channels && b_off < in_channels,
"channel offsets must be < in_channels");
const size_t spatial = static_cast<size_t>(final_w) * final_h;

// Per-channel affine coefficients for `out = in * a + b`, in RGB order.
const float a_r = norm.scale_factor / norm.std_dev[0];
const float a_g = norm.scale_factor / norm.std_dev[1];
const float a_b = norm.scale_factor / norm.std_dev[2];
const float b_r = -norm.mean[0] / norm.std_dev[0];
const float b_g = -norm.mean[1] / norm.std_dev[1];
const float b_b = -norm.mean[2] / norm.std_dev[2];

// Output planes in CHW order: R, G, B.
float* r_plane = output + 0 * spatial;
float* g_plane = output + 1 * spatial;
float* b_plane = output + 2 * spatial;

// Fast path: contiguous source covering the entire plane (no stride padding,
// no letterbox offset, src dims == final dims) -> one run over all pixels.
if (src_stride == src_w * in_channels && offset_x == 0 && offset_y == 0 &&
src_w == final_w && src_h == final_h) {
deinterleave_run(
src,
static_cast<size_t>(src_w) * src_h,
in_channels,
r_off,
g_off,
b_off,
r_plane,
g_plane,
b_plane,
a_r,
b_r,
a_g,
b_g,
a_b,
b_b);
return Error::Ok;
}

// Slow path: row by row to honor stride padding and/or a letterbox offset.
for (int32_t y = 0; y < src_h; ++y) {
const uint8_t* src_row = src + static_cast<size_t>(y) * src_stride;
const size_t dst_off =
static_cast<size_t>(y + offset_y) * final_w + offset_x;
deinterleave_run(
src_row,
src_w,
in_channels,
r_off,
g_off,
b_off,
r_plane + dst_off,
g_plane + dst_off,
b_plane + dst_off,
a_r,
b_r,
a_g,
b_g,
a_b,
b_b);
}
return Error::Ok;
}

} // namespace image
} // namespace extension
} // namespace executorch
Loading
Loading