Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions apps/computer-vision/app/object_detection/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ const MODELS: ModelOption<ObjectDetectionModelSources>[] = [
{ label: 'YOLO26M', value: objectDetection.yolo26m() },
{ label: 'YOLO26L', value: objectDetection.yolo26l() },
{ label: 'YOLO26X', value: objectDetection.yolo26x() },
{ label: 'BlazeFace', value: objectDetection.blazeface() },
];
import ErrorBanner from '../../components/ErrorBanner';

Expand Down
3 changes: 3 additions & 0 deletions apps/computer-vision/app/vision_camera/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ type ModelId =
| 'objectDetectionSsdlite'
| 'objectDetectionRfdetr'
| 'objectDetectionYolo26n'
| 'objectDetectionBlazeface'
| 'segmentationDeeplabResnet50'
| 'segmentationDeeplabResnet101'
| 'segmentationDeeplabMobilenet'
Expand Down Expand Up @@ -105,6 +106,7 @@ const TASKS: Task[] = [
{ id: 'objectDetectionSsdlite', label: 'SSDLite MobileNet' },
{ id: 'objectDetectionRfdetr', label: 'RF-DETR Nano' },
{ id: 'objectDetectionYolo26n', label: 'YOLO26N' },
{ id: 'objectDetectionBlazeface', label: 'BlazeFace' },
],
},
{
Expand Down Expand Up @@ -270,6 +272,7 @@ export default function VisionCameraScreen() {
| 'objectDetectionSsdlite'
| 'objectDetectionRfdetr'
| 'objectDetectionYolo26n'
| 'objectDetectionBlazeface'
}
/>
)}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import {
useObjectDetection,
CocoLabel,
CocoLabelYolo,
BlazeFaceLabel,
} from 'react-native-executorch';
import BoundingBoxes from '../../BoundingBoxes';
import { FRAME_TARGET_RESOLUTION, TaskProps } from './types';
Expand All @@ -16,7 +17,8 @@ const objectDetection = models.object_detection;
type ObjModelId =
| 'objectDetectionSsdlite'
| 'objectDetectionRfdetr'
| 'objectDetectionYolo26n';
| 'objectDetectionYolo26n'
| 'objectDetectionBlazeface';

type Props = TaskProps & { activeModel: ObjModelId };

Expand Down Expand Up @@ -44,13 +46,18 @@ export default function ObjectDetectionTask({
model: objectDetection.yolo26n(),
preventLoad: activeModel !== 'objectDetectionYolo26n',
});
const blazeface = useObjectDetection({
model: objectDetection.blazeface(),
preventLoad: activeModel !== 'objectDetectionBlazeface',
});

const active =
activeModel === 'objectDetectionSsdlite'
? ssdlite
: activeModel === 'objectDetectionRfdetr'
? rfdetr
: yolo26n;
const detectors = {
objectDetectionSsdlite: ssdlite,
objectDetectionRfdetr: rfdetr,
objectDetectionYolo26n: yolo26n,
objectDetectionBlazeface: blazeface,
} satisfies Record<ObjModelId, unknown>;
const active = detectors[activeModel];

type CommonDetection = Omit<Detection, 'label'> & { label: string };

Expand Down Expand Up @@ -80,7 +87,8 @@ export default function ObjectDetectionTask({
(p: {
results:
| Detection<typeof CocoLabel>[]
| Detection<typeof CocoLabelYolo>[];
| Detection<typeof CocoLabelYolo>[]
| Detection<typeof BlazeFaceLabel>[];
imageWidth: number;
imageHeight: number;
}) => {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ TensorPtr BaseInstanceSegmentation::buildInputTensor(const cv::Mat &image) {
std::vector<types::Instance> BaseInstanceSegmentation::runInference(
const cv::Mat &image, double confidenceThreshold, double iouThreshold,
int32_t maxInstances, const std::vector<int32_t> &classIndices,
bool returnMaskAtOriginalResolution, const std::string &methodName) {
bool returnMaskAtOriginalResolution, const std::string &methodName,
bool useWeightedNms) {

std::scoped_lock lock(inference_mutex_);

Expand Down Expand Up @@ -86,34 +87,37 @@ std::vector<types::Instance> BaseInstanceSegmentation::runInference(
auto instances = collectInstances(
forwardResult.get(), originalSize, modelInputSize, confidenceThreshold,
classIndices, returnMaskAtOriginalResolution);
return finalizeInstances(std::move(instances), iouThreshold, maxInstances);
return finalizeInstances(std::move(instances), iouThreshold, maxInstances,
useWeightedNms);
}

std::vector<types::Instance> BaseInstanceSegmentation::generateFromString(
std::string imageSource, double confidenceThreshold, double iouThreshold,
int32_t maxInstances, std::vector<int32_t> classIndices,
bool returnMaskAtOriginalResolution, std::string methodName) {
bool returnMaskAtOriginalResolution, std::string methodName,
bool useWeightedNms) {

cv::Mat imageBGR = image_processing::readImage(imageSource);
cv::Mat imageRGB;
cv::cvtColor(imageBGR, imageRGB, cv::COLOR_BGR2RGB);

return runInference(imageRGB, confidenceThreshold, iouThreshold, maxInstances,
classIndices, returnMaskAtOriginalResolution, methodName);
classIndices, returnMaskAtOriginalResolution, methodName,
useWeightedNms);
}

std::vector<types::Instance> BaseInstanceSegmentation::generateFromFrame(
jsi::Runtime &runtime, const jsi::Value &frameData,
double confidenceThreshold, double iouThreshold, int32_t maxInstances,
std::vector<int32_t> classIndices, bool returnMaskAtOriginalResolution,
std::string methodName) {
std::string methodName, bool useWeightedNms) {

auto orient = ::rnexecutorch::utils::readFrameOrientation(runtime, frameData);
cv::Mat frame = extractFromFrame(runtime, frameData);
cv::Mat rotated = utils::rotateFrameForModel(frame, orient);
auto instances =
runInference(rotated, confidenceThreshold, iouThreshold, maxInstances,
classIndices, returnMaskAtOriginalResolution, methodName);
auto instances = runInference(
rotated, confidenceThreshold, iouThreshold, maxInstances, classIndices,
returnMaskAtOriginalResolution, methodName, useWeightedNms);
for (auto &inst : instances) {
utils::inverseRotateBbox(inst.bbox, orient, rotated.size());
// Inverse-rotate the mask to match the screen orientation
Expand All @@ -131,11 +135,13 @@ std::vector<types::Instance> BaseInstanceSegmentation::generateFromFrame(
std::vector<types::Instance> BaseInstanceSegmentation::generateFromPixels(
JSTensorViewIn tensorView, double confidenceThreshold, double iouThreshold,
int32_t maxInstances, std::vector<int32_t> classIndices,
bool returnMaskAtOriginalResolution, std::string methodName) {
bool returnMaskAtOriginalResolution, std::string methodName,
bool useWeightedNms) {

cv::Mat image = extractFromPixels(tensorView);
return runInference(image, confidenceThreshold, iouThreshold, maxInstances,
classIndices, returnMaskAtOriginalResolution, methodName);
classIndices, returnMaskAtOriginalResolution, methodName,
useWeightedNms);
}

std::tuple<utils::computer_vision::BBox, float, int32_t>
Expand Down Expand Up @@ -296,11 +302,14 @@ void BaseInstanceSegmentation::ensureMethodLoaded(

std::vector<types::Instance> BaseInstanceSegmentation::finalizeInstances(
std::vector<types::Instance> instances, double iouThreshold,
int32_t maxInstances) const {
int32_t maxInstances, bool useWeightedNms) const {

if (applyNMS_) {
instances =
utils::computer_vision::nonMaxSuppression(instances, iouThreshold);
instances = useWeightedNms
? utils::computer_vision::weightedNonMaxSuppression(
instances, iouThreshold)
: utils::computer_vision::nonMaxSuppression(instances,
iouThreshold);
}

if (std::cmp_greater(instances.size(), maxInstances)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,30 +28,32 @@ class BaseInstanceSegmentation : public VisionModel {
double iouThreshold, int32_t maxInstances,
std::vector<int32_t> classIndices,
bool returnMaskAtOriginalResolution,
std::string methodName);
std::string methodName, bool useWeightedNms);

[[nodiscard("Registered non-void function")]] std::vector<types::Instance>
generateFromFrame(jsi::Runtime &runtime, const jsi::Value &frameData,
double confidenceThreshold, double iouThreshold,
int32_t maxInstances, std::vector<int32_t> classIndices,
bool returnMaskAtOriginalResolution,
std::string methodName);
bool returnMaskAtOriginalResolution, std::string methodName,
bool useWeightedNms);

[[nodiscard("Registered non-void function")]] std::vector<types::Instance>
generateFromPixels(JSTensorViewIn tensorView, double confidenceThreshold,
double iouThreshold, int32_t maxInstances,
std::vector<int32_t> classIndices,
bool returnMaskAtOriginalResolution,
std::string methodName);
std::string methodName, bool useWeightedNms);

protected:
cv::Size modelInputSize() const override;

private:
std::vector<types::Instance> runInference(
const cv::Mat &image, double confidenceThreshold, double iouThreshold,
int32_t maxInstances, const std::vector<int32_t> &classIndices,
bool returnMaskAtOriginalResolution, const std::string &methodName);
std::vector<types::Instance>
runInference(const cv::Mat &image, double confidenceThreshold,
double iouThreshold, int32_t maxInstances,
const std::vector<int32_t> &classIndices,
bool returnMaskAtOriginalResolution,
const std::string &methodName, bool useWeightedNms);

TensorPtr buildInputTensor(const cv::Mat &image);

Expand Down Expand Up @@ -89,7 +91,7 @@ class BaseInstanceSegmentation : public VisionModel {

std::vector<types::Instance>
finalizeInstances(std::vector<types::Instance> instances, double iouThreshold,
int32_t maxInstances) const;
int32_t maxInstances, bool useWeightedNms) const;

cv::Mat processMaskFromLogits(
const cv::Mat &logitsMat, const utils::computer_vision::BBox &bboxModel,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,16 +82,10 @@ std::set<int32_t> ObjectDetection::prepareAllowedClasses(
return allowedClasses;
}

std::vector<types::Detection>
ObjectDetection::postprocess(const std::vector<EValue> &tensors,
cv::Size originalSize, double detectionThreshold,
double iouThreshold,
const std::vector<int32_t> &classIndices) {
const cv::Size inputSize = modelInputSize();
float widthRatio = static_cast<float>(originalSize.width) / inputSize.width;
float heightRatio =
static_cast<float>(originalSize.height) / inputSize.height;

std::vector<types::Detection> ObjectDetection::postprocess(
const std::vector<EValue> &tensors, const BoxTransform &transform,
double detectionThreshold, double iouThreshold,
const std::vector<int32_t> &classIndices, bool useWeightedNms) {
// Prepare allowed classes set for filtering
auto allowedClasses = prepareAllowedClasses(classIndices);

Expand Down Expand Up @@ -124,10 +118,13 @@ ObjectDetection::postprocess(const std::vector<EValue> &tensors,
continue;
}

float x1 = bboxes[i * 4] * widthRatio;
float y1 = bboxes[i * 4 + 1] * heightRatio;
float x2 = bboxes[i * 4 + 2] * widthRatio;
float y2 = bboxes[i * 4 + 3] * heightRatio;
// Map model-input pixel coords back to source-image coords. The same
// affine `x_src = x_model * scale + offset` works for stretch and
// letterbox preprocessing — offsets are zero in the stretch case.
float x1 = bboxes[i * 4] * transform.scaleX + transform.offsetX;
float y1 = bboxes[i * 4 + 1] * transform.scaleY + transform.offsetY;
float x2 = bboxes[i * 4 + 2] * transform.scaleX + transform.offsetX;
float y2 = bboxes[i * 4 + 3] * transform.scaleY + transform.offsetY;

if (std::cmp_greater_equal(labelIdx, labelNames_.size())) {
throw RnExecutorchError(
Expand All @@ -140,12 +137,17 @@ ObjectDetection::postprocess(const std::vector<EValue> &tensors,
labelNames_[labelIdx], labelIdx, scores[i]);
}

return utils::computer_vision::nonMaxSuppression(detections, iouThreshold);
return useWeightedNms
? utils::computer_vision::weightedNonMaxSuppression(detections,
iouThreshold)
: utils::computer_vision::nonMaxSuppression(detections,
iouThreshold);
}

std::vector<types::Detection> ObjectDetection::runInference(
cv::Mat image, double detectionThreshold, double iouThreshold,
const std::vector<int32_t> &classIndices, const std::string &methodName) {
const std::vector<int32_t> &classIndices, const std::string &methodName,
bool useWeightedNms, bool useLetterbox) {
if (detectionThreshold < 0.0 || detectionThreshold > 1.0) {
throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
"detectionThreshold must be in range [0, 1]");
Expand All @@ -171,7 +173,38 @@ std::vector<types::Detection> ObjectDetection::runInference(
}
modelInputShape_ = inputShapes[0];

cv::Mat preprocessed = preprocess(image);
const cv::Size inputSize = modelInputSize();
cv::Mat preprocessed;
BoxTransform transform;
if (useLetterbox) {
// Aspect-preserving fit + center-pad with black bars. Models trained on
// natural-aspect crops (BlazeFace) need this — plain cv::resize stretches
// the face and shifts where anchors fire.
const float fitScale =
std::min(static_cast<float>(inputSize.width) / originalSize.width,
static_cast<float>(inputSize.height) / originalSize.height);
const int newW =
static_cast<int>(std::round(originalSize.width * fitScale));
const int newH =
static_cast<int>(std::round(originalSize.height * fitScale));
const int padX = (inputSize.width - newW) / 2;
const int padY = (inputSize.height - newH) / 2;

cv::Mat resized;
cv::resize(image, resized, cv::Size(newW, newH), 0, 0, cv::INTER_AREA);
cv::copyMakeBorder(resized, preprocessed, padY,
inputSize.height - newH - padY, padX,
inputSize.width - newW - padX, cv::BORDER_CONSTANT,
cv::Scalar(0, 0, 0));

const float inv = 1.0f / fitScale;
transform = {inv, inv, -padX * inv, -padY * inv};
} else {
preprocessed = preprocess(image);
transform = {static_cast<float>(originalSize.width) / inputSize.width,
static_cast<float>(originalSize.height) / inputSize.height,
0.0f, 0.0f};
}

auto inputTensor =
(normMean_ && normStd_)
Expand All @@ -188,31 +221,34 @@ std::vector<types::Detection> ObjectDetection::runInference(
"Ensure the model input is correct.");
}

return postprocess(executeResult.get(), originalSize, detectionThreshold,
iouThreshold, classIndices);
return postprocess(executeResult.get(), transform, detectionThreshold,
iouThreshold, classIndices, useWeightedNms);
}

std::vector<types::Detection> ObjectDetection::generateFromString(
std::string imageSource, double detectionThreshold, double iouThreshold,
std::vector<int32_t> classIndices, std::string methodName) {
std::vector<int32_t> classIndices, std::string methodName,
bool useWeightedNms, bool useLetterbox) {
cv::Mat imageBGR = image_processing::readImage(imageSource);

cv::Mat imageRGB;
cv::cvtColor(imageBGR, imageRGB, cv::COLOR_BGR2RGB);

return runInference(imageRGB, detectionThreshold, iouThreshold, classIndices,
methodName);
methodName, useWeightedNms, useLetterbox);
}

std::vector<types::Detection> ObjectDetection::generateFromFrame(
jsi::Runtime &runtime, const jsi::Value &frameData,
double detectionThreshold, double iouThreshold,
std::vector<int32_t> classIndices, std::string methodName) {
std::vector<int32_t> classIndices, std::string methodName,
bool useWeightedNms, bool useLetterbox) {
auto orient = ::rnexecutorch::utils::readFrameOrientation(runtime, frameData);
cv::Mat frame = extractFromFrame(runtime, frameData);
cv::Mat rotated = ::rnexecutorch::utils::rotateFrameForModel(frame, orient);
auto detections = runInference(rotated, detectionThreshold, iouThreshold,
classIndices, methodName);
auto detections =
runInference(rotated, detectionThreshold, iouThreshold, classIndices,
methodName, useWeightedNms, useLetterbox);

for (auto &det : detections) {
::rnexecutorch::utils::inverseRotateBbox(det.bbox, orient, rotated.size());
Expand All @@ -222,10 +258,11 @@ std::vector<types::Detection> ObjectDetection::generateFromFrame(

std::vector<types::Detection> ObjectDetection::generateFromPixels(
JSTensorViewIn pixelData, double detectionThreshold, double iouThreshold,
std::vector<int32_t> classIndices, std::string methodName) {
std::vector<int32_t> classIndices, std::string methodName,
bool useWeightedNms, bool useLetterbox) {
cv::Mat image = extractFromPixels(pixelData);

return runInference(image, detectionThreshold, iouThreshold, classIndices,
methodName);
methodName, useWeightedNms, useLetterbox);
}
} // namespace rnexecutorch::models::object_detection
Loading
Loading