From 4232bcd3e4480337c8e25de087ab3b5f0af9531d Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 28 Jan 2026 14:58:24 -0500 Subject: [PATCH 1/6] Avoid branching in casting implementations --- include/core/detail/casting.hpp | 90 +++++++++++++++++++++------------ src/op_composite.cpp | 2 +- 2 files changed, 58 insertions(+), 34 deletions(-) diff --git a/include/core/detail/casting.hpp b/include/core/detail/casting.hpp index ea21b0bc..bc6bedf9 100644 --- a/include/core/detail/casting.hpp +++ b/include/core/detail/casting.hpp @@ -36,36 +36,48 @@ __device__ __host__ T ScalarSaturateCast(U v) { constexpr bool bigToSmall = !smallToBig; if constexpr (std::is_integral_v && std::is_floating_point_v) { - // Any float -> any integral - return static_cast(std::clamp(std::round(v), static_cast(std::numeric_limits::min()), - static_cast(std::numeric_limits::max()))); - } else if constexpr (std::is_integral_v && std::is_integral_v && std::is_signed_v && std::is_signed_v && - smallToBig) { - // Any integral signed -> Any integral unsigned, small -> big or equal - return v <= 0 ? 0 : static_cast(v); - } else if constexpr (std::is_integral_v && std::is_integral_v && - ((std::is_signed_v && std::is_signed_v) || - (std::is_unsigned_v && std::is_unsigned_v)) && - bigToSmall) { - // Any integral signed -> Any integral signed, big -> small - // Any integral unsigned -> Any integral unsigned, big -> small - return v <= std::numeric_limits::min() - ? std::numeric_limits::min() - : (v >= std::numeric_limits::max() ? std::numeric_limits::max() : static_cast(v)); - } else if constexpr (std::is_integral_v && std::is_unsigned_v && std::is_integral_v && - std::is_signed_v) { - // Any integral unsigned -> Any integral signed, small -> big or equal - return v >= std::numeric_limits::max() ? std::numeric_limits::max() : static_cast(v); - } else if constexpr (std::is_integral_v && std::is_signed_v && std::is_integral_v && - std::is_unsigned_v && bigToSmall) { - // Any integral signed -> Any integral unsigned, big -> small - return v <= static_cast(std::numeric_limits::min()) - ? std::numeric_limits::min() - : (v >= static_cast(std::numeric_limits::max()) ? std::numeric_limits::max() - : static_cast(v)); - } else { - // All other cases fall into this - return v; + // Float -> integral: clamp then round + constexpr U minVal = static_cast(std::numeric_limits::min()); + constexpr U maxVal = static_cast(std::numeric_limits::max()); +#ifdef __HIP_DEVICE_COMPILE__ + return static_cast(rintf(fminf(fmaxf(v, minVal), maxVal))); +#else + return static_cast(std::round(std::clamp(v, minVal, maxVal))); +#endif + } + + else if constexpr (std::is_integral_v && std::is_integral_v && std::is_signed_v && std::is_unsigned_v && + smallToBig) { + // Signed -> unsigned, small to big: clamp negative to 0 + // Branchless: max(v, 0) handles negative values + return static_cast(max(v, U{0})); + } + + else if constexpr (std::is_integral_v && std::is_integral_v && + ((std::is_signed_v && std::is_signed_v) || + (std::is_unsigned_v && std::is_unsigned_v)) && + bigToSmall) { + // Same signedness, big -> small: clamp to [min, max] + constexpr U minVal = static_cast(std::numeric_limits::min()); + constexpr U maxVal = static_cast(std::numeric_limits::max()); + return static_cast(min(max(v, minVal), maxVal)); + } + + else if constexpr (std::is_integral_v && std::is_unsigned_v && std::is_integral_v && std::is_signed_v) { + // Unsigned -> signed: clamp to max (can't exceed min since unsigned) + constexpr U maxVal = static_cast(std::numeric_limits::max()); + return static_cast(min(v, maxVal)); + } + + else if constexpr (std::is_integral_v && std::is_signed_v && std::is_integral_v && std::is_unsigned_v && + bigToSmall) { + // Signed -> unsigned, big -> small: clamp to [0, max] + constexpr U maxVal = static_cast(std::numeric_limits::max()); + return static_cast(min(max(v, U{0}), maxVal)); + } + + else { + return static_cast(v); } } @@ -117,9 +129,21 @@ __device__ __host__ T ScalarRangeCast(U v) { else if constexpr (std::is_integral_v && std::is_floating_point_v && std::is_unsigned_v) { // float to unsigned integers - return v >= T{1} ? std::numeric_limits::max() - : v <= T{0} ? 0 - : static_cast(lrintf(static_cast(std::numeric_limits::max()) * v)); + constexpr U scale = static_cast(std::numeric_limits::max()); + + if constexpr (sizeof(T) <= 2) { + // 8/16 bit integer cases. These can be represented exactly in floating point. +#ifdef __HIP_DEVICE_COMPILE__ + return static_cast(__float2int_rn(__saturatef(v) * scale)); +#else + return static_cast(lrintf(fminf(fmaxf(v, 0.0f), 1.0f) * scale)); +#endif + } else { + // 32/64 bit integer cases. + return v >= U{1} ? std::numeric_limits::max() + : v <= U{-1} ? std::numeric_limits::min() + : static_cast(std::round(v * scale)); + } } else if constexpr (std::is_floating_point_v && std::is_integral_v && std::is_signed_v) { diff --git a/src/op_composite.cpp b/src/op_composite.cpp index 6d49a156..fedc7a9b 100644 --- a/src/op_composite.cpp +++ b/src/op_composite.cpp @@ -40,7 +40,7 @@ void dispatch_composite_masktype(hipStream_t stream, const Tensor& foreground, c switch (device) { case eDeviceType::GPU: { - dim3 block(64, 16); + dim3 block(32, 8); dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y, outputWrapper.batches()); Kernels::Device::composite<<>>(fgWrapper, bgWrapper, maskWrapper, outputWrapper); From 77cabc7a032143d0a13d08550228ef675d69ec03 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Tue, 3 Feb 2026 12:22:49 -0500 Subject: [PATCH 2/6] Add more tests for Saturate cast --- .../tests/core/detail/test_saturate_cast.cpp | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 tests/roccv/cpp/src/tests/core/detail/test_saturate_cast.cpp diff --git a/tests/roccv/cpp/src/tests/core/detail/test_saturate_cast.cpp b/tests/roccv/cpp/src/tests/core/detail/test_saturate_cast.cpp new file mode 100644 index 00000000..015265c5 --- /dev/null +++ b/tests/roccv/cpp/src/tests/core/detail/test_saturate_cast.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include + +#include "test_helpers.hpp" + +using namespace roccv::detail; +using namespace roccv::tests; +using namespace roccv; + +int main(int argc, char **argv) { + TEST_CASES_BEGIN(); + + EXPECT_EQ(SaturateCast(1.0f), 1); + EXPECT_EQ(SaturateCast(-1.0f), -1); + EXPECT_EQ(SaturateCast(1.0f), 1); + EXPECT_EQ(SaturateCast(-1.0f), 0); + EXPECT_EQ(SaturateCast(1), 1.0f); + EXPECT_EQ(SaturateCast(-1), -1.0f); + EXPECT_EQ(SaturateCast(1), 1.0); + EXPECT_EQ(SaturateCast(-1), -1.0); + + // Test numeric limits + EXPECT_EQ(SaturateCast(std::numeric_limits::max()), std::numeric_limits::max()); + EXPECT_EQ(SaturateCast(std::numeric_limits::max()), std::numeric_limits::max()); + + // Test vectorized types + EXPECT_TRUE((SaturateCast(uchar4{255, 128, 0, 255}) == float4{255.0f, 128.0f, 0.0f, 255.0f})); + EXPECT_TRUE((SaturateCast(char4{-128, -128, -128, -128}) == float4{-128.0f, -128.0f, -128.0f, -128.0f})); + + TEST_CASES_END(); +} \ No newline at end of file From d887102cacc81425561b102d364c9646d5e5d5b2 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Fri, 6 Feb 2026 15:08:08 -0500 Subject: [PATCH 3/6] Fix issues with float -> integer saturate casts --- include/core/detail/casting.hpp | 25 ++++++++++++----- include/core/detail/type_traits.hpp | 3 +++ .../tests/core/detail/test_saturate_cast.cpp | 27 ++++++++++--------- 3 files changed, 37 insertions(+), 18 deletions(-) diff --git a/include/core/detail/casting.hpp b/include/core/detail/casting.hpp index bc6bedf9..d9c8ac14 100644 --- a/include/core/detail/casting.hpp +++ b/include/core/detail/casting.hpp @@ -21,8 +21,6 @@ #pragma once -#include - #include "core/detail/type_traits.hpp" namespace roccv::detail { @@ -36,14 +34,29 @@ __device__ __host__ T ScalarSaturateCast(U v) { constexpr bool bigToSmall = !smallToBig; if constexpr (std::is_integral_v && std::is_floating_point_v) { - // Float -> integral: clamp then round - constexpr U minVal = static_cast(std::numeric_limits::min()); + // Float -> integral: clamp to [min, max] then round. + constexpr U minVal = static_cast(std::numeric_limits::lowest()); constexpr U maxVal = static_cast(std::numeric_limits::max()); + + if constexpr (sizeof(T) <= 2) { + // 8/16 bit integer cases. These can be represented exactly in floating point. +#ifdef __HIP_DEVICE_COMPILE__ + return static_cast(rintf(fminf(fmaxf(v, minVal), maxVal))); +#else + return static_cast(std::round(std::clamp(v, minVal, maxVal))); +#endif + } else { + // 32/64 bit integer cases. #ifdef __HIP_DEVICE_COMPILE__ - return static_cast(rintf(fminf(fmaxf(v, minVal), maxVal))); + U rounded = rintf(v); #else - return static_cast(std::round(std::clamp(v, minVal, maxVal))); + U rounded = std::round(v); #endif + + return rounded >= maxVal ? std::numeric_limits::max() + : rounded <= minVal ? std::numeric_limits::min() + : static_cast(rounded); + } } else if constexpr (std::is_integral_v && std::is_integral_v && std::is_signed_v && std::is_unsigned_v && diff --git a/include/core/detail/type_traits.hpp b/include/core/detail/type_traits.hpp index dcf77eb0..32f14d58 100644 --- a/include/core/detail/type_traits.hpp +++ b/include/core/detail/type_traits.hpp @@ -20,6 +20,7 @@ */ #include + #include #pragma once @@ -83,6 +84,8 @@ DEFINE_TYPE_TRAITS_0_TO_4(int, signed int); DEFINE_TYPE_TRAITS_0_TO_4(short, signed short); DEFINE_TYPE_TRAITS_0_TO_4(ushort, unsigned short); DEFINE_TYPE_TRAITS_0_TO_4(double, double); +DEFINE_TYPE_TRAITS_0_TO_4(long, signed long); +DEFINE_TYPE_TRAITS_0_TO_4(ulong, unsigned long); /** * @brief Returns the number of elements in a HIP vectorized type. For example: uchar3 will return 3, int2 will diff --git a/tests/roccv/cpp/src/tests/core/detail/test_saturate_cast.cpp b/tests/roccv/cpp/src/tests/core/detail/test_saturate_cast.cpp index 015265c5..84be5ee2 100644 --- a/tests/roccv/cpp/src/tests/core/detail/test_saturate_cast.cpp +++ b/tests/roccv/cpp/src/tests/core/detail/test_saturate_cast.cpp @@ -30,22 +30,25 @@ using namespace roccv; int main(int argc, char **argv) { TEST_CASES_BEGIN(); - EXPECT_EQ(SaturateCast(1.0f), 1); - EXPECT_EQ(SaturateCast(-1.0f), -1); - EXPECT_EQ(SaturateCast(1.0f), 1); - EXPECT_EQ(SaturateCast(-1.0f), 0); - EXPECT_EQ(SaturateCast(1), 1.0f); - EXPECT_EQ(SaturateCast(-1), -1.0f); - EXPECT_EQ(SaturateCast(1), 1.0); - EXPECT_EQ(SaturateCast(-1), -1.0); + TEST_CASE(EXPECT_EQ(SaturateCast(1.0f), 1)); + TEST_CASE(EXPECT_EQ(SaturateCast(-1.0f), -1)); + TEST_CASE(EXPECT_EQ(SaturateCast(1.0f), 1)); + TEST_CASE(EXPECT_EQ(SaturateCast(-1.0f), 0)); + TEST_CASE(EXPECT_EQ(SaturateCast(1), 1.0f)); + TEST_CASE(EXPECT_EQ(SaturateCast(-1), -1.0f)); + TEST_CASE(EXPECT_EQ(SaturateCast(1), 1.0)); + TEST_CASE(EXPECT_EQ(SaturateCast(-1), -1.0)); // Test numeric limits - EXPECT_EQ(SaturateCast(std::numeric_limits::max()), std::numeric_limits::max()); - EXPECT_EQ(SaturateCast(std::numeric_limits::max()), std::numeric_limits::max()); + TEST_CASE(EXPECT_EQ(SaturateCast(std::numeric_limits::max()), std::numeric_limits::max())); + TEST_CASE(EXPECT_EQ(SaturateCast(std::numeric_limits::max()), std::numeric_limits::max())); + TEST_CASE(EXPECT_EQ(SaturateCast(std::numeric_limits::max()), std::numeric_limits::max())); + TEST_CASE(EXPECT_EQ(SaturateCast(std::numeric_limits::lowest()), 0UL)); // Test vectorized types - EXPECT_TRUE((SaturateCast(uchar4{255, 128, 0, 255}) == float4{255.0f, 128.0f, 0.0f, 255.0f})); - EXPECT_TRUE((SaturateCast(char4{-128, -128, -128, -128}) == float4{-128.0f, -128.0f, -128.0f, -128.0f})); + TEST_CASE(EXPECT_TRUE((SaturateCast(uchar4{255, 128, 0, 255}) == float4{255.0f, 128.0f, 0.0f, 255.0f}))); + TEST_CASE(EXPECT_TRUE( + (SaturateCast(char4{-128, -128, -128, -128}) == float4{-128.0f, -128.0f, -128.0f, -128.0f}))); TEST_CASES_END(); } \ No newline at end of file From 146a1f9740d28eef6b0c7b150e8781fb01ec91a2 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Fri, 6 Feb 2026 15:28:08 -0500 Subject: [PATCH 4/6] Undo changes to composite --- src/op_composite.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/op_composite.cpp b/src/op_composite.cpp index fedc7a9b..6d49a156 100644 --- a/src/op_composite.cpp +++ b/src/op_composite.cpp @@ -40,7 +40,7 @@ void dispatch_composite_masktype(hipStream_t stream, const Tensor& foreground, c switch (device) { case eDeviceType::GPU: { - dim3 block(32, 8); + dim3 block(64, 16); dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y, outputWrapper.batches()); Kernels::Device::composite<<>>(fgWrapper, bgWrapper, maskWrapper, outputWrapper); From e9e9f0b8b8fd7a516f95762a4526fcae12a2dc00 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Fri, 6 Feb 2026 16:18:22 -0500 Subject: [PATCH 5/6] Review fixes --- include/core/detail/casting.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/core/detail/casting.hpp b/include/core/detail/casting.hpp index d9c8ac14..688db24d 100644 --- a/include/core/detail/casting.hpp +++ b/include/core/detail/casting.hpp @@ -21,6 +21,8 @@ #pragma once +#include + #include "core/detail/type_traits.hpp" namespace roccv::detail { @@ -153,9 +155,7 @@ __device__ __host__ T ScalarRangeCast(U v) { #endif } else { // 32/64 bit integer cases. - return v >= U{1} ? std::numeric_limits::max() - : v <= U{-1} ? std::numeric_limits::min() - : static_cast(std::round(v * scale)); + return v >= U{1} ? std::numeric_limits::max() : v <= U{0} ? 0 : static_cast(std::round(v * scale)); } } From 13a78bec09a2d3e04fa7ad3a50a1cde242fe7cfb Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Fri, 6 Feb 2026 16:21:04 -0500 Subject: [PATCH 6/6] Add another test case for RangeCast --- tests/roccv/cpp/src/tests/core/detail/test_range_cast.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/roccv/cpp/src/tests/core/detail/test_range_cast.cpp b/tests/roccv/cpp/src/tests/core/detail/test_range_cast.cpp index 7a2ccf5b..2d9ea7dd 100644 --- a/tests/roccv/cpp/src/tests/core/detail/test_range_cast.cpp +++ b/tests/roccv/cpp/src/tests/core/detail/test_range_cast.cpp @@ -37,6 +37,8 @@ int main(int argc, char **argv) { TEST_CASE(EXPECT_EQ(RangeCast(-1.0f), std::numeric_limits::min())); TEST_CASE(EXPECT_EQ(RangeCast(1.0f), std::numeric_limits::max())); TEST_CASE(EXPECT_EQ(RangeCast(-1.0f), 0)); + TEST_CASE(EXPECT_EQ(RangeCast(0.0f), 0)); + // Test unsigned/signed integer -> float casting TEST_CASE(EXPECT_EQ(RangeCast(std::numeric_limits::max()), 1.0f));