From 4232bcd3e4480337c8e25de087ab3b5f0af9531d Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Wed, 28 Jan 2026 14:58:24 -0500
Subject: [PATCH 1/6] Avoid branching in casting implementations

---
 include/core/detail/casting.hpp | 90 +++++++++++++++++++++------------
 src/op_composite.cpp            |  2 +-
 2 files changed, 58 insertions(+), 34 deletions(-)
diff --git a/include/core/detail/casting.hpp b/include/core/detail/casting.hpp
index ea21b0bc..bc6bedf9 100644
--- a/include/core/detail/casting.hpp
+++ b/include/core/detail/casting.hpp
@@ -36,36 +36,48 @@ __device__ __host__ T ScalarSaturateCast(U v) {
     constexpr bool bigToSmall = !smallToBig;
 
     if constexpr (std::is_integral_v<T> && std::is_floating_point_v<U>) {
-        // Any float -> any integral
-        return static_cast<T>(std::clamp<U>(std::round(v), static_cast<U>(std::numeric_limits<T>::min()),
-                                            static_cast<U>(std::numeric_limits<T>::max())));
-    } else if constexpr (std::is_integral_v<T> && std::is_integral_v<U> && std::is_signed_v<U> && std::is_signed_v<T> &&
-                         smallToBig) {
-        // Any integral signed -> Any integral unsigned, small -> big or equal
-        return v <= 0 ? 0 : static_cast<T>(v);
-    } else if constexpr (std::is_integral_v<U> && std::is_integral_v<T> &&
-                         ((std::is_signed_v<U> && std::is_signed_v<T>) ||
-                          (std::is_unsigned_v<U> && std::is_unsigned_v<T>)) &&
-                         bigToSmall) {
-        // Any integral signed -> Any integral signed, big -> small
-        // Any integral unsigned -> Any integral unsigned, big -> small
-        return v <= std::numeric_limits<T>::min()
-                   ? std::numeric_limits<T>::min()
-                   : (v >= std::numeric_limits<T>::max() ? std::numeric_limits<T>::max() : static_cast<T>(v));
-    } else if constexpr (std::is_integral_v<U> && std::is_unsigned_v<U> && std::is_integral_v<T> &&
-                         std::is_signed_v<T>) {
-        // Any integral unsigned -> Any integral signed, small -> big or equal
-        return v >= std::numeric_limits<T>::max() ? std::numeric_limits<T>::max() : static_cast<T>(v);
-    } else if constexpr (std::is_integral_v<U> && std::is_signed_v<U> && std::is_integral_v<T> &&
-                         std::is_unsigned_v<T> && bigToSmall) {
-        // Any integral signed -> Any integral unsigned, big -> small
-        return v <= static_cast<U>(std::numeric_limits<T>::min())
-                   ? std::numeric_limits<T>::min()
-                   : (v >= static_cast<U>(std::numeric_limits<T>::max()) ? std::numeric_limits<T>::max()
-                                                                         : static_cast<T>(v));
-    } else {
-        // All other cases fall into this
-        return v;
+        // Float -> integral: clamp then round
+        constexpr U minVal = static_cast<U>(std::numeric_limits<T>::min());
+        constexpr U maxVal = static_cast<U>(std::numeric_limits<T>::max());
+#ifdef __HIP_DEVICE_COMPILE__
+        return static_cast<T>(rintf(fminf(fmaxf(v, minVal), maxVal)));
+#else
+        return static_cast<T>(std::round(std::clamp(v, minVal, maxVal)));
+#endif
+    }
+
+    else if constexpr (std::is_integral_v<T> && std::is_integral_v<U> && std::is_signed_v<U> && std::is_unsigned_v<T> &&
+                       smallToBig) {
+        // Signed -> unsigned, small to big: clamp negative to 0
+        // Branchless: max(v, 0) handles negative values
+        return static_cast<T>(max(v, U{0}));
+    }
+
+    else if constexpr (std::is_integral_v<U> && std::is_integral_v<T> &&
+                       ((std::is_signed_v<U> && std::is_signed_v<T>) ||
+                        (std::is_unsigned_v<U> && std::is_unsigned_v<T>)) &&
+                       bigToSmall) {
+        // Same signedness, big -> small: clamp to [min, max]
+        constexpr U minVal = static_cast<U>(std::numeric_limits<T>::min());
+        constexpr U maxVal = static_cast<U>(std::numeric_limits<T>::max());
+        return static_cast<T>(min(max(v, minVal), maxVal));
+    }
+
+    else if constexpr (std::is_integral_v<U> && std::is_unsigned_v<U> && std::is_integral_v<T> && std::is_signed_v<T>) {
+        // Unsigned -> signed: clamp to max (can't exceed min since unsigned)
+        constexpr U maxVal = static_cast<U>(std::numeric_limits<T>::max());
+        return static_cast<T>(min(v, maxVal));
+    }
+
+    else if constexpr (std::is_integral_v<U> && std::is_signed_v<U> && std::is_integral_v<T> && std::is_unsigned_v<T> &&
+                       bigToSmall) {
+        // Signed -> unsigned, big -> small: clamp to [0, max]
+        constexpr U maxVal = static_cast<U>(std::numeric_limits<T>::max());
+        return static_cast<T>(min(max(v, U{0}), maxVal));
+    }
+
+    else {
+        return static_cast<T>(v);
     }
 }
 
@@ -117,9 +129,21 @@ __device__ __host__ T ScalarRangeCast(U v) {
 
     else if constexpr (std::is_integral_v<T> && std::is_floating_point_v<U> && std::is_unsigned_v<T>) {
         // float to unsigned integers
-        return v >= T{1}   ? std::numeric_limits<T>::max()
-               : v <= T{0} ? 0
-                           : static_cast<T>(lrintf(static_cast<U>(std::numeric_limits<T>::max()) * v));
+        constexpr U scale = static_cast<U>(std::numeric_limits<T>::max());
+
+        if constexpr (sizeof(T) <= 2) {
+            // 8/16 bit integer cases. These can be represented exactly in floating point.
+#ifdef __HIP_DEVICE_COMPILE__
+            return static_cast<T>(__float2int_rn(__saturatef(v) * scale));
+#else
+            return static_cast<T>(lrintf(fminf(fmaxf(v, 0.0f), 1.0f) * scale));
+#endif
+        } else {
+            // 32/64 bit integer cases.
+            return v >= U{1}    ? std::numeric_limits<T>::max()
+                   : v <= U{-1} ? std::numeric_limits<T>::min()
+                                : static_cast<T>(std::round(v * scale));
+        }
     }
 
     else if constexpr (std::is_floating_point_v<T> && std::is_integral_v<U> && std::is_signed_v<U>) {
diff --git a/src/op_composite.cpp b/src/op_composite.cpp
index 6d49a156..fedc7a9b 100644
--- a/src/op_composite.cpp
+++ b/src/op_composite.cpp
@@ -40,7 +40,7 @@ void dispatch_composite_masktype(hipStream_t stream, const Tensor& foreground, c
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block(64, 16);
+            dim3 block(32, 8);
             dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
                       outputWrapper.batches());
             Kernels::Device::composite<<<grid, block, 0, stream>>>(fgWrapper, bgWrapper, maskWrapper, outputWrapper);

From 77cabc7a032143d0a13d08550228ef675d69ec03 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Tue, 3 Feb 2026 12:22:49 -0500
Subject: [PATCH 2/6] Add more tests for Saturate cast

---
 .../tests/core/detail/test_saturate_cast.cpp  | 51 +++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 tests/roccv/cpp/src/tests/core/detail/test_saturate_cast.cpp

diff --git a/tests/roccv/cpp/src/tests/core/detail/test_saturate_cast.cpp b/tests/roccv/cpp/src/tests/core/detail/test_saturate_cast.cpp
new file mode 100644
index 00000000..015265c5
--- /dev/null
+++ b/tests/roccv/cpp/src/tests/core/detail/test_saturate_cast.cpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <core/detail/casting.hpp>
+
+#include "test_helpers.hpp"
+
+using namespace roccv::detail;
+using namespace roccv::tests;
+using namespace roccv;
+
+int main(int argc, char **argv) {
+    TEST_CASES_BEGIN();
+
+    EXPECT_EQ(SaturateCast<int>(1.0f), 1);
+    EXPECT_EQ(SaturateCast<int>(-1.0f), -1);
+    EXPECT_EQ(SaturateCast<uint>(1.0f), 1);
+    EXPECT_EQ(SaturateCast<uint>(-1.0f), 0);
+    EXPECT_EQ(SaturateCast<float>(1), 1.0f);
+    EXPECT_EQ(SaturateCast<float>(-1), -1.0f);
+    EXPECT_EQ(SaturateCast<double>(1), 1.0);
+    EXPECT_EQ(SaturateCast<double>(-1), -1.0);
+
+    // Test numeric limits
+    EXPECT_EQ(SaturateCast<int>(std::numeric_limits<float>::max()), std::numeric_limits<int>::max());
+    EXPECT_EQ(SaturateCast<uint8_t>(std::numeric_limits<float>::max()), std::numeric_limits<uint8_t>::max());
+
+    // Test vectorized types
+    EXPECT_TRUE((SaturateCast<float4>(uchar4{255, 128, 0, 255}) == float4{255.0f, 128.0f, 0.0f, 255.0f}));
+    EXPECT_TRUE((SaturateCast<float4>(char4{-128, -128, -128, -128}) == float4{-128.0f, -128.0f, -128.0f, -128.0f}));
+
+    TEST_CASES_END();
+}
\ No newline at end of file

From d887102cacc81425561b102d364c9646d5e5d5b2 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Fri, 6 Feb 2026 15:08:08 -0500
Subject: [PATCH 3/6] Fix issues with float -> integer saturate casts

---
 include/core/detail/casting.hpp               | 25 ++++++++++++-----
 include/core/detail/type_traits.hpp           |  3 +++
 .../tests/core/detail/test_saturate_cast.cpp  | 27 ++++++++++---------
 3 files changed, 37 insertions(+), 18 deletions(-)

diff --git a/include/core/detail/casting.hpp b/include/core/detail/casting.hpp
index bc6bedf9..d9c8ac14 100644
--- a/include/core/detail/casting.hpp
+++ b/include/core/detail/casting.hpp
@@ -21,8 +21,6 @@
 
 #pragma once
 
-#include <algorithm>
-
 #include "core/detail/type_traits.hpp"
 
 namespace roccv::detail {
@@ -36,14 +34,29 @@ __device__ __host__ T ScalarSaturateCast(U v) {
     constexpr bool bigToSmall = !smallToBig;
 
     if constexpr (std::is_integral_v<T> && std::is_floating_point_v<U>) {
-        // Float -> integral: clamp then round
-        constexpr U minVal = static_cast<U>(std::numeric_limits<T>::min());
+        // Float -> integral: clamp to [min, max] then round.
+        constexpr U minVal = static_cast<U>(std::numeric_limits<T>::lowest());
         constexpr U maxVal = static_cast<U>(std::numeric_limits<T>::max());
+
+        if constexpr (sizeof(T) <= 2) {
+            // 8/16 bit integer cases. These can be represented exactly in floating point.
+#ifdef __HIP_DEVICE_COMPILE__
+            return static_cast<T>(rintf(fminf(fmaxf(v, minVal), maxVal)));
+#else
+            return static_cast<T>(std::round(std::clamp(v, minVal, maxVal)));
+#endif
+        } else {
+            // 32/64 bit integer cases.
 #ifdef __HIP_DEVICE_COMPILE__
-        return static_cast<T>(rintf(fminf(fmaxf(v, minVal), maxVal)));
+            U rounded = rintf(v);
 #else
-        return static_cast<T>(std::round(std::clamp(v, minVal, maxVal)));
+            U rounded = std::round(v);
 #endif
+
+            return rounded >= maxVal   ? std::numeric_limits<T>::max()
+                   : rounded <= minVal ? std::numeric_limits<T>::min()
+                                       : static_cast<T>(rounded);
+        }
     }
 
     else if constexpr (std::is_integral_v<T> && std::is_integral_v<U> && std::is_signed_v<U> && std::is_unsigned_v<T> &&
diff --git a/include/core/detail/type_traits.hpp b/include/core/detail/type_traits.hpp
index dcf77eb0..32f14d58 100644
--- a/include/core/detail/type_traits.hpp
+++ b/include/core/detail/type_traits.hpp
@@ -20,6 +20,7 @@
  */
 
 #include <hip/hip_runtime.h>
+
 #include <cassert>
 
 #pragma once
@@ -83,6 +84,8 @@ DEFINE_TYPE_TRAITS_0_TO_4(int, signed int);
 DEFINE_TYPE_TRAITS_0_TO_4(short, signed short);
 DEFINE_TYPE_TRAITS_0_TO_4(ushort, unsigned short);
 DEFINE_TYPE_TRAITS_0_TO_4(double, double);
+DEFINE_TYPE_TRAITS_0_TO_4(long, signed long);
+DEFINE_TYPE_TRAITS_0_TO_4(ulong, unsigned long);
 
 /**
  * @brief Returns the number of elements in a HIP vectorized type. For example: uchar3 will return 3, int2 will
diff --git a/tests/roccv/cpp/src/tests/core/detail/test_saturate_cast.cpp b/tests/roccv/cpp/src/tests/core/detail/test_saturate_cast.cpp
index 015265c5..84be5ee2 100644
--- a/tests/roccv/cpp/src/tests/core/detail/test_saturate_cast.cpp
+++ b/tests/roccv/cpp/src/tests/core/detail/test_saturate_cast.cpp
@@ -30,22 +30,25 @@ using namespace roccv;
 int main(int argc, char **argv) {
     TEST_CASES_BEGIN();
 
-    EXPECT_EQ(SaturateCast<int>(1.0f), 1);
-    EXPECT_EQ(SaturateCast<int>(-1.0f), -1);
-    EXPECT_EQ(SaturateCast<uint>(1.0f), 1);
-    EXPECT_EQ(SaturateCast<uint>(-1.0f), 0);
-    EXPECT_EQ(SaturateCast<float>(1), 1.0f);
-    EXPECT_EQ(SaturateCast<float>(-1), -1.0f);
-    EXPECT_EQ(SaturateCast<double>(1), 1.0);
-    EXPECT_EQ(SaturateCast<double>(-1), -1.0);
+    TEST_CASE(EXPECT_EQ(SaturateCast<int>(1.0f), 1));
+    TEST_CASE(EXPECT_EQ(SaturateCast<int>(-1.0f), -1));
+    TEST_CASE(EXPECT_EQ(SaturateCast<uint>(1.0f), 1));
+    TEST_CASE(EXPECT_EQ(SaturateCast<uint>(-1.0f), 0));
+    TEST_CASE(EXPECT_EQ(SaturateCast<float>(1), 1.0f));
+    TEST_CASE(EXPECT_EQ(SaturateCast<float>(-1), -1.0f));
+    TEST_CASE(EXPECT_EQ(SaturateCast<double>(1), 1.0));
+    TEST_CASE(EXPECT_EQ(SaturateCast<double>(-1), -1.0));
 
     // Test numeric limits
-    EXPECT_EQ(SaturateCast<int>(std::numeric_limits<float>::max()), std::numeric_limits<int>::max());
-    EXPECT_EQ(SaturateCast<uint8_t>(std::numeric_limits<float>::max()), std::numeric_limits<uint8_t>::max());
+    TEST_CASE(EXPECT_EQ(SaturateCast<int>(std::numeric_limits<float>::max()), std::numeric_limits<int>::max()));
+    TEST_CASE(EXPECT_EQ(SaturateCast<uint8_t>(std::numeric_limits<float>::max()), std::numeric_limits<uint8_t>::max()));
+    TEST_CASE(EXPECT_EQ(SaturateCast<long>(std::numeric_limits<float>::max()), std::numeric_limits<long>::max()));
+    TEST_CASE(EXPECT_EQ(SaturateCast<ulong>(std::numeric_limits<float>::lowest()), 0UL));
 
     // Test vectorized types
-    EXPECT_TRUE((SaturateCast<float4>(uchar4{255, 128, 0, 255}) == float4{255.0f, 128.0f, 0.0f, 255.0f}));
-    EXPECT_TRUE((SaturateCast<float4>(char4{-128, -128, -128, -128}) == float4{-128.0f, -128.0f, -128.0f, -128.0f}));
+    TEST_CASE(EXPECT_TRUE((SaturateCast<float4>(uchar4{255, 128, 0, 255}) == float4{255.0f, 128.0f, 0.0f, 255.0f})));
+    TEST_CASE(EXPECT_TRUE(
+        (SaturateCast<float4>(char4{-128, -128, -128, -128}) == float4{-128.0f, -128.0f, -128.0f, -128.0f})));
 
     TEST_CASES_END();
 }
\ No newline at end of file

From 146a1f9740d28eef6b0c7b150e8781fb01ec91a2 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Fri, 6 Feb 2026 15:28:08 -0500
Subject: [PATCH 4/6] Undo changes to composite

---
 src/op_composite.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/op_composite.cpp b/src/op_composite.cpp
index fedc7a9b..6d49a156 100644
--- a/src/op_composite.cpp
+++ b/src/op_composite.cpp
@@ -40,7 +40,7 @@ void dispatch_composite_masktype(hipStream_t stream, const Tensor& foreground, c
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block(32, 8);
+            dim3 block(64, 16);
             dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
                       outputWrapper.batches());
             Kernels::Device::composite<<<grid, block, 0, stream>>>(fgWrapper, bgWrapper, maskWrapper, outputWrapper);

From e9e9f0b8b8fd7a516f95762a4526fcae12a2dc00 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Fri, 6 Feb 2026 16:18:22 -0500
Subject: [PATCH 5/6] Review fixes

---
 include/core/detail/casting.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/core/detail/casting.hpp b/include/core/detail/casting.hpp
index d9c8ac14..688db24d 100644
--- a/include/core/detail/casting.hpp
+++ b/include/core/detail/casting.hpp
@@ -21,6 +21,8 @@
 
 #pragma once
 
+#include <algorithm>
+
 #include "core/detail/type_traits.hpp"
 
 namespace roccv::detail {
@@ -153,9 +155,7 @@ __device__ __host__ T ScalarRangeCast(U v) {
 #endif
         } else {
             // 32/64 bit integer cases.
-            return v >= U{1}    ? std::numeric_limits<T>::max()
-                   : v <= U{-1} ? std::numeric_limits<T>::min()
-                                : static_cast<T>(std::round(v * scale));
+            return v >= U{1} ? std::numeric_limits<T>::max() : v <= U{0} ? 0 : static_cast<T>(std::round(v * scale));
         }
     }
 

From 13a78bec09a2d3e04fa7ad3a50a1cde242fe7cfb Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Fri, 6 Feb 2026 16:21:04 -0500
Subject: [PATCH 6/6] Add another test case for RangeCast

---
 tests/roccv/cpp/src/tests/core/detail/test_range_cast.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/roccv/cpp/src/tests/core/detail/test_range_cast.cpp b/tests/roccv/cpp/src/tests/core/detail/test_range_cast.cpp
index 7a2ccf5b..2d9ea7dd 100644
--- a/tests/roccv/cpp/src/tests/core/detail/test_range_cast.cpp
+++ b/tests/roccv/cpp/src/tests/core/detail/test_range_cast.cpp
@@ -37,6 +37,8 @@ int main(int argc, char **argv) {
     TEST_CASE(EXPECT_EQ(RangeCast<int>(-1.0f), std::numeric_limits<int>::min()));
     TEST_CASE(EXPECT_EQ(RangeCast<uint>(1.0f), std::numeric_limits<uint>::max()));
     TEST_CASE(EXPECT_EQ(RangeCast<uint>(-1.0f), 0));
+    TEST_CASE(EXPECT_EQ(RangeCast<uint>(0.0f), 0));
+
 
     // Test unsigned/signed integer -> float casting
     TEST_CASE(EXPECT_EQ(RangeCast<float>(std::numeric_limits<int>::max()), 1.0f));