From 0ddbf574b6f5ff0630a60d42e022eb8b77b6046e Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Tue, 5 May 2026 10:13:03 -0400
Subject: [PATCH 01/13] Add ImageData/ImageBuffer class and tests

---
 include/core/image_buffer.hpp                 |  81 +++++++
 include/core/image_data.hpp                   | 178 ++++++++++++++
 src/core/image_data.cpp                       |  88 +++++++
 .../src/tests/core/image/test_image_data.cpp  | 225 ++++++++++++++++++
 4 files changed, 572 insertions(+)
 create mode 100644 include/core/image_buffer.hpp
 create mode 100644 include/core/image_data.hpp
 create mode 100644 src/core/image_data.cpp
 create mode 100644 tests/roccv/cpp/src/tests/core/image/test_image_data.cpp

diff --git a/include/core/image_buffer.hpp b/include/core/image_buffer.hpp
new file mode 100644
index 00000000..aea93c2d
--- /dev/null
+++ b/include/core/image_buffer.hpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+/** Maximum number of planes per image; sizes the fixed `planes` array of ImageBufferStrided. */
+#define ROCCV_MAX_IMAGE_PLANES (6)
+
+namespace roccv {
+
+/**
+ * @brief Describes a single pitch-linear image plane.
+ *
+ * For interleaved-channel formats there is exactly one plane covering the whole
+ * image. For planar formats (e.g. NV12, YUV420) each plane carries its own
+ * width, height, row stride, and base pointer. Fields are not validated here.
+ */
+struct ImagePlaneStrided {
+    /** Width of this plane in pixels. Must be >= 1. */
+    int32_t width;
+
+    /** Height of this plane in pixels. Must be >= 1. */
+    int32_t height;
+
+    /** Distance in bytes between the start of consecutive rows. Must be at
+     *  least `(width * bits-per-pixel + 7) / 8`, i.e. one tightly packed row. */
+    int64_t rowStride;
+
+    /** Pointer to the first byte of plane data. Validity (device vs host) is
+     *  determined by the enclosing data type. */
+    void* basePtr;
+};
+
+/**
+ * @brief A pitch-linear image buffer: one or more `ImagePlaneStrided` entries.
+ *
+ * Only the first `numPlanes` entries carry valid data; the rest of the
+ * fixed-size `planes` array is unused. The fixed capacity keeps the struct
+ * trivially copyable, so it can be embedded in `ImageBuffer` by value. Members
+ * have no default initializers; value-initialize (`buf{}`) before filling in.
+ */
+struct ImageBufferStrided {
+    /** Number of valid planes. Must be >= 1 and <= ROCCV_MAX_IMAGE_PLANES. */
+    int32_t numPlanes;
+
+    /** Per-plane descriptors. Only the first `numPlanes` are valid. */
+    ImagePlaneStrided planes[ROCCV_MAX_IMAGE_PLANES];
+};
+
+/**
+ * @brief An image buffer. Currently only the strided variant is supported.
+ * Mirrors the role `TensorBuffer` plays for tensors. Today it is a plain struct
+ * with a single `strided` member; the wrapper exists so additional buffer kinds
+ * can be added later (e.g. HIP textures) without changing the public type.
+ */
+struct ImageBuffer {
+    ImageBufferStrided strided;
+};
+
+}  // namespace roccv
diff --git a/include/core/image_data.hpp b/include/core/image_data.hpp
new file mode 100644
index 00000000..cf45e71c
--- /dev/null
+++ b/include/core/image_data.hpp
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include <optional>
+#include <type_traits>
+
+#include "core/image_buffer.hpp"
+#include "core/image_format.hpp"
+#include "core/util_enums.h"
+#include "operator_types.h"
+
+namespace roccv {
+
+/**
+ * @brief Discriminator for the kind of buffer an ImageData carries. Used by
+ * IsCompatibleKind() / cast<>() to perform safe runtime down-casting through
+ * the ImageData hierarchy.
+ */
+enum class ImageBufferType {
+    IMAGE_BUFFER_NONE,         // No concrete kind; the value the ImageData base constructor assigns.
+    IMAGE_BUFFER_STRIDED_HIP,  // GPU-accessible buffer with strided access.
+    IMAGE_BUFFER_STRIDED_HOST  // Host-accessible buffer with strided access.
+};
+
+/**
+ * @brief Holds the underlying image data alongside metadata (format, buffer
+ * kind). Non-strided image data is not supported for use right now; use
+ * ImageDataStrided to access strided image data instead.
+ *
+ * ImageData is the interchange type for a single variable-sized image. It
+ * does not own the underlying pixel buffer — it is a metadata snapshot, valid
+ * only as long as the producing buffer outlives it.
+ */
+class ImageData {
+   public:
+    ImageData() = delete;
+    virtual ~ImageData() = default;
+
+    /** @brief Returns the pixel format of the image. */
+    virtual const ImageFormat &format() const;
+
+    /** @brief Returns the device the image data resides on. */
+    virtual eDeviceType device() const;
+
+    /**
+     * @brief Attempts to down-cast this ImageData to a more specific subclass.
+     * Returns a copy typed as Derived that keeps this object's buffer kind and
+     * device when Derived::IsCompatibleKind() accepts the buffer kind, or
+     * std::nullopt otherwise.
+     * @tparam Derived The target subclass to cast to.
+     */
+    template <typename Derived>
+    std::optional<Derived> cast() const {
+        static_assert(std::is_base_of<ImageData, Derived>::value, "Cannot cast ImageData to an unrelated type.");
+        static_assert(sizeof(Derived) == sizeof(ImageData), "Derived type must not add any additional data members.");
+        if (!Derived::IsCompatibleKind(m_bufferType)) {
+            return std::nullopt;
+        }
+        // Intermediate types (e.g. ImageDataStrided) leave the base-class defaults (NONE / GPU) in place, so
+        // carry the source's tags over; otherwise a host image cast to ImageDataStrided would claim to be GPU.
+        std::optional<Derived> result = std::make_optional<Derived>(m_format, m_buffer);
+        result->m_bufferType = m_bufferType;
+        result->m_deviceType = m_deviceType;
+        return result;
+    }
+
+    /** @brief Returns true for every buffer kind except IMAGE_BUFFER_NONE. */
+    static bool IsCompatibleKind(ImageBufferType bufferType);
+
+   protected:
+    ImageData(const ImageFormat &format, const ImageBuffer &buffer);
+
+    ImageFormat m_format;
+    eDeviceType m_deviceType;
+    ImageBufferType m_bufferType;
+    ImageBuffer m_buffer;
+};
+
+/**
+ * @brief Image data backed by one or more pitch-linear planes. Adds typed
+ * plane accessors on top of ImageData. ImageDataStridedHip/Host tag device vs
+ * host residency; constructing this class directly keeps the base defaults
+ * (IMAGE_BUFFER_NONE, GPU), so prefer one of the leaf types.
+ */
+class ImageDataStrided : public ImageData {
+   public:
+    using Buffer = ImageBufferStrided;
+
+    ImageDataStrided(const ImageFormat &format, const ImageBuffer &buffer);
+
+    static bool IsCompatibleKind(ImageBufferType bufferType);
+
+    /**
+     * @brief Returns the logical image dimensions, taken from plane 0. For
+     * planar formats, individual planes may have smaller dimensions (e.g.
+     * chroma sub-sampling); use plane(p) to inspect each plane directly.
+     */
+    Size2D size() const;
+
+    /**
+     * @brief Returns the number of valid planes in the buffer.
+     */
+    int32_t numPlanes() const;
+
+    /**
+     * @brief Returns the descriptor for the requested plane.
+     * The index is not range-checked by the implementation.
+     * @param[in] p The plane index. Must satisfy `0 <= p < numPlanes()`.
+     */
+    const ImagePlaneStrided &plane(int32_t p) const;
+};
+
+/**
+ * @brief GPU-accessible strided image data. Construction tags the object IMAGE_BUFFER_STRIDED_HIP / GPU.
+ */
+class ImageDataStridedHip : public ImageDataStrided {
+   public:
+    using Buffer = ImageBufferStrided;
+
+    ImageDataStridedHip(const ImageFormat &format, const ImageBuffer &buffer);
+
+    /**
+     * @brief Constructs GPU-accessible strided image data from a strided
+     * image buffer directly; wraps it in an ImageBuffer and delegates.
+     *
+     * @param[in] format The pixel format.
+     * @param[in] buffer A strided image buffer with planes allocated on the GPU.
+     */
+    ImageDataStridedHip(const ImageFormat &format, const Buffer &buffer);
+
+    static bool IsCompatibleKind(ImageBufferType bufferType);
+};
+
+/**
+ * @brief Host-accessible strided image data. Construction tags the object IMAGE_BUFFER_STRIDED_HOST / CPU.
+ */
+class ImageDataStridedHost : public ImageDataStrided {
+   public:
+    using Buffer = ImageBufferStrided;
+
+    ImageDataStridedHost(const ImageFormat &format, const ImageBuffer &buffer);
+
+    /**
+     * @brief Constructs host-accessible strided image data from a strided
+     * image buffer directly; wraps it in an ImageBuffer and delegates.
+     *
+     * @param[in] format The pixel format.
+     * @param[in] buffer A strided image buffer with planes allocated on the host.
+     */
+    ImageDataStridedHost(const ImageFormat &format, const Buffer &buffer);
+
+    static bool IsCompatibleKind(ImageBufferType bufferType);
+};
+
+}  // namespace roccv
diff --git a/src/core/image_data.cpp b/src/core/image_data.cpp
new file mode 100644
index 00000000..6fb0fc83
--- /dev/null
+++ b/src/core/image_data.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "core/image_data.hpp"
+
+#include "core/image_buffer.hpp"
+#include "core/image_format.hpp"
+#include "core/util_enums.h"
+
+namespace roccv {
+
+const ImageFormat& ImageData::format() const { return m_format; }  // Borrowed; valid only while *this lives.
+
+eDeviceType ImageData::device() const { return m_deviceType; }  // GPU by default; ImageDataStridedHost sets CPU.
+
+ImageData::ImageData(const ImageFormat& format, const ImageBuffer& buffer)
+    : m_format(format),
+      m_deviceType(eDeviceType::GPU),                    // Default; ImageDataStridedHost overrides it with CPU.
+      m_bufferType(ImageBufferType::IMAGE_BUFFER_NONE),  // No kind yet; the Hip/Host leaf constructors tag it.
+      m_buffer(buffer) {}
+
+bool ImageData::IsCompatibleKind(ImageBufferType bufferType) {
+    return !(bufferType == ImageBufferType::IMAGE_BUFFER_NONE);  // Every concrete kind is acceptable to the base.
+}
+
+ImageDataStrided::ImageDataStrided(const ImageFormat& format, const ImageBuffer& buffer)
+    : ImageData(format, buffer) {}  // Kind and device keep the ImageData defaults (IMAGE_BUFFER_NONE, GPU).
+
+bool ImageDataStrided::IsCompatibleKind(ImageBufferType bufferType) {
+    const bool isHip = bufferType == ImageBufferType::IMAGE_BUFFER_STRIDED_HIP;
+    return isHip || bufferType == ImageBufferType::IMAGE_BUFFER_STRIDED_HOST;  // Either residency is strided.
+}
+
+Size2D ImageDataStrided::size() const {
+    const auto& firstPlane = m_buffer.strided.planes[0];  // Plane 0 carries the logical image extent.
+    return Size2D{firstPlane.width, firstPlane.height};
+}
+
+int32_t ImageDataStrided::numPlanes() const { return m_buffer.strided.numPlanes; }
+// plane() does no range check: callers must keep 0 <= p < numPlanes().
+const ImagePlaneStrided& ImageDataStrided::plane(int32_t p) const { return m_buffer.strided.planes[p]; }
+
+ImageDataStridedHip::ImageDataStridedHip(const ImageFormat& format, const ImageBuffer& buffer)
+    : ImageDataStrided(format, buffer) {
+    m_deviceType = eDeviceType::GPU;                           // Planes are GPU-accessible.
+    m_bufferType = ImageBufferType::IMAGE_BUFFER_STRIDED_HIP;  // Kind consulted by cast<>().
+}
+
+ImageDataStridedHip::ImageDataStridedHip(const ImageFormat& format, const Buffer& buffer)
+    : ImageDataStridedHip(format, ImageBuffer{.strided = buffer}) {}  // Wrap the strided buffer, then delegate.
+
+bool ImageDataStridedHip::IsCompatibleKind(ImageBufferType bufferType) {
+    return ImageBufferType::IMAGE_BUFFER_STRIDED_HIP == bufferType;  // Only the GPU-strided kind matches.
+}
+
+ImageDataStridedHost::ImageDataStridedHost(const ImageFormat& format, const ImageBuffer& buffer)
+    : ImageDataStrided(format, buffer) {
+    m_deviceType = eDeviceType::CPU;                            // Planes are host-accessible.
+    m_bufferType = ImageBufferType::IMAGE_BUFFER_STRIDED_HOST;  // Kind consulted by cast<>().
+}
+
+ImageDataStridedHost::ImageDataStridedHost(const ImageFormat& format, const Buffer& buffer)
+    : ImageDataStridedHost(format, ImageBuffer{.strided = buffer}) {}  // Wrap the strided buffer, then delegate.
+
+bool ImageDataStridedHost::IsCompatibleKind(ImageBufferType bufferType) {
+    return ImageBufferType::IMAGE_BUFFER_STRIDED_HOST == bufferType;  // Only the host-strided kind matches.
+}
+
+}  // namespace roccv
diff --git a/tests/roccv/cpp/src/tests/core/image/test_image_data.cpp b/tests/roccv/cpp/src/tests/core/image/test_image_data.cpp
new file mode 100644
index 00000000..885b1aa5
--- /dev/null
+++ b/tests/roccv/cpp/src/tests/core/image/test_image_data.cpp
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <stdint.h>
+
+#include <core/image_buffer.hpp>
+#include <core/image_data.hpp>
+#include <core/image_format.hpp>
+
+#include "test_helpers.hpp"
+
+using namespace roccv;
+using namespace roccv::tests;
+
+namespace {
+
+// EXPECT_EQ stringifies both operands with std::to_string, which only has
+// numeric overloads, so enums, bools, and pointers are converted first.
+const auto AsInt = [](auto value) { return static_cast<int>(value); };
+const auto AsAddr = [](void* ptr) { return reinterpret_cast<uintptr_t>(ptr); };
+
+// ImageData only stores plane pointers and never reads through them, so the
+// tests use recognizable sentinel addresses instead of real allocations and
+// simply check that each value comes back out unchanged.
+void* const FAKE_PTR_A = reinterpret_cast<void*>(uintptr_t{0xAAAAAAAAu});
+void* const FAKE_PTR_B = reinterpret_cast<void*>(uintptr_t{0xBBBBBBBBu});
+void* const FAKE_PTR_C = reinterpret_cast<void*>(uintptr_t{0xCCCCCCCCu});
+
+ImageBufferStrided MakeSinglePlaneBuffer(int32_t width, int32_t height, int64_t rowStride, void* basePtr) {
+    ImageBufferStrided single{};  // Value-init so the unused plane slots are zeroed.
+    single.planes[0] = ImagePlaneStrided{width, height, rowStride, basePtr};
+    single.numPlanes = 1;
+    return single;
+}
+
+ImageBufferStrided MakeThreePlaneBuffer() {
+    // Planar layout in the spirit of YUV 4:2:0: a full-resolution luma plane plus
+    // two half-resolution chroma planes, each with its own stride and pointer.
+    ImageBufferStrided planar{};
+    planar.planes[0] = ImagePlaneStrided{1920, 1080, 1920, FAKE_PTR_A};  // Y full-resolution
+    planar.planes[1] = ImagePlaneStrided{960, 540, 960, FAKE_PTR_B};     // U sub-sampled
+    planar.planes[2] = ImagePlaneStrided{960, 540, 960, FAKE_PTR_C};     // V sub-sampled
+    planar.numPlanes = 3;
+    return planar;
+}
+
+/**
+ * @brief A single-plane RGB8 buffer wrapped as ImageDataStridedHip must report
+ * GPU residency and return the geometry, stride, and pointer it was given.
+ */
+void TestImageDataStridedHipConstruction() {
+    const ImageBufferStrided strided = MakeSinglePlaneBuffer(640, 480, 640 * 3, FAKE_PTR_A);
+    const ImageDataStridedHip image(FMT_RGB8, strided);
+
+    EXPECT_EQ(AsInt(image.device()), AsInt(eDeviceType::GPU));
+    EXPECT_EQ(image.format().channels(), 3);
+    EXPECT_EQ(image.numPlanes(), 1);
+    EXPECT_EQ(image.size().w, 640);
+    EXPECT_EQ(image.size().h, 480);
+    EXPECT_EQ(image.plane(0).width, 640);
+    EXPECT_EQ(image.plane(0).height, 480);
+    EXPECT_EQ(image.plane(0).rowStride, static_cast<int64_t>(640 * 3));
+    EXPECT_EQ(AsAddr(image.plane(0).basePtr), AsAddr(FAKE_PTR_A));
+}
+
+/**
+ * @brief Host counterpart of the Hip construction test: single-plane U8 data must report CPU residency.
+ */
+void TestImageDataStridedHostConstruction() {
+    const ImageBufferStrided strided = MakeSinglePlaneBuffer(320, 240, 320, FAKE_PTR_B);
+    const ImageDataStridedHost image(FMT_U8, strided);
+
+    EXPECT_EQ(AsInt(image.device()), AsInt(eDeviceType::CPU));
+    EXPECT_EQ(image.format().channels(), 1);
+    EXPECT_EQ(image.numPlanes(), 1);
+    EXPECT_EQ(image.size().w, 320);
+    EXPECT_EQ(image.size().h, 240);
+    EXPECT_EQ(AsAddr(image.plane(0).basePtr), AsAddr(FAKE_PTR_B));
+}
+
+/**
+ * @brief A three-plane buffer must come back out of ImageDataStridedHip plane
+ * by plane, unchanged. size() reports plane 0; the chroma planes are smaller.
+ */
+void TestImageDataStridedMultiPlane() {
+    const ImageBufferStrided planar = MakeThreePlaneBuffer();
+    const ImageDataStridedHip image(FMT_U8, planar);
+
+    EXPECT_EQ(image.numPlanes(), 3);
+    EXPECT_EQ(image.size().w, 1920);
+    EXPECT_EQ(image.size().h, 1080);
+
+    // Expected width, height, and base pointer for planes 0..2, in the order
+    // MakeThreePlaneBuffer() fills them in.
+    const int32_t wantWidth[] = {1920, 960, 960};
+    const int32_t wantHeight[] = {1080, 540, 540};
+    void* const wantPtr[] = {FAKE_PTR_A, FAKE_PTR_B, FAKE_PTR_C};
+
+    for (int32_t p = 0; p < 3; ++p) {
+        EXPECT_EQ(image.plane(p).width, wantWidth[p]);
+        EXPECT_EQ(image.plane(p).height, wantHeight[p]);
+        EXPECT_EQ(AsAddr(image.plane(p).basePtr), AsAddr(wantPtr[p]));
+    }
+}
+
+/**
+ * @brief Building a leaf type from an ImageBuffer or from a bare
+ * ImageBufferStrided must yield the same observable state.
+ */
+void TestImageDataStridedSugarCtor() {
+    const ImageBufferStrided strided = MakeSinglePlaneBuffer(100, 200, 400, FAKE_PTR_A);
+
+    const ImageDataStridedHip hipFromImageBuffer(FMT_RGBA8, ImageBuffer{.strided = strided});
+    const ImageDataStridedHip hipFromStrided(FMT_RGBA8, strided);
+
+    EXPECT_EQ(AsInt(hipFromImageBuffer.device()), AsInt(hipFromStrided.device()));
+    EXPECT_EQ(hipFromImageBuffer.numPlanes(), hipFromStrided.numPlanes());
+    EXPECT_EQ(hipFromImageBuffer.plane(0).rowStride, hipFromStrided.plane(0).rowStride);
+    EXPECT_EQ(AsAddr(hipFromImageBuffer.plane(0).basePtr), AsAddr(hipFromStrided.plane(0).basePtr));
+
+    const ImageDataStridedHost hostFromImageBuffer(FMT_U8, ImageBuffer{.strided = strided});
+    const ImageDataStridedHost hostFromStrided(FMT_U8, strided);
+    EXPECT_EQ(AsInt(hostFromImageBuffer.device()), AsInt(hostFromStrided.device()));
+    EXPECT_EQ(AsAddr(hostFromImageBuffer.plane(0).basePtr), AsAddr(hostFromStrided.plane(0).basePtr));
+}
+
+/**
+ * @brief Each level of the hierarchy accepts a different set of buffer kinds:
+ * the base takes anything but NONE, Strided takes Hip or Host, leaves take only their own.
+ */
+void TestImageDataIsCompatibleKind() {
+    using Kind = ImageBufferType;
+    EXPECT_EQ(AsInt(ImageData::IsCompatibleKind(Kind::IMAGE_BUFFER_NONE)), AsInt(false));
+    EXPECT_EQ(AsInt(ImageData::IsCompatibleKind(Kind::IMAGE_BUFFER_STRIDED_HIP)), AsInt(true));
+    EXPECT_EQ(AsInt(ImageData::IsCompatibleKind(Kind::IMAGE_BUFFER_STRIDED_HOST)), AsInt(true));
+
+    EXPECT_EQ(AsInt(ImageDataStrided::IsCompatibleKind(Kind::IMAGE_BUFFER_NONE)), AsInt(false));
+    EXPECT_EQ(AsInt(ImageDataStrided::IsCompatibleKind(Kind::IMAGE_BUFFER_STRIDED_HIP)), AsInt(true));
+    EXPECT_EQ(AsInt(ImageDataStrided::IsCompatibleKind(Kind::IMAGE_BUFFER_STRIDED_HOST)), AsInt(true));
+
+    EXPECT_EQ(AsInt(ImageDataStridedHip::IsCompatibleKind(Kind::IMAGE_BUFFER_NONE)), AsInt(false));
+    EXPECT_EQ(AsInt(ImageDataStridedHip::IsCompatibleKind(Kind::IMAGE_BUFFER_STRIDED_HIP)), AsInt(true));
+    EXPECT_EQ(AsInt(ImageDataStridedHip::IsCompatibleKind(Kind::IMAGE_BUFFER_STRIDED_HOST)), AsInt(false));
+
+    EXPECT_EQ(AsInt(ImageDataStridedHost::IsCompatibleKind(Kind::IMAGE_BUFFER_NONE)), AsInt(false));
+    EXPECT_EQ(AsInt(ImageDataStridedHost::IsCompatibleKind(Kind::IMAGE_BUFFER_STRIDED_HIP)), AsInt(false));
+    EXPECT_EQ(AsInt(ImageDataStridedHost::IsCompatibleKind(Kind::IMAGE_BUFFER_STRIDED_HOST)), AsInt(true));
+}
+
+/**
+ * @brief View a leaf ImageData through a base reference and cast<>() it back.
+ * Compatible casts must preserve the observable fields; casts to the other
+ * residency must yield std::nullopt.
+ */
+void TestImageDataCast() {
+    const ImageBufferStrided strided = MakeSinglePlaneBuffer(800, 600, 800 * 4, FAKE_PTR_A);
+
+    // GPU image: casts to Hip and to Strided succeed, the cast to Host is rejected.
+    {
+        const ImageDataStridedHip gpuImage(FMT_RGBA8, strided);
+        const ImageData& erased = gpuImage;
+
+        const auto toHip = erased.cast<ImageDataStridedHip>();
+        EXPECT_EQ(AsInt(toHip.has_value()), 1);
+        EXPECT_EQ(AsInt(toHip->device()), AsInt(eDeviceType::GPU));
+        EXPECT_EQ(AsAddr(toHip->plane(0).basePtr), AsAddr(FAKE_PTR_A));
+        EXPECT_EQ(toHip->plane(0).width, 800);
+
+        const auto toStrided = erased.cast<ImageDataStrided>();
+        EXPECT_EQ(AsInt(toStrided.has_value()), 1);
+        EXPECT_EQ(AsInt(toStrided->device()), AsInt(eDeviceType::GPU));
+
+        const auto toHost = erased.cast<ImageDataStridedHost>();
+        EXPECT_EQ(AsInt(toHost.has_value()), 0);
+    }
+
+    // Host image: the mirror case. The cast to Host succeeds, the cast to Hip is rejected.
+    {
+        const ImageDataStridedHost cpuImage(FMT_RGBA8, strided);
+        const ImageData& erased = cpuImage;
+
+        const auto toHost = erased.cast<ImageDataStridedHost>();
+        EXPECT_EQ(AsInt(toHost.has_value()), 1);
+        EXPECT_EQ(AsInt(toHost->device()), AsInt(eDeviceType::CPU));
+
+        const auto toHip = erased.cast<ImageDataStridedHip>();
+        EXPECT_EQ(AsInt(toHip.has_value()), 0);
+    }
+}
+
+}  // namespace
+
+int main(int argc, char** argv) {
+    (void)argc;  // Command-line arguments are unused.
+    (void)argv;
+    TEST_CASES_BEGIN();
+    // One TEST_CASE per test function defined above, in definition order.
+    TEST_CASE(TestImageDataStridedHipConstruction());
+    TEST_CASE(TestImageDataStridedHostConstruction());
+    TEST_CASE(TestImageDataStridedMultiPlane());
+    TEST_CASE(TestImageDataStridedSugarCtor());
+    TEST_CASE(TestImageDataIsCompatibleKind());
+    TEST_CASE(TestImageDataCast());
+
+    TEST_CASES_END();  // NOTE(review): presumably reports results / sets the exit status; defined in test_helpers.hpp.
+}

From 689a94de1987bd7465a52f09ac3d0b9f6ca70d2d Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Tue, 5 May 2026 16:04:56 -0400
Subject: [PATCH 02/13] Add Image/ImageStorage implementations

---
 include/core/image.hpp         | 183 +++++++++++++++++++++++++++++++++
 include/core/image_storage.hpp |  56 ++++++++++
 src/core/image.cpp             | 168 ++++++++++++++++++++++++++++++
 3 files changed, 407 insertions(+)
 create mode 100644 include/core/image.hpp
 create mode 100644 include/core/image_storage.hpp
 create mode 100644 src/core/image.cpp

diff --git a/include/core/image.hpp b/include/core/image.hpp
new file mode 100644
index 00000000..ecde3b51
--- /dev/null
+++ b/include/core/image.hpp
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <optional>
+#include <typeinfo>
+
+#include "core/detail/allocators/i_allocator.hpp"
+#include "core/image_buffer.hpp"
+#include "core/image_data.hpp"
+#include "core/image_format.hpp"
+#include "core/util_enums.h"
+#include "operator_types.h"
+
+namespace roccv {
+
+class ImageStorage;
+
+/**
+ * @brief Cleanup callback signature for ImageWrapData. Invoked when the last
+ * Image handle referencing the wrapped buffer is destroyed. Receives the
+ * ImageData snapshot that was originally wrapped (typed as the base class; use
+ * ImageData::cast<>() for plane access), so callbacks can free multi-plane buffers or dispatch on format.
+ */
+using ImageDataCleanupFunc = std::function<void(const ImageData&)>;
+
+/**
+ * @brief Per-image allocation spec describing what to allocate for a single
+ * variable-sized image. Mirrors NVCVImageRequirements: size, format, per-plane
+ * row strides, and base-address alignment. Used as the input to Image's
+ * allocating constructors and as the output of CalcRequirements; not stored
+ * on the Image instance after construction (m_metadata holds the runtime
+ * descriptor in ImageData form).
+ *
+ * Row strides are meaningful only for the planes the format actually has; the
+ * remaining slots are unused. ImageFormat is interleaved-only today, so in
+ * practice only planeRowStride[0] is populated.
+ */
+struct ImageRequirements {
+    Size2D size;                                     // Width and height in pixels.
+    ImageFormat format;                              // Pixel format (dtype + channel count + swizzle).
+    int64_t planeRowStride[ROCCV_MAX_IMAGE_PLANES];  // Per-plane row stride in bytes.
+    int32_t alignBytes;                              // Required base-address alignment, in bytes.
+};
+
+/**
+ * @brief A single variable-sized image whose buffer lives on the device chosen at construction.
+ *
+ * Image is the per-element type held by ImageBatchVarShape. It is a handle
+ * over a refcounted ImageStorage: copying an Image bumps the refcount and
+ * leaves both handles pointing at the same underlying buffer. The buffer is
+ * freed when the last handle is destroyed (for owning Images) or when the
+ * cleanup callback fires (for ImageWrapData with a callback).
+ */
+class Image {
+   public:
+    using Requirements = ImageRequirements;
+
+    /**
+     * @brief Compute the requirements (row stride, etc.) for an image of the
+     * given dimensions and format.
+     */
+    static Requirements CalcRequirements(Size2D size, ImageFormat format);
+
+    /**
+     * @brief Allocate a new buffer on `device` (GPU by default) for an image of
+     * the given dimensions and format using the global default allocator.
+     */
+    explicit Image(Size2D size, ImageFormat format, eDeviceType device = eDeviceType::GPU);
+
+    /**
+     * @brief Allocate a new buffer on `device` using a caller-supplied allocator.
+     */
+    explicit Image(Size2D size, ImageFormat format, const IAllocator& alloc, eDeviceType device = eDeviceType::GPU);
+
+    /**
+     * @brief Allocate a new buffer from precomputed requirements (default or caller-supplied allocator).
+     */
+    explicit Image(const Requirements& reqs, eDeviceType device = eDeviceType::GPU);
+    explicit Image(const Requirements& reqs, const IAllocator& alloc, eDeviceType device = eDeviceType::GPU);
+
+    Image(const Image&) = default;  // refcount bump
+    Image(Image&&) noexcept = default;
+    Image& operator=(const Image&) = default;  // refcount bump
+    Image& operator=(Image&&) noexcept = default;
+    ~Image() = default;
+
+    /**
+     * @brief Image dimensions in pixels.
+     */
+    Size2D size() const noexcept;
+
+    /**
+     * @brief Pixel format.
+     */
+    ImageFormat format() const noexcept;
+
+    /**
+     * @brief Device the underlying buffer resides on.
+     */
+    eDeviceType device() const noexcept;
+
+    /**
+     * @brief Snapshot of the image's data buffer (pointer, stride, format).
+     *
+     * The returned ImageData references the same underlying buffer; lifetime
+     * is controlled by this Image's refcount, not by the snapshot.
+     */
+    ImageData exportData() const;
+
+    /**
+     * @brief Exports the image's data buffer and casts it to a specified image data object.
+     *
+     * Convenience wrapper around ImageData::cast<>: the snapshot from
+     * exportData() is cast to `Derived` and returned by value.
+     *
+     * @tparam Derived The ImageData subclass to cast to.
+     * @return The image data casted to the image data object specified
+     * @throws std::bad_cast If the buffer kind does not match what `Derived` expects (e.g. Hip on a host image).
+     */
+    template <typename Derived>
+    Derived exportData() const {
+        ImageData data = exportData();
+        std::optional<Derived> derived_data = data.cast<Derived>();
+        if (!derived_data.has_value()) {
+            throw std::bad_cast();
+        }
+
+        return derived_data.value();
+    }
+
+   private:
+    // Internal ctor used by ImageWrapData and the allocating public ctors via
+    // delegation. Stores `metadata` and `storage` verbatim — no allocation.
+    Image(ImageData metadata, std::shared_ptr<ImageStorage> storage);
+
+    friend Image ImageWrapData(const ImageData& data, ImageDataCleanupFunc cleanup);
+
+    // m_data is declared first so the allocating ctor can initialize it
+    // (allocating the buffer) before m_metadata reads back the pointer.
+    std::shared_ptr<ImageStorage> m_data;
+    ImageData m_metadata;
+};
+
+/**
+ * @brief Wrap an externally-owned buffer as an Image without allocating.
+ *
+ * View-only by default: the wrapped buffer is NOT freed when the returned
+ * Image (and any copies) go out of scope. The caller is responsible for
+ * keeping the underlying memory alive for as long as any handle survives.
+ *
+ * Pass a non-empty cleanup callback to opt into ownership transfer (the default
+ * `nullptr` leaves it empty); it runs exactly once, when the last handle is destroyed.
+ *
+ * @param[in] data Pre-existing image data (pointer, layout, device).
+ * @param[in] cleanup Optional callback to free the buffer on last destruction.
+ * @return An Image referencing the wrapped buffer.
+ */
+extern Image ImageWrapData(const ImageData& data, ImageDataCleanupFunc cleanup = nullptr);
+
+}  // namespace roccv
diff --git a/include/core/image_storage.hpp b/include/core/image_storage.hpp
new file mode 100644
index 00000000..69b9d3c0
--- /dev/null
+++ b/include/core/image_storage.hpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#pragma once
+
+namespace roccv {
+
+/**
+ * @brief Holds the raw data pointer for a single Image and serves as the
+ * refcount target shared between Image handles.
+ *
+ * ImageStorage carries no lifecycle logic of its own: freeing the underlying
+ * buffer is the responsibility of the shared_ptr<ImageStorage> deleter
+ * installed at the Image construction site. The allocating Image ctor
+ * captures the allocator + device into its deleter; ImageWrapData captures
+ * the user's cleanup callback (or installs none for the view-only case).
+ *
+ * As a result, ImageStorage is held only by shared_ptr — never by value, never
+ * copied. Copy is deleted (which also suppresses implicit moves) to enforce that.
+ */
+class ImageStorage {
+   public:
+    explicit ImageStorage(void* data) : m_data(data) {}
+
+    ImageStorage(const ImageStorage&) = delete;
+    ImageStorage& operator=(const ImageStorage&) = delete;
+
+    /**
+     * @brief Returns the raw data pointer this storage is tracking.
+     */
+    void* data() const noexcept { return m_data; }
+
+   private:
+    void* m_data;
+};
+
+}  // namespace roccv
diff --git a/src/core/image.cpp b/src/core/image.cpp
new file mode 100644
index 00000000..995fd4a1
--- /dev/null
+++ b/src/core/image.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "core/image.hpp"
+
+#include "core/data_type.hpp"
+#include "core/detail/context.hpp"
+#include "core/exception.hpp"
+#include "core/image_storage.hpp"
+
+namespace roccv {
+
+namespace {
+
+// Allocates a buffer through `alloc` for the requested device and wraps it
+// in an ImageStorage whose shared_ptr deleter frees through the same allocator.
+// The allocator reference is captured by reference; callers must ensure it
+// outlives every Image (and any handle copied from it) it creates.
+std::shared_ptr<ImageStorage> makeStorage(const ImageRequirements& reqs, const IAllocator& alloc, eDeviceType device) {
+    const size_t bytes = static_cast<size_t>(reqs.planeRowStride[0]) * reqs.size.h;
+
+    void* buf = nullptr;
+    switch (device) {
+        case eDeviceType::GPU:
+            buf = alloc.allocHipMem(bytes);
+            break;
+        case eDeviceType::CPU:
+            buf = alloc.allocHostMem(bytes);
+            break;
+    }
+
+    return std::shared_ptr<ImageStorage>(new ImageStorage(buf), [&alloc, device](ImageStorage* s) {
+        switch (device) {
+            case eDeviceType::GPU:
+                alloc.freeHipMem(s->data());
+                break;
+            case eDeviceType::CPU:
+                alloc.freeHostMem(s->data());
+                break;
+        }
+        delete s;
+    });
+}
+
+// Builds the canonical ImageData stored on Image from a freshly-allocated
+// (or wrapped) buffer plus its layout description. Single-plane today —
+// ImageFormat is interleaved-only, so only planes[0] is populated.
+ImageData makeImageData(const ImageRequirements& reqs, void* buf, eDeviceType device) {
+    ImageBufferStrided strided{};
+    strided.numPlanes = 1;
+    strided.planes[0].width = reqs.size.w;
+    strided.planes[0].height = reqs.size.h;
+    strided.planes[0].rowStride = reqs.planeRowStride[0];
+    strided.planes[0].basePtr = buf;
+
+    switch (device) {
+        case eDeviceType::GPU:
+            return ImageDataStridedHip(reqs.format, strided);
+        case eDeviceType::CPU:
+            return ImageDataStridedHost(reqs.format, strided);
+    }
+
+    throw Exception("Unsupported device type in Image::makeImageData.", eStatusType::INVALID_VALUE);
+}
+
+}  // namespace
+
+// -----------------------------------------------------------------------------
+// CalcRequirements
+// -----------------------------------------------------------------------------
+
+Image::Requirements Image::CalcRequirements(Size2D size, ImageFormat format) {
+    if (size.w < 1 || size.h < 1) {
+        throw Exception("Image dimensions must be >= 1.", eStatusType::INVALID_VALUE);
+    }
+
+    ImageRequirements reqs;
+    reqs.size = size;
+    reqs.format = format;
+
+    const int64_t bytesPerPixel = static_cast<int64_t>(DataType(format.dtype()).size()) * format.channels();
+    reqs.planeRowStride[0] = bytesPerPixel * size.w;  // packed; no row padding while alignBytes is unused.
+
+    // TODO: derive a sensible default base/row alignment from device attributes.
+    reqs.alignBytes = 0;
+
+    return reqs;
+}
+
+// -----------------------------------------------------------------------------
+// Constructors
+// -----------------------------------------------------------------------------
+
+Image::Image(Size2D size, ImageFormat format, eDeviceType device)
+    : Image(size, format, GlobalContext().getDefaultAllocator(), device) {}
+
+Image::Image(Size2D size, ImageFormat format, const IAllocator& alloc, eDeviceType device)
+    : Image(CalcRequirements(size, format), alloc, device) {}
+
+Image::Image(const Requirements& reqs, eDeviceType device)
+    : Image(reqs, GlobalContext().getDefaultAllocator(), device) {}
+
+Image::Image(const Requirements& reqs, const IAllocator& alloc, eDeviceType device)
+    : m_data(makeStorage(reqs, alloc, device)), m_metadata(makeImageData(reqs, m_data->data(), device)) {}
+
+Image::Image(ImageData metadata, std::shared_ptr<ImageStorage> storage)
+    : m_data(std::move(storage)), m_metadata(std::move(metadata)) {}
+
+// -----------------------------------------------------------------------------
+// Accessors
+// -----------------------------------------------------------------------------
+
+Size2D Image::size() const noexcept { return m_metadata.cast<ImageDataStrided>()->size(); }
+
+ImageFormat Image::format() const noexcept { return m_metadata.format(); }
+
+eDeviceType Image::device() const noexcept { return m_metadata.device(); }
+
+ImageData Image::exportData() const { return m_metadata; }
+
+// -----------------------------------------------------------------------------
+// ImageWrapData
+// -----------------------------------------------------------------------------
+
+Image ImageWrapData(const ImageData& data, ImageDataCleanupFunc cleanup) {
+    auto strided = data.cast<ImageDataStrided>();
+    if (!strided.has_value()) {
+        throw Exception("ImageWrapData requires strided image data.", eStatusType::INVALID_VALUE);
+    }
+
+    // Storage tracks plane(0)'s base pointer. Single-plane today; multi-plane
+    // wraps would need a richer storage shape (or to abandon storing the
+    // pointer here at all).
+    void* basePtr = strided->plane(0).basePtr;
+
+    // Deleter captures both the original ImageData snapshot and the user's
+    // cleanup callback. View-only (cleanup == nullptr) means the deleter
+    // touches nothing but the storage object itself.
+    auto storage = std::shared_ptr<ImageStorage>(new ImageStorage(basePtr), [data, cleanup](ImageStorage* s) {
+        if (cleanup) {
+            cleanup(data);
+        }
+        delete s;
+    });
+
+    return Image(data, std::move(storage));
+}
+
+}  // namespace roccv

From 72f5218adcfb6b6da8e510e09858a4cfeca31d7e Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Tue, 5 May 2026 16:16:22 -0400
Subject: [PATCH 03/13] Add Image tests and fix zero-initialization bug in
 Image::CalcRequirements

---
 src/core/image.cpp                            |  18 +-
 .../cpp/src/tests/core/image/test_image.cpp   | 441 ++++++++++++++++++
 2 files changed, 451 insertions(+), 8 deletions(-)
 create mode 100644 tests/roccv/cpp/src/tests/core/image/test_image.cpp

diff --git a/src/core/image.cpp b/src/core/image.cpp
index 995fd4a1..4319eb20 100644
--- a/src/core/image.cpp
+++ b/src/core/image.cpp
@@ -93,17 +93,19 @@ Image::Requirements Image::CalcRequirements(Size2D size, ImageFormat format) {
         throw Exception("Image dimensions must be >= 1.", eStatusType::INVALID_VALUE);
     }
 
-    ImageRequirements reqs;
-    reqs.size = size;
-    reqs.format = format;
-
     const int64_t bytesPerPixel = static_cast<int64_t>(DataType(format.dtype()).size()) * format.channels();
-    reqs.planeRowStride[0] = bytesPerPixel * size.w;  // packed; no row padding while alignBytes is unused.
 
+    // Designated aggregate init: planeRowStride[0] is set explicitly; the
+    // remaining ROCCV_MAX_IMAGE_PLANES-1 slots are zeroed by the trailing-
+    // elements rule for brace-enclosed array initializers. alignBytes stays at
+    // 0 since CalcRequirements always produces packed rows for now.
     // TODO: derive a sensible default base/row alignment from device attributes.
-    reqs.alignBytes = 0;
-
-    return reqs;
+    return ImageRequirements{
+        .size = size,
+        .format = format,
+        .planeRowStride = {bytesPerPixel * size.w},
+        .alignBytes = 0,
+    };
 }
 
 // -----------------------------------------------------------------------------
diff --git a/tests/roccv/cpp/src/tests/core/image/test_image.cpp b/tests/roccv/cpp/src/tests/core/image/test_image.cpp
new file mode 100644
index 00000000..879dd68c
--- /dev/null
+++ b/tests/roccv/cpp/src/tests/core/image/test_image.cpp
@@ -0,0 +1,441 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <core/detail/allocators/i_allocator.hpp>
+#include <core/image.hpp>
+#include <core/image_buffer.hpp>
+#include <core/image_data.hpp>
+#include <core/image_format.hpp>
+#include <stdexcept>
+#include <typeinfo>
+#include <utility>
+
+#include "test_helpers.hpp"
+
+using namespace roccv;
+using namespace roccv::tests;
+
+namespace {
+
+// EXPECT_EQ pipes through std::to_string, so wrap enums/pointers/bools.
+auto AsInt = [](auto v) { return static_cast<int>(v); };
+auto AsAddr = [](void* p) { return reinterpret_cast<uintptr_t>(p); };
+auto AsSize = [](auto v) { return static_cast<size_t>(v); };
+
+void* const FAKE_PTR_A = reinterpret_cast<void*>(0xAAAAAAAAull);
+
+/**
+ * @brief Test allocator that backs allocations with malloc and tallies how
+ * many times each entry point is invoked. Pure host-side; no GPU dependency.
+ *
+ * The Hip path returns malloc'd memory because no test dereferences it — we
+ * only care that ptr round-trips through Image and that free is called the
+ * right number of times.
+ */
+class CountingAllocator : public IAllocator {
+   public:
+    mutable int hipAllocs = 0;
+    mutable int hipFrees = 0;
+    mutable int hostAllocs = 0;
+    mutable int hostFrees = 0;
+    mutable size_t lastAllocBytes = 0;
+
+    void* allocHipMem(size_t size) const override {
+        ++hipAllocs;
+        lastAllocBytes = size;
+        return std::malloc(size);
+    }
+    void freeHipMem(void* ptr) const noexcept override {
+        ++hipFrees;
+        std::free(ptr);
+    }
+
+    void* allocHostMem(size_t size, int32_t /*alignment*/ = 0) const override {
+        ++hostAllocs;
+        lastAllocBytes = size;
+        return std::malloc(size);
+    }
+    void freeHostMem(void* ptr) const noexcept override {
+        ++hostFrees;
+        std::free(ptr);
+    }
+
+    // Unused by the Image paths under test. Trip loudly if invoked unexpectedly.
+    void* allocHostPinnedMem(size_t) const override { throw std::runtime_error("unused in tests"); }
+    void freeHostPinnedMem(void*) const noexcept override { std::abort(); }
+};
+
+// Build a single-plane ImageData snapshot referencing a sentinel pointer. Used
+// for ImageWrapData tests where we never dereference the buffer.
+ImageDataStridedHip MakeFakeHipData(int32_t width, int32_t height, void* basePtr, ImageFormat fmt = FMT_RGB8) {
+    ImageBufferStrided buf{};
+    buf.numPlanes = 1;
+    buf.planes[0] = {width, height, static_cast<int64_t>(width * fmt.channels()), basePtr};
+    return ImageDataStridedHip(fmt, buf);
+}
+
+// =============================================================================
+// CalcRequirements
+// =============================================================================
+
+/**
+ * @brief Packed-row stride for a typical 3-channel uint8 image. Other fields
+ * propagate unchanged; remaining plane slots stay zeroed.
+ */
+void TestCalcRequirementsRgb8() {
+    auto reqs = Image::CalcRequirements({320, 240}, FMT_RGB8);
+
+    EXPECT_EQ(reqs.size.w, 320);
+    EXPECT_EQ(reqs.size.h, 240);
+    EXPECT_EQ(reqs.format.channels(), 3);
+    EXPECT_EQ(reqs.planeRowStride[0], static_cast<int64_t>(320 * 3));
+    EXPECT_EQ(reqs.planeRowStride[1], 0);
+    EXPECT_EQ(reqs.planeRowStride[5], 0);
+    EXPECT_EQ(reqs.alignBytes, 0);
+}
+
+/**
+ * @brief Multi-byte dtype is reflected in the per-pixel byte count.
+ */
+void TestCalcRequirementsF32() {
+    auto reqs = Image::CalcRequirements({64, 64}, FMT_F32);
+    EXPECT_EQ(reqs.planeRowStride[0], static_cast<int64_t>(64 * 4));
+}
+
+/**
+ * @brief Single-channel U8 → row stride equals width.
+ */
+void TestCalcRequirementsU8() {
+    auto reqs = Image::CalcRequirements({100, 50}, FMT_U8);
+    EXPECT_EQ(reqs.planeRowStride[0], 100);
+}
+
+/**
+ * @brief Width or height < 1 must throw INVALID_VALUE.
+ */
+void TestCalcRequirementsRejectsInvalidDims() {
+    EXPECT_EXCEPTION(Image::CalcRequirements({0, 100}, FMT_RGB8), eStatusType::INVALID_VALUE);
+    EXPECT_EXCEPTION(Image::CalcRequirements({100, 0}, FMT_RGB8), eStatusType::INVALID_VALUE);
+    EXPECT_EXCEPTION(Image::CalcRequirements({-5, 100}, FMT_RGB8), eStatusType::INVALID_VALUE);
+    EXPECT_EXCEPTION(Image::CalcRequirements({100, -5}, FMT_RGB8), eStatusType::INVALID_VALUE);
+}
+
+/**
+ * @brief Large widths must not overflow during stride math; row stride must
+ * fit in int64.
+ */
+void TestCalcRequirementsLargeDims() {
+    // 8K image, RGBA8 (4 channels * 1 byte = 4 B/pixel) → 8192 * 4 = 32768 B/row.
+    auto reqs = Image::CalcRequirements({8192, 4320}, FMT_RGBA8);
+    EXPECT_EQ(reqs.planeRowStride[0], static_cast<int64_t>(8192 * 4));
+}
+
+// =============================================================================
+// Allocating constructors
+// =============================================================================
+
+/**
+ * @brief GPU-device ctor routes allocation through allocHipMem with the
+ * computed byte count.
+ */
+void TestImageHipAllocation() {
+    CountingAllocator alloc;
+    {
+        Image img({320, 240}, FMT_RGB8, alloc, eDeviceType::GPU);
+
+        EXPECT_EQ(alloc.hipAllocs, 1);
+        EXPECT_EQ(alloc.hostAllocs, 0);
+        EXPECT_EQ(AsSize(alloc.lastAllocBytes), AsSize(320 * 3 * 240));
+
+        EXPECT_EQ(img.size().w, 320);
+        EXPECT_EQ(img.size().h, 240);
+        EXPECT_EQ(AsInt(img.device()), AsInt(eDeviceType::GPU));
+        EXPECT_EQ(img.format().channels(), 3);
+
+        // Image is still alive — buffer not yet freed.
+        EXPECT_EQ(alloc.hipFrees, 0);
+    }
+    // Image dropped — buffer freed exactly once via the matching allocator.
+    EXPECT_EQ(alloc.hipFrees, 1);
+}
+
+/**
+ * @brief Same shape as the Hip test but for CPU residency.
+ */
+void TestImageHostAllocation() {
+    CountingAllocator alloc;
+    {
+        Image img({100, 50}, FMT_U8, alloc, eDeviceType::CPU);
+
+        EXPECT_EQ(alloc.hostAllocs, 1);
+        EXPECT_EQ(alloc.hipAllocs, 0);
+        EXPECT_EQ(AsSize(alloc.lastAllocBytes), AsSize(100 * 50));
+        EXPECT_EQ(AsInt(img.device()), AsInt(eDeviceType::CPU));
+    }
+    EXPECT_EQ(alloc.hostFrees, 1);
+}
+
+/**
+ * @brief Constructing from precomputed Requirements yields observably
+ * identical state to the (Size2D, ImageFormat) sugar form.
+ */
+void TestImageRequirementsCtor() {
+    CountingAllocator alloc;
+    auto reqs = Image::CalcRequirements({64, 32}, FMT_RGBA8);
+
+    Image img(reqs, alloc, eDeviceType::GPU);
+
+    EXPECT_EQ(img.size().w, 64);
+    EXPECT_EQ(img.size().h, 32);
+    EXPECT_EQ(img.format().channels(), 4);
+    EXPECT_EQ(AsSize(alloc.lastAllocBytes), AsSize(64 * 4 * 32));
+}
+
+// =============================================================================
+// Refcount / lifecycle
+// =============================================================================
+
+/**
+ * @brief Copying an Image bumps the refcount: both handles see the same
+ * underlying buffer, and free is deferred until the LAST handle drops.
+ */
+void TestImageCopySharesBuffer() {
+    CountingAllocator alloc;
+    void* buf = nullptr;
+    {
+        Image first({16, 16}, FMT_U8, alloc, eDeviceType::GPU);
+        buf = first.exportData().cast<ImageDataStrided>()->plane(0).basePtr;
+
+        Image second = first;           // refcount bump
+        EXPECT_EQ(alloc.hipAllocs, 1);  // No new allocation.
+        EXPECT_EQ(AsAddr(second.exportData().cast<ImageDataStrided>()->plane(0).basePtr), AsAddr(buf));
+
+        // Drop `first`; buffer must NOT be freed yet — `second` still holds it.
+        {
+            Image consumed = std::move(first);
+            (void)consumed;
+        }
+        EXPECT_EQ(alloc.hipFrees, 0);
+    }
+    // All handles dropped — exactly one free.
+    EXPECT_EQ(alloc.hipFrees, 1);
+}
+
+/**
+ * @brief Move-construction transfers the buffer; the source is left empty.
+ * The buffer must still free exactly once (when the destination drops).
+ */
+void TestImageMoveSemantics() {
+    CountingAllocator alloc;
+    {
+        Image src({8, 8}, FMT_U8, alloc, eDeviceType::CPU);
+        void* srcBuf = src.exportData().cast<ImageDataStrided>()->plane(0).basePtr;
+
+        Image dst = std::move(src);
+        EXPECT_EQ(AsAddr(dst.exportData().cast<ImageDataStrided>()->plane(0).basePtr), AsAddr(srcBuf));
+        EXPECT_EQ(alloc.hostFrees, 0);
+    }
+    EXPECT_EQ(alloc.hostFrees, 1);
+}
+
+// =============================================================================
+// exportData / exportData<DATA>()
+// =============================================================================
+
+/**
+ * @brief exportData() returns an ImageData snapshot that mirrors the Image's
+ * size, format, device, and base pointer.
+ */
+void TestImageExportData() {
+    CountingAllocator alloc;
+    Image img({80, 60}, FMT_RGBA8, alloc, eDeviceType::GPU);
+    ImageData data = img.exportData();
+
+    EXPECT_EQ(AsInt(data.device()), AsInt(eDeviceType::GPU));
+    EXPECT_EQ(data.format().channels(), 4);
+
+    auto strided = data.cast<ImageDataStrided>();
+    EXPECT_EQ(AsInt(strided.has_value()), 1);
+    EXPECT_EQ(strided->plane(0).width, 80);
+    EXPECT_EQ(strided->plane(0).height, 60);
+    EXPECT_EQ(strided->plane(0).rowStride, static_cast<int64_t>(80 * 4));
+}
+
+/**
+ * @brief Templated exportData<T>() returns the matching subclass directly.
+ */
+void TestImageExportDataTypedSuccess() {
+    CountingAllocator alloc;
+    Image img({4, 4}, FMT_U8, alloc, eDeviceType::GPU);
+
+    auto hip = img.exportData<ImageDataStridedHip>();
+    EXPECT_EQ(AsInt(hip.device()), AsInt(eDeviceType::GPU));
+    EXPECT_EQ(hip.plane(0).width, 4);
+}
+
+/**
+ * @brief Templated exportData<T>() throws std::bad_cast when the requested
+ * subclass does not match the underlying buffer kind.
+ */
+void TestImageExportDataTypedMismatch() {
+    CountingAllocator alloc;
+    Image img({4, 4}, FMT_U8, alloc, eDeviceType::GPU);
+
+    bool threw = false;
+    try {
+        (void)img.exportData<ImageDataStridedHost>();
+    } catch (const std::bad_cast&) {
+        threw = true;
+    }
+    EXPECT_EQ(AsInt(threw), 1);
+}
+
+// =============================================================================
+// ImageWrapData
+// =============================================================================
+
+/**
+ * @brief View-only wrap (no cleanup callback) must not free the wrapped
+ * buffer when the Image is destroyed.
+ */
+void TestImageWrapDataViewOnly() {
+    int frees = 0;
+    {
+        Image wrapped = ImageWrapData(MakeFakeHipData(640, 480, FAKE_PTR_A));
+        EXPECT_EQ(wrapped.size().w, 640);
+        EXPECT_EQ(wrapped.size().h, 480);
+        EXPECT_EQ(AsInt(wrapped.device()), AsInt(eDeviceType::GPU));
+        EXPECT_EQ(AsAddr(wrapped.exportData().cast<ImageDataStrided>()->plane(0).basePtr), AsAddr(FAKE_PTR_A));
+    }
+    // No callback was registered — nothing observable should have changed.
+    EXPECT_EQ(frees, 0);
+}
+
+/**
+ * @brief Wrap with a cleanup callback: the callback fires exactly once when
+ * the last Image handle goes out of scope.
+ */
+void TestImageWrapDataCleanupFires() {
+    int callbackInvocations = 0;
+    {
+        Image wrapped =
+            ImageWrapData(MakeFakeHipData(100, 100, FAKE_PTR_A), [&](const ImageData&) { ++callbackInvocations; });
+        EXPECT_EQ(callbackInvocations, 0);  // Not fired during normal use.
+    }
+    EXPECT_EQ(callbackInvocations, 1);
+}
+
+/**
+ * @brief Cleanup callback receives the original wrapped ImageData snapshot —
+ * the captured base pointer must match what was passed to ImageWrapData.
+ */
+void TestImageWrapDataCleanupReceivesData() {
+    void* receivedBasePtr = nullptr;
+    {
+        Image wrapped = ImageWrapData(MakeFakeHipData(50, 50, FAKE_PTR_A), [&](const ImageData& d) {
+            receivedBasePtr = d.cast<ImageDataStrided>()->plane(0).basePtr;
+        });
+    }
+    EXPECT_EQ(AsAddr(receivedBasePtr), AsAddr(FAKE_PTR_A));
+}
+
+/**
+ * @brief Cleanup must fire only on LAST handle drop — copies bump the
+ * refcount, intermediate drops do nothing.
+ */
+void TestImageWrapDataCleanupFiresOnce() {
+    int callbackInvocations = 0;
+    {
+        Image first =
+            ImageWrapData(MakeFakeHipData(10, 10, FAKE_PTR_A), [&](const ImageData&) { ++callbackInvocations; });
+        Image second = first;  // refcount = 2
+        Image third = first;   // refcount = 3
+        {
+            Image fourth = third;
+            (void)fourth;
+        }  // dropped → refcount = 3
+        EXPECT_EQ(callbackInvocations, 0);
+        // first, second, third still alive at scope exit
+    }
+    EXPECT_EQ(callbackInvocations, 1);
+}
+
+/**
+ * @brief Wrapped Image's accessors mirror the wrapped ImageData verbatim —
+ * size, format, device, and base pointer all round-trip unchanged.
+ */
+void TestImageWrapDataAccessors() {
+    auto fake = MakeFakeHipData(123, 45, FAKE_PTR_A, FMT_RGBA8);
+    Image wrapped = ImageWrapData(fake);
+
+    EXPECT_EQ(wrapped.size().w, 123);
+    EXPECT_EQ(wrapped.size().h, 45);
+    EXPECT_EQ(wrapped.format().channels(), 4);
+    EXPECT_EQ(AsInt(wrapped.device()), AsInt(eDeviceType::GPU));
+
+    auto strided = wrapped.exportData().cast<ImageDataStrided>();
+    EXPECT_EQ(AsInt(strided.has_value()), 1);
+    EXPECT_EQ(strided->plane(0).width, 123);
+    EXPECT_EQ(strided->plane(0).height, 45);
+    EXPECT_EQ(AsAddr(strided->plane(0).basePtr), AsAddr(FAKE_PTR_A));
+}
+
+}  // namespace
+
+int main(int argc, char** argv) {
+    (void)argc;
+    (void)argv;
+    TEST_CASES_BEGIN();
+
+    // CalcRequirements
+    TEST_CASE(TestCalcRequirementsRgb8());
+    TEST_CASE(TestCalcRequirementsF32());
+    TEST_CASE(TestCalcRequirementsU8());
+    TEST_CASE(TestCalcRequirementsRejectsInvalidDims());
+    TEST_CASE(TestCalcRequirementsLargeDims());
+
+    // Allocating constructors
+    TEST_CASE(TestImageHipAllocation());
+    TEST_CASE(TestImageHostAllocation());
+    TEST_CASE(TestImageRequirementsCtor());
+
+    // Refcount / lifecycle
+    TEST_CASE(TestImageCopySharesBuffer());
+    TEST_CASE(TestImageMoveSemantics());
+
+    // exportData
+    TEST_CASE(TestImageExportData());
+    TEST_CASE(TestImageExportDataTypedSuccess());
+    TEST_CASE(TestImageExportDataTypedMismatch());
+
+    // ImageWrapData
+    TEST_CASE(TestImageWrapDataViewOnly());
+    TEST_CASE(TestImageWrapDataCleanupFires());
+    TEST_CASE(TestImageWrapDataCleanupReceivesData());
+    TEST_CASE(TestImageWrapDataCleanupFiresOnce());
+    TEST_CASE(TestImageWrapDataAccessors());
+
+    TEST_CASES_END();
+}

From 554473fa89139b7569d6254080004d1aaefe8579 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Tue, 5 May 2026 16:28:14 -0400
Subject: [PATCH 04/13] General cleanup and redundant comment removal

---
 include/core/image.hpp                        | 42 ++++---------------
 include/core/image_storage.hpp                |  3 --
 src/core/image.cpp                            | 14 +------
 tests/roccv/cpp/include/test_helpers.hpp      |  6 +++
 .../cpp/src/tests/core/image/test_image.cpp   | 29 ++++---------
 .../src/tests/core/image/test_image_data.cpp  |  5 ---
 6 files changed, 23 insertions(+), 76 deletions(-)

diff --git a/include/core/image.hpp b/include/core/image.hpp
index ecde3b51..7b057851 100644
--- a/include/core/image.hpp
+++ b/include/core/image.hpp
@@ -107,53 +107,25 @@ class Image {
     Image& operator=(Image&&) noexcept = default;
     ~Image() = default;
 
-    /**
-     * @brief Image dimensions in pixels.
-     */
     Size2D size() const noexcept;
-
-    /**
-     * @brief Pixel format.
-     */
     ImageFormat format() const noexcept;
-
-    /**
-     * @brief Device the underlying buffer resides on.
-     */
     eDeviceType device() const noexcept;
 
-    /**
-     * @brief Snapshot of the image's data buffer (pointer, stride, format).
-     *
-     * The returned ImageData references the same underlying buffer; lifetime
-     * is controlled by this Image's refcount, not by the snapshot.
-     */
-    ImageData exportData() const;
+    // Reference into this handle's own m_metadata: it dangles once this Image
+    // object is destroyed or assigned to, even if other handles share the buffer.
+    const ImageData& exportData() const noexcept { return m_metadata; }
 
-    /**
-     * @brief Exports the image's data buffer and casts it to a specified image data object.
-     *
-     * Throws std::bad_cast if the underlying buffer kind does not match what
-     * `Derived` expects (e.g. exportData<ImageDataStridedHip>() on a host-resident
-     * image throws std::bad_cast). Convenience wrapper around ImageData::cast<>.
-     *
-     * @tparam Derived The ImageData subclass to cast to.
-     * @return The image data casted to the image data object specified
-     */
+    // Throws std::bad_cast if the underlying buffer kind doesn't match Derived.
     template <typename Derived>
     Derived exportData() const {
-        ImageData data = exportData();
-        std::optional<Derived> derived_data = data.cast<Derived>();
-        if (!derived_data.has_value()) {
+        auto derived = m_metadata.cast<Derived>();
+        if (!derived.has_value()) {
             throw std::bad_cast();
         }
-
-        return derived_data.value();
+        return derived.value();
     }
 
    private:
-    // Internal ctor used by ImageWrapData and the allocating public ctors via
-    // delegation. Stores `metadata` and `storage` verbatim — no allocation.
     Image(ImageData metadata, std::shared_ptr<ImageStorage> storage);
 
     friend Image ImageWrapData(const ImageData& data, ImageDataCleanupFunc cleanup);
diff --git a/include/core/image_storage.hpp b/include/core/image_storage.hpp
index 69b9d3c0..70984742 100644
--- a/include/core/image_storage.hpp
+++ b/include/core/image_storage.hpp
@@ -44,9 +44,6 @@ class ImageStorage {
     ImageStorage(const ImageStorage&) = delete;
     ImageStorage& operator=(const ImageStorage&) = delete;
 
-    /**
-     * @brief Returns the raw data pointer this storage is tracking.
-     */
     void* data() const noexcept { return m_data; }
 
    private:
diff --git a/src/core/image.cpp b/src/core/image.cpp
index 4319eb20..04e08d2c 100644
--- a/src/core/image.cpp
+++ b/src/core/image.cpp
@@ -95,10 +95,6 @@ Image::Requirements Image::CalcRequirements(Size2D size, ImageFormat format) {
 
     const int64_t bytesPerPixel = static_cast<int64_t>(DataType(format.dtype()).size()) * format.channels();
 
-    // Designated aggregate init: planeRowStride[0] is set explicitly; the
-    // remaining ROCCV_MAX_IMAGE_PLANES-1 slots are zeroed by the trailing-
-    // elements rule for brace-enclosed array initializers. alignBytes stays at
-    // 0 since CalcRequirements always produces packed rows for now.
     // TODO: derive a sensible default base/row alignment from device attributes.
     return ImageRequirements{
         .size = size,
@@ -137,8 +133,6 @@ ImageFormat Image::format() const noexcept { return m_metadata.format(); }
 
 eDeviceType Image::device() const noexcept { return m_metadata.device(); }
 
-ImageData Image::exportData() const { return m_metadata; }
-
 // -----------------------------------------------------------------------------
 // ImageWrapData
 // -----------------------------------------------------------------------------
@@ -149,14 +143,10 @@ Image ImageWrapData(const ImageData& data, ImageDataCleanupFunc cleanup) {
         throw Exception("ImageWrapData requires strided image data.", eStatusType::INVALID_VALUE);
     }
 
-    // Storage tracks plane(0)'s base pointer. Single-plane today; multi-plane
-    // wraps would need a richer storage shape (or to abandon storing the
-    // pointer here at all).
+    // Single-plane assumption: storage tracks plane(0). Multi-plane wraps will
+    // need a richer storage shape.
     void* basePtr = strided->plane(0).basePtr;
 
-    // Deleter captures both the original ImageData snapshot and the user's
-    // cleanup callback. View-only (cleanup == nullptr) means the deleter
-    // touches nothing but the storage object itself.
     auto storage = std::shared_ptr<ImageStorage>(new ImageStorage(basePtr), [data, cleanup](ImageStorage* s) {
         if (cleanup) {
             cleanup(data);
diff --git a/tests/roccv/cpp/include/test_helpers.hpp b/tests/roccv/cpp/include/test_helpers.hpp
index 6c43053b..df6840f5 100644
--- a/tests/roccv/cpp/include/test_helpers.hpp
+++ b/tests/roccv/cpp/include/test_helpers.hpp
@@ -198,6 +198,12 @@ namespace tests {
                                  ". Expected no exceptions, but received the following exception: " + e.what()); \
     }
 
+// EXPECT_EQ pipes through std::to_string, so pass enums/pointers/bools through
+// these casts first. constexpr: shared header state must not be reassignable.
+inline constexpr auto AsInt = [](auto v) { return static_cast<int>(v); };
+inline constexpr auto AsAddr = [](const void* p) { return reinterpret_cast<uintptr_t>(p); };
+inline constexpr auto AsSize = [](auto v) { return static_cast<size_t>(v); };
+
 /**
  * @brief Creates a NHWC tensor which contains data loaded from an image.
  *
diff --git a/tests/roccv/cpp/src/tests/core/image/test_image.cpp b/tests/roccv/cpp/src/tests/core/image/test_image.cpp
index 879dd68c..13937dff 100644
--- a/tests/roccv/cpp/src/tests/core/image/test_image.cpp
+++ b/tests/roccv/cpp/src/tests/core/image/test_image.cpp
@@ -39,11 +39,6 @@ using namespace roccv::tests;
 
 namespace {
 
-// EXPECT_EQ pipes through std::to_string, so wrap enums/pointers/bools.
-auto AsInt = [](auto v) { return static_cast<int>(v); };
-auto AsAddr = [](void* p) { return reinterpret_cast<uintptr_t>(p); };
-auto AsSize = [](auto v) { return static_cast<size_t>(v); };
-
 void* const FAKE_PTR_A = reinterpret_cast<void*>(0xAAAAAAAAull);
 
 /**
@@ -233,10 +228,7 @@ void TestImageCopySharesBuffer() {
         EXPECT_EQ(AsAddr(second.exportData().cast<ImageDataStrided>()->plane(0).basePtr), AsAddr(buf));
 
         // Drop `first`; buffer must NOT be freed yet — `second` still holds it.
-        {
-            Image consumed = std::move(first);
-            (void)consumed;
-        }
+        { Image sink = std::move(first); }
         EXPECT_EQ(alloc.hipFrees, 0);
     }
     // All handles dropped — exactly one free.
@@ -317,20 +309,15 @@ void TestImageExportDataTypedMismatch() {
 // =============================================================================
 
 /**
- * @brief View-only wrap (no cleanup callback) must not free the wrapped
- * buffer when the Image is destroyed.
+ * @brief View-only wrap (no cleanup callback) round-trips metadata and must
+ * not crash when the Image is destroyed (no free attempt on the sentinel ptr).
  */
 void TestImageWrapDataViewOnly() {
-    int frees = 0;
-    {
-        Image wrapped = ImageWrapData(MakeFakeHipData(640, 480, FAKE_PTR_A));
-        EXPECT_EQ(wrapped.size().w, 640);
-        EXPECT_EQ(wrapped.size().h, 480);
-        EXPECT_EQ(AsInt(wrapped.device()), AsInt(eDeviceType::GPU));
-        EXPECT_EQ(AsAddr(wrapped.exportData().cast<ImageDataStrided>()->plane(0).basePtr), AsAddr(FAKE_PTR_A));
-    }
-    // No callback was registered — nothing observable should have changed.
-    EXPECT_EQ(frees, 0);
+    Image wrapped = ImageWrapData(MakeFakeHipData(640, 480, FAKE_PTR_A));
+    EXPECT_EQ(wrapped.size().w, 640);
+    EXPECT_EQ(wrapped.size().h, 480);
+    EXPECT_EQ(AsInt(wrapped.device()), AsInt(eDeviceType::GPU));
+    EXPECT_EQ(AsAddr(wrapped.exportData().cast<ImageDataStrided>()->plane(0).basePtr), AsAddr(FAKE_PTR_A));
 }
 
 /**
diff --git a/tests/roccv/cpp/src/tests/core/image/test_image_data.cpp b/tests/roccv/cpp/src/tests/core/image/test_image_data.cpp
index 885b1aa5..bea99d59 100644
--- a/tests/roccv/cpp/src/tests/core/image/test_image_data.cpp
+++ b/tests/roccv/cpp/src/tests/core/image/test_image_data.cpp
@@ -33,11 +33,6 @@ using namespace roccv::tests;
 
 namespace {
 
-// EXPECT_EQ feeds both sides through std::to_string, which only accepts
-// numeric types. Wrap enum/pointer/bool comparisons in these casts.
-auto AsInt = [](auto v) { return static_cast<int>(v); };
-auto AsAddr = [](void* p) { return reinterpret_cast<uintptr_t>(p); };
-
 // ImageData carries pointers but never dereferences them; the buffer is a
 // metadata snapshot. Use opaque sentinel pointers in tests so we can verify
 // values flow through without needing real allocations.

From 8bebbfb284a982ca7e1949732104712ac903be68 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Wed, 6 May 2026 12:54:32 -0400
Subject: [PATCH 05/13] Remove ImageData from Image private member

---
 include/core/image.hpp | 51 +++++++++++++++-------
 src/core/image.cpp     | 97 +++++++++++++++++++++++++-----------------
 2 files changed, 92 insertions(+), 56 deletions(-)

diff --git a/include/core/image.hpp b/include/core/image.hpp
index 7b057851..7a42d51a 100644
--- a/include/core/image.hpp
+++ b/include/core/image.hpp
@@ -22,10 +22,11 @@
 
 #pragma once
 
+#include <array>
 #include <cstdint>
 #include <functional>
 #include <memory>
-#include <optional>
+#include <typeinfo>
 
 #include "core/detail/allocators/i_allocator.hpp"
 #include "core/image_buffer.hpp"
@@ -50,9 +51,9 @@ using ImageDataCleanupFunc = std::function<void(const ImageData&)>;
  * @brief Per-image allocation spec describing what to allocate for a single
  * variable-sized image. Mirrors NVCVImageRequirements: size, format, per-plane
  * row strides, and base-address alignment. Used as the input to Image's
- * allocating constructors and as the output of CalcRequirements; not stored
- * on the Image instance after construction (m_metadata holds the runtime
- * descriptor in ImageData form).
+ * allocating constructors and as the output of CalcRequirements; its size,
+ * format and row strides are also kept on the Image as the source of truth
+ * from which exportData() rebuilds an ImageData snapshot on demand.
  *
  * Per-plane row strides are populated only for planes 0..numPlanes(format)-1;
  * remaining slots are unused. Today's interleaved-only ImageFormat means only
@@ -73,6 +74,13 @@ struct ImageRequirements {
  * leaves both handles pointing at the same underlying buffer. The buffer is
  * freed when the last handle is destroyed (for owning Images) or when the
  * cleanup callback fires (for ImageWrapData with a callback).
+ *
+ * Storage shape: Image holds the buffer pointer (via ImageStorage) plus the
+ * "ingredients" describing it (size, format, device, per-plane row strides).
+ * It does NOT hold a precomputed ImageData snapshot — exportData() rebuilds
+ * one on demand from the ingredients. This keeps a single source of truth for
+ * the buffer pointer and aligns with how ImageBatchVarShape produces its
+ * own snapshots.
  */
 class Image {
    public:
@@ -107,18 +115,28 @@ class Image {
     Image& operator=(Image&&) noexcept = default;
     ~Image() = default;
 
-    Size2D size() const noexcept;
-    ImageFormat format() const noexcept;
-    eDeviceType device() const noexcept;
+    Size2D size() const noexcept { return m_size; }
+    ImageFormat format() const noexcept { return m_format; }
+    eDeviceType device() const noexcept { return m_device; }
 
-    // Reference into m_metadata; valid as long as this Image (or any handle
-    // sharing its storage) is alive.
-    const ImageData& exportData() const noexcept { return m_metadata; }
+    /**
+     * @brief Build and return an ImageData snapshot describing this image.
+     *
+     * Returned by value (not by reference) — Image stores ingredients, not a
+     * precomputed snapshot, so each call constructs a fresh ImageData. The
+     * snapshot's plane descriptors point into this Image's buffer; it remains
+     * valid as long as any handle to this storage is alive.
+     */
+    ImageData exportData() const;
 
-    // Throws std::bad_cast if the underlying buffer kind doesn't match Derived.
+    /**
+     * @brief Build a snapshot and down-cast it to a specific subclass. Throws
+     * std::bad_cast if the underlying buffer kind doesn't match Derived.
+     */
     template <typename Derived>
     Derived exportData() const {
-        auto derived = m_metadata.cast<Derived>();
+        ImageData data = exportData();
+        auto derived = data.cast<Derived>();
         if (!derived.has_value()) {
             throw std::bad_cast();
         }
@@ -126,14 +144,15 @@ class Image {
     }
 
    private:
-    Image(ImageData metadata, std::shared_ptr<ImageStorage> storage);
+    Image(const Requirements& reqs, eDeviceType device, std::shared_ptr<ImageStorage> storage);
 
     friend Image ImageWrapData(const ImageData& data, ImageDataCleanupFunc cleanup);
 
-    // m_data is declared first so the allocating ctor can initialize it
-    // (allocating the buffer) before m_metadata reads back the pointer.
     std::shared_ptr<ImageStorage> m_data;
-    ImageData m_metadata;
+    Size2D m_size;
+    ImageFormat m_format;
+    eDeviceType m_device;
+    std::array<int64_t, ROCCV_MAX_IMAGE_PLANES> m_planeRowStride;
 };
 
 /**
diff --git a/src/core/image.cpp b/src/core/image.cpp
index 04e08d2c..12a20d9a 100644
--- a/src/core/image.cpp
+++ b/src/core/image.cpp
@@ -22,6 +22,9 @@
 
 #include "core/image.hpp"
 
+#include <algorithm>
+#include <iterator>
+
 #include "core/data_type.hpp"
 #include "core/detail/context.hpp"
 #include "core/exception.hpp"
@@ -61,27 +64,6 @@ std::shared_ptr<ImageStorage> makeStorage(const ImageRequirements& reqs, const I
     });
 }
 
-// Builds the canonical ImageData stored on Image from a freshly-allocated
-// (or wrapped) buffer plus its layout description. Single-plane today —
-// ImageFormat is interleaved-only, so only planes[0] is populated.
-ImageData makeImageData(const ImageRequirements& reqs, void* buf, eDeviceType device) {
-    ImageBufferStrided strided{};
-    strided.numPlanes = 1;
-    strided.planes[0].width = reqs.size.w;
-    strided.planes[0].height = reqs.size.h;
-    strided.planes[0].rowStride = reqs.planeRowStride[0];
-    strided.planes[0].basePtr = buf;
-
-    switch (device) {
-        case eDeviceType::GPU:
-            return ImageDataStridedHip(reqs.format, strided);
-        case eDeviceType::CPU:
-            return ImageDataStridedHost(reqs.format, strided);
-    }
-
-    throw Exception("Unsupported device type in Image::makeImageData.", eStatusType::INVALID_VALUE);
-}
-
 }  // namespace
 
 // -----------------------------------------------------------------------------
@@ -118,20 +100,41 @@ Image::Image(const Requirements& reqs, eDeviceType device)
     : Image(reqs, GlobalContext().getDefaultAllocator(), device) {}
 
 Image::Image(const Requirements& reqs, const IAllocator& alloc, eDeviceType device)
-    : m_data(makeStorage(reqs, alloc, device)), m_metadata(makeImageData(reqs, m_data->data(), device)) {}
-
-Image::Image(ImageData metadata, std::shared_ptr<ImageStorage> storage)
-    : m_data(std::move(storage)), m_metadata(std::move(metadata)) {}
+    : Image(reqs, device, makeStorage(reqs, alloc, device)) {}
+
+Image::Image(const Requirements& reqs, eDeviceType device, std::shared_ptr<ImageStorage> storage)
+    : m_data(std::move(storage)),
+      m_size(reqs.size),
+      m_format(reqs.format),
+      m_device(device),
+      m_planeRowStride{} {
+    std::copy(std::begin(reqs.planeRowStride), std::end(reqs.planeRowStride), m_planeRowStride.begin());
+}
 
 // -----------------------------------------------------------------------------
-// Accessors
+// exportData
 // -----------------------------------------------------------------------------
 
-Size2D Image::size() const noexcept { return m_metadata.cast<ImageDataStrided>()->size(); }
+ImageData Image::exportData() const {
+    // TODO: derive numPlanes from m_format when planar formats land. Today's
+    // ImageFormat is interleaved-only, so plane 0 covers the whole image and
+    // its dimensions match m_size verbatim.
+    ImageBufferStrided strided{};
+    strided.numPlanes = 1;
+    strided.planes[0].width = m_size.w;
+    strided.planes[0].height = m_size.h;
+    strided.planes[0].rowStride = m_planeRowStride[0];
+    strided.planes[0].basePtr = m_data ? m_data->data() : nullptr;  // moved-from Image holds no storage
 
-ImageFormat Image::format() const noexcept { return m_metadata.format(); }
+    switch (m_device) {
+        case eDeviceType::GPU:
+            return ImageDataStridedHip(m_format, strided);
+        case eDeviceType::CPU:
+            return ImageDataStridedHost(m_format, strided);
+    }
 
-eDeviceType Image::device() const noexcept { return m_metadata.device(); }
+    throw Exception("Unsupported device type in Image::exportData.", eStatusType::INVALID_VALUE);
+}
 
 // -----------------------------------------------------------------------------
 // ImageWrapData
@@ -143,18 +146,32 @@ Image ImageWrapData(const ImageData& data, ImageDataCleanupFunc cleanup) {
         throw Exception("ImageWrapData requires strided image data.", eStatusType::INVALID_VALUE);
     }
 
-    // Single-plane assumption: storage tracks plane(0). Multi-plane wraps will
-    // need a richer storage shape.
-    void* basePtr = strided->plane(0).basePtr;
-
-    auto storage = std::shared_ptr<ImageStorage>(new ImageStorage(basePtr), [data, cleanup](ImageStorage* s) {
-        if (cleanup) {
-            cleanup(data);
-        }
-        delete s;
-    });
+    // Single-plane assumption: storage tracks plane(0) and Requirements only
+    // populates planeRowStride[0]. Multi-plane wraps will need to copy each
+    // plane's stride and either store per-plane base pointers or derive them
+    // from a single owning allocation.
+    const ImagePlaneStrided& plane0 = strided->plane(0);
+
+    // Designated initializers to avoid value-initializing ImageFormat through
+    // its explicit default ctor (which copy-list-init refuses).
+    Image::Requirements reqs{
+        .size = Size2D{plane0.width, plane0.height},
+        .format = data.format(),
+        .planeRowStride = {plane0.rowStride},
+        .alignBytes = 0,
+    };
 
-    return Image(data, std::move(storage));
+    // The deleter captures `data` by value so the original snapshot survives
+    // long enough to be passed to the cleanup callback on last-handle drop.
+    auto storage = std::shared_ptr<ImageStorage>(new ImageStorage(plane0.basePtr),
+                                                 [data, cleanup](ImageStorage* s) {
+                                                     if (cleanup) {
+                                                         cleanup(data);
+                                                     }
+                                                     delete s;
+                                                 });
+
+    return Image(reqs, data.device(), std::move(storage));
 }
 
 }  // namespace roccv

From b93d304b830ccfd13a385d31878a90d14df71262 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Wed, 6 May 2026 12:55:27 -0400
Subject: [PATCH 06/13] Add ImageBatchBuffer/ImageBatchData initial
 implementation

---
 include/core/image_batch_buffer.hpp |  96 +++++++++++
 include/core/image_batch_data.hpp   | 245 ++++++++++++++++++++++++++++
 2 files changed, 341 insertions(+)
 create mode 100644 include/core/image_batch_buffer.hpp
 create mode 100644 include/core/image_batch_data.hpp

diff --git a/include/core/image_batch_buffer.hpp b/include/core/image_batch_buffer.hpp
new file mode 100644
index 00000000..884bfc6a
--- /dev/null
+++ b/include/core/image_batch_buffer.hpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "core/image_buffer.hpp"
+#include "core/image_format.hpp"
+
+namespace roccv {
+
+/**
+ * @brief Pitch-linear descriptor table for a variable-shape image batch.
+ *
+ * Each entry of `imageList` is a full per-image strided buffer descriptor —
+ * reusing `ImageBufferStrided` keeps the per-image shape (multi-plane-capable,
+ * one base pointer per plane, per-plane row stride) identical to what a single
+ * `Image` carries today.
+ *
+ * Pointer residency:
+ *  - `imageList` is the descriptor table read by GPU kernels. For a GPU-resident
+ *    batch this points into device memory; for a hypothetical CPU-resident
+ *    batch it would point into host memory. The producing batch class owns the
+ *    allocation and decides residency.
+ *  - `formatList` mirrors `imageList`'s residency and holds one ImageFormat per
+ *    image (so kernels can branch on per-image format without dereferencing the
+ *    descriptor table).
+ *  - `hostFormatList` is always host-resident. It exists so host-side validation
+ *    code can read per-image formats without paying a D->H copy. For a
+ *    CPU-resident batch this MAY alias `formatList`; for a GPU-resident batch
+ *    it is a separate host mirror kept in sync by the producer.
+ *
+ * `uniqueFormat` is the common ImageFormat across all images, or a default-
+ * constructed (0-channel) ImageFormat sentinel if formats are heterogeneous or
+ * the batch is empty. Cached to fast-path the homogeneous case.
+ *
+ * `maxWidth` / `maxHeight` are the bounding box across all images. Used by
+ * operators to size launch grids. Both are 0 when the batch is empty.
+ *
+ * The struct is intentionally trivially copyable so it can ride inside
+ * `ImageBatchBuffer` without an allocation, mirroring `ImageBufferStrided`'s
+ * relationship to `ImageBuffer`.
+ */
+struct ImageBatchVarShapeBufferStrided {
+    /** Common format across all images in the batch, or a default-constructed
+     *  ImageFormat if formats are heterogeneous or the batch is empty. */
+    ImageFormat uniqueFormat;
+
+    /** Bounding box across all images, in pixels. Both 0 when empty. */
+    int32_t maxWidth;
+    int32_t maxHeight;
+
+    /** Per-image format array, length == numImages. Residency matches
+     *  `imageList` (device for GPU batches, host for CPU batches). */
+    ImageFormat* formatList;
+
+    /** Host-resident mirror of `formatList`. May alias `formatList` for
+     *  CPU-resident batches. Length == numImages. */
+    const ImageFormat* hostFormatList;
+
+    /** Per-image descriptor table, length == numImages. The kernel-facing
+     *  pointer; residency determines which device the batch lives on. */
+    ImageBufferStrided* imageList;
+};
+
+/**
+ * @brief An image-batch buffer. Currently only the variable-shape strided
+ * variant is supported. Shaped as a tagged-union-style aggregate so additional
+ * batch buffer kinds can be added later (e.g. tensor-backed batches) without
+ * changing the public type.
+ */
+struct ImageBatchBuffer {
+    ImageBatchVarShapeBufferStrided varShapeStrided;
+};
+
+}  // namespace roccv
diff --git a/include/core/image_batch_data.hpp b/include/core/image_batch_data.hpp
new file mode 100644
index 00000000..7d6638d4
--- /dev/null
+++ b/include/core/image_batch_data.hpp
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include <optional>
+#include <type_traits>
+
+#include "core/image_batch_buffer.hpp"
+#include "core/image_format.hpp"
+#include "core/util_enums.h"
+#include "operator_types.h"
+
+namespace roccv {
+
+/**
+ * @brief Discriminator for the kind of buffer an ImageBatchData carries. Used
+ * by IsCompatibleKind() / cast<>() to perform safe runtime down-casting through
+ * the ImageBatchData hierarchy.
+ *
+ * The hierarchy currently exposes one buffer layout (variable-shape, strided)
+ * in two residencies, GPU-accessible (HIP) and host-accessible; the enum is
+ * shaped to grow into additional kinds (e.g. tensor-backed batches) without
+ * breaking the existing buffer kind values.
+ */
+enum class ImageBatchBufferType {
+    IMAGE_BATCH_BUFFER_NONE,                   // Default/invalid buffer type.
+    IMAGE_BATCH_VARSHAPE_BUFFER_STRIDED_HIP,   // GPU-accessible varshape descriptor table.
+    IMAGE_BATCH_VARSHAPE_BUFFER_STRIDED_HOST,  // Host-accessible varshape descriptor table.
+};
+
+/**
+ * @brief Holds the underlying image-batch data alongside metadata
+ * (numImages, buffer kind). Non-strided batch data is not supported for use
+ * right now; use ImageBatchVarShapeDataStrided to access strided varshape data
+ * instead.
+ *
+ * ImageBatchData is the interchange type for a batch of variable-sized images.
+ * It does not own any of the underlying buffers (the descriptor table, the
+ * format arrays, or the per-image pixel buffers) — it is a metadata snapshot,
+ * valid only as long as the producing batch outlives it.
+ *
+ * Lazy-sync note: for a GPU-resident batch the producer (ImageBatchVarShape)
+ * is responsible for ensuring the device-side descriptor table is up to date
+ * with any pushBack/popBack edits before handing out an ImageBatchData. The
+ * snapshot itself carries no synchronization state.
+ */
+class ImageBatchData {
+   public:
+    ImageBatchData() = delete;
+    virtual ~ImageBatchData() = default;
+
+    /**
+     * @brief Returns the number of images currently in the batch.
+     */
+    virtual int32_t numImages() const;
+
+    /**
+     * @brief Returns the device the descriptor table (and per-image pixel
+     * buffers) reside on.
+     */
+    virtual eDeviceType device() const;
+
+    /**
+     * @brief Attempts to down-cast this ImageBatchData to a more specific
+     * subclass. Returns the casted value if the underlying buffer kind matches
+     * what Derived expects, or std::nullopt otherwise.
+     *
+     * @tparam Derived The target subclass to cast to.
+     */
+    template <typename Derived>
+    std::optional<Derived> cast() const {
+        static_assert(std::is_base_of<ImageBatchData, Derived>::value,
+                      "Cannot cast ImageBatchData to an unrelated type.");
+        static_assert(sizeof(Derived) == sizeof(ImageBatchData),
+                      "Derived type must not add any additional data members.");
+
+        if (!Derived::IsCompatibleKind(m_bufferType)) {
+            return std::nullopt;
+        }
+
+        return std::make_optional<Derived>(m_numImages, m_buffer);
+    }
+
+    static bool IsCompatibleKind(ImageBatchBufferType bufferType);
+
+   protected:
+    ImageBatchData(int32_t numImages, const ImageBatchBuffer& buffer);
+
+    int32_t m_numImages;
+    eDeviceType m_deviceType;
+    ImageBatchBufferType m_bufferType;
+    ImageBatchBuffer m_buffer;
+};
+
+/**
+ * @brief Image-batch data backed by a variable-shape descriptor table. Adds
+ * typed accessors for the per-image format arrays and the bounding box across
+ * the batch. Sub-classed by ImageBatchVarShapeDataStrided to discriminate
+ * pitch-linear storage; further sub-classed by ImageBatchVarShapeDataStridedHip
+ * and ImageBatchVarShapeDataStridedHost to tag buffer residency.
+ */
+class ImageBatchVarShapeData : public ImageBatchData {
+   public:
+    using Buffer = ImageBatchVarShapeBufferStrided;
+
+    ImageBatchVarShapeData(int32_t numImages, const ImageBatchBuffer& buffer);
+
+    static bool IsCompatibleKind(ImageBatchBufferType bufferType);
+
+    /**
+     * @brief Bounding box across all images in the batch, in pixels. Both
+     * dimensions are 0 when the batch is empty. Used by operators to size
+     * launch grids without iterating the descriptor table.
+     */
+    Size2D maxSize() const;
+
+    /**
+     * @brief Returns the common ImageFormat across all images, or a
+     * default-constructed (0-channel) ImageFormat sentinel if formats are
+     * heterogeneous or the batch is empty.
+     */
+    ImageFormat uniqueFormat() const;
+
+    /**
+     * @brief Per-image format array. Residency matches the descriptor table
+     * (device for GPU batches). Length == numImages().
+     *
+     * Prefer hostFormatList() for host-side validation paths to avoid a D->H
+     * copy.
+     */
+    const ImageFormat* formatList() const;
+
+    /**
+     * @brief Host-resident mirror of formatList(). Always safe to dereference
+     * from host code. Length == numImages().
+     */
+    const ImageFormat* hostFormatList() const;
+};
+
+/**
+ * @brief Variable-shape image-batch data backed by a pitch-linear descriptor
+ * table. Adds the per-image descriptor accessor on top of
+ * ImageBatchVarShapeData.
+ */
+class ImageBatchVarShapeDataStrided : public ImageBatchVarShapeData {
+   public:
+    using Buffer = ImageBatchVarShapeBufferStrided;
+
+    ImageBatchVarShapeDataStrided(int32_t numImages, const ImageBatchBuffer& buffer);
+
+    static bool IsCompatibleKind(ImageBatchBufferType bufferType);
+
+    /**
+     * @brief Per-image descriptor table. Length == numImages(). Residency
+     * matches the enclosing data type — for ImageBatchVarShapeDataStridedHip
+     * this is a device pointer; kernels read it directly.
+     *
+     * Each entry is a full ImageBufferStrided so the per-image shape
+     * (multi-plane-capable, per-plane stride and base pointer) matches what a
+     * single Image carries.
+     */
+    const ImageBufferStrided* imageList() const;
+};
+
+/**
+ * @brief GPU-accessible variable-shape image-batch data.
+ */
+class ImageBatchVarShapeDataStridedHip : public ImageBatchVarShapeDataStrided {
+   public:
+    using Buffer = ImageBatchVarShapeBufferStrided;
+
+    ImageBatchVarShapeDataStridedHip(int32_t numImages, const ImageBatchBuffer& buffer);
+
+    /**
+     * @brief Constructs GPU-accessible varshape image-batch data from the
+     * concrete strided buffer directly.
+     *
+     * @param[in] numImages Number of images currently in the batch.
+     * @param[in] buffer    Descriptor table + per-image format arrays. The
+     *                      descriptor table and `formatList` must point to GPU
+     *                      memory; `hostFormatList` to host memory.
+     */
+    ImageBatchVarShapeDataStridedHip(int32_t numImages, const Buffer& buffer);
+
+    static bool IsCompatibleKind(ImageBatchBufferType bufferType);
+};
+
+/**
+ * @brief Host-accessible variable-shape image-batch data.
+ *
+ * The host-resident counterpart to ImageBatchVarShapeDataStridedHip. The
+ * descriptor table, `formatList`, and `hostFormatList` all point to host
+ * memory; `formatList` and `hostFormatList` MAY alias the same allocation
+ * since no D->H sync is required.
+ *
+ * The lazy host->device descriptor sync that the GPU producer needs is not
+ * applicable here — host-only varshape batches can edit the descriptor table
+ * in place and hand it straight to host kernels. The matching producer-side
+ * design (whether host batches are a separate type, a runtime-tagged variant
+ * of ImageBatchVarShape, or skipped entirely in favor of CPU-side per-image
+ * loops) is still open.
+ */
+class ImageBatchVarShapeDataStridedHost : public ImageBatchVarShapeDataStrided {
+   public:
+    using Buffer = ImageBatchVarShapeBufferStrided;
+
+    ImageBatchVarShapeDataStridedHost(int32_t numImages, const ImageBatchBuffer& buffer);
+
+    /**
+     * @brief Constructs host-accessible varshape image-batch data from the
+     * concrete strided buffer directly.
+     *
+     * @param[in] numImages Number of images currently in the batch.
+     * @param[in] buffer    Descriptor table + per-image format arrays. All
+     *                      pointers must reference host memory; `formatList`
+     *                      and `hostFormatList` may alias.
+     */
+    ImageBatchVarShapeDataStridedHost(int32_t numImages, const Buffer& buffer);
+
+    static bool IsCompatibleKind(ImageBatchBufferType bufferType);
+};
+
+}  // namespace roccv

From defdcd44af9ec66c30dc42a78f9e2e0c5a3250ed Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Wed, 6 May 2026 14:55:30 -0400
Subject: [PATCH 07/13] Add ImageBatchData tests

---
 include/core/image_batch_buffer.hpp           |   6 +-
 include/core/image_batch_data.hpp             |   5 +-
 include/core/image_format.hpp                 |  13 +-
 src/core/image_batch_data.cpp                 | 103 ++++++
 .../core/image/test_image_batch_data.cpp      | 297 ++++++++++++++++++
 5 files changed, 417 insertions(+), 7 deletions(-)
 create mode 100644 src/core/image_batch_data.cpp
 create mode 100644 tests/roccv/cpp/src/tests/core/image/test_image_batch_data.cpp

diff --git a/include/core/image_batch_buffer.hpp b/include/core/image_batch_buffer.hpp
index 884bfc6a..06f0c4b3 100644
--- a/include/core/image_batch_buffer.hpp
+++ b/include/core/image_batch_buffer.hpp
@@ -50,9 +50,9 @@ namespace roccv {
  *    CPU-resident batch this MAY alias `formatList`; for a GPU-resident batch
  *    it is a separate host mirror kept in sync by the producer.
  *
- * `uniqueFormat` is the common ImageFormat across all images, or a default-
- * constructed (0-channel) ImageFormat sentinel if formats are heterogeneous or
- * the batch is empty. Cached to fast-path the homogeneous case.
+ * `uniqueFormat` is the common ImageFormat across all images, or FMT_NONE if
+ * formats are heterogeneous or the batch is empty. Cached to fast-path the
+ * homogeneous case.
  *
  * `maxWidth` / `maxHeight` are the bounding box across all images. Used by
  * operators to size launch grids. Both are 0 when the batch is empty.
diff --git a/include/core/image_batch_data.hpp b/include/core/image_batch_data.hpp
index 7d6638d4..c50de010 100644
--- a/include/core/image_batch_data.hpp
+++ b/include/core/image_batch_data.hpp
@@ -137,9 +137,8 @@ class ImageBatchVarShapeData : public ImageBatchData {
     Size2D maxSize() const;
 
     /**
-     * @brief Returns the common ImageFormat across all images, or a
-     * default-constructed (0-channel) ImageFormat sentinel if formats are
-     * heterogeneous or the batch is empty.
+     * @brief Returns the common ImageFormat across all images, or FMT_NONE if
+     * formats are heterogeneous or the batch is empty.
      */
     ImageFormat uniqueFormat() const;
 
diff --git a/include/core/image_format.hpp b/include/core/image_format.hpp
index 7dd891f3..ddb1d100 100644
--- a/include/core/image_format.hpp
+++ b/include/core/image_format.hpp
@@ -40,7 +40,10 @@ enum class eSwizzle {
  */
 class ImageFormat {
    public:
-    explicit ImageFormat() {}
+    /**
+     * @brief Builds the FMT_NONE sentinel: U8 data type, zero channels, XYZW swizzle.
+     */
+    constexpr ImageFormat() : ImageFormat(eDataType::DATA_TYPE_U8, 0, eSwizzle::XYZW) {}
     explicit constexpr ImageFormat(eDataType dtype, int32_t numChannels, eSwizzle swizzle = eSwizzle::XYZW)
         : m_dtype(dtype), m_numChannels(numChannels), m_swizzle(swizzle) {}
 
@@ -48,12 +51,20 @@ class ImageFormat {
     int32_t channels() const noexcept;
     eSwizzle swizzle() const noexcept;
 
+    constexpr bool operator==(const ImageFormat& other) const noexcept {
+        return m_dtype == other.m_dtype && m_numChannels == other.m_numChannels && m_swizzle == other.m_swizzle;
+    }
+    constexpr bool operator!=(const ImageFormat& other) const noexcept { return !(*this == other); }
+
    private:
     eDataType m_dtype;
     int32_t m_numChannels;
     eSwizzle m_swizzle;
 };
 
+// Undefined format. Used to represent an uninitialized or invalid format.
+constexpr ImageFormat FMT_NONE{eDataType::DATA_TYPE_U8, 0, eSwizzle::XYZW};
+
 // Single plane with one 8-bit unsigned integer channel.
 constexpr ImageFormat FMT_U8(eDataType::DATA_TYPE_U8, 1, eSwizzle::XYZW);
 
diff --git a/src/core/image_batch_data.cpp b/src/core/image_batch_data.cpp
new file mode 100644
index 00000000..a8ce07ba
--- /dev/null
+++ b/src/core/image_batch_data.cpp
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "core/image_batch_data.hpp"
+
+#include "core/image_batch_buffer.hpp"
+#include "core/image_format.hpp"
+#include "core/util_enums.h"
+
+namespace roccv {
+
+int32_t ImageBatchData::numImages() const { return m_numImages; }
+
+eDeviceType ImageBatchData::device() const { return m_deviceType; }
+
+ImageBatchData::ImageBatchData(int32_t numImages, const ImageBatchBuffer& buffer)
+    : m_numImages(numImages),
+      m_deviceType(eDeviceType::GPU),                               // Provisional; leaf ctors below reassign it.
+      m_bufferType(ImageBatchBufferType::IMAGE_BATCH_BUFFER_NONE),  // Provisional; leaf ctors below reassign it.
+      m_buffer(buffer) {}
+
+bool ImageBatchData::IsCompatibleKind(ImageBatchBufferType bufferType) {
+    return bufferType != ImageBatchBufferType::IMAGE_BATCH_BUFFER_NONE;
+}
+
+ImageBatchVarShapeData::ImageBatchVarShapeData(int32_t numImages, const ImageBatchBuffer& buffer)
+    : ImageBatchData(numImages, buffer) {}
+
+bool ImageBatchVarShapeData::IsCompatibleKind(ImageBatchBufferType bufferType) {
+    return bufferType == ImageBatchBufferType::IMAGE_BATCH_VARSHAPE_BUFFER_STRIDED_HIP ||
+           bufferType == ImageBatchBufferType::IMAGE_BATCH_VARSHAPE_BUFFER_STRIDED_HOST;
+}
+
+Size2D ImageBatchVarShapeData::maxSize() const {
+    return Size2D{m_buffer.varShapeStrided.maxWidth, m_buffer.varShapeStrided.maxHeight};
+}
+
+ImageFormat ImageBatchVarShapeData::uniqueFormat() const { return m_buffer.varShapeStrided.uniqueFormat; }
+
+const ImageFormat* ImageBatchVarShapeData::formatList() const { return m_buffer.varShapeStrided.formatList; }
+
+const ImageFormat* ImageBatchVarShapeData::hostFormatList() const { return m_buffer.varShapeStrided.hostFormatList; }
+
+ImageBatchVarShapeDataStrided::ImageBatchVarShapeDataStrided(int32_t numImages, const ImageBatchBuffer& buffer)
+    : ImageBatchVarShapeData(numImages, buffer) {}
+
+bool ImageBatchVarShapeDataStrided::IsCompatibleKind(ImageBatchBufferType bufferType) {
+    return bufferType == ImageBatchBufferType::IMAGE_BATCH_VARSHAPE_BUFFER_STRIDED_HIP ||
+           bufferType == ImageBatchBufferType::IMAGE_BATCH_VARSHAPE_BUFFER_STRIDED_HOST;
+}
+
+const ImageBufferStrided* ImageBatchVarShapeDataStrided::imageList() const {
+    return m_buffer.varShapeStrided.imageList;
+}
+
+ImageBatchVarShapeDataStridedHip::ImageBatchVarShapeDataStridedHip(int32_t numImages, const ImageBatchBuffer& buffer)
+    : ImageBatchVarShapeDataStrided(numImages, buffer) {
+    m_bufferType = ImageBatchBufferType::IMAGE_BATCH_VARSHAPE_BUFFER_STRIDED_HIP;
+    m_deviceType = eDeviceType::GPU;
+}
+
+ImageBatchVarShapeDataStridedHip::ImageBatchVarShapeDataStridedHip(
+    int32_t numImages, const ImageBatchVarShapeDataStridedHip::Buffer& buffer)
+    : ImageBatchVarShapeDataStridedHip(numImages, ImageBatchBuffer{.varShapeStrided = buffer}) {}
+
+bool ImageBatchVarShapeDataStridedHip::IsCompatibleKind(ImageBatchBufferType bufferType) {
+    return bufferType == ImageBatchBufferType::IMAGE_BATCH_VARSHAPE_BUFFER_STRIDED_HIP;
+}
+
+ImageBatchVarShapeDataStridedHost::ImageBatchVarShapeDataStridedHost(int32_t numImages, const ImageBatchBuffer& buffer)
+    : ImageBatchVarShapeDataStrided(numImages, buffer) {
+    m_bufferType = ImageBatchBufferType::IMAGE_BATCH_VARSHAPE_BUFFER_STRIDED_HOST;
+    m_deviceType = eDeviceType::CPU;
+}
+
+ImageBatchVarShapeDataStridedHost::ImageBatchVarShapeDataStridedHost(
+    int32_t numImages, const ImageBatchVarShapeDataStridedHost::Buffer& buffer)
+    : ImageBatchVarShapeDataStridedHost(numImages, ImageBatchBuffer{.varShapeStrided = buffer}) {}
+
+bool ImageBatchVarShapeDataStridedHost::IsCompatibleKind(ImageBatchBufferType bufferType) {
+    return bufferType == ImageBatchBufferType::IMAGE_BATCH_VARSHAPE_BUFFER_STRIDED_HOST;
+}
+
+}  // namespace roccv
diff --git a/tests/roccv/cpp/src/tests/core/image/test_image_batch_data.cpp b/tests/roccv/cpp/src/tests/core/image/test_image_batch_data.cpp
new file mode 100644
index 00000000..f402d261
--- /dev/null
+++ b/tests/roccv/cpp/src/tests/core/image/test_image_batch_data.cpp
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <stdint.h>
+
+#include <core/image_batch_buffer.hpp>
+#include <core/image_batch_data.hpp>
+#include <core/image_buffer.hpp>
+#include <core/image_format.hpp>
+
+#include "test_helpers.hpp"
+
+using namespace roccv;
+using namespace roccv::tests;
+
+namespace {
+
+// ImageBatchData carries pointers but never dereferences them; the buffer is a
+// metadata snapshot. Use opaque sentinel pointers so we can verify values flow
+// through the hierarchy without needing real allocations.
+void* const FAKE_IMG_PTR_A = reinterpret_cast<void*>(0xA0A0A0A0ull);
+void* const FAKE_IMG_PTR_B = reinterpret_cast<void*>(0xB0B0B0B0ull);
+
+// Static descriptor/format storage for the batch buffer. These are real host
+// allocations (so the pointers are valid) but the batch tests only read
+// metadata back out of them; nothing dereferences the per-image basePtr fields.
+ImageBufferStrided g_imageList[2];
+ImageFormat g_formatList[2] = {FMT_RGB8, FMT_RGB8};
+ImageFormat g_hostFormatList[2] = {FMT_RGB8, FMT_RGB8};
+
+ImageBufferStrided MakeSinglePlaneBuffer(int32_t width, int32_t height, int64_t rowStride, void* basePtr) {
+    ImageBufferStrided buf{};
+    buf.numPlanes = 1;
+    buf.planes[0] = {width, height, rowStride, basePtr};
+    return buf;
+}
+
+// Builds a homogeneous two-image varshape descriptor with a known bounding box
+// and uniqueFormat. The returned struct's pointers reference module-static
+// arrays so addresses remain stable across calls within a test.
+ImageBatchVarShapeBufferStrided MakeHomogeneousBuffer() {
+    g_imageList[0] = MakeSinglePlaneBuffer(640, 480, 640 * 3, FAKE_IMG_PTR_A);
+    g_imageList[1] = MakeSinglePlaneBuffer(320, 240, 320 * 3, FAKE_IMG_PTR_B);
+    g_formatList[0] = FMT_RGB8;
+    g_formatList[1] = FMT_RGB8;
+    g_hostFormatList[0] = FMT_RGB8;
+    g_hostFormatList[1] = FMT_RGB8;
+
+    ImageBatchVarShapeBufferStrided buf{};
+    buf.uniqueFormat = FMT_RGB8;
+    buf.maxWidth = 640;
+    buf.maxHeight = 480;
+    buf.formatList = g_formatList;
+    buf.hostFormatList = g_hostFormatList;
+    buf.imageList = g_imageList;
+    return buf;
+}
+
+/**
+ * @brief Verifies HIP-strided varshape construction populates all observable
+ * state and tags itself as GPU-resident.
+ */
+void TestImageBatchVarShapeDataStridedHipConstruction() {
+    auto buf = MakeHomogeneousBuffer();
+    ImageBatchVarShapeDataStridedHip data(2, buf);
+
+    EXPECT_EQ(AsInt(data.device()), AsInt(eDeviceType::GPU));
+    EXPECT_EQ(data.numImages(), 2);
+    EXPECT_EQ(data.maxSize().w, 640);
+    EXPECT_EQ(data.maxSize().h, 480);
+    EXPECT_EQ(data.uniqueFormat().channels(), 3);
+    EXPECT_EQ(AsAddr(const_cast<ImageFormat*>(data.formatList())), AsAddr(g_formatList));
+    EXPECT_EQ(AsAddr(const_cast<ImageFormat*>(data.hostFormatList())), AsAddr(g_hostFormatList));
+    EXPECT_EQ(AsAddr(const_cast<ImageBufferStrided*>(data.imageList())), AsAddr(g_imageList));
+    EXPECT_EQ(data.imageList()[0].planes[0].width, 640);
+    EXPECT_EQ(data.imageList()[1].planes[0].width, 320);
+}
+
+/**
+ * @brief Same shape as the Hip test but for Host-resident varshape data.
+ */
+void TestImageBatchVarShapeDataStridedHostConstruction() {
+    auto buf = MakeHomogeneousBuffer();
+    ImageBatchVarShapeDataStridedHost data(2, buf);
+
+    EXPECT_EQ(AsInt(data.device()), AsInt(eDeviceType::CPU));
+    EXPECT_EQ(data.numImages(), 2);
+    EXPECT_EQ(data.maxSize().w, 640);
+    EXPECT_EQ(data.maxSize().h, 480);
+    EXPECT_EQ(data.uniqueFormat().channels(), 3);
+    EXPECT_EQ(AsAddr(const_cast<ImageBufferStrided*>(data.imageList())), AsAddr(g_imageList));
+}
+
+/**
+ * @brief Empty batch: maxSize collapses to 0x0 and uniqueFormat is FMT_NONE.
+ * Producers signal "no images" via numImages == 0; the buffer fields stay
+ * valid pointers but get ignored.
+ */
+void TestImageBatchVarShapeDataEmpty() {
+    ImageBatchVarShapeBufferStrided buf{};
+    buf.uniqueFormat = FMT_NONE;
+    buf.maxWidth = 0;
+    buf.maxHeight = 0;
+    buf.formatList = g_formatList;
+    buf.hostFormatList = g_hostFormatList;
+    buf.imageList = g_imageList;
+
+    ImageBatchVarShapeDataStridedHip data(0, buf);
+
+    EXPECT_EQ(data.numImages(), 0);
+    EXPECT_EQ(data.maxSize().w, 0);
+    EXPECT_EQ(data.maxSize().h, 0);
+    EXPECT_EQ(AsInt(data.uniqueFormat() == FMT_NONE), 1);
+}
+
+/**
+ * @brief Heterogeneous formats: per-image formatList carries each entry
+ * verbatim; uniqueFormat is FMT_NONE since no single format spans the batch.
+ */
+void TestImageBatchVarShapeDataHeterogeneousFormats() {
+    g_imageList[0] = MakeSinglePlaneBuffer(640, 480, 640 * 3, FAKE_IMG_PTR_A);
+    g_imageList[1] = MakeSinglePlaneBuffer(320, 240, 320 * 4, FAKE_IMG_PTR_B);
+    g_formatList[0] = FMT_RGB8;
+    g_formatList[1] = FMT_RGBA8;
+    g_hostFormatList[0] = FMT_RGB8;
+    g_hostFormatList[1] = FMT_RGBA8;
+
+    ImageBatchVarShapeBufferStrided buf{};
+    buf.uniqueFormat = FMT_NONE;
+    buf.maxWidth = 640;
+    buf.maxHeight = 480;
+    buf.formatList = g_formatList;
+    buf.hostFormatList = g_hostFormatList;
+    buf.imageList = g_imageList;
+
+    ImageBatchVarShapeDataStridedHip data(2, buf);
+
+    EXPECT_EQ(AsInt(data.uniqueFormat() == FMT_NONE), 1);
+    EXPECT_EQ(AsInt(data.hostFormatList()[0] == FMT_RGB8), 1);
+    EXPECT_EQ(AsInt(data.hostFormatList()[1] == FMT_RGBA8), 1);
+}
+
+/**
+ * @brief The two leaf ctors (taking ImageBatchBuffer vs the concrete strided
+ * buffer directly) must produce observably identical state.
+ */
+void TestImageBatchVarShapeDataSugarCtor() {
+    auto buf = MakeHomogeneousBuffer();
+
+    ImageBatchVarShapeDataStridedHip wide(2, ImageBatchBuffer{.varShapeStrided = buf});
+    ImageBatchVarShapeDataStridedHip sugar(2, buf);
+
+    EXPECT_EQ(AsInt(wide.device()), AsInt(sugar.device()));
+    EXPECT_EQ(wide.numImages(), sugar.numImages());
+    EXPECT_EQ(wide.maxSize().w, sugar.maxSize().w);
+    EXPECT_EQ(wide.maxSize().h, sugar.maxSize().h);
+    EXPECT_EQ(AsAddr(const_cast<ImageBufferStrided*>(wide.imageList())),
+              AsAddr(const_cast<ImageBufferStrided*>(sugar.imageList())));
+
+    ImageBatchVarShapeDataStridedHost wideHost(2, ImageBatchBuffer{.varShapeStrided = buf});
+    ImageBatchVarShapeDataStridedHost sugarHost(2, buf);
+    EXPECT_EQ(AsInt(wideHost.device()), AsInt(sugarHost.device()));
+    EXPECT_EQ(AsAddr(const_cast<ImageBufferStrided*>(wideHost.imageList())),
+              AsAddr(const_cast<ImageBufferStrided*>(sugarHost.imageList())));
+}
+
+/**
+ * @brief IsCompatibleKind on each level discriminates the buffer kinds it
+ * accepts. Base accepts anything-but-NONE; VarShape and VarShapeStrided accept
+ * both Hip and Host varshape; leaves accept only their own.
+ */
+void TestImageBatchDataIsCompatibleKind() {
+    EXPECT_EQ(AsInt(ImageBatchData::IsCompatibleKind(ImageBatchBufferType::IMAGE_BATCH_BUFFER_NONE)), 0);
+    EXPECT_EQ(AsInt(ImageBatchData::IsCompatibleKind(ImageBatchBufferType::IMAGE_BATCH_VARSHAPE_BUFFER_STRIDED_HIP)),
+              1);
+    EXPECT_EQ(AsInt(ImageBatchData::IsCompatibleKind(ImageBatchBufferType::IMAGE_BATCH_VARSHAPE_BUFFER_STRIDED_HOST)),
+              1);
+
+    EXPECT_EQ(AsInt(ImageBatchVarShapeData::IsCompatibleKind(ImageBatchBufferType::IMAGE_BATCH_BUFFER_NONE)), 0);
+    EXPECT_EQ(
+        AsInt(ImageBatchVarShapeData::IsCompatibleKind(ImageBatchBufferType::IMAGE_BATCH_VARSHAPE_BUFFER_STRIDED_HIP)),
+        1);
+    EXPECT_EQ(
+        AsInt(ImageBatchVarShapeData::IsCompatibleKind(ImageBatchBufferType::IMAGE_BATCH_VARSHAPE_BUFFER_STRIDED_HOST)),
+        1);
+
+    EXPECT_EQ(AsInt(ImageBatchVarShapeDataStrided::IsCompatibleKind(ImageBatchBufferType::IMAGE_BATCH_BUFFER_NONE)), 0);
+    EXPECT_EQ(AsInt(ImageBatchVarShapeDataStrided::IsCompatibleKind(
+                  ImageBatchBufferType::IMAGE_BATCH_VARSHAPE_BUFFER_STRIDED_HIP)),
+              1);
+    EXPECT_EQ(AsInt(ImageBatchVarShapeDataStrided::IsCompatibleKind(
+                  ImageBatchBufferType::IMAGE_BATCH_VARSHAPE_BUFFER_STRIDED_HOST)),
+              1);
+
+    EXPECT_EQ(AsInt(ImageBatchVarShapeDataStridedHip::IsCompatibleKind(ImageBatchBufferType::IMAGE_BATCH_BUFFER_NONE)),
+              0);
+    EXPECT_EQ(AsInt(ImageBatchVarShapeDataStridedHip::IsCompatibleKind(
+                  ImageBatchBufferType::IMAGE_BATCH_VARSHAPE_BUFFER_STRIDED_HIP)),
+              1);
+    EXPECT_EQ(AsInt(ImageBatchVarShapeDataStridedHip::IsCompatibleKind(
+                  ImageBatchBufferType::IMAGE_BATCH_VARSHAPE_BUFFER_STRIDED_HOST)),
+              0);
+
+    EXPECT_EQ(AsInt(ImageBatchVarShapeDataStridedHost::IsCompatibleKind(ImageBatchBufferType::IMAGE_BATCH_BUFFER_NONE)),
+              0);
+    EXPECT_EQ(AsInt(ImageBatchVarShapeDataStridedHost::IsCompatibleKind(
+                  ImageBatchBufferType::IMAGE_BATCH_VARSHAPE_BUFFER_STRIDED_HIP)),
+              0);
+    EXPECT_EQ(AsInt(ImageBatchVarShapeDataStridedHost::IsCompatibleKind(
+                  ImageBatchBufferType::IMAGE_BATCH_VARSHAPE_BUFFER_STRIDED_HOST)),
+              1);
+}
+
+/**
+ * @brief Round-trip a derived ImageBatchData through the base reference and
+ * back via cast<>(). Successful casts must preserve every observable field;
+ * casts to incompatible kinds must return std::nullopt.
+ */
+void TestImageBatchDataCast() {
+    auto buf = MakeHomogeneousBuffer();
+
+    // Hip → base → Hip should round-trip; intermediate VarShape/Strided also
+    // succeed; Hip → Host fails.
+    {
+        ImageBatchVarShapeDataStridedHip hip(2, buf);
+        const ImageBatchData& base = hip;
+
+        auto asHip = base.cast<ImageBatchVarShapeDataStridedHip>();
+        EXPECT_EQ(AsInt(asHip.has_value()), 1);
+        EXPECT_EQ(AsInt(asHip->device()), AsInt(eDeviceType::GPU));
+        EXPECT_EQ(asHip->numImages(), 2);
+        EXPECT_EQ(asHip->maxSize().w, 640);
+        EXPECT_EQ(AsAddr(const_cast<ImageBufferStrided*>(asHip->imageList())), AsAddr(g_imageList));
+
+        auto asStrided = base.cast<ImageBatchVarShapeDataStrided>();
+        EXPECT_EQ(AsInt(asStrided.has_value()), 1);
+        EXPECT_EQ(AsInt(asStrided->device()), AsInt(eDeviceType::GPU));
+
+        auto asVar = base.cast<ImageBatchVarShapeData>();
+        EXPECT_EQ(AsInt(asVar.has_value()), 1);
+        EXPECT_EQ(asVar->maxSize().h, 480);
+
+        auto asHost = base.cast<ImageBatchVarShapeDataStridedHost>();
+        EXPECT_EQ(AsInt(asHost.has_value()), 0);
+    }
+
+    // Symmetrically: Host → base → Host succeeds, Host → Hip fails.
+    {
+        ImageBatchVarShapeDataStridedHost host(2, buf);
+        const ImageBatchData& base = host;
+
+        auto asHost = base.cast<ImageBatchVarShapeDataStridedHost>();
+        EXPECT_EQ(AsInt(asHost.has_value()), 1);
+        EXPECT_EQ(AsInt(asHost->device()), AsInt(eDeviceType::CPU));
+        EXPECT_EQ(asHost->numImages(), 2);
+
+        auto asHip = base.cast<ImageBatchVarShapeDataStridedHip>();
+        EXPECT_EQ(AsInt(asHip.has_value()), 0);
+    }
+}
+
+}  // namespace
+
+int main(int argc, char** argv) {
+    (void)argc;
+    (void)argv;
+    TEST_CASES_BEGIN();
+
+    TEST_CASE(TestImageBatchVarShapeDataStridedHipConstruction());
+    TEST_CASE(TestImageBatchVarShapeDataStridedHostConstruction());
+    TEST_CASE(TestImageBatchVarShapeDataEmpty());
+    TEST_CASE(TestImageBatchVarShapeDataHeterogeneousFormats());
+    TEST_CASE(TestImageBatchVarShapeDataSugarCtor());
+    TEST_CASE(TestImageBatchDataIsCompatibleKind());
+    TEST_CASE(TestImageBatchDataCast());
+
+    TEST_CASES_END();
+}

From 1977fe682fa15959ef13a07529c1ec898a087b27 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Wed, 6 May 2026 16:31:33 -0400
Subject: [PATCH 08/13] Initial ImageBatchVarShape implementation

---
 include/core/image_batch_var_shape.hpp        | 208 ++++++++
 src/core/image_batch_var_shape.cpp            | 244 ++++++++++
 .../core/image/test_image_batch_var_shape.cpp | 460 ++++++++++++++++++
 3 files changed, 912 insertions(+)
 create mode 100644 include/core/image_batch_var_shape.hpp
 create mode 100644 src/core/image_batch_var_shape.cpp
 create mode 100644 tests/roccv/cpp/src/tests/core/image/test_image_batch_var_shape.cpp

diff --git a/include/core/image_batch_var_shape.hpp b/include/core/image_batch_var_shape.hpp
new file mode 100644
index 00000000..a57f355a
--- /dev/null
+++ b/include/core/image_batch_var_shape.hpp
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <hip/hip_runtime.h>
+#include <stdint.h>
+
+#include <iterator>
+#include <optional>
+#include <typeinfo>
+#include <vector>
+
+#include "core/detail/allocators/i_allocator.hpp"
+#include "core/image.hpp"
+#include "core/image_batch_data.hpp"
+#include "core/image_format.hpp"
+#include "exception.hpp"
+#include "operator_types.h"
+
+namespace roccv {
+
+/**
+ * @brief Producer-side container for a batch of variable-sized images that
+ * share a single GPU-resident descriptor table.
+ *
+ * Holds up to `capacity()` Image handles and maintains a parallel descriptor
+ * table that operators can dispatch over without iterating Image-by-Image.
+ * Capacity is fixed at construction; pushBack/popBack move within it.
+ *
+ * The host descriptor mirrors are pinned so the H2D copy in exportData() is a
+ * true DMA (no runtime bounce buffer) and so the snapshot can expose the same
+ * pinned pointer as both `formatList`'s host shadow and `hostFormatList`.
+ *
+ * Sync model: pushBack/popBack mutate the host mirrors only; the device
+ * descriptor table is brought up to date lazily inside exportData(stream),
+ * which copies just the dirty suffix `[dirtyStart, numImages)`. A hipEvent
+ * (`m_postFence`) guards the host buffers — if a previous exportData's H2D
+ * copy is still in flight, pushBack blocks the host in hipEventSynchronize
+ * before mutating, so the snapshot a consumer is reading never tears.
+ *
+ * GPU-only in v1. CPU-resident images are rejected on push.
+ */
+class ImageBatchVarShape {
+   public:
+    using const_iterator = std::vector<Image>::const_iterator;
+
+    /**
+     * @brief Construct an empty batch with `capacity` slots, using the global
+     * default allocator.
+     */
+    explicit ImageBatchVarShape(int32_t capacity);
+
+    /**
+     * @brief Construct an empty batch with `capacity` slots, using the supplied
+     * allocator. The allocator must outlive the batch.
+     */
+    explicit ImageBatchVarShape(int32_t capacity, const IAllocator &alloc);
+
+    ~ImageBatchVarShape();
+
+    ImageBatchVarShape(const ImageBatchVarShape &) = delete;
+    ImageBatchVarShape &operator=(const ImageBatchVarShape &) = delete;
+    ImageBatchVarShape(ImageBatchVarShape &&) noexcept;
+    ImageBatchVarShape &operator=(ImageBatchVarShape &&) = delete;
+
+    int32_t capacity() const noexcept { return m_capacity; }
+    int32_t numImages() const noexcept { return static_cast<int32_t>(m_images.size()); }
+
+    /**
+     * @brief Append an image to the batch. Throws if capacity would be
+     * exceeded, the image is CPU-resident, or the image has more than one
+     * plane (rocCV is single-plane today).
+     */
+    void pushBack(const Image &img);
+
+    /**
+     * @brief Append a range of images; `It` must be multi-pass because the
+     * range is measured before it is walked. Strong exception guarantee — if
+     * any push throws, the batch is rolled back and the exception is rethrown.
+     */
+    template <typename It>
+    void pushBack(It begin, It end);
+
+    /**
+     * @brief Remove the trailing `count` images. Throws if `count` exceeds
+     * numImages().
+     */
+    void popBack(int32_t count = 1);
+
+    /**
+     * @brief Drop all images. Buffers are kept; the batch is reusable.
+     */
+    void clear();
+
+    const Image &operator[](int32_t i) const { return m_images[i]; }
+
+    const_iterator begin() const noexcept { return m_images.cbegin(); }
+    const_iterator end() const noexcept { return m_images.cend(); }
+
+    /**
+     * @brief Bounding box across all images, in pixels. Returns Size2D{0, 0}
+     * for an empty batch.
+     */
+    Size2D maxSize() const;
+
+    /**
+     * @brief The common ImageFormat across all images, or FMT_NONE if formats
+     * are heterogeneous or the batch is empty. After popping the only image
+     * with a given heterogenizing format, the cached value may stay FMT_NONE
+     * until the next emptying operation — conservative, never wrong.
+     */
+    ImageFormat uniqueFormat() const;
+
+    /**
+     * @brief Build (and return by value) a GPU-resident snapshot of the batch.
+     *
+     * Synchronizes the dirty suffix of the host mirrors to the device
+     * descriptor table on the supplied stream before returning. The returned
+     * snapshot's `imageList` and `formatList` are device pointers safe for
+     * kernels enqueued on the same stream; `hostFormatList` aliases the pinned
+     * host format mirror and is safe to read from host code. The snapshot is
+     * a metadata view valid as long as this batch outlives it.
+     */
+    ImageBatchVarShapeDataStridedHip exportData(hipStream_t stream);
+
+    /**
+     * @brief Build a snapshot and down-cast it to a specific subclass. Throws
+     * std::bad_cast if the underlying buffer kind doesn't match Derived.
+     */
+    template <typename Derived>
+    Derived exportData(hipStream_t stream);
+
+   private:
+    void doSyncDirtySuffix(hipStream_t stream);
+    void doUpdateCache() const;
+
+    int32_t m_capacity;
+    int32_t m_dirtyStartingFromIndex = 0;
+    bool m_fencePending = false;
+
+    const IAllocator &m_allocator;
+    std::vector<Image> m_images;
+
+    ImageBufferStrided *m_devImagesBuffer = nullptr;
+    ImageFormat *m_devFormatsBuffer = nullptr;
+    ImageBufferStrided *m_hostImagesBuffer = nullptr;
+    ImageFormat *m_hostFormatsBuffer = nullptr;
+
+    hipEvent_t m_postFence = nullptr;
+
+    mutable std::optional<Size2D> m_cacheMaxSize;
+    mutable std::optional<ImageFormat> m_cacheUniqueFormat;
+};
+
+template <typename It>
+void ImageBatchVarShape::pushBack(It begin, It end) {
+    // Check against the remaining room in the iterator's difference type; an int32_t cast or sum could overflow.
+    const auto incoming = std::distance(begin, end);
+    if (incoming > m_capacity - numImages()) {
+        throw Exception("ImageBatchVarShape::pushBack range would exceed capacity", eStatusType::OUT_OF_BOUNDS);
+    }
+    const int32_t oldNumImages = numImages();
+    const auto oldMaxSize = m_cacheMaxSize;
+    const auto oldUniqueFormat = m_cacheUniqueFormat;
+
+    try {
+        for (auto it = begin; it != end; ++it) {
+            pushBack(*it);
+        }
+    } catch (...) {
+        m_images.erase(m_images.begin() + oldNumImages, m_images.end());
+        m_cacheMaxSize = oldMaxSize;
+        m_cacheUniqueFormat = oldUniqueFormat;
+        throw;
+    }
+}
+
+template <typename Derived>
+Derived ImageBatchVarShape::exportData(hipStream_t stream) {
+    // Export the concrete HIP snapshot first, then try to view it as the requested type.
+    const ImageBatchVarShapeDataStridedHip snapshot = exportData(stream);
+    if (const auto view = snapshot.cast<Derived>(); view.has_value()) {
+        return *view;
+    }
+    throw std::bad_cast();
+}
+
+}  // namespace roccv
diff --git a/src/core/image_batch_var_shape.cpp b/src/core/image_batch_var_shape.cpp
new file mode 100644
index 00000000..d522e336
--- /dev/null
+++ b/src/core/image_batch_var_shape.cpp
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "core/image_batch_var_shape.hpp"
+
+#include <algorithm>
+
+#include "core/detail/context.hpp"
+#include "core/exception.hpp"
+#include "core/hip_assert.h"
+#include "core/image_batch_buffer.hpp"
+#include "core/image_buffer.hpp"
+
+namespace roccv {
+
+ImageBatchVarShape::ImageBatchVarShape(int32_t capacity)
+    : ImageBatchVarShape(capacity, GlobalContext().getDefaultAllocator()) {}  // delegate with the default allocator
+
+ImageBatchVarShape::ImageBatchVarShape(int32_t capacity, const IAllocator& alloc)
+    : m_capacity(capacity), m_allocator(alloc) {
+    if (capacity <= 0) {
+        throw Exception("ImageBatchVarShape capacity must be positive", eStatusType::INVALID_VALUE);
+    }
+    // Reserve once so the image list never reallocates: pushBack refuses to grow past m_capacity.
+    m_images.reserve(capacity);
+    // Descriptor tables hold one ImageBufferStrided and one ImageFormat entry per slot.
+    const size_t imagesBytes = sizeof(ImageBufferStrided) * capacity;
+    const size_t formatsBytes = sizeof(ImageFormat) * capacity;
+    // NOTE(review): if any call below throws, buffers already allocated here are not freed — confirm/guard.
+    m_devImagesBuffer = static_cast<ImageBufferStrided*>(m_allocator.allocHipMem(imagesBytes));
+    m_devFormatsBuffer = static_cast<ImageFormat*>(m_allocator.allocHipMem(formatsBytes));
+    m_hostImagesBuffer = static_cast<ImageBufferStrided*>(m_allocator.allocHostPinnedMem(imagesBytes));
+    m_hostFormatsBuffer = static_cast<ImageFormat*>(m_allocator.allocHostPinnedMem(formatsBytes));
+    // Fence event recorded after each descriptor upload (see doSyncDirtySuffix); timing is disabled.
+    HIP_VALIDATE_NO_ERRORS(hipEventCreateWithFlags(&m_postFence, hipEventDisableTiming));
+}
+
+ImageBatchVarShape::~ImageBatchVarShape() {
+    if (m_fencePending && m_postFence != nullptr) {
+        // Drain any in-flight H2D copy before freeing the host mirrors it reads from.
+        // The returned status is deliberately discarded: destructors must not throw.
+        (void)hipEventSynchronize(m_postFence);
+    }
+    if (m_postFence != nullptr) {  // null once this object has been moved from
+        (void)hipEventDestroy(m_postFence);
+    }
+    if (m_hostFormatsBuffer != nullptr) m_allocator.freeHostPinnedMem(m_hostFormatsBuffer);
+    if (m_hostImagesBuffer != nullptr) m_allocator.freeHostPinnedMem(m_hostImagesBuffer);
+    if (m_devFormatsBuffer != nullptr) m_allocator.freeHipMem(m_devFormatsBuffer);
+    if (m_devImagesBuffer != nullptr) m_allocator.freeHipMem(m_devImagesBuffer);
+}
+
+ImageBatchVarShape::ImageBatchVarShape(ImageBatchVarShape&& other) noexcept
+    : m_capacity(other.m_capacity),
+      m_dirtyStartingFromIndex(other.m_dirtyStartingFromIndex),
+      m_fencePending(other.m_fencePending),
+      m_allocator(other.m_allocator),  // reference member: both objects refer to the same allocator
+      m_images(std::move(other.m_images)),
+      m_devImagesBuffer(other.m_devImagesBuffer),
+      m_devFormatsBuffer(other.m_devFormatsBuffer),
+      m_hostImagesBuffer(other.m_hostImagesBuffer),
+      m_hostFormatsBuffer(other.m_hostFormatsBuffer),
+      m_postFence(other.m_postFence),
+      m_cacheMaxSize(other.m_cacheMaxSize),
+      m_cacheUniqueFormat(other.m_cacheUniqueFormat) {
+    other.m_capacity = 0;  // a zero-capacity source rejects any further pushBack
+    other.m_dirtyStartingFromIndex = 0;
+    other.m_fencePending = false;  // the pending fence, if any, now belongs to *this
+    other.m_devImagesBuffer = nullptr;  // null pointers make the source's destructor skip every release
+    other.m_devFormatsBuffer = nullptr;
+    other.m_hostImagesBuffer = nullptr;
+    other.m_hostFormatsBuffer = nullptr;
+    other.m_postFence = nullptr;
+    other.m_cacheMaxSize.reset();
+    other.m_cacheUniqueFormat.reset();
+}
+
+void ImageBatchVarShape::pushBack(const Image& img) {
+    const int32_t n = numImages();
+    if (n >= m_capacity) {
+        throw Exception("ImageBatchVarShape::pushBack would exceed capacity", eStatusType::OUT_OF_BOUNDS);
+    }
+    if (img.device() != eDeviceType::GPU) {
+        throw Exception("ImageBatchVarShape only accepts GPU-resident images", eStatusType::INVALID_VALUE);
+    }
+
+    ImageDataStridedHip data = img.exportData<ImageDataStridedHip>();
+    if (data.numPlanes() != 1) {
+        throw Exception("ImageBatchVarShape only supports single-plane images", eStatusType::INVALID_VALUE);
+    }
+    // A previously issued upload may still be reading the host mirrors; wait for it before overwriting a slot.
+    if (m_fencePending) {
+        HIP_VALIDATE_NO_ERRORS(hipEventSynchronize(m_postFence));
+        m_fencePending = false;
+    }
+    // Stage the descriptor in slot n of the host mirrors; exportData() uploads it to the device later.
+    ImageBufferStrided slot{};
+    slot.numPlanes = 1;
+    slot.planes[0] = data.plane(0);
+    m_hostImagesBuffer[n] = slot;
+    m_hostFormatsBuffer[n] = img.format();
+
+    const Size2D imgSize = img.size();
+    if (n == 0) {
+        // Seed from scratch: an empty-batch query may have populated the
+        // cache with sentinels (FMT_NONE, 0×0); replacing avoids merging the
+        // first real image into them.
+        m_cacheMaxSize = imgSize;
+        m_cacheUniqueFormat = img.format();
+    } else {
+        // popBack invalidates m_cacheMaxSize without rescanning, so make sure
+        // both halves of the cache are populated before merging in.
+        doUpdateCache();
+        m_cacheMaxSize->w = std::max(m_cacheMaxSize->w, imgSize.w);
+        m_cacheMaxSize->h = std::max(m_cacheMaxSize->h, imgSize.h);
+        if (*m_cacheUniqueFormat != img.format()) {
+            m_cacheUniqueFormat = FMT_NONE;
+        }
+    }
+    // Append last, once every check above has passed.
+    m_images.push_back(img);
+}
+
+void ImageBatchVarShape::popBack(int32_t count) {
+    if (count < 0) {
+        throw Exception("ImageBatchVarShape::popBack count must be non-negative", eStatusType::INVALID_VALUE);
+    }
+    if (count > numImages()) {
+        throw Exception("ImageBatchVarShape::popBack count exceeds numImages", eStatusType::OUT_OF_BOUNDS);
+    }
+
+    m_images.erase(m_images.end() - count, m_images.end());
+    m_dirtyStartingFromIndex = std::min(m_dirtyStartingFromIndex, numImages());  // re-upload reused slots
+
+    // maxSize can only shrink on pop, so drop it and rescan lazily. uniqueFormat is left as-is (at worst a
+    // conservative FMT_NONE); doUpdateCache() recomputes both caches once either one is missing.
+    m_cacheMaxSize.reset();
+    if (numImages() == 0) {
+        m_cacheUniqueFormat.reset();
+    }
+}
+
+void ImageBatchVarShape::clear() {
+    m_images.clear();
+    m_dirtyStartingFromIndex = 0;  // every slot pushed from now on is uploaded by the next exportData
+    m_cacheMaxSize.reset();  // both caches are recomputed lazily by doUpdateCache()
+    m_cacheUniqueFormat.reset();
+}
+
+Size2D ImageBatchVarShape::maxSize() const {
+    doUpdateCache();  // populates both caches, so the fallback below is only a safeguard
+    return m_cacheMaxSize.value_or(Size2D{0, 0});
+}
+
+ImageFormat ImageBatchVarShape::uniqueFormat() const {
+    doUpdateCache();  // populates both caches, so the fallback below is only a safeguard
+    return m_cacheUniqueFormat.value_or(FMT_NONE);
+}
+
+void ImageBatchVarShape::doUpdateCache() const {
+    if (m_cacheMaxSize.has_value() && m_cacheUniqueFormat.has_value()) {
+        return;  // both caches valid: nothing to recompute
+    }
+    const int32_t n = static_cast<int32_t>(m_images.size());
+    if (n == 0) {
+        m_cacheMaxSize = Size2D{0, 0};  // sentinels reported for an empty batch
+        m_cacheUniqueFormat = FMT_NONE;
+        return;
+    }
+    // Rescan slots [0, n) of the host mirrors: widest/tallest plane 0, and whether all formats match slot 0.
+    Size2D maxSz{0, 0};
+    ImageFormat unique = m_hostFormatsBuffer[0];
+    bool heterogeneous = false;
+    for (int32_t i = 0; i < n; ++i) {
+        const ImagePlaneStrided& p0 = m_hostImagesBuffer[i].planes[0];
+        maxSz.w = std::max(maxSz.w, p0.width);
+        maxSz.h = std::max(maxSz.h, p0.height);
+        if (!heterogeneous && m_hostFormatsBuffer[i] != unique) {
+            heterogeneous = true;
+        }
+    }
+    m_cacheMaxSize = maxSz;
+    m_cacheUniqueFormat = heterogeneous ? FMT_NONE : unique;
+}
+
+void ImageBatchVarShape::doSyncDirtySuffix(hipStream_t stream) {
+    const int32_t n = numImages();
+    if (m_dirtyStartingFromIndex >= n) {
+        return;  // no slot at or beyond the dirty index: nothing to upload
+    }
+    const int32_t dirtyCount = n - m_dirtyStartingFromIndex;
+    // Make `stream` wait for the previously recorded upload before enqueuing the new copies.
+    if (m_fencePending) {
+        HIP_VALIDATE_NO_ERRORS(hipStreamWaitEvent(stream, m_postFence, /*flags=*/0));
+    }
+
+    HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(m_devImagesBuffer + m_dirtyStartingFromIndex,
+                                          m_hostImagesBuffer + m_dirtyStartingFromIndex,
+                                          sizeof(ImageBufferStrided) * dirtyCount, hipMemcpyHostToDevice, stream));
+    HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(m_devFormatsBuffer + m_dirtyStartingFromIndex,
+                                          m_hostFormatsBuffer + m_dirtyStartingFromIndex,
+                                          sizeof(ImageFormat) * dirtyCount, hipMemcpyHostToDevice, stream));
+    // Re-record the fence so pushBack and the destructor can wait for these copies to finish.
+    HIP_VALIDATE_NO_ERRORS(hipEventRecord(m_postFence, stream));
+    m_fencePending = true;
+    m_dirtyStartingFromIndex = n;
+}
+
+ImageBatchVarShapeDataStridedHip ImageBatchVarShape::exportData(hipStream_t stream) {
+    doSyncDirtySuffix(stream);  // enqueue the upload of any not-yet-uploaded descriptors on `stream`
+    doUpdateCache();  // guarantees both caches hold a value, so value() below cannot throw
+
+    const Size2D maxSz = m_cacheMaxSize.value();
+    ImageBatchVarShapeBufferStrided buffer{};
+    buffer.uniqueFormat = m_cacheUniqueFormat.value();
+    buffer.maxWidth = maxSz.w;
+    buffer.maxHeight = maxSz.h;
+    buffer.formatList = m_devFormatsBuffer;
+    buffer.hostFormatList = m_hostFormatsBuffer;
+    buffer.imageList = m_devImagesBuffer;
+    // The descriptor carries raw pointers into buffers owned by this batch.
+    return ImageBatchVarShapeDataStridedHip(numImages(), buffer);
+}
+
+}  // namespace roccv
diff --git a/tests/roccv/cpp/src/tests/core/image/test_image_batch_var_shape.cpp b/tests/roccv/cpp/src/tests/core/image/test_image_batch_var_shape.cpp
new file mode 100644
index 00000000..dea3c0c0
--- /dev/null
+++ b/tests/roccv/cpp/src/tests/core/image/test_image_batch_var_shape.cpp
@@ -0,0 +1,460 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <hip/hip_runtime.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <core/detail/allocators/i_allocator.hpp>
+#include <core/image.hpp>
+#include <core/image_batch_data.hpp>
+#include <core/image_batch_var_shape.hpp>
+#include <core/image_buffer.hpp>
+#include <core/image_data.hpp>
+#include <core/image_format.hpp>
+#include <utility>
+#include <vector>
+
+#include "test_helpers.hpp"
+
+using namespace roccv;
+using namespace roccv::tests;
+
+namespace {
+
+/**
+ * @brief Test allocator that distinguishes pinned-host from regular-host
+ * allocations and tallies each entry-point. Pure host-backed; no actual GPU
+ * dependency on the descriptor buffers — tests verify metadata round-trip and
+ * pointer identity, never dereference device memory through these.
+ */
+class CountingAllocator : public IAllocator {
+   public:
+    mutable int hipAllocs = 0;
+    mutable int hipFrees = 0;
+    mutable int hostAllocs = 0;
+    mutable int hostFrees = 0;
+    mutable int pinnedAllocs = 0;
+    mutable int pinnedFrees = 0;
+
+    void* allocHipMem(size_t size) const override {
+        ++hipAllocs;
+        return std::malloc(size);
+    }
+    void freeHipMem(void* ptr) const noexcept override {
+        ++hipFrees;
+        std::free(ptr);
+    }
+
+    void* allocHostMem(size_t size, int32_t /*alignment*/ = 0) const override {
+        ++hostAllocs;
+        return std::malloc(size);
+    }
+    void freeHostMem(void* ptr) const noexcept override {
+        ++hostFrees;
+        std::free(ptr);
+    }
+
+    void* allocHostPinnedMem(size_t size) const override {
+        ++pinnedAllocs;
+        return std::malloc(size);
+    }
+    void freeHostPinnedMem(void* ptr) const noexcept override {
+        ++pinnedFrees;
+        std::free(ptr);
+    }
+};
+
+// Build a single-plane GPU-resident image wrapper around a sentinel pointer (never dereferenced — pushBack
+// only reads the descriptor). Operands are widened before multiplying so the row stride is formed in 64-bit.
+Image MakeFakeGpuImage(int32_t w, int32_t h, ImageFormat fmt, void* basePtr) {
+    ImageBufferStrided buf{};
+    buf.numPlanes = 1;
+    buf.planes[0] = {w, h, static_cast<int64_t>(w) * static_cast<int64_t>(fmt.channels()), basePtr};
+    return ImageWrapData(ImageDataStridedHip(fmt, buf));
+}
+
+Image MakeFakeHostImage(int32_t w, int32_t h, ImageFormat fmt, void* basePtr) {  // host-resident variant
+    ImageBufferStrided buf{};
+    buf.numPlanes = 1;  // widen before multiplying below so the row stride is formed in 64-bit
+    buf.planes[0] = {w, h, static_cast<int64_t>(w) * static_cast<int64_t>(fmt.channels()), basePtr};
+    return ImageWrapData(ImageDataStridedHost(fmt, buf));
+}
+
+void* const FAKE_A = reinterpret_cast<void*>(0xA0000000ull);
+void* const FAKE_B = reinterpret_cast<void*>(0xB0000000ull);
+void* const FAKE_C = reinterpret_cast<void*>(0xC0000000ull);
+
+// =============================================================================
+// Construction
+// =============================================================================
+
+void TestConstruction() {
+    CountingAllocator alloc;
+    {
+        ImageBatchVarShape batch(8, alloc);
+        EXPECT_EQ(batch.capacity(), 8);
+        EXPECT_EQ(batch.numImages(), 0);
+        EXPECT_EQ(AsInt(batch.begin() == batch.end()), 1);
+    }
+    EXPECT_EQ(alloc.hipAllocs, 2);
+    EXPECT_EQ(alloc.pinnedAllocs, 2);
+    EXPECT_EQ(alloc.hipFrees, 2);
+    EXPECT_EQ(alloc.pinnedFrees, 2);
+}
+
+void TestConstructionRejectsBadCapacity() {
+    CountingAllocator alloc;
+    EXPECT_EXCEPTION(ImageBatchVarShape(0, alloc), eStatusType::INVALID_VALUE);
+    EXPECT_EXCEPTION(ImageBatchVarShape(-3, alloc), eStatusType::INVALID_VALUE);
+}
+
+// =============================================================================
+// pushBack — basic
+// =============================================================================
+
+void TestPushBackSingle() {
+    CountingAllocator alloc;
+    ImageBatchVarShape batch(4, alloc);
+
+    Image img = MakeFakeGpuImage(640, 480, FMT_RGB8, FAKE_A);
+    batch.pushBack(img);
+
+    EXPECT_EQ(batch.numImages(), 1);
+    EXPECT_EQ(batch[0].size().w, 640);
+    EXPECT_EQ(batch[0].size().h, 480);
+    EXPECT_EQ(AsInt(batch[0].format() == FMT_RGB8), 1);
+}
+
+void TestPushBackMultipleHeterogeneousSizes() {
+    CountingAllocator alloc;
+    ImageBatchVarShape batch(4, alloc);
+
+    batch.pushBack(MakeFakeGpuImage(640, 480, FMT_RGB8, FAKE_A));
+    batch.pushBack(MakeFakeGpuImage(320, 240, FMT_RGB8, FAKE_B));
+    batch.pushBack(MakeFakeGpuImage(800, 200, FMT_RGB8, FAKE_C));
+
+    EXPECT_EQ(batch.numImages(), 3);
+    EXPECT_EQ(batch.maxSize().w, 800);
+    EXPECT_EQ(batch.maxSize().h, 480);
+    EXPECT_EQ(AsInt(batch.uniqueFormat() == FMT_RGB8), 1);
+}
+
+void TestPushBackIteratorRange() {
+    CountingAllocator alloc;
+    ImageBatchVarShape batch(8, alloc);
+
+    std::vector<Image> imgs;
+    imgs.push_back(MakeFakeGpuImage(100, 100, FMT_RGB8, FAKE_A));
+    imgs.push_back(MakeFakeGpuImage(200, 200, FMT_RGB8, FAKE_B));
+    imgs.push_back(MakeFakeGpuImage(300, 300, FMT_RGB8, FAKE_C));
+
+    batch.pushBack(imgs.begin(), imgs.end());
+
+    EXPECT_EQ(batch.numImages(), 3);
+    EXPECT_EQ(batch.maxSize().w, 300);
+}
+
+// =============================================================================
+// pushBack — validation
+// =============================================================================
+
+void TestPushBackCapacityOverflow() {
+    CountingAllocator alloc;
+    ImageBatchVarShape batch(2, alloc);
+
+    batch.pushBack(MakeFakeGpuImage(64, 64, FMT_RGB8, FAKE_A));
+    batch.pushBack(MakeFakeGpuImage(64, 64, FMT_RGB8, FAKE_B));
+
+    EXPECT_EXCEPTION(batch.pushBack(MakeFakeGpuImage(64, 64, FMT_RGB8, FAKE_C)), eStatusType::OUT_OF_BOUNDS);
+}
+
+void TestPushBackHostImageRejected() {
+    CountingAllocator alloc;
+    ImageBatchVarShape batch(4, alloc);
+
+    Image cpuImg = MakeFakeHostImage(64, 64, FMT_U8, FAKE_A);
+    EXPECT_EXCEPTION(batch.pushBack(cpuImg), eStatusType::INVALID_VALUE);
+}
+
+// Note: pushBack's single-plane validation is defense-in-depth — Image's own
+// exportData() (image.cpp:118) currently hardcodes numPlanes=1 regardless of
+// the underlying buffer, so the public API can't construct a multi-plane Image
+// for this guard to fire on. The test would need to be revisited when planar
+// formats land in Image itself.
+
+void TestPushBackRangeRollbackOnFailure() {
+    CountingAllocator alloc;
+    ImageBatchVarShape batch(8, alloc);
+
+    // Pre-populate so we can confirm the rollback restores exactly the
+    // pre-call state, not just back to zero.
+    batch.pushBack(MakeFakeGpuImage(100, 100, FMT_RGB8, FAKE_A));
+    EXPECT_EQ(batch.numImages(), 1);
+
+    // Mid-range CPU image — should rollback the partially-pushed entries.
+    std::vector<Image> imgs;
+    imgs.push_back(MakeFakeGpuImage(200, 200, FMT_RGB8, FAKE_B));
+    imgs.push_back(MakeFakeHostImage(300, 300, FMT_RGB8, FAKE_C));  // Will throw.
+
+    EXPECT_EXCEPTION(batch.pushBack(imgs.begin(), imgs.end()), eStatusType::INVALID_VALUE);
+
+    // Pre-call state is intact: 1 image, original maxSize.
+    EXPECT_EQ(batch.numImages(), 1);
+    EXPECT_EQ(batch.maxSize().w, 100);
+}
+
+void TestPushBackRangeOverflowPrechecked() {
+    CountingAllocator alloc;
+    ImageBatchVarShape batch(2, alloc);
+
+    std::vector<Image> imgs;
+    imgs.push_back(MakeFakeGpuImage(10, 10, FMT_RGB8, FAKE_A));
+    imgs.push_back(MakeFakeGpuImage(20, 20, FMT_RGB8, FAKE_B));
+    imgs.push_back(MakeFakeGpuImage(30, 30, FMT_RGB8, FAKE_C));  // 3rd overflows capacity 2.
+
+    EXPECT_EXCEPTION(batch.pushBack(imgs.begin(), imgs.end()), eStatusType::OUT_OF_BOUNDS);
+    // Pre-checked: nothing was pushed.
+    EXPECT_EQ(batch.numImages(), 0);
+}
+
+// =============================================================================
+// popBack / clear
+// =============================================================================
+
+void TestPopBack() {
+    CountingAllocator alloc;
+    ImageBatchVarShape batch(4, alloc);
+
+    batch.pushBack(MakeFakeGpuImage(100, 100, FMT_RGB8, FAKE_A));
+    batch.pushBack(MakeFakeGpuImage(200, 200, FMT_RGB8, FAKE_B));
+    batch.popBack();
+
+    EXPECT_EQ(batch.numImages(), 1);
+    // maxSize was reset on pop; the rescan should drop back to 100.
+    EXPECT_EQ(batch.maxSize().w, 100);
+}
+
+void TestPopBackMultiple() {
+    CountingAllocator alloc;
+    ImageBatchVarShape batch(4, alloc);
+
+    batch.pushBack(MakeFakeGpuImage(100, 100, FMT_RGB8, FAKE_A));
+    batch.pushBack(MakeFakeGpuImage(200, 200, FMT_RGB8, FAKE_B));
+    batch.pushBack(MakeFakeGpuImage(300, 300, FMT_RGB8, FAKE_C));
+    batch.popBack(2);
+
+    EXPECT_EQ(batch.numImages(), 1);
+    EXPECT_EQ(batch.maxSize().w, 100);
+}
+
+void TestPopBackUnderflow() {
+    CountingAllocator alloc;
+    ImageBatchVarShape batch(4, alloc);
+    batch.pushBack(MakeFakeGpuImage(100, 100, FMT_RGB8, FAKE_A));
+
+    EXPECT_EXCEPTION(batch.popBack(2), eStatusType::OUT_OF_BOUNDS);
+    // State preserved.
+    EXPECT_EQ(batch.numImages(), 1);
+}
+
+void TestClearAndReuse() {
+    CountingAllocator alloc;
+    ImageBatchVarShape batch(4, alloc);
+
+    batch.pushBack(MakeFakeGpuImage(100, 100, FMT_RGB8, FAKE_A));
+    batch.pushBack(MakeFakeGpuImage(200, 200, FMT_RGB8, FAKE_B));
+    batch.clear();
+
+    EXPECT_EQ(batch.numImages(), 0);
+    EXPECT_EQ(batch.maxSize().w, 0);
+    EXPECT_EQ(AsInt(batch.uniqueFormat() == FMT_NONE), 1);
+
+    // Reuse after clear.
+    batch.pushBack(MakeFakeGpuImage(50, 50, FMT_U8, FAKE_C));
+    EXPECT_EQ(batch.numImages(), 1);
+    EXPECT_EQ(AsInt(batch.uniqueFormat() == FMT_U8), 1);
+}
+
+// =============================================================================
+// uniqueFormat / maxSize cache
+// =============================================================================
+
+void TestUniqueFormatHomogeneous() {
+    CountingAllocator alloc;
+    ImageBatchVarShape batch(4, alloc);
+    batch.pushBack(MakeFakeGpuImage(64, 64, FMT_RGB8, FAKE_A));
+    batch.pushBack(MakeFakeGpuImage(128, 128, FMT_RGB8, FAKE_B));
+    EXPECT_EQ(AsInt(batch.uniqueFormat() == FMT_RGB8), 1);
+}
+
+void TestUniqueFormatHeterogeneous() {
+    CountingAllocator alloc;
+    ImageBatchVarShape batch(4, alloc);
+    batch.pushBack(MakeFakeGpuImage(64, 64, FMT_RGB8, FAKE_A));
+    batch.pushBack(MakeFakeGpuImage(64, 64, FMT_RGBA8, FAKE_B));
+    EXPECT_EQ(AsInt(batch.uniqueFormat() == FMT_NONE), 1);
+}
+
+void TestUniqueFormatEmptyBatch() {
+    CountingAllocator alloc;
+    ImageBatchVarShape batch(4, alloc);
+    EXPECT_EQ(AsInt(batch.uniqueFormat() == FMT_NONE), 1);
+    EXPECT_EQ(batch.maxSize().w, 0);
+    EXPECT_EQ(batch.maxSize().h, 0);
+}
+
+// =============================================================================
+// exportData
+// =============================================================================
+
+// exportData tests use the default allocator instead of CountingAllocator
+// because they exercise the real H2D hipMemcpyAsync, which requires the
+// device-side buffer to be a real hipMalloc'd pointer.
+
+void TestExportDataEmpty() {
+    ImageBatchVarShape batch(4);
+
+    auto data = batch.exportData(0);
+    EXPECT_EQ(data.numImages(), 0);
+    EXPECT_EQ(data.maxSize().w, 0);
+    EXPECT_EQ(data.maxSize().h, 0);
+    EXPECT_EQ(AsInt(data.uniqueFormat() == FMT_NONE), 1);
+    EXPECT_EQ(AsInt(data.device()), AsInt(eDeviceType::GPU));
+}
+
+void TestExportDataMetadata() {
+    ImageBatchVarShape batch(4);
+    batch.pushBack(MakeFakeGpuImage(640, 480, FMT_RGB8, FAKE_A));
+    batch.pushBack(MakeFakeGpuImage(320, 240, FMT_RGB8, FAKE_B));
+
+    auto data = batch.exportData(0);
+    EXPECT_EQ(data.numImages(), 2);
+    EXPECT_EQ(data.maxSize().w, 640);
+    EXPECT_EQ(data.maxSize().h, 480);
+    EXPECT_EQ(AsInt(data.uniqueFormat() == FMT_RGB8), 1);
+    EXPECT_EQ(AsInt(data.imageList() != nullptr), 1);
+    EXPECT_EQ(AsInt(data.formatList() != nullptr), 1);
+    EXPECT_EQ(AsInt(data.hostFormatList() != nullptr), 1);
+    // Pinned host mirror format entries are immediately host-readable.
+    EXPECT_EQ(AsInt(data.hostFormatList()[0] == FMT_RGB8), 1);
+    EXPECT_EQ(AsInt(data.hostFormatList()[1] == FMT_RGB8), 1);
+}
+
+void TestExportDataCastRoundTrip() {
+    ImageBatchVarShape batch(4);
+    batch.pushBack(MakeFakeGpuImage(64, 64, FMT_RGB8, FAKE_A));
+
+    auto hipData = batch.exportData<ImageBatchVarShapeDataStridedHip>(0);
+    EXPECT_EQ(hipData.numImages(), 1);
+    EXPECT_EQ(AsInt(hipData.device()), AsInt(eDeviceType::GPU));
+
+    // Cast through the base reference: succeeds for compatible kinds, nullopt
+    // for the host-resident leaf.
+    const ImageBatchData& base = hipData;
+    EXPECT_EQ(AsInt(base.cast<ImageBatchVarShapeDataStridedHip>().has_value()), 1);
+    EXPECT_EQ(AsInt(base.cast<ImageBatchVarShapeDataStridedHost>().has_value()), 0);
+}
+
+// =============================================================================
+// Move semantics
+// =============================================================================
+
+void TestMoveConstruction() {
+    CountingAllocator alloc;
+    {
+        ImageBatchVarShape src(4, alloc);
+        src.pushBack(MakeFakeGpuImage(100, 100, FMT_RGB8, FAKE_A));
+        src.pushBack(MakeFakeGpuImage(200, 200, FMT_RGB8, FAKE_B));
+
+        ImageBatchVarShape dst(std::move(src));
+        EXPECT_EQ(dst.numImages(), 2);
+        EXPECT_EQ(dst.maxSize().w, 200);
+
+        // Source is valid-but-empty; destructor must not double-free.
+        EXPECT_EQ(src.numImages(), 0);
+        EXPECT_EQ(src.capacity(), 0);
+    }
+    // Exactly one set of allocations should have been freed.
+    EXPECT_EQ(alloc.hipAllocs, alloc.hipFrees);
+    EXPECT_EQ(alloc.pinnedAllocs, alloc.pinnedFrees);
+}
+
+// =============================================================================
+// Iterator
+// =============================================================================
+
+void TestIteratorRangeFor() {
+    CountingAllocator alloc;
+    ImageBatchVarShape batch(4, alloc);
+    batch.pushBack(MakeFakeGpuImage(100, 100, FMT_RGB8, FAKE_A));
+    batch.pushBack(MakeFakeGpuImage(200, 200, FMT_RGB8, FAKE_B));
+    batch.pushBack(MakeFakeGpuImage(300, 300, FMT_RGB8, FAKE_C));
+
+    int32_t expectedW = 100;
+    int32_t count = 0;
+    for (const Image& img : batch) {
+        EXPECT_EQ(img.size().w, expectedW);
+        expectedW += 100;
+        ++count;
+    }
+    EXPECT_EQ(count, 3);
+}
+
+}  // namespace
+
+int main(int argc, char** argv) {
+    (void)argc;
+    (void)argv;
+    TEST_CASES_BEGIN();
+
+    TEST_CASE(TestConstruction());
+    TEST_CASE(TestConstructionRejectsBadCapacity());
+
+    TEST_CASE(TestPushBackSingle());
+    TEST_CASE(TestPushBackMultipleHeterogeneousSizes());
+    TEST_CASE(TestPushBackIteratorRange());
+
+    TEST_CASE(TestPushBackCapacityOverflow());
+    TEST_CASE(TestPushBackHostImageRejected());
+    TEST_CASE(TestPushBackRangeRollbackOnFailure());
+    TEST_CASE(TestPushBackRangeOverflowPrechecked());
+
+    TEST_CASE(TestPopBack());
+    TEST_CASE(TestPopBackMultiple());
+    TEST_CASE(TestPopBackUnderflow());
+    TEST_CASE(TestClearAndReuse());
+
+    TEST_CASE(TestUniqueFormatHomogeneous());
+    TEST_CASE(TestUniqueFormatHeterogeneous());
+    TEST_CASE(TestUniqueFormatEmptyBatch());
+
+    TEST_CASE(TestExportDataEmpty());
+    TEST_CASE(TestExportDataMetadata());
+    TEST_CASE(TestExportDataCastRoundTrip());
+
+    TEST_CASE(TestMoveConstruction());
+
+    TEST_CASE(TestIteratorRangeFor());
+
+    TEST_CASES_END();
+}

From a289c2d6606107b04e4d72f79b3d93d83280df6a Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Thu, 14 May 2026 16:45:07 -0400
Subject: [PATCH 09/13] Move image exportData implementation outside of class
 definition

---
 include/core/image.hpp | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/include/core/image.hpp b/include/core/image.hpp
index 7a42d51a..ef7707a8 100644
--- a/include/core/image.hpp
+++ b/include/core/image.hpp
@@ -134,14 +134,7 @@ class Image {
      * std::bad_cast if the underlying buffer kind doesn't match Derived.
      */
     template <typename Derived>
-    Derived exportData() const {
-        ImageData data = exportData();
-        auto derived = data.cast<Derived>();
-        if (!derived.has_value()) {
-            throw std::bad_cast();
-        }
-        return derived.value();
-    }
+    Derived exportData() const;
 
    private:
     Image(const Requirements& reqs, eDeviceType device, std::shared_ptr<ImageStorage> storage);
@@ -155,6 +148,16 @@ class Image {
     std::array<int64_t, ROCCV_MAX_IMAGE_PLANES> m_planeRowStride;
 };
 
+template <typename Derived>
+Derived Image::exportData() const {
+    ImageData data = exportData();  // non-template overload produces the generic descriptor
+    auto derived = data.cast<Derived>();
+    if (!derived.has_value()) {
+        throw std::bad_cast();  // underlying buffer kind does not match Derived
+    }
+    return derived.value();
+}
+
 /**
  * @brief Wrap an externally-owned buffer as an Image without allocating.
  *

From 9055ed0b5b3c3a6c6858029b17bedacb6ab9751e Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Fri, 15 May 2026 14:24:11 -0400
Subject: [PATCH 10/13] Support batch indexing in width and height methods for
 kernel wrappers

---
 include/core/wrappers/border_wrapper.hpp      |  55 ++--
 include/core/wrappers/image_wrapper.hpp       |   8 +-
 .../core/wrappers/interpolation_wrapper.hpp   |  47 ++--
 src/op_bilateral_filter.cpp                   |  12 +-
 src/op_copy_make_border.cpp                   |   5 +-
 src/op_remap.cpp                              |   6 +-
 src/op_resize.cpp                             |  22 +-
 src/op_rotate.cpp                             |  11 +-
 src/op_warp_perspective.cpp                   |   4 +-
 .../core/wrappers/test_border_wrapper.cpp     |   7 +-
 .../wrappers/test_interpolation_wrapper.cpp   |  33 +--
 .../operators/test_op_bilateral_filter.cpp    |  15 +-
 .../operators/test_op_copy_make_border.cpp    |   3 +-
 .../cpp/src/tests/operators/test_op_remap.cpp | 234 +++++++++++-------
 .../src/tests/operators/test_op_resize.cpp    |   4 +-
 .../src/tests/operators/test_op_rotate.cpp    |   6 +-
 .../tests/operators/test_op_warp_affine.cpp   |   2 +-
 .../operators/test_op_warp_perspective.cpp    |   2 +-
 18 files changed, 266 insertions(+), 210 deletions(-)

diff --git a/include/core/wrappers/border_wrapper.hpp b/include/core/wrappers/border_wrapper.hpp
index f55493a5..08e21a3c 100644
--- a/include/core/wrappers/border_wrapper.hpp
+++ b/include/core/wrappers/border_wrapper.hpp
@@ -23,38 +23,37 @@
 
 #include <hip/hip_runtime.h>
 
+#include <algorithm>
+#include <cstdlib>
+
 #include "core/wrappers/image_wrapper.hpp"
 #include "operator_types.h"
 
 namespace roccv {
 
 /**
- * @brief Wrapper class for ImageWrapper. This extends the descriptors by defining behaviors for when tensor
- * coordinates go out of scope.
+ * @brief Wrapper class which adds border-handling behavior on top of an underlying image wrapper.
+ *
+ * Templated on the wrapper type W (e.g. ImageWrapper<T>, VarShapeImageWrapper<T>) so that the same
+ * border math is shared between uniform-shape and variable-shape image batches. The pixel value
+ * type T is recovered from W::ValueType.
  *
- * @tparam T The underlying data type of the tensor.
  * @tparam BorderType The border type to use when coordinates are out of bounds.
+ * @tparam W The underlying image wrapper type. Must expose ValueType, at(n,h,w,c), width(n), and height(n).
  */
-template <typename T, eBorderType BorderType>
+template <eBorderType BorderType, typename W>
 class BorderWrapper {
    public:
-    /**
-     * @brief Wraps an ImageWrapper and extends its capabilities to handle out of bounds coordinates.
-     *
-     * @param tensor The tensor to wrap.
-     * @param border_value The fallback border color to use when using a constant border mode.
-     */
-    BorderWrapper(const Tensor& tensor, T border_value) : m_desc(tensor), m_border_value(border_value) {}
+    using ValueType = typename W::ValueType;
 
     /**
-     * @brief Constructs a BorderWrapper from an existing ImageWrapper. Extends its capabilities to handle out of bound
-     * coordinates.
+     * @brief Constructs a BorderWrapper from an existing image wrapper. Extends its capabilities to handle out
+     * of bound coordinates.
      *
-     * @param image_wrapper The ImageWrapper to wrap around the BorderWrapper.
+     * @param image_wrapper The image wrapper to wrap around the BorderWrapper.
      * @param border_value The fallback border color to use when using a constant border mode.
      */
-    BorderWrapper(ImageWrapper<T> image_wrapper, T border_value)
-        : m_desc(image_wrapper), m_border_value(border_value) {}
+    BorderWrapper(W image_wrapper, ValueType border_value) : m_desc(image_wrapper), m_border_value(border_value) {}
 
     /**
      * @brief Returns a reference to the underlying data given image coordinates. If the coordinates fall out of bounds,
@@ -66,11 +65,14 @@ class BorderWrapper {
      * @param c The channel index.
      * @return A reference to the underlying data or a fallback border value of type T.
      */
-    __device__ __host__ const T at(int64_t n, int64_t h, int64_t w, int64_t c) const {
+    __device__ __host__ const ValueType at(int64_t n, int64_t h, int64_t w, int64_t c) const {
+        const int64_t imgWidth = width(n);
+        const int64_t imgHeight = height(n);
+
         // Constant border type implementation. This is a special case which doesn't remap values, but rather returns
         // the provided constant value.
         if constexpr (BorderType == eBorderType::BORDER_TYPE_CONSTANT) {
-            if (w < 0 || w >= width() || h < 0 || h >= height())
+            if (w < 0 || w >= imgWidth || h < 0 || h >= imgHeight)
                 return m_border_value;
             else
                 return m_desc.at(n, h, w, c);
@@ -80,13 +82,12 @@ class BorderWrapper {
         // required at image borders. While this may cause branch divergence, a good bulk of the pixels should fall
         // within image bounds and will take the same branch. This is preferred over having to do expensive calculations
         // for EVERY pixel in the image (most of which do not require said calculations).
-        if (w >= 0 && w < width() && h >= 0 && h < height()) {
+        if (w >= 0 && w < imgWidth && h >= 0 && h < imgHeight) {
             return m_desc.at(n, h, w, c);
         }
 
         // Otherwise, do some additional calculations to map the provided x and y coordinates to be within bounds.
         int64_t x = w, y = h;
-        int64_t imgWidth = width(), imgHeight = height();
 
         // Reflect border type implementation. (Note: This is NOT REFLECT101, pixels at the border will be duplicated as
         // is the intended behavior for this border mode.)
@@ -139,18 +140,20 @@ class BorderWrapper {
     }
 
     /**
-     * @brief Retrives the height of the images.
+     * @brief Retrieves the height of the image at batch index n.
      *
+     * @param n Batch index. Ignored when W is a uniform-shape wrapper.
      * @return Image height.
      */
-    __device__ __host__ inline int64_t height() const { return m_desc.height(); }
+    __device__ __host__ inline int64_t height(int64_t n = 0) const { return m_desc.height(n); }
 
     /**
-     * @brief Retrieves the width of the image.
+     * @brief Retrieves the width of the image at batch index n.
      *
+     * @param n Batch index. Ignored when W is a uniform-shape wrapper.
      * @return Image width.
      */
-    __device__ __host__ inline int64_t width() const { return m_desc.width(); }
+    __device__ __host__ inline int64_t width(int64_t n = 0) const { return m_desc.width(n); }
 
     /**
      * @brief Retrieves the number of batches in the image tensor.
@@ -167,7 +170,7 @@ class BorderWrapper {
     __device__ __host__ inline int64_t channels() const { return m_desc.channels(); }
 
    private:
-    ImageWrapper<T> m_desc;
-    T m_border_value;
+    W m_desc;
+    ValueType m_border_value;
 };
 }  // namespace roccv
\ No newline at end of file
diff --git a/include/core/wrappers/image_wrapper.hpp b/include/core/wrappers/image_wrapper.hpp
index e174c64a..4f1835f5 100644
--- a/include/core/wrappers/image_wrapper.hpp
+++ b/include/core/wrappers/image_wrapper.hpp
@@ -139,16 +139,20 @@ class ImageWrapper {
     /**
      * @brief Retrives the height of the images.
      *
+     * @param n Batch index. Ignored for uniform-shape ImageWrapper; included so the signature
+     *          matches VarShapeImageWrapper for use as a template parameter to BorderWrapper et al.
      * @return Image height.
      */
-    __device__ __host__ inline int64_t height() const { return shape.h; }
+    __device__ __host__ inline int64_t height(int64_t /*n*/ = 0) const { return shape.h; }
 
     /**
      * @brief Retrieves the width of the image.
      *
+     * @param n Batch index. Ignored for uniform-shape ImageWrapper; included so the signature
+     *          matches VarShapeImageWrapper for use as a template parameter to BorderWrapper et al.
      * @return Image width.
      */
-    __device__ __host__ inline int64_t width() const { return shape.w; }
+    __device__ __host__ inline int64_t width(int64_t /*n*/ = 0) const { return shape.w; }
 
     /**
      * @brief Retrieves the number of batches in the image tensor.
diff --git a/include/core/wrappers/interpolation_wrapper.hpp b/include/core/wrappers/interpolation_wrapper.hpp
index 7adb8cb6..68daaad8 100644
--- a/include/core/wrappers/interpolation_wrapper.hpp
+++ b/include/core/wrappers/interpolation_wrapper.hpp
@@ -23,41 +23,35 @@
 
 #include "core/detail/casting.hpp"
 #include "core/detail/math/vectorized_type_math.hpp"
-#include "core/wrappers/border_wrapper.hpp"
 #include "core/detail/vector_utils.hpp"
+#include "core/wrappers/border_wrapper.hpp"
 #include "operator_types.h"
 
 namespace roccv {
 
 /**
- * @brief A kernel-friendly wrapper which provides interpolation logic based on the given
- * coordinates. This tensor wrapper is typically only used for input tensors and does not provide write access to its
- * underlying data.
+ * @brief A kernel-friendly wrapper which provides interpolation logic on top of an underlying image wrapper.
+ *
+ * Templated on the wrapper type W (e.g. ImageWrapper<T>, VarShapeImageWrapper<T>) so that the same
+ * interpolation math is shared between uniform-shape and variable-shape image batches. The pixel value
+ * type T is recovered from W::ValueType. Read-only access; do not use for output tensors.
  *
- * @tparam T Underlying data type of the tensor data.
- * @tparam C Number of channels in data type.
  * @tparam B Border type to use for interpolation.
  * @tparam I Interpolation type to use.
+ * @tparam W The underlying image wrapper type. Must expose ValueType, at(n,h,w,c), width(n), and height(n).
  */
-template <typename T, eBorderType B, eInterpolationType I>
+template <eBorderType B, eInterpolationType I, typename W>
 class InterpolationWrapper {
    public:
-    /**
-     * @brief Wraps a roccv::Tensor in an InterpolationWrapper to provide pixel interpolation when accessing
-     * non-integer coordinate mappings.
-     *
-     * @param tensor The tensor to wrap.
-     * @param border_value A fallback border value to use in the case of a constant border mode.
-     */
-    InterpolationWrapper(const Tensor& tensor, T border_value) : m_desc(tensor, border_value) {}
+    using ValueType = typename W::ValueType;
 
     /**
-     * @brief Wraps a BorderWrapper in an Interpolation wrapper. Extends capabilities to interpolate pixel values when
+     * @brief Wraps a BorderWrapper in an InterpolationWrapper. Extends capabilities to interpolate pixel values when
      * given non-integer coordinates.
      *
      * @param borderWrapper The BorderWrapper to wrap.
      */
-    InterpolationWrapper(BorderWrapper<T, B> borderWrapper) : m_desc(borderWrapper) {}
+    InterpolationWrapper(BorderWrapper<B, W> borderWrapper) : m_desc(borderWrapper) {}
 
     /**
      * @brief This function calculates the weighting coefficients for the Catmull-Rom cubic interpolation.
@@ -92,7 +86,7 @@ class InterpolationWrapper {
      * @param w Width coordinates.
      * @return An interpolated value.
      */
-    inline __device__ __host__ const T at(int64_t n, float h, float w, int64_t c) const {
+    inline __device__ __host__ const ValueType at(int64_t n, float h, float w, int64_t c) const {
         if constexpr (I == eInterpolationType::INTERP_TYPE_NEAREST) {
             // Nearest neighbor interpolation implementation
             return m_desc.at(n, lroundf(h), lroundf(w), c);
@@ -102,7 +96,7 @@ class InterpolationWrapper {
             // -     -
             // v3 -- v4
 
-            using WorkType = detail::MakeType<float, detail::NumElements<T>>;
+            using WorkType = detail::MakeType<float, detail::NumElements<ValueType>>;
 
             int64_t x0 = static_cast<int64_t>(floorf(w));
             int64_t x1 = x0 + 1;
@@ -118,10 +112,10 @@ class InterpolationWrapper {
             auto q2 = v3 * (x1 - w) + v4 * (w - x0);
             auto q = q1 * (y1 - h) + q2 * (h - y0);
 
-            return detail::RangeCast<T>(q);
+            return detail::RangeCast<ValueType>(q);
         } else if constexpr (I == eInterpolationType::INTERP_TYPE_CUBIC) {
             using namespace roccv::detail;
-            using WorkType = detail::MakeType<float, detail::NumElements<T>>;
+            using WorkType = detail::MakeType<float, detail::NumElements<ValueType>>;
 
             // Integer coordinates for pixel (x, y)
             int64_t int_x = static_cast<int64_t>(floorf(w));
@@ -136,20 +130,21 @@ class InterpolationWrapper {
             WorkType sum = SetAll<WorkType>(0.0f);
             for (int index_y = -1; index_y <= 2; index_y++) {
                 for (int index_x = -1; index_x <= 2; index_x++) {
-                    sum += detail::RangeCast<WorkType>(m_desc.at(n, int_y + index_y, int_x + index_x, 0)) * (weight_x[index_x + 1] * weight_y[index_y + 1]);
+                    sum += detail::RangeCast<WorkType>(m_desc.at(n, int_y + index_y, int_x + index_x, 0)) *
+                           (weight_x[index_x + 1] * weight_y[index_y + 1]);
                 }
             }
 
-            return detail::RangeCast<T>(sum);
+            return detail::RangeCast<ValueType>(sum);
         }
     }
 
-    __device__ __host__ inline int64_t height() const { return m_desc.height(); }
-    __device__ __host__ inline int64_t width() const { return m_desc.width(); }
+    __device__ __host__ inline int64_t height(int64_t n = 0) const { return m_desc.height(n); }
+    __device__ __host__ inline int64_t width(int64_t n = 0) const { return m_desc.width(n); }
     __device__ __host__ inline int64_t batches() const { return m_desc.batches(); }
     __device__ __host__ inline int64_t channels() const { return m_desc.channels(); }
 
    private:
-    BorderWrapper<T, B> m_desc;
+    BorderWrapper<B, W> m_desc;
 };
 }  // namespace roccv
\ No newline at end of file
diff --git a/src/op_bilateral_filter.cpp b/src/op_bilateral_filter.cpp
index dffba8ae..c4adfb25 100644
--- a/src/op_bilateral_filter.cpp
+++ b/src/op_bilateral_filter.cpp
@@ -43,7 +43,7 @@ BilateralFilter::~BilateralFilter() {}
 template <typename T, eBorderType B>
 void dispatch_bilateral_filter_border_mode(hipStream_t stream, const Tensor &input, const Tensor &output, int diameter,
                                            float sigmaColor, float sigmaSpace, T borderValue, eDeviceType device) {
-    BorderWrapper<T, B> inputWrapper(input, borderValue);
+    BorderWrapper<B, ImageWrapper<T>> inputWrapper(ImageWrapper<T>(input), borderValue);
     ImageWrapper<T> outputWrapper(output);
 
     if (outputWrapper.channels() > 4 || outputWrapper.channels() < 1) {
@@ -61,8 +61,7 @@ void dispatch_bilateral_filter_border_mode(hipStream_t stream, const Tensor &inp
         sigmaSpace = 1.0f;
     }
 
-    const int radius =
-        (diameter <= 0) ? static_cast<int>(std::roundf(sigmaSpace * 1.5f)) : (diameter >> 1);
+    const int radius = (diameter <= 0) ? static_cast<int>(std::roundf(sigmaSpace * 1.5f)) : (diameter >> 1);
 
     float spaceCoeff = -1 / (2 * sigmaSpace * sigmaSpace);
     float colorCoeff = -1 / (2 * sigmaColor * sigmaColor);
@@ -89,9 +88,10 @@ void dispatch_bilateral_filter_border_mode(hipStream_t stream, const Tensor &inp
 
         for (int j = 0; j < divisor; j++) {
             for (int i = 0; i < dividend; i++) {
-                threads.push_back(std::thread(Kernels::Host::bilateral_filter<T, BorderWrapper<T, B>, ImageWrapper<T>>,
-                                              inputWrapper, outputWrapper, radius, rollingHeight, rollingWidth,
-                                              prevHeight, prevWidth, spaceCoeff, colorCoeff));
+                threads.push_back(
+                    std::thread(Kernels::Host::bilateral_filter<T, BorderWrapper<B, ImageWrapper<T>>, ImageWrapper<T>>,
+                                inputWrapper, outputWrapper, radius, rollingHeight, rollingWidth, prevHeight, prevWidth,
+                                spaceCoeff, colorCoeff));
                 prevWidth = rollingWidth;
                 rollingWidth += factorW;
             }
diff --git a/src/op_copy_make_border.cpp b/src/op_copy_make_border.cpp
index feacfbd9..32e4ad7a 100644
--- a/src/op_copy_make_border.cpp
+++ b/src/op_copy_make_border.cpp
@@ -38,7 +38,7 @@ namespace roccv {
 template <typename T, eBorderType BorderMode>
 void dispatch_copy_make_border_border_mode(hipStream_t stream, const Tensor& input, const Tensor& output, int32_t top,
                                            int32_t left, T border_value, eDeviceType device) {
-    BorderWrapper<T, BorderMode> in_desc(input, border_value);
+    BorderWrapper<BorderMode, ImageWrapper<T>> in_desc(ImageWrapper<T>(input), border_value);
     ImageWrapper<T> out_desc(output);
 
     switch (device) {
@@ -83,8 +83,7 @@ void dispatch_copy_make_border(hipStream_t stream, const Tensor& input, const Te
 }
 
 void CopyMakeBorder::operator()(hipStream_t stream, const Tensor& input, const Tensor& output, int32_t top,
-                                int32_t left, eBorderType border_mode, float4 border_value,
-                                eDeviceType device) const {
+                                int32_t left, eBorderType border_mode, float4 border_value, eDeviceType device) const {
     CHECK_TENSOR_DEVICE(input, device);
     CHECK_TENSOR_LAYOUT(input, eTensorLayout::TENSOR_LAYOUT_NHWC, eTensorLayout::TENSOR_LAYOUT_HWC);
     CHECK_TENSOR_DATATYPES(input, eDataType::DATA_TYPE_U8, eDataType::DATA_TYPE_S8, eDataType::DATA_TYPE_U16,
diff --git a/src/op_remap.cpp b/src/op_remap.cpp
index 0992cf44..5b01f4dc 100644
--- a/src/op_remap.cpp
+++ b/src/op_remap.cpp
@@ -77,8 +77,10 @@ void dispatch_remap_mapInterp(hipStream_t stream, const Tensor &input, const Ten
                               const eRemapType mapValueType, const bool alignCorners, const T borderValue,
                               const eDeviceType device) {
     ImageWrapper<T> outputWrapper(output);
-    InterpolationWrapper<float2, B, M> wrappedMapTensor(map, make_float2(0, 0));
-    InterpolationWrapper<T, B, I> inputWrapper(input, borderValue);
+    BorderWrapper<B, ImageWrapper<float2>> mapBorder(ImageWrapper<float2>(map), make_float2(0, 0));
+    InterpolationWrapper<B, M, ImageWrapper<float2>> wrappedMapTensor(mapBorder);
+    BorderWrapper<B, ImageWrapper<T>> inputBorder(ImageWrapper<T>(input), borderValue);
+    InterpolationWrapper<B, I, ImageWrapper<T>> inputWrapper(inputBorder);
 
     int mapBatchSize = wrappedMapTensor.batches();
 
diff --git a/src/op_resize.cpp b/src/op_resize.cpp
index d7cd0b61..71fe6889 100644
--- a/src/op_resize.cpp
+++ b/src/op_resize.cpp
@@ -25,7 +25,6 @@ THE SOFTWARE.
 #include <unordered_map>
 
 #include "common/validation_helpers.hpp"
-#include "core/detail/casting.hpp"
 #include "core/exception.hpp"
 #include "core/status_type.h"
 #include "core/wrappers/interpolation_wrapper.hpp"
@@ -38,7 +37,8 @@ template <typename T, eInterpolationType I>
 void dispatch_resize_interp(hipStream_t stream, const Tensor& input, const Tensor& output, eDeviceType device) {
     ImageWrapper<T> outputWrapper(output);
     // Resize operation should clamp values at the border (REPLICATE border mode)
-    InterpolationWrapper<T, eBorderType::BORDER_TYPE_REPLICATE, I> inputWrapper(input, T{});
+    BorderWrapper<eBorderType::BORDER_TYPE_REPLICATE, ImageWrapper<T>> inputBorder(ImageWrapper<T>(input), T{});
+    InterpolationWrapper<eBorderType::BORDER_TYPE_REPLICATE, I, ImageWrapper<T>> inputWrapper(inputBorder);
 
     float scaleX = inputWrapper.width() / static_cast<float>(outputWrapper.width());
     float scaleY = inputWrapper.height() / static_cast<float>(outputWrapper.height());
@@ -62,13 +62,13 @@ void dispatch_resize_interp(hipStream_t stream, const Tensor& input, const Tenso
 template <typename T>
 void dispatch_resize_dtype(hipStream_t stream, const Tensor& input, const Tensor& output,
                            eInterpolationType interpolation, eDeviceType device) {
-    static const std::unordered_map<
-        eInterpolationType,
-        std::function<void(hipStream_t stream, const Tensor& input, const Tensor& output, eDeviceType device)>>
-        funcs = {{eInterpolationType::INTERP_TYPE_NEAREST, dispatch_resize_interp<T, eInterpolationType::INTERP_TYPE_NEAREST>},
-                 {eInterpolationType::INTERP_TYPE_LINEAR, dispatch_resize_interp<T, eInterpolationType::INTERP_TYPE_LINEAR>},
-                 {eInterpolationType::INTERP_TYPE_CUBIC, dispatch_resize_interp<T, eInterpolationType::INTERP_TYPE_CUBIC>}
-                };
+    static const std::unordered_map<eInterpolationType, std::function<void(hipStream_t stream, const Tensor& input,
+                                                                           const Tensor& output, eDeviceType device)>>
+        funcs = {
+            {eInterpolationType::INTERP_TYPE_NEAREST,
+             dispatch_resize_interp<T, eInterpolationType::INTERP_TYPE_NEAREST>},
+            {eInterpolationType::INTERP_TYPE_LINEAR, dispatch_resize_interp<T, eInterpolationType::INTERP_TYPE_LINEAR>},
+            {eInterpolationType::INTERP_TYPE_CUBIC, dispatch_resize_interp<T, eInterpolationType::INTERP_TYPE_CUBIC>}};
 
     if (!funcs.contains(interpolation)) {
         throw Exception("Operation does not support the given interpolation mode.", eStatusType::NOT_IMPLEMENTED);
@@ -78,8 +78,8 @@ void dispatch_resize_dtype(hipStream_t stream, const Tensor& input, const Tensor
     func(stream, input, output, device);
 }
 
-void Resize::operator()(hipStream_t stream, const Tensor& input, const Tensor& output,
-                        eInterpolationType interpolation, eDeviceType device) const {
+void Resize::operator()(hipStream_t stream, const Tensor& input, const Tensor& output, eInterpolationType interpolation,
+                        eDeviceType device) const {
     CHECK_TENSOR_DEVICE(input, device);
     CHECK_TENSOR_DEVICE(output, device);
 
diff --git a/src/op_rotate.cpp b/src/op_rotate.cpp
index 28806779..c1eea211 100644
--- a/src/op_rotate.cpp
+++ b/src/op_rotate.cpp
@@ -55,7 +55,8 @@ void dispatch_rotate_interp(hipStream_t stream, const Tensor &input, const Tenso
     T borderVal = detail::SaturateCast<T>(make_float4(0.0f, 0.0f, 0.0f, 0.0f));
 
     ImageWrapper<T> outputWrap(output);
-    InterpolationWrapper<T, eBorderType::BORDER_TYPE_CONSTANT, InterpType> inputWrap(input, borderVal);
+    BorderWrapper<eBorderType::BORDER_TYPE_CONSTANT, ImageWrapper<T>> inputBorder(ImageWrapper<T>(input), borderVal);
+    InterpolationWrapper<eBorderType::BORDER_TYPE_CONSTANT, InterpType, ImageWrapper<T>> inputWrap(inputBorder);
 
     switch (device) {
         case eDeviceType::GPU: {
@@ -74,8 +75,8 @@ void dispatch_rotate_interp(hipStream_t stream, const Tensor &input, const Tenso
 }
 
 template <typename T>
-void dispatch_rotate_type(hipStream_t stream, const Tensor &input, const Tensor &output, double angleDeg,
-                          double2 shift, eInterpolationType interpolation, eDeviceType device) {
+void dispatch_rotate_type(hipStream_t stream, const Tensor &input, const Tensor &output, double angleDeg, double2 shift,
+                          eInterpolationType interpolation, eDeviceType device) {
     // clang-format off
     static const std::unordered_map<eInterpolationType,
                                     std::function<void(hipStream_t, const Tensor &, const Tensor &, double,
@@ -94,8 +95,8 @@ void dispatch_rotate_type(hipStream_t stream, const Tensor &input, const Tensor
     func(stream, input, output, angleDeg, shift, device);
 }
 
-void Rotate::operator()(hipStream_t stream, const Tensor &input, const Tensor &output, double angleDeg,
-                        double2 shift, eInterpolationType interpolation, eDeviceType device) const {
+void Rotate::operator()(hipStream_t stream, const Tensor &input, const Tensor &output, double angleDeg, double2 shift,
+                        eInterpolationType interpolation, eDeviceType device) const {
     CHECK_TENSOR_DEVICE(input, device);
     CHECK_TENSOR_CHANNELS(input, 1, 3, 4);
     CHECK_TENSOR_DATATYPES(input, eDataType::DATA_TYPE_U8, eDataType::DATA_TYPE_S8, eDataType::DATA_TYPE_U16,
diff --git a/src/op_warp_perspective.cpp b/src/op_warp_perspective.cpp
index ca77fc8c..130b8ef6 100644
--- a/src/op_warp_perspective.cpp
+++ b/src/op_warp_perspective.cpp
@@ -27,7 +27,6 @@ THE SOFTWARE.
 #include "common/validation_helpers.hpp"
 #include "core/detail/casting.hpp"
 #include "core/detail/math/math.hpp"
-#include "core/detail/type_traits.hpp"
 #include "kernels/device/warp_perspective_device.hpp"
 #include "kernels/host/warp_perspective_host.hpp"
 
@@ -37,7 +36,8 @@ void dispatch_warp_perspective_interp(hipStream_t stream, const Tensor &input, c
                                       const PerspectiveTransform transMatrix, T borderValue, eDeviceType device) {
     ArrayWrapper<float, 9> transform(transMatrix);
     ImageWrapper<T> outputWrapper(output);
-    InterpolationWrapper<T, B, I> inputWrapper(input, borderValue);
+    BorderWrapper<B, ImageWrapper<T>> inputBorder(ImageWrapper<T>(input), borderValue);
+    InterpolationWrapper<B, I, ImageWrapper<T>> inputWrapper(inputBorder);
 
     // Launch CPU/GPU kernel depending on requested device type.
     switch (device) {
diff --git a/tests/roccv/cpp/src/tests/core/wrappers/test_border_wrapper.cpp b/tests/roccv/cpp/src/tests/core/wrappers/test_border_wrapper.cpp
index 873f05dc..fd836773 100644
--- a/tests/roccv/cpp/src/tests/core/wrappers/test_border_wrapper.cpp
+++ b/tests/roccv/cpp/src/tests/core/wrappers/test_border_wrapper.cpp
@@ -115,8 +115,8 @@ int64_t GetCoordOfBorderPel(int64_t u, int64_t dimSize, eBorderType borderMode)
  * coordinates fall out of bounds.
  */
 template <typename T, typename BT = detail::BaseType<T>>
-BT GoldenBorderAt(ImageWrapper<T>& input, eBorderType borderMode, T borderValue, int64_t sample, int64_t y,
-                  int64_t x, int64_t channel) {
+BT GoldenBorderAt(ImageWrapper<T>& input, eBorderType borderMode, T borderValue, int64_t sample, int64_t y, int64_t x,
+                  int64_t channel) {
     int64_t outX = x, outY = y;
 
     if (borderMode == eBorderType::BORDER_TYPE_CONSTANT) {
@@ -161,7 +161,8 @@ void TestCorrectness(float4 borderValue, int32_t batchSize, Size2D imageSize, in
     FillVector(inputData);
 
     // BorderWrapper to calculate the actual calculated values.
-    BorderWrapper<T, BorderType> borderWrap(ImageWrapper<T>(inputData, batchSize, imageSize.w, imageSize.h), borderVal);
+    BorderWrapper<BorderType, ImageWrapper<T>> borderWrap(
+        ImageWrapper<T>(inputData, batchSize, imageSize.w, imageSize.h), borderVal);
     std::vector<BT> actualOutput(numElementsWithBorder);
     int actualIndex = 0;
     for (int batch = 0; batch < batchSize; ++batch) {
diff --git a/tests/roccv/cpp/src/tests/core/wrappers/test_interpolation_wrapper.cpp b/tests/roccv/cpp/src/tests/core/wrappers/test_interpolation_wrapper.cpp
index a4466530..5ec3176e 100644
--- a/tests/roccv/cpp/src/tests/core/wrappers/test_interpolation_wrapper.cpp
+++ b/tests/roccv/cpp/src/tests/core/wrappers/test_interpolation_wrapper.cpp
@@ -21,9 +21,9 @@
 
 #include <core/detail/casting.hpp>
 #include <core/detail/type_traits.hpp>
-#include "core/detail/vector_utils.hpp"
 #include <core/wrappers/interpolation_wrapper.hpp>
 
+#include "core/detail/vector_utils.hpp"
 #include "test_helpers.hpp"
 
 using namespace roccv;
@@ -45,7 +45,7 @@ namespace {
  * @return T The interpolated pixel.
  */
 template <typename T, eBorderType BorderType>
-T GoldenLinear(BorderWrapper<T, BorderType> input, int64_t sample, float y, float x) {
+T GoldenLinear(BorderWrapper<BorderType, ImageWrapper<T>> input, int64_t sample, float y, float x) {
     // Defines the vectorized float type for intermediate calculations.
     using WorkType = detail::MakeType<float, detail::NumComponents<T>>;
 
@@ -86,7 +86,7 @@ T GoldenLinear(BorderWrapper<T, BorderType> input, int64_t sample, float y, floa
  * @return T The interpolated pixel.
  */
 template <typename T, eBorderType BorderType>
-T GoldenNearest(BorderWrapper<T, BorderType> input, int64_t sample, float y, float x) {
+T GoldenNearest(BorderWrapper<BorderType, ImageWrapper<T>> input, int64_t sample, float y, float x) {
     // Nearest neighbor interpolation. Rounds given floating point values to the nearest integer.
     return input.at(sample, lroundf(y), lroundf(x), 0);
 }
@@ -98,7 +98,7 @@ T GoldenNearest(BorderWrapper<T, BorderType> input, int64_t sample, float y, flo
  * @return None.
  */
 void CalBicubicWeights(float dist, float* weight) {
-    const float A = -0.5f; // Note OpenCV sets alpha to -0.75f
+    const float A = -0.5f;  // Note OpenCV sets alpha to -0.75f
 
     weight[0] = ((A * (dist + 1) - 5 * A) * (dist + 1) + 8 * A) * (dist + 1) - 4 * A;
     weight[1] = ((A + 2) * dist - (A + 3)) * dist * dist + 1;
@@ -107,7 +107,8 @@ void CalBicubicWeights(float dist, float* weight) {
 }
 
 /**
- * @brief Golden model for Bicubic interpolation. This is the Catmull-Rom cubic interpolation commonly used in CV libraries.
+ * @brief Golden model for Bicubic interpolation. This is the Catmull-Rom cubic interpolation commonly used in CV
+ * libraries.
  *
  * @tparam T Image datatype.
  * @tparam BorderType Border type for boundary conditions.
@@ -118,7 +119,7 @@ void CalBicubicWeights(float dist, float* weight) {
  * @return T The interpolated pixel.
  */
 template <typename T, eBorderType BorderType>
-T GoldenBicubic(BorderWrapper<T, BorderType> input, int64_t sample, float y, float x) {
+T GoldenBicubic(BorderWrapper<BorderType, ImageWrapper<T>> input, int64_t sample, float y, float x) {
     // Defines the vectorized float type for intermediate calculations.
     using WorkType = detail::MakeType<float, detail::NumComponents<T>>;
 
@@ -135,7 +136,8 @@ T GoldenBicubic(BorderWrapper<T, BorderType> input, int64_t sample, float y, flo
     WorkType sum = SetAll<WorkType>(0.0f);
     for (int indexY = -1; indexY <= 2; indexY++) {
         for (int indexX = -1; indexX <= 2; indexX++) {
-            sum += detail::RangeCast<WorkType>(input.at(sample, intY + indexY, intX + indexX, 0)) * (weightX[indexX + 1] * weightY[indexY + 1]);
+            sum += detail::RangeCast<WorkType>(input.at(sample, intY + indexY, intX + indexX, 0)) *
+                   (weightX[indexX + 1] * weightY[indexY + 1]);
         }
     }
 
@@ -156,7 +158,7 @@ T GoldenBicubic(BorderWrapper<T, BorderType> input, int64_t sample, float y, flo
  * @return T The interpolated pixel.
  */
 template <typename T, eBorderType BorderType>
-T GoldenInterpolationAt(BorderWrapper<T, BorderType> input, int64_t sample, float y, float x,
+T GoldenInterpolationAt(BorderWrapper<BorderType, ImageWrapper<T>> input, int64_t sample, float y, float x,
                         eInterpolationType interp) {
     switch (interp) {
         case eInterpolationType::INTERP_TYPE_NEAREST:
@@ -202,9 +204,11 @@ void TestCorrectness(int64_t batchSize, Size2D imageSize, float4 borderValue, fl
     std::vector<detail::BaseType<T>> goldenOutput;
 
     // Use roccv::InterpolationWrapper to get actual output
-    InterpolationWrapper<T, BorderType, InterpType> actualWrap(
-        (BorderWrapper<T, BorderType>(ImageWrapper<T>(input, batchSize, imageSize.w, imageSize.h), borderVal)));
-    BorderWrapper<T, BorderType> goldenWrap(ImageWrapper<T>(input, batchSize, imageSize.w, imageSize.h), borderVal);
+    InterpolationWrapper<BorderType, InterpType, ImageWrapper<T>> actualWrap(
+        (BorderWrapper<BorderType, ImageWrapper<T>>(ImageWrapper<T>(input, batchSize, imageSize.w, imageSize.h),
+                                                    borderVal)));
+    BorderWrapper<BorderType, ImageWrapper<T>> goldenWrap(ImageWrapper<T>(input, batchSize, imageSize.w, imageSize.h),
+                                                          borderVal);
 
     for (int b = 0; b < batchSize; b++) {
         for (float y = 0; y < imageSize.h; y += idxDelta) {
@@ -220,7 +224,8 @@ void TestCorrectness(int64_t batchSize, Size2D imageSize, float4 borderValue, fl
             }
         }
     }
-    if constexpr (std::is_integral_v<detail::BaseType<T>> && std::is_signed_v<detail::BaseType<T>> && sizeof(detail::BaseType<T>) == 4) {
+    if constexpr (std::is_integral_v<detail::BaseType<T>> && std::is_signed_v<detail::BaseType<T>> &&
+                  sizeof(detail::BaseType<T>) == 4) {
         CompareVectorsNear(actualOutput, goldenOutput, NEAR_EQUAL_THRESHOLD * 2);
     } else {
         CompareVectorsNear(actualOutput, goldenOutput);
@@ -228,7 +233,7 @@ void TestCorrectness(int64_t batchSize, Size2D imageSize, float4 borderValue, fl
 }
 }  // namespace
 
-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
     (void)argc;
     (void)argv;
     TEST_CASES_BEGIN();
@@ -322,7 +327,7 @@ int main(int argc, char **argv) {
     TEST_CASE((TestCorrectness<float1, eBorderType::BORDER_TYPE_REFLECT, eInterpolationType::INTERP_TYPE_CUBIC>(1, {20, 53}, make_float4(0, 0, 0, 1), 0.1f)));
     TEST_CASE((TestCorrectness<float3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_CUBIC>(3, {38, 10}, make_float4(0, 0, 0, 1), 0.1f)));
     TEST_CASE((TestCorrectness<float4, eBorderType::BORDER_TYPE_WRAP, eInterpolationType::INTERP_TYPE_CUBIC>(5, {65, 21}, make_float4(1, 0.5, 0.5, 1), 0.1f)));
-     // clang-format on
+    // clang-format on
 
     TEST_CASES_END();
 }
\ No newline at end of file
diff --git a/tests/roccv/cpp/src/tests/operators/test_op_bilateral_filter.cpp b/tests/roccv/cpp/src/tests/operators/test_op_bilateral_filter.cpp
index f208962c..8ae5a10a 100644
--- a/tests/roccv/cpp/src/tests/operators/test_op_bilateral_filter.cpp
+++ b/tests/roccv/cpp/src/tests/operators/test_op_bilateral_filter.cpp
@@ -51,7 +51,8 @@ namespace {
 template <typename T, eBorderType borderMode, typename BT = detail::BaseType<T>>
 void GenerateGoldenBilateral(std::vector<BT>& input, std::vector<BT>& output, int32_t batchSize, Size2D imageSize,
                              int diameter, float sigmaColor, float sigmaSpace, T borderValue) {
-    BorderWrapper<T, borderMode> src(ImageWrapper<T>(input, batchSize, imageSize.w, imageSize.h), borderValue);
+    BorderWrapper<borderMode, ImageWrapper<T>> src(ImageWrapper<T>(input, batchSize, imageSize.w, imageSize.h),
+                                                   borderValue);
     ImageWrapper<T> dst(output, batchSize, imageSize.w, imageSize.h);
     using namespace roccv::detail;
     using Worktype = MakeType<float, NumElements<T>>;
@@ -179,9 +180,9 @@ int main(int argc, char** argv) {
     TEST_CASE((TestCorrectness<uchar, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_U8, 0, 50.0f, 1.2f, {0.0, 0.0, 0.0, 0.0},
                                                             eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_REPLICATE>(2, 20, 20, FMT_RGB8, -1, 50.0f, 1.2f,
-                                                             {0.0, 0.0, 0.0, 0.0}, eDeviceType::GPU)));
-    TEST_CASE((TestCorrectness<float1, BORDER_TYPE_WRAP>(1, 24, 24, FMT_F32, 0, 500.0f, 1.2f,
-                                                         {500.0, 500.0, 0.0, 0.0}, eDeviceType::GPU)));
+                                                              {0.0, 0.0, 0.0, 0.0}, eDeviceType::GPU)));
+    TEST_CASE((TestCorrectness<float1, BORDER_TYPE_WRAP>(1, 24, 24, FMT_F32, 0, 500.0f, 1.2f, {500.0, 500.0, 0.0, 0.0},
+                                                         eDeviceType::GPU)));
 
     TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_RGB8, 4, 50.0f, 3.0f, {0.0, 0.0, 0.0, 0.0},
                                                              eDeviceType::GPU)));
@@ -288,9 +289,9 @@ int main(int argc, char** argv) {
     TEST_CASE((TestCorrectness<uchar, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_U8, 0, 50.0f, 1.2f, {0.0, 0.0, 0.0, 0.0},
                                                             eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_REPLICATE>(2, 20, 20, FMT_RGB8, -1, 50.0f, 1.2f,
-                                                             {0.0, 0.0, 0.0, 0.0}, eDeviceType::CPU)));
-    TEST_CASE((TestCorrectness<float1, BORDER_TYPE_WRAP>(1, 24, 24, FMT_F32, 0, 500.0f, 1.2f,
-                                                         {500.0, 500.0, 0.0, 0.0}, eDeviceType::CPU)));
+                                                              {0.0, 0.0, 0.0, 0.0}, eDeviceType::CPU)));
+    TEST_CASE((TestCorrectness<float1, BORDER_TYPE_WRAP>(1, 24, 24, FMT_F32, 0, 500.0f, 1.2f, {500.0, 500.0, 0.0, 0.0},
+                                                         eDeviceType::CPU)));
 
     TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_RGB8, 4, 50.0f, 3.0f, {0.0, 0.0, 0.0, 0.0},
                                                              eDeviceType::CPU)));
diff --git a/tests/roccv/cpp/src/tests/operators/test_op_copy_make_border.cpp b/tests/roccv/cpp/src/tests/operators/test_op_copy_make_border.cpp
index 4320f04e..30050917 100644
--- a/tests/roccv/cpp/src/tests/operators/test_op_copy_make_border.cpp
+++ b/tests/roccv/cpp/src/tests/operators/test_op_copy_make_border.cpp
@@ -57,7 +57,8 @@ std::vector<BT> GoldenCopyMakeBorder(std::vector<BT> input, int batchSize, Size2
 
     // Wrap the input images in a BorderWrapper to handle out of bounds image behavior. The BorderWrapper has already
     // been tested in another test so it can be used reliably.
-    BorderWrapper<T, BorderType> inputWrap(ImageWrapper<T>(input, batchSize, inputSize.w, inputSize.h), borderVal);
+    BorderWrapper<BorderType, ImageWrapper<T>> inputWrap(ImageWrapper<T>(input, batchSize, inputSize.w, inputSize.h),
+                                                         borderVal);
 
     std::vector<BT> output(batchSize * outputSize.h * outputSize.w * channels);
     ImageWrapper<T> outputWrap(output, batchSize, outputSize.w, outputSize.h);
diff --git a/tests/roccv/cpp/src/tests/operators/test_op_remap.cpp b/tests/roccv/cpp/src/tests/operators/test_op_remap.cpp
index 634344a4..4ad9d914 100644
--- a/tests/roccv/cpp/src/tests/operators/test_op_remap.cpp
+++ b/tests/roccv/cpp/src/tests/operators/test_op_remap.cpp
@@ -25,8 +25,9 @@ THE SOFTWARE.
 #include <core/wrappers/interpolation_wrapper.hpp>
 #include <iostream>
 #include <op_remap.hpp>
-#include "core/detail/internal_structs.hpp"
+
 #include "core/detail/casting.hpp"
+#include "core/detail/internal_structs.hpp"
 #include "core/detail/math/vectorized_type_math.hpp"
 #include "core/detail/type_traits.hpp"
 #include "operator_types.h"
@@ -39,11 +40,11 @@ using namespace roccv::detail;
 // Keep all non-entrypoint functions in an anonymous namespace to prevent redefinition errors across translation units.
 namespace {
 
-RemapParams GetRemapParams(const int2 &srcSize, const int2 &dstSize, const int2 &mapSize, bool alignCorners, eRemapType mapValueType)
-{
+RemapParams GetRemapParams(const int2& srcSize, const int2& dstSize, const int2& mapSize, bool alignCorners,
+                           eRemapType mapValueType) {
     RemapParams params;
 
-    switch(mapValueType) {
+    switch (mapValueType) {
         case REMAP_ABSOLUTE:
             params.srcScale = make_float2(0.f, 0.f);
             params.mapScale = StaticCast<float2>(mapSize) / StaticCast<float2>(dstSize);
@@ -54,7 +55,7 @@ RemapParams GetRemapParams(const int2 &srcSize, const int2 &dstSize, const int2
         case REMAP_ABSOLUTE_NORMALIZED:
             params.srcScale = make_float2(0.f, 0.f);
             params.mapScale = StaticCast<float2>(mapSize) / StaticCast<float2>(dstSize);
-            params.valScale  = (StaticCast<float2>(srcSize) - (alignCorners ? 1.f : 0.f)) / 2.f;
+            params.valScale = (StaticCast<float2>(srcSize) - (alignCorners ? 1.f : 0.f)) / 2.f;
             params.srcOffset = params.valScale - (alignCorners ? 0.f : .5f);
             params.dstOffset = 0.f;
             break;
@@ -87,15 +88,15 @@ RemapParams GetRemapParams(const int2 &srcSize, const int2 &dstSize, const int2
  */
 template <typename T, eBorderType BorderType, eInterpolationType InterpType, eInterpolationType MapInterpType,
           typename BT = detail::BaseType<T>>
-std::vector<BT> GoldenRemap(std::vector<BT>& input, int32_t batchSize, int32_t mapBatchSize, int32_t inWidth, int32_t inHeight, int32_t outWidth, 
-                            int32_t outHeight, int32_t mapWidth, int32_t mapHeight, std::vector<float2>& mapData, eRemapType mapType, bool alignCorners, float4 borderValue) {
-    
+std::vector<BT> GoldenRemap(std::vector<BT>& input, int32_t batchSize, int32_t mapBatchSize, int32_t inWidth,
+                            int32_t inHeight, int32_t outWidth, int32_t outHeight, int32_t mapWidth, int32_t mapHeight,
+                            std::vector<float2>& mapData, eRemapType mapType, bool alignCorners, float4 borderValue) {
     int channels = detail::NumElements<T>;
     int outputSize = batchSize * outWidth * outHeight * channels;
     std::vector<BT> output(outputSize);
 
     // Create interpolation wrapper for input vector
-    InterpolationWrapper<T, BorderType, InterpType> src((BorderWrapper<T, BorderType>(
+    InterpolationWrapper<BorderType, InterpType, ImageWrapper<T>> src((BorderWrapper<BorderType, ImageWrapper<T>>(
         ImageWrapper<T>(input, batchSize, inWidth, inHeight), detail::SaturateCast<T>(borderValue))));
 
     // Wrap the output vector for simplified data access
@@ -103,8 +104,10 @@ std::vector<BT> GoldenRemap(std::vector<BT>& input, int32_t batchSize, int32_t m
 
     // Create an interpolation wrapper for the map tensor
     // InterpolationWrapper<float2, BorderType, MapInterpType> wrappedMapTensor(map, make_float2(0, 0));
-    InterpolationWrapper<float2, BorderType, MapInterpType> map((BorderWrapper<float2, BorderType>(
-        ImageWrapper<float2>(mapData.data(), mapBatchSize, mapWidth, mapHeight), detail::SaturateCast<float2>(borderValue))));
+    InterpolationWrapper<BorderType, MapInterpType, ImageWrapper<float2>> map(
+        (BorderWrapper<BorderType, ImageWrapper<float2>>(
+            ImageWrapper<float2>(mapData.data(), mapBatchSize, mapWidth, mapHeight),
+            detail::SaturateCast<float2>(borderValue))));
 
     int2 srcSize = make_int2(src.width(), src.height());
     int2 dstSize = make_int2(dst.width(), dst.height());
@@ -119,13 +122,12 @@ std::vector<BT> GoldenRemap(std::vector<BT>& input, int32_t batchSize, int32_t m
     for (int b = 0; b < dst.batches(); b++) {
         for (int y = 0; y < dst.height(); y++) {
             for (int x = 0; x < dst.width(); x++) {
-                
                 dstCoord.x = static_cast<float>(x);
                 dstCoord.y = static_cast<float>(y);
-                
+
                 mapCoord.x = (dstCoord.x + params.dstOffset) * params.mapScale.x;
                 mapCoord.y = (dstCoord.y + params.dstOffset) * params.mapScale.y;
-                
+
                 float2 mapValue = map.at((mapBatchSize == 1 ? 0 : b), mapCoord.y, mapCoord.x, 0);
 
                 srcCoord.x = dstCoord.x * params.srcScale.x + mapValue.x * params.valScale.x + params.srcOffset.x;
@@ -162,7 +164,8 @@ std::vector<BT> GoldenRemap(std::vector<BT>& input, int32_t batchSize, int32_t m
  */
 template <typename T, eBorderType BorderType, eInterpolationType InterpType, eInterpolationType MapInterpType,
           typename BT = detail::BaseType<T>>
-void TestCorrectness(int batchSize, int mapBatchSize, int inWidth, int inHeight, int outWidth, int outHeight, int mapWidth, int mapHeight, ImageFormat format, float4 borderValue, eRemapType mapType,
+void TestCorrectness(int batchSize, int mapBatchSize, int inWidth, int inHeight, int outWidth, int outHeight,
+                     int mapWidth, int mapHeight, ImageFormat format, float4 borderValue, eRemapType mapType,
                      bool alignCorners, eDeviceType device) {
     // Create input and output tensor based on test parameters
     Tensor input(batchSize, {inWidth, inHeight}, format, device);
@@ -174,7 +177,7 @@ void TestCorrectness(int batchSize, int mapBatchSize, int inWidth, int inHeight,
 
     // Copy generated input data into input tensor
     CopyVectorIntoTensor(input, inputData);
-    
+
     int mapSize = mapBatchSize * mapWidth * mapHeight;
 
     std::vector<float2> mapData(mapSize);
@@ -188,11 +191,10 @@ void TestCorrectness(int batchSize, int mapBatchSize, int inWidth, int inHeight,
                 }
             }
         }
-    }
-    else if (mapType == REMAP_ABSOLUTE_NORMALIZED) {
+    } else if (mapType == REMAP_ABSOLUTE_NORMALIZED) {
         for (int b = 0; b < mapBatchSize; b++) {
-            for (int y = 0; y < mapHeight; y++){
-                for (int x = 0; x < mapWidth; x++){
+            for (int y = 0; y < mapHeight; y++) {
+                for (int x = 0; x < mapWidth; x++) {
                     float normX = ((2.0f * static_cast<float>(x)) / static_cast<float>(mapWidth - 1)) - 1.0f;
                     float normY = ((2.0f * static_cast<float>(y)) / static_cast<float>(mapHeight - 1)) - 1.0f;
 
@@ -204,11 +206,10 @@ void TestCorrectness(int batchSize, int mapBatchSize, int inWidth, int inHeight,
                 }
             }
         }
-    }
-    else if (mapType == REMAP_RELATIVE_NORMALIZED) {
+    } else if (mapType == REMAP_RELATIVE_NORMALIZED) {
         for (int b = 0; b < mapBatchSize; b++) {
-            for (int y = 0; y < mapHeight; y++){
-                for (int x = 0; x < mapWidth; x++){
+            for (int y = 0; y < mapHeight; y++) {
+                for (int x = 0; x < mapWidth; x++) {
                     // Generate normalized coordinates in [-1, 1] range
                     float normX = ((2.0f * static_cast<float>(x)) / static_cast<float>(mapWidth - 1)) - 1.0f;
                     float normY = ((2.0f * static_cast<float>(y)) / static_cast<float>(mapHeight - 1)) - 1.0f;
@@ -235,7 +236,8 @@ void TestCorrectness(int batchSize, int mapBatchSize, int inWidth, int inHeight,
     hipStream_t stream;
     HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream));
     Remap op;
-    op(stream, input, output, mapTensor, InterpType, MapInterpType, mapType, alignCorners, BorderType, borderValue, device);
+    op(stream, input, output, mapTensor, InterpType, MapInterpType, mapType, alignCorners, BorderType, borderValue,
+       device);
     HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream));
     HIP_VALIDATE_NO_ERRORS(hipStreamDestroy(stream));
 
@@ -243,9 +245,9 @@ void TestCorrectness(int batchSize, int mapBatchSize, int inWidth, int inHeight,
     std::vector<BT> result(output.shape().size());
     CopyTensorIntoVector(result, output);
 
-    std::vector<BT> ref = GoldenRemap<T, BorderType, InterpType, MapInterpType>(inputData, batchSize, mapBatchSize, inWidth,
-                                                                                        inHeight, outWidth, outHeight, 
-                                                                                        mapWidth, mapHeight, mapData, mapType, alignCorners, borderValue);
+    std::vector<BT> ref = GoldenRemap<T, BorderType, InterpType, MapInterpType>(
+        inputData, batchSize, mapBatchSize, inWidth, inHeight, outWidth, outHeight, mapWidth, mapHeight, mapData,
+        mapType, alignCorners, borderValue);
 
     // Compare data in actual output versus the generated golden reference image
     CompareVectors(result, ref);
@@ -258,144 +260,186 @@ int main(int argc, char** argv) {
     TEST_CASES_BEGIN();
 
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, false, eDeviceType::GPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(1, 1, 480, 360, 480, 360, 480, 360, FMT_U8,
+                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
+                                                                        REMAP_ABSOLUTE, false, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED, false, eDeviceType::GPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED,
+        false, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED, false, eDeviceType::GPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED,
+        false, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, false, eDeviceType::GPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8,
+                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
+                                                                        REMAP_ABSOLUTE, false, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED, false, eDeviceType::GPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED,
+        false, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED, false, eDeviceType::GPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED,
+        false, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar4, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, false, eDeviceType::GPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8,
+                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
+                                                                        REMAP_ABSOLUTE, false, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar4, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED, false, eDeviceType::GPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED,
+        false, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar4, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED, false, eDeviceType::GPU)));
-    
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED,
+        false, eDeviceType::GPU)));
+
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, true, eDeviceType::GPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(1, 1, 480, 360, 480, 360, 480, 360, FMT_U8,
+                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
+                                                                        REMAP_ABSOLUTE, true, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED, true, eDeviceType::GPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED,
+        true, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED, true, eDeviceType::GPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED,
+        true, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, true, eDeviceType::GPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8,
+                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
+                                                                        REMAP_ABSOLUTE, true, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED, true, eDeviceType::GPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED,
+        true, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED, true, eDeviceType::GPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED,
+        true, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar4, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, true, eDeviceType::GPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8,
+                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
+                                                                        REMAP_ABSOLUTE, true, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar4, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED, true, eDeviceType::GPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED,
+        true, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar4, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED, true, eDeviceType::GPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED,
+        true, eDeviceType::GPU)));
 
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(
-        2, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, false, eDeviceType::GPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(2, 1, 480, 360, 480, 360, 480, 360, FMT_U8,
+                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
+                                                                        REMAP_ABSOLUTE, false, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(
-        2, 2, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, false, eDeviceType::GPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(2, 2, 480, 360, 480, 360, 480, 360, FMT_U8,
+                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
+                                                                        REMAP_ABSOLUTE, false, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(
-        2, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, true, eDeviceType::GPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(2, 1, 480, 360, 480, 360, 480, 360, FMT_U8,
+                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
+                                                                        REMAP_ABSOLUTE, true, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(
-        2, 2, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, true, eDeviceType::GPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(2, 2, 480, 360, 480, 360, 480, 360, FMT_U8,
+                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
+                                                                        REMAP_ABSOLUTE, true, eDeviceType::GPU)));
 
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, false, eDeviceType::CPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(1, 1, 480, 360, 480, 360, 480, 360, FMT_U8,
+                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
+                                                                        REMAP_ABSOLUTE, false, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED, false, eDeviceType::CPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED,
+        false, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED, false, eDeviceType::CPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED,
+        false, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, false, eDeviceType::CPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8,
+                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
+                                                                        REMAP_ABSOLUTE, false, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED, false, eDeviceType::CPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED,
+        false, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED, false, eDeviceType::CPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED,
+        false, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar4, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, false, eDeviceType::CPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8,
+                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
+                                                                        REMAP_ABSOLUTE, false, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar4, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED, false, eDeviceType::CPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED,
+        false, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar4, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED, false, eDeviceType::CPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED,
+        false, eDeviceType::CPU)));
 
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, true, eDeviceType::CPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(1, 1, 480, 360, 480, 360, 480, 360, FMT_U8,
+                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
+                                                                        REMAP_ABSOLUTE, true, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED, true, eDeviceType::CPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED,
+        true, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED, true, eDeviceType::CPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED,
+        true, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, true, eDeviceType::CPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8,
+                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
+                                                                        REMAP_ABSOLUTE, true, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED, true, eDeviceType::CPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED,
+        true, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED, true, eDeviceType::CPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED,
+        true, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar4, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, true, eDeviceType::CPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8,
+                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
+                                                                        REMAP_ABSOLUTE, true, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar4, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED, true, eDeviceType::CPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED,
+        true, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar4, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED, true, eDeviceType::CPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED,
+        true, eDeviceType::CPU)));
 
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(
-        2, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, false, eDeviceType::CPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(2, 1, 480, 360, 480, 360, 480, 360, FMT_U8,
+                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
+                                                                        REMAP_ABSOLUTE, false, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(
-        2, 2, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, false, eDeviceType::CPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(2, 2, 480, 360, 480, 360, 480, 360, FMT_U8,
+                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
+                                                                        REMAP_ABSOLUTE, false, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(
-        2, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, true, eDeviceType::CPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(2, 1, 480, 360, 480, 360, 480, 360, FMT_U8,
+                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
+                                                                        REMAP_ABSOLUTE, true, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(
-        2, 2, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, true, eDeviceType::CPU)));
-
-
+                               eInterpolationType::INTERP_TYPE_NEAREST>(2, 2, 480, 360, 480, 360, 480, 360, FMT_U8,
+                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
+                                                                        REMAP_ABSOLUTE, true, eDeviceType::CPU)));
 
     TEST_CASES_END();
 }
\ No newline at end of file
diff --git a/tests/roccv/cpp/src/tests/operators/test_op_resize.cpp b/tests/roccv/cpp/src/tests/operators/test_op_resize.cpp
index d7c385d0..2482e346 100644
--- a/tests/roccv/cpp/src/tests/operators/test_op_resize.cpp
+++ b/tests/roccv/cpp/src/tests/operators/test_op_resize.cpp
@@ -54,8 +54,8 @@ std::vector<BT> GoldenResize(std::vector<detail::BaseType<T>> &input, int batchS
 
     // Use the replicate (or clamping) border mode by default to handle out of bounds conditions with certain
     // interpolation modes.
-    InterpolationWrapper<T, eBorderType::BORDER_TYPE_REPLICATE, InterpType> inputWrap(
-        BorderWrapper<T, eBorderType::BORDER_TYPE_REPLICATE>(
+    InterpolationWrapper<eBorderType::BORDER_TYPE_REPLICATE, InterpType, ImageWrapper<T>> inputWrap(
+        BorderWrapper<eBorderType::BORDER_TYPE_REPLICATE, ImageWrapper<T>>(
             ImageWrapper<T>(input, batchSize, inputSize.w, inputSize.h), T{}));
 
     // Determine the scaling factor required to map from the output coordinates to the corresponding input coordinates
diff --git a/tests/roccv/cpp/src/tests/operators/test_op_rotate.cpp b/tests/roccv/cpp/src/tests/operators/test_op_rotate.cpp
index 56deeabb..37b81842 100644
--- a/tests/roccv/cpp/src/tests/operators/test_op_rotate.cpp
+++ b/tests/roccv/cpp/src/tests/operators/test_op_rotate.cpp
@@ -68,9 +68,9 @@ std::vector<detail::BaseType<T>> GoldenRotate(std::vector<detail::BaseType<T>>&
     T borderVal = detail::SaturateCast<T>(make_float4(0.0f, 0.0f, 0.0f, 0.0f));
 
     ImageWrapper<T> outputWrapper(output, batchSize, imageSize.w, imageSize.h);
-    InterpolationWrapper<T, eBorderType::BORDER_TYPE_CONSTANT, InterpType> inputWrapper(
-        BorderWrapper<T, eBorderType::BORDER_TYPE_CONSTANT>(ImageWrapper<T>(input, batchSize, imageSize.w, imageSize.h),
-                                                            borderVal));
+    InterpolationWrapper<eBorderType::BORDER_TYPE_CONSTANT, InterpType, ImageWrapper<T>> inputWrapper(
+        BorderWrapper<eBorderType::BORDER_TYPE_CONSTANT, ImageWrapper<T>>(
+            ImageWrapper<T>(input, batchSize, imageSize.w, imageSize.h), borderVal));
 
     /**
      * Affine warp for a combined rotation and translate looks like the following when in its inverse representation:
diff --git a/tests/roccv/cpp/src/tests/operators/test_op_warp_affine.cpp b/tests/roccv/cpp/src/tests/operators/test_op_warp_affine.cpp
index 93c91ae9..72748a1a 100644
--- a/tests/roccv/cpp/src/tests/operators/test_op_warp_affine.cpp
+++ b/tests/roccv/cpp/src/tests/operators/test_op_warp_affine.cpp
@@ -55,7 +55,7 @@ std::vector<detail::BaseType<T>> GoldenWarpAffine(std::vector<detail::BaseType<T
                                                   const std::array<float, 6>& mat, bool isInverted, int batchSize,
                                                   Size2D inputSize, Size2D outputSize, float4 borderValue) {
     // Create interpolation wrapper for input vector
-    InterpolationWrapper<T, BorderType, InterpType> inputWrap((BorderWrapper<T, BorderType>(
+    InterpolationWrapper<BorderType, InterpType, ImageWrapper<T>> inputWrap((BorderWrapper<BorderType, ImageWrapper<T>>(
         ImageWrapper<T>(input, batchSize, inputSize.w, inputSize.h), detail::SaturateCast<T>(borderValue))));
 
     // Create ImageWrapper for output vector. We also need to create said output vector.
diff --git a/tests/roccv/cpp/src/tests/operators/test_op_warp_perspective.cpp b/tests/roccv/cpp/src/tests/operators/test_op_warp_perspective.cpp
index 1461365c..be918fd6 100644
--- a/tests/roccv/cpp/src/tests/operators/test_op_warp_perspective.cpp
+++ b/tests/roccv/cpp/src/tests/operators/test_op_warp_perspective.cpp
@@ -52,7 +52,7 @@ std::vector<detail::BaseType<T>> GoldenWarpPerspective(std::vector<detail::BaseT
                                                        const std::array<float, 9>& mat, bool isInverted, int batchSize,
                                                        Size2D inputSize, Size2D outputSize, float4 borderValue) {
     // Create interpolation wrapper for input vector
-    InterpolationWrapper<T, BorderType, InterpType> inputWrap((BorderWrapper<T, BorderType>(
+    InterpolationWrapper<BorderType, InterpType, ImageWrapper<T>> inputWrap((BorderWrapper<BorderType, ImageWrapper<T>>(
         ImageWrapper<T>(input, batchSize, inputSize.w, inputSize.h), detail::SaturateCast<T>(borderValue))));
 
     // Create ImageWrapper for output vector. We also need to create said output vector.

From 9348f875ab50d1535be89cfb744bf0c48b5795dd Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Wed, 20 May 2026 10:23:11 -0400
Subject: [PATCH 11/13] Revert "Support batch indexing in width and height
 methods for kernel wrappers"

This reverts commit 9055ed0b5b3c3a6c6858029b17bedacb6ab9751e.
---
 include/core/wrappers/border_wrapper.hpp      |  55 ++--
 include/core/wrappers/image_wrapper.hpp       |   8 +-
 .../core/wrappers/interpolation_wrapper.hpp   |  47 ++--
 src/op_bilateral_filter.cpp                   |  12 +-
 src/op_copy_make_border.cpp                   |   5 +-
 src/op_remap.cpp                              |   6 +-
 src/op_resize.cpp                             |  22 +-
 src/op_rotate.cpp                             |  11 +-
 src/op_warp_perspective.cpp                   |   4 +-
 .../core/wrappers/test_border_wrapper.cpp     |   7 +-
 .../wrappers/test_interpolation_wrapper.cpp   |  33 ++-
 .../operators/test_op_bilateral_filter.cpp    |  15 +-
 .../operators/test_op_copy_make_border.cpp    |   3 +-
 .../cpp/src/tests/operators/test_op_remap.cpp | 234 +++++++-----------
 .../src/tests/operators/test_op_resize.cpp    |   4 +-
 .../src/tests/operators/test_op_rotate.cpp    |   6 +-
 .../tests/operators/test_op_warp_affine.cpp   |   2 +-
 .../operators/test_op_warp_perspective.cpp    |   2 +-
 18 files changed, 210 insertions(+), 266 deletions(-)

diff --git a/include/core/wrappers/border_wrapper.hpp b/include/core/wrappers/border_wrapper.hpp
index 08e21a3c..f55493a5 100644
--- a/include/core/wrappers/border_wrapper.hpp
+++ b/include/core/wrappers/border_wrapper.hpp
@@ -23,37 +23,38 @@
 
 #include <hip/hip_runtime.h>
 
-#include <algorithm>
-#include <cstdlib>
-
 #include "core/wrappers/image_wrapper.hpp"
 #include "operator_types.h"
 
 namespace roccv {
 
 /**
- * @brief Wrapper class which adds border-handling behavior on top of an underlying image wrapper.
- *
- * Templated on the wrapper type W (e.g. ImageWrapper<T>, VarShapeImageWrapper<T>) so that the same
- * border math is shared between uniform-shape and variable-shape image batches. The pixel value
- * type T is recovered from W::ValueType.
+ * @brief Wrapper class for ImageWrapper. This extends the descriptors by defining behaviors for when tensor
+ * coordinates fall out of bounds.
  *
+ * @tparam T The underlying data type of the tensor.
  * @tparam BorderType The border type to use when coordinates are out of bounds.
- * @tparam W The underlying image wrapper type. Must expose ValueType, at(n,h,w,c), width(n), and height(n).
  */
-template <eBorderType BorderType, typename W>
+template <typename T, eBorderType BorderType>
 class BorderWrapper {
    public:
-    using ValueType = typename W::ValueType;
+    /**
+     * @brief Wraps an ImageWrapper and extends its capabilities to handle out of bounds coordinates.
+     *
+     * @param tensor The tensor to wrap.
+     * @param border_value The fallback border color to use when using a constant border mode.
+     */
+    BorderWrapper(const Tensor& tensor, T border_value) : m_desc(tensor), m_border_value(border_value) {}
 
     /**
-     * @brief Constructs a BorderWrapper from an existing image wrapper. Extends its capabilities to handle out
-     * of bound coordinates.
+     * @brief Constructs a BorderWrapper from an existing ImageWrapper. Extends its capabilities to handle out of bound
+     * coordinates.
      *
-     * @param image_wrapper The image wrapper to wrap around the BorderWrapper.
+     * @param image_wrapper The ImageWrapper to wrap around the BorderWrapper.
      * @param border_value The fallback border color to use when using a constant border mode.
      */
-    BorderWrapper(W image_wrapper, ValueType border_value) : m_desc(image_wrapper), m_border_value(border_value) {}
+    BorderWrapper(ImageWrapper<T> image_wrapper, T border_value)
+        : m_desc(image_wrapper), m_border_value(border_value) {}
 
     /**
      * @brief Returns a reference to the underlying data given image coordinates. If the coordinates fall out of bounds,
@@ -65,14 +66,11 @@ class BorderWrapper {
      * @param c The channel index.
      * @return A reference to the underlying data or a fallback border value of type T.
      */
-    __device__ __host__ const ValueType at(int64_t n, int64_t h, int64_t w, int64_t c) const {
-        const int64_t imgWidth = width(n);
-        const int64_t imgHeight = height(n);
-
+    __device__ __host__ const T at(int64_t n, int64_t h, int64_t w, int64_t c) const {
         // Constant border type implementation. This is a special case which doesn't remap values, but rather returns
         // the provided constant value.
         if constexpr (BorderType == eBorderType::BORDER_TYPE_CONSTANT) {
-            if (w < 0 || w >= imgWidth || h < 0 || h >= imgHeight)
+            if (w < 0 || w >= width() || h < 0 || h >= height())
                 return m_border_value;
             else
                 return m_desc.at(n, h, w, c);
@@ -82,12 +80,13 @@ class BorderWrapper {
         // required at image borders. While this may cause branch divergence, a good bulk of the pixels should fall
         // within image bounds and will take the same branch. This is preferred over having to do expensive calculations
         // for EVERY pixel in the image (most of which do not require said calculations).
-        if (w >= 0 && w < imgWidth && h >= 0 && h < imgHeight) {
+        if (w >= 0 && w < width() && h >= 0 && h < height()) {
             return m_desc.at(n, h, w, c);
         }
 
         // Otherwise, do some additional calculations to map the provided x and y coordinates to be within bounds.
         int64_t x = w, y = h;
+        int64_t imgWidth = width(), imgHeight = height();
 
         // Reflect border type implementation. (Note: This is NOT REFLECT101, pixels at the border will be duplicated as
         // is the intended behavior for this border mode.)
@@ -140,20 +139,18 @@ class BorderWrapper {
     }
 
     /**
-     * @brief Retrives the height of the image at batch index n.
+     * @brief Retrieves the height of the images.
      *
-     * @param n Batch index. Ignored when W is a uniform-shape wrapper.
      * @return Image height.
      */
-    __device__ __host__ inline int64_t height(int64_t n = 0) const { return m_desc.height(n); }
+    __device__ __host__ inline int64_t height() const { return m_desc.height(); }
 
     /**
-     * @brief Retrieves the width of the image at batch index n.
+     * @brief Retrieves the width of the image.
      *
-     * @param n Batch index. Ignored when W is a uniform-shape wrapper.
      * @return Image width.
      */
-    __device__ __host__ inline int64_t width(int64_t n = 0) const { return m_desc.width(n); }
+    __device__ __host__ inline int64_t width() const { return m_desc.width(); }
 
     /**
      * @brief Retrieves the number of batches in the image tensor.
@@ -170,7 +167,7 @@ class BorderWrapper {
     __device__ __host__ inline int64_t channels() const { return m_desc.channels(); }
 
    private:
-    W m_desc;
-    ValueType m_border_value;
+    ImageWrapper<T> m_desc;
+    T m_border_value;
 };
 }  // namespace roccv
\ No newline at end of file
diff --git a/include/core/wrappers/image_wrapper.hpp b/include/core/wrappers/image_wrapper.hpp
index 4f1835f5..e174c64a 100644
--- a/include/core/wrappers/image_wrapper.hpp
+++ b/include/core/wrappers/image_wrapper.hpp
@@ -139,20 +139,16 @@ class ImageWrapper {
     /**
      * @brief Retrives the height of the images.
      *
-     * @param n Batch index. Ignored for uniform-shape ImageWrapper; included so the signature
-     *          matches VarShapeImageWrapper for use as a template parameter to BorderWrapper et al.
      * @return Image height.
      */
-    __device__ __host__ inline int64_t height(int64_t /*n*/ = 0) const { return shape.h; }
+    __device__ __host__ inline int64_t height() const { return shape.h; }
 
     /**
      * @brief Retrieves the width of the image.
      *
-     * @param n Batch index. Ignored for uniform-shape ImageWrapper; included so the signature
-     *          matches VarShapeImageWrapper for use as a template parameter to BorderWrapper et al.
      * @return Image width.
      */
-    __device__ __host__ inline int64_t width(int64_t /*n*/ = 0) const { return shape.w; }
+    __device__ __host__ inline int64_t width() const { return shape.w; }
 
     /**
      * @brief Retrieves the number of batches in the image tensor.
diff --git a/include/core/wrappers/interpolation_wrapper.hpp b/include/core/wrappers/interpolation_wrapper.hpp
index 68daaad8..7adb8cb6 100644
--- a/include/core/wrappers/interpolation_wrapper.hpp
+++ b/include/core/wrappers/interpolation_wrapper.hpp
@@ -23,35 +23,41 @@
 
 #include "core/detail/casting.hpp"
 #include "core/detail/math/vectorized_type_math.hpp"
-#include "core/detail/vector_utils.hpp"
 #include "core/wrappers/border_wrapper.hpp"
+#include "core/detail/vector_utils.hpp"
 #include "operator_types.h"
 
 namespace roccv {
 
 /**
- * @brief A kernel-friendly wrapper which provides interpolation logic on top of an underlying image wrapper.
- *
- * Templated on the wrapper type W (e.g. ImageWrapper<T>, VarShapeImageWrapper<T>) so that the same
- * interpolation math is shared between uniform-shape and variable-shape image batches. The pixel value
- * type T is recovered from W::ValueType. Read-only access; do not use for output tensors.
+ * @brief A kernel-friendly wrapper which provides interpolation logic based on the given
+ * coordinates. This tensor wrapper is typically only used for input tensors and does not provide write access to its
+ * underlying data.
  *
+ * @tparam T Underlying data type of the tensor data. Linear and cubic modes compute in a float type with as many
+ *           elements as T.
  * @tparam B Border type to use for interpolation.
  * @tparam I Interpolation type to use.
- * @tparam W The underlying image wrapper type. Must expose ValueType, at(n,h,w,c), width(n), and height(n).
  */
-template <eBorderType B, eInterpolationType I, typename W>
+template <typename T, eBorderType B, eInterpolationType I>
 class InterpolationWrapper {
    public:
-    using ValueType = typename W::ValueType;
+    /**
+     * @brief Wraps a roccv::Tensor in an InterpolationWrapper to provide pixel interpolation when accessing
+     * non-integer coordinate mappings.
+     *
+     * @param tensor The tensor to wrap.
+     * @param border_value A fallback border value to use in the case of a constant border mode.
+     */
+    InterpolationWrapper(const Tensor& tensor, T border_value) : m_desc(tensor, border_value) {}
 
     /**
-     * @brief Wraps a BorderWrapper in an InterpolationWrapper. Extends capabilities to interpolate pixel values when
+     * @brief Wraps a BorderWrapper in an InterpolationWrapper. Extends capabilities to interpolate pixel values when
      * given non-integer coordinates.
      *
      * @param borderWrapper The BorderWrapper to wrap.
      */
-    InterpolationWrapper(BorderWrapper<B, W> borderWrapper) : m_desc(borderWrapper) {}
+    InterpolationWrapper(BorderWrapper<T, B> borderWrapper) : m_desc(borderWrapper) {}
 
     /**
      * @brief This function calculates the weighting coefficients for the Catmull-Rom cubic interpolation.
@@ -86,7 +92,7 @@ class InterpolationWrapper {
      * @param w Width coordinates.
      * @return An interpolated value.
      */
-    inline __device__ __host__ const ValueType at(int64_t n, float h, float w, int64_t c) const {
+    inline __device__ __host__ const T at(int64_t n, float h, float w, int64_t c) const {
         if constexpr (I == eInterpolationType::INTERP_TYPE_NEAREST) {
             // Nearest neighbor interpolation implementation
             return m_desc.at(n, lroundf(h), lroundf(w), c);
@@ -96,7 +102,7 @@ class InterpolationWrapper {
             // -     -
             // v3 -- v4
 
-            using WorkType = detail::MakeType<float, detail::NumElements<ValueType>>;
+            using WorkType = detail::MakeType<float, detail::NumElements<T>>;
 
             int64_t x0 = static_cast<int64_t>(floorf(w));
             int64_t x1 = x0 + 1;
@@ -112,10 +118,10 @@ class InterpolationWrapper {
             auto q2 = v3 * (x1 - w) + v4 * (w - x0);
             auto q = q1 * (y1 - h) + q2 * (h - y0);
 
-            return detail::RangeCast<ValueType>(q);
+            return detail::RangeCast<T>(q);
         } else if constexpr (I == eInterpolationType::INTERP_TYPE_CUBIC) {
             using namespace roccv::detail;
-            using WorkType = detail::MakeType<float, detail::NumElements<ValueType>>;
+            using WorkType = detail::MakeType<float, detail::NumElements<T>>;
 
             // Integer coordinates for pixel (x, y)
             int64_t int_x = static_cast<int64_t>(floorf(w));
@@ -130,21 +136,20 @@ class InterpolationWrapper {
             WorkType sum = SetAll<WorkType>(0.0f);
             for (int index_y = -1; index_y <= 2; index_y++) {
                 for (int index_x = -1; index_x <= 2; index_x++) {
-                    sum += detail::RangeCast<WorkType>(m_desc.at(n, int_y + index_y, int_x + index_x, 0)) *
-                           (weight_x[index_x + 1] * weight_y[index_y + 1]);
+                    sum += detail::RangeCast<WorkType>(m_desc.at(n, int_y + index_y, int_x + index_x, 0)) * (weight_x[index_x + 1] * weight_y[index_y + 1]);
                 }
             }
 
-            return detail::RangeCast<ValueType>(sum);
+            return detail::RangeCast<T>(sum);
         }
     }
 
-    __device__ __host__ inline int64_t height(int64_t n = 0) const { return m_desc.height(n); }
-    __device__ __host__ inline int64_t width(int64_t n = 0) const { return m_desc.width(n); }
+    __device__ __host__ inline int64_t height() const { return m_desc.height(); }
+    __device__ __host__ inline int64_t width() const { return m_desc.width(); }
     __device__ __host__ inline int64_t batches() const { return m_desc.batches(); }
     __device__ __host__ inline int64_t channels() const { return m_desc.channels(); }
 
    private:
-    BorderWrapper<B, W> m_desc;
+    BorderWrapper<T, B> m_desc;
 };
 }  // namespace roccv
\ No newline at end of file
diff --git a/src/op_bilateral_filter.cpp b/src/op_bilateral_filter.cpp
index c4adfb25..dffba8ae 100644
--- a/src/op_bilateral_filter.cpp
+++ b/src/op_bilateral_filter.cpp
@@ -43,7 +43,7 @@ BilateralFilter::~BilateralFilter() {}
 template <typename T, eBorderType B>
 void dispatch_bilateral_filter_border_mode(hipStream_t stream, const Tensor &input, const Tensor &output, int diameter,
                                            float sigmaColor, float sigmaSpace, T borderValue, eDeviceType device) {
-    BorderWrapper<B, ImageWrapper<T>> inputWrapper(ImageWrapper<T>(input), borderValue);
+    BorderWrapper<T, B> inputWrapper(input, borderValue);
     ImageWrapper<T> outputWrapper(output);
 
     if (outputWrapper.channels() > 4 || outputWrapper.channels() < 1) {
@@ -61,7 +61,8 @@ void dispatch_bilateral_filter_border_mode(hipStream_t stream, const Tensor &inp
         sigmaSpace = 1.0f;
     }
 
-    const int radius = (diameter <= 0) ? static_cast<int>(std::roundf(sigmaSpace * 1.5f)) : (diameter >> 1);
+    const int radius =
+        (diameter <= 0) ? static_cast<int>(std::roundf(sigmaSpace * 1.5f)) : (diameter >> 1);
 
     float spaceCoeff = -1 / (2 * sigmaSpace * sigmaSpace);
     float colorCoeff = -1 / (2 * sigmaColor * sigmaColor);
@@ -88,10 +89,9 @@ void dispatch_bilateral_filter_border_mode(hipStream_t stream, const Tensor &inp
 
         for (int j = 0; j < divisor; j++) {
             for (int i = 0; i < dividend; i++) {
-                threads.push_back(
-                    std::thread(Kernels::Host::bilateral_filter<T, BorderWrapper<B, ImageWrapper<T>>, ImageWrapper<T>>,
-                                inputWrapper, outputWrapper, radius, rollingHeight, rollingWidth, prevHeight, prevWidth,
-                                spaceCoeff, colorCoeff));
+                threads.push_back(std::thread(Kernels::Host::bilateral_filter<T, BorderWrapper<T, B>, ImageWrapper<T>>,
+                                              inputWrapper, outputWrapper, radius, rollingHeight, rollingWidth,
+                                              prevHeight, prevWidth, spaceCoeff, colorCoeff));
                 prevWidth = rollingWidth;
                 rollingWidth += factorW;
             }
diff --git a/src/op_copy_make_border.cpp b/src/op_copy_make_border.cpp
index 32e4ad7a..feacfbd9 100644
--- a/src/op_copy_make_border.cpp
+++ b/src/op_copy_make_border.cpp
@@ -38,7 +38,7 @@ namespace roccv {
 template <typename T, eBorderType BorderMode>
 void dispatch_copy_make_border_border_mode(hipStream_t stream, const Tensor& input, const Tensor& output, int32_t top,
                                            int32_t left, T border_value, eDeviceType device) {
-    BorderWrapper<BorderMode, ImageWrapper<T>> in_desc(ImageWrapper<T>(input), border_value);
+    BorderWrapper<T, BorderMode> in_desc(input, border_value);
     ImageWrapper<T> out_desc(output);
 
     switch (device) {
@@ -83,7 +83,8 @@ void dispatch_copy_make_border(hipStream_t stream, const Tensor& input, const Te
 }
 
 void CopyMakeBorder::operator()(hipStream_t stream, const Tensor& input, const Tensor& output, int32_t top,
-                                int32_t left, eBorderType border_mode, float4 border_value, eDeviceType device) const {
+                                int32_t left, eBorderType border_mode, float4 border_value,
+                                eDeviceType device) const {
     CHECK_TENSOR_DEVICE(input, device);
     CHECK_TENSOR_LAYOUT(input, eTensorLayout::TENSOR_LAYOUT_NHWC, eTensorLayout::TENSOR_LAYOUT_HWC);
     CHECK_TENSOR_DATATYPES(input, eDataType::DATA_TYPE_U8, eDataType::DATA_TYPE_S8, eDataType::DATA_TYPE_U16,
diff --git a/src/op_remap.cpp b/src/op_remap.cpp
index 5b01f4dc..0992cf44 100644
--- a/src/op_remap.cpp
+++ b/src/op_remap.cpp
@@ -77,10 +77,8 @@ void dispatch_remap_mapInterp(hipStream_t stream, const Tensor &input, const Ten
                               const eRemapType mapValueType, const bool alignCorners, const T borderValue,
                               const eDeviceType device) {
     ImageWrapper<T> outputWrapper(output);
-    BorderWrapper<B, ImageWrapper<float2>> mapBorder(ImageWrapper<float2>(map), make_float2(0, 0));
-    InterpolationWrapper<B, M, ImageWrapper<float2>> wrappedMapTensor(mapBorder);
-    BorderWrapper<B, ImageWrapper<T>> inputBorder(ImageWrapper<T>(input), borderValue);
-    InterpolationWrapper<B, I, ImageWrapper<T>> inputWrapper(inputBorder);
+    InterpolationWrapper<float2, B, M> wrappedMapTensor(map, make_float2(0, 0));
+    InterpolationWrapper<T, B, I> inputWrapper(input, borderValue);
 
     int mapBatchSize = wrappedMapTensor.batches();
 
diff --git a/src/op_resize.cpp b/src/op_resize.cpp
index 71fe6889..d7cd0b61 100644
--- a/src/op_resize.cpp
+++ b/src/op_resize.cpp
@@ -25,6 +25,7 @@ THE SOFTWARE.
 #include <unordered_map>
 
 #include "common/validation_helpers.hpp"
+#include "core/detail/casting.hpp"
 #include "core/exception.hpp"
 #include "core/status_type.h"
 #include "core/wrappers/interpolation_wrapper.hpp"
@@ -37,8 +38,7 @@ template <typename T, eInterpolationType I>
 void dispatch_resize_interp(hipStream_t stream, const Tensor& input, const Tensor& output, eDeviceType device) {
     ImageWrapper<T> outputWrapper(output);
     // Resize operation should clamp values at the border (REPLICATE border mode)
-    BorderWrapper<eBorderType::BORDER_TYPE_REPLICATE, ImageWrapper<T>> inputBorder(ImageWrapper<T>(input), T{});
-    InterpolationWrapper<eBorderType::BORDER_TYPE_REPLICATE, I, ImageWrapper<T>> inputWrapper(inputBorder);
+    InterpolationWrapper<T, eBorderType::BORDER_TYPE_REPLICATE, I> inputWrapper(input, T{});
 
     float scaleX = inputWrapper.width() / static_cast<float>(outputWrapper.width());
     float scaleY = inputWrapper.height() / static_cast<float>(outputWrapper.height());
@@ -62,13 +62,13 @@ void dispatch_resize_interp(hipStream_t stream, const Tensor& input, const Tenso
 template <typename T>
 void dispatch_resize_dtype(hipStream_t stream, const Tensor& input, const Tensor& output,
                            eInterpolationType interpolation, eDeviceType device) {
-    static const std::unordered_map<eInterpolationType, std::function<void(hipStream_t stream, const Tensor& input,
-                                                                           const Tensor& output, eDeviceType device)>>
-        funcs = {
-            {eInterpolationType::INTERP_TYPE_NEAREST,
-             dispatch_resize_interp<T, eInterpolationType::INTERP_TYPE_NEAREST>},
-            {eInterpolationType::INTERP_TYPE_LINEAR, dispatch_resize_interp<T, eInterpolationType::INTERP_TYPE_LINEAR>},
-            {eInterpolationType::INTERP_TYPE_CUBIC, dispatch_resize_interp<T, eInterpolationType::INTERP_TYPE_CUBIC>}};
+    static const std::unordered_map<
+        eInterpolationType,
+        std::function<void(hipStream_t stream, const Tensor& input, const Tensor& output, eDeviceType device)>>
+        funcs = {{eInterpolationType::INTERP_TYPE_NEAREST, dispatch_resize_interp<T, eInterpolationType::INTERP_TYPE_NEAREST>},
+                 {eInterpolationType::INTERP_TYPE_LINEAR, dispatch_resize_interp<T, eInterpolationType::INTERP_TYPE_LINEAR>},
+                 {eInterpolationType::INTERP_TYPE_CUBIC, dispatch_resize_interp<T, eInterpolationType::INTERP_TYPE_CUBIC>}
+                };
 
     if (!funcs.contains(interpolation)) {
         throw Exception("Operation does not support the given interpolation mode.", eStatusType::NOT_IMPLEMENTED);
@@ -78,8 +78,8 @@ void dispatch_resize_dtype(hipStream_t stream, const Tensor& input, const Tensor
     func(stream, input, output, device);
 }
 
-void Resize::operator()(hipStream_t stream, const Tensor& input, const Tensor& output, eInterpolationType interpolation,
-                        eDeviceType device) const {
+void Resize::operator()(hipStream_t stream, const Tensor& input, const Tensor& output,
+                        eInterpolationType interpolation, eDeviceType device) const {
     CHECK_TENSOR_DEVICE(input, device);
     CHECK_TENSOR_DEVICE(output, device);
 
diff --git a/src/op_rotate.cpp b/src/op_rotate.cpp
index c1eea211..28806779 100644
--- a/src/op_rotate.cpp
+++ b/src/op_rotate.cpp
@@ -55,8 +55,7 @@ void dispatch_rotate_interp(hipStream_t stream, const Tensor &input, const Tenso
     T borderVal = detail::SaturateCast<T>(make_float4(0.0f, 0.0f, 0.0f, 0.0f));
 
     ImageWrapper<T> outputWrap(output);
-    BorderWrapper<eBorderType::BORDER_TYPE_CONSTANT, ImageWrapper<T>> inputBorder(ImageWrapper<T>(input), borderVal);
-    InterpolationWrapper<eBorderType::BORDER_TYPE_CONSTANT, InterpType, ImageWrapper<T>> inputWrap(inputBorder);
+    InterpolationWrapper<T, eBorderType::BORDER_TYPE_CONSTANT, InterpType> inputWrap(input, borderVal);
 
     switch (device) {
         case eDeviceType::GPU: {
@@ -75,8 +74,8 @@ void dispatch_rotate_interp(hipStream_t stream, const Tensor &input, const Tenso
 }
 
 template <typename T>
-void dispatch_rotate_type(hipStream_t stream, const Tensor &input, const Tensor &output, double angleDeg, double2 shift,
-                          eInterpolationType interpolation, eDeviceType device) {
+void dispatch_rotate_type(hipStream_t stream, const Tensor &input, const Tensor &output, double angleDeg,
+                          double2 shift, eInterpolationType interpolation, eDeviceType device) {
     // clang-format off
     static const std::unordered_map<eInterpolationType,
                                     std::function<void(hipStream_t, const Tensor &, const Tensor &, double,
@@ -95,8 +94,8 @@ void dispatch_rotate_type(hipStream_t stream, const Tensor &input, const Tensor
     func(stream, input, output, angleDeg, shift, device);
 }
 
-void Rotate::operator()(hipStream_t stream, const Tensor &input, const Tensor &output, double angleDeg, double2 shift,
-                        eInterpolationType interpolation, eDeviceType device) const {
+void Rotate::operator()(hipStream_t stream, const Tensor &input, const Tensor &output, double angleDeg,
+                        double2 shift, eInterpolationType interpolation, eDeviceType device) const {
     CHECK_TENSOR_DEVICE(input, device);
     CHECK_TENSOR_CHANNELS(input, 1, 3, 4);
     CHECK_TENSOR_DATATYPES(input, eDataType::DATA_TYPE_U8, eDataType::DATA_TYPE_S8, eDataType::DATA_TYPE_U16,
diff --git a/src/op_warp_perspective.cpp b/src/op_warp_perspective.cpp
index 130b8ef6..ca77fc8c 100644
--- a/src/op_warp_perspective.cpp
+++ b/src/op_warp_perspective.cpp
@@ -27,6 +27,7 @@ THE SOFTWARE.
 #include "common/validation_helpers.hpp"
 #include "core/detail/casting.hpp"
 #include "core/detail/math/math.hpp"
+#include "core/detail/type_traits.hpp"
 #include "kernels/device/warp_perspective_device.hpp"
 #include "kernels/host/warp_perspective_host.hpp"
 
@@ -36,8 +37,7 @@ void dispatch_warp_perspective_interp(hipStream_t stream, const Tensor &input, c
                                       const PerspectiveTransform transMatrix, T borderValue, eDeviceType device) {
     ArrayWrapper<float, 9> transform(transMatrix);
     ImageWrapper<T> outputWrapper(output);
-    BorderWrapper<B, ImageWrapper<T>> inputBorder(ImageWrapper<T>(input), borderValue);
-    InterpolationWrapper<B, I, ImageWrapper<T>> inputWrapper(inputBorder);
+    InterpolationWrapper<T, B, I> inputWrapper(input, borderValue);
 
     // Launch CPU/GPU kernel depending on requested device type.
     switch (device) {
diff --git a/tests/roccv/cpp/src/tests/core/wrappers/test_border_wrapper.cpp b/tests/roccv/cpp/src/tests/core/wrappers/test_border_wrapper.cpp
index fd836773..873f05dc 100644
--- a/tests/roccv/cpp/src/tests/core/wrappers/test_border_wrapper.cpp
+++ b/tests/roccv/cpp/src/tests/core/wrappers/test_border_wrapper.cpp
@@ -115,8 +115,8 @@ int64_t GetCoordOfBorderPel(int64_t u, int64_t dimSize, eBorderType borderMode)
  * coordinates fall out of bounds.
  */
 template <typename T, typename BT = detail::BaseType<T>>
-BT GoldenBorderAt(ImageWrapper<T>& input, eBorderType borderMode, T borderValue, int64_t sample, int64_t y, int64_t x,
-                  int64_t channel) {
+BT GoldenBorderAt(ImageWrapper<T>& input, eBorderType borderMode, T borderValue, int64_t sample, int64_t y,
+                  int64_t x, int64_t channel) {
     int64_t outX = x, outY = y;
 
     if (borderMode == eBorderType::BORDER_TYPE_CONSTANT) {
@@ -161,8 +161,7 @@ void TestCorrectness(float4 borderValue, int32_t batchSize, Size2D imageSize, in
     FillVector(inputData);
 
     // BorderWrapper to calculate the actual calculated values.
-    BorderWrapper<BorderType, ImageWrapper<T>> borderWrap(
-        ImageWrapper<T>(inputData, batchSize, imageSize.w, imageSize.h), borderVal);
+    BorderWrapper<T, BorderType> borderWrap(ImageWrapper<T>(inputData, batchSize, imageSize.w, imageSize.h), borderVal);
     std::vector<BT> actualOutput(numElementsWithBorder);
     int actualIndex = 0;
     for (int batch = 0; batch < batchSize; ++batch) {
diff --git a/tests/roccv/cpp/src/tests/core/wrappers/test_interpolation_wrapper.cpp b/tests/roccv/cpp/src/tests/core/wrappers/test_interpolation_wrapper.cpp
index 5ec3176e..a4466530 100644
--- a/tests/roccv/cpp/src/tests/core/wrappers/test_interpolation_wrapper.cpp
+++ b/tests/roccv/cpp/src/tests/core/wrappers/test_interpolation_wrapper.cpp
@@ -21,9 +21,9 @@
 
 #include <core/detail/casting.hpp>
 #include <core/detail/type_traits.hpp>
+#include "core/detail/vector_utils.hpp"
 #include <core/wrappers/interpolation_wrapper.hpp>
 
-#include "core/detail/vector_utils.hpp"
 #include "test_helpers.hpp"
 
 using namespace roccv;
@@ -45,7 +45,7 @@ namespace {
  * @return T The interpolated pixel.
  */
 template <typename T, eBorderType BorderType>
-T GoldenLinear(BorderWrapper<BorderType, ImageWrapper<T>> input, int64_t sample, float y, float x) {
+T GoldenLinear(BorderWrapper<T, BorderType> input, int64_t sample, float y, float x) {
     // Defines the vectorized float type for intermediate calculations.
     using WorkType = detail::MakeType<float, detail::NumComponents<T>>;
 
@@ -86,7 +86,7 @@ T GoldenLinear(BorderWrapper<BorderType, ImageWrapper<T>> input, int64_t sample,
  * @return T The interpolated pixel.
  */
 template <typename T, eBorderType BorderType>
-T GoldenNearest(BorderWrapper<BorderType, ImageWrapper<T>> input, int64_t sample, float y, float x) {
+T GoldenNearest(BorderWrapper<T, BorderType> input, int64_t sample, float y, float x) {
     // Nearest neighbor interpolation. Rounds given floating point values to the nearest integer.
     return input.at(sample, lroundf(y), lroundf(x), 0);
 }
@@ -98,7 +98,7 @@ T GoldenNearest(BorderWrapper<BorderType, ImageWrapper<T>> input, int64_t sample
  * @return None.
  */
 void CalBicubicWeights(float dist, float* weight) {
-    const float A = -0.5f;  // Note OpenCV sets alpha to -0.75f
+    const float A = -0.5f; // Note OpenCV sets alpha to -0.75f
 
     weight[0] = ((A * (dist + 1) - 5 * A) * (dist + 1) + 8 * A) * (dist + 1) - 4 * A;
     weight[1] = ((A + 2) * dist - (A + 3)) * dist * dist + 1;
@@ -107,8 +107,7 @@ void CalBicubicWeights(float dist, float* weight) {
 }
 
 /**
- * @brief Golden model for Bicubic interpolation. This is the Catmull-Rom cubic interpolation commonly used in CV
- * libraries.
+ * @brief Golden model for Bicubic interpolation. This is the Catmull-Rom cubic interpolation commonly used in CV libraries.
  *
  * @tparam T Image datatype.
  * @tparam BorderType Border type for boundary conditions.
@@ -119,7 +118,7 @@ void CalBicubicWeights(float dist, float* weight) {
  * @return T The interpolated pixel.
  */
 template <typename T, eBorderType BorderType>
-T GoldenBicubic(BorderWrapper<BorderType, ImageWrapper<T>> input, int64_t sample, float y, float x) {
+T GoldenBicubic(BorderWrapper<T, BorderType> input, int64_t sample, float y, float x) {
     // Defines the vectorized float type for intermediate calculations.
     using WorkType = detail::MakeType<float, detail::NumComponents<T>>;
 
@@ -136,8 +135,7 @@ T GoldenBicubic(BorderWrapper<BorderType, ImageWrapper<T>> input, int64_t sample
     WorkType sum = SetAll<WorkType>(0.0f);
     for (int indexY = -1; indexY <= 2; indexY++) {
         for (int indexX = -1; indexX <= 2; indexX++) {
-            sum += detail::RangeCast<WorkType>(input.at(sample, intY + indexY, intX + indexX, 0)) *
-                   (weightX[indexX + 1] * weightY[indexY + 1]);
+            sum += detail::RangeCast<WorkType>(input.at(sample, intY + indexY, intX + indexX, 0)) * (weightX[indexX + 1] * weightY[indexY + 1]);
         }
     }
 
@@ -158,7 +156,7 @@ T GoldenBicubic(BorderWrapper<BorderType, ImageWrapper<T>> input, int64_t sample
  * @return T The interpolated pixel.
  */
 template <typename T, eBorderType BorderType>
-T GoldenInterpolationAt(BorderWrapper<BorderType, ImageWrapper<T>> input, int64_t sample, float y, float x,
+T GoldenInterpolationAt(BorderWrapper<T, BorderType> input, int64_t sample, float y, float x,
                         eInterpolationType interp) {
     switch (interp) {
         case eInterpolationType::INTERP_TYPE_NEAREST:
@@ -204,11 +202,9 @@ void TestCorrectness(int64_t batchSize, Size2D imageSize, float4 borderValue, fl
     std::vector<detail::BaseType<T>> goldenOutput;
 
     // Use roccv::InterpolationWrapper to get actual output
-    InterpolationWrapper<BorderType, InterpType, ImageWrapper<T>> actualWrap(
-        (BorderWrapper<BorderType, ImageWrapper<T>>(ImageWrapper<T>(input, batchSize, imageSize.w, imageSize.h),
-                                                    borderVal)));
-    BorderWrapper<BorderType, ImageWrapper<T>> goldenWrap(ImageWrapper<T>(input, batchSize, imageSize.w, imageSize.h),
-                                                          borderVal);
+    InterpolationWrapper<T, BorderType, InterpType> actualWrap(
+        (BorderWrapper<T, BorderType>(ImageWrapper<T>(input, batchSize, imageSize.w, imageSize.h), borderVal)));
+    BorderWrapper<T, BorderType> goldenWrap(ImageWrapper<T>(input, batchSize, imageSize.w, imageSize.h), borderVal);
 
     for (int b = 0; b < batchSize; b++) {
         for (float y = 0; y < imageSize.h; y += idxDelta) {
@@ -224,8 +220,7 @@ void TestCorrectness(int64_t batchSize, Size2D imageSize, float4 borderValue, fl
             }
         }
     }
-    if constexpr (std::is_integral_v<detail::BaseType<T>> && std::is_signed_v<detail::BaseType<T>> &&
-                  sizeof(detail::BaseType<T>) == 4) {
+    if constexpr (std::is_integral_v<detail::BaseType<T>> && std::is_signed_v<detail::BaseType<T>> && sizeof(detail::BaseType<T>) == 4) {
         CompareVectorsNear(actualOutput, goldenOutput, NEAR_EQUAL_THRESHOLD * 2);
     } else {
         CompareVectorsNear(actualOutput, goldenOutput);
@@ -233,7 +228,7 @@ void TestCorrectness(int64_t batchSize, Size2D imageSize, float4 borderValue, fl
 }
 }  // namespace
 
-int main(int argc, char** argv) {
+int main(int argc, char **argv) {
     (void)argc;
     (void)argv;
     TEST_CASES_BEGIN();
@@ -327,7 +322,7 @@ int main(int argc, char** argv) {
     TEST_CASE((TestCorrectness<float1, eBorderType::BORDER_TYPE_REFLECT, eInterpolationType::INTERP_TYPE_CUBIC>(1, {20, 53}, make_float4(0, 0, 0, 1), 0.1f)));
     TEST_CASE((TestCorrectness<float3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_CUBIC>(3, {38, 10}, make_float4(0, 0, 0, 1), 0.1f)));
     TEST_CASE((TestCorrectness<float4, eBorderType::BORDER_TYPE_WRAP, eInterpolationType::INTERP_TYPE_CUBIC>(5, {65, 21}, make_float4(1, 0.5, 0.5, 1), 0.1f)));
-    // clang-format on
+     // clang-format on
 
     TEST_CASES_END();
 }
\ No newline at end of file
diff --git a/tests/roccv/cpp/src/tests/operators/test_op_bilateral_filter.cpp b/tests/roccv/cpp/src/tests/operators/test_op_bilateral_filter.cpp
index 8ae5a10a..f208962c 100644
--- a/tests/roccv/cpp/src/tests/operators/test_op_bilateral_filter.cpp
+++ b/tests/roccv/cpp/src/tests/operators/test_op_bilateral_filter.cpp
@@ -51,8 +51,7 @@ namespace {
 template <typename T, eBorderType borderMode, typename BT = detail::BaseType<T>>
 void GenerateGoldenBilateral(std::vector<BT>& input, std::vector<BT>& output, int32_t batchSize, Size2D imageSize,
                              int diameter, float sigmaColor, float sigmaSpace, T borderValue) {
-    BorderWrapper<borderMode, ImageWrapper<T>> src(ImageWrapper<T>(input, batchSize, imageSize.w, imageSize.h),
-                                                   borderValue);
+    BorderWrapper<T, borderMode> src(ImageWrapper<T>(input, batchSize, imageSize.w, imageSize.h), borderValue);
     ImageWrapper<T> dst(output, batchSize, imageSize.w, imageSize.h);
     using namespace roccv::detail;
     using Worktype = MakeType<float, NumElements<T>>;
@@ -180,9 +179,9 @@ int main(int argc, char** argv) {
     TEST_CASE((TestCorrectness<uchar, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_U8, 0, 50.0f, 1.2f, {0.0, 0.0, 0.0, 0.0},
                                                             eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_REPLICATE>(2, 20, 20, FMT_RGB8, -1, 50.0f, 1.2f,
-                                                              {0.0, 0.0, 0.0, 0.0}, eDeviceType::GPU)));
-    TEST_CASE((TestCorrectness<float1, BORDER_TYPE_WRAP>(1, 24, 24, FMT_F32, 0, 500.0f, 1.2f, {500.0, 500.0, 0.0, 0.0},
-                                                         eDeviceType::GPU)));
+                                                             {0.0, 0.0, 0.0, 0.0}, eDeviceType::GPU)));
+    TEST_CASE((TestCorrectness<float1, BORDER_TYPE_WRAP>(1, 24, 24, FMT_F32, 0, 500.0f, 1.2f,
+                                                         {500.0, 500.0, 0.0, 0.0}, eDeviceType::GPU)));
 
     TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_RGB8, 4, 50.0f, 3.0f, {0.0, 0.0, 0.0, 0.0},
                                                              eDeviceType::GPU)));
@@ -289,9 +288,9 @@ int main(int argc, char** argv) {
     TEST_CASE((TestCorrectness<uchar, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_U8, 0, 50.0f, 1.2f, {0.0, 0.0, 0.0, 0.0},
                                                             eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_REPLICATE>(2, 20, 20, FMT_RGB8, -1, 50.0f, 1.2f,
-                                                              {0.0, 0.0, 0.0, 0.0}, eDeviceType::CPU)));
-    TEST_CASE((TestCorrectness<float1, BORDER_TYPE_WRAP>(1, 24, 24, FMT_F32, 0, 500.0f, 1.2f, {500.0, 500.0, 0.0, 0.0},
-                                                         eDeviceType::CPU)));
+                                                             {0.0, 0.0, 0.0, 0.0}, eDeviceType::CPU)));
+    TEST_CASE((TestCorrectness<float1, BORDER_TYPE_WRAP>(1, 24, 24, FMT_F32, 0, 500.0f, 1.2f,
+                                                         {500.0, 500.0, 0.0, 0.0}, eDeviceType::CPU)));
 
     TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_RGB8, 4, 50.0f, 3.0f, {0.0, 0.0, 0.0, 0.0},
                                                              eDeviceType::CPU)));
diff --git a/tests/roccv/cpp/src/tests/operators/test_op_copy_make_border.cpp b/tests/roccv/cpp/src/tests/operators/test_op_copy_make_border.cpp
index 30050917..4320f04e 100644
--- a/tests/roccv/cpp/src/tests/operators/test_op_copy_make_border.cpp
+++ b/tests/roccv/cpp/src/tests/operators/test_op_copy_make_border.cpp
@@ -57,8 +57,7 @@ std::vector<BT> GoldenCopyMakeBorder(std::vector<BT> input, int batchSize, Size2
 
     // Wrap the input images in a BorderWrapper to handle out of bounds image behavior. The BorderWrapper has already
     // been tested in another test so it can be used reliably.
-    BorderWrapper<BorderType, ImageWrapper<T>> inputWrap(ImageWrapper<T>(input, batchSize, inputSize.w, inputSize.h),
-                                                         borderVal);
+    BorderWrapper<T, BorderType> inputWrap(ImageWrapper<T>(input, batchSize, inputSize.w, inputSize.h), borderVal);
 
     std::vector<BT> output(batchSize * outputSize.h * outputSize.w * channels);
     ImageWrapper<T> outputWrap(output, batchSize, outputSize.w, outputSize.h);
diff --git a/tests/roccv/cpp/src/tests/operators/test_op_remap.cpp b/tests/roccv/cpp/src/tests/operators/test_op_remap.cpp
index 4ad9d914..634344a4 100644
--- a/tests/roccv/cpp/src/tests/operators/test_op_remap.cpp
+++ b/tests/roccv/cpp/src/tests/operators/test_op_remap.cpp
@@ -25,9 +25,8 @@ THE SOFTWARE.
 #include <core/wrappers/interpolation_wrapper.hpp>
 #include <iostream>
 #include <op_remap.hpp>
-
-#include "core/detail/casting.hpp"
 #include "core/detail/internal_structs.hpp"
+#include "core/detail/casting.hpp"
 #include "core/detail/math/vectorized_type_math.hpp"
 #include "core/detail/type_traits.hpp"
 #include "operator_types.h"
@@ -40,11 +39,11 @@ using namespace roccv::detail;
 // Keep all non-entrypoint functions in an anonymous namespace to prevent redefinition errors across translation units.
 namespace {
 
-RemapParams GetRemapParams(const int2& srcSize, const int2& dstSize, const int2& mapSize, bool alignCorners,
-                           eRemapType mapValueType) {
+RemapParams GetRemapParams(const int2 &srcSize, const int2 &dstSize, const int2 &mapSize, bool alignCorners, eRemapType mapValueType)
+{
     RemapParams params;
 
-    switch (mapValueType) {
+    switch(mapValueType) {
         case REMAP_ABSOLUTE:
             params.srcScale = make_float2(0.f, 0.f);
             params.mapScale = StaticCast<float2>(mapSize) / StaticCast<float2>(dstSize);
@@ -55,7 +54,7 @@ RemapParams GetRemapParams(const int2& srcSize, const int2& dstSize, const int2&
         case REMAP_ABSOLUTE_NORMALIZED:
             params.srcScale = make_float2(0.f, 0.f);
             params.mapScale = StaticCast<float2>(mapSize) / StaticCast<float2>(dstSize);
-            params.valScale = (StaticCast<float2>(srcSize) - (alignCorners ? 1.f : 0.f)) / 2.f;
+            params.valScale  = (StaticCast<float2>(srcSize) - (alignCorners ? 1.f : 0.f)) / 2.f;
             params.srcOffset = params.valScale - (alignCorners ? 0.f : .5f);
             params.dstOffset = 0.f;
             break;
@@ -88,15 +87,15 @@ RemapParams GetRemapParams(const int2& srcSize, const int2& dstSize, const int2&
  */
 template <typename T, eBorderType BorderType, eInterpolationType InterpType, eInterpolationType MapInterpType,
           typename BT = detail::BaseType<T>>
-std::vector<BT> GoldenRemap(std::vector<BT>& input, int32_t batchSize, int32_t mapBatchSize, int32_t inWidth,
-                            int32_t inHeight, int32_t outWidth, int32_t outHeight, int32_t mapWidth, int32_t mapHeight,
-                            std::vector<float2>& mapData, eRemapType mapType, bool alignCorners, float4 borderValue) {
+std::vector<BT> GoldenRemap(std::vector<BT>& input, int32_t batchSize, int32_t mapBatchSize, int32_t inWidth, int32_t inHeight, int32_t outWidth, 
+                            int32_t outHeight, int32_t mapWidth, int32_t mapHeight, std::vector<float2>& mapData, eRemapType mapType, bool alignCorners, float4 borderValue) {
+    
     int channels = detail::NumElements<T>;
     int outputSize = batchSize * outWidth * outHeight * channels;
     std::vector<BT> output(outputSize);
 
     // Create interpolation wrapper for input vector
-    InterpolationWrapper<BorderType, InterpType, ImageWrapper<T>> src((BorderWrapper<BorderType, ImageWrapper<T>>(
+    InterpolationWrapper<T, BorderType, InterpType> src((BorderWrapper<T, BorderType>(
         ImageWrapper<T>(input, batchSize, inWidth, inHeight), detail::SaturateCast<T>(borderValue))));
 
     // Wrap the output vector for simplified data access
@@ -104,10 +103,8 @@ std::vector<BT> GoldenRemap(std::vector<BT>& input, int32_t batchSize, int32_t m
 
     // Create an interpolation wrapper for the map tensor
     // InterpolationWrapper<float2, BorderType, MapInterpType> wrappedMapTensor(map, make_float2(0, 0));
-    InterpolationWrapper<BorderType, MapInterpType, ImageWrapper<float2>> map(
-        (BorderWrapper<BorderType, ImageWrapper<float2>>(
-            ImageWrapper<float2>(mapData.data(), mapBatchSize, mapWidth, mapHeight),
-            detail::SaturateCast<float2>(borderValue))));
+    InterpolationWrapper<float2, BorderType, MapInterpType> map((BorderWrapper<float2, BorderType>(
+        ImageWrapper<float2>(mapData.data(), mapBatchSize, mapWidth, mapHeight), detail::SaturateCast<float2>(borderValue))));
 
     int2 srcSize = make_int2(src.width(), src.height());
     int2 dstSize = make_int2(dst.width(), dst.height());
@@ -122,12 +119,13 @@ std::vector<BT> GoldenRemap(std::vector<BT>& input, int32_t batchSize, int32_t m
     for (int b = 0; b < dst.batches(); b++) {
         for (int y = 0; y < dst.height(); y++) {
             for (int x = 0; x < dst.width(); x++) {
+                
                 dstCoord.x = static_cast<float>(x);
                 dstCoord.y = static_cast<float>(y);
-
+                
                 mapCoord.x = (dstCoord.x + params.dstOffset) * params.mapScale.x;
                 mapCoord.y = (dstCoord.y + params.dstOffset) * params.mapScale.y;
-
+                
                 float2 mapValue = map.at((mapBatchSize == 1 ? 0 : b), mapCoord.y, mapCoord.x, 0);
 
                 srcCoord.x = dstCoord.x * params.srcScale.x + mapValue.x * params.valScale.x + params.srcOffset.x;
@@ -164,8 +162,7 @@ std::vector<BT> GoldenRemap(std::vector<BT>& input, int32_t batchSize, int32_t m
  */
 template <typename T, eBorderType BorderType, eInterpolationType InterpType, eInterpolationType MapInterpType,
           typename BT = detail::BaseType<T>>
-void TestCorrectness(int batchSize, int mapBatchSize, int inWidth, int inHeight, int outWidth, int outHeight,
-                     int mapWidth, int mapHeight, ImageFormat format, float4 borderValue, eRemapType mapType,
+void TestCorrectness(int batchSize, int mapBatchSize, int inWidth, int inHeight, int outWidth, int outHeight, int mapWidth, int mapHeight, ImageFormat format, float4 borderValue, eRemapType mapType,
                      bool alignCorners, eDeviceType device) {
     // Create input and output tensor based on test parameters
     Tensor input(batchSize, {inWidth, inHeight}, format, device);
@@ -177,7 +174,7 @@ void TestCorrectness(int batchSize, int mapBatchSize, int inWidth, int inHeight,
 
     // Copy generated input data into input tensor
     CopyVectorIntoTensor(input, inputData);
-
+    
     int mapSize = mapBatchSize * mapWidth * mapHeight;
 
     std::vector<float2> mapData(mapSize);
@@ -191,10 +188,11 @@ void TestCorrectness(int batchSize, int mapBatchSize, int inWidth, int inHeight,
                 }
             }
         }
-    } else if (mapType == REMAP_ABSOLUTE_NORMALIZED) {
+    }
+    else if (mapType == REMAP_ABSOLUTE_NORMALIZED) {
         for (int b = 0; b < mapBatchSize; b++) {
-            for (int y = 0; y < mapHeight; y++) {
-                for (int x = 0; x < mapWidth; x++) {
+            for (int y = 0; y < mapHeight; y++){
+                for (int x = 0; x < mapWidth; x++){
                     float normX = ((2.0f * static_cast<float>(x)) / static_cast<float>(mapWidth - 1)) - 1.0f;
                     float normY = ((2.0f * static_cast<float>(y)) / static_cast<float>(mapHeight - 1)) - 1.0f;
 
@@ -206,10 +204,11 @@ void TestCorrectness(int batchSize, int mapBatchSize, int inWidth, int inHeight,
                 }
             }
         }
-    } else if (mapType == REMAP_RELATIVE_NORMALIZED) {
+    }
+    else if (mapType == REMAP_RELATIVE_NORMALIZED) {
         for (int b = 0; b < mapBatchSize; b++) {
-            for (int y = 0; y < mapHeight; y++) {
-                for (int x = 0; x < mapWidth; x++) {
+            for (int y = 0; y < mapHeight; y++){
+                for (int x = 0; x < mapWidth; x++){
                     // Generate normalized coordinates in [-1, 1] range
                     float normX = ((2.0f * static_cast<float>(x)) / static_cast<float>(mapWidth - 1)) - 1.0f;
                     float normY = ((2.0f * static_cast<float>(y)) / static_cast<float>(mapHeight - 1)) - 1.0f;
@@ -236,8 +235,7 @@ void TestCorrectness(int batchSize, int mapBatchSize, int inWidth, int inHeight,
     hipStream_t stream;
     HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream));
     Remap op;
-    op(stream, input, output, mapTensor, InterpType, MapInterpType, mapType, alignCorners, BorderType, borderValue,
-       device);
+    op(stream, input, output, mapTensor, InterpType, MapInterpType, mapType, alignCorners, BorderType, borderValue, device);
     HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream));
     HIP_VALIDATE_NO_ERRORS(hipStreamDestroy(stream));
 
@@ -245,9 +243,9 @@ void TestCorrectness(int batchSize, int mapBatchSize, int inWidth, int inHeight,
     std::vector<BT> result(output.shape().size());
     CopyTensorIntoVector(result, output);
 
-    std::vector<BT> ref = GoldenRemap<T, BorderType, InterpType, MapInterpType>(
-        inputData, batchSize, mapBatchSize, inWidth, inHeight, outWidth, outHeight, mapWidth, mapHeight, mapData,
-        mapType, alignCorners, borderValue);
+    std::vector<BT> ref = GoldenRemap<T, BorderType, InterpType, MapInterpType>(inputData, batchSize, mapBatchSize, inWidth,
+                                                                                        inHeight, outWidth, outHeight, 
+                                                                                        mapWidth, mapHeight, mapData, mapType, alignCorners, borderValue);
 
     // Compare data in actual output versus the generated golden reference image
     CompareVectors(result, ref);
@@ -260,186 +258,144 @@ int main(int argc, char** argv) {
     TEST_CASES_BEGIN();
 
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(1, 1, 480, 360, 480, 360, 480, 360, FMT_U8,
-                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
-                                                                        REMAP_ABSOLUTE, false, eDeviceType::GPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, false, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED,
-        false, eDeviceType::GPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED, false, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED,
-        false, eDeviceType::GPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED, false, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8,
-                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
-                                                                        REMAP_ABSOLUTE, false, eDeviceType::GPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, false, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED,
-        false, eDeviceType::GPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED, false, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED,
-        false, eDeviceType::GPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED, false, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar4, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8,
-                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
-                                                                        REMAP_ABSOLUTE, false, eDeviceType::GPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, false, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar4, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED,
-        false, eDeviceType::GPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED, false, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar4, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED,
-        false, eDeviceType::GPU)));
-
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED, false, eDeviceType::GPU)));
+    
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(1, 1, 480, 360, 480, 360, 480, 360, FMT_U8,
-                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
-                                                                        REMAP_ABSOLUTE, true, eDeviceType::GPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, true, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED,
-        true, eDeviceType::GPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED, true, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED,
-        true, eDeviceType::GPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED, true, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8,
-                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
-                                                                        REMAP_ABSOLUTE, true, eDeviceType::GPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, true, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED,
-        true, eDeviceType::GPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED, true, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED,
-        true, eDeviceType::GPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED, true, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar4, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8,
-                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
-                                                                        REMAP_ABSOLUTE, true, eDeviceType::GPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, true, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar4, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED,
-        true, eDeviceType::GPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED, true, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar4, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED,
-        true, eDeviceType::GPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED, true, eDeviceType::GPU)));
 
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(2, 1, 480, 360, 480, 360, 480, 360, FMT_U8,
-                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
-                                                                        REMAP_ABSOLUTE, false, eDeviceType::GPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(
+        2, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, false, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(2, 2, 480, 360, 480, 360, 480, 360, FMT_U8,
-                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
-                                                                        REMAP_ABSOLUTE, false, eDeviceType::GPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(
+        2, 2, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, false, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(2, 1, 480, 360, 480, 360, 480, 360, FMT_U8,
-                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
-                                                                        REMAP_ABSOLUTE, true, eDeviceType::GPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(
+        2, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, true, eDeviceType::GPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(2, 2, 480, 360, 480, 360, 480, 360, FMT_U8,
-                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
-                                                                        REMAP_ABSOLUTE, true, eDeviceType::GPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(
+        2, 2, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, true, eDeviceType::GPU)));
 
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(1, 1, 480, 360, 480, 360, 480, 360, FMT_U8,
-                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
-                                                                        REMAP_ABSOLUTE, false, eDeviceType::CPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, false, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED,
-        false, eDeviceType::CPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED, false, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED,
-        false, eDeviceType::CPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED, false, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8,
-                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
-                                                                        REMAP_ABSOLUTE, false, eDeviceType::CPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, false, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED,
-        false, eDeviceType::CPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED, false, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED,
-        false, eDeviceType::CPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED, false, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar4, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8,
-                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
-                                                                        REMAP_ABSOLUTE, false, eDeviceType::CPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, false, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar4, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED,
-        false, eDeviceType::CPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED, false, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar4, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED,
-        false, eDeviceType::CPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED, false, eDeviceType::CPU)));
 
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(1, 1, 480, 360, 480, 360, 480, 360, FMT_U8,
-                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
-                                                                        REMAP_ABSOLUTE, true, eDeviceType::CPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, true, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED,
-        true, eDeviceType::CPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED, true, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED,
-        true, eDeviceType::CPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED, true, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8,
-                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
-                                                                        REMAP_ABSOLUTE, true, eDeviceType::CPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, true, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED,
-        true, eDeviceType::CPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED, true, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar3, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED,
-        true, eDeviceType::CPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGB8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED, true, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar4, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8,
-                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
-                                                                        REMAP_ABSOLUTE, true, eDeviceType::CPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, true, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar4, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED,
-        true, eDeviceType::CPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE_NORMALIZED, true, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar4, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
                                eInterpolationType::INTERP_TYPE_NEAREST>(
-        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED,
-        true, eDeviceType::CPU)));
+        1, 1, 480, 360, 480, 360, 480, 360, FMT_RGBA8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_RELATIVE_NORMALIZED, true, eDeviceType::CPU)));
 
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(2, 1, 480, 360, 480, 360, 480, 360, FMT_U8,
-                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
-                                                                        REMAP_ABSOLUTE, false, eDeviceType::CPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(
+        2, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, false, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(2, 2, 480, 360, 480, 360, 480, 360, FMT_U8,
-                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
-                                                                        REMAP_ABSOLUTE, false, eDeviceType::CPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(
+        2, 2, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, false, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(2, 1, 480, 360, 480, 360, 480, 360, FMT_U8,
-                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
-                                                                        REMAP_ABSOLUTE, true, eDeviceType::CPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(
+        2, 1, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, true, eDeviceType::CPU)));
     TEST_CASE((TestCorrectness<uchar1, eBorderType::BORDER_TYPE_CONSTANT, eInterpolationType::INTERP_TYPE_NEAREST,
-                               eInterpolationType::INTERP_TYPE_NEAREST>(2, 2, 480, 360, 480, 360, 480, 360, FMT_U8,
-                                                                        make_float4(0.0f, 0.0f, 0.0f, 1.0f),
-                                                                        REMAP_ABSOLUTE, true, eDeviceType::CPU)));
+                               eInterpolationType::INTERP_TYPE_NEAREST>(
+        2, 2, 480, 360, 480, 360, 480, 360, FMT_U8, make_float4(0.0f, 0.0f, 0.0f, 1.0f), REMAP_ABSOLUTE, true, eDeviceType::CPU)));
+
+
 
     TEST_CASES_END();
 }
\ No newline at end of file
diff --git a/tests/roccv/cpp/src/tests/operators/test_op_resize.cpp b/tests/roccv/cpp/src/tests/operators/test_op_resize.cpp
index 2482e346..d7c385d0 100644
--- a/tests/roccv/cpp/src/tests/operators/test_op_resize.cpp
+++ b/tests/roccv/cpp/src/tests/operators/test_op_resize.cpp
@@ -54,8 +54,8 @@ std::vector<BT> GoldenResize(std::vector<detail::BaseType<T>> &input, int batchS
 
     // Use the replicate (or clamping) border mode by default to handle out of bounds conditions with certain
     // interpolation modes.
-    InterpolationWrapper<eBorderType::BORDER_TYPE_REPLICATE, InterpType, ImageWrapper<T>> inputWrap(
-        BorderWrapper<eBorderType::BORDER_TYPE_REPLICATE, ImageWrapper<T>>(
+    InterpolationWrapper<T, eBorderType::BORDER_TYPE_REPLICATE, InterpType> inputWrap(
+        BorderWrapper<T, eBorderType::BORDER_TYPE_REPLICATE>(
             ImageWrapper<T>(input, batchSize, inputSize.w, inputSize.h), T{}));
 
     // Determine the scaling factor required to map from the output coordinates to the corresponding input coordinates
diff --git a/tests/roccv/cpp/src/tests/operators/test_op_rotate.cpp b/tests/roccv/cpp/src/tests/operators/test_op_rotate.cpp
index 37b81842..56deeabb 100644
--- a/tests/roccv/cpp/src/tests/operators/test_op_rotate.cpp
+++ b/tests/roccv/cpp/src/tests/operators/test_op_rotate.cpp
@@ -68,9 +68,9 @@ std::vector<detail::BaseType<T>> GoldenRotate(std::vector<detail::BaseType<T>>&
     T borderVal = detail::SaturateCast<T>(make_float4(0.0f, 0.0f, 0.0f, 0.0f));
 
     ImageWrapper<T> outputWrapper(output, batchSize, imageSize.w, imageSize.h);
-    InterpolationWrapper<eBorderType::BORDER_TYPE_CONSTANT, InterpType, ImageWrapper<T>> inputWrapper(
-        BorderWrapper<eBorderType::BORDER_TYPE_CONSTANT, ImageWrapper<T>>(
-            ImageWrapper<T>(input, batchSize, imageSize.w, imageSize.h), borderVal));
+    InterpolationWrapper<T, eBorderType::BORDER_TYPE_CONSTANT, InterpType> inputWrapper(
+        BorderWrapper<T, eBorderType::BORDER_TYPE_CONSTANT>(ImageWrapper<T>(input, batchSize, imageSize.w, imageSize.h),
+                                                            borderVal));
 
     /**
      * Affine warp for a combined rotation and translate looks like the following when in its inverse representation:
diff --git a/tests/roccv/cpp/src/tests/operators/test_op_warp_affine.cpp b/tests/roccv/cpp/src/tests/operators/test_op_warp_affine.cpp
index 72748a1a..93c91ae9 100644
--- a/tests/roccv/cpp/src/tests/operators/test_op_warp_affine.cpp
+++ b/tests/roccv/cpp/src/tests/operators/test_op_warp_affine.cpp
@@ -55,7 +55,7 @@ std::vector<detail::BaseType<T>> GoldenWarpAffine(std::vector<detail::BaseType<T
                                                   const std::array<float, 6>& mat, bool isInverted, int batchSize,
                                                   Size2D inputSize, Size2D outputSize, float4 borderValue) {
     // Create interpolation wrapper for input vector
-    InterpolationWrapper<BorderType, InterpType, ImageWrapper<T>> inputWrap((BorderWrapper<BorderType, ImageWrapper<T>>(
+    InterpolationWrapper<T, BorderType, InterpType> inputWrap((BorderWrapper<T, BorderType>(
         ImageWrapper<T>(input, batchSize, inputSize.w, inputSize.h), detail::SaturateCast<T>(borderValue))));
 
     // Create ImageWrapper for output vector. We also need to create said output vector.
diff --git a/tests/roccv/cpp/src/tests/operators/test_op_warp_perspective.cpp b/tests/roccv/cpp/src/tests/operators/test_op_warp_perspective.cpp
index be918fd6..1461365c 100644
--- a/tests/roccv/cpp/src/tests/operators/test_op_warp_perspective.cpp
+++ b/tests/roccv/cpp/src/tests/operators/test_op_warp_perspective.cpp
@@ -52,7 +52,7 @@ std::vector<detail::BaseType<T>> GoldenWarpPerspective(std::vector<detail::BaseT
                                                        const std::array<float, 9>& mat, bool isInverted, int batchSize,
                                                        Size2D inputSize, Size2D outputSize, float4 borderValue) {
     // Create interpolation wrapper for input vector
-    InterpolationWrapper<BorderType, InterpType, ImageWrapper<T>> inputWrap((BorderWrapper<BorderType, ImageWrapper<T>>(
+    InterpolationWrapper<T, BorderType, InterpType> inputWrap((BorderWrapper<T, BorderType>(
         ImageWrapper<T>(input, batchSize, inputSize.w, inputSize.h), detail::SaturateCast<T>(borderValue))));
 
     // Create ImageWrapper for output vector. We also need to create said output vector.

From 8b3b1b6017c575772452cec994061dc4bb047849 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Wed, 20 May 2026 10:50:44 -0400
Subject: [PATCH 12/13] Move common helpers to image_test_helpers.hpp

---
 .../roccv/cpp/include/image_test_helpers.hpp  | 128 ++++++++++++++++
 .../cpp/src/tests/core/image/test_image.cpp   |  57 +------
 .../core/image/test_image_batch_data.cpp      |  22 +--
 .../core/image/test_image_batch_var_shape.cpp | 144 +++++-------------
 .../src/tests/core/image/test_image_data.cpp  |  15 +-
 5 files changed, 174 insertions(+), 192 deletions(-)
 create mode 100644 tests/roccv/cpp/include/image_test_helpers.hpp

diff --git a/tests/roccv/cpp/include/image_test_helpers.hpp b/tests/roccv/cpp/include/image_test_helpers.hpp
new file mode 100644
index 00000000..f7318f89
--- /dev/null
+++ b/tests/roccv/cpp/include/image_test_helpers.hpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#pragma once
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <core/detail/allocators/i_allocator.hpp>
+#include <core/image.hpp>
+#include <core/image_buffer.hpp>
+#include <core/image_data.hpp>
+#include <core/image_format.hpp>
+
+namespace roccv {
+namespace tests {
+
+// Opaque sentinel pointers used by image-layer tests. ImageData / ImageBatchData
+// carry pointers but never dereference them — the buffer is a metadata snapshot
+// only — so tests use these to verify values flow through without needing real
+// allocations. Not real addresses: compare them, never read through them.
+inline void* const FAKE_PTR_A = reinterpret_cast<void*>(0xAAAAAAAAull);  // `inline`: one definition across TUs
+inline void* const FAKE_PTR_B = reinterpret_cast<void*>(0xBBBBBBBBull);
+inline void* const FAKE_PTR_C = reinterpret_cast<void*>(0xCCCCCCCCull);
+
+/**
+ * @brief Test allocator that backs every allocation kind with malloc and tallies
+ * how many times each entry point is invoked. Pure host-backed; no actual GPU
+ * dependency on the returned pointers — callers that exercise the Hip/pinned
+ * paths must only inspect metadata, never dereference device memory.
+ * Counters are `mutable` because every overridden entry point is const.
+ * `lastAllocBytes` is updated from every alloc path (hip, host, pinned), so
+ * callers may assert on the most recent allocation regardless of kind.
+ */
+class CountingAllocator : public IAllocator {
+   public:
+    mutable int hipAllocs = 0;          // calls to allocHipMem
+    mutable int hipFrees = 0;           // calls to freeHipMem
+    mutable int hostAllocs = 0;         // calls to allocHostMem
+    mutable int hostFrees = 0;          // calls to freeHostMem
+    mutable int pinnedAllocs = 0;       // calls to allocHostPinnedMem
+    mutable int pinnedFrees = 0;        // calls to freeHostPinnedMem
+    mutable size_t lastAllocBytes = 0;  // size passed to the most recent alloc* call
+
+    void* allocHipMem(size_t size) const override {  // host malloc stands in for device memory
+        ++hipAllocs;
+        lastAllocBytes = size;
+        return std::malloc(size);
+    }
+    void freeHipMem(void* ptr) const noexcept override {
+        ++hipFrees;
+        std::free(ptr);
+    }
+
+    void* allocHostMem(size_t size, int32_t /*alignment*/ = 0) const override {  // alignment request is ignored
+        ++hostAllocs;
+        lastAllocBytes = size;
+        return std::malloc(size);
+    }
+    void freeHostMem(void* ptr) const noexcept override {
+        ++hostFrees;
+        std::free(ptr);
+    }
+
+    void* allocHostPinnedMem(size_t size) const override {  // plain malloc; nothing is actually pinned
+        ++pinnedAllocs;
+        lastAllocBytes = size;
+        return std::malloc(size);
+    }
+    void freeHostPinnedMem(void* ptr) const noexcept override {
+        ++pinnedFrees;
+        std::free(ptr);
+    }
+};
+
+// Single-plane buffer descriptor around `basePtr`; `rowStride` is stored as given (packing is not
+// enforced). The pointer is never dereferenced by the consumers (ImageData / ImageBatchVarShape).
+inline ImageBufferStrided MakeSinglePlaneBuffer(int32_t width, int32_t height, int64_t rowStride, void* basePtr) {
+    ImageBufferStrided buf{};  // value-initialized, so only plane 0 is filled in below
+    buf.numPlanes = 1;
+    buf.planes[0] = {width, height, rowStride, basePtr};
+    return buf;
+}
+
+// Single-plane GPU-resident ImageData snapshot with packed-row stride implied
+// by `fmt`. For tests that need an ImageData but won't touch the pixels.
+inline ImageDataStridedHip MakeFakeHipData(int32_t width, int32_t height, void* basePtr, ImageFormat fmt = FMT_RGB8) {
+    return ImageDataStridedHip(fmt, MakeSinglePlaneBuffer(width, height, static_cast<int64_t>(width * fmt.channels()),
+                                                          basePtr));
+}
+
+// Host counterpart of MakeFakeHipData.
+inline ImageDataStridedHost MakeFakeHostData(int32_t width, int32_t height, void* basePtr, ImageFormat fmt = FMT_RGB8) {
+    return ImageDataStridedHost(fmt, MakeSinglePlaneBuffer(width, height, static_cast<int64_t>(width * fmt.channels()),
+                                                           basePtr));
+}
+
+// Single-plane GPU-resident Image wrapping a sentinel pointer via ImageWrapData.
+// Use for batch tests where pushBack only reads the descriptor; the snapshot comes from MakeFakeHipData.
+inline Image MakeFakeGpuImage(int32_t width, int32_t height, void* basePtr, ImageFormat fmt = FMT_RGB8) {
+    return ImageWrapData(MakeFakeHipData(width, height, basePtr, fmt));
+}
+
+// Host counterpart of MakeFakeGpuImage: wraps the MakeFakeHostData snapshot via ImageWrapData.
+inline Image MakeFakeHostImage(int32_t width, int32_t height, void* basePtr, ImageFormat fmt = FMT_RGB8) {
+    return ImageWrapData(MakeFakeHostData(width, height, basePtr, fmt));
+}
+
+}  // namespace tests
+}  // namespace roccv
diff --git a/tests/roccv/cpp/src/tests/core/image/test_image.cpp b/tests/roccv/cpp/src/tests/core/image/test_image.cpp
index 13937dff..b2ae2aeb 100644
--- a/tests/roccv/cpp/src/tests/core/image/test_image.cpp
+++ b/tests/roccv/cpp/src/tests/core/image/test_image.cpp
@@ -21,17 +21,14 @@
  */
 
 #include <stdint.h>
-#include <stdlib.h>
 
-#include <core/detail/allocators/i_allocator.hpp>
 #include <core/image.hpp>
-#include <core/image_buffer.hpp>
 #include <core/image_data.hpp>
 #include <core/image_format.hpp>
-#include <stdexcept>
 #include <typeinfo>
 #include <utility>
 
+#include "image_test_helpers.hpp"
 #include "test_helpers.hpp"
 
 using namespace roccv;
@@ -39,58 +36,6 @@ using namespace roccv::tests;
 
 namespace {
 
-void* const FAKE_PTR_A = reinterpret_cast<void*>(0xAAAAAAAAull);
-
-/**
- * @brief Test allocator that backs allocations with malloc and tallies how
- * many times each entry point is invoked. Pure host-side; no GPU dependency.
- *
- * The Hip path returns malloc'd memory because no test dereferences it — we
- * only care that ptr round-trips through Image and that free is called the
- * right number of times.
- */
-class CountingAllocator : public IAllocator {
-   public:
-    mutable int hipAllocs = 0;
-    mutable int hipFrees = 0;
-    mutable int hostAllocs = 0;
-    mutable int hostFrees = 0;
-    mutable size_t lastAllocBytes = 0;
-
-    void* allocHipMem(size_t size) const override {
-        ++hipAllocs;
-        lastAllocBytes = size;
-        return std::malloc(size);
-    }
-    void freeHipMem(void* ptr) const noexcept override {
-        ++hipFrees;
-        std::free(ptr);
-    }
-
-    void* allocHostMem(size_t size, int32_t /*alignment*/ = 0) const override {
-        ++hostAllocs;
-        lastAllocBytes = size;
-        return std::malloc(size);
-    }
-    void freeHostMem(void* ptr) const noexcept override {
-        ++hostFrees;
-        std::free(ptr);
-    }
-
-    // Unused by the Image paths under test. Trip loudly if invoked unexpectedly.
-    void* allocHostPinnedMem(size_t) const override { throw std::runtime_error("unused in tests"); }
-    void freeHostPinnedMem(void*) const noexcept override { std::abort(); }
-};
-
-// Build a single-plane ImageData snapshot referencing a sentinel pointer. Used
-// for ImageWrapData tests where we never dereference the buffer.
-ImageDataStridedHip MakeFakeHipData(int32_t width, int32_t height, void* basePtr, ImageFormat fmt = FMT_RGB8) {
-    ImageBufferStrided buf{};
-    buf.numPlanes = 1;
-    buf.planes[0] = {width, height, static_cast<int64_t>(width * fmt.channels()), basePtr};
-    return ImageDataStridedHip(fmt, buf);
-}
-
 // =============================================================================
 // CalcRequirements
 // =============================================================================
diff --git a/tests/roccv/cpp/src/tests/core/image/test_image_batch_data.cpp b/tests/roccv/cpp/src/tests/core/image/test_image_batch_data.cpp
index f402d261..a2e238be 100644
--- a/tests/roccv/cpp/src/tests/core/image/test_image_batch_data.cpp
+++ b/tests/roccv/cpp/src/tests/core/image/test_image_batch_data.cpp
@@ -27,6 +27,7 @@
 #include <core/image_buffer.hpp>
 #include <core/image_format.hpp>
 
+#include "image_test_helpers.hpp"
 #include "test_helpers.hpp"
 
 using namespace roccv;
@@ -34,12 +35,6 @@ using namespace roccv::tests;
 
 namespace {
 
-// ImageBatchData carries pointers but never dereferences them; the buffer is a
-// metadata snapshot. Use opaque sentinel pointers so we can verify values flow
-// through the hierarchy without needing real allocations.
-void* const FAKE_IMG_PTR_A = reinterpret_cast<void*>(0xA0A0A0A0ull);
-void* const FAKE_IMG_PTR_B = reinterpret_cast<void*>(0xB0B0B0B0ull);
-
 // Static descriptor/format storage for the batch buffer. These are real host
 // allocations (so the pointers are valid) but the batch tests only read
 // metadata back out of them; nothing dereferences the per-image basePtr fields.
@@ -47,19 +42,12 @@ ImageBufferStrided g_imageList[2];
 ImageFormat g_formatList[2] = {FMT_RGB8, FMT_RGB8};
 ImageFormat g_hostFormatList[2] = {FMT_RGB8, FMT_RGB8};
 
-ImageBufferStrided MakeSinglePlaneBuffer(int32_t width, int32_t height, int64_t rowStride, void* basePtr) {
-    ImageBufferStrided buf{};
-    buf.numPlanes = 1;
-    buf.planes[0] = {width, height, rowStride, basePtr};
-    return buf;
-}
-
 // Builds a homogeneous two-image varshape descriptor with a known bounding box
 // and uniqueFormat. The returned struct's pointers reference module-static
 // arrays so addresses remain stable across calls within a test.
 ImageBatchVarShapeBufferStrided MakeHomogeneousBuffer() {
-    g_imageList[0] = MakeSinglePlaneBuffer(640, 480, 640 * 3, FAKE_IMG_PTR_A);
-    g_imageList[1] = MakeSinglePlaneBuffer(320, 240, 320 * 3, FAKE_IMG_PTR_B);
+    g_imageList[0] = MakeSinglePlaneBuffer(640, 480, 640 * 3, FAKE_PTR_A);
+    g_imageList[1] = MakeSinglePlaneBuffer(320, 240, 320 * 3, FAKE_PTR_B);
     g_formatList[0] = FMT_RGB8;
     g_formatList[1] = FMT_RGB8;
     g_hostFormatList[0] = FMT_RGB8;
@@ -137,8 +125,8 @@ void TestImageBatchVarShapeDataEmpty() {
  * verbatim; uniqueFormat is FMT_NONE since no single format spans the batch.
  */
 void TestImageBatchVarShapeDataHeterogeneousFormats() {
-    g_imageList[0] = MakeSinglePlaneBuffer(640, 480, 640 * 3, FAKE_IMG_PTR_A);
-    g_imageList[1] = MakeSinglePlaneBuffer(320, 240, 320 * 4, FAKE_IMG_PTR_B);
+    g_imageList[0] = MakeSinglePlaneBuffer(640, 480, 640 * 3, FAKE_PTR_A);
+    g_imageList[1] = MakeSinglePlaneBuffer(320, 240, 320 * 4, FAKE_PTR_B);
     g_formatList[0] = FMT_RGB8;
     g_formatList[1] = FMT_RGBA8;
     g_hostFormatList[0] = FMT_RGB8;
diff --git a/tests/roccv/cpp/src/tests/core/image/test_image_batch_var_shape.cpp b/tests/roccv/cpp/src/tests/core/image/test_image_batch_var_shape.cpp
index dea3c0c0..ec148240 100644
--- a/tests/roccv/cpp/src/tests/core/image/test_image_batch_var_shape.cpp
+++ b/tests/roccv/cpp/src/tests/core/image/test_image_batch_var_shape.cpp
@@ -22,18 +22,15 @@
 
 #include <hip/hip_runtime.h>
 #include <stdint.h>
-#include <stdlib.h>
 
-#include <core/detail/allocators/i_allocator.hpp>
 #include <core/image.hpp>
 #include <core/image_batch_data.hpp>
 #include <core/image_batch_var_shape.hpp>
-#include <core/image_buffer.hpp>
-#include <core/image_data.hpp>
 #include <core/image_format.hpp>
 #include <utility>
 #include <vector>
 
+#include "image_test_helpers.hpp"
 #include "test_helpers.hpp"
 
 using namespace roccv;
@@ -41,69 +38,6 @@ using namespace roccv::tests;
 
 namespace {
 
-/**
- * @brief Test allocator that distinguishes pinned-host from regular-host
- * allocations and tallies each entry-point. Pure host-backed; no actual GPU
- * dependency on the descriptor buffers — tests verify metadata round-trip and
- * pointer identity, never dereference device memory through these.
- */
-class CountingAllocator : public IAllocator {
-   public:
-    mutable int hipAllocs = 0;
-    mutable int hipFrees = 0;
-    mutable int hostAllocs = 0;
-    mutable int hostFrees = 0;
-    mutable int pinnedAllocs = 0;
-    mutable int pinnedFrees = 0;
-
-    void* allocHipMem(size_t size) const override {
-        ++hipAllocs;
-        return std::malloc(size);
-    }
-    void freeHipMem(void* ptr) const noexcept override {
-        ++hipFrees;
-        std::free(ptr);
-    }
-
-    void* allocHostMem(size_t size, int32_t /*alignment*/ = 0) const override {
-        ++hostAllocs;
-        return std::malloc(size);
-    }
-    void freeHostMem(void* ptr) const noexcept override {
-        ++hostFrees;
-        std::free(ptr);
-    }
-
-    void* allocHostPinnedMem(size_t size) const override {
-        ++pinnedAllocs;
-        return std::malloc(size);
-    }
-    void freeHostPinnedMem(void* ptr) const noexcept override {
-        ++pinnedFrees;
-        std::free(ptr);
-    }
-};
-
-// Build a single-plane GPU-resident image wrapper around a sentinel pointer.
-// The pointer is never dereferenced — pushBack only reads the descriptor.
-Image MakeFakeGpuImage(int32_t w, int32_t h, ImageFormat fmt, void* basePtr) {
-    ImageBufferStrided buf{};
-    buf.numPlanes = 1;
-    buf.planes[0] = {w, h, static_cast<int64_t>(w * fmt.channels()), basePtr};
-    return ImageWrapData(ImageDataStridedHip(fmt, buf));
-}
-
-Image MakeFakeHostImage(int32_t w, int32_t h, ImageFormat fmt, void* basePtr) {
-    ImageBufferStrided buf{};
-    buf.numPlanes = 1;
-    buf.planes[0] = {w, h, static_cast<int64_t>(w * fmt.channels()), basePtr};
-    return ImageWrapData(ImageDataStridedHost(fmt, buf));
-}
-
-void* const FAKE_A = reinterpret_cast<void*>(0xA0000000ull);
-void* const FAKE_B = reinterpret_cast<void*>(0xB0000000ull);
-void* const FAKE_C = reinterpret_cast<void*>(0xC0000000ull);
-
 // =============================================================================
 // Construction
 // =============================================================================
@@ -136,7 +70,7 @@ void TestPushBackSingle() {
     CountingAllocator alloc;
     ImageBatchVarShape batch(4, alloc);
 
-    Image img = MakeFakeGpuImage(640, 480, FMT_RGB8, FAKE_A);
+    Image img = MakeFakeGpuImage(640, 480, FAKE_PTR_A);
     batch.pushBack(img);
 
     EXPECT_EQ(batch.numImages(), 1);
@@ -149,9 +83,9 @@ void TestPushBackMultipleHeterogeneousSizes() {
     CountingAllocator alloc;
     ImageBatchVarShape batch(4, alloc);
 
-    batch.pushBack(MakeFakeGpuImage(640, 480, FMT_RGB8, FAKE_A));
-    batch.pushBack(MakeFakeGpuImage(320, 240, FMT_RGB8, FAKE_B));
-    batch.pushBack(MakeFakeGpuImage(800, 200, FMT_RGB8, FAKE_C));
+    batch.pushBack(MakeFakeGpuImage(640, 480, FAKE_PTR_A));
+    batch.pushBack(MakeFakeGpuImage(320, 240, FAKE_PTR_B));
+    batch.pushBack(MakeFakeGpuImage(800, 200, FAKE_PTR_C));
 
     EXPECT_EQ(batch.numImages(), 3);
     EXPECT_EQ(batch.maxSize().w, 800);
@@ -164,9 +98,9 @@ void TestPushBackIteratorRange() {
     ImageBatchVarShape batch(8, alloc);
 
     std::vector<Image> imgs;
-    imgs.push_back(MakeFakeGpuImage(100, 100, FMT_RGB8, FAKE_A));
-    imgs.push_back(MakeFakeGpuImage(200, 200, FMT_RGB8, FAKE_B));
-    imgs.push_back(MakeFakeGpuImage(300, 300, FMT_RGB8, FAKE_C));
+    imgs.push_back(MakeFakeGpuImage(100, 100, FAKE_PTR_A));
+    imgs.push_back(MakeFakeGpuImage(200, 200, FAKE_PTR_B));
+    imgs.push_back(MakeFakeGpuImage(300, 300, FAKE_PTR_C));
 
     batch.pushBack(imgs.begin(), imgs.end());
 
@@ -182,17 +116,17 @@ void TestPushBackCapacityOverflow() {
     CountingAllocator alloc;
     ImageBatchVarShape batch(2, alloc);
 
-    batch.pushBack(MakeFakeGpuImage(64, 64, FMT_RGB8, FAKE_A));
-    batch.pushBack(MakeFakeGpuImage(64, 64, FMT_RGB8, FAKE_B));
+    batch.pushBack(MakeFakeGpuImage(64, 64, FAKE_PTR_A));
+    batch.pushBack(MakeFakeGpuImage(64, 64, FAKE_PTR_B));
 
-    EXPECT_EXCEPTION(batch.pushBack(MakeFakeGpuImage(64, 64, FMT_RGB8, FAKE_C)), eStatusType::OUT_OF_BOUNDS);
+    EXPECT_EXCEPTION(batch.pushBack(MakeFakeGpuImage(64, 64, FAKE_PTR_C)), eStatusType::OUT_OF_BOUNDS);
 }
 
 void TestPushBackHostImageRejected() {
     CountingAllocator alloc;
     ImageBatchVarShape batch(4, alloc);
 
-    Image cpuImg = MakeFakeHostImage(64, 64, FMT_U8, FAKE_A);
+    Image cpuImg = MakeFakeHostImage(64, 64, FAKE_PTR_A, FMT_U8);
     EXPECT_EXCEPTION(batch.pushBack(cpuImg), eStatusType::INVALID_VALUE);
 }
 
@@ -208,13 +142,13 @@ void TestPushBackRangeRollbackOnFailure() {
 
     // Pre-populate so we can confirm the rollback restores exactly the
     // pre-call state, not just back to zero.
-    batch.pushBack(MakeFakeGpuImage(100, 100, FMT_RGB8, FAKE_A));
+    batch.pushBack(MakeFakeGpuImage(100, 100, FAKE_PTR_A));
     EXPECT_EQ(batch.numImages(), 1);
 
     // Mid-range CPU image — should rollback the partially-pushed entries.
     std::vector<Image> imgs;
-    imgs.push_back(MakeFakeGpuImage(200, 200, FMT_RGB8, FAKE_B));
-    imgs.push_back(MakeFakeHostImage(300, 300, FMT_RGB8, FAKE_C));  // Will throw.
+    imgs.push_back(MakeFakeGpuImage(200, 200, FAKE_PTR_B));
+    imgs.push_back(MakeFakeHostImage(300, 300, FAKE_PTR_C));  // Will throw.
 
     EXPECT_EXCEPTION(batch.pushBack(imgs.begin(), imgs.end()), eStatusType::INVALID_VALUE);
 
@@ -228,9 +162,9 @@ void TestPushBackRangeOverflowPrechecked() {
     ImageBatchVarShape batch(2, alloc);
 
     std::vector<Image> imgs;
-    imgs.push_back(MakeFakeGpuImage(10, 10, FMT_RGB8, FAKE_A));
-    imgs.push_back(MakeFakeGpuImage(20, 20, FMT_RGB8, FAKE_B));
-    imgs.push_back(MakeFakeGpuImage(30, 30, FMT_RGB8, FAKE_C));  // 3rd overflows capacity 2.
+    imgs.push_back(MakeFakeGpuImage(10, 10, FAKE_PTR_A));
+    imgs.push_back(MakeFakeGpuImage(20, 20, FAKE_PTR_B));
+    imgs.push_back(MakeFakeGpuImage(30, 30, FAKE_PTR_C));  // 3rd overflows capacity 2.
 
     EXPECT_EXCEPTION(batch.pushBack(imgs.begin(), imgs.end()), eStatusType::OUT_OF_BOUNDS);
     // Pre-checked: nothing was pushed.
@@ -245,8 +179,8 @@ void TestPopBack() {
     CountingAllocator alloc;
     ImageBatchVarShape batch(4, alloc);
 
-    batch.pushBack(MakeFakeGpuImage(100, 100, FMT_RGB8, FAKE_A));
-    batch.pushBack(MakeFakeGpuImage(200, 200, FMT_RGB8, FAKE_B));
+    batch.pushBack(MakeFakeGpuImage(100, 100, FAKE_PTR_A));
+    batch.pushBack(MakeFakeGpuImage(200, 200, FAKE_PTR_B));
     batch.popBack();
 
     EXPECT_EQ(batch.numImages(), 1);
@@ -258,9 +192,9 @@ void TestPopBackMultiple() {
     CountingAllocator alloc;
     ImageBatchVarShape batch(4, alloc);
 
-    batch.pushBack(MakeFakeGpuImage(100, 100, FMT_RGB8, FAKE_A));
-    batch.pushBack(MakeFakeGpuImage(200, 200, FMT_RGB8, FAKE_B));
-    batch.pushBack(MakeFakeGpuImage(300, 300, FMT_RGB8, FAKE_C));
+    batch.pushBack(MakeFakeGpuImage(100, 100, FAKE_PTR_A));
+    batch.pushBack(MakeFakeGpuImage(200, 200, FAKE_PTR_B));
+    batch.pushBack(MakeFakeGpuImage(300, 300, FAKE_PTR_C));
     batch.popBack(2);
 
     EXPECT_EQ(batch.numImages(), 1);
@@ -270,7 +204,7 @@ void TestPopBackMultiple() {
 void TestPopBackUnderflow() {
     CountingAllocator alloc;
     ImageBatchVarShape batch(4, alloc);
-    batch.pushBack(MakeFakeGpuImage(100, 100, FMT_RGB8, FAKE_A));
+    batch.pushBack(MakeFakeGpuImage(100, 100, FAKE_PTR_A));
 
     EXPECT_EXCEPTION(batch.popBack(2), eStatusType::OUT_OF_BOUNDS);
     // State preserved.
@@ -281,8 +215,8 @@ void TestClearAndReuse() {
     CountingAllocator alloc;
     ImageBatchVarShape batch(4, alloc);
 
-    batch.pushBack(MakeFakeGpuImage(100, 100, FMT_RGB8, FAKE_A));
-    batch.pushBack(MakeFakeGpuImage(200, 200, FMT_RGB8, FAKE_B));
+    batch.pushBack(MakeFakeGpuImage(100, 100, FAKE_PTR_A));
+    batch.pushBack(MakeFakeGpuImage(200, 200, FAKE_PTR_B));
     batch.clear();
 
     EXPECT_EQ(batch.numImages(), 0);
@@ -290,7 +224,7 @@ void TestClearAndReuse() {
     EXPECT_EQ(AsInt(batch.uniqueFormat() == FMT_NONE), 1);
 
     // Reuse after clear.
-    batch.pushBack(MakeFakeGpuImage(50, 50, FMT_U8, FAKE_C));
+    batch.pushBack(MakeFakeGpuImage(50, 50, FAKE_PTR_C, FMT_U8));
     EXPECT_EQ(batch.numImages(), 1);
     EXPECT_EQ(AsInt(batch.uniqueFormat() == FMT_U8), 1);
 }
@@ -302,16 +236,16 @@ void TestClearAndReuse() {
 void TestUniqueFormatHomogeneous() {
     CountingAllocator alloc;
     ImageBatchVarShape batch(4, alloc);
-    batch.pushBack(MakeFakeGpuImage(64, 64, FMT_RGB8, FAKE_A));
-    batch.pushBack(MakeFakeGpuImage(128, 128, FMT_RGB8, FAKE_B));
+    batch.pushBack(MakeFakeGpuImage(64, 64, FAKE_PTR_A));
+    batch.pushBack(MakeFakeGpuImage(128, 128, FAKE_PTR_B));
     EXPECT_EQ(AsInt(batch.uniqueFormat() == FMT_RGB8), 1);
 }
 
 void TestUniqueFormatHeterogeneous() {
     CountingAllocator alloc;
     ImageBatchVarShape batch(4, alloc);
-    batch.pushBack(MakeFakeGpuImage(64, 64, FMT_RGB8, FAKE_A));
-    batch.pushBack(MakeFakeGpuImage(64, 64, FMT_RGBA8, FAKE_B));
+    batch.pushBack(MakeFakeGpuImage(64, 64, FAKE_PTR_A));
+    batch.pushBack(MakeFakeGpuImage(64, 64, FAKE_PTR_B, FMT_RGBA8));
     EXPECT_EQ(AsInt(batch.uniqueFormat() == FMT_NONE), 1);
 }
 
@@ -344,8 +278,8 @@ void TestExportDataEmpty() {
 
 void TestExportDataMetadata() {
     ImageBatchVarShape batch(4);
-    batch.pushBack(MakeFakeGpuImage(640, 480, FMT_RGB8, FAKE_A));
-    batch.pushBack(MakeFakeGpuImage(320, 240, FMT_RGB8, FAKE_B));
+    batch.pushBack(MakeFakeGpuImage(640, 480, FAKE_PTR_A));
+    batch.pushBack(MakeFakeGpuImage(320, 240, FAKE_PTR_B));
 
     auto data = batch.exportData(0);
     EXPECT_EQ(data.numImages(), 2);
@@ -362,7 +296,7 @@ void TestExportDataMetadata() {
 
 void TestExportDataCastRoundTrip() {
     ImageBatchVarShape batch(4);
-    batch.pushBack(MakeFakeGpuImage(64, 64, FMT_RGB8, FAKE_A));
+    batch.pushBack(MakeFakeGpuImage(64, 64, FAKE_PTR_A));
 
     auto hipData = batch.exportData<ImageBatchVarShapeDataStridedHip>(0);
     EXPECT_EQ(hipData.numImages(), 1);
@@ -383,8 +317,8 @@ void TestMoveConstruction() {
     CountingAllocator alloc;
     {
         ImageBatchVarShape src(4, alloc);
-        src.pushBack(MakeFakeGpuImage(100, 100, FMT_RGB8, FAKE_A));
-        src.pushBack(MakeFakeGpuImage(200, 200, FMT_RGB8, FAKE_B));
+        src.pushBack(MakeFakeGpuImage(100, 100, FAKE_PTR_A));
+        src.pushBack(MakeFakeGpuImage(200, 200, FAKE_PTR_B));
 
         ImageBatchVarShape dst(std::move(src));
         EXPECT_EQ(dst.numImages(), 2);
@@ -406,9 +340,9 @@ void TestMoveConstruction() {
 void TestIteratorRangeFor() {
     CountingAllocator alloc;
     ImageBatchVarShape batch(4, alloc);
-    batch.pushBack(MakeFakeGpuImage(100, 100, FMT_RGB8, FAKE_A));
-    batch.pushBack(MakeFakeGpuImage(200, 200, FMT_RGB8, FAKE_B));
-    batch.pushBack(MakeFakeGpuImage(300, 300, FMT_RGB8, FAKE_C));
+    batch.pushBack(MakeFakeGpuImage(100, 100, FAKE_PTR_A));
+    batch.pushBack(MakeFakeGpuImage(200, 200, FAKE_PTR_B));
+    batch.pushBack(MakeFakeGpuImage(300, 300, FAKE_PTR_C));
 
     int32_t expectedW = 100;
     int32_t count = 0;
diff --git a/tests/roccv/cpp/src/tests/core/image/test_image_data.cpp b/tests/roccv/cpp/src/tests/core/image/test_image_data.cpp
index bea99d59..8a7945fd 100644
--- a/tests/roccv/cpp/src/tests/core/image/test_image_data.cpp
+++ b/tests/roccv/cpp/src/tests/core/image/test_image_data.cpp
@@ -26,6 +26,7 @@
 #include <core/image_data.hpp>
 #include <core/image_format.hpp>
 
+#include "image_test_helpers.hpp"
 #include "test_helpers.hpp"
 
 using namespace roccv;
@@ -33,20 +34,6 @@ using namespace roccv::tests;
 
 namespace {
 
-// ImageData carries pointers but never dereferences them; the buffer is a
-// metadata snapshot. Use opaque sentinel pointers in tests so we can verify
-// values flow through without needing real allocations.
-void* const FAKE_PTR_A = reinterpret_cast<void*>(0xAAAAAAAAull);
-void* const FAKE_PTR_B = reinterpret_cast<void*>(0xBBBBBBBBull);
-void* const FAKE_PTR_C = reinterpret_cast<void*>(0xCCCCCCCCull);
-
-ImageBufferStrided MakeSinglePlaneBuffer(int32_t width, int32_t height, int64_t rowStride, void* basePtr) {
-    ImageBufferStrided buf{};
-    buf.numPlanes = 1;
-    buf.planes[0] = {width, height, rowStride, basePtr};
-    return buf;
-}
-
 ImageBufferStrided MakeThreePlaneBuffer() {
     // Mimics a planar layout (e.g. YUV420-style) with sub-sampled chroma — three
     // planes of differing dimensions and strides backed by distinct buffers.

From 6f2469995cbbecc057204b9495199fe00c892686 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Wed, 20 May 2026 12:33:49 -0400
Subject: [PATCH 13/13] Address review comments

---
 include/core/image_batch_var_shape.hpp        |  6 ++--
 src/core/image.cpp                            | 35 ++++++++++++-------
 src/core/image_batch_var_shape.cpp            | 20 +++++++----
 .../roccv/cpp/include/image_test_helpers.hpp  |  9 ++---
 tests/roccv/cpp/include/test_helpers.hpp      |  2 +-
 .../cpp/src/tests/core/image/test_image.cpp   |  6 ++--
 .../core/image/test_image_batch_data.cpp      | 16 ++++-----
 7 files changed, 56 insertions(+), 38 deletions(-)

diff --git a/include/core/image_batch_var_shape.hpp b/include/core/image_batch_var_shape.hpp
index a57f355a..ba4fd147 100644
--- a/include/core/image_batch_var_shape.hpp
+++ b/include/core/image_batch_var_shape.hpp
@@ -25,6 +25,7 @@
 #include <hip/hip_runtime.h>
 #include <stdint.h>
 
+#include <iterator>
 #include <optional>
 #include <vector>
 
@@ -123,9 +124,8 @@ class ImageBatchVarShape {
 
     /**
      * @brief The common ImageFormat across all images, or FMT_NONE if formats
-     * are heterogeneous or the batch is empty. After popping the only image
-     * with a given heterogenizing format, the cached value may stay FMT_NONE
-     * until the next emptying operation — conservative, never wrong.
+     * are heterogeneous or the batch is empty. popBack invalidates the cache
+     * so the next call rescans and may return an exact format again.
      */
     ImageFormat uniqueFormat() const;
 
diff --git a/src/core/image.cpp b/src/core/image.cpp
index 12a20d9a..d6077dcb 100644
--- a/src/core/image.cpp
+++ b/src/core/image.cpp
@@ -77,11 +77,19 @@ Image::Requirements Image::CalcRequirements(Size2D size, ImageFormat format) {
 
     const int64_t bytesPerPixel = static_cast<int64_t>(DataType(format.dtype()).size()) * format.channels();
 
+    // Guard signed-overflow in the rowStride = bytesPerPixel * width product
+    // (UB on overflow). Realistic image sizes don't approach INT64_MAX, but
+    // pathological callers shouldn't silently propagate garbage into strides.
+    int64_t rowStride = 0;
+    if (__builtin_mul_overflow(bytesPerPixel, static_cast<int64_t>(size.w), &rowStride)) {
+        throw Exception("Image row stride overflows int64.", eStatusType::INVALID_VALUE);
+    }
+
     // TODO: derive a sensible default base/row alignment from device attributes.
     return ImageRequirements{
         .size = size,
         .format = format,
-        .planeRowStride = {bytesPerPixel * size.w},
+        .planeRowStride = {rowStride},
         .alignBytes = 0,
     };
 }
@@ -103,11 +111,7 @@ Image::Image(const Requirements& reqs, const IAllocator& alloc, eDeviceType devi
     : Image(reqs, device, makeStorage(reqs, alloc, device)) {}
 
 Image::Image(const Requirements& reqs, eDeviceType device, std::shared_ptr<ImageStorage> storage)
-    : m_data(std::move(storage)),
-      m_size(reqs.size),
-      m_format(reqs.format),
-      m_device(device),
-      m_planeRowStride{} {
+    : m_data(std::move(storage)), m_size(reqs.size), m_format(reqs.format), m_device(device), m_planeRowStride{} {
     std::copy(std::begin(reqs.planeRowStride), std::end(reqs.planeRowStride), m_planeRowStride.begin());
 }
 
@@ -163,13 +167,18 @@ Image ImageWrapData(const ImageData& data, ImageDataCleanupFunc cleanup) {
 
     // The deleter captures `data` by value so the original snapshot survives
     // long enough to be passed to the cleanup callback on last-handle drop.
-    auto storage = std::shared_ptr<ImageStorage>(new ImageStorage(plane0.basePtr),
-                                                 [data, cleanup](ImageStorage* s) {
-                                                     if (cleanup) {
-                                                         cleanup(data);
-                                                     }
-                                                     delete s;
-                                                 });
+    // Swallow exceptions from `cleanup` — shared_ptr deleters run during
+    // destruction, and a throw would propagate into std::terminate.
+    auto storage =
+        std::shared_ptr<ImageStorage>(new ImageStorage(plane0.basePtr), [data, cleanup](ImageStorage* s) noexcept {
+            if (cleanup) {
+                try {
+                    cleanup(data);
+                } catch (...) {
+                }
+            }
+            delete s;
+        });
 
     return Image(reqs, data.device(), std::move(storage));
 }
diff --git a/src/core/image_batch_var_shape.cpp b/src/core/image_batch_var_shape.cpp
index d522e336..510cccac 100644
--- a/src/core/image_batch_var_shape.cpp
+++ b/src/core/image_batch_var_shape.cpp
@@ -46,12 +46,20 @@ ImageBatchVarShape::ImageBatchVarShape(int32_t capacity, const IAllocator& alloc
     const size_t imagesBytes = sizeof(ImageBufferStrided) * capacity;
     const size_t formatsBytes = sizeof(ImageFormat) * capacity;
 
-    m_devImagesBuffer = static_cast<ImageBufferStrided*>(m_allocator.allocHipMem(imagesBytes));
-    m_devFormatsBuffer = static_cast<ImageFormat*>(m_allocator.allocHipMem(formatsBytes));
-    m_hostImagesBuffer = static_cast<ImageBufferStrided*>(m_allocator.allocHostPinnedMem(imagesBytes));
-    m_hostFormatsBuffer = static_cast<ImageFormat*>(m_allocator.allocHostPinnedMem(formatsBytes));
-
-    HIP_VALIDATE_NO_ERRORS(hipEventCreateWithFlags(&m_postFence, hipEventDisableTiming));
+    try {
+        m_devImagesBuffer = static_cast<ImageBufferStrided*>(m_allocator.allocHipMem(imagesBytes));
+        m_devFormatsBuffer = static_cast<ImageFormat*>(m_allocator.allocHipMem(formatsBytes));
+        m_hostImagesBuffer = static_cast<ImageBufferStrided*>(m_allocator.allocHostPinnedMem(imagesBytes));
+        m_hostFormatsBuffer = static_cast<ImageFormat*>(m_allocator.allocHostPinnedMem(formatsBytes));
+
+        HIP_VALIDATE_NO_ERRORS(hipEventCreateWithFlags(&m_postFence, hipEventDisableTiming));
+    } catch (...) {
+        if (m_hostFormatsBuffer != nullptr) m_allocator.freeHostPinnedMem(m_hostFormatsBuffer);
+        if (m_hostImagesBuffer != nullptr) m_allocator.freeHostPinnedMem(m_hostImagesBuffer);
+        if (m_devFormatsBuffer != nullptr) m_allocator.freeHipMem(m_devFormatsBuffer);
+        if (m_devImagesBuffer != nullptr) m_allocator.freeHipMem(m_devImagesBuffer);
+        throw;
+    }
 }
 
 ImageBatchVarShape::~ImageBatchVarShape() {
diff --git a/tests/roccv/cpp/include/image_test_helpers.hpp b/tests/roccv/cpp/include/image_test_helpers.hpp
index f7318f89..c4613367 100644
--- a/tests/roccv/cpp/include/image_test_helpers.hpp
+++ b/tests/roccv/cpp/include/image_test_helpers.hpp
@@ -24,6 +24,7 @@
 #include <stdint.h>
 #include <stdlib.h>
 
+#include <core/data_type.hpp>
 #include <core/detail/allocators/i_allocator.hpp>
 #include <core/image.hpp>
 #include <core/image_buffer.hpp>
@@ -103,14 +104,14 @@ inline ImageBufferStrided MakeSinglePlaneBuffer(int32_t width, int32_t height, i
 // Single-plane GPU-resident ImageData snapshot with packed-row stride implied
 // by `fmt`. For tests that need an ImageData but won't touch the pixels.
 inline ImageDataStridedHip MakeFakeHipData(int32_t width, int32_t height, void* basePtr, ImageFormat fmt = FMT_RGB8) {
-    return ImageDataStridedHip(fmt, MakeSinglePlaneBuffer(width, height, static_cast<int64_t>(width * fmt.channels()),
-                                                          basePtr));
+    const int64_t rowStride = static_cast<int64_t>(width) * fmt.channels() * DataType(fmt.dtype()).size();  // in bytes
+    return ImageDataStridedHip(fmt, MakeSinglePlaneBuffer(width, height, rowStride, basePtr));
 }
 
 // Host counterpart of MakeFakeHipData.
 inline ImageDataStridedHost MakeFakeHostData(int32_t width, int32_t height, void* basePtr, ImageFormat fmt = FMT_RGB8) {
-    return ImageDataStridedHost(fmt, MakeSinglePlaneBuffer(width, height, static_cast<int64_t>(width * fmt.channels()),
-                                                           basePtr));
+    const int64_t rowStride = static_cast<int64_t>(width) * fmt.channels() * DataType(fmt.dtype()).size();  // in bytes
+    return ImageDataStridedHost(fmt, MakeSinglePlaneBuffer(width, height, rowStride, basePtr));
 }
 
 // Single-plane GPU-resident Image wrapping a sentinel pointer via ImageWrapData.
diff --git a/tests/roccv/cpp/include/test_helpers.hpp b/tests/roccv/cpp/include/test_helpers.hpp
index df6840f5..7ed56309 100644
--- a/tests/roccv/cpp/include/test_helpers.hpp
+++ b/tests/roccv/cpp/include/test_helpers.hpp
@@ -201,7 +201,7 @@ namespace tests {
 // EXPECT_EQ pipes through std::to_string, so wrap enums/pointers/bools through
 // these casts before comparing.
 inline auto AsInt = [](auto v) { return static_cast<int>(v); };
-inline auto AsAddr = [](void* p) { return reinterpret_cast<uintptr_t>(p); };
+inline auto AsAddr = [](const void* p) { return reinterpret_cast<uintptr_t>(p); };  // also accepts pointers to const
 inline auto AsSize = [](auto v) { return static_cast<size_t>(v); };
 
 /**
diff --git a/tests/roccv/cpp/src/tests/core/image/test_image.cpp b/tests/roccv/cpp/src/tests/core/image/test_image.cpp
index b2ae2aeb..ce6ef69b 100644
--- a/tests/roccv/cpp/src/tests/core/image/test_image.cpp
+++ b/tests/roccv/cpp/src/tests/core/image/test_image.cpp
@@ -87,7 +87,7 @@ void TestCalcRequirementsRejectsInvalidDims() {
  * fit in int64.
  */
 void TestCalcRequirementsLargeDims() {
-    // 8K image, RGBA32 (4 channels * 4 bytes = 16 B/pixel) → 8192 * 16 = 131072 B/row.
+    // 8K (8192x4320) RGBA8 image: 4 channels * 1 byte = 4 B/pixel, so the expected row stride is 8192 * 4 = 32768 B.
     auto reqs = Image::CalcRequirements({8192, 4320}, FMT_RGBA8);
     EXPECT_EQ(reqs.planeRowStride[0], static_cast<int64_t>(8192 * 4));
 }
@@ -173,7 +173,9 @@ void TestImageCopySharesBuffer() {
         EXPECT_EQ(AsAddr(second.exportData().cast<ImageDataStrided>()->plane(0).basePtr), AsAddr(buf));
 
         // Drop `first`; buffer must NOT be freed yet — `second` still holds it.
-        { Image sink = std::move(first); }
+        {
+            Image sink = std::move(first);
+        }
         EXPECT_EQ(alloc.hipFrees, 0);
     }
     // All handles dropped — exactly one free.
diff --git a/tests/roccv/cpp/src/tests/core/image/test_image_batch_data.cpp b/tests/roccv/cpp/src/tests/core/image/test_image_batch_data.cpp
index a2e238be..31e449e6 100644
--- a/tests/roccv/cpp/src/tests/core/image/test_image_batch_data.cpp
+++ b/tests/roccv/cpp/src/tests/core/image/test_image_batch_data.cpp
@@ -76,9 +76,9 @@ void TestImageBatchVarShapeDataStridedHipConstruction() {
     EXPECT_EQ(data.maxSize().w, 640);
     EXPECT_EQ(data.maxSize().h, 480);
     EXPECT_EQ(data.uniqueFormat().channels(), 3);
-    EXPECT_EQ(AsAddr(const_cast<ImageFormat*>(data.formatList())), AsAddr(g_formatList));
-    EXPECT_EQ(AsAddr(const_cast<ImageFormat*>(data.hostFormatList())), AsAddr(g_hostFormatList));
-    EXPECT_EQ(AsAddr(const_cast<ImageBufferStrided*>(data.imageList())), AsAddr(g_imageList));
+    EXPECT_EQ(AsAddr(data.formatList()), AsAddr(g_formatList));
+    EXPECT_EQ(AsAddr(data.hostFormatList()), AsAddr(g_hostFormatList));
+    EXPECT_EQ(AsAddr(data.imageList()), AsAddr(g_imageList));
     EXPECT_EQ(data.imageList()[0].planes[0].width, 640);
     EXPECT_EQ(data.imageList()[1].planes[0].width, 320);
 }
@@ -95,7 +95,7 @@ void TestImageBatchVarShapeDataStridedHostConstruction() {
     EXPECT_EQ(data.maxSize().w, 640);
     EXPECT_EQ(data.maxSize().h, 480);
     EXPECT_EQ(data.uniqueFormat().channels(), 3);
-    EXPECT_EQ(AsAddr(const_cast<ImageBufferStrided*>(data.imageList())), AsAddr(g_imageList));
+    EXPECT_EQ(AsAddr(data.imageList()), AsAddr(g_imageList));
 }
 
 /**
@@ -161,14 +161,12 @@ void TestImageBatchVarShapeDataSugarCtor() {
     EXPECT_EQ(wide.numImages(), sugar.numImages());
     EXPECT_EQ(wide.maxSize().w, sugar.maxSize().w);
     EXPECT_EQ(wide.maxSize().h, sugar.maxSize().h);
-    EXPECT_EQ(AsAddr(const_cast<ImageBufferStrided*>(wide.imageList())),
-              AsAddr(const_cast<ImageBufferStrided*>(sugar.imageList())));
+    EXPECT_EQ(AsAddr(wide.imageList()), AsAddr(sugar.imageList()));
 
     ImageBatchVarShapeDataStridedHost wideHost(2, ImageBatchBuffer{.varShapeStrided = buf});
     ImageBatchVarShapeDataStridedHost sugarHost(2, buf);
     EXPECT_EQ(AsInt(wideHost.device()), AsInt(sugarHost.device()));
-    EXPECT_EQ(AsAddr(const_cast<ImageBufferStrided*>(wideHost.imageList())),
-              AsAddr(const_cast<ImageBufferStrided*>(sugarHost.imageList())));
+    EXPECT_EQ(AsAddr(wideHost.imageList()), AsAddr(sugarHost.imageList()));
 }
 
 /**
@@ -237,7 +235,7 @@ void TestImageBatchDataCast() {
         EXPECT_EQ(AsInt(asHip->device()), AsInt(eDeviceType::GPU));
         EXPECT_EQ(asHip->numImages(), 2);
         EXPECT_EQ(asHip->maxSize().w, 640);
-        EXPECT_EQ(AsAddr(const_cast<ImageBufferStrided*>(asHip->imageList())), AsAddr(g_imageList));
+        EXPECT_EQ(AsAddr(asHip->imageList()), AsAddr(g_imageList));
 
         auto asStrided = base.cast<ImageBatchVarShapeDataStrided>();
         EXPECT_EQ(AsInt(asStrided.has_value()), 1);