From 742920cffaf9a03ccd3cf5b0771126ecbae41320 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 20 Nov 2025 12:19:59 -0500 Subject: [PATCH 01/36] Add MemAlignment to Tensor constructors --- include/core/mem_alignment.hpp | 68 +++++++++++++++++++++++++++++++ include/core/tensor.hpp | 19 ++++++--- src/core/mem_alignment.cpp | 38 +++++++++++++++++ src/core/tensor.cpp | 74 +++++++++++++++++++++------------- 4 files changed, 166 insertions(+), 33 deletions(-) create mode 100644 include/core/mem_alignment.hpp create mode 100644 src/core/mem_alignment.cpp diff --git a/include/core/mem_alignment.hpp b/include/core/mem_alignment.hpp new file mode 100644 index 00000000..39f208af --- /dev/null +++ b/include/core/mem_alignment.hpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#pragma once + +namespace roccv { + +/** + * @brief Defines memory alignment for containers. + * + */ +class MemAlignment { + public: + MemAlignment() = default; + + /** + * @brief Returns the base address alignment. + * + * @return The base address alignment. + */ + int32_t baseAddr() const; + + /** + * @brief Returns the row address alignment. + * + * @return The row address alignment. + */ + int32_t rowAddr() const; + + /** + * @brief Sets the base address alignment. + * + * @param[in] alignment Alignment in bytes. + * @return A reference to this object, with the base address set. + */ + MemAlignment& baseAddr(int32_t alignment); + + /** + * @brief Sets the row address alignment. + * + * @param[in] alignment Alignment in bytes. + * @return A reference to this object, with the row address set. + */ + MemAlignment& rowAddr(int32_t alignment); + + private: + int32_t m_baseAddrAlignment = 0; + int32_t m_rowAddrAlignment = 0; +}; +} // namespace roccv \ No newline at end of file diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index f32cb498..f17d4d33 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -26,6 +26,8 @@ THE SOFTWARE. #include "core/data_type.hpp" #include "core/detail/allocators/i_allocator.hpp" +#include "core/detail/context.hpp" +#include "core/mem_alignment.hpp" #include "core/tensor_layout.hpp" #include "core/util_enums.h" #include "tensor_data.hpp" @@ -50,8 +52,7 @@ class Tensor { * * @param[in] reqs An object representing the requirements for this tensor. 
*/ - explicit Tensor(const TensorRequirements &reqs); - explicit Tensor(const TensorRequirements &reqs, const IAllocator &alloc); + explicit Tensor(const TensorRequirements &reqs, const IAllocator &alloc = GlobalContext().getDefaultAllocator()); /** * @brief Constructs a Tensor object given a list of requirements and the underlying data as a TensorStorage @@ -60,8 +61,8 @@ class Tensor { * @param[in] reqs An object representing the requirements for this tensor. * @param[in] data A TensorStorage object for the tensor's underlying data. */ - explicit Tensor(const TensorRequirements &reqs, std::shared_ptr data); - explicit Tensor(const TensorRequirements &reqs, std::shared_ptr data, const IAllocator &alloc); + explicit Tensor(const TensorRequirements &reqs, std::shared_ptr data, + const IAllocator &alloc = GlobalContext().getDefaultAllocator()); /** * @brief Constructs a tensor object and allocates the appropriate amount of memory on the specified device. @@ -71,7 +72,8 @@ class Tensor { * @param[in] device The device the tensor should be allocated on. */ explicit Tensor(const TensorShape &shape, DataType dtype, const eDeviceType device = eDeviceType::GPU); - explicit Tensor(const TensorShape &shape, DataType dtype, const IAllocator &alloc, + explicit Tensor(const TensorShape &shape, DataType dtype, const MemAlignment &bufAlign, + const IAllocator &alloc = GlobalContext().getDefaultAllocator(), const eDeviceType device = eDeviceType::GPU); /** @@ -84,7 +86,8 @@ class Tensor { * @param[in] device The device the tensor should be allocated on. 
*/ explicit Tensor(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device = eDeviceType::GPU); - explicit Tensor(int num_images, Size2D image_size, ImageFormat fmt, const IAllocator &alloc, + explicit Tensor(int num_images, Size2D image_size, ImageFormat fmt, const MemAlignment &bufAlign, + const IAllocator &alloc = GlobalContext().getDefaultAllocator(), eDeviceType device = eDeviceType::GPU); Tensor(const Tensor &other) = delete; @@ -189,6 +192,8 @@ class Tensor { */ static Requirements CalcRequirements(const TensorShape &shape, const DataType &dtype, const eDeviceType device = eDeviceType::GPU); + static Requirements CalcRequirements(const TensorShape &shape, const DataType &dtype, const MemAlignment &bufAlign, + const eDeviceType device = eDeviceType::GPU); /** * @brief Calculates tensor requirements. @@ -214,6 +219,8 @@ class Tensor { */ static Requirements CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device = eDeviceType::GPU); + static Requirements CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt, + const MemAlignment &bufAlign, eDeviceType device = eDeviceType::GPU); /** * @brief Calculates strides required for a tensor. diff --git a/src/core/mem_alignment.cpp b/src/core/mem_alignment.cpp new file mode 100644 index 00000000..247974b1 --- /dev/null +++ b/src/core/mem_alignment.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "core/mem_alignment.hpp" + +namespace roccv { +int32_t MemAlignment::baseAddr() const { return m_baseAddrAlignment; } + +int32_t MemAlignment::rowAddr() const { return m_rowAddrAlignment; } + +MemAlignment& MemAlignment::baseAddr(int32_t alignment) { + m_baseAddrAlignment = alignment; + return *this; +} + +MemAlignment& MemAlignment::rowAddr(int32_t alignment) { + m_rowAddrAlignment = alignment; + return *this; +} +} // namespace roccv \ No newline at end of file diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index 4eeae37b..6e68b8ef 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -28,6 +28,7 @@ THE SOFTWARE. 
#include "core/detail/context.hpp" #include "core/exception.hpp" #include "core/image_format.hpp" +#include "core/mem_alignment.hpp" #include "core/status_type.h" #include "core/tensor_data.hpp" #include "core/tensor_layout.hpp" @@ -39,28 +40,26 @@ THE SOFTWARE. namespace roccv { // Constructor definitions -Tensor::Tensor(const TensorRequirements& reqs) : Tensor(reqs, GlobalContext().getDefaultAllocator()) {} - -Tensor::Tensor(const TensorRequirements& reqs, const IAllocator& alloc) : m_requirements(reqs), m_allocator(alloc) { +Tensor::Tensor(const Tensor::Requirements& reqs, const IAllocator& alloc) : m_requirements(reqs), m_allocator(alloc) { size_t numBytes = reqs.device == eDeviceType::GPU ? reqs.res.deviceMem.bytes : reqs.res.hostMem.bytes; m_data = std::make_shared(numBytes, reqs.device, alloc); } -Tensor::Tensor(const TensorRequirements& reqs, std::shared_ptr data) - : Tensor(reqs, data, GlobalContext().getDefaultAllocator()) {} -Tensor::Tensor(const TensorRequirements& reqs, std::shared_ptr data, const IAllocator& alloc) +Tensor::Tensor(const Tensor::Requirements& reqs, std::shared_ptr data, const IAllocator& alloc) : m_requirements(reqs), m_data(data), m_allocator(alloc) {} Tensor::Tensor(const TensorShape& shape, DataType dtype, const eDeviceType device) - : Tensor(shape, dtype, GlobalContext().getDefaultAllocator(), device) {} + : Tensor(shape, dtype, {}, GlobalContext().getDefaultAllocator(), device) {} -Tensor::Tensor(const TensorShape& shape, DataType dtype, const IAllocator& alloc, const eDeviceType device) +Tensor::Tensor(const TensorShape& shape, DataType dtype, const MemAlignment& bufAlign, const IAllocator& alloc, + const eDeviceType device) : Tensor(CalcRequirements(shape, dtype, device), alloc) {} Tensor::Tensor(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device) - : Tensor(num_images, image_size, fmt, GlobalContext().getDefaultAllocator(), device) {} + : Tensor(num_images, image_size, fmt, {}, 
GlobalContext().getDefaultAllocator(), device) {} -Tensor::Tensor(int num_images, Size2D image_size, ImageFormat fmt, const IAllocator& alloc, eDeviceType device) +Tensor::Tensor(int num_images, Size2D image_size, ImageFormat fmt, const MemAlignment& bufAlign, + const IAllocator& alloc, eDeviceType device) : Tensor(CalcRequirements(num_images, image_size, fmt, device), alloc) {} Tensor::Tensor(Tensor&& other) @@ -108,7 +107,7 @@ Tensor Tensor::reshape(const TensorShape& new_shape) const { eStatusType::INVALID_VALUE); } - TensorRequirements reqs = CalcRequirements(new_shape, this->dtype(), this->device()); + Tensor::Requirements reqs = CalcRequirements(new_shape, this->dtype(), this->device()); return Tensor(reqs, m_data); } @@ -117,7 +116,7 @@ Tensor Tensor::reshape(const TensorShape& new_shape, const DataType& new_dtype) throw Exception("New tensor view must have the same underlying number of bytes.", eStatusType::INVALID_VALUE); } - TensorRequirements reqs = CalcRequirements(new_shape, new_dtype, this->device()); + Tensor::Requirements reqs = CalcRequirements(new_shape, new_dtype, this->device()); return Tensor(reqs, m_data); } @@ -127,15 +126,21 @@ Tensor& Tensor::operator=(const Tensor& other) { return *this; } -TensorRequirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, const eDeviceType device) { +Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, + const eDeviceType device) { + return CalcRequirements(shape, dtype, (MemAlignment){}, device); +} + +Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, + const MemAlignment& bufAlign, const eDeviceType device) { std::array strides = CalcStrides(shape, dtype); - TensorRequirements reqs = CalcRequirements(shape, dtype, strides, device); + Tensor::Requirements reqs = CalcRequirements(shape, dtype, strides, device); return reqs; } -TensorRequirements Tensor::CalcRequirements(const TensorShape& 
shape, const DataType& dtype, - std::array strides, eDeviceType device) { - TensorRequirements reqs; +Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, + std::array strides, eDeviceType device) { + Tensor::Requirements reqs; reqs.shape = shape.shape(); reqs.rank = shape.layout().rank(); @@ -145,24 +150,38 @@ TensorRequirements Tensor::CalcRequirements(const TensorShape& shape, const Data reqs.alignBytes = 0; // TODO: Must be specified later reqs.device = device; - // TODO: Resource requirements should be calculated differently later, once padded/aligned strides have been - // implemented + // Determine resource usage size_t numBytes = reqs.strides[0] * reqs.shape[0]; - if (reqs.device == eDeviceType::GPU) { - reqs.res.deviceMem.bytes = numBytes; - } else if (reqs.device == eDeviceType::CPU) { - reqs.res.hostMem.bytes = numBytes; + switch (reqs.device) { + case eDeviceType::GPU: { + reqs.res.deviceMem.bytes = numBytes; + break; + } + + case eDeviceType::CPU: { + reqs.res.hostMem.bytes = numBytes; + break; + } + + default: { + throw Exception("Unsupported device when calling Tensor::CalcRequirements().", eStatusType::INVALID_VALUE); + } } return reqs; } -TensorRequirements Tensor::CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device) { +Tensor::Requirements Tensor::CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device) { + return CalcRequirements(num_images, image_size, fmt, (MemAlignment){}, device); +} + +Tensor::Requirements Tensor::CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt, + const MemAlignment& bufAlign, eDeviceType device) { // TODO: Need to support different types of tensor layouts. This will happen once more image formats are supported // first. 
TensorShape shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), {num_images, image_size.h, image_size.w, fmt.channels()}); - return CalcRequirements(shape, DataType(fmt.dtype()), device); + return CalcRequirements(shape, DataType(fmt.dtype()), bufAlign, device); } std::array Tensor::CalcStrides(const TensorShape& shape, const DataType& dtype) { @@ -188,8 +207,9 @@ Tensor TensorWrapData(const TensorData& tensor_data) { for (int i = 0; i < tensorDataStrided->rank(); i++) { strides[i] = tensorDataStrided->stride(i); } - TensorRequirements reqs = Tensor::CalcRequirements(tensorDataStrided->shape(), tensorDataStrided->dtype(), strides, - tensorDataStrided->device()); + + Tensor::Requirements reqs = Tensor::CalcRequirements(tensorDataStrided->shape(), tensorDataStrided->dtype(), + strides, tensorDataStrided->device()); auto data = std::make_shared(tensorDataStrided->basePtr(), tensorDataStrided->device(), eOwnership::OWNING); From f345ea072334163ba715d1dc7449ccbc4be8048d Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 20 Nov 2025 12:34:08 -0500 Subject: [PATCH 02/36] Document all constructors for roccv::Tensor --- include/core/mem_alignment.hpp | 2 ++ include/core/tensor.hpp | 63 ++++++++++++++++++++++++++++----- include/core/tensor_storage.hpp | 7 ++++ src/core/tensor.cpp | 4 +-- src/core/tensor_storage.cpp | 3 ++ 5 files changed, 68 insertions(+), 11 deletions(-) diff --git a/include/core/mem_alignment.hpp b/include/core/mem_alignment.hpp index 39f208af..ca17e349 100644 --- a/include/core/mem_alignment.hpp +++ b/include/core/mem_alignment.hpp @@ -21,6 +21,8 @@ #pragma once +#include + namespace roccv { /** diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index f17d4d33..4f85625f 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -61,24 +61,35 @@ class Tensor { * @param[in] reqs An object representing the requirements for this tensor. * @param[in] data A TensorStorage object for the tensor's underlying data. 
*/ - explicit Tensor(const TensorRequirements &reqs, std::shared_ptr data, - const IAllocator &alloc = GlobalContext().getDefaultAllocator()); + explicit Tensor(const TensorRequirements &reqs, std::shared_ptr data); /** - * @brief Constructs a tensor object and allocates the appropriate amount of memory on the specified device. + * @brief Constructs a tensor object and allocates the appropriate amount of memory on the specified device. Uses + * the default memory alignment and allocation strategy. * * @param[in] shape The shape describing the tensor. * @param[in] dtype The underlying datatype of the tensor. * @param[in] device The device the tensor should be allocated on. */ explicit Tensor(const TensorShape &shape, DataType dtype, const eDeviceType device = eDeviceType::GPU); + + /** + * @brief Constructs a tensor object and allocates the appropriate amount of memory on the specified device. Uses a + * user-specified memory alignment and allocation strategy. + * + * @param[in] shape The shape describing the tensor. + * @param[in] dtype The underlying datatype of the tensor. + * @param[in] bufAlign Specification for memory alignment. + * @param[in] alloc The allocation strategy. (Default: DefaultAllocator) + * @param[in] device The device the tensor should be allocated on. + */ explicit Tensor(const TensorShape &shape, DataType dtype, const MemAlignment &bufAlign, const IAllocator &alloc = GlobalContext().getDefaultAllocator(), const eDeviceType device = eDeviceType::GPU); /** * @brief Constructs a tensor using image-based requirements and allocates the appropriate amount of memory on the - * specified device. + * specified device. Uses the default memory alignment and allocation strategy. * * @param[in] num_images The number of images in the batch. * @param[in] image_size The size for images in the batch. @@ -86,6 +97,18 @@ class Tensor { * @param[in] device The device the tensor should be allocated on. 
*/ explicit Tensor(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device = eDeviceType::GPU); + + /** + * @brief Constructs a tensor using image-based requirements and allocates the appropriate amount of memory on the + * specified device. Uses user-provided memory alignment and allocation strategies. + * + * @param[in] num_images The number of images in the batch. + * @param[in] image_size The size for images in the batch. + * @param[in] fmt The format of the underlying image data. + * @param[in] bufAlign Specification for memory alignment. + * @param[in] alloc The allocation strategy. (Default: DefaultAllocator) + * @param[in] device The device the tensor should be allocated on. + */ explicit Tensor(int num_images, Size2D image_size, ImageFormat fmt, const MemAlignment &bufAlign, const IAllocator &alloc = GlobalContext().getDefaultAllocator(), eDeviceType device = eDeviceType::GPU); @@ -181,8 +204,7 @@ class Tensor { Tensor &operator=(const Tensor &other); /** - * @brief Calculates tensor requirements. This essentially wraps the - * provided parameters into a TensorRequirements object. + * @brief Calculates tensor requirements using the default memory alignment strategy. * * @param[in] shape The desired shape of the tensor. * @param[in] dtype The desired data type of the tensor's raw data. @@ -192,11 +214,22 @@ class Tensor { */ static Requirements CalcRequirements(const TensorShape &shape, const DataType &dtype, const eDeviceType device = eDeviceType::GPU); + + /** + * @brief Calculates tensor requirements with a user-provided memory alignment strategy. + * + * @param[in] shape The desired shape of the tensor. + * @param[in] dtype The desired data type of the tensor's raw data. + * @param[in] bufAlign Specification for memory alignment. + * @param[in] device The device the tensor data should belong to. + * @return A TensorRequirements object representing this tensor's + * requirements. 
+ */ static Requirements CalcRequirements(const TensorShape &shape, const DataType &dtype, const MemAlignment &bufAlign, const eDeviceType device = eDeviceType::GPU); /** - * @brief Calculates tensor requirements. + * @brief Calculates tensor requirements with user-provided strides. * * @param[in] shape The shape describing the tensor. * @param[in] dtype The type of the tensor's data. @@ -209,16 +242,28 @@ class Tensor { eDeviceType device = eDeviceType::GPU); /** - * @brief Calculates tensor requirements using image-based parameters. + * @brief Calculates tensor requirements using image-based parameters. This will use a default memory alignment + * strategy. * * @param[in] num_images The number of images in the batch. * @param[in] image_size The size for images in the batch. * @param[in] fmt The format of the underlying image data. * @param[in] device The device the tensor data should belong to. - * @return A TensorRequirements object representing the tensor's requirements. + * @return A Tensor::Requirements object representing the tensor's requirements. */ static Requirements CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device = eDeviceType::GPU); + + /** + * @brief Calculates tensor requirements using image-based parameters and a specified memory alignment. + * + * @param[in] num_images The number of images in the batch. + * @param[in] image_size The size of images in the batch. + * @param[in] fmt The format of the underling image data. + * @param[in] bufAlign Specification for memory alignment. + * @param[in] device The device the tensor is to be allocated on. + * @return A Tensor::Requirements object representing this tensor's requirements. 
+ */ static Requirements CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt, const MemAlignment &bufAlign, eDeviceType device = eDeviceType::GPU); diff --git a/include/core/tensor_storage.hpp b/include/core/tensor_storage.hpp index 7d966ca6..c97d3125 100644 --- a/include/core/tensor_storage.hpp +++ b/include/core/tensor_storage.hpp @@ -71,6 +71,13 @@ class TensorStorage { */ const eDeviceType device() const; + /** + * @brief Returns the allocation strategy being used. + * + * @return The allocation strategy being used. + */ + const IAllocator& allocator() const; + private: eDeviceType m_device; eOwnership m_ownership; diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index 6e68b8ef..b2902253 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -45,8 +45,8 @@ Tensor::Tensor(const Tensor::Requirements& reqs, const IAllocator& alloc) : m_re m_data = std::make_shared(numBytes, reqs.device, alloc); } -Tensor::Tensor(const Tensor::Requirements& reqs, std::shared_ptr data, const IAllocator& alloc) - : m_requirements(reqs), m_data(data), m_allocator(alloc) {} +Tensor::Tensor(const Tensor::Requirements& reqs, std::shared_ptr data) + : m_requirements(reqs), m_data(data), m_allocator(data->allocator()) {} Tensor::Tensor(const TensorShape& shape, DataType dtype, const eDeviceType device) : Tensor(shape, dtype, {}, GlobalContext().getDefaultAllocator(), device) {} diff --git a/src/core/tensor_storage.cpp b/src/core/tensor_storage.cpp index 99686c69..d83d5c44 100644 --- a/src/core/tensor_storage.cpp +++ b/src/core/tensor_storage.cpp @@ -62,4 +62,7 @@ TensorStorage::~TensorStorage() { void* TensorStorage::data() const { return m_data; } const eDeviceType TensorStorage::device() const { return m_device; } + +const IAllocator& TensorStorage::allocator() const { return m_allocator; } + } // namespace roccv From cf99a9c0dfac883e5d9d4543b2c0b30f022df673 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 20 Nov 2025 13:04:30 -0500 Subject: [PATCH 
03/36] Add MemAlign parameter to Tensor::CalcStrides() --- include/core/tensor.hpp | 14 ++++++++------ src/core/tensor.cpp | 14 ++++++-------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index 4f85625f..cb32a149 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -28,11 +28,11 @@ THE SOFTWARE. #include "core/detail/allocators/i_allocator.hpp" #include "core/detail/context.hpp" #include "core/mem_alignment.hpp" +#include "core/tensor_data.hpp" #include "core/tensor_layout.hpp" +#include "core/tensor_requirements.hpp" +#include "core/tensor_storage.hpp" #include "core/util_enums.h" -#include "tensor_data.hpp" -#include "tensor_requirements.hpp" -#include "tensor_storage.hpp" namespace roccv { @@ -268,18 +268,20 @@ class Tensor { const MemAlignment &bufAlign, eDeviceType device = eDeviceType::GPU); /** - * @brief Calculates strides required for a tensor. + * @brief Calculates strides required for a tensor. Uses a user-specified memory alignment strategy to determine the + * strides with padding in mind. * * @param shape The tensor shape. * @param dtype The datatype of the tensor. + * @param bufAlign The memory alignment strategy to use. * @return An array containing strides for the given parameters. */ - static std::array CalcStrides(const TensorShape &shape, const DataType &dtype); + static std::array CalcStrides(const TensorShape &shape, const DataType &dtype, + const MemAlignment &bufAlign); private: TensorRequirements m_requirements; // Tensor metadata std::shared_ptr m_data; // Stores raw tensor data - const IAllocator &m_allocator; }; /** diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index b2902253..2b272e56 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -40,13 +40,13 @@ THE SOFTWARE. 
namespace roccv { // Constructor definitions -Tensor::Tensor(const Tensor::Requirements& reqs, const IAllocator& alloc) : m_requirements(reqs), m_allocator(alloc) { +Tensor::Tensor(const Tensor::Requirements& reqs, const IAllocator& alloc) : m_requirements(reqs) { size_t numBytes = reqs.device == eDeviceType::GPU ? reqs.res.deviceMem.bytes : reqs.res.hostMem.bytes; m_data = std::make_shared(numBytes, reqs.device, alloc); } Tensor::Tensor(const Tensor::Requirements& reqs, std::shared_ptr data) - : m_requirements(reqs), m_data(data), m_allocator(data->allocator()) {} + : m_requirements(reqs), m_data(data) {} Tensor::Tensor(const TensorShape& shape, DataType dtype, const eDeviceType device) : Tensor(shape, dtype, {}, GlobalContext().getDefaultAllocator(), device) {} @@ -62,10 +62,7 @@ Tensor::Tensor(int num_images, Size2D image_size, ImageFormat fmt, const MemAlig const IAllocator& alloc, eDeviceType device) : Tensor(CalcRequirements(num_images, image_size, fmt, device), alloc) {} -Tensor::Tensor(Tensor&& other) - : m_requirements(std::move(other.m_requirements)), - m_data(std::move(other.m_data)), - m_allocator(other.m_allocator) {} +Tensor::Tensor(Tensor&& other) : m_requirements(std::move(other.m_requirements)), m_data(std::move(other.m_data)) {} // Member definitions int Tensor::rank() const { return m_requirements.rank; } @@ -133,7 +130,7 @@ Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const Da Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, const MemAlignment& bufAlign, const eDeviceType device) { - std::array strides = CalcStrides(shape, dtype); + std::array strides = CalcStrides(shape, dtype, bufAlign); Tensor::Requirements reqs = CalcRequirements(shape, dtype, strides, device); return reqs; } @@ -184,7 +181,8 @@ Tensor::Requirements Tensor::CalcRequirements(int num_images, Size2D image_size, return CalcRequirements(shape, DataType(fmt.dtype()), bufAlign, device); } -std::array 
Tensor::CalcStrides(const TensorShape& shape, const DataType& dtype) { +std::array Tensor::CalcStrides(const TensorShape& shape, const DataType& dtype, + const MemAlignment& bufAlign) { // TODO: Support memory alignment and padding in stride calculations // Calculate strides based on the given tensor shape. Strides are byte-wise. From 21f0cb8739578e616b40a83661371a4437db9d70 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 20 Nov 2025 16:03:44 -0500 Subject: [PATCH 04/36] Added Tensor copy constructor an Tensor::dataSize() implementation --- include/core/tensor.hpp | 20 ++++++++++++++------ src/core/tensor.cpp | 20 ++++++++++++++++++-- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index cb32a149..c067d8df 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -27,20 +27,18 @@ THE SOFTWARE. #include "core/data_type.hpp" #include "core/detail/allocators/i_allocator.hpp" #include "core/detail/context.hpp" +#include "core/image_format.hpp" #include "core/mem_alignment.hpp" #include "core/tensor_data.hpp" #include "core/tensor_layout.hpp" #include "core/tensor_requirements.hpp" +#include "core/tensor_shape.hpp" #include "core/tensor_storage.hpp" #include "core/util_enums.h" +#include "operator_types.h" namespace roccv { -class ImageFormat; -struct Size2D; -class TensorShape; -class TensorLayout; - class Tensor { public: using Requirements = TensorRequirements; @@ -113,7 +111,10 @@ class Tensor { const IAllocator &alloc = GlobalContext().getDefaultAllocator(), eDeviceType device = eDeviceType::GPU); - Tensor(const Tensor &other) = delete; + // Copy constructor + Tensor(const Tensor &other); + + // Move constructor Tensor(Tensor &&other); /** @@ -203,6 +204,13 @@ class Tensor { Tensor &operator=(const Tensor &other); + /** + * @brief Returns the total number of bytes being used to store the raw tensor data. + * + * @return Total number of bytes being used to store the raw tensor data. 
+ */ + size_t dataSize() const; + /** * @brief Calculates tensor requirements using the default memory alignment strategy. * diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index 2b272e56..206aa2e6 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -41,8 +41,7 @@ namespace roccv { // Constructor definitions Tensor::Tensor(const Tensor::Requirements& reqs, const IAllocator& alloc) : m_requirements(reqs) { - size_t numBytes = reqs.device == eDeviceType::GPU ? reqs.res.deviceMem.bytes : reqs.res.hostMem.bytes; - m_data = std::make_shared(numBytes, reqs.device, alloc); + m_data = std::make_shared(this->dataSize(), reqs.device, alloc); } Tensor::Tensor(const Tensor::Requirements& reqs, std::shared_ptr data) @@ -62,6 +61,12 @@ Tensor::Tensor(int num_images, Size2D image_size, ImageFormat fmt, const MemAlig const IAllocator& alloc, eDeviceType device) : Tensor(CalcRequirements(num_images, image_size, fmt, device), alloc) {} +// Copy constructor +Tensor::Tensor(const Tensor& other) : m_requirements(other.m_requirements) { + m_data = std::make_shared(this->dataSize(), m_requirements.device, other.m_data->allocator()); +} + +// Move constructor Tensor::Tensor(Tensor&& other) : m_requirements(std::move(other.m_requirements)), m_data(std::move(other.m_data)) {} // Member definitions @@ -123,6 +128,17 @@ Tensor& Tensor::operator=(const Tensor& other) { return *this; } +size_t Tensor::dataSize() const { + switch (m_requirements.device) { + case eDeviceType::GPU: + return m_requirements.res.deviceMem.bytes; + case eDeviceType::CPU: + return m_requirements.res.hostMem.bytes; + } + + return 0; +} + Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, const eDeviceType device) { return CalcRequirements(shape, dtype, (MemAlignment){}, device); From 0a761ce4166499867ec9e1fd4098cca354c4fe0a Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 24 Nov 2025 10:48:12 -0500 Subject: [PATCH 05/36] Remove copy constructor for 
roccv::Tensor --- include/core/tensor.hpp | 5 +---- src/core/tensor.cpp | 5 ----- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index c067d8df..da2d390b 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -111,10 +111,7 @@ class Tensor { const IAllocator &alloc = GlobalContext().getDefaultAllocator(), eDeviceType device = eDeviceType::GPU); - // Copy constructor - Tensor(const Tensor &other); - - // Move constructor + Tensor(const Tensor &other) = delete; Tensor(Tensor &&other); /** diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index 206aa2e6..97492115 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -61,11 +61,6 @@ Tensor::Tensor(int num_images, Size2D image_size, ImageFormat fmt, const MemAlig const IAllocator& alloc, eDeviceType device) : Tensor(CalcRequirements(num_images, image_size, fmt, device), alloc) {} -// Copy constructor -Tensor::Tensor(const Tensor& other) : m_requirements(other.m_requirements) { - m_data = std::make_shared(this->dataSize(), m_requirements.device, other.m_data->allocator()); -} - // Move constructor Tensor::Tensor(Tensor&& other) : m_requirements(std::move(other.m_requirements)), m_data(std::move(other.m_data)) {} From fd84334c2828b2b039ff16ecba1f6c88e90badff Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 27 Nov 2025 11:49:38 -0500 Subject: [PATCH 06/36] Implement memory padding and alignment for tensors --- include/core/mem_alignment.hpp | 21 +++++++- include/core/tensor.hpp | 20 ++++++-- include/core/tensor_requirements.hpp | 24 +++++++++ include/core/tensor_storage.hpp | 1 - include/core/utils.hpp | 77 ++++++++++++++++++++++++++++ python/include/py_tensor.hpp | 77 +++++++++++++++++++++++----- python/src/py_tensor.cpp | 42 +++++++++++---- src/core/tensor.cpp | 56 ++++++++++++++++---- 8 files changed, 277 insertions(+), 41 deletions(-) create mode 100644 include/core/utils.hpp diff --git a/include/core/mem_alignment.hpp 
b/include/core/mem_alignment.hpp index ca17e349..4aceceba 100644 --- a/include/core/mem_alignment.hpp +++ b/include/core/mem_alignment.hpp @@ -26,8 +26,27 @@ namespace roccv { /** - * @brief Defines memory alignment for containers. + * @class MemAlignment + * @brief Class for specifying memory alignment constraints for buffer allocations. * + * The MemAlignment class allows you to specify the memory alignment in bytes that should + * be used when allocating or working with device or host memory buffers. Proper alignment + * is important for performance reasons and hardware compatibility, especially for GPU operations. + * + * There are two types of alignment constraints: + * - Base address alignment: Alignment requirements for the starting address of the buffer. + * - Row address alignment: Alignment requirements for the starting address of each row, which is + * important for multi-dimensional data (e.g., images or tensors). + * + * Both alignment values default to 0, which implies that the system or device default alignment will be used. + * + * Example usage: + * @code + * roccv::MemAlignment align; + * align.baseAddr(256).rowAddr(128); + * @endcode + * + * @see roccv::Tensor */ class MemAlignment { public: diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index da2d390b..14348f63 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -199,6 +199,16 @@ class Tensor { */ Tensor reshape(const TensorShape &new_shape, const DataType &new_dtype) const; + /** + * @brief Performs a shallow copy of the tensor (creates a view). + * + * This assignment operator copies the tensor's metadata and data handle, + * resulting in a new tensor object that shares the same underlying data + * with the original tensor. No deep copy of the data is performed. + * + * @param other The tensor to assign from. + * @return Reference to this tensor. 
+ */ Tensor &operator=(const Tensor &other); /** @@ -239,11 +249,12 @@ class Tensor { * @param[in] shape The shape describing the tensor. * @param[in] dtype The type of the tensor's data. * @param[in] strides The tensor's strides. + * @param[in] baseAlign The base address alignment. * @param[in] device The device the tensor data belongs on. (Default: GPU) * @return Tensor requirements. */ static Requirements CalcRequirements(const TensorShape &shape, const DataType &dtype, - std::array strides, + std::array strides, int32_t baseAlign, eDeviceType device = eDeviceType::GPU); /** @@ -273,16 +284,15 @@ class Tensor { const MemAlignment &bufAlign, eDeviceType device = eDeviceType::GPU); /** - * @brief Calculates strides required for a tensor. Uses a user-specified memory alignment strategy to determine the - * strides with padding in mind. + * @brief Calculates strides required for a tensor. * * @param shape The tensor shape. * @param dtype The datatype of the tensor. - * @param bufAlign The memory alignment strategy to use. + * @param rowAlign The row alignment to use. Setting to 0 will ensure contiguous memory usage. * @return An array containing strides for the given parameters. */ static std::array CalcStrides(const TensorShape &shape, const DataType &dtype, - const MemAlignment &bufAlign); + int32_t rowAlign); private: TensorRequirements m_requirements; // Tensor metadata diff --git a/include/core/tensor_requirements.hpp b/include/core/tensor_requirements.hpp index 0bb6f631..3ff30d4e 100644 --- a/include/core/tensor_requirements.hpp +++ b/include/core/tensor_requirements.hpp @@ -29,16 +29,40 @@ THE SOFTWARE. namespace roccv { +/** + * @brief Specifies basic memory requirements for an allocation. + * + * This struct expresses the number of bytes required for a memory region that backs + * a particular tensor or buffer allocation. It is typically used to indicate the raw + * size required for device, host, or pinned memory allocations. 
+ */ struct MemRequirements { size_t bytes = 0; }; +/** + * @brief Specifies resource requirements for tensor memory allocations. + * + * This struct aggregates requirements for different types of memory resources + * that may be used for tensor allocation and operation: + * - deviceMem: Memory required on the device (e.g., GPU). + * - hostMem: Memory required on the host (CPU-accessible memory). + * - pinnedMem: Memory required in pinned (page-locked) host memory, which may + * be used for efficient device-host transfers. + */ struct ResourceRequirements { MemRequirements deviceMem; MemRequirements hostMem; MemRequirements pinnedMem; }; +/** + * @brief Specifies the requirements for creating and allocating a tensor. + * + * This struct defines all the necessary properties for specifying the memory and layout + * requirements of a tensor, including its datatype, shape, memory alignment, layout, strides, + * memory resource requirements, and device placement. + */ struct TensorRequirements { eDataType dtype; eTensorLayout layout; diff --git a/include/core/tensor_storage.hpp b/include/core/tensor_storage.hpp index c97d3125..201b274a 100644 --- a/include/core/tensor_storage.hpp +++ b/include/core/tensor_storage.hpp @@ -84,5 +84,4 @@ class TensorStorage { void* m_data; const IAllocator& m_allocator; }; - } // namespace roccv \ No newline at end of file diff --git a/include/core/utils.hpp b/include/core/utils.hpp new file mode 100644 index 00000000..480d4e29 --- /dev/null +++ b/include/core/utils.hpp @@ -0,0 +1,77 @@ +/** +Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include + +#include + +namespace roccv::detail { +/** + * @brief Returns the next power of two greater than or equal to the given value. + * + * @tparam T Integral type of the input and output. + * @param value The value to find the next power of two for. + * @return The next power of two greater than or equal to the given value. + */ +template >> +inline constexpr T NextPowerOfTwo(T value) noexcept { + if (value <= 1) return 1; +#if defined(__GNUC__) || defined(__clang__) + // For unsigned types, use clz built-in. For signed, cast to unsigned. + using U = std::make_unsigned_t; + constexpr int numBits = sizeof(T) * 8; + return static_cast(U(1) << (numBits - __builtin_clz(static_cast(value - 1)))); +#else + // Portable fallback: fill lower bits, then add one. 
+ value--; + for (size_t i = 1; i < sizeof(T) * 8; i <<= 1) value |= (value >> i); + return value + 1; +#endif +} + +/** + * @brief Returns true if the given value is a power of two. + * + * @param value The value to check. + * @return True if the given value is a power of two, false otherwise. + */ +template >> +inline constexpr bool IsPowerOfTwo(T value) noexcept { + return value > 0 && (value & (value - 1)) == 0; +} + +/** + * @brief Aligns the given value to the nearest multiple of the given alignment. + * + * @param value The value to align. + * @param alignment The alignment to align to (must be > 0). + * @return The aligned value (same type as value). + */ +template && std::is_integral_v>> +inline constexpr T AlignUp(T value, U alignment) noexcept { + return alignment > 0 + ? (value + static_cast(alignment) - 1) / static_cast(alignment) * static_cast(alignment) + : value; +} +} // namespace roccv::detail \ No newline at end of file diff --git a/python/include/py_tensor.hpp b/python/include/py_tensor.hpp index 523a5f27..0c940148 100644 --- a/python/include/py_tensor.hpp +++ b/python/include/py_tensor.hpp @@ -29,8 +29,19 @@ THE SOFTWARE. #include #include -namespace py = pybind11; - +/** + * @brief A Python-facing wrapper for roccv::Tensor, providing DLPack interoperability and Pybind11 integration. + * + * The PyTensor class serves as a container around roccv::Tensor, exposing its functionality to Python. It supports + * construction from shape, data type, layout, and device information, as well as wrapping existing tensors or + * external DLPack-managed tensors. The class ensures correct lifetime handling for external resources (such as those + * transferred via DLPack) and enables seamless data movement between devices (CPU/GPU). PyTensor also supplies + * utilities to export to and import from DLPack capsules, making it suitable for zero-copy interoperability with + * frameworks like PyTorch, NumPy, or TensorFlow. 
+ * + * Usage scenarios include direct creation from Python, import/export to DLPack, device copying, and serving as a + * type-safe bridge between Python and C++ tensor operations. + */ class PyTensor : public std::enable_shared_from_this { public: /** @@ -79,22 +90,51 @@ class PyTensor : public std::enable_shared_from_this { std::shared_ptr copyTo(eDeviceType device); /** - * @brief Creates a new tensor given a capsule containing a DLManagedTensor. + * @brief Creates a new PyTensor by consuming an external DLPack capsule. * - * @param src A capsule containing a DLManagedTensor. - * @param layout The layout for the new tensor. - * @return std::shared_ptr + * This static method constructs a PyTensor that wraps a new roccv::Tensor allocated + * from the contents of a DLPack capsule (i.e., an object supporting the __dlpack__ protocol), + * typically exported from other frameworks such as PyTorch, NumPy, or TVM. + * The shape and datatype are taken from the capsule's DLTensor metadata, while the layout + * must be specified explicitly, since DLPack does not encode layout. + * + * Ownership of the underlying DLPack-managed memory is transferred to the returned PyTensor, + * such that when the PyTensor is destroyed (and no Python references remain), the deleter + * from the DLPack capsule is called, ensuring correct cross-framework resource management. + * + * @param src Python object supporting the __dlpack__() method. This can be a capsule returned by + * __dlpack__() or any object exposing the DLPack consumer protocol. + * @param layout The tensor layout to use for the new roccv::Tensor (e.g., NHWC, NCHW, etc.). + * @return A std::shared_ptr wrapping a new tensor that shares memory with the DLPack object. + * + * @throws std::runtime_error if the object does not support the DLPack protocol, + * if the capsule is invalid or missing, or if conversion fails in any other way. 
+ * + * @note The resulting tensor will have the same shape, datatype, device, and (if present) strides + * as encoded in the DLPack capsule, but the layout must be provided by the caller. */ static std::shared_ptr fromDLPack(pybind11::object src, eTensorLayout layout); /** - * @brief Creates a DLManagedTensor contained in a capsule from the tensor. + * @brief Exports this tensor as a DLPack-compatible capsule. + * + * This method creates a DLPack DLManagedTensor wrapper for the current PyTensor, + * encapsulates it in a Python capsule with the "dltensor" name, and returns it. + * The resulting capsule can be consumed by any framework supporting the DLPack protocol + * for zero-copy tensor data sharing. * - * @param stream Optional stream pointer value (used for pytorch). + * The optional @p stream argument is present to satisfy the Pytorch DLPack consumer interface, + * but is ignored by this implementation currently. * - * @return py::capsule + * @param stream Optional stream pointer (typically unused, present for PyTorch DLPack compliance). + * + * @return py::capsule A Python capsule containing the DLPack DLManagedTensor. + * + * @note The caller is responsible for transferring or managing ownership of the data + * (according to DLPack conventions), including calling the capsule consumer and ensuring + * that the deleter in DLManagedTensor is invoked only once. */ - py::capsule toDLPack(py::object stream); + pybind11::capsule toDLPack(pybind11::object stream); /** * @brief Gets the strides of the tensor as a python list. @@ -151,15 +191,28 @@ class PyTensor : public std::enable_shared_from_this { * @return A python tuple with the first index corresponding to the device type, and the second index corresponding * to the device id. */ - py::tuple getDLDevice(); + pybind11::tuple getDLDevice(); /** * @brief Exports this class in the provided module. * * @param m The python module to export this class to. 
*/ - static void Export(py::module& m); + static void Export(pybind11::module& m); + /** + * @brief Returns a new PyTensor with a reshaped tensor according to the specified shape and layout. + * + * Creates and returns a new PyTensor whose underlying tensor is a view or copy of this tensor, + * with the shape specified by newShape and the layout specified by layout. The number of elements + * in newShape must match the number of elements in the original tensor. + * + * @param newShape The new shape for the tensor. + * @param layout The new layout to use for the reshaped tensor. + * @return std::shared_ptr A new PyTensor with the reshaped tensor. + * + * @throws std::runtime_error if the total number of elements does not match. + */ std::shared_ptr reshape(std::vector newShape, eTensorLayout layout); private: diff --git a/python/src/py_tensor.cpp b/python/src/py_tensor.cpp index c0b9e864..ab784347 100644 --- a/python/src/py_tensor.cpp +++ b/python/src/py_tensor.cpp @@ -97,10 +97,12 @@ std::shared_ptr PyTensor::copyTo(eDeviceType device) { } std::shared_ptr PyTensor::fromDLPack(pybind11::object src, eTensorLayout layout) { + // Check if the object supports the DLPack protocol if (!py::hasattr(src, "__dlpack__")) { throw std::runtime_error("Provided object does not support the DLPack protocol."); } + // Obtain a DLPack capsule py::capsule dlpackCapsule = src.attr("__dlpack__")(); if (!PyCapsule_IsValid(dlpackCapsule.ptr(), "dltensor")) { throw std::runtime_error("Invalid DLPack capsule."); @@ -108,23 +110,41 @@ std::shared_ptr PyTensor::fromDLPack(pybind11::object src, eTensorLayo DLManagedTensor* dlManagedTensor = static_cast(dlpackCapsule.get_pointer()); DLTensor dlTensor = dlManagedTensor->dl_tensor; - // Mark this capsule as consumed, so that the deleter will not free underlying data. 
+ // Mark this capsule as consumed, so that the deleter will not free underlying data dlpackCapsule.set_name("used_dltensor"); - // Copy shape data - std::vector shapeData(dlTensor.ndim); - for (int i = 0; i < dlTensor.ndim; i++) { + // Copy the shape data from DLPack to a fixed-size array + std::array shapeData; + for (int i = 0; i < dlTensor.ndim; ++i) { shapeData[i] = dlTensor.shape[i]; } - - // Create a non-owning roccv::Tensor based on the received data - roccv::TensorShape shape(roccv::TensorLayout(layout), shapeData); + roccv::TensorShape shape(shapeData, dlTensor.ndim, roccv::TensorLayout(layout)); eDeviceType device = DLDeviceToRoccvDevice(dlTensor.device); - roccv::TensorRequirements reqs = - roccv::Tensor::CalcRequirements(shape, roccv::DataType(DLTypeToRoccvType(dlTensor.dtype)), device); - auto data = std::make_shared(dlTensor.data, device, eOwnership::VIEW); - auto tensor = std::make_shared(reqs, data); + eDataType dtype = DLTypeToRoccvType(dlTensor.dtype); + + // Prepare the strides array + std::array stridesData; + + // If strides are not present, assume contiguous layout. We really shouldn't be recalculating strides here. DLPack + // now enforces that the strides are present, but we'll keep this for backwards compatibility. + if (dlTensor.strides == nullptr) { + stridesData = roccv::Tensor::CalcStrides(shape, roccv::DataType(dtype), 0); + } else { + for (int i = 0; i < dlTensor.ndim; ++i) { + stridesData[i] = dlTensor.strides[i]; + } + } + + // Set up the requirements for the new tensor. + roccv::Tensor::Requirements reqs = + roccv::Tensor::CalcRequirements(shape, roccv::DataType(dtype), stridesData, 0, device); + + // Since this tensor is coming from a DLPack, we don't own the data, so we need to create a view of the data. 
+ std::shared_ptr data = + std::make_shared(dlTensor.data, device, eOwnership::VIEW); + std::shared_ptr tensor = std::make_shared(reqs, data); + // Instantiate a new tensor and a PyTensor to wrap it, binding the original DLManagedTensor return std::make_shared(tensor, dlManagedTensor); } diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index 97492115..97907856 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -23,10 +23,12 @@ THE SOFTWARE. #include "core/tensor.hpp" #include +#include #include "core/data_type.hpp" #include "core/detail/context.hpp" #include "core/exception.hpp" +#include "core/hip_assert.h" #include "core/image_format.hpp" #include "core/mem_alignment.hpp" #include "core/status_type.h" @@ -35,6 +37,7 @@ THE SOFTWARE. #include "core/tensor_requirements.hpp" #include "core/tensor_shape.hpp" #include "core/util_enums.h" +#include "core/utils.hpp" #include "operator_types.h" namespace roccv { @@ -52,14 +55,14 @@ Tensor::Tensor(const TensorShape& shape, DataType dtype, const eDeviceType devic Tensor::Tensor(const TensorShape& shape, DataType dtype, const MemAlignment& bufAlign, const IAllocator& alloc, const eDeviceType device) - : Tensor(CalcRequirements(shape, dtype, device), alloc) {} + : Tensor(CalcRequirements(shape, dtype, bufAlign, device), alloc) {} Tensor::Tensor(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device) : Tensor(num_images, image_size, fmt, {}, GlobalContext().getDefaultAllocator(), device) {} Tensor::Tensor(int num_images, Size2D image_size, ImageFormat fmt, const MemAlignment& bufAlign, const IAllocator& alloc, eDeviceType device) - : Tensor(CalcRequirements(num_images, image_size, fmt, device), alloc) {} + : Tensor(CalcRequirements(num_images, image_size, fmt, bufAlign, device), alloc) {} // Move constructor Tensor::Tensor(Tensor&& other) : m_requirements(std::move(other.m_requirements)), m_data(std::move(other.m_data)) {} @@ -141,13 +144,41 @@ Tensor::Requirements 
Tensor::CalcRequirements(const TensorShape& shape, const Da Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, const MemAlignment& bufAlign, const eDeviceType device) { - std::array strides = CalcStrides(shape, dtype, bufAlign); - Tensor::Requirements reqs = CalcRequirements(shape, dtype, strides, device); + int dev; + HIP_VALIDATE_NO_ERRORS(hipGetDevice(&dev)); + + // Validate memory alignment, set default alignment if set to 0. + // TODO: Must be supported for CPU as well. + int rowAlign; + if (bufAlign.rowAddr() == 0) { + HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&rowAlign, hipDeviceAttributeTexturePitchAlignment, dev)); + rowAlign = std::lcm(rowAlign, detail::NextPowerOfTwo(dtype.size())); + } else { + if (!detail::IsPowerOfTwo(bufAlign.rowAddr())) { + throw Exception("Row address alignment must be a power of two.", eStatusType::INVALID_VALUE); + } + rowAlign = std::lcm(bufAlign.rowAddr(), detail::NextPowerOfTwo(dtype.size())); + } + + int baseAlign; + if (bufAlign.baseAddr() == 0) { + HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&baseAlign, hipDeviceAttributeTextureAlignment, dev)); + baseAlign = std::lcm(baseAlign, detail::NextPowerOfTwo(dtype.size())); + } else { + if (!detail::IsPowerOfTwo(bufAlign.baseAddr())) { + throw Exception("Base address alignment must be a power of two.", eStatusType::INVALID_VALUE); + } + baseAlign = std::lcm(bufAlign.baseAddr(), detail::NextPowerOfTwo(dtype.size())); + } + + std::array strides = CalcStrides(shape, dtype, rowAlign); + Tensor::Requirements reqs = CalcRequirements(shape, dtype, strides, baseAlign, device); return reqs; } Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, - std::array strides, eDeviceType device) { + std::array strides, int32_t baseAlign, + eDeviceType device) { Tensor::Requirements reqs; reqs.shape = shape.shape(); @@ -155,7 +186,7 @@ Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const 
Da reqs.layout = shape.layout().elayout(); reqs.strides = strides; reqs.dtype = dtype.etype(); - reqs.alignBytes = 0; // TODO: Must be specified later + reqs.alignBytes = baseAlign; reqs.device = device; // Determine resource usage @@ -193,14 +224,17 @@ Tensor::Requirements Tensor::CalcRequirements(int num_images, Size2D image_size, } std::array Tensor::CalcStrides(const TensorShape& shape, const DataType& dtype, - const MemAlignment& bufAlign) { - // TODO: Support memory alignment and padding in stride calculations - + int32_t rowAlign) { // Calculate strides based on the given tensor shape. Strides are byte-wise. std::array strides; strides[shape.layout().rank() - 1] = dtype.size(); for (int i = shape.layout().rank() - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * shape[i + 1]; + // Ensure strides for the row are padded to the next multiple of the alignment. + if (i == shape.layout().height_index()) { + strides[i] = detail::AlignUp(strides[i + 1] * shape[i + 1], rowAlign); + } else { + strides[i] = strides[i + 1] * shape[i + 1]; + } } return strides; } @@ -218,7 +252,7 @@ Tensor TensorWrapData(const TensorData& tensor_data) { } Tensor::Requirements reqs = Tensor::CalcRequirements(tensorDataStrided->shape(), tensorDataStrided->dtype(), - strides, tensorDataStrided->device()); + strides, 0, tensorDataStrided->device()); auto data = std::make_shared(tensorDataStrided->basePtr(), tensorDataStrided->device(), eOwnership::OWNING); From c232d1fcae5f9b5fe7b1bd8b2835884e95c64d89 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 27 Nov 2025 14:45:44 -0500 Subject: [PATCH 07/36] Fix alignment calculations --- include/core/utils.hpp | 21 ++++++--------------- src/core/tensor.cpp | 4 ++++ tests/roccv/cpp/test_tensor.cpp | 28 +++++++++++++++++++++------- 3 files changed, 31 insertions(+), 22 deletions(-) diff --git a/include/core/utils.hpp b/include/core/utils.hpp index 480d4e29..4449ce5b 100644 --- a/include/core/utils.hpp +++ b/include/core/utils.hpp @@ -24,28 +24,19 
@@ THE SOFTWARE. #include +#include #include namespace roccv::detail { -/** - * @brief Returns the next power of two greater than or equal to the given value. - * - * @tparam T Integral type of the input and output. - * @param value The value to find the next power of two for. - * @return The next power of two greater than or equal to the given value. - */ -template >> -inline constexpr T NextPowerOfTwo(T value) noexcept { + +inline constexpr size_t NextPowerOfTwo(size_t value) noexcept { if (value <= 1) return 1; #if defined(__GNUC__) || defined(__clang__) - // For unsigned types, use clz built-in. For signed, cast to unsigned. - using U = std::make_unsigned_t; - constexpr int numBits = sizeof(T) * 8; - return static_cast(U(1) << (numBits - __builtin_clz(static_cast(value - 1)))); + constexpr int numBits = sizeof(size_t) * 8; + return 1UL << (numBits - __builtin_clzl(value - 1)); #else - // Portable fallback: fill lower bits, then add one. value--; - for (size_t i = 1; i < sizeof(T) * 8; i <<= 1) value |= (value >> i); + for (size_t i = 1; i < sizeof(size_t) * 8; i <<= 1) value |= (value >> i); return value + 1; #endif } diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index 97907856..e26590bb 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -152,6 +152,8 @@ Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const Da int rowAlign; if (bufAlign.rowAddr() == 0) { HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&rowAlign, hipDeviceAttributeTexturePitchAlignment, dev)); + printf("rowAlign: %d\n", rowAlign); + printf("NextPowerOfTwo(dtype.size()): %lu\n", detail::NextPowerOfTwo(dtype.size())); rowAlign = std::lcm(rowAlign, detail::NextPowerOfTwo(dtype.size())); } else { if (!detail::IsPowerOfTwo(bufAlign.rowAddr())) { @@ -159,6 +161,7 @@ Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const Da } rowAlign = std::lcm(bufAlign.rowAddr(), detail::NextPowerOfTwo(dtype.size())); } + printf("rowAlign: %d\n", 
rowAlign); int baseAlign; if (bufAlign.baseAddr() == 0) { @@ -170,6 +173,7 @@ Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const Da } baseAlign = std::lcm(bufAlign.baseAddr(), detail::NextPowerOfTwo(dtype.size())); } + printf("baseAlign: %d\n", baseAlign); std::array strides = CalcStrides(shape, dtype, rowAlign); Tensor::Requirements reqs = CalcRequirements(shape, dtype, strides, baseAlign, device); diff --git a/tests/roccv/cpp/test_tensor.cpp b/tests/roccv/cpp/test_tensor.cpp index c0442df6..030579cb 100644 --- a/tests/roccv/cpp/test_tensor.cpp +++ b/tests/roccv/cpp/test_tensor.cpp @@ -20,6 +20,7 @@ */ #include +#include #include "test_helpers.hpp" @@ -33,16 +34,22 @@ namespace { * * @param shape The tensor's shape. * @param dtype The datatype of the tensor. + * @param rowAlign The row alignment to use. Setting to 0 will ensure contiguous memory usage. * @return A list of strides for each dimension of the given shape. */ -std::vector CalculateStrides(const TensorShape& shape, const DataType& dtype) { +std::vector CalculateStrides(const TensorShape& shape, const DataType& dtype, int32_t rowAlign) { std::vector strides(shape.layout().rank()); // Strides are calculated byte-wise. Therefore, the highest dimension will refer to the stride between singular // elements (which, in turn, is the number of bytes per said element). strides[shape.layout().rank() - 1] = dtype.size(); for (int i = shape.layout().rank() - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * shape[i + 1]; + // Use the row alignment for the height dimension. + if (i == shape.layout().height_index()) { + strides[i] = detail::AlignUp(strides[i + 1] * shape[i + 1], rowAlign); + } else { + strides[i] = strides[i + 1] * shape[i + 1]; + } } return strides; } @@ -61,17 +68,22 @@ void TestNegativeTensorShape() { } /** - * @brief Negative tests related to the Tensor object. + * @brief Negative tests for the Tensor class, verifying error handling in invalid scenarios. 
* + * These tests confirm that the Tensor class appropriately throws exceptions when: + * 1. Attempting to reshape a tensor to a shape with a different number of elements. + * 2. Attempting to reshape a tensor to a shape and datatype combination that would change the total number of bytes + * in the underlying storage. + * + * In both cases, the expected behavior is to throw an exception of type eStatusType::INVALID_VALUE. */ void TestNegativeTensor() { { - // Should not be able to reshape tensor into another view with a differing number of elements + // Case 1: Reshaping to a different number of elements is invalid Tensor tensor(TensorShape({1, 2, 3}, "HWC"), DataType(DATA_TYPE_U8)); EXPECT_EXCEPTION(tensor.reshape(TensorShape({1, 1, 2, 2}, "NHWC")), eStatusType::INVALID_VALUE); - // Should not be able to reshape tensor into another view which would result in a different number of bytes in - // the underlying memory. + // Case 2: Reshaping to a different total byte size is invalid, even if element count matches EXPECT_EXCEPTION(tensor.reshape(TensorShape({1, 1, 2, 3}, "NHWC"), DataType(DATA_TYPE_S16)), eStatusType::INVALID_VALUE); } @@ -131,7 +143,9 @@ void TestTensorCorrectness() { */ void TestTensorStrideCalculation(const TensorShape& shape, const DataType& dtype) { Tensor tensor(shape, dtype); - std::vector expectedStrides = CalculateStrides(shape, dtype); + + // TODO: Use row alignment from device attributes instead of a hardcoded value. 
+ std::vector expectedStrides = CalculateStrides(shape, dtype, 256); std::vector actualStrides(tensor.rank()); auto data = tensor.exportData(); From c90475b400c847ceff713a73a7dc48143ee7374f Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 3 Dec 2025 11:45:58 -0500 Subject: [PATCH 08/36] Take padding into account during copies in test helpers --- include/core/tensor.hpp | 2 +- src/core/tensor.cpp | 8 +- tests/roccv/cpp/common/test_helpers.hpp | 124 +++++++++++------------- 3 files changed, 60 insertions(+), 74 deletions(-) diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index 14348f63..ce1271b5 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -254,7 +254,7 @@ class Tensor { * @return Tensor requirements. */ static Requirements CalcRequirements(const TensorShape &shape, const DataType &dtype, - std::array strides, int32_t baseAlign, + const std::array strides, int32_t baseAlign, eDeviceType device = eDeviceType::GPU); /** diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index e26590bb..282eeebd 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -152,8 +152,6 @@ Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const Da int rowAlign; if (bufAlign.rowAddr() == 0) { HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&rowAlign, hipDeviceAttributeTexturePitchAlignment, dev)); - printf("rowAlign: %d\n", rowAlign); - printf("NextPowerOfTwo(dtype.size()): %lu\n", detail::NextPowerOfTwo(dtype.size())); rowAlign = std::lcm(rowAlign, detail::NextPowerOfTwo(dtype.size())); } else { if (!detail::IsPowerOfTwo(bufAlign.rowAddr())) { @@ -161,7 +159,6 @@ Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const Da } rowAlign = std::lcm(bufAlign.rowAddr(), detail::NextPowerOfTwo(dtype.size())); } - printf("rowAlign: %d\n", rowAlign); int baseAlign; if (bufAlign.baseAddr() == 0) { @@ -173,7 +170,6 @@ Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const Da } baseAlign = 
std::lcm(bufAlign.baseAddr(), detail::NextPowerOfTwo(dtype.size())); } - printf("baseAlign: %d\n", baseAlign); std::array strides = CalcStrides(shape, dtype, rowAlign); Tensor::Requirements reqs = CalcRequirements(shape, dtype, strides, baseAlign, device); @@ -181,8 +177,8 @@ Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const Da } Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, - std::array strides, int32_t baseAlign, - eDeviceType device) { + const std::array strides, + int32_t baseAlign, eDeviceType device) { Tensor::Requirements reqs; reqs.shape = shape.shape(); diff --git a/tests/roccv/cpp/common/test_helpers.hpp b/tests/roccv/cpp/common/test_helpers.hpp index 6c1c47de..36a6cb73 100644 --- a/tests/roccv/cpp/common/test_helpers.hpp +++ b/tests/roccv/cpp/common/test_helpers.hpp @@ -267,24 +267,6 @@ eTestStatusType compareImage(const Tensor& tensor, const std::string& filename, void writeTensor(const Tensor& tensor, const std::string& output_file); -template -void copyData(const Tensor& input, const std::span& data, eDeviceType device) { - auto tensor_data = input.exportData(); - - switch (device) { - case eDeviceType::GPU: { - HIP_VALIDATE_NO_ERRORS( - hipMemcpy(tensor_data.basePtr(), data.data(), data.size() * sizeof(T), hipMemcpyHostToDevice)); - break; - } - - case eDeviceType::CPU: { - memcpy(tensor_data.basePtr(), data.data(), data.size() * sizeof(T)); - break; - } - } -} - /** * @brief Fills a vector with random values based on a provided seed. * @@ -400,75 +382,83 @@ void CompareVectorsNear(const std::vector& result, const std::vector& ref, } } } +/** + * @brief Computes copy parameters for any tensor layout. + * @param[in] tensor The tensor to compute copy parameters for. 
+ * @return tuple of (row_width_bytes, num_rows, tensor_pitch) + * If no padding, returns (total_size, 1, total_size) + */ +inline std::tuple ComputeCopyParams(const Tensor& tensor) { + auto tensorData = tensor.exportData(); + const auto& layout = tensor.layout(); + int heightIdx = layout.height_index(); + + if (heightIdx < 0) { + // No height dimension = contiguous data, no padding + size_t totalSize = tensor.shape().size() * tensor.dtype().size(); + return {totalSize, 1, totalSize}; + } + + // Row width = product of all dimensions AFTER height_index × dtype size + size_t rowWidth = tensor.dtype().size(); + for (int i = heightIdx + 1; i < layout.rank(); ++i) { + rowWidth *= tensor.shape(i); + } + + // Number of rows = product of all dimensions UP TO AND INCLUDING height_index + size_t numRows = 1; + for (int i = 0; i <= heightIdx; ++i) { + numRows *= tensor.shape(i); + } + + // Tensor pitch comes from the stride at height_index + size_t tensorPitch = tensorData.stride(heightIdx); + + return {rowWidth, numRows, tensorPitch}; +} /** - * @brief Copies vector data into a roccv::Tensor. This will copy vector data into either GPU memory or CPU memory, - * depending on the device specified in the roccv::Tensor's metadata. + * @brief Copies vector data into a tensor. Works with any tensor layout. * - * @tparam T The base datatype of the underlying data. - * @param dst The destination roccv::Tensor to copy data into. - * @param src A source vector containing data. - * @throws std::runtime_error if the size of the dst and src do not match. + * @tparam T The data type of the elements to copy. + * @param[out] dst The tensor to copy vector data into. + * @param[in] src The vector to copy data from. 
*/ template -void CopyVectorIntoTensor(const Tensor& dst, std::vector& src) { +void CopyVectorIntoTensor(const Tensor& dst, const std::vector& src) { auto tensorData = dst.exportData(); - size_t dataSize = dst.shape().size() * dst.dtype().size(); + auto [rowWidth, numRows, dstPitch] = ComputeCopyParams(dst); - // Ensure source and destination have the same amount of memory allocated. - if (dataSize != src.size() * sizeof(T)) { - throw std::runtime_error( - "Cannot copy source vector into destination tensor. Size of src vector and destination tensor do not " - "match."); - } + // Source is always contiguous + size_t srcPitch = rowWidth; - switch (dst.device()) { - case eDeviceType::GPU: { - HIP_VALIDATE_NO_ERRORS( - hipMemcpy(tensorData.basePtr(), src.data(), dataSize, hipMemcpyKind::hipMemcpyHostToDevice)); - break; - } + hipMemcpyKind kind = (dst.device() == eDeviceType::GPU) ? hipMemcpyHostToDevice : hipMemcpyHostToHost; - case eDeviceType::CPU: { - HIP_VALIDATE_NO_ERRORS( - hipMemcpy(tensorData.basePtr(), src.data(), dataSize, hipMemcpyKind::hipMemcpyHostToHost)); - break; - } - } + HIP_VALIDATE_NO_ERRORS(hipMemcpy2D(tensorData.basePtr(), dstPitch, src.data(), srcPitch, rowWidth, numRows, kind)); } /** - * @brief Copies roccv::Tensor data into a destination vector. + * @brief Copies tensor data into a vector. Works with any tensor layout. * - * @tparam T The base datatype of the underlying tensor data. - * @param dst The destination vector which the data will be copied into. - * @param src The roccv::Tensor containing the source data. - * @throws std::runtime_error if the size of src and dst do not match. + * The destination vector @p dst is assumed to be compact/contiguous in memory; data will be copied into it as a dense + * contiguous array. + * + * @tparam T The data type of the elements to copy. + * @param[out] dst The vector to copy tensor data into. Must be preallocated to the correct size and will be filled + * contiguously. 
+ * @param[in] src The tensor to copy data from. */ template void CopyTensorIntoVector(std::vector& dst, const Tensor& src) { - size_t size = src.shape().size() * src.dtype().size(); auto tensorData = src.exportData(); + auto [rowWidth, numRows, srcPitch] = ComputeCopyParams(src); - if (size != dst.size() * sizeof(T)) { - throw std::runtime_error( - "Cannot copy source tensor data into destination vector. Size of destination vector and source tensor do " - "not match."); - } + // Destination is always contiguous + size_t dstPitch = rowWidth; - switch (src.device()) { - case eDeviceType::GPU: { - HIP_VALIDATE_NO_ERRORS( - hipMemcpy(dst.data(), tensorData.basePtr(), size, hipMemcpyKind::hipMemcpyDeviceToHost)); - break; - } + hipMemcpyKind kind = (src.device() == eDeviceType::GPU) ? hipMemcpyDeviceToHost : hipMemcpyHostToHost; - case eDeviceType::CPU: { - HIP_VALIDATE_NO_ERRORS( - hipMemcpy(dst.data(), tensorData.basePtr(), size, hipMemcpyKind::hipMemcpyHostToHost)); - break; - } - } + HIP_VALIDATE_NO_ERRORS(hipMemcpy2D(dst.data(), dstPitch, tensorData.basePtr(), srcPitch, rowWidth, numRows, kind)); } } // namespace tests From 18d0890f1e96517ef951d240f96654010af1edfe Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 3 Dec 2025 11:58:06 -0500 Subject: [PATCH 09/36] Add isContiguous member function for roccv::Tensor --- include/core/tensor.hpp | 7 +++++++ src/core/tensor.cpp | 21 ++++++++++++--------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index ce1271b5..6ca8c5cf 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -218,6 +218,13 @@ class Tensor { */ size_t dataSize() const; + /** + * @brief Returns true if the tensor is contiguous in memory, meaning there is no padding present in the tensor. + * + * @return True if the tensor is contiguous in memory, false otherwise. 
+ */ + bool isContiguous() const; + /** * @brief Calculates tensor requirements using the default memory alignment strategy. * diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index 282eeebd..bfd5835a 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -101,6 +101,11 @@ TensorData Tensor::exportData() const { } Tensor Tensor::reshape(const TensorShape& new_shape) const { + if (!isContiguous()) { + throw Exception("Tensor is not contiguous. Reshape can only be performed on contiguous tensors.", + eStatusType::INVALID_VALUE); + } + // New tensor shape must have the same number of elements if (new_shape.size() != this->shape().size()) { throw Exception("New tensor shape does not match the number of elements of the old shape.", @@ -112,6 +117,11 @@ Tensor Tensor::reshape(const TensorShape& new_shape) const { } Tensor Tensor::reshape(const TensorShape& new_shape, const DataType& new_dtype) const { + if (!isContiguous()) { + throw Exception("Tensor is not contiguous. Reshape can only be performed on contiguous tensors.", + eStatusType::INVALID_VALUE); + } + if (new_shape.size() * new_dtype.size() != this->shape().size() * this->dtype().size()) { throw Exception("New tensor view must have the same underlying number of bytes.", eStatusType::INVALID_VALUE); } @@ -126,16 +136,9 @@ Tensor& Tensor::operator=(const Tensor& other) { return *this; } -size_t Tensor::dataSize() const { - switch (m_requirements.device) { - case eDeviceType::GPU: - return m_requirements.res.deviceMem.bytes; - case eDeviceType::CPU: - return m_requirements.res.hostMem.bytes; - } +size_t Tensor::dataSize() const { return m_requirements.strides[0] * m_requirements.shape[0]; } - return 0; -} +bool Tensor::isContiguous() const { return dataSize() == shape().size() * dtype().size(); } Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, const eDeviceType device) { From e97207ad8e3e4bf890a2f1411bed17e6c3c82d95 Mon Sep 17 00:00:00 2001 From: Zach 
Vincze Date: Wed, 3 Dec 2025 12:11:56 -0500 Subject: [PATCH 10/36] Remove non-contiguous reshaping from tests --- tests/roccv/cpp/test_tensor.cpp | 38 ++++++++++----------------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/tests/roccv/cpp/test_tensor.cpp b/tests/roccv/cpp/test_tensor.cpp index 030579cb..ae9a1016 100644 --- a/tests/roccv/cpp/test_tensor.cpp +++ b/tests/roccv/cpp/test_tensor.cpp @@ -71,21 +71,15 @@ void TestNegativeTensorShape() { * @brief Negative tests for the Tensor class, verifying error handling in invalid scenarios. * * These tests confirm that the Tensor class appropriately throws exceptions when: - * 1. Attempting to reshape a tensor to a shape with a different number of elements. - * 2. Attempting to reshape a tensor to a shape and datatype combination that would change the total number of bytes - * in the underlying storage. + * 1. Attempting to reshape a non-contiguous tensor. * * In both cases, the expected behavior is to throw an exception of type eStatusType::INVALID_VALUE. 
*/ void TestNegativeTensor() { + // Reshaping a non-contiguous tensor is invalid { - // Case 1: Reshaping to a different number of elements is invalid Tensor tensor(TensorShape({1, 2, 3}, "HWC"), DataType(DATA_TYPE_U8)); - EXPECT_EXCEPTION(tensor.reshape(TensorShape({1, 1, 2, 2}, "NHWC")), eStatusType::INVALID_VALUE); - - // Case 2: Reshaping to a different total byte size is invalid, even if element count matches - EXPECT_EXCEPTION(tensor.reshape(TensorShape({1, 1, 2, 3}, "NHWC"), DataType(DATA_TYPE_S16)), - eStatusType::INVALID_VALUE); + EXPECT_EXCEPTION(tensor.reshape(TensorShape({1, 1, 2, 3}, "NHWC")), eStatusType::INVALID_VALUE); } } @@ -108,21 +102,6 @@ void TestTensorCorrectness() { EXPECT_EQ(tensor.dtype().size(), 1); } - // Tensor reshape: Change layout - { - // Reshape tensor from NHWC -> HWC layout - Tensor tensor(1, {720, 480}, FMT_RGB8); - Tensor reshapedTensor = tensor.reshape(TensorShape({720, 480, 3}, "HWC")); - EXPECT_EQ(reshapedTensor.rank(), 3); - EXPECT_NE(reshapedTensor.rank(), tensor.rank()); - EXPECT_EQ(reshapedTensor.shape().size(), tensor.shape().size()); - - // Ensure they are sharing the same underlying data - auto data = tensor.exportData(); - auto dataReshaped = reshapedTensor.exportData(); - EXPECT_TRUE(data.basePtr() == dataReshaped.basePtr()); - } - // Tensor reshape: Change layout and datatype { Tensor tensor(TensorShape({1, 5, 4}, "NWC"), DataType(DATA_TYPE_S16)); @@ -142,10 +121,15 @@ void TestTensorCorrectness() { * @brief Tests internal stride calculations on Tensor construction. 
*/ void TestTensorStrideCalculation(const TensorShape& shape, const DataType& dtype) { - Tensor tensor(shape, dtype); + Tensor tensor(shape, dtype, eDeviceType::GPU); + + // Get row alignment from device attributes + int dev; + HIP_VALIDATE_NO_ERRORS(hipGetDevice(&dev)); + int rowAlign; + HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&rowAlign, hipDeviceAttributeTexturePitchAlignment, dev)); - // TODO: Use row alignment from device attributes instead of a hardcoded value. - std::vector expectedStrides = CalculateStrides(shape, dtype, 256); + std::vector expectedStrides = CalculateStrides(shape, dtype, rowAlign); std::vector actualStrides(tensor.rank()); auto data = tensor.exportData(); From 6102764a0b98911c5be298945f281c2c49c63d3f Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Fri, 5 Dec 2025 14:27:35 -0500 Subject: [PATCH 11/36] Update utils.hpp to use memcpy2D to handle tensor padding --- samples/common/utils.hpp | 142 ++++++++++++++++++++++++++++++++++- samples/copy_make_border.cpp | 57 +++++++------- 2 files changed, 167 insertions(+), 32 deletions(-) diff --git a/samples/common/utils.hpp b/samples/common/utils.hpp index 61ae525d..36aeb127 100644 --- a/samples/common/utils.hpp +++ b/samples/common/utils.hpp @@ -39,7 +39,7 @@ inline void CheckHIPError(hipError_t code, const char *file, const int line) { CheckHIPError((val), __FILE__, __LINE__); \ } -bool ContainsExtension(const std::filesystem::path &path, const std::vector &extension_list) { +inline bool ContainsExtension(const std::filesystem::path &path, const std::vector &extension_list) { for (auto extension : extension_list) { if (path.extension() == extension) return true; } @@ -47,6 +47,142 @@ bool ContainsExtension(const std::filesystem::path &path, const std::vector(); + params.rowPitch = tensorData.stride(tensor.layout().height_index()); + params.rowBytes = tensor.shape(tensor.layout().width_index()) * tensor.shape(tensor.layout().channels_index()) * + tensor.dtype().size(); + params.imageBytes = 
params.rowBytes * tensor.shape(tensor.layout().height_index()); + params.basePtr = tensorData.basePtr(); + + return params; +} + +/** + * @brief Loads an image, or multiple images if given a directory, into a tensor. Will be in NHWC layout and U8 format. + * All images must be of the same size and format. This is a blocking operation. + * + * @param image_path The path to the image to load. If a directory is provided, all supported images in the directory + * will be loaded. + * @param device The device to load the images onto. Defaults to GPU. + * @return A NHWC tensor containing the loaded images. + */ +inline roccv::Tensor LoadImages(const std::string &image_path, eDeviceType device = eDeviceType::GPU) { + const std::vector supportedExtensions = {".bmp", ".jpg", ".jpeg", ".png"}; + + std::vector images; + + int width = -1; + int height = -1; + int channels = -1; + + // Load images from directory or file if a single image is provided + if (std::filesystem::is_directory(image_path)) { + for (auto file : std::filesystem::directory_iterator(image_path)) { + if (!std::filesystem::is_directory(file.path()) && ContainsExtension(file.path(), supportedExtensions)) { + images.push_back(cv::imread(file.path())); + + // Check if all images are of the same size + if (width == -1 && height == -1 && channels == -1) { + width = images.back().cols; + height = images.back().rows; + channels = images.back().channels(); + } else if (images.back().cols != width || images.back().rows != height || + images.back().channels() != channels) { + throw std::runtime_error("All images must be of the same size and format"); + } + } + } + } else if (std::filesystem::is_regular_file(image_path) && ContainsExtension(image_path, supportedExtensions)) { + images.push_back(cv::imread(image_path)); + width = images.back().cols; + height = images.back().rows; + channels = images.back().channels(); + } else { + throw std::runtime_error("Cannot decode " + image_path + ". 
File type not supported.\n"); + } + + if (images.empty()) { + throw std::runtime_error("No valid images found in directory " + image_path); + } + + // Create tensor and prepare arguments for hipMemcpy2D + roccv::Tensor tensor(images.size(), roccv::Size2D(width, height), + roccv::ImageFormat(eDataType::DATA_TYPE_U8, channels), device); + + MemcpyParams params = GetMemcpyParams(tensor); + + // Copy images into tensor + hipMemcpyKind kind = (device == eDeviceType::GPU) ? hipMemcpyHostToDevice : hipMemcpyHostToHost; + for (int i = 0; i < images.size(); i++) { + CHECK_HIP_ERROR(hipMemcpy2D(static_cast(params.basePtr) + i * params.imageBytes, params.rowPitch, + images[i].data, params.rowBytes, params.rowBytes, height, kind)); + } + + return tensor; +} + +/** + * @brief Writes a batch of images from a tensor to the specified output path. This is a blocking operation. + * + * @param tensor The tensor to write the images from. + * @param output_path The path to write the images to. If a directory is provided, the images will be written to the + * directory. + */ +inline void WriteImages(const roccv::Tensor &tensor, const std::string &output_path) { + if (tensor.layout() != eTensorLayout::TENSOR_LAYOUT_NHWC && tensor.layout() != eTensorLayout::TENSOR_LAYOUT_HWC) { + throw std::runtime_error( + "Unsupported tensor layout in WriteImages(). Only NHWC and HWC layouts are supported."); + } + + int64_t height = tensor.shape(tensor.layout().height_index()); + int64_t width = tensor.shape(tensor.layout().width_index()); + int64_t batchSize = + tensor.layout().batch_index() < 0 ? 1 : tensor.shape(tensor.layout().batch_index()); // Support for HWC layout + int64_t channels = tensor.shape(tensor.layout().channels_index()); + + // Get OpenCV image format + int64_t cvFormat = CV_MAKETYPE(CV_8U, channels); + + // Get memcpy parameters + MemcpyParams params = GetMemcpyParams(tensor); + hipMemcpyKind kind = (tensor.device() == eDeviceType::GPU) ? 
hipMemcpyDeviceToHost : hipMemcpyHostToHost; + + // Copy images from tensor to OpenCV image vector + std::vector images(batchSize); + for (int i = 0; i < batchSize; i++) { + images[i] = cv::Mat(height, width, cvFormat); + CHECK_HIP_ERROR(hipMemcpy2D(images[i].data, params.rowBytes, + static_cast(params.basePtr) + i * params.imageBytes, params.rowPitch, + params.rowBytes, height, kind)); + } + + if (std::filesystem::is_directory(output_path)) { + for (int i = 0; i < batchSize; i++) { + std::ostringstream outFilename; + outFilename << output_path << "/image_" << i << ".bmp"; + cv::imwrite(outFilename.str().c_str(), images[i]); + } + } else { + cv::imwrite(output_path, images[0]); + } +} + /** * @brief Loads images into the GPU memory specified. * @@ -54,7 +190,7 @@ bool ContainsExtension(const std::filesystem::path &path, const std::vector supportedExtensions = {".bmp", ".jpg", ".jpeg", ".png"}; std::vector imageFiles; @@ -99,7 +235,7 @@ void DecodeRGBIImage(const std::string &images_dir, int num_images, void *gpu_in * @param tensor A tensor containing a batch of RGBI images. * @param stream The HIP stream to synchronize with. */ -void WriteRGBITensor(const roccv::Tensor &tensor, hipStream_t stream) { +inline void WriteRGBITensor(const roccv::Tensor &tensor, hipStream_t stream) { CHECK_HIP_ERROR(hipStreamSynchronize(stream)); auto srcData = tensor.exportData(); diff --git a/samples/copy_make_border.cpp b/samples/copy_make_border.cpp index be7be7e5..bf515dc7 100644 --- a/samples/copy_make_border.cpp +++ b/samples/copy_make_border.cpp @@ -27,12 +27,23 @@ THE SOFTWARE. #include #include +#include "common/utils.hpp" + using namespace roccv; /** * @brief Copy make border operation example. + * + * This sample demonstrates the usage of the CopyMakeBorder operator. It loads an image from the specified input path, + * creates a border around the image based on the specified border mode and border value, and writes the output image to + * the specified output path. 
+ * + * Usage: + * ./copy_make_border + * */ int main(int argc, char** argv) { + // Validate command line arguments if (argc != 11) { std::cerr << "Usage: " << argv[0] << " " @@ -40,8 +51,9 @@ int main(int argc, char** argv) { return EXIT_FAILURE; } - HIP_VALIDATE_NO_ERRORS(hipSetDevice(std::stoi(argv[10]))); + CHECK_HIP_ERROR(hipSetDevice(std::stoi(argv[10]))); + // Parse command line arguments int32_t top = std::stoi(argv[3]); int32_t left = std::stoi(argv[4]); float r = std::stof(argv[5]); @@ -50,42 +62,29 @@ int main(int argc, char** argv) { float a = std::stof(argv[8]); eBorderType border_mode = static_cast(std::stoi(argv[9])); - cv::Mat image_data = cv::imread(argv[1]); - - // Create input/output tensors for the image. - TensorShape shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, image_data.rows, image_data.cols, image_data.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); + // Load input image + Tensor input = LoadImages(argv[1]); - TensorShape o_shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, image_data.rows + top * 2, image_data.cols + left * 2, image_data.channels()}); - - Tensor d_in(shape, dtype); - Tensor d_out(o_shape, dtype); + // Create output tensor + int64_t outputHeight = input.shape(input.layout().height_index()) + top * 2; + int64_t outputWidth = input.shape(input.layout().width_index()) + left * 2; + TensorShape outputShape(input.layout(), {input.shape(input.layout().batch_index()), outputHeight, outputWidth, + input.shape(input.layout().channels_index())}); + Tensor output(outputShape, input.dtype()); + // Create stream hipStream_t stream; - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); - - // Move image data to input tensor - size_t image_size = d_in.shape().size() * d_in.dtype().size(); - auto d_input_data = d_in.exportData(); - HIP_VALIDATE_NO_ERRORS( - hipMemcpyAsync(d_input_data.basePtr(), image_data.data, image_size, hipMemcpyHostToDevice, stream)); + CHECK_HIP_ERROR(hipStreamCreate(&stream)); + // 
Create CopyMakeBorder operator CopyMakeBorder op; - op(stream, d_in, d_out, top, left, border_mode, {b, g, r, a}); + op(stream, input, output, top, left, border_mode, {b, g, r, a}); - // Move image data back to device - auto d_out_data = d_out.exportData(); - size_t out_image_size = d_out.shape().size() * d_out.dtype().size(); - std::vector h_output(out_image_size); - HIP_VALIDATE_NO_ERRORS( - hipMemcpyAsync(h_output.data(), d_out_data.basePtr(), out_image_size, hipMemcpyDeviceToHost, stream)); + CHECK_HIP_ERROR(hipStreamSynchronize(stream)); - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); + WriteImages(output, argv[2]); - cv::Mat output_image_data(image_data.rows + top * 2, image_data.cols + left * 2, CV_8UC3, h_output.data()); - cv::imwrite(argv[2], output_image_data); + CHECK_HIP_ERROR(hipStreamSynchronize(stream)); return EXIT_SUCCESS; } \ No newline at end of file From 12ff5e933e5f7ae0c7a1b34f459c01b37a48da73 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Fri, 5 Dec 2025 15:29:02 -0500 Subject: [PATCH 12/36] Fix incorrect imageBytes calculation --- samples/common/utils.hpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/samples/common/utils.hpp b/samples/common/utils.hpp index 36aeb127..b8d42b1d 100644 --- a/samples/common/utils.hpp +++ b/samples/common/utils.hpp @@ -67,7 +67,7 @@ inline MemcpyParams GetMemcpyParams(const roccv::Tensor &tensor) { params.rowPitch = tensorData.stride(tensor.layout().height_index()); params.rowBytes = tensor.shape(tensor.layout().width_index()) * tensor.shape(tensor.layout().channels_index()) * tensor.dtype().size(); - params.imageBytes = params.rowBytes * tensor.shape(tensor.layout().height_index()); + params.imageBytes = params.rowPitch * tensor.shape(tensor.layout().height_index()); params.basePtr = tensorData.basePtr(); return params; @@ -133,6 +133,7 @@ inline roccv::Tensor LoadImages(const std::string &image_path, eDeviceType devic 
CHECK_HIP_ERROR(hipMemcpy2D(static_cast(params.basePtr) + i * params.imageBytes, params.rowPitch, images[i].data, params.rowBytes, params.rowBytes, height, kind)); } + CHECK_HIP_ERROR(hipDeviceSynchronize()); return tensor; } @@ -171,12 +172,14 @@ inline void WriteImages(const roccv::Tensor &tensor, const std::string &output_p static_cast(params.basePtr) + i * params.imageBytes, params.rowPitch, params.rowBytes, height, kind)); } + CHECK_HIP_ERROR(hipDeviceSynchronize()); - if (std::filesystem::is_directory(output_path)) { + std::filesystem::path outputPath(output_path); + if (outputPath.extension().empty()) { for (int i = 0; i < batchSize; i++) { - std::ostringstream outFilename; - outFilename << output_path << "/image_" << i << ".bmp"; - cv::imwrite(outFilename.str().c_str(), images[i]); + std::filesystem::create_directories(outputPath); + std::filesystem::path outFilename = outputPath / std::format("image_{}.bmp", i); + cv::imwrite(outFilename.string(), images[i]); } } else { cv::imwrite(output_path, images[0]); From e562ebc9f30862d958326a91c4890c07f0391baf Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Fri, 5 Dec 2025 15:36:13 -0500 Subject: [PATCH 13/36] Use HIP streams where possible --- samples/common/utils.hpp | 23 +++++++++++++---------- samples/copy_make_border.cpp | 18 +++++++++--------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/samples/common/utils.hpp b/samples/common/utils.hpp index b8d42b1d..531bd470 100644 --- a/samples/common/utils.hpp +++ b/samples/common/utils.hpp @@ -75,14 +75,15 @@ inline MemcpyParams GetMemcpyParams(const roccv::Tensor &tensor) { /** * @brief Loads an image, or multiple images if given a directory, into a tensor. Will be in NHWC layout and U8 format. - * All images must be of the same size and format. This is a blocking operation. + * All images must be of the same size and format. This is a non-blocking operation. * * @param image_path The path to the image to load. 
If a directory is provided, all supported images in the directory * will be loaded. * @param device The device to load the images onto. Defaults to GPU. * @return A NHWC tensor containing the loaded images. */ -inline roccv::Tensor LoadImages(const std::string &image_path, eDeviceType device = eDeviceType::GPU) { +inline roccv::Tensor LoadImages(hipStream_t stream, const std::string &image_path, + eDeviceType device = eDeviceType::GPU) { const std::vector supportedExtensions = {".bmp", ".jpg", ".jpeg", ".png"}; std::vector images; @@ -130,10 +131,10 @@ inline roccv::Tensor LoadImages(const std::string &image_path, eDeviceType devic // Copy images into tensor hipMemcpyKind kind = (device == eDeviceType::GPU) ? hipMemcpyHostToDevice : hipMemcpyHostToHost; for (int i = 0; i < images.size(); i++) { - CHECK_HIP_ERROR(hipMemcpy2D(static_cast(params.basePtr) + i * params.imageBytes, params.rowPitch, - images[i].data, params.rowBytes, params.rowBytes, height, kind)); + CHECK_HIP_ERROR(hipMemcpy2DAsync(static_cast(params.basePtr) + i * params.imageBytes, + params.rowPitch, images[i].data, params.rowBytes, params.rowBytes, height, + kind, stream)); } - CHECK_HIP_ERROR(hipDeviceSynchronize()); return tensor; } @@ -145,7 +146,7 @@ inline roccv::Tensor LoadImages(const std::string &image_path, eDeviceType devic * @param output_path The path to write the images to. If a directory is provided, the images will be written to the * directory. */ -inline void WriteImages(const roccv::Tensor &tensor, const std::string &output_path) { +inline void WriteImages(hipStream_t stream, const roccv::Tensor &tensor, const std::string &output_path) { if (tensor.layout() != eTensorLayout::TENSOR_LAYOUT_NHWC && tensor.layout() != eTensorLayout::TENSOR_LAYOUT_HWC) { throw std::runtime_error( "Unsupported tensor layout in WriteImages(). 
Only NHWC and HWC layouts are supported."); @@ -168,11 +169,13 @@ inline void WriteImages(const roccv::Tensor &tensor, const std::string &output_p std::vector images(batchSize); for (int i = 0; i < batchSize; i++) { images[i] = cv::Mat(height, width, cvFormat); - CHECK_HIP_ERROR(hipMemcpy2D(images[i].data, params.rowBytes, - static_cast(params.basePtr) + i * params.imageBytes, params.rowPitch, - params.rowBytes, height, kind)); + CHECK_HIP_ERROR(hipMemcpy2DAsync(images[i].data, params.rowBytes, + static_cast(params.basePtr) + i * params.imageBytes, + params.rowPitch, params.rowBytes, height, kind, stream)); } - CHECK_HIP_ERROR(hipDeviceSynchronize()); + + // Ensure all memory operations are completed before writing images + CHECK_HIP_ERROR(hipStreamSynchronize(stream)); std::filesystem::path outputPath(output_path); if (outputPath.extension().empty()) { diff --git a/samples/copy_make_border.cpp b/samples/copy_make_border.cpp index bf515dc7..ace161d4 100644 --- a/samples/copy_make_border.cpp +++ b/samples/copy_make_border.cpp @@ -62,8 +62,12 @@ int main(int argc, char** argv) { float a = std::stof(argv[8]); eBorderType border_mode = static_cast(std::stoi(argv[9])); + // Create stream + hipStream_t stream; + CHECK_HIP_ERROR(hipStreamCreate(&stream)); + // Load input image - Tensor input = LoadImages(argv[1]); + Tensor input = LoadImages(stream, argv[1]); // Create output tensor int64_t outputHeight = input.shape(input.layout().height_index()) + top * 2; @@ -72,19 +76,15 @@ int main(int argc, char** argv) { input.shape(input.layout().channels_index())}); Tensor output(outputShape, input.dtype()); - // Create stream - hipStream_t stream; - CHECK_HIP_ERROR(hipStreamCreate(&stream)); - // Create CopyMakeBorder operator CopyMakeBorder op; op(stream, input, output, top, left, border_mode, {b, g, r, a}); - CHECK_HIP_ERROR(hipStreamSynchronize(stream)); - - WriteImages(output, argv[2]); + // Synchronize not required, WriteImages will block on the given stream + 
WriteImages(stream, output, argv[2]); - CHECK_HIP_ERROR(hipStreamSynchronize(stream)); + // Destroy stream + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file From cae2e8ef1db930acc8599371b1e878f2bc84bd43 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 8 Dec 2025 10:59:40 -0500 Subject: [PATCH 14/36] Fix CopyMakeBorder and GammaContrast samples --- samples/common/utils.hpp | 5 +- samples/copy_make_border.cpp | 122 +++++++++++++++++++++++++++-------- samples/gamma_contrast.cpp | 121 ++++++++++++++++++---------------- 3 files changed, 165 insertions(+), 83 deletions(-) diff --git a/samples/common/utils.hpp b/samples/common/utils.hpp index 531bd470..f8f0c5a0 100644 --- a/samples/common/utils.hpp +++ b/samples/common/utils.hpp @@ -75,7 +75,7 @@ inline MemcpyParams GetMemcpyParams(const roccv::Tensor &tensor) { /** * @brief Loads an image, or multiple images if given a directory, into a tensor. Will be in NHWC layout and U8 format. - * All images must be of the same size and format. This is a non-blocking operation. + * All images must be of the same size and format. This operation will block on the provided stream. * * @param image_path The path to the image to load. If a directory is provided, all supported images in the directory * will be loaded. @@ -136,6 +136,9 @@ inline roccv::Tensor LoadImages(hipStream_t stream, const std::string &image_pat kind, stream)); } + // Ensure all memory operations are completed before returning the tensor + CHECK_HIP_ERROR(hipStreamSynchronize(stream)); + return tensor; } diff --git a/samples/copy_make_border.cpp b/samples/copy_make_border.cpp index ace161d4..cb695dfe 100644 --- a/samples/copy_make_border.cpp +++ b/samples/copy_make_border.cpp @@ -21,6 +21,7 @@ THE SOFTWARE. */ #include +#include #include #include @@ -31,59 +32,124 @@ THE SOFTWARE. 
using namespace roccv; +struct Config { + std::string inputPath; + std::string outputPath = "output"; + int32_t top = 10; + int32_t left = 10; + float r = 0.0f, g = 0.0f, b = 0.0f, a = 255.0f; + eBorderType borderMode = eBorderType::BORDER_TYPE_CONSTANT; + int deviceId = 0; +}; + +void PrintUsage(const char* programName) { + std::cout << "Usage: " << programName << " -i [options]\n\n" + << "Options:\n" + << " -i, --input Input image or directory (required)\n" + << " -o, --output Output file or directory (default: output/image_.bmp)\n" + << " -t, --top Top/bottom border size (default: 10)\n" + << " -l, --left Left/right border size (default: 10)\n" + << " -c, --color Border color as r,g,b,a (default: 0,0,0,255)\n" + << " -m, --mode Border mode: 0=constant, 1=replicate, 2=reflect (default: constant)\n" + << " -d, --device GPU device ID (default: 0)\n" + << " -h, --help Show this help message\n\n" + << "Example:\n" + << " " << programName << " -i image.jpg -o bordered.png -t 20 -l 15 -c 255,0,0,255\n"; +} + +bool ParseColor(const std::string& colorStr, float& r, float& g, float& b, float& a) { + return sscanf(colorStr.c_str(), "%f,%f,%f,%f", &r, &g, &b, &a) == 4; +} + /** * @brief Copy make border operation example. * - * This sample demonstrates the usage of the CopyMakeBorder operator. It loads an image from the specified input path, - * creates a border around the image based on the specified border mode and border value, and writes the output image to - * the specified output path. - * - * Usage: - * ./copy_make_border + * This sample demonstrates the usage of the CopyMakeBorder operator. It accepts either a single image file or a + * directory of images as the input path, and either a single file or a directory as the output path. If a directory is + * provided for the input, all supported images within the directory will be processed as a batch. Similarly, if the + * output path is a directory, all resulting images will be written to that directory. 
For each image, a border is + * created based on the specified border mode and border value. * + * If is a directory, a batch operation will occur on every image in the directory. + * If is a directory, the output images will be written into that directory. */ int main(int argc, char** argv) { - // Validate command line arguments - if (argc != 11) { - std::cerr << "Usage: " << argv[0] - << " " - << std::endl; - return EXIT_FAILURE; + Config config; + + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"top", required_argument, nullptr, 't'}, + {"left", required_argument, nullptr, 'l'}, + {"color", required_argument, nullptr, 'c'}, + {"mode", required_argument, nullptr, 'm'}, + {"device", required_argument, nullptr, 'd'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + + int opt; + while ((opt = getopt_long(argc, argv, "i:o:t:l:c:m:d:h", longOptions, nullptr)) != -1) { + switch (opt) { + case 'i': + config.inputPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 't': + config.top = std::stoi(optarg); + break; + case 'l': + config.left = std::stoi(optarg); + break; + case 'c': + if (!ParseColor(optarg, config.r, config.g, config.b, config.a)) { + std::cerr << "Invalid color format. 
Use: r,g,b,a (e.g., 255,0,0,255)\n"; + return EXIT_FAILURE; + } + break; + case 'm': + config.borderMode = static_cast(std::stoi(optarg)); + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; + } } - CHECK_HIP_ERROR(hipSetDevice(std::stoi(argv[10]))); + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); + return EXIT_FAILURE; + } - // Parse command line arguments - int32_t top = std::stoi(argv[3]); - int32_t left = std::stoi(argv[4]); - float r = std::stof(argv[5]); - float g = std::stof(argv[6]); - float b = std::stof(argv[7]); - float a = std::stof(argv[8]); - eBorderType border_mode = static_cast(std::stoi(argv[9])); + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); // Create stream hipStream_t stream; CHECK_HIP_ERROR(hipStreamCreate(&stream)); // Load input image - Tensor input = LoadImages(stream, argv[1]); + Tensor input = LoadImages(stream, config.inputPath.c_str()); // Create output tensor - int64_t outputHeight = input.shape(input.layout().height_index()) + top * 2; - int64_t outputWidth = input.shape(input.layout().width_index()) + left * 2; + int64_t outputHeight = input.shape(input.layout().height_index()) + config.top * 2; + int64_t outputWidth = input.shape(input.layout().width_index()) + config.left * 2; TensorShape outputShape(input.layout(), {input.shape(input.layout().batch_index()), outputHeight, outputWidth, input.shape(input.layout().channels_index())}); Tensor output(outputShape, input.dtype()); // Create CopyMakeBorder operator CopyMakeBorder op; - op(stream, input, output, top, left, border_mode, {b, g, r, a}); + op(stream, input, output, config.top, config.left, config.borderMode, {config.b, config.g, config.r, config.a}); - // Synchronize not required, WriteImages will block on the given stream - WriteImages(stream, output, argv[2]); + WriteImages(stream, 
output, config.outputPath); - // Destroy stream CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; diff --git a/samples/gamma_contrast.cpp b/samples/gamma_contrast.cpp index 81fe771f..62a83f28 100644 --- a/samples/gamma_contrast.cpp +++ b/samples/gamma_contrast.cpp @@ -21,78 +21,91 @@ THE SOFTWARE. */ #include +#include + #include #include #include #include +#include "common/utils.hpp" using namespace roccv; +struct Config { + float gamma = 2.2; + std::string inputPath; + std::string outputPath = "output"; + int deviceId = 0; +}; + +void PrintUsage(const char* programName) { + // clang-format off + std::cerr << "Usage: " << programName << " -i [-o ] [-g ] [-d ]" << std::endl; + std::cerr << " -i, --input Input image or directory containing images (required)" << std::endl; + std::cerr << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; + std::cerr << " -g, --gamma Gamma value to apply to the input images (optional, default: 2.2)" << std::endl; + std::cerr << " -d, --device Device ID to use for execution (optional, default: 0)" << std::endl; + // clang-format on +} + /** * @brief Gamma contrast operator sample app. 
*/ int main(int argc, char** argv) { - if (argc != 3) { - std::cerr << "Usage: " << argv[0] - << " " - << std::endl; + Config config; + + static struct option longOptions[] = { + {"input", required_argument, nullptr, 'i'}, {"output", required_argument, nullptr, 'o'}, + {"gamma", required_argument, nullptr, 'g'}, {"device", required_argument, nullptr, 'd'}, + {"help", no_argument, nullptr, 'h'}, {nullptr, 0, nullptr, 0}}; + + int opt; + while ((opt = getopt_long(argc, argv, "i:o:g:d:h", longOptions, nullptr)) != -1) { + switch (opt) { + case 'i': + config.inputPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 'g': + config.gamma = std::stof(optarg); + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; + } + } + + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); return EXIT_FAILURE; } - // Device to use in this sample will be the GPU - eDeviceType device = eDeviceType::GPU; - - // Load input image using the OpenCV library. - // The Mat image_data will store all of the data of the image - // Image width can be gotten with image_data.cols - // Image height can be gotten with image_data.rows - // The amount of channels can be gotten with image_data.channels() - cv::Mat image_data = cv::imread(argv[1]); - - // Batch size is needed to create the input and output tensors - int batchSize = 1; - - // A gamma value to apply to the input image - float gammaValue = 2.2; - - // Create input/output tensors - // Tensor shape - // - Takes layout as input, in this case NHWC (N - batch size, H - image height, W - image width, C - number of channels) - // - Also takes the datatype, in this case U8 or an unsigned integer of 8 bits. 
- TensorShape shape( - TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {batchSize, image_data.rows, image_data.cols, image_data.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); - - Tensor input(shape, dtype, device); - Tensor output(shape, dtype, device); + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); hipStream_t stream; - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); - - // imageSize is needed to know how much data needs to be copied to the GPU - size_t imageSize = image_data.rows * image_data.cols * image_data.channels() * sizeof(uint8_t); - - auto input_data = input.exportData(); - HIP_VALIDATE_NO_ERRORS( - hipMemcpy(static_cast(input_data.basePtr()), image_data.data, imageSize, hipMemcpyHostToDevice)); - - // Apply gamma correction - GammaContrast gamma_contrast; - gamma_contrast(stream, input, output, gammaValue, device); + CHECK_HIP_ERROR(hipStreamCreate(&stream)); + + // Load input images + Tensor input = LoadImages(stream, config.inputPath, eDeviceType::GPU); - // Move output data back to host - auto output_data = output.exportData(); - std::vector h_output(imageSize); - HIP_VALIDATE_NO_ERRORS(hipMemcpy(h_output.data(), output_data.basePtr(), imageSize, hipMemcpyDeviceToHost)); - - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); + // Create output tensor + Tensor output(input.shape(), input.dtype(), eDeviceType::GPU); - // Save the gamma-corrected image - cv::Mat output_image(image_data.rows, image_data.cols, CV_8UC3, h_output.data()); - cv::imwrite(argv[2], output_image); + // Apply gamma correction + GammaContrast gamma_contrast; + gamma_contrast(stream, input, output, config.gamma, eDeviceType::GPU); - std::cout << "Gamma correction applied successfully. 
Output saved to: " << argv[2] << std::endl; - + // Write output images to disk + WriteImages(stream, output, config.outputPath); + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } From 6c4221258b10bfff68149f2f31cad74f169d54d6 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 8 Dec 2025 11:21:45 -0500 Subject: [PATCH 15/36] Fix WarpPerspective sample --- samples/copy_make_border.cpp | 4 +- samples/warp_perspective.cpp | 103 ++++++++++++++++++++++++----------- 2 files changed, 74 insertions(+), 33 deletions(-) diff --git a/samples/copy_make_border.cpp b/samples/copy_make_border.cpp index cb695dfe..0dbb48f0 100644 --- a/samples/copy_make_border.cpp +++ b/samples/copy_make_border.cpp @@ -43,6 +43,7 @@ struct Config { }; void PrintUsage(const char* programName) { + // clang-format off std::cout << "Usage: " << programName << " -i [options]\n\n" << "Options:\n" << " -i, --input Input image or directory (required)\n" @@ -50,11 +51,12 @@ void PrintUsage(const char* programName) { << " -t, --top Top/bottom border size (default: 10)\n" << " -l, --left Left/right border size (default: 10)\n" << " -c, --color Border color as r,g,b,a (default: 0,0,0,255)\n" - << " -m, --mode Border mode: 0=constant, 1=replicate, 2=reflect (default: constant)\n" + << " -m, --mode Border mode: 0=CONSTANT, 1=REPLICATE, 2=REFLECT, 3=REFLECT101, 4=WRAP (default: CONSTANT)\n" << " -d, --device GPU device ID (default: 0)\n" << " -h, --help Show this help message\n\n" << "Example:\n" << " " << programName << " -i image.jpg -o bordered.png -t 20 -l 15 -c 255,0,0,255\n"; + // clang-format on } bool ParseColor(const std::string& colorStr, float& r, float& g, float& b, float& a) { diff --git a/samples/warp_perspective.cpp b/samples/warp_perspective.cpp index dc816e11..18d18f3a 100644 --- a/samples/warp_perspective.cpp +++ b/samples/warp_perspective.cpp @@ -21,62 +21,101 @@ THE SOFTWARE. 
*/ #include +#include #include #include #include #include +#include "common/utils.hpp" + using namespace roccv; +struct Config { + std::string inputPath; + std::string outputPath = "output"; + eInterpolationType interpolation = eInterpolationType::INTERP_TYPE_LINEAR; + eBorderType borderMode = eBorderType::BORDER_TYPE_CONSTANT; + int deviceId = 0; +}; + +void PrintUsage(const char* programName) { + // clang-format off + std::cerr << "Usage: " << programName << " -i [-o ] [-p ] [-b ] [-d ]" << std::endl; + std::cerr << " -i, --input Input image or directory containing images (required)" << std::endl; + std::cerr << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; + std::cerr << " -p, --interpolation Interpolation type to use for output images [0: NEAREST, 1: LINEAR, 2: CUBIC] (optional, default: 1 (LINEAR))" << std::endl; + std::cerr << " -b, --border Border type for output images [0: CONSTANT, 1: REPLICATE, 2: REFLECT, 3: REFLECT101, 4: WRAP] (optional, default: 0 (CONSTANT))" << std::endl; + std::cerr << " -d, --device Device ID to use for execution (optional, default: 0)" << std::endl; + // clang-format on +} + /** * @brief Warp perspective operation example. 
*/ int main(int argc, char** argv) { - if (argc != 6) { - std::cerr << "Usage: " << argv[0] << " " - << std::endl; - return EXIT_FAILURE; + Config config; + + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"interpolation", required_argument, nullptr, 'p'}, + {"border", required_argument, nullptr, 'b'}, + {"device", required_argument, nullptr, 'd'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + + // Parse command line arguments + int opt; + while ((opt = getopt_long(argc, argv, "i:o:p:b:d:h", longOptions, nullptr)) != -1) { + switch (opt) { + case 'i': + config.inputPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 'p': + config.interpolation = static_cast(std::stoi(optarg)); + break; + case 'b': + config.borderMode = static_cast(std::stoi(optarg)); + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; + } } - HIP_VALIDATE_NO_ERRORS(hipSetDevice(std::stoi(argv[5]))); - eInterpolationType interp = static_cast(std::stoi(argv[3])); - eBorderType border_mode = static_cast(std::stoi(argv[4])); - cv::Mat image_data = cv::imread(argv[1]); - - // Create input/output tensors for the image. 
- TensorShape shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, image_data.rows, image_data.cols, image_data.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); + return EXIT_FAILURE; + } - Tensor d_in(shape, dtype); - Tensor d_out(shape, dtype); + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); hipStream_t stream; - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); + CHECK_HIP_ERROR(hipStreamCreate(&stream)); - // Move image data to input tensor - size_t image_size = d_in.shape().size() * d_in.dtype().size(); - auto d_input_data = d_in.exportData(); - HIP_VALIDATE_NO_ERRORS( - hipMemcpyAsync(d_input_data.basePtr(), image_data.data, image_size, hipMemcpyHostToDevice, stream)); + // Create input/output tensors for the image. + Tensor input = LoadImages(stream, config.inputPath, eDeviceType::GPU); + Tensor output(input.shape(), input.dtype(), eDeviceType::GPU); PerspectiveTransform transform_matrix = {1, 0, 0, 0, 1, 0, -0.001, 0, 1}; roccv::WarpPerspective op; - op(stream, d_in, d_out, transform_matrix, true, interp, border_mode, make_float4(0, 0, 0, 0)); + op(stream, input, output, transform_matrix, true, config.interpolation, config.borderMode, make_float4(0, 0, 0, 0)); // Move image data back to device - auto d_out_data = d_out.exportData(); - std::vector h_output(image_size); - HIP_VALIDATE_NO_ERRORS( - hipMemcpyAsync(h_output.data(), d_out_data.basePtr(), image_size, hipMemcpyDeviceToHost, stream)); - - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); - - // Write normalized image to disk - cv::Mat output_image_data(image_data.rows, image_data.cols, CV_8UC3, h_output.data()); - cv::imwrite(argv[2], output_image_data); + WriteImages(stream, output, config.outputPath); + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file From f7f383954eb27eff4fc42a4b6c2b2b819b273c66 Mon Sep 17 00:00:00 2001 
From: Zach Vincze Date: Mon, 8 Dec 2025 12:02:44 -0500 Subject: [PATCH 16/36] Fix CustomCrop sample --- samples/custom_crop.cpp | 204 ++++++++++++++++------------------- samples/warp_perspective.cpp | 8 +- 2 files changed, 95 insertions(+), 117 deletions(-) diff --git a/samples/custom_crop.cpp b/samples/custom_crop.cpp index 73bd8561..25e7d83a 100644 --- a/samples/custom_crop.cpp +++ b/samples/custom_crop.cpp @@ -21,144 +21,122 @@ THE SOFTWARE. */ #include +#include + #include #include #include #include +#include "common/utils.hpp" + using namespace roccv; -/** - * @brief Custom crop operation example. - */ +struct Config { + std::string inputPath; + std::string outputPath = "output"; + Box_t cropRect = {0, 0, 1, 1}; + eDeviceType device = eDeviceType::GPU; + int deviceId = 0; +}; + +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "Usage: " << programName << " -i [-o ] [-crop ] [-d ] [-c]" << std::endl; + std::cout << " -i, --input Input image or directory containing images (required)" << std::endl; + std::cout << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; + std::cout << " -c, --crop Crop rectangle as comma separated values (optional, default: 0,0,1,1)" << std::endl; + std::cout << " -d, --device Device ID to use for execution when using GPU (optional, default: 0)" << std::endl; + std::cout << " -C, --cpu Use CPU for execution (optional, default: GPU)" << std::endl; + std::cout << " -h, --help Show this help message" << std::endl; + // clang-format on +} -void ShowHelpAndExit(const char *option = NULL) { - std::cout << "Options: " << option << std::endl - << "-i Input File Path - required" << std::endl - << "-o Output File Path - optional; default: output.bmp" << std::endl - << "-cpu Select CPU instead of GPU to perform operation - optional; default choice is GPU path" << std::endl - << "-d GPU device ID (0 for the first device, 1 for the second, etc.) 
- optional; default: 0" << std::endl - << "-crop Crop rectangle (top_left_corner_x, top_left_corner_y, width, height)- optional; default: use the set value in the app" << std::endl; - exit(0); +bool ParseCropRectangle(const std::string& cropStr, Box_t& cropRect) { + std::istringstream iss(cropStr); + std::string token; + getline(iss, token, ','); + cropRect.x = std::stoi(token); + getline(iss, token, ','); + cropRect.y = std::stoi(token); + getline(iss, token, ','); + cropRect.width = std::stoi(token); + getline(iss, token, ','); + cropRect.height = std::stoi(token); + return true; } +/** + * @brief Custom crop operation example. + */ int main(int argc, char** argv) { - std::string input_file_path; - std::string output_file_path = "output.bmp"; - bool gpuPath = true; // use GPU by default - eDeviceType device = eDeviceType::GPU; - int deviceId = 0; - Box_t cropRect = {0, 0, 1, 1}; - bool cropSet = false; - - if(argc < 3) { - ShowHelpAndExit("-h"); - } - for (int i = 1; i < argc; i++) { - if (!strcmp(argv[i], "-h")) { - ShowHelpAndExit("-h"); - } - if (!strcmp(argv[i], "-i")) { - if (++i == argc) { - ShowHelpAndExit("-i"); - } - input_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-o")) { - if (++i == argc) { - ShowHelpAndExit("-o"); - } - output_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-crop")) { - i++; - if (i + 4 > argc) { - ShowHelpAndExit("-crop"); - } - cropRect.x = atoi(argv[i++]); - cropRect.y = atoi(argv[i++]); - cropRect.width = atoi(argv[i++]); - cropRect.height = atoi(argv[i]); - cropSet = true; - continue; - } - if (!strcmp(argv[i], "-cpu")) { - gpuPath = false; - continue; + Config config; + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"crop", required_argument, nullptr, 'c'}, + {"device", required_argument, nullptr, 'd'}, + {"cpu", no_argument, nullptr, 'C'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + int 
opt; + while ((opt = getopt_long(argc, argv, "i:o:c:d:h:C", longOptions, nullptr)) != -1) { + switch (opt) { + case 'i': + config.inputPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 'c': + if (!ParseCropRectangle(optarg, config.cropRect)) { + std::cerr << "Invalid crop rectangle format. Use: x,y,w,h (e.g., 0,0,1,1)\n"; + return EXIT_FAILURE; + } + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'C': + config.device = eDeviceType::CPU; + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; } } - if (gpuPath) { - device = eDeviceType::GPU; - HIP_VALIDATE_NO_ERRORS(hipSetDevice(deviceId)); - } else { - device = eDeviceType::CPU; + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); + return EXIT_FAILURE; } - cv::Mat imageData = cv::imread(input_file_path); - if (imageData.empty()) { - std::cerr << "Failed to read the input image file" << std::endl; - exit(1); - } - if (!cropSet) { - // Set a safe crop area if no user input - cropRect = {(imageData.cols / 4), (imageData.rows / 4), (imageData.cols / 2), (imageData.rows / 2)}; - } + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); - // Create input/output tensors for the image. 
- TensorShape inputShape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), {1, imageData.rows, imageData.cols, imageData.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); - Tensor input(inputShape, dtype, device); + hipStream_t stream; + CHECK_HIP_ERROR(hipStreamCreate(&stream)); - TensorShape outShape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), {1, cropRect.height, cropRect.width, imageData.channels()}); - Tensor output(outShape, dtype, device); + // Load input images + Tensor input = LoadImages(stream, config.inputPath, config.device); - hipStream_t stream = nullptr; - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); - } + int64_t batchSize = input.shape(input.layout().batch_index()); + int64_t channels = input.shape(input.layout().channels_index()); - // Move image data to input tensor - size_t imageSizeInByte = input.shape().size() * input.dtype().size(); - auto input_data = input.exportData(); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(input_data.basePtr(), imageData.data, imageSizeInByte, hipMemcpyHostToDevice, stream)); - } else { - memcpy(input_data.basePtr(), imageData.data, imageSizeInByte); - } + // Create output tensor + Tensor output(TensorShape(input.layout(), {batchSize, config.cropRect.height, config.cropRect.width, channels}), + input.dtype(), config.device); + // Run custom crop operation CustomCrop op; - op(stream, input, output, cropRect, device); - - // Move image data back to host - size_t outputSize = output.shape().size() * output.dtype().size(); - auto outData = output.exportData(); - std::vector h_output(outputSize); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(h_output.data(), outData.basePtr(), outputSize, hipMemcpyDeviceToHost, stream)); - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); - } else { - memcpy(h_output.data(), outData.basePtr(), outputSize); - } + op(stream, input, output, config.cropRect, config.device); // Write output image to disk - cv::Mat 
output_imageData(cropRect.height, cropRect.width, imageData.type(), h_output.data()); - bool ret = cv::imwrite(output_file_path, output_imageData); - if (!ret) { - std::cerr << "Faild to save output image to the file" << std::endl; - } + WriteImages(stream, output, config.outputPath); - std::cout << "Input image file: " << input_file_path << std::endl; - std::cout << "Output image file: " << output_file_path << std::endl; - if (gpuPath) { - std::cout << "Operation on GPU device " << deviceId << std::endl; - } else { - std::cout << "Operation on CPU" << std::endl; - } - std::cout << "Input image size: width = " << imageData.cols << ", height = " << imageData.rows << std::endl; - std::cout << "Cropping area: top-left corner = (" << cropRect.x << ", " << cropRect.y << "), width = " << cropRect.width << ", height = " << cropRect.height << std::endl; + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file diff --git a/samples/warp_perspective.cpp b/samples/warp_perspective.cpp index 18d18f3a..49039d62 100644 --- a/samples/warp_perspective.cpp +++ b/samples/warp_perspective.cpp @@ -45,7 +45,7 @@ void PrintUsage(const char* programName) { std::cerr << "Usage: " << programName << " -i [-o ] [-p ] [-b ] [-d ]" << std::endl; std::cerr << " -i, --input Input image or directory containing images (required)" << std::endl; std::cerr << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; - std::cerr << " -p, --interpolation Interpolation type to use for output images [0: NEAREST, 1: LINEAR, 2: CUBIC] (optional, default: 1 (LINEAR))" << std::endl; + std::cerr << " -I, --interpolation Interpolation type to use for output images [0: NEAREST, 1: LINEAR, 2: CUBIC] (optional, default: 1 (LINEAR))" << std::endl; std::cerr << " -b, --border Border type for output images [0: CONSTANT, 1: REPLICATE, 2: REFLECT, 3: REFLECT101, 4: WRAP] (optional, default: 0 (CONSTANT))" << std::endl; std::cerr << " 
-d, --device Device ID to use for execution (optional, default: 0)" << std::endl; // clang-format on @@ -59,7 +59,7 @@ int main(int argc, char** argv) { static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, {"output", required_argument, nullptr, 'o'}, - {"interpolation", required_argument, nullptr, 'p'}, + {"interpolation", required_argument, nullptr, 'I'}, {"border", required_argument, nullptr, 'b'}, {"device", required_argument, nullptr, 'd'}, {"help", no_argument, nullptr, 'h'}, @@ -67,7 +67,7 @@ int main(int argc, char** argv) { // Parse command line arguments int opt; - while ((opt = getopt_long(argc, argv, "i:o:p:b:d:h", longOptions, nullptr)) != -1) { + while ((opt = getopt_long(argc, argv, "i:o:I:b:d:h", longOptions, nullptr)) != -1) { switch (opt) { case 'i': config.inputPath = optarg; @@ -75,7 +75,7 @@ int main(int argc, char** argv) { case 'o': config.outputPath = optarg; break; - case 'p': + case 'I': config.interpolation = static_cast(std::stoi(optarg)); break; case 'b': From 1063baa6bdb7261ddf80a3e7d0a8a1b5bbd030ea Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 8 Dec 2025 12:26:56 -0500 Subject: [PATCH 17/36] Fix BilateralFilter sample --- samples/bilateral_filter.cpp | 249 +++++++++++++++-------------------- 1 file changed, 108 insertions(+), 141 deletions(-) diff --git a/samples/bilateral_filter.cpp b/samples/bilateral_filter.cpp index 4b7d3864..04364f5f 100644 --- a/samples/bilateral_filter.cpp +++ b/samples/bilateral_filter.cpp @@ -21,172 +21,139 @@ THE SOFTWARE. 
*/ #include +#include + #include #include -#include #include #include +#include "common/utils.hpp" + using namespace roccv; +struct Config { + std::string inputPath; + std::string outputPath = "output"; + int deviceId = 0; + int diameter = 2; + float sigmaSpace = 2.0f; + float sigmaColor = 10.0f; + eBorderType borderMode = eBorderType::BORDER_TYPE_REPLICATE; + float4 borderColor = {0.0f, 0.0f, 0.0f, 0.0f}; + eDeviceType device = eDeviceType::GPU; +}; + /** * @brief Bilateral filter operation example. */ - void ShowHelpAndExit(const char *option = NULL) { - std::cout << "Options: " << option << std::endl - << "-i Input File Path - required" << std::endl - << "-o Output File Path - optional; default: output.bmp" << std::endl - << "-cpu Select CPU instead of GPU to perform operation - optional; default choice is GPU path" << std::endl - << "-d GPU device ID (0 for the first device, 1 for the second, etc.) - optional; default: 0" << std::endl - << "-diameter Diameter of the filtering area - optional; default: 2" << std::endl - << "-sigma_space Spatial parameter sigma of the Gaussian function - optional; default: 2.0f" << std::endl - << "-sigma_color Range parameter sigma of the Gaussian function - optional; default: 10.0f" << std::endl - << "-border_mode Border mode at image boundary when work pixels are outside of the image (0: constant color; 1: replicate; 2: reflect; 3: wrap) - optional; default: 1 (replicate)" << std::endl - << "-border_color Border color for constant color border mode - optional; default: (0, 0, 0, 0)" << std::endl; - exit(0); +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "Usage: " << programName << " -i [-o ] [-d ] [-D ] [-s ] [-c ] [-b ] [-B ]" << std::endl; + std::cout << " -i, --input Input image or directory containing images (required)" << std::endl; + std::cout << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; + std::cout << " -d, --device Device ID 
to use for execution (optional, default: 0)" << std::endl; + std::cout << " -D, --diameter Diameter of the filtering area (optional, default: 2)" << std::endl; + std::cout << " -s, --sigma_space Spatial parameter sigma of the Gaussian function (optional, default: 2.0f)" << std::endl; + std::cout << " -c, --sigma_color Range parameter sigma of the Gaussian function (optional, default: 10.0f)" << std::endl; + std::cout << " -B, --border_mode Border mode at image boundary when work pixels are outside of the image (optional, default: 1 (replicate))" << std::endl; + std::cout << " -B, --border_color Border color for constant color border mode (optional, default: 0,0,0,0)" << std::endl; + std::cout << " -C, --cpu Use CPU for execution (optional, default: GPU)" << std::endl; + std::cout << " -h, --help Show this help message" << std::endl; + // clang-format on } -int main(int argc, char** argv) { - std::string input_file_path; - std::string output_file_path = "output.bmp"; - bool gpuPath = true; // use GPU by default - eDeviceType device = eDeviceType::GPU; - int deviceId = 0; - int diameter = 2; - float sigmaSpace = 2.0f; - float sigmaColor = 10.0f; - eBorderType borderMode = BORDER_TYPE_REPLICATE; - float4 borderColor = {0.0f, 0.0f, 0.0f, 0.0f}; +bool ParseBorderColor(const std::string& borderColorStr, float4& borderColor) { + return sscanf(borderColorStr.c_str(), "%f,%f,%f,%f", &borderColor.x, &borderColor.y, &borderColor.z, + &borderColor.w) == 4; +} - if(argc < 3) { - ShowHelpAndExit("-h"); - } - for (int i = 1; i < argc; i++) { - if (!strcmp(argv[i], "-h")) { - ShowHelpAndExit("-h"); - } - if (!strcmp(argv[i], "-i")) { - if (++i == argc) { - ShowHelpAndExit("-i"); - } - input_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-o")) { - if (++i == argc) { - ShowHelpAndExit("-o"); - } - output_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-diameter")) { - if (++i == argc) { - ShowHelpAndExit("-diameter"); - } - diameter = std::atoi(argv[i]); - 
continue; - } - if (!strcmp(argv[i], "-sigma_space")) { - if (++i == argc) { - ShowHelpAndExit("-sigma_space"); - } - sigmaSpace = std::atof(argv[i]); - continue; - } - if (!strcmp(argv[i], "-sigma_color")) { - if (++i == argc) { - ShowHelpAndExit("-sigma_color"); - } - sigmaColor = std::atof(argv[i]); - continue; - } - if (!strcmp(argv[i], "-border_mode")) { - if (++i == argc) { - ShowHelpAndExit("-border_mode"); - } - borderMode = static_cast(std::atoi(argv[i])); - continue; - } - if (!strcmp(argv[i], "-border_color")) { - i++; - if (i + 4 > argc) { - ShowHelpAndExit("-border_color"); - } - borderColor.x = static_cast(atoi(argv[i++])); - borderColor.y = static_cast(atoi(argv[i++])); - borderColor.z = static_cast(atoi(argv[i++])); - borderColor.w = static_cast(atoi(argv[i])); - continue; - } - if (!strcmp(argv[i], "-cpu")) { - gpuPath = false; - continue; +int main(int argc, char** argv) { + Config config; + + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"device", required_argument, nullptr, 'd'}, + {"diameter", required_argument, nullptr, 'D'}, + {"sigma_space", required_argument, nullptr, 's'}, + {"sigma_color", required_argument, nullptr, 'c'}, + {"border_mode", required_argument, nullptr, 'b'}, + {"border_color", required_argument, nullptr, 'B'}, + {"cpu", no_argument, nullptr, 'C'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + + int opt; + while ((opt = getopt_long(argc, argv, "i:o:d:D:s:c:b:B:h:C", longOptions, nullptr)) != -1) { + switch (opt) { + case 'i': + config.inputPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'D': + config.diameter = std::stoi(optarg); + break; + case 's': + config.sigmaSpace = std::stof(optarg); + break; + case 'c': + config.sigmaColor = std::stof(optarg); + break; + case 'b': + config.borderMode = 
static_cast(std::stoi(optarg)); + break; + case 'B': + if (!ParseBorderColor(optarg, config.borderColor)) { + std::cerr << "Invalid border color format. Use: r,g,b,a (e.g., 0,0,0,0)\n"; + return EXIT_FAILURE; + } + break; + case 'C': + config.device = eDeviceType::CPU; + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; } } - if (gpuPath) { - device = eDeviceType::GPU; - HIP_VALIDATE_NO_ERRORS(hipSetDevice(deviceId)); - } else { - device = eDeviceType::CPU; - } - hipStream_t stream = nullptr; - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); + return EXIT_FAILURE; } - cv::Mat imageData = cv::imread(input_file_path); - if (imageData.empty()) { - std::cerr << "Failed to read the input image file" << std::endl; - exit(1); + if (config.device == eDeviceType::GPU) { + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); } - // Create input/output tensors for the image. 
- TensorShape imageShape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), {1, imageData.rows, imageData.cols, imageData.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); - Tensor input(imageShape, dtype, device); - Tensor output(imageShape, dtype, device); - - // Move image data to input tensor - size_t imageSizeInByte = input.shape().size() * input.dtype().size(); - auto inputData = input.exportData(); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(inputData.basePtr(), imageData.data, imageSizeInByte, hipMemcpyHostToDevice, stream)); - } else { - memcpy(inputData.basePtr(), imageData.data, imageSizeInByte); - } + // Create stream + hipStream_t stream; + CHECK_HIP_ERROR(hipStreamCreate(&stream)); + + // Load input image + Tensor input = LoadImages(stream, config.inputPath.c_str(), config.device); + // Create output tensor + Tensor output(input.shape(), input.dtype(), config.device); + + // Create BilateralFilter operator BilateralFilter op; - op(stream, input, output, diameter, sigmaColor, sigmaSpace, borderMode, borderColor, device); - - // Move image data back to host - size_t outputSize = output.shape().size() * output.dtype().size(); - auto outData = output.exportData(); - std::vector h_output(outputSize); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(h_output.data(), outData.basePtr(), outputSize, hipMemcpyDeviceToHost, stream)); - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); - } else { - memcpy(h_output.data(), outData.basePtr(), outputSize); - } + op(stream, input, output, config.diameter, config.sigmaColor, config.sigmaSpace, config.borderMode, + config.borderColor, config.device); - // Write output image to disk - cv::Mat outImageData(imageData.rows, imageData.cols, imageData.type(), h_output.data()); - bool ret = cv::imwrite(output_file_path, outImageData); - if (!ret) { - std::cerr << "Faild to save output image to the file" << std::endl; - exit(1); - } + WriteImages(stream, output, config.outputPath); - std::cout << 
"Input image file: " << input_file_path << std::endl; - std::cout << "Output image file: " << output_file_path << std::endl; - if (gpuPath) { - std::cout << "Operation on GPU device " << deviceId << std::endl; - } else { - std::cout << "Operation on CPU" << std::endl; - } - std::cout << "Image size: width = " << imageData.cols << ", height = " << imageData.rows << std::endl; + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file From 612cfb6d04920e41fe3811b3f9cef19f9e6d86ee Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 8 Dec 2025 12:32:14 -0500 Subject: [PATCH 18/36] Fix CenterCrop sample --- samples/center_crop.cpp | 197 ++++++++++++++++++---------------------- 1 file changed, 87 insertions(+), 110 deletions(-) diff --git a/samples/center_crop.cpp b/samples/center_crop.cpp index 123e8f26..7a460773 100644 --- a/samples/center_crop.cpp +++ b/samples/center_crop.cpp @@ -21,143 +21,120 @@ THE SOFTWARE. */ #include +#include + #include #include #include #include +#include "common/utils.hpp" + using namespace roccv; +struct Config { + std::string inputPath; + std::string outputPath = "output"; + eDeviceType device = eDeviceType::GPU; + int deviceId = 0; + Size2D cropArea = {1, 1}; +}; + +bool ParseCropArea(const std::string& cropStr, Size2D& cropArea) { + std::istringstream iss(cropStr); + std::string token; + getline(iss, token, ','); + cropArea.w = std::stoi(token); + getline(iss, token, ','); + cropArea.h = std::stoi(token); + return true; +} + /** * @brief Center crop operation example. */ -void ShowHelpAndExit(const char *option = NULL) { - std::cout << "Options: " << option << std::endl - << "-i Input File Path - required" << std::endl - << "-o Output File Path - optional; default: output.bmp" << std::endl - << "-cpu Select CPU instead of GPU to perform operation - optional; default choice is GPU path" << std::endl - << "-d GPU device ID (0 for the first device, 1 for the second, etc.) 
- optional; default: 0" << std::endl - << "-crop Center crop area (width, height)- optional; default: use the set value in the app" << std::endl; - exit(0); +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "Usage: " << programName << " -i [-o ] [-d ] [-crop ] [-C]" << std::endl; + std::cout << " -i, --input Input image or directory containing images (required)" << std::endl; + std::cout << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; + std::cout << " -d, --device Device ID to use for execution (optional, default: 0)" << std::endl; + std::cout << " -c, --crop Crop area as comma separated values (optional, default: 1,1)" << std::endl; + std::cout << " -C, --cpu Use CPU for execution (optional, default: GPU)" << std::endl; + std::cout << " -h, --help Show this help message" << std::endl; + // clang-format on } - int main(int argc, char** argv) { - std::string input_file_path; - std::string output_file_path = "output.bmp"; - bool gpuPath = true; // use GPU by default - eDeviceType device = eDeviceType::GPU; - int deviceId = 0; - Size2D cropArea = {1, 1}; - bool cropSet = false; - - if(argc < 3) { - ShowHelpAndExit("-h"); - } - for (int i = 1; i < argc; i++) { - if (!strcmp(argv[i], "-h")) { - ShowHelpAndExit("-h"); - } - if (!strcmp(argv[i], "-i")) { - if (++i == argc) { - ShowHelpAndExit("-i"); - } - input_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-o")) { - if (++i == argc) { - ShowHelpAndExit("-o"); - } - output_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-crop")) { - i++; - if (i + 2 > argc) { - ShowHelpAndExit("-crop"); - } - cropArea.w = atoi(argv[i++]); - cropArea.h = atoi(argv[i]); - cropSet = true; - continue; - } - if (!strcmp(argv[i], "-cpu")) { - gpuPath = false; - continue; + Config config; + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"device", 
required_argument, nullptr, 'd'},
+ {"crop", required_argument, nullptr, 'c'},
+ {"cpu", no_argument, nullptr, 'C'},
+ {"help", no_argument, nullptr, 'h'},
+ {nullptr, 0, nullptr, 0}};
+ int opt;
+ while ((opt = getopt_long(argc, argv, "i:o:d:c:hC", longOptions, nullptr)) != -1) {
+ switch (opt) {
+ case 'i':
+ config.inputPath = optarg;
+ break;
+ case 'o':
+ config.outputPath = optarg;
+ break;
+ case 'd':
+ config.deviceId = std::stoi(optarg);
+ break;
+ case 'c':
+ if (!ParseCropArea(optarg, config.cropArea)) {
+ std::cerr << "Invalid crop area format. Use: width,height (e.g., 1,1)\n";
+ return EXIT_FAILURE;
+ }
+ break;
+ case 'C':
+ config.device = eDeviceType::CPU;
+ break;
+ case 'h':
+ PrintUsage(argv[0]);
+ return EXIT_SUCCESS;
+ default:
+ PrintUsage(argv[0]);
+ return EXIT_FAILURE;
+ } }
- if (gpuPath) {
- device = eDeviceType::GPU;
- HIP_VALIDATE_NO_ERRORS(hipSetDevice(deviceId));
- } else {
- device = eDeviceType::CPU;
+ if (config.inputPath.empty()) {
+ std::cerr << "Error: Input path is required.\n\n";
+ PrintUsage(argv[0]);
+ return EXIT_FAILURE; }
- cv::Mat imageData = cv::imread(input_file_path);
- if (imageData.empty()) {
- std::cerr << "Failed to read the input image file" << std::endl;
- exit(1);
- }
- if (!cropSet) {
- // Set a safe crop area if no user input
- cropArea = {(imageData.cols / 2), (imageData.rows / 2)};
+ if (config.device == eDeviceType::GPU) {
+ CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); }
- // Create input/output tensors for the image.
- TensorShape inputShape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), {1, imageData.rows, imageData.cols, imageData.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); - Tensor input(inputShape, dtype, device); + hipStream_t stream; + CHECK_HIP_ERROR(hipStreamCreate(&stream)); - TensorShape outShape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), {1, cropArea.h, cropArea.w, imageData.channels()}); - Tensor output(outShape, dtype, device); + // Load input images + Tensor input = LoadImages(stream, config.inputPath.c_str(), config.device); - hipStream_t stream = nullptr; - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); - } + // Create output shape + TensorShape outputShape(input.layout(), {input.shape(input.layout().batch_index()), config.cropArea.h, + config.cropArea.w, input.shape(input.layout().channels_index())}); - // Move image data to input tensor - size_t imageSizeInByte = input.shape().size() * input.dtype().size(); - auto input_data = input.exportData(); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(input_data.basePtr(), imageData.data, imageSizeInByte, hipMemcpyHostToDevice, stream)); - } else { - memcpy(input_data.basePtr(), imageData.data, imageSizeInByte); - } + // Create output tensor + Tensor output(outputShape, input.dtype(), config.device); + // Create CenterCrop operator CenterCrop op; - op(stream, input, output, cropArea, device); - - // Move image data back to host - size_t outputSize = output.shape().size() * output.dtype().size(); - auto outData = output.exportData(); - std::vector h_output(outputSize); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(h_output.data(), outData.basePtr(), outputSize, hipMemcpyDeviceToHost, stream)); - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); - } else { - memcpy(h_output.data(), outData.basePtr(), outputSize); - } + op(stream, input, output, config.cropArea, config.device); - // Write output image to disk - cv::Mat output_imageData(cropArea.h, 
cropArea.w, imageData.type(), h_output.data()); - bool ret = cv::imwrite(output_file_path, output_imageData); - if (!ret) { - std::cerr << "Faild to save output image to the file" << std::endl; - exit(1); - } + // Write output images to disk + WriteImages(stream, output, config.outputPath); - std::cout << "Input image file: " << input_file_path << std::endl; - std::cout << "Output image file: " << output_file_path << std::endl; - if (gpuPath) { - std::cout << "Operation on GPU device " << deviceId << std::endl; - } else { - std::cout << "Operation on CPU" << std::endl; - } - std::cout << "Input image size: width = " << imageData.cols << ", height = " << imageData.rows << std::endl; - std::cout << "Cropping area: width = " << cropArea.w << ", height = " << cropArea.h << std::endl; + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file From b1ae692f21565b04688b146ab9e165b1b667e27e Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 8 Dec 2025 13:50:02 -0500 Subject: [PATCH 19/36] Fix/cleanup BndBox sample --- samples/bnd_box.cpp | 419 +++++++++++++++++++++++--------------------- 1 file changed, 215 insertions(+), 204 deletions(-) diff --git a/samples/bnd_box.cpp b/samples/bnd_box.cpp index 72ccbf06..621d3ef7 100644 --- a/samples/bnd_box.cpp +++ b/samples/bnd_box.cpp @@ -21,6 +21,7 @@ THE SOFTWARE. */ #include +#include #include #include @@ -28,239 +29,249 @@ THE SOFTWARE. #include #include -using namespace roccv; +#include "common/utils.hpp" -/** - * @brief Bounding Box operation example. 
- */ +using namespace roccv; /** - * @brief Example bounding box list file content - * 1 <-- number of images - * 2 <-- number of boxes for image 1 - * 50 <-- X coordinate of top-left corner of box 1 - * 50 <-- Y coordinate of top-left corner of box 1 - * 100 <-- width of box 1 - * 50 <-- height of box 1 - * 5 <-- thickness of box boundary of box 1 - * 0 <-- B component of box border color of box 1 - * 0 <-- G component of box border color of box 1 - * 255 <-- R component of box border color of box 1 - * 200 <-- alpha component of box border color of box 1 - * 0 <-- B component of box fill color of box 1 - * 255 <-- G component of box fill color of box 1 - * 0 <-- R component of box fill color of box 1 - * 100 <-- alpha component of box fill color of box 1 - * 250 <-- X coordinate of top-left corner of box 2 - * 250 <-- Y coordinate of top-left corner of box 2 - * 50 <-- width of box 2 - * 100 <-- height of box 2 - * 10 <-- thickness of box boundary of box 2 - * 255 <-- B component of box border color of box 2 - * 0 <-- G component of box border color of box 2 - * 0 <-- R component of box border color of box 2 - * 200 <-- alpha component of box border color of box 2 - * 0 <-- B component of box fill color of box 2 - * 0 <-- G component of box fill color of box 2 - * 0 <-- R component of box fill color of box 2 - * 0 <-- alpha component of box fill color of box 2 + * @file bnd_box.cpp + * @brief Example application for drawing bounding boxes on images using ROC CV. + * + * This file demonstrates how to parse a bounding box description file and apply bounding boxes to input images. + * The bounding box file is expected to have the following format: + * - The first line contains the number of images. + * - Each image section starts with a line containing the number of boxes for that image. 
+ * - Each box is described over 13 subsequent lines as follows: + * - X coordinate of top-left corner + * - Y coordinate of top-left corner + * - Width + * - Height + * - Thickness of box boundary + * - B component of box border color + * - G component of box border color + * - R component of box border color + * - Alpha component of box border color + * - B component of box fill color + * - G component of box fill color + * - R component of box fill color + * - Alpha component of box fill color + * + * Example of bounding box list file content: + * @code + * 1 + * 2 + * 50 + * 50 + * 100 + * 50 + * 5 + * 0 + * 0 + * 255 + * 200 + * 0 + * 255 + * 0 + * 100 + * 250 + * 250 + * 50 + * 100 + * 10 + * 255 + * 0 + * 0 + * 200 + * 0 + * 0 + * 0 + * 0 + * @endcode */ -void ShowHelpAndExit(const char* option = NULL) { - std::cout << "Options: " << option << std::endl - << "-i Input File Path - required" << std::endl - << "-o Output File Path - optional; default: output.bmp" << std::endl - << "-cpu Select CPU instead of GPU to perform operation - optional; default choice is GPU path" - << std::endl - << "-d GPU device ID (0 for the first device, 1 for the second, etc.) 
- optional; default: 0" << std::endl - << "-box_file Bounding box list file - optional; default: use the set value in the app" << std::endl; - exit(0); -} - -int main(int argc, char** argv) { - std::string input_file_path; - std::string box_file_path; - std::string output_file_path = "output.bmp"; - bool gpuPath = true; // use GPU by default +struct Config { + std::string inputPath; + std::string outputPath = "output"; eDeviceType device = eDeviceType::GPU; int deviceId = 0; - bool boxSet = false; // User sets the bounding box list data in a text file - - if (argc < 3) { - ShowHelpAndExit("-h"); - } - for (int i = 1; i < argc; i++) { - if (!strcmp(argv[i], "-h")) { - ShowHelpAndExit("-h"); - } - if (!strcmp(argv[i], "-i")) { - if (++i == argc) { - ShowHelpAndExit("-i"); - } - input_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-o")) { - if (++i == argc) { - ShowHelpAndExit("-o"); - } - output_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-box_file")) { - if (++i == argc) { - ShowHelpAndExit("-o"); - } - box_file_path = argv[i]; - boxSet = true; - continue; - } - if (!strcmp(argv[i], "-cpu")) { - gpuPath = false; - continue; - } - } + std::string boundingBoxFilePath = ""; +}; - if (gpuPath) { - device = eDeviceType::GPU; - HIP_VALIDATE_NO_ERRORS(hipSetDevice(deviceId)); - } else { - device = eDeviceType::CPU; - } +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "Usage: " << programName << " -i [-o ] [-b ] [-d ] [-C]" << std::endl; + std::cout << " -i, --input Input image or directory containing images (required)" << std::endl; + std::cout << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; + std::cout << " -b, --box_file Bounding box list file (optional, default: use the set value in the app)" << std::endl; + std::cout << " -d, --device Device ID to use for execution (optional, default: 0)" << std::endl; + std::cout << " -C, --cpu Use CPU for 
execution (optional, default: GPU)" << std::endl; + std::cout << " -h, --help Show this help message" << std::endl; + // clang-format on +} - cv::Mat imageData = cv::imread(input_file_path); - if (imageData.empty()) { - std::cerr << "Failed to read the input image file" << std::endl; - exit(1); +/** + * @brief Parse bounding box file and setup bounding box vector. + * + * @param[in] boundingBoxFilePath Path to bounding box list file. + * @param[out] bbox_vector Vector of bounding box vectors to be filled. + * @return void + * @throws std::runtime_error if failed to open bounding box file. + */ +void ParseBoundingBoxFile(const std::string& boundingBoxFilePath, std::vector>& bbox_vector) { + std::ifstream file(boundingBoxFilePath); + if (!file.is_open()) { + throw std::runtime_error("Failed to open bounding box file " + boundingBoxFilePath); } - int batchSize = 1; - std::vector> bbox_vector; - if (boxSet) { - std::ifstream box_list_file(box_file_path); - if (box_list_file.is_open()) { - std::string line; - std::getline(box_list_file, line); - batchSize = std::stoi(line.c_str()); - if (batchSize > 0) { - bbox_vector.resize(batchSize); - for (int i = 0; i < batchSize; i++) { - std::getline(box_list_file, line); - int numBoxes = std::stoi(line.c_str()); - if (numBoxes > 0) { - for (int b = 0; b < numBoxes; b++) { - BndBox_t box; - std::getline(box_list_file, line); - box.box.x = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.box.y = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.box.width = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.box.height = std::atoi(line.c_str()); + int numImages; + file >> numImages; + bbox_vector.resize(numImages); + for (int i = 0; i < numImages; i++) { + int numBoxes; + file >> numBoxes; - std::getline(box_list_file, line); - box.thickness = std::atoi(line.c_str()); + bbox_vector[i].resize(numBoxes); + for (int j = 0; j < numBoxes; j++) { + // Parse each box from 13 lines, in 
order: + // 1. x (top-left corner) + // 2. y (top-left corner) + // 3. width + // 4. height + // 5. thickness (box boundary) + // 6. border B + // 7. border G + // 8. border R + // 9. border A (alpha) + // 10. fill B + // 11. fill G + // 12. fill R + // 13. fill A (alpha) + // (13 values total per box) - std::getline(box_list_file, line); - box.borderColor.r = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.borderColor.g = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.borderColor.b = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.borderColor.a = std::atoi(line.c_str()); + // Read box dimensions + file >> bbox_vector[i][j].box.x >> bbox_vector[i][j].box.y >> bbox_vector[i][j].box.width >> + bbox_vector[i][j].box.height; + file >> bbox_vector[i][j].thickness; - std::getline(box_list_file, line); - box.fillColor.r = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.fillColor.g = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.fillColor.b = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.fillColor.a = std::atoi(line.c_str()); + // Read colors into temp ints + int r, g, b, a; + file >> b >> g >> r >> a; + bbox_vector[i][j].borderColor = {static_cast(r), static_cast(g), static_cast(b), + static_cast(a)}; - bbox_vector[i].push_back(box); - } - } else { - std::cerr << "Invalid number of boxes: " << numBoxes << "for image: " << i << std::endl; - exit(1); - } - } - } else { - std::cerr << "Invalid batch size: " << batchSize << std::endl; - exit(1); - } - } else { - std::cerr << "Failed to open bounding box list file " << box_file_path << std::endl; - exit(1); + file >> b >> g >> r >> a; + bbox_vector[i][j].fillColor = {static_cast(r), static_cast(g), static_cast(b), + static_cast(a)}; } - } else { - auto width = imageData.cols; - auto height = imageData.rows; - bbox_vector = { - { + } +} + +/** + * @brief Setup bounding box vector from file or default 
values. + * + * @param[in] batchSize Number of images in the batch. + * @param[in] width Width of each image in the batch. + * @param[in] height Height of each image in the batch. + * @param[in] boundingBoxFilePath Path to bounding box list file. + * @return Vector of bounding box vectors. + */ +std::vector> SetupBoundingBoxVector(int64_t batchSize, int64_t width, int64_t height, + const std::string& boundingBoxFilePath) { + std::vector> bbox_vector; + + if (boundingBoxFilePath.empty()) { + for (int64_t b = 0; b < batchSize; b++) { + bbox_vector.push_back({ {{width / 4, height / 4, width / 2, height / 2}, 5, {0, 0, 255, 200}, {0, 255, 0, 100}}, {{width / 3, height / 3, width / 3 * 2, height / 4}, -1, {90, 16, 181, 50}, {0, 0, 0, 0}}, {{-50, (height * 2) / 3, width + 50, height / 3 + 50}, 0, {0, 0, 0, 0}, {111, 159, 232, 150}}, - }, - }; + }); + } + } else { + ParseBoundingBoxFile(boundingBoxFilePath, bbox_vector); } - BndBoxes bboxes(bbox_vector); - // Create input/output tensors for the image. - TensorShape imageShape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, imageData.rows, imageData.cols, imageData.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); - Tensor input(imageShape, dtype, device); - Tensor output(imageShape, dtype, device); + return bbox_vector; +} - hipStream_t stream = nullptr; - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); +/** + * @brief Main function to run the bounding box operation example. + * + * @param[in] argc Number of command line arguments. + * @param[in] argv Command line arguments. + * @return int Exit status. 
+ */
+int main(int argc, char** argv) {
+ Config config;
+ static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'},
+ {"output", required_argument, nullptr, 'o'},
+ {"box_file", required_argument, nullptr, 'b'},
+ {"device", required_argument, nullptr, 'd'},
+ {"cpu", no_argument, nullptr, 'C'},
+ {"help", no_argument, nullptr, 'h'},
+ {nullptr, 0, nullptr, 0}};
+ int opt;
+ while ((opt = getopt_long(argc, argv, "i:o:b:d:hC", longOptions, nullptr)) != -1) {
+ switch (opt) {
+ case 'i':
+ config.inputPath = optarg;
+ break;
+ case 'o':
+ config.outputPath = optarg;
+ break;
+ case 'b':
+ config.boundingBoxFilePath = optarg;
+ break;
+ case 'd':
+ config.deviceId = std::stoi(optarg);
+ break;
+ case 'C':
+ config.device = eDeviceType::CPU;
+ break;
+ case 'h':
+ PrintUsage(argv[0]);
+ return EXIT_SUCCESS;
+ default:
+ PrintUsage(argv[0]);
+ return EXIT_FAILURE;
+ } }
- if (gpuPath) {
- device = eDeviceType::GPU;
- HIP_VALIDATE_NO_ERRORS(hipSetDevice(deviceId));
- } else {
- device = eDeviceType::CPU;
- }
+ if (config.inputPath.empty()) {
+ std::cerr << "Error: Input path is required.\n\n";
+ PrintUsage(argv[0]);
+ return EXIT_FAILURE;
+ }
- cv::Mat imageData = cv::imread(input_file_path);
- if (imageData.empty()) {
- std::cerr << "Failed to read the input image file" << std::endl;
- exit(1);
+ if (config.device == eDeviceType::GPU) {
- 
CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); } - // Write output image to disk - cv::Mat outImageData(imageData.rows, imageData.cols, imageData.type(), h_output.data()); - bool ret = cv::imwrite(output_file_path, outImageData); - if (!ret) { - std::cerr << "Faild to save output image to the file" << std::endl; - exit(1); - } + hipStream_t stream; + CHECK_HIP_ERROR(hipStreamCreate(&stream)); - std::cout << "Input image file: " << input_file_path << std::endl; - std::cout << "Output image file: " << output_file_path << std::endl; - if (gpuPath) { - std::cout << "Operation on GPU device " << deviceId << std::endl; - } else { - std::cout << "Operation on CPU" << std::endl; - } - std::cout << "Image size: width = " << imageData.cols << ", height = " << imageData.rows << std::endl; + // Load images + Tensor input = LoadImages(stream, config.inputPath.c_str(), config.device); + int64_t batchSize = input.shape()[input.shape().layout().batch_index()]; + int64_t height = input.shape()[input.shape().layout().height_index()]; + int64_t width = input.shape()[input.shape().layout().width_index()]; + + // Setup bounding box vector + std::vector> bbox_vector = + SetupBoundingBoxVector(batchSize, width, height, config.boundingBoxFilePath); + + BndBoxes bboxes(bbox_vector); + + // Create output tensor + Tensor output(input.shape(), input.dtype(), config.device); + + // Create BndBox operator + BndBox op; + op(stream, input, output, bboxes, config.device); + + // Write output images to disk + WriteImages(stream, output, config.outputPath); + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file From e85d3a4d35f912311737ae28ff70b39179d40a80 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 8 Dec 2025 14:18:22 -0500 Subject: [PATCH 20/36] Fix/clean Composite sample --- samples/common/utils.hpp | 4 +- samples/composite.cpp | 162 +++++++++++++++++++++++---------------- 2 files changed, 96 insertions(+), 70 deletions(-) diff --git 
a/samples/common/utils.hpp b/samples/common/utils.hpp index f8f0c5a0..b34dedb4 100644 --- a/samples/common/utils.hpp +++ b/samples/common/utils.hpp @@ -83,7 +83,7 @@ inline MemcpyParams GetMemcpyParams(const roccv::Tensor &tensor) { * @return A NHWC tensor containing the loaded images. */ inline roccv::Tensor LoadImages(hipStream_t stream, const std::string &image_path, - eDeviceType device = eDeviceType::GPU) { + eDeviceType device = eDeviceType::GPU, int openCVFlags = cv::IMREAD_UNCHANGED) { const std::vector supportedExtensions = {".bmp", ".jpg", ".jpeg", ".png"}; std::vector images; @@ -96,7 +96,7 @@ inline roccv::Tensor LoadImages(hipStream_t stream, const std::string &image_pat if (std::filesystem::is_directory(image_path)) { for (auto file : std::filesystem::directory_iterator(image_path)) { if (!std::filesystem::is_directory(file.path()) && ContainsExtension(file.path(), supportedExtensions)) { - images.push_back(cv::imread(file.path())); + images.push_back(cv::imread(file.path(), openCVFlags)); // Check if all images are of the same size if (width == -1 && height == -1 && channels == -1) { diff --git a/samples/composite.cpp b/samples/composite.cpp index cbc5507e..135dd9f2 100644 --- a/samples/composite.cpp +++ b/samples/composite.cpp @@ -19,89 +19,115 @@ * THE SOFTWARE. */ +#include + #include #include #include +#include "common/utils.hpp" + +/** @file composite.cpp + * @brief Sample application for the Composite operation. + * + * This sample application demonstrates the Composite operation, which blends the foreground image into the background + * image using a grayscale alpha mask. It loads the background, foreground, and mask images, runs the Composite + * operation, and writes the output images to disk. 
+ */ + using namespace roccv; +struct Config { + std::string backgroundPath; + std::string foregroundPath; + std::string maskPath; + std::string outputPath = "output"; + eDeviceType device = eDeviceType::GPU; + int deviceId = 0; +}; + +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "Usage: " << programName << " -b -f -m -o -d " << std::endl; + std::cout << " -b, --background Background image filename (required)" << std::endl; + std::cout << " -f, --foreground Foreground image filename (required)" << std::endl; + std::cout << " -m, --mask Mask image filename (required)" << std::endl; + std::cout << " -o, --output Output image filename (optional, default: output)" << std::endl; + std::cout << " -d, --device Device ID to use for execution (optional, default: 0)" << std::endl; + std::cout << " -h, --help Show this help message" << std::endl; + std::cout << std::endl; + std::cout << "NOTE: If you use directories for the background, foreground, or mask inputs, each directory must contain the same number of images. Each image in the directories must be the same size." << std::endl; + // clang-format on +} + +/** + * @brief Main function for the Composite operation. + * + * @param argc Number of command line arguments. + * @param argv Command line arguments. + * @return EXIT_SUCCESS if the operation completed successfully, EXIT_FAILURE otherwise. 
+ */ int main(int argc, char** argv) {
- if (argc != 6) {
- std::cerr << "Usage: " << argv[0]
- << " "
- << std::endl;
+ Config config;
+ static struct option longOptions[] = {{"background", required_argument, nullptr, 'b'},
+ {"foreground", required_argument, nullptr, 'f'},
+ {"mask", required_argument, nullptr, 'm'},
+ {"output", required_argument, nullptr, 'o'},
+ {"device", required_argument, nullptr, 'd'},
+ {"help", no_argument, nullptr, 'h'},
+ {nullptr, 0, nullptr, 0}};
+ int opt;
+ while ((opt = getopt_long(argc, argv, "b:f:m:o:d:h", longOptions, nullptr)) != -1) {
+ switch (opt) {
+ case 'b':
+ config.backgroundPath = optarg;
+ break;
+ case 'f':
+ config.foregroundPath = optarg;
+ break;
+ case 'm':
+ config.maskPath = optarg;
+ break;
+ case 'o':
+ config.outputPath = optarg;
+ break;
+ case 'd':
+ config.deviceId = std::stoi(optarg);
+ break;
+ case 'h':
+ PrintUsage(argv[0]);
+ return EXIT_SUCCESS;
+ default:
+ PrintUsage(argv[0]);
+ return EXIT_FAILURE;
+ }
+ }
+ if (config.backgroundPath.empty() || config.foregroundPath.empty() || config.maskPath.empty()) {
+ std::cerr << "Error: Background, foreground, and mask paths are required.\n\n";
+ PrintUsage(argv[0]); return EXIT_FAILURE; }
- hipStream_t stream;
- HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream));
- 
- HIP_VALIDATE_NO_ERRORS(hipSetDevice(std::stoi(argv[5])));
- 
- cv::Mat background_data = cv::imread(argv[1]);
- cv::Mat foreground_data = cv::imread(argv[2]);
- cv::Mat mask_data = cv::imread(argv[3], cv::IMREAD_GRAYSCALE);
- 
- // Create input/output tensors for the image.
- DataType dtype(eDataType::DATA_TYPE_U8); - TensorShape background_shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, background_data.rows, background_data.cols, background_data.channels()}); - Tensor background_tensor(background_shape, dtype); - - TensorShape foreground_shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, foreground_data.rows, foreground_data.cols, foreground_data.channels()}); - Tensor foreground_tensor(foreground_shape, dtype); - - TensorShape mask_shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, mask_data.rows, mask_data.cols, mask_data.channels()}); - Tensor mask_tensor(mask_shape, dtype); - - Tensor output_tensor(background_shape, dtype); - - auto bt_data = background_tensor.exportData(); - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(bt_data.basePtr(), background_data.data, - bt_data.shape().size() * bt_data.dtype().size(), hipMemcpyHostToDevice, - stream)); - - auto ft_data = foreground_tensor.exportData(); - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(ft_data.basePtr(), foreground_data.data, - foreground_tensor.shape().size() * foreground_tensor.dtype().size(), - hipMemcpyHostToDevice, stream)); - - auto m_data = mask_tensor.exportData(); - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(m_data.basePtr(), mask_data.data, - mask_tensor.shape().size() * mask_tensor.dtype().size(), - hipMemcpyHostToDevice, stream)); - - hipEvent_t begin, end; - HIP_VALIDATE_NO_ERRORS(hipEventCreate(&begin)); - HIP_VALIDATE_NO_ERRORS(hipEventCreate(&end)); - - HIP_VALIDATE_NO_ERRORS(hipEventRecord(begin, stream)); - roccv::Composite op; - op(stream, foreground_tensor, background_tensor, mask_tensor, output_tensor); - HIP_VALIDATE_NO_ERRORS(hipEventRecord(end, stream)); - HIP_VALIDATE_NO_ERRORS(hipEventSynchronize(end)); + if (config.device == eDeviceType::GPU) { + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); + } - float duration; - HIP_VALIDATE_NO_ERRORS(hipEventElapsedTime(&duration, begin, end)); - printf("Kernel execution time: %fms\n", 
duration); + hipStream_t stream; + CHECK_HIP_ERROR(hipStreamCreate(&stream)); - HIP_VALIDATE_NO_ERRORS(hipEventDestroy(begin)); - HIP_VALIDATE_NO_ERRORS(hipEventDestroy(end)); + // Create required tensors for the Composite operation + Tensor background = LoadImages(hipStreamPerThread, config.backgroundPath, config.device); + Tensor foreground = LoadImages(hipStreamPerThread, config.foregroundPath, config.device); + Tensor mask = LoadImages(hipStreamPerThread, config.maskPath, config.device, cv::IMREAD_GRAYSCALE); + Tensor output = Tensor(background.shape(), background.dtype(), config.device); - // Move image data back to device - auto out_data = output_tensor.exportData(); - std::vector out_h(output_tensor.shape().size()); - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(out_h.data(), out_data.basePtr(), - output_tensor.shape().size() * output_tensor.dtype().size(), - hipMemcpyDeviceToHost, stream)); + // Run the Composite operation + Composite op; + op(stream, foreground, background, mask, output, config.device); - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); + // Write output images to disk + WriteImages(stream, output, config.outputPath); - // Write normalized image to disk - cv::Mat output_image_data(background_data.rows, background_data.cols, CV_8UC3, out_h.data()); - cv::imwrite(argv[4], output_image_data); + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file From 1d1879b8f10dd551564288635c9aa638d15ef400 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 8 Dec 2025 15:45:12 -0500 Subject: [PATCH 21/36] Fix crop and resize example --- samples/common/utils.hpp | 73 +------- samples/cropandresize/cpp/main.cpp | 281 ++++++++++++++--------------- 2 files changed, 139 insertions(+), 215 deletions(-) diff --git a/samples/common/utils.hpp b/samples/common/utils.hpp index b34dedb4..bd0b806f 100644 --- a/samples/common/utils.hpp +++ b/samples/common/utils.hpp @@ -80,6 +80,7 @@ inline MemcpyParams GetMemcpyParams(const 
roccv::Tensor &tensor) { * @param image_path The path to the image to load. If a directory is provided, all supported images in the directory * will be loaded. * @param device The device to load the images onto. Defaults to GPU. + * @param openCVFlags The OpenCV flags to use when loading the images. Defaults to IMREAD_UNCHANGED. * @return A NHWC tensor containing the loaded images. */ inline roccv::Tensor LoadImages(hipStream_t stream, const std::string &image_path, @@ -190,76 +191,4 @@ inline void WriteImages(hipStream_t stream, const roccv::Tensor &tensor, const s } else { cv::imwrite(output_path, images[0]); } -} - -/** - * @brief Loads images into the GPU memory specified. - * - * @param images_dir Either a directory or a single image to load into GPU memory. - * @param num_images The number of images to load into GPU memory. - * @param gpu_input A pointer to valid GPU memory. - */ -inline void DecodeRGBIImage(const std::string &images_dir, int num_images, void *gpu_input) { - const std::vector supportedExtensions = {".bmp", ".jpg", ".jpeg", ".png"}; - - std::vector imageFiles; - if (std::filesystem::is_directory(images_dir)) { - // A directory is provided. Collect all supported files in the directory (non-recursively). - for (auto file : std::filesystem::directory_iterator(images_dir)) { - if (!std::filesystem::is_directory(file.path()) && ContainsExtension(file.path(), supportedExtensions)) { - imageFiles.push_back(file.path()); - } - } - - // Throw an error if there were no valid images found in the given directory - if (imageFiles.empty()) { - throw std::runtime_error("No valid images found in directory " + images_dir); - } - } else { - // A single image file is provided - if (!ContainsExtension(images_dir, supportedExtensions)) - throw std::runtime_error("Cannot decode " + images_dir + ". 
File type not supported.\n"); - imageFiles.push_back(images_dir); - } - - // Load images into provided GPU memory - size_t mem_offset = 0; - for (int b = 0; b < num_images; b++) { - cv::Mat inputMat = cv::imread(imageFiles[b]); - if (inputMat.empty()) { - throw std::runtime_error("Unable to load image " + imageFiles[b]); - } - - size_t imageSize = inputMat.rows * inputMat.cols * inputMat.channels() * sizeof(uint8_t); - CHECK_HIP_ERROR( - hipMemcpy(static_cast(gpu_input) + mem_offset, inputMat.data, imageSize, hipMemcpyHostToDevice)); - mem_offset += imageSize; - } -} - -/** - * @brief Writes a batch of 3-channel RGBI images in a tensor to .bmp files. This will also block on the provided - * stream. - * - * @param tensor A tensor containing a batch of RGBI images. - * @param stream The HIP stream to synchronize with. - */ -inline void WriteRGBITensor(const roccv::Tensor &tensor, hipStream_t stream) { - CHECK_HIP_ERROR(hipStreamSynchronize(stream)); - - auto srcData = tensor.exportData(); - int batchSize = tensor.shape(tensor.layout().batch_index()); - int height = tensor.shape(tensor.layout().height_index()); - int width = tensor.shape(tensor.layout().width_index()); - - // Write each image in the batch to separate .bmp files - for (int b = 0; b < batchSize; b++) { - std::ostringstream outFilename; - outFilename << "./roccvtest_" << b << ".bmp"; - - cv::Mat outputMat(height, width, CV_8UC3); - CHECK_HIP_ERROR(hipMemcpy(outputMat.data, srcData.basePtr(), - (tensor.shape().size() / batchSize) * tensor.dtype().size(), hipMemcpyDeviceToHost)); - cv::imwrite(outFilename.str().c_str(), outputMat); - } } \ No newline at end of file diff --git a/samples/cropandresize/cpp/main.cpp b/samples/cropandresize/cpp/main.cpp index 8b8ed9bd..8f92f5a8 100644 --- a/samples/cropandresize/cpp/main.cpp +++ b/samples/cropandresize/cpp/main.cpp @@ -23,9 +23,9 @@ #include #include +#include #include #include -#include #include #include #include @@ -35,170 +35,165 @@ #include 
"core/tensor_shape.hpp" /** + * @file main.cpp * @brief Crop and Resize sample app. * - * The Crop and Resize is a simple pipeline which demonstrates usage of - * rocCV Tensor along with a few operators. + * The Crop and Resize is a simple pipeline which demonstrates usage of the + * rocCV Tensor along with the Custom Crop and Resize operators. * * Input Batch Tensor -> Crop -> Resize -> WriteImage */ -/** - * @brief Utility to show usage of sample app - * - **/ -void showUsage() { - std::cout << "usage: ./roccv_cropandresize_app -i -b " - << std::endl; +using namespace roccv; + +struct Config { + std::string inputPath; + std::string outputPath = "output"; + eDeviceType device = eDeviceType::GPU; + Size2D resizeShape = {320, 480}; + Box_t cropRect = {50, 150, 400, 300}; + eInterpolationType interpolation = eInterpolationType::INTERP_TYPE_LINEAR; + int deviceId = 0; +}; + +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "Usage: " << programName << " -i [options]" << std::endl; + std::cout << " -i, --input Input image or directory (required)" << std::endl; + std::cout << "Options:" << std::endl; + std::cout << " -o, --output Output image or directory (optional, default: output)" << std::endl; + std::cout << " -r, --resize Resize shape as width,height (optional, default: 320,480)" << std::endl; + std::cout << " -c, --crop Crop rectangle as x,y,w,h (optional, default: 50,150,400,300)" << std::endl; + std::cout << " -I, --interpolation Interpolation type: 0=NEAREST, 1=LINEAR, 2=CUBIC (optional, default: LINEAR)" << std::endl; + std::cout << " -C, --cpu Use CPU for execution (optional, default: GPU)" << std::endl; + std::cout << " -d, --device Device ID to use for execution (optional, default: 0)" << std::endl; + std::cout << " -h, --help Show this help message" << std::endl; + std::cout << std::endl; + // clang-format on } -/** - * @brief Utility to parse the command line arguments - * - **/ -int ParseArgs(int argc, char *argv[], 
std::string &imagePath, uint32_t &batchSize) { - static struct option long_options[] = {{"help", no_argument, 0, 'h'}, - {"imagePath", required_argument, 0, 'i'}, - {"batch", required_argument, 0, 'b'}, - {0, 0, 0, 0}}; - - int long_index = 0; - int opt = 0; - while ((opt = getopt_long(argc, argv, "hi:b:", long_options, &long_index)) != -1) { +void ParseCropRectangle(const std::string& cropStr, Box_t& cropRect) { + std::istringstream iss(cropStr); + std::string token; + getline(iss, token, ','); + cropRect.x = std::stoi(token); + getline(iss, token, ','); + cropRect.y = std::stoi(token); + getline(iss, token, ','); + cropRect.width = std::stoi(token); + getline(iss, token, ','); + cropRect.height = std::stoi(token); +} + +void ParseResizeShape(const std::string& resizeStr, Size2D& resizeShape) { + std::istringstream iss(resizeStr); + std::string token; + getline(iss, token, ','); + resizeShape.w = std::stoi(token); + getline(iss, token, ','); + resizeShape.h = std::stoi(token); +} + +int main(int argc, char** argv) { + Config config; + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"resize", required_argument, nullptr, 'r'}, + {"crop", required_argument, nullptr, 'c'}, + {"interpolation", required_argument, nullptr, 'I'}, + {"cpu", no_argument, nullptr, 'C'}, + {"device", required_argument, nullptr, 'd'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + + // Parse command line arguments + int opt; + while ((opt = getopt_long(argc, argv, "i:o:r:c:I:d:h:C", longOptions, nullptr)) != -1) { switch (opt) { - case 'h': - showUsage(); - return -1; - break; case 'i': - imagePath = optarg; + config.inputPath = optarg; break; - case 'b': - batchSize = std::stoi(optarg); + case 'o': + config.outputPath = optarg; break; - case ':': - showUsage(); - return -1; - default: + case 'r': + ParseResizeShape(optarg, config.resizeShape); + break; + case 'c': + 
ParseCropRectangle(optarg, config.cropRect); break; + case 'I': + config.interpolation = static_cast(std::stoi(optarg)); + break; + case 'C': + config.device = eDeviceType::CPU; + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; } } - std::ifstream imageFile(imagePath); - if (!imageFile.good()) { - showUsage(); - std::cerr << "Image path '" + imagePath + "' does not exist" << std::endl; - return -1; - } - return 0; -} -int main(int argc, char *argv[]) { - // Default parameters - // TODO: Default parameter for images cannot be added for now. Must specify a sample asset directory in the final - // build which is relative to this executable. - std::string imagePath = "none.jpg"; - uint32_t batchSize = 1; - - // Parse the command line paramaters to override the default parameters - int retval = ParseArgs(argc, argv, imagePath, batchSize); - if (retval != 0) { - return retval; + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); + return EXIT_FAILURE; } - // Note : The maximum input image dimensions needs to be updated in case - // of testing with different test images - - int maxImageWidth = 720; - int maxImageHeight = 480; - int maxChannels = 3; + if (config.device == eDeviceType::GPU) { + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); + } - // tag: Create the HIP stream hipStream_t stream; CHECK_HIP_ERROR(hipStreamCreate(&stream)); - // tag: Allocate input tensor - // Allocating memory for RGBI input image batch of uint8_t data type. 
- - roccv::TensorDataStrided::Buffer inBuf; - inBuf.strides[3] = sizeof(uint8_t); - inBuf.strides[2] = maxChannels * inBuf.strides[3]; - inBuf.strides[1] = maxImageWidth * inBuf.strides[2]; - inBuf.strides[0] = maxImageHeight * inBuf.strides[1]; - CHECK_HIP_ERROR(hipMallocAsync(&inBuf.basePtr, batchSize * inBuf.strides[0], stream)); - - // tag: Tensor Requirements - // Calculate the requirements for the RGBI uint8_t Tensor which include - // pitch bytes, alignment, shape and tensor layout - roccv::Tensor::Requirements inReqs = - roccv::Tensor::CalcRequirements(batchSize, {maxImageWidth, maxImageHeight}, roccv::FMT_RGB8); - - // Create a tensor buffer to store the data pointer and pitch bytes for each plane - roccv::TensorDataStridedHip inData(roccv::TensorShape{inReqs.shape, inReqs.rank, inReqs.layout}, - roccv::DataType{inReqs.dtype}, inBuf); - - // Wrap tensor data in a rocCV tensor for use with the rocCV operators. - roccv::Tensor inTensor = roccv::TensorWrapData(inData); - - // tag: Image Loading - uint8_t *gpuInput = reinterpret_cast(inBuf.basePtr); - // The total images is set to the same value as batch size for testing - uint32_t totalImages = batchSize; - - // OpenCV is used to load the images, which gets copied into device memory. - DecodeRGBIImage(imagePath, totalImages, gpuInput); - - // tag: The input buffer is now ready to be used by the operators - - // Set parameters for Crop and Resize - // ROI dimensions to crop in the input image - int cropX = 50; - int cropY = 150; - int cropWidth = 400; - int cropHeight = 300; - - // Set the resize dimensions - int resizeWidth = 320; - int resizeHeight = 240; - - // Create the crop rect for the cropping operator - roccv::Box_t crpRect = {cropX, cropY, cropWidth, cropHeight}; - - // tag: Allocate Tensors for Crop and Resize - // Create a rocCV Tensor based on the crop window size. 
- roccv::Tensor cropTensor(batchSize, {cropWidth, cropHeight}, roccv::FMT_RGB8); - // Create a rocCV Tensor based on resize dimensions - roccv::Tensor resizedTensor(batchSize, {resizeWidth, resizeHeight}, roccv::FMT_RGB8); - -#ifdef PROFILE_SAMPLE - hipEvent_t start, stop; - hipEventCreate(&start); - hipEventCreate(&stop); - hipEventRecord(start); -#endif - // tag: Initialize operators for Crop and Resize - roccv::CustomCrop cropOp; - roccv::Resize resizeOp; - - // tag: Executes the CustomCrop operation on the given HIP stream - cropOp(stream, inTensor, cropTensor, crpRect); - - // Resize operator can now be enqueued into the same stream - resizeOp(stream, cropTensor, resizedTensor, INTERP_TYPE_LINEAR); - - // tag: Profile section -#ifdef PROFILE_SAMPLE - hipEventRecord(stop); - hipEventSynchronize(stop); - float operatorms = 0; - hipEventElapsedTime(&operatorms, start, stop); - std::cout << "Time for Crop and Resize : " << operatorms << " ms" << std::endl; -#endif - - // tag: Copy the buffer to CPU and write resized image into .bmp files - WriteRGBITensor(resizedTensor, stream); - - // tag: Clean up + // Load batch of input images + Tensor input = LoadImages(stream, config.inputPath, config.device); + + // Determine the batch size and channels from the input tensor + int64_t batchSize = input.shape(input.layout().batch_index()); + int64_t channels = input.shape(input.layout().channels_index()); + + // Create tensor for the cropped image + Tensor cropTensor = + Tensor(TensorShape(input.layout(), {batchSize, config.cropRect.height, config.cropRect.width, channels}), + input.dtype(), config.device); + + // Create tensor for the resized image + Tensor resizedTensor = + Tensor(TensorShape(input.layout(), {batchSize, config.resizeShape.w, config.resizeShape.h, channels}), + input.dtype(), config.device); + + // Create crop and resize operators + CustomCrop cropOp; + Resize resizeOp; + + std::chrono::high_resolution_clock::time_point start = 
std::chrono::high_resolution_clock::now(); + + // Run the crop operation, writing results to the crop tensor + cropOp(stream, input, cropTensor, config.cropRect, config.device); + + // Run the resize operation, writing results to the resized tensor + resizeOp(stream, cropTensor, resizedTensor, config.interpolation, config.device); + CHECK_HIP_ERROR(hipStreamSynchronize(stream)); + + std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now(); + + // Report the duration of the crop and resize operation + long executionTime = std::chrono::duration_cast(end - start).count(); + std::cout << "Processed " << batchSize << " images in " << executionTime << "ms" << std::endl; + + // Write the cropped and resized images to disk + WriteImages(stream, resizedTensor, config.outputPath); + + // Destroy the stream CHECK_HIP_ERROR(hipStreamDestroy(stream)); - // tag: End of Sample + return EXIT_SUCCESS; } \ No newline at end of file From b7cff204b680e27e9df27228612c20176edeb87d Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 8 Dec 2025 15:49:39 -0500 Subject: [PATCH 22/36] Add more information to the help message --- samples/cropandresize/cpp/main.cpp | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/samples/cropandresize/cpp/main.cpp b/samples/cropandresize/cpp/main.cpp index 8f92f5a8..f2a9c939 100644 --- a/samples/cropandresize/cpp/main.cpp +++ b/samples/cropandresize/cpp/main.cpp @@ -58,16 +58,23 @@ struct Config { void PrintUsage(const char* programName) { // clang-format off - std::cout << "Usage: " << programName << " -i [options]" << std::endl; - std::cout << " -i, --input Input image or directory (required)" << std::endl; - std::cout << "Options:" << std::endl; - std::cout << " -o, --output Output image or directory (optional, default: output)" << std::endl; - std::cout << " -r, --resize Resize shape as width,height (optional, default: 320,480)" << std::endl; - std::cout << " -c, --crop 
Crop rectangle as x,y,w,h (optional, default: 50,150,400,300)" << std::endl; - std::cout << " -I, --interpolation Interpolation type: 0=NEAREST, 1=LINEAR, 2=CUBIC (optional, default: LINEAR)" << std::endl; - std::cout << " -C, --cpu Use CPU for execution (optional, default: GPU)" << std::endl; - std::cout << " -d, --device Device ID to use for execution (optional, default: 0)" << std::endl; - std::cout << " -h, --help Show this help message" << std::endl; + std::cout << "rocCV Crop and Resize Sample Application\n"; + std::cout << "----------------------------------------\n"; + std::cout << "This sample demonstrates how to set up a simple image processing pipeline using rocCV.\n"; + std::cout << "It shows reading a batch of images, cropping them to a specified rectangle,\n"; + std::cout << "resizing the cropped images to a target shape, and writing the outputs to image files.\n"; + std::cout << "You may select the device (CPU or GPU), interpolation type, input size, and more.\n"; + std::cout << '\n'; + std::cout << "Usage: " << programName << " -i [options]\n"; + std::cout << " -i, --input Input image or directory (required)\n"; + std::cout << "Options:\n"; + std::cout << " -o, --output Output image or directory (optional, default: output)\n"; + std::cout << " -r, --resize Resize shape as width,height (optional, default: 320,480)\n"; + std::cout << " -c, --crop Crop rectangle as x,y,w,h (optional, default: 50,150,400,300)\n"; + std::cout << " -I, --interpolation Interpolation type: 0=NEAREST, 1=LINEAR, 2=CUBIC (optional, default: LINEAR)\n"; + std::cout << " -C, --cpu Use CPU for execution (optional, default: GPU)\n"; + std::cout << " -d, --device Device ID to use for execution (optional, default: 0)\n"; + std::cout << " -h, --help Show this help message\n"; std::cout << std::endl; // clang-format on } From 370c3373e5d77c7b197c48cfb17340ab8e5ef5e9 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Tue, 6 Jan 2026 16:04:38 -0500 Subject: [PATCH 23/36] Fix tensor copies 
for benchmarking suite --- benchmarks/src/roccv/roccv_bench_helpers.cpp | 51 +++++++++++++++----- 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/benchmarks/src/roccv/roccv_bench_helpers.cpp b/benchmarks/src/roccv/roccv_bench_helpers.cpp index 3fbb42f1..82471b58 100644 --- a/benchmarks/src/roccv/roccv_bench_helpers.cpp +++ b/benchmarks/src/roccv/roccv_bench_helpers.cpp @@ -26,21 +26,48 @@ #include #include +struct MemcpyParams { + void* basePtr = nullptr; // Base pointer to the tensor data + size_t rowPitch = 0; // Number of bytes per row, including padding + size_t rowBytes = 0; // Number of bytes per row, not including padding + size_t imageBytes = 0; // Number of bytes per image, including padding (rowBytes * height) +}; + +/** + * @brief Gets the memcpy parameters for a tensor to perform a memcpy2D operation. + * + * @param tensor The tensor to get the memcpy parameters for. + * @return The memcpy parameters to perform a memcpy2D operation. + */ +inline MemcpyParams GetMemcpyParams(const roccv::Tensor& tensor) { + MemcpyParams params; + + roccv::TensorDataStrided tensorData = tensor.exportData(); + params.rowPitch = tensorData.stride(tensor.layout().height_index()); + params.rowBytes = tensor.shape(tensor.layout().width_index()) * tensor.shape(tensor.layout().channels_index()) * + tensor.dtype().size(); + params.imageBytes = params.rowPitch * tensor.shape(tensor.layout().height_index()); + params.basePtr = tensorData.basePtr(); + + return params; +} + template void MoveToTensor(const roccv::Tensor& tensor, const std::vector& vec) { - auto tensor_data = tensor.exportData(); - switch (tensor.device()) { - case eDeviceType::GPU: { - HIP_VALIDATE_NO_ERRORS(hipMemcpy(tensor_data.basePtr(), vec.data(), - tensor.shape().size() * tensor.dtype().size(), hipMemcpyHostToDevice)); - break; - } + const hipMemcpyKind kind = (tensor.device() == eDeviceType::GPU) ? 
hipMemcpyHostToDevice : hipMemcpyHostToHost; - case eDeviceType::CPU: { - HIP_VALIDATE_NO_ERRORS(hipMemcpy(tensor_data.basePtr(), vec.data(), - tensor.shape().size() * tensor.dtype().size(), hipMemcpyHostToHost)); - break; - } + if (tensor.isContiguous()) { + // Contiguous data, so we can use a simple memcpy. + const size_t totalBytes = tensor.dataSize(); + void* basePtr = tensor.exportData().basePtr(); + HIP_VALIDATE_NO_ERRORS(hipMemcpy(basePtr, vec.data(), totalBytes, kind)); + } else { + // Data is padded, so we need to use a memcpy2D. + const MemcpyParams params = GetMemcpyParams(tensor); + const size_t batchSize = tensor.shape(tensor.layout().batch_index()); + const size_t totalRows = batchSize * tensor.shape(tensor.layout().height_index()); + HIP_VALIDATE_NO_ERRORS(hipMemcpy2D(params.basePtr, params.rowPitch, vec.data(), params.rowBytes, + params.rowBytes, totalRows, kind)); } } From 3ee963622e5af0dc85aafc3658244a51614cba67 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 7 Jan 2026 12:05:48 -0500 Subject: [PATCH 24/36] Fix tensor padding calculations --- src/core/tensor.cpp | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index bfd5835a..e013c998 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -42,6 +42,21 @@ THE SOFTWARE. 
namespace roccv { +namespace { +int GetFirstPackedDimension(const TensorLayout& layout) { + const int rank = layout.rank(); + switch (layout.elayout()) { + case eTensorLayout::TENSOR_LAYOUT_NHWC: + case eTensorLayout::TENSOR_LAYOUT_LNHWC: + case eTensorLayout::TENSOR_LAYOUT_HWC: + case eTensorLayout::TENSOR_LAYOUT_NWC: + return std::max(0, rank - 2); + default: + return rank - 1; + } +} +} // namespace + // Constructor definitions Tensor::Tensor(const Tensor::Requirements& reqs, const IAllocator& alloc) : m_requirements(reqs) { m_data = std::make_shared(this->dataSize(), reqs.device, alloc); @@ -230,15 +245,20 @@ std::array Tensor::CalcStrides(const TensorShape int32_t rowAlign) { // Calculate strides based on the given tensor shape. Strides are byte-wise. std::array strides; + + const int firstPackedDim = GetFirstPackedDimension(shape.layout()); + strides[shape.layout().rank() - 1] = dtype.size(); for (int i = shape.layout().rank() - 2; i >= 0; i--) { - // Ensure strides for the row are padded to the next multiple of the alignment. - if (i == shape.layout().height_index()) { + // The stride dimension preceeding the first packed dimension is padded to the next multiple of the row + // alignment. + if (i == firstPackedDim - 1) { strides[i] = detail::AlignUp(strides[i + 1] * shape[i + 1], rowAlign); } else { strides[i] = strides[i + 1] * shape[i + 1]; } } + return strides; } From 8c0b2fbb016ccf399b88022dfc6979283caaed15 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 7 Jan 2026 13:44:13 -0500 Subject: [PATCH 25/36] Add documentation to helper function --- src/core/tensor.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index e013c998..46e7552d 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -43,6 +43,16 @@ THE SOFTWARE. namespace roccv { namespace { + +/* + * @brief Returns the index of the first packed dimension in the given tensor layout. 
+ * + * In most cases, the first packed dimension is the last dimension. However, for layouts ending in WC, the first packed + * dimension is the second to last dimension. + * + * @param[in] layout The tensor layout to get the first packed dimension for. + * @return The index of the first packed dimension in the given tensor layout. + */ int GetFirstPackedDimension(const TensorLayout& layout) { const int rank = layout.rank(); switch (layout.elayout()) { From 807396947759ffcc7b156617a51255853702c270 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 7 Jan 2026 14:38:27 -0500 Subject: [PATCH 26/36] Reuse reshape logic --- src/core/tensor.cpp | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index 46e7552d..0ab41b40 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -125,21 +125,7 @@ TensorData Tensor::exportData() const { } } -Tensor Tensor::reshape(const TensorShape& new_shape) const { - if (!isContiguous()) { - throw Exception("Tensor is not contiguous. 
Reshape can only be performed on contiguous tensors.", - eStatusType::INVALID_VALUE); - } - - // New tensor shape must have the same number of elements - if (new_shape.size() != this->shape().size()) { - throw Exception("New tensor shape does not match the number of elements of the old shape.", - eStatusType::INVALID_VALUE); - } - - Tensor::Requirements reqs = CalcRequirements(new_shape, this->dtype(), this->device()); - return Tensor(reqs, m_data); -} +Tensor Tensor::reshape(const TensorShape& new_shape) const { return this->reshape(new_shape, this->dtype()); } Tensor Tensor::reshape(const TensorShape& new_shape, const DataType& new_dtype) const { if (!isContiguous()) { From 97efadbffb687fcb51b5265a69a9c758e8d428e8 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Tue, 13 Jan 2026 14:57:12 -0500 Subject: [PATCH 27/36] Temp commit --- include/core/tensor.hpp | 11 +-- src/core/tensor.cpp | 70 ++++++++++++++++--- .../cpp/src/tests/core/tensor/test_tensor.cpp | 19 ++++- 3 files changed, 82 insertions(+), 18 deletions(-) diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index 6ca8c5cf..1a5b68fb 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -190,14 +190,15 @@ class Tensor { Tensor reshape(const TensorShape &new_shape) const; /** - * @brief Creates a vew of this tensor with a new shape, layout, and data type. The number of bytes allocated must - * match the original tensor. + * @brief Creates a view of this tensor with a new data type. * - * @param new_shape The new tensor shape. - * @param new_dtype The new data type of the underlying tensor data. + * This will attempt to reinterpret the tensor's base data type to the new data type by reshaping the tensor if + * necessary. If the reinterpretation is not possible, an exception will be thrown. + * + * @param newDtype The new data type of the underlying tensor data. 
* @return Tensor */ - Tensor reshape(const TensorShape &new_shape, const DataType &new_dtype) const; + Tensor reinterpret(const DataType &newDtype) const; /** * @brief Performs a shallow copy of the tensor (creates a view). diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index 0ab41b40..f806b134 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -53,7 +53,7 @@ namespace { * @param[in] layout The tensor layout to get the first packed dimension for. * @return The index of the first packed dimension in the given tensor layout. */ -int GetFirstPackedDimension(const TensorLayout& layout) { +static int GetFirstPackedDimension(const TensorLayout& layout) { const int rank = layout.rank(); switch (layout.elayout()) { case eTensorLayout::TENSOR_LAYOUT_NHWC: @@ -65,6 +65,61 @@ int GetFirstPackedDimension(const TensorLayout& layout) { return rank - 1; } } + +static bool ReshapeSimplified(int inRank, const std::array& inShape, + const std::array& inStrides, int targetRank, + const std::array& targetShape, + std::array& outStrides) { + int i = 0, j = 0; + for (; i < inRank && j < targetRank; i++) { + int64_t inE = inShape[i]; + int64_t outV = 1; + int group_start = j; + while (j < targetRank && (outV * targetShape[j]) <= inE) outV *= targetShape[j++]; + + if (outV != inE) { + return false; + } + + int64_t s = inStrides[i]; + for (int d = j - 1; d >= group_start; d--) { + outStrides[d] = s; + s *= targetShape[d]; + } + } + return true; +} + +static int Simplify(int rank, const std::array& shape, + const std::array& stride, + std::array& outShape, + std::array& outStrides) { + if (rank <= 1) { + if (rank == 1) { + outShape[0] = shape[0]; + outStrides[0] = stride[0]; + } + return rank; + } + + int outRank = 0; + int64_t vol = shape[0]; + for (int d = 1; d < rank; d++) { + if (stride[d - 1] != shape[d] * stride[d]) { + outStrides[outRank] = stride[d - 1]; + outShape[outRank] = vol; + vol = shape[d]; + outRank++; + } else { + vol *= shape[d]; + } + } + 
outStrides[outRank] = stride[rank - 1]; + outShape[outRank] = vol; + outRank++; + return outRank; +} + } // namespace // Constructor definitions @@ -125,19 +180,12 @@ TensorData Tensor::exportData() const { } } -Tensor Tensor::reshape(const TensorShape& new_shape) const { return this->reshape(new_shape, this->dtype()); } - -Tensor Tensor::reshape(const TensorShape& new_shape, const DataType& new_dtype) const { - if (!isContiguous()) { - throw Exception("Tensor is not contiguous. Reshape can only be performed on contiguous tensors.", - eStatusType::INVALID_VALUE); - } - - if (new_shape.size() * new_dtype.size() != this->shape().size() * this->dtype().size()) { +Tensor Tensor::reshape(const TensorShape& new_shape) const { + if (new_shape.size() * dtype().size() != this->shape().size() * this->dtype().size()) { throw Exception("New tensor view must have the same underlying number of bytes.", eStatusType::INVALID_VALUE); } - Tensor::Requirements reqs = CalcRequirements(new_shape, new_dtype, this->device()); + Tensor::Requirements reqs = CalcRequirements(new_shape, dtype(), this->device()); return Tensor(reqs, m_data); } diff --git a/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp b/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp index 16d8bd46..6e6da31a 100644 --- a/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp +++ b/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp @@ -76,10 +76,10 @@ void TestNegativeTensorShape() { * In both cases, the expected behavior is to throw an exception of type eStatusType::INVALID_VALUE. 
*/ void TestNegativeTensor() { - // Reshaping a non-contiguous tensor is invalid + // Test reshaping a tensor with mismatching number of elements { Tensor tensor(TensorShape({1, 2, 3}, "HWC"), DataType(DATA_TYPE_U8)); - EXPECT_EXCEPTION(tensor.reshape(TensorShape({1, 1, 2, 3}, "NHWC")), eStatusType::INVALID_VALUE); + EXPECT_EXCEPTION(tensor.reshape(TensorShape({1, 1, 2, 4}, "NHWC")), eStatusType::INVALID_VALUE); } } @@ -101,6 +101,20 @@ void TestTensorCorrectness() { EXPECT_EQ(tensor.shape().size(), 4 * 720 * 480 * 3); EXPECT_EQ(tensor.dtype().size(), 1); } +} + +void TestTensorReshapeCorrectness() { + { + Tensor tensor(TensorShape({1, 2, 3}, "HWC"), DataType(DATA_TYPE_U8)); + Tensor reshapedTensor = tensor.reshape(TensorShape({1, 1, 2, 3}, "NHWC")); + EXPECT_EQ(reshapedTensor.shape().size(), tensor.shape().size()); + EXPECT_EQ(reshapedTensor.rank(), 4); + + // Ensure they are sharing the same underlying data + auto data = tensor.exportData(); + auto dataReshaped = reshapedTensor.exportData(); + EXPECT_TRUE(data.basePtr() == dataReshaped.basePtr()); + } // Tensor reshape: Change layout and datatype { @@ -151,6 +165,7 @@ int main(int argc, char** argv) { // Correctness tests TEST_CASE(TestTensorCorrectness()); + // TEST_CASE(TestTensorReshapeCorrectness()); // Stride calculation tests // clang-format off From 6ded6bfe07c4997c5af4ffccc1fa9930a1237997 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Tue, 10 Mar 2026 11:25:41 -0400 Subject: [PATCH 28/36] Implement tensor reshape/reinterpret methods --- include/core/tensor.hpp | 22 ++++--- src/core/tensor.cpp | 64 ++++++++++++++++--- src/op_non_max_suppression.cpp | 10 +-- tests/roccv/cpp/include/test_helpers.hpp | 31 +++++---- .../cpp/src/tests/core/tensor/test_tensor.cpp | 14 +++- 5 files changed, 103 insertions(+), 38 deletions(-) diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index f68843bd..9ee7cd1e 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -190,23 +190,25 @@ class Tensor 
{ } /** - * @brief Creates a view of this tensor with a new shape and layout + * @brief Creates a view of this tensor with a new shape and layout, keeping the same data type. * - * @param[in] new_shape the new shape of the tensor - * @return Tensor + * @param[in] newShape The new shape of the tensor. + * @return A new tensor view with the given shape. */ - Tensor reshape(const TensorShape &new_shape) const; + Tensor reshape(const TensorShape &newShape) const; /** - * @brief Creates a view of this tensor with a new data type. + * @brief Creates a view of this tensor with a new data type and shape. * - * This will attempt to reinterpret the tensor's base data type to the new data type by reshaping the tensor if - * necessary. If the reinterpretation is not possible, an exception will be thrown. + * Reinterprets the tensor's underlying bytes with the given data type and shape. The total byte count + * (elements * dtype size) must match between the original and new view. Non-contiguous (padded) tensors + * are supported as long as the reshape is compatible with the stride structure. * - * @param newDtype The new data type of the underlying tensor data. - * @return Tensor + * @param[in] newDtype The new data type of the tensor elements. + * @param[in] newShape The new shape of the tensor. + * @return A new tensor view with the given data type and shape. */ - Tensor reinterpret(const DataType &newDtype) const; + Tensor reshape(const DataType &newDtype, const TensorShape &newShape) const; /** * @brief Performs a shallow copy of the tensor (creates a view). diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index 01aa5001..3aa131bb 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -44,7 +44,7 @@ namespace roccv { namespace { -/* +/** * @brief Returns the index of the first packed dimension in the given tensor layout. * * In most cases, the first packed dimension is the last dimension. 
However, for layouts ending in WC, the first packed @@ -90,14 +90,24 @@ static bool ReshapeSimplified(int inRank, const std::array& shape, - const std::array& stride, + const std::array& strides, std::array& outShape, std::array& outStrides) { if (rank <= 1) { if (rank == 1) { outShape[0] = shape[0]; - outStrides[0] = stride[0]; + outStrides[0] = strides[0]; } return rank; } @@ -105,8 +115,8 @@ static int Simplify(int rank, const std::array& int outRank = 0; int64_t vol = shape[0]; for (int d = 1; d < rank; d++) { - if (stride[d - 1] != shape[d] * stride[d]) { - outStrides[outRank] = stride[d - 1]; + if (strides[d - 1] != shape[d] * strides[d]) { + outStrides[outRank] = strides[d - 1]; outShape[outRank] = vol; vol = shape[d]; outRank++; @@ -114,7 +124,7 @@ static int Simplify(int rank, const std::array& vol *= shape[d]; } } - outStrides[outRank] = stride[rank - 1]; + outStrides[outRank] = strides[rank - 1]; outShape[outRank] = vol; outRank++; return outRank; @@ -182,12 +192,48 @@ TensorData Tensor::exportData() const { } } -Tensor Tensor::reshape(const TensorShape& new_shape) const { - if (new_shape.size() * dtype().size() != this->shape().size() * this->dtype().size()) { +Tensor Tensor::reshape(const TensorShape& newShape) const { return reshape(dtype(), newShape); } + +Tensor Tensor::reshape(const DataType& newDtype, const TensorShape& newShape) const { + if (newShape.size() * newDtype.size() != this->shape().size() * this->dtype().size()) { throw Exception("New tensor view must have the same underlying number of bytes.", eStatusType::INVALID_VALUE); } - Tensor::Requirements reqs = CalcRequirements(new_shape, dtype(), this->device()); + const int oldRank = m_requirements.rank; + const int newRank = newShape.layout().rank(); + + if (m_requirements.strides[oldRank - 1] != static_cast(dtype().size())) { + throw Exception("Cannot reshape tensor: innermost dimension is not element-contiguous.", + eStatusType::INVALID_VALUE); + } + + // Convert to a byte-level view by 
expanding the innermost dimension size + // by the element size and setting its stride to 1. + std::array byteShape = m_requirements.shape; + std::array byteStrides = m_requirements.strides; + byteShape[oldRank - 1] *= dtype().size(); + byteStrides[oldRank - 1] = 1; + + std::array simpleShape, simpleStrides; + int simpleRank = Simplify(oldRank, byteShape, byteStrides, simpleShape, simpleStrides); + + std::array targetByteShape = newShape.shape(); + targetByteShape[newRank - 1] *= newDtype.size(); + + std::array targetByteStrides; + bool result = + ReshapeSimplified(simpleRank, simpleShape, simpleStrides, newRank, targetByteShape, targetByteStrides); + if (!result) { + throw Exception("Cannot reshape tensor into requested shape and data type.", eStatusType::INVALID_VALUE); + } + + // The byte-level reshape produces stride 1 for the innermost dimension; + // scale it back to the new element size. + std::array newStrides = targetByteStrides; + newStrides[newRank - 1] = newDtype.size(); + + Tensor::Requirements reqs = + CalcRequirements(newShape, newDtype, newStrides, m_requirements.alignBytes, this->device()); return Tensor(reqs, m_data); } diff --git a/src/op_non_max_suppression.cpp b/src/op_non_max_suppression.cpp index 93c4f90c..28d21c7b 100644 --- a/src/op_non_max_suppression.cpp +++ b/src/op_non_max_suppression.cpp @@ -97,12 +97,12 @@ void NonMaximumSuppression::operator()(hipStream_t stream, const Tensor& input, // but typically involve going from non-vectorized to vectorized shapes. These shapes should be validated // beforehand. For example: an input tensor with shape and datatype [NWC, , S16] will be // reinterpreted as [NW, ]. In any case, the underlying data is structured the same. 
- Tensor inputReshaped = - input.reshape(TensorShape(TensorLayout(TENSOR_LAYOUT_NW), {numBatches, numBoxes}), DataType(DATA_TYPE_4S16)); + Tensor inputReshaped = input.reshape(DataType(DATA_TYPE_4S16), + TensorShape(TensorLayout(TENSOR_LAYOUT_NW), {numBatches, numBoxes})); Tensor outputReshaped = - output.reshape(TensorShape(TensorLayout(TENSOR_LAYOUT_NW), {numBatches, numBoxes}), DataType(DATA_TYPE_U8)); - Tensor scoresReshaped = - scores.reshape(TensorShape(TensorLayout(TENSOR_LAYOUT_NW), {numBatches, numBoxes}), DataType(DATA_TYPE_F32)); + output.reshape(DataType(DATA_TYPE_U8), TensorShape(TensorLayout(TENSOR_LAYOUT_NW), {numBatches, numBoxes})); + Tensor scoresReshaped = scores.reshape(DataType(DATA_TYPE_F32), + TensorShape(TensorLayout(TENSOR_LAYOUT_NW), {numBatches, numBoxes})); // Launch nms kernel switch (device) { diff --git a/tests/roccv/cpp/include/test_helpers.hpp b/tests/roccv/cpp/include/test_helpers.hpp index e566ed3c..7fd9cf8f 100644 --- a/tests/roccv/cpp/include/test_helpers.hpp +++ b/tests/roccv/cpp/include/test_helpers.hpp @@ -392,30 +392,37 @@ void CompareVectorsNear(const std::vector& result, const std::vector& ref, * If no padding, returns (total_size, 1, total_size) */ inline std::tuple ComputeCopyParams(const Tensor& tensor) { - auto tensorData = tensor.exportData(); - const auto& layout = tensor.layout(); - int heightIdx = layout.height_index(); - - if (heightIdx < 0) { - // No height dimension = contiguous data, no padding + if (tensor.isContiguous()) { size_t totalSize = tensor.shape().size() * tensor.dtype().size(); return {totalSize, 1, totalSize}; } - // Row width = product of all dimensions AFTER height_index × dtype size + auto tensorData = tensor.exportData(); + + // Find the padded dimension: the outermost dimension whose stride exceeds + // the packed product of the next dimension's shape and stride. 
+ int paddedDim = 0; + for (int i = 0; i < tensor.rank() - 1; i++) { + if (tensorData.stride(i) != tensor.shape(i + 1) * tensorData.stride(i + 1)) { + paddedDim = i; + break; + } + } + + // Row width = product of all dimensions AFTER paddedDim × dtype size size_t rowWidth = tensor.dtype().size(); - for (int i = heightIdx + 1; i < layout.rank(); ++i) { + for (int i = paddedDim + 1; i < tensor.rank(); ++i) { rowWidth *= tensor.shape(i); } - // Number of rows = product of all dimensions UP TO AND INCLUDING height_index + // Number of rows = product of all dimensions UP TO AND INCLUDING paddedDim size_t numRows = 1; - for (int i = 0; i <= heightIdx; ++i) { + for (int i = 0; i <= paddedDim; ++i) { numRows *= tensor.shape(i); } - // Tensor pitch comes from the stride at height_index - size_t tensorPitch = tensorData.stride(heightIdx); + // Tensor pitch comes from the stride at the padded dimension + size_t tensorPitch = tensorData.stride(paddedDim); return {rowWidth, numRows, tensorPitch}; } diff --git a/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp b/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp index 6e6da31a..86e46fe2 100644 --- a/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp +++ b/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp @@ -67,6 +67,15 @@ void TestNegativeTensorShape() { } } +/** + * @brief Negative tests related to Tensor reshape. + * + */ +void TestNegativeTensorReshape() { + Tensor tensor(TensorShape({1, 2, 3}, "HWC"), DataType(DATA_TYPE_U8)); + EXPECT_EXCEPTION(tensor.reshape(TensorShape({1, 1, 2, 4}, "NHWC")), eStatusType::INVALID_VALUE); +} + /** * @brief Negative tests for the Tensor class, verifying error handling in invalid scenarios. 
* @@ -119,7 +128,7 @@ void TestTensorReshapeCorrectness() { // Tensor reshape: Change layout and datatype { Tensor tensor(TensorShape({1, 5, 4}, "NWC"), DataType(DATA_TYPE_S16)); - Tensor reshapedTensor = tensor.reshape(TensorShape({1, 5}, "NW"), DataType(DATA_TYPE_4S16)); + Tensor reshapedTensor = tensor.reshape(DataType(DATA_TYPE_4S16), TensorShape({1, 5}, "NW")); EXPECT_NE(reshapedTensor.shape().size(), tensor.shape().size()); EXPECT_NE(reshapedTensor.rank(), tensor.rank()); EXPECT_EQ(reshapedTensor.rank(), 2); @@ -162,10 +171,11 @@ int main(int argc, char** argv) { // Negative tests TEST_CASE(TestNegativeTensorShape()); TEST_CASE(TestNegativeTensor()); + TEST_CASE(TestNegativeTensorReshape()); // Correctness tests TEST_CASE(TestTensorCorrectness()); - // TEST_CASE(TestTensorReshapeCorrectness()); + TEST_CASE(TestTensorReshapeCorrectness()); // Stride calculation tests // clang-format off From fb1cb4407c7859a0e8425d206fa831f780e9362a Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Tue, 10 Mar 2026 11:26:09 -0400 Subject: [PATCH 29/36] Remove unused MemcpyParams from benchmark helpers --- benchmarks/src/roccv/roccv_bench_helpers.cpp | 26 -------------------- 1 file changed, 26 deletions(-) diff --git a/benchmarks/src/roccv/roccv_bench_helpers.cpp b/benchmarks/src/roccv/roccv_bench_helpers.cpp index 48d2022c..f6c1c3b2 100644 --- a/benchmarks/src/roccv/roccv_bench_helpers.cpp +++ b/benchmarks/src/roccv/roccv_bench_helpers.cpp @@ -82,32 +82,6 @@ class RandomGenerator { rocrand_generator m_gen; }; -struct MemcpyParams { - void* basePtr = nullptr; // Base pointer to the tensor data - size_t rowPitch = 0; // Number of bytes per row, including padding - size_t rowBytes = 0; // Number of bytes per row, not including padding - size_t imageBytes = 0; // Number of bytes per image, including padding (rowBytes * height) -}; - -/** - * @brief Gets the memcpy parameters for a tensor to perform a memcpy2D operation. 
- * - * @param tensor The tensor to get the memcpy parameters for. - * @return The memcpy parameters to perform a memcpy2D operation. - */ -inline MemcpyParams GetMemcpyParams(const roccv::Tensor& tensor) { - MemcpyParams params; - - roccv::TensorDataStrided tensorData = tensor.exportData(); - params.rowPitch = tensorData.stride(tensor.layout().height_index()); - params.rowBytes = tensor.shape(tensor.layout().width_index()) * tensor.shape(tensor.layout().channels_index()) * - tensor.dtype().size(); - params.imageBytes = params.rowPitch * tensor.shape(tensor.layout().height_index()); - params.basePtr = tensorData.basePtr(); - - return params; -} - template void FillTensorImpl(const roccv::Tensor& tensor) { RandomGenerator generator(tensor.device()); From a14435e0cae1e9da4423d49b8f71c48e06f6fab8 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Tue, 10 Mar 2026 12:44:31 -0400 Subject: [PATCH 30/36] Add copyFromHost/copyToHost method definitions for Tensor --- include/core/tensor.hpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index 9ee7cd1e..f5ac3374 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -236,6 +236,24 @@ class Tensor { */ bool isContiguous() const; + /** + * @brief Copies data from a host pointer to the tensor. Host memory must be contiguous. This is a non-blocking + * operation, synchronized to the given stream. + * + * @param[in] src The source host pointer. + * @param[in] stream The stream to use for the copy. + */ + void copyFromHost(const void *src, hipStream_t stream = nullptr); + + /** + * @brief Copies data from the tensor to a host pointer. Host memory will be contiguous. This is a non-blocking + * operation, synchronized to the given stream. + * + * @param[out] dst The destination host pointer. Must be preallocated to the correct size. + * @param[in] stream The stream to use for the copy. 
+ */ + void copyToHost(void *dst, hipStream_t stream = nullptr) const noexcept; + /** * @brief Calculates tensor requirements using the default memory alignment strategy. * From dd11f3cd9f5ab1a4f72e09bf566df54e08c58918 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Tue, 10 Mar 2026 13:12:02 -0400 Subject: [PATCH 31/36] Use copyTo/copyFromHost implementations for test helpers --- include/core/tensor.hpp | 4 +- src/core/tensor.cpp | 60 ++++++++++++++++++++++++ tests/roccv/cpp/include/test_helpers.hpp | 22 ++------- 3 files changed, 66 insertions(+), 20 deletions(-) diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index f5ac3374..5722019a 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -243,7 +243,7 @@ class Tensor { * @param[in] src The source host pointer. * @param[in] stream The stream to use for the copy. */ - void copyFromHost(const void *src, hipStream_t stream = nullptr); + void copyFromHost(const void *src, hipStream_t stream = nullptr) const; /** * @brief Copies data from the tensor to a host pointer. Host memory will be contiguous. This is a non-blocking @@ -252,7 +252,7 @@ class Tensor { * @param[out] dst The destination host pointer. Must be preallocated to the correct size. * @param[in] stream The stream to use for the copy. */ - void copyToHost(void *dst, hipStream_t stream = nullptr) const noexcept; + void copyToHost(void *dst, hipStream_t stream = nullptr) const; /** * @brief Calculates tensor requirements using the default memory alignment strategy. diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index 3aa131bb..8e517578 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -130,6 +130,44 @@ static int Simplify(int rank, const std::array& return outRank; } +/** + * @brief Computes copy parameters for host-tensor copy. + * @return (row_width_bytes, num_rows, tensor_pitch). If contiguous, returns (total_size, 1, total_size). 
+ */ +static std::tuple ComputeCopyParams(int rank, + const std::array& shape, + const std::array& strides, + size_t dtypeSize, bool contiguous) { + if (contiguous) { + size_t totalSize = dtypeSize; + for (int i = 0; i < rank; ++i) { + totalSize *= static_cast(shape[i]); + } + return {totalSize, 1, totalSize}; + } + + int paddedDim = 0; + for (int i = 0; i < rank - 1; i++) { + if (strides[i] != shape[i + 1] * strides[i + 1]) { + paddedDim = i; + break; + } + } + + size_t rowWidth = dtypeSize; + for (int i = paddedDim + 1; i < rank; ++i) { + rowWidth *= static_cast(shape[i]); + } + + size_t numRows = 1; + for (int i = 0; i <= paddedDim; ++i) { + numRows *= static_cast(shape[i]); + } + + size_t tensorPitch = static_cast(strides[paddedDim]); + return {rowWidth, numRows, tensorPitch}; +} + } // namespace // Constructor definitions @@ -247,6 +285,28 @@ size_t Tensor::dataSize() const { return m_requirements.strides[0] * m_requireme bool Tensor::isContiguous() const { return dataSize() == shape().size() * dtype().size(); } +void Tensor::copyFromHost(const void* src, hipStream_t stream) const { + auto [rowWidth, numRows, tensorPitch] = ComputeCopyParams(m_requirements.rank, m_requirements.shape, + m_requirements.strides, dtype().size(), isContiguous()); + + const size_t srcPitch = rowWidth; + hipMemcpyKind kind = (device() == eDeviceType::GPU) ? hipMemcpyHostToDevice : hipMemcpyHostToHost; + + HIP_VALIDATE_NO_ERRORS( + hipMemcpy2DAsync(m_data->data(), tensorPitch, src, srcPitch, rowWidth, numRows, kind, stream)); +} + +void Tensor::copyToHost(void* dst, hipStream_t stream) const { + auto [rowWidth, numRows, tensorPitch] = ComputeCopyParams(m_requirements.rank, m_requirements.shape, + m_requirements.strides, dtype().size(), isContiguous()); + + const size_t dstPitch = rowWidth; + hipMemcpyKind kind = (device() == eDeviceType::GPU) ? 
hipMemcpyDeviceToHost : hipMemcpyHostToHost; + + HIP_VALIDATE_NO_ERRORS( + hipMemcpy2DAsync(dst, dstPitch, m_data->data(), tensorPitch, rowWidth, numRows, kind, stream)); +} + Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, eDeviceType device) { return CalcRequirements(shape, dtype, (MemAlignment){}, device); } diff --git a/tests/roccv/cpp/include/test_helpers.hpp b/tests/roccv/cpp/include/test_helpers.hpp index 7fd9cf8f..309d61a3 100644 --- a/tests/roccv/cpp/include/test_helpers.hpp +++ b/tests/roccv/cpp/include/test_helpers.hpp @@ -436,15 +436,8 @@ inline std::tuple ComputeCopyParams(const Tensor& tensor */ template void CopyVectorIntoTensor(const Tensor& dst, const std::vector& src) { - auto tensorData = dst.exportData(); - auto [rowWidth, numRows, dstPitch] = ComputeCopyParams(dst); - - // Source is always contiguous - size_t srcPitch = rowWidth; - - hipMemcpyKind kind = (dst.device() == eDeviceType::GPU) ? hipMemcpyHostToDevice : hipMemcpyHostToHost; - - HIP_VALIDATE_NO_ERRORS(hipMemcpy2D(tensorData.basePtr(), dstPitch, src.data(), srcPitch, rowWidth, numRows, kind)); + dst.copyFromHost(src.data(), nullptr); + HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(nullptr)); } /** @@ -460,15 +453,8 @@ void CopyVectorIntoTensor(const Tensor& dst, const std::vector& src) { */ template void CopyTensorIntoVector(std::vector& dst, const Tensor& src) { - auto tensorData = src.exportData(); - auto [rowWidth, numRows, srcPitch] = ComputeCopyParams(src); - - // Destination is always contiguous - size_t dstPitch = rowWidth; - - hipMemcpyKind kind = (src.device() == eDeviceType::GPU) ? hipMemcpyDeviceToHost : hipMemcpyHostToHost; - - HIP_VALIDATE_NO_ERRORS(hipMemcpy2D(dst.data(), dstPitch, tensorData.basePtr(), srcPitch, rowWidth, numRows, kind)); + src.copyToHost(dst.data(), nullptr); + HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(nullptr)); } /** * @brief Computes the strides for a tensor with a given shape and data type. 
From 8ffcc95c2f1bf04741263c00835c21613547ef8b Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Tue, 10 Mar 2026 14:01:59 -0400 Subject: [PATCH 32/36] Add tensor copy correctness tests --- .../cpp/src/tests/core/tensor/test_tensor.cpp | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp b/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp index 86e46fe2..6b543b3e 100644 --- a/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp +++ b/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp @@ -19,6 +19,8 @@ * THE SOFTWARE. */ +#include + #include #include @@ -140,6 +142,31 @@ void TestTensorReshapeCorrectness() { } } +/** + * @brief Tests the correctness of the copyFromHost and copyToHost methods. + * + */ +void TestTensorCopyCorrectness() { + Tensor tensor(2, {10, 10}, FMT_RGB8, eDeviceType::GPU); + const size_t hostDataSize = tensor.shape().size() * tensor.dtype().size(); + std::vector inputDataHost(hostDataSize); + for (size_t i = 0; i < inputDataHost.size(); i++) { + inputDataHost[i] = static_cast(i % 256); + } + + hipStream_t stream; + HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); + + tensor.copyFromHost(inputDataHost.data(), stream); + std::vector outputDataHost(hostDataSize); + tensor.copyToHost(outputDataHost.data(), stream); + + HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); + HIP_VALIDATE_NO_ERRORS(hipStreamDestroy(stream)); + + EXPECT_VECTOR_EQ(inputDataHost, outputDataHost); +} + /** * @brief Tests internal stride calculations on Tensor construction. 
*/ @@ -176,6 +203,7 @@ int main(int argc, char** argv) { // Correctness tests TEST_CASE(TestTensorCorrectness()); TEST_CASE(TestTensorReshapeCorrectness()); + TEST_CASE(TestTensorCopyCorrectness()); // Stride calculation tests // clang-format off From 1ac14a97d705832904bd2baa2bade338e846c32c Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Tue, 10 Mar 2026 14:15:24 -0400 Subject: [PATCH 33/36] Properly convert element-wise strides to byte-wise strides on DLPack --- python/src/py_tensor.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/src/py_tensor.cpp b/python/src/py_tensor.cpp index ab784347..8598e2eb 100644 --- a/python/src/py_tensor.cpp +++ b/python/src/py_tensor.cpp @@ -131,7 +131,8 @@ std::shared_ptr PyTensor::fromDLPack(pybind11::object src, eTensorLayo stridesData = roccv::Tensor::CalcStrides(shape, roccv::DataType(dtype), 0); } else { for (int i = 0; i < dlTensor.ndim; ++i) { - stridesData[i] = dlTensor.strides[i]; + // DLTensor strides are element-wise. Convert from element-wise to byte-wise. + stridesData[i] = dlTensor.strides[i] * roccv::DataType(dtype).size(); } } From add8a66659d0fb0ca5a1f935c9faa255df5f8d92 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Tue, 10 Mar 2026 14:17:11 -0400 Subject: [PATCH 34/36] Add byte_offset from DLPack to rocCV tensor base pointer --- python/src/py_tensor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/src/py_tensor.cpp b/python/src/py_tensor.cpp index 8598e2eb..28bf385d 100644 --- a/python/src/py_tensor.cpp +++ b/python/src/py_tensor.cpp @@ -141,8 +141,8 @@ std::shared_ptr PyTensor::fromDLPack(pybind11::object src, eTensorLayo roccv::Tensor::CalcRequirements(shape, roccv::DataType(dtype), stridesData, 0, device); // Since this tensor is coming from a DLPack, we don't own the data, so we need to create a view of the data. 
- std::shared_ptr data = - std::make_shared(dlTensor.data, device, eOwnership::VIEW); + std::shared_ptr data = std::make_shared( + static_cast(dlTensor.data) + dlTensor.byte_offset, device, eOwnership::VIEW); std::shared_ptr tensor = std::make_shared(reqs, data); // Instantiate a new tensor and a PyTensor to wrap it, binding the original DLManagedTensor From 2affddbb4b08387003b14bdd943490ecd483750b Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 11 Mar 2026 11:28:57 -0400 Subject: [PATCH 35/36] Add sensible default alignment for CPU allocated tensors --- include/core/mem_alignment.hpp | 2 + src/core/tensor.cpp | 76 +++++++++++++++++++++------------- 2 files changed, 50 insertions(+), 28 deletions(-) diff --git a/include/core/mem_alignment.hpp b/include/core/mem_alignment.hpp index 4aceceba..76a0d9f0 100644 --- a/include/core/mem_alignment.hpp +++ b/include/core/mem_alignment.hpp @@ -25,6 +25,8 @@ namespace roccv { +constexpr int32_t ROCCV_CPU_DEFAULT_ALIGNMENT = 64; // Default alignment for CPU memory + /** * @class MemAlignment * @brief Class for specifying memory alignment constraints for buffer allocations. diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index 8e517578..f95cf811 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -168,6 +168,51 @@ static std::tuple ComputeCopyParams(int rank, return {rowWidth, numRows, tensorPitch}; } +/** + * @brief Computes the memory alignment for a tensor based on the device, data type, and user provided buffer alignment. + * @param[in] device The device the tensor is to be allocated on. + * @param[in] dtype The datatype of the tensor. + * @param[in] bufAlign The memory alignment to use. + * @return The memory alignment for the tensor. 
+ */ +MemAlignment ComputeMemAlignment(eDeviceType device, const DataType& dtype, const MemAlignment& bufAlign) { + int dev = 0; + if (device == eDeviceType::GPU) { + HIP_VALIDATE_NO_ERRORS(hipGetDevice(&dev)); + } + + int rowAlign; + if (bufAlign.rowAddr() == 0) { + if (device == eDeviceType::GPU) { + HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&rowAlign, hipDeviceAttributeTexturePitchAlignment, dev)); + } else { + rowAlign = ROCCV_CPU_DEFAULT_ALIGNMENT; + } + rowAlign = std::lcm(rowAlign, detail::NextPowerOfTwo(dtype.size())); + } else { + if (!detail::IsPowerOfTwo(bufAlign.rowAddr())) { + throw Exception("Row address alignment must be a power of two.", eStatusType::INVALID_VALUE); + } + rowAlign = std::lcm(bufAlign.rowAddr(), detail::NextPowerOfTwo(dtype.size())); + } + + int baseAlign; + if (bufAlign.baseAddr() == 0) { + if (device == eDeviceType::GPU) { + HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&baseAlign, hipDeviceAttributeTextureAlignment, dev)); + } else { + baseAlign = ROCCV_CPU_DEFAULT_ALIGNMENT; + } + baseAlign = std::lcm(baseAlign, detail::NextPowerOfTwo(dtype.size())); + } else { + if (!detail::IsPowerOfTwo(bufAlign.baseAddr())) { + throw Exception("Base address alignment must be a power of two.", eStatusType::INVALID_VALUE); + } + baseAlign = std::lcm(bufAlign.baseAddr(), detail::NextPowerOfTwo(dtype.size())); + } + + return MemAlignment().baseAddr(baseAlign).rowAddr(rowAlign); +} } // namespace // Constructor definitions @@ -313,35 +358,10 @@ Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const Da Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, const MemAlignment& bufAlign, eDeviceType device) { - int dev; - HIP_VALIDATE_NO_ERRORS(hipGetDevice(&dev)); - - // Validate memory alignment, set default alignment if set to 0. - // TODO: Must be supported for CPU as well. 
- int rowAlign; - if (bufAlign.rowAddr() == 0) { - HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&rowAlign, hipDeviceAttributeTexturePitchAlignment, dev)); - rowAlign = std::lcm(rowAlign, detail::NextPowerOfTwo(dtype.size())); - } else { - if (!detail::IsPowerOfTwo(bufAlign.rowAddr())) { - throw Exception("Row address alignment must be a power of two.", eStatusType::INVALID_VALUE); - } - rowAlign = std::lcm(bufAlign.rowAddr(), detail::NextPowerOfTwo(dtype.size())); - } - - int baseAlign; - if (bufAlign.baseAddr() == 0) { - HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&baseAlign, hipDeviceAttributeTextureAlignment, dev)); - baseAlign = std::lcm(baseAlign, detail::NextPowerOfTwo(dtype.size())); - } else { - if (!detail::IsPowerOfTwo(bufAlign.baseAddr())) { - throw Exception("Base address alignment must be a power of two.", eStatusType::INVALID_VALUE); - } - baseAlign = std::lcm(bufAlign.baseAddr(), detail::NextPowerOfTwo(dtype.size())); - } + MemAlignment newAlign = ComputeMemAlignment(device, dtype, bufAlign); - std::array strides = CalcStrides(shape, dtype, rowAlign); - Tensor::Requirements reqs = CalcRequirements(shape, dtype, strides, baseAlign, device); + std::array strides = CalcStrides(shape, dtype, newAlign.rowAddr()); + Tensor::Requirements reqs = CalcRequirements(shape, dtype, strides, newAlign.baseAddr(), device); return reqs; } From ea91e85f55a29cfb5d14d6fb68a861bf346b72da Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 23 Mar 2026 18:35:45 -0400 Subject: [PATCH 36/36] Address PR comments --- include/core/tensor.hpp | 2 ++ samples/bilateral_filter.cpp | 2 +- samples/common/utils.hpp | 28 ++++++++++++++++++---------- samples/composite.cpp | 1 + samples/cropandresize/cpp/main.cpp | 2 +- 5 files changed, 23 insertions(+), 12 deletions(-) diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index 5722019a..7f2241a1 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -21,6 +21,8 @@ THE SOFTWARE. 
*/ #pragma once +#include + #include #include diff --git a/samples/bilateral_filter.cpp b/samples/bilateral_filter.cpp index 04364f5f..fdfd11d0 100644 --- a/samples/bilateral_filter.cpp +++ b/samples/bilateral_filter.cpp @@ -57,7 +57,7 @@ void PrintUsage(const char* programName) { std::cout << " -D, --diameter Diameter of the filtering area (optional, default: 2)" << std::endl; std::cout << " -s, --sigma_space Spatial parameter sigma of the Gaussian function (optional, default: 2.0f)" << std::endl; std::cout << " -c, --sigma_color Range parameter sigma of the Gaussian function (optional, default: 10.0f)" << std::endl; - std::cout << " -B, --border_mode Border mode at image boundary when work pixels are outside of the image (optional, default: 1 (replicate))" << std::endl; + std::cout << " -b, --border_mode Border mode at image boundary when work pixels are outside of the image (optional, default: 1 (replicate))" << std::endl; std::cout << " -B, --border_color Border color for constant color border mode (optional, default: 0,0,0,0)" << std::endl; std::cout << " -C, --cpu Use CPU for execution (optional, default: GPU)" << std::endl; std::cout << " -h, --help Show this help message" << std::endl; diff --git a/samples/common/utils.hpp b/samples/common/utils.hpp index bd0b806f..817cac9d 100644 --- a/samples/common/utils.hpp +++ b/samples/common/utils.hpp @@ -23,6 +23,7 @@ #include #include +#include #include inline void CheckHIPError(hipError_t code, const char *file, const int line) { @@ -97,24 +98,31 @@ inline roccv::Tensor LoadImages(hipStream_t stream, const std::string &image_pat if (std::filesystem::is_directory(image_path)) { for (auto file : std::filesystem::directory_iterator(image_path)) { if (!std::filesystem::is_directory(file.path()) && ContainsExtension(file.path(), supportedExtensions)) { - images.push_back(cv::imread(file.path(), openCVFlags)); + cv::Mat image = cv::imread(file.path(), openCVFlags); + if (image.empty()) { + throw 
std::runtime_error("Cannot decode " + file.path().string() + ". File type not supported.\n"); + } + images.push_back(image); // Check if all images are of the same size if (width == -1 && height == -1 && channels == -1) { - width = images.back().cols; - height = images.back().rows; - channels = images.back().channels(); - } else if (images.back().cols != width || images.back().rows != height || - images.back().channels() != channels) { + width = image.cols; + height = image.rows; + channels = image.channels(); + } else if (image.cols != width || image.rows != height || image.channels() != channels) { throw std::runtime_error("All images must be of the same size and format"); } } } } else if (std::filesystem::is_regular_file(image_path) && ContainsExtension(image_path, supportedExtensions)) { - images.push_back(cv::imread(image_path)); - width = images.back().cols; - height = images.back().rows; - channels = images.back().channels(); + cv::Mat image = cv::imread(image_path, openCVFlags); + if (image.empty()) { + throw std::runtime_error("Cannot decode " + image_path + ". File type not supported.\n"); + } + images.push_back(image); + width = image.cols; + height = image.rows; + channels = image.channels(); } else { throw std::runtime_error("Cannot decode " + image_path + ". 
File type not supported.\n"); } diff --git a/samples/composite.cpp b/samples/composite.cpp index 135dd9f2..bd21948f 100644 --- a/samples/composite.cpp +++ b/samples/composite.cpp @@ -93,6 +93,7 @@ int main(int argc, char** argv) { break; case 'd': config.deviceId = std::stoi(optarg); + break; case 'h': PrintUsage(argv[0]); return EXIT_SUCCESS; diff --git a/samples/cropandresize/cpp/main.cpp b/samples/cropandresize/cpp/main.cpp index f2a9c939..eec6532b 100644 --- a/samples/cropandresize/cpp/main.cpp +++ b/samples/cropandresize/cpp/main.cpp @@ -174,7 +174,7 @@ int main(int argc, char** argv) { // Create tensor for the resized image Tensor resizedTensor = - Tensor(TensorShape(input.layout(), {batchSize, config.resizeShape.w, config.resizeShape.h, channels}), + Tensor(TensorShape(input.layout(), {batchSize, config.resizeShape.h, config.resizeShape.w, channels}), input.dtype(), config.device); // Create crop and resize operators