diff --git a/benchmarks/src/roccv/roccv_bench_helpers.cpp b/benchmarks/src/roccv/roccv_bench_helpers.cpp index a84c1616..f6c1c3b2 100644 --- a/benchmarks/src/roccv/roccv_bench_helpers.cpp +++ b/benchmarks/src/roccv/roccv_bench_helpers.cpp @@ -59,13 +59,14 @@ class RandomGenerator { void generate(const roccv::Tensor& tensor) { const auto tensor_data = tensor.exportData(); + const size_t numElements = tensor.dataSize() / tensor.dtype().size(); + if constexpr (std::is_integral_v) { - rocrand_generate_char(m_gen, static_cast(tensor_data.basePtr()), - tensor.shape().size() * tensor.dtype().size()); + rocrand_generate_char(m_gen, static_cast(tensor_data.basePtr()), numElements); } else if constexpr (std::is_same_v) { - rocrand_generate_uniform(m_gen, static_cast(tensor_data.basePtr()), tensor.shape().size()); + rocrand_generate_uniform(m_gen, static_cast(tensor_data.basePtr()), numElements); } else if constexpr (std::is_same_v) { - rocrand_generate_uniform_double(m_gen, static_cast(tensor_data.basePtr()), tensor.shape().size()); + rocrand_generate_uniform_double(m_gen, static_cast(tensor_data.basePtr()), numElements); } else { throw std::runtime_error("Unsupported data type."); } diff --git a/include/core/mem_alignment.hpp b/include/core/mem_alignment.hpp new file mode 100644 index 00000000..76a0d9f0 --- /dev/null +++ b/include/core/mem_alignment.hpp @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#pragma once + +#include + +namespace roccv { + +constexpr int32_t ROCCV_CPU_DEFAULT_ALIGNMENT = 64; // Default alignment for CPU memory + +/** + * @class MemAlignment + * @brief Class for specifying memory alignment constraints for buffer allocations. + * + * The MemAlignment class allows you to specify the memory alignment in bytes that should + * be used when allocating or working with device or host memory buffers. Proper alignment + * is important for performance reasons and hardware compatibility, especially for GPU operations. + * + * There are two types of alignment constraints: + * - Base address alignment: Alignment requirements for the starting address of the buffer. + * - Row address alignment: Alignment requirements for the starting address of each row, which is + * important for multi-dimensional data (e.g., images or tensors). 
+ * + * Both alignment values default to 0, which implies that the system or device default alignment will be used. + * + * Example usage: + * @code + * roccv::MemAlignment align; + * align.baseAddr(256).rowAddr(128); + * @endcode + * + * @see roccv::Tensor + */ +class MemAlignment { + public: + MemAlignment() = default; + + /** + * @brief Returns the base address alignment. + * + * @return The base address alignment. + */ + int32_t baseAddr() const; + + /** + * @brief Returns the row address alignment. + * + * @return The row address alignment. + */ + int32_t rowAddr() const; + + /** + * @brief Sets the base address alignment. + * + * @param[in] alignment Alignment in bytes. + * @return A reference to this object, with the base address set. + */ + MemAlignment& baseAddr(int32_t alignment); + + /** + * @brief Sets the row address alignment. + * + * @param[in] alignment Alignment in bytes. + * @return A reference to this object, with the row address set. + */ + MemAlignment& rowAddr(int32_t alignment); + + private: + int32_t m_baseAddrAlignment = 0; + int32_t m_rowAddrAlignment = 0; +}; +} // namespace roccv \ No newline at end of file diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index e672db8a..7f2241a1 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -21,24 +21,26 @@ THE SOFTWARE. 
*/ #pragma once +#include + #include #include #include "core/data_type.hpp" #include "core/detail/allocators/i_allocator.hpp" +#include "core/detail/context.hpp" +#include "core/image_format.hpp" +#include "core/mem_alignment.hpp" +#include "core/tensor_data.hpp" #include "core/tensor_layout.hpp" +#include "core/tensor_requirements.hpp" +#include "core/tensor_shape.hpp" +#include "core/tensor_storage.hpp" #include "core/util_enums.h" -#include "tensor_data.hpp" -#include "tensor_requirements.hpp" -#include "tensor_storage.hpp" +#include "operator_types.h" namespace roccv { -class ImageFormat; -struct Size2D; -class TensorShape; -class TensorLayout; - class Tensor { public: using Requirements = TensorRequirements; @@ -50,8 +52,7 @@ class Tensor { * * @param[in] reqs An object representing the requirements for this tensor. */ - explicit Tensor(const TensorRequirements &reqs); - explicit Tensor(const TensorRequirements &reqs, const IAllocator &alloc); + explicit Tensor(const TensorRequirements &reqs, const IAllocator &alloc = GlobalContext().getDefaultAllocator()); /** * @brief Constructs a Tensor object given a list of requirements and the underlying data as a TensorStorage @@ -61,22 +62,34 @@ class Tensor { * @param[in] data A TensorStorage object for the tensor's underlying data. */ explicit Tensor(const TensorRequirements &reqs, std::shared_ptr data); - explicit Tensor(const TensorRequirements &reqs, std::shared_ptr data, const IAllocator &alloc); /** - * @brief Constructs a tensor object and allocates the appropriate amount of memory on the specified device. + * @brief Constructs a tensor object and allocates the appropriate amount of memory on the specified device. Uses + * the default memory alignment and allocation strategy. * * @param[in] shape The shape describing the tensor. * @param[in] dtype The underlying datatype of the tensor. * @param[in] device The device the tensor should be allocated on. 
*/ explicit Tensor(const TensorShape &shape, DataType dtype, eDeviceType device = eDeviceType::GPU); - explicit Tensor(const TensorShape &shape, DataType dtype, const IAllocator &alloc, + + /** + * @brief Constructs a tensor object and allocates the appropriate amount of memory on the specified device. Uses a + * user-specified memory alignment and allocation strategy. + * + * @param[in] shape The shape describing the tensor. + * @param[in] dtype The underlying datatype of the tensor. + * @param[in] bufAlign Specification for memory alignment. + * @param[in] alloc The allocation strategy. (Default: DefaultAllocator) + * @param[in] device The device the tensor should be allocated on. + */ + explicit Tensor(const TensorShape &shape, DataType dtype, const MemAlignment &bufAlign, + const IAllocator &alloc = GlobalContext().getDefaultAllocator(), eDeviceType device = eDeviceType::GPU); /** * @brief Constructs a tensor using image-based requirements and allocates the appropriate amount of memory on the - * specified device. + * specified device. Uses the default memory alignment and allocation strategy. * * @param[in] num_images The number of images in the batch. * @param[in] image_size The size for images in the batch. @@ -84,7 +97,20 @@ class Tensor { * @param[in] device The device the tensor should be allocated on. */ explicit Tensor(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device = eDeviceType::GPU); - explicit Tensor(int num_images, Size2D image_size, ImageFormat fmt, const IAllocator &alloc, + + /** + * @brief Constructs a tensor using image-based requirements and allocates the appropriate amount of memory on the + * specified device. Uses user-provided memory alignment and allocation strategies. + * + * @param[in] num_images The number of images in the batch. + * @param[in] image_size The size for images in the batch. + * @param[in] fmt The format of the underlying image data. + * @param[in] bufAlign Specification for memory alignment. 
+ * @param[in] alloc The allocation strategy. (Default: DefaultAllocator) + * @param[in] device The device the tensor should be allocated on. + */ + explicit Tensor(int num_images, Size2D image_size, ImageFormat fmt, const MemAlignment &bufAlign, + const IAllocator &alloc = GlobalContext().getDefaultAllocator(), eDeviceType device = eDeviceType::GPU); Tensor(const Tensor &other) = delete; @@ -166,28 +192,72 @@ class Tensor { } /** - * @brief Creates a view of this tensor with a new shape and layout + * @brief Creates a view of this tensor with a new shape and layout, keeping the same data type. * - * @param[in] new_shape the new shape of the tensor - * @return Tensor + * @param[in] newShape The new shape of the tensor. + * @return A new tensor view with the given shape. */ - Tensor reshape(const TensorShape &new_shape) const; + Tensor reshape(const TensorShape &newShape) const; /** - * @brief Creates a vew of this tensor with a new shape, layout, and data type. The number of bytes allocated must - * match the original tensor. + * @brief Creates a view of this tensor with a new data type and shape. * - * @param new_shape The new tensor shape. - * @param new_dtype The new data type of the underlying tensor data. - * @return Tensor + * Reinterprets the tensor's underlying bytes with the given data type and shape. The total byte count + * (elements * dtype size) must match between the original and new view. Non-contiguous (padded) tensors + * are supported as long as the reshape is compatible with the stride structure. + * + * @param[in] newDtype The new data type of the tensor elements. + * @param[in] newShape The new shape of the tensor. + * @return A new tensor view with the given data type and shape. */ - Tensor reshape(const TensorShape &new_shape, const DataType &new_dtype) const; + Tensor reshape(const DataType &newDtype, const TensorShape &newShape) const; + /** + * @brief Performs a shallow copy of the tensor (creates a view). 
+ * + * This assignment operator copies the tensor's metadata and data handle, + * resulting in a new tensor object that shares the same underlying data + * with the original tensor. No deep copy of the data is performed. + * + * @param other The tensor to assign from. + * @return Reference to this tensor. + */ Tensor &operator=(const Tensor &other); /** - * @brief Calculates tensor requirements. This essentially wraps the - * provided parameters into a TensorRequirements object. + * @brief Returns the total number of bytes being used to store the raw tensor data. + * + * @return Total number of bytes being used to store the raw tensor data. + */ + size_t dataSize() const; + + /** + * @brief Returns true if the tensor is contiguous in memory, meaning there is no padding present in the tensor. + * + * @return True if the tensor is contiguous in memory, false otherwise. + */ + bool isContiguous() const; + + /** + * @brief Copies data from a host pointer to the tensor. Host memory must be contiguous. This is a non-blocking + * operation, synchronized to the given stream. + * + * @param[in] src The source host pointer. + * @param[in] stream The stream to use for the copy. + */ + void copyFromHost(const void *src, hipStream_t stream = nullptr) const; + + /** + * @brief Copies data from the tensor to a host pointer. Host memory will be contiguous. This is a non-blocking + * operation, synchronized to the given stream. + * + * @param[out] dst The destination host pointer. Must be preallocated to the correct size. + * @param[in] stream The stream to use for the copy. + */ + void copyToHost(void *dst, hipStream_t stream = nullptr) const; + + /** + * @brief Calculates tensor requirements using the default memory alignment strategy. * * @param[in] shape The desired shape of the tensor. * @param[in] dtype The desired data type of the tensor's raw data. @@ -199,43 +269,72 @@ class Tensor { eDeviceType device = eDeviceType::GPU); /** - * @brief Calculates tensor requirements. 
+ * @brief Calculates tensor requirements with a user-provided memory alignment strategy. + * + * @param[in] shape The desired shape of the tensor. + * @param[in] dtype The desired data type of the tensor's raw data. + * @param[in] bufAlign Specification for memory alignment. + * @param[in] device The device the tensor data should belong to. + * @return A TensorRequirements object representing this tensor's + * requirements. + */ + static Requirements CalcRequirements(const TensorShape &shape, const DataType &dtype, const MemAlignment &bufAlign, + const eDeviceType device = eDeviceType::GPU); + + /** + * @brief Calculates tensor requirements with user-provided strides. * * @param[in] shape The shape describing the tensor. * @param[in] dtype The type of the tensor's data. * @param[in] strides The tensor's strides. + * @param[in] baseAlign The base address alignment. * @param[in] device The device the tensor data belongs on. (Default: GPU) * @return Tensor requirements. */ static Requirements CalcRequirements(const TensorShape &shape, const DataType &dtype, - std::array strides, + const std::array strides, int32_t baseAlign, eDeviceType device = eDeviceType::GPU); /** - * @brief Calculates tensor requirements using image-based parameters. + * @brief Calculates tensor requirements using image-based parameters. This will use a default memory alignment + * strategy. * * @param[in] num_images The number of images in the batch. * @param[in] image_size The size for images in the batch. * @param[in] fmt The format of the underlying image data. * @param[in] device The device the tensor data should belong to. - * @return A TensorRequirements object representing the tensor's requirements. + * @return A Tensor::Requirements object representing the tensor's requirements. 
*/ static Requirements CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device = eDeviceType::GPU); + /** + * @brief Calculates tensor requirements using image-based parameters and a specified memory alignment. + * + * @param[in] num_images The number of images in the batch. + * @param[in] image_size The size of images in the batch. + * @param[in] fmt The format of the underlying image data. + * @param[in] bufAlign Specification for memory alignment. + * @param[in] device The device the tensor is to be allocated on. + * @return A Tensor::Requirements object representing this tensor's requirements. + */ + static Requirements CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt, + const MemAlignment &bufAlign, eDeviceType device = eDeviceType::GPU); + /** * @brief Calculates strides required for a tensor. * * @param shape The tensor shape. * @param dtype The datatype of the tensor. + * @param rowAlign The row alignment to use. Setting to 0 will ensure contiguous memory usage. * @return An array containing strides for the given parameters. */ - static std::array CalcStrides(const TensorShape &shape, const DataType &dtype); + static std::array CalcStrides(const TensorShape &shape, const DataType &dtype, + int32_t rowAlign); private: TensorRequirements m_requirements; // Tensor metadata std::shared_ptr m_data; // Stores raw tensor data - const IAllocator &m_allocator; }; /** diff --git a/include/core/tensor_requirements.hpp b/include/core/tensor_requirements.hpp index 0bb6f631..3ff30d4e 100644 --- a/include/core/tensor_requirements.hpp +++ b/include/core/tensor_requirements.hpp @@ -29,16 +29,40 @@ THE SOFTWARE. namespace roccv { +/** + * @brief Specifies basic memory requirements for an allocation. + * + * This struct expresses the number of bytes required for a memory region that backs + * a particular tensor or buffer allocation. 
It is typically used to indicate the raw + * size required for device, host, or pinned memory allocations. + */ struct MemRequirements { size_t bytes = 0; }; +/** + * @brief Specifies resource requirements for tensor memory allocations. + * + * This struct aggregates requirements for different types of memory resources + * that may be used for tensor allocation and operation: + * - deviceMem: Memory required on the device (e.g., GPU). + * - hostMem: Memory required on the host (CPU-accessible memory). + * - pinnedMem: Memory required in pinned (page-locked) host memory, which may + * be used for efficient device-host transfers. + */ struct ResourceRequirements { MemRequirements deviceMem; MemRequirements hostMem; MemRequirements pinnedMem; }; +/** + * @brief Specifies the requirements for creating and allocating a tensor. + * + * This struct defines all the necessary properties for specifying the memory and layout + * requirements of a tensor, including its datatype, shape, memory alignment, layout, strides, + * memory resource requirements, and device placement. + */ struct TensorRequirements { eDataType dtype; eTensorLayout layout; diff --git a/include/core/tensor_storage.hpp b/include/core/tensor_storage.hpp index 33a0aaf7..4200c099 100644 --- a/include/core/tensor_storage.hpp +++ b/include/core/tensor_storage.hpp @@ -71,11 +71,17 @@ class TensorStorage { */ eDeviceType device() const; + /** + * @brief Returns the allocation strategy being used. + * + * @return The allocation strategy being used. + */ + const IAllocator& allocator() const; + private: eDeviceType m_device; eOwnership m_ownership; void* m_data; const IAllocator& m_allocator; }; - } // namespace roccv \ No newline at end of file diff --git a/include/core/utils.hpp b/include/core/utils.hpp new file mode 100644 index 00000000..4449ce5b --- /dev/null +++ b/include/core/utils.hpp @@ -0,0 +1,68 @@ +/** +Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include + +#include +#include + +namespace roccv::detail { + +inline constexpr size_t NextPowerOfTwo(size_t value) noexcept { + if (value <= 1) return 1; +#if defined(__GNUC__) || defined(__clang__) + constexpr int numBits = sizeof(size_t) * 8; + return 1UL << (numBits - __builtin_clzl(value - 1)); +#else + value--; + for (size_t i = 1; i < sizeof(size_t) * 8; i <<= 1) value |= (value >> i); + return value + 1; +#endif +} + +/** + * @brief Returns true if the given value is a power of two. + * + * @param value The value to check. + * @return True if the given value is a power of two, false otherwise. + */ +template >> +inline constexpr bool IsPowerOfTwo(T value) noexcept { + return value > 0 && (value & (value - 1)) == 0; +} + +/** + * @brief Aligns the given value to the nearest multiple of the given alignment. + * + * @param value The value to align. + * @param alignment The alignment to align to (must be > 0). 
+ * @return The aligned value (same type as value). + */ +template && std::is_integral_v>> +inline constexpr T AlignUp(T value, U alignment) noexcept { + return alignment > 0 + ? (value + static_cast(alignment) - 1) / static_cast(alignment) * static_cast(alignment) + : value; +} +} // namespace roccv::detail \ No newline at end of file diff --git a/python/include/py_tensor.hpp b/python/include/py_tensor.hpp index 523a5f27..0c940148 100644 --- a/python/include/py_tensor.hpp +++ b/python/include/py_tensor.hpp @@ -29,8 +29,19 @@ THE SOFTWARE. #include #include -namespace py = pybind11; - +/** + * @brief A Python-facing wrapper for roccv::Tensor, providing DLPack interoperability and Pybind11 integration. + * + * The PyTensor class serves as a container around roccv::Tensor, exposing its functionality to Python. It supports + * construction from shape, data type, layout, and device information, as well as wrapping existing tensors or + * external DLPack-managed tensors. The class ensures correct lifetime handling for external resources (such as those + * transferred via DLPack) and enables seamless data movement between devices (CPU/GPU). PyTensor also supplies + * utilities to export to and import from DLPack capsules, making it suitable for zero-copy interoperability with + * frameworks like PyTorch, NumPy, or TensorFlow. + * + * Usage scenarios include direct creation from Python, import/export to DLPack, device copying, and serving as a + * type-safe bridge between Python and C++ tensor operations. + */ class PyTensor : public std::enable_shared_from_this { public: /** @@ -79,22 +90,51 @@ class PyTensor : public std::enable_shared_from_this { std::shared_ptr copyTo(eDeviceType device); /** - * @brief Creates a new tensor given a capsule containing a DLManagedTensor. + * @brief Creates a new PyTensor by consuming an external DLPack capsule. * - * @param src A capsule containing a DLManagedTensor. - * @param layout The layout for the new tensor. 
- * @return std::shared_ptr + * This static method constructs a PyTensor that wraps a new roccv::Tensor allocated + * from the contents of a DLPack capsule (i.e., an object supporting the __dlpack__ protocol), + * typically exported from other frameworks such as PyTorch, NumPy, or TVM. + * The shape and datatype are taken from the capsule's DLTensor metadata, while the layout + * must be specified explicitly, since DLPack does not encode layout. + * + * Ownership of the underlying DLPack-managed memory is transferred to the returned PyTensor, + * such that when the PyTensor is destroyed (and no Python references remain), the deleter + * from the DLPack capsule is called, ensuring correct cross-framework resource management. + * + * @param src Python object supporting the __dlpack__() method. This can be a capsule returned by + * __dlpack__() or any object exposing the DLPack consumer protocol. + * @param layout The tensor layout to use for the new roccv::Tensor (e.g., NHWC, NCHW, etc.). + * @return A std::shared_ptr wrapping a new tensor that shares memory with the DLPack object. + * + * @throws std::runtime_error if the object does not support the DLPack protocol, + * if the capsule is invalid or missing, or if conversion fails in any other way. + * + * @note The resulting tensor will have the same shape, datatype, device, and (if present) strides + * as encoded in the DLPack capsule, but the layout must be provided by the caller. */ static std::shared_ptr fromDLPack(pybind11::object src, eTensorLayout layout); /** - * @brief Creates a DLManagedTensor contained in a capsule from the tensor. + * @brief Exports this tensor as a DLPack-compatible capsule. + * + * This method creates a DLPack DLManagedTensor wrapper for the current PyTensor, + * encapsulates it in a Python capsule with the "dltensor" name, and returns it. + * The resulting capsule can be consumed by any framework supporting the DLPack protocol + * for zero-copy tensor data sharing. 
* - * @param stream Optional stream pointer value (used for pytorch). + * The optional @p stream argument is present to satisfy the Pytorch DLPack consumer interface, + * but is ignored by this implementation currently. * - * @return py::capsule + * @param stream Optional stream pointer (typically unused, present for PyTorch DLPack compliance). + * + * @return py::capsule A Python capsule containing the DLPack DLManagedTensor. + * + * @note The caller is responsible for transferring or managing ownership of the data + * (according to DLPack conventions), including calling the capsule consumer and ensuring + * that the deleter in DLManagedTensor is invoked only once. */ - py::capsule toDLPack(py::object stream); + pybind11::capsule toDLPack(pybind11::object stream); /** * @brief Gets the strides of the tensor as a python list. @@ -151,15 +191,28 @@ class PyTensor : public std::enable_shared_from_this { * @return A python tuple with the first index corresponding to the device type, and the second index corresponding * to the device id. */ - py::tuple getDLDevice(); + pybind11::tuple getDLDevice(); /** * @brief Exports this class in the provided module. * * @param m The python module to export this class to. */ - static void Export(py::module& m); + static void Export(pybind11::module& m); + /** + * @brief Returns a new PyTensor with a reshaped tensor according to the specified shape and layout. + * + * Creates and returns a new PyTensor whose underlying tensor is a view or copy of this tensor, + * with the shape specified by newShape and the layout specified by layout. The number of elements + * in newShape must match the number of elements in the original tensor. + * + * @param newShape The new shape for the tensor. + * @param layout The new layout to use for the reshaped tensor. + * @return std::shared_ptr A new PyTensor with the reshaped tensor. + * + * @throws std::runtime_error if the total number of elements does not match. 
+ */ std::shared_ptr reshape(std::vector newShape, eTensorLayout layout); private: diff --git a/python/src/py_tensor.cpp b/python/src/py_tensor.cpp index 913a25a4..97d71c39 100644 --- a/python/src/py_tensor.cpp +++ b/python/src/py_tensor.cpp @@ -97,10 +97,12 @@ std::shared_ptr PyTensor::copyTo(eDeviceType device) { } std::shared_ptr PyTensor::fromDLPack(pybind11::object src, eTensorLayout layout) { + // Check if the object supports the DLPack protocol if (!py::hasattr(src, "__dlpack__")) { throw std::runtime_error("Provided object does not support the DLPack protocol."); } + // Obtain a DLPack capsule py::capsule dlpackCapsule = src.attr("__dlpack__")(); if (!PyCapsule_IsValid(dlpackCapsule.ptr(), "dltensor")) { throw std::runtime_error("Invalid DLPack capsule."); @@ -108,23 +110,42 @@ std::shared_ptr PyTensor::fromDLPack(pybind11::object src, eTensorLayo DLManagedTensor* dlManagedTensor = static_cast(dlpackCapsule.get_pointer()); DLTensor dlTensor = dlManagedTensor->dl_tensor; - // Mark this capsule as consumed, so that the deleter will not free underlying data. 
+ // Mark this capsule as consumed, so that the deleter will not free underlying data dlpackCapsule.set_name("used_dltensor"); - // Copy shape data - std::vector shapeData(dlTensor.ndim); - for (int i = 0; i < dlTensor.ndim; i++) { + // Copy the shape data from DLPack to a fixed-size array + std::array shapeData; + for (int i = 0; i < dlTensor.ndim; ++i) { shapeData[i] = dlTensor.shape[i]; } - - // Create a non-owning roccv::Tensor based on the received data - roccv::TensorShape shape(roccv::TensorLayout(layout), shapeData); + roccv::TensorShape shape(shapeData, dlTensor.ndim, roccv::TensorLayout(layout)); eDeviceType device = DLDeviceToRoccvDevice(dlTensor.device); - roccv::TensorRequirements reqs = - roccv::Tensor::CalcRequirements(shape, roccv::DataType(DLTypeToRoccvType(dlTensor.dtype)), device); - auto data = std::make_shared(dlTensor.data, device, eOwnership::VIEW); - auto tensor = std::make_shared(reqs, data); + eDataType dtype = DLTypeToRoccvType(dlTensor.dtype); + + // Prepare the strides array + std::array stridesData; + + // If strides are not present, assume contiguous layout. We really shouldn't be recalculating strides here. DLPack + // now enforces that the strides are present, but we'll keep this for backwards compatibility. + if (dlTensor.strides == nullptr) { + stridesData = roccv::Tensor::CalcStrides(shape, roccv::DataType(dtype), 0); + } else { + for (int i = 0; i < dlTensor.ndim; ++i) { + // DLTensor strides are element-wise. Convert from element-wise to byte-wise. + stridesData[i] = dlTensor.strides[i] * roccv::DataType(dtype).size(); + } + } + + // Set up the requirements for the new tensor. + roccv::Tensor::Requirements reqs = + roccv::Tensor::CalcRequirements(shape, roccv::DataType(dtype), stridesData, 0, device); + + // Since this tensor is coming from a DLPack, we don't own the data, so we need to create a view of the data. 
+ std::shared_ptr data = std::make_shared( + static_cast(dlTensor.data) + dlTensor.byte_offset, device, eOwnership::VIEW); + std::shared_ptr tensor = std::make_shared(reqs, data); + // Instantiate a new tensor and a PyTensor to wrap it, binding the original DLManagedTensor return std::make_shared(tensor, dlManagedTensor); } diff --git a/samples/bilateral_filter.cpp b/samples/bilateral_filter.cpp index 4b7d3864..fdfd11d0 100644 --- a/samples/bilateral_filter.cpp +++ b/samples/bilateral_filter.cpp @@ -21,172 +21,139 @@ THE SOFTWARE. */ #include +#include + #include #include -#include #include #include +#include "common/utils.hpp" + using namespace roccv; +struct Config { + std::string inputPath; + std::string outputPath = "output"; + int deviceId = 0; + int diameter = 2; + float sigmaSpace = 2.0f; + float sigmaColor = 10.0f; + eBorderType borderMode = eBorderType::BORDER_TYPE_REPLICATE; + float4 borderColor = {0.0f, 0.0f, 0.0f, 0.0f}; + eDeviceType device = eDeviceType::GPU; +}; + /** * @brief Bilateral filter operation example. */ - void ShowHelpAndExit(const char *option = NULL) { - std::cout << "Options: " << option << std::endl - << "-i Input File Path - required" << std::endl - << "-o Output File Path - optional; default: output.bmp" << std::endl - << "-cpu Select CPU instead of GPU to perform operation - optional; default choice is GPU path" << std::endl - << "-d GPU device ID (0 for the first device, 1 for the second, etc.) 
- optional; default: 0" << std::endl - << "-diameter Diameter of the filtering area - optional; default: 2" << std::endl - << "-sigma_space Spatial parameter sigma of the Gaussian function - optional; default: 2.0f" << std::endl - << "-sigma_color Range parameter sigma of the Gaussian function - optional; default: 10.0f" << std::endl - << "-border_mode Border mode at image boundary when work pixels are outside of the image (0: constant color; 1: replicate; 2: reflect; 3: wrap) - optional; default: 1 (replicate)" << std::endl - << "-border_color Border color for constant color border mode - optional; default: (0, 0, 0, 0)" << std::endl; - exit(0); +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "Usage: " << programName << " -i [-o ] [-d ] [-D ] [-s ] [-c ] [-b ] [-B ]" << std::endl; + std::cout << " -i, --input Input image or directory containing images (required)" << std::endl; + std::cout << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; + std::cout << " -d, --device Device ID to use for execution (optional, default: 0)" << std::endl; + std::cout << " -D, --diameter Diameter of the filtering area (optional, default: 2)" << std::endl; + std::cout << " -s, --sigma_space Spatial parameter sigma of the Gaussian function (optional, default: 2.0f)" << std::endl; + std::cout << " -c, --sigma_color Range parameter sigma of the Gaussian function (optional, default: 10.0f)" << std::endl; + std::cout << " -b, --border_mode Border mode at image boundary when work pixels are outside of the image (optional, default: 1 (replicate))" << std::endl; + std::cout << " -B, --border_color Border color for constant color border mode (optional, default: 0,0,0,0)" << std::endl; + std::cout << " -C, --cpu Use CPU for execution (optional, default: GPU)" << std::endl; + std::cout << " -h, --help Show this help message" << std::endl; + // clang-format on } -int main(int argc, char** argv) { - std::string 
input_file_path; - std::string output_file_path = "output.bmp"; - bool gpuPath = true; // use GPU by default - eDeviceType device = eDeviceType::GPU; - int deviceId = 0; - int diameter = 2; - float sigmaSpace = 2.0f; - float sigmaColor = 10.0f; - eBorderType borderMode = BORDER_TYPE_REPLICATE; - float4 borderColor = {0.0f, 0.0f, 0.0f, 0.0f}; +bool ParseBorderColor(const std::string& borderColorStr, float4& borderColor) { + return sscanf(borderColorStr.c_str(), "%f,%f,%f,%f", &borderColor.x, &borderColor.y, &borderColor.z, + &borderColor.w) == 4; +} - if(argc < 3) { - ShowHelpAndExit("-h"); - } - for (int i = 1; i < argc; i++) { - if (!strcmp(argv[i], "-h")) { - ShowHelpAndExit("-h"); - } - if (!strcmp(argv[i], "-i")) { - if (++i == argc) { - ShowHelpAndExit("-i"); - } - input_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-o")) { - if (++i == argc) { - ShowHelpAndExit("-o"); - } - output_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-diameter")) { - if (++i == argc) { - ShowHelpAndExit("-diameter"); - } - diameter = std::atoi(argv[i]); - continue; - } - if (!strcmp(argv[i], "-sigma_space")) { - if (++i == argc) { - ShowHelpAndExit("-sigma_space"); - } - sigmaSpace = std::atof(argv[i]); - continue; - } - if (!strcmp(argv[i], "-sigma_color")) { - if (++i == argc) { - ShowHelpAndExit("-sigma_color"); - } - sigmaColor = std::atof(argv[i]); - continue; - } - if (!strcmp(argv[i], "-border_mode")) { - if (++i == argc) { - ShowHelpAndExit("-border_mode"); - } - borderMode = static_cast(std::atoi(argv[i])); - continue; - } - if (!strcmp(argv[i], "-border_color")) { - i++; - if (i + 4 > argc) { - ShowHelpAndExit("-border_color"); - } - borderColor.x = static_cast(atoi(argv[i++])); - borderColor.y = static_cast(atoi(argv[i++])); - borderColor.z = static_cast(atoi(argv[i++])); - borderColor.w = static_cast(atoi(argv[i])); - continue; - } - if (!strcmp(argv[i], "-cpu")) { - gpuPath = false; - continue; +int main(int argc, char** argv) { + Config 
config; + + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"device", required_argument, nullptr, 'd'}, + {"diameter", required_argument, nullptr, 'D'}, + {"sigma_space", required_argument, nullptr, 's'}, + {"sigma_color", required_argument, nullptr, 'c'}, + {"border_mode", required_argument, nullptr, 'b'}, + {"border_color", required_argument, nullptr, 'B'}, + {"cpu", no_argument, nullptr, 'C'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + + int opt; + while ((opt = getopt_long(argc, argv, "i:o:d:D:s:c:b:B:h:C", longOptions, nullptr)) != -1) { + switch (opt) { + case 'i': + config.inputPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'D': + config.diameter = std::stoi(optarg); + break; + case 's': + config.sigmaSpace = std::stof(optarg); + break; + case 'c': + config.sigmaColor = std::stof(optarg); + break; + case 'b': + config.borderMode = static_cast(std::stoi(optarg)); + break; + case 'B': + if (!ParseBorderColor(optarg, config.borderColor)) { + std::cerr << "Invalid border color format. 
Use: r,g,b,a (e.g., 0,0,0,0)\n"; + return EXIT_FAILURE; + } + break; + case 'C': + config.device = eDeviceType::CPU; + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; } } - if (gpuPath) { - device = eDeviceType::GPU; - HIP_VALIDATE_NO_ERRORS(hipSetDevice(deviceId)); - } else { - device = eDeviceType::CPU; - } - hipStream_t stream = nullptr; - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); + return EXIT_FAILURE; } - cv::Mat imageData = cv::imread(input_file_path); - if (imageData.empty()) { - std::cerr << "Failed to read the input image file" << std::endl; - exit(1); + if (config.device == eDeviceType::GPU) { + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); } - // Create input/output tensors for the image. - TensorShape imageShape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), {1, imageData.rows, imageData.cols, imageData.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); - Tensor input(imageShape, dtype, device); - Tensor output(imageShape, dtype, device); - - // Move image data to input tensor - size_t imageSizeInByte = input.shape().size() * input.dtype().size(); - auto inputData = input.exportData(); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(inputData.basePtr(), imageData.data, imageSizeInByte, hipMemcpyHostToDevice, stream)); - } else { - memcpy(inputData.basePtr(), imageData.data, imageSizeInByte); - } + // Create stream + hipStream_t stream; + CHECK_HIP_ERROR(hipStreamCreate(&stream)); + + // Load input image + Tensor input = LoadImages(stream, config.inputPath.c_str(), config.device); + // Create output tensor + Tensor output(input.shape(), input.dtype(), config.device); + + // Create BilateralFilter operator BilateralFilter op; - op(stream, input, output, diameter, sigmaColor, sigmaSpace, borderMode, borderColor, device); - - // Move 
image data back to host - size_t outputSize = output.shape().size() * output.dtype().size(); - auto outData = output.exportData(); - std::vector h_output(outputSize); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(h_output.data(), outData.basePtr(), outputSize, hipMemcpyDeviceToHost, stream)); - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); - } else { - memcpy(h_output.data(), outData.basePtr(), outputSize); - } + op(stream, input, output, config.diameter, config.sigmaColor, config.sigmaSpace, config.borderMode, + config.borderColor, config.device); - // Write output image to disk - cv::Mat outImageData(imageData.rows, imageData.cols, imageData.type(), h_output.data()); - bool ret = cv::imwrite(output_file_path, outImageData); - if (!ret) { - std::cerr << "Faild to save output image to the file" << std::endl; - exit(1); - } + WriteImages(stream, output, config.outputPath); - std::cout << "Input image file: " << input_file_path << std::endl; - std::cout << "Output image file: " << output_file_path << std::endl; - if (gpuPath) { - std::cout << "Operation on GPU device " << deviceId << std::endl; - } else { - std::cout << "Operation on CPU" << std::endl; - } - std::cout << "Image size: width = " << imageData.cols << ", height = " << imageData.rows << std::endl; + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file diff --git a/samples/bnd_box.cpp b/samples/bnd_box.cpp index 72ccbf06..621d3ef7 100644 --- a/samples/bnd_box.cpp +++ b/samples/bnd_box.cpp @@ -21,6 +21,7 @@ THE SOFTWARE. */ #include +#include #include #include @@ -28,239 +29,249 @@ THE SOFTWARE. #include #include -using namespace roccv; +#include "common/utils.hpp" -/** - * @brief Bounding Box operation example. 
- */ +using namespace roccv; /** - * @brief Example bounding box list file content - * 1 <-- number of images - * 2 <-- number of boxes for image 1 - * 50 <-- X coordinate of top-left corner of box 1 - * 50 <-- Y coordinate of top-left corner of box 1 - * 100 <-- width of box 1 - * 50 <-- height of box 1 - * 5 <-- thickness of box boundary of box 1 - * 0 <-- B component of box border color of box 1 - * 0 <-- G component of box border color of box 1 - * 255 <-- R component of box border color of box 1 - * 200 <-- alpha component of box border color of box 1 - * 0 <-- B component of box fill color of box 1 - * 255 <-- G component of box fill color of box 1 - * 0 <-- R component of box fill color of box 1 - * 100 <-- alpha component of box fill color of box 1 - * 250 <-- X coordinate of top-left corner of box 2 - * 250 <-- Y coordinate of top-left corner of box 2 - * 50 <-- width of box 2 - * 100 <-- height of box 2 - * 10 <-- thickness of box boundary of box 2 - * 255 <-- B component of box border color of box 2 - * 0 <-- G component of box border color of box 2 - * 0 <-- R component of box border color of box 2 - * 200 <-- alpha component of box border color of box 2 - * 0 <-- B component of box fill color of box 2 - * 0 <-- G component of box fill color of box 2 - * 0 <-- R component of box fill color of box 2 - * 0 <-- alpha component of box fill color of box 2 + * @file bnd_box.cpp + * @brief Example application for drawing bounding boxes on images using ROC CV. + * + * This file demonstrates how to parse a bounding box description file and apply bounding boxes to input images. + * The bounding box file is expected to have the following format: + * - The first line contains the number of images. + * - Each image section starts with a line containing the number of boxes for that image. 
+ * - Each box is described over 13 subsequent lines as follows: + * - X coordinate of top-left corner + * - Y coordinate of top-left corner + * - Width + * - Height + * - Thickness of box boundary + * - B component of box border color + * - G component of box border color + * - R component of box border color + * - Alpha component of box border color + * - B component of box fill color + * - G component of box fill color + * - R component of box fill color + * - Alpha component of box fill color + * + * Example of bounding box list file content: + * @code + * 1 + * 2 + * 50 + * 50 + * 100 + * 50 + * 5 + * 0 + * 0 + * 255 + * 200 + * 0 + * 255 + * 0 + * 100 + * 250 + * 250 + * 50 + * 100 + * 10 + * 255 + * 0 + * 0 + * 200 + * 0 + * 0 + * 0 + * 0 + * @endcode */ -void ShowHelpAndExit(const char* option = NULL) { - std::cout << "Options: " << option << std::endl - << "-i Input File Path - required" << std::endl - << "-o Output File Path - optional; default: output.bmp" << std::endl - << "-cpu Select CPU instead of GPU to perform operation - optional; default choice is GPU path" - << std::endl - << "-d GPU device ID (0 for the first device, 1 for the second, etc.) 
- optional; default: 0" << std::endl - << "-box_file Bounding box list file - optional; default: use the set value in the app" << std::endl; - exit(0); -} - -int main(int argc, char** argv) { - std::string input_file_path; - std::string box_file_path; - std::string output_file_path = "output.bmp"; - bool gpuPath = true; // use GPU by default +struct Config { + std::string inputPath; + std::string outputPath = "output"; eDeviceType device = eDeviceType::GPU; int deviceId = 0; - bool boxSet = false; // User sets the bounding box list data in a text file - - if (argc < 3) { - ShowHelpAndExit("-h"); - } - for (int i = 1; i < argc; i++) { - if (!strcmp(argv[i], "-h")) { - ShowHelpAndExit("-h"); - } - if (!strcmp(argv[i], "-i")) { - if (++i == argc) { - ShowHelpAndExit("-i"); - } - input_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-o")) { - if (++i == argc) { - ShowHelpAndExit("-o"); - } - output_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-box_file")) { - if (++i == argc) { - ShowHelpAndExit("-o"); - } - box_file_path = argv[i]; - boxSet = true; - continue; - } - if (!strcmp(argv[i], "-cpu")) { - gpuPath = false; - continue; - } - } + std::string boundingBoxFilePath = ""; +}; - if (gpuPath) { - device = eDeviceType::GPU; - HIP_VALIDATE_NO_ERRORS(hipSetDevice(deviceId)); - } else { - device = eDeviceType::CPU; - } +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "Usage: " << programName << " -i [-o ] [-b ] [-d ] [-C]" << std::endl; + std::cout << " -i, --input Input image or directory containing images (required)" << std::endl; + std::cout << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; + std::cout << " -b, --box_file Bounding box list file (optional, default: use the set value in the app)" << std::endl; + std::cout << " -d, --device Device ID to use for execution (optional, default: 0)" << std::endl; + std::cout << " -C, --cpu Use CPU for 
execution (optional, default: GPU)" << std::endl; + std::cout << " -h, --help Show this help message" << std::endl; + // clang-format on +} - cv::Mat imageData = cv::imread(input_file_path); - if (imageData.empty()) { - std::cerr << "Failed to read the input image file" << std::endl; - exit(1); +/** + * @brief Parse bounding box file and setup bounding box vector. + * + * @param[in] boundingBoxFilePath Path to bounding box list file. + * @param[out] bbox_vector Vector of bounding box vectors to be filled. + * @return void + * @throws std::runtime_error if failed to open bounding box file. + */ +void ParseBoundingBoxFile(const std::string& boundingBoxFilePath, std::vector>& bbox_vector) { + std::ifstream file(boundingBoxFilePath); + if (!file.is_open()) { + throw std::runtime_error("Failed to open bounding box file " + boundingBoxFilePath); } - int batchSize = 1; - std::vector> bbox_vector; - if (boxSet) { - std::ifstream box_list_file(box_file_path); - if (box_list_file.is_open()) { - std::string line; - std::getline(box_list_file, line); - batchSize = std::stoi(line.c_str()); - if (batchSize > 0) { - bbox_vector.resize(batchSize); - for (int i = 0; i < batchSize; i++) { - std::getline(box_list_file, line); - int numBoxes = std::stoi(line.c_str()); - if (numBoxes > 0) { - for (int b = 0; b < numBoxes; b++) { - BndBox_t box; - std::getline(box_list_file, line); - box.box.x = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.box.y = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.box.width = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.box.height = std::atoi(line.c_str()); + int numImages; + file >> numImages; + bbox_vector.resize(numImages); + for (int i = 0; i < numImages; i++) { + int numBoxes; + file >> numBoxes; - std::getline(box_list_file, line); - box.thickness = std::atoi(line.c_str()); + bbox_vector[i].resize(numBoxes); + for (int j = 0; j < numBoxes; j++) { + // Parse each box from 13 lines, in 
order: + // 1. x (top-left corner) + // 2. y (top-left corner) + // 3. width + // 4. height + // 5. thickness (box boundary) + // 6. border B + // 7. border G + // 8. border R + // 9. border A (alpha) + // 10. fill B + // 11. fill G + // 12. fill R + // 13. fill A (alpha) + // (13 values total per box) - std::getline(box_list_file, line); - box.borderColor.r = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.borderColor.g = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.borderColor.b = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.borderColor.a = std::atoi(line.c_str()); + // Read box dimensions + file >> bbox_vector[i][j].box.x >> bbox_vector[i][j].box.y >> bbox_vector[i][j].box.width >> + bbox_vector[i][j].box.height; + file >> bbox_vector[i][j].thickness; - std::getline(box_list_file, line); - box.fillColor.r = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.fillColor.g = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.fillColor.b = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.fillColor.a = std::atoi(line.c_str()); + // Read colors into temp ints + int r, g, b, a; + file >> b >> g >> r >> a; + bbox_vector[i][j].borderColor = {static_cast(r), static_cast(g), static_cast(b), + static_cast(a)}; - bbox_vector[i].push_back(box); - } - } else { - std::cerr << "Invalid number of boxes: " << numBoxes << "for image: " << i << std::endl; - exit(1); - } - } - } else { - std::cerr << "Invalid batch size: " << batchSize << std::endl; - exit(1); - } - } else { - std::cerr << "Failed to open bounding box list file " << box_file_path << std::endl; - exit(1); + file >> b >> g >> r >> a; + bbox_vector[i][j].fillColor = {static_cast(r), static_cast(g), static_cast(b), + static_cast(a)}; } - } else { - auto width = imageData.cols; - auto height = imageData.rows; - bbox_vector = { - { + } +} + +/** + * @brief Setup bounding box vector from file or default 
values. + * + * @param[in] batchSize Number of images in the batch. + * @param[in] width Width of each image in the batch. + * @param[in] height Height of each image in the batch. + * @param[in] boundingBoxFilePath Path to bounding box list file. + * @return Vector of bounding box vectors. + */ +std::vector> SetupBoundingBoxVector(int64_t batchSize, int64_t width, int64_t height, + const std::string& boundingBoxFilePath) { + std::vector> bbox_vector; + + if (boundingBoxFilePath.empty()) { + for (int64_t b = 0; b < batchSize; b++) { + bbox_vector.push_back({ {{width / 4, height / 4, width / 2, height / 2}, 5, {0, 0, 255, 200}, {0, 255, 0, 100}}, {{width / 3, height / 3, width / 3 * 2, height / 4}, -1, {90, 16, 181, 50}, {0, 0, 0, 0}}, {{-50, (height * 2) / 3, width + 50, height / 3 + 50}, 0, {0, 0, 0, 0}, {111, 159, 232, 150}}, - }, - }; + }); + } + } else { + ParseBoundingBoxFile(boundingBoxFilePath, bbox_vector); } - BndBoxes bboxes(bbox_vector); - // Create input/output tensors for the image. - TensorShape imageShape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, imageData.rows, imageData.cols, imageData.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); - Tensor input(imageShape, dtype, device); - Tensor output(imageShape, dtype, device); + return bbox_vector; +} - hipStream_t stream = nullptr; - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); +/** + * @brief Main function to run the bounding box operation example. + * + * @param[in] argc Number of command line arguments. + * @param[in] argv Command line arguments. + * @return int Exit status. 
+ */ +int main(int argc, char** argv) { + Config config; + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"box_file", required_argument, nullptr, 'b'}, + {"device", required_argument, nullptr, 'd'}, + {"cpu", no_argument, nullptr, 'C'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + int opt; + while ((opt = getopt_long(argc, argv, "i:o:b:d:h:C", longOptions, nullptr)) != -1) { + switch (opt) { + case 'i': + config.inputPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 'b': + config.boundingBoxFilePath = optarg; + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'C': + config.device = eDeviceType::CPU; + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; + } } - // Move image data to input tensor - size_t imageSizeInByte = input.shape().size() * input.dtype().size(); - auto inputData = input.exportData(); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS( - hipMemcpyAsync(inputData.basePtr(), imageData.data, imageSizeInByte, hipMemcpyHostToDevice, stream)); - } else { - memcpy(inputData.basePtr(), imageData.data, imageSizeInByte); + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); + return EXIT_FAILURE; } - BndBox op; - op(stream, input, output, bboxes, device); - - // Move image data back to host - size_t outputSize = output.shape().size() * output.dtype().size(); - auto outData = output.exportData(); - std::vector h_output(outputSize); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS( - hipMemcpyAsync(h_output.data(), outData.basePtr(), outputSize, hipMemcpyDeviceToHost, stream)); - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); - } else { - memcpy(h_output.data(), outData.basePtr(), outputSize); + if (config.device == eDeviceType::GPU) { + 
CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); } - // Write output image to disk - cv::Mat outImageData(imageData.rows, imageData.cols, imageData.type(), h_output.data()); - bool ret = cv::imwrite(output_file_path, outImageData); - if (!ret) { - std::cerr << "Faild to save output image to the file" << std::endl; - exit(1); - } + hipStream_t stream; + CHECK_HIP_ERROR(hipStreamCreate(&stream)); - std::cout << "Input image file: " << input_file_path << std::endl; - std::cout << "Output image file: " << output_file_path << std::endl; - if (gpuPath) { - std::cout << "Operation on GPU device " << deviceId << std::endl; - } else { - std::cout << "Operation on CPU" << std::endl; - } - std::cout << "Image size: width = " << imageData.cols << ", height = " << imageData.rows << std::endl; + // Load images + Tensor input = LoadImages(stream, config.inputPath.c_str(), config.device); + int64_t batchSize = input.shape()[input.shape().layout().batch_index()]; + int64_t height = input.shape()[input.shape().layout().height_index()]; + int64_t width = input.shape()[input.shape().layout().width_index()]; + + // Setup bounding box vector + std::vector> bbox_vector = + SetupBoundingBoxVector(batchSize, width, height, config.boundingBoxFilePath); + + BndBoxes bboxes(bbox_vector); + + // Create output tensor + Tensor output(input.shape(), input.dtype(), config.device); + + // Create BndBox operator + BndBox op; + op(stream, input, output, bboxes, config.device); + + // Write output images to disk + WriteImages(stream, output, config.outputPath); + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file diff --git a/samples/center_crop.cpp b/samples/center_crop.cpp index 123e8f26..7a460773 100644 --- a/samples/center_crop.cpp +++ b/samples/center_crop.cpp @@ -21,143 +21,120 @@ THE SOFTWARE. 
*/ #include +#include + #include #include #include #include +#include "common/utils.hpp" + using namespace roccv; +struct Config { + std::string inputPath; + std::string outputPath = "output"; + eDeviceType device = eDeviceType::GPU; + int deviceId = 0; + Size2D cropArea = {1, 1}; +}; + +bool ParseCropArea(const std::string& cropStr, Size2D& cropArea) { + std::istringstream iss(cropStr); + std::string token; + getline(iss, token, ','); + cropArea.w = std::stoi(token); + getline(iss, token, ','); + cropArea.h = std::stoi(token); + return true; +} + /** * @brief Center crop operation example. */ -void ShowHelpAndExit(const char *option = NULL) { - std::cout << "Options: " << option << std::endl - << "-i Input File Path - required" << std::endl - << "-o Output File Path - optional; default: output.bmp" << std::endl - << "-cpu Select CPU instead of GPU to perform operation - optional; default choice is GPU path" << std::endl - << "-d GPU device ID (0 for the first device, 1 for the second, etc.) 
- optional; default: 0" << std::endl - << "-crop Center crop area (width, height)- optional; default: use the set value in the app" << std::endl; - exit(0); +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "Usage: " << programName << " -i [-o ] [-d ] [-crop ] [-C]" << std::endl; + std::cout << " -i, --input Input image or directory containing images (required)" << std::endl; + std::cout << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; + std::cout << " -d, --device Device ID to use for execution (optional, default: 0)" << std::endl; + std::cout << " -c, --crop Crop area as comma separated values (optional, default: 1,1)" << std::endl; + std::cout << " -C, --cpu Use CPU for execution (optional, default: GPU)" << std::endl; + std::cout << " -h, --help Show this help message" << std::endl; + // clang-format on } - int main(int argc, char** argv) { - std::string input_file_path; - std::string output_file_path = "output.bmp"; - bool gpuPath = true; // use GPU by default - eDeviceType device = eDeviceType::GPU; - int deviceId = 0; - Size2D cropArea = {1, 1}; - bool cropSet = false; - - if(argc < 3) { - ShowHelpAndExit("-h"); - } - for (int i = 1; i < argc; i++) { - if (!strcmp(argv[i], "-h")) { - ShowHelpAndExit("-h"); - } - if (!strcmp(argv[i], "-i")) { - if (++i == argc) { - ShowHelpAndExit("-i"); - } - input_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-o")) { - if (++i == argc) { - ShowHelpAndExit("-o"); - } - output_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-crop")) { - i++; - if (i + 2 > argc) { - ShowHelpAndExit("-crop"); - } - cropArea.w = atoi(argv[i++]); - cropArea.h = atoi(argv[i]); - cropSet = true; - continue; - } - if (!strcmp(argv[i], "-cpu")) { - gpuPath = false; - continue; + Config config; + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"device", 
required_argument, nullptr, 'd'}, + {"crop", required_argument, nullptr, 'c'}, + {"cpu", no_argument, nullptr, 'C'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + int opt; + while ((opt = getopt_long(argc, argv, "i:o:d:c:h:C", longOptions, nullptr)) != -1) { + switch (opt) { + case 'i': + config.inputPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'c': + if (!ParseCropArea(optarg, config.cropArea)) { + std::cerr << "Invalid crop area format. Use: width,height (e.g., 1,1)\n"; + return EXIT_FAILURE; + } + break; + case 'C': + config.device = eDeviceType::CPU; + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; } } - if (gpuPath) { - device = eDeviceType::GPU; - HIP_VALIDATE_NO_ERRORS(hipSetDevice(deviceId)); - } else { - device = eDeviceType::CPU; + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); + return EXIT_FAILURE; } - cv::Mat imageData = cv::imread(input_file_path); - if (imageData.empty()) { - std::cerr << "Failed to read the input image file" << std::endl; - exit(1); - } - if (!cropSet) { - // Set a safe crop area if no user input - cropArea = {(imageData.cols / 2), (imageData.rows / 2)}; + if (config.device == eDeviceType::GPU) { + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); } - // Create input/output tensors for the image. 
- TensorShape inputShape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), {1, imageData.rows, imageData.cols, imageData.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); - Tensor input(inputShape, dtype, device); + hipStream_t stream; + CHECK_HIP_ERROR(hipStreamCreate(&stream)); - TensorShape outShape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), {1, cropArea.h, cropArea.w, imageData.channels()}); - Tensor output(outShape, dtype, device); + // Load input images + Tensor input = LoadImages(stream, config.inputPath.c_str(), config.device); - hipStream_t stream = nullptr; - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); - } + // Create output shape + TensorShape outputShape(input.layout(), {input.shape(input.layout().batch_index()), config.cropArea.h, + config.cropArea.w, input.shape(input.layout().channels_index())}); - // Move image data to input tensor - size_t imageSizeInByte = input.shape().size() * input.dtype().size(); - auto input_data = input.exportData(); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(input_data.basePtr(), imageData.data, imageSizeInByte, hipMemcpyHostToDevice, stream)); - } else { - memcpy(input_data.basePtr(), imageData.data, imageSizeInByte); - } + // Create output tensor + Tensor output(outputShape, input.dtype(), config.device); + // Create CenterCrop operator CenterCrop op; - op(stream, input, output, cropArea, device); - - // Move image data back to host - size_t outputSize = output.shape().size() * output.dtype().size(); - auto outData = output.exportData(); - std::vector h_output(outputSize); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(h_output.data(), outData.basePtr(), outputSize, hipMemcpyDeviceToHost, stream)); - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); - } else { - memcpy(h_output.data(), outData.basePtr(), outputSize); - } + op(stream, input, output, config.cropArea, config.device); - // Write output image to disk - cv::Mat output_imageData(cropArea.h, 
cropArea.w, imageData.type(), h_output.data()); - bool ret = cv::imwrite(output_file_path, output_imageData); - if (!ret) { - std::cerr << "Faild to save output image to the file" << std::endl; - exit(1); - } + // Write output images to disk + WriteImages(stream, output, config.outputPath); - std::cout << "Input image file: " << input_file_path << std::endl; - std::cout << "Output image file: " << output_file_path << std::endl; - if (gpuPath) { - std::cout << "Operation on GPU device " << deviceId << std::endl; - } else { - std::cout << "Operation on CPU" << std::endl; - } - std::cout << "Input image size: width = " << imageData.cols << ", height = " << imageData.rows << std::endl; - std::cout << "Cropping area: width = " << cropArea.w << ", height = " << cropArea.h << std::endl; + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file diff --git a/samples/common/utils.hpp b/samples/common/utils.hpp index 61ae525d..817cac9d 100644 --- a/samples/common/utils.hpp +++ b/samples/common/utils.hpp @@ -23,6 +23,7 @@ #include #include +#include #include inline void CheckHIPError(hipError_t code, const char *file, const int line) { @@ -39,7 +40,7 @@ inline void CheckHIPError(hipError_t code, const char *file, const int line) { CheckHIPError((val), __FILE__, __LINE__); \ } -bool ContainsExtension(const std::filesystem::path &path, const std::vector &extension_list) { +inline bool ContainsExtension(const std::filesystem::path &path, const std::vector &extension_list) { for (auto extension : extension_list) { if (path.extension() == extension) return true; } @@ -47,74 +48,155 @@ bool ContainsExtension(const std::filesystem::path &path, const std::vector(); + params.rowPitch = tensorData.stride(tensor.layout().height_index()); + params.rowBytes = tensor.shape(tensor.layout().width_index()) * tensor.shape(tensor.layout().channels_index()) * + tensor.dtype().size(); + params.imageBytes = params.rowPitch * 
tensor.shape(tensor.layout().height_index()); + params.basePtr = tensorData.basePtr(); + + return params; +} + /** - * @brief Loads images into the GPU memory specified. + * @brief Loads an image, or multiple images if given a directory, into a tensor. Will be in NHWC layout and U8 format. + * All images must be of the same size and format. This operation will block on the provided stream. * - * @param images_dir Either a directory or a single image to load into GPU memory. - * @param num_images The number of images to load into GPU memory. - * @param gpu_input A pointer to valid GPU memory. + * @param image_path The path to the image to load. If a directory is provided, all supported images in the directory + * will be loaded. + * @param device The device to load the images onto. Defaults to GPU. + * @param openCVFlags The OpenCV flags to use when loading the images. Defaults to IMREAD_UNCHANGED. + * @return A NHWC tensor containing the loaded images. */ -void DecodeRGBIImage(const std::string &images_dir, int num_images, void *gpu_input) { +inline roccv::Tensor LoadImages(hipStream_t stream, const std::string &image_path, + eDeviceType device = eDeviceType::GPU, int openCVFlags = cv::IMREAD_UNCHANGED) { const std::vector supportedExtensions = {".bmp", ".jpg", ".jpeg", ".png"}; - std::vector imageFiles; - if (std::filesystem::is_directory(images_dir)) { - // A directory is provided. Collect all supported files in the directory (non-recursively). 
- for (auto file : std::filesystem::directory_iterator(images_dir)) { + std::vector images; + + int width = -1; + int height = -1; + int channels = -1; + + // Load images from directory or file if a single image is provided + if (std::filesystem::is_directory(image_path)) { + for (auto file : std::filesystem::directory_iterator(image_path)) { if (!std::filesystem::is_directory(file.path()) && ContainsExtension(file.path(), supportedExtensions)) { - imageFiles.push_back(file.path()); + cv::Mat image = cv::imread(file.path(), openCVFlags); + if (image.empty()) { + throw std::runtime_error("Cannot decode " + file.path().string() + ". File type not supported.\n"); + } + images.push_back(image); + + // Check if all images are of the same size + if (width == -1 && height == -1 && channels == -1) { + width = image.cols; + height = image.rows; + channels = image.channels(); + } else if (image.cols != width || image.rows != height || image.channels() != channels) { + throw std::runtime_error("All images must be of the same size and format"); + } } } - - // Throw an error if there were no valid images found in the given directory - if (imageFiles.empty()) { - throw std::runtime_error("No valid images found in directory " + images_dir); + } else if (std::filesystem::is_regular_file(image_path) && ContainsExtension(image_path, supportedExtensions)) { + cv::Mat image = cv::imread(image_path, openCVFlags); + if (image.empty()) { + throw std::runtime_error("Cannot decode " + image_path + ". File type not supported.\n"); } + images.push_back(image); + width = image.cols; + height = image.rows; + channels = image.channels(); } else { - // A single image file is provided - if (!ContainsExtension(images_dir, supportedExtensions)) - throw std::runtime_error("Cannot decode " + images_dir + ". File type not supported.\n"); - imageFiles.push_back(images_dir); + throw std::runtime_error("Cannot decode " + image_path + ". 
File type not supported.\n"); } - // Load images into provided GPU memory - size_t mem_offset = 0; - for (int b = 0; b < num_images; b++) { - cv::Mat inputMat = cv::imread(imageFiles[b]); - if (inputMat.empty()) { - throw std::runtime_error("Unable to load image " + imageFiles[b]); - } + if (images.empty()) { + throw std::runtime_error("No valid images found in directory " + image_path); + } - size_t imageSize = inputMat.rows * inputMat.cols * inputMat.channels() * sizeof(uint8_t); - CHECK_HIP_ERROR( - hipMemcpy(static_cast(gpu_input) + mem_offset, inputMat.data, imageSize, hipMemcpyHostToDevice)); - mem_offset += imageSize; + // Create tensor and prepare arguments for hipMemcpy2D + roccv::Tensor tensor(images.size(), roccv::Size2D(width, height), + roccv::ImageFormat(eDataType::DATA_TYPE_U8, channels), device); + + MemcpyParams params = GetMemcpyParams(tensor); + + // Copy images into tensor + hipMemcpyKind kind = (device == eDeviceType::GPU) ? hipMemcpyHostToDevice : hipMemcpyHostToHost; + for (int i = 0; i < images.size(); i++) { + CHECK_HIP_ERROR(hipMemcpy2DAsync(static_cast(params.basePtr) + i * params.imageBytes, + params.rowPitch, images[i].data, params.rowBytes, params.rowBytes, height, + kind, stream)); } + + // Ensure all memory operations are completed before returning the tensor + CHECK_HIP_ERROR(hipStreamSynchronize(stream)); + + return tensor; } /** - * @brief Writes a batch of 3-channel RGBI images in a tensor to .bmp files. This will also block on the provided - * stream. + * @brief Writes a batch of images from a tensor to the specified output path. This is a blocking operation. * - * @param tensor A tensor containing a batch of RGBI images. - * @param stream The HIP stream to synchronize with. + * @param tensor The tensor to write the images from. + * @param output_path The path to write the images to. If a directory is provided, the images will be written to the + * directory. 
*/ -void WriteRGBITensor(const roccv::Tensor &tensor, hipStream_t stream) { - CHECK_HIP_ERROR(hipStreamSynchronize(stream)); +inline void WriteImages(hipStream_t stream, const roccv::Tensor &tensor, const std::string &output_path) { + if (tensor.layout() != eTensorLayout::TENSOR_LAYOUT_NHWC && tensor.layout() != eTensorLayout::TENSOR_LAYOUT_HWC) { + throw std::runtime_error( + "Unsupported tensor layout in WriteImages(). Only NHWC and HWC layouts are supported."); + } - auto srcData = tensor.exportData(); - int batchSize = tensor.shape(tensor.layout().batch_index()); - int height = tensor.shape(tensor.layout().height_index()); - int width = tensor.shape(tensor.layout().width_index()); + int64_t height = tensor.shape(tensor.layout().height_index()); + int64_t width = tensor.shape(tensor.layout().width_index()); + int64_t batchSize = + tensor.layout().batch_index() < 0 ? 1 : tensor.shape(tensor.layout().batch_index()); // Support for HWC layout + int64_t channels = tensor.shape(tensor.layout().channels_index()); - // Write each image in the batch to separate .bmp files - for (int b = 0; b < batchSize; b++) { - std::ostringstream outFilename; - outFilename << "./roccvtest_" << b << ".bmp"; + // Get OpenCV image format + int64_t cvFormat = CV_MAKETYPE(CV_8U, channels); - cv::Mat outputMat(height, width, CV_8UC3); - CHECK_HIP_ERROR(hipMemcpy(outputMat.data, srcData.basePtr(), - (tensor.shape().size() / batchSize) * tensor.dtype().size(), hipMemcpyDeviceToHost)); - cv::imwrite(outFilename.str().c_str(), outputMat); + // Get memcpy parameters + MemcpyParams params = GetMemcpyParams(tensor); + hipMemcpyKind kind = (tensor.device() == eDeviceType::GPU) ? 
hipMemcpyDeviceToHost : hipMemcpyHostToHost; + + // Copy images from tensor to OpenCV image vector + std::vector images(batchSize); + for (int i = 0; i < batchSize; i++) { + images[i] = cv::Mat(height, width, cvFormat); + CHECK_HIP_ERROR(hipMemcpy2DAsync(images[i].data, params.rowBytes, + static_cast(params.basePtr) + i * params.imageBytes, + params.rowPitch, params.rowBytes, height, kind, stream)); + } + + // Ensure all memory operations are completed before writing images + CHECK_HIP_ERROR(hipStreamSynchronize(stream)); + + std::filesystem::path outputPath(output_path); + if (outputPath.extension().empty()) { + for (int i = 0; i < batchSize; i++) { + std::filesystem::create_directories(outputPath); + std::filesystem::path outFilename = outputPath / std::format("image_{}.bmp", i); + cv::imwrite(outFilename.string(), images[i]); + } + } else { + cv::imwrite(output_path, images[0]); } } \ No newline at end of file diff --git a/samples/composite.cpp b/samples/composite.cpp index cbc5507e..bd21948f 100644 --- a/samples/composite.cpp +++ b/samples/composite.cpp @@ -19,89 +19,116 @@ * THE SOFTWARE. */ +#include + #include #include #include +#include "common/utils.hpp" + +/** @file composite.cpp + * @brief Sample application for the Composite operation. + * + * This sample application demonstrates the Composite operation, which blends the foreground image into the background + * image using a grayscale alpha mask. It loads the background, foreground, and mask images, runs the Composite + * operation, and writes the output images to disk. 
+ */ + using namespace roccv; +struct Config { + std::string backgroundPath; + std::string foregroundPath; + std::string maskPath; + std::string outputPath = "output"; + eDeviceType device = eDeviceType::GPU; + int deviceId = 0; +}; + +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "Usage: " << programName << " -b -f -m -o -d " << std::endl; + std::cout << " -b, --background Background image filename (required)" << std::endl; + std::cout << " -f, --foreground Foreground image filename (required)" << std::endl; + std::cout << " -m, --mask Mask image filename (required)" << std::endl; + std::cout << " -o, --output Output image filename (optional, default: output)" << std::endl; + std::cout << " -d, --device Device ID to use for execution (optional, default: 0)" << std::endl; + std::cout << " -h, --help Show this help message" << std::endl; + std::cout << std::endl; + std::cout << "NOTE: If you use directories for the background, foreground, or mask inputs, each directory must contain the same number of images. Each image in the directories must be the same size." << std::endl; + // clang-format on +} + +/** + * @brief Main function for the Composite operation. + * + * @param argc Number of command line arguments. + * @param argv Command line arguments. + * @return EXIT_SUCCESS if the operation completed successfully, EXIT_FAILURE otherwise. 
+ */ int main(int argc, char** argv) { - if (argc != 6) { - std::cerr << "Usage: " << argv[0] - << " " - << std::endl; + Config config; + static struct option longOptions[] = {{"background", required_argument, nullptr, 'b'}, + {"foreground", required_argument, nullptr, 'f'}, + {"mask", required_argument, nullptr, 'm'}, + {"output", required_argument, nullptr, 'o'}, + {"device", required_argument, nullptr, 'd'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + int opt; + while ((opt = getopt_long(argc, argv, "b:f:m:o:d:h", longOptions, nullptr)) != -1) { + switch (opt) { + case 'b': + config.backgroundPath = optarg; + break; + case 'f': + config.foregroundPath = optarg; + break; + case 'm': + config.maskPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; + } + } + if (config.backgroundPath.empty() || config.foregroundPath.empty() || config.maskPath.empty()) { + std::cerr << "Error: Background, foreground, and mask paths are required.\n\n"; + PrintUsage(argv[0]); return EXIT_FAILURE; } - hipStream_t stream; - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); - - HIP_VALIDATE_NO_ERRORS(hipSetDevice(std::stoi(argv[5]))); - - cv::Mat background_data = cv::imread(argv[1]); - cv::Mat foreground_data = cv::imread(argv[2]); - cv::Mat mask_data = cv::imread(argv[3], cv::IMREAD_GRAYSCALE); - - // Create input/output tensors for the image. 
- DataType dtype(eDataType::DATA_TYPE_U8); - TensorShape background_shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, background_data.rows, background_data.cols, background_data.channels()}); - Tensor background_tensor(background_shape, dtype); - - TensorShape foreground_shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, foreground_data.rows, foreground_data.cols, foreground_data.channels()}); - Tensor foreground_tensor(foreground_shape, dtype); - - TensorShape mask_shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, mask_data.rows, mask_data.cols, mask_data.channels()}); - Tensor mask_tensor(mask_shape, dtype); - - Tensor output_tensor(background_shape, dtype); - - auto bt_data = background_tensor.exportData(); - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(bt_data.basePtr(), background_data.data, - bt_data.shape().size() * bt_data.dtype().size(), hipMemcpyHostToDevice, - stream)); - - auto ft_data = foreground_tensor.exportData(); - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(ft_data.basePtr(), foreground_data.data, - foreground_tensor.shape().size() * foreground_tensor.dtype().size(), - hipMemcpyHostToDevice, stream)); - - auto m_data = mask_tensor.exportData(); - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(m_data.basePtr(), mask_data.data, - mask_tensor.shape().size() * mask_tensor.dtype().size(), - hipMemcpyHostToDevice, stream)); - - hipEvent_t begin, end; - HIP_VALIDATE_NO_ERRORS(hipEventCreate(&begin)); - HIP_VALIDATE_NO_ERRORS(hipEventCreate(&end)); - - HIP_VALIDATE_NO_ERRORS(hipEventRecord(begin, stream)); - roccv::Composite op; - op(stream, foreground_tensor, background_tensor, mask_tensor, output_tensor); - HIP_VALIDATE_NO_ERRORS(hipEventRecord(end, stream)); - HIP_VALIDATE_NO_ERRORS(hipEventSynchronize(end)); + if (config.device == eDeviceType::GPU) { + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); + } - float duration; - HIP_VALIDATE_NO_ERRORS(hipEventElapsedTime(&duration, begin, end)); - printf("Kernel execution time: %fms\n", 
duration); + hipStream_t stream; + CHECK_HIP_ERROR(hipStreamCreate(&stream)); - HIP_VALIDATE_NO_ERRORS(hipEventDestroy(begin)); - HIP_VALIDATE_NO_ERRORS(hipEventDestroy(end)); + // Create required tensors for the Composite operation + Tensor background = LoadImages(hipStreamPerThread, config.backgroundPath, config.device); + Tensor foreground = LoadImages(hipStreamPerThread, config.foregroundPath, config.device); + Tensor mask = LoadImages(hipStreamPerThread, config.maskPath, config.device, cv::IMREAD_GRAYSCALE); + Tensor output = Tensor(background.shape(), background.dtype(), config.device); - // Move image data back to device - auto out_data = output_tensor.exportData(); - std::vector out_h(output_tensor.shape().size()); - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(out_h.data(), out_data.basePtr(), - output_tensor.shape().size() * output_tensor.dtype().size(), - hipMemcpyDeviceToHost, stream)); + // Run the Composite operation + Composite op; + op(stream, foreground, background, mask, output, config.device); - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); + // Write output images to disk + WriteImages(stream, output, config.outputPath); - // Write normalized image to disk - cv::Mat output_image_data(background_data.rows, background_data.cols, CV_8UC3, out_h.data()); - cv::imwrite(argv[4], output_image_data); + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file diff --git a/samples/copy_make_border.cpp b/samples/copy_make_border.cpp index be7be7e5..0dbb48f0 100644 --- a/samples/copy_make_border.cpp +++ b/samples/copy_make_border.cpp @@ -21,71 +21,138 @@ THE SOFTWARE. 
*/ #include +#include #include #include #include #include +#include "common/utils.hpp" + using namespace roccv; +struct Config { + std::string inputPath; + std::string outputPath = "output"; + int32_t top = 10; + int32_t left = 10; + float r = 0.0f, g = 0.0f, b = 0.0f, a = 255.0f; + eBorderType borderMode = eBorderType::BORDER_TYPE_CONSTANT; + int deviceId = 0; +}; + +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "Usage: " << programName << " -i [options]\n\n" + << "Options:\n" + << " -i, --input Input image or directory (required)\n" + << " -o, --output Output file or directory (default: output/image_.bmp)\n" + << " -t, --top Top/bottom border size (default: 10)\n" + << " -l, --left Left/right border size (default: 10)\n" + << " -c, --color Border color as r,g,b,a (default: 0,0,0,255)\n" + << " -m, --mode Border mode: 0=CONSTANT, 1=REPLICATE, 2=REFLECT, 3=REFLECT101, 4=WRAP (default: CONSTANT)\n" + << " -d, --device GPU device ID (default: 0)\n" + << " -h, --help Show this help message\n\n" + << "Example:\n" + << " " << programName << " -i image.jpg -o bordered.png -t 20 -l 15 -c 255,0,0,255\n"; + // clang-format on +} + +bool ParseColor(const std::string& colorStr, float& r, float& g, float& b, float& a) { + return sscanf(colorStr.c_str(), "%f,%f,%f,%f", &r, &g, &b, &a) == 4; +} + /** * @brief Copy make border operation example. + * + * This sample demonstrates the usage of the CopyMakeBorder operator. It accepts either a single image file or a + * directory of images as the input path, and either a single file or a directory as the output path. If a directory is + * provided for the input, all supported images within the directory will be processed as a batch. Similarly, if the + * output path is a directory, all resulting images will be written to that directory. For each image, a border is + * created based on the specified border mode and border value. 
+ * + * If is a directory, a batch operation will occur on every image in the directory. + * If is a directory, the output images will be written into that directory. */ int main(int argc, char** argv) { - if (argc != 11) { - std::cerr << "Usage: " << argv[0] - << " " - << std::endl; - return EXIT_FAILURE; + Config config; + + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"top", required_argument, nullptr, 't'}, + {"left", required_argument, nullptr, 'l'}, + {"color", required_argument, nullptr, 'c'}, + {"mode", required_argument, nullptr, 'm'}, + {"device", required_argument, nullptr, 'd'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + + int opt; + while ((opt = getopt_long(argc, argv, "i:o:t:l:c:m:d:h", longOptions, nullptr)) != -1) { + switch (opt) { + case 'i': + config.inputPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 't': + config.top = std::stoi(optarg); + break; + case 'l': + config.left = std::stoi(optarg); + break; + case 'c': + if (!ParseColor(optarg, config.r, config.g, config.b, config.a)) { + std::cerr << "Invalid color format. Use: r,g,b,a (e.g., 255,0,0,255)\n"; + return EXIT_FAILURE; + } + break; + case 'm': + config.borderMode = static_cast(std::stoi(optarg)); + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; + } } - HIP_VALIDATE_NO_ERRORS(hipSetDevice(std::stoi(argv[10]))); - - int32_t top = std::stoi(argv[3]); - int32_t left = std::stoi(argv[4]); - float r = std::stof(argv[5]); - float g = std::stof(argv[6]); - float b = std::stof(argv[7]); - float a = std::stof(argv[8]); - eBorderType border_mode = static_cast(std::stoi(argv[9])); - - cv::Mat image_data = cv::imread(argv[1]); - - // Create input/output tensors for the image. 
- TensorShape shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, image_data.rows, image_data.cols, image_data.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); - - TensorShape o_shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, image_data.rows + top * 2, image_data.cols + left * 2, image_data.channels()}); + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); + return EXIT_FAILURE; + } - Tensor d_in(shape, dtype); - Tensor d_out(o_shape, dtype); + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); + // Create stream hipStream_t stream; - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); + CHECK_HIP_ERROR(hipStreamCreate(&stream)); - // Move image data to input tensor - size_t image_size = d_in.shape().size() * d_in.dtype().size(); - auto d_input_data = d_in.exportData(); - HIP_VALIDATE_NO_ERRORS( - hipMemcpyAsync(d_input_data.basePtr(), image_data.data, image_size, hipMemcpyHostToDevice, stream)); + // Load input image + Tensor input = LoadImages(stream, config.inputPath.c_str()); - CopyMakeBorder op; - op(stream, d_in, d_out, top, left, border_mode, {b, g, r, a}); + // Create output tensor + int64_t outputHeight = input.shape(input.layout().height_index()) + config.top * 2; + int64_t outputWidth = input.shape(input.layout().width_index()) + config.left * 2; + TensorShape outputShape(input.layout(), {input.shape(input.layout().batch_index()), outputHeight, outputWidth, + input.shape(input.layout().channels_index())}); + Tensor output(outputShape, input.dtype()); - // Move image data back to device - auto d_out_data = d_out.exportData(); - size_t out_image_size = d_out.shape().size() * d_out.dtype().size(); - std::vector h_output(out_image_size); - HIP_VALIDATE_NO_ERRORS( - hipMemcpyAsync(h_output.data(), d_out_data.basePtr(), out_image_size, hipMemcpyDeviceToHost, stream)); + // Create CopyMakeBorder operator + CopyMakeBorder op; + op(stream, input, output, config.top, config.left, 
config.borderMode, {config.b, config.g, config.r, config.a}); - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); + WriteImages(stream, output, config.outputPath); - cv::Mat output_image_data(image_data.rows + top * 2, image_data.cols + left * 2, CV_8UC3, h_output.data()); - cv::imwrite(argv[2], output_image_data); + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file diff --git a/samples/cropandresize/cpp/main.cpp b/samples/cropandresize/cpp/main.cpp index 8b8ed9bd..eec6532b 100644 --- a/samples/cropandresize/cpp/main.cpp +++ b/samples/cropandresize/cpp/main.cpp @@ -23,9 +23,9 @@ #include #include +#include #include #include -#include #include #include #include @@ -35,170 +35,172 @@ #include "core/tensor_shape.hpp" /** + * @file main.cpp * @brief Crop and Resize sample app. * - * The Crop and Resize is a simple pipeline which demonstrates usage of - * rocCV Tensor along with a few operators. + * The Crop and Resize is a simple pipeline which demonstrates usage of the + * rocCV Tensor along with the Custom Crop and Resize operators. 
* * Input Batch Tensor -> Crop -> Resize -> WriteImage */ -/** - * @brief Utility to show usage of sample app - * - **/ -void showUsage() { - std::cout << "usage: ./roccv_cropandresize_app -i -b " - << std::endl; +using namespace roccv; + +struct Config { + std::string inputPath; + std::string outputPath = "output"; + eDeviceType device = eDeviceType::GPU; + Size2D resizeShape = {320, 480}; + Box_t cropRect = {50, 150, 400, 300}; + eInterpolationType interpolation = eInterpolationType::INTERP_TYPE_LINEAR; + int deviceId = 0; +}; + +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "rocCV Crop and Resize Sample Application\n"; + std::cout << "----------------------------------------\n"; + std::cout << "This sample demonstrates how to set up a simple image processing pipeline using rocCV.\n"; + std::cout << "It shows reading a batch of images, cropping them to a specified rectangle,\n"; + std::cout << "resizing the cropped images to a target shape, and writing the outputs to image files.\n"; + std::cout << "You may select the device (CPU or GPU), interpolation type, input size, and more.\n"; + std::cout << '\n'; + std::cout << "Usage: " << programName << " -i [options]\n"; + std::cout << " -i, --input Input image or directory (required)\n"; + std::cout << "Options:\n"; + std::cout << " -o, --output Output image or directory (optional, default: output)\n"; + std::cout << " -r, --resize Resize shape as width,height (optional, default: 320,480)\n"; + std::cout << " -c, --crop Crop rectangle as x,y,w,h (optional, default: 50,150,400,300)\n"; + std::cout << " -I, --interpolation Interpolation type: 0=NEAREST, 1=LINEAR, 2=CUBIC (optional, default: LINEAR)\n"; + std::cout << " -C, --cpu Use CPU for execution (optional, default: GPU)\n"; + std::cout << " -d, --device Device ID to use for execution (optional, default: 0)\n"; + std::cout << " -h, --help Show this help message\n"; + std::cout << std::endl; + // clang-format on } -/** - * @brief 
Utility to parse the command line arguments - * - **/ -int ParseArgs(int argc, char *argv[], std::string &imagePath, uint32_t &batchSize) { - static struct option long_options[] = {{"help", no_argument, 0, 'h'}, - {"imagePath", required_argument, 0, 'i'}, - {"batch", required_argument, 0, 'b'}, - {0, 0, 0, 0}}; - - int long_index = 0; - int opt = 0; - while ((opt = getopt_long(argc, argv, "hi:b:", long_options, &long_index)) != -1) { +void ParseCropRectangle(const std::string& cropStr, Box_t& cropRect) { + std::istringstream iss(cropStr); + std::string token; + getline(iss, token, ','); + cropRect.x = std::stoi(token); + getline(iss, token, ','); + cropRect.y = std::stoi(token); + getline(iss, token, ','); + cropRect.width = std::stoi(token); + getline(iss, token, ','); + cropRect.height = std::stoi(token); +} + +void ParseResizeShape(const std::string& resizeStr, Size2D& resizeShape) { + std::istringstream iss(resizeStr); + std::string token; + getline(iss, token, ','); + resizeShape.w = std::stoi(token); + getline(iss, token, ','); + resizeShape.h = std::stoi(token); +} + +int main(int argc, char** argv) { + Config config; + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"resize", required_argument, nullptr, 'r'}, + {"crop", required_argument, nullptr, 'c'}, + {"interpolation", required_argument, nullptr, 'I'}, + {"cpu", no_argument, nullptr, 'C'}, + {"device", required_argument, nullptr, 'd'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + + // Parse command line arguments + int opt; + while ((opt = getopt_long(argc, argv, "i:o:r:c:I:d:h:C", longOptions, nullptr)) != -1) { switch (opt) { - case 'h': - showUsage(); - return -1; - break; case 'i': - imagePath = optarg; + config.inputPath = optarg; break; - case 'b': - batchSize = std::stoi(optarg); + case 'o': + config.outputPath = optarg; break; - case ':': - showUsage(); - return -1; - default: + case 'r': 
+ ParseResizeShape(optarg, config.resizeShape); + break; + case 'c': + ParseCropRectangle(optarg, config.cropRect); break; + case 'I': + config.interpolation = static_cast(std::stoi(optarg)); + break; + case 'C': + config.device = eDeviceType::CPU; + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; } } - std::ifstream imageFile(imagePath); - if (!imageFile.good()) { - showUsage(); - std::cerr << "Image path '" + imagePath + "' does not exist" << std::endl; - return -1; - } - return 0; -} -int main(int argc, char *argv[]) { - // Default parameters - // TODO: Default parameter for images cannot be added for now. Must specify a sample asset directory in the final - // build which is relative to this executable. - std::string imagePath = "none.jpg"; - uint32_t batchSize = 1; - - // Parse the command line paramaters to override the default parameters - int retval = ParseArgs(argc, argv, imagePath, batchSize); - if (retval != 0) { - return retval; + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); + return EXIT_FAILURE; } - // Note : The maximum input image dimensions needs to be updated in case - // of testing with different test images - - int maxImageWidth = 720; - int maxImageHeight = 480; - int maxChannels = 3; + if (config.device == eDeviceType::GPU) { + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); + } - // tag: Create the HIP stream hipStream_t stream; CHECK_HIP_ERROR(hipStreamCreate(&stream)); - // tag: Allocate input tensor - // Allocating memory for RGBI input image batch of uint8_t data type. 
- - roccv::TensorDataStrided::Buffer inBuf; - inBuf.strides[3] = sizeof(uint8_t); - inBuf.strides[2] = maxChannels * inBuf.strides[3]; - inBuf.strides[1] = maxImageWidth * inBuf.strides[2]; - inBuf.strides[0] = maxImageHeight * inBuf.strides[1]; - CHECK_HIP_ERROR(hipMallocAsync(&inBuf.basePtr, batchSize * inBuf.strides[0], stream)); - - // tag: Tensor Requirements - // Calculate the requirements for the RGBI uint8_t Tensor which include - // pitch bytes, alignment, shape and tensor layout - roccv::Tensor::Requirements inReqs = - roccv::Tensor::CalcRequirements(batchSize, {maxImageWidth, maxImageHeight}, roccv::FMT_RGB8); - - // Create a tensor buffer to store the data pointer and pitch bytes for each plane - roccv::TensorDataStridedHip inData(roccv::TensorShape{inReqs.shape, inReqs.rank, inReqs.layout}, - roccv::DataType{inReqs.dtype}, inBuf); - - // Wrap tensor data in a rocCV tensor for use with the rocCV operators. - roccv::Tensor inTensor = roccv::TensorWrapData(inData); - - // tag: Image Loading - uint8_t *gpuInput = reinterpret_cast(inBuf.basePtr); - // The total images is set to the same value as batch size for testing - uint32_t totalImages = batchSize; - - // OpenCV is used to load the images, which gets copied into device memory. - DecodeRGBIImage(imagePath, totalImages, gpuInput); - - // tag: The input buffer is now ready to be used by the operators - - // Set parameters for Crop and Resize - // ROI dimensions to crop in the input image - int cropX = 50; - int cropY = 150; - int cropWidth = 400; - int cropHeight = 300; - - // Set the resize dimensions - int resizeWidth = 320; - int resizeHeight = 240; - - // Create the crop rect for the cropping operator - roccv::Box_t crpRect = {cropX, cropY, cropWidth, cropHeight}; - - // tag: Allocate Tensors for Crop and Resize - // Create a rocCV Tensor based on the crop window size. 
- roccv::Tensor cropTensor(batchSize, {cropWidth, cropHeight}, roccv::FMT_RGB8); - // Create a rocCV Tensor based on resize dimensions - roccv::Tensor resizedTensor(batchSize, {resizeWidth, resizeHeight}, roccv::FMT_RGB8); - -#ifdef PROFILE_SAMPLE - hipEvent_t start, stop; - hipEventCreate(&start); - hipEventCreate(&stop); - hipEventRecord(start); -#endif - // tag: Initialize operators for Crop and Resize - roccv::CustomCrop cropOp; - roccv::Resize resizeOp; - - // tag: Executes the CustomCrop operation on the given HIP stream - cropOp(stream, inTensor, cropTensor, crpRect); - - // Resize operator can now be enqueued into the same stream - resizeOp(stream, cropTensor, resizedTensor, INTERP_TYPE_LINEAR); - - // tag: Profile section -#ifdef PROFILE_SAMPLE - hipEventRecord(stop); - hipEventSynchronize(stop); - float operatorms = 0; - hipEventElapsedTime(&operatorms, start, stop); - std::cout << "Time for Crop and Resize : " << operatorms << " ms" << std::endl; -#endif - - // tag: Copy the buffer to CPU and write resized image into .bmp files - WriteRGBITensor(resizedTensor, stream); - - // tag: Clean up + // Load batch of input images + Tensor input = LoadImages(stream, config.inputPath, config.device); + + // Determine the batch size and channels from the input tensor + int64_t batchSize = input.shape(input.layout().batch_index()); + int64_t channels = input.shape(input.layout().channels_index()); + + // Create tensor for the cropped image + Tensor cropTensor = + Tensor(TensorShape(input.layout(), {batchSize, config.cropRect.height, config.cropRect.width, channels}), + input.dtype(), config.device); + + // Create tensor for the resized image + Tensor resizedTensor = + Tensor(TensorShape(input.layout(), {batchSize, config.resizeShape.h, config.resizeShape.w, channels}), + input.dtype(), config.device); + + // Create crop and resize operators + CustomCrop cropOp; + Resize resizeOp; + + std::chrono::high_resolution_clock::time_point start = 
std::chrono::high_resolution_clock::now(); + + // Run the crop operation, writing results to the crop tensor + cropOp(stream, input, cropTensor, config.cropRect, config.device); + + // Run the resize operation, writing results to the resized tensor + resizeOp(stream, cropTensor, resizedTensor, config.interpolation, config.device); + CHECK_HIP_ERROR(hipStreamSynchronize(stream)); + + std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now(); + + // Report the duration of the crop and resize operation + long executionTime = std::chrono::duration_cast(end - start).count(); + std::cout << "Processed " << batchSize << " images in " << executionTime << "ms" << std::endl; + + // Write the cropped and resized images to disk + WriteImages(stream, resizedTensor, config.outputPath); + + // Destroy the stream CHECK_HIP_ERROR(hipStreamDestroy(stream)); - // tag: End of Sample + return EXIT_SUCCESS; } \ No newline at end of file diff --git a/samples/custom_crop.cpp b/samples/custom_crop.cpp index 73bd8561..25e7d83a 100644 --- a/samples/custom_crop.cpp +++ b/samples/custom_crop.cpp @@ -21,144 +21,122 @@ THE SOFTWARE. */ #include +#include + #include #include #include #include +#include "common/utils.hpp" + using namespace roccv; -/** - * @brief Custom crop operation example. 
- */ +struct Config { + std::string inputPath; + std::string outputPath = "output"; + Box_t cropRect = {0, 0, 1, 1}; + eDeviceType device = eDeviceType::GPU; + int deviceId = 0; +}; + +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "Usage: " << programName << " -i [-o ] [-crop ] [-d ] [-c]" << std::endl; + std::cout << " -i, --input Input image or directory containing images (required)" << std::endl; + std::cout << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; + std::cout << " -c, --crop Crop rectangle as comma separated values (optional, default: 0,0,1,1)" << std::endl; + std::cout << " -d, --device Device ID to use for execution when using GPU (optional, default: 0)" << std::endl; + std::cout << " -C, --cpu Use CPU for execution (optional, default: GPU)" << std::endl; + std::cout << " -h, --help Show this help message" << std::endl; + // clang-format on +} -void ShowHelpAndExit(const char *option = NULL) { - std::cout << "Options: " << option << std::endl - << "-i Input File Path - required" << std::endl - << "-o Output File Path - optional; default: output.bmp" << std::endl - << "-cpu Select CPU instead of GPU to perform operation - optional; default choice is GPU path" << std::endl - << "-d GPU device ID (0 for the first device, 1 for the second, etc.) 
- optional; default: 0" << std::endl - << "-crop Crop rectangle (top_left_corner_x, top_left_corner_y, width, height)- optional; default: use the set value in the app" << std::endl; - exit(0); +bool ParseCropRectangle(const std::string& cropStr, Box_t& cropRect) { + std::istringstream iss(cropStr); + std::string token; + getline(iss, token, ','); + cropRect.x = std::stoi(token); + getline(iss, token, ','); + cropRect.y = std::stoi(token); + getline(iss, token, ','); + cropRect.width = std::stoi(token); + getline(iss, token, ','); + cropRect.height = std::stoi(token); + return true; } +/** + * @brief Custom crop operation example. + */ int main(int argc, char** argv) { - std::string input_file_path; - std::string output_file_path = "output.bmp"; - bool gpuPath = true; // use GPU by default - eDeviceType device = eDeviceType::GPU; - int deviceId = 0; - Box_t cropRect = {0, 0, 1, 1}; - bool cropSet = false; - - if(argc < 3) { - ShowHelpAndExit("-h"); - } - for (int i = 1; i < argc; i++) { - if (!strcmp(argv[i], "-h")) { - ShowHelpAndExit("-h"); - } - if (!strcmp(argv[i], "-i")) { - if (++i == argc) { - ShowHelpAndExit("-i"); - } - input_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-o")) { - if (++i == argc) { - ShowHelpAndExit("-o"); - } - output_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-crop")) { - i++; - if (i + 4 > argc) { - ShowHelpAndExit("-crop"); - } - cropRect.x = atoi(argv[i++]); - cropRect.y = atoi(argv[i++]); - cropRect.width = atoi(argv[i++]); - cropRect.height = atoi(argv[i]); - cropSet = true; - continue; - } - if (!strcmp(argv[i], "-cpu")) { - gpuPath = false; - continue; + Config config; + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"crop", required_argument, nullptr, 'c'}, + {"device", required_argument, nullptr, 'd'}, + {"cpu", no_argument, nullptr, 'C'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + int 
opt; + while ((opt = getopt_long(argc, argv, "i:o:c:d:hC", longOptions, nullptr)) != -1) { + switch (opt) { + case 'i': + config.inputPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 'c': + if (!ParseCropRectangle(optarg, config.cropRect)) { + std::cerr << "Invalid crop rectangle format. Use: x,y,w,h (e.g., 0,0,1,1)\n"; + return EXIT_FAILURE; + } + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'C': + config.device = eDeviceType::CPU; + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; } } - if (gpuPath) { - device = eDeviceType::GPU; - HIP_VALIDATE_NO_ERRORS(hipSetDevice(deviceId)); - } else { - device = eDeviceType::CPU; + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); + return EXIT_FAILURE; } - cv::Mat imageData = cv::imread(input_file_path); - if (imageData.empty()) { - std::cerr << "Failed to read the input image file" << std::endl; - exit(1); - } - if (!cropSet) { - // Set a safe crop area if no user input - cropRect = {(imageData.cols / 4), (imageData.rows / 4), (imageData.cols / 2), (imageData.rows / 2)}; - } + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); - // Create input/output tensors for the image. 
- TensorShape inputShape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), {1, imageData.rows, imageData.cols, imageData.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); - Tensor input(inputShape, dtype, device); + hipStream_t stream; + CHECK_HIP_ERROR(hipStreamCreate(&stream)); - TensorShape outShape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), {1, cropRect.height, cropRect.width, imageData.channels()}); - Tensor output(outShape, dtype, device); + // Load input images + Tensor input = LoadImages(stream, config.inputPath, config.device); - hipStream_t stream = nullptr; - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); - } + int64_t batchSize = input.shape(input.layout().batch_index()); + int64_t channels = input.shape(input.layout().channels_index()); - // Move image data to input tensor - size_t imageSizeInByte = input.shape().size() * input.dtype().size(); - auto input_data = input.exportData(); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(input_data.basePtr(), imageData.data, imageSizeInByte, hipMemcpyHostToDevice, stream)); - } else { - memcpy(input_data.basePtr(), imageData.data, imageSizeInByte); - } + // Create output tensor + Tensor output(TensorShape(input.layout(), {batchSize, config.cropRect.height, config.cropRect.width, channels}), + input.dtype(), config.device); + // Run custom crop operation CustomCrop op; - op(stream, input, output, cropRect, device); - - // Move image data back to host - size_t outputSize = output.shape().size() * output.dtype().size(); - auto outData = output.exportData(); - std::vector h_output(outputSize); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(h_output.data(), outData.basePtr(), outputSize, hipMemcpyDeviceToHost, stream)); - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); - } else { - memcpy(h_output.data(), outData.basePtr(), outputSize); - } + op(stream, input, output, config.cropRect, config.device); // Write output image to disk - cv::Mat 
output_imageData(cropRect.height, cropRect.width, imageData.type(), h_output.data()); - bool ret = cv::imwrite(output_file_path, output_imageData); - if (!ret) { - std::cerr << "Faild to save output image to the file" << std::endl; - } + WriteImages(stream, output, config.outputPath); - std::cout << "Input image file: " << input_file_path << std::endl; - std::cout << "Output image file: " << output_file_path << std::endl; - if (gpuPath) { - std::cout << "Operation on GPU device " << deviceId << std::endl; - } else { - std::cout << "Operation on CPU" << std::endl; - } - std::cout << "Input image size: width = " << imageData.cols << ", height = " << imageData.rows << std::endl; - std::cout << "Cropping area: top-left corner = (" << cropRect.x << ", " << cropRect.y << "), width = " << cropRect.width << ", height = " << cropRect.height << std::endl; + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file diff --git a/samples/gamma_contrast.cpp b/samples/gamma_contrast.cpp index 81fe771f..62a83f28 100644 --- a/samples/gamma_contrast.cpp +++ b/samples/gamma_contrast.cpp @@ -21,78 +21,91 @@ THE SOFTWARE. 
*/ #include +#include + #include #include #include #include +#include "common/utils.hpp" using namespace roccv; +struct Config { + float gamma = 2.2; + std::string inputPath; + std::string outputPath = "output"; + int deviceId = 0; +}; + +void PrintUsage(const char* programName) { + // clang-format off + std::cerr << "Usage: " << programName << " -i [-o ] [-g ] [-d ]" << std::endl; + std::cerr << " -i, --input Input image or directory containing images (required)" << std::endl; + std::cerr << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; + std::cerr << " -g, --gamma Gamma value to apply to the input images (optional, default: 2.2)" << std::endl; + std::cerr << " -d, --device Device ID to use for execution (optional, default: 0)" << std::endl; + // clang-format on +} + /** * @brief Gamma contrast operator sample app. */ int main(int argc, char** argv) { - if (argc != 3) { - std::cerr << "Usage: " << argv[0] - << " " - << std::endl; + Config config; + + static struct option longOptions[] = { + {"input", required_argument, nullptr, 'i'}, {"output", required_argument, nullptr, 'o'}, + {"gamma", required_argument, nullptr, 'g'}, {"device", required_argument, nullptr, 'd'}, + {"help", no_argument, nullptr, 'h'}, {nullptr, 0, nullptr, 0}}; + + int opt; + while ((opt = getopt_long(argc, argv, "i:o:g:d:h", longOptions, nullptr)) != -1) { + switch (opt) { + case 'i': + config.inputPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 'g': + config.gamma = std::stof(optarg); + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; + } + } + + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); return EXIT_FAILURE; } - // Device to use in this sample will be the GPU - eDeviceType device = eDeviceType::GPU; - - // 
Load input image using the OpenCV library. - // The Mat image_data will store all of the data of the image - // Image width can be gotten with image_data.cols - // Image height can be gotten with image_data.rows - // The amount of channels can be gotten with image_data.channels() - cv::Mat image_data = cv::imread(argv[1]); - - // Batch size is needed to create the input and output tensors - int batchSize = 1; - - // A gamma value to apply to the input image - float gammaValue = 2.2; - - // Create input/output tensors - // Tensor shape - // - Takes layout as input, in this case NHWC (N - batch size, H - image height, W - image width, C - number of channels) - // - Also takes the datatype, in this case U8 or an unsigned integer of 8 bits. - TensorShape shape( - TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {batchSize, image_data.rows, image_data.cols, image_data.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); - - Tensor input(shape, dtype, device); - Tensor output(shape, dtype, device); + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); hipStream_t stream; - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); - - // imageSize is needed to know how much data needs to be copied to the GPU - size_t imageSize = image_data.rows * image_data.cols * image_data.channels() * sizeof(uint8_t); - - auto input_data = input.exportData(); - HIP_VALIDATE_NO_ERRORS( - hipMemcpy(static_cast(input_data.basePtr()), image_data.data, imageSize, hipMemcpyHostToDevice)); - - // Apply gamma correction - GammaContrast gamma_contrast; - gamma_contrast(stream, input, output, gammaValue, device); + CHECK_HIP_ERROR(hipStreamCreate(&stream)); + + // Load input images + Tensor input = LoadImages(stream, config.inputPath, eDeviceType::GPU); - // Move output data back to host - auto output_data = output.exportData(); - std::vector h_output(imageSize); - HIP_VALIDATE_NO_ERRORS(hipMemcpy(h_output.data(), output_data.basePtr(), imageSize, hipMemcpyDeviceToHost)); - - 
HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); + // Create output tensor + Tensor output(input.shape(), input.dtype(), eDeviceType::GPU); - // Save the gamma-corrected image - cv::Mat output_image(image_data.rows, image_data.cols, CV_8UC3, h_output.data()); - cv::imwrite(argv[2], output_image); + // Apply gamma correction + GammaContrast gamma_contrast; + gamma_contrast(stream, input, output, config.gamma, eDeviceType::GPU); - std::cout << "Gamma correction applied successfully. Output saved to: " << argv[2] << std::endl; - + // Write output images to disk + WriteImages(stream, output, config.outputPath); + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } diff --git a/samples/warp_perspective.cpp b/samples/warp_perspective.cpp index dc816e11..49039d62 100644 --- a/samples/warp_perspective.cpp +++ b/samples/warp_perspective.cpp @@ -21,62 +21,101 @@ THE SOFTWARE. */ #include +#include #include #include #include #include +#include "common/utils.hpp" + using namespace roccv; +struct Config { + std::string inputPath; + std::string outputPath = "output"; + eInterpolationType interpolation = eInterpolationType::INTERP_TYPE_LINEAR; + eBorderType borderMode = eBorderType::BORDER_TYPE_CONSTANT; + int deviceId = 0; +}; + +void PrintUsage(const char* programName) { + // clang-format off + std::cerr << "Usage: " << programName << " -i [-o ] [-I ] [-b ] [-d ]" << std::endl; + std::cerr << " -i, --input Input image or directory containing images (required)" << std::endl; + std::cerr << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; + std::cerr << " -I, --interpolation Interpolation type to use for output images [0: NEAREST, 1: LINEAR, 2: CUBIC] (optional, default: 1 (LINEAR))" << std::endl; + std::cerr << " -b, --border Border type for output images [0: CONSTANT, 1: REPLICATE, 2: REFLECT, 3: REFLECT101, 4: WRAP] (optional, default: 0 (CONSTANT))" << std::endl; + std::cerr << " -d, --device Device ID 
to use for execution (optional, default: 0)" << std::endl; + // clang-format on +} + /** * @brief Warp perspective operation example. */ int main(int argc, char** argv) { - if (argc != 6) { - std::cerr << "Usage: " << argv[0] << " " - << std::endl; - return EXIT_FAILURE; + Config config; + + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"interpolation", required_argument, nullptr, 'I'}, + {"border", required_argument, nullptr, 'b'}, + {"device", required_argument, nullptr, 'd'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + + // Parse command line arguments + int opt; + while ((opt = getopt_long(argc, argv, "i:o:I:b:d:h", longOptions, nullptr)) != -1) { + switch (opt) { + case 'i': + config.inputPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 'I': + config.interpolation = static_cast(std::stoi(optarg)); + break; + case 'b': + config.borderMode = static_cast(std::stoi(optarg)); + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; + } } - HIP_VALIDATE_NO_ERRORS(hipSetDevice(std::stoi(argv[5]))); - eInterpolationType interp = static_cast(std::stoi(argv[3])); - eBorderType border_mode = static_cast(std::stoi(argv[4])); - cv::Mat image_data = cv::imread(argv[1]); - - // Create input/output tensors for the image. 
- TensorShape shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, image_data.rows, image_data.cols, image_data.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); + return EXIT_FAILURE; + } - Tensor d_in(shape, dtype); - Tensor d_out(shape, dtype); + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); hipStream_t stream; - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); + CHECK_HIP_ERROR(hipStreamCreate(&stream)); - // Move image data to input tensor - size_t image_size = d_in.shape().size() * d_in.dtype().size(); - auto d_input_data = d_in.exportData(); - HIP_VALIDATE_NO_ERRORS( - hipMemcpyAsync(d_input_data.basePtr(), image_data.data, image_size, hipMemcpyHostToDevice, stream)); + // Create input/output tensors for the image. + Tensor input = LoadImages(stream, config.inputPath, eDeviceType::GPU); + Tensor output(input.shape(), input.dtype(), eDeviceType::GPU); PerspectiveTransform transform_matrix = {1, 0, 0, 0, 1, 0, -0.001, 0, 1}; roccv::WarpPerspective op; - op(stream, d_in, d_out, transform_matrix, true, interp, border_mode, make_float4(0, 0, 0, 0)); + op(stream, input, output, transform_matrix, true, config.interpolation, config.borderMode, make_float4(0, 0, 0, 0)); // Move image data back to device - auto d_out_data = d_out.exportData(); - std::vector h_output(image_size); - HIP_VALIDATE_NO_ERRORS( - hipMemcpyAsync(h_output.data(), d_out_data.basePtr(), image_size, hipMemcpyDeviceToHost, stream)); - - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); - - // Write normalized image to disk - cv::Mat output_image_data(image_data.rows, image_data.cols, CV_8UC3, h_output.data()); - cv::imwrite(argv[2], output_image_data); + WriteImages(stream, output, config.outputPath); + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file diff --git a/src/core/mem_alignment.cpp b/src/core/mem_alignment.cpp new 
file mode 100644 index 00000000..247974b1 --- /dev/null +++ b/src/core/mem_alignment.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "core/mem_alignment.hpp" + +namespace roccv { +int32_t MemAlignment::baseAddr() const { return m_baseAddrAlignment; } + +int32_t MemAlignment::rowAddr() const { return m_rowAddrAlignment; } + +MemAlignment& MemAlignment::baseAddr(int32_t alignment) { + m_baseAddrAlignment = alignment; + return *this; +} + +MemAlignment& MemAlignment::rowAddr(int32_t alignment) { + m_rowAddrAlignment = alignment; + return *this; +} +} // namespace roccv \ No newline at end of file diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index dd6c4726..f95cf811 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -23,50 +23,222 @@ THE SOFTWARE. 
#include "core/tensor.hpp" #include +#include #include "core/data_type.hpp" #include "core/detail/context.hpp" #include "core/exception.hpp" +#include "core/hip_assert.h" #include "core/image_format.hpp" +#include "core/mem_alignment.hpp" #include "core/status_type.h" #include "core/tensor_data.hpp" #include "core/tensor_layout.hpp" #include "core/tensor_requirements.hpp" #include "core/tensor_shape.hpp" #include "core/util_enums.h" +#include "core/utils.hpp" #include "operator_types.h" namespace roccv { -// Constructor definitions -Tensor::Tensor(const TensorRequirements& reqs) : Tensor(reqs, GlobalContext().getDefaultAllocator()) {} +namespace { + +/** + * @brief Returns the index of the first packed dimension in the given tensor layout. + * + * In most cases, the first packed dimension is the last dimension. However, for layouts ending in WC, the first packed + * dimension is the second to last dimension. + * + * @param[in] layout The tensor layout to get the first packed dimension for. + * @return The index of the first packed dimension in the given tensor layout. 
+ */ +static int GetFirstPackedDimension(const TensorLayout& layout) { + const int rank = layout.rank(); + switch (layout.elayout()) { + case eTensorLayout::TENSOR_LAYOUT_NHWC: + case eTensorLayout::TENSOR_LAYOUT_LNHWC: + case eTensorLayout::TENSOR_LAYOUT_HWC: + case eTensorLayout::TENSOR_LAYOUT_NWC: + return std::max(0, rank - 2); + default: + return rank - 1; + } +} + +static bool ReshapeSimplified(int inRank, const std::array& inShape, + const std::array& inStrides, int targetRank, + const std::array& targetShape, + std::array& outStrides) { + int i = 0, j = 0; + for (; i < inRank && j < targetRank; i++) { + int64_t inE = inShape[i]; + int64_t outV = 1; + int group_start = j; + while (j < targetRank && (outV * targetShape[j]) <= inE) outV *= targetShape[j++]; + + if (outV != inE) { + return false; + } + + int64_t s = inStrides[i]; + for (int d = j - 1; d >= group_start; d--) { + outStrides[d] = s; + s *= targetShape[d]; + } + } + return true; +} + +/** + * @brief Simplifies a tensor shape and strides into their canonical form. + * + * @param[in] rank The rank of the tensor. + * @param[in] shape The shape of the tensor. + * @param[in] strides The strides of the tensor. + * @param[out] outShape The simplified shape of the tensor. + * @param[out] outStrides The simplified strides of the tensor. + * @return The rank of the simplified tensor. 
+ */ +static int Simplify(int rank, const std::array& shape, + const std::array& strides, + std::array& outShape, + std::array& outStrides) { + if (rank <= 1) { + if (rank == 1) { + outShape[0] = shape[0]; + outStrides[0] = strides[0]; + } + return rank; + } + + int outRank = 0; + int64_t vol = shape[0]; + for (int d = 1; d < rank; d++) { + if (strides[d - 1] != shape[d] * strides[d]) { + outStrides[outRank] = strides[d - 1]; + outShape[outRank] = vol; + vol = shape[d]; + outRank++; + } else { + vol *= shape[d]; + } + } + outStrides[outRank] = strides[rank - 1]; + outShape[outRank] = vol; + outRank++; + return outRank; +} + +/** + * @brief Computes copy parameters for host-tensor copy. + * @return (row_width_bytes, num_rows, tensor_pitch). If contiguous, returns (total_size, 1, total_size). + */ +static std::tuple ComputeCopyParams(int rank, + const std::array& shape, + const std::array& strides, + size_t dtypeSize, bool contiguous) { + if (contiguous) { + size_t totalSize = dtypeSize; + for (int i = 0; i < rank; ++i) { + totalSize *= static_cast(shape[i]); + } + return {totalSize, 1, totalSize}; + } + + int paddedDim = 0; + for (int i = 0; i < rank - 1; i++) { + if (strides[i] != shape[i + 1] * strides[i + 1]) { + paddedDim = i; + break; + } + } + + size_t rowWidth = dtypeSize; + for (int i = paddedDim + 1; i < rank; ++i) { + rowWidth *= static_cast(shape[i]); + } + + size_t numRows = 1; + for (int i = 0; i <= paddedDim; ++i) { + numRows *= static_cast(shape[i]); + } + + size_t tensorPitch = static_cast(strides[paddedDim]); + return {rowWidth, numRows, tensorPitch}; +} + +/** + * @brief Computes the memory alignment for a tensor based on the device, data type, and user provided buffer alignment. + * @param[in] device The device the tensor is to be allocated on. + * @param[in] dtype The datatype of the tensor. + * @param[in] bufAlign The memory alignment to use. + * @return The memory alignment for the tensor. 
+ */ +MemAlignment ComputeMemAlignment(eDeviceType device, const DataType& dtype, const MemAlignment& bufAlign) { + int dev = 0; + if (device == eDeviceType::GPU) { + HIP_VALIDATE_NO_ERRORS(hipGetDevice(&dev)); + } + + int rowAlign; + if (bufAlign.rowAddr() == 0) { + if (device == eDeviceType::GPU) { + HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&rowAlign, hipDeviceAttributeTexturePitchAlignment, dev)); + } else { + rowAlign = ROCCV_CPU_DEFAULT_ALIGNMENT; + } + rowAlign = std::lcm(rowAlign, detail::NextPowerOfTwo(dtype.size())); + } else { + if (!detail::IsPowerOfTwo(bufAlign.rowAddr())) { + throw Exception("Row address alignment must be a power of two.", eStatusType::INVALID_VALUE); + } + rowAlign = std::lcm(bufAlign.rowAddr(), detail::NextPowerOfTwo(dtype.size())); + } + + int baseAlign; + if (bufAlign.baseAddr() == 0) { + if (device == eDeviceType::GPU) { + HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&baseAlign, hipDeviceAttributeTextureAlignment, dev)); + } else { + baseAlign = ROCCV_CPU_DEFAULT_ALIGNMENT; + } + baseAlign = std::lcm(baseAlign, detail::NextPowerOfTwo(dtype.size())); + } else { + if (!detail::IsPowerOfTwo(bufAlign.baseAddr())) { + throw Exception("Base address alignment must be a power of two.", eStatusType::INVALID_VALUE); + } + baseAlign = std::lcm(bufAlign.baseAddr(), detail::NextPowerOfTwo(dtype.size())); + } -Tensor::Tensor(const TensorRequirements& reqs, const IAllocator& alloc) : m_requirements(reqs), m_allocator(alloc) { - size_t numBytes = reqs.device == eDeviceType::GPU ? 
reqs.res.deviceMem.bytes : reqs.res.hostMem.bytes; - m_data = std::make_shared(numBytes, reqs.device, alloc); + return MemAlignment().baseAddr(baseAlign).rowAddr(rowAlign); } -Tensor::Tensor(const TensorRequirements& reqs, std::shared_ptr data) - : Tensor(reqs, data, GlobalContext().getDefaultAllocator()) {} +} // namespace -Tensor::Tensor(const TensorRequirements& reqs, std::shared_ptr data, const IAllocator& alloc) - : m_requirements(reqs), m_data(data), m_allocator(alloc) {} +// Constructor definitions +Tensor::Tensor(const Tensor::Requirements& reqs, const IAllocator& alloc) : m_requirements(reqs) { + m_data = std::make_shared(this->dataSize(), reqs.device, alloc); +} + +Tensor::Tensor(const Tensor::Requirements& reqs, std::shared_ptr data) + : m_requirements(reqs), m_data(data) {} Tensor::Tensor(const TensorShape& shape, DataType dtype, eDeviceType device) - : Tensor(shape, dtype, GlobalContext().getDefaultAllocator(), device) {} + : Tensor(shape, dtype, {}, GlobalContext().getDefaultAllocator(), device) {} -Tensor::Tensor(const TensorShape& shape, DataType dtype, const IAllocator& alloc, eDeviceType device) - : Tensor(CalcRequirements(shape, dtype, device), alloc) {} +Tensor::Tensor(const TensorShape& shape, DataType dtype, const MemAlignment& bufAlign, const IAllocator& alloc, + eDeviceType device) + : Tensor(CalcRequirements(shape, dtype, bufAlign, device), alloc) {} Tensor::Tensor(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device) - : Tensor(num_images, image_size, fmt, GlobalContext().getDefaultAllocator(), device) {} + : Tensor(num_images, image_size, fmt, {}, GlobalContext().getDefaultAllocator(), device) {} -Tensor::Tensor(int num_images, Size2D image_size, ImageFormat fmt, const IAllocator& alloc, eDeviceType device) - : Tensor(CalcRequirements(num_images, image_size, fmt, device), alloc) {} +Tensor::Tensor(int num_images, Size2D image_size, ImageFormat fmt, const MemAlignment& bufAlign, + const IAllocator& alloc, eDeviceType 
device) + : Tensor(CalcRequirements(num_images, image_size, fmt, bufAlign, device), alloc) {} -Tensor::Tensor(Tensor&& other) - : m_requirements(std::move(other.m_requirements)), - m_data(std::move(other.m_data)), - m_allocator(other.m_allocator) {} +// Move constructor +Tensor::Tensor(Tensor&& other) : m_requirements(std::move(other.m_requirements)), m_data(std::move(other.m_data)) {} // Member definitions int Tensor::rank() const { return m_requirements.rank; } @@ -103,23 +275,48 @@ TensorData Tensor::exportData() const { } } -Tensor Tensor::reshape(const TensorShape& new_shape) const { - // New tensor shape must have the same number of elements - if (new_shape.size() != this->shape().size()) { - throw Exception("New tensor shape does not match the number of elements of the old shape.", +Tensor Tensor::reshape(const TensorShape& newShape) const { return reshape(dtype(), newShape); } + +Tensor Tensor::reshape(const DataType& newDtype, const TensorShape& newShape) const { + if (newShape.size() * newDtype.size() != this->shape().size() * this->dtype().size()) { + throw Exception("New tensor view must have the same underlying number of bytes.", eStatusType::INVALID_VALUE); + } + + const int oldRank = m_requirements.rank; + const int newRank = newShape.layout().rank(); + + if (m_requirements.strides[oldRank - 1] != static_cast(dtype().size())) { + throw Exception("Cannot reshape tensor: innermost dimension is not element-contiguous.", eStatusType::INVALID_VALUE); } - TensorRequirements reqs = CalcRequirements(new_shape, this->dtype(), this->device()); - return Tensor(reqs, m_data); -} + // Convert to a byte-level view by expanding the innermost dimension size + // by the element size and setting its stride to 1. 
+ std::array byteShape = m_requirements.shape; + std::array byteStrides = m_requirements.strides; + byteShape[oldRank - 1] *= dtype().size(); + byteStrides[oldRank - 1] = 1; -Tensor Tensor::reshape(const TensorShape& new_shape, const DataType& new_dtype) const { - if (new_shape.size() * new_dtype.size() != this->shape().size() * this->dtype().size()) { - throw Exception("New tensor view must have the same underlying number of bytes.", eStatusType::INVALID_VALUE); + std::array simpleShape, simpleStrides; + int simpleRank = Simplify(oldRank, byteShape, byteStrides, simpleShape, simpleStrides); + + std::array targetByteShape = newShape.shape(); + targetByteShape[newRank - 1] *= newDtype.size(); + + std::array targetByteStrides; + bool result = + ReshapeSimplified(simpleRank, simpleShape, simpleStrides, newRank, targetByteShape, targetByteStrides); + if (!result) { + throw Exception("Cannot reshape tensor into requested shape and data type.", eStatusType::INVALID_VALUE); } - TensorRequirements reqs = CalcRequirements(new_shape, new_dtype, this->device()); + // The byte-level reshape produces stride 1 for the innermost dimension; + // scale it back to the new element size. 
+ std::array newStrides = targetByteStrides; + newStrides[newRank - 1] = newDtype.size(); + + Tensor::Requirements reqs = + CalcRequirements(newShape, newDtype, newStrides, m_requirements.alignBytes, this->device()); return Tensor(reqs, m_data); } @@ -129,53 +326,110 @@ Tensor& Tensor::operator=(const Tensor& other) { return *this; } -TensorRequirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, eDeviceType device) { - std::array strides = CalcStrides(shape, dtype); - TensorRequirements reqs = CalcRequirements(shape, dtype, strides, device); +size_t Tensor::dataSize() const { return m_requirements.strides[0] * m_requirements.shape[0]; } + +bool Tensor::isContiguous() const { return dataSize() == shape().size() * dtype().size(); } + +void Tensor::copyFromHost(const void* src, hipStream_t stream) const { + auto [rowWidth, numRows, tensorPitch] = ComputeCopyParams(m_requirements.rank, m_requirements.shape, + m_requirements.strides, dtype().size(), isContiguous()); + + const size_t srcPitch = rowWidth; + hipMemcpyKind kind = (device() == eDeviceType::GPU) ? hipMemcpyHostToDevice : hipMemcpyHostToHost; + + HIP_VALIDATE_NO_ERRORS( + hipMemcpy2DAsync(m_data->data(), tensorPitch, src, srcPitch, rowWidth, numRows, kind, stream)); +} + +void Tensor::copyToHost(void* dst, hipStream_t stream) const { + auto [rowWidth, numRows, tensorPitch] = ComputeCopyParams(m_requirements.rank, m_requirements.shape, + m_requirements.strides, dtype().size(), isContiguous()); + + const size_t dstPitch = rowWidth; + hipMemcpyKind kind = (device() == eDeviceType::GPU) ? 
hipMemcpyDeviceToHost : hipMemcpyHostToHost; + + HIP_VALIDATE_NO_ERRORS( + hipMemcpy2DAsync(dst, dstPitch, m_data->data(), tensorPitch, rowWidth, numRows, kind, stream)); +} + +Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, eDeviceType device) { + return CalcRequirements(shape, dtype, (MemAlignment){}, device); +} + +Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, + const MemAlignment& bufAlign, eDeviceType device) { + MemAlignment newAlign = ComputeMemAlignment(device, dtype, bufAlign); + + std::array strides = CalcStrides(shape, dtype, newAlign.rowAddr()); + Tensor::Requirements reqs = CalcRequirements(shape, dtype, strides, newAlign.baseAddr(), device); return reqs; } -TensorRequirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, - std::array strides, eDeviceType device) { - TensorRequirements reqs; +Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, + const std::array strides, + int32_t baseAlign, eDeviceType device) { + Tensor::Requirements reqs; reqs.shape = shape.shape(); reqs.rank = shape.layout().rank(); reqs.layout = shape.layout().elayout(); reqs.strides = strides; reqs.dtype = dtype.etype(); - reqs.alignBytes = 0; // TODO: Must be specified later + reqs.alignBytes = baseAlign; reqs.device = device; - // TODO: Resource requirements should be calculated differently later, once padded/aligned strides have been - // implemented + // Determine resource usage size_t numBytes = reqs.strides[0] * reqs.shape[0]; - if (reqs.device == eDeviceType::GPU) { - reqs.res.deviceMem.bytes = numBytes; - } else if (reqs.device == eDeviceType::CPU) { - reqs.res.hostMem.bytes = numBytes; + switch (reqs.device) { + case eDeviceType::GPU: { + reqs.res.deviceMem.bytes = numBytes; + break; + } + + case eDeviceType::CPU: { + reqs.res.hostMem.bytes = numBytes; + break; + } + + default: { + throw 
Exception("Unsupported device when calling Tensor::CalcRequirements().", eStatusType::INVALID_VALUE); + } } return reqs; } -TensorRequirements Tensor::CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device) { +Tensor::Requirements Tensor::CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device) { + return CalcRequirements(num_images, image_size, fmt, (MemAlignment){}, device); +} + +Tensor::Requirements Tensor::CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt, + const MemAlignment& bufAlign, eDeviceType device) { // TODO: Need to support different types of tensor layouts. This will happen once more image formats are supported // first. TensorShape shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), {num_images, image_size.h, image_size.w, fmt.channels()}); - return CalcRequirements(shape, DataType(fmt.dtype()), device); + return CalcRequirements(shape, DataType(fmt.dtype()), bufAlign, device); } -std::array Tensor::CalcStrides(const TensorShape& shape, const DataType& dtype) { - // TODO: Support memory alignment and padding in stride calculations - +std::array Tensor::CalcStrides(const TensorShape& shape, const DataType& dtype, + int32_t rowAlign) { // Calculate strides based on the given tensor shape. Strides are byte-wise. std::array strides; + + const int firstPackedDim = GetFirstPackedDimension(shape.layout()); + strides[shape.layout().rank() - 1] = dtype.size(); for (int i = shape.layout().rank() - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * shape[i + 1]; + // The stride dimension preceeding the first packed dimension is padded to the next multiple of the row + // alignment. 
+ if (i == firstPackedDim - 1) { + strides[i] = detail::AlignUp(strides[i + 1] * shape[i + 1], rowAlign); + } else { + strides[i] = strides[i + 1] * shape[i + 1]; + } } + return strides; } @@ -190,8 +444,9 @@ Tensor TensorWrapData(const TensorData& tensor_data) { for (int i = 0; i < tensorDataStrided->rank(); i++) { strides[i] = tensorDataStrided->stride(i); } - TensorRequirements reqs = Tensor::CalcRequirements(tensorDataStrided->shape(), tensorDataStrided->dtype(), strides, - tensorDataStrided->device()); + + Tensor::Requirements reqs = Tensor::CalcRequirements(tensorDataStrided->shape(), tensorDataStrided->dtype(), + strides, 0, tensorDataStrided->device()); auto data = std::make_shared(tensorDataStrided->basePtr(), tensorDataStrided->device(), eOwnership::OWNING); diff --git a/src/core/tensor_storage.cpp b/src/core/tensor_storage.cpp index 4b567c7d..4c24121f 100644 --- a/src/core/tensor_storage.cpp +++ b/src/core/tensor_storage.cpp @@ -62,4 +62,7 @@ TensorStorage::~TensorStorage() { void* TensorStorage::data() const { return m_data; } eDeviceType TensorStorage::device() const { return m_device; } + +const IAllocator& TensorStorage::allocator() const { return m_allocator; } + } // namespace roccv diff --git a/src/op_non_max_suppression.cpp b/src/op_non_max_suppression.cpp index 93c4f90c..28d21c7b 100644 --- a/src/op_non_max_suppression.cpp +++ b/src/op_non_max_suppression.cpp @@ -97,12 +97,12 @@ void NonMaximumSuppression::operator()(hipStream_t stream, const Tensor& input, // but typically involve going from non-vectorized to vectorized shapes. These shapes should be validated // beforehand. For example: an input tensor with shape and datatype [NWC, , S16] will be // reinterpreted as [NW, ]. In any case, the underlying data is structured the same. 
- Tensor inputReshaped = - input.reshape(TensorShape(TensorLayout(TENSOR_LAYOUT_NW), {numBatches, numBoxes}), DataType(DATA_TYPE_4S16)); + Tensor inputReshaped = input.reshape(DataType(DATA_TYPE_4S16), + TensorShape(TensorLayout(TENSOR_LAYOUT_NW), {numBatches, numBoxes})); Tensor outputReshaped = - output.reshape(TensorShape(TensorLayout(TENSOR_LAYOUT_NW), {numBatches, numBoxes}), DataType(DATA_TYPE_U8)); - Tensor scoresReshaped = - scores.reshape(TensorShape(TensorLayout(TENSOR_LAYOUT_NW), {numBatches, numBoxes}), DataType(DATA_TYPE_F32)); + output.reshape(DataType(DATA_TYPE_U8), TensorShape(TensorLayout(TENSOR_LAYOUT_NW), {numBatches, numBoxes})); + Tensor scoresReshaped = scores.reshape(DataType(DATA_TYPE_F32), + TensorShape(TensorLayout(TENSOR_LAYOUT_NW), {numBatches, numBoxes})); // Launch nms kernel switch (device) { diff --git a/tests/roccv/cpp/include/test_helpers.hpp b/tests/roccv/cpp/include/test_helpers.hpp index 6c43053b..aa2d819a 100644 --- a/tests/roccv/cpp/include/test_helpers.hpp +++ b/tests/roccv/cpp/include/test_helpers.hpp @@ -269,24 +269,6 @@ eTestStatusType compareImage(const Tensor& tensor, const std::string& filename, void writeTensor(const Tensor& tensor, const std::string& output_file); -template -void copyData(const Tensor& input, const std::span& data, eDeviceType device) { - auto tensor_data = input.exportData(); - - switch (device) { - case eDeviceType::GPU: { - HIP_VALIDATE_NO_ERRORS( - hipMemcpy(tensor_data.basePtr(), data.data(), data.size() * sizeof(T), hipMemcpyHostToDevice)); - break; - } - - case eDeviceType::CPU: { - memcpy(tensor_data.basePtr(), data.data(), data.size() * sizeof(T)); - break; - } - } -} - /** * @brief Fills a vector with random values based on a provided seed. * @@ -403,75 +385,76 @@ void CompareVectorsNear(const std::vector& result, const std::vector& ref, } } } - /** - * @brief Copies vector data into a roccv::Tensor. 
This will copy vector data into either GPU memory or CPU memory, - * depending on the device specified in the roccv::Tensor's metadata. - * - * @tparam T The base datatype of the underlying data. - * @param dst The destination roccv::Tensor to copy data into. - * @param src A source vector containing data. - * @throws std::runtime_error if the size of the dst and src do not match. + * @brief Computes copy parameters for any tensor layout. + * @param[in] tensor The tensor to compute copy parameters for. + * @return tuple of (row_width_bytes, num_rows, tensor_pitch) + * If no padding, returns (total_size, 1, total_size) */ -template -void CopyVectorIntoTensor(const Tensor& dst, std::vector& src) { - auto tensorData = dst.exportData(); - size_t dataSize = dst.shape().size() * dst.dtype().size(); - - // Ensure source and destination have the same amount of memory allocated. - if (dataSize != src.size() * sizeof(T)) { - throw std::runtime_error( - "Cannot copy source vector into destination tensor. Size of src vector and destination tensor do not " - "match."); +inline std::tuple ComputeCopyParams(const Tensor& tensor) { + if (tensor.isContiguous()) { + size_t totalSize = tensor.shape().size() * tensor.dtype().size(); + return {totalSize, 1, totalSize}; } - switch (dst.device()) { - case eDeviceType::GPU: { - HIP_VALIDATE_NO_ERRORS( - hipMemcpy(tensorData.basePtr(), src.data(), dataSize, hipMemcpyKind::hipMemcpyHostToDevice)); - break; - } + auto tensorData = tensor.exportData(); - case eDeviceType::CPU: { - HIP_VALIDATE_NO_ERRORS( - hipMemcpy(tensorData.basePtr(), src.data(), dataSize, hipMemcpyKind::hipMemcpyHostToHost)); + // Find the padded dimension: the outermost dimension whose stride differs from + // the packed product of the next dimension's shape and stride. 
+ int paddedDim = 0; + for (int i = 0; i < tensor.rank() - 1; i++) { + if (tensorData.stride(i) != tensor.shape(i + 1) * tensorData.stride(i + 1)) { + paddedDim = i; break; } } + + // Row width = product of all dimensions AFTER paddedDim × dtype size + size_t rowWidth = tensor.dtype().size(); + for (int i = paddedDim + 1; i < tensor.rank(); ++i) { + rowWidth *= tensor.shape(i); + } + + // Number of rows = product of all dimensions UP TO AND INCLUDING paddedDim + size_t numRows = 1; + for (int i = 0; i <= paddedDim; ++i) { + numRows *= tensor.shape(i); + } + + // Tensor pitch comes from the stride at the padded dimension + size_t tensorPitch = tensorData.stride(paddedDim); + + return {rowWidth, numRows, tensorPitch}; } /** - * @brief Copies roccv::Tensor data into a destination vector. + * @brief Copies vector data into a tensor. Works with any tensor layout. * - * @tparam T The base datatype of the underlying tensor data. - * @param dst The destination vector which the data will be copied into. - * @param src The roccv::Tensor containing the source data. - * @throws std::runtime_error if the size of src and dst do not match. + * @tparam T The data type of the elements to copy. + * @param[out] dst The tensor to copy vector data into. + * @param[in] src The vector to copy data from. */ template -void CopyTensorIntoVector(std::vector& dst, const Tensor& src) { - size_t size = src.shape().size() * src.dtype().size(); - auto tensorData = src.exportData(); - - if (size != dst.size() * sizeof(T)) { - throw std::runtime_error( - "Cannot copy source tensor data into destination vector. 
Size of destination vector and source tensor do " - "not match."); - } - - switch (src.device()) { - case eDeviceType::GPU: { - HIP_VALIDATE_NO_ERRORS( - hipMemcpy(dst.data(), tensorData.basePtr(), size, hipMemcpyKind::hipMemcpyDeviceToHost)); - break; - } +void CopyVectorIntoTensor(const Tensor& dst, const std::vector& src) { + dst.copyFromHost(src.data(), nullptr); + HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(nullptr)); +} - case eDeviceType::CPU: { - HIP_VALIDATE_NO_ERRORS( - hipMemcpy(dst.data(), tensorData.basePtr(), size, hipMemcpyKind::hipMemcpyHostToHost)); - break; - } - } +/** + * @brief Copies tensor data into a vector. Works with any tensor layout. + * + * The destination vector @p dst is assumed to be compact/contiguous in memory; data will be copied into it as a dense + * contiguous array. + * + * @tparam T The data type of the elements to copy. + * @param[out] dst The vector to copy tensor data into. Must be preallocated to the correct size and will be filled + * contiguously. + * @param[in] src The tensor to copy data from. + */ +template +void CopyTensorIntoVector(std::vector& dst, const Tensor& src) { + src.copyToHost(dst.data(), nullptr); + HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(nullptr)); } /** * @brief Computes the strides for a tensor with a given shape and data type. diff --git a/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp b/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp index 2b1abe2c..394c8b9e 100644 --- a/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp +++ b/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp @@ -19,7 +19,10 @@ * THE SOFTWARE. */ +#include + #include +#include #include "test_helpers.hpp" @@ -33,16 +36,22 @@ namespace { * * @param shape The tensor's shape. * @param dtype The datatype of the tensor. + * @param rowAlign The row alignment to use. Setting to 0 will ensure contiguous memory usage. * @return A list of strides for each dimension of the given shape. 
*/ -std::vector CalculateStrides(const TensorShape& shape, const DataType& dtype) { +std::vector CalculateStrides(const TensorShape& shape, const DataType& dtype, int32_t rowAlign) { std::vector strides(shape.layout().rank()); // Strides are calculated byte-wise. Therefore, the highest dimension will refer to the stride between singular // elements (which, in turn, is the number of bytes per said element). strides[shape.layout().rank() - 1] = dtype.size(); for (int i = shape.layout().rank() - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * shape[i + 1]; + // Use the row alignment for the height dimension. + if (i == shape.layout().height_index()) { + strides[i] = detail::AlignUp(strides[i + 1] * shape[i + 1], rowAlign); + } else { + strides[i] = strides[i + 1] * shape[i + 1]; + } } return strides; } @@ -61,19 +70,27 @@ void TestNegativeTensorShape() { } /** - * @brief Negative tests related to the Tensor object. + * @brief Negative tests related to Tensor reshape. + * + */ +void TestNegativeTensorReshape() { + Tensor tensor(TensorShape({1, 2, 3}, "HWC"), DataType(DATA_TYPE_U8)); + EXPECT_EXCEPTION(tensor.reshape(TensorShape({1, 1, 2, 4}, "NHWC")), eStatusType::INVALID_VALUE); +} + +/** + * @brief Negative tests for the Tensor class, verifying error handling in invalid scenarios. * + * These tests confirm that the Tensor class appropriately throws exceptions when: + * 1. Attempting to reshape a tensor into a shape with a differing number of elements. + * + * In this case, the expected behavior is to throw an exception of type eStatusType::INVALID_VALUE. 
*/ void TestNegativeTensor() { + // Test reshaping a tensor with mismatching number of elements { - // Should not be able to reshape tensor into another view with a differing number of elements Tensor tensor(TensorShape({1, 2, 3}, "HWC"), DataType(DATA_TYPE_U8)); - EXPECT_EXCEPTION(tensor.reshape(TensorShape({1, 1, 2, 2}, "NHWC")), eStatusType::INVALID_VALUE); - - // Should not be able to reshape tensor into another view which would result in a different number of bytes in - // the underlying memory. - EXPECT_EXCEPTION(tensor.reshape(TensorShape({1, 1, 2, 3}, "NHWC"), DataType(DATA_TYPE_S16)), - eStatusType::INVALID_VALUE); + EXPECT_EXCEPTION(tensor.reshape(TensorShape({1, 1, 2, 4}, "NHWC")), eStatusType::INVALID_VALUE); } } @@ -95,15 +112,14 @@ void TestTensorCorrectness() { EXPECT_EQ(tensor.shape().size(), 4 * 720 * 480 * 3); EXPECT_EQ(tensor.dtype().size(), 1); } +} - // Tensor reshape: Change layout +void TestTensorReshapeCorrectness() { { - // Reshape tensor from NHWC -> HWC layout - Tensor tensor(1, {720, 480}, FMT_RGB8); - Tensor reshapedTensor = tensor.reshape(TensorShape({720, 480, 3}, "HWC")); - EXPECT_EQ(reshapedTensor.rank(), 3); - EXPECT_NE(reshapedTensor.rank(), tensor.rank()); + Tensor tensor(TensorShape({1, 2, 3}, "HWC"), DataType(DATA_TYPE_U8)); + Tensor reshapedTensor = tensor.reshape(TensorShape({1, 1, 2, 3}, "NHWC")); EXPECT_EQ(reshapedTensor.shape().size(), tensor.shape().size()); + EXPECT_EQ(reshapedTensor.rank(), 4); // Ensure they are sharing the same underlying data auto data = tensor.exportData(); @@ -114,7 +130,7 @@ void TestTensorCorrectness() { // Tensor reshape: Change layout and datatype { Tensor tensor(TensorShape({1, 5, 4}, "NWC"), DataType(DATA_TYPE_S16)); - Tensor reshapedTensor = tensor.reshape(TensorShape({1, 5}, "NW"), DataType(DATA_TYPE_4S16)); + Tensor reshapedTensor = tensor.reshape(DataType(DATA_TYPE_4S16), TensorShape({1, 5}, "NW")); EXPECT_NE(reshapedTensor.shape().size(), tensor.shape().size()); 
EXPECT_NE(reshapedTensor.rank(), tensor.rank()); EXPECT_EQ(reshapedTensor.rank(), 2); @@ -126,12 +142,44 @@ void TestTensorCorrectness() { } } +/** + * @brief Tests the correctness of the copyFromHost and copyToHost methods. + * + */ +void TestTensorCopyCorrectness() { + Tensor tensor(2, {10, 10}, FMT_RGB8, eDeviceType::GPU); + const size_t hostDataSize = tensor.shape().size() * tensor.dtype().size(); + std::vector inputDataHost(hostDataSize); + for (size_t i = 0; i < inputDataHost.size(); i++) { + inputDataHost[i] = static_cast(i % 256); + } + + hipStream_t stream; + HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); + + tensor.copyFromHost(inputDataHost.data(), stream); + std::vector outputDataHost(hostDataSize); + tensor.copyToHost(outputDataHost.data(), stream); + + HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); + HIP_VALIDATE_NO_ERRORS(hipStreamDestroy(stream)); + + EXPECT_VECTOR_EQ(inputDataHost, outputDataHost); +} + /** * @brief Tests internal stride calculations on Tensor construction. 
*/ void TestTensorStrideCalculation(const TensorShape& shape, const DataType& dtype) { - Tensor tensor(shape, dtype); - std::vector expectedStrides = CalculateStrides(shape, dtype); + Tensor tensor(shape, dtype, eDeviceType::GPU); + + // Get row alignment from device attributes + int dev; + HIP_VALIDATE_NO_ERRORS(hipGetDevice(&dev)); + int rowAlign; + HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&rowAlign, hipDeviceAttributeTexturePitchAlignment, dev)); + + std::vector expectedStrides = CalculateStrides(shape, dtype, rowAlign); std::vector actualStrides(tensor.rank()); auto data = tensor.exportData(); @@ -152,9 +200,12 @@ int main(int argc, char** argv) { // Negative tests TEST_CASE(TestNegativeTensorShape()); TEST_CASE(TestNegativeTensor()); + TEST_CASE(TestNegativeTensorReshape()); // Correctness tests TEST_CASE(TestTensorCorrectness()); + TEST_CASE(TestTensorReshapeCorrectness()); + TEST_CASE(TestTensorCopyCorrectness()); // Stride calculation tests // clang-format off