45 commits
742920c
Add MemAlignment to Tensor constructors
zacharyvincze Nov 20, 2025
f345ea0
Document all constructors for roccv::Tensor
zacharyvincze Nov 20, 2025
cf99a9c
Add MemAlign parameter to Tensor::CalcStrides()
zacharyvincze Nov 20, 2025
21f0cb8
Added Tensor copy constructor an Tensor::dataSize() implementation
zacharyvincze Nov 20, 2025
0a761ce
Remove copy constructor for roccv::Tensor
zacharyvincze Nov 24, 2025
fd84334
Implement memory padding and alignment for tensors
zacharyvincze Nov 27, 2025
c232d1f
Fix alignment calculations
zacharyvincze Nov 27, 2025
c90475b
Take padding into account during copies in test helpers
zacharyvincze Dec 3, 2025
18d0890
Add isContiguous member function for roccv::Tensor
zacharyvincze Dec 3, 2025
e97207a
Remove non-contiguous reshaping from tests
zacharyvincze Dec 3, 2025
6102764
Update utils.hpp to use memcpy2D to handle tensor padding
zacharyvincze Dec 5, 2025
12ff5e9
Fix incorrect imageBytes calculation
zacharyvincze Dec 5, 2025
e562ebc
Use HIP streams where possible
zacharyvincze Dec 5, 2025
cae2e8e
Fix CopyMakeBorder and GammaContrast samples
zacharyvincze Dec 8, 2025
6c42212
Fix WarpPerspective sample
zacharyvincze Dec 8, 2025
f7f3839
Fix CustomCrop sample
zacharyvincze Dec 8, 2025
1063baa
Fix BilateralFilter sample
zacharyvincze Dec 8, 2025
612cfb6
Fix CenterCrop sample
zacharyvincze Dec 8, 2025
b1ae692
Fix/cleanup BndBox sample
zacharyvincze Dec 8, 2025
e85d3a4
Fix/clean Composite sample
zacharyvincze Dec 8, 2025
d544ae6
Merge branch 'develop' into zv/feature/add-memalign-class
zacharyvincze Dec 8, 2025
1d1879b
Fix crop and resize example
zacharyvincze Dec 8, 2025
b7cff20
Add more information to the help message
zacharyvincze Dec 8, 2025
ebabcfb
Merge branch 'develop' into zv/feature/add-memalign-class
zacharyvincze Jan 5, 2026
370c337
Fix tensor copies for benchmarking suite
zacharyvincze Jan 6, 2026
3ee9636
Fix tensor padding calculations
zacharyvincze Jan 7, 2026
8c0b2fb
Add documentation to helper function
zacharyvincze Jan 7, 2026
8073969
Reuse reshape logic
zacharyvincze Jan 7, 2026
a0fc480
Merge branch 'develop' into zv/feature/add-memalign-class
zacharyvincze Jan 7, 2026
97efadb
Temp commit
zacharyvincze Jan 13, 2026
53c02b0
Merge branch 'develop' into zv/feature/add-memalign-class
zacharyvincze Mar 4, 2026
1ffdce0
Merge branch 'develop' into zv/feature/add-memalign-class
zacharyvincze Mar 6, 2026
6ded6bf
Implement tensor reshape/reinterpret methods
zacharyvincze Mar 10, 2026
fb1cb44
Remove unused MemcpyParams from benchmark helpers
zacharyvincze Mar 10, 2026
a14435e
Add copyFromHost/copyToHost method definitions for Tensor
zacharyvincze Mar 10, 2026
dd11f3c
Use copyTo/copyFromHost implementations for test helpers
zacharyvincze Mar 10, 2026
8ffcc95
Add tensor copy correctness tests
zacharyvincze Mar 10, 2026
1ac14a9
Properly convert element-wise strides to byte-wise strides on DLPack
zacharyvincze Mar 10, 2026
add8a66
Add byte_offset from DLPack to rocCV tensor base pointer
zacharyvincze Mar 10, 2026
2affddb
Add sensible default alignment for CPU allocated tensors
zacharyvincze Mar 11, 2026
14932ec
Merge branch 'develop' into zv/feature/add-memalign-class
zacharyvincze Mar 16, 2026
ff67d89
Merge branch 'develop' into zv/feature/add-memalign-class
zacharyvincze Mar 23, 2026
ea91e85
Address PR comments
zacharyvincze Mar 23, 2026
ae83206
Merge branch 'develop' into zv/feature/add-memalign-class
zacharyvincze Apr 7, 2026
30ba649
Merge branch 'develop' into zv/feature/add-memalign-class
zacharyvincze Apr 7, 2026
9 changes: 5 additions & 4 deletions benchmarks/src/roccv/roccv_bench_helpers.cpp
@@ -59,13 +59,14 @@ class RandomGenerator {
void generate(const roccv::Tensor& tensor) {
const auto tensor_data = tensor.exportData<roccv::TensorDataStrided>();

const size_t numElements = tensor.dataSize() / tensor.dtype().size();

if constexpr (std::is_integral_v<T>) {
rocrand_generate_char(m_gen, static_cast<unsigned char*>(tensor_data.basePtr()),
tensor.shape().size() * tensor.dtype().size());
rocrand_generate_char(m_gen, static_cast<unsigned char*>(tensor_data.basePtr()), numElements);
Contributor

This appears to only work for U8/S8. For other integer types, for example U16, numElements is the number of U16 values. Then rocrand_generate_char() only generates numElements of U8 values. For integer types, we should set numElements = tensor.dataSize() if only rocrand_generate_char() is used. Otherwise, we need to call rocrand_generate_XYZ() for each integer type.

} else if constexpr (std::is_same_v<T, float>) {
rocrand_generate_uniform(m_gen, static_cast<float*>(tensor_data.basePtr()), tensor.shape().size());
rocrand_generate_uniform(m_gen, static_cast<float*>(tensor_data.basePtr()), numElements);
} else if constexpr (std::is_same_v<T, double>) {
rocrand_generate_uniform_double(m_gen, static_cast<double*>(tensor_data.basePtr()), tensor.shape().size());
rocrand_generate_uniform_double(m_gen, static_cast<double*>(tensor_data.basePtr()), numElements);
} else {
throw std::runtime_error("Unsupported data type.");
}
91 changes: 91 additions & 0 deletions include/core/mem_alignment.hpp
@@ -0,0 +1,91 @@
/*
* Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
Contributor

Change year to 2026.

* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

#pragma once

#include <stdint.h>

namespace roccv {

constexpr int32_t ROCCV_CPU_DEFAULT_ALIGNMENT = 64; // Default alignment for CPU memory

/**
* @class MemAlignment
* @brief Class for specifying memory alignment constraints for buffer allocations.
*
* The MemAlignment class allows you to specify the memory alignment in bytes that should
* be used when allocating or working with device or host memory buffers. Proper alignment
* is important for performance reasons and hardware compatibility, especially for GPU operations.
*
* There are two types of alignment constraints:
* - Base address alignment: Alignment requirements for the starting address of the buffer.
* - Row address alignment: Alignment requirements for the starting address of each row, which is
* important for multi-dimensional data (e.g., images or tensors).
*
* Both alignment values default to 0, which implies that the system or device default alignment will be used.
*
* Example usage:
* @code
* roccv::MemAlignment align;
* align.baseAddr(256).rowAddr(128);
* @endcode
*
* @see roccv::Tensor
*/
class MemAlignment {
public:
MemAlignment() = default;

/**
* @brief Returns the base address alignment.
*
* @return The base address alignment.
*/
int32_t baseAddr() const;

/**
* @brief Returns the row address alignment.
*
* @return The row address alignment.
*/
int32_t rowAddr() const;

/**
* @brief Sets the base address alignment.
*
* @param[in] alignment Alignment in bytes.
* @return A reference to this object, with the base address set.
*/
MemAlignment& baseAddr(int32_t alignment);

/**
* @brief Sets the row address alignment.
*
* @param[in] alignment Alignment in bytes.
* @return A reference to this object, with the row address set.
*/
MemAlignment& rowAddr(int32_t alignment);

private:
int32_t m_baseAddrAlignment = 0;
int32_t m_rowAddrAlignment = 0;
};
} // namespace roccv
165 changes: 132 additions & 33 deletions include/core/tensor.hpp
@@ -21,24 +21,26 @@ THE SOFTWARE.
*/
#pragma once

#include <hip/hip_runtime.h>

#include <array>
#include <memory>

#include "core/data_type.hpp"
#include "core/detail/allocators/i_allocator.hpp"
#include "core/detail/context.hpp"
#include "core/image_format.hpp"
#include "core/mem_alignment.hpp"
#include "core/tensor_data.hpp"
#include "core/tensor_layout.hpp"
#include "core/tensor_requirements.hpp"
#include "core/tensor_shape.hpp"
#include "core/tensor_storage.hpp"
#include "core/util_enums.h"
#include "tensor_data.hpp"
#include "tensor_requirements.hpp"
#include "tensor_storage.hpp"
#include "operator_types.h"

namespace roccv {

class ImageFormat;
struct Size2D;
class TensorShape;
class TensorLayout;

class Tensor {
public:
using Requirements = TensorRequirements;
@@ -50,8 +52,7 @@ class Tensor {
*
* @param[in] reqs An object representing the requirements for this tensor.
*/
explicit Tensor(const TensorRequirements &reqs);
explicit Tensor(const TensorRequirements &reqs, const IAllocator &alloc);
explicit Tensor(const TensorRequirements &reqs, const IAllocator &alloc = GlobalContext().getDefaultAllocator());

/**
* @brief Constructs a Tensor object given a list of requirements and the underlying data as a TensorStorage
@@ -61,30 +62,55 @@
* @param[in] data A TensorStorage object for the tensor's underlying data.
*/
explicit Tensor(const TensorRequirements &reqs, std::shared_ptr<TensorStorage> data);
explicit Tensor(const TensorRequirements &reqs, std::shared_ptr<TensorStorage> data, const IAllocator &alloc);

/**
* @brief Constructs a tensor object and allocates the appropriate amount of memory on the specified device.
* @brief Constructs a tensor object and allocates the appropriate amount of memory on the specified device. Uses
* the default memory alignment and allocation strategy.
*
* @param[in] shape The shape describing the tensor.
* @param[in] dtype The underlying datatype of the tensor.
* @param[in] device The device the tensor should be allocated on.
*/
explicit Tensor(const TensorShape &shape, DataType dtype, eDeviceType device = eDeviceType::GPU);
explicit Tensor(const TensorShape &shape, DataType dtype, const IAllocator &alloc,

/**
* @brief Constructs a tensor object and allocates the appropriate amount of memory on the specified device. Uses a
* user-specified memory alignment and allocation strategy.
*
* @param[in] shape The shape describing the tensor.
* @param[in] dtype The underlying datatype of the tensor.
* @param[in] bufAlign Specification for memory alignment.
* @param[in] alloc The allocation strategy. (Default: DefaultAllocator)
* @param[in] device The device the tensor should be allocated on.
*/
explicit Tensor(const TensorShape &shape, DataType dtype, const MemAlignment &bufAlign,
const IAllocator &alloc = GlobalContext().getDefaultAllocator(),
eDeviceType device = eDeviceType::GPU);

/**
* @brief Constructs a tensor using image-based requirements and allocates the appropriate amount of memory on the
* specified device.
* specified device. Uses the default memory alignment and allocation strategy.
*
* @param[in] num_images The number of images in the batch.
* @param[in] image_size The size for images in the batch.
* @param[in] fmt The format of the underlying image data.
* @param[in] device The device the tensor should be allocated on.
*/
explicit Tensor(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device = eDeviceType::GPU);
explicit Tensor(int num_images, Size2D image_size, ImageFormat fmt, const IAllocator &alloc,

/**
* @brief Constructs a tensor using image-based requirements and allocates the appropriate amount of memory on the
* specified device. Uses user-provided memory alignment and allocation strategies.
*
* @param[in] num_images The number of images in the batch.
* @param[in] image_size The size for images in the batch.
* @param[in] fmt The format of the underlying image data.
* @param[in] bufAlign Specification for memory alignment.
* @param[in] alloc The allocation strategy. (Default: DefaultAllocator)
* @param[in] device The device the tensor should be allocated on.
*/
explicit Tensor(int num_images, Size2D image_size, ImageFormat fmt, const MemAlignment &bufAlign,
const IAllocator &alloc = GlobalContext().getDefaultAllocator(),
eDeviceType device = eDeviceType::GPU);

Tensor(const Tensor &other) = delete;
@@ -166,28 +192,72 @@
}

/**
* @brief Creates a view of this tensor with a new shape and layout
* @brief Creates a view of this tensor with a new shape and layout, keeping the same data type.
*
* @param[in] new_shape the new shape of the tensor
* @return Tensor
* @param[in] newShape The new shape of the tensor.
* @return A new tensor view with the given shape.
*/
Tensor reshape(const TensorShape &new_shape) const;
Tensor reshape(const TensorShape &newShape) const;

/**
* @brief Creates a vew of this tensor with a new shape, layout, and data type. The number of bytes allocated must
* match the original tensor.
* @brief Creates a view of this tensor with a new data type and shape.
*
* @param new_shape The new tensor shape.
* @param new_dtype The new data type of the underlying tensor data.
* @return Tensor
* Reinterprets the tensor's underlying bytes with the given data type and shape. The total byte count
* (elements * dtype size) must match between the original and new view. Non-contiguous (padded) tensors
* are supported as long as the reshape is compatible with the stride structure.
*
* @param[in] newDtype The new data type of the tensor elements.
* @param[in] newShape The new shape of the tensor.
* @return A new tensor view with the given data type and shape.
*/
Tensor reshape(const TensorShape &new_shape, const DataType &new_dtype) const;
Tensor reshape(const DataType &newDtype, const TensorShape &newShape) const;

/**
* @brief Performs a shallow copy of the tensor (creates a view).
*
* This assignment operator copies the tensor's metadata and data handle,
* resulting in a new tensor object that shares the same underlying data
* with the original tensor. No deep copy of the data is performed.
*
* @param other The tensor to assign from.
* @return Reference to this tensor.
*/
Tensor &operator=(const Tensor &other);

/**
* @brief Calculates tensor requirements. This essentially wraps the
* provided parameters into a TensorRequirements object.
* @brief Returns the total number of bytes being used to store the raw tensor data.
*
* @return Total number of bytes being used to store the raw tensor data.
*/
size_t dataSize() const;

/**
* @brief Returns true if the tensor is contiguous in memory, meaning there is no padding present in the tensor.
*
* @return True if the tensor is contiguous in memory, false otherwise.
*/
bool isContiguous() const;

/**
* @brief Copies data from a host pointer to the tensor. Host memory must be contiguous. This is a non-blocking
* operation, synchronized to the given stream.
*
* @param[in] src The source host pointer.
* @param[in] stream The stream to use for the copy.
*/
void copyFromHost(const void *src, hipStream_t stream = nullptr) const;

/**
* @brief Copies data from the tensor to a host pointer. Host memory will be contiguous. This is a non-blocking
* operation, synchronized to the given stream.
*
* @param[out] dst The destination host pointer. Must be preallocated to the correct size.
* @param[in] stream The stream to use for the copy.
*/
void copyToHost(void *dst, hipStream_t stream = nullptr) const;

/**
* @brief Calculates tensor requirements using the default memory alignment strategy.
*
* @param[in] shape The desired shape of the tensor.
* @param[in] dtype The desired data type of the tensor's raw data.
@@ -199,43 +269,72 @@
eDeviceType device = eDeviceType::GPU);

/**
* @brief Calculates tensor requirements.
* @brief Calculates tensor requirements with a user-provided memory alignment strategy.
*
* @param[in] shape The desired shape of the tensor.
* @param[in] dtype The desired data type of the tensor's raw data.
* @param[in] bufAlign Specification for memory alignment.
* @param[in] device The device the tensor data should belong to.
* @return A TensorRequirements object representing this tensor's
* requirements.
*/
static Requirements CalcRequirements(const TensorShape &shape, const DataType &dtype, const MemAlignment &bufAlign,
const eDeviceType device = eDeviceType::GPU);

/**
* @brief Calculates tensor requirements with user-provided strides.
*
* @param[in] shape The shape describing the tensor.
* @param[in] dtype The type of the tensor's data.
* @param[in] strides The tensor's strides.
* @param[in] baseAlign The base address alignment.
* @param[in] device The device the tensor data belongs on. (Default: GPU)
* @return Tensor requirements.
*/
static Requirements CalcRequirements(const TensorShape &shape, const DataType &dtype,
std::array<int64_t, ROCCV_TENSOR_MAX_RANK> strides,
const std::array<int64_t, ROCCV_TENSOR_MAX_RANK> strides, int32_t baseAlign,
eDeviceType device = eDeviceType::GPU);

/**
* @brief Calculates tensor requirements using image-based parameters.
* @brief Calculates tensor requirements using image-based parameters. This will use a default memory alignment
* strategy.
*
* @param[in] num_images The number of images in the batch.
* @param[in] image_size The size for images in the batch.
* @param[in] fmt The format of the underlying image data.
* @param[in] device The device the tensor data should belong to.
* @return A TensorRequirements object representing the tensor's requirements.
* @return A Tensor::Requirements object representing the tensor's requirements.
*/
static Requirements CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt,
eDeviceType device = eDeviceType::GPU);

/**
* @brief Calculates tensor requirements using image-based parameters and a specified memory alignment.
*
* @param[in] num_images The number of images in the batch.
* @param[in] image_size The size of images in the batch.
* @param[in] fmt The format of the underlying image data.
* @param[in] bufAlign Specification for memory alignment.
* @param[in] device The device the tensor is to be allocated on.
* @return A Tensor::Requirements object representing this tensor's requirements.
*/
static Requirements CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt,
const MemAlignment &bufAlign, eDeviceType device = eDeviceType::GPU);

/**
* @brief Calculates strides required for a tensor.
*
* @param shape The tensor shape.
* @param dtype The datatype of the tensor.
* @param rowAlign The row alignment to use. Setting to 0 will ensure contiguous memory usage.
* @return An array containing strides for the given parameters.
*/
static std::array<int64_t, ROCCV_TENSOR_MAX_RANK> CalcStrides(const TensorShape &shape, const DataType &dtype);
static std::array<int64_t, ROCCV_TENSOR_MAX_RANK> CalcStrides(const TensorShape &shape, const DataType &dtype,
int32_t rowAlign);

private:
TensorRequirements m_requirements; // Tensor metadata
std::shared_ptr<TensorStorage> m_data; // Stores raw tensor data
const IAllocator &m_allocator;
};

/**