diff --git a/benchmarks/src/roccv/roccv_bench_helpers.cpp b/benchmarks/src/roccv/roccv_bench_helpers.cpp index a84c1616..f6c1c3b2 100644 --- a/benchmarks/src/roccv/roccv_bench_helpers.cpp +++ b/benchmarks/src/roccv/roccv_bench_helpers.cpp @@ -59,13 +59,14 @@ class RandomGenerator { void generate(const roccv::Tensor& tensor) { const auto tensor_data = tensor.exportData(); + const size_t numElements = tensor.dataSize() / tensor.dtype().size(); + if constexpr (std::is_integral_v) { - rocrand_generate_char(m_gen, static_cast(tensor_data.basePtr()), - tensor.shape().size() * tensor.dtype().size()); + rocrand_generate_char(m_gen, static_cast(tensor_data.basePtr()), numElements); } else if constexpr (std::is_same_v) { - rocrand_generate_uniform(m_gen, static_cast(tensor_data.basePtr()), tensor.shape().size()); + rocrand_generate_uniform(m_gen, static_cast(tensor_data.basePtr()), numElements); } else if constexpr (std::is_same_v) { - rocrand_generate_uniform_double(m_gen, static_cast(tensor_data.basePtr()), tensor.shape().size()); + rocrand_generate_uniform_double(m_gen, static_cast(tensor_data.basePtr()), numElements); } else { throw std::runtime_error("Unsupported data type."); } diff --git a/include/core/mem_alignment.hpp b/include/core/mem_alignment.hpp new file mode 100644 index 00000000..76a0d9f0 --- /dev/null +++ b/include/core/mem_alignment.hpp @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#pragma once + +#include + +namespace roccv { + +constexpr int32_t ROCCV_CPU_DEFAULT_ALIGNMENT = 64; // Default alignment for CPU memory + +/** + * @class MemAlignment + * @brief Class for specifying memory alignment constraints for buffer allocations. + * + * The MemAlignment class allows you to specify the memory alignment in bytes that should + * be used when allocating or working with device or host memory buffers. Proper alignment + * is important for performance reasons and hardware compatibility, especially for GPU operations. + * + * There are two types of alignment constraints: + * - Base address alignment: Alignment requirements for the starting address of the buffer. + * - Row address alignment: Alignment requirements for the starting address of each row, which is + * important for multi-dimensional data (e.g., images or tensors). 
+ * + * Both alignment values default to 0, which implies that the system or device default alignment will be used. + * + * Example usage: + * @code + * roccv::MemAlignment align; + * align.baseAddr(256).rowAddr(128); + * @endcode + * + * @see roccv::Tensor + */ +class MemAlignment { + public: + MemAlignment() = default; + + /** + * @brief Returns the base address alignment. + * + * @return The base address alignment. + */ + int32_t baseAddr() const; + + /** + * @brief Returns the row address alignment. + * + * @return The row address alignment. + */ + int32_t rowAddr() const; + + /** + * @brief Sets the base address alignment. + * + * @param[in] alignment Alignment in bytes. + * @return A reference to this object, with the base address set. + */ + MemAlignment& baseAddr(int32_t alignment); + + /** + * @brief Sets the row address alignment. + * + * @param[in] alignment Alignment in bytes. + * @return A reference to this object, with the row address set. + */ + MemAlignment& rowAddr(int32_t alignment); + + private: + int32_t m_baseAddrAlignment = 0; + int32_t m_rowAddrAlignment = 0; +}; +} // namespace roccv \ No newline at end of file diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index e672db8a..7f2241a1 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -21,24 +21,26 @@ THE SOFTWARE. 
*/ #pragma once +#include + #include #include #include "core/data_type.hpp" #include "core/detail/allocators/i_allocator.hpp" +#include "core/detail/context.hpp" +#include "core/image_format.hpp" +#include "core/mem_alignment.hpp" +#include "core/tensor_data.hpp" #include "core/tensor_layout.hpp" +#include "core/tensor_requirements.hpp" +#include "core/tensor_shape.hpp" +#include "core/tensor_storage.hpp" #include "core/util_enums.h" -#include "tensor_data.hpp" -#include "tensor_requirements.hpp" -#include "tensor_storage.hpp" +#include "operator_types.h" namespace roccv { -class ImageFormat; -struct Size2D; -class TensorShape; -class TensorLayout; - class Tensor { public: using Requirements = TensorRequirements; @@ -50,8 +52,7 @@ class Tensor { * * @param[in] reqs An object representing the requirements for this tensor. */ - explicit Tensor(const TensorRequirements &reqs); - explicit Tensor(const TensorRequirements &reqs, const IAllocator &alloc); + explicit Tensor(const TensorRequirements &reqs, const IAllocator &alloc = GlobalContext().getDefaultAllocator()); /** * @brief Constructs a Tensor object given a list of requirements and the underlying data as a TensorStorage @@ -61,22 +62,34 @@ class Tensor { * @param[in] data A TensorStorage object for the tensor's underlying data. */ explicit Tensor(const TensorRequirements &reqs, std::shared_ptr data); - explicit Tensor(const TensorRequirements &reqs, std::shared_ptr data, const IAllocator &alloc); /** - * @brief Constructs a tensor object and allocates the appropriate amount of memory on the specified device. + * @brief Constructs a tensor object and allocates the appropriate amount of memory on the specified device. Uses + * the default memory alignment and allocation strategy. * * @param[in] shape The shape describing the tensor. * @param[in] dtype The underlying datatype of the tensor. * @param[in] device The device the tensor should be allocated on. 
*/ explicit Tensor(const TensorShape &shape, DataType dtype, eDeviceType device = eDeviceType::GPU); - explicit Tensor(const TensorShape &shape, DataType dtype, const IAllocator &alloc, + + /** + * @brief Constructs a tensor object and allocates the appropriate amount of memory on the specified device. Uses a + * user-specified memory alignment and allocation strategy. + * + * @param[in] shape The shape describing the tensor. + * @param[in] dtype The underlying datatype of the tensor. + * @param[in] bufAlign Specification for memory alignment. + * @param[in] alloc The allocation strategy. (Default: DefaultAllocator) + * @param[in] device The device the tensor should be allocated on. + */ + explicit Tensor(const TensorShape &shape, DataType dtype, const MemAlignment &bufAlign, + const IAllocator &alloc = GlobalContext().getDefaultAllocator(), eDeviceType device = eDeviceType::GPU); /** * @brief Constructs a tensor using image-based requirements and allocates the appropriate amount of memory on the - * specified device. + * specified device. Uses the default memory alignment and allocation strategy. * * @param[in] num_images The number of images in the batch. * @param[in] image_size The size for images in the batch. @@ -84,7 +97,20 @@ class Tensor { * @param[in] device The device the tensor should be allocated on. */ explicit Tensor(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device = eDeviceType::GPU); - explicit Tensor(int num_images, Size2D image_size, ImageFormat fmt, const IAllocator &alloc, + + /** + * @brief Constructs a tensor using image-based requirements and allocates the appropriate amount of memory on the + * specified device. Uses user-provided memory alignment and allocation strategies. + * + * @param[in] num_images The number of images in the batch. + * @param[in] image_size The size for images in the batch. + * @param[in] fmt The format of the underlying image data. + * @param[in] bufAlign Specification for memory alignment. 
+ * @param[in] alloc The allocation strategy. (Default: DefaultAllocator) + * @param[in] device The device the tensor should be allocated on. + */ + explicit Tensor(int num_images, Size2D image_size, ImageFormat fmt, const MemAlignment &bufAlign, + const IAllocator &alloc = GlobalContext().getDefaultAllocator(), eDeviceType device = eDeviceType::GPU); Tensor(const Tensor &other) = delete; @@ -166,28 +192,72 @@ class Tensor { } /** - * @brief Creates a view of this tensor with a new shape and layout + * @brief Creates a view of this tensor with a new shape and layout, keeping the same data type. * - * @param[in] new_shape the new shape of the tensor - * @return Tensor + * @param[in] newShape The new shape of the tensor. + * @return A new tensor view with the given shape. */ - Tensor reshape(const TensorShape &new_shape) const; + Tensor reshape(const TensorShape &newShape) const; /** - * @brief Creates a vew of this tensor with a new shape, layout, and data type. The number of bytes allocated must - * match the original tensor. + * @brief Creates a view of this tensor with a new data type and shape. * - * @param new_shape The new tensor shape. - * @param new_dtype The new data type of the underlying tensor data. - * @return Tensor + * Reinterprets the tensor's underlying bytes with the given data type and shape. The total byte count + * (elements * dtype size) must match between the original and new view. Non-contiguous (padded) tensors + * are supported as long as the reshape is compatible with the stride structure. + * + * @param[in] newDtype The new data type of the tensor elements. + * @param[in] newShape The new shape of the tensor. + * @return A new tensor view with the given data type and shape. */ - Tensor reshape(const TensorShape &new_shape, const DataType &new_dtype) const; + Tensor reshape(const DataType &newDtype, const TensorShape &newShape) const; + /** + * @brief Performs a shallow copy of the tensor (creates a view). 
+ * + * This assignment operator copies the tensor's metadata and data handle, + * resulting in a new tensor object that shares the same underlying data + * with the original tensor. No deep copy of the data is performed. + * + * @param other The tensor to assign from. + * @return Reference to this tensor. + */ Tensor &operator=(const Tensor &other); /** - * @brief Calculates tensor requirements. This essentially wraps the - * provided parameters into a TensorRequirements object. + * @brief Returns the total number of bytes being used to store the raw tensor data. + * + * @return Total number of bytes being used to store the raw tensor data. + */ + size_t dataSize() const; + + /** + * @brief Returns true if the tensor is contiguous in memory, meaning there is no padding present in the tensor. + * + * @return True if the tensor is contiguous in memory, false otherwise. + */ + bool isContiguous() const; + + /** + * @brief Copies data from a host pointer to the tensor. Host memory must be contiguous. This is a non-blocking + * operation, synchronized to the given stream. + * + * @param[in] src The source host pointer. + * @param[in] stream The stream to use for the copy. + */ + void copyFromHost(const void *src, hipStream_t stream = nullptr) const; + + /** + * @brief Copies data from the tensor to a host pointer. Host memory will be contiguous. This is a non-blocking + * operation, synchronized to the given stream. + * + * @param[out] dst The destination host pointer. Must be preallocated to the correct size. + * @param[in] stream The stream to use for the copy. + */ + void copyToHost(void *dst, hipStream_t stream = nullptr) const; + + /** + * @brief Calculates tensor requirements using the default memory alignment strategy. * * @param[in] shape The desired shape of the tensor. * @param[in] dtype The desired data type of the tensor's raw data. @@ -199,43 +269,72 @@ class Tensor { eDeviceType device = eDeviceType::GPU); /** - * @brief Calculates tensor requirements. 
+ * @brief Calculates tensor requirements with a user-provided memory alignment strategy. + * + * @param[in] shape The desired shape of the tensor. + * @param[in] dtype The desired data type of the tensor's raw data. + * @param[in] bufAlign Specification for memory alignment. + * @param[in] device The device the tensor data should belong to. + * @return A TensorRequirements object representing this tensor's + * requirements. + */ + static Requirements CalcRequirements(const TensorShape &shape, const DataType &dtype, const MemAlignment &bufAlign, + const eDeviceType device = eDeviceType::GPU); + + /** + * @brief Calculates tensor requirements with user-provided strides. * * @param[in] shape The shape describing the tensor. * @param[in] dtype The type of the tensor's data. * @param[in] strides The tensor's strides. + * @param[in] baseAlign The base address alignment. * @param[in] device The device the tensor data belongs on. (Default: GPU) * @return Tensor requirements. */ static Requirements CalcRequirements(const TensorShape &shape, const DataType &dtype, - std::array strides, + const std::array strides, int32_t baseAlign, eDeviceType device = eDeviceType::GPU); /** - * @brief Calculates tensor requirements using image-based parameters. + * @brief Calculates tensor requirements using image-based parameters. This will use a default memory alignment + * strategy. * * @param[in] num_images The number of images in the batch. * @param[in] image_size The size for images in the batch. * @param[in] fmt The format of the underlying image data. * @param[in] device The device the tensor data should belong to. - * @return A TensorRequirements object representing the tensor's requirements. + * @return A Tensor::Requirements object representing the tensor's requirements. 
*/ static Requirements CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device = eDeviceType::GPU); + /** + * @brief Calculates tensor requirements using image-based parameters and a specified memory alignment. + * + * @param[in] num_images The number of images in the batch. + * @param[in] image_size The size of images in the batch. + * @param[in] fmt The format of the underlying image data. + * @param[in] bufAlign Specification for memory alignment. + * @param[in] device The device the tensor is to be allocated on. + * @return A Tensor::Requirements object representing this tensor's requirements. + */ + static Requirements CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt, + const MemAlignment &bufAlign, eDeviceType device = eDeviceType::GPU); + /** * @brief Calculates strides required for a tensor. * * @param shape The tensor shape. * @param dtype The datatype of the tensor. + * @param rowAlign The row alignment to use. Setting to 0 will ensure contiguous memory usage. * @return An array containing strides for the given parameters. */ - static std::array CalcStrides(const TensorShape &shape, const DataType &dtype); + static std::array CalcStrides(const TensorShape &shape, const DataType &dtype, + int32_t rowAlign); private: TensorRequirements m_requirements; // Tensor metadata std::shared_ptr m_data; // Stores raw tensor data - const IAllocator &m_allocator; }; /** diff --git a/include/core/tensor_requirements.hpp b/include/core/tensor_requirements.hpp index 0bb6f631..3ff30d4e 100644 --- a/include/core/tensor_requirements.hpp +++ b/include/core/tensor_requirements.hpp @@ -29,16 +29,40 @@ THE SOFTWARE. namespace roccv { +/** + * @brief Specifies basic memory requirements for an allocation. + * + * This struct expresses the number of bytes required for a memory region that backs + * a particular tensor or buffer allocation. 
It is typically used to indicate the raw + * size required for device, host, or pinned memory allocations. + */ struct MemRequirements { size_t bytes = 0; }; +/** + * @brief Specifies resource requirements for tensor memory allocations. + * + * This struct aggregates requirements for different types of memory resources + * that may be used for tensor allocation and operation: + * - deviceMem: Memory required on the device (e.g., GPU). + * - hostMem: Memory required on the host (CPU-accessible memory). + * - pinnedMem: Memory required in pinned (page-locked) host memory, which may + * be used for efficient device-host transfers. + */ struct ResourceRequirements { MemRequirements deviceMem; MemRequirements hostMem; MemRequirements pinnedMem; }; +/** + * @brief Specifies the requirements for creating and allocating a tensor. + * + * This struct defines all the necessary properties for specifying the memory and layout + * requirements of a tensor, including its datatype, shape, memory alignment, layout, strides, + * memory resource requirements, and device placement. + */ struct TensorRequirements { eDataType dtype; eTensorLayout layout; diff --git a/include/core/tensor_storage.hpp b/include/core/tensor_storage.hpp index 33a0aaf7..4200c099 100644 --- a/include/core/tensor_storage.hpp +++ b/include/core/tensor_storage.hpp @@ -71,11 +71,17 @@ class TensorStorage { */ eDeviceType device() const; + /** + * @brief Returns the allocation strategy being used. + * + * @return The allocation strategy being used. + */ + const IAllocator& allocator() const; + private: eDeviceType m_device; eOwnership m_ownership; void* m_data; const IAllocator& m_allocator; }; - } // namespace roccv \ No newline at end of file diff --git a/include/core/utils.hpp b/include/core/utils.hpp new file mode 100644 index 00000000..4449ce5b --- /dev/null +++ b/include/core/utils.hpp @@ -0,0 +1,68 @@ +/** +Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include + +#include +#include + +namespace roccv::detail { + +inline constexpr size_t NextPowerOfTwo(size_t value) noexcept { + if (value <= 1) return 1; +#if defined(__GNUC__) || defined(__clang__) + constexpr int numBits = sizeof(size_t) * 8; + return 1UL << (numBits - __builtin_clzl(value - 1)); +#else + value--; + for (size_t i = 1; i < sizeof(size_t) * 8; i <<= 1) value |= (value >> i); + return value + 1; +#endif +} + +/** + * @brief Returns true if the given value is a power of two. + * + * @param value The value to check. + * @return True if the given value is a power of two, false otherwise. + */ +template >> +inline constexpr bool IsPowerOfTwo(T value) noexcept { + return value > 0 && (value & (value - 1)) == 0; +} + +/** + * @brief Aligns the given value to the nearest multiple of the given alignment. + * + * @param value The value to align. + * @param alignment The alignment to align to (must be > 0). 
+ * @return The aligned value (same type as value). + */ +template && std::is_integral_v>> +inline constexpr T AlignUp(T value, U alignment) noexcept { + return alignment > 0 + ? (value + static_cast(alignment) - 1) / static_cast(alignment) * static_cast(alignment) + : value; +} +} // namespace roccv::detail \ No newline at end of file diff --git a/python/include/py_tensor.hpp b/python/include/py_tensor.hpp index 523a5f27..0c940148 100644 --- a/python/include/py_tensor.hpp +++ b/python/include/py_tensor.hpp @@ -29,8 +29,19 @@ THE SOFTWARE. #include #include -namespace py = pybind11; - +/** + * @brief A Python-facing wrapper for roccv::Tensor, providing DLPack interoperability and Pybind11 integration. + * + * The PyTensor class serves as a container around roccv::Tensor, exposing its functionality to Python. It supports + * construction from shape, data type, layout, and device information, as well as wrapping existing tensors or + * external DLPack-managed tensors. The class ensures correct lifetime handling for external resources (such as those + * transferred via DLPack) and enables seamless data movement between devices (CPU/GPU). PyTensor also supplies + * utilities to export to and import from DLPack capsules, making it suitable for zero-copy interoperability with + * frameworks like PyTorch, NumPy, or TensorFlow. + * + * Usage scenarios include direct creation from Python, import/export to DLPack, device copying, and serving as a + * type-safe bridge between Python and C++ tensor operations. + */ class PyTensor : public std::enable_shared_from_this { public: /** @@ -79,22 +90,51 @@ class PyTensor : public std::enable_shared_from_this { std::shared_ptr copyTo(eDeviceType device); /** - * @brief Creates a new tensor given a capsule containing a DLManagedTensor. + * @brief Creates a new PyTensor by consuming an external DLPack capsule. * - * @param src A capsule containing a DLManagedTensor. - * @param layout The layout for the new tensor. 
- * @return std::shared_ptr + * This static method constructs a PyTensor that wraps a new roccv::Tensor allocated + * from the contents of a DLPack capsule (i.e., an object supporting the __dlpack__ protocol), + * typically exported from other frameworks such as PyTorch, NumPy, or TVM. + * The shape and datatype are taken from the capsule's DLTensor metadata, while the layout + * must be specified explicitly, since DLPack does not encode layout. + * + * Ownership of the underlying DLPack-managed memory is transferred to the returned PyTensor, + * such that when the PyTensor is destroyed (and no Python references remain), the deleter + * from the DLPack capsule is called, ensuring correct cross-framework resource management. + * + * @param src Python object supporting the __dlpack__() method. This can be a capsule returned by + * __dlpack__() or any object exposing the DLPack consumer protocol. + * @param layout The tensor layout to use for the new roccv::Tensor (e.g., NHWC, NCHW, etc.). + * @return A std::shared_ptr wrapping a new tensor that shares memory with the DLPack object. + * + * @throws std::runtime_error if the object does not support the DLPack protocol, + * if the capsule is invalid or missing, or if conversion fails in any other way. + * + * @note The resulting tensor will have the same shape, datatype, device, and (if present) strides + * as encoded in the DLPack capsule, but the layout must be provided by the caller. */ static std::shared_ptr fromDLPack(pybind11::object src, eTensorLayout layout); /** - * @brief Creates a DLManagedTensor contained in a capsule from the tensor. + * @brief Exports this tensor as a DLPack-compatible capsule. + * + * This method creates a DLPack DLManagedTensor wrapper for the current PyTensor, + * encapsulates it in a Python capsule with the "dltensor" name, and returns it. + * The resulting capsule can be consumed by any framework supporting the DLPack protocol + * for zero-copy tensor data sharing. 
* - * @param stream Optional stream pointer value (used for pytorch). + * The optional @p stream argument is present to satisfy the Pytorch DLPack consumer interface, + * but is ignored by this implementation currently. * - * @return py::capsule + * @param stream Optional stream pointer (typically unused, present for PyTorch DLPack compliance). + * + * @return py::capsule A Python capsule containing the DLPack DLManagedTensor. + * + * @note The caller is responsible for transferring or managing ownership of the data + * (according to DLPack conventions), including calling the capsule consumer and ensuring + * that the deleter in DLManagedTensor is invoked only once. */ - py::capsule toDLPack(py::object stream); + pybind11::capsule toDLPack(pybind11::object stream); /** * @brief Gets the strides of the tensor as a python list. @@ -151,15 +191,28 @@ class PyTensor : public std::enable_shared_from_this { * @return A python tuple with the first index corresponding to the device type, and the second index corresponding * to the device id. */ - py::tuple getDLDevice(); + pybind11::tuple getDLDevice(); /** * @brief Exports this class in the provided module. * * @param m The python module to export this class to. */ - static void Export(py::module& m); + static void Export(pybind11::module& m); + /** + * @brief Returns a new PyTensor with a reshaped tensor according to the specified shape and layout. + * + * Creates and returns a new PyTensor whose underlying tensor is a view or copy of this tensor, + * with the shape specified by newShape and the layout specified by layout. The number of elements + * in newShape must match the number of elements in the original tensor. + * + * @param newShape The new shape for the tensor. + * @param layout The new layout to use for the reshaped tensor. + * @return std::shared_ptr A new PyTensor with the reshaped tensor. + * + * @throws std::runtime_error if the total number of elements does not match. 
+ */ std::shared_ptr reshape(std::vector newShape, eTensorLayout layout); private: diff --git a/python/src/py_tensor.cpp b/python/src/py_tensor.cpp index 913a25a4..97d71c39 100644 --- a/python/src/py_tensor.cpp +++ b/python/src/py_tensor.cpp @@ -97,10 +97,12 @@ std::shared_ptr PyTensor::copyTo(eDeviceType device) { } std::shared_ptr PyTensor::fromDLPack(pybind11::object src, eTensorLayout layout) { + // Check if the object supports the DLPack protocol if (!py::hasattr(src, "__dlpack__")) { throw std::runtime_error("Provided object does not support the DLPack protocol."); } + // Obtain a DLPack capsule py::capsule dlpackCapsule = src.attr("__dlpack__")(); if (!PyCapsule_IsValid(dlpackCapsule.ptr(), "dltensor")) { throw std::runtime_error("Invalid DLPack capsule."); @@ -108,23 +110,42 @@ std::shared_ptr PyTensor::fromDLPack(pybind11::object src, eTensorLayo DLManagedTensor* dlManagedTensor = static_cast(dlpackCapsule.get_pointer()); DLTensor dlTensor = dlManagedTensor->dl_tensor; - // Mark this capsule as consumed, so that the deleter will not free underlying data. 
+ // Mark this capsule as consumed, so that the deleter will not free underlying data dlpackCapsule.set_name("used_dltensor"); - // Copy shape data - std::vector shapeData(dlTensor.ndim); - for (int i = 0; i < dlTensor.ndim; i++) { + // Copy the shape data from DLPack to a fixed-size array + std::array shapeData; + for (int i = 0; i < dlTensor.ndim; ++i) { shapeData[i] = dlTensor.shape[i]; } - - // Create a non-owning roccv::Tensor based on the received data - roccv::TensorShape shape(roccv::TensorLayout(layout), shapeData); + roccv::TensorShape shape(shapeData, dlTensor.ndim, roccv::TensorLayout(layout)); eDeviceType device = DLDeviceToRoccvDevice(dlTensor.device); - roccv::TensorRequirements reqs = - roccv::Tensor::CalcRequirements(shape, roccv::DataType(DLTypeToRoccvType(dlTensor.dtype)), device); - auto data = std::make_shared(dlTensor.data, device, eOwnership::VIEW); - auto tensor = std::make_shared(reqs, data); + eDataType dtype = DLTypeToRoccvType(dlTensor.dtype); + + // Prepare the strides array + std::array stridesData; + + // If strides are not present, assume contiguous layout. We really shouldn't be recalculating strides here. DLPack + // now enforces that the strides are present, but we'll keep this for backwards compatibility. + if (dlTensor.strides == nullptr) { + stridesData = roccv::Tensor::CalcStrides(shape, roccv::DataType(dtype), 0); + } else { + for (int i = 0; i < dlTensor.ndim; ++i) { + // DLTensor strides are element-wise. Convert from element-wise to byte-wise. + stridesData[i] = dlTensor.strides[i] * roccv::DataType(dtype).size(); + } + } + + // Set up the requirements for the new tensor. + roccv::Tensor::Requirements reqs = + roccv::Tensor::CalcRequirements(shape, roccv::DataType(dtype), stridesData, 0, device); + + // Since this tensor is coming from a DLPack, we don't own the data, so we need to create a view of the data. 
+ std::shared_ptr data = std::make_shared( + static_cast(dlTensor.data) + dlTensor.byte_offset, device, eOwnership::VIEW); + std::shared_ptr tensor = std::make_shared(reqs, data); + // Instantiate a new tensor and a PyTensor to wrap it, binding the original DLManagedTensor return std::make_shared(tensor, dlManagedTensor); } diff --git a/samples/bilateral_filter.cpp b/samples/bilateral_filter.cpp index 4b7d3864..fdfd11d0 100644 --- a/samples/bilateral_filter.cpp +++ b/samples/bilateral_filter.cpp @@ -21,172 +21,139 @@ THE SOFTWARE. */ #include +#include + #include #include -#include #include #include +#include "common/utils.hpp" + using namespace roccv; +struct Config { + std::string inputPath; + std::string outputPath = "output"; + int deviceId = 0; + int diameter = 2; + float sigmaSpace = 2.0f; + float sigmaColor = 10.0f; + eBorderType borderMode = eBorderType::BORDER_TYPE_REPLICATE; + float4 borderColor = {0.0f, 0.0f, 0.0f, 0.0f}; + eDeviceType device = eDeviceType::GPU; +}; + /** * @brief Bilateral filter operation example. */ - void ShowHelpAndExit(const char *option = NULL) { - std::cout << "Options: " << option << std::endl - << "-i Input File Path - required" << std::endl - << "-o Output File Path - optional; default: output.bmp" << std::endl - << "-cpu Select CPU instead of GPU to perform operation - optional; default choice is GPU path" << std::endl - << "-d GPU device ID (0 for the first device, 1 for the second, etc.) 
- optional; default: 0" << std::endl - << "-diameter Diameter of the filtering area - optional; default: 2" << std::endl - << "-sigma_space Spatial parameter sigma of the Gaussian function - optional; default: 2.0f" << std::endl - << "-sigma_color Range parameter sigma of the Gaussian function - optional; default: 10.0f" << std::endl - << "-border_mode Border mode at image boundary when work pixels are outside of the image (0: constant color; 1: replicate; 2: reflect; 3: wrap) - optional; default: 1 (replicate)" << std::endl - << "-border_color Border color for constant color border mode - optional; default: (0, 0, 0, 0)" << std::endl; - exit(0); +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "Usage: " << programName << " -i [-o ] [-d ] [-D ] [-s ] [-c ] [-b ] [-B ]" << std::endl; + std::cout << " -i, --input Input image or directory containing images (required)" << std::endl; + std::cout << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; + std::cout << " -d, --device Device ID to use for execution (optional, default: 0)" << std::endl; + std::cout << " -D, --diameter Diameter of the filtering area (optional, default: 2)" << std::endl; + std::cout << " -s, --sigma_space Spatial parameter sigma of the Gaussian function (optional, default: 2.0f)" << std::endl; + std::cout << " -c, --sigma_color Range parameter sigma of the Gaussian function (optional, default: 10.0f)" << std::endl; + std::cout << " -b, --border_mode Border mode at image boundary when work pixels are outside of the image (optional, default: 1 (replicate))" << std::endl; + std::cout << " -B, --border_color Border color for constant color border mode (optional, default: 0,0,0,0)" << std::endl; + std::cout << " -C, --cpu Use CPU for execution (optional, default: GPU)" << std::endl; + std::cout << " -h, --help Show this help message" << std::endl; + // clang-format on } -int main(int argc, char** argv) { - std::string 
input_file_path; - std::string output_file_path = "output.bmp"; - bool gpuPath = true; // use GPU by default - eDeviceType device = eDeviceType::GPU; - int deviceId = 0; - int diameter = 2; - float sigmaSpace = 2.0f; - float sigmaColor = 10.0f; - eBorderType borderMode = BORDER_TYPE_REPLICATE; - float4 borderColor = {0.0f, 0.0f, 0.0f, 0.0f}; +bool ParseBorderColor(const std::string& borderColorStr, float4& borderColor) { + return sscanf(borderColorStr.c_str(), "%f,%f,%f,%f", &borderColor.x, &borderColor.y, &borderColor.z, + &borderColor.w) == 4; +} - if(argc < 3) { - ShowHelpAndExit("-h"); - } - for (int i = 1; i < argc; i++) { - if (!strcmp(argv[i], "-h")) { - ShowHelpAndExit("-h"); - } - if (!strcmp(argv[i], "-i")) { - if (++i == argc) { - ShowHelpAndExit("-i"); - } - input_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-o")) { - if (++i == argc) { - ShowHelpAndExit("-o"); - } - output_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-diameter")) { - if (++i == argc) { - ShowHelpAndExit("-diameter"); - } - diameter = std::atoi(argv[i]); - continue; - } - if (!strcmp(argv[i], "-sigma_space")) { - if (++i == argc) { - ShowHelpAndExit("-sigma_space"); - } - sigmaSpace = std::atof(argv[i]); - continue; - } - if (!strcmp(argv[i], "-sigma_color")) { - if (++i == argc) { - ShowHelpAndExit("-sigma_color"); - } - sigmaColor = std::atof(argv[i]); - continue; - } - if (!strcmp(argv[i], "-border_mode")) { - if (++i == argc) { - ShowHelpAndExit("-border_mode"); - } - borderMode = static_cast(std::atoi(argv[i])); - continue; - } - if (!strcmp(argv[i], "-border_color")) { - i++; - if (i + 4 > argc) { - ShowHelpAndExit("-border_color"); - } - borderColor.x = static_cast(atoi(argv[i++])); - borderColor.y = static_cast(atoi(argv[i++])); - borderColor.z = static_cast(atoi(argv[i++])); - borderColor.w = static_cast(atoi(argv[i])); - continue; - } - if (!strcmp(argv[i], "-cpu")) { - gpuPath = false; - continue; +int main(int argc, char** argv) { + Config 
config; + + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"device", required_argument, nullptr, 'd'}, + {"diameter", required_argument, nullptr, 'D'}, + {"sigma_space", required_argument, nullptr, 's'}, + {"sigma_color", required_argument, nullptr, 'c'}, + {"border_mode", required_argument, nullptr, 'b'}, + {"border_color", required_argument, nullptr, 'B'}, + {"cpu", no_argument, nullptr, 'C'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + + int opt; + while ((opt = getopt_long(argc, argv, "i:o:d:D:s:c:b:B:h:C", longOptions, nullptr)) != -1) { + switch (opt) { + case 'i': + config.inputPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'D': + config.diameter = std::stoi(optarg); + break; + case 's': + config.sigmaSpace = std::stof(optarg); + break; + case 'c': + config.sigmaColor = std::stof(optarg); + break; + case 'b': + config.borderMode = static_cast(std::stoi(optarg)); + break; + case 'B': + if (!ParseBorderColor(optarg, config.borderColor)) { + std::cerr << "Invalid border color format. 
Use: r,g,b,a (e.g., 0,0,0,0)\n"; + return EXIT_FAILURE; + } + break; + case 'C': + config.device = eDeviceType::CPU; + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; } } - if (gpuPath) { - device = eDeviceType::GPU; - HIP_VALIDATE_NO_ERRORS(hipSetDevice(deviceId)); - } else { - device = eDeviceType::CPU; - } - hipStream_t stream = nullptr; - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); + return EXIT_FAILURE; } - cv::Mat imageData = cv::imread(input_file_path); - if (imageData.empty()) { - std::cerr << "Failed to read the input image file" << std::endl; - exit(1); + if (config.device == eDeviceType::GPU) { + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); } - // Create input/output tensors for the image. - TensorShape imageShape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), {1, imageData.rows, imageData.cols, imageData.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); - Tensor input(imageShape, dtype, device); - Tensor output(imageShape, dtype, device); - - // Move image data to input tensor - size_t imageSizeInByte = input.shape().size() * input.dtype().size(); - auto inputData = input.exportData(); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(inputData.basePtr(), imageData.data, imageSizeInByte, hipMemcpyHostToDevice, stream)); - } else { - memcpy(inputData.basePtr(), imageData.data, imageSizeInByte); - } + // Create stream + hipStream_t stream; + CHECK_HIP_ERROR(hipStreamCreate(&stream)); + + // Load input image + Tensor input = LoadImages(stream, config.inputPath.c_str(), config.device); + // Create output tensor + Tensor output(input.shape(), input.dtype(), config.device); + + // Create BilateralFilter operator BilateralFilter op; - op(stream, input, output, diameter, sigmaColor, sigmaSpace, borderMode, borderColor, device); - - // Move 
image data back to host - size_t outputSize = output.shape().size() * output.dtype().size(); - auto outData = output.exportData(); - std::vector h_output(outputSize); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(h_output.data(), outData.basePtr(), outputSize, hipMemcpyDeviceToHost, stream)); - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); - } else { - memcpy(h_output.data(), outData.basePtr(), outputSize); - } + op(stream, input, output, config.diameter, config.sigmaColor, config.sigmaSpace, config.borderMode, + config.borderColor, config.device); - // Write output image to disk - cv::Mat outImageData(imageData.rows, imageData.cols, imageData.type(), h_output.data()); - bool ret = cv::imwrite(output_file_path, outImageData); - if (!ret) { - std::cerr << "Faild to save output image to the file" << std::endl; - exit(1); - } + WriteImages(stream, output, config.outputPath); - std::cout << "Input image file: " << input_file_path << std::endl; - std::cout << "Output image file: " << output_file_path << std::endl; - if (gpuPath) { - std::cout << "Operation on GPU device " << deviceId << std::endl; - } else { - std::cout << "Operation on CPU" << std::endl; - } - std::cout << "Image size: width = " << imageData.cols << ", height = " << imageData.rows << std::endl; + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file diff --git a/samples/bnd_box.cpp b/samples/bnd_box.cpp index 72ccbf06..621d3ef7 100644 --- a/samples/bnd_box.cpp +++ b/samples/bnd_box.cpp @@ -21,6 +21,7 @@ THE SOFTWARE. */ #include +#include #include #include @@ -28,239 +29,249 @@ THE SOFTWARE. #include #include -using namespace roccv; +#include "common/utils.hpp" -/** - * @brief Bounding Box operation example. 
- */ +using namespace roccv; /** - * @brief Example bounding box list file content - * 1 <-- number of images - * 2 <-- number of boxes for image 1 - * 50 <-- X coordinate of top-left corner of box 1 - * 50 <-- Y coordinate of top-left corner of box 1 - * 100 <-- width of box 1 - * 50 <-- height of box 1 - * 5 <-- thickness of box boundary of box 1 - * 0 <-- B component of box border color of box 1 - * 0 <-- G component of box border color of box 1 - * 255 <-- R component of box border color of box 1 - * 200 <-- alpha component of box border color of box 1 - * 0 <-- B component of box fill color of box 1 - * 255 <-- G component of box fill color of box 1 - * 0 <-- R component of box fill color of box 1 - * 100 <-- alpha component of box fill color of box 1 - * 250 <-- X coordinate of top-left corner of box 2 - * 250 <-- Y coordinate of top-left corner of box 2 - * 50 <-- width of box 2 - * 100 <-- height of box 2 - * 10 <-- thickness of box boundary of box 2 - * 255 <-- B component of box border color of box 2 - * 0 <-- G component of box border color of box 2 - * 0 <-- R component of box border color of box 2 - * 200 <-- alpha component of box border color of box 2 - * 0 <-- B component of box fill color of box 2 - * 0 <-- G component of box fill color of box 2 - * 0 <-- R component of box fill color of box 2 - * 0 <-- alpha component of box fill color of box 2 + * @file bnd_box.cpp + * @brief Example application for drawing bounding boxes on images using ROC CV. + * + * This file demonstrates how to parse a bounding box description file and apply bounding boxes to input images. + * The bounding box file is expected to have the following format: + * - The first line contains the number of images. + * - Each image section starts with a line containing the number of boxes for that image. 
+ * - Each box is described over 13 subsequent lines as follows: + * - X coordinate of top-left corner + * - Y coordinate of top-left corner + * - Width + * - Height + * - Thickness of box boundary + * - B component of box border color + * - G component of box border color + * - R component of box border color + * - Alpha component of box border color + * - B component of box fill color + * - G component of box fill color + * - R component of box fill color + * - Alpha component of box fill color + * + * Example of bounding box list file content: + * @code + * 1 + * 2 + * 50 + * 50 + * 100 + * 50 + * 5 + * 0 + * 0 + * 255 + * 200 + * 0 + * 255 + * 0 + * 100 + * 250 + * 250 + * 50 + * 100 + * 10 + * 255 + * 0 + * 0 + * 200 + * 0 + * 0 + * 0 + * 0 + * @endcode */ -void ShowHelpAndExit(const char* option = NULL) { - std::cout << "Options: " << option << std::endl - << "-i Input File Path - required" << std::endl - << "-o Output File Path - optional; default: output.bmp" << std::endl - << "-cpu Select CPU instead of GPU to perform operation - optional; default choice is GPU path" - << std::endl - << "-d GPU device ID (0 for the first device, 1 for the second, etc.) 
- optional; default: 0" << std::endl - << "-box_file Bounding box list file - optional; default: use the set value in the app" << std::endl; - exit(0); -} - -int main(int argc, char** argv) { - std::string input_file_path; - std::string box_file_path; - std::string output_file_path = "output.bmp"; - bool gpuPath = true; // use GPU by default +struct Config { + std::string inputPath; + std::string outputPath = "output"; eDeviceType device = eDeviceType::GPU; int deviceId = 0; - bool boxSet = false; // User sets the bounding box list data in a text file - - if (argc < 3) { - ShowHelpAndExit("-h"); - } - for (int i = 1; i < argc; i++) { - if (!strcmp(argv[i], "-h")) { - ShowHelpAndExit("-h"); - } - if (!strcmp(argv[i], "-i")) { - if (++i == argc) { - ShowHelpAndExit("-i"); - } - input_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-o")) { - if (++i == argc) { - ShowHelpAndExit("-o"); - } - output_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-box_file")) { - if (++i == argc) { - ShowHelpAndExit("-o"); - } - box_file_path = argv[i]; - boxSet = true; - continue; - } - if (!strcmp(argv[i], "-cpu")) { - gpuPath = false; - continue; - } - } + std::string boundingBoxFilePath = ""; +}; - if (gpuPath) { - device = eDeviceType::GPU; - HIP_VALIDATE_NO_ERRORS(hipSetDevice(deviceId)); - } else { - device = eDeviceType::CPU; - } +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "Usage: " << programName << " -i [-o ] [-b ] [-d ] [-C]" << std::endl; + std::cout << " -i, --input Input image or directory containing images (required)" << std::endl; + std::cout << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; + std::cout << " -b, --box_file Bounding box list file (optional, default: use the set value in the app)" << std::endl; + std::cout << " -d, --device Device ID to use for execution (optional, default: 0)" << std::endl; + std::cout << " -C, --cpu Use CPU for 
execution (optional, default: GPU)" << std::endl; + std::cout << " -h, --help Show this help message" << std::endl; + // clang-format on +} - cv::Mat imageData = cv::imread(input_file_path); - if (imageData.empty()) { - std::cerr << "Failed to read the input image file" << std::endl; - exit(1); +/** + * @brief Parse bounding box file and setup bounding box vector. + * + * @param[in] boundingBoxFilePath Path to bounding box list file. + * @param[out] bbox_vector Vector of bounding box vectors to be filled. + * @return void + * @throws std::runtime_error if failed to open bounding box file. + */ +void ParseBoundingBoxFile(const std::string& boundingBoxFilePath, std::vector>& bbox_vector) { + std::ifstream file(boundingBoxFilePath); + if (!file.is_open()) { + throw std::runtime_error("Failed to open bounding box file " + boundingBoxFilePath); } - int batchSize = 1; - std::vector> bbox_vector; - if (boxSet) { - std::ifstream box_list_file(box_file_path); - if (box_list_file.is_open()) { - std::string line; - std::getline(box_list_file, line); - batchSize = std::stoi(line.c_str()); - if (batchSize > 0) { - bbox_vector.resize(batchSize); - for (int i = 0; i < batchSize; i++) { - std::getline(box_list_file, line); - int numBoxes = std::stoi(line.c_str()); - if (numBoxes > 0) { - for (int b = 0; b < numBoxes; b++) { - BndBox_t box; - std::getline(box_list_file, line); - box.box.x = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.box.y = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.box.width = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.box.height = std::atoi(line.c_str()); + int numImages; + file >> numImages; + bbox_vector.resize(numImages); + for (int i = 0; i < numImages; i++) { + int numBoxes; + file >> numBoxes; - std::getline(box_list_file, line); - box.thickness = std::atoi(line.c_str()); + bbox_vector[i].resize(numBoxes); + for (int j = 0; j < numBoxes; j++) { + // Parse each box from 13 lines, in 
order: + // 1. x (top-left corner) + // 2. y (top-left corner) + // 3. width + // 4. height + // 5. thickness (box boundary) + // 6. border B + // 7. border G + // 8. border R + // 9. border A (alpha) + // 10. fill B + // 11. fill G + // 12. fill R + // 13. fill A (alpha) + // (13 values total per box) - std::getline(box_list_file, line); - box.borderColor.r = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.borderColor.g = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.borderColor.b = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.borderColor.a = std::atoi(line.c_str()); + // Read box dimensions + file >> bbox_vector[i][j].box.x >> bbox_vector[i][j].box.y >> bbox_vector[i][j].box.width >> + bbox_vector[i][j].box.height; + file >> bbox_vector[i][j].thickness; - std::getline(box_list_file, line); - box.fillColor.r = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.fillColor.g = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.fillColor.b = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.fillColor.a = std::atoi(line.c_str()); + // Read colors into temp ints + int r, g, b, a; + file >> b >> g >> r >> a; + bbox_vector[i][j].borderColor = {static_cast(r), static_cast(g), static_cast(b), + static_cast(a)}; - bbox_vector[i].push_back(box); - } - } else { - std::cerr << "Invalid number of boxes: " << numBoxes << "for image: " << i << std::endl; - exit(1); - } - } - } else { - std::cerr << "Invalid batch size: " << batchSize << std::endl; - exit(1); - } - } else { - std::cerr << "Failed to open bounding box list file " << box_file_path << std::endl; - exit(1); + file >> b >> g >> r >> a; + bbox_vector[i][j].fillColor = {static_cast(r), static_cast(g), static_cast(b), + static_cast(a)}; } - } else { - auto width = imageData.cols; - auto height = imageData.rows; - bbox_vector = { - { + } +} + +/** + * @brief Setup bounding box vector from file or default 
values. + * + * @param[in] batchSize Number of images in the batch. + * @param[in] width Width of each image in the batch. + * @param[in] height Height of each image in the batch. + * @param[in] boundingBoxFilePath Path to bounding box list file. + * @return Vector of bounding box vectors. + */ +std::vector> SetupBoundingBoxVector(int64_t batchSize, int64_t width, int64_t height, + const std::string& boundingBoxFilePath) { + std::vector> bbox_vector; + + if (boundingBoxFilePath.empty()) { + for (int64_t b = 0; b < batchSize; b++) { + bbox_vector.push_back({ {{width / 4, height / 4, width / 2, height / 2}, 5, {0, 0, 255, 200}, {0, 255, 0, 100}}, {{width / 3, height / 3, width / 3 * 2, height / 4}, -1, {90, 16, 181, 50}, {0, 0, 0, 0}}, {{-50, (height * 2) / 3, width + 50, height / 3 + 50}, 0, {0, 0, 0, 0}, {111, 159, 232, 150}}, - }, - }; + }); + } + } else { + ParseBoundingBoxFile(boundingBoxFilePath, bbox_vector); } - BndBoxes bboxes(bbox_vector); - // Create input/output tensors for the image. - TensorShape imageShape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, imageData.rows, imageData.cols, imageData.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); - Tensor input(imageShape, dtype, device); - Tensor output(imageShape, dtype, device); + return bbox_vector; +} - hipStream_t stream = nullptr; - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); +/** + * @brief Main function to run the bounding box operation example. + * + * @param[in] argc Number of command line arguments. + * @param[in] argv Command line arguments. + * @return int Exit status. 
+ */ +int main(int argc, char** argv) { + Config config; + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"box_file", required_argument, nullptr, 'b'}, + {"device", required_argument, nullptr, 'd'}, + {"cpu", no_argument, nullptr, 'C'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + int opt; + while ((opt = getopt_long(argc, argv, "i:o:b:d:h:C", longOptions, nullptr)) != -1) { + switch (opt) { + case 'i': + config.inputPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 'b': + config.boundingBoxFilePath = optarg; + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'C': + config.device = eDeviceType::CPU; + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; + } } - // Move image data to input tensor - size_t imageSizeInByte = input.shape().size() * input.dtype().size(); - auto inputData = input.exportData(); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS( - hipMemcpyAsync(inputData.basePtr(), imageData.data, imageSizeInByte, hipMemcpyHostToDevice, stream)); - } else { - memcpy(inputData.basePtr(), imageData.data, imageSizeInByte); + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); + return EXIT_FAILURE; } - BndBox op; - op(stream, input, output, bboxes, device); - - // Move image data back to host - size_t outputSize = output.shape().size() * output.dtype().size(); - auto outData = output.exportData(); - std::vector h_output(outputSize); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS( - hipMemcpyAsync(h_output.data(), outData.basePtr(), outputSize, hipMemcpyDeviceToHost, stream)); - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); - } else { - memcpy(h_output.data(), outData.basePtr(), outputSize); + if (config.device == eDeviceType::GPU) { + 
CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); } - // Write output image to disk - cv::Mat outImageData(imageData.rows, imageData.cols, imageData.type(), h_output.data()); - bool ret = cv::imwrite(output_file_path, outImageData); - if (!ret) { - std::cerr << "Faild to save output image to the file" << std::endl; - exit(1); - } + hipStream_t stream; + CHECK_HIP_ERROR(hipStreamCreate(&stream)); - std::cout << "Input image file: " << input_file_path << std::endl; - std::cout << "Output image file: " << output_file_path << std::endl; - if (gpuPath) { - std::cout << "Operation on GPU device " << deviceId << std::endl; - } else { - std::cout << "Operation on CPU" << std::endl; - } - std::cout << "Image size: width = " << imageData.cols << ", height = " << imageData.rows << std::endl; + // Load images + Tensor input = LoadImages(stream, config.inputPath.c_str(), config.device); + int64_t batchSize = input.shape()[input.shape().layout().batch_index()]; + int64_t height = input.shape()[input.shape().layout().height_index()]; + int64_t width = input.shape()[input.shape().layout().width_index()]; + + // Setup bounding box vector + std::vector> bbox_vector = + SetupBoundingBoxVector(batchSize, width, height, config.boundingBoxFilePath); + + BndBoxes bboxes(bbox_vector); + + // Create output tensor + Tensor output(input.shape(), input.dtype(), config.device); + + // Create BndBox operator + BndBox op; + op(stream, input, output, bboxes, config.device); + + // Write output images to disk + WriteImages(stream, output, config.outputPath); + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file diff --git a/samples/center_crop.cpp b/samples/center_crop.cpp index 123e8f26..7a460773 100644 --- a/samples/center_crop.cpp +++ b/samples/center_crop.cpp @@ -21,143 +21,120 @@ THE SOFTWARE. 
*/ #include +#include + #include #include #include #include +#include "common/utils.hpp" + using namespace roccv; +struct Config { + std::string inputPath; + std::string outputPath = "output"; + eDeviceType device = eDeviceType::GPU; + int deviceId = 0; + Size2D cropArea = {1, 1}; +}; + +bool ParseCropArea(const std::string& cropStr, Size2D& cropArea) { + std::istringstream iss(cropStr); + std::string token; + getline(iss, token, ','); + cropArea.w = std::stoi(token); + getline(iss, token, ','); + cropArea.h = std::stoi(token); + return true; +} + /** * @brief Center crop operation example. */ -void ShowHelpAndExit(const char *option = NULL) { - std::cout << "Options: " << option << std::endl - << "-i Input File Path - required" << std::endl - << "-o Output File Path - optional; default: output.bmp" << std::endl - << "-cpu Select CPU instead of GPU to perform operation - optional; default choice is GPU path" << std::endl - << "-d GPU device ID (0 for the first device, 1 for the second, etc.) 
- optional; default: 0" << std::endl - << "-crop Center crop area (width, height)- optional; default: use the set value in the app" << std::endl; - exit(0); +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "Usage: " << programName << " -i [-o ] [-d ] [-crop ] [-C]" << std::endl; + std::cout << " -i, --input Input image or directory containing images (required)" << std::endl; + std::cout << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; + std::cout << " -d, --device Device ID to use for execution (optional, default: 0)" << std::endl; + std::cout << " -c, --crop Crop area as comma separated values (optional, default: 1,1)" << std::endl; + std::cout << " -C, --cpu Use CPU for execution (optional, default: GPU)" << std::endl; + std::cout << " -h, --help Show this help message" << std::endl; + // clang-format on } - int main(int argc, char** argv) { - std::string input_file_path; - std::string output_file_path = "output.bmp"; - bool gpuPath = true; // use GPU by default - eDeviceType device = eDeviceType::GPU; - int deviceId = 0; - Size2D cropArea = {1, 1}; - bool cropSet = false; - - if(argc < 3) { - ShowHelpAndExit("-h"); - } - for (int i = 1; i < argc; i++) { - if (!strcmp(argv[i], "-h")) { - ShowHelpAndExit("-h"); - } - if (!strcmp(argv[i], "-i")) { - if (++i == argc) { - ShowHelpAndExit("-i"); - } - input_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-o")) { - if (++i == argc) { - ShowHelpAndExit("-o"); - } - output_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-crop")) { - i++; - if (i + 2 > argc) { - ShowHelpAndExit("-crop"); - } - cropArea.w = atoi(argv[i++]); - cropArea.h = atoi(argv[i]); - cropSet = true; - continue; - } - if (!strcmp(argv[i], "-cpu")) { - gpuPath = false; - continue; + Config config; + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"device", 
required_argument, nullptr, 'd'}, + {"crop", required_argument, nullptr, 'c'}, + {"cpu", no_argument, nullptr, 'C'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + int opt; + while ((opt = getopt_long(argc, argv, "i:o:d:c:h:C", longOptions, nullptr)) != -1) { + switch (opt) { + case 'i': + config.inputPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'c': + if (!ParseCropArea(optarg, config.cropArea)) { + std::cerr << "Invalid crop area format. Use: width,height (e.g., 1,1)\n"; + return EXIT_FAILURE; + } + break; + case 'C': + config.device = eDeviceType::CPU; + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; } } - if (gpuPath) { - device = eDeviceType::GPU; - HIP_VALIDATE_NO_ERRORS(hipSetDevice(deviceId)); - } else { - device = eDeviceType::CPU; + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); + return EXIT_FAILURE; } - cv::Mat imageData = cv::imread(input_file_path); - if (imageData.empty()) { - std::cerr << "Failed to read the input image file" << std::endl; - exit(1); - } - if (!cropSet) { - // Set a safe crop area if no user input - cropArea = {(imageData.cols / 2), (imageData.rows / 2)}; + if (config.device == eDeviceType::GPU) { + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); } - // Create input/output tensors for the image. 
- TensorShape inputShape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), {1, imageData.rows, imageData.cols, imageData.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); - Tensor input(inputShape, dtype, device); + hipStream_t stream; + CHECK_HIP_ERROR(hipStreamCreate(&stream)); - TensorShape outShape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), {1, cropArea.h, cropArea.w, imageData.channels()}); - Tensor output(outShape, dtype, device); + // Load input images + Tensor input = LoadImages(stream, config.inputPath.c_str(), config.device); - hipStream_t stream = nullptr; - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); - } + // Create output shape + TensorShape outputShape(input.layout(), {input.shape(input.layout().batch_index()), config.cropArea.h, + config.cropArea.w, input.shape(input.layout().channels_index())}); - // Move image data to input tensor - size_t imageSizeInByte = input.shape().size() * input.dtype().size(); - auto input_data = input.exportData(); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(input_data.basePtr(), imageData.data, imageSizeInByte, hipMemcpyHostToDevice, stream)); - } else { - memcpy(input_data.basePtr(), imageData.data, imageSizeInByte); - } + // Create output tensor + Tensor output(outputShape, input.dtype(), config.device); + // Create CenterCrop operator CenterCrop op; - op(stream, input, output, cropArea, device); - - // Move image data back to host - size_t outputSize = output.shape().size() * output.dtype().size(); - auto outData = output.exportData(); - std::vector h_output(outputSize); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(h_output.data(), outData.basePtr(), outputSize, hipMemcpyDeviceToHost, stream)); - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); - } else { - memcpy(h_output.data(), outData.basePtr(), outputSize); - } + op(stream, input, output, config.cropArea, config.device); - // Write output image to disk - cv::Mat output_imageData(cropArea.h, 
cropArea.w, imageData.type(), h_output.data()); - bool ret = cv::imwrite(output_file_path, output_imageData); - if (!ret) { - std::cerr << "Faild to save output image to the file" << std::endl; - exit(1); - } + // Write output images to disk + WriteImages(stream, output, config.outputPath); - std::cout << "Input image file: " << input_file_path << std::endl; - std::cout << "Output image file: " << output_file_path << std::endl; - if (gpuPath) { - std::cout << "Operation on GPU device " << deviceId << std::endl; - } else { - std::cout << "Operation on CPU" << std::endl; - } - std::cout << "Input image size: width = " << imageData.cols << ", height = " << imageData.rows << std::endl; - std::cout << "Cropping area: width = " << cropArea.w << ", height = " << cropArea.h << std::endl; + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file diff --git a/samples/common/utils.hpp b/samples/common/utils.hpp index 61ae525d..817cac9d 100644 --- a/samples/common/utils.hpp +++ b/samples/common/utils.hpp @@ -23,6 +23,7 @@ #include #include +#include #include inline void CheckHIPError(hipError_t code, const char *file, const int line) { @@ -39,7 +40,7 @@ inline void CheckHIPError(hipError_t code, const char *file, const int line) { CheckHIPError((val), __FILE__, __LINE__); \ } -bool ContainsExtension(const std::filesystem::path &path, const std::vector &extension_list) { +inline bool ContainsExtension(const std::filesystem::path &path, const std::vector &extension_list) { for (auto extension : extension_list) { if (path.extension() == extension) return true; } @@ -47,74 +48,155 @@ bool ContainsExtension(const std::filesystem::path &path, const std::vector(); + params.rowPitch = tensorData.stride(tensor.layout().height_index()); + params.rowBytes = tensor.shape(tensor.layout().width_index()) * tensor.shape(tensor.layout().channels_index()) * + tensor.dtype().size(); + params.imageBytes = params.rowPitch * 
tensor.shape(tensor.layout().height_index()); + params.basePtr = tensorData.basePtr(); + + return params; +} + /** - * @brief Loads images into the GPU memory specified. + * @brief Loads an image, or multiple images if given a directory, into a tensor. Will be in NHWC layout and U8 format. + * All images must be of the same size and format. This operation will block on the provided stream. * - * @param images_dir Either a directory or a single image to load into GPU memory. - * @param num_images The number of images to load into GPU memory. - * @param gpu_input A pointer to valid GPU memory. + * @param image_path The path to the image to load. If a directory is provided, all supported images in the directory + * will be loaded. + * @param device The device to load the images onto. Defaults to GPU. + * @param openCVFlags The OpenCV flags to use when loading the images. Defaults to IMREAD_UNCHANGED. + * @return A NHWC tensor containing the loaded images. */ -void DecodeRGBIImage(const std::string &images_dir, int num_images, void *gpu_input) { +inline roccv::Tensor LoadImages(hipStream_t stream, const std::string &image_path, + eDeviceType device = eDeviceType::GPU, int openCVFlags = cv::IMREAD_UNCHANGED) { const std::vector supportedExtensions = {".bmp", ".jpg", ".jpeg", ".png"}; - std::vector imageFiles; - if (std::filesystem::is_directory(images_dir)) { - // A directory is provided. Collect all supported files in the directory (non-recursively). 
- for (auto file : std::filesystem::directory_iterator(images_dir)) { + std::vector images; + + int width = -1; + int height = -1; + int channels = -1; + + // Load images from directory or file if a single image is provided + if (std::filesystem::is_directory(image_path)) { + for (auto file : std::filesystem::directory_iterator(image_path)) { if (!std::filesystem::is_directory(file.path()) && ContainsExtension(file.path(), supportedExtensions)) { - imageFiles.push_back(file.path()); + cv::Mat image = cv::imread(file.path(), openCVFlags); + if (image.empty()) { + throw std::runtime_error("Cannot decode " + file.path().string() + ". File type not supported.\n"); + } + images.push_back(image); + + // Check if all images are of the same size + if (width == -1 && height == -1 && channels == -1) { + width = image.cols; + height = image.rows; + channels = image.channels(); + } else if (image.cols != width || image.rows != height || image.channels() != channels) { + throw std::runtime_error("All images must be of the same size and format"); + } } } - - // Throw an error if there were no valid images found in the given directory - if (imageFiles.empty()) { - throw std::runtime_error("No valid images found in directory " + images_dir); + } else if (std::filesystem::is_regular_file(image_path) && ContainsExtension(image_path, supportedExtensions)) { + cv::Mat image = cv::imread(image_path, openCVFlags); + if (image.empty()) { + throw std::runtime_error("Cannot decode " + image_path + ". File type not supported.\n"); } + images.push_back(image); + width = image.cols; + height = image.rows; + channels = image.channels(); } else { - // A single image file is provided - if (!ContainsExtension(images_dir, supportedExtensions)) - throw std::runtime_error("Cannot decode " + images_dir + ". File type not supported.\n"); - imageFiles.push_back(images_dir); + throw std::runtime_error("Cannot decode " + image_path + ". 
File type not supported.\n"); } - // Load images into provided GPU memory - size_t mem_offset = 0; - for (int b = 0; b < num_images; b++) { - cv::Mat inputMat = cv::imread(imageFiles[b]); - if (inputMat.empty()) { - throw std::runtime_error("Unable to load image " + imageFiles[b]); - } + if (images.empty()) { + throw std::runtime_error("No valid images found in directory " + image_path); + } - size_t imageSize = inputMat.rows * inputMat.cols * inputMat.channels() * sizeof(uint8_t); - CHECK_HIP_ERROR( - hipMemcpy(static_cast(gpu_input) + mem_offset, inputMat.data, imageSize, hipMemcpyHostToDevice)); - mem_offset += imageSize; + // Create tensor and prepare arguments for hipMemcpy2D + roccv::Tensor tensor(images.size(), roccv::Size2D(width, height), + roccv::ImageFormat(eDataType::DATA_TYPE_U8, channels), device); + + MemcpyParams params = GetMemcpyParams(tensor); + + // Copy images into tensor + hipMemcpyKind kind = (device == eDeviceType::GPU) ? hipMemcpyHostToDevice : hipMemcpyHostToHost; + for (int i = 0; i < images.size(); i++) { + CHECK_HIP_ERROR(hipMemcpy2DAsync(static_cast(params.basePtr) + i * params.imageBytes, + params.rowPitch, images[i].data, params.rowBytes, params.rowBytes, height, + kind, stream)); } + + // Ensure all memory operations are completed before returning the tensor + CHECK_HIP_ERROR(hipStreamSynchronize(stream)); + + return tensor; } /** - * @brief Writes a batch of 3-channel RGBI images in a tensor to .bmp files. This will also block on the provided - * stream. + * @brief Writes a batch of images from a tensor to the specified output path. This is a blocking operation. * - * @param tensor A tensor containing a batch of RGBI images. - * @param stream The HIP stream to synchronize with. + * @param tensor The tensor to write the images from. + * @param output_path The path to write the images to. If a directory is provided, the images will be written to the + * directory. 
*/ -void WriteRGBITensor(const roccv::Tensor &tensor, hipStream_t stream) { - CHECK_HIP_ERROR(hipStreamSynchronize(stream)); +inline void WriteImages(hipStream_t stream, const roccv::Tensor &tensor, const std::string &output_path) { + if (tensor.layout() != eTensorLayout::TENSOR_LAYOUT_NHWC && tensor.layout() != eTensorLayout::TENSOR_LAYOUT_HWC) { + throw std::runtime_error( + "Unsupported tensor layout in WriteImages(). Only NHWC and HWC layouts are supported."); + } - auto srcData = tensor.exportData(); - int batchSize = tensor.shape(tensor.layout().batch_index()); - int height = tensor.shape(tensor.layout().height_index()); - int width = tensor.shape(tensor.layout().width_index()); + int64_t height = tensor.shape(tensor.layout().height_index()); + int64_t width = tensor.shape(tensor.layout().width_index()); + int64_t batchSize = + tensor.layout().batch_index() < 0 ? 1 : tensor.shape(tensor.layout().batch_index()); // Support for HWC layout + int64_t channels = tensor.shape(tensor.layout().channels_index()); - // Write each image in the batch to separate .bmp files - for (int b = 0; b < batchSize; b++) { - std::ostringstream outFilename; - outFilename << "./roccvtest_" << b << ".bmp"; + // Get OpenCV image format + int64_t cvFormat = CV_MAKETYPE(CV_8U, channels); - cv::Mat outputMat(height, width, CV_8UC3); - CHECK_HIP_ERROR(hipMemcpy(outputMat.data, srcData.basePtr(), - (tensor.shape().size() / batchSize) * tensor.dtype().size(), hipMemcpyDeviceToHost)); - cv::imwrite(outFilename.str().c_str(), outputMat); + // Get memcpy parameters + MemcpyParams params = GetMemcpyParams(tensor); + hipMemcpyKind kind = (tensor.device() == eDeviceType::GPU) ? 
hipMemcpyDeviceToHost : hipMemcpyHostToHost; + + // Copy images from tensor to OpenCV image vector + std::vector images(batchSize); + for (int i = 0; i < batchSize; i++) { + images[i] = cv::Mat(height, width, cvFormat); + CHECK_HIP_ERROR(hipMemcpy2DAsync(images[i].data, params.rowBytes, + static_cast(params.basePtr) + i * params.imageBytes, + params.rowPitch, params.rowBytes, height, kind, stream)); + } + + // Ensure all memory operations are completed before writing images + CHECK_HIP_ERROR(hipStreamSynchronize(stream)); + + std::filesystem::path outputPath(output_path); + if (outputPath.extension().empty()) { + for (int i = 0; i < batchSize; i++) { + std::filesystem::create_directories(outputPath); + std::filesystem::path outFilename = outputPath / std::format("image_{}.bmp", i); + cv::imwrite(outFilename.string(), images[i]); + } + } else { + cv::imwrite(output_path, images[0]); } } \ No newline at end of file diff --git a/samples/composite.cpp b/samples/composite.cpp index cbc5507e..bd21948f 100644 --- a/samples/composite.cpp +++ b/samples/composite.cpp @@ -19,89 +19,116 @@ * THE SOFTWARE. */ +#include + #include #include #include +#include "common/utils.hpp" + +/** @file composite.cpp + * @brief Sample application for the Composite operation. + * + * This sample application demonstrates the Composite operation, which blends the foreground image into the background + * image using a grayscale alpha mask. It loads the background, foreground, and mask images, runs the Composite + * operation, and writes the output images to disk. 
+ */ + using namespace roccv; +struct Config { + std::string backgroundPath; + std::string foregroundPath; + std::string maskPath; + std::string outputPath = "output"; + eDeviceType device = eDeviceType::GPU; + int deviceId = 0; +}; + +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "Usage: " << programName << " -b -f -m -o -d " << std::endl; + std::cout << " -b, --background Background image filename (required)" << std::endl; + std::cout << " -f, --foreground Foreground image filename (required)" << std::endl; + std::cout << " -m, --mask Mask image filename (required)" << std::endl; + std::cout << " -o, --output Output image filename (optional, default: output)" << std::endl; + std::cout << " -d, --device Device ID to use for execution (optional, default: 0)" << std::endl; + std::cout << " -h, --help Show this help message" << std::endl; + std::cout << std::endl; + std::cout << "NOTE: If you use directories for the background, foreground, or mask inputs, each directory must contain the same number of images. Each image in the directories must be the same size." << std::endl; + // clang-format on +} + +/** + * @brief Main function for the Composite operation. + * + * @param argc Number of command line arguments. + * @param argv Command line arguments. + * @return EXIT_SUCCESS if the operation completed successfully, EXIT_FAILURE otherwise. 
+ */ int main(int argc, char** argv) { - if (argc != 6) { - std::cerr << "Usage: " << argv[0] - << " " - << std::endl; + Config config; + static struct option longOptions[] = {{"background", required_argument, nullptr, 'b'}, + {"foreground", required_argument, nullptr, 'f'}, + {"mask", required_argument, nullptr, 'm'}, + {"output", required_argument, nullptr, 'o'}, + {"device", required_argument, nullptr, 'd'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + int opt; + while ((opt = getopt_long(argc, argv, "b:f:m:o:d:h", longOptions, nullptr)) != -1) { + switch (opt) { + case 'b': + config.backgroundPath = optarg; + break; + case 'f': + config.foregroundPath = optarg; + break; + case 'm': + config.maskPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; + } + } + if (config.backgroundPath.empty() || config.foregroundPath.empty() || config.maskPath.empty()) { + std::cerr << "Error: Background, foreground, and mask paths are required.\n\n"; + PrintUsage(argv[0]); return EXIT_FAILURE; } - hipStream_t stream; - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); - - HIP_VALIDATE_NO_ERRORS(hipSetDevice(std::stoi(argv[5]))); - - cv::Mat background_data = cv::imread(argv[1]); - cv::Mat foreground_data = cv::imread(argv[2]); - cv::Mat mask_data = cv::imread(argv[3], cv::IMREAD_GRAYSCALE); - - // Create input/output tensors for the image. 
- DataType dtype(eDataType::DATA_TYPE_U8); - TensorShape background_shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, background_data.rows, background_data.cols, background_data.channels()}); - Tensor background_tensor(background_shape, dtype); - - TensorShape foreground_shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, foreground_data.rows, foreground_data.cols, foreground_data.channels()}); - Tensor foreground_tensor(foreground_shape, dtype); - - TensorShape mask_shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, mask_data.rows, mask_data.cols, mask_data.channels()}); - Tensor mask_tensor(mask_shape, dtype); - - Tensor output_tensor(background_shape, dtype); - - auto bt_data = background_tensor.exportData(); - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(bt_data.basePtr(), background_data.data, - bt_data.shape().size() * bt_data.dtype().size(), hipMemcpyHostToDevice, - stream)); - - auto ft_data = foreground_tensor.exportData(); - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(ft_data.basePtr(), foreground_data.data, - foreground_tensor.shape().size() * foreground_tensor.dtype().size(), - hipMemcpyHostToDevice, stream)); - - auto m_data = mask_tensor.exportData(); - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(m_data.basePtr(), mask_data.data, - mask_tensor.shape().size() * mask_tensor.dtype().size(), - hipMemcpyHostToDevice, stream)); - - hipEvent_t begin, end; - HIP_VALIDATE_NO_ERRORS(hipEventCreate(&begin)); - HIP_VALIDATE_NO_ERRORS(hipEventCreate(&end)); - - HIP_VALIDATE_NO_ERRORS(hipEventRecord(begin, stream)); - roccv::Composite op; - op(stream, foreground_tensor, background_tensor, mask_tensor, output_tensor); - HIP_VALIDATE_NO_ERRORS(hipEventRecord(end, stream)); - HIP_VALIDATE_NO_ERRORS(hipEventSynchronize(end)); + if (config.device == eDeviceType::GPU) { + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); + } - float duration; - HIP_VALIDATE_NO_ERRORS(hipEventElapsedTime(&duration, begin, end)); - printf("Kernel execution time: %fms\n", 
duration); + hipStream_t stream; + CHECK_HIP_ERROR(hipStreamCreate(&stream)); - HIP_VALIDATE_NO_ERRORS(hipEventDestroy(begin)); - HIP_VALIDATE_NO_ERRORS(hipEventDestroy(end)); + // Create required tensors for the Composite operation + Tensor background = LoadImages(hipStreamPerThread, config.backgroundPath, config.device); + Tensor foreground = LoadImages(hipStreamPerThread, config.foregroundPath, config.device); + Tensor mask = LoadImages(hipStreamPerThread, config.maskPath, config.device, cv::IMREAD_GRAYSCALE); + Tensor output = Tensor(background.shape(), background.dtype(), config.device); - // Move image data back to device - auto out_data = output_tensor.exportData(); - std::vector out_h(output_tensor.shape().size()); - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(out_h.data(), out_data.basePtr(), - output_tensor.shape().size() * output_tensor.dtype().size(), - hipMemcpyDeviceToHost, stream)); + // Run the Composite operation + Composite op; + op(stream, foreground, background, mask, output, config.device); - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); + // Write output images to disk + WriteImages(stream, output, config.outputPath); - // Write normalized image to disk - cv::Mat output_image_data(background_data.rows, background_data.cols, CV_8UC3, out_h.data()); - cv::imwrite(argv[4], output_image_data); + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file diff --git a/samples/copy_make_border.cpp b/samples/copy_make_border.cpp index be7be7e5..0dbb48f0 100644 --- a/samples/copy_make_border.cpp +++ b/samples/copy_make_border.cpp @@ -21,71 +21,138 @@ THE SOFTWARE. 
*/ #include +#include #include #include #include #include +#include "common/utils.hpp" + using namespace roccv; +struct Config { + std::string inputPath; + std::string outputPath = "output"; + int32_t top = 10; + int32_t left = 10; + float r = 0.0f, g = 0.0f, b = 0.0f, a = 255.0f; + eBorderType borderMode = eBorderType::BORDER_TYPE_CONSTANT; + int deviceId = 0; +}; + +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "Usage: " << programName << " -i [options]\n\n" + << "Options:\n" + << " -i, --input Input image or directory (required)\n" + << " -o, --output Output file or directory (default: output/image_.bmp)\n" + << " -t, --top Top/bottom border size (default: 10)\n" + << " -l, --left Left/right border size (default: 10)\n" + << " -c, --color Border color as r,g,b,a (default: 0,0,0,255)\n" + << " -m, --mode Border mode: 0=CONSTANT, 1=REPLICATE, 2=REFLECT, 3=REFLECT101, 4=WRAP (default: CONSTANT)\n" + << " -d, --device GPU device ID (default: 0)\n" + << " -h, --help Show this help message\n\n" + << "Example:\n" + << " " << programName << " -i image.jpg -o bordered.png -t 20 -l 15 -c 255,0,0,255\n"; + // clang-format on +} + +bool ParseColor(const std::string& colorStr, float& r, float& g, float& b, float& a) { + return sscanf(colorStr.c_str(), "%f,%f,%f,%f", &r, &g, &b, &a) == 4; +} + /** * @brief Copy make border operation example. + * + * This sample demonstrates the usage of the CopyMakeBorder operator. It accepts either a single image file or a + * directory of images as the input path, and either a single file or a directory as the output path. If a directory is + * provided for the input, all supported images within the directory will be processed as a batch. Similarly, if the + * output path is a directory, all resulting images will be written to that directory. For each image, a border is + * created based on the specified border mode and border value. 
+ * + * If is a directory, a batch operation will occur on every image in the directory. + * If is a directory, the output images will be written into that directory. */ int main(int argc, char** argv) { - if (argc != 11) { - std::cerr << "Usage: " << argv[0] - << " " - << std::endl; - return EXIT_FAILURE; + Config config; + + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"top", required_argument, nullptr, 't'}, + {"left", required_argument, nullptr, 'l'}, + {"color", required_argument, nullptr, 'c'}, + {"mode", required_argument, nullptr, 'm'}, + {"device", required_argument, nullptr, 'd'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + + int opt; + while ((opt = getopt_long(argc, argv, "i:o:t:l:c:m:d:h", longOptions, nullptr)) != -1) { + switch (opt) { + case 'i': + config.inputPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 't': + config.top = std::stoi(optarg); + break; + case 'l': + config.left = std::stoi(optarg); + break; + case 'c': + if (!ParseColor(optarg, config.r, config.g, config.b, config.a)) { + std::cerr << "Invalid color format. Use: r,g,b,a (e.g., 255,0,0,255)\n"; + return EXIT_FAILURE; + } + break; + case 'm': + config.borderMode = static_cast(std::stoi(optarg)); + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; + } } - HIP_VALIDATE_NO_ERRORS(hipSetDevice(std::stoi(argv[10]))); - - int32_t top = std::stoi(argv[3]); - int32_t left = std::stoi(argv[4]); - float r = std::stof(argv[5]); - float g = std::stof(argv[6]); - float b = std::stof(argv[7]); - float a = std::stof(argv[8]); - eBorderType border_mode = static_cast(std::stoi(argv[9])); - - cv::Mat image_data = cv::imread(argv[1]); - - // Create input/output tensors for the image. 
- TensorShape shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, image_data.rows, image_data.cols, image_data.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); - - TensorShape o_shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, image_data.rows + top * 2, image_data.cols + left * 2, image_data.channels()}); + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); + return EXIT_FAILURE; + } - Tensor d_in(shape, dtype); - Tensor d_out(o_shape, dtype); + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); + // Create stream hipStream_t stream; - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); + CHECK_HIP_ERROR(hipStreamCreate(&stream)); - // Move image data to input tensor - size_t image_size = d_in.shape().size() * d_in.dtype().size(); - auto d_input_data = d_in.exportData(); - HIP_VALIDATE_NO_ERRORS( - hipMemcpyAsync(d_input_data.basePtr(), image_data.data, image_size, hipMemcpyHostToDevice, stream)); + // Load input image + Tensor input = LoadImages(stream, config.inputPath.c_str()); - CopyMakeBorder op; - op(stream, d_in, d_out, top, left, border_mode, {b, g, r, a}); + // Create output tensor + int64_t outputHeight = input.shape(input.layout().height_index()) + config.top * 2; + int64_t outputWidth = input.shape(input.layout().width_index()) + config.left * 2; + TensorShape outputShape(input.layout(), {input.shape(input.layout().batch_index()), outputHeight, outputWidth, + input.shape(input.layout().channels_index())}); + Tensor output(outputShape, input.dtype()); - // Move image data back to device - auto d_out_data = d_out.exportData(); - size_t out_image_size = d_out.shape().size() * d_out.dtype().size(); - std::vector h_output(out_image_size); - HIP_VALIDATE_NO_ERRORS( - hipMemcpyAsync(h_output.data(), d_out_data.basePtr(), out_image_size, hipMemcpyDeviceToHost, stream)); + // Create CopyMakeBorder operator + CopyMakeBorder op; + op(stream, input, output, config.top, config.left, 
config.borderMode, {config.b, config.g, config.r, config.a}); - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); + WriteImages(stream, output, config.outputPath); - cv::Mat output_image_data(image_data.rows + top * 2, image_data.cols + left * 2, CV_8UC3, h_output.data()); - cv::imwrite(argv[2], output_image_data); + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file diff --git a/samples/cropandresize/cpp/main.cpp b/samples/cropandresize/cpp/main.cpp index 8b8ed9bd..eec6532b 100644 --- a/samples/cropandresize/cpp/main.cpp +++ b/samples/cropandresize/cpp/main.cpp @@ -23,9 +23,9 @@ #include #include +#include #include #include -#include #include #include #include @@ -35,170 +35,172 @@ #include "core/tensor_shape.hpp" /** + * @file main.cpp * @brief Crop and Resize sample app. * - * The Crop and Resize is a simple pipeline which demonstrates usage of - * rocCV Tensor along with a few operators. + * The Crop and Resize is a simple pipeline which demonstrates usage of the + * rocCV Tensor along with the Custom Crop and Resize operators. 
* * Input Batch Tensor -> Crop -> Resize -> WriteImage */ -/** - * @brief Utility to show usage of sample app - * - **/ -void showUsage() { - std::cout << "usage: ./roccv_cropandresize_app -i -b " - << std::endl; +using namespace roccv; + +struct Config { + std::string inputPath; + std::string outputPath = "output"; + eDeviceType device = eDeviceType::GPU; + Size2D resizeShape = {320, 480}; + Box_t cropRect = {50, 150, 400, 300}; + eInterpolationType interpolation = eInterpolationType::INTERP_TYPE_LINEAR; + int deviceId = 0; +}; + +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "rocCV Crop and Resize Sample Application\n"; + std::cout << "----------------------------------------\n"; + std::cout << "This sample demonstrates how to set up a simple image processing pipeline using rocCV.\n"; + std::cout << "It shows reading a batch of images, cropping them to a specified rectangle,\n"; + std::cout << "resizing the cropped images to a target shape, and writing the outputs to image files.\n"; + std::cout << "You may select the device (CPU or GPU), interpolation type, input size, and more.\n"; + std::cout << '\n'; + std::cout << "Usage: " << programName << " -i [options]\n"; + std::cout << " -i, --input Input image or directory (required)\n"; + std::cout << "Options:\n"; + std::cout << " -o, --output Output image or directory (optional, default: output)\n"; + std::cout << " -r, --resize Resize shape as width,height (optional, default: 320,480)\n"; + std::cout << " -c, --crop Crop rectangle as x,y,w,h (optional, default: 50,150,400,300)\n"; + std::cout << " -I, --interpolation Interpolation type: 0=NEAREST, 1=LINEAR, 2=CUBIC (optional, default: LINEAR)\n"; + std::cout << " -C, --cpu Use CPU for execution (optional, default: GPU)\n"; + std::cout << " -d, --device Device ID to use for execution (optional, default: 0)\n"; + std::cout << " -h, --help Show this help message\n"; + std::cout << std::endl; + // clang-format on } -/** - * @brief 
Utility to parse the command line arguments - * - **/ -int ParseArgs(int argc, char *argv[], std::string &imagePath, uint32_t &batchSize) { - static struct option long_options[] = {{"help", no_argument, 0, 'h'}, - {"imagePath", required_argument, 0, 'i'}, - {"batch", required_argument, 0, 'b'}, - {0, 0, 0, 0}}; - - int long_index = 0; - int opt = 0; - while ((opt = getopt_long(argc, argv, "hi:b:", long_options, &long_index)) != -1) { +void ParseCropRectangle(const std::string& cropStr, Box_t& cropRect) { + std::istringstream iss(cropStr); + std::string token; + getline(iss, token, ','); + cropRect.x = std::stoi(token); + getline(iss, token, ','); + cropRect.y = std::stoi(token); + getline(iss, token, ','); + cropRect.width = std::stoi(token); + getline(iss, token, ','); + cropRect.height = std::stoi(token); +} + +void ParseResizeShape(const std::string& resizeStr, Size2D& resizeShape) { + std::istringstream iss(resizeStr); + std::string token; + getline(iss, token, ','); + resizeShape.w = std::stoi(token); + getline(iss, token, ','); + resizeShape.h = std::stoi(token); +} + +int main(int argc, char** argv) { + Config config; + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"resize", required_argument, nullptr, 'r'}, + {"crop", required_argument, nullptr, 'c'}, + {"interpolation", required_argument, nullptr, 'I'}, + {"cpu", no_argument, nullptr, 'C'}, + {"device", required_argument, nullptr, 'd'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + + // Parse command line arguments + int opt; + while ((opt = getopt_long(argc, argv, "i:o:r:c:I:d:h:C", longOptions, nullptr)) != -1) { switch (opt) { - case 'h': - showUsage(); - return -1; - break; case 'i': - imagePath = optarg; + config.inputPath = optarg; break; - case 'b': - batchSize = std::stoi(optarg); + case 'o': + config.outputPath = optarg; break; - case ':': - showUsage(); - return -1; - default: + case 'r': 
+ ParseResizeShape(optarg, config.resizeShape); + break; + case 'c': + ParseCropRectangle(optarg, config.cropRect); break; + case 'I': + config.interpolation = static_cast(std::stoi(optarg)); + break; + case 'C': + config.device = eDeviceType::CPU; + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; } } - std::ifstream imageFile(imagePath); - if (!imageFile.good()) { - showUsage(); - std::cerr << "Image path '" + imagePath + "' does not exist" << std::endl; - return -1; - } - return 0; -} -int main(int argc, char *argv[]) { - // Default parameters - // TODO: Default parameter for images cannot be added for now. Must specify a sample asset directory in the final - // build which is relative to this executable. - std::string imagePath = "none.jpg"; - uint32_t batchSize = 1; - - // Parse the command line paramaters to override the default parameters - int retval = ParseArgs(argc, argv, imagePath, batchSize); - if (retval != 0) { - return retval; + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); + return EXIT_FAILURE; } - // Note : The maximum input image dimensions needs to be updated in case - // of testing with different test images - - int maxImageWidth = 720; - int maxImageHeight = 480; - int maxChannels = 3; + if (config.device == eDeviceType::GPU) { + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); + } - // tag: Create the HIP stream hipStream_t stream; CHECK_HIP_ERROR(hipStreamCreate(&stream)); - // tag: Allocate input tensor - // Allocating memory for RGBI input image batch of uint8_t data type. 
- - roccv::TensorDataStrided::Buffer inBuf; - inBuf.strides[3] = sizeof(uint8_t); - inBuf.strides[2] = maxChannels * inBuf.strides[3]; - inBuf.strides[1] = maxImageWidth * inBuf.strides[2]; - inBuf.strides[0] = maxImageHeight * inBuf.strides[1]; - CHECK_HIP_ERROR(hipMallocAsync(&inBuf.basePtr, batchSize * inBuf.strides[0], stream)); - - // tag: Tensor Requirements - // Calculate the requirements for the RGBI uint8_t Tensor which include - // pitch bytes, alignment, shape and tensor layout - roccv::Tensor::Requirements inReqs = - roccv::Tensor::CalcRequirements(batchSize, {maxImageWidth, maxImageHeight}, roccv::FMT_RGB8); - - // Create a tensor buffer to store the data pointer and pitch bytes for each plane - roccv::TensorDataStridedHip inData(roccv::TensorShape{inReqs.shape, inReqs.rank, inReqs.layout}, - roccv::DataType{inReqs.dtype}, inBuf); - - // Wrap tensor data in a rocCV tensor for use with the rocCV operators. - roccv::Tensor inTensor = roccv::TensorWrapData(inData); - - // tag: Image Loading - uint8_t *gpuInput = reinterpret_cast(inBuf.basePtr); - // The total images is set to the same value as batch size for testing - uint32_t totalImages = batchSize; - - // OpenCV is used to load the images, which gets copied into device memory. - DecodeRGBIImage(imagePath, totalImages, gpuInput); - - // tag: The input buffer is now ready to be used by the operators - - // Set parameters for Crop and Resize - // ROI dimensions to crop in the input image - int cropX = 50; - int cropY = 150; - int cropWidth = 400; - int cropHeight = 300; - - // Set the resize dimensions - int resizeWidth = 320; - int resizeHeight = 240; - - // Create the crop rect for the cropping operator - roccv::Box_t crpRect = {cropX, cropY, cropWidth, cropHeight}; - - // tag: Allocate Tensors for Crop and Resize - // Create a rocCV Tensor based on the crop window size. 
- roccv::Tensor cropTensor(batchSize, {cropWidth, cropHeight}, roccv::FMT_RGB8); - // Create a rocCV Tensor based on resize dimensions - roccv::Tensor resizedTensor(batchSize, {resizeWidth, resizeHeight}, roccv::FMT_RGB8); - -#ifdef PROFILE_SAMPLE - hipEvent_t start, stop; - hipEventCreate(&start); - hipEventCreate(&stop); - hipEventRecord(start); -#endif - // tag: Initialize operators for Crop and Resize - roccv::CustomCrop cropOp; - roccv::Resize resizeOp; - - // tag: Executes the CustomCrop operation on the given HIP stream - cropOp(stream, inTensor, cropTensor, crpRect); - - // Resize operator can now be enqueued into the same stream - resizeOp(stream, cropTensor, resizedTensor, INTERP_TYPE_LINEAR); - - // tag: Profile section -#ifdef PROFILE_SAMPLE - hipEventRecord(stop); - hipEventSynchronize(stop); - float operatorms = 0; - hipEventElapsedTime(&operatorms, start, stop); - std::cout << "Time for Crop and Resize : " << operatorms << " ms" << std::endl; -#endif - - // tag: Copy the buffer to CPU and write resized image into .bmp files - WriteRGBITensor(resizedTensor, stream); - - // tag: Clean up + // Load batch of input images + Tensor input = LoadImages(stream, config.inputPath, config.device); + + // Determine the batch size and channels from the input tensor + int64_t batchSize = input.shape(input.layout().batch_index()); + int64_t channels = input.shape(input.layout().channels_index()); + + // Create tensor for the cropped image + Tensor cropTensor = + Tensor(TensorShape(input.layout(), {batchSize, config.cropRect.height, config.cropRect.width, channels}), + input.dtype(), config.device); + + // Create tensor for the resized image + Tensor resizedTensor = + Tensor(TensorShape(input.layout(), {batchSize, config.resizeShape.h, config.resizeShape.w, channels}), + input.dtype(), config.device); + + // Create crop and resize operators + CustomCrop cropOp; + Resize resizeOp; + + std::chrono::high_resolution_clock::time_point start = 
std::chrono::high_resolution_clock::now(); + + // Run the crop operation, writing results to the crop tensor + cropOp(stream, input, cropTensor, config.cropRect, config.device); + + // Run the resize operation, writing results to the resized tensor + resizeOp(stream, cropTensor, resizedTensor, config.interpolation, config.device); + CHECK_HIP_ERROR(hipStreamSynchronize(stream)); + + std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now(); + + // Report the duration of the crop and resize operation + long executionTime = std::chrono::duration_cast(end - start).count(); + std::cout << "Processed " << batchSize << " images in " << executionTime << "ms" << std::endl; + + // Write the cropped and resized images to disk + WriteImages(stream, resizedTensor, config.outputPath); + + // Destroy the stream CHECK_HIP_ERROR(hipStreamDestroy(stream)); - // tag: End of Sample + return EXIT_SUCCESS; } \ No newline at end of file diff --git a/samples/custom_crop.cpp b/samples/custom_crop.cpp index 73bd8561..25e7d83a 100644 --- a/samples/custom_crop.cpp +++ b/samples/custom_crop.cpp @@ -21,144 +21,122 @@ THE SOFTWARE. */ #include +#include + #include #include #include #include +#include "common/utils.hpp" + using namespace roccv; -/** - * @brief Custom crop operation example. 
- */ +struct Config { + std::string inputPath; + std::string outputPath = "output"; + Box_t cropRect = {0, 0, 1, 1}; + eDeviceType device = eDeviceType::GPU; + int deviceId = 0; +}; + +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "Usage: " << programName << " -i [-o ] [-crop ] [-d ] [-c]" << std::endl; + std::cout << " -i, --input Input image or directory containing images (required)" << std::endl; + std::cout << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; + std::cout << " -c, --crop Crop rectangle as comma separated values (optional, default: 0,0,1,1)" << std::endl; + std::cout << " -d, --device Device ID to use for execution when using GPU (optional, default: 0)" << std::endl; + std::cout << " -C, --cpu Use CPU for execution (optional, default: GPU)" << std::endl; + std::cout << " -h, --help Show this help message" << std::endl; + // clang-format on +} -void ShowHelpAndExit(const char *option = NULL) { - std::cout << "Options: " << option << std::endl - << "-i Input File Path - required" << std::endl - << "-o Output File Path - optional; default: output.bmp" << std::endl - << "-cpu Select CPU instead of GPU to perform operation - optional; default choice is GPU path" << std::endl - << "-d GPU device ID (0 for the first device, 1 for the second, etc.) 
- optional; default: 0" << std::endl - << "-crop Crop rectangle (top_left_corner_x, top_left_corner_y, width, height)- optional; default: use the set value in the app" << std::endl; - exit(0); +bool ParseCropRectangle(const std::string& cropStr, Box_t& cropRect) { + std::istringstream iss(cropStr); + std::string token; + getline(iss, token, ','); + cropRect.x = std::stoi(token); + getline(iss, token, ','); + cropRect.y = std::stoi(token); + getline(iss, token, ','); + cropRect.width = std::stoi(token); + getline(iss, token, ','); + cropRect.height = std::stoi(token); + return true; } +/** + * @brief Custom crop operation example. + */ int main(int argc, char** argv) { - std::string input_file_path; - std::string output_file_path = "output.bmp"; - bool gpuPath = true; // use GPU by default - eDeviceType device = eDeviceType::GPU; - int deviceId = 0; - Box_t cropRect = {0, 0, 1, 1}; - bool cropSet = false; - - if(argc < 3) { - ShowHelpAndExit("-h"); - } - for (int i = 1; i < argc; i++) { - if (!strcmp(argv[i], "-h")) { - ShowHelpAndExit("-h"); - } - if (!strcmp(argv[i], "-i")) { - if (++i == argc) { - ShowHelpAndExit("-i"); - } - input_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-o")) { - if (++i == argc) { - ShowHelpAndExit("-o"); - } - output_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-crop")) { - i++; - if (i + 4 > argc) { - ShowHelpAndExit("-crop"); - } - cropRect.x = atoi(argv[i++]); - cropRect.y = atoi(argv[i++]); - cropRect.width = atoi(argv[i++]); - cropRect.height = atoi(argv[i]); - cropSet = true; - continue; - } - if (!strcmp(argv[i], "-cpu")) { - gpuPath = false; - continue; + Config config; + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"crop", required_argument, nullptr, 'c'}, + {"device", required_argument, nullptr, 'd'}, + {"cpu", no_argument, nullptr, 'C'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + int 
opt; + while ((opt = getopt_long(argc, argv, "i:o:c:d:hC", longOptions, nullptr)) != -1) { + switch (opt) { + case 'i': + config.inputPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 'c': + if (!ParseCropRectangle(optarg, config.cropRect)) { + std::cerr << "Invalid crop rectangle format. Use: x,y,w,h (e.g., 0,0,1,1)\n"; + return EXIT_FAILURE; + } + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'C': + config.device = eDeviceType::CPU; + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; } } - if (gpuPath) { - device = eDeviceType::GPU; - HIP_VALIDATE_NO_ERRORS(hipSetDevice(deviceId)); - } else { - device = eDeviceType::CPU; + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); + return EXIT_FAILURE; } - cv::Mat imageData = cv::imread(input_file_path); - if (imageData.empty()) { - std::cerr << "Failed to read the input image file" << std::endl; - exit(1); - } - if (!cropSet) { - // Set a safe crop area if no user input - cropRect = {(imageData.cols / 4), (imageData.rows / 4), (imageData.cols / 2), (imageData.rows / 2)}; - } + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); - // Create input/output tensors for the image. 
- TensorShape inputShape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), {1, imageData.rows, imageData.cols, imageData.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); - Tensor input(inputShape, dtype, device); + hipStream_t stream; + CHECK_HIP_ERROR(hipStreamCreate(&stream)); - TensorShape outShape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), {1, cropRect.height, cropRect.width, imageData.channels()}); - Tensor output(outShape, dtype, device); + // Load input images + Tensor input = LoadImages(stream, config.inputPath, config.device); - hipStream_t stream = nullptr; - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); - } + int64_t batchSize = input.shape(input.layout().batch_index()); + int64_t channels = input.shape(input.layout().channels_index()); - // Move image data to input tensor - size_t imageSizeInByte = input.shape().size() * input.dtype().size(); - auto input_data = input.exportData(); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(input_data.basePtr(), imageData.data, imageSizeInByte, hipMemcpyHostToDevice, stream)); - } else { - memcpy(input_data.basePtr(), imageData.data, imageSizeInByte); - } + // Create output tensor + Tensor output(TensorShape(input.layout(), {batchSize, config.cropRect.height, config.cropRect.width, channels}), + input.dtype(), config.device); + // Run custom crop operation CustomCrop op; - op(stream, input, output, cropRect, device); - - // Move image data back to host - size_t outputSize = output.shape().size() * output.dtype().size(); - auto outData = output.exportData(); - std::vector h_output(outputSize); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(h_output.data(), outData.basePtr(), outputSize, hipMemcpyDeviceToHost, stream)); - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); - } else { - memcpy(h_output.data(), outData.basePtr(), outputSize); - } + op(stream, input, output, config.cropRect, config.device); // Write output image to disk - cv::Mat 
output_imageData(cropRect.height, cropRect.width, imageData.type(), h_output.data()); - bool ret = cv::imwrite(output_file_path, output_imageData); - if (!ret) { - std::cerr << "Faild to save output image to the file" << std::endl; - } + WriteImages(stream, output, config.outputPath); - std::cout << "Input image file: " << input_file_path << std::endl; - std::cout << "Output image file: " << output_file_path << std::endl; - if (gpuPath) { - std::cout << "Operation on GPU device " << deviceId << std::endl; - } else { - std::cout << "Operation on CPU" << std::endl; - } - std::cout << "Input image size: width = " << imageData.cols << ", height = " << imageData.rows << std::endl; - std::cout << "Cropping area: top-left corner = (" << cropRect.x << ", " << cropRect.y << "), width = " << cropRect.width << ", height = " << cropRect.height << std::endl; + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file diff --git a/samples/gamma_contrast.cpp b/samples/gamma_contrast.cpp index 81fe771f..62a83f28 100644 --- a/samples/gamma_contrast.cpp +++ b/samples/gamma_contrast.cpp @@ -21,78 +21,91 @@ THE SOFTWARE. 
*/ #include +#include + #include #include #include #include +#include "common/utils.hpp" using namespace roccv; +struct Config { + float gamma = 2.2; + std::string inputPath; + std::string outputPath = "output"; + int deviceId = 0; +}; + +void PrintUsage(const char* programName) { + // clang-format off + std::cerr << "Usage: " << programName << " -i [-o ] [-g ] [-d ]" << std::endl; + std::cerr << " -i, --input Input image or directory containing images (required)" << std::endl; + std::cerr << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; + std::cerr << " -g, --gamma Gamma value to apply to the input images (optional, default: 2.2)" << std::endl; + std::cerr << " -d, --device Device ID to use for execution (optional, default: 0)" << std::endl; + // clang-format on +} + /** * @brief Gamma contrast operator sample app. */ int main(int argc, char** argv) { - if (argc != 3) { - std::cerr << "Usage: " << argv[0] - << " " - << std::endl; + Config config; + + static struct option longOptions[] = { + {"input", required_argument, nullptr, 'i'}, {"output", required_argument, nullptr, 'o'}, + {"gamma", required_argument, nullptr, 'g'}, {"device", required_argument, nullptr, 'd'}, + {"help", no_argument, nullptr, 'h'}, {nullptr, 0, nullptr, 0}}; + + int opt; + while ((opt = getopt_long(argc, argv, "i:o:g:d:h", longOptions, nullptr)) != -1) { + switch (opt) { + case 'i': + config.inputPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 'g': + config.gamma = std::stof(optarg); + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; + } + } + + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); return EXIT_FAILURE; } - // Device to use in this sample will be the GPU - eDeviceType device = eDeviceType::GPU; - - // 
Load input image using the OpenCV library. - // The Mat image_data will store all of the data of the image - // Image width can be gotten with image_data.cols - // Image height can be gotten with image_data.rows - // The amount of channels can be gotten with image_data.channels() - cv::Mat image_data = cv::imread(argv[1]); - - // Batch size is needed to create the input and output tensors - int batchSize = 1; - - // A gamma value to apply to the input image - float gammaValue = 2.2; - - // Create input/output tensors - // Tensor shape - // - Takes layout as input, in this case NHWC (N - batch size, H - image height, W - image width, C - number of channels) - // - Also takes the datatype, in this case U8 or an unsigned integer of 8 bits. - TensorShape shape( - TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {batchSize, image_data.rows, image_data.cols, image_data.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); - - Tensor input(shape, dtype, device); - Tensor output(shape, dtype, device); + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); hipStream_t stream; - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); - - // imageSize is needed to know how much data needs to be copied to the GPU - size_t imageSize = image_data.rows * image_data.cols * image_data.channels() * sizeof(uint8_t); - - auto input_data = input.exportData(); - HIP_VALIDATE_NO_ERRORS( - hipMemcpy(static_cast(input_data.basePtr()), image_data.data, imageSize, hipMemcpyHostToDevice)); - - // Apply gamma correction - GammaContrast gamma_contrast; - gamma_contrast(stream, input, output, gammaValue, device); + CHECK_HIP_ERROR(hipStreamCreate(&stream)); + + // Load input images + Tensor input = LoadImages(stream, config.inputPath, eDeviceType::GPU); - // Move output data back to host - auto output_data = output.exportData(); - std::vector h_output(imageSize); - HIP_VALIDATE_NO_ERRORS(hipMemcpy(h_output.data(), output_data.basePtr(), imageSize, hipMemcpyDeviceToHost)); - - 
HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); + // Create output tensor + Tensor output(input.shape(), input.dtype(), eDeviceType::GPU); - // Save the gamma-corrected image - cv::Mat output_image(image_data.rows, image_data.cols, CV_8UC3, h_output.data()); - cv::imwrite(argv[2], output_image); + // Apply gamma correction + GammaContrast gamma_contrast; + gamma_contrast(stream, input, output, config.gamma, eDeviceType::GPU); - std::cout << "Gamma correction applied successfully. Output saved to: " << argv[2] << std::endl; - + // Write output images to disk + WriteImages(stream, output, config.outputPath); + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } diff --git a/samples/warp_perspective.cpp b/samples/warp_perspective.cpp index dc816e11..49039d62 100644 --- a/samples/warp_perspective.cpp +++ b/samples/warp_perspective.cpp @@ -21,62 +21,101 @@ THE SOFTWARE. */ #include +#include #include #include #include #include +#include "common/utils.hpp" + using namespace roccv; +struct Config { + std::string inputPath; + std::string outputPath = "output"; + eInterpolationType interpolation = eInterpolationType::INTERP_TYPE_LINEAR; + eBorderType borderMode = eBorderType::BORDER_TYPE_CONSTANT; + int deviceId = 0; +}; + +void PrintUsage(const char* programName) { + // clang-format off + std::cerr << "Usage: " << programName << " -i [-o ] [-I ] [-b ] [-d ]" << std::endl; + std::cerr << " -i, --input Input image or directory containing images (required)" << std::endl; + std::cerr << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; + std::cerr << " -I, --interpolation Interpolation type to use for output images [0: NEAREST, 1: LINEAR, 2: CUBIC] (optional, default: 1 (LINEAR))" << std::endl; + std::cerr << " -b, --border Border type for output images [0: CONSTANT, 1: REPLICATE, 2: REFLECT, 3: REFLECT101, 4: WRAP] (optional, default: 0 (CONSTANT))" << std::endl; + std::cerr << " -d, --device Device ID 
to use for execution (optional, default: 0)" << std::endl; + // clang-format on +} + /** * @brief Warp perspective operation example. */ int main(int argc, char** argv) { - if (argc != 6) { - std::cerr << "Usage: " << argv[0] << " " - << std::endl; - return EXIT_FAILURE; + Config config; + + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"interpolation", required_argument, nullptr, 'I'}, + {"border", required_argument, nullptr, 'b'}, + {"device", required_argument, nullptr, 'd'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + + // Parse command line arguments + int opt; + while ((opt = getopt_long(argc, argv, "i:o:I:b:d:h", longOptions, nullptr)) != -1) { + switch (opt) { + case 'i': + config.inputPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 'I': + config.interpolation = static_cast(std::stoi(optarg)); + break; + case 'b': + config.borderMode = static_cast(std::stoi(optarg)); + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; + } } - HIP_VALIDATE_NO_ERRORS(hipSetDevice(std::stoi(argv[5]))); - eInterpolationType interp = static_cast(std::stoi(argv[3])); - eBorderType border_mode = static_cast(std::stoi(argv[4])); - cv::Mat image_data = cv::imread(argv[1]); - - // Create input/output tensors for the image. 
- TensorShape shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, image_data.rows, image_data.cols, image_data.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); + return EXIT_FAILURE; + } - Tensor d_in(shape, dtype); - Tensor d_out(shape, dtype); + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); hipStream_t stream; - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); + CHECK_HIP_ERROR(hipStreamCreate(&stream)); - // Move image data to input tensor - size_t image_size = d_in.shape().size() * d_in.dtype().size(); - auto d_input_data = d_in.exportData(); - HIP_VALIDATE_NO_ERRORS( - hipMemcpyAsync(d_input_data.basePtr(), image_data.data, image_size, hipMemcpyHostToDevice, stream)); + // Create input/output tensors for the image. + Tensor input = LoadImages(stream, config.inputPath, eDeviceType::GPU); + Tensor output(input.shape(), input.dtype(), eDeviceType::GPU); PerspectiveTransform transform_matrix = {1, 0, 0, 0, 1, 0, -0.001, 0, 1}; roccv::WarpPerspective op; - op(stream, d_in, d_out, transform_matrix, true, interp, border_mode, make_float4(0, 0, 0, 0)); + op(stream, input, output, transform_matrix, true, config.interpolation, config.borderMode, make_float4(0, 0, 0, 0)); // Move image data back to device - auto d_out_data = d_out.exportData(); - std::vector h_output(image_size); - HIP_VALIDATE_NO_ERRORS( - hipMemcpyAsync(h_output.data(), d_out_data.basePtr(), image_size, hipMemcpyDeviceToHost, stream)); - - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); - - // Write normalized image to disk - cv::Mat output_image_data(image_data.rows, image_data.cols, CV_8UC3, h_output.data()); - cv::imwrite(argv[2], output_image_data); + WriteImages(stream, output, config.outputPath); + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file diff --git a/src/core/mem_alignment.cpp b/src/core/mem_alignment.cpp new 
file mode 100644 index 00000000..247974b1 --- /dev/null +++ b/src/core/mem_alignment.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "core/mem_alignment.hpp" + +namespace roccv { +int32_t MemAlignment::baseAddr() const { return m_baseAddrAlignment; } + +int32_t MemAlignment::rowAddr() const { return m_rowAddrAlignment; } + +MemAlignment& MemAlignment::baseAddr(int32_t alignment) { + m_baseAddrAlignment = alignment; + return *this; +} + +MemAlignment& MemAlignment::rowAddr(int32_t alignment) { + m_rowAddrAlignment = alignment; + return *this; +} +} // namespace roccv \ No newline at end of file diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index dd6c4726..f95cf811 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -23,50 +23,222 @@ THE SOFTWARE. 
#include "core/tensor.hpp" #include +#include #include "core/data_type.hpp" #include "core/detail/context.hpp" #include "core/exception.hpp" +#include "core/hip_assert.h" #include "core/image_format.hpp" +#include "core/mem_alignment.hpp" #include "core/status_type.h" #include "core/tensor_data.hpp" #include "core/tensor_layout.hpp" #include "core/tensor_requirements.hpp" #include "core/tensor_shape.hpp" #include "core/util_enums.h" +#include "core/utils.hpp" #include "operator_types.h" namespace roccv { -// Constructor definitions -Tensor::Tensor(const TensorRequirements& reqs) : Tensor(reqs, GlobalContext().getDefaultAllocator()) {} +namespace { + +/** + * @brief Returns the index of the first packed dimension in the given tensor layout. + * + * In most cases, the first packed dimension is the last dimension. However, for layouts ending in WC, the first packed + * dimension is the second to last dimension. + * + * @param[in] layout The tensor layout to get the first packed dimension for. + * @return The index of the first packed dimension in the given tensor layout. 
+ */ +static int GetFirstPackedDimension(const TensorLayout& layout) { + const int rank = layout.rank(); + switch (layout.elayout()) { + case eTensorLayout::TENSOR_LAYOUT_NHWC: + case eTensorLayout::TENSOR_LAYOUT_LNHWC: + case eTensorLayout::TENSOR_LAYOUT_HWC: + case eTensorLayout::TENSOR_LAYOUT_NWC: + return std::max(0, rank - 2); + default: + return rank - 1; + } +} + +static bool ReshapeSimplified(int inRank, const std::array& inShape, + const std::array& inStrides, int targetRank, + const std::array& targetShape, + std::array& outStrides) { + int i = 0, j = 0; + for (; i < inRank && j < targetRank; i++) { + int64_t inE = inShape[i]; + int64_t outV = 1; + int group_start = j; + while (j < targetRank && (outV * targetShape[j]) <= inE) outV *= targetShape[j++]; + + if (outV != inE) { + return false; + } + + int64_t s = inStrides[i]; + for (int d = j - 1; d >= group_start; d--) { + outStrides[d] = s; + s *= targetShape[d]; + } + } + return true; +} + +/** + * @brief Simplifies a tensor shape and strides into their canonical form. + * + * @param[in] rank The rank of the tensor. + * @param[in] shape The shape of the tensor. + * @param[in] strides The strides of the tensor. + * @param[out] outShape The simplified shape of the tensor. + * @param[out] outStrides The simplified strides of the tensor. + * @return The rank of the simplified tensor. 
+ */ +static int Simplify(int rank, const std::array& shape, + const std::array& strides, + std::array& outShape, + std::array& outStrides) { + if (rank <= 1) { + if (rank == 1) { + outShape[0] = shape[0]; + outStrides[0] = strides[0]; + } + return rank; + } + + int outRank = 0; + int64_t vol = shape[0]; + for (int d = 1; d < rank; d++) { + if (strides[d - 1] != shape[d] * strides[d]) { + outStrides[outRank] = strides[d - 1]; + outShape[outRank] = vol; + vol = shape[d]; + outRank++; + } else { + vol *= shape[d]; + } + } + outStrides[outRank] = strides[rank - 1]; + outShape[outRank] = vol; + outRank++; + return outRank; +} + +/** + * @brief Computes copy parameters for host-tensor copy. + * @return (row_width_bytes, num_rows, tensor_pitch). If contiguous, returns (total_size, 1, total_size). + */ +static std::tuple ComputeCopyParams(int rank, + const std::array& shape, + const std::array& strides, + size_t dtypeSize, bool contiguous) { + if (contiguous) { + size_t totalSize = dtypeSize; + for (int i = 0; i < rank; ++i) { + totalSize *= static_cast(shape[i]); + } + return {totalSize, 1, totalSize}; + } + + int paddedDim = 0; + for (int i = 0; i < rank - 1; i++) { + if (strides[i] != shape[i + 1] * strides[i + 1]) { + paddedDim = i; + break; + } + } + + size_t rowWidth = dtypeSize; + for (int i = paddedDim + 1; i < rank; ++i) { + rowWidth *= static_cast(shape[i]); + } + + size_t numRows = 1; + for (int i = 0; i <= paddedDim; ++i) { + numRows *= static_cast(shape[i]); + } + + size_t tensorPitch = static_cast(strides[paddedDim]); + return {rowWidth, numRows, tensorPitch}; +} + +/** + * @brief Computes the memory alignment for a tensor based on the device, data type, and user provided buffer alignment. + * @param[in] device The device the tensor is to be allocated on. + * @param[in] dtype The datatype of the tensor. + * @param[in] bufAlign The memory alignment to use. + * @return The memory alignment for the tensor. 
+ */ +MemAlignment ComputeMemAlignment(eDeviceType device, const DataType& dtype, const MemAlignment& bufAlign) { + int dev = 0; + if (device == eDeviceType::GPU) { + HIP_VALIDATE_NO_ERRORS(hipGetDevice(&dev)); + } + + int rowAlign; + if (bufAlign.rowAddr() == 0) { + if (device == eDeviceType::GPU) { + HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&rowAlign, hipDeviceAttributeTexturePitchAlignment, dev)); + } else { + rowAlign = ROCCV_CPU_DEFAULT_ALIGNMENT; + } + rowAlign = std::lcm(rowAlign, detail::NextPowerOfTwo(dtype.size())); + } else { + if (!detail::IsPowerOfTwo(bufAlign.rowAddr())) { + throw Exception("Row address alignment must be a power of two.", eStatusType::INVALID_VALUE); + } + rowAlign = std::lcm(bufAlign.rowAddr(), detail::NextPowerOfTwo(dtype.size())); + } + + int baseAlign; + if (bufAlign.baseAddr() == 0) { + if (device == eDeviceType::GPU) { + HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&baseAlign, hipDeviceAttributeTextureAlignment, dev)); + } else { + baseAlign = ROCCV_CPU_DEFAULT_ALIGNMENT; + } + baseAlign = std::lcm(baseAlign, detail::NextPowerOfTwo(dtype.size())); + } else { + if (!detail::IsPowerOfTwo(bufAlign.baseAddr())) { + throw Exception("Base address alignment must be a power of two.", eStatusType::INVALID_VALUE); + } + baseAlign = std::lcm(bufAlign.baseAddr(), detail::NextPowerOfTwo(dtype.size())); + } -Tensor::Tensor(const TensorRequirements& reqs, const IAllocator& alloc) : m_requirements(reqs), m_allocator(alloc) { - size_t numBytes = reqs.device == eDeviceType::GPU ? 
reqs.res.deviceMem.bytes : reqs.res.hostMem.bytes; - m_data = std::make_shared(numBytes, reqs.device, alloc); + return MemAlignment().baseAddr(baseAlign).rowAddr(rowAlign); } -Tensor::Tensor(const TensorRequirements& reqs, std::shared_ptr data) - : Tensor(reqs, data, GlobalContext().getDefaultAllocator()) {} +} // namespace -Tensor::Tensor(const TensorRequirements& reqs, std::shared_ptr data, const IAllocator& alloc) - : m_requirements(reqs), m_data(data), m_allocator(alloc) {} +// Constructor definitions +Tensor::Tensor(const Tensor::Requirements& reqs, const IAllocator& alloc) : m_requirements(reqs) { + m_data = std::make_shared(this->dataSize(), reqs.device, alloc); +} + +Tensor::Tensor(const Tensor::Requirements& reqs, std::shared_ptr data) + : m_requirements(reqs), m_data(data) {} Tensor::Tensor(const TensorShape& shape, DataType dtype, eDeviceType device) - : Tensor(shape, dtype, GlobalContext().getDefaultAllocator(), device) {} + : Tensor(shape, dtype, {}, GlobalContext().getDefaultAllocator(), device) {} -Tensor::Tensor(const TensorShape& shape, DataType dtype, const IAllocator& alloc, eDeviceType device) - : Tensor(CalcRequirements(shape, dtype, device), alloc) {} +Tensor::Tensor(const TensorShape& shape, DataType dtype, const MemAlignment& bufAlign, const IAllocator& alloc, + eDeviceType device) + : Tensor(CalcRequirements(shape, dtype, bufAlign, device), alloc) {} Tensor::Tensor(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device) - : Tensor(num_images, image_size, fmt, GlobalContext().getDefaultAllocator(), device) {} + : Tensor(num_images, image_size, fmt, {}, GlobalContext().getDefaultAllocator(), device) {} -Tensor::Tensor(int num_images, Size2D image_size, ImageFormat fmt, const IAllocator& alloc, eDeviceType device) - : Tensor(CalcRequirements(num_images, image_size, fmt, device), alloc) {} +Tensor::Tensor(int num_images, Size2D image_size, ImageFormat fmt, const MemAlignment& bufAlign, + const IAllocator& alloc, eDeviceType 
device) + : Tensor(CalcRequirements(num_images, image_size, fmt, bufAlign, device), alloc) {} -Tensor::Tensor(Tensor&& other) - : m_requirements(std::move(other.m_requirements)), - m_data(std::move(other.m_data)), - m_allocator(other.m_allocator) {} +// Move constructor +Tensor::Tensor(Tensor&& other) : m_requirements(std::move(other.m_requirements)), m_data(std::move(other.m_data)) {} // Member definitions int Tensor::rank() const { return m_requirements.rank; } @@ -103,23 +275,48 @@ TensorData Tensor::exportData() const { } } -Tensor Tensor::reshape(const TensorShape& new_shape) const { - // New tensor shape must have the same number of elements - if (new_shape.size() != this->shape().size()) { - throw Exception("New tensor shape does not match the number of elements of the old shape.", +Tensor Tensor::reshape(const TensorShape& newShape) const { return reshape(dtype(), newShape); } + +Tensor Tensor::reshape(const DataType& newDtype, const TensorShape& newShape) const { + if (newShape.size() * newDtype.size() != this->shape().size() * this->dtype().size()) { + throw Exception("New tensor view must have the same underlying number of bytes.", eStatusType::INVALID_VALUE); + } + + const int oldRank = m_requirements.rank; + const int newRank = newShape.layout().rank(); + + if (m_requirements.strides[oldRank - 1] != static_cast(dtype().size())) { + throw Exception("Cannot reshape tensor: innermost dimension is not element-contiguous.", eStatusType::INVALID_VALUE); } - TensorRequirements reqs = CalcRequirements(new_shape, this->dtype(), this->device()); - return Tensor(reqs, m_data); -} + // Convert to a byte-level view by expanding the innermost dimension size + // by the element size and setting its stride to 1. 
+ std::array byteShape = m_requirements.shape; + std::array byteStrides = m_requirements.strides; + byteShape[oldRank - 1] *= dtype().size(); + byteStrides[oldRank - 1] = 1; -Tensor Tensor::reshape(const TensorShape& new_shape, const DataType& new_dtype) const { - if (new_shape.size() * new_dtype.size() != this->shape().size() * this->dtype().size()) { - throw Exception("New tensor view must have the same underlying number of bytes.", eStatusType::INVALID_VALUE); + std::array simpleShape, simpleStrides; + int simpleRank = Simplify(oldRank, byteShape, byteStrides, simpleShape, simpleStrides); + + std::array targetByteShape = newShape.shape(); + targetByteShape[newRank - 1] *= newDtype.size(); + + std::array targetByteStrides; + bool result = + ReshapeSimplified(simpleRank, simpleShape, simpleStrides, newRank, targetByteShape, targetByteStrides); + if (!result) { + throw Exception("Cannot reshape tensor into requested shape and data type.", eStatusType::INVALID_VALUE); } - TensorRequirements reqs = CalcRequirements(new_shape, new_dtype, this->device()); + // The byte-level reshape produces stride 1 for the innermost dimension; + // scale it back to the new element size. 
+ std::array newStrides = targetByteStrides; + newStrides[newRank - 1] = newDtype.size(); + + Tensor::Requirements reqs = + CalcRequirements(newShape, newDtype, newStrides, m_requirements.alignBytes, this->device()); return Tensor(reqs, m_data); } @@ -129,53 +326,110 @@ Tensor& Tensor::operator=(const Tensor& other) { return *this; } -TensorRequirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, eDeviceType device) { - std::array strides = CalcStrides(shape, dtype); - TensorRequirements reqs = CalcRequirements(shape, dtype, strides, device); +size_t Tensor::dataSize() const { return m_requirements.strides[0] * m_requirements.shape[0]; } + +bool Tensor::isContiguous() const { return dataSize() == shape().size() * dtype().size(); } + +void Tensor::copyFromHost(const void* src, hipStream_t stream) const { + auto [rowWidth, numRows, tensorPitch] = ComputeCopyParams(m_requirements.rank, m_requirements.shape, + m_requirements.strides, dtype().size(), isContiguous()); + + const size_t srcPitch = rowWidth; + hipMemcpyKind kind = (device() == eDeviceType::GPU) ? hipMemcpyHostToDevice : hipMemcpyHostToHost; + + HIP_VALIDATE_NO_ERRORS( + hipMemcpy2DAsync(m_data->data(), tensorPitch, src, srcPitch, rowWidth, numRows, kind, stream)); +} + +void Tensor::copyToHost(void* dst, hipStream_t stream) const { + auto [rowWidth, numRows, tensorPitch] = ComputeCopyParams(m_requirements.rank, m_requirements.shape, + m_requirements.strides, dtype().size(), isContiguous()); + + const size_t dstPitch = rowWidth; + hipMemcpyKind kind = (device() == eDeviceType::GPU) ? 
hipMemcpyDeviceToHost : hipMemcpyHostToHost; + + HIP_VALIDATE_NO_ERRORS( + hipMemcpy2DAsync(dst, dstPitch, m_data->data(), tensorPitch, rowWidth, numRows, kind, stream)); +} + +Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, eDeviceType device) { + return CalcRequirements(shape, dtype, (MemAlignment){}, device); +} + +Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, + const MemAlignment& bufAlign, eDeviceType device) { + MemAlignment newAlign = ComputeMemAlignment(device, dtype, bufAlign); + + std::array strides = CalcStrides(shape, dtype, newAlign.rowAddr()); + Tensor::Requirements reqs = CalcRequirements(shape, dtype, strides, newAlign.baseAddr(), device); return reqs; } -TensorRequirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, - std::array strides, eDeviceType device) { - TensorRequirements reqs; +Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, + const std::array strides, + int32_t baseAlign, eDeviceType device) { + Tensor::Requirements reqs; reqs.shape = shape.shape(); reqs.rank = shape.layout().rank(); reqs.layout = shape.layout().elayout(); reqs.strides = strides; reqs.dtype = dtype.etype(); - reqs.alignBytes = 0; // TODO: Must be specified later + reqs.alignBytes = baseAlign; reqs.device = device; - // TODO: Resource requirements should be calculated differently later, once padded/aligned strides have been - // implemented + // Determine resource usage size_t numBytes = reqs.strides[0] * reqs.shape[0]; - if (reqs.device == eDeviceType::GPU) { - reqs.res.deviceMem.bytes = numBytes; - } else if (reqs.device == eDeviceType::CPU) { - reqs.res.hostMem.bytes = numBytes; + switch (reqs.device) { + case eDeviceType::GPU: { + reqs.res.deviceMem.bytes = numBytes; + break; + } + + case eDeviceType::CPU: { + reqs.res.hostMem.bytes = numBytes; + break; + } + + default: { + throw 
Exception("Unsupported device when calling Tensor::CalcRequirements().", eStatusType::INVALID_VALUE); + } } return reqs; } -TensorRequirements Tensor::CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device) { +Tensor::Requirements Tensor::CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device) { + return CalcRequirements(num_images, image_size, fmt, (MemAlignment){}, device); +} + +Tensor::Requirements Tensor::CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt, + const MemAlignment& bufAlign, eDeviceType device) { // TODO: Need to support different types of tensor layouts. This will happen once more image formats are supported // first. TensorShape shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), {num_images, image_size.h, image_size.w, fmt.channels()}); - return CalcRequirements(shape, DataType(fmt.dtype()), device); + return CalcRequirements(shape, DataType(fmt.dtype()), bufAlign, device); } -std::array Tensor::CalcStrides(const TensorShape& shape, const DataType& dtype) { - // TODO: Support memory alignment and padding in stride calculations - +std::array Tensor::CalcStrides(const TensorShape& shape, const DataType& dtype, + int32_t rowAlign) { // Calculate strides based on the given tensor shape. Strides are byte-wise. std::array strides; + + const int firstPackedDim = GetFirstPackedDimension(shape.layout()); + strides[shape.layout().rank() - 1] = dtype.size(); for (int i = shape.layout().rank() - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * shape[i + 1]; + // The stride dimension preceeding the first packed dimension is padded to the next multiple of the row + // alignment. 
+ if (i == firstPackedDim - 1) { + strides[i] = detail::AlignUp(strides[i + 1] * shape[i + 1], rowAlign); + } else { + strides[i] = strides[i + 1] * shape[i + 1]; + } } + return strides; } @@ -190,8 +444,9 @@ Tensor TensorWrapData(const TensorData& tensor_data) { for (int i = 0; i < tensorDataStrided->rank(); i++) { strides[i] = tensorDataStrided->stride(i); } - TensorRequirements reqs = Tensor::CalcRequirements(tensorDataStrided->shape(), tensorDataStrided->dtype(), strides, - tensorDataStrided->device()); + + Tensor::Requirements reqs = Tensor::CalcRequirements(tensorDataStrided->shape(), tensorDataStrided->dtype(), + strides, 0, tensorDataStrided->device()); auto data = std::make_shared(tensorDataStrided->basePtr(), tensorDataStrided->device(), eOwnership::OWNING); diff --git a/src/core/tensor_storage.cpp b/src/core/tensor_storage.cpp index 4b567c7d..4c24121f 100644 --- a/src/core/tensor_storage.cpp +++ b/src/core/tensor_storage.cpp @@ -62,4 +62,7 @@ TensorStorage::~TensorStorage() { void* TensorStorage::data() const { return m_data; } eDeviceType TensorStorage::device() const { return m_device; } + +const IAllocator& TensorStorage::allocator() const { return m_allocator; } + } // namespace roccv diff --git a/src/op_non_max_suppression.cpp b/src/op_non_max_suppression.cpp index 93c4f90c..28d21c7b 100644 --- a/src/op_non_max_suppression.cpp +++ b/src/op_non_max_suppression.cpp @@ -97,12 +97,12 @@ void NonMaximumSuppression::operator()(hipStream_t stream, const Tensor& input, // but typically involve going from non-vectorized to vectorized shapes. These shapes should be validated // beforehand. For example: an input tensor with shape and datatype [NWC, , S16] will be // reinterpreted as [NW, ]. In any case, the underlying data is structured the same. 
- Tensor inputReshaped = - input.reshape(TensorShape(TensorLayout(TENSOR_LAYOUT_NW), {numBatches, numBoxes}), DataType(DATA_TYPE_4S16)); + Tensor inputReshaped = input.reshape(DataType(DATA_TYPE_4S16), + TensorShape(TensorLayout(TENSOR_LAYOUT_NW), {numBatches, numBoxes})); Tensor outputReshaped = - output.reshape(TensorShape(TensorLayout(TENSOR_LAYOUT_NW), {numBatches, numBoxes}), DataType(DATA_TYPE_U8)); - Tensor scoresReshaped = - scores.reshape(TensorShape(TensorLayout(TENSOR_LAYOUT_NW), {numBatches, numBoxes}), DataType(DATA_TYPE_F32)); + output.reshape(DataType(DATA_TYPE_U8), TensorShape(TensorLayout(TENSOR_LAYOUT_NW), {numBatches, numBoxes})); + Tensor scoresReshaped = scores.reshape(DataType(DATA_TYPE_F32), + TensorShape(TensorLayout(TENSOR_LAYOUT_NW), {numBatches, numBoxes})); // Launch nms kernel switch (device) { diff --git a/tests/roccv/cpp/include/test_helpers.hpp b/tests/roccv/cpp/include/test_helpers.hpp index 6c43053b..aa2d819a 100644 --- a/tests/roccv/cpp/include/test_helpers.hpp +++ b/tests/roccv/cpp/include/test_helpers.hpp @@ -269,24 +269,6 @@ eTestStatusType compareImage(const Tensor& tensor, const std::string& filename, void writeTensor(const Tensor& tensor, const std::string& output_file); -template -void copyData(const Tensor& input, const std::span& data, eDeviceType device) { - auto tensor_data = input.exportData(); - - switch (device) { - case eDeviceType::GPU: { - HIP_VALIDATE_NO_ERRORS( - hipMemcpy(tensor_data.basePtr(), data.data(), data.size() * sizeof(T), hipMemcpyHostToDevice)); - break; - } - - case eDeviceType::CPU: { - memcpy(tensor_data.basePtr(), data.data(), data.size() * sizeof(T)); - break; - } - } -} - /** * @brief Fills a vector with random values based on a provided seed. * @@ -403,75 +385,76 @@ void CompareVectorsNear(const std::vector& result, const std::vector& ref, } } } - /** - * @brief Copies vector data into a roccv::Tensor. 
This will copy vector data into either GPU memory or CPU memory, - * depending on the device specified in the roccv::Tensor's metadata. - * - * @tparam T The base datatype of the underlying data. - * @param dst The destination roccv::Tensor to copy data into. - * @param src A source vector containing data. - * @throws std::runtime_error if the size of the dst and src do not match. + * @brief Computes copy parameters for any tensor layout. + * @param[in] tensor The tensor to compute copy parameters for. + * @return tuple of (row_width_bytes, num_rows, tensor_pitch) + * If no padding, returns (total_size, 1, total_size) */ -template -void CopyVectorIntoTensor(const Tensor& dst, std::vector& src) { - auto tensorData = dst.exportData(); - size_t dataSize = dst.shape().size() * dst.dtype().size(); - - // Ensure source and destination have the same amount of memory allocated. - if (dataSize != src.size() * sizeof(T)) { - throw std::runtime_error( - "Cannot copy source vector into destination tensor. Size of src vector and destination tensor do not " - "match."); +inline std::tuple ComputeCopyParams(const Tensor& tensor) { + if (tensor.isContiguous()) { + size_t totalSize = tensor.shape().size() * tensor.dtype().size(); + return {totalSize, 1, totalSize}; } - switch (dst.device()) { - case eDeviceType::GPU: { - HIP_VALIDATE_NO_ERRORS( - hipMemcpy(tensorData.basePtr(), src.data(), dataSize, hipMemcpyKind::hipMemcpyHostToDevice)); - break; - } + auto tensorData = tensor.exportData(); - case eDeviceType::CPU: { - HIP_VALIDATE_NO_ERRORS( - hipMemcpy(tensorData.basePtr(), src.data(), dataSize, hipMemcpyKind::hipMemcpyHostToHost)); + // Find the padded dimension: the outermost dimension whose stride differs from + // the packed product of the next dimension's shape and stride. 
+ int paddedDim = 0; + for (int i = 0; i < tensor.rank() - 1; i++) { + if (tensorData.stride(i) != tensor.shape(i + 1) * tensorData.stride(i + 1)) { + paddedDim = i; break; } } + + // Row width = product of all dimensions AFTER paddedDim × dtype size + size_t rowWidth = tensor.dtype().size(); + for (int i = paddedDim + 1; i < tensor.rank(); ++i) { + rowWidth *= tensor.shape(i); + } + + // Number of rows = product of all dimensions UP TO AND INCLUDING paddedDim + size_t numRows = 1; + for (int i = 0; i <= paddedDim; ++i) { + numRows *= tensor.shape(i); + } + + // Tensor pitch comes from the stride at the padded dimension + size_t tensorPitch = tensorData.stride(paddedDim); + + return {rowWidth, numRows, tensorPitch}; } /** - * @brief Copies roccv::Tensor data into a destination vector. + * @brief Copies vector data into a tensor. Works with any tensor layout. * - * @tparam T The base datatype of the underlying tensor data. - * @param dst The destination vector which the data will be copied into. - * @param src The roccv::Tensor containing the source data. - * @throws std::runtime_error if the size of src and dst do not match. + * @tparam T The data type of the elements to copy. + * @param[out] dst The tensor to copy vector data into. + * @param[in] src The vector to copy data from. */ template -void CopyTensorIntoVector(std::vector& dst, const Tensor& src) { - size_t size = src.shape().size() * src.dtype().size(); - auto tensorData = src.exportData(); - - if (size != dst.size() * sizeof(T)) { - throw std::runtime_error( - "Cannot copy source tensor data into destination vector. 
Size of destination vector and source tensor do " - "not match."); - } - - switch (src.device()) { - case eDeviceType::GPU: { - HIP_VALIDATE_NO_ERRORS( - hipMemcpy(dst.data(), tensorData.basePtr(), size, hipMemcpyKind::hipMemcpyDeviceToHost)); - break; - } +void CopyVectorIntoTensor(const Tensor& dst, const std::vector& src) { + dst.copyFromHost(src.data(), nullptr); + HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(nullptr)); +} - case eDeviceType::CPU: { - HIP_VALIDATE_NO_ERRORS( - hipMemcpy(dst.data(), tensorData.basePtr(), size, hipMemcpyKind::hipMemcpyHostToHost)); - break; - } - } +/** + * @brief Copies tensor data into a vector. Works with any tensor layout. + * + * The destination vector @p dst is assumed to be compact/contiguous in memory; data will be copied into it as a dense + * contiguous array. + * + * @tparam T The data type of the elements to copy. + * @param[out] dst The vector to copy tensor data into. Must be preallocated to the correct size and will be filled + * contiguously. + * @param[in] src The tensor to copy data from. + */ +template +void CopyTensorIntoVector(std::vector& dst, const Tensor& src) { + src.copyToHost(dst.data(), nullptr); + HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(nullptr)); } /** * @brief Computes the strides for a tensor with a given shape and data type. diff --git a/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp b/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp index 2b1abe2c..394c8b9e 100644 --- a/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp +++ b/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp @@ -19,7 +19,10 @@ * THE SOFTWARE. */ +#include + #include +#include #include "test_helpers.hpp" @@ -33,16 +36,22 @@ namespace { * * @param shape The tensor's shape. * @param dtype The datatype of the tensor. + * @param rowAlign The row alignment to use. Setting to 0 will ensure contiguous memory usage. * @return A list of strides for each dimension of the given shape. 
*/ -std::vector CalculateStrides(const TensorShape& shape, const DataType& dtype) { +std::vector CalculateStrides(const TensorShape& shape, const DataType& dtype, int32_t rowAlign) { std::vector strides(shape.layout().rank()); // Strides are calculated byte-wise. Therefore, the highest dimension will refer to the stride between singular // elements (which, in turn, is the number of bytes per said element). strides[shape.layout().rank() - 1] = dtype.size(); for (int i = shape.layout().rank() - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * shape[i + 1]; + // Use the row alignment for the height dimension. + if (i == shape.layout().height_index()) { + strides[i] = detail::AlignUp(strides[i + 1] * shape[i + 1], rowAlign); + } else { + strides[i] = strides[i + 1] * shape[i + 1]; + } } return strides; } @@ -61,19 +70,27 @@ void TestNegativeTensorShape() { } /** - * @brief Negative tests related to the Tensor object. + * @brief Negative tests related to Tensor reshape. + * + */ +void TestNegativeTensorReshape() { + Tensor tensor(TensorShape({1, 2, 3}, "HWC"), DataType(DATA_TYPE_U8)); + EXPECT_EXCEPTION(tensor.reshape(TensorShape({1, 1, 2, 4}, "NHWC")), eStatusType::INVALID_VALUE); +} + +/** + * @brief Negative tests for the Tensor class, verifying error handling in invalid scenarios. * + * These tests confirm that the Tensor class appropriately throws exceptions when: + * 1. Attempting to reshape a tensor into a shape with a differing number of elements. + * + * In this case, the expected behavior is to throw an exception of type eStatusType::INVALID_VALUE. 
*/ void TestNegativeTensor() { + // Test reshaping a tensor with mismatching number of elements { - // Should not be able to reshape tensor into another view with a differing number of elements Tensor tensor(TensorShape({1, 2, 3}, "HWC"), DataType(DATA_TYPE_U8)); - EXPECT_EXCEPTION(tensor.reshape(TensorShape({1, 1, 2, 2}, "NHWC")), eStatusType::INVALID_VALUE); - - // Should not be able to reshape tensor into another view which would result in a different number of bytes in - // the underlying memory. - EXPECT_EXCEPTION(tensor.reshape(TensorShape({1, 1, 2, 3}, "NHWC"), DataType(DATA_TYPE_S16)), - eStatusType::INVALID_VALUE); + EXPECT_EXCEPTION(tensor.reshape(TensorShape({1, 1, 2, 4}, "NHWC")), eStatusType::INVALID_VALUE); } } @@ -95,15 +112,14 @@ void TestTensorCorrectness() { EXPECT_EQ(tensor.shape().size(), 4 * 720 * 480 * 3); EXPECT_EQ(tensor.dtype().size(), 1); } +} - // Tensor reshape: Change layout +void TestTensorReshapeCorrectness() { { - // Reshape tensor from NHWC -> HWC layout - Tensor tensor(1, {720, 480}, FMT_RGB8); - Tensor reshapedTensor = tensor.reshape(TensorShape({720, 480, 3}, "HWC")); - EXPECT_EQ(reshapedTensor.rank(), 3); - EXPECT_NE(reshapedTensor.rank(), tensor.rank()); + Tensor tensor(TensorShape({1, 2, 3}, "HWC"), DataType(DATA_TYPE_U8)); + Tensor reshapedTensor = tensor.reshape(TensorShape({1, 1, 2, 3}, "NHWC")); EXPECT_EQ(reshapedTensor.shape().size(), tensor.shape().size()); + EXPECT_EQ(reshapedTensor.rank(), 4); // Ensure they are sharing the same underlying data auto data = tensor.exportData(); @@ -114,7 +130,7 @@ void TestTensorCorrectness() { // Tensor reshape: Change layout and datatype { Tensor tensor(TensorShape({1, 5, 4}, "NWC"), DataType(DATA_TYPE_S16)); - Tensor reshapedTensor = tensor.reshape(TensorShape({1, 5}, "NW"), DataType(DATA_TYPE_4S16)); + Tensor reshapedTensor = tensor.reshape(DataType(DATA_TYPE_4S16), TensorShape({1, 5}, "NW")); EXPECT_NE(reshapedTensor.shape().size(), tensor.shape().size()); 
EXPECT_NE(reshapedTensor.rank(), tensor.rank()); EXPECT_EQ(reshapedTensor.rank(), 2); @@ -126,12 +142,44 @@ void TestTensorCorrectness() { } } +/** + * @brief Tests the correctness of the copyFromHost and copyToHost methods. + * + */ +void TestTensorCopyCorrectness() { + Tensor tensor(2, {10, 10}, FMT_RGB8, eDeviceType::GPU); + const size_t hostDataSize = tensor.shape().size() * tensor.dtype().size(); + std::vector inputDataHost(hostDataSize); + for (size_t i = 0; i < inputDataHost.size(); i++) { + inputDataHost[i] = static_cast(i % 256); + } + + hipStream_t stream; + HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); + + tensor.copyFromHost(inputDataHost.data(), stream); + std::vector outputDataHost(hostDataSize); + tensor.copyToHost(outputDataHost.data(), stream); + + HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); + HIP_VALIDATE_NO_ERRORS(hipStreamDestroy(stream)); + + EXPECT_VECTOR_EQ(inputDataHost, outputDataHost); +} + /** * @brief Tests internal stride calculations on Tensor construction. 
*/ void TestTensorStrideCalculation(const TensorShape& shape, const DataType& dtype) { - Tensor tensor(shape, dtype); - std::vector expectedStrides = CalculateStrides(shape, dtype); + Tensor tensor(shape, dtype, eDeviceType::GPU); + + // Get row alignment from device attributes + int dev; + HIP_VALIDATE_NO_ERRORS(hipGetDevice(&dev)); + int rowAlign; + HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&rowAlign, hipDeviceAttributeTexturePitchAlignment, dev)); + + std::vector expectedStrides = CalculateStrides(shape, dtype, rowAlign); std::vector actualStrides(tensor.rank()); auto data = tensor.exportData(); @@ -152,9 +200,12 @@ int main(int argc, char** argv) { // Negative tests TEST_CASE(TestNegativeTensorShape()); TEST_CASE(TestNegativeTensor()); + TEST_CASE(TestNegativeTensorReshape()); // Correctness tests TEST_CASE(TestTensorCorrectness()); + TEST_CASE(TestTensorReshapeCorrectness()); + TEST_CASE(TestTensorCopyCorrectness()); // Stride calculation tests // clang-format off