From 742920cffaf9a03ccd3cf5b0771126ecbae41320 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 20 Nov 2025 12:19:59 -0500 Subject: [PATCH 01/36] Add MemAlignment to Tensor constructors --- include/core/mem_alignment.hpp | 68 +++++++++++++++++++++++++++++++ include/core/tensor.hpp | 19 ++++++--- src/core/mem_alignment.cpp | 38 +++++++++++++++++ src/core/tensor.cpp | 74 +++++++++++++++++++++------------- 4 files changed, 166 insertions(+), 33 deletions(-) create mode 100644 include/core/mem_alignment.hpp create mode 100644 src/core/mem_alignment.cpp diff --git a/include/core/mem_alignment.hpp b/include/core/mem_alignment.hpp new file mode 100644 index 00000000..39f208af --- /dev/null +++ b/include/core/mem_alignment.hpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#pragma once + +namespace roccv { + +/** + * @brief Defines memory alignment for containers. + * + */ +class MemAlignment { + public: + MemAlignment() = default; + + /** + * @brief Returns the base address alignment. + * + * @return The base address alignment. + */ + int32_t baseAddr() const; + + /** + * @brief Returns the row address alignment. + * + * @return The row address alignment. + */ + int32_t rowAddr() const; + + /** + * @brief Sets the base address alignment. + * + * @param[in] alignment Alignment in bytes. + * @return A reference to this object, with the base address set. + */ + MemAlignment& baseAddr(int32_t alignment); + + /** + * @brief Sets the row address alignment. + * + * @param[in] alignment Alignment in bytes. + * @return A reference to this object, with the row address set. + */ + MemAlignment& rowAddr(int32_t alignment); + + private: + int32_t m_baseAddrAlignment = 0; + int32_t m_rowAddrAlignment = 0; +}; +} // namespace roccv \ No newline at end of file diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index f32cb498..f17d4d33 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -26,6 +26,8 @@ THE SOFTWARE. #include "core/data_type.hpp" #include "core/detail/allocators/i_allocator.hpp" +#include "core/detail/context.hpp" +#include "core/mem_alignment.hpp" #include "core/tensor_layout.hpp" #include "core/util_enums.h" #include "tensor_data.hpp" @@ -50,8 +52,7 @@ class Tensor { * * @param[in] reqs An object representing the requirements for this tensor. 
*/ - explicit Tensor(const TensorRequirements &reqs); - explicit Tensor(const TensorRequirements &reqs, const IAllocator &alloc); + explicit Tensor(const TensorRequirements &reqs, const IAllocator &alloc = GlobalContext().getDefaultAllocator()); /** * @brief Constructs a Tensor object given a list of requirements and the underlying data as a TensorStorage @@ -60,8 +61,8 @@ class Tensor { * @param[in] reqs An object representing the requirements for this tensor. * @param[in] data A TensorStorage object for the tensor's underlying data. */ - explicit Tensor(const TensorRequirements &reqs, std::shared_ptr data); - explicit Tensor(const TensorRequirements &reqs, std::shared_ptr data, const IAllocator &alloc); + explicit Tensor(const TensorRequirements &reqs, std::shared_ptr data, + const IAllocator &alloc = GlobalContext().getDefaultAllocator()); /** * @brief Constructs a tensor object and allocates the appropriate amount of memory on the specified device. @@ -71,7 +72,8 @@ class Tensor { * @param[in] device The device the tensor should be allocated on. */ explicit Tensor(const TensorShape &shape, DataType dtype, const eDeviceType device = eDeviceType::GPU); - explicit Tensor(const TensorShape &shape, DataType dtype, const IAllocator &alloc, + explicit Tensor(const TensorShape &shape, DataType dtype, const MemAlignment &bufAlign, + const IAllocator &alloc = GlobalContext().getDefaultAllocator(), const eDeviceType device = eDeviceType::GPU); /** @@ -84,7 +86,8 @@ class Tensor { * @param[in] device The device the tensor should be allocated on. 
*/ explicit Tensor(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device = eDeviceType::GPU); - explicit Tensor(int num_images, Size2D image_size, ImageFormat fmt, const IAllocator &alloc, + explicit Tensor(int num_images, Size2D image_size, ImageFormat fmt, const MemAlignment &bufAlign, + const IAllocator &alloc = GlobalContext().getDefaultAllocator(), eDeviceType device = eDeviceType::GPU); Tensor(const Tensor &other) = delete; @@ -189,6 +192,8 @@ class Tensor { */ static Requirements CalcRequirements(const TensorShape &shape, const DataType &dtype, const eDeviceType device = eDeviceType::GPU); + static Requirements CalcRequirements(const TensorShape &shape, const DataType &dtype, const MemAlignment &bufAlign, + const eDeviceType device = eDeviceType::GPU); /** * @brief Calculates tensor requirements. @@ -214,6 +219,8 @@ class Tensor { */ static Requirements CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device = eDeviceType::GPU); + static Requirements CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt, + const MemAlignment &bufAlign, eDeviceType device = eDeviceType::GPU); /** * @brief Calculates strides required for a tensor. diff --git a/src/core/mem_alignment.cpp b/src/core/mem_alignment.cpp new file mode 100644 index 00000000..247974b1 --- /dev/null +++ b/src/core/mem_alignment.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "core/mem_alignment.hpp" + +namespace roccv { +int32_t MemAlignment::baseAddr() const { return m_baseAddrAlignment; } + +int32_t MemAlignment::rowAddr() const { return m_rowAddrAlignment; } + +MemAlignment& MemAlignment::baseAddr(int32_t alignment) { + m_baseAddrAlignment = alignment; + return *this; +} + +MemAlignment& MemAlignment::rowAddr(int32_t alignment) { + m_rowAddrAlignment = alignment; + return *this; +} +} // namespace roccv \ No newline at end of file diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index 4eeae37b..6e68b8ef 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -28,6 +28,7 @@ THE SOFTWARE. 
#include "core/detail/context.hpp" #include "core/exception.hpp" #include "core/image_format.hpp" +#include "core/mem_alignment.hpp" #include "core/status_type.h" #include "core/tensor_data.hpp" #include "core/tensor_layout.hpp" @@ -39,28 +40,26 @@ THE SOFTWARE. namespace roccv { // Constructor definitions -Tensor::Tensor(const TensorRequirements& reqs) : Tensor(reqs, GlobalContext().getDefaultAllocator()) {} - -Tensor::Tensor(const TensorRequirements& reqs, const IAllocator& alloc) : m_requirements(reqs), m_allocator(alloc) { +Tensor::Tensor(const Tensor::Requirements& reqs, const IAllocator& alloc) : m_requirements(reqs), m_allocator(alloc) { size_t numBytes = reqs.device == eDeviceType::GPU ? reqs.res.deviceMem.bytes : reqs.res.hostMem.bytes; m_data = std::make_shared(numBytes, reqs.device, alloc); } -Tensor::Tensor(const TensorRequirements& reqs, std::shared_ptr data) - : Tensor(reqs, data, GlobalContext().getDefaultAllocator()) {} -Tensor::Tensor(const TensorRequirements& reqs, std::shared_ptr data, const IAllocator& alloc) +Tensor::Tensor(const Tensor::Requirements& reqs, std::shared_ptr data, const IAllocator& alloc) : m_requirements(reqs), m_data(data), m_allocator(alloc) {} Tensor::Tensor(const TensorShape& shape, DataType dtype, const eDeviceType device) - : Tensor(shape, dtype, GlobalContext().getDefaultAllocator(), device) {} + : Tensor(shape, dtype, {}, GlobalContext().getDefaultAllocator(), device) {} -Tensor::Tensor(const TensorShape& shape, DataType dtype, const IAllocator& alloc, const eDeviceType device) +Tensor::Tensor(const TensorShape& shape, DataType dtype, const MemAlignment& bufAlign, const IAllocator& alloc, + const eDeviceType device) : Tensor(CalcRequirements(shape, dtype, device), alloc) {} Tensor::Tensor(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device) - : Tensor(num_images, image_size, fmt, GlobalContext().getDefaultAllocator(), device) {} + : Tensor(num_images, image_size, fmt, {}, 
GlobalContext().getDefaultAllocator(), device) {} -Tensor::Tensor(int num_images, Size2D image_size, ImageFormat fmt, const IAllocator& alloc, eDeviceType device) +Tensor::Tensor(int num_images, Size2D image_size, ImageFormat fmt, const MemAlignment& bufAlign, + const IAllocator& alloc, eDeviceType device) : Tensor(CalcRequirements(num_images, image_size, fmt, device), alloc) {} Tensor::Tensor(Tensor&& other) @@ -108,7 +107,7 @@ Tensor Tensor::reshape(const TensorShape& new_shape) const { eStatusType::INVALID_VALUE); } - TensorRequirements reqs = CalcRequirements(new_shape, this->dtype(), this->device()); + Tensor::Requirements reqs = CalcRequirements(new_shape, this->dtype(), this->device()); return Tensor(reqs, m_data); } @@ -117,7 +116,7 @@ Tensor Tensor::reshape(const TensorShape& new_shape, const DataType& new_dtype) throw Exception("New tensor view must have the same underlying number of bytes.", eStatusType::INVALID_VALUE); } - TensorRequirements reqs = CalcRequirements(new_shape, new_dtype, this->device()); + Tensor::Requirements reqs = CalcRequirements(new_shape, new_dtype, this->device()); return Tensor(reqs, m_data); } @@ -127,15 +126,21 @@ Tensor& Tensor::operator=(const Tensor& other) { return *this; } -TensorRequirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, const eDeviceType device) { +Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, + const eDeviceType device) { + return CalcRequirements(shape, dtype, (MemAlignment){}, device); +} + +Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, + const MemAlignment& bufAlign, const eDeviceType device) { std::array strides = CalcStrides(shape, dtype); - TensorRequirements reqs = CalcRequirements(shape, dtype, strides, device); + Tensor::Requirements reqs = CalcRequirements(shape, dtype, strides, device); return reqs; } -TensorRequirements Tensor::CalcRequirements(const TensorShape& 
shape, const DataType& dtype, - std::array strides, eDeviceType device) { - TensorRequirements reqs; +Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, + std::array strides, eDeviceType device) { + Tensor::Requirements reqs; reqs.shape = shape.shape(); reqs.rank = shape.layout().rank(); @@ -145,24 +150,38 @@ TensorRequirements Tensor::CalcRequirements(const TensorShape& shape, const Data reqs.alignBytes = 0; // TODO: Must be specified later reqs.device = device; - // TODO: Resource requirements should be calculated differently later, once padded/aligned strides have been - // implemented + // Determine resource usage size_t numBytes = reqs.strides[0] * reqs.shape[0]; - if (reqs.device == eDeviceType::GPU) { - reqs.res.deviceMem.bytes = numBytes; - } else if (reqs.device == eDeviceType::CPU) { - reqs.res.hostMem.bytes = numBytes; + switch (reqs.device) { + case eDeviceType::GPU: { + reqs.res.deviceMem.bytes = numBytes; + break; + } + + case eDeviceType::CPU: { + reqs.res.hostMem.bytes = numBytes; + break; + } + + default: { + throw Exception("Unsupported device when calling Tensor::CalcRequirements().", eStatusType::INVALID_VALUE); + } } return reqs; } -TensorRequirements Tensor::CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device) { +Tensor::Requirements Tensor::CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device) { + return CalcRequirements(num_images, image_size, fmt, (MemAlignment){}, device); +} + +Tensor::Requirements Tensor::CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt, + const MemAlignment& bufAlign, eDeviceType device) { // TODO: Need to support different types of tensor layouts. This will happen once more image formats are supported // first. 
TensorShape shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), {num_images, image_size.h, image_size.w, fmt.channels()}); - return CalcRequirements(shape, DataType(fmt.dtype()), device); + return CalcRequirements(shape, DataType(fmt.dtype()), bufAlign, device); } std::array Tensor::CalcStrides(const TensorShape& shape, const DataType& dtype) { @@ -188,8 +207,9 @@ Tensor TensorWrapData(const TensorData& tensor_data) { for (int i = 0; i < tensorDataStrided->rank(); i++) { strides[i] = tensorDataStrided->stride(i); } - TensorRequirements reqs = Tensor::CalcRequirements(tensorDataStrided->shape(), tensorDataStrided->dtype(), strides, - tensorDataStrided->device()); + + Tensor::Requirements reqs = Tensor::CalcRequirements(tensorDataStrided->shape(), tensorDataStrided->dtype(), + strides, tensorDataStrided->device()); auto data = std::make_shared(tensorDataStrided->basePtr(), tensorDataStrided->device(), eOwnership::OWNING); From f345ea072334163ba715d1dc7449ccbc4be8048d Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 20 Nov 2025 12:34:08 -0500 Subject: [PATCH 02/36] Document all constructors for roccv::Tensor --- include/core/mem_alignment.hpp | 2 ++ include/core/tensor.hpp | 63 ++++++++++++++++++++++++++++----- include/core/tensor_storage.hpp | 7 ++++ src/core/tensor.cpp | 4 +-- src/core/tensor_storage.cpp | 3 ++ 5 files changed, 68 insertions(+), 11 deletions(-) diff --git a/include/core/mem_alignment.hpp b/include/core/mem_alignment.hpp index 39f208af..ca17e349 100644 --- a/include/core/mem_alignment.hpp +++ b/include/core/mem_alignment.hpp @@ -21,6 +21,8 @@ #pragma once +#include + namespace roccv { /** diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index f17d4d33..4f85625f 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -61,24 +61,35 @@ class Tensor { * @param[in] reqs An object representing the requirements for this tensor. * @param[in] data A TensorStorage object for the tensor's underlying data. 
*/ - explicit Tensor(const TensorRequirements &reqs, std::shared_ptr data, - const IAllocator &alloc = GlobalContext().getDefaultAllocator()); + explicit Tensor(const TensorRequirements &reqs, std::shared_ptr data); /** - * @brief Constructs a tensor object and allocates the appropriate amount of memory on the specified device. + * @brief Constructs a tensor object and allocates the appropriate amount of memory on the specified device. Uses + * the default memory alignment and allocation strategy. * * @param[in] shape The shape describing the tensor. * @param[in] dtype The underlying datatype of the tensor. * @param[in] device The device the tensor should be allocated on. */ explicit Tensor(const TensorShape &shape, DataType dtype, const eDeviceType device = eDeviceType::GPU); + + /** + * @brief Constructs a tensor object and allocates the appropriate amount of memory on the specified device. Uses a + * user-specified memory alignment and allocation strategy. + * + * @param[in] shape The shape describing the tensor. + * @param[in] dtype The underlying datatype of the tensor. + * @param[in] bufAlign Specification for memory alignment. + * @param[in] alloc The allocation strategy. (Default: DefaultAllocator) + * @param[in] device The device the tensor should be allocated on. + */ explicit Tensor(const TensorShape &shape, DataType dtype, const MemAlignment &bufAlign, const IAllocator &alloc = GlobalContext().getDefaultAllocator(), const eDeviceType device = eDeviceType::GPU); /** * @brief Constructs a tensor using image-based requirements and allocates the appropriate amount of memory on the - * specified device. + * specified device. Uses the default memory alignment and allocation strategy. * * @param[in] num_images The number of images in the batch. * @param[in] image_size The size for images in the batch. @@ -86,6 +97,18 @@ class Tensor { * @param[in] device The device the tensor should be allocated on. 
*/ explicit Tensor(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device = eDeviceType::GPU); + + /** + * @brief Constructs a tensor using image-based requirements and allocates the appropriate amount of memory on the + * specified device. Uses user-provided memory alignment and allocation strategies. + * + * @param[in] num_images The number of images in the batch. + * @param[in] image_size The size for images in the batch. + * @param[in] fmt The format of the underlying image data. + * @param[in] bufAlign Specification for memory alignment. + * @param[in] alloc The allocation strategy. (Default: DefaultAllocator) + * @param[in] device The device the tensor should be allocated on. + */ explicit Tensor(int num_images, Size2D image_size, ImageFormat fmt, const MemAlignment &bufAlign, const IAllocator &alloc = GlobalContext().getDefaultAllocator(), eDeviceType device = eDeviceType::GPU); @@ -181,8 +204,7 @@ class Tensor { Tensor &operator=(const Tensor &other); /** - * @brief Calculates tensor requirements. This essentially wraps the - * provided parameters into a TensorRequirements object. + * @brief Calculates tensor requirements using the default memory alignment strategy. * * @param[in] shape The desired shape of the tensor. * @param[in] dtype The desired data type of the tensor's raw data. @@ -192,11 +214,22 @@ class Tensor { */ static Requirements CalcRequirements(const TensorShape &shape, const DataType &dtype, const eDeviceType device = eDeviceType::GPU); + + /** + * @brief Calculates tensor requirements with a user-provided memory alignment strategy. + * + * @param[in] shape The desired shape of the tensor. + * @param[in] dtype The desired data type of the tensor's raw data. + * @param[in] bufAlign Specification for memory alignment. + * @param[in] device The device the tensor data should belong to. + * @return A TensorRequirements object representing this tensor's + * requirements. 
+ */ static Requirements CalcRequirements(const TensorShape &shape, const DataType &dtype, const MemAlignment &bufAlign, const eDeviceType device = eDeviceType::GPU); /** - * @brief Calculates tensor requirements. + * @brief Calculates tensor requirements with user-provided strides. * * @param[in] shape The shape describing the tensor. * @param[in] dtype The type of the tensor's data. @@ -209,16 +242,28 @@ class Tensor { eDeviceType device = eDeviceType::GPU); /** - * @brief Calculates tensor requirements using image-based parameters. + * @brief Calculates tensor requirements using image-based parameters. This will use a default memory alignment + * strategy. * * @param[in] num_images The number of images in the batch. * @param[in] image_size The size for images in the batch. * @param[in] fmt The format of the underlying image data. * @param[in] device The device the tensor data should belong to. - * @return A TensorRequirements object representing the tensor's requirements. + * @return A Tensor::Requirements object representing the tensor's requirements. */ static Requirements CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device = eDeviceType::GPU); + + /** + * @brief Calculates tensor requirements using image-based parameters and a specified memory alignment. + * + * @param[in] num_images The number of images in the batch. + * @param[in] image_size The size of images in the batch. + * @param[in] fmt The format of the underling image data. + * @param[in] bufAlign Specification for memory alignment. + * @param[in] device The device the tensor is to be allocated on. + * @return A Tensor::Requirements object representing this tensor's requirements. 
+ */ static Requirements CalcRequirements(int num_images, Size2D image_size, ImageFormat fmt, const MemAlignment &bufAlign, eDeviceType device = eDeviceType::GPU); diff --git a/include/core/tensor_storage.hpp b/include/core/tensor_storage.hpp index 7d966ca6..c97d3125 100644 --- a/include/core/tensor_storage.hpp +++ b/include/core/tensor_storage.hpp @@ -71,6 +71,13 @@ class TensorStorage { */ const eDeviceType device() const; + /** + * @brief Returns the allocation strategy being used. + * + * @return The allocation strategy being used. + */ + const IAllocator& allocator() const; + private: eDeviceType m_device; eOwnership m_ownership; diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index 6e68b8ef..b2902253 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -45,8 +45,8 @@ Tensor::Tensor(const Tensor::Requirements& reqs, const IAllocator& alloc) : m_re m_data = std::make_shared(numBytes, reqs.device, alloc); } -Tensor::Tensor(const Tensor::Requirements& reqs, std::shared_ptr data, const IAllocator& alloc) - : m_requirements(reqs), m_data(data), m_allocator(alloc) {} +Tensor::Tensor(const Tensor::Requirements& reqs, std::shared_ptr data) + : m_requirements(reqs), m_data(data), m_allocator(data->allocator()) {} Tensor::Tensor(const TensorShape& shape, DataType dtype, const eDeviceType device) : Tensor(shape, dtype, {}, GlobalContext().getDefaultAllocator(), device) {} diff --git a/src/core/tensor_storage.cpp b/src/core/tensor_storage.cpp index 99686c69..d83d5c44 100644 --- a/src/core/tensor_storage.cpp +++ b/src/core/tensor_storage.cpp @@ -62,4 +62,7 @@ TensorStorage::~TensorStorage() { void* TensorStorage::data() const { return m_data; } const eDeviceType TensorStorage::device() const { return m_device; } + +const IAllocator& TensorStorage::allocator() const { return m_allocator; } + } // namespace roccv From cf99a9c0dfac883e5d9d4543b2c0b30f022df673 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 20 Nov 2025 13:04:30 -0500 Subject: [PATCH 
03/36] Add MemAlign parameter to Tensor::CalcStrides() --- include/core/tensor.hpp | 14 ++++++++------ src/core/tensor.cpp | 14 ++++++-------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index 4f85625f..cb32a149 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -28,11 +28,11 @@ THE SOFTWARE. #include "core/detail/allocators/i_allocator.hpp" #include "core/detail/context.hpp" #include "core/mem_alignment.hpp" +#include "core/tensor_data.hpp" #include "core/tensor_layout.hpp" +#include "core/tensor_requirements.hpp" +#include "core/tensor_storage.hpp" #include "core/util_enums.h" -#include "tensor_data.hpp" -#include "tensor_requirements.hpp" -#include "tensor_storage.hpp" namespace roccv { @@ -268,18 +268,20 @@ class Tensor { const MemAlignment &bufAlign, eDeviceType device = eDeviceType::GPU); /** - * @brief Calculates strides required for a tensor. + * @brief Calculates strides required for a tensor. Uses a user-specified memory alignment strategy to determine the + * strides with padding in mind. * * @param shape The tensor shape. * @param dtype The datatype of the tensor. + * @param bufAlign The memory alignment strategy to use. * @return An array containing strides for the given parameters. */ - static std::array CalcStrides(const TensorShape &shape, const DataType &dtype); + static std::array CalcStrides(const TensorShape &shape, const DataType &dtype, + const MemAlignment &bufAlign); private: TensorRequirements m_requirements; // Tensor metadata std::shared_ptr m_data; // Stores raw tensor data - const IAllocator &m_allocator; }; /** diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index b2902253..2b272e56 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -40,13 +40,13 @@ THE SOFTWARE. 
namespace roccv { // Constructor definitions -Tensor::Tensor(const Tensor::Requirements& reqs, const IAllocator& alloc) : m_requirements(reqs), m_allocator(alloc) { +Tensor::Tensor(const Tensor::Requirements& reqs, const IAllocator& alloc) : m_requirements(reqs) { size_t numBytes = reqs.device == eDeviceType::GPU ? reqs.res.deviceMem.bytes : reqs.res.hostMem.bytes; m_data = std::make_shared(numBytes, reqs.device, alloc); } Tensor::Tensor(const Tensor::Requirements& reqs, std::shared_ptr data) - : m_requirements(reqs), m_data(data), m_allocator(data->allocator()) {} + : m_requirements(reqs), m_data(data) {} Tensor::Tensor(const TensorShape& shape, DataType dtype, const eDeviceType device) : Tensor(shape, dtype, {}, GlobalContext().getDefaultAllocator(), device) {} @@ -62,10 +62,7 @@ Tensor::Tensor(int num_images, Size2D image_size, ImageFormat fmt, const MemAlig const IAllocator& alloc, eDeviceType device) : Tensor(CalcRequirements(num_images, image_size, fmt, device), alloc) {} -Tensor::Tensor(Tensor&& other) - : m_requirements(std::move(other.m_requirements)), - m_data(std::move(other.m_data)), - m_allocator(other.m_allocator) {} +Tensor::Tensor(Tensor&& other) : m_requirements(std::move(other.m_requirements)), m_data(std::move(other.m_data)) {} // Member definitions int Tensor::rank() const { return m_requirements.rank; } @@ -133,7 +130,7 @@ Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const Da Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, const MemAlignment& bufAlign, const eDeviceType device) { - std::array strides = CalcStrides(shape, dtype); + std::array strides = CalcStrides(shape, dtype, bufAlign); Tensor::Requirements reqs = CalcRequirements(shape, dtype, strides, device); return reqs; } @@ -184,7 +181,8 @@ Tensor::Requirements Tensor::CalcRequirements(int num_images, Size2D image_size, return CalcRequirements(shape, DataType(fmt.dtype()), bufAlign, device); } -std::array 
Tensor::CalcStrides(const TensorShape& shape, const DataType& dtype) { +std::array Tensor::CalcStrides(const TensorShape& shape, const DataType& dtype, + const MemAlignment& bufAlign) { // TODO: Support memory alignment and padding in stride calculations // Calculate strides based on the given tensor shape. Strides are byte-wise. From 21f0cb8739578e616b40a83661371a4437db9d70 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 20 Nov 2025 16:03:44 -0500 Subject: [PATCH 04/36] Added Tensor copy constructor an Tensor::dataSize() implementation --- include/core/tensor.hpp | 20 ++++++++++++++------ src/core/tensor.cpp | 20 ++++++++++++++++++-- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index cb32a149..c067d8df 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -27,20 +27,18 @@ THE SOFTWARE. #include "core/data_type.hpp" #include "core/detail/allocators/i_allocator.hpp" #include "core/detail/context.hpp" +#include "core/image_format.hpp" #include "core/mem_alignment.hpp" #include "core/tensor_data.hpp" #include "core/tensor_layout.hpp" #include "core/tensor_requirements.hpp" +#include "core/tensor_shape.hpp" #include "core/tensor_storage.hpp" #include "core/util_enums.h" +#include "operator_types.h" namespace roccv { -class ImageFormat; -struct Size2D; -class TensorShape; -class TensorLayout; - class Tensor { public: using Requirements = TensorRequirements; @@ -113,7 +111,10 @@ class Tensor { const IAllocator &alloc = GlobalContext().getDefaultAllocator(), eDeviceType device = eDeviceType::GPU); - Tensor(const Tensor &other) = delete; + // Copy constructor + Tensor(const Tensor &other); + + // Move constructor Tensor(Tensor &&other); /** @@ -203,6 +204,13 @@ class Tensor { Tensor &operator=(const Tensor &other); + /** + * @brief Returns the total number of bytes being used to store the raw tensor data. + * + * @return Total number of bytes being used to store the raw tensor data. 
+ */ + size_t dataSize() const; + /** * @brief Calculates tensor requirements using the default memory alignment strategy. * diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index 2b272e56..206aa2e6 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -41,8 +41,7 @@ namespace roccv { // Constructor definitions Tensor::Tensor(const Tensor::Requirements& reqs, const IAllocator& alloc) : m_requirements(reqs) { - size_t numBytes = reqs.device == eDeviceType::GPU ? reqs.res.deviceMem.bytes : reqs.res.hostMem.bytes; - m_data = std::make_shared(numBytes, reqs.device, alloc); + m_data = std::make_shared(this->dataSize(), reqs.device, alloc); } Tensor::Tensor(const Tensor::Requirements& reqs, std::shared_ptr data) @@ -62,6 +61,12 @@ Tensor::Tensor(int num_images, Size2D image_size, ImageFormat fmt, const MemAlig const IAllocator& alloc, eDeviceType device) : Tensor(CalcRequirements(num_images, image_size, fmt, device), alloc) {} +// Copy constructor +Tensor::Tensor(const Tensor& other) : m_requirements(other.m_requirements) { + m_data = std::make_shared(this->dataSize(), m_requirements.device, other.m_data->allocator()); +} + +// Move constructor Tensor::Tensor(Tensor&& other) : m_requirements(std::move(other.m_requirements)), m_data(std::move(other.m_data)) {} // Member definitions @@ -123,6 +128,17 @@ Tensor& Tensor::operator=(const Tensor& other) { return *this; } +size_t Tensor::dataSize() const { + switch (m_requirements.device) { + case eDeviceType::GPU: + return m_requirements.res.deviceMem.bytes; + case eDeviceType::CPU: + return m_requirements.res.hostMem.bytes; + } + + return 0; +} + Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, const eDeviceType device) { return CalcRequirements(shape, dtype, (MemAlignment){}, device); From 0a761ce4166499867ec9e1fd4098cca354c4fe0a Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 24 Nov 2025 10:48:12 -0500 Subject: [PATCH 05/36] Remove copy constructor for 
roccv::Tensor --- include/core/tensor.hpp | 5 +---- src/core/tensor.cpp | 5 ----- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index c067d8df..da2d390b 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -111,10 +111,7 @@ class Tensor { const IAllocator &alloc = GlobalContext().getDefaultAllocator(), eDeviceType device = eDeviceType::GPU); - // Copy constructor - Tensor(const Tensor &other); - - // Move constructor + Tensor(const Tensor &other) = delete; Tensor(Tensor &&other); /** diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index 206aa2e6..97492115 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -61,11 +61,6 @@ Tensor::Tensor(int num_images, Size2D image_size, ImageFormat fmt, const MemAlig const IAllocator& alloc, eDeviceType device) : Tensor(CalcRequirements(num_images, image_size, fmt, device), alloc) {} -// Copy constructor -Tensor::Tensor(const Tensor& other) : m_requirements(other.m_requirements) { - m_data = std::make_shared(this->dataSize(), m_requirements.device, other.m_data->allocator()); -} - // Move constructor Tensor::Tensor(Tensor&& other) : m_requirements(std::move(other.m_requirements)), m_data(std::move(other.m_data)) {} From fd84334c2828b2b039ff16ecba1f6c88e90badff Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 27 Nov 2025 11:49:38 -0500 Subject: [PATCH 06/36] Implement memory padding and alignment for tensors --- include/core/mem_alignment.hpp | 21 +++++++- include/core/tensor.hpp | 20 ++++++-- include/core/tensor_requirements.hpp | 24 +++++++++ include/core/tensor_storage.hpp | 1 - include/core/utils.hpp | 77 ++++++++++++++++++++++++++++ python/include/py_tensor.hpp | 77 +++++++++++++++++++++++----- python/src/py_tensor.cpp | 42 +++++++++++---- src/core/tensor.cpp | 56 ++++++++++++++++---- 8 files changed, 277 insertions(+), 41 deletions(-) create mode 100644 include/core/utils.hpp diff --git a/include/core/mem_alignment.hpp 
b/include/core/mem_alignment.hpp index ca17e349..4aceceba 100644 --- a/include/core/mem_alignment.hpp +++ b/include/core/mem_alignment.hpp @@ -26,8 +26,27 @@ namespace roccv { /** - * @brief Defines memory alignment for containers. + * @class MemAlignment + * @brief Class for specifying memory alignment constraints for buffer allocations. * + * The MemAlignment class allows you to specify the memory alignment in bytes that should + * be used when allocating or working with device or host memory buffers. Proper alignment + * is important for performance reasons and hardware compatibility, especially for GPU operations. + * + * There are two types of alignment constraints: + * - Base address alignment: Alignment requirements for the starting address of the buffer. + * - Row address alignment: Alignment requirements for the starting address of each row, which is + * important for multi-dimensional data (e.g., images or tensors). + * + * Both alignment values default to 0, which implies that the system or device default alignment will be used. + * + * Example usage: + * @code + * roccv::MemAlignment align; + * align.baseAddr(256).rowAddr(128); + * @endcode + * + * @see roccv::Tensor */ class MemAlignment { public: diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index da2d390b..14348f63 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -199,6 +199,16 @@ class Tensor { */ Tensor reshape(const TensorShape &new_shape, const DataType &new_dtype) const; + /** + * @brief Performs a shallow copy of the tensor (creates a view). + * + * This assignment operator copies the tensor's metadata and data handle, + * resulting in a new tensor object that shares the same underlying data + * with the original tensor. No deep copy of the data is performed. + * + * @param other The tensor to assign from. + * @return Reference to this tensor. 
+ */ Tensor &operator=(const Tensor &other); /** @@ -239,11 +249,12 @@ class Tensor { * @param[in] shape The shape describing the tensor. * @param[in] dtype The type of the tensor's data. * @param[in] strides The tensor's strides. + * @param[in] baseAlign The base address alignment. * @param[in] device The device the tensor data belongs on. (Default: GPU) * @return Tensor requirements. */ static Requirements CalcRequirements(const TensorShape &shape, const DataType &dtype, - std::array strides, + std::array strides, int32_t baseAlign, eDeviceType device = eDeviceType::GPU); /** @@ -273,16 +284,15 @@ class Tensor { const MemAlignment &bufAlign, eDeviceType device = eDeviceType::GPU); /** - * @brief Calculates strides required for a tensor. Uses a user-specified memory alignment strategy to determine the - * strides with padding in mind. + * @brief Calculates strides required for a tensor. * * @param shape The tensor shape. * @param dtype The datatype of the tensor. - * @param bufAlign The memory alignment strategy to use. + * @param rowAlign The row alignment to use. Setting to 0 will ensure contiguous memory usage. * @return An array containing strides for the given parameters. */ static std::array CalcStrides(const TensorShape &shape, const DataType &dtype, - const MemAlignment &bufAlign); + int32_t rowAlign); private: TensorRequirements m_requirements; // Tensor metadata diff --git a/include/core/tensor_requirements.hpp b/include/core/tensor_requirements.hpp index 0bb6f631..3ff30d4e 100644 --- a/include/core/tensor_requirements.hpp +++ b/include/core/tensor_requirements.hpp @@ -29,16 +29,40 @@ THE SOFTWARE. namespace roccv { +/** + * @brief Specifies basic memory requirements for an allocation. + * + * This struct expresses the number of bytes required for a memory region that backs + * a particular tensor or buffer allocation. It is typically used to indicate the raw + * size required for device, host, or pinned memory allocations. 
+ */ struct MemRequirements { size_t bytes = 0; }; +/** + * @brief Specifies resource requirements for tensor memory allocations. + * + * This struct aggregates requirements for different types of memory resources + * that may be used for tensor allocation and operation: + * - deviceMem: Memory required on the device (e.g., GPU). + * - hostMem: Memory required on the host (CPU-accessible memory). + * - pinnedMem: Memory required in pinned (page-locked) host memory, which may + * be used for efficient device-host transfers. + */ struct ResourceRequirements { MemRequirements deviceMem; MemRequirements hostMem; MemRequirements pinnedMem; }; +/** + * @brief Specifies the requirements for creating and allocating a tensor. + * + * This struct defines all the necessary properties for specifying the memory and layout + * requirements of a tensor, including its datatype, shape, memory alignment, layout, strides, + * memory resource requirements, and device placement. + */ struct TensorRequirements { eDataType dtype; eTensorLayout layout; diff --git a/include/core/tensor_storage.hpp b/include/core/tensor_storage.hpp index c97d3125..201b274a 100644 --- a/include/core/tensor_storage.hpp +++ b/include/core/tensor_storage.hpp @@ -84,5 +84,4 @@ class TensorStorage { void* m_data; const IAllocator& m_allocator; }; - } // namespace roccv \ No newline at end of file diff --git a/include/core/utils.hpp b/include/core/utils.hpp new file mode 100644 index 00000000..480d4e29 --- /dev/null +++ b/include/core/utils.hpp @@ -0,0 +1,77 @@ +/** +Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include + +#include + +namespace roccv::detail { +/** + * @brief Returns the next power of two greater than or equal to the given value. + * + * @tparam T Integral type of the input and output. + * @param value The value to find the next power of two for. + * @return The next power of two greater than or equal to the given value. + */ +template >> +inline constexpr T NextPowerOfTwo(T value) noexcept { + if (value <= 1) return 1; +#if defined(__GNUC__) || defined(__clang__) + // For unsigned types, use clz built-in. For signed, cast to unsigned. + using U = std::make_unsigned_t; + constexpr int numBits = sizeof(T) * 8; + return static_cast(U(1) << (numBits - __builtin_clz(static_cast(value - 1)))); +#else + // Portable fallback: fill lower bits, then add one. 
+ value--; + for (size_t i = 1; i < sizeof(T) * 8; i <<= 1) value |= (value >> i); + return value + 1; +#endif +} + +/** + * @brief Returns true if the given value is a power of two. + * + * @param value The value to check. + * @return True if the given value is a power of two, false otherwise. + */ +template >> +inline constexpr bool IsPowerOfTwo(T value) noexcept { + return value > 0 && (value & (value - 1)) == 0; +} + +/** + * @brief Aligns the given value to the nearest multiple of the given alignment. + * + * @param value The value to align. + * @param alignment The alignment to align to (must be > 0). + * @return The aligned value (same type as value). + */ +template && std::is_integral_v>> +inline constexpr T AlignUp(T value, U alignment) noexcept { + return alignment > 0 + ? (value + static_cast(alignment) - 1) / static_cast(alignment) * static_cast(alignment) + : value; +} +} // namespace roccv::detail \ No newline at end of file diff --git a/python/include/py_tensor.hpp b/python/include/py_tensor.hpp index 523a5f27..0c940148 100644 --- a/python/include/py_tensor.hpp +++ b/python/include/py_tensor.hpp @@ -29,8 +29,19 @@ THE SOFTWARE. #include #include -namespace py = pybind11; - +/** + * @brief A Python-facing wrapper for roccv::Tensor, providing DLPack interoperability and Pybind11 integration. + * + * The PyTensor class serves as a container around roccv::Tensor, exposing its functionality to Python. It supports + * construction from shape, data type, layout, and device information, as well as wrapping existing tensors or + * external DLPack-managed tensors. The class ensures correct lifetime handling for external resources (such as those + * transferred via DLPack) and enables seamless data movement between devices (CPU/GPU). PyTensor also supplies + * utilities to export to and import from DLPack capsules, making it suitable for zero-copy interoperability with + * frameworks like PyTorch, NumPy, or TensorFlow. 
+ * + * Usage scenarios include direct creation from Python, import/export to DLPack, device copying, and serving as a + * type-safe bridge between Python and C++ tensor operations. + */ class PyTensor : public std::enable_shared_from_this { public: /** @@ -79,22 +90,51 @@ class PyTensor : public std::enable_shared_from_this { std::shared_ptr copyTo(eDeviceType device); /** - * @brief Creates a new tensor given a capsule containing a DLManagedTensor. + * @brief Creates a new PyTensor by consuming an external DLPack capsule. * - * @param src A capsule containing a DLManagedTensor. - * @param layout The layout for the new tensor. - * @return std::shared_ptr + * This static method constructs a PyTensor that wraps a new roccv::Tensor allocated + * from the contents of a DLPack capsule (i.e., an object supporting the __dlpack__ protocol), + * typically exported from other frameworks such as PyTorch, NumPy, or TVM. + * The shape and datatype are taken from the capsule's DLTensor metadata, while the layout + * must be specified explicitly, since DLPack does not encode layout. + * + * Ownership of the underlying DLPack-managed memory is transferred to the returned PyTensor, + * such that when the PyTensor is destroyed (and no Python references remain), the deleter + * from the DLPack capsule is called, ensuring correct cross-framework resource management. + * + * @param src Python object supporting the __dlpack__() method. This can be a capsule returned by + * __dlpack__() or any object exposing the DLPack consumer protocol. + * @param layout The tensor layout to use for the new roccv::Tensor (e.g., NHWC, NCHW, etc.). + * @return A std::shared_ptr wrapping a new tensor that shares memory with the DLPack object. + * + * @throws std::runtime_error if the object does not support the DLPack protocol, + * if the capsule is invalid or missing, or if conversion fails in any other way. 
+ * + * @note The resulting tensor will have the same shape, datatype, device, and (if present) strides + * as encoded in the DLPack capsule, but the layout must be provided by the caller. */ static std::shared_ptr fromDLPack(pybind11::object src, eTensorLayout layout); /** - * @brief Creates a DLManagedTensor contained in a capsule from the tensor. + * @brief Exports this tensor as a DLPack-compatible capsule. + * + * This method creates a DLPack DLManagedTensor wrapper for the current PyTensor, + * encapsulates it in a Python capsule with the "dltensor" name, and returns it. + * The resulting capsule can be consumed by any framework supporting the DLPack protocol + * for zero-copy tensor data sharing. * - * @param stream Optional stream pointer value (used for pytorch). + * The optional @p stream argument is present to satisfy the Pytorch DLPack consumer interface, + * but is ignored by this implementation currently. * - * @return py::capsule + * @param stream Optional stream pointer (typically unused, present for PyTorch DLPack compliance). + * + * @return py::capsule A Python capsule containing the DLPack DLManagedTensor. + * + * @note The caller is responsible for transferring or managing ownership of the data + * (according to DLPack conventions), including calling the capsule consumer and ensuring + * that the deleter in DLManagedTensor is invoked only once. */ - py::capsule toDLPack(py::object stream); + pybind11::capsule toDLPack(pybind11::object stream); /** * @brief Gets the strides of the tensor as a python list. @@ -151,15 +191,28 @@ class PyTensor : public std::enable_shared_from_this { * @return A python tuple with the first index corresponding to the device type, and the second index corresponding * to the device id. */ - py::tuple getDLDevice(); + pybind11::tuple getDLDevice(); /** * @brief Exports this class in the provided module. * * @param m The python module to export this class to. 
*/ - static void Export(py::module& m); + static void Export(pybind11::module& m); + /** + * @brief Returns a new PyTensor with a reshaped tensor according to the specified shape and layout. + * + * Creates and returns a new PyTensor whose underlying tensor is a view or copy of this tensor, + * with the shape specified by newShape and the layout specified by layout. The number of elements + * in newShape must match the number of elements in the original tensor. + * + * @param newShape The new shape for the tensor. + * @param layout The new layout to use for the reshaped tensor. + * @return std::shared_ptr A new PyTensor with the reshaped tensor. + * + * @throws std::runtime_error if the total number of elements does not match. + */ std::shared_ptr reshape(std::vector newShape, eTensorLayout layout); private: diff --git a/python/src/py_tensor.cpp b/python/src/py_tensor.cpp index c0b9e864..ab784347 100644 --- a/python/src/py_tensor.cpp +++ b/python/src/py_tensor.cpp @@ -97,10 +97,12 @@ std::shared_ptr PyTensor::copyTo(eDeviceType device) { } std::shared_ptr PyTensor::fromDLPack(pybind11::object src, eTensorLayout layout) { + // Check if the object supports the DLPack protocol if (!py::hasattr(src, "__dlpack__")) { throw std::runtime_error("Provided object does not support the DLPack protocol."); } + // Obtain a DLPack capsule py::capsule dlpackCapsule = src.attr("__dlpack__")(); if (!PyCapsule_IsValid(dlpackCapsule.ptr(), "dltensor")) { throw std::runtime_error("Invalid DLPack capsule."); @@ -108,23 +110,41 @@ std::shared_ptr PyTensor::fromDLPack(pybind11::object src, eTensorLayo DLManagedTensor* dlManagedTensor = static_cast(dlpackCapsule.get_pointer()); DLTensor dlTensor = dlManagedTensor->dl_tensor; - // Mark this capsule as consumed, so that the deleter will not free underlying data. 
+ // Mark this capsule as consumed, so that the deleter will not free underlying data dlpackCapsule.set_name("used_dltensor"); - // Copy shape data - std::vector shapeData(dlTensor.ndim); - for (int i = 0; i < dlTensor.ndim; i++) { + // Copy the shape data from DLPack to a fixed-size array + std::array shapeData; + for (int i = 0; i < dlTensor.ndim; ++i) { shapeData[i] = dlTensor.shape[i]; } - - // Create a non-owning roccv::Tensor based on the received data - roccv::TensorShape shape(roccv::TensorLayout(layout), shapeData); + roccv::TensorShape shape(shapeData, dlTensor.ndim, roccv::TensorLayout(layout)); eDeviceType device = DLDeviceToRoccvDevice(dlTensor.device); - roccv::TensorRequirements reqs = - roccv::Tensor::CalcRequirements(shape, roccv::DataType(DLTypeToRoccvType(dlTensor.dtype)), device); - auto data = std::make_shared(dlTensor.data, device, eOwnership::VIEW); - auto tensor = std::make_shared(reqs, data); + eDataType dtype = DLTypeToRoccvType(dlTensor.dtype); + + // Prepare the strides array + std::array stridesData; + + // If strides are not present, assume contiguous layout. We really shouldn't be recalculating strides here. DLPack + // now enforces that the strides are present, but we'll keep this for backwards compatibility. + if (dlTensor.strides == nullptr) { + stridesData = roccv::Tensor::CalcStrides(shape, roccv::DataType(dtype), 0); + } else { + for (int i = 0; i < dlTensor.ndim; ++i) { + stridesData[i] = dlTensor.strides[i]; + } + } + + // Set up the requirements for the new tensor. + roccv::Tensor::Requirements reqs = + roccv::Tensor::CalcRequirements(shape, roccv::DataType(dtype), stridesData, 0, device); + + // Since this tensor is coming from a DLPack, we don't own the data, so we need to create a view of the data. 
+ std::shared_ptr data = + std::make_shared(dlTensor.data, device, eOwnership::VIEW); + std::shared_ptr tensor = std::make_shared(reqs, data); + // Instantiate a new tensor and a PyTensor to wrap it, binding the original DLManagedTensor return std::make_shared(tensor, dlManagedTensor); } diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index 97492115..97907856 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -23,10 +23,12 @@ THE SOFTWARE. #include "core/tensor.hpp" #include +#include #include "core/data_type.hpp" #include "core/detail/context.hpp" #include "core/exception.hpp" +#include "core/hip_assert.h" #include "core/image_format.hpp" #include "core/mem_alignment.hpp" #include "core/status_type.h" @@ -35,6 +37,7 @@ THE SOFTWARE. #include "core/tensor_requirements.hpp" #include "core/tensor_shape.hpp" #include "core/util_enums.h" +#include "core/utils.hpp" #include "operator_types.h" namespace roccv { @@ -52,14 +55,14 @@ Tensor::Tensor(const TensorShape& shape, DataType dtype, const eDeviceType devic Tensor::Tensor(const TensorShape& shape, DataType dtype, const MemAlignment& bufAlign, const IAllocator& alloc, const eDeviceType device) - : Tensor(CalcRequirements(shape, dtype, device), alloc) {} + : Tensor(CalcRequirements(shape, dtype, bufAlign, device), alloc) {} Tensor::Tensor(int num_images, Size2D image_size, ImageFormat fmt, eDeviceType device) : Tensor(num_images, image_size, fmt, {}, GlobalContext().getDefaultAllocator(), device) {} Tensor::Tensor(int num_images, Size2D image_size, ImageFormat fmt, const MemAlignment& bufAlign, const IAllocator& alloc, eDeviceType device) - : Tensor(CalcRequirements(num_images, image_size, fmt, device), alloc) {} + : Tensor(CalcRequirements(num_images, image_size, fmt, bufAlign, device), alloc) {} // Move constructor Tensor::Tensor(Tensor&& other) : m_requirements(std::move(other.m_requirements)), m_data(std::move(other.m_data)) {} @@ -141,13 +144,41 @@ Tensor::Requirements 
Tensor::CalcRequirements(const TensorShape& shape, const Da Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, const MemAlignment& bufAlign, const eDeviceType device) { - std::array strides = CalcStrides(shape, dtype, bufAlign); - Tensor::Requirements reqs = CalcRequirements(shape, dtype, strides, device); + int dev; + HIP_VALIDATE_NO_ERRORS(hipGetDevice(&dev)); + + // Validate memory alignment, set default alignment if set to 0. + // TODO: Must be supported for CPU as well. + int rowAlign; + if (bufAlign.rowAddr() == 0) { + HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&rowAlign, hipDeviceAttributeTexturePitchAlignment, dev)); + rowAlign = std::lcm(rowAlign, detail::NextPowerOfTwo(dtype.size())); + } else { + if (!detail::IsPowerOfTwo(bufAlign.rowAddr())) { + throw Exception("Row address alignment must be a power of two.", eStatusType::INVALID_VALUE); + } + rowAlign = std::lcm(bufAlign.rowAddr(), detail::NextPowerOfTwo(dtype.size())); + } + + int baseAlign; + if (bufAlign.baseAddr() == 0) { + HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&baseAlign, hipDeviceAttributeTextureAlignment, dev)); + baseAlign = std::lcm(baseAlign, detail::NextPowerOfTwo(dtype.size())); + } else { + if (!detail::IsPowerOfTwo(bufAlign.baseAddr())) { + throw Exception("Base address alignment must be a power of two.", eStatusType::INVALID_VALUE); + } + baseAlign = std::lcm(bufAlign.baseAddr(), detail::NextPowerOfTwo(dtype.size())); + } + + std::array strides = CalcStrides(shape, dtype, rowAlign); + Tensor::Requirements reqs = CalcRequirements(shape, dtype, strides, baseAlign, device); return reqs; } Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, - std::array strides, eDeviceType device) { + std::array strides, int32_t baseAlign, + eDeviceType device) { Tensor::Requirements reqs; reqs.shape = shape.shape(); @@ -155,7 +186,7 @@ Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const 
Da reqs.layout = shape.layout().elayout(); reqs.strides = strides; reqs.dtype = dtype.etype(); - reqs.alignBytes = 0; // TODO: Must be specified later + reqs.alignBytes = baseAlign; reqs.device = device; // Determine resource usage @@ -193,14 +224,17 @@ Tensor::Requirements Tensor::CalcRequirements(int num_images, Size2D image_size, } std::array Tensor::CalcStrides(const TensorShape& shape, const DataType& dtype, - const MemAlignment& bufAlign) { - // TODO: Support memory alignment and padding in stride calculations - + int32_t rowAlign) { // Calculate strides based on the given tensor shape. Strides are byte-wise. std::array strides; strides[shape.layout().rank() - 1] = dtype.size(); for (int i = shape.layout().rank() - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * shape[i + 1]; + // Ensure strides for the row are padded to the next multiple of the alignment. + if (i == shape.layout().height_index()) { + strides[i] = detail::AlignUp(strides[i + 1] * shape[i + 1], rowAlign); + } else { + strides[i] = strides[i + 1] * shape[i + 1]; + } } return strides; } @@ -218,7 +252,7 @@ Tensor TensorWrapData(const TensorData& tensor_data) { } Tensor::Requirements reqs = Tensor::CalcRequirements(tensorDataStrided->shape(), tensorDataStrided->dtype(), - strides, tensorDataStrided->device()); + strides, 0, tensorDataStrided->device()); auto data = std::make_shared(tensorDataStrided->basePtr(), tensorDataStrided->device(), eOwnership::OWNING); From c232d1fcae5f9b5fe7b1bd8b2835884e95c64d89 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 27 Nov 2025 14:45:44 -0500 Subject: [PATCH 07/36] Fix alignment calculations --- include/core/utils.hpp | 21 ++++++--------------- src/core/tensor.cpp | 4 ++++ tests/roccv/cpp/test_tensor.cpp | 28 +++++++++++++++++++++------- 3 files changed, 31 insertions(+), 22 deletions(-) diff --git a/include/core/utils.hpp b/include/core/utils.hpp index 480d4e29..4449ce5b 100644 --- a/include/core/utils.hpp +++ b/include/core/utils.hpp @@ -24,28 +24,19 
@@ THE SOFTWARE. #include +#include #include namespace roccv::detail { -/** - * @brief Returns the next power of two greater than or equal to the given value. - * - * @tparam T Integral type of the input and output. - * @param value The value to find the next power of two for. - * @return The next power of two greater than or equal to the given value. - */ -template >> -inline constexpr T NextPowerOfTwo(T value) noexcept { + +inline constexpr size_t NextPowerOfTwo(size_t value) noexcept { if (value <= 1) return 1; #if defined(__GNUC__) || defined(__clang__) - // For unsigned types, use clz built-in. For signed, cast to unsigned. - using U = std::make_unsigned_t; - constexpr int numBits = sizeof(T) * 8; - return static_cast(U(1) << (numBits - __builtin_clz(static_cast(value - 1)))); + constexpr int numBits = sizeof(size_t) * 8; + return 1UL << (numBits - __builtin_clzl(value - 1)); #else - // Portable fallback: fill lower bits, then add one. value--; - for (size_t i = 1; i < sizeof(T) * 8; i <<= 1) value |= (value >> i); + for (size_t i = 1; i < sizeof(size_t) * 8; i <<= 1) value |= (value >> i); return value + 1; #endif } diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index 97907856..e26590bb 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -152,6 +152,8 @@ Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const Da int rowAlign; if (bufAlign.rowAddr() == 0) { HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&rowAlign, hipDeviceAttributeTexturePitchAlignment, dev)); + printf("rowAlign: %d\n", rowAlign); + printf("NextPowerOfTwo(dtype.size()): %lu\n", detail::NextPowerOfTwo(dtype.size())); rowAlign = std::lcm(rowAlign, detail::NextPowerOfTwo(dtype.size())); } else { if (!detail::IsPowerOfTwo(bufAlign.rowAddr())) { @@ -159,6 +161,7 @@ Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const Da } rowAlign = std::lcm(bufAlign.rowAddr(), detail::NextPowerOfTwo(dtype.size())); } + printf("rowAlign: %d\n", 
rowAlign); int baseAlign; if (bufAlign.baseAddr() == 0) { @@ -170,6 +173,7 @@ Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const Da } baseAlign = std::lcm(bufAlign.baseAddr(), detail::NextPowerOfTwo(dtype.size())); } + printf("baseAlign: %d\n", baseAlign); std::array strides = CalcStrides(shape, dtype, rowAlign); Tensor::Requirements reqs = CalcRequirements(shape, dtype, strides, baseAlign, device); diff --git a/tests/roccv/cpp/test_tensor.cpp b/tests/roccv/cpp/test_tensor.cpp index c0442df6..030579cb 100644 --- a/tests/roccv/cpp/test_tensor.cpp +++ b/tests/roccv/cpp/test_tensor.cpp @@ -20,6 +20,7 @@ */ #include +#include #include "test_helpers.hpp" @@ -33,16 +34,22 @@ namespace { * * @param shape The tensor's shape. * @param dtype The datatype of the tensor. + * @param rowAlign The row alignment to use. Setting to 0 will ensure contiguous memory usage. * @return A list of strides for each dimension of the given shape. */ -std::vector CalculateStrides(const TensorShape& shape, const DataType& dtype) { +std::vector CalculateStrides(const TensorShape& shape, const DataType& dtype, int32_t rowAlign) { std::vector strides(shape.layout().rank()); // Strides are calculated byte-wise. Therefore, the highest dimension will refer to the stride between singular // elements (which, in turn, is the number of bytes per said element). strides[shape.layout().rank() - 1] = dtype.size(); for (int i = shape.layout().rank() - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * shape[i + 1]; + // Use the row alignment for the height dimension. + if (i == shape.layout().height_index()) { + strides[i] = detail::AlignUp(strides[i + 1] * shape[i + 1], rowAlign); + } else { + strides[i] = strides[i + 1] * shape[i + 1]; + } } return strides; } @@ -61,17 +68,22 @@ void TestNegativeTensorShape() { } /** - * @brief Negative tests related to the Tensor object. + * @brief Negative tests for the Tensor class, verifying error handling in invalid scenarios. 
* + * These tests confirm that the Tensor class appropriately throws exceptions when: + * 1. Attempting to reshape a tensor to a shape with a different number of elements. + * 2. Attempting to reshape a tensor to a shape and datatype combination that would change the total number of bytes + * in the underlying storage. + * + * In both cases, the expected behavior is to throw an exception of type eStatusType::INVALID_VALUE. */ void TestNegativeTensor() { { - // Should not be able to reshape tensor into another view with a differing number of elements + // Case 1: Reshaping to a different number of elements is invalid Tensor tensor(TensorShape({1, 2, 3}, "HWC"), DataType(DATA_TYPE_U8)); EXPECT_EXCEPTION(tensor.reshape(TensorShape({1, 1, 2, 2}, "NHWC")), eStatusType::INVALID_VALUE); - // Should not be able to reshape tensor into another view which would result in a different number of bytes in - // the underlying memory. + // Case 2: Reshaping to a different total byte size is invalid, even if element count matches EXPECT_EXCEPTION(tensor.reshape(TensorShape({1, 1, 2, 3}, "NHWC"), DataType(DATA_TYPE_S16)), eStatusType::INVALID_VALUE); } @@ -131,7 +143,9 @@ void TestTensorCorrectness() { */ void TestTensorStrideCalculation(const TensorShape& shape, const DataType& dtype) { Tensor tensor(shape, dtype); - std::vector expectedStrides = CalculateStrides(shape, dtype); + + // TODO: Use row alignment from device attributes instead of a hardcoded value. 
+ std::vector expectedStrides = CalculateStrides(shape, dtype, 256); std::vector actualStrides(tensor.rank()); auto data = tensor.exportData(); From c90475b400c847ceff713a73a7dc48143ee7374f Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 3 Dec 2025 11:45:58 -0500 Subject: [PATCH 08/36] Take padding into account during copies in test helpers --- include/core/tensor.hpp | 2 +- src/core/tensor.cpp | 8 +- tests/roccv/cpp/common/test_helpers.hpp | 124 +++++++++++------------- 3 files changed, 60 insertions(+), 74 deletions(-) diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index 14348f63..ce1271b5 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -254,7 +254,7 @@ class Tensor { * @return Tensor requirements. */ static Requirements CalcRequirements(const TensorShape &shape, const DataType &dtype, - std::array strides, int32_t baseAlign, + const std::array strides, int32_t baseAlign, eDeviceType device = eDeviceType::GPU); /** diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index e26590bb..282eeebd 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -152,8 +152,6 @@ Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const Da int rowAlign; if (bufAlign.rowAddr() == 0) { HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&rowAlign, hipDeviceAttributeTexturePitchAlignment, dev)); - printf("rowAlign: %d\n", rowAlign); - printf("NextPowerOfTwo(dtype.size()): %lu\n", detail::NextPowerOfTwo(dtype.size())); rowAlign = std::lcm(rowAlign, detail::NextPowerOfTwo(dtype.size())); } else { if (!detail::IsPowerOfTwo(bufAlign.rowAddr())) { @@ -161,7 +159,6 @@ Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const Da } rowAlign = std::lcm(bufAlign.rowAddr(), detail::NextPowerOfTwo(dtype.size())); } - printf("rowAlign: %d\n", rowAlign); int baseAlign; if (bufAlign.baseAddr() == 0) { @@ -173,7 +170,6 @@ Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const Da } baseAlign = 
std::lcm(bufAlign.baseAddr(), detail::NextPowerOfTwo(dtype.size())); } - printf("baseAlign: %d\n", baseAlign); std::array strides = CalcStrides(shape, dtype, rowAlign); Tensor::Requirements reqs = CalcRequirements(shape, dtype, strides, baseAlign, device); @@ -181,8 +177,8 @@ Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const Da } Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, - std::array strides, int32_t baseAlign, - eDeviceType device) { + const std::array strides, + int32_t baseAlign, eDeviceType device) { Tensor::Requirements reqs; reqs.shape = shape.shape(); diff --git a/tests/roccv/cpp/common/test_helpers.hpp b/tests/roccv/cpp/common/test_helpers.hpp index 6c1c47de..36a6cb73 100644 --- a/tests/roccv/cpp/common/test_helpers.hpp +++ b/tests/roccv/cpp/common/test_helpers.hpp @@ -267,24 +267,6 @@ eTestStatusType compareImage(const Tensor& tensor, const std::string& filename, void writeTensor(const Tensor& tensor, const std::string& output_file); -template -void copyData(const Tensor& input, const std::span& data, eDeviceType device) { - auto tensor_data = input.exportData(); - - switch (device) { - case eDeviceType::GPU: { - HIP_VALIDATE_NO_ERRORS( - hipMemcpy(tensor_data.basePtr(), data.data(), data.size() * sizeof(T), hipMemcpyHostToDevice)); - break; - } - - case eDeviceType::CPU: { - memcpy(tensor_data.basePtr(), data.data(), data.size() * sizeof(T)); - break; - } - } -} - /** * @brief Fills a vector with random values based on a provided seed. * @@ -400,75 +382,83 @@ void CompareVectorsNear(const std::vector& result, const std::vector& ref, } } } +/** + * @brief Computes copy parameters for any tensor layout. + * @param[in] tensor The tensor to compute copy parameters for. 
+ * @return tuple of (row_width_bytes, num_rows, tensor_pitch) + * If no padding, returns (total_size, 1, total_size) + */ +inline std::tuple ComputeCopyParams(const Tensor& tensor) { + auto tensorData = tensor.exportData(); + const auto& layout = tensor.layout(); + int heightIdx = layout.height_index(); + + if (heightIdx < 0) { + // No height dimension = contiguous data, no padding + size_t totalSize = tensor.shape().size() * tensor.dtype().size(); + return {totalSize, 1, totalSize}; + } + + // Row width = product of all dimensions AFTER height_index × dtype size + size_t rowWidth = tensor.dtype().size(); + for (int i = heightIdx + 1; i < layout.rank(); ++i) { + rowWidth *= tensor.shape(i); + } + + // Number of rows = product of all dimensions UP TO AND INCLUDING height_index + size_t numRows = 1; + for (int i = 0; i <= heightIdx; ++i) { + numRows *= tensor.shape(i); + } + + // Tensor pitch comes from the stride at height_index + size_t tensorPitch = tensorData.stride(heightIdx); + + return {rowWidth, numRows, tensorPitch}; +} /** - * @brief Copies vector data into a roccv::Tensor. This will copy vector data into either GPU memory or CPU memory, - * depending on the device specified in the roccv::Tensor's metadata. + * @brief Copies vector data into a tensor. Works with any tensor layout. * - * @tparam T The base datatype of the underlying data. - * @param dst The destination roccv::Tensor to copy data into. - * @param src A source vector containing data. - * @throws std::runtime_error if the size of the dst and src do not match. + * @tparam T The data type of the elements to copy. + * @param[out] dst The tensor to copy vector data into. + * @param[in] src The vector to copy data from. 
*/ template -void CopyVectorIntoTensor(const Tensor& dst, std::vector& src) { +void CopyVectorIntoTensor(const Tensor& dst, const std::vector& src) { auto tensorData = dst.exportData(); - size_t dataSize = dst.shape().size() * dst.dtype().size(); + auto [rowWidth, numRows, dstPitch] = ComputeCopyParams(dst); - // Ensure source and destination have the same amount of memory allocated. - if (dataSize != src.size() * sizeof(T)) { - throw std::runtime_error( - "Cannot copy source vector into destination tensor. Size of src vector and destination tensor do not " - "match."); - } + // Source is always contiguous + size_t srcPitch = rowWidth; - switch (dst.device()) { - case eDeviceType::GPU: { - HIP_VALIDATE_NO_ERRORS( - hipMemcpy(tensorData.basePtr(), src.data(), dataSize, hipMemcpyKind::hipMemcpyHostToDevice)); - break; - } + hipMemcpyKind kind = (dst.device() == eDeviceType::GPU) ? hipMemcpyHostToDevice : hipMemcpyHostToHost; - case eDeviceType::CPU: { - HIP_VALIDATE_NO_ERRORS( - hipMemcpy(tensorData.basePtr(), src.data(), dataSize, hipMemcpyKind::hipMemcpyHostToHost)); - break; - } - } + HIP_VALIDATE_NO_ERRORS(hipMemcpy2D(tensorData.basePtr(), dstPitch, src.data(), srcPitch, rowWidth, numRows, kind)); } /** - * @brief Copies roccv::Tensor data into a destination vector. + * @brief Copies tensor data into a vector. Works with any tensor layout. * - * @tparam T The base datatype of the underlying tensor data. - * @param dst The destination vector which the data will be copied into. - * @param src The roccv::Tensor containing the source data. - * @throws std::runtime_error if the size of src and dst do not match. + * The destination vector @p dst is assumed to be compact/contiguous in memory; data will be copied into it as a dense + * contiguous array. + * + * @tparam T The data type of the elements to copy. + * @param[out] dst The vector to copy tensor data into. Must be preallocated to the correct size and will be filled + * contiguously. 
+ * @param[in] src The tensor to copy data from. */ template void CopyTensorIntoVector(std::vector& dst, const Tensor& src) { - size_t size = src.shape().size() * src.dtype().size(); auto tensorData = src.exportData(); + auto [rowWidth, numRows, srcPitch] = ComputeCopyParams(src); - if (size != dst.size() * sizeof(T)) { - throw std::runtime_error( - "Cannot copy source tensor data into destination vector. Size of destination vector and source tensor do " - "not match."); - } + // Destination is always contiguous + size_t dstPitch = rowWidth; - switch (src.device()) { - case eDeviceType::GPU: { - HIP_VALIDATE_NO_ERRORS( - hipMemcpy(dst.data(), tensorData.basePtr(), size, hipMemcpyKind::hipMemcpyDeviceToHost)); - break; - } + hipMemcpyKind kind = (src.device() == eDeviceType::GPU) ? hipMemcpyDeviceToHost : hipMemcpyHostToHost; - case eDeviceType::CPU: { - HIP_VALIDATE_NO_ERRORS( - hipMemcpy(dst.data(), tensorData.basePtr(), size, hipMemcpyKind::hipMemcpyHostToHost)); - break; - } - } + HIP_VALIDATE_NO_ERRORS(hipMemcpy2D(dst.data(), dstPitch, tensorData.basePtr(), srcPitch, rowWidth, numRows, kind)); } } // namespace tests From 18d0890f1e96517ef951d240f96654010af1edfe Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 3 Dec 2025 11:58:06 -0500 Subject: [PATCH 09/36] Add isContiguous member function for roccv::Tensor --- include/core/tensor.hpp | 7 +++++++ src/core/tensor.cpp | 21 ++++++++++++--------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index ce1271b5..6ca8c5cf 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -218,6 +218,13 @@ class Tensor { */ size_t dataSize() const; + /** + * @brief Returns true if the tensor is contiguous in memory, meaning there is no padding present in the tensor. + * + * @return True if the tensor is contiguous in memory, false otherwise. 
+ */ + bool isContiguous() const; + /** * @brief Calculates tensor requirements using the default memory alignment strategy. * diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index 282eeebd..bfd5835a 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -101,6 +101,11 @@ TensorData Tensor::exportData() const { } Tensor Tensor::reshape(const TensorShape& new_shape) const { + if (!isContiguous()) { + throw Exception("Tensor is not contiguous. Reshape can only be performed on contiguous tensors.", + eStatusType::INVALID_VALUE); + } + // New tensor shape must have the same number of elements if (new_shape.size() != this->shape().size()) { throw Exception("New tensor shape does not match the number of elements of the old shape.", @@ -112,6 +117,11 @@ Tensor Tensor::reshape(const TensorShape& new_shape) const { } Tensor Tensor::reshape(const TensorShape& new_shape, const DataType& new_dtype) const { + if (!isContiguous()) { + throw Exception("Tensor is not contiguous. Reshape can only be performed on contiguous tensors.", + eStatusType::INVALID_VALUE); + } + if (new_shape.size() * new_dtype.size() != this->shape().size() * this->dtype().size()) { throw Exception("New tensor view must have the same underlying number of bytes.", eStatusType::INVALID_VALUE); } @@ -126,16 +136,9 @@ Tensor& Tensor::operator=(const Tensor& other) { return *this; } -size_t Tensor::dataSize() const { - switch (m_requirements.device) { - case eDeviceType::GPU: - return m_requirements.res.deviceMem.bytes; - case eDeviceType::CPU: - return m_requirements.res.hostMem.bytes; - } +size_t Tensor::dataSize() const { return m_requirements.strides[0] * m_requirements.shape[0]; } - return 0; -} +bool Tensor::isContiguous() const { return dataSize() == shape().size() * dtype().size(); } Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, const eDeviceType device) { From e97207ad8e3e4bf890a2f1411bed17e6c3c82d95 Mon Sep 17 00:00:00 2001 From: Zach 
Vincze Date: Wed, 3 Dec 2025 12:11:56 -0500 Subject: [PATCH 10/36] Remove non-contiguous reshaping from tests --- tests/roccv/cpp/test_tensor.cpp | 38 ++++++++++----------------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/tests/roccv/cpp/test_tensor.cpp b/tests/roccv/cpp/test_tensor.cpp index 030579cb..ae9a1016 100644 --- a/tests/roccv/cpp/test_tensor.cpp +++ b/tests/roccv/cpp/test_tensor.cpp @@ -71,21 +71,15 @@ void TestNegativeTensorShape() { * @brief Negative tests for the Tensor class, verifying error handling in invalid scenarios. * * These tests confirm that the Tensor class appropriately throws exceptions when: - * 1. Attempting to reshape a tensor to a shape with a different number of elements. - * 2. Attempting to reshape a tensor to a shape and datatype combination that would change the total number of bytes - * in the underlying storage. + * 1. Attempting to reshape a non-contiguous tensor. * * In both cases, the expected behavior is to throw an exception of type eStatusType::INVALID_VALUE. 
*/ void TestNegativeTensor() { + // Reshaping a non-contiguous tensor is invalid { - // Case 1: Reshaping to a different number of elements is invalid Tensor tensor(TensorShape({1, 2, 3}, "HWC"), DataType(DATA_TYPE_U8)); - EXPECT_EXCEPTION(tensor.reshape(TensorShape({1, 1, 2, 2}, "NHWC")), eStatusType::INVALID_VALUE); - - // Case 2: Reshaping to a different total byte size is invalid, even if element count matches - EXPECT_EXCEPTION(tensor.reshape(TensorShape({1, 1, 2, 3}, "NHWC"), DataType(DATA_TYPE_S16)), - eStatusType::INVALID_VALUE); + EXPECT_EXCEPTION(tensor.reshape(TensorShape({1, 1, 2, 3}, "NHWC")), eStatusType::INVALID_VALUE); } } @@ -108,21 +102,6 @@ void TestTensorCorrectness() { EXPECT_EQ(tensor.dtype().size(), 1); } - // Tensor reshape: Change layout - { - // Reshape tensor from NHWC -> HWC layout - Tensor tensor(1, {720, 480}, FMT_RGB8); - Tensor reshapedTensor = tensor.reshape(TensorShape({720, 480, 3}, "HWC")); - EXPECT_EQ(reshapedTensor.rank(), 3); - EXPECT_NE(reshapedTensor.rank(), tensor.rank()); - EXPECT_EQ(reshapedTensor.shape().size(), tensor.shape().size()); - - // Ensure they are sharing the same underlying data - auto data = tensor.exportData(); - auto dataReshaped = reshapedTensor.exportData(); - EXPECT_TRUE(data.basePtr() == dataReshaped.basePtr()); - } - // Tensor reshape: Change layout and datatype { Tensor tensor(TensorShape({1, 5, 4}, "NWC"), DataType(DATA_TYPE_S16)); @@ -142,10 +121,15 @@ void TestTensorCorrectness() { * @brief Tests internal stride calculations on Tensor construction. 
*/ void TestTensorStrideCalculation(const TensorShape& shape, const DataType& dtype) { - Tensor tensor(shape, dtype); + Tensor tensor(shape, dtype, eDeviceType::GPU); + + // Get row alignment from device attributes + int dev; + HIP_VALIDATE_NO_ERRORS(hipGetDevice(&dev)); + int rowAlign; + HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&rowAlign, hipDeviceAttributeTexturePitchAlignment, dev)); - // TODO: Use row alignment from device attributes instead of a hardcoded value. - std::vector expectedStrides = CalculateStrides(shape, dtype, 256); + std::vector expectedStrides = CalculateStrides(shape, dtype, rowAlign); std::vector actualStrides(tensor.rank()); auto data = tensor.exportData(); From 6102764a0b98911c5be298945f281c2c49c63d3f Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Fri, 5 Dec 2025 14:27:35 -0500 Subject: [PATCH 11/36] Update utils.hpp to use memcpy2D to handle tensor padding --- samples/common/utils.hpp | 142 ++++++++++++++++++++++++++++++++++- samples/copy_make_border.cpp | 57 +++++++------- 2 files changed, 167 insertions(+), 32 deletions(-) diff --git a/samples/common/utils.hpp b/samples/common/utils.hpp index 61ae525d..36aeb127 100644 --- a/samples/common/utils.hpp +++ b/samples/common/utils.hpp @@ -39,7 +39,7 @@ inline void CheckHIPError(hipError_t code, const char *file, const int line) { CheckHIPError((val), __FILE__, __LINE__); \ } -bool ContainsExtension(const std::filesystem::path &path, const std::vector &extension_list) { +inline bool ContainsExtension(const std::filesystem::path &path, const std::vector &extension_list) { for (auto extension : extension_list) { if (path.extension() == extension) return true; } @@ -47,6 +47,142 @@ bool ContainsExtension(const std::filesystem::path &path, const std::vector(); + params.rowPitch = tensorData.stride(tensor.layout().height_index()); + params.rowBytes = tensor.shape(tensor.layout().width_index()) * tensor.shape(tensor.layout().channels_index()) * + tensor.dtype().size(); + params.imageBytes = 
params.rowBytes * tensor.shape(tensor.layout().height_index()); + params.basePtr = tensorData.basePtr(); + + return params; +} + +/** + * @brief Loads an image, or multiple images if given a directory, into a tensor. Will be in NHWC layout and U8 format. + * All images must be of the same size and format. This is a blocking operation. + * + * @param image_path The path to the image to load. If a directory is provided, all supported images in the directory + * will be loaded. + * @param device The device to load the images onto. Defaults to GPU. + * @return A NHWC tensor containing the loaded images. + */ +inline roccv::Tensor LoadImages(const std::string &image_path, eDeviceType device = eDeviceType::GPU) { + const std::vector supportedExtensions = {".bmp", ".jpg", ".jpeg", ".png"}; + + std::vector images; + + int width = -1; + int height = -1; + int channels = -1; + + // Load images from directory or file if a single image is provided + if (std::filesystem::is_directory(image_path)) { + for (auto file : std::filesystem::directory_iterator(image_path)) { + if (!std::filesystem::is_directory(file.path()) && ContainsExtension(file.path(), supportedExtensions)) { + images.push_back(cv::imread(file.path())); + + // Check if all images are of the same size + if (width == -1 && height == -1 && channels == -1) { + width = images.back().cols; + height = images.back().rows; + channels = images.back().channels(); + } else if (images.back().cols != width || images.back().rows != height || + images.back().channels() != channels) { + throw std::runtime_error("All images must be of the same size and format"); + } + } + } + } else if (std::filesystem::is_regular_file(image_path) && ContainsExtension(image_path, supportedExtensions)) { + images.push_back(cv::imread(image_path)); + width = images.back().cols; + height = images.back().rows; + channels = images.back().channels(); + } else { + throw std::runtime_error("Cannot decode " + image_path + ". 
File type not supported.\n"); + } + + if (images.empty()) { + throw std::runtime_error("No valid images found in directory " + image_path); + } + + // Create tensor and prepare arguments for hipMemcpy2D + roccv::Tensor tensor(images.size(), roccv::Size2D(width, height), + roccv::ImageFormat(eDataType::DATA_TYPE_U8, channels), device); + + MemcpyParams params = GetMemcpyParams(tensor); + + // Copy images into tensor + hipMemcpyKind kind = (device == eDeviceType::GPU) ? hipMemcpyHostToDevice : hipMemcpyHostToHost; + for (int i = 0; i < images.size(); i++) { + CHECK_HIP_ERROR(hipMemcpy2D(static_cast(params.basePtr) + i * params.imageBytes, params.rowPitch, + images[i].data, params.rowBytes, params.rowBytes, height, kind)); + } + + return tensor; +} + +/** + * @brief Writes a batch of images from a tensor to the specified output path. This is a blocking operation. + * + * @param tensor The tensor to write the images from. + * @param output_path The path to write the images to. If a directory is provided, the images will be written to the + * directory. + */ +inline void WriteImages(const roccv::Tensor &tensor, const std::string &output_path) { + if (tensor.layout() != eTensorLayout::TENSOR_LAYOUT_NHWC && tensor.layout() != eTensorLayout::TENSOR_LAYOUT_HWC) { + throw std::runtime_error( + "Unsupported tensor layout in WriteImages(). Only NHWC and HWC layouts are supported."); + } + + int64_t height = tensor.shape(tensor.layout().height_index()); + int64_t width = tensor.shape(tensor.layout().width_index()); + int64_t batchSize = + tensor.layout().batch_index() < 0 ? 1 : tensor.shape(tensor.layout().batch_index()); // Support for HWC layout + int64_t channels = tensor.shape(tensor.layout().channels_index()); + + // Get OpenCV image format + int64_t cvFormat = CV_MAKETYPE(CV_8U, channels); + + // Get memcpy parameters + MemcpyParams params = GetMemcpyParams(tensor); + hipMemcpyKind kind = (tensor.device() == eDeviceType::GPU) ? 
hipMemcpyDeviceToHost : hipMemcpyHostToHost; + + // Copy images from tensor to OpenCV image vector + std::vector images(batchSize); + for (int i = 0; i < batchSize; i++) { + images[i] = cv::Mat(height, width, cvFormat); + CHECK_HIP_ERROR(hipMemcpy2D(images[i].data, params.rowBytes, + static_cast(params.basePtr) + i * params.imageBytes, params.rowPitch, + params.rowBytes, height, kind)); + } + + if (std::filesystem::is_directory(output_path)) { + for (int i = 0; i < batchSize; i++) { + std::ostringstream outFilename; + outFilename << output_path << "/image_" << i << ".bmp"; + cv::imwrite(outFilename.str().c_str(), images[i]); + } + } else { + cv::imwrite(output_path, images[0]); + } +} + /** * @brief Loads images into the GPU memory specified. * @@ -54,7 +190,7 @@ bool ContainsExtension(const std::filesystem::path &path, const std::vector supportedExtensions = {".bmp", ".jpg", ".jpeg", ".png"}; std::vector imageFiles; @@ -99,7 +235,7 @@ void DecodeRGBIImage(const std::string &images_dir, int num_images, void *gpu_in * @param tensor A tensor containing a batch of RGBI images. * @param stream The HIP stream to synchronize with. */ -void WriteRGBITensor(const roccv::Tensor &tensor, hipStream_t stream) { +inline void WriteRGBITensor(const roccv::Tensor &tensor, hipStream_t stream) { CHECK_HIP_ERROR(hipStreamSynchronize(stream)); auto srcData = tensor.exportData(); diff --git a/samples/copy_make_border.cpp b/samples/copy_make_border.cpp index be7be7e5..bf515dc7 100644 --- a/samples/copy_make_border.cpp +++ b/samples/copy_make_border.cpp @@ -27,12 +27,23 @@ THE SOFTWARE. #include #include +#include "common/utils.hpp" + using namespace roccv; /** * @brief Copy make border operation example. + * + * This sample demonstrates the usage of the CopyMakeBorder operator. It loads an image from the specified input path, + * creates a border around the image based on the specified border mode and border value, and writes the output image to + * the specified output path. 
+ * + * Usage: + * ./copy_make_border + * */ int main(int argc, char** argv) { + // Validate command line arguments if (argc != 11) { std::cerr << "Usage: " << argv[0] << " " @@ -40,8 +51,9 @@ int main(int argc, char** argv) { return EXIT_FAILURE; } - HIP_VALIDATE_NO_ERRORS(hipSetDevice(std::stoi(argv[10]))); + CHECK_HIP_ERROR(hipSetDevice(std::stoi(argv[10]))); + // Parse command line arguments int32_t top = std::stoi(argv[3]); int32_t left = std::stoi(argv[4]); float r = std::stof(argv[5]); @@ -50,42 +62,29 @@ int main(int argc, char** argv) { float a = std::stof(argv[8]); eBorderType border_mode = static_cast(std::stoi(argv[9])); - cv::Mat image_data = cv::imread(argv[1]); - - // Create input/output tensors for the image. - TensorShape shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, image_data.rows, image_data.cols, image_data.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); + // Load input image + Tensor input = LoadImages(argv[1]); - TensorShape o_shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, image_data.rows + top * 2, image_data.cols + left * 2, image_data.channels()}); - - Tensor d_in(shape, dtype); - Tensor d_out(o_shape, dtype); + // Create output tensor + int64_t outputHeight = input.shape(input.layout().height_index()) + top * 2; + int64_t outputWidth = input.shape(input.layout().width_index()) + left * 2; + TensorShape outputShape(input.layout(), {input.shape(input.layout().batch_index()), outputHeight, outputWidth, + input.shape(input.layout().channels_index())}); + Tensor output(outputShape, input.dtype()); + // Create stream hipStream_t stream; - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); - - // Move image data to input tensor - size_t image_size = d_in.shape().size() * d_in.dtype().size(); - auto d_input_data = d_in.exportData(); - HIP_VALIDATE_NO_ERRORS( - hipMemcpyAsync(d_input_data.basePtr(), image_data.data, image_size, hipMemcpyHostToDevice, stream)); + CHECK_HIP_ERROR(hipStreamCreate(&stream)); + // 
Create CopyMakeBorder operator CopyMakeBorder op; - op(stream, d_in, d_out, top, left, border_mode, {b, g, r, a}); + op(stream, input, output, top, left, border_mode, {b, g, r, a}); - // Move image data back to device - auto d_out_data = d_out.exportData(); - size_t out_image_size = d_out.shape().size() * d_out.dtype().size(); - std::vector h_output(out_image_size); - HIP_VALIDATE_NO_ERRORS( - hipMemcpyAsync(h_output.data(), d_out_data.basePtr(), out_image_size, hipMemcpyDeviceToHost, stream)); + CHECK_HIP_ERROR(hipStreamSynchronize(stream)); - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); + WriteImages(output, argv[2]); - cv::Mat output_image_data(image_data.rows + top * 2, image_data.cols + left * 2, CV_8UC3, h_output.data()); - cv::imwrite(argv[2], output_image_data); + CHECK_HIP_ERROR(hipStreamSynchronize(stream)); return EXIT_SUCCESS; } \ No newline at end of file From 12ff5e933e5f7ae0c7a1b34f459c01b37a48da73 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Fri, 5 Dec 2025 15:29:02 -0500 Subject: [PATCH 12/36] Fix incorrect imageBytes calculation --- samples/common/utils.hpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/samples/common/utils.hpp b/samples/common/utils.hpp index 36aeb127..b8d42b1d 100644 --- a/samples/common/utils.hpp +++ b/samples/common/utils.hpp @@ -67,7 +67,7 @@ inline MemcpyParams GetMemcpyParams(const roccv::Tensor &tensor) { params.rowPitch = tensorData.stride(tensor.layout().height_index()); params.rowBytes = tensor.shape(tensor.layout().width_index()) * tensor.shape(tensor.layout().channels_index()) * tensor.dtype().size(); - params.imageBytes = params.rowBytes * tensor.shape(tensor.layout().height_index()); + params.imageBytes = params.rowPitch * tensor.shape(tensor.layout().height_index()); params.basePtr = tensorData.basePtr(); return params; @@ -133,6 +133,7 @@ inline roccv::Tensor LoadImages(const std::string &image_path, eDeviceType devic 
CHECK_HIP_ERROR(hipMemcpy2D(static_cast(params.basePtr) + i * params.imageBytes, params.rowPitch, images[i].data, params.rowBytes, params.rowBytes, height, kind)); } + CHECK_HIP_ERROR(hipDeviceSynchronize()); return tensor; } @@ -171,12 +172,14 @@ inline void WriteImages(const roccv::Tensor &tensor, const std::string &output_p static_cast(params.basePtr) + i * params.imageBytes, params.rowPitch, params.rowBytes, height, kind)); } + CHECK_HIP_ERROR(hipDeviceSynchronize()); - if (std::filesystem::is_directory(output_path)) { + std::filesystem::path outputPath(output_path); + if (outputPath.extension().empty()) { for (int i = 0; i < batchSize; i++) { - std::ostringstream outFilename; - outFilename << output_path << "/image_" << i << ".bmp"; - cv::imwrite(outFilename.str().c_str(), images[i]); + std::filesystem::create_directories(outputPath); + std::filesystem::path outFilename = outputPath / std::format("image_{}.bmp", i); + cv::imwrite(outFilename.string(), images[i]); } } else { cv::imwrite(output_path, images[0]); From e562ebc9f30862d958326a91c4890c07f0391baf Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Fri, 5 Dec 2025 15:36:13 -0500 Subject: [PATCH 13/36] Use HIP streams where possible --- samples/common/utils.hpp | 23 +++++++++++++---------- samples/copy_make_border.cpp | 18 +++++++++--------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/samples/common/utils.hpp b/samples/common/utils.hpp index b8d42b1d..531bd470 100644 --- a/samples/common/utils.hpp +++ b/samples/common/utils.hpp @@ -75,14 +75,15 @@ inline MemcpyParams GetMemcpyParams(const roccv::Tensor &tensor) { /** * @brief Loads an image, or multiple images if given a directory, into a tensor. Will be in NHWC layout and U8 format. - * All images must be of the same size and format. This is a blocking operation. + * All images must be of the same size and format. This is a non-blocking operation. * * @param image_path The path to the image to load. 
If a directory is provided, all supported images in the directory * will be loaded. * @param device The device to load the images onto. Defaults to GPU. * @return A NHWC tensor containing the loaded images. */ -inline roccv::Tensor LoadImages(const std::string &image_path, eDeviceType device = eDeviceType::GPU) { +inline roccv::Tensor LoadImages(hipStream_t stream, const std::string &image_path, + eDeviceType device = eDeviceType::GPU) { const std::vector supportedExtensions = {".bmp", ".jpg", ".jpeg", ".png"}; std::vector images; @@ -130,10 +131,10 @@ inline roccv::Tensor LoadImages(const std::string &image_path, eDeviceType devic // Copy images into tensor hipMemcpyKind kind = (device == eDeviceType::GPU) ? hipMemcpyHostToDevice : hipMemcpyHostToHost; for (int i = 0; i < images.size(); i++) { - CHECK_HIP_ERROR(hipMemcpy2D(static_cast(params.basePtr) + i * params.imageBytes, params.rowPitch, - images[i].data, params.rowBytes, params.rowBytes, height, kind)); + CHECK_HIP_ERROR(hipMemcpy2DAsync(static_cast(params.basePtr) + i * params.imageBytes, + params.rowPitch, images[i].data, params.rowBytes, params.rowBytes, height, + kind, stream)); } - CHECK_HIP_ERROR(hipDeviceSynchronize()); return tensor; } @@ -145,7 +146,7 @@ inline roccv::Tensor LoadImages(const std::string &image_path, eDeviceType devic * @param output_path The path to write the images to. If a directory is provided, the images will be written to the * directory. */ -inline void WriteImages(const roccv::Tensor &tensor, const std::string &output_path) { +inline void WriteImages(hipStream_t stream, const roccv::Tensor &tensor, const std::string &output_path) { if (tensor.layout() != eTensorLayout::TENSOR_LAYOUT_NHWC && tensor.layout() != eTensorLayout::TENSOR_LAYOUT_HWC) { throw std::runtime_error( "Unsupported tensor layout in WriteImages(). 
Only NHWC and HWC layouts are supported."); @@ -168,11 +169,13 @@ inline void WriteImages(const roccv::Tensor &tensor, const std::string &output_p std::vector images(batchSize); for (int i = 0; i < batchSize; i++) { images[i] = cv::Mat(height, width, cvFormat); - CHECK_HIP_ERROR(hipMemcpy2D(images[i].data, params.rowBytes, - static_cast(params.basePtr) + i * params.imageBytes, params.rowPitch, - params.rowBytes, height, kind)); + CHECK_HIP_ERROR(hipMemcpy2DAsync(images[i].data, params.rowBytes, + static_cast(params.basePtr) + i * params.imageBytes, + params.rowPitch, params.rowBytes, height, kind, stream)); } - CHECK_HIP_ERROR(hipDeviceSynchronize()); + + // Ensure all memory operations are completed before writing images + CHECK_HIP_ERROR(hipStreamSynchronize(stream)); std::filesystem::path outputPath(output_path); if (outputPath.extension().empty()) { diff --git a/samples/copy_make_border.cpp b/samples/copy_make_border.cpp index bf515dc7..ace161d4 100644 --- a/samples/copy_make_border.cpp +++ b/samples/copy_make_border.cpp @@ -62,8 +62,12 @@ int main(int argc, char** argv) { float a = std::stof(argv[8]); eBorderType border_mode = static_cast(std::stoi(argv[9])); + // Create stream + hipStream_t stream; + CHECK_HIP_ERROR(hipStreamCreate(&stream)); + // Load input image - Tensor input = LoadImages(argv[1]); + Tensor input = LoadImages(stream, argv[1]); // Create output tensor int64_t outputHeight = input.shape(input.layout().height_index()) + top * 2; @@ -72,19 +76,15 @@ int main(int argc, char** argv) { input.shape(input.layout().channels_index())}); Tensor output(outputShape, input.dtype()); - // Create stream - hipStream_t stream; - CHECK_HIP_ERROR(hipStreamCreate(&stream)); - // Create CopyMakeBorder operator CopyMakeBorder op; op(stream, input, output, top, left, border_mode, {b, g, r, a}); - CHECK_HIP_ERROR(hipStreamSynchronize(stream)); - - WriteImages(output, argv[2]); + // Synchronize not required, WriteImages will block on the given stream + 
WriteImages(stream, output, argv[2]); - CHECK_HIP_ERROR(hipStreamSynchronize(stream)); + // Destroy stream + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file From cae2e8ef1db930acc8599371b1e878f2bc84bd43 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 8 Dec 2025 10:59:40 -0500 Subject: [PATCH 14/36] Fix CopyMakeBorder and GammaContrast samples --- samples/common/utils.hpp | 5 +- samples/copy_make_border.cpp | 122 +++++++++++++++++++++++++++-------- samples/gamma_contrast.cpp | 121 ++++++++++++++++++---------------- 3 files changed, 165 insertions(+), 83 deletions(-) diff --git a/samples/common/utils.hpp b/samples/common/utils.hpp index 531bd470..f8f0c5a0 100644 --- a/samples/common/utils.hpp +++ b/samples/common/utils.hpp @@ -75,7 +75,7 @@ inline MemcpyParams GetMemcpyParams(const roccv::Tensor &tensor) { /** * @brief Loads an image, or multiple images if given a directory, into a tensor. Will be in NHWC layout and U8 format. - * All images must be of the same size and format. This is a non-blocking operation. + * All images must be of the same size and format. This operation will block on the provided stream. * * @param image_path The path to the image to load. If a directory is provided, all supported images in the directory * will be loaded. @@ -136,6 +136,9 @@ inline roccv::Tensor LoadImages(hipStream_t stream, const std::string &image_pat kind, stream)); } + // Ensure all memory operations are completed before returning the tensor + CHECK_HIP_ERROR(hipStreamSynchronize(stream)); + return tensor; } diff --git a/samples/copy_make_border.cpp b/samples/copy_make_border.cpp index ace161d4..cb695dfe 100644 --- a/samples/copy_make_border.cpp +++ b/samples/copy_make_border.cpp @@ -21,6 +21,7 @@ THE SOFTWARE. */ #include +#include #include #include @@ -31,59 +32,124 @@ THE SOFTWARE. 
using namespace roccv; +struct Config { + std::string inputPath; + std::string outputPath = "output"; + int32_t top = 10; + int32_t left = 10; + float r = 0.0f, g = 0.0f, b = 0.0f, a = 255.0f; + eBorderType borderMode = eBorderType::BORDER_TYPE_CONSTANT; + int deviceId = 0; +}; + +void PrintUsage(const char* programName) { + std::cout << "Usage: " << programName << " -i [options]\n\n" + << "Options:\n" + << " -i, --input Input image or directory (required)\n" + << " -o, --output Output file or directory (default: output/image_.bmp)\n" + << " -t, --top Top/bottom border size (default: 10)\n" + << " -l, --left Left/right border size (default: 10)\n" + << " -c, --color Border color as r,g,b,a (default: 0,0,0,255)\n" + << " -m, --mode Border mode: 0=constant, 1=replicate, 2=reflect (default: constant)\n" + << " -d, --device GPU device ID (default: 0)\n" + << " -h, --help Show this help message\n\n" + << "Example:\n" + << " " << programName << " -i image.jpg -o bordered.png -t 20 -l 15 -c 255,0,0,255\n"; +} + +bool ParseColor(const std::string& colorStr, float& r, float& g, float& b, float& a) { + return sscanf(colorStr.c_str(), "%f,%f,%f,%f", &r, &g, &b, &a) == 4; +} + /** * @brief Copy make border operation example. * - * This sample demonstrates the usage of the CopyMakeBorder operator. It loads an image from the specified input path, - * creates a border around the image based on the specified border mode and border value, and writes the output image to - * the specified output path. - * - * Usage: - * ./copy_make_border + * This sample demonstrates the usage of the CopyMakeBorder operator. It accepts either a single image file or a + * directory of images as the input path, and either a single file or a directory as the output path. If a directory is + * provided for the input, all supported images within the directory will be processed as a batch. Similarly, if the + * output path is a directory, all resulting images will be written to that directory. 
For each image, a border is + * created based on the specified border mode and border value. * + * If is a directory, a batch operation will occur on every image in the directory. + * If is a directory, the output images will be written into that directory. */ int main(int argc, char** argv) { - // Validate command line arguments - if (argc != 11) { - std::cerr << "Usage: " << argv[0] - << " " - << std::endl; - return EXIT_FAILURE; + Config config; + + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"top", required_argument, nullptr, 't'}, + {"left", required_argument, nullptr, 'l'}, + {"color", required_argument, nullptr, 'c'}, + {"mode", required_argument, nullptr, 'm'}, + {"device", required_argument, nullptr, 'd'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + + int opt; + while ((opt = getopt_long(argc, argv, "i:o:t:l:c:m:d:h", longOptions, nullptr)) != -1) { + switch (opt) { + case 'i': + config.inputPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 't': + config.top = std::stoi(optarg); + break; + case 'l': + config.left = std::stoi(optarg); + break; + case 'c': + if (!ParseColor(optarg, config.r, config.g, config.b, config.a)) { + std::cerr << "Invalid color format. 
Use: r,g,b,a (e.g., 255,0,0,255)\n"; + return EXIT_FAILURE; + } + break; + case 'm': + config.borderMode = static_cast(std::stoi(optarg)); + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; + } } - CHECK_HIP_ERROR(hipSetDevice(std::stoi(argv[10]))); + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); + return EXIT_FAILURE; + } - // Parse command line arguments - int32_t top = std::stoi(argv[3]); - int32_t left = std::stoi(argv[4]); - float r = std::stof(argv[5]); - float g = std::stof(argv[6]); - float b = std::stof(argv[7]); - float a = std::stof(argv[8]); - eBorderType border_mode = static_cast(std::stoi(argv[9])); + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); // Create stream hipStream_t stream; CHECK_HIP_ERROR(hipStreamCreate(&stream)); // Load input image - Tensor input = LoadImages(stream, argv[1]); + Tensor input = LoadImages(stream, config.inputPath.c_str()); // Create output tensor - int64_t outputHeight = input.shape(input.layout().height_index()) + top * 2; - int64_t outputWidth = input.shape(input.layout().width_index()) + left * 2; + int64_t outputHeight = input.shape(input.layout().height_index()) + config.top * 2; + int64_t outputWidth = input.shape(input.layout().width_index()) + config.left * 2; TensorShape outputShape(input.layout(), {input.shape(input.layout().batch_index()), outputHeight, outputWidth, input.shape(input.layout().channels_index())}); Tensor output(outputShape, input.dtype()); // Create CopyMakeBorder operator CopyMakeBorder op; - op(stream, input, output, top, left, border_mode, {b, g, r, a}); + op(stream, input, output, config.top, config.left, config.borderMode, {config.b, config.g, config.r, config.a}); - // Synchronize not required, WriteImages will block on the given stream - WriteImages(stream, output, argv[2]); + WriteImages(stream, 
output, config.outputPath); - // Destroy stream CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; diff --git a/samples/gamma_contrast.cpp b/samples/gamma_contrast.cpp index 81fe771f..62a83f28 100644 --- a/samples/gamma_contrast.cpp +++ b/samples/gamma_contrast.cpp @@ -21,78 +21,91 @@ THE SOFTWARE. */ #include +#include + #include #include #include #include +#include "common/utils.hpp" using namespace roccv; +struct Config { + float gamma = 2.2; + std::string inputPath; + std::string outputPath = "output"; + int deviceId = 0; +}; + +void PrintUsage(const char* programName) { + // clang-format off + std::cerr << "Usage: " << programName << " -i [-o ] [-g ] [-d ]" << std::endl; + std::cerr << " -i, --input Input image or directory containing images (required)" << std::endl; + std::cerr << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; + std::cerr << " -g, --gamma Gamma value to apply to the input images (optional, default: 2.2)" << std::endl; + std::cerr << " -d, --device Device ID to use for execution (optional, default: 0)" << std::endl; + // clang-format on +} + /** * @brief Gamma contrast operator sample app. 
*/ int main(int argc, char** argv) { - if (argc != 3) { - std::cerr << "Usage: " << argv[0] - << " " - << std::endl; + Config config; + + static struct option longOptions[] = { + {"input", required_argument, nullptr, 'i'}, {"output", required_argument, nullptr, 'o'}, + {"gamma", required_argument, nullptr, 'g'}, {"device", required_argument, nullptr, 'd'}, + {"help", no_argument, nullptr, 'h'}, {nullptr, 0, nullptr, 0}}; + + int opt; + while ((opt = getopt_long(argc, argv, "i:o:g:d:h", longOptions, nullptr)) != -1) { + switch (opt) { + case 'i': + config.inputPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 'g': + config.gamma = std::stof(optarg); + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; + } + } + + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); return EXIT_FAILURE; } - // Device to use in this sample will be the GPU - eDeviceType device = eDeviceType::GPU; - - // Load input image using the OpenCV library. - // The Mat image_data will store all of the data of the image - // Image width can be gotten with image_data.cols - // Image height can be gotten with image_data.rows - // The amount of channels can be gotten with image_data.channels() - cv::Mat image_data = cv::imread(argv[1]); - - // Batch size is needed to create the input and output tensors - int batchSize = 1; - - // A gamma value to apply to the input image - float gammaValue = 2.2; - - // Create input/output tensors - // Tensor shape - // - Takes layout as input, in this case NHWC (N - batch size, H - image height, W - image width, C - number of channels) - // - Also takes the datatype, in this case U8 or an unsigned integer of 8 bits. 
- TensorShape shape( - TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {batchSize, image_data.rows, image_data.cols, image_data.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); - - Tensor input(shape, dtype, device); - Tensor output(shape, dtype, device); + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); hipStream_t stream; - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); - - // imageSize is needed to know how much data needs to be copied to the GPU - size_t imageSize = image_data.rows * image_data.cols * image_data.channels() * sizeof(uint8_t); - - auto input_data = input.exportData(); - HIP_VALIDATE_NO_ERRORS( - hipMemcpy(static_cast(input_data.basePtr()), image_data.data, imageSize, hipMemcpyHostToDevice)); - - // Apply gamma correction - GammaContrast gamma_contrast; - gamma_contrast(stream, input, output, gammaValue, device); + CHECK_HIP_ERROR(hipStreamCreate(&stream)); + + // Load input images + Tensor input = LoadImages(stream, config.inputPath, eDeviceType::GPU); - // Move output data back to host - auto output_data = output.exportData(); - std::vector h_output(imageSize); - HIP_VALIDATE_NO_ERRORS(hipMemcpy(h_output.data(), output_data.basePtr(), imageSize, hipMemcpyDeviceToHost)); - - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); + // Create output tensor + Tensor output(input.shape(), input.dtype(), eDeviceType::GPU); - // Save the gamma-corrected image - cv::Mat output_image(image_data.rows, image_data.cols, CV_8UC3, h_output.data()); - cv::imwrite(argv[2], output_image); + // Apply gamma correction + GammaContrast gamma_contrast; + gamma_contrast(stream, input, output, config.gamma, eDeviceType::GPU); - std::cout << "Gamma correction applied successfully. 
Output saved to: " << argv[2] << std::endl; - + // Write output images to disk + WriteImages(stream, output, config.outputPath); + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } From 6c4221258b10bfff68149f2f31cad74f169d54d6 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 8 Dec 2025 11:21:45 -0500 Subject: [PATCH 15/36] Fix WarpPerspective sample --- samples/copy_make_border.cpp | 4 +- samples/warp_perspective.cpp | 103 ++++++++++++++++++++++++----------- 2 files changed, 74 insertions(+), 33 deletions(-) diff --git a/samples/copy_make_border.cpp b/samples/copy_make_border.cpp index cb695dfe..0dbb48f0 100644 --- a/samples/copy_make_border.cpp +++ b/samples/copy_make_border.cpp @@ -43,6 +43,7 @@ struct Config { }; void PrintUsage(const char* programName) { + // clang-format off std::cout << "Usage: " << programName << " -i [options]\n\n" << "Options:\n" << " -i, --input Input image or directory (required)\n" @@ -50,11 +51,12 @@ void PrintUsage(const char* programName) { << " -t, --top Top/bottom border size (default: 10)\n" << " -l, --left Left/right border size (default: 10)\n" << " -c, --color Border color as r,g,b,a (default: 0,0,0,255)\n" - << " -m, --mode Border mode: 0=constant, 1=replicate, 2=reflect (default: constant)\n" + << " -m, --mode Border mode: 0=CONSTANT, 1=REPLICATE, 2=REFLECT, 3=REFLECT101, 4=WRAP (default: CONSTANT)\n" << " -d, --device GPU device ID (default: 0)\n" << " -h, --help Show this help message\n\n" << "Example:\n" << " " << programName << " -i image.jpg -o bordered.png -t 20 -l 15 -c 255,0,0,255\n"; + // clang-format on } bool ParseColor(const std::string& colorStr, float& r, float& g, float& b, float& a) { diff --git a/samples/warp_perspective.cpp b/samples/warp_perspective.cpp index dc816e11..18d18f3a 100644 --- a/samples/warp_perspective.cpp +++ b/samples/warp_perspective.cpp @@ -21,62 +21,101 @@ THE SOFTWARE. 
*/ #include +#include #include #include #include #include +#include "common/utils.hpp" + using namespace roccv; +struct Config { + std::string inputPath; + std::string outputPath = "output"; + eInterpolationType interpolation = eInterpolationType::INTERP_TYPE_LINEAR; + eBorderType borderMode = eBorderType::BORDER_TYPE_CONSTANT; + int deviceId = 0; +}; + +void PrintUsage(const char* programName) { + // clang-format off + std::cerr << "Usage: " << programName << " -i [-o ] [-p ] [-b ] [-d ]" << std::endl; + std::cerr << " -i, --input Input image or directory containing images (required)" << std::endl; + std::cerr << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; + std::cerr << " -p, --interpolation Interpolation type to use for output images [0: NEAREST, 1: LINEAR, 2: CUBIC] (optional, default: 1 (LINEAR))" << std::endl; + std::cerr << " -b, --border Border type for output images [0: CONSTANT, 1: REPLICATE, 2: REFLECT, 3: REFLECT101, 4: WRAP] (optional, default: 0 (CONSTANT))" << std::endl; + std::cerr << " -d, --device Device ID to use for execution (optional, default: 0)" << std::endl; + // clang-format on +} + /** * @brief Warp perspective operation example. 
*/ int main(int argc, char** argv) { - if (argc != 6) { - std::cerr << "Usage: " << argv[0] << " " - << std::endl; - return EXIT_FAILURE; + Config config; + + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"interpolation", required_argument, nullptr, 'p'}, + {"border", required_argument, nullptr, 'b'}, + {"device", required_argument, nullptr, 'd'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + + // Parse command line arguments + int opt; + while ((opt = getopt_long(argc, argv, "i:o:p:b:d:h", longOptions, nullptr)) != -1) { + switch (opt) { + case 'i': + config.inputPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 'p': + config.interpolation = static_cast(std::stoi(optarg)); + break; + case 'b': + config.borderMode = static_cast(std::stoi(optarg)); + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; + } } - HIP_VALIDATE_NO_ERRORS(hipSetDevice(std::stoi(argv[5]))); - eInterpolationType interp = static_cast(std::stoi(argv[3])); - eBorderType border_mode = static_cast(std::stoi(argv[4])); - cv::Mat image_data = cv::imread(argv[1]); - - // Create input/output tensors for the image. 
- TensorShape shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, image_data.rows, image_data.cols, image_data.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); + return EXIT_FAILURE; + } - Tensor d_in(shape, dtype); - Tensor d_out(shape, dtype); + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); hipStream_t stream; - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); + CHECK_HIP_ERROR(hipStreamCreate(&stream)); - // Move image data to input tensor - size_t image_size = d_in.shape().size() * d_in.dtype().size(); - auto d_input_data = d_in.exportData(); - HIP_VALIDATE_NO_ERRORS( - hipMemcpyAsync(d_input_data.basePtr(), image_data.data, image_size, hipMemcpyHostToDevice, stream)); + // Create input/output tensors for the image. + Tensor input = LoadImages(stream, config.inputPath, eDeviceType::GPU); + Tensor output(input.shape(), input.dtype(), eDeviceType::GPU); PerspectiveTransform transform_matrix = {1, 0, 0, 0, 1, 0, -0.001, 0, 1}; roccv::WarpPerspective op; - op(stream, d_in, d_out, transform_matrix, true, interp, border_mode, make_float4(0, 0, 0, 0)); + op(stream, input, output, transform_matrix, true, config.interpolation, config.borderMode, make_float4(0, 0, 0, 0)); // Move image data back to device - auto d_out_data = d_out.exportData(); - std::vector h_output(image_size); - HIP_VALIDATE_NO_ERRORS( - hipMemcpyAsync(h_output.data(), d_out_data.basePtr(), image_size, hipMemcpyDeviceToHost, stream)); - - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); - - // Write normalized image to disk - cv::Mat output_image_data(image_data.rows, image_data.cols, CV_8UC3, h_output.data()); - cv::imwrite(argv[2], output_image_data); + WriteImages(stream, output, config.outputPath); + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file From f7f383954eb27eff4fc42a4b6c2b2b819b273c66 Mon Sep 17 00:00:00 2001 
From: Zach Vincze Date: Mon, 8 Dec 2025 12:02:44 -0500 Subject: [PATCH 16/36] Fix CustomCrop sample --- samples/custom_crop.cpp | 204 ++++++++++++++++------------------- samples/warp_perspective.cpp | 8 +- 2 files changed, 95 insertions(+), 117 deletions(-) diff --git a/samples/custom_crop.cpp b/samples/custom_crop.cpp index 73bd8561..25e7d83a 100644 --- a/samples/custom_crop.cpp +++ b/samples/custom_crop.cpp @@ -21,144 +21,122 @@ THE SOFTWARE. */ #include +#include + #include #include #include #include +#include "common/utils.hpp" + using namespace roccv; -/** - * @brief Custom crop operation example. - */ +struct Config { + std::string inputPath; + std::string outputPath = "output"; + Box_t cropRect = {0, 0, 1, 1}; + eDeviceType device = eDeviceType::GPU; + int deviceId = 0; +}; + +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "Usage: " << programName << " -i [-o ] [-crop ] [-d ] [-c]" << std::endl; + std::cout << " -i, --input Input image or directory containing images (required)" << std::endl; + std::cout << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; + std::cout << " -c, --crop Crop rectangle as comma separated values (optional, default: 0,0,1,1)" << std::endl; + std::cout << " -d, --device Device ID to use for execution when using GPU (optional, default: 0)" << std::endl; + std::cout << " -C, --cpu Use CPU for execution (optional, default: GPU)" << std::endl; + std::cout << " -h, --help Show this help message" << std::endl; + // clang-format on +} -void ShowHelpAndExit(const char *option = NULL) { - std::cout << "Options: " << option << std::endl - << "-i Input File Path - required" << std::endl - << "-o Output File Path - optional; default: output.bmp" << std::endl - << "-cpu Select CPU instead of GPU to perform operation - optional; default choice is GPU path" << std::endl - << "-d GPU device ID (0 for the first device, 1 for the second, etc.) 
- optional; default: 0" << std::endl - << "-crop Crop rectangle (top_left_corner_x, top_left_corner_y, width, height)- optional; default: use the set value in the app" << std::endl; - exit(0); +bool ParseCropRectangle(const std::string& cropStr, Box_t& cropRect) { + std::istringstream iss(cropStr); + std::string token; + getline(iss, token, ','); + cropRect.x = std::stoi(token); + getline(iss, token, ','); + cropRect.y = std::stoi(token); + getline(iss, token, ','); + cropRect.width = std::stoi(token); + getline(iss, token, ','); + cropRect.height = std::stoi(token); + return true; } +/** + * @brief Custom crop operation example. + */ int main(int argc, char** argv) { - std::string input_file_path; - std::string output_file_path = "output.bmp"; - bool gpuPath = true; // use GPU by default - eDeviceType device = eDeviceType::GPU; - int deviceId = 0; - Box_t cropRect = {0, 0, 1, 1}; - bool cropSet = false; - - if(argc < 3) { - ShowHelpAndExit("-h"); - } - for (int i = 1; i < argc; i++) { - if (!strcmp(argv[i], "-h")) { - ShowHelpAndExit("-h"); - } - if (!strcmp(argv[i], "-i")) { - if (++i == argc) { - ShowHelpAndExit("-i"); - } - input_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-o")) { - if (++i == argc) { - ShowHelpAndExit("-o"); - } - output_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-crop")) { - i++; - if (i + 4 > argc) { - ShowHelpAndExit("-crop"); - } - cropRect.x = atoi(argv[i++]); - cropRect.y = atoi(argv[i++]); - cropRect.width = atoi(argv[i++]); - cropRect.height = atoi(argv[i]); - cropSet = true; - continue; - } - if (!strcmp(argv[i], "-cpu")) { - gpuPath = false; - continue; + Config config; + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"crop", required_argument, nullptr, 'c'}, + {"device", required_argument, nullptr, 'd'}, + {"cpu", no_argument, nullptr, 'C'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + int 
opt; + while ((opt = getopt_long(argc, argv, "i:o:c:d:h:C", longOptions, nullptr)) != -1) { + switch (opt) { + case 'i': + config.inputPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 'c': + if (!ParseCropRectangle(optarg, config.cropRect)) { + std::cerr << "Invalid crop rectangle format. Use: x,y,w,h (e.g., 0,0,1,1)\n"; + return EXIT_FAILURE; + } + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'C': + config.device = eDeviceType::CPU; + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; } } - if (gpuPath) { - device = eDeviceType::GPU; - HIP_VALIDATE_NO_ERRORS(hipSetDevice(deviceId)); - } else { - device = eDeviceType::CPU; + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); + return EXIT_FAILURE; } - cv::Mat imageData = cv::imread(input_file_path); - if (imageData.empty()) { - std::cerr << "Failed to read the input image file" << std::endl; - exit(1); - } - if (!cropSet) { - // Set a safe crop area if no user input - cropRect = {(imageData.cols / 4), (imageData.rows / 4), (imageData.cols / 2), (imageData.rows / 2)}; - } + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); - // Create input/output tensors for the image. 
- TensorShape inputShape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), {1, imageData.rows, imageData.cols, imageData.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); - Tensor input(inputShape, dtype, device); + hipStream_t stream; + CHECK_HIP_ERROR(hipStreamCreate(&stream)); - TensorShape outShape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), {1, cropRect.height, cropRect.width, imageData.channels()}); - Tensor output(outShape, dtype, device); + // Load input images + Tensor input = LoadImages(stream, config.inputPath, config.device); - hipStream_t stream = nullptr; - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); - } + int64_t batchSize = input.shape(input.layout().batch_index()); + int64_t channels = input.shape(input.layout().channels_index()); - // Move image data to input tensor - size_t imageSizeInByte = input.shape().size() * input.dtype().size(); - auto input_data = input.exportData(); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(input_data.basePtr(), imageData.data, imageSizeInByte, hipMemcpyHostToDevice, stream)); - } else { - memcpy(input_data.basePtr(), imageData.data, imageSizeInByte); - } + // Create output tensor + Tensor output(TensorShape(input.layout(), {batchSize, config.cropRect.height, config.cropRect.width, channels}), + input.dtype(), config.device); + // Run custom crop operation CustomCrop op; - op(stream, input, output, cropRect, device); - - // Move image data back to host - size_t outputSize = output.shape().size() * output.dtype().size(); - auto outData = output.exportData(); - std::vector h_output(outputSize); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(h_output.data(), outData.basePtr(), outputSize, hipMemcpyDeviceToHost, stream)); - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); - } else { - memcpy(h_output.data(), outData.basePtr(), outputSize); - } + op(stream, input, output, config.cropRect, config.device); // Write output image to disk - cv::Mat 
output_imageData(cropRect.height, cropRect.width, imageData.type(), h_output.data()); - bool ret = cv::imwrite(output_file_path, output_imageData); - if (!ret) { - std::cerr << "Faild to save output image to the file" << std::endl; - } + WriteImages(stream, output, config.outputPath); - std::cout << "Input image file: " << input_file_path << std::endl; - std::cout << "Output image file: " << output_file_path << std::endl; - if (gpuPath) { - std::cout << "Operation on GPU device " << deviceId << std::endl; - } else { - std::cout << "Operation on CPU" << std::endl; - } - std::cout << "Input image size: width = " << imageData.cols << ", height = " << imageData.rows << std::endl; - std::cout << "Cropping area: top-left corner = (" << cropRect.x << ", " << cropRect.y << "), width = " << cropRect.width << ", height = " << cropRect.height << std::endl; + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file diff --git a/samples/warp_perspective.cpp b/samples/warp_perspective.cpp index 18d18f3a..49039d62 100644 --- a/samples/warp_perspective.cpp +++ b/samples/warp_perspective.cpp @@ -45,7 +45,7 @@ void PrintUsage(const char* programName) { std::cerr << "Usage: " << programName << " -i [-o ] [-p ] [-b ] [-d ]" << std::endl; std::cerr << " -i, --input Input image or directory containing images (required)" << std::endl; std::cerr << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; - std::cerr << " -p, --interpolation Interpolation type to use for output images [0: NEAREST, 1: LINEAR, 2: CUBIC] (optional, default: 1 (LINEAR))" << std::endl; + std::cerr << " -I, --interpolation Interpolation type to use for output images [0: NEAREST, 1: LINEAR, 2: CUBIC] (optional, default: 1 (LINEAR))" << std::endl; std::cerr << " -b, --border Border type for output images [0: CONSTANT, 1: REPLICATE, 2: REFLECT, 3: REFLECT101, 4: WRAP] (optional, default: 0 (CONSTANT))" << std::endl; std::cerr << " 
-d, --device Device ID to use for execution (optional, default: 0)" << std::endl; // clang-format on @@ -59,7 +59,7 @@ int main(int argc, char** argv) { static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, {"output", required_argument, nullptr, 'o'}, - {"interpolation", required_argument, nullptr, 'p'}, + {"interpolation", required_argument, nullptr, 'I'}, {"border", required_argument, nullptr, 'b'}, {"device", required_argument, nullptr, 'd'}, {"help", no_argument, nullptr, 'h'}, @@ -67,7 +67,7 @@ int main(int argc, char** argv) { // Parse command line arguments int opt; - while ((opt = getopt_long(argc, argv, "i:o:p:b:d:h", longOptions, nullptr)) != -1) { + while ((opt = getopt_long(argc, argv, "i:o:I:b:d:h", longOptions, nullptr)) != -1) { switch (opt) { case 'i': config.inputPath = optarg; @@ -75,7 +75,7 @@ int main(int argc, char** argv) { case 'o': config.outputPath = optarg; break; - case 'p': + case 'I': config.interpolation = static_cast(std::stoi(optarg)); break; case 'b': From 1063baa6bdb7261ddf80a3e7d0a8a1b5bbd030ea Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 8 Dec 2025 12:26:56 -0500 Subject: [PATCH 17/36] Fix BilateralFilter sample --- samples/bilateral_filter.cpp | 249 +++++++++++++++-------------------- 1 file changed, 108 insertions(+), 141 deletions(-) diff --git a/samples/bilateral_filter.cpp b/samples/bilateral_filter.cpp index 4b7d3864..04364f5f 100644 --- a/samples/bilateral_filter.cpp +++ b/samples/bilateral_filter.cpp @@ -21,172 +21,139 @@ THE SOFTWARE. 
*/ #include +#include + #include #include -#include #include #include +#include "common/utils.hpp" + using namespace roccv; +struct Config { + std::string inputPath; + std::string outputPath = "output"; + int deviceId = 0; + int diameter = 2; + float sigmaSpace = 2.0f; + float sigmaColor = 10.0f; + eBorderType borderMode = eBorderType::BORDER_TYPE_REPLICATE; + float4 borderColor = {0.0f, 0.0f, 0.0f, 0.0f}; + eDeviceType device = eDeviceType::GPU; +}; + /** * @brief Bilateral filter operation example. */ - void ShowHelpAndExit(const char *option = NULL) { - std::cout << "Options: " << option << std::endl - << "-i Input File Path - required" << std::endl - << "-o Output File Path - optional; default: output.bmp" << std::endl - << "-cpu Select CPU instead of GPU to perform operation - optional; default choice is GPU path" << std::endl - << "-d GPU device ID (0 for the first device, 1 for the second, etc.) - optional; default: 0" << std::endl - << "-diameter Diameter of the filtering area - optional; default: 2" << std::endl - << "-sigma_space Spatial parameter sigma of the Gaussian function - optional; default: 2.0f" << std::endl - << "-sigma_color Range parameter sigma of the Gaussian function - optional; default: 10.0f" << std::endl - << "-border_mode Border mode at image boundary when work pixels are outside of the image (0: constant color; 1: replicate; 2: reflect; 3: wrap) - optional; default: 1 (replicate)" << std::endl - << "-border_color Border color for constant color border mode - optional; default: (0, 0, 0, 0)" << std::endl; - exit(0); +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "Usage: " << programName << " -i [-o ] [-d ] [-D ] [-s ] [-c ] [-b ] [-B ]" << std::endl; + std::cout << " -i, --input Input image or directory containing images (required)" << std::endl; + std::cout << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; + std::cout << " -d, --device Device ID 
to use for execution (optional, default: 0)" << std::endl; + std::cout << " -D, --diameter Diameter of the filtering area (optional, default: 2)" << std::endl; + std::cout << " -s, --sigma_space Spatial parameter sigma of the Gaussian function (optional, default: 2.0f)" << std::endl; + std::cout << " -c, --sigma_color Range parameter sigma of the Gaussian function (optional, default: 10.0f)" << std::endl; + std::cout << " -B, --border_mode Border mode at image boundary when work pixels are outside of the image (optional, default: 1 (replicate))" << std::endl; + std::cout << " -B, --border_color Border color for constant color border mode (optional, default: 0,0,0,0)" << std::endl; + std::cout << " -C, --cpu Use CPU for execution (optional, default: GPU)" << std::endl; + std::cout << " -h, --help Show this help message" << std::endl; + // clang-format on } -int main(int argc, char** argv) { - std::string input_file_path; - std::string output_file_path = "output.bmp"; - bool gpuPath = true; // use GPU by default - eDeviceType device = eDeviceType::GPU; - int deviceId = 0; - int diameter = 2; - float sigmaSpace = 2.0f; - float sigmaColor = 10.0f; - eBorderType borderMode = BORDER_TYPE_REPLICATE; - float4 borderColor = {0.0f, 0.0f, 0.0f, 0.0f}; +bool ParseBorderColor(const std::string& borderColorStr, float4& borderColor) { + return sscanf(borderColorStr.c_str(), "%f,%f,%f,%f", &borderColor.x, &borderColor.y, &borderColor.z, + &borderColor.w) == 4; +} - if(argc < 3) { - ShowHelpAndExit("-h"); - } - for (int i = 1; i < argc; i++) { - if (!strcmp(argv[i], "-h")) { - ShowHelpAndExit("-h"); - } - if (!strcmp(argv[i], "-i")) { - if (++i == argc) { - ShowHelpAndExit("-i"); - } - input_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-o")) { - if (++i == argc) { - ShowHelpAndExit("-o"); - } - output_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-diameter")) { - if (++i == argc) { - ShowHelpAndExit("-diameter"); - } - diameter = std::atoi(argv[i]); - 
continue; - } - if (!strcmp(argv[i], "-sigma_space")) { - if (++i == argc) { - ShowHelpAndExit("-sigma_space"); - } - sigmaSpace = std::atof(argv[i]); - continue; - } - if (!strcmp(argv[i], "-sigma_color")) { - if (++i == argc) { - ShowHelpAndExit("-sigma_color"); - } - sigmaColor = std::atof(argv[i]); - continue; - } - if (!strcmp(argv[i], "-border_mode")) { - if (++i == argc) { - ShowHelpAndExit("-border_mode"); - } - borderMode = static_cast(std::atoi(argv[i])); - continue; - } - if (!strcmp(argv[i], "-border_color")) { - i++; - if (i + 4 > argc) { - ShowHelpAndExit("-border_color"); - } - borderColor.x = static_cast(atoi(argv[i++])); - borderColor.y = static_cast(atoi(argv[i++])); - borderColor.z = static_cast(atoi(argv[i++])); - borderColor.w = static_cast(atoi(argv[i])); - continue; - } - if (!strcmp(argv[i], "-cpu")) { - gpuPath = false; - continue; +int main(int argc, char** argv) { + Config config; + + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"device", required_argument, nullptr, 'd'}, + {"diameter", required_argument, nullptr, 'D'}, + {"sigma_space", required_argument, nullptr, 's'}, + {"sigma_color", required_argument, nullptr, 'c'}, + {"border_mode", required_argument, nullptr, 'b'}, + {"border_color", required_argument, nullptr, 'B'}, + {"cpu", no_argument, nullptr, 'C'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + + int opt; + while ((opt = getopt_long(argc, argv, "i:o:d:D:s:c:b:B:h:C", longOptions, nullptr)) != -1) { + switch (opt) { + case 'i': + config.inputPath = optarg; + break; + case 'o': + config.outputPath = optarg; + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'D': + config.diameter = std::stoi(optarg); + break; + case 's': + config.sigmaSpace = std::stof(optarg); + break; + case 'c': + config.sigmaColor = std::stof(optarg); + break; + case 'b': + config.borderMode = 
static_cast(std::stoi(optarg)); + break; + case 'B': + if (!ParseBorderColor(optarg, config.borderColor)) { + std::cerr << "Invalid border color format. Use: r,g,b,a (e.g., 0,0,0,0)\n"; + return EXIT_FAILURE; + } + break; + case 'C': + config.device = eDeviceType::CPU; + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; } } - if (gpuPath) { - device = eDeviceType::GPU; - HIP_VALIDATE_NO_ERRORS(hipSetDevice(deviceId)); - } else { - device = eDeviceType::CPU; - } - hipStream_t stream = nullptr; - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); + return EXIT_FAILURE; } - cv::Mat imageData = cv::imread(input_file_path); - if (imageData.empty()) { - std::cerr << "Failed to read the input image file" << std::endl; - exit(1); + if (config.device == eDeviceType::GPU) { + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); } - // Create input/output tensors for the image. 
- TensorShape imageShape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), {1, imageData.rows, imageData.cols, imageData.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); - Tensor input(imageShape, dtype, device); - Tensor output(imageShape, dtype, device); - - // Move image data to input tensor - size_t imageSizeInByte = input.shape().size() * input.dtype().size(); - auto inputData = input.exportData(); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(inputData.basePtr(), imageData.data, imageSizeInByte, hipMemcpyHostToDevice, stream)); - } else { - memcpy(inputData.basePtr(), imageData.data, imageSizeInByte); - } + // Create stream + hipStream_t stream; + CHECK_HIP_ERROR(hipStreamCreate(&stream)); + + // Load input image + Tensor input = LoadImages(stream, config.inputPath.c_str(), config.device); + // Create output tensor + Tensor output(input.shape(), input.dtype(), config.device); + + // Create BilateralFilter operator BilateralFilter op; - op(stream, input, output, diameter, sigmaColor, sigmaSpace, borderMode, borderColor, device); - - // Move image data back to host - size_t outputSize = output.shape().size() * output.dtype().size(); - auto outData = output.exportData(); - std::vector h_output(outputSize); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(h_output.data(), outData.basePtr(), outputSize, hipMemcpyDeviceToHost, stream)); - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); - } else { - memcpy(h_output.data(), outData.basePtr(), outputSize); - } + op(stream, input, output, config.diameter, config.sigmaColor, config.sigmaSpace, config.borderMode, + config.borderColor, config.device); - // Write output image to disk - cv::Mat outImageData(imageData.rows, imageData.cols, imageData.type(), h_output.data()); - bool ret = cv::imwrite(output_file_path, outImageData); - if (!ret) { - std::cerr << "Faild to save output image to the file" << std::endl; - exit(1); - } + WriteImages(stream, output, config.outputPath); - std::cout << 
"Input image file: " << input_file_path << std::endl; - std::cout << "Output image file: " << output_file_path << std::endl; - if (gpuPath) { - std::cout << "Operation on GPU device " << deviceId << std::endl; - } else { - std::cout << "Operation on CPU" << std::endl; - } - std::cout << "Image size: width = " << imageData.cols << ", height = " << imageData.rows << std::endl; + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file From 612cfb6d04920e41fe3811b3f9cef19f9e6d86ee Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 8 Dec 2025 12:32:14 -0500 Subject: [PATCH 18/36] Fix CenterCrop sample --- samples/center_crop.cpp | 197 ++++++++++++++++++---------------------- 1 file changed, 87 insertions(+), 110 deletions(-) diff --git a/samples/center_crop.cpp b/samples/center_crop.cpp index 123e8f26..7a460773 100644 --- a/samples/center_crop.cpp +++ b/samples/center_crop.cpp @@ -21,143 +21,120 @@ THE SOFTWARE. */ #include +#include + #include #include #include #include +#include "common/utils.hpp" + using namespace roccv; +struct Config { + std::string inputPath; + std::string outputPath = "output"; + eDeviceType device = eDeviceType::GPU; + int deviceId = 0; + Size2D cropArea = {1, 1}; +}; + +bool ParseCropArea(const std::string& cropStr, Size2D& cropArea) { + std::istringstream iss(cropStr); + std::string token; + getline(iss, token, ','); + cropArea.w = std::stoi(token); + getline(iss, token, ','); + cropArea.h = std::stoi(token); + return true; +} + /** * @brief Center crop operation example. */ -void ShowHelpAndExit(const char *option = NULL) { - std::cout << "Options: " << option << std::endl - << "-i Input File Path - required" << std::endl - << "-o Output File Path - optional; default: output.bmp" << std::endl - << "-cpu Select CPU instead of GPU to perform operation - optional; default choice is GPU path" << std::endl - << "-d GPU device ID (0 for the first device, 1 for the second, etc.) 
- optional; default: 0" << std::endl - << "-crop Center crop area (width, height)- optional; default: use the set value in the app" << std::endl; - exit(0); +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "Usage: " << programName << " -i [-o ] [-d ] [-crop ] [-C]" << std::endl; + std::cout << " -i, --input Input image or directory containing images (required)" << std::endl; + std::cout << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; + std::cout << " -d, --device Device ID to use for execution (optional, default: 0)" << std::endl; + std::cout << " -c, --crop Crop area as comma separated values (optional, default: 1,1)" << std::endl; + std::cout << " -C, --cpu Use CPU for execution (optional, default: GPU)" << std::endl; + std::cout << " -h, --help Show this help message" << std::endl; + // clang-format on } - int main(int argc, char** argv) { - std::string input_file_path; - std::string output_file_path = "output.bmp"; - bool gpuPath = true; // use GPU by default - eDeviceType device = eDeviceType::GPU; - int deviceId = 0; - Size2D cropArea = {1, 1}; - bool cropSet = false; - - if(argc < 3) { - ShowHelpAndExit("-h"); - } - for (int i = 1; i < argc; i++) { - if (!strcmp(argv[i], "-h")) { - ShowHelpAndExit("-h"); - } - if (!strcmp(argv[i], "-i")) { - if (++i == argc) { - ShowHelpAndExit("-i"); - } - input_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-o")) { - if (++i == argc) { - ShowHelpAndExit("-o"); - } - output_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-crop")) { - i++; - if (i + 2 > argc) { - ShowHelpAndExit("-crop"); - } - cropArea.w = atoi(argv[i++]); - cropArea.h = atoi(argv[i]); - cropSet = true; - continue; - } - if (!strcmp(argv[i], "-cpu")) { - gpuPath = false; - continue; + Config config; + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"device", 
required_argument, nullptr, 'd'},
+ {"crop", required_argument, nullptr, 'c'},
+ {"cpu", no_argument, nullptr, 'C'},
+ {"help", no_argument, nullptr, 'h'},
+ {nullptr, 0, nullptr, 0}};
+ int opt;
+ while ((opt = getopt_long(argc, argv, "i:o:d:c:hC", longOptions, nullptr)) != -1) {
+ switch (opt) {
+ case 'i':
+ config.inputPath = optarg;
+ break;
+ case 'o':
+ config.outputPath = optarg;
+ break;
+ case 'd':
+ config.deviceId = std::stoi(optarg);
+ break;
+ case 'c':
+ if (!ParseCropArea(optarg, config.cropArea)) {
+ std::cerr << "Invalid crop area format. Use: width,height (e.g., 1,1)\n";
+ return EXIT_FAILURE;
+ }
+ break;
+ case 'C':
+ config.device = eDeviceType::CPU;
+ break;
+ case 'h':
+ PrintUsage(argv[0]);
+ return EXIT_SUCCESS;
+ default:
+ PrintUsage(argv[0]);
+ return EXIT_FAILURE;
+ } }
- if (gpuPath) {
- device = eDeviceType::GPU;
- HIP_VALIDATE_NO_ERRORS(hipSetDevice(deviceId));
- } else {
- device = eDeviceType::CPU;
+ if (config.inputPath.empty()) {
+ std::cerr << "Error: Input path is required.\n\n";
+ PrintUsage(argv[0]);
+ return EXIT_FAILURE; }
- cv::Mat imageData = cv::imread(input_file_path);
- if (imageData.empty()) {
- std::cerr << "Failed to read the input image file" << std::endl;
- exit(1);
- }
- if (!cropSet) {
- // Set a safe crop area if no user input
- cropArea = {(imageData.cols / 2), (imageData.rows / 2)};
+ if (config.device == eDeviceType::GPU) {
+ CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); }
- // Create input/output tensors for the image.
- TensorShape inputShape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), {1, imageData.rows, imageData.cols, imageData.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); - Tensor input(inputShape, dtype, device); + hipStream_t stream; + CHECK_HIP_ERROR(hipStreamCreate(&stream)); - TensorShape outShape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), {1, cropArea.h, cropArea.w, imageData.channels()}); - Tensor output(outShape, dtype, device); + // Load input images + Tensor input = LoadImages(stream, config.inputPath.c_str(), config.device); - hipStream_t stream = nullptr; - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); - } + // Create output shape + TensorShape outputShape(input.layout(), {input.shape(input.layout().batch_index()), config.cropArea.h, + config.cropArea.w, input.shape(input.layout().channels_index())}); - // Move image data to input tensor - size_t imageSizeInByte = input.shape().size() * input.dtype().size(); - auto input_data = input.exportData(); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(input_data.basePtr(), imageData.data, imageSizeInByte, hipMemcpyHostToDevice, stream)); - } else { - memcpy(input_data.basePtr(), imageData.data, imageSizeInByte); - } + // Create output tensor + Tensor output(outputShape, input.dtype(), config.device); + // Create CenterCrop operator CenterCrop op; - op(stream, input, output, cropArea, device); - - // Move image data back to host - size_t outputSize = output.shape().size() * output.dtype().size(); - auto outData = output.exportData(); - std::vector h_output(outputSize); - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(h_output.data(), outData.basePtr(), outputSize, hipMemcpyDeviceToHost, stream)); - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); - } else { - memcpy(h_output.data(), outData.basePtr(), outputSize); - } + op(stream, input, output, config.cropArea, config.device); - // Write output image to disk - cv::Mat output_imageData(cropArea.h, 
cropArea.w, imageData.type(), h_output.data()); - bool ret = cv::imwrite(output_file_path, output_imageData); - if (!ret) { - std::cerr << "Faild to save output image to the file" << std::endl; - exit(1); - } + // Write output images to disk + WriteImages(stream, output, config.outputPath); - std::cout << "Input image file: " << input_file_path << std::endl; - std::cout << "Output image file: " << output_file_path << std::endl; - if (gpuPath) { - std::cout << "Operation on GPU device " << deviceId << std::endl; - } else { - std::cout << "Operation on CPU" << std::endl; - } - std::cout << "Input image size: width = " << imageData.cols << ", height = " << imageData.rows << std::endl; - std::cout << "Cropping area: width = " << cropArea.w << ", height = " << cropArea.h << std::endl; + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file From b1ae692f21565b04688b146ab9e165b1b667e27e Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 8 Dec 2025 13:50:02 -0500 Subject: [PATCH 19/36] Fix/cleanup BndBox sample --- samples/bnd_box.cpp | 419 +++++++++++++++++++++++--------------------- 1 file changed, 215 insertions(+), 204 deletions(-) diff --git a/samples/bnd_box.cpp b/samples/bnd_box.cpp index 72ccbf06..621d3ef7 100644 --- a/samples/bnd_box.cpp +++ b/samples/bnd_box.cpp @@ -21,6 +21,7 @@ THE SOFTWARE. */ #include +#include #include #include @@ -28,239 +29,249 @@ THE SOFTWARE. #include #include -using namespace roccv; +#include "common/utils.hpp" -/** - * @brief Bounding Box operation example. 
- */ +using namespace roccv; /** - * @brief Example bounding box list file content - * 1 <-- number of images - * 2 <-- number of boxes for image 1 - * 50 <-- X coordinate of top-left corner of box 1 - * 50 <-- Y coordinate of top-left corner of box 1 - * 100 <-- width of box 1 - * 50 <-- height of box 1 - * 5 <-- thickness of box boundary of box 1 - * 0 <-- B component of box border color of box 1 - * 0 <-- G component of box border color of box 1 - * 255 <-- R component of box border color of box 1 - * 200 <-- alpha component of box border color of box 1 - * 0 <-- B component of box fill color of box 1 - * 255 <-- G component of box fill color of box 1 - * 0 <-- R component of box fill color of box 1 - * 100 <-- alpha component of box fill color of box 1 - * 250 <-- X coordinate of top-left corner of box 2 - * 250 <-- Y coordinate of top-left corner of box 2 - * 50 <-- width of box 2 - * 100 <-- height of box 2 - * 10 <-- thickness of box boundary of box 2 - * 255 <-- B component of box border color of box 2 - * 0 <-- G component of box border color of box 2 - * 0 <-- R component of box border color of box 2 - * 200 <-- alpha component of box border color of box 2 - * 0 <-- B component of box fill color of box 2 - * 0 <-- G component of box fill color of box 2 - * 0 <-- R component of box fill color of box 2 - * 0 <-- alpha component of box fill color of box 2 + * @file bnd_box.cpp + * @brief Example application for drawing bounding boxes on images using ROC CV. + * + * This file demonstrates how to parse a bounding box description file and apply bounding boxes to input images. + * The bounding box file is expected to have the following format: + * - The first line contains the number of images. + * - Each image section starts with a line containing the number of boxes for that image. 
+ * - Each box is described over 13 subsequent lines as follows: + * - X coordinate of top-left corner + * - Y coordinate of top-left corner + * - Width + * - Height + * - Thickness of box boundary + * - B component of box border color + * - G component of box border color + * - R component of box border color + * - Alpha component of box border color + * - B component of box fill color + * - G component of box fill color + * - R component of box fill color + * - Alpha component of box fill color + * + * Example of bounding box list file content: + * @code + * 1 + * 2 + * 50 + * 50 + * 100 + * 50 + * 5 + * 0 + * 0 + * 255 + * 200 + * 0 + * 255 + * 0 + * 100 + * 250 + * 250 + * 50 + * 100 + * 10 + * 255 + * 0 + * 0 + * 200 + * 0 + * 0 + * 0 + * 0 + * @endcode */ -void ShowHelpAndExit(const char* option = NULL) { - std::cout << "Options: " << option << std::endl - << "-i Input File Path - required" << std::endl - << "-o Output File Path - optional; default: output.bmp" << std::endl - << "-cpu Select CPU instead of GPU to perform operation - optional; default choice is GPU path" - << std::endl - << "-d GPU device ID (0 for the first device, 1 for the second, etc.) 
- optional; default: 0" << std::endl - << "-box_file Bounding box list file - optional; default: use the set value in the app" << std::endl; - exit(0); -} - -int main(int argc, char** argv) { - std::string input_file_path; - std::string box_file_path; - std::string output_file_path = "output.bmp"; - bool gpuPath = true; // use GPU by default +struct Config { + std::string inputPath; + std::string outputPath = "output"; eDeviceType device = eDeviceType::GPU; int deviceId = 0; - bool boxSet = false; // User sets the bounding box list data in a text file - - if (argc < 3) { - ShowHelpAndExit("-h"); - } - for (int i = 1; i < argc; i++) { - if (!strcmp(argv[i], "-h")) { - ShowHelpAndExit("-h"); - } - if (!strcmp(argv[i], "-i")) { - if (++i == argc) { - ShowHelpAndExit("-i"); - } - input_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-o")) { - if (++i == argc) { - ShowHelpAndExit("-o"); - } - output_file_path = argv[i]; - continue; - } - if (!strcmp(argv[i], "-box_file")) { - if (++i == argc) { - ShowHelpAndExit("-o"); - } - box_file_path = argv[i]; - boxSet = true; - continue; - } - if (!strcmp(argv[i], "-cpu")) { - gpuPath = false; - continue; - } - } + std::string boundingBoxFilePath = ""; +}; - if (gpuPath) { - device = eDeviceType::GPU; - HIP_VALIDATE_NO_ERRORS(hipSetDevice(deviceId)); - } else { - device = eDeviceType::CPU; - } +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "Usage: " << programName << " -i [-o ] [-b ] [-d ] [-C]" << std::endl; + std::cout << " -i, --input Input image or directory containing images (required)" << std::endl; + std::cout << " -o, --output Output image or directory to save the results (optional, default: output)" << std::endl; + std::cout << " -b, --box_file Bounding box list file (optional, default: use the set value in the app)" << std::endl; + std::cout << " -d, --device Device ID to use for execution (optional, default: 0)" << std::endl; + std::cout << " -C, --cpu Use CPU for 
execution (optional, default: GPU)" << std::endl; + std::cout << " -h, --help Show this help message" << std::endl; + // clang-format on +} - cv::Mat imageData = cv::imread(input_file_path); - if (imageData.empty()) { - std::cerr << "Failed to read the input image file" << std::endl; - exit(1); +/** + * @brief Parse bounding box file and setup bounding box vector. + * + * @param[in] boundingBoxFilePath Path to bounding box list file. + * @param[out] bbox_vector Vector of bounding box vectors to be filled. + * @return void + * @throws std::runtime_error if failed to open bounding box file. + */ +void ParseBoundingBoxFile(const std::string& boundingBoxFilePath, std::vector>& bbox_vector) { + std::ifstream file(boundingBoxFilePath); + if (!file.is_open()) { + throw std::runtime_error("Failed to open bounding box file " + boundingBoxFilePath); } - int batchSize = 1; - std::vector> bbox_vector; - if (boxSet) { - std::ifstream box_list_file(box_file_path); - if (box_list_file.is_open()) { - std::string line; - std::getline(box_list_file, line); - batchSize = std::stoi(line.c_str()); - if (batchSize > 0) { - bbox_vector.resize(batchSize); - for (int i = 0; i < batchSize; i++) { - std::getline(box_list_file, line); - int numBoxes = std::stoi(line.c_str()); - if (numBoxes > 0) { - for (int b = 0; b < numBoxes; b++) { - BndBox_t box; - std::getline(box_list_file, line); - box.box.x = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.box.y = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.box.width = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.box.height = std::atoi(line.c_str()); + int numImages; + file >> numImages; + bbox_vector.resize(numImages); + for (int i = 0; i < numImages; i++) { + int numBoxes; + file >> numBoxes; - std::getline(box_list_file, line); - box.thickness = std::atoi(line.c_str()); + bbox_vector[i].resize(numBoxes); + for (int j = 0; j < numBoxes; j++) { + // Parse each box from 13 lines, in 
order: + // 1. x (top-left corner) + // 2. y (top-left corner) + // 3. width + // 4. height + // 5. thickness (box boundary) + // 6. border B + // 7. border G + // 8. border R + // 9. border A (alpha) + // 10. fill B + // 11. fill G + // 12. fill R + // 13. fill A (alpha) + // (13 values total per box) - std::getline(box_list_file, line); - box.borderColor.r = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.borderColor.g = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.borderColor.b = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.borderColor.a = std::atoi(line.c_str()); + // Read box dimensions + file >> bbox_vector[i][j].box.x >> bbox_vector[i][j].box.y >> bbox_vector[i][j].box.width >> + bbox_vector[i][j].box.height; + file >> bbox_vector[i][j].thickness; - std::getline(box_list_file, line); - box.fillColor.r = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.fillColor.g = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.fillColor.b = std::atoi(line.c_str()); - std::getline(box_list_file, line); - box.fillColor.a = std::atoi(line.c_str()); + // Read colors into temp ints + int r, g, b, a; + file >> b >> g >> r >> a; + bbox_vector[i][j].borderColor = {static_cast(r), static_cast(g), static_cast(b), + static_cast(a)}; - bbox_vector[i].push_back(box); - } - } else { - std::cerr << "Invalid number of boxes: " << numBoxes << "for image: " << i << std::endl; - exit(1); - } - } - } else { - std::cerr << "Invalid batch size: " << batchSize << std::endl; - exit(1); - } - } else { - std::cerr << "Failed to open bounding box list file " << box_file_path << std::endl; - exit(1); + file >> b >> g >> r >> a; + bbox_vector[i][j].fillColor = {static_cast(r), static_cast(g), static_cast(b), + static_cast(a)}; } - } else { - auto width = imageData.cols; - auto height = imageData.rows; - bbox_vector = { - { + } +} + +/** + * @brief Setup bounding box vector from file or default 
values. + * + * @param[in] batchSize Number of images in the batch. + * @param[in] width Width of each image in the batch. + * @param[in] height Height of each image in the batch. + * @param[in] boundingBoxFilePath Path to bounding box list file. + * @return Vector of bounding box vectors. + */ +std::vector> SetupBoundingBoxVector(int64_t batchSize, int64_t width, int64_t height, + const std::string& boundingBoxFilePath) { + std::vector> bbox_vector; + + if (boundingBoxFilePath.empty()) { + for (int64_t b = 0; b < batchSize; b++) { + bbox_vector.push_back({ {{width / 4, height / 4, width / 2, height / 2}, 5, {0, 0, 255, 200}, {0, 255, 0, 100}}, {{width / 3, height / 3, width / 3 * 2, height / 4}, -1, {90, 16, 181, 50}, {0, 0, 0, 0}}, {{-50, (height * 2) / 3, width + 50, height / 3 + 50}, 0, {0, 0, 0, 0}, {111, 159, 232, 150}}, - }, - }; + }); + } + } else { + ParseBoundingBoxFile(boundingBoxFilePath, bbox_vector); } - BndBoxes bboxes(bbox_vector); - // Create input/output tensors for the image. - TensorShape imageShape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, imageData.rows, imageData.cols, imageData.channels()}); - DataType dtype(eDataType::DATA_TYPE_U8); - Tensor input(imageShape, dtype, device); - Tensor output(imageShape, dtype, device); + return bbox_vector; +} - hipStream_t stream = nullptr; - if (gpuPath) { - HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); +/** + * @brief Main function to run the bounding box operation example. + * + * @param[in] argc Number of command line arguments. + * @param[in] argv Command line arguments. + * @return int Exit status. 
+ */
+int main(int argc, char** argv) {
+ Config config;
+ static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'},
+ {"output", required_argument, nullptr, 'o'},
+ {"box_file", required_argument, nullptr, 'b'},
+ {"device", required_argument, nullptr, 'd'},
+ {"cpu", no_argument, nullptr, 'C'},
+ {"help", no_argument, nullptr, 'h'},
+ {nullptr, 0, nullptr, 0}};
+ int opt;
+ while ((opt = getopt_long(argc, argv, "i:o:b:d:hC", longOptions, nullptr)) != -1) {
+ switch (opt) {
+ case 'i':
+ config.inputPath = optarg;
+ break;
+ case 'o':
+ config.outputPath = optarg;
+ break;
+ case 'b':
+ config.boundingBoxFilePath = optarg;
+ break;
+ case 'd':
+ config.deviceId = std::stoi(optarg);
+ break;
+ case 'C':
+ config.device = eDeviceType::CPU;
+ break;
+ case 'h':
+ PrintUsage(argv[0]);
+ return EXIT_SUCCESS;
+ default:
+ PrintUsage(argv[0]);
+ return EXIT_FAILURE;
+ } }
- if (gpuPath) {
- device = eDeviceType::GPU;
- HIP_VALIDATE_NO_ERRORS(hipSetDevice(deviceId));
- } else {
- device = eDeviceType::CPU;
- }
+ if (config.inputPath.empty()) {
+ std::cerr << "Error: Input path is required.\n\n";
+ PrintUsage(argv[0]);
+ return EXIT_FAILURE;
+ }
- cv::Mat imageData = cv::imread(input_file_path);
- if (imageData.empty()) {
- std::cerr << "Failed to read the input image file" << std::endl;
- exit(1);
+ if (config.device == eDeviceType::GPU) {
- 
CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); } - // Write output image to disk - cv::Mat outImageData(imageData.rows, imageData.cols, imageData.type(), h_output.data()); - bool ret = cv::imwrite(output_file_path, outImageData); - if (!ret) { - std::cerr << "Faild to save output image to the file" << std::endl; - exit(1); - } + hipStream_t stream; + CHECK_HIP_ERROR(hipStreamCreate(&stream)); - std::cout << "Input image file: " << input_file_path << std::endl; - std::cout << "Output image file: " << output_file_path << std::endl; - if (gpuPath) { - std::cout << "Operation on GPU device " << deviceId << std::endl; - } else { - std::cout << "Operation on CPU" << std::endl; - } - std::cout << "Image size: width = " << imageData.cols << ", height = " << imageData.rows << std::endl; + // Load images + Tensor input = LoadImages(stream, config.inputPath.c_str(), config.device); + int64_t batchSize = input.shape()[input.shape().layout().batch_index()]; + int64_t height = input.shape()[input.shape().layout().height_index()]; + int64_t width = input.shape()[input.shape().layout().width_index()]; + + // Setup bounding box vector + std::vector> bbox_vector = + SetupBoundingBoxVector(batchSize, width, height, config.boundingBoxFilePath); + + BndBoxes bboxes(bbox_vector); + + // Create output tensor + Tensor output(input.shape(), input.dtype(), config.device); + + // Create BndBox operator + BndBox op; + op(stream, input, output, bboxes, config.device); + + // Write output images to disk + WriteImages(stream, output, config.outputPath); + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file From e85d3a4d35f912311737ae28ff70b39179d40a80 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 8 Dec 2025 14:18:22 -0500 Subject: [PATCH 20/36] Fix/clean Composite sample --- samples/common/utils.hpp | 4 +- samples/composite.cpp | 162 +++++++++++++++++++++++---------------- 2 files changed, 96 insertions(+), 70 deletions(-) diff --git 
a/samples/common/utils.hpp b/samples/common/utils.hpp index f8f0c5a0..b34dedb4 100644 --- a/samples/common/utils.hpp +++ b/samples/common/utils.hpp @@ -83,7 +83,7 @@ inline MemcpyParams GetMemcpyParams(const roccv::Tensor &tensor) { * @return A NHWC tensor containing the loaded images. */ inline roccv::Tensor LoadImages(hipStream_t stream, const std::string &image_path, - eDeviceType device = eDeviceType::GPU) { + eDeviceType device = eDeviceType::GPU, int openCVFlags = cv::IMREAD_UNCHANGED) { const std::vector supportedExtensions = {".bmp", ".jpg", ".jpeg", ".png"}; std::vector images; @@ -96,7 +96,7 @@ inline roccv::Tensor LoadImages(hipStream_t stream, const std::string &image_pat if (std::filesystem::is_directory(image_path)) { for (auto file : std::filesystem::directory_iterator(image_path)) { if (!std::filesystem::is_directory(file.path()) && ContainsExtension(file.path(), supportedExtensions)) { - images.push_back(cv::imread(file.path())); + images.push_back(cv::imread(file.path(), openCVFlags)); // Check if all images are of the same size if (width == -1 && height == -1 && channels == -1) { diff --git a/samples/composite.cpp b/samples/composite.cpp index cbc5507e..135dd9f2 100644 --- a/samples/composite.cpp +++ b/samples/composite.cpp @@ -19,89 +19,115 @@ * THE SOFTWARE. */ +#include + #include #include #include +#include "common/utils.hpp" + +/** @file composite.cpp + * @brief Sample application for the Composite operation. + * + * This sample application demonstrates the Composite operation, which blends the foreground image into the background + * image using a grayscale alpha mask. It loads the background, foreground, and mask images, runs the Composite + * operation, and writes the output images to disk. 
+ */ + using namespace roccv; +struct Config { + std::string backgroundPath; + std::string foregroundPath; + std::string maskPath; + std::string outputPath = "output"; + eDeviceType device = eDeviceType::GPU; + int deviceId = 0; +}; + +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "Usage: " << programName << " -b -f -m -o -d " << std::endl; + std::cout << " -b, --background Background image filename (required)" << std::endl; + std::cout << " -f, --foreground Foreground image filename (required)" << std::endl; + std::cout << " -m, --mask Mask image filename (required)" << std::endl; + std::cout << " -o, --output Output image filename (optional, default: output)" << std::endl; + std::cout << " -d, --device Device ID to use for execution (optional, default: 0)" << std::endl; + std::cout << " -h, --help Show this help message" << std::endl; + std::cout << std::endl; + std::cout << "NOTE: If you use directories for the background, foreground, or mask inputs, each directory must contain the same number of images. Each image in the directories must be the same size." << std::endl; + // clang-format on +} + +/** + * @brief Main function for the Composite operation. + * + * @param argc Number of command line arguments. + * @param argv Command line arguments. + * @return EXIT_SUCCESS if the operation completed successfully, EXIT_FAILURE otherwise. 
+ */ int main(int argc, char** argv) {
- if (argc != 6) {
- std::cerr << "Usage: " << argv[0]
- << " "
- << std::endl;
+ Config config;
+ static struct option longOptions[] = {{"background", required_argument, nullptr, 'b'},
+ {"foreground", required_argument, nullptr, 'f'},
+ {"mask", required_argument, nullptr, 'm'},
+ {"output", required_argument, nullptr, 'o'},
+ {"device", required_argument, nullptr, 'd'},
+ {"help", no_argument, nullptr, 'h'},
+ {nullptr, 0, nullptr, 0}};
+ int opt;
+ while ((opt = getopt_long(argc, argv, "b:f:m:o:d:h", longOptions, nullptr)) != -1) {
+ switch (opt) {
+ case 'b':
+ config.backgroundPath = optarg;
+ break;
+ case 'f':
+ config.foregroundPath = optarg;
+ break;
+ case 'm':
+ config.maskPath = optarg;
+ break;
+ case 'o':
+ config.outputPath = optarg;
+ break;
+ case 'd':
+ config.deviceId = std::stoi(optarg);
+ break;
+ case 'h':
+ PrintUsage(argv[0]);
+ return EXIT_SUCCESS;
+ default:
+ PrintUsage(argv[0]);
+ return EXIT_FAILURE;
+ }
+ }
+ if (config.backgroundPath.empty() || config.foregroundPath.empty() || config.maskPath.empty()) {
+ std::cerr << "Error: Background, foreground, and mask paths are required.\n\n";
+ PrintUsage(argv[0]); return EXIT_FAILURE; }
- hipStream_t stream;
- HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream));
- 
- HIP_VALIDATE_NO_ERRORS(hipSetDevice(std::stoi(argv[5])));
- 
- cv::Mat background_data = cv::imread(argv[1]);
- cv::Mat foreground_data = cv::imread(argv[2]);
- cv::Mat mask_data = cv::imread(argv[3], cv::IMREAD_GRAYSCALE);
- 
- // Create input/output tensors for the image.
- DataType dtype(eDataType::DATA_TYPE_U8); - TensorShape background_shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, background_data.rows, background_data.cols, background_data.channels()}); - Tensor background_tensor(background_shape, dtype); - - TensorShape foreground_shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, foreground_data.rows, foreground_data.cols, foreground_data.channels()}); - Tensor foreground_tensor(foreground_shape, dtype); - - TensorShape mask_shape(TensorLayout(eTensorLayout::TENSOR_LAYOUT_NHWC), - {1, mask_data.rows, mask_data.cols, mask_data.channels()}); - Tensor mask_tensor(mask_shape, dtype); - - Tensor output_tensor(background_shape, dtype); - - auto bt_data = background_tensor.exportData(); - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(bt_data.basePtr(), background_data.data, - bt_data.shape().size() * bt_data.dtype().size(), hipMemcpyHostToDevice, - stream)); - - auto ft_data = foreground_tensor.exportData(); - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(ft_data.basePtr(), foreground_data.data, - foreground_tensor.shape().size() * foreground_tensor.dtype().size(), - hipMemcpyHostToDevice, stream)); - - auto m_data = mask_tensor.exportData(); - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(m_data.basePtr(), mask_data.data, - mask_tensor.shape().size() * mask_tensor.dtype().size(), - hipMemcpyHostToDevice, stream)); - - hipEvent_t begin, end; - HIP_VALIDATE_NO_ERRORS(hipEventCreate(&begin)); - HIP_VALIDATE_NO_ERRORS(hipEventCreate(&end)); - - HIP_VALIDATE_NO_ERRORS(hipEventRecord(begin, stream)); - roccv::Composite op; - op(stream, foreground_tensor, background_tensor, mask_tensor, output_tensor); - HIP_VALIDATE_NO_ERRORS(hipEventRecord(end, stream)); - HIP_VALIDATE_NO_ERRORS(hipEventSynchronize(end)); + if (config.device == eDeviceType::GPU) { + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); + } - float duration; - HIP_VALIDATE_NO_ERRORS(hipEventElapsedTime(&duration, begin, end)); - printf("Kernel execution time: %fms\n", 
duration); + hipStream_t stream; + CHECK_HIP_ERROR(hipStreamCreate(&stream)); - HIP_VALIDATE_NO_ERRORS(hipEventDestroy(begin)); - HIP_VALIDATE_NO_ERRORS(hipEventDestroy(end)); + // Create required tensors for the Composite operation + Tensor background = LoadImages(hipStreamPerThread, config.backgroundPath, config.device); + Tensor foreground = LoadImages(hipStreamPerThread, config.foregroundPath, config.device); + Tensor mask = LoadImages(hipStreamPerThread, config.maskPath, config.device, cv::IMREAD_GRAYSCALE); + Tensor output = Tensor(background.shape(), background.dtype(), config.device); - // Move image data back to device - auto out_data = output_tensor.exportData(); - std::vector out_h(output_tensor.shape().size()); - HIP_VALIDATE_NO_ERRORS(hipMemcpyAsync(out_h.data(), out_data.basePtr(), - output_tensor.shape().size() * output_tensor.dtype().size(), - hipMemcpyDeviceToHost, stream)); + // Run the Composite operation + Composite op; + op(stream, foreground, background, mask, output, config.device); - HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); + // Write output images to disk + WriteImages(stream, output, config.outputPath); - // Write normalized image to disk - cv::Mat output_image_data(background_data.rows, background_data.cols, CV_8UC3, out_h.data()); - cv::imwrite(argv[4], output_image_data); + CHECK_HIP_ERROR(hipStreamDestroy(stream)); return EXIT_SUCCESS; } \ No newline at end of file From 1d1879b8f10dd551564288635c9aa638d15ef400 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 8 Dec 2025 15:45:12 -0500 Subject: [PATCH 21/36] Fix crop and resize example --- samples/common/utils.hpp | 73 +------- samples/cropandresize/cpp/main.cpp | 281 ++++++++++++++--------------- 2 files changed, 139 insertions(+), 215 deletions(-) diff --git a/samples/common/utils.hpp b/samples/common/utils.hpp index b34dedb4..bd0b806f 100644 --- a/samples/common/utils.hpp +++ b/samples/common/utils.hpp @@ -80,6 +80,7 @@ inline MemcpyParams GetMemcpyParams(const 
roccv::Tensor &tensor) { * @param image_path The path to the image to load. If a directory is provided, all supported images in the directory * will be loaded. * @param device The device to load the images onto. Defaults to GPU. + * @param openCVFlags The OpenCV flags to use when loading the images. Defaults to IMREAD_UNCHANGED. * @return A NHWC tensor containing the loaded images. */ inline roccv::Tensor LoadImages(hipStream_t stream, const std::string &image_path, @@ -190,76 +191,4 @@ inline void WriteImages(hipStream_t stream, const roccv::Tensor &tensor, const s } else { cv::imwrite(output_path, images[0]); } -} - -/** - * @brief Loads images into the GPU memory specified. - * - * @param images_dir Either a directory or a single image to load into GPU memory. - * @param num_images The number of images to load into GPU memory. - * @param gpu_input A pointer to valid GPU memory. - */ -inline void DecodeRGBIImage(const std::string &images_dir, int num_images, void *gpu_input) { - const std::vector supportedExtensions = {".bmp", ".jpg", ".jpeg", ".png"}; - - std::vector imageFiles; - if (std::filesystem::is_directory(images_dir)) { - // A directory is provided. Collect all supported files in the directory (non-recursively). - for (auto file : std::filesystem::directory_iterator(images_dir)) { - if (!std::filesystem::is_directory(file.path()) && ContainsExtension(file.path(), supportedExtensions)) { - imageFiles.push_back(file.path()); - } - } - - // Throw an error if there were no valid images found in the given directory - if (imageFiles.empty()) { - throw std::runtime_error("No valid images found in directory " + images_dir); - } - } else { - // A single image file is provided - if (!ContainsExtension(images_dir, supportedExtensions)) - throw std::runtime_error("Cannot decode " + images_dir + ". 
File type not supported.\n"); - imageFiles.push_back(images_dir); - } - - // Load images into provided GPU memory - size_t mem_offset = 0; - for (int b = 0; b < num_images; b++) { - cv::Mat inputMat = cv::imread(imageFiles[b]); - if (inputMat.empty()) { - throw std::runtime_error("Unable to load image " + imageFiles[b]); - } - - size_t imageSize = inputMat.rows * inputMat.cols * inputMat.channels() * sizeof(uint8_t); - CHECK_HIP_ERROR( - hipMemcpy(static_cast(gpu_input) + mem_offset, inputMat.data, imageSize, hipMemcpyHostToDevice)); - mem_offset += imageSize; - } -} - -/** - * @brief Writes a batch of 3-channel RGBI images in a tensor to .bmp files. This will also block on the provided - * stream. - * - * @param tensor A tensor containing a batch of RGBI images. - * @param stream The HIP stream to synchronize with. - */ -inline void WriteRGBITensor(const roccv::Tensor &tensor, hipStream_t stream) { - CHECK_HIP_ERROR(hipStreamSynchronize(stream)); - - auto srcData = tensor.exportData(); - int batchSize = tensor.shape(tensor.layout().batch_index()); - int height = tensor.shape(tensor.layout().height_index()); - int width = tensor.shape(tensor.layout().width_index()); - - // Write each image in the batch to separate .bmp files - for (int b = 0; b < batchSize; b++) { - std::ostringstream outFilename; - outFilename << "./roccvtest_" << b << ".bmp"; - - cv::Mat outputMat(height, width, CV_8UC3); - CHECK_HIP_ERROR(hipMemcpy(outputMat.data, srcData.basePtr(), - (tensor.shape().size() / batchSize) * tensor.dtype().size(), hipMemcpyDeviceToHost)); - cv::imwrite(outFilename.str().c_str(), outputMat); - } } \ No newline at end of file diff --git a/samples/cropandresize/cpp/main.cpp b/samples/cropandresize/cpp/main.cpp index 8b8ed9bd..8f92f5a8 100644 --- a/samples/cropandresize/cpp/main.cpp +++ b/samples/cropandresize/cpp/main.cpp @@ -23,9 +23,9 @@ #include #include +#include #include #include -#include #include #include #include @@ -35,170 +35,165 @@ #include 
"core/tensor_shape.hpp" /** + * @file main.cpp * @brief Crop and Resize sample app. * - * The Crop and Resize is a simple pipeline which demonstrates usage of - * rocCV Tensor along with a few operators. + * The Crop and Resize is a simple pipeline which demonstrates usage of the + * rocCV Tensor along with the Custom Crop and Resize operators. * * Input Batch Tensor -> Crop -> Resize -> WriteImage */ -/** - * @brief Utility to show usage of sample app - * - **/ -void showUsage() { - std::cout << "usage: ./roccv_cropandresize_app -i -b " - << std::endl; +using namespace roccv; + +struct Config { + std::string inputPath; + std::string outputPath = "output"; + eDeviceType device = eDeviceType::GPU; + Size2D resizeShape = {320, 480}; + Box_t cropRect = {50, 150, 400, 300}; + eInterpolationType interpolation = eInterpolationType::INTERP_TYPE_LINEAR; + int deviceId = 0; +}; + +void PrintUsage(const char* programName) { + // clang-format off + std::cout << "Usage: " << programName << " -i [options]" << std::endl; + std::cout << " -i, --input Input image or directory (required)" << std::endl; + std::cout << "Options:" << std::endl; + std::cout << " -o, --output Output image or directory (optional, default: output)" << std::endl; + std::cout << " -r, --resize Resize shape as width,height (optional, default: 320,480)" << std::endl; + std::cout << " -c, --crop Crop rectangle as x,y,w,h (optional, default: 50,150,400,300)" << std::endl; + std::cout << " -I, --interpolation Interpolation type: 0=NEAREST, 1=LINEAR, 2=CUBIC (optional, default: LINEAR)" << std::endl; + std::cout << " -C, --cpu Use CPU for execution (optional, default: GPU)" << std::endl; + std::cout << " -d, --device Device ID to use for execution (optional, default: 0)" << std::endl; + std::cout << " -h, --help Show this help message" << std::endl; + std::cout << std::endl; + // clang-format on } -/** - * @brief Utility to parse the command line arguments - * - **/ -int ParseArgs(int argc, char *argv[], 
std::string &imagePath, uint32_t &batchSize) { - static struct option long_options[] = {{"help", no_argument, 0, 'h'}, - {"imagePath", required_argument, 0, 'i'}, - {"batch", required_argument, 0, 'b'}, - {0, 0, 0, 0}}; - - int long_index = 0; - int opt = 0; - while ((opt = getopt_long(argc, argv, "hi:b:", long_options, &long_index)) != -1) { +void ParseCropRectangle(const std::string& cropStr, Box_t& cropRect) { + std::istringstream iss(cropStr); + std::string token; + getline(iss, token, ','); + cropRect.x = std::stoi(token); + getline(iss, token, ','); + cropRect.y = std::stoi(token); + getline(iss, token, ','); + cropRect.width = std::stoi(token); + getline(iss, token, ','); + cropRect.height = std::stoi(token); +} + +void ParseResizeShape(const std::string& resizeStr, Size2D& resizeShape) { + std::istringstream iss(resizeStr); + std::string token; + getline(iss, token, ','); + resizeShape.w = std::stoi(token); + getline(iss, token, ','); + resizeShape.h = std::stoi(token); +} + +int main(int argc, char** argv) { + Config config; + static struct option longOptions[] = {{"input", required_argument, nullptr, 'i'}, + {"output", required_argument, nullptr, 'o'}, + {"resize", required_argument, nullptr, 'r'}, + {"crop", required_argument, nullptr, 'c'}, + {"interpolation", required_argument, nullptr, 'I'}, + {"cpu", no_argument, nullptr, 'C'}, + {"device", required_argument, nullptr, 'd'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + + // Parse command line arguments + int opt; + while ((opt = getopt_long(argc, argv, "i:o:r:c:I:d:h:C", longOptions, nullptr)) != -1) { switch (opt) { - case 'h': - showUsage(); - return -1; - break; case 'i': - imagePath = optarg; + config.inputPath = optarg; break; - case 'b': - batchSize = std::stoi(optarg); + case 'o': + config.outputPath = optarg; break; - case ':': - showUsage(); - return -1; - default: + case 'r': + ParseResizeShape(optarg, config.resizeShape); + break; + case 'c': + 
ParseCropRectangle(optarg, config.cropRect); break; + case 'I': + config.interpolation = static_cast(std::stoi(optarg)); + break; + case 'C': + config.device = eDeviceType::CPU; + break; + case 'd': + config.deviceId = std::stoi(optarg); + break; + case 'h': + PrintUsage(argv[0]); + return EXIT_SUCCESS; + default: + PrintUsage(argv[0]); + return EXIT_FAILURE; } } - std::ifstream imageFile(imagePath); - if (!imageFile.good()) { - showUsage(); - std::cerr << "Image path '" + imagePath + "' does not exist" << std::endl; - return -1; - } - return 0; -} -int main(int argc, char *argv[]) { - // Default parameters - // TODO: Default parameter for images cannot be added for now. Must specify a sample asset directory in the final - // build which is relative to this executable. - std::string imagePath = "none.jpg"; - uint32_t batchSize = 1; - - // Parse the command line paramaters to override the default parameters - int retval = ParseArgs(argc, argv, imagePath, batchSize); - if (retval != 0) { - return retval; + if (config.inputPath.empty()) { + std::cerr << "Error: Input path is required.\n\n"; + PrintUsage(argv[0]); + return EXIT_FAILURE; } - // Note : The maximum input image dimensions needs to be updated in case - // of testing with different test images - - int maxImageWidth = 720; - int maxImageHeight = 480; - int maxChannels = 3; + if (config.device == eDeviceType::GPU) { + CHECK_HIP_ERROR(hipSetDevice(config.deviceId)); + } - // tag: Create the HIP stream hipStream_t stream; CHECK_HIP_ERROR(hipStreamCreate(&stream)); - // tag: Allocate input tensor - // Allocating memory for RGBI input image batch of uint8_t data type. 
- - roccv::TensorDataStrided::Buffer inBuf; - inBuf.strides[3] = sizeof(uint8_t); - inBuf.strides[2] = maxChannels * inBuf.strides[3]; - inBuf.strides[1] = maxImageWidth * inBuf.strides[2]; - inBuf.strides[0] = maxImageHeight * inBuf.strides[1]; - CHECK_HIP_ERROR(hipMallocAsync(&inBuf.basePtr, batchSize * inBuf.strides[0], stream)); - - // tag: Tensor Requirements - // Calculate the requirements for the RGBI uint8_t Tensor which include - // pitch bytes, alignment, shape and tensor layout - roccv::Tensor::Requirements inReqs = - roccv::Tensor::CalcRequirements(batchSize, {maxImageWidth, maxImageHeight}, roccv::FMT_RGB8); - - // Create a tensor buffer to store the data pointer and pitch bytes for each plane - roccv::TensorDataStridedHip inData(roccv::TensorShape{inReqs.shape, inReqs.rank, inReqs.layout}, - roccv::DataType{inReqs.dtype}, inBuf); - - // Wrap tensor data in a rocCV tensor for use with the rocCV operators. - roccv::Tensor inTensor = roccv::TensorWrapData(inData); - - // tag: Image Loading - uint8_t *gpuInput = reinterpret_cast(inBuf.basePtr); - // The total images is set to the same value as batch size for testing - uint32_t totalImages = batchSize; - - // OpenCV is used to load the images, which gets copied into device memory. - DecodeRGBIImage(imagePath, totalImages, gpuInput); - - // tag: The input buffer is now ready to be used by the operators - - // Set parameters for Crop and Resize - // ROI dimensions to crop in the input image - int cropX = 50; - int cropY = 150; - int cropWidth = 400; - int cropHeight = 300; - - // Set the resize dimensions - int resizeWidth = 320; - int resizeHeight = 240; - - // Create the crop rect for the cropping operator - roccv::Box_t crpRect = {cropX, cropY, cropWidth, cropHeight}; - - // tag: Allocate Tensors for Crop and Resize - // Create a rocCV Tensor based on the crop window size. 
- roccv::Tensor cropTensor(batchSize, {cropWidth, cropHeight}, roccv::FMT_RGB8); - // Create a rocCV Tensor based on resize dimensions - roccv::Tensor resizedTensor(batchSize, {resizeWidth, resizeHeight}, roccv::FMT_RGB8); - -#ifdef PROFILE_SAMPLE - hipEvent_t start, stop; - hipEventCreate(&start); - hipEventCreate(&stop); - hipEventRecord(start); -#endif - // tag: Initialize operators for Crop and Resize - roccv::CustomCrop cropOp; - roccv::Resize resizeOp; - - // tag: Executes the CustomCrop operation on the given HIP stream - cropOp(stream, inTensor, cropTensor, crpRect); - - // Resize operator can now be enqueued into the same stream - resizeOp(stream, cropTensor, resizedTensor, INTERP_TYPE_LINEAR); - - // tag: Profile section -#ifdef PROFILE_SAMPLE - hipEventRecord(stop); - hipEventSynchronize(stop); - float operatorms = 0; - hipEventElapsedTime(&operatorms, start, stop); - std::cout << "Time for Crop and Resize : " << operatorms << " ms" << std::endl; -#endif - - // tag: Copy the buffer to CPU and write resized image into .bmp files - WriteRGBITensor(resizedTensor, stream); - - // tag: Clean up + // Load batch of input images + Tensor input = LoadImages(stream, config.inputPath, config.device); + + // Determine the batch size and channels from the input tensor + int64_t batchSize = input.shape(input.layout().batch_index()); + int64_t channels = input.shape(input.layout().channels_index()); + + // Create tensor for the cropped image + Tensor cropTensor = + Tensor(TensorShape(input.layout(), {batchSize, config.cropRect.height, config.cropRect.width, channels}), + input.dtype(), config.device); + + // Create tensor for the resized image + Tensor resizedTensor = + Tensor(TensorShape(input.layout(), {batchSize, config.resizeShape.w, config.resizeShape.h, channels}), + input.dtype(), config.device); + + // Create crop and resize operators + CustomCrop cropOp; + Resize resizeOp; + + std::chrono::high_resolution_clock::time_point start = 
std::chrono::high_resolution_clock::now(); + + // Run the crop operation, writing results to the crop tensor + cropOp(stream, input, cropTensor, config.cropRect, config.device); + + // Run the resize operation, writing results to the resized tensor + resizeOp(stream, cropTensor, resizedTensor, config.interpolation, config.device); + CHECK_HIP_ERROR(hipStreamSynchronize(stream)); + + std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now(); + + // Report the duration of the crop and resize operation + long executionTime = std::chrono::duration_cast(end - start).count(); + std::cout << "Processed " << batchSize << " images in " << executionTime << "ms" << std::endl; + + // Write the cropped and resized images to disk + WriteImages(stream, resizedTensor, config.outputPath); + + // Destroy the stream CHECK_HIP_ERROR(hipStreamDestroy(stream)); - // tag: End of Sample + return EXIT_SUCCESS; } \ No newline at end of file From b7cff204b680e27e9df27228612c20176edeb87d Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 8 Dec 2025 15:49:39 -0500 Subject: [PATCH 22/36] Add more information to the help message --- samples/cropandresize/cpp/main.cpp | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/samples/cropandresize/cpp/main.cpp b/samples/cropandresize/cpp/main.cpp index 8f92f5a8..f2a9c939 100644 --- a/samples/cropandresize/cpp/main.cpp +++ b/samples/cropandresize/cpp/main.cpp @@ -58,16 +58,23 @@ struct Config { void PrintUsage(const char* programName) { // clang-format off - std::cout << "Usage: " << programName << " -i [options]" << std::endl; - std::cout << " -i, --input Input image or directory (required)" << std::endl; - std::cout << "Options:" << std::endl; - std::cout << " -o, --output Output image or directory (optional, default: output)" << std::endl; - std::cout << " -r, --resize Resize shape as width,height (optional, default: 320,480)" << std::endl; - std::cout << " -c, --crop 
Crop rectangle as x,y,w,h (optional, default: 50,150,400,300)" << std::endl; - std::cout << " -I, --interpolation Interpolation type: 0=NEAREST, 1=LINEAR, 2=CUBIC (optional, default: LINEAR)" << std::endl; - std::cout << " -C, --cpu Use CPU for execution (optional, default: GPU)" << std::endl; - std::cout << " -d, --device Device ID to use for execution (optional, default: 0)" << std::endl; - std::cout << " -h, --help Show this help message" << std::endl; + std::cout << "rocCV Crop and Resize Sample Application\n"; + std::cout << "----------------------------------------\n"; + std::cout << "This sample demonstrates how to set up a simple image processing pipeline using rocCV.\n"; + std::cout << "It shows reading a batch of images, cropping them to a specified rectangle,\n"; + std::cout << "resizing the cropped images to a target shape, and writing the outputs to image files.\n"; + std::cout << "You may select the device (CPU or GPU), interpolation type, input size, and more.\n"; + std::cout << '\n'; + std::cout << "Usage: " << programName << " -i [options]\n"; + std::cout << " -i, --input Input image or directory (required)\n"; + std::cout << "Options:\n"; + std::cout << " -o, --output Output image or directory (optional, default: output)\n"; + std::cout << " -r, --resize Resize shape as width,height (optional, default: 320,480)\n"; + std::cout << " -c, --crop Crop rectangle as x,y,w,h (optional, default: 50,150,400,300)\n"; + std::cout << " -I, --interpolation Interpolation type: 0=NEAREST, 1=LINEAR, 2=CUBIC (optional, default: LINEAR)\n"; + std::cout << " -C, --cpu Use CPU for execution (optional, default: GPU)\n"; + std::cout << " -d, --device Device ID to use for execution (optional, default: 0)\n"; + std::cout << " -h, --help Show this help message\n"; std::cout << std::endl; // clang-format on } From 370c3373e5d77c7b197c48cfb17340ab8e5ef5e9 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Tue, 6 Jan 2026 16:04:38 -0500 Subject: [PATCH 23/36] Fix tensor copies 
for benchmarking suite --- benchmarks/src/roccv/roccv_bench_helpers.cpp | 51 +++++++++++++++----- 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/benchmarks/src/roccv/roccv_bench_helpers.cpp b/benchmarks/src/roccv/roccv_bench_helpers.cpp index 3fbb42f1..82471b58 100644 --- a/benchmarks/src/roccv/roccv_bench_helpers.cpp +++ b/benchmarks/src/roccv/roccv_bench_helpers.cpp @@ -26,21 +26,48 @@ #include #include +struct MemcpyParams { + void* basePtr = nullptr; // Base pointer to the tensor data + size_t rowPitch = 0; // Number of bytes per row, including padding + size_t rowBytes = 0; // Number of bytes per row, not including padding + size_t imageBytes = 0; // Number of bytes per image, including padding (rowBytes * height) +}; + +/** + * @brief Gets the memcpy parameters for a tensor to perform a memcpy2D operation. + * + * @param tensor The tensor to get the memcpy parameters for. + * @return The memcpy parameters to perform a memcpy2D operation. + */ +inline MemcpyParams GetMemcpyParams(const roccv::Tensor& tensor) { + MemcpyParams params; + + roccv::TensorDataStrided tensorData = tensor.exportData(); + params.rowPitch = tensorData.stride(tensor.layout().height_index()); + params.rowBytes = tensor.shape(tensor.layout().width_index()) * tensor.shape(tensor.layout().channels_index()) * + tensor.dtype().size(); + params.imageBytes = params.rowPitch * tensor.shape(tensor.layout().height_index()); + params.basePtr = tensorData.basePtr(); + + return params; +} + template void MoveToTensor(const roccv::Tensor& tensor, const std::vector& vec) { - auto tensor_data = tensor.exportData(); - switch (tensor.device()) { - case eDeviceType::GPU: { - HIP_VALIDATE_NO_ERRORS(hipMemcpy(tensor_data.basePtr(), vec.data(), - tensor.shape().size() * tensor.dtype().size(), hipMemcpyHostToDevice)); - break; - } + const hipMemcpyKind kind = (tensor.device() == eDeviceType::GPU) ? 
hipMemcpyHostToDevice : hipMemcpyHostToHost; - case eDeviceType::CPU: { - HIP_VALIDATE_NO_ERRORS(hipMemcpy(tensor_data.basePtr(), vec.data(), - tensor.shape().size() * tensor.dtype().size(), hipMemcpyHostToHost)); - break; - } + if (tensor.isContiguous()) { + // Contiguous data, so we can use a simple memcpy. + const size_t totalBytes = tensor.dataSize(); + void* basePtr = tensor.exportData().basePtr(); + HIP_VALIDATE_NO_ERRORS(hipMemcpy(basePtr, vec.data(), totalBytes, kind)); + } else { + // Data is padded, so we need to use a memcpy2D. + const MemcpyParams params = GetMemcpyParams(tensor); + const size_t batchSize = tensor.shape(tensor.layout().batch_index()); + const size_t totalRows = batchSize * tensor.shape(tensor.layout().height_index()); + HIP_VALIDATE_NO_ERRORS(hipMemcpy2D(params.basePtr, params.rowPitch, vec.data(), params.rowBytes, + params.rowBytes, totalRows, kind)); } } From 3ee963622e5af0dc85aafc3658244a51614cba67 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 7 Jan 2026 12:05:48 -0500 Subject: [PATCH 24/36] Fix tensor padding calculations --- src/core/tensor.cpp | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index bfd5835a..e013c998 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -42,6 +42,21 @@ THE SOFTWARE. 
namespace roccv { +namespace { +int GetFirstPackedDimension(const TensorLayout& layout) { + const int rank = layout.rank(); + switch (layout.elayout()) { + case eTensorLayout::TENSOR_LAYOUT_NHWC: + case eTensorLayout::TENSOR_LAYOUT_LNHWC: + case eTensorLayout::TENSOR_LAYOUT_HWC: + case eTensorLayout::TENSOR_LAYOUT_NWC: + return std::max(0, rank - 2); + default: + return rank - 1; + } +} +} // namespace + // Constructor definitions Tensor::Tensor(const Tensor::Requirements& reqs, const IAllocator& alloc) : m_requirements(reqs) { m_data = std::make_shared(this->dataSize(), reqs.device, alloc); @@ -230,15 +245,20 @@ std::array Tensor::CalcStrides(const TensorShape int32_t rowAlign) { // Calculate strides based on the given tensor shape. Strides are byte-wise. std::array strides; + + const int firstPackedDim = GetFirstPackedDimension(shape.layout()); + strides[shape.layout().rank() - 1] = dtype.size(); for (int i = shape.layout().rank() - 2; i >= 0; i--) { - // Ensure strides for the row are padded to the next multiple of the alignment. - if (i == shape.layout().height_index()) { + // The stride dimension preceeding the first packed dimension is padded to the next multiple of the row + // alignment. + if (i == firstPackedDim - 1) { strides[i] = detail::AlignUp(strides[i + 1] * shape[i + 1], rowAlign); } else { strides[i] = strides[i + 1] * shape[i + 1]; } } + return strides; } From 8c0b2fbb016ccf399b88022dfc6979283caaed15 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 7 Jan 2026 13:44:13 -0500 Subject: [PATCH 25/36] Add documentation to helper function --- src/core/tensor.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index e013c998..46e7552d 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -43,6 +43,16 @@ THE SOFTWARE. namespace roccv { namespace { + +/* + * @brief Returns the index of the first packed dimension in the given tensor layout. 
+ * + * In most cases, the first packed dimension is the last dimension. However, for layouts ending in WC, the first packed + * dimension is the second to last dimension. + * + * @param[in] layout The tensor layout to get the first packed dimension for. + * @return The index of the first packed dimension in the given tensor layout. + */ int GetFirstPackedDimension(const TensorLayout& layout) { const int rank = layout.rank(); switch (layout.elayout()) { From 807396947759ffcc7b156617a51255853702c270 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 7 Jan 2026 14:38:27 -0500 Subject: [PATCH 26/36] Reuse reshape logic --- src/core/tensor.cpp | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index 46e7552d..0ab41b40 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -125,21 +125,7 @@ TensorData Tensor::exportData() const { } } -Tensor Tensor::reshape(const TensorShape& new_shape) const { - if (!isContiguous()) { - throw Exception("Tensor is not contiguous. 
Reshape can only be performed on contiguous tensors.", - eStatusType::INVALID_VALUE); - } - - // New tensor shape must have the same number of elements - if (new_shape.size() != this->shape().size()) { - throw Exception("New tensor shape does not match the number of elements of the old shape.", - eStatusType::INVALID_VALUE); - } - - Tensor::Requirements reqs = CalcRequirements(new_shape, this->dtype(), this->device()); - return Tensor(reqs, m_data); -} +Tensor Tensor::reshape(const TensorShape& new_shape) const { return this->reshape(new_shape, this->dtype()); } Tensor Tensor::reshape(const TensorShape& new_shape, const DataType& new_dtype) const { if (!isContiguous()) { From 97efadbffb687fcb51b5265a69a9c758e8d428e8 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Tue, 13 Jan 2026 14:57:12 -0500 Subject: [PATCH 27/36] Temp commit --- include/core/tensor.hpp | 11 +-- src/core/tensor.cpp | 70 ++++++++++++++++--- .../cpp/src/tests/core/tensor/test_tensor.cpp | 19 ++++- 3 files changed, 82 insertions(+), 18 deletions(-) diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index 6ca8c5cf..1a5b68fb 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -190,14 +190,15 @@ class Tensor { Tensor reshape(const TensorShape &new_shape) const; /** - * @brief Creates a vew of this tensor with a new shape, layout, and data type. The number of bytes allocated must - * match the original tensor. + * @brief Creates a view of this tensor with a new data type. * - * @param new_shape The new tensor shape. - * @param new_dtype The new data type of the underlying tensor data. + * This will attempt to reinterpret the tensor's base data type to the new data type by reshaping the tensor if + * necessary. If the reinterpretation is not possible, an exception will be thrown. + * + * @param newDtype The new data type of the underlying tensor data. 
* @return Tensor */ - Tensor reshape(const TensorShape &new_shape, const DataType &new_dtype) const; + Tensor reinterpret(const DataType &newDtype) const; /** * @brief Performs a shallow copy of the tensor (creates a view). diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index 0ab41b40..f806b134 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -53,7 +53,7 @@ namespace { * @param[in] layout The tensor layout to get the first packed dimension for. * @return The index of the first packed dimension in the given tensor layout. */ -int GetFirstPackedDimension(const TensorLayout& layout) { +static int GetFirstPackedDimension(const TensorLayout& layout) { const int rank = layout.rank(); switch (layout.elayout()) { case eTensorLayout::TENSOR_LAYOUT_NHWC: @@ -65,6 +65,61 @@ int GetFirstPackedDimension(const TensorLayout& layout) { return rank - 1; } } + +static bool ReshapeSimplified(int inRank, const std::array& inShape, + const std::array& inStrides, int targetRank, + const std::array& targetShape, + std::array& outStrides) { + int i = 0, j = 0; + for (; i < inRank && j < targetRank; i++) { + int64_t inE = inShape[i]; + int64_t outV = 1; + int group_start = j; + while (j < targetRank && (outV * targetShape[j]) <= inE) outV *= targetShape[j++]; + + if (outV != inE) { + return false; + } + + int64_t s = inStrides[i]; + for (int d = j - 1; d >= group_start; d--) { + outStrides[d] = s; + s *= targetShape[d]; + } + } + return true; +} + +static int Simplify(int rank, const std::array& shape, + const std::array& stride, + std::array& outShape, + std::array& outStrides) { + if (rank <= 1) { + if (rank == 1) { + outShape[0] = shape[0]; + outStrides[0] = stride[0]; + } + return rank; + } + + int outRank = 0; + int64_t vol = shape[0]; + for (int d = 1; d < rank; d++) { + if (stride[d - 1] != shape[d] * stride[d]) { + outStrides[outRank] = stride[d - 1]; + outShape[outRank] = vol; + vol = shape[d]; + outRank++; + } else { + vol *= shape[d]; + } + } + 
outStrides[outRank] = stride[rank - 1]; + outShape[outRank] = vol; + outRank++; + return outRank; +} + } // namespace // Constructor definitions @@ -125,19 +180,12 @@ TensorData Tensor::exportData() const { } } -Tensor Tensor::reshape(const TensorShape& new_shape) const { return this->reshape(new_shape, this->dtype()); } - -Tensor Tensor::reshape(const TensorShape& new_shape, const DataType& new_dtype) const { - if (!isContiguous()) { - throw Exception("Tensor is not contiguous. Reshape can only be performed on contiguous tensors.", - eStatusType::INVALID_VALUE); - } - - if (new_shape.size() * new_dtype.size() != this->shape().size() * this->dtype().size()) { +Tensor Tensor::reshape(const TensorShape& new_shape) const { + if (new_shape.size() * dtype().size() != this->shape().size() * this->dtype().size()) { throw Exception("New tensor view must have the same underlying number of bytes.", eStatusType::INVALID_VALUE); } - Tensor::Requirements reqs = CalcRequirements(new_shape, new_dtype, this->device()); + Tensor::Requirements reqs = CalcRequirements(new_shape, dtype(), this->device()); return Tensor(reqs, m_data); } diff --git a/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp b/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp index 16d8bd46..6e6da31a 100644 --- a/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp +++ b/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp @@ -76,10 +76,10 @@ void TestNegativeTensorShape() { * In both cases, the expected behavior is to throw an exception of type eStatusType::INVALID_VALUE. 
*/ void TestNegativeTensor() { - // Reshaping a non-contiguous tensor is invalid + // Test reshaping a tensor with mismatching number of elements { Tensor tensor(TensorShape({1, 2, 3}, "HWC"), DataType(DATA_TYPE_U8)); - EXPECT_EXCEPTION(tensor.reshape(TensorShape({1, 1, 2, 3}, "NHWC")), eStatusType::INVALID_VALUE); + EXPECT_EXCEPTION(tensor.reshape(TensorShape({1, 1, 2, 4}, "NHWC")), eStatusType::INVALID_VALUE); } } @@ -101,6 +101,20 @@ void TestTensorCorrectness() { EXPECT_EQ(tensor.shape().size(), 4 * 720 * 480 * 3); EXPECT_EQ(tensor.dtype().size(), 1); } +} + +void TestTensorReshapeCorrectness() { + { + Tensor tensor(TensorShape({1, 2, 3}, "HWC"), DataType(DATA_TYPE_U8)); + Tensor reshapedTensor = tensor.reshape(TensorShape({1, 1, 2, 3}, "NHWC")); + EXPECT_EQ(reshapedTensor.shape().size(), tensor.shape().size()); + EXPECT_EQ(reshapedTensor.rank(), 4); + + // Ensure they are sharing the same underlying data + auto data = tensor.exportData(); + auto dataReshaped = reshapedTensor.exportData(); + EXPECT_TRUE(data.basePtr() == dataReshaped.basePtr()); + } // Tensor reshape: Change layout and datatype { @@ -151,6 +165,7 @@ int main(int argc, char** argv) { // Correctness tests TEST_CASE(TestTensorCorrectness()); + // TEST_CASE(TestTensorReshapeCorrectness()); // Stride calculation tests // clang-format off From 6ded6bfe07c4997c5af4ffccc1fa9930a1237997 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Tue, 10 Mar 2026 11:25:41 -0400 Subject: [PATCH 28/36] Implement tensor reshape/reinterpret methods --- include/core/tensor.hpp | 22 ++++--- src/core/tensor.cpp | 64 ++++++++++++++++--- src/op_non_max_suppression.cpp | 10 +-- tests/roccv/cpp/include/test_helpers.hpp | 31 +++++---- .../cpp/src/tests/core/tensor/test_tensor.cpp | 14 +++- 5 files changed, 103 insertions(+), 38 deletions(-) diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index f68843bd..9ee7cd1e 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -190,23 +190,25 @@ class Tensor 
{ } /** - * @brief Creates a view of this tensor with a new shape and layout + * @brief Creates a view of this tensor with a new shape and layout, keeping the same data type. * - * @param[in] new_shape the new shape of the tensor - * @return Tensor + * @param[in] newShape The new shape of the tensor. + * @return A new tensor view with the given shape. */ - Tensor reshape(const TensorShape &new_shape) const; + Tensor reshape(const TensorShape &newShape) const; /** - * @brief Creates a view of this tensor with a new data type. + * @brief Creates a view of this tensor with a new data type and shape. * - * This will attempt to reinterpret the tensor's base data type to the new data type by reshaping the tensor if - * necessary. If the reinterpretation is not possible, an exception will be thrown. + * Reinterprets the tensor's underlying bytes with the given data type and shape. The total byte count + * (elements * dtype size) must match between the original and new view. Non-contiguous (padded) tensors + * are supported as long as the reshape is compatible with the stride structure. * - * @param newDtype The new data type of the underlying tensor data. - * @return Tensor + * @param[in] newDtype The new data type of the tensor elements. + * @param[in] newShape The new shape of the tensor. + * @return A new tensor view with the given data type and shape. */ - Tensor reinterpret(const DataType &newDtype) const; + Tensor reshape(const DataType &newDtype, const TensorShape &newShape) const; /** * @brief Performs a shallow copy of the tensor (creates a view). diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index 01aa5001..3aa131bb 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -44,7 +44,7 @@ namespace roccv { namespace { -/* +/** * @brief Returns the index of the first packed dimension in the given tensor layout. * * In most cases, the first packed dimension is the last dimension. 
However, for layouts ending in WC, the first packed @@ -90,14 +90,24 @@ static bool ReshapeSimplified(int inRank, const std::array& shape, - const std::array& stride, + const std::array& strides, std::array& outShape, std::array& outStrides) { if (rank <= 1) { if (rank == 1) { outShape[0] = shape[0]; - outStrides[0] = stride[0]; + outStrides[0] = strides[0]; } return rank; } @@ -105,8 +115,8 @@ static int Simplify(int rank, const std::array& int outRank = 0; int64_t vol = shape[0]; for (int d = 1; d < rank; d++) { - if (stride[d - 1] != shape[d] * stride[d]) { - outStrides[outRank] = stride[d - 1]; + if (strides[d - 1] != shape[d] * strides[d]) { + outStrides[outRank] = strides[d - 1]; outShape[outRank] = vol; vol = shape[d]; outRank++; @@ -114,7 +124,7 @@ static int Simplify(int rank, const std::array& vol *= shape[d]; } } - outStrides[outRank] = stride[rank - 1]; + outStrides[outRank] = strides[rank - 1]; outShape[outRank] = vol; outRank++; return outRank; @@ -182,12 +192,48 @@ TensorData Tensor::exportData() const { } } -Tensor Tensor::reshape(const TensorShape& new_shape) const { - if (new_shape.size() * dtype().size() != this->shape().size() * this->dtype().size()) { +Tensor Tensor::reshape(const TensorShape& newShape) const { return reshape(dtype(), newShape); } + +Tensor Tensor::reshape(const DataType& newDtype, const TensorShape& newShape) const { + if (newShape.size() * newDtype.size() != this->shape().size() * this->dtype().size()) { throw Exception("New tensor view must have the same underlying number of bytes.", eStatusType::INVALID_VALUE); } - Tensor::Requirements reqs = CalcRequirements(new_shape, dtype(), this->device()); + const int oldRank = m_requirements.rank; + const int newRank = newShape.layout().rank(); + + if (m_requirements.strides[oldRank - 1] != static_cast(dtype().size())) { + throw Exception("Cannot reshape tensor: innermost dimension is not element-contiguous.", + eStatusType::INVALID_VALUE); + } + + // Convert to a byte-level view by 
expanding the innermost dimension size + // by the element size and setting its stride to 1. + std::array byteShape = m_requirements.shape; + std::array byteStrides = m_requirements.strides; + byteShape[oldRank - 1] *= dtype().size(); + byteStrides[oldRank - 1] = 1; + + std::array simpleShape, simpleStrides; + int simpleRank = Simplify(oldRank, byteShape, byteStrides, simpleShape, simpleStrides); + + std::array targetByteShape = newShape.shape(); + targetByteShape[newRank - 1] *= newDtype.size(); + + std::array targetByteStrides; + bool result = + ReshapeSimplified(simpleRank, simpleShape, simpleStrides, newRank, targetByteShape, targetByteStrides); + if (!result) { + throw Exception("Cannot reshape tensor into requested shape and data type.", eStatusType::INVALID_VALUE); + } + + // The byte-level reshape produces stride 1 for the innermost dimension; + // scale it back to the new element size. + std::array newStrides = targetByteStrides; + newStrides[newRank - 1] = newDtype.size(); + + Tensor::Requirements reqs = + CalcRequirements(newShape, newDtype, newStrides, m_requirements.alignBytes, this->device()); return Tensor(reqs, m_data); } diff --git a/src/op_non_max_suppression.cpp b/src/op_non_max_suppression.cpp index 93c4f90c..28d21c7b 100644 --- a/src/op_non_max_suppression.cpp +++ b/src/op_non_max_suppression.cpp @@ -97,12 +97,12 @@ void NonMaximumSuppression::operator()(hipStream_t stream, const Tensor& input, // but typically involve going from non-vectorized to vectorized shapes. These shapes should be validated // beforehand. For example: an input tensor with shape and datatype [NWC, , S16] will be // reinterpreted as [NW, ]. In any case, the underlying data is structured the same. 
- Tensor inputReshaped = - input.reshape(TensorShape(TensorLayout(TENSOR_LAYOUT_NW), {numBatches, numBoxes}), DataType(DATA_TYPE_4S16)); + Tensor inputReshaped = input.reshape(DataType(DATA_TYPE_4S16), + TensorShape(TensorLayout(TENSOR_LAYOUT_NW), {numBatches, numBoxes})); Tensor outputReshaped = - output.reshape(TensorShape(TensorLayout(TENSOR_LAYOUT_NW), {numBatches, numBoxes}), DataType(DATA_TYPE_U8)); - Tensor scoresReshaped = - scores.reshape(TensorShape(TensorLayout(TENSOR_LAYOUT_NW), {numBatches, numBoxes}), DataType(DATA_TYPE_F32)); + output.reshape(DataType(DATA_TYPE_U8), TensorShape(TensorLayout(TENSOR_LAYOUT_NW), {numBatches, numBoxes})); + Tensor scoresReshaped = scores.reshape(DataType(DATA_TYPE_F32), + TensorShape(TensorLayout(TENSOR_LAYOUT_NW), {numBatches, numBoxes})); // Launch nms kernel switch (device) { diff --git a/tests/roccv/cpp/include/test_helpers.hpp b/tests/roccv/cpp/include/test_helpers.hpp index e566ed3c..7fd9cf8f 100644 --- a/tests/roccv/cpp/include/test_helpers.hpp +++ b/tests/roccv/cpp/include/test_helpers.hpp @@ -392,30 +392,37 @@ void CompareVectorsNear(const std::vector& result, const std::vector& ref, * If no padding, returns (total_size, 1, total_size) */ inline std::tuple ComputeCopyParams(const Tensor& tensor) { - auto tensorData = tensor.exportData(); - const auto& layout = tensor.layout(); - int heightIdx = layout.height_index(); - - if (heightIdx < 0) { - // No height dimension = contiguous data, no padding + if (tensor.isContiguous()) { size_t totalSize = tensor.shape().size() * tensor.dtype().size(); return {totalSize, 1, totalSize}; } - // Row width = product of all dimensions AFTER height_index × dtype size + auto tensorData = tensor.exportData(); + + // Find the padded dimension: the outermost dimension whose stride exceeds + // the packed product of the next dimension's shape and stride. 
+ int paddedDim = 0; + for (int i = 0; i < tensor.rank() - 1; i++) { + if (tensorData.stride(i) != tensor.shape(i + 1) * tensorData.stride(i + 1)) { + paddedDim = i; + break; + } + } + + // Row width = product of all dimensions AFTER paddedDim × dtype size size_t rowWidth = tensor.dtype().size(); - for (int i = heightIdx + 1; i < layout.rank(); ++i) { + for (int i = paddedDim + 1; i < tensor.rank(); ++i) { rowWidth *= tensor.shape(i); } - // Number of rows = product of all dimensions UP TO AND INCLUDING height_index + // Number of rows = product of all dimensions UP TO AND INCLUDING paddedDim size_t numRows = 1; - for (int i = 0; i <= heightIdx; ++i) { + for (int i = 0; i <= paddedDim; ++i) { numRows *= tensor.shape(i); } - // Tensor pitch comes from the stride at height_index - size_t tensorPitch = tensorData.stride(heightIdx); + // Tensor pitch comes from the stride at the padded dimension + size_t tensorPitch = tensorData.stride(paddedDim); return {rowWidth, numRows, tensorPitch}; } diff --git a/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp b/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp index 6e6da31a..86e46fe2 100644 --- a/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp +++ b/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp @@ -67,6 +67,15 @@ void TestNegativeTensorShape() { } } +/** + * @brief Negative tests related to Tensor reshape. + * + */ +void TestNegativeTensorReshape() { + Tensor tensor(TensorShape({1, 2, 3}, "HWC"), DataType(DATA_TYPE_U8)); + EXPECT_EXCEPTION(tensor.reshape(TensorShape({1, 1, 2, 4}, "NHWC")), eStatusType::INVALID_VALUE); +} + /** * @brief Negative tests for the Tensor class, verifying error handling in invalid scenarios. 
* @@ -119,7 +128,7 @@ void TestTensorReshapeCorrectness() { // Tensor reshape: Change layout and datatype { Tensor tensor(TensorShape({1, 5, 4}, "NWC"), DataType(DATA_TYPE_S16)); - Tensor reshapedTensor = tensor.reshape(TensorShape({1, 5}, "NW"), DataType(DATA_TYPE_4S16)); + Tensor reshapedTensor = tensor.reshape(DataType(DATA_TYPE_4S16), TensorShape({1, 5}, "NW")); EXPECT_NE(reshapedTensor.shape().size(), tensor.shape().size()); EXPECT_NE(reshapedTensor.rank(), tensor.rank()); EXPECT_EQ(reshapedTensor.rank(), 2); @@ -162,10 +171,11 @@ int main(int argc, char** argv) { // Negative tests TEST_CASE(TestNegativeTensorShape()); TEST_CASE(TestNegativeTensor()); + TEST_CASE(TestNegativeTensorReshape()); // Correctness tests TEST_CASE(TestTensorCorrectness()); - // TEST_CASE(TestTensorReshapeCorrectness()); + TEST_CASE(TestTensorReshapeCorrectness()); // Stride calculation tests // clang-format off From fb1cb4407c7859a0e8425d206fa831f780e9362a Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Tue, 10 Mar 2026 11:26:09 -0400 Subject: [PATCH 29/36] Remove unused MemcpyParams from benchmark helpers --- benchmarks/src/roccv/roccv_bench_helpers.cpp | 26 -------------------- 1 file changed, 26 deletions(-) diff --git a/benchmarks/src/roccv/roccv_bench_helpers.cpp b/benchmarks/src/roccv/roccv_bench_helpers.cpp index 48d2022c..f6c1c3b2 100644 --- a/benchmarks/src/roccv/roccv_bench_helpers.cpp +++ b/benchmarks/src/roccv/roccv_bench_helpers.cpp @@ -82,32 +82,6 @@ class RandomGenerator { rocrand_generator m_gen; }; -struct MemcpyParams { - void* basePtr = nullptr; // Base pointer to the tensor data - size_t rowPitch = 0; // Number of bytes per row, including padding - size_t rowBytes = 0; // Number of bytes per row, not including padding - size_t imageBytes = 0; // Number of bytes per image, including padding (rowBytes * height) -}; - -/** - * @brief Gets the memcpy parameters for a tensor to perform a memcpy2D operation. 
- * - * @param tensor The tensor to get the memcpy parameters for. - * @return The memcpy parameters to perform a memcpy2D operation. - */ -inline MemcpyParams GetMemcpyParams(const roccv::Tensor& tensor) { - MemcpyParams params; - - roccv::TensorDataStrided tensorData = tensor.exportData(); - params.rowPitch = tensorData.stride(tensor.layout().height_index()); - params.rowBytes = tensor.shape(tensor.layout().width_index()) * tensor.shape(tensor.layout().channels_index()) * - tensor.dtype().size(); - params.imageBytes = params.rowPitch * tensor.shape(tensor.layout().height_index()); - params.basePtr = tensorData.basePtr(); - - return params; -} - template void FillTensorImpl(const roccv::Tensor& tensor) { RandomGenerator generator(tensor.device()); From a14435e0cae1e9da4423d49b8f71c48e06f6fab8 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Tue, 10 Mar 2026 12:44:31 -0400 Subject: [PATCH 30/36] Add copyFromHost/copyToHost method definitions for Tensor --- include/core/tensor.hpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index 9ee7cd1e..f5ac3374 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -236,6 +236,24 @@ class Tensor { */ bool isContiguous() const; + /** + * @brief Copies data from a host pointer to the tensor. Host memory must be contiguous. This is a non-blocking + * operation, synchronized to the given stream. + * + * @param[in] src The source host pointer. + * @param[in] stream The stream to use for the copy. + */ + void copyFromHost(const void *src, hipStream_t stream = nullptr); + + /** + * @brief Copies data from the tensor to a host pointer. Host memory will be contiguous. This is a non-blocking + * operation, synchronized to the given stream. + * + * @param[out] dst The destination host pointer. Must be preallocated to the correct size. + * @param[in] stream The stream to use for the copy. 
+ */ + void copyToHost(void *dst, hipStream_t stream = nullptr) const noexcept; + /** * @brief Calculates tensor requirements using the default memory alignment strategy. * From dd11f3cd9f5ab1a4f72e09bf566df54e08c58918 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Tue, 10 Mar 2026 13:12:02 -0400 Subject: [PATCH 31/36] Use copyTo/copyFromHost implementations for test helpers --- include/core/tensor.hpp | 4 +- src/core/tensor.cpp | 60 ++++++++++++++++++++++++ tests/roccv/cpp/include/test_helpers.hpp | 22 ++------- 3 files changed, 66 insertions(+), 20 deletions(-) diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index f5ac3374..5722019a 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -243,7 +243,7 @@ class Tensor { * @param[in] src The source host pointer. * @param[in] stream The stream to use for the copy. */ - void copyFromHost(const void *src, hipStream_t stream = nullptr); + void copyFromHost(const void *src, hipStream_t stream = nullptr) const; /** * @brief Copies data from the tensor to a host pointer. Host memory will be contiguous. This is a non-blocking @@ -252,7 +252,7 @@ class Tensor { * @param[out] dst The destination host pointer. Must be preallocated to the correct size. * @param[in] stream The stream to use for the copy. */ - void copyToHost(void *dst, hipStream_t stream = nullptr) const noexcept; + void copyToHost(void *dst, hipStream_t stream = nullptr) const; /** * @brief Calculates tensor requirements using the default memory alignment strategy. diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index 3aa131bb..8e517578 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -130,6 +130,44 @@ static int Simplify(int rank, const std::array& return outRank; } +/** + * @brief Computes copy parameters for host-tensor copy. + * @return (row_width_bytes, num_rows, tensor_pitch). If contiguous, returns (total_size, 1, total_size). 
+ */ +static std::tuple ComputeCopyParams(int rank, + const std::array& shape, + const std::array& strides, + size_t dtypeSize, bool contiguous) { + if (contiguous) { + size_t totalSize = dtypeSize; + for (int i = 0; i < rank; ++i) { + totalSize *= static_cast(shape[i]); + } + return {totalSize, 1, totalSize}; + } + + int paddedDim = 0; + for (int i = 0; i < rank - 1; i++) { + if (strides[i] != shape[i + 1] * strides[i + 1]) { + paddedDim = i; + break; + } + } + + size_t rowWidth = dtypeSize; + for (int i = paddedDim + 1; i < rank; ++i) { + rowWidth *= static_cast(shape[i]); + } + + size_t numRows = 1; + for (int i = 0; i <= paddedDim; ++i) { + numRows *= static_cast(shape[i]); + } + + size_t tensorPitch = static_cast(strides[paddedDim]); + return {rowWidth, numRows, tensorPitch}; +} + } // namespace // Constructor definitions @@ -247,6 +285,28 @@ size_t Tensor::dataSize() const { return m_requirements.strides[0] * m_requireme bool Tensor::isContiguous() const { return dataSize() == shape().size() * dtype().size(); } +void Tensor::copyFromHost(const void* src, hipStream_t stream) const { + auto [rowWidth, numRows, tensorPitch] = ComputeCopyParams(m_requirements.rank, m_requirements.shape, + m_requirements.strides, dtype().size(), isContiguous()); + + const size_t srcPitch = rowWidth; + hipMemcpyKind kind = (device() == eDeviceType::GPU) ? hipMemcpyHostToDevice : hipMemcpyHostToHost; + + HIP_VALIDATE_NO_ERRORS( + hipMemcpy2DAsync(m_data->data(), tensorPitch, src, srcPitch, rowWidth, numRows, kind, stream)); +} + +void Tensor::copyToHost(void* dst, hipStream_t stream) const { + auto [rowWidth, numRows, tensorPitch] = ComputeCopyParams(m_requirements.rank, m_requirements.shape, + m_requirements.strides, dtype().size(), isContiguous()); + + const size_t dstPitch = rowWidth; + hipMemcpyKind kind = (device() == eDeviceType::GPU) ? 
hipMemcpyDeviceToHost : hipMemcpyHostToHost; + + HIP_VALIDATE_NO_ERRORS( + hipMemcpy2DAsync(dst, dstPitch, m_data->data(), tensorPitch, rowWidth, numRows, kind, stream)); +} + Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, eDeviceType device) { return CalcRequirements(shape, dtype, (MemAlignment){}, device); } diff --git a/tests/roccv/cpp/include/test_helpers.hpp b/tests/roccv/cpp/include/test_helpers.hpp index 7fd9cf8f..309d61a3 100644 --- a/tests/roccv/cpp/include/test_helpers.hpp +++ b/tests/roccv/cpp/include/test_helpers.hpp @@ -436,15 +436,8 @@ inline std::tuple ComputeCopyParams(const Tensor& tensor */ template void CopyVectorIntoTensor(const Tensor& dst, const std::vector& src) { - auto tensorData = dst.exportData(); - auto [rowWidth, numRows, dstPitch] = ComputeCopyParams(dst); - - // Source is always contiguous - size_t srcPitch = rowWidth; - - hipMemcpyKind kind = (dst.device() == eDeviceType::GPU) ? hipMemcpyHostToDevice : hipMemcpyHostToHost; - - HIP_VALIDATE_NO_ERRORS(hipMemcpy2D(tensorData.basePtr(), dstPitch, src.data(), srcPitch, rowWidth, numRows, kind)); + dst.copyFromHost(src.data(), nullptr); + HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(nullptr)); } /** @@ -460,15 +453,8 @@ void CopyVectorIntoTensor(const Tensor& dst, const std::vector& src) { */ template void CopyTensorIntoVector(std::vector& dst, const Tensor& src) { - auto tensorData = src.exportData(); - auto [rowWidth, numRows, srcPitch] = ComputeCopyParams(src); - - // Destination is always contiguous - size_t dstPitch = rowWidth; - - hipMemcpyKind kind = (src.device() == eDeviceType::GPU) ? hipMemcpyDeviceToHost : hipMemcpyHostToHost; - - HIP_VALIDATE_NO_ERRORS(hipMemcpy2D(dst.data(), dstPitch, tensorData.basePtr(), srcPitch, rowWidth, numRows, kind)); + src.copyToHost(dst.data(), nullptr); + HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(nullptr)); } /** * @brief Computes the strides for a tensor with a given shape and data type. 
From 8ffcc95c2f1bf04741263c00835c21613547ef8b Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Tue, 10 Mar 2026 14:01:59 -0400 Subject: [PATCH 32/36] Add tensor copy correctness tests --- .../cpp/src/tests/core/tensor/test_tensor.cpp | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp b/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp index 86e46fe2..6b543b3e 100644 --- a/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp +++ b/tests/roccv/cpp/src/tests/core/tensor/test_tensor.cpp @@ -19,6 +19,8 @@ * THE SOFTWARE. */ +#include + #include #include @@ -140,6 +142,31 @@ void TestTensorReshapeCorrectness() { } } +/** + * @brief Tests the correctness of the copyFromHost and copyToHost methods. + * + */ +void TestTensorCopyCorrectness() { + Tensor tensor(2, {10, 10}, FMT_RGB8, eDeviceType::GPU); + const size_t hostDataSize = tensor.shape().size() * tensor.dtype().size(); + std::vector inputDataHost(hostDataSize); + for (size_t i = 0; i < inputDataHost.size(); i++) { + inputDataHost[i] = static_cast(i % 256); + } + + hipStream_t stream; + HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); + + tensor.copyFromHost(inputDataHost.data(), stream); + std::vector outputDataHost(hostDataSize); + tensor.copyToHost(outputDataHost.data(), stream); + + HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); + HIP_VALIDATE_NO_ERRORS(hipStreamDestroy(stream)); + + EXPECT_VECTOR_EQ(inputDataHost, outputDataHost); +} + /** * @brief Tests internal stride calculations on Tensor construction. 
*/ @@ -176,6 +203,7 @@ int main(int argc, char** argv) { // Correctness tests TEST_CASE(TestTensorCorrectness()); TEST_CASE(TestTensorReshapeCorrectness()); + TEST_CASE(TestTensorCopyCorrectness()); // Stride calculation tests // clang-format off From 1ac14a97d705832904bd2baa2bade338e846c32c Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Tue, 10 Mar 2026 14:15:24 -0400 Subject: [PATCH 33/36] Properly convert element-wise strides to byte-wise strides on DLPack --- python/src/py_tensor.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/src/py_tensor.cpp b/python/src/py_tensor.cpp index ab784347..8598e2eb 100644 --- a/python/src/py_tensor.cpp +++ b/python/src/py_tensor.cpp @@ -131,7 +131,8 @@ std::shared_ptr PyTensor::fromDLPack(pybind11::object src, eTensorLayo stridesData = roccv::Tensor::CalcStrides(shape, roccv::DataType(dtype), 0); } else { for (int i = 0; i < dlTensor.ndim; ++i) { - stridesData[i] = dlTensor.strides[i]; + // DLTensor strides are element-wise. Convert from element-wise to byte-wise. + stridesData[i] = dlTensor.strides[i] * roccv::DataType(dtype).size(); } } From add8a66659d0fb0ca5a1f935c9faa255df5f8d92 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Tue, 10 Mar 2026 14:17:11 -0400 Subject: [PATCH 34/36] Add byte_offset from DLPack to rocCV tensor base pointer --- python/src/py_tensor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/src/py_tensor.cpp b/python/src/py_tensor.cpp index 8598e2eb..28bf385d 100644 --- a/python/src/py_tensor.cpp +++ b/python/src/py_tensor.cpp @@ -141,8 +141,8 @@ std::shared_ptr PyTensor::fromDLPack(pybind11::object src, eTensorLayo roccv::Tensor::CalcRequirements(shape, roccv::DataType(dtype), stridesData, 0, device); // Since this tensor is coming from a DLPack, we don't own the data, so we need to create a view of the data. 
- std::shared_ptr data = - std::make_shared(dlTensor.data, device, eOwnership::VIEW); + std::shared_ptr data = std::make_shared( + static_cast(dlTensor.data) + dlTensor.byte_offset, device, eOwnership::VIEW); std::shared_ptr tensor = std::make_shared(reqs, data); // Instantiate a new tensor and a PyTensor to wrap it, binding the original DLManagedTensor From 2affddbb4b08387003b14bdd943490ecd483750b Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 11 Mar 2026 11:28:57 -0400 Subject: [PATCH 35/36] Add sensible default alignment for CPU allocated tensors --- include/core/mem_alignment.hpp | 2 + src/core/tensor.cpp | 76 +++++++++++++++++++++------------- 2 files changed, 50 insertions(+), 28 deletions(-) diff --git a/include/core/mem_alignment.hpp b/include/core/mem_alignment.hpp index 4aceceba..76a0d9f0 100644 --- a/include/core/mem_alignment.hpp +++ b/include/core/mem_alignment.hpp @@ -25,6 +25,8 @@ namespace roccv { +constexpr int32_t ROCCV_CPU_DEFAULT_ALIGNMENT = 64; // Default alignment for CPU memory + /** * @class MemAlignment * @brief Class for specifying memory alignment constraints for buffer allocations. diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index 8e517578..f95cf811 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -168,6 +168,51 @@ static std::tuple ComputeCopyParams(int rank, return {rowWidth, numRows, tensorPitch}; } +/** + * @brief Computes the memory alignment for a tensor based on the device, data type, and user provided buffer alignment. + * @param[in] device The device the tensor is to be allocated on. + * @param[in] dtype The datatype of the tensor. + * @param[in] bufAlign The memory alignment to use. + * @return The memory alignment for the tensor. 
+ */ +MemAlignment ComputeMemAlignment(eDeviceType device, const DataType& dtype, const MemAlignment& bufAlign) { + int dev = 0; + if (device == eDeviceType::GPU) { + HIP_VALIDATE_NO_ERRORS(hipGetDevice(&dev)); + } + + int rowAlign; + if (bufAlign.rowAddr() == 0) { + if (device == eDeviceType::GPU) { + HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&rowAlign, hipDeviceAttributeTexturePitchAlignment, dev)); + } else { + rowAlign = ROCCV_CPU_DEFAULT_ALIGNMENT; + } + rowAlign = std::lcm(rowAlign, detail::NextPowerOfTwo(dtype.size())); + } else { + if (!detail::IsPowerOfTwo(bufAlign.rowAddr())) { + throw Exception("Row address alignment must be a power of two.", eStatusType::INVALID_VALUE); + } + rowAlign = std::lcm(bufAlign.rowAddr(), detail::NextPowerOfTwo(dtype.size())); + } + + int baseAlign; + if (bufAlign.baseAddr() == 0) { + if (device == eDeviceType::GPU) { + HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&baseAlign, hipDeviceAttributeTextureAlignment, dev)); + } else { + baseAlign = ROCCV_CPU_DEFAULT_ALIGNMENT; + } + baseAlign = std::lcm(baseAlign, detail::NextPowerOfTwo(dtype.size())); + } else { + if (!detail::IsPowerOfTwo(bufAlign.baseAddr())) { + throw Exception("Base address alignment must be a power of two.", eStatusType::INVALID_VALUE); + } + baseAlign = std::lcm(bufAlign.baseAddr(), detail::NextPowerOfTwo(dtype.size())); + } + + return MemAlignment().baseAddr(baseAlign).rowAddr(rowAlign); +} } // namespace // Constructor definitions @@ -313,35 +358,10 @@ Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const Da Tensor::Requirements Tensor::CalcRequirements(const TensorShape& shape, const DataType& dtype, const MemAlignment& bufAlign, eDeviceType device) { - int dev; - HIP_VALIDATE_NO_ERRORS(hipGetDevice(&dev)); - - // Validate memory alignment, set default alignment if set to 0. - // TODO: Must be supported for CPU as well. 
- int rowAlign; - if (bufAlign.rowAddr() == 0) { - HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&rowAlign, hipDeviceAttributeTexturePitchAlignment, dev)); - rowAlign = std::lcm(rowAlign, detail::NextPowerOfTwo(dtype.size())); - } else { - if (!detail::IsPowerOfTwo(bufAlign.rowAddr())) { - throw Exception("Row address alignment must be a power of two.", eStatusType::INVALID_VALUE); - } - rowAlign = std::lcm(bufAlign.rowAddr(), detail::NextPowerOfTwo(dtype.size())); - } - - int baseAlign; - if (bufAlign.baseAddr() == 0) { - HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&baseAlign, hipDeviceAttributeTextureAlignment, dev)); - baseAlign = std::lcm(baseAlign, detail::NextPowerOfTwo(dtype.size())); - } else { - if (!detail::IsPowerOfTwo(bufAlign.baseAddr())) { - throw Exception("Base address alignment must be a power of two.", eStatusType::INVALID_VALUE); - } - baseAlign = std::lcm(bufAlign.baseAddr(), detail::NextPowerOfTwo(dtype.size())); - } + MemAlignment newAlign = ComputeMemAlignment(device, dtype, bufAlign); - std::array strides = CalcStrides(shape, dtype, rowAlign); - Tensor::Requirements reqs = CalcRequirements(shape, dtype, strides, baseAlign, device); + std::array strides = CalcStrides(shape, dtype, newAlign.rowAddr()); + Tensor::Requirements reqs = CalcRequirements(shape, dtype, strides, newAlign.baseAddr(), device); return reqs; } From ea91e85f55a29cfb5d14d6fb68a861bf346b72da Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 23 Mar 2026 18:35:45 -0400 Subject: [PATCH 36/36] Address PR comments --- include/core/tensor.hpp | 2 ++ samples/bilateral_filter.cpp | 2 +- samples/common/utils.hpp | 28 ++++++++++++++++++---------- samples/composite.cpp | 1 + samples/cropandresize/cpp/main.cpp | 2 +- 5 files changed, 23 insertions(+), 12 deletions(-) diff --git a/include/core/tensor.hpp b/include/core/tensor.hpp index 5722019a..7f2241a1 100644 --- a/include/core/tensor.hpp +++ b/include/core/tensor.hpp @@ -21,6 +21,8 @@ THE SOFTWARE. 
*/ #pragma once +#include + #include #include diff --git a/samples/bilateral_filter.cpp b/samples/bilateral_filter.cpp index 04364f5f..fdfd11d0 100644 --- a/samples/bilateral_filter.cpp +++ b/samples/bilateral_filter.cpp @@ -57,7 +57,7 @@ void PrintUsage(const char* programName) { std::cout << " -D, --diameter Diameter of the filtering area (optional, default: 2)" << std::endl; std::cout << " -s, --sigma_space Spatial parameter sigma of the Gaussian function (optional, default: 2.0f)" << std::endl; std::cout << " -c, --sigma_color Range parameter sigma of the Gaussian function (optional, default: 10.0f)" << std::endl; - std::cout << " -B, --border_mode Border mode at image boundary when work pixels are outside of the image (optional, default: 1 (replicate))" << std::endl; + std::cout << " -b, --border_mode Border mode at image boundary when work pixels are outside of the image (optional, default: 1 (replicate))" << std::endl; std::cout << " -B, --border_color Border color for constant color border mode (optional, default: 0,0,0,0)" << std::endl; std::cout << " -C, --cpu Use CPU for execution (optional, default: GPU)" << std::endl; std::cout << " -h, --help Show this help message" << std::endl; diff --git a/samples/common/utils.hpp b/samples/common/utils.hpp index bd0b806f..817cac9d 100644 --- a/samples/common/utils.hpp +++ b/samples/common/utils.hpp @@ -23,6 +23,7 @@ #include #include +#include #include inline void CheckHIPError(hipError_t code, const char *file, const int line) { @@ -97,24 +98,31 @@ inline roccv::Tensor LoadImages(hipStream_t stream, const std::string &image_pat if (std::filesystem::is_directory(image_path)) { for (auto file : std::filesystem::directory_iterator(image_path)) { if (!std::filesystem::is_directory(file.path()) && ContainsExtension(file.path(), supportedExtensions)) { - images.push_back(cv::imread(file.path(), openCVFlags)); + cv::Mat image = cv::imread(file.path(), openCVFlags); + if (image.empty()) { + throw 
std::runtime_error("Cannot decode " + file.path().string() + ". File type not supported.\n"); + } + images.push_back(image); // Check if all images are of the same size if (width == -1 && height == -1 && channels == -1) { - width = images.back().cols; - height = images.back().rows; - channels = images.back().channels(); - } else if (images.back().cols != width || images.back().rows != height || - images.back().channels() != channels) { + width = image.cols; + height = image.rows; + channels = image.channels(); + } else if (image.cols != width || image.rows != height || image.channels() != channels) { throw std::runtime_error("All images must be of the same size and format"); } } } } else if (std::filesystem::is_regular_file(image_path) && ContainsExtension(image_path, supportedExtensions)) { - images.push_back(cv::imread(image_path)); - width = images.back().cols; - height = images.back().rows; - channels = images.back().channels(); + cv::Mat image = cv::imread(image_path, openCVFlags); + if (image.empty()) { + throw std::runtime_error("Cannot decode " + image_path + ". File type not supported.\n"); + } + images.push_back(image); + width = image.cols; + height = image.rows; + channels = image.channels(); } else { throw std::runtime_error("Cannot decode " + image_path + ". 
File type not supported.\n"); } diff --git a/samples/composite.cpp b/samples/composite.cpp index 135dd9f2..bd21948f 100644 --- a/samples/composite.cpp +++ b/samples/composite.cpp @@ -93,6 +93,7 @@ int main(int argc, char** argv) { break; case 'd': config.deviceId = std::stoi(optarg); + break; case 'h': PrintUsage(argv[0]); return EXIT_SUCCESS; diff --git a/samples/cropandresize/cpp/main.cpp b/samples/cropandresize/cpp/main.cpp index f2a9c939..eec6532b 100644 --- a/samples/cropandresize/cpp/main.cpp +++ b/samples/cropandresize/cpp/main.cpp @@ -174,7 +174,7 @@ int main(int argc, char** argv) { // Create tensor for the resized image Tensor resizedTensor = - Tensor(TensorShape(input.layout(), {batchSize, config.resizeShape.w, config.resizeShape.h, channels}), + Tensor(TensorShape(input.layout(), {batchSize, config.resizeShape.h, config.resizeShape.w, channels}), input.dtype(), config.device); // Create crop and resize operators