From d6dc01d0e768f7535ea7e9b20050fa159c8695d6 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 4 Nov 2024 15:22:13 -0800 Subject: [PATCH 01/39] Update build config to pull CUDASTF --- CMakeLists.txt | 6 ++++-- cmake/versions.json | 5 ++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7dd94acd..aa6514b7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,16 +119,18 @@ if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.5) message(FATAL_ERROR "MatX requires CUDA 11.5 or higher. Please update before using.") endif() +set(CCCL_ENABLE_UNSTABLE ON) message(STATUS "Finding CCCL...") rapids_cpm_cccl( BUILD_EXPORT_SET matx-exports INSTALL_EXPORT_SET matx-exports ) -target_link_libraries(matx INTERFACE CCCL::CCCL) +target_link_libraries(matx INTERFACE CCCL::CCCL CCCL::cudax) +set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --extended-lambda) # Set flags for compiling tests faster -set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --threads 0 -ftemplate-backtrace-limit=0) +set(MATX_CUDA_FLAGS ${MATX_CUDA_FLAGS} --threads 0 -ftemplate-backtrace-limit=0) # Hack because CMake doesn't have short circult evaluation if (NOT CMAKE_BUILD_TYPE OR "${CMAKE_BUILD_TYPE}" STREQUAL "Debug") diff --git a/cmake/versions.json b/cmake/versions.json index b7d1a877..e6eba5b3 100644 --- a/cmake/versions.json +++ b/cmake/versions.json @@ -1,10 +1,9 @@ { "packages": { "CCCL": { - "version": "2.7.0-rc2", - "git_shallow": true, + "version": "2.8.0", "git_url": "https://github.com/NVIDIA/cccl.git", - "git_tag": "10e915ac7b79a1ab3b9d7a795c621b47b122f513" + "git_tag": "cb1fce5e1cb7362940bd7e74ab8fbf01942b6264" } } } From 245b20f03a5522ec6ee2809a0dd9be23235c2386 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 4 Nov 2024 15:42:12 -0800 Subject: [PATCH 02/39] remove const expr --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index aa6514b7..7e9283e6 100644 
--- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -127,7 +127,7 @@ rapids_cpm_cccl( ) target_link_libraries(matx INTERFACE CCCL::CCCL CCCL::cudax) -set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --extended-lambda) +set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --extended-lambda) # Set flags for compiling tests faster set(MATX_CUDA_FLAGS ${MATX_CUDA_FLAGS} --threads 0 -ftemplate-backtrace-limit=0) From 9b35ec844ff5ee59b48cb2605328a9fa44f362ef Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 4 Nov 2024 22:58:34 -0800 Subject: [PATCH 03/39] Updates to get basic cudastf functionality working with matx --- CMakeLists.txt | 2 +- examples/simple_radar_pipeline.cu | 14 ++++ examples/simple_radar_pipeline.h | 6 ++ include/matx/core/operator_utils.h | 2 +- include/matx/core/tensor.h | 51 ++++++++++--- include/matx/core/tensor_impl.h | 87 +++++++++++++++++++++-- include/matx/core/type_utils.h | 10 +++ include/matx/core/utils.h | 24 +++++++ include/matx/generators/generator1d.h | 3 + include/matx/operators/binary_operators.h | 12 ++++ include/matx/operators/constval.h | 3 + include/matx/operators/conv.h | 75 +++++++++++++++---- include/matx/operators/fft.h | 74 +++++++++++++++---- include/matx/operators/matmul.h | 40 +++++++++-- include/matx/operators/matvec.h | 28 +++++++- include/matx/operators/permute.h | 8 +++ include/matx/operators/set.h | 14 ++++ include/matx/operators/sum.h | 28 +++++++- include/matx/operators/unary_operators.h | 6 ++ 19 files changed, 438 insertions(+), 49 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e9283e6..aa6514b7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -127,7 +127,7 @@ rapids_cpm_cccl( ) target_link_libraries(matx INTERFACE CCCL::CCCL CCCL::cudax) -set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --extended-lambda) +set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --extended-lambda) # Set flags for compiling tests faster set(MATX_CUDA_FLAGS ${MATX_CUDA_FLAGS} --threads 0 
-ftemplate-backtrace-limit=0) diff --git a/examples/simple_radar_pipeline.cu b/examples/simple_radar_pipeline.cu index a89cb1b1..10190d5a 100644 --- a/examples/simple_radar_pipeline.cu +++ b/examples/simple_radar_pipeline.cu @@ -76,6 +76,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) pipelines[s]->sync(); } + /* Get STF context handle */ + auto ctx = pipelines[0]->exec.getCtx(); + MATX_NVTX_START_RANGE("Pipeline Test", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 2) printf("Running test...\n"); @@ -114,7 +117,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) for (uint32_t i = 0; i < iterations; i++) { for (int s = 0; s < num_streams; s++) { if (i == 1) { +#if 0 cudaEventRecord(starts[s], streams[s]); +#else + cudaEventRecord(starts[s], ctx.task_fence()); +#endif } if (ENABLE_GRAPHS) { @@ -127,9 +134,16 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } for (int s = 0; s < num_streams; s++) { +#if 0 cudaEventRecord(stops[s], streams[s]); +#else + cudaEventRecord(stops[s], ctx.task_fence()); +#endif pipelines[s]->sync(); } + + ctx.finalize(); + MATX_NVTX_END_RANGE(2) MATX_NVTX_START_RANGE("Pipeline Results", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 3) diff --git a/examples/simple_radar_pipeline.h b/examples/simple_radar_pipeline.h index 0f52ccf2..87e19f94 100644 --- a/examples/simple_radar_pipeline.h +++ b/examples/simple_radar_pipeline.h @@ -465,5 +465,11 @@ class RadarPipeline { tensor_t cfarMaskView; cudaStream_t stream; +#if 0 cudaExecutor exec; +#else +public: + stfExecutor exec; +#endif + }; diff --git a/include/matx/core/operator_utils.h b/include/matx/core/operator_utils.h index 6f1f1f18..1a97a579 100644 --- a/include/matx/core/operator_utils.h +++ b/include/matx/core/operator_utils.h @@ -120,7 +120,7 @@ namespace matx { __MATX_HOST__ __MATX_INLINE__ void AllocateTempTensor(TensorType &tensor, Executor &&ex, ShapeType &&shape, typename TensorType::value_type **ptr) { const auto ttl_size = 
std::accumulate(shape.begin(), shape.end(), static_cast(1), std::multiplies()) * sizeof(typename TensorType::value_type); - if constexpr (is_cuda_executor_v) { + if constexpr (is_cuda_executor_v || is_stf_executor_v) { matxAlloc((void**)ptr, ttl_size, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); make_tensor(tensor, *ptr, shape); } diff --git a/include/matx/core/tensor.h b/include/matx/core/tensor.h index 82b7b62b..27a933c2 100644 --- a/include/matx/core/tensor.h +++ b/include/matx/core/tensor.h @@ -91,6 +91,7 @@ class tensor_t : public detail::tensor_impl_t { using stride_container = typename Desc::stride_container; using desc_type = Desc; ///< Descriptor type trait using self_type = tensor_t; + using stf_logicaldata_type = typename cuda::experimental::stf::logical_data>; /** * @brief Construct a new 0-D tensor t object @@ -107,7 +108,7 @@ class tensor_t : public detail::tensor_impl_t { * @param rhs Object to copy from */ __MATX_HOST__ tensor_t(tensor_t const &rhs) noexcept - : detail::tensor_impl_t{rhs.ldata_, rhs.desc_}, storage_(rhs.storage_) + : detail::tensor_impl_t{rhs.ldata_, rhs.desc_, rhs.stf_ldata_}, storage_(rhs.storage_) { } /** @@ -116,7 +117,7 @@ class tensor_t : public detail::tensor_impl_t { * @param rhs Object to move from */ __MATX_HOST__ tensor_t(tensor_t &&rhs) noexcept - : detail::tensor_impl_t{rhs.ldata_, std::move(rhs.desc_)}, storage_(std::move(rhs.storage_)) + : detail::tensor_impl_t{rhs.ldata_, std::move(rhs.desc_), rhs.stf_ldata_}, storage_(std::move(rhs.storage_)) { } @@ -134,6 +135,7 @@ class tensor_t : public detail::tensor_impl_t { this->ldata_ = rhs.ldata_; storage_ = rhs.storage_; this->desc_ = rhs.desc_; + this->stf_ldata_ = rhs.stf_ldata_; } /** Swaps two tensors @@ -152,6 +154,7 @@ class tensor_t : public detail::tensor_impl_t { std::swap(lhs.ldata_, rhs.ldata_); swap(lhs.storage_, rhs.storage_); swap(lhs.desc_, rhs.desc_); + std::swap(lhs.stf_ldata_, rhs.stf_ldata_); } __MATX_INLINE__ ~tensor_t() = default; @@ -177,6 +180,16 @@ 
class tensor_t : public detail::tensor_impl_t { this->SetLocalData(storage_.data()); } + template ::type> && is_matx_descriptor_v::type>, bool> = true> + tensor_t(S2 &&s, D2 &&desc, T* ldata, std::optional *stf_ldata_) : + detail::tensor_impl_t{std::forward(desc)}, + storage_{std::forward(s)} + { + this->stf_ldata_ = stf_ldata_; + this->SetLocalData(storage_.data()); + } + /** * @brief Construct a new tensor t object. Used to copy an existing storage object for proper reference counting * @@ -185,13 +198,28 @@ class tensor_t : public detail::tensor_impl_t { * @param ldata */ template - tensor_t(Storage s, D2 &&desc, T* ldata) : + tensor_t(Storage s, D2 &&desc, T* ldata, std::optional *stf_ldata) : detail::tensor_impl_t{std::forward(desc)}, storage_{std::move(s)} { + this->stf_ldata_ = stf_ldata; this->SetLocalData(ldata); } + /** + * @brief Construct a new tensor t object. Used to copy an existing storage object for proper reference counting + * + * @param s + * @param desc + * @param ldata + */ + template + tensor_t(Storage s, D2 &&desc, T* ldata) : + detail::tensor_impl_t{std::forward(desc)}, + storage_{std::move(s)} + { + this->SetLocalData(ldata); + } /** * Constructor for a rank-1 and above tensor. 
@@ -646,7 +674,7 @@ class tensor_t : public detail::tensor_impl_t { // Copy descriptor and call ctor with shape Desc new_desc{std::forward(shape)}; - return tensor_t{storage_, std::move(new_desc), this->ldata_}; + return tensor_t{storage_, std::move(new_desc), this->ldata_, this->stf_ldata_}; } /** @@ -705,7 +733,7 @@ class tensor_t : public detail::tensor_impl_t { "To get a reshaped view the tensor must be compact"); DefaultDescriptor desc{std::move(tshape)}; - return tensor_t{storage_, std::move(desc), this->ldata_}; + return tensor_t{storage_, std::move(desc), this->ldata_, this->stf_ldata_}; } /** @@ -788,7 +816,10 @@ class tensor_t : public detail::tensor_impl_t { // Copy descriptor and call ctor with shape Desc new_desc{this->desc_.Shape(), std::move(strides)}; + printf("******* FIX ASAP: tensor.h: 819 ****************\n"); return tensor_t{storage_, std::move(new_desc), data}; + /* Albert : TODO. We're creating a logical data of with the type being complex and not real. Seems to be incompatible with the approach we're taking in creating an associated logical data. 
Will be fixed once we move to void_interace */ + //return tensor_t{storage_, std::move(new_desc), data, this->stf_ldata_}; } /** @@ -831,7 +862,7 @@ class tensor_t : public detail::tensor_impl_t { } Desc new_desc{this->desc_.Shape(), std::move(strides)}; - return tensor_t{storage_, std::move(new_desc), data}; + return tensor_t{storage_, std::move(new_desc), data, this->stf_ldata_}; } /** @@ -854,7 +885,7 @@ class tensor_t : public detail::tensor_impl_t { MATX_NVTX_START("", matx::MATX_NVTX_LOG_API) auto new_desc = this->PermuteImpl(dims); - return tensor_t{storage_, std::move(new_desc), this->ldata_}; + return tensor_t{storage_, std::move(new_desc), this->ldata_, this->stf_ldata_}; } @@ -1030,7 +1061,7 @@ class tensor_t : public detail::tensor_impl_t { OverlapView(const cuda::std::array &windows, const cuda::std::array &strides) const { auto new_desc = this->template OverlapViewImpl(windows, strides); - return tensor_t{storage_, std::move(new_desc), this->ldata_}; + return tensor_t{storage_, std::move(new_desc), this->ldata_, this->stf_ldata_}; } /** @@ -1064,7 +1095,7 @@ class tensor_t : public detail::tensor_impl_t { MATX_NVTX_START("", matx::MATX_NVTX_LOG_API) auto new_desc = this->template CloneImpl(clones); - return tensor_t{storage_, std::move(new_desc), this->ldata_}; + return tensor_t{storage_, std::move(new_desc), this->ldata_, this->stf_ldata_}; } template @@ -1362,7 +1393,7 @@ class tensor_t : public detail::tensor_impl_t { [[maybe_unused]] StrideType strides) const { auto [new_desc, data] = this->template SliceImpl(firsts, ends, strides); - return tensor_t{storage_, std::move(new_desc), data}; + return tensor_t{storage_, std::move(new_desc), data, this->stf_ldata_}; } template diff --git a/include/matx/core/tensor_impl.h b/include/matx/core/tensor_impl.h index 77dd0dbd..d655ae74 100644 --- a/include/matx/core/tensor_impl.h +++ b/include/matx/core/tensor_impl.h @@ -41,9 +41,11 @@ #include "matx/core/type_utils.h" #include "matx/core/tensor_utils.h" 
#include "matx/operators/set.h" +#include "matx/core/utils.h" //#include "matx_exec_kernel.h" #include "iterator.h" #include "matx/core/make_tensor.h" +#include namespace matx { @@ -80,6 +82,10 @@ class tensor_impl_t { using matxoplvalue = bool; using self_type = tensor_impl_t; + /* TODO: convert to void_interface for the logical data.*/ + using stf_logicaldata_type = typename cuda::experimental::stf::logical_data>; + //using stf_logicaldata_type = typename cudastf::logical_data; + // Type specifier for signaling this is a matx operation using matxop = bool; @@ -110,13 +116,15 @@ class tensor_impl_t { swap(lhs.ldata_, rhs.ldata_); swap(lhs.desc_, rhs.desc_); + swap(lhs.stf_ldata_, rhs.stf_ldata_); } /** * Constructor for a rank-0 tensor (scalar). */ tensor_impl_t() { - + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } /** @@ -127,6 +135,8 @@ class tensor_impl_t { */ tensor_impl_t(T *const data) : ldata_(data) { static_assert(RANK == 0, "tensor_impl_t with single pointer parameter must be a rank 0 tensor"); + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } /** @@ -139,6 +149,8 @@ class tensor_impl_t { std::enable_if_t> && !is_matx_descriptor_v>, bool> = true> __MATX_INLINE__ tensor_impl_t(ShapeType &&shape) : desc_(std::forward(shape)) { + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } /** @@ -153,6 +165,8 @@ class tensor_impl_t { __MATX_INLINE__ tensor_impl_t(ShapeType &&shape, StrideType &&strides) : desc_(std::forward(shape), std::forward(strides)) { + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } /** @@ -193,6 +207,8 @@ class tensor_impl_t { StrideType &&strides) : ldata_(ldata), desc_(std::forward(shape), std::forward(strides)) { + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } @@ -216,9 +232,18 @@ IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") DescriptorType &&desc) : ldata_(ldata), desc_{std::forward(desc)} { + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } 
IGNORE_WARNING_POP_GCC + template ::type>, bool> = true> + __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ tensor_impl_t(T *const ldata, + DescriptorType &&desc, std::optional *stf_ldata) + : ldata_(ldata), desc_{std::forward(desc)}, stf_ldata_(stf_ldata) + { + } + /** * Constructor for creating a view with only a descriptor * @@ -233,12 +258,15 @@ IGNORE_WARNING_POP_GCC __MATX_INLINE__ tensor_impl_t(DescriptorType &&desc) : desc_{std::forward(desc)} { + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } __MATX_HOST__ void Shallow(const self_type &rhs) noexcept { ldata_ = rhs.ldata_; desc_ = rhs.desc_; + stf_ldata_ = rhs.stf_ldata_; } /** @@ -255,6 +283,7 @@ IGNORE_WARNING_POP_GCC { ldata_ = op.ldata_; desc_ = op.desc_; + stf_ldata_ = op.stf_ldata; } @@ -770,7 +799,7 @@ IGNORE_WARNING_POP_GCC auto new_desc = CloneImpl(clones); - return tensor_impl_t{this->ldata_, std::move(new_desc)}; + return tensor_impl_t{this->ldata_, std::move(new_desc), this->stf_ldata_}; } __MATX_INLINE__ auto PermuteImpl(const cuda::std::array &dims) const @@ -800,7 +829,7 @@ IGNORE_WARNING_POP_GCC __MATX_INLINE__ auto Permute(const cuda::std::array &dims) const { auto new_desc = PermuteImpl(dims); - return tensor_impl_t{this->ldata_, std::move(new_desc)}; + return tensor_impl_t{this->ldata_, std::move(new_desc), this->stf_ldata_}; } template @@ -845,7 +874,7 @@ IGNORE_WARNING_POP_GCC OverlapView(const cuda::std::array &windows, const cuda::std::array &strides) const { auto new_desc = OverlapViewImpl(windows, strides); - return tensor_impl_t{this->ldata_, std::move(new_desc)}; + return tensor_impl_t{this->ldata_, std::move(new_desc), this->stf_ldata_}; } template @@ -1080,9 +1109,56 @@ IGNORE_WARNING_POP_GCC ldata_ = data; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm) const noexcept + { + auto &ld = stf_ldata_->value(); + + using namespace cuda::experimental::stf; +#if 0 + data_place place = getDataPlace(Data()); +#endif + + if (perm == 0) { + 
task.add_deps(ld.write()); + } + else if (perm == 1) { + task.add_deps(ld.read()); + } + else if (perm == 2) { + task.add_deps(ld.rw()); + } + else { + std::cout << "abort ...\n"; + } + } + template __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, [[maybe_unused]] Executor &&ex) const noexcept { + using namespace cuda::experimental::stf; + data_place place; + + if constexpr (is_cuda_executor_v) { + return; + } + else if constexpr (!is_cuda_executor_v) { + + /* Don't create a new logical data for a tensor if it alread had one created previously */ + if (stf_ldata_ && stf_ldata_->has_value()) { return; } + + auto ctx = ex.getCtx(); +#if 0 + // Determine the type of memory that was allocated ie. host/managed/etc + place = getDataPlace(Data()); +#endif + + /* TODO: Use void_interface for logical data */ + *stf_ldata_ = ctx.logical_data(cuda::experimental::stf::make_slice(Data(), 1)); + //*stf_ldata_ = ctx.logical_data(cudastf::shape_of()); + stf_ldata_->value().set_write_back(false); + stf_ldata_->value().set_symbol(this->str()); + } } template @@ -1094,6 +1170,9 @@ IGNORE_WARNING_POP_GCC protected: T *ldata_; Desc desc_; + + public: + mutable std::optional *stf_ldata_; }; } diff --git a/include/matx/core/type_utils.h b/include/matx/core/type_utils.h index 3c66019b..dcf11b6e 100644 --- a/include/matx/core/type_utils.h +++ b/include/matx/core/type_utils.h @@ -44,6 +44,7 @@ #include "matx/core/half.h" #include "matx/core/half_complex.h" #include "matx/executors/cuda.h" +#include "matx/executors/stf.h" /** * Defines type traits for host and device compilers. 
This file should be includable by @@ -287,6 +288,7 @@ inline constexpr bool is_settable_xform_v = std::conjunction_v struct is_executor : std::false_type {}; template <> struct is_executor : std::true_type {}; +template <> struct is_executor : std::true_type {}; template struct is_executor> : std::true_type {}; } @@ -307,6 +309,11 @@ template struct is_cuda_executor : std::false_type {}; template<> struct is_cuda_executor : std::true_type {}; } +namespace detail { +template struct is_stf_executor : std::false_type {}; +template<> struct is_stf_executor : std::true_type {}; +} + /** * @brief Determine if a type is a device executor * @@ -315,6 +322,9 @@ template<> struct is_cuda_executor : std::true_type {}; template inline constexpr bool is_cuda_executor_v = detail::is_cuda_executor::type>::value; +template +inline constexpr bool is_stf_executor_v = detail::is_stf_executor::type>::value; + namespace detail { template struct is_host_executor : std::false_type {}; template struct is_host_executor> : std::true_type {}; diff --git a/include/matx/core/utils.h b/include/matx/core/utils.h index 50bda7bb..0c90a4e9 100644 --- a/include/matx/core/utils.h +++ b/include/matx/core/utils.h @@ -45,6 +45,30 @@ namespace matx { namespace detail { + +#if 0 +__MATX_INLINE__ cuda::experimental::stf::data_place getDataPlace(void *ptr) { + using namespace cuda::experimental::stf; + auto kind = GetPointerKind(ptr); + switch (kind) { + case MATX_MANAGED_MEMORY: + return data_place::managed; + case MATX_HOST_MEMORY: + case MATX_HOST_MALLOC_MEMORY: + return data_place::host; + case MATX_DEVICE_MEMORY: + case MATX_ASYNC_DEVICE_MEMORY: + return data_place::current_device(); + case MATX_INVALID_MEMORY: + //std::cout << "Data kind is invalid: assuming managed memory\n"; + return data_place::managed; + //return data_place::invalid; + default: + return data_place::invalid; + } +} +#endif + __MATX_INLINE__ int GetDeviceAttr(cudaDeviceAttr attr) { int val; int dev; diff --git 
a/include/matx/generators/generator1d.h b/include/matx/generators/generator1d.h index cdeca628..1d684d62 100644 --- a/include/matx/generators/generator1d.h +++ b/include/matx/generators/generator1d.h @@ -54,6 +54,9 @@ namespace matx return f_(pp_get(indices...)); } + template + __MATX_INLINE__ void apply_dep_to_task([[maybe_unused]] Task &&task, [[maybe_unused]] int perm=1) const noexcept { } + constexpr inline __MATX_HOST__ __MATX_DEVICE__ auto Size(int dim) const { return *(s_.begin() + dim); diff --git a/include/matx/operators/binary_operators.h b/include/matx/operators/binary_operators.h index 49635ccd..af49b034 100644 --- a/include/matx/operators/binary_operators.h +++ b/include/matx/operators/binary_operators.h @@ -146,6 +146,18 @@ namespace matx return detail::matx_max(size1,size2); } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, Perm perm) const noexcept + { + if constexpr (is_matx_op()) { + in1_.apply_dep_to_task(std::forward(task), perm); + } + + if constexpr (is_matx_op()) { + in2_.apply_dep_to_task(std::forward(task), perm); + } + } + template __MATX_INLINE__ void PreRun(ShapeType &&shape, Executor &&ex) const noexcept { diff --git a/include/matx/operators/constval.h b/include/matx/operators/constval.h index 3df28aff..32eec317 100644 --- a/include/matx/operators/constval.h +++ b/include/matx/operators/constval.h @@ -55,6 +55,9 @@ namespace matx __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ T operator()(Is...) 
const { return v_; }; + template + __MATX_INLINE__ void apply_dep_to_task([[maybe_unused]] Task &&task, [[maybe_unused]] int perm) const noexcept { } + constexpr inline __MATX_HOST__ __MATX_DEVICE__ auto Size(int dim) const { if constexpr (!is_noshape_v) { return *(s_.begin() + dim); diff --git a/include/matx/operators/conv.h b/include/matx/operators/conv.h index 62f3a2f8..905573e2 100644 --- a/include/matx/operators/conv.h +++ b/include/matx/operators/conv.h @@ -149,16 +149,45 @@ namespace matx return out_dims_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept + { + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + template void Exec(Out &&out, Executor &&ex) const { MATX_ASSERT_STR(!(is_host_executor_v && method_ == MATX_C_METHOD_DIRECT), matxNotSupported, "direct conv1d() only supports the CUDA executor currently"); MATX_STATIC_ASSERT_STR((Rank() == cuda::std::tuple_element_t<0, remove_cvref_t>::Rank()), matxInvalidParameter, "conv1d: inputs and outputs must have same rank to use conv1d with axis parameter"); - if constexpr (!std::is_same_v) { - conv1d_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, mode_, method_, ex); + if constexpr (!is_cuda_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("conv_task"); + + auto output = cuda::std::get<0>(out); + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + b_.apply_dep_to_task(tsk, 1); + + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + if constexpr (!std::is_same_v) { + conv1d_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, mode_, method_, exec); + } + else { + conv1d_impl(cuda::std::get<0>(out), a_, b_, mode_, method_, exec); + } + }; } - else { - conv1d_impl(cuda::std::get<0>(out), a_, b_, mode_, method_, ex); + else if constexpr (is_cuda_executor_v) { + if constexpr (!std::is_same_v) { + conv1d_impl(permute(cuda::std::get<0>(out), perm_), a_, 
b_, mode_, method_, ex); + } + else { + conv1d_impl(cuda::std::get<0>(out), a_, b_, mode_, method_, ex); + } } } @@ -343,14 +372,36 @@ namespace detail { template void Exec(Out &&out, Executor &&ex) const { - static_assert(is_cuda_executor_v, "conv2d() only supports the CUDA executor currently"); - - if constexpr (!std::is_same_v) { - conv2d_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, mode_, ex.getStream()); - } - else { - conv2d_impl(cuda::std::get<0>(out), a_, b_, mode_, ex.getStream()); - } + //static_assert(is_cuda_executor_v, "conv2d() only supports the CUDA executor currently"); + if constexpr (!is_cuda_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("conv_task"); + + auto output = cuda::std::get<0>(out); + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + b_.apply_dep_to_task(tsk, 1); + + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + if constexpr (!std::is_same_v) { + conv2d_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, mode_, exec.getStream()); + } + else { + conv2d_impl(cuda::std::get<0>(out), a_, b_, mode_, exec.getStream()); + } + }; + } + else if constexpr (is_cuda_executor_v) { + if constexpr (!std::is_same_v) { + conv2d_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, mode_, ex.getStream()); + } + else { + conv2d_impl(cuda::std::get<0>(out), a_, b_, mode_, ex.getStream()); + } + } } template diff --git a/include/matx/operators/fft.h b/include/matx/operators/fft.h index 242169d1..76e57c3d 100644 --- a/include/matx/operators/fft.h +++ b/include/matx/operators/fft.h @@ -145,24 +145,68 @@ namespace matx return out_dims_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept + { + /* Scenario where the matvec() operator is on the RHS and op has already + run previously. 
So we make tmp_out have a read permission as it will be read from */ + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + template void Exec(Out &&out, Executor &&ex) const { - if constexpr (std::is_same_v) { - if constexpr (std::is_same_v) { - fft_impl(cuda::std::get<0>(out), a_, fft_size_, norm_, ex); - } - else { - ifft_impl(cuda::std::get<0>(out), a_, fft_size_, norm_, ex); - } - } - else { - if constexpr (std::is_same_v) { - fft_impl(permute(cuda::std::get<0>(out), perm_), permute(a_, perm_), fft_size_, norm_, ex); + // stfexecutor case + if constexpr (!is_cuda_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("fft_task_no_perm"); + + auto output = cuda::std::get<0>(out); + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + fft_impl(output, a_, fft_size_, norm_, exec); + } + else { + ifft_impl(output, a_, fft_size_, norm_, exec); + } + } + else { + if constexpr (std::is_same_v) { + fft_impl(permute(output, perm_), permute(a_, perm_), fft_size_, norm_, exec); + } + else { + ifft_impl(permute(output, perm_), permute(a_, perm_), fft_size_, norm_, exec); + } + } + }; + } + else if constexpr (is_cuda_executor_v) { + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + fft_impl(cuda::std::get<0>(out), a_, fft_size_, norm_, ex); + } + else { + ifft_impl(cuda::std::get<0>(out), a_, fft_size_, norm_, ex); + } + } + else { + if constexpr (std::is_same_v) { + fft_impl(permute(cuda::std::get<0>(out), perm_), permute(a_, perm_), fft_size_, norm_, ex); + } + else { + ifft_impl(permute(cuda::std::get<0>(out), perm_), permute(a_, perm_), fft_size_, norm_, ex); + } + } } else { - ifft_impl(permute(cuda::std::get<0>(out), perm_), permute(a_, perm_), fft_size_, norm_, ex); - } - } + printf("SHOULDNT BE HERE..\n"); + } } template @@ 
-494,4 +538,4 @@ namespace matx return detail::FFT2Op(a, perm, detail::ifft_t{}, norm); } -} \ No newline at end of file +} diff --git a/include/matx/operators/matmul.h b/include/matx/operators/matmul.h index add2a556..bd000a87 100644 --- a/include/matx/operators/matmul.h +++ b/include/matx/operators/matmul.h @@ -111,13 +111,45 @@ namespace matx return out_dims_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept + { + /* Scenario where the matmul() operator is on the RHS and op has already run + previously. So we make tmp_out have a read permission as it will be read from */ + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + template void Exec(Out &&out, Executor &&ex) const { - if constexpr (!std::is_same_v) { - matmul_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, ex, alpha_, beta_); + // stfexecutor case + if constexpr (!is_cuda_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("matmul"); + + auto output = cuda::std::get<0>(out); + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + b_.apply_dep_to_task(tsk, 1); + + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + if constexpr (!std::is_same_v) { + matmul_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, exec, alpha_, beta_); + } + else { + matmul_impl(cuda::std::get<0>(out), a_, b_, exec, alpha_, beta_); + } + }; } - else { - matmul_impl(cuda::std::get<0>(out), a_, b_, ex, alpha_, beta_); + else if constexpr (is_cuda_executor_v) { + if constexpr (!std::is_same_v) { + matmul_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, ex, alpha_, beta_); + } + else { + matmul_impl(cuda::std::get<0>(out), a_, b_, ex, alpha_, beta_); + } } } diff --git a/include/matx/operators/matvec.h b/include/matx/operators/matvec.h index 222402c2..3568c2e0 100644 --- a/include/matx/operators/matvec.h +++ b/include/matx/operators/matvec.h @@ -89,9 +89,35 @@ namespace 
matx return out_dims_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept + { + /* Scenario where the matvec() operator is on the RHS and op has already run + previously. So we make tmp_out have a read permission as it will be read from */ + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + template void Exec(Out &&out, Executor &&ex) const{ - matvec_impl(cuda::std::get<0>(out), a_, b_, ex, alpha_, beta_); + // stfexecutor case + auto output = cuda::std::get<0>(out); + if constexpr (!is_cuda_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("matvec_task"); + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + b_.apply_dep_to_task(tsk, 1); + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + matvec_impl(output, a_, b_, exec, alpha_, beta_); + }; + } + // cudaExecutor case + else if constexpr (is_cuda_executor_v) { + matvec_impl(output, a_, b_, ex, alpha_, beta_); + } } template diff --git a/include/matx/operators/permute.h b/include/matx/operators/permute.h index bf366e24..2f6fc156 100644 --- a/include/matx/operators/permute.h +++ b/include/matx/operators/permute.h @@ -123,6 +123,14 @@ IGNORE_WARNING_POP_GCC return op_.Size(dims_[dim]); } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept + { + if constexpr (is_matx_op()) { + op_.apply_dep_to_task(std::forward(task), perm); + } + } + template __MATX_INLINE__ void PreRun(ShapeType &&shape, Executor &&ex) const noexcept { diff --git a/include/matx/operators/set.h b/include/matx/operators/set.h index 89885706..3d3e7843 100644 --- a/include/matx/operators/set.h +++ b/include/matx/operators/set.h @@ -147,6 +147,20 @@ class set : public BaseOp> { return res; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, [[maybe_unused]] int perm=0) const noexcept + { + // LHS + if constexpr (is_matx_op()) { + 
out_.apply_dep_to_task(std::forward(task), 0); + } + // RHS + if constexpr (is_matx_op()) { + op_.apply_dep_to_task(std::forward(task), 1); + } + } + + template __MATX_INLINE__ void PreRun(ShapeType &&shape, Executor &&ex) const noexcept { diff --git a/include/matx/operators/sum.h b/include/matx/operators/sum.h index d95b2595..60e86248 100644 --- a/include/matx/operators/sum.h +++ b/include/matx/operators/sum.h @@ -72,9 +72,35 @@ namespace detail { return tmp_out_(indices...); } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept { + /* Albert -- Scenario where the sum() operator is on the RHS and sum has already + run previously. So we make tmp_out have a read permission as it will be read from */ + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + template void Exec(Out &&out, Executor &&ex) const { - sum_impl(cuda::std::get<0>(out), a_, ex); + auto output = cuda::std::get<0>(out); + // stfexecutor case + if constexpr (!is_cuda_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("sum_task"); + + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + sum_impl(output, a_, exec); + }; + } + // cudaExecutor case + else if constexpr (is_cuda_executor_v) { + sum_impl(output, a_, ex); + } } static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() diff --git a/include/matx/operators/unary_operators.h b/include/matx/operators/unary_operators.h index e0a3d9c7..b7633916 100644 --- a/include/matx/operators/unary_operators.h +++ b/include/matx/operators/unary_operators.h @@ -103,6 +103,12 @@ namespace matx return size_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, Perm perm) const noexcept + { + in1_.apply_dep_to_task(std::forward(task), perm); + } + template __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, 
[[maybe_unused]] Executor &&ex) const noexcept { From 7d298d405f4937af5657f4d79079924b18b15b38 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 4 Nov 2024 23:13:38 -0800 Subject: [PATCH 04/39] move to void_interface --- include/matx/core/tensor.h | 7 ++----- include/matx/core/tensor_impl.h | 11 +++-------- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/include/matx/core/tensor.h b/include/matx/core/tensor.h index 27a933c2..7eb5f8ae 100644 --- a/include/matx/core/tensor.h +++ b/include/matx/core/tensor.h @@ -91,7 +91,7 @@ class tensor_t : public detail::tensor_impl_t { using stride_container = typename Desc::stride_container; using desc_type = Desc; ///< Descriptor type trait using self_type = tensor_t; - using stf_logicaldata_type = typename cuda::experimental::stf::logical_data>; + using stf_logicaldata_type = typename cuda::experimental::stf::logical_data; /** * @brief Construct a new 0-D tensor t object @@ -816,10 +816,7 @@ class tensor_t : public detail::tensor_impl_t { // Copy descriptor and call ctor with shape Desc new_desc{this->desc_.Shape(), std::move(strides)}; - printf("******* FIX ASAP: tensor.h: 819 ****************\n"); - return tensor_t{storage_, std::move(new_desc), data}; - /* Albert : TODO. We're creating a logical data of with the type being complex and not real. Seems to be incompatible with the approach we're taking in creating an associated logical data. 
Will be fixed once we move to void_interace */ - //return tensor_t{storage_, std::move(new_desc), data, this->stf_ldata_}; + return tensor_t{storage_, std::move(new_desc), data, this->stf_ldata_}; } /** diff --git a/include/matx/core/tensor_impl.h b/include/matx/core/tensor_impl.h index d655ae74..f837f9cc 100644 --- a/include/matx/core/tensor_impl.h +++ b/include/matx/core/tensor_impl.h @@ -81,10 +81,7 @@ class tensor_impl_t { using stride_type = typename Desc::stride_type; using matxoplvalue = bool; using self_type = tensor_impl_t; - - /* TODO: convert to void_interface for the logical data.*/ - using stf_logicaldata_type = typename cuda::experimental::stf::logical_data>; - //using stf_logicaldata_type = typename cudastf::logical_data; + using stf_logicaldata_type = typename cuda::experimental::stf::logical_data; // Type specifier for signaling this is a matx operation using matxop = bool; @@ -1153,10 +1150,8 @@ IGNORE_WARNING_POP_GCC place = getDataPlace(Data()); #endif - /* TODO: Use void_interface for logical data */ - *stf_ldata_ = ctx.logical_data(cuda::experimental::stf::make_slice(Data(), 1)); - //*stf_ldata_ = ctx.logical_data(cudastf::shape_of()); - stf_ldata_->value().set_write_back(false); + *stf_ldata_ = ctx.logical_data(cuda::experimental::stf::void_interface()); + //stf_ldata_->value().set_write_back(false); stf_ldata_->value().set_symbol(this->str()); } } From 154b3f99c04f9ee7d7dc807e3e4030c751132d33 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Tue, 5 Nov 2024 00:03:50 -0800 Subject: [PATCH 05/39] add stf executor --- include/matx/executors/stf.h | 250 +++++++++++++++++++++++++++++++++++ 1 file changed, 250 insertions(+) create mode 100644 include/matx/executors/stf.h diff --git a/include/matx/executors/stf.h b/include/matx/executors/stf.h new file mode 100644 index 00000000..00e8cef3 --- /dev/null +++ b/include/matx/executors/stf.h @@ -0,0 +1,250 @@ +//////////////////////////////////////////////////////////////////////////////// +// BSD 
3-Clause License +// +// Copyright (c) 2021, NVIDIA Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +///////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "matx/core/defines.h" +#include "matx/executors/host.h" +#include "matx/executors/kernel.h" + +#include + +//using namespace cuda::experimental::stf; +//using namespace cudastf; + +namespace matx +{ + + +/* Albert - Needed to declare this here to avoid compile error. 
*/ +template constexpr bool is_matx_op_lvalue(); +template constexpr bool is_matx_set_op(); + + class stfExecutor { + public: + using matx_cuda = bool; // signal this is a GPU executor + using matx_executor = bool; ///< Type trait indicating this is an executor +//using cudastf::async_resources_handle; +//using cudastf::stream_ctx; + /** + * @brief Construct a new stfExecutor with a stream + * + * @param stream CUDA stream + */ + stfExecutor(cudaStream_t stream) : stream_(stream) { + cuda::experimental::stf::async_resources_handle handle; + ctx_ = cuda::experimental::stf::stream_ctx(stream, handle); + } + stfExecutor(int stream) : stream_(reinterpret_cast(stream)) { + cuda::experimental::stf::async_resources_handle handle; + ctx_ = cuda::experimental::stf::stream_ctx(reinterpret_cast(stream), handle); + } + + /** + * @brief Construct a new stfExecutor object using the default stream + * + */ + stfExecutor() : stream_(0) { + ctx_ = cuda::experimental::stf::stream_ctx(); + } + + /** + * @brief Returns stream associated with executor + */ + auto getStream() const { return stream_; } + + /** + * @brief Get CUDASTF Ctx + * + */ + auto &getCtx() const noexcept { return ctx_; } + + /** + * @brief Synchronize the cuda executor's stream + * + */ + void sync() { cudaStreamSynchronize(stream_); } + + /** + * Execute an operator on a device + * + * @tparam Op Operator type + * @param op value + **/ + template + void Exec(Op &op) const { + //std::cout << "exec on stfexecutor -- start\n"; +#ifdef __CUDACC__ + dim3 threads, blocks; + + auto ctx = getCtx(); + // Parameters passed by value in CUDA are limited to 4096B. If the user exceeds this, we + // need to error out and have them break up the statement + MATX_STATIC_ASSERT((sizeof(op) + sizeof(index_t) * Op::Rank()) <= CUDA_MAX_VAL_PARAM, + "Parameter buffer to device is limited to 4096B. 
Please break up your operator statement into multiple executions to limit the size of the parameters"); + + if constexpr (Op::Rank() == 0) { + threads = 1; + blocks = 1; + if constexpr (is_matx_op_lvalue() || is_matx_set_op()) { + auto tsk = ctx.task(); + tsk.set_symbol(op.str()); + op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps + //std::cout << "Start launch task. Rank = " << Op::Rank() << " " << op.str() << '\n'; + tsk->*[&](cudaStream_t s) { + detail::matxOpT0Kernel<<>>(op); + }; + //std::cout << "End launch task.\n"; + } + else { + //std::cout << " RANK 0 not on LHS operator = " << op.str() << '\n'; + detail::matxOpT0Kernel<<>>(op); + } + } + else { + cuda::std::array sizes; + for (int i = 0; i < Op::Rank(); i++) { + sizes[i] = op.Size(i); + } + + bool stride = detail::get_grid_dims(blocks, threads, sizes, 256); + + if constexpr (Op::Rank() == 1) { + if constexpr (is_matx_op_lvalue() || is_matx_set_op()) { + auto tsk = ctx.task(); + tsk.set_symbol(op.str()); + op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps + //std::cout << "Start launch task. Rank = " << Op::Rank() << '\n'; + tsk->*[&](cudaStream_t s) { + detail::matxOpT1Kernel<<>>(op, sizes[0]); + }; + //std::cout << "End launch task.\n"; + } + else { + //std::cout << " RANK 1 not on LHS operator = " << op.str() << '\n'; + detail::matxOpT1Kernel<<>>(op, sizes[0]); + } + } + else if constexpr (Op::Rank() == 2) { + if constexpr (is_matx_op_lvalue() || is_matx_set_op()) { + auto tsk = ctx.task(); + tsk.set_symbol(op.str()); + op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps + //std::cout << "About to launch task. 
Rank = " << Op::Rank() << '\n'; + tsk->*[&](cudaStream_t s) { + if(stride) { + detail::matxOpT2StrideKernel<<>>(op, sizes[0], sizes[1]); + } else { + detail::matxOpT2Kernel<<>>(op, sizes[0], sizes[1]); + } + }; + } + else { + //std::cout << " not on LHS operator = " << op.str() << '\n'; + if(stride) { + detail::matxOpT2StrideKernel<<>>(op, sizes[0], sizes[1]); + } else { + detail::matxOpT2Kernel<<>>(op, sizes[0], sizes[1]); + } + } + } + else if constexpr (Op::Rank() == 3) { + if constexpr (is_matx_op_lvalue() || is_matx_set_op()) { + auto tsk = ctx.task(); + tsk.set_symbol(op.str()); + op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps + tsk->*[&](cudaStream_t s) { + if(stride) { + detail::matxOpT3StrideKernel<<>>(op, sizes[0], sizes[1], sizes[2]); + } else { + detail::matxOpT3Kernel<<>>(op, sizes[0], sizes[1], sizes[2]); + } + }; + } + else { + if(stride) { + detail::matxOpT3StrideKernel<<>>(op, sizes[0], sizes[1], sizes[2]); + } else { + detail::matxOpT3Kernel<<>>(op, sizes[0], sizes[1], sizes[2]); + } + } + } + else if constexpr (Op::Rank() == 4) { + if constexpr (is_matx_op_lvalue() || is_matx_set_op()) { + auto tsk = ctx.task(); + op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps + tsk.set_symbol(op.str())->*[&](cudaStream_t s) { + if(stride) { + detail::matxOpT4StrideKernel<<>>(op, sizes[0], sizes[1], sizes[2], sizes[3]); + } else { + detail::matxOpT4Kernel<<>>(op, sizes[0], sizes[1], sizes[2], sizes[3]); + } + }; + } + else { + if(stride) { + detail::matxOpT4StrideKernel<<>>(op, sizes[0], sizes[1], sizes[2], sizes[3]); + } else { + detail::matxOpT4Kernel<<>>(op, sizes[0], sizes[1], sizes[2], sizes[3]); + } + } + } + else { + if constexpr (is_matx_op_lvalue() || is_matx_set_op()) { + auto tsk = ctx.task(); + tsk.set_symbol(op.str()); + op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps + //std::cout << "About to launch task. 
Rank = " << Op::Rank() << '\n'; + + tsk->*[&](cudaStream_t s) { + index_t dims = std::accumulate(std::begin(sizes) + 1, std::end(sizes), 1, std::multiplies()); + detail::matxOpTDKernel<<>>(op, sizes, dims); + }; + } + else { + index_t dims = std::accumulate(std::begin(sizes) + 1, std::end(sizes), 1, std::multiplies()); + detail::matxOpTDKernel<<>>(op, sizes, dims); + } + } + } +#else + MATX_ASSERT_STR(false, matxInvalidParameter, "Cannot call device executor using host compiler"); +#endif + //std::cout << "exec on stfexecutor -- stop\n"; + } + + private: + cudaStream_t stream_; + cuda::experimental::stf::context ctx_; + }; + +}; From c8ef988d1e1987f320c4be256dae36621b7c02f2 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Tue, 5 Nov 2024 12:39:16 -0800 Subject: [PATCH 06/39] support for cgsolve operator and a few examples --- examples/cgsolve.cu | 9 ++++++++ examples/fft_conv.cu | 33 ++++++++++++++++++++++++++---- include/matx/executors/stf.h | 3 +++ include/matx/operators/all.h | 28 ++++++++++++++++++++++++- include/matx/operators/cast.h | 8 ++++++++ include/matx/operators/cgsolve.h | 4 ++-- include/matx/operators/fft.h | 2 +- include/matx/operators/max.h | 28 ++++++++++++++++++++++++- include/matx/transforms/cgsolve.h | 34 ++++++++++++++++++++----------- 9 files changed, 128 insertions(+), 21 deletions(-) diff --git a/examples/cgsolve.cu b/examples/cgsolve.cu index 9027d634..f77d8393 100644 --- a/examples/cgsolve.cu +++ b/examples/cgsolve.cu @@ -54,7 +54,12 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) auto norm = make_tensor({BATCH}); auto maxn = make_tensor({}); +#if 0 cudaExecutor exec{}; +#else + stfExecutor exec{}; + auto ctx = exec.getCtx(); +#endif // Simple Poisson matrix for(int b = 0; b < BATCH; b++) { @@ -83,6 +88,10 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) (maxn = matx::max(sqrt(norm))).run(exec); exec.sync(); +#if 1 + ctx.finalize(); +#endif + // example-end sync-test-1 printf ("max l2 norm: 
%f\n", (float)sqrt(maxn())); diff --git a/examples/fft_conv.cu b/examples/fft_conv.cu index 7f871119..0fa27343 100644 --- a/examples/fft_conv.cu +++ b/examples/fft_conv.cu @@ -73,7 +73,12 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) { MATX_ENTER_HANDLER(); using complex = cuda::std::complex; +#if 0 cudaExecutor exec{}; +#else + stfExecutor exec{}; + auto ctx = exec.getCtx(); +#endif index_t signal_size = 1ULL << 16; index_t filter_size = 16; @@ -117,7 +122,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // Perform the FFT in-place on both signal and filter for (int i = 0; i < iterations; i++) { if (i == 1) { +#if 0 cudaEventRecord(start, stream); +#else + cudaEventRecord(start, ctx.task_fence()); +#endif } (sig_freq = fft(sig_time, filtered_size)).run(exec); (filt_freq = fft(filt_time, filtered_size)).run(exec); @@ -129,18 +138,30 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } +#if 0 cudaEventRecord(stop, stream); +#else + cudaEventRecord(stop, ctx.task_fence()); +#endif exec.sync(); cudaEventElapsedTime(&separate_ms, start, stop); for (int i = 0; i < iterations; i++) { if (i == 1) { - cudaEventRecord(start, stream); +#if 0 + cudaEventRecord(start, stream); +#else + cudaEventRecord(start, ctx.task_fence()); +#endif } (sig_freq = ifft(fft(sig_time, filtered_size) * fft(filt_time, filtered_size))).run(exec); } - + +#if 0 cudaEventRecord(stop, stream); +#else + cudaEventRecord(stop, ctx.task_fence()); +#endif exec.sync(); cudaEventElapsedTime(&fused_ms, start, stop); @@ -153,7 +174,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) (time_out = conv1d(sig_time, filt1, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); exec.sync(); - + +#if 1 + ctx.finalize(); +#endif + // Compare signals for (index_t b = 0; b < batches; b++) { for (index_t i = 0; i < filtered_size; i++) { @@ -172,4 +197,4 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) 
CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); -} \ No newline at end of file +} diff --git a/include/matx/executors/stf.h b/include/matx/executors/stf.h index 00e8cef3..ad9e970b 100644 --- a/include/matx/executors/stf.h +++ b/include/matx/executors/stf.h @@ -63,10 +63,12 @@ template constexpr bool is_matx_set_op(); stfExecutor(cudaStream_t stream) : stream_(stream) { cuda::experimental::stf::async_resources_handle handle; ctx_ = cuda::experimental::stf::stream_ctx(stream, handle); + //ctx_ = cuda::experimental::stf::graph_ctx(stream, handle); } stfExecutor(int stream) : stream_(reinterpret_cast(stream)) { cuda::experimental::stf::async_resources_handle handle; ctx_ = cuda::experimental::stf::stream_ctx(reinterpret_cast(stream), handle); + //ctx_ = cuda::experimental::stf::graph_ctx(reinterpret_cast(stream), handle); } /** @@ -75,6 +77,7 @@ template constexpr bool is_matx_set_op(); */ stfExecutor() : stream_(0) { ctx_ = cuda::experimental::stf::stream_ctx(); + //ctx_ = cuda::experimental::stf::graph_ctx(); } /** diff --git a/include/matx/operators/all.h b/include/matx/operators/all.h index f56f8336..20769677 100644 --- a/include/matx/operators/all.h +++ b/include/matx/operators/all.h @@ -72,9 +72,35 @@ namespace detail { return tmp_out_(indices...); }; + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept { + /* Albert -- Scenario where the all() operator is on the RHS and sum has already + run previously. 
So we make tmp_out have a read permission as it will be read from */ + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + template void Exec(Out &&out, Executor &&ex) const { - all_impl(cuda::std::get<0>(out), a_, ex); + auto output = cuda::std::get<0>(out); + // stfexecutor case + if constexpr (!is_cuda_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("all_task"); + + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + all_impl(output, a_, exec); + }; + } + // cudaExecutor case + else if constexpr (is_cuda_executor_v) { + all_impl(output, a_, ex); + } } static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() diff --git a/include/matx/operators/cast.h b/include/matx/operators/cast.h index 83a1aa4d..23b6d539 100644 --- a/include/matx/operators/cast.h +++ b/include/matx/operators/cast.h @@ -83,6 +83,14 @@ namespace matx return static_cast(op_(indices...)); } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept + { + if constexpr (is_matx_op()) { + op_.apply_dep_to_task(std::forward(task), perm); + } + } + template __MATX_INLINE__ void PreRun(ShapeType &&shape, Executor &&ex) const noexcept { diff --git a/include/matx/operators/cgsolve.h b/include/matx/operators/cgsolve.h index 21eab1bc..69628795 100644 --- a/include/matx/operators/cgsolve.h +++ b/include/matx/operators/cgsolve.h @@ -90,8 +90,8 @@ namespace matx template void Exec(Out &&out, Executor &&ex) const{ - static_assert(is_cuda_executor_v, "cgsolve() only supports the CUDA executor currently"); - cgsolve_impl(cuda::std::get<0>(out), a_, b_, tol_, max_iters_, ex.getStream()); + //static_assert(is_cuda_executor_v, "cgsolve() only supports the CUDA executor currently"); + cgsolve_impl(cuda::std::get<0>(out), a_, b_, ex, tol_, max_iters_, ex.getStream()); } template diff --git 
a/include/matx/operators/fft.h b/include/matx/operators/fft.h index 76e57c3d..cf86acdc 100644 --- a/include/matx/operators/fft.h +++ b/include/matx/operators/fft.h @@ -146,7 +146,7 @@ namespace matx } template - __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept + __MATX_INLINE__ void apply_dep_to_task(Task &&task, [[maybe_unused]] int perm=1) const noexcept { /* Scenario where the matvec() operator is on the RHS and op has already run previously. So we make tmp_out have a read permission as it will be read from */ diff --git a/include/matx/operators/max.h b/include/matx/operators/max.h index f7d9da77..562c2a4f 100644 --- a/include/matx/operators/max.h +++ b/include/matx/operators/max.h @@ -72,9 +72,35 @@ namespace detail { return tmp_out_(indices...); } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept { + /* Albert -- Scenario where the all() operator is on the RHS and sum has already + run previously. So we make tmp_out have a read permission as it will be read from */ + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + template void Exec(Out &&out, Executor &&ex) const { - max_impl(cuda::std::get<0>(out), a_, ex); + auto output = cuda::std::get<0>(out); + // stfexecutor case + if constexpr (!is_cuda_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("max_task"); + + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + max_impl(output, a_, exec); + }; + } + // cudaExecutor case + else if constexpr (is_cuda_executor_v) { + max_impl(output, a_, ex); + } } static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() diff --git a/include/matx/transforms/cgsolve.h b/include/matx/transforms/cgsolve.h index 0f389ca9..f08d8f0c 100644 --- a/include/matx/transforms/cgsolve.h +++ b/include/matx/transforms/cgsolve.h @@ -58,8 
+58,8 @@ namespace matx * cuda Stream to execute on * */ - template - __MATX_INLINE__ void cgsolve_impl(XType X, AType A, BType B, double tol=1e-6, int max_iters=4, cudaStream_t stream=0) + template + __MATX_INLINE__ void cgsolve_impl(XType X, AType A, BType B, Executor &&exec, double tol=1e-6, int max_iters=4, cudaStream_t stream=0) { using value_type = typename XType::value_type; const int VRANK = XType::Rank(); @@ -120,15 +120,19 @@ namespace matx auto pApc = clone(pAp, clone_shape); // A*X - (Ap = matvec(A, X)).run(stream); + //(Ap = matvec(A, X)).run(stream); + (Ap = matvec(A, X)).run(exec); // r0 = B - A*X // p = r0 - (p = r0 = B - Ap).run(stream); + //(p = r0 = B - Ap).run(stream); + (p = r0 = B - Ap).run(exec); - (r0r0 = sum(r0*r0)).run(stream); + //(r0r0 = sum(r0*r0)).run(stream); + (r0r0 = sum(r0*r0)).run(exec); if(tol>0.0f) { - (converged = matx::all(as_int(sqrt(r0r0) < tol))).run(stream); + //(converged = matx::all(as_int(sqrt(r0r0) < tol))).run(stream); + (converged = matx::all(as_int(sqrt(r0r0) < tol))).run(exec); cudaEventRecord(event, stream); cudaStreamWaitEvent(d2h, event); @@ -137,10 +141,12 @@ namespace matx int i; for (i = 0 ; i < max_iters; i++) { // Ap = matvec(A, p) - (Ap = matvec(A, p)).run(stream); + //(Ap = matvec(A, p)).run(stream); + (Ap = matvec(A, p)).run(exec); // pAp = dot(p,Ap) - (pAp = sum(p*Ap)).run(stream); + //(pAp = sum(p*Ap)).run(stream); + (pAp = sum(p*Ap)).run(exec); // if pAp is zero then we have exactly numerically converged. // However, this is batched so we may iterate more. 
Iterating @@ -152,10 +158,12 @@ namespace matx auto updateOp = ( r1 = r0 - (r0r0c/pApc) * Ap, X = X + (r0r0c/pApc) * p); - (IF( pApc != value_type(0), updateOp)).run(stream); + //(IF( pApc != value_type(0), updateOp)).run(stream); + (IF( pApc != value_type(0), updateOp)).run(exec); // r1r1 = dot(r1, r1) - (r1r1 = sum(r1*r1)).run(stream); + //(r1r1 = sum(r1*r1)).run(stream); + (r1r1 = sum(r1*r1)).run(exec); if(tol>0.0f) { // copy convergence criteria to host. @@ -168,7 +176,8 @@ namespace matx break; } - (converged = matx::all(as_int(sqrt(r1r1) < tol))).run(stream); + //(converged = matx::all(as_int(sqrt(r1r1) < tol))).run(stream); + (converged = matx::all(as_int(sqrt(r1r1) < tol))).run(exec); cudaEventRecord(event, stream); cudaStreamWaitEvent(d2h, event); @@ -176,7 +185,8 @@ namespace matx // p = r1 + b * p auto updateP = ( p = r1 + (r1r1c/r0r0c) * p); - (IF( pApc != value_type(0), updateP)).run(stream); + //(IF( pApc != value_type(0), updateP)).run(stream); + (IF( pApc != value_type(0), updateP)).run(exec); // Advance residual swap(r0r0, r1r1); From 52b18c9e638bc93966692259ae10c9e1efa7fd78 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Tue, 3 Dec 2024 13:17:32 -0800 Subject: [PATCH 07/39] make the sync() that is part of stfexecutor call ctx.task_fence() --- include/matx/executors/stf.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/include/matx/executors/stf.h b/include/matx/executors/stf.h index ad9e970b..85eb0e1c 100644 --- a/include/matx/executors/stf.h +++ b/include/matx/executors/stf.h @@ -53,8 +53,7 @@ template constexpr bool is_matx_set_op(); public: using matx_cuda = bool; // signal this is a GPU executor using matx_executor = bool; ///< Type trait indicating this is an executor -//using cudastf::async_resources_handle; -//using cudastf::stream_ctx; + /** * @brief Construct a new stfExecutor with a stream * @@ -92,10 +91,10 @@ template constexpr bool is_matx_set_op(); auto &getCtx() const noexcept { return ctx_; } /** - * 
@brief Synchronize the cuda executor's stream + * @brief Synchronize the STF executor's stream * */ - void sync() { cudaStreamSynchronize(stream_); } + void sync() { ctx.task_fence(); } /** * Execute an operator on a device @@ -122,11 +121,9 @@ template constexpr bool is_matx_set_op(); auto tsk = ctx.task(); tsk.set_symbol(op.str()); op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps - //std::cout << "Start launch task. Rank = " << Op::Rank() << " " << op.str() << '\n'; tsk->*[&](cudaStream_t s) { detail::matxOpT0Kernel<<>>(op); }; - //std::cout << "End launch task.\n"; } else { //std::cout << " RANK 0 not on LHS operator = " << op.str() << '\n'; From d726b10df03881fa1542f00801c7527cb8447222 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Tue, 3 Dec 2024 13:20:43 -0800 Subject: [PATCH 08/39] fix typo --- include/matx/executors/stf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/matx/executors/stf.h b/include/matx/executors/stf.h index 85eb0e1c..64399230 100644 --- a/include/matx/executors/stf.h +++ b/include/matx/executors/stf.h @@ -94,7 +94,7 @@ template constexpr bool is_matx_set_op(); * @brief Synchronize the STF executor's stream * */ - void sync() { ctx.task_fence(); } + void sync() { ctx_.task_fence(); } /** * Execute an operator on a device From 5e7576c07a6c99fbe49f93b5c3c20a3805430b56 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Tue, 17 Dec 2024 10:43:47 -0800 Subject: [PATCH 09/39] Added test case --- examples/CMakeLists.txt | 1 + examples/simple_stf_test.cu | 192 ++++++++++++++++++++++++++++++++++++ 2 files changed, 193 insertions(+) create mode 100644 examples/simple_stf_test.cu diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d56f099b..563b6fc3 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -11,6 +11,7 @@ set(examples mvdr_beamformer pwelch resample_poly_bench + simple_stf_test spectrogram spectrogram_graph spherical_harmonics diff 
--git a/examples/simple_stf_test.cu b/examples/simple_stf_test.cu new file mode 100644 index 00000000..7061b715 --- /dev/null +++ b/examples/simple_stf_test.cu @@ -0,0 +1,192 @@ +//////////////////////////////////////////////////////////////////////////////// +// BSD 3-Clause License +// +// Copyright (c) 2021, NVIDIA Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+///////////////////////////////////////////////////////////////////////////////// + +#include "matx.h" +#include +#include +#include +#include + +using namespace matx; + +/** + * MatX uses C++ expression templates to build arithmetic expressions that compile into a lazily-evaluated + * type for executing on the device. Currently, nvcc cannot see certain optimizations + * when building the expression tree that would be obvious by looking at the code. Specifically any code reusing + * the same tensor multiple times appears to the compiler as separate tensors, and it may issue multiple load + * instructions. While caching helps, this can have a slight performance impact when compared to native CUDA + * kernels. To work around this problem, complex expressions can be placed in a custom operator by adding some + * boilerplate code around the original expression. This custom operator can then be used either alone or inside + * other arithmetic expressions, and only a single load is issues for each tensor. + * + * This example uses the Black-Scholes equtation to demonstrate the two ways to implement the equation in MatX, and + * shows the performance difference. 
+ */ + +/* Custom operator */ +template +class BlackScholes : public BaseOp> { +private: + O out_; + I1 V_, S_, K_, r_, T_; + +public: + BlackScholes(O out, I1 K, I1 V, I1 S, I1 r, I1 T) + : out_(out), K_(K), V_(V), S_(S), r_(r), T_(T) {} + + __device__ inline void operator()(index_t idx) + { + auto V = V_(idx); + auto K = K_(idx); + auto S = S_(idx); + auto T = T_(idx); + auto r = r_(idx); + + auto VsqrtT = V * sqrt(T); + auto d1 = (log(S / K) + (r + 0.5 * V * V) * T) / VsqrtT ; + auto d2 = d1 - VsqrtT; + auto cdf_d1 = normcdf(d1); + auto cdf_d2 = normcdf(d2); + auto expRT = exp(-1 * r * T); + + out_(idx) = S * cdf_d1 - K * expRT * cdf_d2; + } + + __host__ __device__ inline index_t Size(uint32_t i) const { return out_.Size(i); } + static inline constexpr __host__ __device__ int32_t Rank() { return O::Rank(); } +}; + +/* Arithmetic expression */ +template +void compute_black_scholes_matx(tensor_t& K, + tensor_t& S, + tensor_t& V, + tensor_t& r, + tensor_t& T, + tensor_t& output, +#if 0 + cudaExecutor& exec) +#else + stfExecutor& exec) +#endif +{ + auto VsqrtT = V * sqrt(T); + auto d1 = (log(S / K) + (r + 0.5 * V * V) * T) / VsqrtT ; + auto d2 = d1 - VsqrtT; + auto cdf_d1 = normcdf(d1); + auto cdf_d2 = normcdf(d2); + auto expRT = exp(-1 * r * T); + (output = S * cdf_d1 - K * expRT * cdf_d2).run(exec); + +#if 0 + (output = K + sqrt(S+V)).run(exec); +#endif + + //std::cout << "Output : " << std::endl; + //print(output); +} + +int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) +{ + MATX_ENTER_HANDLER(); + + using dtype = double; + +#if 0 + index_t input_size = 100000000; +#else + index_t input_size = 10000; +#endif + constexpr uint32_t num_iterations = 1000; + float time_ms; + + tensor_t K_tensor{{input_size}}; + tensor_t S_tensor{{input_size}}; + tensor_t V_tensor{{input_size}}; + tensor_t r_tensor{{input_size}}; + tensor_t T_tensor{{input_size}}; + tensor_t output_tensor{{input_size}}; + + cudaStream_t stream; + cudaStreamCreate(&stream); +#if 0 + 
cudaExecutor exec{stream}; +#else + stfExecutor exec{stream}; + auto ctx = exec.getCtx(); +#endif + + /* Albert --- initilizing input .. */ + for (int i = 0; i < input_size; i++) { + K_tensor(i) = dtype(i+1); + S_tensor(i) = dtype(i+i+1); + V_tensor(i) = dtype(i+i+i+1); + r_tensor(i) = dtype(i+i+i+i+1); + T_tensor(i) = dtype(i+i+i+i+i+1); + } + +//print(V_tensor); + + compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + +#if 0 + cudaEventRecord(start, stream); +#else + cudaEventRecord(start, ctx.task_fence()); +#endif + // Time non-operator version + for (uint32_t i = 0; i < num_iterations; i++) { + compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec); + } +#if 0 + cudaEventRecord(stop, stream); +#else + cudaEventRecord(stop, ctx.task_fence()); +#endif + exec.sync(); +#if 1 + ctx.finalize(); + cudaEventElapsedTime(&time_ms, start, stop); +#endif + + printf("Time without custom operator = %.2fms per iteration\n", + time_ms / num_iterations); + cudaEventDestroy(start); + cudaEventDestroy(stop); + cudaStreamDestroy(stream); + CUDA_CHECK_LAST_ERROR(); + MATX_EXIT_HANDLER(); +} From 13736996468b9fe44856d46b8b4781e725ae22fb Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Tue, 17 Dec 2024 11:52:19 -0800 Subject: [PATCH 10/39] Fixes to the sync --- examples/simple_stf_test.cu | 44 ++++-------------------------------- include/matx/executors/stf.h | 10 ++++++-- 2 files changed, 12 insertions(+), 42 deletions(-) diff --git a/examples/simple_stf_test.cu b/examples/simple_stf_test.cu index 7061b715..6c048f92 100644 --- a/examples/simple_stf_test.cu +++ b/examples/simple_stf_test.cu @@ -52,39 +52,6 @@ using namespace matx; * shows the performance difference. 
*/ -/* Custom operator */ -template -class BlackScholes : public BaseOp> { -private: - O out_; - I1 V_, S_, K_, r_, T_; - -public: - BlackScholes(O out, I1 K, I1 V, I1 S, I1 r, I1 T) - : out_(out), K_(K), V_(V), S_(S), r_(r), T_(T) {} - - __device__ inline void operator()(index_t idx) - { - auto V = V_(idx); - auto K = K_(idx); - auto S = S_(idx); - auto T = T_(idx); - auto r = r_(idx); - - auto VsqrtT = V * sqrt(T); - auto d1 = (log(S / K) + (r + 0.5 * V * V) * T) / VsqrtT ; - auto d2 = d1 - VsqrtT; - auto cdf_d1 = normcdf(d1); - auto cdf_d2 = normcdf(d2); - auto expRT = exp(-1 * r * T); - - out_(idx) = S * cdf_d1 - K * expRT * cdf_d2; - } - - __host__ __device__ inline index_t Size(uint32_t i) const { return out_.Size(i); } - static inline constexpr __host__ __device__ int32_t Rank() { return O::Rank(); } -}; - /* Arithmetic expression */ template void compute_black_scholes_matx(tensor_t& K, @@ -106,11 +73,6 @@ void compute_black_scholes_matx(tensor_t& K, auto cdf_d2 = normcdf(d2); auto expRT = exp(-1 * r * T); (output = S * cdf_d1 - K * expRT * cdf_d2).run(exec); - -#if 0 - (output = K + sqrt(S+V)).run(exec); -#endif - //std::cout << "Output : " << std::endl; //print(output); } @@ -156,7 +118,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) //print(V_tensor); - compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec); + //compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec); cudaEvent_t start, stop; cudaEventCreate(&start); @@ -182,7 +144,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventElapsedTime(&time_ms, start, stop); #endif - printf("Time without custom operator = %.2fms per iteration\n", + // printf("Output tensor :\n"); + // print(output_tensor); + printf("Time without custom operator = %fms per iteration\n", time_ms / num_iterations); cudaEventDestroy(start); cudaEventDestroy(stop); diff --git 
a/include/matx/executors/stf.h b/include/matx/executors/stf.h index 64399230..83650fc7 100644 --- a/include/matx/executors/stf.h +++ b/include/matx/executors/stf.h @@ -79,6 +79,13 @@ template constexpr bool is_matx_set_op(); //ctx_ = cuda::experimental::stf::graph_ctx(); } +#if 0 + ~stfExecutor() { + std::cout << "About to call ctx.finalize\n"; + ctx_.finalize(); + } +#endif + /** * @brief Returns stream associated with executor */ @@ -94,7 +101,7 @@ template constexpr bool is_matx_set_op(); * @brief Synchronize the STF executor's stream * */ - void sync() { ctx_.task_fence(); } + void sync() { cudaStreamSynchronize(ctx_.task_fence()); } /** * Execute an operator on a device @@ -104,7 +111,6 @@ template constexpr bool is_matx_set_op(); **/ template void Exec(Op &op) const { - //std::cout << "exec on stfexecutor -- start\n"; #ifdef __CUDACC__ dim3 threads, blocks; From 92e72043e9d7eaa9709b57e3daf9db18d3f39875 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Tue, 17 Dec 2024 12:54:43 -0800 Subject: [PATCH 11/39] add support for cgsolve --- examples/cgsolve.cu | 2 +- include/matx/operators/cgsolve.h | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/examples/cgsolve.cu b/examples/cgsolve.cu index f77d8393..3d0a1c20 100644 --- a/examples/cgsolve.cu +++ b/examples/cgsolve.cu @@ -58,7 +58,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaExecutor exec{}; #else stfExecutor exec{}; - auto ctx = exec.getCtx(); #endif // Simple Poisson matrix @@ -89,6 +88,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) exec.sync(); #if 1 + auto ctx = exec.getCtx(); ctx.finalize(); #endif diff --git a/include/matx/operators/cgsolve.h b/include/matx/operators/cgsolve.h index 69628795..93245fe3 100644 --- a/include/matx/operators/cgsolve.h +++ b/include/matx/operators/cgsolve.h @@ -88,9 +88,19 @@ namespace matx return out_dims_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const 
noexcept { + /* Albert -- Scenario where the sum() operator is on the RHS and sum has already + run previously. So we make tmp_out have a read permission as it will be read from */ + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + + template void Exec(Out &&out, Executor &&ex) const{ //static_assert(is_cuda_executor_v, "cgsolve() only supports the CUDA executor currently"); + auto output = cuda::std::get<0>(out); + output.PreRun(out_dims_, std::forward(ex)); cgsolve_impl(cuda::std::get<0>(out), a_, b_, ex, tol_, max_iters_, ex.getStream()); } From a608f3fcf85e482eddbac407f5aa90ba97d06e6b Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Tue, 17 Dec 2024 13:07:10 -0800 Subject: [PATCH 12/39] update to the simple radar code --- examples/simple_radar_pipeline.cu | 15 ++++++++++++++- examples/simple_radar_pipeline.h | 7 +++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/examples/simple_radar_pipeline.cu b/examples/simple_radar_pipeline.cu index 10190d5a..1cec32a4 100644 --- a/examples/simple_radar_pipeline.cu +++ b/examples/simple_radar_pipeline.cu @@ -36,11 +36,18 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) { MATX_ENTER_HANDLER(); index_t numChannels = 16; +#if 1 index_t numPulses = 128; index_t numSamples = 9000; index_t waveformLength = 1000; - constexpr bool ENABLE_GRAPHS = false; uint32_t iterations = 100; +#else + index_t numPulses = 128; + index_t numSamples = 1000; + index_t waveformLength = 1000; + uint32_t iterations = 20; +#endif + constexpr bool ENABLE_GRAPHS = false; constexpr int num_streams = 1; cudaGraph_t graphs[num_streams]; cudaGraphExec_t instances[num_streams]; @@ -77,7 +84,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } /* Get STF context handle */ +#if 1 auto ctx = pipelines[0]->exec.getCtx(); +#endif MATX_NVTX_START_RANGE("Pipeline Test", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 2) printf("Running test...\n"); @@ -142,11 +151,14 @@ int main([[maybe_unused]] int 
argc, [[maybe_unused]] char **argv) pipelines[s]->sync(); } +#if 1 ctx.finalize(); +#endif MATX_NVTX_END_RANGE(2) MATX_NVTX_START_RANGE("Pipeline Results", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 3) +#if 1 float time_ms; cudaEventElapsedTime(&time_ms, starts[num_streams-1], stops[num_streams-1]); float time_s = time_ms * .001f; @@ -156,6 +168,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) time_ms, static_cast(mult) / time_s, static_cast(mult*sizeof(complex)*numSamples*8)/time_s/1e9); +#endif for (int s = 0; s < num_streams; s++) { cudaEventDestroy(starts[s]); diff --git a/examples/simple_radar_pipeline.h b/examples/simple_radar_pipeline.h index 87e19f94..336be74b 100644 --- a/examples/simple_radar_pipeline.h +++ b/examples/simple_radar_pipeline.h @@ -465,11 +465,10 @@ class RadarPipeline { tensor_t cfarMaskView; cudaStream_t stream; -#if 0 - cudaExecutor exec; -#else +#if 1 public: stfExecutor exec; +#else + cudaExecutor exec; #endif - }; From b0625778dda21476b5e361af2a6dee4a3137bd1b Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Tue, 17 Dec 2024 13:11:29 -0800 Subject: [PATCH 13/39] minor typo fix --- examples/simple_stf_test.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/simple_stf_test.cu b/examples/simple_stf_test.cu index 6c048f92..8947f191 100644 --- a/examples/simple_stf_test.cu +++ b/examples/simple_stf_test.cu @@ -141,8 +141,8 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) exec.sync(); #if 1 ctx.finalize(); - cudaEventElapsedTime(&time_ms, start, stop); #endif + cudaEventElapsedTime(&time_ms, start, stop); // printf("Output tensor :\n"); // print(output_tensor); From bbf9abc10665caf8382fc97ca1a42d4ff6e432b8 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Thu, 19 Dec 2024 11:37:17 -0800 Subject: [PATCH 14/39] update version of stf --- cmake/versions.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/versions.json b/cmake/versions.json index 
e6eba5b3..acea4f9a 100644 --- a/cmake/versions.json +++ b/cmake/versions.json @@ -3,7 +3,7 @@ "CCCL": { "version": "2.8.0", "git_url": "https://github.com/NVIDIA/cccl.git", - "git_tag": "cb1fce5e1cb7362940bd7e74ab8fbf01942b6264" + "git_tag": "980e55d37f9c48776df977a4f59f9ef0cdb816f2" } } } From 3e831ea4d35678d3bfc7e6bca1db15611a58b14c Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Thu, 19 Dec 2024 11:51:49 -0800 Subject: [PATCH 15/39] cleanup constexpr case for stfexecutor --- include/matx/operators/fft.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/include/matx/operators/fft.h b/include/matx/operators/fft.h index cf86acdc..9435d56a 100644 --- a/include/matx/operators/fft.h +++ b/include/matx/operators/fft.h @@ -156,7 +156,7 @@ namespace matx template void Exec(Out &&out, Executor &&ex) const { // stfexecutor case - if constexpr (!is_cuda_executor_v) { + if constexpr (is_stf_executor_v) { auto ctx = ex.getCtx(); auto tsk = ctx.task(); tsk.set_symbol("fft_task_no_perm"); @@ -185,8 +185,9 @@ namespace matx } } }; - } - else if constexpr (is_cuda_executor_v) { + } + // cudaExecutor or host case + else { if constexpr (std::is_same_v) { if constexpr (std::is_same_v) { fft_impl(cuda::std::get<0>(out), a_, fft_size_, norm_, ex); @@ -204,9 +205,6 @@ namespace matx } } } - else { - printf("SHOULDNT BE HERE..\n"); - } } template From 702fe79195c87abb78d4b54448f1671d1d13500d Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Thu, 19 Dec 2024 12:04:31 -0800 Subject: [PATCH 16/39] cleanup constexpr case for stfexecutor --- include/matx/core/tensor_impl.h | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/include/matx/core/tensor_impl.h b/include/matx/core/tensor_impl.h index f837f9cc..e79c4e06 100644 --- a/include/matx/core/tensor_impl.h +++ b/include/matx/core/tensor_impl.h @@ -1133,27 +1133,21 @@ IGNORE_WARNING_POP_GCC template __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, 
[[maybe_unused]] Executor &&ex) const noexcept { - using namespace cuda::experimental::stf; - data_place place; - - if constexpr (is_cuda_executor_v) { - return; - } - else if constexpr (!is_cuda_executor_v) { - + if constexpr (is_stf_executor_v) { + using namespace cuda::experimental::stf; /* Don't create a new logical data for a tensor if it alread had one created previously */ if (stf_ldata_ && stf_ldata_->has_value()) { return; } auto ctx = ex.getCtx(); #if 0 // Determine the type of memory that was allocated ie. host/managed/etc - place = getDataPlace(Data()); + data_place place = getDataPlace(Data()); #endif *stf_ldata_ = ctx.logical_data(cuda::experimental::stf::void_interface()); //stf_ldata_->value().set_write_back(false); stf_ldata_->value().set_symbol(this->str()); - } + } } template From 5bfe21e3bdcba37119f1665a9db537329d502d7e Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Thu, 19 Dec 2024 12:36:30 -0800 Subject: [PATCH 17/39] add conditional support for cudagraph to the stf executor --- include/matx/executors/stf.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/include/matx/executors/stf.h b/include/matx/executors/stf.h index 83650fc7..7f8b9674 100644 --- a/include/matx/executors/stf.h +++ b/include/matx/executors/stf.h @@ -59,32 +59,32 @@ template constexpr bool is_matx_set_op(); * * @param stream CUDA stream */ - stfExecutor(cudaStream_t stream) : stream_(stream) { + stfExecutor(cudaStream_t stream, bool is_cudagraph = false) : stream_(stream) { cuda::experimental::stf::async_resources_handle handle; - ctx_ = cuda::experimental::stf::stream_ctx(stream, handle); - //ctx_ = cuda::experimental::stf::graph_ctx(stream, handle); + if (!is_cudagraph) + ctx_ = cuda::experimental::stf::stream_ctx(stream, handle); + else + ctx_ = cuda::experimental::stf::graph_ctx(stream, handle); } - stfExecutor(int stream) : stream_(reinterpret_cast(stream)) { + + stfExecutor(int stream, bool is_cudagraph = false) 
: stream_(reinterpret_cast(stream)) { cuda::experimental::stf::async_resources_handle handle; - ctx_ = cuda::experimental::stf::stream_ctx(reinterpret_cast(stream), handle); - //ctx_ = cuda::experimental::stf::graph_ctx(reinterpret_cast(stream), handle); + if (!is_cudagraph) + ctx_ = cuda::experimental::stf::stream_ctx(reinterpret_cast(stream), handle); + else + ctx_ = cuda::experimental::stf::graph_ctx(reinterpret_cast(stream), handle); } /** * @brief Construct a new stfExecutor object using the default stream * */ - stfExecutor() : stream_(0) { - ctx_ = cuda::experimental::stf::stream_ctx(); - //ctx_ = cuda::experimental::stf::graph_ctx(); - } - -#if 0 - ~stfExecutor() { - std::cout << "About to call ctx.finalize\n"; - ctx_.finalize(); + stfExecutor(bool is_cudagraph = false) : stream_(0) { + if (!is_cudagraph) + ctx_ = cuda::experimental::stf::stream_ctx(); + else + ctx_ = cuda::experimental::stf::graph_ctx(); } -#endif /** * @brief Returns stream associated with executor From f407256a4815b4d43b9bc5d15ba2ad3dd2d4a39f Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Thu, 9 Jan 2025 12:46:19 -0800 Subject: [PATCH 18/39] update to latest cudastf --- cmake/versions.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/versions.json b/cmake/versions.json index acea4f9a..108b6bc1 100644 --- a/cmake/versions.json +++ b/cmake/versions.json @@ -3,7 +3,7 @@ "CCCL": { "version": "2.8.0", "git_url": "https://github.com/NVIDIA/cccl.git", - "git_tag": "980e55d37f9c48776df977a4f59f9ef0cdb816f2" + "git_tag": "36e27f7c1074010eefaab64d387ff6663569e065" } } } From 221599cb3e6a56861017969ec056c0766cfe8608 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Thu, 9 Jan 2025 12:52:56 -0800 Subject: [PATCH 19/39] switch to use logical token --- include/matx/core/tensor_impl.h | 3 +-- include/matx/executors/stf.h | 4 +++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/matx/core/tensor_impl.h b/include/matx/core/tensor_impl.h index 
e79c4e06..142d9008 100644 --- a/include/matx/core/tensor_impl.h +++ b/include/matx/core/tensor_impl.h @@ -1144,8 +1144,7 @@ IGNORE_WARNING_POP_GCC data_place place = getDataPlace(Data()); #endif - *stf_ldata_ = ctx.logical_data(cuda::experimental::stf::void_interface()); - //stf_ldata_->value().set_write_back(false); + *stf_ldata_ = ctx.logical_token(); stf_ldata_->value().set_symbol(this->str()); } } diff --git a/include/matx/executors/stf.h b/include/matx/executors/stf.h index 7f8b9674..e055a051 100644 --- a/include/matx/executors/stf.h +++ b/include/matx/executors/stf.h @@ -86,6 +86,9 @@ template constexpr bool is_matx_set_op(); ctx_ = cuda::experimental::stf::graph_ctx(); } + ~stfExecutor() { + //ctx_.finalize(); + } /** * @brief Returns stream associated with executor */ @@ -245,7 +248,6 @@ template constexpr bool is_matx_set_op(); #else MATX_ASSERT_STR(false, matxInvalidParameter, "Cannot call device executor using host compiler"); #endif - //std::cout << "exec on stfexecutor -- stop\n"; } private: From 7a5bb6cf72fabe62be9ee23f51d24e1682e55be5 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Thu, 9 Jan 2025 12:55:06 -0800 Subject: [PATCH 20/39] update parameters for radar code --- examples/simple_radar_pipeline.cu | 17 +++++++++++------ examples/simple_radar_pipeline.h | 2 ++ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/examples/simple_radar_pipeline.cu b/examples/simple_radar_pipeline.cu index 1cec32a4..47e8f912 100644 --- a/examples/simple_radar_pipeline.cu +++ b/examples/simple_radar_pipeline.cu @@ -35,20 +35,21 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) { MATX_ENTER_HANDLER(); +#if 0 index_t numChannels = 16; -#if 1 index_t numPulses = 128; index_t numSamples = 9000; index_t waveformLength = 1000; uint32_t iterations = 100; #else + index_t numChannels = 16; index_t numPulses = 128; index_t numSamples = 1000; index_t waveformLength = 1000; - uint32_t iterations = 20; + uint32_t iterations = 100; #endif 
constexpr bool ENABLE_GRAPHS = false; - constexpr int num_streams = 1; + constexpr int num_streams = 8; cudaGraph_t graphs[num_streams]; cudaGraphExec_t instances[num_streams]; using complex = cuda::std::complex; @@ -85,7 +86,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) /* Get STF context handle */ #if 1 - auto ctx = pipelines[0]->exec.getCtx(); #endif MATX_NVTX_START_RANGE("Pipeline Test", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 2) @@ -129,6 +129,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) #if 0 cudaEventRecord(starts[s], streams[s]); #else + auto ctx = pipelines[s]->exec.getCtx(); cudaEventRecord(starts[s], ctx.task_fence()); #endif } @@ -145,14 +146,18 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) for (int s = 0; s < num_streams; s++) { #if 0 cudaEventRecord(stops[s], streams[s]); + pipelines[s]->sync(); #else + auto ctx = pipelines[s]->exec.getCtx(); cudaEventRecord(stops[s], ctx.task_fence()); #endif - pipelines[s]->sync(); } #if 1 - ctx.finalize(); + for (int s = 0; s < num_streams; s++) { + auto ctx = pipelines[s]->exec.getCtx(); + ctx.finalize(); + } #endif MATX_NVTX_END_RANGE(2) diff --git a/examples/simple_radar_pipeline.h b/examples/simple_radar_pipeline.h index 336be74b..74bb4b9f 100644 --- a/examples/simple_radar_pipeline.h +++ b/examples/simple_radar_pipeline.h @@ -120,6 +120,7 @@ class RadarPipeline { RadarPipeline() = delete; ~RadarPipeline() { + std::cout << "DTOR for radar\n"; } @@ -137,6 +138,7 @@ class RadarPipeline { : numPulses(_numPulses), numSamples(_numSamples), waveformLength(_wfLen), numChannels(_numChannels), stream(_stream), exec(_stream) { + std::cout << "CTOR for pipeline\n"; numSamplesRnd = 1; while (numSamplesRnd < numSamples) { numSamplesRnd *= 2; From 0c2432fd092f63161f18809e797dca626ca61fbf Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Thu, 9 Jan 2025 13:27:46 -0800 Subject: [PATCH 21/39] update to radar code to work with command line args --- 
examples/simple_radar_pipeline.cu | 70 +++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 22 deletions(-) diff --git a/examples/simple_radar_pipeline.cu b/examples/simple_radar_pipeline.cu index 47e8f912..9bb91754 100644 --- a/examples/simple_radar_pipeline.cu +++ b/examples/simple_radar_pipeline.cu @@ -48,30 +48,61 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) index_t waveformLength = 1000; uint32_t iterations = 100; #endif + +#if 0 + constexpr int numStreams = 8; +#else + int numStreams = 1; +#endif + +#if 1 + // Parse command-line arguments + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + + if (arg == "--numChannels" && i + 1 < argc) { + numChannels = std::stoi(argv[++i]); + } else if (arg == "--numPulses" && i + 1 < argc) { + numPulses = std::stoi(argv[++i]); + } else if (arg == "--numSamples" && i + 1 < argc) { + numSamples = std::stoi(argv[++i]); + } else if (arg == "--waveformLength" && i + 1 < argc) { + waveformLength = std::stoi(argv[++i]); + } else if (arg == "--iterations" && i + 1 < argc) { + iterations = std::stoi(argv[++i]); + } else if (arg == "--numStreams" && i + 1 < argc) { + numStreams = std::stoi(argv[++i]); + } else { + std::cerr << "Unknown option or missing value: " << arg << std::endl; + return 1; // Exit with error + } + } +#endif + constexpr bool ENABLE_GRAPHS = false; - constexpr int num_streams = 8; - cudaGraph_t graphs[num_streams]; - cudaGraphExec_t instances[num_streams]; + cudaGraph_t graphs[numStreams]; + cudaGraphExec_t instances[numStreams]; using complex = cuda::std::complex; - RadarPipeline *pipelines[num_streams]; + RadarPipeline *pipelines[numStreams]; std::cout << "Iterations: " << iterations << std::endl; std::cout << "numChannels: " << numChannels << std::endl; std::cout << "numPulses: " << numPulses << std::endl; std::cout << "numNumSamples: " << numSamples << std::endl; std::cout << "waveformLength: " << waveformLength << std::endl; + std::cout << "numStreams: " << 
numStreams << std::endl; // cuda stream to place work in - cudaStream_t streams[num_streams]; + cudaStream_t streams[numStreams]; // manually set to log all NVTX levels MATX_NVTX_SET_LOG_LEVEL( matx_nvxtLogLevels::MATX_NVTX_LOG_ALL ); // create some events for timing - cudaEvent_t starts[num_streams]; - cudaEvent_t stops[num_streams]; + cudaEvent_t starts[numStreams]; + cudaEvent_t stops[numStreams]; - for (int s = 0; s < num_streams; s++) { + for (int s = 0; s < numStreams; s++) { cudaEventCreate(&starts[s]); cudaEventCreate(&stops[s]); cudaStreamCreate(&streams[s]); @@ -84,10 +115,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) pipelines[s]->sync(); } - /* Get STF context handle */ -#if 1 -#endif - MATX_NVTX_START_RANGE("Pipeline Test", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 2) printf("Running test...\n"); @@ -110,12 +137,12 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) }; // Warmup - for (int s = 0; s < num_streams; s++) { + for (int s = 0; s < numStreams; s++) { run_pipeline(s); } if (ENABLE_GRAPHS) { - for (int s = 0; s < num_streams; s++) { + for (int s = 0; s < numStreams; s++) { cudaStreamBeginCapture(streams[s], cudaStreamCaptureModeGlobal); run_pipeline(s); cudaStreamEndCapture(streams[s], &graphs[s]); @@ -124,7 +151,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } for (uint32_t i = 0; i < iterations; i++) { - for (int s = 0; s < num_streams; s++) { + for (int s = 0; s < numStreams; s++) { if (i == 1) { #if 0 cudaEventRecord(starts[s], streams[s]); @@ -143,18 +170,19 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } } - for (int s = 0; s < num_streams; s++) { + for (int s = 0; s < numStreams; s++) { #if 0 cudaEventRecord(stops[s], streams[s]); pipelines[s]->sync(); #else auto ctx = pipelines[s]->exec.getCtx(); cudaEventRecord(stops[s], ctx.task_fence()); + pipelines[s]->sync(); #endif } #if 1 - for (int s = 0; s < num_streams; s++) { + for (int s = 0; s < 
numStreams; s++) { auto ctx = pipelines[s]->exec.getCtx(); ctx.finalize(); } @@ -163,19 +191,17 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) MATX_NVTX_END_RANGE(2) MATX_NVTX_START_RANGE("Pipeline Results", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 3) -#if 1 float time_ms; - cudaEventElapsedTime(&time_ms, starts[num_streams-1], stops[num_streams-1]); + cudaEventElapsedTime(&time_ms, starts[numStreams-1], stops[numStreams-1]); float time_s = time_ms * .001f; - auto mult = iterations * numChannels * numPulses * num_streams; + auto mult = iterations * numChannels * numPulses * numStreams; printf("Pipeline finished in %.2fms, rate: %.2f pulses/channel/sec (%.2f Gbps)\n", time_ms, static_cast(mult) / time_s, static_cast(mult*sizeof(complex)*numSamples*8)/time_s/1e9); -#endif -for (int s = 0; s < num_streams; s++) { +for (int s = 0; s < numStreams; s++) { cudaEventDestroy(starts[s]); cudaEventDestroy(stops[s]); cudaStreamDestroy(streams[s]); From 3ae267be117b4d9b3640f72b32d198bcf8afa789 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Thu, 9 Jan 2025 15:03:26 -0800 Subject: [PATCH 22/39] cleanup to support different executor --- examples/simple_radar_pipeline.cu | 49 ++++++++++++------------------- examples/simple_radar_pipeline.h | 6 +++- 2 files changed, 24 insertions(+), 31 deletions(-) diff --git a/examples/simple_radar_pipeline.cu b/examples/simple_radar_pipeline.cu index 9bb91754..98ef04fd 100644 --- a/examples/simple_radar_pipeline.cu +++ b/examples/simple_radar_pipeline.cu @@ -35,19 +35,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) { MATX_ENTER_HANDLER(); -#if 0 index_t numChannels = 16; index_t numPulses = 128; index_t numSamples = 9000; index_t waveformLength = 1000; uint32_t iterations = 100; -#else - index_t numChannels = 16; - index_t numPulses = 128; - index_t numSamples = 1000; - index_t waveformLength = 1000; - uint32_t iterations = 100; -#endif #if 0 constexpr int numStreams = 8; @@ -55,7 +47,6 @@ int 
main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) int numStreams = 1; #endif -#if 1 // Parse command-line arguments for (int i = 1; i < argc; ++i) { std::string arg = argv[i]; @@ -77,21 +68,20 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) return 1; // Exit with error } } -#endif - - constexpr bool ENABLE_GRAPHS = false; - cudaGraph_t graphs[numStreams]; - cudaGraphExec_t instances[numStreams]; - using complex = cuda::std::complex; - RadarPipeline *pipelines[numStreams]; std::cout << "Iterations: " << iterations << std::endl; std::cout << "numChannels: " << numChannels << std::endl; std::cout << "numPulses: " << numPulses << std::endl; - std::cout << "numNumSamples: " << numSamples << std::endl; + std::cout << "numSamples: " << numSamples << std::endl; std::cout << "waveformLength: " << waveformLength << std::endl; std::cout << "numStreams: " << numStreams << std::endl; + constexpr bool ENABLE_GRAPHS = false; + cudaGraph_t graphs[numStreams]; + cudaGraphExec_t instances[numStreams]; + using complex = cuda::std::complex; + RadarPipeline *pipelines[numStreams]; + // cuda stream to place work in cudaStream_t streams[numStreams]; @@ -153,11 +143,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) for (uint32_t i = 0; i < iterations; i++) { for (int s = 0; s < numStreams; s++) { if (i == 1) { -#if 0 - cudaEventRecord(starts[s], streams[s]); -#else +#ifdef USE_STF auto ctx = pipelines[s]->exec.getCtx(); cudaEventRecord(starts[s], ctx.task_fence()); +#else + cudaEventRecord(starts[s], streams[s]); #endif } @@ -171,21 +161,20 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } for (int s = 0; s < numStreams; s++) { -#if 0 - cudaEventRecord(stops[s], streams[s]); - pipelines[s]->sync(); -#else +#ifdef USE_STF auto ctx = pipelines[s]->exec.getCtx(); cudaEventRecord(stops[s], ctx.task_fence()); - pipelines[s]->sync(); +#else + cudaEventRecord(stops[s], streams[s]); #endif + pipelines[s]->sync(); } -#if 1 
- for (int s = 0; s < numStreams; s++) { - auto ctx = pipelines[s]->exec.getCtx(); - ctx.finalize(); - } +#ifdef USE_STF + for (int s = 0; s < numStreams; s++) { + auto ctx = pipelines[s]->exec.getCtx(); + ctx.finalize(); + } #endif MATX_NVTX_END_RANGE(2) diff --git a/examples/simple_radar_pipeline.h b/examples/simple_radar_pipeline.h index 74bb4b9f..7aea6c8d 100644 --- a/examples/simple_radar_pipeline.h +++ b/examples/simple_radar_pipeline.h @@ -35,6 +35,10 @@ #include #include +#ifndef USE_STF +#define USE_STF 1 +#endif + using namespace matx; /** @@ -467,7 +471,7 @@ class RadarPipeline { tensor_t cfarMaskView; cudaStream_t stream; -#if 1 +#ifdef USE_STF public: stfExecutor exec; #else From 6a75794e7a17cffba884e65f882510a26609174c Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Fri, 24 Jan 2025 11:10:24 -0800 Subject: [PATCH 23/39] cleanup radar code to emit stf and cuda versions --- examples/CMakeLists.txt | 18 +++++++++++++++--- examples/simple_radar_pipeline.cu | 12 ++++++++++++ examples/simple_radar_pipeline.h | 7 ------- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 563b6fc3..fbbd1be1 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -20,9 +20,6 @@ set(examples black_scholes print_styles) - - - add_library(example_lib INTERFACE) target_include_directories(example_lib SYSTEM INTERFACE ${CUTLASS_INC} ${pybind11_INCLUDE_DIR} ${PYTHON_INCLUDE_DIRS}) @@ -54,6 +51,21 @@ foreach( example ${examples} ) target_link_libraries(${example} example_lib) endforeach() +# Compile all examples with CUDASTF and append their name with _stf +foreach( example ${examples} ) + string( CONCAT file ${example} ".cu" ) + + set(output_name "${example}_stf") + add_executable( ${output_name} ${file} ) + + # Add the -DUSE_STF compilation flag + target_compile_definitions(${output_name} PRIVATE USE_STF) + + target_link_libraries(${output_name} example_lib) +endforeach() + + + # Build 
proprietary examples file (GLOB_RECURSE proprietary_sources CONFIGURE_DEPENDS ${CMAKE_SOURCE_DIR}/proprietary/*/examples/*.cu) foreach (pexample ${proprietary_sources}) diff --git a/examples/simple_radar_pipeline.cu b/examples/simple_radar_pipeline.cu index 98ef04fd..fb6a64a2 100644 --- a/examples/simple_radar_pipeline.cu +++ b/examples/simple_radar_pipeline.cu @@ -41,6 +41,12 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) index_t waveformLength = 1000; uint32_t iterations = 100; +#ifdef USE_STF + std::cout << "Using STF executor\n"; +#else + std::cout << "Using CUDA executor\n"; +#endif + #if 0 constexpr int numStreams = 8; #else @@ -98,7 +104,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaStreamCreate(&streams[s]); MATX_NVTX_START_RANGE("Pipeline Initialize", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 1) +#if 0 printf("Initializing data structures for stream %d...\n", s); +#endif pipelines[s] = new RadarPipeline(numPulses, numSamples, waveformLength, numChannels, streams[s]); MATX_NVTX_END_RANGE(1) @@ -106,7 +114,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } MATX_NVTX_START_RANGE("Pipeline Test", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 2) +#if 0 printf("Running test...\n"); +#endif auto run_pipeline = [&](int s) { MATX_NVTX_START_RANGE("PulseCompression", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 21) @@ -164,8 +174,10 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) #ifdef USE_STF auto ctx = pipelines[s]->exec.getCtx(); cudaEventRecord(stops[s], ctx.task_fence()); + std::cout << "using stf 2\n"; #else cudaEventRecord(stops[s], streams[s]); + std::cout << "using cudaexec 2\n"; #endif pipelines[s]->sync(); } diff --git a/examples/simple_radar_pipeline.h b/examples/simple_radar_pipeline.h index 7aea6c8d..4892e679 100644 --- a/examples/simple_radar_pipeline.h +++ b/examples/simple_radar_pipeline.h @@ -35,10 +35,6 @@ #include #include -#ifndef USE_STF -#define USE_STF 1 -#endif 
- using namespace matx; /** @@ -124,8 +120,6 @@ class RadarPipeline { RadarPipeline() = delete; ~RadarPipeline() { - std::cout << "DTOR for radar\n"; - } /** @@ -142,7 +136,6 @@ class RadarPipeline { : numPulses(_numPulses), numSamples(_numSamples), waveformLength(_wfLen), numChannels(_numChannels), stream(_stream), exec(_stream) { - std::cout << "CTOR for pipeline\n"; numSamplesRnd = 1; while (numSamplesRnd < numSamples) { numSamplesRnd *= 2; From f1facca21812b70aef36d2348cbe16d11ffbbb0f Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Fri, 24 Jan 2025 11:14:52 -0800 Subject: [PATCH 24/39] test script that runs simple radar with different input sizes. outputs a .csv file of results --- examples/simple_radar_pipeline_test.sh | 45 ++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100755 examples/simple_radar_pipeline_test.sh diff --git a/examples/simple_radar_pipeline_test.sh b/examples/simple_radar_pipeline_test.sh new file mode 100755 index 00000000..12fd39a1 --- /dev/null +++ b/examples/simple_radar_pipeline_test.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +COMMAND_PARAMETERS=("../build/examples/simple_radar_pipeline" "../build/examples/simple_radar_pipeline_stf") +STREAM_PARAMETERS=("--numStreams 1" "--numStreams 2" "--numStreams 4" "--numStreams 8") +#SAMPLE_PARAMETERS=("--numSamples 1000" "--numSamples 4500" "--numSamples 9000") +SAMPLE_PARAMETERS=("--numSamples 1000" "--numSamples 2000" "--numSamples 4500" "--numSamples 6000") + +NUM_RUNS=5 + +OUTPUT_FILE="radar_heatmap_data.csv" +# Initialize the CSV file +echo "Command,NumStreams,NumSamples,AverageGbps" > $OUTPUT_FILE + +# Loop through the parameters +for command_param in "${COMMAND_PARAMETERS[@]}"; do + for str_param in "${STREAM_PARAMETERS[@]}"; do + for sam_param in "${SAMPLE_PARAMETERS[@]}"; do + TOTAL_GBPS=0 + for i in $(seq 1 $NUM_RUNS); do + #echo "Iteration $i with parameter $command_param $str_param $sam_param" + OUTPUT=$($command_param $str_param $sam_param) + GBPS=$(echo 
"$OUTPUT" | grep -oP '(?<=\().*? Gbps' | awk '{print $1}') + + # Add the extracted value to the total + if [ -n "$GBPS" ]; then + TOTAL_GBPS=$(echo "$TOTAL_GBPS + $GBPS" | bc) + else + echo "Failed to extract Gbps for iteration $i." + fi + done + + # Calculate the average + if [ "$NUM_RUNS" -gt 0 ]; then + AVERAGE_GBPS=$(echo "$TOTAL_GBPS / $NUM_RUNS" | bc -l) + #echo "$command_param $str_param $sam_param verage Gbps over $NUM_RUNS runs: $AVERAGE_GBPS" + # Append the results to the CSV file + echo "$command_param,$(echo $str_param | awk '{print $2}'),$(echo $sam_param | awk '{print $2}'),$AVERAGE_GBPS" >> $OUTPUT_FILE + else + echo "No runs were performed." + fi + done + done +done + +echo "Heatmap data saved to $OUTPUT_FILE." From 0199e7517c2a466912864f5c0d6daf5557760375 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Fri, 24 Jan 2025 15:05:23 -0800 Subject: [PATCH 25/39] enable cuda graphs as a command line argument enableGraphs --- examples/simple_radar_pipeline.cu | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/simple_radar_pipeline.cu b/examples/simple_radar_pipeline.cu index fb6a64a2..d200aca0 100644 --- a/examples/simple_radar_pipeline.cu +++ b/examples/simple_radar_pipeline.cu @@ -41,6 +41,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) index_t waveformLength = 1000; uint32_t iterations = 100; + bool enableGraphs = false; #ifdef USE_STF std::cout << "Using STF executor\n"; #else @@ -69,6 +70,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) iterations = std::stoi(argv[++i]); } else if (arg == "--numStreams" && i + 1 < argc) { numStreams = std::stoi(argv[++i]); + } else if (arg == "--enableGraphs") { + enableGraphs = true; + ++i; } else { std::cerr << "Unknown option or missing value: " << arg << std::endl; return 1; // Exit with error @@ -81,8 +85,8 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) std::cout << "numSamples: " << numSamples << 
std::endl; std::cout << "waveformLength: " << waveformLength << std::endl; std::cout << "numStreams: " << numStreams << std::endl; + std::cout << "enableGraphs: " << enableGraphs << std::endl; - constexpr bool ENABLE_GRAPHS = false; cudaGraph_t graphs[numStreams]; cudaGraphExec_t instances[numStreams]; using complex = cuda::std::complex; @@ -141,7 +145,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) run_pipeline(s); } - if (ENABLE_GRAPHS) { + if (enableGraphs) { for (int s = 0; s < numStreams; s++) { cudaStreamBeginCapture(streams[s], cudaStreamCaptureModeGlobal); run_pipeline(s); @@ -161,7 +165,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) #endif } - if (ENABLE_GRAPHS) { + if (enableGraphs) { cudaGraphLaunch(instances[s], streams[s]); } else { @@ -174,10 +178,8 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) #ifdef USE_STF auto ctx = pipelines[s]->exec.getCtx(); cudaEventRecord(stops[s], ctx.task_fence()); - std::cout << "using stf 2\n"; #else cudaEventRecord(stops[s], streams[s]); - std::cout << "using cudaexec 2\n"; #endif pipelines[s]->sync(); } From 39b16f4bce9e2ffcec5a1aaa127d63c05ec173f1 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 27 Jan 2025 15:41:11 -0800 Subject: [PATCH 26/39] add support for the random/randomOp generator --- include/matx/generators/random.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/matx/generators/random.h b/include/matx/generators/random.h index 9cd89854..ee3ada05 100644 --- a/include/matx/generators/random.h +++ b/include/matx/generators/random.h @@ -305,7 +305,7 @@ namespace detail { { InnerPreRun(std::forward(shape), std::forward(ex)); #ifdef __CUDACC__ - if constexpr (is_cuda_executor_v) { + if constexpr ((is_cuda_executor_v) || (is_stf_executor_v)) { if (!init_) { auto stream = ex.getStream(); matxAlloc((void **)&states_, @@ -341,7 +341,7 @@ namespace detail { template __MATX_INLINE__ void 
PostRun([[maybe_unused]] ST &&shape, [[maybe_unused]] Executor &&ex) const noexcept { - if constexpr (is_cuda_executor_v) { + if constexpr ((is_cuda_executor_v) || (is_stf_executor_v)) { matxFree(states_); } else if constexpr (is_host_executor_v) { @@ -479,6 +479,9 @@ namespace detail { return shape_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task([[maybe_unused]] Task &&task, [[maybe_unused]] int perm=1) const noexcept { } + static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() { return RANK; } }; } From 9b7c4b0e4c7ebf2f08e1dcc114d0b2827cd7a4b1 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 27 Jan 2025 15:42:24 -0800 Subject: [PATCH 27/39] get the basic spectrogram code working with stf --- examples/spectrogram.cu | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/examples/spectrogram.cu b/examples/spectrogram.cu index 8d566b32..16464042 100644 --- a/examples/spectrogram.cu +++ b/examples/spectrogram.cu @@ -65,7 +65,18 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventCreate(&start); cudaEventCreate(&stop); +#ifdef USE_STF + std::cout << "Using STF executor\n"; +#else + std::cout << "Using CUDA executor\n"; +#endif + + +#ifdef USE_STF + stfExecutor exec{stream}; +#else cudaExecutor exec{stream}; +#endif float fs = 10000; constexpr index_t N = 100000; @@ -108,7 +119,12 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) for (uint32_t i = 0; i < num_iterations; i++) { if (i == 2) { // Start timer on third loop to allow generation of plot +#if USE_STF + auto ctx = exec.getCtx(); + cudaEventRecord(start, ctx.task_fence()); +#else cudaEventRecord(start, stream); +#endif } // DFT Sample Frequencies (rfftfreq) @@ -142,9 +158,23 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } } - +#ifdef USE_STF +{ + auto ctx = exec.getCtx(); + cudaEventRecord(stop, ctx.task_fence()); +} +#else cudaEventRecord(stop, stream); 
+#endif exec.sync(); + +#ifdef USE_STF +{ + auto ctx = exec.getCtx(); + ctx.finalize(); +} +#endif + cudaEventElapsedTime(&time_ms, start, stop); printf("Spectrogram Time Without Graphs = %.2fus per iteration\n", From f9e09f104550ac202e01b4af201160247b0c13fc Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 27 Jan 2025 15:45:20 -0800 Subject: [PATCH 28/39] get spectrogram cudagraph code working with stf --- examples/spectrogram_graph.cu | 36 +++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/examples/spectrogram_graph.cu b/examples/spectrogram_graph.cu index 6f0583c8..91958ca2 100644 --- a/examples/spectrogram_graph.cu +++ b/examples/spectrogram_graph.cu @@ -62,12 +62,22 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaStream_t stream; cudaStreamCreate(&stream); - cudaExecutor exec{stream}; - cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); +#ifdef USE_STF + std::cout << "Using STF executor\n"; +#else + std::cout << "Using CUDA executor\n"; +#endif + +#ifdef USE_STF + stfExecutor exec{stream}; +#else + cudaExecutor exec{stream}; +#endif + float fs = 10000; index_t N = 100000; float amp = static_cast(2 * sqrt(2)); @@ -147,15 +157,33 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } } - exec.sync(); // Time graph execution of same kernels - cudaEventRecord(start, stream); +#if USE_STF + auto ctx = exec.getCtx(); + cudaEventRecord(start, ctx.task_fence()); +#else + cudaEventRecord(start, stream); +#endif + for (uint32_t i = 0; i < 10; i++) { cudaGraphLaunch(instance, stream); } +#ifdef USE_STF +{ + cudaEventRecord(stop, ctx.task_fence()); +} +#else cudaEventRecord(stop, stream); +#endif exec.sync(); + +#ifdef USE_STF +{ + ctx.finalize(); +} +#endif + cudaEventElapsedTime(&time_ms, start, stop); printf("Spectrogram Time With Graphs = %.2fus per iteration\n", From 6c9a791576fcd8be714fbb7c9f133050e87db579 Mon Sep 17 00:00:00 2001 From: Albert 
Sidelnik 
Date: Mon, 10 Feb 2025 10:35:44 -0800
Subject: [PATCH 29/39] add assert in the case stream capture is turned on if creating a plan

---
 include/matx/transforms/fft/fft_cuda.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/include/matx/transforms/fft/fft_cuda.h b/include/matx/transforms/fft/fft_cuda.h
index f4793b07..f8ea52a4 100644
--- a/include/matx/transforms/fft/fft_cuda.h
+++ b/include/matx/transforms/fft/fft_cuda.h
@@ -333,7 +333,11 @@ template class matxCUDAFFTPlan_t
   virtual ~matxCUDAFFTPlan_t()
   {
     if (this->workspace_ != nullptr) {
       // Pass the default stream until we allow user-deletable caches
+      /* Albert -- Temporarily remove this free as we likely don't want to
+         insert the dependence on cudaStreamDefault */
+#if 0
       matxFree(workspace_, cudaStreamDefault);
+#endif
       this->workspace_ = nullptr;
     }

@@ -424,6 +428,11 @@ matxCUDAFFTPlan1D_t(OutTensorType &o, const InTensorType &i, cudaStream_t stream
     }
   }

+  // Albert - Assert that the stream is NOT in capture mode (cuFFT plan creation is unsafe during stream capture)
+  cudaStreamCaptureStatus status;
+  cudaStreamIsCapturing(stream, &status);
+  MATX_ASSERT(status == cudaStreamCaptureStatusNone, matxCufftError);
+
   size_t workspaceSize;
   cufftCreate(&this->plan_);
   [[maybe_unused]] cufftResult error;

From bbb9aaef05d6fb7374f3822679974fef62436d31 Mon Sep 17 00:00:00 2001
From: Cedric Augonnet 
Date: Mon, 24 Mar 2025 11:08:50 +0100
Subject: [PATCH 30/39] Apps using matx with stf should get these flags

---
 CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 135127a6..66f862d7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -136,7 +136,9 @@ rapids_cpm_cccl(
 )
 
 target_link_libraries(matx INTERFACE CCCL::CCCL CCCL::cudax)
-set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --extended-lambda)
+target_compile_options(matx INTERFACE
+  $<$:--expt-relaxed-constexpr --extended-lambda>
+)
 
 # Set flags for compiling tests faster (only for nvcc)
 if (NOT CMAKE_CUDA_COMPILER_ID STREQUAL 
"Clang") From e13c9b657662a7f50afeface9e3a50e749934c82 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Mon, 24 Mar 2025 11:09:56 +0100 Subject: [PATCH 31/39] fix constructor --- include/matx/core/tensor_impl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/matx/core/tensor_impl.h b/include/matx/core/tensor_impl.h index e03894e2..ba0f9bf9 100644 --- a/include/matx/core/tensor_impl.h +++ b/include/matx/core/tensor_impl.h @@ -262,8 +262,9 @@ MATX_IGNORE_WARNING_POP_GCC template ::type>, bool> = true> __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ tensor_impl_t(T *const ldata, DescriptorType &&desc, std::optional *stf_ldata) - : ldata_(ldata), desc_{std::forward(desc)}, stf_ldata_(stf_ldata) + : desc_{std::forward(desc)}, stf_ldata_(stf_ldata) { + data_.ldata_ = ldata; } /** From 7244399f66ffaaeb695d01e5fb05aa89de721cb6 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 21 Apr 2025 14:29:23 -0700 Subject: [PATCH 32/39] fix typo/bug --- examples/simple_stf_test.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/simple_stf_test.cu b/examples/simple_stf_test.cu index 8947f191..1df260ed 100644 --- a/examples/simple_stf_test.cu +++ b/examples/simple_stf_test.cu @@ -151,6 +151,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventDestroy(start); cudaEventDestroy(stop); cudaStreamDestroy(stream); - CUDA_CHECK_LAST_ERROR(); + //CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } From 66f68506b2ab846ea11ef207ba260c9165c03c04 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 21 Apr 2025 15:09:24 -0700 Subject: [PATCH 33/39] update to example code to fix compile error --- examples/fft_conv.cu | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/fft_conv.cu b/examples/fft_conv.cu index 0fbba06b..d7882ca3 100644 --- a/examples/fft_conv.cu +++ b/examples/fft_conv.cu @@ -80,13 +80,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char 
**argv) std::cout << "Using CUDA executor\n"; #endif -#ifdef USE_STF - stfExecutor exec{}; - auto ctx = exec.getCtx(); -#else - cudaExecutor exec{}; -#endif - + index_t signal_size = 1ULL << 16; index_t filter_size = 16; index_t batches = 8; @@ -99,8 +93,14 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); - cudaExecutor exec{stream}; +#ifdef USE_STF + stfExecutor exec{stream}; + auto ctx = exec.getCtx(); +#else + cudaExecutor exec{stream}; +#endif + // Create time domain buffers auto sig_time = make_tensor({batches, signal_size}); auto filt_time = make_tensor({batches, filter_size}); @@ -135,6 +135,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) #else cudaEventRecord(start, stream); #endif + } (sig_freq = fft(sig_time, filtered_size)).run(exec); (filt_freq = fft(filt_time, filtered_size)).run(exec); From 89e2a4323abf5bb667271396d83ee80c874e9d2c Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 21 Apr 2025 15:10:28 -0700 Subject: [PATCH 34/39] update to example code to fix compile error --- examples/spectrogram.cu | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/examples/spectrogram.cu b/examples/spectrogram.cu index f9bab85c..60404a7e 100644 --- a/examples/spectrogram.cu +++ b/examples/spectrogram.cu @@ -65,17 +65,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventCreate(&start); cudaEventCreate(&stop); -#ifdef USE_STF - std::cout << "Using STF executor\n"; -#else - std::cout << "Using CUDA executor\n"; -#endif - -#ifdef USE_STF - stfExecutor exec{stream}; -#else cudaExecutor exec{stream}; -#endif float fs = 10000; constexpr index_t N = 100000; @@ -155,10 +145,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) exec.stop_timer(); exec.sync(); - #ifdef USE_STF - auto ctx = exec.getCtx(); - ctx.finalize(); - #endif time_ms = exec.get_time_ms(); printf("Spectrogram Time Without 
Graphs = %.2fus per iteration\n",

From 973886b3747f0ece33151a1d195716387ef0335f Mon Sep 17 00:00:00 2001
From: Albert Sidelnik 
Date: Mon, 21 Apr 2025 15:23:03 -0700
Subject: [PATCH 35/39] update test script for radar code

---
 examples/simple_radar_pipeline_test.sh | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/examples/simple_radar_pipeline_test.sh b/examples/simple_radar_pipeline_test.sh
index 12fd39a1..880e9ed0 100755
--- a/examples/simple_radar_pipeline_test.sh
+++ b/examples/simple_radar_pipeline_test.sh
@@ -1,24 +1,25 @@
 #!/bin/bash
 
 COMMAND_PARAMETERS=("../build/examples/simple_radar_pipeline" "../build/examples/simple_radar_pipeline_stf")
-STREAM_PARAMETERS=("--numStreams 1" "--numStreams 2" "--numStreams 4" "--numStreams 8")
+STREAM_PARAMETERS=("1" "2" "4" "5" "6")
 #SAMPLE_PARAMETERS=("--numSamples 1000" "--numSamples 4500" "--numSamples 9000")
-SAMPLE_PARAMETERS=("--numSamples 1000" "--numSamples 2000" "--numSamples 4500" "--numSamples 6000")
+SAMPLE_PARAMETERS=("1000" "2000" "4500" "6000" "10000")
 
 NUM_RUNS=5
 
 OUTPUT_FILE="radar_heatmap_data.csv"
 # Initialize the CSV file
-echo "Command,NumStreams,NumSamples,AverageGbps" > $OUTPUT_FILE
 
 # Loop through the parameters
-for command_param in "${COMMAND_PARAMETERS[@]}"; do
-    for str_param in "${STREAM_PARAMETERS[@]}"; do
+for str_param in "${STREAM_PARAMETERS[@]}"; do
+    OUTPUT_FILE="radar_heatmap_data_${str_param}.csv"
+    echo "Command,NumStreams,NumSamples,AverageGbps" > $OUTPUT_FILE
+    for command_param in "${COMMAND_PARAMETERS[@]}"; do
         for sam_param in "${SAMPLE_PARAMETERS[@]}"; do
             TOTAL_GBPS=0
             for i in $(seq 1 $NUM_RUNS); do
                 #echo "Iteration $i with parameter $command_param $str_param $sam_param"
-                OUTPUT=$($command_param $str_param $sam_param)
+                OUTPUT=$($command_param --numStreams $str_param --numSamples $sam_param)
                 GBPS=$(echo 
Gbps' | awk '{print $1}') # Add the extracted value to the total @@ -34,7 +35,8 @@ for command_param in "${COMMAND_PARAMETERS[@]}"; do AVERAGE_GBPS=$(echo "$TOTAL_GBPS / $NUM_RUNS" | bc -l) #echo "$command_param $str_param $sam_param verage Gbps over $NUM_RUNS runs: $AVERAGE_GBPS" # Append the results to the CSV file - echo "$command_param,$(echo $str_param | awk '{print $2}'),$(echo $sam_param | awk '{print $2}'),$AVERAGE_GBPS" >> $OUTPUT_FILE + echo "$command_param,$(echo $str_param | awk '{print $0}'),$(echo $sam_param | awk '{print $0}'),$AVERAGE_GBPS" + echo "$command_param,$(echo $str_param | awk '{print $0}'),$(echo $sam_param | awk '{print $0}'),$AVERAGE_GBPS" >> $OUTPUT_FILE else echo "No runs were performed." fi From 92885e70ffc8b67396d0fc8f9ad7199cf47ebc62 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 21 Apr 2025 15:24:54 -0700 Subject: [PATCH 36/39] temp fix to the allocator dtor --- include/matx/core/allocator.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/matx/core/allocator.h b/include/matx/core/allocator.h index ecb29e08..db841999 100644 --- a/include/matx/core/allocator.h +++ b/include/matx/core/allocator.h @@ -236,9 +236,13 @@ struct MemTracker { } ~MemTracker() { +#if 0 while (allocationMap.size()) { deallocate(allocationMap.begin()->first); } +#else + std::cout << "TODO: Fix me allocator.h\n"; +#endif } }; From 8607840dfc57873219d71a1a5960f35e13e00102 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 21 Apr 2025 15:26:44 -0700 Subject: [PATCH 37/39] remove warning to work with latest stf --- CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 66f862d7..5fac5862 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -168,8 +168,7 @@ if (NOT ${IS_NVCPP} GREATER -1) -Wmisleading-indentation -Wduplicated-cond -Wduplicated-branches - -Wlogical-op - -Wnull-dereference) + -Wlogical-op) endif() endif() From 14e0985b568331e27f855b2703f6da2b02142726 Mon 
Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 21 Apr 2025 16:46:28 -0700 Subject: [PATCH 38/39] replace logical token with token --- include/matx/core/tensor_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/matx/core/tensor_impl.h b/include/matx/core/tensor_impl.h index ba0f9bf9..7a89046c 100644 --- a/include/matx/core/tensor_impl.h +++ b/include/matx/core/tensor_impl.h @@ -1265,7 +1265,7 @@ MATX_IGNORE_WARNING_POP_GCC data_place place = getDataPlace(Data()); #endif - *stf_ldata_ = ctx.logical_token(); + *stf_ldata_ = ctx.token(); stf_ldata_->value().set_symbol(this->str()); } } From 92e04d56f9a586c16c1203470f88531fabd0e75e Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 21 Apr 2025 16:47:40 -0700 Subject: [PATCH 39/39] update version to use cccl from main --- cmake/versions.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/versions.json b/cmake/versions.json index 0c44e05d..82cae91a 100644 --- a/cmake/versions.json +++ b/cmake/versions.json @@ -4,7 +4,7 @@ "version": "2.8.0", "git_shallow": false, "git_url": "https://github.com/NVIDIA/cccl.git", - "git_tag": "36e27f7c1074010eefaab64d387ff6663569e065" + "git_tag": "9f254d5d8d071d67c6a6ad107b0bbd578b3d072d" }, "nvbench" : { "version" : "0.0",