From d6dc01d0e768f7535ea7e9b20050fa159c8695d6 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 4 Nov 2024 15:22:13 -0800 Subject: [PATCH 01/39] Update build config to pull CUDASTF --- CMakeLists.txt | 6 ++++-- cmake/versions.json | 5 ++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7dd94acd..aa6514b7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,16 +119,18 @@ if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.5) message(FATAL_ERROR "MatX requires CUDA 11.5 or higher. Please update before using.") endif() +set(CCCL_ENABLE_UNSTABLE ON) message(STATUS "Finding CCCL...") rapids_cpm_cccl( BUILD_EXPORT_SET matx-exports INSTALL_EXPORT_SET matx-exports ) -target_link_libraries(matx INTERFACE CCCL::CCCL) +target_link_libraries(matx INTERFACE CCCL::CCCL CCCL::cudax) +set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --extended-lambda) # Set flags for compiling tests faster -set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --threads 0 -ftemplate-backtrace-limit=0) +set(MATX_CUDA_FLAGS ${MATX_CUDA_FLAGS} --threads 0 -ftemplate-backtrace-limit=0) # Hack because CMake doesn't have short circult evaluation if (NOT CMAKE_BUILD_TYPE OR "${CMAKE_BUILD_TYPE}" STREQUAL "Debug") diff --git a/cmake/versions.json b/cmake/versions.json index b7d1a877..e6eba5b3 100644 --- a/cmake/versions.json +++ b/cmake/versions.json @@ -1,10 +1,9 @@ { "packages": { "CCCL": { - "version": "2.7.0-rc2", - "git_shallow": true, + "version": "2.8.0", "git_url": "https://github.com/NVIDIA/cccl.git", - "git_tag": "10e915ac7b79a1ab3b9d7a795c621b47b122f513" + "git_tag": "cb1fce5e1cb7362940bd7e74ab8fbf01942b6264" } } } From 245b20f03a5522ec6ee2809a0dd9be23235c2386 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 4 Nov 2024 15:42:12 -0800 Subject: [PATCH 02/39] remove const expr --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index aa6514b7..7e9283e6 100644 
--- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -127,7 +127,7 @@ rapids_cpm_cccl( ) target_link_libraries(matx INTERFACE CCCL::CCCL CCCL::cudax) -set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --extended-lambda) +set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --extended-lambda) # Set flags for compiling tests faster set(MATX_CUDA_FLAGS ${MATX_CUDA_FLAGS} --threads 0 -ftemplate-backtrace-limit=0) From 9b35ec844ff5ee59b48cb2605328a9fa44f362ef Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 4 Nov 2024 22:58:34 -0800 Subject: [PATCH 03/39] Updates to get basic cudastf functionality working with matx --- CMakeLists.txt | 2 +- examples/simple_radar_pipeline.cu | 14 ++++ examples/simple_radar_pipeline.h | 6 ++ include/matx/core/operator_utils.h | 2 +- include/matx/core/tensor.h | 51 ++++++++++--- include/matx/core/tensor_impl.h | 87 +++++++++++++++++++++-- include/matx/core/type_utils.h | 10 +++ include/matx/core/utils.h | 24 +++++++ include/matx/generators/generator1d.h | 3 + include/matx/operators/binary_operators.h | 12 ++++ include/matx/operators/constval.h | 3 + include/matx/operators/conv.h | 75 +++++++++++++++---- include/matx/operators/fft.h | 74 +++++++++++++++---- include/matx/operators/matmul.h | 40 +++++++++-- include/matx/operators/matvec.h | 28 +++++++- include/matx/operators/permute.h | 8 +++ include/matx/operators/set.h | 14 ++++ include/matx/operators/sum.h | 28 +++++++- include/matx/operators/unary_operators.h | 6 ++ 19 files changed, 438 insertions(+), 49 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e9283e6..aa6514b7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -127,7 +127,7 @@ rapids_cpm_cccl( ) target_link_libraries(matx INTERFACE CCCL::CCCL CCCL::cudax) -set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --extended-lambda) +set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --extended-lambda) # Set flags for compiling tests faster set(MATX_CUDA_FLAGS ${MATX_CUDA_FLAGS} --threads 0 
-ftemplate-backtrace-limit=0) diff --git a/examples/simple_radar_pipeline.cu b/examples/simple_radar_pipeline.cu index a89cb1b1..10190d5a 100644 --- a/examples/simple_radar_pipeline.cu +++ b/examples/simple_radar_pipeline.cu @@ -76,6 +76,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) pipelines[s]->sync(); } + /* Get STF context handle */ + auto ctx = pipelines[0]->exec.getCtx(); + MATX_NVTX_START_RANGE("Pipeline Test", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 2) printf("Running test...\n"); @@ -114,7 +117,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) for (uint32_t i = 0; i < iterations; i++) { for (int s = 0; s < num_streams; s++) { if (i == 1) { +#if 0 cudaEventRecord(starts[s], streams[s]); +#else + cudaEventRecord(starts[s], ctx.task_fence()); +#endif } if (ENABLE_GRAPHS) { @@ -127,9 +134,16 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } for (int s = 0; s < num_streams; s++) { +#if 0 cudaEventRecord(stops[s], streams[s]); +#else + cudaEventRecord(stops[s], ctx.task_fence()); +#endif pipelines[s]->sync(); } + + ctx.finalize(); + MATX_NVTX_END_RANGE(2) MATX_NVTX_START_RANGE("Pipeline Results", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 3) diff --git a/examples/simple_radar_pipeline.h b/examples/simple_radar_pipeline.h index 0f52ccf2..87e19f94 100644 --- a/examples/simple_radar_pipeline.h +++ b/examples/simple_radar_pipeline.h @@ -465,5 +465,11 @@ class RadarPipeline { tensor_t cfarMaskView; cudaStream_t stream; +#if 0 cudaExecutor exec; +#else +public: + stfExecutor exec; +#endif + }; diff --git a/include/matx/core/operator_utils.h b/include/matx/core/operator_utils.h index 6f1f1f18..1a97a579 100644 --- a/include/matx/core/operator_utils.h +++ b/include/matx/core/operator_utils.h @@ -120,7 +120,7 @@ namespace matx { __MATX_HOST__ __MATX_INLINE__ void AllocateTempTensor(TensorType &tensor, Executor &&ex, ShapeType &&shape, typename TensorType::value_type **ptr) { const auto ttl_size = 
std::accumulate(shape.begin(), shape.end(), static_cast(1), std::multiplies()) * sizeof(typename TensorType::value_type); - if constexpr (is_cuda_executor_v) { + if constexpr (is_cuda_executor_v || is_stf_executor_v) { matxAlloc((void**)ptr, ttl_size, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); make_tensor(tensor, *ptr, shape); } diff --git a/include/matx/core/tensor.h b/include/matx/core/tensor.h index 82b7b62b..27a933c2 100644 --- a/include/matx/core/tensor.h +++ b/include/matx/core/tensor.h @@ -91,6 +91,7 @@ class tensor_t : public detail::tensor_impl_t { using stride_container = typename Desc::stride_container; using desc_type = Desc; ///< Descriptor type trait using self_type = tensor_t; + using stf_logicaldata_type = typename cuda::experimental::stf::logical_data>; /** * @brief Construct a new 0-D tensor t object @@ -107,7 +108,7 @@ class tensor_t : public detail::tensor_impl_t { * @param rhs Object to copy from */ __MATX_HOST__ tensor_t(tensor_t const &rhs) noexcept - : detail::tensor_impl_t{rhs.ldata_, rhs.desc_}, storage_(rhs.storage_) + : detail::tensor_impl_t{rhs.ldata_, rhs.desc_, rhs.stf_ldata_}, storage_(rhs.storage_) { } /** @@ -116,7 +117,7 @@ class tensor_t : public detail::tensor_impl_t { * @param rhs Object to move from */ __MATX_HOST__ tensor_t(tensor_t &&rhs) noexcept - : detail::tensor_impl_t{rhs.ldata_, std::move(rhs.desc_)}, storage_(std::move(rhs.storage_)) + : detail::tensor_impl_t{rhs.ldata_, std::move(rhs.desc_), rhs.stf_ldata_}, storage_(std::move(rhs.storage_)) { } @@ -134,6 +135,7 @@ class tensor_t : public detail::tensor_impl_t { this->ldata_ = rhs.ldata_; storage_ = rhs.storage_; this->desc_ = rhs.desc_; + this->stf_ldata_ = rhs.stf_ldata_; } /** Swaps two tensors @@ -152,6 +154,7 @@ class tensor_t : public detail::tensor_impl_t { std::swap(lhs.ldata_, rhs.ldata_); swap(lhs.storage_, rhs.storage_); swap(lhs.desc_, rhs.desc_); + std::swap(lhs.stf_ldata_, rhs.stf_ldata_); } __MATX_INLINE__ ~tensor_t() = default; @@ -177,6 +180,16 @@ 
class tensor_t : public detail::tensor_impl_t { this->SetLocalData(storage_.data()); } + template ::type> && is_matx_descriptor_v::type>, bool> = true> + tensor_t(S2 &&s, D2 &&desc, T* ldata, std::optional *stf_ldata_) : + detail::tensor_impl_t{std::forward(desc)}, + storage_{std::forward(s)} + { + this->stf_ldata_ = stf_ldata_; + this->SetLocalData(storage_.data()); + } + /** * @brief Construct a new tensor t object. Used to copy an existing storage object for proper reference counting * @@ -185,13 +198,28 @@ class tensor_t : public detail::tensor_impl_t { * @param ldata */ template - tensor_t(Storage s, D2 &&desc, T* ldata) : + tensor_t(Storage s, D2 &&desc, T* ldata, std::optional *stf_ldata) : detail::tensor_impl_t{std::forward(desc)}, storage_{std::move(s)} { + this->stf_ldata_ = stf_ldata; this->SetLocalData(ldata); } + /** + * @brief Construct a new tensor t object. Used to copy an existing storage object for proper reference counting + * + * @param s + * @param desc + * @param ldata + */ + template + tensor_t(Storage s, D2 &&desc, T* ldata) : + detail::tensor_impl_t{std::forward(desc)}, + storage_{std::move(s)} + { + this->SetLocalData(ldata); + } /** * Constructor for a rank-1 and above tensor. 
@@ -646,7 +674,7 @@ class tensor_t : public detail::tensor_impl_t { // Copy descriptor and call ctor with shape Desc new_desc{std::forward(shape)}; - return tensor_t{storage_, std::move(new_desc), this->ldata_}; + return tensor_t{storage_, std::move(new_desc), this->ldata_, this->stf_ldata_}; } /** @@ -705,7 +733,7 @@ class tensor_t : public detail::tensor_impl_t { "To get a reshaped view the tensor must be compact"); DefaultDescriptor desc{std::move(tshape)}; - return tensor_t{storage_, std::move(desc), this->ldata_}; + return tensor_t{storage_, std::move(desc), this->ldata_, this->stf_ldata_}; } /** @@ -788,7 +816,10 @@ class tensor_t : public detail::tensor_impl_t { // Copy descriptor and call ctor with shape Desc new_desc{this->desc_.Shape(), std::move(strides)}; + printf("******* FIX ASAP: tensor.h: 819 ****************\n"); return tensor_t{storage_, std::move(new_desc), data}; + /* Albert : TODO. We're creating a logical data of with the type being complex and not real. Seems to be incompatible with the approach we're taking in creating an associated logical data. 
Will be fixed once we move to void_interace */ + //return tensor_t{storage_, std::move(new_desc), data, this->stf_ldata_}; } /** @@ -831,7 +862,7 @@ class tensor_t : public detail::tensor_impl_t { } Desc new_desc{this->desc_.Shape(), std::move(strides)}; - return tensor_t{storage_, std::move(new_desc), data}; + return tensor_t{storage_, std::move(new_desc), data, this->stf_ldata_}; } /** @@ -854,7 +885,7 @@ class tensor_t : public detail::tensor_impl_t { MATX_NVTX_START("", matx::MATX_NVTX_LOG_API) auto new_desc = this->PermuteImpl(dims); - return tensor_t{storage_, std::move(new_desc), this->ldata_}; + return tensor_t{storage_, std::move(new_desc), this->ldata_, this->stf_ldata_}; } @@ -1030,7 +1061,7 @@ class tensor_t : public detail::tensor_impl_t { OverlapView(const cuda::std::array &windows, const cuda::std::array &strides) const { auto new_desc = this->template OverlapViewImpl(windows, strides); - return tensor_t{storage_, std::move(new_desc), this->ldata_}; + return tensor_t{storage_, std::move(new_desc), this->ldata_, this->stf_ldata_}; } /** @@ -1064,7 +1095,7 @@ class tensor_t : public detail::tensor_impl_t { MATX_NVTX_START("", matx::MATX_NVTX_LOG_API) auto new_desc = this->template CloneImpl(clones); - return tensor_t{storage_, std::move(new_desc), this->ldata_}; + return tensor_t{storage_, std::move(new_desc), this->ldata_, this->stf_ldata_}; } template @@ -1362,7 +1393,7 @@ class tensor_t : public detail::tensor_impl_t { [[maybe_unused]] StrideType strides) const { auto [new_desc, data] = this->template SliceImpl(firsts, ends, strides); - return tensor_t{storage_, std::move(new_desc), data}; + return tensor_t{storage_, std::move(new_desc), data, this->stf_ldata_}; } template diff --git a/include/matx/core/tensor_impl.h b/include/matx/core/tensor_impl.h index 77dd0dbd..d655ae74 100644 --- a/include/matx/core/tensor_impl.h +++ b/include/matx/core/tensor_impl.h @@ -41,9 +41,11 @@ #include "matx/core/type_utils.h" #include "matx/core/tensor_utils.h" 
#include "matx/operators/set.h" +#include "matx/core/utils.h" //#include "matx_exec_kernel.h" #include "iterator.h" #include "matx/core/make_tensor.h" +#include namespace matx { @@ -80,6 +82,10 @@ class tensor_impl_t { using matxoplvalue = bool; using self_type = tensor_impl_t; + /* TODO: convert to void_interface for the logical data.*/ + using stf_logicaldata_type = typename cuda::experimental::stf::logical_data>; + //using stf_logicaldata_type = typename cudastf::logical_data; + // Type specifier for signaling this is a matx operation using matxop = bool; @@ -110,13 +116,15 @@ class tensor_impl_t { swap(lhs.ldata_, rhs.ldata_); swap(lhs.desc_, rhs.desc_); + swap(lhs.stf_ldata_, rhs.stf_ldata_); } /** * Constructor for a rank-0 tensor (scalar). */ tensor_impl_t() { - + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } /** @@ -127,6 +135,8 @@ class tensor_impl_t { */ tensor_impl_t(T *const data) : ldata_(data) { static_assert(RANK == 0, "tensor_impl_t with single pointer parameter must be a rank 0 tensor"); + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } /** @@ -139,6 +149,8 @@ class tensor_impl_t { std::enable_if_t> && !is_matx_descriptor_v>, bool> = true> __MATX_INLINE__ tensor_impl_t(ShapeType &&shape) : desc_(std::forward(shape)) { + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } /** @@ -153,6 +165,8 @@ class tensor_impl_t { __MATX_INLINE__ tensor_impl_t(ShapeType &&shape, StrideType &&strides) : desc_(std::forward(shape), std::forward(strides)) { + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } /** @@ -193,6 +207,8 @@ class tensor_impl_t { StrideType &&strides) : ldata_(ldata), desc_(std::forward(shape), std::forward(strides)) { + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } @@ -216,9 +232,18 @@ IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") DescriptorType &&desc) : ldata_(ldata), desc_{std::forward(desc)} { + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } 
IGNORE_WARNING_POP_GCC + template ::type>, bool> = true> + __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ tensor_impl_t(T *const ldata, + DescriptorType &&desc, std::optional *stf_ldata) + : ldata_(ldata), desc_{std::forward(desc)}, stf_ldata_(stf_ldata) + { + } + /** * Constructor for creating a view with only a descriptor * @@ -233,12 +258,15 @@ IGNORE_WARNING_POP_GCC __MATX_INLINE__ tensor_impl_t(DescriptorType &&desc) : desc_{std::forward(desc)} { + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } __MATX_HOST__ void Shallow(const self_type &rhs) noexcept { ldata_ = rhs.ldata_; desc_ = rhs.desc_; + stf_ldata_ = rhs.stf_ldata_; } /** @@ -255,6 +283,7 @@ IGNORE_WARNING_POP_GCC { ldata_ = op.ldata_; desc_ = op.desc_; + stf_ldata_ = op.stf_ldata; } @@ -770,7 +799,7 @@ IGNORE_WARNING_POP_GCC auto new_desc = CloneImpl(clones); - return tensor_impl_t{this->ldata_, std::move(new_desc)}; + return tensor_impl_t{this->ldata_, std::move(new_desc), this->stf_ldata_}; } __MATX_INLINE__ auto PermuteImpl(const cuda::std::array &dims) const @@ -800,7 +829,7 @@ IGNORE_WARNING_POP_GCC __MATX_INLINE__ auto Permute(const cuda::std::array &dims) const { auto new_desc = PermuteImpl(dims); - return tensor_impl_t{this->ldata_, std::move(new_desc)}; + return tensor_impl_t{this->ldata_, std::move(new_desc), this->stf_ldata_}; } template @@ -845,7 +874,7 @@ IGNORE_WARNING_POP_GCC OverlapView(const cuda::std::array &windows, const cuda::std::array &strides) const { auto new_desc = OverlapViewImpl(windows, strides); - return tensor_impl_t{this->ldata_, std::move(new_desc)}; + return tensor_impl_t{this->ldata_, std::move(new_desc), this->stf_ldata_}; } template @@ -1080,9 +1109,56 @@ IGNORE_WARNING_POP_GCC ldata_ = data; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm) const noexcept + { + auto &ld = stf_ldata_->value(); + + using namespace cuda::experimental::stf; +#if 0 + data_place place = getDataPlace(Data()); +#endif + + if (perm == 0) { + 
task.add_deps(ld.write()); + } + else if (perm == 1) { + task.add_deps(ld.read()); + } + else if (perm == 2) { + task.add_deps(ld.rw()); + } + else { + std::cout << "abort ...\n"; + } + } + template __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, [[maybe_unused]] Executor &&ex) const noexcept { + using namespace cuda::experimental::stf; + data_place place; + + if constexpr (is_cuda_executor_v) { + return; + } + else if constexpr (!is_cuda_executor_v) { + + /* Don't create a new logical data for a tensor if it alread had one created previously */ + if (stf_ldata_ && stf_ldata_->has_value()) { return; } + + auto ctx = ex.getCtx(); +#if 0 + // Determine the type of memory that was allocated ie. host/managed/etc + place = getDataPlace(Data()); +#endif + + /* TODO: Use void_interface for logical data */ + *stf_ldata_ = ctx.logical_data(cuda::experimental::stf::make_slice(Data(), 1)); + //*stf_ldata_ = ctx.logical_data(cudastf::shape_of()); + stf_ldata_->value().set_write_back(false); + stf_ldata_->value().set_symbol(this->str()); + } } template @@ -1094,6 +1170,9 @@ IGNORE_WARNING_POP_GCC protected: T *ldata_; Desc desc_; + + public: + mutable std::optional *stf_ldata_; }; } diff --git a/include/matx/core/type_utils.h b/include/matx/core/type_utils.h index 3c66019b..dcf11b6e 100644 --- a/include/matx/core/type_utils.h +++ b/include/matx/core/type_utils.h @@ -44,6 +44,7 @@ #include "matx/core/half.h" #include "matx/core/half_complex.h" #include "matx/executors/cuda.h" +#include "matx/executors/stf.h" /** * Defines type traits for host and device compilers. 
This file should be includable by @@ -287,6 +288,7 @@ inline constexpr bool is_settable_xform_v = std::conjunction_v struct is_executor : std::false_type {}; template <> struct is_executor : std::true_type {}; +template <> struct is_executor : std::true_type {}; template struct is_executor> : std::true_type {}; } @@ -307,6 +309,11 @@ template struct is_cuda_executor : std::false_type {}; template<> struct is_cuda_executor : std::true_type {}; } +namespace detail { +template struct is_stf_executor : std::false_type {}; +template<> struct is_stf_executor : std::true_type {}; +} + /** * @brief Determine if a type is a device executor * @@ -315,6 +322,9 @@ template<> struct is_cuda_executor : std::true_type {}; template inline constexpr bool is_cuda_executor_v = detail::is_cuda_executor::type>::value; +template +inline constexpr bool is_stf_executor_v = detail::is_stf_executor::type>::value; + namespace detail { template struct is_host_executor : std::false_type {}; template struct is_host_executor> : std::true_type {}; diff --git a/include/matx/core/utils.h b/include/matx/core/utils.h index 50bda7bb..0c90a4e9 100644 --- a/include/matx/core/utils.h +++ b/include/matx/core/utils.h @@ -45,6 +45,30 @@ namespace matx { namespace detail { + +#if 0 +__MATX_INLINE__ cuda::experimental::stf::data_place getDataPlace(void *ptr) { + using namespace cuda::experimental::stf; + auto kind = GetPointerKind(ptr); + switch (kind) { + case MATX_MANAGED_MEMORY: + return data_place::managed; + case MATX_HOST_MEMORY: + case MATX_HOST_MALLOC_MEMORY: + return data_place::host; + case MATX_DEVICE_MEMORY: + case MATX_ASYNC_DEVICE_MEMORY: + return data_place::current_device(); + case MATX_INVALID_MEMORY: + //std::cout << "Data kind is invalid: assuming managed memory\n"; + return data_place::managed; + //return data_place::invalid; + default: + return data_place::invalid; + } +} +#endif + __MATX_INLINE__ int GetDeviceAttr(cudaDeviceAttr attr) { int val; int dev; diff --git 
a/include/matx/generators/generator1d.h b/include/matx/generators/generator1d.h index cdeca628..1d684d62 100644 --- a/include/matx/generators/generator1d.h +++ b/include/matx/generators/generator1d.h @@ -54,6 +54,9 @@ namespace matx return f_(pp_get(indices...)); } + template + __MATX_INLINE__ void apply_dep_to_task([[maybe_unused]] Task &&task, [[maybe_unused]] int perm=1) const noexcept { } + constexpr inline __MATX_HOST__ __MATX_DEVICE__ auto Size(int dim) const { return *(s_.begin() + dim); diff --git a/include/matx/operators/binary_operators.h b/include/matx/operators/binary_operators.h index 49635ccd..af49b034 100644 --- a/include/matx/operators/binary_operators.h +++ b/include/matx/operators/binary_operators.h @@ -146,6 +146,18 @@ namespace matx return detail::matx_max(size1,size2); } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, Perm perm) const noexcept + { + if constexpr (is_matx_op()) { + in1_.apply_dep_to_task(std::forward(task), perm); + } + + if constexpr (is_matx_op()) { + in2_.apply_dep_to_task(std::forward(task), perm); + } + } + template __MATX_INLINE__ void PreRun(ShapeType &&shape, Executor &&ex) const noexcept { diff --git a/include/matx/operators/constval.h b/include/matx/operators/constval.h index 3df28aff..32eec317 100644 --- a/include/matx/operators/constval.h +++ b/include/matx/operators/constval.h @@ -55,6 +55,9 @@ namespace matx __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ T operator()(Is...) 
const { return v_; }; + template + __MATX_INLINE__ void apply_dep_to_task([[maybe_unused]] Task &&task, [[maybe_unused]] int perm) const noexcept { } + constexpr inline __MATX_HOST__ __MATX_DEVICE__ auto Size(int dim) const { if constexpr (!is_noshape_v) { return *(s_.begin() + dim); diff --git a/include/matx/operators/conv.h b/include/matx/operators/conv.h index 62f3a2f8..905573e2 100644 --- a/include/matx/operators/conv.h +++ b/include/matx/operators/conv.h @@ -149,16 +149,45 @@ namespace matx return out_dims_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept + { + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + template void Exec(Out &&out, Executor &&ex) const { MATX_ASSERT_STR(!(is_host_executor_v && method_ == MATX_C_METHOD_DIRECT), matxNotSupported, "direct conv1d() only supports the CUDA executor currently"); MATX_STATIC_ASSERT_STR((Rank() == cuda::std::tuple_element_t<0, remove_cvref_t>::Rank()), matxInvalidParameter, "conv1d: inputs and outputs must have same rank to use conv1d with axis parameter"); - if constexpr (!std::is_same_v) { - conv1d_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, mode_, method_, ex); + if constexpr (!is_cuda_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("conv_task"); + + auto output = cuda::std::get<0>(out); + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + b_.apply_dep_to_task(tsk, 1); + + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + if constexpr (!std::is_same_v) { + conv1d_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, mode_, method_, exec); + } + else { + conv1d_impl(cuda::std::get<0>(out), a_, b_, mode_, method_, exec); + } + }; } - else { - conv1d_impl(cuda::std::get<0>(out), a_, b_, mode_, method_, ex); + else if constexpr (is_cuda_executor_v) { + if constexpr (!std::is_same_v) { + conv1d_impl(permute(cuda::std::get<0>(out), perm_), a_, 
b_, mode_, method_, ex); + } + else { + conv1d_impl(cuda::std::get<0>(out), a_, b_, mode_, method_, ex); + } } } @@ -343,14 +372,36 @@ namespace detail { template void Exec(Out &&out, Executor &&ex) const { - static_assert(is_cuda_executor_v, "conv2d() only supports the CUDA executor currently"); - - if constexpr (!std::is_same_v) { - conv2d_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, mode_, ex.getStream()); - } - else { - conv2d_impl(cuda::std::get<0>(out), a_, b_, mode_, ex.getStream()); - } + //static_assert(is_cuda_executor_v, "conv2d() only supports the CUDA executor currently"); + if constexpr (!is_cuda_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("conv_task"); + + auto output = cuda::std::get<0>(out); + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + b_.apply_dep_to_task(tsk, 1); + + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + if constexpr (!std::is_same_v) { + conv2d_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, mode_, exec.getStream()); + } + else { + conv2d_impl(cuda::std::get<0>(out), a_, b_, mode_, exec.getStream()); + } + }; + } + else if constexpr (is_cuda_executor_v) { + if constexpr (!std::is_same_v) { + conv2d_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, mode_, ex.getStream()); + } + else { + conv2d_impl(cuda::std::get<0>(out), a_, b_, mode_, ex.getStream()); + } + } } template diff --git a/include/matx/operators/fft.h b/include/matx/operators/fft.h index 242169d1..76e57c3d 100644 --- a/include/matx/operators/fft.h +++ b/include/matx/operators/fft.h @@ -145,24 +145,68 @@ namespace matx return out_dims_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept + { + /* Scenario where the matvec() operator is on the RHS and op has already + run previously. 
So we make tmp_out have a read permission as it will be read from */ + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + template void Exec(Out &&out, Executor &&ex) const { - if constexpr (std::is_same_v) { - if constexpr (std::is_same_v) { - fft_impl(cuda::std::get<0>(out), a_, fft_size_, norm_, ex); - } - else { - ifft_impl(cuda::std::get<0>(out), a_, fft_size_, norm_, ex); - } - } - else { - if constexpr (std::is_same_v) { - fft_impl(permute(cuda::std::get<0>(out), perm_), permute(a_, perm_), fft_size_, norm_, ex); + // stfexecutor case + if constexpr (!is_cuda_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("fft_task_no_perm"); + + auto output = cuda::std::get<0>(out); + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + fft_impl(output, a_, fft_size_, norm_, exec); + } + else { + ifft_impl(output, a_, fft_size_, norm_, exec); + } + } + else { + if constexpr (std::is_same_v) { + fft_impl(permute(output, perm_), permute(a_, perm_), fft_size_, norm_, exec); + } + else { + ifft_impl(permute(output, perm_), permute(a_, perm_), fft_size_, norm_, exec); + } + } + }; + } + else if constexpr (is_cuda_executor_v) { + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + fft_impl(cuda::std::get<0>(out), a_, fft_size_, norm_, ex); + } + else { + ifft_impl(cuda::std::get<0>(out), a_, fft_size_, norm_, ex); + } + } + else { + if constexpr (std::is_same_v) { + fft_impl(permute(cuda::std::get<0>(out), perm_), permute(a_, perm_), fft_size_, norm_, ex); + } + else { + ifft_impl(permute(cuda::std::get<0>(out), perm_), permute(a_, perm_), fft_size_, norm_, ex); + } + } } else { - ifft_impl(permute(cuda::std::get<0>(out), perm_), permute(a_, perm_), fft_size_, norm_, ex); - } - } + printf("SHOULDNT BE HERE..\n"); + } } template @@ 
-494,4 +538,4 @@ namespace matx return detail::FFT2Op(a, perm, detail::ifft_t{}, norm); } -} \ No newline at end of file +} diff --git a/include/matx/operators/matmul.h b/include/matx/operators/matmul.h index add2a556..bd000a87 100644 --- a/include/matx/operators/matmul.h +++ b/include/matx/operators/matmul.h @@ -111,13 +111,45 @@ namespace matx return out_dims_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept + { + /* Scenario where the matmul() operator is on the RHS and op has already run + previously. So we make tmp_out have a read permission as it will be read from */ + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + template void Exec(Out &&out, Executor &&ex) const { - if constexpr (!std::is_same_v) { - matmul_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, ex, alpha_, beta_); + // stfexecutor case + if constexpr (!is_cuda_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("matmul"); + + auto output = cuda::std::get<0>(out); + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + b_.apply_dep_to_task(tsk, 1); + + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + if constexpr (!std::is_same_v) { + matmul_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, exec, alpha_, beta_); + } + else { + matmul_impl(cuda::std::get<0>(out), a_, b_, exec, alpha_, beta_); + } + }; } - else { - matmul_impl(cuda::std::get<0>(out), a_, b_, ex, alpha_, beta_); + else if constexpr (is_cuda_executor_v) { + if constexpr (!std::is_same_v) { + matmul_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, ex, alpha_, beta_); + } + else { + matmul_impl(cuda::std::get<0>(out), a_, b_, ex, alpha_, beta_); + } } } diff --git a/include/matx/operators/matvec.h b/include/matx/operators/matvec.h index 222402c2..3568c2e0 100644 --- a/include/matx/operators/matvec.h +++ b/include/matx/operators/matvec.h @@ -89,9 +89,35 @@ namespace 
matx return out_dims_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept + { + /* Scenario where the matvec() operator is on the RHS and op has already run + previously. So we make tmp_out have a read permission as it will be read from */ + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + template void Exec(Out &&out, Executor &&ex) const{ - matvec_impl(cuda::std::get<0>(out), a_, b_, ex, alpha_, beta_); + // stfexecutor case + auto output = cuda::std::get<0>(out); + if constexpr (!is_cuda_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("matvec_task"); + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + b_.apply_dep_to_task(tsk, 1); + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + matvec_impl(output, a_, b_, exec, alpha_, beta_); + }; + } + // cudaExecutor case + else if constexpr (is_cuda_executor_v) { + matvec_impl(output, a_, b_, ex, alpha_, beta_); + } } template diff --git a/include/matx/operators/permute.h b/include/matx/operators/permute.h index bf366e24..2f6fc156 100644 --- a/include/matx/operators/permute.h +++ b/include/matx/operators/permute.h @@ -123,6 +123,14 @@ IGNORE_WARNING_POP_GCC return op_.Size(dims_[dim]); } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept + { + if constexpr (is_matx_op()) { + op_.apply_dep_to_task(std::forward(task), perm); + } + } + template __MATX_INLINE__ void PreRun(ShapeType &&shape, Executor &&ex) const noexcept { diff --git a/include/matx/operators/set.h b/include/matx/operators/set.h index 89885706..3d3e7843 100644 --- a/include/matx/operators/set.h +++ b/include/matx/operators/set.h @@ -147,6 +147,20 @@ class set : public BaseOp> { return res; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, [[maybe_unused]] int perm=0) const noexcept + { + // LHS + if constexpr (is_matx_op()) { + 
out_.apply_dep_to_task(std::forward(task), 0); + } + // RHS + if constexpr (is_matx_op()) { + op_.apply_dep_to_task(std::forward(task), 1); + } + } + + template __MATX_INLINE__ void PreRun(ShapeType &&shape, Executor &&ex) const noexcept { diff --git a/include/matx/operators/sum.h b/include/matx/operators/sum.h index d95b2595..60e86248 100644 --- a/include/matx/operators/sum.h +++ b/include/matx/operators/sum.h @@ -72,9 +72,35 @@ namespace detail { return tmp_out_(indices...); } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept { + /* Albert -- Scenario where the sum() operator is on the RHS and sum has already + run previously. So we make tmp_out have a read permission as it will be read from */ + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + template void Exec(Out &&out, Executor &&ex) const { - sum_impl(cuda::std::get<0>(out), a_, ex); + auto output = cuda::std::get<0>(out); + // stfexecutor case + if constexpr (!is_cuda_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("sum_task"); + + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + sum_impl(output, a_, exec); + }; + } + // cudaExecutor case + else if constexpr (is_cuda_executor_v) { + sum_impl(output, a_, ex); + } } static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() diff --git a/include/matx/operators/unary_operators.h b/include/matx/operators/unary_operators.h index e0a3d9c7..b7633916 100644 --- a/include/matx/operators/unary_operators.h +++ b/include/matx/operators/unary_operators.h @@ -103,6 +103,12 @@ namespace matx return size_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, Perm perm) const noexcept + { + in1_.apply_dep_to_task(std::forward(task), perm); + } + template __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, 
[[maybe_unused]] Executor &&ex) const noexcept { From 7d298d405f4937af5657f4d79079924b18b15b38 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 4 Nov 2024 23:13:38 -0800 Subject: [PATCH 04/39] move to void_interface --- include/matx/core/tensor.h | 7 ++----- include/matx/core/tensor_impl.h | 11 +++-------- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/include/matx/core/tensor.h b/include/matx/core/tensor.h index 27a933c2..7eb5f8ae 100644 --- a/include/matx/core/tensor.h +++ b/include/matx/core/tensor.h @@ -91,7 +91,7 @@ class tensor_t : public detail::tensor_impl_t { using stride_container = typename Desc::stride_container; using desc_type = Desc; ///< Descriptor type trait using self_type = tensor_t; - using stf_logicaldata_type = typename cuda::experimental::stf::logical_data>; + using stf_logicaldata_type = typename cuda::experimental::stf::logical_data; /** * @brief Construct a new 0-D tensor t object @@ -816,10 +816,7 @@ class tensor_t : public detail::tensor_impl_t { // Copy descriptor and call ctor with shape Desc new_desc{this->desc_.Shape(), std::move(strides)}; - printf("******* FIX ASAP: tensor.h: 819 ****************\n"); - return tensor_t{storage_, std::move(new_desc), data}; - /* Albert : TODO. We're creating a logical data of with the type being complex and not real. Seems to be incompatible with the approach we're taking in creating an associated logical data. 
Will be fixed once we move to void_interace */ - //return tensor_t{storage_, std::move(new_desc), data, this->stf_ldata_}; + return tensor_t{storage_, std::move(new_desc), data, this->stf_ldata_}; } /** diff --git a/include/matx/core/tensor_impl.h b/include/matx/core/tensor_impl.h index d655ae74..f837f9cc 100644 --- a/include/matx/core/tensor_impl.h +++ b/include/matx/core/tensor_impl.h @@ -81,10 +81,7 @@ class tensor_impl_t { using stride_type = typename Desc::stride_type; using matxoplvalue = bool; using self_type = tensor_impl_t; - - /* TODO: convert to void_interface for the logical data.*/ - using stf_logicaldata_type = typename cuda::experimental::stf::logical_data>; - //using stf_logicaldata_type = typename cudastf::logical_data; + using stf_logicaldata_type = typename cuda::experimental::stf::logical_data; // Type specifier for signaling this is a matx operation using matxop = bool; @@ -1153,10 +1150,8 @@ IGNORE_WARNING_POP_GCC place = getDataPlace(Data()); #endif - /* TODO: Use void_interface for logical data */ - *stf_ldata_ = ctx.logical_data(cuda::experimental::stf::make_slice(Data(), 1)); - //*stf_ldata_ = ctx.logical_data(cudastf::shape_of()); - stf_ldata_->value().set_write_back(false); + *stf_ldata_ = ctx.logical_data(cuda::experimental::stf::void_interface()); + //stf_ldata_->value().set_write_back(false); stf_ldata_->value().set_symbol(this->str()); } } From 154b3f99c04f9ee7d7dc807e3e4030c751132d33 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Tue, 5 Nov 2024 00:03:50 -0800 Subject: [PATCH 05/39] add stf executor --- include/matx/executors/stf.h | 250 +++++++++++++++++++++++++++++++++++ 1 file changed, 250 insertions(+) create mode 100644 include/matx/executors/stf.h diff --git a/include/matx/executors/stf.h b/include/matx/executors/stf.h new file mode 100644 index 00000000..00e8cef3 --- /dev/null +++ b/include/matx/executors/stf.h @@ -0,0 +1,250 @@ +//////////////////////////////////////////////////////////////////////////////// +// BSD 
3-Clause License +// +// Copyright (c) 2021, NVIDIA Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +///////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "matx/core/defines.h" +#include "matx/executors/host.h" +#include "matx/executors/kernel.h" + +#include + +//using namespace cuda::experimental::stf; +//using namespace cudastf; + +namespace matx +{ + + +/* Albert - Needed to declare this here to avoid compile error. 
*/ +template constexpr bool is_matx_op_lvalue(); +template constexpr bool is_matx_set_op(); + + class stfExecutor { + public: + using matx_cuda = bool; // signal this is a GPU executor + using matx_executor = bool; ///< Type trait indicating this is an executor +//using cudastf::async_resources_handle; +//using cudastf::stream_ctx; + /** + * @brief Construct a new stfExecutor with a stream + * + * @param stream CUDA stream + */ + stfExecutor(cudaStream_t stream) : stream_(stream) { + cuda::experimental::stf::async_resources_handle handle; + ctx_ = cuda::experimental::stf::stream_ctx(stream, handle); + } + stfExecutor(int stream) : stream_(reinterpret_cast(stream)) { + cuda::experimental::stf::async_resources_handle handle; + ctx_ = cuda::experimental::stf::stream_ctx(reinterpret_cast(stream), handle); + } + + /** + * @brief Construct a new stfExecutor object using the default stream + * + */ + stfExecutor() : stream_(0) { + ctx_ = cuda::experimental::stf::stream_ctx(); + } + + /** + * @brief Returns stream associated with executor + */ + auto getStream() const { return stream_; } + + /** + * @brief Get CUDASTF Ctx + * + */ + auto &getCtx() const noexcept { return ctx_; } + + /** + * @brief Synchronize the cuda executor's stream + * + */ + void sync() { cudaStreamSynchronize(stream_); } + + /** + * Execute an operator on a device + * + * @tparam Op Operator type + * @param op value + **/ + template + void Exec(Op &op) const { + //std::cout << "exec on stfexecutor -- start\n"; +#ifdef __CUDACC__ + dim3 threads, blocks; + + auto ctx = getCtx(); + // Parameters passed by value in CUDA are limited to 4096B. If the user exceeds this, we + // need to error out and have them break up the statement + MATX_STATIC_ASSERT((sizeof(op) + sizeof(index_t) * Op::Rank()) <= CUDA_MAX_VAL_PARAM, + "Parameter buffer to device is limited to 4096B. 
Please break up your operator statement into multiple executions to limit the size of the parameters"); + + if constexpr (Op::Rank() == 0) { + threads = 1; + blocks = 1; + if constexpr (is_matx_op_lvalue() || is_matx_set_op()) { + auto tsk = ctx.task(); + tsk.set_symbol(op.str()); + op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps + //std::cout << "Start launch task. Rank = " << Op::Rank() << " " << op.str() << '\n'; + tsk->*[&](cudaStream_t s) { + detail::matxOpT0Kernel<<>>(op); + }; + //std::cout << "End launch task.\n"; + } + else { + //std::cout << " RANK 0 not on LHS operator = " << op.str() << '\n'; + detail::matxOpT0Kernel<<>>(op); + } + } + else { + cuda::std::array sizes; + for (int i = 0; i < Op::Rank(); i++) { + sizes[i] = op.Size(i); + } + + bool stride = detail::get_grid_dims(blocks, threads, sizes, 256); + + if constexpr (Op::Rank() == 1) { + if constexpr (is_matx_op_lvalue() || is_matx_set_op()) { + auto tsk = ctx.task(); + tsk.set_symbol(op.str()); + op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps + //std::cout << "Start launch task. Rank = " << Op::Rank() << '\n'; + tsk->*[&](cudaStream_t s) { + detail::matxOpT1Kernel<<>>(op, sizes[0]); + }; + //std::cout << "End launch task.\n"; + } + else { + //std::cout << " RANK 1 not on LHS operator = " << op.str() << '\n'; + detail::matxOpT1Kernel<<>>(op, sizes[0]); + } + } + else if constexpr (Op::Rank() == 2) { + if constexpr (is_matx_op_lvalue() || is_matx_set_op()) { + auto tsk = ctx.task(); + tsk.set_symbol(op.str()); + op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps + //std::cout << "About to launch task. 
Rank = " << Op::Rank() << '\n'; + tsk->*[&](cudaStream_t s) { + if(stride) { + detail::matxOpT2StrideKernel<<>>(op, sizes[0], sizes[1]); + } else { + detail::matxOpT2Kernel<<>>(op, sizes[0], sizes[1]); + } + }; + } + else { + //std::cout << " not on LHS operator = " << op.str() << '\n'; + if(stride) { + detail::matxOpT2StrideKernel<<>>(op, sizes[0], sizes[1]); + } else { + detail::matxOpT2Kernel<<>>(op, sizes[0], sizes[1]); + } + } + } + else if constexpr (Op::Rank() == 3) { + if constexpr (is_matx_op_lvalue() || is_matx_set_op()) { + auto tsk = ctx.task(); + tsk.set_symbol(op.str()); + op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps + tsk->*[&](cudaStream_t s) { + if(stride) { + detail::matxOpT3StrideKernel<<>>(op, sizes[0], sizes[1], sizes[2]); + } else { + detail::matxOpT3Kernel<<>>(op, sizes[0], sizes[1], sizes[2]); + } + }; + } + else { + if(stride) { + detail::matxOpT3StrideKernel<<>>(op, sizes[0], sizes[1], sizes[2]); + } else { + detail::matxOpT3Kernel<<>>(op, sizes[0], sizes[1], sizes[2]); + } + } + } + else if constexpr (Op::Rank() == 4) { + if constexpr (is_matx_op_lvalue() || is_matx_set_op()) { + auto tsk = ctx.task(); + op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps + tsk.set_symbol(op.str())->*[&](cudaStream_t s) { + if(stride) { + detail::matxOpT4StrideKernel<<>>(op, sizes[0], sizes[1], sizes[2], sizes[3]); + } else { + detail::matxOpT4Kernel<<>>(op, sizes[0], sizes[1], sizes[2], sizes[3]); + } + }; + } + else { + if(stride) { + detail::matxOpT4StrideKernel<<>>(op, sizes[0], sizes[1], sizes[2], sizes[3]); + } else { + detail::matxOpT4Kernel<<>>(op, sizes[0], sizes[1], sizes[2], sizes[3]); + } + } + } + else { + if constexpr (is_matx_op_lvalue() || is_matx_set_op()) { + auto tsk = ctx.task(); + tsk.set_symbol(op.str()); + op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps + //std::cout << "About to launch task. 
Rank = " << Op::Rank() << '\n'; + + tsk->*[&](cudaStream_t s) { + index_t dims = std::accumulate(std::begin(sizes) + 1, std::end(sizes), 1, std::multiplies()); + detail::matxOpTDKernel<<>>(op, sizes, dims); + }; + } + else { + index_t dims = std::accumulate(std::begin(sizes) + 1, std::end(sizes), 1, std::multiplies()); + detail::matxOpTDKernel<<>>(op, sizes, dims); + } + } + } +#else + MATX_ASSERT_STR(false, matxInvalidParameter, "Cannot call device executor using host compiler"); +#endif + //std::cout << "exec on stfexecutor -- stop\n"; + } + + private: + cudaStream_t stream_; + cuda::experimental::stf::context ctx_; + }; + +}; From c8ef988d1e1987f320c4be256dae36621b7c02f2 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Tue, 5 Nov 2024 12:39:16 -0800 Subject: [PATCH 06/39] support for cgsolve operator and a few examples --- examples/cgsolve.cu | 9 ++++++++ examples/fft_conv.cu | 33 ++++++++++++++++++++++++++---- include/matx/executors/stf.h | 3 +++ include/matx/operators/all.h | 28 ++++++++++++++++++++++++- include/matx/operators/cast.h | 8 ++++++++ include/matx/operators/cgsolve.h | 4 ++-- include/matx/operators/fft.h | 2 +- include/matx/operators/max.h | 28 ++++++++++++++++++++++++- include/matx/transforms/cgsolve.h | 34 ++++++++++++++++++++----------- 9 files changed, 128 insertions(+), 21 deletions(-) diff --git a/examples/cgsolve.cu b/examples/cgsolve.cu index 9027d634..f77d8393 100644 --- a/examples/cgsolve.cu +++ b/examples/cgsolve.cu @@ -54,7 +54,12 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) auto norm = make_tensor({BATCH}); auto maxn = make_tensor({}); +#if 0 cudaExecutor exec{}; +#else + stfExecutor exec{}; + auto ctx = exec.getCtx(); +#endif // Simple Poisson matrix for(int b = 0; b < BATCH; b++) { @@ -83,6 +88,10 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) (maxn = matx::max(sqrt(norm))).run(exec); exec.sync(); +#if 1 + ctx.finalize(); +#endif + // example-end sync-test-1 printf ("max l2 norm: 
%f\n", (float)sqrt(maxn())); diff --git a/examples/fft_conv.cu b/examples/fft_conv.cu index 7f871119..0fa27343 100644 --- a/examples/fft_conv.cu +++ b/examples/fft_conv.cu @@ -73,7 +73,12 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) { MATX_ENTER_HANDLER(); using complex = cuda::std::complex; +#if 0 cudaExecutor exec{}; +#else + stfExecutor exec{}; + auto ctx = exec.getCtx(); +#endif index_t signal_size = 1ULL << 16; index_t filter_size = 16; @@ -117,7 +122,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // Perform the FFT in-place on both signal and filter for (int i = 0; i < iterations; i++) { if (i == 1) { +#if 0 cudaEventRecord(start, stream); +#else + cudaEventRecord(start, ctx.task_fence()); +#endif } (sig_freq = fft(sig_time, filtered_size)).run(exec); (filt_freq = fft(filt_time, filtered_size)).run(exec); @@ -129,18 +138,30 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } +#if 0 cudaEventRecord(stop, stream); +#else + cudaEventRecord(stop, ctx.task_fence()); +#endif exec.sync(); cudaEventElapsedTime(&separate_ms, start, stop); for (int i = 0; i < iterations; i++) { if (i == 1) { - cudaEventRecord(start, stream); +#if 0 + cudaEventRecord(start, stream); +#else + cudaEventRecord(start, ctx.task_fence()); +#endif } (sig_freq = ifft(fft(sig_time, filtered_size) * fft(filt_time, filtered_size))).run(exec); } - + +#if 0 cudaEventRecord(stop, stream); +#else + cudaEventRecord(stop, ctx.task_fence()); +#endif exec.sync(); cudaEventElapsedTime(&fused_ms, start, stop); @@ -153,7 +174,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) (time_out = conv1d(sig_time, filt1, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); exec.sync(); - + +#if 1 + ctx.finalize(); +#endif + // Compare signals for (index_t b = 0; b < batches; b++) { for (index_t i = 0; i < filtered_size; i++) { @@ -172,4 +197,4 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) 
CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); -} \ No newline at end of file +} diff --git a/include/matx/executors/stf.h b/include/matx/executors/stf.h index 00e8cef3..ad9e970b 100644 --- a/include/matx/executors/stf.h +++ b/include/matx/executors/stf.h @@ -63,10 +63,12 @@ template constexpr bool is_matx_set_op(); stfExecutor(cudaStream_t stream) : stream_(stream) { cuda::experimental::stf::async_resources_handle handle; ctx_ = cuda::experimental::stf::stream_ctx(stream, handle); + //ctx_ = cuda::experimental::stf::graph_ctx(stream, handle); } stfExecutor(int stream) : stream_(reinterpret_cast(stream)) { cuda::experimental::stf::async_resources_handle handle; ctx_ = cuda::experimental::stf::stream_ctx(reinterpret_cast(stream), handle); + //ctx_ = cuda::experimental::stf::graph_ctx(reinterpret_cast(stream), handle); } /** @@ -75,6 +77,7 @@ template constexpr bool is_matx_set_op(); */ stfExecutor() : stream_(0) { ctx_ = cuda::experimental::stf::stream_ctx(); + //ctx_ = cuda::experimental::stf::graph_ctx(); } /** diff --git a/include/matx/operators/all.h b/include/matx/operators/all.h index f56f8336..20769677 100644 --- a/include/matx/operators/all.h +++ b/include/matx/operators/all.h @@ -72,9 +72,35 @@ namespace detail { return tmp_out_(indices...); }; + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept { + /* Albert -- Scenario where the all() operator is on the RHS and sum has already + run previously. 
So we make tmp_out have a read permission as it will be read from */ + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + template void Exec(Out &&out, Executor &&ex) const { - all_impl(cuda::std::get<0>(out), a_, ex); + auto output = cuda::std::get<0>(out); + // stfexecutor case + if constexpr (!is_cuda_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("all_task"); + + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + all_impl(output, a_, exec); + }; + } + // cudaExecutor case + else if constexpr (is_cuda_executor_v) { + all_impl(output, a_, ex); + } } static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() diff --git a/include/matx/operators/cast.h b/include/matx/operators/cast.h index 83a1aa4d..23b6d539 100644 --- a/include/matx/operators/cast.h +++ b/include/matx/operators/cast.h @@ -83,6 +83,14 @@ namespace matx return static_cast(op_(indices...)); } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept + { + if constexpr (is_matx_op()) { + op_.apply_dep_to_task(std::forward(task), perm); + } + } + template __MATX_INLINE__ void PreRun(ShapeType &&shape, Executor &&ex) const noexcept { diff --git a/include/matx/operators/cgsolve.h b/include/matx/operators/cgsolve.h index 21eab1bc..69628795 100644 --- a/include/matx/operators/cgsolve.h +++ b/include/matx/operators/cgsolve.h @@ -90,8 +90,8 @@ namespace matx template void Exec(Out &&out, Executor &&ex) const{ - static_assert(is_cuda_executor_v, "cgsolve() only supports the CUDA executor currently"); - cgsolve_impl(cuda::std::get<0>(out), a_, b_, tol_, max_iters_, ex.getStream()); + //static_assert(is_cuda_executor_v, "cgsolve() only supports the CUDA executor currently"); + cgsolve_impl(cuda::std::get<0>(out), a_, b_, ex, tol_, max_iters_, ex.getStream()); } template diff --git 
a/include/matx/operators/fft.h b/include/matx/operators/fft.h index 76e57c3d..cf86acdc 100644 --- a/include/matx/operators/fft.h +++ b/include/matx/operators/fft.h @@ -146,7 +146,7 @@ namespace matx } template - __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept + __MATX_INLINE__ void apply_dep_to_task(Task &&task, [[maybe_unused]] int perm=1) const noexcept { /* Scenario where the matvec() operator is on the RHS and op has already run previously. So we make tmp_out have a read permission as it will be read from */ diff --git a/include/matx/operators/max.h b/include/matx/operators/max.h index f7d9da77..562c2a4f 100644 --- a/include/matx/operators/max.h +++ b/include/matx/operators/max.h @@ -72,9 +72,35 @@ namespace detail { return tmp_out_(indices...); } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept { + /* Albert -- Scenario where the all() operator is on the RHS and sum has already + run previously. So we make tmp_out have a read permission as it will be read from */ + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + template void Exec(Out &&out, Executor &&ex) const { - max_impl(cuda::std::get<0>(out), a_, ex); + auto output = cuda::std::get<0>(out); + // stfexecutor case + if constexpr (!is_cuda_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("max_task"); + + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + max_impl(output, a_, exec); + }; + } + // cudaExecutor case + else if constexpr (is_cuda_executor_v) { + max_impl(output, a_, ex); + } } static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() diff --git a/include/matx/transforms/cgsolve.h b/include/matx/transforms/cgsolve.h index 0f389ca9..f08d8f0c 100644 --- a/include/matx/transforms/cgsolve.h +++ b/include/matx/transforms/cgsolve.h @@ -58,8 
+58,8 @@ namespace matx * cuda Stream to execute on * */ - template - __MATX_INLINE__ void cgsolve_impl(XType X, AType A, BType B, double tol=1e-6, int max_iters=4, cudaStream_t stream=0) + template + __MATX_INLINE__ void cgsolve_impl(XType X, AType A, BType B, Executor &&exec, double tol=1e-6, int max_iters=4, cudaStream_t stream=0) { using value_type = typename XType::value_type; const int VRANK = XType::Rank(); @@ -120,15 +120,19 @@ namespace matx auto pApc = clone(pAp, clone_shape); // A*X - (Ap = matvec(A, X)).run(stream); + //(Ap = matvec(A, X)).run(stream); + (Ap = matvec(A, X)).run(exec); // r0 = B - A*X // p = r0 - (p = r0 = B - Ap).run(stream); + //(p = r0 = B - Ap).run(stream); + (p = r0 = B - Ap).run(exec); - (r0r0 = sum(r0*r0)).run(stream); + //(r0r0 = sum(r0*r0)).run(stream); + (r0r0 = sum(r0*r0)).run(exec); if(tol>0.0f) { - (converged = matx::all(as_int(sqrt(r0r0) < tol))).run(stream); + //(converged = matx::all(as_int(sqrt(r0r0) < tol))).run(stream); + (converged = matx::all(as_int(sqrt(r0r0) < tol))).run(exec); cudaEventRecord(event, stream); cudaStreamWaitEvent(d2h, event); @@ -137,10 +141,12 @@ namespace matx int i; for (i = 0 ; i < max_iters; i++) { // Ap = matvec(A, p) - (Ap = matvec(A, p)).run(stream); + //(Ap = matvec(A, p)).run(stream); + (Ap = matvec(A, p)).run(exec); // pAp = dot(p,Ap) - (pAp = sum(p*Ap)).run(stream); + //(pAp = sum(p*Ap)).run(stream); + (pAp = sum(p*Ap)).run(exec); // if pAp is zero then we have exactly numerically converged. // However, this is batched so we may iterate more. 
Iterating @@ -152,10 +158,12 @@ namespace matx auto updateOp = ( r1 = r0 - (r0r0c/pApc) * Ap, X = X + (r0r0c/pApc) * p); - (IF( pApc != value_type(0), updateOp)).run(stream); + //(IF( pApc != value_type(0), updateOp)).run(stream); + (IF( pApc != value_type(0), updateOp)).run(exec); // r1r1 = dot(r1, r1) - (r1r1 = sum(r1*r1)).run(stream); + //(r1r1 = sum(r1*r1)).run(stream); + (r1r1 = sum(r1*r1)).run(exec); if(tol>0.0f) { // copy convergence criteria to host. @@ -168,7 +176,8 @@ namespace matx break; } - (converged = matx::all(as_int(sqrt(r1r1) < tol))).run(stream); + //(converged = matx::all(as_int(sqrt(r1r1) < tol))).run(stream); + (converged = matx::all(as_int(sqrt(r1r1) < tol))).run(exec); cudaEventRecord(event, stream); cudaStreamWaitEvent(d2h, event); @@ -176,7 +185,8 @@ namespace matx // p = r1 + b * p auto updateP = ( p = r1 + (r1r1c/r0r0c) * p); - (IF( pApc != value_type(0), updateP)).run(stream); + //(IF( pApc != value_type(0), updateP)).run(stream); + (IF( pApc != value_type(0), updateP)).run(exec); // Advance residual swap(r0r0, r1r1); From 52b18c9e638bc93966692259ae10c9e1efa7fd78 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Tue, 3 Dec 2024 13:17:32 -0800 Subject: [PATCH 07/39] make the sync() that is part of stfexecutor call ctx.task_fence() --- include/matx/executors/stf.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/include/matx/executors/stf.h b/include/matx/executors/stf.h index ad9e970b..85eb0e1c 100644 --- a/include/matx/executors/stf.h +++ b/include/matx/executors/stf.h @@ -53,8 +53,7 @@ template constexpr bool is_matx_set_op(); public: using matx_cuda = bool; // signal this is a GPU executor using matx_executor = bool; ///< Type trait indicating this is an executor -//using cudastf::async_resources_handle; -//using cudastf::stream_ctx; + /** * @brief Construct a new stfExecutor with a stream * @@ -92,10 +91,10 @@ template constexpr bool is_matx_set_op(); auto &getCtx() const noexcept { return ctx_; } /** - * 
@brief Synchronize the cuda executor's stream + * @brief Synchronize the STF executor's stream * */ - void sync() { cudaStreamSynchronize(stream_); } + void sync() { ctx.task_fence(); } /** * Execute an operator on a device @@ -122,11 +121,9 @@ template constexpr bool is_matx_set_op(); auto tsk = ctx.task(); tsk.set_symbol(op.str()); op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps - //std::cout << "Start launch task. Rank = " << Op::Rank() << " " << op.str() << '\n'; tsk->*[&](cudaStream_t s) { detail::matxOpT0Kernel<<>>(op); }; - //std::cout << "End launch task.\n"; } else { //std::cout << " RANK 0 not on LHS operator = " << op.str() << '\n'; From d726b10df03881fa1542f00801c7527cb8447222 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Tue, 3 Dec 2024 13:20:43 -0800 Subject: [PATCH 08/39] fix typo --- include/matx/executors/stf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/matx/executors/stf.h b/include/matx/executors/stf.h index 85eb0e1c..64399230 100644 --- a/include/matx/executors/stf.h +++ b/include/matx/executors/stf.h @@ -94,7 +94,7 @@ template constexpr bool is_matx_set_op(); * @brief Synchronize the STF executor's stream * */ - void sync() { ctx.task_fence(); } + void sync() { ctx_.task_fence(); } /** * Execute an operator on a device From 5e7576c07a6c99fbe49f93b5c3c20a3805430b56 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Tue, 17 Dec 2024 10:43:47 -0800 Subject: [PATCH 09/39] Added test case --- examples/CMakeLists.txt | 1 + examples/simple_stf_test.cu | 192 ++++++++++++++++++++++++++++++++++++ 2 files changed, 193 insertions(+) create mode 100644 examples/simple_stf_test.cu diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d56f099b..563b6fc3 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -11,6 +11,7 @@ set(examples mvdr_beamformer pwelch resample_poly_bench + simple_stf_test spectrogram spectrogram_graph spherical_harmonics diff 
--git a/examples/simple_stf_test.cu b/examples/simple_stf_test.cu new file mode 100644 index 00000000..7061b715 --- /dev/null +++ b/examples/simple_stf_test.cu @@ -0,0 +1,192 @@ +//////////////////////////////////////////////////////////////////////////////// +// BSD 3-Clause License +// +// Copyright (c) 2021, NVIDIA Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+///////////////////////////////////////////////////////////////////////////////// + +#include "matx.h" +#include +#include +#include +#include + +using namespace matx; + +/** + * MatX uses C++ expression templates to build arithmetic expressions that compile into a lazily-evaluated + * type for executing on the device. Currently, nvcc cannot see certain optimizations + * when building the expression tree that would be obvious by looking at the code. Specifically any code reusing + * the same tensor multiple times appears to the compiler as separate tensors, and it may issue multiple load + * instructions. While caching helps, this can have a slight performance impact when compared to native CUDA + * kernels. To work around this problem, complex expressions can be placed in a custom operator by adding some + * boilerplate code around the original expression. This custom operator can then be used either alone or inside + * other arithmetic expressions, and only a single load is issues for each tensor. + * + * This example uses the Black-Scholes equtation to demonstrate the two ways to implement the equation in MatX, and + * shows the performance difference. 
+ */ + +/* Custom operator */ +template +class BlackScholes : public BaseOp> { +private: + O out_; + I1 V_, S_, K_, r_, T_; + +public: + BlackScholes(O out, I1 K, I1 V, I1 S, I1 r, I1 T) + : out_(out), K_(K), V_(V), S_(S), r_(r), T_(T) {} + + __device__ inline void operator()(index_t idx) + { + auto V = V_(idx); + auto K = K_(idx); + auto S = S_(idx); + auto T = T_(idx); + auto r = r_(idx); + + auto VsqrtT = V * sqrt(T); + auto d1 = (log(S / K) + (r + 0.5 * V * V) * T) / VsqrtT ; + auto d2 = d1 - VsqrtT; + auto cdf_d1 = normcdf(d1); + auto cdf_d2 = normcdf(d2); + auto expRT = exp(-1 * r * T); + + out_(idx) = S * cdf_d1 - K * expRT * cdf_d2; + } + + __host__ __device__ inline index_t Size(uint32_t i) const { return out_.Size(i); } + static inline constexpr __host__ __device__ int32_t Rank() { return O::Rank(); } +}; + +/* Arithmetic expression */ +template +void compute_black_scholes_matx(tensor_t& K, + tensor_t& S, + tensor_t& V, + tensor_t& r, + tensor_t& T, + tensor_t& output, +#if 0 + cudaExecutor& exec) +#else + stfExecutor& exec) +#endif +{ + auto VsqrtT = V * sqrt(T); + auto d1 = (log(S / K) + (r + 0.5 * V * V) * T) / VsqrtT ; + auto d2 = d1 - VsqrtT; + auto cdf_d1 = normcdf(d1); + auto cdf_d2 = normcdf(d2); + auto expRT = exp(-1 * r * T); + (output = S * cdf_d1 - K * expRT * cdf_d2).run(exec); + +#if 0 + (output = K + sqrt(S+V)).run(exec); +#endif + + //std::cout << "Output : " << std::endl; + //print(output); +} + +int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) +{ + MATX_ENTER_HANDLER(); + + using dtype = double; + +#if 0 + index_t input_size = 100000000; +#else + index_t input_size = 10000; +#endif + constexpr uint32_t num_iterations = 1000; + float time_ms; + + tensor_t K_tensor{{input_size}}; + tensor_t S_tensor{{input_size}}; + tensor_t V_tensor{{input_size}}; + tensor_t r_tensor{{input_size}}; + tensor_t T_tensor{{input_size}}; + tensor_t output_tensor{{input_size}}; + + cudaStream_t stream; + cudaStreamCreate(&stream); +#if 0 + 
cudaExecutor exec{stream}; +#else + stfExecutor exec{stream}; + auto ctx = exec.getCtx(); +#endif + + /* Albert --- initilizing input .. */ + for (int i = 0; i < input_size; i++) { + K_tensor(i) = dtype(i+1); + S_tensor(i) = dtype(i+i+1); + V_tensor(i) = dtype(i+i+i+1); + r_tensor(i) = dtype(i+i+i+i+1); + T_tensor(i) = dtype(i+i+i+i+i+1); + } + +//print(V_tensor); + + compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + +#if 0 + cudaEventRecord(start, stream); +#else + cudaEventRecord(start, ctx.task_fence()); +#endif + // Time non-operator version + for (uint32_t i = 0; i < num_iterations; i++) { + compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec); + } +#if 0 + cudaEventRecord(stop, stream); +#else + cudaEventRecord(stop, ctx.task_fence()); +#endif + exec.sync(); +#if 1 + ctx.finalize(); + cudaEventElapsedTime(&time_ms, start, stop); +#endif + + printf("Time without custom operator = %.2fms per iteration\n", + time_ms / num_iterations); + cudaEventDestroy(start); + cudaEventDestroy(stop); + cudaStreamDestroy(stream); + CUDA_CHECK_LAST_ERROR(); + MATX_EXIT_HANDLER(); +} From 13736996468b9fe44856d46b8b4781e725ae22fb Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Tue, 17 Dec 2024 11:52:19 -0800 Subject: [PATCH 10/39] Fixes to the sync --- examples/simple_stf_test.cu | 44 ++++-------------------------------- include/matx/executors/stf.h | 10 ++++++-- 2 files changed, 12 insertions(+), 42 deletions(-) diff --git a/examples/simple_stf_test.cu b/examples/simple_stf_test.cu index 7061b715..6c048f92 100644 --- a/examples/simple_stf_test.cu +++ b/examples/simple_stf_test.cu @@ -52,39 +52,6 @@ using namespace matx; * shows the performance difference. 
*/ -/* Custom operator */ -template -class BlackScholes : public BaseOp> { -private: - O out_; - I1 V_, S_, K_, r_, T_; - -public: - BlackScholes(O out, I1 K, I1 V, I1 S, I1 r, I1 T) - : out_(out), K_(K), V_(V), S_(S), r_(r), T_(T) {} - - __device__ inline void operator()(index_t idx) - { - auto V = V_(idx); - auto K = K_(idx); - auto S = S_(idx); - auto T = T_(idx); - auto r = r_(idx); - - auto VsqrtT = V * sqrt(T); - auto d1 = (log(S / K) + (r + 0.5 * V * V) * T) / VsqrtT ; - auto d2 = d1 - VsqrtT; - auto cdf_d1 = normcdf(d1); - auto cdf_d2 = normcdf(d2); - auto expRT = exp(-1 * r * T); - - out_(idx) = S * cdf_d1 - K * expRT * cdf_d2; - } - - __host__ __device__ inline index_t Size(uint32_t i) const { return out_.Size(i); } - static inline constexpr __host__ __device__ int32_t Rank() { return O::Rank(); } -}; - /* Arithmetic expression */ template void compute_black_scholes_matx(tensor_t& K, @@ -106,11 +73,6 @@ void compute_black_scholes_matx(tensor_t& K, auto cdf_d2 = normcdf(d2); auto expRT = exp(-1 * r * T); (output = S * cdf_d1 - K * expRT * cdf_d2).run(exec); - -#if 0 - (output = K + sqrt(S+V)).run(exec); -#endif - //std::cout << "Output : " << std::endl; //print(output); } @@ -156,7 +118,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) //print(V_tensor); - compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec); + //compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec); cudaEvent_t start, stop; cudaEventCreate(&start); @@ -182,7 +144,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventElapsedTime(&time_ms, start, stop); #endif - printf("Time without custom operator = %.2fms per iteration\n", + // printf("Output tensor :\n"); + // print(output_tensor); + printf("Time without custom operator = %fms per iteration\n", time_ms / num_iterations); cudaEventDestroy(start); cudaEventDestroy(stop); diff --git 
a/include/matx/executors/stf.h b/include/matx/executors/stf.h index 64399230..83650fc7 100644 --- a/include/matx/executors/stf.h +++ b/include/matx/executors/stf.h @@ -79,6 +79,13 @@ template constexpr bool is_matx_set_op(); //ctx_ = cuda::experimental::stf::graph_ctx(); } +#if 0 + ~stfExecutor() { + std::cout << "About to call ctx.finalize\n"; + ctx_.finalize(); + } +#endif + /** * @brief Returns stream associated with executor */ @@ -94,7 +101,7 @@ template constexpr bool is_matx_set_op(); * @brief Synchronize the STF executor's stream * */ - void sync() { ctx_.task_fence(); } + void sync() { cudaStreamSynchronize(ctx_.task_fence()); } /** * Execute an operator on a device @@ -104,7 +111,6 @@ template constexpr bool is_matx_set_op(); **/ template void Exec(Op &op) const { - //std::cout << "exec on stfexecutor -- start\n"; #ifdef __CUDACC__ dim3 threads, blocks; From 92e72043e9d7eaa9709b57e3daf9db18d3f39875 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Tue, 17 Dec 2024 12:54:43 -0800 Subject: [PATCH 11/39] add support for cgsolve --- examples/cgsolve.cu | 2 +- include/matx/operators/cgsolve.h | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/examples/cgsolve.cu b/examples/cgsolve.cu index f77d8393..3d0a1c20 100644 --- a/examples/cgsolve.cu +++ b/examples/cgsolve.cu @@ -58,7 +58,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaExecutor exec{}; #else stfExecutor exec{}; - auto ctx = exec.getCtx(); #endif // Simple Poisson matrix @@ -89,6 +88,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) exec.sync(); #if 1 + auto ctx = exec.getCtx(); ctx.finalize(); #endif diff --git a/include/matx/operators/cgsolve.h b/include/matx/operators/cgsolve.h index 69628795..93245fe3 100644 --- a/include/matx/operators/cgsolve.h +++ b/include/matx/operators/cgsolve.h @@ -88,9 +88,19 @@ namespace matx return out_dims_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const 
noexcept { + /* Albert -- Scenario where the sum() operator is on the RHS and sum has already + run previously. So we make tmp_out have a read permission as it will be read from */ + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + + template void Exec(Out &&out, Executor &&ex) const{ //static_assert(is_cuda_executor_v, "cgsolve() only supports the CUDA executor currently"); + auto output = cuda::std::get<0>(out); + output.PreRun(out_dims_, std::forward(ex)); cgsolve_impl(cuda::std::get<0>(out), a_, b_, ex, tol_, max_iters_, ex.getStream()); } From a608f3fcf85e482eddbac407f5aa90ba97d06e6b Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Tue, 17 Dec 2024 13:07:10 -0800 Subject: [PATCH 12/39] update to the simple radar code --- examples/simple_radar_pipeline.cu | 15 ++++++++++++++- examples/simple_radar_pipeline.h | 7 +++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/examples/simple_radar_pipeline.cu b/examples/simple_radar_pipeline.cu index 10190d5a..1cec32a4 100644 --- a/examples/simple_radar_pipeline.cu +++ b/examples/simple_radar_pipeline.cu @@ -36,11 +36,18 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) { MATX_ENTER_HANDLER(); index_t numChannels = 16; +#if 1 index_t numPulses = 128; index_t numSamples = 9000; index_t waveformLength = 1000; - constexpr bool ENABLE_GRAPHS = false; uint32_t iterations = 100; +#else + index_t numPulses = 128; + index_t numSamples = 1000; + index_t waveformLength = 1000; + uint32_t iterations = 20; +#endif + constexpr bool ENABLE_GRAPHS = false; constexpr int num_streams = 1; cudaGraph_t graphs[num_streams]; cudaGraphExec_t instances[num_streams]; @@ -77,7 +84,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } /* Get STF context handle */ +#if 1 auto ctx = pipelines[0]->exec.getCtx(); +#endif MATX_NVTX_START_RANGE("Pipeline Test", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 2) printf("Running test...\n"); @@ -142,11 +151,14 @@ int main([[maybe_unused]] int 
argc, [[maybe_unused]] char **argv) pipelines[s]->sync(); } +#if 1 ctx.finalize(); +#endif MATX_NVTX_END_RANGE(2) MATX_NVTX_START_RANGE("Pipeline Results", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 3) +#if 1 float time_ms; cudaEventElapsedTime(&time_ms, starts[num_streams-1], stops[num_streams-1]); float time_s = time_ms * .001f; @@ -156,6 +168,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) time_ms, static_cast(mult) / time_s, static_cast(mult*sizeof(complex)*numSamples*8)/time_s/1e9); +#endif for (int s = 0; s < num_streams; s++) { cudaEventDestroy(starts[s]); diff --git a/examples/simple_radar_pipeline.h b/examples/simple_radar_pipeline.h index 87e19f94..336be74b 100644 --- a/examples/simple_radar_pipeline.h +++ b/examples/simple_radar_pipeline.h @@ -465,11 +465,10 @@ class RadarPipeline { tensor_t cfarMaskView; cudaStream_t stream; -#if 0 - cudaExecutor exec; -#else +#if 1 public: stfExecutor exec; +#else + cudaExecutor exec; #endif - }; From b0625778dda21476b5e361af2a6dee4a3137bd1b Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Tue, 17 Dec 2024 13:11:29 -0800 Subject: [PATCH 13/39] minor typo fix --- examples/simple_stf_test.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/simple_stf_test.cu b/examples/simple_stf_test.cu index 6c048f92..8947f191 100644 --- a/examples/simple_stf_test.cu +++ b/examples/simple_stf_test.cu @@ -141,8 +141,8 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) exec.sync(); #if 1 ctx.finalize(); - cudaEventElapsedTime(&time_ms, start, stop); #endif + cudaEventElapsedTime(&time_ms, start, stop); // printf("Output tensor :\n"); // print(output_tensor); From bbf9abc10665caf8382fc97ca1a42d4ff6e432b8 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Thu, 19 Dec 2024 11:37:17 -0800 Subject: [PATCH 14/39] update version of stf --- cmake/versions.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/versions.json b/cmake/versions.json index 
e6eba5b3..acea4f9a 100644 --- a/cmake/versions.json +++ b/cmake/versions.json @@ -3,7 +3,7 @@ "CCCL": { "version": "2.8.0", "git_url": "https://github.com/NVIDIA/cccl.git", - "git_tag": "cb1fce5e1cb7362940bd7e74ab8fbf01942b6264" + "git_tag": "980e55d37f9c48776df977a4f59f9ef0cdb816f2" } } } From 3e831ea4d35678d3bfc7e6bca1db15611a58b14c Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Thu, 19 Dec 2024 11:51:49 -0800 Subject: [PATCH 15/39] cleanup constexpr case for stfexecutor --- include/matx/operators/fft.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/include/matx/operators/fft.h b/include/matx/operators/fft.h index cf86acdc..9435d56a 100644 --- a/include/matx/operators/fft.h +++ b/include/matx/operators/fft.h @@ -156,7 +156,7 @@ namespace matx template void Exec(Out &&out, Executor &&ex) const { // stfexecutor case - if constexpr (!is_cuda_executor_v) { + if constexpr (is_stf_executor_v) { auto ctx = ex.getCtx(); auto tsk = ctx.task(); tsk.set_symbol("fft_task_no_perm"); @@ -185,8 +185,9 @@ namespace matx } } }; - } - else if constexpr (is_cuda_executor_v) { + } + // cudaExecutor or host case + else { if constexpr (std::is_same_v) { if constexpr (std::is_same_v) { fft_impl(cuda::std::get<0>(out), a_, fft_size_, norm_, ex); @@ -204,9 +205,6 @@ namespace matx } } } - else { - printf("SHOULDNT BE HERE..\n"); - } } template From 702fe79195c87abb78d4b54448f1671d1d13500d Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Thu, 19 Dec 2024 12:04:31 -0800 Subject: [PATCH 16/39] cleanup constexpr case for stfexecutor --- include/matx/core/tensor_impl.h | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/include/matx/core/tensor_impl.h b/include/matx/core/tensor_impl.h index f837f9cc..e79c4e06 100644 --- a/include/matx/core/tensor_impl.h +++ b/include/matx/core/tensor_impl.h @@ -1133,27 +1133,21 @@ IGNORE_WARNING_POP_GCC template __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, 
[[maybe_unused]] Executor &&ex) const noexcept { - using namespace cuda::experimental::stf; - data_place place; - - if constexpr (is_cuda_executor_v) { - return; - } - else if constexpr (!is_cuda_executor_v) { - + if constexpr (is_stf_executor_v) { + using namespace cuda::experimental::stf; /* Don't create a new logical data for a tensor if it alread had one created previously */ if (stf_ldata_ && stf_ldata_->has_value()) { return; } auto ctx = ex.getCtx(); #if 0 // Determine the type of memory that was allocated ie. host/managed/etc - place = getDataPlace(Data()); + data_place place = getDataPlace(Data()); #endif *stf_ldata_ = ctx.logical_data(cuda::experimental::stf::void_interface()); //stf_ldata_->value().set_write_back(false); stf_ldata_->value().set_symbol(this->str()); - } + } } template From 5bfe21e3bdcba37119f1665a9db537329d502d7e Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Thu, 19 Dec 2024 12:36:30 -0800 Subject: [PATCH 17/39] add conditional support for cudagraph to the stf executor --- include/matx/executors/stf.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/include/matx/executors/stf.h b/include/matx/executors/stf.h index 83650fc7..7f8b9674 100644 --- a/include/matx/executors/stf.h +++ b/include/matx/executors/stf.h @@ -59,32 +59,32 @@ template constexpr bool is_matx_set_op(); * * @param stream CUDA stream */ - stfExecutor(cudaStream_t stream) : stream_(stream) { + stfExecutor(cudaStream_t stream, bool is_cudagraph = false) : stream_(stream) { cuda::experimental::stf::async_resources_handle handle; - ctx_ = cuda::experimental::stf::stream_ctx(stream, handle); - //ctx_ = cuda::experimental::stf::graph_ctx(stream, handle); + if (!is_cudagraph) + ctx_ = cuda::experimental::stf::stream_ctx(stream, handle); + else + ctx_ = cuda::experimental::stf::graph_ctx(stream, handle); } - stfExecutor(int stream) : stream_(reinterpret_cast(stream)) { + + stfExecutor(int stream, bool is_cudagraph = false) 
: stream_(reinterpret_cast(stream)) { cuda::experimental::stf::async_resources_handle handle; - ctx_ = cuda::experimental::stf::stream_ctx(reinterpret_cast(stream), handle); - //ctx_ = cuda::experimental::stf::graph_ctx(reinterpret_cast(stream), handle); + if (!is_cudagraph) + ctx_ = cuda::experimental::stf::stream_ctx(reinterpret_cast(stream), handle); + else + ctx_ = cuda::experimental::stf::graph_ctx(reinterpret_cast(stream), handle); } /** * @brief Construct a new stfExecutor object using the default stream * */ - stfExecutor() : stream_(0) { - ctx_ = cuda::experimental::stf::stream_ctx(); - //ctx_ = cuda::experimental::stf::graph_ctx(); - } - -#if 0 - ~stfExecutor() { - std::cout << "About to call ctx.finalize\n"; - ctx_.finalize(); + stfExecutor(bool is_cudagraph = false) : stream_(0) { + if (!is_cudagraph) + ctx_ = cuda::experimental::stf::stream_ctx(); + else + ctx_ = cuda::experimental::stf::graph_ctx(); } -#endif /** * @brief Returns stream associated with executor From f407256a4815b4d43b9bc5d15ba2ad3dd2d4a39f Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Thu, 9 Jan 2025 12:46:19 -0800 Subject: [PATCH 18/39] update to latest cudastf --- cmake/versions.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/versions.json b/cmake/versions.json index acea4f9a..108b6bc1 100644 --- a/cmake/versions.json +++ b/cmake/versions.json @@ -3,7 +3,7 @@ "CCCL": { "version": "2.8.0", "git_url": "https://github.com/NVIDIA/cccl.git", - "git_tag": "980e55d37f9c48776df977a4f59f9ef0cdb816f2" + "git_tag": "36e27f7c1074010eefaab64d387ff6663569e065" } } } From 221599cb3e6a56861017969ec056c0766cfe8608 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Thu, 9 Jan 2025 12:52:56 -0800 Subject: [PATCH 19/39] switch to use logical token --- include/matx/core/tensor_impl.h | 3 +-- include/matx/executors/stf.h | 4 +++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/matx/core/tensor_impl.h b/include/matx/core/tensor_impl.h index 
e79c4e06..142d9008 100644 --- a/include/matx/core/tensor_impl.h +++ b/include/matx/core/tensor_impl.h @@ -1144,8 +1144,7 @@ IGNORE_WARNING_POP_GCC data_place place = getDataPlace(Data()); #endif - *stf_ldata_ = ctx.logical_data(cuda::experimental::stf::void_interface()); - //stf_ldata_->value().set_write_back(false); + *stf_ldata_ = ctx.logical_token(); stf_ldata_->value().set_symbol(this->str()); } } diff --git a/include/matx/executors/stf.h b/include/matx/executors/stf.h index 7f8b9674..e055a051 100644 --- a/include/matx/executors/stf.h +++ b/include/matx/executors/stf.h @@ -86,6 +86,9 @@ template constexpr bool is_matx_set_op(); ctx_ = cuda::experimental::stf::graph_ctx(); } + ~stfExecutor() { + //ctx_.finalize(); + } /** * @brief Returns stream associated with executor */ @@ -245,7 +248,6 @@ template constexpr bool is_matx_set_op(); #else MATX_ASSERT_STR(false, matxInvalidParameter, "Cannot call device executor using host compiler"); #endif - //std::cout << "exec on stfexecutor -- stop\n"; } private: From 7a5bb6cf72fabe62be9ee23f51d24e1682e55be5 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Thu, 9 Jan 2025 12:55:06 -0800 Subject: [PATCH 20/39] update parameters for radar code --- examples/simple_radar_pipeline.cu | 17 +++++++++++------ examples/simple_radar_pipeline.h | 2 ++ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/examples/simple_radar_pipeline.cu b/examples/simple_radar_pipeline.cu index 1cec32a4..47e8f912 100644 --- a/examples/simple_radar_pipeline.cu +++ b/examples/simple_radar_pipeline.cu @@ -35,20 +35,21 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) { MATX_ENTER_HANDLER(); +#if 0 index_t numChannels = 16; -#if 1 index_t numPulses = 128; index_t numSamples = 9000; index_t waveformLength = 1000; uint32_t iterations = 100; #else + index_t numChannels = 16; index_t numPulses = 128; index_t numSamples = 1000; index_t waveformLength = 1000; - uint32_t iterations = 20; + uint32_t iterations = 100; #endif 
constexpr bool ENABLE_GRAPHS = false; - constexpr int num_streams = 1; + constexpr int num_streams = 8; cudaGraph_t graphs[num_streams]; cudaGraphExec_t instances[num_streams]; using complex = cuda::std::complex; @@ -85,7 +86,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) /* Get STF context handle */ #if 1 - auto ctx = pipelines[0]->exec.getCtx(); #endif MATX_NVTX_START_RANGE("Pipeline Test", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 2) @@ -129,6 +129,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) #if 0 cudaEventRecord(starts[s], streams[s]); #else + auto ctx = pipelines[s]->exec.getCtx(); cudaEventRecord(starts[s], ctx.task_fence()); #endif } @@ -145,14 +146,18 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) for (int s = 0; s < num_streams; s++) { #if 0 cudaEventRecord(stops[s], streams[s]); + pipelines[s]->sync(); #else + auto ctx = pipelines[s]->exec.getCtx(); cudaEventRecord(stops[s], ctx.task_fence()); #endif - pipelines[s]->sync(); } #if 1 - ctx.finalize(); + for (int s = 0; s < num_streams; s++) { + auto ctx = pipelines[s]->exec.getCtx(); + ctx.finalize(); + } #endif MATX_NVTX_END_RANGE(2) diff --git a/examples/simple_radar_pipeline.h b/examples/simple_radar_pipeline.h index 336be74b..74bb4b9f 100644 --- a/examples/simple_radar_pipeline.h +++ b/examples/simple_radar_pipeline.h @@ -120,6 +120,7 @@ class RadarPipeline { RadarPipeline() = delete; ~RadarPipeline() { + std::cout << "DTOR for radar\n"; } @@ -137,6 +138,7 @@ class RadarPipeline { : numPulses(_numPulses), numSamples(_numSamples), waveformLength(_wfLen), numChannels(_numChannels), stream(_stream), exec(_stream) { + std::cout << "CTOR for pipeline\n"; numSamplesRnd = 1; while (numSamplesRnd < numSamples) { numSamplesRnd *= 2; From 0c2432fd092f63161f18809e797dca626ca61fbf Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Thu, 9 Jan 2025 13:27:46 -0800 Subject: [PATCH 21/39] update to radar code to work with command line args --- 
examples/simple_radar_pipeline.cu | 70 +++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 22 deletions(-) diff --git a/examples/simple_radar_pipeline.cu b/examples/simple_radar_pipeline.cu index 47e8f912..9bb91754 100644 --- a/examples/simple_radar_pipeline.cu +++ b/examples/simple_radar_pipeline.cu @@ -48,30 +48,61 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) index_t waveformLength = 1000; uint32_t iterations = 100; #endif + +#if 0 + constexpr int numStreams = 8; +#else + int numStreams = 1; +#endif + +#if 1 + // Parse command-line arguments + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + + if (arg == "--numChannels" && i + 1 < argc) { + numChannels = std::stoi(argv[++i]); + } else if (arg == "--numPulses" && i + 1 < argc) { + numPulses = std::stoi(argv[++i]); + } else if (arg == "--numSamples" && i + 1 < argc) { + numSamples = std::stoi(argv[++i]); + } else if (arg == "--waveformLength" && i + 1 < argc) { + waveformLength = std::stoi(argv[++i]); + } else if (arg == "--iterations" && i + 1 < argc) { + iterations = std::stoi(argv[++i]); + } else if (arg == "--numStreams" && i + 1 < argc) { + numStreams = std::stoi(argv[++i]); + } else { + std::cerr << "Unknown option or missing value: " << arg << std::endl; + return 1; // Exit with error + } + } +#endif + constexpr bool ENABLE_GRAPHS = false; - constexpr int num_streams = 8; - cudaGraph_t graphs[num_streams]; - cudaGraphExec_t instances[num_streams]; + cudaGraph_t graphs[numStreams]; + cudaGraphExec_t instances[numStreams]; using complex = cuda::std::complex; - RadarPipeline *pipelines[num_streams]; + RadarPipeline *pipelines[numStreams]; std::cout << "Iterations: " << iterations << std::endl; std::cout << "numChannels: " << numChannels << std::endl; std::cout << "numPulses: " << numPulses << std::endl; std::cout << "numNumSamples: " << numSamples << std::endl; std::cout << "waveformLength: " << waveformLength << std::endl; + std::cout << "numStreams: " << 
numStreams << std::endl; // cuda stream to place work in - cudaStream_t streams[num_streams]; + cudaStream_t streams[numStreams]; // manually set to log all NVTX levels MATX_NVTX_SET_LOG_LEVEL( matx_nvxtLogLevels::MATX_NVTX_LOG_ALL ); // create some events for timing - cudaEvent_t starts[num_streams]; - cudaEvent_t stops[num_streams]; + cudaEvent_t starts[numStreams]; + cudaEvent_t stops[numStreams]; - for (int s = 0; s < num_streams; s++) { + for (int s = 0; s < numStreams; s++) { cudaEventCreate(&starts[s]); cudaEventCreate(&stops[s]); cudaStreamCreate(&streams[s]); @@ -84,10 +115,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) pipelines[s]->sync(); } - /* Get STF context handle */ -#if 1 -#endif - MATX_NVTX_START_RANGE("Pipeline Test", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 2) printf("Running test...\n"); @@ -110,12 +137,12 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) }; // Warmup - for (int s = 0; s < num_streams; s++) { + for (int s = 0; s < numStreams; s++) { run_pipeline(s); } if (ENABLE_GRAPHS) { - for (int s = 0; s < num_streams; s++) { + for (int s = 0; s < numStreams; s++) { cudaStreamBeginCapture(streams[s], cudaStreamCaptureModeGlobal); run_pipeline(s); cudaStreamEndCapture(streams[s], &graphs[s]); @@ -124,7 +151,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } for (uint32_t i = 0; i < iterations; i++) { - for (int s = 0; s < num_streams; s++) { + for (int s = 0; s < numStreams; s++) { if (i == 1) { #if 0 cudaEventRecord(starts[s], streams[s]); @@ -143,18 +170,19 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } } - for (int s = 0; s < num_streams; s++) { + for (int s = 0; s < numStreams; s++) { #if 0 cudaEventRecord(stops[s], streams[s]); pipelines[s]->sync(); #else auto ctx = pipelines[s]->exec.getCtx(); cudaEventRecord(stops[s], ctx.task_fence()); + pipelines[s]->sync(); #endif } #if 1 - for (int s = 0; s < num_streams; s++) { + for (int s = 0; s < 
numStreams; s++) { auto ctx = pipelines[s]->exec.getCtx(); ctx.finalize(); } @@ -163,19 +191,17 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) MATX_NVTX_END_RANGE(2) MATX_NVTX_START_RANGE("Pipeline Results", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 3) -#if 1 float time_ms; - cudaEventElapsedTime(&time_ms, starts[num_streams-1], stops[num_streams-1]); + cudaEventElapsedTime(&time_ms, starts[numStreams-1], stops[numStreams-1]); float time_s = time_ms * .001f; - auto mult = iterations * numChannels * numPulses * num_streams; + auto mult = iterations * numChannels * numPulses * numStreams; printf("Pipeline finished in %.2fms, rate: %.2f pulses/channel/sec (%.2f Gbps)\n", time_ms, static_cast(mult) / time_s, static_cast(mult*sizeof(complex)*numSamples*8)/time_s/1e9); -#endif -for (int s = 0; s < num_streams; s++) { +for (int s = 0; s < numStreams; s++) { cudaEventDestroy(starts[s]); cudaEventDestroy(stops[s]); cudaStreamDestroy(streams[s]); From 3ae267be117b4d9b3640f72b32d198bcf8afa789 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Thu, 9 Jan 2025 15:03:26 -0800 Subject: [PATCH 22/39] cleanup to support different executor --- examples/simple_radar_pipeline.cu | 49 ++++++++++++------------------- examples/simple_radar_pipeline.h | 6 +++- 2 files changed, 24 insertions(+), 31 deletions(-) diff --git a/examples/simple_radar_pipeline.cu b/examples/simple_radar_pipeline.cu index 9bb91754..98ef04fd 100644 --- a/examples/simple_radar_pipeline.cu +++ b/examples/simple_radar_pipeline.cu @@ -35,19 +35,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) { MATX_ENTER_HANDLER(); -#if 0 index_t numChannels = 16; index_t numPulses = 128; index_t numSamples = 9000; index_t waveformLength = 1000; uint32_t iterations = 100; -#else - index_t numChannels = 16; - index_t numPulses = 128; - index_t numSamples = 1000; - index_t waveformLength = 1000; - uint32_t iterations = 100; -#endif #if 0 constexpr int numStreams = 8; @@ -55,7 +47,6 @@ int 
main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) int numStreams = 1; #endif -#if 1 // Parse command-line arguments for (int i = 1; i < argc; ++i) { std::string arg = argv[i]; @@ -77,21 +68,20 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) return 1; // Exit with error } } -#endif - - constexpr bool ENABLE_GRAPHS = false; - cudaGraph_t graphs[numStreams]; - cudaGraphExec_t instances[numStreams]; - using complex = cuda::std::complex; - RadarPipeline *pipelines[numStreams]; std::cout << "Iterations: " << iterations << std::endl; std::cout << "numChannels: " << numChannels << std::endl; std::cout << "numPulses: " << numPulses << std::endl; - std::cout << "numNumSamples: " << numSamples << std::endl; + std::cout << "numSamples: " << numSamples << std::endl; std::cout << "waveformLength: " << waveformLength << std::endl; std::cout << "numStreams: " << numStreams << std::endl; + constexpr bool ENABLE_GRAPHS = false; + cudaGraph_t graphs[numStreams]; + cudaGraphExec_t instances[numStreams]; + using complex = cuda::std::complex; + RadarPipeline *pipelines[numStreams]; + // cuda stream to place work in cudaStream_t streams[numStreams]; @@ -153,11 +143,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) for (uint32_t i = 0; i < iterations; i++) { for (int s = 0; s < numStreams; s++) { if (i == 1) { -#if 0 - cudaEventRecord(starts[s], streams[s]); -#else +#ifdef USE_STF auto ctx = pipelines[s]->exec.getCtx(); cudaEventRecord(starts[s], ctx.task_fence()); +#else + cudaEventRecord(starts[s], streams[s]); #endif } @@ -171,21 +161,20 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } for (int s = 0; s < numStreams; s++) { -#if 0 - cudaEventRecord(stops[s], streams[s]); - pipelines[s]->sync(); -#else +#ifdef USE_STF auto ctx = pipelines[s]->exec.getCtx(); cudaEventRecord(stops[s], ctx.task_fence()); - pipelines[s]->sync(); +#else + cudaEventRecord(stops[s], streams[s]); #endif + pipelines[s]->sync(); } -#if 1 
- for (int s = 0; s < numStreams; s++) { - auto ctx = pipelines[s]->exec.getCtx(); - ctx.finalize(); - } +#ifdef USE_STF + for (int s = 0; s < numStreams; s++) { + auto ctx = pipelines[s]->exec.getCtx(); + ctx.finalize(); + } #endif MATX_NVTX_END_RANGE(2) diff --git a/examples/simple_radar_pipeline.h b/examples/simple_radar_pipeline.h index 74bb4b9f..7aea6c8d 100644 --- a/examples/simple_radar_pipeline.h +++ b/examples/simple_radar_pipeline.h @@ -35,6 +35,10 @@ #include #include +#ifndef USE_STF +#define USE_STF 1 +#endif + using namespace matx; /** @@ -467,7 +471,7 @@ class RadarPipeline { tensor_t cfarMaskView; cudaStream_t stream; -#if 1 +#ifdef USE_STF public: stfExecutor exec; #else From 6a75794e7a17cffba884e65f882510a26609174c Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Fri, 24 Jan 2025 11:10:24 -0800 Subject: [PATCH 23/39] cleanup radar code to emit stf and cuda versions --- examples/CMakeLists.txt | 18 +++++++++++++++--- examples/simple_radar_pipeline.cu | 12 ++++++++++++ examples/simple_radar_pipeline.h | 7 ------- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 563b6fc3..fbbd1be1 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -20,9 +20,6 @@ set(examples black_scholes print_styles) - - - add_library(example_lib INTERFACE) target_include_directories(example_lib SYSTEM INTERFACE ${CUTLASS_INC} ${pybind11_INCLUDE_DIR} ${PYTHON_INCLUDE_DIRS}) @@ -54,6 +51,21 @@ foreach( example ${examples} ) target_link_libraries(${example} example_lib) endforeach() +# Compile all examples with CUDASTF and append their name with _stf +foreach( example ${examples} ) + string( CONCAT file ${example} ".cu" ) + + set(output_name "${example}_stf") + add_executable( ${output_name} ${file} ) + + # Add the -DUSE_STF compilation flag + target_compile_definitions(${output_name} PRIVATE USE_STF) + + target_link_libraries(${output_name} example_lib) +endforeach() + + + # Build 
proprietary examples file (GLOB_RECURSE proprietary_sources CONFIGURE_DEPENDS ${CMAKE_SOURCE_DIR}/proprietary/*/examples/*.cu) foreach (pexample ${proprietary_sources}) diff --git a/examples/simple_radar_pipeline.cu b/examples/simple_radar_pipeline.cu index 98ef04fd..fb6a64a2 100644 --- a/examples/simple_radar_pipeline.cu +++ b/examples/simple_radar_pipeline.cu @@ -41,6 +41,12 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) index_t waveformLength = 1000; uint32_t iterations = 100; +#ifdef USE_STF + std::cout << "Using STF executor\n"; +#else + std::cout << "Using CUDA executor\n"; +#endif + #if 0 constexpr int numStreams = 8; #else @@ -98,7 +104,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaStreamCreate(&streams[s]); MATX_NVTX_START_RANGE("Pipeline Initialize", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 1) +#if 0 printf("Initializing data structures for stream %d...\n", s); +#endif pipelines[s] = new RadarPipeline(numPulses, numSamples, waveformLength, numChannels, streams[s]); MATX_NVTX_END_RANGE(1) @@ -106,7 +114,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } MATX_NVTX_START_RANGE("Pipeline Test", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 2) +#if 0 printf("Running test...\n"); +#endif auto run_pipeline = [&](int s) { MATX_NVTX_START_RANGE("PulseCompression", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 21) @@ -164,8 +174,10 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) #ifdef USE_STF auto ctx = pipelines[s]->exec.getCtx(); cudaEventRecord(stops[s], ctx.task_fence()); + std::cout << "using stf 2\n"; #else cudaEventRecord(stops[s], streams[s]); + std::cout << "using cudaexec 2\n"; #endif pipelines[s]->sync(); } diff --git a/examples/simple_radar_pipeline.h b/examples/simple_radar_pipeline.h index 7aea6c8d..4892e679 100644 --- a/examples/simple_radar_pipeline.h +++ b/examples/simple_radar_pipeline.h @@ -35,10 +35,6 @@ #include #include -#ifndef USE_STF -#define USE_STF 1 -#endif 
- using namespace matx; /** @@ -124,8 +120,6 @@ class RadarPipeline { RadarPipeline() = delete; ~RadarPipeline() { - std::cout << "DTOR for radar\n"; - } /** @@ -142,7 +136,6 @@ class RadarPipeline { : numPulses(_numPulses), numSamples(_numSamples), waveformLength(_wfLen), numChannels(_numChannels), stream(_stream), exec(_stream) { - std::cout << "CTOR for pipeline\n"; numSamplesRnd = 1; while (numSamplesRnd < numSamples) { numSamplesRnd *= 2; From f1facca21812b70aef36d2348cbe16d11ffbbb0f Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Fri, 24 Jan 2025 11:14:52 -0800 Subject: [PATCH 24/39] test script that runs simple radar with different input sizes. outputs a .csv file of results --- examples/simple_radar_pipeline_test.sh | 45 ++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100755 examples/simple_radar_pipeline_test.sh diff --git a/examples/simple_radar_pipeline_test.sh b/examples/simple_radar_pipeline_test.sh new file mode 100755 index 00000000..12fd39a1 --- /dev/null +++ b/examples/simple_radar_pipeline_test.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +COMMAND_PARAMETERS=("../build/examples/simple_radar_pipeline" "../build/examples/simple_radar_pipeline_stf") +STREAM_PARAMETERS=("--numStreams 1" "--numStreams 2" "--numStreams 4" "--numStreams 8") +#SAMPLE_PARAMETERS=("--numSamples 1000" "--numSamples 4500" "--numSamples 9000") +SAMPLE_PARAMETERS=("--numSamples 1000" "--numSamples 2000" "--numSamples 4500" "--numSamples 6000") + +NUM_RUNS=5 + +OUTPUT_FILE="radar_heatmap_data.csv" +# Initialize the CSV file +echo "Command,NumStreams,NumSamples,AverageGbps" > $OUTPUT_FILE + +# Loop through the parameters +for command_param in "${COMMAND_PARAMETERS[@]}"; do + for str_param in "${STREAM_PARAMETERS[@]}"; do + for sam_param in "${SAMPLE_PARAMETERS[@]}"; do + TOTAL_GBPS=0 + for i in $(seq 1 $NUM_RUNS); do + #echo "Iteration $i with parameter $command_param $str_param $sam_param" + OUTPUT=$($command_param $str_param $sam_param) + GBPS=$(echo 
"$OUTPUT" | grep -oP '(?<=\().*? Gbps' | awk '{print $1}') + + # Add the extracted value to the total + if [ -n "$GBPS" ]; then + TOTAL_GBPS=$(echo "$TOTAL_GBPS + $GBPS" | bc) + else + echo "Failed to extract Gbps for iteration $i." + fi + done + + # Calculate the average + if [ "$NUM_RUNS" -gt 0 ]; then + AVERAGE_GBPS=$(echo "$TOTAL_GBPS / $NUM_RUNS" | bc -l) + #echo "$command_param $str_param $sam_param verage Gbps over $NUM_RUNS runs: $AVERAGE_GBPS" + # Append the results to the CSV file + echo "$command_param,$(echo $str_param | awk '{print $2}'),$(echo $sam_param | awk '{print $2}'),$AVERAGE_GBPS" >> $OUTPUT_FILE + else + echo "No runs were performed." + fi + done + done +done + +echo "Heatmap data saved to $OUTPUT_FILE." From 0199e7517c2a466912864f5c0d6daf5557760375 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Fri, 24 Jan 2025 15:05:23 -0800 Subject: [PATCH 25/39] enable cuda graphs as a command line argument enableGraphs --- examples/simple_radar_pipeline.cu | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/simple_radar_pipeline.cu b/examples/simple_radar_pipeline.cu index fb6a64a2..d200aca0 100644 --- a/examples/simple_radar_pipeline.cu +++ b/examples/simple_radar_pipeline.cu @@ -41,6 +41,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) index_t waveformLength = 1000; uint32_t iterations = 100; + bool enableGraphs = false; #ifdef USE_STF std::cout << "Using STF executor\n"; #else @@ -69,6 +70,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) iterations = std::stoi(argv[++i]); } else if (arg == "--numStreams" && i + 1 < argc) { numStreams = std::stoi(argv[++i]); + } else if (arg == "--enableGraphs") { + enableGraphs = true; + ++i; } else { std::cerr << "Unknown option or missing value: " << arg << std::endl; return 1; // Exit with error @@ -81,8 +85,8 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) std::cout << "numSamples: " << numSamples << 
std::endl; std::cout << "waveformLength: " << waveformLength << std::endl; std::cout << "numStreams: " << numStreams << std::endl; + std::cout << "enableGraphs: " << enableGraphs << std::endl; - constexpr bool ENABLE_GRAPHS = false; cudaGraph_t graphs[numStreams]; cudaGraphExec_t instances[numStreams]; using complex = cuda::std::complex; @@ -141,7 +145,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) run_pipeline(s); } - if (ENABLE_GRAPHS) { + if (enableGraphs) { for (int s = 0; s < numStreams; s++) { cudaStreamBeginCapture(streams[s], cudaStreamCaptureModeGlobal); run_pipeline(s); @@ -161,7 +165,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) #endif } - if (ENABLE_GRAPHS) { + if (enableGraphs) { cudaGraphLaunch(instances[s], streams[s]); } else { @@ -174,10 +178,8 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) #ifdef USE_STF auto ctx = pipelines[s]->exec.getCtx(); cudaEventRecord(stops[s], ctx.task_fence()); - std::cout << "using stf 2\n"; #else cudaEventRecord(stops[s], streams[s]); - std::cout << "using cudaexec 2\n"; #endif pipelines[s]->sync(); } From 39b16f4bce9e2ffcec5a1aaa127d63c05ec173f1 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 27 Jan 2025 15:41:11 -0800 Subject: [PATCH 26/39] add support for the random/randomOp generator --- include/matx/generators/random.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/matx/generators/random.h b/include/matx/generators/random.h index 9cd89854..ee3ada05 100644 --- a/include/matx/generators/random.h +++ b/include/matx/generators/random.h @@ -305,7 +305,7 @@ namespace detail { { InnerPreRun(std::forward(shape), std::forward(ex)); #ifdef __CUDACC__ - if constexpr (is_cuda_executor_v) { + if constexpr ((is_cuda_executor_v) || (is_stf_executor_v)) { if (!init_) { auto stream = ex.getStream(); matxAlloc((void **)&states_, @@ -341,7 +341,7 @@ namespace detail { template __MATX_INLINE__ void 
PostRun([[maybe_unused]] ST &&shape, [[maybe_unused]] Executor &&ex) const noexcept { - if constexpr (is_cuda_executor_v) { + if constexpr ((is_cuda_executor_v) || (is_stf_executor_v)) { matxFree(states_); } else if constexpr (is_host_executor_v) { @@ -479,6 +479,9 @@ namespace detail { return shape_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task([[maybe_unused]] Task &&task, [[maybe_unused]] int perm=1) const noexcept { } + static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() { return RANK; } }; } From 9b7c4b0e4c7ebf2f08e1dcc114d0b2827cd7a4b1 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 27 Jan 2025 15:42:24 -0800 Subject: [PATCH 27/39] get the basic spectrogram code working with stf --- examples/spectrogram.cu | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/examples/spectrogram.cu b/examples/spectrogram.cu index 8d566b32..16464042 100644 --- a/examples/spectrogram.cu +++ b/examples/spectrogram.cu @@ -65,7 +65,18 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventCreate(&start); cudaEventCreate(&stop); +#ifdef USE_STF + std::cout << "Using STF executor\n"; +#else + std::cout << "Using CUDA executor\n"; +#endif + + +#ifdef USE_STF + stfExecutor exec{stream}; +#else cudaExecutor exec{stream}; +#endif float fs = 10000; constexpr index_t N = 100000; @@ -108,7 +119,12 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) for (uint32_t i = 0; i < num_iterations; i++) { if (i == 2) { // Start timer on third loop to allow generation of plot +#if USE_STF + auto ctx = exec.getCtx(); + cudaEventRecord(start, ctx.task_fence()); +#else cudaEventRecord(start, stream); +#endif } // DFT Sample Frequencies (rfftfreq) @@ -142,9 +158,23 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } } - +#ifdef USE_STF +{ + auto ctx = exec.getCtx(); + cudaEventRecord(stop, ctx.task_fence()); +} +#else cudaEventRecord(stop, stream); 
+#endif exec.sync(); + +#ifdef USE_STF +{ + auto ctx = exec.getCtx(); + ctx.finalize(); +} +#endif + cudaEventElapsedTime(&time_ms, start, stop); printf("Spectrogram Time Without Graphs = %.2fus per iteration\n", From f9e09f104550ac202e01b4af201160247b0c13fc Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 27 Jan 2025 15:45:20 -0800 Subject: [PATCH 28/39] get spectrogram cudagraph code working with stf --- examples/spectrogram_graph.cu | 36 +++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/examples/spectrogram_graph.cu b/examples/spectrogram_graph.cu index 6f0583c8..91958ca2 100644 --- a/examples/spectrogram_graph.cu +++ b/examples/spectrogram_graph.cu @@ -62,12 +62,22 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaStream_t stream; cudaStreamCreate(&stream); - cudaExecutor exec{stream}; - cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); +#ifdef USE_STF + std::cout << "Using STF executor\n"; +#else + std::cout << "Using CUDA executor\n"; +#endif + +#ifdef USE_STF + stfExecutor exec{stream}; +#else + cudaExecutor exec{stream}; +#endif + float fs = 10000; index_t N = 100000; float amp = static_cast(2 * sqrt(2)); @@ -147,15 +157,33 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } } - exec.sync(); // Time graph execution of same kernels - cudaEventRecord(start, stream); +#if USE_STF + auto ctx = exec.getCtx(); + cudaEventRecord(start, ctx.task_fence()); +#else + cudaEventRecord(start, stream); +#endif + for (uint32_t i = 0; i < 10; i++) { cudaGraphLaunch(instance, stream); } +#ifdef USE_STF +{ + cudaEventRecord(stop, ctx.task_fence()); +} +#else cudaEventRecord(stop, stream); +#endif exec.sync(); + +#ifdef USE_STF +{ + ctx.finalize(); +} +#endif + cudaEventElapsedTime(&time_ms, start, stop); printf("Spectrogram Time With Graphs = %.2fus per iteration\n", From 6c9a791576fcd8be714fbb7c9f133050e87db579 Mon Sep 17 00:00:00 2001 From: Albert 
Sidelnik 
Date: Mon, 10 Feb 2025 10:35:44 -0800
Subject: [PATCH 29/39] add assert in the case stream capture is turned on if creating a plan

---
 include/matx/transforms/fft/fft_cuda.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/include/matx/transforms/fft/fft_cuda.h b/include/matx/transforms/fft/fft_cuda.h
index f4793b07..f8ea52a4 100644
--- a/include/matx/transforms/fft/fft_cuda.h
+++ b/include/matx/transforms/fft/fft_cuda.h
@@ -333,7 +333,11 @@ template class matxCUDAFFTPlan_t
   virtual ~matxCUDAFFTPlan_t()
   {
     if (this->workspace_ != nullptr) {
       // Pass the default stream until we allow user-deletable caches
+      /* Albert -- Temporarily remove this free as we likely don't want to
+         insert the dependence on cudaStreamDefault */
+#if 0
       matxFree(workspace_, cudaStreamDefault);
+#endif
       this->workspace_ = nullptr;
     }

@@ -424,6 +428,11 @@ matxCUDAFFTPlan1D_t(OutTensorType &o, const InTensorType &i, cudaStream_t stream
     }
   }

+  // Albert - Assert that the stream is NOT in capture mode (cuFFT plan creation is unsafe during stream capture)
+  cudaStreamCaptureStatus status;
+  cudaStreamIsCapturing(stream, &status);
+  MATX_ASSERT(status == cudaStreamCaptureStatusNone, matxCufftError);
+
   size_t workspaceSize;
   cufftCreate(&this->plan_);
   [[maybe_unused]] cufftResult error;

From bbb9aaef05d6fb7374f3822679974fef62436d31 Mon Sep 17 00:00:00 2001
From: Cedric Augonnet 
Date: Mon, 24 Mar 2025 11:08:50 +0100
Subject: [PATCH 30/39] Apps using matx with stf should get these flags

---
 CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 135127a6..66f862d7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -136,7 +136,9 @@ rapids_cpm_cccl(
 )
 
 target_link_libraries(matx INTERFACE CCCL::CCCL CCCL::cudax)
-set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --extended-lambda)
+target_compile_options(matx INTERFACE
+  $<$:--expt-relaxed-constexpr --extended-lambda>
+)
 
 # Set flags for compiling tests faster (only for nvcc)
 if (NOT CMAKE_CUDA_COMPILER_ID STREQUAL 
"Clang") From e13c9b657662a7f50afeface9e3a50e749934c82 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Mon, 24 Mar 2025 11:09:56 +0100 Subject: [PATCH 31/39] fix constructor --- include/matx/core/tensor_impl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/matx/core/tensor_impl.h b/include/matx/core/tensor_impl.h index e03894e2..ba0f9bf9 100644 --- a/include/matx/core/tensor_impl.h +++ b/include/matx/core/tensor_impl.h @@ -262,8 +262,9 @@ MATX_IGNORE_WARNING_POP_GCC template ::type>, bool> = true> __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ tensor_impl_t(T *const ldata, DescriptorType &&desc, std::optional *stf_ldata) - : ldata_(ldata), desc_{std::forward(desc)}, stf_ldata_(stf_ldata) + : desc_{std::forward(desc)}, stf_ldata_(stf_ldata) { + data_.ldata_ = ldata; } /** From 7244399f66ffaaeb695d01e5fb05aa89de721cb6 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 21 Apr 2025 14:29:23 -0700 Subject: [PATCH 32/39] fix typo/bug --- examples/simple_stf_test.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/simple_stf_test.cu b/examples/simple_stf_test.cu index 8947f191..1df260ed 100644 --- a/examples/simple_stf_test.cu +++ b/examples/simple_stf_test.cu @@ -151,6 +151,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventDestroy(start); cudaEventDestroy(stop); cudaStreamDestroy(stream); - CUDA_CHECK_LAST_ERROR(); + //CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } From 66f68506b2ab846ea11ef207ba260c9165c03c04 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 21 Apr 2025 15:09:24 -0700 Subject: [PATCH 33/39] update to example code to fix compile error --- examples/fft_conv.cu | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/fft_conv.cu b/examples/fft_conv.cu index 0fbba06b..d7882ca3 100644 --- a/examples/fft_conv.cu +++ b/examples/fft_conv.cu @@ -80,13 +80,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char 
**argv) std::cout << "Using CUDA executor\n"; #endif -#ifdef USE_STF - stfExecutor exec{}; - auto ctx = exec.getCtx(); -#else - cudaExecutor exec{}; -#endif - + index_t signal_size = 1ULL << 16; index_t filter_size = 16; index_t batches = 8; @@ -99,8 +93,14 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); - cudaExecutor exec{stream}; +#ifdef USE_STF + stfExecutor exec{stream}; + auto ctx = exec.getCtx(); +#else + cudaExecutor exec{stream}; +#endif + // Create time domain buffers auto sig_time = make_tensor({batches, signal_size}); auto filt_time = make_tensor({batches, filter_size}); @@ -135,6 +135,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) #else cudaEventRecord(start, stream); #endif + } (sig_freq = fft(sig_time, filtered_size)).run(exec); (filt_freq = fft(filt_time, filtered_size)).run(exec); From 89e2a4323abf5bb667271396d83ee80c874e9d2c Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 21 Apr 2025 15:10:28 -0700 Subject: [PATCH 34/39] update to example code to fix compile error --- examples/spectrogram.cu | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/examples/spectrogram.cu b/examples/spectrogram.cu index f9bab85c..60404a7e 100644 --- a/examples/spectrogram.cu +++ b/examples/spectrogram.cu @@ -65,17 +65,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventCreate(&start); cudaEventCreate(&stop); -#ifdef USE_STF - std::cout << "Using STF executor\n"; -#else - std::cout << "Using CUDA executor\n"; -#endif - -#ifdef USE_STF - stfExecutor exec{stream}; -#else cudaExecutor exec{stream}; -#endif float fs = 10000; constexpr index_t N = 100000; @@ -155,10 +145,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) exec.stop_timer(); exec.sync(); - #ifdef USE_STF - auto ctx = exec.getCtx(); - ctx.finalize(); - #endif time_ms = exec.get_time_ms(); printf("Spectrogram Time Without 
Graphs = %.2fus per iteration\n",

From 973886b3747f0ece33151a1d195716387ef0335f Mon Sep 17 00:00:00 2001
From: Albert Sidelnik 
Date: Mon, 21 Apr 2025 15:23:03 -0700
Subject: [PATCH 35/39] update test script for radar code

---
 examples/simple_radar_pipeline_test.sh | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/examples/simple_radar_pipeline_test.sh b/examples/simple_radar_pipeline_test.sh
index 12fd39a1..880e9ed0 100755
--- a/examples/simple_radar_pipeline_test.sh
+++ b/examples/simple_radar_pipeline_test.sh
@@ -1,24 +1,25 @@
 #!/bin/bash
 
 COMMAND_PARAMETERS=("../build/examples/simple_radar_pipeline" "../build/examples/simple_radar_pipeline_stf")
-STREAM_PARAMETERS=("--numStreams 1" "--numStreams 2" "--numStreams 4" "--numStreams 8")
+STREAM_PARAMETERS=("1" "2" "4" "5" "6")
 #SAMPLE_PARAMETERS=("--numSamples 1000" "--numSamples 4500" "--numSamples 9000")
-SAMPLE_PARAMETERS=("--numSamples 1000" "--numSamples 2000" "--numSamples 4500" "--numSamples 6000")
+SAMPLE_PARAMETERS=("1000" "2000" "4500" "6000" "10000")
 
 NUM_RUNS=5
 
 OUTPUT_FILE="radar_heatmap_data.csv"
 # Initialize the CSV file
-echo "Command,NumStreams,NumSamples,AverageGbps" > $OUTPUT_FILE
 
 # Loop through the parameters
-for command_param in "${COMMAND_PARAMETERS[@]}"; do
-    for str_param in "${STREAM_PARAMETERS[@]}"; do
+for str_param in "${STREAM_PARAMETERS[@]}"; do
+    OUTPUT_FILE="radar_heatmap_data_${str_param}.csv"
+    echo "Command,NumStreams,NumSamples,AverageGbps" > $OUTPUT_FILE
+    for command_param in "${COMMAND_PARAMETERS[@]}"; do
         for sam_param in "${SAMPLE_PARAMETERS[@]}"; do
             TOTAL_GBPS=0
             for i in $(seq 1 $NUM_RUNS); do
                 #echo "Iteration $i with parameter $command_param $str_param $sam_param"
-                OUTPUT=$($command_param $str_param $sam_param)
+                OUTPUT=$($command_param --numStreams $str_param --numSamples $sam_param)
                 GBPS=$(echo 
Gbps' | awk '{print $1}') # Add the extracted value to the total @@ -34,7 +35,8 @@ for command_param in "${COMMAND_PARAMETERS[@]}"; do AVERAGE_GBPS=$(echo "$TOTAL_GBPS / $NUM_RUNS" | bc -l) #echo "$command_param $str_param $sam_param verage Gbps over $NUM_RUNS runs: $AVERAGE_GBPS" # Append the results to the CSV file - echo "$command_param,$(echo $str_param | awk '{print $2}'),$(echo $sam_param | awk '{print $2}'),$AVERAGE_GBPS" >> $OUTPUT_FILE + echo "$command_param,$(echo $str_param | awk '{print $0}'),$(echo $sam_param | awk '{print $0}'),$AVERAGE_GBPS" + echo "$command_param,$(echo $str_param | awk '{print $0}'),$(echo $sam_param | awk '{print $0}'),$AVERAGE_GBPS" >> $OUTPUT_FILE else echo "No runs were performed." fi From 92885e70ffc8b67396d0fc8f9ad7199cf47ebc62 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 21 Apr 2025 15:24:54 -0700 Subject: [PATCH 36/39] temp fix to the allocator dtor --- include/matx/core/allocator.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/matx/core/allocator.h b/include/matx/core/allocator.h index ecb29e08..db841999 100644 --- a/include/matx/core/allocator.h +++ b/include/matx/core/allocator.h @@ -236,9 +236,13 @@ struct MemTracker { } ~MemTracker() { +#if 0 while (allocationMap.size()) { deallocate(allocationMap.begin()->first); } +#else + std::cout << "TODO: Fix me allocator.h\n"; +#endif } }; From 8607840dfc57873219d71a1a5960f35e13e00102 Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 21 Apr 2025 15:26:44 -0700 Subject: [PATCH 37/39] remove warning to work with latest stf --- CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 66f862d7..5fac5862 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -168,8 +168,7 @@ if (NOT ${IS_NVCPP} GREATER -1) -Wmisleading-indentation -Wduplicated-cond -Wduplicated-branches - -Wlogical-op - -Wnull-dereference) + -Wlogical-op) endif() endif() From 14e0985b568331e27f855b2703f6da2b02142726 Mon 
Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 21 Apr 2025 16:46:28 -0700 Subject: [PATCH 38/39] replace logical token with token --- include/matx/core/tensor_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/matx/core/tensor_impl.h b/include/matx/core/tensor_impl.h index ba0f9bf9..7a89046c 100644 --- a/include/matx/core/tensor_impl.h +++ b/include/matx/core/tensor_impl.h @@ -1265,7 +1265,7 @@ MATX_IGNORE_WARNING_POP_GCC data_place place = getDataPlace(Data()); #endif - *stf_ldata_ = ctx.logical_token(); + *stf_ldata_ = ctx.token(); stf_ldata_->value().set_symbol(this->str()); } } From 92e04d56f9a586c16c1203470f88531fabd0e75e Mon Sep 17 00:00:00 2001 From: Albert Sidelnik Date: Mon, 21 Apr 2025 16:47:40 -0700 Subject: [PATCH 39/39] update version to use cccl from main --- cmake/versions.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/versions.json b/cmake/versions.json index 0c44e05d..82cae91a 100644 --- a/cmake/versions.json +++ b/cmake/versions.json @@ -4,7 +4,7 @@ "version": "2.8.0", "git_shallow": false, "git_url": "https://github.com/NVIDIA/cccl.git", - "git_tag": "36e27f7c1074010eefaab64d387ff6663569e065" + "git_tag": "9f254d5d8d071d67c6a6ad107b0bbd578b3d072d" }, "nvbench" : { "version" : "0.0",