diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8037337c..5fac5862 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -128,13 +128,17 @@ if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.5)
     message(FATAL_ERROR "MatX requires CUDA 11.5 or higher. Please update before using.")
 endif()
 
+set(CCCL_ENABLE_UNSTABLE ON)
 message(STATUS "Finding CCCL...")
 rapids_cpm_cccl(
     BUILD_EXPORT_SET matx-exports
     INSTALL_EXPORT_SET matx-exports
 )
-target_link_libraries(matx INTERFACE CCCL::CCCL)
+target_link_libraries(matx INTERFACE CCCL::CCCL CCCL::cudax)
+target_compile_options(matx INTERFACE
+    $<$:--expt-relaxed-constexpr --extended-lambda>
+)
 
 # Set flags for compiling tests faster (only for nvcc)
 if (NOT CMAKE_CUDA_COMPILER_ID STREQUAL "Clang")
@@ -164,8 +168,7 @@ if (NOT ${IS_NVCPP} GREATER -1)
           -Wmisleading-indentation
           -Wduplicated-cond
          -Wduplicated-branches
-          -Wlogical-op
-          -Wnull-dereference)
+          -Wlogical-op)
   endif()
 endif()
diff --git a/cmake/versions.json b/cmake/versions.json
index df7c4ce6..82cae91a 100644
--- a/cmake/versions.json
+++ b/cmake/versions.json
@@ -4,7 +4,7 @@
     "version": "2.8.0",
     "git_shallow": false,
     "git_url": "https://github.com/NVIDIA/cccl.git",
-    "git_tag": "6d02e11"
+    "git_tag": "9f254d5d8d071d67c6a6ad107b0bbd578b3d072d"
   },
   "nvbench" : {
     "version" : "0.0",
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 077efeeb..95430347 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -12,6 +12,7 @@ set(examples
     pwelch
     resample_poly_bench
     sparse_tensor
+    simple_stf_test
     spectrogram
     spectrogram_graph
     spherical_harmonics
@@ -20,9 +21,6 @@ set(examples
     black_scholes
     print_styles)
 
-
-
-
 add_library(example_lib INTERFACE)
 target_include_directories(example_lib SYSTEM INTERFACE ${CUTLASS_INC} ${pybind11_INCLUDE_DIR} ${PYTHON_INCLUDE_DIRS})
@@ -54,6 +52,21 @@ foreach( example ${examples} )
     target_link_libraries(${example} example_lib)
 endforeach()
 
+# Compile all examples with CUDASTF and append their name with _stf
+foreach( example ${examples} )
+    string( CONCAT file ${example} ".cu" )
+
+    set(output_name "${example}_stf")
+    add_executable( ${output_name} ${file} )
+
+    # Add the -DUSE_STF compilation flag
+    target_compile_definitions(${output_name} PRIVATE USE_STF)
+
+    target_link_libraries(${output_name} example_lib)
+endforeach()
+
+
+
 # Build proprietary examples
 file(GLOB_RECURSE proprietary_sources CONFIGURE_DEPENDS ${CMAKE_SOURCE_DIR}/proprietary/*/examples/*.cu)
 foreach (pexample ${proprietary_sources})
diff --git a/examples/cgsolve.cu b/examples/cgsolve.cu
index cba23f02..7c50ef85 100644
--- a/examples/cgsolve.cu
+++ b/examples/cgsolve.cu
@@ -54,7 +54,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
   auto norm = make_tensor({BATCH});
   auto maxn = make_tensor({});
 
+#if 0
   cudaExecutor exec{};
+#else
+  stfExecutor exec{};
+#endif
 
   // Simple Poisson matrix
   for(int b = 0; b < BATCH; b++) {
@@ -83,6 +87,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
   (maxn = matx::max(sqrt(norm))).run(exec);
   exec.sync();
 
+#if 1
+  auto ctx = exec.getCtx();
+  ctx.finalize();
+#endif
+
   // example-end sync-test-1
 
   printf ("max l2 norm: %f\n", (float)sqrt(maxn()));
diff --git a/examples/fft_conv.cu b/examples/fft_conv.cu
index c6361481..d7882ca3 100644
--- a/examples/fft_conv.cu
+++ b/examples/fft_conv.cu
@@ -74,6 +74,13 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
   MATX_ENTER_HANDLER();
   using complex = cuda::std::complex;
 
+#ifdef USE_STF
+  std::cout << "Using STF executor\n";
+#else
+  std::cout << "Using CUDA 
executor\n"; +#endif + + index_t signal_size = 1ULL << 16; index_t filter_size = 16; index_t batches = 8; @@ -86,8 +93,14 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); - cudaExecutor exec{stream}; +#ifdef USE_STF + stfExecutor exec{stream}; + auto ctx = exec.getCtx(); +#else + cudaExecutor exec{stream}; +#endif + // Create time domain buffers auto sig_time = make_tensor({batches, signal_size}); auto filt_time = make_tensor({batches, filter_size}); @@ -117,7 +130,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // Perform the FFT in-place on both signal and filter for (int i = 0; i < iterations; i++) { if (i == 1) { - cudaEventRecord(start, stream); +#ifdef USE_STF + cudaEventRecord(start, ctx.task_fence()); +#else + cudaEventRecord(start, stream); +#endif } (sig_freq = fft(sig_time, filtered_size)).run(exec); (filt_freq = fft(filt_time, filtered_size)).run(exec); @@ -129,18 +146,30 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } +#ifdef USE_STF + cudaEventRecord(stop, ctx.task_fence()); +#else cudaEventRecord(stop, stream); +#endif exec.sync(); cudaEventElapsedTime(&separate_ms, start, stop); for (int i = 0; i < iterations; i++) { if (i == 1) { - cudaEventRecord(start, stream); +#ifdef USE_STF + cudaEventRecord(start, ctx.task_fence()); +#else + cudaEventRecord(start, stream); +#endif } (sig_freq = ifft(fft(sig_time, filtered_size) * fft(filt_time, filtered_size))).run(exec); } +#ifdef USE_STF + cudaEventRecord(stop, ctx.task_fence()); +#else cudaEventRecord(stop, stream); +#endif exec.sync(); cudaEventElapsedTime(&fused_ms, start, stop); @@ -153,7 +182,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) (time_out = conv1d(sig_time, filt1, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); exec.sync(); - +#ifdef USE_STF + ctx.finalize(); +#endif // Compare signals for (index_t b = 0; b < batches; b++) { for (index_t i = 0; i < filtered_size; i++) { @@ -172,4 +203,4 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); -} \ No newline at end of file +} diff --git a/examples/simple_radar_pipeline.cu b/examples/simple_radar_pipeline.cu index 126ded2b..00e67e00 100644 --- a/examples/simple_radar_pipeline.cu +++ b/examples/simple_radar_pipeline.cu @@ -39,37 +39,78 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) index_t numPulses = 128; index_t numSamples = 9000; index_t waveformLength = 1000; - constexpr bool ENABLE_GRAPHS = false; uint32_t iterations = 100; - constexpr int num_streams = 1; - cudaGraph_t graphs[num_streams]; - cudaGraphExec_t instances[num_streams]; - using complex = cuda::std::complex; - RadarPipeline *pipelines[num_streams]; + + bool enableGraphs = false; +#ifdef USE_STF + std::cout << "Using STF executor\n"; +#else + std::cout << "Using CUDA executor\n"; +#endif + +#if 0 + constexpr int numStreams = 8; +#else + int numStreams = 1; +#endif + + // Parse command-line arguments + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + + if (arg == "--numChannels" && i + 1 < argc) { + numChannels = std::stoi(argv[++i]); + } else if (arg == "--numPulses" && i + 1 < argc) { + numPulses = std::stoi(argv[++i]); + } else if (arg == "--numSamples" && i + 1 < argc) { + numSamples = std::stoi(argv[++i]); + } else if (arg == "--waveformLength" && i + 1 < argc) { + waveformLength = std::stoi(argv[++i]); + } else if (arg == "--iterations" 
&& i + 1 < argc) { + iterations = std::stoi(argv[++i]); + } else if (arg == "--numStreams" && i + 1 < argc) { + numStreams = std::stoi(argv[++i]); + } else if (arg == "--enableGraphs") { + enableGraphs = true; + ++i; + } else { + std::cerr << "Unknown option or missing value: " << arg << std::endl; + return 1; // Exit with error + } + } std::cout << "Iterations: " << iterations << std::endl; std::cout << "numChannels: " << numChannels << std::endl; std::cout << "numPulses: " << numPulses << std::endl; - std::cout << "numNumSamples: " << numSamples << std::endl; + std::cout << "numSamples: " << numSamples << std::endl; std::cout << "waveformLength: " << waveformLength << std::endl; + std::cout << "numStreams: " << numStreams << std::endl; + std::cout << "enableGraphs: " << enableGraphs << std::endl; - // cuda stream to place work in - cudaStream_t streams[num_streams]; + cudaGraph_t graphs[numStreams]; + cudaGraphExec_t instances[numStreams]; + using complex = cuda::std::complex; + RadarPipeline *pipelines[numStreams]; + // cuda stream to place work in + cudaStream_t streams[numStreams]; + // manually set to log all NVTX levels MATX_NVTX_SET_LOG_LEVEL( matx_nvxtLogLevels::MATX_NVTX_LOG_ALL ); // create some events for timing - cudaEvent_t starts[num_streams]; - cudaEvent_t stops[num_streams]; + cudaEvent_t starts[numStreams]; + cudaEvent_t stops[numStreams]; - for (int s = 0; s < num_streams; s++) { + for (int s = 0; s < numStreams; s++) { cudaEventCreate(&starts[s]); cudaEventCreate(&stops[s]); cudaStreamCreate(&streams[s]); MATX_NVTX_START_RANGE("Pipeline Initialize", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 1) +#if 0 printf("Initializing data structures for stream %d...\n", s); +#endif pipelines[s] = new RadarPipeline(numPulses, numSamples, waveformLength, numChannels, streams[s]); MATX_NVTX_END_RANGE(1) @@ -77,7 +118,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } MATX_NVTX_START_RANGE("Pipeline Test", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 2) +#if 0 printf("Running test...\n"); +#endif auto run_pipeline = [&](int s) { MATX_NVTX_START_RANGE("PulseCompression", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 21) @@ -98,12 +141,12 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) }; // Warmup - for (int s = 0; s < num_streams; s++) { + for (int s = 0; s < numStreams; s++) { run_pipeline(s); } - if (ENABLE_GRAPHS) { - for (int s = 0; s < num_streams; s++) { + if (enableGraphs) { + for (int s = 0; s < numStreams; s++) { cudaStreamBeginCapture(streams[s], cudaStreamCaptureModeGlobal); run_pipeline(s); cudaStreamEndCapture(streams[s], &graphs[s]); @@ -112,12 +155,17 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } for (uint32_t i = 0; i < iterations; i++) { - for (int s = 0; s < num_streams; s++) { + for (int s = 0; s < numStreams; s++) { if (i == 1) { +#ifdef USE_STF + auto ctx = pipelines[s]->exec.getCtx(); + cudaEventRecord(starts[s], ctx.task_fence()); +#else cudaEventRecord(starts[s], streams[s]); +#endif } - if (ENABLE_GRAPHS) { + if (enableGraphs) { cudaGraphLaunch(instances[s], streams[s]); } else { @@ -126,24 +174,37 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } } - for (int s = 0; s < num_streams; s++) { + for (int s = 0; s < numStreams; s++) { +#ifdef USE_STF + auto ctx = pipelines[s]->exec.getCtx(); + cudaEventRecord(stops[s], ctx.task_fence()); +#else cudaEventRecord(stops[s], streams[s]); +#endif pipelines[s]->sync(); } + +#ifdef USE_STF + for (int s = 0; s < numStreams; s++) { + auto ctx = 
pipelines[s]->exec.getCtx(); + ctx.finalize(); + } +#endif + MATX_NVTX_END_RANGE(2) MATX_NVTX_START_RANGE("Pipeline Results", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 3) float time_ms; - cudaEventElapsedTime(&time_ms, starts[num_streams-1], stops[num_streams-1]); + cudaEventElapsedTime(&time_ms, starts[numStreams-1], stops[numStreams-1]); float time_s = time_ms * .001f; - auto mult = iterations * numChannels * numPulses * num_streams; + auto mult = iterations * numChannels * numPulses * numStreams; printf("Pipeline finished in %.2fms, rate: %.2f pulses/channel/sec (%.2f Gbps)\n", time_ms, static_cast(mult) / time_s, static_cast(mult*sizeof(complex)*numSamples*8)/time_s/1e9); -for (int s = 0; s < num_streams; s++) { +for (int s = 0; s < numStreams; s++) { cudaEventDestroy(starts[s]); cudaEventDestroy(stops[s]); cudaStreamDestroy(streams[s]); diff --git a/examples/simple_radar_pipeline.h b/examples/simple_radar_pipeline.h index 0f52ccf2..4892e679 100644 --- a/examples/simple_radar_pipeline.h +++ b/examples/simple_radar_pipeline.h @@ -120,7 +120,6 @@ class RadarPipeline { RadarPipeline() = delete; ~RadarPipeline() { - } /** @@ -465,5 +464,10 @@ class RadarPipeline { tensor_t cfarMaskView; cudaStream_t stream; +#ifdef USE_STF +public: + stfExecutor exec; +#else cudaExecutor exec; +#endif }; diff --git a/examples/simple_radar_pipeline_test.sh b/examples/simple_radar_pipeline_test.sh new file mode 100755 index 00000000..880e9ed0 --- /dev/null +++ b/examples/simple_radar_pipeline_test.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +COMMAND_PARAMETERS=("../build/examples/simple_radar_pipeline" "../build/examples/simple_radar_pipeline_stf") +STREAM_PARAMETERS=("1" "2" "4" "5" "6") +#SAMPLE_PARAMETERS=("--numSamples 1000" "--numSamples 4500" "--numSamples 9000") +SAMPLE_PARAMETERS=("1000" "2000" "4500" "6000" "10000") + +NUM_RUNS=5 + +OUTPUT_FILE="radar_heatmap_data.csv" +# Initialize the CSV file + +# Loop through the parameters +for str_param in "${STREAM_PARAMETERS[@]}"; do + OUTPUT_FILE="radar_heatmap_data_${sam_param}.csv" + echo "Command,NumStreams,NumSamples,AverageGbps" > $OUTPUT_FILE + for command_param in "${COMMAND_PARAMETERS[@]}"; do + for sam_param in "${SAMPLE_PARAMETERS[@]}"; do + TOTAL_GBPS=0 + for i in $(seq 1 $NUM_RUNS); do + #echo "Iteration $i with parameter $command_param $str_param $sam_param" + OUTPUT=$($command_param --numStreams $str_param --numSamples $sam_param) + GBPS=$(echo "$OUTPUT" | grep -oP '(?<=\().*? Gbps' | awk '{print $1}') + + # Add the extracted value to the total + if [ -n "$GBPS" ]; then + TOTAL_GBPS=$(echo "$TOTAL_GBPS + $GBPS" | bc) + else + echo "Failed to extract Gbps for iteration $i." + fi + done + + # Calculate the average + if [ "$NUM_RUNS" -gt 0 ]; then + AVERAGE_GBPS=$(echo "$TOTAL_GBPS / $NUM_RUNS" | bc -l) + #echo "$command_param $str_param $sam_param verage Gbps over $NUM_RUNS runs: $AVERAGE_GBPS" + # Append the results to the CSV file + echo "$command_param,$(echo $str_param | awk '{print $0}'),$(echo $sam_param | awk '{print $0}'),$AVERAGE_GBPS" + echo "$command_param,$(echo $str_param | awk '{print $0}'),$(echo $sam_param | awk '{print $0}'),$AVERAGE_GBPS" >> $OUTPUT_FILE + else + echo "No runs were performed." + fi + done + done +done + +echo "Heatmap data saved to $OUTPUT_FILE." 
diff --git a/examples/simple_stf_test.cu b/examples/simple_stf_test.cu
new file mode 100644
index 00000000..1df260ed
--- /dev/null
+++ b/examples/simple_stf_test.cu
@@ -0,0 +1,156 @@
+////////////////////////////////////////////////////////////////////////////////
+// BSD 3-Clause License
+//
+// Copyright (c) 2021, NVIDIA Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its
+//    contributors may be used to endorse or promote products derived from
+//    this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+/////////////////////////////////////////////////////////////////////////////////
+
+#include "matx.h"
+#include
+#include
+#include
+#include
+
+using namespace matx;
+
+/**
+ * MatX uses C++ expression templates to build arithmetic expressions that compile into a lazily-evaluated
+ * type for executing on the device. Currently, nvcc cannot see certain optimizations
+ * when building the expression tree that would be obvious by looking at the code. Specifically, any code reusing
+ * the same tensor multiple times appears to the compiler as separate tensors, and it may issue multiple load
+ * instructions. While caching helps, this can have a slight performance impact when compared to native CUDA
+ * kernels. To work around this problem, complex expressions can be placed in a custom operator by adding some
+ * boilerplate code around the original expression. This custom operator can then be used either alone or inside
+ * other arithmetic expressions, and only a single load is issued for each tensor.
+ *
+ * This example uses the Black-Scholes equation to demonstrate the two ways to implement the equation in MatX, and
+ * shows the performance difference.
+ */
+
+/* Arithmetic expression */
+template
+void compute_black_scholes_matx(tensor_t& K,
+                                tensor_t& S,
+                                tensor_t& V,
+                                tensor_t& r,
+                                tensor_t& T,
+                                tensor_t& output,
+#if 0
+                                cudaExecutor& exec)
+#else
+                                stfExecutor& exec)
+#endif
+{
+  auto VsqrtT = V * sqrt(T);
+  auto d1 = (log(S / K) + (r + 0.5 * V * V) * T) / VsqrtT;
+  auto d2 = d1 - VsqrtT;
+  auto cdf_d1 = normcdf(d1);
+  auto cdf_d2 = normcdf(d2);
+  auto expRT = exp(-1 * r * T);
+  (output = S * cdf_d1 - K * expRT * cdf_d2).run(exec);
+  //std::cout << "Output : " << std::endl;
+  //print(output);
+}
+
+int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
+{
+  MATX_ENTER_HANDLER();
+
+  using dtype = double;
+
+#if 0
+  index_t input_size = 100000000;
+#else
+  index_t input_size = 10000;
+#endif
+  constexpr uint32_t num_iterations = 1000;
+  float time_ms;
+
+  tensor_t K_tensor{{input_size}};
+  tensor_t S_tensor{{input_size}};
+  tensor_t V_tensor{{input_size}};
+  tensor_t r_tensor{{input_size}};
+  tensor_t T_tensor{{input_size}};
+  tensor_t output_tensor{{input_size}};
+
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+#if 0
+  cudaExecutor exec{stream};
+#else
+  stfExecutor exec{stream};
+  auto ctx = exec.getCtx();
+#endif
+
+  /* Albert --- initializing input .. */
+  for (int i = 0; i < input_size; i++) {
+    K_tensor(i) = dtype(i+1);
+    S_tensor(i) = dtype(i+i+1);
+    V_tensor(i) = dtype(i+i+i+1);
+    r_tensor(i) = dtype(i+i+i+i+1);
+    T_tensor(i) = dtype(i+i+i+i+i+1);
+  }
+
+//print(V_tensor);
+
+  //compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec);
+
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+#if 0
+  cudaEventRecord(start, stream);
+#else
+  cudaEventRecord(start, ctx.task_fence());
+#endif
+  // Time non-operator version
+  for (uint32_t i = 0; i < num_iterations; i++) {
+    compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec);
+  }
+#if 0
+  cudaEventRecord(stop, stream);
+#else
+  cudaEventRecord(stop, ctx.task_fence());
+#endif
+  exec.sync();
+#if 1
+  ctx.finalize();
+#endif
+  cudaEventElapsedTime(&time_ms, start, stop);
+
+  // printf("Output tensor :\n");
+  // print(output_tensor);
+  printf("Time without custom operator = %fms per iteration\n",
+         time_ms / num_iterations);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  cudaStreamDestroy(stream);
+  //CUDA_CHECK_LAST_ERROR();
+  MATX_EXIT_HANDLER();
+}
diff --git a/examples/spectrogram.cu b/examples/spectrogram.cu
index 9d238b0e..60404a7e 100644
--- a/examples/spectrogram.cu
+++ b/examples/spectrogram.cu
@@ -60,6 +60,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
   cudaStream_t stream;
   cudaStreamCreate(&stream);
+
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
   cudaExecutor exec{stream};
 
   float fs = 10000;
diff --git a/examples/spectrogram_graph.cu b/examples/spectrogram_graph.cu
index 38d7c5d2..cf296875 100644
--- a/examples/spectrogram_graph.cu
+++ b/examples/spectrogram_graph.cu
@@ -62,12 +62,22 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
   cudaStream_t stream;
   cudaStreamCreate(&stream);
-  cudaExecutor exec{stream};
-
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
 
+#ifdef USE_STF
+  std::cout << "Using STF executor\n";
+#else
+  std::cout << "Using CUDA executor\n";
+#endif
+
+#ifdef USE_STF
+  stfExecutor exec{stream};
+#else
+  cudaExecutor exec{stream};
+#endif
+
   float fs = 10000;
   index_t N = 100000;
   float amp = static_cast(2 * 
sqrt(2)); @@ -147,15 +157,33 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } } - exec.sync(); // Time graph execution of same kernels - cudaEventRecord(start, stream); +#if USE_STF + auto ctx = exec.getCtx(); + cudaEventRecord(start, ctx.task_fence()); +#else + cudaEventRecord(start, stream); +#endif + for (uint32_t i = 0; i < 10; i++) { cudaGraphLaunch(instance, stream); } +#ifdef USE_STF +{ + cudaEventRecord(stop, ctx.task_fence()); +} +#else cudaEventRecord(stop, stream); +#endif exec.sync(); + +#ifdef USE_STF +{ + ctx.finalize(); +} +#endif + cudaEventElapsedTime(&time_ms, start, stop); printf("Spectrogram Time With Graphs = %.2fus per iteration\n", diff --git a/include/matx/core/allocator.h b/include/matx/core/allocator.h index ecb29e08..db841999 100644 --- a/include/matx/core/allocator.h +++ b/include/matx/core/allocator.h @@ -236,9 +236,13 @@ struct MemTracker { } ~MemTracker() { +#if 0 while (allocationMap.size()) { deallocate(allocationMap.begin()->first); } +#else + std::cout << "TODO: Fix me allocator.h\n"; +#endif } }; diff --git a/include/matx/core/operator_utils.h b/include/matx/core/operator_utils.h index 95375c77..5907f56e 100644 --- a/include/matx/core/operator_utils.h +++ b/include/matx/core/operator_utils.h @@ -120,7 +120,7 @@ namespace matx { __MATX_HOST__ __MATX_INLINE__ void AllocateTempTensor(TensorType &tensor, Executor &&ex, ShapeType &&shape, typename TensorType::value_type **ptr) { const auto ttl_size = std::accumulate(shape.begin(), shape.end(), static_cast(1), std::multiplies()) * sizeof(typename TensorType::value_type); - if constexpr (is_cuda_executor_v) { + if constexpr (is_cuda_executor_v || is_stf_executor_v) { matxAlloc((void**)ptr, ttl_size, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); make_tensor(tensor, *ptr, shape); } diff --git a/include/matx/core/tensor.h b/include/matx/core/tensor.h index f042d18b..6b13ee03 100644 --- a/include/matx/core/tensor.h +++ b/include/matx/core/tensor.h @@ -91,6 +91,7 @@ class tensor_t : public detail::tensor_impl_t { using stride_container = typename Desc::stride_container; using desc_type = Desc; ///< Descriptor type trait using self_type = tensor_t; + using stf_logicaldata_type = typename cuda::experimental::stf::logical_data; /** * @brief Construct a new 0-D tensor t object @@ -107,7 +108,7 @@ class tensor_t : public detail::tensor_impl_t { * @param rhs Object to copy from */ __MATX_HOST__ tensor_t(tensor_t const &rhs) noexcept - : detail::tensor_impl_t{rhs.Data(), rhs.desc_}, storage_(rhs.storage_) + : detail::tensor_impl_t{rhs.Data(), rhs.desc_, rhs.stf_ldata_}, storage_(rhs.storage_) { } /** @@ -116,7 +117,7 @@ class tensor_t : public detail::tensor_impl_t { * @param rhs Object to move from */ __MATX_HOST__ tensor_t(tensor_t &&rhs) noexcept - : detail::tensor_impl_t{rhs.Data(), std::move(rhs.desc_)}, storage_(std::move(rhs.storage_)) + : detail::tensor_impl_t{rhs.Data(), std::move(rhs.desc_), rhs.stf_ldata_}, storage_(std::move(rhs.storage_)) { } @@ -134,6 +135,7 @@ class tensor_t : public detail::tensor_impl_t { this->SetData(rhs.Data()); storage_ = rhs.storage_; this->desc_ = rhs.desc_; + this->stf_ldata_ = rhs.stf_ldata_; } /** Swaps two tensors @@ -154,6 +156,7 @@ class tensor_t : public detail::tensor_impl_t { rhs.SetData(tmpdata); swap(lhs.storage_, rhs.storage_); swap(lhs.desc_, rhs.desc_); + std::swap(lhs.stf_ldata_, rhs.stf_ldata_); } __MATX_INLINE__ ~tensor_t() = default; @@ -184,6 +187,16 @@ class tensor_t : public detail::tensor_impl_t { this->SetLocalData(storage_.data()); } + 
template ::type> && is_matx_descriptor_v::type>, bool> = true> + tensor_t(S2 &&s, D2 &&desc, T* ldata, std::optional *stf_ldata_) : + detail::tensor_impl_t{std::forward(desc)}, + storage_{std::forward(s)} + { + this->stf_ldata_ = stf_ldata_; + this->SetLocalData(storage_.data()); + } + /** * @brief Construct a new tensor t object. Used to copy an existing storage object for proper reference counting * @@ -192,13 +205,28 @@ class tensor_t : public detail::tensor_impl_t { * @param ldata */ template - tensor_t(Storage s, D2 &&desc, T* ldata) : + tensor_t(Storage s, D2 &&desc, T* ldata, std::optional *stf_ldata) : detail::tensor_impl_t{std::forward(desc)}, storage_{std::move(s)} { + this->stf_ldata_ = stf_ldata; this->SetLocalData(ldata); } + /** + * @brief Construct a new tensor t object. Used to copy an existing storage object for proper reference counting + * + * @param s + * @param desc + * @param ldata + */ + template + tensor_t(Storage s, D2 &&desc, T* ldata) : + detail::tensor_impl_t{std::forward(desc)}, + storage_{std::move(s)} + { + this->SetLocalData(ldata); + } /** * Constructor for a rank-1 and above tensor. @@ -653,7 +681,7 @@ class tensor_t : public detail::tensor_impl_t { // Copy descriptor and call ctor with shape Desc new_desc{std::forward(shape)}; - return tensor_t{storage_, std::move(new_desc), this->Data()}; + return tensor_t{storage_, std::move(new_desc), this->Data(), this->stf_ldata_}; } /** @@ -712,7 +740,7 @@ class tensor_t : public detail::tensor_impl_t { "To get a reshaped view the tensor must be compact"); DefaultDescriptor desc{std::move(tshape)}; - return tensor_t{storage_, std::move(desc), this->Data()}; + return tensor_t{storage_, std::move(desc), this->Data(), this->stf_ldata_}; } /** @@ -795,7 +823,7 @@ class tensor_t : public detail::tensor_impl_t { // Copy descriptor and call ctor with shape Desc new_desc{this->desc_.Shape(), std::move(strides)}; - return tensor_t{storage_, std::move(new_desc), data}; + return tensor_t{storage_, std::move(new_desc), data, this->stf_ldata_}; } /** @@ -838,7 +866,7 @@ class tensor_t : public detail::tensor_impl_t { } Desc new_desc{this->desc_.Shape(), std::move(strides)}; - return tensor_t{storage_, std::move(new_desc), data}; + return tensor_t{storage_, std::move(new_desc), data, this->stf_ldata_}; } /** @@ -861,7 +889,7 @@ class tensor_t : public detail::tensor_impl_t { MATX_NVTX_START("", matx::MATX_NVTX_LOG_API) auto new_desc = this->PermuteImpl(dims); - return tensor_t{storage_, std::move(new_desc), this->Data()}; + return tensor_t{storage_, std::move(new_desc), this->Data(), this->stf_ldata_}; } @@ -1029,7 +1057,7 @@ class tensor_t : public detail::tensor_impl_t { OverlapView(const cuda::std::array &windows, const cuda::std::array &strides) const { auto new_desc = this->template OverlapViewImpl(windows, strides); - return tensor_t{storage_, std::move(new_desc), this->Data()}; + return tensor_t{storage_, std::move(new_desc), this->Data(), this->stf_ldata_}; } /** @@ -1063,7 +1091,7 @@ class tensor_t : public detail::tensor_impl_t { MATX_NVTX_START("", matx::MATX_NVTX_LOG_API) auto new_desc = this->template CloneImpl(clones); - return tensor_t{storage_, std::move(new_desc), this->Data()}; + return tensor_t{storage_, std::move(new_desc), this->Data(), this->stf_ldata_}; } template @@ -1361,7 +1389,7 @@ class tensor_t : public detail::tensor_impl_t { [[maybe_unused]] StrideType strides) const { auto [new_desc, data] = this->template SliceImpl(firsts, ends, strides); - return tensor_t{storage_, std::move(new_desc), data}; + 
return tensor_t{storage_, std::move(new_desc), data, this->stf_ldata_}; } template diff --git a/include/matx/core/tensor_impl.h b/include/matx/core/tensor_impl.h index ae755d1f..7a89046c 100644 --- a/include/matx/core/tensor_impl.h +++ b/include/matx/core/tensor_impl.h @@ -42,9 +42,11 @@ #include "matx/core/tensor_utils.h" #include "matx/operators/set.h" #include "matx/core/sparse_tensor_format.h" +#include "matx/core/utils.h" //#include "matx_exec_kernel.h" #include "iterator.h" #include "matx/core/make_tensor.h" +#include namespace matx { @@ -99,6 +101,7 @@ class tensor_impl_t { using stride_type = typename Desc::stride_type; using matxoplvalue = bool; using self_type = tensor_impl_t; + using stf_logicaldata_type = typename cuda::experimental::stf::logical_data; // Type specifier for signaling this is a matx operation using matxop = bool; @@ -130,13 +133,15 @@ class tensor_impl_t { swap(lhs.data_, rhs.data_); swap(lhs.desc_, rhs.desc_); + swap(lhs.stf_ldata_, rhs.stf_ldata_); } /** * Constructor for a rank-0 tensor (scalar). */ tensor_impl_t() { - + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } /** @@ -148,6 +153,8 @@ class tensor_impl_t { tensor_impl_t(T *const data) { data_.ldata_ = data; static_assert(RANK == 0, "tensor_impl_t with single pointer parameter must be a rank 0 tensor"); + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } /** * Constructor for a rank-1 and above tensor. @@ -159,6 +166,8 @@ class tensor_impl_t { std::enable_if_t> && !is_matx_descriptor_v>, bool> = true> __MATX_INLINE__ tensor_impl_t(ShapeType &&shape) : desc_(std::forward(shape)) { + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } /** @@ -172,7 +181,10 @@ class tensor_impl_t { template __MATX_INLINE__ tensor_impl_t(ShapeType &&shape, StrideType &&strides) : desc_(std::forward(shape), std::forward(strides)) - {} + { + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; + } /** * Constructor for a rank-1 and above tensor using a user pointer and shape @@ -190,6 +202,8 @@ class tensor_impl_t { : desc_(std::forward(shape)) { data_.ldata_ = ldata; + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } @@ -214,6 +228,8 @@ class tensor_impl_t { : desc_(std::forward(shape), std::forward(strides)) { data_.ldata_ = ldata; + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } @@ -238,9 +254,19 @@ MATX_IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") : desc_{std::forward(desc)} { data_.ldata_ = ldata; + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } MATX_IGNORE_WARNING_POP_GCC + template ::type>, bool> = true> + __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ tensor_impl_t(T *const ldata, + DescriptorType &&desc, std::optional *stf_ldata) + : desc_{std::forward(desc)}, stf_ldata_(stf_ldata) + { + data_.ldata_ = ldata; + } + /** * Constructor for creating a view with only a descriptor * @@ -255,12 +281,15 @@ MATX_IGNORE_WARNING_POP_GCC __MATX_INLINE__ tensor_impl_t(DescriptorType &&desc) : desc_{std::forward(desc)} { + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } __MATX_HOST__ void Shallow(const self_type &rhs) noexcept { data_.ldata_ = rhs.Data(); desc_ = rhs.desc_; + stf_ldata_ = rhs.stf_ldata_; } /** @@ -277,6 +306,7 @@ MATX_IGNORE_WARNING_POP_GCC { data_.ldata_ = op.Data(); desc_ = op.desc_; + stf_ldata_ = op.stf_ldata; } @@ -793,7 +823,7 @@ MATX_IGNORE_WARNING_POP_GCC auto new_desc = CloneImpl(clones); - return tensor_impl_t{this->data_.ldata_, std::move(new_desc)}; + return 
tensor_impl_t{this->data_.ldata_, std::move(new_desc), this->stf_ldata_}; } __MATX_INLINE__ auto PermuteImpl(const cuda::std::array &dims) const @@ -823,7 +853,7 @@ MATX_IGNORE_WARNING_POP_GCC __MATX_INLINE__ auto Permute(const cuda::std::array &dims) const { auto new_desc = PermuteImpl(dims); - return tensor_impl_t{this->data_.ldata_, std::move(new_desc)}; + return tensor_impl_t{this->data_.ldata_, std::move(new_desc), this->stf_ldata_}; } template @@ -868,7 +898,7 @@ MATX_IGNORE_WARNING_POP_GCC OverlapView(const cuda::std::array &windows, const cuda::std::array &strides) const { auto new_desc = OverlapViewImpl(windows, strides); - return tensor_impl_t{this->data_.ldata_, std::move(new_desc)}; + return tensor_impl_t{this->data_.ldata_, std::move(new_desc), this->stf_ldata_}; } template @@ -1197,9 +1227,47 @@ MATX_IGNORE_WARNING_POP_GCC memcpy(data_.pos_, pos, U::LVL*sizeof(pos[0])); } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm) const noexcept + { + auto &ld = stf_ldata_->value(); + + using namespace cuda::experimental::stf; +#if 0 + data_place place = getDataPlace(Data()); +#endif + + if (perm == 0) { + task.add_deps(ld.write()); + } + else if (perm == 1) { + task.add_deps(ld.read()); + } + else if (perm == 2) { + task.add_deps(ld.rw()); + } + else { + std::cout << "abort ...\n"; + } + } + template __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, [[maybe_unused]] Executor &&ex) const noexcept { + if constexpr (is_stf_executor_v) { + using namespace cuda::experimental::stf; + /* Don't create a new logical data for a tensor if it alread had one created previously */ + if (stf_ldata_ && stf_ldata_->has_value()) { return; } + + auto ctx = ex.getCtx(); +#if 0 + // Determine the type of memory that was allocated ie. host/managed/etc + data_place place = getDataPlace(Data()); +#endif + + *stf_ldata_ = ctx.token(); + stf_ldata_->value().set_symbol(this->str()); + } } template @@ -1211,6 +1279,9 @@ MATX_IGNORE_WARNING_POP_GCC protected: TensorData data_; Desc desc_; + + public: + mutable std::optional *stf_ldata_; }; } diff --git a/include/matx/core/type_utils.h b/include/matx/core/type_utils.h index e1b6f38d..c104bba2 100644 --- a/include/matx/core/type_utils.h +++ b/include/matx/core/type_utils.h @@ -45,6 +45,7 @@ #include "matx/core/half.h" #include "matx/core/half_complex.h" #include "matx/executors/cuda.h" +#include "matx/executors/stf.h" /** * Defines type traits for host and device compilers. 
This file should be includable by @@ -302,6 +303,7 @@ inline constexpr bool is_settable_xform_v = std::conjunction_v struct is_executor : std::false_type {}; template <> struct is_executor : std::true_type {}; +template <> struct is_executor : std::true_type {}; template struct is_executor> : std::true_type {}; } @@ -322,6 +324,11 @@ template struct is_cuda_executor : std::false_type {}; template<> struct is_cuda_executor : std::true_type {}; } +namespace detail { +template struct is_stf_executor : std::false_type {}; +template<> struct is_stf_executor : std::true_type {}; +} + /** * @brief Determine if a type is a device executor * @@ -330,6 +337,9 @@ template<> struct is_cuda_executor : std::true_type {}; template inline constexpr bool is_cuda_executor_v = detail::is_cuda_executor::type>::value; +template +inline constexpr bool is_stf_executor_v = detail::is_stf_executor::type>::value; + namespace detail { template struct is_host_executor : std::false_type {}; template struct is_host_executor> : std::true_type {}; diff --git a/include/matx/core/utils.h b/include/matx/core/utils.h index 81643210..1b4c6e06 100644 --- a/include/matx/core/utils.h +++ b/include/matx/core/utils.h @@ -46,6 +46,30 @@ namespace matx { constexpr int PASCAL_CC = 6; namespace detail { + +#if 0 +__MATX_INLINE__ cuda::experimental::stf::data_place getDataPlace(void *ptr) { + using namespace cuda::experimental::stf; + auto kind = GetPointerKind(ptr); + switch (kind) { + case MATX_MANAGED_MEMORY: + return data_place::managed; + case MATX_HOST_MEMORY: + case MATX_HOST_MALLOC_MEMORY: + return data_place::host; + case MATX_DEVICE_MEMORY: + case MATX_ASYNC_DEVICE_MEMORY: + return data_place::current_device(); + case MATX_INVALID_MEMORY: + //std::cout << "Data kind is invalid: assuming managed memory\n"; + return data_place::managed; + //return data_place::invalid; + default: + return data_place::invalid; + } +} +#endif + __MATX_INLINE__ int GetDeviceAttr(cudaDeviceAttr attr) { int val; int dev; diff --git a/include/matx/executors/stf.h b/include/matx/executors/stf.h new file mode 100644 index 00000000..e055a051 --- /dev/null +++ b/include/matx/executors/stf.h @@ -0,0 +1,258 @@ +//////////////////////////////////////////////////////////////////////////////// +// BSD 3-Clause License +// +// Copyright (c) 2021, NVIDIA Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +///////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "matx/core/defines.h" +#include "matx/executors/host.h" +#include "matx/executors/kernel.h" + +#include + +//using namespace cuda::experimental::stf; +//using namespace cudastf; + +namespace matx +{ + + +/* Albert - Needed to declare this here to avoid compile error. */ +template constexpr bool is_matx_op_lvalue(); +template constexpr bool is_matx_set_op(); + + class stfExecutor { + public: + using matx_cuda = bool; // signal this is a GPU executor + using matx_executor = bool; ///< Type trait indicating this is an executor + + /** + * @brief Construct a new stfExecutor with a stream + * + * @param stream CUDA stream + */ + stfExecutor(cudaStream_t stream, bool is_cudagraph = false) : stream_(stream) { + cuda::experimental::stf::async_resources_handle handle; + if (!is_cudagraph) + ctx_ = cuda::experimental::stf::stream_ctx(stream, handle); + else + ctx_ = cuda::experimental::stf::graph_ctx(stream, handle); + } + + stfExecutor(int stream, bool is_cudagraph = false) : stream_(reinterpret_cast(stream)) { + cuda::experimental::stf::async_resources_handle handle; + if (!is_cudagraph) + ctx_ = cuda::experimental::stf::stream_ctx(reinterpret_cast(stream), handle); + else + ctx_ = cuda::experimental::stf::graph_ctx(reinterpret_cast(stream), handle); + } + + /** + * @brief Construct a new stfExecutor object using the default stream + * + */ + stfExecutor(bool is_cudagraph = false) : stream_(0) { + if (!is_cudagraph) + ctx_ = cuda::experimental::stf::stream_ctx(); + else + ctx_ = cuda::experimental::stf::graph_ctx(); + } + + ~stfExecutor() { + //ctx_.finalize(); + } + /** + * @brief Returns stream associated with executor + */ + auto getStream() const { return stream_; } + + /** + * @brief Get CUDASTF Ctx + * + */ + auto &getCtx() const noexcept { return ctx_; } + + /** + * @brief Synchronize the STF executor's stream + * + */ + void sync() { cudaStreamSynchronize(ctx_.task_fence()); } + + /** + * Execute an operator on a device + * + * @tparam Op Operator type + * @param op value + **/ + template + void Exec(Op &op) const { +#ifdef __CUDACC__ + dim3 threads, blocks; + + auto ctx = getCtx(); + // Parameters passed by value in CUDA are limited to 4096B. If the user exceeds this, we + // need to error out and have them break up the statement + MATX_STATIC_ASSERT((sizeof(op) + sizeof(index_t) * Op::Rank()) <= CUDA_MAX_VAL_PARAM, + "Parameter buffer to device is limited to 4096B. 
Please break up your operator statement into multiple executions to limit the size of the parameters"); + + if constexpr (Op::Rank() == 0) { + threads = 1; + blocks = 1; + if constexpr (is_matx_op_lvalue() || is_matx_set_op()) { + auto tsk = ctx.task(); + tsk.set_symbol(op.str()); + op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps + tsk->*[&](cudaStream_t s) { + detail::matxOpT0Kernel<<>>(op); + }; + } + else { + //std::cout << " RANK 0 not on LHS operator = " << op.str() << '\n'; + detail::matxOpT0Kernel<<>>(op); + } + } + else { + cuda::std::array sizes; + for (int i = 0; i < Op::Rank(); i++) { + sizes[i] = op.Size(i); + } + + bool stride = detail::get_grid_dims(blocks, threads, sizes, 256); + + if constexpr (Op::Rank() == 1) { + if constexpr (is_matx_op_lvalue() || is_matx_set_op()) { + auto tsk = ctx.task(); + tsk.set_symbol(op.str()); + op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps + //std::cout << "Start launch task. Rank = " << Op::Rank() << '\n'; + tsk->*[&](cudaStream_t s) { + detail::matxOpT1Kernel<<>>(op, sizes[0]); + }; + //std::cout << "End launch task.\n"; + } + else { + //std::cout << " RANK 1 not on LHS operator = " << op.str() << '\n'; + detail::matxOpT1Kernel<<>>(op, sizes[0]); + } + } + else if constexpr (Op::Rank() == 2) { + if constexpr (is_matx_op_lvalue() || is_matx_set_op()) { + auto tsk = ctx.task(); + tsk.set_symbol(op.str()); + op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps + //std::cout << "About to launch task. Rank = " << Op::Rank() << '\n'; + tsk->*[&](cudaStream_t s) { + if(stride) { + detail::matxOpT2StrideKernel<<>>(op, sizes[0], sizes[1]); + } else { + detail::matxOpT2Kernel<<>>(op, sizes[0], sizes[1]); + } + }; + } + else { + //std::cout << " not on LHS operator = " << op.str() << '\n'; + if(stride) { + detail::matxOpT2StrideKernel<<>>(op, sizes[0], sizes[1]); + } else { + detail::matxOpT2Kernel<<>>(op, sizes[0], sizes[1]); + } + } + } + else if constexpr (Op::Rank() == 3) { + if constexpr (is_matx_op_lvalue() || is_matx_set_op()) { + auto tsk = ctx.task(); + tsk.set_symbol(op.str()); + op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps + tsk->*[&](cudaStream_t s) { + if(stride) { + detail::matxOpT3StrideKernel<<>>(op, sizes[0], sizes[1], sizes[2]); + } else { + detail::matxOpT3Kernel<<>>(op, sizes[0], sizes[1], sizes[2]); + } + }; + } + else { + if(stride) { + detail::matxOpT3StrideKernel<<>>(op, sizes[0], sizes[1], sizes[2]); + } else { + detail::matxOpT3Kernel<<>>(op, sizes[0], sizes[1], sizes[2]); + } + } + } + else if constexpr (Op::Rank() == 4) { + if constexpr (is_matx_op_lvalue() || is_matx_set_op()) { + auto tsk = ctx.task(); + op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps + tsk.set_symbol(op.str())->*[&](cudaStream_t s) { + if(stride) { + detail::matxOpT4StrideKernel<<>>(op, sizes[0], sizes[1], sizes[2], sizes[3]); + } else { + detail::matxOpT4Kernel<<>>(op, sizes[0], sizes[1], sizes[2], sizes[3]); + } + }; + } + else { + if(stride) { + detail::matxOpT4StrideKernel<<>>(op, sizes[0], sizes[1], sizes[2], sizes[3]); + } else { + detail::matxOpT4Kernel<<>>(op, sizes[0], sizes[1], sizes[2], sizes[3]); + } + } + } + else { + if constexpr (is_matx_op_lvalue() || is_matx_set_op()) { + auto tsk = ctx.task(); + tsk.set_symbol(op.str()); + op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps + //std::cout << "About to launch 
task. Rank = " << Op::Rank() << '\n'; + + tsk->*[&](cudaStream_t s) { + index_t dims = std::accumulate(std::begin(sizes) + 1, std::end(sizes), 1, std::multiplies()); + detail::matxOpTDKernel<<>>(op, sizes, dims); + }; + } + else { + index_t dims = std::accumulate(std::begin(sizes) + 1, std::end(sizes), 1, std::multiplies()); + detail::matxOpTDKernel<<>>(op, sizes, dims); + } + } + } +#else + MATX_ASSERT_STR(false, matxInvalidParameter, "Cannot call device executor using host compiler"); +#endif + } + + private: + cudaStream_t stream_; + cuda::experimental::stf::context ctx_; + }; + +}; diff --git a/include/matx/generators/generator1d.h b/include/matx/generators/generator1d.h index cdeca628..1d684d62 100644 --- a/include/matx/generators/generator1d.h +++ b/include/matx/generators/generator1d.h @@ -54,6 +54,9 @@ namespace matx return f_(pp_get(indices...)); } + template + __MATX_INLINE__ void apply_dep_to_task([[maybe_unused]] Task &&task, [[maybe_unused]] int perm=1) const noexcept { } + constexpr inline __MATX_HOST__ __MATX_DEVICE__ auto Size(int dim) const { return *(s_.begin() + dim); diff --git a/include/matx/generators/random.h b/include/matx/generators/random.h index fff49e92..ad404b55 100644 --- a/include/matx/generators/random.h +++ b/include/matx/generators/random.h @@ -305,7 +305,7 @@ namespace detail { { InnerPreRun(std::forward(shape), std::forward(ex)); #ifdef __CUDACC__ - if constexpr (is_cuda_executor_v) { + if constexpr ((is_cuda_executor_v) || (is_stf_executor_v)) { if (!init_) { auto stream = ex.getStream(); matxAlloc((void **)&states_, @@ -341,7 +341,7 @@ namespace detail { template __MATX_INLINE__ void PostRun([[maybe_unused]] ST &&shape, [[maybe_unused]] Executor &&ex) const noexcept { - if constexpr (is_cuda_executor_v) { + if constexpr ((is_cuda_executor_v) || (is_stf_executor_v)) { matxFree(states_); } else if constexpr (is_host_executor_v) { @@ -479,6 +479,9 @@ namespace detail { return shape_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task([[maybe_unused]] Task &&task, [[maybe_unused]] int perm=1) const noexcept { } + static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() { return RANK; } }; } diff --git a/include/matx/operators/all.h b/include/matx/operators/all.h index 8f01b02c..8a286ad8 100644 --- a/include/matx/operators/all.h +++ b/include/matx/operators/all.h @@ -72,9 +72,35 @@ namespace detail { return tmp_out_(indices...); }; + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept { + /* Albert -- Scenario where the all() operator is on the RHS and sum has already + run previously. 
So we make tmp_out have a read permission as it will be read from */ + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + template void Exec(Out &&out, Executor &&ex) const { - all_impl(cuda::std::get<0>(out), a_, ex); + auto output = cuda::std::get<0>(out); + // stfexecutor case + if constexpr (!is_cuda_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("all_task"); + + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + all_impl(output, a_, exec); + }; + } + // cudaExecutor case + else if constexpr (is_cuda_executor_v) { + all_impl(output, a_, ex); + } } static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() diff --git a/include/matx/operators/binary_operators.h b/include/matx/operators/binary_operators.h index c6c0eea1..eeca1842 100644 --- a/include/matx/operators/binary_operators.h +++ b/include/matx/operators/binary_operators.h @@ -146,6 +146,18 @@ namespace matx return detail::matx_max(size1,size2); } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, Perm perm) const noexcept + { + if constexpr (is_matx_op()) { + in1_.apply_dep_to_task(std::forward(task), perm); + } + + if constexpr (is_matx_op()) { + in2_.apply_dep_to_task(std::forward(task), perm); + } + } + template __MATX_INLINE__ void PreRun(ShapeType &&shape, Executor &&ex) const noexcept { diff --git a/include/matx/operators/cast.h b/include/matx/operators/cast.h index c1af0708..844e8c3d 100644 --- a/include/matx/operators/cast.h +++ b/include/matx/operators/cast.h @@ -83,6 +83,14 @@ namespace matx return static_cast(get_value(op_, indices...)); } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept + { + if constexpr (is_matx_op()) { + op_.apply_dep_to_task(std::forward(task), perm); + } + } + template __MATX_INLINE__ void PreRun(ShapeType &&shape, Executor &&ex) const noexcept { diff --git a/include/matx/operators/cgsolve.h b/include/matx/operators/cgsolve.h index db1f508e..bbf4d2f4 100644 --- a/include/matx/operators/cgsolve.h +++ b/include/matx/operators/cgsolve.h @@ -88,10 +88,20 @@ namespace matx return out_dims_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept { + /* Albert -- Scenario where the sum() operator is on the RHS and sum has already + run previously. So we make tmp_out have a read permission as it will be read from */ + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + + template void Exec(Out &&out, Executor &&ex) const{ - static_assert(is_cuda_executor_v, "cgsolve() only supports the CUDA executor currently"); - cgsolve_impl(cuda::std::get<0>(out), a_, b_, tol_, max_iters_, ex.getStream()); + //static_assert(is_cuda_executor_v, "cgsolve() only supports the CUDA executor currently"); + auto output = cuda::std::get<0>(out); + output.PreRun(out_dims_, std::forward(ex)); + cgsolve_impl(cuda::std::get<0>(out), a_, b_, ex, tol_, max_iters_, ex.getStream()); } template diff --git a/include/matx/operators/constval.h b/include/matx/operators/constval.h index 3df28aff..32eec317 100644 --- a/include/matx/operators/constval.h +++ b/include/matx/operators/constval.h @@ -55,6 +55,9 @@ namespace matx __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ T operator()(Is...) 
const { return v_; }; + template + __MATX_INLINE__ void apply_dep_to_task([[maybe_unused]] Task &&task, [[maybe_unused]] int perm) const noexcept { } + constexpr inline __MATX_HOST__ __MATX_DEVICE__ auto Size(int dim) const { if constexpr (!is_noshape_v) { return *(s_.begin() + dim); diff --git a/include/matx/operators/conv.h b/include/matx/operators/conv.h index 37645bab..ba5497c7 100644 --- a/include/matx/operators/conv.h +++ b/include/matx/operators/conv.h @@ -149,16 +149,45 @@ namespace matx return out_dims_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept + { + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + template void Exec(Out &&out, Executor &&ex) const { MATX_ASSERT_STR(!(is_host_executor_v && method_ == MATX_C_METHOD_DIRECT), matxNotSupported, "direct conv1d() only supports the CUDA executor currently"); MATX_STATIC_ASSERT_STR((Rank() == cuda::std::tuple_element_t<0, remove_cvref_t>::Rank()), matxInvalidParameter, "conv1d: inputs and outputs must have same rank to use conv1d with axis parameter"); - if constexpr (!std::is_same_v) { - conv1d_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, mode_, method_, ex); + if constexpr (!is_cuda_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("conv_task"); + + auto output = cuda::std::get<0>(out); + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + b_.apply_dep_to_task(tsk, 1); + + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + if constexpr (!std::is_same_v) { + conv1d_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, mode_, method_, exec); + } + else { + conv1d_impl(cuda::std::get<0>(out), a_, b_, mode_, method_, exec); + } + }; } - else { - conv1d_impl(cuda::std::get<0>(out), a_, b_, mode_, method_, ex); + else if constexpr (is_cuda_executor_v) { + if constexpr (!std::is_same_v) { + conv1d_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, mode_, method_, ex); + } + else { + conv1d_impl(cuda::std::get<0>(out), a_, b_, mode_, method_, ex); + } } } @@ -343,14 +372,36 @@ namespace detail { template void Exec(Out &&out, Executor &&ex) const { - static_assert(is_cuda_executor_v, "conv2d() only supports the CUDA executor currently"); - - if constexpr (!std::is_same_v) { - conv2d_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, mode_, ex.getStream()); - } - else { - conv2d_impl(cuda::std::get<0>(out), a_, b_, mode_, ex.getStream()); - } + //static_assert(is_cuda_executor_v, "conv2d() only supports the CUDA executor currently"); + if constexpr (!is_cuda_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("conv_task"); + + auto output = cuda::std::get<0>(out); + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + b_.apply_dep_to_task(tsk, 1); + + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + if constexpr (!std::is_same_v) { + conv2d_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, mode_, exec.getStream()); + } + else { + conv2d_impl(cuda::std::get<0>(out), a_, b_, mode_, exec.getStream()); + } + }; + } + else if constexpr (is_cuda_executor_v) { + if constexpr (!std::is_same_v) { + conv2d_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, mode_, ex.getStream()); + } + else { + conv2d_impl(cuda::std::get<0>(out), a_, b_, mode_, ex.getStream()); + } + } } template diff --git a/include/matx/operators/fft.h b/include/matx/operators/fft.h index 
242169d1..9435d56a 100644 --- a/include/matx/operators/fft.h +++ b/include/matx/operators/fft.h @@ -145,24 +145,66 @@ namespace matx return out_dims_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, [[maybe_unused]] int perm=1) const noexcept + { + /* Scenario where the matvec() operator is on the RHS and op has already + run previously. So we make tmp_out have a read permission as it will be read from */ + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + template void Exec(Out &&out, Executor &&ex) const { - if constexpr (std::is_same_v) { - if constexpr (std::is_same_v) { - fft_impl(cuda::std::get<0>(out), a_, fft_size_, norm_, ex); + // stfexecutor case + if constexpr (is_stf_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("fft_task_no_perm"); + + auto output = cuda::std::get<0>(out); + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + fft_impl(output, a_, fft_size_, norm_, exec); + } + else { + ifft_impl(output, a_, fft_size_, norm_, exec); + } + } + else { + if constexpr (std::is_same_v) { + fft_impl(permute(output, perm_), permute(a_, perm_), fft_size_, norm_, exec); + } + else { + ifft_impl(permute(output, perm_), permute(a_, perm_), fft_size_, norm_, exec); + } + } + }; } + // cudaExecutor or host case else { - ifft_impl(cuda::std::get<0>(out), a_, fft_size_, norm_, ex); + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + fft_impl(cuda::std::get<0>(out), a_, fft_size_, norm_, ex); + } + else { + ifft_impl(cuda::std::get<0>(out), a_, fft_size_, norm_, ex); + } + } + else { + if constexpr (std::is_same_v) { + fft_impl(permute(cuda::std::get<0>(out), perm_), permute(a_, perm_), fft_size_, norm_, ex); + } + else { + ifft_impl(permute(cuda::std::get<0>(out), perm_), permute(a_, perm_), fft_size_, norm_, ex); + } + } } - } - else { - if constexpr (std::is_same_v) { - fft_impl(permute(cuda::std::get<0>(out), perm_), permute(a_, perm_), fft_size_, norm_, ex); - } - else { - ifft_impl(permute(cuda::std::get<0>(out), perm_), permute(a_, perm_), fft_size_, norm_, ex); - } - } } template @@ -494,4 +536,4 @@ namespace matx return detail::FFT2Op(a, perm, detail::ifft_t{}, norm); } -} \ No newline at end of file +} diff --git a/include/matx/operators/matmul.h b/include/matx/operators/matmul.h index 7d1b70f3..a88afecd 100644 --- a/include/matx/operators/matmul.h +++ b/include/matx/operators/matmul.h @@ -112,24 +112,65 @@ namespace matx return out_dims_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept + { + /* Scenario where the matmul() operator is on the RHS and op has already run + previously. So we make tmp_out have a read permission as it will be read from */ + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + template void Exec(Out &&out, Executor &&ex) const { // Perform SpMM or otherwise GEMM. 
static_assert(!is_sparse_tensor_v, "sparse rhs not implemented"); - if constexpr (is_sparse_tensor_v) { - if constexpr (!std::is_same_v) { - sparse_matmul_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, ex, alpha_, beta_); + + // stfexecutor case + if constexpr (!is_cuda_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("matmul"); + + auto output = cuda::std::get<0>(out); + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + b_.apply_dep_to_task(tsk, 1); + + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + if constexpr (is_sparse_tensor_v) { + if constexpr (!std::is_same_v) { + sparse_matmul_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, exec, alpha_, beta_); + } + else { + sparse_matmul_impl(cuda::std::get<0>(out), a_, b_, exec, alpha_, beta_); + } + } + else if constexpr (!std::is_same_v) { + matmul_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, exec, alpha_, beta_); + } + else { + matmul_impl(cuda::std::get<0>(out), a_, b_, exec, alpha_, beta_); + } + }; + } + else if constexpr (is_cuda_executor_v) { + if constexpr (is_sparse_tensor_v) { + if constexpr (!std::is_same_v) { + sparse_matmul_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, ex, alpha_, beta_); + } + else { + sparse_matmul_impl(cuda::std::get<0>(out), a_, b_, ex, alpha_, beta_); + } + } + else if constexpr (!std::is_same_v) { + matmul_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, ex, alpha_, beta_); } else { - sparse_matmul_impl(cuda::std::get<0>(out), a_, b_, ex, alpha_, beta_); + matmul_impl(cuda::std::get<0>(out), a_, b_, ex, alpha_, beta_); } } - else if constexpr (!std::is_same_v) { - matmul_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, ex, alpha_, beta_); - } - else { - matmul_impl(cuda::std::get<0>(out), a_, b_, ex, alpha_, beta_); - } } template diff --git a/include/matx/operators/matvec.h b/include/matx/operators/matvec.h index 7de2913f..1b4d3a1b 100644 --- a/include/matx/operators/matvec.h +++ b/include/matx/operators/matvec.h @@ -90,14 +90,44 @@ namespace matx return out_dims_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept + { + /* Scenario where the matvec() operator is on the RHS and op has already run + previously. 
+           previously, so we give tmp_out_ read permission as it will only be read from. */
+        tmp_out_.apply_dep_to_task(std::forward<Task>(task), 1);
+      }
+
       template <typename Out, typename Executor>
       void Exec(Out &&out, Executor &&ex) const{
-        static_assert(!is_sparse_tensor_v, "sparse rhs not implemented");
-        if constexpr (is_sparse_tensor_v) {
-          sparse_matvec_impl(cuda::std::get<0>(out), a_, b_, ex, alpha_, beta_);
-        } else {
-          matvec_impl(cuda::std::get<0>(out), a_, b_, ex, alpha_, beta_);
-        }
+        static_assert(!is_sparse_tensor_v, "sparse rhs not implemented");
+        // stfExecutor case
+        auto output = cuda::std::get<0>(out);
+        if constexpr (!is_cuda_executor_v<Executor>) {
+          auto ctx = ex.getCtx();
+          auto tsk = ctx.task();
+          tsk.set_symbol("matvec_task");
+          output.PreRun(out_dims_, std::forward<Executor>(ex));
+          output.apply_dep_to_task(tsk, 0);
+          a_.apply_dep_to_task(tsk, 1);
+          b_.apply_dep_to_task(tsk, 1);
+          tsk->*[&](cudaStream_t s) {
+            auto exec = cudaExecutor(s);
+            if constexpr (is_sparse_tensor_v) {
+              sparse_matvec_impl(output, a_, b_, exec, alpha_, beta_);
+            } else {
+              matvec_impl(output, a_, b_, exec, alpha_, beta_);
+            }
+          };
+        }
+        // cudaExecutor case
+        else if constexpr (is_cuda_executor_v<Executor>) {
+          if constexpr (is_sparse_tensor_v) {
+            sparse_matvec_impl(output, a_, b_, ex, alpha_, beta_);
+          } else {
+            matvec_impl(output, a_, b_, ex, alpha_, beta_);
+          }
+        }
       }
 
       template
diff --git a/include/matx/operators/max.h b/include/matx/operators/max.h
index f7d9da77..562c2a4f 100644
--- a/include/matx/operators/max.h
+++ b/include/matx/operators/max.h
@@ -72,9 +72,35 @@ namespace detail {
         return tmp_out_(indices...);
       }
 
+      template <typename Task>
+      __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept {
+        /* Scenario where the max() operator is on the RHS and has already been run
+           previously, so we give tmp_out_ read permission as it will only be read from. */
+        tmp_out_.apply_dep_to_task(std::forward<Task>(task), 1);
+      }
+
       template <typename Out, typename Executor>
       void Exec(Out &&out, Executor &&ex) const {
-        max_impl(cuda::std::get<0>(out), a_, ex);
+        auto output = cuda::std::get<0>(out);
+        // stfExecutor case
+        if constexpr (!is_cuda_executor_v<Executor>) {
+          auto ctx = ex.getCtx();
+          auto tsk = ctx.task();
+          tsk.set_symbol("max_task");
+
+          output.PreRun(out_dims_, std::forward<Executor>(ex));
+          output.apply_dep_to_task(tsk, 0);
+          a_.apply_dep_to_task(tsk, 1);
+
+          tsk->*[&](cudaStream_t s) {
+            auto exec = cudaExecutor(s);
+            max_impl(output, a_, exec);
+          };
+        }
+        // cudaExecutor case
+        else if constexpr (is_cuda_executor_v<Executor>) {
+          max_impl(output, a_, ex);
+        }
       }
 
       static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()
diff --git a/include/matx/operators/permute.h b/include/matx/operators/permute.h
index 0732b3ae..19fa287e 100644
--- a/include/matx/operators/permute.h
+++ b/include/matx/operators/permute.h
@@ -126,6 +126,14 @@ MATX_IGNORE_WARNING_POP_GCC
       return op_.Size(dims_[dim]);
     }
 
+    template <typename Task>
+    __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept
+    {
+      if constexpr (is_matx_op()) {
+        op_.apply_dep_to_task(std::forward<Task>(task), perm);
+      }
+    }
+
     template <typename ShapeType, typename Executor>
     __MATX_INLINE__ void PreRun(ShapeType &&shape, Executor &&ex) const noexcept
     {
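(Aside, illustrative only, not part of the patch: taken together, these apply_dep_to_task overloads walk an expression tree and register every leaf with a task using an access mode, where 0 denotes write and 1 denotes read. The assignment node marks its LHS as written and its RHS as read, element-wise and permute operators simply forward the mode to their children, and transforms register their temporary output as read when they appear on the RHS. Below is a self-contained toy sketch of that recursive walk, using made-up types rather than MatX's real classes.)

    #include <iostream>
    #include <string>
    #include <utility>

    struct Task {                                   // stand-in for a CUDASTF task
      void add_dep(const std::string &name, int mode) {
        std::cout << name << (mode == 0 ? ": write\n" : ": read\n");
      }
    };

    struct Leaf {                                   // stand-in for a tensor
      std::string name;
      template <typename T>
      void apply_dep_to_task(T &&task, int mode) const { task.add_dep(name, mode); }
    };

    template <typename Op>
    struct Unary {                                  // element-wise op: forward mode to its child
      Op child;
      template <typename T>
      void apply_dep_to_task(T &&task, int mode) const {
        child.apply_dep_to_task(std::forward<T>(task), mode);
      }
    };

    template <typename Lhs, typename Rhs>
    struct Assign {                                 // assignment: LHS is written, RHS is read
      Lhs lhs;
      Rhs rhs;
      template <typename T>
      void apply_dep_to_task(T &&task, int) const {
        lhs.apply_dep_to_task(task, 0);             // 0 -> write access
        rhs.apply_dep_to_task(task, 1);             // 1 -> read access
      }
    };

    int main() {
      Task tsk;
      Assign<Leaf, Unary<Leaf>> expr{{"C"}, {{"A"}}};  // roughly "C = op(A)"
      expr.apply_dep_to_task(tsk, 0);               // prints: C: write, then A: read
      return 0;
    }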
diff --git a/include/matx/operators/set.h b/include/matx/operators/set.h
index 811181f4..50faa0e1 100644
--- a/include/matx/operators/set.h
+++ b/include/matx/operators/set.h
@@ -147,6 +147,20 @@ class set : public BaseOp> {
     return res;
   }
 
+  template <typename Task>
+  __MATX_INLINE__ void apply_dep_to_task(Task &&task, [[maybe_unused]] int perm=0) const noexcept
+  {
+    // LHS
+    if constexpr (is_matx_op()) {
+      out_.apply_dep_to_task(std::forward<Task>(task), 0);
+    }
+    // RHS
+    if constexpr (is_matx_op()) {
+      op_.apply_dep_to_task(std::forward<Task>(task), 1);
+    }
+  }
+
+
   template <typename ShapeType, typename Executor>
   __MATX_INLINE__ void PreRun(ShapeType &&shape, Executor &&ex) const noexcept
   {
diff --git a/include/matx/operators/sum.h b/include/matx/operators/sum.h
index eec0d9b7..62c5853d 100644
--- a/include/matx/operators/sum.h
+++ b/include/matx/operators/sum.h
@@ -72,9 +72,35 @@ namespace detail {
         return tmp_out_(indices...);
       }
 
+      template <typename Task>
+      __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept {
+        /* Scenario where the sum() operator is on the RHS and has already been run
+           previously, so we give tmp_out_ read permission as it will only be read from. */
+        tmp_out_.apply_dep_to_task(std::forward<Task>(task), 1);
+      }
+
       template <typename Out, typename Executor>
       void Exec(Out &&out, Executor &&ex) const {
-        sum_impl(cuda::std::get<0>(out), a_, ex);
+        auto output = cuda::std::get<0>(out);
+        // stfExecutor case
+        if constexpr (!is_cuda_executor_v<Executor>) {
+          auto ctx = ex.getCtx();
+          auto tsk = ctx.task();
+          tsk.set_symbol("sum_task");
+
+          output.PreRun(out_dims_, std::forward<Executor>(ex));
+          output.apply_dep_to_task(tsk, 0);
+          a_.apply_dep_to_task(tsk, 1);
+
+          tsk->*[&](cudaStream_t s) {
+            auto exec = cudaExecutor(s);
+            sum_impl(output, a_, exec);
+          };
+        }
+        // cudaExecutor case
+        else if constexpr (is_cuda_executor_v<Executor>) {
+          sum_impl(output, a_, ex);
+        }
       }
 
       static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()
diff --git a/include/matx/operators/unary_operators.h b/include/matx/operators/unary_operators.h
index dbf1afdb..0b21ace1 100644
--- a/include/matx/operators/unary_operators.h
+++ b/include/matx/operators/unary_operators.h
@@ -103,6 +103,12 @@ namespace matx
       return size_[dim];
     }
 
+    template <typename Task, typename Perm>
+    __MATX_INLINE__ void apply_dep_to_task(Task &&task, Perm perm) const noexcept
+    {
+      in1_.apply_dep_to_task(std::forward<Task>(task), perm);
+    }
+
     template <typename ShapeType, typename Executor>
     __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, [[maybe_unused]] Executor &&ex) const noexcept
     {
diff --git a/include/matx/transforms/cgsolve.h b/include/matx/transforms/cgsolve.h
index 9d8f6d41..05874bf4 100644
--- a/include/matx/transforms/cgsolve.h
+++ b/include/matx/transforms/cgsolve.h
@@ -58,8 +58,8 @@ namespace matx
   *   cuda Stream to execute on
   *
   */
-  template <typename XType, typename AType, typename BType>
-  __MATX_INLINE__ void cgsolve_impl(XType X, AType A, BType B, double tol=1e-6, int max_iters=4, cudaStream_t stream=0)
+  template <typename XType, typename AType, typename BType, typename Executor>
+  __MATX_INLINE__ void cgsolve_impl(XType X, AType A, BType B, Executor &&exec, double tol=1e-6, int max_iters=4, cudaStream_t stream=0)
   {
     using value_type = typename XType::value_type;
     const int VRANK = XType::Rank();
@@ -119,15 +119,19 @@ namespace matx
     auto pApc = clone(pAp, clone_shape);
 
     // A*X
-    (Ap = matvec(A, X)).run(stream);
+    //(Ap = matvec(A, X)).run(stream);
+    (Ap = matvec(A, X)).run(exec);
 
     // r0 = B - A*X
     // p = r0
-    (p = r0 = B - Ap).run(stream);
+    //(p = r0 = B - Ap).run(stream);
+    (p = r0 = B - Ap).run(exec);
 
-    (r0r0 = sum(r0*r0)).run(stream);
+    //(r0r0 = sum(r0*r0)).run(stream);
+    (r0r0 = sum(r0*r0)).run(exec);
 
     if(tol>0.0f) {
-      (converged = matx::all(as_int(sqrt(r0r0) < tol))).run(stream);
+      //(converged = matx::all(as_int(sqrt(r0r0) < tol))).run(stream);
+      (converged = matx::all(as_int(sqrt(r0r0) < tol))).run(exec);
 
       cudaEventRecord(event, stream);
       cudaStreamWaitEvent(d2h, event);
 
@@ -136,10 +140,12 @@ namespace matx
     int i;
     for (i = 0 ; i < max_iters; i++) {
       // Ap = matvec(A, p)
-      (Ap = matvec(A, p)).run(stream);
+      //(Ap = matvec(A, p)).run(stream);
+      (Ap = matvec(A, p)).run(exec);
 
       // pAp = dot(p,Ap)
-      (pAp = sum(p*Ap)).run(stream);
+      //(pAp = sum(p*Ap)).run(stream);
+      (pAp = sum(p*Ap)).run(exec);
 
       // if pAp is zero then we have exactly numerically converged.
       // However, this is batched so we may iterate more.  Iterating
@@ -151,10 +157,12 @@ namespace matx
       auto updateOp = ( r1 = r0 - (r0r0c/pApc) * Ap,
                         X = X + (r0r0c/pApc) * p);
-      (IF( pApc != value_type(0), updateOp)).run(stream);
+      //(IF( pApc != value_type(0), updateOp)).run(stream);
+      (IF( pApc != value_type(0), updateOp)).run(exec);
 
       // r1r1 = dot(r1, r1)
-      (r1r1 = sum(r1*r1)).run(stream);
+      //(r1r1 = sum(r1*r1)).run(stream);
+      (r1r1 = sum(r1*r1)).run(exec);
 
       if(tol>0.0f) {
         // copy convergence criteria to host.
@@ -167,7 +175,8 @@ namespace matx
           break;
         }
 
-        (converged = matx::all(as_int(sqrt(r1r1) < tol))).run(stream);
+        //(converged = matx::all(as_int(sqrt(r1r1) < tol))).run(stream);
+        (converged = matx::all(as_int(sqrt(r1r1) < tol))).run(exec);
 
         cudaEventRecord(event, stream);
         cudaStreamWaitEvent(d2h, event);
@@ -175,7 +184,8 @@ namespace matx
 
       // p = r1 + b * p
       auto updateP = ( p = r1 + (r1r1c/r0r0c) * p);
-      (IF( pApc != value_type(0), updateP)).run(stream);
+      //(IF( pApc != value_type(0), updateP)).run(stream);
+      (IF( pApc != value_type(0), updateP)).run(exec);
 
       // Advance residual
       swap(r0r0, r1r1);
diff --git a/include/matx/transforms/fft/fft_cuda.h b/include/matx/transforms/fft/fft_cuda.h
index c2eaa9c2..de9089b7 100644
--- a/include/matx/transforms/fft/fft_cuda.h
+++ b/include/matx/transforms/fft/fft_cuda.h
@@ -323,7 +323,11 @@ template class matxCUDAFFTPlan_t
   virtual ~matxCUDAFFTPlan_t()
   {
     if (this->workspace_ != nullptr) {
      // Pass the default stream until we allow user-deletable caches
+      /* Temporarily skip this free, as we likely don't want to introduce a
+         dependence on cudaStreamDefault */
+#if 0
       matxFree(workspace_, cudaStreamDefault);
+#endif
       this->workspace_ = nullptr;
     }
 
@@ -414,6 +418,11 @@ matxCUDAFFTPlan1D_t(OutTensorType &o, const InTensorType &i, cudaStream_t stream
       }
     }
 
+    // Assert that the stream is not currently being captured
+    cudaStreamCaptureStatus status;
+    cudaStreamIsCapturing(stream, &status);
+    MATX_ASSERT(status == cudaStreamCaptureStatusNone, matxCufftError);
+
     size_t workspaceSize;
     cufftCreate(&this->plan_);
     [[maybe_unused]] cufftResult error;