diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8037337c..5fac5862 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -128,13 +128,17 @@ if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.5)
     message(FATAL_ERROR "MatX requires CUDA 11.5 or higher. Please update before using.")
 endif()
 
+set(CCCL_ENABLE_UNSTABLE ON)
 message(STATUS "Finding CCCL...")
 rapids_cpm_cccl(
     BUILD_EXPORT_SET matx-exports
     INSTALL_EXPORT_SET matx-exports
 )
-target_link_libraries(matx INTERFACE CCCL::CCCL)
+target_link_libraries(matx INTERFACE CCCL::CCCL CCCL::cudax)
+target_compile_options(matx INTERFACE
+    $<$:--expt-relaxed-constexpr --extended-lambda>
+)
 
 # Set flags for compiling tests faster (only for nvcc)
 if (NOT CMAKE_CUDA_COMPILER_ID STREQUAL "Clang")
@@ -164,8 +168,7 @@ if (NOT ${IS_NVCPP} GREATER -1)
           -Wmisleading-indentation
           -Wduplicated-cond
          -Wduplicated-branches
-          -Wlogical-op
-          -Wnull-dereference)
+          -Wlogical-op)
   endif()
 endif()
diff --git a/cmake/versions.json b/cmake/versions.json
index df7c4ce6..82cae91a 100644
--- a/cmake/versions.json
+++ b/cmake/versions.json
@@ -4,7 +4,7 @@
     "version": "2.8.0",
     "git_shallow": false,
     "git_url": "https://github.com/NVIDIA/cccl.git",
-    "git_tag": "6d02e11"
+    "git_tag": "9f254d5d8d071d67c6a6ad107b0bbd578b3d072d"
   },
   "nvbench" : {
     "version" : "0.0",
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 077efeeb..95430347 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -12,6 +12,7 @@ set(examples
     pwelch
     resample_poly_bench
     sparse_tensor
+    simple_stf_test
     spectrogram
     spectrogram_graph
     spherical_harmonics
@@ -20,9 +21,6 @@ set(examples
     black_scholes
     print_styles)
 
-
-
-
 add_library(example_lib INTERFACE)
 target_include_directories(example_lib SYSTEM INTERFACE ${CUTLASS_INC} ${pybind11_INCLUDE_DIR} ${PYTHON_INCLUDE_DIRS})
@@ -54,6 +52,21 @@ foreach( example ${examples} )
     target_link_libraries(${example} example_lib)
 endforeach()
 
+# Compile all examples with CUDASTF and append their name with _stf
+foreach( example ${examples} )
+    string( CONCAT file ${example} ".cu" )
+
+    set(output_name "${example}_stf")
+    add_executable( ${output_name} ${file} )
+
+    # Add the -DUSE_STF compilation flag
+    target_compile_definitions(${output_name} PRIVATE USE_STF)
+
+    target_link_libraries(${output_name} example_lib)
+endforeach()
+
+
+
 # Build proprietary examples
 file(GLOB_RECURSE proprietary_sources CONFIGURE_DEPENDS ${CMAKE_SOURCE_DIR}/proprietary/*/examples/*.cu)
 foreach (pexample ${proprietary_sources})
diff --git a/examples/cgsolve.cu b/examples/cgsolve.cu
index cba23f02..7c50ef85 100644
--- a/examples/cgsolve.cu
+++ b/examples/cgsolve.cu
@@ -54,7 +54,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
   auto norm = make_tensor({BATCH});
   auto maxn = make_tensor({});
 
+#if 0
   cudaExecutor exec{};
+#else
+  stfExecutor exec{};
+#endif
 
   // Simple Poisson matrix
   for(int b = 0; b < BATCH; b++) {
@@ -83,6 +87,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
   (maxn = matx::max(sqrt(norm))).run(exec);
   exec.sync();
 
+#if 1
+  auto ctx = exec.getCtx();
+  ctx.finalize();
+#endif
+
   // example-end sync-test-1
 
   printf ("max l2 norm: %f\n", (float)sqrt(maxn()));
diff --git a/examples/fft_conv.cu b/examples/fft_conv.cu
index c6361481..d7882ca3 100644
--- a/examples/fft_conv.cu
+++ b/examples/fft_conv.cu
@@ -74,6 +74,13 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
   MATX_ENTER_HANDLER();
   using complex = cuda::std::complex;
 
+#ifdef USE_STF
+  std::cout << "Using STF executor\n";
+#else
+  std::cout << "Using CUDA 
executor\n"; +#endif + + index_t signal_size = 1ULL << 16; index_t filter_size = 16; index_t batches = 8; @@ -86,8 +93,14 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); - cudaExecutor exec{stream}; +#ifdef USE_STF + stfExecutor exec{stream}; + auto ctx = exec.getCtx(); +#else + cudaExecutor exec{stream}; +#endif + // Create time domain buffers auto sig_time = make_tensor({batches, signal_size}); auto filt_time = make_tensor({batches, filter_size}); @@ -117,7 +130,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // Perform the FFT in-place on both signal and filter for (int i = 0; i < iterations; i++) { if (i == 1) { - cudaEventRecord(start, stream); +#ifdef USE_STF + cudaEventRecord(start, ctx.task_fence()); +#else + cudaEventRecord(start, stream); +#endif } (sig_freq = fft(sig_time, filtered_size)).run(exec); (filt_freq = fft(filt_time, filtered_size)).run(exec); @@ -129,18 +146,30 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } +#ifdef USE_STF + cudaEventRecord(stop, ctx.task_fence()); +#else cudaEventRecord(stop, stream); +#endif exec.sync(); cudaEventElapsedTime(&separate_ms, start, stop); for (int i = 0; i < iterations; i++) { if (i == 1) { - cudaEventRecord(start, stream); +#ifdef USE_STF + cudaEventRecord(start, ctx.task_fence()); +#else + cudaEventRecord(start, stream); +#endif } (sig_freq = ifft(fft(sig_time, filtered_size) * fft(filt_time, filtered_size))).run(exec); } +#ifdef USE_STF + cudaEventRecord(stop, ctx.task_fence()); +#else cudaEventRecord(stop, stream); +#endif exec.sync(); cudaEventElapsedTime(&fused_ms, start, stop); @@ -153,7 +182,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) (time_out = conv1d(sig_time, filt1, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); exec.sync(); - +#ifdef USE_STF + ctx.finalize(); +#endif // Compare signals for (index_t b = 0; b < batches; b++) { for (index_t i = 0; i < filtered_size; i++) { @@ -172,4 +203,4 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); -} \ No newline at end of file +} diff --git a/examples/simple_radar_pipeline.cu b/examples/simple_radar_pipeline.cu index 126ded2b..00e67e00 100644 --- a/examples/simple_radar_pipeline.cu +++ b/examples/simple_radar_pipeline.cu @@ -39,37 +39,78 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) index_t numPulses = 128; index_t numSamples = 9000; index_t waveformLength = 1000; - constexpr bool ENABLE_GRAPHS = false; uint32_t iterations = 100; - constexpr int num_streams = 1; - cudaGraph_t graphs[num_streams]; - cudaGraphExec_t instances[num_streams]; - using complex = cuda::std::complex; - RadarPipeline *pipelines[num_streams]; + + bool enableGraphs = false; +#ifdef USE_STF + std::cout << "Using STF executor\n"; +#else + std::cout << "Using CUDA executor\n"; +#endif + +#if 0 + constexpr int numStreams = 8; +#else + int numStreams = 1; +#endif + + // Parse command-line arguments + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + + if (arg == "--numChannels" && i + 1 < argc) { + numChannels = std::stoi(argv[++i]); + } else if (arg == "--numPulses" && i + 1 < argc) { + numPulses = std::stoi(argv[++i]); + } else if (arg == "--numSamples" && i + 1 < argc) { + numSamples = std::stoi(argv[++i]); + } else if (arg == "--waveformLength" && i + 1 < argc) { + waveformLength = std::stoi(argv[++i]); + } else if (arg == "--iterations" 
&& i + 1 < argc) { + iterations = std::stoi(argv[++i]); + } else if (arg == "--numStreams" && i + 1 < argc) { + numStreams = std::stoi(argv[++i]); + } else if (arg == "--enableGraphs") { + enableGraphs = true; + ++i; + } else { + std::cerr << "Unknown option or missing value: " << arg << std::endl; + return 1; // Exit with error + } + } std::cout << "Iterations: " << iterations << std::endl; std::cout << "numChannels: " << numChannels << std::endl; std::cout << "numPulses: " << numPulses << std::endl; - std::cout << "numNumSamples: " << numSamples << std::endl; + std::cout << "numSamples: " << numSamples << std::endl; std::cout << "waveformLength: " << waveformLength << std::endl; + std::cout << "numStreams: " << numStreams << std::endl; + std::cout << "enableGraphs: " << enableGraphs << std::endl; - // cuda stream to place work in - cudaStream_t streams[num_streams]; + cudaGraph_t graphs[numStreams]; + cudaGraphExec_t instances[numStreams]; + using complex = cuda::std::complex; + RadarPipeline *pipelines[numStreams]; + // cuda stream to place work in + cudaStream_t streams[numStreams]; + // manually set to log all NVTX levels MATX_NVTX_SET_LOG_LEVEL( matx_nvxtLogLevels::MATX_NVTX_LOG_ALL ); // create some events for timing - cudaEvent_t starts[num_streams]; - cudaEvent_t stops[num_streams]; + cudaEvent_t starts[numStreams]; + cudaEvent_t stops[numStreams]; - for (int s = 0; s < num_streams; s++) { + for (int s = 0; s < numStreams; s++) { cudaEventCreate(&starts[s]); cudaEventCreate(&stops[s]); cudaStreamCreate(&streams[s]); MATX_NVTX_START_RANGE("Pipeline Initialize", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 1) +#if 0 printf("Initializing data structures for stream %d...\n", s); +#endif pipelines[s] = new RadarPipeline(numPulses, numSamples, waveformLength, numChannels, streams[s]); MATX_NVTX_END_RANGE(1) @@ -77,7 +118,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } MATX_NVTX_START_RANGE("Pipeline Test", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 2) +#if 0 printf("Running test...\n"); +#endif auto run_pipeline = [&](int s) { MATX_NVTX_START_RANGE("PulseCompression", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 21) @@ -98,12 +141,12 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) }; // Warmup - for (int s = 0; s < num_streams; s++) { + for (int s = 0; s < numStreams; s++) { run_pipeline(s); } - if (ENABLE_GRAPHS) { - for (int s = 0; s < num_streams; s++) { + if (enableGraphs) { + for (int s = 0; s < numStreams; s++) { cudaStreamBeginCapture(streams[s], cudaStreamCaptureModeGlobal); run_pipeline(s); cudaStreamEndCapture(streams[s], &graphs[s]); @@ -112,12 +155,17 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } for (uint32_t i = 0; i < iterations; i++) { - for (int s = 0; s < num_streams; s++) { + for (int s = 0; s < numStreams; s++) { if (i == 1) { +#ifdef USE_STF + auto ctx = pipelines[s]->exec.getCtx(); + cudaEventRecord(starts[s], ctx.task_fence()); +#else cudaEventRecord(starts[s], streams[s]); +#endif } - if (ENABLE_GRAPHS) { + if (enableGraphs) { cudaGraphLaunch(instances[s], streams[s]); } else { @@ -126,24 +174,37 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } } - for (int s = 0; s < num_streams; s++) { + for (int s = 0; s < numStreams; s++) { +#ifdef USE_STF + auto ctx = pipelines[s]->exec.getCtx(); + cudaEventRecord(stops[s], ctx.task_fence()); +#else cudaEventRecord(stops[s], streams[s]); +#endif pipelines[s]->sync(); } + +#ifdef USE_STF + for (int s = 0; s < numStreams; s++) { + auto ctx = 
pipelines[s]->exec.getCtx(); + ctx.finalize(); + } +#endif + MATX_NVTX_END_RANGE(2) MATX_NVTX_START_RANGE("Pipeline Results", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 3) float time_ms; - cudaEventElapsedTime(&time_ms, starts[num_streams-1], stops[num_streams-1]); + cudaEventElapsedTime(&time_ms, starts[numStreams-1], stops[numStreams-1]); float time_s = time_ms * .001f; - auto mult = iterations * numChannels * numPulses * num_streams; + auto mult = iterations * numChannels * numPulses * numStreams; printf("Pipeline finished in %.2fms, rate: %.2f pulses/channel/sec (%.2f Gbps)\n", time_ms, static_cast(mult) / time_s, static_cast(mult*sizeof(complex)*numSamples*8)/time_s/1e9); -for (int s = 0; s < num_streams; s++) { +for (int s = 0; s < numStreams; s++) { cudaEventDestroy(starts[s]); cudaEventDestroy(stops[s]); cudaStreamDestroy(streams[s]); diff --git a/examples/simple_radar_pipeline.h b/examples/simple_radar_pipeline.h index 0f52ccf2..4892e679 100644 --- a/examples/simple_radar_pipeline.h +++ b/examples/simple_radar_pipeline.h @@ -120,7 +120,6 @@ class RadarPipeline { RadarPipeline() = delete; ~RadarPipeline() { - } /** @@ -465,5 +464,10 @@ class RadarPipeline { tensor_t cfarMaskView; cudaStream_t stream; +#ifdef USE_STF +public: + stfExecutor exec; +#else cudaExecutor exec; +#endif }; diff --git a/examples/simple_radar_pipeline_test.sh b/examples/simple_radar_pipeline_test.sh new file mode 100755 index 00000000..880e9ed0 --- /dev/null +++ b/examples/simple_radar_pipeline_test.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +COMMAND_PARAMETERS=("../build/examples/simple_radar_pipeline" "../build/examples/simple_radar_pipeline_stf") +STREAM_PARAMETERS=("1" "2" "4" "5" "6") +#SAMPLE_PARAMETERS=("--numSamples 1000" "--numSamples 4500" "--numSamples 9000") +SAMPLE_PARAMETERS=("1000" "2000" "4500" "6000" "10000") + +NUM_RUNS=5 + +OUTPUT_FILE="radar_heatmap_data.csv" +# Initialize the CSV file + +# Loop through the parameters +for str_param in "${STREAM_PARAMETERS[@]}"; do + OUTPUT_FILE="radar_heatmap_data_${sam_param}.csv" + echo "Command,NumStreams,NumSamples,AverageGbps" > $OUTPUT_FILE + for command_param in "${COMMAND_PARAMETERS[@]}"; do + for sam_param in "${SAMPLE_PARAMETERS[@]}"; do + TOTAL_GBPS=0 + for i in $(seq 1 $NUM_RUNS); do + #echo "Iteration $i with parameter $command_param $str_param $sam_param" + OUTPUT=$($command_param --numStreams $str_param --numSamples $sam_param) + GBPS=$(echo "$OUTPUT" | grep -oP '(?<=\().*? Gbps' | awk '{print $1}') + + # Add the extracted value to the total + if [ -n "$GBPS" ]; then + TOTAL_GBPS=$(echo "$TOTAL_GBPS + $GBPS" | bc) + else + echo "Failed to extract Gbps for iteration $i." + fi + done + + # Calculate the average + if [ "$NUM_RUNS" -gt 0 ]; then + AVERAGE_GBPS=$(echo "$TOTAL_GBPS / $NUM_RUNS" | bc -l) + #echo "$command_param $str_param $sam_param verage Gbps over $NUM_RUNS runs: $AVERAGE_GBPS" + # Append the results to the CSV file + echo "$command_param,$(echo $str_param | awk '{print $0}'),$(echo $sam_param | awk '{print $0}'),$AVERAGE_GBPS" + echo "$command_param,$(echo $str_param | awk '{print $0}'),$(echo $sam_param | awk '{print $0}'),$AVERAGE_GBPS" >> $OUTPUT_FILE + else + echo "No runs were performed." + fi + done + done +done + +echo "Heatmap data saved to $OUTPUT_FILE." 
diff --git a/examples/simple_stf_test.cu b/examples/simple_stf_test.cu
new file mode 100644
index 00000000..1df260ed
--- /dev/null
+++ b/examples/simple_stf_test.cu
@@ -0,0 +1,156 @@
+////////////////////////////////////////////////////////////////////////////////
+// BSD 3-Clause License
+//
+// Copyright (c) 2021, NVIDIA Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its
+//    contributors may be used to endorse or promote products derived from
+//    this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+/////////////////////////////////////////////////////////////////////////////////
+
+#include "matx.h"
+#include
+#include
+#include
+#include
+
+using namespace matx;
+
+/**
+ * MatX uses C++ expression templates to build arithmetic expressions that compile into a lazily-evaluated
+ * type for executing on the device. Currently, nvcc cannot see certain optimizations
+ * when building the expression tree that would be obvious by looking at the code. Specifically, any code reusing
+ * the same tensor multiple times appears to the compiler as separate tensors, and it may issue multiple load
+ * instructions. While caching helps, this can have a slight performance impact when compared to native CUDA
+ * kernels. To work around this problem, complex expressions can be placed in a custom operator by adding some
+ * boilerplate code around the original expression. This custom operator can then be used either alone or inside
+ * other arithmetic expressions, and only a single load is issued for each tensor.
+ *
+ * This example uses the Black-Scholes equation to demonstrate the two ways to implement the equation in MatX, and
+ * shows the performance difference.
+ */
+
+/* Arithmetic expression */
+template
+void compute_black_scholes_matx(tensor_t& K,
+                                tensor_t& S,
+                                tensor_t& V,
+                                tensor_t& r,
+                                tensor_t& T,
+                                tensor_t& output,
+#if 0
+                                cudaExecutor& exec)
+#else
+                                stfExecutor& exec)
+#endif
+{
+  auto VsqrtT = V * sqrt(T);
+  auto d1 = (log(S / K) + (r + 0.5 * V * V) * T) / VsqrtT;
+  auto d2 = d1 - VsqrtT;
+  auto cdf_d1 = normcdf(d1);
+  auto cdf_d2 = normcdf(d2);
+  auto expRT = exp(-1 * r * T);
+  (output = S * cdf_d1 - K * expRT * cdf_d2).run(exec);
+  //std::cout << "Output : " << std::endl;
+  //print(output);
+}
+
+int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
+{
+  MATX_ENTER_HANDLER();
+
+  using dtype = double;
+
+#if 0
+  index_t input_size = 100000000;
+#else
+  index_t input_size = 10000;
+#endif
+  constexpr uint32_t num_iterations = 1000;
+  float time_ms;
+
+  tensor_t K_tensor{{input_size}};
+  tensor_t S_tensor{{input_size}};
+  tensor_t V_tensor{{input_size}};
+  tensor_t r_tensor{{input_size}};
+  tensor_t T_tensor{{input_size}};
+  tensor_t output_tensor{{input_size}};
+
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+#if 0
+  cudaExecutor exec{stream};
+#else
+  stfExecutor exec{stream};
+  auto ctx = exec.getCtx();
+#endif
+
+  /* Albert --- initializing input .. */
+  for (int i = 0; i < input_size; i++) {
+    K_tensor(i) = dtype(i+1);
+    S_tensor(i) = dtype(i+i+1);
+    V_tensor(i) = dtype(i+i+i+1);
+    r_tensor(i) = dtype(i+i+i+i+1);
+    T_tensor(i) = dtype(i+i+i+i+i+1);
+  }
+
+//print(V_tensor);
+
+  //compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec);
+
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+#if 0
+  cudaEventRecord(start, stream);
+#else
+  cudaEventRecord(start, ctx.task_fence());
+#endif
+  // Time non-operator version
+  for (uint32_t i = 0; i < num_iterations; i++) {
+    compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec);
+  }
+#if 0
+  cudaEventRecord(stop, stream);
+#else
+  cudaEventRecord(stop, ctx.task_fence());
+#endif
+  exec.sync();
+#if 1
+  ctx.finalize();
+#endif
+  cudaEventElapsedTime(&time_ms, start, stop);
+
+  // printf("Output tensor :\n");
+  // print(output_tensor);
+  printf("Time without custom operator = %fms per iteration\n",
+         time_ms / num_iterations);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  cudaStreamDestroy(stream);
+  //CUDA_CHECK_LAST_ERROR();
+  MATX_EXIT_HANDLER();
+}
diff --git a/examples/spectrogram.cu b/examples/spectrogram.cu
index 9d238b0e..60404a7e 100644
--- a/examples/spectrogram.cu
+++ b/examples/spectrogram.cu
@@ -60,6 +60,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
   cudaStream_t stream;
   cudaStreamCreate(&stream);
+
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
   cudaExecutor exec{stream};
 
   float fs = 10000;
diff --git a/examples/spectrogram_graph.cu b/examples/spectrogram_graph.cu
index 38d7c5d2..cf296875 100644
--- a/examples/spectrogram_graph.cu
+++ b/examples/spectrogram_graph.cu
@@ -62,12 +62,22 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
   cudaStream_t stream;
   cudaStreamCreate(&stream);
-  cudaExecutor exec{stream};
-
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
 
+#ifdef USE_STF
+  std::cout << "Using STF executor\n";
+#else
+  std::cout << "Using CUDA executor\n";
+#endif
+
+#ifdef USE_STF
+  stfExecutor exec{stream};
+#else
+  cudaExecutor exec{stream};
+#endif
+
   float fs = 10000;
   index_t N = 100000;
   float amp = static_cast(2 * 
sqrt(2)); @@ -147,15 +157,33 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } } - exec.sync(); // Time graph execution of same kernels - cudaEventRecord(start, stream); +#if USE_STF + auto ctx = exec.getCtx(); + cudaEventRecord(start, ctx.task_fence()); +#else + cudaEventRecord(start, stream); +#endif + for (uint32_t i = 0; i < 10; i++) { cudaGraphLaunch(instance, stream); } +#ifdef USE_STF +{ + cudaEventRecord(stop, ctx.task_fence()); +} +#else cudaEventRecord(stop, stream); +#endif exec.sync(); + +#ifdef USE_STF +{ + ctx.finalize(); +} +#endif + cudaEventElapsedTime(&time_ms, start, stop); printf("Spectrogram Time With Graphs = %.2fus per iteration\n", diff --git a/include/matx/core/allocator.h b/include/matx/core/allocator.h index ecb29e08..db841999 100644 --- a/include/matx/core/allocator.h +++ b/include/matx/core/allocator.h @@ -236,9 +236,13 @@ struct MemTracker { } ~MemTracker() { +#if 0 while (allocationMap.size()) { deallocate(allocationMap.begin()->first); } +#else + std::cout << "TODO: Fix me allocator.h\n"; +#endif } }; diff --git a/include/matx/core/operator_utils.h b/include/matx/core/operator_utils.h index 95375c77..5907f56e 100644 --- a/include/matx/core/operator_utils.h +++ b/include/matx/core/operator_utils.h @@ -120,7 +120,7 @@ namespace matx { __MATX_HOST__ __MATX_INLINE__ void AllocateTempTensor(TensorType &tensor, Executor &&ex, ShapeType &&shape, typename TensorType::value_type **ptr) { const auto ttl_size = std::accumulate(shape.begin(), shape.end(), static_cast(1), std::multiplies()) * sizeof(typename TensorType::value_type); - if constexpr (is_cuda_executor_v) { + if constexpr (is_cuda_executor_v || is_stf_executor_v) { matxAlloc((void**)ptr, ttl_size, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); make_tensor(tensor, *ptr, shape); } diff --git a/include/matx/core/tensor.h b/include/matx/core/tensor.h index f042d18b..6b13ee03 100644 --- a/include/matx/core/tensor.h +++ b/include/matx/core/tensor.h @@ -91,6 +91,7 @@ class tensor_t : public detail::tensor_impl_t { using stride_container = typename Desc::stride_container; using desc_type = Desc; ///< Descriptor type trait using self_type = tensor_t; + using stf_logicaldata_type = typename cuda::experimental::stf::logical_data; /** * @brief Construct a new 0-D tensor t object @@ -107,7 +108,7 @@ class tensor_t : public detail::tensor_impl_t { * @param rhs Object to copy from */ __MATX_HOST__ tensor_t(tensor_t const &rhs) noexcept - : detail::tensor_impl_t{rhs.Data(), rhs.desc_}, storage_(rhs.storage_) + : detail::tensor_impl_t{rhs.Data(), rhs.desc_, rhs.stf_ldata_}, storage_(rhs.storage_) { } /** @@ -116,7 +117,7 @@ class tensor_t : public detail::tensor_impl_t { * @param rhs Object to move from */ __MATX_HOST__ tensor_t(tensor_t &&rhs) noexcept - : detail::tensor_impl_t{rhs.Data(), std::move(rhs.desc_)}, storage_(std::move(rhs.storage_)) + : detail::tensor_impl_t{rhs.Data(), std::move(rhs.desc_), rhs.stf_ldata_}, storage_(std::move(rhs.storage_)) { } @@ -134,6 +135,7 @@ class tensor_t : public detail::tensor_impl_t { this->SetData(rhs.Data()); storage_ = rhs.storage_; this->desc_ = rhs.desc_; + this->stf_ldata_ = rhs.stf_ldata_; } /** Swaps two tensors @@ -154,6 +156,7 @@ class tensor_t : public detail::tensor_impl_t { rhs.SetData(tmpdata); swap(lhs.storage_, rhs.storage_); swap(lhs.desc_, rhs.desc_); + std::swap(lhs.stf_ldata_, rhs.stf_ldata_); } __MATX_INLINE__ ~tensor_t() = default; @@ -184,6 +187,16 @@ class tensor_t : public detail::tensor_impl_t { this->SetLocalData(storage_.data()); } + 
template ::type> && is_matx_descriptor_v::type>, bool> = true> + tensor_t(S2 &&s, D2 &&desc, T* ldata, std::optional *stf_ldata_) : + detail::tensor_impl_t{std::forward(desc)}, + storage_{std::forward(s)} + { + this->stf_ldata_ = stf_ldata_; + this->SetLocalData(storage_.data()); + } + /** * @brief Construct a new tensor t object. Used to copy an existing storage object for proper reference counting * @@ -192,13 +205,28 @@ class tensor_t : public detail::tensor_impl_t { * @param ldata */ template - tensor_t(Storage s, D2 &&desc, T* ldata) : + tensor_t(Storage s, D2 &&desc, T* ldata, std::optional *stf_ldata) : detail::tensor_impl_t{std::forward(desc)}, storage_{std::move(s)} { + this->stf_ldata_ = stf_ldata; this->SetLocalData(ldata); } + /** + * @brief Construct a new tensor t object. Used to copy an existing storage object for proper reference counting + * + * @param s + * @param desc + * @param ldata + */ + template + tensor_t(Storage s, D2 &&desc, T* ldata) : + detail::tensor_impl_t{std::forward(desc)}, + storage_{std::move(s)} + { + this->SetLocalData(ldata); + } /** * Constructor for a rank-1 and above tensor. @@ -653,7 +681,7 @@ class tensor_t : public detail::tensor_impl_t { // Copy descriptor and call ctor with shape Desc new_desc{std::forward(shape)}; - return tensor_t{storage_, std::move(new_desc), this->Data()}; + return tensor_t{storage_, std::move(new_desc), this->Data(), this->stf_ldata_}; } /** @@ -712,7 +740,7 @@ class tensor_t : public detail::tensor_impl_t { "To get a reshaped view the tensor must be compact"); DefaultDescriptor desc{std::move(tshape)}; - return tensor_t{storage_, std::move(desc), this->Data()}; + return tensor_t{storage_, std::move(desc), this->Data(), this->stf_ldata_}; } /** @@ -795,7 +823,7 @@ class tensor_t : public detail::tensor_impl_t { // Copy descriptor and call ctor with shape Desc new_desc{this->desc_.Shape(), std::move(strides)}; - return tensor_t{storage_, std::move(new_desc), data}; + return tensor_t{storage_, std::move(new_desc), data, this->stf_ldata_}; } /** @@ -838,7 +866,7 @@ class tensor_t : public detail::tensor_impl_t { } Desc new_desc{this->desc_.Shape(), std::move(strides)}; - return tensor_t{storage_, std::move(new_desc), data}; + return tensor_t{storage_, std::move(new_desc), data, this->stf_ldata_}; } /** @@ -861,7 +889,7 @@ class tensor_t : public detail::tensor_impl_t { MATX_NVTX_START("", matx::MATX_NVTX_LOG_API) auto new_desc = this->PermuteImpl(dims); - return tensor_t{storage_, std::move(new_desc), this->Data()}; + return tensor_t{storage_, std::move(new_desc), this->Data(), this->stf_ldata_}; } @@ -1029,7 +1057,7 @@ class tensor_t : public detail::tensor_impl_t { OverlapView(const cuda::std::array &windows, const cuda::std::array &strides) const { auto new_desc = this->template OverlapViewImpl(windows, strides); - return tensor_t{storage_, std::move(new_desc), this->Data()}; + return tensor_t{storage_, std::move(new_desc), this->Data(), this->stf_ldata_}; } /** @@ -1063,7 +1091,7 @@ class tensor_t : public detail::tensor_impl_t { MATX_NVTX_START("", matx::MATX_NVTX_LOG_API) auto new_desc = this->template CloneImpl(clones); - return tensor_t{storage_, std::move(new_desc), this->Data()}; + return tensor_t{storage_, std::move(new_desc), this->Data(), this->stf_ldata_}; } template @@ -1361,7 +1389,7 @@ class tensor_t : public detail::tensor_impl_t { [[maybe_unused]] StrideType strides) const { auto [new_desc, data] = this->template SliceImpl(firsts, ends, strides); - return tensor_t{storage_, std::move(new_desc), data}; + 
return tensor_t{storage_, std::move(new_desc), data, this->stf_ldata_}; } template diff --git a/include/matx/core/tensor_impl.h b/include/matx/core/tensor_impl.h index ae755d1f..7a89046c 100644 --- a/include/matx/core/tensor_impl.h +++ b/include/matx/core/tensor_impl.h @@ -42,9 +42,11 @@ #include "matx/core/tensor_utils.h" #include "matx/operators/set.h" #include "matx/core/sparse_tensor_format.h" +#include "matx/core/utils.h" //#include "matx_exec_kernel.h" #include "iterator.h" #include "matx/core/make_tensor.h" +#include namespace matx { @@ -99,6 +101,7 @@ class tensor_impl_t { using stride_type = typename Desc::stride_type; using matxoplvalue = bool; using self_type = tensor_impl_t; + using stf_logicaldata_type = typename cuda::experimental::stf::logical_data; // Type specifier for signaling this is a matx operation using matxop = bool; @@ -130,13 +133,15 @@ class tensor_impl_t { swap(lhs.data_, rhs.data_); swap(lhs.desc_, rhs.desc_); + swap(lhs.stf_ldata_, rhs.stf_ldata_); } /** * Constructor for a rank-0 tensor (scalar). */ tensor_impl_t() { - + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } /** @@ -148,6 +153,8 @@ class tensor_impl_t { tensor_impl_t(T *const data) { data_.ldata_ = data; static_assert(RANK == 0, "tensor_impl_t with single pointer parameter must be a rank 0 tensor"); + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } /** * Constructor for a rank-1 and above tensor. @@ -159,6 +166,8 @@ class tensor_impl_t { std::enable_if_t> && !is_matx_descriptor_v>, bool> = true> __MATX_INLINE__ tensor_impl_t(ShapeType &&shape) : desc_(std::forward(shape)) { + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } /** @@ -172,7 +181,10 @@ class tensor_impl_t { template __MATX_INLINE__ tensor_impl_t(ShapeType &&shape, StrideType &&strides) : desc_(std::forward(shape), std::forward(strides)) - {} + { + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; + } /** * Constructor for a rank-1 and above tensor using a user pointer and shape @@ -190,6 +202,8 @@ class tensor_impl_t { : desc_(std::forward(shape)) { data_.ldata_ = ldata; + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } @@ -214,6 +228,8 @@ class tensor_impl_t { : desc_(std::forward(shape), std::forward(strides)) { data_.ldata_ = ldata; + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } @@ -238,9 +254,19 @@ MATX_IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") : desc_{std::forward(desc)} { data_.ldata_ = ldata; + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } MATX_IGNORE_WARNING_POP_GCC + template ::type>, bool> = true> + __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ tensor_impl_t(T *const ldata, + DescriptorType &&desc, std::optional *stf_ldata) + : desc_{std::forward(desc)}, stf_ldata_(stf_ldata) + { + data_.ldata_ = ldata; + } + /** * Constructor for creating a view with only a descriptor * @@ -255,12 +281,15 @@ MATX_IGNORE_WARNING_POP_GCC __MATX_INLINE__ tensor_impl_t(DescriptorType &&desc) : desc_{std::forward(desc)} { + auto ldptr = new std::optional(); + this->stf_ldata_ = ldptr; } __MATX_HOST__ void Shallow(const self_type &rhs) noexcept { data_.ldata_ = rhs.Data(); desc_ = rhs.desc_; + stf_ldata_ = rhs.stf_ldata_; } /** @@ -277,6 +306,7 @@ MATX_IGNORE_WARNING_POP_GCC { data_.ldata_ = op.Data(); desc_ = op.desc_; + stf_ldata_ = op.stf_ldata; } @@ -793,7 +823,7 @@ MATX_IGNORE_WARNING_POP_GCC auto new_desc = CloneImpl(clones); - return tensor_impl_t{this->data_.ldata_, std::move(new_desc)}; + return 
tensor_impl_t{this->data_.ldata_, std::move(new_desc), this->stf_ldata_}; } __MATX_INLINE__ auto PermuteImpl(const cuda::std::array &dims) const @@ -823,7 +853,7 @@ MATX_IGNORE_WARNING_POP_GCC __MATX_INLINE__ auto Permute(const cuda::std::array &dims) const { auto new_desc = PermuteImpl(dims); - return tensor_impl_t{this->data_.ldata_, std::move(new_desc)}; + return tensor_impl_t{this->data_.ldata_, std::move(new_desc), this->stf_ldata_}; } template @@ -868,7 +898,7 @@ MATX_IGNORE_WARNING_POP_GCC OverlapView(const cuda::std::array &windows, const cuda::std::array &strides) const { auto new_desc = OverlapViewImpl(windows, strides); - return tensor_impl_t{this->data_.ldata_, std::move(new_desc)}; + return tensor_impl_t{this->data_.ldata_, std::move(new_desc), this->stf_ldata_}; } template @@ -1197,9 +1227,47 @@ MATX_IGNORE_WARNING_POP_GCC memcpy(data_.pos_, pos, U::LVL*sizeof(pos[0])); } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm) const noexcept + { + auto &ld = stf_ldata_->value(); + + using namespace cuda::experimental::stf; +#if 0 + data_place place = getDataPlace(Data()); +#endif + + if (perm == 0) { + task.add_deps(ld.write()); + } + else if (perm == 1) { + task.add_deps(ld.read()); + } + else if (perm == 2) { + task.add_deps(ld.rw()); + } + else { + std::cout << "abort ...\n"; + } + } + template __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, [[maybe_unused]] Executor &&ex) const noexcept { + if constexpr (is_stf_executor_v) { + using namespace cuda::experimental::stf; + /* Don't create a new logical data for a tensor if it alread had one created previously */ + if (stf_ldata_ && stf_ldata_->has_value()) { return; } + + auto ctx = ex.getCtx(); +#if 0 + // Determine the type of memory that was allocated ie. host/managed/etc + data_place place = getDataPlace(Data()); +#endif + + *stf_ldata_ = ctx.token(); + stf_ldata_->value().set_symbol(this->str()); + } } template @@ -1211,6 +1279,9 @@ MATX_IGNORE_WARNING_POP_GCC protected: TensorData data_; Desc desc_; + + public: + mutable std::optional *stf_ldata_; }; } diff --git a/include/matx/core/type_utils.h b/include/matx/core/type_utils.h index e1b6f38d..c104bba2 100644 --- a/include/matx/core/type_utils.h +++ b/include/matx/core/type_utils.h @@ -45,6 +45,7 @@ #include "matx/core/half.h" #include "matx/core/half_complex.h" #include "matx/executors/cuda.h" +#include "matx/executors/stf.h" /** * Defines type traits for host and device compilers. 
This file should be includable by @@ -302,6 +303,7 @@ inline constexpr bool is_settable_xform_v = std::conjunction_v struct is_executor : std::false_type {}; template <> struct is_executor : std::true_type {}; +template <> struct is_executor : std::true_type {}; template struct is_executor> : std::true_type {}; } @@ -322,6 +324,11 @@ template struct is_cuda_executor : std::false_type {}; template<> struct is_cuda_executor : std::true_type {}; } +namespace detail { +template struct is_stf_executor : std::false_type {}; +template<> struct is_stf_executor : std::true_type {}; +} + /** * @brief Determine if a type is a device executor * @@ -330,6 +337,9 @@ template<> struct is_cuda_executor : std::true_type {}; template inline constexpr bool is_cuda_executor_v = detail::is_cuda_executor::type>::value; +template +inline constexpr bool is_stf_executor_v = detail::is_stf_executor::type>::value; + namespace detail { template struct is_host_executor : std::false_type {}; template struct is_host_executor> : std::true_type {}; diff --git a/include/matx/core/utils.h b/include/matx/core/utils.h index 81643210..1b4c6e06 100644 --- a/include/matx/core/utils.h +++ b/include/matx/core/utils.h @@ -46,6 +46,30 @@ namespace matx { constexpr int PASCAL_CC = 6; namespace detail { + +#if 0 +__MATX_INLINE__ cuda::experimental::stf::data_place getDataPlace(void *ptr) { + using namespace cuda::experimental::stf; + auto kind = GetPointerKind(ptr); + switch (kind) { + case MATX_MANAGED_MEMORY: + return data_place::managed; + case MATX_HOST_MEMORY: + case MATX_HOST_MALLOC_MEMORY: + return data_place::host; + case MATX_DEVICE_MEMORY: + case MATX_ASYNC_DEVICE_MEMORY: + return data_place::current_device(); + case MATX_INVALID_MEMORY: + //std::cout << "Data kind is invalid: assuming managed memory\n"; + return data_place::managed; + //return data_place::invalid; + default: + return data_place::invalid; + } +} +#endif + __MATX_INLINE__ int GetDeviceAttr(cudaDeviceAttr attr) { int val; int dev; diff --git a/include/matx/executors/stf.h b/include/matx/executors/stf.h new file mode 100644 index 00000000..e055a051 --- /dev/null +++ b/include/matx/executors/stf.h @@ -0,0 +1,258 @@ +//////////////////////////////////////////////////////////////////////////////// +// BSD 3-Clause License +// +// Copyright (c) 2021, NVIDIA Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +///////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "matx/core/defines.h" +#include "matx/executors/host.h" +#include "matx/executors/kernel.h" + +#include + +//using namespace cuda::experimental::stf; +//using namespace cudastf; + +namespace matx +{ + + +/* Albert - Needed to declare this here to avoid compile error. */ +template constexpr bool is_matx_op_lvalue(); +template constexpr bool is_matx_set_op(); + + class stfExecutor { + public: + using matx_cuda = bool; // signal this is a GPU executor + using matx_executor = bool; ///< Type trait indicating this is an executor + + /** + * @brief Construct a new stfExecutor with a stream + * + * @param stream CUDA stream + */ + stfExecutor(cudaStream_t stream, bool is_cudagraph = false) : stream_(stream) { + cuda::experimental::stf::async_resources_handle handle; + if (!is_cudagraph) + ctx_ = cuda::experimental::stf::stream_ctx(stream, handle); + else + ctx_ = cuda::experimental::stf::graph_ctx(stream, handle); + } + + stfExecutor(int stream, bool is_cudagraph = false) : stream_(reinterpret_cast(stream)) { + cuda::experimental::stf::async_resources_handle handle; + if (!is_cudagraph) + ctx_ = cuda::experimental::stf::stream_ctx(reinterpret_cast(stream), handle); + else + ctx_ = cuda::experimental::stf::graph_ctx(reinterpret_cast(stream), handle); + } + + /** + * @brief Construct a new stfExecutor object using the default stream + * + */ + stfExecutor(bool is_cudagraph = false) : stream_(0) { + if (!is_cudagraph) + ctx_ = cuda::experimental::stf::stream_ctx(); + else + ctx_ = cuda::experimental::stf::graph_ctx(); + } + + ~stfExecutor() { + //ctx_.finalize(); + } + /** + * @brief Returns stream associated with executor + */ + auto getStream() const { return stream_; } + + /** + * @brief Get CUDASTF Ctx + * + */ + auto &getCtx() const noexcept { return ctx_; } + + /** + * @brief Synchronize the STF executor's stream + * + */ + void sync() { cudaStreamSynchronize(ctx_.task_fence()); } + + /** + * Execute an operator on a device + * + * @tparam Op Operator type + * @param op value + **/ + template + void Exec(Op &op) const { +#ifdef __CUDACC__ + dim3 threads, blocks; + + auto ctx = getCtx(); + // Parameters passed by value in CUDA are limited to 4096B. If the user exceeds this, we + // need to error out and have them break up the statement + MATX_STATIC_ASSERT((sizeof(op) + sizeof(index_t) * Op::Rank()) <= CUDA_MAX_VAL_PARAM, + "Parameter buffer to device is limited to 4096B. 
Please break up your operator statement into multiple executions to limit the size of the parameters"); + + if constexpr (Op::Rank() == 0) { + threads = 1; + blocks = 1; + if constexpr (is_matx_op_lvalue() || is_matx_set_op()) { + auto tsk = ctx.task(); + tsk.set_symbol(op.str()); + op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps + tsk->*[&](cudaStream_t s) { + detail::matxOpT0Kernel<<>>(op); + }; + } + else { + //std::cout << " RANK 0 not on LHS operator = " << op.str() << '\n'; + detail::matxOpT0Kernel<<>>(op); + } + } + else { + cuda::std::array sizes; + for (int i = 0; i < Op::Rank(); i++) { + sizes[i] = op.Size(i); + } + + bool stride = detail::get_grid_dims(blocks, threads, sizes, 256); + + if constexpr (Op::Rank() == 1) { + if constexpr (is_matx_op_lvalue() || is_matx_set_op()) { + auto tsk = ctx.task(); + tsk.set_symbol(op.str()); + op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps + //std::cout << "Start launch task. Rank = " << Op::Rank() << '\n'; + tsk->*[&](cudaStream_t s) { + detail::matxOpT1Kernel<<>>(op, sizes[0]); + }; + //std::cout << "End launch task.\n"; + } + else { + //std::cout << " RANK 1 not on LHS operator = " << op.str() << '\n'; + detail::matxOpT1Kernel<<>>(op, sizes[0]); + } + } + else if constexpr (Op::Rank() == 2) { + if constexpr (is_matx_op_lvalue() || is_matx_set_op()) { + auto tsk = ctx.task(); + tsk.set_symbol(op.str()); + op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps + //std::cout << "About to launch task. Rank = " << Op::Rank() << '\n'; + tsk->*[&](cudaStream_t s) { + if(stride) { + detail::matxOpT2StrideKernel<<>>(op, sizes[0], sizes[1]); + } else { + detail::matxOpT2Kernel<<>>(op, sizes[0], sizes[1]); + } + }; + } + else { + //std::cout << " not on LHS operator = " << op.str() << '\n'; + if(stride) { + detail::matxOpT2StrideKernel<<>>(op, sizes[0], sizes[1]); + } else { + detail::matxOpT2Kernel<<>>(op, sizes[0], sizes[1]); + } + } + } + else if constexpr (Op::Rank() == 3) { + if constexpr (is_matx_op_lvalue() || is_matx_set_op()) { + auto tsk = ctx.task(); + tsk.set_symbol(op.str()); + op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps + tsk->*[&](cudaStream_t s) { + if(stride) { + detail::matxOpT3StrideKernel<<>>(op, sizes[0], sizes[1], sizes[2]); + } else { + detail::matxOpT3Kernel<<>>(op, sizes[0], sizes[1], sizes[2]); + } + }; + } + else { + if(stride) { + detail::matxOpT3StrideKernel<<>>(op, sizes[0], sizes[1], sizes[2]); + } else { + detail::matxOpT3Kernel<<>>(op, sizes[0], sizes[1], sizes[2]); + } + } + } + else if constexpr (Op::Rank() == 4) { + if constexpr (is_matx_op_lvalue() || is_matx_set_op()) { + auto tsk = ctx.task(); + op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps + tsk.set_symbol(op.str())->*[&](cudaStream_t s) { + if(stride) { + detail::matxOpT4StrideKernel<<>>(op, sizes[0], sizes[1], sizes[2], sizes[3]); + } else { + detail::matxOpT4Kernel<<>>(op, sizes[0], sizes[1], sizes[2], sizes[3]); + } + }; + } + else { + if(stride) { + detail::matxOpT4StrideKernel<<>>(op, sizes[0], sizes[1], sizes[2], sizes[3]); + } else { + detail::matxOpT4Kernel<<>>(op, sizes[0], sizes[1], sizes[2], sizes[3]); + } + } + } + else { + if constexpr (is_matx_op_lvalue() || is_matx_set_op()) { + auto tsk = ctx.task(); + tsk.set_symbol(op.str()); + op.apply_dep_to_task(tsk); // recursively find the tensors from the tree to apply deps + //std::cout << "About to launch 
task. Rank = " << Op::Rank() << '\n'; + + tsk->*[&](cudaStream_t s) { + index_t dims = std::accumulate(std::begin(sizes) + 1, std::end(sizes), 1, std::multiplies()); + detail::matxOpTDKernel<<>>(op, sizes, dims); + }; + } + else { + index_t dims = std::accumulate(std::begin(sizes) + 1, std::end(sizes), 1, std::multiplies()); + detail::matxOpTDKernel<<>>(op, sizes, dims); + } + } + } +#else + MATX_ASSERT_STR(false, matxInvalidParameter, "Cannot call device executor using host compiler"); +#endif + } + + private: + cudaStream_t stream_; + cuda::experimental::stf::context ctx_; + }; + +}; diff --git a/include/matx/generators/generator1d.h b/include/matx/generators/generator1d.h index cdeca628..1d684d62 100644 --- a/include/matx/generators/generator1d.h +++ b/include/matx/generators/generator1d.h @@ -54,6 +54,9 @@ namespace matx return f_(pp_get(indices...)); } + template + __MATX_INLINE__ void apply_dep_to_task([[maybe_unused]] Task &&task, [[maybe_unused]] int perm=1) const noexcept { } + constexpr inline __MATX_HOST__ __MATX_DEVICE__ auto Size(int dim) const { return *(s_.begin() + dim); diff --git a/include/matx/generators/random.h b/include/matx/generators/random.h index fff49e92..ad404b55 100644 --- a/include/matx/generators/random.h +++ b/include/matx/generators/random.h @@ -305,7 +305,7 @@ namespace detail { { InnerPreRun(std::forward(shape), std::forward(ex)); #ifdef __CUDACC__ - if constexpr (is_cuda_executor_v) { + if constexpr ((is_cuda_executor_v) || (is_stf_executor_v)) { if (!init_) { auto stream = ex.getStream(); matxAlloc((void **)&states_, @@ -341,7 +341,7 @@ namespace detail { template __MATX_INLINE__ void PostRun([[maybe_unused]] ST &&shape, [[maybe_unused]] Executor &&ex) const noexcept { - if constexpr (is_cuda_executor_v) { + if constexpr ((is_cuda_executor_v) || (is_stf_executor_v)) { matxFree(states_); } else if constexpr (is_host_executor_v) { @@ -479,6 +479,9 @@ namespace detail { return shape_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task([[maybe_unused]] Task &&task, [[maybe_unused]] int perm=1) const noexcept { } + static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() { return RANK; } }; } diff --git a/include/matx/operators/all.h b/include/matx/operators/all.h index 8f01b02c..8a286ad8 100644 --- a/include/matx/operators/all.h +++ b/include/matx/operators/all.h @@ -72,9 +72,35 @@ namespace detail { return tmp_out_(indices...); }; + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept { + /* Albert -- Scenario where the all() operator is on the RHS and sum has already + run previously. 
So we make tmp_out have a read permission as it will be read from */ + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + template void Exec(Out &&out, Executor &&ex) const { - all_impl(cuda::std::get<0>(out), a_, ex); + auto output = cuda::std::get<0>(out); + // stfexecutor case + if constexpr (!is_cuda_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("all_task"); + + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + all_impl(output, a_, exec); + }; + } + // cudaExecutor case + else if constexpr (is_cuda_executor_v) { + all_impl(output, a_, ex); + } } static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() diff --git a/include/matx/operators/binary_operators.h b/include/matx/operators/binary_operators.h index c6c0eea1..eeca1842 100644 --- a/include/matx/operators/binary_operators.h +++ b/include/matx/operators/binary_operators.h @@ -146,6 +146,18 @@ namespace matx return detail::matx_max(size1,size2); } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, Perm perm) const noexcept + { + if constexpr (is_matx_op()) { + in1_.apply_dep_to_task(std::forward(task), perm); + } + + if constexpr (is_matx_op()) { + in2_.apply_dep_to_task(std::forward(task), perm); + } + } + template __MATX_INLINE__ void PreRun(ShapeType &&shape, Executor &&ex) const noexcept { diff --git a/include/matx/operators/cast.h b/include/matx/operators/cast.h index c1af0708..844e8c3d 100644 --- a/include/matx/operators/cast.h +++ b/include/matx/operators/cast.h @@ -83,6 +83,14 @@ namespace matx return static_cast(get_value(op_, indices...)); } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept + { + if constexpr (is_matx_op()) { + op_.apply_dep_to_task(std::forward(task), perm); + } + } + template __MATX_INLINE__ void PreRun(ShapeType &&shape, Executor &&ex) const noexcept { diff --git a/include/matx/operators/cgsolve.h b/include/matx/operators/cgsolve.h index db1f508e..bbf4d2f4 100644 --- a/include/matx/operators/cgsolve.h +++ b/include/matx/operators/cgsolve.h @@ -88,10 +88,20 @@ namespace matx return out_dims_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept { + /* Albert -- Scenario where the sum() operator is on the RHS and sum has already + run previously. So we make tmp_out have a read permission as it will be read from */ + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + + template void Exec(Out &&out, Executor &&ex) const{ - static_assert(is_cuda_executor_v, "cgsolve() only supports the CUDA executor currently"); - cgsolve_impl(cuda::std::get<0>(out), a_, b_, tol_, max_iters_, ex.getStream()); + //static_assert(is_cuda_executor_v, "cgsolve() only supports the CUDA executor currently"); + auto output = cuda::std::get<0>(out); + output.PreRun(out_dims_, std::forward(ex)); + cgsolve_impl(cuda::std::get<0>(out), a_, b_, ex, tol_, max_iters_, ex.getStream()); } template diff --git a/include/matx/operators/constval.h b/include/matx/operators/constval.h index 3df28aff..32eec317 100644 --- a/include/matx/operators/constval.h +++ b/include/matx/operators/constval.h @@ -55,6 +55,9 @@ namespace matx __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ T operator()(Is...) 
const { return v_; }; + template + __MATX_INLINE__ void apply_dep_to_task([[maybe_unused]] Task &&task, [[maybe_unused]] int perm) const noexcept { } + constexpr inline __MATX_HOST__ __MATX_DEVICE__ auto Size(int dim) const { if constexpr (!is_noshape_v) { return *(s_.begin() + dim); diff --git a/include/matx/operators/conv.h b/include/matx/operators/conv.h index 37645bab..ba5497c7 100644 --- a/include/matx/operators/conv.h +++ b/include/matx/operators/conv.h @@ -149,16 +149,45 @@ namespace matx return out_dims_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept + { + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + template void Exec(Out &&out, Executor &&ex) const { MATX_ASSERT_STR(!(is_host_executor_v && method_ == MATX_C_METHOD_DIRECT), matxNotSupported, "direct conv1d() only supports the CUDA executor currently"); MATX_STATIC_ASSERT_STR((Rank() == cuda::std::tuple_element_t<0, remove_cvref_t>::Rank()), matxInvalidParameter, "conv1d: inputs and outputs must have same rank to use conv1d with axis parameter"); - if constexpr (!std::is_same_v) { - conv1d_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, mode_, method_, ex); + if constexpr (!is_cuda_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("conv_task"); + + auto output = cuda::std::get<0>(out); + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + b_.apply_dep_to_task(tsk, 1); + + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + if constexpr (!std::is_same_v) { + conv1d_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, mode_, method_, exec); + } + else { + conv1d_impl(cuda::std::get<0>(out), a_, b_, mode_, method_, exec); + } + }; } - else { - conv1d_impl(cuda::std::get<0>(out), a_, b_, mode_, method_, ex); + else if constexpr (is_cuda_executor_v) { + if constexpr (!std::is_same_v) { + conv1d_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, mode_, method_, ex); + } + else { + conv1d_impl(cuda::std::get<0>(out), a_, b_, mode_, method_, ex); + } } } @@ -343,14 +372,36 @@ namespace detail { template void Exec(Out &&out, Executor &&ex) const { - static_assert(is_cuda_executor_v, "conv2d() only supports the CUDA executor currently"); - - if constexpr (!std::is_same_v) { - conv2d_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, mode_, ex.getStream()); - } - else { - conv2d_impl(cuda::std::get<0>(out), a_, b_, mode_, ex.getStream()); - } + //static_assert(is_cuda_executor_v, "conv2d() only supports the CUDA executor currently"); + if constexpr (!is_cuda_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("conv_task"); + + auto output = cuda::std::get<0>(out); + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + b_.apply_dep_to_task(tsk, 1); + + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + if constexpr (!std::is_same_v) { + conv2d_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, mode_, exec.getStream()); + } + else { + conv2d_impl(cuda::std::get<0>(out), a_, b_, mode_, exec.getStream()); + } + }; + } + else if constexpr (is_cuda_executor_v) { + if constexpr (!std::is_same_v) { + conv2d_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, mode_, ex.getStream()); + } + else { + conv2d_impl(cuda::std::get<0>(out), a_, b_, mode_, ex.getStream()); + } + } } template diff --git a/include/matx/operators/fft.h b/include/matx/operators/fft.h index 
242169d1..9435d56a 100644 --- a/include/matx/operators/fft.h +++ b/include/matx/operators/fft.h @@ -145,24 +145,66 @@ namespace matx return out_dims_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, [[maybe_unused]] int perm=1) const noexcept + { + /* Scenario where the matvec() operator is on the RHS and op has already + run previously. So we make tmp_out have a read permission as it will be read from */ + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + template void Exec(Out &&out, Executor &&ex) const { - if constexpr (std::is_same_v) { - if constexpr (std::is_same_v) { - fft_impl(cuda::std::get<0>(out), a_, fft_size_, norm_, ex); + // stfexecutor case + if constexpr (is_stf_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("fft_task_no_perm"); + + auto output = cuda::std::get<0>(out); + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + fft_impl(output, a_, fft_size_, norm_, exec); + } + else { + ifft_impl(output, a_, fft_size_, norm_, exec); + } + } + else { + if constexpr (std::is_same_v) { + fft_impl(permute(output, perm_), permute(a_, perm_), fft_size_, norm_, exec); + } + else { + ifft_impl(permute(output, perm_), permute(a_, perm_), fft_size_, norm_, exec); + } + } + }; } + // cudaExecutor or host case else { - ifft_impl(cuda::std::get<0>(out), a_, fft_size_, norm_, ex); + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + fft_impl(cuda::std::get<0>(out), a_, fft_size_, norm_, ex); + } + else { + ifft_impl(cuda::std::get<0>(out), a_, fft_size_, norm_, ex); + } + } + else { + if constexpr (std::is_same_v) { + fft_impl(permute(cuda::std::get<0>(out), perm_), permute(a_, perm_), fft_size_, norm_, ex); + } + else { + ifft_impl(permute(cuda::std::get<0>(out), perm_), permute(a_, perm_), fft_size_, norm_, ex); + } + } } - } - else { - if constexpr (std::is_same_v) { - fft_impl(permute(cuda::std::get<0>(out), perm_), permute(a_, perm_), fft_size_, norm_, ex); - } - else { - ifft_impl(permute(cuda::std::get<0>(out), perm_), permute(a_, perm_), fft_size_, norm_, ex); - } - } } template @@ -494,4 +536,4 @@ namespace matx return detail::FFT2Op(a, perm, detail::ifft_t{}, norm); } -} \ No newline at end of file +} diff --git a/include/matx/operators/matmul.h b/include/matx/operators/matmul.h index 7d1b70f3..a88afecd 100644 --- a/include/matx/operators/matmul.h +++ b/include/matx/operators/matmul.h @@ -112,24 +112,65 @@ namespace matx return out_dims_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept + { + /* Scenario where the matmul() operator is on the RHS and op has already run + previously. So we make tmp_out have a read permission as it will be read from */ + tmp_out_.apply_dep_to_task(std::forward(task), 1); + } + template void Exec(Out &&out, Executor &&ex) const { // Perform SpMM or otherwise GEMM. 
static_assert(!is_sparse_tensor_v, "sparse rhs not implemented"); - if constexpr (is_sparse_tensor_v) { - if constexpr (!std::is_same_v) { - sparse_matmul_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, ex, alpha_, beta_); + + // stfexecutor case + if constexpr (!is_cuda_executor_v) { + auto ctx = ex.getCtx(); + auto tsk = ctx.task(); + tsk.set_symbol("matmul"); + + auto output = cuda::std::get<0>(out); + output.PreRun(out_dims_, std::forward(ex)); + output.apply_dep_to_task(tsk, 0); + a_.apply_dep_to_task(tsk, 1); + b_.apply_dep_to_task(tsk, 1); + + tsk->*[&](cudaStream_t s) { + auto exec = cudaExecutor(s); + if constexpr (is_sparse_tensor_v) { + if constexpr (!std::is_same_v) { + sparse_matmul_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, exec, alpha_, beta_); + } + else { + sparse_matmul_impl(cuda::std::get<0>(out), a_, b_, exec, alpha_, beta_); + } + } + else if constexpr (!std::is_same_v) { + matmul_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, exec, alpha_, beta_); + } + else { + matmul_impl(cuda::std::get<0>(out), a_, b_, exec, alpha_, beta_); + } + }; + } + else if constexpr (is_cuda_executor_v) { + if constexpr (is_sparse_tensor_v) { + if constexpr (!std::is_same_v) { + sparse_matmul_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, ex, alpha_, beta_); + } + else { + sparse_matmul_impl(cuda::std::get<0>(out), a_, b_, ex, alpha_, beta_); + } + } + else if constexpr (!std::is_same_v) { + matmul_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, ex, alpha_, beta_); } else { - sparse_matmul_impl(cuda::std::get<0>(out), a_, b_, ex, alpha_, beta_); + matmul_impl(cuda::std::get<0>(out), a_, b_, ex, alpha_, beta_); } } - else if constexpr (!std::is_same_v) { - matmul_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, ex, alpha_, beta_); - } - else { - matmul_impl(cuda::std::get<0>(out), a_, b_, ex, alpha_, beta_); - } } template diff --git a/include/matx/operators/matvec.h b/include/matx/operators/matvec.h index 7de2913f..1b4d3a1b 100644 --- a/include/matx/operators/matvec.h +++ b/include/matx/operators/matvec.h @@ -90,14 +90,44 @@ namespace matx return out_dims_[dim]; } + template + __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept + { + /* Scenario where the matvec() operator is on the RHS and op has already run + previously. 
+           previously, so we give tmp_out_ read permission as it will only be read from. */
+        tmp_out_.apply_dep_to_task(std::forward<Task>(task), 1);
+      }
+
       template <typename Out, typename Executor>
       void Exec(Out &&out, Executor &&ex) const{
-        static_assert(!is_sparse_tensor_v, "sparse rhs not implemented");
-        if constexpr (is_sparse_tensor_v) {
-          sparse_matvec_impl(cuda::std::get<0>(out), a_, b_, ex, alpha_, beta_);
-        } else {
-          matvec_impl(cuda::std::get<0>(out), a_, b_, ex, alpha_, beta_);
-        }
+        static_assert(!is_sparse_tensor_v, "sparse rhs not implemented");
+        // stfExecutor case
+        auto output = cuda::std::get<0>(out);
+        if constexpr (!is_cuda_executor_v<Executor>) {
+          auto ctx = ex.getCtx();
+          auto tsk = ctx.task();
+          tsk.set_symbol("matvec_task");
+          output.PreRun(out_dims_, std::forward<Executor>(ex));
+          output.apply_dep_to_task(tsk, 0);
+          a_.apply_dep_to_task(tsk, 1);
+          b_.apply_dep_to_task(tsk, 1);
+          tsk->*[&](cudaStream_t s) {
+            auto exec = cudaExecutor(s);
+            if constexpr (is_sparse_tensor_v) {
+              sparse_matvec_impl(output, a_, b_, exec, alpha_, beta_);
+            } else {
+              matvec_impl(output, a_, b_, exec, alpha_, beta_);
+            }
+          };
+        }
+        // cudaExecutor case
+        else if constexpr (is_cuda_executor_v<Executor>) {
+          if constexpr (is_sparse_tensor_v) {
+            sparse_matvec_impl(output, a_, b_, ex, alpha_, beta_);
+          } else {
+            matvec_impl(output, a_, b_, ex, alpha_, beta_);
+          }
+        }
       }
 
       template
diff --git a/include/matx/operators/max.h b/include/matx/operators/max.h
index f7d9da77..562c2a4f 100644
--- a/include/matx/operators/max.h
+++ b/include/matx/operators/max.h
@@ -72,9 +72,35 @@ namespace detail {
         return tmp_out_(indices...);
       }
 
+      template <typename Task>
+      __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept {
+        /* Scenario where the max() operator is on the RHS and has already been run
+           previously, so we give tmp_out_ read permission as it will only be read from. */
+        tmp_out_.apply_dep_to_task(std::forward<Task>(task), 1);
+      }
+
       template <typename Out, typename Executor>
       void Exec(Out &&out, Executor &&ex) const {
-        max_impl(cuda::std::get<0>(out), a_, ex);
+        auto output = cuda::std::get<0>(out);
+        // stfExecutor case
+        if constexpr (!is_cuda_executor_v<Executor>) {
+          auto ctx = ex.getCtx();
+          auto tsk = ctx.task();
+          tsk.set_symbol("max_task");
+
+          output.PreRun(out_dims_, std::forward<Executor>(ex));
+          output.apply_dep_to_task(tsk, 0);
+          a_.apply_dep_to_task(tsk, 1);
+
+          tsk->*[&](cudaStream_t s) {
+            auto exec = cudaExecutor(s);
+            max_impl(output, a_, exec);
+          };
+        }
+        // cudaExecutor case
+        else if constexpr (is_cuda_executor_v<Executor>) {
+          max_impl(output, a_, ex);
+        }
       }
 
       static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()
diff --git a/include/matx/operators/permute.h b/include/matx/operators/permute.h
index 0732b3ae..19fa287e 100644
--- a/include/matx/operators/permute.h
+++ b/include/matx/operators/permute.h
@@ -126,6 +126,14 @@ MATX_IGNORE_WARNING_POP_GCC
       return op_.Size(dims_[dim]);
     }
 
+    template <typename Task>
+    __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept
+    {
+      if constexpr (is_matx_op()) {
+        op_.apply_dep_to_task(std::forward<Task>(task), perm);
+      }
+    }
+
     template <typename ShapeType, typename Executor>
     __MATX_INLINE__ void PreRun(ShapeType &&shape, Executor &&ex) const noexcept
     {
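(Aside, illustrative only, not part of the patch: taken together, these apply_dep_to_task overloads walk an expression tree and register every leaf with a task using an access mode, where 0 denotes write and 1 denotes read. The assignment node marks its LHS as written and its RHS as read, element-wise and permute operators simply forward the mode to their children, and transforms register their temporary output as read when they appear on the RHS. Below is a self-contained toy sketch of that recursive walk, using made-up types rather than MatX's real classes.)

    #include <iostream>
    #include <string>
    #include <utility>

    struct Task {                                   // stand-in for a CUDASTF task
      void add_dep(const std::string &name, int mode) {
        std::cout << name << (mode == 0 ? ": write\n" : ": read\n");
      }
    };

    struct Leaf {                                   // stand-in for a tensor
      std::string name;
      template <typename T>
      void apply_dep_to_task(T &&task, int mode) const { task.add_dep(name, mode); }
    };

    template <typename Op>
    struct Unary {                                  // element-wise op: forward mode to its child
      Op child;
      template <typename T>
      void apply_dep_to_task(T &&task, int mode) const {
        child.apply_dep_to_task(std::forward<T>(task), mode);
      }
    };

    template <typename Lhs, typename Rhs>
    struct Assign {                                 // assignment: LHS is written, RHS is read
      Lhs lhs;
      Rhs rhs;
      template <typename T>
      void apply_dep_to_task(T &&task, int) const {
        lhs.apply_dep_to_task(task, 0);             // 0 -> write access
        rhs.apply_dep_to_task(task, 1);             // 1 -> read access
      }
    };

    int main() {
      Task tsk;
      Assign<Leaf, Unary<Leaf>> expr{{"C"}, {{"A"}}};  // roughly "C = op(A)"
      expr.apply_dep_to_task(tsk, 0);               // prints: C: write, then A: read
      return 0;
    }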
diff --git a/include/matx/operators/set.h b/include/matx/operators/set.h
index 811181f4..50faa0e1 100644
--- a/include/matx/operators/set.h
+++ b/include/matx/operators/set.h
@@ -147,6 +147,20 @@ class set : public BaseOp> {
     return res;
   }
 
+  template <typename Task>
+  __MATX_INLINE__ void apply_dep_to_task(Task &&task, [[maybe_unused]] int perm=0) const noexcept
+  {
+    // LHS
+    if constexpr (is_matx_op()) {
+      out_.apply_dep_to_task(std::forward<Task>(task), 0);
+    }
+    // RHS
+    if constexpr (is_matx_op()) {
+      op_.apply_dep_to_task(std::forward<Task>(task), 1);
+    }
+  }
+
+
   template <typename ShapeType, typename Executor>
   __MATX_INLINE__ void PreRun(ShapeType &&shape, Executor &&ex) const noexcept
   {
diff --git a/include/matx/operators/sum.h b/include/matx/operators/sum.h
index eec0d9b7..62c5853d 100644
--- a/include/matx/operators/sum.h
+++ b/include/matx/operators/sum.h
@@ -72,9 +72,35 @@ namespace detail {
         return tmp_out_(indices...);
       }
 
+      template <typename Task>
+      __MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept {
+        /* Scenario where the sum() operator is on the RHS and has already been run
+           previously, so we give tmp_out_ read permission as it will only be read from. */
+        tmp_out_.apply_dep_to_task(std::forward<Task>(task), 1);
+      }
+
       template <typename Out, typename Executor>
       void Exec(Out &&out, Executor &&ex) const {
-        sum_impl(cuda::std::get<0>(out), a_, ex);
+        auto output = cuda::std::get<0>(out);
+        // stfExecutor case
+        if constexpr (!is_cuda_executor_v<Executor>) {
+          auto ctx = ex.getCtx();
+          auto tsk = ctx.task();
+          tsk.set_symbol("sum_task");
+
+          output.PreRun(out_dims_, std::forward<Executor>(ex));
+          output.apply_dep_to_task(tsk, 0);
+          a_.apply_dep_to_task(tsk, 1);
+
+          tsk->*[&](cudaStream_t s) {
+            auto exec = cudaExecutor(s);
+            sum_impl(output, a_, exec);
+          };
+        }
+        // cudaExecutor case
+        else if constexpr (is_cuda_executor_v<Executor>) {
+          sum_impl(output, a_, ex);
+        }
       }
 
       static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()
diff --git a/include/matx/operators/unary_operators.h b/include/matx/operators/unary_operators.h
index dbf1afdb..0b21ace1 100644
--- a/include/matx/operators/unary_operators.h
+++ b/include/matx/operators/unary_operators.h
@@ -103,6 +103,12 @@ namespace matx
       return size_[dim];
     }
 
+    template <typename Task, typename Perm>
+    __MATX_INLINE__ void apply_dep_to_task(Task &&task, Perm perm) const noexcept
+    {
+      in1_.apply_dep_to_task(std::forward<Task>(task), perm);
+    }
+
     template <typename ShapeType, typename Executor>
     __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, [[maybe_unused]] Executor &&ex) const noexcept
     {
diff --git a/include/matx/transforms/cgsolve.h b/include/matx/transforms/cgsolve.h
index 9d8f6d41..05874bf4 100644
--- a/include/matx/transforms/cgsolve.h
+++ b/include/matx/transforms/cgsolve.h
@@ -58,8 +58,8 @@ namespace matx
   *   cuda Stream to execute on
   *
   */
-  template <typename XType, typename AType, typename BType>
-  __MATX_INLINE__ void cgsolve_impl(XType X, AType A, BType B, double tol=1e-6, int max_iters=4, cudaStream_t stream=0)
+  template <typename XType, typename AType, typename BType, typename Executor>
+  __MATX_INLINE__ void cgsolve_impl(XType X, AType A, BType B, Executor &&exec, double tol=1e-6, int max_iters=4, cudaStream_t stream=0)
   {
     using value_type = typename XType::value_type;
     const int VRANK = XType::Rank();
@@ -119,15 +119,19 @@ namespace matx
     auto pApc = clone(pAp, clone_shape);
 
     // A*X
-    (Ap = matvec(A, X)).run(stream);
+    //(Ap = matvec(A, X)).run(stream);
+    (Ap = matvec(A, X)).run(exec);
 
     // r0 = B - A*X
     // p = r0
-    (p = r0 = B - Ap).run(stream);
+    //(p = r0 = B - Ap).run(stream);
+    (p = r0 = B - Ap).run(exec);
 
-    (r0r0 = sum(r0*r0)).run(stream);
+    //(r0r0 = sum(r0*r0)).run(stream);
+    (r0r0 = sum(r0*r0)).run(exec);
 
     if(tol>0.0f) {
-      (converged = matx::all(as_int(sqrt(r0r0) < tol))).run(stream);
+      //(converged = matx::all(as_int(sqrt(r0r0) < tol))).run(stream);
+      (converged = matx::all(as_int(sqrt(r0r0) < tol))).run(exec);
 
       cudaEventRecord(event, stream);
       cudaStreamWaitEvent(d2h, event);
 
@@ -136,10 +140,12 @@ namespace matx
     int i;
     for (i = 0 ; i < max_iters; i++) {
       // Ap = matvec(A, p)
-      (Ap = matvec(A, p)).run(stream);
+      //(Ap = matvec(A, p)).run(stream);
+      (Ap = matvec(A, p)).run(exec);
 
       // pAp = dot(p,Ap)
-      (pAp = sum(p*Ap)).run(stream);
+      //(pAp = sum(p*Ap)).run(stream);
+      (pAp = sum(p*Ap)).run(exec);
 
       // if pAp is zero then we have exactly numerically converged.
       // However, this is batched so we may iterate more.  Iterating
@@ -151,10 +157,12 @@ namespace matx
       auto updateOp = ( r1 = r0 - (r0r0c/pApc) * Ap,
                         X = X + (r0r0c/pApc) * p);
-      (IF( pApc != value_type(0), updateOp)).run(stream);
+      //(IF( pApc != value_type(0), updateOp)).run(stream);
+      (IF( pApc != value_type(0), updateOp)).run(exec);
 
       // r1r1 = dot(r1, r1)
-      (r1r1 = sum(r1*r1)).run(stream);
+      //(r1r1 = sum(r1*r1)).run(stream);
+      (r1r1 = sum(r1*r1)).run(exec);
 
       if(tol>0.0f) {
         // copy convergence criteria to host.
@@ -167,7 +175,8 @@ namespace matx
           break;
         }
 
-        (converged = matx::all(as_int(sqrt(r1r1) < tol))).run(stream);
+        //(converged = matx::all(as_int(sqrt(r1r1) < tol))).run(stream);
+        (converged = matx::all(as_int(sqrt(r1r1) < tol))).run(exec);
 
         cudaEventRecord(event, stream);
         cudaStreamWaitEvent(d2h, event);
@@ -175,7 +184,8 @@ namespace matx
 
       // p = r1 + b * p
       auto updateP = ( p = r1 + (r1r1c/r0r0c) * p);
-      (IF( pApc != value_type(0), updateP)).run(stream);
+      //(IF( pApc != value_type(0), updateP)).run(stream);
+      (IF( pApc != value_type(0), updateP)).run(exec);
 
       // Advance residual
       swap(r0r0, r1r1);
diff --git a/include/matx/transforms/fft/fft_cuda.h b/include/matx/transforms/fft/fft_cuda.h
index c2eaa9c2..de9089b7 100644
--- a/include/matx/transforms/fft/fft_cuda.h
+++ b/include/matx/transforms/fft/fft_cuda.h
@@ -323,7 +323,11 @@ template class matxCUDAFFTPlan_t
   virtual ~matxCUDAFFTPlan_t()
   {
     if (this->workspace_ != nullptr) {
      // Pass the default stream until we allow user-deletable caches
+      /* Temporarily skip this free, as we likely don't want to introduce a
+         dependence on cudaStreamDefault */
+#if 0
       matxFree(workspace_, cudaStreamDefault);
+#endif
       this->workspace_ = nullptr;
     }
 
@@ -414,6 +418,11 @@ matxCUDAFFTPlan1D_t(OutTensorType &o, const InTensorType &i, cudaStream_t stream
       }
     }
 
+    // Assert that the stream is not currently being captured
+    cudaStreamCaptureStatus status;
+    cudaStreamIsCapturing(stream, &status);
+    MATX_ASSERT(status == cudaStreamCaptureStatusNone, matxCufftError);
+
     size_t workspaceSize;
     cufftCreate(&this->plan_);
     [[maybe_unused]] cufftResult error;