Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 26 additions & 2 deletions csrc/binding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@

#include <utility>
#include <random>
#include <thread>
#include "manager.h"
#include "utils.h"

int supervisor_main(int sock_fd);

Expand All @@ -22,8 +24,30 @@ void do_bench(int result_fd, int input_fd, const std::string& kernel_qualname, c
signature.allocate(32, rng);
auto config = read_benchmark_parameters(input_fd, signature.data());
auto mgr = make_benchmark_manager(result_fd, std::move(signature), config.Seed, discard, nvtx, landlock, mseal, supervisor_sock_fd);
auto [args, expected] = mgr->setup_benchmark(nb::cast<nb::callable>(test_generator), test_kwargs, config.Repeats);
mgr->do_bench_py(kernel_qualname, args, expected, reinterpret_cast<cudaStream_t>(stream));

{
nb::gil_scoped_release release;
std::exception_ptr thread_exception;
int device;
CUDA_CHECK(cudaGetDevice(&device));
std::thread run_thread ([&]()
{
try {
CUDA_CHECK(cudaSetDevice(device));
nb::gil_scoped_acquire acquire;
auto [args, expected] = mgr->setup_benchmark(nb::cast<nb::callable>(test_generator), test_kwargs, config.Repeats);
mgr->do_bench_py(kernel_qualname, args, expected, reinterpret_cast<cudaStream_t>(stream));
} catch (...) {
thread_exception = std::current_exception();
}
});
run_thread.join();
if (thread_exception)
std::rethrow_exception(thread_exception);
}

mgr->send_report();
mgr->clean_up();
}


Expand Down
193 changes: 102 additions & 91 deletions csrc/manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,8 @@ BenchmarkManager::BenchmarkManager(std::byte* arena, std::size_t arena_size,
mEndEvents(&mResource),
mExpectedOutputs(&mResource),
mShadowArguments(&mResource),
mOutputBuffers(&mResource)
mOutputBuffers(&mResource),
mTestOrder(&mResource)
{
int device;
CUDA_CHECK(cudaGetDevice(&device));
Expand Down Expand Up @@ -336,39 +337,6 @@ void BenchmarkManager::install_protections() {
install_seccomp_filter();
}

// Runs the kernel repeatedly until roughly mWarmupSeconds of wall time has
// elapsed, both to warm up GPU/JIT state and to obtain a per-call time
// estimate. Returns the number of timed benchmark calls to perform, derived
// from mBenchmarkSeconds / estimate and clamped to the number of prepared
// output buffers.
//
// Throws std::runtime_error if no output buffers are available or if the
// kernel is too slow to yield at least three meaningful benchmark runs.
int BenchmarkManager::run_warmup(nb::callable& kernel, const nb::tuple& args, cudaStream_t stream) {
    const auto cpu_start = std::chrono::high_resolution_clock::now();
    int warmup_run_count = 0;
    double time_estimate;
    nvtx_push("timing");
    while (true) {
        // note: we are assuming here that calling the kernel multiple times for the same input is a safe operation
        // this is only potentially problematic for in-place kernels;
        CUDA_CHECK(cudaDeviceSynchronize());
        clear_cache(stream);
        kernel(*args);
        // sync so the elapsed measurement covers the full device-side work
        CUDA_CHECK(cudaDeviceSynchronize());
        const auto cpu_end = std::chrono::high_resolution_clock::now();
        const std::chrono::duration<double> elapsed_seconds = cpu_end - cpu_start;
        ++warmup_run_count;
        if (elapsed_seconds.count() > mWarmupSeconds) {
            time_estimate = elapsed_seconds.count() / warmup_run_count;
            break;
        }
    }
    nvtx_pop();

    // note: this is a very conservative estimate. Timing above was measured with syncs between every kernel.
    // Cast before subtracting: size() is unsigned, so `size() - 1` on an empty
    // vector would wrap around to a huge value instead of going negative.
    const int calls = static_cast<int>(mOutputBuffers.size()) - 1;
    if (calls < 1) {
        // std::clamp has undefined behaviour when hi < lo, so reject the
        // degenerate "no usable output buffers" case explicitly.
        throw std::runtime_error("Not enough output buffers prepared to run the benchmark");
    }
    const int actual_calls = std::clamp(static_cast<int>(std::ceil(mBenchmarkSeconds / time_estimate)), 1, calls);

    if (actual_calls < 3) {
        throw std::runtime_error("The initial speed test indicated that running times are too slow to generate meaningful benchmark numbers: " + std::to_string(time_estimate));
    }

    return actual_calls;
}

static inline std::uintptr_t page_mask() {
std::uintptr_t page_size = getpagesize();
return ~(page_size - 1u);
Expand All @@ -381,58 +349,89 @@ void protect_range(void* ptr, size_t size, int prot) {
throw std::system_error(errno, std::system_category(), "mprotect");
}

nb::callable BenchmarkManager::get_kernel(const std::string& qualname, const nb::tuple& call_args) {
nb::gil_scoped_release release;
const std::uintptr_t lo = reinterpret_cast<std::uintptr_t>(this->mArena);
// Optionally installs the seccomp memory-notify filter for the arena range
// [lo, hi) via the supervisor socket, then closes the socket. A negative
// socket means "no supervisor"; nothing is done in that case. The descriptor
// is closed on both the success and the failure path before any exception
// propagates to the caller.
static void setup_seccomp(int sock, bool install_notify, std::uintptr_t lo, std::uintptr_t hi) {
    if (sock < 0)
        return;
    // Funnel every path through a single close() call; stash a thrown
    // exception and re-raise it only after the descriptor is released.
    std::exception_ptr pending;
    if (install_notify) {
        try {
            seccomp_install_memory_notify(sock, lo, hi);
        } catch (...) {
            pending = std::current_exception();
        }
    }
    close(sock);
    if (pending)
        std::rethrow_exception(pending);
}

// Launches `kernel` in a sync / cache-clear / launch / sync cycle until more
// than `warmup_seconds` of wall time has accumulated, and returns the mean
// wall-clock seconds per call. `cc_memory`/`l2_clear_size`/`discard_cache`
// parameterize the cache-clearing pass between launches.
static double run_warmup_loop(nb::callable& kernel, const nb::tuple& args, cudaStream_t stream,
                              void* cc_memory, std::size_t l2_clear_size, bool discard_cache,
                              double warmup_seconds) {
    using Clock = std::chrono::high_resolution_clock;

    // Drain any previously queued GPU work so timing starts from an idle device.
    CUDA_CHECK(cudaDeviceSynchronize());
    const Clock::time_point start = Clock::now();

    for (int iterations = 1; ; ++iterations) {
        ::clear_cache(cc_memory, 2 * l2_clear_size, discard_cache, stream);
        kernel(*args);
        CUDA_CHECK(cudaDeviceSynchronize());

        const double total_seconds =
            std::chrono::duration<double>(Clock::now() - start).count();
        if (total_seconds > warmup_seconds)
            return total_seconds / iterations;  // mean seconds per kernel call
    }
}

nb::callable BenchmarkManager::initial_kernel_setup(double& time_estimate, const std::string& qualname,
const nb::tuple& call_args, cudaStream_t stream) {
const std::uintptr_t lo = reinterpret_cast<std::uintptr_t>(mArena);
const std::uintptr_t hi = lo + BenchmarkManagerArenaSize;

// snapshot all member state needed in the thread before protecting the arena
const int sock = mSupervisorSock;
const bool install_notify = mSeal || supports_seccomp_notify();
const double warmup_seconds = mWarmupSeconds;
void* const cc_memory = mDeviceDummyMemory;
const std::size_t l2_clear_size = mL2CacheSize;
const bool discard_cache = mDiscardCache;
int device;
CUDA_CHECK(cudaGetDevice(&device));

nb::callable kernel;
std::exception_ptr thread_exception;
int sock = mSupervisorSock;
bool install_notify = mSeal || supports_seccomp_notify();

nvtx_push("trigger-compile");

// make the BenchmarkManager inaccessible
protect_range(reinterpret_cast<void*>(lo), hi - lo, PROT_NONE);
// TODO make stack inaccessible (may be impossible) or read-only during the call
// call the python kernel generation function from a different thread.

std::thread make_kernel_thread([&kernel, sock, lo, hi, qualname, &call_args, &thread_exception, install_notify]() {
try {
if (sock >= 0) {
try {
if (install_notify)
seccomp_install_memory_notify(sock, lo, hi);
} catch (...) {
close(sock);
throw;
}
close(sock);

{
nb::gil_scoped_release release;
std::thread worker([&] {
try {
CUDA_CHECK(cudaSetDevice(device));
setup_seccomp(sock, install_notify, lo, hi);

nb::gil_scoped_acquire guard;

kernel = kernel_from_qualname(qualname);
CUDA_CHECK(cudaDeviceSynchronize());
kernel(*call_args); // trigger JIT compile

time_estimate = run_warmup_loop(kernel, call_args, stream,
cc_memory, l2_clear_size, discard_cache,
warmup_seconds);
} catch (...) {
thread_exception = std::current_exception();
}
nb::gil_scoped_acquire guard;
kernel = kernel_from_qualname(qualname);

// ok, first run for compilations etc
CUDA_CHECK(cudaDeviceSynchronize());
kernel(*call_args);
CUDA_CHECK(cudaDeviceSynchronize());
} catch (...) {
thread_exception = std::current_exception();
}
});
});
worker.join();
}

make_kernel_thread.join();
// make it accessible again. This is in the original thread, so the tightened seccomp
// policy does not apply here.
protect_range(reinterpret_cast<void*>(lo), hi - lo, PROT_READ | PROT_WRITE);
// closed now, so set to -1
mSupervisorSock = -1;
nvtx_pop();

if (thread_exception) {
if (thread_exception)
std::rethrow_exception(thread_exception);
}

return kernel;
}
Expand All @@ -446,21 +445,33 @@ void BenchmarkManager::do_bench_py(
setup_test_cases(args, expected, stream);
install_protections();

// at this point, we call user code as we import the kernel (executing arbitrary top-level code)
// after this, we cannot trust python anymore
nb::callable kernel = get_kernel(kernel_qualname, args.at(0));

// now, run a few more times for warmup; in total aim for 1 second of warmup runs
int actual_calls = run_warmup(kernel, args.at(0), stream);
constexpr int DRY_EVENTS = 100;
const int num_events = std::max(actual_calls, DRY_EVENTS);
constexpr std::size_t DRY_EVENTS = 100;
const std::size_t num_events = std::max(mShadowArguments.size(), DRY_EVENTS);
mStartEvents.resize(num_events);
mEndEvents.resize(num_events);
for (int i = 0; i < num_events; i++) {
CUDA_CHECK(cudaEventCreate(&mStartEvents.at(i)));
CUDA_CHECK(cudaEventCreate(&mEndEvents.at(i)));
}

// dry run -- measure overhead of events
mMedianEventTime = measure_event_overhead(DRY_EVENTS, stream);

double time_estimate = 0.0;
// at this point, we call user code as we import the kernel (executing arbitrary top-level code)
// after this, we cannot trust python anymore
nb::callable kernel = initial_kernel_setup(time_estimate, kernel_qualname, args.at(0), stream);

int calls = mOutputBuffers.size() - 1;
const int actual_calls = std::clamp(
static_cast<int>(std::ceil(mBenchmarkSeconds / time_estimate)), 1, calls);

if (actual_calls < 3) {
throw std::runtime_error(
"The initial speed test indicated that running times are too slow to generate "
"meaningful benchmark numbers: " + std::to_string(time_estimate));
}

// pick a random spot for the unsigned
// initialize the whole area with random junk; the error counter
// will be shifted by the initial value, so just writing zero
Expand All @@ -476,20 +487,17 @@ void BenchmarkManager::do_bench_py(
mDeviceErrorCounter = mDeviceErrorBase + offset;
mErrorCountShift = noise.at(offset);

// dry run -- measure overhead of events
float median_event_time = measure_event_overhead(DRY_EVENTS, stream);

// create a randomized order for running the tests
std::vector<int> test_order(actual_calls);
std::iota(test_order.begin(), test_order.end(), 1);
std::shuffle(test_order.begin(), test_order.end(), rng);
mTestOrder.resize(actual_calls);
std::iota(mTestOrder.begin(), mTestOrder.end(), 1);
std::shuffle(mTestOrder.begin(), mTestOrder.end(), rng);

std::uniform_int_distribution<unsigned> check_seed_generator(0, 0xffffffff);

nvtx_push("benchmark");
// now do the real runs
for (int i = 0; i < actual_calls; i++) {
int test_id = test_order.at(i);
const int test_id = mTestOrder.at(i);
// page-in real inputs. If the user kernel runs on the wrong stream, it's likely it won't see the correct inputs
// unfortunately, we need to do this before clearing the cache, so there is a window of opportunity
// *but* we deliberately modify a small subset of the inputs, which only get corrected immediately before
Expand Down Expand Up @@ -522,26 +530,29 @@ void BenchmarkManager::do_bench_py(
validate_result(mExpectedOutputs.at(test_id), mOutputBuffers.at(test_id), check_seed_generator(rng), stream);
}
nvtx_pop();
}

cudaEventSynchronize(mEndEvents.back());
void BenchmarkManager::send_report() {
CUDA_CHECK(cudaEventSynchronize(mEndEvents.at(mTestOrder.size() - 1)));
unsigned error_count;
CUDA_CHECK(cudaMemcpy(&error_count, mDeviceErrorCounter, sizeof(unsigned), cudaMemcpyDeviceToHost));
// subtract the nuisance shift that we applied to the counter
error_count -= mErrorCountShift;

std::string message = build_result_message(test_order, error_count, median_event_time);
std::string message = build_result_message(mTestOrder, error_count, mMedianEventTime);
message = encrypt_message(mSignature.data(), 32, message);
fwrite(message.data(), 1, message.size(), mOutputPipe);
fflush(mOutputPipe);
}

// Destroys every CUDA timing event created for the benchmark run and empties
// the bookkeeping vectors so the manager can be torn down cleanly.
void BenchmarkManager::clean_up() {
    const auto destroy_all = [](auto& events) {
        for (auto& ev : events)
            CUDA_CHECK(cudaEventDestroy(ev));
        events.clear();
    };
    destroy_all(mStartEvents);
    destroy_all(mEndEvents);
}

std::string BenchmarkManager::build_result_message(const std::vector<int>& test_order, unsigned error_count, float median_event_time) const {
std::string BenchmarkManager::build_result_message(const std::pmr::vector<int>& test_order, unsigned error_count, float median_event_time) const {
std::ostringstream oss;

oss << "event-overhead\t" << median_event_time * 1000 << " µs\n";
Expand Down
10 changes: 7 additions & 3 deletions csrc/manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ class BenchmarkManager {
public:
std::pair<std::vector<nb::tuple>, std::vector<nb::tuple>> setup_benchmark(const nb::callable& generate_test_case, const nb::dict& kwargs, int repeats);
void do_bench_py(const std::string& kernel_qualname, const std::vector<nb::tuple>& args, const std::vector<nb::tuple>& expected, cudaStream_t stream);
void send_report();
void clean_up();
private:
friend BenchmarkManagerPtr make_benchmark_manager(int result_fd, ObfuscatedHexDigest signature, std::uint64_t seed, bool discard, bool nvtx, bool landlock, bool mseal, int supervisor_socket);
friend BenchmarkManagerDeleter;
Expand Down Expand Up @@ -113,10 +115,13 @@ class BenchmarkManager {
std::pmr::vector<Expected> mExpectedOutputs;
std::pmr::vector<ShadowArgumentList> mShadowArguments;
std::pmr::vector<nb_cuda_array> mOutputBuffers;
std::pmr::vector<int> mTestOrder;

FILE* mOutputPipe = nullptr;
ObfuscatedHexDigest mSignature;

float mMedianEventTime = -1.f;

static ShadowArgumentList make_shadow_args(const nb::tuple& args, cudaStream_t stream,
std::pmr::memory_resource* resource);

Expand All @@ -130,10 +135,9 @@ class BenchmarkManager {
void setup_test_cases(const std::vector<nb::tuple>& args, const std::vector<nb::tuple>& expected, cudaStream_t stream);

void install_protections();
int run_warmup(nb::callable& kernel, const nb::tuple& args, cudaStream_t stream);
nb::callable get_kernel(const std::string& qualname, const nb::tuple& call_args);
nb::callable initial_kernel_setup(double& time_estimate, const std::string& qualname, const nb::tuple& call_args, cudaStream_t stream);

[[nodiscard]] std::string build_result_message(const std::vector<int>& test_order, unsigned error_count, float median_event_time) const;
[[nodiscard]] std::string build_result_message(const std::pmr::vector<int>& test_order, unsigned error_count, float median_event_time) const;


// debug only: Any sort of test exploit that targets specific values of this class is going to be brittle,
Expand Down
Loading