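fixes: propagate benchmark-thread exceptions, set the CUDA device in worker threads, check cudaEventSynchronize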

ngc92 · ngc92 · commit bfa3dfe220cd · 2026-03-28T00:21:35.000+01:00
diff --git a/csrc/binding.cpp b/csrc/binding.cpp
@@ -9,6 +9,7 @@
 #include <random>
 #include <thread>
 #include "manager.h"
+#include "utils.h"
 
 int supervisor_main(int sock_fd);
 
@@ -23,14 +24,28 @@ void do_bench(int result_fd, int input_fd, const std::string& kernel_qualname, c
     signature.allocate(32, rng);
     auto config = read_benchmark_parameters(input_fd, signature.data());
     auto mgr = make_benchmark_manager(result_fd, std::move(signature), config.Seed, discard, nvtx, landlock, mseal, supervisor_sock_fd);
-    nb::gil_scoped_release release;
-    std::thread run_thread ([&]()
+
     {
-        nb::gil_scoped_acquire acquire;
-        auto [args, expected] = mgr->setup_benchmark(nb::cast<nb::callable>(test_generator), test_kwargs, config.Repeats);
-        mgr->do_bench_py(kernel_qualname, args, expected, reinterpret_cast<cudaStream_t>(stream));
-    });
-    run_thread.join();
+        nb::gil_scoped_release release;
+        std::exception_ptr thread_exception;
+        int device;
+        CUDA_CHECK(cudaGetDevice(&device));
+        std::thread run_thread ([&]()
+        {
+            try {
+                 CUDA_CHECK(cudaSetDevice(device));
+                 nb::gil_scoped_acquire acquire;
+                 auto [args, expected] = mgr->setup_benchmark(nb::cast<nb::callable>(test_generator), test_kwargs, config.Repeats);
+                 mgr->do_bench_py(kernel_qualname, args, expected, reinterpret_cast<cudaStream_t>(stream));
+             } catch (...) {
+                 thread_exception = std::current_exception();
+             }
+        });
+        run_thread.join();
+        if (thread_exception)
+            std::rethrow_exception(thread_exception);
+    }
+
     mgr->send_report();
     mgr->clean_up();
 }
diff --git a/csrc/manager.cpp b/csrc/manager.cpp
@@ -181,7 +181,8 @@ BenchmarkManager::BenchmarkManager(std::byte* arena, std::size_t arena_size,
       mEndEvents(&mResource),
       mExpectedOutputs(&mResource),
       mShadowArguments(&mResource),
-      mOutputBuffers(&mResource)
+      mOutputBuffers(&mResource),
+      mTestOrder(&mResource)
 {
     int device;
     CUDA_CHECK(cudaGetDevice(&device));
@@ -393,6 +394,8 @@ nb::callable BenchmarkManager::initial_kernel_setup(double& time_estimate, const
     void* const cc_memory = mDeviceDummyMemory;
     const std::size_t l2_clear_size = mL2CacheSize;
     const bool discard_cache = mDiscardCache;
+    int device;
+    CUDA_CHECK(cudaGetDevice(&device));
 
     nb::callable kernel;
     std::exception_ptr thread_exception;
@@ -404,6 +407,7 @@ nb::callable BenchmarkManager::initial_kernel_setup(double& time_estimate, const
         nb::gil_scoped_release release;
         std::thread worker([&] {
             try {
+                CUDA_CHECK(cudaSetDevice(device));
                 setup_seccomp(sock, install_notify, lo, hi);
 
                 nb::gil_scoped_acquire guard;
@@ -529,7 +533,7 @@ void BenchmarkManager::do_bench_py(
 }
 
 void BenchmarkManager::send_report() {
-    cudaEventSynchronize(mEndEvents.back());
+    CUDA_CHECK(cudaEventSynchronize(mEndEvents.at(mTestOrder.size() - 1)));
     unsigned error_count;
     CUDA_CHECK(cudaMemcpy(&error_count, mDeviceErrorCounter, sizeof(unsigned), cudaMemcpyDeviceToHost));
     // subtract the nuisance shift that we applied to the counter