@@ -181,7 +181,8 @@ BenchmarkManager::BenchmarkManager(std::byte* arena, std::size_t arena_size,
181181 mEndEvents(&mResource ),
182182 mExpectedOutputs(&mResource ),
183183 mShadowArguments(&mResource ),
184- mOutputBuffers(&mResource )
184+ mOutputBuffers(&mResource ),
185+ mTestOrder(&mResource ),
185186{
186187 int device;
187188 CUDA_CHECK (cudaGetDevice (&device));
@@ -393,6 +394,8 @@ nb::callable BenchmarkManager::initial_kernel_setup(double& time_estimate, const
393394 void * const cc_memory = mDeviceDummyMemory ;
394395 const std::size_t l2_clear_size = mL2CacheSize ;
395396 const bool discard_cache = mDiscardCache ;
397+ int device;
398+ CUDA_CHECK (cudaGetDevice (&device));
396399
397400 nb::callable kernel;
398401 std::exception_ptr thread_exception;
@@ -404,6 +407,7 @@ nb::callable BenchmarkManager::initial_kernel_setup(double& time_estimate, const
404407 nb::gil_scoped_release release;
405408 std::thread worker ([&] {
406409 try {
410+ CUDA_CHECK (cudaSetDevice (device));
407411 setup_seccomp (sock, install_notify, lo, hi);
408412
409413 nb::gil_scoped_acquire guard;
@@ -529,7 +533,7 @@ void BenchmarkManager::do_bench_py(
529533}
530534
531535void BenchmarkManager::send_report () {
532- cudaEventSynchronize (mEndEvents .back ( ));
536+ CUDA_CHECK ( cudaEventSynchronize (mEndEvents .at ( mTestOrder . size () - 1 ) ));
533537 unsigned error_count;
534538 CUDA_CHECK (cudaMemcpy (&error_count, mDeviceErrorCounter , sizeof (unsigned ), cudaMemcpyDeviceToHost));
535539 // subtract the nuisance shift that we applied to the counter
0 commit comments