Skip to content

Commit bfa3dfe

Browse files
committed
fixes
1 parent 044007c commit bfa3dfe

2 files changed

Lines changed: 28 additions & 9 deletions

File tree

csrc/binding.cpp

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include <random>
1010
#include <thread>
1111
#include "manager.h"
12+
#include "utils.h"
1213

1314
int supervisor_main(int sock_fd);
1415

@@ -23,14 +24,28 @@ void do_bench(int result_fd, int input_fd, const std::string& kernel_qualname, c
2324
signature.allocate(32, rng);
2425
auto config = read_benchmark_parameters(input_fd, signature.data());
2526
auto mgr = make_benchmark_manager(result_fd, std::move(signature), config.Seed, discard, nvtx, landlock, mseal, supervisor_sock_fd);
26-
nb::gil_scoped_release release;
27-
std::thread run_thread ([&]()
27+
2828
{
29-
nb::gil_scoped_acquire acquire;
30-
auto [args, expected] = mgr->setup_benchmark(nb::cast<nb::callable>(test_generator), test_kwargs, config.Repeats);
31-
mgr->do_bench_py(kernel_qualname, args, expected, reinterpret_cast<cudaStream_t>(stream));
32-
});
33-
run_thread.join();
29+
nb::gil_scoped_release release;
30+
std::exception_ptr thread_exception;
31+
int device;
32+
CUDA_CHECK(cudaGetDevice(&device));
33+
std::thread run_thread ([&]()
34+
{
35+
try {
36+
CUDA_CHECK(cudaSetDevice(device));
37+
nb::gil_scoped_acquire acquire;
38+
auto [args, expected] = mgr->setup_benchmark(nb::cast<nb::callable>(test_generator), test_kwargs, config.Repeats);
39+
mgr->do_bench_py(kernel_qualname, args, expected, reinterpret_cast<cudaStream_t>(stream));
40+
} catch (...) {
41+
thread_exception = std::current_exception();
42+
}
43+
});
44+
run_thread.join();
45+
if (thread_exception)
46+
std::rethrow_exception(thread_exception);
47+
}
48+
3449
mgr->send_report();
3550
mgr->clean_up();
3651
}

csrc/manager.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,8 @@ BenchmarkManager::BenchmarkManager(std::byte* arena, std::size_t arena_size,
181181
mEndEvents(&mResource),
182182
mExpectedOutputs(&mResource),
183183
mShadowArguments(&mResource),
184-
mOutputBuffers(&mResource)
184+
mOutputBuffers(&mResource),
185+
mTestOrder(&mResource),
185186
{
186187
int device;
187188
CUDA_CHECK(cudaGetDevice(&device));
@@ -393,6 +394,8 @@ nb::callable BenchmarkManager::initial_kernel_setup(double& time_estimate, const
393394
void* const cc_memory = mDeviceDummyMemory;
394395
const std::size_t l2_clear_size = mL2CacheSize;
395396
const bool discard_cache = mDiscardCache;
397+
int device;
398+
CUDA_CHECK(cudaGetDevice(&device));
396399

397400
nb::callable kernel;
398401
std::exception_ptr thread_exception;
@@ -404,6 +407,7 @@ nb::callable BenchmarkManager::initial_kernel_setup(double& time_estimate, const
404407
nb::gil_scoped_release release;
405408
std::thread worker([&] {
406409
try {
410+
CUDA_CHECK(cudaSetDevice(device));
407411
setup_seccomp(sock, install_notify, lo, hi);
408412

409413
nb::gil_scoped_acquire guard;
@@ -529,7 +533,7 @@ void BenchmarkManager::do_bench_py(
529533
}
530534

531535
void BenchmarkManager::send_report() {
532-
cudaEventSynchronize(mEndEvents.back());
536+
CUDA_CHECK(cudaEventSynchronize(mEndEvents.at(mTestOrder.size() - 1)));
533537
unsigned error_count;
534538
CUDA_CHECK(cudaMemcpy(&error_count, mDeviceErrorCounter, sizeof(unsigned), cudaMemcpyDeviceToHost));
535539
// subtract the nuisance shift that we applied to the counter

0 commit comments

Comments
 (0)