1919#include < nanobind/stl/string.h>
2020#include < sys/mman.h>
2121#include < unistd.h>
22+ #include " protect.h"
2223
2324static constexpr std::size_t ArenaSize = 2 * 1024 * 1024 ;
2425
@@ -137,7 +138,7 @@ void BenchmarkManagerDeleter::operator()(BenchmarkManager* p) const noexcept {
137138
138139
139140BenchmarkManagerPtr make_benchmark_manager (
140- int result_fd, ObfuscatedHexDigest signature, std::uint64_t seed,
141+ int result_fd, const std::vector< char >& signature, std::uint64_t seed,
141142 bool discard, bool nvtx, bool landlock, bool mseal, int supervisor_socket)
142143{
143144 const std::size_t page_size = static_cast <std::size_t >(getpagesize ());
@@ -153,7 +154,7 @@ BenchmarkManagerPtr make_benchmark_manager(
153154 try {
154155 raw = new (mem) BenchmarkManager (
155156 static_cast <std::byte*>(mem), alloc_size,
156- result_fd, std::move ( signature) , seed,
157+ result_fd, signature, seed,
157158 discard, nvtx, landlock, mseal, supervisor_socket);
158159 } catch (...) {
159160 // If construction throws, release the mmap'd region before propagating.
@@ -168,14 +169,14 @@ BenchmarkManagerPtr make_benchmark_manager(
168169
169170
170171BenchmarkManager::BenchmarkManager (std::byte* arena, std::size_t arena_size,
171- int result_fd, ObfuscatedHexDigest signature, std::uint64_t seed, bool discard,
172+ int result_fd, const std::vector< char >& signature, std::uint64_t seed, bool discard,
172173 bool nvtx, bool landlock, bool mseal, int supervisor_socket)
173174 : mArena(arena),
174175 mResource(arena + sizeof (BenchmarkManager),
175176 arena_size - sizeof(BenchmarkManager),
176177 std::pmr::null_memory_resource()),
177178
178- mSignature(std::move(signature) ),
179+ mSignature(& mResource ),
179180 mSupervisorSock(supervisor_socket),
180181 mStartEvents(&mResource ),
181182 mEndEvents(&mResource ),
@@ -195,11 +196,19 @@ BenchmarkManager::BenchmarkManager(std::byte* arena, std::size_t arena_size,
195196 throw std::runtime_error (" Could not open output pipe" );
196197 }
197198
199+ if (signature.size () != 32 ) {
200+ throw std::invalid_argument (" Invalid signature length" );
201+ }
202+
198203 mNVTXEnabled = nvtx;
199204 mLandlock = landlock;
200205 mSeal = mseal;
201206 mDiscardCache = discard;
202207 mSeed = seed;
208+ std::random_device rd;
209+ std::mt19937 rng (rd ());
210+ mSignature .allocate (32 , rng);
211+ std::copy (signature.begin (), signature.end (), mSignature .data ());
203212}
204213
205214
@@ -337,18 +346,6 @@ void BenchmarkManager::install_protections() {
337346 install_seccomp_filter ();
338347}
339348
340- static inline std::uintptr_t page_mask () {
341- std::uintptr_t page_size = getpagesize ();
342- return ~(page_size - 1u );
343- }
344-
345- void protect_range (void * ptr, size_t size, int prot) {
346- std::uintptr_t start = reinterpret_cast <std::uintptr_t >(ptr) & page_mask ();
347- std::uintptr_t end = (reinterpret_cast <std::uintptr_t >(ptr) + size + getpagesize () - 1 ) & page_mask ();
348- if (mprotect (reinterpret_cast <void *>(start), end - start, prot) < 0 )
349- throw std::system_error (errno, std::system_category (), " mprotect" );
350- }
351-
352349static void setup_seccomp (int sock, bool install_notify, std::uintptr_t lo, std::uintptr_t hi) {
353350 if (sock < 0 )
354351 return ;
@@ -394,48 +391,46 @@ nb::callable BenchmarkManager::initial_kernel_setup(double& time_estimate, const
394391 void * const cc_memory = mDeviceDummyMemory ;
395392 const std::size_t l2_clear_size = mL2CacheSize ;
396393 const bool discard_cache = mDiscardCache ;
397- int device;
398- CUDA_CHECK (cudaGetDevice (&device));
399-
400- nb::callable kernel;
401- std::exception_ptr thread_exception;
402394
403395 nvtx_push (" trigger-compile" );
404- protect_range (reinterpret_cast <void *>(lo), hi - lo, PROT_NONE);
405-
406- {
407- nb::gil_scoped_release release;
408- std::thread worker ([&] {
409- try {
410- CUDA_CHECK (cudaSetDevice (device));
411- setup_seccomp (sock, install_notify, lo, hi);
412-
413- nb::gil_scoped_acquire guard;
414-
415- kernel = kernel_from_qualname (qualname);
416- CUDA_CHECK (cudaDeviceSynchronize ());
417- kernel (*call_args); // trigger JIT compile
418-
419- time_estimate = run_warmup_loop (kernel, call_args, stream,
420- cc_memory, l2_clear_size, discard_cache,
421- warmup_seconds);
422- } catch (...) {
423- thread_exception = std::current_exception ();
424- }
425- });
426- worker.join ();
427- }
396+ PROTECT_RANGE (lo, hi-lo, PROT_NONE);
397+ setup_seccomp (sock, install_notify, lo, hi);
428398
429- protect_range (reinterpret_cast <void *>(lo), hi - lo, PROT_READ | PROT_WRITE);
399+ nb::callable kernel = kernel_from_qualname (qualname);
400+ CUDA_CHECK (cudaDeviceSynchronize ());
401+ kernel (*call_args); // trigger JIT compile
402+
403+ time_estimate = run_warmup_loop (kernel, call_args, stream,
404+ cc_memory, l2_clear_size, discard_cache,
405+ warmup_seconds);
406+
407+ PROTECT_RANGE (lo, hi - lo, PROT_READ | PROT_WRITE);
430408 mSupervisorSock = -1 ;
431409 nvtx_pop ();
432410
433- if (thread_exception)
434- std::rethrow_exception (thread_exception);
435-
436411 return kernel;
437412}
438413
414+ void BenchmarkManager::randomize_before_test (int num_calls, std::mt19937& rng, cudaStream_t stream) {
415+ // pick a random spot for the unsigned
416+ // initialize the whole area with random junk; the error counter
417+ // will be shifted by the initial value, so just writing zero
418+ // won't result in passing the tests.
419+ std::uniform_int_distribution<std::ptrdiff_t > dist (0 , ArenaSize / sizeof (unsigned ) - 1 );
420+ std::uniform_int_distribution<unsigned > noise_generator (0 , std::numeric_limits<unsigned >::max ());
421+ std::vector<unsigned > noise (ArenaSize / sizeof (unsigned ));
422+ std::generate (noise.begin (), noise.end (), [&]() -> unsigned { return noise_generator (rng); });
423+ CUDA_CHECK (cudaMemcpyAsync (mDeviceErrorBase , noise.data (), noise.size () * sizeof (unsigned ), cudaMemcpyHostToDevice, stream));
424+ std::ptrdiff_t offset = dist (rng);
425+ mDeviceErrorCounter = mDeviceErrorBase + offset;
426+ mErrorCountShift = noise.at (offset);
427+
428+ // create a randomized order for running the tests
429+ mTestOrder .resize (num_calls);
430+ std::iota (mTestOrder .begin (), mTestOrder .end (), 1 );
431+ std::shuffle (mTestOrder .begin (), mTestOrder .end (), rng);
432+ }
433+
439434void BenchmarkManager::do_bench_py (
440435 const std::string& kernel_qualname,
441436 const std::vector<nb::tuple>& args,
@@ -472,25 +467,13 @@ void BenchmarkManager::do_bench_py(
472467 " meaningful benchmark numbers: " + std::to_string (time_estimate));
473468 }
474469
475- // pick a random spot for the unsigned
476- // initialize the whole area with random junk; the error counter
477- // will be shifted by the initial value, so just writing zero
478- // won't result in passing the tests.
479470 std::random_device rd;
480471 std::mt19937 rng (rd ());
481- std::uniform_int_distribution<std::ptrdiff_t > dist (0 , ArenaSize / sizeof (unsigned ) - 1 );
482- std::uniform_int_distribution<unsigned > noise_generator (0 , std::numeric_limits<unsigned >::max ());
483- std::vector<unsigned > noise (ArenaSize / sizeof (unsigned ));
484- std::generate (noise.begin (), noise.end (), [&]() -> unsigned { return noise_generator (rng); });
485- CUDA_CHECK (cudaMemcpyAsync (mDeviceErrorBase , noise.data (), noise.size () * sizeof (unsigned ), cudaMemcpyHostToDevice, stream));
486- std::ptrdiff_t offset = dist (rng);
487- mDeviceErrorCounter = mDeviceErrorBase + offset;
488- mErrorCountShift = noise.at (offset);
489472
490- // create a randomized order for running the tests
491- mTestOrder . resize (actual_calls);
492- std::iota ( mTestOrder . begin (), mTestOrder . end (), 1 );
493- std::shuffle ( mTestOrder . begin (), mTestOrder . end (), rng);
473+ randomize_before_test (actual_calls, rng, stream);
474+ // from this point on, even the benchmark thread won't write to the arena anymore
475+ PROTECT_RANGE ( mArena , BenchmarkManagerArenaSize, PROT_READ );
476+ PROTECT_RANGE ( mSignature . page_ptr (), 4096 , PROT_NONE); // make the key fully inaccessible
494477
495478 std::uniform_int_distribution<unsigned > check_seed_generator (0 , 0xffffffff );
496479
@@ -540,12 +523,18 @@ void BenchmarkManager::send_report() {
540523 error_count -= mErrorCountShift ;
541524
542525 std::string message = build_result_message (mTestOrder , error_count, mMedianEventTime );
526+ PROTECT_RANGE (mSignature .page_ptr (), 4096 , PROT_READ);
543527 message = encrypt_message (mSignature .data (), 32 , message);
528+ PROTECT_RANGE (mSignature .page_ptr (), 4096 , PROT_WRITE);
529+ cleanse (mSignature .data (), 32 );
530+ PROTECT_RANGE (mSignature .page_ptr (), 4096 , PROT_NONE);
544531 fwrite (message.data (), 1 , message.size (), mOutputPipe );
545532 fflush (mOutputPipe );
546533}
547534
548535void BenchmarkManager::clean_up () {
536+ PROTECT_RANGE (mArena , BenchmarkManagerArenaSize, PROT_READ | PROT_WRITE);
537+
549538 for (auto & event : mStartEvents ) CUDA_CHECK (cudaEventDestroy (event));
550539 for (auto & event : mEndEvents ) CUDA_CHECK (cudaEventDestroy (event));
551540 mStartEvents .clear ();
0 commit comments