Skip to content
28 changes: 26 additions & 2 deletions csrc/binding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@

#include <utility>
#include <random>
#include <thread>
#include "manager.h"
#include "utils.h"

int supervisor_main(int sock_fd);

Expand All @@ -22,8 +24,30 @@ void do_bench(int result_fd, int input_fd, const std::string& kernel_qualname, c
signature.allocate(32, rng);
auto config = read_benchmark_parameters(input_fd, signature.data());
auto mgr = make_benchmark_manager(result_fd, std::move(signature), config.Seed, discard, nvtx, landlock, mseal, supervisor_sock_fd);
auto [args, expected] = mgr->setup_benchmark(nb::cast<nb::callable>(test_generator), test_kwargs, config.Repeats);
mgr->do_bench_py(kernel_qualname, args, expected, reinterpret_cast<cudaStream_t>(stream));

{
nb::gil_scoped_release release;
std::exception_ptr thread_exception;
int device;
CUDA_CHECK(cudaGetDevice(&device));
std::thread run_thread ([&]()
{
try {
CUDA_CHECK(cudaSetDevice(device));
nb::gil_scoped_acquire acquire;
auto [args, expected] = mgr->setup_benchmark(nb::cast<nb::callable>(test_generator), test_kwargs, config.Repeats);
mgr->do_bench_py(kernel_qualname, args, expected, reinterpret_cast<cudaStream_t>(stream));
} catch (...) {
thread_exception = std::current_exception();
}
});
run_thread.join();
if (thread_exception)
std::rethrow_exception(thread_exception);
}

mgr->send_report();
mgr->clean_up();
}


Expand Down
193 changes: 102 additions & 91 deletions csrc/manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,8 @@ BenchmarkManager::BenchmarkManager(std::byte* arena, std::size_t arena_size,
mEndEvents(&mResource),
mExpectedOutputs(&mResource),
mShadowArguments(&mResource),
mOutputBuffers(&mResource)
mOutputBuffers(&mResource),
mTestOrder(&mResource)
{
int device;
CUDA_CHECK(cudaGetDevice(&device));
Expand Down Expand Up @@ -336,39 +337,6 @@ void BenchmarkManager::install_protections() {
install_seccomp_filter();
}

// Calibrate the benchmark: repeatedly invoke the kernel (cache cleared and
// full device sync around every call) until mWarmupSeconds of wall time has
// elapsed, then derive a per-call estimate and from it the number of timed
// calls to perform, capped by the number of prepared output buffers.
// Throws if the kernel is too slow to yield at least 3 meaningful runs.
int BenchmarkManager::run_warmup(nb::callable& kernel, const nb::tuple& args, cudaStream_t stream) {
    using clock = std::chrono::high_resolution_clock;

    nvtx_push("timing");
    const clock::time_point started = clock::now();
    int runs = 0;
    double per_call_estimate = 0.0;
    for (;;) {
        // note: we are assuming here that calling the kernel multiple times for the same input is a safe operation
        // this is only potentially problematic for in-place kernels;
        CUDA_CHECK(cudaDeviceSynchronize());
        clear_cache(stream);
        kernel(*args);
        CUDA_CHECK(cudaDeviceSynchronize());
        const std::chrono::duration<double> elapsed = clock::now() - started;
        ++runs;
        if (elapsed.count() > mWarmupSeconds) {
            per_call_estimate = elapsed.count() / runs;
            break;
        }
    }
    nvtx_pop();

    // note: this is a very conservative estimate. Timing above was measured with syncs between every kernel.
    const int max_calls = mOutputBuffers.size() - 1;
    const int actual_calls = std::clamp(static_cast<int>(std::ceil(mBenchmarkSeconds / per_call_estimate)), 1, max_calls);

    if (actual_calls < 3) {
        throw std::runtime_error("The initial speed test indicated that running times are too slow to generate meaningful benchmark numbers: " + std::to_string(per_call_estimate));
    }

    return actual_calls;
}

static inline std::uintptr_t page_mask() {
std::uintptr_t page_size = getpagesize();
return ~(page_size - 1u);
Expand All @@ -381,58 +349,89 @@ void protect_range(void* ptr, size_t size, int prot) {
throw std::system_error(errno, std::system_category(), "mprotect");
}

nb::callable BenchmarkManager::get_kernel(const std::string& qualname, const nb::tuple& call_args) {
nb::gil_scoped_release release;
const std::uintptr_t lo = reinterpret_cast<std::uintptr_t>(this->mArena);
// Hand the supervisor socket to the seccomp notify machinery (when requested)
// for the arena range [lo, hi), then close the socket. The fd is closed even
// if installation throws; a negative fd means "no supervisor" and is a no-op.
static void setup_seccomp(int fd, bool install_notify, std::uintptr_t lo, std::uintptr_t hi) {
    if (fd >= 0) {
        try {
            if (install_notify)
                seccomp_install_memory_notify(fd, lo, hi);
        } catch (...) {
            close(fd);
            throw;
        }
        close(fd);
    }
}

// Time repeated kernel invocations (cache cleared before each call, device
// fully synchronized after) until `warmup_seconds` of wall-clock time has
// passed, and return the mean wall-clock seconds per invocation.
static double run_warmup_loop(nb::callable& kernel, const nb::tuple& args, cudaStream_t stream,
                              void* cc_memory, std::size_t l2_clear_size, bool discard_cache,
                              double warmup_seconds) {
    using clock = std::chrono::high_resolution_clock;

    CUDA_CHECK(cudaDeviceSynchronize());
    const auto started = clock::now();

    for (int invocations = 1; ; ++invocations) {
        ::clear_cache(cc_memory, 2 * l2_clear_size, discard_cache, stream);
        kernel(*args);
        CUDA_CHECK(cudaDeviceSynchronize());

        const double elapsed =
            std::chrono::duration<double>(clock::now() - started).count();
        if (elapsed > warmup_seconds)
            return elapsed / invocations;
    }
}

nb::callable BenchmarkManager::initial_kernel_setup(double& time_estimate, const std::string& qualname,
const nb::tuple& call_args, cudaStream_t stream) {
const std::uintptr_t lo = reinterpret_cast<std::uintptr_t>(mArena);
const std::uintptr_t hi = lo + BenchmarkManagerArenaSize;

// snapshot all member state needed in the thread before protecting the arena
const int sock = mSupervisorSock;
const bool install_notify = mSeal || supports_seccomp_notify();
const double warmup_seconds = mWarmupSeconds;
void* const cc_memory = mDeviceDummyMemory;
const std::size_t l2_clear_size = mL2CacheSize;
const bool discard_cache = mDiscardCache;
int device;
CUDA_CHECK(cudaGetDevice(&device));

nb::callable kernel;
std::exception_ptr thread_exception;
int sock = mSupervisorSock;
bool install_notify = mSeal || supports_seccomp_notify();

nvtx_push("trigger-compile");

// make the BenchmarkManager inaccessible
protect_range(reinterpret_cast<void*>(lo), hi - lo, PROT_NONE);
// TODO make stack inaccessible (may be impossible) or read-only during the call
// call the python kernel generation function from a different thread.

std::thread make_kernel_thread([&kernel, sock, lo, hi, qualname, &call_args, &thread_exception, install_notify]() {
try {
if (sock >= 0) {
try {
if (install_notify)
seccomp_install_memory_notify(sock, lo, hi);
} catch (...) {
close(sock);
throw;
}
close(sock);

{
nb::gil_scoped_release release;
std::thread worker([&] {
try {
CUDA_CHECK(cudaSetDevice(device));
setup_seccomp(sock, install_notify, lo, hi);

nb::gil_scoped_acquire guard;

kernel = kernel_from_qualname(qualname);
CUDA_CHECK(cudaDeviceSynchronize());
kernel(*call_args); // trigger JIT compile

time_estimate = run_warmup_loop(kernel, call_args, stream,
cc_memory, l2_clear_size, discard_cache,
warmup_seconds);
} catch (...) {
thread_exception = std::current_exception();
}
nb::gil_scoped_acquire guard;
kernel = kernel_from_qualname(qualname);

// ok, first run for compilations etc
CUDA_CHECK(cudaDeviceSynchronize());
kernel(*call_args);
CUDA_CHECK(cudaDeviceSynchronize());
} catch (...) {
thread_exception = std::current_exception();
}
});
});
worker.join();
}

make_kernel_thread.join();
// make it accessible again. This is in the original thread, so the tightened seccomp
// policy does not apply here.
protect_range(reinterpret_cast<void*>(lo), hi - lo, PROT_READ | PROT_WRITE);
// closed now, so set to -1
mSupervisorSock = -1;
nvtx_pop();

if (thread_exception) {
if (thread_exception)
std::rethrow_exception(thread_exception);
}

return kernel;
}
Expand All @@ -446,21 +445,33 @@ void BenchmarkManager::do_bench_py(
setup_test_cases(args, expected, stream);
install_protections();

// at this point, we call user code as we import the kernel (executing arbitrary top-level code)
// after this, we cannot trust python anymore
nb::callable kernel = get_kernel(kernel_qualname, args.at(0));

// now, run a few more times for warmup; in total aim for 1 second of warmup runs
int actual_calls = run_warmup(kernel, args.at(0), stream);
constexpr int DRY_EVENTS = 100;
const int num_events = std::max(actual_calls, DRY_EVENTS);
constexpr std::size_t DRY_EVENTS = 100;
const std::size_t num_events = std::max(mShadowArguments.size(), DRY_EVENTS);
mStartEvents.resize(num_events);
mEndEvents.resize(num_events);
for (int i = 0; i < num_events; i++) {
CUDA_CHECK(cudaEventCreate(&mStartEvents.at(i)));
CUDA_CHECK(cudaEventCreate(&mEndEvents.at(i)));
}

// dry run -- measure overhead of events
mMedianEventTime = measure_event_overhead(DRY_EVENTS, stream);

double time_estimate = 0.0;
// at this point, we call user code as we import the kernel (executing arbitrary top-level code)
// after this, we cannot trust python anymore
nb::callable kernel = initial_kernel_setup(time_estimate, kernel_qualname, args.at(0), stream);

int calls = mOutputBuffers.size() - 1;
const int actual_calls = std::clamp(
static_cast<int>(std::ceil(mBenchmarkSeconds / time_estimate)), 1, calls);

if (actual_calls < 3) {
throw std::runtime_error(
"The initial speed test indicated that running times are too slow to generate "
"meaningful benchmark numbers: " + std::to_string(time_estimate));
}

// pick a random spot for the unsigned error counter within the noise buffer
// initialize the whole area with random junk; the error counter
// will be shifted by the initial value, so just writing zero
Expand All @@ -476,20 +487,17 @@ void BenchmarkManager::do_bench_py(
mDeviceErrorCounter = mDeviceErrorBase + offset;
mErrorCountShift = noise.at(offset);

// dry run -- measure overhead of events
float median_event_time = measure_event_overhead(DRY_EVENTS, stream);

// create a randomized order for running the tests
std::vector<int> test_order(actual_calls);
std::iota(test_order.begin(), test_order.end(), 1);
std::shuffle(test_order.begin(), test_order.end(), rng);
mTestOrder.resize(actual_calls);
std::iota(mTestOrder.begin(), mTestOrder.end(), 1);
std::shuffle(mTestOrder.begin(), mTestOrder.end(), rng);

std::uniform_int_distribution<unsigned> check_seed_generator(0, 0xffffffff);

nvtx_push("benchmark");
// now do the real runs
for (int i = 0; i < actual_calls; i++) {
int test_id = test_order.at(i);
const int test_id = mTestOrder.at(i);
// page-in real inputs. If the user kernel runs on the wrong stream, it's likely it won't see the correct inputs
// unfortunately, we need to do this before clearing the cache, so there is a window of opportunity
// *but* we deliberately modify a small subset of the inputs, which only get corrected immediately before
Expand Down Expand Up @@ -522,26 +530,29 @@ void BenchmarkManager::do_bench_py(
validate_result(mExpectedOutputs.at(test_id), mOutputBuffers.at(test_id), check_seed_generator(rng), stream);
}
nvtx_pop();
}

cudaEventSynchronize(mEndEvents.back());
// Assemble the encrypted result report and write it to the output pipe.
// Assumes do_bench_py has run: mTestOrder is populated and the timing events
// for each test have been recorded on the stream.
void BenchmarkManager::send_report() {
    // wait for the last recorded timing event so all measurements are final
    CUDA_CHECK(cudaEventSynchronize(mEndEvents.at(mTestOrder.size() - 1)));

    unsigned raw_errors;
    CUDA_CHECK(cudaMemcpy(&raw_errors, mDeviceErrorCounter, sizeof(unsigned), cudaMemcpyDeviceToHost));
    // subtract the nuisance shift that we applied to the counter
    const unsigned error_count = raw_errors - mErrorCountShift;

    const std::string payload =
        encrypt_message(mSignature.data(), 32,
                        build_result_message(mTestOrder, error_count, mMedianEventTime));
    fwrite(payload.data(), 1, payload.size(), mOutputPipe);
    fflush(mOutputPipe);
}

// cleanup events
// Destroy every CUDA timing event created for the benchmark and empty both
// containers so a subsequent setup starts from a clean slate.
void BenchmarkManager::clean_up() {
    const auto destroy_all = [](auto& events) {
        for (auto& event : events)
            CUDA_CHECK(cudaEventDestroy(event));
        events.clear();
    };
    destroy_all(mStartEvents);
    destroy_all(mEndEvents);
}

std::string BenchmarkManager::build_result_message(const std::vector<int>& test_order, unsigned error_count, float median_event_time) const {
std::string BenchmarkManager::build_result_message(const std::pmr::vector<int>& test_order, unsigned error_count, float median_event_time) const {
std::ostringstream oss;

oss << "event-overhead\t" << median_event_time * 1000 << " µs\n";
Expand Down
10 changes: 7 additions & 3 deletions csrc/manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ class BenchmarkManager {
public:
std::pair<std::vector<nb::tuple>, std::vector<nb::tuple>> setup_benchmark(const nb::callable& generate_test_case, const nb::dict& kwargs, int repeats);
void do_bench_py(const std::string& kernel_qualname, const std::vector<nb::tuple>& args, const std::vector<nb::tuple>& expected, cudaStream_t stream);
void send_report();
void clean_up();
private:
friend BenchmarkManagerPtr make_benchmark_manager(int result_fd, ObfuscatedHexDigest signature, std::uint64_t seed, bool discard, bool nvtx, bool landlock, bool mseal, int supervisor_socket);
friend BenchmarkManagerDeleter;
Expand Down Expand Up @@ -113,10 +115,13 @@ class BenchmarkManager {
std::pmr::vector<Expected> mExpectedOutputs;
std::pmr::vector<ShadowArgumentList> mShadowArguments;
std::pmr::vector<nb_cuda_array> mOutputBuffers;
std::pmr::vector<int> mTestOrder;

FILE* mOutputPipe = nullptr;
ObfuscatedHexDigest mSignature;

float mMedianEventTime = -1.f;

static ShadowArgumentList make_shadow_args(const nb::tuple& args, cudaStream_t stream,
std::pmr::memory_resource* resource);

Expand All @@ -130,10 +135,9 @@ class BenchmarkManager {
void setup_test_cases(const std::vector<nb::tuple>& args, const std::vector<nb::tuple>& expected, cudaStream_t stream);

void install_protections();
int run_warmup(nb::callable& kernel, const nb::tuple& args, cudaStream_t stream);
nb::callable get_kernel(const std::string& qualname, const nb::tuple& call_args);
nb::callable initial_kernel_setup(double& time_estimate, const std::string& qualname, const nb::tuple& call_args, cudaStream_t stream);

[[nodiscard]] std::string build_result_message(const std::vector<int>& test_order, unsigned error_count, float median_event_time) const;
[[nodiscard]] std::string build_result_message(const std::pmr::vector<int>& test_order, unsigned error_count, float median_event_time) const;


// debug only: Any sort of test exploit that targets specific values of this class is going to be brittle,
Expand Down
17 changes: 11 additions & 6 deletions csrc/obfuscate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,20 +66,25 @@ void ObfuscatedHexDigest::allocate(std::size_t size, std::mt19937& rng) {
if (size > PAGE_SIZE / 2) {
throw std::runtime_error("target size too big");
}
if (Len != 0 || Offset != 0) {
if (this->size() != 0) {
throw std::runtime_error("already allocated");
}

fill_random_hex(page_ptr(), PAGE_SIZE, rng);
const std::size_t max_offset = PAGE_SIZE - size - 1;
std::uniform_int_distribution<std::size_t> offset_dist(0, max_offset);
const std::uintptr_t max_offset = PAGE_SIZE - size - 1;
std::uniform_int_distribution<std::uintptr_t> offset_dist(0, max_offset);

Offset = offset_dist(rng);
Len = size;
const std::uintptr_t offset = offset_dist(rng);
HashedOffset = slow_hash(offset);
HashedLen = slow_hash(size ^ offset);
}

// Return a pointer to the digest inside its page. The digest's offset is not
// stored in the clear: allocate() records HashedOffset = slow_hash(offset),
// so the real offset is recovered on every access via slow_unhash (assumed to
// be the exact inverse of slow_hash — this mirrors how allocate() writes it).
//
// Fix: the body contained two consecutive return statements — the obsolete
// plain-`Offset` form followed by the hashed form — leaving the second
// unreachable. Only the hashed form matches the fields allocate() now sets,
// so the stale return is removed.
char* ObfuscatedHexDigest::data() {
    return reinterpret_cast<char*>(page_ptr()) + slow_unhash(HashedOffset);
}

// Recover the obfuscated digest length.
//
// allocate() stores HashedOffset = slow_hash(offset) and
// HashedLen = slow_hash(size ^ offset). Inverting that (assuming slow_unhash
// is the inverse of slow_hash, as data() already relies on) gives
//   size = slow_unhash(HashedLen) ^ offset = slow_unhash(HashedLen) ^ slow_unhash(HashedOffset).
// The previous expression, slow_unhash(HashedLen ^ slow_unhash(HashedOffset)),
// XORs the offset *inside* the unhash and is only equivalent when slow_hash is
// XOR-linear — for a general hash it returns garbage. This form also
// guarantees size() == 0 while both members are still zero-initialized, which
// the "already allocated" guard in allocate() depends on.
// NOTE(review): assumes slow_unhash(slow_hash(x)) == x for all x — confirm
// against the slow_hash implementation.
std::size_t ObfuscatedHexDigest::size() const {
    return slow_unhash(HashedLen) ^ slow_unhash(HashedOffset);
}

void fill_random_hex(void* target, std::size_t size, std::mt19937& rng) {
Expand Down
Loading
Loading