Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 26 additions & 2 deletions csrc/binding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@

#include <utility>
#include <random>
#include <thread>
#include "manager.h"
#include "utils.h"

int supervisor_main(int sock_fd);

Expand All @@ -22,8 +24,30 @@ void do_bench(int result_fd, int input_fd, const std::string& kernel_qualname, c
signature.allocate(32, rng);
auto config = read_benchmark_parameters(input_fd, signature.data());
auto mgr = make_benchmark_manager(result_fd, std::move(signature), config.Seed, discard, nvtx, landlock, mseal, supervisor_sock_fd);
auto [args, expected] = mgr->setup_benchmark(nb::cast<nb::callable>(test_generator), test_kwargs, config.Repeats);
mgr->do_bench_py(kernel_qualname, args, expected, reinterpret_cast<cudaStream_t>(stream));

{
nb::gil_scoped_release release;
std::exception_ptr thread_exception;
int device;
CUDA_CHECK(cudaGetDevice(&device));
std::thread run_thread ([&]()
{
try {
CUDA_CHECK(cudaSetDevice(device));
nb::gil_scoped_acquire acquire;
auto [args, expected] = mgr->setup_benchmark(nb::cast<nb::callable>(test_generator), test_kwargs, config.Repeats);
mgr->do_bench_py(kernel_qualname, args, expected, reinterpret_cast<cudaStream_t>(stream));
} catch (...) {
thread_exception = std::current_exception();
}
});
run_thread.join();
if (thread_exception)
std::rethrow_exception(thread_exception);
}

mgr->send_report();
mgr->clean_up();
}


Expand Down
193 changes: 102 additions & 91 deletions csrc/manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,8 @@ BenchmarkManager::BenchmarkManager(std::byte* arena, std::size_t arena_size,
mEndEvents(&mResource),
mExpectedOutputs(&mResource),
mShadowArguments(&mResource),
mOutputBuffers(&mResource)
mOutputBuffers(&mResource),
mTestOrder(&mResource)
{
int device;
CUDA_CHECK(cudaGetDevice(&device));
Expand Down Expand Up @@ -336,39 +337,6 @@ void BenchmarkManager::install_protections() {
install_seccomp_filter();
}

// Runs the kernel repeatedly until roughly mWarmupSeconds of wall time has
// elapsed, both to warm up GPU/JIT state and to obtain a per-call time
// estimate. Returns the number of timed benchmark calls to perform, derived
// from mBenchmarkSeconds / estimate and clamped to the number of prepared
// output buffers.
//
// Throws std::runtime_error if no output buffers are available or if the
// kernel is too slow to yield at least three meaningful benchmark runs.
int BenchmarkManager::run_warmup(nb::callable& kernel, const nb::tuple& args, cudaStream_t stream) {
    const auto cpu_start = std::chrono::high_resolution_clock::now();
    int warmup_run_count = 0;
    double time_estimate;
    nvtx_push("timing");
    while (true) {
        // note: we are assuming here that calling the kernel multiple times for the same input is a safe operation
        // this is only potentially problematic for in-place kernels;
        CUDA_CHECK(cudaDeviceSynchronize());
        clear_cache(stream);
        kernel(*args);
        // sync so the elapsed measurement covers the full device-side work
        CUDA_CHECK(cudaDeviceSynchronize());
        const auto cpu_end = std::chrono::high_resolution_clock::now();
        const std::chrono::duration<double> elapsed_seconds = cpu_end - cpu_start;
        ++warmup_run_count;
        if (elapsed_seconds.count() > mWarmupSeconds) {
            time_estimate = elapsed_seconds.count() / warmup_run_count;
            break;
        }
    }
    nvtx_pop();

    // note: this is a very conservative estimate. Timing above was measured with syncs between every kernel.
    // Cast before subtracting: size() is unsigned, so `size() - 1` on an empty
    // vector would wrap around to a huge value instead of going negative.
    const int calls = static_cast<int>(mOutputBuffers.size()) - 1;
    if (calls < 1) {
        // std::clamp has undefined behaviour when hi < lo, so reject the
        // degenerate "no usable output buffers" case explicitly.
        throw std::runtime_error("Not enough output buffers prepared to run the benchmark");
    }
    const int actual_calls = std::clamp(static_cast<int>(std::ceil(mBenchmarkSeconds / time_estimate)), 1, calls);

    if (actual_calls < 3) {
        throw std::runtime_error("The initial speed test indicated that running times are too slow to generate meaningful benchmark numbers: " + std::to_string(time_estimate));
    }

    return actual_calls;
}

static inline std::uintptr_t page_mask() {
std::uintptr_t page_size = getpagesize();
return ~(page_size - 1u);
Expand All @@ -381,58 +349,89 @@ void protect_range(void* ptr, size_t size, int prot) {
throw std::system_error(errno, std::system_category(), "mprotect");
}

nb::callable BenchmarkManager::get_kernel(const std::string& qualname, const nb::tuple& call_args) {
nb::gil_scoped_release release;
const std::uintptr_t lo = reinterpret_cast<std::uintptr_t>(this->mArena);
// Optionally installs the seccomp memory-notify filter for the arena range
// [lo, hi) via the supervisor socket, then closes the socket. A negative
// socket means "no supervisor"; nothing is done in that case. The descriptor
// is closed on both the success and the failure path before any exception
// propagates to the caller.
static void setup_seccomp(int sock, bool install_notify, std::uintptr_t lo, std::uintptr_t hi) {
    if (sock < 0)
        return;
    // Funnel every path through a single close() call; stash a thrown
    // exception and re-raise it only after the descriptor is released.
    std::exception_ptr pending;
    if (install_notify) {
        try {
            seccomp_install_memory_notify(sock, lo, hi);
        } catch (...) {
            pending = std::current_exception();
        }
    }
    close(sock);
    if (pending)
        std::rethrow_exception(pending);
}

// Launches `kernel` in a sync / cache-clear / launch / sync cycle until more
// than `warmup_seconds` of wall time has accumulated, and returns the mean
// wall-clock seconds per call. `cc_memory`/`l2_clear_size`/`discard_cache`
// parameterize the cache-clearing pass between launches.
static double run_warmup_loop(nb::callable& kernel, const nb::tuple& args, cudaStream_t stream,
                              void* cc_memory, std::size_t l2_clear_size, bool discard_cache,
                              double warmup_seconds) {
    using Clock = std::chrono::high_resolution_clock;

    // Drain any previously queued GPU work so timing starts from an idle device.
    CUDA_CHECK(cudaDeviceSynchronize());
    const Clock::time_point start = Clock::now();

    for (int iterations = 1; ; ++iterations) {
        ::clear_cache(cc_memory, 2 * l2_clear_size, discard_cache, stream);
        kernel(*args);
        CUDA_CHECK(cudaDeviceSynchronize());

        const double total_seconds =
            std::chrono::duration<double>(Clock::now() - start).count();
        if (total_seconds > warmup_seconds)
            return total_seconds / iterations;  // mean seconds per kernel call
    }
}

nb::callable BenchmarkManager::initial_kernel_setup(double& time_estimate, const std::string& qualname,
const nb::tuple& call_args, cudaStream_t stream) {
const std::uintptr_t lo = reinterpret_cast<std::uintptr_t>(mArena);
const std::uintptr_t hi = lo + BenchmarkManagerArenaSize;

// snapshot all member state needed in the thread before protecting the arena
const int sock = mSupervisorSock;
const bool install_notify = mSeal || supports_seccomp_notify();
const double warmup_seconds = mWarmupSeconds;
void* const cc_memory = mDeviceDummyMemory;
const std::size_t l2_clear_size = mL2CacheSize;
const bool discard_cache = mDiscardCache;
int device;
CUDA_CHECK(cudaGetDevice(&device));

nb::callable kernel;
std::exception_ptr thread_exception;
int sock = mSupervisorSock;
bool install_notify = mSeal || supports_seccomp_notify();

nvtx_push("trigger-compile");

// make the BenchmarkManager inaccessible
protect_range(reinterpret_cast<void*>(lo), hi - lo, PROT_NONE);
// TODO make stack inaccessible (may be impossible) or read-only during the call
// call the python kernel generation function from a different thread.

std::thread make_kernel_thread([&kernel, sock, lo, hi, qualname, &call_args, &thread_exception, install_notify]() {
try {
if (sock >= 0) {
try {
if (install_notify)
seccomp_install_memory_notify(sock, lo, hi);
} catch (...) {
close(sock);
throw;
}
close(sock);

{
nb::gil_scoped_release release;
std::thread worker([&] {
try {
CUDA_CHECK(cudaSetDevice(device));
setup_seccomp(sock, install_notify, lo, hi);

nb::gil_scoped_acquire guard;

kernel = kernel_from_qualname(qualname);
CUDA_CHECK(cudaDeviceSynchronize());
kernel(*call_args); // trigger JIT compile

time_estimate = run_warmup_loop(kernel, call_args, stream,
cc_memory, l2_clear_size, discard_cache,
warmup_seconds);
} catch (...) {
thread_exception = std::current_exception();
}
nb::gil_scoped_acquire guard;
kernel = kernel_from_qualname(qualname);

// ok, first run for compilations etc
CUDA_CHECK(cudaDeviceSynchronize());
kernel(*call_args);
CUDA_CHECK(cudaDeviceSynchronize());
} catch (...) {
thread_exception = std::current_exception();
}
});
});
worker.join();
}

make_kernel_thread.join();
// make it accessible again. This is in the original thread, so the tightened seccomp
// policy does not apply here.
protect_range(reinterpret_cast<void*>(lo), hi - lo, PROT_READ | PROT_WRITE);
// closed now, so set to -1
mSupervisorSock = -1;
nvtx_pop();

if (thread_exception) {
if (thread_exception)
std::rethrow_exception(thread_exception);
}

return kernel;
}
Expand All @@ -446,21 +445,33 @@ void BenchmarkManager::do_bench_py(
setup_test_cases(args, expected, stream);
install_protections();

// at this point, we call user code as we import the kernel (executing arbitrary top-level code)
// after this, we cannot trust python anymore
nb::callable kernel = get_kernel(kernel_qualname, args.at(0));

// now, run a few more times for warmup; in total aim for 1 second of warmup runs
int actual_calls = run_warmup(kernel, args.at(0), stream);
constexpr int DRY_EVENTS = 100;
const int num_events = std::max(actual_calls, DRY_EVENTS);
constexpr std::size_t DRY_EVENTS = 100;
const std::size_t num_events = std::max(mShadowArguments.size(), DRY_EVENTS);
mStartEvents.resize(num_events);
mEndEvents.resize(num_events);
for (int i = 0; i < num_events; i++) {
CUDA_CHECK(cudaEventCreate(&mStartEvents.at(i)));
CUDA_CHECK(cudaEventCreate(&mEndEvents.at(i)));
}

// dry run -- measure overhead of events
mMedianEventTime = measure_event_overhead(DRY_EVENTS, stream);

double time_estimate = 0.0;
// at this point, we call user code as we import the kernel (executing arbitrary top-level code)
// after this, we cannot trust python anymore
nb::callable kernel = initial_kernel_setup(time_estimate, kernel_qualname, args.at(0), stream);

int calls = mOutputBuffers.size() - 1;
const int actual_calls = std::clamp(
static_cast<int>(std::ceil(mBenchmarkSeconds / time_estimate)), 1, calls);

if (actual_calls < 3) {
throw std::runtime_error(
"The initial speed test indicated that running times are too slow to generate "
"meaningful benchmark numbers: " + std::to_string(time_estimate));
}

// pick a random spot for the unsigned
// initialize the whole area with random junk; the error counter
// will be shifted by the initial value, so just writing zero
Expand All @@ -476,20 +487,17 @@ void BenchmarkManager::do_bench_py(
mDeviceErrorCounter = mDeviceErrorBase + offset;
mErrorCountShift = noise.at(offset);

// dry run -- measure overhead of events
float median_event_time = measure_event_overhead(DRY_EVENTS, stream);

// create a randomized order for running the tests
std::vector<int> test_order(actual_calls);
std::iota(test_order.begin(), test_order.end(), 1);
std::shuffle(test_order.begin(), test_order.end(), rng);
mTestOrder.resize(actual_calls);
std::iota(mTestOrder.begin(), mTestOrder.end(), 1);
std::shuffle(mTestOrder.begin(), mTestOrder.end(), rng);

std::uniform_int_distribution<unsigned> check_seed_generator(0, 0xffffffff);

nvtx_push("benchmark");
// now do the real runs
for (int i = 0; i < actual_calls; i++) {
int test_id = test_order.at(i);
const int test_id = mTestOrder.at(i);
// page-in real inputs. If the user kernel runs on the wrong stream, it's likely it won't see the correct inputs
// unfortunately, we need to do this before clearing the cache, so there is a window of opportunity
// *but* we deliberately modify a small subset of the inputs, which only get corrected immediately before
Expand Down Expand Up @@ -522,26 +530,29 @@ void BenchmarkManager::do_bench_py(
validate_result(mExpectedOutputs.at(test_id), mOutputBuffers.at(test_id), check_seed_generator(rng), stream);
}
nvtx_pop();
}

cudaEventSynchronize(mEndEvents.back());
void BenchmarkManager::send_report() {
CUDA_CHECK(cudaEventSynchronize(mEndEvents.at(mTestOrder.size() - 1)));
unsigned error_count;
CUDA_CHECK(cudaMemcpy(&error_count, mDeviceErrorCounter, sizeof(unsigned), cudaMemcpyDeviceToHost));
// subtract the nuisance shift that we applied to the counter
error_count -= mErrorCountShift;

std::string message = build_result_message(test_order, error_count, median_event_time);
std::string message = build_result_message(mTestOrder, error_count, mMedianEventTime);
message = encrypt_message(mSignature.data(), 32, message);
fwrite(message.data(), 1, message.size(), mOutputPipe);
fflush(mOutputPipe);
}

// Destroys every CUDA timing event created for the benchmark run and empties
// the bookkeeping vectors so the manager can be torn down cleanly.
void BenchmarkManager::clean_up() {
    const auto destroy_all = [](auto& events) {
        for (auto& ev : events)
            CUDA_CHECK(cudaEventDestroy(ev));
        events.clear();
    };
    destroy_all(mStartEvents);
    destroy_all(mEndEvents);
}

std::string BenchmarkManager::build_result_message(const std::vector<int>& test_order, unsigned error_count, float median_event_time) const {
std::string BenchmarkManager::build_result_message(const std::pmr::vector<int>& test_order, unsigned error_count, float median_event_time) const {
std::ostringstream oss;

oss << "event-overhead\t" << median_event_time * 1000 << " µs\n";
Expand Down
10 changes: 7 additions & 3 deletions csrc/manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ class BenchmarkManager {
public:
std::pair<std::vector<nb::tuple>, std::vector<nb::tuple>> setup_benchmark(const nb::callable& generate_test_case, const nb::dict& kwargs, int repeats);
void do_bench_py(const std::string& kernel_qualname, const std::vector<nb::tuple>& args, const std::vector<nb::tuple>& expected, cudaStream_t stream);
void send_report();
void clean_up();
private:
friend BenchmarkManagerPtr make_benchmark_manager(int result_fd, ObfuscatedHexDigest signature, std::uint64_t seed, bool discard, bool nvtx, bool landlock, bool mseal, int supervisor_socket);
friend BenchmarkManagerDeleter;
Expand Down Expand Up @@ -113,10 +115,13 @@ class BenchmarkManager {
std::pmr::vector<Expected> mExpectedOutputs;
std::pmr::vector<ShadowArgumentList> mShadowArguments;
std::pmr::vector<nb_cuda_array> mOutputBuffers;
std::pmr::vector<int> mTestOrder;

FILE* mOutputPipe = nullptr;
ObfuscatedHexDigest mSignature;

float mMedianEventTime = -1.f;

static ShadowArgumentList make_shadow_args(const nb::tuple& args, cudaStream_t stream,
std::pmr::memory_resource* resource);

Expand All @@ -130,10 +135,9 @@ class BenchmarkManager {
void setup_test_cases(const std::vector<nb::tuple>& args, const std::vector<nb::tuple>& expected, cudaStream_t stream);

void install_protections();
int run_warmup(nb::callable& kernel, const nb::tuple& args, cudaStream_t stream);
nb::callable get_kernel(const std::string& qualname, const nb::tuple& call_args);
nb::callable initial_kernel_setup(double& time_estimate, const std::string& qualname, const nb::tuple& call_args, cudaStream_t stream);

[[nodiscard]] std::string build_result_message(const std::vector<int>& test_order, unsigned error_count, float median_event_time) const;
[[nodiscard]] std::string build_result_message(const std::pmr::vector<int>& test_order, unsigned error_count, float median_event_time) const;


// debug only: Any sort of test exploit that targets specific values of this class is going to be brittle,
Expand Down
Loading