Skip to content
Merged
9 changes: 4 additions & 5 deletions csrc/binding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,10 @@ namespace nb = nanobind;
void do_bench(int result_fd, int input_fd, const std::string& kernel_qualname, const nb::object& test_generator,
const nb::dict& test_kwargs, std::uintptr_t stream, bool discard, bool nvtx, bool landlock, bool mseal,
int supervisor_sock_fd) {
ObfuscatedHexDigest signature;
std::mt19937 rng(std::random_device{}());
signature.allocate(32, rng);
auto config = read_benchmark_parameters(input_fd, signature.data());
auto mgr = make_benchmark_manager(result_fd, std::move(signature), config.Seed, discard, nvtx, landlock, mseal, supervisor_sock_fd);
std::vector<char> signature_bytes(32);
auto config = read_benchmark_parameters(input_fd, signature_bytes.data());
auto mgr = make_benchmark_manager(result_fd, signature_bytes, config.Seed, discard, nvtx, landlock, mseal, supervisor_sock_fd);
cleanse(signature_bytes.data(), 32);

{
nb::gil_scoped_release release;
Expand Down
10 changes: 0 additions & 10 deletions csrc/landlock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -211,16 +211,6 @@ void setup_seccomp_filter(scmp_filter_ctx ctx) {
check_seccomp(seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(prctl), 1,
SCMP_A0(SCMP_CMP_EQ, PR_SET_PTRACER)),
"block prctl(SET_PTRACER)");
// TODO figure out what else we can and should block
/*
check_seccomp(seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(mprotect), 1,
SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_WRITE, PROT_WRITE)),
"block mprotect+WRITE");

check_seccomp(seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(pkey_mprotect), 1,
SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_WRITE, PROT_WRITE)),
"block pkey_mprotect+WRITE");
*/
}

void install_seccomp_filter() {
Expand Down
119 changes: 54 additions & 65 deletions csrc/manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <nanobind/stl/string.h>
#include <sys/mman.h>
#include <unistd.h>
#include "protect.h"

static constexpr std::size_t ArenaSize = 2 * 1024 * 1024;

Expand Down Expand Up @@ -137,7 +138,7 @@ void BenchmarkManagerDeleter::operator()(BenchmarkManager* p) const noexcept {


BenchmarkManagerPtr make_benchmark_manager(
int result_fd, ObfuscatedHexDigest signature, std::uint64_t seed,
int result_fd, const std::vector<char>& signature, std::uint64_t seed,
bool discard, bool nvtx, bool landlock, bool mseal, int supervisor_socket)
{
const std::size_t page_size = static_cast<std::size_t>(getpagesize());
Expand All @@ -153,7 +154,7 @@ BenchmarkManagerPtr make_benchmark_manager(
try {
raw = new (mem) BenchmarkManager(
static_cast<std::byte*>(mem), alloc_size,
result_fd, std::move(signature), seed,
result_fd, signature, seed,
discard, nvtx, landlock, mseal, supervisor_socket);
} catch (...) {
// If construction throws, release the mmap'd region before propagating.
Expand All @@ -168,14 +169,14 @@ BenchmarkManagerPtr make_benchmark_manager(


BenchmarkManager::BenchmarkManager(std::byte* arena, std::size_t arena_size,
int result_fd, ObfuscatedHexDigest signature, std::uint64_t seed, bool discard,
int result_fd, const std::vector<char>& signature, std::uint64_t seed, bool discard,
bool nvtx, bool landlock, bool mseal, int supervisor_socket)
: mArena(arena),
mResource(arena + sizeof(BenchmarkManager),
arena_size - sizeof(BenchmarkManager),
std::pmr::null_memory_resource()),

mSignature(std::move(signature)),
mSignature(&mResource),
mSupervisorSock(supervisor_socket),
mStartEvents(&mResource),
mEndEvents(&mResource),
Expand All @@ -195,11 +196,19 @@ BenchmarkManager::BenchmarkManager(std::byte* arena, std::size_t arena_size,
throw std::runtime_error("Could not open output pipe");
}

if (signature.size() != 32) {
throw std::invalid_argument("Invalid signature length");
}

mNVTXEnabled = nvtx;
mLandlock = landlock;
mSeal = mseal;
mDiscardCache = discard;
mSeed = seed;
std::random_device rd;
std::mt19937 rng(rd());
mSignature.allocate(32, rng);
std::copy(signature.begin(), signature.end(), mSignature.data());
}


Expand Down Expand Up @@ -337,18 +346,6 @@ void BenchmarkManager::install_protections() {
install_seccomp_filter();
}

static inline std::uintptr_t page_mask() {
std::uintptr_t page_size = getpagesize();
return ~(page_size - 1u);
}

void protect_range(void* ptr, size_t size, int prot) {
std::uintptr_t start = reinterpret_cast<std::uintptr_t>(ptr) & page_mask();
std::uintptr_t end = (reinterpret_cast<std::uintptr_t>(ptr) + size + getpagesize() - 1) & page_mask();
if (mprotect(reinterpret_cast<void*>(start), end - start, prot) < 0)
throw std::system_error(errno, std::system_category(), "mprotect");
}

static void setup_seccomp(int sock, bool install_notify, std::uintptr_t lo, std::uintptr_t hi) {
if (sock < 0)
return;
Expand Down Expand Up @@ -394,48 +391,46 @@ nb::callable BenchmarkManager::initial_kernel_setup(double& time_estimate, const
void* const cc_memory = mDeviceDummyMemory;
const std::size_t l2_clear_size = mL2CacheSize;
const bool discard_cache = mDiscardCache;
int device;
CUDA_CHECK(cudaGetDevice(&device));

nb::callable kernel;
std::exception_ptr thread_exception;

nvtx_push("trigger-compile");
protect_range(reinterpret_cast<void*>(lo), hi - lo, PROT_NONE);

{
nb::gil_scoped_release release;
std::thread worker([&] {
try {
CUDA_CHECK(cudaSetDevice(device));
setup_seccomp(sock, install_notify, lo, hi);

nb::gil_scoped_acquire guard;

kernel = kernel_from_qualname(qualname);
CUDA_CHECK(cudaDeviceSynchronize());
kernel(*call_args); // trigger JIT compile

time_estimate = run_warmup_loop(kernel, call_args, stream,
cc_memory, l2_clear_size, discard_cache,
warmup_seconds);
} catch (...) {
thread_exception = std::current_exception();
}
});
worker.join();
}
PROTECT_RANGE(lo, hi-lo, PROT_NONE);
setup_seccomp(sock, install_notify, lo, hi);

protect_range(reinterpret_cast<void*>(lo), hi - lo, PROT_READ | PROT_WRITE);
nb::callable kernel = kernel_from_qualname(qualname);
CUDA_CHECK(cudaDeviceSynchronize());
kernel(*call_args); // trigger JIT compile

time_estimate = run_warmup_loop(kernel, call_args, stream,
cc_memory, l2_clear_size, discard_cache,
warmup_seconds);

PROTECT_RANGE(lo, hi - lo, PROT_READ | PROT_WRITE);
mSupervisorSock = -1;
nvtx_pop();

if (thread_exception)
std::rethrow_exception(thread_exception);

return kernel;
}

void BenchmarkManager::randomize_before_test(int num_calls, std::mt19937& rng, cudaStream_t stream) {
// pick a random spot for the unsigned
// initialize the whole area with random junk; the error counter
// will be shifted by the initial value, so just writing zero
// won't result in passing the tests.
std::uniform_int_distribution<std::ptrdiff_t> dist(0, ArenaSize / sizeof(unsigned) - 1);
std::uniform_int_distribution<unsigned> noise_generator(0, std::numeric_limits<unsigned>::max());
std::vector<unsigned> noise(ArenaSize / sizeof(unsigned));
std::generate(noise.begin(), noise.end(), [&]() -> unsigned { return noise_generator(rng); });
CUDA_CHECK(cudaMemcpyAsync(mDeviceErrorBase, noise.data(), noise.size() * sizeof(unsigned), cudaMemcpyHostToDevice, stream));
std::ptrdiff_t offset = dist(rng);
mDeviceErrorCounter = mDeviceErrorBase + offset;
mErrorCountShift = noise.at(offset);

// create a randomized order for running the tests
mTestOrder.resize(num_calls);
std::iota(mTestOrder.begin(), mTestOrder.end(), 1);
std::shuffle(mTestOrder.begin(), mTestOrder.end(), rng);
}

void BenchmarkManager::do_bench_py(
const std::string& kernel_qualname,
const std::vector<nb::tuple>& args,
Expand Down Expand Up @@ -472,25 +467,13 @@ void BenchmarkManager::do_bench_py(
"meaningful benchmark numbers: " + std::to_string(time_estimate));
}

// pick a random spot for the unsigned
// initialize the whole area with random junk; the error counter
// will be shifted by the initial value, so just writing zero
// won't result in passing the tests.
std::random_device rd;
std::mt19937 rng(rd());
std::uniform_int_distribution<std::ptrdiff_t> dist(0, ArenaSize / sizeof(unsigned) - 1);
std::uniform_int_distribution<unsigned> noise_generator(0, std::numeric_limits<unsigned>::max());
std::vector<unsigned> noise(ArenaSize / sizeof(unsigned));
std::generate(noise.begin(), noise.end(), [&]() -> unsigned { return noise_generator(rng); });
CUDA_CHECK(cudaMemcpyAsync(mDeviceErrorBase, noise.data(), noise.size() * sizeof(unsigned), cudaMemcpyHostToDevice, stream));
std::ptrdiff_t offset = dist(rng);
mDeviceErrorCounter = mDeviceErrorBase + offset;
mErrorCountShift = noise.at(offset);

// create a randomized order for running the tests
mTestOrder.resize(actual_calls);
std::iota(mTestOrder.begin(), mTestOrder.end(), 1);
std::shuffle(mTestOrder.begin(), mTestOrder.end(), rng);
randomize_before_test(actual_calls, rng, stream);
// from this point on, even the benchmark thread won't write to the arena anymore
PROTECT_RANGE(mArena, BenchmarkManagerArenaSize, PROT_READ);
PROTECT_RANGE(mSignature.page_ptr(), 4096, PROT_NONE); // make the key fully inaccessible

std::uniform_int_distribution<unsigned> check_seed_generator(0, 0xffffffff);

Expand Down Expand Up @@ -540,12 +523,18 @@ void BenchmarkManager::send_report() {
error_count -= mErrorCountShift;

std::string message = build_result_message(mTestOrder, error_count, mMedianEventTime);
PROTECT_RANGE(mSignature.page_ptr(), 4096, PROT_READ);
message = encrypt_message(mSignature.data(), 32, message);
PROTECT_RANGE(mSignature.page_ptr(), 4096, PROT_WRITE);
cleanse(mSignature.data(), 32);
PROTECT_RANGE(mSignature.page_ptr(), 4096, PROT_NONE);
fwrite(message.data(), 1, message.size(), mOutputPipe);
fflush(mOutputPipe);
}

void BenchmarkManager::clean_up() {
PROTECT_RANGE(mArena, BenchmarkManagerArenaSize, PROT_READ | PROT_WRITE);

for (auto& event : mStartEvents) CUDA_CHECK(cudaEventDestroy(event));
for (auto& event : mEndEvents) CUDA_CHECK(cudaEventDestroy(event));
mStartEvents.clear();
Expand Down
7 changes: 4 additions & 3 deletions csrc/manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ struct BenchmarkManagerDeleter {
using BenchmarkManagerPtr = std::unique_ptr<BenchmarkManager, BenchmarkManagerDeleter>;

BenchmarkManagerPtr make_benchmark_manager(
int result_fd, ObfuscatedHexDigest signature, std::uint64_t seed,
int result_fd, const std::vector<char>& signature, std::uint64_t seed,
bool discard, bool nvtx, bool landlock, bool mseal, int supervisor_socket);


Expand All @@ -53,13 +53,13 @@ class BenchmarkManager {
void send_report();
void clean_up();
private:
friend BenchmarkManagerPtr make_benchmark_manager(int result_fd, ObfuscatedHexDigest signature, std::uint64_t seed, bool discard, bool nvtx, bool landlock, bool mseal, int supervisor_socket);
friend BenchmarkManagerPtr make_benchmark_manager(int result_fd, const std::vector<char>& signature, std::uint64_t seed, bool discard, bool nvtx, bool landlock, bool mseal, int supervisor_socket);
friend BenchmarkManagerDeleter;
/// `arena` is the mmap region that owns all memory for this object and its vectors.
/// The BenchmarkManager must have been placement-newed into the front of that region;
/// the rest is used as a monotonic PMR arena for internal vectors.
BenchmarkManager(std::byte* arena, std::size_t arena_size,
int result_fd, ObfuscatedHexDigest signature, std::uint64_t seed,
int result_fd, const std::vector<char>& signature, std::uint64_t seed,
bool discard, bool nvtx, bool landlock, bool mseal, int supervisor_socket);
~BenchmarkManager();

Expand Down Expand Up @@ -135,6 +135,7 @@ class BenchmarkManager {
void setup_test_cases(const std::vector<nb::tuple>& args, const std::vector<nb::tuple>& expected, cudaStream_t stream);

void install_protections();
void randomize_before_test(int num_calls, std::mt19937& rng, cudaStream_t stream);
nb::callable initial_kernel_setup(double& time_estimate, const std::string& qualname, const nb::tuple& call_args, cudaStream_t stream);

[[nodiscard]] std::string build_result_message(const std::pmr::vector<int>& test_order, unsigned error_count, float median_event_time) const;
Expand Down
75 changes: 18 additions & 57 deletions csrc/obfuscate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,48 +18,11 @@
#include <openssl/evp.h>
#include <openssl/rand.h>

constexpr std::size_t PAGE_SIZE = 4096;
constexpr static std::size_t PAGE_SIZE = 4096;

ProtectablePage::ProtectablePage() {
void* page = mmap(nullptr, PAGE_SIZE, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (page == MAP_FAILED) {
throw std::runtime_error("mmap failed");
}
Page = slow_hash(page);
}

ProtectablePage::~ProtectablePage() {
void* page = page_ptr();
if (page) {
if (mprotect(page, PAGE_SIZE, PROT_READ | PROT_WRITE) != 0) {
std::perror("mprotect restore failed in ~ProtectablePage");
}
if (munmap(page, PAGE_SIZE) != 0) {
std::perror("munmap failed in ~ProtectablePage");
}
}
}

ProtectablePage::ProtectablePage(ProtectablePage&& other) noexcept : Page(std::exchange(other.Page, slow_hash((void*)nullptr))){
}

void ProtectablePage::lock() {
void* page = page_ptr();
if (mprotect(page, PAGE_SIZE, PROT_NONE) != 0) {
throw std::system_error(errno, std::generic_category(), "mprotect(PROT_NONE) failed");
}
}

void ProtectablePage::unlock() {
void* page = page_ptr();
if (mprotect(page, PAGE_SIZE, PROT_READ) != 0) {
throw std::system_error(errno, std::generic_category(), "mprotect(PROT_READ) failed");
}
}

void* ProtectablePage::page_ptr() const {
return reinterpret_cast<void*>(slow_unhash(Page));
ObfuscatedHexDigest::ObfuscatedHexDigest(std::pmr::monotonic_buffer_resource* mem) {
void* page = mem->allocate(PAGE_SIZE, PAGE_SIZE);
HashedPagePtr = slow_hash(reinterpret_cast<std::uintptr_t>(page));
}

void ObfuscatedHexDigest::allocate(std::size_t size, std::mt19937& rng) {
Expand All @@ -70,7 +33,7 @@ void ObfuscatedHexDigest::allocate(std::size_t size, std::mt19937& rng) {
throw std::runtime_error("already allocated");
}

fill_random_hex(page_ptr(), PAGE_SIZE, rng);
fill_random_hex(reinterpret_cast<void*>(slow_unhash(HashedPagePtr)), PAGE_SIZE, rng);
const std::uintptr_t max_offset = PAGE_SIZE - size - 1;
std::uniform_int_distribution<std::uintptr_t> offset_dist(0, max_offset);

Expand All @@ -79,8 +42,12 @@ void ObfuscatedHexDigest::allocate(std::size_t size, std::mt19937& rng) {
HashedLen = slow_hash(size ^ offset);
}

const void* ObfuscatedHexDigest::page_ptr() const {
return reinterpret_cast<const void*>(slow_unhash(HashedPagePtr));
}

char* ObfuscatedHexDigest::data() {
return reinterpret_cast<char*>(page_ptr()) + slow_unhash(HashedOffset);
return reinterpret_cast<char*>(slow_unhash(HashedPagePtr)) + slow_unhash(HashedOffset);
}

std::size_t ObfuscatedHexDigest::size() const {
Expand Down Expand Up @@ -115,20 +82,15 @@ std::uintptr_t slow_unhash(std::uintptr_t p, int rounds) {
return p;
}

std::string encrypt_message(void* key, size_t keyLen, const std::string& plaintext)
void cleanse(void* ptr, size_t size) {
OPENSSL_cleanse(ptr, size);
}

std::string encrypt_message(const char* key, size_t keyLen, const std::string& plaintext)
{
if (keyLen != 32)
throw std::invalid_argument("encrypt_message: key must be exactly 32 bytes for AES-256");

struct Cleanse
{
void* key;
size_t keyLen;
~Cleanse() {
OPENSSL_cleanse(key, keyLen);
}
} cleanse_guard{key, keyLen};

constexpr int NONCE_LEN = 12;
constexpr int TAG_LEN = 16;

Expand All @@ -142,9 +104,8 @@ std::string encrypt_message(void* key, size_t keyLen, const std::string& plainte
struct CtxGuard { EVP_CIPHER_CTX* c; ~CtxGuard() { EVP_CIPHER_CTX_free(c); } } guard{ctx};

if (EVP_EncryptInit_ex(ctx, EVP_aes_256_gcm(), nullptr, nullptr, nullptr) != 1 ||
EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, NONCE_LEN, nullptr) != 1 ||
EVP_EncryptInit_ex(ctx, nullptr, nullptr,
static_cast<const unsigned char*>(key), nonce) != 1)
EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, NONCE_LEN, nullptr) != 1 ||
EVP_EncryptInit_ex(ctx, nullptr, nullptr, reinterpret_cast<const unsigned char*>(key), nonce) != 1)
{
throw std::runtime_error("encrypt_message: GCM init failed");
}
Expand Down Expand Up @@ -173,4 +134,4 @@ std::string encrypt_message(void* key, size_t keyLen, const std::string& plainte
packet.append(reinterpret_cast<char*>(ciphertext.data()), out_len + final_len);

return packet;
}
}
Loading
Loading