diff --git a/Cargo.toml b/Cargo.toml index 9452e91..1187332 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,6 +22,7 @@ libc = "0.2" [dev-dependencies] criterion = { version = "0.5", features = ["html_reports"] } +rayon = "1" [[bench]] name = "alloc_throughput" diff --git a/README.md b/README.md index 558f131..ca2f597 100644 --- a/README.md +++ b/README.md @@ -16,15 +16,50 @@ static ALLOC: ZkAllocator = ZkAllocator; fn main() { loop { - zk_alloc::begin_phase(); // activate arena, reset slabs - let proof = generate_proof(); // all allocs go to arena - zk_alloc::end_phase(); // deactivate arena - let output = proof.clone(); // clone out before next reset + let proof = zk_alloc::phase(|| generate_proof()); // arena on inside + let output = proof.clone(); // detach to System submit(output); } } ``` +`phase(|| { ... })` activates the arena, runs the closure, and deactivates +on return — including during panic unwinding (it's an RAII wrapper around +`begin_phase()` / `end_phase()`, which are also exposed for callers that +need finer-grained control). + +### Two-allocator model + +`ZkAllocator` routes each request to one of two backends: + +- **Arena** — bump-pointer slab, used during an active phase for allocations + ≥ `ZK_ALLOC_MIN_BYTES` (default 4096). Reset on the next `begin_phase()`. +- **System** — `glibc malloc`, used for everything else: allocations made + outside any phase, allocations under the size-routing threshold (small + library bookkeeping like rayon's injector blocks, tracing-subscriber + registry slots, hashbrown HashMap entries), and `realloc` of any pointer + that originated in System (sticky-System routing — System allocations + never silently migrate to arena on growth). + +### Phase-scoping contract + +Allocations made during phase N must not be held past `begin_phase()` of +phase N+1 — that call recycles the slab, and the next allocation at the +same offset overwrites the retained bytes. In practice: + +1. 
Drop or `clone()` arena-allocated values before the phase ends. +2. Construct long-lived state (thread pools, channels, registries) *before* + any phase begins so it lives in System. +3. Use `phase(|| { ... })` (or a `PhaseGuard`) instead of paired calls so + the phase ends correctly even on panic. + +### Environment variables + +| Variable | Default | Effect | +|----------|---------|--------| +| `ZK_ALLOC_SLAB_GB` | `8` | Per-thread slab size, in GiB. Raise for workloads that overflow (`overflow_stats()` reports the count). | +| `ZK_ALLOC_MIN_BYTES` | `4096` | Size-routing threshold. Allocations smaller than this go to System even during a phase. Set to `0` to send everything to arena (loses size-routing protection against library-internal pooled allocations). | + ## Results | Prover | Architecture | vs glibc | Mechanism | diff --git a/src/lib.rs b/src/lib.rs index db2903d..ec7b4fd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,17 +1,66 @@ //! Bump-pointer arena allocator for ZK proving workloads. //! -//! One mmap region split into per-thread slabs. Allocation = increment a thread-local -//! pointer; free = no-op. `begin_phase()` resets the arena: each thread's next -//! allocation starts over at the beginning of its slab, overwriting the previous -//! phase's data. Allocations that don't fit (too large, or beyond max threads) fall -//! back to the system allocator. +//! # Two-allocator model +//! +//! `ZkAllocator` is a façade over two allocators selected per call: +//! +//! - **Arena**: one `mmap` region split into per-thread slabs. Allocation +//! bumps a thread-local pointer; `dealloc` is a no-op. `begin_phase()` +//! resets every slab so the next phase reuses the same physical pages. +//! - **System**: `std::alloc::System` (glibc on Linux). Used for everything +//! the arena shouldn't hold: +//! - any allocation when no phase is active; +//! - any allocation smaller than [`min_arena_bytes()`] even during a phase +//! 
(size-routing — keeps small library bookkeeping outside the arena); +//! - oversize allocations or threads that arrived after slabs were claimed +//! ([`overflow_stats()`] reports these); +//! - regrowth via `realloc` of a pointer that was already in System +//! (sticky-System routing — System allocations don't migrate to arena +//! on growth, even if the new size exceeds the size-routing threshold). +//! +//! # Phase scoping contract +//! +//! `begin_phase()` activates the arena and resets every slab. `end_phase()` +//! deactivates the arena. Allocations made during phase N must not be held +//! past `begin_phase()` of phase N+1: that call recycles the slab, and the +//! next allocation at the same offset will silently overwrite the retained +//! bytes. +//! +//! Practical rules: +//! +//! 1. Drop or `clone()` arena-allocated values before the phase ends. +//! 2. Use [`PhaseGuard`] / [`phase`] to ensure `end_phase` runs even on +//! panic — without it, an unwinding phase leaves the arena active and +//! subsequent "post-phase" allocations land in arena territory. +//! 3. Keep long-lived state (thread pools, channels, registries, caches) +//! constructed *outside* any active phase so it lives in System. +//! +//! # Realloc migration: prevented +//! +//! `realloc` checks whether the input pointer lies in the arena region. +//! If it does, growth goes through the normal arena path (subject to +//! size-routing). If it does not, growth stays in System via +//! `System::realloc` — preventing the failure mode where a System-backed +//! `Vec` silently migrates into the arena on `push`. +//! +//! # Configuration +//! +//! - `ZK_ALLOC_SLAB_GB` — per-thread slab size in GiB (default `8`). +//! - `ZK_ALLOC_MIN_BYTES` — size-routing threshold in bytes (default `4096`). +//! Set to `0` to send every active-phase allocation to the arena. +//! +//! # Example //! //! ```ignore +//! use zk_alloc::ZkAllocator; +//! +//! #[global_allocator] +//! 
static ALLOC: ZkAllocator = ZkAllocator; +//! //! loop { -//! begin_phase(); // arena ON; slabs reset lazily -//! let res = heavy_work(); // fast bump increments -//! end_phase(); // arena OFF; new allocations go to System -//! let copy = res.clone(); // detach from arena before next phase resets it +//! let proof = zk_alloc::phase(|| heavy_work()); // arena on inside +//! let output = proof.clone(); // detach into System +//! submit(output); //! } //! ``` @@ -22,12 +71,16 @@ use std::sync::Once; mod syscall; -const SLAB_SIZE: usize = 8 << 30; // 8GB +const DEFAULT_SLAB_GB: usize = 8; const SLACK: usize = 4; #[derive(Debug)] pub struct ZkAllocator; +/// Per-thread slab size in bytes. Set once during `ensure_region()` from the +/// `ZK_ALLOC_SLAB_GB` environment variable (default: 8). +static SLAB_SIZE: AtomicUsize = AtomicUsize::new(0); + /// Incremented by `begin_phase()`. Every thread caches the last value it saw in /// `ARENA_GEN`; when they differ, the thread resets its allocation cursor to the start /// of its slab on the next allocation. This is how a single store on the main thread @@ -59,6 +112,19 @@ static MAX_THREADS: AtomicUsize = AtomicUsize::new(0); static OVERFLOW_COUNT: AtomicUsize = AtomicUsize::new(0); static OVERFLOW_BYTES: AtomicUsize = AtomicUsize::new(0); +/// Allocations smaller than this go to System even during active phases. +/// Routes registry / hashmap / injector-block-sized allocations away from +/// the arena, so library state that outlives a phase doesn't land in +/// recycled memory. +/// +/// Defaults to 4096 (one page) — covers the known phase-crossing patterns: +/// crossbeam_deque::Injector blocks (~1.5 KB), tracing-subscriber Registry +/// slot data (sub-KB), hashbrown HashMap entries (sub-KB), rayon-core job +/// stack frames (sub-KB). Set ZK_ALLOC_MIN_BYTES=0 to disable, or override +/// to a different threshold. 
+const DEFAULT_MIN_ARENA_BYTES: usize = 4096;
+static MIN_ARENA_BYTES: AtomicUsize = AtomicUsize::new(DEFAULT_MIN_ARENA_BYTES);
+
 thread_local! {
     /// Where this thread's next allocation lands. Advanced past each allocation.
     static ARENA_PTR: Cell<usize> = const { Cell::new(0) };
@@ -74,11 +140,24 @@ thread_local! {
 
 fn ensure_region() -> usize {
     REGION_INIT.call_once(|| {
+        let slab_gb = std::env::var("ZK_ALLOC_SLAB_GB")
+            .ok()
+            .and_then(|s| s.parse::<usize>().ok())
+            .unwrap_or(DEFAULT_SLAB_GB);
+        let slab_size = slab_gb << 30;
+        SLAB_SIZE.store(slab_size, Ordering::Release);
+
+        if let Ok(s) = std::env::var("ZK_ALLOC_MIN_BYTES") {
+            if let Ok(n) = s.parse::<usize>() {
+                MIN_ARENA_BYTES.store(n, Ordering::Release);
+            }
+        }
+
         let cpus = std::thread::available_parallelism()
             .map(|n| n.get())
             .unwrap_or(8);
         let max_threads = cpus + SLACK;
-        let region_size = SLAB_SIZE * max_threads;
+        let region_size = slab_size * max_threads;
 
         // SAFETY: mmap_anonymous returns a page-aligned pointer or null.
         // MAP_NORESERVE means no physical memory is committed until pages are touched.
@@ -96,7 +175,27 @@ fn ensure_region() -> usize {
 
 /// Activates the arena and resets every thread's slab. All allocations until the next
 /// `end_phase()` go to the arena; the previous phase's data is overwritten in place.
+///
+/// ## Retention is unsafe
+///
+/// Allocations made during phase N that are still held when phase N+1 begins
+/// are silently overwritten by phase N+1's first allocations at the same slab
+/// offset. Any of the following held across `begin_phase()` will be corrupted:
+///
+/// - `Vec` with capacity ≥ [`min_arena_bytes()`] (`push` triggers `realloc`
+///   that copies from now-recycled source memory).
+/// - `Arc` / `Rc` with payload ≥ [`min_arena_bytes()`] (refcount fields
+///   become arbitrary bytes — silent leak or use-after-free).
+/// - `HashMap`, `BTreeMap`, etc. with bucket allocation ≥ [`min_arena_bytes()`]
+///   (lookup may infinite-loop on corrupted ctrl bytes).
+/// - `Box<dyn Trait>` with backing data ≥ [`min_arena_bytes()`] (vtable
+///   dispatch survives but field reads return filler bytes).
+///
+/// To preserve data across phases, `clone()` it into a System-backed copy
+/// (e.g., wrap in `Box::leak(Box::new(...))` while ARENA_ACTIVE is false,
+/// or copy into a `Vec` allocated outside any phase).
 pub fn begin_phase() {
+    ensure_region();
     GENERATION.fetch_add(1, Ordering::Release);
     ARENA_ACTIVE.store(true, Ordering::Release);
 }
@@ -127,6 +226,53 @@ fn flush_rayon() {
     }
 }
 
+/// RAII guard for an arena phase. Calls `begin_phase()` on construction and
+/// `end_phase()` on drop — including during panic unwinding. Use this in
+/// place of paired `begin_phase()`/`end_phase()` calls when the phase body
+/// can panic, to avoid leaving the arena active across the unwind.
+///
+/// ```ignore
+/// loop {
+///     let _guard = zk_alloc::PhaseGuard::new();
+///     heavy_work_that_might_panic();
+///     // _guard drops here on normal return AND on unwind
+/// }
+/// ```
+pub struct PhaseGuard {
+    _private: (),
+}
+
+impl PhaseGuard {
+    /// Begins a phase. The phase ends when the returned guard is dropped.
+    pub fn new() -> Self {
+        begin_phase();
+        Self { _private: () }
+    }
+}
+
+impl Default for PhaseGuard {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl Drop for PhaseGuard {
+    fn drop(&mut self) {
+        end_phase();
+    }
+}
+
+/// Runs `f` inside a phase. Equivalent to constructing a `PhaseGuard`,
+/// running `f`, and dropping the guard. Panics in `f` propagate, but the
+/// phase is guaranteed to end before unwinding leaves this function.
+pub fn phase<F, R>(f: F) -> R
+where
+    F: FnOnce() -> R,
+{
+    let _guard = PhaseGuard::new();
+    f()
+}
+
 /// Returns (overflow_count, overflow_bytes) — allocations that fell through to System
 /// because they exceeded the slab or arrived after all slabs were claimed.
pub fn overflow_stats() -> (usize, usize) { @@ -141,6 +287,17 @@ pub fn reset_overflow_stats() { OVERFLOW_BYTES.store(0, Ordering::Relaxed); } +/// Returns the per-thread slab size in bytes. Zero before the first `begin_phase()`. +pub fn slab_size() -> usize { + SLAB_SIZE.load(Ordering::Relaxed) +} + +/// Returns the minimum allocation size routed through the arena. Allocations +/// smaller than this go to System even during active phases. +pub fn min_arena_bytes() -> usize { + MIN_ARENA_BYTES.load(Ordering::Relaxed) +} + #[cold] #[inline(never)] unsafe fn arena_alloc_cold(size: usize, align: usize) -> *mut u8 { @@ -157,9 +314,10 @@ unsafe fn arena_alloc_cold(size: usize, align: usize) -> *mut u8 { std::alloc::System.alloc(Layout::from_size_align_unchecked(size, align)) }; } - base = region + idx * SLAB_SIZE; + let slab_size = SLAB_SIZE.load(Ordering::Relaxed); + base = region + idx * slab_size; ARENA_BASE.set(base); - ARENA_END.set(base + SLAB_SIZE); + ARENA_END.set(base + slab_size); } ARENA_PTR.set(base); ARENA_GEN.set(generation); @@ -184,6 +342,14 @@ unsafe impl GlobalAlloc for ZkAllocator { #[inline(always)] unsafe fn alloc(&self, layout: Layout) -> *mut u8 { if ARENA_ACTIVE.load(Ordering::Relaxed) { + // Small allocs bypass arena: registry slots / HashMap entries / + // injector-block-sized allocations from rayon/tracing libraries + // commonly outlive a phase. Routing them to System keeps them + // safe across begin_phase()/end_phase() boundaries. 
+ let min_bytes = MIN_ARENA_BYTES.load(Ordering::Relaxed); + if min_bytes != 0 && layout.size() < min_bytes { + return unsafe { std::alloc::System.alloc(layout) }; + } let generation = GENERATION.load(Ordering::Relaxed); if ARENA_GEN.get() == generation { let ptr = ARENA_PTR.get(); @@ -215,6 +381,18 @@ unsafe impl GlobalAlloc for ZkAllocator { if new_size <= layout.size() { return ptr; } + // Sticky-System routing: if the original allocation came from System + // (small, or pre-phase, or routed by size-routing), keep the grown + // allocation in System too. Without this, a Vec allocated outside + // a phase that grows inside one would silently migrate into the + // arena and become subject to phase recycling. + let addr = ptr as usize; + let base = REGION_BASE.load(Ordering::Relaxed); + let region_size = REGION_SIZE.load(Ordering::Relaxed); + let in_arena = base != 0 && addr >= base && addr < base + region_size; + if !in_arena { + return unsafe { std::alloc::System.realloc(ptr, layout, new_size) }; + } let new_layout = unsafe { Layout::from_size_align_unchecked(new_size, layout.align()) }; let new_ptr = unsafe { self.alloc(new_layout) }; if !new_ptr.is_null() { diff --git a/tests/test_crossbeam_epoch.rs b/tests/test_crossbeam_epoch.rs new file mode 100644 index 0000000..eb3f450 --- /dev/null +++ b/tests/test_crossbeam_epoch.rs @@ -0,0 +1,78 @@ +//! Scenario 1: empirical test for crossbeam-epoch deferred garbage. +//! +//! crossbeam-deque uses crossbeam-epoch to defer-deallocate retired Buffers. +//! Each thread keeps a Local with a list of Bag nodes (~1.5 KB +//! each). Bag nodes themselves are heap-allocated; if allocated during a +//! phase, they live in the arena slab. If the slab is recycled before +//! crossbeam-epoch processes the bag, walking the garbage list reads +//! recycled bytes → silent corruption or crash inside crossbeam. +//! +//! F6 (source audit) hypothesized this is covered by size-routing (Bags < +//! 4 KB go to System). 
Empirical test: drive many Buffer resizes during a +//! phase to retire many objects to crossbeam-epoch, cross a phase boundary, +//! drive more retires, and assert program integrity over many cycles. + +use rayon::prelude::*; + +#[global_allocator] +static A: zk_alloc::ZkAllocator = zk_alloc::ZkAllocator; + +/// Force per-worker crossbeam-deque buffer growth via deep recursion. Each +/// growth retires the prior buffer to crossbeam-epoch. +fn nested_join(depth: usize) { + if depth == 0 { + return; + } + rayon::join(|| nested_join(depth - 1), || {}); +} + +#[test] +fn crossbeam_epoch_garbage_survives_phase_cycles() { + let _: u64 = (0..1_000_000_u64).into_par_iter().sum(); + + const CYCLES: usize = 50; + for _ in 0..CYCLES { + // Phase 1: drive buffer growth → retire old buffers to epoch garbage. + // Depth 1024 → buffer grows 32 → 64 → 128 → 256 → 512 → 1024 → 2048 + // (six resizes per worker that participates). + zk_alloc::begin_phase(); + rayon::join(|| nested_join(1024), || {}); + zk_alloc::end_phase(); + + // Phase 2: drive more growth + epoch participation. If a Bag from + // phase 1 was allocated in arena and its slab was recycled, this + // would crash inside crossbeam-epoch's collect(). + zk_alloc::begin_phase(); + rayon::join(|| nested_join(1024), || {}); + zk_alloc::end_phase(); + } + + eprintln!( + "crossbeam_epoch_garbage_survives_phase_cycles: {CYCLES} cycles OK (MIN_ARENA_BYTES={})", + zk_alloc::min_arena_bytes() + ); +} + +/// par_iter with collect — drives crossbeam-channel + crossbeam-deque +/// allocations through normal rayon usage. Used to confirm typical +/// rayon-heavy workloads survive 100 cycles. 
+#[test] +fn crossbeam_in_par_iter_collect_survives_cycles() { + let _: u64 = (0..1_000_000_u64).into_par_iter().sum(); + + for _ in 0..100 { + zk_alloc::begin_phase(); + let v: Vec = (0..4096_u64) + .into_par_iter() + .map(|i| { + let mut acc = 0u64; + for j in 0..32 { + acc = acc.wrapping_add((i * j) ^ 0xDEADBEEF); + } + acc + }) + .collect(); + std::hint::black_box(v); + zk_alloc::end_phase(); + } +} diff --git a/tests/test_panic_phase.rs b/tests/test_panic_phase.rs new file mode 100644 index 0000000..c03c1b1 --- /dev/null +++ b/tests/test_panic_phase.rs @@ -0,0 +1,64 @@ +//! Scenario 3: panic unwinding through a phase boundary. +//! +//! There is no RAII guard around begin_phase()/end_phase(). If a panic +//! propagates out of phase code without reaching end_phase(), ARENA_ACTIVE +//! stays true. Subsequent "post-phase" allocations land in arena and get +//! silently recycled on the next begin_phase(). +//! +//! This is a plain API hazard: the recovery path of any prove_with_panic +//! pattern is unsafe. + +#[global_allocator] +static A: zk_alloc::ZkAllocator = zk_alloc::ZkAllocator; + +#[test] +fn panic_inside_phase_leaves_arena_active() { + use std::panic; + + // Suppress default panic print to minimize incidental allocations between + // the panic and our observation point. + panic::set_hook(Box::new(|_| {})); + let _ = vec![0u8; 1024]; // warm up + + zk_alloc::begin_phase(); + let r = panic::catch_unwind(panic::AssertUnwindSafe(|| panic!("simulated"))); + assert!(r.is_err()); + // No end_phase reached. ARENA_ACTIVE is still true. + + // This Vec lands in arena (since arena is still active and 8192 >= + // MIN_ARENA_BYTES default 4096). + let post_panic: Vec = vec![0xCC; 8192]; + let post_panic_ptr = post_panic.as_ptr() as usize; + + // Begin the next phase (e.g., next iteration of a prove loop). Arena + // resets — anything allocated during the "ghost" phase between panic + // and now gets recycled. 
+ zk_alloc::begin_phase(); + // Span enough of the slab to cover post_panic's offset, regardless of + // how many small bumps the panic introduced. + let big: Vec = vec![0x33; 1 << 20]; + let big_ptr = big.as_ptr() as usize; + let big_end = big_ptr + big.len(); + zk_alloc::end_phase(); + + let _ = panic::take_hook(); + + let in_big_range = post_panic_ptr >= big_ptr && post_panic_ptr < big_end; + let observed = post_panic[0]; + + eprintln!( + "post_panic_ptr=0x{post_panic_ptr:x} big=[0x{big_ptr:x}, 0x{big_end:x}); \ + in_range={in_big_range} observed=0x{observed:02x}" + ); + + assert!( + in_big_range, + "post-panic Vec didn't land in arena's slab — test layout assumption broken" + ); + assert_eq!( + observed, 0x33, + "expected post-panic Vec contents to be recycled by next begin_phase \ + (arena was still active after the panic) — got 0x{observed:02x}" + ); + eprintln!("BUG REPRODUCED: panic without end_phase leaves arena active; post-panic allocations recycled silently."); +} diff --git a/tests/test_phase_guard.rs b/tests/test_phase_guard.rs new file mode 100644 index 0000000..a914815 --- /dev/null +++ b/tests/test_phase_guard.rs @@ -0,0 +1,96 @@ +//! Verify that PhaseGuard / phase() makes F17 (panic leaves arena active) +//! impossible by construction. Drop runs during unwind, calling end_phase. +//! +//! Mirrors test_panic_phase but uses the RAII API. Asserts NO corruption. +//! +//! All three tests in this binary touch the global ARENA_ACTIVE / bump +//! pointer state, so they must not run concurrently — the panic-handler +//! hook is also process-global. Serialize via a file-local mutex. 
+ +static PHASE_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); + +#[global_allocator] +static A: zk_alloc::ZkAllocator = zk_alloc::ZkAllocator; + +#[test] +fn phase_guard_runs_end_phase_on_panic() { + let _lock = PHASE_LOCK.lock().unwrap(); + use std::panic; + + panic::set_hook(Box::new(|_| {})); + let _ = vec![0u8; 1024]; + + // Mirror of test_panic_phase::panic_inside_phase_leaves_arena_active. + // Use phase() / PhaseGuard around the panic — the guard's Drop ends + // the phase during unwind. + let r = panic::catch_unwind(panic::AssertUnwindSafe(|| { + zk_alloc::phase(|| panic!("simulated")) + })); + assert!(r.is_err()); + + // Arena should now be inactive — this large allocation should land in + // System, not arena. + let post_panic: Vec = vec![0xCC; 8192]; + let post_panic_ptr = post_panic.as_ptr() as usize; + + // Begin a new phase + 1 MB filler. If the previous phase was correctly + // ended, post_panic is in System and won't be recycled. The filler + // lands somewhere in arena slab+0 — but post_panic_ptr is NOT in arena. + zk_alloc::phase(|| { + let big: Vec = vec![0x33; 1 << 20]; + let big_ptr = big.as_ptr() as usize; + let big_end = big_ptr + big.len(); + let in_big_range = post_panic_ptr >= big_ptr && post_panic_ptr < big_end; + eprintln!( + "post_panic_ptr=0x{post_panic_ptr:x} big=[0x{big_ptr:x}, 0x{big_end:x}) \ + in_range={in_big_range}" + ); + // post_panic should NOT be in arena range (it was allocated when + // ARENA_ACTIVE=false because PhaseGuard's Drop ran during the unwind). + assert!( + !in_big_range, + "PhaseGuard didn't run end_phase during unwind — post_panic landed in arena" + ); + }); + + let _ = panic::take_hook(); + + // Verify post_panic's contents are pristine. 
+ assert!( + post_panic.iter().all(|&b| b == 0xCC), + "post_panic was corrupted; PhaseGuard didn't end the phase on panic" + ); + eprintln!("PhaseGuard fix verified: panic unwound through phase, end_phase ran, post-panic Vec safe in System"); +} + +#[test] +fn phase_guard_runs_end_phase_on_normal_return() { + let _lock = PHASE_LOCK.lock().unwrap(); + let v = zk_alloc::phase(|| vec![0xAB_u8; 8192]); + // After phase, arena is inactive. Subsequent allocations go to System. + let after: Vec = vec![0xCD_u8; 8192]; + + // Begin another phase + filler. `after` should not be recycled (it's in System). + zk_alloc::phase(|| { + let _filler: Vec = vec![0x77_u8; 1 << 20]; + }); + + assert!( + after.iter().all(|&b| b == 0xCD), + "after-phase Vec was corrupted" + ); + // v is in arena from the first phase; it MAY be corrupted by phase 2. + // That's the F16 family — not what this test is about. We don't assert + // on v. + std::hint::black_box(v); +} + +#[test] +fn nested_phase_guards_compose() { + let _lock = PHASE_LOCK.lock().unwrap(); + // Outer phase + inner phase. Inner phase end_phases (sets active=false), + // then outer phase end_phases again. Sequence: begin, begin, end, end. + // Final state: active=false. No panic. + let result = zk_alloc::phase(|| zk_alloc::phase(|| 42_u64)); + assert_eq!(result, 42); +} diff --git a/tests/test_rayon.rs b/tests/test_rayon.rs new file mode 100644 index 0000000..eeefdd8 --- /dev/null +++ b/tests/test_rayon.rs @@ -0,0 +1,42 @@ +//! Reproducer for the rayon/zk-alloc interaction bug documented in +//! leanMultisig commit f5e2299b. Pulls Tom's regression test verbatim and +//! adds a few stress variants to characterize how reliably the bug fires. +//! +//! Mechanism: +//! 1. rayon::join from a non-worker thread routes through the global +//! `crossbeam_deque::Injector`, which is a linked list of fixed-size +//! blocks (BLOCK_CAP = 63 slots). +//! 2. If a fresh injector block is allocated *during* an arena phase, +//! 
the block lives in the arena slab.
+//! 3. The next `begin_phase()` recycles the slab. Rayon still holds a
+//!    pointer to that block; the next push writes a JobRef over whatever
+//!    the application has allocated on top — silent corruption.
+//!
+//! These tests use #[global_allocator] so that rayon's allocations route
+//! through ZkAllocator (otherwise they go to the system allocator and
+//! can't be corrupted).
+
+use rayon::prelude::*;
+
+#[global_allocator]
+static A: zk_alloc::ZkAllocator = zk_alloc::ZkAllocator;
+
+/// Tom's original MRE.
+#[test]
+fn rayon_does_not_corrupt_zkalloc() {
+    let _: u64 = (0..1_000_000_u64).into_par_iter().sum();
+
+    zk_alloc::begin_phase();
+    for _ in 0..200 {
+        rayon::join(|| {}, || {});
+    }
+    zk_alloc::end_phase();
+
+    zk_alloc::begin_phase();
+    let canary = vec![0xAB_u8; 8192];
+    rayon::join(|| {}, || {});
+    zk_alloc::end_phase();
+
+    let pos = canary.iter().position(|&b| b != 0xAB);
+    assert!(pos.is_none(), "canary corrupted at offset {}", pos.unwrap());
+}
diff --git a/tests/test_size_routing_stress.rs b/tests/test_size_routing_stress.rs
new file mode 100644
index 0000000..1b59c09
--- /dev/null
+++ b/tests/test_size_routing_stress.rs
@@ -0,0 +1,47 @@
+//! Stress test for the size-routing fix (ZK_ALLOC_MIN_BYTES). Drives many
+//! phase cycles with rayon::join from main thread + canaries, to validate
+//! that the fix holds at scale (not just the 3-iter Plonky3 example).
+//!
+//! Size-routing is ON by default (MIN_ARENA_BYTES = 4096), so a plain
+//! `cargo test --release --test test_size_routing_stress -- --nocapture`
+//! exercises the fix. Run with `ZK_ALLOC_MIN_BYTES=0` to disable the fix;
+//! the bug is then expected to reproduce and this test to fail.
+ +use rayon::prelude::*; + +#[global_allocator] +static A: zk_alloc::ZkAllocator = zk_alloc::ZkAllocator; + +#[test] +fn many_phase_cycles_with_canaries() { + let _: u64 = (0..1_000_000_u64).into_par_iter().sum(); + + const CYCLES: usize = 100; + let mut failures = 0; + for cycle in 0..CYCLES { + zk_alloc::begin_phase(); + for _ in 0..200 { + rayon::join(|| {}, || {}); + } + zk_alloc::end_phase(); + + zk_alloc::begin_phase(); + let canary = vec![0xC1_u8; 65536]; + rayon::join(|| {}, || {}); + zk_alloc::end_phase(); + + if let Some(pos) = canary.iter().position(|&b| b != 0xC1) { + eprintln!("cycle {cycle}: canary corrupted at offset {pos}"); + failures += 1; + } + } + eprintln!("many_phase_cycles_with_canaries: {failures}/{CYCLES} corrupted"); + + // Default MIN_ARENA_BYTES is 4096 (size-routing fix on by default). + // With ZK_ALLOC_MIN_BYTES=0, fix is disabled and bug should reproduce. + let min_bytes_active = zk_alloc::min_arena_bytes() >= 256; + if min_bytes_active { + assert_eq!(failures, 0, "fix should prevent ALL corruption"); + } else { + assert!(failures > 0, "without fix, bug should reproduce"); + } +}