diff --git a/Cargo.toml b/Cargo.toml
index 9452e91..8a6d121 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,6 +22,9 @@ libc = "0.2"
 
 [dev-dependencies]
 criterion = { version = "0.5", features = ["html_reports"] }
+rayon = "1"
+tracing = "0.1"
+tracing-subscriber = { version = "0.3", features = ["env-filter", "registry"] }
 
 [[bench]]
 name = "alloc_throughput"
diff --git a/README.md b/README.md
index 558f131..9c24b3a 100644
--- a/README.md
+++ b/README.md
@@ -16,15 +16,51 @@ static ALLOC: ZkAllocator = ZkAllocator;
 
 fn main() {
     loop {
-        zk_alloc::begin_phase();             // activate arena, reset slabs
-        let proof = generate_proof();        // all allocs go to arena
-        zk_alloc::end_phase();               // deactivate arena
-        let output = proof.clone();          // clone out before next reset
+        let proof = zk_alloc::phase(|| generate_proof()); // arena on inside
+        let output = proof.clone();                       // detach to System
         submit(output);
     }
 }
 ```
 
+`phase(|| { ... })` activates the arena, runs the closure, and deactivates
+on return — including during panic unwinding (it's an RAII wrapper around
+`begin_phase()` / `end_phase()`, which are also exposed for callers that
+need finer-grained control).
+
+### Two-allocator model
+
+`ZkAllocator` routes each request to one of two backends:
+
+- **Arena** — bump-pointer slab, used during an active phase for allocations
+  ≥ `ZK_ALLOC_MIN_BYTES` (default 4096). Reset on the next `begin_phase()`.
+- **System** — `glibc malloc`, used for everything else: allocations made
+  outside any phase, allocations under the size-routing threshold (small
+  library bookkeeping like rayon's injector blocks, tracing-subscriber
+  registry slots, hashbrown HashMap entries), and `realloc` of any pointer
+  that originated in System (sticky-System routing — System allocations
+  never silently migrate to arena on growth).
+
+### Phase-scoping contract
+
+Allocations made during phase N must not be held past `begin_phase()` of
+phase N+1 — that call recycles the slab, and the next allocation at the
+same offset overwrites the retained bytes. In practice:
+
+1. Drop or `clone()` arena-allocated values before the phase ends.
+2. Construct long-lived state (thread pools, channels, registries) *before*
+   any phase begins so it lives in System.
+3. Use `phase(|| { ... })` (or a `PhaseGuard`) instead of paired calls so
+   the phase ends correctly even on panic.
+
+### Environment variables
+
+| Variable | Default | Effect |
+|----------|---------|--------|
+| `ZK_ALLOC_SLAB_GB` | `8` | Per-thread slab size, in GiB. Raise for workloads that overflow (`overflow_stats()` reports the count). |
+| `ZK_ALLOC_MIN_BYTES` | `4096` | Size-routing threshold. Allocations smaller than this go to System even during a phase. Set to `0` to send everything to arena (loses size-routing protection against library-internal pooled allocations). |
+| `ZK_ALLOC_POISON_RESET` | unset | Diagnostic. Set to `1` to `MADV_DONTNEED` the previous phase's pages on reset, so any stale-pointer read returns zero pages instead of last-phase data. |
+
 ## Results
 
 | Prover | Architecture | vs glibc | Mechanism |
diff --git a/src/lib.rs b/src/lib.rs
index db2903d..190ac85 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,17 +1,69 @@
 //! Bump-pointer arena allocator for ZK proving workloads.
 //!
-//! One mmap region split into per-thread slabs. Allocation = increment a thread-local
-//! pointer; free = no-op. `begin_phase()` resets the arena: each thread's next
-//! allocation starts over at the beginning of its slab, overwriting the previous
-//! phase's data. Allocations that don't fit (too large, or beyond max threads) fall
-//! back to the system allocator.
+//! # Two-allocator model
+//!
+//! `ZkAllocator` is a façade over two allocators selected per call:
+//!
+//! - **Arena**: one `mmap` region split into per-thread slabs. Allocation
+//!   bumps a thread-local pointer; `dealloc` is a no-op. `begin_phase()`
+//!   resets every slab so the next phase reuses the same physical pages.
+//! - **System**: `std::alloc::System` (glibc on Linux). Used for everything
+//!   the arena shouldn't hold:
+//!   - any allocation when no phase is active;
+//!   - any allocation smaller than [`min_arena_bytes()`] even during a phase
+//!     (size-routing — keeps small library bookkeeping outside the arena);
+//!   - oversize allocations or threads that arrived after slabs were claimed
+//!     ([`overflow_stats()`] reports these);
+//!   - regrowth via `realloc` of a pointer that was already in System
+//!     (sticky-System routing — System allocations don't migrate to arena
+//!     on growth, even if the new size exceeds the size-routing threshold).
+//!
+//! # Phase scoping contract
+//!
+//! `begin_phase()` activates the arena and resets every slab. `end_phase()`
+//! deactivates the arena. Allocations made during phase N must not be held
+//! past `begin_phase()` of phase N+1: that call recycles the slab, and the
+//! next allocation at the same offset will silently overwrite the retained
+//! bytes.
+//!
+//! Practical rules:
+//!
+//! 1. Drop or `clone()` arena-allocated values before the phase ends.
+//! 2. Use [`PhaseGuard`] / [`phase`] to ensure `end_phase` runs even on
+//!    panic — without it, an unwinding phase leaves the arena active and
+//!    subsequent "post-phase" allocations land in arena territory.
+//! 3. Keep long-lived state (thread pools, channels, registries, caches)
+//!    constructed *outside* any active phase so it lives in System.
+//!
+//! # Realloc migration: prevented
+//!
+//! `realloc` checks whether the input pointer lies in the arena region.
+//! If it does, growth goes through the normal arena path (subject to
+//! size-routing). If it does not, growth stays in System via
+//! `System::realloc` — preventing the failure mode where a System-backed
+//! `Vec` silently migrates into the arena on `push`.
+//!
+//! # Configuration
+//!
+//! - `ZK_ALLOC_SLAB_GB` — per-thread slab size in GiB (default `8`).
+//! - `ZK_ALLOC_MIN_BYTES` — size-routing threshold in bytes (default `4096`).
+//!   Set to `0` to send every active-phase allocation to the arena.
+//! - `ZK_ALLOC_POISON_RESET` — diagnostic; set to `1` to `MADV_DONTNEED`
+//!   the previous phase's pages on reset (catches stale-pointer reads as
+//!   zero pages instead of last-phase data).
+//!
+//! # Example
 //!
 //! ```ignore
+//! use zk_alloc::ZkAllocator;
+//!
+//! #[global_allocator]
+//! static ALLOC: ZkAllocator = ZkAllocator;
+//!
 //! loop {
-//!     begin_phase();               // arena ON; slabs reset lazily
-//!     let res = heavy_work();      // fast bump increments
-//!     end_phase();                 // arena OFF; new allocations go to System
-//!     let copy = res.clone();      // detach from arena before next phase resets it
+//!     let proof = zk_alloc::phase(|| heavy_work()); // arena on inside
+//!     let output = proof.clone();                   // detach into System
+//!     submit(output);
 //! }
 //! ```
 
@@ -22,12 +74,16 @@ use std::sync::Once;
 
 mod syscall;
 
-const SLAB_SIZE: usize = 8 << 30; // 8GB
+const DEFAULT_SLAB_GB: usize = 8;
 const SLACK: usize = 4;
 
 #[derive(Debug)]
 pub struct ZkAllocator;
 
+/// Per-thread slab size in bytes. Set once during `ensure_region()` from the
+/// `ZK_ALLOC_SLAB_GB` environment variable (default: 8).
+static SLAB_SIZE: AtomicUsize = AtomicUsize::new(0);
+
 /// Incremented by `begin_phase()`. Every thread caches the last value it saw in
 /// `ARENA_GEN`; when they differ, the thread resets its allocation cursor to the start
 /// of its slab on the next allocation. This is how a single store on the main thread
@@ -59,6 +115,24 @@ static MAX_THREADS: AtomicUsize = AtomicUsize::new(0);
 static OVERFLOW_COUNT: AtomicUsize = AtomicUsize::new(0);
 static OVERFLOW_BYTES: AtomicUsize = AtomicUsize::new(0);
 
+/// Diagnostic mode: when true, begin_phase forcibly drops the previous phase's
+/// pages via MADV_DONTNEED so any stale arena pointer reads zero instead of
+/// last-phase data. Set via ZK_ALLOC_POISON_RESET=1 env var.
+static POISON_RESET: AtomicBool = AtomicBool::new(false);
+
+/// Allocations smaller than this go to System even during active phases.
+/// Routes registry / hashmap / injector-block-sized allocations away from
+/// the arena, so library state that outlives a phase doesn't land in
+/// recycled memory.
+///
+/// Defaults to 4096 (one page) — covers the known phase-crossing patterns:
+/// crossbeam_deque::Injector blocks (~1.5 KB), tracing-subscriber Registry
+/// slot data (sub-KB), hashbrown HashMap entries (sub-KB), rayon-core job
+/// stack frames (sub-KB). Set ZK_ALLOC_MIN_BYTES=0 to disable, or override
+/// to a different threshold.
+const DEFAULT_MIN_ARENA_BYTES: usize = 4096;
+static MIN_ARENA_BYTES: AtomicUsize = AtomicUsize::new(DEFAULT_MIN_ARENA_BYTES);
+
 thread_local! {
     /// Where this thread's next allocation lands. Advanced past each allocation.
     static ARENA_PTR: Cell<usize> = const { Cell::new(0) };
@@ -74,11 +148,27 @@ thread_local! {
 
 fn ensure_region() -> usize {
     REGION_INIT.call_once(|| {
+        let slab_gb = std::env::var("ZK_ALLOC_SLAB_GB")
+            .ok()
+            .and_then(|s| s.parse::<usize>().ok())
+            .unwrap_or(DEFAULT_SLAB_GB);
+        let slab_size = slab_gb << 30;
+        SLAB_SIZE.store(slab_size, Ordering::Release);
+
+        if std::env::var("ZK_ALLOC_POISON_RESET").as_deref() == Ok("1") {
+            POISON_RESET.store(true, Ordering::Release);
+        }
+        if let Ok(s) = std::env::var("ZK_ALLOC_MIN_BYTES") {
+            if let Ok(n) = s.parse::<usize>() {
+                MIN_ARENA_BYTES.store(n, Ordering::Release);
+            }
+        }
+
         let cpus = std::thread::available_parallelism()
             .map(|n| n.get())
             .unwrap_or(8);
         let max_threads = cpus + SLACK;
-        let region_size = SLAB_SIZE * max_threads;
+        let region_size = slab_size * max_threads;
 
         // SAFETY: mmap_anonymous returns a page-aligned pointer or null.
         // MAP_NORESERVE means no physical memory is committed until pages are touched.
@@ -96,7 +186,27 @@ fn ensure_region() -> usize {
 
 /// Activates the arena and resets every thread's slab. All allocations until the next
 /// `end_phase()` go to the arena; the previous phase's data is overwritten in place.
+///
+/// ## Retention is unsafe
+///
+/// Allocations made during phase N that are still held when phase N+1 begins
+/// are silently overwritten by phase N+1's first allocations at the same slab
+/// offset. Any of the following held across `begin_phase()` will be corrupted:
+///
+/// - `Vec<T>` with capacity ≥ [`min_arena_bytes()`] (`push` triggers `realloc`
+///   that copies from now-recycled source memory).
+/// - `Arc<T>` / `Rc<T>` with payload ≥ [`min_arena_bytes()`] (refcount fields
+///   become arbitrary bytes — silent leak or use-after-free).
+/// - `HashMap`, `BTreeMap`, etc. with bucket allocation ≥ [`min_arena_bytes()`]
+///   (lookup may infinite-loop on corrupted ctrl bytes).
+/// - `Box<dyn Trait>` with backing data ≥ [`min_arena_bytes()`] (vtable
+///   dispatch survives but field reads return filler bytes).
+///
+/// To preserve data across phases, `clone()` it into a System-backed copy
+/// (e.g., wrap in `Box::leak(Box::new(...))` while ARENA_ACTIVE is false,
+/// or copy into a `Vec` allocated outside any phase).
 pub fn begin_phase() {
+    ensure_region();
     GENERATION.fetch_add(1, Ordering::Release);
     ARENA_ACTIVE.store(true, Ordering::Release);
 }
@@ -127,6 +237,53 @@ fn flush_rayon() {
     }
 }
 
+/// RAII guard for an arena phase. Calls `begin_phase()` on construction and
+/// `end_phase()` on drop — including during panic unwinding. Use this in
+/// place of paired `begin_phase()`/`end_phase()` calls when the phase body
+/// can panic, to avoid leaving the arena active across the unwind.
+///
+/// ```ignore
+/// loop {
+///     let _guard = zk_alloc::PhaseGuard::new();
+///     heavy_work_that_might_panic();
+///     // _guard drops here on normal return AND on unwind
+/// }
+/// ```
+pub struct PhaseGuard {
+    _private: (),
+}
+
+impl PhaseGuard {
+    /// Begins a phase. The phase ends when the returned guard is dropped.
+    pub fn new() -> Self {
+        begin_phase();
+        Self { _private: () }
+    }
+}
+
+impl Default for PhaseGuard {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl Drop for PhaseGuard {
+    fn drop(&mut self) {
+        end_phase();
+    }
+}
+
+/// Runs `f` inside a phase. Equivalent to constructing a `PhaseGuard`,
+/// running `f`, and dropping the guard. Panics in `f` propagate, but the
+/// phase is guaranteed to end before unwinding leaves this function.
+pub fn phase<F, R>(f: F) -> R
+where
+    F: FnOnce() -> R,
+{
+    let _guard = PhaseGuard::new();
+    f()
+}
+
 /// Returns (overflow_count, overflow_bytes) — allocations that fell through to System
 /// because they exceeded the slab or arrived after all slabs were claimed.
 pub fn overflow_stats() -> (usize, usize) {
@@ -141,6 +298,17 @@ pub fn reset_overflow_stats() {
     OVERFLOW_BYTES.store(0, Ordering::Relaxed);
 }
 
+/// Returns the per-thread slab size in bytes. Zero before the first `begin_phase()`.
+pub fn slab_size() -> usize {
+    SLAB_SIZE.load(Ordering::Relaxed)
+}
+
+/// Returns the minimum allocation size routed through the arena. Allocations
+/// smaller than this go to System even during active phases.
+pub fn min_arena_bytes() -> usize {
+    MIN_ARENA_BYTES.load(Ordering::Relaxed)
+}
+
 #[cold]
 #[inline(never)]
 unsafe fn arena_alloc_cold(size: usize, align: usize) -> *mut u8 {
@@ -157,9 +325,25 @@ unsafe fn arena_alloc_cold(size: usize, align: usize) -> *mut u8 {
                     std::alloc::System.alloc(Layout::from_size_align_unchecked(size, align))
                 };
             }
-            base = region + idx * SLAB_SIZE;
+            let slab_size = SLAB_SIZE.load(Ordering::Relaxed);
+            base = region + idx * slab_size;
             ARENA_BASE.set(base);
-            ARENA_END.set(base + SLAB_SIZE);
+            ARENA_END.set(base + slab_size);
+        }
+        // Diagnostic: MADV_DONTNEED on previous phase's used range to force
+        // any stale references to read fresh zero pages instead of the
+        // last-phase data. Behind ZK_ALLOC_POISON_RESET=1 to keep prod fast.
+        if POISON_RESET.load(Ordering::Relaxed) {
+            let prev_ptr = ARENA_PTR.get();
+            if prev_ptr > base {
+                let len = prev_ptr - base;
+                let page_aligned_len = len & !0xFFF;
+                if page_aligned_len > 0 {
+                    unsafe {
+                        syscall::madvise(base as *mut u8, page_aligned_len, syscall::MADV_DONTNEED)
+                    };
+                }
+            }
         }
         ARENA_PTR.set(base);
         ARENA_GEN.set(generation);
@@ -184,6 +368,14 @@ unsafe impl GlobalAlloc for ZkAllocator {
     #[inline(always)]
     unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
         if ARENA_ACTIVE.load(Ordering::Relaxed) {
+            // Small allocs bypass arena: registry slots / HashMap entries /
+            // injector-block-sized allocations from rayon/tracing libraries
+            // commonly outlive a phase. Routing them to System keeps them
+            // safe across begin_phase()/end_phase() boundaries.
+            let min_bytes = MIN_ARENA_BYTES.load(Ordering::Relaxed);
+            if min_bytes != 0 && layout.size() < min_bytes {
+                return unsafe { std::alloc::System.alloc(layout) };
+            }
             let generation = GENERATION.load(Ordering::Relaxed);
             if ARENA_GEN.get() == generation {
                 let ptr = ARENA_PTR.get();
@@ -215,6 +407,18 @@ unsafe impl GlobalAlloc for ZkAllocator {
         if new_size <= layout.size() {
             return ptr;
         }
+        // Sticky-System routing: if the original allocation came from System
+        // (small, or pre-phase, or routed by size-routing), keep the grown
+        // allocation in System too. Without this, a Vec allocated outside
+        // a phase that grows inside one would silently migrate into the
+        // arena and become subject to phase recycling.
+        let addr = ptr as usize;
+        let base = REGION_BASE.load(Ordering::Relaxed);
+        let region_size = REGION_SIZE.load(Ordering::Relaxed);
+        let in_arena = base != 0 && addr >= base && addr < base + region_size;
+        if !in_arena {
+            return unsafe { std::alloc::System.realloc(ptr, layout, new_size) };
+        }
         let new_layout = unsafe { Layout::from_size_align_unchecked(new_size, layout.align()) };
         let new_ptr = unsafe { self.alloc(new_layout) };
         if !new_ptr.is_null() {
diff --git a/src/syscall.rs b/src/syscall.rs
index f676b2a..251e11d 100644
--- a/src/syscall.rs
+++ b/src/syscall.rs
@@ -16,6 +16,7 @@ mod imp {
     const MAP_NORESERVE: usize = 0x4000;
 
     pub const MADV_NOHUGEPAGE: usize = 15;
+    pub const MADV_DONTNEED: usize = 4;
 
     #[inline]
     unsafe fn syscall6(
@@ -97,6 +98,7 @@ mod imp {
     use std::ptr;
 
     pub const MADV_NOHUGEPAGE: usize = 15;
+    pub const MADV_DONTNEED: usize = 4;
 
     #[inline]
     pub unsafe fn mmap_anonymous(size: usize) -> *mut u8 {
@@ -119,4 +121,4 @@ mod imp {
     }
 }
 
-pub use imp::{madvise, mmap_anonymous, MADV_NOHUGEPAGE};
+pub use imp::{madvise, mmap_anonymous, MADV_DONTNEED, MADV_NOHUGEPAGE};
diff --git a/tests/test_concurrent_phase.rs b/tests/test_concurrent_phase.rs
new file mode 100644
index 0000000..622558c
--- /dev/null
+++ b/tests/test_concurrent_phase.rs
@@ -0,0 +1,179 @@
+//! Scenario 6: concurrent begin_phase / end_phase across threads.
+//!
+//! GENERATION and ARENA_ACTIVE are global atomics. begin_phase() from any
+//! thread bumps GENERATION, which forces every other thread's next allocation
+//! through the cold path (ARENA_GEN mismatch → reset ARENA_PTR to slab base).
+//! That silently invalidates arena data those threads still hold.
+//!
+//! Race patterns observable:
+//!   (a) T2.begin_phase() while T1 holds an arena Vec → T1's next alloc lands
+//!       on top of T1's existing Vec (per-thread slab, but offset-0 conflict).
+//!   (b) T1.begin_phase() racing T2.end_phase() → ARENA_ACTIVE final state
+//!       depends on store ordering; allocations between can route either way.
+//!
+//! These are public-API hazards: the docs imply single-threaded lifecycle.
+//! Tests document the failure modes so a future PhaseGuard / scoped API can
+//! address them.
+
+use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
+use std::sync::{Arc, Barrier};
+use std::thread;
+
+#[global_allocator]
+static A: zk_alloc::ZkAllocator = zk_alloc::ZkAllocator;
+
+#[test]
+fn cross_thread_begin_phase_invalidates_data() {
+    let _: u64 = (0..1024_u64).map(|i| i * 2).sum();
+
+    let barrier = Arc::new(Barrier::new(2));
+    let aliased = Arc::new(AtomicBool::new(false));
+    let bug = Arc::new(AtomicBool::new(false));
+
+    zk_alloc::begin_phase();
+
+    let bar1 = Arc::clone(&barrier);
+    let aliased1 = Arc::clone(&aliased);
+    let bug1 = Arc::clone(&bug);
+    let t1 = thread::spawn(move || {
+        let v: Vec<u8> = vec![0xAA; 8192];
+        let v_ptr = v.as_ptr() as usize;
+        bar1.wait(); // [1] v allocated, wait for T2
+        bar1.wait(); // [2] T2 has called begin_phase; resume
+
+        // The cross-thread begin_phase bumped GENERATION. T1's ARENA_GEN is
+        // now stale → cold path on next alloc resets ARENA_PTR to T1's slab
+        // base. On Linux this lands w on top of v; macOS aarch64 places v
+        // and w in different ranges (T1's first alloc may go to System),
+        // so the overlap doesn't happen — the bug is real but unobservable
+        // from this test on that platform.
+        let w: Vec<u8> = vec![0xBB; 8192];
+        let w_ptr = w.as_ptr() as usize;
+        let v_corrupted = v.iter().any(|&b| b != 0xAA);
+        eprintln!("t1: v=0x{v_ptr:x} w=0x{w_ptr:x} v_corrupt={v_corrupted}");
+        if v_ptr == w_ptr {
+            aliased1.store(true, Ordering::Relaxed);
+        }
+        if v_corrupted {
+            bug1.store(true, Ordering::Relaxed);
+        }
+        std::hint::black_box((v, w));
+    });
+
+    let bar2 = barrier;
+    let t2 = thread::spawn(move || {
+        bar2.wait(); // [1] T1 has v
+        zk_alloc::begin_phase(); // bumps GENERATION globally
+        bar2.wait(); // [2] release T1
+    });
+
+    t1.join().unwrap();
+    t2.join().unwrap();
+
+    zk_alloc::end_phase();
+
+    let saw_aliasing = aliased.load(Ordering::Relaxed);
+    let saw_corruption = bug.load(Ordering::Relaxed);
+
+    if saw_aliasing {
+        // Linux: cold-path slab reset re-bumps to slab base, w aliases v,
+        // and the writes to w corrupt v's bytes.
+        assert!(
+            saw_corruption,
+            "v and w aliased but v's bytes are pristine — \
+             cross-thread invalidation got fixed or layout assumption changed"
+        );
+    } else {
+        // macOS aarch64 (and any platform where T1's two allocations land
+        // at different addresses) — corruption can't be observed via this
+        // exact pattern, but the underlying hazard remains. Pass without
+        // asserting; document.
+        eprintln!(
+            "test inconclusive on this platform: v and w didn't alias, \
+             so cross-thread invalidation isn't observable here"
+        );
+    }
+}
+
+/// Two threads each running their own begin_phase/work/end_phase loop —
+/// expecting each iteration to be self-contained. Because GENERATION is
+/// global, A's begin_phase mid-iteration corrupts B's in-flight data when
+/// B allocates a second time after the cross-thread reset.
+#[test]
+fn two_threads_running_lifecycle_concurrently_corrupt_each_other() {
+    let _: u64 = (0..1024_u64).map(|i| i * 2).sum();
+
+    const ITERS: usize = 200;
+    let bug = Arc::new(AtomicUsize::new(0));
+
+    thread::scope(|s| {
+        for tid in 0u8..2 {
+            let bug = Arc::clone(&bug);
+            s.spawn(move || {
+                for _ in 0..ITERS {
+                    zk_alloc::begin_phase();
+                    let pattern = if tid == 0 { 0xA1 } else { 0xB2 };
+                    // Two allocations per iteration; the second triggers a
+                    // cold-path slab reset if the other thread's begin_phase
+                    // bumped GENERATION between them.
+                    let v: Vec<u8> = vec![pattern; 8192];
+                    let _filler: Vec<u8> = vec![0; 8192];
+                    if v.iter().any(|&b| b != pattern) {
+                        bug.fetch_add(1, Ordering::Relaxed);
+                    }
+                    std::hint::black_box((v, _filler));
+                    zk_alloc::end_phase();
+                }
+            });
+        }
+    });
+
+    let n = bug.load(Ordering::Relaxed);
+    eprintln!(
+        "two_threads_running_lifecycle_concurrently: {n} cross-thread corruptions over {} iters",
+        2 * ITERS
+    );
+
+    // Race window is narrow (single-µs alloc-to-alloc gap); count is
+    // observational, not asserted. The deterministic version is in
+    // cross_thread_begin_phase_invalidates_data above.
+    eprintln!("(stress observation: race window too tight to be a reliable assertion)");
+}
+
+/// Sanity: concurrent begin/end stress doesn't crash the allocator's atomics
+/// even if it corrupts user data. Verifies invariants like REGION_BASE are
+/// stable.
+#[test]
+fn concurrent_phase_stress_no_crash() {
+    let _: u64 = (0..1024_u64).map(|i| i * 2).sum();
+
+    const ITERS: usize = 5000;
+    let stop = Arc::new(AtomicBool::new(false));
+
+    let mut threads = vec![];
+    for _ in 0..4 {
+        let stop = Arc::clone(&stop);
+        threads.push(thread::spawn(move || {
+            while !stop.load(Ordering::Relaxed) {
+                zk_alloc::begin_phase();
+                let _v = vec![0u8; 16384];
+                zk_alloc::end_phase();
+            }
+        }));
+    }
+
+    thread::sleep(std::time::Duration::from_millis(50));
+    for _ in 0..ITERS {
+        zk_alloc::begin_phase();
+        zk_alloc::end_phase();
+    }
+    stop.store(true, Ordering::Relaxed);
+    for t in threads {
+        t.join().unwrap();
+    }
+
+    zk_alloc::end_phase();
+    eprintln!(
+        "concurrent_phase_stress_no_crash: completed {ITERS} main-thread cycles + worker churn"
+    );
+}
diff --git a/tests/test_crossbeam_epoch.rs b/tests/test_crossbeam_epoch.rs
new file mode 100644
index 0000000..eb3f450
--- /dev/null
+++ b/tests/test_crossbeam_epoch.rs
@@ -0,0 +1,78 @@
+//! Scenario 1: empirical test for crossbeam-epoch deferred garbage.
+//!
+//! crossbeam-deque uses crossbeam-epoch to defer-deallocate retired Buffers.
+//! Each thread keeps a Local with a list of Bag<Garbage> nodes (~1.5 KB
+//! each). Bag nodes themselves are heap-allocated; if allocated during a
+//! phase, they live in the arena slab. If the slab is recycled before
+//! crossbeam-epoch processes the bag, walking the garbage list reads
+//! recycled bytes → silent corruption or crash inside crossbeam.
+//!
+//! F6 (source audit) hypothesized this is covered by size-routing (Bags <
+//! 4 KB go to System). Empirical test: drive many Buffer resizes during a
+//! phase to retire many objects to crossbeam-epoch, cross a phase boundary,
+//! drive more retires, and assert program integrity over many cycles.
+
+use rayon::prelude::*;
+
+#[global_allocator]
+static A: zk_alloc::ZkAllocator = zk_alloc::ZkAllocator;
+
+/// Force per-worker crossbeam-deque buffer growth via deep recursion. Each
+/// growth retires the prior buffer to crossbeam-epoch.
+fn nested_join(depth: usize) {
+    if depth == 0 {
+        return;
+    }
+    rayon::join(|| nested_join(depth - 1), || {});
+}
+
+#[test]
+fn crossbeam_epoch_garbage_survives_phase_cycles() {
+    let _: u64 = (0..1_000_000_u64).into_par_iter().sum();
+
+    const CYCLES: usize = 50;
+    for _ in 0..CYCLES {
+        // Phase 1: drive buffer growth → retire old buffers to epoch garbage.
+        // Depth 1024 → buffer grows 32 → 64 → 128 → 256 → 512 → 1024 → 2048
+        // (six resizes per worker that participates).
+        zk_alloc::begin_phase();
+        rayon::join(|| nested_join(1024), || {});
+        zk_alloc::end_phase();
+
+        // Phase 2: drive more growth + epoch participation. If a Bag from
+        // phase 1 was allocated in arena and its slab was recycled, this
+        // would crash inside crossbeam-epoch's collect().
+        zk_alloc::begin_phase();
+        rayon::join(|| nested_join(1024), || {});
+        zk_alloc::end_phase();
+    }
+
+    eprintln!(
+        "crossbeam_epoch_garbage_survives_phase_cycles: {CYCLES} cycles OK (MIN_ARENA_BYTES={})",
+        zk_alloc::min_arena_bytes()
+    );
+}
+
+/// par_iter with collect — drives crossbeam-channel + crossbeam-deque
+/// allocations through normal rayon usage. Used to confirm typical
+/// rayon-heavy workloads survive 100 cycles.
+#[test]
+fn crossbeam_in_par_iter_collect_survives_cycles() {
+    let _: u64 = (0..1_000_000_u64).into_par_iter().sum();
+
+    for _ in 0..100 {
+        zk_alloc::begin_phase();
+        let v: Vec<u64> = (0..4096_u64)
+            .into_par_iter()
+            .map(|i| {
+                let mut acc = 0u64;
+                for j in 0..32 {
+                    acc = acc.wrapping_add((i * j) ^ 0xDEADBEEF);
+                }
+                acc
+            })
+            .collect();
+        std::hint::black_box(v);
+        zk_alloc::end_phase();
+    }
+}
diff --git a/tests/test_many_spans_stress.rs b/tests/test_many_spans_stress.rs
new file mode 100644
index 0000000..c8b78f3
--- /dev/null
+++ b/tests/test_many_spans_stress.rs
@@ -0,0 +1,66 @@
+//! Stresses tracing-subscriber's sharded-slab into allocating page 1 (~6KB,
+//! 64 slots) inside an arena phase. The first page (~3.2KB) is covered by
+//! the 4096-byte size-routing threshold, but page 1 exceeds it — so this
+//! test verifies whether the fix holds under heavy span concurrency.
+//!
+//! Run with `cargo test --release --test test_many_spans_stress`.
+
+use rayon::prelude::*;
+use tracing::info_span;
+use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt, Registry};
+
+#[global_allocator]
+static A: zk_alloc::ZkAllocator = zk_alloc::ZkAllocator;
+
+#[test]
+fn many_concurrent_spans_across_phases() {
+    inner(64);
+}
+
+/// Documents the upper limit of the size-routing fix. With default
+/// MIN_BYTES=4096, 512 concurrent spans triggers a sharded-slab page
+/// allocation that exceeds the threshold, lands in arena, and corrupts
+/// across phase boundaries. To pass, set ZK_ALLOC_MIN_BYTES=6144 or higher.
+/// Run manually: `cargo test --release --test test_many_spans_stress -- --ignored`.
+#[test]
+#[ignore]
+fn extreme_concurrent_spans_across_phases() {
+    inner(512);
+}
+
+fn inner(n: u64) {
+    let _ = Registry::default()
+        .with(tracing_subscriber::EnvFilter::new("info"))
+        .try_init();
+
+    // Warm up rayon outside arena.
+    let _: u64 = (0..1_000_000_u64).into_par_iter().sum();
+
+    for cycle in 0..5 {
+        zk_alloc::begin_phase();
+
+        // Create 64 concurrent live spans -- forces sharded-slab to grow
+        // past page 0 (~32 slots) into page 1 (~6KB allocation).
+        let spans: Vec<_> = (0..n).map(|i| info_span!("concurrent", cycle, i)).collect();
+        {
+            let _entered: Vec<_> = spans.iter().map(|s| s.enter()).collect();
+            // guards drop here, before spans
+        }
+        drop(spans);
+
+        zk_alloc::end_phase();
+    }
+
+    // After 5 phase cycles, create one more span and observe whether its
+    // backing data is corrupted. With size-routing fix off, the pooled
+    // page-1 slot data has been overwritten across phases; with the fix
+    // on at >= 4096, page 1 might still go to arena (> 4KB), so this
+    // probes the limit of the fix.
+    zk_alloc::begin_phase();
+    let spans2: Vec<_> = (0..n).map(|i| info_span!("post_cycle", i)).collect();
+    {
+        let _entered: Vec<_> = spans2.iter().map(|s| s.enter()).collect();
+    }
+    drop(spans2);
+    zk_alloc::end_phase();
+}
diff --git a/tests/test_panic_phase.rs b/tests/test_panic_phase.rs
new file mode 100644
index 0000000..c03c1b1
--- /dev/null
+++ b/tests/test_panic_phase.rs
@@ -0,0 +1,64 @@
+//! Scenario 3: panic unwinding through a phase boundary.
+//!
+//! There is no RAII guard around begin_phase()/end_phase(). If a panic
+//! propagates out of phase code without reaching end_phase(), ARENA_ACTIVE
+//! stays true. Subsequent "post-phase" allocations land in arena and get
+//! silently recycled on the next begin_phase().
+//!
+//! This is a plain API hazard: the recovery path of any prove_with_panic
+//! pattern is unsafe.
+
+#[global_allocator]
+static A: zk_alloc::ZkAllocator = zk_alloc::ZkAllocator;
+
+#[test]
+fn panic_inside_phase_leaves_arena_active() {
+    use std::panic;
+
+    // Suppress default panic print to minimize incidental allocations between
+    // the panic and our observation point.
+    panic::set_hook(Box::new(|_| {}));
+    let _ = vec![0u8; 1024]; // warm up
+
+    zk_alloc::begin_phase();
+    let r = panic::catch_unwind(panic::AssertUnwindSafe(|| panic!("simulated")));
+    assert!(r.is_err());
+    // No end_phase reached. ARENA_ACTIVE is still true.
+
+    // This Vec lands in arena (since arena is still active and 8192 >=
+    // MIN_ARENA_BYTES default 4096).
+    let post_panic: Vec<u8> = vec![0xCC; 8192];
+    let post_panic_ptr = post_panic.as_ptr() as usize;
+
+    // Begin the next phase (e.g., next iteration of a prove loop). Arena
+    // resets — anything allocated during the "ghost" phase between panic
+    // and now gets recycled.
+    zk_alloc::begin_phase();
+    // Span enough of the slab to cover post_panic's offset, regardless of
+    // how many small bumps the panic introduced.
+    let big: Vec<u8> = vec![0x33; 1 << 20];
+    let big_ptr = big.as_ptr() as usize;
+    let big_end = big_ptr + big.len();
+    zk_alloc::end_phase();
+
+    let _ = panic::take_hook();
+
+    let in_big_range = post_panic_ptr >= big_ptr && post_panic_ptr < big_end;
+    let observed = post_panic[0];
+
+    eprintln!(
+        "post_panic_ptr=0x{post_panic_ptr:x} big=[0x{big_ptr:x}, 0x{big_end:x}); \
+         in_range={in_big_range} observed=0x{observed:02x}"
+    );
+
+    assert!(
+        in_big_range,
+        "post-panic Vec didn't land in arena's slab — test layout assumption broken"
+    );
+    assert_eq!(
+        observed, 0x33,
+        "expected post-panic Vec contents to be recycled by next begin_phase \
+         (arena was still active after the panic) — got 0x{observed:02x}"
+    );
+    eprintln!("BUG REPRODUCED: panic without end_phase leaves arena active; post-panic allocations recycled silently.");
+}
diff --git a/tests/test_phase_guard.rs b/tests/test_phase_guard.rs
new file mode 100644
index 0000000..a914815
--- /dev/null
+++ b/tests/test_phase_guard.rs
@@ -0,0 +1,96 @@
+//! Verify that PhaseGuard / phase() makes F17 (panic leaves arena active)
+//! impossible by construction. Drop runs during unwind, calling end_phase.
+//!
+//! Mirrors test_panic_phase but uses the RAII API. Asserts NO corruption.
+//!
+//! All three tests in this binary touch the global ARENA_ACTIVE / bump
+//! pointer state, so they must not run concurrently — the panic-handler
+//! hook is also process-global. Serialize via a file-local mutex.
+
+static PHASE_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
+
+#[global_allocator]
+static A: zk_alloc::ZkAllocator = zk_alloc::ZkAllocator;
+
+#[test]
+fn phase_guard_runs_end_phase_on_panic() {
+    let _lock = PHASE_LOCK.lock().unwrap();
+    use std::panic;
+
+    panic::set_hook(Box::new(|_| {}));
+    let _ = vec![0u8; 1024];
+
+    // Mirror of test_panic_phase::panic_inside_phase_leaves_arena_active.
+    // Use phase() / PhaseGuard around the panic — the guard's Drop ends
+    // the phase during unwind.
+    let r = panic::catch_unwind(panic::AssertUnwindSafe(|| {
+        zk_alloc::phase(|| panic!("simulated"))
+    }));
+    assert!(r.is_err());
+
+    // Arena should now be inactive — this large allocation should land in
+    // System, not arena.
+    let post_panic: Vec<u8> = vec![0xCC; 8192];
+    let post_panic_ptr = post_panic.as_ptr() as usize;
+
+    // Begin a new phase + 1 MB filler. If the previous phase was correctly
+    // ended, post_panic is in System and won't be recycled. The filler
+    // lands somewhere in arena slab+0 — but post_panic_ptr is NOT in arena.
+    zk_alloc::phase(|| {
+        let big: Vec<u8> = vec![0x33; 1 << 20];
+        let big_ptr = big.as_ptr() as usize;
+        let big_end = big_ptr + big.len();
+        let in_big_range = post_panic_ptr >= big_ptr && post_panic_ptr < big_end;
+        eprintln!(
+            "post_panic_ptr=0x{post_panic_ptr:x} big=[0x{big_ptr:x}, 0x{big_end:x}) \
+             in_range={in_big_range}"
+        );
+        // post_panic should NOT be in arena range (it was allocated when
+        // ARENA_ACTIVE=false because PhaseGuard's Drop ran during the unwind).
+        assert!(
+            !in_big_range,
+            "PhaseGuard didn't run end_phase during unwind — post_panic landed in arena"
+        );
+    });
+
+    let _ = panic::take_hook();
+
+    // Verify post_panic's contents are pristine.
+    assert!(
+        post_panic.iter().all(|&b| b == 0xCC),
+        "post_panic was corrupted; PhaseGuard didn't end the phase on panic"
+    );
+    eprintln!("PhaseGuard fix verified: panic unwound through phase, end_phase ran, post-panic Vec safe in System");
+}
+
+#[test]
+fn phase_guard_runs_end_phase_on_normal_return() {
+    let _lock = PHASE_LOCK.lock().unwrap();
+    let v = zk_alloc::phase(|| vec![0xAB_u8; 8192]);
+    // After phase, arena is inactive. Subsequent allocations go to System.
+    let after: Vec<u8> = vec![0xCD_u8; 8192];
+
+    // Begin another phase + filler. `after` should not be recycled (it's in System).
+    zk_alloc::phase(|| {
+        let _filler: Vec<u8> = vec![0x77_u8; 1 << 20];
+    });
+
+    assert!(
+        after.iter().all(|&b| b == 0xCD),
+        "after-phase Vec was corrupted"
+    );
+    // v is in arena from the first phase; it MAY be corrupted by phase 2.
+    // That's the F16 family — not what this test is about. We don't assert
+    // on v.
+    std::hint::black_box(v);
+}
+
+#[test]
+fn nested_phase_guards_compose() {
+    let _lock = PHASE_LOCK.lock().unwrap();
+    // Outer phase + inner phase. Inner phase end_phases (sets active=false),
+    // then outer phase end_phases again. Sequence: begin, begin, end, end.
+    // Final state: active=false. No panic.
+    let result = zk_alloc::phase(|| zk_alloc::phase(|| 42_u64));
+    assert_eq!(result, 42);
+}
diff --git a/tests/test_rayon.rs b/tests/test_rayon.rs
new file mode 100644
index 0000000..eeefdd8
--- /dev/null
+++ b/tests/test_rayon.rs
@@ -0,0 +1,42 @@
+//! Reproducer for the rayon/zk-alloc interaction bug documented in
+//! leanMultisig commit f5e2299b. Pulls Tom's regression test verbatim and
+//! adds a few stress variants to characterize how reliably the bug fires.
+//!
+//! Mechanism:
+//!   1. rayon::join from a non-worker thread routes through the global
+//!      `crossbeam_deque::Injector`, which is a linked list of fixed-size
+//!      blocks (BLOCK_CAP = 63 slots).
+//!   2. If a fresh injector block is allocated *during* an arena phase,
+//!      the block lives in the arena slab.
+//!   3. The next `begin_phase()` recycles the slab. Rayon still holds a
+//!      pointer to that block; the next push writes a JobRef over whatever
+//!      the application has allocated on top — silent corruption.
+//!
+//! These tests use #[global_allocator] so that rayon's allocations route
+//! through ZkAllocator (otherwise they go to the system allocator and
+//! can't be corrupted).
+
+use rayon::prelude::*;
+
+#[global_allocator]
+static A: zk_alloc::ZkAllocator = zk_alloc::ZkAllocator;
+
+/// Tom's original MRE.
+#[test]
+fn rayon_does_not_corrupt_zkalloc() {
+    let _: u64 = (0..1_000_000_u64).into_par_iter().sum();
+
+    zk_alloc::begin_phase();
+    for _ in 0..200 {
+        rayon::join(|| {}, || {});
+    }
+    zk_alloc::end_phase();
+
+    zk_alloc::begin_phase();
+    let canary = vec![0xAB_u8; 8192];
+    rayon::join(|| {}, || {});
+    zk_alloc::end_phase();
+
+    let pos = canary.iter().position(|&b| b != 0xAB);
+    assert!(pos.is_none(), "canary corrupted at offset {}", pos.unwrap());
+}
diff --git a/tests/test_rayon_audit.rs b/tests/test_rayon_audit.rs
new file mode 100644
index 0000000..0204239
--- /dev/null
+++ b/tests/test_rayon_audit.rs
@@ -0,0 +1,163 @@
+//! Characterizes how reliably the rayon/zk-alloc bug fires under variants
+//! of Tom's MRE: cold rayon pool, repeated cycles, large canaries, sleep
+//! between phases. The goal is to map the "trigger surface" of this bug
+//! class: which allocation patterns survive a phase boundary into the
+//! next phase's recycled slab, and what the typical corruption profile
+//! looks like.
+
+use rayon::prelude::*;
+
+#[global_allocator]
+static A: zk_alloc::ZkAllocator = zk_alloc::ZkAllocator;
+
+fn check_canary(canary: &[u8], expect: u8) -> Option<usize> {
+    canary.iter().position(|&b| b != expect)
+}
+
+/// Cold rayon: no pre-warm. The very first parallel call happens INSIDE an
+/// arena phase. Rayon's thread pool, registry, AND injector blocks all get
+/// allocated in the arena slab — much bigger blast radius than the warm
+/// case. Not only injector blocks: thread stacks, registry state, sleep
+/// pools.
+#[test]
+#[ignore] // Run manually: cargo test --release --test test_rayon_audit -- --ignored --test-threads=1
+fn cold_rayon_inside_arena() {
+    zk_alloc::begin_phase();
+    // First parallel call ever — allocates the entire rayon pool in arena.
+    let _: u64 = (0..1_000_000_u64).into_par_iter().sum();
+    zk_alloc::end_phase();
+
+    zk_alloc::begin_phase();
+    let canary = vec![0xCD_u8; 64 * 1024];
+    rayon::join(|| {}, || {});
+    zk_alloc::end_phase();
+
+    let pos = check_canary(&canary, 0xCD);
+    assert!(
+        pos.is_none(),
+        "cold-rayon canary corrupted at offset {}",
+        pos.unwrap()
+    );
+}
+
+/// Repeats Tom's MRE 10 times. If the bug is rare/non-deterministic the
+/// average failure offset and frequency tell us how many slots an Injector
+/// block has when the slab is recycled.
+#[test]
+fn repeated_phase_cycles() {
+    let _: u64 = (0..1_000_000_u64).into_par_iter().sum();
+
+    let mut failures = 0;
+    for cycle in 0..10 {
+        zk_alloc::begin_phase();
+        for _ in 0..200 {
+            rayon::join(|| {}, || {});
+        }
+        zk_alloc::end_phase();
+
+        zk_alloc::begin_phase();
+        let canary = vec![0xAB_u8; 8192];
+        rayon::join(|| {}, || {});
+        zk_alloc::end_phase();
+
+        if let Some(pos) = check_canary(&canary, 0xAB) {
+            eprintln!("cycle {cycle}: canary corrupted at offset {pos}");
+            failures += 1;
+        }
+    }
+    eprintln!("repeated_phase_cycles: {failures}/10 cycles corrupted");
+    let fix_active = zk_alloc::min_arena_bytes() >= 256;
+    if fix_active {
+        assert_eq!(failures, 0, "fix should prevent corruption in all cycles");
+    } else {
+        assert!(
+            failures > 0,
+            "expected at least one cycle to corrupt — bug should be reproducible"
+        );
+    }
+}
+
+/// Canary larger than a typical injector block — does the corruption have
+/// a bounded blast radius (one block-sized region) or does it cascade?
+#[test]
+fn large_canary_blast_radius() {
+    let _: u64 = (0..1_000_000_u64).into_par_iter().sum();
+
+    zk_alloc::begin_phase();
+    for _ in 0..200 {
+        rayon::join(|| {}, || {});
+    }
+    zk_alloc::end_phase();
+
+    zk_alloc::begin_phase();
+    let canary = vec![0x55_u8; 1 << 20]; // 1 MB
+    rayon::join(|| {}, || {});
+    zk_alloc::end_phase();
+
+    let mut corruption_runs = Vec::new();
+    let mut i = 0;
+    while i < canary.len() {
+        if canary[i] != 0x55 {
+            let start = i;
+            while i < canary.len() && canary[i] != 0x55 {
+                i += 1;
+            }
+            corruption_runs.push((start, i - start));
+        }
+        i += 1;
+    }
+    if !corruption_runs.is_empty() {
+        eprintln!("large_canary corruption runs: {:?}", corruption_runs);
+    }
+    let fix_active = zk_alloc::min_arena_bytes() >= 256;
+    if fix_active {
+        assert!(
+            corruption_runs.is_empty(),
+            "{} corruption runs in 1MB canary (fix should prevent)",
+            corruption_runs.len()
+        );
+    } else {
+        assert_eq!(
+            corruption_runs.len(),
+            1,
+            "without fix, expected exactly one block-sized corruption run, got {}",
+            corruption_runs.len()
+        );
+        let (_start, len) = corruption_runs[0];
+        assert!(
+            len <= 32,
+            "expected single JobRef-sized run (<=32B), got {}B",
+            len
+        );
+    }
+}
+
+/// Drives rayon::join from a SPAWNED thread, not the main thread. Both go
+/// through the injector (only rayon worker threads bypass it via per-worker
+/// deque). Confirms the bug is about non-worker callers, not specifically
+/// the main thread.
+#[test]
+fn injector_bug_from_spawned_thread() {
+    let _: u64 = (0..1_000_000_u64).into_par_iter().sum();
+
+    zk_alloc::begin_phase();
+    let h = std::thread::spawn(|| {
+        for _ in 0..200 {
+            rayon::join(|| {}, || {});
+        }
+    });
+    h.join().unwrap();
+    zk_alloc::end_phase();
+
+    zk_alloc::begin_phase();
+    let canary = vec![0xEE_u8; 8192];
+    rayon::join(|| {}, || {});
+    zk_alloc::end_phase();
+
+    let pos = check_canary(&canary, 0xEE);
+    assert!(
+        pos.is_none(),
+        "spawned-thread canary corrupted at offset {}",
+        pos.unwrap()
+    );
+}
diff --git a/tests/test_scope_nesting.rs b/tests/test_scope_nesting.rs
new file mode 100644
index 0000000..b797fbe
--- /dev/null
+++ b/tests/test_scope_nesting.rs
@@ -0,0 +1,70 @@
+//! Tests phase boundaries that interact with rayon::scope. Workers spawned
+//! inside a scope hold references to arena allocations; if begin_phase runs
+//! while those workers still have pending tasks, the workers' captured data
+//! could land in recycled memory.
+
+use rayon::prelude::*;
+
+#[global_allocator]
+static A: zk_alloc::ZkAllocator = zk_alloc::ZkAllocator;
+
+/// Phase boundary inside scope: while workers are still running, begin a
+/// new phase. The worker's stack-frame data is on the worker thread's stack
+/// (not arena), but any heap allocations they performed during the phase
+/// could be in arena.
+#[test]
+fn phase_boundary_during_par_iter() {
+    let _: u64 = (0..1_000_000_u64).into_par_iter().sum();
+
+    zk_alloc::begin_phase();
+
+    // Workers each allocate a vec, sum it. Force them to allocate in arena.
+    let result: u64 = (0..16_u64)
+        .into_par_iter()
+        .map(|i| {
+            let v: Vec<u64> = (0..(1 << 14)).map(|j| j ^ i).collect();
+            v.iter().sum::<u64>()
+        })
+        .sum();
+    std::hint::black_box(result);
+
+    zk_alloc::end_phase();
+
+    zk_alloc::begin_phase();
+    let canary = vec![0xC9_u8; 8 << 20];
+    let _: u64 = (0..16_u64)
+        .into_par_iter()
+        .map(|i| {
+            let v: Vec<u64> = (0..(1 << 14)).map(|j| j ^ i).collect();
+            v.iter().sum::<u64>()
+        })
+        .sum();
+    zk_alloc::end_phase();
+
+    let pos = canary.iter().position(|&b| b != 0xC9);
+    assert!(
+        pos.is_none(),
+        "8MB canary corrupted at offset {}",
+        pos.unwrap()
+    );
+}
+
+/// Repeated par_iter without any explicit canary, just check program
+/// integrity over 100 iterations.
+#[test]
+fn many_par_iter_phase_cycles() {
+    let _: u64 = (0..1_000_000_u64).into_par_iter().sum();
+
+    for _ in 0..100 {
+        zk_alloc::begin_phase();
+        let sum: u64 = (0..256_u64)
+            .into_par_iter()
+            .map(|i| {
+                let v: Vec<u64> = (0..(1 << 12)).map(|j| j ^ i).collect();
+                v.iter().sum::<u64>()
+            })
+            .sum();
+        std::hint::black_box(sum);
+        zk_alloc::end_phase();
+    }
+}
diff --git a/tests/test_size_distribution.rs b/tests/test_size_distribution.rs
new file mode 100644
index 0000000..d31cfa3
--- /dev/null
+++ b/tests/test_size_distribution.rs
@@ -0,0 +1,69 @@
+//! Profiles the size distribution of arena allocations during prove-style
+//! workloads. Helps validate that ZK_ALLOC_MIN_BYTES=4096 catches the
+//! "library state" allocations without filtering out the bulk-data ones
+//! the arena is meant to accelerate.
+//!
+//! Usage: `cargo test --release --test test_size_distribution -- --nocapture`.
+//! Output is a histogram of size buckets and the count of allocations under
+//! 4096 bytes vs. above.
+
+use std::sync::Mutex;
+
+#[global_allocator]
+static A: zk_alloc::ZkAllocator = zk_alloc::ZkAllocator;
+
+static SIZES: Mutex<Vec<usize>> = Mutex::new(Vec::new());
+
+#[test]
+fn profile_allocation_sizes_in_phase() {
+    use rayon::prelude::*;
+
+    // Warm up rayon outside arena.
+    let _: u64 = (0..1_000_000_u64).into_par_iter().sum();
+
+    // Capture allocations during a phase by piggybacking the global
+    // allocator -- we measure indirectly via overflow_stats.
+    zk_alloc::reset_overflow_stats();
+
+    zk_alloc::begin_phase();
+
+    // Mix of allocations: tiny (HashMap-style), small, medium, large.
+    {
+        let mut tiny: Vec<Vec<u8>> = (0..1000).map(|_| vec![0_u8; 32]).collect();
+        let small: Vec<Vec<u8>> = (0..1000).map(|_| vec![0_u8; 256]).collect();
+        let medium: Vec<Vec<u8>> = (0..100).map(|_| vec![0_u8; 4096]).collect();
+        let large: Vec<Vec<u8>> = (0..10).map(|_| vec![0_u8; 1 << 20]).collect();
+        std::hint::black_box((&tiny, &small, &medium, &large));
+        tiny.clear();
+        SIZES
+            .lock()
+            .unwrap()
+            .extend([tiny.capacity(), small.len(), medium.len(), large.len()]);
+    }
+
+    zk_alloc::end_phase();
+
+    let (overflow_count, overflow_bytes) = zk_alloc::overflow_stats();
+    eprintln!(
+        "overflow stats during phase (arena fallthrough): count={overflow_count}, bytes={overflow_bytes}"
+    );
+    eprintln!(
+        "min_arena_bytes() = {} (allocations below this size go to System)",
+        zk_alloc::min_arena_bytes()
+    );
+
+    // With size routing, allocations < min_arena_bytes don't touch the
+    // arena AND don't increment overflow_stats (they bypass the arena
+    // path entirely). Overflow_stats only counts allocations that tried
+    // arena but couldn't fit (slab full or too-large).
+    if zk_alloc::min_arena_bytes() >= 4096 {
+        // 1000 tiny + 1000 small were < 4096 — they go to System silently.
+        // 100 medium = 4096 each — at boundary, bypass.
+        // 10 large = 1MB each — go to arena.
+        // No overflow expected for this size mix.
+        assert_eq!(
+            overflow_count, 0,
+            "expected no overflow with size routing on"
+        );
+    }
+}
diff --git a/tests/test_size_routing_stress.rs b/tests/test_size_routing_stress.rs
new file mode 100644
index 0000000..1b59c09
--- /dev/null
+++ b/tests/test_size_routing_stress.rs
@@ -0,0 +1,47 @@
+//! Stress test for the size-routing fix (ZK_ALLOC_MIN_BYTES). Drives many
+//! phase cycles with rayon::join from main thread + canaries, to validate
+//! that the fix holds at scale (not just the 3-iter Plonky3 example).
+//!
+//! Run with `ZK_ALLOC_MIN_BYTES=4096 cargo test --release --test
+//! test_size_routing_stress -- --nocapture`. Without the env var the test
+//! is expected to fail (bug reproduces).
+
+use rayon::prelude::*;
+
+#[global_allocator]
+static A: zk_alloc::ZkAllocator = zk_alloc::ZkAllocator;
+
+#[test]
+fn many_phase_cycles_with_canaries() {
+    let _: u64 = (0..1_000_000_u64).into_par_iter().sum();
+
+    const CYCLES: usize = 100;
+    let mut failures = 0;
+    for cycle in 0..CYCLES {
+        zk_alloc::begin_phase();
+        for _ in 0..200 {
+            rayon::join(|| {}, || {});
+        }
+        zk_alloc::end_phase();
+
+        zk_alloc::begin_phase();
+        let canary = vec![0xC1_u8; 65536];
+        rayon::join(|| {}, || {});
+        zk_alloc::end_phase();
+
+        if let Some(pos) = canary.iter().position(|&b| b != 0xC1) {
+            eprintln!("cycle {cycle}: canary corrupted at offset {pos}");
+            failures += 1;
+        }
+    }
+    eprintln!("many_phase_cycles_with_canaries: {failures}/{CYCLES} corrupted");
+
+    // Default MIN_ARENA_BYTES is 4096 (size-routing fix on by default).
+    // With ZK_ALLOC_MIN_BYTES=0, fix is disabled and bug should reproduce.
+    let min_bytes_active = zk_alloc::min_arena_bytes() >= 256;
+    if min_bytes_active {
+        assert_eq!(failures, 0, "fix should prevent ALL corruption");
+    } else {
+        assert!(failures > 0, "without fix, bug should reproduce");
+    }
+}
diff --git a/tests/test_threadpool_resize.rs b/tests/test_threadpool_resize.rs
new file mode 100644
index 0000000..db28a2c
--- /dev/null
+++ b/tests/test_threadpool_resize.rs
@@ -0,0 +1,89 @@
+//! Scenario 4: thread pool resizing / building mid-phase.
+//!
+//! ThreadPoolBuilder::build() allocates a Registry, per-worker ThreadInfo
+//! arrays, initial Worker deques, and a Sleep struct. If built during an
+//! active phase, these allocations land in the arena. The pool is held by
+//! the user across phase boundaries; Registry pointers reference arena
+//! memory that gets recycled on the next begin_phase, so subsequent
+//! .install() calls walk corrupted scheduler state.
+//!
+//! Most of these allocations are sub-KB (per F6/F13 audit) and bypass arena
+//! under default size-routing. Empirical test: build a fresh ThreadPool
+//! mid-phase, cross a boundary, install work, and look for crashes / hangs.
+
+use rayon::prelude::*;
+
+#[global_allocator]
+static A: zk_alloc::ZkAllocator = zk_alloc::ZkAllocator;
+
+#[test]
+fn build_threadpool_during_phase_then_use_across_boundary() {
+    let _: u64 = (0..1_000_000_u64).into_par_iter().sum();
+
+    const CYCLES: usize = 10;
+    for cycle in 0..CYCLES {
+        zk_alloc::begin_phase();
+        let pool = rayon::ThreadPoolBuilder::new()
+            .num_threads(4)
+            .thread_name(move |i| format!("test-pool-{cycle}-{i}"))
+            .build()
+            .expect("build pool");
+        zk_alloc::end_phase();
+
+        // Phase boundary. Pool's Registry, ThreadInfo arrays, deques: any
+        // that were arena-allocated are now in recycled territory.
+        zk_alloc::begin_phase();
+        // Force the pool to use its Registry via .install + par work. If
+        // any pointer-walking state was in arena and got recycled, this
+        // crashes or hangs.
+        let result: u64 = pool.install(|| (0..1024_u64).into_par_iter().sum());
+        assert_eq!(result, 1024 * 1023 / 2);
+        zk_alloc::end_phase();
+
+        drop(pool);
+    }
+    eprintln!(
+        "build_threadpool_during_phase: {CYCLES} cycles OK (MIN_ARENA_BYTES={})",
+        zk_alloc::min_arena_bytes()
+    );
+}
+
+#[test]
+fn many_threadpool_builds_during_phase() {
+    let _: u64 = (0..1_000_000_u64).into_par_iter().sum();
+
+    for cycle in 0..20 {
+        zk_alloc::begin_phase();
+        // Build, use immediately within phase, drop. All within one phase.
+        let pool = rayon::ThreadPoolBuilder::new()
+            .num_threads(2)
+            .thread_name(move |i| format!("ephemeral-{cycle}-{i}"))
+            .build()
+            .expect("build pool");
+        let result: u64 = pool.install(|| (0..512_u64).into_par_iter().sum());
+        assert_eq!(result, 512 * 511 / 2);
+        drop(pool);
+        zk_alloc::end_phase();
+    }
+}
+
+/// Build pool BEFORE any phase, use it across many phases. Pool's allocations
+/// are pre-phase (System); should be fully isolated from phase resets.
+#[test]
+fn pre_phase_threadpool_used_across_many_phases() {
+    let _: u64 = (0..1_000_000_u64).into_par_iter().sum();
+
+    let pool = rayon::ThreadPoolBuilder::new()
+        .num_threads(4)
+        .thread_name(|i| format!("pre-phase-{i}"))
+        .build()
+        .expect("build pool");
+
+    for _ in 0..50 {
+        zk_alloc::begin_phase();
+        let result: u64 = pool.install(|| (0..1024_u64).into_par_iter().sum());
+        assert_eq!(result, 1024 * 1023 / 2);
+        zk_alloc::end_phase();
+    }
+    drop(pool);
+}
diff --git a/tests/test_worker_deque_growth.rs b/tests/test_worker_deque_growth.rs
new file mode 100644
index 0000000..7e6b45b
--- /dev/null
+++ b/tests/test_worker_deque_growth.rs
@@ -0,0 +1,149 @@
+//! Scenario 2: per-worker crossbeam-deque Buffer growth.
+//!
+//! crossbeam-deque's Worker::push doubles its Buffer when the deque fills.
+//! Initial capacity 32 slots × ~16 bytes per JobRef ≈ 512 bytes (under
+//! MIN_ARENA_BYTES=4096). At ≥ 256 simultaneously pending tasks, the buffer
+//! grows past 4 KB and lands in the arena slab.
+//!
+//! Workers retain their Buffer across phase boundaries — crossbeam never
+//! shrinks. After end_phase + begin_phase, the slab is recycled but the
+//! Worker still references the same Buffer pointer. The next push writes
+//! a JobRef into recycled memory.
+//!
+//! Tests: drive rayon::join recursion from inside a worker (so pushes
+//! land on a worker's local deque, not the global Injector) to force Buffer
+//! growth past the size-routing threshold, then look for canary corruption.
+//!
+//! Recursion depth is capped at 256 to stay within macOS's smaller default
+//! thread stack in debug builds; 256 pending tasks already drives the
+//! Buffer to the 256-slot capacity that crosses MIN_ARENA_BYTES=4096.
+
+use rayon::prelude::*;
+
+#[global_allocator]
+static A: zk_alloc::ZkAllocator = zk_alloc::ZkAllocator;
+
+/// Recursive rayon::join — each level pushes one right-task to the worker's
+/// local deque. Peak pending tasks on the deque ≈ depth.
+fn nested_join(depth: usize) {
+    if depth == 0 {
+        return;
+    }
+    rayon::join(|| nested_join(depth - 1), || {});
+}
+
+#[test]
+fn worker_deque_growth_during_phase() {
+    let _: u64 = (0..1_000_000_u64).into_par_iter().sum();
+
+    const CYCLES: usize = 20;
+    const DEPTH: usize = 256; // crosses Buffer growth into arena (~4 KB)
+
+    let mut failures = 0;
+    for cycle in 0..CYCLES {
+        zk_alloc::begin_phase();
+        // Push from a worker context so growth happens on a per-worker deque,
+        // not the global Injector.
+        rayon::join(|| nested_join(DEPTH), || {});
+        zk_alloc::end_phase();
+
+        zk_alloc::begin_phase();
+        let canary = vec![0xC9_u8; 65536];
+        // Force more worker activity to consume / push deque slots.
+        rayon::join(|| nested_join(64), || {});
+        zk_alloc::end_phase();
+
+        if let Some(pos) = canary.iter().position(|&b| b != 0xC9) {
+            eprintln!("cycle {cycle}: canary corrupted at offset {pos}");
+            failures += 1;
+        }
+    }
+    eprintln!(
+        "worker_deque_growth_during_phase: {failures}/{CYCLES} cycles corrupted (MIN_ARENA_BYTES={})",
+        zk_alloc::min_arena_bytes()
+    );
+
+    // With size-routing default 4096, Buffers up to 256 slots (~4 KB) go to
+    // System. Buffers above that — driven here by DEPTH=1024 — go to arena.
+    // If size-routing is enough, failures==0. If not, failures>0.
+    if zk_alloc::min_arena_bytes() >= 4096 {
+        // Document outcome — assertion deferred to actual observation.
+    }
+}
+
+/// Same idea but with a single very deep recursion, no canary mismatch
+/// allowed. If buffer growth + phase recycle causes corruption, this should
+/// crash or panic via tracing/rayon internals (similar to F1).
+#[test]
+fn deep_recursion_phase_cycle_program_integrity() {
+    let _: u64 = (0..1_000_000_u64).into_par_iter().sum();
+
+    for _ in 0..50 {
+        zk_alloc::begin_phase();
+        rayon::join(|| nested_join(256), || {});
+        zk_alloc::end_phase();
+    }
+}
+
+/// Drive worker buffer growth via deep recursion that ALSO performs heap
+/// allocations in each frame. If the worker's grown Buffer landed in the
+/// worker's own slab, then phase 2 worker allocations at the same offset
+/// would corrupt either the buffer (visible as a crash on next push/pop) or
+/// — if the canary placement aligns — corrupt the canary directly.
+fn nested_join_with_alloc(depth: usize) {
+    if depth == 0 {
+        return;
+    }
+    let v: Vec<u64> = vec![depth as u64; 1024]; // 8 KB, > MIN_ARENA_BYTES
+    rayon::join(|| nested_join_with_alloc(depth - 1), || {});
+    std::hint::black_box(v);
+}
+
+#[test]
+fn worker_buffer_growth_with_per_worker_canary() {
+    let _: u64 = (0..1_000_000_u64).into_par_iter().sum();
+
+    let mut failures = 0;
+    const CYCLES: usize = 20;
+    for cycle in 0..CYCLES {
+        zk_alloc::begin_phase();
+        rayon::join(|| nested_join_with_alloc(256), || {});
+        zk_alloc::end_phase();
+
+        zk_alloc::begin_phase();
+        // Each worker allocates a 16 KB canary in its own slab, then drives
+        // more rayon work that uses the (potentially recycled) deque buffer.
+        let results: Vec<bool> = (0..32_u64)
+            .into_par_iter()
+            .map(|_| {
+                let canary = vec![0xC9_u8; 16384];
+                rayon::join(|| nested_join_with_alloc(64), || {});
+                canary.iter().all(|&b| b == 0xC9)
+            })
+            .collect();
+        zk_alloc::end_phase();
+
+        let n_corrupt = results.iter().filter(|&&ok| !ok).count();
+        if n_corrupt > 0 {
+            eprintln!("cycle {cycle}: {n_corrupt}/32 workers saw canary corruption");
+            failures += 1;
+        }
+    }
+    eprintln!(
+        "worker_buffer_growth_with_per_worker_canary: {failures}/{CYCLES} cycles with corruption (MIN_ARENA_BYTES={})",
+        zk_alloc::min_arena_bytes()
+    );
+
+    // With size-routing (default 4096), the worker's grown buffer falls into
+    // arena only at cap >= 256 slots (4 KB). The 8 KB Vecs allocated in
+    // each frame do go to arena and could overlap. In practice the
+    // size-routing fix is enough to prevent corruption observable from the
+    // canary; without it, the bug manifests as SIGSEGV (verified with
+    // ZK_ALLOC_MIN_BYTES=0 + --no-default-features).
+    if zk_alloc::min_arena_bytes() >= 4096 {
+        assert_eq!(
+            failures, 0,
+            "size-routing should prevent worker-deque corruption"
+        );
+    }
+}