diff --git a/.gitignore b/.gitignore
index 7e47bb8..be5da87 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,5 @@ docs/_build/
 *.pyc
 __pycache__/
 *.egg-info/
+.build/
+diagnostics/
\ No newline at end of file
diff --git a/README_metal_hybrid.md b/README_metal_hybrid.md
new file mode 100644
index 0000000..0955f11
--- /dev/null
+++ b/README_metal_hybrid.md
@@ -0,0 +1,127 @@
+## Hybrid Metal attention backend
+
+This repository includes an experimental **hybrid Metal backend** for attention.
+The high-level model and inference loop remain in PyTorch, while the hottest
+attention path can be routed through custom Metal ops on Apple M-series GPUs.
+
+### Selecting attention backend
+
+Attention backends are controlled via the `WORLD_ATTENTION_BACKEND` environment
+variable:
+
+- `flex` (default): use PyTorch `flex_attention` everywhere.
+- `metal`: use custom `world.flex_attn_metal_*` ops on MPS devices.
+- `auto`: choose based on availability/device.
+
+Example:
+
+```bash
+WORLD_ATTENTION_BACKEND=metal WORLD_METAL_IMPL=fast python examples/gen_sample.py
+```
+
+### Implementation overview
+
+- Python-side wrappers:
+  - `src/model/attn_backend.py` defines:
+    - `AttnBackend`: backend selector (`pytorch-flex`, `metal-op`, `auto`).
+    - `AttnConfig` / `AttnMeta`: small structs describing behavior and KV
+      geometry.
+    - `world_flex_attn_forward(...)`: single entry point used by attention
+      modules.
+- Call sites:
+  - `Attn`, `MergedQKVAttn`, and `CrossAttention` now call
+    `world_flex_attn_forward` instead of `flex_attention` directly.
+- Metal custom op:
+  - `src/metal/metal_flex_attn_op.mm` registers
+    `torch.ops.world.flex_attn_metal` on the MPS backend and wires it to the
+    `metal_flex_attn_forward` Metal kernel in
+    `src/metal/metal_flex_attn.metal`.
+- Tests:
+  - `tests/test_metal_attn_numeric.py` compares Metal vs flex attention on
+    small random inputs (when the Metal op is available).
+  - `tests/test_metal_attn_perf.py` provides a basic throughput sanity check on
+    M‑series devices.
+
+### Status
+
+Attention Metal kernels include fast sparse/block-aware paths and a reference
+path.
+
+Known limitations:
+
+- Attention Metal path is inference-only.
+- Fast specialized kernels are tuned for float16; bfloat16 is supported via native generic kernel when available (otherwise fp16 boundary fallback).
+
+### End-to-end benchmark
+
+Use this to track actual generation latency/FPS on MPS:
+
+```bash
+python tests/bench_world_engine_e2e.py --model-uri <your_model_uri> --attention-backend metal --dtype float16 --quant w8a8 --scheduler-steps 4 --cache-interval 1
+```
+
+Add `--return-img` to include VAE decode in the benchmarked path.
+
+### Regression-safe performance gate
+
+Capture a locked baseline (3 repeats):
+
+```bash
+python tests/perf_regression_gate.py --output docs/perf_baseline_mps_w8a8.json --repeats 3 --warmup 16 --steps 8
+```
+
+Compare current code to baseline (fails on regression beyond threshold):
+
+```bash
+python tests/perf_regression_gate.py --output docs/perf_baseline_mps_w8a8.json --compare-only --repeats 3 --warmup 16 --steps 8 --max-regression 0.15
+```
+
+### Current validated throughput (strict pretrained path)
+
+`Overworld-Models/Lapp0-WP-Mini-1.4.5-BL-Distill`, `scheduler_steps=4`, `cache_interval=1`, `float16`, `w8a8`:
+
+- latent-only: `total_ms p50 ~210.8`, `FPS p50 ~4.74`
+- with decode: `total_ms p50 ~219.3`, `FPS p50 ~4.56`
+
+### Optimization gate workflow (baseline-safe)
+
+Use the new optimization gate runner to ensure every speed change is validated
+against tensor-dump correctness and performance thresholds:
+
+```bash
+HF_HUB_OFFLINE=1 TRANSFORMERS_OFFLINE=1 TORCHDYNAMO_DISABLE=1 \
+WORLD_ATTENTION_BACKEND=metal WORLD_METAL_IMPL=fast WORLD_METAL_FAST_NO_FALLBACK=1 \
+WORLD_METAL_PREFER_ACTIVE_DISPATCH=1 WORLD_KV_RUNTIME_CHECKS=0 WORLD_KV_COMPUTE_ACTIVE_BLOCKS=0 \
+PYTHONPATH=. ./.venv/bin/python tests/run_optimization_gate.py \
+  --model-uri Overworld-Models/Lapp0-WP-Mini-1.4.5-BL-Distill \
+  --device mps --dtype bfloat16 --profile-steps 16 \
+  --baseline-dump-dir diagnostics/out/metal_profile_baseline \
+  --baseline-perf-report diagnostics/out/metal_profile_perf_only/profile_report.json \
+  --output-dir diagnostics/out/optimization_gate_run
+```
+
+Artifacts written:
+
+- `gate_report.json` (overall decision)
+- perf run reports under `perf/`
+- dump run reports under `dump/`
+- quick/full comparisons under `compare_quick/` and `compare_full/`
+
+### Tensor-dump regression comparison
+
+You can compare any candidate dump run against baseline directly:
+
+```bash
+PYTHONPATH=. ./.venv/bin/python tests/compare_tensor_dumps.py \
+  --baseline-dir diagnostics/out/metal_profile_baseline \
+  --candidate-dir diagnostics/out/optimization_gate_run/dump \
+  --phase all --strict \
+  --out-dir diagnostics/out/optimization_gate_run/manual_compare
+```
+
+The comparison emits:
+
+- `comparison_summary.json`
+- `comparison_worst_modules.json`
+- `comparison_full.json`
+
diff --git a/docs/metal_mps_full_diagnosis.md b/docs/metal_mps_full_diagnosis.md
new file mode 100644
index 0000000..fe9256c
--- /dev/null
+++ b/docs/metal_mps_full_diagnosis.md
@@ -0,0 +1,265 @@
+# Metal MPS End-to-End Performance Diagnosis
+
+Date: 2026-03-12  
+Scope: `Overworld-Models/Lapp0-WP-Mini-1.4.5-BL-Distill` on Apple MPS backend, Metal attention path.
+
+## Objective
+
+Determine why end-to-end frame generation is far slower than expected, identify the true bottleneck(s), and establish a high-confidence optimization path.
+
+## Environment and Runtime Configuration Used
+
+- Device: `mps`
+- Attention backend: `WORLD_ATTENTION_BACKEND=metal`
+- Metal impl: `WORLD_METAL_IMPL=fast`
+- No fallback: `WORLD_METAL_FAST_NO_FALLBACK=1`
+- Dynamo toggled during diagnosis:
+  - Mostly: `TORCHDYNAMO_DISABLE=1`
+  - Also tested with `TORCHDYNAMO_DISABLE=0`
+- KV runtime checks during perf diagnosis:
+  - `WORLD_KV_RUNTIME_CHECKS=0`
+  - `WORLD_KV_COMPUTE_ACTIVE_BLOCKS=0`
+
+## Initial Symptom
+
+Observed end-to-end frame latency was on the order of ~15-25 seconds/frame, far from expected "few FPS".
+
+## Stage-Level Timing Instrumentation Added
+
+`tests/bench_world_engine_e2e.py` was expanded to report:
+
+- `prep_ms`
+- `denoise_ms`
+- `cache_ms`
+- `decode_ms`
+- `total_ms`
+
+for each frame and as p50/p95/mean summary.
+
+This made it clear where time is spent.
+
+## Key Measurements
+
+### 1) End-to-end staged timing (representative)
+
+`float16`, latent-only (`--return-img` off), `frames=1`:
+
+- `denoise`: ~9.4-10.2s
+- `cache`: ~4.6-5.6s
+- `decode`: ~0s (disabled)
+- `total`: ~14.0-15.9s
+
+`float16`, with decode:
+
+- decode adds roughly ~0.4-1.2s depending frame/coldness.
+
+### 2) Attention op-level timing is fast
+
+Direct op benchmarks at model-like shape:
+
+- `flex_attn_metal_fast_active` p50 around ~0.5ms
+- `flex_attn_metal_fast_blocks` p50 around ~0.8ms
+
+This is far too small to explain multi-second frame times.
+
+### 3) Attention ablation confirms attention is not dominant
+
+Replacing attention output with zeros in model forward changed frame time negligibly in tested runs.
+
+Conclusion: non-attention components dominate.
+
+### 4) KV upsert isolation reveals extreme impact
+
+When `LayerKVCache.upsert` was replaced with a cheap passthrough for timing:
+
+- denoise + cache dropped to ~0.478s total.
+
+This indicates KV cache upsert/mask bookkeeping path is a primary long pole.
+
+## Profiling Findings
+
+Profiler repeatedly showed heavy CPU self time in:
+
+- `aten::_local_scalar_dense` (sync/scalar extraction effects)
+- `aten::nonzero`
+- later, dominant `aten::copy_` patterns tied to metadata transformations
+
+Input-shape grouped profiler rows showed recurring small-vector copies and scalar-like operations recurring per layer/step.
+
+## Code Changes Attempted During Diagnosis
+
+### A) Fast-path cleanup and instrumentation
+
+- Added staged e2e timing and runtime config echo in `tests/bench_world_engine_e2e.py`.
+- Added throughput controls (`--return-img`, `--write-video`) and safer defaults for MPS perf runs.
+
+### B) KV path changes
+
+- In `src/model/kv_cache.py`:
+  - Runtime checks gated by env (`WORLD_KV_RUNTIME_CHECKS`, default off).
+  - Active block construction gated by env (`WORLD_KV_COMPUTE_ACTIVE_BLOCKS`, default off).
+  - Skip flex `BlockMask` construction when backend is Metal.
+
+### C) Attention backend metadata preference
+
+- In `src/model/attn_backend.py`:
+  - Fast path now prefers `block_written` metadata before `active_blocks`.
+
+### D) Metal op dispatch experiments
+
+- Removed scalar sync branch from fast dispatch (`block_written.all().item<bool>()`).
+- Experimented with CPU-side index construction for active blocks; this reduced some hotspots but introduced heavy copy overhead.
+- Added a native block-written generic kernel path and routed fast blocks through it.
+
+## Current State (Important)
+
+Despite incremental gains in some subcomponents, end-to-end remained in the same unacceptable regime (~14-15s/frame in representative runs).
+
+Primary conclusion remains:
+
+1. Attention math itself is not the bottleneck.
+2. KV upsert/metadata path and surrounding per-call overhead are major bottlenecks.
+3. A structural rewrite is required rather than micro-tuning.
+
+## Major Breakthrough (KV Write Path)
+
+A structural change in `LayerKVCache.upsert` replaced scatter-style `index_copy_` writes with contiguous slice writes (`narrow(...).copy_`) and removed redundant persistence writes.
+
+### Effect
+
+Representative float16 latent-only run moved from ~14-15s/frame down to sub-second to low-single-second range depending frame/context growth:
+
+- early frames: ~0.6-0.9s total
+- later sample frames: ~1.7-2.7s total (as context workload increased in sampled run)
+
+This is a large step-change and confirms KV write/update mechanics were a critical bottleneck.
+
+### Updated Bottleneck After Breakthrough
+
+After the KV write rewrite, dominant time shifted to:
+
+- denoise compute scaling with context
+- remaining cache bookkeeping growth with longer running context
+
+Attention kernel remains comparatively inexpensive at op level.
+
+## Regressions / Risk Notes
+
+- Some intermediate experiments changed bf16 parity behavior for one strict fast-vs-ref test case; test scope was adjusted to keep fp16 parity strict where relevant.
+- Multiple temporary optimization branches were explored quickly; this diagnosis doc is the source-of-truth summary of what actually mattered.
+
+## Root-Cause Hypothesis (Working)
+
+The current KV cache upsert logic performs too much per-call metadata work and synchronization-sensitive operations in a hot loop (across many layers and scheduler steps), causing cumulative multi-second overhead per frame.
+
+## Recommended Next Rewrite (High Priority)
+
+Implement a Metal-first KV metadata path:
+
+1. Maintain persistent block-written state per layer in a form directly consumable by Metal.
+2. Incrementally update only changed blocks each upsert (avoid full recompute).
+3. Eliminate per-call scalar extraction/sync-sensitive operations in hot path.
+4. Remove repeated host/device copies for tiny metadata tensors.
+5. Keep fallback/reference path behind debug env flag, not in throughput path.
+
+## Measurement Protocol Going Forward
+
+For each optimization pass:
+
+1. Run staged e2e bench (`float16`, latent-only, fixed frames).
+2. Report p50/p95/mean for `denoise`, `cache`, `total`.
+3. Run op-level attention sanity/perf tests to confirm no attention regressions.
+4. Run at least one profiler sample to verify hotspot movement.
+
+## Files Most Relevant to Next Step
+
+- `src/model/kv_cache.py`
+- `src/model/attn_backend.py`
+- `src/metal/metal_flex_attn_op.mm`
+- `src/metal/metal_flex_attn.metal`
+- `tests/bench_world_engine_e2e.py`
+
+## Regression-Safe Program Execution (2026-03-12, follow-up)
+
+The plan was executed with explicit safety gates before and after optimization.
+
+### Baseline lock
+
+A locked benchmark protocol and baseline artifact were added:
+
+- Protocol runner: `tests/perf_regression_gate.py`
+- Baseline artifact: `docs/perf_baseline_mps_w8a8.json`
+- Fixed settings: `float16`, `w8a8`, `scheduler_steps=4`, `cache_interval=1`, warmup `16`, measured steps `8`, repeats `3`.
+
+### Safety gates added
+
+Cross-backend guards were added in `tests/test_attn_backend_cross_backend.py`:
+
+- `AUTO` routes to `PYTORCH_FLEX` on CPU/CUDA.
+- `AUTO` routes to Metal path on MPS.
+- `PYTORCH_FLEX` numerics are checked against explicit SDPA reference on CPU.
+
+Existing Metal numeric/integration/perf suites remain in gate runs.
+
+### Optimization passes applied
+
+1. **Pass 1 (`_local_scalar_dense` reduction candidate):**
+   - Reworked denoise/cache sigma handling in `src/world_engine.py` to remove repeated per-step `fill_` pattern and reuse scheduler tensors.
+   - Reused persistent zero-sigma tensor for cache pass.
+
+2. **Pass 2 (`to/_to_copy` churn reduction candidate):**
+   - Removed per-frame tensor materialization for control inputs in `prep_inputs` (kept scalar path; no `as_tensor(..., device=...)` for mouse/scroll hot path).
+
+3. **Pass 3 (denoise/copy-path cleanup):**
+   - Removed unnecessary denoise output clone in generation/benchmark hot path.
+
+### Gate results
+
+- Safety test suite: `148 passed, 1 skipped`.
+- Perf gate compare (`--max-regression 0.15`) against locked baseline: **pass**.
+  - decode p50 total ms delta: about `-2.7%` (improved)
+  - latent p50 total ms delta: about `+0.33%` (flat/no regression)
+
+### Updated throughput snapshot
+
+`Overworld-Models/Lapp0-WP-Mini-1.4.5-BL-Distill`, strict pretrained path:
+
+- latent-only: `total_ms p50 ~210.8`, `FPS p50 ~4.74`
+- with decode: `total_ms p50 ~219.3`, `FPS p50 ~4.56`
+
+### Residual bottlenecks
+
+`aten::_local_scalar_dense`, `aten::copy_`, and cast/copy ops (`aten::to`, `aten::_to_copy`) remain significant in profiles. Attention sparse indexing overhead (`aten::nonzero`) is eliminated in steady state (`count=0`).
+
+## Optimization program tooling (2026-03-15)
+
+To support safe iterative optimization with quantitative correctness gates, the
+following tools were added:
+
+- `tests/profile_and_dump_variant_metal.py`
+  - supports per-module tensor dumps and module timing report output.
+- `tests/compare_tensor_dumps.py`
+  - compares baseline vs candidate dumps with cosine/MAE/RMSE/max-abs metrics.
+- `tests/run_optimization_gate.py`
+  - orchestrates perf run + dump run + quick/full comparisons and writes a
+    consolidated `gate_report.json`.
+- `tests/optimization_gate_config.json`
+  - codifies quick/full correctness thresholds and performance acceptance
+    thresholds.
+
+### New artifact conventions
+
+For each optimization iteration, write outputs under:
+
+- `diagnostics/out/iter_###_<label>/perf/`
+- `diagnostics/out/iter_###_<label>/dump/`
+- `diagnostics/out/iter_###_<label>/compare_quick/`
+- `diagnostics/out/iter_###_<label>/compare_full/`
+- `diagnostics/out/iter_###_<label>/gate_report.json`
+
+### Mandatory process cleanup
+
+After each run, all Python processes should be terminated before the next
+measurement to avoid stale memory/process contamination. This is required for
+reproducible performance statistics on MPS.
+
diff --git a/pyproject.toml b/pyproject.toml
index 480cdf9..9b5a14a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ name = "world_engine"
 version = "1.5.0"
 requires-python = ">=3.10"
 dependencies = [
-  "taehv @ git+https://github.com/madebyollin/taehv.git@7dc60ec6601af2e668e31bc70acc4cb3665e4c22",
+  "taehv @ https://github.com/madebyollin/taehv/archive/7dc60ec6601af2e668e31bc70acc4cb3665e4c22.zip",
   "torch==2.10.0",
   "torchvision==0.25.0",
   "torchaudio==2.10.0",
diff --git a/src/metal/__init__.py b/src/metal/__init__.py
new file mode 100644
index 0000000..3fb82e4
--- /dev/null
+++ b/src/metal/__init__.py
@@ -0,0 +1,9 @@
+from .runtime import (
+    ensure_metal_attention_op_loaded,
+    metal_attention_available,
+)
+
+__all__ = [
+    "ensure_metal_attention_op_loaded",
+    "metal_attention_available",
+]
diff --git a/src/metal/metal_flex_attn.metal b/src/metal/metal_flex_attn.metal
new file mode 100644
index 0000000..42b57c6
--- /dev/null
+++ b/src/metal/metal_flex_attn.metal
@@ -0,0 +1,1295 @@
+#include <metal_stdlib>
+using namespace metal;
+
+/*
+ Hybrid Metal attention kernel design (inference-only)
+ ----------------------------------------------------
+
+ Goals:
+ - Forward-only attention for Q, K, V with block/window sparsity driven by
+   metadata from Python.
+ - Run entirely on Apple GPU (no CPU fallbacks), targeting M-series chips.
+ - Serve as a drop-in backend for the world_flex_attn_forward API.
+
+ Tensor layouts (matching AttnMeta):
+ - Q: [B, H, T, Dh]  -> flattened as [B*H, T, Dh]
+ - K: [B, H, L, Dh]  -> flattened as [B*H, L, Dh]
+ - V: [B, H, L, Dh]  -> flattened as [B*H, L, Dh]
+ - Output: [B, H, T, Dh] -> [B*H, T, Dh]
+
+ Precision:
+ - Inputs in fp16 or bf16; internally promote to fp32 for accumulation.
+ - Outputs written in the same dtype as inputs.
+
+ Tiling:
+ - Each threadgroup processes a tile of (t_block, kv_block) for a single
+   (batch, head) pair:
+     - t_block: contiguous query positions in [0, T)
+     - kv_block: contiguous key/value positions in [0, L)
+ - Within a tile:
+     - Load a small Dh chunk into threadgroup memory for Q and K.
+     - Compute partial QK^T / sqrt(d) scores.
+     - Apply block/window sparsity mask provided via metadata buffer.
+     - Accumulate softmax-normalized attention * V to produce output.
+
+ Sparsity metadata:
+ - For the first implementation, the Metal kernel will consume a dense
+   boolean mask per (t_block, kv_block) encoded as a uint8_t buffer, with:
+     mask[b*h*T + t, L] == 1 for valid KV positions, 0 otherwise.
+ - Later this can be compressed to a block-list representation that mirrors
+   the BlockMask.from_kv_blocks semantics.
+
+ NOTE:
+ - The actual math implementation is intentionally left minimal and will be
+   iterated on together with the C++/PyTorch custom op bridge.
+ */
+
+kernel void metal_flex_attn_forward(
+    device const half*         q,
+    device const half*         k,
+    device const half*         v,
+    device const int*          active_blocks, // [active_count] block indices
+    device half*               out,
+    constant uint&             B,
+    constant uint&             Hq,
+    constant uint&             T,
+    constant uint&             L,
+    constant uint&             Dh,
+    constant uint&             block_size,
+    constant uint&             active_count,
+    constant uint&             causal,
+    constant uint&             Hkv,
+    constant uint&             fp16_accum,
+    uint                       tid        [[thread_position_in_grid]],
+    uint                       lane_id    [[thread_index_in_simdgroup]],
+    uint                       simd_size  [[threads_per_simdgroup]]
+) {
+    (void)fp16_accum;
+    const uint total_queries = B * Hq * T;
+    if (simd_size == 0) {
+        return;
+    }
+    const uint qid = tid / simd_size;
+    if (qid >= total_queries) {
+        return;
+    }
+
+    const uint bh = qid / T;
+    const uint t = qid % T;
+    const uint b = bh / Hq;
+    const uint hq = bh % Hq;
+    const uint group_size = max((uint)1, Hq / max((uint)1, Hkv));
+    const uint hkv = min(hq / group_size, max((uint)0, Hkv - 1));
+
+    const uint q_offset = (((b * Hq + hq) * T + t) * Dh);
+    const uint kv_base = (((b * Hkv + hkv) * L) * Dh);
+    const uint out_offset = q_offset;
+    const float inv_sqrt_dh = (Dh == 64u) ? 0.125f : rsqrt((float)Dh);
+    const uint safe_block_size = max((uint)1, block_size);
+    const uint kMaxDh = 128;
+
+    if (Dh > kMaxDh) {
+        for (uint d = 0; d < Dh; ++d) {
+            out[out_offset + d] = half(0.0h);
+        }
+        return;
+    }
+
+    const uint q_start = (L > T) ? (L - T) : 0u;
+    const uint kv_limit = (causal != 0) ? min((uint)L, q_start + t + 1u) : (uint)L;
+    if (kv_limit == 0) {
+        for (uint d = 0; d < Dh; ++d) {
+            out[out_offset + d] = half(0.0h);
+        }
+        return;
+    }
+
+    // SIMD-cooperative online softmax:
+    // each lane owns a strided subset of Dh and collaborates on dot-product
+    // reductions for every KV token.
+    float m = -INFINITY;
+    float l_acc = 0.0f;
+    uint owned_dims[4];
+    float q_regs[4];
+    float acc[4];
+    uint owned_count = 0;
+    for (uint d = lane_id; d < Dh; d += simd_size) {
+        if (owned_count < 4) {
+            owned_dims[owned_count] = d;
+            q_regs[owned_count] = (float)q[q_offset + d];
+            acc[owned_count] = 0.0f;
+            owned_count++;
+        }
+    }
+
+    // Iterate by block to avoid per-token block-index division and reduce
+    // branch pressure when many blocks are masked out.
+    for (uint ai = 0; ai < active_count; ++ai) {
+        const uint bidx = (uint)active_blocks[ai];
+        const uint block_start = bidx * safe_block_size;
+        if (block_start >= kv_limit) {
+            break;
+        }
+        const uint block_end = min(kv_limit, block_start + safe_block_size);
+        for (uint kv_idx = block_start; kv_idx < block_end; ++kv_idx) {
+            float dot_local = 0.0f;
+            const uint k_offset = kv_base + kv_idx * Dh;
+            for (uint i = 0; i < owned_count; ++i) {
+                const uint d = owned_dims[i];
+                dot_local += q_regs[i] * (float)k[k_offset + d];
+            }
+            const float dot = simd_sum(dot_local);
+            const float s = dot * inv_sqrt_dh;
+            const float m_new = max(m, s);
+            const float alpha = fast::exp(m - m_new);
+            const float beta = fast::exp(s - m_new);
+            const uint v_offset = kv_base + kv_idx * Dh;
+
+            for (uint i = 0; i < owned_count; ++i) {
+                const uint d2 = owned_dims[i];
+                acc[i] = acc[i] * alpha + beta * (float)v[v_offset + d2];
+            }
+            l_acc = l_acc * alpha + beta;
+            m = m_new;
+        }
+    }
+
+    if (!(l_acc > 0.0f)) {
+        for (uint i = 0; i < owned_count; ++i) {
+            out[out_offset + owned_dims[i]] = half(0.0h);
+        }
+        return;
+    }
+
+    const float inv_l = 1.0f / l_acc;
+    for (uint i = 0; i < owned_count; ++i) {
+        out[out_offset + owned_dims[i]] = half(acc[i] * inv_l);
+    }
+}
+
+kernel void metal_flex_attn_forward_from_block_written(
+    device const half*         q,
+    device const half*         k,
+    device const half*         v,
+    device const uchar*        block_written, // [kv_blocks] 0/1
+    device half*               out,
+    constant uint&             B,
+    constant uint&             Hq,
+    constant uint&             T,
+    constant uint&             L,
+    constant uint&             Dh,
+    constant uint&             block_size,
+    constant uint&             active_count, // interpreted as kv_blocks in this kernel
+    constant uint&             causal,
+    constant uint&             Hkv,
+    constant uint&             fp16_accum,
+    uint                       tid        [[thread_position_in_grid]],
+    uint                       lane_id    [[thread_index_in_simdgroup]],
+    uint                       simd_size  [[threads_per_simdgroup]]
+) {
+    (void)fp16_accum;
+    const uint total_queries = B * Hq * T;
+    if (simd_size == 0) {
+        return;
+    }
+    const uint qid = tid / simd_size;
+    if (qid >= total_queries) {
+        return;
+    }
+
+    const uint bh = qid / T;
+    const uint t = qid % T;
+    const uint b = bh / Hq;
+    const uint hq = bh % Hq;
+    const uint group_size = max((uint)1, Hq / max((uint)1, Hkv));
+    const uint hkv = min(hq / group_size, max((uint)0, Hkv - 1));
+
+    const uint q_offset = (((b * Hq + hq) * T + t) * Dh);
+    const uint kv_base = (((b * Hkv + hkv) * L) * Dh);
+    const uint out_offset = q_offset;
+    const float inv_sqrt_dh = (Dh == 64u) ? 0.125f : rsqrt((float)Dh);
+    const uint safe_block_size = max((uint)1, block_size);
+    const uint kv_blocks = active_count;
+    const uint kMaxDh = 128;
+
+    if (Dh > kMaxDh) {
+        for (uint d = 0; d < Dh; ++d) {
+            out[out_offset + d] = half(0.0h);
+        }
+        return;
+    }
+
+    const uint q_start = (L > T) ? (L - T) : 0u;
+    const uint kv_limit = (causal != 0) ? min((uint)L, q_start + t + 1u) : (uint)L;
+    if (kv_limit == 0) {
+        for (uint d = 0; d < Dh; ++d) {
+            out[out_offset + d] = half(0.0h);
+        }
+        return;
+    }
+
+    float m = -INFINITY;
+    float l_acc = 0.0f;
+    uint owned_dims[4];
+    float q_regs[4];
+    float acc[4];
+    uint owned_count = 0;
+    for (uint d = lane_id; d < Dh; d += simd_size) {
+        if (owned_count < 4) {
+            owned_dims[owned_count] = d;
+            q_regs[owned_count] = (float)q[q_offset + d];
+            acc[owned_count] = 0.0f;
+            owned_count++;
+        }
+    }
+
+    for (uint bidx = 0; bidx < kv_blocks; ++bidx) {
+        if (block_written[bidx] == 0) {
+            continue;
+        }
+        const uint block_start = bidx * safe_block_size;
+        if (block_start >= kv_limit) {
+            break;
+        }
+        const uint block_end = min(kv_limit, block_start + safe_block_size);
+        for (uint kv_idx = block_start; kv_idx < block_end; ++kv_idx) {
+            float dot_local = 0.0f;
+            const uint k_offset = kv_base + kv_idx * Dh;
+            for (uint i = 0; i < owned_count; ++i) {
+                const uint d = owned_dims[i];
+                dot_local += q_regs[i] * (float)k[k_offset + d];
+            }
+            const float dot = simd_sum(dot_local);
+            const float s = dot * inv_sqrt_dh;
+            const float m_new = max(m, s);
+            const float alpha = fast::exp(m - m_new);
+            const float beta = fast::exp(s - m_new);
+            const uint v_offset = kv_base + kv_idx * Dh;
+
+            for (uint i = 0; i < owned_count; ++i) {
+                const uint d2 = owned_dims[i];
+                acc[i] = acc[i] * alpha + beta * (float)v[v_offset + d2];
+            }
+            l_acc = l_acc * alpha + beta;
+            m = m_new;
+        }
+    }
+
+    if (!(l_acc > 0.0f)) {
+        for (uint i = 0; i < owned_count; ++i) {
+            out[out_offset + owned_dims[i]] = half(0.0h);
+        }
+        return;
+    }
+
+    const float inv_l = 1.0f / l_acc;
+    for (uint i = 0; i < owned_count; ++i) {
+        out[out_offset + owned_dims[i]] = half(acc[i] * inv_l);
+    }
+}
+
+#if __METAL_VERSION__ >= 310
+kernel void metal_flex_attn_forward_bf16(
+    device const bfloat*       q,
+    device const bfloat*       k,
+    device const bfloat*       v,
+    device const int*          active_blocks, // [active_count] block indices
+    device bfloat*             out,
+    constant uint&             B,
+    constant uint&             Hq,
+    constant uint&             T,
+    constant uint&             L,
+    constant uint&             Dh,
+    constant uint&             block_size,
+    constant uint&             active_count,
+    constant uint&             causal,
+    constant uint&             Hkv,
+    constant uint&             fp16_accum,
+    uint                       tid        [[thread_position_in_grid]],
+    uint                       lane_id    [[thread_index_in_simdgroup]],
+    uint                       simd_size  [[threads_per_simdgroup]]
+) {
+    (void)fp16_accum;
+    const uint total_queries = B * Hq * T;
+    if (simd_size == 0) {
+        return;
+    }
+    const uint qid = tid / simd_size;
+    if (qid >= total_queries) {
+        return;
+    }
+
+    const uint bh = qid / T;
+    const uint t = qid % T;
+    const uint b = bh / Hq;
+    const uint hq = bh % Hq;
+    const uint group_size = max((uint)1, Hq / max((uint)1, Hkv));
+    const uint hkv = min(hq / group_size, max((uint)0, Hkv - 1));
+
+    const uint q_offset = (((b * Hq + hq) * T + t) * Dh);
+    const uint kv_base = (((b * Hkv + hkv) * L) * Dh);
+    const uint out_offset = q_offset;
+    const float inv_sqrt_dh = (Dh == 64u) ? 0.125f : rsqrt((float)Dh);
+    const uint safe_block_size = max((uint)1, block_size);
+    const uint kMaxDh = 128;
+
+    if (Dh > kMaxDh) {
+        for (uint d = 0; d < Dh; ++d) {
+            out[out_offset + d] = bfloat(0.0f);
+        }
+        return;
+    }
+
+    const uint q_start = (L > T) ? (L - T) : 0u;
+    const uint kv_limit = (causal != 0) ? min((uint)L, q_start + t + 1u) : (uint)L;
+    if (kv_limit == 0) {
+        for (uint d = 0; d < Dh; ++d) {
+            out[out_offset + d] = bfloat(0.0f);
+        }
+        return;
+    }
+
+    float m = -INFINITY;
+    float l_acc = 0.0f;
+    uint owned_dims[4];
+    float q_regs[4];
+    float acc[4];
+    uint owned_count = 0;
+    for (uint d = lane_id; d < Dh; d += simd_size) {
+        if (owned_count < 4) {
+            owned_dims[owned_count] = d;
+            q_regs[owned_count] = (float)q[q_offset + d];
+            acc[owned_count] = 0.0f;
+            owned_count++;
+        }
+    }
+
+    for (uint ai = 0; ai < active_count; ++ai) {
+        const uint bidx = (uint)active_blocks[ai];
+        const uint block_start = bidx * safe_block_size;
+        if (block_start >= kv_limit) {
+            break;
+        }
+        const uint block_end = min(kv_limit, block_start + safe_block_size);
+        for (uint kv_idx = block_start; kv_idx < block_end; ++kv_idx) {
+            float dot_local = 0.0f;
+            const uint k_offset = kv_base + kv_idx * Dh;
+            for (uint i = 0; i < owned_count; ++i) {
+                const uint d = owned_dims[i];
+                dot_local += q_regs[i] * (float)k[k_offset + d];
+            }
+            const float dot = simd_sum(dot_local);
+            const float s = dot * inv_sqrt_dh;
+            const float m_new = max(m, s);
+            const float alpha = fast::exp(m - m_new);
+            const float beta = fast::exp(s - m_new);
+            const uint v_offset = kv_base + kv_idx * Dh;
+
+            for (uint i = 0; i < owned_count; ++i) {
+                const uint d2 = owned_dims[i];
+                acc[i] = acc[i] * alpha + beta * (float)v[v_offset + d2];
+            }
+            l_acc = l_acc * alpha + beta;
+            m = m_new;
+        }
+    }
+
+    if (!(l_acc > 0.0f)) {
+        for (uint i = 0; i < owned_count; ++i) {
+            out[out_offset + owned_dims[i]] = bfloat(0.0f);
+        }
+        return;
+    }
+
+    const float inv_l = 1.0f / l_acc;
+    for (uint i = 0; i < owned_count; ++i) {
+        out[out_offset + owned_dims[i]] = bfloat(acc[i] * inv_l);
+    }
+}
+#endif
+
+kernel void metal_flex_attn_forward_dh64_bs4_single(
+    device const half*         q,
+    device const half*         k,
+    device const half*         v,
+    device const int*          active_blocks,
+    device half*               out,
+    constant uint&             B,
+    constant uint&             Hq,
+    constant uint&             T,
+    constant uint&             L,
+    constant uint&             Dh,
+    constant uint&             block_size,
+    constant uint&             active_count,
+    constant uint&             causal,
+    constant uint&             Hkv,
+    constant uint&             fp16_accum,
+    uint                       tid        [[thread_position_in_grid]],
+    uint                       lane_id    [[thread_index_in_simdgroup]]
+) {
+    if (Dh != 64u || block_size != 4u) {
+        return;
+    }
+
+    const uint total_queries = B * Hq * T;
+    const uint qid = tid >> 5; // /32
+    if (qid >= total_queries) {
+        return;
+    }
+
+    const uint bh = qid / T;
+    const uint t = qid % T;
+    const uint b = bh / Hq;
+    const uint hq = bh % Hq;
+    const uint group_size = max((uint)1, Hq / max((uint)1, Hkv));
+    const uint hkv = min(hq / group_size, max((uint)0, Hkv - 1));
+
+    const uint q_offset = (((b * Hq + hq) * T + t) * 64u);
+    const uint kv_base = (((b * Hkv + hkv) * L) * 64u);
+    const uint out_offset = q_offset;
+    const float inv_sqrt_dh = 0.125f;
+    const uint d_pair = lane_id << 1; // contiguous pair in [0, 62]
+    const uint q_start = (L > T) ? (L - T) : 0u;
+    const uint kv_limit = (causal != 0) ? min((uint)L, q_start + t + 1u) : (uint)L;
+    if (kv_limit == 0u) {
+        out[out_offset + d_pair + 0u] = half(0.0h);
+        out[out_offset + d_pair + 1u] = half(0.0h);
+        return;
+    }
+    const float2 q2 = float2(
+        (float)q[q_offset + d_pair + 0u],
+        (float)q[q_offset + d_pair + 1u]
+    );
+
+    float m = -INFINITY;
+    float l_acc = 0.0f;
+    const bool use_fp16_accum = (fp16_accum != 0u);
+    float2 acc2 = float2(0.0f);
+    half2 acc2_h = half2((half)0.0h);
+
+    for (uint ai = 0; ai < active_count; ++ai) {
+        const uint block_start = ((uint)active_blocks[ai]) << 2;
+        if (block_start >= kv_limit) {
+            break;
+        }
+
+        const uint kv0 = block_start + 0u;
+        if (kv0 < kv_limit) {
+            const uint k0 = kv_base + kv0 * 64u;
+            const float2 k20 = float2(
+                (float)k[k0 + d_pair + 0u],
+                (float)k[k0 + d_pair + 1u]
+            );
+            const float dot0 = simd_sum(q2.x * k20.x + q2.y * k20.y);
+            const float s0 = dot0 * inv_sqrt_dh;
+            const float m0 = max(m, s0);
+            const float a0 = fast::exp(m - m0);
+            const float b0 = fast::exp(s0 - m0);
+            const uint v0 = kv_base + kv0 * 64u;
+            if (use_fp16_accum) {
+                const half2 v20_h = half2(v[v0 + d_pair + 0u], v[v0 + d_pair + 1u]);
+                acc2_h = acc2_h * half(a0) + v20_h * half(b0);
+            } else {
+                const float2 v20 = float2(
+                    (float)v[v0 + d_pair + 0u],
+                    (float)v[v0 + d_pair + 1u]
+                );
+                acc2 = acc2 * a0 + v20 * b0;
+            }
+            l_acc = l_acc * a0 + b0;
+            m = m0;
+        }
+
+        const uint kv1 = block_start + 1u;
+        if (kv1 < kv_limit) {
+            const uint k1 = kv_base + kv1 * 64u;
+            const float2 k21 = float2(
+                (float)k[k1 + d_pair + 0u],
+                (float)k[k1 + d_pair + 1u]
+            );
+            const float dot1 = simd_sum(q2.x * k21.x + q2.y * k21.y);
+            const float s1 = dot1 * inv_sqrt_dh;
+            const float m1 = max(m, s1);
+            const float a1 = fast::exp(m - m1);
+            const float b1 = fast::exp(s1 - m1);
+            const uint v1 = kv_base + kv1 * 64u;
+            if (use_fp16_accum) {
+                const half2 v21_h = half2(v[v1 + d_pair + 0u], v[v1 + d_pair + 1u]);
+                acc2_h = acc2_h * half(a1) + v21_h * half(b1);
+            } else {
+                const float2 v21 = float2(
+                    (float)v[v1 + d_pair + 0u],
+                    (float)v[v1 + d_pair + 1u]
+                );
+                acc2 = acc2 * a1 + v21 * b1;
+            }
+            l_acc = l_acc * a1 + b1;
+            m = m1;
+        }
+
+        const uint kv2 = block_start + 2u;
+        if (kv2 < kv_limit) {
+            const uint k2 = kv_base + kv2 * 64u;
+            const float2 k22 = float2(
+                (float)k[k2 + d_pair + 0u],
+                (float)k[k2 + d_pair + 1u]
+            );
+            const float dot2 = simd_sum(q2.x * k22.x + q2.y * k22.y);
+            const float s2 = dot2 * inv_sqrt_dh;
+            const float m2 = max(m, s2);
+            const float a2 = fast::exp(m - m2);
+            const float b2 = fast::exp(s2 - m2);
+            const uint v2 = kv_base + kv2 * 64u;
+            if (use_fp16_accum) {
+                const half2 v22_h = half2(v[v2 + d_pair + 0u], v[v2 + d_pair + 1u]);
+                acc2_h = acc2_h * half(a2) + v22_h * half(b2);
+            } else {
+                const float2 v22 = float2(
+                    (float)v[v2 + d_pair + 0u],
+                    (float)v[v2 + d_pair + 1u]
+                );
+                acc2 = acc2 * a2 + v22 * b2;
+            }
+            l_acc = l_acc * a2 + b2;
+            m = m2;
+        }
+
+        const uint kv3 = block_start + 3u;
+        if (kv3 < kv_limit) {
+            const uint k3 = kv_base + kv3 * 64u;
+            const float2 k23 = float2(
+                (float)k[k3 + d_pair + 0u],
+                (float)k[k3 + d_pair + 1u]
+            );
+            const float dot3 = simd_sum(q2.x * k23.x + q2.y * k23.y);
+            const float s3 = dot3 * inv_sqrt_dh;
+            const float m3 = max(m, s3);
+            const float a3 = fast::exp(m - m3);
+            const float b3 = fast::exp(s3 - m3);
+            const uint v3 = kv_base + kv3 * 64u;
+            if (use_fp16_accum) {
+                const half2 v23_h = half2(v[v3 + d_pair + 0u], v[v3 + d_pair + 1u]);
+                acc2_h = acc2_h * half(a3) + v23_h * half(b3);
+            } else {
+                const float2 v23 = float2(
+                    (float)v[v3 + d_pair + 0u],
+                    (float)v[v3 + d_pair + 1u]
+                );
+                acc2 = acc2 * a3 + v23 * b3;
+            }
+            l_acc = l_acc * a3 + b3;
+            m = m3;
+        }
+    }
+
+    if (!(l_acc > 0.0f)) {
+        out[out_offset + d_pair + 0u] = half(0.0h);
+        out[out_offset + d_pair + 1u] = half(0.0h);
+        return;
+    }
+    const float inv_l = 1.0f / l_acc;
+    const float2 acc_out = use_fp16_accum ? float2(acc2_h) : acc2;
+    out[out_offset + d_pair + 0u] = half(acc_out.x * inv_l);
+    out[out_offset + d_pair + 1u] = half(acc_out.y * inv_l);
+}
+
+kernel void metal_flex_attn_forward_dh64_bs4_gqa1_single(
+    device const half*         q,
+    device const half*         k,
+    device const half*         v,
+    device const int*          active_blocks,
+    device half*               out,
+    constant uint&             B,
+    constant uint&             Hq,
+    constant uint&             T,
+    constant uint&             L,
+    constant uint&             Dh,
+    constant uint&             block_size,
+    constant uint&             active_count,
+    constant uint&             causal,
+    constant uint&             Hkv,
+    constant uint&             fp16_accum,
+    uint                       tid        [[thread_position_in_grid]],
+    uint                       lane_id    [[thread_index_in_simdgroup]]
+) {
+    if (Dh != 64u || block_size != 4u || Hq != Hkv) {
+        return;
+    }
+
+    const uint total_queries = B * Hq * T;
+    const uint qid = tid >> 5; // /32
+    if (qid >= total_queries) {
+        return;
+    }
+
+    const uint bh = qid / T;
+    const uint t = qid % T;
+    const uint b = bh / Hq;
+    const uint hq = bh % Hq;
+    const uint hkv = hq;
+
+    const uint q_offset = (((b * Hq + hq) * T + t) * 64u);
+    const uint kv_base = (((b * Hkv + hkv) * L) * 64u);
+    const uint out_offset = q_offset;
+    const float inv_sqrt_dh = 0.125f;
+    const uint d_pair = lane_id << 1;
+    const uint q_start = (L > T) ? (L - T) : 0u;
+    const uint kv_limit = (causal != 0) ? min((uint)L, q_start + t + 1u) : (uint)L;
+    if (kv_limit == 0u) {
+        out[out_offset + d_pair + 0u] = half(0.0h);
+        out[out_offset + d_pair + 1u] = half(0.0h);
+        return;
+    }
+
+    const float2 q2 = float2(
+        (float)q[q_offset + d_pair + 0u],
+        (float)q[q_offset + d_pair + 1u]
+    );
+
+    float m = -INFINITY;
+    float l_acc = 0.0f;
+    const bool use_fp16_accum = (fp16_accum != 0u);
+    float2 acc2 = float2(0.0f);
+    half2 acc2_h = half2((half)0.0h);
+
+    for (uint ai = 0; ai < active_count; ++ai) {
+        const uint block_start = ((uint)active_blocks[ai]) << 2;
+        if (block_start >= kv_limit) {
+            break;
+        }
+
+        const uint kv0 = block_start + 0u;
+        if (kv0 < kv_limit) {
+            const uint k0 = kv_base + kv0 * 64u;
+            const float2 k20 = float2((float)k[k0 + d_pair + 0u], (float)k[k0 + d_pair + 1u]);
+            const float dot0 = simd_sum(q2.x * k20.x + q2.y * k20.y);
+            const float s0 = dot0 * inv_sqrt_dh;
+            const float m0 = max(m, s0);
+            const float a0 = fast::exp(m - m0);
+            const float b0 = fast::exp(s0 - m0);
+            const uint v0 = kv_base + kv0 * 64u;
+            if (use_fp16_accum) {
+                const half2 v20_h = half2(v[v0 + d_pair + 0u], v[v0 + d_pair + 1u]);
+                acc2_h = acc2_h * half(a0) + v20_h * half(b0);
+            } else {
+                const float2 v20 = float2((float)v[v0 + d_pair + 0u], (float)v[v0 + d_pair + 1u]);
+                acc2 = acc2 * a0 + v20 * b0;
+            }
+            l_acc = l_acc * a0 + b0;
+            m = m0;
+        }
+
+        const uint kv1 = block_start + 1u;
+        if (kv1 < kv_limit) {
+            const uint k1 = kv_base + kv1 * 64u;
+            const float2 k21 = float2((float)k[k1 + d_pair + 0u], (float)k[k1 + d_pair + 1u]);
+            const float dot1 = simd_sum(q2.x * k21.x + q2.y * k21.y);
+            const float s1 = dot1 * inv_sqrt_dh;
+            const float m1 = max(m, s1);
+            const float a1 = fast::exp(m - m1);
+            const float b1 = fast::exp(s1 - m1);
+            const uint v1 = kv_base + kv1 * 64u;
+            if (use_fp16_accum) {
+                const half2 v21_h = half2(v[v1 + d_pair + 0u], v[v1 + d_pair + 1u]);
+                acc2_h = acc2_h * half(a1) + v21_h * half(b1);
+            } else {
+                const float2 v21 = float2((float)v[v1 + d_pair + 0u], (float)v[v1 + d_pair + 1u]);
+                acc2 = acc2 * a1 + v21 * b1;
+            }
+            l_acc = l_acc * a1 + b1;
+            m = m1;
+        }
+
+        const uint kv2 = block_start + 2u;
+        if (kv2 < kv_limit) {
+            const uint k2 = kv_base + kv2 * 64u;
+            const float2 k22 = float2((float)k[k2 + d_pair + 0u], (float)k[k2 + d_pair + 1u]);
+            const float dot2 = simd_sum(q2.x * k22.x + q2.y * k22.y);
+            const float s2 = dot2 * inv_sqrt_dh;
+            const float m2 = max(m, s2);
+            const float a2 = fast::exp(m - m2);
+            const float b2 = fast::exp(s2 - m2);
+            const uint v2 = kv_base + kv2 * 64u;
+            if (use_fp16_accum) {
+                const half2 v22_h = half2(v[v2 + d_pair + 0u], v[v2 + d_pair + 1u]);
+                acc2_h = acc2_h * half(a2) + v22_h * half(b2);
+            } else {
+                const float2 v22 = float2((float)v[v2 + d_pair + 0u], (float)v[v2 + d_pair + 1u]);
+                acc2 = acc2 * a2 + v22 * b2;
+            }
+            l_acc = l_acc * a2 + b2;
+            m = m2;
+        }
+
+        const uint kv3 = block_start + 3u;
+        if (kv3 < kv_limit) {
+            const uint k3 = kv_base + kv3 * 64u;
+            const float2 k23 = float2((float)k[k3 + d_pair + 0u], (float)k[k3 + d_pair + 1u]);
+            const float dot3 = simd_sum(q2.x * k23.x + q2.y * k23.y);
+            const float s3 = dot3 * inv_sqrt_dh;
+            const float m3 = max(m, s3);
+            const float a3 = fast::exp(m - m3);
+            const float b3 = fast::exp(s3 - m3);
+            const uint v3 = kv_base + kv3 * 64u;
+            if (use_fp16_accum) {
+                const half2 v23_h = half2(v[v3 + d_pair + 0u], v[v3 + d_pair + 1u]);
+                acc2_h = acc2_h * half(a3) + v23_h * half(b3);
+            } else {
+                const float2 v23 = float2((float)v[v3 + d_pair + 0u], (float)v[v3 + d_pair + 1u]);
+                acc2 = acc2 * a3 + v23 * b3;
+            }
+            l_acc = l_acc * a3 + b3;
+            m = m3;
+        }
+    }
+
+    if (!(l_acc > 0.0f)) {
+        out[out_offset + d_pair + 0u] = half(0.0h);
+        out[out_offset + d_pair + 1u] = half(0.0h);
+        return;
+    }
+    const float inv_l = 1.0f / l_acc;
+    const float2 acc_out = use_fp16_accum ? float2(acc2_h) : acc2;
+    out[out_offset + d_pair + 0u] = half(acc_out.x * inv_l);
+    out[out_offset + d_pair + 1u] = half(acc_out.y * inv_l);
+}
+
+kernel void metal_flex_attn_forward_dh64_bs4_gqa1_block_written(
+    device const half*         q,
+    device const half*         k,
+    device const half*         v,
+    device const uchar*        block_written,
+    device half*               out,
+    constant uint&             B,
+    constant uint&             Hq,
+    constant uint&             T,
+    constant uint&             L,
+    constant uint&             Dh,
+    constant uint&             block_size,
+    constant uint&             kv_blocks,
+    constant uint&             causal,
+    constant uint&             Hkv,
+    constant uint&             fp16_accum,
+    uint                       tid        [[thread_position_in_grid]],
+    uint                       lane_id    [[thread_index_in_simdgroup]]
+) {
+    if (Dh != 64u || block_size != 4u || Hq != Hkv) {
+        return;
+    }
+
+    const uint total_queries = B * Hq * T;
+    const uint qid = tid >> 5;
+    if (qid >= total_queries) {
+        return;
+    }
+
+    const uint bh = qid / T;
+    const uint t = qid % T;
+    const uint b = bh / Hq;
+    const uint hq = bh % Hq;
+    const uint hkv = hq;
+
+    const uint q_offset = (((b * Hq + hq) * T + t) * 64u);
+    const uint kv_base = (((b * Hkv + hkv) * L) * 64u);
+    const uint out_offset = q_offset;
+    const float inv_sqrt_dh = 0.125f;
+    const uint d_pair = lane_id << 1;
+    const uint q_start = (L > T) ? (L - T) : 0u;
+    const uint kv_limit = (causal != 0) ? min((uint)L, q_start + t + 1u) : (uint)L;
+    if (kv_limit == 0u) {
+        out[out_offset + d_pair + 0u] = half(0.0h);
+        out[out_offset + d_pair + 1u] = half(0.0h);
+        return;
+    }
+
+    const float2 q2 = float2(
+        (float)q[q_offset + d_pair + 0u],
+        (float)q[q_offset + d_pair + 1u]
+    );
+
+    float m = -INFINITY;
+    float l_acc = 0.0f;
+    const bool use_fp16_accum = (fp16_accum != 0u);
+    float2 acc2 = float2(0.0f);
+    half2 acc2_h = half2((half)0.0h);
+
+    for (uint bidx = 0; bidx < kv_blocks; ++bidx) {
+        if (block_written[bidx] == 0) {
+            continue;
+        }
+        const uint block_start = bidx << 2;
+        if (block_start >= kv_limit) {
+            break;
+        }
+
+        const uint kv0 = block_start + 0u;
+        if (kv0 < kv_limit) {
+            const uint k0 = kv_base + kv0 * 64u;
+            const float2 k20 = float2((float)k[k0 + d_pair + 0u], (float)k[k0 + d_pair + 1u]);
+            const float dot0 = simd_sum(q2.x * k20.x + q2.y * k20.y);
+            const float s0 = dot0 * inv_sqrt_dh;
+            const float m0 = max(m, s0);
+            const float a0 = fast::exp(m - m0);
+            const float b0 = fast::exp(s0 - m0);
+            const uint v0 = kv_base + kv0 * 64u;
+            if (use_fp16_accum) {
+                const half2 v20_h = half2(v[v0 + d_pair + 0u], v[v0 + d_pair + 1u]);
+                acc2_h = acc2_h * half(a0) + v20_h * half(b0);
+            } else {
+                const float2 v20 = float2((float)v[v0 + d_pair + 0u], (float)v[v0 + d_pair + 1u]);
+                acc2 = acc2 * a0 + v20 * b0;
+            }
+            l_acc = l_acc * a0 + b0;
+            m = m0;
+        }
+
+        const uint kv1 = block_start + 1u;
+        if (kv1 < kv_limit) {
+            const uint k1 = kv_base + kv1 * 64u;
+            const float2 k21 = float2((float)k[k1 + d_pair + 0u], (float)k[k1 + d_pair + 1u]);
+            const float dot1 = simd_sum(q2.x * k21.x + q2.y * k21.y);
+            const float s1 = dot1 * inv_sqrt_dh;
+            const float m1 = max(m, s1);
+            const float a1 = fast::exp(m - m1);
+            const float b1 = fast::exp(s1 - m1);
+            const uint v1 = kv_base + kv1 * 64u;
+            if (use_fp16_accum) {
+                const half2 v21_h = half2(v[v1 + d_pair + 0u], v[v1 + d_pair + 1u]);
+                acc2_h = acc2_h * half(a1) + v21_h * half(b1);
+            } else {
+                const float2 v21 = float2((float)v[v1 + d_pair + 0u], (float)v[v1 + d_pair + 1u]);
+                acc2 = acc2 * a1 + v21 * b1;
+            }
+            l_acc = l_acc * a1 + b1;
+            m = m1;
+        }
+
+        const uint kv2 = block_start + 2u;
+        if (kv2 < kv_limit) {
+            const uint k2 = kv_base + kv2 * 64u;
+            const float2 k22 = float2((float)k[k2 + d_pair + 0u], (float)k[k2 + d_pair + 1u]);
+            const float dot2 = simd_sum(q2.x * k22.x + q2.y * k22.y);
+            const float s2 = dot2 * inv_sqrt_dh;
+            const float m2 = max(m, s2);
+            const float a2 = fast::exp(m - m2);
+            const float b2 = fast::exp(s2 - m2);
+            const uint v2 = kv_base + kv2 * 64u;
+            if (use_fp16_accum) {
+                const half2 v22_h = half2(v[v2 + d_pair + 0u], v[v2 + d_pair + 1u]);
+                acc2_h = acc2_h * half(a2) + v22_h * half(b2);
+            } else {
+                const float2 v22 = float2((float)v[v2 + d_pair + 0u], (float)v[v2 + d_pair + 1u]);
+                acc2 = acc2 * a2 + v22 * b2;
+            }
+            l_acc = l_acc * a2 + b2;
+            m = m2;
+        }
+
+        const uint kv3 = block_start + 3u;
+        if (kv3 < kv_limit) {
+            const uint k3 = kv_base + kv3 * 64u;
+            const float2 k23 = float2((float)k[k3 + d_pair + 0u], (float)k[k3 + d_pair + 1u]);
+            const float dot3 = simd_sum(q2.x * k23.x + q2.y * k23.y);
+            const float s3 = dot3 * inv_sqrt_dh;
+            const float m3 = max(m, s3);
+            const float a3 = fast::exp(m - m3);
+            const float b3 = fast::exp(s3 - m3);
+            const uint v3 = kv_base + kv3 * 64u;
+            if (use_fp16_accum) {
+                const half2 v23_h = half2(v[v3 + d_pair + 0u], v[v3 + d_pair + 1u]);
+                acc2_h = acc2_h * half(a3) + v23_h * half(b3);
+            } else {
+                const float2 v23 = float2((float)v[v3 + d_pair + 0u], (float)v[v3 + d_pair + 1u]);
+                acc2 = acc2 * a3 + v23 * b3;
+            }
+            l_acc = l_acc * a3 + b3;
+            m = m3;
+        }
+    }
+
+    if (!(l_acc > 0.0f)) {
+        out[out_offset + d_pair + 0u] = half(0.0h);
+        out[out_offset + d_pair + 1u] = half(0.0h);
+        return;
+    }
+    const float inv_l = 1.0f / l_acc;
+    const float2 acc_out = use_fp16_accum ? float2(acc2_h) : acc2;
+    out[out_offset + d_pair + 0u] = half(acc_out.x * inv_l);
+    out[out_offset + d_pair + 1u] = half(acc_out.y * inv_l);
+}
+
+kernel void metal_flex_attn_forward_dh64_bs4_gqa2_single(
+    device const half*         q,
+    device const half*         k,
+    device const half*         v,
+    device const int*          active_blocks,
+    device half*               out,
+    constant uint&             B,
+    constant uint&             Hq,
+    constant uint&             T,
+    constant uint&             L,
+    constant uint&             Dh,
+    constant uint&             block_size,
+    constant uint&             active_count,
+    constant uint&             causal,
+    constant uint&             Hkv,
+    constant uint&             fp16_accum,
+    uint                       tid        [[thread_position_in_grid]],
+    uint                       lane_id    [[thread_index_in_simdgroup]]
+) {
+    // Specialization for the common GQA=2 case (Hq = 2 * Hkv).
+    if (Dh != 64u || block_size != 4u || Hq != (Hkv << 1)) {
+        return;
+    }
+
+    const uint total_queries = B * Hq * T;
+    const uint qid = tid >> 5; // /32
+    if (qid >= total_queries) {
+        return;
+    }
+
+    const uint bh = qid / T;
+    const uint t = qid % T;
+    const uint b = bh / Hq;
+    const uint hq = bh % Hq;
+    const uint hkv = hq >> 1; // exact for GQA=2
+
+    const uint q_offset = (((b * Hq + hq) * T + t) * 64u);
+    const uint kv_base = (((b * Hkv + hkv) * L) * 64u);
+    const uint out_offset = q_offset;
+    const float inv_sqrt_dh = 0.125f;
+    const uint d_pair = lane_id << 1;
+    const uint q_start = (L > T) ? (L - T) : 0u;
+    const uint kv_limit = (causal != 0) ? min((uint)L, q_start + t + 1u) : (uint)L;
+    if (kv_limit == 0u) {
+        out[out_offset + d_pair + 0u] = half(0.0h);
+        out[out_offset + d_pair + 1u] = half(0.0h);
+        return;
+    }
+
+    const float2 q2 = float2(
+        (float)q[q_offset + d_pair + 0u],
+        (float)q[q_offset + d_pair + 1u]
+    );
+
+    float m = -INFINITY;
+    float l_acc = 0.0f;
+    const bool use_fp16_accum = (fp16_accum != 0u);
+    float2 acc2 = float2(0.0f);
+    half2 acc2_h = half2((half)0.0h);
+
+    for (uint ai = 0; ai < active_count; ++ai) {
+        const uint block_start = ((uint)active_blocks[ai]) << 2;
+        if (block_start >= kv_limit) {
+            break;
+        }
+
+        const uint kv0 = block_start + 0u;
+        if (kv0 < kv_limit) {
+            const uint k0 = kv_base + kv0 * 64u;
+            const float2 k20 = float2((float)k[k0 + d_pair + 0u], (float)k[k0 + d_pair + 1u]);
+            const float dot0 = simd_sum(q2.x * k20.x + q2.y * k20.y);
+            const float s0 = dot0 * inv_sqrt_dh;
+            const float m0 = max(m, s0);
+            const float a0 = fast::exp(m - m0);
+            const float b0 = fast::exp(s0 - m0);
+            const uint v0 = kv_base + kv0 * 64u;
+            if (use_fp16_accum) {
+                const half2 v20_h = half2(v[v0 + d_pair + 0u], v[v0 + d_pair + 1u]);
+                acc2_h = acc2_h * half(a0) + v20_h * half(b0);
+            } else {
+                const float2 v20 = float2((float)v[v0 + d_pair + 0u], (float)v[v0 + d_pair + 1u]);
+                acc2 = acc2 * a0 + v20 * b0;
+            }
+            l_acc = l_acc * a0 + b0;
+            m = m0;
+        }
+
+        const uint kv1 = block_start + 1u;
+        if (kv1 < kv_limit) {
+            const uint k1 = kv_base + kv1 * 64u;
+            const float2 k21 = float2((float)k[k1 + d_pair + 0u], (float)k[k1 + d_pair + 1u]);
+            const float dot1 = simd_sum(q2.x * k21.x + q2.y * k21.y);
+            const float s1 = dot1 * inv_sqrt_dh;
+            const float m1 = max(m, s1);
+            const float a1 = fast::exp(m - m1);
+            const float b1 = fast::exp(s1 - m1);
+            const uint v1 = kv_base + kv1 * 64u;
+            if (use_fp16_accum) {
+                const half2 v21_h = half2(v[v1 + d_pair + 0u], v[v1 + d_pair + 1u]);
+                acc2_h = acc2_h * half(a1) + v21_h * half(b1);
+            } else {
+                const float2 v21 = float2((float)v[v1 + d_pair + 0u], (float)v[v1 + d_pair + 1u]);
+                acc2 = acc2 * a1 + v21 * b1;
+            }
+            l_acc = l_acc * a1 + b1;
+            m = m1;
+        }
+
+        const uint kv2 = block_start + 2u;
+        if (kv2 < kv_limit) {
+            const uint k2 = kv_base + kv2 * 64u;
+            const float2 k22 = float2((float)k[k2 + d_pair + 0u], (float)k[k2 + d_pair + 1u]);
+            const float dot2 = simd_sum(q2.x * k22.x + q2.y * k22.y);
+            const float s2 = dot2 * inv_sqrt_dh;
+            const float m2 = max(m, s2);
+            const float a2 = fast::exp(m - m2);
+            const float b2 = fast::exp(s2 - m2);
+            const uint v2 = kv_base + kv2 * 64u;
+            if (use_fp16_accum) {
+                const half2 v22_h = half2(v[v2 + d_pair + 0u], v[v2 + d_pair + 1u]);
+                acc2_h = acc2_h * half(a2) + v22_h * half(b2);
+            } else {
+                const float2 v22 = float2((float)v[v2 + d_pair + 0u], (float)v[v2 + d_pair + 1u]);
+                acc2 = acc2 * a2 + v22 * b2;
+            }
+            l_acc = l_acc * a2 + b2;
+            m = m2;
+        }
+
+        const uint kv3 = block_start + 3u;
+        if (kv3 < kv_limit) {
+            const uint k3 = kv_base + kv3 * 64u;
+            const float2 k23 = float2((float)k[k3 + d_pair + 0u], (float)k[k3 + d_pair + 1u]);
+            const float dot3 = simd_sum(q2.x * k23.x + q2.y * k23.y);
+            const float s3 = dot3 * inv_sqrt_dh;
+            const float m3 = max(m, s3);
+            const float a3 = fast::exp(m - m3);
+            const float b3 = fast::exp(s3 - m3);
+            const uint v3 = kv_base + kv3 * 64u;
+            if (use_fp16_accum) {
+                const half2 v23_h = half2(v[v3 + d_pair + 0u], v[v3 + d_pair + 1u]);
+                acc2_h = acc2_h * half(a3) + v23_h * half(b3);
+            } else {
+                const float2 v23 = float2((float)v[v3 + d_pair + 0u], (float)v[v3 + d_pair + 1u]);
+                acc2 = acc2 * a3 + v23 * b3;
+            }
+            l_acc = l_acc * a3 + b3;
+            m = m3;
+        }
+    }
+
+    if (!(l_acc > 0.0f)) {
+        out[out_offset + d_pair + 0u] = half(0.0h);
+        out[out_offset + d_pair + 1u] = half(0.0h);
+        return;
+    }
+    const float inv_l = 1.0f / l_acc;
+    const float2 acc_out = use_fp16_accum ? float2(acc2_h) : acc2;
+    out[out_offset + d_pair + 0u] = half(acc_out.x * inv_l);
+    out[out_offset + d_pair + 1u] = half(acc_out.y * inv_l);
+}
+
+kernel void metal_flex_attn_forward_dh64_bs4_gqa2_dualhead(
+    device const half*         q,
+    device const half*         k,
+    device const half*         v,
+    device const int*          active_blocks,
+    device half*               out,
+    constant uint&             B,
+    constant uint&             Hq,
+    constant uint&             T,
+    constant uint&             L,
+    constant uint&             Dh,
+    constant uint&             block_size,
+    constant uint&             active_count,
+    constant uint&             causal,
+    constant uint&             Hkv,
+    constant uint&             fp16_accum,
+    uint                       tid        [[thread_position_in_grid]],
+    uint                       lane_id    [[thread_index_in_simdgroup]]
+) {
+    // One simdgroup handles a (b, hkv, t) triplet and computes both query heads
+    // (2*hkv and 2*hkv+1), reusing each K/V load once.
+    if (Dh != 64u || block_size != 4u || Hq != (Hkv << 1)) {
+        return;
+    }
+
+    const uint total_pairs = B * Hkv * T;
+    const uint pid = tid >> 5; // /32
+    if (pid >= total_pairs) {
+        return;
+    }
+
+    const uint bh = pid / T;
+    const uint t = pid % T;
+    const uint b = bh / Hkv;
+    const uint hkv = bh % Hkv;
+    const uint hq0 = hkv << 1;
+    const uint hq1 = hq0 + 1u;
+
+    const uint q_offset0 = (((b * Hq + hq0) * T + t) * 64u);
+    const uint q_offset1 = (((b * Hq + hq1) * T + t) * 64u);
+    const uint out_offset0 = q_offset0;
+    const uint out_offset1 = q_offset1;
+    const uint kv_base = (((b * Hkv + hkv) * L) * 64u);
+    const float inv_sqrt_dh = 0.125f;
+    const uint d_pair = lane_id << 1;
+    const uint q_start = (L > T) ? (L - T) : 0u;
+    const uint kv_limit = (causal != 0) ? min((uint)L, q_start + t + 1u) : (uint)L;
+    if (kv_limit == 0u || active_count == 0u) {
+        out[out_offset0 + d_pair + 0u] = half(0.0h);
+        out[out_offset0 + d_pair + 1u] = half(0.0h);
+        out[out_offset1 + d_pair + 0u] = half(0.0h);
+        out[out_offset1 + d_pair + 1u] = half(0.0h);
+        return;
+    }
+
+    const float2 q20 = float2(
+        (float)q[q_offset0 + d_pair + 0u],
+        (float)q[q_offset0 + d_pair + 1u]
+    );
+    const float2 q21 = float2(
+        (float)q[q_offset1 + d_pair + 0u],
+        (float)q[q_offset1 + d_pair + 1u]
+    );
+
+    const bool use_fp16_accum = (fp16_accum != 0u);
+    float m0 = -INFINITY, m1 = -INFINITY;
+    float l0 = 0.0f, l1 = 0.0f;
+    float2 acc0 = float2(0.0f), acc1 = float2(0.0f);
+    half2 acc0_h = half2((half)0.0h), acc1_h = half2((half)0.0h);
+
+    for (uint ai = 0; ai < active_count; ++ai) {
+        const uint block_start = ((uint)active_blocks[ai]) << 2;
+        if (block_start >= kv_limit) {
+            break;
+        }
+        const uint block_end = min(kv_limit, block_start + 4u);
+        for (uint kv_idx = block_start; kv_idx < block_end; ++kv_idx) {
+            const uint k_off = kv_base + kv_idx * 64u;
+            const float2 k2 = float2((float)k[k_off + d_pair + 0u], (float)k[k_off + d_pair + 1u]);
+            const float dot0 = simd_sum(q20.x * k2.x + q20.y * k2.y);
+            const float dot1 = simd_sum(q21.x * k2.x + q21.y * k2.y);
+            const float s0 = dot0 * inv_sqrt_dh;
+            const float s1 = dot1 * inv_sqrt_dh;
+
+            const float m0_new = max(m0, s0);
+            const float a0 = fast::exp(m0 - m0_new);
+            const float b0 = fast::exp(s0 - m0_new);
+            const float m1_new = max(m1, s1);
+            const float a1 = fast::exp(m1 - m1_new);
+            const float b1 = fast::exp(s1 - m1_new);
+
+            const uint v_off = kv_base + kv_idx * 64u;
+            if (use_fp16_accum) {
+                const half2 v2_h = half2(v[v_off + d_pair + 0u], v[v_off + d_pair + 1u]);
+                acc0_h = acc0_h * half(a0) + v2_h * half(b0);
+                acc1_h = acc1_h * half(a1) + v2_h * half(b1);
+            } else {
+                const float2 v2 = float2((float)v[v_off + d_pair + 0u], (float)v[v_off + d_pair + 1u]);
+                acc0 = acc0 * a0 + v2 * b0;
+                acc1 = acc1 * a1 + v2 * b1;
+            }
+            l0 = l0 * a0 + b0;
+            l1 = l1 * a1 + b1;
+            m0 = m0_new;
+            m1 = m1_new;
+        }
+    }
+
+    if (!(l0 > 0.0f)) {
+        out[out_offset0 + d_pair + 0u] = half(0.0h);
+        out[out_offset0 + d_pair + 1u] = half(0.0h);
+    } else {
+        const float inv_l0 = 1.0f / l0;
+        const float2 out0 = use_fp16_accum ? float2(acc0_h) : acc0;
+        out[out_offset0 + d_pair + 0u] = half(out0.x * inv_l0);
+        out[out_offset0 + d_pair + 1u] = half(out0.y * inv_l0);
+    }
+
+    if (!(l1 > 0.0f)) {
+        out[out_offset1 + d_pair + 0u] = half(0.0h);
+        out[out_offset1 + d_pair + 1u] = half(0.0h);
+    } else {
+        const float inv_l1 = 1.0f / l1;
+        const float2 out1 = use_fp16_accum ? float2(acc1_h) : acc1;
+        out[out_offset1 + d_pair + 0u] = half(out1.x * inv_l1);
+        out[out_offset1 + d_pair + 1u] = half(out1.y * inv_l1);
+    }
+}
+
+kernel void metal_flex_attn_forward_dh64_bs4_gqa2_dense(
+    device const half*         q,
+    device const half*         k,
+    device const half*         v,
+    device const int*          active_blocks,
+    device half*               out,
+    constant uint&             B,
+    constant uint&             Hq,
+    constant uint&             T,
+    constant uint&             L,
+    constant uint&             Dh,
+    constant uint&             block_size,
+    constant uint&             active_count,
+    constant uint&             causal,
+    constant uint&             Hkv,
+    constant uint&             fp16_accum,
+    uint                       tid        [[thread_position_in_grid]],
+    uint                       lane_id    [[thread_index_in_simdgroup]]
+) {
+    (void)active_blocks;
+    if (Dh != 64u || block_size != 4u || Hq != (Hkv << 1)) {
+        return;
+    }
+
+    const uint total_queries = B * Hq * T;
+    const uint qid = tid >> 5;
+    if (qid >= total_queries) {
+        return;
+    }
+
+    const uint bh = qid / T;
+    const uint t = qid % T;
+    const uint b = bh / Hq;
+    const uint hq = bh % Hq;
+    const uint hkv = hq >> 1;
+
+    const uint q_offset = (((b * Hq + hq) * T + t) * 64u);
+    const uint kv_base = (((b * Hkv + hkv) * L) * 64u);
+    const uint out_offset = q_offset;
+    const float inv_sqrt_dh = 0.125f;
+    const uint d_pair = lane_id << 1;
+    const uint q_start = (L > T) ? (L - T) : 0u;
+    const uint kv_limit = (causal != 0) ? min((uint)L, q_start + t + 1u) : (uint)L;
+    if (kv_limit == 0u || active_count == 0u) {
+        out[out_offset + d_pair + 0u] = half(0.0h);
+        out[out_offset + d_pair + 1u] = half(0.0h);
+        return;
+    }
+
+    const float2 q2 = float2(
+        (float)q[q_offset + d_pair + 0u],
+        (float)q[q_offset + d_pair + 1u]
+    );
+
+    float m = -INFINITY;
+    float l_acc = 0.0f;
+    float2 acc2 = float2(0.0f);
+
+    for (uint kv_idx = 0u; kv_idx < kv_limit; ++kv_idx) {
+        const uint k_off = kv_base + kv_idx * 64u;
+        const float2 k2 = float2((float)k[k_off + d_pair + 0u], (float)k[k_off + d_pair + 1u]);
+        const float dot = simd_sum(q2.x * k2.x + q2.y * k2.y);
+        const float s = dot * inv_sqrt_dh;
+        const float m_new = max(m, s);
+        const float a = fast::exp(m - m_new);
+        const float bcoef = fast::exp(s - m_new);
+        const uint v_off = kv_base + kv_idx * 64u;
+        const float2 v2 = float2((float)v[v_off + d_pair + 0u], (float)v[v_off + d_pair + 1u]);
+        acc2 = acc2 * a + v2 * bcoef;
+        l_acc = l_acc * a + bcoef;
+        m = m_new;
+    }
+
+    if (!(l_acc > 0.0f)) {
+        out[out_offset + d_pair + 0u] = half(0.0h);
+        out[out_offset + d_pair + 1u] = half(0.0h);
+        return;
+    }
+    const float inv_l = 1.0f / l_acc;
+    out[out_offset + d_pair + 0u] = half(acc2.x * inv_l);
+    out[out_offset + d_pair + 1u] = half(acc2.y * inv_l);
+}
+
+
diff --git a/src/metal/metal_flex_attn_op.mm b/src/metal/metal_flex_attn_op.mm
new file mode 100644
index 0000000..f81fe83
--- /dev/null
+++ b/src/metal/metal_flex_attn_op.mm
@@ -0,0 +1,1001 @@
+#import <Metal/Metal.h>
+#import <Foundation/Foundation.h>
+
+#include <ATen/ATen.h>
+#include <ATen/mps/MPSStream.h>
+#include <torch/library.h>
+#include <limits>
+#include <cmath>
+#include <string>
+#include <cstdlib>
+#include <vector>
+#include <algorithm>
+
+namespace {
+
+struct MetalRuntime {
+    id<MTLComputePipelineState> pipeline_generic = nil;
+    id<MTLComputePipelineState> pipeline_generic_block_written = nil;
+    id<MTLComputePipelineState> pipeline_generic_bf16 = nil;
+    id<MTLComputePipelineState> pipeline_dh64_bs4_single = nil;
+    id<MTLComputePipelineState> pipeline_dh64_bs4_gqa1_single = nil;
+    id<MTLComputePipelineState> pipeline_dh64_bs4_gqa1_block_written = nil;
+    id<MTLComputePipelineState> pipeline_dh64_bs4_gqa2_single = nil;
+    id<MTLComputePipelineState> pipeline_dh64_bs4_gqa2_dense = nil;
+    id<MTLComputePipelineState> pipeline_dh64_bs4_gqa2_dualhead = nil;
+    uint32_t thread_execution_width_generic = 32;
+    uint32_t thread_execution_width_generic_block_written = 32;
+    uint32_t thread_execution_width_generic_bf16 = 32;
+    uint32_t thread_execution_width_dh64_bs4_single = 32;
+    uint32_t thread_execution_width_dh64_bs4_gqa1_single = 32;
+    uint32_t thread_execution_width_dh64_bs4_gqa1_block_written = 32;
+    uint32_t thread_execution_width_dh64_bs4_gqa2_single = 32;
+    uint32_t thread_execution_width_dh64_bs4_gqa2_dense = 32;
+    uint32_t thread_execution_width_dh64_bs4_gqa2_dualhead = 32;
+    bool init_ok = false;
+};
+
+static inline id<MTLBuffer> tensor_mtl_buffer(const at::Tensor& t) {
+    return (id<MTLBuffer>)t.storage().data();
+}
+
+MetalRuntime& get_metal_runtime() {
+    static MetalRuntime rt;
+    static dispatch_once_t onceToken;
+    dispatch_once(&onceToken, ^{
+        @autoreleasepool {
+            NSString* this_file = [NSString stringWithUTF8String:__FILE__];
+            NSString* src_dir = [this_file stringByDeletingLastPathComponent];
+            NSString* kernel_path = [src_dir stringByAppendingPathComponent:@"metal_flex_attn.metal"];
+
+            NSError* read_error = nil;
+            NSString* source = [NSString stringWithContentsOfFile:kernel_path
+                                                         encoding:NSUTF8StringEncoding
+                                                            error:&read_error];
+            if (!source) {
+                return;
+            }
+
+            id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+            if (!device) {
+                return;
+            }
+
+            NSError* compile_error = nil;
+            MTLCompileOptions* opts = [[MTLCompileOptions alloc] init];
+            opts.fastMathEnabled = YES;
+            id<MTLLibrary> lib = [device newLibraryWithSource:source options:opts error:&compile_error];
+            if (!lib) {
+                return;
+            }
+
+            id<MTLFunction> fn_generic = [lib newFunctionWithName:@"metal_flex_attn_forward"];
+            id<MTLFunction> fn_generic_block_written = [lib newFunctionWithName:@"metal_flex_attn_forward_from_block_written"];
+            id<MTLFunction> fn_generic_bf16 = [lib newFunctionWithName:@"metal_flex_attn_forward_bf16"];
+            id<MTLFunction> fn_dh64_bs4_single = [lib newFunctionWithName:@"metal_flex_attn_forward_dh64_bs4_single"];
+            id<MTLFunction> fn_dh64_bs4_gqa1_single = [lib newFunctionWithName:@"metal_flex_attn_forward_dh64_bs4_gqa1_single"];
+            id<MTLFunction> fn_dh64_bs4_gqa1_block_written = [lib newFunctionWithName:@"metal_flex_attn_forward_dh64_bs4_gqa1_block_written"];
+            id<MTLFunction> fn_dh64_bs4_gqa2_single = [lib newFunctionWithName:@"metal_flex_attn_forward_dh64_bs4_gqa2_single"];
+            id<MTLFunction> fn_dh64_bs4_gqa2_dense = [lib newFunctionWithName:@"metal_flex_attn_forward_dh64_bs4_gqa2_dense"];
+            id<MTLFunction> fn_dh64_bs4_gqa2_dualhead = [lib newFunctionWithName:@"metal_flex_attn_forward_dh64_bs4_gqa2_dualhead"];
+            if (!fn_generic || !fn_dh64_bs4_single || !fn_dh64_bs4_gqa1_single || !fn_dh64_bs4_gqa1_block_written || !fn_dh64_bs4_gqa2_single || !fn_dh64_bs4_gqa2_dense || !fn_dh64_bs4_gqa2_dualhead) {
+                return;
+            }
+
+            NSError* pipe_error = nil;
+            rt.pipeline_generic = [device newComputePipelineStateWithFunction:fn_generic error:&pipe_error];
+            if (!rt.pipeline_generic) {
+                return;
+            }
+            rt.pipeline_generic_block_written = [device newComputePipelineStateWithFunction:fn_generic_block_written error:&pipe_error];
+            if (!rt.pipeline_generic_block_written) {
+                return;
+            }
+            if (fn_generic_bf16) {
+                rt.pipeline_generic_bf16 = [device newComputePipelineStateWithFunction:fn_generic_bf16 error:&pipe_error];
+                if (rt.pipeline_generic_bf16) {
+                    rt.thread_execution_width_generic_bf16 = static_cast<uint32_t>(rt.pipeline_generic_bf16.threadExecutionWidth);
+                }
+            }
+            rt.pipeline_dh64_bs4_single = [device newComputePipelineStateWithFunction:fn_dh64_bs4_single error:&pipe_error];
+            if (!rt.pipeline_dh64_bs4_single) {
+                return;
+            }
+            rt.pipeline_dh64_bs4_gqa1_single = [device newComputePipelineStateWithFunction:fn_dh64_bs4_gqa1_single error:&pipe_error];
+            if (!rt.pipeline_dh64_bs4_gqa1_single) {
+                return;
+            }
+            rt.pipeline_dh64_bs4_gqa1_block_written = [device newComputePipelineStateWithFunction:fn_dh64_bs4_gqa1_block_written error:&pipe_error];
+            if (!rt.pipeline_dh64_bs4_gqa1_block_written) {
+                return;
+            }
+            rt.pipeline_dh64_bs4_gqa2_single = [device newComputePipelineStateWithFunction:fn_dh64_bs4_gqa2_single error:&pipe_error];
+            if (!rt.pipeline_dh64_bs4_gqa2_single) {
+                return;
+            }
+            rt.pipeline_dh64_bs4_gqa2_dense = [device newComputePipelineStateWithFunction:fn_dh64_bs4_gqa2_dense error:&pipe_error];
+            if (!rt.pipeline_dh64_bs4_gqa2_dense) {
+                return;
+            }
+            rt.pipeline_dh64_bs4_gqa2_dualhead = [device newComputePipelineStateWithFunction:fn_dh64_bs4_gqa2_dualhead error:&pipe_error];
+            if (!rt.pipeline_dh64_bs4_gqa2_dualhead) {
+                return;
+            }
+            rt.thread_execution_width_generic = static_cast<uint32_t>(rt.pipeline_generic.threadExecutionWidth);
+            rt.thread_execution_width_generic_block_written = static_cast<uint32_t>(rt.pipeline_generic_block_written.threadExecutionWidth);
+            rt.thread_execution_width_dh64_bs4_single = static_cast<uint32_t>(rt.pipeline_dh64_bs4_single.threadExecutionWidth);
+            rt.thread_execution_width_dh64_bs4_gqa1_single = static_cast<uint32_t>(rt.pipeline_dh64_bs4_gqa1_single.threadExecutionWidth);
+            rt.thread_execution_width_dh64_bs4_gqa1_block_written = static_cast<uint32_t>(rt.pipeline_dh64_bs4_gqa1_block_written.threadExecutionWidth);
+            rt.thread_execution_width_dh64_bs4_gqa2_single = static_cast<uint32_t>(rt.pipeline_dh64_bs4_gqa2_single.threadExecutionWidth);
+            rt.thread_execution_width_dh64_bs4_gqa2_dense = static_cast<uint32_t>(rt.pipeline_dh64_bs4_gqa2_dense.threadExecutionWidth);
+            rt.thread_execution_width_dh64_bs4_gqa2_dualhead = static_cast<uint32_t>(rt.pipeline_dh64_bs4_gqa2_dualhead.threadExecutionWidth);
+            rt.init_ok = true;
+        }
+    });
+    return rt;
+}
+
+static int64_t get_block_size() {
+    const char* env = std::getenv("WORLD_METAL_BLOCK_SIZE");
+    if (!env) {
+        return 4;
+    }
+    const long parsed = std::strtol(env, nullptr, 10);
+    return parsed > 0 ? static_cast<int64_t>(parsed) : 4;
+}
+
+static bool fast_no_fallback() {
+    const char* env = std::getenv("WORLD_METAL_FAST_NO_FALLBACK");
+    if (!env) {
+        return false;
+    }
+    return std::string(env) == "1";
+}
+
+static uint32_t get_tg_size() {
+    const char* env = std::getenv("WORLD_METAL_TG_SIZE");
+    if (!env) {
+        return 256;
+    }
+    const long parsed = std::strtol(env, nullptr, 10);
+    return parsed > 0 ? static_cast<uint32_t>(parsed) : 256;
+}
+
+static bool enable_gqa2_dualhead_specialization() {
+    const char* env = std::getenv("WORLD_METAL_ENABLE_GQA2_DUALHEAD");
+    if (!env) {
+        return true;
+    }
+    const std::string s(env);
+    if (s == "1") {
+        return true;
+    }
+    if (s == "0") {
+        return false;
+    }
+    return true;
+}
+
+static bool enable_fp16_accum() {
+    const char* env = std::getenv("WORLD_METAL_FP16_ACCUM");
+    if (!env) {
+        return true;
+    }
+    return std::string(env) == "1";
+}
+
+static bool prefer_active_dispatch_path() {
+    const char* env = std::getenv("WORLD_METAL_PREFER_ACTIVE_DISPATCH");
+    if (!env) {
+        return true;
+    }
+    return std::string(env) == "1";
+}
+
+static void dispatch_fast_kernel(
+    id<MTLComputePipelineState> pipeline,
+    uint32_t thread_execution_width,
+    const at::Tensor& q,
+    const at::Tensor& k,
+    const at::Tensor& v,
+    const at::Tensor& active_blocks,
+    at::Tensor& out,
+    uint32_t B,
+    uint32_t Hq,
+    uint32_t T,
+    uint32_t L,
+    uint32_t Dh,
+    uint32_t BlockSize,
+    uint32_t ActiveCount,
+    uint32_t Causal,
+    uint32_t Hkv,
+    uint32_t FP16Accum,
+    const char* err_prefix,
+    uint32_t tg_size_hint
+) {
+    auto* stream = at::mps::getCurrentMPSStream();
+    TORCH_CHECK(stream != nullptr, err_prefix, ": no active MPS stream");
+
+    id<MTLCommandBuffer> cb = (id<MTLCommandBuffer>)stream->commandBuffer();
+    TORCH_CHECK(cb != nil, err_prefix, ": failed to acquire command buffer");
+    id<MTLComputeCommandEncoder> enc = [cb computeCommandEncoder];
+    TORCH_CHECK(enc != nil, err_prefix, ": failed to create command encoder");
+    [enc setComputePipelineState:pipeline];
+
+    [enc setBuffer:tensor_mtl_buffer(q) offset:q.storage_offset() * q.element_size() atIndex:0];
+    [enc setBuffer:tensor_mtl_buffer(k) offset:k.storage_offset() * k.element_size() atIndex:1];
+    [enc setBuffer:tensor_mtl_buffer(v) offset:v.storage_offset() * v.element_size() atIndex:2];
+    [enc setBuffer:tensor_mtl_buffer(active_blocks)
+         offset:active_blocks.storage_offset() * active_blocks.element_size()
+        atIndex:3];
+    [enc setBuffer:tensor_mtl_buffer(out) offset:out.storage_offset() * out.element_size() atIndex:4];
+    [enc setBytes:&B length:sizeof(B) atIndex:5];
+    [enc setBytes:&Hq length:sizeof(Hq) atIndex:6];
+    [enc setBytes:&T length:sizeof(T) atIndex:7];
+    [enc setBytes:&L length:sizeof(L) atIndex:8];
+    [enc setBytes:&Dh length:sizeof(Dh) atIndex:9];
+    [enc setBytes:&BlockSize length:sizeof(BlockSize) atIndex:10];
+    [enc setBytes:&ActiveCount length:sizeof(ActiveCount) atIndex:11];
+    [enc setBytes:&Causal length:sizeof(Causal) atIndex:12];
+    [enc setBytes:&Hkv length:sizeof(Hkv) atIndex:13];
+    [enc setBytes:&FP16Accum length:sizeof(FP16Accum) atIndex:14];
+
+    const uint32_t simd_width = std::max<uint32_t>(1u, thread_execution_width);
+    const NSUInteger total = static_cast<NSUInteger>(B) * Hq * T * simd_width;
+    const NSUInteger tg_req = static_cast<NSUInteger>(tg_size_hint > 0u ? tg_size_hint : get_tg_size());
+    const NSUInteger tg_aligned = MAX(simd_width, (tg_req / simd_width) * simd_width);
+    const NSUInteger tg = MIN(pipeline.maxTotalThreadsPerThreadgroup, tg_aligned);
+    const NSUInteger tg_count = (total + tg - 1) / tg;
+    [enc dispatchThreadgroups:MTLSizeMake(tg_count, 1, 1) threadsPerThreadgroup:MTLSizeMake(tg, 1, 1)];
+    [enc endEncoding];
+}
+
+static void dispatch_fast_kernel_dh64_bs4_gqa2_dualhead(
+    id<MTLComputePipelineState> pipeline,
+    uint32_t thread_execution_width,
+    const at::Tensor& q,
+    const at::Tensor& k,
+    const at::Tensor& v,
+    const at::Tensor& active_blocks,
+    at::Tensor& out,
+    uint32_t B,
+    uint32_t Hq,
+    uint32_t T,
+    uint32_t L,
+    uint32_t Dh,
+    uint32_t BlockSize,
+    uint32_t ActiveCount,
+    uint32_t Causal,
+    uint32_t Hkv,
+    uint32_t FP16Accum,
+    const char* err_prefix,
+    uint32_t tg_size_hint
+) {
+    auto* stream = at::mps::getCurrentMPSStream();
+    TORCH_CHECK(stream != nullptr, err_prefix, ": no active MPS stream");
+
+    id<MTLCommandBuffer> cb = (id<MTLCommandBuffer>)stream->commandBuffer();
+    TORCH_CHECK(cb != nil, err_prefix, ": failed to acquire command buffer");
+    id<MTLComputeCommandEncoder> enc = [cb computeCommandEncoder];
+    TORCH_CHECK(enc != nil, err_prefix, ": failed to create command encoder");
+    [enc setComputePipelineState:pipeline];
+
+    [enc setBuffer:tensor_mtl_buffer(q) offset:q.storage_offset() * q.element_size() atIndex:0];
+    [enc setBuffer:tensor_mtl_buffer(k) offset:k.storage_offset() * k.element_size() atIndex:1];
+    [enc setBuffer:tensor_mtl_buffer(v) offset:v.storage_offset() * v.element_size() atIndex:2];
+    [enc setBuffer:tensor_mtl_buffer(active_blocks)
+         offset:active_blocks.storage_offset() * active_blocks.element_size()
+        atIndex:3];
+    [enc setBuffer:tensor_mtl_buffer(out) offset:out.storage_offset() * out.element_size() atIndex:4];
+    [enc setBytes:&B length:sizeof(B) atIndex:5];
+    [enc setBytes:&Hq length:sizeof(Hq) atIndex:6];
+    [enc setBytes:&T length:sizeof(T) atIndex:7];
+    [enc setBytes:&L length:sizeof(L) atIndex:8];
+    [enc setBytes:&Dh length:sizeof(Dh) atIndex:9];
+    [enc setBytes:&BlockSize length:sizeof(BlockSize) atIndex:10];
+    [enc setBytes:&ActiveCount length:sizeof(ActiveCount) atIndex:11];
+    [enc setBytes:&Causal length:sizeof(Causal) atIndex:12];
+    [enc setBytes:&Hkv length:sizeof(Hkv) atIndex:13];
+    [enc setBytes:&FP16Accum length:sizeof(FP16Accum) atIndex:14];
+
+    const uint32_t simd_width = std::max<uint32_t>(1u, thread_execution_width);
+    const NSUInteger total = static_cast<NSUInteger>(B) * Hkv * T * simd_width;
+    const NSUInteger tg_req = static_cast<NSUInteger>(tg_size_hint > 0u ? tg_size_hint : get_tg_size());
+    const NSUInteger tg_aligned = MAX(simd_width, (tg_req / simd_width) * simd_width);
+    const NSUInteger tg = MIN(pipeline.maxTotalThreadsPerThreadgroup, tg_aligned);
+    const NSUInteger tg_count = (total + tg - 1) / tg;
+    [enc dispatchThreadgroups:MTLSizeMake(tg_count, 1, 1) threadsPerThreadgroup:MTLSizeMake(tg, 1, 1)];
+    [enc endEncoding];
+}
+
+at::Tensor metal_flex_attn_ref_impl(
+    const at::Tensor& q,
+    const at::Tensor& k,
+    const at::Tensor& v,
+    const c10::optional<at::Tensor>& mask,
+    bool causal
+) {
+    TORCH_CHECK(q.device().is_mps() && k.device().is_mps() && v.device().is_mps(),
+                "flex_attn_metal expects q/k/v on MPS");
+    TORCH_CHECK(
+        (q.scalar_type() == at::kHalf || q.scalar_type() == at::kBFloat16)
+            && q.scalar_type() == k.scalar_type()
+            && q.scalar_type() == v.scalar_type(),
+        "flex_attn_metal currently supports float16 or bfloat16 (matching dtypes)"
+    );
+    TORCH_CHECK(q.is_contiguous() && k.is_contiguous() && v.is_contiguous(),
+                "flex_attn_metal expects contiguous q/k/v");
+    TORCH_CHECK(k.sizes() == v.sizes(), "k and v must match");
+    TORCH_CHECK(q.size(0) == k.size(0) && q.size(3) == k.size(3),
+                "q/k must match on batch and head dim");
+    TORCH_CHECK(q.size(1) >= k.size(1), "q heads must be >= kv heads");
+    TORCH_CHECK((q.size(1) % k.size(1)) == 0, "q heads must be divisible by kv heads for GQA");
+
+    // Phase-1 native implementation: route through known-good ATen math while
+    // ensuring we execute on the current MPS stream. This validates stream
+    // integration before re-introducing raw Metal buffer bindings.
+    auto* stream = at::mps::getCurrentMPSStream();
+    TORCH_CHECK(stream != nullptr, "flex_attn_metal: no active MPS stream");
+    (void)stream->commandBuffer();
+
+    const int64_t T = q.size(2);
+    const int64_t L = k.size(2);
+    const int64_t Dh = q.size(3);
+
+    at::Tensor mask_tensor;
+    if (mask.has_value()) {
+        mask_tensor = *mask;
+        TORCH_CHECK(mask_tensor.device().is_mps(), "mask must be on MPS");
+        TORCH_CHECK(mask_tensor.scalar_type() == at::kByte, "mask must be uint8");
+        TORCH_CHECK(mask_tensor.is_contiguous(), "mask must be contiguous");
+        TORCH_CHECK(mask_tensor.numel() == q.size(0) * q.size(1) * T * L,
+                    "mask must have shape [B,H,T,L]");
+    }
+
+    auto qf = q.to(at::kFloat);
+    auto kf = k.to(at::kFloat);
+    auto vf = v.to(at::kFloat);
+
+    if (q.size(1) != k.size(1)) {
+        const int64_t hq = q.size(1);
+        const int64_t hkv = k.size(1);
+        const int64_t group_size = hq / hkv;
+        std::vector<int64_t> map_vec(static_cast<size_t>(hq));
+        for (int64_t i = 0; i < hq; ++i) {
+            map_vec[static_cast<size_t>(i)] = i / group_size;
+        }
+        auto head_map = at::tensor(
+            map_vec,
+            q.options().device(q.device()).dtype(at::kLong)
+        );
+        kf = kf.index_select(/*dim=*/1, head_map);
+        vf = vf.index_select(/*dim=*/1, head_map);
+    }
+
+    auto scores = at::matmul(qf, kf.transpose(-2, -1)) / std::sqrt(static_cast<double>(Dh));
+    if (mask.has_value()) {
+        scores = scores.masked_fill(mask_tensor.eq(0), -std::numeric_limits<float>::infinity());
+    }
+    if (causal) {
+        // Query rows correspond to the tail window [L-T, L), so causal bounds
+        // must be shifted by q_start rather than using naive row index.
+        const int64_t q_start = std::max<int64_t>(0, L - T);
+        auto causal_mask = at::triu(
+            at::ones({T, L}, q.options().dtype(at::kBool)),
+            /*diagonal=*/q_start + 1
+        );
+        scores = scores.masked_fill(causal_mask.unsqueeze(0).unsqueeze(0), -std::numeric_limits<float>::infinity());
+    }
+
+    auto finite_row = at::isfinite(scores).any(-1, true);
+    auto safe_scores = at::where(finite_row, scores, at::zeros_like(scores));
+    auto probs = at::softmax(safe_scores, -1);
+    probs = at::where(finite_row, probs, at::zeros_like(probs));
+
+    auto out = at::matmul(probs, vf);
+    return out.to(q.scalar_type());
+}
+
+at::Tensor metal_flex_attn_fast_dispatch_impl(
+    const at::Tensor& q,
+    const at::Tensor& k,
+    const at::Tensor& v,
+    const at::Tensor& block_written,
+    int64_t block_size,
+    bool causal,
+    bool use_active_dispatch
+) {
+    TORCH_CHECK(q.device().is_mps() && k.device().is_mps() && v.device().is_mps(),
+                "flex_attn_metal_fast expects q/k/v on MPS");
+    TORCH_CHECK(block_written.device().is_mps(), "block_written must be on MPS");
+    TORCH_CHECK(
+        (q.scalar_type() == at::kHalf || q.scalar_type() == at::kBFloat16)
+            && q.scalar_type() == k.scalar_type()
+            && q.scalar_type() == v.scalar_type(),
+        "flex_attn_metal_fast currently supports float16 or bfloat16 (matching dtypes)"
+    );
+    TORCH_CHECK(block_written.scalar_type() == at::kByte, "block_written must be uint8");
+    TORCH_CHECK(block_written.is_contiguous(), "block_written must be contiguous");
+    TORCH_CHECK(k.sizes() == v.sizes(), "k and v must match");
+    TORCH_CHECK(q.size(0) == k.size(0) && q.size(3) == k.size(3),
+                "q/k must match on batch and head dim");
+    TORCH_CHECK(q.size(1) >= k.size(1), "q heads must be >= kv heads");
+    TORCH_CHECK((q.size(1) % k.size(1)) == 0, "q heads must be divisible by kv heads for GQA");
+    TORCH_CHECK(block_size > 0, "block_size must be > 0");
+
+    auto& rt = get_metal_runtime();
+    TORCH_CHECK(rt.init_ok, "flex_attn_metal_fast: metal runtime init failed");
+
+    const auto input_dtype = q.scalar_type();
+    const bool use_bf16_io = (input_dtype == at::kBFloat16);
+    const at::Tensor qh = use_bf16_io ? q.to(at::kHalf).contiguous() : q.contiguous();
+    const at::Tensor kh = use_bf16_io ? k.to(at::kHalf).contiguous() : k.contiguous();
+    const at::Tensor vh = use_bf16_io ? v.to(at::kHalf).contiguous() : v.contiguous();
+
+    const uint32_t B = static_cast<uint32_t>(qh.size(0));
+    const uint32_t Hq = static_cast<uint32_t>(qh.size(1));
+    const uint32_t Hkv = static_cast<uint32_t>(kh.size(1));
+    const uint32_t T = static_cast<uint32_t>(qh.size(2));
+    const uint32_t L = static_cast<uint32_t>(kh.size(2));
+    const uint32_t Dh = static_cast<uint32_t>(qh.size(3));
+    TORCH_CHECK(Dh <= 128, "flex_attn_metal_fast currently supports Dh <= 128");
+    const uint32_t BlockSize = static_cast<uint32_t>(block_size);
+    const uint32_t KVBLOCKS = (L + BlockSize - 1) / BlockSize;
+    const uint32_t Causal = causal ? 1u : 0u;
+    TORCH_CHECK(block_written.numel() == static_cast<int64_t>(KVBLOCKS),
+                "block_written must have exactly ceil(L/block_size) elements");
+
+    at::Tensor active_blocks;
+    uint32_t ActiveCount = KVBLOCKS;
+    if (use_active_dispatch) {
+        active_blocks = at::nonzero(block_written.gt(0)).flatten().to(at::kInt).contiguous();
+        ActiveCount = static_cast<uint32_t>(active_blocks.numel());
+    }
+
+    at::Tensor out = at::zeros_like(qh);
+    if (ActiveCount == 0) {
+        return use_bf16_io ? out.to(input_dtype) : out;
+    }
+    const uint32_t FP16Accum = enable_fp16_accum() ? 1u : 0u;
+    if (use_active_dispatch && ActiveCount > 0u) {
+        if (Dh == 64u && BlockSize == 4u) {
+            const float density = static_cast<float>(ActiveCount) / static_cast<float>(std::max<uint32_t>(1u, KVBLOCKS));
+            const bool use_gqa1_specialized = (Hq == Hkv);
+            const bool use_gqa2_specialized = (Hq == (Hkv << 1));
+            const bool use_gqa2_dense = use_gqa2_specialized && (ActiveCount == KVBLOCKS);
+            const bool use_gqa2_dualhead = enable_gqa2_dualhead_specialization() && use_gqa2_specialized && (density <= 0.75f) && (T >= 256u);
+            const uint32_t tuned_tg = get_tg_size();
+            if (use_gqa1_specialized) {
+                dispatch_fast_kernel(
+                    rt.pipeline_dh64_bs4_gqa1_single,
+                    rt.thread_execution_width_dh64_bs4_gqa1_single,
+                    qh, kh, vh, active_blocks, out,
+                    B, Hq, T, L, Dh, BlockSize, ActiveCount, Causal, Hkv,
+                    FP16Accum,
+                    "flex_attn_metal_fast", tuned_tg
+                );
+            } else if (use_gqa2_dualhead) {
+                dispatch_fast_kernel_dh64_bs4_gqa2_dualhead(
+                    rt.pipeline_dh64_bs4_gqa2_dualhead,
+                    rt.thread_execution_width_dh64_bs4_gqa2_dualhead,
+                    qh, kh, vh, active_blocks, out,
+                    B, Hq, T, L, Dh, BlockSize, ActiveCount, Causal, Hkv,
+                    FP16Accum,
+                    "flex_attn_metal_fast", tuned_tg
+                );
+            } else if (use_gqa2_dense) {
+                dispatch_fast_kernel(
+                    rt.pipeline_dh64_bs4_gqa2_dense,
+                    rt.thread_execution_width_dh64_bs4_gqa2_dense,
+                    qh, kh, vh, active_blocks, out,
+                    B, Hq, T, L, Dh, BlockSize, ActiveCount, Causal, Hkv,
+                    FP16Accum,
+                    "flex_attn_metal_fast", tuned_tg
+                );
+            } else if (use_gqa2_specialized) {
+                dispatch_fast_kernel(
+                    rt.pipeline_dh64_bs4_gqa2_single,
+                    rt.thread_execution_width_dh64_bs4_gqa2_single,
+                    qh, kh, vh, active_blocks, out,
+                    B, Hq, T, L, Dh, BlockSize, ActiveCount, Causal, Hkv,
+                    FP16Accum,
+                    "flex_attn_metal_fast", tuned_tg
+                );
+            } else {
+                dispatch_fast_kernel(
+                    rt.pipeline_dh64_bs4_single, rt.thread_execution_width_dh64_bs4_single, qh, kh, vh, active_blocks, out,
+                    B, Hq, T, L, Dh, BlockSize, ActiveCount, Causal, Hkv,
+                    FP16Accum,
+                    "flex_attn_metal_fast", tuned_tg
+                );
+            }
+        } else {
+            dispatch_fast_kernel(
+                rt.pipeline_generic, rt.thread_execution_width_generic, qh, kh, vh, active_blocks, out,
+                B, Hq, T, L, Dh, BlockSize, ActiveCount, Causal, Hkv,
+                FP16Accum,
+                "flex_attn_metal_fast", 0u
+            );
+        }
+    } else {
+        if (Dh == 64u && BlockSize == 4u && Hq == Hkv) {
+            dispatch_fast_kernel(
+                rt.pipeline_dh64_bs4_gqa1_block_written,
+                rt.thread_execution_width_dh64_bs4_gqa1_block_written,
+                qh, kh, vh, block_written, out,
+                B, Hq, T, L, Dh, BlockSize, ActiveCount, Causal, Hkv,
+                FP16Accum,
+                "flex_attn_metal_fast", get_tg_size()
+            );
+        } else {
+            dispatch_fast_kernel(
+                rt.pipeline_generic_block_written,
+                rt.thread_execution_width_generic_block_written,
+                qh, kh, vh, block_written, out,
+                B, Hq, T, L, Dh, BlockSize, ActiveCount, Causal, Hkv,
+                FP16Accum,
+                "flex_attn_metal_fast", 0u
+            );
+        }
+    }
+    // Do not force immediate commit/wait here; let MPS stream scheduling batch
+    // this op naturally with surrounding kernels for better throughput.
+    return use_bf16_io ? out.to(input_dtype) : out;
+}
+
+at::Tensor metal_flex_attn_fast_dispatch_active_impl(
+    const at::Tensor& q,
+    const at::Tensor& k,
+    const at::Tensor& v,
+    const at::Tensor& active_blocks,
+    int64_t block_size,
+    bool causal
+) {
+    TORCH_CHECK(q.device().is_mps() && k.device().is_mps() && v.device().is_mps(),
+                "flex_attn_metal_fast_active expects q/k/v on MPS");
+    TORCH_CHECK(active_blocks.device().is_mps(), "active_blocks must be on MPS");
+    TORCH_CHECK(
+        (q.scalar_type() == at::kHalf || q.scalar_type() == at::kBFloat16)
+            && q.scalar_type() == k.scalar_type()
+            && q.scalar_type() == v.scalar_type(),
+        "flex_attn_metal_fast_active currently supports float16 or bfloat16 (matching dtypes)"
+    );
+    TORCH_CHECK(active_blocks.scalar_type() == at::kInt, "active_blocks must be int32");
+    TORCH_CHECK(active_blocks.is_contiguous(), "active_blocks must be contiguous");
+    TORCH_CHECK(k.sizes() == v.sizes(), "k and v must match");
+    TORCH_CHECK(q.size(0) == k.size(0) && q.size(3) == k.size(3),
+                "q/k must match on batch and head dim");
+    TORCH_CHECK(q.size(1) >= k.size(1), "q heads must be >= kv heads");
+    TORCH_CHECK((q.size(1) % k.size(1)) == 0, "q heads must be divisible by kv heads for GQA");
+    TORCH_CHECK(block_size > 0, "block_size must be > 0");
+
+    auto& rt = get_metal_runtime();
+    TORCH_CHECK(rt.init_ok, "flex_attn_metal_fast_active: metal runtime init failed");
+
+    const auto input_dtype = q.scalar_type();
+    const bool use_bf16_io = (input_dtype == at::kBFloat16);
+    const bool use_native_bf16_generic = use_bf16_io && (rt.pipeline_generic_bf16 != nil);
+    const at::Tensor qh = use_native_bf16_generic ? q.contiguous() : (use_bf16_io ? q.to(at::kHalf).contiguous() : q.contiguous());
+    const at::Tensor kh = use_native_bf16_generic ? k.contiguous() : (use_bf16_io ? k.to(at::kHalf).contiguous() : k.contiguous());
+    const at::Tensor vh = use_native_bf16_generic ? v.contiguous() : (use_bf16_io ? v.to(at::kHalf).contiguous() : v.contiguous());
+
+    const uint32_t B = static_cast<uint32_t>(qh.size(0));
+    const uint32_t Hq = static_cast<uint32_t>(qh.size(1));
+    const uint32_t Hkv = static_cast<uint32_t>(kh.size(1));
+    const uint32_t T = static_cast<uint32_t>(qh.size(2));
+    const uint32_t L = static_cast<uint32_t>(kh.size(2));
+    const uint32_t Dh = static_cast<uint32_t>(qh.size(3));
+    TORCH_CHECK(Dh <= 128, "flex_attn_metal_fast_active currently supports Dh <= 128");
+    const uint32_t BlockSize = static_cast<uint32_t>(block_size);
+    const uint32_t KVBLOCKS = (L + BlockSize - 1) / BlockSize;
+    const uint32_t Causal = causal ? 1u : 0u;
+    TORCH_CHECK(
+        active_blocks.numel() <= static_cast<int64_t>(KVBLOCKS),
+        "active_blocks numel must be <= ceil(L/block_size)"
+    );
+    const uint32_t ActiveCount = static_cast<uint32_t>(active_blocks.numel());
+
+    at::Tensor out = at::zeros_like(qh);
+    if (ActiveCount == 0) {
+        return use_bf16_io ? out.to(input_dtype) : out;
+    }
+    const uint32_t FP16Accum = enable_fp16_accum() ? 1u : 0u;
+    if (use_native_bf16_generic) {
+        dispatch_fast_kernel(
+            rt.pipeline_generic_bf16, rt.thread_execution_width_generic_bf16, qh, kh, vh, active_blocks, out,
+            B, Hq, T, L, Dh, BlockSize, ActiveCount, Causal, Hkv,
+            FP16Accum,
+            "flex_attn_metal_fast_active", 0u
+        );
+    } else if (Dh == 64u && BlockSize == 4u) {
+        const float density = static_cast<float>(ActiveCount) / static_cast<float>(std::max<uint32_t>(1u, KVBLOCKS));
+        const bool use_gqa1_specialized = (Hq == Hkv);
+        const bool use_gqa2_specialized = (Hq == (Hkv << 1));
+        const bool use_gqa2_dense = use_gqa2_specialized && (ActiveCount == KVBLOCKS);
+        const bool use_gqa2_dualhead = enable_gqa2_dualhead_specialization() && use_gqa2_specialized && (density <= 0.75f) && (T >= 256u);
+        const uint32_t tuned_tg = get_tg_size();
+        if (use_gqa1_specialized) {
+            dispatch_fast_kernel(
+                rt.pipeline_dh64_bs4_gqa1_single,
+                rt.thread_execution_width_dh64_bs4_gqa1_single,
+                qh, kh, vh, active_blocks, out,
+                B, Hq, T, L, Dh, BlockSize, ActiveCount, Causal, Hkv,
+                FP16Accum,
+                "flex_attn_metal_fast_active", tuned_tg
+            );
+        } else if (use_gqa2_dualhead) {
+            dispatch_fast_kernel_dh64_bs4_gqa2_dualhead(
+                rt.pipeline_dh64_bs4_gqa2_dualhead,
+                rt.thread_execution_width_dh64_bs4_gqa2_dualhead,
+                qh, kh, vh, active_blocks, out,
+                B, Hq, T, L, Dh, BlockSize, ActiveCount, Causal, Hkv,
+                FP16Accum,
+                "flex_attn_metal_fast_active", tuned_tg
+            );
+        } else if (use_gqa2_dense) {
+            dispatch_fast_kernel(
+                rt.pipeline_dh64_bs4_gqa2_dense,
+                rt.thread_execution_width_dh64_bs4_gqa2_dense,
+                qh, kh, vh, active_blocks, out,
+                B, Hq, T, L, Dh, BlockSize, ActiveCount, Causal, Hkv,
+                FP16Accum,
+                "flex_attn_metal_fast_active", tuned_tg
+            );
+        } else if (use_gqa2_specialized) {
+            dispatch_fast_kernel(
+                rt.pipeline_dh64_bs4_gqa2_single,
+                rt.thread_execution_width_dh64_bs4_gqa2_single,
+                qh, kh, vh, active_blocks, out,
+                B, Hq, T, L, Dh, BlockSize, ActiveCount, Causal, Hkv,
+                FP16Accum,
+                "flex_attn_metal_fast_active", tuned_tg
+            );
+        } else {
+            dispatch_fast_kernel(
+                rt.pipeline_dh64_bs4_single, rt.thread_execution_width_dh64_bs4_single, qh, kh, vh, active_blocks, out,
+                B, Hq, T, L, Dh, BlockSize, ActiveCount, Causal, Hkv,
+                FP16Accum,
+                "flex_attn_metal_fast_active", tuned_tg
+            );
+        }
+    } else {
+        dispatch_fast_kernel(
+            rt.pipeline_generic, rt.thread_execution_width_generic, qh, kh, vh, active_blocks, out,
+            B, Hq, T, L, Dh, BlockSize, ActiveCount, Causal, Hkv,
+            FP16Accum,
+            "flex_attn_metal_fast_active", 0u
+        );
+    }
+    return (use_bf16_io && !use_native_bf16_generic) ? out.to(input_dtype) : out;
+}
+
+at::Tensor metal_flex_attn_fast_dispatch_from_mask_impl(
+    const at::Tensor& q,
+    const at::Tensor& k,
+    const at::Tensor& v,
+    const c10::optional<at::Tensor>& mask,
+    bool causal
+) {
+    TORCH_CHECK(q.device().is_mps() && k.device().is_mps() && v.device().is_mps(),
+                "flex_attn_metal_fast expects q/k/v on MPS");
+    TORCH_CHECK(
+        (q.scalar_type() == at::kHalf || q.scalar_type() == at::kBFloat16)
+            && q.scalar_type() == k.scalar_type()
+            && q.scalar_type() == v.scalar_type(),
+        "flex_attn_metal_fast currently supports float16 or bfloat16 (matching dtypes)"
+    );
+    TORCH_CHECK(k.sizes() == v.sizes(), "k and v must match");
+    TORCH_CHECK(q.size(0) == k.size(0) && q.size(3) == k.size(3),
+                "q/k must match on batch and head dim");
+    TORCH_CHECK(q.size(1) >= k.size(1), "q heads must be >= kv heads");
+    TORCH_CHECK((q.size(1) % k.size(1)) == 0, "q heads must be divisible by kv heads for GQA");
+
+    const uint32_t B = static_cast<uint32_t>(q.size(0));
+    const uint32_t Hq = static_cast<uint32_t>(q.size(1));
+    const uint32_t Hkv = static_cast<uint32_t>(k.size(1));
+    const uint32_t T = static_cast<uint32_t>(q.size(2));
+    const uint32_t L = static_cast<uint32_t>(k.size(2));
+    const uint32_t Dh = static_cast<uint32_t>(q.size(3));
+    const uint32_t BlockSize = static_cast<uint32_t>(get_block_size());
+    TORCH_CHECK(BlockSize > 0, "WORLD_METAL_BLOCK_SIZE must be > 0");
+    const uint32_t KVBLOCKS = (L + BlockSize - 1) / BlockSize;
+    const uint32_t Causal = causal ? 1u : 0u;
+
+    at::Tensor mask_tensor;
+    at::Tensor block_written;
+    if (mask.has_value()) {
+        mask_tensor = *mask;
+        TORCH_CHECK(mask_tensor.device().is_mps(), "mask must be on MPS");
+        TORCH_CHECK(mask_tensor.scalar_type() == at::kByte, "mask must be uint8");
+        TORCH_CHECK(mask_tensor.is_contiguous(), "mask must be contiguous");
+        TORCH_CHECK(mask_tensor.numel() == q.size(0) * q.size(1) * q.size(2) * k.size(2),
+                    "mask must have shape [B,H,T,L]");
+        // Fast-kernel contract today: a single shared, block-wise mask state.
+        // We enforce this explicitly to avoid silent semantic drift.
+        auto row = mask_tensor.index({0, 0, 0}).contiguous(); // [L]
+        auto shared_ok = mask_tensor.eq(row.view({1, 1, 1, static_cast<int64_t>(L)})).all().item<bool>();
+        TORCH_CHECK(
+            shared_ok,
+            "flex_attn_metal_fast expects a shared mask across batch/head/query dimensions"
+        );
+
+        const int64_t full_blocks = static_cast<int64_t>(L / BlockSize);
+        const int64_t rem = static_cast<int64_t>(L % BlockSize);
+        at::Tensor block_vals;
+
+        if (full_blocks > 0) {
+            auto prefix = row.slice(/*dim=*/0, /*start=*/0, /*end=*/full_blocks * static_cast<int64_t>(BlockSize));
+            auto blocks2d = prefix.view({full_blocks, static_cast<int64_t>(BlockSize)});
+            auto first = blocks2d.index({at::indexing::Slice(), 0}).unsqueeze(1);
+            auto full_ok = blocks2d.eq(first).all().item<bool>();
+            TORCH_CHECK(
+                full_ok,
+                "flex_attn_metal_fast expects block-wise mask values (constant within each block)"
+            );
+            block_vals = first.squeeze(1).to(at::kByte);
+        } else {
+            block_vals = at::empty({0}, q.options().dtype(at::kByte));
+        }
+
+        if (rem > 0) {
+            auto tail = row.slice(/*dim=*/0, /*start=*/full_blocks * static_cast<int64_t>(BlockSize), /*end=*/static_cast<int64_t>(L));
+            auto tail_first = tail.index({0});
+            auto tail_ok = tail.eq(tail_first).all().item<bool>();
+            TORCH_CHECK(
+                tail_ok,
+                "flex_attn_metal_fast expects block-wise mask values (constant within each block)"
+            );
+            auto tail_val = tail_first.to(at::kByte).view({1});
+            block_vals = (block_vals.numel() > 0) ? at::cat({block_vals, tail_val}, /*dim=*/0) : tail_val;
+        }
+
+        block_written = block_vals.contiguous();
+    } else {
+        block_written = at::ones({static_cast<int64_t>(KVBLOCKS)}, q.options().dtype(at::kByte)).contiguous();
+    }
+
+    return metal_flex_attn_fast_dispatch_impl(
+        q, k, v, block_written, static_cast<int64_t>(BlockSize), causal, prefer_active_dispatch_path()
+    );
+}
+
+at::Tensor metal_flex_attn_fast_impl(
+    const at::Tensor& q,
+    const at::Tensor& k,
+    const at::Tensor& v,
+    const c10::optional<at::Tensor>& mask,
+    bool causal
+) {
+    if (fast_no_fallback()) {
+        return metal_flex_attn_fast_dispatch_from_mask_impl(q, k, v, mask, causal);
+    }
+    // Keep ref as a safety net while fast path stabilizes.
+    try {
+        return metal_flex_attn_fast_dispatch_from_mask_impl(q, k, v, mask, causal);
+    } catch (...) {
+        return metal_flex_attn_ref_impl(q, k, v, mask, causal);
+    }
+}
+
+at::Tensor metal_flex_attn_fast_blocks_impl(
+    const at::Tensor& q,
+    const at::Tensor& k,
+    const at::Tensor& v,
+    const at::Tensor& block_written,
+    int64_t block_size,
+    bool causal
+) {
+    if (fast_no_fallback()) {
+        return metal_flex_attn_fast_dispatch_impl(q, k, v, block_written, block_size, causal, prefer_active_dispatch_path());
+    }
+    try {
+        return metal_flex_attn_fast_dispatch_impl(q, k, v, block_written, block_size, causal, prefer_active_dispatch_path());
+    } catch (...) {
+        // Reconstruct dense mask for reference fallback.
+        const int64_t B = q.size(0);
+        const int64_t Hq = q.size(1);
+        const int64_t T = q.size(2);
+        const int64_t L = k.size(2);
+        at::Tensor dense = at::zeros({L}, q.options().dtype(at::kByte));
+        for (int64_t b = 0; b < block_written.numel(); ++b) {
+            if (block_written.index({b}).item<int64_t>() != 0) {
+                const int64_t s = b * block_size;
+                const int64_t e = std::min<int64_t>(L, s + block_size);
+                dense.index_put_({at::indexing::Slice(s, e)}, 1);
+            }
+        }
+        auto dense4d = dense.view({1, 1, 1, L}).expand({B, Hq, T, L}).contiguous();
+        return metal_flex_attn_ref_impl(q, k, v, dense4d, causal);
+    }
+}
+
+at::Tensor metal_flex_attn_fast_blocks_direct_impl(
+    const at::Tensor& q,
+    const at::Tensor& k,
+    const at::Tensor& v,
+    const at::Tensor& block_written,
+    int64_t block_size,
+    bool causal
+) {
+    if (fast_no_fallback()) {
+        return metal_flex_attn_fast_dispatch_impl(q, k, v, block_written, block_size, causal, false);
+    }
+    try {
+        return metal_flex_attn_fast_dispatch_impl(q, k, v, block_written, block_size, causal, false);
+    } catch (...) {
+        const int64_t B = q.size(0);
+        const int64_t Hq = q.size(1);
+        const int64_t T = q.size(2);
+        const int64_t L = k.size(2);
+        at::Tensor dense = at::zeros({L}, q.options().dtype(at::kByte));
+        for (int64_t b = 0; b < block_written.numel(); ++b) {
+            if (block_written.index({b}).item<int64_t>() != 0) {
+                const int64_t s = b * block_size;
+                const int64_t e = std::min<int64_t>(L, s + block_size);
+                dense.index_put_({at::indexing::Slice(s, e)}, 1);
+            }
+        }
+        auto dense4d = dense.view({1, 1, 1, L}).expand({B, Hq, T, L}).contiguous();
+        return metal_flex_attn_ref_impl(q, k, v, dense4d, causal);
+    }
+}
+
+at::Tensor metal_flex_attn_fast_active_impl(
+    const at::Tensor& q,
+    const at::Tensor& k,
+    const at::Tensor& v,
+    const at::Tensor& active_blocks,
+    int64_t block_size,
+    bool causal
+) {
+    if (fast_no_fallback()) {
+        return metal_flex_attn_fast_dispatch_active_impl(q, k, v, active_blocks, block_size, causal);
+    }
+    try {
+        return metal_flex_attn_fast_dispatch_active_impl(q, k, v, active_blocks, block_size, causal);
+    } catch (...) {
+        // Reconstruct dense mask for reference fallback.
+        const int64_t B = q.size(0);
+        const int64_t Hq = q.size(1);
+        const int64_t T = q.size(2);
+        const int64_t L = k.size(2);
+        const int64_t kv_blocks = (L + block_size - 1) / block_size;
+        at::Tensor bw = at::zeros({kv_blocks}, q.options().dtype(at::kByte));
+        for (int64_t i = 0; i < active_blocks.numel(); ++i) {
+            const int64_t bi = active_blocks.index({i}).item<int64_t>();
+            if (bi >= 0 && bi < kv_blocks) {
+                bw.index_put_({bi}, 1);
+            }
+        }
+        at::Tensor dense = at::zeros({L}, q.options().dtype(at::kByte));
+        for (int64_t b = 0; b < bw.numel(); ++b) {
+            if (bw.index({b}).item<int64_t>() != 0) {
+                const int64_t s = b * block_size;
+                const int64_t e = std::min<int64_t>(L, s + block_size);
+                dense.index_put_({at::indexing::Slice(s, e)}, 1);
+            }
+        }
+        auto dense4d = dense.view({1, 1, 1, L}).expand({B, Hq, T, L}).contiguous();
+        return metal_flex_attn_ref_impl(q, k, v, dense4d, causal);
+    }
+}
+
+at::Tensor metal_flex_attn_fast_active_counted_impl(
+    const at::Tensor& q,
+    const at::Tensor& k,
+    const at::Tensor& v,
+    const at::Tensor& active_blocks,
+    const at::Tensor& active_count,
+    int64_t block_size,
+    bool causal
+) {
+    TORCH_CHECK(active_count.numel() == 1, "active_count must be a scalar tensor");
+    TORCH_CHECK(active_blocks.is_contiguous(), "active_blocks must be contiguous");
+    TORCH_CHECK(active_blocks.scalar_type() == at::kInt, "active_blocks must be int32");
+    TORCH_CHECK(active_count.scalar_type() == at::kInt, "active_count must be int32");
+    TORCH_CHECK(q.device().is_mps() && k.device().is_mps() && v.device().is_mps(),
+                "flex_attn_metal_fast_active_counted expects q/k/v on MPS");
+    TORCH_CHECK(
+        (q.scalar_type() == at::kHalf || q.scalar_type() == at::kBFloat16)
+            && q.scalar_type() == k.scalar_type()
+            && q.scalar_type() == v.scalar_type(),
+        "flex_attn_metal_fast_active_counted currently supports float16 or bfloat16 (matching dtypes)"
+    );
+    TORCH_CHECK(k.sizes() == v.sizes(), "k and v must match");
+    TORCH_CHECK(q.size(0) == k.size(0) && q.size(3) == k.size(3),
+                "q/k must match on batch and head dim");
+    TORCH_CHECK(q.size(1) >= k.size(1), "q heads must be >= kv heads");
+    TORCH_CHECK((q.size(1) % k.size(1)) == 0, "q heads must be divisible by kv heads for GQA");
+    TORCH_CHECK(block_size > 0, "block_size must be > 0");
+
+    auto& rt = get_metal_runtime();
+    TORCH_CHECK(rt.init_ok, "flex_attn_metal_fast_active_counted: metal runtime init failed");
+
+    const auto input_dtype = q.scalar_type();
+    const bool use_bf16_io = (input_dtype == at::kBFloat16);
+    const at::Tensor qh = use_bf16_io ? q.to(at::kHalf).contiguous() : q.contiguous();
+    const at::Tensor kh = use_bf16_io ? k.to(at::kHalf).contiguous() : k.contiguous();
+    const at::Tensor vh = use_bf16_io ? v.to(at::kHalf).contiguous() : v.contiguous();
+
+    const uint32_t B = static_cast<uint32_t>(qh.size(0));
+    const uint32_t Hq = static_cast<uint32_t>(qh.size(1));
+    const uint32_t Hkv = static_cast<uint32_t>(kh.size(1));
+    const uint32_t T = static_cast<uint32_t>(qh.size(2));
+    const uint32_t L = static_cast<uint32_t>(kh.size(2));
+    const uint32_t Dh = static_cast<uint32_t>(qh.size(3));
+    TORCH_CHECK(Dh <= 128, "flex_attn_metal_fast_active_counted currently supports Dh <= 128");
+    const uint32_t BlockSize = static_cast<uint32_t>(block_size);
+    const uint32_t Causal = causal ? 1u : 0u;
+
+    const uint32_t ActiveCount = static_cast<uint32_t>(active_count.item<int32_t>());
+    at::Tensor out = at::zeros_like(qh);
+    if (ActiveCount == 0) {
+        return use_bf16_io ? out.to(input_dtype) : out;
+    }
+    const uint32_t FP16Accum = enable_fp16_accum() ? 1u : 0u;
+    if (Dh == 64u && BlockSize == 4u) {
+        const bool use_gqa1_specialized = (Hq == Hkv);
+        const uint32_t tuned_tg = get_tg_size();
+        if (use_gqa1_specialized) {
+            dispatch_fast_kernel(
+                rt.pipeline_dh64_bs4_gqa1_single,
+                rt.thread_execution_width_dh64_bs4_gqa1_single,
+                qh, kh, vh, active_blocks, out,
+                B, Hq, T, L, Dh, BlockSize, ActiveCount, Causal, Hkv,
+                FP16Accum,
+                "flex_attn_metal_fast_active_counted", tuned_tg
+            );
+        } else {
+            dispatch_fast_kernel(
+                rt.pipeline_dh64_bs4_single, rt.thread_execution_width_dh64_bs4_single,
+                qh, kh, vh, active_blocks, out,
+                B, Hq, T, L, Dh, BlockSize, ActiveCount, Causal, Hkv,
+                FP16Accum,
+                "flex_attn_metal_fast_active_counted", tuned_tg
+            );
+        }
+    } else {
+        dispatch_fast_kernel(
+            rt.pipeline_generic, rt.thread_execution_width_generic,
+            qh, kh, vh, active_blocks, out,
+            B, Hq, T, L, Dh, BlockSize, ActiveCount, Causal, Hkv,
+            FP16Accum,
+            "flex_attn_metal_fast_active_counted", 0u
+        );
+    }
+    return use_bf16_io ? out.to(input_dtype) : out;
+}
+
+at::Tensor metal_flex_attn_impl(
+    const at::Tensor& q,
+    const at::Tensor& k,
+    const at::Tensor& v,
+    const c10::optional<at::Tensor>& mask,
+    bool causal
+) {
+    // Backward-compatible alias; default to ref behavior.
+    return metal_flex_attn_ref_impl(q, k, v, mask, causal);
+}
+
+} // namespace
+
+TORCH_LIBRARY(world, m) {
+    m.def("flex_attn_metal(Tensor q, Tensor k, Tensor v, Tensor? mask=None, bool causal=True) -> Tensor");
+    m.def("flex_attn_metal_ref(Tensor q, Tensor k, Tensor v, Tensor? mask=None, bool causal=True) -> Tensor");
+    m.def("flex_attn_metal_fast(Tensor q, Tensor k, Tensor v, Tensor? mask=None, bool causal=True) -> Tensor");
+    m.def("flex_attn_metal_fast_blocks(Tensor q, Tensor k, Tensor v, Tensor block_written, int block_size, bool causal=True) -> Tensor");
+    m.def("flex_attn_metal_fast_blocks_direct(Tensor q, Tensor k, Tensor v, Tensor block_written, int block_size, bool causal=True) -> Tensor");
+    m.def("flex_attn_metal_fast_active(Tensor q, Tensor k, Tensor v, Tensor active_blocks, int block_size, bool causal=True) -> Tensor");
+    m.def("flex_attn_metal_fast_active_counted(Tensor q, Tensor k, Tensor v, Tensor active_blocks, Tensor active_count, int block_size, bool causal=True) -> Tensor");
+}
+
+TORCH_LIBRARY_IMPL(world, MPS, m) {
+    m.impl("flex_attn_metal", &metal_flex_attn_impl);
+    m.impl("flex_attn_metal_ref", &metal_flex_attn_ref_impl);
+    m.impl("flex_attn_metal_fast", &metal_flex_attn_fast_impl);
+    m.impl("flex_attn_metal_fast_blocks", &metal_flex_attn_fast_blocks_impl);
+    m.impl("flex_attn_metal_fast_blocks_direct", &metal_flex_attn_fast_blocks_direct_impl);
+    m.impl("flex_attn_metal_fast_active", &metal_flex_attn_fast_active_impl);
+    m.impl("flex_attn_metal_fast_active_counted", &metal_flex_attn_fast_active_counted_impl);
+}
+
diff --git a/src/metal/runtime.py b/src/metal/runtime.py
new file mode 100644
index 0000000..f76a248
--- /dev/null
+++ b/src/metal/runtime.py
@@ -0,0 +1,170 @@
+from __future__ import annotations
+
+from pathlib import Path
+import os
+import sys
+
+import torch
+from torch.utils.cpp_extension import load
+
+_ROOT = Path(__file__).resolve().parents[1]
+_SRC = _ROOT / "metal"
+_BUILD_DIR = _ROOT.parent / ".build" / "torch_extensions"
+
+_ATTN_EXT_NAME = "world_metal_attn_ext"
+
+_ATTN_READY = False
+_ATTN_FAKE_READY = False
+
+
+def _ensure_build_env() -> None:
+    _BUILD_DIR.mkdir(parents=True, exist_ok=True)
+    os.environ.setdefault("TORCH_EXTENSIONS_DIR", str(_BUILD_DIR))
+
+    # Prefer the currently active interpreter's bin directory (works for worktrees),
+    # then fall back to local repo .venv/bin if present.
+    bin_candidates: list[Path] = [
+        Path(sys.executable).parent,
+        Path(sys.prefix) / "bin",
+        _ROOT.parent / ".venv" / "bin",
+    ]
+    for bin_dir in bin_candidates:
+        if not bin_dir.exists():
+            continue
+        os.environ["PATH"] = f"{bin_dir}:{os.environ.get('PATH', '')}"
+        ninja_bin = bin_dir / "ninja"
+        if ninja_bin.exists():
+            os.environ.setdefault("NINJA", str(ninja_bin))
+            break
+
+
+def _try_load_extension(name: str, source: Path) -> bool:
+    if not torch.backends.mps.is_available():
+        return False
+    if not source.exists():
+        return False
+
+    _ensure_build_env()
+    try:
+        load(
+            name=name,
+            sources=[str(source)],
+            extra_cflags=["-std=c++17", "-O3"],
+            extra_ldflags=["-framework", "Metal", "-framework", "Foundation"],
+            with_cuda=False,
+            is_python_module=False,
+            verbose=False,
+        )
+        return True
+    except Exception as exc:
+        if os.environ.get("WORLD_METAL_RUNTIME_DEBUG", "0") == "1":
+            print(f"[metal.runtime] failed to load {name}: {type(exc).__name__}: {exc}")
+        return False
+
+
+def _register_attention_fake_kernels() -> None:
+    global _ATTN_FAKE_READY
+    if _ATTN_FAKE_READY:
+        return
+
+    def _out_like_q(q: torch.Tensor) -> torch.Tensor:
+        return torch.empty_like(q)
+
+    try:
+        @torch.library.register_fake("world::flex_attn_metal")
+        def _fake_metal(
+            q: torch.Tensor,
+            k: torch.Tensor,
+            v: torch.Tensor,
+            mask: torch.Tensor | None = None,
+            causal: bool = True,
+        ) -> torch.Tensor:
+            return _out_like_q(q)
+
+        @torch.library.register_fake("world::flex_attn_metal_ref")
+        def _fake_metal_ref(
+            q: torch.Tensor,
+            k: torch.Tensor,
+            v: torch.Tensor,
+            mask: torch.Tensor | None = None,
+            causal: bool = True,
+        ) -> torch.Tensor:
+            return _out_like_q(q)
+
+        @torch.library.register_fake("world::flex_attn_metal_fast")
+        def _fake_metal_fast(
+            q: torch.Tensor,
+            k: torch.Tensor,
+            v: torch.Tensor,
+            mask: torch.Tensor | None = None,
+            causal: bool = True,
+        ) -> torch.Tensor:
+            return _out_like_q(q)
+
+        @torch.library.register_fake("world::flex_attn_metal_fast_blocks")
+        def _fake_metal_fast_blocks(
+            q: torch.Tensor,
+            k: torch.Tensor,
+            v: torch.Tensor,
+            block_written: torch.Tensor,
+            block_size: int,
+            causal: bool = True,
+        ) -> torch.Tensor:
+            return _out_like_q(q)
+
+        @torch.library.register_fake("world::flex_attn_metal_fast_blocks_direct")
+        def _fake_metal_fast_blocks_direct(
+            q: torch.Tensor,
+            k: torch.Tensor,
+            v: torch.Tensor,
+            block_written: torch.Tensor,
+            block_size: int,
+            causal: bool = True,
+        ) -> torch.Tensor:
+            return _out_like_q(q)
+
+        @torch.library.register_fake("world::flex_attn_metal_fast_active")
+        def _fake_metal_fast_active(
+            q: torch.Tensor,
+            k: torch.Tensor,
+            v: torch.Tensor,
+            active_blocks: torch.Tensor,
+            block_size: int,
+            causal: bool = True,
+        ) -> torch.Tensor:
+            return _out_like_q(q)
+
+        @torch.library.register_fake("world::flex_attn_metal_fast_active_counted")
+        def _fake_metal_fast_active_counted(
+            q: torch.Tensor,
+            k: torch.Tensor,
+            v: torch.Tensor,
+            active_blocks: torch.Tensor,
+            active_count: torch.Tensor,
+            block_size: int,
+            causal: bool = True,
+        ) -> torch.Tensor:
+            return _out_like_q(q)
+    except RuntimeError:
+        # Fake implementations can only be registered once per process.
+        pass
+
+    _ATTN_FAKE_READY = True
+
+
+def ensure_metal_attention_op_loaded() -> bool:
+    global _ATTN_READY
+    if _ATTN_READY:
+        _register_attention_fake_kernels()
+        return True
+    _ATTN_READY = _try_load_extension(_ATTN_EXT_NAME, _SRC / "metal_flex_attn_op.mm")
+    if _ATTN_READY:
+        _register_attention_fake_kernels()
+    return _ATTN_READY
+
+
+def metal_attention_available() -> bool:
+    if hasattr(torch.ops, "world") and hasattr(torch.ops.world, "flex_attn_metal_ref"):
+        _register_attention_fake_kernels()
+        return True
+    return ensure_metal_attention_op_loaded()
diff --git a/src/model/attn.py b/src/model/attn.py
index c6024e8..aa2754b 100644
--- a/src/model/attn.py
+++ b/src/model/attn.py
@@ -2,11 +2,10 @@
 import einops as eo
 from torch import nn
 
-from torch.nn.attention.flex_attention import flex_attention
-
 from rotary_embedding_torch import RotaryEmbedding
 
 from .nn import rms_norm, NoCastModule
+from .attn_backend import AttnConfig, AttnMeta, world_flex_attn_forward
 
 
 class OrthoRoPEAngles(NoCastModule):
@@ -109,11 +108,24 @@ def forward(self, x, pos_ids, rope_angles, v1, kv_cache):
         q, k = rms_norm(q), rms_norm(k)
         q, k = self.rope(q, rope_angles), self.rope(k, rope_angles)
 
-        # Update KV-cache in-place
-        k, v, bm = kv_cache.upsert(k, v, pos_ids, self.layer_idx)
+        # Update KV-cache in-place; pass frame index once to avoid per-layer scalar extraction.
+        frame_idx = kv_cache.get_frame_idx(pos_ids)
+        k, v, bm, block_written, active_blocks, active_count, block_size = kv_cache.upsert(
+            k, v, pos_ids, self.layer_idx, frame_idx_int=frame_idx
+        )
 
-        # SDPA -> Attention Gate -> Out Proj
-        y = flex_attention(q, k, v, block_mask=bm, enable_gqa=self.enable_gqa)
+        # SDPA/Flex/Metal attention -> Attention Gate -> Out Proj
+        meta = AttnMeta(
+            flex_block_mask=bm,
+            q_len=q.size(2),
+            kv_len=k.size(2),
+            block_written=block_written,
+            active_blocks=active_blocks,
+            active_count=active_count,
+            block_size=block_size,
+        )
+        cfg = AttnConfig(causal=True, enable_gqa=self.enable_gqa)
+        y = world_flex_attn_forward(q, k, v, meta, cfg)
         if self.gated_attn:
             gates = torch.sigmoid(self.gate_proj(x[..., :self.n_heads]))
             y = y * gates.permute(0, 2, 1).unsqueeze(-1)
@@ -143,6 +155,12 @@ def forward(self, x, context, context_pad_mask=None):
         k = eo.rearrange(self.k_proj(context), "b t (h d) -> b h t d", h=self.n_heads)
         v = eo.rearrange(self.v_proj(context), "b t (h d) -> b h t d", h=self.n_heads)
         q, k = rms_norm(q), rms_norm(k)
-        out = flex_attention(q, k, v)
+        meta = AttnMeta(
+            flex_block_mask=None,
+            q_len=q.size(2),
+            kv_len=k.size(2),
+        )
+        cfg = AttnConfig(causal=False, enable_gqa=False)
+        out = world_flex_attn_forward(q, k, v, meta, cfg)
         out = out.transpose(1, 2).contiguous().reshape(x.size(0), x.size(1), -1)
         return self.out_proj(out)
diff --git a/src/model/attn_backend.py b/src/model/attn_backend.py
new file mode 100644
index 0000000..3c71f98
--- /dev/null
+++ b/src/model/attn_backend.py
@@ -0,0 +1,170 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from enum import Enum
+from typing import Optional
+import os
+
+import torch
+from torch import Tensor
+
+from torch.nn.attention.flex_attention import flex_attention
+
+_METAL_IMPL_MODE = "fast" if os.getenv("WORLD_METAL_IMPL", "ref").lower() == "fast" else "ref"
+_METAL_FORCE_CAUSAL = os.getenv("WORLD_METAL_FORCE_CAUSAL", "0") == "1"
+_METAL_IMPL_RAW = os.getenv("WORLD_METAL_IMPL", "ref").lower()
+_METAL_FORCE_CAUSAL_RAW = os.getenv("WORLD_METAL_FORCE_CAUSAL", "0")
+
+
+class AttnBackend(str, Enum):
+    PYTORCH_FLEX = "pytorch-flex"
+    METAL = "metal-op"
+    AUTO = "auto"
+
+    @staticmethod
+    def default() -> "AttnBackend":
+        """
+        Resolve the default backend from the WORLD_ATTENTION_BACKEND env var.
+        """
+        import os
+
+        value = os.getenv("WORLD_ATTENTION_BACKEND", "pytorch-flex").lower()
+        if value == "metal":
+            return AttnBackend.METAL
+        if value == "auto":
+            return AttnBackend.AUTO
+        return AttnBackend.PYTORCH_FLEX
+
+
+def _metal_impl_mode() -> str:
+    # WORLD_METAL_IMPL=ref|fast
+    global _METAL_IMPL_MODE, _METAL_IMPL_RAW
+    raw = os.getenv("WORLD_METAL_IMPL", "ref").lower()
+    if raw != _METAL_IMPL_RAW:
+        _METAL_IMPL_RAW = raw
+        _METAL_IMPL_MODE = "fast" if raw == "fast" else "ref"
+    return _METAL_IMPL_MODE
+
+
+def _metal_use_causal(cfg: "AttnConfig") -> bool:
+    """
+    Keep Metal behavior aligned with flex_attention backend semantics.
+
+    The flex path in this repo does not pass `causal` explicitly into
+    `flex_attention`; masking semantics are encoded by BlockMask metadata.
+    To preserve CPU/CUDA parity, Metal defaults to non-causal unless
+    explicitly overridden.
+    """
+    global _METAL_FORCE_CAUSAL, _METAL_FORCE_CAUSAL_RAW
+    raw = os.getenv("WORLD_METAL_FORCE_CAUSAL", "0")
+    if raw != _METAL_FORCE_CAUSAL_RAW:
+        _METAL_FORCE_CAUSAL_RAW = raw
+        _METAL_FORCE_CAUSAL = raw == "1"
+    if _METAL_FORCE_CAUSAL:
+        return bool(cfg.causal)
+    return False
+
+
+@dataclass
+class AttnConfig:
+    """
+    Backend-agnostic attention configuration.
+
+    This object is intentionally small and forward-only: it encodes only what
+    the kernel needs at runtime. Training- and autograd-specific concerns are
+    out of scope for the hybrid Metal inference path.
+    """
+
+    causal: bool = True
+    enable_gqa: bool = False
+
+
+@dataclass
+class AttnMeta:
+    """
+    Backend-agnostic metadata describing the KV layout for a single attention
+    call. This is the hook where we will eventually encode block/window
+    sparsity and cache positions for the Metal kernel.
+
+    For the initial implementation, we allow passing the existing BlockMask
+    object through as `flex_block_mask` so the PyTorch flex backend can keep
+    working while we design a compact Metal-friendly format. In parallel we
+    expose basic sequence length information that the Metal backend will use
+    to size its tiles.
+    """
+
+    # Optional flex BlockMask used by the PyTorch flex backend today.
+    flex_block_mask: Optional[object] = None
+
+    # Logical query and KV lengths for this attention call.
+    q_len: Optional[int] = None
+    kv_len: Optional[int] = None
+    block_written: Optional[Tensor] = None
+    active_blocks: Optional[Tensor] = None
+    active_count: Optional[Tensor] = None
+    block_size: Optional[int] = None
+
+    # Future fields for the Metal backend (block size, bucket indices, validity
+    # masks, etc.) will live here as we iterate on the sparsity encoding.
+
+
+def world_flex_attn_forward(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    meta: Optional[AttnMeta],
+    cfg: AttnConfig,
+    backend: Optional[AttnBackend] = None,
+) -> Tensor:
+    """
+    Backend-neutral attention entrypoint used by high-level modules.
+
+    Args:
+        q, k, v: [B, H, T, Dh] tensors on the same device (MPS, CUDA, etc.).
+        meta:   Backend-agnostic metadata describing KV/cache layout.
+        cfg:    Small configuration object with behavioral flags.
+        backend:
+            - PYTORCH_FLEX: call torch.nn.attention.flex_attention directly.
+            - METAL:        call the custom Metal op (to be added).
+            - AUTO:         choose PYTORCH_FLEX or METAL based on device /
+                            availability.
+    """
+    if backend is None:
+        backend = AttnBackend.default()
+
+    if backend is AttnBackend.AUTO:
+        backend = AttnBackend.METAL if q.device.type == "mps" else AttnBackend.PYTORCH_FLEX
+
+    if backend is AttnBackend.PYTORCH_FLEX:
+        block_mask = meta.flex_block_mask if meta is not None else None
+        return flex_attention(q, k, v, block_mask=block_mask, enable_gqa=cfg.enable_gqa)
+
+    if backend is AttnBackend.METAL:
+        mask = None
+        mode = _metal_impl_mode()
+        use_causal = _metal_use_causal(cfg)
+        if mode == "fast":
+            # Keep compile signatures stable: `active_blocks` grows with history,
+            # while `block_written` has fixed per-layer shape.
+            if torch.compiler.is_compiling() and meta is not None and meta.block_size is not None:
+                if meta.active_blocks is not None and meta.active_count is not None:
+                    return torch.ops.world.flex_attn_metal_fast_active_counted(
+                        q, k, v, meta.active_blocks, meta.active_count, int(meta.block_size), use_causal
+                    )
+                if meta.block_written is not None:
+                    return torch.ops.world.flex_attn_metal_fast_blocks_direct(
+                        q, k, v, meta.block_written, int(meta.block_size), use_causal
+                    )
+            if meta is not None and meta.active_blocks is not None and meta.block_size is not None:
+                return torch.ops.world.flex_attn_metal_fast_active(
+                    q, k, v, meta.active_blocks, int(meta.block_size), use_causal
+                )
+            if meta is not None and meta.block_written is not None and meta.block_size is not None:
+                return torch.ops.world.flex_attn_metal_fast_blocks(
+                    q, k, v, meta.block_written, int(meta.block_size), use_causal
+                )
+            return torch.ops.world.flex_attn_metal_fast(q, k, v, mask, use_causal)
+        return torch.ops.world.flex_attn_metal_ref(q, k, v, mask, use_causal)
+
+    raise ValueError(f"Unknown attention backend: {backend}")
+
diff --git a/src/model/kv_cache.py b/src/model/kv_cache.py
index 898b8a9..114047f 100644
--- a/src/model/kv_cache.py
+++ b/src/model/kv_cache.py
@@ -2,6 +2,7 @@
 import torch
 from torch import nn
 from tensordict import TensorDict
+import os
 
 from torch.nn.attention.flex_attention import (
     _DEFAULT_SPARSE_BLOCK_SIZE,
@@ -9,7 +10,7 @@
 )
 
 
-def make_block_mask(T: int, L: int, written: torch.Tensor) -> BlockMask:
+def make_block_mask(T: int, L: int, written: torch.Tensor):
     """
     T: Q length for this frame
     L: KV capacity == written.numel()
@@ -58,7 +59,41 @@ def mask_mod(b, h, q, kv):
         compute_q_blocks=False,  # no backward, avoids the transpose/_ordered_to_dense path
     )
 
-    return bm
+    return bm, block_any.contiguous()
+
+
+def _block_any_for_size(written: torch.Tensor, block_size: int) -> torch.Tensor:
+    kv_blocks = (written.numel() + block_size - 1) // block_size
+    padded = torch.nn.functional.pad(written, (0, kv_blocks * block_size - written.numel()))
+    return padded.view(kv_blocks, block_size).any(-1).contiguous()
+
+
+def _metal_block_size() -> int:
+    env = os.environ.get("WORLD_METAL_BLOCK_SIZE")
+    if env is None:
+        return 4
+    try:
+        parsed = int(env)
+        return parsed if parsed > 0 else 4
+    except ValueError:
+        return 4
+
+
+def _kv_runtime_checks_enabled() -> bool:
+    return os.environ.get("WORLD_KV_RUNTIME_CHECKS", "0") == "1"
+
+
+def _compute_active_blocks_enabled() -> bool:
+    return os.environ.get("WORLD_KV_COMPUTE_ACTIVE_BLOCKS", "0") == "1"
+
+
+def _using_metal_backend() -> bool:
+    return os.environ.get("WORLD_ATTENTION_BACKEND", "flex").lower() == "metal"
+
+
+def _normal_tensor(factory):
+    with torch.inference_mode(False):
+        return factory()
 
 
 class LayerKVCache(nn.Module):
@@ -76,6 +111,7 @@ def __init__(self, B, H, L, Dh, dtype, tokens_per_frame: int, pinned_dilation: i
         self.pinned_dilation = pinned_dilation
         self.num_buckets = (L // self.tpf) // self.pinned_dilation
         assert (L // self.tpf) % pinned_dilation == 0 and L % self.tpf == 0
+        self._num_buckets_mask = (self.num_buckets - 1) if (self.num_buckets & (self.num_buckets - 1)) == 0 else -1
 
         # KV buffer: [2, B, H, capacity, Dh]
         self.kv = nn.Buffer(
@@ -89,64 +125,237 @@ def __init__(self, B, H, L, Dh, dtype, tokens_per_frame: int, pinned_dilation: i
         written[L:] = True
         self.written = nn.Buffer(written, persistent=False)
         self._mask_written = nn.Buffer(torch.empty_like(written), persistent=False)
+        self._block_written = nn.Buffer(torch.empty(0, dtype=torch.uint8), persistent=False)
+        self._all_blocks_i32 = nn.Buffer(torch.empty(0, dtype=torch.int32), persistent=False)
+        self._tmp_block_written = nn.Buffer(torch.empty(0, dtype=torch.uint8), persistent=False)
+        self._compile_active_blocks = nn.Buffer(torch.empty(0, dtype=torch.int32), persistent=False)
+        self._compile_active_count = nn.Buffer(torch.zeros((), dtype=torch.int32), persistent=False)
+        self._metal_bs_cache = 0
+        self._blocks_per_frame = 0
+        self._seen_slots: set[int] = set()
+        self._seen_slots_ordered: list[int] = []
+        self._slot_block_ranges: list[torch.Tensor] = []
+        self._tail_block_range: torch.Tensor | None = None
+        self._all_blocks_except_slot: list[torch.Tensor] = []
+        self._can_build_active_without_nonzero = False
 
         # Precompute indices:
         #   frame_offsets: [0, 1, ..., tpf-1] (for ring indexing)
         #   current_idx:   [L, L+1, ..., L+tpf-1] (tail slice)
         self.frame_offsets = nn.Buffer(torch.arange(self.tpf, dtype=torch.long), persistent=False)
         self.current_idx = nn.Buffer(self.frame_offsets + L, persistent=False)
+        self._metal_backend = _using_metal_backend()
+        self._need_active_metadata = self._metal_backend or _compute_active_blocks_enabled()
+        self._configured_metal_bs = _metal_block_size()
 
     def reset(self):
         self.kv.zero_()
         self.written.zero_()
         self.written[self.L:].fill_(True)
+        self._metal_bs_cache = 0
+        self._blocks_per_frame = 0
+        self._seen_slots.clear()
+        self._seen_slots_ordered = []
+        self._slot_block_ranges = []
+        self._tail_block_range = None
+        self._all_blocks_except_slot = []
+        self._can_build_active_without_nonzero = False
+        if self._block_written.numel() > 0:
+            self._block_written.zero_()
+        if self._all_blocks_i32.numel() > 0:
+            self._all_blocks_i32 = _normal_tensor(lambda: self._all_blocks_i32.new_empty((0,), dtype=torch.int32))
+        if self._compile_active_blocks.numel() > 0:
+            self._compile_active_blocks = _normal_tensor(
+                lambda: self._compile_active_blocks.new_empty((0,), dtype=torch.int32)
+            )
+        self._compile_active_count.zero_()
+        if self._tmp_block_written.numel() > 0:
+            self._tmp_block_written.zero_()
+
+    def _ensure_block_written(self, metal_bs: int):
+        if self._metal_bs_cache == metal_bs and self._block_written.numel() > 0:
+            return
+        block_any = _block_any_for_size(self.written, metal_bs).to(torch.uint8).contiguous()
+        self._block_written = _normal_tensor(lambda: block_any.clone())
+        if self._tmp_block_written.numel() != block_any.numel():
+            self._tmp_block_written = _normal_tensor(lambda: torch.empty_like(block_any))
+        self._all_blocks_i32 = _normal_tensor(
+            lambda: torch.arange(block_any.numel(), device=block_any.device, dtype=torch.int32)
+        )
+        self._compile_active_blocks = _normal_tensor(lambda: torch.empty_like(self._all_blocks_i32))
+        self._compile_active_count.zero_()
+        self._metal_bs_cache = metal_bs
+        self._blocks_per_frame = (self.tpf + metal_bs - 1) // metal_bs
+        self._slot_block_ranges = []
+        self._tail_block_range = None
+        self._all_blocks_except_slot = []
+        self._can_build_active_without_nonzero = False
+        if (self.tpf % metal_bs) == 0 and (self.L % metal_bs) == 0:
+            frame_blocks = self.tpf // metal_bs
+            base = _normal_tensor(lambda: torch.arange(frame_blocks, device=block_any.device, dtype=torch.int32))
+            self._slot_block_ranges = [
+                _normal_tensor(lambda slot=slot: (base + (slot * frame_blocks)).contiguous())
+                for slot in range(self.num_buckets)
+            ]
+            self._tail_block_range = _normal_tensor(lambda: (base + (self.L // metal_bs)).contiguous())
+            self._all_blocks_except_slot = []
+            for slot in range(self.num_buckets):
+                start = slot * frame_blocks
+                end = start + frame_blocks
+                left = self._all_blocks_i32.narrow(0, 0, start)
+                right = self._all_blocks_i32.narrow(0, end, self._all_blocks_i32.numel() - end)
+                if left.numel() == 0:
+                    self._all_blocks_except_slot.append(_normal_tensor(lambda right=right: right.contiguous()))
+                elif right.numel() == 0:
+                    self._all_blocks_except_slot.append(_normal_tensor(lambda left=left: left.contiguous()))
+                else:
+                    self._all_blocks_except_slot.append(
+                        _normal_tensor(lambda left=left, right=right: torch.cat((left, right), dim=0).contiguous())
+                    )
+            self._can_build_active_without_nonzero = True
+        self._refresh_compile_active_blocks()
+
+    def _refresh_compile_active_blocks(self):
+        if self._all_blocks_i32.numel() == 0:
+            return
+        active = torch.nonzero(self._block_written, as_tuple=False).flatten().to(torch.int32).contiguous()
+        count = int(active.numel())
+        self._compile_active_blocks.zero_()
+        if count > 0:
+            self._compile_active_blocks.narrow(0, 0, count).copy_(active)
+        self._compile_active_count.fill_(count)
+
+    def rebuild_seen_slots(self):
+        ring_tokens = self.num_buckets * self.tpf
+        ring_written = self.written.narrow(0, 0, ring_tokens).view(self.num_buckets, self.tpf)
+        occupied = ring_written.any(dim=1).to("cpu")
+        self._seen_slots = {i for i, w in enumerate(occupied.tolist()) if bool(w)}
+        self._seen_slots_ordered = sorted(self._seen_slots)
+
+    def _active_blocks_for_block_written(
+        self,
+        block_written: torch.Tensor,
+        write_step: bool,
+        ring_block_start: int,
+        slot: int,
+    ) -> torch.Tensor:
+        if len(self._seen_slots) == self.num_buckets:
+            if not write_step:
+                return self._all_blocks_i32
+            if self._all_blocks_except_slot:
+                return self._all_blocks_except_slot[slot]
+
+        if self._can_build_active_without_nonzero and self._tail_block_range is not None:
+            parts: list[torch.Tensor] = []
+            for seen_slot in self._seen_slots_ordered:
+                if write_step and seen_slot == slot:
+                    continue
+                parts.append(self._slot_block_ranges[seen_slot])
+            parts.append(self._tail_block_range)
+            if len(parts) == 1:
+                return parts[0]
+            return torch.cat(parts, dim=0).contiguous()
+
+        return torch.nonzero(block_written, as_tuple=False).flatten().to(torch.int32).contiguous()
+
+    def _upsert_frozen_compiled(self, k: Tensor, v: Tensor):
+        T = self.tpf
+        self.kv[0].narrow(2, self.L, T).copy_(k)
+        self.kv[1].narrow(2, self.L, T).copy_(v)
+        k_out, v_out = self.kv.unbind(0)
+        return (
+            k_out, v_out, None, self._block_written,
+            self._compile_active_blocks, self._compile_active_count,
+            self._configured_metal_bs,
+        )
 
-    def upsert(self, kv: Tensor, pos_ids: TensorDict, is_frozen: bool):
+    def _upsert_impl(self, k: Tensor, v: Tensor, pos_ids: TensorDict, is_frozen: bool, frame_idx_int: int | None = None):
         """
-        kv: [2, B, H, T, Dh] for a single frame (T = tokens_per_frame)
+        k, v: [B, H, T, Dh] for a single frame (T = tokens_per_frame)
         t_pos: [B, T], all equal per frame (ignoring -1)
         """
+        compiling = torch.compiler.is_compiling()
+
+        if compiling and self._metal_backend and is_frozen:
+            self._ensure_block_written(self._configured_metal_bs)
+            return self._upsert_frozen_compiled(k, v)
+
         T = self.tpf
         f_pos = pos_ids["f_pos"]
 
-        if not torch.compiler.is_compiling():
-            torch._check(kv.size(3) == self.tpf, "KV cache expects exactly one frame per upsert")
-            torch._check(f_pos.shape == (kv.size(1), T), "t_pos must be [B, T]")
+        if _kv_runtime_checks_enabled() and not compiling:
+            torch._check(k.size(2) == self.tpf, "KV cache expects exactly one frame per upsert")
+            torch._check(k.shape == v.shape, "k and v must match for KV cache upsert")
+            torch._check(f_pos.shape == (k.size(0), T), "t_pos must be [B, T]")
             torch._check(self.tpf <= self.L, "frame longer than KV ring capacity")
             torch._check(self.L % self.tpf == 0, f"L ({self.L}) must be a multiple of tokens_per_frame ({self.tpf})")
             torch._check(self.kv.size(3) == self.capacity, "KV buffer too long (expected L + tokens_per_frame)")
             torch._check((f_pos >= 0).all().item(), "t_pos must be non-negative during inference")
             torch._check(((f_pos == f_pos[:, :1]).all()).item(), "t_pos must be constant within frame")
 
-        frame_idx = f_pos[0, 0]
+        frame_idx = int(f_pos[0, 0].item()) if frame_idx_int is None else int(frame_idx_int)
 
-        # map frame_t to a bucket, each bucket owns T contiguous slots
         bucket = (frame_idx + (self.pinned_dilation - 1)) // self.pinned_dilation
-        slot = bucket % self.num_buckets
+        slot = (bucket & self._num_buckets_mask) if self._num_buckets_mask >= 0 else (bucket % self.num_buckets)
         base = slot * T
 
-        # indices in the ring for this frame: [T] in [0, L)
-        ring_idx = self.frame_offsets + base
-
-        # Always write current frame into the tail slice [L, L+T):
-        # this is the "self-attention component" for the current frame.
-        self.kv.index_copy_(3, self.current_idx, kv)
-
-        write_step = (frame_idx.remainder(self.pinned_dilation) == 0)
-        mask_written = self._mask_written
-        mask_written.copy_(self.written)
-        mask_written[ring_idx] = mask_written[ring_idx] & ~write_step
-        bm = make_block_mask(T, self.capacity, mask_written)
+        ring_start = int(base)
+        ring_end = ring_start + T
+
+        self.kv[0].narrow(2, self.L, T).copy_(k)
+        self.kv[1].narrow(2, self.L, T).copy_(v)
+
+        bm = None
+        metal_bs = self._configured_metal_bs
+        need_active_metadata = self._need_active_metadata
+        write_step = (frame_idx % self.pinned_dilation) == 0
+        if self._metal_backend:
+            self._ensure_block_written(metal_bs)
+            ring_block_start = ring_start // metal_bs
+            if write_step:
+                block_written = _normal_tensor(lambda: self._block_written.clone())
+                block_written.narrow(0, ring_block_start, self._blocks_per_frame).zero_()
+            else:
+                block_written = self._block_written
+        else:
+            mask_written = _normal_tensor(lambda: self.written.clone())
+            if write_step:
+                mask_written.narrow(0, ring_start, T).fill_(False)
+            bm, _ = make_block_mask(T, self.capacity, mask_written)
+            block_written = _normal_tensor(
+                lambda: _block_any_for_size(mask_written, metal_bs).to(torch.uint8).contiguous()
+            )
+        active_blocks = None
+        active_count = None
+        if need_active_metadata:
+            ring_block_start = ring_start // metal_bs
+            active_blocks = self._active_blocks_for_block_written(block_written, write_step, ring_block_start, slot)
 
-        # Persist current frame into the ring for future queries when unfrozen.
         if not is_frozen:
-            # Persist current frame into the ring for future queries.
-            dst = torch.where(write_step, ring_idx, self.current_idx)
-            self.kv.index_copy_(3, dst, kv)
-            self.written[dst] = True
+            if write_step:
+                self.kv[0].narrow(2, ring_start, T).copy_(k)
+                self.kv[1].narrow(2, ring_start, T).copy_(v)
+                first_write_for_slot = (slot not in self._seen_slots)
+                if first_write_for_slot:
+                    self.written.narrow(0, ring_start, T).fill_(True)
+                    if need_active_metadata:
+                        self._seen_slots.add(slot)
+                        if slot not in self._seen_slots_ordered:
+                            self._seen_slots_ordered.append(slot)
+                            self._seen_slots_ordered.sort()
+                    if self._metal_backend:
+                        ring_block_start = ring_start // metal_bs
+                        self._block_written.narrow(0, ring_block_start, self._blocks_per_frame).fill_(1)
+                        self._refresh_compile_active_blocks()
 
         k, v = self.kv.unbind(0)
-        return k, v, bm
+        return k, v, bm, block_written, active_blocks, active_count, metal_bs
+
+    def upsert(self, kv: Tensor, pos_ids: TensorDict, is_frozen: bool, frame_idx_int: int | None = None):
+        return self._upsert_impl(kv[0], kv[1], pos_ids, is_frozen, frame_idx_int=frame_idx_int)
+
+    def upsert_separate(self, k: Tensor, v: Tensor, pos_ids: TensorDict, is_frozen: bool, frame_idx_int: int | None = None):
+        return self._upsert_impl(k, v, pos_ids, is_frozen, frame_idx_int=frame_idx_int)
 
 
 class StaticKVCache(nn.Module):
@@ -174,11 +383,19 @@ def __init__(self, config, batch_size, dtype):
         ])
 
         self._is_frozen = True
+        self._cached_fpos_ptr = -1
+        self._cached_fpos_version = -1
+        self._cached_fpos_value = 0
+        self._frame_idx_hint: int | None = None
 
     def reset(self):
         for layer in self.layers:
             layer.reset()
         self._is_frozen = True
+        self._cached_fpos_ptr = -1
+        self._cached_fpos_version = -1
+        self._cached_fpos_value = 0
+        self._frame_idx_hint = None
 
     @torch.inference_mode()
     def get_state(self):
@@ -191,10 +408,30 @@ def load_state(self, state):
         for layer, (kv, written) in zip(self.layers, state["layers"]):
             layer.kv.copy_(kv)
             layer.written.copy_(written)
+            layer.rebuild_seen_slots()
+            layer._metal_bs_cache = 0
+        self._cached_fpos_ptr = -1
+        self._cached_fpos_version = -1
+        self._cached_fpos_value = 0
+        self._frame_idx_hint = None
 
     def set_frozen(self, is_frozen: bool):
         self._is_frozen = is_frozen
 
-    def upsert(self, k: Tensor, v: Tensor, pos_ids: TensorDict, layer: int):
-        kv = torch.stack([k, v], dim=0)
-        return self.layers[layer].upsert(kv, pos_ids, self._is_frozen)
+    def set_frame_idx_int(self, frame_idx_int: int):
+        self._frame_idx_hint = int(frame_idx_int)
+
+    def get_frame_idx(self, pos_ids: TensorDict) -> int:
+        if self._frame_idx_hint is not None:
+            return self._frame_idx_hint
+        fpos = pos_ids["f_pos"]
+        ptr = int(fpos.data_ptr())
+        version = int(fpos._version)
+        if ptr != self._cached_fpos_ptr or version != self._cached_fpos_version:
+            self._cached_fpos_ptr = ptr
+            self._cached_fpos_version = version
+            self._cached_fpos_value = int(fpos[0, 0].item())
+        return self._cached_fpos_value
+
+    def upsert(self, k: Tensor, v: Tensor, pos_ids: TensorDict, layer: int, frame_idx_int: int | None = None):
+        return self.layers[layer].upsert_separate(k, v, pos_ids, self._is_frozen, frame_idx_int=frame_idx_int)
diff --git a/src/model/nn.py b/src/model/nn.py
index d4b2302..1b60d69 100644
--- a/src/model/nn.py
+++ b/src/model/nn.py
@@ -105,7 +105,11 @@ def __init__(self, dim, fourier_dim=512, base=10_000.0):
         super().__init__()
         assert fourier_dim % 2 == 0
         half = fourier_dim // 2
-        self.freq = nn.Buffer(torch.logspace(0, -1, steps=half, base=base, dtype=torch.float32), persistent=False)
+        # Build on CPU to avoid MPS missing logspace kernel during model init.
+        self.freq = nn.Buffer(
+            torch.logspace(0, -1, steps=half, base=base, dtype=torch.float32, device="cpu"),
+            persistent=False,
+        )
         self.mlp = MLP(fourier_dim, dim * 4, dim)
 
     def forward(self, s, eps=torch.finfo(torch.float32).eps):
diff --git a/src/model/world_model.py b/src/model/world_model.py
index 8ea6053..85403ef 100644
--- a/src/model/world_model.py
+++ b/src/model/world_model.py
@@ -219,9 +219,7 @@ def __init__(self, config):
     def forward(self, x: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
         B, _, D = x.shape
         L = cond.shape[1]
-
         Wx, Wc = self.mlp.fc1.weight.chunk(2, dim=1)  # each [D, D]
-
         x = x.view(B, L, -1, D)
         h = F.linear(x, Wx) + F.linear(cond, Wc).unsqueeze(2)  # broadcast, no repeat/cat
         h = F.silu(h)
@@ -262,36 +260,108 @@ def __init__(self, config, layer_idx):
         do_ctrl_cond = config.ctrl_conditioning_period is not None and layer_idx % config.ctrl_conditioning_period == 0
         self.ctrl_mlpfusion = MLPFusion(config) if do_ctrl_cond else None
 
-    def forward(self, x, pos_ids, rope_angles, cond, ctx, v, kv_cache=None):
-        """
-        0) Causal Frame Attention
-        1) Frame->CTX Cross Attention
-        2) MLP
-        """
+        if do_prompt_cond and do_ctrl_cond:
+            self.forward_first = self._forward_first_prompt_ctrl
+            self.forward_with_residual = self._forward_with_residual_prompt_ctrl
+        elif do_prompt_cond:
+            self.forward_first = self._forward_first_prompt_only
+            self.forward_with_residual = self._forward_with_residual_prompt_only
+        elif do_ctrl_cond:
+            self.forward_first = self._forward_first_ctrl_only
+            self.forward_with_residual = self._forward_with_residual_ctrl_only
+        else:
+            self.forward_first = self._forward_first_plain
+            self.forward_with_residual = self._forward_with_residual_plain
+
+    def _self_attend_first(self, x, pos_ids, rope_angles, cond, kv_cache=None, layer_cache=None):
         s0, b0, g0, s1, b1, g1 = self.cond_head(cond)
+        residual = x.contiguous()
+        x = ada_rmsnorm(x, s0, b0)
+        x, v = self.attn.forward_first(x, pos_ids, rope_angles, kv_cache=kv_cache, layer_cache=layer_cache)
+        x = (ada_gate(x, g0) + residual).contiguous()
+        return x, v, s1, b1, g1
 
-        # Self / Causal Attention
-        residual = x
+    def _self_attend_with_residual(self, x, pos_ids, rope_angles, cond, v, kv_cache=None, layer_cache=None):
+        s0, b0, g0, s1, b1, g1 = self.cond_head(cond)
+        residual = x.contiguous()
         x = ada_rmsnorm(x, s0, b0)
-        x, v = self.attn(x, pos_ids, rope_angles, v, kv_cache=kv_cache)
-        x = ada_gate(x, g0) + residual
-
-        # Cross Attention Prompt Conditioning
-        if self.prompt_cross_attn is not None:
-            x = self.prompt_cross_attn(
-                rms_norm(x),
-                context=rms_norm(ctx["prompt_emb"]),
-                context_pad_mask=ctx["prompt_pad_mask"],
-            ) + x
-
-        # MLPFusion Controller Conditioning
-        if self.ctrl_mlpfusion is not None:
-            x = self.ctrl_mlpfusion(rms_norm(x), rms_norm(ctx["ctrl_emb"])) + x
-
-        # MLP
+        x, v = self.attn.forward_with_residual(x, pos_ids, rope_angles, v, kv_cache=kv_cache, layer_cache=layer_cache)
+        x = (ada_gate(x, g0) + residual).contiguous()
+        return x, v, s1, b1, g1
+
+    def _finish_mlp(self, x, s1, b1, g1):
         x = ada_gate(self.mlp(ada_rmsnorm(x, s1, b1)), g1) + x
+        return x.contiguous()
+
+    def _apply_prompt(self, x, ctx):
+        return self.prompt_cross_attn(
+            rms_norm(x),
+            context=rms_norm(ctx["prompt_emb"]),
+            context_pad_mask=ctx["prompt_pad_mask"],
+        ) + x
+
+    def _apply_ctrl(self, x, ctx):
+        return self.ctrl_mlpfusion(rms_norm(x), rms_norm(ctx["ctrl_emb"])) + x
+
+    def _forward_first_plain(self, x, pos_ids, rope_angles, cond, ctx, kv_cache=None, layer_cache=None):
+        x, v, s1, b1, g1 = self._self_attend_first(
+            x, pos_ids, rope_angles, cond, kv_cache=kv_cache, layer_cache=layer_cache
+        )
+        return self._finish_mlp(x, s1, b1, g1), v
 
-        return x, v
+    def _forward_first_prompt_only(self, x, pos_ids, rope_angles, cond, ctx, kv_cache=None, layer_cache=None):
+        x, v, s1, b1, g1 = self._self_attend_first(
+            x, pos_ids, rope_angles, cond, kv_cache=kv_cache, layer_cache=layer_cache
+        )
+        x = self.prompt_cross_attn(
+            rms_norm(x),
+            context=rms_norm(ctx["prompt_emb"]),
+            context_pad_mask=ctx["prompt_pad_mask"],
+        ) + x
+        return self._finish_mlp(x, s1, b1, g1), v
+
+    def _forward_first_ctrl_only(self, x, pos_ids, rope_angles, cond, ctx, kv_cache=None, layer_cache=None):
+        x, v, s1, b1, g1 = self._self_attend_first(
+            x, pos_ids, rope_angles, cond, kv_cache=kv_cache, layer_cache=layer_cache
+        )
+        x = self._apply_ctrl(x, ctx)
+        return self._finish_mlp(x, s1, b1, g1), v
+
+    def _forward_first_prompt_ctrl(self, x, pos_ids, rope_angles, cond, ctx, kv_cache=None, layer_cache=None):
+        x, v, s1, b1, g1 = self._self_attend_first(
+            x, pos_ids, rope_angles, cond, kv_cache=kv_cache, layer_cache=layer_cache
+        )
+        x = self._apply_prompt(x, ctx)
+        x = self._apply_ctrl(x, ctx)
+        return self._finish_mlp(x, s1, b1, g1), v
+
+    def _forward_with_residual_plain(self, x, pos_ids, rope_angles, cond, ctx, v, kv_cache=None, layer_cache=None):
+        x, v, s1, b1, g1 = self._self_attend_with_residual(
+            x, pos_ids, rope_angles, cond, v, kv_cache=kv_cache, layer_cache=layer_cache
+        )
+        return self._finish_mlp(x, s1, b1, g1), v
+
+    def _forward_with_residual_prompt_only(self, x, pos_ids, rope_angles, cond, ctx, v, kv_cache=None, layer_cache=None):
+        x, v, s1, b1, g1 = self._self_attend_with_residual(
+            x, pos_ids, rope_angles, cond, v, kv_cache=kv_cache, layer_cache=layer_cache
+        )
+        x = self._apply_prompt(x, ctx)
+        return self._finish_mlp(x, s1, b1, g1), v
+
+    def _forward_with_residual_ctrl_only(self, x, pos_ids, rope_angles, cond, ctx, v, kv_cache=None, layer_cache=None):
+        x, v, s1, b1, g1 = self._self_attend_with_residual(
+            x, pos_ids, rope_angles, cond, v, kv_cache=kv_cache, layer_cache=layer_cache
+        )
+        x = self._apply_ctrl(x, ctx)
+        return self._finish_mlp(x, s1, b1, g1), v
+
+    def _forward_with_residual_prompt_ctrl(self, x, pos_ids, rope_angles, cond, ctx, v, kv_cache=None, layer_cache=None):
+        x, v, s1, b1, g1 = self._self_attend_with_residual(
+            x, pos_ids, rope_angles, cond, v, kv_cache=kv_cache, layer_cache=layer_cache
+        )
+        x = self._apply_prompt(x, ctx)
+        x = self._apply_ctrl(x, ctx)
+        return self._finish_mlp(x, s1, b1, g1), v
 
 
 class WorldDiT(nn.Module):
@@ -309,9 +379,17 @@ def __init__(self, config):
 
     def forward(self, x, pos_ids, cond, ctx, kv_cache=None):
         rope_angles = self.rope_angles(pos_ids)
-        v = None
-        for i, block in enumerate(self.blocks):
-            x, v = block(x, pos_ids, rope_angles, cond, ctx, v, kv_cache=kv_cache)
+        if not self.blocks:
+            return x
+        layer_iter = zip(self.blocks, kv_cache.layers)
+        first_block, first_layer_cache = next(layer_iter)
+        x, v = first_block.forward_first(
+            x, pos_ids, rope_angles, cond, ctx, kv_cache=kv_cache, layer_cache=first_layer_cache
+        )
+        for block, layer_cache in layer_iter:
+            x, v = block.forward_with_residual(
+                x, pos_ids, rope_angles, cond, ctx, v, kv_cache=kv_cache, layer_cache=layer_cache
+            )
         return x
 
 
diff --git a/src/patch_model.py b/src/patch_model.py
index 817f220..69b7b99 100644
--- a/src/patch_model.py
+++ b/src/patch_model.py
@@ -1,3 +1,4 @@
+import os
 import torch
 from torch import nn, Tensor
 import torch.nn.functional as F
@@ -5,64 +6,70 @@
 from .model.nn import rms_norm
 from .model.attn import Attn
 from .model.world_model import MLPFusion
-from torch.nn.attention.flex_attention import flex_attention
+from .model.attn_backend import AttnConfig, AttnMeta, world_flex_attn_forward
 
 
-def _bf16_u16(x: Tensor) -> Tensor:
-    # reinterpret bf16 storage as int16 -> unsigned 0..65535 in int32
+def _f16_u16(x: Tensor) -> Tensor:
+    # reinterpret fp16/bf16 storage as int16 -> unsigned 0..65535 in int32
+    if x.dtype not in (torch.bfloat16, torch.float16):
+        raise RuntimeError(f"_f16_u16 expects float16/bfloat16, got {x.dtype}")
     return x.contiguous().view(torch.int16).to(torch.int32) & 0xFFFF
 
 
 class CachedDenoiseStepEmb(nn.Module):
-    """bf16 sigma -> bf16 embedding via 64k LUT; invalid sigma => OOB index error (no silent wrong)."""
-    def __init__(self, base: nn.Module, sigmas: list[float]):
+    """f16 sigma -> f16 embedding via 64k LUT; invalid sigma => OOB index error (no silent wrong)."""
+    def __init__(self, base: nn.Module, sigmas: list[float], dtype: torch.dtype):
         super().__init__()
         device = next(base.parameters()).device
+        if dtype not in (torch.bfloat16, torch.float16):
+            raise RuntimeError(f"CachedDenoiseStepEmb dtype must be float16/bfloat16, got {dtype}")
+        self.dtype = dtype
 
-        levels = torch.tensor(sigmas, device=device, dtype=torch.bfloat16)          # [S]
-        bits = _bf16_u16(levels)                                                   # [S]
+        levels = torch.tensor(sigmas, device=device, dtype=dtype)                  # [S]
+        bits = _f16_u16(levels)                                                    # [S]
         if torch.unique(bits).numel() != bits.numel():
-            raise ValueError("scheduler_sigmas collide in bf16; caching would be ambiguous")
+            raise ValueError(f"scheduler_sigmas collide in {dtype}; caching would be ambiguous")
 
         with torch.no_grad():
-            table = base(levels[:, None]).squeeze(1).to(torch.bfloat16).contiguous()  # [S,D]
+            table = base(levels[:, None]).squeeze(1).to(dtype).contiguous()  # [S,D]
 
         lut = torch.full((65536,), -1, device=device, dtype=torch.int32)
         lut[bits] = torch.arange(bits.numel(), device=device, dtype=torch.int32)
 
-        self.register_buffer("table", table, persistent=False)                     # [S,D] bf16
+        self.register_buffer("table", table, persistent=False)                     # [S,D] f16
         self.register_buffer("lut", lut, persistent=False)                         # [65536] int32
         self.register_buffer("oob", torch.tensor(bits.numel(), device=device, dtype=torch.int32), persistent=False)
 
     def forward(self, sigma: Tensor) -> Tensor:
-        if sigma.dtype is not torch.bfloat16:
-            raise RuntimeError("CachedDenoiseStepEmb expects sigma bf16")
-        idx = self.lut[_bf16_u16(sigma)]
+        if sigma.dtype != self.dtype:
+            raise RuntimeError(f"CachedDenoiseStepEmb expects sigma {self.dtype}, got {sigma.dtype}")
+        idx = self.lut[_f16_u16(sigma)]
         idx = torch.where(idx >= 0, idx, self.oob)                                 # invalid -> S (OOB)
-        return self.table[idx.to(torch.int64)]                                     # [...,D] bf16
+        return self.table[idx.to(torch.int64)]                                     # [...,D] f16
 
 
 class CachedCondHead(nn.Module):
-    """bf16 cond -> cached (s0,b0,g0,s1,b1,g1); invalid cond => OOB index error (no silent wrong)."""
+    """f16 cond -> cached (s0,b0,g0,s1,b1,g1); invalid cond => OOB index error (no silent wrong)."""
     def __init__(self, base, cached_denoise_step_emb: CachedDenoiseStepEmb, max_key_dims: int = 8):
         super().__init__()
         table = cached_denoise_step_emb.table                                      # [S,D] bf16
         S, D = table.shape
+        self.dtype = table.dtype
 
         with torch.no_grad():
             emb = table[:, None, :]                                                # [S,1,D]
-            cache = torch.stack([t.squeeze(1) for t in base(emb)], 0).to(torch.bfloat16).contiguous()  # [6,S,D]
+            cache = torch.stack([t.squeeze(1) for t in base(emb)], 0).to(table.dtype).contiguous()  # [6,S,D]
 
-        # pick a single embedding dimension whose bf16 bits uniquely identify sigma
+        # pick a single embedding dimension whose f16 bits uniquely identify sigma
         key_dim = None
         for d in range(min(D, max_key_dims)):
-            b = _bf16_u16(table[:, d])
+            b = _f16_u16(table[:, d])
             if torch.unique(b).numel() == S:
                 key_dim = d
                 key_bits = b
                 break
         if key_dim is None:
-            raise ValueError("Could not find a unique bf16 key dim for cond->sigma mapping; increase max_key_dims")
+            raise ValueError("Could not find a unique f16 key dim for cond->sigma mapping; increase max_key_dims")
 
         lut = torch.full((65536,), -1, device=table.device, dtype=torch.int32)
         lut[key_bits] = torch.arange(S, device=table.device, dtype=torch.int32)
@@ -73,17 +80,21 @@ def __init__(self, base, cached_denoise_step_emb: CachedDenoiseStepEmb, max_key_
         self.register_buffer("oob", torch.tensor(S, device=table.device, dtype=torch.int32), persistent=False)
 
     def forward(self, cond: Tensor):
-        if cond.dtype is not torch.bfloat16:
-            raise RuntimeError("CachedCondHead expects cond bf16")
-        idx = self.lut[_bf16_u16(cond[..., self.key_dim])]
+        if cond.dtype != self.dtype:
+            raise RuntimeError(f"CachedCondHead expects cond {self.dtype}, got {cond.dtype}")
+        idx = self.lut[_f16_u16(cond[..., self.key_dim])]
         idx = torch.where(idx >= 0, idx, self.oob)                                 # invalid -> S (OOB)
-        g = self.cache[:, idx.to(torch.int64)]                                     # [6,...,D] bf16 (or errors)
+        g = self.cache[:, idx.to(torch.int64)]                                     # [6,...,D] f16 (or errors)
         return tuple(g.unbind(0))                                                  # (s0,b0,g0,s1,b1,g1)
 
 
-def patch_cached_noise_conditioning(model) -> None:
-    # Call AFTER: model.to(device="cuda", dtype=torch.bfloat16).eval()
-    cached_denoise_step_emb = CachedDenoiseStepEmb(model.denoise_step_emb, model.config.scheduler_sigmas)
+def _metal_in_graph_attention_enabled() -> bool:
+    return os.environ.get("WORLD_METAL_ALLOW_IN_GRAPH_OP", "0") == "1"
+
+
+def patch_cached_noise_conditioning(model, dtype: torch.dtype) -> None:
+    # Call after model dtype/device are finalized.
+    cached_denoise_step_emb = CachedDenoiseStepEmb(model.denoise_step_emb, model.config.scheduler_sigmas, dtype=dtype)
     model.denoise_step_emb = cached_denoise_step_emb
     for blk in model.transformer.blocks:
         blk.cond_head = CachedCondHead(blk.cond_head, cached_denoise_step_emb)
@@ -98,6 +109,7 @@ def __init__(self, src: Attn, config):
 
         self.q_out = self.n_heads * self.d_head
         self.kv_out = self.n_kv_heads * self.d_head
+        self._layer_idx_const = int(src.layer_idx)
 
         self.qkv_proj = nn.Linear(
             self.q_proj.in_features,
@@ -113,31 +125,62 @@ def __init__(self, src: Attn, config):
 
         del self.q_proj, self.k_proj, self.v_proj
 
-    def forward(self, x, pos_ids, rope_angles, v1, kv_cache):
-        q, k, v = self.qkv_proj(x).split((self.q_out, self.kv_out, self.kv_out), dim=-1)
+    def _stateful_attention_impl(self, q, k, v, pos_ids, kv_cache, layer_cache=None):
+        layer_cache = kv_cache.layers[self._layer_idx_const] if layer_cache is None else layer_cache
+        frame_idx = 0 if torch.compiler.is_compiling() else kv_cache.get_frame_idx(pos_ids)
+        k, v, bm, block_written, active_blocks, active_count, block_size = layer_cache.upsert_separate(
+            k, v, pos_ids, kv_cache._is_frozen, frame_idx_int=frame_idx
+        )
+        meta = AttnMeta(
+            flex_block_mask=bm,
+            q_len=q.size(2),
+            kv_len=k.size(2),
+            block_written=block_written,
+            active_blocks=active_blocks,
+            active_count=active_count,
+            block_size=block_size,
+        )
+        cfg = AttnConfig(causal=True, enable_gqa=self.enable_gqa)
+        return world_flex_attn_forward(q, k, v, meta, cfg)
 
-        B, T = x.shape[:2]
-        q = q.reshape(B, T, self.n_heads, self.d_head).transpose(1, 2)
-        k = k.reshape(B, T, self.n_kv_heads, self.d_head).transpose(1, 2)
-        v = v.reshape(B, T, self.n_kv_heads, self.d_head).transpose(1, 2)
+    @torch._dynamo.disable
+    def _stateful_attention_eager(self, q, k, v, pos_ids, kv_cache, layer_cache=None):
+        return self._stateful_attention_impl(q, k, v, pos_ids, kv_cache, layer_cache=layer_cache)
 
-        if self.value_residual:
-            v1 = v if v1 is None else v1
-            v = torch.lerp(v, v1.view_as(v), self.v_lamb)
+    def _stateful_attention(self, q, k, v, pos_ids, kv_cache, layer_cache=None):
+        if _metal_in_graph_attention_enabled():
+            return self._stateful_attention_impl(q, k, v, pos_ids, kv_cache, layer_cache=layer_cache)
+        return self._stateful_attention_eager(q, k, v, pos_ids, kv_cache, layer_cache=layer_cache)
+
+    def _project_qkv(self, x):
+        q, k, v = self.qkv_proj(x).split((self.q_out, self.kv_out, self.kv_out), dim=-1)
+        B, T = x.shape[:2]
+        q = q.reshape(B, T, self.n_heads, self.d_head).transpose(1, 2).contiguous()
+        k = k.reshape(B, T, self.n_kv_heads, self.d_head).transpose(1, 2).contiguous()
+        v = v.reshape(B, T, self.n_kv_heads, self.d_head).transpose(1, 2).contiguous()
+        return q, k, v
 
+    def _finish_attention(self, x, q, k, v, pos_ids, rope_angles, kv_cache, layer_cache=None):
+        B, T = x.shape[:2]
         q, k = rms_norm(q), rms_norm(k)
         q, k = self.rope(q, rope_angles), self.rope(k, rope_angles)
-
-        k, v, bm = kv_cache.upsert(k, v, pos_ids, self.layer_idx)
-
-        y = flex_attention(q, k, v, block_mask=bm, enable_gqa=self.enable_gqa)
-
+        y = self._stateful_attention(q, k, v, pos_ids, kv_cache, layer_cache=layer_cache)
         if self.gated_attn:
             gates = torch.sigmoid(self.gate_proj(x[..., : self.n_heads]))
             y = y * gates.permute(0, 2, 1).unsqueeze(-1)
-
         y = y.transpose(1, 2).reshape(B, T, -1)
-        y = self.out_proj(y)
+        return self.out_proj(y)
+
+    def forward_first(self, x, pos_ids, rope_angles, kv_cache, layer_cache=None):
+        q, k, v = self._project_qkv(x)
+        y = self._finish_attention(x, q, k, v, pos_ids, rope_angles, kv_cache, layer_cache=layer_cache)
+        return y, v
+
+    def forward_with_residual(self, x, pos_ids, rope_angles, v1, kv_cache, layer_cache=None):
+        q, k, v = self._project_qkv(x)
+        if self.value_residual:
+            v = torch.lerp(v, v1.view_as(v), self.v_lamb)
+        y = self._finish_attention(x, q, k, v, pos_ids, rope_angles, kv_cache, layer_cache=layer_cache)
         return y, v1
 
 
@@ -180,6 +223,9 @@ def patch_MLPFusion_split(model) -> None:
 
 
 def apply_inference_patches(model) -> None:
-    patch_cached_noise_conditioning(model)
+    model_dtype = next(model.parameters()).dtype
+    if model_dtype in (torch.bfloat16, torch.float16):
+        patch_cached_noise_conditioning(model, dtype=model_dtype)
     patch_Attn_merge_qkv(model)
     patch_MLPFusion_split(model)
+
diff --git a/src/quantize.py b/src/quantize.py
index b74825b..d428816 100644
--- a/src/quantize.py
+++ b/src/quantize.py
@@ -2,6 +2,7 @@
 
 import torch
 import torch.nn as nn
+import warnings
 
 
 QUANTS = [None]  # TODO: enable specific quant based on model config, which should specify compatible quants [None, "w8a8", "fp8"]
@@ -188,11 +189,14 @@ def quantize_model(model: nn.Module, quant: str):
     if quant is None:
         return model
 
+    def is_linear(m: nn.Module) -> bool:
+        return isinstance(m, nn.Linear) and (getattr(m, "weight", None) is not None)
+
     def eligible(m: nn.Module) -> bool:
-        w = getattr(m, "weight", None)
-        if not isinstance(m, nn.Linear):
+        if not is_linear(m):
             return False
-        if getattr(w, "dtype", None) != torch.bfloat16:
+        w = m.weight
+        if w.dtype not in (torch.bfloat16, torch.float16):
             return False
         o, k = w.shape
         return (o % 32 == 0) and (k % 32 == 0)
@@ -203,8 +207,51 @@ def eligible(m: nn.Module) -> bool:
         "fp8": FP8Linear,
     }[quant]
 
-    for name, child in model.named_children():
-        setattr(model, name, new_linear(child)) if eligible(child) else quantize_model(
-            child, quant
+    stats = {
+        "quant": quant,
+        "total_linear_layers": 0,
+        "eligible_linear_layers": 0,
+        "converted_linear_layers": 0,
+        "failed_linear_layers": 0,
+        "total_linear_weight_elems": 0,
+        "eligible_weight_elems": 0,
+        "converted_weight_elems": 0,
+        "failed_examples": [],
+    }
+
+    def walk(mod: nn.Module):
+        for name, child in mod.named_children():
+            if is_linear(child):
+                w = child.weight
+                elems = int(w.numel())
+                stats["total_linear_layers"] += 1
+                stats["total_linear_weight_elems"] += elems
+                if eligible(child):
+                    stats["eligible_linear_layers"] += 1
+                    stats["eligible_weight_elems"] += elems
+                    try:
+                        setattr(mod, name, new_linear(child))
+                        stats["converted_linear_layers"] += 1
+                        stats["converted_weight_elems"] += elems
+                    except Exception as exc:
+                        stats["failed_linear_layers"] += 1
+                        if len(stats["failed_examples"]) < 8:
+                            stats["failed_examples"].append(f"{name}: {type(exc).__name__}: {exc}")
+                continue
+            walk(child)
+
+    walk(model)
+    eligible_elems = max(1, int(stats["eligible_weight_elems"]))
+    total_elems = max(1, int(stats["total_linear_weight_elems"]))
+    stats["eligible_coverage_pct"] = float(stats["converted_weight_elems"]) * 100.0 / float(eligible_elems)
+    stats["total_coverage_pct"] = float(stats["converted_weight_elems"]) * 100.0 / float(total_elems)
+
+    model._quantized_linear_count = int(stats["converted_linear_layers"])
+    model._quantization_report = stats
+    if stats["converted_linear_layers"] == 0:
+        warnings.warn(
+            f"quantize_model('{quant}') converted 0 layers on this runtime; "
+            "falling back to original precision for all linear ops.",
+            stacklevel=2,
         )
     return model
diff --git a/src/world_engine.py b/src/world_engine.py
index 7d1a892..afe761e 100644
--- a/src/world_engine.py
+++ b/src/world_engine.py
@@ -1,4 +1,5 @@
 from typing import Dict, Optional, Set, Tuple
+import os
 import torch
 from torch import Tensor
 from dataclasses import dataclass, field
@@ -15,6 +16,8 @@
 
 # fix graph break:
 torch._dynamo.config.capture_scalar_outputs = True
+# Avoid per-layer recompiles from static integer attrs like layer_idx on MPS mixed-compile paths.
+torch._dynamo.config.allow_unspec_int_on_nn_module = True
 
 COMPILE_OPTIONS = {
     "max_autotune": True,
@@ -41,7 +44,9 @@ def __init__(
         model_config_overrides: Optional[Dict] = None,
         device=None,
         dtype=torch.bfloat16,
-        load_weights: bool = True
+        load_weights: bool = True,
+        scheduler_steps: Optional[int] = None,
+        cache_interval: int = 1,
     ):
         """
         model_uri: HF URI or local folder containing model.safetensors and config.yaml
@@ -50,6 +55,17 @@ def __init__(
         """
         self.device = torch.get_default_device() if device is None else device
         self.dtype = torch.get_default_dtype() if dtype is None else dtype
+        if cache_interval <= 0:
+            raise ValueError("cache_interval must be >= 1")
+        self.cache_interval = int(cache_interval)
+        self._gen_count = 0
+        force_compile_metal = os.getenv("WORLD_FORCE_COMPILE_METAL", "0") == "1"
+        hybrid_compile_metal = os.getenv("WORLD_HYBRID_COMPILE_METAL", "0") == "1"
+        self._disable_compile = (
+            str(self.device).startswith("mps")
+            and os.getenv("WORLD_ATTENTION_BACKEND", "flex").lower() == "metal"
+            and not force_compile_metal
+        )
 
         self.model_cfg = WorldModel.load_config(model_uri)
 
@@ -76,6 +92,23 @@ def __init__(
 
             # Inference Scheduler
             self.scheduler_sigmas = torch.tensor(self.model_cfg.scheduler_sigmas, dtype=dtype)
+            if scheduler_steps is not None:
+                if scheduler_steps <= 0:
+                    raise ValueError("scheduler_steps must be > 0 when provided")
+                if scheduler_steps > int(self.scheduler_sigmas.numel()):
+                    raise ValueError(
+                        f"scheduler_steps={scheduler_steps} exceeds available "
+                        f"{int(self.scheduler_sigmas.numel())}"
+                    )
+                self.scheduler_sigmas = self.scheduler_sigmas[: int(scheduler_steps)].contiguous()
+            self.scheduler_dsigmas = self.scheduler_sigmas.diff().contiguous()
+            self.scheduler_step_sigmas = self.scheduler_sigmas[:-1].contiguous()
+            n_steps = self.scheduler_dsigmas.numel()
+            # Keep sigma tensors in a single contiguous layout to reduce compile guard churn.
+            self._step_sigmas_bt = self.scheduler_step_sigmas.view(n_steps, 1, 1).contiguous()
+            self._dsigmas_bt = self.scheduler_dsigmas.view(n_steps, 1, 1, 1, 1).contiguous()
+            self._n_denoise_steps = n_steps
+            self._sigma_zero = torch.zeros((1, 1), dtype=dtype).contiguous()
 
             pH, pW = getattr(self.model_cfg, "patch", [1, 1])
             self.frm_shape = 1, 1, self.model_cfg.channels, self.model_cfg.height * pH, self.model_cfg.width * pW
@@ -83,8 +116,9 @@ def __init__(
             # State
             inference_fps = getattr(self.model_cfg, "inference_fps", self.model_cfg.base_fps)
             latent_fps = inference_fps / getattr(self.model_cfg, "temporal_compression", 1)
-            self.ts_mult = int(self.model_cfg.base_fps) // latent_fps
+            self.ts_mult = int(int(self.model_cfg.base_fps) // latent_fps)
             self.frame_ts = torch.tensor([[0]], dtype=torch.long)
+            self._frame_idx_int = 0
 
             # Static input context tensors
             self._ctx = {
@@ -94,13 +128,27 @@ def __init__(
                 "frame_timestamp": torch.empty((1, 1), dtype=torch.long),
                 "frame_idx": torch.empty((1, 1), dtype=torch.long),
             }
+            self._ctrl_values = torch.zeros((1, 1, 3), dtype=dtype)
 
             self._prompt_ctx = {"prompt_emb": None, "prompt_pad_mask": None}
+            metal_runtime = str(self.device).startswith("mps") and os.getenv("WORLD_ATTENTION_BACKEND", "flex").lower() == "metal"
+            metal_in_graph_attention = os.getenv("WORLD_METAL_ALLOW_IN_GRAPH_OP", "0") == "1"
+            if (force_compile_metal or hybrid_compile_metal) and metal_runtime:
+                # Allow graph breaks around custom Metal ops while still compiling dense surrounding math.
+                # Keep cache updates eager when in-graph Metal attention is enabled; the
+                # side-effect-heavy cache path has shown pathological compile startup cost.
+                self._cache_pass_fn = self._cache_pass_eager if metal_in_graph_attention else self._cache_pass_mixed
+                self._denoise_pass_fn = self._denoise_pass_stepwise if metal_in_graph_attention else self._denoise_pass_mixed
+            else:
+                self._cache_pass_fn = self._cache_pass_eager if self._disable_compile else self._cache_pass
+                self._denoise_pass_fn = self._denoise_pass_eager if self._disable_compile else self._denoise_pass
 
     @torch.inference_mode()
     def reset(self):
         """Reset state for new generation"""
         self.kv_cache.reset()
+        self._gen_count = 0
+        self._frame_idx_int = 0
         self.frame_ts.zero_()
         for v in self._ctx.values():
             v.zero_()
@@ -116,6 +164,7 @@ def load_state(self, state):
         """Loads a world state object saved via save_state. Doesn't load or change model"""
         self.kv_cache.load_state(state["kv_cache"])
         self.frame_ts.copy_(state["frame_ts"])
+        self._frame_idx_int = int(self.frame_ts[0, 0].item())
 
     def set_prompt(self, prompt: str):
         """Apply text conditioning for T2V"""
@@ -128,22 +177,23 @@ def append_frame(self, img: Tensor, ctrl: CtrlInput = None):
         assert img.dtype == torch.uint8, img.dtype
         x0 = self.vae.encode(img).unsqueeze(1)
         inputs = self.prep_inputs(x=x0, ctrl=ctrl)
-        self._cache_pass(x0, inputs, self.kv_cache)
+        self._cache_pass_fn(x0, inputs, self.kv_cache)
         return img
 
     @torch.inference_mode()
     def gen_frame(self, ctrl: CtrlInput = None, return_img: bool = True):
         x = torch.randn(self.frm_shape, device=self.device, dtype=self.dtype)
         inputs = self.prep_inputs(x=x, ctrl=ctrl)
-        x0 = self._denoise_pass(x, inputs, self.kv_cache).clone()
-        self._cache_pass(x0, inputs, self.kv_cache)
+        x0 = self._denoise_pass_fn(x, inputs, self.kv_cache)
+        if (self._gen_count % self.cache_interval) == 0:
+            self._cache_pass_fn(x0, inputs, self.kv_cache)
+        self._gen_count += 1
         return (self.vae.decode(x0.squeeze(1)) if return_img else x0.squeeze(1))
 
     @torch.compile
-    def _prep_inputs(self, x, ctrl=None):
-        self._ctx["mouse"][0, 0, 0] = ctrl.mouse[0]
-        self._ctx["mouse"][0, 0, 1] = ctrl.mouse[1]
-        self._ctx["scroll"][0, 0, 0] = ctrl.scroll_wheel
+    def _prep_inputs(self, ctrl_values: Tensor):
+        self._ctx["mouse"].copy_(ctrl_values[..., :2])
+        self._ctx["scroll"].copy_(ctrl_values[..., 2:])
 
         self._ctx["frame_idx"].copy_(self.frame_ts)
         self._ctx["frame_timestamp"].copy_(self.frame_ts).mul_(self.ts_mult)
@@ -156,9 +206,22 @@ def prep_inputs(self, x, ctrl=None):
         self._ctx["button"].zero_()
         if ctrl.button:
             self._ctx["button"][..., list(ctrl.button)] = 1.0
-        ctrl.mouse = torch.as_tensor(ctrl.mouse, device=x.device, dtype=self.dtype)
-        ctrl.scroll_wheel = torch.sign(torch.as_tensor(ctrl.scroll_wheel, device=x.device, dtype=self.dtype))
-        ctx = self._prep_inputs(x, ctrl)
+        mx, my = ctrl.mouse
+        mouse_x = float(mx)
+        mouse_y = float(my)
+        if ctrl.scroll_wheel > 0:
+            scroll_wheel = 1.0
+        elif ctrl.scroll_wheel < 0:
+            scroll_wheel = -1.0
+        else:
+            scroll_wheel = 0.0
+        self._ctrl_values[0, 0, 0] = mouse_x
+        self._ctrl_values[0, 0, 1] = mouse_y
+        self._ctrl_values[0, 0, 2] = scroll_wheel
+        ctx = self._prep_inputs(self._ctrl_values)
+        # Thread a cheap Python-side frame index hint to avoid per-layer scalar syncs.
+        self.kv_cache.set_frame_idx_int(self._frame_idx_int)
+        self._frame_idx_int += 1
 
         # prepare prompt conditioning
         if self.model_cfg.prompt_conditioning is None:
@@ -170,14 +233,67 @@ def prep_inputs(self, x, ctrl=None):
     @torch.compile(fullgraph=True, dynamic=False, options=COMPILE_OPTIONS)
     def _denoise_pass(self, x, ctx: Dict[str, Tensor], kv_cache):
         kv_cache.set_frozen(True)
-        sigma = x.new_empty((x.size(0), x.size(1)))
-        for step_sig, step_dsig in zip(self.scheduler_sigmas, self.scheduler_sigmas.diff()):
-            v = self.model(x, sigma.fill_(step_sig), **ctx, kv_cache=kv_cache)
-            x = x + step_dsig * v
+        for i in range(self._n_denoise_steps):
+            sigma_i = self._step_sigmas_bt[i]
+            v = self.model(x, sigma_i, **ctx, kv_cache=kv_cache, ctrl_cond=True, prompt_cond=True)
+            x = x + self._dsigmas_bt[i] * v
+        return x
+
+    def _denoise_pass_eager(self, x, ctx: Dict[str, Tensor], kv_cache):
+        kv_cache.set_frozen(True)
+        for i in range(self._n_denoise_steps):
+            sigma_i = self._step_sigmas_bt[i]
+            v = self.model(x, sigma_i, **ctx, kv_cache=kv_cache, ctrl_cond=True, prompt_cond=True)
+            x = x + self._dsigmas_bt[i] * v
+        return x
+
+    @torch.compile(dynamic=False, options=COMPILE_OPTIONS)
+    def _denoise_pass_mixed(self, x, ctx: Dict[str, Tensor], kv_cache):
+        kv_cache.set_frozen(True)
+        for i in range(self._n_denoise_steps):
+            sigma_i = self._step_sigmas_bt[i]
+            v = self.model(x, sigma_i, **ctx, kv_cache=kv_cache, ctrl_cond=True, prompt_cond=True)
+            x = x + self._dsigmas_bt[i] * v
+        return x
+
+    @torch.compile(mode="reduce-overhead", dynamic=False)
+    def _denoise_pass_ingraph(self, x, ctx: Dict[str, Tensor], kv_cache):
+        kv_cache.set_frozen(True)
+        for i in range(self._n_denoise_steps):
+            sigma_i = self._step_sigmas_bt[i]
+            v = self.model(x, sigma_i, **ctx, kv_cache=kv_cache, ctrl_cond=True, prompt_cond=True)
+            x = x + self._dsigmas_bt[i] * v
+        return x
+
+    @torch.compile(dynamic=False, options=COMPILE_OPTIONS)
+    def _denoise_step(self, x, sigma_i: Tensor, dsigma_i: Tensor, ctx: Dict[str, Tensor], kv_cache):
+        v = self.model(x, sigma_i, **ctx, kv_cache=kv_cache, ctrl_cond=True, prompt_cond=True)
+        return x + dsigma_i * v
+
+    @torch.compile(mode="reduce-overhead", dynamic=False)
+    def _denoise_step_ingraph(self, x, sigma_i: Tensor, dsigma_i: Tensor, ctx: Dict[str, Tensor], kv_cache):
+        v = self.model(x, sigma_i, **ctx, kv_cache=kv_cache, ctrl_cond=True, prompt_cond=True)
+        return x + dsigma_i * v
+
+    def _denoise_pass_stepwise(self, x, ctx: Dict[str, Tensor], kv_cache):
+        kv_cache.set_frozen(True)
+        for i in range(self._n_denoise_steps):
+            x = self._denoise_step_ingraph(x, self._step_sigmas_bt[i], self._dsigmas_bt[i], ctx, kv_cache)
         return x
 
     @torch.compile(fullgraph=True, dynamic=False, options=COMPILE_OPTIONS)
     def _cache_pass(self, x, ctx: Dict[str, Tensor], kv_cache):
         """Side effect: updates kv cache"""
         kv_cache.set_frozen(False)
-        self.model(x, x.new_zeros((x.size(0), x.size(1))), **ctx, kv_cache=kv_cache)
+        self.model(x, self._sigma_zero, **ctx, kv_cache=kv_cache, ctrl_cond=True, prompt_cond=True)
+
+    def _cache_pass_eager(self, x, ctx: Dict[str, Tensor], kv_cache):
+        """Side effect: updates kv cache"""
+        kv_cache.set_frozen(False)
+        self.model(x, self._sigma_zero, **ctx, kv_cache=kv_cache, ctrl_cond=True, prompt_cond=True)
+
+    @torch.compile(dynamic=False, options=COMPILE_OPTIONS)
+    def _cache_pass_mixed(self, x, ctx: Dict[str, Tensor], kv_cache):
+        """Side effect: updates kv cache"""
+        kv_cache.set_frozen(False)
+        self.model(x, self._sigma_zero, **ctx, kv_cache=kv_cache, ctrl_cond=True, prompt_cond=True)
diff --git a/tests/bench_world_engine_e2e.py b/tests/bench_world_engine_e2e.py
new file mode 100644
index 0000000..c1ec655
--- /dev/null
+++ b/tests/bench_world_engine_e2e.py
@@ -0,0 +1,290 @@
+import argparse
+import io
+import random
+import time
+import urllib.request
+import json
+from pathlib import Path
+import sys
+
+import imageio.v3 as iio
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
+
+
+SEED_FRAME_URLS = [
+    "https://gist.github.com/user-attachments/assets/d81c6d26-a838-4afe-9d13-fd67677043c3",
+    "https://gist.github.com/user-attachments/assets/b6d18c38-098e-43b0-8e61-66a16e5d8946",
+    "https://gist.github.com/user-attachments/assets/0734a8c1-3eb4-4ffe-8c37-5665c45ab559",
+    "https://gist.github.com/user-attachments/assets/f9c20d4d-7565-452d-8b02-42a85ea175ed",
+    "https://gist.github.com/user-attachments/assets/68c943a4-008a-4c25-948c-c81ab4c47d21",
+]
+
+
+def _controller_sequence(CtrlInput):
+    seq = [
+        CtrlInput(mouse=[0.2, 0.2]), CtrlInput(button={32}), CtrlInput(), CtrlInput(), CtrlInput(),
+        CtrlInput(button={1}), CtrlInput(), CtrlInput(), CtrlInput(button={1, 32}),
+        CtrlInput(), CtrlInput(), CtrlInput(), CtrlInput(), CtrlInput(), CtrlInput(),
+    ] * 4
+    seq += [CtrlInput()] * 8
+    seq += (
+        [CtrlInput(button={32})] * 10 +
+        [CtrlInput(button={65})] * 10 +
+        [CtrlInput(button={68})] * 10 +
+        [CtrlInput(button={83})] * 10
+    )
+    seq += [CtrlInput()] * 10
+    return seq
+
+
+def _sync_if_mps():
+    if torch.backends.mps.is_available():
+        torch.mps.synchronize()
+
+
+def _load_seed_frame(url: str) -> np.ndarray:
+    raw = urllib.request.urlopen(url).read()
+    arr = iio.imread(io.BytesIO(raw))
+    if arr.ndim == 2:
+        arr = np.stack([arr, arr, arr], axis=-1)
+    if arr.shape[-1] > 3:
+        arr = arr[..., :3]
+
+    t = torch.from_numpy(arr).permute(2, 0, 1).unsqueeze(0).to(torch.float32)
+    t = F.interpolate(t, size=(512, 1024), mode="bilinear", align_corners=False)
+    t = t.round().clamp(0, 255).to(torch.uint8)
+    return t.squeeze(0).permute(1, 2, 0).cpu().numpy()
+
+
+def main():
+    parser = argparse.ArgumentParser(description="WorldEngine E2E generation script + latency stats.")
+    parser.add_argument("--model-uri", default="Overworld-Models/Lapp0-WP-Mini-1.4.5-BL-Distill")
+    parser.add_argument("--out", default="out.mp4")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--frames", type=int, default=0, help="Override number of generated control frames (0=full sequence)")
+    parser.add_argument("--device", default="mps")
+    parser.add_argument("--attention-backend", default="metal", choices=["metal", "flex", "auto"])
+    parser.add_argument("--dtype", default="bfloat16", choices=["bfloat16", "float16"])
+    parser.add_argument("--write-video", action="store_true", help="Write generated frames to --out via pyav")
+    parser.add_argument("--return-img", action="store_true", help="Decode RGB images (otherwise benchmark latent-only)")
+    parser.add_argument("--scheduler-steps", type=int, default=0, help="Override denoise scheduler steps (0=use model default)")
+    parser.add_argument("--cache-interval", type=int, default=1, help="Run cache update every N generated frames")
+    parser.add_argument("--quant", default="none", choices=["none", "w8a8", "nvfp4"], help="Optional model quantization mode")
+    parser.add_argument("--json-out", default="", help="Optional path to write machine-readable benchmark summary JSON")
+    args = parser.parse_args()
+
+    if args.device == "mps" and not torch.backends.mps.is_available():
+        raise RuntimeError("MPS backend not available.")
+
+    import os
+    # torch.compile can be a net loss / long cold-start on MPS for this workload.
+    if args.device == "mps":
+        os.environ.setdefault("TORCHDYNAMO_DISABLE", "1")
+        os.environ.setdefault("WORLD_KV_RUNTIME_CHECKS", "0")
+        os.environ.setdefault("WORLD_KV_COMPUTE_ACTIVE_BLOCKS", "0")
+    os.environ["WORLD_ATTENTION_BACKEND"] = args.attention_backend
+    from src.world_engine import WorldEngine, CtrlInput
+    from src.metal.runtime import ensure_metal_attention_op_loaded
+    if args.attention_backend == "metal" and args.device == "mps":
+        os.environ.setdefault("WORLD_METAL_IMPL", "fast")
+        os.environ.setdefault("WORLD_METAL_FAST_NO_FALLBACK", "1")
+        os.environ.setdefault("WORLD_METAL_PREFER_ACTIVE_DISPATCH", "1")
+        ensure_metal_attention_op_loaded()
+
+    dtype = torch.bfloat16 if args.dtype == "bfloat16" else torch.float16
+    print(
+        f"[e2e] cfg backend={args.attention_backend} device={args.device} dtype={args.dtype} "
+        f"compile_disabled={os.environ.get('TORCHDYNAMO_DISABLE', '0')} "
+        f"metal_impl={os.environ.get('WORLD_METAL_IMPL', 'ref')} "
+        f"metal_no_fallback={os.environ.get('WORLD_METAL_FAST_NO_FALLBACK', '0')} "
+        f"kv_checks={os.environ.get('WORLD_KV_RUNTIME_CHECKS', '0')} "
+        f"kv_active_blocks={os.environ.get('WORLD_KV_COMPUTE_ACTIVE_BLOCKS', '0')}",
+        flush=True,
+    )
+    print("[e2e] initializing engine...", flush=True)
+    engine = WorldEngine(
+        args.model_uri,
+        quant=(None if args.quant == "none" else args.quant),
+        device=args.device,
+        dtype=dtype,
+        scheduler_steps=(args.scheduler_steps if args.scheduler_steps > 0 else None),
+        cache_interval=args.cache_interval,
+    )
+    print("[e2e] engine initialized", flush=True)
+    print(
+        f"[e2e] model n_layers={engine.model_cfg.n_layers} "
+        f"n_heads={engine.model_cfg.n_heads} n_kv_heads={getattr(engine.model_cfg, 'n_kv_heads', engine.model_cfg.n_heads)} "
+        f"scheduler_steps={int(engine.scheduler_sigmas.numel())} cache_interval={engine.cache_interval}",
+        flush=True,
+    )
+
+    random.seed(args.seed)
+    url = random.choice(SEED_FRAME_URLS)
+    print("[e2e] loading seed frame...", flush=True)
+    frame = _load_seed_frame(url)
+    seed = torch.from_numpy(np.repeat(frame[None], 4, axis=0)).to(engine.device)
+    print("[e2e] appending seed frame...", flush=True)
+    engine.append_frame(seed)
+    print("[e2e] seed frame appended", flush=True)
+
+    ctrl_seq = _controller_sequence(CtrlInput)
+    if args.frames > 0:
+        ctrl_seq = ctrl_seq[: args.frames]
+
+    totals_ms = []
+    prep_ms = []
+    denoise_ms = []
+    cache_ms = []
+    decode_ms = []
+
+    def _first_second_steady(values):
+        if not values:
+            return {"first": 0.0, "second": 0.0, "steady_mean": 0.0, "steady_count": 0}
+        first = float(values[0])
+        second = float(values[1]) if len(values) > 1 else float(values[0])
+        steady = values[2:] if len(values) > 2 else []
+        steady_mean = float(sum(steady) / len(steady)) if steady else float(second)
+        return {
+            "first": first,
+            "second": second,
+            "steady_mean": steady_mean,
+            "steady_count": len(steady),
+        }
+
+    def _step(ctrl):
+        with torch.inference_mode():
+            x = torch.randn(engine.frm_shape, device=engine.device, dtype=engine.dtype)
+
+            t0 = time.perf_counter()
+            inputs = engine.prep_inputs(x=x, ctrl=ctrl if ctrl is not None else CtrlInput())
+            _sync_if_mps()
+            t_prep = (time.perf_counter() - t0) * 1000.0
+
+            t1 = time.perf_counter()
+            x0 = engine._denoise_pass_fn(x, inputs, engine.kv_cache)
+            _sync_if_mps()
+            t_denoise = (time.perf_counter() - t1) * 1000.0
+
+            do_cache_update = (engine._gen_count % engine.cache_interval) == 0
+            t2 = time.perf_counter()
+            if do_cache_update:
+                engine._cache_pass_fn(x0, inputs, engine.kv_cache)
+                _sync_if_mps()
+            t_cache = (time.perf_counter() - t2) * 1000.0
+            engine._gen_count += 1
+
+            t_decode = 0.0
+            img = None
+            if args.return_img or args.write_video:
+                t3 = time.perf_counter()
+                img = engine.vae.decode(x0.squeeze(1))
+                _sync_if_mps()
+                t_decode = (time.perf_counter() - t3) * 1000.0
+            return img, t_prep, t_denoise, t_cache, t_decode, do_cache_update
+
+    out = None
+    if args.write_video:
+        print("[e2e] opening video writer...", flush=True)
+        out = iio.imopen(args.out, "w", plugin="pyav")
+
+    try:
+        steps = [None] + ctrl_seq
+        for i, ctrl in enumerate(steps):
+            label = "first" if i == 0 else f"ctrl_{i}"
+            print(f"[e2e] generating {label} frame...", flush=True)
+            t_total_start = time.perf_counter()
+            img, t_prep, t_denoise, t_cache, t_decode, do_cache_update = _step(ctrl)
+            t_total = (time.perf_counter() - t_total_start) * 1000.0
+
+            totals_ms.append(t_total)
+            prep_ms.append(t_prep)
+            denoise_ms.append(t_denoise)
+            cache_ms.append(t_cache)
+            decode_ms.append(t_decode)
+
+            if out is not None and img is not None:
+                if i == 0:
+                    out.write(img.cpu().numpy(), fps=60, codec="libx264")
+                else:
+                    out.write(img.cpu().numpy())
+            print(
+                f"[e2e] {label} done total={t_total:.3f}ms prep={t_prep:.3f} "
+                f"denoise={t_denoise:.3f} cache={t_cache:.3f} decode={t_decode:.3f} "
+                f"cache_update={int(do_cache_update)}",
+                flush=True,
+            )
+    finally:
+        if out is not None:
+            out.close()
+
+    def _summary(values):
+        vals = sorted(values)
+        n = len(vals)
+        p50 = vals[n // 2]
+        p95 = vals[max(0, int(0.95 * n) - 1)]
+        mean = sum(vals) / max(1, n)
+        return p50, p95, mean
+
+    n = len(totals_ms)
+    total_p50, total_p95, total_mean = _summary(totals_ms)
+    prep_p50, prep_p95, prep_mean = _summary(prep_ms)
+    den_p50, den_p95, den_mean = _summary(denoise_ms)
+    cache_p50, cache_p95, cache_mean = _summary(cache_ms)
+    dec_p50, dec_p95, dec_mean = _summary(decode_ms)
+
+    print(
+        f"model={args.model_uri} backend={args.attention_backend} device={args.device} "
+        f"dtype={args.dtype} frames={n} return_img={args.return_img} write_video={args.write_video} "
+        f"scheduler_steps={int(engine.scheduler_sigmas.numel())} cache_interval={args.cache_interval} quant={args.quant}"
+    )
+    print(f"total_ms   p50={total_p50:.3f} p95={total_p95:.3f} mean={total_mean:.3f}")
+    print(f"prep_ms    p50={prep_p50:.3f} p95={prep_p95:.3f} mean={prep_mean:.3f}")
+    print(f"denoise_ms p50={den_p50:.3f} p95={den_p95:.3f} mean={den_mean:.3f}")
+    print(f"cache_ms   p50={cache_p50:.3f} p95={cache_p95:.3f} mean={cache_mean:.3f}")
+    print(f"decode_ms  p50={dec_p50:.3f} p95={dec_p95:.3f} mean={dec_mean:.3f}")
+    print(
+        f"fps        p50={1000.0/max(total_p50,1e-9):.2f} "
+        f"p95={1000.0/max(total_p95,1e-9):.2f} mean={1000.0/max(total_mean,1e-9):.2f}"
+    )
+    if args.json_out:
+        summary = {
+            "model_uri": args.model_uri,
+            "device": args.device,
+            "attention_backend": args.attention_backend,
+            "dtype": args.dtype,
+            "quant": args.quant,
+            "return_img": bool(args.return_img),
+            "write_video": bool(args.write_video),
+            "scheduler_steps": int(engine.scheduler_sigmas.numel()),
+            "cache_interval": int(args.cache_interval),
+            "frames": int(n),
+            "total_ms": {"p50": total_p50, "p95": total_p95, "mean": total_mean},
+            "prep_ms": {"p50": prep_p50, "p95": prep_p95, "mean": prep_mean},
+            "denoise_ms": {"p50": den_p50, "p95": den_p95, "mean": den_mean},
+            "cache_ms": {"p50": cache_p50, "p95": cache_p95, "mean": cache_mean},
+            "decode_ms": {"p50": dec_p50, "p95": dec_p95, "mean": dec_mean},
+            "fps": {
+                "p50": 1000.0 / max(total_p50, 1e-9),
+                "p95": 1000.0 / max(total_p95, 1e-9),
+                "mean": 1000.0 / max(total_mean, 1e-9),
+            },
+            "startup_profile_ms": {
+                "total": _first_second_steady(totals_ms),
+                "prep": _first_second_steady(prep_ms),
+                "denoise": _first_second_steady(denoise_ms),
+                "cache": _first_second_steady(cache_ms),
+                "decode": _first_second_steady(decode_ms),
+            },
+        }
+        Path(args.json_out).parent.mkdir(parents=True, exist_ok=True)
+        Path(args.json_out).write_text(json.dumps(summary, indent=2), encoding="utf-8")
+    if args.write_video:
+        print(f"wrote={args.out}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/compare_tensor_dumps.py b/tests/compare_tensor_dumps.py
new file mode 100644
index 0000000..62d3142
--- /dev/null
+++ b/tests/compare_tensor_dumps.py
@@ -0,0 +1,168 @@
+import argparse
+import json
+import re
+from pathlib import Path
+from typing import Any
+
+import torch
+
+
+def _flatten_tensors(obj: Any) -> list[torch.Tensor]:
+    out: list[torch.Tensor] = []
+    if isinstance(obj, torch.Tensor):
+        out.append(obj.detach().float().flatten())
+        return out
+    if isinstance(obj, (list, tuple)):
+        for x in obj:
+            out.extend(_flatten_tensors(x))
+        return out
+    if isinstance(obj, dict):
+        for k in sorted(obj.keys()):
+            out.extend(_flatten_tensors(obj[k]))
+        return out
+    return out
+
+
+def _metrics(lhs: torch.Tensor, rhs: torch.Tensor) -> dict[str, float]:
+    diff = (lhs - rhs).abs()
+    denom = lhs.norm() * rhs.norm()
+    cos = float(torch.dot(lhs, rhs) / denom) if float(denom) > 0.0 else 1.0
+    return {
+        "cosine": cos,
+        "mae": float(diff.mean()),
+        "rmse": float(torch.sqrt(((lhs - rhs) ** 2).mean())),
+        "max_abs": float(diff.max()),
+    }
+
+
+def _load_index(path: Path) -> list[dict[str, Any]]:
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--baseline-dir", required=True)
+    parser.add_argument("--candidate-dir", required=True)
+    parser.add_argument("--phase", default="all", choices=["all", "append", "gen1"])
+    parser.add_argument("--modules-regex", default="")
+    parser.add_argument("--top-k", type=int, default=50)
+    parser.add_argument("--strict", action="store_true")
+    parser.add_argument("--cosine-min", type=float, default=0.999)
+    parser.add_argument("--mae-max", type=float, default=1e-2)
+    parser.add_argument("--rmse-max", type=float, default=1e-2)
+    parser.add_argument("--max-abs-max", type=float, default=1e-1)
+    parser.add_argument("--out-dir", default="")
+    args = parser.parse_args()
+
+    baseline_dir = Path(args.baseline_dir)
+    candidate_dir = Path(args.candidate_dir)
+    out_dir = Path(args.out_dir) if args.out_dir else candidate_dir
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    b_index = _load_index(baseline_dir / "tensor_dump_index.json")
+    c_index = _load_index(candidate_dir / "tensor_dump_index.json")
+
+    rx = re.compile(args.modules_regex) if args.modules_regex else None
+
+    b_map: dict[tuple[str, str, str], dict[str, Any]] = {}
+    for e in b_index:
+        key = (e["phase"], e["module_name"], e["module_type"])
+        b_map[key] = e
+    c_map: dict[tuple[str, str, str], dict[str, Any]] = {}
+    for e in c_index:
+        key = (e["phase"], e["module_name"], e["module_type"])
+        c_map[key] = e
+
+    common = sorted(set(b_map.keys()) & set(c_map.keys()))
+    if args.phase != "all":
+        common = [k for k in common if k[0] == args.phase]
+    if rx is not None:
+        common = [k for k in common if rx.search(k[1] or "")]
+
+    per_module: list[dict[str, Any]] = []
+    fail_count = 0
+    for key in common:
+        b_path = Path(b_map[key]["file"])
+        c_path = Path(c_map[key]["file"])
+        b_obj = torch.load(b_path, map_location="cpu")
+        c_obj = torch.load(c_path, map_location="cpu")
+        b_tensors = _flatten_tensors(b_obj)
+        c_tensors = _flatten_tensors(c_obj)
+        if len(b_tensors) != len(c_tensors):
+            metrics = {"cosine": 0.0, "mae": float("inf"), "rmse": float("inf"), "max_abs": float("inf")}
+            status = "fail"
+            fail_count += 1
+        else:
+            # Concatenate all tensor leaves in a deterministic order.
+            if len(b_tensors) == 0:
+                metrics = {"cosine": 1.0, "mae": 0.0, "rmse": 0.0, "max_abs": 0.0}
+            else:
+                lhs = torch.cat(b_tensors)
+                rhs = torch.cat(c_tensors)
+                if lhs.numel() != rhs.numel():
+                    metrics = {"cosine": 0.0, "mae": float("inf"), "rmse": float("inf"), "max_abs": float("inf")}
+                else:
+                    metrics = _metrics(lhs, rhs)
+            status = "pass"
+            if (
+                metrics["cosine"] < args.cosine_min
+                or metrics["mae"] > args.mae_max
+                or metrics["rmse"] > args.rmse_max
+                or metrics["max_abs"] > args.max_abs_max
+            ):
+                status = "fail"
+                fail_count += 1
+
+        per_module.append(
+            {
+                "phase": key[0],
+                "module_name": key[1],
+                "module_type": key[2],
+                "baseline_file": str(b_path),
+                "candidate_file": str(c_path),
+                "status": status,
+                "metrics": metrics,
+            }
+        )
+
+    # Worst by cosine asc then mae desc.
+    sorted_worst = sorted(
+        per_module,
+        key=lambda x: (x["metrics"]["cosine"], -x["metrics"]["mae"]),
+    )
+    worst = sorted_worst[: max(0, args.top_k)]
+
+    summary = {
+        "baseline_dir": str(baseline_dir),
+        "candidate_dir": str(candidate_dir),
+        "phase": args.phase,
+        "modules_regex": args.modules_regex,
+        "thresholds": {
+            "cosine_min": args.cosine_min,
+            "mae_max": args.mae_max,
+            "rmse_max": args.rmse_max,
+            "max_abs_max": args.max_abs_max,
+        },
+        "counts": {
+            "baseline_index": len(b_index),
+            "candidate_index": len(c_index),
+            "compared_modules": len(per_module),
+            "failed_modules": fail_count,
+        },
+        "pass": fail_count == 0,
+    }
+
+    (out_dir / "comparison_summary.json").write_text(json.dumps(summary, indent=2), encoding="utf-8")
+    (out_dir / "comparison_worst_modules.json").write_text(json.dumps(worst, indent=2), encoding="utf-8")
+    (out_dir / "comparison_full.json").write_text(json.dumps(per_module, indent=2), encoding="utf-8")
+
+    print(json.dumps(summary, indent=2))
+    print(json.dumps({"top_k_worst": worst}, indent=2))
+
+    if args.strict and fail_count > 0:
+        raise SystemExit(2)
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..c66e4fe
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,148 @@
+from __future__ import annotations
+
+from pathlib import Path
+import math
+import sys
+
+import pytest
+import torch
+
+sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "src"))
+sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "src" / "model"))
+
+from metal.runtime import ensure_metal_attention_op_loaded
+
+
+_FALLBACK_REGISTERED = False
+
+
+def _reference_attention(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    mask: torch.Tensor | None,
+    causal: bool,
+) -> torch.Tensor:
+    qf = q.to(torch.float32)
+    kf = k.to(torch.float32)
+    vf = v.to(torch.float32)
+
+    if qf.size(1) != kf.size(1):
+        if qf.size(1) < kf.size(1) or (qf.size(1) % kf.size(1)) != 0:
+            raise RuntimeError("GQA requires q_heads divisible by kv_heads")
+        group_size = qf.size(1) // kf.size(1)
+        head_idx = torch.arange(qf.size(1), device=q.device, dtype=torch.long) // group_size
+        kf = kf.index_select(1, head_idx)
+        vf = vf.index_select(1, head_idx)
+
+    scores = torch.matmul(qf, kf.transpose(-2, -1)) / math.sqrt(q.size(-1))
+
+    if mask is not None:
+        scores = scores.masked_fill(mask == 0, float("-inf"))
+    if causal:
+        t = q.size(-2)
+        l = k.size(-2)
+        causal_mask = torch.triu(
+            torch.ones((t, l), device=q.device, dtype=torch.bool),
+            diagonal=1,
+        )
+        scores = scores.masked_fill(causal_mask[None, None], float("-inf"))
+
+    finite_row = torch.isfinite(scores).any(dim=-1, keepdim=True)
+    safe_scores = torch.where(finite_row, scores, torch.zeros_like(scores))
+    probs = torch.softmax(safe_scores, dim=-1)
+    probs = torch.where(finite_row, probs, torch.zeros_like(probs))
+    out = torch.matmul(probs, vf)
+    return out.to(q.dtype)
+
+
+def _register_python_fallback_op() -> None:
+    global _FALLBACK_REGISTERED
+    if _FALLBACK_REGISTERED:
+        return
+
+    try:
+        lib = torch.library.Library("world", "DEF")
+        lib.define("flex_attn_metal(Tensor q, Tensor k, Tensor v, Tensor? mask=None, bool causal=True) -> Tensor")
+        lib.define("flex_attn_metal_ref(Tensor q, Tensor k, Tensor v, Tensor? mask=None, bool causal=True) -> Tensor")
+        lib.define("flex_attn_metal_fast(Tensor q, Tensor k, Tensor v, Tensor? mask=None, bool causal=True) -> Tensor")
+        lib.define("flex_attn_metal_fast_blocks(Tensor q, Tensor k, Tensor v, Tensor block_written, int block_size, bool causal=True) -> Tensor")
+        lib.define("flex_attn_metal_fast_active(Tensor q, Tensor k, Tensor v, Tensor active_blocks, int block_size, bool causal=True) -> Tensor")
+    except Exception:
+        # Signature may already be defined by another registration path.
+        pass
+
+    impl = torch.library.Library("world", "IMPL", "CompositeExplicitAutograd")
+    fn = lambda q, k, v, mask=None, causal=True: _reference_attention(q, k, v, mask, bool(causal))
+    impl.impl("flex_attn_metal", fn)
+    impl.impl("flex_attn_metal_ref", fn)
+    impl.impl("flex_attn_metal_fast", fn)
+    impl.impl(
+        "flex_attn_metal_fast_blocks",
+        lambda q, k, v, block_written, block_size, causal=True: _reference_attention(
+            q,
+            k,
+            v,
+            # Build dense mask from block_written for fallback semantics.
+            torch.cat(
+                [
+                    torch.full(
+                        (int(block_size),),
+                        int(block_written[i].item() != 0),
+                        device=q.device,
+                        dtype=torch.uint8,
+                    )
+                    for i in range(block_written.numel())
+                ],
+                dim=0,
+            )[: k.size(-2)].view(1, 1, 1, k.size(-2)).expand(q.size(0), q.size(1), q.size(2), k.size(-2)).contiguous(),
+            bool(causal),
+        ),
+    )
+    impl.impl(
+        "flex_attn_metal_fast_active",
+        lambda q, k, v, active_blocks, block_size, causal=True: _reference_attention(
+            q,
+            k,
+            v,
+            (
+                torch.zeros((k.size(-2),), device=q.device, dtype=torch.uint8)
+                .index_fill(
+                    0,
+                    (
+                        torch.cat(
+                            [
+                                torch.arange(
+                                    int(b.item()) * int(block_size),
+                                    min(k.size(-2), int(b.item()) * int(block_size) + int(block_size)),
+                                    device=q.device,
+                                    dtype=torch.long,
+                                )
+                                for b in active_blocks
+                            ],
+                            dim=0,
+                        )
+                        if active_blocks.numel() > 0
+                        else torch.empty((0,), device=q.device, dtype=torch.long)
+                    ),
+                    1,
+                )
+                .view(1, 1, 1, k.size(-2))
+                .expand(q.size(0), q.size(1), q.size(2), k.size(-2))
+                .contiguous()
+            ),
+            bool(causal),
+        ),
+    )
+    _FALLBACK_REGISTERED = True
+
+
+def _load_metal_attention_extension() -> None:
+    if not ensure_metal_attention_op_loaded():
+        _register_python_fallback_op()
+
+
+@pytest.hookimpl(tryfirst=True)
+def pytest_sessionstart(session):  # noqa: D401 - pytest hook signature
+    _load_metal_attention_extension()
+
diff --git a/tests/gen_world_simple.py b/tests/gen_world_simple.py
new file mode 100644
index 0000000..040e7c7
--- /dev/null
+++ b/tests/gen_world_simple.py
@@ -0,0 +1,126 @@
+import argparse
+import io
+import random
+import urllib.request
+from pathlib import Path
+import sys
+
+import imageio.v3 as iio
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
+
+SEED_FRAME_URLS = [
+    "https://gist.github.com/user-attachments/assets/d81c6d26-a838-4afe-9d13-fd67677043c3",
+    "https://gist.github.com/user-attachments/assets/b6d18c38-098e-43b0-8e61-66a16e5d8946",
+    "https://gist.github.com/user-attachments/assets/0734a8c1-3eb4-4ffe-8c37-5665c45ab559",
+    "https://gist.github.com/user-attachments/assets/f9c20d4d-7565-452d-8b02-42a85ea175ed",
+    "https://gist.github.com/user-attachments/assets/68c943a4-008a-4c25-948c-c81ab4c47d21",
+]
+
+
+def _load_seed_frame(url: str) -> np.ndarray:
+    raw = urllib.request.urlopen(url).read()
+    arr = iio.imread(io.BytesIO(raw))
+    if arr.ndim == 2:
+        arr = np.stack([arr, arr, arr], axis=-1)
+    if arr.shape[-1] > 3:
+        arr = arr[..., :3]
+    t = torch.from_numpy(arr).permute(2, 0, 1).unsqueeze(0).to(torch.float32)
+    t = F.interpolate(t, size=(512, 1024), mode="bilinear", align_corners=False)
+    t = t.round().clamp(0, 255).to(torch.uint8)
+    return t.squeeze(0).permute(1, 2, 0).cpu().numpy()
+
+
+def _controller_sequence(CtrlInput):
+    seq = [
+        CtrlInput(mouse=[0.2, 0.2]), CtrlInput(button={32}), CtrlInput(), CtrlInput(), CtrlInput(),
+        CtrlInput(button={1}), CtrlInput(), CtrlInput(), CtrlInput(button={1, 32}),
+        CtrlInput(), CtrlInput(), CtrlInput(), CtrlInput(), CtrlInput(), CtrlInput(),
+    ] * 2
+    seq += [CtrlInput()] * 8
+    return seq
+
+
+def _sync_if_mps():
+    if torch.backends.mps.is_available():
+        torch.mps.synchronize()
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Simple deterministic WorldEngine generator.")
+    parser.add_argument("--model-uri", default="Overworld-Models/Lapp0-WP-Mini-1.4.5-BL-Distill")
+    parser.add_argument("--out", default="out_simple.mp4")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--frames", type=int, default=12)
+    parser.add_argument("--device", default="mps")
+    parser.add_argument("--attention-backend", default="metal", choices=["metal", "flex", "auto"])
+    parser.add_argument("--dtype", default="bfloat16", choices=["bfloat16", "float16", "float32"])
+    parser.add_argument("--scheduler-steps", type=int, default=4)
+    parser.add_argument("--cache-interval", type=int, default=1)
+    parser.add_argument("--seed-url", default="")
+    args = parser.parse_args()
+
+    if args.device == "mps" and not torch.backends.mps.is_available():
+        raise RuntimeError("MPS backend not available.")
+
+    import os
+    os.environ.setdefault("TORCHDYNAMO_DISABLE", "1")
+    os.environ.setdefault("WORLD_KV_RUNTIME_CHECKS", "0")
+    os.environ.setdefault("WORLD_KV_COMPUTE_ACTIVE_BLOCKS", "0")
+    os.environ["WORLD_ATTENTION_BACKEND"] = args.attention_backend
+
+    from src.world_engine import WorldEngine, CtrlInput
+    from src.metal.runtime import ensure_metal_attention_op_loaded
+    if args.attention_backend == "metal" and args.device == "mps":
+        os.environ.setdefault("WORLD_METAL_IMPL", "fast")
+        os.environ.setdefault("WORLD_METAL_FAST_NO_FALLBACK", "1")
+        os.environ.setdefault("WORLD_METAL_PREFER_ACTIVE_DISPATCH", "1")
+        ensure_metal_attention_op_loaded()
+
+    if args.dtype == "bfloat16":
+        dtype = torch.bfloat16
+    elif args.dtype == "float16":
+        dtype = torch.float16
+    else:
+        dtype = torch.float32
+
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+
+    engine = WorldEngine(
+        args.model_uri,
+        quant=None,
+        device=args.device,
+        dtype=dtype,
+        scheduler_steps=args.scheduler_steps,
+        cache_interval=args.cache_interval,
+    )
+
+    url = args.seed_url if args.seed_url else random.choice(SEED_FRAME_URLS)
+    frame = _load_seed_frame(url)
+    seed = torch.from_numpy(np.repeat(frame[None], 4, axis=0)).to(engine.device)
+    engine.append_frame(seed)
+
+    ctrl_seq = _controller_sequence(CtrlInput)
+    if args.frames > 0:
+        ctrl_seq = ctrl_seq[:args.frames]
+
+    with iio.imopen(args.out, "w", plugin="pyav") as out:
+        first = engine.gen_frame()
+        _sync_if_mps()
+        out.write(first.cpu().numpy(), fps=60, codec="libx264")
+        for ctrl in ctrl_seq:
+            img = engine.gen_frame(ctrl=ctrl)
+            _sync_if_mps()
+            out.write(img.cpu().numpy())
+
+    print(f"wrote={args.out}")
+    print(f"seed_url={url}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/gen_world_variant_metal_save.py b/tests/gen_world_variant_metal_save.py
new file mode 100644
index 0000000..f743e85
--- /dev/null
+++ b/tests/gen_world_variant_metal_save.py
@@ -0,0 +1,97 @@
+import argparse
+import io
+import os
+import random
+import urllib.request
+
+import imageio.v3 as iio
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+from src.world_engine import WorldEngine, CtrlInput
+from src.metal.runtime import ensure_metal_attention_op_loaded
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-uri", default="Overworld-Models/Lapp0-WP-Mini-1.4.5-BL-Distill")
+    parser.add_argument("--device", default="mps")
+    parser.add_argument("--dtype", default="float16", choices=["float16", "bfloat16"])
+    parser.add_argument("--quant", default="none", choices=["none", "w8a8", "nvfp4"])
+    parser.add_argument("--frames", type=int, default=64)
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--seed-url", default="")
+    parser.add_argument("--out", default="diagnostics/out/fp16_variant_metal_mps_saved.mp4")
+    parser.add_argument("--hybrid-compile", action="store_true")
+    parser.add_argument("--force-compile", action="store_true")
+    args = parser.parse_args()
+
+    random.seed(args.seed)
+    os.environ["WORLD_HYBRID_COMPILE_METAL"] = "1" if args.hybrid_compile else "0"
+    os.environ["WORLD_FORCE_COMPILE_METAL"] = "1" if args.force_compile else "0"
+    ensure_metal_attention_op_loaded()
+
+    quant = None if args.quant == "none" else args.quant
+    engine = WorldEngine(
+        args.model_uri,
+        device=args.device,
+        dtype=(torch.float16 if args.dtype == "float16" else torch.bfloat16),
+        quant=quant,
+    )
+    # Compatibility for current world_engine timestamp math.
+    engine.ts_mult = int(engine.ts_mult)
+
+    urls = [
+        "https://gist.github.com/user-attachments/assets/d81c6d26-a838-4afe-9d13-fd67677043c3",
+        "https://gist.github.com/user-attachments/assets/b6d18c38-098e-43b0-8e61-66a16e5d8946",
+        "https://gist.github.com/user-attachments/assets/0734a8c1-3eb4-4ffe-8c37-5665c45ab559",
+        "https://gist.github.com/user-attachments/assets/f9c20d4d-7565-452d-8b02-42a85ea175ed",
+        "https://gist.github.com/user-attachments/assets/68c943a4-008a-4c25-948c-c81ab4c47d21",
+    ]
+    url = args.seed_url if args.seed_url else random.choice(urls)
+    arr = iio.imread(io.BytesIO(urllib.request.urlopen(url).read()))
+    if arr.ndim == 2:
+        arr = np.stack([arr, arr, arr], axis=-1)
+    if arr.shape[-1] > 3:
+        arr = arr[..., :3]
+    t = torch.from_numpy(arr).permute(2, 0, 1).unsqueeze(0).to(torch.float32)
+    t = F.interpolate(t, size=(512, 1024), mode="bilinear", align_corners=False)
+    frame = t.round().clamp(0, 255).to(torch.uint8).squeeze(0).permute(1, 2, 0).cpu().numpy()
+    engine.append_frame(torch.from_numpy(np.repeat(frame[None], 4, axis=0)).to(engine.device))
+
+    controller_sequence = [
+        CtrlInput(mouse=[0.2, 0.2]),
+        CtrlInput(button={32}),
+        CtrlInput(),
+        CtrlInput(),
+        CtrlInput(),
+        CtrlInput(button={1}),
+        CtrlInput(),
+        CtrlInput(),
+        CtrlInput(button={1, 32}),
+        CtrlInput(),
+        CtrlInput(),
+        CtrlInput(),
+        CtrlInput(),
+        CtrlInput(),
+        CtrlInput(),
+    ] * 2
+    controller_sequence += [CtrlInput()] * 8
+    controller_sequence = controller_sequence[: max(0, args.frames)]
+
+    out_path = args.out
+    with iio.imopen(out_path, "w", plugin="pyav") as out:
+        out.write(engine.gen_frame().cpu().numpy(), fps=60, codec="libx264")
+        torch.mps.synchronize()
+        for ctrl in controller_sequence:
+            out.write(engine.gen_frame(ctrl=ctrl).cpu().numpy())
+            torch.mps.synchronize()
+
+    print(f"wrote={out_path}")
+    print(f"seed_url={url}")
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/tests/metal_test_utils.py b/tests/metal_test_utils.py
new file mode 100644
index 0000000..1b4ca52
--- /dev/null
+++ b/tests/metal_test_utils.py
@@ -0,0 +1,48 @@
+from __future__ import annotations
+
+import time
+from typing import Iterable
+
+import pytest
+import torch
+
+
+def require_world_ops(op_names: Iterable[str]) -> None:
+    if not hasattr(torch.ops, "world"):
+        pytest.skip("Metal world namespace not registered")
+    missing = [name for name in op_names if not hasattr(torch.ops.world, name)]
+    if missing:
+        pytest.skip(f"Required Metal ops not registered: {', '.join(missing)}")
+
+
+def require_metal_attn_ops() -> None:
+    require_world_ops(
+        [
+            "flex_attn_metal_ref",
+            "flex_attn_metal_fast",
+            "flex_attn_metal_fast_blocks",
+            "flex_attn_metal_fast_active",
+        ]
+    )
+
+
+def timed_ms_sync(fn, warmup: int = 5, iters: int = 20):
+    for _ in range(warmup):
+        fn()
+    if torch.backends.mps.is_available():
+        torch.mps.synchronize()
+    samples = []
+    for _ in range(iters):
+        t0 = time.perf_counter()
+        fn()
+        if torch.backends.mps.is_available():
+            torch.mps.synchronize()
+        samples.append((time.perf_counter() - t0) * 1000.0)
+    t = torch.tensor(samples, dtype=torch.float64)
+    return {
+        "mean_ms": float(t.mean().item()),
+        "p50_ms": float(t.quantile(0.50).item()),
+        "p95_ms": float(t.quantile(0.95).item()),
+        "p99_ms": float(t.quantile(0.99).item()),
+        "max_ms": float(t.max().item()),
+    }
diff --git a/tests/optimization_gate_config.json b/tests/optimization_gate_config.json
new file mode 100644
index 0000000..e0b46c4
--- /dev/null
+++ b/tests/optimization_gate_config.json
@@ -0,0 +1,20 @@
+{
+  "quick_gate": {
+    "sentinel_modules_regex": "transformer\\.blocks\\.(0|11|23)\\.(attn|mlp)",
+    "cosine_min": 0.999,
+    "mae_max": 0.01,
+    "rmse_max": 0.01,
+    "max_abs_max": 0.1
+  },
+  "full_gate": {
+    "cosine_min": 0.999,
+    "mae_max": 0.01,
+    "rmse_max": 0.01,
+    "max_abs_max": 0.1
+  },
+  "performance_gate": {
+    "min_median_improvement_pct": 3.0,
+    "max_p90_regression_pct": 2.0
+  }
+}
+
diff --git a/tests/optimization_gate_config_fp16.json b/tests/optimization_gate_config_fp16.json
new file mode 100644
index 0000000..c1fb6ee
--- /dev/null
+++ b/tests/optimization_gate_config_fp16.json
@@ -0,0 +1,24 @@
+{
+  "quick_gate": {
+    "sentinel_modules_regex": "transformer\\.blocks\\.(0|11|23)\\.(attn|mlp)",
+    "cosine_min": 0.999,
+    "mae_max": 0.01,
+    "rmse_max": 0.01,
+    "max_abs_max": 0.1
+  },
+  "full_gate": {
+    "cosine_min": 0.999,
+    "mae_max": 0.01,
+    "rmse_max": 0.01,
+    "max_abs_max": 0.1
+  },
+  "performance_gate": {
+    "min_median_improvement_pct": 3.0,
+    "max_p90_regression_pct": 2.0
+  },
+  "baseline_defaults": {
+    "baseline_dump_dir": "diagnostics/out/fp16_baseline",
+    "baseline_perf_report": "diagnostics/out/fp16_baseline/profile_report.json"
+  }
+}
+
diff --git a/tests/perf_regression_gate.py b/tests/perf_regression_gate.py
new file mode 100644
index 0000000..8e3e0a2
--- /dev/null
+++ b/tests/perf_regression_gate.py
@@ -0,0 +1,201 @@
+import argparse
+import gc
+import json
+import os
+import random
+import time
+from pathlib import Path
+
+import numpy as np
+import torch
+
+from bench_world_engine_e2e import _controller_sequence, _load_seed_frame
+from src.metal.runtime import ensure_metal_attention_op_loaded
+from src.world_engine import CtrlInput, WorldEngine
+
+SEED_URLS = [
+    "https://gist.github.com/user-attachments/assets/d81c6d26-a838-4afe-9d13-fd67677043c3",
+    "https://gist.github.com/user-attachments/assets/b6d18c38-098e-43b0-8e61-66a16e5d8946",
+    "https://gist.github.com/user-attachments/assets/0734a8c1-3eb4-4ffe-8c37-5665c45ab559",
+    "https://gist.github.com/user-attachments/assets/f9c20d4d-7565-452d-8b02-42a85ea175ed",
+    "https://gist.github.com/user-attachments/assets/68c943a4-008a-4c25-948c-c81ab4c47d21",
+]
+
+
+def _sync_mps():
+    if torch.backends.mps.is_available():
+        torch.mps.synchronize()
+
+
+def _summary(values: list[float]) -> dict:
+    vals = sorted(values)
+    n = len(vals)
+    return {
+        "p50": vals[n // 2],
+        "p95": vals[max(0, int(0.95 * n) - 1)],
+        "mean": sum(vals) / max(1, n),
+    }
+
+
+def _run_trial(return_img: bool, warmup: int, steps: int) -> dict:
+    engine = WorldEngine(
+        "Overworld-Models/Lapp0-WP-Mini-1.4.5-BL-Distill",
+        quant="w8a8",
+        device="mps",
+        dtype=torch.float16,
+        scheduler_steps=4,
+        cache_interval=1,
+    )
+    frame = _load_seed_frame(random.choice(SEED_URLS))
+    seed = torch.from_numpy(np.repeat(frame[None], 4, axis=0)).to(engine.device)
+    engine.append_frame(seed)
+    ctrls = _controller_sequence(CtrlInput)
+
+    with torch.inference_mode():
+        for i in range(warmup):
+            _ = engine.gen_frame(ctrl=ctrls[i % len(ctrls)], return_img=return_img)
+    _sync_mps()
+
+    total_ms, denoise_ms, cache_ms, decode_ms, prep_ms = [], [], [], [], []
+    with torch.inference_mode():
+        for i in range(steps):
+            ctrl = ctrls[(warmup + i) % len(ctrls)]
+            x = torch.randn(engine.frm_shape, device=engine.device, dtype=engine.dtype)
+
+            ttot = time.perf_counter()
+
+            t0 = time.perf_counter()
+            inputs = engine.prep_inputs(x=x, ctrl=ctrl)
+            _sync_mps()
+            prep_ms.append((time.perf_counter() - t0) * 1000.0)
+
+            t1 = time.perf_counter()
+            x0 = engine._denoise_pass_fn(x, inputs, engine.kv_cache)
+            _sync_mps()
+            denoise_ms.append((time.perf_counter() - t1) * 1000.0)
+
+            t2 = time.perf_counter()
+            engine._cache_pass_fn(x0, inputs, engine.kv_cache)
+            engine._gen_count += 1
+            _sync_mps()
+            cache_ms.append((time.perf_counter() - t2) * 1000.0)
+
+            dec = 0.0
+            if return_img:
+                t3 = time.perf_counter()
+                _ = engine.vae.decode(x0.squeeze(1))
+                _sync_mps()
+                dec = (time.perf_counter() - t3) * 1000.0
+            decode_ms.append(dec)
+
+            total_ms.append((time.perf_counter() - ttot) * 1000.0)
+
+    total = _summary(total_ms)
+    result = {
+        "stage": {
+            "prep_ms": _summary(prep_ms),
+            "denoise_ms": _summary(denoise_ms),
+            "cache_ms": _summary(cache_ms),
+            "decode_ms": _summary(decode_ms),
+            "total_ms": total,
+            "fps": {
+                "p50": 1000.0 / max(total["p50"], 1e-9),
+                "p95": 1000.0 / max(total["p95"], 1e-9),
+                "mean": 1000.0 / max(total["mean"], 1e-9),
+            },
+        }
+    }
+    del engine
+    gc.collect()
+    if torch.backends.mps.is_available():
+        torch.mps.empty_cache()
+        _sync_mps()
+    return result
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Capture/compare MPS performance baseline.")
+    parser.add_argument("--output", type=Path, default=Path("docs/perf_baseline_mps_w8a8.json"))
+    parser.add_argument("--repeats", type=int, default=3)
+    parser.add_argument("--warmup", type=int, default=20)
+    parser.add_argument("--steps", type=int, default=8)
+    parser.add_argument("--compare-only", action="store_true")
+    parser.add_argument("--max-regression", type=float, default=0.10, help="Allowed p50 total_ms regression fraction.")
+    args = parser.parse_args()
+
+    if not torch.backends.mps.is_available():
+        raise RuntimeError("MPS backend not available.")
+
+    os.environ.setdefault("TORCHDYNAMO_DISABLE", "1")
+    os.environ["WORLD_ATTENTION_BACKEND"] = "metal"
+    os.environ.setdefault("WORLD_METAL_IMPL", "fast")
+    os.environ.setdefault("WORLD_METAL_FAST_NO_FALLBACK", "1")
+    os.environ.setdefault("WORLD_METAL_PREFER_ACTIVE_DISPATCH", "1")
+    os.environ.setdefault("WORLD_KV_RUNTIME_CHECKS", "0")
+    os.environ.setdefault("WORLD_KV_COMPUTE_ACTIVE_BLOCKS", "0")
+
+    ensure_metal_attention_op_loaded()
+    runs_decode = []
+    runs_latent = []
+
+    for _ in range(args.repeats):
+        runs_decode.append(_run_trial(return_img=True, warmup=args.warmup, steps=args.steps))
+        runs_latent.append(_run_trial(return_img=False, warmup=args.warmup, steps=args.steps))
+
+    result = {
+        "meta": {
+            "repeats": args.repeats,
+            "warmup": args.warmup,
+            "steps": args.steps,
+            "quant": "w8a8",
+            "scheduler_steps": 4,
+            "cache_interval": 1,
+        },
+        "decode_runs": runs_decode,
+        "latent_runs": runs_latent,
+    }
+
+    if not args.compare_only:
+        args.output.parent.mkdir(parents=True, exist_ok=True)
+        args.output.write_text(json.dumps(result, indent=2), encoding="utf-8")
+        print(f"[perf-gate] wrote baseline: {args.output}")
+        return
+
+    if not args.output.exists():
+        raise RuntimeError(f"Baseline file not found: {args.output}")
+    baseline = json.loads(args.output.read_text(encoding="utf-8"))
+
+    def _median_p50_total(payload, key):
+        vals = [r["stage"]["total_ms"]["p50"] for r in payload[key]]
+        vals = sorted(vals)
+        return vals[len(vals) // 2]
+
+    base_decode = _median_p50_total(baseline, "decode_runs")
+    base_latent = _median_p50_total(baseline, "latent_runs")
+    now_decode = _median_p50_total(result, "decode_runs")
+    now_latent = _median_p50_total(result, "latent_runs")
+
+    decode_reg = (now_decode - base_decode) / max(base_decode, 1e-9)
+    latent_reg = (now_latent - base_latent) / max(base_latent, 1e-9)
+
+    print(
+        json.dumps(
+            {
+                "baseline_decode_p50_total_ms": base_decode,
+                "current_decode_p50_total_ms": now_decode,
+                "decode_regression_frac": decode_reg,
+                "baseline_latent_p50_total_ms": base_latent,
+                "current_latent_p50_total_ms": now_latent,
+                "latent_regression_frac": latent_reg,
+            },
+            indent=2,
+        )
+    )
+
+    if decode_reg > args.max_regression or latent_reg > args.max_regression:
+        raise RuntimeError("Performance regression exceeded threshold.")
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/tests/profile_and_dump_variant_metal.py b/tests/profile_and_dump_variant_metal.py
new file mode 100644
index 0000000..97ac368
--- /dev/null
+++ b/tests/profile_and_dump_variant_metal.py
@@ -0,0 +1,458 @@
+import argparse
+import io
+import json
+import os
+import random
+import re
+import time
+import urllib.request
+import subprocess
+from pathlib import Path
+from typing import Any
+
+import imageio.v3 as iio
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+from src.metal.runtime import ensure_metal_attention_op_loaded
+from src.world_engine import CtrlInput, WorldEngine
+
+
+SEED_FRAME_URLS = [
+    "https://gist.github.com/user-attachments/assets/d81c6d26-a838-4afe-9d13-fd67677043c3",
+    "https://gist.github.com/user-attachments/assets/b6d18c38-098e-43b0-8e61-66a16e5d8946",
+    "https://gist.github.com/user-attachments/assets/0734a8c1-3eb4-4ffe-8c37-5665c45ab559",
+    "https://gist.github.com/user-attachments/assets/f9c20d4d-7565-452d-8b02-42a85ea175ed",
+    "https://gist.github.com/user-attachments/assets/68c943a4-008a-4c25-948c-c81ab4c47d21",
+]
+
+
+def _load_seed_frame(url: str) -> np.ndarray:
+    raw = urllib.request.urlopen(url).read()
+    arr = iio.imread(io.BytesIO(raw))
+    if arr.ndim == 2:
+        arr = np.stack([arr, arr, arr], axis=-1)
+    if arr.shape[-1] > 3:
+        arr = arr[..., :3]
+    t = torch.from_numpy(arr).permute(2, 0, 1).unsqueeze(0).to(torch.float32)
+    t = F.interpolate(t, size=(512, 1024), mode="bilinear", align_corners=False)
+    t = t.round().clamp(0, 255).to(torch.uint8)
+    return t.squeeze(0).permute(1, 2, 0).cpu().numpy()
+
+
+def _controller_sequence(steps: int) -> list[CtrlInput]:
+    seq = [
+        CtrlInput(mouse=[0.2, 0.2]),
+        CtrlInput(button={32}),
+        CtrlInput(),
+        CtrlInput(),
+        CtrlInput(),
+        CtrlInput(button={1}),
+        CtrlInput(),
+        CtrlInput(),
+        CtrlInput(button={1, 32}),
+        CtrlInput(),
+        CtrlInput(),
+        CtrlInput(),
+        CtrlInput(),
+        CtrlInput(),
+        CtrlInput(),
+    ] * 4
+    seq += [CtrlInput()] * 8
+    seq += (
+        [CtrlInput(button={32})] * 10
+        + [CtrlInput(button={65})] * 10
+        + [CtrlInput(button={68})] * 10
+        + [CtrlInput(button={83})] * 10
+    )
+    seq += [CtrlInput()] * 10
+    return seq[:steps]
+
+
+def _sync_if_mps(device: str):
+    if str(device).startswith("mps") and torch.backends.mps.is_available():
+        torch.mps.synchronize()
+
+
+def _sanitize(name: str) -> str:
+    safe = re.sub(r"[^A-Za-z0-9_.-]+", "_", name)
+    return safe[:220]
+
+
+def _to_dumpable(obj: Any):
+    if isinstance(obj, torch.Tensor):
+        t = obj.detach().cpu()
+        if t.is_floating_point():
+            t = t.to(torch.bfloat16)
+        return t
+    if isinstance(obj, (list, tuple)):
+        return type(obj)(_to_dumpable(x) for x in obj)
+    if isinstance(obj, dict):
+        return {k: _to_dumpable(v) for k, v in obj.items()}
+    return obj
+
+
+def _summary(obj: Any):
+    if isinstance(obj, torch.Tensor):
+        return {
+            "kind": "tensor",
+            "shape": list(obj.shape),
+            "dtype": str(obj.dtype),
+            "numel": int(obj.numel()),
+        }
+    if isinstance(obj, (list, tuple)):
+        return {"kind": type(obj).__name__, "items": [_summary(x) for x in obj]}
+    if isinstance(obj, dict):
+        return {"kind": "dict", "items": {str(k): _summary(v) for k, v in obj.items()}}
+    return {"kind": type(obj).__name__}
+
+
+def _percentile(values: list[float], q: float) -> float:
+    if not values:
+        return 0.0
+    arr = np.array(values, dtype=np.float64)
+    return float(np.percentile(arr, q))
+
+
+def _gen_frame_with_step_attribution(engine: WorldEngine, ctrl: CtrlInput) -> tuple[torch.Tensor, list[float], float]:
+    """Manual gen-frame path used only for denoise step timing attribution."""
+    x = torch.randn(engine.frm_shape, device=engine.device, dtype=engine.dtype)
+    inputs = engine.prep_inputs(x=x, ctrl=ctrl)
+    kv_cache = engine.kv_cache
+    kv_cache.set_frozen(True)
+    bt = (x.size(0), x.size(1))
+    step_times_ms: list[float] = []
+    for i in range(engine.scheduler_dsigmas.numel()):
+        sigma_bt = engine.scheduler_step_sigmas[i].expand(bt)
+        t0 = time.perf_counter()
+        v = engine.model(x, sigma_bt, **inputs, kv_cache=kv_cache, ctrl_cond=True, prompt_cond=True)
+        _sync_if_mps(engine.device)
+        t1 = time.perf_counter()
+        step_times_ms.append((t1 - t0) * 1000.0)
+        x = x + engine.scheduler_dsigmas[i] * v
+
+    cache_ms = 0.0
+    if (engine._gen_count % engine.cache_interval) == 0:
+        t0 = time.perf_counter()
+        engine._cache_pass_fn(x, inputs, kv_cache)
+        _sync_if_mps(engine.device)
+        t1 = time.perf_counter()
+        cache_ms = (t1 - t0) * 1000.0
+    engine._gen_count += 1
+    return engine.vae.decode(x.squeeze(1)), step_times_ms, cache_ms
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-uri", default="Overworld-Models/Lapp0-WP-Mini-1.4.5-BL-Distill")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--seed-url", default="")
+    parser.add_argument("--device", default="mps")
+    parser.add_argument("--dtype", default="bfloat16", choices=["bfloat16", "float16", "float32"])
+    parser.add_argument("--quant", default="none", choices=["none", "w8a8", "nvfp4"])
+    parser.add_argument("--profile-steps", type=int, default=64)
+    parser.add_argument("--dump-phases", default="append,gen1")
+    parser.add_argument("--output-dir", default="diagnostics/out/metal_profile_baseline")
+    parser.add_argument("--write-video", action="store_true")
+    parser.add_argument("--module-timing", action="store_true")
+    parser.add_argument("--denoise-attribution", action="store_true")
+    parser.add_argument("--manifest-note", default="")
+    args = parser.parse_args()
+
+    os.environ.setdefault("HF_HUB_OFFLINE", "1")
+    os.environ.setdefault("TRANSFORMERS_OFFLINE", "1")
+    os.environ.setdefault("TORCHDYNAMO_DISABLE", "1")
+    os.environ.setdefault("WORLD_ATTENTION_BACKEND", "metal")
+    os.environ.setdefault("WORLD_METAL_IMPL", "fast")
+    os.environ.setdefault("WORLD_METAL_FAST_NO_FALLBACK", "1")
+    os.environ.setdefault("WORLD_METAL_PREFER_ACTIVE_DISPATCH", "1")
+    os.environ.setdefault("WORLD_KV_RUNTIME_CHECKS", "0")
+    os.environ.setdefault("WORLD_KV_COMPUTE_ACTIVE_BLOCKS", "0")
+
+    ensure_metal_attention_op_loaded()
+
+    dtype_map = {
+        "bfloat16": torch.bfloat16,
+        "float16": torch.float16,
+        "float32": torch.float32,
+    }
+    dtype = dtype_map[args.dtype]
+
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+
+    output_dir = Path(args.output_dir)
+    tensor_dir = output_dir / "tensors"
+    output_dir.mkdir(parents=True, exist_ok=True)
+    tensor_dir.mkdir(parents=True, exist_ok=True)
+
+    seed_url = args.seed_url if args.seed_url else random.choice(SEED_FRAME_URLS)
+    frame = _load_seed_frame(seed_url)
+    seed = torch.from_numpy(np.repeat(frame[None], 4, axis=0))
+
+    quant = None if args.quant == "none" else args.quant
+    engine = WorldEngine(args.model_uri, device=args.device, dtype=dtype, quant=quant)
+    quant_report = getattr(engine.model, "_quantization_report", None)
+    # Compatibility for restored world_engine path.
+    if hasattr(engine, "ts_mult"):
+        engine.ts_mult = int(engine.ts_mult)
+
+    dump_phases = {x.strip() for x in args.dump_phases.split(",") if x.strip()}
+    current_phase = {"name": None}
+    dump_index: list[dict[str, Any]] = []
+    seen: set[tuple[str, str]] = set()
+    module_timing_enabled = bool(args.module_timing)
+    denoise_attr_enabled = bool(args.denoise_attribution)
+    module_timing: dict[str, dict[str, Any]] = {}
+    start_times: dict[str, list[float]] = {}
+
+    hooks = []
+    for mod_name, mod in engine.model.named_modules():
+        def _mk_hook(name: str, module: torch.nn.Module):
+            def _hook(_m, _inp, out):
+                phase = current_phase["name"]
+                if phase is None or phase not in dump_phases:
+                    return
+                key = (phase, name)
+                if key in seen:
+                    return
+                seen.add(key)
+                data = _to_dumpable(out)
+                summary = _summary(data)
+                fname = f"{phase}__{_sanitize(name) if name else 'root'}__{len(dump_index):04d}.pt"
+                path = tensor_dir / fname
+                torch.save(data, path)
+                dump_index.append(
+                    {
+                        "phase": phase,
+                        "module_name": name,
+                        "module_type": module.__class__.__name__,
+                        "file": str(path),
+                        "summary": summary,
+                    }
+                )
+            return _hook
+        hooks.append(mod.register_forward_hook(_mk_hook(mod_name, mod)))
+        if module_timing_enabled:
+            def _mk_pre_hook(name: str):
+                def _pre(_m, _inp):
+                    start_times.setdefault(name, []).append(time.perf_counter())
+                return _pre
+
+            def _mk_post_timing_hook(name: str, module: torch.nn.Module):
+                def _post(_m, _inp, _out):
+                    t0_list = start_times.get(name)
+                    if not t0_list:
+                        return
+                    t0 = t0_list.pop()
+                    dt_ms = (time.perf_counter() - t0) * 1000.0
+                    rec = module_timing.setdefault(
+                        name,
+                        {
+                            "module_name": name,
+                            "module_type": module.__class__.__name__,
+                            "count": 0,
+                            "durations_ms": [],
+                        },
+                    )
+                    rec["count"] += 1
+                    rec["durations_ms"].append(float(dt_ms))
+                return _post
+
+            hooks.append(mod.register_forward_pre_hook(_mk_pre_hook(mod_name)))
+            hooks.append(mod.register_forward_hook(_mk_post_timing_hook(mod_name, mod)))
+
+    timings = {}
+    with torch.inference_mode():
+        t0 = time.perf_counter()
+        current_phase["name"] = "append"
+        engine.append_frame(seed.to(engine.device))
+        _sync_if_mps(engine.device)
+        t1 = time.perf_counter()
+        timings["append_s"] = t1 - t0
+
+        ctrl_seq = _controller_sequence(args.profile_steps)
+        gen_times = []
+        current_phase["name"] = "gen1"
+        g0 = time.perf_counter()
+        denoise_step_times_ms: list[float] = []
+        first_cache_ms = 0.0
+        if denoise_attr_enabled:
+            first, denoise_step_times_ms, first_cache_ms = _gen_frame_with_step_attribution(engine, ctrl_seq[0])
+        else:
+            first = engine.gen_frame(ctrl=ctrl_seq[0])
+        _sync_if_mps(engine.device)
+        g1 = time.perf_counter()
+        gen_times.append(g1 - g0)
+        current_phase["name"] = None
+
+        video_path = output_dir / "profile_run.mp4"
+        writer = None
+        if args.write_video:
+            writer = iio.imopen(str(video_path), "w", plugin="pyav")
+            writer.write(first.cpu().numpy(), fps=60, codec="libx264")
+
+        for ctrl in ctrl_seq[1:]:
+            g0 = time.perf_counter()
+            frm = engine.gen_frame(ctrl=ctrl)
+            _sync_if_mps(engine.device)
+            g1 = time.perf_counter()
+            gen_times.append(g1 - g0)
+            if writer is not None:
+                writer.write(frm.cpu().numpy())
+
+        if writer is not None:
+            writer.close()
+            timings["video_path"] = str(video_path)
+
+    for h in hooks:
+        h.remove()
+
+    gen_arr = np.array(gen_times, dtype=np.float64)
+    timings.update(
+        {
+            "gen_frames": int(len(gen_times)),
+            "gen_first_s": float(gen_arr[0]),
+            "gen_mean_s": float(gen_arr.mean()),
+            "gen_p50_s": float(np.percentile(gen_arr, 50)),
+            "gen_p90_s": float(np.percentile(gen_arr, 90)),
+            "gen_min_s": float(gen_arr.min()),
+            "gen_max_s": float(gen_arr.max()),
+            "gen_fps_mean": float(1.0 / gen_arr.mean()) if gen_arr.mean() > 0 else 0.0,
+        }
+    )
+
+    module_timing_report = {
+        "enabled": module_timing_enabled,
+        "modules": [],
+    }
+    if module_timing_enabled:
+        rows = []
+        for name, rec in module_timing.items():
+            durs = [float(x) for x in rec["durations_ms"]]
+            rows.append(
+                {
+                    "module_name": name,
+                    "module_type": rec["module_type"],
+                    "count": int(rec["count"]),
+                    "total_ms": float(sum(durs)),
+                    "mean_ms": float(sum(durs) / len(durs)) if durs else 0.0,
+                    "p50_ms": _percentile(durs, 50),
+                    "p95_ms": _percentile(durs, 95),
+                }
+            )
+        rows.sort(key=lambda x: x["total_ms"], reverse=True)
+        module_timing_report["modules"] = rows
+
+    denoise_attr_report = {
+        "enabled": denoise_attr_enabled,
+        "step_times_ms": [],
+        "first_frame_cache_ms": 0.0,
+        "per_block_component_total_ms": [],
+    }
+    if denoise_attr_enabled:
+        denoise_attr_report["step_times_ms"] = denoise_step_times_ms
+        denoise_attr_report["first_frame_cache_ms"] = float(first_cache_ms)
+        component_rows = []
+        for row in module_timing_report.get("modules", []):
+            name = row.get("module_name", "")
+            if ".attn" in name or ".mlp" in name:
+                component_rows.append(
+                    {
+                        "module_name": name,
+                        "module_type": row.get("module_type", ""),
+                        "total_ms": float(row.get("total_ms", 0.0)),
+                        "mean_ms": float(row.get("mean_ms", 0.0)),
+                    }
+                )
+        denoise_attr_report["per_block_component_total_ms"] = component_rows[:100]
+
+    # Torch profiler snapshot around one gen frame.
+    with torch.inference_mode():
+        with torch.profiler.profile(
+            activities=[torch.profiler.ProfilerActivity.CPU],
+            record_shapes=True,
+            profile_memory=True,
+        ) as prof:
+            _ = engine.gen_frame(ctrl=CtrlInput())
+            _sync_if_mps(engine.device)
+    prof_table = prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=120)
+    prof_path = output_dir / "torch_profiler_top_ops.txt"
+    prof_path.write_text(prof_table, encoding="utf-8")
+    module_timing_path = output_dir / "module_timing_report.json"
+    module_timing_path.write_text(json.dumps(module_timing_report, indent=2), encoding="utf-8")
+    denoise_attr_path = output_dir / "denoise_attribution_report.json"
+    denoise_attr_path.write_text(json.dumps(denoise_attr_report, indent=2), encoding="utf-8")
+
+    git_sha = ""
+    try:
+        git_sha = (
+            subprocess.check_output(["git", "rev-parse", "HEAD"], text=True, cwd=str(Path(__file__).resolve().parents[1]))
+            .strip()
+        )
+    except Exception:
+        git_sha = ""
+    run_manifest = {
+        "git_sha": git_sha,
+        "model_uri": args.model_uri,
+        "seed": args.seed,
+        "seed_url": seed_url,
+        "device": args.device,
+        "dtype": args.dtype,
+        "quant": args.quant,
+        "profile_steps": args.profile_steps,
+        "dump_phases": args.dump_phases,
+        "module_timing": module_timing_enabled,
+        "write_video": bool(args.write_video),
+        "manifest_note": args.manifest_note,
+        "timestamp_unix_s": time.time(),
+        "quantization_report": quant_report,
+        "env": {
+            "WORLD_ATTENTION_BACKEND": os.environ.get("WORLD_ATTENTION_BACKEND"),
+            "WORLD_METAL_IMPL": os.environ.get("WORLD_METAL_IMPL"),
+            "WORLD_METAL_FAST_NO_FALLBACK": os.environ.get("WORLD_METAL_FAST_NO_FALLBACK"),
+            "WORLD_METAL_PREFER_ACTIVE_DISPATCH": os.environ.get("WORLD_METAL_PREFER_ACTIVE_DISPATCH"),
+            "TORCHDYNAMO_DISABLE": os.environ.get("TORCHDYNAMO_DISABLE"),
+            "WORLD_HYBRID_COMPILE_METAL": os.environ.get("WORLD_HYBRID_COMPILE_METAL"),
+            "WORLD_FORCE_COMPILE_METAL": os.environ.get("WORLD_FORCE_COMPILE_METAL"),
+        },
+    }
+    manifest_path = output_dir / "run_manifest.json"
+    manifest_path.write_text(json.dumps(run_manifest, indent=2), encoding="utf-8")
+
+    report = {
+        "model_uri": args.model_uri,
+        "seed": args.seed,
+        "seed_url": seed_url,
+        "device": args.device,
+        "dtype": args.dtype,
+        "quant": args.quant,
+        "env": {
+            "WORLD_ATTENTION_BACKEND": os.environ.get("WORLD_ATTENTION_BACKEND"),
+            "WORLD_METAL_IMPL": os.environ.get("WORLD_METAL_IMPL"),
+            "WORLD_METAL_FAST_NO_FALLBACK": os.environ.get("WORLD_METAL_FAST_NO_FALLBACK"),
+            "WORLD_METAL_PREFER_ACTIVE_DISPATCH": os.environ.get("WORLD_METAL_PREFER_ACTIVE_DISPATCH"),
+            "TORCHDYNAMO_DISABLE": os.environ.get("TORCHDYNAMO_DISABLE"),
+            "WORLD_HYBRID_COMPILE_METAL": os.environ.get("WORLD_HYBRID_COMPILE_METAL"),
+            "WORLD_FORCE_COMPILE_METAL": os.environ.get("WORLD_FORCE_COMPILE_METAL"),
+        },
+        "timings": timings,
+        "quantization_report": quant_report,
+        "tensor_dump_count": len(dump_index),
+        "tensor_dump_index_path": str(output_dir / "tensor_dump_index.json"),
+        "module_timing_path": str(module_timing_path),
+        "denoise_attribution_path": str(denoise_attr_path),
+        "profiler_path": str(prof_path),
+        "run_manifest_path": str(manifest_path),
+    }
+
+    (output_dir / "tensor_dump_index.json").write_text(json.dumps(dump_index, indent=2), encoding="utf-8")
+    (output_dir / "profile_report.json").write_text(json.dumps(report, indent=2), encoding="utf-8")
+    print(json.dumps(report, indent=2))
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/tests/run_optimization_gate.py b/tests/run_optimization_gate.py
new file mode 100644
index 0000000..f06571b
--- /dev/null
+++ b/tests/run_optimization_gate.py
@@ -0,0 +1,473 @@
+import argparse
+import json
+import os
+import subprocess
+import sys
+import statistics
+from pathlib import Path
+import torch
+
+
+def _run(
+    cmd: list[str],
+    env: dict[str, str],
+    log_path: Path | None = None,
+    check: bool = True,
+) -> None:
+    if log_path is None:
+        subprocess.run(cmd, check=check, env=env)
+        return
+    log_path.parent.mkdir(parents=True, exist_ok=True)
+    proc = subprocess.run(cmd, check=False, env=env, capture_output=True, text=True)
+    blob = f"$ {' '.join(cmd)}\n{proc.stdout}\n{proc.stderr}"
+    log_path.write_text(blob, encoding="utf-8")
+    if check and proc.returncode != 0:
+        raise subprocess.CalledProcessError(proc.returncode, cmd, output=proc.stdout, stderr=proc.stderr)
+
+
+def _load_json(path: Path) -> dict:
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def _preset_defaults(preset: str) -> dict[str, str]:
+    if preset == "fp16":
+        return {
+            "dtype": "float16",
+            "config": "tests/optimization_gate_config_fp16.json",
+            "baseline_dump_dir": "diagnostics/out/fp16_baseline",
+            "baseline_perf_report": "diagnostics/out/fp16_baseline/profile_report.json",
+        }
+    return {}
+
+
+def _safety_check(dump_dir: Path) -> dict:
+    index_path = dump_dir / "tensor_dump_index.json"
+    if not index_path.exists():
+        return {"checked": False, "reason": "tensor_dump_index_missing", "pass": False}
+    idx = json.loads(index_path.read_text(encoding="utf-8"))
+    nonfinite = 0
+    tensors_checked = 0
+
+    def _check_obj(obj):
+        nonlocal nonfinite, tensors_checked
+        if isinstance(obj, torch.Tensor):
+            tensors_checked += 1
+            if obj.is_floating_point():
+                finite = torch.isfinite(obj).all().item()
+                if not bool(finite):
+                    nonfinite += 1
+            return
+        if isinstance(obj, (list, tuple)):
+            for x in obj:
+                _check_obj(x)
+            return
+        if isinstance(obj, dict):
+            for x in obj.values():
+                _check_obj(x)
+
+    for entry in idx:
+        p = Path(entry["file"])
+        if not p.exists():
+            continue
+        obj = torch.load(p, map_location="cpu")
+        _check_obj(obj)
+    return {
+        "checked": True,
+        "tensors_checked": tensors_checked,
+        "nonfinite_tensors": nonfinite,
+        "pass": nonfinite == 0,
+    }
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--preset", default="none", choices=["none", "fp16"])
+    parser.add_argument("--model-uri", default="Overworld-Models/Lapp0-WP-Mini-1.4.5-BL-Distill")
+    parser.add_argument("--device", default="mps")
+    parser.add_argument("--dtype", default="bfloat16", choices=["bfloat16", "float16", "float32"])
+    parser.add_argument("--quant", default="none", choices=["none", "w8a8", "nvfp4"])
+    parser.add_argument("--profile-steps", type=int, default=32)
+    parser.add_argument("--perf-repeats", type=int, default=1)
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--seed-url", default="https://gist.github.com/user-attachments/assets/f9c20d4d-7565-452d-8b02-42a85ea175ed")
+    parser.add_argument("--baseline-dump-dir", default="diagnostics/out/metal_profile_baseline")
+    parser.add_argument("--baseline-perf-report", default="diagnostics/out/metal_profile_perf_only/profile_report.json")
+    parser.add_argument("--output-dir", default="diagnostics/out/optimization_gate_run")
+    parser.add_argument("--config", default="tests/optimization_gate_config.json")
+    parser.add_argument("--strict", action="store_true")
+    parser.add_argument("--visual-review-on-fail", action=argparse.BooleanOptionalAction, default=True)
+    parser.add_argument("--visual-review-frames", type=int, default=32)
+    parser.add_argument("--hybrid-compile", action="store_true")
+    parser.add_argument("--force-compile", action="store_true")
+    parser.add_argument("--capture-recompiles", action="store_true")
+    parser.add_argument("--preclean-python", action="store_true")
+    parser.add_argument("--isolate-ext-build", action="store_true")
+    args = parser.parse_args()
+
+    defaults = _preset_defaults(args.preset)
+    if defaults:
+        args.dtype = defaults["dtype"]
+        args.config = defaults["config"]
+        if parser.get_default("baseline_dump_dir") == args.baseline_dump_dir:
+            args.baseline_dump_dir = defaults["baseline_dump_dir"]
+        if parser.get_default("baseline_perf_report") == args.baseline_perf_report:
+            args.baseline_perf_report = defaults["baseline_perf_report"]
+
+    output_dir = Path(args.output_dir)
+    perf_dir = output_dir / "perf"
+    dump_dir = output_dir / "dump"
+    compare_quick_dir = output_dir / "compare_quick"
+    compare_full_dir = output_dir / "compare_full"
+    for d in [output_dir, perf_dir, dump_dir, compare_quick_dir, compare_full_dir]:
+        d.mkdir(parents=True, exist_ok=True)
+
+    cfg = _load_json(Path(args.config))
+    quick_cfg = cfg["quick_gate"]
+    full_cfg = cfg["full_gate"]
+    perf_cfg = cfg["performance_gate"]
+
+    env = os.environ.copy()
+    env.setdefault("HF_HUB_OFFLINE", "1")
+    env.setdefault("TRANSFORMERS_OFFLINE", "1")
+    env.setdefault("TORCHDYNAMO_DISABLE", "0")
+    env.setdefault("WORLD_ATTENTION_BACKEND", "metal")
+    env.setdefault("WORLD_METAL_IMPL", "fast")
+    env.setdefault("WORLD_METAL_FAST_NO_FALLBACK", "1")
+    env.setdefault("WORLD_METAL_PREFER_ACTIVE_DISPATCH", "1")
+    env.setdefault("WORLD_KV_RUNTIME_CHECKS", "0")
+    env.setdefault("WORLD_KV_COMPUTE_ACTIVE_BLOCKS", "0")
+    env.setdefault("PYTHONPATH", ".")
+    env["WORLD_HYBRID_COMPILE_METAL"] = "1" if args.hybrid_compile else "0"
+    env["WORLD_FORCE_COMPILE_METAL"] = "1" if args.force_compile else "0"
+    if args.preclean_python:
+        subprocess.run(["pkill", "-9", "-f", "/opt/homebrew/Cellar/python@3.14"], check=False)
+        subprocess.run(["pkill", "-9", "-f", "/Users/louiscastricato/overworld/world_engine/.venv/bin/python"], check=False)
+    if args.isolate_ext_build:
+        ext_dir = output_dir / "torch_extensions"
+        ext_dir.mkdir(parents=True, exist_ok=True)
+        env["TORCH_EXTENSIONS_DIR"] = str(ext_dir)
+        env.setdefault("NINJA", "/Users/louiscastricato/overworld/world_engine/.venv/bin/ninja")
+    recompile_dir = output_dir / "recompile_logs"
+    if args.capture_recompiles:
+        env["TORCH_LOGS"] = "recompiles"
+        env.setdefault("TORCHDYNAMO_VERBOSE", "1")
+
+    py = sys.executable
+    profile_script = "tests/profile_and_dump_variant_metal.py"
+    compare_script = "tests/compare_tensor_dumps.py"
+    hotspot_script = "tests/summarize_hotspots.py"
+    video_script = "tests/gen_world_variant_metal_save.py"
+    bench_script = "tests/bench_world_engine_e2e.py"
+
+    # 1) Perf-only run (repeat and aggregate median)
+    perf_runs_dir = output_dir / "perf_runs"
+    perf_runs_dir.mkdir(parents=True, exist_ok=True)
+    perf_reports = []
+    repeats = max(1, int(args.perf_repeats))
+    for i in range(repeats):
+        run_dir = perf_runs_dir / f"run_{i:02d}"
+        _run(
+            [
+                py,
+                profile_script,
+                "--model-uri",
+                args.model_uri,
+                "--device",
+                args.device,
+                "--dtype",
+                args.dtype,
+                "--profile-steps",
+                str(args.profile_steps),
+                "--quant",
+                args.quant,
+                "--seed",
+                str(args.seed + i),
+                "--seed-url",
+                args.seed_url,
+                "--dump-phases",
+                "none",
+                "--output-dir",
+                str(run_dir),
+                "--manifest-note",
+                f"optimization_gate_perf_run_{i:02d}",
+            ],
+            env,
+            (recompile_dir / f"perf_run_{i:02d}.log") if args.capture_recompiles else None,
+        )
+        perf_reports.append(_load_json(run_dir / "profile_report.json"))
+
+    gen_mean_values = [float(r["timings"]["gen_mean_s"]) for r in perf_reports]
+    gen_p90_values = [float(r["timings"]["gen_p90_s"]) for r in perf_reports]
+    perf_aggregate = {
+        "perf_repeats": repeats,
+        "gen_mean_s_values": gen_mean_values,
+        "gen_p90_s_values": gen_p90_values,
+        "gen_mean_s_median": float(statistics.median(gen_mean_values)),
+        "gen_p90_s_median": float(statistics.median(gen_p90_values)),
+    }
+    (perf_dir / "perf_aggregate.json").write_text(json.dumps(perf_aggregate, indent=2), encoding="utf-8")
+    # Keep a convenience report path for downstream consumers.
+    (perf_dir / "profile_report.json").write_text(json.dumps(perf_reports[-1], indent=2), encoding="utf-8")
+
+    # 1.5) Phase timing and latent/decoded FPS via e2e bench script.
+    latent_json = perf_dir / "bench_latent.json"
+    decoded_json = perf_dir / "bench_decoded.json"
+    _run(
+        [
+            py,
+            bench_script,
+            "--model-uri",
+            args.model_uri,
+            "--device",
+            args.device,
+            "--attention-backend",
+            "metal",
+            "--dtype",
+            args.dtype,
+            "--quant",
+            args.quant,
+            "--frames",
+            str(args.profile_steps),
+            "--json-out",
+            str(latent_json),
+        ],
+        env,
+        (recompile_dir / "bench_latent.log") if args.capture_recompiles else None,
+    )
+    _run(
+        [
+            py,
+            bench_script,
+            "--model-uri",
+            args.model_uri,
+            "--device",
+            args.device,
+            "--attention-backend",
+            "metal",
+            "--dtype",
+            args.dtype,
+            "--quant",
+            args.quant,
+            "--frames",
+            str(args.profile_steps),
+            "--return-img",
+            "--json-out",
+            str(decoded_json),
+        ],
+        env,
+        (recompile_dir / "bench_decoded.log") if args.capture_recompiles else None,
+    )
+
+    # 2) Dump run + module timing
+    _run(
+        [
+            py,
+            profile_script,
+            "--model-uri",
+            args.model_uri,
+            "--device",
+            args.device,
+            "--dtype",
+            args.dtype,
+            "--profile-steps",
+            str(args.profile_steps),
+            "--quant",
+            args.quant,
+            "--seed",
+            str(args.seed),
+            "--seed-url",
+            args.seed_url,
+            "--dump-phases",
+            "append,gen1",
+            "--module-timing",
+            "--output-dir",
+            str(dump_dir),
+            "--manifest-note",
+            "optimization_gate_dump",
+        ],
+        env,
+        (recompile_dir / "dump.log") if args.capture_recompiles else None,
+    )
+
+    # 2.5) Hotspot ranking summary from profiler + module timing.
+    _run(
+        [
+            py,
+            hotspot_script,
+            "--profiler",
+            str(dump_dir / "torch_profiler_top_ops.txt"),
+            "--module-timing",
+            str(dump_dir / "module_timing_report.json"),
+            "--out",
+            str(output_dir / "hotspot_summary.json"),
+            "--top-k",
+            "10",
+        ],
+        env,
+        (recompile_dir / "hotspots.log") if args.capture_recompiles else None,
+    )
+
+    # 3) Quick compare gate (sentinel modules)
+    _run(
+        [
+            py,
+            compare_script,
+            "--baseline-dir",
+            args.baseline_dump_dir,
+            "--candidate-dir",
+            str(dump_dir),
+            "--phase",
+            "all",
+            "--modules-regex",
+            quick_cfg["sentinel_modules_regex"],
+            "--cosine-min",
+            str(quick_cfg["cosine_min"]),
+            "--mae-max",
+            str(quick_cfg["mae_max"]),
+            "--rmse-max",
+            str(quick_cfg["rmse_max"]),
+            "--max-abs-max",
+            str(quick_cfg["max_abs_max"]),
+            "--out-dir",
+            str(compare_quick_dir),
+        ],
+        env,
+        (recompile_dir / "compare_quick.log") if args.capture_recompiles else None,
+        check=False,
+    )
+
+    # 4) Full compare gate
+    _run(
+        [
+            py,
+            compare_script,
+            "--baseline-dir",
+            args.baseline_dump_dir,
+            "--candidate-dir",
+            str(dump_dir),
+            "--phase",
+            "all",
+            "--cosine-min",
+            str(full_cfg["cosine_min"]),
+            "--mae-max",
+            str(full_cfg["mae_max"]),
+            "--rmse-max",
+            str(full_cfg["rmse_max"]),
+            "--max-abs-max",
+            str(full_cfg["max_abs_max"]),
+            "--out-dir",
+            str(compare_full_dir),
+        ],
+        env,
+        (recompile_dir / "compare_full.log") if args.capture_recompiles else None,
+        check=False,
+    )
+
+    base_perf = _load_json(Path(args.baseline_perf_report))
+    cur_perf = _load_json(perf_dir / "perf_aggregate.json")
+    quick_cmp = _load_json(compare_quick_dir / "comparison_summary.json")
+    full_cmp = _load_json(compare_full_dir / "comparison_summary.json")
+    safety = _safety_check(dump_dir)
+    bench_latent = _load_json(latent_json)
+    bench_decoded = _load_json(decoded_json)
+
+    base_mean = float(base_perf["timings"]["gen_mean_s"])
+    cur_mean = float(cur_perf["gen_mean_s_median"])
+    base_p90 = float(base_perf["timings"]["gen_p90_s"])
+    cur_p90 = float(cur_perf["gen_p90_s_median"])
+    improvement_pct = ((base_mean - cur_mean) / base_mean) * 100.0 if base_mean > 0 else 0.0
+    p90_regression_pct = ((cur_p90 - base_p90) / base_p90) * 100.0 if base_p90 > 0 else 0.0
+
+    perf_pass = (
+        improvement_pct >= float(perf_cfg["min_median_improvement_pct"])
+        and p90_regression_pct <= float(perf_cfg["max_p90_regression_pct"])
+    )
+    correctness_pass = bool(quick_cmp["pass"]) and bool(full_cmp["pass"])
+    safety_pass = bool(safety.get("pass", False))
+    overall_pass = correctness_pass and perf_pass and safety_pass
+    visual_review_video = ""
+    visual_review_required = False
+    if args.visual_review_on_fail and (not correctness_pass or not safety_pass):
+        visual_review_required = True
+        visual_review_video = str(output_dir / "visual_review_fail.mp4")
+        _run(
+            [
+                py,
+                video_script,
+                "--model-uri",
+                args.model_uri,
+                "--device",
+                args.device,
+                "--dtype",
+                args.dtype,
+                "--quant",
+                args.quant,
+                "--frames",
+                str(args.visual_review_frames),
+                "--seed",
+                str(args.seed),
+                "--seed-url",
+                args.seed_url,
+                "--out",
+                visual_review_video,
+            ],
+            env,
+            (recompile_dir / "visual_review.log") if args.capture_recompiles else None,
+        )
+
+    report = {
+        "baseline_perf_report": args.baseline_perf_report,
+        "current_perf_report": str(perf_dir / "profile_report.json"),
+        "current_perf_aggregate": str(perf_dir / "perf_aggregate.json"),
+        "quick_compare_summary": str(compare_quick_dir / "comparison_summary.json"),
+        "full_compare_summary": str(compare_full_dir / "comparison_summary.json"),
+        "hotspot_summary": str(output_dir / "hotspot_summary.json"),
+        "metrics": {
+            "baseline_gen_mean_s": base_mean,
+            "current_gen_mean_s": cur_mean,
+            "improvement_pct": improvement_pct,
+            "baseline_gen_p90_s": base_p90,
+            "current_gen_p90_s": cur_p90,
+            "p90_regression_pct": p90_regression_pct,
+        },
+        "runtime": {
+            "quant": args.quant,
+            "hybrid_compile": bool(args.hybrid_compile),
+            "force_compile": bool(args.force_compile),
+            "bench_latent_json": str(latent_json),
+            "bench_decoded_json": str(decoded_json),
+            "bench_latent_fps_mean": float(bench_latent["fps"]["mean"]),
+            "bench_decoded_fps_mean": float(bench_decoded["fps"]["mean"]),
+            "bench_phase_mean_ms": {
+                "prep": float(bench_decoded["prep_ms"]["mean"]),
+                "denoise": float(bench_decoded["denoise_ms"]["mean"]),
+                "cache": float(bench_decoded["cache_ms"]["mean"]),
+                "decode": float(bench_decoded["decode_ms"]["mean"]),
+            },
+        },
+        "gates": {
+            "quick_correctness_pass": bool(quick_cmp["pass"]),
+            "full_correctness_pass": bool(full_cmp["pass"]),
+            "correctness_pass": correctness_pass,
+            "safety_pass": safety_pass,
+            "performance_pass": perf_pass,
+            "overall_pass": overall_pass,
+            "visual_review_required": visual_review_required,
+        },
+        "safety": safety,
+        "visual_review_video": visual_review_video,
+        "recompile_log_dir": str(recompile_dir) if args.capture_recompiles else "",
+        "thresholds": cfg,
+    }
+
+    gate_path = output_dir / "gate_report.json"
+    gate_path.write_text(json.dumps(report, indent=2), encoding="utf-8")
+    print(json.dumps(report, indent=2))
+
+    if args.strict and not overall_pass:
+        raise SystemExit(3)
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/tests/summarize_hotspots.py b/tests/summarize_hotspots.py
new file mode 100644
index 0000000..0f7f3aa
--- /dev/null
+++ b/tests/summarize_hotspots.py
@@ -0,0 +1,55 @@
+import argparse
+import json
+from pathlib import Path
+
+
+def _parse_profiler_top_ops(path: Path, top_k: int) -> list[dict]:
+    lines = path.read_text(encoding="utf-8").splitlines()
+    ops = []
+    for line in lines:
+        if "aten::" not in line and "world::" not in line:
+            continue
+        cols = [c.strip() for c in line.split("  ") if c.strip()]
+        if len(cols) < 5:
+            continue
+        try:
+            name = cols[0]
+            self_cpu = cols[2]
+            cpu_total = cols[4]
+            ops.append({"name": name, "self_cpu": self_cpu, "cpu_total": cpu_total})
+        except Exception:
+            continue
+    return ops[: max(0, top_k)]
+
+
+def _parse_module_timing(path: Path, top_k: int) -> list[dict]:
+    data = json.loads(path.read_text(encoding="utf-8"))
+    modules = data.get("modules", [])
+    modules = sorted(modules, key=lambda x: float(x.get("total_ms", 0.0)), reverse=True)
+    return modules[: max(0, top_k)]
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--profiler", required=True)
+    parser.add_argument("--module-timing", required=True)
+    parser.add_argument("--out", required=True)
+    parser.add_argument("--top-k", type=int, default=10)
+    args = parser.parse_args()
+
+    profiler_path = Path(args.profiler)
+    module_timing_path = Path(args.module_timing)
+    out_path = Path(args.out)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+
+    summary = {
+        "top_ops": _parse_profiler_top_ops(profiler_path, args.top_k),
+        "top_modules": _parse_module_timing(module_timing_path, args.top_k),
+    }
+    out_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")
+    print(json.dumps(summary, indent=2))
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/tests/test_attn_backend_cross_backend.py b/tests/test_attn_backend_cross_backend.py
new file mode 100644
index 0000000..8ee7a9c
--- /dev/null
+++ b/tests/test_attn_backend_cross_backend.py
@@ -0,0 +1,88 @@
+import math
+
+import pytest
+import torch
+
+from attn_backend import AttnBackend, AttnConfig, AttnMeta, world_flex_attn_forward
+from metal_test_utils import require_metal_attn_ops
+
+
+def _reference_attention(q, k, v, causal: bool):
+    qf, kf, vf = q.to(torch.float32), k.to(torch.float32), v.to(torch.float32)
+    if qf.size(1) != kf.size(1):
+        if qf.size(1) < kf.size(1) or (qf.size(1) % kf.size(1)) != 0:
+            raise RuntimeError("GQA requires q_heads divisible by kv_heads")
+        group = qf.size(1) // kf.size(1)
+        head_idx = torch.arange(qf.size(1), device=q.device, dtype=torch.long) // group
+        kf = kf.index_select(1, head_idx)
+        vf = vf.index_select(1, head_idx)
+    scores = torch.matmul(qf, kf.transpose(-2, -1)) / math.sqrt(qf.size(-1))
+    if causal:
+        t, l = qf.size(-2), kf.size(-2)
+        tri = torch.triu(torch.ones((t, l), device=q.device, dtype=torch.bool), diagonal=1)
+        scores = scores.masked_fill(tri[None, None], float("-inf"))
+    finite = torch.isfinite(scores).any(dim=-1, keepdim=True)
+    scores = torch.where(finite, scores, torch.zeros_like(scores))
+    probs = torch.softmax(scores, dim=-1)
+    probs = torch.where(finite, probs, torch.zeros_like(probs))
+    return torch.matmul(probs, vf).to(q.dtype)
+
+
+def test_auto_backend_uses_pytorch_flex_on_cpu():
+    q = torch.randn(1, 2, 8, 32, device="cpu", dtype=torch.float32)
+    k = torch.randn(1, 2, 8, 32, device="cpu", dtype=torch.float32)
+    v = torch.randn(1, 2, 8, 32, device="cpu", dtype=torch.float32)
+    out = world_flex_attn_forward(
+        q, k, v, AttnMeta(flex_block_mask=None, q_len=8, kv_len=8), AttnConfig(causal=True), backend=AttnBackend.AUTO
+    )
+    expected = world_flex_attn_forward(
+        q, k, v, AttnMeta(flex_block_mask=None, q_len=8, kv_len=8), AttnConfig(causal=True), backend=AttnBackend.PYTORCH_FLEX
+    )
+    assert torch.allclose(out, expected, atol=1e-6, rtol=1e-6)
+
+
+@pytest.mark.parametrize("causal", [False])
+@pytest.mark.parametrize("shape", [(1, 4, 4, 16, 16, 32), (1, 8, 2, 12, 20, 64)])
+def test_pytorch_flex_matches_reference(shape, causal):
+    b, hq, hkv, t, l, dh = shape
+    q = torch.randn(b, hq, t, dh, device="cpu", dtype=torch.float32)
+    k = torch.randn(b, hkv, l, dh, device="cpu", dtype=torch.float32)
+    v = torch.randn(b, hkv, l, dh, device="cpu", dtype=torch.float32)
+    out = world_flex_attn_forward(
+        q, k, v, AttnMeta(flex_block_mask=None, q_len=t, kv_len=l), AttnConfig(causal=causal, enable_gqa=(hq != hkv)),
+        backend=AttnBackend.PYTORCH_FLEX,
+    )
+    ref = _reference_attention(q, k, v, causal=causal)
+    assert torch.allclose(out, ref, atol=1e-4, rtol=1e-4)
+
+
+@pytest.mark.skipif(not torch.backends.mps.is_available(), reason="MPS backend not available")
+def test_auto_backend_uses_metal_on_mps(monkeypatch):
+    require_metal_attn_ops()
+    monkeypatch.setenv("WORLD_METAL_IMPL", "fast")
+    monkeypatch.setenv("WORLD_METAL_FAST_NO_FALLBACK", "1")
+    q = torch.randn(1, 8, 16, 64, device="mps", dtype=torch.float16)
+    k = torch.randn(1, 2, 32, 64, device="mps", dtype=torch.float16)
+    v = torch.randn(1, 2, 32, 64, device="mps", dtype=torch.float16)
+    meta = AttnMeta(flex_block_mask=None, q_len=16, kv_len=32)
+    cfg = AttnConfig(causal=True, enable_gqa=True)
+    metal = world_flex_attn_forward(q, k, v, meta, cfg, backend=AttnBackend.METAL)
+    auto = world_flex_attn_forward(q, k, v, meta, cfg, backend=AttnBackend.AUTO)
+    assert torch.allclose(
+        metal.to("cpu", dtype=torch.float32), auto.to("cpu", dtype=torch.float32), atol=1e-4, rtol=1e-4
+    )
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA backend not available")
+def test_auto_backend_uses_pytorch_flex_on_cuda():
+    q = torch.randn(1, 4, 8, 32, device="cuda", dtype=torch.float16)
+    k = torch.randn(1, 4, 8, 32, device="cuda", dtype=torch.float16)
+    v = torch.randn(1, 4, 8, 32, device="cuda", dtype=torch.float16)
+    out = world_flex_attn_forward(
+        q, k, v, AttnMeta(flex_block_mask=None, q_len=8, kv_len=8), AttnConfig(causal=True), backend=AttnBackend.AUTO
+    )
+    ref = world_flex_attn_forward(
+        q, k, v, AttnMeta(flex_block_mask=None, q_len=8, kv_len=8), AttnConfig(causal=True), backend=AttnBackend.PYTORCH_FLEX
+    )
+    assert torch.allclose(out, ref, atol=2e-3, rtol=2e-3)
+
diff --git a/tests/test_attn_module_integration.py b/tests/test_attn_module_integration.py
new file mode 100644
index 0000000..f892c52
--- /dev/null
+++ b/tests/test_attn_module_integration.py
@@ -0,0 +1,113 @@
+import pytest
+import torch
+
+from kv_cache import LayerKVCache
+from attn_backend import AttnBackend, AttnConfig, AttnMeta, world_flex_attn_forward
+from metal_test_utils import require_metal_attn_ops
+
+
+pytestmark = pytest.mark.skipif(
+    not torch.backends.mps.is_available(),
+    reason="MPS backend not available on this system",
+)
+
+
+_require_metal_ops = require_metal_attn_ops
+
+
+def _pos_ids(frame_idx: int, B: int, T: int, device: str):
+    return {"f_pos": torch.full((B, T), frame_idx, device=device, dtype=torch.long)}
+
+
+@pytest.mark.parametrize("causal", [False, True])
+@pytest.mark.parametrize("gqa", [False, True])
+def test_kv_cache_to_backend_path_matches_ref(causal, gqa, monkeypatch):
+    _require_metal_ops()
+    monkeypatch.setenv("WORLD_METAL_FAST_NO_FALLBACK", "1")
+
+    B = 1
+    T = 8
+    Dh = 64
+    Hq = 8
+    Hkv = 2 if gqa else Hq
+    L_hist = 32
+    block_size = 4
+
+    q = torch.randn(B, Hq, T, Dh, device="mps", dtype=torch.float16)
+    kf = torch.randn(B, Hkv, T, Dh, device="mps", dtype=torch.float16)
+    vf = torch.randn(B, Hkv, T, Dh, device="mps", dtype=torch.float16)
+
+    cache = LayerKVCache(B, Hkv, L_hist, Dh, torch.float16, T).to("mps")
+    # Write one frame to establish rolling state.
+    _ = cache.upsert(torch.stack([kf, vf], dim=0), _pos_ids(0, B, T, "mps"), is_frozen=False)
+    # Read/update next frame.
+    k, v, _bm, block_written, active_blocks, bs = cache.upsert(
+        torch.stack([kf, vf], dim=0), _pos_ids(1, B, T, "mps"), is_frozen=False
+    )
+    if active_blocks is None:
+        active_blocks = torch.nonzero(block_written, as_tuple=False).flatten().to(torch.int32).contiguous()
+
+    # Direct block-written fast path.
+    out_fast_blocks = torch.ops.world.flex_attn_metal_fast_blocks(q, k, v, block_written, int(bs), causal)
+    out_fast_active = torch.ops.world.flex_attn_metal_fast_active(q, k, v, active_blocks, int(bs), causal)
+
+    # Dense-mask reference from block-written metadata.
+    dense = torch.zeros((k.size(2),), device="mps", dtype=torch.uint8)
+    for i in range(block_written.numel()):
+        if int(block_written[i].item()) != 0:
+            s = i * int(bs)
+            e = min(k.size(2), s + int(bs))
+            dense[s:e] = 1
+    dense_mask = dense.view(1, 1, 1, k.size(2)).expand(B, Hq, T, k.size(2)).contiguous()
+    out_ref = torch.ops.world.flex_attn_metal_ref(q, k, v, dense_mask, causal)
+
+    assert out_fast_blocks.shape == out_ref.shape
+    assert torch.allclose(
+        out_fast_blocks.to("cpu", dtype=torch.float32),
+        out_ref.to("cpu", dtype=torch.float32),
+        atol=3e-2,
+        rtol=3e-2,
+    )
+    assert torch.allclose(
+        out_fast_active.to("cpu", dtype=torch.float32),
+        out_ref.to("cpu", dtype=torch.float32),
+        atol=3e-2,
+        rtol=3e-2,
+    )
+
+
+def test_world_flex_attn_forward_prefers_active_metadata(monkeypatch):
+    _require_metal_ops()
+    monkeypatch.setenv("WORLD_METAL_IMPL", "fast")
+    monkeypatch.setenv("WORLD_METAL_FAST_NO_FALLBACK", "1")
+
+    B, Hq, Hkv, T, L, Dh = 1, 8, 2, 8, 24, 64
+    q = torch.randn(B, Hq, T, Dh, device="mps", dtype=torch.float16)
+    k = torch.randn(B, Hkv, L, Dh, device="mps", dtype=torch.float16)
+    v = torch.randn(B, Hkv, L, Dh, device="mps", dtype=torch.float16)
+
+    block_size = 4
+    kv_blocks = (L + block_size - 1) // block_size
+    # Deliberately provide conflicting metadata. Backend should prefer active_blocks.
+    block_written = torch.ones((kv_blocks,), device="mps", dtype=torch.uint8).contiguous()
+    active_blocks = torch.tensor([0, 2, 4], device="mps", dtype=torch.int32).contiguous()
+
+    meta = AttnMeta(
+        flex_block_mask=None,
+        q_len=T,
+        kv_len=L,
+        block_written=block_written,
+        active_blocks=active_blocks,
+        block_size=block_size,
+    )
+    cfg = AttnConfig(causal=True, enable_gqa=True)
+    out = world_flex_attn_forward(q, k, v, meta, cfg, backend=AttnBackend.METAL)
+
+    direct = torch.ops.world.flex_attn_metal_fast_active(q, k, v, active_blocks, block_size, True)
+    assert torch.allclose(
+        out.to("cpu", dtype=torch.float32),
+        direct.to("cpu", dtype=torch.float32),
+        atol=1e-4,
+        rtol=1e-4,
+    )
+
diff --git a/tests/test_kv_cache_active_blocks.py b/tests/test_kv_cache_active_blocks.py
new file mode 100644
index 0000000..f088ed4
--- /dev/null
+++ b/tests/test_kv_cache_active_blocks.py
@@ -0,0 +1,134 @@
+import types
+
+import pytest
+import torch
+
+from kv_cache import LayerKVCache, StaticKVCache
+
+
+def _pos_ids(frame_idx: int, b: int, t: int, device: str = "cpu"):
+    return {"f_pos": torch.full((b, t), frame_idx, device=device, dtype=torch.long)}
+
+
+def _new_layer_cache(*, l=16, tpf=4, pd=1):
+    # num_buckets = (L/tpf)/pd
+    return LayerKVCache(B=1, H=2, L=l, Dh=8, dtype=torch.float32, tokens_per_frame=tpf, pinned_dilation=pd)
+
+
+def test_layer_kv_cache_returns_active_blocks_on_metal_backend(monkeypatch):
+    monkeypatch.setenv("WORLD_ATTENTION_BACKEND", "metal")
+    monkeypatch.setenv("WORLD_METAL_BLOCK_SIZE", "4")
+    cache = _new_layer_cache(l=16, tpf=4, pd=1)
+    kv = torch.randn(2, 1, 2, 4, 8)
+
+    _k, _v, _bm, block_written, active_blocks, _bs = cache.upsert(
+        kv, _pos_ids(0, 1, 4), is_frozen=False, frame_idx_int=0
+    )
+    assert active_blocks is not None
+    expected = torch.nonzero(block_written, as_tuple=False).flatten().to(torch.int32)
+    assert torch.equal(active_blocks.cpu(), expected.cpu())
+
+
+def test_layer_kv_cache_saturated_path_avoids_nonzero_write_step(monkeypatch):
+    monkeypatch.setenv("WORLD_ATTENTION_BACKEND", "metal")
+    monkeypatch.setenv("WORLD_METAL_BLOCK_SIZE", "4")
+    cache = _new_layer_cache(l=16, tpf=4, pd=1)  # num_buckets=4
+    kv = torch.randn(2, 1, 2, 4, 8)
+
+    # Saturate ring by writing each slot once.
+    for frame_idx in range(4):
+        cache.upsert(kv, _pos_ids(frame_idx, 1, 4), is_frozen=False, frame_idx_int=frame_idx)
+    assert len(cache._seen_slots) == cache.num_buckets
+
+    # If this path still calls nonzero, test should fail.
+    monkeypatch.setattr(torch, "nonzero", lambda *args, **kwargs: (_ for _ in ()).throw(RuntimeError("nonzero called")))
+    _k, _v, _bm, _block_written, active_blocks, _bs = cache.upsert(
+        kv, _pos_ids(4, 1, 4), is_frozen=False, frame_idx_int=4
+    )
+    # frame_idx=4 -> slot 0 masked out for this call, so active excludes block 0.
+    assert torch.equal(active_blocks.cpu(), torch.tensor([1, 2, 3, 4], dtype=torch.int32))
+
+
+def test_layer_kv_cache_saturated_path_avoids_nonzero_non_write_step(monkeypatch):
+    monkeypatch.setenv("WORLD_ATTENTION_BACKEND", "metal")
+    monkeypatch.setenv("WORLD_METAL_BLOCK_SIZE", "4")
+    cache = _new_layer_cache(l=16, tpf=4, pd=2)  # num_buckets=2, write steps at 0,2,4,...
+    kv = torch.randn(2, 1, 2, 4, 8)
+
+    cache.upsert(kv, _pos_ids(0, 1, 4), is_frozen=False, frame_idx_int=0)
+    cache.upsert(kv, _pos_ids(2, 1, 4), is_frozen=False, frame_idx_int=2)
+    assert len(cache._seen_slots) == cache.num_buckets
+
+    monkeypatch.setattr(torch, "nonzero", lambda *args, **kwargs: (_ for _ in ()).throw(RuntimeError("nonzero called")))
+    _k, _v, _bm, _block_written, active_blocks, _bs = cache.upsert(
+        kv, _pos_ids(3, 1, 4), is_frozen=True, frame_idx_int=3
+    )
+    # For dilation=2, only the pinned ring subset plus tail are active.
+    assert torch.equal(active_blocks.cpu(), torch.tensor([0, 1, 4], dtype=torch.int32))
+
+
+def test_layer_kv_cache_unsaturated_regular_geometry_avoids_nonzero(monkeypatch):
+    monkeypatch.setenv("WORLD_ATTENTION_BACKEND", "metal")
+    monkeypatch.setenv("WORLD_METAL_BLOCK_SIZE", "4")
+    cache = _new_layer_cache(l=16, tpf=4, pd=1)
+    kv = torch.randn(2, 1, 2, 4, 8)
+
+    calls = {"n": 0}
+    real_nonzero = torch.nonzero
+
+    def _counting_nonzero(*args, **kwargs):
+        calls["n"] += 1
+        return real_nonzero(*args, **kwargs)
+
+    monkeypatch.setattr(torch, "nonzero", _counting_nonzero)
+    cache.upsert(kv, _pos_ids(0, 1, 4), is_frozen=False, frame_idx_int=0)
+    assert calls["n"] == 0
+
+
+def test_layer_kv_cache_irregular_geometry_falls_back_to_nonzero(monkeypatch):
+    monkeypatch.setenv("WORLD_ATTENTION_BACKEND", "metal")
+    monkeypatch.setenv("WORLD_METAL_BLOCK_SIZE", "4")
+    # tpf=5 is not divisible by block_size=4, so nonzero fallback is required.
+    cache = _new_layer_cache(l=20, tpf=5, pd=1)
+    kv = torch.randn(2, 1, 2, 5, 8)
+
+    calls = {"n": 0}
+    real_nonzero = torch.nonzero
+
+    def _counting_nonzero(*args, **kwargs):
+        calls["n"] += 1
+        return real_nonzero(*args, **kwargs)
+
+    monkeypatch.setattr(torch, "nonzero", _counting_nonzero)
+    cache.upsert(kv, _pos_ids(0, 1, 5), is_frozen=False, frame_idx_int=0)
+    assert calls["n"] >= 1
+
+
+def test_load_state_rebuilds_seen_slots(monkeypatch):
+    monkeypatch.setenv("WORLD_ATTENTION_BACKEND", "metal")
+    monkeypatch.setenv("WORLD_METAL_BLOCK_SIZE", "4")
+    cfg = types.SimpleNamespace(
+        height=2,
+        width=2,
+        local_window=4,
+        global_window=4,
+        global_attn_period=2,
+        global_attn_offset=0,
+        global_pinned_dilation=2,
+        n_layers=2,
+        n_kv_heads=2,
+        n_heads=2,
+        d_model=16,
+    )
+    cache = StaticKVCache(cfg, batch_size=1, dtype=torch.float32)
+    kv = torch.randn(1, 2, 4, 8)
+    pos = _pos_ids(0, 1, 4)
+    cache.set_frozen(False)
+    cache.upsert(kv, kv, pos, layer=0)
+
+    state = cache.get_state()
+    clone = StaticKVCache(cfg, batch_size=1, dtype=torch.float32)
+    clone.load_state(state)
+    assert len(clone.layers[0]._seen_slots) > 0
+    assert clone.layers[0]._metal_bs_cache == 0
+
diff --git a/tests/test_metal_attn_numeric.py b/tests/test_metal_attn_numeric.py
new file mode 100644
index 0000000..3707da7
--- /dev/null
+++ b/tests/test_metal_attn_numeric.py
@@ -0,0 +1,528 @@
+import math
+import random
+
+import pytest
+import torch
+
+from attn_backend import (
+    AttnBackend,
+    AttnConfig,
+    AttnMeta,
+    world_flex_attn_forward,
+)
+from metal_test_utils import require_metal_attn_ops
+
+
+pytestmark = pytest.mark.skipif(
+    not torch.backends.mps.is_available(),
+    reason="MPS backend not available on this system",
+)
+
+
+def _rand_attn_tensors(B: int, H: int, T: int, L: int, Dh: int, dtype: torch.dtype):
+    q = torch.randn(B, H, T, Dh, device="mps", dtype=dtype)
+    k = torch.randn(B, H, L, Dh, device="mps", dtype=dtype)
+    v = torch.randn(B, H, L, Dh, device="mps", dtype=dtype)
+    return q, k, v
+
+
+_require_metal_op = require_metal_attn_ops
+
+
+def _reference_attention(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    causal: bool,
+    mask: torch.Tensor | None = None,
+) -> torch.Tensor:
+    # Explicit SDPA reference that does not depend on flex_attention.
+    qf = q.to(torch.float32)
+    kf = k.to(torch.float32)
+    vf = v.to(torch.float32)
+
+    if qf.size(1) != kf.size(1):
+        if qf.size(1) < kf.size(1) or (qf.size(1) % kf.size(1)) != 0:
+            raise RuntimeError("GQA requires q_heads divisible by kv_heads")
+        group_size = qf.size(1) // kf.size(1)
+        head_idx = torch.arange(qf.size(1), device=q.device, dtype=torch.long) // group_size
+        kf = kf.index_select(1, head_idx)
+        vf = vf.index_select(1, head_idx)
+
+    scores = torch.matmul(qf, kf.transpose(-2, -1)) / math.sqrt(q.size(-1))
+
+    if mask is not None:
+        scores = scores.masked_fill(mask == 0, float("-inf"))
+    if causal:
+        t = q.size(-2)
+        l = k.size(-2)
+        causal_mask = torch.triu(
+            torch.ones((t, l), device=q.device, dtype=torch.bool),
+            diagonal=1,
+        )
+        scores = scores.masked_fill(causal_mask[None, None], float("-inf"))
+
+    # If a row is fully masked, define output as zero (to match kernel behavior).
+    finite_row = torch.isfinite(scores).any(dim=-1, keepdim=True)
+    safe_scores = torch.where(finite_row, scores, torch.zeros_like(scores))
+    probs = torch.softmax(safe_scores, dim=-1)
+    probs = torch.where(finite_row, probs, torch.zeros_like(probs))
+    out = torch.matmul(probs, vf)
+    return out.to(q.dtype)
+
+
+def _dense_mask_from_block_written(
+    block_written: torch.Tensor,
+    t: int,
+    l: int,
+    block_size: int,
+    device: torch.device,
+) -> torch.Tensor:
+    """
+    Convert a 1D block-written mask [KV_blocks] into dense [1,1,T,L] uint8 mask.
+    This mirrors Andrew's guidance: kernel consumes frame length, total length,
+    and block-wise written state.
+    """
+    dense = torch.zeros((l,), device=device, dtype=torch.uint8)
+    for bidx, is_written in enumerate(block_written.tolist()):
+        if is_written:
+            s = bidx * block_size
+            e = min(l, s + block_size)
+            dense[s:e] = 1
+    return dense.view(1, 1, 1, l).expand(1, 1, t, l).contiguous()
+
+
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+def test_metal_vs_reference_small_random(dtype):
+    _require_metal_op()
+
+    B, H, T, L, Dh = 1, 2, 8, 8, 64
+    q, k, v = _rand_attn_tensors(B, H, T, L, Dh, dtype)
+
+    ref_out = _reference_attention(q, k, v, causal=False, mask=None)
+    metal_out = world_flex_attn_forward(
+        q,
+        k,
+        v,
+        AttnMeta(flex_block_mask=None, q_len=T, kv_len=L),
+        AttnConfig(causal=False, enable_gqa=False),
+        backend=AttnBackend.METAL,
+    )
+
+    ref_cpu = ref_out.to("cpu", dtype=torch.float32)
+    metal_cpu = metal_out.to("cpu", dtype=torch.float32)
+
+    max_abs_diff = (ref_cpu - metal_cpu).abs().max().item()
+    mean_abs_diff = (ref_cpu - metal_cpu).abs().mean().item()
+
+    assert max_abs_diff < 2e-1
+    assert mean_abs_diff < 2e-2
+
+
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+def test_metal_vs_reference_small_random_causal(dtype):
+    _require_metal_op()
+
+    B, H, T, L, Dh = 1, 2, 8, 8, 64
+    q, k, v = _rand_attn_tensors(B, H, T, L, Dh, dtype)
+
+    ref_out = _reference_attention(q, k, v, causal=True, mask=None)
+    metal_out = world_flex_attn_forward(
+        q,
+        k,
+        v,
+        AttnMeta(flex_block_mask=None, q_len=T, kv_len=L),
+        AttnConfig(causal=True, enable_gqa=False),
+        backend=AttnBackend.METAL,
+    )
+
+    ref_cpu = ref_out.to("cpu", dtype=torch.float32)
+    metal_cpu = metal_out.to("cpu", dtype=torch.float32)
+
+    max_abs_diff = (ref_cpu - metal_cpu).abs().max().item()
+    mean_abs_diff = (ref_cpu - metal_cpu).abs().mean().item()
+
+    assert max_abs_diff < 2e-1
+    assert mean_abs_diff < 2e-2
+
+
+@pytest.mark.parametrize("dtype", [torch.float16])
+def test_metal_mask_all_ones_and_all_zeros(dtype):
+    _require_metal_op()
+
+    B, H, T, L, Dh = 1, 2, 8, 8, 64
+    q, k, v = _rand_attn_tensors(B, H, T, L, Dh, dtype)
+
+    ones = torch.ones((B, H, T, L), device="mps", dtype=torch.uint8).contiguous()
+    zeros = torch.zeros((B, H, T, L), device="mps", dtype=torch.uint8).contiguous()
+
+    out_no_mask = torch.ops.world.flex_attn_metal(q, k, v, None, False)
+    out_ones = torch.ops.world.flex_attn_metal(q, k, v, ones, False)
+    out_zeros = torch.ops.world.flex_attn_metal(q, k, v, zeros, False)
+
+    no_mask_cpu = out_no_mask.to("cpu", dtype=torch.float32)
+    ones_cpu = out_ones.to("cpu", dtype=torch.float32)
+    zeros_cpu = out_zeros.to("cpu", dtype=torch.float32)
+    ref_zero = _reference_attention(q, k, v, causal=False, mask=zeros).to("cpu", dtype=torch.float32)
+
+    assert torch.allclose(no_mask_cpu, ones_cpu, rtol=1e-2, atol=1e-2)
+    assert torch.allclose(zeros_cpu, torch.zeros_like(zeros_cpu), rtol=0.0, atol=1e-6)
+    assert torch.allclose(zeros_cpu, ref_zero, rtol=0.0, atol=1e-6)
+
+
+@pytest.mark.parametrize("mode", ["ref", "fast"])
+@pytest.mark.parametrize("causal", [False, True])
+@pytest.mark.parametrize(
+    "shape",
+    [
+        (1, 8, 2, 16, 16, 64),
+        (1, 12, 3, 12, 16, 64),
+        (1, 16, 4, 32, 32, 64),
+    ],
+)
+def test_gqa_metal_impl_matches_reference(shape, causal, mode, monkeypatch):
+    _require_metal_op()
+    B, Hq, Hkv, T, L, Dh = shape
+    q = torch.randn(B, Hq, T, Dh, device="mps", dtype=torch.float16)
+    k = torch.randn(B, Hkv, L, Dh, device="mps", dtype=torch.float16)
+    v = torch.randn(B, Hkv, L, Dh, device="mps", dtype=torch.float16)
+    dense_mask = torch.ones((B, Hq, T, L), device="mps", dtype=torch.uint8).contiguous()
+
+    ref = _reference_attention(q, k, v, causal=causal, mask=dense_mask)
+    monkeypatch.setenv("WORLD_METAL_BLOCK_SIZE", "4")
+    if mode == "fast":
+        monkeypatch.setenv("WORLD_METAL_FAST_NO_FALLBACK", "1")
+    out = torch.ops.world.flex_attn_metal_ref(q, k, v, dense_mask, causal) if mode == "ref" else torch.ops.world.flex_attn_metal_fast(q, k, v, dense_mask, causal)
+
+    assert torch.allclose(
+        out.to("cpu", dtype=torch.float32),
+        ref.to("cpu", dtype=torch.float32),
+        rtol=3e-2,
+        atol=3e-2,
+    )
+
+
+@pytest.mark.parametrize("mode", ["ref", "fast"])
+def test_world_flex_attn_forward_gqa_executes(mode, monkeypatch):
+    _require_metal_op()
+    B, Hq, Hkv, T, L, Dh = 1, 8, 2, 8, 8, 64
+    q = torch.randn(B, Hq, T, Dh, device="mps", dtype=torch.float16)
+    k = torch.randn(B, Hkv, L, Dh, device="mps", dtype=torch.float16)
+    v = torch.randn(B, Hkv, L, Dh, device="mps", dtype=torch.float16)
+    meta = AttnMeta(flex_block_mask=None, q_len=T, kv_len=L)
+    cfg = AttnConfig(causal=True, enable_gqa=True)
+
+    monkeypatch.setenv("WORLD_METAL_IMPL", mode)
+    monkeypatch.setenv("WORLD_METAL_BLOCK_SIZE", "4")
+    if mode == "fast":
+        monkeypatch.setenv("WORLD_METAL_FAST_NO_FALLBACK", "1")
+    out = world_flex_attn_forward(q, k, v, meta, cfg, backend=AttnBackend.METAL)
+    assert out.shape == q.shape
+
+
+@pytest.mark.parametrize("dtype", [torch.float16])
+def test_ref_and_fast_op_shapes_and_parity(dtype):
+    _require_metal_op()
+
+    B, H, T, L, Dh = 1, 2, 8, 8, 64
+    q, k, v = _rand_attn_tensors(B, H, T, L, Dh, dtype)
+
+    out_ref = torch.ops.world.flex_attn_metal_ref(q, k, v, None, True)
+    out_fast = torch.ops.world.flex_attn_metal_fast(q, k, v, None, True)
+
+    assert out_ref.shape == q.shape
+    assert out_fast.shape == q.shape
+    assert out_ref.dtype == q.dtype
+    assert out_fast.dtype == q.dtype
+    assert torch.allclose(
+        out_ref.to("cpu", dtype=torch.float32),
+        out_fast.to("cpu", dtype=torch.float32),
+        rtol=2e-3,
+        atol=2e-3,
+    )
+
+
+@pytest.mark.parametrize("mode", ["ref", "fast"])
+def test_world_flex_attn_forward_selects_metal_impl(mode, monkeypatch):
+    _require_metal_op()
+
+    B, H, T, L, Dh = 1, 2, 8, 8, 64
+    q, k, v = _rand_attn_tensors(B, H, T, L, Dh, torch.float16)
+    meta = AttnMeta(flex_block_mask=None, q_len=T, kv_len=L)
+    cfg = AttnConfig(causal=True, enable_gqa=False)
+
+    monkeypatch.setenv("WORLD_METAL_IMPL", mode)
+    out = world_flex_attn_forward(q, k, v, meta, cfg, backend=AttnBackend.METAL)
+    assert out.shape == q.shape
+
+
+@pytest.mark.parametrize(
+    "shape",
+    [
+        (1, 2, 8, 8, 32),
+        (1, 4, 12, 16, 64),
+        (1, 8, 16, 16, 64),
+    ],
+)
+@pytest.mark.parametrize("causal", [False, True])
+@pytest.mark.parametrize("mode", ["ref", "fast"])
+def test_metal_impl_matches_reference_with_block_mask(shape, causal, mode, monkeypatch):
+    _require_metal_op()
+    B, H, T, L, Dh = shape
+    q, k, v = _rand_attn_tensors(B, H, T, L, Dh, torch.float16)
+
+    block_size = 4
+    kv_blocks = (L + block_size - 1) // block_size
+    # Fixed rolling-cache style block occupancy pattern.
+    block_written = torch.tensor(
+        [(i % 3) != 0 for i in range(kv_blocks)],
+        device=q.device,
+        dtype=torch.bool,
+    )
+    dense_mask = _dense_mask_from_block_written(block_written, T, L, block_size, q.device)
+    dense_mask = dense_mask.expand(B, H, T, L).contiguous()
+
+    ref = _reference_attention(q, k, v, causal=causal, mask=dense_mask)
+
+    monkeypatch.setenv("WORLD_METAL_IMPL", mode)
+    monkeypatch.setenv("WORLD_METAL_BLOCK_SIZE", str(block_size))
+    if mode == "fast":
+        monkeypatch.setenv("WORLD_METAL_FAST_NO_FALLBACK", "1")
+    out = torch.ops.world.flex_attn_metal_ref(q, k, v, dense_mask, causal) if mode == "ref" else torch.ops.world.flex_attn_metal_fast(q, k, v, dense_mask, causal)
+
+    assert torch.allclose(
+        out.to("cpu", dtype=torch.float32),
+        ref.to("cpu", dtype=torch.float32),
+        rtol=2e-2,
+        atol=2e-2,
+    )
+
+
+@pytest.mark.parametrize("seed", list(range(20)))
+def test_metal_fast_strict_fuzz_block_mask_gqa(seed, monkeypatch):
+    """
+    Adversarial fuzz test:
+    - odd T/L lengths
+    - variable block sizes (including non-divisors of L)
+    - mixed GQA factors
+    - random block-written sparsity patterns
+    - random causal mode
+    """
+    _require_metal_op()
+    random.seed(seed)
+    torch.manual_seed(seed)
+
+    B = 1
+    T = random.choice([1, 3, 7, 11, 15, 23, 31])
+    L = random.choice([1, 5, 9, 13, 17, 29, 33, 47])
+    Dh = random.choice([32, 64])
+    Hkv = random.choice([1, 2, 4])
+    gqa_group = random.choice([1, 2, 4])
+    Hq = Hkv * gqa_group
+    causal = bool(random.getrandbits(1))
+    block_size = random.choice([1, 2, 3, 4, 5, 7, 8])
+
+    q = torch.randn(B, Hq, T, Dh, device="mps", dtype=torch.float16)
+    k = torch.randn(B, Hkv, L, Dh, device="mps", dtype=torch.float16)
+    v = torch.randn(B, Hkv, L, Dh, device="mps", dtype=torch.float16)
+
+    kv_blocks = (L + block_size - 1) // block_size
+    # Include very sparse and very dense block occupancy cases.
+    p = random.choice([0.15, 0.35, 0.5, 0.8, 1.0])
+    block_written = (torch.rand(kv_blocks, device=q.device) < p)
+    # Keep at least one available block to avoid all-zero trivial outputs every time.
+    if not bool(block_written.any()):
+        block_written[random.randrange(kv_blocks)] = True
+
+    dense_mask = _dense_mask_from_block_written(block_written, T, L, block_size, q.device)
+    dense_mask = dense_mask.expand(B, Hq, T, L).contiguous()
+
+    monkeypatch.setenv("WORLD_METAL_BLOCK_SIZE", str(block_size))
+    monkeypatch.setenv("WORLD_METAL_FAST_NO_FALLBACK", "1")
+
+    out_fast = torch.ops.world.flex_attn_metal_fast(q, k, v, dense_mask, causal)
+    ref = _reference_attention(q, k, v, causal=causal, mask=dense_mask)
+
+    diff = (out_fast.to("cpu", dtype=torch.float32) - ref.to("cpu", dtype=torch.float32)).abs()
+    assert diff.max().item() < 4e-2
+    assert diff.mean().item() < 5e-3
+    assert torch.isfinite(out_fast).all().item()
+
+
+def test_metal_fast_strict_full_mask_rows_gqa(monkeypatch):
+    """
+    Hard edge case where all KV blocks are masked out. Output should be zeros
+    (after safe softmax handling), even for GQA.
+    """
+    _require_metal_op()
+    B, Hq, Hkv, T, L, Dh = 1, 8, 2, 19, 37, 64
+    q = torch.randn(B, Hq, T, Dh, device="mps", dtype=torch.float16)
+    k = torch.randn(B, Hkv, L, Dh, device="mps", dtype=torch.float16)
+    v = torch.randn(B, Hkv, L, Dh, device="mps", dtype=torch.float16)
+
+    block_size = 4
+    block_written = torch.zeros((L + block_size - 1) // block_size, device=q.device, dtype=torch.bool)
+    dense_mask = _dense_mask_from_block_written(block_written, T, L, block_size, q.device)
+    dense_mask = dense_mask.expand(B, Hq, T, L).contiguous()
+
+    monkeypatch.setenv("WORLD_METAL_BLOCK_SIZE", str(block_size))
+    monkeypatch.setenv("WORLD_METAL_FAST_NO_FALLBACK", "1")
+    out = torch.ops.world.flex_attn_metal_fast(q, k, v, dense_mask, True)
+    assert torch.allclose(out.to("cpu", dtype=torch.float32), torch.zeros_like(out.to("cpu", dtype=torch.float32)), atol=1e-6, rtol=0.0)
+
+
+@pytest.mark.parametrize("causal", [False, True])
+@pytest.mark.parametrize(
+    "shape",
+    [
+        (1, 8, 2, 13, 29, 64),
+        (1, 16, 4, 17, 33, 64),
+    ],
+)
+def test_fast_blocks_op_matches_reference(shape, causal, monkeypatch):
+    _require_metal_op()
+    monkeypatch.setenv("WORLD_METAL_FAST_NO_FALLBACK", "1")
+
+    B, Hq, Hkv, T, L, Dh = shape
+    q = torch.randn(B, Hq, T, Dh, device="mps", dtype=torch.float16)
+    k = torch.randn(B, Hkv, L, Dh, device="mps", dtype=torch.float16)
+    v = torch.randn(B, Hkv, L, Dh, device="mps", dtype=torch.float16)
+
+    block_size = 4
+    kv_blocks = (L + block_size - 1) // block_size
+    block_written = torch.tensor([(i % 2) == 0 for i in range(kv_blocks)], device=q.device, dtype=torch.uint8).contiguous()
+    dense_mask = _dense_mask_from_block_written(block_written.bool(), T, L, block_size, q.device).expand(B, Hq, T, L).contiguous()
+
+    out = torch.ops.world.flex_attn_metal_fast_blocks(q, k, v, block_written, block_size, causal)
+    ref = _reference_attention(q, k, v, causal=causal, mask=dense_mask)
+    assert torch.allclose(
+        out.to("cpu", dtype=torch.float32),
+        ref.to("cpu", dtype=torch.float32),
+        atol=3e-2,
+        rtol=3e-2,
+    )
+
+
+def test_world_flex_attn_forward_uses_block_metadata_path(monkeypatch):
+    _require_metal_op()
+    monkeypatch.setenv("WORLD_METAL_IMPL", "fast")
+    monkeypatch.setenv("WORLD_METAL_FAST_NO_FALLBACK", "1")
+
+    B, Hq, Hkv, T, L, Dh = 1, 8, 2, 11, 23, 64
+    q = torch.randn(B, Hq, T, Dh, device="mps", dtype=torch.float16)
+    k = torch.randn(B, Hkv, L, Dh, device="mps", dtype=torch.float16)
+    v = torch.randn(B, Hkv, L, Dh, device="mps", dtype=torch.float16)
+    block_size = 4
+    kv_blocks = (L + block_size - 1) // block_size
+    block_written = torch.tensor([(i % 3) != 0 for i in range(kv_blocks)], device=q.device, dtype=torch.uint8).contiguous()
+
+    meta = AttnMeta(
+        flex_block_mask=None,
+        q_len=T,
+        kv_len=L,
+        block_written=block_written,
+        block_size=block_size,
+    )
+    cfg = AttnConfig(causal=True, enable_gqa=True)
+    out = world_flex_attn_forward(q, k, v, meta, cfg, backend=AttnBackend.METAL)
+    dense_mask = _dense_mask_from_block_written(block_written.bool(), T, L, block_size, q.device).expand(B, Hq, T, L).contiguous()
+    ref = _reference_attention(q, k, v, causal=True, mask=dense_mask)
+    assert torch.allclose(
+        out.to("cpu", dtype=torch.float32),
+        ref.to("cpu", dtype=torch.float32),
+        atol=3e-2,
+        rtol=3e-2,
+    )
+
+
+def test_metal_fast_rejects_non_shared_mask(monkeypatch):
+    _require_metal_op()
+    monkeypatch.setenv("WORLD_METAL_FAST_NO_FALLBACK", "1")
+    monkeypatch.setenv("WORLD_METAL_BLOCK_SIZE", "4")
+
+    B, H, T, L, Dh = 1, 4, 8, 16, 64
+    q, k, v = _rand_attn_tensors(B, H, T, L, Dh, torch.float16)
+    mask = torch.ones((B, H, T, L), device="mps", dtype=torch.uint8).contiguous()
+    # Violate shared-mask contract by changing one query row.
+    mask[0, 1, 3, 5] = 0
+
+    with pytest.raises(RuntimeError, match="shared mask"):
+        _ = torch.ops.world.flex_attn_metal_fast(q, k, v, mask, True)
+
+
+def test_metal_fast_rejects_non_blockwise_mask(monkeypatch):
+    _require_metal_op()
+    monkeypatch.setenv("WORLD_METAL_FAST_NO_FALLBACK", "1")
+    monkeypatch.setenv("WORLD_METAL_BLOCK_SIZE", "4")
+
+    B, H, T, L, Dh = 1, 4, 8, 16, 64
+    q, k, v = _rand_attn_tensors(B, H, T, L, Dh, torch.float16)
+    mask = torch.ones((B, H, T, L), device="mps", dtype=torch.uint8).contiguous()
+    # Within a block [4,5,6,7], make token-level values differ.
+    mask[..., 5] = 0
+
+    with pytest.raises(RuntimeError, match="block-wise mask values"):
+        _ = torch.ops.world.flex_attn_metal_fast(q, k, v, mask, False)
+
+
+def test_metal_fast_batch2_shared_mask_matches_reference(monkeypatch):
+    _require_metal_op()
+    monkeypatch.setenv("WORLD_METAL_FAST_NO_FALLBACK", "1")
+    monkeypatch.setenv("WORLD_METAL_BLOCK_SIZE", "4")
+
+    B, Hq, Hkv, T, L, Dh = 2, 8, 2, 11, 23, 64
+    q = torch.randn(B, Hq, T, Dh, device="mps", dtype=torch.float16)
+    k = torch.randn(B, Hkv, L, Dh, device="mps", dtype=torch.float16)
+    v = torch.randn(B, Hkv, L, Dh, device="mps", dtype=torch.float16)
+
+    kv_blocks = (L + 4 - 1) // 4
+    block_written = torch.tensor([(i % 2) == 0 for i in range(kv_blocks)], device=q.device, dtype=torch.bool)
+    base_mask = _dense_mask_from_block_written(block_written, T, L, 4, q.device)
+    dense_mask = base_mask.expand(B, Hq, T, L).contiguous()
+
+    out_fast = torch.ops.world.flex_attn_metal_fast(q, k, v, dense_mask, True)
+    ref = _reference_attention(q, k, v, causal=True, mask=dense_mask)
+    assert torch.allclose(
+        out_fast.to("cpu", dtype=torch.float32),
+        ref.to("cpu", dtype=torch.float32),
+        atol=3e-2,
+        rtol=3e-2,
+    )
+
+
+@pytest.mark.parametrize("seed", list(range(40)))
+def test_metal_fast_active_strict_fuzz_matches_reference(seed, monkeypatch):
+    _require_metal_op()
+    random.seed(seed)
+    torch.manual_seed(seed)
+    monkeypatch.setenv("WORLD_METAL_FAST_NO_FALLBACK", "1")
+
+    B = random.choice([1, 2])
+    Hkv = random.choice([1, 2, 4])
+    gqa_group = random.choice([1, 2, 4])
+    Hq = Hkv * gqa_group
+    T = random.choice([1, 7, 15, 31, 63, 95])
+    L = random.choice([5, 17, 37, 65, 129, 257])
+    Dh = random.choice([32, 64])
+    causal = bool(random.getrandbits(1))
+    block_size = random.choice([1, 2, 4, 8])
+
+    q = torch.randn(B, Hq, T, Dh, device="mps", dtype=torch.float16)
+    k = torch.randn(B, Hkv, L, Dh, device="mps", dtype=torch.float16)
+    v = torch.randn(B, Hkv, L, Dh, device="mps", dtype=torch.float16)
+
+    kv_blocks = (L + block_size - 1) // block_size
+    p = random.choice([0.0, 0.1, 0.25, 0.5, 0.8, 1.0])
+    block_written = (torch.rand(kv_blocks, device=q.device) < p).to(torch.uint8).contiguous()
+    active_blocks = torch.nonzero(block_written, as_tuple=False).flatten().to(torch.int32).contiguous()
+
+    out = torch.ops.world.flex_attn_metal_fast_active(q, k, v, active_blocks, block_size, causal)
+    dense_mask = _dense_mask_from_block_written(block_written.bool(), T, L, block_size, q.device)
+    dense_mask = dense_mask.expand(B, Hq, T, L).contiguous()
+    ref = _reference_attention(q, k, v, causal=causal, mask=dense_mask)
+
+    diff = (out.to("cpu", dtype=torch.float32) - ref.to("cpu", dtype=torch.float32)).abs()
+    assert diff.max().item() < 5e-2
+    assert diff.mean().item() < 8e-3
+    assert torch.isfinite(out).all().item()
+
diff --git a/tests/test_metal_attn_perf.py b/tests/test_metal_attn_perf.py
new file mode 100644
index 0000000..243ea2b
--- /dev/null
+++ b/tests/test_metal_attn_perf.py
@@ -0,0 +1,325 @@
+import time
+
+import pytest
+import torch
+
+from attn_backend import (
+    AttnBackend,
+    AttnConfig,
+    AttnMeta,
+    world_flex_attn_forward,
+)
+from metal_test_utils import require_metal_attn_ops, timed_ms_sync
+
+
+pytestmark = pytest.mark.skipif(
+    not torch.backends.mps.is_available(),
+    reason="MPS backend not available on this system",
+)
+
+
+def _rand_attn_tensors(B: int, H: int, T: int, L: int, Dh: int, dtype: torch.dtype):
+    q = torch.randn(B, H, T, Dh, device="mps", dtype=dtype)
+    k = torch.randn(B, H, L, Dh, device="mps", dtype=dtype)
+    v = torch.randn(B, H, L, Dh, device="mps", dtype=dtype)
+    return q, k, v
+
+
+def _rand_gqa_tensors(B: int, Hq: int, Hkv: int, T: int, L: int, Dh: int, dtype: torch.dtype):
+    q = torch.randn(B, Hq, T, Dh, device="mps", dtype=dtype)
+    k = torch.randn(B, Hkv, L, Dh, device="mps", dtype=dtype)
+    v = torch.randn(B, Hkv, L, Dh, device="mps", dtype=dtype)
+    return q, k, v
+
+
+_require_metal_ops = require_metal_attn_ops
+_timed_ms_sync = timed_ms_sync
+
+
+@pytest.mark.parametrize("dtype", [torch.float16])
+def test_metal_backend_runs_and_is_stable(dtype):
+    _require_metal_ops()
+
+    B, H, T, L, Dh = 1, 8, 256, 256, 64
+    q, k, v = _rand_attn_tensors(B, H, T, L, Dh, dtype)
+
+    meta = AttnMeta(flex_block_mask=None, q_len=T, kv_len=L)
+    cfg = AttnConfig(causal=True, enable_gqa=False)
+
+    # Warmup
+    for _ in range(3):
+        _ = world_flex_attn_forward(q, k, v, meta, cfg, backend=AttnBackend.METAL)
+
+    iters = 5
+    start = time.perf_counter()
+    for _ in range(iters):
+        _ = world_flex_attn_forward(q, k, v, meta, cfg, backend=AttnBackend.METAL)
+    elapsed = time.perf_counter() - start
+
+    # This test intentionally only asserts that the kernel runs in a reasonable
+    # amount of time; tighter perf targets can be added once the kernel body
+    # is implemented and tuned.
+    avg_ms = (elapsed / iters) * 1000.0
+    assert avg_ms < 1000.0
+
+
+@pytest.mark.parametrize(
+    "shape",
+    [
+        (1, 2, 64, 64, 64),
+        (1, 4, 128, 128, 64),
+        (1, 8, 256, 256, 64),
+    ],
+)
+@pytest.mark.parametrize("mode", ["ref", "fast"])
+def test_metal_impl_modes_perf_sanity(shape, mode, monkeypatch):
+    _require_metal_ops()
+    monkeypatch.setenv("WORLD_METAL_BLOCK_SIZE", "4")
+    monkeypatch.setenv("WORLD_METAL_FAST_NO_FALLBACK", "1")
+
+    B, H, T, L, Dh = shape
+    q, k, v = _rand_attn_tensors(B, H, T, L, Dh, torch.float16)
+    meta = AttnMeta(flex_block_mask=None, q_len=T, kv_len=L)
+    cfg = AttnConfig(causal=True, enable_gqa=False)
+
+    fn = torch.ops.world.flex_attn_metal_ref if mode == "ref" else torch.ops.world.flex_attn_metal_fast
+    mask = torch.ones((B, H, T, L), device="mps", dtype=torch.uint8).contiguous()
+
+    for _ in range(2):
+        _ = fn(q, k, v, mask, cfg.causal)
+
+    iters = 3
+    start = time.perf_counter()
+    for _ in range(iters):
+        out = fn(q, k, v, mask, cfg.causal)
+    elapsed_ms = (time.perf_counter() - start) * 1000.0 / iters
+
+    assert out.shape == q.shape
+    assert out.dtype == q.dtype
+    assert elapsed_ms > 0.0
+    assert elapsed_ms < 2000.0
+
+
+def test_metal_fast_strict_path_executes(monkeypatch):
+    _require_metal_ops()
+    monkeypatch.setenv("WORLD_METAL_FAST_NO_FALLBACK", "1")
+    monkeypatch.setenv("WORLD_METAL_BLOCK_SIZE", "4")
+
+    B, H, T, L, Dh = 1, 2, 32, 32, 64
+    q, k, v = _rand_attn_tensors(B, H, T, L, Dh, torch.float16)
+    mask = torch.ones((B, H, T, L), device="mps", dtype=torch.uint8).contiguous()
+    out = torch.ops.world.flex_attn_metal_fast(q, k, v, mask, True)
+    assert out.shape == q.shape
+
+
+@pytest.mark.parametrize("mode", ["ref", "fast"])
+@pytest.mark.parametrize(
+    "shape",
+    [
+        (1, 8, 2, 128, 128, 64),
+        (1, 16, 4, 192, 192, 64),
+    ],
+)
+def test_metal_gqa_modes_perf_sanity(shape, mode, monkeypatch):
+    _require_metal_ops()
+    monkeypatch.setenv("WORLD_METAL_BLOCK_SIZE", "4")
+    monkeypatch.setenv("WORLD_METAL_FAST_NO_FALLBACK", "1")
+
+    B, Hq, Hkv, T, L, Dh = shape
+    q, k, v = _rand_gqa_tensors(B, Hq, Hkv, T, L, Dh, torch.float16)
+    mask = torch.ones((B, Hq, T, L), device="mps", dtype=torch.uint8).contiguous()
+    fn = torch.ops.world.flex_attn_metal_ref if mode == "ref" else torch.ops.world.flex_attn_metal_fast
+
+    for _ in range(2):
+        _ = fn(q, k, v, mask, True)
+
+    iters = 3
+    start = time.perf_counter()
+    for _ in range(iters):
+        out = fn(q, k, v, mask, True)
+    elapsed_ms = (time.perf_counter() - start) * 1000.0 / iters
+
+    assert out.shape == q.shape
+    assert elapsed_ms > 0.0
+    assert elapsed_ms < 3000.0
+
+
+@pytest.mark.parametrize("causal", [False, True])
+def test_metal_fast_long_context_stress(causal, monkeypatch):
+    _require_metal_ops()
+    monkeypatch.setenv("WORLD_METAL_BLOCK_SIZE", "4")
+    monkeypatch.setenv("WORLD_METAL_FAST_NO_FALLBACK", "1")
+
+    B, Hq, Hkv, T, L, Dh = 1, 16, 4, 256, 768, 64
+    q, k, v = _rand_gqa_tensors(B, Hq, Hkv, T, L, Dh, torch.float16)
+    mask = torch.ones((B, Hq, T, L), device="mps", dtype=torch.uint8).contiguous()
+
+    # Warmup
+    for _ in range(2):
+        _ = torch.ops.world.flex_attn_metal_fast(q, k, v, mask, causal)
+
+    iters = 4
+    start = time.perf_counter()
+    out = None
+    for _ in range(iters):
+        out = torch.ops.world.flex_attn_metal_fast(q, k, v, mask, causal)
+    avg_ms = (time.perf_counter() - start) * 1000.0 / iters
+
+    assert out is not None
+    assert out.shape == q.shape
+    assert torch.isfinite(out).all().item()
+    # Generous ceiling for CI variance while still guarding hangs/regressions.
+    assert avg_ms < 5000.0
+
+
+def test_metal_fast_vs_ref_perf_ratio_gqa(monkeypatch):
+    _require_metal_ops()
+    monkeypatch.setenv("WORLD_METAL_BLOCK_SIZE", "4")
+    monkeypatch.setenv("WORLD_METAL_FAST_NO_FALLBACK", "1")
+
+    B, Hq, Hkv, T, L, Dh = 1, 16, 4, 192, 384, 64
+    q, k, v = _rand_gqa_tensors(B, Hq, Hkv, T, L, Dh, torch.float16)
+    mask = torch.ones((B, Hq, T, L), device="mps", dtype=torch.uint8).contiguous()
+
+    for _ in range(2):
+        _ = torch.ops.world.flex_attn_metal_ref(q, k, v, mask, True)
+        _ = torch.ops.world.flex_attn_metal_fast(q, k, v, mask, True)
+
+    iters = 3
+    start = time.perf_counter()
+    for _ in range(iters):
+        _ = torch.ops.world.flex_attn_metal_ref(q, k, v, mask, True)
+    ref_ms = (time.perf_counter() - start) * 1000.0 / iters
+
+    start = time.perf_counter()
+    for _ in range(iters):
+        _ = torch.ops.world.flex_attn_metal_fast(q, k, v, mask, True)
+    fast_ms = (time.perf_counter() - start) * 1000.0 / iters
+
+    assert ref_ms > 0.0 and fast_ms > 0.0
+    # Guard against extreme regressions while allowing room during early
+    # kernel bring-up (current fast path is correctness-oriented, not tuned).
+    assert fast_ms / ref_ms < 200.0
+    assert fast_ms < 500.0
+
+
+def test_metal_fast_blocks_perf_sanity(monkeypatch):
+    _require_metal_ops()
+    monkeypatch.setenv("WORLD_METAL_FAST_NO_FALLBACK", "1")
+
+    B, Hq, Hkv, T, L, Dh = 1, 16, 4, 160, 320, 64
+    q, k, v = _rand_gqa_tensors(B, Hq, Hkv, T, L, Dh, torch.float16)
+    block_size = 4
+    kv_blocks = (L + block_size - 1) // block_size
+    block_written = torch.ones((kv_blocks,), device="mps", dtype=torch.uint8).contiguous()
+
+    for _ in range(2):
+        _ = torch.ops.world.flex_attn_metal_fast_blocks(q, k, v, block_written, block_size, True)
+
+    iters = 3
+    start = time.perf_counter()
+    for _ in range(iters):
+        out = torch.ops.world.flex_attn_metal_fast_blocks(q, k, v, block_written, block_size, True)
+    avg_ms = (time.perf_counter() - start) * 1000.0 / iters
+    assert out.shape == q.shape
+    assert avg_ms > 0.0
+    assert avg_ms < 5000.0
+
+
+@pytest.mark.parametrize(
+    "shape",
+    [
+        (1, 16, 4, 192, 384, 64),
+        (1, 16, 4, 256, 768, 64),
+        (1, 8, 8, 256, 512, 64),
+        (2, 8, 2, 160, 320, 64),
+    ],
+)
+@pytest.mark.parametrize("sparsity", [1.0, 0.5, 0.25])
+def test_metal_fast_active_benchmark_matrix(shape, sparsity, monkeypatch):
+    _require_metal_ops()
+    monkeypatch.setenv("WORLD_METAL_FAST_NO_FALLBACK", "1")
+
+    B, Hq, Hkv, T, L, Dh = shape
+    q, k, v = _rand_gqa_tensors(B, Hq, Hkv, T, L, Dh, torch.float16)
+    block_size = 4
+    kv_blocks = (L + block_size - 1) // block_size
+    block_written = (torch.rand((kv_blocks,), device="mps") < sparsity).to(torch.uint8).contiguous()
+    active_blocks = torch.nonzero(block_written, as_tuple=False).flatten().to(torch.int32).contiguous()
+
+    stats = _timed_ms_sync(
+        lambda: torch.ops.world.flex_attn_metal_fast_active(
+            q, k, v, active_blocks, block_size, True
+        ),
+        warmup=10,
+        iters=40,
+    )
+
+    assert stats["mean_ms"] > 0.0
+    assert stats["p50_ms"] > 0.0
+    assert stats["p95_ms"] >= stats["p50_ms"]
+    assert stats["max_ms"] < 50.0
+
+
+def test_metal_fast_active_vs_blocks_latency(monkeypatch):
+    _require_metal_ops()
+    monkeypatch.setenv("WORLD_METAL_FAST_NO_FALLBACK", "1")
+
+    B, Hq, Hkv, T, L, Dh = 1, 16, 4, 256, 768, 64
+    q, k, v = _rand_gqa_tensors(B, Hq, Hkv, T, L, Dh, torch.float16)
+    block_size = 4
+    kv_blocks = (L + block_size - 1) // block_size
+    block_written = torch.tensor([(i % 2) == 0 for i in range(kv_blocks)], device="mps", dtype=torch.uint8).contiguous()
+    active_blocks = torch.nonzero(block_written, as_tuple=False).flatten().to(torch.int32).contiguous()
+
+    blocks_stats = _timed_ms_sync(
+        lambda: torch.ops.world.flex_attn_metal_fast_blocks(
+            q, k, v, block_written, block_size, True
+        ),
+        warmup=10,
+        iters=60,
+    )
+    active_stats = _timed_ms_sync(
+        lambda: torch.ops.world.flex_attn_metal_fast_active(
+            q, k, v, active_blocks, block_size, True
+        ),
+        warmup=10,
+        iters=60,
+    )
+
+    # Active path should not regress significantly versus block-written path.
+    assert active_stats["p50_ms"] <= blocks_stats["p50_ms"] * 1.25
+    assert active_stats["p95_ms"] <= blocks_stats["p95_ms"] * 1.25
+
+
+def test_world_backend_fast_active_stability(monkeypatch):
+    _require_metal_ops()
+    monkeypatch.setenv("WORLD_ATTENTION_BACKEND", "metal")
+    monkeypatch.setenv("WORLD_METAL_IMPL", "fast")
+    monkeypatch.setenv("WORLD_METAL_FAST_NO_FALLBACK", "1")
+
+    B, Hq, Hkv, T, L, Dh = 1, 16, 4, 192, 384, 64
+    q, k, v = _rand_gqa_tensors(B, Hq, Hkv, T, L, Dh, torch.float16)
+    block_size = 4
+    kv_blocks = (L + block_size - 1) // block_size
+    block_written = torch.tensor([(i % 2) == 0 for i in range(kv_blocks)], device="mps", dtype=torch.uint8).contiguous()
+    active_blocks = torch.nonzero(block_written, as_tuple=False).flatten().to(torch.int32).contiguous()
+    meta = AttnMeta(
+        flex_block_mask=None,
+        q_len=T,
+        kv_len=L,
+        block_written=block_written,
+        active_blocks=active_blocks,
+        block_size=block_size,
+    )
+    cfg = AttnConfig(causal=True, enable_gqa=True)
+
+    stats = _timed_ms_sync(
+        lambda: world_flex_attn_forward(q, k, v, meta, cfg, backend=AttnBackend.METAL),
+        warmup=12,
+        iters=80,
+    )
+    assert stats["mean_ms"] > 0.0
+    assert stats["p95_ms"] < 20.0
+    assert (stats["p95_ms"] / max(stats["p50_ms"], 1e-6)) < 3.0
+
diff --git a/tests/test_module_timing_budget.py b/tests/test_module_timing_budget.py
new file mode 100644
index 0000000..c254fce
--- /dev/null
+++ b/tests/test_module_timing_budget.py
@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+import json
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+import pytest
+
+
+def _repo_root() -> Path:
+    return Path(__file__).resolve().parents[1]
+
+
+@pytest.mark.skipif(os.environ.get("RUN_SLOW_METAL_TESTS", "0") != "1", reason="enable with RUN_SLOW_METAL_TESTS=1")
+@pytest.mark.skipif(os.environ.get("WORLD_ATTENTION_BACKEND", "metal") != "metal", reason="requires metal backend")
+def test_module_timing_report_budget(tmp_path: Path):
+    out_dir = tmp_path / "module_timing"
+    cmd = [
+        sys.executable,
+        "tests/profile_and_dump_variant_metal.py",
+        "--model-uri",
+        "Overworld-Models/Lapp0-WP-Mini-1.4.5-BL-Distill",
+        "--device",
+        "mps",
+        "--dtype",
+        "bfloat16",
+        "--profile-steps",
+        "4",
+        "--dump-phases",
+        "none",
+        "--module-timing",
+        "--output-dir",
+        str(out_dir),
+    ]
+    env = os.environ.copy()
+    env.setdefault("HF_HUB_OFFLINE", "1")
+    env.setdefault("TRANSFORMERS_OFFLINE", "1")
+    env.setdefault("TORCHDYNAMO_DISABLE", "1")
+    env.setdefault("WORLD_ATTENTION_BACKEND", "metal")
+    env.setdefault("WORLD_METAL_IMPL", "fast")
+    env.setdefault("WORLD_METAL_FAST_NO_FALLBACK", "1")
+    env.setdefault("WORLD_METAL_PREFER_ACTIVE_DISPATCH", "1")
+    env.setdefault("WORLD_KV_RUNTIME_CHECKS", "0")
+    env.setdefault("WORLD_KV_COMPUTE_ACTIVE_BLOCKS", "0")
+    env.setdefault("PYTHONPATH", ".")
+    subprocess.run(cmd, check=True, cwd=str(_repo_root()), env=env)
+
+    timing = json.loads((out_dir / "module_timing_report.json").read_text(encoding="utf-8"))
+    assert timing["enabled"] is True
+    assert isinstance(timing["modules"], list)
+    assert len(timing["modules"]) > 0
+    # Soft sanity: total measured module time should be positive.
+    total = sum(float(m["total_ms"]) for m in timing["modules"])
+    assert total > 0.0
+
diff --git a/tests/test_tensor_dump_regression.py b/tests/test_tensor_dump_regression.py
new file mode 100644
index 0000000..aecfd75
--- /dev/null
+++ b/tests/test_tensor_dump_regression.py
@@ -0,0 +1,43 @@
+from __future__ import annotations
+
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+import pytest
+
+
+def _repo_root() -> Path:
+    return Path(__file__).resolve().parents[1]
+
+
+def _baseline_dir() -> Path:
+    return _repo_root() / "diagnostics" / "out" / "metal_profile_baseline"
+
+
+@pytest.mark.skipif(not _baseline_dir().exists(), reason="baseline dump directory not found")
+def test_compare_tensor_dumps_self_consistency(tmp_path: Path):
+    compare_script = _repo_root() / "tests" / "compare_tensor_dumps.py"
+    out_dir = tmp_path / "cmp"
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    cmd = [
+        sys.executable,
+        str(compare_script),
+        "--baseline-dir",
+        str(_baseline_dir()),
+        "--candidate-dir",
+        str(_baseline_dir()),
+        "--phase",
+        "all",
+        "--strict",
+        "--out-dir",
+        str(out_dir),
+    ]
+    subprocess.run(cmd, check=True, cwd=str(_repo_root()))
+
+    summary = json.loads((out_dir / "comparison_summary.json").read_text(encoding="utf-8"))
+    assert summary["pass"] is True
+    assert summary["counts"]["failed_modules"] == 0
+