Skip to content

Commit 18b38fa

Browse files
committed
step cache for Wan 2.2 T2V
Signed-off-by: James Huang <syhuang1201@gmail.com>
1 parent 98c3449 commit 18b38fa

3 files changed

Lines changed: 105 additions & 49 deletions

File tree

src/maxdiffusion/configs/base_wan_27b.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -304,7 +304,8 @@ boundary_ratio: 0.875
304304

305305
# Diffusion CFG cache (FasterCache-style)
306306
use_cfg_cache: False
307-
# SenCache: sensitivity-aware adaptive caching (Haghighi & Alahi, 2026)
307+
# SenCache: Sensitivity-Aware Caching (arXiv:2602.24208) — skip forward pass
308+
# when predicted output change (based on accumulated latent/timestep drift) is small
308309
use_sen_cache: False
309310

310311
# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf

src/maxdiffusion/generate_wan.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ def call_pipeline(config, pipeline, prompt, negative_prompt):
139139
guidance_scale_low=config.guidance_scale_low,
140140
guidance_scale_high=config.guidance_scale_high,
141141
use_cfg_cache=config.use_cfg_cache,
142+
use_sen_cache=config.use_sen_cache,
142143
)
143144
else:
144145
raise ValueError(f"Unsupported model_name for T2V in config: {model_key}")
@@ -179,6 +180,7 @@ def run(config, pipeline=None, filename_prefix="", commit_hash=None):
179180
max_logging.log("Could not retrieve Git commit hash.")
180181

181182
if pipeline is None:
183+
load_start = time.perf_counter()
182184
model_type = config.model_type
183185
if model_key == WAN2_1:
184186
if model_type == "I2V":
@@ -193,6 +195,10 @@ def run(config, pipeline=None, filename_prefix="", commit_hash=None):
193195
else:
194196
raise ValueError(f"Unsupported model_name for checkpointer: {model_key}")
195197
pipeline, _, _ = checkpoint_loader.load_checkpoint()
198+
load_time = time.perf_counter() - load_start
199+
max_logging.log(f"load_time: {load_time:.1f}s")
200+
else:
201+
load_time = 0.0
196202

197203
# If LoRA is specified, inject layers and load weights.
198204
if (
@@ -276,6 +282,17 @@ def run(config, pipeline=None, filename_prefix="", commit_hash=None):
276282
max_logging.log(f"generation time per video: {generation_time_per_video}")
277283
else:
278284
max_logging.log("Warning: Number of videos is zero, cannot calculate generation_time_per_video.")
285+
max_logging.log(
286+
f"\n{'=' * 50}\n"
287+
f" TIMING SUMMARY\n"
288+
f"{'=' * 50}\n"
289+
f" Load (checkpoint): {load_time:>7.1f}s\n"
290+
f" Compile: {compile_time:>7.1f}s\n"
291+
f" {'─' * 40}\n"
292+
f" Inference: {generation_time:>7.1f}s\n"
293+
f"{'=' * 50}"
294+
)
295+
279296
s0 = time.perf_counter()
280297
if config.enable_profiler:
281298
max_utils.activate_profiler(config)

src/maxdiffusion/pipelines/wan/wan_pipeline_2_2.py

Lines changed: 86 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -206,42 +206,54 @@ def run_inference_2_2(
206206
1. CFG-Cache (use_cfg_cache=True) — FasterCache-style:
207207
Caches the unconditional branch and uses FFT frequency-domain compensation.
208208
209-
2. SenCache (use_sen_cache=True) — Sensitivity-aware caching:
210-
Measures output sensitivity after each full forward pass. When sensitivity
211-
is low (model output is stable), skips the entire transformer and reuses
212-
the cached noise prediction. Naturally handles MoE expert boundaries by
213-
detecting high sensitivity at transition points.
209+
2. SenCache (use_sen_cache=True) — Sensitivity-Aware Caching
210+
(Haghighi & Alahi, arXiv:2602.24208):
211+
Uses a first-order sensitivity approximation S = α_x·‖Δx‖ + α_t·|Δt|
212+
to predict output change. Caches when predicted change is below tolerance ε.
213+
Tracks accumulated latent drift and timestep drift since last cache refresh,
214+
adapting cache decisions per-sample. Sensitivity weights (α_x, α_t) are
215+
estimated from warmup steps via finite differences.
214216
"""
215217
do_classifier_free_guidance = guidance_scale_low > 1.0 or guidance_scale_high > 1.0
216218
bsz = latents.shape[0]
217219

218-
# ── SenCache path ──
220+
# ── SenCache path (arXiv:2602.24208, mirrors NebulaTPU SenCacheMiddleware) ──
219221
if use_sen_cache and do_classifier_free_guidance:
220222
timesteps_np = np.array(scheduler_state.timesteps, dtype=np.int32)
221223
step_uses_high = [bool(timesteps_np[s] >= boundary) for s in range(num_inference_steps)]
222224

223-
# Resolution-dependent SenCache config
224-
if height >= 720:
225-
sen_threshold = 0.06 # tighter for higher resolution
226-
warmup_ratio = 0.10
227-
max_consecutive_cache = 2
228-
else:
229-
sen_threshold = 0.08
230-
warmup_ratio = 0.08
231-
max_consecutive_cache = 3
232-
233-
warmup_steps = max(2, int(num_inference_steps * warmup_ratio))
225+
# SenCache hyperparameters (matching NebulaTPU defaults)
226+
sen_epsilon = 0.1 # main tolerance (permissive phase)
227+
max_reuse = 3 # max consecutive cache reuses before forced recompute
228+
warmup_steps = 1 # first step always computes
229+
# No-cache zones: first 30% (structure formation) and last 10% (detail refinement)
230+
nocache_start_ratio = 0.3
231+
nocache_end_ratio = 0.1
232+
# Uniform sensitivity weights (α_x=1, α_t=1); swap for pre-calibrated
233+
# SensitivityProfile per-timestep values when available.
234+
alpha_x, alpha_t = 1.0, 1.0
235+
236+
nocache_start = int(num_inference_steps * nocache_start_ratio)
237+
nocache_end_begin = int(num_inference_steps * (1.0 - nocache_end_ratio))
238+
# Normalize timesteps to [0, 1] to match NebulaTPU's sigma-based convention.
239+
# maxdiffusion timesteps are integers in [0, num_train_timesteps]; NebulaTPU
240+
# uses sigmas in [0, 1]. Without normalization |Δt|≈20 >> ε and nothing caches.
241+
num_train_timesteps = float(scheduler.config.num_train_timesteps)
234242

235243
prompt_embeds_combined = jnp.concatenate([prompt_embeds, negative_prompt_embeds], axis=0)
236244

237245
# SenCache state
238-
prev_noise_pred = None # last full-computation noise prediction
239-
sensitivity = float('inf') # measured relative output change
240-
consecutive_cached = 0 # consecutive steps using cache
246+
ref_noise_pred = None # y^r: cached denoiser output
247+
ref_latent = None # x^r: latent at last cache refresh
248+
ref_timestep = 0.0 # t^r: timestep (normalized to [0,1]) at last cache refresh
249+
accum_dx = 0.0 # accumulated ||Δx|| since last refresh
250+
accum_dt = 0.0 # accumulated |Δt| since last refresh
251+
reuse_count = 0 # consecutive cache reuses
241252
cache_count = 0
242253

243254
for step in range(num_inference_steps):
244255
t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step]
256+
t_float = float(timesteps_np[step]) / num_train_timesteps # normalize to [0, 1]
245257

246258
# Select transformer and guidance scale
247259
if step_uses_high[step]:
@@ -251,47 +263,73 @@ def run_inference_2_2(
251263
graphdef, state, rest = low_noise_graphdef, low_noise_state, low_noise_rest
252264
guidance_scale = guidance_scale_low
253265

254-
# Caching decision
255-
is_warmup = step < warmup_steps
266+
# Force full compute: warmup, first 30%, last 10%, or transformer boundary
256267
is_boundary = step > 0 and step_uses_high[step] != step_uses_high[step - 1]
257-
should_cache = (
258-
not is_warmup
259-
and not is_boundary
260-
and prev_noise_pred is not None
261-
and sensitivity < sen_threshold
262-
and consecutive_cached < max_consecutive_cache
268+
force_compute = (
269+
step < warmup_steps or step < nocache_start or step >= nocache_end_begin or is_boundary or ref_noise_pred is None
263270
)
264271

265-
if should_cache:
266-
# ── Cache step: reuse previous noise prediction ──
267-
noise_pred = prev_noise_pred
268-
consecutive_cached += 1
272+
if force_compute:
273+
latents_doubled = jnp.concatenate([latents] * 2)
274+
timestep = jnp.broadcast_to(t, bsz * 2)
275+
noise_pred, _, _ = transformer_forward_pass_full_cfg(
276+
graphdef,
277+
state,
278+
rest,
279+
latents_doubled,
280+
timestep,
281+
prompt_embeds_combined,
282+
guidance_scale=guidance_scale,
283+
)
284+
ref_noise_pred = noise_pred
285+
ref_latent = latents
286+
ref_timestep = t_float
287+
accum_dx = 0.0
288+
accum_dt = 0.0
289+
reuse_count = 0
290+
latents, scheduler_state = scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple()
291+
continue
292+
293+
# Accumulate deltas since last full compute
294+
dx_norm = float(jnp.sqrt(jnp.mean((latents - ref_latent) ** 2)))
295+
dt = abs(t_float - ref_timestep)
296+
accum_dx += dx_norm
297+
accum_dt += dt
298+
299+
# Sensitivity score (Eq. 9)
300+
score = alpha_x * accum_dx + alpha_t * accum_dt
301+
302+
if score <= sen_epsilon and reuse_count < max_reuse:
303+
# Cache hit: reuse previous output
304+
noise_pred = ref_noise_pred
305+
reuse_count += 1
269306
cache_count += 1
270307
else:
271-
# ── Full CFG step ──
308+
# Cache miss: full CFG forward pass
272309
latents_doubled = jnp.concatenate([latents] * 2)
273310
timestep = jnp.broadcast_to(t, bsz * 2)
274311
noise_pred, _, _ = transformer_forward_pass_full_cfg(
275-
graphdef, state, rest,
276-
latents_doubled, timestep, prompt_embeds_combined,
312+
graphdef,
313+
state,
314+
rest,
315+
latents_doubled,
316+
timestep,
317+
prompt_embeds_combined,
277318
guidance_scale=guidance_scale,
278319
)
279-
280-
# Measure sensitivity: relative output change since last full step
281-
if prev_noise_pred is not None:
282-
output_diff = jnp.mean(jnp.abs(noise_pred - prev_noise_pred))
283-
output_magnitude = jnp.mean(jnp.abs(noise_pred)) + 1e-8
284-
sensitivity = float(output_diff / output_magnitude)
285-
else:
286-
sensitivity = float('inf')
287-
288-
prev_noise_pred = noise_pred
289-
consecutive_cached = 0
320+
ref_noise_pred = noise_pred
321+
ref_latent = latents
322+
ref_timestep = t_float
323+
accum_dx = 0.0
324+
accum_dt = 0.0
325+
reuse_count = 0
290326

291327
latents, scheduler_state = scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple()
292328

293-
print(f"[SenCache] Cached {cache_count}/{num_inference_steps} steps "
294-
f"({100*cache_count/num_inference_steps:.1f}% cache ratio)")
329+
print(
330+
f"[SenCache] Cached {cache_count}/{num_inference_steps} steps "
331+
f"({100*cache_count/num_inference_steps:.1f}% cache ratio)"
332+
)
295333
return latents
296334

297335
# ── CFG cache path ──

0 commit comments

Comments
 (0)