From 785edd46e1ac8084c2fcd064347dc7030e8e2056 Mon Sep 17 00:00:00 2001 From: haok1402 Date: Sat, 4 Apr 2026 14:55:01 -0400 Subject: [PATCH 1/2] Revise the interface to skip checkpoint saving if save_interval left unspecified to support short validation runs --- pithtrain/modules/training.py | 14 ++++++++++---- pithtrain/tasks/pretrain_language_model.py | 16 +++++++++++----- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/pithtrain/modules/training.py b/pithtrain/modules/training.py index 00633b1..0df65e6 100644 --- a/pithtrain/modules/training.py +++ b/pithtrain/modules/training.py @@ -95,14 +95,20 @@ class TrainingCfg(SlottedDefault): (e.g. ``"examples/.../qwen3_30b_a3b_config.json"``). """ - save_interval: int + save_interval: Optional[int] = None """ - The interval (in steps) at which to save model checkpoints. + The interval (in steps) at which to save checkpoints. When None, + checkpoint saving is disabled but loading still occurs from + ``save_location`` (if set). This is useful for validation runs + that need to load a pretrained checkpoint without writing new ones. """ - save_location: Path + save_location: Optional[Path] = None """ - The directory where model checkpoints will be saved. + The directory for checkpoint storage. Checkpoints are loaded from + and saved to ``/torch-dcp/step-XXXXXXXX``. When + None, both loading and saving are disabled and the model trains + from scratch. """ moe_load_balance_coef: float = 0.0 diff --git a/pithtrain/tasks/pretrain_language_model.py b/pithtrain/tasks/pretrain_language_model.py index d13b590..c740898 100644 --- a/pithtrain/tasks/pretrain_language_model.py +++ b/pithtrain/tasks/pretrain_language_model.py @@ -284,6 +284,7 @@ def save_checkpoint(cfg: PretrainLanguageModelCfg, ctx: PretrainLanguageModelCtx DTensors are kept as CPU DTensors and DCP saves each rank's shard. 
""" stdout = ctx.logging.stdout + assert cfg.training.save_location is not None save_location = Path(cfg.training.save_location, "torch-dcp", "step-%08d" % ctx.training.step) model = ctx.training.model optimizer = ctx.training.optimizer @@ -317,6 +318,9 @@ def load_checkpoint(cfg: PretrainLanguageModelCfg, ctx: PretrainLanguageModelCtx Load the checkpoint from the latest step. """ stdout = ctx.logging.stdout + if cfg.training.save_location is None: + stdout.info("No save_location set; training from scratch.") + return path2step = lambda p: int(p.stem.removeprefix("step-")) checkpoints = Path(cfg.training.save_location, "torch-dcp").glob("step-*") checkpoints = sorted(checkpoints, key=path2step) @@ -466,11 +470,13 @@ def train_step(cfg: PretrainLanguageModelCfg, ctx: PretrainLanguageModelCtx) -> # We should save the checkpoint if any of the following conditions is true: # 1. The current step is a multiple of save_interval. # 2. The current step is the last step (max_steps). - should_save = False - should_save |= ctx.training.step % cfg.training.save_interval == 0 - should_save |= ctx.training.step == cfg.training.max_steps - if should_save: - save_checkpoint(cfg, ctx) + # Skip entirely if save_interval is None. + if cfg.training.save_interval is not None: + should_save = False + should_save |= ctx.training.step % cfg.training.save_interval == 0 + should_save |= ctx.training.step == cfg.training.max_steps + if should_save: + save_checkpoint(cfg, ctx) # Run deferred GC here so cyclic collection never fires mid-forward/backward. 
gc.collect() From 6fe932eaedd879df9beafe8c55eeae17e9815eb9 Mon Sep 17 00:00:00 2001 From: haok1402 Date: Sat, 4 Apr 2026 15:18:11 -0400 Subject: [PATCH 2/2] Add the claude skills for correctness validation --- .../skills/correctness-validation/SKILL.md | 206 ++++++++++++++++ .../correctness-validation/scripts/compare.py | 224 ++++++++++++++++++ .../scripts/launch_setup.sh | 39 +++ .../scripts/launch_validate.sh | 52 ++++ .../scripts/setup_deepseek_v2_lite.py | 73 ++++++ .../scripts/setup_qwen3_30b_a3b.py | 73 ++++++ .../scripts/validate_deepseek_v2_lite.py | 41 ++++ .../scripts/validate_qwen3_30b_a3b.py | 41 ++++ .gitignore | 1 - 9 files changed, 749 insertions(+), 1 deletion(-) create mode 100644 .claude/skills/correctness-validation/SKILL.md create mode 100644 .claude/skills/correctness-validation/scripts/compare.py create mode 100755 .claude/skills/correctness-validation/scripts/launch_setup.sh create mode 100755 .claude/skills/correctness-validation/scripts/launch_validate.sh create mode 100644 .claude/skills/correctness-validation/scripts/setup_deepseek_v2_lite.py create mode 100644 .claude/skills/correctness-validation/scripts/setup_qwen3_30b_a3b.py create mode 100644 .claude/skills/correctness-validation/scripts/validate_deepseek_v2_lite.py create mode 100644 .claude/skills/correctness-validation/scripts/validate_qwen3_30b_a3b.py diff --git a/.claude/skills/correctness-validation/SKILL.md b/.claude/skills/correctness-validation/SKILL.md new file mode 100644 index 0000000..a6f0856 --- /dev/null +++ b/.claude/skills/correctness-validation/SKILL.md @@ -0,0 +1,206 @@ +--- +name: correctness-validation +description: Validates that code changes do not break training correctness by comparing loss curves between a base branch and the current feature branch. Use when user asks to "validate correctness", "check if changes break training", "compare loss curves", "run a regression test", or "verify my changes are correct". 
Also use when a feature branch modifies model code, operators, pipeline logic, or distributed training modules. +--- + +# Correctness Validation + +Validates training correctness by running a short 15-step training run on both a base branch and the current feature branch, then comparing three metrics step-by-step: cross-entropy loss, load-balance loss, and gradient norm. + +## Overview + +The validation has two phases: + +1. **Shared setup** (run once, reused across branches): download a minimal DCLM corpus shard, tokenize it, download and convert the HuggingFace checkpoint to DCP format. +2. **Branch comparison**: run 15 training steps on the base branch (via git worktree) and the feature branch, then compare the stdout logs. + +Shared setup artifacts live in `workspace/` and are deterministic given the same seed and released checkpoint, so they are safe to share between branches. + +## Prerequisites + +- **Python environment**: Use the `.venv` in the original repo root (not the worktree). Activate it before running any scripts: `source $REPO_ROOT/.venv/bin/activate`. If `.venv` does not exist, create it following the README instructions (`uv venv && uv sync`). +- **Hardware**: Minimum **4x B200 GPUs** (PP=2, EP=2 with DeepSeek-V2-Lite). + +Note: both `.venv` and `workspace/` live in the original repo root. The worktree gets both via symlink (see Step 4). + +## Supported Models + +Each model has a validation script and a setup script under `scripts/`: + +| Model | Setup Script | Validation Script | GPUs | +|---|---|---|---| +| DeepSeek-V2-Lite | `setup_deepseek_v2_lite.py` | `validate_deepseek_v2_lite.py` | 4 (PP=2, EP=2) | +| Qwen3-30B-A3B | `setup_qwen3_30b_a3b.py` | `validate_qwen3_30b_a3b.py` | 16 (PP=2, EP=8) | + +## Step-by-Step Workflow + +### Step 1: Determine Impact and Select Models + +Analyze the code change to decide which models need validation. The goal is to run validation on **every model whose behavior could be affected**. 
+
+**How to analyze impact:**
+
+1. Get the list of changed files:
+   ```bash
+   git diff --name-only
+   ```
+
+2. **If changes are under a model-specific directory** (e.g., `pithtrain/models/deepseek_v2_lite/` or `pithtrain/models/qwen3_moe/`), only that model is affected.
+
+3. **If changes are in shared code** (e.g., `pithtrain/operators/`, `pithtrain/layers/`, `pithtrain/dualpipe/`, `pithtrain/modules/`, `pithtrain/tasks/`), read the changed code and determine whether it touches a feature that is model-specific or universal:
+   - Read each model's `config.json` at `examples/pretrain_language_model/<model>/config.json` to understand what features that model uses (attention type, shared experts, expert count, RoPE variant, etc.)
+   - Read the changed code to understand what architectural features it touches
+   - A model is affected if it uses any feature touched by the change
+
+4. **If unsure whether a model is affected, include it.** Over-validating is better than missing a regression.
+
+### Step 2: Detect Environment
+
+Check if running under SLURM by testing for `SLURM_JOB_ID`:
+
+```bash
+if [ -n "${SLURM_JOB_ID:-}" ]; then
+  echo "SLURM detected (job $SLURM_JOB_ID) — will use srun for multi-node launch"
+else
+  echo "No SLURM — single-node launch"
+fi
+```
+
+This determines whether to prefix commands with `srun -W 0`. The workspace directory is **node-local storage**, so setup (data download, tokenization, checkpoint conversion) must run on **every node**.
+
+### Step 3: Shared Setup
+
+Run the setup launch script for each affected model. The setup scripts are idempotent — they skip steps whose output already exists.
+
+```bash
+# Single-node (replace <model> with deepseek-v2-lite or qwen3-30b-a3b)
+bash .claude/skills/correctness-validation/scripts/launch_setup.sh <model>
+
+# Multi-node (SLURM) — must run on every node since workspace is node-local
+srun -W 0 .claude/skills/correctness-validation/scripts/launch_setup.sh <model>
+```
+
+This downloads a single minimal DCLM shard (`global-shard_01_of_10/local-shard_0_of_10/shard_00000000_processed.jsonl.zst`), tokenizes it with the model's tokenizer, downloads the HuggingFace checkpoint, and converts it to DCP format.
+
+### Step 4: Create Git Worktree for Base Branch
+
+Create a worktree for the base branch. Symlink `workspace/` and `.venv` from the repo root so both branches share the same data and environment.
+
+```bash
+BASE_BRANCH=main # or the branch this feature was based on
+WORKTREE=$(mktemp -d)
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+git worktree add $WORKTREE $BASE_BRANCH
+ln -sfn $REPO_ROOT/workspace $WORKTREE/workspace
+ln -sfn $REPO_ROOT/.venv $WORKTREE/.venv
+```
+
+### Step 5: Run Validation on Base Branch
+
+Run 15 training steps in the base worktree. Only run the model(s) selected in Step 1.
+
+```bash
+cd $WORKTREE
+
+# Single-node (replace <model> with deepseek-v2-lite or qwen3-30b-a3b)
+bash .claude/skills/correctness-validation/scripts/launch_validate.sh <model>
+
+# Multi-node (SLURM)
+srun -W 0 .claude/skills/correctness-validation/scripts/launch_validate.sh <model>
+```
+
+The launch script auto-detects SLURM environment variables (`SLURM_NNODES`, `SLURM_NODEID`, `SLURM_STEP_GPUS`, `SLURM_STEP_NODELIST`) to configure `torchrun` arguments. On single-node, it falls back to localhost defaults.
+
+Logs are written to `logging/correctness-validation/validate_<model>_node<nodeid>.log`.
+
+Return to the original repo directory after the run completes.
+
+### Step 6: Run Validation on Feature Branch
+
+Run the same 15 steps in the current (feature) working directory, for the same model(s).
+ +```bash +cd $REPO_ROOT + +# Single-node +bash .claude/skills/correctness-validation/scripts/launch_validate.sh + +# Multi-node (SLURM) +srun -W 0 .claude/skills/correctness-validation/scripts/launch_validate.sh +``` + +### Step 7: Compare Results + +Run the compare script for each model that was validated. Use the node-0 logs (rank 0 emits the metrics). Run `python3 .claude/skills/correctness-validation/scripts/compare.py --help` for full options. + +```bash +python3 .claude/skills/correctness-validation/scripts/compare.py \ + $WORKTREE/logging/correctness-validation/validate__node0.log \ + logging/correctness-validation/validate__node0.log +``` + +The compare script parses both logs, extracts per-step metrics, and reports pass/fail. It checks: + +- **cross-entropy-loss**: relative tolerance per step +- **load-balance-loss**: relative tolerance per step +- **gradient-norm**: relative tolerance per step + +Default tolerance is 1e-3 relative difference. Use `--tolerance` to adjust. + +Expected output on success: + +``` +PASS: All metrics within tolerance across all steps. +``` + +Expected output on failure: + +``` +FAIL: Metrics diverged beyond tolerance: + cross-entropy-loss: + step 003: cross-entropy-loss diverged — base=2.663700, feature=2.680100, rel_diff=6.16e-03 > tolerance=1e-03 +``` + +### Step 8: Clean Up + +```bash +git worktree remove $WORKTREE +``` + +## Log Format + +The training scripts emit lines like: + +``` +2026-04-02 12:32:40 | INFO | step 00000001/00000015 | step-time 110.990 sec | cross-entropy-loss 2.6637 | load-balance-loss 0.001234 | learning-rate 1.000000e-06 | gradient-norm 20.3210 | tokens-per-second 18,895 | peak-gpu-memory 47.20 GB +``` + +The compare script parses pipe-separated key-value pairs from lines containing `| INFO | step `. + +## Common Issues + +### Setup fails on HuggingFace download + +Ensure `HF_TOKEN` is set if the model is gated. DeepSeek-V2-Lite and Qwen3-30B-A3B are public models. 
+ +### OOM during validation + +DeepSeek-V2-Lite requires 4x B200 GPUs. Qwen3-30B-A3B requires 16x B200 GPUs. If OOM occurs, check that no other processes are using GPU memory. + +### Logs show no load-balance-loss + +The validation scripts set `moe_load_balance_coef > 0` to ensure this metric is logged. If it is missing, check that the validation script (not an example script) was used. + +### Tolerance too strict + +FP8 operations and flash attention can introduce small non-determinism. If validation fails with very small differences, try increasing tolerance: + +```bash +python3 .claude/skills/correctness-validation/scripts/compare.py \ + base.log feature.log --tolerance 4e-3 +``` + +### Worktree conflicts + +If the worktree was not cleaned up from a previous run, use `git worktree list` to find it and `git worktree remove --force` to remove it. diff --git a/.claude/skills/correctness-validation/scripts/compare.py b/.claude/skills/correctness-validation/scripts/compare.py new file mode 100644 index 0000000..aa726be --- /dev/null +++ b/.claude/skills/correctness-validation/scripts/compare.py @@ -0,0 +1,224 @@ +""" +Compare training logs from two branches for correctness validation. + +Parses stdout logs from PithTrain training runs, extracts per-step metrics +(cross-entropy-loss, load-balance-loss, gradient-norm), and reports whether +the two runs produce consistent results within a configurable tolerance. + +Usage: + python3 compare.py [--tolerance 1e-3] + +Exit code 0 = PASS, exit code 1 = FAIL. +""" + +import argparse +import re +import sys + +METRICS = ["cross-entropy-loss", "load-balance-loss", "gradient-norm"] +STEP_PATTERN = re.compile(r"step\s+(\d+)/(\d+)") + + +def parse_log(path): + """ + Parse a PithTrain training log and extract per-step metrics. + + Parameters + ---- + path : str + Path to the log file. + + Returns + ---- + steps : list[dict[str, float]] + List of dicts, one per step, with metric names as keys. 
+ """ + steps = [] + + with open(path) as f: + for line in f: + if "| INFO | step " not in line: + continue + + parts = line.split("|") + metrics = dict() + + for part in parts: + part = part.strip() + + m = STEP_PATTERN.match(part) + if m: + metrics["step"] = int(m.group(1)) + continue + + tokens = part.rsplit(None, 1) + if len(tokens) == 2: + key = tokens[0].strip() + val = tokens[1].strip().replace(",", "") + try: + metrics[key] = float(val) + except ValueError: + pass + + if "step" in metrics: + steps.append(metrics) + + return steps + + +def compare_metric(base_steps, feature_steps, metric, tolerance): + """ + Compare a single metric across steps. + + Parameters + ---- + base_steps : list[dict] + Parsed steps from the base branch log. + feature_steps : list[dict] + Parsed steps from the feature branch log. + metric : str + Name of the metric to compare. + tolerance : float + Maximum allowed relative difference. + + Returns + ---- + failures : list[str] + List of failure messages. Empty if all steps pass. + """ + failures = [] + + for base, feature in zip(base_steps, feature_steps): + step = base["step"] + base_val = base.get(metric) + feature_val = feature.get(metric) + + if base_val is None and feature_val is None: + continue + + if base_val is None or feature_val is None: + failures.append( + f" step {step:03d}: {metric} present in one log but not the other " + f"(base={base_val}, feature={feature_val})" + ) + continue + + if base_val == 0 and feature_val == 0: + continue + + denom = abs(base_val) if base_val != 0 else abs(feature_val) + rel_diff = abs(base_val - feature_val) / denom + + if rel_diff > tolerance: + failures.append( + f" step {step:03d}: {metric} diverged — " + f"base={base_val:.6f}, feature={feature_val:.6f}, " + f"rel_diff={rel_diff:.2e} > tolerance={tolerance:.0e}" + ) + + return failures + + +def print_comparison_table(base_steps, feature_steps): + """ + Print a step-by-step comparison table to stdout. 
+ + Parameters + ---- + base_steps : list[dict] + Parsed steps from the base branch log. + feature_steps : list[dict] + Parsed steps from the feature branch log. + """ + print("Step-by-step comparison:") + print("-" * 100) + + header = f"{'step':>5}" + for metric in METRICS: + header += f" | {'base ' + metric:>28} {'feature':>12} {'rel_diff':>10}" + print(header) + print("-" * 100) + + for base, feature in zip(base_steps, feature_steps): + step = base["step"] + row = f"{step:5d}" + for metric in METRICS: + bv = base.get(metric) + fv = feature.get(metric) + if bv is not None and fv is not None: + denom = abs(bv) if bv != 0 else (abs(fv) if fv != 0 else 1.0) + rd = abs(bv - fv) / denom + row += f" | {bv:28.6f} {fv:12.6f} {rd:10.2e}" + elif bv is not None: + row += f" | {bv:28.6f} {'N/A':>12} {'N/A':>10}" + elif fv is not None: + row += f" | {'N/A':>28} {fv:12.6f} {'N/A':>10}" + else: + row += f" | {'N/A':>28} {'N/A':>12} {'N/A':>10}" + print(row) + + print("-" * 100) + print() + + +def main(): + """ + Entry point. Parse arguments, compare logs, and report pass/fail. 
+ """ + parser = argparse.ArgumentParser(description="Compare PithTrain training logs.") + parser.add_argument("base_log", help="Path to base branch log file") + parser.add_argument("feature_log", help="Path to feature branch log file") + parser.add_argument( + "--tolerance", + type=float, + default=1e-3, + help="Relative tolerance for metric comparison (default: 1e-3)", + ) + args = parser.parse_args() + + base_steps = parse_log(args.base_log) + feature_steps = parse_log(args.feature_log) + + print(f"Base log: {args.base_log} ({len(base_steps)} steps)") + print(f"Feature log: {args.feature_log} ({len(feature_steps)} steps)") + print(f"Tolerance: {args.tolerance:.0e}") + print() + + if len(base_steps) == 0: + print("FAIL: No training steps found in base log.") + sys.exit(1) + + if len(feature_steps) == 0: + print("FAIL: No training steps found in feature log.") + sys.exit(1) + + if len(base_steps) != len(feature_steps): + print( + f"WARNING: Step count mismatch — base has {len(base_steps)}, " + f"feature has {len(feature_steps)}. Comparing first " + f"{min(len(base_steps), len(feature_steps))} steps." 
+ ) + print() + + all_failures = dict() + for metric in METRICS: + failures = compare_metric(base_steps, feature_steps, metric, args.tolerance) + if failures: + all_failures[metric] = failures + + print_comparison_table(base_steps, feature_steps) + + if not all_failures: + print("PASS: All metrics within tolerance across all steps.") + sys.exit(0) + else: + print("FAIL: Metrics diverged beyond tolerance:") + for metric, failures in all_failures.items(): + print(f"\n {metric}:") + for f in failures: + print(f) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/correctness-validation/scripts/launch_setup.sh b/.claude/skills/correctness-validation/scripts/launch_setup.sh new file mode 100755 index 0000000..bf4641f --- /dev/null +++ b/.claude/skills/correctness-validation/scripts/launch_setup.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# Download data, tokenize corpus, and convert checkpoint for correctness validation. +# +# Single-node usage: +# bash .claude/skills/correctness-validation/scripts/launch_setup.sh deepseek-v2-lite +# bash .claude/skills/correctness-validation/scripts/launch_setup.sh qwen3-30b-a3b +# +# Multi-node usage (SLURM) — run on every node since workspace is node-local: +# srun -W 0 .claude/skills/correctness-validation/scripts/launch_setup.sh deepseek-v2-lite + +set -euo pipefail +export PYTHONUNBUFFERED=1 + +if [ $# -ne 1 ]; then + echo "Usage: launch_setup.sh " >&2 + echo " Models: deepseek-v2-lite, qwen3-30b-a3b" >&2 + exit 1 +fi + +MODEL=$1 + +case $MODEL in + deepseek-v2-lite) + SCRIPT=.claude/skills/correctness-validation/scripts/setup_deepseek_v2_lite.py + ;; + qwen3-30b-a3b) + SCRIPT=.claude/skills/correctness-validation/scripts/setup_qwen3_30b_a3b.py + ;; + *) + echo "Unknown model: $MODEL" >&2 + echo " Models: deepseek-v2-lite, qwen3-30b-a3b" >&2 + exit 1 + ;; +esac + +OUTPUT=logging/correctness-validation/setup_${MODEL}_node${SLURM_NODEID:-0}.log + +mkdir -p $(dirname $OUTPUT) && exec > >(tee $OUTPUT) 2>&1 
+python3 $SCRIPT
diff --git a/.claude/skills/correctness-validation/scripts/launch_validate.sh b/.claude/skills/correctness-validation/scripts/launch_validate.sh
new file mode 100755
index 0000000..84221ad
--- /dev/null
+++ b/.claude/skills/correctness-validation/scripts/launch_validate.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+# Run 15-step correctness validation training.
+#
+# Single-node usage:
+# bash .claude/skills/correctness-validation/scripts/launch_validate.sh deepseek-v2-lite
+# bash .claude/skills/correctness-validation/scripts/launch_validate.sh qwen3-30b-a3b
+#
+# Multi-node usage (SLURM):
+# srun -W 0 .claude/skills/correctness-validation/scripts/launch_validate.sh deepseek-v2-lite
+
+set -euo pipefail
+export OMP_NUM_THREADS=8
+export PYTHONUNBUFFERED=1
+
+if [ $# -ne 1 ]; then
+  echo "Usage: launch_validate.sh <model>" >&2
+  echo " Models: deepseek-v2-lite, qwen3-30b-a3b" >&2
+  exit 1
+fi
+
+MODEL=$1
+
+case $MODEL in
+  deepseek-v2-lite)
+    SCRIPT=.claude/skills/correctness-validation/scripts/validate_deepseek_v2_lite.py
+    ;;
+  qwen3-30b-a3b)
+    SCRIPT=.claude/skills/correctness-validation/scripts/validate_qwen3_30b_a3b.py
+    ;;
+  *)
+    echo "Unknown model: $MODEL" >&2
+    echo " Models: deepseek-v2-lite, qwen3-30b-a3b" >&2
+    exit 1
+    ;;
+esac
+
+# Setup distributed — auto-detect SLURM or fall back to single-node.
+SLURM_NNODES=${SLURM_NNODES:-1} +SLURM_NODEID=${SLURM_NODEID:-0} +SLURM_STEP_GPUS=${SLURM_STEP_GPUS:-${CUDA_VISIBLE_DEVICES:-$(nvidia-smi --query-gpu=index --format=csv,noheader | paste -sd,)}} +SLURM_STEP_NODELIST=${SLURM_STEP_NODELIST:-$(hostname)} + +LAUNCH_ARGS=() +LAUNCH_ARGS+=(--nnodes=$SLURM_NNODES --node-rank=$SLURM_NODEID) +LAUNCH_ARGS+=(--nproc-per-node=$(echo "$SLURM_STEP_GPUS" | tr ',' '\n' | wc -l)) +LAUNCH_ARGS+=(--rdzv-backend=c10d) +LAUNCH_ARGS+=(--rdzv-endpoint=$(command -v scontrol &>/dev/null && scontrol show hostnames $SLURM_STEP_NODELIST | head -n 1 || echo localhost):15213) + +OUTPUT=logging/correctness-validation/validate_${MODEL}_node${SLURM_NODEID:-0}.log + +mkdir -p $(dirname $OUTPUT) && exec > >(tee $OUTPUT) 2>&1 +torchrun ${LAUNCH_ARGS[@]} $SCRIPT diff --git a/.claude/skills/correctness-validation/scripts/setup_deepseek_v2_lite.py b/.claude/skills/correctness-validation/scripts/setup_deepseek_v2_lite.py new file mode 100644 index 0000000..f110b6a --- /dev/null +++ b/.claude/skills/correctness-validation/scripts/setup_deepseek_v2_lite.py @@ -0,0 +1,73 @@ +""" +Setup shared data for DeepSeek-V2-Lite correctness validation. + +Downloads a minimal DCLM corpus shard, tokenizes it with the DeepSeek-V2 tokenizer, +and converts the HuggingFace checkpoint to DCP format. + +Idempotent: skips steps whose output already exists. 
+""" + +from pathlib import Path + +from huggingface_hub import snapshot_download + +from pithtrain.tasks.build_tokenized_corpus import BuildTokenizedCorpusCfg +from pithtrain.tasks.build_tokenized_corpus import launch as step2_launch +from pithtrain.tasks.convert_checkpoint import ConvertCheckpointCfg +from pithtrain.tasks.convert_checkpoint import launch as step4_launch + +# Step 1: Download minimal DCLM shard + +RAWTXT = Path("workspace/datasets/dclm-baseline/rawtxt") +SHARD = "global-shard_01_of_10/local-shard_0_of_10/shard_00000000_processed.jsonl.zst" + +if not (Path(RAWTXT, SHARD)).exists(): + print(f"Downloading DCLM shard: {SHARD}") + snapshot_download( + "mlfoundations/dclm-baseline-1.0", + repo_type="dataset", + local_dir=str(RAWTXT), + allow_patterns=SHARD, + ) +else: + print(f"DCLM shard already exists: {Path(RAWTXT, SHARD)}") + +# Step 2: Tokenize with DeepSeek-V2 tokenizer + +TOKTXT = Path("workspace/datasets/dclm-baseline/toktxt/deepseek-v2") + +if not TOKTXT.exists() or not any(TOKTXT.glob("*.bin")): + print("Tokenizing corpus with DeepSeek-V2 tokenizer") + cfg = BuildTokenizedCorpusCfg() + cfg.tokenizer_name = "deepseek-ai/DeepSeek-V2-Lite" + cfg.source_path = RAWTXT + cfg.output_path = TOKTXT + step2_launch(cfg) +else: + print(f"Tokenized corpus already exists: {TOKTXT}") + +# Step 3: Download HuggingFace checkpoint + +HF_IMPORT = Path("workspace/checkpoints/deepseek-v2-lite/hf-import") + +if not HF_IMPORT.exists() or not any(HF_IMPORT.glob("*.safetensors")): + print("Downloading DeepSeek-V2-Lite HuggingFace checkpoint") + snapshot_download(repo_id="deepseek-ai/DeepSeek-V2-Lite", local_dir=str(HF_IMPORT)) +else: + print(f"HuggingFace checkpoint already exists: {HF_IMPORT}") + +# Step 4: Convert to DCP format + +TORCH_DCP = Path("workspace/checkpoints/deepseek-v2-lite/torch-dcp/step-00000000") + +if not TORCH_DCP.exists() or not any(TORCH_DCP.iterdir()): + print("Converting HuggingFace checkpoint to DCP format") + cfg = ConvertCheckpointCfg() + 
cfg.operation = "hf2dcp" + cfg.load_path = HF_IMPORT + cfg.save_path = TORCH_DCP + step4_launch(cfg) +else: + print(f"DCP checkpoint already exists: {TORCH_DCP}") + +print("Setup complete for DeepSeek-V2-Lite correctness validation.") diff --git a/.claude/skills/correctness-validation/scripts/setup_qwen3_30b_a3b.py b/.claude/skills/correctness-validation/scripts/setup_qwen3_30b_a3b.py new file mode 100644 index 0000000..4dfe2b6 --- /dev/null +++ b/.claude/skills/correctness-validation/scripts/setup_qwen3_30b_a3b.py @@ -0,0 +1,73 @@ +""" +Setup shared data for Qwen3-30B-A3B correctness validation. + +Downloads a minimal DCLM corpus shard, tokenizes it with the Qwen3 tokenizer, +and converts the HuggingFace checkpoint to DCP format. + +Idempotent: skips steps whose output already exists. +""" + +from pathlib import Path + +from huggingface_hub import snapshot_download + +from pithtrain.tasks.build_tokenized_corpus import BuildTokenizedCorpusCfg +from pithtrain.tasks.build_tokenized_corpus import launch as step2_launch +from pithtrain.tasks.convert_checkpoint import ConvertCheckpointCfg +from pithtrain.tasks.convert_checkpoint import launch as step4_launch + +# Step 1: Download minimal DCLM shard + +RAWTXT = Path("workspace/datasets/dclm-baseline/rawtxt") +SHARD = "global-shard_01_of_10/local-shard_0_of_10/shard_00000000_processed.jsonl.zst" + +if not (Path(RAWTXT, SHARD)).exists(): + print(f"Downloading DCLM shard: {SHARD}") + snapshot_download( + "mlfoundations/dclm-baseline-1.0", + repo_type="dataset", + local_dir=str(RAWTXT), + allow_patterns=SHARD, + ) +else: + print(f"DCLM shard already exists: {Path(RAWTXT, SHARD)}") + +# Step 2: Tokenize with Qwen3 tokenizer + +TOKTXT = Path("workspace/datasets/dclm-baseline/toktxt/qwen3") + +if not TOKTXT.exists() or not any(TOKTXT.glob("*.bin")): + print("Tokenizing corpus with Qwen3 tokenizer") + cfg = BuildTokenizedCorpusCfg() + cfg.tokenizer_name = "Qwen/Qwen3-30B-A3B" + cfg.source_path = RAWTXT + cfg.output_path = TOKTXT 
+ step2_launch(cfg) +else: + print(f"Tokenized corpus already exists: {TOKTXT}") + +# Step 3: Download HuggingFace checkpoint + +HF_IMPORT = Path("workspace/checkpoints/qwen3-30b-a3b/hf-import") + +if not HF_IMPORT.exists() or not any(HF_IMPORT.glob("*.safetensors")): + print("Downloading Qwen3-30B-A3B HuggingFace checkpoint") + snapshot_download(repo_id="Qwen/Qwen3-30B-A3B", local_dir=str(HF_IMPORT)) +else: + print(f"HuggingFace checkpoint already exists: {HF_IMPORT}") + +# Step 4: Convert to DCP format + +TORCH_DCP = Path("workspace/checkpoints/qwen3-30b-a3b/torch-dcp/step-00000000") + +if not TORCH_DCP.exists() or not any(TORCH_DCP.iterdir()): + print("Converting HuggingFace checkpoint to DCP format") + cfg = ConvertCheckpointCfg() + cfg.operation = "hf2dcp" + cfg.load_path = HF_IMPORT + cfg.save_path = TORCH_DCP + step4_launch(cfg) +else: + print(f"DCP checkpoint already exists: {TORCH_DCP}") + +print("Setup complete for Qwen3-30B-A3B correctness validation.") diff --git a/.claude/skills/correctness-validation/scripts/validate_deepseek_v2_lite.py b/.claude/skills/correctness-validation/scripts/validate_deepseek_v2_lite.py new file mode 100644 index 0000000..c8a2047 --- /dev/null +++ b/.claude/skills/correctness-validation/scripts/validate_deepseek_v2_lite.py @@ -0,0 +1,41 @@ +""" +Run 15 training steps with DeepSeek-V2-Lite for correctness validation. + +Uses a constant tiny learning rate (1e-6) and loads from a released HuggingFace +checkpoint converted to DCP. The goal is to verify that loss computation and +gradient flow are correct, not to actually train. 
+ +Launch with: + bash .claude/skills/correctness-validation/scripts/launch_validate.sh deepseek-v2-lite +""" + +from pathlib import Path + +from pithtrain.tasks.pretrain_language_model import PretrainLanguageModelCfg, launch + +cfg = PretrainLanguageModelCfg() + +distributed = cfg.distributed +distributed.context_parallel_size = 1 +distributed.pipeline_parallel_size = 2 +distributed.expert_parallel_size = 2 + +training = cfg.training +training.model = Path("examples/pretrain_language_model/deepseek-v2-lite/config.json") +training.optimizer = "Adam" +training.scheduler = "Constant" +training.max_lr = 1e-6 +training.min_lr = 1e-6 +training.warmup_steps = 0 +training.max_steps = 15 +training.micro_batch_size = 1 +training.global_batch_size = 512 +training.sequence_length = 2048 +training.dataset = Path("workspace/datasets/dclm-baseline/toktxt/deepseek-v2") +training.moe_load_balance_type = "sequence" +training.moe_load_balance_coef = 3e-3 +training.fp8_training = "disabled" +training.save_location = Path("workspace/checkpoints/deepseek-v2-lite") + +if __name__ == "__main__": + launch(cfg) diff --git a/.claude/skills/correctness-validation/scripts/validate_qwen3_30b_a3b.py b/.claude/skills/correctness-validation/scripts/validate_qwen3_30b_a3b.py new file mode 100644 index 0000000..f2b08c7 --- /dev/null +++ b/.claude/skills/correctness-validation/scripts/validate_qwen3_30b_a3b.py @@ -0,0 +1,41 @@ +""" +Run 15 training steps with Qwen3-30B-A3B for correctness validation. + +Uses a constant tiny learning rate (1e-6) and loads from a released HuggingFace +checkpoint converted to DCP. The goal is to verify that loss computation and +gradient flow are correct, not to actually train. 
+ +Launch with: + bash .claude/skills/correctness-validation/scripts/launch_validate.sh qwen3-30b-a3b +""" + +from pathlib import Path + +from pithtrain.tasks.pretrain_language_model import PretrainLanguageModelCfg, launch + +cfg = PretrainLanguageModelCfg() + +distributed = cfg.distributed +distributed.context_parallel_size = 1 +distributed.pipeline_parallel_size = 2 +distributed.expert_parallel_size = 8 + +training = cfg.training +training.model = Path("examples/pretrain_language_model/qwen3-30b-a3b/config.json") +training.optimizer = "Adam" +training.scheduler = "Constant" +training.max_lr = 1e-6 +training.min_lr = 1e-6 +training.warmup_steps = 0 +training.max_steps = 15 +training.micro_batch_size = 1 +training.global_batch_size = 512 +training.sequence_length = 2048 +training.dataset = Path("workspace/datasets/dclm-baseline/toktxt/qwen3") +training.moe_load_balance_type = "global-batch" +training.moe_load_balance_coef = 1e-3 +training.fp8_training = "disabled" +training.save_location = Path("workspace/checkpoints/qwen3-30b-a3b") + +if __name__ == "__main__": + launch(cfg) diff --git a/.gitignore b/.gitignore index 842f40c..7c1fcd1 100644 --- a/.gitignore +++ b/.gitignore @@ -22,7 +22,6 @@ uv.lock /venv_cmd.sh # IDEs and editors -.claude/ .vscode/ .idea/ *.swp