From 785edd46e1ac8084c2fcd064347dc7030e8e2056 Mon Sep 17 00:00:00 2001 From: haok1402 Date: Sat, 4 Apr 2026 14:55:01 -0400 Subject: [PATCH 1/2] Revise the interface to skip checkpoint saving if save_interval left unspecified to support short validation runs --- pithtrain/modules/training.py | 14 ++++++++++---- pithtrain/tasks/pretrain_language_model.py | 16 +++++++++++----- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/pithtrain/modules/training.py b/pithtrain/modules/training.py index 00633b1..0df65e6 100644 --- a/pithtrain/modules/training.py +++ b/pithtrain/modules/training.py @@ -95,14 +95,20 @@ class TrainingCfg(SlottedDefault): (e.g. ``"examples/.../qwen3_30b_a3b_config.json"``). """ - save_interval: int + save_interval: Optional[int] = None """ - The interval (in steps) at which to save model checkpoints. + The interval (in steps) at which to save checkpoints. When None, + checkpoint saving is disabled but loading still occurs from + ``save_location`` (if set). This is useful for validation runs + that need to load a pretrained checkpoint without writing new ones. """ - save_location: Path + save_location: Optional[Path] = None """ - The directory where model checkpoints will be saved. + The directory for checkpoint storage. Checkpoints are loaded from + and saved to ``/torch-dcp/step-XXXXXXXX``. When + None, both loading and saving are disabled and the model trains + from scratch. """ moe_load_balance_coef: float = 0.0 diff --git a/pithtrain/tasks/pretrain_language_model.py b/pithtrain/tasks/pretrain_language_model.py index d13b590..c740898 100644 --- a/pithtrain/tasks/pretrain_language_model.py +++ b/pithtrain/tasks/pretrain_language_model.py @@ -284,6 +284,7 @@ def save_checkpoint(cfg: PretrainLanguageModelCfg, ctx: PretrainLanguageModelCtx DTensors are kept as CPU DTensors and DCP saves each rank's shard. 
""" stdout = ctx.logging.stdout + assert cfg.training.save_location is not None save_location = Path(cfg.training.save_location, "torch-dcp", "step-%08d" % ctx.training.step) model = ctx.training.model optimizer = ctx.training.optimizer @@ -317,6 +318,9 @@ def load_checkpoint(cfg: PretrainLanguageModelCfg, ctx: PretrainLanguageModelCtx Load the checkpoint from the latest step. """ stdout = ctx.logging.stdout + if cfg.training.save_location is None: + stdout.info("No save_location set; training from scratch.") + return path2step = lambda p: int(p.stem.removeprefix("step-")) checkpoints = Path(cfg.training.save_location, "torch-dcp").glob("step-*") checkpoints = sorted(checkpoints, key=path2step) @@ -466,11 +470,13 @@ def train_step(cfg: PretrainLanguageModelCfg, ctx: PretrainLanguageModelCtx) -> # We should save the checkpoint if any of the following conditions is true: # 1. The current step is a multiple of save_interval. # 2. The current step is the last step (max_steps). - should_save = False - should_save |= ctx.training.step % cfg.training.save_interval == 0 - should_save |= ctx.training.step == cfg.training.max_steps - if should_save: - save_checkpoint(cfg, ctx) + # Skip entirely if save_interval is None. + if cfg.training.save_interval is not None: + should_save = False + should_save |= ctx.training.step % cfg.training.save_interval == 0 + should_save |= ctx.training.step == cfg.training.max_steps + if should_save: + save_checkpoint(cfg, ctx) # Run deferred GC here so cyclic collection never fires mid-forward/backward. 
gc.collect() From 6fe932eaedd879df9beafe8c55eeae17e9815eb9 Mon Sep 17 00:00:00 2001 From: haok1402 Date: Sat, 4 Apr 2026 15:18:11 -0400 Subject: [PATCH 2/2] Add the claude skills for correctness validation --- .../skills/correctness-validation/SKILL.md | 206 ++++++++++++++++ .../correctness-validation/scripts/compare.py | 224 ++++++++++++++++++ .../scripts/launch_setup.sh | 39 +++ .../scripts/launch_validate.sh | 52 ++++ .../scripts/setup_deepseek_v2_lite.py | 73 ++++++ .../scripts/setup_qwen3_30b_a3b.py | 73 ++++++ .../scripts/validate_deepseek_v2_lite.py | 41 ++++ .../scripts/validate_qwen3_30b_a3b.py | 41 ++++ .gitignore | 1 - 9 files changed, 749 insertions(+), 1 deletion(-) create mode 100644 .claude/skills/correctness-validation/SKILL.md create mode 100644 .claude/skills/correctness-validation/scripts/compare.py create mode 100755 .claude/skills/correctness-validation/scripts/launch_setup.sh create mode 100755 .claude/skills/correctness-validation/scripts/launch_validate.sh create mode 100644 .claude/skills/correctness-validation/scripts/setup_deepseek_v2_lite.py create mode 100644 .claude/skills/correctness-validation/scripts/setup_qwen3_30b_a3b.py create mode 100644 .claude/skills/correctness-validation/scripts/validate_deepseek_v2_lite.py create mode 100644 .claude/skills/correctness-validation/scripts/validate_qwen3_30b_a3b.py diff --git a/.claude/skills/correctness-validation/SKILL.md b/.claude/skills/correctness-validation/SKILL.md new file mode 100644 index 0000000..a6f0856 --- /dev/null +++ b/.claude/skills/correctness-validation/SKILL.md @@ -0,0 +1,206 @@ +--- +name: correctness-validation +description: Validates that code changes do not break training correctness by comparing loss curves between a base branch and the current feature branch. Use when user asks to "validate correctness", "check if changes break training", "compare loss curves", "run a regression test", or "verify my changes are correct". 
Also use when a feature branch modifies model code, operators, pipeline logic, or distributed training modules. +--- + +# Correctness Validation + +Validates training correctness by running a short 15-step training run on both a base branch and the current feature branch, then comparing three metrics step-by-step: cross-entropy loss, load-balance loss, and gradient norm. + +## Overview + +The validation has two phases: + +1. **Shared setup** (run once, reused across branches): download a minimal DCLM corpus shard, tokenize it, download and convert the HuggingFace checkpoint to DCP format. +2. **Branch comparison**: run 15 training steps on the base branch (via git worktree) and the feature branch, then compare the stdout logs. + +Shared setup artifacts live in `workspace/` and are deterministic given the same seed and released checkpoint, so they are safe to share between branches. + +## Prerequisites + +- **Python environment**: Use the `.venv` in the original repo root (not the worktree). Activate it before running any scripts: `source $REPO_ROOT/.venv/bin/activate`. If `.venv` does not exist, create it following the README instructions (`uv venv && uv sync`). +- **Hardware**: Minimum **4x B200 GPUs** (PP=2, EP=2 with DeepSeek-V2-Lite). + +Note: both `.venv` and `workspace/` live in the original repo root. The worktree gets both via symlink (see Step 4). + +## Supported Models + +Each model has a validation script and a setup script under `scripts/`: + +| Model | Setup Script | Validation Script | GPUs | +|---|---|---|---| +| DeepSeek-V2-Lite | `setup_deepseek_v2_lite.py` | `validate_deepseek_v2_lite.py` | 4 (PP=2, EP=2) | +| Qwen3-30B-A3B | `setup_qwen3_30b_a3b.py` | `validate_qwen3_30b_a3b.py` | 16 (PP=2, EP=8) | + +## Step-by-Step Workflow + +### Step 1: Determine Impact and Select Models + +Analyze the code change to decide which models need validation. The goal is to run validation on **every model whose behavior could be affected**. 
+
+**How to analyze impact:**
+
+1. Get the list of changed files:
+   ```bash
+   git diff --name-only
+   ```
+
+2. **If changes are under a model-specific directory** (e.g., `pithtrain/models/deepseek_v2_lite/` or `pithtrain/models/qwen3_moe/`), only that model is affected.
+
+3. **If changes are in shared code** (e.g., `pithtrain/operators/`, `pithtrain/layers/`, `pithtrain/dualpipe/`, `pithtrain/modules/`, `pithtrain/tasks/`), read the changed code and determine whether it touches a feature that is model-specific or universal:
+   - Read each model's `config.json` at `examples/pretrain_language_model/<model>/config.json` to understand what features that model uses (attention type, shared experts, expert count, RoPE variant, etc.)
+   - Read the changed code to understand what architectural features it touches
+   - A model is affected if it uses any feature touched by the change
+
+4. **If unsure whether a model is affected, include it.** Over-validating is better than missing a regression.
+
+### Step 2: Detect Environment
+
+Check if running under SLURM by testing for `SLURM_JOB_ID`:
+
+```bash
+if [ -n "${SLURM_JOB_ID:-}" ]; then
+  echo "SLURM detected (job $SLURM_JOB_ID) — will use srun for multi-node launch"
+else
+  echo "No SLURM — single-node launch"
+fi
+```
+
+This determines whether to prefix commands with `srun -W 0`. The workspace directory is **node-local storage**, so setup (data download, tokenization, checkpoint conversion) must run on **every node**.
+
+### Step 3: Shared Setup
+
+Run the setup launch script for each affected model. The setup scripts are idempotent — they skip steps whose output already exists.
+
+```bash
+# Single-node (replace <model> with deepseek-v2-lite or qwen3-30b-a3b)
+bash .claude/skills/correctness-validation/scripts/launch_setup.sh <model>
+
+# Multi-node (SLURM) — must run on every node since workspace is node-local
+srun -W 0 .claude/skills/correctness-validation/scripts/launch_setup.sh <model>
+```
+
+This downloads a single minimal DCLM shard (`global-shard_01_of_10/local-shard_0_of_10/shard_00000000_processed.jsonl.zst`), tokenizes it with the model's tokenizer, downloads the HuggingFace checkpoint, and converts it to DCP format.
+
+### Step 4: Create Git Worktree for Base Branch
+
+Create a worktree for the base branch. Symlink `workspace/` and `.venv` from the repo root so both branches share the same data and environment.
+
+```bash
+BASE_BRANCH=main # or the branch this feature was based on
+WORKTREE=$(mktemp -d)
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+git worktree add $WORKTREE $BASE_BRANCH
+ln -sfn $REPO_ROOT/workspace $WORKTREE/workspace
+ln -sfn $REPO_ROOT/.venv $WORKTREE/.venv
+```
+
+### Step 5: Run Validation on Base Branch
+
+Run 15 training steps in the base worktree. Only run the model(s) selected in Step 1.
+
+```bash
+cd $WORKTREE
+
+# Single-node (replace <model> with deepseek-v2-lite or qwen3-30b-a3b)
+bash .claude/skills/correctness-validation/scripts/launch_validate.sh <model>
+
+# Multi-node (SLURM)
+srun -W 0 .claude/skills/correctness-validation/scripts/launch_validate.sh <model>
+```
+
+The launch script auto-detects SLURM environment variables (`SLURM_NNODES`, `SLURM_NODEID`, `SLURM_STEP_GPUS`, `SLURM_STEP_NODELIST`) to configure `torchrun` arguments. On single-node, it falls back to localhost defaults.
+
+Logs are written to `logging/correctness-validation/validate_<model>_node<nodeid>.log`.
+
+Return to the original repo directory after the run completes.
+
+### Step 6: Run Validation on Feature Branch
+
+Run the same 15 steps in the current (feature) working directory, for the same model(s).
+ +```bash +cd $REPO_ROOT + +# Single-node +bash .claude/skills/correctness-validation/scripts/launch_validate.sh + +# Multi-node (SLURM) +srun -W 0 .claude/skills/correctness-validation/scripts/launch_validate.sh +``` + +### Step 7: Compare Results + +Run the compare script for each model that was validated. Use the node-0 logs (rank 0 emits the metrics). Run `python3 .claude/skills/correctness-validation/scripts/compare.py --help` for full options. + +```bash +python3 .claude/skills/correctness-validation/scripts/compare.py \ + $WORKTREE/logging/correctness-validation/validate__node0.log \ + logging/correctness-validation/validate__node0.log +``` + +The compare script parses both logs, extracts per-step metrics, and reports pass/fail. It checks: + +- **cross-entropy-loss**: relative tolerance per step +- **load-balance-loss**: relative tolerance per step +- **gradient-norm**: relative tolerance per step + +Default tolerance is 1e-3 relative difference. Use `--tolerance` to adjust. + +Expected output on success: + +``` +PASS: All metrics within tolerance across all steps. +``` + +Expected output on failure: + +``` +FAIL: Metrics diverged beyond tolerance: + cross-entropy-loss: + step 003: cross-entropy-loss diverged — base=2.663700, feature=2.680100, rel_diff=6.16e-03 > tolerance=1e-03 +``` + +### Step 8: Clean Up + +```bash +git worktree remove $WORKTREE +``` + +## Log Format + +The training scripts emit lines like: + +``` +2026-04-02 12:32:40 | INFO | step 00000001/00000015 | step-time 110.990 sec | cross-entropy-loss 2.6637 | load-balance-loss 0.001234 | learning-rate 1.000000e-06 | gradient-norm 20.3210 | tokens-per-second 18,895 | peak-gpu-memory 47.20 GB +``` + +The compare script parses pipe-separated key-value pairs from lines containing `| INFO | step `. + +## Common Issues + +### Setup fails on HuggingFace download + +Ensure `HF_TOKEN` is set if the model is gated. DeepSeek-V2-Lite and Qwen3-30B-A3B are public models. 
+ +### OOM during validation + +DeepSeek-V2-Lite requires 4x B200 GPUs. Qwen3-30B-A3B requires 16x B200 GPUs. If OOM occurs, check that no other processes are using GPU memory. + +### Logs show no load-balance-loss + +The validation scripts set `moe_load_balance_coef > 0` to ensure this metric is logged. If it is missing, check that the validation script (not an example script) was used. + +### Tolerance too strict + +FP8 operations and flash attention can introduce small non-determinism. If validation fails with very small differences, try increasing tolerance: + +```bash +python3 .claude/skills/correctness-validation/scripts/compare.py \ + base.log feature.log --tolerance 4e-3 +``` + +### Worktree conflicts + +If the worktree was not cleaned up from a previous run, use `git worktree list` to find it and `git worktree remove --force` to remove it. diff --git a/.claude/skills/correctness-validation/scripts/compare.py b/.claude/skills/correctness-validation/scripts/compare.py new file mode 100644 index 0000000..aa726be --- /dev/null +++ b/.claude/skills/correctness-validation/scripts/compare.py @@ -0,0 +1,224 @@ +""" +Compare training logs from two branches for correctness validation. + +Parses stdout logs from PithTrain training runs, extracts per-step metrics +(cross-entropy-loss, load-balance-loss, gradient-norm), and reports whether +the two runs produce consistent results within a configurable tolerance. + +Usage: + python3 compare.py [--tolerance 1e-3] + +Exit code 0 = PASS, exit code 1 = FAIL. +""" + +import argparse +import re +import sys + +METRICS = ["cross-entropy-loss", "load-balance-loss", "gradient-norm"] +STEP_PATTERN = re.compile(r"step\s+(\d+)/(\d+)") + + +def parse_log(path): + """ + Parse a PithTrain training log and extract per-step metrics. + + Parameters + ---- + path : str + Path to the log file. + + Returns + ---- + steps : list[dict[str, float]] + List of dicts, one per step, with metric names as keys. 
+ """ + steps = [] + + with open(path) as f: + for line in f: + if "| INFO | step " not in line: + continue + + parts = line.split("|") + metrics = dict() + + for part in parts: + part = part.strip() + + m = STEP_PATTERN.match(part) + if m: + metrics["step"] = int(m.group(1)) + continue + + tokens = part.rsplit(None, 1) + if len(tokens) == 2: + key = tokens[0].strip() + val = tokens[1].strip().replace(",", "") + try: + metrics[key] = float(val) + except ValueError: + pass + + if "step" in metrics: + steps.append(metrics) + + return steps + + +def compare_metric(base_steps, feature_steps, metric, tolerance): + """ + Compare a single metric across steps. + + Parameters + ---- + base_steps : list[dict] + Parsed steps from the base branch log. + feature_steps : list[dict] + Parsed steps from the feature branch log. + metric : str + Name of the metric to compare. + tolerance : float + Maximum allowed relative difference. + + Returns + ---- + failures : list[str] + List of failure messages. Empty if all steps pass. + """ + failures = [] + + for base, feature in zip(base_steps, feature_steps): + step = base["step"] + base_val = base.get(metric) + feature_val = feature.get(metric) + + if base_val is None and feature_val is None: + continue + + if base_val is None or feature_val is None: + failures.append( + f" step {step:03d}: {metric} present in one log but not the other " + f"(base={base_val}, feature={feature_val})" + ) + continue + + if base_val == 0 and feature_val == 0: + continue + + denom = abs(base_val) if base_val != 0 else abs(feature_val) + rel_diff = abs(base_val - feature_val) / denom + + if rel_diff > tolerance: + failures.append( + f" step {step:03d}: {metric} diverged — " + f"base={base_val:.6f}, feature={feature_val:.6f}, " + f"rel_diff={rel_diff:.2e} > tolerance={tolerance:.0e}" + ) + + return failures + + +def print_comparison_table(base_steps, feature_steps): + """ + Print a step-by-step comparison table to stdout. 
+ + Parameters + ---- + base_steps : list[dict] + Parsed steps from the base branch log. + feature_steps : list[dict] + Parsed steps from the feature branch log. + """ + print("Step-by-step comparison:") + print("-" * 100) + + header = f"{'step':>5}" + for metric in METRICS: + header += f" | {'base ' + metric:>28} {'feature':>12} {'rel_diff':>10}" + print(header) + print("-" * 100) + + for base, feature in zip(base_steps, feature_steps): + step = base["step"] + row = f"{step:5d}" + for metric in METRICS: + bv = base.get(metric) + fv = feature.get(metric) + if bv is not None and fv is not None: + denom = abs(bv) if bv != 0 else (abs(fv) if fv != 0 else 1.0) + rd = abs(bv - fv) / denom + row += f" | {bv:28.6f} {fv:12.6f} {rd:10.2e}" + elif bv is not None: + row += f" | {bv:28.6f} {'N/A':>12} {'N/A':>10}" + elif fv is not None: + row += f" | {'N/A':>28} {fv:12.6f} {'N/A':>10}" + else: + row += f" | {'N/A':>28} {'N/A':>12} {'N/A':>10}" + print(row) + + print("-" * 100) + print() + + +def main(): + """ + Entry point. Parse arguments, compare logs, and report pass/fail. 
+ """ + parser = argparse.ArgumentParser(description="Compare PithTrain training logs.") + parser.add_argument("base_log", help="Path to base branch log file") + parser.add_argument("feature_log", help="Path to feature branch log file") + parser.add_argument( + "--tolerance", + type=float, + default=1e-3, + help="Relative tolerance for metric comparison (default: 1e-3)", + ) + args = parser.parse_args() + + base_steps = parse_log(args.base_log) + feature_steps = parse_log(args.feature_log) + + print(f"Base log: {args.base_log} ({len(base_steps)} steps)") + print(f"Feature log: {args.feature_log} ({len(feature_steps)} steps)") + print(f"Tolerance: {args.tolerance:.0e}") + print() + + if len(base_steps) == 0: + print("FAIL: No training steps found in base log.") + sys.exit(1) + + if len(feature_steps) == 0: + print("FAIL: No training steps found in feature log.") + sys.exit(1) + + if len(base_steps) != len(feature_steps): + print( + f"WARNING: Step count mismatch — base has {len(base_steps)}, " + f"feature has {len(feature_steps)}. Comparing first " + f"{min(len(base_steps), len(feature_steps))} steps." 
+ ) + print() + + all_failures = dict() + for metric in METRICS: + failures = compare_metric(base_steps, feature_steps, metric, args.tolerance) + if failures: + all_failures[metric] = failures + + print_comparison_table(base_steps, feature_steps) + + if not all_failures: + print("PASS: All metrics within tolerance across all steps.") + sys.exit(0) + else: + print("FAIL: Metrics diverged beyond tolerance:") + for metric, failures in all_failures.items(): + print(f"\n {metric}:") + for f in failures: + print(f) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/correctness-validation/scripts/launch_setup.sh b/.claude/skills/correctness-validation/scripts/launch_setup.sh new file mode 100755 index 0000000..bf4641f --- /dev/null +++ b/.claude/skills/correctness-validation/scripts/launch_setup.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# Download data, tokenize corpus, and convert checkpoint for correctness validation. +# +# Single-node usage: +# bash .claude/skills/correctness-validation/scripts/launch_setup.sh deepseek-v2-lite +# bash .claude/skills/correctness-validation/scripts/launch_setup.sh qwen3-30b-a3b +# +# Multi-node usage (SLURM) — run on every node since workspace is node-local: +# srun -W 0 .claude/skills/correctness-validation/scripts/launch_setup.sh deepseek-v2-lite + +set -euo pipefail +export PYTHONUNBUFFERED=1 + +if [ $# -ne 1 ]; then + echo "Usage: launch_setup.sh " >&2 + echo " Models: deepseek-v2-lite, qwen3-30b-a3b" >&2 + exit 1 +fi + +MODEL=$1 + +case $MODEL in + deepseek-v2-lite) + SCRIPT=.claude/skills/correctness-validation/scripts/setup_deepseek_v2_lite.py + ;; + qwen3-30b-a3b) + SCRIPT=.claude/skills/correctness-validation/scripts/setup_qwen3_30b_a3b.py + ;; + *) + echo "Unknown model: $MODEL" >&2 + echo " Models: deepseek-v2-lite, qwen3-30b-a3b" >&2 + exit 1 + ;; +esac + +OUTPUT=logging/correctness-validation/setup_${MODEL}_node${SLURM_NODEID:-0}.log + +mkdir -p $(dirname $OUTPUT) && exec > >(tee $OUTPUT) 2>&1 
+python3 $SCRIPT
diff --git a/.claude/skills/correctness-validation/scripts/launch_validate.sh b/.claude/skills/correctness-validation/scripts/launch_validate.sh
new file mode 100755
index 0000000..84221ad
--- /dev/null
+++ b/.claude/skills/correctness-validation/scripts/launch_validate.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+# Run 15-step correctness validation training.
+#
+# Single-node usage:
+# bash .claude/skills/correctness-validation/scripts/launch_validate.sh deepseek-v2-lite
+# bash .claude/skills/correctness-validation/scripts/launch_validate.sh qwen3-30b-a3b
+#
+# Multi-node usage (SLURM):
+# srun -W 0 .claude/skills/correctness-validation/scripts/launch_validate.sh deepseek-v2-lite
+
+set -euo pipefail
+export OMP_NUM_THREADS=8
+export PYTHONUNBUFFERED=1
+
+if [ $# -ne 1 ]; then
+  echo "Usage: launch_validate.sh <model>" >&2
+  echo " Models: deepseek-v2-lite, qwen3-30b-a3b" >&2
+  exit 1
+fi
+
+MODEL=$1
+
+case $MODEL in
+  deepseek-v2-lite)
+    SCRIPT=.claude/skills/correctness-validation/scripts/validate_deepseek_v2_lite.py
+    ;;
+  qwen3-30b-a3b)
+    SCRIPT=.claude/skills/correctness-validation/scripts/validate_qwen3_30b_a3b.py
+    ;;
+  *)
+    echo "Unknown model: $MODEL" >&2
+    echo " Models: deepseek-v2-lite, qwen3-30b-a3b" >&2
+    exit 1
+    ;;
+esac
+
+# Setup distributed — auto-detect SLURM or fall back to single-node.
+SLURM_NNODES=${SLURM_NNODES:-1} +SLURM_NODEID=${SLURM_NODEID:-0} +SLURM_STEP_GPUS=${SLURM_STEP_GPUS:-${CUDA_VISIBLE_DEVICES:-$(nvidia-smi --query-gpu=index --format=csv,noheader | paste -sd,)}} +SLURM_STEP_NODELIST=${SLURM_STEP_NODELIST:-$(hostname)} + +LAUNCH_ARGS=() +LAUNCH_ARGS+=(--nnodes=$SLURM_NNODES --node-rank=$SLURM_NODEID) +LAUNCH_ARGS+=(--nproc-per-node=$(echo "$SLURM_STEP_GPUS" | tr ',' '\n' | wc -l)) +LAUNCH_ARGS+=(--rdzv-backend=c10d) +LAUNCH_ARGS+=(--rdzv-endpoint=$(command -v scontrol &>/dev/null && scontrol show hostnames $SLURM_STEP_NODELIST | head -n 1 || echo localhost):15213) + +OUTPUT=logging/correctness-validation/validate_${MODEL}_node${SLURM_NODEID:-0}.log + +mkdir -p $(dirname $OUTPUT) && exec > >(tee $OUTPUT) 2>&1 +torchrun ${LAUNCH_ARGS[@]} $SCRIPT diff --git a/.claude/skills/correctness-validation/scripts/setup_deepseek_v2_lite.py b/.claude/skills/correctness-validation/scripts/setup_deepseek_v2_lite.py new file mode 100644 index 0000000..f110b6a --- /dev/null +++ b/.claude/skills/correctness-validation/scripts/setup_deepseek_v2_lite.py @@ -0,0 +1,73 @@ +""" +Setup shared data for DeepSeek-V2-Lite correctness validation. + +Downloads a minimal DCLM corpus shard, tokenizes it with the DeepSeek-V2 tokenizer, +and converts the HuggingFace checkpoint to DCP format. + +Idempotent: skips steps whose output already exists. 
+""" + +from pathlib import Path + +from huggingface_hub import snapshot_download + +from pithtrain.tasks.build_tokenized_corpus import BuildTokenizedCorpusCfg +from pithtrain.tasks.build_tokenized_corpus import launch as step2_launch +from pithtrain.tasks.convert_checkpoint import ConvertCheckpointCfg +from pithtrain.tasks.convert_checkpoint import launch as step4_launch + +# Step 1: Download minimal DCLM shard + +RAWTXT = Path("workspace/datasets/dclm-baseline/rawtxt") +SHARD = "global-shard_01_of_10/local-shard_0_of_10/shard_00000000_processed.jsonl.zst" + +if not (Path(RAWTXT, SHARD)).exists(): + print(f"Downloading DCLM shard: {SHARD}") + snapshot_download( + "mlfoundations/dclm-baseline-1.0", + repo_type="dataset", + local_dir=str(RAWTXT), + allow_patterns=SHARD, + ) +else: + print(f"DCLM shard already exists: {Path(RAWTXT, SHARD)}") + +# Step 2: Tokenize with DeepSeek-V2 tokenizer + +TOKTXT = Path("workspace/datasets/dclm-baseline/toktxt/deepseek-v2") + +if not TOKTXT.exists() or not any(TOKTXT.glob("*.bin")): + print("Tokenizing corpus with DeepSeek-V2 tokenizer") + cfg = BuildTokenizedCorpusCfg() + cfg.tokenizer_name = "deepseek-ai/DeepSeek-V2-Lite" + cfg.source_path = RAWTXT + cfg.output_path = TOKTXT + step2_launch(cfg) +else: + print(f"Tokenized corpus already exists: {TOKTXT}") + +# Step 3: Download HuggingFace checkpoint + +HF_IMPORT = Path("workspace/checkpoints/deepseek-v2-lite/hf-import") + +if not HF_IMPORT.exists() or not any(HF_IMPORT.glob("*.safetensors")): + print("Downloading DeepSeek-V2-Lite HuggingFace checkpoint") + snapshot_download(repo_id="deepseek-ai/DeepSeek-V2-Lite", local_dir=str(HF_IMPORT)) +else: + print(f"HuggingFace checkpoint already exists: {HF_IMPORT}") + +# Step 4: Convert to DCP format + +TORCH_DCP = Path("workspace/checkpoints/deepseek-v2-lite/torch-dcp/step-00000000") + +if not TORCH_DCP.exists() or not any(TORCH_DCP.iterdir()): + print("Converting HuggingFace checkpoint to DCP format") + cfg = ConvertCheckpointCfg() + 
cfg.operation = "hf2dcp" + cfg.load_path = HF_IMPORT + cfg.save_path = TORCH_DCP + step4_launch(cfg) +else: + print(f"DCP checkpoint already exists: {TORCH_DCP}") + +print("Setup complete for DeepSeek-V2-Lite correctness validation.") diff --git a/.claude/skills/correctness-validation/scripts/setup_qwen3_30b_a3b.py b/.claude/skills/correctness-validation/scripts/setup_qwen3_30b_a3b.py new file mode 100644 index 0000000..4dfe2b6 --- /dev/null +++ b/.claude/skills/correctness-validation/scripts/setup_qwen3_30b_a3b.py @@ -0,0 +1,73 @@ +""" +Setup shared data for Qwen3-30B-A3B correctness validation. + +Downloads a minimal DCLM corpus shard, tokenizes it with the Qwen3 tokenizer, +and converts the HuggingFace checkpoint to DCP format. + +Idempotent: skips steps whose output already exists. +""" + +from pathlib import Path + +from huggingface_hub import snapshot_download + +from pithtrain.tasks.build_tokenized_corpus import BuildTokenizedCorpusCfg +from pithtrain.tasks.build_tokenized_corpus import launch as step2_launch +from pithtrain.tasks.convert_checkpoint import ConvertCheckpointCfg +from pithtrain.tasks.convert_checkpoint import launch as step4_launch + +# Step 1: Download minimal DCLM shard + +RAWTXT = Path("workspace/datasets/dclm-baseline/rawtxt") +SHARD = "global-shard_01_of_10/local-shard_0_of_10/shard_00000000_processed.jsonl.zst" + +if not (Path(RAWTXT, SHARD)).exists(): + print(f"Downloading DCLM shard: {SHARD}") + snapshot_download( + "mlfoundations/dclm-baseline-1.0", + repo_type="dataset", + local_dir=str(RAWTXT), + allow_patterns=SHARD, + ) +else: + print(f"DCLM shard already exists: {Path(RAWTXT, SHARD)}") + +# Step 2: Tokenize with Qwen3 tokenizer + +TOKTXT = Path("workspace/datasets/dclm-baseline/toktxt/qwen3") + +if not TOKTXT.exists() or not any(TOKTXT.glob("*.bin")): + print("Tokenizing corpus with Qwen3 tokenizer") + cfg = BuildTokenizedCorpusCfg() + cfg.tokenizer_name = "Qwen/Qwen3-30B-A3B" + cfg.source_path = RAWTXT + cfg.output_path = TOKTXT 
+ step2_launch(cfg) +else: + print(f"Tokenized corpus already exists: {TOKTXT}") + +# Step 3: Download HuggingFace checkpoint + +HF_IMPORT = Path("workspace/checkpoints/qwen3-30b-a3b/hf-import") + +if not HF_IMPORT.exists() or not any(HF_IMPORT.glob("*.safetensors")): + print("Downloading Qwen3-30B-A3B HuggingFace checkpoint") + snapshot_download(repo_id="Qwen/Qwen3-30B-A3B", local_dir=str(HF_IMPORT)) +else: + print(f"HuggingFace checkpoint already exists: {HF_IMPORT}") + +# Step 4: Convert to DCP format + +TORCH_DCP = Path("workspace/checkpoints/qwen3-30b-a3b/torch-dcp/step-00000000") + +if not TORCH_DCP.exists() or not any(TORCH_DCP.iterdir()): + print("Converting HuggingFace checkpoint to DCP format") + cfg = ConvertCheckpointCfg() + cfg.operation = "hf2dcp" + cfg.load_path = HF_IMPORT + cfg.save_path = TORCH_DCP + step4_launch(cfg) +else: + print(f"DCP checkpoint already exists: {TORCH_DCP}") + +print("Setup complete for Qwen3-30B-A3B correctness validation.") diff --git a/.claude/skills/correctness-validation/scripts/validate_deepseek_v2_lite.py b/.claude/skills/correctness-validation/scripts/validate_deepseek_v2_lite.py new file mode 100644 index 0000000..c8a2047 --- /dev/null +++ b/.claude/skills/correctness-validation/scripts/validate_deepseek_v2_lite.py @@ -0,0 +1,41 @@ +""" +Run 15 training steps with DeepSeek-V2-Lite for correctness validation. + +Uses a constant tiny learning rate (1e-6) and loads from a released HuggingFace +checkpoint converted to DCP. The goal is to verify that loss computation and +gradient flow are correct, not to actually train. 
+ +Launch with: + bash .claude/skills/correctness-validation/scripts/launch_validate.sh deepseek-v2-lite +""" + +from pathlib import Path + +from pithtrain.tasks.pretrain_language_model import PretrainLanguageModelCfg, launch + +cfg = PretrainLanguageModelCfg() + +distributed = cfg.distributed +distributed.context_parallel_size = 1 +distributed.pipeline_parallel_size = 2 +distributed.expert_parallel_size = 2 + +training = cfg.training +training.model = Path("examples/pretrain_language_model/deepseek-v2-lite/config.json") +training.optimizer = "Adam" +training.scheduler = "Constant" +training.max_lr = 1e-6 +training.min_lr = 1e-6 +training.warmup_steps = 0 +training.max_steps = 15 +training.micro_batch_size = 1 +training.global_batch_size = 512 +training.sequence_length = 2048 +training.dataset = Path("workspace/datasets/dclm-baseline/toktxt/deepseek-v2") +training.moe_load_balance_type = "sequence" +training.moe_load_balance_coef = 3e-3 +training.fp8_training = "disabled" +training.save_location = Path("workspace/checkpoints/deepseek-v2-lite") + +if __name__ == "__main__": + launch(cfg) diff --git a/.claude/skills/correctness-validation/scripts/validate_qwen3_30b_a3b.py b/.claude/skills/correctness-validation/scripts/validate_qwen3_30b_a3b.py new file mode 100644 index 0000000..f2b08c7 --- /dev/null +++ b/.claude/skills/correctness-validation/scripts/validate_qwen3_30b_a3b.py @@ -0,0 +1,41 @@ +""" +Run 15 training steps with Qwen3-30B-A3B for correctness validation. + +Uses a constant tiny learning rate (1e-6) and loads from a released HuggingFace +checkpoint converted to DCP. The goal is to verify that loss computation and +gradient flow are correct, not to actually train. 
+ +Launch with: + bash .claude/skills/correctness-validation/scripts/launch_validate.sh qwen3-30b-a3b +""" + +from pathlib import Path + +from pithtrain.tasks.pretrain_language_model import PretrainLanguageModelCfg, launch + +cfg = PretrainLanguageModelCfg() + +distributed = cfg.distributed +distributed.context_parallel_size = 1 +distributed.pipeline_parallel_size = 2 +distributed.expert_parallel_size = 8 + +training = cfg.training +training.model = Path("examples/pretrain_language_model/qwen3-30b-a3b/config.json") +training.optimizer = "Adam" +training.scheduler = "Constant" +training.max_lr = 1e-6 +training.min_lr = 1e-6 +training.warmup_steps = 0 +training.max_steps = 15 +training.micro_batch_size = 1 +training.global_batch_size = 512 +training.sequence_length = 2048 +training.dataset = Path("workspace/datasets/dclm-baseline/toktxt/qwen3") +training.moe_load_balance_type = "global-batch" +training.moe_load_balance_coef = 1e-3 +training.fp8_training = "disabled" +training.save_location = Path("workspace/checkpoints/qwen3-30b-a3b") + +if __name__ == "__main__": + launch(cfg) diff --git a/.gitignore b/.gitignore index 842f40c..7c1fcd1 100644 --- a/.gitignore +++ b/.gitignore @@ -22,7 +22,6 @@ uv.lock /venv_cmd.sh # IDEs and editors -.claude/ .vscode/ .idea/ *.swp