From 2207c92236e081fdf49c21d71e85acf44de38707 Mon Sep 17 00:00:00 2001 From: Jayce-Ping <315229706@qq.com> Date: Sun, 17 May 2026 08:52:05 +0800 Subject: [PATCH 1/3] [adapter,hparams] feat: support resuming from Hugging Face checkpoint paths Extend `BaseAdapter.load_checkpoint` to transparently resolve a Hugging Face repo spec (either `hf://owner/repo[/subdir][@rev]` or bare `owner/repo[/...]`) to a local cache directory via `huggingface_hub.snapshot_download`, reusing the existing `lora` / `full` / `state` loading branches unchanged. Logic priority at `_resolve_checkpoint_path`: 1. `hf://` prefix -> force HF (overrides any colliding local dir) 2. Local path exists -> return as-is 3. Otherwise parse as `owner/repo[/subfolder][@revision]` and download Multi-node-safe: download gated on `is_local_main_process` (one per node), not `is_main_process` (one global). On non-shared filesystems each node populates its own HF cache once; on shared filesystems huggingface_hub's per-blob `WeakFileLock` dedupes the concurrent `snapshot_download` calls so only one node transfers bytes. Fail-fast: narrow `except (RepositoryNotFoundError, HfHubHTTPError)` re-raised as `FileNotFoundError` with full context (path, repo_id, subfolder, revision, HF_TOKEN hint) for actionable error messages. Also updates the `resume_path` help text and normalizes the inline comment on all 59 example YAMLs to document the new HF support. No new config fields; `resume_path: Optional[str]` keeps the same shape. Note: pre-existing black/isort lint debt in the touched src files is unrelated to this change. Co-authored-by: Cursor --- examples/awm/lora/flux1/default.yaml | 2 +- .../awm/lora/flux2_klein_base/default.yaml | 2 +- examples/awm/lora/sd3_5/default.yaml | 2 +- examples/crd/lora/sd3_5/default.yaml | 2 +- examples/dgpo/lora/sd3_5/default.yaml | 2 +- examples/dgpo/lora/sd3_5/nocfg.yaml | 2 +- examples/dpo/lora/sd3_5/default.yaml | 2 +- examples/grpo/full/flux1/default.yaml | 2 +- examples/grpo/full/flux1_kontext/default.yaml | 2 +- examples/grpo/full/flux2/i2i.yaml | 2 +- examples/grpo/full/flux2/t2i.yaml | 2 +- examples/grpo/full/flux2_klein/default.yaml | 2 +- .../grpo/full/flux2_klein_base/default.yaml | 2 +- examples/grpo/full/qwen_image/default.yaml | 2 +- .../full/qwen_image_edit_plus/default.yaml | 2 +- examples/grpo/full/sd3_5/default.yaml | 2 +- examples/grpo/full/wan21/i2v.yaml | 2 +- examples/grpo/full/wan21/t2v.yaml | 2 +- examples/grpo/full/wan22/i2v.yaml | 2 +- examples/grpo/full/wan22/t2v.yaml | 2 +- examples/grpo/full/z_image/default.yaml | 2 +- examples/grpo/full/z_image_turbo/default.yaml | 2 +- examples/grpo/lora/flux1/default.yaml | 2 +- examples/grpo/lora/flux1_kontext/default.yaml | 2 +- examples/grpo/lora/flux2/i2i.yaml | 2 +- examples/grpo/lora/flux2/t2i.yaml | 2 +- examples/grpo/lora/flux2_klein/default.yaml | 2 +- .../grpo/lora/flux2_klein_base/default.yaml | 2 +- examples/grpo/lora/ltx2/i2av.yaml | 2 +- examples/grpo/lora/ltx2/t2av.yaml | 2 +- examples/grpo/lora/ltx2/t2av_pickscore.yaml | 2 +- examples/grpo/lora/qwen_image/default.yaml | 2 +- .../lora/qwen_image_edit_plus/default.yaml | 2 +- examples/grpo/lora/sd3_5/default.yaml | 2 +- examples/grpo/lora/sd3_5/nocfg.yaml | 2 +- examples/grpo/lora/wan21/i2v.yaml | 2 +- examples/grpo/lora/wan21/t2v.yaml | 2 +- examples/grpo/lora/wan21/v2v.yaml | 2 +- examples/grpo/lora/wan22/i2v.yaml | 2 +- examples/grpo/lora/wan22/t2v.yaml | 2 +- examples/grpo/lora/z_image/default.yaml | 2 +- examples/grpo/lora/z_image_turbo/default.yaml | 2 +- examples/nft/full/flux1/default.yaml | 2 +- .../nft/full/flux2_klein_base/default.yaml | 2 +- examples/nft/full/wan22/t2v.yaml | 2 +- examples/nft/full/z_image/default.yaml | 2 +- examples/nft/full/z_image_turbo/default.yaml | 2 +- examples/nft/lora/flux1/default.yaml | 2 +- .../nft/lora/flux1/rational_rewards_t2i.yaml | 2 +- .../flux1_kontext/rational_rewards_edit.yaml | 2 +- .../nft/lora/flux2_klein_base/default.yaml | 2 +- .../lora/qwen_image/rational_rewards_t2i.yaml | 2 +- .../rational_rewards_edit.yaml | 2 +- examples/nft/lora/sd3_5/default.yaml | 2 +- examples/nft/lora/wan21/i2v.yaml | 2 +- examples/nft/lora/wan21/t2v.yaml | 2 +- examples/nft/lora/wan22/t2v.yaml | 2 +- examples/nft/lora/z_image/default.yaml | 2 +- examples/template/sd3_5/async_reward.yaml | 2 +- src/flow_factory/hparams/model_args.py | 10 +- src/flow_factory/models/abc.py | 65 ++++++++++- src/flow_factory/utils/checkpoint.py | 110 +++++++++++++++++- 62 files changed, 241 insertions(+), 62 deletions(-) diff --git a/examples/awm/lora/flux1/default.yaml b/examples/awm/lora/flux1/default.yaml index 075e8e03..01ba0de4 100644 --- a/examples/awm/lora/flux1/default.yaml +++ b/examples/awm/lora/flux1/default.yaml @@ -23,7 +23,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.1-dev" # HuggingFace model ID or local path model_type: "flux1" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' diff --git a/examples/awm/lora/flux2_klein_base/default.yaml b/examples/awm/lora/flux2_klein_base/default.yaml index 1986a2e4..f05d3855 100644 --- a/examples/awm/lora/flux2_klein_base/default.yaml +++ b/examples/awm/lora/flux2_klein_base/default.yaml @@ -23,7 +23,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.2-klein-base-4B" # Options: black-forest-labs/FLUX.2-klein-base-4B, black-forest-labs/FLUX.2-klein-base-9B model_type: "flux2-klein" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Attention backend for training. diff --git a/examples/awm/lora/sd3_5/default.yaml b/examples/awm/lora/sd3_5/default.yaml index bc32cf89..625cb99f 100644 --- a/examples/awm/lora/sd3_5/default.yaml +++ b/examples/awm/lora/sd3_5/default.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "stabilityai/stable-diffusion-3.5-medium" model_type: "sd3-5" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Attention backend for training. diff --git a/examples/crd/lora/sd3_5/default.yaml b/examples/crd/lora/sd3_5/default.yaml index cea6a766..2ef8aaed 100644 --- a/examples/crd/lora/sd3_5/default.yaml +++ b/examples/crd/lora/sd3_5/default.yaml @@ -25,7 +25,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "stabilityai/stable-diffusion-3.5-medium" model_type: "sd3-5" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Attention backend for training. diff --git a/examples/dgpo/lora/sd3_5/default.yaml b/examples/dgpo/lora/sd3_5/default.yaml index e7794146..6876e14a 100644 --- a/examples/dgpo/lora/sd3_5/default.yaml +++ b/examples/dgpo/lora/sd3_5/default.yaml @@ -48,7 +48,7 @@ model: target_modules: "default" model_name_or_path: "stabilityai/stable-diffusion-3.5-medium" # config.pretrained.model model_type: "sd3-5" - resume_path: null + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Training Configuration diff --git a/examples/dgpo/lora/sd3_5/nocfg.yaml b/examples/dgpo/lora/sd3_5/nocfg.yaml index b5519b93..23fd6295 100644 --- a/examples/dgpo/lora/sd3_5/nocfg.yaml +++ b/examples/dgpo/lora/sd3_5/nocfg.yaml @@ -39,7 +39,7 @@ model: target_modules: "default" model_name_or_path: "stabilityai/stable-diffusion-3.5-medium" # config.pretrained.model model_type: "sd3-5" - resume_path: null + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Training Configuration diff --git a/examples/dpo/lora/sd3_5/default.yaml b/examples/dpo/lora/sd3_5/default.yaml index 8d885f51..e4d48884 100644 --- a/examples/dpo/lora/sd3_5/default.yaml +++ b/examples/dpo/lora/sd3_5/default.yaml @@ -50,7 +50,7 @@ model: target_modules: "default" model_name_or_path: "stabilityai/stable-diffusion-3.5-medium" # Same as flow_grpo model_type: "sd3-5" - resume_path: null + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null log: diff --git a/examples/grpo/full/flux1/default.yaml b/examples/grpo/full/flux1/default.yaml index ab7b2514..4845e4f7 100644 --- a/examples/grpo/full/flux1/default.yaml +++ b/examples/grpo/full/flux1/default.yaml @@ -21,7 +21,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.1-dev" # HuggingFace model ID or local path model_type: "flux1" - resume_path: null # Directory contains previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/full/flux1_kontext/default.yaml b/examples/grpo/full/flux1_kontext/default.yaml index 8cefd8cf..99a57442 100644 --- a/examples/grpo/full/flux1_kontext/default.yaml +++ b/examples/grpo/full/flux1_kontext/default.yaml @@ -21,7 +21,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.1-Kontext-dev" # HuggingFace model ID or local path model_type: "flux1-kontext" - resume_path: null # Directory contains previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/full/flux2/i2i.yaml b/examples/grpo/full/flux2/i2i.yaml index 59d731a0..32ea5c04 100644 --- a/examples/grpo/full/flux2/i2i.yaml +++ b/examples/grpo/full/flux2/i2i.yaml @@ -24,7 +24,7 @@ model: target_modules: ["attn.to_q", "attn.to_k", "attn.to_v", "attn.to_out.0"] model_name_or_path: "black-forest-labs/FLUX.2-dev" # HuggingFace model ID or local path model_type: "flux2" # Options: flux1, flux1-kontext, flux2, qwenimage, qwenimage-edit - resume_path: null # Directory contains previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/full/flux2/t2i.yaml b/examples/grpo/full/flux2/t2i.yaml index 9d9691c0..b9af3835 100644 --- a/examples/grpo/full/flux2/t2i.yaml +++ b/examples/grpo/full/flux2/t2i.yaml @@ -24,7 +24,7 @@ model: target_modules: ["attn.to_q", "attn.to_k", "attn.to_v", "attn.to_out.0"] model_name_or_path: "black-forest-labs/FLUX.2-dev" # HuggingFace model ID or local path model_type: "flux2" # Options: flux1, flux1-kontext, flux2, qwenimage, qwenimage-edit - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/full/flux2_klein/default.yaml b/examples/grpo/full/flux2_klein/default.yaml index 2b834d11..fc515f68 100644 --- a/examples/grpo/full/flux2_klein/default.yaml +++ b/examples/grpo/full/flux2_klein/default.yaml @@ -21,7 +21,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.2-klein-4B" # Options: black-forest-labs/FLUX.2-klein-4B, black-forest-labs/FLUX.2-klein-9B model_type: "flux2-klein" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/full/flux2_klein_base/default.yaml b/examples/grpo/full/flux2_klein_base/default.yaml index 2cab32ce..af13dbeb 100644 --- a/examples/grpo/full/flux2_klein_base/default.yaml +++ b/examples/grpo/full/flux2_klein_base/default.yaml @@ -21,7 +21,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.2-klein-base-4B" # Options: black-forest-labs/FLUX.2-klein-base-4B, black-forest-labs/FLUX.2-klein-base-9B model_type: "flux2-klein" - resume_path: null # Directory contains previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/full/qwen_image/default.yaml b/examples/grpo/full/qwen_image/default.yaml index 538d43e9..dc6684ba 100644 --- a/examples/grpo/full/qwen_image/default.yaml +++ b/examples/grpo/full/qwen_image/default.yaml @@ -22,7 +22,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Qwen/Qwen-Image" # HuggingFace model ID or local path model_type: "qwen-image" # Options: flux1, flux1-kontext, flux2, qwenimage, qwenimage-edit - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_varlen_hub' # Attention backend for Qwen-Image Series, which uses masked attention with variable sequence length. diff --git a/examples/grpo/full/qwen_image_edit_plus/default.yaml b/examples/grpo/full/qwen_image_edit_plus/default.yaml index f81b515e..b0ba0730 100644 --- a/examples/grpo/full/qwen_image_edit_plus/default.yaml +++ b/examples/grpo/full/qwen_image_edit_plus/default.yaml @@ -21,7 +21,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Qwen/Qwen-Image-Edit-2509" # Qwen/Qwen-Image-Edit-2509 or Qwen/Qwen-Image-Edit-2511 model_type: "qwen-image-edit-plus" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_varlen_hub' # Attention backend for Qwen-Image Series, which uses masked attention with variable sequence length. diff --git a/examples/grpo/full/sd3_5/default.yaml b/examples/grpo/full/sd3_5/default.yaml index 6b02f8a7..b44f2a65 100644 --- a/examples/grpo/full/sd3_5/default.yaml +++ b/examples/grpo/full/sd3_5/default.yaml @@ -21,7 +21,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "stabilityai/stable-diffusion-3.5-medium" # HuggingFace model ID or local path model_type: "sd3-5" # Options: flux1, flux1-kontext, flux2, qwenimage, qwenimage-edit, sd3-5 - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/full/wan21/i2v.yaml b/examples/grpo/full/wan21/i2v.yaml index 8e6a363d..d7c5dc34 100644 --- a/examples/grpo/full/wan21/i2v.yaml +++ b/examples/grpo/full/wan21/i2v.yaml @@ -22,7 +22,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers" # Wan-AI/Wan2.1-I2V-14B-480P-Diffusers / Wan-AI/Wan2.1-I2V-14B-720P-Diffusers model_type: "wan2_i2v" # wan2_t2v, wan2_i2v, wan2_v2v - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/full/wan21/t2v.yaml b/examples/grpo/full/wan21/t2v.yaml index 09c8bef5..fea1d8b9 100644 --- a/examples/grpo/full/wan21/t2v.yaml +++ b/examples/grpo/full/wan21/t2v.yaml @@ -22,7 +22,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Wan-AI/Wan2.1-T2V-14B-Diffusers" # Wan-AI/Wan2.1-T2V-14B-Diffusers / Wan-AI/Wan2.2-T2V-A14B-Diffusers model_type: "wan2_t2v" # wan2_t2v, wan2_i2v, wan2_v2v - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/full/wan22/i2v.yaml b/examples/grpo/full/wan22/i2v.yaml index 63731384..6ffa6a5e 100644 --- a/examples/grpo/full/wan22/i2v.yaml +++ b/examples/grpo/full/wan22/i2v.yaml @@ -28,7 +28,7 @@ model: target_modules: "default" model_name_or_path: "Wan-AI/Wan2.2-I2V-A14B-Diffusers" # Wan-AI/Wan2.2-TI2V-5B-Diffusers / Wan-AI/Wan2.2-I2V-A14B-Diffusers model_type: "wan2_i2v" # wan2_t2v, wan2_i2v, wan2_v2v - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/full/wan22/t2v.yaml b/examples/grpo/full/wan22/t2v.yaml index 74d10030..a188a0f9 100644 --- a/examples/grpo/full/wan22/t2v.yaml +++ b/examples/grpo/full/wan22/t2v.yaml @@ -28,7 +28,7 @@ model: target_modules: "default" model_name_or_path: "Wan-AI/Wan2.2-T2V-A14B-Diffusers" # Wan-AI/Wan2.1-T2V-14B-Diffusers / Wan-AI/Wan2.2-T2V-A14B-Diffusers model_type: "wan2_t2v" # wan2_t2v, wan2_i2v, wan2_v2v - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/full/z_image/default.yaml b/examples/grpo/full/z_image/default.yaml index 3abeab6a..e7794942 100644 --- a/examples/grpo/full/z_image/default.yaml +++ b/examples/grpo/full/z_image/default.yaml @@ -22,7 +22,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Tongyi-MAI/Z-Image" # Options: Tongyi-MAI/Z-Image, Tongyi-MAI/Z-Image-Turbo model_type: "z-image" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/full/z_image_turbo/default.yaml b/examples/grpo/full/z_image_turbo/default.yaml index 298a34bf..427e663b 100644 --- a/examples/grpo/full/z_image_turbo/default.yaml +++ b/examples/grpo/full/z_image_turbo/default.yaml @@ -22,7 +22,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Tongyi-MAI/Z-Image-Turbo" # HuggingFace model ID or local path model_type: "z-image" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/flux1/default.yaml b/examples/grpo/lora/flux1/default.yaml index 24fab9c2..9d814985 100644 --- a/examples/grpo/lora/flux1/default.yaml +++ b/examples/grpo/lora/flux1/default.yaml @@ -23,7 +23,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.1-dev" # HuggingFace model ID or local path model_type: "flux1" # Options: flux1, flux1-kontext, flux2, qwenimage, qwenimage-edit - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/flux1_kontext/default.yaml b/examples/grpo/lora/flux1_kontext/default.yaml index b6c6084d..269f4b03 100644 --- a/examples/grpo/lora/flux1_kontext/default.yaml +++ b/examples/grpo/lora/flux1_kontext/default.yaml @@ -23,7 +23,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.1-Kontext-dev" # HuggingFace model ID or local path model_type: "flux1-kontext" - resume_path: null # Directory contains previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/flux2/i2i.yaml b/examples/grpo/lora/flux2/i2i.yaml index b3185cb3..21ef994c 100644 --- a/examples/grpo/lora/flux2/i2i.yaml +++ b/examples/grpo/lora/flux2/i2i.yaml @@ -23,7 +23,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.2-dev" # HuggingFace model ID or local path model_type: "flux2" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/flux2/t2i.yaml b/examples/grpo/lora/flux2/t2i.yaml index 56dd92c5..17b89c83 100644 --- a/examples/grpo/lora/flux2/t2i.yaml +++ b/examples/grpo/lora/flux2/t2i.yaml @@ -23,7 +23,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.2-dev" # HuggingFace model ID or local path model_type: "flux2" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/flux2_klein/default.yaml b/examples/grpo/lora/flux2_klein/default.yaml index ce2e9fc4..286be971 100644 --- a/examples/grpo/lora/flux2_klein/default.yaml +++ b/examples/grpo/lora/flux2_klein/default.yaml @@ -23,7 +23,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.2-klein-9B" # Options: black-forest-labs/FLUX.2-klein-4B, black-forest-labs/FLUX.2-klein-9B model_type: "flux2-klein" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/flux2_klein_base/default.yaml b/examples/grpo/lora/flux2_klein_base/default.yaml index 21c8a4bd..890484d3 100644 --- a/examples/grpo/lora/flux2_klein_base/default.yaml +++ b/examples/grpo/lora/flux2_klein_base/default.yaml @@ -23,7 +23,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.2-klein-base-9B" # Options: black-forest-labs/FLUX.2-klein-base-4B, black-forest-labs/FLUX.2-klein-base-9B model_type: "flux2-klein" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/ltx2/i2av.yaml b/examples/grpo/lora/ltx2/i2av.yaml index c960506b..1e4c2872 100644 --- a/examples/grpo/lora/ltx2/i2av.yaml +++ b/examples/grpo/lora/ltx2/i2av.yaml @@ -28,7 +28,7 @@ model: target_modules: "default" # 28 Linear layers per block (video/audio attn + cross-modal attn + FFN) model_name_or_path: "Lightricks/LTX-2" # Options: Lightricks/LTX-2, dg845/LTX-2.3-Diffusers model_type: "ltx2_i2av" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/ltx2/t2av.yaml b/examples/grpo/lora/ltx2/t2av.yaml index fb093871..ae27c722 100644 --- a/examples/grpo/lora/ltx2/t2av.yaml +++ b/examples/grpo/lora/ltx2/t2av.yaml @@ -27,7 +27,7 @@ model: target_modules: "default" # 28 Linear layers per block (video/audio attn + cross-modal attn + FFN) model_name_or_path: "Lightricks/LTX-2" # Options: Lightricks/LTX-2, dg845/LTX-2.3-Diffusers model_type: "ltx2_t2av" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Attention backend. diff --git a/examples/grpo/lora/ltx2/t2av_pickscore.yaml b/examples/grpo/lora/ltx2/t2av_pickscore.yaml index e358a48a..9dfeae3d 100644 --- a/examples/grpo/lora/ltx2/t2av_pickscore.yaml +++ b/examples/grpo/lora/ltx2/t2av_pickscore.yaml @@ -25,7 +25,7 @@ model: target_modules: "default" # 28 Linear layers per block (video/audio attn + cross-modal attn + FFN) model_name_or_path: "dg845/LTX-2.3-Diffusers" # Options: Lightricks/LTX-2, dg845/LTX-2.3-Diffusers model_type: "ltx2_t2av" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Attention backend. diff --git a/examples/grpo/lora/qwen_image/default.yaml b/examples/grpo/lora/qwen_image/default.yaml index cc34ddcb..ef785057 100644 --- a/examples/grpo/lora/qwen_image/default.yaml +++ b/examples/grpo/lora/qwen_image/default.yaml @@ -23,7 +23,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Qwen/Qwen-Image-2512" # Qwen/Qwen-Image or Qwen/Qwen-Image-2512 model_type: "qwen-image" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_varlen_hub' # Attention backend for Qwen-Image Series, which uses masked attention with variable sequence length. diff --git a/examples/grpo/lora/qwen_image_edit_plus/default.yaml b/examples/grpo/lora/qwen_image_edit_plus/default.yaml index 42302ee6..cbe66e45 100644 --- a/examples/grpo/lora/qwen_image_edit_plus/default.yaml +++ b/examples/grpo/lora/qwen_image_edit_plus/default.yaml @@ -25,7 +25,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Qwen/Qwen-Image-Edit-2509" # Qwen/Qwen-Image-Edit-2509 or Qwen/Qwen-Image-Edit-2511 model_type: "qwen-image-edit-plus" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_varlen_hub' # Attention backend for Qwen-Image Series, which uses masked attention with variable sequence length. diff --git a/examples/grpo/lora/sd3_5/default.yaml b/examples/grpo/lora/sd3_5/default.yaml index 31f1b86b..f52936ee 100644 --- a/examples/grpo/lora/sd3_5/default.yaml +++ b/examples/grpo/lora/sd3_5/default.yaml @@ -26,7 +26,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "stabilityai/stable-diffusion-3.5-medium" # HuggingFace model ID or local path model_type: "sd3-5" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Attention backend for training. diff --git a/examples/grpo/lora/sd3_5/nocfg.yaml b/examples/grpo/lora/sd3_5/nocfg.yaml index 29cd7938..bf1d6931 100644 --- a/examples/grpo/lora/sd3_5/nocfg.yaml +++ b/examples/grpo/lora/sd3_5/nocfg.yaml @@ -26,7 +26,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "stabilityai/stable-diffusion-3.5-medium" # HuggingFace model ID or local path model_type: "sd3-5" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Attention backend for training. diff --git a/examples/grpo/lora/wan21/i2v.yaml b/examples/grpo/lora/wan21/i2v.yaml index 144f801f..52f0f13a 100644 --- a/examples/grpo/lora/wan21/i2v.yaml +++ b/examples/grpo/lora/wan21/i2v.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers" # Wan-AI/Wan2.1-I2V-14B-480P-Diffusers / Wan-AI/Wan2.1-I2V-14B-720P-Diffusers model_type: "wan2_i2v" # wan2_t2v, wan2_i2v, wan2_v2v - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/wan21/t2v.yaml b/examples/grpo/lora/wan21/t2v.yaml index 0d68323c..55a46b9c 100644 --- a/examples/grpo/lora/wan21/t2v.yaml +++ b/examples/grpo/lora/wan21/t2v.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Wan-AI/Wan2.1-T2V-14B-Diffusers" # Wan-AI/Wan2.1-T2V-14B-Diffusers / Wan-AI/Wan2.2-T2V-A14B-Diffusers model_type: "wan2_t2v" # wan2_t2v, wan2_i2v, wan2_v2v - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/wan21/v2v.yaml b/examples/grpo/lora/wan21/v2v.yaml index 44fd3cf8..93929834 100644 --- a/examples/grpo/lora/wan21/v2v.yaml +++ b/examples/grpo/lora/wan21/v2v.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Wan-AI/Wan2.1-T2V-14B-Diffusers" # Wan-AI/Wan2.1-T2V-1.3B-Diffusers / Wan-AI/Wan2.1-T2V-14B-Diffusers model_type: "wan2_v2v" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/wan22/i2v.yaml b/examples/grpo/lora/wan22/i2v.yaml index 96db5fac..105e62c4 100644 --- a/examples/grpo/lora/wan22/i2v.yaml +++ b/examples/grpo/lora/wan22/i2v.yaml @@ -33,7 +33,7 @@ model: target_modules: "transformer.default" model_name_or_path: "Wan-AI/Wan2.2-I2V-A14B-Diffusers" # Wan-AI/Wan2.2-TI2V-5B-Diffusers / Wan-AI/Wan2.2-I2V-A14B-Diffusers model_type: "wan2_i2v" # wan2_t2v, wan2_i2v, wan2_v2v - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/wan22/t2v.yaml b/examples/grpo/lora/wan22/t2v.yaml index d279ed84..3d3969d7 100644 --- a/examples/grpo/lora/wan22/t2v.yaml +++ b/examples/grpo/lora/wan22/t2v.yaml @@ -33,7 +33,7 @@ model: target_modules: "default" model_name_or_path: "Wan-AI/Wan2.2-T2V-A14B-Diffusers" # Wan-AI/Wan2.1-T2V-14B-Diffusers / Wan-AI/Wan2.2-T2V-A14B-Diffusers model_type: "wan2_t2v" # wan2_t2v, wan2_i2v, wan2_v2v - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/z_image/default.yaml b/examples/grpo/lora/z_image/default.yaml index a517bd2b..07da324b 100644 --- a/examples/grpo/lora/z_image/default.yaml +++ b/examples/grpo/lora/z_image/default.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Tongyi-MAI/Z-Image" # Options: Tongyi-MAI/Z-Image, Tongyi-MAI/Z-Image-Turbo model_type: "z-image" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/z_image_turbo/default.yaml b/examples/grpo/lora/z_image_turbo/default.yaml index 293dc222..fdac1937 100644 --- a/examples/grpo/lora/z_image_turbo/default.yaml +++ b/examples/grpo/lora/z_image_turbo/default.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Tongyi-MAI/Z-Image-Turbo" # HuggingFace model ID or local path model_type: "z-image" # Options: flux1, flux1-kontext, flux2, qwenimage, qwenimage-edit, z-image - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/nft/full/flux1/default.yaml b/examples/nft/full/flux1/default.yaml index 61d90c45..502a2235 100644 --- a/examples/nft/full/flux1/default.yaml +++ b/examples/nft/full/flux1/default.yaml @@ -21,7 +21,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.1-dev" # HuggingFace model ID or local path model_type: "flux1" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/nft/full/flux2_klein_base/default.yaml b/examples/nft/full/flux2_klein_base/default.yaml index d66bb878..1ddd0675 100644 --- a/examples/nft/full/flux2_klein_base/default.yaml +++ b/examples/nft/full/flux2_klein_base/default.yaml @@ -21,7 +21,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.2-klein-base-4B" # Options: black-forest-labs/FLUX.2-klein-base-4B, black-forest-labs/FLUX.2-klein-base-9B model_type: "flux2-klein" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/nft/full/wan22/t2v.yaml b/examples/nft/full/wan22/t2v.yaml index 6cf4ddaa..4e788752 100644 --- a/examples/nft/full/wan22/t2v.yaml +++ b/examples/nft/full/wan22/t2v.yaml @@ -26,7 +26,7 @@ model: target_modules: "default" model_name_or_path: "Wan-AI/Wan2.2-T2V-A14B-Diffusers" # Wan-AI/Wan2.1-T2V-14B-Diffusers / Wan-AI/Wan2.2-T2V-A14B-Diffusers model_type: "wan2_t2v" # wan2_t2v, wan2_i2v, wan2_v2v - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/nft/full/z_image/default.yaml b/examples/nft/full/z_image/default.yaml index 83d6b4c5..50252fb4 100644 --- a/examples/nft/full/z_image/default.yaml +++ b/examples/nft/full/z_image/default.yaml @@ -22,7 +22,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Tongyi-MAI/Z-Image" # Options: Tongyi-MAI/Z-Image, Tongyi-MAI/Z-Image-Turbo model_type: "z-image" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/nft/full/z_image_turbo/default.yaml b/examples/nft/full/z_image_turbo/default.yaml index b08702f5..3af795a2 100644 --- a/examples/nft/full/z_image_turbo/default.yaml +++ b/examples/nft/full/z_image_turbo/default.yaml @@ -22,7 +22,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Tongyi-MAI/Z-Image-Turbo" # Options: Tongyi-MAI/Z-Image, Tongyi-MAI/Z-Image-Turbo model_type: "z-image" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/nft/lora/flux1/default.yaml b/examples/nft/lora/flux1/default.yaml index 93cdb5ac..88d33b52 100644 --- a/examples/nft/lora/flux1/default.yaml +++ b/examples/nft/lora/flux1/default.yaml @@ -23,7 +23,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.1-dev" # HuggingFace model ID or local path model_type: "flux1" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Use flash attention 3 backend. diff --git a/examples/nft/lora/flux1/rational_rewards_t2i.yaml b/examples/nft/lora/flux1/rational_rewards_t2i.yaml index 38071fb9..ff40b2c3 100644 --- a/examples/nft/lora/flux1/rational_rewards_t2i.yaml +++ b/examples/nft/lora/flux1/rational_rewards_t2i.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" model_name_or_path: "black-forest-labs/FLUX.1-dev" model_type: "flux1" - resume_path: null + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null log: diff --git a/examples/nft/lora/flux1_kontext/rational_rewards_edit.yaml b/examples/nft/lora/flux1_kontext/rational_rewards_edit.yaml index bf0f900e..ad2ebdc8 100644 --- a/examples/nft/lora/flux1_kontext/rational_rewards_edit.yaml +++ b/examples/nft/lora/flux1_kontext/rational_rewards_edit.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" model_name_or_path: "black-forest-labs/FLUX.1-Kontext-dev" model_type: "flux1-kontext" - resume_path: null + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null log: diff --git a/examples/nft/lora/flux2_klein_base/default.yaml b/examples/nft/lora/flux2_klein_base/default.yaml index 9274f075..a79fedb8 100644 --- a/examples/nft/lora/flux2_klein_base/default.yaml +++ b/examples/nft/lora/flux2_klein_base/default.yaml @@ -23,7 +23,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.2-klein-base-4B" # Options: black-forest-labs/FLUX.2-klein-base-4B, black-forest-labs/FLUX.2-klein-base-9B model_type: "flux2-klein" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/nft/lora/qwen_image/rational_rewards_t2i.yaml b/examples/nft/lora/qwen_image/rational_rewards_t2i.yaml index 7559e802..e99289f8 100644 --- a/examples/nft/lora/qwen_image/rational_rewards_t2i.yaml +++ b/examples/nft/lora/qwen_image/rational_rewards_t2i.yaml @@ -25,7 +25,7 @@ model: target_modules: "default" model_name_or_path: "Qwen/Qwen-Image-2512" model_type: "qwen-image" - resume_path: null + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null log: diff --git a/examples/nft/lora/qwen_image_edit_plus/rational_rewards_edit.yaml b/examples/nft/lora/qwen_image_edit_plus/rational_rewards_edit.yaml index 8137aade..b326cf76 100644 --- a/examples/nft/lora/qwen_image_edit_plus/rational_rewards_edit.yaml +++ b/examples/nft/lora/qwen_image_edit_plus/rational_rewards_edit.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" model_name_or_path: "Qwen/Qwen-Image-Edit-2509" model_type: "qwen-image-edit-plus" - resume_path: null + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null log: diff --git a/examples/nft/lora/sd3_5/default.yaml b/examples/nft/lora/sd3_5/default.yaml index 7eafa33b..ba2b5cb8 100644 --- a/examples/nft/lora/sd3_5/default.yaml +++ b/examples/nft/lora/sd3_5/default.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "stabilityai/stable-diffusion-3.5-medium" model_type: "sd3-5" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Attention backend for training. diff --git a/examples/nft/lora/wan21/i2v.yaml b/examples/nft/lora/wan21/i2v.yaml index 23680484..78b9552f 100644 --- a/examples/nft/lora/wan21/i2v.yaml +++ b/examples/nft/lora/wan21/i2v.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" model_name_or_path: "Wan-AI/Wan2.1-I2V-14B-720P-Diffusers" # Wan-AI/Wan2.1-I2V-14B-480P-Diffusers / Wan-AI/Wan2.1-I2V-14B-480P-Diffusers model_type: "wan2_i2v" # wan2_t2v, wan2_i2v, wan2_v2v - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Use flash attention 3 backend. diff --git a/examples/nft/lora/wan21/t2v.yaml b/examples/nft/lora/wan21/t2v.yaml index b2a215ae..3c7ce05f 100644 --- a/examples/nft/lora/wan21/t2v.yaml +++ b/examples/nft/lora/wan21/t2v.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" model_name_or_path: "Wan-AI/Wan2.1-T2V-1.3B-Diffusers" # Wan-AI/Wan2.1-T2V-1.3B-Diffusers / Wan-AI/Wan2.1-T2V-14B-Diffusers model_type: "wan2_t2v" # wan2_t2v, wan2_i2v, wan2_v2v - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Use flash attention 3 backend. diff --git a/examples/nft/lora/wan22/t2v.yaml b/examples/nft/lora/wan22/t2v.yaml index a493ec46..546e592a 100644 --- a/examples/nft/lora/wan22/t2v.yaml +++ b/examples/nft/lora/wan22/t2v.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" model_name_or_path: "Wan-AI/Wan2.2-T2V-A14B-Diffusers" # Wan-AI/Wan2.2-TI2V-5B-Diffusers / Wan-AI/Wan2.2-T2V-A14B-Diffusers model_type: "wan2_t2v" # wan2_t2v, wan2_i2v, wan2_v2v - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Use flash attention 3 backend. diff --git a/examples/nft/lora/z_image/default.yaml b/examples/nft/lora/z_image/default.yaml index 50e9fdce..c0b809dd 100644 --- a/examples/nft/lora/z_image/default.yaml +++ b/examples/nft/lora/z_image/default.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Tongyi-MAI/Z-Image" # Options: Tongyi-MAI/Z-Image, Tongyi-MAI/Z-Image-Turbo model_type: "z-image" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/template/sd3_5/async_reward.yaml b/examples/template/sd3_5/async_reward.yaml index a7f19e1d..db004bfb 100644 --- a/examples/template/sd3_5/async_reward.yaml +++ b/examples/template/sd3_5/async_reward.yaml @@ -23,7 +23,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "stabilityai/stable-diffusion-3.5-medium" # HuggingFace model ID or local path model_type: "sd3-5" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Attention backend for training. diff --git a/src/flow_factory/hparams/model_args.py b/src/flow_factory/hparams/model_args.py index 9c7de4e0..d86c7947 100644 --- a/src/flow_factory/hparams/model_args.py +++ b/src/flow_factory/hparams/model_args.py @@ -80,7 +80,15 @@ class ModelArguments(ArgABC): resume_path : Optional[str] = field( default=None, - metadata={"help": "Resume from checkpoint directory."} + metadata={ + "help": "Resume from checkpoint. Accepts either a local directory or a " + "Hugging Face repo spec ('owner/repo[/subfolder][@revision]', or " + "explicit 'hf://owner/repo[/subfolder][@revision]'). When a local " + "path doesn't exist, falls back to Hugging Face Hub download. " + "Multi-node: HF_TOKEN must be set on every node; downloads happen " + "once per node; consider HF_HUB_ENABLE_HF_TRANSFER=1 for large " + "checkpoints to avoid NCCL watchdog timeouts." + } ) resume_type : Optional[Literal['lora', 'full', 'state']] = field( diff --git a/src/flow_factory/models/abc.py b/src/flow_factory/models/abc.py index d3a00572..0d1e8243 100644 --- a/src/flow_factory/models/abc.py +++ b/src/flow_factory/models/abc.py @@ -58,6 +58,9 @@ mapping_lora_state_dict, infer_lora_config, infer_target_modules, + parse_hf_checkpoint_path, + download_hf_checkpoint, + HF_PATH_PREFIX, ) from ..samples import BaseSample from ..ema import EMAModuleWrapper @@ -1455,6 +1458,63 @@ def save_checkpoint( logger.info(f"Checkpoint saved successfully to {save_directory}") # -------------------------------------------- Load ------------------------------------------- + def _resolve_checkpoint_path(self, path: str) -> str: + """ + Resolve `path` to a local directory, downloading from Hugging Face Hub when needed. + + Resolution order: + 1. If `path` starts with ``hf://``, strip the prefix and force HF download + (lets users override a colliding local directory). + 2. Otherwise, if `path` exists locally, return it as-is. + 3. Otherwise, parse as ``owner/repo[/subfolder][@revision]`` and download + via Hugging Face Hub. + + Multi-node-safe: the download is gated on ``is_local_main_process`` (one + process per node), not ``is_main_process`` (one global). This populates the + per-node HF cache exactly once on non-shared filesystems; on shared + filesystems, ``huggingface_hub``'s per-blob ``WeakFileLock`` dedupes the + concurrent ``snapshot_download`` calls so only one node transfers bytes. + + Args: + path: Local filesystem path or HF spec (with or without ``hf://`` prefix). + + Returns: + Absolute local directory path ready for the existing checkpoint loaders. + + Raises: + FileNotFoundError: When the spec is neither a local path nor a reachable HF repo. + """ + from huggingface_hub.errors import RepositoryNotFoundError, HfHubHTTPError + + force_hf = path.startswith(HF_PATH_PREFIX) + spec = path[len(HF_PATH_PREFIX):] if force_hf else path + + if not force_hf and os.path.exists(spec): + return spec + + repo_id, subfolder, revision = parse_hf_checkpoint_path(spec) + + if self.accelerator.is_local_main_process: + local_path = download_hf_checkpoint(repo_id, subfolder, revision) + logger.info( + f"[local rank 0 / global rank {self.accelerator.process_index}] " + f"resolved checkpoint '{path}' -> {local_path}" + ) + self.accelerator.wait_for_everyone() + + # All ranks call again; on the populated cache this is a metadata-only + # path lookup. Narrow re-raise for the specific HF-Hub failure modes so + # users see a single actionable message instead of a raw HTTPError. + try: + return download_hf_checkpoint(repo_id, subfolder, revision) + except (RepositoryNotFoundError, HfHubHTTPError) as e: + raise FileNotFoundError( + f"Checkpoint {path!r} not found locally and could not be fetched " + f"from Hugging Face Hub (repo={repo_id!r}, subfolder={subfolder!r}, " + f"revision={revision!r}). For private repos, ensure HF_TOKEN is set " + f"on ALL nodes." + ) from e + @staticmethod def load_sharded_checkpoint(checkpoint_dir: str, index_file: str) -> Dict[str, torch.Tensor]: """Load sharded safetensors checkpoint.""" @@ -1674,8 +1734,11 @@ def load_checkpoint( - None: Auto-detect based on checkpoint directory contents """ path = os.path.expanduser(path) + path = self._resolve_checkpoint_path(path) if not os.path.exists(path): - raise FileNotFoundError(f"Checkpoint path not found: {path}") + raise FileNotFoundError( + f"Checkpoint path not found locally or on Hugging Face Hub: {path!r}" + ) # Auto-detect if not specified if resume_type is None: diff --git a/src/flow_factory/utils/checkpoint.py b/src/flow_factory/utils/checkpoint.py index 14c79227..86968dd8 100644 --- a/src/flow_factory/utils/checkpoint.py +++ b/src/flow_factory/utils/checkpoint.py @@ -137,4 +137,112 @@ def infer_target_modules( if match: target_modules.add(match.group(1)) - return sorted(target_modules) \ No newline at end of file + return sorted(target_modules) + + +# ================================ Hugging Face Hub ================================ +HF_PATH_PREFIX = "hf://" + + +def parse_hf_checkpoint_path(path: str) -> Tuple[str, Optional[str], Optional[str]]: + """ + Parse a Hugging Face checkpoint path spec into ``(repo_id, subfolder, revision)``. + + Accepts both bare and ``hf://``-prefixed specs: + - ``owner/repo`` -> (``owner/repo``, None, None) + - ``hf://owner/repo`` -> (``owner/repo``, None, None) + - ``owner/repo/sub/dir`` -> (``owner/repo``, ``sub/dir``, None) + - ``owner/repo@v1.0`` -> (``owner/repo``, None, ``v1.0``) + - ``hf://owner/repo/sub/dir@v1.0`` -> (``owner/repo``, ``sub/dir``, ``v1.0``) + + Args: + path: A bare or ``hf://``-prefixed checkpoint spec. + + Returns: + Tuple of (repo_id, subfolder, revision); subfolder and revision are ``None`` when absent. + + Raises: + ValueError: If the spec lacks the ``owner/repo`` form (at minimum two path segments). + """ + if not isinstance(path, str): + raise TypeError( + f"expected str for path, got {type(path).__name__}: {path!r}" + ) + + spec = path[len(HF_PATH_PREFIX):] if path.startswith(HF_PATH_PREFIX) else path + + # Split off optional @revision (revision token cannot contain '/' or '@'). + revision: Optional[str] = None + if "@" in spec: + spec, revision = spec.rsplit("@", 1) + if not revision or "/" in revision: + raise ValueError( + f"invalid revision in HF checkpoint path: {path!r} " + f"(expected 'owner/repo[/subfolder][@revision]', got revision={revision!r})" + ) + + parts = [p for p in spec.split("/") if p] + if len(parts) < 2: + raise ValueError( + f"invalid HF checkpoint path: {path!r} " + f"(expected at least 'owner/repo', got {len(parts)} non-empty segments)" + ) + + repo_id = "/".join(parts[:2]) + subfolder = "/".join(parts[2:]) if len(parts) > 2 else None + return repo_id, subfolder, revision + + +def download_hf_checkpoint( + repo_id: str, + subfolder: Optional[str] = None, + revision: Optional[str] = None, +) -> str: + """ + Download a Hugging Face checkpoint snapshot and return the local directory path. + + Thin wrapper over ``huggingface_hub.snapshot_download``. When ``subfolder`` is + provided, restricts the download to that subtree via ``allow_patterns`` and + returns the path joined with the subfolder so the caller receives the directory + layout that the existing local-checkpoint loaders expect. + + Authentication is taken from the standard ``HF_TOKEN`` / ``HUGGING_FACE_HUB_TOKEN`` + environment variables (and the local ``~/.cache/huggingface/token`` cache). For + multi-node training the token must be available on every node. + + Args: + repo_id: HF repository identifier in ``owner/repo`` form. + subfolder: Optional subdirectory within the repo to fetch. + revision: Optional git revision (branch, tag, or commit SHA). + + Returns: + Absolute local directory path containing the snapshot (with ``subfolder`` appended when set). + """ + if not isinstance(repo_id, str) or "/" not in repo_id: + raise ValueError( + f"expected 'owner/repo' for repo_id, got {repo_id!r}" + ) + + from huggingface_hub import snapshot_download + + allow_patterns: Optional[List[str]] = None + if subfolder: + # Match the subfolder itself plus everything beneath it. + allow_patterns = [f"{subfolder}/*", f"{subfolder}/**"] + + local_root = snapshot_download( + repo_id=repo_id, + revision=revision, + allow_patterns=allow_patterns, + ) + + if subfolder: + local_path = os.path.join(local_root, subfolder) + if not os.path.isdir(local_path): + raise FileNotFoundError( + f"HF snapshot for repo_id={repo_id!r} (revision={revision!r}) did not " + f"contain expected subfolder {subfolder!r}; downloaded root={local_root!r}" + ) + return local_path + + return local_root \ No newline at end of file From 540ff6d6ddb041b3bfc00c117574cfe9e5bbe13e Mon Sep 17 00:00:00 2001 From: Jayce-Ping <315229706@qq.com> Date: Sun, 17 May 2026 08:56:22 +0800 Subject: [PATCH 2/3] [adapter,docs] refactor: hoist HF imports to module top; codify rule Move the two `huggingface_hub` imports added in the previous commit out of function bodies and up to the module's import block: - `from huggingface_hub import snapshot_download` in `src/flow_factory/utils/checkpoint.py` - `from huggingface_hub.errors import RepositoryNotFoundError, HfHubHTTPError` in `src/flow_factory/models/abc.py` `huggingface_hub` is already a hard dependency (`pyproject.toml:39`), so there is no import-cost concern; lazy-loading only hid the dependency surface from readers and `isort`. Codify the rule so this pattern is caught in review going forward: - Extend constraint #22 ("Import Style") in `.agents/knowledge/constraints.md` with an explicit "Top-level imports only" bullet, listing the three sanctioned exceptions (optional deps via `try/except ImportError`, backend-gated imports under runtime feature checks like DeepSpeed/FSDP, and unresolvable circular imports). - Add a matching checklist item to the Code Style section of `.agents/skills/ff-review/SKILL.md`. Pre-existing inline FSDP/DeepSpeed imports in `models/abc.py` (lines ~997-1152) are grandfathered under the new rule's exception (b). Co-authored-by: Cursor --- .agents/knowledge/constraints.md | 1 + .agents/skills/ff-review/SKILL.md | 1 + src/flow_factory/models/abc.py | 3 +-- src/flow_factory/utils/checkpoint.py | 3 +-- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.agents/knowledge/constraints.md b/.agents/knowledge/constraints.md index 3623f515..0e0fd1c7 100644 --- a/.agents/knowledge/constraints.md +++ b/.agents/knowledge/constraints.md @@ -131,6 +131,7 @@ The adapter sets inference dtype for frozen components and training dtype for tr - Use relative imports within `flow_factory` package (e.g., `from ..hparams import *`) - Use absolute imports for external packages - Follow existing wildcard import patterns for `hparams` +- **Top-level imports only**: All `import` / `from ... import ...` statements MUST live at the top of the module, never inside function bodies, methods, `__init__`, or conditional branches. Sanctioned exceptions: (a) optional dependencies wrapped in `try/except ImportError` (e.g., `deepspeed`, `xformers`); (b) backend-gated imports where the target symbol is only resolvable under a specific runtime backend already selected by a preceding feature check (e.g., DeepSpeed/FSDP submodules guarded by `is_deepspeed()` / `is_fsdp2()` in `models/abc.py`); (c) genuine unresolvable circular imports documented inline. Lazy imports added merely for "import speed" or "to keep the module light" are NOT acceptable — every hard dependency already runs through Python's import machinery on a typical import path. Inline imports hide the dependency surface from readers, `isort`, and static-analysis tools, and re-execute on every call in hot loops. ### 23. Type Annotations All public methods must have type annotations. Use `typing` module types (`List`, `Dict`, `Optional`, `Tuple`, `Union`) for Python 3.10 compatibility. diff --git a/.agents/skills/ff-review/SKILL.md b/.agents/skills/ff-review/SKILL.md index de301a1a..1dd10ea2 100644 --- a/.agents/skills/ff-review/SKILL.md +++ b/.agents/skills/ff-review/SKILL.md @@ -60,6 +60,7 @@ git status # Modified files - [ ] English comments and docstrings - [ ] Apache 2.0 license header on new files - [ ] No unnecessary wildcard imports (except `hparams`) +- [ ] **Top-level imports only** — no `import` / `from ... import ...` inside function or method bodies (constraint #22). Exceptions: documented optional deps via `try/except ImportError`, or genuine unresolvable circular imports. ### Documentation - [ ] `guidance/` docs updated if behavior changed diff --git a/src/flow_factory/models/abc.py b/src/flow_factory/models/abc.py index 0d1e8243..5f4f0bc6 100644 --- a/src/flow_factory/models/abc.py +++ b/src/flow_factory/models/abc.py @@ -38,6 +38,7 @@ from peft import get_peft_model, LoraConfig, PeftModel from huggingface_hub import split_torch_state_dict_into_shards +from huggingface_hub.errors import RepositoryNotFoundError, HfHubHTTPError from accelerate import Accelerator, DistributedType from accelerate.state import PartialState from accelerate.utils.modeling import ( @@ -1484,8 +1485,6 @@ def _resolve_checkpoint_path(self, path: str) -> str: Raises: FileNotFoundError: When the spec is neither a local path nor a reachable HF repo. """ - from huggingface_hub.errors import RepositoryNotFoundError, HfHubHTTPError - force_hf = path.startswith(HF_PATH_PREFIX) spec = path[len(HF_PATH_PREFIX):] if force_hf else path diff --git a/src/flow_factory/utils/checkpoint.py b/src/flow_factory/utils/checkpoint.py index 86968dd8..2c0b5cd6 100644 --- a/src/flow_factory/utils/checkpoint.py +++ b/src/flow_factory/utils/checkpoint.py @@ -24,6 +24,7 @@ from typing import Dict, Optional, List, Tuple, Literal from safetensors.torch import save_file, load_file +from huggingface_hub import snapshot_download def mapping_lora_state_dict( state_dict: Dict[str, torch.Tensor], @@ -223,8 +224,6 @@ def download_hf_checkpoint( f"expected 'owner/repo' for repo_id, got {repo_id!r}" ) - from huggingface_hub import snapshot_download - allow_patterns: Optional[List[str]] = None if subfolder: # Match the subfolder itself plus everything beneath it. From aa883aac8a1724790da6794553a6d5f7e1f32eac Mon Sep 17 00:00:00 2001 From: Jayce-Ping <315229706@qq.com> Date: Sun, 17 May 2026 10:31:48 +0800 Subject: [PATCH 3/3] [adapter,hparams] fix: address Copilot review on PR #160 Three fixes from the Copilot review on the upstream PR: 1. Path-traversal validation (utils/checkpoint.py): parse_hf_checkpoint_path now rejects '.', '..', and backslash segments with an informative ValueError that preserves the original spec. Validates at the parser front door rather than at the download site so error messages keep the user's exact input. Without this, a spec like 'owner/repo/..' would escape the snapshot directory via os.path.join. 2. Un-gate _resolve_checkpoint_path (models/abc.py): Remove the `if is_local_main_process:` gate and the post-barrier double-call pattern. All ranks now call snapshot_download directly; huggingface_hub's per-blob WeakFileLock serializes concurrent calls within each filesystem domain (cross-node on POSIX-locking shared FS, per-node on non-shared FS), so we still get exactly one download per filesystem domain. This eliminates the distributed-deadlock hazard where a download failure on the gated rank would raise before reaching wait_for_everyone(), leaving siblings blocked at the barrier until NCCL watchdog timeout. The trailing wait_for_everyone() is kept to maintain lockstep entry into the downstream loaders. Residual asymmetric-failure risk (one rank's network blip while others succeed) is documented in the docstring. 3. Skill-checklist alignment (.agents/skills/ff-review/SKILL.md): Replace the duplicated import-exception list with a reference to constraint #22, where the full set of three sanctioned exceptions (optional deps, backend-gated runtime feature checks, circular imports) lives. Prevents future drift between the two documents. Verified with 8 happy-path + 5 original-error + 6 path-traversal parser test cases (all pass). Co-authored-by: Cursor --- .agents/skills/ff-review/SKILL.md | 2 +- src/flow_factory/models/abc.py | 43 +++++++++++++++++----------- src/flow_factory/utils/checkpoint.py | 11 +++++++ 3 files changed, 38 insertions(+), 18 deletions(-) diff --git a/.agents/skills/ff-review/SKILL.md b/.agents/skills/ff-review/SKILL.md index 1dd10ea2..ae85957b 100644 --- a/.agents/skills/ff-review/SKILL.md +++ b/.agents/skills/ff-review/SKILL.md @@ -60,7 +60,7 @@ git status # Modified files - [ ] English comments and docstrings - [ ] Apache 2.0 license header on new files - [ ] No unnecessary wildcard imports (except `hparams`) -- [ ] **Top-level imports only** — no `import` / `from ... import ...` inside function or method bodies (constraint #22). Exceptions: documented optional deps via `try/except ImportError`, or genuine unresolvable circular imports. +- [ ] **Top-level imports only** (constraint #22) — see that file for the three sanctioned exceptions (optional deps via `try/except ImportError`, backend-gated runtime feature checks like DeepSpeed/FSDP, unresolvable circular imports). ### Documentation - [ ] `guidance/` docs updated if behavior changed diff --git a/src/flow_factory/models/abc.py b/src/flow_factory/models/abc.py index 5f4f0bc6..d1a36818 100644 --- a/src/flow_factory/models/abc.py +++ b/src/flow_factory/models/abc.py @@ -1470,11 +1470,17 @@ def _resolve_checkpoint_path(self, path: str) -> str: 3. Otherwise, parse as ``owner/repo[/subfolder][@revision]`` and download via Hugging Face Hub. - Multi-node-safe: the download is gated on ``is_local_main_process`` (one - process per node), not ``is_main_process`` (one global). This populates the - per-node HF cache exactly once on non-shared filesystems; on shared - filesystems, ``huggingface_hub``'s per-blob ``WeakFileLock`` dedupes the - concurrent ``snapshot_download`` calls so only one node transfers bytes. + Multi-node-safe: all ranks call ``snapshot_download`` directly. Hugging + Face Hub's per-blob ``WeakFileLock`` serializes concurrent calls within + each filesystem domain (cross-node on POSIX-locking shared FS, per-node + on non-shared FS), so exactly one rank per filesystem domain actually + transfers bytes. Un-gated (rather than ``is_local_main_process`` plus a + barrier) so a failed download raises uniformly on every affected rank + instead of leaving siblings deadlocked at a barrier the failing rank + never reaches. Residual hazard: a rare single-rank transient failure + (e.g. one node's network blip) can produce asymmetric progress, in + which case the surviving ranks will eventually trip the NCCL watchdog + on the final barrier below. Args: path: Local filesystem path or HF spec (with or without ``hf://`` prefix). @@ -1493,19 +1499,8 @@ def _resolve_checkpoint_path(self, path: str) -> str: repo_id, subfolder, revision = parse_hf_checkpoint_path(spec) - if self.accelerator.is_local_main_process: - local_path = download_hf_checkpoint(repo_id, subfolder, revision) - logger.info( - f"[local rank 0 / global rank {self.accelerator.process_index}] " - f"resolved checkpoint '{path}' -> {local_path}" - ) - self.accelerator.wait_for_everyone() - - # All ranks call again; on the populated cache this is a metadata-only - # path lookup. Narrow re-raise for the specific HF-Hub failure modes so - # users see a single actionable message instead of a raw HTTPError. try: - return download_hf_checkpoint(repo_id, subfolder, revision) + local_path = download_hf_checkpoint(repo_id, subfolder, revision) except (RepositoryNotFoundError, HfHubHTTPError) as e: raise FileNotFoundError( f"Checkpoint {path!r} not found locally and could not be fetched " @@ -1514,6 +1509,20 @@ def _resolve_checkpoint_path(self, path: str) -> str: f"on ALL nodes." ) from e + # Sync after download so downstream loaders enter the lockstep dispatch + # together. On symmetric failure every rank raises above before this + # barrier is reached, so no deadlock; the residual asymmetric-failure + # case is documented in the docstring. + self.accelerator.wait_for_everyone() + + if self.accelerator.is_local_main_process: + logger.info( + f"[local rank 0 / global rank {self.accelerator.process_index}] " + f"resolved checkpoint '{path}' -> {local_path}" + ) + + return local_path + @staticmethod def load_sharded_checkpoint(checkpoint_dir: str, index_file: str) -> Dict[str, torch.Tensor]: """Load sharded safetensors checkpoint.""" diff --git a/src/flow_factory/utils/checkpoint.py b/src/flow_factory/utils/checkpoint.py index 2c0b5cd6..fa101246 100644 --- a/src/flow_factory/utils/checkpoint.py +++ b/src/flow_factory/utils/checkpoint.py @@ -189,6 +189,17 @@ def parse_hf_checkpoint_path(path: str) -> Tuple[str, Optional[str], Optional[st f"(expected at least 'owner/repo', got {len(parts)} non-empty segments)" ) + # Reject path-traversal segments. Without this, a spec like + # 'owner/repo/..' would resolve via os.path.join to a directory outside + # the snapshot root and let downstream loaders read from unintended + # locations. Backslashes are rejected to block Windows-style traversal. + for seg in parts: + if seg in (".", "..") or "\\" in seg: + raise ValueError( + f"invalid segment {seg!r} in HF checkpoint path: {path!r} " + f"('.', '..', and backslashes are not allowed)" + ) + repo_id = "/".join(parts[:2]) subfolder = "/".join(parts[2:]) if len(parts) > 2 else None return repo_id, subfolder, revision