diff --git a/.agents/knowledge/constraints.md b/.agents/knowledge/constraints.md index 3623f515..0e0fd1c7 100644 --- a/.agents/knowledge/constraints.md +++ b/.agents/knowledge/constraints.md @@ -131,6 +131,7 @@ The adapter sets inference dtype for frozen components and training dtype for tr - Use relative imports within `flow_factory` package (e.g., `from ..hparams import *`) - Use absolute imports for external packages - Follow existing wildcard import patterns for `hparams` +- **Top-level imports only**: All `import` / `from ... import ...` statements MUST live at the top of the module, never inside function bodies, methods, `__init__`, or conditional branches. Sanctioned exceptions: (a) optional dependencies wrapped in `try/except ImportError` (e.g., `deepspeed`, `xformers`); (b) backend-gated imports where the target symbol is only resolvable under a specific runtime backend already selected by a preceding feature check (e.g., DeepSpeed/FSDP submodules guarded by `is_deepspeed()` / `is_fsdp2()` in `models/abc.py`); (c) genuine unresolvable circular imports documented inline. Lazy imports added merely for "import speed" or "to keep the module light" are NOT acceptable — every hard dependency is already imported on a typical import path, so deferring it saves nothing. Inline imports hide the dependency surface from readers, `isort`, and static-analysis tools, and re-run the import statement (at minimum a `sys.modules` lookup) on every call in hot loops. ### 23. Type Annotations All public methods must have type annotations. Use `typing` module types (`List`, `Dict`, `Optional`, `Tuple`, `Union`) for Python 3.10 compatibility. 
diff --git a/.agents/skills/ff-review/SKILL.md b/.agents/skills/ff-review/SKILL.md index de301a1a..ae85957b 100644 --- a/.agents/skills/ff-review/SKILL.md +++ b/.agents/skills/ff-review/SKILL.md @@ -60,6 +60,7 @@ git status # Modified files - [ ] English comments and docstrings - [ ] Apache 2.0 license header on new files - [ ] No unnecessary wildcard imports (except `hparams`) +- [ ] **Top-level imports only** (constraint #22 in `.agents/knowledge/constraints.md`) — see that file for the three sanctioned exceptions (optional deps via `try/except ImportError`, backend-gated runtime feature checks like DeepSpeed/FSDP, unresolvable circular imports). ### Documentation - [ ] `guidance/` docs updated if behavior changed diff --git a/examples/awm/lora/flux1/default.yaml b/examples/awm/lora/flux1/default.yaml index 075e8e03..01ba0de4 100644 --- a/examples/awm/lora/flux1/default.yaml +++ b/examples/awm/lora/flux1/default.yaml @@ -23,7 +23,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.1-dev" # HuggingFace model ID or local path model_type: "flux1" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. 
Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' diff --git a/examples/awm/lora/flux2_klein_base/default.yaml b/examples/awm/lora/flux2_klein_base/default.yaml index 1986a2e4..f05d3855 100644 --- a/examples/awm/lora/flux2_klein_base/default.yaml +++ b/examples/awm/lora/flux2_klein_base/default.yaml @@ -23,7 +23,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.2-klein-base-4B" # Options: black-forest-labs/FLUX.2-klein-base-4B, black-forest-labs/FLUX.2-klein-base-9B model_type: "flux2-klein" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Attention backend for training. diff --git a/examples/awm/lora/sd3_5/default.yaml b/examples/awm/lora/sd3_5/default.yaml index bc32cf89..625cb99f 100644 --- a/examples/awm/lora/sd3_5/default.yaml +++ b/examples/awm/lora/sd3_5/default.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "stabilityai/stable-diffusion-3.5-medium" model_type: "sd3-5" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Attention backend for training. 
diff --git a/examples/crd/lora/sd3_5/default.yaml b/examples/crd/lora/sd3_5/default.yaml index cea6a766..2ef8aaed 100644 --- a/examples/crd/lora/sd3_5/default.yaml +++ b/examples/crd/lora/sd3_5/default.yaml @@ -25,7 +25,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "stabilityai/stable-diffusion-3.5-medium" model_type: "sd3-5" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Attention backend for training. diff --git a/examples/dgpo/lora/sd3_5/default.yaml b/examples/dgpo/lora/sd3_5/default.yaml index e7794146..6876e14a 100644 --- a/examples/dgpo/lora/sd3_5/default.yaml +++ b/examples/dgpo/lora/sd3_5/default.yaml @@ -48,7 +48,7 @@ model: target_modules: "default" model_name_or_path: "stabilityai/stable-diffusion-3.5-medium" # config.pretrained.model model_type: "sd3-5" - resume_path: null + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Training Configuration diff --git a/examples/dgpo/lora/sd3_5/nocfg.yaml b/examples/dgpo/lora/sd3_5/nocfg.yaml index b5519b93..23fd6295 100644 --- a/examples/dgpo/lora/sd3_5/nocfg.yaml +++ b/examples/dgpo/lora/sd3_5/nocfg.yaml @@ -39,7 +39,7 @@ model: target_modules: "default" model_name_or_path: "stabilityai/stable-diffusion-3.5-medium" # config.pretrained.model model_type: "sd3-5" - resume_path: null + resume_path: null # Local path or HF repo id (e.g. 
'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Training Configuration diff --git a/examples/dpo/lora/sd3_5/default.yaml b/examples/dpo/lora/sd3_5/default.yaml index 8d885f51..e4d48884 100644 --- a/examples/dpo/lora/sd3_5/default.yaml +++ b/examples/dpo/lora/sd3_5/default.yaml @@ -50,7 +50,7 @@ model: target_modules: "default" model_name_or_path: "stabilityai/stable-diffusion-3.5-medium" # Same as flow_grpo model_type: "sd3-5" - resume_path: null + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null log: diff --git a/examples/grpo/full/flux1/default.yaml b/examples/grpo/full/flux1/default.yaml index ab7b2514..4845e4f7 100644 --- a/examples/grpo/full/flux1/default.yaml +++ b/examples/grpo/full/flux1/default.yaml @@ -21,7 +21,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.1-dev" # HuggingFace model ID or local path model_type: "flux1" - resume_path: null # Directory contains previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. 
Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/full/flux1_kontext/default.yaml b/examples/grpo/full/flux1_kontext/default.yaml index 8cefd8cf..99a57442 100644 --- a/examples/grpo/full/flux1_kontext/default.yaml +++ b/examples/grpo/full/flux1_kontext/default.yaml @@ -21,7 +21,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.1-Kontext-dev" # HuggingFace model ID or local path model_type: "flux1-kontext" - resume_path: null # Directory contains previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/full/flux2/i2i.yaml b/examples/grpo/full/flux2/i2i.yaml index 59d731a0..32ea5c04 100644 --- a/examples/grpo/full/flux2/i2i.yaml +++ b/examples/grpo/full/flux2/i2i.yaml @@ -24,7 +24,7 @@ model: target_modules: ["attn.to_q", "attn.to_k", "attn.to_v", "attn.to_out.0"] model_name_or_path: "black-forest-labs/FLUX.2-dev" # HuggingFace model ID or local path model_type: "flux2" # Options: flux1, flux1-kontext, flux2, qwenimage, qwenimage-edit - resume_path: null # Directory contains previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. 
Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/full/flux2/t2i.yaml b/examples/grpo/full/flux2/t2i.yaml index 9d9691c0..b9af3835 100644 --- a/examples/grpo/full/flux2/t2i.yaml +++ b/examples/grpo/full/flux2/t2i.yaml @@ -24,7 +24,7 @@ model: target_modules: ["attn.to_q", "attn.to_k", "attn.to_v", "attn.to_out.0"] model_name_or_path: "black-forest-labs/FLUX.2-dev" # HuggingFace model ID or local path model_type: "flux2" # Options: flux1, flux1-kontext, flux2, qwenimage, qwenimage-edit - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/full/flux2_klein/default.yaml b/examples/grpo/full/flux2_klein/default.yaml index 2b834d11..fc515f68 100644 --- a/examples/grpo/full/flux2_klein/default.yaml +++ b/examples/grpo/full/flux2_klein/default.yaml @@ -21,7 +21,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.2-klein-4B" # Options: black-forest-labs/FLUX.2-klein-4B, black-forest-labs/FLUX.2-klein-9B model_type: "flux2-klein" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. 
Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/full/flux2_klein_base/default.yaml b/examples/grpo/full/flux2_klein_base/default.yaml index 2cab32ce..af13dbeb 100644 --- a/examples/grpo/full/flux2_klein_base/default.yaml +++ b/examples/grpo/full/flux2_klein_base/default.yaml @@ -21,7 +21,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.2-klein-base-4B" # Options: black-forest-labs/FLUX.2-klein-base-4B, black-forest-labs/FLUX.2-klein-base-9B model_type: "flux2-klein" - resume_path: null # Directory contains previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/full/qwen_image/default.yaml b/examples/grpo/full/qwen_image/default.yaml index 538d43e9..dc6684ba 100644 --- a/examples/grpo/full/qwen_image/default.yaml +++ b/examples/grpo/full/qwen_image/default.yaml @@ -22,7 +22,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Qwen/Qwen-Image" # HuggingFace model ID or local path model_type: "qwen-image" # Options: flux1, flux1-kontext, flux2, qwenimage, qwenimage-edit - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_varlen_hub' # Attention backend for Qwen-Image Series, which uses masked attention with variable sequence length. 
diff --git a/examples/grpo/full/qwen_image_edit_plus/default.yaml b/examples/grpo/full/qwen_image_edit_plus/default.yaml index f81b515e..b0ba0730 100644 --- a/examples/grpo/full/qwen_image_edit_plus/default.yaml +++ b/examples/grpo/full/qwen_image_edit_plus/default.yaml @@ -21,7 +21,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Qwen/Qwen-Image-Edit-2509" # Qwen/Qwen-Image-Edit-2509 or Qwen/Qwen-Image-Edit-2511 model_type: "qwen-image-edit-plus" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_varlen_hub' # Attention backend for Qwen-Image Series, which uses masked attention with variable sequence length. diff --git a/examples/grpo/full/sd3_5/default.yaml b/examples/grpo/full/sd3_5/default.yaml index 6b02f8a7..b44f2a65 100644 --- a/examples/grpo/full/sd3_5/default.yaml +++ b/examples/grpo/full/sd3_5/default.yaml @@ -21,7 +21,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "stabilityai/stable-diffusion-3.5-medium" # HuggingFace model ID or local path model_type: "sd3-5" # Options: flux1, flux1-kontext, flux2, qwenimage, qwenimage-edit, sd3-5 - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. 
Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/full/wan21/i2v.yaml b/examples/grpo/full/wan21/i2v.yaml index 8e6a363d..d7c5dc34 100644 --- a/examples/grpo/full/wan21/i2v.yaml +++ b/examples/grpo/full/wan21/i2v.yaml @@ -22,7 +22,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers" # Wan-AI/Wan2.1-I2V-14B-480P-Diffusers / Wan-AI/Wan2.1-I2V-14B-720P-Diffusers model_type: "wan2_i2v" # wan2_t2v, wan2_i2v, wan2_v2v - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/full/wan21/t2v.yaml b/examples/grpo/full/wan21/t2v.yaml index 09c8bef5..fea1d8b9 100644 --- a/examples/grpo/full/wan21/t2v.yaml +++ b/examples/grpo/full/wan21/t2v.yaml @@ -22,7 +22,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Wan-AI/Wan2.1-T2V-14B-Diffusers" # Wan-AI/Wan2.1-T2V-14B-Diffusers / Wan-AI/Wan2.2-T2V-A14B-Diffusers model_type: "wan2_t2v" # wan2_t2v, wan2_i2v, wan2_v2v - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. 
Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/full/wan22/i2v.yaml b/examples/grpo/full/wan22/i2v.yaml index 63731384..6ffa6a5e 100644 --- a/examples/grpo/full/wan22/i2v.yaml +++ b/examples/grpo/full/wan22/i2v.yaml @@ -28,7 +28,7 @@ model: target_modules: "default" model_name_or_path: "Wan-AI/Wan2.2-I2V-A14B-Diffusers" # Wan-AI/Wan2.2-TI2V-5B-Diffusers / Wan-AI/Wan2.2-I2V-A14B-Diffusers model_type: "wan2_i2v" # wan2_t2v, wan2_i2v, wan2_v2v - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/full/wan22/t2v.yaml b/examples/grpo/full/wan22/t2v.yaml index 74d10030..a188a0f9 100644 --- a/examples/grpo/full/wan22/t2v.yaml +++ b/examples/grpo/full/wan22/t2v.yaml @@ -28,7 +28,7 @@ model: target_modules: "default" model_name_or_path: "Wan-AI/Wan2.2-T2V-A14B-Diffusers" # Wan-AI/Wan2.1-T2V-14B-Diffusers / Wan-AI/Wan2.2-T2V-A14B-Diffusers model_type: "wan2_t2v" # wan2_t2v, wan2_i2v, wan2_v2v - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. 
Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/full/z_image/default.yaml b/examples/grpo/full/z_image/default.yaml index 3abeab6a..e7794942 100644 --- a/examples/grpo/full/z_image/default.yaml +++ b/examples/grpo/full/z_image/default.yaml @@ -22,7 +22,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Tongyi-MAI/Z-Image" # Options: Tongyi-MAI/Z-Image, Tongyi-MAI/Z-Image-Turbo model_type: "z-image" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/full/z_image_turbo/default.yaml b/examples/grpo/full/z_image_turbo/default.yaml index 298a34bf..427e663b 100644 --- a/examples/grpo/full/z_image_turbo/default.yaml +++ b/examples/grpo/full/z_image_turbo/default.yaml @@ -22,7 +22,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Tongyi-MAI/Z-Image-Turbo" # HuggingFace model ID or local path model_type: "z-image" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. 
Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/flux1/default.yaml b/examples/grpo/lora/flux1/default.yaml index 24fab9c2..9d814985 100644 --- a/examples/grpo/lora/flux1/default.yaml +++ b/examples/grpo/lora/flux1/default.yaml @@ -23,7 +23,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.1-dev" # HuggingFace model ID or local path model_type: "flux1" # Options: flux1, flux1-kontext, flux2, qwenimage, qwenimage-edit - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/flux1_kontext/default.yaml b/examples/grpo/lora/flux1_kontext/default.yaml index b6c6084d..269f4b03 100644 --- a/examples/grpo/lora/flux1_kontext/default.yaml +++ b/examples/grpo/lora/flux1_kontext/default.yaml @@ -23,7 +23,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.1-Kontext-dev" # HuggingFace model ID or local path model_type: "flux1-kontext" - resume_path: null # Directory contains previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. 
Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/flux2/i2i.yaml b/examples/grpo/lora/flux2/i2i.yaml index b3185cb3..21ef994c 100644 --- a/examples/grpo/lora/flux2/i2i.yaml +++ b/examples/grpo/lora/flux2/i2i.yaml @@ -23,7 +23,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.2-dev" # HuggingFace model ID or local path model_type: "flux2" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/flux2/t2i.yaml b/examples/grpo/lora/flux2/t2i.yaml index 56dd92c5..17b89c83 100644 --- a/examples/grpo/lora/flux2/t2i.yaml +++ b/examples/grpo/lora/flux2/t2i.yaml @@ -23,7 +23,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.2-dev" # HuggingFace model ID or local path model_type: "flux2" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. 
Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/flux2_klein/default.yaml b/examples/grpo/lora/flux2_klein/default.yaml index ce2e9fc4..286be971 100644 --- a/examples/grpo/lora/flux2_klein/default.yaml +++ b/examples/grpo/lora/flux2_klein/default.yaml @@ -23,7 +23,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.2-klein-9B" # Options: black-forest-labs/FLUX.2-klein-4B, black-forest-labs/FLUX.2-klein-9B model_type: "flux2-klein" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/flux2_klein_base/default.yaml b/examples/grpo/lora/flux2_klein_base/default.yaml index 21c8a4bd..890484d3 100644 --- a/examples/grpo/lora/flux2_klein_base/default.yaml +++ b/examples/grpo/lora/flux2_klein_base/default.yaml @@ -23,7 +23,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.2-klein-base-9B" # Options: black-forest-labs/FLUX.2-klein-base-4B, black-forest-labs/FLUX.2-klein-base-9B model_type: "flux2-klein" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. 
Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/ltx2/i2av.yaml b/examples/grpo/lora/ltx2/i2av.yaml index c960506b..1e4c2872 100644 --- a/examples/grpo/lora/ltx2/i2av.yaml +++ b/examples/grpo/lora/ltx2/i2av.yaml @@ -28,7 +28,7 @@ model: target_modules: "default" # 28 Linear layers per block (video/audio attn + cross-modal attn + FFN) model_name_or_path: "Lightricks/LTX-2" # Options: Lightricks/LTX-2, dg845/LTX-2.3-Diffusers model_type: "ltx2_i2av" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/ltx2/t2av.yaml b/examples/grpo/lora/ltx2/t2av.yaml index fb093871..ae27c722 100644 --- a/examples/grpo/lora/ltx2/t2av.yaml +++ b/examples/grpo/lora/ltx2/t2av.yaml @@ -27,7 +27,7 @@ model: target_modules: "default" # 28 Linear layers per block (video/audio attn + cross-modal attn + FFN) model_name_or_path: "Lightricks/LTX-2" # Options: Lightricks/LTX-2, dg845/LTX-2.3-Diffusers model_type: "ltx2_t2av" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Attention backend. 
diff --git a/examples/grpo/lora/ltx2/t2av_pickscore.yaml b/examples/grpo/lora/ltx2/t2av_pickscore.yaml index e358a48a..9dfeae3d 100644 --- a/examples/grpo/lora/ltx2/t2av_pickscore.yaml +++ b/examples/grpo/lora/ltx2/t2av_pickscore.yaml @@ -25,7 +25,7 @@ model: target_modules: "default" # 28 Linear layers per block (video/audio attn + cross-modal attn + FFN) model_name_or_path: "dg845/LTX-2.3-Diffusers" # Options: Lightricks/LTX-2, dg845/LTX-2.3-Diffusers model_type: "ltx2_t2av" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Attention backend. diff --git a/examples/grpo/lora/qwen_image/default.yaml b/examples/grpo/lora/qwen_image/default.yaml index cc34ddcb..ef785057 100644 --- a/examples/grpo/lora/qwen_image/default.yaml +++ b/examples/grpo/lora/qwen_image/default.yaml @@ -23,7 +23,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Qwen/Qwen-Image-2512" # Qwen/Qwen-Image or Qwen/Qwen-Image-2512 model_type: "qwen-image" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_varlen_hub' # Attention backend for Qwen-Image Series, which uses masked attention with variable sequence length. 
diff --git a/examples/grpo/lora/qwen_image_edit_plus/default.yaml b/examples/grpo/lora/qwen_image_edit_plus/default.yaml index 42302ee6..cbe66e45 100644 --- a/examples/grpo/lora/qwen_image_edit_plus/default.yaml +++ b/examples/grpo/lora/qwen_image_edit_plus/default.yaml @@ -25,7 +25,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Qwen/Qwen-Image-Edit-2509" # Qwen/Qwen-Image-Edit-2509 or Qwen/Qwen-Image-Edit-2511 model_type: "qwen-image-edit-plus" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_varlen_hub' # Attention backend for Qwen-Image Series, which uses masked attention with variable sequence length. diff --git a/examples/grpo/lora/sd3_5/default.yaml b/examples/grpo/lora/sd3_5/default.yaml index 31f1b86b..f52936ee 100644 --- a/examples/grpo/lora/sd3_5/default.yaml +++ b/examples/grpo/lora/sd3_5/default.yaml @@ -26,7 +26,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "stabilityai/stable-diffusion-3.5-medium" # HuggingFace model ID or local path model_type: "sd3-5" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Attention backend for training. 
diff --git a/examples/grpo/lora/sd3_5/nocfg.yaml b/examples/grpo/lora/sd3_5/nocfg.yaml index 29cd7938..bf1d6931 100644 --- a/examples/grpo/lora/sd3_5/nocfg.yaml +++ b/examples/grpo/lora/sd3_5/nocfg.yaml @@ -26,7 +26,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "stabilityai/stable-diffusion-3.5-medium" # HuggingFace model ID or local path model_type: "sd3-5" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Attention backend for training. diff --git a/examples/grpo/lora/wan21/i2v.yaml b/examples/grpo/lora/wan21/i2v.yaml index 144f801f..52f0f13a 100644 --- a/examples/grpo/lora/wan21/i2v.yaml +++ b/examples/grpo/lora/wan21/i2v.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers" # Wan-AI/Wan2.1-I2V-14B-480P-Diffusers / Wan-AI/Wan2.1-I2V-14B-720P-Diffusers model_type: "wan2_i2v" # wan2_t2v, wan2_i2v, wan2_v2v - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. 
Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/wan21/t2v.yaml b/examples/grpo/lora/wan21/t2v.yaml index 0d68323c..55a46b9c 100644 --- a/examples/grpo/lora/wan21/t2v.yaml +++ b/examples/grpo/lora/wan21/t2v.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Wan-AI/Wan2.1-T2V-14B-Diffusers" # Wan-AI/Wan2.1-T2V-14B-Diffusers / Wan-AI/Wan2.2-T2V-A14B-Diffusers model_type: "wan2_t2v" # wan2_t2v, wan2_i2v, wan2_v2v - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/wan21/v2v.yaml b/examples/grpo/lora/wan21/v2v.yaml index 44fd3cf8..93929834 100644 --- a/examples/grpo/lora/wan21/v2v.yaml +++ b/examples/grpo/lora/wan21/v2v.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Wan-AI/Wan2.1-T2V-14B-Diffusers" # Wan-AI/Wan2.1-T2V-1.3B-Diffusers / Wan-AI/Wan2.1-T2V-14B-Diffusers model_type: "wan2_v2v" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. 
Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/wan22/i2v.yaml b/examples/grpo/lora/wan22/i2v.yaml index 96db5fac..105e62c4 100644 --- a/examples/grpo/lora/wan22/i2v.yaml +++ b/examples/grpo/lora/wan22/i2v.yaml @@ -33,7 +33,7 @@ model: target_modules: "transformer.default" model_name_or_path: "Wan-AI/Wan2.2-I2V-A14B-Diffusers" # Wan-AI/Wan2.2-TI2V-5B-Diffusers / Wan-AI/Wan2.2-I2V-A14B-Diffusers model_type: "wan2_i2v" # wan2_t2v, wan2_i2v, wan2_v2v - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/wan22/t2v.yaml b/examples/grpo/lora/wan22/t2v.yaml index d279ed84..3d3969d7 100644 --- a/examples/grpo/lora/wan22/t2v.yaml +++ b/examples/grpo/lora/wan22/t2v.yaml @@ -33,7 +33,7 @@ model: target_modules: "default" model_name_or_path: "Wan-AI/Wan2.2-T2V-A14B-Diffusers" # Wan-AI/Wan2.1-T2V-14B-Diffusers / Wan-AI/Wan2.2-T2V-A14B-Diffusers model_type: "wan2_t2v" # wan2_t2v, wan2_i2v, wan2_v2v - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. 
Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/z_image/default.yaml b/examples/grpo/lora/z_image/default.yaml index a517bd2b..07da324b 100644 --- a/examples/grpo/lora/z_image/default.yaml +++ b/examples/grpo/lora/z_image/default.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Tongyi-MAI/Z-Image" # Options: Tongyi-MAI/Z-Image, Tongyi-MAI/Z-Image-Turbo model_type: "z-image" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/grpo/lora/z_image_turbo/default.yaml b/examples/grpo/lora/z_image_turbo/default.yaml index 293dc222..fdac1937 100644 --- a/examples/grpo/lora/z_image_turbo/default.yaml +++ b/examples/grpo/lora/z_image_turbo/default.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Tongyi-MAI/Z-Image-Turbo" # HuggingFace model ID or local path model_type: "z-image" # Options: flux1, flux1-kontext, flux2, qwenimage, qwenimage-edit, z-image - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. 
Null to auto-detect based on `finetune_type` log: diff --git a/examples/nft/full/flux1/default.yaml b/examples/nft/full/flux1/default.yaml index 61d90c45..502a2235 100644 --- a/examples/nft/full/flux1/default.yaml +++ b/examples/nft/full/flux1/default.yaml @@ -21,7 +21,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.1-dev" # HuggingFace model ID or local path model_type: "flux1" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/nft/full/flux2_klein_base/default.yaml b/examples/nft/full/flux2_klein_base/default.yaml index d66bb878..1ddd0675 100644 --- a/examples/nft/full/flux2_klein_base/default.yaml +++ b/examples/nft/full/flux2_klein_base/default.yaml @@ -21,7 +21,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.2-klein-base-4B" # Options: black-forest-labs/FLUX.2-klein-base-4B, black-forest-labs/FLUX.2-klein-base-9B model_type: "flux2-klein" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. 
Null to auto-detect based on `finetune_type` log: diff --git a/examples/nft/full/wan22/t2v.yaml b/examples/nft/full/wan22/t2v.yaml index 6cf4ddaa..4e788752 100644 --- a/examples/nft/full/wan22/t2v.yaml +++ b/examples/nft/full/wan22/t2v.yaml @@ -26,7 +26,7 @@ model: target_modules: "default" model_name_or_path: "Wan-AI/Wan2.2-T2V-A14B-Diffusers" # Wan-AI/Wan2.1-T2V-14B-Diffusers / Wan-AI/Wan2.2-T2V-A14B-Diffusers model_type: "wan2_t2v" # wan2_t2v, wan2_i2v, wan2_v2v - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/nft/full/z_image/default.yaml b/examples/nft/full/z_image/default.yaml index 83d6b4c5..50252fb4 100644 --- a/examples/nft/full/z_image/default.yaml +++ b/examples/nft/full/z_image/default.yaml @@ -22,7 +22,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Tongyi-MAI/Z-Image" # Options: Tongyi-MAI/Z-Image, Tongyi-MAI/Z-Image-Turbo model_type: "z-image" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. 
Null to auto-detect based on `finetune_type` log: diff --git a/examples/nft/full/z_image_turbo/default.yaml b/examples/nft/full/z_image_turbo/default.yaml index b08702f5..3af795a2 100644 --- a/examples/nft/full/z_image_turbo/default.yaml +++ b/examples/nft/full/z_image_turbo/default.yaml @@ -22,7 +22,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Tongyi-MAI/Z-Image-Turbo" # Options: Tongyi-MAI/Z-Image, Tongyi-MAI/Z-Image-Turbo model_type: "z-image" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` log: diff --git a/examples/nft/lora/flux1/default.yaml b/examples/nft/lora/flux1/default.yaml index 93cdb5ac..88d33b52 100644 --- a/examples/nft/lora/flux1/default.yaml +++ b/examples/nft/lora/flux1/default.yaml @@ -23,7 +23,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.1-dev" # HuggingFace model ID or local path model_type: "flux1" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Use flash attention 3 backend. 
diff --git a/examples/nft/lora/flux1/rational_rewards_t2i.yaml b/examples/nft/lora/flux1/rational_rewards_t2i.yaml index 38071fb9..ff40b2c3 100644 --- a/examples/nft/lora/flux1/rational_rewards_t2i.yaml +++ b/examples/nft/lora/flux1/rational_rewards_t2i.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" model_name_or_path: "black-forest-labs/FLUX.1-dev" model_type: "flux1" - resume_path: null + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null log: diff --git a/examples/nft/lora/flux1_kontext/rational_rewards_edit.yaml b/examples/nft/lora/flux1_kontext/rational_rewards_edit.yaml index bf0f900e..ad2ebdc8 100644 --- a/examples/nft/lora/flux1_kontext/rational_rewards_edit.yaml +++ b/examples/nft/lora/flux1_kontext/rational_rewards_edit.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" model_name_or_path: "black-forest-labs/FLUX.1-Kontext-dev" model_type: "flux1-kontext" - resume_path: null + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null log: diff --git a/examples/nft/lora/flux2_klein_base/default.yaml b/examples/nft/lora/flux2_klein_base/default.yaml index 9274f075..a79fedb8 100644 --- a/examples/nft/lora/flux2_klein_base/default.yaml +++ b/examples/nft/lora/flux2_klein_base/default.yaml @@ -23,7 +23,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "black-forest-labs/FLUX.2-klein-base-4B" # Options: black-forest-labs/FLUX.2-klein-base-4B, black-forest-labs/FLUX.2-klein-base-9B model_type: "flux2-klein" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. 
Null to auto-detect based on `finetune_type` log: diff --git a/examples/nft/lora/qwen_image/rational_rewards_t2i.yaml b/examples/nft/lora/qwen_image/rational_rewards_t2i.yaml index 7559e802..e99289f8 100644 --- a/examples/nft/lora/qwen_image/rational_rewards_t2i.yaml +++ b/examples/nft/lora/qwen_image/rational_rewards_t2i.yaml @@ -25,7 +25,7 @@ model: target_modules: "default" model_name_or_path: "Qwen/Qwen-Image-2512" model_type: "qwen-image" - resume_path: null + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null log: diff --git a/examples/nft/lora/qwen_image_edit_plus/rational_rewards_edit.yaml b/examples/nft/lora/qwen_image_edit_plus/rational_rewards_edit.yaml index 8137aade..b326cf76 100644 --- a/examples/nft/lora/qwen_image_edit_plus/rational_rewards_edit.yaml +++ b/examples/nft/lora/qwen_image_edit_plus/rational_rewards_edit.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" model_name_or_path: "Qwen/Qwen-Image-Edit-2509" model_type: "qwen-image-edit-plus" - resume_path: null + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null log: diff --git a/examples/nft/lora/sd3_5/default.yaml b/examples/nft/lora/sd3_5/default.yaml index 7eafa33b..ba2b5cb8 100644 --- a/examples/nft/lora/sd3_5/default.yaml +++ b/examples/nft/lora/sd3_5/default.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "stabilityai/stable-diffusion-3.5-medium" model_type: "sd3-5" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. 
Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Attention backend for training. diff --git a/examples/nft/lora/wan21/i2v.yaml b/examples/nft/lora/wan21/i2v.yaml index 23680484..78b9552f 100644 --- a/examples/nft/lora/wan21/i2v.yaml +++ b/examples/nft/lora/wan21/i2v.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" model_name_or_path: "Wan-AI/Wan2.1-I2V-14B-720P-Diffusers" # Wan-AI/Wan2.1-I2V-14B-480P-Diffusers / Wan-AI/Wan2.1-I2V-14B-480P-Diffusers model_type: "wan2_i2v" # wan2_t2v, wan2_i2v, wan2_v2v - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Use flash attention 3 backend. diff --git a/examples/nft/lora/wan21/t2v.yaml b/examples/nft/lora/wan21/t2v.yaml index b2a215ae..3c7ce05f 100644 --- a/examples/nft/lora/wan21/t2v.yaml +++ b/examples/nft/lora/wan21/t2v.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" model_name_or_path: "Wan-AI/Wan2.1-T2V-1.3B-Diffusers" # Wan-AI/Wan2.1-T2V-1.3B-Diffusers / Wan-AI/Wan2.1-T2V-14B-Diffusers model_type: "wan2_t2v" # wan2_t2v, wan2_i2v, wan2_v2v - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Use flash attention 3 backend. 
diff --git a/examples/nft/lora/wan22/t2v.yaml b/examples/nft/lora/wan22/t2v.yaml index a493ec46..546e592a 100644 --- a/examples/nft/lora/wan22/t2v.yaml +++ b/examples/nft/lora/wan22/t2v.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" model_name_or_path: "Wan-AI/Wan2.2-T2V-A14B-Diffusers" # Wan-AI/Wan2.2-TI2V-5B-Diffusers / Wan-AI/Wan2.2-T2V-A14B-Diffusers model_type: "wan2_t2v" # wan2_t2v, wan2_i2v, wan2_v2v - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Use flash attention 3 backend. diff --git a/examples/nft/lora/z_image/default.yaml b/examples/nft/lora/z_image/default.yaml index 50e9fdce..c0b809dd 100644 --- a/examples/nft/lora/z_image/default.yaml +++ b/examples/nft/lora/z_image/default.yaml @@ -24,7 +24,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "Tongyi-MAI/Z-Image" # Options: Tongyi-MAI/Z-Image, Tongyi-MAI/Z-Image-Turbo model_type: "z-image" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. 
Null to auto-detect based on `finetune_type` log: diff --git a/examples/template/sd3_5/async_reward.yaml b/examples/template/sd3_5/async_reward.yaml index a7f19e1d..db004bfb 100644 --- a/examples/template/sd3_5/async_reward.yaml +++ b/examples/template/sd3_5/async_reward.yaml @@ -23,7 +23,7 @@ model: target_modules: "default" # Options: all, default, or list of module names like ["to_k", "to_q", "to_v", "to_out.0"] model_name_or_path: "stabilityai/stable-diffusion-3.5-medium" # HuggingFace model ID or local path model_type: "sd3-5" - resume_path: null # Path to load previous checkpoint/lora adapter + resume_path: null # Local path or HF repo id (e.g. 'owner/repo[/subdir][@rev]') for previous checkpoint/lora adapter resume_type: null # Options: lora, full, state. Null to auto-detect based on `finetune_type` # attn_backend: '_flash_3_hub' # Attention backend for training. diff --git a/src/flow_factory/hparams/model_args.py b/src/flow_factory/hparams/model_args.py index 9c7de4e0..d86c7947 100644 --- a/src/flow_factory/hparams/model_args.py +++ b/src/flow_factory/hparams/model_args.py @@ -80,7 +80,15 @@ class ModelArguments(ArgABC): resume_path : Optional[str] = field( default=None, - metadata={"help": "Resume from checkpoint directory."} + metadata={ + "help": "Resume from checkpoint. Accepts either a local directory or a " + "Hugging Face repo spec ('owner/repo[/subfolder][@revision]', or " + "explicit 'hf://owner/repo[/subfolder][@revision]'). When a local " + "path doesn't exist, falls back to Hugging Face Hub download. " + "Multi-node: HF_TOKEN must be set on every node; downloads happen " + "once per node; consider HF_HUB_ENABLE_HF_TRANSFER=1 for large " + "checkpoints to avoid NCCL watchdog timeouts." 
+ } ) resume_type : Optional[Literal['lora', 'full', 'state']] = field( diff --git a/src/flow_factory/models/abc.py b/src/flow_factory/models/abc.py index d3a00572..d1a36818 100644 --- a/src/flow_factory/models/abc.py +++ b/src/flow_factory/models/abc.py @@ -38,6 +38,7 @@ from peft import get_peft_model, LoraConfig, PeftModel from huggingface_hub import split_torch_state_dict_into_shards +from huggingface_hub.errors import RepositoryNotFoundError, HfHubHTTPError from accelerate import Accelerator, DistributedType from accelerate.state import PartialState from accelerate.utils.modeling import ( @@ -58,6 +59,9 @@ mapping_lora_state_dict, infer_lora_config, infer_target_modules, + parse_hf_checkpoint_path, + download_hf_checkpoint, + HF_PATH_PREFIX, ) from ..samples import BaseSample from ..ema import EMAModuleWrapper @@ -1455,6 +1459,70 @@ def save_checkpoint( logger.info(f"Checkpoint saved successfully to {save_directory}") # -------------------------------------------- Load ------------------------------------------- + def _resolve_checkpoint_path(self, path: str) -> str: + """ + Resolve `path` to a local directory, downloading from Hugging Face Hub when needed. + + Resolution order: + 1. If `path` starts with ``hf://``, strip the prefix and force HF download + (lets users override a colliding local directory). + 2. Otherwise, if `path` exists locally, return it as-is. + 3. Otherwise, parse as ``owner/repo[/subfolder][@revision]`` and download + via Hugging Face Hub. + + Multi-node-safe: all ranks call ``snapshot_download`` directly. Hugging + Face Hub's per-blob ``WeakFileLock`` serializes concurrent calls within + each filesystem domain (cross-node on POSIX-locking shared FS, per-node + on non-shared FS), so exactly one rank per filesystem domain actually + transfers bytes. 
Un-gated (rather than ``is_local_main_process`` plus a + barrier) so a failed download raises uniformly on every affected rank + instead of leaving siblings deadlocked at a barrier the failing rank + never reaches. Residual hazard: a rare single-rank transient failure + (e.g. one node's network blip) can produce asymmetric progress, in + which case the surviving ranks will eventually trip the NCCL watchdog + on the final barrier below. + + Args: + path: Local filesystem path or HF spec (with or without ``hf://`` prefix). + + Returns: + Local directory path ready for the existing checkpoint loaders (an existing local ``path`` is returned unchanged). + + Raises: + FileNotFoundError: When the spec is neither a local path nor a reachable HF repo. + """ + force_hf = path.startswith(HF_PATH_PREFIX) + spec = path[len(HF_PATH_PREFIX):] if force_hf else path + + if not force_hf and os.path.exists(spec): + return spec + + repo_id, subfolder, revision = parse_hf_checkpoint_path(spec) + + try: + local_path = download_hf_checkpoint(repo_id, subfolder, revision) + except (RepositoryNotFoundError, HfHubHTTPError) as e: + raise FileNotFoundError( + f"Checkpoint {path!r} not found locally and could not be fetched " + f"from Hugging Face Hub (repo={repo_id!r}, subfolder={subfolder!r}, " + f"revision={revision!r}). For private repos, ensure HF_TOKEN is set " + f"on ALL nodes." + ) from e + + # Sync after download so downstream loaders enter the lockstep dispatch + # together. On symmetric failure every rank raises above before this + # barrier is reached, so no deadlock; the residual asymmetric-failure + # case is documented in the docstring.
+ self.accelerator.wait_for_everyone() + + if self.accelerator.is_local_main_process: + logger.info( + f"[local rank 0 / global rank {self.accelerator.process_index}] " + f"resolved checkpoint '{path}' -> {local_path}" + ) + + return local_path + @staticmethod def load_sharded_checkpoint(checkpoint_dir: str, index_file: str) -> Dict[str, torch.Tensor]: """Load sharded safetensors checkpoint.""" @@ -1674,8 +1742,11 @@ def load_checkpoint( - None: Auto-detect based on checkpoint directory contents """ path = os.path.expanduser(path) + path = self._resolve_checkpoint_path(path) if not os.path.exists(path): - raise FileNotFoundError(f"Checkpoint path not found: {path}") + raise FileNotFoundError( + f"Checkpoint path not found locally or on Hugging Face Hub: {path!r}" + ) # Auto-detect if not specified if resume_type is None: diff --git a/src/flow_factory/utils/checkpoint.py b/src/flow_factory/utils/checkpoint.py index 14c79227..fa101246 100644 --- a/src/flow_factory/utils/checkpoint.py +++ b/src/flow_factory/utils/checkpoint.py @@ -24,6 +24,7 @@ from typing import Dict, Optional, List, Tuple, Literal from safetensors.torch import save_file, load_file +from huggingface_hub import snapshot_download def mapping_lora_state_dict( state_dict: Dict[str, torch.Tensor], @@ -137,4 +138,121 @@ def infer_target_modules( if match: target_modules.add(match.group(1)) - return sorted(target_modules) \ No newline at end of file + return sorted(target_modules) + + +# ================================ Hugging Face Hub ================================ +HF_PATH_PREFIX = "hf://" + + +def parse_hf_checkpoint_path(path: str) -> Tuple[str, Optional[str], Optional[str]]: + """ + Parse a Hugging Face checkpoint path spec into ``(repo_id, subfolder, revision)``. 
+ + Accepts both bare and ``hf://``-prefixed specs: + - ``owner/repo`` -> (``owner/repo``, None, None) + - ``hf://owner/repo`` -> (``owner/repo``, None, None) + - ``owner/repo/sub/dir`` -> (``owner/repo``, ``sub/dir``, None) + - ``owner/repo@v1.0`` -> (``owner/repo``, None, ``v1.0``) + - ``hf://owner/repo/sub/dir@v1.0`` -> (``owner/repo``, ``sub/dir``, ``v1.0``) + + Args: + path: A bare or ``hf://``-prefixed checkpoint spec. + + Returns: + Tuple of (repo_id, subfolder, revision); subfolder and revision are ``None`` when absent. + + Raises: + ValueError: If the spec lacks the ``owner/repo`` form (at minimum two path segments). + """ + if not isinstance(path, str): + raise TypeError( + f"expected str for path, got {type(path).__name__}: {path!r}" + ) + + spec = path[len(HF_PATH_PREFIX):] if path.startswith(HF_PATH_PREFIX) else path + + # Split off optional @revision (revision token cannot contain '/' or '@'). + revision: Optional[str] = None + if "@" in spec: + spec, revision = spec.rsplit("@", 1) + if not revision or "/" in revision: + raise ValueError( + f"invalid revision in HF checkpoint path: {path!r} " + f"(expected 'owner/repo[/subfolder][@revision]', got revision={revision!r})" + ) + + parts = [p for p in spec.split("/") if p] + if len(parts) < 2: + raise ValueError( + f"invalid HF checkpoint path: {path!r} " + f"(expected at least 'owner/repo', got {len(parts)} non-empty segments)" + ) + + # Reject path-traversal segments. Without this, a spec like + # 'owner/repo/..' would resolve via os.path.join to a directory outside + # the snapshot root and let downstream loaders read from unintended + # locations. Backslashes are rejected to block Windows-style traversal. 
+ for seg in parts: + if seg in (".", "..") or "\\" in seg: + raise ValueError( + f"invalid segment {seg!r} in HF checkpoint path: {path!r} " + f"('.', '..', and backslashes are not allowed)" + ) + + repo_id = "/".join(parts[:2]) + subfolder = "/".join(parts[2:]) if len(parts) > 2 else None + return repo_id, subfolder, revision + + +def download_hf_checkpoint( + repo_id: str, + subfolder: Optional[str] = None, + revision: Optional[str] = None, +) -> str: + """ + Download a Hugging Face checkpoint snapshot and return the local directory path. + + Thin wrapper over ``huggingface_hub.snapshot_download``. When ``subfolder`` is + provided, restricts the download to that subtree via ``allow_patterns`` and + returns the path joined with the subfolder so the caller receives the directory + layout that the existing local-checkpoint loaders expect. + + Authentication is taken from the standard ``HF_TOKEN`` / ``HUGGING_FACE_HUB_TOKEN`` + environment variables (and the local ``~/.cache/huggingface/token`` cache). For + multi-node training the token must be available on every node. + + Args: + repo_id: HF repository identifier in ``owner/repo`` form. + subfolder: Optional subdirectory within the repo to fetch. + revision: Optional git revision (branch, tag, or commit SHA). + + Returns: + Absolute local directory path containing the snapshot (with ``subfolder`` appended when set). + """ + if not isinstance(repo_id, str) or "/" not in repo_id: + raise ValueError( + f"expected 'owner/repo' for repo_id, got {repo_id!r}" + ) + + allow_patterns: Optional[List[str]] = None + if subfolder: + # Match the subfolder itself plus everything beneath it. 
+ allow_patterns = [f"{subfolder}/*", f"{subfolder}/**"] + + local_root = snapshot_download( + repo_id=repo_id, + revision=revision, + allow_patterns=allow_patterns, + ) + + if subfolder: + local_path = os.path.join(local_root, subfolder) + if not os.path.isdir(local_path): + raise FileNotFoundError( + f"HF snapshot for repo_id={repo_id!r} (revision={revision!r}) did not " + f"contain expected subfolder {subfolder!r}; downloaded root={local_root!r}" + ) + return local_path + + return local_root \ No newline at end of file