From f0748093d336deaae1289b4c3617ae2b2ebc34a3 Mon Sep 17 00:00:00 2001 From: Vu Dinh Date: Fri, 29 May 2026 09:29:29 -0400 Subject: [PATCH 1/3] [fix] Save trust_remote_code artifacts in MegatronStrategy.save_hf_model When exporting a trained Megatron model to Hugging Face format, MegatronStrategy.save_hf_model only invokes bridge.save_hf_weights and the inherited save_hf_configs (which calls config/tokenizer/generation_config save_pretrained). Models that rely on trust_remote_code (e.g. Nemotron-H, custom MoE variants) keep their architecture in modeling_*.py modules and reference auxiliary files via auto_map; none of these are emitted by save_pretrained, so the resulting checkpoint directory is structurally incomplete and cannot be loaded with AutoModel.from_pretrained(..., trust_remote_code=True) without manually copying files from the original source path. This change adds a rank-0 call to bridge.hf_pretrained.save_artifacts(work_dir) before save_hf_configs. PreTrainedBase.save_artifacts copies all custom modeling files (*.py), required ARTIFACTS, and OPTIONAL_ARTIFACTS from the original source path. save_hf_configs runs afterwards so the existing config.json/tokenizer write semantics are preserved (it overwrites the config that save_artifacts wrote with the strategy's current view). Verified bridge API against NVIDIA-NeMo/Megatron-Bridge:src/megatron/bridge/models/hf_pretrained/base.py (save_artifacts) and conversion/auto_bridge.py (hf_pretrained attribute). Verification: - pre-commit hooks (ruff, black, gitleaks): pass - ast.parse + targeted AST walk confirms save_artifacts is invoked from within save_hf_model on rank 0 - Existing save/load coverage in tests/backends/skyrl_train/gpu/test_save_load_model.py is GPU-gated and exercises this path end-to-end on CI; no behavior change expected for non-trust_remote_code models because save_artifacts is a superset of save_hf_configs for those. Signed-off-by: Vu Dinh --- .../skyrl_train/distributed/megatron/megatron_strategy.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/skyrl/backends/skyrl_train/distributed/megatron/megatron_strategy.py b/skyrl/backends/skyrl_train/distributed/megatron/megatron_strategy.py index 0544d3dc29..3c4d097636 100644 --- a/skyrl/backends/skyrl_train/distributed/megatron/megatron_strategy.py +++ b/skyrl/backends/skyrl_train/distributed/megatron/megatron_strategy.py @@ -521,6 +521,14 @@ def save_hf_model(self, bridge, model: MegatronModelWrapper, output_dir: str, to # Only rank 0 saves the Huggingface config and tokenizer. if self.is_rank_0(): + # Preserve any custom modeling artifacts (e.g. modeling_*.py, + # special_tokens_map.json, auto_map-referenced files) that + # trust_remote_code models depend on. save_hf_configs below + # overwrites config.json/tokenizer files with the strategy's + # current view, but save_artifacts is required to copy the + # custom Python modules and other artifacts that + # save_pretrained() alone does not emit. + bridge.hf_pretrained.save_artifacts(work_dir) self.save_hf_configs(self.hf_config, work_dir, tokenizer) self.print(f"Successfully saved HF config and tokenizer to {output_dir}") From 86dd024a01b1da5255558638b75d7fdf99d4096f Mon Sep 17 00:00:00 2001 From: Vu Dinh Date: Fri, 29 May 2026 21:17:55 -0400 Subject: [PATCH 2/3] [test] Add unit tests for MegatronStrategy.save_hf_model save_artifacts ordering Locks in the rank-0-only invocation of bridge.hf_pretrained.save_artifacts and the relative ordering of save_hf_weights -> save_artifacts -> save_hf_configs. Gated on _has_megatron consistent with the rest of test_megatron_correctness.py. Signed-off-by: Vu Dinh --- .../distributed/test_megatron_correctness.py | 93 +++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/tests/backends/skyrl_train/distributed/test_megatron_correctness.py b/tests/backends/skyrl_train/distributed/test_megatron_correctness.py index e57f4f5df3..8974a07d3e 100644 --- a/tests/backends/skyrl_train/distributed/test_megatron_correctness.py +++ b/tests/backends/skyrl_train/distributed/test_megatron_correctness.py @@ -273,3 +273,96 @@ async def test_non_colocated_megatron_merge_lora_still_pauses(self): dispatch._inference_engine_client.pause_generation.assert_awaited_once() dispatch._inference_engine_client.resume_generation.assert_awaited_once() + + +# --------------------------------------------------------------------------- +# save_hf_model: save_artifacts ordering +# --------------------------------------------------------------------------- + + +@pytest.mark.skipif(not _has_megatron, reason="megatron-core not installed") +class TestSaveHFModelArtifacts: + """Verify ``save_hf_model`` invokes ``save_artifacts`` rank-0-only and in + the correct order relative to ``save_hf_weights`` / ``save_hf_configs``. + """ + + def _build_strategy(self, *, is_rank_0: bool): + from skyrl.backends.skyrl_train.distributed.megatron.megatron_strategy import ( + MegatronStrategy, + ) + + strategy = MegatronStrategy.__new__(MegatronStrategy) + strategy.hf_config = MagicMock(name="hf_config") + strategy.is_rank_0 = MagicMock(return_value=is_rank_0) + strategy.save_hf_configs = MagicMock(name="save_hf_configs") + strategy.print = MagicMock(name="print") + return strategy + + def _build_bridge_and_model(self): + bridge = MagicMock(name="bridge") + model = MagicMock(name="model") + return bridge, model + + def _patch_module_io(self, *, work_dir: str): + """Patch ``io`` and ``dist`` at the megatron_strategy module level. + + ``io.local_work_dir`` is a context manager yielding ``work_dir``. + """ + io_mock = MagicMock(name="io") + io_mock.local_work_dir.return_value.__enter__.return_value = work_dir + io_mock.local_work_dir.return_value.__exit__.return_value = False + dist_mock = MagicMock(name="dist") + return ( + patch( + "skyrl.backends.skyrl_train.distributed.megatron.megatron_strategy.io", + io_mock, + ), + patch( + "skyrl.backends.skyrl_train.distributed.megatron.megatron_strategy.dist", + dist_mock, + ), + io_mock, + dist_mock, + ) + + def test_rank0_calls_save_artifacts_before_save_hf_configs(self): + strategy = self._build_strategy(is_rank_0=True) + bridge, model = self._build_bridge_and_model() + io_patch, dist_patch, _io_mock, _dist_mock = self._patch_module_io( + work_dir="/tmp/work" + ) + + parent = MagicMock() + parent.attach_mock(bridge.save_hf_weights, "save_hf_weights") + parent.attach_mock(bridge.hf_pretrained.save_artifacts, "save_artifacts") + parent.attach_mock(strategy.save_hf_configs, "save_hf_configs") + + with io_patch, dist_patch: + strategy.save_hf_model( + bridge=bridge, model=model, output_dir="/out", tokenizer="tok" + ) + + bridge.save_hf_weights.assert_called_once_with(model.actor_module, "/tmp/work") + bridge.hf_pretrained.save_artifacts.assert_called_once_with("/tmp/work") + strategy.save_hf_configs.assert_called_once_with( + strategy.hf_config, "/tmp/work", "tok" + ) + + call_order = [c[0] for c in parent.mock_calls] + assert call_order == ["save_hf_weights", "save_artifacts", "save_hf_configs"] + + def test_non_rank0_skips_save_artifacts_and_save_hf_configs(self): + strategy = self._build_strategy(is_rank_0=False) + bridge, model = self._build_bridge_and_model() + io_patch, dist_patch, _io_mock, _dist_mock = self._patch_module_io( + work_dir="/tmp/work" + ) + + with io_patch, dist_patch: + strategy.save_hf_model( + bridge=bridge, model=model, output_dir="/out", tokenizer="tok" + ) + + bridge.save_hf_weights.assert_called_once_with(model.actor_module, "/tmp/work") + bridge.hf_pretrained.save_artifacts.assert_not_called() + strategy.save_hf_configs.assert_not_called() From 079216c411bdce3b8f919662c454f696686b5f98 Mon Sep 17 00:00:00 2001 From: Vu Dinh Date: Fri, 29 May 2026 21:37:41 -0400 Subject: [PATCH 3/3] [style] Apply black to TestSaveHFModelArtifacts helpers Pure formatting (line-length only); no logic changes. Fixes check_code_quality / black on PR #1726. Signed-off-by: Vu Dinh --- .../distributed/test_megatron_correctness.py | 20 +++++-------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/tests/backends/skyrl_train/distributed/test_megatron_correctness.py b/tests/backends/skyrl_train/distributed/test_megatron_correctness.py index 8974a07d3e..c7cedcd2d9 100644 --- a/tests/backends/skyrl_train/distributed/test_megatron_correctness.py +++ b/tests/backends/skyrl_train/distributed/test_megatron_correctness.py @@ -328,9 +328,7 @@ def _patch_module_io(self, *, work_dir: str): def test_rank0_calls_save_artifacts_before_save_hf_configs(self): strategy = self._build_strategy(is_rank_0=True) bridge, model = self._build_bridge_and_model() - io_patch, dist_patch, _io_mock, _dist_mock = self._patch_module_io( - work_dir="/tmp/work" - ) + io_patch, dist_patch, _io_mock, _dist_mock = self._patch_module_io(work_dir="/tmp/work") parent = MagicMock() parent.attach_mock(bridge.save_hf_weights, "save_hf_weights") @@ -338,15 +336,11 @@ def test_rank0_calls_save_artifacts_before_save_hf_configs(self): parent.attach_mock(strategy.save_hf_configs, "save_hf_configs") with io_patch, dist_patch: - strategy.save_hf_model( - bridge=bridge, model=model, output_dir="/out", tokenizer="tok" - ) + strategy.save_hf_model(bridge=bridge, model=model, output_dir="/out", tokenizer="tok") bridge.save_hf_weights.assert_called_once_with(model.actor_module, "/tmp/work") bridge.hf_pretrained.save_artifacts.assert_called_once_with("/tmp/work") - strategy.save_hf_configs.assert_called_once_with( - strategy.hf_config, "/tmp/work", "tok" - ) + strategy.save_hf_configs.assert_called_once_with(strategy.hf_config, "/tmp/work", "tok") call_order = [c[0] for c in parent.mock_calls] assert call_order == ["save_hf_weights", "save_artifacts", "save_hf_configs"] @@ -354,14 +348,10 @@ def test_rank0_calls_save_artifacts_before_save_hf_configs(self): def test_non_rank0_skips_save_artifacts_and_save_hf_configs(self): strategy = self._build_strategy(is_rank_0=False) bridge, model = self._build_bridge_and_model() - io_patch, dist_patch, _io_mock, _dist_mock = self._patch_module_io( - work_dir="/tmp/work" - ) + io_patch, dist_patch, _io_mock, _dist_mock = self._patch_module_io(work_dir="/tmp/work") with io_patch, dist_patch: - strategy.save_hf_model( - bridge=bridge, model=model, output_dir="/out", tokenizer="tok" - ) + strategy.save_hf_model(bridge=bridge, model=model, output_dir="/out", tokenizer="tok") bridge.save_hf_weights.assert_called_once_with(model.actor_module, "/tmp/work") bridge.hf_pretrained.save_artifacts.assert_not_called()