Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,14 @@ def save_hf_model(self, bridge, model: MegatronModelWrapper, output_dir: str, to

# Only rank 0 saves the Huggingface config and tokenizer.
if self.is_rank_0():
# Preserve any custom modeling artifacts (e.g. modeling_*.py,
# special_tokens_map.json, auto_map-referenced files) that
# trust_remote_code models depend on. save_hf_configs below
# overwrites config.json/tokenizer files with the strategy's
# current view, but save_artifacts is required to copy the
# custom Python modules and other artifacts that
# save_pretrained() alone does not emit.
bridge.hf_pretrained.save_artifacts(work_dir)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

To prevent potential AttributeError or NoneType errors during model saving, it is safer to use defensive programming and check if bridge has a non-None hf_pretrained attribute before calling save_artifacts. If bridge is a custom or mock bridge that does not implement hf_pretrained, this call would otherwise crash the training process at the very end.

Suggested change
bridge.hf_pretrained.save_artifacts(work_dir)
if getattr(bridge, "hf_pretrained", None) is not None:
bridge.hf_pretrained.save_artifacts(work_dir)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The bridge.save_hf_weights call three lines above is unguarded on the same object — if bridge lacks hf_pretrained, it almost certainly lacks save_hf_weights too and we've already crashed.

More importantly, silently skipping save_artifacts is exactly the bug this PR fixes: HF dirs missing modeling_*.py reload as broken trust_remote_code checkpoints. A loud AttributeError surfaces a bridge gap; a silent skip re-introduces data loss. Prefer to fail loudly.

self.save_hf_configs(self.hf_config, work_dir, tokenizer)
self.print(f"Successfully saved HF config and tokenizer to {output_dir}")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -273,3 +273,86 @@ async def test_non_colocated_megatron_merge_lora_still_pauses(self):

dispatch._inference_engine_client.pause_generation.assert_awaited_once()
dispatch._inference_engine_client.resume_generation.assert_awaited_once()


# ---------------------------------------------------------------------------
# save_hf_model: save_artifacts ordering
# ---------------------------------------------------------------------------


@pytest.mark.skipif(not _has_megatron, reason="megatron-core not installed")
class TestSaveHFModelArtifacts:
"""Verify ``save_hf_model`` invokes ``save_artifacts`` rank-0-only and in
the correct order relative to ``save_hf_weights`` / ``save_hf_configs``.
"""

def _build_strategy(self, *, is_rank_0: bool):
from skyrl.backends.skyrl_train.distributed.megatron.megatron_strategy import (
MegatronStrategy,
)

strategy = MegatronStrategy.__new__(MegatronStrategy)
strategy.hf_config = MagicMock(name="hf_config")
strategy.is_rank_0 = MagicMock(return_value=is_rank_0)
strategy.save_hf_configs = MagicMock(name="save_hf_configs")
strategy.print = MagicMock(name="print")
return strategy

def _build_bridge_and_model(self):
bridge = MagicMock(name="bridge")
model = MagicMock(name="model")
return bridge, model

def _patch_module_io(self, *, work_dir: str):
"""Patch ``io`` and ``dist`` at the megatron_strategy module level.

``io.local_work_dir`` is a context manager yielding ``work_dir``.
"""
io_mock = MagicMock(name="io")
io_mock.local_work_dir.return_value.__enter__.return_value = work_dir
io_mock.local_work_dir.return_value.__exit__.return_value = False
dist_mock = MagicMock(name="dist")
return (
patch(
"skyrl.backends.skyrl_train.distributed.megatron.megatron_strategy.io",
io_mock,
),
patch(
"skyrl.backends.skyrl_train.distributed.megatron.megatron_strategy.dist",
dist_mock,
),
io_mock,
dist_mock,
)

def test_rank0_calls_save_artifacts_before_save_hf_configs(self):
strategy = self._build_strategy(is_rank_0=True)
bridge, model = self._build_bridge_and_model()
io_patch, dist_patch, _io_mock, _dist_mock = self._patch_module_io(work_dir="/tmp/work")

parent = MagicMock()
parent.attach_mock(bridge.save_hf_weights, "save_hf_weights")
parent.attach_mock(bridge.hf_pretrained.save_artifacts, "save_artifacts")
parent.attach_mock(strategy.save_hf_configs, "save_hf_configs")

with io_patch, dist_patch:
strategy.save_hf_model(bridge=bridge, model=model, output_dir="/out", tokenizer="tok")

bridge.save_hf_weights.assert_called_once_with(model.actor_module, "/tmp/work")
bridge.hf_pretrained.save_artifacts.assert_called_once_with("/tmp/work")
strategy.save_hf_configs.assert_called_once_with(strategy.hf_config, "/tmp/work", "tok")

call_order = [c[0] for c in parent.mock_calls]
assert call_order == ["save_hf_weights", "save_artifacts", "save_hf_configs"]

def test_non_rank0_skips_save_artifacts_and_save_hf_configs(self):
strategy = self._build_strategy(is_rank_0=False)
bridge, model = self._build_bridge_and_model()
io_patch, dist_patch, _io_mock, _dist_mock = self._patch_module_io(work_dir="/tmp/work")

with io_patch, dist_patch:
strategy.save_hf_model(bridge=bridge, model=model, output_dir="/out", tokenizer="tok")

bridge.save_hf_weights.assert_called_once_with(model.actor_module, "/tmp/work")
bridge.hf_pretrained.save_artifacts.assert_not_called()
strategy.save_hf_configs.assert_not_called()
Loading