From ef84d6c013740d744823a8db20110f6280eb2f57 Mon Sep 17 00:00:00 2001 From: "njzjz-bot (driven by OpenClaw (model: gpt-5.5))[bot]" <48687836+njzjz-bot@users.noreply.github.com> Date: Sat, 30 May 2026 06:12:29 +0000 Subject: [PATCH 1/5] fix(train): allow zero-step training with bias adjustment Skip post-training bias adjustment when no training step has run, so zero-step jobs can keep the existing initial-checkpoint behavior without evaluating step -1 learning rates. Authored by OpenClaw (model: custom-chat-jinzhezeng-group/gpt-5.5) --- deepmd/pd/train/training.py | 6 +++++- deepmd/pt/train/training.py | 6 +++++- source/tests/pd/test_training.py | 16 ++++++++++++++++ source/tests/pt/test_training.py | 17 +++++++++++++++++ 4 files changed, 43 insertions(+), 2 deletions(-) diff --git a/deepmd/pd/train/training.py b/deepmd/pd/train/training.py index 7d91218468..3208903b95 100644 --- a/deepmd/pd/train/training.py +++ b/deepmd/pd/train/training.py @@ -1038,7 +1038,11 @@ def log_loss_valid(_task_key: str = "Default") -> dict: if JIT: break - if self.change_bias_after_training and (self.rank == 0 or dist.get_rank() == 0): + if ( + self.change_bias_after_training + and self.num_steps > self.start_step + and (self.rank == 0 or dist.get_rank() == 0) + ): if not self.multi_task: self.model = model_change_out_bias( self.model, diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index a5b799dbdc..8d66a86a9c 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -1745,7 +1745,11 @@ def log_loss_valid(_task_key: str = "Default") -> dict: if JIT: break - if self.change_bias_after_training and (self.rank == 0 or dist.get_rank() == 0): + if ( + self.change_bias_after_training + and self.num_steps > self.start_step + and (self.rank == 0 or dist.get_rank() == 0) + ): if not self.multi_task: self.model = model_change_out_bias( self.model, diff --git a/source/tests/pd/test_training.py b/source/tests/pd/test_training.py index 21b7b3b854..b842444a74 100644 --- a/source/tests/pd/test_training.py +++ b/source/tests/pd/test_training.py @@ -11,6 +11,7 @@ ) import numpy as np +import paddle from deepmd.pd.entrypoints.main import ( get_trainer, @@ -163,6 +164,21 @@ def setUp(self) -> None: self.config["training"]["save_freq"] = 1 enable_prim(True) + def test_zero_step_with_change_bias_saves_initial_checkpoint(self) -> None: + config = deepcopy(self.config) + config["training"]["numb_steps"] = 0 + config["training"]["change_bias_after_training"] = True + trainer = get_trainer(config) + trainer.run() + + self.assertEqual(Path("model.ckpt-0.pd"), trainer.latest_model) + self.assertTrue(Path("model.ckpt-0.pd").exists()) + self.assertEqual(Path("model.ckpt-0.pd"), Path("checkpoint").read_text()) + checkpoint = paddle.load("model.ckpt-0.pd") + train_infos = checkpoint["model"]["_extra_state"]["train_infos"] + self.assertEqual(0, train_infos["step"]) + self.assertEqual(0.0, train_infos["lr"]) + def tearDown(self) -> None: DPTrainTest.tearDown(self) diff --git a/source/tests/pt/test_training.py b/source/tests/pt/test_training.py index c4e58c0368..712ae613d3 100644 --- a/source/tests/pt/test_training.py +++ b/source/tests/pt/test_training.py @@ -263,6 +263,23 @@ def test_yaml_input(self) -> None: ) self.assertTrue(Path("out.json").exists()) + def test_zero_step_with_change_bias_saves_initial_checkpoint(self) -> None: + config = deepcopy(self.config) + config["training"]["numb_steps"] = 0 + config["training"]["change_bias_after_training"] = True + trainer = get_trainer(config) + trainer.run() + + self.assertEqual(Path("model.ckpt-0.pt"), trainer.latest_model) + self.assertTrue(Path("model.ckpt-0.pt").exists()) + self.assertEqual(Path("model.ckpt-0.pt"), Path("checkpoint").read_text()) + checkpoint = torch.load( + "model.ckpt-0.pt", map_location="cpu", weights_only=True + ) + train_infos = checkpoint["model"]["_extra_state"]["train_infos"] + self.assertEqual(0, train_infos["step"]) + self.assertEqual(0.0, train_infos["lr"]) + def tearDown(self) -> None: DPTrainTest.tearDown(self) for ff in ["out.json", "input.yaml"]: From 631039c5470fffb544a5a500d2169f6e1f3ad085 Mon Sep 17 00:00:00 2001 From: "njzjz-bot (driven by OpenClaw (model: gpt-5.5))[bot]" <48687836+njzjz-bot@users.noreply.github.com> Date: Sat, 30 May 2026 07:55:10 +0000 Subject: [PATCH 2/5] test(train): fix zero-step checkpoint assertions Compare checkpoint pointers as paths and add timeout guards to zero-step training regression tests. Authored by OpenClaw (model: custom-chat-jinzhezeng-group/gpt-5.5) --- source/tests/pd/test_training.py | 43 +++++++++++++++++++++++++++++++- source/tests/pt/test_training.py | 6 ++++- 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/source/tests/pd/test_training.py b/source/tests/pd/test_training.py index b842444a74..a416d54e2c 100644 --- a/source/tests/pd/test_training.py +++ b/source/tests/pd/test_training.py @@ -2,13 +2,21 @@ import json import os import shutil +import signal import unittest +from collections.abc import ( + Callable, +) from copy import ( deepcopy, ) from pathlib import ( Path, ) +from typing import ( + Any, + TypeVar, +) import numpy as np import paddle @@ -32,6 +40,35 @@ model_se_e2_a, ) +_F = TypeVar("_F", bound=Callable[..., Any]) + + +def _training_timeout(seconds: int) -> Callable[[_F], _F]: + """Limit real training tests on platforms that support SIGALRM.""" + + def decorate(func: _F) -> _F: + if not hasattr(signal, "SIGALRM"): + return func + + def wrapped(*args: Any, **kwargs: Any) -> Any: + def raise_timeout(signum: int, frame: Any) -> None: + raise TimeoutError(f"training test exceeded {seconds} seconds") + + previous_handler = signal.signal(signal.SIGALRM, raise_timeout) + signal.alarm(seconds) + try: + return func(*args, **kwargs) + finally: + signal.alarm(0) + signal.signal(signal.SIGALRM, previous_handler) + + return wrapped # type: ignore[return-value] + + return decorate + + +TRAINING_TEST_TIMEOUT = _training_timeout(60) + class DPTrainTest: def test_dp_train(self) -> None: @@ -164,6 +201,7 @@ def setUp(self) -> None: self.config["training"]["save_freq"] = 1 enable_prim(True) + @TRAINING_TEST_TIMEOUT def test_zero_step_with_change_bias_saves_initial_checkpoint(self) -> None: config = deepcopy(self.config) config["training"]["numb_steps"] = 0 @@ -173,7 +211,10 @@ def test_zero_step_with_change_bias_saves_initial_checkpoint(self) -> None: self.assertEqual(Path("model.ckpt-0.pd"), trainer.latest_model) self.assertTrue(Path("model.ckpt-0.pd").exists()) - self.assertEqual(Path("model.ckpt-0.pd"), Path("checkpoint").read_text()) + self.assertEqual( + Path("model.ckpt-0.pd"), + Path(Path("checkpoint").read_text().strip()), + ) checkpoint = paddle.load("model.ckpt-0.pd") train_infos = checkpoint["model"]["_extra_state"]["train_infos"] self.assertEqual(0, train_infos["step"]) diff --git a/source/tests/pt/test_training.py b/source/tests/pt/test_training.py index 712ae613d3..3c194f2ac3 100644 --- a/source/tests/pt/test_training.py +++ b/source/tests/pt/test_training.py @@ -263,6 +263,7 @@ def test_yaml_input(self) -> None: ) self.assertTrue(Path("out.json").exists()) + @TRAINING_TEST_TIMEOUT def test_zero_step_with_change_bias_saves_initial_checkpoint(self) -> None: config = deepcopy(self.config) config["training"]["numb_steps"] = 0 @@ -272,7 +273,10 @@ def test_zero_step_with_change_bias_saves_initial_checkpoint(self) -> None: self.assertEqual(Path("model.ckpt-0.pt"), trainer.latest_model) self.assertTrue(Path("model.ckpt-0.pt").exists()) - self.assertEqual(Path("model.ckpt-0.pt"), Path("checkpoint").read_text()) + self.assertEqual( + Path("model.ckpt-0.pt"), + Path(Path("checkpoint").read_text().strip()), + ) checkpoint = torch.load( "model.ckpt-0.pt", map_location="cpu", weights_only=True ) From 5da14129a30d24957d30b6d005b3341a81f69891 Mon Sep 17 00:00:00 2001 From: "njzjz-bot (driven by OpenClaw (model: gpt-5.5))[bot]" <48687836+njzjz-bot@users.noreply.github.com> Date: Sat, 30 May 2026 08:03:39 +0000 Subject: [PATCH 3/5] Revert "test(train): fix zero-step checkpoint assertions" This reverts commit 631039c5470fffb544a5a500d2169f6e1f3ad085. --- source/tests/pd/test_training.py | 43 +------------------------------- source/tests/pt/test_training.py | 6 +---- 2 files changed, 2 insertions(+), 47 deletions(-) diff --git a/source/tests/pd/test_training.py b/source/tests/pd/test_training.py index a416d54e2c..b842444a74 100644 --- a/source/tests/pd/test_training.py +++ b/source/tests/pd/test_training.py @@ -2,21 +2,13 @@ import json import os import shutil -import signal import unittest -from collections.abc import ( - Callable, -) from copy import ( deepcopy, ) from pathlib import ( Path, ) -from typing import ( - Any, - TypeVar, -) import numpy as np import paddle @@ -40,35 +32,6 @@ model_se_e2_a, ) -_F = TypeVar("_F", bound=Callable[..., Any]) - - -def _training_timeout(seconds: int) -> Callable[[_F], _F]: - """Limit real training tests on platforms that support SIGALRM.""" - - def decorate(func: _F) -> _F: - if not hasattr(signal, "SIGALRM"): - return func - - def wrapped(*args: Any, **kwargs: Any) -> Any: - def raise_timeout(signum: int, frame: Any) -> None: - raise TimeoutError(f"training test exceeded {seconds} seconds") - - previous_handler = signal.signal(signal.SIGALRM, raise_timeout) - signal.alarm(seconds) - try: - return func(*args, **kwargs) - finally: - signal.alarm(0) - signal.signal(signal.SIGALRM, previous_handler) - - return wrapped # type: ignore[return-value] - - return decorate - - -TRAINING_TEST_TIMEOUT = _training_timeout(60) - class DPTrainTest: def test_dp_train(self) -> None: @@ -201,7 +164,6 @@ def setUp(self) -> None: self.config["training"]["save_freq"] = 1 enable_prim(True) - @TRAINING_TEST_TIMEOUT def test_zero_step_with_change_bias_saves_initial_checkpoint(self) -> None: config = deepcopy(self.config) config["training"]["numb_steps"] = 0 @@ -211,10 +173,7 @@ def test_zero_step_with_change_bias_saves_initial_checkpoint(self) -> None: self.assertEqual(Path("model.ckpt-0.pd"), trainer.latest_model) self.assertTrue(Path("model.ckpt-0.pd").exists()) - self.assertEqual( - Path("model.ckpt-0.pd"), - Path(Path("checkpoint").read_text().strip()), - ) + self.assertEqual(Path("model.ckpt-0.pd"), Path("checkpoint").read_text()) checkpoint = paddle.load("model.ckpt-0.pd") train_infos = checkpoint["model"]["_extra_state"]["train_infos"] self.assertEqual(0, train_infos["step"]) diff --git a/source/tests/pt/test_training.py b/source/tests/pt/test_training.py index 3c194f2ac3..712ae613d3 100644 --- a/source/tests/pt/test_training.py +++ b/source/tests/pt/test_training.py @@ -263,7 +263,6 @@ def test_yaml_input(self) -> None: ) self.assertTrue(Path("out.json").exists()) - @TRAINING_TEST_TIMEOUT def test_zero_step_with_change_bias_saves_initial_checkpoint(self) -> None: config = deepcopy(self.config) config["training"]["numb_steps"] = 0 @@ -273,10 +272,7 @@ def test_zero_step_with_change_bias_saves_initial_checkpoint(self) -> None: self.assertEqual(Path("model.ckpt-0.pt"), trainer.latest_model) self.assertTrue(Path("model.ckpt-0.pt").exists()) - self.assertEqual( - Path("model.ckpt-0.pt"), - Path(Path("checkpoint").read_text().strip()), - ) + self.assertEqual(Path("model.ckpt-0.pt"), Path("checkpoint").read_text()) checkpoint = torch.load( "model.ckpt-0.pt", map_location="cpu", weights_only=True ) From d27334cf0f74c4562456907187836128aad904f9 Mon Sep 17 00:00:00 2001 From: "njzjz-bot (driven by OpenClaw (model: gpt-5.5))[bot]" <48687836+njzjz-bot@users.noreply.github.com> Date: Sat, 30 May 2026 08:05:11 +0000 Subject: [PATCH 4/5] test(train): fix zero-step checkpoint assertions Compare checkpoint pointers as paths without adding timeout guards, since the regression covers the zero-step no-op path. Authored by OpenClaw (model: custom-chat-jinzhezeng-group/gpt-5.5) --- source/tests/pd/test_training.py | 5 ++++- source/tests/pt/test_training.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/source/tests/pd/test_training.py b/source/tests/pd/test_training.py index b842444a74..a159394585 100644 --- a/source/tests/pd/test_training.py +++ b/source/tests/pd/test_training.py @@ -173,7 +173,10 @@ def test_zero_step_with_change_bias_saves_initial_checkpoint(self) -> None: self.assertEqual(Path("model.ckpt-0.pd"), trainer.latest_model) self.assertTrue(Path("model.ckpt-0.pd").exists()) - self.assertEqual(Path("model.ckpt-0.pd"), Path("checkpoint").read_text()) + self.assertEqual( + Path("model.ckpt-0.pd"), + Path(Path("checkpoint").read_text().strip()), + ) checkpoint = paddle.load("model.ckpt-0.pd") train_infos = checkpoint["model"]["_extra_state"]["train_infos"] self.assertEqual(0, train_infos["step"]) diff --git a/source/tests/pt/test_training.py b/source/tests/pt/test_training.py index 712ae613d3..950fb9c51c 100644 --- a/source/tests/pt/test_training.py +++ b/source/tests/pt/test_training.py @@ -272,7 +272,10 @@ def test_zero_step_with_change_bias_saves_initial_checkpoint(self) -> None: self.assertEqual(Path("model.ckpt-0.pt"), trainer.latest_model) self.assertTrue(Path("model.ckpt-0.pt").exists()) - self.assertEqual(Path("model.ckpt-0.pt"), Path("checkpoint").read_text()) + self.assertEqual( + Path("model.ckpt-0.pt"), + Path(Path("checkpoint").read_text().strip()), + ) checkpoint = torch.load( "model.ckpt-0.pt", map_location="cpu", weights_only=True ) From 3d7168fb007cd5092fc1124318bb52a7ca70bb0e Mon Sep 17 00:00:00 2001 From: "njzjz-bot (driven by OpenClaw (model: gpt-5.5))[bot]" <48687836+njzjz-bot@users.noreply.github.com> Date: Sat, 30 May 2026 09:57:54 +0000 Subject: [PATCH 5/5] test(train): use configured zero-step checkpoint path Build the expected zero-step checkpoint path from trainer.save_ckpt so the regression follows each test fixture's configured checkpoint prefix. Authored by OpenClaw (model: custom-chat-jinzhezeng-group/gpt-5.5) --- source/tests/pd/test_training.py | 9 +++++---- source/tests/pt/test_training.py | 11 +++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/source/tests/pd/test_training.py b/source/tests/pd/test_training.py index a159394585..d7cdf96577 100644 --- a/source/tests/pd/test_training.py +++ b/source/tests/pd/test_training.py @@ -171,13 +171,14 @@ def test_zero_step_with_change_bias_saves_initial_checkpoint(self) -> None: trainer = get_trainer(config) trainer.run() - self.assertEqual(Path("model.ckpt-0.pd"), trainer.latest_model) - self.assertTrue(Path("model.ckpt-0.pd").exists()) + expected_model = Path(trainer.save_ckpt + "-0.pd") + self.assertEqual(expected_model, trainer.latest_model) + self.assertTrue(expected_model.exists()) self.assertEqual( - Path("model.ckpt-0.pd"), + expected_model, Path(Path("checkpoint").read_text().strip()), ) - checkpoint = paddle.load("model.ckpt-0.pd") + checkpoint = paddle.load(expected_model) train_infos = checkpoint["model"]["_extra_state"]["train_infos"] self.assertEqual(0, train_infos["step"]) self.assertEqual(0.0, train_infos["lr"]) diff --git a/source/tests/pt/test_training.py b/source/tests/pt/test_training.py index 950fb9c51c..e776074f5e 100644 --- a/source/tests/pt/test_training.py +++ b/source/tests/pt/test_training.py @@ -270,15 +270,14 @@ def test_zero_step_with_change_bias_saves_initial_checkpoint(self) -> None: trainer = get_trainer(config) trainer.run() - self.assertEqual(Path("model.ckpt-0.pt"), trainer.latest_model) - self.assertTrue(Path("model.ckpt-0.pt").exists()) + expected_model = Path(trainer.save_ckpt + "-0.pt") + self.assertEqual(expected_model, trainer.latest_model) + self.assertTrue(expected_model.exists()) self.assertEqual( - Path("model.ckpt-0.pt"), + expected_model, Path(Path("checkpoint").read_text().strip()), ) - checkpoint = torch.load( - "model.ckpt-0.pt", map_location="cpu", weights_only=True - ) + checkpoint = torch.load(expected_model, map_location="cpu", weights_only=True) train_infos = checkpoint["model"]["_extra_state"]["train_infos"] self.assertEqual(0, train_infos["step"]) self.assertEqual(0.0, train_infos["lr"])