From 9b86a63cc31b04edca35bb4569d1a369c97cec95 Mon Sep 17 00:00:00 2001 From: Arjun Krishnakumar Date: Tue, 28 Apr 2026 22:04:20 +0200 Subject: [PATCH] fix: gate run-dir override to submit.py to prevent mid-job wipe Previously, setup_run_directory() would delete an existing run directory whenever debug.enabled and debug.override_existing were both set, including when called from train.py on compute nodes during a running job. Add an allow_override parameter (default False) so the rmtree only fires when submit.py explicitly opts in. --- scripts/submit.py | 2 +- src/post_training/utils/paths.py | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/scripts/submit.py b/scripts/submit.py index b86cbc2..e5faa4d 100644 --- a/scripts/submit.py +++ b/scripts/submit.py @@ -71,7 +71,7 @@ def main() -> None: prefetch_assets(config) # Set up the run directory (so the SLURM script can reference it). - run_dir = setup_run_directory(config) + run_dir = setup_run_directory(config, allow_override=True) logger.info("Run directory: %s", run_dir) if not confirmed: diff --git a/src/post_training/utils/paths.py b/src/post_training/utils/paths.py index a46a89d..9a0277f 100644 --- a/src/post_training/utils/paths.py +++ b/src/post_training/utils/paths.py @@ -33,7 +33,7 @@ def generate_run_name(config: PostTrainingConfig) -> str: return get_backend(config.backend).generate_run_name(config, timestamp) -def setup_run_directory(config: PostTrainingConfig) -> Path: +def setup_run_directory(config: PostTrainingConfig, allow_override: bool = False) -> Path: """Create the run output directory tree and return the run root. Directory layout:: @@ -58,8 +58,13 @@ def setup_run_directory(config: PostTrainingConfig) -> Path: run_dir = base / run_name - # Handle debug override. - if config.debug.enabled and config.debug.override_existing and run_dir.exists(): + # Handle debug override — only when submit.py explicitly opts in. 
+ if ( + allow_override + and config.debug.enabled + and config.debug.override_existing + and run_dir.exists() + ): shutil.rmtree(run_dir) from post_training.backend import get_backend