4848MONITOR_DEAD_SENTINEL = "DEAD"
4949
5050
def _build_status_check_cmd(work_dir_quoted: str) -> str:
    """Build the shell snippet that reports a job's state on stdout.

    The snippet prints one of:

    * the contents of ``<work_dir>/.exitcode`` when that file exists,
    * ``MONITOR_RUNNING_SENTINEL`` when ``kill -0`` succeeds for the PID
      read from ``<work_dir>/.pid``,
    * ``MONITOR_DEAD_SENTINEL`` when neither of the above holds.

    Args:
        work_dir_quoted: Job working directory, already shell-quoted by the
            caller (e.g. with ``shlex.quote``); it is interpolated verbatim.

    Returns:
        A shell command string that can be run on its own or appended to a
        longer command line.
    """
    exitcode_file = f"{work_dir_quoted}/.exitcode"
    pid_file = f"{work_dir_quoted}/.pid"
    read_exitcode = f"test -f {exitcode_file} && cat {exitcode_file}"
    return (
        f"{read_exitcode} "
        f"|| (kill -0 $(cat {pid_file} 2>/dev/null) 2>/dev/null "
        f"&& echo {MONITOR_RUNNING_SENTINEL} "
        # The process can finish and write .exitcode between the first
        # check and the PID probe. Look for the file once more before
        # reporting the job as dead, so a clean exit is not misreported.
        f"|| ({read_exitcode} || echo {MONITOR_DEAD_SENTINEL}))"
    )
59+
60+
def _parse_job_status(status: str) -> int | None:
    """Translate the status-check output into a job result.

    Args:
        status: Output of the status-check command; callers strip
            surrounding whitespace before passing it in.

    Returns:
        ``None`` when the job is still running (or the value cannot be
        parsed), ``-1`` when the process is dead without an exit code, and
        the integer exit code otherwise.
    """
    # Fixed sentinel strings map directly to their result.
    sentinel_results: dict[str, int | None] = {
        MONITOR_RUNNING_SENTINEL: None,
        MONITOR_DEAD_SENTINEL: -1,
    }
    if status in sentinel_results:
        return sentinel_results[status]

    # Anything else is expected to be a numeric exit code.
    try:
        exit_code = int(status)
    except ValueError:
        logger.warning("Unparsable job status value: %r, treating as running", status)
        return None
    else:
        return exit_code
72+
73+
5174def _build_rsync_filter (cache_dirs : list [str ]) -> str :
5275 """Build rsync --include/--exclude args from cache_dirs patterns.
5376
@@ -84,8 +107,8 @@ def _add(arg: str) -> None:
84107class SlurmSSHBackend (ComputeBackend ):
85108 """
86109 Compute backend that connects to an HPC head node via SSH,
87- clones a workflow, runs Snakemake via pixi in a detached process ,
88- and monitors via polling.
110+ fetches a workflow into a bare repo and creates a worktree ,
111+ runs Snakemake via pixi in a detached process, and monitors via polling.
89112 """
90113
91114 def __init__ (self , config : SlurmSSHConfig ) -> None :
@@ -314,16 +337,11 @@ async def monitor(
314337
315338 while True :
316339 try :
317- # Read new log bytes, print a marker, then print the exit
318- # code (or "RUNNING" if still alive)
319340 wd = shlex .quote (work_dir )
320341 cmd = (
321342 f"tail -c +{ offset + 1 } { wd } /.stdout.log 2>/dev/null; "
322343 f"echo '{ MONITOR_LOG_MARKER } '; "
323- f"test -f { wd } /.exitcode && cat { wd } /.exitcode "
324- f"|| (kill -0 $(cat { wd } /.pid 2>/dev/null) 2>/dev/null "
325- f"&& echo { MONITOR_RUNNING_SENTINEL } "
326- f"|| echo { MONITOR_DEAD_SENTINEL } )"
344+ f"{ _build_status_check_cmd (wd )} "
327345 )
328346 result = await self ._run_ssh (cmd , check = False )
329347 stdout = result .stdout or ""
@@ -336,32 +354,20 @@ async def monitor(
336354 consecutive_errors = 0
337355
338356 if new_log_data :
339- # Advance offset so next poll only reads new bytes
340357 offset += len (new_log_data .encode ("utf-8" , errors = "replace" ))
341358 for line in new_log_data .splitlines ():
342359 log_callback (line )
343360
344- # "RUNNING" = still going
345- # "DEAD" = process killed without writing .exitcode
346- # number = exit code
347- if status_part == MONITOR_DEAD_SENTINEL :
361+ exit_code = _parse_job_status (status_part )
362+ if exit_code == - 1 :
348363 logger .warning (
349364 "Job %s: wrapper process died without writing "
350365 ".exitcode (likely OOM or SIGKILL)" ,
351366 job_id ,
352367 )
353368 return - 1
354- if status_part != MONITOR_RUNNING_SENTINEL :
355- try :
356- return int (status_part )
357- except ValueError :
358- logger .warning (
359- "Unexpected status value: %r, treating as running" ,
360- status_part ,
361- )
362- # Sleep for poll_interval before retrying
363- await asyncio .sleep (self ._config .poll_interval )
364- continue
369+ if exit_code is not None :
370+ return exit_code
365371
366372 except (TimeoutError , OSError , asyncssh .Error ) as exc :
367373 consecutive_errors += 1
@@ -374,22 +380,8 @@ async def monitor(
async def check_job_status(self, job_id: str, work_dir: str) -> int | None:
    """Probe the remote job once and report its state without waiting.

    Args:
        job_id: Identifier of the job; not used by this method.
        work_dir: Remote working directory of the job. It is shell-quoted
            here before being placed in the command.

    Returns:
        Whatever ``_parse_job_status`` yields for the probe output:
        ``None`` while the job is running, ``-1`` when the process is dead
        without an exit code, otherwise the integer exit code.
    """
    status_cmd = _build_status_check_cmd(shlex.quote(work_dir))
    # check=False: the outcome is read from stdout rather than from the
    # remote command's exit status.
    ssh_result = await self._run_ssh(status_cmd, check=False)
    raw_status = ssh_result.stdout or ""
    return _parse_job_status(raw_status.strip())
393385
394386 async def check_connectivity (self ) -> bool :
395387 """Check SSH connectivity and scratch filesystem health."""
0 commit comments