1717
1818from app .backends .base import CHUNK_SIZE , SNKMT_DB_FILENAME , ComputeBackend
1919from app .config import SlurmSSHConfig
20- from app .models import JobStatus , WorkflowFileInfo
20+ from app .models import JobStatus , SnkmtJobResponse , WorkflowFileInfo
2121from app .utils import (
2222 build_wrapper_script ,
2323 enforce_error_limit ,
@@ -218,34 +218,21 @@ async def _upload_dir(self, local_dir: str, remote_dir: str) -> None:
218218 msg = f"local tar failed with exit { local_proc .returncode } "
219219 raise RuntimeError (msg )
220220
221- async def prepare (
222- self ,
223- job_id : str ,
224- workflow : str ,
225- git_ref : str | None = None ,
226- ) -> tuple [str , str | None , str | None ]:
227- """Clone/upload workflow and return (work_dir, git_ref, git_sha)."""
228- work_dir = self .work_dir (job_id )
229- git_sha : str | None = None
230-
231- if workflow .startswith (("http://" , "https://" )):
232- cmd = "git clone --depth=1 "
233- if git_ref :
234- cmd += f"--branch { shlex .quote (git_ref )} "
235- cmd += f"{ shlex .quote (workflow )} { shlex .quote (work_dir )} "
236- await self ._run_ssh (cmd )
221+ def _scratch_dir (self ) -> str :
222+ return self ._config .scratch_dir
237223
238- result = await self . _run_ssh (
239- f" git -C { shlex . quote ( work_dir ) } rev-parse HEAD && "
240- f"git -C { shlex .quote (work_dir ) } rev-parse --abbrev-ref HEAD"
241- )
242- lines = result . stdout . strip (). splitlines ()
243- git_sha = lines [ 0 ] if lines else None
244- git_ref = lines [ 1 ] if len ( lines ) > 1 else None
245- else :
246- await self . _upload_dir ( workflow , work_dir )
224+ async def _run_git_cmd ( self , * args : str ) -> str :
225+ """Run a git command over SSH, returning stdout."" "
226+ quoted = [ shlex .quote (a ) for a in args ]
227+ # For init, ensure parent dirs exist remotely
228+ if "init" in args :
229+ repo_path = shlex . quote ( args [ - 1 ])
230+ await self . _run_ssh ( f"mkdir -p $(dirname { repo_path } )" )
231+ result = await self . _run_ssh ( " " . join ( quoted ))
232+ return result . stdout or ""
247233
248- return work_dir , git_ref , git_sha
234+ async def _copy_local_workflow (self , src : str , dst : str ) -> None :
235+ await self ._upload_dir (src , dst )
249236
250237 async def setup (
251238 self ,
@@ -268,7 +255,6 @@ async def setup(
268255 await f .write (content )
269256 logger .debug ("Wrote setup file: %s" , full_path )
270257
271-
272258 async def launch (
273259 self ,
274260 job_id : str ,
@@ -520,6 +506,45 @@ async def sync_snkmt_db(self, job_id: str, work_dir: str, local_path: Path) -> N
520506 finally :
521507 await self ._run_ssh (f"rm -f { shlex .quote (remote_bak )} " , check = False )
522508
def resolve_job_logs(
    self,
    jobs: list[SnkmtJobResponse],
    workflow_files: list[WorkflowFileInfo] | None,
) -> None:
    """Match slurm log files to jobs via structured path convention.

    Expects logs at: logs/slurm/{rule}/{wildcards_string}/{slurm_id}.out
    If multiple logs exist (retries), picks the latest (highest slurm ID).
    """
    # Group candidate .out files under logs/slurm/ by their directory.
    logs_by_dir: dict[str, list[str]] = {}
    for wf in workflow_files or []:
        path = wf["path"]
        if path.startswith("logs/slurm/") and path.endswith(".out"):
            logs_by_dir.setdefault(path.rsplit("/", 1)[0], []).append(path)

    def _numeric_id(path: str) -> int:
        # Non-numeric filenames sort below every real slurm ID.
        stem = path.rsplit("/", 1)[-1].removesuffix(".out")
        try:
            return int(stem)
        except ValueError:
            return -1

    for job in jobs:
        segments = [f"logs/slurm/{job.rule}"]
        if job.wildcards:
            segments.append(
                ",".join(f"{k}={v}" for k, v in sorted(job.wildcards.items()))
            )
        candidates = logs_by_dir.get("/".join(segments), [])
        if candidates:
            # Highest slurm ID wins, i.e. the latest retry.
            job.log = max(candidates, key=_numeric_id)
547+
523548 async def cleanup (
524549 self ,
525550 job_id : str ,
@@ -528,15 +553,13 @@ async def cleanup(
528553 # 1. Kill the wrapper process if a PID file exists
529554 pid_file = f"{ shlex .quote (work_dir )} /.pid"
530555 await self ._run_ssh (
531- f"test -f { pid_file } && "
532- f"kill $(cat { pid_file } ) 2>/dev/null || true" ,
556+ f"test -f { pid_file } && kill $(cat { pid_file } ) 2>/dev/null || true" ,
533557 check = False ,
534558 )
535559 # Wait before sending SIGKILL in case SIGTERM wasn't enough
536560 await asyncio .sleep (5 )
537561 await self ._run_ssh (
538- f"test -f { pid_file } && "
539- f"kill -9 $(cat { pid_file } ) 2>/dev/null || true" ,
562+ f"test -f { pid_file } && kill -9 $(cat { pid_file } ) 2>/dev/null || true" ,
540563 check = False ,
541564 )
542565 # 2. Cancel only SLURM jobs launched from this work_dir
0 commit comments