@@ -179,8 +179,9 @@ def log_callback(line: str) -> None:
179179 )
180180
181181 except asyncio .CancelledError :
182- logger .info ("Job %s was cancelled" , job_id )
183- store ._mark_cancelled (job_id )
182+ if record .status not in TERMINAL_STATUSES :
183+ logger .info ("Job %s was cancelled" , job_id )
184+ store ._mark_cancelled (job_id )
184185 raise
185186
186187 except Exception as exc :
@@ -224,27 +225,27 @@ async def sync_job_data_loop(
224225 if not record .work_dir :
225226 continue
226227
227- # Recover orphaned RUNNING jobs (no active monitor task)
228- if record .task is None or record .task .done ():
229- try :
230- exit_code = await backend .check_job_status (
231- job_id , record .work_dir
232- )
233- if exit_code is not None :
234- logger .warning (
235- "Recovering orphaned job %s (exit code %d)" ,
236- job_id ,
237- exit_code ,
238- )
239- store .mark_finished (job_id , exit_code )
240- store .persist (record )
241- continue
242- except Exception :
228+ try :
229+ exit_code = await backend .check_job_status (
230+ job_id , record .work_dir
231+ )
232+ if exit_code is not None :
243233 logger .warning (
244- "Failed to check orphaned job %s" ,
234+ "Recovering stuck job %s (exit code %d) " ,
245235 job_id ,
246- exc_info = True ,
236+ exit_code ,
247237 )
238+ store .mark_finished (job_id , exit_code )
239+ store .persist (record )
240+ if record .task and not record .task .done ():
241+ record .task .cancel ()
242+ continue
243+ except Exception :
244+ logger .warning (
245+ "Failed to check job %s" ,
246+ job_id ,
247+ exc_info = True ,
248+ )
248249
249250 _flush_logs (store , job_id )
250251 snkmt_db_path = store .get_snkmt_db_path (job_id )
0 commit comments