22
33import logging
44import functools
5- import math , sys
5+ import math , random , sys
66from os import environ
77from time import sleep
88from typing import Any , Generator , Tuple
@@ -154,16 +154,12 @@ def print_status(imanager: IngestionManager, redis, upgrade: bool = False):
154154 def _refresh_status ():
155155 pipeline = redis .pipeline ()
156156 pipeline .get (r_keys .JOB_TYPE )
157- worker_busy = []
157+ worker_busy = ["-" ] * len ( layers )
158158 for layer in layers :
159159 pipeline .scard (f"{ layer } c" )
160160 queue = Queue (f"l{ layer } " , connection = redis )
161161 pipeline .llen (queue .key )
162162 pipeline .zcard (queue .failed_job_registry .key )
163- workers = Worker .all (queue = queue )
164- worker_busy .append (
165- sum ([w .get_state () == WorkerStatus .BUSY for w in workers ])
166- )
167163
168164 results = pipeline .execute ()
169165 job_type = "not_available"
@@ -218,6 +214,7 @@ def queue_layer_helper(
218214 batch_size = int (environ .get ("JOB_BATCH_SIZE" , 10000 ))
219215 timeout_scale = int (environ .get ("TIMEOUT_SCALE_FACTOR" , 1 ))
220216 batches = chunked (chunk_coords , batch_size )
217+ failure_ttl = int (environ .get ("FAILURE_TTL" , 300 ))
221218 for batch in batches :
222219 _coords = get_chunks_not_done (imanager , parent_layer , batch , splits = splits )
223220 # buffer for optimal use of redis memory
@@ -227,6 +224,7 @@ def queue_layer_helper(
227224 sleep (interval )
228225
229226 job_datas = []
227+ retry = int (environ .get ("RETRY_COUNT" , 0 ))
230228 for chunk_coord in _coords :
231229 if splits > 0 :
232230 coord , split = chunk_coord
@@ -238,7 +236,9 @@ def queue_layer_helper(
238236 result_ttl = 0 ,
239237 job_id = jid ,
240238 timeout = f"{ timeout_scale * int (parent_layer * parent_layer )} m" ,
241- retry = Retry (int (environ .get ("RETRY_COUNT" , 1 ))),
239+ retry = Retry (retry ) if retry > 1 else None ,
240+ description = "" ,
241+ failure_ttl = failure_ttl ,
242242 )
243243 )
244244 else :
@@ -249,7 +249,9 @@ def queue_layer_helper(
249249 result_ttl = 0 ,
250250 job_id = chunk_id_str (parent_layer , chunk_coord ),
251251 timeout = f"{ timeout_scale * int (parent_layer * parent_layer )} m" ,
252- retry = Retry (int (environ .get ("RETRY_COUNT" , 1 ))),
252+ retry = Retry (retry ) if retry > 1 else None ,
253+ description = "" ,
254+ failure_ttl = failure_ttl ,
253255 )
254256 )
255257 q .enqueue_many (job_datas )
0 commit comments