Skip to content

Commit ff021e5

Browse files
committed
fix(upgrade): avoid reading the entire chunk for split tasks
1 parent 327a360 commit ff021e5

3 files changed

Lines changed: 36 additions & 27 deletions

File tree

pychunkedgraph/ingest/cluster.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ def _post_task_completion(
4545
chunk_str += f"_{split}"
4646
# mark chunk as completed - "c"
4747
imanager.redis.sadd(f"{layer}c", chunk_str)
48+
logging.info(f"{chunk_str} marked as complete")
4849

4950

5051
def create_parent_chunk(
@@ -197,6 +198,8 @@ def _queue_tasks(imanager: IngestionManager, chunk_fn: Callable, coords: Iterabl
197198
q = imanager.get_task_queue(queue_name)
198199
batch_size = int(environ.get("JOB_BATCH_SIZE", 10000))
199200
batches = chunked(coords, batch_size)
201+
retry = int(environ.get("RETRY_COUNT", 0))
202+
failure_ttl = int(environ.get("FAILURE_TTL", 300))
200203
for batch in batches:
201204
_coords = get_chunks_not_done(imanager, 2, batch)
202205
# buffer for optimal use of redis memory
@@ -214,7 +217,9 @@ def _queue_tasks(imanager: IngestionManager, chunk_fn: Callable, coords: Iterabl
214217
timeout=environ.get("L2JOB_TIMEOUT", "3m"),
215218
result_ttl=0,
216219
job_id=chunk_id_str(2, chunk_coord),
217-
retry=Retry(int(environ.get("RETRY_COUNT", 1))),
220+
retry=Retry(retry) if retry > 1 else None,
221+
description="",
222+
failure_ttl=failure_ttl
218223
)
219224
)
220225
q.enqueue_many(job_datas)

pychunkedgraph/ingest/upgrade/parent_layer.py

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
from pychunkedgraph.graph import ChunkedGraph, edges
1515
from pychunkedgraph.graph.attributes import Connectivity, Hierarchy
16-
from pychunkedgraph.graph.utils import serializers
16+
from pychunkedgraph.graph.utils import serializers, basetypes
1717
from pychunkedgraph.graph.types import empty_2d
1818
from pychunkedgraph.utils.general import chunked
1919

@@ -31,7 +31,10 @@ def _populate_nodes_and_children(
3131
) -> dict:
3232
global CHILDREN
3333
if nodes:
34-
CHILDREN = cg.get_children(nodes)
34+
children_map = cg.get_children(nodes)
35+
for k, v in children_map.items():
36+
if len(v):
37+
CHILDREN[k] = v
3538
return
3639
response = cg.range_read_chunk(chunk_id, properties=Hierarchy.Child)
3740
for k, v in response.items():
@@ -188,6 +191,17 @@ def _update_cross_edges_helper(args):
188191
gc.collect()
189192

190193

194+
def _get_split_nodes(
195+
cg: ChunkedGraph, chunk_id: basetypes.CHUNK_ID, split: int, splits: int
196+
):
197+
max_id = cg.client.get_max_node_id(chunk_id)
198+
total = max_id - chunk_id
199+
split_size = int(ceil(total / splits))
200+
start = int(chunk_id + np.uint64(split * split_size))
201+
end = int(start + split_size)
202+
return range(int(start), int(end))
203+
204+
191205
def update_chunk(
192206
cg: ChunkedGraph,
193207
chunk_coords: list[int],
@@ -204,23 +218,12 @@ def update_chunk(
204218
x, y, z = chunk_coords
205219
chunk_id = cg.get_chunk_id(layer=layer, x=x, y=y, z=z)
206220

207-
_populate_nodes_and_children(cg, chunk_id, nodes=nodes)
208-
logging.info(f"_populate_nodes_and_children: {time.time() - start}")
209-
if not CHILDREN:
210-
return
211-
212-
allnodes = list(CHILDREN.keys())
213221
if splits is not None:
214-
nodes = []
215-
split_size = int(ceil(len(allnodes) / splits))
216-
split_nodes = chunked(allnodes, split_size)
217-
for i, _nodes in enumerate(split_nodes):
218-
if i == split:
219-
nodes = list(_nodes)
220-
break
221-
else:
222-
nodes = allnodes
222+
nodes = _get_split_nodes(cg, chunk_id, split, splits)
223223

224+
_populate_nodes_and_children(cg, chunk_id, nodes=nodes)
225+
logging.info(f"_populate_nodes_and_children: {time.time() - start}")
226+
nodes = list(CHILDREN.keys())
224227
if len(nodes) == 0:
225228
return
226229

@@ -267,4 +270,3 @@ def update_chunk(
267270
)
268271
)
269272
logging.info(f"total elaspsed time: {time.time() - start}")
270-
gc.collect()

pychunkedgraph/ingest/utils.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import logging
44
import functools
5-
import math, sys
5+
import math, random, sys
66
from os import environ
77
from time import sleep
88
from typing import Any, Generator, Tuple
@@ -154,16 +154,12 @@ def print_status(imanager: IngestionManager, redis, upgrade: bool = False):
154154
def _refresh_status():
155155
pipeline = redis.pipeline()
156156
pipeline.get(r_keys.JOB_TYPE)
157-
worker_busy = []
157+
worker_busy = ["-"] * len(layers)
158158
for layer in layers:
159159
pipeline.scard(f"{layer}c")
160160
queue = Queue(f"l{layer}", connection=redis)
161161
pipeline.llen(queue.key)
162162
pipeline.zcard(queue.failed_job_registry.key)
163-
workers = Worker.all(queue=queue)
164-
worker_busy.append(
165-
sum([w.get_state() == WorkerStatus.BUSY for w in workers])
166-
)
167163

168164
results = pipeline.execute()
169165
job_type = "not_available"
@@ -218,6 +214,7 @@ def queue_layer_helper(
218214
batch_size = int(environ.get("JOB_BATCH_SIZE", 10000))
219215
timeout_scale = int(environ.get("TIMEOUT_SCALE_FACTOR", 1))
220216
batches = chunked(chunk_coords, batch_size)
217+
failure_ttl = int(environ.get("FAILURE_TTL", 300))
221218
for batch in batches:
222219
_coords = get_chunks_not_done(imanager, parent_layer, batch, splits=splits)
223220
# buffer for optimal use of redis memory
@@ -227,6 +224,7 @@ def queue_layer_helper(
227224
sleep(interval)
228225

229226
job_datas = []
227+
retry = int(environ.get("RETRY_COUNT", 0))
230228
for chunk_coord in _coords:
231229
if splits > 0:
232230
coord, split = chunk_coord
@@ -238,7 +236,9 @@ def queue_layer_helper(
238236
result_ttl=0,
239237
job_id=jid,
240238
timeout=f"{timeout_scale * int(parent_layer * parent_layer)}m",
241-
retry=Retry(int(environ.get("RETRY_COUNT", 1))),
239+
retry=Retry(retry) if retry > 1 else None,
240+
description="",
241+
failure_ttl=failure_ttl,
242242
)
243243
)
244244
else:
@@ -249,7 +249,9 @@ def queue_layer_helper(
249249
result_ttl=0,
250250
job_id=chunk_id_str(parent_layer, chunk_coord),
251251
timeout=f"{timeout_scale * int(parent_layer * parent_layer)}m",
252-
retry=Retry(int(environ.get("RETRY_COUNT", 1))),
252+
retry=Retry(retry) if retry > 1 else None,
253+
description="",
254+
failure_ttl=failure_ttl,
253255
)
254256
)
255257
q.enqueue_many(job_datas)

0 commit comments

Comments (0)