Skip to content

Commit 55b17c1

Browse files
committed
make ingest/delete idempotent and scale celery workers with configurable concurrency
1 parent 638c47a commit 55b17c1

File tree

16 files changed

+477
-21
lines changed

16 files changed

+477
-21
lines changed

README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,15 @@ The UI provides a query page for asking questions about your documents and an up
8484
- **Redis** — Celery broker, rate limiting, cache payloads
8585
- **GCS** — document upload storage
8686

87+
### Scaling the worker
88+
89+
The Celery worker uses the prefork pool with process-level concurrency. Each worker process has its own KG service, Neo4j driver, postgres engine, and LlamaIndex indexes — this per-process isolation contains crashes and avoids races in LlamaIndex internals.
90+
91+
- **Vertical** — tune the `CELERY_WORKER_CONCURRENCY` env var (default `4`) to change the number of worker processes per container. `WORKER_MAX_TASKS_PER_CHILD` (default `100`) recycles processes to bound memory leaks.
92+
- **Horizontal** — run multiple worker containers behind the same Redis broker.
93+
94+
Tasks use `task_acks_late` + `task_reject_on_worker_lost`, so a task is redelivered to another worker if its worker crashes mid-execution.
95+
8796
## Project Structure
8897

8998
```

services/query-engine/.env.example

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,3 +42,5 @@ RATE_LIMIT_INGEST=10/minute
4242

4343
# Celery
4444
CELERY_BROKER_URL=redis://localhost:6379/0
45+
CELERY_WORKER_CONCURRENCY=4
46+
WORKER_MAX_TASKS_PER_CHILD=100

services/query-engine/app/api/v1/knowledge_graph.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from fastapi import APIRouter, Depends, Query, Request, UploadFile
55

66
from app.core.config import settings
7-
from app.core.errors import ServiceUnavailableError
7+
from app.core.errors import NotFoundError, ServiceUnavailableError
88
from app.core.rate_limit import limiter
99
from app.dependencies import get_kg_service, get_upload_service
1010
from app.models.knowledge_graph import (
@@ -52,13 +52,19 @@ async def list_documents(
5252
async def delete_document(
5353
request: Request,
5454
doc_id: str,
55+
service: KnowledgeGraphService = Depends(get_kg_service),
5556
) -> TaskAcceptedResponse:
5657
"""
5758
Submit a document deletion job for background processing.
5859
59-
Deletes the document from all storage layers (Neo4j, pgvector, docstore).
60-
Returns a task ID for polling. Retries automatically on partial failure.
60+
Validates that the document exists synchronously so a typoed doc_id
61+
returns 404 immediately. The actual deletion runs as a Celery task
62+
that deletes from all storage layers (Neo4j, pgvector, docstore).
63+
Returns a task ID for polling.
6164
"""
65+
if not await service.document_exists(doc_id):
66+
raise NotFoundError(detail=f"Document {doc_id} not found")
67+
6268
try:
6369
result = delete_document_task.delay(doc_id=doc_id)
6470
except Exception as exc:

services/query-engine/app/connectors/gcs.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Google Cloud Storage document connector."""
22

3+
import os
34
import tempfile
45
from collections.abc import Iterator
56
from pathlib import Path
@@ -77,6 +78,20 @@ def load_documents(self, config: dict[str, Any]) -> Iterator[Document]:
7778

7879
documents = reader.load_data()
7980

81+
# Attach a stable, fully qualified source path so downstream
82+
# consumers can derive deterministic doc_ids that survive task
83+
# retries and process restarts. SimpleDirectoryReader sets
84+
# metadata["file_path"] to the absolute temp-dir path, which
85+
# is not stable across runs — strip the temp prefix to get
86+
# the GCS-relative blob name.
87+
for doc in documents:
88+
abs_path = doc.metadata.get("file_path", "") if doc.metadata else ""
89+
if abs_path:
90+
rel_path = os.path.relpath(abs_path, str(tmp_path))
91+
doc.metadata["source_path"] = f"gs://{bucket_name}/{rel_path}"
92+
else:
93+
doc.metadata["source_path"] = f"gs://{bucket_name}/{prefix}"
94+
8095
logger.info(
8196
"gcs_documents_loaded",
8297
bucket=bucket_name,

services/query-engine/app/core/config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ class Settings(BaseSettings):
5959

6060
# Celery
6161
celery_broker_url: str = ""
62+
celery_worker_concurrency: int = 4
63+
worker_max_tasks_per_child: int = 100
6264

6365

6466
settings = Settings()

services/query-engine/app/core/gcs.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
"""Singleton GCS client factory."""
22

33
import json
4+
from typing import Any
45

56
from google.cloud import storage as gcs_storage # type: ignore[import-untyped]
67

78
from app.core.config import Settings
89

9-
# NOTE: This lazy singleton is safe because the API server is single-threaded
10-
# async and the Celery worker runs with concurrency=1. If worker concurrency
11-
# is ever increased, this must be replaced with thread-safe init.
12-
_client: gcs_storage.Client | None = None
10+
# Lazy per-process singleton. Safe under FastAPI (single-threaded async) and
11+
# Celery's prefork pool (each worker process has its own copy of this global).
12+
_client: Any = None
1313

1414

1515
def get_gcs_client(config: Settings) -> gcs_storage.Client:

services/query-engine/app/core/postgres.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,8 @@
22

33
from sqlalchemy import Engine, create_engine
44

5-
# NOTE: This lazy singleton is safe because the API server is single-threaded
6-
# async and the Celery worker runs with concurrency=1. If worker concurrency
7-
# is ever increased, this must be replaced with thread-safe init.
5+
# Lazy per-process singleton. Safe under FastAPI (single-threaded async) and
6+
# Celery's prefork pool (each worker process has its own copy of this global).
87
_engine: Engine | None = None
98

109

services/query-engine/app/services/ingestion_pipeline.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Ingestion pipeline: connects document connectors to the KG service."""
22

33
import asyncio
4+
import hashlib
45
from typing import Any
56

67
import structlog
@@ -49,9 +50,27 @@ async def run(
4950

5051
for doc in documents:
5152
try:
53+
content = doc.get_content()
54+
# Derive a stable source identifier so the KG service can
55+
# produce a deterministic doc_id. This makes the task safe
56+
# to retry: a Celery redelivery (worker crash, OOM, time
57+
# limit) re-runs with the same source_id, which hashes to
58+
# the same doc_id, and the storage layers replace prior
59+
# state instead of creating duplicates.
60+
# The content hash is included so an in-place file
61+
# replacement (same path, new content) is treated as a
62+
# different document, not a stale duplicate.
63+
source_path = (
64+
doc.metadata.get("source_path")
65+
or doc.metadata.get("file_name")
66+
or "unknown"
67+
)
68+
content_hash = hashlib.sha256(content.encode()).hexdigest()
69+
source_id = f"{source_type.value}:{source_path}:{content_hash}"
5270
_doc_id, triplets = await self._kg_service.ingest(
53-
text=doc.get_content(),
71+
text=content,
5472
metadata=doc.metadata,
73+
source_id=source_id,
5574
)
5675
total_triplets += triplets
5776
ingested_count += 1

services/query-engine/app/services/knowledge_graph.py

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import asyncio
44
import hashlib
55
import time
6-
import uuid
76
from datetime import UTC, datetime
87
from functools import partial
98
from typing import Any
@@ -344,6 +343,22 @@ async def check_cache_health(self) -> dict[str, str] | None:
344343
kg_cache_up.set(1 if result.get("status") == "ok" else 0)
345344
return result
346345

346+
async def document_exists(self, doc_id: str) -> bool:
347+
"""Return True if a document with this doc_id exists in the docstore.
348+
349+
Cheap key lookup (single docstore query) used by the delete endpoint
350+
to validate input synchronously, so a typoed doc_id returns 404
351+
immediately instead of dispatching a background task that would
352+
eventually report failure.
353+
"""
354+
loop = asyncio.get_running_loop()
355+
356+
def _check() -> bool:
357+
info = self._storage_context.docstore.get_ref_doc_info(doc_id)
358+
return info is not None
359+
360+
return await loop.run_in_executor(None, _check)
361+
347362
async def list_documents(
348363
self,
349364
limit: int = 20,
@@ -536,14 +551,22 @@ def _delete_sync() -> list[str]:
536551
async def ingest(
537552
self,
538553
text: str,
554+
source_id: str,
539555
metadata: dict[str, Any] | None = None,
540556
) -> tuple[str, int]:
541557
"""
542558
Ingest a document into both KG and vector indexes.
543559
560+
`source_id` must be a stable identifier for this document (e.g.
561+
derived from the source path and content hash). The resulting
562+
`doc_id` is `sha256(source_id)`, which makes the call idempotent —
563+
re-running with the same source_id replaces any prior vector-store
564+
state for that document instead of creating duplicates. This makes
565+
the ingest path safe to retry after a Celery worker crash.
566+
544567
Returns a tuple of (document_id, triplet_count).
545568
"""
546-
doc_id = str(uuid.uuid4())
569+
doc_id = hashlib.sha256(source_id.encode()).hexdigest()
547570
# Store all metadata in the docstore (for grouping, display, etc.)
548571
# but exclude everything from LLM triplet extraction so it doesn't
549572
# pollute the knowledge graph.
@@ -581,6 +604,23 @@ def _stable_id(i: int, doc: Document) -> str:
581604
for node in nodes:
582605
node.excluded_llm_metadata_keys = list(node.metadata.keys())
583606

607+
# Idempotency guard: PGVectorStore.add() does NOT enforce
608+
# uniqueness on node_id, so a Celery retry of an ingest
609+
# would otherwise accumulate duplicate vector rows.
610+
# Explicitly purge any prior rows for this doc_id before
611+
# re-inserting. Safe no-op on the first run. Neo4j MERGE and
612+
# the postgres docstore (`allow_update=True`) handle their
613+
# own dedupe; the vector store is the only layer that needs
614+
# this.
615+
try:
616+
self._vector_index.vector_store.delete(ref_doc_id=doc_id)
617+
except Exception as exc:
618+
logger.warning(
619+
"vector_store_predelete_failed",
620+
doc_id=doc_id,
621+
error=str(exc),
622+
)
623+
584624
# Vector-first: embedding/pgvector write is more likely to fail
585625
# (external API call). If it fails, Neo4j is untouched.
586626
# If it succeeds and KG insert fails, we have embeddings without

services/query-engine/app/worker/celery_app.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,16 @@
1717
accept_content=["json"],
1818
result_expires=86400, # 24 hours
1919
task_track_started=True,
20-
worker_concurrency=1,
21-
worker_prefetch_multiplier=1,
20+
# Prefork pool (Celery default on Unix). Each worker process has its own
21+
# KG service, Neo4j driver, postgres engine, GCS client, and LlamaIndex
22+
# indexes — process isolation makes the lazy singletons safe and avoids
23+
# races in LlamaIndex internals. Scale horizontally by running more worker
24+
# containers.
25+
worker_concurrency=settings.celery_worker_concurrency,
26+
worker_prefetch_multiplier=1, # fair distribution for long-running tasks
27+
worker_max_tasks_per_child=settings.worker_max_tasks_per_child, # recycle to bound memory leaks
28+
task_acks_late=True, # redeliver task if worker crashes mid-execution
29+
task_reject_on_worker_lost=True, # pairs with acks_late for crash recovery
2230
task_soft_time_limit=180, # 3 minutes
2331
task_time_limit=240, # 4 minutes
2432
beat_schedule={

0 commit comments

Comments
 (0)