|
3 | 3 | import asyncio |
4 | 4 | import hashlib |
5 | 5 | import time |
6 | | -import uuid |
7 | 6 | from datetime import UTC, datetime |
8 | 7 | from functools import partial |
9 | 8 | from typing import Any |
@@ -344,6 +343,22 @@ async def check_cache_health(self) -> dict[str, str] | None: |
344 | 343 | kg_cache_up.set(1 if result.get("status") == "ok" else 0) |
345 | 344 | return result |
346 | 345 |
|
| 346 | + async def document_exists(self, doc_id: str) -> bool: |
| 347 | + """Return True if a document with this doc_id exists in the docstore. |
| 348 | +
|
| 349 | + Cheap key lookup (single docstore query) used by the delete endpoint |
| 350 | + to validate input synchronously, so a typoed doc_id returns 404 |
| 351 | + immediately instead of dispatching a background task that would |
| 352 | + eventually report failure. |
| 353 | + """ |
| 354 | + loop = asyncio.get_running_loop() |
| 355 | + |
| 356 | + def _check() -> bool: |
| 357 | + info = self._storage_context.docstore.get_ref_doc_info(doc_id) |
| 358 | + return info is not None |
| 359 | + |
| 360 | + return await loop.run_in_executor(None, _check) |
| 361 | + |
347 | 362 | async def list_documents( |
348 | 363 | self, |
349 | 364 | limit: int = 20, |
@@ -536,14 +551,22 @@ def _delete_sync() -> list[str]: |
536 | 551 | async def ingest( |
537 | 552 | self, |
538 | 553 | text: str, |
| 554 | + source_id: str, |
539 | 555 | metadata: dict[str, Any] | None = None, |
540 | 556 | ) -> tuple[str, int]: |
541 | 557 | """ |
542 | 558 | Ingest a document into both KG and vector indexes. |
543 | 559 |
|
| 560 | + `source_id` must be a stable identifier for this document (e.g. |
| 561 | + derived from the source path and content hash). The resulting |
| 562 | + `doc_id` is the SHA-256 hex digest of `source_id`, which makes the |
| 563 | + re-running with the same source_id replaces any prior vector-store |
| 564 | + state for that document instead of creating duplicates. This makes |
| 565 | + the ingest path safe to retry after a Celery worker crash. |
| 566 | +
|
544 | 567 | Returns a tuple of (document_id, triplet_count). |
545 | 568 | """ |
546 | | - doc_id = str(uuid.uuid4()) |
| 569 | + doc_id = hashlib.sha256(source_id.encode()).hexdigest() |
547 | 570 | # Store all metadata in the docstore (for grouping, display, etc.) |
548 | 571 | # but exclude everything from LLM triplet extraction so it doesn't |
549 | 572 | # pollute the knowledge graph. |
@@ -581,6 +604,23 @@ def _stable_id(i: int, doc: Document) -> str: |
581 | 604 | for node in nodes: |
582 | 605 | node.excluded_llm_metadata_keys = list(node.metadata.keys()) |
583 | 606 |
|
| 607 | + # Idempotency guard: PGVectorStore.add() does NOT enforce |
| 608 | + # uniqueness on node_id, so a Celery retry of an ingest |
| 609 | + # would otherwise accumulate duplicate vector rows. |
| 610 | + # Explicitly purge any prior rows for this doc_id before |
| 611 | + # re-inserting. Safe no-op on the first run. Neo4j MERGE and |
| 612 | + # the postgres docstore (`allow_update=True`) handle their |
| 613 | + # own dedupe; the vector store is the only layer that needs |
| 614 | + # this. |
| 615 | + try: |
| 616 | + self._vector_index.vector_store.delete(ref_doc_id=doc_id) |
| 617 | + except Exception as exc: |
| 618 | + logger.warning( |
| 619 | + "vector_store_predelete_failed", |
| 620 | + doc_id=doc_id, |
| 621 | + error=str(exc), |
| 622 | + ) |
| 623 | + |
584 | 624 | # Vector-first: embedding/pgvector write is more likely to fail |
585 | 625 | # (external API call). If it fails, Neo4j is untouched. |
586 | 626 | # If it succeeds and KG insert fails, we have embeddings without |
|
0 commit comments