Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion apps/api/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,19 @@ MAX_FILE_SIZE=314572800
MAX_PDF_PAGE_LIMIT=200
OVERSIZED_PDF_SHARD_ENABLED=true
OVERSIZED_PDF_SOFT_LIMIT=1500
PDF_PROFILE_TOC_ENABLED=false
PDF_PAGE_TOC_ENABLED=true
RETRIEVAL_PAGE_MEMORY_ENABLED=false
MINERU_SHARD_CONCURRENCY=3
PARSE_AGENT_PLAN_BUDGET=50000
PARSE_AGENT_VISUAL_BUDGET=80000
PARSE_AGENT_TOC_CONFIRM_MIN_BUDGET=8000
PARSE_AGENT_TOC_CONFIRM_CAP=24000
PARSE_AGENT_COARSE_PLANNER_MIN_BUDGET=12000
PARSE_AGENT_COARSE_PLANNER_CAP=36000
PARSE_AGENT_STRUCTURAL_REACT_MIN_BUDGET=24000
PARSE_AGENT_STRUCTURAL_REACT_CAP=64000
PARSE_AGENT_PAGE_TAGGING_MIN_BUDGET=0
PARSE_AGENT_PAGE_TAGGING_CAP=0

# Required for specific features: webhooks and callbacks
WEBHOOK_MASTER_KEY=
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""add doc profile to document page plan

Revision ID: f9a0b1c2d3e4
Revises: f8a9b0c1d2e3
Create Date: 2026-06-11 09:50:00.000000

"""

from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# Alembic revision identifiers; the values repeat the "Revision ID" and
# "Revises" lines of the module docstring.
revision: str = "f9a0b1c2d3e4"
# Parent revision that this migration is applied on top of.
down_revision: Union[str, Sequence[str], None] = "f8a9b0c1d2e3"
# No branch label and no cross-revision dependency are declared.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Add the nullable JSON ``doc_profile`` column to ``document_page_plan``."""
    doc_profile_column = sa.Column("doc_profile", sa.JSON(), nullable=True)
    op.add_column("document_page_plan", doc_profile_column)


def downgrade() -> None:
    """Remove the ``doc_profile`` column from ``document_page_plan``."""
    op.drop_column(
        table_name="document_page_plan",
        column_name="doc_profile",
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""add parse track to documents

Revision ID: f9b0c1d2e3f4
Revises: f9a0b1c2d3e4
Create Date: 2026-06-11 10:05:00.000000

"""

from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# Alembic revision identifiers; the values repeat the "Revision ID" and
# "Revises" lines of the module docstring.
revision: str = "f9b0c1d2e3f4"
# Parent revision that this migration is applied on top of.
down_revision: Union[str, Sequence[str], None] = "f9a0b1c2d3e4"
# No branch label and no cross-revision dependency are declared.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Add the required ``parse_track`` column to ``documents``.

    The column is created NOT NULL with a server default of ``"chunk"``, and
    the server default is removed again in the same migration.
    NOTE(review): presumably the temporary default exists only to populate
    rows that are already in the table -- confirm.
    """
    parse_track_column = sa.Column(
        "parse_track",
        sa.String(length=32),
        nullable=False,
        server_default="chunk",
    )
    op.add_column("documents", parse_track_column)
    # Drop the server-side default now that the column exists.
    op.alter_column("documents", "parse_track", server_default=None)


def downgrade() -> None:
    """Remove the ``parse_track`` column from ``documents``."""
    op.drop_column(
        table_name="documents",
        column_name="parse_track",
    )
2 changes: 1 addition & 1 deletion apps/api/app/api/v1/routes/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
router = APIRouter(tags=["Documents"])

_document_service = DocumentService()
DocumentChunkType = Literal["text", "image", "table"]
DocumentChunkType = Literal["text", "image", "table", "page"]


async def _archive_document_response(
Expand Down
41 changes: 41 additions & 0 deletions apps/api/app/services/document_ingestion/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,15 @@ async def _validate_create_payload(self, payload: JobCreate) -> None:
}
],
)
_validate_parse_track_for_extension(
parse_track=payload.parse_track,
file_extension=file_extension,
)
elif payload.file_name:
_validate_parse_track_for_extension(
parse_track=payload.parse_track,
file_extension=os.path.splitext(payload.file_name)[1].lower(),
)

async def _resolve_scope(
self,
Expand Down Expand Up @@ -261,3 +270,35 @@ def _is_supported_file_name(file_name: str) -> bool:
return False
file_extension = os.path.splitext(file_name)[1].lower()
return file_extension in settings.get_supported_extensions()


# File extensions accepted by the page_memory parse track. The membership
# check and the violation text below are both derived from this tuple so the
# two cannot drift apart.
_PAGE_MEMORY_EXTENSIONS = (".pdf", ".pptx")


def _validate_parse_track_for_extension(*, parse_track: str, file_extension: str) -> None:
    """Validate that ``parse_track`` may be used for a file with this extension.

    Args:
        parse_track: Requested parse track; ``"chunk"`` and ``"page_memory"``
            are the only accepted values.
        file_extension: Extension including the leading dot (for example
            ``".pdf"``). It is compared case-insensitively.

    Raises:
        ValidationException: If ``parse_track`` is neither ``"chunk"`` nor
            ``"page_memory"``; if ``page_memory`` is requested while
            ``settings.RETRIEVAL_PAGE_MEMORY_ENABLED`` is falsy; or if
            ``page_memory`` is requested for an extension other than
            ``.pdf`` / ``.pptx``.
    """
    # The chunk track is accepted for every extension, regardless of the flag.
    if parse_track == "chunk":
        return
    if parse_track != "page_memory":
        raise ValidationException(
            user_message="Unsupported parse_track",
            violations=[
                {"field": "parse_track", "description": "Must be chunk or page_memory"}
            ],
        )
    # The feature flag is checked before the extension, so a disabled flag is
    # reported even for file types that would otherwise be rejected.
    if not settings.RETRIEVAL_PAGE_MEMORY_ENABLED:
        raise ValidationException(
            user_message="page_memory parse track is not enabled",
            violations=[
                {
                    "field": "parse_track",
                    "description": "page_memory is disabled by configuration",
                }
            ],
        )
    if file_extension.lower() not in _PAGE_MEMORY_EXTENSIONS:
        allowed_extensions = ", ".join(_PAGE_MEMORY_EXTENSIONS)
        raise ValidationException(
            user_message="page_memory parse track only supports PDF and PPTX",
            violations=[
                {
                    "field": "parse_track",
                    "description": "Allowed file types in this build: " + allowed_extensions,
                }
            ],
        )
6 changes: 6 additions & 0 deletions apps/api/tests/contract/test_documents_contract.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ async def _insert_document(
status,
current_job_result_id,
source_file_name,
parse_track,
created_at,
updated_at,
archived_at
Expand All @@ -52,6 +53,7 @@ async def _insert_document(
:status,
:current_job_result_id,
:source_file_name,
:parse_track,
:created_at,
:updated_at,
:archived_at
Expand All @@ -64,6 +66,7 @@ async def _insert_document(
"status": status,
"current_job_result_id": None,
"source_file_name": source_file_name or f"{document_id}.pdf",
"parse_track": "chunk",
"created_at": timestamp,
"updated_at": effective_updated_at,
"archived_at": (
Expand Down Expand Up @@ -268,6 +271,7 @@ async def _insert_document_revision_with_chunks(
status,
current_job_result_id,
source_file_name,
parse_track,
created_at,
updated_at,
archived_at
Expand All @@ -278,6 +282,7 @@ async def _insert_document_revision_with_chunks(
'active',
NULL,
:source_file_name,
:parse_track,
:created_at,
:updated_at,
NULL
Expand All @@ -288,6 +293,7 @@ async def _insert_document_revision_with_chunks(
"user_id": user_id,
"namespace": namespace,
"source_file_name": source_file_name,
"parse_track": "chunk",
"created_at": timestamp,
"updated_at": timestamp,
},
Expand Down
3 changes: 3 additions & 0 deletions apps/api/tests/contract/test_job_creation_contract.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ async def _insert_document(
namespace,
status,
source_file_name,
parse_track,
created_at,
updated_at,
archived_at
Expand All @@ -89,6 +90,7 @@ async def _insert_document(
:namespace,
:status,
:source_file_name,
:parse_track,
:created_at,
:updated_at,
:archived_at
Expand All @@ -101,6 +103,7 @@ async def _insert_document(
"namespace": namespace,
"status": status,
"source_file_name": f"{document_id}.pdf",
"parse_track": "chunk",
"created_at": timestamp,
"updated_at": timestamp,
"archived_at": timestamp if status == "archived" else None,
Expand Down
60 changes: 60 additions & 0 deletions apps/api/tests/contract/test_page_memory_parse_track_contract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from __future__ import annotations

import os

import pytest

# Seed placeholder configuration values before the project imports below run.
# setdefault leaves any value already present in the environment untouched.
# NOTE(review): presumably these are the settings shared.core.config needs in
# order to load without a real deployment environment -- confirm.
os.environ.setdefault("DATABASE_URL", "postgresql+asyncpg://test:test@localhost/test")
os.environ.setdefault("TMP_PATH", "/tmp/knowhere-test")
os.environ.setdefault("S3_BUCKET_NAME", "test-uploads")
os.environ.setdefault("S3_ACCESS_KEY_ID", "test")
os.environ.setdefault("S3_SECRET_ACCESS_KEY", "test")
os.environ.setdefault("S3_TEMP_PATH", "/tmp")

from shared.core.exceptions.domain_exceptions import ValidationException


def test_page_memory_parse_track_rejects_when_flag_disabled(monkeypatch) -> None:
    """page_memory is rejected while RETRIEVAL_PAGE_MEMORY_ENABLED is False."""
    from app.services.document_ingestion.service import (
        _validate_parse_track_for_extension as validate_parse_track,
    )
    from shared.core.config import settings

    monkeypatch.setattr(settings, "RETRIEVAL_PAGE_MEMORY_ENABLED", False)

    # A .pdf is used so that only the disabled flag can cause the rejection.
    with pytest.raises(ValidationException):
        validate_parse_track(parse_track="page_memory", file_extension=".pdf")


def test_page_memory_parse_track_allows_only_pdf_and_pptx(monkeypatch) -> None:
    """With the flag on, .pdf and .pptx pass validation and .docx is rejected."""
    from app.services.document_ingestion.service import (
        _validate_parse_track_for_extension as validate_parse_track,
    )
    from shared.core.config import settings

    monkeypatch.setattr(settings, "RETRIEVAL_PAGE_MEMORY_ENABLED", True)

    # Accepted extensions must validate without raising.
    for accepted_extension in (".pdf", ".pptx"):
        validate_parse_track(
            parse_track="page_memory",
            file_extension=accepted_extension,
        )

    with pytest.raises(ValidationException):
        validate_parse_track(parse_track="page_memory", file_extension=".docx")
4 changes: 4 additions & 0 deletions apps/api/tests/support/contract_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@ async def insert_document(
status: str = "active",
current_job_result_id: str | None = None,
source_file_name: str | None = None,
parse_track: str = "chunk",
created_at: datetime | None = None,
updated_at: datetime | None = None,
archived_at: datetime | None = None,
Expand All @@ -304,6 +305,7 @@ async def insert_document(
status,
current_job_result_id,
source_file_name,
parse_track,
created_at,
updated_at,
archived_at
Expand All @@ -314,6 +316,7 @@ async def insert_document(
:status,
:current_job_result_id,
:source_file_name,
:parse_track,
:created_at,
:updated_at,
:archived_at
Expand All @@ -326,6 +329,7 @@ async def insert_document(
"status": status,
"current_job_result_id": current_job_result_id,
"source_file_name": source_file_name or f"{document_id}.pdf",
"parse_track": parse_track,
"created_at": timestamp,
"updated_at": updated_at or timestamp,
"archived_at": archived_at,
Expand Down
13 changes: 12 additions & 1 deletion apps/worker/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,19 @@ MAX_FILE_SIZE=314572800
MAX_PDF_PAGE_LIMIT=200
OVERSIZED_PDF_SHARD_ENABLED=true
OVERSIZED_PDF_SOFT_LIMIT=1500
PDF_PROFILE_TOC_ENABLED=false
PDF_PAGE_TOC_ENABLED=true
RETRIEVAL_PAGE_MEMORY_ENABLED=false
MINERU_SHARD_CONCURRENCY=3
PARSE_AGENT_PLAN_BUDGET=50000
PARSE_AGENT_VISUAL_BUDGET=80000
PARSE_AGENT_TOC_CONFIRM_MIN_BUDGET=8000
PARSE_AGENT_TOC_CONFIRM_CAP=24000
PARSE_AGENT_COARSE_PLANNER_MIN_BUDGET=12000
PARSE_AGENT_COARSE_PLANNER_CAP=36000
PARSE_AGENT_STRUCTURAL_REACT_MIN_BUDGET=24000
PARSE_AGENT_STRUCTURAL_REACT_CAP=64000
PARSE_AGENT_PAGE_TAGGING_MIN_BUDGET=0
PARSE_AGENT_PAGE_TAGGING_CAP=0

# Legacy parser compatibility fields.
# ALL_DF_COLS=content,path,type,length,keywords,summary,know_id,tokens,connectto,addtime,page_nums
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def _llm_summarize(snippets_text: str, node_name: str, max_tokens: int = 100) ->
messages=messages,
timeout=60,
max_tokens=max_tokens,
usage_task="finalization.doc_nav_summary",
)
if resp is None:
return ""
Expand Down
Loading
Loading