Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def ask_image(

if task in ("summary-images", "atlas-page-info"):
image_model = settings.IMAGE_MODEL or "gpt-4-vision-preview"
else: # Image Q&A and OCR use better models
else: # OCR and image type classification use higher-capability models
image_model = settings.IMAGE_MODEL_MAX or "gpt-4-vision-preview"

if len(urls_) > 0:
Expand Down
2 changes: 1 addition & 1 deletion packages/shared-python/shared/core/config/ai.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class AIConfig(BaseModel):

IMAGE_MODEL_MAX: str = Field(
default="qwen3.5-flash",
description="Higher-capability image model for OCR and ask-image Q&A",
description="Higher-capability image model for OCR and image type classification",
)
RETRIEVAL_DECOMPOSITION_ENABLED: bool = Field(
default=False,
Expand Down
12 changes: 4 additions & 8 deletions packages/shared-python/shared/services/ai/llm_mock.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,15 +139,11 @@ def _detect_mock_task(prompt_text: str) -> str:
if "perform ocr operation" in normalized_prompt:
return "ocr-image"
if (
"you will receive an image" in normalized_prompt
and "line 1: output a short title" in normalized_prompt
"you will receive an image from a document" in normalized_prompt
and "identify the image type" in normalized_prompt
):
return "summary-images"
if (
"you will receive one or more images and the user's current question"
in normalized_prompt
):
return "ask-image"

if "summaries of sub-sections from a document section" in normalized_prompt:
return "file-summary"
if (
Expand Down Expand Up @@ -302,7 +298,7 @@ def _build_mock_response(task_name: str) -> str:
"atlas-page-info": "Mock atlas page info",
"ocr-image": "Mock OCR text",
"summary-images": "Mock Image Title\nMock image summary",
"ask-image": "Mock image answer",

"file-summary": "Mock section summary",
"summary-titled": "Mock Title\nMock summary",
"summary": "Mock summary",
Expand Down
78 changes: 55 additions & 23 deletions packages/shared-python/shared/services/ai/prompt_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,16 +566,65 @@ def build_prompt(task, texts, query, **kwargs):
temperature = 0.1
max_tokens = int(kwargs["paras"]["max_tokens"] * 1.2)
if texts.strip():
img_context = f"- Image context is [{texts}], you may reference the title for summarization"
img_context = f"- Image context is [{texts}], you may reference the context for summarization"
else:
img_context = ""

prompt = f"""
You will receive an image, which may be a photo, chart, or an image requiring OCR.
Your task is to extract the main content described in the image. Note:
- Line 1: Output a short title (no more than 15 characters) summarizing the image's core topic
- Line 2 onward: Provide a precise and concise summary, using text descriptions only, avoid extracting specific data from the image
- Your response **MUST BE in the SAME LANGUAGE** as any text visible in the image (if there is no text, English is preferred)
You will receive an image from a document. Your task is to extract the most
USEFUL information from this image based on its type.

**STEP 1: Identify the image type** (do NOT output this step, use it internally):
- Credential/ID: identity cards, passports, driver licenses, business licenses, certificates, permits
- Data Chart: bar charts, line charts, pie charts, scatter plots, heatmaps, gauge charts
- Table Screenshot: tabular data rendered as an image
- Diagram: flowcharts, org charts, architecture diagrams, mind maps, UML diagrams
- Engineering Drawing: architectural plans, circuit diagrams, CAD drawings, mechanical drawings
- Photo: real-world photographs of people, objects, scenes, products
- Other: anything not fitting the above categories

**STEP 2: Extract information according to image type**:

For Credential/ID images:
- Extract ALL visible fields: name, ID number, date of birth, expiry date,
issuing authority, company name, registration number, legal representative,
business scope, qualification level, etc.
- Preserve exact values as shown (numbers, dates, codes)

For Data Charts:
- Chart title, axis labels and units
- Key data points, trends, and notable patterns
- Time range or categories covered
- Data source if visible

For Table Screenshots:
- Table title and column headers
- Key data entries and notable values
- Number of rows/columns and what the table represents

For Diagrams (flow/architecture/org):
- All node names and their relationships
- Flow direction and process steps
- Hierarchy levels and key connections

For Engineering/Technical Drawings:
- Drawing title, drawing number, scale
- Key dimensions and annotations
- Component/part names, material specifications

For Photos:
- Primary subject and scene description
- Notable features, text, or signage visible
- Context clues about location or purpose

For Other:
- Describe the most important visual information

**Output format**:
- Line 1: A concise title (no more than 20 characters) capturing the core topic
- Line 2 onward: The extracted information following the type-specific guidelines above
- Your response **MUST BE in the SAME LANGUAGE** as any text visible in the image
(if no text, use English)
- If the image is blank, unreadable, or contains no meaningful content, return exactly: null

{img_context}
Expand All @@ -595,23 +644,6 @@ def build_prompt(task, texts, query, **kwargs):
- Do not add any format wrappers, prefixes, or explanations beyond the text content
"""

elif task == "ask-image":
temperature = 0.1
max_tokens = int(kwargs["paras"]["max_tokens"] * 1.2)

prompt = f"""
You will receive one or more images and the user's current question: [{query}]
You may also receive context related to the image(s).

{texts}

Your task is to answer the user's question based on the image(s) and context (if any). Note:
- Your answer must be in the SAME LANGUAGE as the user's question
- Provide a complete and accurate answer with some explanation, but not exceeding {max_tokens} characters
- If the image content is unrelated to the user's question, return exactly: null
- Do not return any additional explanations or descriptions beyond the answer
"""

elif task == "judge-image-type":
temperature = 0.1
prompt = """
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from shared.services.retrieval.agentic.core.budget import BudgetExceeded
from shared.services.retrieval.agentic.prompts import (
DISCOVERY_SELECT_PROMPT,
adjust_budget_snapshot,
format_budget_block,
parse_action_response,
)
Expand Down Expand Up @@ -208,10 +209,17 @@ def _build_discovery_selection_prompt(
hint_lines: list[str],
budget_snapshot: dict | None,
) -> str:
# Estimate this call's prompt token cost and adjust snapshot so
# the LLM sees post-call budget (consistent with navigate_step).
items_text = "\n".join(hint_lines)
prompt_tokens_est = (len(items_text) + 400) // 2 # rough chars-to-tokens
adjusted_snapshot = adjust_budget_snapshot(
budget_snapshot, prompt_tokens_est,
)
return DISCOVERY_SELECT_PROMPT.format(
doc_name=doc_name or document_id,
budget_block=format_budget_block(budget_snapshot),
items="\n".join(hint_lines),
budget_block=format_budget_block(adjusted_snapshot),
items=items_text,
query=query,
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from shared.services.retrieval.agentic.core.budget import BudgetExceeded
from shared.services.retrieval.agentic.prompts import (
COLLECTOR_PROMPT,
adjust_budget_snapshot,
format_budget_block,
parse_collector_response,
)
Expand All @@ -42,38 +43,6 @@
from shared.services.retrieval.llm_adapter import LLMFn


def _adjust_budget_snapshot(
snapshot: dict | None,
additional_tokens: int,
) -> dict | None:
"""Adjust a budget snapshot by adding estimated tokens for the current call.

This ensures the LLM sees the budget state *after* this call's cost,
not before, preventing misleadingly low percentages.
"""
if not snapshot:
return snapshot
import copy
adjusted = copy.deepcopy(snapshot)
planning = adjusted.get("planning")
if not planning:
return adjusted
capacity = planning.get("capacity", 1)
used = planning.get("used", 0) + additional_tokens
used_pct = min(int(used * 100 / capacity), 100) if capacity > 0 else 100
planning["used"] = used
planning["used_pct"] = used_pct
planning["remaining"] = max(0, capacity - used)
if used_pct >= 90:
planning["status"] = "EXHAUSTED"
elif used_pct >= 75:
planning["status"] = "CRITICAL"
elif used_pct >= 50:
planning["status"] = "TIGHT"
else:
planning["status"] = "HEALTHY"
return adjusted


async def navigate_step(
db: AsyncSession,
Expand Down Expand Up @@ -162,7 +131,7 @@ async def navigate_step(
prompt_tokens_est = (
len(items_text) + len(trace_block) + len(tools_block) + 800
) // 2 # rough chars-to-tokens ratio
adjusted_snapshot = _adjust_budget_snapshot(
adjusted_snapshot = adjust_budget_snapshot(
budget_snapshot, prompt_tokens_est,
)

Expand Down
Loading
Loading