-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Add PageIndexClient with agent-based retrieval via OpenAI Agents SDK #125
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
KylinMountain
wants to merge
10
commits into
VectifyAI:main
Choose a base branch
from
KylinMountain:feat/retrieve
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
10 commits
Select commit
Hold shift + click to select a range
3fb4c74
Add PageIndexClient with storage persistence and streaming support
KylinMountain cc4c5fc
Rename go_deeper to explore in retrieve prompt for clarity
KylinMountain 94c8838
refactor to tools and agent
KylinMountain 75a5839
Remove PDF from repo; download at runtime; add verbose tool logging
KylinMountain 239e5e8
remove test mock client
KylinMountain 3cf7d21
resolve review comments
KylinMountain 58a61f6
Fix critical and important robustness issues in retrieve and client
KylinMountain 8fffedb
fix async issue
KylinMountain 8746fbe
Decouple openai-agents from PageIndexClient; move agent demo to examples
KylinMountain ffaec3c
Update README: add Python API usage and agent
KylinMountain File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,154 @@ | ||
| """ | ||
| PageIndex x OpenAI Agents Demo | ||
|
|
||
| Demonstrates how to use PageIndexClient with the OpenAI Agents SDK | ||
| to build a document QA agent with 3 tools: | ||
| - get_document() | ||
| - get_document_structure() | ||
| - get_page_content() | ||
|
|
||
| Requirements: | ||
| pip install openai-agents | ||
|
|
||
| Steps: | ||
| 1 — Index PDF and inspect tree structure | ||
| 2 — Inspect document metadata | ||
| 3 — Ask a question (agent auto-calls tools) | ||
| 4 — Reload from workspace and verify persistence | ||
| """ | ||
| import os | ||
| import sys | ||
| import asyncio | ||
| import concurrent.futures | ||
| import requests | ||
|
|
||
| sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | ||
|
|
||
| from agents import Agent, Runner, function_tool | ||
| from agents.stream_events import RunItemStreamEvent | ||
|
|
||
| from pageindex import PageIndexClient | ||
| import pageindex.utils as utils | ||
|
|
||
# Remote location of the sample PDF; fetched only when PDF_PATH does not exist.
PDF_URL = "https://arxiv.org/pdf/2501.12948.pdf"
# Local path the sample PDF is stored at and indexed from.
PDF_PATH = "tests/pdfs/deepseek-r1.pdf"
# Workspace directory handed to PageIndexClient, which expands "~" itself.
WORKSPACE = "~/.pageindex"

# Instructions given to the agent: how to use the three tools defined in
# query_agent() and how to answer.
AGENT_SYSTEM_PROMPT = """
You are PageIndex, a document QA assistant.
TOOL USE:
- Call get_document() first to confirm status and page/line count.
- Call get_document_structure() to find relevant page ranges (use node summaries and start_index/end_index).
- Call get_page_content(pages="5-7") with tight ranges. Never fetch the whole doc.
- For Markdown, pages = line numbers from the structure (the line_num field). Use line_count from get_document() as the upper bound.
ANSWERING: Answer based only on tool output. Be concise.
"""
|
|
||
|
|
||
def query_agent(client: PageIndexClient, doc_id: str, prompt: str, verbose: bool = False) -> str:
    """Run a document QA agent using the OpenAI Agents SDK.

    Builds an agent whose three tools are bound to ``client`` and ``doc_id``,
    runs it on ``prompt`` and returns the run's final output.

    Args:
        client: Client the tools delegate to; ``client.model`` is used as the
            agent's model.
        doc_id: Identifier of the document every tool call operates on.
        prompt: The question passed to the agent.
        verbose: When true, the run is streamed and each tool call, plus a
            preview of each tool output (at most 200 characters), is printed.

    Returns:
        The final output of the agent run.
    """

    @function_tool
    def get_document() -> str:
        """Get document metadata: status, page count, name, and description."""
        return client.get_document(doc_id)

    @function_tool
    def get_document_structure() -> str:
        """Get the document's full tree structure (without text) to find relevant sections."""
        return client.get_document_structure(doc_id)

    @function_tool
    def get_page_content(pages: str) -> str:
        """
        Get the text content of specific pages or line numbers.
        Use tight ranges: e.g. '5-7' for pages 5 to 7, '3,8' for pages 3 and 8, '12' for page 12.
        For Markdown documents, use line numbers from the structure's line_num field.
        """
        return client.get_page_content(doc_id, pages)

    agent = Agent(
        name="PageIndex",
        instructions=AGENT_SYSTEM_PROMPT,
        tools=[get_document, get_document_structure, get_page_content],
        model=client.model,
    )

    async def _run():
        # Quiet mode: a single awaited run, no per-event output.
        if not verbose:
            result = await Runner.run(agent, prompt)
            return result.final_output

        # Verbose mode: stream the run and log tool traffic as it happens.
        turn = 0
        stream = Runner.run_streamed(agent, prompt)
        async for event in stream.stream_events():
            if not isinstance(event, RunItemStreamEvent):
                continue
            if event.name == "tool_called":
                turn += 1
                raw = event.item.raw_item
                args = getattr(raw, "arguments", "{}")
                print(f"\n[Turn {turn}] → {raw.name}({args})")
            elif event.name == "tool_output":
                output = str(event.item.output)
                preview = output[:200] + "..." if len(output) > 200 else output
                print(f"  ← {preview}")
        return stream.final_output

    # Only the loop probe lives inside the try. A RuntimeError raised by the
    # agent run itself must propagate to the caller instead of being mistaken
    # for "no running loop" and triggering a second, doomed asyncio.run().
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running in this thread, so asyncio.run() can own one.
        return asyncio.run(_run())

    # A loop is already running in this thread and asyncio.run() cannot be
    # called from a running loop, so drive the coroutine on a fresh loop in a
    # worker thread and block until it finishes.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, _run()).result()
|
|
||
|
|
||
# ── Download PDF if needed ─────────────────────────────────────────────────────
if not os.path.exists(PDF_PATH):
    print(f"Downloading {PDF_URL} ...")
    os.makedirs(os.path.dirname(PDF_PATH), exist_ok=True)
    # Stream into a sibling ".part" file and rename it only once the transfer
    # has finished. Writing straight to PDF_PATH would leave a truncated file
    # behind on an interrupted download, and the existence check above would
    # then skip the download on every later run.
    partial_path = PDF_PATH + ".part"
    try:
        with requests.get(PDF_URL, stream=True, timeout=30) as r:
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_path, PDF_PATH)
    finally:
        # After a successful rename the partial file no longer exists; this
        # only removes the leftover of a failed download.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("Download complete.\n")
|
|
||
# ── Setup ──────────────────────────────────────────────────────────────────────
def _step_header(title: str, *, first: bool = False) -> None:
    """Print a step title framed by two 60-character rules.

    Every header except the first is preceded by one blank line.
    """
    rule = "=" * 60
    print(rule if first else "\n" + rule)
    print(title)
    print(rule)


client = PageIndexClient(workspace=WORKSPACE)

# ── Step 1: Index + Tree ───────────────────────────────────────────────────────
_step_header("Step 1: Indexing PDF and inspecting tree structure", first=True)
doc_id = client.index(PDF_PATH)
print(f"\nDocument ID: {doc_id}")
print("\nTree Structure:")
indexed_doc = client.documents[doc_id]
utils.print_tree(indexed_doc["structure"])

# ── Step 2: Document Metadata ──────────────────────────────────────────────────
_step_header("Step 2: Document Metadata (get_document)")
print(client.get_document(doc_id))

# ── Step 3: Agent Query ────────────────────────────────────────────────────────
_step_header("Step 3: Agent Query (auto tool-use)")
question = "What are the main conclusions of this paper?"
print(f"\nQuestion: '{question}'\n")
answer = query_agent(client, doc_id, question)
print("Answer:", answer, sep="\n")

# ── Step 4: Persistence Verification ──────────────────────────────────────────
_step_header("Step 4: Persistence — reload without re-indexing")
# A second client on the same workspace reloads the saved document, so the
# same doc_id can be queried without calling index() again.
client2 = PageIndexClient(workspace=WORKSPACE)
answer2 = query_agent(client2, doc_id, question, verbose=True)
print("Answer from reloaded client:", answer2, sep="\n")
print("\nPersistence verified. ✓")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,2 +1,4 @@ | ||
| from .page_index import * | ||
| from .page_index_md import md_to_tree | ||
| from .page_index_md import md_to_tree | ||
| from .retrieve import get_document, get_document_structure, get_page_content | ||
| from .client import PageIndexClient |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,118 @@ | ||
| import os | ||
| import uuid | ||
| import json | ||
| import asyncio | ||
| from pathlib import Path | ||
|
|
||
| from .page_index import page_index | ||
| from .page_index_md import md_to_tree | ||
| from .retrieve import get_document, get_document_structure, get_page_content | ||
|
|
||
|
|
||
class PageIndexClient:
    """
    A client for indexing and retrieving document content.
    Flow: index() -> get_document() / get_document_structure() / get_page_content()

    For agent-based QA, see examples/openai_agents_demo.py.

    Attributes:
        model: Model name passed to the indexing functions.
        workspace: Directory (``pathlib.Path``) where indexed documents are
            stored as ``<doc_id>.json``, or ``None`` when persistence is off.
        documents: Mapping of document id to its record dict (``id``, ``path``,
            ``type``, ``structure``, ``doc_name``, ``doc_description``).
    """

    def __init__(self, api_key: str = None, model: str = "gpt-4o-2024-11-20", workspace: str = None):
        """Configure credentials and load any documents saved in ``workspace``.

        Args:
            api_key: If given, exported as ``OPENAI_API_KEY``. Otherwise
                ``CHATGPT_API_KEY`` is copied to ``OPENAI_API_KEY`` when the
                latter is unset.
            model: Model name used for indexing.
            workspace: Optional persistence directory; ``~`` is expanded and
                the directory is created if missing.
        """
        if api_key:
            os.environ["OPENAI_API_KEY"] = api_key
        elif not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"):
            os.environ["OPENAI_API_KEY"] = os.getenv("CHATGPT_API_KEY")
        self.model = model
        self.workspace = Path(workspace).expanduser() if workspace else None
        self.documents = {}
        if self.workspace:
            self.workspace.mkdir(parents=True, exist_ok=True)
            self._load_workspace()

    def index(self, file_path: str, mode: str = "auto") -> str:
        """Index a document. Returns a document_id.

        Args:
            file_path: Path to the file to index.
            mode: ``"pdf"`` or ``"md"`` to force a format, or ``"auto"`` to
                choose from the extension (``.pdf``, ``.md``, ``.markdown``).

        Returns:
            The newly generated document id (a UUID4 string).

        Raises:
            FileNotFoundError: ``file_path`` does not exist.
            ValueError: ``mode`` is not recognised, or ``mode`` is ``"auto"``
                and the extension is not supported.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        # Resolve the effective document type once, so both formats can share
        # the record-building and persistence code below.
        if mode == "auto":
            ext = os.path.splitext(file_path)[1].lower()
            if ext == '.pdf':
                doc_type = 'pdf'
            elif ext in ('.md', '.markdown'):
                doc_type = 'md'
            else:
                raise ValueError(f"Unsupported file format for: {file_path}")
        elif mode in ("pdf", "md"):
            doc_type = mode
        else:
            # Same exception type as before, but naming the real problem
            # rather than blaming the file format.
            raise ValueError(f"Unsupported mode {mode!r}; expected 'auto', 'pdf' or 'md'")

        if doc_type == 'pdf':
            print(f"Indexing PDF: {file_path}")
            result = page_index(
                doc=file_path,
                model=self.model,
                if_add_node_summary='yes',
                if_add_node_text='yes',
                if_add_node_id='yes',
                if_add_doc_description='yes'
            )
        else:
            print(f"Indexing Markdown: {file_path}")
            result = asyncio.run(md_to_tree(
                md_path=file_path,
                if_thinning=False,
                if_add_node_summary='yes',
                summary_token_threshold=200,
                model=self.model,
                if_add_doc_description='yes',
                if_add_node_text='yes',
                if_add_node_id='yes'
            ))

        doc_id = str(uuid.uuid4())
        self.documents[doc_id] = {
            'id': doc_id,
            'path': file_path,
            'type': doc_type,
            'structure': result['structure'],
            'doc_name': result.get('doc_name', ''),
            'doc_description': result.get('doc_description', '')
        }

        print(f"Indexing complete. Document ID: {doc_id}")
        if self.workspace:
            self._save_doc(doc_id)
        return doc_id

    def _save_doc(self, doc_id: str):
        """Write one document record to ``<workspace>/<doc_id>.json``.

        The record is written to a temporary sibling file and then renamed
        over the target, so a failure mid-write cannot leave a truncated
        ``.json`` file in the workspace.
        """
        path = self.workspace / f"{doc_id}.json"
        # The ".tmp" suffix keeps the file out of the "*.json" glob used by
        # _load_workspace().
        tmp_path = path.with_name(path.name + ".tmp")
        try:
            with open(tmp_path, "w", encoding="utf-8") as f:
                json.dump(self.documents[doc_id], f, ensure_ascii=False, indent=2)
            os.replace(tmp_path, path)
        finally:
            # Gone already after a successful rename; only a failed write
            # leaves something to clean up.
            if tmp_path.exists():
                tmp_path.unlink()

    def _load_workspace(self):
        """Load every readable ``*.json`` record in the workspace.

        Each file's stem is used as the document id. Files that cannot be
        read, decoded or parsed, or that do not contain a JSON object, are
        skipped with a warning instead of aborting construction.
        """
        loaded = 0
        for path in self.workspace.glob("*.json"):
            try:
                with open(path, "r", encoding="utf-8") as f:
                    doc = json.load(f)
            except (ValueError, OSError) as e:
                # ValueError covers both json.JSONDecodeError and the
                # UnicodeDecodeError raised for non-UTF-8 files.
                print(f"Warning: skipping corrupt workspace file {path.name}: {e}")
                continue
            if not isinstance(doc, dict):
                print(f"Warning: skipping workspace file {path.name}: expected a JSON object")
                continue
            self.documents[path.stem] = doc
            loaded += 1
        if loaded:
            print(f"Loaded {loaded} document(s) from workspace.")

    def get_document(self, doc_id: str) -> str:
        """Return document metadata JSON."""
        return get_document(self.documents, doc_id)

    def get_document_structure(self, doc_id: str) -> str:
        """Return document tree structure JSON (without text fields)."""
        return get_document_structure(self.documents, doc_id)

    def get_page_content(self, doc_id: str, pages: str) -> str:
        """Return page content for the given pages string (e.g. '5-7', '3,8', '12')."""
        return get_page_content(self.documents, doc_id, pages)
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.