Skip to content
Open
42 changes: 39 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -147,12 +147,14 @@ You can follow these steps to generate a PageIndex tree from a PDF document.
pip3 install --upgrade -r requirements.txt
```

### 2. Set your OpenAI API key
### 2. Set your API key

Create a `.env` file in the root directory and add your API key:

```bash
CHATGPT_API_KEY=your_openai_key_here
OPENAI_API_KEY=your_openai_key_here
# or
CHATGPT_API_KEY=your_openai_key_here # legacy, still supported
```

### 3. Run PageIndex on your PDF
Expand Down Expand Up @@ -189,7 +191,41 @@ python3 run_pageindex.py --md_path /path/to/your/document.md
> Note: in this function, we use "#" to determine node headings and their levels. For example, "##" is level 2, "###" is level 3, etc. Make sure your markdown file is formatted correctly. If your Markdown file was converted from a PDF or HTML, we don't recommend using this function, since most existing conversion tools cannot preserve the original hierarchy. Instead, use our [PageIndex OCR](https://pageindex.ai/blog/ocr), which is designed to preserve the original hierarchy, to convert the PDF to a markdown file and then use this function.
</details>

<!--
---

# 🐍 Python API

### Index & Retrieve

```python
from pageindex import PageIndexClient

client = PageIndexClient(workspace="~/.pageindex")

# Index a document (PDF or Markdown)
doc_id = client.index("path/to/document.pdf")

# Retrieve
client.get_document(doc_id) # metadata: name, type, page count
client.get_document_structure(doc_id) # full tree structure
client.get_page_content(doc_id, pages="5-7") # page content
```

### Agent-based QA (OpenAI Agents)

For a complete agent QA example using the [OpenAI Agents SDK](https://github.com/openai/openai-agents-python), see [`examples/openai_agents_demo.py`](examples/openai_agents_demo.py).

```bash
# Install optional dependency
pip install openai-agents

# Run the demo
python examples/openai_agents_demo.py
```

---

<!--
# ☁️ Improved Tree Generation with PageIndex OCR

This repo is designed for generating a PageIndex tree structure for simple PDFs, but many real-world use cases involve complex PDFs that are hard to parse with classic Python tools. Extracting high-quality text from such documents remains a non-trivial challenge: most OCR tools only extract page-level content, losing the broader document context and hierarchy.
Expand Down
154 changes: 154 additions & 0 deletions examples/openai_agents_demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
"""
PageIndex x OpenAI Agents Demo

Demonstrates how to use PageIndexClient with the OpenAI Agents SDK
to build a document QA agent with 3 tools:
- get_document()
- get_document_structure()
- get_page_content()

Requirements:
pip install openai-agents

Steps:
1 — Index PDF and inspect tree structure
2 — Inspect document metadata
3 — Ask a question (agent auto-calls tools)
4 — Reload from workspace and verify persistence
"""
import os
import sys
import asyncio
import concurrent.futures
import requests

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from agents import Agent, Runner, function_tool
from agents.stream_events import RunItemStreamEvent

from pageindex import PageIndexClient
import pageindex.utils as utils

# Demo document: the DeepSeek-R1 paper, downloaded to a local cache on first run.
PDF_URL = "https://arxiv.org/pdf/2501.12948.pdf"
PDF_PATH = "tests/pdfs/deepseek-r1.pdf"
# Directory where PageIndexClient persists indexed documents between runs.
WORKSPACE = "~/.pageindex"

# System prompt for the QA agent. NOTE: this is a runtime string sent to the
# model — its exact wording shapes the agent's tool-use behavior.
AGENT_SYSTEM_PROMPT = """
You are PageIndex, a document QA assistant.
TOOL USE:
- Call get_document() first to confirm status and page/line count.
- Call get_document_structure() to find relevant page ranges (use node summaries and start_index/end_index).
- Call get_page_content(pages="5-7") with tight ranges. Never fetch the whole doc.
- For Markdown, pages = line numbers from the structure (the line_num field). Use line_count from get_document() as the upper bound.
ANSWERING: Answer based only on tool output. Be concise.
"""


def query_agent(client: PageIndexClient, doc_id: str, prompt: str, verbose: bool = False) -> str:
    """Run a document QA agent using the OpenAI Agents SDK.

    Args:
        client: PageIndexClient holding the indexed document.
        doc_id: ID of the document to query.
        prompt: User question handed to the agent.
        verbose: If True, stream the run and print each tool call and output.

    Returns:
        The agent's final answer as a string.
    """
    # NOTE(review): the docstrings of the three @function_tool closures below are
    # presumably surfaced to the model as tool descriptions by the Agents SDK, so
    # they are part of the runtime contract — left untouched here.

    @function_tool
    def get_document() -> str:
        """Get document metadata: status, page count, name, and description."""
        return client.get_document(doc_id)

    @function_tool
    def get_document_structure() -> str:
        """Get the document's full tree structure (without text) to find relevant sections."""
        return client.get_document_structure(doc_id)

    @function_tool
    def get_page_content(pages: str) -> str:
        """
        Get the text content of specific pages or line numbers.
        Use tight ranges: e.g. '5-7' for pages 5 to 7, '3,8' for pages 3 and 8, '12' for page 12.
        For Markdown documents, use line numbers from the structure's line_num field.
        """
        return client.get_page_content(doc_id, pages)

    agent = Agent(
        name="PageIndex",
        instructions=AGENT_SYSTEM_PROMPT,
        tools=[get_document, get_document_structure, get_page_content],
        model=client.model,
    )

    async def _run():
        # Non-verbose: run to completion and return only the final answer.
        if not verbose:
            result = await Runner.run(agent, prompt)
            return result.final_output

        # Verbose: stream run items, printing each tool invocation and a
        # truncated preview of its output as they arrive.
        turn = 0
        stream = Runner.run_streamed(agent, prompt)
        async for event in stream.stream_events():
            if not isinstance(event, RunItemStreamEvent):
                continue
            if event.name == "tool_called":
                turn += 1
                raw = event.item.raw_item
                # Raw tool-call items may not expose arguments; default to "{}".
                args = getattr(raw, "arguments", "{}")
                print(f"\n[Turn {turn}] → {raw.name}({args})")
            elif event.name == "tool_output":
                output = str(event.item.output)
                # Cap the printed tool output at 200 characters.
                preview = output[:200] + "..." if len(output) > 200 else output
                print(f"    ← {preview}")
        return stream.final_output

    # If a loop is already running (e.g. inside a notebook), asyncio.run() would
    # raise — drive the coroutine on a fresh loop in a worker thread instead.
    try:
        asyncio.get_running_loop()
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(asyncio.run, _run()).result()
    except RuntimeError:
        # No running event loop: safe to use asyncio.run() directly.
        return asyncio.run(_run())


# ── Download PDF if needed ─────────────────────────────────────────────────────
# Fetch the demo paper once and cache it locally; later runs skip the download.
if not os.path.exists(PDF_PATH):
    print(f"Downloading {PDF_URL} ...")
    os.makedirs(os.path.dirname(PDF_PATH), exist_ok=True)
    with requests.get(PDF_URL, stream=True, timeout=30) as resp:
        # Fail before creating the output file if the HTTP status is an error.
        resp.raise_for_status()
        with open(PDF_PATH, "wb") as out:
            # Stream the body in 8 KiB chunks, skipping keep-alive empty chunks.
            out.writelines(part for part in resp.iter_content(chunk_size=8192) if part)
    print("Download complete.\n")

# ── Setup ──────────────────────────────────────────────────────────────────────
# The client persists indexed documents as JSON files under WORKSPACE.
client = PageIndexClient(workspace=WORKSPACE)

# ── Step 1: Index + Tree ───────────────────────────────────────────────────────
print("=" * 60)
print("Step 1: Indexing PDF and inspecting tree structure")
print("=" * 60)
doc_id = client.index(PDF_PATH)
print(f"\nDocument ID: {doc_id}")
print("\nTree Structure:")
# NOTE(review): reaches into client internals (the documents dict) rather than a
# public accessor — acceptable for a demo, but revisit if the client API changes.
utils.print_tree(client.documents[doc_id]["structure"])

# ── Step 2: Document Metadata ──────────────────────────────────────────────────
print("\n" + "=" * 60)
print("Step 2: Document Metadata (get_document)")
print("=" * 60)
print(client.get_document(doc_id))

# ── Step 3: Agent Query ────────────────────────────────────────────────────────
# Non-verbose run: the agent calls its tools silently and we print the answer.
print("\n" + "=" * 60)
print("Step 3: Agent Query (auto tool-use)")
print("=" * 60)
question = "What are the main conclusions of this paper?"
print(f"\nQuestion: '{question}'\n")
answer = query_agent(client, doc_id, question)
print("Answer:")
print(answer)

# ── Step 4: Persistence Verification ──────────────────────────────────────────
print("\n" + "=" * 60)
print("Step 4: Persistence — reload without re-indexing")
print("=" * 60)
# A fresh client pointed at the same workspace should load the indexed document
# from disk without calling index() again.
client2 = PageIndexClient(workspace=WORKSPACE)
# Reuse the Step 3 question (was a duplicated literal); verbose=True streams
# every tool call so we can see the reloaded client serving the agent.
answer2 = query_agent(client2, doc_id, question, verbose=True)
print("Answer from reloaded client:")
print(answer2)
print("\nPersistence verified. ✓")
4 changes: 3 additions & 1 deletion pageindex/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
from .page_index import *
from .page_index_md import md_to_tree
from .page_index_md import md_to_tree
from .retrieve import get_document, get_document_structure, get_page_content
from .client import PageIndexClient
118 changes: 118 additions & 0 deletions pageindex/client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import os
import uuid
import json
import asyncio
from pathlib import Path

from .page_index import page_index
from .page_index_md import md_to_tree
from .retrieve import get_document, get_document_structure, get_page_content


class PageIndexClient:
    """
    A client for indexing and retrieving document content.

    Flow: index() -> get_document() / get_document_structure() / get_page_content()

    Indexed documents live in ``self.documents`` (doc_id -> record). When a
    workspace directory is given, each record is persisted to
    ``<workspace>/<doc_id>.json`` so a new client pointed at the same workspace
    reloads them automatically.

    For agent-based QA, see examples/openai_agents_demo.py.
    """

    def __init__(self, api_key: str = None, model: str = "gpt-4o-2024-11-20", workspace: str = None):
        """
        Args:
            api_key: OpenAI API key. If omitted, OPENAI_API_KEY from the
                environment is used; the legacy CHATGPT_API_KEY is honored
                as a fallback.
            model: Model name passed through to the indexing pipeline.
            workspace: Optional directory for persisting indexed documents.
        """
        if api_key:
            os.environ["OPENAI_API_KEY"] = api_key
        elif not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"):
            # Backward compatibility with the legacy env var name.
            os.environ["OPENAI_API_KEY"] = os.getenv("CHATGPT_API_KEY")
        self.model = model
        self.workspace = Path(workspace).expanduser() if workspace else None
        self.documents = {}  # doc_id -> document record (see index())
        if self.workspace:
            self.workspace.mkdir(parents=True, exist_ok=True)
            self._load_workspace()

    def index(self, file_path: str, mode: str = "auto") -> str:
        """Index a document and return its document_id.

        Args:
            file_path: Path to a PDF or Markdown file.
            mode: "pdf", "md", or "auto" (detect from the file extension).

        Returns:
            The generated document_id (a UUID4 string).

        Raises:
            FileNotFoundError: If file_path does not exist.
            ValueError: If the file format is not supported.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        doc_id = str(uuid.uuid4())
        ext = os.path.splitext(file_path)[1].lower()

        is_pdf = ext == '.pdf'
        is_md = ext in ['.md', '.markdown']

        if mode == "pdf" or (mode == "auto" and is_pdf):
            print(f"Indexing PDF: {file_path}")
            result = page_index(
                doc=file_path,
                model=self.model,
                if_add_node_summary='yes',
                if_add_node_text='yes',
                if_add_node_id='yes',
                if_add_doc_description='yes'
            )
            doc_type = 'pdf'
        elif mode == "md" or (mode == "auto" and is_md):
            print(f"Indexing Markdown: {file_path}")
            # NOTE(review): asyncio.run() raises if an event loop is already
            # running — confirm index() is only called from synchronous code.
            result = asyncio.run(md_to_tree(
                md_path=file_path,
                if_thinning=False,
                if_add_node_summary='yes',
                summary_token_threshold=200,
                model=self.model,
                if_add_doc_description='yes',
                if_add_node_text='yes',
                if_add_node_id='yes'
            ))
            doc_type = 'md'
        else:
            raise ValueError(f"Unsupported file format for: {file_path}")

        # Single record shape shared by both formats (previously duplicated
        # verbatim in each branch).
        self.documents[doc_id] = {
            'id': doc_id,
            'path': file_path,
            'type': doc_type,
            'structure': result['structure'],
            'doc_name': result.get('doc_name', ''),
            'doc_description': result.get('doc_description', '')
        }

        print(f"Indexing complete. Document ID: {doc_id}")
        if self.workspace:
            self._save_doc(doc_id)
        return doc_id

    def _save_doc(self, doc_id: str):
        """Persist one document record to <workspace>/<doc_id>.json."""
        path = self.workspace / f"{doc_id}.json"
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.documents[doc_id], f, ensure_ascii=False, indent=2)

    def _load_workspace(self):
        """Load all persisted document records from the workspace directory."""
        loaded = 0
        for path in self.workspace.glob("*.json"):
            try:
                with open(path, "r", encoding="utf-8") as f:
                    doc = json.load(f)
                # The filename stem is the doc_id (see _save_doc).
                self.documents[path.stem] = doc
                loaded += 1
            except (json.JSONDecodeError, OSError) as e:
                # Best-effort: skip unreadable files rather than failing startup.
                print(f"Warning: skipping corrupt workspace file {path.name}: {e}")
        if loaded:
            print(f"Loaded {loaded} document(s) from workspace.")

    def get_document(self, doc_id: str) -> str:
        """Return document metadata JSON."""
        return get_document(self.documents, doc_id)

    def get_document_structure(self, doc_id: str) -> str:
        """Return document tree structure JSON (without text fields)."""
        return get_document_structure(self.documents, doc_id)

    def get_page_content(self, doc_id: str, pages: str) -> str:
        """Return page content for the given pages string (e.g. '5-7', '3,8', '12')."""
        return get_page_content(self.documents, doc_id, pages)
Loading