Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -174,9 +174,33 @@ You can customize the processing with additional optional arguments:
--if-add-node-id Add node ID (yes/no, default: yes)
--if-add-node-summary Add node summary (yes/no, default: yes)
--if-add-doc-description Add doc description (yes/no, default: yes)
--enable-profile Enable end-to-end runtime and memory profiling
--profile-output Output path for profile report JSON (default: ./results/profile_report.json)
```
</details>

<details>
<summary><strong>Runtime and memory profiling under heavy load</strong></summary>
<br>
Use profiling mode to measure end-to-end runtime and peak memory when processing larger PDFs or running repeated load tests.

```bash
python3 run_pageindex.py \
--pdf_path /path/to/your/large-document.pdf \
--max-pages-per-node 20 \
--max-tokens-per-node 30000 \
--enable-profile \
--profile-output ./results/heavy-load-profile.json
```

The generated JSON report includes:
- `elapsed_seconds`: total wall-clock runtime
- `peak_memory_mb`: Python peak memory during the run (tracemalloc)
- `rss_mb`: max resident memory of the process (when supported by OS)

Use this output to compare tuning changes (e.g., `--max-pages-per-node`, model choice, or input-size buckets) and identify memory pressure regressions.
</details>

<details>
<summary><strong>Markdown support</strong></summary>
<br>
Expand Down
63 changes: 63 additions & 0 deletions pageindex/profiling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from __future__ import annotations

import contextlib
import json
import os
import sys
import time
import tracemalloc
from dataclasses import asdict, dataclass
from pathlib import Path


@dataclass
class ProfileReport:
    """Measurements collected for one profiled run (see ``profile_run``)."""

    # Wall-clock duration of the profiled block, in seconds.
    elapsed_seconds: float
    # Peak Python-allocated memory during the run (tracemalloc), in MB.
    peak_memory_mb: float
    # Max resident set size of the process in MB, or None when the platform
    # does not expose it (e.g. Windows, where ``resource`` is unavailable).
    rss_mb: float | None


@contextlib.contextmanager
def profile_run(enabled: bool = False):
    """Context manager measuring wall-clock time and memory of the enclosed block.

    When *enabled* is False the body runs with no profiling overhead and no
    report is produced.  When True, a :class:`ProfileReport` is stashed on
    ``profile_run.last_report`` for the caller to pick up after the ``with``
    block exits (the ``with`` target itself is always ``None``).  The report
    is written even if the body raises, via ``finally``.
    """
    if not enabled:
        yield None
        return

    # Only start/stop tracemalloc ourselves if no outer profiler is already
    # tracing, so nesting inside another tracemalloc user does not reset it.
    started_here = not tracemalloc.is_tracing()
    if started_here:
        tracemalloc.start()
    start = time.perf_counter()
    try:
        yield None
    finally:
        elapsed = time.perf_counter() - start
        _current, peak = tracemalloc.get_traced_memory()
        if started_here:
            tracemalloc.stop()

        rss_mb = None
        try:
            import resource  # Unix-only; absent on Windows.

            ru_maxrss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
            # ru_maxrss is kilobytes on Linux but bytes on macOS.  Decide by
            # platform rather than by magnitude: the old ">10_000 MB means
            # bytes" heuristic silently misreported genuine >10 GB Linux
            # runs -- exactly the heavy-load scenario this feature targets.
            if sys.platform == "darwin":
                rss_mb = ru_maxrss / (1024.0 * 1024.0)
            else:
                rss_mb = ru_maxrss / 1024.0
        except Exception:
            rss_mb = None

        # Stash the report on the function object for caller retrieval.
        profile_run.last_report = ProfileReport(
            elapsed_seconds=round(elapsed, 3),
            peak_memory_mb=round(peak / (1024.0 * 1024.0), 3),
            rss_mb=round(rss_mb, 3) if rss_mb is not None else None,
        )


def write_profile_report(output_path: str | os.PathLike[str], report: ProfileReport) -> Path:
    """Serialize *report* as pretty-printed JSON at *output_path*.

    Missing parent directories are created.  Returns the ``Path`` written.
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(asdict(report), indent=2)
    destination.write_text(payload, encoding="utf-8")
    return destination


# Initialize the stash that profile_run() uses to hand the most recent
# ProfileReport back to callers; stays None until the first profiled run.
profile_run.last_report = None
186 changes: 94 additions & 92 deletions run_pageindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import json
from pageindex import *
from pageindex.page_index_md import md_to_tree
from pageindex.profiling import profile_run, write_profile_report

if __name__ == "__main__":
# Set up argument parser
Expand All @@ -12,7 +13,7 @@

parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20', help='Model to use')

parser.add_argument('--toc-check-pages', type=int, default=20,
parser.add_argument('--toc-check-pages', type=int, default=20,
help='Number of pages to check for table of contents (PDF only)')
parser.add_argument('--max-pages-per-node', type=int, default=10,
help='Maximum number of pages per node (PDF only)')
Expand All @@ -27,107 +28,108 @@
help='Whether to add doc description to the doc')
parser.add_argument('--if-add-node-text', type=str, default='no',
help='Whether to add text to the node')

# Markdown specific arguments
parser.add_argument('--if-thinning', type=str, default='no',
help='Whether to apply tree thinning for markdown (markdown only)')
parser.add_argument('--thinning-threshold', type=int, default=5000,
help='Minimum token threshold for thinning (markdown only)')
parser.add_argument('--summary-token-threshold', type=int, default=200,
help='Token threshold for generating summaries (markdown only)')

# Profiling arguments
parser.add_argument('--enable-profile', action='store_true',
help='Enable CPU time and memory profiling for the end-to-end run')
parser.add_argument('--profile-output', type=str, default='./results/profile_report.json',
help='Path to write profile report JSON when --enable-profile is set')

args = parser.parse_args()

# Validate that exactly one file type is specified
if not args.pdf_path and not args.md_path:
raise ValueError("Either --pdf_path or --md_path must be specified")
if args.pdf_path and args.md_path:
raise ValueError("Only one of --pdf_path or --md_path can be specified")

if args.pdf_path:
# Validate PDF file
if not args.pdf_path.lower().endswith('.pdf'):
raise ValueError("PDF file must have .pdf extension")
if not os.path.isfile(args.pdf_path):
raise ValueError(f"PDF file not found: {args.pdf_path}")

# Process PDF file
# Configure options
opt = config(
model=args.model,
toc_check_page_num=args.toc_check_pages,
max_page_num_each_node=args.max_pages_per_node,
max_token_num_each_node=args.max_tokens_per_node,
if_add_node_id=args.if_add_node_id,
if_add_node_summary=args.if_add_node_summary,
if_add_doc_description=args.if_add_doc_description,
if_add_node_text=args.if_add_node_text
)

# Process the PDF
toc_with_page_number = page_index_main(args.pdf_path, opt)
print('Parsing done, saving to file...')

# Save results
pdf_name = os.path.splitext(os.path.basename(args.pdf_path))[0]
output_dir = './results'
output_file = f'{output_dir}/{pdf_name}_structure.json'
os.makedirs(output_dir, exist_ok=True)

with open(output_file, 'w', encoding='utf-8') as f:
json.dump(toc_with_page_number, f, indent=2)

print(f'Tree structure saved to: {output_file}')

elif args.md_path:
# Validate Markdown file
if not args.md_path.lower().endswith(('.md', '.markdown')):
raise ValueError("Markdown file must have .md or .markdown extension")
if not os.path.isfile(args.md_path):
raise ValueError(f"Markdown file not found: {args.md_path}")

# Process markdown file
print('Processing markdown file...')

# Process the markdown
import asyncio

# Use ConfigLoader to get consistent defaults (matching PDF behavior)
from pageindex.utils import ConfigLoader
config_loader = ConfigLoader()

# Create options dict with user args
user_opt = {
'model': args.model,
'if_add_node_summary': args.if_add_node_summary,
'if_add_doc_description': args.if_add_doc_description,
'if_add_node_text': args.if_add_node_text,
'if_add_node_id': args.if_add_node_id
}

# Load config with defaults from config.yaml
opt = config_loader.load(user_opt)

toc_with_page_number = asyncio.run(md_to_tree(
md_path=args.md_path,
if_thinning=args.if_thinning.lower() == 'yes',
min_token_threshold=args.thinning_threshold,
if_add_node_summary=opt.if_add_node_summary,
summary_token_threshold=args.summary_token_threshold,
model=opt.model,
if_add_doc_description=opt.if_add_doc_description,
if_add_node_text=opt.if_add_node_text,
if_add_node_id=opt.if_add_node_id
))

print('Parsing done, saving to file...')

# Save results
md_name = os.path.splitext(os.path.basename(args.md_path))[0]
output_dir = './results'
output_file = f'{output_dir}/{md_name}_structure.json'
os.makedirs(output_dir, exist_ok=True)

with open(output_file, 'w', encoding='utf-8') as f:
json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)

print(f'Tree structure saved to: {output_file}')

# NOTE(review): the original indentation of this script body was lost in the
# diff extraction -- the `with`/`if` nesting below must be restored against
# the repository copy before this text is runnable.  Comments only added here.
# Wrap the whole end-to-end run (validation, parsing, saving) so the profile
# report covers the same work the user actually waits on.
with profile_run(args.enable_profile):
if args.pdf_path:
# Validate PDF file
if not args.pdf_path.lower().endswith('.pdf'):
raise ValueError("PDF file must have .pdf extension")
if not os.path.isfile(args.pdf_path):
raise ValueError(f"PDF file not found: {args.pdf_path}")

# Process PDF file
opt = config(
model=args.model,
toc_check_page_num=args.toc_check_pages,
max_page_num_each_node=args.max_pages_per_node,
max_token_num_each_node=args.max_tokens_per_node,
if_add_node_id=args.if_add_node_id,
if_add_node_summary=args.if_add_node_summary,
if_add_doc_description=args.if_add_doc_description,
if_add_node_text=args.if_add_node_text
)

toc_with_page_number = page_index_main(args.pdf_path, opt)
print('Parsing done, saving to file...')

# Save the tree next to other results, named after the input PDF.
pdf_name = os.path.splitext(os.path.basename(args.pdf_path))[0]
output_dir = './results'
output_file = f'{output_dir}/{pdf_name}_structure.json'
os.makedirs(output_dir, exist_ok=True)

with open(output_file, 'w', encoding='utf-8') as f:
json.dump(toc_with_page_number, f, indent=2)

print(f'Tree structure saved to: {output_file}')

elif args.md_path:
if not args.md_path.lower().endswith(('.md', '.markdown')):
raise ValueError("Markdown file must have .md or .markdown extension")
if not os.path.isfile(args.md_path):
raise ValueError(f"Markdown file not found: {args.md_path}")

print('Processing markdown file...')

import asyncio
from pageindex.utils import ConfigLoader

# ConfigLoader fills unspecified options from config.yaml so markdown runs
# get the same defaults as PDF runs.
config_loader = ConfigLoader()
user_opt = {
'model': args.model,
'if_add_node_summary': args.if_add_node_summary,
'if_add_doc_description': args.if_add_doc_description,
'if_add_node_text': args.if_add_node_text,
'if_add_node_id': args.if_add_node_id
}
opt = config_loader.load(user_opt)

toc_with_page_number = asyncio.run(md_to_tree(
md_path=args.md_path,
if_thinning=args.if_thinning.lower() == 'yes',
min_token_threshold=args.thinning_threshold,
if_add_node_summary=opt.if_add_node_summary,
summary_token_threshold=args.summary_token_threshold,
model=opt.model,
if_add_doc_description=opt.if_add_doc_description,
if_add_node_text=opt.if_add_node_text,
if_add_node_id=opt.if_add_node_id
))

print('Parsing done, saving to file...')

md_name = os.path.splitext(os.path.basename(args.md_path))[0]
output_dir = './results'
output_file = f'{output_dir}/{md_name}_structure.json'
os.makedirs(output_dir, exist_ok=True)

with open(output_file, 'w', encoding='utf-8') as f:
json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)

print(f'Tree structure saved to: {output_file}')

# Runs only on the non-exception path: profile_run records last_report in a
# finally, but an exception above propagates past this point.
if args.enable_profile and profile_run.last_report is not None:
profile_path = write_profile_report(args.profile_output, profile_run.last_report)
print(f'Profile report saved to: {profile_path}')
print(f'Profile summary: {profile_run.last_report}')
29 changes: 29 additions & 0 deletions tests/test_profiling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import json
from pathlib import Path

from pageindex.profiling import profile_run, write_profile_report


def test_profile_run_collects_report():
    """An enabled profile_run must leave a populated report behind."""
    profile_run.last_report = None

    with profile_run(True):
        _ = sum(i for i in range(10_000))

    captured = profile_run.last_report
    assert captured is not None
    assert captured.elapsed_seconds >= 0
    assert captured.peak_memory_mb >= 0


def test_write_profile_report(tmp_path: Path):
    """The written JSON file must expose every report field by name."""
    profile_run.last_report = None
    with profile_run(True):
        _ = [str(i) for i in range(1000)]

    out = tmp_path / "profile.json"
    write_profile_report(out, profile_run.last_report)

    payload = json.loads(out.read_text(encoding="utf-8"))
    for key in ("elapsed_seconds", "peak_memory_mb", "rss_mb"):
        assert key in payload