Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,6 @@ log/*
logs/
parts/*
json_results/*
*.egg-info/
dist/
build/
124 changes: 124 additions & 0 deletions pageindex/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import argparse
import asyncio
import json
import os


def main():
    """CLI entry point: process a PDF or Markdown document into a tree structure.

    Parses command-line arguments, dispatches to the PDF or Markdown pipeline,
    and writes the resulting structure to ``./results/<name>_structure.json``.

    Raises:
        ValueError: if neither or both of --pdf_path / --md_path are given,
            if the input file has the wrong extension, or if it does not exist.
    """
    parser = argparse.ArgumentParser(description='Process PDF or Markdown document and generate structure')
    parser.add_argument('--pdf_path', type=str, help='Path to the PDF file')
    parser.add_argument('--md_path', type=str, help='Path to the Markdown file')

    parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20', help='Model to use')
    parser.add_argument('--base-url', type=str, default=None,
                        help='Custom OpenAI-compatible API base URL (e.g. http://localhost:11434/v1 for Ollama)')

    parser.add_argument('--toc-check-pages', type=int, default=20,
                        help='Number of pages to check for table of contents (PDF only)')
    parser.add_argument('--max-pages-per-node', type=int, default=10,
                        help='Maximum number of pages per node (PDF only)')
    parser.add_argument('--max-tokens-per-node', type=int, default=20000,
                        help='Maximum number of tokens per node (PDF only)')

    parser.add_argument('--if-add-node-id', type=str, default='yes',
                        help='Whether to add node id to the node')
    parser.add_argument('--if-add-node-summary', type=str, default='yes',
                        help='Whether to add summary to the node')
    parser.add_argument('--if-add-doc-description', type=str, default='no',
                        help='Whether to add doc description to the doc')
    parser.add_argument('--if-add-node-text', type=str, default='no',
                        help='Whether to add text to the node')

    # Markdown specific arguments
    parser.add_argument('--if-thinning', type=str, default='no',
                        help='Whether to apply tree thinning for markdown (markdown only)')
    parser.add_argument('--thinning-threshold', type=int, default=5000,
                        help='Minimum token threshold for thinning (markdown only)')
    parser.add_argument('--summary-token-threshold', type=int, default=200,
                        help='Token threshold for generating summaries (markdown only)')
    args = parser.parse_args()

    # Exactly one of the two input kinds must be supplied.
    if not args.pdf_path and not args.md_path:
        raise ValueError("Either --pdf_path or --md_path must be specified")
    if args.pdf_path and args.md_path:
        raise ValueError("Only one of --pdf_path or --md_path can be specified")

    if args.pdf_path:
        _process_pdf(args)
    else:
        _process_md(args)


def _save_tree(tree, input_path):
    """Write *tree* as JSON to ./results/<input stem>_structure.json."""
    name = os.path.splitext(os.path.basename(input_path))[0]
    output_dir = './results'
    output_file = f'{output_dir}/{name}_structure.json'
    os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII document text readable in the
        # output (previously only the Markdown branch did this).
        json.dump(tree, f, indent=2, ensure_ascii=False)

    print(f'Tree structure saved to: {output_file}')


def _process_pdf(args):
    """Validate the PDF input, run the PDF pipeline, and save the result."""
    # Validate before the (heavy) project import so bad paths fail fast.
    if not args.pdf_path.lower().endswith('.pdf'):
        raise ValueError("PDF file must have .pdf extension")
    if not os.path.isfile(args.pdf_path):
        raise ValueError(f"PDF file not found: {args.pdf_path}")

    from pageindex import config, page_index_main

    opt = config(
        model=args.model,
        base_url=args.base_url,
        toc_check_page_num=args.toc_check_pages,
        max_page_num_each_node=args.max_pages_per_node,
        max_token_num_each_node=args.max_tokens_per_node,
        if_add_node_id=args.if_add_node_id,
        if_add_node_summary=args.if_add_node_summary,
        if_add_doc_description=args.if_add_doc_description,
        if_add_node_text=args.if_add_node_text,
    )

    toc_with_page_number = page_index_main(args.pdf_path, opt)
    print('Parsing done, saving to file...')
    _save_tree(toc_with_page_number, args.pdf_path)


def _process_md(args):
    """Validate the Markdown input, run the async Markdown pipeline, and save the result."""
    # Validate before the (heavy) project import so bad paths fail fast.
    if not args.md_path.lower().endswith(('.md', '.markdown')):
        raise ValueError("Markdown file must have .md or .markdown extension")
    if not os.path.isfile(args.md_path):
        raise ValueError(f"Markdown file not found: {args.md_path}")

    from pageindex.page_index_md import md_to_tree
    from pageindex.utils import ConfigLoader

    print('Processing markdown file...')

    # Merge CLI flags over the project's config defaults.
    config_loader = ConfigLoader()
    user_opt = {
        'model': args.model,
        'base_url': args.base_url,
        'if_add_node_summary': args.if_add_node_summary,
        'if_add_doc_description': args.if_add_doc_description,
        'if_add_node_text': args.if_add_node_text,
        'if_add_node_id': args.if_add_node_id
    }
    opt = config_loader.load(user_opt)

    toc_with_page_number = asyncio.run(md_to_tree(
        md_path=args.md_path,
        if_thinning=args.if_thinning.lower() == 'yes',
        min_token_threshold=args.thinning_threshold,
        if_add_node_summary=opt.if_add_node_summary,
        summary_token_threshold=args.summary_token_threshold,
        model=opt.model,
        base_url=opt.base_url,
        if_add_doc_description=opt.if_add_doc_description,
        if_add_node_text=opt.if_add_node_text,
        if_add_node_id=opt.if_add_node_id
    ))

    print('Parsing done, saving to file...')
    _save_tree(toc_with_page_number, args.md_path)
1 change: 1 addition & 0 deletions pageindex/config.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
model: "gpt-4o-2024-11-20"
base_url: null
toc_check_page_num: 20
max_page_num_each_node: 10
max_token_num_each_node: 20000
Expand Down
Loading