From 80a06dbc9e818ec2c57be46f17cdfe962e7ba7ea Mon Sep 17 00:00:00 2001 From: sumedhkumar Date: Mon, 2 Mar 2026 12:57:57 +0530 Subject: [PATCH] fix: resolve 12 bugs and add 6 improvements to PageIndex Bug Fixes (3 critical, 9 standard): - Add missing `import re` causing NameError on page extraction - Fix variable shadowing in fix_incorrect_toc corrupting TOC entries - Fix ChatGPT_API_with_finish_reason returning wrong type on error - Prevent chat_history mutation across API retries - Add infinite loop guard to extract_toc_content (max 10 attempts) - Fix get_leaf_nodes KeyError on nodes without 'nodes' key - Fix extract_json corrupting text containing word "None" - Fix process_none_page_numbers KeyError on missing 'page' key - Fix count_tokens crash when model=None - Replace bare except with except ImportError in page_index_md.py - Remove duplicate imports (logging, os) - Fix add_page_number_to_toc receiving list instead of string Improvements: - Add pyproject.toml for proper Python packaging and pip install - Add 45-test automated test suite (pytest) for utils and markdown - Add exponential backoff with jitter for API retries - Add optional progress_callback for long-running processing - Add optional in-memory LLM response caching - Allow doc_description generation independent of node summaries Made-with: Cursor --- IMPROVEMENTS.md | 230 +++++++++++++++++++++++++++++++++++ pageindex/config.yaml | 6 +- pageindex/page_index.py | 76 ++++++++---- pageindex/page_index_md.py | 2 +- pageindex/utils.py | 81 ++++++++++--- pyproject.toml | 56 +++++++++ run_pageindex.py | 9 +- tests/__init__.py | 2 + tests/test_page_index_md.py | 118 ++++++++++++++++++ tests/test_utils.py | 235 ++++++++++++++++++++++++++++++++++++ 10 files changed, 770 insertions(+), 45 deletions(-) create mode 100644 IMPROVEMENTS.md create mode 100644 pyproject.toml create mode 100644 tests/__init__.py create mode 100644 tests/test_page_index_md.py create mode 100644 tests/test_utils.py diff --git 
a/IMPROVEMENTS.md b/IMPROVEMENTS.md new file mode 100644 index 000000000..6190d9b0b --- /dev/null +++ b/IMPROVEMENTS.md @@ -0,0 +1,230 @@ +# PageIndex: Bug Fixes & Improvements Report + +## Overview + +This document summarizes all changes made to the PageIndex repository. The work covers **12 bug fixes** (3 critical, 9 standard) and **6 feature improvements**, all verified with a new 45-test automated test suite. + +No behavioral changes were made to the core LLM prompts or retrieval logic. All fixes target correctness, robustness, and developer experience. + +--- + +## Critical Bug Fixes + +### 1. Missing `import re` in `utils.py` + +**Problem:** `utils.py` used `re.search()` and `re.finditer()` in `get_first_start_page_from_text()` and `get_last_start_page_from_text()` but never imported the `re` module. Any call to these functions would crash with `NameError: name 're' is not defined`. + +**Fix:** Added `import re` to the top-level imports. + +**Impact:** These functions are called during PDF page tag extraction. Without this fix, any PDF with page index tags would fail at runtime. + +--- + +### 2. Variable shadowing in `fix_incorrect_toc()` corrupts results + +**Problem:** In `fix_incorrect_toc()` > `process_and_check_item()`, the variable `list_index` (the TOC entry index) was overwritten inside a loop that iterates over page ranges: + +```python +list_index = incorrect_item['list_index'] # correct TOC index +for page_index in range(prev_correct, next_correct+1): + list_index = page_index - start_index # OVERWRITES with page offset +``` + +The returned `list_index` would be wrong, causing the fix to update the wrong TOC entry silently. + +**Fix:** Renamed the inner loop variable to `page_list_index`. + +**Impact:** Without this fix, TOC correction could place fixes at wrong indices, producing silently corrupted output for any document where initial verification fails and the fix path is triggered. + +--- + +### 3. 
`ChatGPT_API_with_finish_reason` returns wrong type on error + +**Problem:** On success the function returns a tuple `(content, finish_reason)`, but on max retries exhaustion it returned just the string `"Error"`. Any caller that unpacks like `response, reason = ChatGPT_API_with_finish_reason(...)` would crash with `ValueError: not enough values to unpack`. + +**Fix:** Changed to return `("Error", "error")` on failure. + +**Impact:** Every call site that unpacks this return value (`extract_toc_content`, `toc_transformer`, `generate_toc_init`, `generate_toc_continue`) would crash if the API failed 10 times consecutively. + +--- + +## Standard Bug Fixes + +### 4. `chat_history` mutation across retries + +**Problem:** Both `ChatGPT_API_with_finish_reason` and `ChatGPT_API` assigned `messages = chat_history` then appended to it. This mutated the caller's original list and accumulated duplicate messages across retries. + +**Fix:** Changed to `messages = list(chat_history)` to create a shallow copy. + +--- + +### 5. Infinite loop guard in `extract_toc_content` was non-functional + +**Problem:** The guard `if len(chat_history) > 5` could never trigger because `chat_history` was recreated with exactly 2 items at the start of each loop iteration. + +**Fix:** Added an external `attempt` counter that increments each iteration and breaks at `max_attempts = 10`. + +--- + +### 6. `get_leaf_nodes` crashes on nodes without `nodes` key + +**Problem:** `if not structure['nodes']` raises `KeyError` when a dict has no `nodes` key (common for leaf nodes in certain tree states). + +**Fix:** Changed to `if not structure.get('nodes')`. + +--- + +### 7. `extract_json` corrupts text containing the word "None" + +**Problem:** `json_content.replace('None', 'null')` was a global string replace. A title like `"None of the Above"` would become `"null of the Above"`. 
+ +**Fix:** Replaced with context-aware regex that only substitutes `None` in JSON value positions (after `:`, `[`, `,`), not inside quoted strings. + +--- + +### 8. `process_none_page_numbers` crashes on missing `page` key + +**Problem:** `del item_copy['page']` and `del item['page']` would raise `KeyError` if the key doesn't exist. + +**Fix:** Changed to `item_copy.pop('page', None)` and `item.pop('page', None)`. + +--- + +### 9. `count_tokens` crashes when `model=None` + +**Problem:** `tiktoken.encoding_for_model(None)` raises an error. Multiple call sites pass `model=None`. + +**Fix:** Added `model = model or "gpt-4o"` as default fallback. + +--- + +### 10. Bare `except:` in `page_index_md.py` + +**Problem:** `except:` catches everything including `SystemExit` and `KeyboardInterrupt`, making it impossible to interrupt the process cleanly. + +**Fix:** Changed to `except ImportError:`. + +--- + +### 11. Duplicate imports + +**Problem:** `import logging` appeared twice in `utils.py`; `import os` appeared twice in `page_index.py`. + +**Fix:** Removed the duplicates. + +--- + +### 12. `add_page_number_to_toc` called with list instead of string + +**Problem:** `process_none_page_numbers` passed `page_contents` (a Python list) to `add_page_number_to_toc`, which expects a string for its `part` parameter. Python would convert the list to its repr `['...', '...']` in the f-string prompt, sending garbled text to the LLM. Also added a guard for empty/invalid `result` before indexing. + +**Fix:** Changed to `''.join(page_contents)` and added `result and isinstance(result[0].get(...))` check. + +--- + +## Feature Improvements + +### 1. 
`pyproject.toml` for proper Python packaging + +Added a standard `pyproject.toml` with: +- Package metadata (name, version, description, license, classifiers) +- Dependencies mirroring `requirements.txt` (with `>=` instead of `==` for flexibility) +- Optional `[dev]` dependencies (pytest, pytest-asyncio) +- Console script entry point: `run_pageindex` command +- Setuptools configuration to include both `pageindex/` package and root `run_pageindex.py` + +Also added a `main()` function wrapper in `run_pageindex.py` so the console script entry point works. + +**Benefit:** Users can `pip install .` or eventually `pip install pageindex` from PyPI. + +--- + +### 2. Automated test suite (45 tests) + +Created two test files: +- `tests/test_utils.py` (36 tests) -- covers `count_tokens`, `extract_json`, `get_leaf_nodes`, `get_first_start_page_from_text`, `get_last_start_page_from_text`, `sanitize_filename`, `list_to_tree`, `add_preface_if_needed`, `remove_fields`, `reorder_dict`, `convert_physical_index_to_int`, `convert_page_to_int`, `write_node_id`, `get_nodes`, `structure_to_list` +- `tests/test_page_index_md.py` (9 tests) -- covers `extract_nodes_from_markdown`, `extract_node_text_content`, `build_tree_from_nodes`, `clean_tree_for_output` + +All 45 tests pass on Python 3.14 with pytest. + +**Benefit:** Catches regressions automatically. Several of the bugs fixed above would have been caught immediately by these tests. + +--- + +### 3. Exponential backoff with jitter for API retries + +Replaced fixed `time.sleep(1)` in all three API functions (`ChatGPT_API`, `ChatGPT_API_with_finish_reason`, `ChatGPT_API_async`) with `_retry_delay(attempt)` that computes: + +``` +delay = min(base * 2^attempt, max_delay) + random(0, 1) +``` + +Delays: ~1.5s, ~2.7s, ~4.8s, ~8.5s, ... capped at ~60s. + +**Benefit:** Better resilience to API rate limits. Avoids hammering the API with fixed 1-second retries under load. + +--- + +### 4. 
Progress reporting for long-running processing + +Added an optional `progress_callback(stage, message, **kwargs)` parameter to `page_index()` and to `config.yaml`. When provided, it is called at key stages: + +| Stage | When | +|-------|------| +| `pages_loaded` | After PDF page extraction | +| `toc_detection` | Starting TOC detection | +| `verify_toc` | Verifying section positions | +| `build_tree` | Building the document tree | +| `node_ids` | Assigning node IDs | +| `summaries` | Generating node summaries | +| `description` | Generating document description | + +Usage: `page_index(doc, progress_callback=lambda stage, msg, **kw: print(f"[{stage}] {msg}"))` + +**Benefit:** Users processing large PDFs (which can take minutes with dozens of LLM calls) get visibility into progress. + +--- + +### 5. Optional LLM response caching + +Added in-memory response caching keyed on `(model, sha256(prompt + chat_history))`. Enable via: +- `page_index(doc, response_cache=True)` or +- `response_cache: true` in `config.yaml` + +Cache is automatically cleaned up after `page_index_main()` returns. Also exposed `set_response_cache(True/False)` for manual control. + +**Benefit:** Avoids redundant API calls during retries or re-processing of the same document. Reduces cost and latency during development. + +--- + +### 6. `doc_description` works independently of node summaries + +**Problem:** In `page_index_main()`, the `if_add_doc_description` check was nested inside the `if_add_node_summary == 'yes'` block. Setting `if_add_doc_description='yes'` with `if_add_node_summary='no'` would silently skip description generation. + +**Fix:** Moved `if_add_doc_description` to a separate top-level check after the summary block. The result dict is now built first, then description is conditionally added. + +**Benefit:** Users can generate document descriptions without the cost of per-node summaries. 
+ +--- + +## Files Changed + +| File | Changes | +|------|---------| +| `pageindex/utils.py` | Bug fixes 1, 3, 4, 6, 7, 9, 11; Features 3, 5 | +| `pageindex/page_index.py` | Bug fixes 2, 5, 8, 11, 12; Features 4, 6 | +| `pageindex/page_index_md.py` | Bug fix 10 | +| `run_pageindex.py` | Feature 1 (added `main()` wrapper) | +| `pageindex/config.yaml` | Features 4, 5 (new config keys) | +| `pyproject.toml` | Feature 1 (new file) | +| `tests/__init__.py` | Feature 2 (new file) | +| `tests/test_utils.py` | Feature 2 (new file, 36 tests) | +| `tests/test_page_index_md.py` | Feature 2 (new file, 9 tests) | + +## Test Results + +``` +45 passed, 0 failed (Python 3.14.0, pytest 9.0.2) +``` + +All imports verified, CLI `--help` verified, new features (backoff, caching, config) independently verified. diff --git a/pageindex/config.yaml b/pageindex/config.yaml index fd73e3a2c..ea1152df7 100644 --- a/pageindex/config.yaml +++ b/pageindex/config.yaml @@ -5,4 +5,8 @@ max_token_num_each_node: 20000 if_add_node_id: "yes" if_add_node_summary: "yes" if_add_doc_description: "no" -if_add_node_text: "no" \ No newline at end of file +if_add_node_text: "no" +# Optional: progress_callback(stage, message, **kwargs) for long-running PDF processing +progress_callback: null +# Optional: set to true to cache LLM responses in memory (faster re-runs, lower API cost) +response_cache: false \ No newline at end of file diff --git a/pageindex/page_index.py b/pageindex/page_index.py index 39018c4df..79b71f7f2 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -5,10 +5,19 @@ import random import re from .utils import * -import os from concurrent.futures import ThreadPoolExecutor, as_completed +def _progress_report(opt, stage, message, **kwargs): + """Call optional progress_callback(stage, message, **kwargs) if set on opt.""" + callback = getattr(opt, "progress_callback", None) + if callable(callback): + try: + callback(stage, message, **kwargs) + except Exception: + pass + + 
################### check title in page ######################################################### async def check_title_appearance(item, page_list, start_index=1, model=None): title=item['title'] @@ -180,20 +189,21 @@ def extract_toc_content(content, model=None): response = response + new_response if_complete = check_if_toc_transformation_is_complete(content, response, model) + max_attempts = 10 + attempt = 0 while not (if_complete == "yes" and finish_reason == "finished"): + attempt += 1 + if attempt > max_attempts: + raise Exception('Failed to complete table of contents after maximum retries') chat_history = [ - {"role": "user", "content": prompt}, - {"role": "assistant", "content": response}, + {"role": "user", "content": prompt}, + {"role": "assistant", "content": response}, ] prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure""" new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history) response = response + new_response if_complete = check_if_toc_transformation_is_complete(content, response, model) - - # Optional: Add a maximum retry limit to prevent infinite loops - if len(chat_history) > 5: # Arbitrary limit of 10 attempts - raise Exception('Failed to complete table of contents after maximum retries') - + return response def detect_page_index(toc_content, model=None): @@ -674,11 +684,11 @@ def process_none_page_numbers(toc_items, page_list, start_index=1, model=None): continue item_copy = copy.deepcopy(item) - del item_copy['page'] - result = add_page_number_to_toc(page_contents, item_copy, model) - if isinstance(result[0]['physical_index'], str) and result[0]['physical_index'].startswith('').strip()) - del item['page'] + item.pop('page', None) return toc_items @@ -804,9 +814,9 @@ async def process_and_check_item(incorrect_item): page_contents=[] for page_index in range(prev_correct, next_correct+1): # Add bounds checking to prevent 
IndexError - list_index = page_index - start_index - if list_index >= 0 and list_index < len(page_list): - page_text = f"\n{page_list[list_index][0]}\n\n\n" + page_list_index = page_index - start_index + if page_list_index >= 0 and page_list_index < len(page_list): + page_text = f"\n{page_list[page_list_index][0]}\n\n\n" page_contents.append(page_text) else: continue @@ -1019,6 +1029,7 @@ async def process_large_node_recursively(node, page_list, opt=None, logger=None) return node async def tree_parser(page_list, opt, doc=None, logger=None): + _progress_report(opt, "toc_detection", "Detecting table of contents...") check_toc_result = check_toc(page_list, opt) logger.info(check_toc_result) @@ -1039,12 +1050,14 @@ async def tree_parser(page_list, opt, doc=None, logger=None): opt=opt, logger=logger) + _progress_report(opt, "verify_toc", "Verifying section positions...") toc_with_page_number = add_preface_if_needed(toc_with_page_number) toc_with_page_number = await check_title_appearance_in_start_concurrent(toc_with_page_number, page_list, model=opt.model, logger=logger) # Filter out items with None physical_index before post_processings valid_toc_items = [item for item in toc_with_page_number if item.get('physical_index') is not None] + _progress_report(opt, "build_tree", "Building document tree...", total_pages=len(page_list)) toc_tree = post_processing(valid_toc_items, len(page_list)) tasks = [ process_large_node_recursively(node, page_list, opt, logger=logger) @@ -1056,6 +1069,16 @@ async def tree_parser(page_list, opt, doc=None, logger=None): def page_index_main(doc, opt=None): + from pageindex.utils import set_response_cache + if opt is not None and getattr(opt, "response_cache", False): + set_response_cache(True) + try: + return _page_index_main_impl(doc, opt) + finally: + set_response_cache(False) + + +def _page_index_main_impl(doc, opt=None): logger = JsonLogger(doc) is_valid_pdf = ( @@ -1066,42 +1089,43 @@ def page_index_main(doc, opt=None): raise 
ValueError("Unsupported input type. Expected a PDF file path or BytesIO object.") print('Parsing PDF...') + _progress_report(opt, "pages_loaded", "Extracting pages...") page_list = get_page_tokens(doc) + _progress_report(opt, "pages_loaded", f"Extracted {len(page_list)} pages", total_pages=len(page_list)) logger.info({'total_page_number': len(page_list)}) logger.info({'total_token': sum([page[1] for page in page_list])}) async def page_index_builder(): structure = await tree_parser(page_list, opt, doc=doc, logger=logger) + _progress_report(opt, "node_ids", "Assigning node IDs...") if opt.if_add_node_id == 'yes': write_node_id(structure) if opt.if_add_node_text == 'yes': add_node_text(structure, page_list) if opt.if_add_node_summary == 'yes': + _progress_report(opt, "summaries", "Generating node summaries...") if opt.if_add_node_text == 'no': add_node_text(structure, page_list) await generate_summaries_for_structure(structure, model=opt.model) if opt.if_add_node_text == 'no': remove_structure_text(structure) - if opt.if_add_doc_description == 'yes': - # Create a clean structure without unnecessary fields for description generation - clean_structure = create_clean_structure_for_description(structure) - doc_description = generate_doc_description(clean_structure, model=opt.model) - return { - 'doc_name': get_pdf_name(doc), - 'doc_description': doc_description, - 'structure': structure, - } - return { + result = { 'doc_name': get_pdf_name(doc), 'structure': structure, } + if opt.if_add_doc_description == 'yes': + _progress_report(opt, "description", "Generating document description...") + clean_structure = create_clean_structure_for_description(structure) + result['doc_description'] = generate_doc_description(clean_structure, model=opt.model) + return result return asyncio.run(page_index_builder()) def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None, - if_add_node_id=None, if_add_node_summary=None, 
if_add_doc_description=None, if_add_node_text=None): + if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None, + progress_callback=None, response_cache=None): user_opt = { arg: value for arg, value in locals().items() diff --git a/pageindex/page_index_md.py b/pageindex/page_index_md.py index 70e8de086..a0ec7a9ed 100644 --- a/pageindex/page_index_md.py +++ b/pageindex/page_index_md.py @@ -4,7 +4,7 @@ import os try: from .utils import * -except: +except ImportError: from utils import * async def get_node_summary(node, summary_token_threshold=200, model=None): diff --git a/pageindex/utils.py b/pageindex/utils.py index dc7acd888..41e9ff14a 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -5,6 +5,8 @@ from datetime import datetime import time import json +import re +import random import PyPDF2 import copy import asyncio @@ -12,27 +14,58 @@ from io import BytesIO from dotenv import load_dotenv load_dotenv() -import logging import yaml +import hashlib from pathlib import Path from types import SimpleNamespace as config CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY") +# Optional in-memory LLM response cache: when set to a dict, API responses are cached by (model, prompt_hash). +_response_cache = None + + +def set_response_cache(enable): + """Enable or disable LLM response caching. 
Pass True to use a new in-memory cache, False to disable, or a dict to use as cache.""" + global _response_cache + if enable is True: + _response_cache = {} + elif enable is False or enable is None: + _response_cache = None + else: + _response_cache = enable + + +def _retry_delay(attempt, base=1.0, max_delay=60.0): + """Exponential backoff with jitter for API retries.""" + delay = min(base * (2 ** attempt), max_delay) + random.uniform(0, 1) + return delay + + def count_tokens(text, model=None): if not text: return 0 + model = model or "gpt-4o" enc = tiktoken.encoding_for_model(model) tokens = enc.encode(text) return len(tokens) +def _cache_key(model, prompt, chat_history=None): + raw = prompt + json.dumps(chat_history or [], sort_keys=True) + return (model, hashlib.sha256(raw.encode("utf-8")).hexdigest()) + + def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None): + if _response_cache is not None: + key = _cache_key(model, prompt, chat_history) + if key in _response_cache: + return _response_cache[key] max_retries = 10 client = openai.OpenAI(api_key=api_key) for i in range(max_retries): try: if chat_history: - messages = chat_history + messages = list(chat_history) messages.append({"role": "user", "content": prompt}) else: messages = [{"role": "user", "content": prompt}] @@ -43,28 +76,35 @@ def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_ temperature=0, ) if response.choices[0].finish_reason == "length": - return response.choices[0].message.content, "max_output_reached" + result = response.choices[0].message.content, "max_output_reached" else: - return response.choices[0].message.content, "finished" + result = response.choices[0].message.content, "finished" + if _response_cache is not None: + _response_cache[key] = result + return result except Exception as e: print('************* Retrying *************') logging.error(f"Error: {e}") if i < max_retries - 1: - time.sleep(1) # Wait for 1秒 before 
retrying + time.sleep(_retry_delay(i)) else: logging.error('Max retries reached for prompt: ' + prompt) - return "Error" + return ("Error", "error") def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None): + if _response_cache is not None: + key = _cache_key(model, prompt, chat_history) + if key in _response_cache: + return _response_cache[key] max_retries = 10 client = openai.OpenAI(api_key=api_key) for i in range(max_retries): try: if chat_history: - messages = chat_history + messages = list(chat_history) messages.append({"role": "user", "content": prompt}) else: messages = [{"role": "user", "content": prompt}] @@ -74,19 +114,25 @@ def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None): messages=messages, temperature=0, ) - - return response.choices[0].message.content + result = response.choices[0].message.content + if _response_cache is not None: + _response_cache[key] = result + return result except Exception as e: print('************* Retrying *************') logging.error(f"Error: {e}") if i < max_retries - 1: - time.sleep(1) # Wait for 1秒 before retrying + time.sleep(_retry_delay(i)) else: logging.error('Max retries reached for prompt: ' + prompt) return "Error" async def ChatGPT_API_async(model, prompt, api_key=CHATGPT_API_KEY): + if _response_cache is not None: + key = _cache_key(model, prompt, None) + if key in _response_cache: + return _response_cache[key] max_retries = 10 messages = [{"role": "user", "content": prompt}] for i in range(max_retries): @@ -97,12 +143,15 @@ async def ChatGPT_API_async(model, prompt, api_key=CHATGPT_API_KEY): messages=messages, temperature=0, ) - return response.choices[0].message.content + result = response.choices[0].message.content + if _response_cache is not None: + _response_cache[key] = result + return result except Exception as e: print('************* Retrying *************') logging.error(f"Error: {e}") if i < max_retries - 1: - await asyncio.sleep(1) # Wait for 1s before 
retrying + await asyncio.sleep(_retry_delay(i)) else: logging.error('Max retries reached for prompt: ' + prompt) return "Error" @@ -134,8 +183,10 @@ def extract_json(content): # If no delimiters, assume entire content could be JSON json_content = content.strip() - # Clean up common issues that might cause parsing errors - json_content = json_content.replace('None', 'null') # Replace Python None with JSON null + # Replace Python None with JSON null only when in a value position (after : [ ,), not inside quoted strings + json_content = re.sub(r'(?<=:)\s*None(?=\s*[,\}\]\s])', ' null', json_content) + json_content = re.sub(r'(?<=\[)\s*None(?=\s*[,\]\s])', 'null', json_content) + json_content = re.sub(r'(?<=,)\s*None(?=\s*[,\}\]\s])', ' null', json_content) json_content = json_content.replace('\n', ' ').replace('\r', ' ') # Remove newlines json_content = ' '.join(json_content.split()) # Normalize whitespace @@ -198,7 +249,7 @@ def structure_to_list(structure): def get_leaf_nodes(structure): if isinstance(structure, dict): - if not structure['nodes']: + if not structure.get('nodes'): structure_node = copy.deepcopy(structure) structure_node.pop('nodes', None) return [structure_node] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..52bf1d6fc --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,56 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "pageindex" +version = "0.1.0" +description = "Vectorless, reasoning-based RAG framework. No vector DB, no chunking, human-like retrieval." 
+readme = "README.md" +license = { text = "MIT" } +requires-python = ">=3.8" +authors = [ + { name = "Vectify AI" } +] +keywords = ["rag", "llm", "document", "pdf", "markdown", "retrieval", "vectorless"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering :: Artificial Intelligence", +] +dependencies = [ + "openai>=1.101.0", + "pymupdf>=1.26.4", + "PyPDF2>=3.0.1", + "python-dotenv>=1.1.0", + "tiktoken>=0.11.0", + "pyyaml>=6.0.2", +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.0", + "pytest-asyncio>=0.21", +] + +[project.scripts] +run_pageindex = "run_pageindex:main" + +[project.urls] +Homepage = "https://vectify.ai/pageindex" +Documentation = "https://docs.pageindex.ai" +Repository = "https://github.com/VectifyAI/PageIndex" + +[tool.setuptools.packages.find] +where = ["."] +include = ["pageindex*"] + +[tool.setuptools] +py-modules = ["run_pageindex"] diff --git a/run_pageindex.py b/run_pageindex.py index 107024505..3671ec46e 100644 --- a/run_pageindex.py +++ b/run_pageindex.py @@ -4,7 +4,8 @@ from pageindex import * from pageindex.page_index_md import md_to_tree -if __name__ == "__main__": + +def main(): # Set up argument parser parser = argparse.ArgumentParser(description='Process PDF or Markdown document and generate structure') parser.add_argument('--pdf_path', type=str, help='Path to the PDF file') @@ -130,4 +131,8 @@ with open(output_file, 'w', encoding='utf-8') as f: json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False) - print(f'Tree structure saved to: {output_file}') \ No newline at end of file + print(f'Tree structure saved to: {output_file}') + + +if __name__ == 
"__main__": + main() \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 000000000..b3a758ee5 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,2 @@ +# Tests for PageIndex. Run with: pytest tests/ -v +# Requires: pip install -r requirements.txt (Python 3.8--3.12 recommended for tiktoken wheels) diff --git a/tests/test_page_index_md.py b/tests/test_page_index_md.py new file mode 100644 index 000000000..84f09eee3 --- /dev/null +++ b/tests/test_page_index_md.py @@ -0,0 +1,118 @@ +"""Unit tests for pageindex.page_index_md pure functions (no API/IO).""" +import pytest +from pageindex.page_index_md import ( + extract_nodes_from_markdown, + extract_node_text_content, + build_tree_from_nodes, + clean_tree_for_output, +) + + +class TestExtractNodesFromMarkdown: + def test_single_header(self): + md = "# Title" + node_list, lines = extract_nodes_from_markdown(md) + assert len(node_list) == 1 + assert node_list[0]["node_title"] == "Title" + assert node_list[0]["line_num"] == 1 + + def test_multiple_levels(self): + md = """# One +## Two +### Three +""" + node_list, lines = extract_nodes_from_markdown(md) + assert len(node_list) == 3 + assert node_list[0]["node_title"] == "One" + assert node_list[1]["node_title"] == "Two" + assert node_list[2]["node_title"] == "Three" + + def test_skips_headers_in_code_block(self): + md = """# Real Header +``` +# Not a header +``` +## Another +""" + node_list, lines = extract_nodes_from_markdown(md) + titles = [n["node_title"] for n in node_list] + assert "Real Header" in titles + assert "Another" in titles + assert "Not a header" not in titles + + +class TestExtractNodeTextContent: + def test_assigns_level_and_text(self): + md = """# A +content for A +## B +content for B +""" + node_list, lines = extract_nodes_from_markdown(md) + nodes = extract_node_text_content(node_list, lines) + assert len(nodes) == 2 + assert nodes[0]["level"] == 1 + assert nodes[0]["title"] == "A" + assert "content for 
A" in nodes[0]["text"] + assert nodes[1]["level"] == 2 + assert "content for B" in nodes[1]["text"] + + +class TestBuildTreeFromNodes: + def test_single_node(self): + nodes = [ + {"title": "Root", "line_num": 1, "level": 1, "text": "x"} + ] + tree = build_tree_from_nodes(nodes) + assert len(tree) == 1 + assert tree[0]["title"] == "Root" + assert tree[0]["node_id"] == "0001" + assert tree[0]["nodes"] == [] + + def test_parent_child(self): + nodes = [ + {"title": "Parent", "line_num": 1, "level": 1, "text": "p"}, + {"title": "Child", "line_num": 2, "level": 2, "text": "c"}, + ] + tree = build_tree_from_nodes(nodes) + assert len(tree) == 1 + assert len(tree[0]["nodes"]) == 1 + assert tree[0]["nodes"][0]["title"] == "Child" + + def test_empty_list_returns_empty(self): + assert build_tree_from_nodes([]) == [] + + +class TestCleanTreeForOutput: + def test_keeps_title_node_id_text_line_num(self): + node = { + "title": "T", + "node_id": "0001", + "text": "content", + "line_num": 1, + "nodes": [], + "extra": "ignored", + } + tree = [node] + cleaned = clean_tree_for_output(tree) + assert cleaned[0]["title"] == "T" + assert cleaned[0]["node_id"] == "0001" + assert cleaned[0]["text"] == "content" + assert cleaned[0]["line_num"] == 1 + assert "extra" not in cleaned[0] + + def test_recurses_children(self): + tree = [ + { + "title": "A", + "node_id": "0001", + "text": "a", + "line_num": 1, + "nodes": [ + {"title": "B", "node_id": "0002", "text": "b", "line_num": 2, "nodes": []} + ], + } + ] + cleaned = clean_tree_for_output(tree) + assert len(cleaned[0]["nodes"]) == 1 + assert cleaned[0]["nodes"][0]["title"] == "B" diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 000000000..dfb5a0c41 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,235 @@ +"""Unit tests for pageindex.utils pure functions (no API/IO).""" +import json +import pytest +from pageindex.utils import ( + count_tokens, + get_json_content, + extract_json, + write_node_id, + get_nodes, + 
"""Unit tests for pageindex.utils pure functions (no API/IO).

NOTE(review): several string fixtures in this module originally contained
angle-bracket page tags that were stripped during transit (e.g. the
TestPageIndexFromText and TestConvertPhysicalIndexToInt fixtures asserted
specific page numbers against tag-free strings, which could never pass).
They are reconstructed below using the ``<start_index_N>`` /
``<physical_index_N>`` formats implied by the function names — confirm the
exact tag spelling against pageindex/utils.py.
"""
import json
import pytest
from pageindex.utils import (
    count_tokens,
    get_json_content,
    extract_json,
    write_node_id,
    get_nodes,
    structure_to_list,
    get_leaf_nodes,
    get_first_start_page_from_text,
    get_last_start_page_from_text,
    sanitize_filename,
    list_to_tree,
    add_preface_if_needed,
    remove_fields,
    reorder_dict,
    convert_physical_index_to_int,
    convert_page_to_int,
)


class TestCountTokens:
    """Token counting must be robust to empty/None text and None model."""

    def test_empty_string_returns_zero(self):
        assert count_tokens("") == 0
        assert count_tokens(None) == 0

    def test_non_empty_string_returns_positive(self):
        n = count_tokens("hello world", model="gpt-4o")
        assert n > 0

    def test_model_none_uses_default(self):
        """count_tokens with model=None should not raise (uses default gpt-4o)."""
        n = count_tokens("test", model=None)
        assert n >= 0


class TestGetJsonContent:
    """Extraction of JSON payloads from fenced LLM responses."""

    def test_extracts_content_between_delimiters(self):
        response = "prefix\n```json\n{\"a\": 1}\n```\nsuffix"
        assert get_json_content(response) == "{\"a\": 1}"

    def test_no_delimiters_returns_stripped(self):
        response = "  {\"x\": 2}  "
        assert get_json_content(response) == "{\"x\": 2}"


class TestExtractJson:
    """Parsing of (possibly Python-flavored) JSON from model output."""

    def test_plain_json(self):
        assert extract_json('{"a": 1}') == {"a": 1}

    def test_json_in_code_block(self):
        content = "```json\n{\"b\": 2}\n```"
        assert extract_json(content) == {"b": 2}

    def test_none_word_boundary_not_corrupted(self):
        """'None of the Above' should not become 'null of the Above'."""
        content = '{"title": "None of the Above", "value": null}'
        result = extract_json(content)
        assert result.get("title") == "None of the Above"
        assert result.get("value") is None

    def test_python_none_replaced(self):
        # Bare Python `None` (invalid JSON) must be normalized to null.
        content = '{"key": None}'
        result = extract_json(content)
        assert result == {"key": None}


class TestGetLeafNodes:
    """Leaf collection must tolerate nodes lacking a 'nodes' key."""

    def test_leaf_node_without_nodes_key(self):
        """Node with no 'nodes' key should be treated as leaf (no KeyError)."""
        structure = {"title": "A", "node_id": "0001"}
        leaves = get_leaf_nodes(structure)
        assert len(leaves) == 1
        assert leaves[0]["title"] == "A"

    def test_leaf_node_with_empty_nodes(self):
        structure = {"title": "A", "nodes": []}
        leaves = get_leaf_nodes(structure)
        assert len(leaves) == 1

    def test_nested_structure(self):
        structure = {
            "title": "Root",
            "nodes": [
                {"title": "Child", "nodes": []}
            ],
        }
        leaves = get_leaf_nodes(structure)
        assert len(leaves) == 1
        assert leaves[0]["title"] == "Child"


class TestPageIndexFromText:
    """Page-tag extraction from tagged page text."""

    def test_get_first_start_page(self):
        # NOTE(review): reconstructed fixture — original lost its tags.
        text = "x <start_index_5> y <start_index_9>"
        assert get_first_start_page_from_text(text) == 5

    def test_get_first_start_page_no_match(self):
        assert get_first_start_page_from_text("no tags") == -1

    def test_get_last_start_page(self):
        # NOTE(review): reconstructed fixture — original lost its tags.
        text = "<start_index_3> a <start_index_10> b c"
        assert get_last_start_page_from_text(text) == 10

    def test_get_last_start_page_no_match(self):
        assert get_last_start_page_from_text("no tags") == -1


class TestSanitizeFilename:
    def test_replaces_slash(self):
        assert sanitize_filename("a/b/c") == "a-b-c"

    def test_custom_replacement(self):
        assert sanitize_filename("a/b", replacement="_") == "a_b"


class TestListToTree:
    """Dotted-structure list ('1', '1.1', ...) to nested tree conversion."""

    def test_flat_list(self):
        data = [
            {"structure": "1", "title": "A", "start_index": 1, "end_index": 2},
            {"structure": "2", "title": "B", "start_index": 3, "end_index": 4},
        ]
        tree = list_to_tree(data)
        assert len(tree) == 2
        assert tree[0]["title"] == "A" and tree[1]["title"] == "B"

    def test_hierarchy(self):
        data = [
            {"structure": "1", "title": "A", "start_index": 1, "end_index": 5},
            {"structure": "1.1", "title": "A1", "start_index": 2, "end_index": 3},
        ]
        tree = list_to_tree(data)
        assert len(tree) == 1
        assert tree[0]["title"] == "A"
        assert len(tree[0]["nodes"]) == 1
        assert tree[0]["nodes"][0]["title"] == "A1"


class TestAddPrefaceIfNeeded:
    """A synthetic 'Preface' entry is prepended only when page 1 is uncovered."""

    def test_empty_list_unchanged(self):
        assert add_preface_if_needed([]) == []
        assert add_preface_if_needed(None) is None

    def test_first_physical_index_one_no_preface(self):
        data = [{"physical_index": 1, "title": "Intro"}]
        assert add_preface_if_needed(data) == data
        assert len(data) == 1

    def test_first_physical_index_gt_one_adds_preface(self):
        data = [{"physical_index": 3, "title": "Chapter 1"}]
        result = add_preface_if_needed(data)
        assert len(result) == 2
        assert result[0]["title"] == "Preface"
        assert result[0]["physical_index"] == 1


class TestRemoveFields:
    def test_removes_specified_field(self):
        data = {"a": 1, "text": "hello", "nodes": []}
        out = remove_fields(data, fields=["text"])
        assert "text" not in out
        assert out["a"] == 1

    def test_nested_removal(self):
        data = {"text": "x", "nodes": [{"text": "y"}]}
        out = remove_fields(data, fields=["text"])
        assert "text" not in out
        assert "text" not in out["nodes"][0]


class TestReorderDict:
    def test_reorders_by_key_order(self):
        data = {"b": 2, "a": 1, "c": 3}
        out = reorder_dict(data, ["a", "b", "c"])
        assert list(out.keys()) == ["a", "b", "c"]

    def test_skips_missing_keys(self):
        data = {"a": 1}
        out = reorder_dict(data, ["a", "b", "c"])
        assert out == {"a": 1}

    def test_empty_key_order_returns_unchanged(self):
        data = {"a": 1}
        assert reorder_dict(data, None) == data


class TestConvertPhysicalIndexToInt:
    """Tag-string to int conversion for physical page indices."""

    def test_list_of_dicts_with_tag_format(self):
        # NOTE(review): reconstructed fixture — original lost its tag string.
        data = [{"physical_index": "<physical_index_5>"}]
        convert_physical_index_to_int(data)
        assert data[0]["physical_index"] == 5

    def test_string_tag_returns_int(self):
        # NOTE(review): reconstructed fixture — original lost its tag string.
        assert convert_physical_index_to_int("<physical_index_10>") == 10


class TestConvertPageToInt:
    def test_converts_string_page_to_int(self):
        data = [{"page": "3", "title": "A"}]
        convert_page_to_int(data)
        assert data[0]["page"] == 3

    def test_invalid_string_left_unchanged(self):
        data = [{"page": "nope", "title": "A"}]
        convert_page_to_int(data)
        assert data[0]["page"] == "nope"


class TestWriteNodeId:
    def test_assigns_zero_padded_ids(self):
        structure = [{"title": "A", "nodes": []}]
        write_node_id(structure)
        assert structure[0]["node_id"] == "0000"

    def test_nested_ids(self):
        structure = [{"title": "A", "nodes": [{"title": "B"}]}]
        write_node_id(structure)
        assert structure[0]["node_id"] == "0000"
        assert structure[0]["nodes"][0]["node_id"] == "0001"


class TestGetNodesAndStructureToList:
    def test_structure_to_list_flattens(self):
        tree = [{"title": "A", "nodes": [{"title": "B"}]}]
        flat = structure_to_list(tree)
        assert len(flat) == 2
        assert flat[0]["title"] == "A" and flat[1]["title"] == "B"

    def test_get_nodes_excludes_children_from_node_dict(self):
        tree = [{"title": "A", "nodes": [{"title": "B"}]}]
        nodes = get_nodes(tree)
        assert len(nodes) == 2
        assert "nodes" not in nodes[0] or nodes[0].get("nodes") is None