Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 49 additions & 11 deletions pageindex/page_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,9 @@ def toc_detector_single_page(content, model=None):
response = llm_completion(model=model, prompt=prompt)
# print('response', response)
json_content = extract_json(response)
return json_content['toc_detected']
if not isinstance(json_content, dict):
return 'no'
return json_content.get('toc_detected', 'no')


def check_if_toc_extraction_is_complete(content, toc, model=None):
Expand All @@ -137,7 +139,9 @@ def check_if_toc_extraction_is_complete(content, toc, model=None):
prompt = prompt + '\n Document:\n' + content + '\n Table of contents:\n' + toc
response = llm_completion(model=model, prompt=prompt)
json_content = extract_json(response)
return json_content['completed']
if not isinstance(json_content, dict):
return 'no'
return json_content.get('completed', 'no')


def check_if_toc_transformation_is_complete(content, toc, model=None):
Expand All @@ -155,7 +159,9 @@ def check_if_toc_transformation_is_complete(content, toc, model=None):
prompt = prompt + '\n Raw Table of contents:\n' + content + '\n Cleaned Table of contents:\n' + toc
response = llm_completion(model=model, prompt=prompt)
json_content = extract_json(response)
return json_content['completed']
if not isinstance(json_content, dict):
return 'no'
return json_content.get('completed', 'no')

def extract_toc_content(content, model=None):
prompt = f"""
Expand Down Expand Up @@ -217,7 +223,9 @@ def detect_page_index(toc_content, model=None):

response = llm_completion(model=model, prompt=prompt)
json_content = extract_json(response)
return json_content['page_index_given_in_toc']
if not isinstance(json_content, dict):
return 'no'
return json_content.get('page_index_given_in_toc', 'no')

def toc_extractor(page_list, toc_page_list, model):
def transform_dots_to_colon(text):
Expand Down Expand Up @@ -414,6 +422,8 @@ def calculate_page_offset(pairs):
return most_common

def add_page_offset_to_toc_json(data, offset):
if offset is None:
return data
for i in range(len(data)):
if data[i].get('page') is not None and isinstance(data[i]['page'], int):
data[i]['physical_index'] = data[i]['page'] + offset
Expand Down Expand Up @@ -503,6 +513,22 @@ def remove_first_physical_index_section(text):
return text.replace(match.group(0), '', 1)
return text


def _as_toc_list(value):
"""Normalize model-produced TOC JSON to a list of section dictionaries."""

if isinstance(value, list):
return [item for item in value if isinstance(item, dict)]
if isinstance(value, dict):
if isinstance(value.get("toc"), list):
return [item for item in value["toc"] if isinstance(item, dict)]
if isinstance(value.get("items"), list):
return [item for item in value["items"] if isinstance(item, dict)]
if all(key in value for key in ("title", "physical_index")):
return [value]
return []


### add verify completeness
def generate_toc_continue(toc_content, part, model=None):
print('start generate_toc_continue')
Expand Down Expand Up @@ -534,7 +560,7 @@ def generate_toc_continue(toc_content, part, model=None):
prompt = prompt + '\nGiven text\n:' + part + '\nPrevious tree structure\n:' + json.dumps(toc_content, indent=2)
response, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True)
if finish_reason == 'finished':
return extract_json(response)
return _as_toc_list(extract_json(response))
else:
raise Exception(f'finish reason: {finish_reason}')

Expand Down Expand Up @@ -569,7 +595,7 @@ def generate_toc_init(part, model=None):
response, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True)

if finish_reason == 'finished':
return extract_json(response)
return _as_toc_list(extract_json(response))
else:
raise Exception(f'finish reason: {finish_reason}')

Expand Down Expand Up @@ -684,8 +710,14 @@ def process_none_page_numbers(toc_items, page_list, start_index=1, model=None):
item_copy = copy.deepcopy(item)
del item_copy['page']
result = add_page_number_to_toc(page_contents, item_copy, model)
if isinstance(result[0]['physical_index'], str) and result[0]['physical_index'].startswith('<physical_index'):
item['physical_index'] = int(result[0]['physical_index'].split('_')[-1].rstrip('>').strip())
# LLM output may be empty or omit physical_index. Leave the item
# unresolved so the caller can filter or fall back instead of
# failing the whole document.
if not isinstance(result, list) or not result or not isinstance(result[0], dict):
continue
physical_index = result[0].get('physical_index')
if isinstance(physical_index, str) and physical_index.startswith('<physical_index'):
item['physical_index'] = int(physical_index.split('_')[-1].rstrip('>').strip())
del item['page']

return toc_items
Expand Down Expand Up @@ -753,7 +785,12 @@ async def single_toc_item_index_fixer(section_title, content, model=None):
prompt = toc_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content
response = await llm_acompletion(model=model, prompt=prompt)
json_content = extract_json(response)
return convert_physical_index_to_int(json_content['physical_index'])
if not isinstance(json_content, dict):
return None
physical_index = json_content.get('physical_index')
if physical_index is None:
return None
return convert_physical_index_to_int(physical_index)



Expand Down Expand Up @@ -994,7 +1031,8 @@ async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=N
elif mode == 'process_toc_no_page_numbers':
return await meta_processor(page_list, mode='process_no_toc', start_index=start_index, opt=opt, logger=logger)
else:
raise Exception('Processing failed')
logger.warning('Falling back to low-confidence no-TOC structure')
return toc_with_page_number


async def process_large_node_recursively(node, page_list, opt=None, logger=None):
Expand Down Expand Up @@ -1151,4 +1189,4 @@ def validate_and_truncate_physical_indices(toc_with_page_number, page_list_lengt
if truncated_items:
print(f"Truncated {len(truncated_items)} TOC items that exceeded document length")

return toc_with_page_number
return toc_with_page_number
80 changes: 50 additions & 30 deletions pageindex/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import copy
import asyncio
import pymupdf
import re
from io import BytesIO
from dotenv import load_dotenv
load_dotenv()
Expand Down Expand Up @@ -96,38 +97,58 @@ def get_json_content(response):
return json_content


def _normalize_json_candidate(candidate):
candidate = candidate.strip()
candidate = re.sub(r"\bNone\b", "null", candidate)
candidate = re.sub(r"\bTrue\b", "true", candidate)
candidate = re.sub(r"\bFalse\b", "false", candidate)
candidate = candidate.replace(",]", "]").replace(",}", "}")
return candidate


def _json_candidates(content):
if not content:
return

fenced_blocks = re.findall(r"```(?:json)?\s*(.*?)```", content, flags=re.DOTALL | re.IGNORECASE)
for block in fenced_blocks:
yield block

yield content

# Some models add explanations before or after JSON. Try decoding from each
# plausible JSON start and allow trailing text after the decoded object.
for index, char in enumerate(content):
if char in "{[":
yield content[index:]


def extract_json(content):
try:
# First, try to extract JSON enclosed within ```json and ```
start_idx = content.find("```json")
if start_idx != -1:
start_idx += 7 # Adjust index to start after the delimiter
end_idx = content.rfind("```")
json_content = content[start_idx:end_idx].strip()
else:
# If no delimiters, assume entire content could be JSON
json_content = content.strip()

# Clean up common issues that might cause parsing errors
json_content = json_content.replace('None', 'null') # Replace Python None with JSON null
json_content = json_content.replace('\n', ' ').replace('\r', ' ') # Remove newlines
json_content = ' '.join(json_content.split()) # Normalize whitespace

# Attempt to parse and return the JSON object
return json.loads(json_content)
except json.JSONDecodeError as e:
logging.error(f"Failed to extract JSON: {e}")
# Try to clean up the content further if initial parsing fails
decoder = json.JSONDecoder()
last_error = None

for candidate in _json_candidates(content):
json_content = _normalize_json_candidate(candidate)
if not json_content:
continue

try:
# Remove any trailing commas before closing brackets/braces
json_content = json_content.replace(',]', ']').replace(',}', '}')
return json.loads(json_content)
except:
logging.error("Failed to parse JSON even after cleanup")
return {}
except Exception as e:
logging.error(f"Unexpected error while extracting JSON: {e}")
return {}
except json.JSONDecodeError as e:
last_error = e

try:
parsed, _ = decoder.raw_decode(json_content)
return parsed
except json.JSONDecodeError as e:
last_error = e

if last_error:
logging.error(f"Failed to extract JSON: {last_error}")
logging.error("Failed to parse JSON even after cleanup")
else:
logging.error("Failed to extract JSON: empty response")
return {}

def write_node_id(data, node_id=0):
if isinstance(data, dict):
Expand Down Expand Up @@ -707,4 +728,3 @@ def print_tree(tree, indent=0):
def print_wrapped(text, width=100):
    """Print *text* with each of its lines wrapped to at most *width* columns.

    Lines are wrapped independently, so the line breaks already present in
    *text* are kept; an empty line is printed as an empty line.
    """
    wrapper = textwrap.TextWrapper(width=width)
    for raw_line in text.splitlines():
        wrapped = wrapper.fill(raw_line)
        print(wrapped)

49 changes: 49 additions & 0 deletions tests/test_json_resilience.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import unittest

from pageindex.page_index import _as_toc_list, add_page_offset_to_toc_json
from pageindex.utils import extract_json


class JsonExtractionTests(unittest.TestCase):
    """Check extract_json against several non-pristine response shapes."""

    def test_extract_json_from_fenced_block(self):
        # JSON wrapped in a fenced json code block.
        raw = '```json\n{"toc_detected": "yes"}\n```'
        parsed = extract_json(raw)
        self.assertEqual(parsed["toc_detected"], "yes")

    def test_extract_json_after_explanatory_text(self):
        # JSON preceded by a sentence of explanation.
        raw = 'Here is the JSON:\n{"toc_detected": "no"}'
        parsed = extract_json(raw)
        self.assertEqual(parsed["toc_detected"], "no")

    def test_extract_json_array_with_trailing_text(self):
        # A top-level array followed by extra text.
        raw = '[{"title": "Item 1", "physical_index": "<physical_index_3>"}]\nDone.'
        parsed = extract_json(raw)
        first_item = parsed[0]
        self.assertEqual(first_item["title"], "Item 1")

    def test_extract_json_python_literals(self):
        # Python-style None/True/False in place of null/true/false.
        raw = '{"page": None, "valid": True, "done": False}'
        parsed = extract_json(raw)
        self.assertIsNone(parsed["page"])
        self.assertIs(parsed["valid"], True)
        self.assertIs(parsed["done"], False)


class TocFallbackTests(unittest.TestCase):
    """Check the TOC normalization helper and the page-offset no-op path."""

    def test_as_toc_list_accepts_plain_list(self):
        # Non-dict entries in a plain list are expected to be dropped.
        section = {"title": "Item 1", "physical_index": 3}
        normalized = _as_toc_list([section, "bad"])
        self.assertEqual(normalized, [{"title": "Item 1", "physical_index": 3}])

    def test_as_toc_list_accepts_common_wrappers(self):
        # A list nested under the "toc" key is expected to be unwrapped.
        wrapped = {"toc": [{"title": "Item 1", "physical_index": 3}]}
        normalized = _as_toc_list(wrapped)
        self.assertEqual(normalized, [{"title": "Item 1", "physical_index": 3}])

    def test_page_offset_none_is_noop(self):
        # With no offset available the data is expected back as given.
        toc_items = [{"title": "Item 1", "page": 5}]
        shifted = add_page_offset_to_toc_json(toc_items, None)
        self.assertEqual(shifted, toc_items)


if __name__ == "__main__":
unittest.main()