From 1cf28e5df04ec1e0dd8c60e773d001a81ef4e33b Mon Sep 17 00:00:00 2001
From: HeaL <kevinzenith800123@gmail.com>
Date: Mon, 22 Jun 2026 09:46:00 +0800
Subject: [PATCH] Improve JSON extraction and TOC fallback handling

---
 pageindex/page_index.py       | 60 +++++++++++++++++++++-----
 pageindex/utils.py            | 80 ++++++++++++++++++++++-------------
 tests/test_json_resilience.py | 49 +++++++++++++++++++++
 3 files changed, 148 insertions(+), 41 deletions(-)
 create mode 100644 tests/test_json_resilience.py

diff --git a/pageindex/page_index.py b/pageindex/page_index.py
index 9004309fb..2ee7c31d8 100644
--- a/pageindex/page_index.py
+++ b/pageindex/page_index.py
@@ -119,7 +119,9 @@ def toc_detector_single_page(content, model=None):
     response = llm_completion(model=model, prompt=prompt)
     # print('response', response)
     json_content = extract_json(response)    
-    return json_content['toc_detected']
+    if not isinstance(json_content, dict):
+        return 'no'
+    return json_content.get('toc_detected', 'no')
 
 
 def check_if_toc_extraction_is_complete(content, toc, model=None):
@@ -137,7 +139,9 @@ def check_if_toc_extraction_is_complete(content, toc, model=None):
     prompt = prompt + '\n Document:\n' + content + '\n Table of contents:\n' + toc
     response = llm_completion(model=model, prompt=prompt)
     json_content = extract_json(response)
-    return json_content['completed']
+    if not isinstance(json_content, dict):
+        return 'no'
+    return json_content.get('completed', 'no')
 
 
 def check_if_toc_transformation_is_complete(content, toc, model=None):
@@ -155,7 +159,9 @@ def check_if_toc_transformation_is_complete(content, toc, model=None):
     prompt = prompt + '\n Raw Table of contents:\n' + content + '\n Cleaned Table of contents:\n' + toc
     response = llm_completion(model=model, prompt=prompt)
     json_content = extract_json(response)
-    return json_content['completed']
+    if not isinstance(json_content, dict):
+        return 'no'
+    return json_content.get('completed', 'no')
 
 def extract_toc_content(content, model=None):
     prompt = f"""
@@ -217,7 +223,9 @@ def detect_page_index(toc_content, model=None):
 
     response = llm_completion(model=model, prompt=prompt)
     json_content = extract_json(response)
-    return json_content['page_index_given_in_toc']
+    if not isinstance(json_content, dict):
+        return 'no'
+    return json_content.get('page_index_given_in_toc', 'no')
 
 def toc_extractor(page_list, toc_page_list, model):
     def transform_dots_to_colon(text):
@@ -414,6 +422,8 @@ def calculate_page_offset(pairs):
     return most_common
 
 def add_page_offset_to_toc_json(data, offset):
+    if offset is None:
+        return data
     for i in range(len(data)):
         if data[i].get('page') is not None and isinstance(data[i]['page'], int):
             data[i]['physical_index'] = data[i]['page'] + offset
@@ -503,6 +513,22 @@ def remove_first_physical_index_section(text):
         return text.replace(match.group(0), '', 1)
     return text
 
+
+def _as_toc_list(value):
+    """Coerce model-produced TOC JSON into a list of section dictionaries.
+
+    Accepts a bare list, a dict wrapping a list under "toc" or "items", or a single
+    section dict that has both "title" and "physical_index"; otherwise returns [].
+    """
+    if isinstance(value, dict):
+        wrapped = next((value[k] for k in ("toc", "items") if isinstance(value.get(k), list)), None)
+        if wrapped is None:
+            # No list wrapper: the dict itself counts as the only section if it has both keys.
+            return [value] if "title" in value and "physical_index" in value else []
+        value = wrapped
+    return [entry for entry in value if isinstance(entry, dict)] if isinstance(value, list) else []
+
+
 ### add verify completeness
 def generate_toc_continue(toc_content, part, model=None):
     print('start generate_toc_continue')
@@ -534,7 +560,7 @@ def generate_toc_continue(toc_content, part, model=None):
     prompt = prompt + '\nGiven text\n:' + part + '\nPrevious tree structure\n:' + json.dumps(toc_content, indent=2)
     response, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True)
     if finish_reason == 'finished':
-        return extract_json(response)
+        return _as_toc_list(extract_json(response))
     else:
         raise Exception(f'finish reason: {finish_reason}')
     
@@ -569,7 +595,7 @@ def generate_toc_init(part, model=None):
     response, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True)
 
     if finish_reason == 'finished':
-         return extract_json(response)
+         return _as_toc_list(extract_json(response))
     else:
         raise Exception(f'finish reason: {finish_reason}')
 
@@ -684,8 +710,14 @@ def process_none_page_numbers(toc_items, page_list, start_index=1, model=None):
             item_copy = copy.deepcopy(item)
             del item_copy['page']
             result = add_page_number_to_toc(page_contents, item_copy, model)
-            if isinstance(result[0]['physical_index'], str) and result[0]['physical_index'].startswith('<physical_index'):
-                item['physical_index'] = int(result[0]['physical_index'].split('_')[-1].rstrip('>').strip())
+            # LLM output may be empty or omit physical_index. Leave the item
+            # unresolved so the caller can filter or fall back instead of
+            # failing the whole document.
+            if not isinstance(result, list) or not result or not isinstance(result[0], dict):
+                continue
+            physical_index = result[0].get('physical_index')
+            if isinstance(physical_index, str) and physical_index.startswith('<physical_index'):
+                item['physical_index'] = int(physical_index.split('_')[-1].rstrip('>').strip())
                 del item['page']
     
     return toc_items
@@ -753,7 +785,12 @@ async def single_toc_item_index_fixer(section_title, content, model=None):
     prompt = toc_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content
     response = await llm_acompletion(model=model, prompt=prompt)
     json_content = extract_json(response)    
-    return convert_physical_index_to_int(json_content['physical_index'])
+    if not isinstance(json_content, dict):
+        return None
+    physical_index = json_content.get('physical_index')
+    if physical_index is None:
+        return None
+    return convert_physical_index_to_int(physical_index)
 
 
 
@@ -994,7 +1031,8 @@ async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=N
         elif mode == 'process_toc_no_page_numbers':
             return await meta_processor(page_list, mode='process_no_toc', start_index=start_index, opt=opt, logger=logger)
         else:
-            raise Exception('Processing failed')
+            logger.warning('Falling back to low-confidence no-TOC structure')
+            return toc_with_page_number
         
  
 async def process_large_node_recursively(node, page_list, opt=None, logger=None):
@@ -1151,4 +1189,4 @@ def validate_and_truncate_physical_indices(toc_with_page_number, page_list_lengt
     if truncated_items:
         print(f"Truncated {len(truncated_items)} TOC items that exceeded document length")
      
-    return toc_with_page_number
\ No newline at end of file
+    return toc_with_page_number
diff --git a/pageindex/utils.py b/pageindex/utils.py
index f00ccf3a7..ed39ae684 100644
--- a/pageindex/utils.py
+++ b/pageindex/utils.py
@@ -9,6 +9,7 @@
 import copy
 import asyncio
 import pymupdf
+import re
 from io import BytesIO
 from dotenv import load_dotenv
 load_dotenv()
@@ -96,38 +97,58 @@ def get_json_content(response):
     return json_content
          
 
+def _normalize_json_candidate(candidate):
+    """Strip the text and repair None/True/False and trailing commas outside JSON strings."""
+    swaps = {"None": "null", "True": "true", "False": "false"}
+    # A quoted string is matched first and kept verbatim, so a title such as
+    # "None of the Above" survives; a matched trailing comma maps to "" (dropped).
+    token = re.compile(r'"(?:\\.|[^"\\])*"|\b(?:None|True|False)\b|,(?=\s*[\]}])')
+    return token.sub(lambda m: m[0] if m[0][0] == '"' else swaps.get(m[0], ""), candidate.strip())
+
+
+def _json_candidates(content):
+    """Yield substrings of content that might hold JSON, most specific first.
+
+    Order: bodies of fenced code blocks (optionally tagged json), then the full
+    text, then every suffix starting at a "{" or "[". Yields nothing if content is falsy.
+    """
+    if content:
+        fence = r"```(?:json)?\s*(.*?)```"
+        yield from re.findall(fence, content, flags=re.DOTALL | re.IGNORECASE)
+        yield content
+        # Some models add explanations before or after JSON. Offering each suffix
+        # that begins at an opening brace/bracket lets the caller decode from
+        # there and allow trailing text after the decoded object.
+        for opener in re.finditer(r"[{\[]", content):
+            yield content[opener.start():]
+
+
 def extract_json(content):
-    try:
-        # First, try to extract JSON enclosed within ```json and ```
-        start_idx = content.find("```json")
-        if start_idx != -1:
-            start_idx += 7  # Adjust index to start after the delimiter
-            end_idx = content.rfind("```")
-            json_content = content[start_idx:end_idx].strip()
-        else:
-            # If no delimiters, assume entire content could be JSON
-            json_content = content.strip()
-
-        # Clean up common issues that might cause parsing errors
-        json_content = json_content.replace('None', 'null')  # Replace Python None with JSON null
-        json_content = json_content.replace('\n', ' ').replace('\r', ' ')  # Remove newlines
-        json_content = ' '.join(json_content.split())  # Normalize whitespace
-
-        # Attempt to parse and return the JSON object
-        return json.loads(json_content)
-    except json.JSONDecodeError as e:
-        logging.error(f"Failed to extract JSON: {e}")
-        # Try to clean up the content further if initial parsing fails
+    decoder = json.JSONDecoder()  # its raw_decode() is used below to tolerate text after the JSON value
+    last_error = None  # most recent JSONDecodeError; logged only if every candidate fails
+    # NOTE(review): the removed version also caught generic Exception; only JSONDecodeError is handled now - confirm content is always str or None.
+    for candidate in _json_candidates(content):
+        json_content = _normalize_json_candidate(candidate)
+        if not json_content:
+            continue
+        # Strict pass: json.loads accepts the candidate only if it is one complete JSON document.
         try:
-            # Remove any trailing commas before closing brackets/braces
-            json_content = json_content.replace(',]', ']').replace(',}', '}')
             return json.loads(json_content)
-        except:
-            logging.error("Failed to parse JSON even after cleanup")
-            return {}
-    except Exception as e:
-        logging.error(f"Unexpected error while extracting JSON: {e}")
-        return {}
+        except json.JSONDecodeError as e:
+            last_error = e
+        # Lenient pass: raw_decode parses one value from the start of the candidate and tolerates trailing text.
+        try:
+            parsed, _ = decoder.raw_decode(json_content)
+            return parsed
+        except json.JSONDecodeError as e:
+            last_error = e
+    # Every candidate failed (or none was produced): log the reason and fall back to an empty dict.
+    if last_error:
+        logging.error(f"Failed to extract JSON: {last_error}")
+        logging.error("Failed to parse JSON even after cleanup")
+    else:  # no candidate was yielded, or each one was blank after normalization
+        logging.error("Failed to extract JSON: empty response")
+    return {}
 
 def write_node_id(data, node_id=0):
     if isinstance(data, dict):
@@ -707,4 +728,3 @@ def print_tree(tree, indent=0):
 def print_wrapped(text, width=100):
     for line in text.splitlines():
         print(textwrap.fill(line, width=width))
-
diff --git a/tests/test_json_resilience.py b/tests/test_json_resilience.py
new file mode 100644
index 000000000..e57ca7b65
--- /dev/null
+++ b/tests/test_json_resilience.py
@@ -0,0 +1,49 @@
+﻿import unittest
+
+from pageindex.page_index import _as_toc_list, add_page_offset_to_toc_json
+from pageindex.utils import extract_json
+
+
+class JsonExtractionTests(unittest.TestCase):
+    def test_extract_json_from_fenced_block(self):
+        """A fenced json code block yields the object inside the fence."""
+        parsed = extract_json('```json\n{"toc_detected": "yes"}\n```')
+        self.assertEqual(parsed["toc_detected"], "yes")
+
+    def test_extract_json_after_explanatory_text(self):
+        """Prose placed before the object does not prevent extraction."""
+        parsed = extract_json('Here is the JSON:\n{"toc_detected": "no"}')
+        self.assertEqual(parsed["toc_detected"], "no")
+
+    def test_extract_json_array_with_trailing_text(self):
+        """Text that follows a top-level array is ignored."""
+        parsed = extract_json('[{"title": "Item 1", "physical_index": "<physical_index_3>"}]\nDone.')
+        self.assertEqual(parsed[0]["title"], "Item 1")
+
+    def test_extract_json_python_literals(self):
+        """Bare None/True/False are read as null/true/false."""
+        parsed = extract_json('{"page": None, "valid": True, "done": False}')
+        self.assertIsNone(parsed["page"])
+        self.assertIs(parsed["valid"], True)
+        self.assertIs(parsed["done"], False)
+
+
+class TocFallbackTests(unittest.TestCase):
+    def test_as_toc_list_accepts_plain_list(self):
+        """Non-dict entries are dropped from a bare list."""
+        raw = [{"title": "Item 1", "physical_index": 3}, "bad"]
+        self.assertEqual(_as_toc_list(raw), [{"title": "Item 1", "physical_index": 3}])
+
+    def test_as_toc_list_accepts_common_wrappers(self):
+        """A list nested under the "toc" key is unwrapped."""
+        raw = {"toc": [{"title": "Item 1", "physical_index": 3}]}
+        self.assertEqual(_as_toc_list(raw), [{"title": "Item 1", "physical_index": 3}])
+
+    def test_page_offset_none_is_noop(self):
+        """With offset None the input list comes back unchanged."""
+        raw = [{"title": "Item 1", "page": 5}]
+        self.assertEqual(add_page_offset_to_toc_json(raw, None), raw)
+
+
+if __name__ == "__main__":
+    unittest.main()