diff --git a/README.md b/README.md
index 7180efd5..abd6b7f3 100644
--- a/README.md
+++ b/README.md
@@ -147,14 +147,20 @@ You can follow these steps to generate a PageIndex tree from a PDF document.
 pip3 install --upgrade -r requirements.txt
 ```
 
-### 2. Set your OpenAI API key
+### 2. Set your API key
 
-Create a `.env` file in the root directory and add your API key:
+Create a `.env` file in the root directory and add at least one API key:
 
 ```bash
+# For OpenAI models (e.g. gpt-4o, gpt-4o-2024-11-20)
 CHATGPT_API_KEY=your_openai_key_here
+
+# For Gemini models (e.g. gemini-1.5-pro, gemini-1.5-flash)
+GEMINI_API_KEY=your_gemini_key_here
 ```
 
+Use `--model` to choose the model; if the model name starts with `gemini-`, `GEMINI_API_KEY` is used; otherwise `CHATGPT_API_KEY` is used.
+
 ### 3. Run PageIndex on your PDF
 
 ```bash
@@ -167,7 +173,7 @@ python3 run_pageindex.py --pdf_path /path/to/your/document.pdf
 You can customize the processing with additional optional arguments:
 
 ```
---model                 OpenAI model to use (default: gpt-4o-2024-11-20)
+--model                 LLM model: OpenAI (e.g. gpt-4o-2024-11-20) or Gemini (e.g. gemini-1.5-pro) (default: gpt-4o-2024-11-20)
 --toc-check-pages       Pages to check for table of contents (default: 20)
 --max-pages-per-node    Max pages per node (default: 10)
 --max-tokens-per-node   Max tokens per node (default: 20000)
diff --git a/pageindex/utils.py b/pageindex/utils.py
index dc7acd88..652c19bf 100644
--- a/pageindex/utils.py
+++ b/pageindex/utils.py
@@ -1,3 +1,4 @@
+import re
 import tiktoken
 import openai
 import logging
@@ -18,17 +19,80 @@ from types import SimpleNamespace as config
 
 
 CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY")
+GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
+
+def _use_gemini(model, api_key):
+    """Return (use_gemini: bool, key to use). Uses Gemini when model starts with 'gemini-' and a Gemini key is available; an api_key equal to CHATGPT_API_KEY (the callers' default) is not treated as a Gemini key."""
+    if not model or not (str(model).startswith("gemini-")):
+        return False, api_key or CHATGPT_API_KEY
+    gemini_key = api_key if (api_key and api_key != CHATGPT_API_KEY) else GEMINI_API_KEY  # callers default api_key to the OpenAI key; never send that one to Gemini
+    if gemini_key:
+        return True, gemini_key
+    return False, api_key or CHATGPT_API_KEY
+
+def _gemini_generate_sync(model, prompt, api_key, chat_history=None):
+    """Sync Gemini generate_content via google.genai. Returns (text, finish_reason): text is "Error" when the response has no text; finish_reason is "max_output_reached" when the first candidate's finish reason contains "MAX", otherwise "finished"."""
+    from google import genai
+    from google.genai import types
+    client = genai.Client(api_key=api_key)
+    gen_config = types.GenerateContentConfig(temperature=0)  # local name avoids shadowing the module-level `config` alias
+    if chat_history:
+        contents = []
+        for m in chat_history:
+            role = "user" if m.get("role") == "user" else "model"  # every non-user role is sent as "model"
+            contents.append(types.Content(role=role, parts=[types.Part.from_text(text=m.get("content", ""))]))
+        contents.append(types.Content(role="user", parts=[types.Part.from_text(text=prompt)]))
+        response = client.models.generate_content(model=model, contents=contents, config=gen_config)
+    else:
+        response = client.models.generate_content(model=model, contents=prompt, config=gen_config)
+    text = response.text if response.text else "Error"
+    finish_reason = "finished"
+    if response.candidates:
+        fr = getattr(response.candidates[0], "finish_reason", None)
+        if fr is not None and "MAX" in str(fr).upper():
+            finish_reason = "max_output_reached"
+    return text, finish_reason
+
+async def _gemini_generate_async(model, prompt, api_key):
+    """Async Gemini generate_content via google.genai; the blocking SDK call runs in the loop's default executor. Returns the response text, or "Error" when the response has no text."""
+    from google import genai
+    from google.genai import types
+    client = genai.Client(api_key=api_key)
+    gen_config = types.GenerateContentConfig(temperature=0)  # local name avoids shadowing the module-level `config` alias
+    loop = asyncio.get_running_loop()  # always called from a coroutine, so a loop is running
+    response = await loop.run_in_executor(
+        None,
+        lambda: client.models.generate_content(model=model, contents=prompt, config=gen_config),
+    )
+    return response.text if response.text else "Error"
 
 def count_tokens(text, model=None):
     if not text:
         return 0
-    enc = tiktoken.encoding_for_model(model)
+    if model and str(model).startswith("gemini-"):
+        return max(1, len(text) // 4)  # rough estimate (~4 characters per token); tiktoken is only used for non-Gemini models
+    enc = tiktoken.encoding_for_model(model or "gpt-4o")
     tokens = enc.encode(text)
     return len(tokens)
 
 def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
+    use_gemini, key = _use_gemini(model, api_key)
+    if use_gemini:
+        max_retries = 10
+        for i in range(max_retries):
+            try:
+                return _gemini_generate_sync(model, prompt, key, chat_history=chat_history)
+            except Exception as e:
+                print('************* Retrying *************')
+                logging.error(f"Error: {e}")
+                if i < max_retries - 1:
+                    time.sleep(1)
+                else:
+                    logging.error('Max retries reached for prompt: ' + prompt)
+                    return "Error", "error"
+        return "Error", "error"
     max_retries = 10
-    client = openai.OpenAI(api_key=api_key)
+    client = openai.OpenAI(api_key=key)
     for i in range(max_retries):
         try:
             if chat_history:
@@ -54,13 +118,29 @@ def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_
                 time.sleep(1)  # Wait for 1秒 before retrying
             else:
                 logging.error('Max retries reached for prompt: ' + prompt)
-                return "Error"
+                return "Error", "error"
 
 
 
 def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
+    use_gemini, key = _use_gemini(model, api_key)
+    if use_gemini:
+        max_retries = 10
+        for i in range(max_retries):
+            try:
+                text, _ = _gemini_generate_sync(model, prompt, key, chat_history=chat_history)
+                return text
+            except Exception as e:
+                print('************* Retrying *************')
+                logging.error(f"Error: {e}")
+                if i < max_retries - 1:
+                    time.sleep(1)
+                else:
+                    logging.error('Max retries reached for prompt: ' + prompt)
+                    return "Error"
+        return "Error"
     max_retries = 10
-    client = openai.OpenAI(api_key=api_key)
+    client = openai.OpenAI(api_key=key)
     for i in range(max_retries):
         try:
             if chat_history:
@@ -87,11 +167,26 @@ def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
 
 
 async def ChatGPT_API_async(model, prompt, api_key=CHATGPT_API_KEY):
+    use_gemini, key = _use_gemini(model, api_key)
+    if use_gemini:
+        max_retries = 10
+        for i in range(max_retries):
+            try:
+                return await _gemini_generate_async(model, prompt, key)
+            except Exception as e:
+                print('************* Retrying *************')
+                logging.error(f"Error: {e}")
+                if i < max_retries - 1:
+                    await asyncio.sleep(1)
+                else:
+                    logging.error('Max retries reached for prompt: ' + prompt)
+                    return "Error"
+        return "Error"
     max_retries = 10
     messages = [{"role": "user", "content": prompt}]
     for i in range(max_retries):
         try:
-            async with openai.AsyncOpenAI(api_key=api_key) as client:
+            async with openai.AsyncOpenAI(api_key=key) as client:
                 response = await client.chat.completions.create(
                     model=model,
                     messages=messages,
@@ -411,14 +506,15 @@ def add_preface_if_needed(data):
 
 
 def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"):
-    enc = tiktoken.encoding_for_model(model)
+    use_tiktoken = not (model and str(model).startswith("gemini-"))
+    enc = tiktoken.encoding_for_model(model or "gpt-4o") if use_tiktoken else None
     if pdf_parser == "PyPDF2":
         pdf_reader = PyPDF2.PdfReader(pdf_path)
         page_list = []
         for page_num in range(len(pdf_reader.pages)):
             page = pdf_reader.pages[page_num]
             page_text = page.extract_text()
-            token_length = len(enc.encode(page_text))
+            token_length = len(enc.encode(page_text)) if enc else max(1, len(page_text) // 4)
             page_list.append((page_text, token_length))
         return page_list
     elif pdf_parser == "PyMuPDF":
@@ -430,7 +526,7 @@ def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"):
         page_list = []
         for page in doc:
             page_text = page.get_text()
-            token_length = len(enc.encode(page_text))
+            token_length = len(enc.encode(page_text)) if enc else max(1, len(page_text) // 4)
             page_list.append((page_text, token_length))
         return page_list
     else:
diff --git a/requirements.txt b/requirements.txt
index 463db58f..5d2b2776 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 openai==1.101.0
+google-genai>=1.0.0
 pymupdf==1.26.4
 PyPDF2==3.0.1
 python-dotenv==1.1.0
diff --git a/results/earthmover_structure.json b/results/earthmover_structure.json
new file mode 100644
index 00000000..7c95507d
--- /dev/null
+++ b/results/earthmover_structure.json
@@ -0,0 +1,144 @@
+{
+  "doc_name": "earthmover.pdf",
+  "structure": [
+    {
+      "title": "INTRODUCTION",
+      "start_index": 1,
+      "end_index": 2,
+      "node_id": "0000",
+      "summary": "This document presents an optimized approach for Earth Mover\u2019s Distance (EMD) based similarity search at scale. It addresses the high computational cost of EMD, which typically acts as a bottleneck in the filter-and-refinement framework. The authors propose three main techniques to optimize the refinement phase: (i) adapting a Simplified Graph Incremental Algorithm (SIA) for more efficient min-cost flow computation, (ii) introducing a Progressive Bounding (PB) technique to enable early termination of EMD calculations, and (iii) implementing a Dynamic Refinement Ordering (DRO) strategy to reduce redundant computations. These methods are designed to be orthogonal to existing filtering techniques and are shown to significantly improve performance on large-scale datasets with high-dimensional histograms."
+ }, + { + "title": "PRELIMINARIES", + "start_index": 2, + "end_index": 3, + "nodes": [ + { + "title": "Computing the EMD", + "start_index": 3, + "end_index": 3, + "node_id": "0002", + "summary": "This document section details the computational methods for Earth Mover's Distance (EMD) and strategies for efficient similarity searching. It explains how EMD can be modeled as a minimum-cost network flow problem, specifically describing the Successive Shortest Path (SSP) algorithm, its mechanics regarding feasible edges and flow augmentation, and its computational complexity. Furthermore, it introduces a filter-and-refinement framework for k-nearest neighbor (k-NN) queries, which utilizes various lower-bound filtering techniques to prune the search space before performing exact EMD calculations on the remaining candidates." + }, + { + "title": "Filter-and-Re\ufb01nement Framework", + "start_index": 3, + "end_index": 4, + "node_id": "0003", + "summary": "This document details the computational methods for Earth Mover's Distance (EMD) and its application in similarity search. It covers the modeling of EMD as a minimum-cost flow problem, specifically explaining the Successive Shortest Path (SSP) algorithm and the mechanics of augmenting flow along feasible paths. Furthermore, it describes the \"filter-and-re\ufb01nement\" framework used for efficient k-nearest neighbor (k-NN) queries, which utilizes lower-bound filtering to prune histograms before performing exact EMD calculations. Finally, the text introduces the Simplified Graph Incremental Algorithm (SIA) as an optimization to scale up EMD computation by incrementally constructing partial flow graphs, thereby reducing the search cost compared to standard SSP." 
+ } + ], + "node_id": "0001", + "summary": "Error" + }, + { + "title": "SCALING UP SSP", + "start_index": 4, + "end_index": 5, + "node_id": "0004", + "summary": "This document describes an optimized framework for k-nearest neighbor (k-NN) similarity search using Earth Mover's Distance (EMD). It details a filter-and-refinement approach, focusing on scaling up EMD computation for high-granularity histograms through the Simplified Graph Incremental Algorithm (SIA). Furthermore, it introduces two techniques\u2014Progressive Bounding (PB) and Dynamic Refinement Ordering (DRO)\u2014designed to optimize the refinement phase by allowing early termination of EMD calculations when a candidate object is unlikely to be part of the k-NN result set." + }, + { + "title": "BOOSTING THE REFINEMENT PHASE", + "start_index": 5, + "end_index": 5, + "nodes": [ + { + "title": "Analysis of EMD Calculation", + "start_index": 5, + "end_index": 5, + "node_id": "0006", + "summary": "This document section focuses on optimizing the refinement phase of EMD-based similarity search. It introduces two novel techniques, Progressive Bounding (PB) and Dynamic Refinement Ordering (DRO), designed to improve efficiency by moving away from treating EMD calculations as black-box processes. The text analyzes the performance of the SIA algorithm, noting that a significant portion of execution time occurs in the final iterations. To address this, the authors propose using a running lower bound (emd-) that combines the current accumulated cost with an estimation of remaining flow costs, allowing for early termination of EMD calculations when an object is unlikely to be part of the query result set." 
+ }, + { + "title": "Progressive Bounding", + "start_index": 5, + "end_index": 8, + "node_id": "0007", + "summary": "Error" + }, + { + "title": "Sensitivity to Re\ufb01nement Order", + "start_index": 8, + "end_index": 7, + "node_id": "0008", + "summary": "Error" + }, + { + "title": "Dynamic Re\ufb01nement Ordering", + "start_index": 7, + "end_index": 8, + "node_id": "0009", + "summary": "This document introduces a \"Dynamic Refinement Ordering\" (DRO) technique designed to optimize Earth Mover's Distance (EMD)-based k-NN similarity searches. The approach improves upon traditional filter-and-refinement frameworks by concurrently refining multiple candidate objects using a priority-based strategy. By prioritizing candidates with the lowest current EMD lower bounds, the method accelerates the convergence of the pruning threshold, allowing for more efficient filtering and earlier termination of expensive EMD calculations. The text also details the implementation of a \"Running Upper Bound\" (GreedyUB) algorithm, which provides an efficient way to estimate EMD upper bounds during the refinement process, and outlines the trade-offs between memory consumption and performance in the proposed DRO strategy." + }, + { + "title": "Running Upper Bound", + "start_index": 8, + "end_index": 12, + "node_id": "0010", + "summary": "This document segment details advanced techniques for optimizing Earth Mover\u2019s Distance (EMD) computation within a filter-and-refinement similarity search framework. It introduces Dynamic Refinement Ordering (DRO), which prioritizes the refinement of candidate objects by concurrently managing multiple partial EMD computations, and a Progressive Bounding (PB) technique that utilizes a greedy approach to maintain running upper bounds for early pruning. 
The text also presents an experimental evaluation demonstrating that these methods significantly outperform state-of-the-art EMD computation approaches in terms of query time and scalability across various high-dimensional datasets." + } + ], + "node_id": "0005", + "summary": "This document section focuses on optimizing the refinement phase of EMD-based similarity search. It introduces two novel techniques, Progressive Bounding (PB) and Dynamic Refinement Ordering (DRO), designed to improve efficiency by moving away from treating EMD calculations as black-box processes. The text analyzes the performance of the SIA algorithm, noting that a significant portion of execution time occurs in the final iterations. To address this, the authors propose using a running lower bound (emd-) that combines the current accumulated cost with an estimation of remaining flow costs, allowing for early termination of EMD calculations when an object is unlikely to be part of the query result set." + }, + { + "title": "EXPERIMENTAL EVALUATION", + "start_index": 12, + "end_index": 9, + "nodes": [ + { + "title": "Performance Improvement", + "start_index": 9, + "end_index": 10, + "node_id": "0012", + "summary": "This document section evaluates the performance of various Earth Mover's Distance (EMD) computation methods within a filter-and-refinement framework for k-NN similarity queries. It introduces and benchmarks two novel techniques\u2014Progressive Bounding (PB) and Dynamic Refinement Ordering (DRO)\u2014against existing black-box EMD methods (such as SIA, SSP, and TRA) across six real-world image datasets. The results demonstrate that the proposed DRO methodology significantly outperforms traditional approaches, achieving performance levels close to ideal oracle-based methods. Additionally, the text includes scalability experiments showing that PB and DRO are highly efficient and remain robust as database size increases." 
+ }, + { + "title": "Scalability Experiments", + "start_index": 10, + "end_index": 11, + "node_id": "0013", + "summary": "This document presents an empirical performance evaluation of various methods for computing Earth Mover\u2019s Distance (EMD) in the context of k-nearest neighbor (k-NN) similarity queries. It compares the proposed Dynamic Reordering (DRO) and Progressive Bounding (PB) techniques against existing approaches (SIA, OI, ESS) across multiple datasets (RETINA, IRMA, PANORAMIO, FRIENDS, WORLD). The text highlights the effectiveness of DRO in minimizing the number of EMD refinements and query time, demonstrates the scalability of these methods regarding database cardinality and histogram dimensionality, and discusses parameter tuning for the DRO algorithm. Finally, it provides a literature review of EMD applications and existing filter-and-refinement frameworks in database research." + }, + { + "title": "Parameter Tuning in DRO", + "start_index": 11, + "end_index": 11, + "node_id": "0014", + "summary": "This document segment evaluates the performance and scalability of the proposed DRO (Dynamic Reordering) technique for Earth Mover\u2019s Distance (EMD)-based similarity search. It presents experimental results demonstrating that DRO and PB (Pruning-based) methods scale more effectively than SIA regarding dataset cardinality and histogram dimensionality. Additionally, the text details parameter tuning for DRO, specifically analyzing the impact of concurrent refinement size and the effectiveness of different object-prioritization functions. Finally, the document provides a literature review of existing EMD research, noting that while previous studies focused primarily on filter-phase efficiency, the proposed work addresses the critical, often overlooked, computational cost of the refinement phase." + } + ], + "node_id": "0011", + "summary": "Please provide the text of the partial document you would like me to describe." 
+ }, + { + "title": "RELATED WORK", + "start_index": 11, + "end_index": 12, + "node_id": "0015", + "summary": "This document presents an efficient approach for evaluating Earth Mover\u2019s Distance (EMD)-based similarity queries. It introduces a progressive refinement strategy that updates lower bounds during computation to enable early pruning, and a concurrent refinement technique (DRO) that dynamically reorders candidates to improve efficiency. The paper evaluates these methods through scalability studies regarding dataset cardinality and histogram dimensionality, demonstrates parameter tuning for the DRO technique, and provides a comprehensive review of related work in EMD-based similarity search, concluding that the proposed methods significantly reduce computational costs compared to existing state-of-the-art solutions." + }, + { + "title": "CONCLUSION", + "start_index": 12, + "end_index": 12, + "node_id": "0016", + "summary": "This document section concludes a research paper on the efficient evaluation of similarity queries using Earth Mover\u2019s Distance (EMD). It summarizes the authors' contributions, which include adapting the SIA algorithm for histogram comparison, implementing a progressive refinement strategy for early pruning, and developing a technique for concurrent candidate refinement. The text highlights that these methods significantly outperform state-of-the-art solutions, notes the framework's flexibility regarding histogram normalization and range queries, and outlines future research directions involving other distance measures like dynamic time warping. The document concludes with acknowledgments and a comprehensive list of references." + }, + { + "title": "ACKNOWLEDGMENT", + "start_index": 12, + "end_index": 12, + "node_id": "0017", + "summary": "This document section concludes a research paper on the efficient evaluation of similarity queries using Earth Mover\u2019s Distance (EMD). 
It summarizes the authors' contributions, which include adapting the SIA algorithm for histogram comparison, implementing a progressive refinement strategy for early pruning, and developing a technique for concurrent candidate refinement. The text highlights that these methods significantly outperform state-of-the-art solutions, notes the framework's flexibility regarding histogram normalization and range queries, and outlines future research directions involving other distance measures like dynamic time warping. The document concludes with acknowledgments and a comprehensive list of references." + }, + { + "title": "REFERENCES", + "start_index": 12, + "end_index": 12, + "node_id": "0018", + "summary": "This document section concludes a research paper on the efficient evaluation of similarity queries using Earth Mover\u2019s Distance (EMD). It summarizes the authors' contributions, which include adapting the SIA algorithm for histogram comparison, implementing a progressive refinement strategy for early pruning, and developing a technique for concurrent candidate refinement. The text highlights that these methods significantly outperform state-of-the-art solutions, notes the framework's flexibility regarding histogram normalization and range queries, and outlines future research directions involving other distance measures like dynamic time warping. The document concludes with acknowledgments and a comprehensive list of references." + } + ] +} \ No newline at end of file diff --git a/run_pageindex.py b/run_pageindex.py index 10702450..f40ebdda 100644 --- a/run_pageindex.py +++ b/run_pageindex.py @@ -10,7 +10,7 @@ parser.add_argument('--pdf_path', type=str, help='Path to the PDF file') parser.add_argument('--md_path', type=str, help='Path to the Markdown file') - parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20', help='Model to use') + parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20', help='LLM model: OpenAI (e.g. 
gpt-4o-2024-11-20) or Gemini (e.g. gemini-1.5-pro)') parser.add_argument('--toc-check-pages', type=int, default=20, help='Number of pages to check for table of contents (PDF only)')