66This script uses PyMuPDF for accurate and efficient text extraction from
77scientific PDFs. It preserves reading order, handles multi-column text, and
88automatically applies OCR when a page contains only images (e.g., scanned documents).
9- Tables are automatically detected and extracted using PyMuPDF first, with camelot-py as fallback.
109"""
1110
11+ # Extract all text from a PDF using PyMuPDF
1212import fitz
1313import pytesseract
1414from PIL import Image
1515import io
1616import argparse
1717from pathlib import Path
1818import sys
19- import json
20- from typing import List , Dict
21- import camelot
2219
2320Image .MAX_IMAGE_PIXELS = None
2421fitz .TOOLS .mupdf_display_errors (False )
2522
2623
def extract_tables_with_camelot(pdf_path: str) -> List[Dict]:
    """Extract tables using camelot-py (fallback method).

    The ``stream`` flavor is tried first (better for tables without borders).
    The ``lattice`` flavor (for bordered tables) is tried when the stream pass
    returns no tables or raises, so a failure in one flavor does not prevent
    the other from running.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        List of dictionaries containing table data and metadata. The list is
        empty when camelot finds nothing or fails; failures are reported on
        stderr instead of being raised.
    """
    # (flavor, flavor-specific keyword arguments), in the order they are tried.
    attempts = (
        (
            "stream",
            {
                "edge_tol": 500,  # Tolerance for detecting table edges
                "row_tol": 10,  # Tolerance for row detection
                "column_tol": 5,  # Tolerance for column detection
            },
        ),
        ("lattice", {"line_scale": 40}),
    )

    tables = []
    for flavor, options in attempts:
        try:
            tables = camelot.read_pdf(str(pdf_path), pages="all", flavor=flavor, **options)
        except Exception as e:
            print(f"[ERROR] Camelot extraction failed: {e}", file=sys.stderr)
            tables = []
            continue
        if len(tables) > 0:
            break

    tables_data = []
    try:
        for idx, table in enumerate(tables, start=1):
            frame = table.df

            # Header row (pandas column labels) followed by the data rows.
            # NOTE(review): camelot frames appear to carry positional integer
            # column labels, so this header may just be 0..n-1 - confirm it is wanted.
            table_cells = [frame.columns.tolist(), *frame.values.tolist()]

            # Skip tables with very few cells (likely detection errors)
            if len(table_cells) < 3 or len(table_cells[0]) < 2:
                continue

            # _bbox is a private camelot attribute; fall back to zeros if absent.
            bbox = getattr(table, "_bbox", (0, 0, 0, 0))

            tables_data.append(
                {
                    # idx counts tables across the whole document, not per page.
                    "table_id": f"Table_P{table.page}_T{idx}",
                    "page_number": table.page,
                    "table_index": idx,
                    "bbox": {"x0": bbox[0], "y0": bbox[1], "x1": bbox[2], "y1": bbox[3]},
                    "num_rows": len(table_cells),
                    "num_cols": len(table_cells[0]),
                    "cells": table_cells,
                    "accuracy": float(getattr(table, "accuracy", 0.0)),
                    "extraction_method": "camelot",
                }
            )
    except Exception as e:
        # Keep whatever was converted before the failure.
        print(f"[ERROR] Camelot extraction failed: {e}", file=sys.stderr)

    return tables_data
84-
85-
def extract_tables_from_pdf(pdf_path: str) -> List[Dict]:
    """Extract tables from PDF using PyMuPDF first, then camelot-py as fallback.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        List of dictionaries containing table data and metadata. A table whose
        extraction raises is reported as a record with an ``"error"`` key
        instead of ``"cells"``. When PyMuPDF yields no records at all, the
        result of ``extract_tables_with_camelot`` is returned instead.
    """
    tables_data = []

    # Try PyMuPDF first
    try:
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                for table_idx, tab in enumerate(page.find_tables().tables, start=1):
                    table_id = f"Table_P{page_num}_T{table_idx}"
                    try:
                        table_cells = tab.extract()
                        if not table_cells:
                            continue

                        # Table.bbox is a plain (x0, y0, x1, y1) tuple in PyMuPDF,
                        # so unpack it rather than reading .x0/.y0 attributes
                        # (unpacking also works if a Rect is ever returned).
                        x0, y0, x1, y1 = tab.bbox

                        tables_data.append(
                            {
                                "table_id": table_id,
                                "page_number": page_num,
                                "table_index": table_idx,
                                "bbox": {"x0": x0, "y0": y0, "x1": x1, "y1": y1},
                                "num_rows": len(table_cells),
                                "num_cols": len(table_cells[0]),
                                "cells": table_cells,
                                "extraction_method": "pymupdf",
                            }
                        )
                    except Exception as e:
                        # Record the failure for this table and keep going.
                        tables_data.append(
                            {
                                "table_id": table_id,
                                "page_number": page_num,
                                "table_index": table_idx,
                                "error": str(e),
                                "extraction_method": "pymupdf",
                            }
                        )
    except Exception as e:
        print(f"[ERROR] PyMuPDF table extraction failed: {e}", file=sys.stderr)

    # If PyMuPDF found no tables, try camelot
    if not tables_data:
        tables_data = extract_tables_with_camelot(pdf_path)

    return tables_data
139-
140-
14124def extract_text_from_pdf (pdf_path : str ) -> str :
14225 text = []
26+ print (f"Extracting text from { pdf_path } ." )
14327 try :
14428 with fitz .open (pdf_path ) as doc :
14529 for page_num , page in enumerate (doc , start = 1 ):
@@ -160,7 +44,7 @@ def extract_text_from_pdf(pdf_path: str) -> str:
16044 except Exception as e :
16145 print (f"[ERROR] Failed to extract text from { pdf_path } : { e } " , file = sys .stderr )
16246 return ""
163-
47+ # Join all pages into a single string separated by newlines
16448 return "\n " .join (text )
16549
16650
@@ -184,46 +68,31 @@ def extract_text_from_pdf_bytes(data: bytes) -> str:
18468 return "\n " .join (text )
18569
18670
def save_to_file(text: str, output_path: str):
    """Save extracted text to a file.

    Missing parent directories of ``output_path`` are created first, so the
    write does not fail just because the output folder does not exist yet.
    Failures are reported on stderr rather than raised.

    Args:
        text: Text to write.
        output_path: Destination file path; written as UTF-8, overwriting any
            existing file.
    """
    try:
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)
        print(f"[INFO] Text successfully saved to {output_path}")
    except Exception as e:
        print(f"[ERROR] Could not save text to {output_path}: {e}", file=sys.stderr)
19479
19580
def main():
    """Command-line entry point: extract text and tables from one PDF.

    Writes ``<output-dir>/<pdf name>.txt`` with the extracted text and, when
    any tables were found, ``<output-dir>/<pdf stem>_tables.json``. Exits with
    status 1 if the input file does not exist.
    """
    parser = argparse.ArgumentParser(description="Extract text and tables from PDF using PyMuPDF and camelot-py.")
    parser.add_argument("pdf", type=str, help="Path to the input PDF file.")
    parser.add_argument(
        "--output-dir",
        type=str,
        default="data/processed-text",
        help="Output directory for extracted text (default: data/processed-text)",
    )
    args = parser.parse_args()

    pdf_path = Path(args.pdf)
    if not pdf_path.exists():
        print(f"[ERROR] File not found: {pdf_path}", file=sys.stderr)
        sys.exit(1)

    # Perform extraction
    text = extract_text_from_pdf(str(pdf_path))
    tables_data = extract_tables_from_pdf(str(pdf_path))

    # Create the output directory up front: the text file is written even when
    # no tables were found, so the directory must not depend on tables_data.
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # If tables were found, save them as JSON
    if tables_data:
        tables_json_path = output_dir / (pdf_path.stem + "_tables.json")
        with open(tables_json_path, "w", encoding="utf-8") as f:
            json.dump(tables_data, f, indent=2)

    # Save combined text
    output_path = output_dir / pdf_path.with_suffix(".txt").name
    save_to_file(text, str(output_path))
22897
22998
0 commit comments