# Lift the pixel-count ceiling on images so very large page renders are not
# rejected. NOTE(review): `Image` is presumably PIL.Image (the import is not
# visible here); with Pillow, setting this to None turns off its
# decompression-bomb guard, so only feed this script trusted PDFs.
Image.MAX_IMAGE_PIXELS = None
# Keep MuPDF from printing its own diagnostics; failures are reported by the
# explicit "[ERROR]" messages in the extraction functions instead.
fitz.TOOLS.mupdf_display_errors(False)
2525
26+
def _read_camelot_tables(pdf_path: str):
    """Return camelot's tables for *pdf_path*, trying stream and then lattice."""
    try:
        # Stream flavor first: better for tables without drawn borders.
        tables = camelot.read_pdf(
            pdf_path,
            pages="all",
            flavor="stream",
            edge_tol=500,  # Tolerance for detecting table edges
            row_tol=10,  # Tolerance for row detection
            column_tol=5,  # Tolerance for column detection
        )
    except Exception as e:
        # A stream failure must not prevent the lattice attempt below.
        print(f"[ERROR] Camelot stream extraction failed: {e}", file=sys.stderr)
        tables = []

    # If still no tables, try lattice method (for bordered tables).
    if len(tables) == 0:
        tables = camelot.read_pdf(
            pdf_path,
            pages="all",
            flavor="lattice",
            line_scale=40,
        )

    return tables


def extract_tables_with_camelot(pdf_path: str) -> List[Dict]:
    """Extract tables using camelot-py (fallback method).

    The ``stream`` flavor is tried first; when it fails or detects nothing,
    the ``lattice`` flavor is tried. Detections with fewer than three rows
    (header row included) or fewer than two columns are discarded as likely
    false positives.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        List of dictionaries containing table data and metadata. The list is
        empty when nothing is detected or when extraction fails; failures are
        reported on stderr rather than raised.
    """
    tables_data = []

    try:
        tables = _read_camelot_tables(str(pdf_path))

        for idx, table in enumerate(tables, start=1):
            # Convert to list of lists
            table_cells = table.df.values.tolist()

            # Add header row (pandas columns).
            # NOTE(review): camelot frames typically carry positional integer
            # column labels, so this row may just be 0..n-1 - confirm that
            # downstream consumers want it.
            table_cells.insert(0, table.df.columns.tolist())

            # Skip tables with very few cells (likely detection errors).
            # table_cells always holds at least the header row at this point.
            if len(table_cells) < 3 or len(table_cells[0]) < 2:
                continue

            # `_bbox` is a private camelot attribute; fall back to zeros when
            # it is missing or unset instead of failing the whole extraction.
            bbox = getattr(table, "_bbox", None)
            if bbox is None:
                bbox = (0, 0, 0, 0)

            table_info = {
                "table_id": f"Table_P{table.page}_T{idx}",
                "page_number": table.page,
                "table_index": idx,
                "bbox": {
                    "x0": bbox[0],
                    "y0": bbox[1],
                    "x1": bbox[2],
                    "y1": bbox[3],
                },
                "num_rows": len(table_cells),
                "num_cols": len(table_cells[0]),
                "cells": table_cells,
                "accuracy": float(getattr(table, "accuracy", 0.0)),
                "extraction_method": "camelot",
            }

            tables_data.append(table_info)

    except Exception as e:
        print(f"[ERROR] Camelot extraction failed: {e}", file=sys.stderr)

    return tables_data
9384
9485
def extract_tables_from_pdf(pdf_path: str) -> List[Dict]:
    """Extract tables from PDF using PyMuPDF first, then camelot-py as fallback.

    Each page is scanned with PyMuPDF's ``find_tables``. A table whose
    extraction raises is recorded as an entry with an ``"error"`` key instead
    of aborting the run. When PyMuPDF yields no usable table (none at all, or
    only error entries), camelot is tried; its results replace the PyMuPDF
    ones only if it actually finds something, so error records stay visible
    otherwise.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        List of dictionaries containing table data and metadata
    """
    tables_data = []

    # Try PyMuPDF first
    try:
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                tabs = page.find_tables()

                for table_idx, tab in enumerate(tabs.tables, start=1):
                    try:
                        table_cells = tab.extract()

                        if not table_cells:
                            continue

                        # PyMuPDF documents Table.bbox as a plain
                        # (x0, y0, x1, y1) tuple, so unpack it rather than
                        # reading .x0/.y0 attributes; unpacking also works
                        # for Rect-like objects.
                        x0, y0, x1, y1 = tab.bbox

                        table_info = {
                            "table_id": f"Table_P{page_num}_T{table_idx}",
                            "page_number": page_num,
                            "table_index": table_idx,
                            "bbox": {"x0": x0, "y0": y0, "x1": x1, "y1": y1},
                            "num_rows": len(table_cells),
                            "num_cols": len(table_cells[0]),
                            "cells": table_cells,
                            "extraction_method": "pymupdf",
                        }

                        tables_data.append(table_info)

                    except Exception as e:
                        # Keep a record of the failed table and carry on with
                        # the remaining ones.
                        tables_data.append(
                            {
                                "table_id": f"Table_P{page_num}_T{table_idx}",
                                "page_number": page_num,
                                "table_index": table_idx,
                                "error": str(e),
                                "extraction_method": "pymupdf",
                            }
                        )

    except Exception as e:
        print(f"[ERROR] PyMuPDF table extraction failed: {e}", file=sys.stderr)

    # If PyMuPDF produced no usable table, try camelot. Error-only results
    # count as "no tables" so they cannot suppress the fallback.
    if not any("error" not in entry for entry in tables_data):
        fallback_tables = extract_tables_with_camelot(pdf_path)
        if fallback_tables:
            tables_data = fallback_tables

    return tables_data
159139
160140
@@ -180,7 +160,7 @@ def extract_text_from_pdf(pdf_path: str) -> str:
180160 except Exception as e :
181161 print (f"[ERROR] Failed to extract text from { pdf_path } : { e } " , file = sys .stderr )
182162 return ""
183-
163+
184164 return "\n " .join (text )
185165
186166
@@ -214,13 +194,10 @@ def save_to_file(text: str, output_path: str):
214194
215195
216196def main ():
217- parser = argparse .ArgumentParser (
218- description = "Extract text and tables from PDF using PyMuPDF and camelot-py."
219- )
197+ parser = argparse .ArgumentParser (description = "Extract text and tables from PDF using PyMuPDF and camelot-py." )
220198 parser .add_argument ("pdf" , type = str , help = "Path to the input PDF file." )
221- parser .add_argument ("--output-dir" , type = str , default = "data/processed-text" ,
222- help = "Output directory for extracted text (default: data/processed-text)" )
223-
199+ parser .add_argument ("--output-dir" , type = str , default = "data/processed-text" , help = "Output directory for extracted text (default: data/processed-text)" )
200+
224201 args = parser .parse_args ()
225202
226203 pdf_path = Path (args .pdf )
@@ -230,17 +207,17 @@ def main():
230207
231208 # Extract text
232209 text = extract_text_from_pdf (str (pdf_path ))
233-
210+
234211 # Extract tables
235212 tables_data = extract_tables_from_pdf (str (pdf_path ))
236-
213+
237214 # If tables were found, save JSON
238215 if tables_data :
239-
216+
240217 # Save tables JSON
241218 output_dir = Path (args .output_dir )
242219 output_dir .mkdir (parents = True , exist_ok = True )
243-
220+
244221 tables_json_path = output_dir / (pdf_path .stem + "_tables.json" )
245222 with open (tables_json_path , "w" , encoding = "utf-8" ) as f :
246223 json .dump (tables_data , f , indent = 2 )
@@ -251,4 +228,4 @@ def main():
251228
252229
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
0 commit comments