Skip to content

Commit bb89bae

Browse files
committed
Fix: Formatting issues.
1 parent 916de8f commit bb89bae

1 file changed

Lines changed: 42 additions & 65 deletions

File tree

src/preprocessing/pdf_text_extraction.py

Lines changed: 42 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -23,50 +23,41 @@
2323
Image.MAX_IMAGE_PIXELS = None
2424
fitz.TOOLS.mupdf_display_errors(False)
2525

26+
2627
def extract_tables_with_camelot(pdf_path: str) -> List[Dict]:
2728
"""Extract tables using camelot-py (fallback method).
28-
29+
2930
Args:
3031
pdf_path: Path to the PDF file
31-
32+
3233
Returns:
3334
List of dictionaries containing table data and metadata
3435
"""
35-
36+
3637
tables_data = []
37-
38+
3839
try:
3940
# Try stream method first with edge detection (better for tables without borders)
4041
tables = camelot.read_pdf(
41-
str(pdf_path),
42-
pages='all',
43-
flavor='stream',
44-
edge_tol=500, # Tolerance for detecting table edges
45-
row_tol=10, # Tolerance for row detection
46-
column_tol=5 # Tolerance for column detection
42+
str(pdf_path), pages='all', flavor='stream', edge_tol=500, row_tol=10, column_tol=5 # edge_tol: tolerance for detecting table edges; row_tol: tolerance for row detection; column_tol: tolerance for column detection
4743
)
48-
44+
4945
# If still no tables, try lattice method (for bordered tables)
5046
if len(tables) == 0:
51-
tables = camelot.read_pdf(
52-
str(pdf_path),
53-
pages='all',
54-
flavor='lattice',
55-
line_scale=40
56-
)
57-
47+
tables = camelot.read_pdf(str(pdf_path), pages='all', flavor='lattice', line_scale=40)
48+
5849
for idx, table in enumerate(tables, start=1):
5950
# Convert to list of lists
6051
table_cells = table.df.values.tolist()
61-
52+
6253
# Add header row (pandas columns)
6354
header = table.df.columns.tolist()
6455
table_cells.insert(0, header)
65-
56+
6657
# Skip tables with very few cells (likely detection errors)
6758
if len(table_cells) < 3 or (len(table_cells[0]) if table_cells else 0) < 2:
6859
continue
69-
60+
7061
table_info = {
7162
"table_id": f"Table_P{table.page}_T{idx}",
7263
"page_number": table.page,
@@ -75,86 +66,75 @@ def extract_tables_with_camelot(pdf_path: str) -> List[Dict]:
7566
"x0": table._bbox[0] if hasattr(table, '_bbox') else 0,
7667
"y0": table._bbox[1] if hasattr(table, '_bbox') else 0,
7768
"x1": table._bbox[2] if hasattr(table, '_bbox') else 0,
78-
"y1": table._bbox[3] if hasattr(table, '_bbox') else 0
69+
"y1": table._bbox[3] if hasattr(table, '_bbox') else 0,
7970
},
8071
"num_rows": len(table_cells),
8172
"num_cols": len(table_cells[0]) if table_cells else 0,
8273
"cells": table_cells,
8374
"accuracy": float(table.accuracy) if hasattr(table, 'accuracy') else 0.0,
84-
"extraction_method": "camelot"
75+
"extraction_method": "camelot",
8576
}
86-
77+
8778
tables_data.append(table_info)
88-
79+
8980
except Exception as e:
9081
print(f"[ERROR] Camelot extraction failed: {e}", file=sys.stderr)
91-
82+
9283
return tables_data
9384

9485

9586
def extract_tables_from_pdf(pdf_path: str) -> List[Dict]:
9687
"""Extract tables from PDF using PyMuPDF first, then camelot-py as fallback.
97-
88+
9889
Args:
9990
pdf_path: Path to the PDF file
100-
91+
10192
Returns:
10293
List of dictionaries containing table data and metadata
10394
"""
10495
tables_data = []
105-
96+
10697
# Try PyMuPDF first
10798
try:
10899
with fitz.open(pdf_path) as doc:
109100
for page_num, page in enumerate(doc, start=1):
110101
tabs = page.find_tables()
111-
102+
112103
if not tabs.tables:
113104
continue
114-
105+
115106
for table_idx, tab in enumerate(tabs.tables, start=1):
116107
try:
117108
table_cells = tab.extract()
118-
109+
119110
if not table_cells or len(table_cells) == 0:
120111
continue
121-
112+
122113
bbox = tab.bbox
123-
114+
124115
table_info = {
125116
"table_id": f"Table_P{page_num}_T{table_idx}",
126117
"page_number": page_num,
127118
"table_index": table_idx,
128-
"bbox": {
129-
"x0": bbox.x0,
130-
"y0": bbox.y0,
131-
"x1": bbox.x1,
132-
"y1": bbox.y1
133-
},
119+
"bbox": {"x0": bbox.x0, "y0": bbox.y0, "x1": bbox.x1, "y1": bbox.y1},
134120
"num_rows": len(table_cells),
135121
"num_cols": len(table_cells[0]) if table_cells else 0,
136122
"cells": table_cells,
137-
"extraction_method": "pymupdf"
123+
"extraction_method": "pymupdf",
138124
}
139-
125+
140126
tables_data.append(table_info)
141-
127+
142128
except Exception as e:
143-
tables_data.append({
144-
"table_id": f"Table_P{page_num}_T{table_idx}",
145-
"page_number": page_num,
146-
"table_index": table_idx,
147-
"error": str(e),
148-
"extraction_method": "pymupdf"
149-
})
150-
129+
tables_data.append({"table_id": f"Table_P{page_num}_T{table_idx}", "page_number": page_num, "table_index": table_idx, "error": str(e), "extraction_method": "pymupdf"})
130+
151131
except Exception as e:
152132
print(f"[ERROR] PyMuPDF table extraction failed: {e}", file=sys.stderr)
153-
133+
154134
# If PyMuPDF found no tables, try camelot
155135
if len(tables_data) == 0:
156136
tables_data = extract_tables_with_camelot(pdf_path)
157-
137+
158138
return tables_data
159139

160140

@@ -180,7 +160,7 @@ def extract_text_from_pdf(pdf_path: str) -> str:
180160
except Exception as e:
181161
print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr)
182162
return ""
183-
163+
184164
return "\n".join(text)
185165

186166

@@ -214,13 +194,10 @@ def save_to_file(text: str, output_path: str):
214194

215195

216196
def main():
217-
parser = argparse.ArgumentParser(
218-
description="Extract text and tables from PDF using PyMuPDF and camelot-py."
219-
)
197+
parser = argparse.ArgumentParser(description="Extract text and tables from PDF using PyMuPDF and camelot-py.")
220198
parser.add_argument("pdf", type=str, help="Path to the input PDF file.")
221-
parser.add_argument("--output-dir", type=str, default="data/processed-text",
222-
help="Output directory for extracted text (default: data/processed-text)")
223-
199+
parser.add_argument("--output-dir", type=str, default="data/processed-text", help="Output directory for extracted text (default: data/processed-text)")
200+
224201
args = parser.parse_args()
225202

226203
pdf_path = Path(args.pdf)
@@ -230,17 +207,17 @@ def main():
230207

231208
# Extract text
232209
text = extract_text_from_pdf(str(pdf_path))
233-
210+
234211
# Extract tables
235212
tables_data = extract_tables_from_pdf(str(pdf_path))
236-
213+
237214
# If tables were found, save JSON
238215
if tables_data:
239-
216+
240217
# Save tables JSON
241218
output_dir = Path(args.output_dir)
242219
output_dir.mkdir(parents=True, exist_ok=True)
243-
220+
244221
tables_json_path = output_dir / (pdf_path.stem + "_tables.json")
245222
with open(tables_json_path, "w", encoding="utf-8") as f:
246223
json.dump(tables_data, f, indent=2)
@@ -251,4 +228,4 @@ def main():
251228

252229

253230
if __name__ == "__main__":
254-
main()
231+
main()

0 commit comments

Comments
 (0)