# solution_1a.py
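#
# Extracts a document title and a heading outline from each PDF in ./input and
# writes one JSON file per document to ./output.
# Requires the third-party pdfplumber package (e.g. pip install pdfplumber).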
import pdfplumber
import os
import json
import re
from collections import Counter, defaultdict


def clean_and_normalize_text(text):
    """Cleans text, normalizes whitespace, and handles common unicode artifacts."""
    if not text:
        return ""
    # Collapse runs of three or more identical characters left by garbled extraction
    cleaned = re.sub(r'(.)\1{2,}', r'\1', text)
    # Standardize all forms of whitespace to a single space
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    # Fix common OCR/extraction errors for this document
    replacements = {
        'Sumary': 'Summary', 'Aces': 'Access', 'Apendix': 'Appendix',
        'Busines': 'Business', 'Planing': 'Planning', 'Comite': 'Committee',
        'Stering': 'Steering', 'Aproach': 'Approach'
    }
    for wrong, right in replacements.items():
        cleaned = cleaned.replace(wrong, right)
    return cleaned
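
# A quick sanity check of the cleaner (hypothetical garbled input):
#   >>> clean_and_normalize_text("Exxxecutive   Sumary")
#   'Executive Summary'
# The run-collapsing regex folds "xxx" down to "x", whitespace is normalized,
# and the replacement table fixes "Sumary". The run-collapsing step is
# deliberately aggressive and can alter rare legitimate words with triple letters.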


def find_common_styles_and_noise(pdf):
    """Analyzes the document to find the body text size and recurring header/footer text."""
    sizes = Counter()
    line_counts = Counter()
    # Analyze a sample of pages (skipping the cover) to determine common styles
    page_sample = pdf.pages[1:5] if len(pdf.pages) > 5 else pdf.pages
    for page in page_sample:
        words = page.extract_words(extra_attrs=["size"])
        if words:
            sizes.update(round(w['size'], 1) for w in words)
    # Scan all pages for headers and footers
    for page in pdf.pages:
        # Define tight bounding boxes for the header and footer strips
        header_box = (0, 0, page.width, page.height * 0.10)
        footer_box = (0, page.height * 0.92, page.width, page.height)
        # Use a more tolerant text extraction setting for these noisy areas
        header_text = page.crop(header_box).extract_text(x_tolerance=3, y_tolerance=3)
        footer_text = page.crop(footer_box).extract_text(x_tolerance=3, y_tolerance=3)
        for line in (header_text or "").split('\n') + (footer_text or "").split('\n'):
            # Match the core part of the recurring text, ignoring trailing page numbers
            match = re.match(r'^(.*\D)\s*\d*\s*$', line.strip())
            if match:
                cleaned_line = clean_and_normalize_text(match.group(1))
                if cleaned_line and len(cleaned_line) > 10:
                    line_counts[cleaned_line] += 1
    body_size = sizes.most_common(1)[0][0] if sizes else 10.0
    # A line counts as noise if it recurs on at least max(3, 30% of the pages) pages
    noise_threshold = max(3, len(pdf.pages) * 0.3)
    noise_texts = {text for text, count in line_counts.items() if count >= noise_threshold}
    return body_size, noise_texts
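
# Worked example of the noise threshold: for a 20-page document,
# max(3, 20 * 0.3) = 6, so a header/footer line must recur on at least six
# pages to be dropped as noise; for a 5-page document the floor of 3 applies.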


def get_title_and_cover_elements(page, body_size):
    """Extracts the multi-line title and identifies all text on the cover page to be ignored."""
    words = page.extract_words(extra_attrs=["size"])
    if not words:
        return "", set()
    max_size = max(w['size'] for w in words)
    # The title consists of all text blocks with a font size close to the maximum
    title_words = [w for w in words if w['size'] >= max_size * 0.75]
    title_words.sort(key=lambda w: (w['top'], w['x0']))
    # Reconstruct lines based on vertical position
    lines = defaultdict(list)
    for word in title_words:
        lines[word['top']].append(word)
    full_title_parts = []
    for top in sorted(lines.keys()):
        line_text = ' '.join(word['text'] for word in sorted(lines[top], key=lambda w: w['x0']))
        full_title_parts.append(line_text)
    raw_title = ' '.join(full_title_parts)
    title = clean_and_normalize_text(raw_title)
    # Any text on the first page larger than the body text is treated as a cover element
    cover_page_elements = {clean_and_normalize_text(w['text']) for w in words if w['size'] > body_size * 1.1}
    return title, cover_page_elements
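
# Example of the title heuristic (hypothetical cover page): if the largest font
# on page 1 is 28pt, every word of at least 21pt (28 * 0.75) is collected, so a
# title split across lines at slightly different sizes is reassembled in reading order.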


def is_likely_heading(line, body_size, noise_texts):
    """Determines if a line of text is a heading using a strict set of rules."""
    text = clean_and_normalize_text(line['text'])
    # --- Strict filtering ---
    if not text or text in noise_texts or text.isdigit():
        return False
    if len(text.split()) > 10 or (text.endswith('.') and len(text.split()) > 6):
        return False
    if re.match(r"^\s*(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}\.?\s*$", text):
        return False
    char = line['chars'][0]
    font_size = char['size']
    is_bold = "bold" in char['fontname'].lower() or "black" in char['fontname'].lower()
    is_larger = font_size > body_size * 1.05
    is_all_caps = text.isupper() and len(text.split()) > 1 and not any(c.isdigit() for c in text)
    is_numbered = bool(re.match(r"^\d+(\.\d+)*\s*|^[IVX]+\.\s*|^Appendix\s+[A-Z]", text))
    return is_larger or is_bold or is_all_caps or is_numbered
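
# Lines that would pass these rules (illustrative, not from a real document):
# "2.1 Funding Approach" (numbered) or "TIMELINE AND MILESTONES" (multi-word,
# all caps), plus anything bold or set above ~1.05x the body size. Anything
# longer than ten words is rejected outright before the style checks run.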


def main():
    """Main execution block."""
    input_dir = "input"
    output_dir = "output"
    os.makedirs(output_dir, exist_ok=True)
    for filename in os.listdir(input_dir):
        if not filename.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(input_dir, filename)
        output_path = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}.json")
        print(f"Processing {pdf_path}...")
        try:
            with pdfplumber.open(pdf_path) as pdf:
                if not pdf.pages:
                    structured_data = {"title": "Empty Document", "outline": []}
                else:
                    body_size, noise_texts = find_common_styles_and_noise(pdf)
                    title, cover_page_elements = get_title_and_cover_elements(pdf.pages[0], body_size)
                    headings = []
                    for i, page in enumerate(pdf.pages):
                        # Use a more tolerant text extraction strategy for the main content
                        lines = page.extract_text_lines(layout=True, strip=True, x_tolerance=3, y_tolerance=3)
                        for line in lines:
                            if not line['chars']:
                                continue
                            line_text_cleaned = clean_and_normalize_text(line['text'])
                            # Two crucial checks: is it a heading, and is it not a cover-page element?
                            if is_likely_heading(line, body_size, noise_texts) and line_text_cleaned not in cover_page_elements:
                                headings.append({
                                    "text": line_text_cleaned,
                                    "size": round(line['chars'][0]['size'], 1),
                                    "page": i + 1
                                })
                    # Deduplicate headings case-insensitively, keeping the first occurrence
                    unique_headings = []
                    seen_texts = set()
                    for h in headings:
                        if h['text'].lower() not in seen_texts:
                            unique_headings.append(h)
                            seen_texts.add(h['text'].lower())
                    if unique_headings:
                        # Map the four largest heading sizes to levels H1-H4
                        heading_sizes = sorted(set(h["size"] for h in unique_headings), reverse=True)
                        size_map = {size: f"H{i+1}" for i, size in enumerate(heading_sizes[:4])}
                        outline = []
                        for h in unique_headings:
                            level = size_map.get(h["size"], "H4")
                            # Drop headings noticeably smaller than the body text
                            if h['size'] >= body_size * 0.95:
                                outline.append({"level": level, "text": h["text"], "page": h["page"]})
                    else:
                        outline = []
                    structured_data = {"title": title, "outline": outline}
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(structured_data, f, ensure_ascii=False, indent=4)
            print(f"Successfully generated {output_path}")
        except Exception as e:
            print(f"!!! Error processing {pdf_path}: {e}")


if __name__ == "__main__":
    main()
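
# Usage sketch (assuming the input/output layout above): drop one or more PDFs
# into ./input and run `python solution_1a.py`. Each document produces a JSON
# file in ./output shaped like:
#   {"title": "...", "outline": [{"level": "H1", "text": "...", "page": 1}, ...]}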