-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocument_loader.py
More file actions
45 lines (37 loc) · 1.22 KB
/
document_loader.py
File metadata and controls
45 lines (37 loc) · 1.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from pathlib import Path
from pymupdf4llm import to_markdown
from typing import List, Dict
def load_pdf_document(pdf_path: str, output_path: str) -> List[Dict]:
    """
    Load and extract text from a PDF document, one entry per page.

    The PDF is converted with ``to_markdown`` in page-chunk mode. Every page
    chunk is dumped to ``output_path`` under a ``## Page N`` heading, and the
    chunks that contain text are returned as document dicts.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Path of the text dump to write. Missing parent
            directories are created and an existing file is overwritten.

    Returns:
        A list with one dict per non-empty page chunk. Each dict has:
            "text": the chunk text with surrounding whitespace stripped.
            "metadata": a dict with "source" (``pdf_path``), "page_number"
                (1-based position of the chunk in the ``to_markdown``
                output, i.e. the same number used in the dump headings),
                "chunk_index" and "section_title" (both ``None`` here).
    """
    pages = to_markdown(
        pdf_path,
        page_chunks=True,  # keep page boundaries
        write_images=False  # ignore images for RAG
    )

    documents = []
    # Number the chunks before filtering so that skipped empty pages do not
    # shift the page numbers of the pages that follow them.
    for page_number, page in enumerate(pages, start=1):
        text = page.get("text", "").strip()
        # Skip empty / junk pages
        if not text:
            continue
        documents.append({
            "text": text,
            "metadata": {
                "source": pdf_path,
                "page_number": page_number,
                "chunk_index": None,  # filled in chunking.py
                "section_title": None  # can be inferred later
            }
        })

    # Use a separate name instead of rebinding the str parameter to a Path.
    dump_path = Path(output_path)
    dump_path.parent.mkdir(parents=True, exist_ok=True)
    with dump_path.open("w", encoding="utf-8") as f:
        # The dump keeps every page (including empty ones) with its raw,
        # unstripped text.
        for page_number, page in enumerate(pages, start=1):
            f.write(f"## Page {page_number}\n\n")
            f.write(page.get("text", ""))
            f.write("\n\n")
    return documents