slm-rag/src/document_loader.py at main · AINexumLab/slm-rag · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from pathlib import Path
from pymupdf4llm import to_markdown
from typing import List, Dict

def load_pdf_document(pdf_path: str, output_path: str) -> List[Dict]:
    """
    Load and extract text from a PDF document, one record per page.

    The PDF is converted with ``to_markdown(..., page_chunks=True)`` and each
    returned chunk is treated as one page. Two things are produced:

    * A Markdown dump written to ``output_path`` (parent directories are
      created if missing). Every chunk is written, including empty ones,
      under a ``## Page <n>`` heading.
    * The returned list, which only contains chunks whose stripped text is
      non-empty.

    Page numbers are 1-based positions in the sequence returned by
    ``to_markdown`` and are the same numbers used in the ``## Page <n>``
    headings, so a returned record can be located in the dump.
    NOTE(review): assumes ``to_markdown`` yields chunks in document order for
    the whole file - confirm if a page subset is ever requested.

    Args:
        pdf_path: Path of the PDF to read; also stored as ``metadata["source"]``.
        output_path: Path of the Markdown file to (over)write.

    Returns:
        A list of ``{"text": ..., "metadata": {...}}`` dicts. ``metadata``
        carries ``source``, ``page_number``, ``chunk_index`` and
        ``section_title``; the last two are left as ``None`` here.
    """
    pages = to_markdown(
        pdf_path,
        page_chunks=True,     # keep page boundaries
        write_images=False    # ignore images for RAG
    )

    documents = []
    markdown_parts = []

    for page_number, page in enumerate(pages, start=1):
        # Tolerate a missing or None "text" entry instead of raising KeyError
        # halfway through writing the dump.
        raw_text = page.get("text") or ""

        # The dump keeps every page (even empty ones) so headings stay aligned
        # with the real page positions.
        markdown_parts.append(f"## Page {page_number}\n\n{raw_text}\n\n")

        text = raw_text.strip()

        # Skip empty / junk pages
        if not text:
            continue

        documents.append({
            "text": text,
            "metadata": {
                "source": pdf_path,
                "page_number": page_number,
                "chunk_index": None,     # filled in chunking.py
                "section_title": None    # can be inferred later
            }
        })

    # Use a separate name rather than rebinding the str-typed parameter.
    out_file = Path(output_path)
    out_file.parent.mkdir(parents=True, exist_ok=True)

    with out_file.open("w", encoding="utf-8") as f:
        f.writelines(markdown_parts)

    return documents