-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
145 lines (111 loc) · 3.93 KB
/
main.py
File metadata and controls
145 lines (111 loc) · 3.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import json
import os
from time import sleep
from flask import Flask, request, jsonify
import pdfplumber
import vertexai
from vertexai.generative_models import GenerativeModel

# Flask application exposing the PDF entity-extraction endpoint below.
app = Flask(__name__)

# NB: if running locally may need to add an api_key
# Initialise the Vertex AI SDK for this project/region; must run before
# the GenerativeModel below is constructed or called.
vertexai.init(project="pdf-reader-438611", location="europe-west1")
# System prompt for the Gemini model: asks for biological/medical entities
# as a JSON object with a single "entities" list.  The two few-shot
# examples carry distinct, self-consistent character offsets so the model
# does not copy a fixed start/end pair into every answer.
instructions = """
Extract the biological entities from the text below.
Return the list of entities.
The output must be in JSON.
{"entities": [
  {
    "entity": ...,
    "context": ...,
    "start": ...,
    "end": ...
  }
]}
Where:
"entity": Name of the identified medical entity,
"context": Text surrounding the entity for clarity. Make sure to keep whole words. This should be as it is in the text,
"start": The start position of the entity in the context with respect to the original text,
"end": The end position of the entity in the context with respect to the original text,
For example
{"entities": [
  {
    "entity": "COVID-19",
    "context": "... was observed in patients with COVID-19",
    "start": 119,
    "end": 127
  },
  {
    "entity": "ERK1",
    "context": "... elevated levels of ERK1 were seen in patients with COVID-19",
    "start": 241,
    "end": 245
  }
]}
"""
# Gemini model handle, shared by all requests: forced to emit JSON via
# response_mime_type and primed with the extraction prompt in
# `instructions` as its system instruction.
model = GenerativeModel(
    model_name="gemini-1.5-flash-002",
    generation_config={"response_mime_type": "application/json"},
    system_instruction=instructions,
)
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extracts text from a PDF file using pdfplumber.

    Args:
        pdf_path (str): The file path to the PDF.

    Returns:
        str: The extracted text, pages concatenated in order.  Pages with
        no extractable text (extract_text() returns None) contribute "".
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Build the result with a single join instead of repeated += —
        # avoids the quadratic-copy pattern on documents with many pages.
        return "".join(page.extract_text() or "" for page in pdf.pages)
def split_text_into_chunks(text: str, max_length: int = 512) -> list:
    """
    Break *text* into consecutive slices of at most *max_length* characters.

    Args:
        text (str): The input text to be split.
        max_length (int): Maximum length for each chunk of text.

    Returns:
        list: The chunks in document order; empty input yields an empty list.
        Only the final chunk may be shorter than *max_length*.
    """
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start : start + max_length])
        start += max_length
    return chunks
def perform_ner(text: str) -> list:
    """
    Performs NER on the input text using the Vertex AI model.

    Args:
        text (str): The input text for which NER will be performed.

    Returns:
        list: Entity dicts accumulated across all chunks.  A chunk whose
        model call fails or whose response cannot be parsed is skipped so
        one bad chunk does not abort the whole document.
    """
    entities = []
    chunks = split_text_into_chunks(text)
    for i, chunk in enumerate(chunks):
        try:
            response = model.generate_content(chunk)
            entities_in_chunk = json.loads(response.text)
            # extend() mutates in place; rebuilding the list each chunk
            # ([*entities, *new]) copies everything and is O(n^2) overall.
            entities.extend(entities_in_chunk["entities"])
        except Exception as e:
            # Broad catch is deliberate (API errors, malformed JSON,
            # missing "entities" key): best-effort, keep going.
            print("Error extracting entities", e)
        # Hit Gemini limit on free plan! - throttle between requests,
        # but don't waste a second after the final chunk.
        if i < len(chunks) - 1:
            sleep(1)
    return entities
@app.route("/api/v1/extract", methods=["POST"])
def upload_pdf():
    """
    Accept a PDF upload in the "file" form field, run NER over its text,
    and return {"entities": [...]} as JSON.

    Responses:
        200 with the entity list on success,
        400 when the file part is missing or empty,
        415 when the upload is not a PDF.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file part"}), 400
    file = request.files["file"]
    if file.filename == "":
        return jsonify({"error": "No selected file"}), 400
    # Case-insensitive suffix check so "REPORT.PDF" is accepted too.
    if not file.filename.lower().endswith(".pdf"):
        return jsonify({"error": "File format not supported, please upload a PDF"}), 415
    # basename() strips any directory components from the client-supplied
    # filename, so a crafted name like "../../etc/x.pdf" cannot escape /tmp.
    pdf_path = os.path.join("/tmp", os.path.basename(file.filename))
    file.save(pdf_path)
    try:
        extracted_text = extract_text_from_pdf(pdf_path)
        entities = perform_ner(extracted_text)
    finally:
        # Clean up the temp file even if extraction or NER raises.
        os.remove(pdf_path)
    return jsonify({"entities": entities}), 200
if __name__ == "__main__":
    # NOTE(review): debug=True enables the Werkzeug debugger/reloader —
    # fine for local development, must not be enabled in production.
    app.run(debug=True)