-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathmulti-model.py
More file actions
187 lines (164 loc) · 7.8 KB
/
multi-model.py
File metadata and controls
187 lines (164 loc) · 7.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import os
import pathlib
import re
import shutil

import google.generativeai as genai
import streamlit as st
from dotenv import load_dotenv
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from PyPDF2 import PdfReader
from transformers import pipeline
# Load environment variables from .env, then configure the Google Generative AI
# client with the API key. The Hugging Face token is kept for the QA pipeline.
# (The original had a bare `os.getenv("GOOGLE_API_KEY")` whose result was
# discarded — dead code, removed.)
load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
def load_css(file_path):
    """Inject the stylesheet at *file_path* into the Streamlit page.

    The file is opened with an explicit UTF-8 encoding so the CSS is decoded
    identically on every platform (the original relied on the locale default,
    which breaks on Windows for non-ASCII content).
    """
    with open(file_path, encoding="utf-8") as f:
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every uploaded PDF.

    Args:
        pdf_docs: iterable of file-like objects accepted by ``PyPDF2.PdfReader``.

    Returns:
        One string holding the text of all pages, in upload/page order.
    """
    text = ""
    for pdf in pdf_docs:
        for page in PdfReader(pdf).pages:
            # extract_text() returns None/"" for pages with no extractable
            # text (e.g. scanned images); the original `text += None` raised
            # TypeError on such pages.
            text += page.extract_text() or ""
    return text
def chonky(text):
    """Split raw document text into overlapping chunks for embedding.

    Chunks are ~1000 characters, overlapping by 200, split on newlines.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)
def get_vectorstore(text_chunks):
    """Embed the text chunks with Gemini embeddings and persist a FAISS index.

    The index is written to the local "faiss_index" directory; nothing is
    returned.
    """
    gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    index = FAISS.from_texts(text_chunks, embedding=gemini_embeddings)
    index.save_local("faiss_index")
def get_doc_vectorstore():
    """Load the persisted document index, or return None if it was never built."""
    if os.path.exists("faiss_index"):
        gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        return FAISS.load_local(
            "faiss_index", gemini_embeddings, allow_dangerous_deserialization=True
        )
    return None
def get_query_vectorstore():
    """Load the persisted query-history index, or return None if absent."""
    if os.path.exists("query_index"):
        gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        return FAISS.load_local(
            "query_index", gemini_embeddings, allow_dangerous_deserialization=True
        )
    return None
def save_query_embedding(query):
    """Append *query* to the on-disk query-history FAISS index.

    Creates the index from this single query when it does not exist yet,
    otherwise adds the query to the existing index, then persists it.
    """
    gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    store = get_query_vectorstore()
    if store is None:
        store = FAISS.from_texts([query], embedding=gemini_embeddings)
    else:
        store.add_texts([query])
    store.save_local("query_index")
def get_huggingface_qa_model():
    """Build the Hugging Face question-answering pipeline used as the
    alternative backend to Gemini.

    NOTE(review): tiiuae/falcon-7b-instruct is a causal text-generation model,
    not an extractive-QA architecture — loading it under the
    "question-answering" task is likely to fail or misbehave; verify the
    task/model pairing against the transformers pipeline docs.
    """
    return pipeline("question-answering", model="tiiuae/falcon-7b-instruct", token=HUGGINGFACE_TOKEN)
def get_conversational_chain(model_type="google"):
    """Return the answering backend for *model_type*.

    "huggingface" yields the HF QA pipeline; anything else yields a Gemini
    chat model wrapped in a stuff-documents chain with the session prompt.
    """
    if model_type == "huggingface":
        return get_huggingface_qa_model()
    gemini = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0.7)
    qa_prompt = ChatPromptTemplate.from_template("""
Try using Context or the Past Queries sent by User in this session for finding an answer, but if the answer is not available in the context, reply with "Not enough information is available in the documents provided, but I can get an answer based on the Internet knowledge."
Context:
{context}
Past Queries sent by User in this session:
{query}
Question:
{question}
""")
    return create_stuff_documents_chain(gemini, qa_prompt)
def user_input(user_question, model_type="google"):
    """Answer *user_question* using the stored document and query indexes.

    Retrieves similar document chunks and past queries, runs the selected
    backend, records the question in the query-history index, and returns the
    response text.
    """
    doc_store = get_doc_vectorstore()
    history_store = get_query_vectorstore()
    related_docs = doc_store.similarity_search(user_question) if doc_store else []
    past_queries = history_store.similarity_search(user_question) if history_store else []
    chain = get_conversational_chain(model_type)
    if model_type == "huggingface":
        # The HF pipeline wants a flat context string and returns a dict.
        if related_docs:
            context = " ".join(doc.page_content for doc in related_docs)
        else:
            context = "No context available"
        response = chain(question=user_question, context=context)["answer"]
    else:
        response = chain.invoke(
            {"context": related_docs, "query": past_queries, "question": user_question}
        )
    # Remember this question for future "past queries" retrieval.
    save_query_embedding(user_question)
    return response
def delete_faiss_index():
    """Delete both on-disk indexes ("faiss_index" and "query_index").

    Reports success/absence through the Streamlit UI. Uses shutil.rmtree
    instead of the original hand-rolled os.walk teardown, which could not
    handle unexpected directory contents and duplicated delete_query_index.
    """
    if os.path.exists("faiss_index") or os.path.exists("query_index"):
        # ignore_errors covers the case where only query_index exists.
        shutil.rmtree("faiss_index", ignore_errors=True)
        delete_query_index()
        st.success("Cleaned up the cache")
    else:
        st.warning("Cache file doesn't exist")
def delete_query_index():
    """Remove the on-disk query-history index ("query_index"), if present.

    A no-op when the directory does not exist. shutil.rmtree replaces the
    original manual bottom-up os.walk + rmdir loop with the stdlib idiom.
    """
    if os.path.exists("query_index"):
        shutil.rmtree("query_index")
# Main app: page config, styling, and fresh-session state.
st.set_page_config(page_title="PAQ Bot", page_icon="🤖")
css_path = pathlib.Path("style.css")
load_css(css_path)
if "messages" not in st.session_state:
    st.session_state["messages"] = [
        {"role": "assistant", "content": "How can I help you?"}
    ]
    # Fresh session: drop any query history left over from a previous run.
    delete_query_index()
# if "chat_history" not in st.session_state:
#     st.session_state["chat_history"] = []
st.header("PAQ Bot", divider="red")
# The intro text itself contains single quotes, so the literal is split into
# double-quoted pieces — the original single-quoted string with unescaped
# 'Reset Bot Memory' was a SyntaxError.
st.markdown(
    '<div class="intro">Welcome to PAQ Bot! This bot can help you with your '
    "queries based on the documents you provide. Upload your PDF documents "
    "and ask your queries. The bot will try to answer your queries based on "
    "the content of the documents. Use the 'Reset Bot Memory' button to clear "
    "the cache and 'Stop App button' to stop the app.</div>",
    unsafe_allow_html=True,
)
# Replay the conversation recorded in session state.
for message in st.session_state["messages"]:
    with st.chat_message(message["role"]):
        st.write(message["content"])
# Sidebar: model selection, PDF ingestion, and maintenance buttons.
with st.sidebar:
    st.header("PAQ Bot", divider="red")
    # Human-readable choice mapped to the backend key consumed by user_input().
    model_type = st.selectbox("Choose Model", ["Google Gemini", "Hugging Face (tiiuae/falcon-7b-instruct)"])
    model_key = "google" if model_type == "Google Gemini" else "huggingface"
    st.subheader("Upload PDF Documents")
    pdf_docs = st.file_uploader("Pick a pdf file", type="pdf", accept_multiple_files=True)
    if pdf_docs and st.button("Process Documents", key="green"):
        with st.spinner("Processing", show_time=True):
            # Extract -> chunk -> embed/persist the FAISS document index.
            raw_text = get_pdf_text(pdf_docs)
            text_chunks = chonky(raw_text)
            # NOTE(review): get_vectorstore returns None; `vector_store` is
            # never used afterwards.
            vector_store = get_vectorstore(text_chunks)
            st.markdown('<div class="donepdf">Done</div>', unsafe_allow_html=True)
    if not pdf_docs:
        st.markdown('<div class="uppdf">Please upload a PDF file to start</div>', unsafe_allow_html=True)
    st.markdown('<div class="blanki"></div>', unsafe_allow_html=True)
    st.markdown('<div class="luvacm">Made with ❤️ by PEC ACM </div>', unsafe_allow_html=True)
    st.link_button("View the source code", "https://github.com/Ya-Tin/PDFQueryChatLM.git")
    # Deletes both FAISS indexes and reports in the UI.
    if st.button("Reset Bot Memory", key="red"):
        delete_faiss_index()
    # Hard-stops the whole Streamlit process after clearing query history.
    if st.button("Stop App", key="red2"):
        delete_query_index()
        os._exit(0)
# Chat input box: st.chat_input submits on Enter (there is no separate
# "Process Query" button, which the original placeholder wrongly referenced).
user_question = st.chat_input("Input your query here and press Enter")
if user_question:
    # Record and show the user's message before generating a reply.
    st.session_state["messages"].append({"role": "user", "content": user_question})
    st.chat_message("user").markdown(user_question)
    with st.spinner("Generating response...", show_time=True):
        response = user_input(user_question, model_type=model_key)
    # Strip the chain's canned fallback sentence. A literal replace is used:
    # the original passed this sentence to re.sub with its '.' characters
    # unescaped, so they matched any character.
    fallback_sentence = (
        "Not enough information is available in the documents provided, "
        "but I can get an answer based on the Internet knowledge."
    )
    response = response.replace(fallback_sentence, "")
    st.session_state["messages"].append({"role": "assistant", "content": response})
    st.chat_message("assistant").markdown(response)