extract.py
import os
import re
import tempfile

import streamlit as st
from langchain_community.document_loaders import (
    PyPDFLoader,
    TextLoader,
    UnstructuredWordDocumentLoader,
    UnstructuredExcelLoader,
)
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Sentence-transformers model used for all embeddings.
EMBED_MODEL_NAME = "all-MiniLM-L6-v2"
# ----------------------------
# Load Vector Database
@st.cache_resource
def load_vector_db():
    """
    Initializes and caches the vector database with HuggingFace embeddings.
    Cached with st.cache_resource, so the same Chroma instance is reused
    across Streamlit reruns.
    """
    embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME)
    vectordb = Chroma(embedding_function=embeddings)
    return vectordb
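
# Usage sketch (the query string and `k` below are illustrative, not part of
# this module). No persist_directory is passed above, so the store is
# typically ephemeral and embeddings are lost when the process restarts:
#
#   vectordb = load_vector_db()
#   results = vectordb.similarity_search("example query", k=4)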

# ----------------------------
# Document Processing
def process_pdf(file_path, doc_name, file_ext):
    """
    Loads a document (PDF, TXT, DOCX, or XLS/XLSX) and splits it into
    text chunks, tagging each chunk with the originating file name.
    """
    # Pick a loader based on the file extension.
    if file_ext == ".pdf":
        loader = PyPDFLoader(file_path)
    elif file_ext == ".txt":
        loader = TextLoader(file_path)
    elif file_ext == ".docx":
        loader = UnstructuredWordDocumentLoader(file_path)
    elif file_ext in [".xls", ".xlsx"]:
        loader = UnstructuredExcelLoader(file_path)
    else:
        raise ValueError(f"Unsupported file type: {file_ext}")

    pages = loader.load()

    # ~1000-character chunks with a small overlap, so text that straddles a
    # chunk boundary is not lost entirely.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=20,
        separators=["\n\n", "\n", ".", " "],
    )
    docs = splitter.split_documents(pages)

    # Record the source file so chunks can be filtered or removed later.
    for doc in docs:
        doc.metadata["source"] = doc_name
    return docs
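
# Example (sketch, assuming a local file "notes.pdf" exists):
#
#   chunks = process_pdf("notes.pdf", doc_name="notes.pdf", file_ext=".pdf")
#   print(len(chunks), chunks[0].metadata["source"])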

# ----------------------------
# Embedding Documents
def embed_documents(uploaded_files, vectordb, files_to_remove=None):
    """
    Embeds newly uploaded files and optionally removes old ones from the
    vector DB. Returns the vector DB and the number of files embedded.
    """
    all_docs = []
    new_files = []

    # Delete all chunks whose "source" metadata matches a removed file name.
    # This reaches into Chroma's underlying collection (a private attribute).
    if files_to_remove:
        for file_name in files_to_remove:
            vectordb._collection.delete(where={"source": file_name})
        st.success(f"Removed {len(files_to_remove)} file(s) from the database.")

    for uploaded_file in uploaded_files:
        doc_name = uploaded_file.name
        file_ext = os.path.splitext(doc_name)[1].lower()

        # Streamlit uploads live in memory, but the loaders expect a path on
        # disk, so write the upload to a temporary file first.
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp_file:
            tmp_file.write(uploaded_file.read())
            tmp_path = tmp_file.name

        try:
            docs = process_pdf(tmp_path, doc_name, file_ext)
        finally:
            os.remove(tmp_path)  # delete=False above, so clean up explicitly

        all_docs.extend(docs)
        new_files.append(doc_name)

    if all_docs:
        vectordb.add_documents(all_docs)
    return vectordb, len(new_files)
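
# Sketch of how this is typically wired into a Streamlit page (the widget
# label and accepted types below are assumptions, not part of this module):
#
#   vectordb = load_vector_db()
#   uploads = st.file_uploader(
#       "Upload documents",
#       type=["pdf", "txt", "docx", "xls", "xlsx"],
#       accept_multiple_files=True,
#   )
#   if uploads:
#       vectordb, n = embed_documents(uploads, vectordb)
#       st.success(f"Embedded {n} new file(s).")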

# ----------------------------
# Text Filtering
def has_overlap(question: str, content: str, min_overlap: int = 3) -> bool:
    """
    Checks whether the content shares at least `min_overlap` distinct words
    with the question (case-insensitive, whole words only).
    """
    words_q = set(re.findall(r"\w+", question.lower()))
    words_c = set(re.findall(r"\w+", content.lower()))
    return len(words_q & words_c) >= min_overlap
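
# Worked example: for the question "What is the refund policy?" and the chunk
# "The refund policy lasts 30 days", the shared words are
# {"the", "refund", "policy"} -- 3 matches, so the default threshold of 3 is
# met and has_overlap returns True. Note that stop words such as "the" count
# toward the overlap, so low thresholds can admit loosely related chunks.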

def filter_relevant_chunks(question, documents, min_overlap=3):
    """
    Filters document chunks by word overlap with the question, skipping
    exact-duplicate chunks. Returns a list of (content, source) tuples.
    """
    seen_chunks = set()
    filtered = []
    for doc in documents:
        content = doc.page_content.strip()
        # Keep each chunk at most once, and only if it shares enough words
        # with the question.
        if content not in seen_chunks and has_overlap(question, content, min_overlap):
            seen_chunks.add(content)
            source = doc.metadata.get("source", "Unknown")
            filtered.append((content, source))
    return filtered

def build_context(filtered_chunks):
    """
    Builds a single context string by joining the filtered chunks with
    blank lines.
    """
    return "\n\n".join(chunk for chunk, _ in filtered_chunks)