-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: main.py
More file actions
325 lines (263 loc) · 10.7 KB
/
main.py
File metadata and controls
325 lines (263 loc) · 10.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
"""
Main Application - CLI interface for the research engine
"""
import sys
from pathlib import Path
from loguru import logger
from utils import load_config, setup_logging, ensure_directories
from llm.ollama_client import OllamaClient
from llm.answer_generator import AnswerGenerator
from ingest.chunker import SemanticChunker
from ingest.document_processor import DocumentProcessor
from index.vector_store import VectorStore
from index.keyword_index import KeywordIndex
from index.knowledge_graph import KnowledgeGraph
from retrieval.hybrid_search import HybridRetriever
from retrieval.reranker import LLMReranker
from retrieval.citation_mapper import CitationMapper
class ResearchEngine:
    """Main research engine orchestrator.

    Wires together the ingestion, indexing, retrieval, and answer-generation
    components described in a YAML configuration file, and exposes the three
    top-level operations: ingest_documents(), query(), interactive_mode().
    """

    def __init__(self, config_path: str = "config.yaml"):
        """
        Initialize research engine.

        Args:
            config_path: Path to configuration file

        Side effects:
            Configures logging, creates configured directories, and exits the
            process (sys.exit(1)) if the Ollama setup cannot be verified.
        """
        # Load configuration and prepare the runtime environment.
        self.config = load_config(config_path)
        setup_logging(self.config)
        ensure_directories(self.config)

        logger.info("=" * 60)
        logger.info("Local AI Research Engine")
        logger.info("=" * 60)

        # Initialize Ollama client
        ollama_config = self.config["ollama"]
        self.ollama_client = OllamaClient(
            base_url=ollama_config["base_url"],
            llm_model=ollama_config["llm_model"],
            embedding_model=ollama_config["embedding_model"],
            timeout=ollama_config["timeout"],
            max_retries=ollama_config["max_retries"],
        )

        # Verify the Ollama daemon and required models BEFORE building the
        # rest of the pipeline, so failures surface early with actionable
        # instructions instead of deep in a later call.
        logger.info("Verifying Ollama setup...")
        verification = self.ollama_client.verify_setup()
        if not all(verification.values()):
            logger.error("Ollama setup verification failed!")
            logger.error("Please ensure:")
            logger.error("1. Ollama is running")
            logger.error("2. Required models are pulled:")
            logger.error(f" - ollama pull {ollama_config['llm_model']}")
            logger.error(f" - ollama pull {ollama_config['embedding_model']}")
            sys.exit(1)

        # Initialize components
        logger.info("Initializing components...")

        # Ingestion: semantic chunking + per-format document parsing.
        ingest_config = self.config["ingestion"]
        self.chunker = SemanticChunker(
            ollama_client=self.ollama_client,
            target_chunk_size=ingest_config["chunk_size"],
            max_chunk_size=ingest_config["max_chunk_size"],
            overlap=ingest_config["chunk_overlap"],
        )
        self.document_processor = DocumentProcessor(
            chunker=self.chunker,
            supported_formats=ingest_config["supported_formats"],
        )

        # Indexing: dense vectors, BM25-style keyword index, knowledge graph.
        vector_config = self.config["vector_store"]
        self.vector_store = VectorStore(
            persist_directory=vector_config["persist_directory"],
            collection_name=vector_config["collection_name"],
            distance_metric=vector_config["distance_metric"],
        )

        keyword_config = self.config["keyword_index"]
        self.keyword_index = KeywordIndex(
            persist_path=keyword_config["persist_path"],
            k1=keyword_config["k1"],
            b=keyword_config["b"],
        )
        # Try to load a previously persisted keyword index, if one exists.
        self.keyword_index.load()

        graph_config = self.config["knowledge_graph"]
        self.knowledge_graph = KnowledgeGraph(
            db_path=graph_config["db_path"],
            ollama_client=self.ollama_client,
        )

        # Retrieval: hybrid search over all three indexes, then LLM reranking
        # and citation bookkeeping.
        retrieval_config = self.config["retrieval"]
        self.retriever = HybridRetriever(
            vector_store=self.vector_store,
            keyword_index=self.keyword_index,
            knowledge_graph=self.knowledge_graph,
            ollama_client=self.ollama_client,
        )
        self.reranker = LLMReranker(
            ollama_client=self.ollama_client,
        )
        self.citation_mapper = CitationMapper(
            citation_format=self.config["answer_generation"]["citation_format"],
        )

        # Answer generation
        answer_config = self.config["answer_generation"]
        self.answer_generator = AnswerGenerator(
            ollama_client=self.ollama_client,
            temperature=answer_config["temperature"],
            max_tokens=answer_config["max_tokens"],
            citation_format=answer_config["citation_format"],
        )

        # Advanced features are imported lazily here to keep module import
        # cheap when only the core pipeline is needed.
        from advanced.paper_comparator import PaperComparator
        from advanced.contradiction_detector import ContradictionDetector
        from advanced.literature_review import LiteratureReviewGenerator
        from advanced.export_manager import ExportManager

        self.paper_comparator = PaperComparator(self.ollama_client)
        self.contradiction_detector = ContradictionDetector(self.ollama_client)
        self.lit_reviewer = LiteratureReviewGenerator(self.ollama_client)
        self.exporter = ExportManager()

        logger.info("✓ All components initialized successfully")

    def ingest_documents(self, directory: "str | None" = None) -> None:
        """
        Ingest documents from directory.

        Processes every supported file, embeds the resulting chunks, and adds
        them to the vector store, keyword index, and knowledge graph.

        Args:
            directory: Directory path (defaults to config's paths.documents)
        """
        directory = directory or self.config["paths"]["documents"]
        logger.info(f"Ingesting documents from: {directory}")

        # Process documents into {file_path: [chunk, ...]} mappings.
        results = self.document_processor.process_directory(
            directory=directory,
            recursive=True,
            extract_entities=True,
        )
        if not results:
            logger.warning("No documents found to ingest")
            return

        # Flatten chunks across all files into a single list.
        all_chunks = []
        for file_path, chunks in results.items():
            all_chunks.extend(chunks)
        logger.info(f"Processing {len(all_chunks)} chunks...")

        # Generate embeddings in small batches to bound request size.
        logger.info("Generating embeddings...")
        texts = [chunk["content"] for chunk in all_chunks]
        embeddings = self.ollama_client.embed_batch(texts, batch_size=10)

        # Add to vector store
        logger.info("Adding to vector store...")
        doc_ids = self.vector_store.add_documents(all_chunks, embeddings)

        # Build and persist the keyword index.
        logger.info("Building keyword index...")
        self.keyword_index.build_index(all_chunks, doc_ids)
        self.keyword_index.save()

        # Build and persist the knowledge graph, chunk by chunk.
        logger.info("Building knowledge graph...")
        for chunk in all_chunks:
            self.knowledge_graph.extract_and_add_from_chunk(chunk)
        self.knowledge_graph.save()

        logger.info("✓ Ingestion complete!")
        logger.info(f" - {len(all_chunks)} chunks indexed")
        logger.info(f" - {self.knowledge_graph.graph.number_of_nodes()} entities")
        logger.info(f" - {self.knowledge_graph.graph.number_of_edges()} relationships")

    def query(self, question: str) -> dict:
        """
        Answer a question.

        Args:
            question: User's question

        Returns:
            Answer dictionary. Always contains at least "answer", "citations",
            "confidence", and "num_sources" so callers (e.g. interactive_mode)
            can read those keys unconditionally.
        """
        logger.info(f"\nQuery: {question}")

        # Retrieve candidate chunks via hybrid (vector + keyword + graph) search.
        retrieval_config = self.config["retrieval"]
        results = self.retriever.retrieve(
            query=question,
            vector_top_k=retrieval_config["vector_top_k"],
            keyword_top_k=retrieval_config["keyword_top_k"],
            graph_expansion=True,
            final_top_k=retrieval_config["final_top_k"],
        )
        if not results:
            # BUGFIX: include "num_sources" here — interactive_mode() prints
            # result['num_sources'] unconditionally and previously raised
            # KeyError whenever retrieval came back empty.
            return {
                "answer": "No relevant information found in the knowledge base.",
                "citations": [],
                "confidence": 0.0,
                "num_sources": 0,
            }

        # Rerank the candidates with the LLM and keep the top slice.
        reranked = self.reranker.rerank(
            query=question,
            results=results,
            top_k=retrieval_config["rerank_top_k"],
        )

        # Register citations so the answer can reference numbered sources.
        evidence = self.citation_mapper.register_sources(reranked)

        # Generate answer
        answer_data = self.answer_generator.generate_answer(
            question=question,
            evidence=evidence,
        )
        return answer_data

    def interactive_mode(self) -> None:
        """Run interactive query loop until 'quit'/'exit'/'q' or Ctrl-C."""
        logger.info("\n" + "=" * 60)
        logger.info("Interactive Mode - Type 'quit' to exit")
        logger.info("=" * 60 + "\n")

        while True:
            try:
                question = input("\n🔍 Question: ").strip()
                if not question:
                    continue
                if question.lower() in ['quit', 'exit', 'q']:
                    logger.info("Goodbye!")
                    break

                # Get answer
                result = self.query(question)

                # Display
                print("\n" + "=" * 60)
                print("📝 Answer:")
                print("=" * 60)
                print(result["answer"])
                print("\n" + "-" * 60)
                print(f"Confidence: {result['confidence']:.2f}")
                print(f"Sources: {result['num_sources']}")
                print(f"Citations: {len(result['citations'])}")
                if result["citations"]:
                    print("\n📚 Citations:")
                    for i, citation in enumerate(result["citations"], 1):
                        print(f" [{i}] {citation['source_name']} §{citation['section']}")
            except KeyboardInterrupt:
                print("\n\nGoodbye!")
                break
            except Exception as e:
                # Broad catch is deliberate at this REPL boundary: one failed
                # query must not kill the interactive session.
                logger.error(f"Error: {e}")
def main():
    """CLI entry point: parse command-line flags and dispatch to the engine."""
    import argparse

    cli = argparse.ArgumentParser(description="Local AI Research Engine")
    cli.add_argument(
        "--ingest",
        action="store_true",
        help="Ingest documents from the documents directory",
    )
    cli.add_argument(
        "--query",
        type=str,
        help="Single query mode",
    )
    cli.add_argument(
        "--config",
        type=str,
        default="config.yaml",
        help="Path to config file",
    )
    opts = cli.parse_args()

    # Build the engine once, then run exactly one mode:
    # ingest > single query > interactive loop.
    engine = ResearchEngine(config_path=opts.config)

    if opts.ingest:
        engine.ingest_documents()
    elif opts.query:
        response = engine.query(opts.query)
        print("\n" + response["answer"])
    else:
        engine.interactive_mode()


if __name__ == "__main__":
    main()