Skip to content

Commit 7e77260

Browse files
committed
Refactored ingestion
1 parent 621d6e2 commit 7e77260

77 files changed

Lines changed: 5563 additions & 870 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

README.md

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ nodes = h_parser.get_nodes(documents)
113113

114114
```python
115115
# Add nodes to index (embeddings are auto-generated!)
116-
doc_ids = await index.add_nodes(chunks, show_progress=True)
116+
doc_ids = await index.add_nodes(DocumentNode, chunks, show_progress=True)
117117
print(f"Indexed {len(doc_ids)} chunks")
118118
```
119119

@@ -384,7 +384,7 @@ from fetchcraft.index.vector_index import VectorIndex
384384
index = VectorIndex(vector_store=vector_store, index_id="my-index")
385385

386386
# Add documents (embeddings auto-generated)
387-
ids = await index.add_nodes(chunks)
387+
ids = await index.add_nodes(DocumentNode, chunks)
388388

389389
# Search by text
390390
results = await index.search_by_text("query", k=5)
@@ -408,7 +408,7 @@ from fetchcraft.node import SymNode
408408

409409
# Create hierarchical parser
410410
parser = HierarchicalNodeParser(
411-
chunk_size=2048, # Parent chunk size
411+
chunk_size=2048, # Parent chunk size
412412
overlap=100,
413413
child_sizes=[512, 128], # Create 2 levels of children
414414
child_overlap=20
@@ -418,7 +418,7 @@ parser = HierarchicalNodeParser(
418418
nodes = parser.get_nodes(documents)
419419

420420
# Index all nodes (parents and children)
421-
await index.add_nodes(nodes)
421+
await index.add_nodes(DocumentNode, nodes)
422422

423423
# Retrieve with parent resolution
424424
retriever = index.as_retriever(top_k=5, resolve_parents=True)
@@ -482,8 +482,8 @@ support_index = VectorIndex(
482482
)
483483

484484
# Each index operates independently
485-
await tech_docs_index.add_nodes(tech_chunks)
486-
await marketing_index.add_nodes(marketing_chunks)
485+
await tech_docs_index.add_nodes(DocumentNode, tech_chunks)
486+
await marketing_index.add_nodes(DocumentNode, marketing_chunks)
487487

488488
# Searches are automatically isolated to each index
489489
tech_results = await tech_docs_index.search_by_text("query", k=5)
@@ -536,7 +536,7 @@ async def build_rag_index():
536536
chunks = parser.get_nodes(documents)
537537

538538
# Step 4: Index chunks (embeddings auto-generated!)
539-
document_ids = await index.add_nodes(chunks, show_progress=True)
539+
document_ids = await index.add_nodes(DocumentNode, chunks, show_progress=True)
540540

541541
print(f"✓ Indexed {len(document_ids)} chunks")
542542
return index

docs/CHROMA_VECTOR_STORE.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ vector_index = VectorIndex(
7272
)
7373

7474
# Add documents
75-
await vector_index.add_nodes(chunks)
75+
await vector_index.add_nodes(DocumentNode, chunks)
7676

7777
# Search
7878
results = await vector_index.search_by_text("your query", k=5)
@@ -220,7 +220,7 @@ parser = TextFileDocumentParser(chunker=chunker)
220220
nodes = parser.parse_directory("docs/", pattern="*.md", recursive=True)
221221

222222
# Index all nodes (parents + children)
223-
await vector_index.add_nodes(nodes)
223+
await vector_index.add_nodes(DocumentNode, nodes)
224224

225225
# Search with parent resolution
226226
retriever = vector_index.as_retriever(top_k=5, resolve_parents=True)

docs/DOCUMENT_STORE_USAGE.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ chunk_nodes = nodes[1:]
176176
await doc_store.add_document(doc_node)
177177

178178
# 4. Store all nodes (including chunks) in vector store for search
179-
await index.add_nodes(nodes)
179+
await index.add_nodes(DocumentNode, nodes)
180180

181181
# 5. Search using vectors
182182
results = await index.search_by_text("What is the main topic?", k=5)

docs/HYBRID_SEARCH.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ index = VectorIndex(vector_store=vector_store)
4747

4848
# Index documents (automatically generates both dense and sparse vectors)
4949
nodes = [Node(text="Your document text here")]
50-
await index.add_nodes(nodes)
50+
await index.add_nodes(DocumentNode, nodes)
5151

5252
# Search (automatically uses hybrid search)
5353
results = await index.search_by_text("your query", k=5)

docs/HYBRID_SEARCH_QUICK_START.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ vector_store = QdrantVectorStore(
2020

2121
# 3. Use normally
2222
index = VectorIndex(vector_store=vector_store)
23-
await index.add_nodes([Node(text="Your docs")])
23+
await index.add_nodes(DocumentNode, [Node(text="Your docs")])
2424
results = await index.search_by_text("query", k=5)
2525
```
2626

fetchcraft-demos/fetchcraft-gradio-chatbot/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "fetchcraft-gradio-chatbot"
3-
version = "0.2.0"
3+
version = "0.3.0"
44
description = "Fetchcraft demo using a simple Gradio UI"
55
requires-python = ">=3.12,<3.14"
66
dependencies = [

fetchcraft-demos/fetchcraft-gradio-chatbot/src/fetchcraft/demos/gradio/app.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ async def load_and_index_documents(
145145
logger.info(f"Created {len(all_chunks)} chunks")
146146

147147
logger.info(f"Indexing {len(all_chunks)} chunks with hybrid search...")
148-
await vector_index.add_nodes(all_chunks, show_progress=True)
148+
await vector_index.add_nodes(DocumentNode, all_chunks, show_progress=True)
149149

150150
logger.info(f"Successfully indexed {len(all_chunks)} chunks!")
151151
return len(all_chunks)

fetchcraft-demos/fetchcraft-hybrid-search/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "fetchcraft-hybrid-search"
3-
version = "0.2.0"
3+
version = "0.3.0"
44
description = "Fetchcraft demo using Hybrid Search"
55
requires-python = ">=3.12,<3.14"
66
dependencies = [

fetchcraft-demos/fetchcraft-hybrid-search/src/fetchcraft/demos/hybrid/main.py

Lines changed: 30 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
"""
1919

2020
import asyncio
21-
import os
2221
import sys
2322
from pathlib import Path
2423

@@ -33,31 +32,10 @@
3332
from fetchcraft.node_parser import HierarchicalNodeParser, SimpleNodeParser
3433
from fetchcraft.vector_store import QdrantVectorStore
3534

36-
# Configuration
37-
QDRANT_HOST = "localhost"
38-
QDRANT_PORT = 6333
39-
COLLECTION_NAME = "fetchcraft_chatbot" # Different collection for hybrid search
40-
DOCUMENTS_PATH = Path(os.getenv("DOCUMENTS_PATH", "Documents"))
35+
from fetchcraft.demos.hybrid.settings import Settings
4136

42-
# Embeddings configuration (adjust based on your setup)
43-
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "bge-m3")
44-
EMBEDDING_API_KEY = os.getenv("OPENAI_API_KEY", "sk-321")
45-
EMBEDDING_BASE_URL = os.getenv("EMBEDDING_BASE_URL", None) # None = use OpenAI default
46-
INDEX_ID = "docs-index"
47-
48-
# LLM configuration for the agent
49-
LLM_MODEL = os.getenv("LLM_MODEL", "gpt-4-turbo")
50-
LLM_API_KEY = os.getenv("OPENAI_API_KEY", "sk-123")
51-
52-
# Chunking configuration
53-
CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "8192"))
54-
CHILD_SIZES = [4096, 1024]
55-
CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "200"))
56-
USE_HIERARCHICAL_CHUNKING = os.getenv("USE_HIERARCHICAL_CHUNKING", "true").lower() == "true"
57-
58-
# 🔥 HYBRID SEARCH CONFIGURATION
59-
ENABLE_HYBRID = os.getenv("ENABLE_HYBRID", "true").lower() == "true"
60-
FUSION_METHOD = os.getenv("FUSION_METHOD", "rrf") # "rrf" or "dbsf"
37+
# Initialize settings
38+
settings = Settings()
6139

6240

6341
def collection_exists(client: QdrantClient, collection_name: str) -> bool:
@@ -157,7 +135,7 @@ async def load_and_index_documents(
157135
print(f" 🔍 Sparse vector (keyword matching)")
158136

159137
# Index all chunks (embeddings will be generated automatically)
160-
await vector_index.add_nodes(all_chunks, show_progress=True)
138+
await vector_index.add_nodes(DocumentNode, all_chunks, show_progress=True)
161139

162140
print(f"✅ Successfully indexed {len(all_chunks)} chunks with hybrid search!")
163141
return len(all_chunks)
@@ -180,64 +158,52 @@ async def setup_rag_system():
180158
# Initialize embeddings
181159
print("\n1️⃣ Initializing embeddings...")
182160
embeddings = OpenAIEmbeddings(
183-
model=EMBEDDING_MODEL,
184-
api_key=EMBEDDING_API_KEY,
185-
base_url=EMBEDDING_BASE_URL
161+
model=settings.embedding_model,
162+
api_key=settings.openai_api_key,
163+
base_url=settings.embedding_base_url
186164
)
187165

188166

189167
# Connect to Qdrant
190-
print(f"\n2️⃣ Connecting to Qdrant at {QDRANT_HOST}:{QDRANT_PORT}...")
191-
client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
168+
print(f"\n2️⃣ Connecting to Qdrant at {settings.qdrant_host}:{settings.qdrant_port}...")
169+
client = QdrantClient(host=settings.qdrant_host, port=settings.qdrant_port)
192170
client.get_collections() # Test connection
193171
print(f" ✓ Connected to Qdrant")
194172

195173
# Check if collection exists
196-
print(f"\n3️⃣ Checking collection '{COLLECTION_NAME}'...")
197-
needs_indexing = not collection_exists(client, COLLECTION_NAME)
198-
199-
if needs_indexing:
200-
print(f" ⚠️ Collection '{COLLECTION_NAME}' does not exist - will create and index")
201-
else:
202-
print(f" ✓ Collection '{COLLECTION_NAME}' already exists - skipping indexing")
174+
print(f"\n3️⃣ Checking collection '{settings.collection_name}'...")
175+
needs_indexing = not collection_exists(client, settings.collection_name)
203176

204177
# Create vector store with HYBRID SEARCH enabled
205178
print(f"\n🔥 Creating vector store with HYBRID SEARCH...")
206-
print(f" • Enable Hybrid: {ENABLE_HYBRID}")
207-
print(f" • Fusion Method: {FUSION_METHOD.upper()}")
208-
209-
try:
210-
vector_store = QdrantVectorStore(
211-
client=client,
212-
collection_name=COLLECTION_NAME,
213-
embeddings=embeddings,
214-
distance="Cosine",
215-
enable_hybrid=ENABLE_HYBRID, # 🔥 Enable hybrid search
216-
fusion_method=FUSION_METHOD # Choose RRF or DBSF
217-
)
218-
print(f" ✓ Vector store created with hybrid search enabled!")
219-
except ImportError as e:
220-
print(f"\n❌ Error: {e}")
221-
print("\n💡 Hybrid search requires fastembed:")
222-
print(" pip install fastembed")
223-
sys.exit(1)
224-
179+
print(f" • Enable Hybrid: {settings.enable_hybrid}")
180+
print(f" • Fusion Method: {settings.fusion_method.upper()}")
181+
182+
vector_store = QdrantVectorStore(
183+
client=client,
184+
collection_name=settings.collection_name,
185+
embeddings=embeddings,
186+
distance="Cosine",
187+
enable_hybrid=settings.enable_hybrid, # 🔥 Enable hybrid search
188+
fusion_method=settings.fusion_method # Choose RRF or DBSF
189+
)
190+
225191
# Create vector index with a consistent index_id
226192
vector_index = VectorIndex(
227193
vector_store=vector_store,
228-
index_id=INDEX_ID
194+
index_id=settings.index_id
229195
)
230196
needs_indexing = False
231197
# Index documents if needed
232198
if needs_indexing:
233199
print(f"\n4️⃣ Indexing documents with hybrid search...")
234200
num_chunks = await load_and_index_documents(
235201
vector_index=vector_index,
236-
documents_path=DOCUMENTS_PATH,
237-
chunk_size=CHUNK_SIZE,
238-
child_sizes=CHILD_SIZES,
239-
overlap=CHUNK_OVERLAP,
240-
use_hierarchical=USE_HIERARCHICAL_CHUNKING
202+
documents_path=Path(settings.documents_path),
203+
chunk_size=settings.chunk_size,
204+
child_sizes=settings.child_sizes,
205+
overlap=settings.chunk_overlap,
206+
use_hierarchical=settings.use_hierarchical_chunking
241207
)
242208
if num_chunks == 0:
243209
print("\n⚠️ Warning: No documents were indexed!")
@@ -256,7 +222,7 @@ async def setup_rag_system():
256222
tools = [Tool(tool_func, takes_ctx=True, max_retries=3)]
257223

258224
agent = PydanticAgent.create(
259-
model=LLM_MODEL,
225+
model=settings.llm_model,
260226
tools=tools,
261227
retries=3
262228
)
@@ -331,33 +297,6 @@ async def repl_loop(agent: PydanticAgent):
331297
print("─" * 70)
332298

333299

334-
def print_error_hints(error: Exception):
335-
"""Print helpful hints based on the error type."""
336-
error_msg = str(error).lower()
337-
338-
if "fastembed" in error_msg:
339-
print("\n💡 FastEmbed Missing:")
340-
print(" - Hybrid search requires fastembed")
341-
print(" - Install with: pip install fastembed")
342-
elif "api key" in error_msg or "authentication" in error_msg:
343-
print("\n💡 API Key Issue:")
344-
print(" - Set OPENAI_API_KEY environment variable")
345-
print(" - Or configure EMBEDDING_BASE_URL for a custom endpoint")
346-
elif "connection" in error_msg or "refused" in error_msg or "qdrant" in error_msg:
347-
print("\n💡 Connection Issue:")
348-
print(" - Make sure Qdrant is running on localhost:6333")
349-
print(" - Start with: docker run -p 6333:6333 qdrant/qdrant")
350-
elif "not found" in error_msg or "no such file" in error_msg:
351-
print("\n💡 File Path Issue:")
352-
print(f" - Check that {DOCUMENTS_PATH} exists")
353-
print(" - Make sure it contains .txt files")
354-
elif "pydantic" in error_msg or "import" in error_msg:
355-
print("\n💡 Dependency Issue:")
356-
print(" - Install required packages: pip install pydantic-ai qdrant-client openai fastembed")
357-
else:
358-
print("\n💡 For more help, check the README.md file")
359-
360-
361300
async def main():
362301
"""Main entry point for the hybrid search demo."""
363302
try:
@@ -371,7 +310,6 @@ async def main():
371310
print("\n\n👋 Demo interrupted. Goodbye!")
372311
except Exception as e:
373312
print(f"\n❌ Error: {e}")
374-
print_error_hints(e)
375313
import traceback
376314
traceback.print_exc()
377315
sys.exit(1)

fetchcraft-demos/fetchcraft-hybrid-search/src/fetchcraft/demos/hybrid/settings.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
from pydantic_settings import BaseSettings, SettingsConfigDict
2+
3+
4+
class Settings(BaseSettings):
5+
"""Configuration settings for the hybrid search demo."""
6+
7+
model_config = SettingsConfigDict(
8+
env_file=".env",
9+
env_file_encoding="utf-8",
10+
extra="ignore"
11+
)
12+
13+
# Qdrant configuration
14+
qdrant_host: str = "localhost"
15+
qdrant_port: int = 6333
16+
collection_name: str = "fetchcraft_chatbot"
17+
18+
# Documents configuration
19+
documents_path: str = "Documents"
20+
21+
# Embeddings configuration
22+
embedding_model: str = "bge-m3"
23+
openai_api_key: str = "sk-321"
24+
embedding_base_url: str | None = None
25+
26+
# Index configuration
27+
index_id: str = "docs-index"
28+
29+
# LLM configuration
30+
llm_model: str = "gpt-4-turbo"
31+
32+
# Chunking configuration
33+
chunk_size: int = 8192
34+
child_sizes: list[int] = [4096, 1024]
35+
chunk_overlap: int = 200
36+
use_hierarchical_chunking: bool = True
37+
38+
# Hybrid search configuration
39+
enable_hybrid: bool = True
40+
fusion_method: str = "rrf"

0 commit comments

Comments
 (0)