From 4a71acbf767c7179667635f311a2cf7f70c2dc47 Mon Sep 17 00:00:00 2001 From: Shane Date: Mon, 14 Apr 2025 21:12:58 -0700 Subject: [PATCH] introduced factories for embeddings and evctor stores for ease of testing and experimenting --- requirements.txt | 42 ++++++------ src/innieme/discord_bot.py | 2 +- src/innieme/document_processor.py | 83 ++++++++++++------------ src/innieme/embeddings_factory.py | 37 +++++++++++ src/innieme/innie.py | 41 ++++++++++-- src/innieme/vector_store_factory.py | 41 ++++++++++++ tests/test_document_processor.py | 99 ++++++++++++++++++++++++++--- 7 files changed, 264 insertions(+), 81 deletions(-) create mode 100644 src/innieme/embeddings_factory.py create mode 100644 src/innieme/vector_store_factory.py diff --git a/requirements.txt b/requirements.txt index a3de4e2..c3a19ee 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,28 +1,26 @@ -# Discord API -discord.py>=2.3.0 +# Discord Integration +discord.py -# Environment variables -python-dotenv>=1.0.0 +# Environment Configuration +python-dotenv -# Document processing -pypdf>=3.15.1 -python-docx>=0.8.11 -langchain>=0.0.267 -langchain-community>=0.0.10 -langchain-openai>=0.0.5 +# Document Processing & LLM +chromadb +langchain +langchain-community +langchain-chroma +langchain_huggingface +langchain-openai +pypdf +python-docx -# Vector database -faiss-cpu>=1.7.4 +# Vector Database +faiss-cpu -# OpenAI embeddings -openai>=1.0.0 - -# Async support -asyncio>=3.4.3 +# AI/ML Dependencies +numpy +openai # Testing -pytest>=7.4.0 -pytest-asyncio>=0.21.0 - -# Utilities -numpy>=1.24.0 +pytest +pytest-asyncio diff --git a/src/innieme/discord_bot.py b/src/innieme/discord_bot.py index bbb4343..b7be0e4 100644 --- a/src/innieme/discord_bot.py +++ b/src/innieme/discord_bot.py @@ -230,7 +230,7 @@ async def on_message(self, message:Message): await self.respond(message, "Sorry I am not set up to support a topic in this channel.") return - logger.info(f"On message, located topic: {topic.config.name}") + logger.debug(f"On message, located topic: {topic.config.name}") outie_id = topic.outie_config.outie_id # Check if message is in a thread diff --git a/src/innieme/document_processor.py b/src/innieme/document_processor.py index df25d67..b63c154 100644 --- a/src/innieme/document_processor.py +++ b/src/innieme/document_processor.py @@ -1,65 +1,53 @@ -from langchain_community.vectorstores import FAISS -from langchain_community.docstore.in_memory import InMemoryDocstore -from langchain_community.vectorstores.faiss import FAISS as LangchainFAISS -from langchain_community.embeddings import FakeEmbeddings +from .embeddings_factory import EmbeddingsFactory +from .vector_store_factory import VectorStoreFactory + from langchain.text_splitter import RecursiveCharacterTextSplitter -import faiss import pypdf import docx import glob +from pydantic import SecretStr -from typing import List, Dict +from typing import List, Dict, Optional, Union +from langchain.embeddings.base import Embeddings import logging import os +import time logger = logging.getLogger(__name__) class DocumentProcessor: - def __init__(self, docs_dir, embedding_config:Dict[str, str]=None): + def __init__(self, + topic: str, + docs_dir: str, + embeddings_factory: EmbeddingsFactory, + vector_store_factory: VectorStoreFactory): self.docs_dir = docs_dir - self.embedding_config = embedding_config or {} - self.embeddings = self._get_embeddings() - self.vectorstore = None + self.topic = topic + self.embeddings_factory = embeddings_factory + self.vector_store_factory = vector_store_factory self.text_splitter = RecursiveCharacterTextSplitter( chunk_size=1000, chunk_overlap=200 ) - - def _get_embeddings(self): - embedding_type = self.embedding_config.get("type", "fake") - if embedding_type == "openai": - # Only import if needed - from langchain_openai import OpenAIEmbeddings - api_key = self.embedding_config.get("api_key") - return OpenAIEmbeddings(api_key=api_key) - elif embedding_type == "huggingface": - # Only import if needed - from langchain_huggingface import HuggingFaceEmbeddings - model_name = self.embedding_config.get("model_name", "all-MiniLM-L6-v2") - return HuggingFaceEmbeddings( - model_name=model_name, - cache_folder=os.path.join(self.docs_dir, ".cache", "langchain"), - ) - elif embedding_type == "fake": - # Simple embedding for testing - return FakeEmbeddings(size=1536) # OpenAI compatible dimension - else: - raise ValueError(f"Unsupported embedding type: {embedding_type}") def _create_empty_store(self): - """Handle the case where no texts are found to vectorize by creating an empty FAISS index""" - dimension = 1536 # Same as OpenAI embeddings dimension - # Create empty FAISS instance - return LangchainFAISS( - embedding_function=self.embeddings, - index=faiss.IndexFlatL2(dimension), - docstore=InMemoryDocstore({}), - index_to_docstore_id={} + """Handle the case where no texts are found to vectorize""" + collection_name = self._get_collection_name() + return self.vector_store_factory.create_empty_store( + collection_name=collection_name, + embeddings=self.embeddings_factory.create_embeddings() ) - async def scan_and_vectorize(self, topic_name:str) -> str: + def _get_collection_name(self) -> str: + """Create a unique collection name using topic and timestamp""" + # Clean topic name to be filesystem safe + safe_topic = "".join(c if c.isalnum() else "_" for c in self.topic) + timestamp = int(time.time() * 1000) # Milliseconds since epoch + return f"{safe_topic}_{timestamp}" + + async def scan_and_vectorize(self) -> str: """Scan all documents in the specified directory and create vector embeddings""" document_texts = [] @@ -68,7 +56,7 @@ async def scan_and_vectorize(self, topic_name:str) -> str: for ext in ['*.pdf', '*.docx', '*.txt', '*.md']: files.extend(glob.glob(os.path.join(self.docs_dir, '**', ext), recursive=True)) - logger.info(f"For {topic_name}: Found {len(files)} documents to process under {self.docs_dir}...") + logger.info(f"For {self.topic}: Found {len(files)} documents to process under {self.docs_dir}...") # Process each file based on its type count = 0 for file_path in files: @@ -89,15 +77,22 @@ async def scan_and_vectorize(self, topic_name:str) -> str: # Create vector store texts = [chunk["text"] for chunk in all_chunks] + + collection_name = self._get_collection_name() response = "" if not texts: self.vectorstore = self._create_empty_store() - response = f"On topic '{topic_name}': no documents found to process" + response = f"On topic '{self.topic}': no documents found to process" else: metadatas = [{"source": chunk["source"]} for chunk in all_chunks] - self.vectorstore = FAISS.from_texts(texts, self.embeddings, metadatas=metadatas) - response = f"On topic '{topic_name}': {len(all_chunks)} chunks created from {count} out of {len(files)} references" + self.vectorstore = self.vector_store_factory.create_from_texts( + texts, + self.embeddings_factory.create_embeddings(), + collection_name=collection_name, + metadatas=metadatas + ) + response = f"On topic '{self.topic}': {len(all_chunks)} chunks created from {count} out of {len(files)} references" return response async def _extract_text(self, file_path): diff --git a/src/innieme/embeddings_factory.py b/src/innieme/embeddings_factory.py new file mode 100644 index 0000000..5c9efe2 --- /dev/null +++ b/src/innieme/embeddings_factory.py @@ -0,0 +1,37 @@ +from abc import ABC, abstractmethod +from pydantic import SecretStr +from langchain.embeddings.base import Embeddings +from langchain_openai import OpenAIEmbeddings +from langchain_huggingface import HuggingFaceEmbeddings + +class EmbeddingsFactory(ABC): + """Abstract factory interface for creating embeddings""" + @abstractmethod + def create_embeddings(self) -> Embeddings: + """Create and return an embeddings instance""" + pass + +class OpenAIEmbeddingsFactory(EmbeddingsFactory): + def __init__(self, api_key: str): + self.api_key = api_key + + def create_embeddings(self) -> Embeddings: + return OpenAIEmbeddings(api_key=SecretStr(self.api_key)) + +class HuggingFaceEmbeddingsFactory(EmbeddingsFactory): + def __init__(self, cache_dir: str, model_name: str = "all-MiniLM-L6-v2"): + self.model_name = model_name + self.cache_dir = cache_dir + + def create_embeddings(self) -> Embeddings: + return HuggingFaceEmbeddings( + model_name=self.model_name, + cache_folder=self.cache_dir + ) + +class ExistingEmbeddingsFactory(EmbeddingsFactory): + def __init__(self, embeddings: Embeddings): + self.embeddings = embeddings + + def create_embeddings(self) -> Embeddings: + return self.embeddings \ No newline at end of file diff --git a/src/innieme/innie.py b/src/innieme/innie.py index 7d1cdb9..fd5935c 100644 --- a/src/innieme/innie.py +++ b/src/innieme/innie.py @@ -1,19 +1,35 @@ -from dataclasses import dataclass -from typing import List -from functools import wraps +from .embeddings_factory import EmbeddingsFactory, OpenAIEmbeddingsFactory, HuggingFaceEmbeddingsFactory, ExistingEmbeddingsFactory +from .vector_store_factory import ChromaVectorStoreFactory, FAISSVectorStoreFactory from .document_processor import DocumentProcessor from .knowledge_manager import KnowledgeManager from .conversation_engine import ConversationEngine from .discord_bot_config import OutieConfig, TopicConfig +from langchain_community.embeddings import FakeEmbeddings + +import os + +from dataclasses import dataclass +from typing import Dict +from functools import wraps + class Topic: def __init__(self, outie_config:OutieConfig, api_key:str, config: TopicConfig): self.config = config self.outie_config = outie_config # Initialize components self.document_processor = DocumentProcessor( + self.config.name, config.docs_dir, - embedding_config={"type":outie_config.bot.embedding_model, "api_key": outie_config.bot.openai_api_key} + self._create_embeddings_from_config( + { + "type":outie_config.bot.embedding_model, + "api_key": outie_config.bot.openai_api_key, + "cache_dir": os.path.join(config.docs_dir, ".cache", "langchain") + } + ), + ChromaVectorStoreFactory() +# FAISSVectorStoreFactory() ) self.knowledge_manager = KnowledgeManager() self.active_threads = set() @@ -24,6 +40,21 @@ def __init__(self, outie_config:OutieConfig, api_key:str, config: TopicConfig): self.knowledge_manager ) + def _create_embeddings_from_config(self, config: Dict[str, str]) -> EmbeddingsFactory: + embedding_type = config.get("type", "") + if embedding_type == "openai": + api_key = config['api_key'] + return OpenAIEmbeddingsFactory(api_key) + elif embedding_type == "huggingface": + return HuggingFaceEmbeddingsFactory( + cache_dir=config['cache_dir'], + model_name=config.get("model_name", "all-MiniLM-L6-v2") + ) + elif embedding_type == "fake": + return ExistingEmbeddingsFactory(FakeEmbeddings(size=1536)) + else: + raise ValueError(f"Unsupported embedding type: {embedding_type}") + def is_following_thread(self, thread_id:int) -> bool: return thread_id in self.active_threads @@ -32,7 +63,7 @@ async def process_query(self, thread_id: int, query: str, context_messages: list return await self.conversation_engine.process_query(query, context_messages) async def scan_and_vectorize(self) -> str: - return await self.document_processor.scan_and_vectorize(self.config.name) + return await self.document_processor.scan_and_vectorize() async def generate_summary(self, thread_id) -> str: return await self.knowledge_manager.generate_summary(thread_id) diff --git a/src/innieme/vector_store_factory.py b/src/innieme/vector_store_factory.py new file mode 100644 index 0000000..c602ace --- /dev/null +++ b/src/innieme/vector_store_factory.py @@ -0,0 +1,41 @@ +from langchain.vectorstores.base import VectorStore +from langchain.embeddings.base import Embeddings +from langchain_chroma.vectorstores import Chroma +from langchain_community.vectorstores import FAISS + +from abc import ABC, abstractmethod +from typing import List, Dict, Optional + +class VectorStoreFactory(ABC): + """Abstract factory interface for creating vector stores""" + @abstractmethod + def create_empty_store(self, collection_name: str, embeddings: Embeddings) -> VectorStore: + """Create an empty vector store""" + pass + + @abstractmethod + def create_from_texts(self, texts: List[str], embeddings: Embeddings, collection_name: str, metadatas: Optional[List[Dict]] = None) -> VectorStore: + """Create a vector store from texts""" + pass + +class ChromaVectorStoreFactory(VectorStoreFactory): + def create_empty_store(self, collection_name: str, embeddings: Embeddings) -> VectorStore: + return Chroma( + collection_name=collection_name, + embedding_function=embeddings, + ) + + def create_from_texts(self, texts: List[str], embeddings: Embeddings, collection_name: str, metadatas: Optional[List[Dict]] = None) -> VectorStore: + return Chroma.from_texts( + texts, + embeddings, + collection_name=collection_name, + metadatas=metadatas + ) + +class FAISSVectorStoreFactory(VectorStoreFactory): + def create_empty_store(self, collection_name: str, embeddings: Embeddings) -> VectorStore: + return FAISS.from_texts([], embeddings) + + def create_from_texts(self, texts: List[str], embeddings: Embeddings, collection_name: str, metadatas: Optional[List[Dict]] = None) -> VectorStore: + return FAISS.from_texts(texts, embeddings, metadatas=metadatas) \ No newline at end of file diff --git a/tests/test_document_processor.py b/tests/test_document_processor.py index df8b21e..dd568a1 100644 --- a/tests/test_document_processor.py +++ b/tests/test_document_processor.py @@ -1,8 +1,31 @@ import pytest from innieme.document_processor import DocumentProcessor +from innieme.vector_store_factory import ChromaVectorStoreFactory +from innieme.embeddings_factory import ExistingEmbeddingsFactory +from langchain_core.embeddings import Embeddings +import numpy as np # Test data directory TEST_DOCS_DIR = "test_documents" +TEST_DOCS_2_DIR = "test_documents_2" + +class FakeEmbeddings(Embeddings): + """Fake embeddings for testing""" + def embed_documents(self, texts: list[str]) -> list[list[float]]: + # Return consistent fake embeddings based on text content + return [self._get_fake_embedding(text) for text in texts] + + def embed_query(self, text: str) -> list[float]: + return self._get_fake_embedding(text) + + def _get_fake_embedding(self, text: str) -> list[float]: + # Generate deterministic fake embeddings + if "cars" in text.lower(): + return [1.0, 0.0, 0.0] + elif "plants" in text.lower(): + return [0.0, 1.0, 0.0] + else: + return [0.0, 0.0, 1.0] @pytest.fixture def test_docs_dir(tmp_path): @@ -11,6 +34,13 @@ def test_docs_dir(tmp_path): docs_dir.mkdir() return docs_dir +@pytest.fixture +def test_docs_2_dir(tmp_path): + """Create a second temporary directory for test documents""" + docs_2_dir = tmp_path / TEST_DOCS_2_DIR + docs_2_dir.mkdir() + return docs_2_dir + @pytest.fixture def sample_txt_file(test_docs_dir): """Create a sample text file for testing""" @@ -21,7 +51,12 @@ def sample_txt_file(test_docs_dir): @pytest.fixture def document_processor(test_docs_dir) -> DocumentProcessor: """Create a DocumentProcessor instance for testing""" - return DocumentProcessor(str(test_docs_dir)) + return DocumentProcessor( + "testing", + str(test_docs_dir), + ExistingEmbeddingsFactory(FakeEmbeddings()), + ChromaVectorStoreFactory() + ) @pytest.mark.asyncio async def test_extract_from_txt(document_processor, sample_txt_file): @@ -34,13 +69,9 @@ async def test_extract_from_txt(document_processor, sample_txt_file): @pytest.mark.asyncio async def test_scan_and_vectorize_empty_dir(document_processor): """Test scanning an empty directory""" - result = await document_processor.scan_and_vectorize(topic_name="test_topic") - assert result == "On topic 'test_topic': no documents found to process" + result = await document_processor.scan_and_vectorize() + assert result == "On topic 'testing': no documents found to process" assert document_processor.vectorstore is not None - -@pytest.mark.asyncio -async def test_search_documents_empty_vectorstore(document_processor): - """Test searching with empty vectorstore""" results = await document_processor.search_documents("test query") assert results == [] @@ -48,9 +79,59 @@ async def test_search_documents_empty_vectorstore(document_processor): async def test_search_documents_with_data(document_processor, sample_txt_file): """Test searching after processing documents""" # First scan and vectorize - await document_processor.scan_and_vectorize(topic_name="test_topic") + await document_processor.scan_and_vectorize() # Then search results = await document_processor.search_documents("test document") assert len(results) > 0 - assert "test document" in results[0].page_content.lower() \ No newline at end of file + assert "test document" in results[0].page_content.lower() + +@pytest.mark.asyncio +async def test_independent_vectorstores_different_topics(test_docs_dir, test_docs_2_dir): + """Test that document processors with different topics maintain separate vectorstores""" + # Create test files for different topics + topic1_file = test_docs_dir / "cars.txt" + topic2_file = test_docs_2_dir / "plants.txt" + topic1_file.write_text("This is a document about cars and vehicles") + topic2_file.write_text("This is a document about plants and gardens") + + fake_embeddings1 = FakeEmbeddings() + fake_embeddings2 = FakeEmbeddings() + + # Create processors for different topics with fake embeddings + cars_processor = DocumentProcessor( + "cars", + str(test_docs_dir), + ExistingEmbeddingsFactory(fake_embeddings1), + ChromaVectorStoreFactory() + ) + plants_processor = DocumentProcessor( + "plants", + str(test_docs_2_dir), + ExistingEmbeddingsFactory(fake_embeddings2), + ChromaVectorStoreFactory() + ) + + # Process documents under different topics + await cars_processor.scan_and_vectorize() + await plants_processor.scan_and_vectorize() + + # Search for cars in cars processor - should find just one result + car_results = await cars_processor.search_documents("cars") + assert len(car_results) == 1 + assert car_results[0].page_content == "This is a document about cars and vehicles" + assert car_results[0].metadata["source"].endswith("cars.txt") + + # Search for plants in cars processor - should find at most 1 result + plant_results_in_cars = await cars_processor.search_documents("plants") + assert len(plant_results_in_cars) < 2 + + # Search for plants in plants processor - should find just one result + plant_results = await plants_processor.search_documents("plants") + assert len(plant_results) == 1 + assert plant_results[0].page_content == "This is a document about plants and gardens" + assert plant_results[0].metadata["source"].endswith("plants.txt") + + # Search for cars in plants processor - should find at most 1 result + car_results_in_plants = await plants_processor.search_documents("cars") + assert len(car_results_in_plants) < 2 \ No newline at end of file