From 42cd21f6e8a873728dbfe3e68aad0c0dd6b87b54 Mon Sep 17 00:00:00 2001 From: Shane Date: Sat, 5 Apr 2025 22:02:49 -0700 Subject: [PATCH 1/2] cleaned up the code to support different embedding models --- config.example.yaml | 1 + src/innieme/discord_bot_config.py | 8 ++++++ src/innieme/document_processor.py | 46 ++++++++++++------------------- src/innieme/innie.py | 3 +- tests/test_discord_bot.py | 1 + tests/test_discord_bot_config.py | 3 +- 6 files changed, 31 insertions(+), 31 deletions(-) diff --git a/config.example.yaml b/config.example.yaml index 34bb5c8..8692c4b 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -17,6 +17,7 @@ discord_token: discord_bot_token # 5. Copy your API key (you won't be able to see it again!) # 6. Paste your key below (keep it secret!) openai_api_key: openai_api_key +embedding_model: "openai" # Options: "openai", "huggingface" outies: # To get your Discord admin user ID: # 1. Go to Discord and go to User Settings (gear icon) diff --git a/src/innieme/discord_bot_config.py b/src/innieme/discord_bot_config.py index d009ed4..7901e79 100644 --- a/src/innieme/discord_bot_config.py +++ b/src/innieme/discord_bot_config.py @@ -50,6 +50,7 @@ def set_back_references(self): class DiscordBotConfig(BaseModel): discord_token: str openai_api_key: str + embedding_model: str outies: List[OutieConfig] @field_validator('discord_token') @@ -58,6 +59,13 @@ def token_must_not_be_empty(cls, v): raise ValueError('Discord token cannot be empty') return v + @field_validator('embedding_model') + def model_must_be_supported(cls, v): + supported_models = ['openai', 'huggingface', 'fake'] + if v not in supported_models: + raise ValueError(f'Unsupported embedding model: {v}') + return v + @model_validator(mode='after') def set_back_references(self): for outie in self.outies: diff --git a/src/innieme/document_processor.py b/src/innieme/document_processor.py index 8f5294e..aa123a7 100644 --- a/src/innieme/document_processor.py +++ 
b/src/innieme/document_processor.py @@ -1,32 +1,20 @@ import os import glob -import asyncio from typing import List import pypdf import docx -import numpy as np from langchain_community.vectorstores import FAISS from langchain_community.docstore.in_memory import InMemoryDocstore from langchain_community.vectorstores.faiss import FAISS as LangchainFAISS -from langchain_community.embeddings import FakeEmbeddings # Simple in-memory embedding -from langchain_core.embeddings import Embeddings # Base class for embeddings +from langchain_community.embeddings import FakeEmbeddings from langchain.text_splitter import RecursiveCharacterTextSplitter import faiss class DocumentProcessor: - def __init__(self, docs_dir, embedding_type="fake", embedding_config=None): - """ - Initialize document processor with configurable embeddings - - Args: - docs_dir (str): Directory containing documents to process - embedding_type (str): Type of embedding to use ('fake', 'openai', etc.) - embedding_config (dict, optional): Configuration for the embedding - """ + def __init__(self, docs_dir, embedding_config={}): self.docs_dir = docs_dir - self.embedding_type = embedding_type - self.embedding_config = embedding_config or {} + self.embedding_config = embedding_config self.embeddings = self._get_embeddings() self.vectorstore = None self.text_splitter = RecursiveCharacterTextSplitter( @@ -35,22 +23,25 @@ def __init__(self, docs_dir, embedding_type="fake", embedding_config=None): ) def _get_embeddings(self): - """ - Factory method to get embeddings based on configuration - - Returns: - Embeddings: An instance of embeddings - """ - if self.embedding_type == "openai": + embedding_type = self.embedding_config.get("type", "fake") + if embedding_type == "openai": # Only import if needed from langchain_openai import OpenAIEmbeddings - api_key = self.embedding_config.get("api_key", os.getenv("OPENAI_API_KEY")) + api_key = self.embedding_config["api_key"] return OpenAIEmbeddings(api_key=api_key) - elif 
self.embedding_type == "fake": + elif embedding_type == "huggingface": + # Only import if needed + from langchain_huggingface import HuggingFaceEmbeddings + model_name = self.embedding_config.get("model_name", "all-MiniLM-L6-v2") + return HuggingFaceEmbeddings( + model_name=model_name, + cache_folder=os.path.join(self.docs_dir, ".cache", "langchain"), + ) + elif embedding_type == "fake": # Simple embedding for testing return FakeEmbeddings(size=1536) # OpenAI compatible dimension else: - raise ValueError(f"Unsupported embedding type: {self.embedding_type}") + raise ValueError(f"Unsupported embedding type: {embedding_type}") def _create_empty_store(self): """Handle the case where no texts are found to vectorize by creating an empty FAISS index""" @@ -62,7 +53,6 @@ def _create_empty_store(self): docstore=InMemoryDocstore({}), index_to_docstore_id={} ) - async def scan_and_vectorize(self, topic_name:str) -> str: """Scan all documents in the specified directory and create vector embeddings""" @@ -77,13 +67,13 @@ async def scan_and_vectorize(self, topic_name:str) -> str: # Process each file based on its type count = 0 for file_path in files: - print(f"- {file_path}") + print(f" - {file_path}") text = await self._extract_text(file_path) if text: document_texts.append({"text": text, "source": file_path}) count += 1 else: - print(f"Text extraction failed for {file_path}") + print(f" Text extraction failed for {file_path}") print(f"Done. 
Extracted text from {count} documents") # Split texts into chunks diff --git a/src/innieme/innie.py b/src/innieme/innie.py index 6cb242c..2924eba 100644 --- a/src/innieme/innie.py +++ b/src/innieme/innie.py @@ -13,8 +13,7 @@ def __init__(self, outie_config:OutieConfig, api_key:str, config: TopicConfig): # Initialize components self.document_processor = DocumentProcessor( config.docs_dir, - embedding_type="openai", - embedding_config={"api_key": api_key} + embedding_config={"type":outie_config.bot.embedding_model, "api_key": outie_config.bot.openai_api_key} ) self.knowledge_manager = KnowledgeManager() self.conversation_engine = ConversationEngine( diff --git a/tests/test_discord_bot.py b/tests/test_discord_bot.py index 009e4ea..4dcbae6 100644 --- a/tests/test_discord_bot.py +++ b/tests/test_discord_bot.py @@ -12,6 +12,7 @@ bot_config = DiscordBotConfig( discord_token="test_token", openai_api_key="test_key", + embedding_model="fake", outies=[] ) outie_config = OutieConfig( diff --git a/tests/test_discord_bot_config.py b/tests/test_discord_bot_config.py index 826de74..dfcbefb 100644 --- a/tests/test_discord_bot_config.py +++ b/tests/test_discord_bot_config.py @@ -7,7 +7,7 @@ def test_valid_outie_id(): """Test that a positive outie_id is accepted""" # Create a bot config first - bot = DiscordBotConfig(discord_token="test_token", openai_api_key="key", outies=[]) # Add minimal bot config + bot = DiscordBotConfig(discord_token="test_token", openai_api_key="key", embedding_model="huggingface", outies=[]) # Add minimal bot config outie = OutieConfig(outie_id=1, topics=[], bot=bot) # Add bot reference assert outie.outie_id == 1 @@ -35,6 +35,7 @@ def test_config_from_yaml(): yaml_content = f""" discord_token: "test_discord_token" openai_api_key: "test_openai_key" + embedding_model: "openai" outies: - outie_id: 1 topics: From b9c40a230116b243ca26915e7577f2564cd6455f Mon Sep 17 00:00:00 2001 From: Shane Date: Sat, 5 Apr 2025 22:05:48 -0700 Subject: [PATCH 2/2] remove the 
default mutable argument --- src/innieme/document_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/innieme/document_processor.py b/src/innieme/document_processor.py index aa123a7..347e82f 100644 --- a/src/innieme/document_processor.py +++ b/src/innieme/document_processor.py @@ -12,9 +12,9 @@ class DocumentProcessor: - def __init__(self, docs_dir, embedding_config={}): + def __init__(self, docs_dir, embedding_config=None): self.docs_dir = docs_dir - self.embedding_config = embedding_config + self.embedding_config = embedding_config or {} self.embeddings = self._get_embeddings() self.vectorstore = None self.text_splitter = RecursiveCharacterTextSplitter(