Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions config.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ discord_token: discord_bot_token
# 5. Copy your API key (you won't be able to see it again!)
# 6. Paste your key below (keep it secret!)
openai_api_key: openai_api_key
embedding_model: "openai" # Options: "openai", "huggingface" (a "fake" backend is also accepted, for testing only)
outies:
# To get your Discord admin user ID:
# 1. Go to Discord and go to User Settings (gear icon)
Expand Down
8 changes: 8 additions & 0 deletions src/innieme/discord_bot_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def set_back_references(self):
class DiscordBotConfig(BaseModel):
discord_token: str
openai_api_key: str
embedding_model: str
outies: List[OutieConfig]

@field_validator('discord_token')
Expand All @@ -58,6 +59,13 @@ def token_must_not_be_empty(cls, v):
raise ValueError('Discord token cannot be empty')
return v

@field_validator('embedding_model')
def model_must_be_supported(cls, v):
    """Validate that the configured embedding model is one we support.

    Raises:
        ValueError: if ``v`` is not a supported embedding backend.
    """
    supported_models = ['openai', 'huggingface', 'fake']
    if v not in supported_models:
        # Include the valid options so a misconfigured YAML is easy to fix.
        raise ValueError(
            f'Unsupported embedding model: {v!r}; '
            f'expected one of {supported_models}'
        )
    return v

@model_validator(mode='after')
def set_back_references(self):
for outie in self.outies:
Expand Down
44 changes: 17 additions & 27 deletions src/innieme/document_processor.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,19 @@
import os
import glob
import asyncio
from typing import List
import pypdf
import docx
import numpy as np
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores.faiss import FAISS as LangchainFAISS
from langchain_community.embeddings import FakeEmbeddings # Simple in-memory embedding
from langchain_core.embeddings import Embeddings # Base class for embeddings
from langchain_community.embeddings import FakeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import faiss


class DocumentProcessor:
def __init__(self, docs_dir, embedding_type="fake", embedding_config=None):
"""
Initialize document processor with configurable embeddings

Args:
docs_dir (str): Directory containing documents to process
embedding_type (str): Type of embedding to use ('fake', 'openai', etc.)
embedding_config (dict, optional): Configuration for the embedding
"""
def __init__(self, docs_dir, embedding_config=None):
self.docs_dir = docs_dir
self.embedding_type = embedding_type
self.embedding_config = embedding_config or {}
self.embeddings = self._get_embeddings()
self.vectorstore = None
Expand All @@ -35,22 +23,25 @@ def __init__(self, docs_dir, embedding_type="fake", embedding_config=None):
)

def _get_embeddings(self):
"""
Factory method to get embeddings based on configuration

Returns:
Embeddings: An instance of embeddings
"""
if self.embedding_type == "openai":
embedding_type = self.embedding_config.get("type", "fake")
if embedding_type == "openai":
# Only import if needed
from langchain_openai import OpenAIEmbeddings
api_key = self.embedding_config.get("api_key", os.getenv("OPENAI_API_KEY"))
api_key = self.embedding_config["api_key"]
Copy link

Copilot AI Apr 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removing the fallback using os.getenv may lead to a KeyError when the API key is not provided. Consider restoring fallback behavior by using self.embedding_config.get('api_key', os.getenv('OPENAI_API_KEY')).

Suggested change
api_key = self.embedding_config["api_key"]
api_key = self.embedding_config.get('api_key', os.getenv('OPENAI_API_KEY'))

Copilot uses AI. Check for mistakes.
return OpenAIEmbeddings(api_key=api_key)
elif self.embedding_type == "fake":
elif embedding_type == "huggingface":
# Only import if needed
from langchain_huggingface import HuggingFaceEmbeddings
model_name = self.embedding_config.get("model_name", "all-MiniLM-L6-v2")
return HuggingFaceEmbeddings(
model_name=model_name,
cache_folder=os.path.join(self.docs_dir, ".cache", "langchain"),
)
elif embedding_type == "fake":
# Simple embedding for testing
return FakeEmbeddings(size=1536) # OpenAI compatible dimension
else:
raise ValueError(f"Unsupported embedding type: {self.embedding_type}")
raise ValueError(f"Unsupported embedding type: {embedding_type}")

def _create_empty_store(self):
"""Handle the case where no texts are found to vectorize by creating an empty FAISS index"""
Expand All @@ -62,7 +53,6 @@ def _create_empty_store(self):
docstore=InMemoryDocstore({}),
index_to_docstore_id={}
)


async def scan_and_vectorize(self, topic_name:str) -> str:
"""Scan all documents in the specified directory and create vector embeddings"""
Expand All @@ -77,13 +67,13 @@ async def scan_and_vectorize(self, topic_name:str) -> str:
# Process each file based on its type
count = 0
for file_path in files:
print(f"- {file_path}")
print(f" - {file_path}")
text = await self._extract_text(file_path)
if text:
document_texts.append({"text": text, "source": file_path})
count += 1
else:
print(f"Text extraction failed for {file_path}")
print(f" Text extraction failed for {file_path}")
print(f"Done. Extracted text from {count} documents")

# Split texts into chunks
Expand Down
3 changes: 1 addition & 2 deletions src/innieme/innie.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,7 @@ def __init__(self, outie_config:OutieConfig, api_key:str, config: TopicConfig):
# Initialize components
self.document_processor = DocumentProcessor(
config.docs_dir,
embedding_type="openai",
embedding_config={"api_key": api_key}
embedding_config={"type":outie_config.bot.embedding_model, "api_key": outie_config.bot.openai_api_key}
)
self.knowledge_manager = KnowledgeManager()
self.conversation_engine = ConversationEngine(
Expand Down
1 change: 1 addition & 0 deletions tests/test_discord_bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
bot_config = DiscordBotConfig(
discord_token="test_token",
openai_api_key="test_key",
embedding_model="fake",
outies=[]
)
outie_config = OutieConfig(
Expand Down
3 changes: 2 additions & 1 deletion tests/test_discord_bot_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
def test_valid_outie_id():
"""Test that a positive outie_id is accepted"""
# Create a bot config first
bot = DiscordBotConfig(discord_token="test_token", openai_api_key="key", outies=[]) # Add minimal bot config
bot = DiscordBotConfig(discord_token="test_token", openai_api_key="key", embedding_model="huggingface", outies=[]) # Add minimal bot config
outie = OutieConfig(outie_id=1, topics=[], bot=bot) # Add bot reference
assert outie.outie_id == 1

Expand Down Expand Up @@ -35,6 +35,7 @@ def test_config_from_yaml():
yaml_content = f"""
discord_token: "test_discord_token"
openai_api_key: "test_openai_key"
embedding_model: "openai"
outies:
- outie_id: 1
topics:
Expand Down