Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 20 additions & 22 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,28 +1,26 @@
# Discord API
discord.py>=2.3.0
# Discord Integration
discord.py

# Environment variables
python-dotenv>=1.0.0
# Environment Configuration
python-dotenv

# Document processing
pypdf>=3.15.1
python-docx>=0.8.11
langchain>=0.0.267
langchain-community>=0.0.10
langchain-openai>=0.0.5
# Document Processing & LLM
chromadb
langchain
langchain-community
langchain-chroma
langchain_huggingface
langchain-openai
pypdf
python-docx

# Vector database
faiss-cpu>=1.7.4
# Vector Database
faiss-cpu

# OpenAI embeddings
openai>=1.0.0

# Async support
asyncio>=3.4.3
# AI/ML Dependencies
numpy
openai

# Testing
pytest>=7.4.0
pytest-asyncio>=0.21.0

# Utilities
numpy>=1.24.0
pytest
pytest-asyncio
2 changes: 1 addition & 1 deletion src/innieme/discord_bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ async def on_message(self, message:Message):
await self.respond(message, "Sorry I am not set up to support a topic in this channel.")
return

logger.info(f"On message, located topic: {topic.config.name}")
logger.debug(f"On message, located topic: {topic.config.name}")
outie_id = topic.outie_config.outie_id

# Check if message is in a thread
Expand Down
83 changes: 39 additions & 44 deletions src/innieme/document_processor.py
Original file line number Diff line number Diff line change
@@ -1,65 +1,53 @@
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores.faiss import FAISS as LangchainFAISS
from langchain_community.embeddings import FakeEmbeddings
from .embeddings_factory import EmbeddingsFactory
from .vector_store_factory import VectorStoreFactory

from langchain.text_splitter import RecursiveCharacterTextSplitter
import faiss

import pypdf
import docx
import glob
from pydantic import SecretStr

from typing import List, Dict
from typing import List, Dict, Optional, Union
from langchain.embeddings.base import Embeddings

import logging
import os
import time

logger = logging.getLogger(__name__)

class DocumentProcessor:
def __init__(self, docs_dir, embedding_config:Dict[str, str]=None):
def __init__(self,
topic: str,
docs_dir: str,
embeddings_factory: EmbeddingsFactory,
vector_store_factory: VectorStoreFactory):
self.docs_dir = docs_dir
self.embedding_config = embedding_config or {}
self.embeddings = self._get_embeddings()
self.vectorstore = None
self.topic = topic
self.embeddings_factory = embeddings_factory
self.vector_store_factory = vector_store_factory
self.text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000,
chunk_overlap=200
)

def _get_embeddings(self):
embedding_type = self.embedding_config.get("type", "fake")
if embedding_type == "openai":
# Only import if needed
from langchain_openai import OpenAIEmbeddings
api_key = self.embedding_config.get("api_key")
return OpenAIEmbeddings(api_key=api_key)
elif embedding_type == "huggingface":
# Only import if needed
from langchain_huggingface import HuggingFaceEmbeddings
model_name = self.embedding_config.get("model_name", "all-MiniLM-L6-v2")
return HuggingFaceEmbeddings(
model_name=model_name,
cache_folder=os.path.join(self.docs_dir, ".cache", "langchain"),
)
elif embedding_type == "fake":
# Simple embedding for testing
return FakeEmbeddings(size=1536) # OpenAI compatible dimension
else:
raise ValueError(f"Unsupported embedding type: {embedding_type}")

def _create_empty_store(self):
"""Handle the case where no texts are found to vectorize by creating an empty FAISS index"""
dimension = 1536 # Same as OpenAI embeddings dimension
# Create empty FAISS instance
return LangchainFAISS(
embedding_function=self.embeddings,
index=faiss.IndexFlatL2(dimension),
docstore=InMemoryDocstore({}),
index_to_docstore_id={}
"""Handle the case where no texts are found to vectorize"""
collection_name = self._get_collection_name()
return self.vector_store_factory.create_empty_store(
collection_name=collection_name,
embeddings=self.embeddings_factory.create_embeddings()
)

async def scan_and_vectorize(self, topic_name:str) -> str:
def _get_collection_name(self) -> str:
"""Create a unique collection name using topic and timestamp"""
# Clean topic name to be filesystem safe
safe_topic = "".join(c if c.isalnum() else "_" for c in self.topic)
timestamp = int(time.time() * 1000) # Milliseconds since epoch
return f"{safe_topic}_{timestamp}"

async def scan_and_vectorize(self) -> str:
"""Scan all documents in the specified directory and create vector embeddings"""
document_texts = []

Expand All @@ -68,7 +56,7 @@ async def scan_and_vectorize(self, topic_name:str) -> str:
for ext in ['*.pdf', '*.docx', '*.txt', '*.md']:
files.extend(glob.glob(os.path.join(self.docs_dir, '**', ext), recursive=True))

logger.info(f"For {topic_name}: Found {len(files)} documents to process under {self.docs_dir}...")
logger.info(f"For {self.topic}: Found {len(files)} documents to process under {self.docs_dir}...")
# Process each file based on its type
count = 0
for file_path in files:
Expand All @@ -89,15 +77,22 @@ async def scan_and_vectorize(self, topic_name:str) -> str:

# Create vector store
texts = [chunk["text"] for chunk in all_chunks]

collection_name = self._get_collection_name()

response = ""
if not texts:
self.vectorstore = self._create_empty_store()
response = f"On topic '{topic_name}': no documents found to process"
response = f"On topic '{self.topic}': no documents found to process"
else:
metadatas = [{"source": chunk["source"]} for chunk in all_chunks]
self.vectorstore = FAISS.from_texts(texts, self.embeddings, metadatas=metadatas)
response = f"On topic '{topic_name}': {len(all_chunks)} chunks created from {count} out of {len(files)} references"
self.vectorstore = self.vector_store_factory.create_from_texts(
texts,
self.embeddings_factory.create_embeddings(),
collection_name=collection_name,
metadatas=metadatas
)
response = f"On topic '{self.topic}': {len(all_chunks)} chunks created from {count} out of {len(files)} references"
return response

async def _extract_text(self, file_path):
Expand Down
37 changes: 37 additions & 0 deletions src/innieme/embeddings_factory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from abc import ABC, abstractmethod
from pydantic import SecretStr
from langchain.embeddings.base import Embeddings
from langchain_openai import OpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings

class EmbeddingsFactory(ABC):
    """Abstract base for factories that produce langchain ``Embeddings``.

    Concrete subclasses hide provider-specific construction details
    (API keys, model names, cache folders) behind one method.
    """

    @abstractmethod
    def create_embeddings(self) -> Embeddings:
        """Build and return a ready-to-use embeddings instance."""
        ...

class OpenAIEmbeddingsFactory(EmbeddingsFactory):
    """Factory that builds OpenAI-backed embeddings from an API key."""

    def __init__(self, api_key: str):
        # Kept as a plain string; wrapped in SecretStr only at build time.
        self.api_key = api_key

    def create_embeddings(self) -> Embeddings:
        """Return an OpenAIEmbeddings instance using the stored key."""
        secret = SecretStr(self.api_key)
        return OpenAIEmbeddings(api_key=secret)

class HuggingFaceEmbeddingsFactory(EmbeddingsFactory):
    """Factory that builds locally-run HuggingFace embeddings."""

    def __init__(self, cache_dir: str, model_name: str = "all-MiniLM-L6-v2"):
        # Where downloaded model weights are cached between runs.
        self.cache_dir = cache_dir
        self.model_name = model_name

    def create_embeddings(self) -> Embeddings:
        """Instantiate the model, caching its files under cache_dir."""
        kwargs = {
            "model_name": self.model_name,
            "cache_folder": self.cache_dir,
        }
        return HuggingFaceEmbeddings(**kwargs)

class ExistingEmbeddingsFactory(EmbeddingsFactory):
    """Adapter that hands back a pre-built embeddings object unchanged.

    Useful for injecting a fake/test embeddings instance through the
    same factory interface the real providers use.
    """

    def __init__(self, embeddings: Embeddings):
        self.embeddings = embeddings

    def create_embeddings(self) -> Embeddings:
        """Return the wrapped embeddings instance as-is."""
        return self.embeddings
41 changes: 36 additions & 5 deletions src/innieme/innie.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,35 @@
from dataclasses import dataclass
from typing import List
from functools import wraps
from .embeddings_factory import EmbeddingsFactory, OpenAIEmbeddingsFactory, HuggingFaceEmbeddingsFactory, ExistingEmbeddingsFactory
from .vector_store_factory import ChromaVectorStoreFactory, FAISSVectorStoreFactory
from .document_processor import DocumentProcessor
from .knowledge_manager import KnowledgeManager
from .conversation_engine import ConversationEngine
from .discord_bot_config import OutieConfig, TopicConfig

from langchain_community.embeddings import FakeEmbeddings

import os

from dataclasses import dataclass
from typing import Dict
from functools import wraps

class Topic:
def __init__(self, outie_config:OutieConfig, api_key:str, config: TopicConfig):
self.config = config
self.outie_config = outie_config
# Initialize components
self.document_processor = DocumentProcessor(
self.config.name,
config.docs_dir,
embedding_config={"type":outie_config.bot.embedding_model, "api_key": outie_config.bot.openai_api_key}
self._create_embeddings_from_config(
{
"type":outie_config.bot.embedding_model,
"api_key": outie_config.bot.openai_api_key,
"cache_dir": os.path.join(config.docs_dir, ".cache", "langchain")
}
),
ChromaVectorStoreFactory()
# FAISSVectorStoreFactory()
)
self.knowledge_manager = KnowledgeManager()
self.active_threads = set()
Expand All @@ -24,6 +40,21 @@ def __init__(self, outie_config:OutieConfig, api_key:str, config: TopicConfig):
self.knowledge_manager
)

def _create_embeddings_from_config(self, config: Dict[str, str]) -> EmbeddingsFactory:
    """Translate a raw config dict into a concrete EmbeddingsFactory.

    Recognized "type" values: "openai", "huggingface", "fake".
    Raises ValueError for anything else (including a missing type).
    """
    kind = config.get("type", "<empty>")
    if kind == "openai":
        return OpenAIEmbeddingsFactory(config['api_key'])
    if kind == "huggingface":
        model = config.get("model_name", "all-MiniLM-L6-v2")
        return HuggingFaceEmbeddingsFactory(
            cache_dir=config['cache_dir'],
            model_name=model
        )
    if kind == "fake":
        # 1536 matches the OpenAI embedding dimension, so fake vectors
        # stay drop-in compatible for tests.
        return ExistingEmbeddingsFactory(FakeEmbeddings(size=1536))
    raise ValueError(f"Unsupported embedding type: {kind}")

def is_following_thread(self, thread_id:int) -> bool:
return thread_id in self.active_threads

Expand All @@ -32,7 +63,7 @@ async def process_query(self, thread_id: int, query: str, context_messages: list
return await self.conversation_engine.process_query(query, context_messages)

async def scan_and_vectorize(self) -> str:
return await self.document_processor.scan_and_vectorize(self.config.name)
return await self.document_processor.scan_and_vectorize()

async def generate_summary(self, thread_id) -> str:
return await self.knowledge_manager.generate_summary(thread_id)
Expand Down
41 changes: 41 additions & 0 deletions src/innieme/vector_store_factory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from langchain.vectorstores.base import VectorStore
from langchain.embeddings.base import Embeddings
from langchain_chroma.vectorstores import Chroma
from langchain_community.vectorstores import FAISS

from abc import ABC, abstractmethod
from typing import List, Dict, Optional

class VectorStoreFactory(ABC):
    """Abstract base for factories that build langchain vector stores.

    Implementations pick the backing store (Chroma, FAISS, ...) while
    callers depend only on this two-method interface.
    """

    @abstractmethod
    def create_empty_store(self, collection_name: str, embeddings: Embeddings) -> VectorStore:
        """Return a store containing no documents."""
        ...

    @abstractmethod
    def create_from_texts(self, texts: List[str], embeddings: Embeddings, collection_name: str, metadatas: Optional[List[Dict]] = None) -> VectorStore:
        """Return a store populated from texts, with optional per-text metadata."""
        ...

class ChromaVectorStoreFactory(VectorStoreFactory):
    """Factory for Chroma-backed vector stores, keyed by collection name."""

    def create_empty_store(self, collection_name: str, embeddings: Embeddings) -> VectorStore:
        """Return a fresh, empty Chroma collection."""
        store = Chroma(
            collection_name=collection_name,
            embedding_function=embeddings,
        )
        return store

    def create_from_texts(self, texts: List[str], embeddings: Embeddings, collection_name: str, metadatas: Optional[List[Dict]] = None) -> VectorStore:
        """Embed texts into a new Chroma collection, attaching any metadatas."""
        return Chroma.from_texts(
            texts,
            embeddings,
            metadatas=metadatas,
            collection_name=collection_name,
        )

class FAISSVectorStoreFactory(VectorStoreFactory):
    """Factory for in-memory FAISS vector stores.

    FAISS has no notion of named collections, so ``collection_name`` is
    accepted for interface compatibility but otherwise unused.
    """

    def create_empty_store(self, collection_name: str, embeddings: Embeddings) -> VectorStore:
        """Return an empty FAISS store.

        Bug fix: ``FAISS.from_texts([], ...)`` raises because the index
        dimension cannot be inferred from zero texts. Probe the embeddings
        once to learn the dimension, then assemble an empty index directly
        (mirrors how the pre-refactor code created empty stores).
        """
        # Local imports: these are only needed on this code path.
        import faiss
        from langchain_community.docstore.in_memory import InMemoryDocstore

        dimension = len(embeddings.embed_query("dimension probe"))
        return FAISS(
            embedding_function=embeddings,
            index=faiss.IndexFlatL2(dimension),
            docstore=InMemoryDocstore({}),
            index_to_docstore_id={},
        )

    def create_from_texts(self, texts: List[str], embeddings: Embeddings, collection_name: str, metadatas: Optional[List[Dict]] = None) -> VectorStore:
        """Build a FAISS store from texts; metadatas carry per-text source info."""
        return FAISS.from_texts(texts, embeddings, metadatas=metadatas)
Loading