Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 4 additions & 8 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,11 @@ jobs:
uses: actions/setup-python@v4
with:
python-version: 3.9

- name: Install Poetry
uses: snok/install-poetry@v1.3.1

- name: Clean poetry
run: rm poetry.lock
- name: Install pytest
run: pip install pytest

- name: Install dependencies
run: poetry install --with dev --no-cache
run: pip install -e .

- name: Run tests
run: poetry run pytest tests/
run: pytest tests/
4 changes: 2 additions & 2 deletions examples/02-basic_training.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,9 @@
},
"outputs": [],
"source": [
"from ragatouille.data import CorpusProcessor, llama_index_sentence_splitter\n",
"from ragatouille.data import CorpusProcessor, simple_sentence_splitter\n",
"\n",
"corpus_processor = CorpusProcessor(document_splitter_fn=llama_index_sentence_splitter)\n",
"corpus_processor = CorpusProcessor(document_splitter_fn=simple_sentence_splitter)\n",
"documents = corpus_processor.process_corpus(my_full_corpus, chunk_size=256)"
]
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -184,9 +184,9 @@
"my_full_corpus += [get_wikipedia_page(\"Studio_Ghibli\")]\n",
"my_full_corpus += [get_wikipedia_page(\"Toei_Animation\")]\n",
"\n",
"from ragatouille.data import CorpusProcessor, llama_index_sentence_splitter\n",
"from ragatouille.data import CorpusProcessor, simple_sentence_splitter\n",
"\n",
"corpus_processor = CorpusProcessor(document_splitter_fn=llama_index_sentence_splitter)\n",
"corpus_processor = CorpusProcessor(document_splitter_fn=simple_sentence_splitter)\n",
"documents = corpus_processor.process_corpus(my_full_corpus, chunk_size=180)"
]
},
Expand Down
4,616 changes: 0 additions & 4,616 deletions poetry.lock

This file was deleted.

81 changes: 51 additions & 30 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,39 +1,60 @@
[tool.poetry]
name = "RAGatouille"
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

[tool.setuptools]
packages = [
"ragatouille",
"ragatouille.models",
"ragatouille.data",
"ragatouille.integrations",
"ragatouille.indexing",
"ragatouille.training",
]

[project]
name = "RAGatouille"
version = "0.0.8post2"
description = "Library to facilitate the use of state-of-the-art retrieval models in common RAG contexts."
authors = ["Benjamin Clavie <ben@clavie.eu>"]
license = "Apache-2.0"
readme = "README.md"
packages = [{include = "ragatouille"}]
repository = "https://github.com/bclavie/ragatouille"
requires-python = ">=3.8"
license = {file = "LICENSE"}
keywords = ["reranking", "retrieval", "rag", "nlp"]
authors = [
{name = "Ben Clavié", email = "bc@answer.ai" }
]
maintainers = [
{name = "Ben Clavié", email = "bc@answer.ai" }
]

[tool.poetry.dependencies]
python = ">=3.9,<4.0"
faiss-cpu = "^1.7.4"
transformers = "^4.36.2"
voyager = "^2.0.2"
sentence-transformers = "^2.2.2"
torch = ">=1.13"
llama-index = ">=0.7"
langchain_core = "^0.1.4"
colbert-ai = "0.2.19"
langchain = "^0.1.0"
onnx = "^1.15.0"
srsly = "2.4.8"
fast-pytorch-kmeans= "0.2.0.1"
dependencies = [
"llama-index",
"faiss-cpu",
"langchain_core",
"colbert-ai>=0.2.19",
"langchain",
"onnx",
"srsly",
"voyager",
"torch>=1.13",
"fast-pytorch-kmeans",
"sentence-transformers",
]

[tool.poetry.group.dev.dependencies]
pytest = "^7.4.0"
mkdocs = "^1.4.3"
mkdocs-material = "^9.1.18"
mkdocstrings = "^0.22.0"
mkdocstrings-python = "^1.1.2"
ruff = "^0.1.9"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
[project.optional-dependencies]
all = [
"llama-index",
"langchain",
"rerankers",
"voyager",
]
llamaindex = ["llama-index"]
langchain = ["langchain"]
train = ["sentence-transformers", "rerankers"]

[project.urls]
"Homepage" = "https://github.com/bclavie/ragatouille"

[tool.pytest.ini_options]
filterwarnings = [
Expand Down
8 changes: 4 additions & 4 deletions ragatouille/RAGPretrainedModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from langchain_core.retrievers import BaseRetriever

from ragatouille.data.corpus_processor import CorpusProcessor
from ragatouille.data.preprocessors import llama_index_sentence_splitter
from ragatouille.data.preprocessors import simple_sentence_splitter
from ragatouille.integrations import (
RAGatouilleLangChainCompressor,
RAGatouilleLangChainRetriever,
Expand Down Expand Up @@ -177,7 +177,7 @@ def index(
overwrite_index: Union[bool, str] = True,
max_document_length: int = 256,
split_documents: bool = True,
document_splitter_fn: Optional[Callable] = llama_index_sentence_splitter,
document_splitter_fn: Optional[Callable] = simple_sentence_splitter,
preprocessing_fn: Optional[Union[Callable, list[Callable]]] = None,
bsize: int = 32,
use_faiss: bool = False,
Expand All @@ -191,7 +191,7 @@ def index(
overwrite_index (Union[bool, str]): Whether to overwrite an existing index with the same name.
max_document_length (int): The maximum length of a document. Documents longer than this will be split into chunks.
split_documents (bool): Whether to split documents into chunks.
document_splitter_fn (Optional[Callable]): A function to split documents into chunks. If None and by default, will use the llama_index_sentence_splitter.
document_splitter_fn (Optional[Callable]): A function to split documents into chunks. If None and by default, will use the simple_sentence_splitter.
preprocessing_fn (Optional[Union[Callable, list[Callable]]]): A function or list of functions to preprocess documents. If None and by default, will not preprocess documents.
bsize (int): The batch size to use for encoding the passages.

Expand Down Expand Up @@ -226,7 +226,7 @@ def add_to_index(
new_document_metadatas: Optional[list[dict]] = None,
index_name: Optional[str] = None,
split_documents: bool = True,
document_splitter_fn: Optional[Callable] = llama_index_sentence_splitter,
document_splitter_fn: Optional[Callable] = simple_sentence_splitter,
preprocessing_fn: Optional[Union[Callable, list[Callable]]] = None,
bsize: int = 32,
use_faiss: bool = False,
Expand Down
4 changes: 2 additions & 2 deletions ragatouille/data/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from .corpus_processor import CorpusProcessor
from .preprocessors import llama_index_sentence_splitter
from .preprocessors import simple_sentence_splitter
from .training_data_processor import TrainingDataProcessor

__all__ = [
"TrainingDataProcessor",
"CorpusProcessor",
"llama_index_sentence_splitter",
"simple_sentence_splitter",
]
4 changes: 2 additions & 2 deletions ragatouille/data/corpus_processor.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
from typing import Callable, List, Optional, Union
from uuid import uuid4

from ragatouille.data.preprocessors import llama_index_sentence_splitter
from ragatouille.data.preprocessors import simple_sentence_splitter


class CorpusProcessor:
def __init__(
self,
document_splitter_fn: Optional[Callable] = llama_index_sentence_splitter,
document_splitter_fn: Optional[Callable] = simple_sentence_splitter,
preprocessing_fn: Optional[Union[Callable, list[Callable]]] = None,
):
self.document_splitter_fn = document_splitter_fn
Expand Down
100 changes: 86 additions & 14 deletions ragatouille/data/preprocessors.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,92 @@
import re
from typing import List, Dict, Optional, Callable

try:
    # llama-index reorganised its public API: older releases expose
    # `llama_index.Document`, newer ones moved it under `llama_index.core`.
    # Try the legacy layout first, then fall back to the new one.
    try:
        from llama_index import Document
        from llama_index.text_splitter import SentenceSplitter
    except ImportError:
        from llama_index.core import Document
        from llama_index.core.text_splitter import SentenceSplitter
    has_llama_index = True
except ImportError:
    # llama-index is an optional dependency; without it we fall back to the
    # naive regex-based splitter defined below.
    print(
        "Llamaindex is not installed, defaulting to a naive sentence splitter instead."
    )
    has_llama_index = False


def estimate_token_length(text: str) -> int:
    """Crudely estimate token count as ~1.5 tokens per whitespace-separated word."""
    return int(len(text.split()) * 1.5)


def split_into_sentences(text: str) -> List[str]:
    """Split *text* into sentences on '.', '!' or '?' followed by whitespace.

    Empty or whitespace-only fragments are dropped, so an empty input yields
    ``[]`` instead of a phantom empty sentence ``['']``.
    """
    return [s for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]


def merge_sentences(
    sentences: List[str], chunk_size: int, chunk_overlap: int
) -> List[str]:
    """Greedily pack sentences into chunks of at most ~chunk_size tokens.

    Consecutive chunks share up to ~chunk_overlap estimated tokens of
    trailing sentences from the previous chunk (sizes measured with
    estimate_token_length). Sentences inside a chunk are joined with a
    single space.
    """
    chunks: List[str] = []
    current_chunk: List[str] = []
    current_chunk_size = 0

    for sentence in sentences:
        sentence_size = estimate_token_length(sentence)

        if current_chunk and current_chunk_size + sentence_size > chunk_size:
            chunks.append(" ".join(current_chunk))
            # Carry over trailing sentences totalling at most chunk_overlap
            # estimated tokens. (The previous logic popped *leading*
            # sentences until the popped amount reached chunk_overlap, which
            # kept roughly chunk_size - chunk_overlap tokens of overlap —
            # i.e. nearly the whole previous chunk was duplicated into the
            # next one.)
            overlap: List[str] = []
            overlap_size = 0
            for prev in reversed(current_chunk):
                prev_size = estimate_token_length(prev)
                if overlap_size + prev_size > chunk_overlap:
                    break
                overlap.append(prev)
                overlap_size += prev_size
            overlap.reverse()
            current_chunk = overlap
            current_chunk_size = overlap_size

        current_chunk.append(sentence)
        current_chunk_size += sentence_size

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


def naive_simple_sentence_splitter(
    documents: List[str], document_ids: List[str], chunk_size: int = 256
) -> List[Dict[str, str]]:
    """Dependency-free document splitter.

    Splits each document into sentences with a punctuation regex
    (split_into_sentences) and greedily merges them into chunks of at most
    ~chunk_size estimated tokens (merge_sentences), with a small overlap
    between consecutive chunks.

    Args:
        documents: Raw document texts.
        document_ids: One id per document, zipped positionally with
            ``documents``.
        chunk_size: Approximate maximum tokens per chunk.

    Returns:
        A flat list of ``{"document_id": ..., "content": ...}`` dicts.
    """
    # min(x // 4, min(x // 2, 64)) is always min(x // 4, 64) for x >= 0;
    # the redundant inner min is dropped.
    chunk_overlap = min(chunk_size // 4, 64)
    chunks = []
    for doc_id, doc in zip(document_ids, documents):
        sentences = split_into_sentences(doc)
        doc_chunks = merge_sentences(sentences, chunk_size, chunk_overlap)
        chunks.extend(
            [{"document_id": doc_id, "content": chunk} for chunk in doc_chunks]
        )
    return chunks


# Default export; overridden below with the llama-index splitter when it is
# installed.
simple_sentence_splitter = naive_simple_sentence_splitter


if has_llama_index:

    def llama_index_sentence_splitter(
        documents: List[str], document_ids: List[str], chunk_size: int = 256
    ) -> List[Dict[str, str]]:
        """Split documents into chunks with llama-index's SentenceSplitter.

        Args:
            documents: Raw document texts.
            document_ids: One id per document, zipped positionally with
                ``documents``.
            chunk_size: Approximate maximum tokens per chunk.

        Returns:
            A flat list of ``{"document_id": ..., "content": ...}`` dicts.
        """
        # Overlap capped at 64 tokens or a quarter of the chunk size.
        chunk_overlap = min(chunk_size // 4, min(chunk_size // 2, 64))
        chunks = []
        node_parser = SentenceSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        docs = [Document(text=doc) for doc in documents]
        for doc_id, doc in zip(document_ids, docs):
            chunks += [
                {"document_id": doc_id, "content": node.text}
                for node in node_parser.get_nodes_from_documents([doc])
            ]
        return chunks

    # Prefer the llama-index splitter when available, replacing the naive
    # default bound above.
    simple_sentence_splitter = llama_index_sentence_splitter
Empty file.
Loading