Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 4 additions & 8 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,11 @@ jobs:
uses: actions/setup-python@v4
with:
python-version: 3.9

- name: Install Poetry
uses: snok/install-poetry@v1.3.1

- name: Clean poetry
run: rm poetry.lock
- name: Install pytest
run: pip install pytest

- name: Install dependencies
run: poetry install --with dev --no-cache
run: pip install -e .

- name: Run tests
run: poetry run pytest tests/
run: pytest tests/
4 changes: 2 additions & 2 deletions examples/02-basic_training.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,9 @@
},
"outputs": [],
"source": [
"from ragatouille.data import CorpusProcessor, llama_index_sentence_splitter\n",
"from ragatouille.data import CorpusProcessor, simple_sentence_splitter\n",
"\n",
"corpus_processor = CorpusProcessor(document_splitter_fn=llama_index_sentence_splitter)\n",
"corpus_processor = CorpusProcessor(document_splitter_fn=simple_sentence_splitter)\n",
"documents = corpus_processor.process_corpus(my_full_corpus, chunk_size=256)"
]
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -184,9 +184,9 @@
"my_full_corpus += [get_wikipedia_page(\"Studio_Ghibli\")]\n",
"my_full_corpus += [get_wikipedia_page(\"Toei_Animation\")]\n",
"\n",
"from ragatouille.data import CorpusProcessor, llama_index_sentence_splitter\n",
"from ragatouille.data import CorpusProcessor, simple_sentence_splitter\n",
"\n",
"corpus_processor = CorpusProcessor(document_splitter_fn=llama_index_sentence_splitter)\n",
"corpus_processor = CorpusProcessor(document_splitter_fn=simple_sentence_splitter)\n",
"documents = corpus_processor.process_corpus(my_full_corpus, chunk_size=180)"
]
},
Expand Down
4,616 changes: 0 additions & 4,616 deletions poetry.lock

This file was deleted.

81 changes: 51 additions & 30 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,39 +1,60 @@
[tool.poetry]
name = "RAGatouille"
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

[tool.setuptools]
packages = [
"ragatouille",
"ragatouille.models",
"ragatouille.data",
"ragatouille.integrations",
"ragatouille.indexing",
"ragatouille.training",
]

[project]
name = "RAGatouille"
version = "0.0.8post2"
description = "Library to facilitate the use of state-of-the-art retrieval models in common RAG contexts."
authors = ["Benjamin Clavie <ben@clavie.eu>"]
license = "Apache-2.0"
readme = "README.md"
packages = [{include = "ragatouille"}]
repository = "https://github.com/bclavie/ragatouille"
requires-python = ">=3.8"
license = {file = "LICENSE"}
keywords = ["reranking", "retrieval", "rag", "nlp"]
authors = [
{name = "Ben Clavié", email = "bc@answer.ai" }
]
maintainers = [
{name = "Ben Clavié", email = "bc@answer.ai" }
]

[tool.poetry.dependencies]
python = ">=3.9,<4.0"
faiss-cpu = "^1.7.4"
transformers = "^4.36.2"
voyager = "^2.0.2"
sentence-transformers = "^2.2.2"
torch = ">=1.13"
llama-index = ">=0.7"
langchain_core = "^0.1.4"
colbert-ai = "0.2.19"
langchain = "^0.1.0"
onnx = "^1.15.0"
srsly = "2.4.8"
fast-pytorch-kmeans= "0.2.0.1"
dependencies = [
"llama-index",
"faiss-cpu",
"langchain_core",
"colbert-ai>=0.2.19",
"langchain",
"onnx",
"srsly",
"voyager",
"torch>=1.13",
"fast-pytorch-kmeans",
"sentence-transformers",
]

[tool.poetry.group.dev.dependencies]
pytest = "^7.4.0"
mkdocs = "^1.4.3"
mkdocs-material = "^9.1.18"
mkdocstrings = "^0.22.0"
mkdocstrings-python = "^1.1.2"
ruff = "^0.1.9"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
[project.optional-dependencies]
all = [
"llama-index",
"langchain",
"rerankers",
"voyager",
]
llamaindex = ["llama-index"]
langchain = ["langchain"]
train = ["sentence-transformers", "rerankers"]

[project.urls]
"Homepage" = "https://github.com/bclavie/ragatouille"

[tool.pytest.ini_options]
filterwarnings = [
Expand Down
8 changes: 4 additions & 4 deletions ragatouille/RAGPretrainedModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from langchain_core.retrievers import BaseRetriever

from ragatouille.data.corpus_processor import CorpusProcessor
from ragatouille.data.preprocessors import llama_index_sentence_splitter
from ragatouille.data.preprocessors import simple_sentence_splitter
from ragatouille.integrations import (
RAGatouilleLangChainCompressor,
RAGatouilleLangChainRetriever,
Expand Down Expand Up @@ -177,7 +177,7 @@ def index(
overwrite_index: Union[bool, str] = True,
max_document_length: int = 256,
split_documents: bool = True,
document_splitter_fn: Optional[Callable] = llama_index_sentence_splitter,
document_splitter_fn: Optional[Callable] = simple_sentence_splitter,
preprocessing_fn: Optional[Union[Callable, list[Callable]]] = None,
bsize: int = 32,
use_faiss: bool = False,
Expand All @@ -191,7 +191,7 @@ def index(
overwrite_index (Union[bool, str]): Whether to overwrite an existing index with the same name.
max_document_length (int): The maximum length of a document. Documents longer than this will be split into chunks.
split_documents (bool): Whether to split documents into chunks.
document_splitter_fn (Optional[Callable]): A function to split documents into chunks. If None and by default, will use the llama_index_sentence_splitter.
document_splitter_fn (Optional[Callable]): A function to split documents into chunks. If None and by default, will use the simple_sentence_splitter.
preprocessing_fn (Optional[Union[Callable, list[Callable]]]): A function or list of functions to preprocess documents. If None and by default, will not preprocess documents.
bsize (int): The batch size to use for encoding the passages.

Expand Down Expand Up @@ -226,7 +226,7 @@ def add_to_index(
new_document_metadatas: Optional[list[dict]] = None,
index_name: Optional[str] = None,
split_documents: bool = True,
document_splitter_fn: Optional[Callable] = llama_index_sentence_splitter,
document_splitter_fn: Optional[Callable] = simple_sentence_splitter,
preprocessing_fn: Optional[Union[Callable, list[Callable]]] = None,
bsize: int = 32,
use_faiss: bool = False,
Expand Down
4 changes: 2 additions & 2 deletions ragatouille/data/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from .corpus_processor import CorpusProcessor
from .preprocessors import llama_index_sentence_splitter
from .preprocessors import simple_sentence_splitter
from .training_data_processor import TrainingDataProcessor

__all__ = [
"TrainingDataProcessor",
"CorpusProcessor",
"llama_index_sentence_splitter",
"simple_sentence_splitter",
]
4 changes: 2 additions & 2 deletions ragatouille/data/corpus_processor.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
from typing import Callable, List, Optional, Union
from uuid import uuid4

from ragatouille.data.preprocessors import llama_index_sentence_splitter
from ragatouille.data.preprocessors import simple_sentence_splitter


class CorpusProcessor:
def __init__(
self,
document_splitter_fn: Optional[Callable] = llama_index_sentence_splitter,
document_splitter_fn: Optional[Callable] = simple_sentence_splitter,
preprocessing_fn: Optional[Union[Callable, list[Callable]]] = None,
):
self.document_splitter_fn = document_splitter_fn
Expand Down
100 changes: 86 additions & 14 deletions ragatouille/data/preprocessors.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,92 @@
import re
from typing import List, Dict, Optional, Callable

try:
    # llama-index reorganised its public API: older releases expose
    # `llama_index.Document`, newer ones moved it under `llama_index.core`.
    # Try the legacy layout first, then fall back to the new one.
    try:
        from llama_index import Document
        from llama_index.text_splitter import SentenceSplitter
    except ImportError:
        from llama_index.core import Document
        from llama_index.core.text_splitter import SentenceSplitter
    has_llama_index = True
except ImportError:
    # llama-index is an optional dependency; without it we fall back to the
    # naive regex-based splitter defined below.
    print(
        "Llamaindex is not installed, defaulting to a naive sentence splitter instead."
    )
    has_llama_index = False


def estimate_token_length(text: str) -> int:
    """Crudely estimate token count as ~1.5 tokens per whitespace-separated word."""
    return int(len(text.split()) * 1.5)


def split_into_sentences(text: str) -> List[str]:
    """Split *text* into sentences on '.', '!' or '?' followed by whitespace.

    Empty or whitespace-only fragments are dropped, so an empty input yields
    ``[]`` instead of a phantom empty sentence ``['']``.
    """
    return [s for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]


def merge_sentences(
    sentences: List[str], chunk_size: int, chunk_overlap: int
) -> List[str]:
    """Greedily pack sentences into chunks of at most ~chunk_size tokens.

    Consecutive chunks share up to ~chunk_overlap estimated tokens of
    trailing sentences from the previous chunk (sizes measured with
    estimate_token_length). Sentences inside a chunk are joined with a
    single space.
    """
    chunks: List[str] = []
    current_chunk: List[str] = []
    current_chunk_size = 0

    for sentence in sentences:
        sentence_size = estimate_token_length(sentence)

        if current_chunk and current_chunk_size + sentence_size > chunk_size:
            chunks.append(" ".join(current_chunk))
            # Carry over trailing sentences totalling at most chunk_overlap
            # estimated tokens. (The previous logic popped *leading*
            # sentences until the popped amount reached chunk_overlap, which
            # kept roughly chunk_size - chunk_overlap tokens of overlap —
            # i.e. nearly the whole previous chunk was duplicated into the
            # next one.)
            overlap: List[str] = []
            overlap_size = 0
            for prev in reversed(current_chunk):
                prev_size = estimate_token_length(prev)
                if overlap_size + prev_size > chunk_overlap:
                    break
                overlap.append(prev)
                overlap_size += prev_size
            overlap.reverse()
            current_chunk = overlap
            current_chunk_size = overlap_size

        current_chunk.append(sentence)
        current_chunk_size += sentence_size

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


def naive_simple_sentence_splitter(
    documents: List[str], document_ids: List[str], chunk_size: int = 256
) -> List[Dict[str, str]]:
    """Dependency-free document splitter.

    Splits each document into sentences with a punctuation regex
    (split_into_sentences) and greedily merges them into chunks of at most
    ~chunk_size estimated tokens (merge_sentences), with a small overlap
    between consecutive chunks.

    Args:
        documents: Raw document texts.
        document_ids: One id per document, zipped positionally with
            ``documents``.
        chunk_size: Approximate maximum tokens per chunk.

    Returns:
        A flat list of ``{"document_id": ..., "content": ...}`` dicts.
    """
    # min(x // 4, min(x // 2, 64)) is always min(x // 4, 64) for x >= 0;
    # the redundant inner min is dropped.
    chunk_overlap = min(chunk_size // 4, 64)
    chunks = []
    for doc_id, doc in zip(document_ids, documents):
        sentences = split_into_sentences(doc)
        doc_chunks = merge_sentences(sentences, chunk_size, chunk_overlap)
        chunks.extend(
            [{"document_id": doc_id, "content": chunk} for chunk in doc_chunks]
        )
    return chunks


# Default export; overridden below with the llama-index splitter when it is
# installed.
simple_sentence_splitter = naive_simple_sentence_splitter


if has_llama_index:

    def llama_index_sentence_splitter(
        documents: List[str], document_ids: List[str], chunk_size: int = 256
    ) -> List[Dict[str, str]]:
        """Split documents into chunks with llama-index's SentenceSplitter.

        Args:
            documents: Raw document texts.
            document_ids: One id per document, zipped positionally with
                ``documents``.
            chunk_size: Approximate maximum tokens per chunk.

        Returns:
            A flat list of ``{"document_id": ..., "content": ...}`` dicts.
        """
        # Overlap capped at 64 tokens or a quarter of the chunk size.
        chunk_overlap = min(chunk_size // 4, min(chunk_size // 2, 64))
        chunks = []
        node_parser = SentenceSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        docs = [Document(text=doc) for doc in documents]
        for doc_id, doc in zip(document_ids, docs):
            chunks += [
                {"document_id": doc_id, "content": node.text}
                for node in node_parser.get_nodes_from_documents([doc])
            ]
        return chunks

    # Prefer the llama-index splitter when available, replacing the naive
    # default bound above.
    simple_sentence_splitter = llama_index_sentence_splitter
Empty file.
Loading