Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -64,17 +64,21 @@ RUN pip install --no-cache-dir tiktoken==0.8.0

RUN pip install --no-cache-dir duckdb==1.1.3

RUN pip install --no-cache-dir docling==2.25.2
RUN pip install --no-cache-dir docling==2.26.0

RUN pip install --no-cache-dir docling_core==2.21.1
RUN pip install --no-cache-dir docling_core==2.23.0

RUN pip install --no-cache-dir chonkie==0.5.1

RUN pip install --no-cache-dir langchain-community==0.3.15

RUN pip install --no-cache-dir langid==1.1.6

RUN pip install --no-cache-dir nest-asyncio==1.6.0

RUN pip install --no-cache-dir firecrawl-py==1.12.0

RUN pip install --no-cache-dir langid==1.1.6
RUN pip install --no-cache-dir lxml==5.3.0


# it seems that the version of apache2-utils is not stable
Expand All @@ -99,5 +103,6 @@ RUN pip install -e . \
EXPOSE ${EDS_API_SERVICE_PORT}

ENV LOG_OUTPUT="console"
ENV GIT_SHA=$GIT_SHA

CMD ["./scripts/run.sh"]
2 changes: 2 additions & 0 deletions docker/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ cd "$BASE_DIR"
cp "$DIR"/dockerignore.template "$BASE_DIR"/.dockerignore
cat "$BASE_DIR"/.gitignore >> "$BASE_DIR"/.dockerignore

export GIT_SHA=$(git rev-parse HEAD)

docker build -t "${org_name}/${app_name}":"${version}" -f "$DIR"/Dockerfile "$BASE_DIR"

# tag the image with the latest version
Expand Down
171 changes: 171 additions & 0 deletions docker/start_single_docker.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
#!/bin/bash

# Start the docker container for the leettools service (single-node deployment).
#
# Usage: start_single_docker.sh [-e <env_file>] [-f] [-m <src:dest[,src2:dest2,...]>]
#   -e <env_file>    environment file relative to the repo root (default: .env)
#   -f               force stop/remove an existing container with the same name
#   -m <mount_dirs>  comma-separated extra read-only bind mounts (src:dest)

set -e -u

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
BASE_DIR="$(cd "$DIR"/.. && pwd)"

# read the optional .env filename if specified on command line using -e flag
# default is .env
ENV_FILE=".env"

# Default values
FORCE_REMOVE=false

# Extra read-only bind mounts collected from the -m flag.
declare -a MOUNT_DIRS=()

while getopts ":e:fm:" opt; do
  case ${opt} in
    e )
      ENV_FILE=$OPTARG
      # if the file does not exist, exit
      if [ ! -f "${BASE_DIR}"/"${ENV_FILE}" ]; then
        echo "Specified env file not found: ${BASE_DIR}/${ENV_FILE}"
        exit 1
      fi
      ;;
    f )
      FORCE_REMOVE=true
      ;;
    m )
      # Split multiple mount directories by comma and add to array
      IFS=',' read -ra MOUNTS <<< "$OPTARG"
      for mount in "${MOUNTS[@]}"; do
        MOUNT_DIRS+=("$mount")
      done
      ;;
    \? )
      echo "Usage: $0 [-e <env_file>] [-f] [-m <mount_dirs>]"
      echo "  -e <env_file>    Specify environment file (default: ${BASE_DIR}/.env)"
      echo "  -f               Force remove existing container"
      echo "  -m <mount_dirs>  Specify comma-separated directories to mount (format: src:dest[,src2:dest2,...])"
      exit 1
      ;;
  esac
done

# Build the -v options as an array so that mount paths containing spaces
# survive word splitting when passed to docker run.
declare -a mount_opts=()
if [ ${#MOUNT_DIRS[@]} -gt 0 ]; then
  for mount in "${MOUNT_DIRS[@]}"; do
    mount_opts+=(-v "${mount}:ro")
  done
fi

org_name="leettools"
app_name="leettools"
container_name="leettools"

# Exact-name container checks: plain `docker ps | grep` would also match
# substrings (e.g. "leettools-web") or image names in the listing.
container_running() {
  docker ps --format '{{.Names}}' | grep -qx "$container_name"
}

container_exists() {
  docker ps -a --format '{{.Names}}' | grep -qx "$container_name"
}

# If force remove is enabled, stop and remove existing container
if [ "$FORCE_REMOVE" = true ]; then
  if container_running; then
    echo "Stopping running $container_name container..."
    docker stop "$container_name" 2>/dev/null || true
  fi
  if container_exists; then
    echo "Removing $container_name container..."
    docker rm "$container_name" 2>/dev/null || true
  fi
else
  if container_exists; then
    echo -e "\033[1;33mWarning:\033[0m $container_name container already exists" >&2
    echo -e "\033[1;33mSolution:\033[0m Use -f flag to force remove existing container" >&2
    exit 1
  fi
fi

# read the version number from the pyproject.toml file and use it for the
# docker image tag
version=$(grep "^version = " "$BASE_DIR"/pyproject.toml | cut -d'"' -f2)

# check if the version number is valid (semantic version x.y.z)
if [[ ! "$version" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
  echo "Invalid version number: $version"
  exit 1
fi

# check if the docker image exists; fall back to the "latest" tag
if ! docker images "${org_name}/${app_name}":"${version}" | grep -q "${version}"; then
  echo "Docker image ${org_name}/${app_name}:${version} does not exist"
  version="latest"
  if ! docker images "${org_name}/${app_name}":"${version}" | grep -q "${version}"; then
    echo "Docker image ${org_name}/${app_name}:${version} does not exist"
    exit 1
  fi
  echo "Using latest version instead"
fi

# Check if the .env file exists in the root directory
declare -a envfile_opt=()
if [ -f "${BASE_DIR}"/"${ENV_FILE}" ]; then
  # Load environment variables from the .env file into this shell so the
  # LEET_HOME / EDS_* fallbacks below can see user-provided values.
  # NOTE(review): values are exported verbatim; quoted values keep their
  # quotes — consistent with how docker's --env-file reads the same file.
  while IFS='=' read -r name value; do
    if [[ ! $name =~ ^\# ]] && [[ -n $name ]]; then
      export "$name=$value";
    fi;
  done < "${BASE_DIR}"/"${ENV_FILE}"
  envfile_opt=(--env-file "${BASE_DIR}/${ENV_FILE}")
fi

# check if the LEET_HOME, EDS_DATA_DIR, and EDS_LOG_DIR environment variables are set
if [ -z "${LEET_HOME:-}" ]; then
  case "$(uname -s)" in
    Darwin|Linux)
      LEET_HOME=~/leettools
      ;;
    CYGWIN*|MINGW*|MSYS*)
      LEET_HOME="$USERPROFILE/leettools"
      ;;
    *)
      echo "Unsupported operating system, using the value from .env file"
      ;;
  esac
  echo "LEET_HOME is not set, using the default value: $LEET_HOME"
  export LEET_HOME="$LEET_HOME"
fi

if [ -z "${EDS_DATA_DIR:-}" ]; then
  EDS_DATA_DIR="${LEET_HOME}/data"
fi

if [ -z "${EDS_LOG_DIR:-}" ]; then
  EDS_LOG_DIR="${LEET_HOME}/logs"
fi

if [ -z "${EDS_API_SERVICE_PORT:-}" ]; then
  EDS_API_SERVICE_PORT=8000
fi

# run the docker container as a service with port 8000:8000
# mount the data directory at $LEET_HOME, $EDS_DATA_DIR, $EDS_LOG_DIR
# run the docker container with the .env file if present
leet_home_in_docker="/leettools"

# ${arr[@]+...} guards keep `set -u` happy on bash < 4.4 when arrays are empty.
echo "Running docker container with command:"
echo "docker run -d ${envfile_opt[@]+"${envfile_opt[@]}"} ${mount_opts[@]+"${mount_opts[@]}"} \\
    --name \"$container_name\" \\
    -p \"$EDS_API_SERVICE_PORT\":\"$EDS_API_SERVICE_PORT\" \\
    -e LEET_HOME=\"$leet_home_in_docker\" \\
    -v \"$LEET_HOME\":\"$leet_home_in_docker\" \\
    -v \"$EDS_DATA_DIR\":\"$leet_home_in_docker/data\" \\
    -v \"$EDS_LOG_DIR\":\"$leet_home_in_docker/logs\" \\
    ${org_name}/${app_name}:\"${version}\""

# run the docker container
docker run -d ${envfile_opt[@]+"${envfile_opt[@]}"} ${mount_opts[@]+"${mount_opts[@]}"} \
    --name "$container_name" \
    -p "$EDS_API_SERVICE_PORT":"$EDS_API_SERVICE_PORT" \
    -e LEET_HOME="$leet_home_in_docker" \
    -v "$LEET_HOME":"$leet_home_in_docker" \
    -v "$EDS_DATA_DIR":"$leet_home_in_docker/data" \
    -v "$EDS_LOG_DIR":"$leet_home_in_docker/logs" \
    ${org_name}/${app_name}:"${version}"
6 changes: 4 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "leettools"
version = "1.0.17"
version = "1.1.0"
authors = [
{ name="LeetTools-dev", email="leettools@gmail.com" },
]
Expand Down Expand Up @@ -44,8 +44,10 @@ dependencies = [
"docling_core==2.23.0",
"chonkie==0.5.1",
"langchain-community==0.3.15",
"firecrawl-py==1.12.0",
"langid==1.1.6",
"nest-asyncio==1.6.0",
"lxml==5.3.0",
"firecrawl-py==1.12.0",
]

[project.urls]
Expand Down
12 changes: 12 additions & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# dependencies for development

pytest==7.3.1
pytest-asyncio==0.23.6
pytest-cov==6.0.0
pytest-mock==3.14.0
ragas==0.2.10
langchain_openai==0.3.5

# benchmark
# conflicts with tiktoken 0.7.0
# tonic-validate==4.0.3
8 changes: 8 additions & 0 deletions requirements-ext.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Video News
api.video==1.4.1
edge-tts==6.1.12
google-cloud-texttospeech==2.16.3
moviepy==1.0.3
pillow==10.4.0
replicate==0.32.0
PyVimeo==1.1.2
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ psutil==5.9.8
tldextract==5.1.3
urllib3==2.2.3
Babel==2.16.0

# dev
beautifulsoup4==4.12.3
openai==1.59.7
Expand All @@ -27,7 +28,6 @@ docling==2.26.0
docling_core==2.23.0
chonkie==0.5.1
langchain-community==0.3.15
firecrawl-py==1.12.0
langid==1.1.6


48 changes: 13 additions & 35 deletions src/leettools/common/utils/tokenizer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import List, Optional
from typing import List

from leettools.common import exceptions
from leettools.common.logging import logger
from leettools.context_manager import Context
from leettools.settings import SystemSettings

Expand All @@ -10,38 +9,17 @@ class Tokenizer:
def __init__(self, settings: SystemSettings):
self.settings = settings

def token_count(self, text: str, model_name: Optional[str] = None) -> int:
if model_name is None:
model_name = self.settings.DEFAULT_INFERENCE_MODEL
def est_token_count(self, text: str) -> int:
"""
Estimate the token count of the text, since we do not want to import
transformers in the common module.
"""
import tiktoken

if model_name.startswith("gpt") or model_name.startswith("o1"):
import tiktoken

encoding = tiktoken.encoding_for_model(model_name)
return len(encoding.encode(text))
elif model_name.startswith("qwen"):
# The tokenizer from Qwen models is really slow, so we don't use it now:
# from dashscope import get_tokenizer
# tokenizer = get_tokenizer(model_name)
return len(text)
elif model_name.startswith("deepseek"):
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-V3")
tokens = tokenizer.tokenize(text)
return len(tokens)
# return len(text)
elif model_name.startswith("llama"):
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-3B")
tokens = tokenizer.tokenize(text)
return len(tokens)
else:
logger().warning(
f"Unknown model name: {model_name}, using text length as token count."
)
return len(text)
# We use gpt-3.5 as the default model for estimating the token count
model_name = "gpt-3.5"
encoding = tiktoken.encoding_for_model(model_name)
return len(encoding.encode(text))

def split_text(self, text: str, num_parts: int) -> List[str]:
words = text.split() # Split the text into words
Expand Down Expand Up @@ -76,10 +54,10 @@ def split_text(self, text: str, num_parts: int) -> List[str]:

tokenizer = Tokenizer(context.settings)
text = "This is a test sentence."
print(tokenizer.token_count(text, "gpt-4o-mini"))
print(tokenizer.est_token_count(text))

text = "This is a test sentence; This is another test sentence."
num_parts = 2
parts = Tokenizer.split_text(text, num_parts)
parts = tokenizer.split_text(text, num_parts)
for part in parts:
print(part)
2 changes: 1 addition & 1 deletion src/leettools/eds/pipeline/chunk/_impl/chunker_simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def _add_chunk(self, state: _ChunkState) -> None:
state.start_offset = state.end_offset

def _get_chunk_size(self, text: str) -> int:
return self.tokenizer.token_count(text)
return self.tokenizer.est_token_count(text)

def _check_chunk_size(self, state: _ChunkState, line: str) -> None:
combined_content = state.chunk_content + line
Expand Down
4 changes: 2 additions & 2 deletions src/leettools/eds/pipeline/split/splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ def _split(self, doc: Document) -> ReturnCode:
"""
if self.kb.enable_contextual_retrieval:
if (
self.tokenizer.token_count(doc.content)
self.tokenizer.est_token_count(doc.content)
< self.settings.DEFAULT_CONTEXT_LIMIT
):
logger().info(
Expand All @@ -246,7 +246,7 @@ def _split(self, doc: Document) -> ReturnCode:
context_token_count = 0
logger().info("Combining chunks for contextual retrieval")
for chunk in chunks:
chunk_token_count = self.tokenizer.token_count(chunk.content)
chunk_token_count = self.tokenizer.est_token_count(chunk.content)
if (
context_token_count + chunk_token_count
< self.settings.DEFAULT_CONTEXT_LIMIT
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def _get_context(
"segment store, maybe from a deleted document."
)
continue
segment_token_count = self.tokenizer.token_count(segment.content)
segment_token_count = self.tokenizer.est_token_count(segment.content)
if (context_token_count + segment_token_count) > context_limit:
self.display_logger.info(
f"Rewrite: Context token count exceeds {context_limit}. "
Expand Down
4 changes: 2 additions & 2 deletions src/leettools/flow/steps/step_extend_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def run_step(

if chat_history_str is not None and chat_history_str != "":
extended_context = f"Here is the chat history:\n{chat_history_str}\n"
context_token_count = tokenizer.token_count(extended_context)
context_token_count = tokenizer.est_token_count(extended_context)
display_logger.debug(
f"Extended context chat_history token count: {context_token_count}"
)
Expand Down Expand Up @@ -223,7 +223,7 @@ def run_step(
segments_set=segments_set,
)

segment_token_count = tokenizer.token_count(segment_content)
segment_token_count = tokenizer.est_token_count(segment_content)
if (context_token_count + segment_token_count) > context_limit:
display_logger.info(
f"Reference token count exceeds {context_limit}. "
Expand Down
Loading