diff --git a/docker/Dockerfile b/docker/Dockerfile index be2b083..bf78d94 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -64,17 +64,21 @@ RUN pip install --no-cache-dir tiktoken==0.8.0 RUN pip install --no-cache-dir duckdb==1.1.3 -RUN pip install --no-cache-dir docling==2.25.2 +RUN pip install --no-cache-dir docling==2.26.0 -RUN pip install --no-cache-dir docling_core==2.21.1 +RUN pip install --no-cache-dir docling_core==2.23.0 RUN pip install --no-cache-dir chonkie==0.5.1 RUN pip install --no-cache-dir langchain-community==0.3.15 +RUN pip install --no-cache-dir langid==1.1.6 + +RUN pip install --no-cache-dir nest-asyncio==1.6.0 + RUN pip install --no-cache-dir firecrawl-py==1.12.0 -RUN pip install --no-cache-dir langid==1.1.6 +RUN pip install --no-cache-dir lxml==5.3.0 # it seems that the version of apache2-utils is not stable @@ -99,5 +103,6 @@ RUN pip install -e . \ EXPOSE ${EDS_API_SERVICE_PORT} ENV LOG_OUTPUT="console" +ENV GIT_SHA=$GIT_SHA CMD ["./scripts/run.sh"] \ No newline at end of file diff --git a/docker/build.sh b/docker/build.sh index 56287ca..4af28ce 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -25,6 +25,8 @@ cd "$BASE_DIR" cp "$DIR"/dockerignore.template "$BASE_DIR"/.dockerignore cat "$BASE_DIR"/.gitignore >> "$BASE_DIR"/.dockerignore +export GIT_SHA=$(git rev-parse HEAD) + docker build -t "${org_name}/${app_name}":"${version}" -f "$DIR"/Dockerfile "$BASE_DIR" # tag the image with the latest version diff --git a/docker/start_single_docker.sh b/docker/start_single_docker.sh new file mode 100644 index 0000000..f3dea9b --- /dev/null +++ b/docker/start_single_docker.sh @@ -0,0 +1,171 @@ +#!/bin/bash + +# this program is used to start the docker container for the leettools service + +set -e -u + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +BASE_DIR="$(cd "$DIR"/.. 
&& pwd)"

# read the optional .env filename if specified on command line using -e flag
# default is .env
ENV_FILE=".env"

# Default values
FORCE_REMOVE=false

# Initialize MOUNT_DIRS as an array
declare -a MOUNT_DIRS=()

while getopts ":e:fm:" opt; do
    case ${opt} in
        e )
            ENV_FILE=$OPTARG
            # if the file does not exist, exit
            if [ ! -f "${BASE_DIR}"/"${ENV_FILE}" ]; then
                echo "Specified env file not found: ${BASE_DIR}/${ENV_FILE}"
                exit 1
            fi
            ;;
        f )
            FORCE_REMOVE=true
            ;;
        m )
            # Split multiple mount directories by comma and add to array
            IFS=',' read -ra MOUNTS <<< "$OPTARG"
            for mount in "${MOUNTS[@]}"; do
                MOUNT_DIRS+=("$mount")
            done
            ;;
        \? )
            echo "Usage: $0 [-e <env_file>] [-f] [-m <mount_dirs>]"
            echo "  -e <env_file>    Specify environment file (default: ${BASE_DIR}/.env)"
            echo "  -f               Force remove existing container"
            echo "  -m <mount_dirs>  Specify comma-separated directories to mount (format: src:dest[,src2:dest2,...])"
            exit 1
            ;;
    esac
done

# convert MOUNT_DIRS array to mount options
mount_opts=""
if [ ${#MOUNT_DIRS[@]} -gt 0 ]; then
    for mount in "${MOUNT_DIRS[@]}"; do
        mount_opts+="-v $mount:ro "
    done
fi

org_name="leettools"
app_name="leettools"
container_name="leettools"

# If force remove is enabled, stop and remove existing container
if [ "$FORCE_REMOVE" = true ]; then
    # Check if container exists and is running
    if docker ps | grep -q "$container_name"; then
        echo "Stopping running $container_name container..."
        docker stop "$container_name" 2>/dev/null || true
    fi
    # Check if container exists (running or not)
    if docker ps -a | grep -q "$container_name"; then
        echo "Removing $container_name container..."
+ docker rm "$container_name" 2>/dev/null || true + fi +else + # Check if container exists (running or not) + if docker ps -a | grep -q "$container_name"; then + echo -e "\033[1;33mWarning:\033[0m $container_name container already exists" >&2 + echo -e "\033[1;33mSolution:\033[0m Use -f flag to force remove existing container" >&2 + exit 1 + fi +fi + +# read the version number project.toml file and use in the docker image +version=$(grep "^version = " "$BASE_DIR"/pyproject.toml | cut -d'"' -f2) + +# check if the version number is valid +if [[ ! "$version" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + echo "Invalid version number: $version" + exit 1 +fi + +# check if the docker image exists +if ! docker images "${org_name}/${app_name}":"${version}" | grep -q "${version}"; then + echo "Docker image ${org_name}/${app_name}:${version} does not exist" + version="latest" + if ! docker images "${org_name}/${app_name}":"${version}" | grep -q "${version}"; then + echo "Docker image leettools-dev/leettools:${version} does not exist" + exit 1 + fi + echo "Using latest version instead" +fi + +# Check if the .env file exists in the root directory +if [ -f "${BASE_DIR}"/"${ENV_FILE}" ]; then + #Load environment variables from .env file + while IFS='=' read -r name value; do + if [[ ! 
$name =~ ^\# ]] && [[ -n $name ]]; then + # echo "$name" "$value"; + export "$name=$value"; + fi; + done < "${BASE_DIR}"/"${ENV_FILE}" + envfile_opt="--env-file ${BASE_DIR}/${ENV_FILE}" +else + envfile_opt="" +fi + +# check if the LEET_HOME, EDS_DATA_DIR, and EDS_LOG_DIR environment variables are set +if [ -z "${LEET_HOME:-}" ]; then + case "$(uname -s)" in + Darwin|Linux) + LEET_HOME=~/leettools + ;; + CYGWIN*|MINGW*|MSYS*) + LEET_HOME="$USERPROFILE/leettools" + ;; + *) + echo "Unsupported operating system, using the value from .env file" + ;; + esac + echo "LEET_HOME is not set, using the default value: $LEET_HOME" + export LEET_HOME="$LEET_HOME" +fi + +if [ -z "${EDS_DATA_DIR:-}" ]; then + EDS_DATA_DIR="${LEET_HOME}/data" +fi + +if [ -z "${EDS_LOG_DIR:-}" ]; then + EDS_LOG_DIR="${LEET_HOME}/logs" +fi + +if [ -z "${EDS_API_SERVICE_PORT:-}" ]; then + EDS_API_SERVICE_PORT=8000 +fi + +# run the docker container as a service with port 8000:8000 +# mount the data directory at $LEET_HOME, $EDS_DATA_DIR, $EDS_LOG_DIR +# run the docker container with the .env.docker file +leet_home_in_docker="/leettools" + +# print the docker run command +echo "Running docker container with command:" +echo "docker run -d ${envfile_opt} ${mount_opts} \\ + --name \"$container_name\" \\ + -p \"$EDS_API_SERVICE_PORT\":\"$EDS_API_SERVICE_PORT\" \\ + -e LEET_HOME=\"$leet_home_in_docker\" \\ + -v \"$LEET_HOME\":\"$leet_home_in_docker\" \\ + -v \"$EDS_DATA_DIR\":\"$leet_home_in_docker/data\" \\ + -v \"$EDS_LOG_DIR\":\"$leet_home_in_docker/logs\" \\ + ${org_name}/${app_name}:\"${version}\"" + +# run the docker container +# shellcheck disable=SC2086 +docker run -d ${envfile_opt} ${mount_opts} \ + --name "$container_name" \ + -p "$EDS_API_SERVICE_PORT":"$EDS_API_SERVICE_PORT" \ + -e LEET_HOME="$leet_home_in_docker" \ + -v "$LEET_HOME":"$leet_home_in_docker" \ + -v "$EDS_DATA_DIR":"$leet_home_in_docker/data" \ + -v "$EDS_LOG_DIR":"$leet_home_in_docker/logs" \ + ${org_name}/${app_name}:"${version}" \ 
No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 88910a1..9753c26 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "leettools" -version = "1.0.17" +version = "1.1.0" authors = [ { name="LeetTools-dev", email="leettools@gmail.com" }, ] @@ -44,8 +44,10 @@ dependencies = [ "docling_core==2.23.0", "chonkie==0.5.1", "langchain-community==0.3.15", - "firecrawl-py==1.12.0", "langid==1.1.6", + "nest-asyncio==1.6.0", + "lxml==5.3.0", + "firecrawl-py==1.12.0", ] [project.urls] diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..0886b76 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,12 @@ +# dependencies for development + +pytest==7.3.1 +pytest-asyncio==0.23.6 +pytest-cov==6.0.0 +pytest-mock==3.14.0 +ragas==0.2.10 +langchain_openai==0.3.5 + +# benchmark +# conflicts with tiktoken 0.7.0 +# tonic-validate==4.0.3 \ No newline at end of file diff --git a/requirements-ext.txt b/requirements-ext.txt new file mode 100644 index 0000000..c303f61 --- /dev/null +++ b/requirements-ext.txt @@ -0,0 +1,8 @@ +# Video News +api.video==1.4.1 +edge-tts==6.1.12 +google-cloud-texttospeech==2.16.3 +moviepy==1.0.3 +pillow==10.4.0 +replicate==0.32.0 +PyVimeo==1.1.2 diff --git a/requirements.txt b/requirements.txt index a3d70b1..72eee5a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,6 +16,7 @@ psutil==5.9.8 tldextract==5.1.3 urllib3==2.2.3 Babel==2.16.0 + # dev beautifulsoup4==4.12.3 openai==1.59.7 @@ -27,7 +28,6 @@ docling==2.26.0 docling_core==2.23.0 chonkie==0.5.1 langchain-community==0.3.15 -firecrawl-py==1.12.0 langid==1.1.6 diff --git a/src/leettools/common/utils/tokenizer.py b/src/leettools/common/utils/tokenizer.py index 329966a..43af8e5 100644 --- a/src/leettools/common/utils/tokenizer.py +++ b/src/leettools/common/utils/tokenizer.py @@ -1,7 +1,6 @@ -from typing import List, Optional +from typing import List from leettools.common import 
exceptions -from leettools.common.logging import logger from leettools.context_manager import Context from leettools.settings import SystemSettings @@ -10,38 +9,17 @@ class Tokenizer: def __init__(self, settings: SystemSettings): self.settings = settings - def token_count(self, text: str, model_name: Optional[str] = None) -> int: - if model_name is None: - model_name = self.settings.DEFAULT_INFERENCE_MODEL + def est_token_count(self, text: str) -> int: + """ + Estimate the token count of the text, since we do not want to import + transformers in the common module. + """ + import tiktoken - if model_name.startswith("gpt") or model_name.startswith("o1"): - import tiktoken - - encoding = tiktoken.encoding_for_model(model_name) - return len(encoding.encode(text)) - elif model_name.startswith("qwen"): - # The tokenizer from Qwen models is really slow, so we don't use it now: - # from dashscope import get_tokenizer - # tokenizer = get_tokenizer(model_name) - return len(text) - elif model_name.startswith("deepseek"): - from transformers import AutoTokenizer - - tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-V3") - tokens = tokenizer.tokenize(text) - return len(tokens) - # return len(text) - elif model_name.startswith("llama"): - from transformers import AutoTokenizer - - tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-3B") - tokens = tokenizer.tokenize(text) - return len(tokens) - else: - logger().warning( - f"Unknown model name: {model_name}, using text length as token count." 
- ) - return len(text) + # We use gpt-3.5 as the default model for estimating the token count + model_name = "gpt-3.5" + encoding = tiktoken.encoding_for_model(model_name) + return len(encoding.encode(text)) def split_text(self, text: str, num_parts: int) -> List[str]: words = text.split() # Split the text into words @@ -76,10 +54,10 @@ def split_text(self, text: str, num_parts: int) -> List[str]: tokenizer = Tokenizer(context.settings) text = "This is a test sentence." - print(tokenizer.token_count(text, "gpt-4o-mini")) + print(tokenizer.est_token_count(text)) text = "This is a test sentence; This is another test sentence." num_parts = 2 - parts = Tokenizer.split_text(text, num_parts) + parts = tokenizer.split_text(text, num_parts) for part in parts: print(part) diff --git a/src/leettools/eds/pipeline/chunk/_impl/chunker_simple.py b/src/leettools/eds/pipeline/chunk/_impl/chunker_simple.py index 6371091..7d6cc6b 100644 --- a/src/leettools/eds/pipeline/chunk/_impl/chunker_simple.py +++ b/src/leettools/eds/pipeline/chunk/_impl/chunker_simple.py @@ -79,7 +79,7 @@ def _add_chunk(self, state: _ChunkState) -> None: state.start_offset = state.end_offset def _get_chunk_size(self, text: str) -> int: - return self.tokenizer.token_count(text) + return self.tokenizer.est_token_count(text) def _check_chunk_size(self, state: _ChunkState, line: str) -> None: combined_content = state.chunk_content + line diff --git a/src/leettools/eds/pipeline/split/splitter.py b/src/leettools/eds/pipeline/split/splitter.py index 3b7465f..4af85a8 100644 --- a/src/leettools/eds/pipeline/split/splitter.py +++ b/src/leettools/eds/pipeline/split/splitter.py @@ -234,7 +234,7 @@ def _split(self, doc: Document) -> ReturnCode: """ if self.kb.enable_contextual_retrieval: if ( - self.tokenizer.token_count(doc.content) + self.tokenizer.est_token_count(doc.content) < self.settings.DEFAULT_CONTEXT_LIMIT ): logger().info( @@ -246,7 +246,7 @@ def _split(self, doc: Document) -> ReturnCode: context_token_count = 0 
logger().info("Combining chunks for contextual retrieval") for chunk in chunks: - chunk_token_count = self.tokenizer.token_count(chunk.content) + chunk_token_count = self.tokenizer.est_token_count(chunk.content) if ( context_token_count + chunk_token_count < self.settings.DEFAULT_CONTEXT_LIMIT diff --git a/src/leettools/eds/rag/rewrite/_impl/rewrite_keywords_dynamic.py b/src/leettools/eds/rag/rewrite/_impl/rewrite_keywords_dynamic.py index 218c005..6206497 100644 --- a/src/leettools/eds/rag/rewrite/_impl/rewrite_keywords_dynamic.py +++ b/src/leettools/eds/rag/rewrite/_impl/rewrite_keywords_dynamic.py @@ -126,7 +126,7 @@ def _get_context( "segment store, maybe from a deleted document." ) continue - segment_token_count = self.tokenizer.token_count(segment.content) + segment_token_count = self.tokenizer.est_token_count(segment.content) if (context_token_count + segment_token_count) > context_limit: self.display_logger.info( f"Rewrite: Context token count exceeds {context_limit}. " diff --git a/src/leettools/flow/steps/step_extend_context.py b/src/leettools/flow/steps/step_extend_context.py index edd294e..6605ec2 100644 --- a/src/leettools/flow/steps/step_extend_context.py +++ b/src/leettools/flow/steps/step_extend_context.py @@ -186,7 +186,7 @@ def run_step( if chat_history_str is not None and chat_history_str != "": extended_context = f"Here is the chat history:\n{chat_history_str}\n" - context_token_count = tokenizer.token_count(extended_context) + context_token_count = tokenizer.est_token_count(extended_context) display_logger.debug( f"Extended context chat_history token count: {context_token_count}" ) @@ -223,7 +223,7 @@ def run_step( segments_set=segments_set, ) - segment_token_count = tokenizer.token_count(segment_content) + segment_token_count = tokenizer.est_token_count(segment_content) if (context_token_count + segment_token_count) > context_limit: display_logger.info( f"Reference token count exceeds {context_limit}. "