Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -64,17 +64,21 @@ RUN pip install --no-cache-dir tiktoken==0.8.0

RUN pip install --no-cache-dir duckdb==1.1.3

RUN pip install --no-cache-dir docling==2.25.2
RUN pip install --no-cache-dir docling==2.26.0

RUN pip install --no-cache-dir docling_core==2.21.1
RUN pip install --no-cache-dir docling_core==2.23.0

RUN pip install --no-cache-dir chonkie==0.5.1

RUN pip install --no-cache-dir langchain-community==0.3.15

RUN pip install --no-cache-dir langid==1.1.6

RUN pip install --no-cache-dir nest-asyncio==1.6.0

RUN pip install --no-cache-dir firecrawl-py==1.12.0

RUN pip install --no-cache-dir langid==1.1.6
RUN pip install --no-cache-dir lxml==5.3.0


# it seems that the version of apache2-utils is not stable
Expand All @@ -99,5 +103,6 @@ RUN pip install -e . \
EXPOSE ${EDS_API_SERVICE_PORT}

ENV LOG_OUTPUT="console"
ENV GIT_SHA=$GIT_SHA

CMD ["./scripts/run.sh"]
2 changes: 2 additions & 0 deletions docker/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ cd "$BASE_DIR"
cp "$DIR"/dockerignore.template "$BASE_DIR"/.dockerignore
cat "$BASE_DIR"/.gitignore >> "$BASE_DIR"/.dockerignore

export GIT_SHA=$(git rev-parse HEAD)

docker build -t "${org_name}/${app_name}":"${version}" -f "$DIR"/Dockerfile "$BASE_DIR"

# tag the image with the latest version
Expand Down
171 changes: 171 additions & 0 deletions docker/start_single_docker.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
#!/bin/bash

# Start the docker container for the leettools service (single-node deployment).
#
# Usage: start_single_docker.sh [-e <env_file>] [-f] [-m <src:dest[,src2:dest2,...]>]
#   -e <env_file>    environment file relative to the repo root (default: .env)
#   -f               force stop/remove an existing container with the same name
#   -m <mount_dirs>  comma-separated extra read-only bind mounts (src:dest)

set -e -u

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
BASE_DIR="$(cd "$DIR"/.. && pwd)"

# read the optional .env filename if specified on command line using -e flag
# default is .env
ENV_FILE=".env"

# Default values
FORCE_REMOVE=false

# Extra read-only bind mounts collected from the -m flag.
declare -a MOUNT_DIRS=()

while getopts ":e:fm:" opt; do
  case ${opt} in
    e )
      ENV_FILE=$OPTARG
      # if the file does not exist, exit
      if [ ! -f "${BASE_DIR}"/"${ENV_FILE}" ]; then
        echo "Specified env file not found: ${BASE_DIR}/${ENV_FILE}"
        exit 1
      fi
      ;;
    f )
      FORCE_REMOVE=true
      ;;
    m )
      # Split multiple mount directories by comma and add to array
      IFS=',' read -ra MOUNTS <<< "$OPTARG"
      for mount in "${MOUNTS[@]}"; do
        MOUNT_DIRS+=("$mount")
      done
      ;;
    \? )
      echo "Usage: $0 [-e <env_file>] [-f] [-m <mount_dirs>]"
      echo "  -e <env_file>    Specify environment file (default: ${BASE_DIR}/.env)"
      echo "  -f               Force remove existing container"
      echo "  -m <mount_dirs>  Specify comma-separated directories to mount (format: src:dest[,src2:dest2,...])"
      exit 1
      ;;
  esac
done

# Build the -v options as an array so that mount paths containing spaces
# survive word splitting when passed to docker run.
declare -a mount_opts=()
if [ ${#MOUNT_DIRS[@]} -gt 0 ]; then
  for mount in "${MOUNT_DIRS[@]}"; do
    mount_opts+=(-v "${mount}:ro")
  done
fi

org_name="leettools"
app_name="leettools"
container_name="leettools"

# Exact-name container checks: plain `docker ps | grep` would also match
# substrings (e.g. "leettools-web") or image names in the listing.
container_running() {
  docker ps --format '{{.Names}}' | grep -qx "$container_name"
}

container_exists() {
  docker ps -a --format '{{.Names}}' | grep -qx "$container_name"
}

# If force remove is enabled, stop and remove existing container
if [ "$FORCE_REMOVE" = true ]; then
  if container_running; then
    echo "Stopping running $container_name container..."
    docker stop "$container_name" 2>/dev/null || true
  fi
  if container_exists; then
    echo "Removing $container_name container..."
    docker rm "$container_name" 2>/dev/null || true
  fi
else
  if container_exists; then
    echo -e "\033[1;33mWarning:\033[0m $container_name container already exists" >&2
    echo -e "\033[1;33mSolution:\033[0m Use -f flag to force remove existing container" >&2
    exit 1
  fi
fi

# read the version number from the pyproject.toml file and use it for the
# docker image tag
version=$(grep "^version = " "$BASE_DIR"/pyproject.toml | cut -d'"' -f2)

# check if the version number is valid (semantic version x.y.z)
if [[ ! "$version" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
  echo "Invalid version number: $version"
  exit 1
fi

# check if the docker image exists; fall back to the "latest" tag
if ! docker images "${org_name}/${app_name}":"${version}" | grep -q "${version}"; then
  echo "Docker image ${org_name}/${app_name}:${version} does not exist"
  version="latest"
  if ! docker images "${org_name}/${app_name}":"${version}" | grep -q "${version}"; then
    echo "Docker image ${org_name}/${app_name}:${version} does not exist"
    exit 1
  fi
  echo "Using latest version instead"
fi

# Check if the .env file exists in the root directory
declare -a envfile_opt=()
if [ -f "${BASE_DIR}"/"${ENV_FILE}" ]; then
  # Load environment variables from the .env file into this shell so the
  # LEET_HOME / EDS_* fallbacks below can see user-provided values.
  # NOTE(review): values are exported verbatim; quoted values keep their
  # quotes — consistent with how docker's --env-file reads the same file.
  while IFS='=' read -r name value; do
    if [[ ! $name =~ ^\# ]] && [[ -n $name ]]; then
      export "$name=$value";
    fi;
  done < "${BASE_DIR}"/"${ENV_FILE}"
  envfile_opt=(--env-file "${BASE_DIR}/${ENV_FILE}")
fi

# check if the LEET_HOME, EDS_DATA_DIR, and EDS_LOG_DIR environment variables are set
if [ -z "${LEET_HOME:-}" ]; then
  case "$(uname -s)" in
    Darwin|Linux)
      LEET_HOME=~/leettools
      ;;
    CYGWIN*|MINGW*|MSYS*)
      LEET_HOME="$USERPROFILE/leettools"
      ;;
    *)
      echo "Unsupported operating system, using the value from .env file"
      ;;
  esac
  echo "LEET_HOME is not set, using the default value: $LEET_HOME"
  export LEET_HOME="$LEET_HOME"
fi

if [ -z "${EDS_DATA_DIR:-}" ]; then
  EDS_DATA_DIR="${LEET_HOME}/data"
fi

if [ -z "${EDS_LOG_DIR:-}" ]; then
  EDS_LOG_DIR="${LEET_HOME}/logs"
fi

if [ -z "${EDS_API_SERVICE_PORT:-}" ]; then
  EDS_API_SERVICE_PORT=8000
fi

# run the docker container as a service with port 8000:8000
# mount the data directory at $LEET_HOME, $EDS_DATA_DIR, $EDS_LOG_DIR
# run the docker container with the .env file if present
leet_home_in_docker="/leettools"

# ${arr[@]+...} guards keep `set -u` happy on bash < 4.4 when arrays are empty.
echo "Running docker container with command:"
echo "docker run -d ${envfile_opt[@]+"${envfile_opt[@]}"} ${mount_opts[@]+"${mount_opts[@]}"} \\
    --name \"$container_name\" \\
    -p \"$EDS_API_SERVICE_PORT\":\"$EDS_API_SERVICE_PORT\" \\
    -e LEET_HOME=\"$leet_home_in_docker\" \\
    -v \"$LEET_HOME\":\"$leet_home_in_docker\" \\
    -v \"$EDS_DATA_DIR\":\"$leet_home_in_docker/data\" \\
    -v \"$EDS_LOG_DIR\":\"$leet_home_in_docker/logs\" \\
    ${org_name}/${app_name}:\"${version}\""

# run the docker container
docker run -d ${envfile_opt[@]+"${envfile_opt[@]}"} ${mount_opts[@]+"${mount_opts[@]}"} \
    --name "$container_name" \
    -p "$EDS_API_SERVICE_PORT":"$EDS_API_SERVICE_PORT" \
    -e LEET_HOME="$leet_home_in_docker" \
    -v "$LEET_HOME":"$leet_home_in_docker" \
    -v "$EDS_DATA_DIR":"$leet_home_in_docker/data" \
    -v "$EDS_LOG_DIR":"$leet_home_in_docker/logs" \
    ${org_name}/${app_name}:"${version}"
6 changes: 4 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "leettools"
version = "1.0.17"
version = "1.1.0"
authors = [
{ name="LeetTools-dev", email="leettools@gmail.com" },
]
Expand Down Expand Up @@ -44,8 +44,10 @@ dependencies = [
"docling_core==2.23.0",
"chonkie==0.5.1",
"langchain-community==0.3.15",
"firecrawl-py==1.12.0",
"langid==1.1.6",
"nest-asyncio==1.6.0",
"lxml==5.3.0",
"firecrawl-py==1.12.0",
]

[project.urls]
Expand Down
12 changes: 12 additions & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# dependencies for development

pytest==7.3.1
pytest-asyncio==0.23.6
pytest-cov==6.0.0
pytest-mock==3.14.0
ragas==0.2.10
langchain_openai==0.3.5

# benchmark
# conflicts with tiktoken 0.7.0
# tonic-validate==4.0.3
8 changes: 8 additions & 0 deletions requirements-ext.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Video News
api.video==1.4.1
edge-tts==6.1.12
google-cloud-texttospeech==2.16.3
moviepy==1.0.3
pillow==10.4.0
replicate==0.32.0
PyVimeo==1.1.2
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ psutil==5.9.8
tldextract==5.1.3
urllib3==2.2.3
Babel==2.16.0

# dev
beautifulsoup4==4.12.3
openai==1.59.7
Expand All @@ -27,7 +28,6 @@ docling==2.26.0
docling_core==2.23.0
chonkie==0.5.1
langchain-community==0.3.15
firecrawl-py==1.12.0
langid==1.1.6


48 changes: 13 additions & 35 deletions src/leettools/common/utils/tokenizer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import List, Optional
from typing import List

from leettools.common import exceptions
from leettools.common.logging import logger
from leettools.context_manager import Context
from leettools.settings import SystemSettings

Expand All @@ -10,38 +9,17 @@ class Tokenizer:
def __init__(self, settings: SystemSettings):
self.settings = settings

def token_count(self, text: str, model_name: Optional[str] = None) -> int:
if model_name is None:
model_name = self.settings.DEFAULT_INFERENCE_MODEL
def est_token_count(self, text: str) -> int:
"""
Estimate the token count of the text, since we do not want to import
transformers in the common module.
"""
import tiktoken

if model_name.startswith("gpt") or model_name.startswith("o1"):
import tiktoken

encoding = tiktoken.encoding_for_model(model_name)
return len(encoding.encode(text))
elif model_name.startswith("qwen"):
# The tokenizer from Qwen models is really slow, so we don't use it now:
# from dashscope import get_tokenizer
# tokenizer = get_tokenizer(model_name)
return len(text)
elif model_name.startswith("deepseek"):
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-V3")
tokens = tokenizer.tokenize(text)
return len(tokens)
# return len(text)
elif model_name.startswith("llama"):
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-3B")
tokens = tokenizer.tokenize(text)
return len(tokens)
else:
logger().warning(
f"Unknown model name: {model_name}, using text length as token count."
)
return len(text)
# We use gpt-3.5 as the default model for estimating the token count
model_name = "gpt-3.5"
encoding = tiktoken.encoding_for_model(model_name)
return len(encoding.encode(text))

def split_text(self, text: str, num_parts: int) -> List[str]:
words = text.split() # Split the text into words
Expand Down Expand Up @@ -76,10 +54,10 @@ def split_text(self, text: str, num_parts: int) -> List[str]:

tokenizer = Tokenizer(context.settings)
text = "This is a test sentence."
print(tokenizer.token_count(text, "gpt-4o-mini"))
print(tokenizer.est_token_count(text))

text = "This is a test sentence; This is another test sentence."
num_parts = 2
parts = Tokenizer.split_text(text, num_parts)
parts = tokenizer.split_text(text, num_parts)
for part in parts:
print(part)
2 changes: 1 addition & 1 deletion src/leettools/eds/pipeline/chunk/_impl/chunker_simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def _add_chunk(self, state: _ChunkState) -> None:
state.start_offset = state.end_offset

def _get_chunk_size(self, text: str) -> int:
return self.tokenizer.token_count(text)
return self.tokenizer.est_token_count(text)

def _check_chunk_size(self, state: _ChunkState, line: str) -> None:
combined_content = state.chunk_content + line
Expand Down
4 changes: 2 additions & 2 deletions src/leettools/eds/pipeline/split/splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ def _split(self, doc: Document) -> ReturnCode:
"""
if self.kb.enable_contextual_retrieval:
if (
self.tokenizer.token_count(doc.content)
self.tokenizer.est_token_count(doc.content)
< self.settings.DEFAULT_CONTEXT_LIMIT
):
logger().info(
Expand All @@ -246,7 +246,7 @@ def _split(self, doc: Document) -> ReturnCode:
context_token_count = 0
logger().info("Combining chunks for contextual retrieval")
for chunk in chunks:
chunk_token_count = self.tokenizer.token_count(chunk.content)
chunk_token_count = self.tokenizer.est_token_count(chunk.content)
if (
context_token_count + chunk_token_count
< self.settings.DEFAULT_CONTEXT_LIMIT
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def _get_context(
"segment store, maybe from a deleted document."
)
continue
segment_token_count = self.tokenizer.token_count(segment.content)
segment_token_count = self.tokenizer.est_token_count(segment.content)
if (context_token_count + segment_token_count) > context_limit:
self.display_logger.info(
f"Rewrite: Context token count exceeds {context_limit}. "
Expand Down
4 changes: 2 additions & 2 deletions src/leettools/flow/steps/step_extend_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def run_step(

if chat_history_str is not None and chat_history_str != "":
extended_context = f"Here is the chat history:\n{chat_history_str}\n"
context_token_count = tokenizer.token_count(extended_context)
context_token_count = tokenizer.est_token_count(extended_context)
display_logger.debug(
f"Extended context chat_history token count: {context_token_count}"
)
Expand Down Expand Up @@ -223,7 +223,7 @@ def run_step(
segments_set=segments_set,
)

segment_token_count = tokenizer.token_count(segment_content)
segment_token_count = tokenizer.est_token_count(segment_content)
if (context_token_count + segment_token_count) > context_limit:
display_logger.info(
f"Reference token count exceeds {context_limit}. "
Expand Down
Loading