diff --git a/README.md b/README.md index 7ee85b4..2bc03b7 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,9 @@ Currently LeetTools provides the following workflows: * extract : Extract and store structured data for given schema. [📖](https://leettools-dev.github.io/Flow/extract) * opinions: Generate sentiment analysis and facts from the search results. [📖](https://leettools-dev.github.io/Flow/opinions) +We are in the process of implementing a fully automated flow generation pipeline that allows +users to generate their own customized flows with natural language prompts. Stay tuned for more updates! + # Quick Start **Before you start** diff --git a/pyproject.toml b/pyproject.toml index 9753c26..d58f8d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,14 +35,14 @@ dependencies = [ "urllib3==2.2.3", "Babel==2.16.0", "beautifulsoup4==4.12.3", - "openai==1.59.7", + "openai==1.93.0", "scipy==1.14.1", "sentence_transformers==2.5.1", "tiktoken==0.8.0", "duckdb==1.1.3", - "docling==2.26.0", - "docling_core==2.23.0", - "chonkie==0.5.1", + "docling==2.39.0", + "docling_core==2.39.0", + "chonkie==1.0.10", "langchain-community==0.3.15", "langid==1.1.6", "nest-asyncio==1.6.0", diff --git a/requirements.txt b/requirements.txt index 72eee5a..acad78e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,14 +19,14 @@ Babel==2.16.0 # dev beautifulsoup4==4.12.3 -openai==1.59.7 +openai==1.93.0 scipy==1.14.1 sentence_transformers==2.5.1 tiktoken==0.8.0 duckdb==1.1.3 -docling==2.26.0 -docling_core==2.23.0 -chonkie==0.5.1 +docling==2.39.0 +docling_core==2.39.0 +chonkie==1.0.10 langchain-community==0.3.15 langid==1.1.6 diff --git a/scripts/format-diff.sh b/scripts/format-diff.sh new file mode 100755 index 0000000..789be98 --- /dev/null +++ b/scripts/format-diff.sh @@ -0,0 +1,289 @@ +#!/bin/bash + +# Script to format changed files according to VS Code settings +# Supports Python files (.py), Markdown files (.md), and Bash scripts (.sh) +# Checks diff against dev branch and 
formats all modified files +# +# Usage: +# ./scripts/format-diff.sh # Interactive mode (asks for confirmation) +# ./scripts/format-diff.sh --yes # Auto-format without confirmation +# ./scripts/format-diff.sh -y # Same as --yes +# ./scripts/format-diff.sh --file <file> # Format specific file +# ./scripts/format-diff.sh -f <file> # Same as --file +# ./scripts/format-diff.sh --dir <dir> # Format all supported files (.py, .md, .sh) in directory +# ./scripts/format-diff.sh -d <dir> # Same as --dir +# ./scripts/format-diff.sh -f file.py --yes # Combine options +# +# Formatting applied: +# Python files (.py): +# - Black formatter with the configured line length (if available; default 88) +# - Ruff formatting and import organization +# - Trailing whitespace removal +# Markdown (.md) and Bash (.sh) files: +# - Trailing whitespace removal +# - Empty line cleanup +# - Follows .vscode/settings.json configuration + +set -e # Exit on any error + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Parse command-line arguments +SPECIFIC_FILE="" +SPECIFIC_DIR="" +AUTO_YES=false + +while [[ $# -gt 0 ]]; do + case $1 in + --file|-f) + SPECIFIC_FILE="${2:?--file requires a path}" + shift 2 + ;; + --dir|-d) + SPECIFIC_DIR="${2:?--dir requires a path}" + shift 2 + ;; + --yes|-y) + AUTO_YES=true + shift + ;; + *) + echo -e "${RED}❌ Unknown option: $1${NC}" + echo -e "${BLUE}Usage: $0 [--file|-f <file>] [--dir|-d <dir>] [--yes|-y]${NC}" + exit 1 + ;; + esac +done + +# Display current formatting configuration +echo -e "${BLUE}📋 Formatting Configuration:${NC}" + +# Print the first value of key $1 found in .vscode/settings.json (line-based grep, not a JSON parser); array values are captured through the closing bracket +get_vscode_setting() { + local key="$1" + local file=".vscode/settings.json" + if [ -f "$file" ]; then + grep -oE "\"$key\"[[:space:]]*:[[:space:]]*(\[[^]]*\]|[^,}]*)" "$file" 2>/dev/null | sed 's/.*:[[:space:]]*//' | tr -d '"[]' | head -1 + fi +} + +# Print the value of key $2 inside the [$1] table of pyproject.toml (surrounding double quotes stripped) +get_toml_setting() { + local section="$1" + local key="$2" + local file="pyproject.toml" + if [ -f "$file" ]; then + awk -v section="[$section]" -v key="$key" 
' + $0 == section {in_section=1; next} + /^\[/ && in_section {in_section=0} + in_section && $0 ~ "^" key "[[:space:]]*=" { + gsub(/^[^=]*=[[:space:]]*/, "") + gsub(/^"/, ""); gsub(/"$/, "") + print + exit + }' "$file" 2>/dev/null + fi +} + +# Read Black configuration: VS Code settings first, then pyproject.toml +BLACK_LINE_LENGTH="" +if [ -f ".vscode/settings.json" ]; then + VSCODE_BLACK_ARGS=$(get_vscode_setting "black-formatter.args") + if echo "$VSCODE_BLACK_ARGS" | grep -q "line-length"; then + BLACK_LINE_LENGTH=$(echo "$VSCODE_BLACK_ARGS" | grep -oE '[0-9]+' | head -1) + fi +fi + +if [ -z "$BLACK_LINE_LENGTH" ] && [ -f "pyproject.toml" ]; then + BLACK_LINE_LENGTH=$(get_toml_setting "tool.black" "line-length") +fi + +# Set default if not found +BLACK_LINE_LENGTH=${BLACK_LINE_LENGTH:-88} + +# Display configuration +echo -e " ${GREEN}Black formatter:${NC}" +echo -e " Line length: ${YELLOW}${BLACK_LINE_LENGTH}${NC}" +if [ -f ".vscode/settings.json" ]; then + echo -e " Source: ${BLUE}.vscode/settings.json${NC} (black-formatter.args)" +elif [ -f "pyproject.toml" ]; then + echo -e " Source: ${BLUE}pyproject.toml${NC} ([tool.black] line-length)" +else + echo -e " Source: ${YELLOW}default${NC}" +fi + +echo -e " ${GREEN}Ruff:${NC}" +if command -v ruff >/dev/null 2>&1; then + echo -e " Status: ${GREEN}available${NC}" + RUFF_VERSION=$(ruff --version 2>/dev/null | head -1 || echo "unknown") + echo -e " Version: ${YELLOW}${RUFF_VERSION}${NC}" +else + echo -e " Status: ${RED}not available${NC}" +fi + +echo -e " ${GREEN}File types supported:${NC}" +echo -e " Python (.py): ${YELLOW}Black + Ruff + whitespace cleanup${NC}" +echo -e " Markdown (.md): ${YELLOW}whitespace cleanup + empty line removal${NC}" +echo -e " Bash (.sh): ${YELLOW}whitespace cleanup + empty line removal${NC}" + +echo -e " ${GREEN}Configuration files:${NC}" +[ -f ".vscode/settings.json" ] && echo -e " ${GREEN}✓${NC} .vscode/settings.json found" || echo -e " ${RED}✗${NC} .vscode/settings.json not found" +[ -f "pyproject.toml" ] && echo -e " 
${GREEN}✓${NC} pyproject.toml found" || echo -e " ${RED}✗${NC} pyproject.toml not found" + +echo "" + +# Collect files to format based on options +if [[ -n "$SPECIFIC_FILE" ]]; then + echo -e "${BLUE}🎯 Formatting specific file: $SPECIFIC_FILE${NC}" + + # Validate file exists and has a supported extension (.py, .md, .sh) + if [[ ! -f "$SPECIFIC_FILE" ]]; then + echo -e "${RED}❌ File not found: $SPECIFIC_FILE${NC}" + exit 1 + fi + + if [[ ! "$SPECIFIC_FILE" == *.py && ! "$SPECIFIC_FILE" == *.md && ! "$SPECIFIC_FILE" == *.sh ]]; then + echo -e "${RED}❌ File must be a Python (.py), Markdown (.md), or Bash (.sh) file: $SPECIFIC_FILE${NC}" + exit 1 + fi + + ALL_FILES="$SPECIFIC_FILE" +elif [[ -n "$SPECIFIC_DIR" ]]; then + echo -e "${BLUE}🎯 Formatting files in directory: $SPECIFIC_DIR${NC}" + + # Validate directory exists + if [[ ! -d "$SPECIFIC_DIR" ]]; then + echo -e "${RED}❌ Directory not found: $SPECIFIC_DIR${NC}" + exit 1 + fi + + # Find all supported files in the directory recursively + ALL_FILES=$(find "$SPECIFIC_DIR" \( -name "*.py" -o -name "*.md" -o -name "*.sh" \) -type f 2>/dev/null | sort || true) + + if [[ -z "$ALL_FILES" ]]; then + echo -e "${RED}❌ No Python (.py), Markdown (.md), or Bash (.sh) files found in directory: $SPECIFIC_DIR${NC}" + exit 1 + fi +else + echo -e "${BLUE}🔍 Checking for changed files against dev branch...${NC}" + + # Get list of changed files compared to dev branch + CHANGED_FILES=$(git diff dev...HEAD --name-only --diff-filter=AM | grep -E '\.(py|md|sh)$' || true) + + # Also check for locally modified files not yet committed + LOCAL_FILES=$(git status --porcelain | grep -E '^[ M].*\.(py|md|sh)$' | awk '{print $2}' || true) + + # Combine and deduplicate files + ALL_FILES=$(echo -e "$CHANGED_FILES\n$LOCAL_FILES" | sort -u | grep -v '^$' || true) +fi + +if [ -z "$ALL_FILES" ]; then + echo -e "${GREEN}✅ No files to format${NC}" + exit 0 +fi + +echo -e "${YELLOW}📝 Found files to format:${NC}" +echo "$ALL_FILES" | while read -r file; do + echo " - $file" +done + +# 
Check if running in interactive mode or with --yes flag +if [[ "$AUTO_YES" == true ]]; then + echo -e "${GREEN}✅ Auto-formatting enabled${NC}" +else + echo "" + read -p "Do you want to format these files? (y/N): " -n 1 -r + echo "" + + if [[ ! $REPLY =~ ^[Yy]$ ]]; then + echo -e "${YELLOW}⏭️ Formatting cancelled${NC}" + exit 0 + fi +fi + +echo -e "${BLUE}🔧 Formatting files...${NC}" + +# Format each file; counters use VAR=$((VAR+1)) because ((VAR++)) returns status 1 when VAR is 0 and would abort under set -e +FORMATTED_COUNT=0 +ERROR_COUNT=0 + +while IFS= read -r file; do + if [ -f "$file" ]; then + echo -e "${BLUE}Formatting: $file${NC}" + + # Determine file type and apply appropriate formatting + if [[ "$file" == *.py ]]; then + # Python file - apply full formatting + + # Check if Black is available, otherwise use Ruff format + if command -v black >/dev/null 2>&1; then + # Apply Black formatting with detected line length + if black --line-length "$BLACK_LINE_LENGTH" "$file" 2>/dev/null; then + echo -e " ${GREEN}✅ Black formatting applied${NC}" + else + echo -e " ${RED}❌ Black formatting failed${NC}" + ERROR_COUNT=$((ERROR_COUNT+1)) + continue + fi + else + echo -e " ${YELLOW}⚠️ Black not available, using Ruff format${NC}" + fi + + # Apply ruff formatting and organize imports + if ruff format "$file" 2>/dev/null; then + echo -e " ${GREEN}✅ Ruff formatting applied${NC}" + else + echo -e " ${YELLOW}⚠️ Ruff format skipped (not available or failed)${NC}" + fi + + # Fix imports and other issues with ruff + if ruff check --fix "$file" 2>/dev/null; then + echo -e " ${GREEN}✅ Import organization applied${NC}" + else + echo -e " ${YELLOW}⚠️ Ruff check/fix skipped (not available or failed)${NC}" + fi + + elif [[ "$file" == *.md || "$file" == *.sh ]]; then + # Markdown or Bash file - only whitespace cleanup + echo -e " ${BLUE}📝 Applying whitespace cleanup for ${file##*.} file${NC}" + fi + + # Remove trailing whitespace (all file types); -i.bak is accepted by both GNU and BSD sed, unlike the BSD-only -i '' + if sed -i.bak 's/[[:space:]]*$//' "$file" 2>/dev/null && rm -f -- "$file.bak"; then + echo -e " ${GREEN}✅ Trailing whitespace removed${NC}" + else + echo -e " ${YELLOW}⚠️ 
Trailing whitespace removal failed${NC}" + fi + + # Remove excessive empty lines for markdown and bash files + if [[ "$file" == *.md || "$file" == *.sh ]]; then + # Collapse each run of consecutive empty lines into a single empty line (D, not d, so the remaining blank line is kept) + if sed -i.bak '/^$/N;/\n$/D' "$file" 2>/dev/null && rm -f -- "$file.bak"; then + echo -e " ${GREEN}✅ Empty line cleanup applied${NC}" + else + echo -e " ${YELLOW}⚠️ Empty line cleanup failed${NC}" + fi + fi + + FORMATTED_COUNT=$((FORMATTED_COUNT+1)) + echo "" + else + echo -e "${RED}❌ File not found: $file${NC}" + ERROR_COUNT=$((ERROR_COUNT+1)) + fi +done <<< "$ALL_FILES" + +echo -e "${GREEN}🎉 Formatting complete!${NC}" +echo -e "Files processed: $FORMATTED_COUNT" +if [ "$ERROR_COUNT" -gt 0 ]; then + echo -e "${RED}Errors: $ERROR_COUNT${NC}" +fi + +echo "" +echo -e "${BLUE}💡 To see what changed, run:${NC}" +echo " git diff" \ No newline at end of file diff --git a/src/extensions/flow/flows/demo/flow_demo.py b/src/extensions/flow/flows/demo/flow_demo.py index 5f3a417..32f1a9d 100644 --- a/src/extensions/flow/flows/demo/flow_demo.py +++ b/src/extensions/flow/flows/demo/flow_demo.py @@ -2,7 +2,6 @@ from leettools.common.logging.event_logger import EventLogger from leettools.common.utils import config_utils -from leettools.core.consts import flow_option from leettools.core.consts.article_type import ArticleType from leettools.core.consts.display_type import DisplayType from leettools.core.schemas.chat_query_item import ChatQueryItem @@ -18,7 +17,6 @@ from leettools.flow.flow import AbstractFlow from leettools.flow.flow_component import FlowComponent from leettools.flow.flow_option_items import FlowOptionItem -from leettools.flow.flow_type import FlowType from leettools.flow.utils import flow_utils @@ -144,12 +142,14 @@ def execute_query( ) # Extend context with relevant information - extended_context, context_token_count, source_items = ( - steps.StepExtendContext.run_step( - exec_info=exec_info, - reranked_result=top_ranked_result_segments, - accumulated_source_items={}, - ) + ( + extended_context, + 
context_token_count, + source_items, + ) = steps.StepExtendContext.run_step( + exec_info=exec_info, + reranked_result=top_ranked_result_segments, + accumulated_source_items={}, ) display_logger.debug( diff --git a/src/leettools/chat/_impl/duckdb/history_manager_duckdb.py b/src/leettools/chat/_impl/duckdb/history_manager_duckdb.py index 10bbf2e..2619827 100644 --- a/src/leettools/chat/_impl/duckdb/history_manager_duckdb.py +++ b/src/leettools/chat/_impl/duckdb/history_manager_duckdb.py @@ -22,7 +22,7 @@ from leettools.common.logging import logger, remove_logger from leettools.common.logging.event_logger import EventLogger from leettools.common.logging.logger_for_query import get_logger_for_chat -from leettools.common.utils import content_utils, time_utils +from leettools.common.utils import time_utils from leettools.context_manager import Context from leettools.core.consts.article_type import ArticleType from leettools.core.schemas.chat_query_item import ChatQueryItem, ChatQueryItemCreate @@ -44,7 +44,6 @@ class HistoryManagerDuckDB(AbstractHistoryManager): """DuckDB implementation of the chat manager.""" def __init__(self, context: Context): - self.initialized = True self.context = context self.settings = context.settings diff --git a/src/leettools/chat/_utils/position_util.py b/src/leettools/chat/_utils/position_util.py index b4e6513..e8c46fc 100644 --- a/src/leettools/chat/_utils/position_util.py +++ b/src/leettools/chat/_utils/position_util.py @@ -20,7 +20,7 @@ 2.2.1 -> layer 3, index 9 3 -> layer 1, index 10 -We may have pictures, tables, text blocks as sections in the chat history, therefore, +We may have pictures, tables, text blocks as sections in the chat history, therefore, the position_in_answer may not be the section_id shown. Only the sections that have position_heading=True will be shown with a new heading in the chat history. 
Note that we also have the title field in the answer_item, but it may be the title of the picture @@ -33,7 +33,6 @@ - Add: add a section in the chat history and all the positions after it will be moved down. """ -from functools import cmp_to_key from leettools.common import exceptions from leettools.core.schemas.chat_query_result import ChatAnswerItem @@ -78,7 +77,7 @@ def shift_down(pos: str, insertion_layer: int, insertion_index: int) -> str: """ if pos == "all": raise exceptions.UnexpectedCaseException( - f"pos 'all' is not expected to be shifted down" + "pos 'all' is not expected to be shifted down" ) cur_index = int(pos) if cur_index < insertion_index: diff --git a/src/leettools/chat/chat_utils.py b/src/leettools/chat/chat_utils.py index 2314ea8..e76be0f 100644 --- a/src/leettools/chat/chat_utils.py +++ b/src/leettools/chat/chat_utils.py @@ -64,7 +64,7 @@ def setup_exec_info_base( # we will create the kb if it does not exist if kb == None: if kb_description is None: - kb_description = f"Created by auto setup." + kb_description = "Created by auto setup." 
if ad_hoc_kb: auto_schedule = False else: diff --git a/src/leettools/chat/schemas/chat_history.py b/src/leettools/chat/schemas/chat_history.py index 3ae4e65..11bce7d 100644 --- a/src/leettools/chat/schemas/chat_history.py +++ b/src/leettools/chat/schemas/chat_history.py @@ -2,7 +2,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass from datetime import datetime -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Optional from pydantic import BaseModel, Field diff --git a/src/leettools/cli/chunker/chunker_text2chunks.py b/src/leettools/cli/chunker/chunker_text2chunks.py index 105e8fc..77e04fc 100644 --- a/src/leettools/cli/chunker/chunker_text2chunks.py +++ b/src/leettools/cli/chunker/chunker_text2chunks.py @@ -10,7 +10,7 @@ def text2chunks(ctx, input: Path, output: Path, chunker_type: str | None, force: bool): """Convert text file to chunks.""" # Check if input is a text file - if not input.suffix.lower() in [".md"]: + if input.suffix.lower() not in [".md"]: raise click.ClickException("Input file must be a markdown file (.md)") used_chunker = convert_to_chunks(input, output, chunker_type, force) diff --git a/src/leettools/cli/cli_utils.py b/src/leettools/cli/cli_utils.py index 17c787b..cd0309e 100644 --- a/src/leettools/cli/cli_utils.py +++ b/src/leettools/cli/cli_utils.py @@ -95,7 +95,7 @@ def setup_org_kb_user( org=org, kb_create=KBCreate( name=kb_name, - description=f"Created automatically by CLI command", + description="Created automatically by CLI command", user_uuid=user.user_uuid, auto_schedule=False, enable_contextual_retrieval=context.settings.ENABLE_CONTEXTUAL_RETRIEVAL, @@ -111,7 +111,7 @@ def setup_org_kb_user( org, KBCreate( name=kb_name, - description=f"Created automatically by CLI command", + description="Created automatically by CLI command", user_uuid=user.user_uuid, auto_schedule=True, enable_contextual_retrieval=context.settings.ENABLE_CONTEXTUAL_RETRIEVAL, diff --git 
a/src/leettools/cli/doc/doc_list_all_segments.py b/src/leettools/cli/doc/doc_list_all_segments.py index f9ff047..c024895 100644 --- a/src/leettools/cli/doc/doc_list_all_segments.py +++ b/src/leettools/cli/doc/doc_list_all_segments.py @@ -32,8 +32,7 @@ def list_all_segements( indent: Optional[int] = None, **kwargs, ) -> None: - - from leettools.context_manager import Context, ContextManager + from leettools.context_manager import ContextManager context = ContextManager().get_context() # type: Context segment_store = context.get_repo_manager().get_segment_store() diff --git a/src/leettools/cli/doc/doc_list_segments.py b/src/leettools/cli/doc/doc_list_segments.py index 025dc26..d19c1a2 100644 --- a/src/leettools/cli/doc/doc_list_segments.py +++ b/src/leettools/cli/doc/doc_list_segments.py @@ -44,7 +44,7 @@ def list_segments_for_doc( indent: Optional[int] = None, **kwargs, ) -> None: - from leettools.context_manager import Context, ContextManager + from leettools.context_manager import ContextManager context = ContextManager().get_context() # type: Context segment_store = context.get_repo_manager().get_segment_store() diff --git a/src/leettools/cli/doc/doc_print.py b/src/leettools/cli/doc/doc_print.py index 93daef1..5576da5 100644 --- a/src/leettools/cli/doc/doc_print.py +++ b/src/leettools/cli/doc/doc_print.py @@ -44,7 +44,7 @@ def print( indent: Optional[int] = None, **kwargs, ) -> None: - from leettools.context_manager import Context, ContextManager + from leettools.context_manager import ContextManager context = ContextManager().get_context() # type: Context segment_store = context.get_repo_manager().get_segment_store() diff --git a/src/leettools/cli/doc/doc_remove.py b/src/leettools/cli/doc/doc_remove.py index fa14582..1ccf79b 100644 --- a/src/leettools/cli/doc/doc_remove.py +++ b/src/leettools/cli/doc/doc_remove.py @@ -40,7 +40,7 @@ def remove( doc_uuid: str, **kwargs, ) -> None: - from leettools.context_manager import Context, ContextManager + from 
leettools.context_manager import ContextManager context = ContextManager().get_context() # type: Context document_store = context.get_repo_manager().get_document_store() diff --git a/src/leettools/cli/doc/doc_summarize.py b/src/leettools/cli/doc/doc_summarize.py index 3256546..2c1de32 100644 --- a/src/leettools/cli/doc/doc_summarize.py +++ b/src/leettools/cli/doc/doc_summarize.py @@ -41,7 +41,7 @@ def summarize_all( force: bool, **kwargs, ) -> None: - from leettools.context_manager import Context, ContextManager + from leettools.context_manager import ContextManager context = ContextManager().get_context() # type: Context document_store = context.get_repo_manager().get_document_store() diff --git a/src/leettools/cli/kb/kb_add_local.py b/src/leettools/cli/kb/kb_add_local.py index 6b35c14..c0d8dc0 100644 --- a/src/leettools/cli/kb/kb_add_local.py +++ b/src/leettools/cli/kb/kb_add_local.py @@ -8,7 +8,6 @@ from leettools.common.logging import logger from leettools.core.consts.docsource_type import DocSourceType from leettools.core.schemas.docsource import DocSourceCreate -from leettools.core.schemas.schedule_config import ScheduleConfig from leettools.flow.utils import pipeline_utils @@ -89,7 +88,7 @@ def add_local( if doc_source is None: doc_source = path.as_uri() - from leettools.context_manager import Context, ContextManager + from leettools.context_manager import ContextManager context = ContextManager().get_context() # type: Context context.is_svc = False diff --git a/src/leettools/cli/kb/kb_add_local_dir.py b/src/leettools/cli/kb/kb_add_local_dir.py index 6e00ae9..a2770c8 100644 --- a/src/leettools/cli/kb/kb_add_local_dir.py +++ b/src/leettools/cli/kb/kb_add_local_dir.py @@ -88,7 +88,7 @@ def add_local_dir( if doc_source is None: doc_source = path.as_uri() - from leettools.context_manager import Context, ContextManager + from leettools.context_manager import ContextManager context = ContextManager().get_context() # type: Context context.is_svc = False diff --git 
a/src/leettools/cli/kb/kb_add_url.py b/src/leettools/cli/kb/kb_add_url.py index 4163412..57b29f9 100644 --- a/src/leettools/cli/kb/kb_add_url.py +++ b/src/leettools/cli/kb/kb_add_url.py @@ -74,7 +74,7 @@ def add_url( indent: Optional[int] = 2, **kwargs, ) -> None: - from leettools.context_manager import Context, ContextManager + from leettools.context_manager import ContextManager context = ContextManager().get_context() # type: Context context.is_svc = False diff --git a/src/leettools/cli/kb/kb_add_url_list.py b/src/leettools/cli/kb/kb_add_url_list.py index 040a73d..1499e24 100644 --- a/src/leettools/cli/kb/kb_add_url_list.py +++ b/src/leettools/cli/kb/kb_add_url_list.py @@ -81,7 +81,7 @@ def add_url_list( logger().error(f"The filename path {file_path} does not exist.") return - from leettools.context_manager import Context, ContextManager + from leettools.context_manager import ContextManager context = ContextManager().get_context() # type: Context context.is_svc = False diff --git a/src/leettools/cli/kb/kb_list.py b/src/leettools/cli/kb/kb_list.py index d86e7d7..fcd9b47 100644 --- a/src/leettools/cli/kb/kb_list.py +++ b/src/leettools/cli/kb/kb_list.py @@ -26,7 +26,6 @@ def list( indent: int = None, **kwargs, ) -> None: - from leettools.context_manager import ContextManager context = ContextManager().get_context() diff --git a/src/leettools/cli/kb/kb_list_db.py b/src/leettools/cli/kb/kb_list_db.py index df59932..63890a9 100644 --- a/src/leettools/cli/kb/kb_list_db.py +++ b/src/leettools/cli/kb/kb_list_db.py @@ -4,7 +4,6 @@ from leettools.cli.options_common import common_options from leettools.common import exceptions -from leettools.context_manager import Context from leettools.flow.metadata.extract_metadata_manager import ( create_extraction_metadata_manager, ) @@ -80,7 +79,7 @@ def _show_db_list_for_kb(): ) if not json_output: - click.echo(f"KnowledgeBase\tExtractDBName\tType\tItems\tUpdated") + click.echo("KnowledgeBase\tExtractDBName\tType\tItems\tUpdated") 
if kb_name is None: for kb in kb_manager.get_all_kbs_for_org(org): diff --git a/src/leettools/cli/query/query_get.py b/src/leettools/cli/query/query_get.py index 3a563bb..16394b1 100644 --- a/src/leettools/cli/query/query_get.py +++ b/src/leettools/cli/query/query_get.py @@ -41,7 +41,7 @@ def get( Command line interface to get answers for a query in the format of a MD file. """ - from leettools.context_manager import Context, ContextManager + from leettools.context_manager import ContextManager context = ContextManager().get_context() chat_manager = get_history_manager(context) @@ -68,7 +68,7 @@ def get( if answer.query_id == query_id: if answer.position_in_answer == "all": click.echo(f"{answer.answer_content}\n") - click.echo(f"# References:\n") + click.echo("# References:\n") for _, source_item in answer.answer_source_items.items(): click.echo(f"{source_item.answer_source.source_content}\n") break diff --git a/src/leettools/cli/query/query_get_article_md.py b/src/leettools/cli/query/query_get_article_md.py index 265002a..afd10d1 100644 --- a/src/leettools/cli/query/query_get_article_md.py +++ b/src/leettools/cli/query/query_get_article_md.py @@ -40,7 +40,7 @@ def get_article_md( """ Command line interface to get answers for a query in the format of a MD file. """ - from leettools.context_manager import Context, ContextManager + from leettools.context_manager import ContextManager context = ContextManager().get_context() chat_manager = get_history_manager(context) diff --git a/src/leettools/cli/query/query_list.py b/src/leettools/cli/query/query_list.py index 8bcbf1d..ae43cbc 100644 --- a/src/leettools/cli/query/query_list.py +++ b/src/leettools/cli/query/query_list.py @@ -25,7 +25,7 @@ def list( Command line interface to list all queries for a KB and user. 
""" - from leettools.context_manager import Context, ContextManager + from leettools.context_manager import ContextManager context = ContextManager().get_context() userstore = context.get_user_store() diff --git a/src/leettools/cli/query/query_run.py b/src/leettools/cli/query/query_run.py index 35e625d..389c928 100644 --- a/src/leettools/cli/query/query_run.py +++ b/src/leettools/cli/query/query_run.py @@ -57,7 +57,7 @@ def run( Command line interface to use the local repo to answer the input query. """ - from leettools.context_manager import Context, ContextManager + from leettools.context_manager import ContextManager context = ContextManager().get_context() # type: Context context.is_svc = False diff --git a/src/leettools/cli/query/query_section_regen.py b/src/leettools/cli/query/query_section_regen.py index 2c999e4..fc587dc 100644 --- a/src/leettools/cli/query/query_section_regen.py +++ b/src/leettools/cli/query/query_section_regen.py @@ -9,9 +9,7 @@ from leettools.common import exceptions from leettools.core.schemas.chat_query_result import ChatAnswerItemCreate from leettools.core.schemas.user import User -from leettools.flow.exec_info import ExecInfo from leettools.flow.schemas.article import ArticleSection, ArticleSectionPlan -from leettools.flow.utils import flow_utils @click.command(help="Generate the section again with new prompts/title.") diff --git a/src/leettools/cli/query/query_section_remove.py b/src/leettools/cli/query/query_section_remove.py index 10f8a2d..2463aef 100644 --- a/src/leettools/cli/query/query_section_remove.py +++ b/src/leettools/cli/query/query_section_remove.py @@ -5,7 +5,6 @@ from leettools.chat.history_manager import get_history_manager from leettools.cli.options_common import common_options from leettools.common import exceptions -from leettools.core.schemas.chat_query_result import ChatAnswerItemCreate from leettools.core.schemas.user import User diff --git a/src/leettools/common/duckdb/duckdb_client.py 
b/src/leettools/common/duckdb/duckdb_client.py index a87c685..8d565e9 100644 --- a/src/leettools/common/duckdb/duckdb_client.py +++ b/src/leettools/common/duckdb/duckdb_client.py @@ -15,7 +15,6 @@ class SingletonMetaDuckDB(SingletonMeta): class DuckDBClient(metaclass=SingletonMetaDuckDB): - # mapping from defined schema to existing stored type if different # see [Readme.md](./Readme.md) for more details TYPE_MAP: ClassVar[Dict[str, str]] = { @@ -167,11 +166,11 @@ def create_table_if_not_exists( try: existing_schema = cursor.execute( f""" - SELECT name, type + SELECT name, type FROM pragma_table_info('{new_schema_name}.{new_table_name}') """ ).fetchall() - except Exception as e: + except Exception: existing_schema = None # result = cursor.execute( @@ -233,7 +232,7 @@ def create_table_if_not_exists( else: # Add new column alter_sql = f""" - ALTER TABLE {new_schema_name}.{new_table_name} + ALTER TABLE {new_schema_name}.{new_table_name} ADD COLUMN {col_name} {col_type} """ logger().info(f"Adding new column: {alter_sql}") @@ -263,7 +262,7 @@ def _get_create_table_sql( columns = [f"{name} {type_}" for name, type_ in columns.items()] return f""" {create_table_sql} - CREATE TABLE IF NOT EXISTS {schema_name}.{table_name} ({','.join(columns)}) + CREATE TABLE IF NOT EXISTS {schema_name}.{table_name} ({",".join(columns)}) """ def execute_sql(self, sql: str, value_list: List[Any] = None) -> None: @@ -302,7 +301,7 @@ def fetch_all_from_table( with self.conn.cursor() as cursor: select_sql = f""" - SELECT {column_str} FROM {table_name} + SELECT {column_str} FROM {table_name} {where_clause} """ logger().noop( @@ -329,7 +328,6 @@ def fetch_one_from_table( value_list: List[Any] = None, where_clause: str = None, ) -> Optional[Dict[str, Any]]: - if column_list is None: column_str = "*" else: @@ -340,7 +338,7 @@ def fetch_one_from_table( with self.conn.cursor() as cursor: select_sql = f""" - SELECT {column_str} FROM {table_name} + SELECT {column_str} FROM {table_name} {where_clause} """ 
with self._get_table_lock(table_name): diff --git a/src/leettools/common/duckdb/duckdb_schema_utils.py b/src/leettools/common/duckdb/duckdb_schema_utils.py index 66cc97b..91241e6 100644 --- a/src/leettools/common/duckdb/duckdb_schema_utils.py +++ b/src/leettools/common/duckdb/duckdb_schema_utils.py @@ -147,7 +147,6 @@ def base_type_conversion(field_type, value): if __name__ == "__main__": - # Example Pydantic model class ExampleModel(BaseModel): name: str = Field(..., json_schema_extra={"primary_key": True}) diff --git a/src/leettools/common/emailer/_impl/emailer_mailgun.py b/src/leettools/common/emailer/_impl/emailer_mailgun.py index 3f805b8..30b7a74 100644 --- a/src/leettools/common/emailer/_impl/emailer_mailgun.py +++ b/src/leettools/common/emailer/_impl/emailer_mailgun.py @@ -24,7 +24,6 @@ def send_email(self, to: str, subject: str, body: str) -> None: variables["body"] = body variables_json_string = json.dumps(variables) try: - response = requests.post( mailgun_uri, auth=("api", mailgun_api_key), diff --git a/src/leettools/common/logging/README.md b/src/leettools/common/logging/README.md index 4c9f425..9ffd1af 100644 --- a/src/leettools/common/logging/README.md +++ b/src/leettools/common/logging/README.md @@ -16,4 +16,3 @@ logger().log_to_file(path=current_dir, filename='test.log', level='DEBUG') my_logger = EventLogger.get_instance('my_logger') my_logger.info('test') ``` - diff --git a/src/leettools/common/logging/event_logger.py b/src/leettools/common/logging/event_logger.py index 4a2fd55..3244909 100644 --- a/src/leettools/common/logging/event_logger.py +++ b/src/leettools/common/logging/event_logger.py @@ -116,7 +116,7 @@ def __init__(self, name): # ) try: # enable utf-8 encoding for stdout and stderr - if os.getenv(f"EDS_LOGGING_ENABLE_RICH"): + if os.getenv("EDS_LOGGING_ENABLE_RICH"): from rich.logging import Console, RichHandler formatter = logging.Formatter("%(message)s") @@ -134,7 +134,7 @@ def __init__(self, name): else: handler = 
logging.StreamHandler(stream=stdout_wrapper) handler.setFormatter(self.get_default_formatter()) - except Exception as e: + except Exception: if os.getenv("EDS_LOGGING_TO_STDERR"): handler = logging.StreamHandler(stream=stderr_wrapper) else: @@ -214,9 +214,9 @@ def log_to_file( level (str): Can only be INFO, DEBUG, WARNING and ERROR. If None, use current logger level. mode (str): The mode to write log into the file. """ - assert isinstance( - file, (str, Path) - ), f"expected argument path to be type str or Path, but got {type(file)}" + assert isinstance(file, (str, Path)), ( + f"expected argument path to be type str or Path, but got {type(file)}" + ) if isinstance(file, str): file = Path(file) return self.log_to_dir(file.parent, level, mode, file.name) @@ -236,9 +236,9 @@ def log_to_dir( - level (str): Can only be INFO, DEBUG, WARNING and ERROR. If None, use current logger level. - filename (str): a log filename, default is 'events.log'. """ - assert isinstance( - dir, (str, Path) - ), f"expected argument path to be type str or Path, but got {type(dir)}" + assert isinstance(dir, (str, Path)), ( + f"expected argument path to be type str or Path, but got {type(dir)}" + ) if level is None: log_level = self.level else: diff --git a/src/leettools/common/logging/log_location.py b/src/leettools/common/logging/log_location.py index 0d8715e..fb9cba3 100644 --- a/src/leettools/common/logging/log_location.py +++ b/src/leettools/common/logging/log_location.py @@ -2,7 +2,7 @@ import leettools.common.exceptions as exceptions from leettools.common.logging.event_logger import EventLogger -from leettools.context_manager import Context, ContextManager +from leettools.context_manager import ContextManager class LogLocator: diff --git a/src/leettools/common/temp_setup.py b/src/leettools/common/temp_setup.py index 33a4db3..fbc75c5 100644 --- a/src/leettools/common/temp_setup.py +++ b/src/leettools/common/temp_setup.py @@ -18,7 +18,6 @@ class TempSetup: - def __init__(self): from 
leettools.context_manager import ContextManager diff --git a/src/leettools/common/utils/factory_util.py b/src/leettools/common/utils/factory_util.py index 726efc0..916ddde 100644 --- a/src/leettools/common/utils/factory_util.py +++ b/src/leettools/common/utils/factory_util.py @@ -29,7 +29,7 @@ def get_subclass_from_module(module_name: str, base_class: Type[T]) -> List[Type if issubclass(cls, base_class) and cls is not base_class ] return subclasses - except ModuleNotFoundError as e: + except ModuleNotFoundError: raise exceptions.EntityNotFoundException( entity_name=module_name, entity_type="module" ) diff --git a/src/leettools/common/utils/template_eval.py b/src/leettools/common/utils/template_eval.py index 2e637bc..28a35eb 100644 --- a/src/leettools/common/utils/template_eval.py +++ b/src/leettools/common/utils/template_eval.py @@ -77,13 +77,13 @@ def find_template_variables(template_str: str) -> set[str]: {% block header %} Hello {{ name }}! {% endblock %} - + {% block list %} {% for item in items %} - {{ item }} {% endfor %} {% endblock %} - + Undefined: {{ undefined_variable }} """ variables = {"name": "John Doe", "items": ["Apple", "Banana", "Cherry"]} diff --git a/src/leettools/common/utils/tokenizer.py b/src/leettools/common/utils/tokenizer.py index 43af8e5..8d81b66 100644 --- a/src/leettools/common/utils/tokenizer.py +++ b/src/leettools/common/utils/tokenizer.py @@ -1,7 +1,6 @@ from typing import List from leettools.common import exceptions -from leettools.context_manager import Context from leettools.settings import SystemSettings diff --git a/src/leettools/context_manager.py b/src/leettools/context_manager.py index 43e984a..5af1685 100644 --- a/src/leettools/context_manager.py +++ b/src/leettools/context_manager.py @@ -37,7 +37,6 @@ class ContextStatus(str, Enum): class Context: - EDS_CLI_CONTEXT_PREFIX: ClassVar[str] = "eds_cli" def __init__(self, settings: SystemSettings): @@ -172,7 +171,6 @@ def get_task_manager(self) -> TaskManager: return 
self._task_manager def reset(self, is_test: bool = False, new_env_file: str = None): - self.is_test = is_test settings = self.initial_settings.model_copy() diff --git a/src/leettools/core/consts/retriever_type.py b/src/leettools/core/consts/retriever_type.py index bfd6004..15ca45f 100644 --- a/src/leettools/core/consts/retriever_type.py +++ b/src/leettools/core/consts/retriever_type.py @@ -28,7 +28,6 @@ def is_search_engine(retriever_type: str) -> bool: def supported_retriever(region: Optional[str] = "all") -> List[str]: - # we only show supported web retriever in the web UI if region is None: return [ diff --git a/src/leettools/core/knowledgebase/_impl/duckdb/kb_duckdb_schema.py b/src/leettools/core/knowledgebase/_impl/duckdb/kb_duckdb_schema.py index 1f2c447..4f8b138 100644 --- a/src/leettools/core/knowledgebase/_impl/duckdb/kb_duckdb_schema.py +++ b/src/leettools/core/knowledgebase/_impl/duckdb/kb_duckdb_schema.py @@ -6,7 +6,6 @@ @dataclass class KBDuckDBSchema(BaseKBSchema): - @classmethod def get_schema(cls) -> Dict[str, Any]: """ diff --git a/src/leettools/core/knowledgebase/_impl/duckdb/kb_manager_duckdb.py b/src/leettools/core/knowledgebase/_impl/duckdb/kb_manager_duckdb.py index de3d8c2..215aab2 100644 --- a/src/leettools/core/knowledgebase/_impl/duckdb/kb_manager_duckdb.py +++ b/src/leettools/core/knowledgebase/_impl/duckdb/kb_manager_duckdb.py @@ -80,7 +80,7 @@ def _kb_to_dict(self, kb: KnowledgeBase) -> Dict: def _get_user_store(self) -> AbstractUserStore: # this is kind of hacky to avoid circular import - from leettools.context_manager import Context, ContextManager + from leettools.context_manager import ContextManager context = ContextManager().get_context() # type: Context @@ -88,7 +88,7 @@ def _get_user_store(self) -> AbstractUserStore: def _get_repo_manager(self) -> RepoManager: # this is kind of hacky to avoid circular import - from leettools.context_manager import Context, ContextManager + from leettools.context_manager import ContextManager 
context = ContextManager().get_context() return context.get_repo_manager() diff --git a/src/leettools/core/knowledgebase/kb_manager.py b/src/leettools/core/knowledgebase/kb_manager.py index 11f8baa..26f294d 100644 --- a/src/leettools/core/knowledgebase/kb_manager.py +++ b/src/leettools/core/knowledgebase/kb_manager.py @@ -1,5 +1,4 @@ from abc import ABC, abstractmethod -from datetime import datetime from typing import ClassVar, List, Optional from leettools.common.utils import time_utils diff --git a/src/leettools/core/org/org_manager.py b/src/leettools/core/org/org_manager.py index 2c2b190..71092f2 100644 --- a/src/leettools/core/org/org_manager.py +++ b/src/leettools/core/org/org_manager.py @@ -6,7 +6,6 @@ class AbstractOrgManager(ABC): - @abstractmethod def get_default_org(self) -> Org: """ diff --git a/src/leettools/core/repo/_impl/duckdb/docsink_store_duckdb.py b/src/leettools/core/repo/_impl/duckdb/docsink_store_duckdb.py index 3ccd1ca..9341095 100644 --- a/src/leettools/core/repo/_impl/duckdb/docsink_store_duckdb.py +++ b/src/leettools/core/repo/_impl/duckdb/docsink_store_duckdb.py @@ -1,5 +1,4 @@ import uuid -from datetime import datetime from typing import Any, List, Optional from leettools.common import exceptions diff --git a/src/leettools/core/repo/_impl/duckdb/vector_store_dense_duckdb.py b/src/leettools/core/repo/_impl/duckdb/vector_store_dense_duckdb.py index 5b24147..85b89df 100644 --- a/src/leettools/core/repo/_impl/duckdb/vector_store_dense_duckdb.py +++ b/src/leettools/core/repo/_impl/duckdb/vector_store_dense_duckdb.py @@ -6,7 +6,6 @@ import numpy as np -from leettools.common import exceptions from leettools.common.duckdb.duckdb_client import DuckDBClient from leettools.common.logging import logger from leettools.context_manager import Context @@ -491,7 +490,7 @@ def _full_text_search_in_kb( where_clause = "WHERE score is not null" query_statement = f""" - SELECT *, fts_{table_name.replace('.', '_')}.match_bm25( + SELECT *, 
fts_{table_name.replace(".", "_")}.match_bm25( {Segment.FIELD_SEGMENT_UUID}, ?, fields := '{Segment.FIELD_CONTENT}' @@ -522,10 +521,10 @@ def _rebuild_full_text_index(self, org: Org, kb: KnowledgeBase, user: User) -> N table_name = self._get_table_name(org, kb, embedding_dimension) # TODO: support customizable stemmer, stopwords, ignore, strip_accents, lower rebuild_fts_index_sql = f""" - PRAGMA create_fts_index({table_name}, {Segment.FIELD_SEGMENT_UUID}, + PRAGMA create_fts_index({table_name}, {Segment.FIELD_SEGMENT_UUID}, {Segment.FIELD_CONTENT}, stemmer = 'porter', stopwords = 'english', ignore = '(\\.|[^a-z])+', - strip_accents = 1, lower = 1, overwrite = 1) + strip_accents = 1, lower = 1, overwrite = 1) """ logger().debug( f"Building the full text index for table {table_name}: {rebuild_fts_index_sql}" diff --git a/src/leettools/core/repo/_impl/duckdb/vector_store_duckdb_schema.py b/src/leettools/core/repo/_impl/duckdb/vector_store_duckdb_schema.py index 3f3b815..f75bf25 100644 --- a/src/leettools/core/repo/_impl/duckdb/vector_store_duckdb_schema.py +++ b/src/leettools/core/repo/_impl/duckdb/vector_store_duckdb_schema.py @@ -4,7 +4,6 @@ class VectorDuckDBSchema: - @classmethod def get_schema(cls, dense_embedder_dimension: int) -> Dict[str, str]: return { diff --git a/src/leettools/core/repo/docgraph_store.py b/src/leettools/core/repo/docgraph_store.py index ddeb24c..1df6a09 100644 --- a/src/leettools/core/repo/docgraph_store.py +++ b/src/leettools/core/repo/docgraph_store.py @@ -78,7 +78,6 @@ def update_segment_node(self, segment_in_store: SegmentInDB) -> int: def create_docgraph_store(settings: SystemSettings) -> AbstractDocGraphStore: - from leettools.common.utils import factory_util return factory_util.create_manager_with_repo_type( diff --git a/src/leettools/core/repo/docsource_store.py b/src/leettools/core/repo/docsource_store.py index 200f321..90f2579 100644 --- a/src/leettools/core/repo/docsource_store.py +++ b/src/leettools/core/repo/docsource_store.py 
@@ -9,7 +9,6 @@ class AbstractDocsourceStore(ABC): - @abstractmethod def create_docsource( self, diff --git a/src/leettools/core/repo/segment_store.py b/src/leettools/core/repo/segment_store.py index 17bfc7f..ef417b1 100644 --- a/src/leettools/core/repo/segment_store.py +++ b/src/leettools/core/repo/segment_store.py @@ -14,7 +14,6 @@ class AbstractSegmentStore(ABC): - @abstractmethod def create_segment( self, org: Org, kb: KnowledgeBase, segment_create: SegmentCreate diff --git a/src/leettools/core/repo/vector_store.py b/src/leettools/core/repo/vector_store.py index 19d39c0..5838f60 100644 --- a/src/leettools/core/repo/vector_store.py +++ b/src/leettools/core/repo/vector_store.py @@ -25,7 +25,6 @@ class VectorSearchResult(BaseModel): class AbstractVectorStore(ABC): - @abstractmethod def __init__(self, context: Context): pass diff --git a/src/leettools/core/schemas/README.md b/src/leettools/core/schemas/README.md index a5c6c40..f94db93 100644 --- a/src/leettools/core/schemas/README.md +++ b/src/leettools/core/schemas/README.md @@ -12,7 +12,7 @@ Here is the main idea of using different pydantic models for different APIs: the primary key and update all other fields specified in the XUpdate. Some special derived fields need to be updated through special APIs. - XInDB: (parent XInDBBase) the fields that are actually stored in the database -- X: (parent XInDB or XBase) the fields that are returned by the API and used +- X: (parent XInDB or XBase) the fields that are returned by the API and used by the applications. 
So for the CRUD APIs, we use different Pydantic models for different purposes: diff --git a/src/leettools/core/schemas/api_provider_config.py b/src/leettools/core/schemas/api_provider_config.py index 4a902dd..d7760fb 100644 --- a/src/leettools/core/schemas/api_provider_config.py +++ b/src/leettools/core/schemas/api_provider_config.py @@ -24,7 +24,6 @@ class APIEndpointInfo(BaseModel): @add_fieldname_constants class APIProviderConfig(BaseModel): - api_provider: str = Field( ..., description="The name for the API provider. Although we only use OpenAI-compatible " diff --git a/src/leettools/core/schemas/chat_query_item.py b/src/leettools/core/schemas/chat_query_item.py index ab91fe0..f67b49b 100644 --- a/src/leettools/core/schemas/chat_query_item.py +++ b/src/leettools/core/schemas/chat_query_item.py @@ -95,7 +95,7 @@ def get_strategy( "Strategy base must be provided for dynamic strategy." ) - display_logger.debug(f"Using dynamic strategy.") + display_logger.debug("Using dynamic strategy.") strategy = Strategy.get_dynamic_strategy(strategy_base) return strategy diff --git a/src/leettools/core/schemas/docsink.py b/src/leettools/core/schemas/docsink.py index a07576f..37c5bd2 100644 --- a/src/leettools/core/schemas/docsink.py +++ b/src/leettools/core/schemas/docsink.py @@ -17,7 +17,6 @@ class DocSinkBase(BaseModel): - original_doc_uri: str = Field(..., description="The original URI of the document.") raw_doc_uri: str = Field( ..., description="The URI of the raw document (the docsink)." diff --git a/src/leettools/core/schemas/docsource.py b/src/leettools/core/schemas/docsource.py index a7ca2f5..fd418bf 100644 --- a/src/leettools/core/schemas/docsource.py +++ b/src/leettools/core/schemas/docsource.py @@ -22,7 +22,7 @@ class IngestConfig(BaseModel): """ """ -Each DocSource may be ingest many times if the schedule config is set. +Each DocSource may be ingest many times if the schedule config is set. 
MANUAL = "manual" # manual run, no retry, no schedule ONCE = "once" # run once until success or retry limit reached @@ -36,7 +36,7 @@ class IngestConfig(BaseModel): ## Once When created, this type of docsource will be triggered once. If it fails, it will be -retried until the retry limit is reached. However, we can also trigger to run the +retried until the retry limit is reached. However, we can also trigger to run the docsource again manually. ## Recurring @@ -46,9 +46,9 @@ class IngestConfig(BaseModel): # Different versions of the same DocSink -For each URI determined by the DocSource in each ingest operations, we will try create a -DocSink in the system whose key is the URI and the creation timestamp. -- If the raw document for the DocSink has the same hash with an existing DocSink, or +For each URI determined by the DocSource in each ingest operations, we will try create a +DocSink in the system whose key is the URI and the creation timestamp. +- If the raw document for the DocSink has the same hash with an existing DocSink, or the DocSource can tell the ingestion job that the document has not been updated, we will not ingest the document again and no new DocSink is created. - If the raw document for the DocSink has a different hash with an existing DocSink, we @@ -58,7 +58,7 @@ class IngestConfig(BaseModel): DocSink has a one-to-one relation with the Document object. The Document object is the converted markdown document. The Document object should have the same "expired_timestamp" -field as the DocSink object. +field as the DocSink object. In the segment store and embedding store, all the segments and embeddings should have the same "expired_timestamp" field as the DocSink object. 
It is not ideal to have to set @@ -70,7 +70,6 @@ class IngestConfig(BaseModel): class DocSourceBase(BaseModel): - org_id: str = Field(..., description="Organization ID") kb_id: str = Field(..., description="Knowledge base ID") source_type: DocSourceType = Field(..., description="Type of document source") diff --git a/src/leettools/core/schemas/knowledgebase.py b/src/leettools/core/schemas/knowledgebase.py index 3e4ff8d..f8be52f 100644 --- a/src/leettools/core/schemas/knowledgebase.py +++ b/src/leettools/core/schemas/knowledgebase.py @@ -82,7 +82,6 @@ class KBInDBBase(KBCreate): @add_fieldname_constants class KBUpdate(KBBase): - # we use kb_name as the key to update the kb # so we need a new name to update the kb name # set to non-null value if we want to rename the kb diff --git a/src/leettools/core/schemas/organization.py b/src/leettools/core/schemas/organization.py index 25d0508..eb79ca3 100644 --- a/src/leettools/core/schemas/organization.py +++ b/src/leettools/core/schemas/organization.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass from datetime import datetime -from typing import Any, ClassVar, Dict, List, Optional +from typing import Any, ClassVar, Dict, Optional from pydantic import BaseModel, Field @@ -72,7 +72,7 @@ def get_org_db_name(cls, org_id: str) -> str: Although the org name is unique, we use org_id as the DB name avoid DB rename and allow wider range of DB name chars. 
""" - from leettools.context_manager import Context, ContextManager + from leettools.context_manager import ContextManager context = ContextManager().get_context() # type: Context if context.is_test: diff --git a/src/leettools/core/schemas/user.py b/src/leettools/core/schemas/user.py index a9d3d03..be9d9a9 100644 --- a/src/leettools/core/schemas/user.py +++ b/src/leettools/core/schemas/user.py @@ -33,7 +33,6 @@ class UserUpdate(UserBase): class UserInDB(UserCreate): - user_uuid: str = Field(..., description="The uuid of the user.") # balance will be updated through specific API balance: Optional[int] = Field(None, description="The balance of the user.") @@ -77,7 +76,7 @@ class User(UserInDB): @classmethod def get_admin_user(cls) -> "User": - from leettools.context_manager import Context, ContextManager + from leettools.context_manager import ContextManager context = ContextManager().get_context() # type: Context user_store = context.get_user_store() @@ -86,7 +85,7 @@ def get_admin_user(cls) -> "User": @classmethod def get_user_db_name(cls, user_uuid: str) -> str: - from leettools.context_manager import Context, ContextManager + from leettools.context_manager import ContextManager context = ContextManager().get_context() # type: Context if context.is_test: diff --git a/src/leettools/core/schemas/user_settings.py b/src/leettools/core/schemas/user_settings.py index ac1f65d..cb48e08 100644 --- a/src/leettools/core/schemas/user_settings.py +++ b/src/leettools/core/schemas/user_settings.py @@ -12,7 +12,6 @@ class UserSettingsItem(BaseModel): - section: str = Field(..., description=_("The section of the settings")) name: str = Field(..., description=_("The name of the variable.")) description: Optional[str] = Field( diff --git a/src/leettools/core/strategy/_impl/duckdb/intention_store_duckdb.py b/src/leettools/core/strategy/_impl/duckdb/intention_store_duckdb.py index 5209943..62e116d 100644 --- a/src/leettools/core/strategy/_impl/duckdb/intention_store_duckdb.py +++ 
b/src/leettools/core/strategy/_impl/duckdb/intention_store_duckdb.py @@ -1,5 +1,4 @@ import os -from datetime import datetime from typing import List, Optional from leettools.common.duckdb.duckdb_client import DuckDBClient @@ -20,7 +19,6 @@ class IntentionStoreDuckDB(AbstractIntentionStore): - def __init__( self, settings: SystemSettings, diff --git a/src/leettools/core/strategy/_impl/duckdb/prompt_store_duckdb.py b/src/leettools/core/strategy/_impl/duckdb/prompt_store_duckdb.py index ad967fd..2eb442b 100644 --- a/src/leettools/core/strategy/_impl/duckdb/prompt_store_duckdb.py +++ b/src/leettools/core/strategy/_impl/duckdb/prompt_store_duckdb.py @@ -1,7 +1,6 @@ import hashlib import json import uuid -from datetime import datetime from typing import List, Optional from leettools.common.duckdb.duckdb_client import DuckDBClient diff --git a/src/leettools/core/strategy/_impl/duckdb/strategy_store_duckdb.py b/src/leettools/core/strategy/_impl/duckdb/strategy_store_duckdb.py index 06c83fa..2102265 100644 --- a/src/leettools/core/strategy/_impl/duckdb/strategy_store_duckdb.py +++ b/src/leettools/core/strategy/_impl/duckdb/strategy_store_duckdb.py @@ -510,7 +510,7 @@ def list_active_strategies_for_user(self, user: User) -> List[Strategy]: # Get all the active strategies for the user and the admin user table_name = self._get_table_name() where_clause = ( - f"WHERE {Strategy.FIELD_USER_UUID} IN ({', '.join([f'?' for _ in user_ids])}) " + f"WHERE {Strategy.FIELD_USER_UUID} IN ({', '.join(['?' 
for _ in user_ids])}) " f"AND {Strategy.FIELD_STRATEGY_STATUS} = '{StrategyStatus.ACTIVE.value}'" ) value_list = [user_id for user_id in user_ids] diff --git a/src/leettools/core/strategy/schemas/prompt.py b/src/leettools/core/strategy/schemas/prompt.py index d1a6958..96dfe38 100644 --- a/src/leettools/core/strategy/schemas/prompt.py +++ b/src/leettools/core/strategy/schemas/prompt.py @@ -2,15 +2,15 @@ from dataclasses import dataclass from datetime import datetime from enum import Enum -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Optional from pydantic import BaseModel, Field from leettools.common.utils.obj_utils import add_fieldname_constants """ -A prompt that user can use to inference with the model. It is defined as a string -template with a list of optional variables. The prompt is immutable after creation. +A prompt that user can use to inference with the model. It is defined as a string +template with a list of optional variables. The prompt is immutable after creation. The user can override the default value of the variables when using the prompt. """ diff --git a/src/leettools/core/strategy/schemas/strategy.py b/src/leettools/core/strategy/schemas/strategy.py index 703a744..52f7927 100644 --- a/src/leettools/core/strategy/schemas/strategy.py +++ b/src/leettools/core/strategy/schemas/strategy.py @@ -14,7 +14,7 @@ """ The Strategy class contains all the dynamic configurations for a Flow, required by -the Executor that runs the flow. A Flow is a program of Steps, each of which is an LLM +the Executor that runs the flow. A Flow is a program of Steps, each of which is an LLM API call or similar tool execution function. Each Strategy object contains a dictionary of StrategySection objects, the key is @@ -30,7 +30,6 @@ class StrategyBase(BaseModel): - # The key is the step name, for example, "intention", "rewrite", "search", "rerank" # The value is the StrategySection object that has the information needed for the step. 
# When we add a new step, we need to add a new StrategySection object to the dictionary. @@ -55,7 +54,6 @@ class StrategyCreate(StrategyBase): @add_fieldname_constants class Strategy(StrategyCreate): - DYNAMIC_STRATEGY_ID: ClassVar[str] = "-1" @classmethod @@ -95,7 +93,7 @@ def get_display_name(self) -> str: """ -Originally the strategy was created from a single JSON file and loaded into the +Originally the strategy was created from a single JSON file and loaded into the StrategyConfCreate object. After we create the Strategy object, we reused the JSON file format and convert that into the Strategy object here. """ @@ -104,7 +102,6 @@ def get_display_name(self) -> str: def convert_strategy_conf_create( strategy_conf_create: StrategyConfCreate, ) -> StrategyCreate: - if strategy_conf_create.intention_options is not None: intention_model_name = strategy_conf_create.intention_options.get( "model_name", None diff --git a/src/leettools/core/strategy/schemas/strategy_conf.py b/src/leettools/core/strategy/schemas/strategy_conf.py index 5691dce..41aea95 100644 --- a/src/leettools/core/strategy/schemas/strategy_conf.py +++ b/src/leettools/core/strategy/schemas/strategy_conf.py @@ -93,7 +93,6 @@ class StrategyConfBase(BaseModel): class StrategyConfCreate(StrategyConfBase): - strategy_name: str = Field( ..., description="The strategy name, required to be unique for a user." 
) diff --git a/src/leettools/core/strategy/schemas/strategy_display_settings.py b/src/leettools/core/strategy/schemas/strategy_display_settings.py index fbd50f7..e84d4fe 100644 --- a/src/leettools/core/strategy/schemas/strategy_display_settings.py +++ b/src/leettools/core/strategy/schemas/strategy_display_settings.py @@ -23,7 +23,7 @@ class StrategyOptionItemDisplay(BaseModel): ) value_type: Optional[str] = Field( "str", - description="The type of the value," "currently support str, int, float, bool.", + description="The type of the value,currently support str, int, float, bool.", ) default_value: Optional[str] = Field( None, description="The default value of the variable." diff --git a/src/leettools/core/strategy/schemas/strategy_section.py b/src/leettools/core/strategy/schemas/strategy_section.py index afcabb0..f6e8deb 100644 --- a/src/leettools/core/strategy/schemas/strategy_section.py +++ b/src/leettools/core/strategy/schemas/strategy_section.py @@ -6,7 +6,6 @@ class StrategySection(BaseModel): - # the name of the section, should correspond to a step name section_name: StrategySectionName diff --git a/src/leettools/core/user/_impl/duckdb/user_store_duckdb.py b/src/leettools/core/user/_impl/duckdb/user_store_duckdb.py index 4701ead..bac20ef 100644 --- a/src/leettools/core/user/_impl/duckdb/user_store_duckdb.py +++ b/src/leettools/core/user/_impl/duckdb/user_store_duckdb.py @@ -1,12 +1,10 @@ import uuid -from datetime import datetime from typing import List, Optional from leettools.common import exceptions from leettools.common.duckdb.duckdb_client import DuckDBClient from leettools.common.logging import logger from leettools.common.utils import time_utils -from leettools.context_manager import Context from leettools.core.schemas.user import User, UserCreate, UserInDB, UserUpdate from leettools.core.user._impl.duckdb.user_store_duckdb_schema import UserDuckDBSchema from leettools.core.user.user_store import AbstractUserStore @@ -54,7 +52,7 @@ def 
change_user_balance(self, user_uuid: str, balance_change: int) -> User: return self.get_user_by_uuid(user_uuid) def create_user(self, user_create: UserCreate) -> Optional[User]: - from leettools.context_manager import Context, ContextManager + from leettools.context_manager import ContextManager context = ContextManager().get_context() # type: Context if context.is_test: diff --git a/src/leettools/core/user/user_settings_helper.py b/src/leettools/core/user/user_settings_helper.py index afb4c43..f5a7b56 100644 --- a/src/leettools/core/user/user_settings_helper.py +++ b/src/leettools/core/user/user_settings_helper.py @@ -117,7 +117,7 @@ def _get_value_from_settings_for_svc( if value is not None and value != "": return value - logger().noop(f"Checking admin settings ...", noop_lvl=1) + logger().noop("Checking admin settings ...", noop_lvl=1) admin_user = context.get_user_store().get_user_by_name(User.ADMIN_USERNAME) admin_user_settings = context.get_user_settings_store().get_settings_for_user( admin_user @@ -162,7 +162,6 @@ def _get_value_from_settings_for_cli( second_key: Optional[str] = None, allow_empty: Optional[bool] = False, ) -> str: - env_var_name = f"{ENV_VAR_PREFIX}{default_env.upper()}" value = os.environ.get(env_var_name, None) @@ -187,7 +186,7 @@ def _get_value_from_settings_for_cli( if value is not None and value != "": return value - logger().noop(f"Checking admin settings ...", noop_lvl=1) + logger().noop("Checking admin settings ...", noop_lvl=1) admin_user = context.get_user_store().get_user_by_name(User.ADMIN_USERNAME) admin_user_settings = context.get_user_settings_store().get_settings_for_user( admin_user diff --git a/src/leettools/core/user/user_settings_store.py b/src/leettools/core/user/user_settings_store.py index ce9f525..5a4fc5b 100644 --- a/src/leettools/core/user/user_settings_store.py +++ b/src/leettools/core/user/user_settings_store.py @@ -8,7 +8,6 @@ class AbstractUserSettingsStore(ABC): - @abstractmethod def get_settings_for_user(self, 
user: User) -> UserSettings: """ diff --git a/src/leettools/eds/api_caller/api_utils.py b/src/leettools/eds/api_caller/api_utils.py index 6e3dcd6..9c52925 100644 --- a/src/leettools/eds/api_caller/api_utils.py +++ b/src/leettools/eds/api_caller/api_utils.py @@ -248,7 +248,6 @@ def run_inference_call_direct( if use_parsed: response_str = completion.choices[0].message.parsed.model_dump_json() else: - response_str = completion.choices[0].message.content display_logger.debug(f"Response from inference call\n: {response_str}") if need_json: @@ -289,7 +288,7 @@ def run_inference_call_direct( if match: response_str = match.group(0) else: - display_logger.debug(f"No items found in response.") + display_logger.debug("No items found in response.") display_logger.debug(f"Clean up: {response_str}") @@ -375,7 +374,6 @@ def get_openai_embedder_client_for_user( def get_default_inference_api_provider_config( context: Context, user: Optional[User] = None ) -> APIProviderConfig: - if user is None: user = User.get_admin_user() @@ -422,7 +420,6 @@ def get_default_inference_api_provider_config( def get_default_embed_api_provider_config( context: Context, user: Optional[User] = None ) -> APIProviderConfig: - if user is None: user = User.get_admin_user() @@ -488,7 +485,6 @@ def get_default_embed_api_provider_config( def get_default_rerank_api_provider_config( context: Context, user: Optional[User] = None ) -> APIProviderConfig: - if user is None: user = User.get_admin_user() @@ -540,7 +536,6 @@ def get_openai_client_for_user( api_provider_config: Optional[APIProviderConfig] = None, display_logger: Optional[EventLogger] = None, ) -> OpenAI: - if display_logger is None: display_logger = logger() diff --git a/src/leettools/eds/extract/_impl/duckdb/extract_store_duckdb.py b/src/leettools/eds/extract/_impl/duckdb/extract_store_duckdb.py index 5225e09..34d40a6 100644 --- a/src/leettools/eds/extract/_impl/duckdb/extract_store_duckdb.py +++ 
b/src/leettools/eds/extract/_impl/duckdb/extract_store_duckdb.py @@ -1,4 +1,3 @@ -from datetime import datetime from typing import Any, Dict, List, Optional, Type, Union from leettools.common.duckdb.duckdb_client import DuckDBClient @@ -28,7 +27,6 @@ class ExtractStoreDuckdb(AbstractExtractStore): - def __init__( self, context: Context, diff --git a/src/leettools/eds/metadata/_impl/duckdb/kb_metadata_manager_duckdb.py b/src/leettools/eds/metadata/_impl/duckdb/kb_metadata_manager_duckdb.py index 93da5c8..fa4074b 100644 --- a/src/leettools/eds/metadata/_impl/duckdb/kb_metadata_manager_duckdb.py +++ b/src/leettools/eds/metadata/_impl/duckdb/kb_metadata_manager_duckdb.py @@ -1,6 +1,5 @@ import json import time -from datetime import datetime from typing import Any, Dict, List, Optional import leettools.common.utils.url_utils @@ -25,7 +24,6 @@ class KBMetadataManagerDuckDB(AbstractKBMetadataManager): """ def __init__(self, context: Context): - self.logger = get_logger(name="scheduler") self.settings = context.settings diff --git a/src/leettools/eds/metadata/schemas/kb_metadata.py b/src/leettools/eds/metadata/schemas/kb_metadata.py index 832d239..83c8878 100644 --- a/src/leettools/eds/metadata/schemas/kb_metadata.py +++ b/src/leettools/eds/metadata/schemas/kb_metadata.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass from datetime import datetime -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Optional from pydantic import BaseModel diff --git a/src/leettools/eds/pipeline/chunk/_impl/chunker_simple.py b/src/leettools/eds/pipeline/chunk/_impl/chunker_simple.py index 7d6cc6b..06b5b2c 100644 --- a/src/leettools/eds/pipeline/chunk/_impl/chunker_simple.py +++ b/src/leettools/eds/pipeline/chunk/_impl/chunker_simple.py @@ -3,7 +3,6 @@ from typing import List from leettools.common import exceptions -from leettools.common.logging import logger from leettools.common.utils.tokenizer import Tokenizer from 
leettools.core.schemas.chunk import Chunk from leettools.eds.pipeline.chunk.chunker import AbstractChunker diff --git a/src/leettools/eds/pipeline/convert/_impl/converter_local.py b/src/leettools/eds/pipeline/convert/_impl/converter_local.py index cd066ca..94be56c 100644 --- a/src/leettools/eds/pipeline/convert/_impl/converter_local.py +++ b/src/leettools/eds/pipeline/convert/_impl/converter_local.py @@ -214,7 +214,7 @@ def convert(self) -> ReturnCode: else: return rtn_code return rtn_code - except Exception as e: + except Exception: trace = traceback.format_exc() logger().error(f"Error converting document: {trace}") return ReturnCode.FAILURE diff --git a/src/leettools/eds/pipeline/convert/_impl/converter_utils.py b/src/leettools/eds/pipeline/convert/_impl/converter_utils.py index 3bf1e99..bbe7a28 100644 --- a/src/leettools/eds/pipeline/convert/_impl/converter_utils.py +++ b/src/leettools/eds/pipeline/convert/_impl/converter_utils.py @@ -1,11 +1,11 @@ from leettools.settings import SystemSettings TABLE_PROMPT = """ -Given the following piece of text obtained from a PDF file, which represents a table, +Given the following piece of text obtained from a PDF file, which represents a table, please return a table in markdown format without changing its content. If it is not a table, then return the text as is. Don't return anything else. -If the first line of this text starts with something like this: "| 1.1.1 introduction column | ...", +If the first line of this text starts with something like this: "| 1.1.1 introduction column | ...", please extract "1.1.1 introduction" as a heading in the markdown format and put it at the beginning of the table. The "column" should be treated as part of the table header. So "| 1.1.1 introduction column | ..." will be converted to: @@ -21,8 +21,8 @@ """ TITLE_PROMPT = """ -Given the following first few lines of text obtained from a PDF file. 
-Please extract the title and return it in markdown format, +Given the following first few lines of text obtained from a PDF file. +Please extract the title and return it in markdown format, remembering to add only one # in front of the title text: # Some Title @@ -67,7 +67,7 @@ Some introduction text ``` -If the first line is something like "White Paper", or "User Manual", or "Product Manual", +If the first line is something like "White Paper", or "User Manual", or "Product Manual", then the title to be extracted is just this first line: "White Paper" or "User Manual" or "Product Manual. It means in this case, you only need to look into the first line to get the purpose of this article. ---------------- @@ -79,7 +79,7 @@ At company A, we are committed to providing the best products and services to our customers... ``` -In this case, the title to be extracted is "Company A: A leading company in the xxx industry", +In this case, the title to be extracted is "Company A: A leading company in the xxx industry", which means you need to look into the company name and the followed description of this company. ---------------- diff --git a/src/leettools/eds/pipeline/convert/_impl/parser_html.py b/src/leettools/eds/pipeline/convert/_impl/parser_html.py index f226502..03941a2 100644 --- a/src/leettools/eds/pipeline/convert/_impl/parser_html.py +++ b/src/leettools/eds/pipeline/convert/_impl/parser_html.py @@ -1,4 +1,4 @@ -""" Module to convert HTML to Markdown. 
""" +"""Module to convert HTML to Markdown.""" import click import markdownify @@ -70,8 +70,7 @@ def _filter_and_group_paragraphs_with_headings(self, text: str) -> str: for line in lines: if ( # TODO: This logic needs to be improved - len(line.split()) - > 3 + len(line.split()) > 3 ): # Consider lines with more than 3 words as meaningful current_paragraph.append(line) elif current_paragraph: @@ -83,9 +82,7 @@ def _filter_and_group_paragraphs_with_headings(self, text: str) -> str: else: # Numbered headings for subsequent paragraphs paragraphs.append(f"{joined_paragraph}") paragraph_number += 1 - current_paragraph = ( - [] - ) # Start a new paragraph for the next set of meaningful lines + current_paragraph = [] # Start a new paragraph for the next set of meaningful lines # Ensure the last paragraph is added if it wasn't ended by a short line if current_paragraph: diff --git a/src/leettools/eds/pipeline/convert/_impl/parser_llmsherpa.py b/src/leettools/eds/pipeline/convert/_impl/parser_llmsherpa.py index 79bbd1e..8f1a794 100644 --- a/src/leettools/eds/pipeline/convert/_impl/parser_llmsherpa.py +++ b/src/leettools/eds/pipeline/convert/_impl/parser_llmsherpa.py @@ -6,7 +6,6 @@ from re import Match from typing import Optional -import click import urllib3 from llmsherpa.readers import ( Block, @@ -19,7 +18,6 @@ ) from leettools.common.logging import logger -from leettools.context_manager import Context, ContextManager from leettools.eds.pipeline.convert._impl import converter_utils from leettools.eds.pipeline.convert.parser import AbstractParser from leettools.settings import SystemSettings @@ -188,7 +186,7 @@ def pdf2md(self, pdf_filepath: str, target_path: Optional[Path] = None) -> str: """ try: doc = self.pdf_reader.read_pdf(pdf_filepath) - except Exception as e: + except Exception: trace = traceback.format_exc() logger().error(f"Failed to parsePDF file {pdf_filepath}, error: {trace}") return "" @@ -212,7 +210,7 @@ def docx2md(self, docx_filepath: str, target_path: 
Optional[Path] = None) -> str response_json = json.loads(parser_response.data.decode("utf-8")) blocks = response_json["return_dict"]["result"]["blocks"] return self._traversal_doc(Document(blocks).root_node, target_path) - except Exception as e: + except Exception: trace = traceback.format_exc() logger().error(f"Failed to parse file {docx_filepath}, error: {trace}") return "" @@ -228,7 +226,7 @@ def pptx2md(self, pptx_path: str, target_path: Optional[Path] = None) -> str: response_json = json.loads(parser_response.data.decode("utf-8")) blocks = response_json["return_dict"]["result"]["blocks"] return self._traversal_doc(Document(blocks).root_node, target_path) - except Exception as e: + except Exception: trace = traceback.format_exc() logger().error(f"Failed to parse file {pptx_path}, error: {trace}") return "" @@ -244,7 +242,7 @@ def xlsx2md(self, xlsx_path: str, target_path: Optional[Path] = None) -> str: response_json = json.loads(parser_response.data.decode("utf-8")) blocks = response_json["return_dict"]["result"]["blocks"] return self._traversal_doc(Document(blocks).root_node, target_path) - except Exception as e: + except Exception: trace = traceback.format_exc() logger().error(f"Failed to parse file {xlsx_path}, error: {trace}") return "" diff --git a/src/leettools/eds/pipeline/convert/_impl/parser_unstructured.py b/src/leettools/eds/pipeline/convert/_impl/parser_unstructured.py index 7a1b1b2..1be7c9e 100644 --- a/src/leettools/eds/pipeline/convert/_impl/parser_unstructured.py +++ b/src/leettools/eds/pipeline/convert/_impl/parser_unstructured.py @@ -2,14 +2,12 @@ from pathlib import Path from typing import Optional -import click from unstructured.partition.docx import partition_docx from unstructured.partition.pdf import partition_pdf from unstructured.partition.pptx import partition_pptx from unstructured.partition.xlsx import partition_xlsx from leettools.common.logging import logger -from leettools.context_manager import Context, ContextManager from 
leettools.eds.pipeline.convert._impl import converter_utils from leettools.settings import SystemSettings @@ -46,7 +44,6 @@ def _replacement(self, match: re.Match) -> str: return "#" * level def docx2md(self, docx_filepath: str, target_path: Optional[Path] = None) -> str: - logger().debug(f"Converting DOCX to markdown: {docx_filepath}") try: elements = partition_docx(filename=docx_filepath) @@ -59,7 +56,6 @@ def docx2md(self, docx_filepath: str, target_path: Optional[Path] = None) -> str return "" def pdf2md(self, pdf_filepath: str, target_path: Optional[Path] = None) -> str: - rtn_text = "" elements = partition_pdf( filename=pdf_filepath, strategy="hi_res", check_extractable=False @@ -92,7 +88,6 @@ def pdf2md(self, pdf_filepath: str, target_path: Optional[Path] = None) -> str: return return_text def pptx2md(self, pptx_filepath: str, target_path: Optional[Path] = None) -> str: - logger().debug(f"Converting PPTX to markdown: {pptx_filepath}") try: elements = partition_pptx(filename=pptx_filepath) @@ -106,7 +101,6 @@ def pptx2md(self, pptx_filepath: str, target_path: Optional[Path] = None) -> str return "" def xlsx2md(self, xlsx_filepath: str, target_path: Optional[Path] = None) -> str: - logger().debug(f"Converting XLSX to markdown: {xlsx_filepath}") rtn_text = "" try: diff --git a/src/leettools/eds/pipeline/embed/segement_embedder_hybrid.py b/src/leettools/eds/pipeline/embed/segement_embedder_hybrid.py index 1aeecd2..520d256 100644 --- a/src/leettools/eds/pipeline/embed/segement_embedder_hybrid.py +++ b/src/leettools/eds/pipeline/embed/segement_embedder_hybrid.py @@ -1,7 +1,6 @@ import traceback from typing import List, Optional -from leettools.common import exceptions from leettools.common.logging import logger from leettools.common.logging.event_logger import EventLogger from leettools.context_manager import Context @@ -42,7 +41,7 @@ def _embed( ) -> ReturnCode: rtn_code = ReturnCode.SUCCESS if len(segments) == 0: - display_logger.debug(f"No segments to embed for 
this run.") + display_logger.debug("No segments to embed for this run.") return rtn_code dense_embed_successful = self.dense_vectorstore.save_segments( self.org, self.kb, self.user, segments @@ -67,13 +66,13 @@ def embed_segment_list( display_logger = logger() if segments is None or len(segments) == 0: - display_logger.info(f"No segments to embed for this run.") + display_logger.info("No segments to embed for this run.") return ReturnCode.SUCCESS try: rtn_code = self._embed(segments, display_logger) return rtn_code - except Exception as e: + except Exception: trace = traceback.format_exc() err_str = f"{trace}" if "Please reduce your prompt; or completion length." in err_str: diff --git a/src/leettools/eds/pipeline/embed/segment_embedder_simple.py b/src/leettools/eds/pipeline/embed/segment_embedder_simple.py index 8589d84..0905bcb 100644 --- a/src/leettools/eds/pipeline/embed/segment_embedder_simple.py +++ b/src/leettools/eds/pipeline/embed/segment_embedder_simple.py @@ -33,7 +33,7 @@ def _embed( display_logger: EventLogger, ) -> ReturnCode: if len(segments) == 0: - display_logger.info(f"No segments to embed for this run.") + display_logger.info("No segments to embed for this run.") return ReturnCode.SUCCESS doc_id = segments[0].document_uuid @@ -56,13 +56,13 @@ def embed_segment_list( display_logger = logger() if segments is None or len(segments) == 0: - display_logger.info(f"No segments to embed for this run.") + display_logger.info("No segments to embed for this run.") return ReturnCode.SUCCESS try: rtn_code = self._embed(segments, display_logger) return rtn_code - except Exception as e: + except Exception: trace = traceback.format_exc() display_logger.error(f"Error embedding document: {trace}") return ReturnCode.FAILURE diff --git a/src/leettools/eds/pipeline/ingest/_impl/connector_simple.py b/src/leettools/eds/pipeline/ingest/_impl/connector_simple.py index 10777e6..693e716 100644 --- a/src/leettools/eds/pipeline/ingest/_impl/connector_simple.py +++ 
b/src/leettools/eds/pipeline/ingest/_impl/connector_simple.py @@ -362,7 +362,7 @@ def _ingest_url(self, url: str) -> ScrapeResult: # check if the URL is a valid http or https URL if not url_str.startswith("http://") and not url_str.startswith("https://"): self.display_logger.warning( - f"URL not starting with http:// or https://, adding https:// as default." + "URL not starting with http:// or https://, adding https:// as default." ) url_str = f"https://{url_str}" @@ -688,7 +688,7 @@ def ingest(self) -> ReturnCode: try: rtn_code = self._ingest() return rtn_code - except Exception as e: + except Exception: trace = traceback.format_exc() self.display_logger.error(f"Error ingesting docsource: {trace}") return ReturnCode.FAILURE diff --git a/src/leettools/eds/pipeline/ingest/_impl/notion/schemas/block.py b/src/leettools/eds/pipeline/ingest/_impl/notion/schemas/block.py index fda3b1d..171d2fe 100644 --- a/src/leettools/eds/pipeline/ingest/_impl/notion/schemas/block.py +++ b/src/leettools/eds/pipeline/ingest/_impl/notion/schemas/block.py @@ -1,4 +1,3 @@ -import os from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional, Union @@ -385,7 +384,6 @@ def to_text(self) -> str: class Block(BaseModel): - object: str = "block" id: str parent: AnyParent diff --git a/src/leettools/eds/pipeline/ingest/_impl/notion/schemas/rich_text.py b/src/leettools/eds/pipeline/ingest/_impl/notion/schemas/rich_text.py index c5f4a4d..73eee0b 100644 --- a/src/leettools/eds/pipeline/ingest/_impl/notion/schemas/rich_text.py +++ b/src/leettools/eds/pipeline/ingest/_impl/notion/schemas/rich_text.py @@ -2,15 +2,19 @@ from typing import Any, Dict, Optional, Union from .mention import MentionObject + class EquationObject(BaseModel): expression: str + class TextObject(BaseModel): content: str link: Optional[Dict[str, str]] = None + AnyRichText = Union[TextObject, MentionObject, EquationObject] + class Annotations(BaseModel): bold: bool = False italic: bool = False @@ -19,6 +23,7 @@ 
class Annotations(BaseModel): code: bool = False color: Optional[str] = None + class RichTextObject(BaseModel): type: str = "text" rich_text: Optional[AnyRichText] = None diff --git a/src/leettools/eds/pipeline/ingest/connector.py b/src/leettools/eds/pipeline/ingest/connector.py index bc01c74..5c86d55 100644 --- a/src/leettools/eds/pipeline/ingest/connector.py +++ b/src/leettools/eds/pipeline/ingest/connector.py @@ -12,7 +12,6 @@ class AbstractConnector(ABC): - @abstractmethod def __init__( self, diff --git a/src/leettools/eds/pipeline/split/splitter.py b/src/leettools/eds/pipeline/split/splitter.py index 20bfbd5..5d313d3 100644 --- a/src/leettools/eds/pipeline/split/splitter.py +++ b/src/leettools/eds/pipeline/split/splitter.py @@ -1,7 +1,6 @@ import os import re import traceback -from datetime import datetime from pathlib import Path from typing import Dict, List, Optional, Tuple from urllib.parse import unquote @@ -64,17 +63,17 @@ def separate_heading_from_content(content: str) -> Tuple[str, str]: """ CONTEXTUAL_RETRIEVAL_USER_PROMPT = """ - -{document_content} - -Here is the chunk we want to situate within the whole document - -{chunk_content} - -Please give a short succinct context to situate this chunk within the overall document -for the purposes of improving search retrieval of the chunk. Answer only with the -succinct context and nothing else. -The response should be in the following json format (Make sure the "document", "paragraph_id", "summary" + +{document_content} + +Here is the chunk we want to situate within the whole document + +{chunk_content} + +Please give a short succinct context to situate this chunk within the overall document +for the purposes of improving search retrieval of the chunk. Answer only with the +succinct context and nothing else. 
+The response should be in the following json format (Make sure the "document", "paragraph_id", "summary" and "other_info" are in the same language as the chunk itself): {{ "document": "the document name", @@ -327,7 +326,7 @@ def split( try: rtn_code = self._split(doc) return rtn_code - except Exception as e: + except Exception: trace = traceback.format_exc() err_str = f"{trace}" if "Number of parts exceeds the number of words in the text" in err_str: diff --git a/src/leettools/eds/rag/inference/_impl/inference_dynamic.py b/src/leettools/eds/rag/inference/_impl/inference_dynamic.py index 9c1f91d..dde2f9a 100644 --- a/src/leettools/eds/rag/inference/_impl/inference_dynamic.py +++ b/src/leettools/eds/rag/inference/_impl/inference_dynamic.py @@ -18,7 +18,6 @@ class InferenceDynamic(AbstractInference, APICallerBase): - def __init__( self, context: Context, @@ -38,7 +37,6 @@ def inference( query_metadata: ChatQueryMetadata, template_vars: Dict[str, str], ) -> Tuple[str, ChatCompletion]: - # we allow different prompts for different intentions self.setup_prompts_for_intention(query_metadata) @@ -81,10 +79,10 @@ def inference( if user_prompt_template is None: # should be in sync with src/leettools/strategy/default/inference_up_default.txt user_prompt_template = """ -{{ context_presentation }}, please answer the following question +{{ context_presentation }}, please answer the following question {{ lang_instruction }}. {{ word_count_instruction }}. {{ reference_instruction }} -If the context does not provide enough information to answer the question, please answer +If the context does not provide enough information to answer the question, please answer {{ out_of_context_instruction }} {{ lang_instruction }}. 
diff --git a/src/leettools/eds/rag/inference/inference.py b/src/leettools/eds/rag/inference/inference.py index 64cb03e..c4dc309 100644 --- a/src/leettools/eds/rag/inference/inference.py +++ b/src/leettools/eds/rag/inference/inference.py @@ -14,7 +14,6 @@ class AbstractInference(ABC): - @abstractmethod def inference( self, diff --git a/src/leettools/eds/rag/intention/_impl/intention_getter_dynamic.py b/src/leettools/eds/rag/intention/_impl/intention_getter_dynamic.py index 2f83fc7..fc5ab25 100644 --- a/src/leettools/eds/rag/intention/_impl/intention_getter_dynamic.py +++ b/src/leettools/eds/rag/intention/_impl/intention_getter_dynamic.py @@ -31,7 +31,6 @@ def __init__( intention_section: StrategySection, event_logger: Optional[EventLogger] = None, ) -> None: - self.setup_with_strategy( context, user, intention_section, _script_dir, event_logger ) @@ -66,7 +65,7 @@ def get_intention(self, question: str) -> ChatQueryMetadata: system_prompt=system_prompt, user_prompt=user_prompt ) return ChatQueryMetadata.model_validate_json(response_str) - except Exception as e: + except Exception: if response_str is not None: self.display_logger.error( f"ModelValidating ChatQueryMetadata failed: {response_str}" diff --git a/src/leettools/eds/rag/intention/intention_getter.py b/src/leettools/eds/rag/intention/intention_getter.py index 85edd9e..61ad51e 100644 --- a/src/leettools/eds/rag/intention/intention_getter.py +++ b/src/leettools/eds/rag/intention/intention_getter.py @@ -10,7 +10,6 @@ class AbstractIntentionGetter(ABC): - @abstractmethod def get_intention(self, query: str) -> ChatQueryMetadata: """ diff --git a/src/leettools/eds/rag/rerank/_impl/reranker_cohere.py b/src/leettools/eds/rag/rerank/_impl/reranker_cohere.py index ad91ac4..3871189 100644 --- a/src/leettools/eds/rag/rerank/_impl/reranker_cohere.py +++ b/src/leettools/eds/rag/rerank/_impl/reranker_cohere.py @@ -16,7 +16,6 @@ class RerankerCohere(AbstractReranker, APICallerBase): - def __init__( self, context: Context, @@ 
-43,7 +42,6 @@ def rerank( top_k: int, rerank_options: Dict[str, Any] = None, ) -> RerankResult: - import cohere logger().info(f"Calling cohere reranker for query {query}") diff --git a/src/leettools/eds/rag/rerank/_impl/reranker_dummy.py b/src/leettools/eds/rag/rerank/_impl/reranker_dummy.py index 7433811..357db3f 100644 --- a/src/leettools/eds/rag/rerank/_impl/reranker_dummy.py +++ b/src/leettools/eds/rag/rerank/_impl/reranker_dummy.py @@ -11,7 +11,6 @@ class RerankerDummy(AbstractReranker): - def __init__( self, context: Context, diff --git a/src/leettools/eds/rag/rerank/reranker.py b/src/leettools/eds/rag/rerank/reranker.py index 8d5872f..687a5e5 100644 --- a/src/leettools/eds/rag/rerank/reranker.py +++ b/src/leettools/eds/rag/rerank/reranker.py @@ -10,7 +10,6 @@ class AbstractReranker(ABC): - @abstractmethod def __init__( self, @@ -41,7 +40,6 @@ def create_reranker_by_strategy( rerank_section: StrategySection, display_logger: EventLogger, ) -> AbstractReranker: - from leettools.common.utils import factory_util settings = context.settings diff --git a/src/leettools/eds/rag/rewrite/_impl/rewrite_direct_dynamic.py b/src/leettools/eds/rag/rewrite/_impl/rewrite_direct_dynamic.py index 1c181f6..f5aa6ea 100644 --- a/src/leettools/eds/rag/rewrite/_impl/rewrite_direct_dynamic.py +++ b/src/leettools/eds/rag/rewrite/_impl/rewrite_direct_dynamic.py @@ -24,7 +24,6 @@ get_query_rewriter_by_strategy, ) from leettools.eds.rag.schemas.rewrite import Rewrite -from leettools.flow.exec_info import ExecInfo from leettools.flow.utils import prompt_utils _script_dir = os.path.dirname(os.path.abspath(__file__)) @@ -49,7 +48,6 @@ def rewrite( query_item: ChatQueryItem, query_metadata: ChatQueryMetadata, ) -> Rewrite: - self.setup_prompts_for_intention(query_metadata) query = query_item.query_content @@ -88,7 +86,7 @@ def rewrite( system_prompt=system_prompt, user_prompt=user_prompt ) return Rewrite.model_validate_json(response_str) - except Exception as e: + except Exception: if 
response_str is not None: self.display_logger.error( f"ModelValidating Rewrite failed: {response_str}" diff --git a/src/leettools/eds/rag/rewrite/_impl/rewrite_keywords_dynamic.py b/src/leettools/eds/rag/rewrite/_impl/rewrite_keywords_dynamic.py index 6206497..0ac6a3e 100644 --- a/src/leettools/eds/rag/rewrite/_impl/rewrite_keywords_dynamic.py +++ b/src/leettools/eds/rag/rewrite/_impl/rewrite_keywords_dynamic.py @@ -33,7 +33,6 @@ ) from leettools.eds.rag.schemas.rewrite import Rewrite from leettools.eds.rag.search._impl.searcher_hybrid import SearcherHybrid -from leettools.flow.exec_info import ExecInfo from leettools.flow.utils import prompt_utils _script_dir = os.path.dirname(os.path.abspath(__file__)) @@ -62,13 +61,13 @@ def _get_context( Get the keywords from the query metadata and use them to search the knowledgebase for the related context. Use the retrieved context as the context for the rewrite. """ - if query_metadata.keywords is not None and query_metadata.keywords is not []: + if query_metadata.keywords is not None and query_metadata.keywords != []: all_keywords: set[str] = set() for keyword in query_metadata.keywords: all_keywords.add(keyword) if ( query_metadata.entities is not None - and query_metadata.entities is not [] + and query_metadata.entities != [] ): for entity in query_metadata.entities: all_keywords.add(entity) @@ -145,7 +144,6 @@ def rewrite( query_item: ChatQueryItem, query_metadata: ChatQueryMetadata, ) -> Rewrite: - self.setup_prompts_for_intention(query_metadata) query = query_item.query_content @@ -186,7 +184,7 @@ def rewrite( system_prompt=system_prompt, user_prompt=user_prompt ) return Rewrite.model_validate_json(response_str) - except Exception as e: + except Exception: if response_str is not None: self.display_logger.error( f"ModelValidating Rewrite failed: {response_str}" diff --git a/src/leettools/eds/rag/rewrite/rewrite.py b/src/leettools/eds/rag/rewrite/rewrite.py index d144ca7..e0b939f 100644 --- 
a/src/leettools/eds/rag/rewrite/rewrite.py +++ b/src/leettools/eds/rag/rewrite/rewrite.py @@ -11,11 +11,9 @@ from leettools.core.schemas.user import User from leettools.core.strategy.schemas.strategy_section import StrategySection from leettools.eds.rag.schemas.rewrite import Rewrite -from leettools.flow.exec_info import ExecInfo class AbstractQueryRewriter(ABC): - @abstractmethod def rewrite( self, diff --git a/src/leettools/eds/rag/schemas/rewrite.py b/src/leettools/eds/rag/schemas/rewrite.py index c0b002c..b9880eb 100644 --- a/src/leettools/eds/rag/schemas/rewrite.py +++ b/src/leettools/eds/rag/schemas/rewrite.py @@ -4,6 +4,5 @@ class Rewrite(BaseModel): - rewritten_question: str search_keywords: Optional[str] = None diff --git a/src/leettools/eds/rag/search/_impl/searcher_bm25_dense.py b/src/leettools/eds/rag/search/_impl/searcher_bm25_dense.py index ec4c816..cfbca17 100644 --- a/src/leettools/eds/rag/search/_impl/searcher_bm25_dense.py +++ b/src/leettools/eds/rag/search/_impl/searcher_bm25_dense.py @@ -13,7 +13,6 @@ VectorSearchResult, VectorType, create_vector_store_dense, - create_vector_store_sparse, ) from leettools.core.schemas.chat_query_metadata import ChatQueryMetadata from leettools.core.schemas.knowledgebase import KnowledgeBase @@ -53,7 +52,6 @@ def initialize_nltk(cls) -> None: @classmethod def extract_keywords(cls, sentence: str) -> str: - cls.initialize_nltk() # Tokenize the sentence @@ -247,11 +245,11 @@ def execute_kb_search( f"Found {len(results_from_dense_vector)} from dense vector search." 
) - logger().info(f"Extracting keywords from query...") + logger().info("Extracting keywords from query...") if ( query_meta is not None and query_meta.keywords is not None - and query_meta.keywords is not [] + and query_meta.keywords != [] ): keyword_query = "|".join(query_meta.keywords) logger().info(f"keyword_query from metadata: {keyword_query}") @@ -260,7 +258,7 @@ def execute_kb_search( keyword_query = self.extract_keywords(query) logger().info(f"keyword_query from original query: {keyword_query}") - logger().info(f"Performing BM25 search ...") + logger().info("Performing BM25 search ...") try: results_from_sparse_vector: List[VectorSearchResult] = ( self.dense_vectorstore.search_in_kb( diff --git a/src/leettools/eds/rag/search/_impl/searcher_hybrid.py b/src/leettools/eds/rag/search/_impl/searcher_hybrid.py index f913514..4c498f7 100644 --- a/src/leettools/eds/rag/search/_impl/searcher_hybrid.py +++ b/src/leettools/eds/rag/search/_impl/searcher_hybrid.py @@ -52,7 +52,6 @@ def initialize_nltk(cls) -> None: @classmethod def extract_keywords(cls, sentence: str) -> str: - cls.initialize_nltk() # Tokenize the sentence @@ -282,11 +281,11 @@ def execute_kb_search( ) # keyword_query = self._extract_keywords(query) - logger().info(f"Extracting keywords from query...") + logger().info("Extracting keywords from query...") if ( query_meta is not None and query_meta.keywords is not None - and query_meta.keywords is not [] + and query_meta.keywords != [] ): keyword_query = "|".join(query_meta.keywords) logger().info(f"keyword_query from metadata: {keyword_query}") @@ -295,7 +294,7 @@ def execute_kb_search( keyword_query = self.extract_keywords(query) logger().info(f"keyword_query from original query: {keyword_query}") - logger().info(f"Searching Sparse Vector...") + logger().info("Searching Sparse Vector...") try: results_from_sparse_vector: List[VectorSearchResult] = ( self.sparse_vectorstore.search_in_kb( diff --git a/src/leettools/eds/rag/search/filter_duckdb.py 
b/src/leettools/eds/rag/search/filter_duckdb.py index 4f1e9c0..9b81927 100644 --- a/src/leettools/eds/rag/search/filter_duckdb.py +++ b/src/leettools/eds/rag/search/filter_duckdb.py @@ -4,7 +4,7 @@ def _convert_condition_to_duckdb( - condition: Union[BaseCondition, Filter] + condition: Union[BaseCondition, Filter], ) -> Tuple[str, List[str], List[Any]]: if isinstance(condition, Filter): return to_duckdb_filter(condition) @@ -28,7 +28,7 @@ def _convert_condition_to_duckdb( def to_duckdb_filter( - filter: Union[Filter, BaseCondition] + filter: Union[Filter, BaseCondition], ) -> Tuple[str, List[str], List[Any]]: """ Convert a Filter to a DuckDB filter expression. diff --git a/src/leettools/eds/rag/search/searcher.py b/src/leettools/eds/rag/search/searcher.py index 3aee42c..7c46b7c 100644 --- a/src/leettools/eds/rag/search/searcher.py +++ b/src/leettools/eds/rag/search/searcher.py @@ -10,7 +10,6 @@ from leettools.context_manager import Context, ContextManager from leettools.core.consts.segment_embedder_type import SegmentEmbedderType from leettools.core.schemas.chat_query_metadata import ( - DEFAULT_INTENTION, ChatQueryMetadata, ) from leettools.core.schemas.knowledgebase import KnowledgeBase @@ -22,7 +21,6 @@ class AbstractSearcher(ABC): - @abstractmethod def execute_kb_search( self, diff --git a/src/leettools/eds/scheduler/_impl/scheduler_simple.py b/src/leettools/eds/scheduler/_impl/scheduler_simple.py index e250138..97a03d2 100644 --- a/src/leettools/eds/scheduler/_impl/scheduler_simple.py +++ b/src/leettools/eds/scheduler/_impl/scheduler_simple.py @@ -71,9 +71,9 @@ def _current_task_info(self) -> str: def _clear_tasks(self) -> None: self.logger.info("Clearing tasks in the queue.") - assert ( - self.status != SchedulerStatus.RUNNING - ), f"Scheduler status is {self.status} while trying to clear tasks." + assert self.status != SchedulerStatus.RUNNING, ( + f"Scheduler status is {self.status} while trying to clear tasks." 
+ ) with self.lock: self.logger.noop("Inside the lock ...", noop_lvl=3) if self.task_queue.qsize() > 0: @@ -148,9 +148,9 @@ def _init_load_tasks(self) -> None: """ This function is called when the scheduler is started or resumed. """ - assert ( - self.status != SchedulerStatus.RUNNING - ), f"Scheduler status is {self.status} while trying to reload tasks." + assert self.status != SchedulerStatus.RUNNING, ( + f"Scheduler status is {self.status} while trying to reload tasks." + ) with self.lock: self.logger.noop("Inside the lock ...", noop_lvl=3) todo_tasks = self.task_scanner.scan_kb_for_tasks( @@ -192,9 +192,9 @@ def _update_tasks(self) -> None: This function is called with a specified interval to update the tasks. Logging inside this function should be very careful. """ - assert ( - self.status == SchedulerStatus.RUNNING - ), f"Scheduler status is {self.status} while trying to load tasks." + assert self.status == SchedulerStatus.RUNNING, ( + f"Scheduler status is {self.status} while trying to load tasks." + ) with self.lock: self.logger.noop("Inside the lock ...", noop_lvl=3) @@ -237,9 +237,9 @@ def _task_loader(self, interval: int) -> None: self.logger.info( f"Starting the task loader thread with interval {interval} seconds." ) - assert ( - self.status == SchedulerStatus.RUNNING - ), f"Scheduler status is {self.status} while trying to load tasks." + assert self.status == SchedulerStatus.RUNNING, ( + f"Scheduler status is {self.status} while trying to load tasks." 
+ ) # we only print out log once every 1 minute logging_count = int(60 / interval) while self.status == SchedulerStatus.RUNNING: @@ -261,7 +261,7 @@ def _task_loader(self, interval: int) -> None: self.logger.error(f"Error in the task loader: {tb_str}") raise e - self.logger.info(f"The task loader thread is finished.") + self.logger.info("The task loader thread is finished.") def _check_cooldown_queue(self) -> None: base_delay = self.context.settings.scheduler_base_delay_in_seconds @@ -274,9 +274,9 @@ def _check_cooldown_queue(self) -> None: f"There are {self.cooldown_queue.qsize()} jobs in cooldown queue." ) job = self.cooldown_queue.get() - assert ( - job.job_status == JobStatus.FAILED - ), f"Checking the delay time for a failed task {job.task_uuid}." + assert job.job_status == JobStatus.FAILED, ( + f"Checking the delay time for a failed task {job.task_uuid}." + ) if job.retry_count > max_retries: self.logger.warning( f"Job {job.job_uuid} has reached the max retry count {max_retries}." @@ -478,7 +478,7 @@ def _worker(self, id: int): self.logger.debug(f"Executing job_uuid {job.job_uuid} ...") if job_is_executable: self._worker_executes_job(id, job) - except Exception as e: + except Exception: tb_str = traceback.format_exc() self.logger.error(f"[{id}]Critical error in the worker: {tb_str}") finally: @@ -494,12 +494,12 @@ def _worker(self, id: int): def _start_workers(self) -> None: self.logger.info("Starting the task loader and workers in the scheduler.") - assert ( - self.status == SchedulerStatus.RUNNING - ), f"Scheduler status is {self.status} while trying to start workers." - assert ( - self.task_loader == None - ), f"The task loader is not None while trying to start workers." + assert self.status == SchedulerStatus.RUNNING, ( + f"Scheduler status is {self.status} while trying to start workers." + ) + assert self.task_loader == None, ( + "The task loader is not None while trying to start workers." 
+ ) self.task_loader = self.threadpool.submit(self._task_loader, interval=3) for id in range(self.num_of_workers): self.workers[id] = self.threadpool.submit(self._worker, id) @@ -510,9 +510,9 @@ def _start_workers(self) -> None: def _stop_workers(self, force: bool = False) -> None: self.logger.info("Stopping the task loader and workers in the scheduler.") - assert ( - self.status != SchedulerStatus.RUNNING - ), f"Scheduler status is {self.status} while trying to stop workers." + assert self.status != SchedulerStatus.RUNNING, ( + f"Scheduler status is {self.status} while trying to stop workers." + ) if self.task_loader is not None: if self.task_loader.running(): self.task_loader.result() @@ -536,12 +536,12 @@ def _stop_workers(self, force: bool = False) -> None: self.task_queue.put(None) def _check_workers_done(self) -> bool: - assert ( - self.status != SchedulerStatus.RUNNING - ), f"Scheduler status is {self.status} while trying to check workers done." + assert self.status != SchedulerStatus.RUNNING, ( + f"Scheduler status is {self.status} while trying to check workers done." 
+ ) done = True if self.task_loader is not None and self.task_loader.running(): - self.logger.info(f"Task loader is running.") + self.logger.info("Task loader is running.") done = False for id in range(self.num_of_workers): diff --git a/src/leettools/eds/scheduler/_impl/task_runner_eds.py b/src/leettools/eds/scheduler/_impl/task_runner_eds.py index a1aee77..e4161d8 100644 --- a/src/leettools/eds/scheduler/_impl/task_runner_eds.py +++ b/src/leettools/eds/scheduler/_impl/task_runner_eds.py @@ -185,7 +185,6 @@ def _run_splitter(self, split_program: SplitProgramSpec, job: Job) -> ReturnCode return rnt_code def run_job(self, job: Job) -> Job: - job.job_status = JobStatus.RUNNING job = self.jobstore.update_job_status(job.job_uuid, job.job_status) diff --git a/src/leettools/eds/scheduler/_impl/task_scanner_kb.py b/src/leettools/eds/scheduler/_impl/task_scanner_kb.py index fb39b0a..b81a82a 100644 --- a/src/leettools/eds/scheduler/_impl/task_scanner_kb.py +++ b/src/leettools/eds/scheduler/_impl/task_scanner_kb.py @@ -38,7 +38,6 @@ class TaskScannerKB(AbstractTaskScanner): """ def __init__(self, context: Context): - self.logger = get_logger(name="scheduler") self.repo_manager = context.get_repo_manager() @@ -240,7 +239,6 @@ def _add_tasks_for_docsource( current_tasks = self.taskstore.get_tasks_for_docsource(docsource.docsource_uuid) for program_type, program_spec in program_dict.items(): - task = None for t in current_tasks: if t.program_spec.program_type == program_type: @@ -517,7 +515,6 @@ def _docsource_in_cur_tasks( continue def _need_to_check_docsource() -> bool: - if schedule_config.schedule_type == ScheduleType.RECURRING: self.logger.debug( f"Found recurring docsource to be scanned: {dssig}" diff --git a/src/leettools/eds/scheduler/scheduler.py b/src/leettools/eds/scheduler/scheduler.py index 1751143..21a8c8a 100644 --- a/src/leettools/eds/scheduler/scheduler.py +++ b/src/leettools/eds/scheduler/scheduler.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from 
typing import Dict, List, Optional +from typing import Dict, List from leettools.context_manager import Context from leettools.core.schemas.docsource import DocSource diff --git a/src/leettools/eds/scheduler/scheduler_manager.py b/src/leettools/eds/scheduler/scheduler_manager.py index 4f3e1f4..005f2f3 100644 --- a/src/leettools/eds/scheduler/scheduler_manager.py +++ b/src/leettools/eds/scheduler/scheduler_manager.py @@ -123,7 +123,6 @@ class SingletonMetaSchedule(SingletonMeta): class SchedulerManager(metaclass=SingletonMetaSchedule): - # right now passing in different context will not create a new instance def __init__(self, context: Context): if not hasattr( @@ -192,10 +191,10 @@ def run_scheduler( logger().info("[run_scheduler]Getting the scheduler from SchedulerManager.") try: scheduler_manager = SchedulerManager(context) # type: SchedulerManager - except UnexpectedCaseException as e: + except UnexpectedCaseException: # this is possible in the manual run case # basically another scheduler is already running - logger().info(f"Another scheduler is already running. No need to run.") + logger().info("Another scheduler is already running. 
No need to run.") return False scheduler = scheduler_manager.get_scheduler() diff --git a/src/leettools/eds/scheduler/schemas/job.py b/src/leettools/eds/scheduler/schemas/job.py index 7d16bca..8dc0748 100644 --- a/src/leettools/eds/scheduler/schemas/job.py +++ b/src/leettools/eds/scheduler/schemas/job.py @@ -58,7 +58,6 @@ class JobUpdate(JobInDBBase): # Properties properties stored in DB class JobInDB(JobInDBBase): - created_at: Optional[datetime] = Field(None, description="The creation time.") updated_at: Optional[datetime] = Field(None, description="The update time.") @@ -112,7 +111,6 @@ def set_job_uuid(self, job_uuid: str): # Properties to return to client @add_fieldname_constants class Job(JobInDB): - @classmethod def from_job_in_db(Job, job_in_db: JobInDB) -> "Job": # Note: we need to assign all the required properties and diff --git a/src/leettools/eds/scheduler/schemas/task.py b/src/leettools/eds/scheduler/schemas/task.py index db26a46..a4a20e1 100644 --- a/src/leettools/eds/scheduler/schemas/task.py +++ b/src/leettools/eds/scheduler/schemas/task.py @@ -3,7 +3,7 @@ from dataclasses import dataclass from datetime import datetime from enum import Enum -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Optional from pydantic import BaseModel @@ -104,7 +104,6 @@ def from_task_update(TaskInDB, task_update: TaskUpdate) -> "TaskInDB": @add_fieldname_constants class Task(TaskInDBBase): - @classmethod def get_task_status_descriptions(cls) -> list[TaskStatusDescription]: task_status_descriptions = [ diff --git a/src/leettools/eds/scheduler/task/_impl/duckdb/taskstore_duckdb.py b/src/leettools/eds/scheduler/task/_impl/duckdb/taskstore_duckdb.py index 84884d3..5b2704e 100644 --- a/src/leettools/eds/scheduler/task/_impl/duckdb/taskstore_duckdb.py +++ b/src/leettools/eds/scheduler/task/_impl/duckdb/taskstore_duckdb.py @@ -35,7 +35,7 @@ def __init__(self, settings: SystemSettings) -> None: """ Initialize the DuckDB Taskstore. 
""" - logger().info(f"TaskStoreDuckDB: initializing") + logger().info("TaskStoreDuckDB: initializing") self.settings = settings self.duckdb_client = DuckDBClient(self.settings) self.table_name = self._get_table_name() @@ -79,7 +79,6 @@ def create_task(self, task_create: TaskCreate) -> Task: return self.update_task(task_in_db) def delete_task(self, task_uuid: str) -> bool: - task = self.get_task_by_uuid(task_uuid) if task is None: raise EntityNotFoundException(entity_name=task_uuid, entity_type="Task") @@ -98,7 +97,7 @@ def delete_task(self, task_uuid: str) -> bool: where_clause=f"WHERE {Task.FIELD_TASK_UUID} = ?", ) - from leettools.context_manager import Context, ContextManager + from leettools.context_manager import ContextManager context = ContextManager().get_context() # type: Context job_store = context.get_task_manager().get_jobstore() diff --git a/src/leettools/eds/scheduler/task/jobstore.py b/src/leettools/eds/scheduler/task/jobstore.py index 114425e..8565cf0 100644 --- a/src/leettools/eds/scheduler/task/jobstore.py +++ b/src/leettools/eds/scheduler/task/jobstore.py @@ -8,7 +8,6 @@ class AbstractJobStore(ABC): - @abstractmethod def create_job(self, job_create: JobCreate) -> Optional[Job]: pass diff --git a/src/leettools/eds/scheduler/task/taskstore.py b/src/leettools/eds/scheduler/task/taskstore.py index 5125a1e..24a7ffc 100644 --- a/src/leettools/eds/scheduler/task/taskstore.py +++ b/src/leettools/eds/scheduler/task/taskstore.py @@ -9,7 +9,6 @@ class AbstractTaskStore(ABC): - @abstractmethod def create_task(self, task_create: TaskCreate) -> Task: """ diff --git a/src/leettools/eds/scheduler/task_runner.py b/src/leettools/eds/scheduler/task_runner.py index 776aa24..47a7f0c 100644 --- a/src/leettools/eds/scheduler/task_runner.py +++ b/src/leettools/eds/scheduler/task_runner.py @@ -4,7 +4,6 @@ class AbstractTaskRunner(ABC): - @abstractmethod def run_job(self, job: Job) -> Job: """ diff --git a/src/leettools/eds/scheduler/task_scanner.py 
b/src/leettools/eds/scheduler/task_scanner.py index 662177b..596e80f 100644 --- a/src/leettools/eds/scheduler/task_scanner.py +++ b/src/leettools/eds/scheduler/task_scanner.py @@ -8,7 +8,6 @@ class AbstractTaskScanner(ABC): - @abstractmethod def scan_kb_for_tasks( self, diff --git a/src/leettools/eds/str_embedder/_impl/dense_embedder_local_svc_client.py b/src/leettools/eds/str_embedder/_impl/dense_embedder_local_svc_client.py index 88f4677..fa7e963 100644 --- a/src/leettools/eds/str_embedder/_impl/dense_embedder_local_svc_client.py +++ b/src/leettools/eds/str_embedder/_impl/dense_embedder_local_svc_client.py @@ -16,7 +16,6 @@ DenseEmbeddingRequest, DenseEmbeddings, ) -from leettools.settings import SystemSettings class DenseEmbedderLocalSvcClient(AbstractDenseEmbedder): diff --git a/src/leettools/eds/str_embedder/_impl/dense_embedder_openai.py b/src/leettools/eds/str_embedder/_impl/dense_embedder_openai.py index af26436..c69d0f6 100644 --- a/src/leettools/eds/str_embedder/_impl/dense_embedder_openai.py +++ b/src/leettools/eds/str_embedder/_impl/dense_embedder_openai.py @@ -21,11 +21,9 @@ API_CALL_ENDPOINT_EMBED, UsageAPICallCreate, ) -from leettools.settings import SystemSettings class DenseEmbedderOpenAI(AbstractDenseEmbedder): - def __init__( self, context: Context, @@ -33,7 +31,6 @@ def __init__( kb: Optional[KnowledgeBase] = None, user: Optional[User] = None, ) -> None: - self.org = org self.kb = kb self.user = user @@ -65,7 +62,7 @@ def __init__( else: user_store = self.context.get_user_store() if self.kb is None: - logger().debug(f"No KB specified. Using admin user.") + logger().debug("No KB specified. 
Using admin user.") user = user_store.get_user_by_name(User.ADMIN_USERNAME) else: if self.kb.user_uuid is None: @@ -86,7 +83,6 @@ def __init__( ) def embed(self, embed_requests: DenseEmbeddingRequest) -> DenseEmbeddings: - response = None start_timestamp_in_ms = time_utils.cur_timestamp_in_ms() try: diff --git a/src/leettools/eds/str_embedder/_impl/dense_embedder_qwen.py b/src/leettools/eds/str_embedder/_impl/dense_embedder_qwen.py index 21cda06..bf4e695 100644 --- a/src/leettools/eds/str_embedder/_impl/dense_embedder_qwen.py +++ b/src/leettools/eds/str_embedder/_impl/dense_embedder_qwen.py @@ -6,7 +6,7 @@ from leettools.common.exceptions import ConfigValueException from leettools.common.logging import logger -from leettools.common.utils import config_utils, time_utils +from leettools.common.utils import time_utils from leettools.context_manager import Context from leettools.core.schemas.knowledgebase import KnowledgeBase from leettools.core.schemas.organization import Org @@ -24,7 +24,6 @@ API_CALL_ENDPOINT_EMBED, UsageAPICallCreate, ) -from leettools.settings import SystemSettings EMBEDDER_MODEL_MAPPNG = { "text-embedding-v1": dashscope.TextEmbedding.Models.text_embedding_v1, @@ -33,7 +32,6 @@ class DenseEmbedderQwen(AbstractDenseEmbedder): - def __init__( self, context: Context, @@ -41,7 +39,6 @@ def __init__( kb: Optional[KnowledgeBase] = None, user: Optional[User] = None, ) -> None: - self.org = org self.kb = kb self.user = user diff --git a/src/leettools/eds/str_embedder/_impl/sparse_embedder_splade.py b/src/leettools/eds/str_embedder/_impl/sparse_embedder_splade.py index c3cfa7d..54eccd1 100644 --- a/src/leettools/eds/str_embedder/_impl/sparse_embedder_splade.py +++ b/src/leettools/eds/str_embedder/_impl/sparse_embedder_splade.py @@ -17,7 +17,6 @@ class SparseStrEmbedderSplade(AbstractSparseEmbedder): - def __init__( self, context: Context, diff --git a/src/leettools/eds/str_embedder/_impl/splade_function.py 
b/src/leettools/eds/str_embedder/_impl/splade_function.py index 6191449..c332d74 100644 --- a/src/leettools/eds/str_embedder/_impl/splade_function.py +++ b/src/leettools/eds/str_embedder/_impl/splade_function.py @@ -2,14 +2,14 @@ from leettools.common.logging import logger from leettools.common.singleton_meta import SingletonMeta -from leettools.context_manager import Context, ContextManager +from leettools.context_manager import ContextManager """ Right now the reranker and embedder are using the system-wide settings and shared by all the usres. Only the intention detection, query rewriting, and final inference are using the customizable user settings through the api-provider-config. -One consideration is that the reranker and embedder are too technical to expose to +One consideration is that the reranker and embedder are too technical to expose to the users, and the choices should be determined before querying time. Especially for embedders, we can't switch embedders after the documents are processed. 
""" diff --git a/src/leettools/eds/str_embedder/utils/splade_init.py b/src/leettools/eds/str_embedder/utils/splade_init.py index d97d37d..dddbc34 100644 --- a/src/leettools/eds/str_embedder/utils/splade_init.py +++ b/src/leettools/eds/str_embedder/utils/splade_init.py @@ -10,7 +10,7 @@ os.environ[f"{ENV_VAR_PREFIX}LOG_ROOT"] = "dummy" # put the imports after the dummy environment variables - from leettools.context_manager import Context, ContextManager + from leettools.context_manager import ContextManager from leettools.eds.str_embedder._impl.sparse_embedder_splade import ( SparseStrEmbedderSplade, ) diff --git a/src/leettools/eds/usage/_impl/duckdb/usage_store_duckdb.py b/src/leettools/eds/usage/_impl/duckdb/usage_store_duckdb.py index 8c60f79..284b9b5 100644 --- a/src/leettools/eds/usage/_impl/duckdb/usage_store_duckdb.py +++ b/src/leettools/eds/usage/_impl/duckdb/usage_store_duckdb.py @@ -1,5 +1,4 @@ import uuid -from datetime import datetime from typing import List, Optional from leettools.common.duckdb.duckdb_client import DuckDBClient @@ -24,9 +23,7 @@ class UsageStoreDuckDB(AbstractUsageStore): - def __init__(self, settings: SystemSettings, user_store: AbstractUserStore) -> None: - self.settings = settings self.token_converter = create_token_converter(settings) self.user_store = user_store @@ -108,10 +105,10 @@ def get_usage_summary_by_user( ) value_list = [user_uuid, start_time_in_ms, end_time_in_ms] if limit > 0 and start > 0: - where_clause += f" LIMIT ? OFFSET ?" + where_clause += " LIMIT ? OFFSET ?" value_list += [limit, start] elif start > 0: - where_clause += f" OFFSET ?" + where_clause += " OFFSET ?" value_list += [start] rtn_dicts = self.duckdb_client.fetch_all_from_table( @@ -235,10 +232,10 @@ def get_api_usage_details_by_user( ) value_list = [user_uuid, start_time_in_ms, end_time_in_ms] if limit > 0 and start >= 0: - where_clause += f" LIMIT ? OFFSET ?" + where_clause += " LIMIT ? OFFSET ?" 
value_list += [limit, start] elif start > 0: - where_clause += f" OFFSET ?" + where_clause += " OFFSET ?" value_list += [start] rtn_dicts = self.duckdb_client.fetch_all_from_table( diff --git a/src/leettools/eds/usage/_impl/token_converter_basic.py b/src/leettools/eds/usage/_impl/token_converter_basic.py index c6b7633..6c30046 100644 --- a/src/leettools/eds/usage/_impl/token_converter_basic.py +++ b/src/leettools/eds/usage/_impl/token_converter_basic.py @@ -6,7 +6,6 @@ class TokenConverterBasic(AbstractTokenConverter): - def __init__(self, settings: SystemSettings) -> None: self.settings = settings self.token_map = ModelInfoManager().get_token_map() diff --git a/src/leettools/flow/README.md b/src/leettools/flow/README.md index d944f03..b19574b 100644 --- a/src/leettools/flow/README.md +++ b/src/leettools/flow/README.md @@ -11,7 +11,7 @@ The following are the different components of the flow: - iterator: iterates over a docource, a KB, or a list of them to perform a task. - flow: sequence of steps that performs a task and return a result as the ChatQueryResultCreate data structure. -- executuor: implements the AbstractExcutor interface, mainly the execute_for_query +- executuor: implements the AbstractExcutor interface, mainly the execute_for_query fucntion so that it can be served through the EDS API. # Flow and Strategy @@ -19,13 +19,11 @@ Each flow will be a python program that hooks up different types of steps to per specific task. For each kind of step, it takes a specific kind of configuration (which is the current strategy section) for the task. -The strategy specifies the configuration the steps use, such as the API or model -parameters. Each strategy is separated into different sections, each section serving a +The strategy specifies the configuration the steps use, such as the API or model +parameters. 
Each strategy is separated into different sections, each section serving a specific purpose such as intention detecion, query rewrite, rerank, inference, and etc. For some of the steps, they will multiple sections configuration. For example, the section planning step can use the configuration for the inference step. - - # Prompt templates When defining the prompt templates, there are different types of variables that can be @@ -76,9 +74,9 @@ prompt = template_eval.render_template(prompt_template, template_vars) ``` ## Create a prompt template with variables with f-string format - + In this case, we need to use the f-string format in the template to get variables in the -current context, and some other variables that will be provided at runtime. +current context, and some other variables that will be provided at runtime. - {{{{ lang_instruction }}}} : these will be replaced at runtime using the variables provided by the prompt_util module. diff --git a/src/leettools/flow/flow.py b/src/leettools/flow/flow.py index f505a91..ccd6686 100644 --- a/src/leettools/flow/flow.py +++ b/src/leettools/flow/flow.py @@ -14,7 +14,6 @@ class AbstractFlow(ABC, FlowComponent): - FLOW_TYPE: ClassVar[str] = None ARTICLE_TYPE: ClassVar[str] = None COMPONENT_TYPE: ClassVar[FlowComponentType] = FlowComponentType.FLOW diff --git a/src/leettools/flow/flow_manager.py b/src/leettools/flow/flow_manager.py index e2ea2e3..1de85e1 100644 --- a/src/leettools/flow/flow_manager.py +++ b/src/leettools/flow/flow_manager.py @@ -15,7 +15,6 @@ class FlowManager(metaclass=SingletonMeta): - def __init__(self, settings: SystemSettings): if not hasattr( self, "initialized" @@ -40,7 +39,6 @@ def get_flow_by_type(self, flow_type: str) -> AbstractFlow: return flow_class(context=ContextManager().get_context()) def _scan_all_flows(self): - # the extension path is under the {project_root}/extensions # the current script is under the {project_root}/src/leettools/flow @@ -93,7 +91,6 @@ def _scan_dir_for_flows(self, 
dir_str: str) -> Dict[str, Type[AbstractFlow]]: if __name__ == "__main__": - from leettools.context_manager import ContextManager context = ContextManager().get_context() diff --git a/src/leettools/flow/flow_option_items.py b/src/leettools/flow/flow_option_items.py index 4aeed7b..3d2a472 100644 --- a/src/leettools/flow/flow_option_items.py +++ b/src/leettools/flow/flow_option_items.py @@ -66,9 +66,7 @@ class FlowOptionItem(BaseModel): ) value_type: Optional[str] = Field( "str", - description=_( - "The type of the value," "currently support str, int, float, bool." - ), + description=_("The type of the value,currently support str, int, float, bool."), ) required: Optional[bool] = Field( False, diff --git a/src/leettools/flow/flows/README.md b/src/leettools/flow/flows/README.md index ec2b2ff..fa80f2d 100644 --- a/src/leettools/flow/flows/README.md +++ b/src/leettools/flow/flows/README.md @@ -12,4 +12,3 @@ chat_query_item: ChatQueryItem, display_logger: Optional[EventLogger] = None, And shoudl return a ChatQueryResultCreate object which will be added to the chat result. - diff --git a/src/leettools/flow/flows/answer/flow_answer.py b/src/leettools/flow/flows/answer/flow_answer.py index 4c09002..a417eb5 100644 --- a/src/leettools/flow/flows/answer/flow_answer.py +++ b/src/leettools/flow/flows/answer/flow_answer.py @@ -27,7 +27,6 @@ class FlowAnswer(AbstractFlow): - FLOW_TYPE: ClassVar[str] = FlowType.ANSWER.value ARTICLE_TYPE: ClassVar[str] = ArticleType.CHAT.value COMPONENT_NAME: ClassVar[str] = FlowType.ANSWER.value @@ -41,7 +40,7 @@ def full_description(cls) -> str: return """ Search the web or local KB with the query and answer with source references: - Perform the search with retriever: "local" for local KB, a search engine - (e.g., google) fetches top documents from the web. If no KB is specified, + (e.g., google) fetches top documents from the web. If no KB is specified, create an adhoc KB; otherwise, save and process results in the KB. 
- New web search results are processed by the document pipeline: conversion, chunking, and indexing. @@ -175,12 +174,14 @@ def execute_query( top_ranked_result_segments=top_ranked_result_segments, ) - extended_context, context_token_count, source_items = ( - steps.StepExtendContext.run_step( - exec_info=exec_info, - reranked_result=reranked_result, - accumulated_source_items={}, - ) + ( + extended_context, + context_token_count, + source_items, + ) = steps.StepExtendContext.run_step( + exec_info=exec_info, + reranked_result=reranked_result, + accumulated_source_items={}, ) display_logger.debug( @@ -202,13 +203,14 @@ def execute_query( result_content = completion.choices[0].message.content - answer_content, reorder_cited_source_items = ( - flow_utils.inference_result_to_answer( - result_content=result_content, - source_items=source_items, - reference_style=reference_style, - display_logger=display_logger, - ) + ( + answer_content, + reorder_cited_source_items, + ) = flow_utils.inference_result_to_answer( + result_content=result_content, + source_items=source_items, + reference_style=reference_style, + display_logger=display_logger, ) caic_list = [] diff --git a/src/leettools/flow/flows/digest/flow_digest.py b/src/leettools/flow/flows/digest/flow_digest.py index 2538220..2b2c162 100644 --- a/src/leettools/flow/flows/digest/flow_digest.py +++ b/src/leettools/flow/flows/digest/flow_digest.py @@ -22,7 +22,6 @@ class FlowDigest(AbstractFlow): - FLOW_TYPE: ClassVar[str] = FlowType.DIGEST.value ARTICLE_TYPE: ClassVar[str] = ArticleType.RESEARCH.value COMPONENT_NAME: ClassVar[str] = FlowType.DIGEST.value @@ -37,9 +36,9 @@ def full_description(cls) -> str: When interested in a topic, you can generate a digest article: - Define search keywords and optional content instructions for relevance filtering. - Perform the search with retriever: "local" for local KB, a search engine (e.g., Google) - fetches top documents from the web. 
If no KB is specified, create an adhoc KB; + fetches top documents from the web. If no KB is specified, create an adhoc KB; otherwise, save and process results in the KB. -- New web search results are processed through the document pipeline: conversion, +- New web search results are processed through the document pipeline: conversion, chunking, and indexing. - Each result document is summarized using a LLM API call. - Generate a topic plan for the digest from the document summaries. @@ -75,7 +74,6 @@ def execute_query( chat_query_item: ChatQueryItem, display_logger: Optional[EventLogger] = None, ) -> ChatQueryResultCreate: - # common setup exec_info = ExecInfo( context=self.context, @@ -247,7 +245,7 @@ def execute_query( # now we have the document summaries from either local or web search if document_summaries == "" or document_summaries is None: - display_logger.debug(f"[Digest] Document summaries is empty") + display_logger.debug("[Digest] Document summaries is empty") return flow_utils.create_chat_result_for_empty_search( exec_info=exec_info, query_metadata=None ) diff --git a/src/leettools/flow/flows/dummy/flow_dummy.py b/src/leettools/flow/flows/dummy/flow_dummy.py index e8f979a..2d4db8f 100644 --- a/src/leettools/flow/flows/dummy/flow_dummy.py +++ b/src/leettools/flow/flows/dummy/flow_dummy.py @@ -21,7 +21,6 @@ class FlowDummy(AbstractFlow): - ARTICLE_TYPE: ClassVar[str] = ArticleType.CHAT.value FLOW_TYPE: ClassVar[str] = FlowType.DUMMY.value COMPONENT_NAME: ClassVar[str] = FlowType.DUMMY.value @@ -65,7 +64,6 @@ def execute_query( chat_query_item: ChatQueryItem, display_logger: EventLogger, ) -> ChatQueryResultCreate: - exec_info = ExecInfo( context=self.context, org=org, diff --git a/src/leettools/flow/flows/extract/flow_extract.py b/src/leettools/flow/flows/extract/flow_extract.py index aecab82..93c745e 100644 --- a/src/leettools/flow/flows/extract/flow_extract.py +++ b/src/leettools/flow/flows/extract/flow_extract.py @@ -57,9 +57,9 @@ def 
full_description(cls) -> str: return """ Extra structured data from web or local KB search results: - Perform the search with retriever: "local" for local KB, a search engine (e.g., Google) - fetches top documents from the web. If no KB is specified, create an adhoc KB; + fetches top documents from the web. If no KB is specified, create an adhoc KB; otherwise, save and process results in the KB. -- New web search results are processed through the document pipeline: conversion, +- New web search results are processed through the document pipeline: conversion, chunking, and indexing. - Extract structured data from matched documents based on the specified model. - Display the extracted data as a table in the output. @@ -103,7 +103,6 @@ def depends_on(cls) -> List[Type["FlowComponent"]]: @classmethod def direct_flow_option_items(cls) -> List[FlowOptionItem]: - return AbstractFlow.get_flow_option_items() + [ flow_option_items.FOI_RETRIEVER(explicit=True), flow_option_items.FOI_EXTRACT_PYDANTIC(explicit=True, required=True), @@ -125,7 +124,6 @@ def execute_query( chat_query_item: ChatQueryItem, display_logger: Optional[EventLogger] = None, ) -> ChatQueryResultCreate: - # common setup exec_info = ExecInfo( context=self.context, @@ -281,7 +279,7 @@ def execute_query( if "target_model_name" not in var_dict: if len(type_dict) > 1: err_msgs.append( - f"Specified more than one model but target_model not specfied." + "Specified more than one model but target_model not specfied." 
) target_model_name = None else: @@ -431,7 +429,7 @@ def docsource_filter(_: ExecInfo, docsource: DocSource) -> bool: for o in obj: rows_data.append([str(x) for x in o.model_dump().values()]) else: - self.display_logger.debug(f"obj is empty list") + self.display_logger.debug("obj is empty list") else: rows_data.append([str(x) for x in obj.model_dump().values()]) diff --git a/src/leettools/flow/flows/medium/flow_medium.py b/src/leettools/flow/flows/medium/flow_medium.py index 101e35a..130f067 100644 --- a/src/leettools/flow/flows/medium/flow_medium.py +++ b/src/leettools/flow/flows/medium/flow_medium.py @@ -18,7 +18,7 @@ from leettools.flow.flow import AbstractFlow from leettools.flow.flow_option_items import FlowOptionItem from leettools.flow.flow_type import FlowType -from leettools.flow.flows.medium.prompts import QUERY_PROMPT, SUMMARY_PROMPT +from leettools.flow.flows.medium.prompts import SUMMARY_PROMPT from leettools.flow.schemas.medium_article import MediumArticle MAX_ARTICLE_NUMBER = 10 @@ -63,7 +63,6 @@ def execute_query( chat_query_item: ChatQueryItem, display_logger: EventLogger, ) -> ChatQueryResultCreate: - exec_info = ExecInfo( context=self.context, org=org, @@ -127,7 +126,7 @@ def _generate_prompt( # Start constructing the collected data section collected_data = "" if articles: - collected_data += f"\n\n" + collected_data += "\n\n" for idx, article in enumerate(articles, start=1): if idx > MAX_ARTICLE_NUMBER: display_logger.debug( diff --git a/src/leettools/flow/flows/medium/prompts.py b/src/leettools/flow/flows/medium/prompts.py index 6137c67..1a4894d 100644 --- a/src/leettools/flow/flows/medium/prompts.py +++ b/src/leettools/flow/flows/medium/prompts.py @@ -1,7 +1,7 @@ QUERY_PROMPT = """ -Given the topic description {topic} input by user, generate a search query for google to search +Given the topic description {topic} input by user, generate a search query for google to search for Medium articles about this topic. 
The result should be a query string that can be used to search. -The query string should always in English no matter what language the topic is. +The query string should always in English no matter what language the topic is. Don't include the following words in the return search query: articles @@ -17,7 +17,7 @@ """ SUMMARY_PROMPT = """ -You are an assistant that creates structured writing suggestions for Medium articles in Markdown format. +You are an assistant that creates structured writing suggestions for Medium articles in Markdown format. The following is the topic that a user is interested in: {topic} @@ -25,9 +25,9 @@ Using the collected popular Medium articles, generate a new writing idea based on the topic and the collected data: 1. Generate a blog title that is catchy, engaging, and SEO friendly. 2. Generate a blog outline that is detailed and well-structured. -3. For each section of the outline, provide specific strategies and techniques to enhance reader engagement and differentiate the content from other popular articles on the same topic. +3. For each section of the outline, provide specific strategies and techniques to enhance reader engagement and differentiate the content from other popular articles on the same topic. Include actionable tips on style, unique perspectives, and value-added information that can make each section stand out. -4. Use **MarkDown** format to organize the writing idea. +4. Use **MarkDown** format to organize the writing idea. 
**Output Format:** Provide the output as a organized text with *MarkDown* format by following the structure below: diff --git a/src/leettools/flow/flows/news/flow_news.py b/src/leettools/flow/flows/news/flow_news.py index 36064e4..fe4db56 100644 --- a/src/leettools/flow/flows/news/flow_news.py +++ b/src/leettools/flow/flows/news/flow_news.py @@ -92,7 +92,7 @@ def short_description(cls) -> str: @classmethod def full_description(cls) -> str: return """ -This flow generates a list of news items from the updated items in the KB: +This flow generates a list of news items from the updated items in the KB: 1. check the KB for recently updated documents and find news items in them. 2. combine all the similar items into one. 3. remove items that have been reported before. @@ -100,9 +100,7 @@ def full_description(cls) -> str: 5. generate a list of news items with references. """ - default_news_instructions: ClassVar[ - str - ] = """ + default_news_instructions: ClassVar[str] = """ Please find the news items in the context about {{ query }} and return - The title of the news item - The detailed description of the news in the style of {{ article_style }}, up to {{ word_count }} words @@ -268,7 +266,6 @@ def _get_news_params(self, exec_info: ExecInfo) -> _NewsParams: def _llm_dedupe( self, exec_info: ExecInfo, news_params: _NewsParams, news_results_md: str ) -> List[CombinedNewsItems]: - display_logger = exec_info.display_logger query = exec_info.query word_count = news_params.word_count @@ -455,7 +452,6 @@ def execute_query( chat_query_item: ChatQueryItem, display_logger: Optional[EventLogger] = None, ) -> ChatQueryResultCreate: - # common setup exec_info = ExecInfo( context=self.context, @@ -641,7 +637,7 @@ def document_filter(_: ExecInfo, document: Document) -> bool: ) combined_news_store.save_records(final_news_items, metadata={}) - display_logger.info(f"Generating results for the final answer.") + display_logger.info("Generating results for the final answer.") if 
len(final_news_items) == 0: return flow_utils.create_chat_result_with_manual_msg( diff --git a/src/leettools/flow/flows/opinions/flow_opinions.py b/src/leettools/flow/flows/opinions/flow_opinions.py index 5fc4af7..953c4aa 100644 --- a/src/leettools/flow/flows/opinions/flow_opinions.py +++ b/src/leettools/flow/flows/opinions/flow_opinions.py @@ -124,7 +124,6 @@ def dedupe_items( dedupe_step: Dict[str, str], display_logger: EventLogger, ) -> List[type]: - input_md_table = flow_utils.to_markdown_table(input_items, skip_fields=skip_fields) model_class = type_dict[target_model_name] @@ -156,11 +155,11 @@ def dedupe_items( ) if hasattr(message, "parsed"): - display_logger.debug(f"Returning list of objects using message.parsed.") + display_logger.debug("Returning list of objects using message.parsed.") extract_result = message.parsed return extract_result.items else: - display_logger.debug(f"Returning list of objects using model_validate_json.") + display_logger.debug("Returning list of objects using model_validate_json.") response_str = json_utils.ensure_json_item_list(response_str) try: items = response_pydantic_model.model_validate_json(response_str) @@ -198,18 +197,14 @@ def full_description(cls) -> str: FLOW_OPTION_OPINIONS_INSTRUCTION: ClassVar[str] = "opinions_instruction" - default_opinions_instructions: ClassVar[ - str - ] = """ + default_opinions_instructions: ClassVar[str] = """ Please find the opinions about {{ query }} in the context and return - The keywords about the opinion - The description of the opinion - the sentiment of the opinion (positive, negative, neutral) """ - default_facts_instructions: ClassVar[ - str - ] = """ + default_facts_instructions: ClassVar[str] = """ Please list interesting facts about {{ query }} in the context and return - The keywords about the fact - The description of the fact @@ -282,7 +277,6 @@ def execute_query( chat_query_item: ChatQueryItem, display_logger: Optional[EventLogger] = None, ) -> ChatQueryResultCreate: - # 
common setup exec_info = ExecInfo( context=self.context, @@ -431,14 +425,14 @@ def execute_query( opinion_dedupe_step = { "system_prompt_template": "You are an expert of deduplicate items.", "user_prompt_template": f""" -Given the following {item_type}s in a table where the left most column is the description, -the second column is the key words of the {item_type}, and the the right most column is +Given the following {item_type}s in a table where the left most column is the description, +the second column is the key words of the {item_type}, and the the right most column is the source url of the {item_type}: {{{{ results }}}} Please combine {item_type}s with similar descriptions and key words into one {item_type}, -limit the length of the combined description to 100 words, adding all source urls in a +limit the length of the combined description to 100 words, adding all source urls in a list for the combined {item_type}, and return the combine {item_type}s as the schema provided. """, } @@ -553,8 +547,8 @@ def execute_query( {{ results }} -Please combine facts with similar descriptions and key words into one fact, limit the -length of the combined description to 100 words, adding all source urls in a list for +Please combine facts with similar descriptions and key words into one fact, limit the +length of the combined description to 100 words, adding all source urls in a list for the combined fact, and return the combine facts as the schema provided. 
""", } diff --git a/src/leettools/flow/flows/post/flow_post.py b/src/leettools/flow/flows/post/flow_post.py index d591bda..cdeb598 100644 --- a/src/leettools/flow/flows/post/flow_post.py +++ b/src/leettools/flow/flows/post/flow_post.py @@ -24,7 +24,6 @@ def _section_plan_for_posts(query: str, search_phrases: str) -> ArticleSectionPlan: - user_prompt_template = FlowPosts.used_prompt_templates()[ FlowPosts.COMPONENT_NAME ].prompt_template @@ -41,7 +40,7 @@ def _section_plan_for_posts(query: str, search_phrases: str) -> ArticleSectionPl title=query, search_query=search_phrases + " " + query, system_prompt_template=""" - You are an expert news writer, you can write a brief news report about the topic + You are an expert news writer, you can write a brief news report about the topic using the provided context and the specified style shown in the example. """, user_prompt_template=user_prompt_template, @@ -67,7 +66,7 @@ def short_description(cls) -> str: def full_description(cls) -> str: return """ Specify the topic of the post, -- Specify the number of days to search for the content (right now only Google search is +- Specify the number of days to search for the content (right now only Google search is supported for this option); - Crawl the web with the keywords in the topic and save the top documents to the KB; - Summarize the saved documents; @@ -96,7 +95,7 @@ def used_prompt_templates(cls) -> Dict[str, PromptBase]: {{ style_instruction }} {{ word_count_instruction }} {{ ouput_example }} - + Here is the query: {{ query }} Here is the context: {{ context }} """ @@ -176,11 +175,13 @@ def execute_query( docsource=docsource, ) - document_summaries, all_docs, all_keywords = ( - flow_utils.get_doc_summaries_for_docsource( - docsource=docsource, - exec_info=exec_info, - ) + ( + document_summaries, + all_docs, + all_keywords, + ) = flow_utils.get_doc_summaries_for_docsource( + docsource=docsource, + exec_info=exec_info, ) sections: List[ArticleSection] = [] diff --git 
a/src/leettools/flow/flows/search/flow_search.py b/src/leettools/flow/flows/search/flow_search.py index cf4879a..b25a750 100644 --- a/src/leettools/flow/flows/search/flow_search.py +++ b/src/leettools/flow/flows/search/flow_search.py @@ -26,7 +26,6 @@ class FlowSearch(AbstractFlow): - FLOW_TYPE: ClassVar[str] = FlowType.SEARCH.value ARTICLE_TYPE: ClassVar[str] = ArticleType.SEARCH.value COMPONENT_NAME: ClassVar[str] = FlowType.SEARCH.value @@ -40,11 +39,11 @@ def full_description(cls) -> str: return """ Return top segements that match the query with links to the original documents. - Perform the search with retriever: "local" for local KB, a search engine (e.g., Google) - fetches top documents from the web. If no KB is specified, create an adhoc KB; + fetches top documents from the web. If no KB is specified, create an adhoc KB; otherwise, save and process results in the KB. -- New web search results are processed through the document pipeline: conversion, +- New web search results are processed through the document pipeline: conversion, chunking, and indexing. -- Now the query is executed on the local KB using hybrid search, e.g., full text and +- Now the query is executed on the local KB using hybrid search, e.g., full text and vector; - The top matched segments, with the ranking score and the original document links; - Right now SPLADE and Vector Cosine similarity are used in the hybried search. 
diff --git a/src/leettools/flow/iterator.py b/src/leettools/flow/iterator.py index 4946ecb..2f12389 100644 --- a/src/leettools/flow/iterator.py +++ b/src/leettools/flow/iterator.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Any, ClassVar +from typing import Any from leettools.flow.exec_info import ExecInfo from leettools.flow.flow_component import FlowComponent diff --git a/src/leettools/flow/iterators/extract_kb.py b/src/leettools/flow/iterators/extract_kb.py index 8c0124a..640aaad 100644 --- a/src/leettools/flow/iterators/extract_kb.py +++ b/src/leettools/flow/iterators/extract_kb.py @@ -20,7 +20,6 @@ class ExtractKB(AbstractIterator): - from typing import ClassVar from leettools.flow.flow_component import FlowComponent @@ -36,7 +35,7 @@ def short_description(cls) -> str: def full_description(cls) -> str: return """ Given a pydantic model, extract structured information from the documents. If specified -to use a backend store, existing data will be checked and returned if exists and the +to use a backend store, existing data will be checked and returned if exists and the newly extracted data will be saved to the backend storage. """ diff --git a/src/leettools/flow/iterators/extract_search.py b/src/leettools/flow/iterators/extract_search.py index bccce84..c91cd08 100644 --- a/src/leettools/flow/iterators/extract_search.py +++ b/src/leettools/flow/iterators/extract_search.py @@ -17,7 +17,6 @@ class ExtractSearch(AbstractIterator): - from typing import ClassVar from leettools.flow.flow_component import FlowComponent @@ -180,6 +179,6 @@ def run( continue display_logger.info( - f"Finished extracting information from local search result." + "Finished extracting information from local search result." 
) return new_objs, existing_objs diff --git a/src/leettools/flow/iterators/summarize.py b/src/leettools/flow/iterators/summarize.py index 9636a6c..bb1754d 100644 --- a/src/leettools/flow/iterators/summarize.py +++ b/src/leettools/flow/iterators/summarize.py @@ -11,7 +11,6 @@ class Summarize(AbstractIterator): - from typing import ClassVar from leettools.flow.flow_component import FlowComponent diff --git a/src/leettools/flow/step.py b/src/leettools/flow/step.py index de03ac1..b0b3c8b 100644 --- a/src/leettools/flow/step.py +++ b/src/leettools/flow/step.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Any, ClassVar +from typing import Any from leettools.flow.exec_info import ExecInfo from leettools.flow.flow_component import FlowComponent diff --git a/src/leettools/flow/steps/README.md b/src/leettools/flow/steps/README.md index c4455b9..01c943b 100644 --- a/src/leettools/flow/steps/README.md +++ b/src/leettools/flow/steps/README.md @@ -13,8 +13,6 @@ - [Class Name: `StepScrapeUrlsToDocSource`](#class-name-stepscrapeurlstodocsource) - [Class Name: `StepSearchMedium`](#class-name-stepsearchmedium) - [Class Name: `StepSearchToDocsource`](#class-name-stepsearchtodocsource) - - # Class Name: `StepExtendContext` - **Component Name**: `"extend_context"` - **Dependencies**: None @@ -154,7 +152,7 @@ - If no strategy is provided or if it is disabled, it uses the default intention. - Uses the strategy to determine the intention of the query. - Returns the intention as part of the `ChatQueryMetadata`. 
- + # Class Name: `StepLocalKBSearch` - **Component Name**: `"local_kb_search"` - **Dependencies**: None diff --git a/src/leettools/flow/steps/step_extend_context.py b/src/leettools/flow/steps/step_extend_context.py index 6605ec2..fc913cb 100644 --- a/src/leettools/flow/steps/step_extend_context.py +++ b/src/leettools/flow/steps/step_extend_context.py @@ -18,7 +18,6 @@ class StepExtendContext(AbstractStep): - COMPONENT_NAME: ClassVar[str] = "extend_context" @classmethod diff --git a/src/leettools/flow/steps/step_extract_info.py b/src/leettools/flow/steps/step_extract_info.py index 60655d0..cb42836 100644 --- a/src/leettools/flow/steps/step_extract_info.py +++ b/src/leettools/flow/steps/step_extract_info.py @@ -20,7 +20,6 @@ class StepExtractInfo(AbstractStep): - COMPONENT_NAME: ClassVar[str] = "extract_info" @classmethod @@ -29,8 +28,8 @@ def short_description(cls) -> str: @classmethod def full_description(cls) -> str: - return """Extract information from the document. The function will always -return a list of the model class. If the instruction says to only extract one object, + return """Extract information from the document. The function will always +return a list of the model class. If the instruction says to only extract one object, the caller should take the first object from the list. 
""" @@ -39,7 +38,7 @@ def used_prompt_templates(cls) -> Dict[str, PromptBase]: # See [src/leettools/flow/README.md] for how to use template varaibles extract_info_template_str = """ Given the provided content, please follow the instructions and return the results -{{ lang_instruction }}: +{{ lang_instruction }}: {{ extraction_instructions }} Below is the provided content: @@ -172,7 +171,7 @@ def run_step( ) if hasattr(message, "parsed"): - display_logger.debug(f"Returning list of objects using message.parsed.") + display_logger.debug("Returning list of objects using message.parsed.") extract_result = message.parsed if multiple_items: return extract_result.items @@ -180,7 +179,7 @@ def run_step( return [extract_result] else: display_logger.debug( - f"Returning list of objects using model_validate_json." + "Returning list of objects using model_validate_json." ) response_str = json_utils.ensure_json_item_list(response_str) try: diff --git a/src/leettools/flow/steps/step_gen_intro.py b/src/leettools/flow/steps/step_gen_intro.py index a7f6a00..090c4e8 100644 --- a/src/leettools/flow/steps/step_gen_intro.py +++ b/src/leettools/flow/steps/step_gen_intro.py @@ -12,7 +12,6 @@ class StepGenIntro(AbstractStep): - COMPONENT_NAME: ClassVar[str] = "gen_intro" @classmethod @@ -54,7 +53,6 @@ def _step_gen_intro_section( content: str, query_metadata: Optional[ChatQueryMetadata] = None, ) -> ArticleSection: - display_logger = exec_info.display_logger display_logger.info("[Status] Generating introduction.") @@ -90,11 +88,11 @@ def _step_gen_intro_section( user_prompt_template = f""" {{{{ context_presentation }}}}, please generate an introduction section for a research report -about { query } {{{{ lang_instruction }}}} -{ content_instruction } +about {query} {{{{ lang_instruction }}}} +{content_instruction} Return the result as a string, do not include the title in the result. 
- + Here is the query: {{{{ rewritten_query }}}} diff --git a/src/leettools/flow/steps/step_gen_search_phrases.py b/src/leettools/flow/steps/step_gen_search_phrases.py index b806edc..784b66b 100644 --- a/src/leettools/flow/steps/step_gen_search_phrases.py +++ b/src/leettools/flow/steps/step_gen_search_phrases.py @@ -19,7 +19,6 @@ class StepGenSearchPhrases(AbstractStep): - COMPONENT_NAME: ClassVar[str] = "gen_search_phrases" @classmethod @@ -36,12 +35,12 @@ def full_description(cls) -> str: def used_prompt_templates(cls) -> Dict[str, PromptBase]: # See [src/leettools/flow/README.md] for how to use template varaibles search_phrase_template_str = """ -Given the following query, create a web search query +Given the following query, create a web search query {{ lang_instruction }} that will return most relavant information about the query from the the web search engine. Return the result as a string without quotes, do not include the title in the result. - + Here is the query: {{ query }} """ diff --git a/src/leettools/flow/steps/step_gen_section.py b/src/leettools/flow/steps/step_gen_section.py index c1b9c65..b08673a 100644 --- a/src/leettools/flow/steps/step_gen_section.py +++ b/src/leettools/flow/steps/step_gen_section.py @@ -15,7 +15,6 @@ class StepGenSection(AbstractStep): - COMPONENT_NAME: ClassVar[str] = "gen_section" @classmethod @@ -25,7 +24,7 @@ def short_description(cls) -> str: @classmethod def full_description(cls) -> str: return """Based on the article section plan, search the local KB for related -information and generate the section following the instructions in the plan and the +information and generate the section following the instructions in the plan and the options set in the query such as style, words, language, etc. 
""" diff --git a/src/leettools/flow/steps/step_inference.py b/src/leettools/flow/steps/step_inference.py index 4f605f8..5ad89dc 100644 --- a/src/leettools/flow/steps/step_inference.py +++ b/src/leettools/flow/steps/step_inference.py @@ -15,7 +15,6 @@ class StepInference(AbstractStep): - COMPONENT_NAME: ClassVar[str] = "inference" @classmethod diff --git a/src/leettools/flow/steps/step_intention.py b/src/leettools/flow/steps/step_intention.py index 1f040ee..a3a2342 100644 --- a/src/leettools/flow/steps/step_intention.py +++ b/src/leettools/flow/steps/step_intention.py @@ -4,7 +4,6 @@ DEFAULT_INTENTION, ChatQueryMetadata, ) -from leettools.core.strategy.schemas.strategy_section import StrategySection from leettools.core.strategy.schemas.strategy_section_name import StrategySectionName from leettools.eds.rag.intention.intention_getter import ( get_intention_getter_by_strategy, @@ -16,7 +15,6 @@ class StepIntention(AbstractStep): - COMPONENT_NAME: ClassVar[str] = "intention" @classmethod @@ -35,7 +33,6 @@ def run_step(exec_info: ExecInfo) -> ChatQueryMetadata: def _run_intention( exec_info: ExecInfo, ) -> ChatQueryMetadata: - context = exec_info.context display_logger = exec_info.display_logger user = exec_info.user diff --git a/src/leettools/flow/steps/step_local_kb_search.py b/src/leettools/flow/steps/step_local_kb_search.py index bdd646a..2b4422d 100644 --- a/src/leettools/flow/steps/step_local_kb_search.py +++ b/src/leettools/flow/steps/step_local_kb_search.py @@ -11,7 +11,6 @@ class StepLocalKBSearch(AbstractStep): - COMPONENT_NAME: ClassVar[str] = "local_kb_search" @classmethod diff --git a/src/leettools/flow/steps/step_plan_topic.py b/src/leettools/flow/steps/step_plan_topic.py index 103e54a..d371cf1 100644 --- a/src/leettools/flow/steps/step_plan_topic.py +++ b/src/leettools/flow/steps/step_plan_topic.py @@ -21,7 +21,6 @@ class StepPlanTopic(AbstractStep): - COMPONENT_NAME: ClassVar[str] = "plan_topic" @classmethod @@ -31,14 +30,14 @@ def short_description(cls) 
-> str: @classmethod def full_description(cls) -> str: return """Read the content provided, usually a list of summaries of the related -documents, and generate a list of topics that are discussed in theese documents and +documents, and generate a list of topics that are discussed in theese documents and the instructions to write detailed sections about these topics. """ @classmethod def used_prompt_templates(cls) -> Dict[str, PromptBase]: topic_plan_template_str = """ -{{ context_presentation }}, {{ num_of_section_instruction }} +{{ context_presentation }}, {{ num_of_section_instruction }} from the content as the outline for {{ article_style }} for this subject: {{ query }} @@ -48,7 +47,7 @@ def used_prompt_templates(cls) -> Dict[str, PromptBase]: Please create the title for each topic {{ output_lang_instruction }} For each topic, also generate a prompt {{ search_lang_instruction }} that can guide the LLM to find the most relevant information and write a detailed section -about it. +about it. 
{{ json_format_instruction }} @@ -276,7 +275,7 @@ def _parse_topic_list( try: rtn_obj = json.loads(response_str) - except Exception as e: + except Exception: raise exceptions.LLMInferenceResultException( f"Model response is not a valid JSON: {response_str}" ) diff --git a/src/leettools/flow/steps/step_query_rewrite.py b/src/leettools/flow/steps/step_query_rewrite.py index 0ec1448..500f691 100644 --- a/src/leettools/flow/steps/step_query_rewrite.py +++ b/src/leettools/flow/steps/step_query_rewrite.py @@ -12,7 +12,6 @@ class StepQueryRewrite(AbstractStep): - COMPONENT_NAME: ClassVar[str] = "query_rewrite" @classmethod diff --git a/src/leettools/flow/steps/step_rerank.py b/src/leettools/flow/steps/step_rerank.py index 812c36a..256ab60 100644 --- a/src/leettools/flow/steps/step_rerank.py +++ b/src/leettools/flow/steps/step_rerank.py @@ -12,7 +12,6 @@ class StepRerank(AbstractStep): - COMPONENT_NAME: ClassVar[str] = "rerank" @classmethod @@ -38,7 +37,6 @@ def _run_rerank( exec_info: ExecInfo, top_ranked_result_segments: List[SearchResultSegment], ) -> List[SearchResultSegment]: - context = exec_info.context settings = exec_info.settings display_logger = exec_info.display_logger @@ -58,7 +56,7 @@ def _run_rerank( or rerank_section.strategy_name == "" or rerank_section.strategy_name == "disabled" ): - display_logger.info(f"Rerank is disabled. Skip reranking.") + display_logger.info("Rerank is disabled. 
Skip reranking.") return top_ranked_result_segments display_logger.info("[Status] Rerank the search results.") diff --git a/src/leettools/flow/steps/step_scrape_urls.py b/src/leettools/flow/steps/step_scrape_urls.py index 95181f6..4b7864a 100644 --- a/src/leettools/flow/steps/step_scrape_urls.py +++ b/src/leettools/flow/steps/step_scrape_urls.py @@ -13,7 +13,6 @@ class StepScrpaeUrlsToDocSource(AbstractStep): - COMPONENT_NAME: ClassVar[str] = "scrape_urls_to_docsource" @classmethod @@ -111,7 +110,7 @@ def run_step( ) continue successful_documents[document.original_uri] = document - except Exception as e: + except Exception: pass else: diff --git a/src/leettools/flow/steps/step_search_medium.py b/src/leettools/flow/steps/step_search_medium.py index c89bd0a..5456c53 100644 --- a/src/leettools/flow/steps/step_search_medium.py +++ b/src/leettools/flow/steps/step_search_medium.py @@ -22,7 +22,6 @@ class StepSearchMedium(AbstractStep): - COMPONENT_NAME: ClassVar[str] = "search_medium" @classmethod @@ -80,8 +79,7 @@ def run_step( display_logger.info("[Status] Start the medium search pipeline ...") medium_articles = _run_medium_search_pipeline(exec_info, search_keywords) display_logger.info( - f"Successfully find {len(medium_articles)} " - "Medium.com articles from search." + f"Successfully find {len(medium_articles)} Medium.com articles from search." 
) return medium_articles @@ -210,5 +208,5 @@ def _get_page_content(url: str) -> Optional[str]: response = requests.get(url, headers=headers) response.raise_for_status() # raise an HTTPError for bad responses return response.text - except requests.RequestException as e: + except requests.RequestException: return None diff --git a/src/leettools/flow/steps/step_search_to_docsource.py b/src/leettools/flow/steps/step_search_to_docsource.py index ed7dad8..af85b8a 100644 --- a/src/leettools/flow/steps/step_search_to_docsource.py +++ b/src/leettools/flow/steps/step_search_to_docsource.py @@ -21,7 +21,6 @@ class StepSearchToDocsource(AbstractStep): - COMPONENT_NAME: ClassVar[str] = "search_to_docsource" @classmethod @@ -30,15 +29,15 @@ def short_description(cls) -> str: @classmethod def full_description(cls) -> str: - return """Create a document source with web search. + return """Create a document source with web search. -For knowledge base that has auto_schedule set to True, if a scheduler is running, the +For knowledge base that has auto_schedule set to True, if a scheduler is running, the document source will be scheduled for processing, otherwise, the scheduler will bestarted to process the document source. The actual web searcher will be started by the scheduler -using the config in the exec_info. This function will wait for the document source to +using the config in the exec_info. This function will wait for the document source to finish processing or timeout (currently hardcoded at 10 minutes). -For knowledge base that has auto_schedule set to False, the document source will be +For knowledge base that has auto_schedule set to False, the document source will be processed immediately. The function will return after the document source is processed. 
""" @@ -173,7 +172,6 @@ def run_step( def _run_web_search_pipeline( exec_info: ExecInfo, docsource: DocSource, search_keywords: str ) -> List[Document]: - # this is basically the logic from the scheduler context = exec_info.context display_logger = exec_info.display_logger @@ -217,7 +215,6 @@ def _create_docsrc_for_search( search_keywords: str, schedule_config: ScheduleConfig, ) -> DocSource: - context = exec_info.context org = exec_info.org kb = exec_info.kb diff --git a/src/leettools/flow/steps/step_summarize.py b/src/leettools/flow/steps/step_summarize.py index 47e167f..6a57aef 100644 --- a/src/leettools/flow/steps/step_summarize.py +++ b/src/leettools/flow/steps/step_summarize.py @@ -20,7 +20,6 @@ class StepSummarize(AbstractStep): - COMPONENT_NAME: ClassVar[str] = "summarize" @classmethod @@ -29,7 +28,7 @@ def short_description(cls) -> str: @classmethod def full_description(cls) -> str: - return """Given a document content as a string, summarize the content using + return """Given a document content as a string, summarize the content using the model specified as the summarization model in the flow option. """ @@ -41,10 +40,10 @@ def used_prompt_templates(cls) -> Dict[str, PromptBase]: {{ content_instruction }} please do the following tasks: - write a concise summary of the document less than 100 words {{ lang_instruction }}, -- get up to 10 keywords that the document is about {{ lang_instruction }}, +- get up to 10 keywords that the document is about {{ lang_instruction }}, - find up to 10 URL links in the document - get the authors of the document if possible -- if there is an explicit publishing date in the document, get the content_date for document. +- if there is an explicit publishing date in the document, get the content_date for document. - generate a relevance score between 1 and 100, 100 means 100% fit the content instruction above. 
{{ json_format_instruction }} { @@ -246,7 +245,7 @@ def _summarize_content(exec_info: ExecInfo, content: str) -> DocumentSummary: try: doc_summary = DocumentSummary.model_validate_json(response_str) - except Exception as e: + except Exception: display_logger.error( f"ModelValidating DocumentSummary failed: {response_str}" ) diff --git a/src/leettools/flow/steps/step_vectdb_search.py b/src/leettools/flow/steps/step_vectdb_search.py index d9211ad..65df421 100644 --- a/src/leettools/flow/steps/step_vectdb_search.py +++ b/src/leettools/flow/steps/step_vectdb_search.py @@ -1,7 +1,5 @@ -import os from typing import ClassVar, List, Type -from leettools.common import exceptions from leettools.common.utils import config_utils from leettools.core.schemas.chat_query_metadata import ChatQueryMetadata from leettools.core.schemas.docsink import DocSink @@ -24,7 +22,6 @@ class StepVectorSearch(AbstractStep): - COMPONENT_NAME: ClassVar[str] = "vector_search" @classmethod diff --git a/src/leettools/flow/subflow.py b/src/leettools/flow/subflow.py index 709a9b2..89c9664 100644 --- a/src/leettools/flow/subflow.py +++ b/src/leettools/flow/subflow.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Any, ClassVar +from typing import Any from leettools.flow.exec_info import ExecInfo from leettools.flow.flow_component import FlowComponent diff --git a/src/leettools/flow/subflows/subflow_gen_essay.py b/src/leettools/flow/subflows/subflow_gen_essay.py index 27accf5..a3a589f 100644 --- a/src/leettools/flow/subflows/subflow_gen_essay.py +++ b/src/leettools/flow/subflows/subflow_gen_essay.py @@ -128,12 +128,12 @@ def _section_plan_for_research(topic: TopicSpec, query: str): title=topic.title, search_query=query + " " + topic.title, system_prompt_template=""" -You are an expert research writer, you can write a detailed section about the topic +You are an expert research writer, you can write a detailed section about the topic using the provided context and the specified 
style shown in the example. """, user_prompt_template=f""" -{{{{ context_presentation }}}} please write the section {{{{ lang_instruction }}}} -following the instructions below. +{{{{ context_presentation }}}} please write the section {{{{ lang_instruction }}}} +following the instructions below. {topic.prompt} {{{{ reference_instruction }}}} @@ -214,7 +214,6 @@ def report_for_docsource( docsource_uuid: str, log_level: str, ) -> None: - EventLogger.set_global_default_level(log_level.upper()) from leettools.context_manager import ContextManager @@ -251,11 +250,13 @@ def report_for_docsource( entity_name=docsource_uuid, entity_type="DocSource" ) - document_summaries, all_docs, all_keywords = ( - flow_utils.get_doc_summaries_for_docsource( - docsource=docsource, - exec_info=exec_info, - ) + ( + document_summaries, + all_docs, + all_keywords, + ) = flow_utils.get_doc_summaries_for_docsource( + docsource=docsource, + exec_info=exec_info, ) chat_query_result_create = SubflowGenEssay.run_subflow( diff --git a/src/leettools/flow/subflows/subflow_gen_section.py b/src/leettools/flow/subflows/subflow_gen_section.py index 315a3bc..1b42775 100644 --- a/src/leettools/flow/subflows/subflow_gen_section.py +++ b/src/leettools/flow/subflows/subflow_gen_section.py @@ -3,9 +3,6 @@ from leettools.common.utils import config_utils from leettools.core.consts import flow_option from leettools.core.schemas.chat_query_result import SourceItem -from leettools.core.strategy.schemas.strategy_display_settings import ( - StrategySectionName, -) from leettools.flow import steps from leettools.flow.exec_info import ExecInfo from leettools.flow.flow_component import FlowComponent @@ -100,13 +97,15 @@ def run_subflow( ) display_logger.info(f"Using {section_model} to compute the context.") - extended_context, context_token_count, section_source_items = ( - steps.StepExtendContext.run_step( - exec_info=exec_info, - reranked_result=reranked_result, - accumulated_source_items=accumulated_source_items, - 
override_model_name=section_model, - ) + ( + extended_context, + context_token_count, + section_source_items, + ) = steps.StepExtendContext.run_step( + exec_info=exec_info, + reranked_result=reranked_result, + accumulated_source_items=accumulated_source_items, + override_model_name=section_model, ) display_logger.info( diff --git a/src/leettools/flow/subflows/subflow_web_answer.py b/src/leettools/flow/subflows/subflow_web_answer.py index c8364b5..5dcaec4 100644 --- a/src/leettools/flow/subflows/subflow_web_answer.py +++ b/src/leettools/flow/subflows/subflow_web_answer.py @@ -89,7 +89,6 @@ def _subflow_answer_with_web_search( multi_items: bool = False, save_to_db: bool = True, ) -> List[TypeVar_BaseModel]: - display_logger = exec_info.display_logger strategy = exec_info.strategy accumulated_source_items: Dict[str, SourceItem] = {} @@ -113,12 +112,14 @@ def _subflow_answer_with_web_search( display_logger.warning("No top ranked result found.") return [] - extended_context, context_token_count, source_items = ( - steps.StepExtendContext.run_step( - exec_info=exec_info, - reranked_result=top_ranked_result_segments, - accumulated_source_items=accumulated_source_items, - ) + ( + extended_context, + context_token_count, + source_items, + ) = steps.StepExtendContext.run_step( + exec_info=exec_info, + reranked_result=top_ranked_result_segments, + accumulated_source_items=accumulated_source_items, ) extracted_obj_list = steps.StepExtractInfo.run_step( @@ -196,7 +197,6 @@ def get_answer_from_web( username: str, log_level: str, ) -> None: - EventLogger.set_global_default_level(log_level.upper()) from leettools.context_manager import ContextManager diff --git a/src/leettools/flow/utils/flow_utils.py b/src/leettools/flow/utils/flow_utils.py index 789ffe7..efa36dc 100644 --- a/src/leettools/flow/utils/flow_utils.py +++ b/src/leettools/flow/utils/flow_utils.py @@ -54,10 +54,10 @@ def _replace_think_section_in_result(content: str, display_logger: EventLogger) # Extract the think 
section content think_content = content[7:end_tag_pos] # Replace with HTML comment version - content = f"{content[end_tag_pos+8:]}" - display_logger.debug(f"Replaced think section in content.") + content = f"{content[end_tag_pos + 8 :]}" + display_logger.debug("Replaced think section in content.") else: - display_logger.debug(f"No think section found in content.") + display_logger.debug("No think section found in content.") return content @@ -149,7 +149,7 @@ def get_search_lang( logger().info(f"Using search language specified in exec_info: {lang}") return normalize_lang_name(exec_info.output_lang) - logger().info(f"No language specified for search.") + logger().info("No language specified for search.") return None diff --git a/src/leettools/flow/utils/pipeline_utils.py b/src/leettools/flow/utils/pipeline_utils.py index 1c35cc0..8318485 100644 --- a/src/leettools/flow/utils/pipeline_utils.py +++ b/src/leettools/flow/utils/pipeline_utils.py @@ -1,5 +1,4 @@ from concurrent.futures import ThreadPoolExecutor -from datetime import datetime from functools import partial from pathlib import Path from typing import List diff --git a/src/leettools/flow/utils/prompt_utils.py b/src/leettools/flow/utils/prompt_utils.py index 7d6c5fc..6f3ee02 100644 --- a/src/leettools/flow/utils/prompt_utils.py +++ b/src/leettools/flow/utils/prompt_utils.py @@ -1,4 +1,4 @@ -from datetime import datetime, timezone +from datetime import datetime from typing import Any, Dict, List, Optional from leettools.common.logging import logger @@ -110,7 +110,7 @@ def get_template_vars( def context_presentation() -> str: - return """Given the context as a sequence of references with a reference id in the + return """Given the context as a sequence of references with a reference id in the format of a leading [x],""" @@ -126,8 +126,8 @@ def date_instruction(timezone: Optional[str] = None) -> str: def reference_instruction() -> str: return """ -In the answer, use format [1], [2], ..., [n] in line where the 
reference is used. -For example, "According to the research from Google[3], ...". +In the answer, use format [1], [2], ..., [n] in line where the reference is used. +For example, "According to the research from Google[3], ...". DO NOT add References section at the end of the output. """ @@ -141,10 +141,9 @@ def lang_instruction(lang: Optional[str] = None) -> str: def json_format_instruction() -> str: - # TODO: change all steps using this instruction to use the new API parameter return """ -Return the result in the following JSON format, ensuring the output is formatted as +Return the result in the following JSON format, ensuring the output is formatted as JSON data, and not in a JSON block: """ diff --git a/src/leettools/settings.py b/src/leettools/settings.py index f1e629c..fe79066 100644 --- a/src/leettools/settings.py +++ b/src/leettools/settings.py @@ -604,7 +604,6 @@ def initialize( logger().debug(f"Checking env variable: {env_var_name}, value is {env_var}") if env_var is not None: - field_info = self.model_fields[field_name] # set the values based on the field type if field_info.annotation == bool: @@ -625,17 +624,17 @@ def initialize( if self.is_production: logger().info( - f"Current system is running in production mode per EDS_IS_PRODUCTION." + "Current system is running in production mode per EDS_IS_PRODUCTION." ) else: - logger().info(f"Current system is running in development mode.") + logger().info("Current system is running in development mode.") if self.SINGLE_USER_MODE: logger().info( - f"Current system is running in single user mode per EDS_SINGLE_USER_MODE." + "Current system is running in single user mode per EDS_SINGLE_USER_MODE." 
) else: - logger().info(f"Current system is running in multi user mode.") + logger().info("Current system is running in multi user mode.") # set derived values that have not been set by env variables diff --git a/src/leettools/svc/api/v1/api.py b/src/leettools/svc/api/v1/api.py index a3313f8..8b9f3d3 100644 --- a/src/leettools/svc/api/v1/api.py +++ b/src/leettools/svc/api/v1/api.py @@ -32,7 +32,7 @@ def __init__(self, *args, **kwargs): super().include_router(self.user_router, prefix="/users", tags=["Users"]) self.kb_router = kb_router.KnowledgeBaseRouter() - super().include_router(self.kb_router, prefix=f"/kbs", tags=["KnowledgeBase"]) + super().include_router(self.kb_router, prefix="/kbs", tags=["KnowledgeBase"]) self.chat_router = chat_router.ChatRouter() super().include_router(self.chat_router, prefix="/chat", tags=["ChatHistory"]) diff --git a/src/leettools/svc/api/v1/routers/chat_router.py b/src/leettools/svc/api/v1/routers/chat_router.py index 7b7ffea..b95f441 100644 --- a/src/leettools/svc/api/v1/routers/chat_router.py +++ b/src/leettools/svc/api/v1/routers/chat_router.py @@ -28,7 +28,6 @@ class QueryProgress(BaseModel): - org_name: Optional[str] = None kb_name: Optional[str] = None user_name: Optional[str] = None @@ -432,7 +431,6 @@ async def update_chat_history_new( ch_update: CHUpdate, calling_user: User = Depends(self.auth.get_user_from_request), ) -> ChatHistory: - if ch_update.chat_id != chat_id: raise HTTPException( status_code=400, @@ -453,7 +451,6 @@ async def share_chat_history( chat_id: str, calling_user: User = Depends(self.auth.get_user_from_request), ) -> Optional[ChatHistory]: - ch = self.chat_manager.get_ch_entry(calling_user.username, chat_id) if ch is None: logger().warning( @@ -475,7 +472,6 @@ async def unshare_chat_history( chat_id: str, calling_user: User = Depends(self.auth.get_user_from_request), ) -> Optional[ChatHistory]: - ch = self.chat_manager.get_ch_entry(calling_user.username, chat_id) if ch is None: logger().warning( diff --git 
a/src/leettools/svc/api/v1/routers/docsink_router.py b/src/leettools/svc/api/v1/routers/docsink_router.py index bce0705..d9f6520 100644 --- a/src/leettools/svc/api/v1/routers/docsink_router.py +++ b/src/leettools/svc/api/v1/routers/docsink_router.py @@ -11,7 +11,6 @@ class DocSinkRouter(APIRouterBase): - def _get_org(self, org_name: str) -> Org: org = self.org_manager.get_org_by_name(org_name) if org is None: diff --git a/src/leettools/svc/api/v1/routers/docsource_router.py b/src/leettools/svc/api/v1/routers/docsource_router.py index 735f0b7..8888fd3 100644 --- a/src/leettools/svc/api/v1/routers/docsource_router.py +++ b/src/leettools/svc/api/v1/routers/docsource_router.py @@ -17,7 +17,6 @@ class DocSourceRouter(APIRouterBase): - def _get_org(self, org_name: str) -> Org: org = self.org_manager.get_org_by_name(org_name) if org is None: diff --git a/src/leettools/svc/api/v1/routers/document_router.py b/src/leettools/svc/api/v1/routers/document_router.py index 8d2e709..15a4d75 100644 --- a/src/leettools/svc/api/v1/routers/document_router.py +++ b/src/leettools/svc/api/v1/routers/document_router.py @@ -19,7 +19,6 @@ class DocumentRouter(APIRouterBase): - def _get_org(self, org_name: str) -> Org: org = self.org_manager.get_org_by_name(org_name) if org is None: @@ -280,7 +279,6 @@ async def delete_document_by_id( document_uuid: str, calling_user: User = Depends(self.auth.get_user_from_request), ): - org = self._get_org(org_name) kb = self._get_kb(org_name, kb_name) if not self.auth.can_write_kb(org=org, kb=kb, user=calling_user): diff --git a/src/leettools/svc/api/v1/routers/file_router.py b/src/leettools/svc/api/v1/routers/file_router.py index d289680..384e001 100644 --- a/src/leettools/svc/api/v1/routers/file_router.py +++ b/src/leettools/svc/api/v1/routers/file_router.py @@ -8,7 +8,6 @@ class FileRouter(APIRouterBase): - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) context = self.context @@ -55,9 +54,9 @@ async def read_raw_document(uri: str) 
-> FileResponse: # This security check assumes files are served from a subdirectory `safe_dir`. parents = absolute_file_path.resolve().parents if ( - not safe_base_path.resolve() in parents - and not incoming_file_path.resolve() in parents - and not uploads_file_path.resolve() in parents + safe_base_path.resolve() not in parents + and incoming_file_path.resolve() not in parents + and uploads_file_path.resolve() not in parents ): raise HTTPException( status_code=400, diff --git a/src/leettools/svc/api/v1/routers/job_router.py b/src/leettools/svc/api/v1/routers/job_router.py index 50c6d1b..25ac035 100644 --- a/src/leettools/svc/api/v1/routers/job_router.py +++ b/src/leettools/svc/api/v1/routers/job_router.py @@ -13,7 +13,6 @@ class JobRouter(APIRouterBase): - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) context = self.context diff --git a/src/leettools/svc/api/v1/routers/md_router.py b/src/leettools/svc/api/v1/routers/md_router.py index fdc0278..b35f85c 100644 --- a/src/leettools/svc/api/v1/routers/md_router.py +++ b/src/leettools/svc/api/v1/routers/md_router.py @@ -15,7 +15,6 @@ class MDRouter(APIRouterBase): - def _get_org(self, org_name: str) -> Org: org = self.org_manager.get_org_by_name(org_name) if org is None: diff --git a/src/leettools/svc/api/v1/routers/org_router.py b/src/leettools/svc/api/v1/routers/org_router.py index 0ac0d63..1a64ace 100644 --- a/src/leettools/svc/api/v1/routers/org_router.py +++ b/src/leettools/svc/api/v1/routers/org_router.py @@ -8,7 +8,6 @@ class OrgRouter(APIRouterBase): - def _get_org(self, org_name: str): org = self.org_manager.get_org_by_name(org_name) if org is None: diff --git a/src/leettools/svc/api/v1/routers/segment_router.py b/src/leettools/svc/api/v1/routers/segment_router.py index 63b4798..f8d5c86 100644 --- a/src/leettools/svc/api/v1/routers/segment_router.py +++ b/src/leettools/svc/api/v1/routers/segment_router.py @@ -11,7 +11,6 @@ class SegmentRouter(APIRouterBase): - def _get_org(self, 
org_name: str) -> Org: org = self.org_manager.get_org_by_name(org_name) if org is None: diff --git a/src/leettools/svc/api/v1/routers/settings_router.py b/src/leettools/svc/api/v1/routers/settings_router.py index 50e46a8..22fcf18 100644 --- a/src/leettools/svc/api/v1/routers/settings_router.py +++ b/src/leettools/svc/api/v1/routers/settings_router.py @@ -19,7 +19,6 @@ class SettingsRouter(APIRouterBase): - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) context = self.context diff --git a/src/leettools/svc/api/v1/routers/strategy_router.py b/src/leettools/svc/api/v1/routers/strategy_router.py index 602607d..b5f01ae 100644 --- a/src/leettools/svc/api/v1/routers/strategy_router.py +++ b/src/leettools/svc/api/v1/routers/strategy_router.py @@ -21,7 +21,6 @@ class StrategyRouter(APIRouterBase): - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) context = self.context @@ -67,7 +66,7 @@ async def flow_options( flow: AbstractFlow = None try: flow = self.flow_manager.get_flow_by_type(flow_type) - except Exception as e: + except Exception: raise HTTPException( status_code=400, detail=f"Invalid flow type {flow_type}", diff --git a/src/leettools/svc/api/v1/routers/task_router.py b/src/leettools/svc/api/v1/routers/task_router.py index 7e4f544..a352592 100644 --- a/src/leettools/svc/api/v1/routers/task_router.py +++ b/src/leettools/svc/api/v1/routers/task_router.py @@ -19,7 +19,6 @@ class TaskRouter(APIRouterBase): - def _get_org(self, org_name: str) -> Org: org = self.org_manager.get_org_by_name(org_name) if org is None: diff --git a/src/leettools/svc/api/v1/routers/user_router.py b/src/leettools/svc/api/v1/routers/user_router.py index 63547d9..f24f309 100644 --- a/src/leettools/svc/api/v1/routers/user_router.py +++ b/src/leettools/svc/api/v1/routers/user_router.py @@ -14,7 +14,6 @@ class UserRouter(APIRouterBase): - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) context = self.context diff --git 
a/src/leettools/svc/api_router_base.py b/src/leettools/svc/api_router_base.py index 0abe258..3033b32 100644 --- a/src/leettools/svc/api_router_base.py +++ b/src/leettools/svc/api_router_base.py @@ -6,7 +6,6 @@ class APIRouterBase(APIRouter): - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) context = ContextManager().get_context() diff --git a/src/leettools/svc/main.py b/src/leettools/svc/main.py index 1744892..d336b86 100644 --- a/src/leettools/svc/main.py +++ b/src/leettools/svc/main.py @@ -66,7 +66,6 @@ def start_service( if __name__ == "__main__": - # set the env variable INIT_STRATEGY_STORE to True environ["INIT_STRATEGY_STORE"] = "true" start_service() diff --git a/src/leettools/web/image_searcher.py b/src/leettools/web/image_searcher.py index 171307e..ffc3ed1 100644 --- a/src/leettools/web/image_searcher.py +++ b/src/leettools/web/image_searcher.py @@ -9,7 +9,6 @@ from leettools.common.utils import config_utils, file_utils from leettools.context_manager import Context from leettools.core.consts import flow_option -from leettools.core.consts.retriever_type import RetrieverType from leettools.core.schemas.knowledgebase import KnowledgeBase from leettools.core.schemas.organization import Org from leettools.core.schemas.user import User @@ -34,7 +33,6 @@ def __init__( kb: Optional[KnowledgeBase] = None, user: Optional[User] = None, ): - self.context = context self.org = org self.kb = kb @@ -73,7 +71,6 @@ def search_image( flow_options: Optional[Dict[str, Any]] = {}, display_logger: Optional[EventLogger] = None, ) -> List[ImageSearchResult]: - # visited_urls stores the url to path mapping visited_urls: Dict[str, str] = {} diff --git a/src/leettools/web/retrievers/baidu/baidu.py b/src/leettools/web/retrievers/baidu/baidu.py index cc63a19..8216db6 100644 --- a/src/leettools/web/retrievers/baidu/baidu.py +++ b/src/leettools/web/retrievers/baidu/baidu.py @@ -9,7 +9,6 @@ from selenium.webdriver.support import expected_conditions as EC from 
selenium.webdriver.support.ui import WebDriverWait -from leettools.common.logging import logger from leettools.common.logging.event_logger import EventLogger from leettools.context_manager import Context from leettools.core.schemas.knowledgebase import KnowledgeBase @@ -36,7 +35,7 @@ def __init__( Initializes the Baidu Search object """ super().__init__(context, org, kb, user) - self.logger.info(f"Installing Chrome Driver...") + self.logger.info("Installing Chrome Driver...") # Configure Chrome options to run in headless mode chrome_options = Options() @@ -72,7 +71,6 @@ def _retrieve( flow_options: Dict[str, Any], display_logger: EventLogger, ) -> List[SearchResult]: - days_limit, max_results = search_utils.get_common_search_paras( flow_options=flow_options, settings=self.context.settings, diff --git a/src/leettools/web/retrievers/bing/bing.py b/src/leettools/web/retrievers/bing/bing.py index 51b937f..3a63886 100644 --- a/src/leettools/web/retrievers/bing/bing.py +++ b/src/leettools/web/retrievers/bing/bing.py @@ -2,9 +2,8 @@ import requests -from leettools.common.logging import logger from leettools.common.logging.event_logger import EventLogger -from leettools.context_manager import Context, ContextManager +from leettools.context_manager import Context from leettools.core.schemas.knowledgebase import KnowledgeBase from leettools.core.schemas.organization import Org from leettools.core.schemas.user import User diff --git a/src/leettools/web/retrievers/firecrawl/firecrawl.py b/src/leettools/web/retrievers/firecrawl/firecrawl.py index e998156..aebb219 100644 --- a/src/leettools/web/retrievers/firecrawl/firecrawl.py +++ b/src/leettools/web/retrievers/firecrawl/firecrawl.py @@ -164,7 +164,7 @@ def _get_api_url(self) -> str: ) if api_url is None or api_url == "": api_url = "https://api.firecrawl.dev" - except Exception as e: + except Exception: self.logger.debug( "Failed to get FireCrawl API URL. Using the default URL. 
" ) diff --git a/src/leettools/web/retrievers/google/google.py b/src/leettools/web/retrievers/google/google.py index b513e55..50fbb1e 100644 --- a/src/leettools/web/retrievers/google/google.py +++ b/src/leettools/web/retrievers/google/google.py @@ -67,13 +67,9 @@ def _retrieve( flow_options: Dict[str, Any], display_logger: EventLogger, ) -> List[SearchResult]: - from leettools.common.utils import config_utils from leettools.core.consts.flow_option import ( FLOW_OPTION_EXCLUDED_SITES, - FLOW_OPTION_IMAGE_SEARCH, - FLOW_OPTION_SEARCH_ITERATION, - FLOW_OPTION_TARGET_SITE, ) display_logger.info(f"Google search with query: {query}...") @@ -147,8 +143,8 @@ def _retrieve( if max_iteration == 0: display_logger.warning( - f"Max iteration is set to 0, which means no search will be performed." - f"Setting it to default value 3." + "Max iteration is set to 0, which means no search will be performed." + "Setting it to default value 3." ) max_iteration = 3 @@ -286,13 +282,13 @@ def _process_url( return None if search_results_dict is None or len(search_results_dict) == 0: display_logger.warning( - f"search_results_dict is None or empty, which should not happen." + "search_results_dict is None or empty, which should not happen." 
) return None results = search_results_dict.get("items", []) if results is None or len(results) == 0: - display_logger.warning(f"No 'items' field in resp.text.") + display_logger.warning("No 'items' field in resp.text.") return None return results diff --git a/src/leettools/web/retrievers/google_patent/google_patent.py b/src/leettools/web/retrievers/google_patent/google_patent.py index 83e2707..d254aff 100644 --- a/src/leettools/web/retrievers/google_patent/google_patent.py +++ b/src/leettools/web/retrievers/google_patent/google_patent.py @@ -57,7 +57,6 @@ def _retrieve( flow_options: Dict[str, Any], display_logger: EventLogger, ) -> List[SearchResult]: - display_logger.info(f"Searching with query {query}...") days_limit, max_results = search_utils.get_common_search_paras( @@ -89,7 +88,7 @@ def _retrieve( while len(search_results) < max_results and last_iteration_filled: iteration += 1 - url_paras = f"&safe=active" f"&start={start}" f"{date_restrict}" + url_paras = f"&safe=active&start={start}{date_restrict}" url = f"{url_base}{url_paras}" redacted_url = f"{redacted_url_base}{url_paras}" diff --git a/src/leettools/web/retrievers/local/local.py b/src/leettools/web/retrievers/local/local.py index 8be00a1..e90df43 100644 --- a/src/leettools/web/retrievers/local/local.py +++ b/src/leettools/web/retrievers/local/local.py @@ -35,7 +35,6 @@ def retrieve_search_result( flow_options: Optional[Dict[str, Any]] = {}, display_logger: Optional[EventLogger] = None, ) -> List[SearchResult]: - if display_logger is None: display_logger = logger() context = self.context diff --git a/src/leettools/web/retrievers/searx/searx.py b/src/leettools/web/retrievers/searx/searx.py index d096076..8902d5d 100644 --- a/src/leettools/web/retrievers/searx/searx.py +++ b/src/leettools/web/retrievers/searx/searx.py @@ -139,7 +139,6 @@ def _search_searx_advanced( # Example usage if __name__ == "__main__": - context = ContextManager().get_context() ss = SearxSearch(context=context) query = "AI 
advancements" diff --git a/src/leettools/web/retrievers/tavily/tavily.py b/src/leettools/web/retrievers/tavily/tavily.py index de8f2e8..593fae8 100644 --- a/src/leettools/web/retrievers/tavily/tavily.py +++ b/src/leettools/web/retrievers/tavily/tavily.py @@ -4,7 +4,6 @@ from leettools.common import exceptions from leettools.common.logging.event_logger import EventLogger -from leettools.common.utils.obj_utils import ENV_VAR_PREFIX from leettools.context_manager import Context from leettools.core.schemas.knowledgebase import KnowledgeBase from leettools.core.schemas.organization import Org diff --git a/src/leettools/web/scrapers/arxiv/arxiv.py b/src/leettools/web/scrapers/arxiv/arxiv.py index b4637d3..341d7c5 100644 --- a/src/leettools/web/scrapers/arxiv/arxiv.py +++ b/src/leettools/web/scrapers/arxiv/arxiv.py @@ -11,7 +11,6 @@ class ArxivScraper(AbstractScraper): - def __init__( self, session: requests.Session = None, diff --git a/src/leettools/web/scrapers/beautiful_soup/beautiful_soup.py b/src/leettools/web/scrapers/beautiful_soup/beautiful_soup.py index 503b4ef..d822574 100644 --- a/src/leettools/web/scrapers/beautiful_soup/beautiful_soup.py +++ b/src/leettools/web/scrapers/beautiful_soup/beautiful_soup.py @@ -1,4 +1,3 @@ -from datetime import timedelta from pathlib import Path from typing import Optional @@ -7,7 +6,7 @@ from leettools.common.logging import logger from leettools.common.logging.event_logger import EventLogger -from leettools.common.utils import file_utils, time_utils, url_utils +from leettools.common.utils import file_utils from leettools.core.consts.return_code import ReturnCode from leettools.web.schemas.scrape_result import ScrapeResult from leettools.web.scrapers.scraper import AbstractScraper @@ -18,7 +17,6 @@ class BeautifulSoupSimpleScraper(AbstractScraper): - def __init__( self, session: requests.Session = None, @@ -39,7 +37,7 @@ def _is_content_length_ok(self, content: str) -> bool: context = ContextManager().get_context() if 
context.is_test: self.display_logger.info( - f"In the test mode. Ignoring the content length check." + "In the test mode. Ignoring the content length check." ) else: if len(content) < 300: diff --git a/src/leettools/web/scrapers/crawler4ai/crawler4ai.py b/src/leettools/web/scrapers/crawler4ai/crawler4ai.py index 8127a6d..ecfc97f 100644 --- a/src/leettools/web/scrapers/crawler4ai/crawler4ai.py +++ b/src/leettools/web/scrapers/crawler4ai/crawler4ai.py @@ -15,7 +15,6 @@ class Crawler4aiScraper(AbstractScraper): - def __init__( self, session: requests.Session = None, diff --git a/src/leettools/web/scrapers/firecrawl/firecrawl.py b/src/leettools/web/scrapers/firecrawl/firecrawl.py index 18c0aee..5e5b9c3 100644 --- a/src/leettools/web/scrapers/firecrawl/firecrawl.py +++ b/src/leettools/web/scrapers/firecrawl/firecrawl.py @@ -167,7 +167,7 @@ def _get_api_url(self) -> str: ) if api_url is None or api_url == "": api_url = "https://api.firecrawl.dev" - except Exception as e: + except Exception: self.display_logger.debug( "Failed to get FireCrawl API URL. Using the default URL. 
" ) diff --git a/src/leettools/web/scrapers/newspaper/newspaper.py b/src/leettools/web/scrapers/newspaper/newspaper.py index 40f09f2..97ed6ec 100644 --- a/src/leettools/web/scrapers/newspaper/newspaper.py +++ b/src/leettools/web/scrapers/newspaper/newspaper.py @@ -9,7 +9,6 @@ class NewspaperScraper(AbstractScraper): - def __init__( self, session: requests.Session = None, diff --git a/src/leettools/web/scrapers/pymupdf/pymupdf.py b/src/leettools/web/scrapers/pymupdf/pymupdf.py index 5fd2bce..493a28c 100644 --- a/src/leettools/web/scrapers/pymupdf/pymupdf.py +++ b/src/leettools/web/scrapers/pymupdf/pymupdf.py @@ -11,7 +11,6 @@ class PyMuPDFScraper(AbstractScraper): - def __init__( self, session: requests.Session = None, diff --git a/src/leettools/web/scrapers/scraper_utils.py b/src/leettools/web/scrapers/scraper_utils.py index 67c61e8..8e215e9 100644 --- a/src/leettools/web/scrapers/scraper_utils.py +++ b/src/leettools/web/scrapers/scraper_utils.py @@ -1,7 +1,6 @@ from datetime import timedelta from typing import Any -import requests from leettools.common.logging.event_logger import EventLogger from leettools.common.utils import file_utils, time_utils, url_utils @@ -50,7 +49,7 @@ def is_content_length_ok(content: str, display_logger: EventLogger) -> bool: context = ContextManager().get_context() if context.is_test: - display_logger.info(f"In the test mode. Ignoring the content length check.") + display_logger.info("In the test mode. 
Ignoring the content length check.") else: if len(content) < 300: display_logger.info( diff --git a/src/leettools/web/scrapers/web_base_loader/web_base_loader.py b/src/leettools/web/scrapers/web_base_loader/web_base_loader.py index b3189a4..5bfb584 100644 --- a/src/leettools/web/scrapers/web_base_loader/web_base_loader.py +++ b/src/leettools/web/scrapers/web_base_loader/web_base_loader.py @@ -9,7 +9,6 @@ class WebBaseLoaderScraper(AbstractScraper): - def __init__( self, session: requests.Session = None, diff --git a/src/leettools/web/search_utils.py b/src/leettools/web/search_utils.py index d745cfa..2e51dc6 100644 --- a/src/leettools/web/search_utils.py +++ b/src/leettools/web/search_utils.py @@ -9,7 +9,6 @@ def get_common_search_paras( flow_options: Dict[str, Any], settings: SystemSettings, display_logger: EventLogger ) -> Tuple[int, int]: - days_limit = config_utils.get_int_option_value( options=flow_options, option_name=flow_option.FLOW_OPTION_DAYS_LIMIT, @@ -33,8 +32,8 @@ def get_common_search_paras( if search_max_results == 0: display_logger.warning( - f"Max results is set to 0, which means no search will be performed." - f"Setting it to default value 10." + "Max results is set to 0, which means no search will be performed." + "Setting it to default value 10." ) search_max_results = 10 if search_max_results > settings.SEARCH_MAX_RESULTS_FROM_RETRIEVER: diff --git a/src/leettools/web/web_searcher.py b/src/leettools/web/web_searcher.py index a66b5c6..a004fcb 100644 --- a/src/leettools/web/web_searcher.py +++ b/src/leettools/web/web_searcher.py @@ -72,7 +72,7 @@ def short_description(cls) -> str: @classmethod def full_description(cls) -> str: - return """Given a query, search the web, scrape the results, and save them to + return """Given a query, search the web, scrape the results, and save them to local storage. 
""" @@ -94,7 +94,6 @@ def direct_flow_option_items(cls) -> List[FlowOptionItem]: ] def __init__(self, context: Context): - self.context = context self.settings = context.settings repo_manager = context.get_repo_manager() diff --git a/src/local/embedding/README.md b/src/local/embedding/README.md index c657bb5..4641086 100644 --- a/src/local/embedding/README.md +++ b/src/local/embedding/README.md @@ -3,10 +3,10 @@ This service provides a local embedding service that can be used to embed text into vectors (called dense embeddings comparing to sparse embeddings such as SPLADE). The service is based on the SentenceTransformer library, which provides a simple interface to embed text -into vectors. It can be used for local dev and testing purposes since using a +into vectors. It can be used for local dev and testing purposes since using a SentenceTransformer model can be slow to start up. -All string dense embedding operations should use the factor method to create the +All string dense embedding operations should use the factor method to create the embedder object as follows: ```python @@ -49,7 +49,7 @@ EDS_DEFAULT_DENSE_EMBEDDING_SERVICE_HOST=127.0.0.1 EDS_DEFAULT_DENSE_EMBEDDING_SERVICE_PORT=8001 ``` -To start the local embedding service, run the following command: +To start the local embedding service, run the following command: ```bash # by default this starts the service on http://127.0.0.1:8001 diff --git a/src/local/embedding/local_embdedding_service.py b/src/local/embedding/local_embdedding_service.py index 541a44a..7064183 100644 --- a/src/local/embedding/local_embdedding_service.py +++ b/src/local/embedding/local_embdedding_service.py @@ -7,7 +7,7 @@ from leettools.common.logging import logger from leettools.common.logging.event_logger import EventLogger -from leettools.context_manager import Context, ContextManager +from leettools.context_manager import ContextManager from leettools.eds.str_embedder._impl.dense_embedder_sentence_transformer import ( 
DenseEmbedderSentenceTransformer, ) diff --git a/src/local/embedding/run.sh b/src/local/embedding/run.sh index 7cef166..fef8a10 100755 --- a/src/local/embedding/run.sh +++ b/src/local/embedding/run.sh @@ -12,10 +12,10 @@ if [ ! -f "$BASE_DIR/.env" ]; then exit 1 fi -while IFS='=' read -r name value; do - if [[ ! $name =~ ^\# ]] && [[ -n $name ]]; then - export "$name=$value"; - fi; +while IFS='=' read -r name value; do + if [[ ! $name =~ ^\# ]] && [[ -n $name ]]; then + export "$name=$value"; + fi; done < "$BASE_DIR/.env" if [ -z "${LEET_HOME-}" ]; then