Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ dependencies = [
"sentence_transformers==2.5.1",
"tiktoken==0.8.0",
"duckdb==1.1.3",
"docling==2.25.2",
"docling_core==2.21.1",
"docling==2.26.0",
"docling_core==2.23.0",
"chonkie==0.5.1",
"langchain-community==0.3.15",
"firecrawl-py==1.12.0",
Expand Down
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ scipy==1.14.1
sentence_transformers==2.5.1
tiktoken==0.8.0
duckdb==1.1.3
docling==2.25.2
docling_core==2.21.1
docling==2.26.0
docling_core==2.23.0
chonkie==0.5.1
langchain-community==0.3.15
firecrawl-py==1.12.0
Expand Down
23 changes: 23 additions & 0 deletions src/leettools/chat/schemas/chat_history.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,29 @@ class ChatHistory(CHInDB):
None, description="For adhoc chat, we need to return the kb_name created."
)

def get_history_str(self, ignore_last: bool = False) -> str:
    """
    Render the chat history as a plain-text transcript.

    Each query is written as "[query] <query_content>" on its own line,
    followed by "[answer] <answer_content>" when an answer exists at the
    same index:

        [query] user_query_1
        [answer] assistant_answer_1
        [query] user_query_2
        [answer] assistant_answer_2
        ...

    Args:
    - ignore_last: if True, the last query (and the answer at the same
      index, if there is one) is left out of the transcript.

    Returns:
    - The transcript, every line terminated by a newline. An empty string
      is returned when there are no queries to render.
    """
    if not self.queries:
        return ""

    total_count = len(self.queries)
    if ignore_last:
        total_count -= 1

    # NOTE(review): answers are paired with queries purely by list position;
    # confirm that self.answers is always ordered the same way as self.queries.
    answer_count = len(self.answers)

    # Collect the pieces and join once instead of growing a string with +=.
    parts = []
    for i in range(total_count):
        parts.append(f"[query] {self.queries[i].query_content}\n")
        # A query may not have an answer yet (e.g. the one being processed).
        if i < answer_count:
            parts.append(f"[answer] {self.answers[i].answer_content}\n")
    return "".join(parts)

@classmethod
def from_ch_in_db(ChatHistory, ch_in_db: CHInDB) -> "ChatHistory":
# we need to assign attributes with non-None values
Expand Down
1 change: 1 addition & 0 deletions src/leettools/core/consts/flow_option.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,6 @@
FLOW_OPTION_STRICT_CONTEXT = "strict_context"
FLOW_OPTION_SUMMARIZING_MODEL = "summarizing_model"
FLOW_OPTION_TARGET_SITE = "target_site"
FLOW_OPTION_TIMEZONE = "timezone"
FLOW_OPTION_WORD_COUNT = "word_count"
FLOW_OPTION_WRITING_MODEL = "writing_model"
7 changes: 7 additions & 0 deletions src/leettools/core/schemas/chat_query_item.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,9 @@ def get_strategy(
return strategy

if strategy_base is None:
display_logger.debug(
f"Using strategy id (strategy_base not provided): {strategy_id}"
)
strategy = strategy_store.get_strategy_by_id(strategy_id)
if strategy is None:
raise exceptions.EntityNotFoundException(
Expand All @@ -128,4 +131,8 @@ def get_strategy(
strategy_status=StrategyStatus.ACTIVE.value,
is_system=True,
)
else:
display_logger.debug(
f"Using strategy id (strategy_base provided): {strategy_id}"
)
return strategy
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
{
"strategy_name": "default",
"strategy_description": "Default strategy using system default settings"
"strategy_description": "Default strategy using system default settings",
"rewrite": "default",
"rewrite_options": {"model_name": "gpt-4o-mini"}
}
Original file line number Diff line number Diff line change
@@ -1,41 +1,42 @@
When users ask questions, they often do not provide enough information or
clear purposes. However, given a context about the question, we want to rewrite
the question so that the LLM can have a more clear goal and path to generate the
answer. Here are a few examples:
[default rewrite user prompt]

Example 1:
Context: The user is asking the question on a web site called google.com.
Question: How can I create an account on the website?
Rewrite:
{{ date_instruction }}

{
"rewritten_question": “I want to create an account on the website like google.com, please
show me a sequence of operations on the website and information I need in each step
to create an account.”
}
When users ask questions, they often provide limited context or unclear objectives, making
it challenging for an LLM to generate precise answers. Considering the previous query
history if provided, rewrite the user's current question into a detailed and structured
query, clearly stating the intent and outlining specific steps or information needed.
Additionally, replace any pronouns or vague references (like "it" or "they") with specific
terms to clarify exactly what is being referred to.

Example 2:
Context: We are working on some C++ code.
Question: My program is reporting an OOM error, what should I do?
Rewrite:
Example:

Original Question:
How do I reset my password?

Rewritten Question:
{
"rewritten_question": "I am getting an OOM (out of memory) error in my C++ program,
please provide me with a sequence of steps to diagnose and fix the error."
"rewritten_question": "I need to reset my password on the website. Provide a clear
sequence of steps, including where on the website I should navigate, what information I'll
need, and any verification steps involved.",
"search_keywords": "reset password steps website verification"
}

As illustrated by the above example, rewrite the given question, using the same language
as the original question, as a list of instructions that lead to a clear path to generate
the answer. Just output the rewritten query itself without any extra information:
Rewrite the following question into a clear, structured, and actionable set of
instructions to facilitate generating an accurate and useful answer. Be sure to explicitly
clarify all pronouns or vague references. Additionally, generate a concise string containing
relevant keywords suitable for performing an internet search to gather further information
about the query. Use the user query history for context, if provided.

Context: {{ context }}
{{ query_history_instruction }}

Question: {{ question }}
Question:
{{ question }}

Please output your answer in the following format, ensuring the output is formatted as
JSON data, the rewritten question is in the same language as the input question, and
not in a JSON block:
Output your response strictly in the following JSON format, ensuring the output is
formatted as JSON data, and not in a JSON block:

{
"rewritten_question": "rewritten_question"
}
"rewritten_question": "rewritten_question",
"search_keywords": "search_keywords"
}
2 changes: 1 addition & 1 deletion src/leettools/core/strategy/schemas/strategy_conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class StrategyConfBase(BaseModel):
"intention_list will be passed in the prompt.",
)

rewrite: Optional[str] = Field(None, description="The query rewrite strategy")
rewrite: Optional[str] = Field("default", description="The query rewrite strategy")
rewrite_options: Optional[Dict[str, Any]] = Field(
None,
description="The options for rewrite, right now support "
Expand Down
3 changes: 3 additions & 0 deletions src/leettools/eds/api_caller/api_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,9 @@ def run_inference_call_direct(
display_logger.info(
f"({completion.usage.total_tokens}) tokens used for ({call_target})."
)
except Exception as e:
display_logger.error(f"Error in running inference call: {e}")
raise e
finally:
end_timestamp_in_ms = time_utils.cur_timestamp_in_ms()
if completion is not None:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import os
import traceback
from pathlib import Path
from typing import List, Optional

from leettools.common.logging import logger
from leettools.core.consts.return_code import ReturnCode
Expand Down
7 changes: 2 additions & 5 deletions src/leettools/eds/pipeline/convert/_impl/parser_docling.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,5 @@ def pptx2md(self, pptx_filepath: str, target_path: Optional[Path] = None) -> str
return self._convert(pptx_filepath, target_path)

def xlsx2md(self, xlsx_filepath: str, target_path: Optional[Path] = None) -> str:
    """
    Convert an XLSX file to markdown.

    Delegates to self._convert with the same arguments, as pptx2md does.

    Args:
    - xlsx_filepath: path of the XLSX file to convert.
    - target_path: optional path forwarded unchanged to self._convert.

    Returns:
    - Whatever self._convert returns for this file.
    """
    # The previous body logged "not supported yet" and returned "", which left
    # the conversion call below it unreachable.
    logger().debug(f"Converting XLSX to markdown: {xlsx_filepath}")
    return self._convert(xlsx_filepath, target_path)
Original file line number Diff line number Diff line change
@@ -1,40 +1,41 @@
When users ask questions, they often do not provide enough information or
clear purposes. However, given a context about the question, we want to rewrite
the question so that the LLM can have a more clear goal and path to generate the
answer. Here are a few examples:
{{ date_instruction }}
When users ask questions, they often provide limited context or unclear objectives, making
it challenging for an LLM to generate precise answers. Considering the previous query
history if provided, rewrite the user's current question into a detailed and structured
query, clearly stating the intent and outlining specific steps or information needed.
Additionally, replace any pronouns or vague references (like "it" or "they") with specific
terms to clarify exactly what is being referred to.

Example 1:
Question: How can I create an account on the website?
Context: The user is asking the question on a web site called google.com.
Rewrite:
{{ query_history_instruction }}

{
"rewritten_question": “I want to create an account on the website like google.com, please
show me a sequence of operations on the website and information I need in each step
to create an account.”
}
Example:

Example 2:
Question: My program is reporting an OOM error, what should I do?
Context: We are working on some C++ code.
Rewrite:
Original Question:
How do I reset my password?

Rewritten Question:
{
"rewritten_question": "I am getting an OOM (out of memory) error in my C++ program,
please provide me with a sequence of steps to diagnose and fix the error."
"rewritten_question": "I need to reset my password on the website. Provide a clear
sequence of steps, including where on the website I should navigate, what information I'll
need, and any verification steps involved.",
"search_keywords": "reset password steps website verification"
}

As illustrated by the above example, given the question and the context, rewrite the
given question as a list of instructions that lead to a clear path to generate the answer.
Just output the rewritten query itself without any extra information:
Using the provided query history for context, rewrite the following question into a clear,
structured, and actionable set of instructions to facilitate generating an accurate and
useful answer. Additionally, generate a concise string containing relevant keywords
suitable for performing an internet search to gather further information about the query.
Be sure to explicitly clarify all pronouns or vague references.

Question: {{ question }}
Question:
{{ question }}

Context: {{ context }}

Please output your answer in the following format, ensuring the output is formatted as
JSON data, and not in a JSON block:
Output your response strictly in the following JSON format, ensuring the output is
formatted as JSON data, and not in a JSON block:

{
"rewritten_question": "rewritten_question"
}
"rewritten_question": "rewritten_question",
"search_keywords": "search_keywords"
}
52 changes: 29 additions & 23 deletions src/leettools/eds/rag/rewrite/_impl/prompts/default_user_prompt.txt
Original file line number Diff line number Diff line change
@@ -1,35 +1,41 @@
When users ask questions, they often do not provide enough information or
clear purposes. We want to rewrite the question so that the LLM can have
a more clear goal and path to generate the answer. Here are a few examples:
[rewrite user prompt under the rewrite _impl]

Example 1:
Question: How can I create an account on the website?
Rewrite:
{{ date_instruction }}
When users ask questions, they often provide limited context or unclear objectives, making
it challenging for an LLM to generate precise answers. Considering the previous query
history if provided, rewrite the user's current question into a detailed and structured
query, clearly stating the intent and outlining specific steps or information needed.
Additionally, replace any pronouns or vague references (like "it" or "they") with specific
terms to clarify exactly what is being referred to.

{
"rewritten_question": “I want to create an account on the website, please show me
a sequence of operations on the website and information I need in each step
to create an account.”
}
{{ query_history_instruction }}

Example 2:
Question: My program is reporting an OOM error, what should I do?
Rewrite:
Example:

Original Question:
How do I reset my password?

Rewritten Question:
{
"rewritten_question": "I am getting an OOM (out of memory) error in my program,
please provide me with a sequence of steps to diagnose and fix the error."
"rewritten_question": "I need to reset my password on the website. Provide a clear
sequence of steps, including where on the website I should navigate, what information I'll
need, and any verification steps involved.",
"search_keywords": "reset password steps website verification"
}

As illustrated by the above example, rewrite the given question as a list of
instructions that lead to a clear path to generate the answer. Just output the
rewritten query itself without any extra information:
Using the provided query history for context, rewrite the following question into a clear,
structured, and actionable set of instructions to facilitate generating an accurate and
useful answer. Be sure to explicitly clarify all pronouns or vague references. Additionally,
generate a concise string containing relevant keywords suitable for performing an
internet search to gather further information about the query.

Question:
{{ question }}

Please output your answer in the following format, ensuring the output is formatted as
JSON data, and not in a JSON block:
Output your response strictly in the following JSON format, ensuring the output is
formatted as JSON data, and not in a JSON block:

{
"rewritten_question": "rewritten_question"
}
"rewritten_question": "rewritten_question",
"search_keywords": "search_keywords"
}
37 changes: 34 additions & 3 deletions src/leettools/eds/rag/rewrite/_impl/rewrite_direct_dynamic.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@

import click

from leettools.chat.history_manager import get_history_manager
from leettools.common.logging import logger
from leettools.common.logging.event_logger import EventLogger
from leettools.common.utils.template_eval import render_template
from leettools.context_manager import Context, ContextManager
from leettools.core.schemas.chat_query_item import ChatQueryItem
from leettools.core.schemas.chat_query_metadata import ChatQueryMetadata
from leettools.core.schemas.knowledgebase import KnowledgeBase
from leettools.core.schemas.organization import Org
Expand All @@ -22,6 +24,8 @@
get_query_rewriter_by_strategy,
)
from leettools.eds.rag.schemas.rewrite import Rewrite
from leettools.flow.exec_info import ExecInfo
from leettools.flow.utils import prompt_utils

_script_dir = os.path.dirname(os.path.abspath(__file__))

Expand All @@ -39,12 +43,39 @@ def __init__(
)

def rewrite(
self, org: Org, kb: KnowledgeBase, query: str, query_metadata: ChatQueryMetadata
self,
org: Org,
kb: KnowledgeBase,
query_item: ChatQueryItem,
query_metadata: ChatQueryMetadata,
) -> Rewrite:

self.setup_prompts_for_intention(query_metadata)

user_prompt = render_template(self.user_prompt_template, {"question": query})
query = query_item.query_content

# add query history
query_id = query_item.query_id
ch_manager = get_history_manager(self.context)
query_history = ch_manager.get_ch_entry(
username=self.user.username,
chat_id=query_item.chat_id,
)
if query_history is not None and query_history != "":
query_history_str = query_history.get_history_str(ignore_last=True)
query_history_instruction = (
"Here is the chat history:\n" + query_history_str
)
else:
query_history_instruction = ""

user_prompt = render_template(
self.user_prompt_template,
{
"question": query,
"query_history_instruction": query_history_instruction,
"date_instruction": prompt_utils.date_instruction(),
},
)
logger().debug(f"Final user prompt for rewrite: {user_prompt}")

system_prompt = render_template(
Expand Down
Loading
Loading