Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""persona multilingual_query_expansion flag

Revision ID: a3f1d7c4e9b2
Revises: c8a4e2f9d1b3
Create Date: 2026-05-04 12:00:00.000000

"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
# Plain assignments (no `: None` annotations): annotating these as type
# `None` is misleading — Alembic's own template leaves them unannotated
# (or uses `str | None`), and `revision`/`down_revision` above carry no
# annotation either. Values are unchanged.
revision = "a3f1d7c4e9b2"
down_revision = "c8a4e2f9d1b3"
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Add the boolean `persona.multilingual_query_expansion` column.

    NOT NULL with a server-side default of false, so existing persona
    rows are backfilled to "off" during the migration.
    """
    flag_column = sa.Column(
        "multilingual_query_expansion",
        sa.Boolean(),
        nullable=False,
        server_default=sa.text("false"),
    )
    op.add_column("persona", flag_column)


def downgrade() -> None:
    """Revert upgrade(): drop the flag column from `persona`."""
    op.drop_column("persona", "multilingual_query_expansion")
151 changes: 151 additions & 0 deletions backend/danswer/chat/multilingual_translation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
"""Helpers for the per-persona multi-language post-processing pass.

When a persona has `multilingual_query_expansion=True` and the user's
query is non-English, the answering LLM still produces English most of
the time (it tends to mirror the English context corpus regardless of
the LANGUAGE_HINT directive). We compensate by post-translating the
English answer back into the user's original language.

Trade-off: in translate mode we buffer the streamed answer instead of
showing it token-by-token. The user sees a brief delay (one extra LLM
round-trip), but reliably gets a reply in their language. English
queries are unaffected — they keep streaming normally.
"""
from __future__ import annotations

import unicodedata

from danswer.llm.interfaces import LLM
from danswer.llm.utils import dict_based_prompt_to_langchain_prompt
from danswer.llm.utils import message_to_string
from danswer.utils.logger import setup_logger

logger = setup_logger()


# Display name passed to the translation prompt. Keys are the language
# codes detect_query_language returns. Anything not in this map is
# treated as English (no translation needed).
_LANGUAGE_NAMES: dict[str, str] = {
"ja": "Japanese",
"zh": "Chinese (Simplified)",
"ko": "Korean",
}


def detect_query_language(text: str) -> str:
    """Classify `text` by Unicode script into 'ja', 'ko', 'zh', or 'en'.

    Deliberately cheap: one pass over the code points, no external
    detector. Kana implies Japanese, Hangul syllables imply Korean, and
    CJK ideographs (with neither kana nor hangul present) imply Chinese.
    Any non-English script reaching roughly 5% of the letter characters
    wins; everything else — including empty or letter-free input — is
    reported as 'en', meaning "no translation needed".
    """
    if not text:
        return "en"

    kana = hangul = ideographs = 0
    letter_total = 0
    for char in text:
        code_point = ord(char)
        # U+3040–U+309F (hiragana) and U+30A0–U+30FF (katakana) are
        # contiguous, so a single range test covers both.
        if 0x3040 <= code_point <= 0x30FF:
            kana += 1
            letter_total += 1
        elif 0xAC00 <= code_point <= 0xD7AF:  # precomposed Hangul syllables
            hangul += 1
            letter_total += 1
        elif 0x4E00 <= code_point <= 0x9FFF or 0x3400 <= code_point <= 0x4DBF:
            # Unified CJK ideographs + Extension A.
            ideographs += 1
            letter_total += 1
        elif unicodedata.category(char).startswith("L"):
            # Any other letter (Latin etc.) only widens the denominator.
            letter_total += 1

    if letter_total == 0:
        return "en"

    threshold = max(1, letter_total // 20)  # ~5% of all letters
    if kana >= threshold:
        return "ja"
    if hangul >= threshold:
        return "ko"
    if ideographs >= threshold:
        return "zh"
    return "en"


def language_name(code: str) -> str | None:
    """Display name for a supported language code, or None if unsupported."""
    return _LANGUAGE_NAMES[code] if code in _LANGUAGE_NAMES else None


# The prompt is intentionally directive about preserving citations and
# not adding commentary. Citations are bracketed numerals like [1] /
# [[1]](url); URLs and code blocks should also pass through unchanged.
_TRANSLATE_PROMPT = """\
You are a precise translator.

Translate the text below into {target_language}.

CRITICAL RULES — follow exactly:
- Preserve every citation marker exactly as-is. Citation markers look
like [1], [2], [[1]](https://example.com), etc. Do not translate
them, do not change the brackets, do not change the numbers.
- Preserve every URL exactly.
- Preserve every code block (text between triple backticks) exactly.
- Preserve every inline code span (text between single backticks).
- Do not add any commentary, preface, or trailing notes — output only
the translated text.
- Keep numbers, proper nouns, and product names in their original
form unless the target language has a well-established equivalent.

TEXT TO TRANSLATE:
{text}
"""


def translate_answer_to_language(
    answer_text: str,
    target_language_code: str,
    llm: LLM,
) -> str:
    """Render `answer_text` in the language named by
    `target_language_code` (a key of _LANGUAGE_NAMES) via one extra
    LLM round-trip.

    Degrades gracefully: the untouched English text is returned when
    the code is unsupported, the answer is blank, the LLM call raises,
    or the model comes back empty — an English answer beats no answer.
    """
    target_name = _LANGUAGE_NAMES.get(target_language_code)
    # Guard clauses: unsupported language (caller should have skipped,
    # but stay defensive) or nothing worth translating.
    if target_name is None or not answer_text.strip():
        return answer_text

    request_content = _TRANSLATE_PROMPT.format(
        target_language=target_name, text=answer_text
    )
    request_messages = [{"role": "user", "content": request_content}]

    try:
        langchain_messages = dict_based_prompt_to_langchain_prompt(
            request_messages
        )
        raw_translation = message_to_string(llm.invoke(langchain_messages))
    except Exception:
        logger.exception(
            "Failed to translate answer to %s; falling back to English",
            target_name,
        )
        return answer_text

    cleaned = raw_translation.strip()
    if cleaned:
        return cleaned
    logger.warning(
        "Translation to %s came back empty; falling back to English",
        target_name,
    )
    return answer_text
76 changes: 71 additions & 5 deletions backend/danswer/chat/process_message.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
from danswer.chat.models import LLMRelevanceFilterResponse
from danswer.chat.models import QADocsResponse
from danswer.chat.models import StreamingError
from danswer.chat.multilingual_translation import detect_query_language
from danswer.chat.multilingual_translation import language_name
from danswer.chat.multilingual_translation import translate_answer_to_language
from danswer.configs.chat_configs import CHAT_TARGET_CHUNK_PERCENTAGE
from danswer.configs.chat_configs import DISABLE_LLM_CHOOSE_SEARCH
from danswer.configs.chat_configs import MAX_CHUNKS_FED_TO_CHAT
Expand Down Expand Up @@ -92,11 +95,11 @@ def translate_citations(
for db_doc in db_docs:
if db_doc.document_id not in doc_id_to_saved_doc_id_map:
doc_id_to_saved_doc_id_map[db_doc.document_id] = db_doc.id
#print(f'found doc id: {db_doc.id}')
# print(f'found doc id: {db_doc.id}')

citation_to_saved_doc_id_map: dict[int, int] = {}
for citation in citations_list:
#print(f'citation id {citation.document_id} for doc num {citation.citation_num}')
# print(f'citation id {citation.document_id} for doc num {citation.citation_num}')
if citation.citation_num not in citation_to_saved_doc_id_map:
citation_to_saved_doc_id_map[
citation.citation_num
Expand Down Expand Up @@ -404,15 +407,25 @@ def stream_chat_message_objects(
if not final_msg.prompt:
raise RuntimeError("No Prompt found")

# Persona may be None for legacy flows; treat the flag as off in
# that case. When persona exists, thread its flag through so the
# answer-side prompt builders add the LANGUAGE_HINT.
persona_multilingual = (
persona.multilingual_query_expansion if persona is not None else False
)
prompt_config = (
PromptConfig.from_model(
final_msg.prompt,
prompt_override=(
new_msg_req.prompt_override or chat_session.prompt_override
),
multilingual_query_expansion=persona_multilingual,
)
if not persona
else PromptConfig.from_model(persona.prompts[0])
else PromptConfig.from_model(
persona.prompts[0],
multilingual_query_expansion=persona_multilingual,
)
)

# find out what tools to use
Expand Down Expand Up @@ -539,6 +552,22 @@ def stream_chat_message_objects(
ai_message_files = None # any files to associate with the AI message e.g. dall-e generated images
dropped_indices = None
tool_result = None

# Multi-language post-processing pass (option C in the design):
# when the persona has multilingual_query_expansion=True and the
# user's question is in a non-English language, the LLM tends
# to answer in English regardless of the LANGUAGE_HINT
# directive. We compensate by buffering DanswerAnswerPiece
# tokens during the stream and emitting a single translated
# piece at the end. Other packet types (citations, tool
# responses, image generation, etc.) still flow in real time.
translate_target = None
if persona_multilingual:
detected = detect_query_language(message_text)
if language_name(detected) is not None:
translate_target = detected
buffered_answer_pieces: list[str] = []

for packet in answer.processed_streamed_output:
if isinstance(packet, ToolResponse):
if packet.id == SEARCH_RESPONSE_SUMMARY_ID:
Expand Down Expand Up @@ -594,8 +623,35 @@ def stream_chat_message_objects(
else:
if isinstance(packet, ToolCallFinalResult):
tool_result = packet
if (
translate_target is not None
and isinstance(packet, DanswerAnswerPiece)
and packet.answer_piece
):
# Hold answer tokens back; we'll translate the full
# answer at the end of the stream.
buffered_answer_pieces.append(packet.answer_piece)
continue
yield cast(ChatPacket, packet)

# End of stream. If we buffered for translation, do the second
# LLM pass now and emit the translated answer as one piece.
# `answer.llm_answer` reads from the same processed stream, so
# it already contains the full English text — we use that as
# the source of truth (more reliable than reassembling from
# buffered pieces, which may have None entries from end-of-
# stream sentinels).
translated_answer_text: str | None = None
if translate_target is not None:
english_answer = answer.llm_answer
translated_answer_text = translate_answer_to_language(
answer_text=english_answer,
target_language_code=translate_target,
llm=llm,
)
yield DanswerAnswerPiece(answer_piece=translated_answer_text)
yield DanswerAnswerPiece(answer_piece=None)

except Exception as e:
logger.exception("Failed to process chat message")

Expand Down Expand Up @@ -627,14 +683,24 @@ def stream_chat_message_objects(
for tool in tool_list:
tool_name_to_tool_id[tool.name()] = tool_id

# If we translated, persist the user-facing translated text
# rather than the English intermediate. Citations are computed
# from the LLM's English output (where the [1]/[2] markers
# were emitted relative to retrieved docs); the translation
# prompt preserves those markers verbatim.
final_answer_text = (
translated_answer_text
if translated_answer_text is not None
else answer.llm_answer
)
gen_ai_response_message = partial_response(
message=answer.llm_answer,
message=final_answer_text,
rephrased_query=(
qa_docs_response.rephrased_query if qa_docs_response else None
),
reference_docs=reference_db_search_docs,
files=ai_message_files,
token_count=len(llm_tokenizer_encode_func(answer.llm_answer)),
token_count=len(llm_tokenizer_encode_func(final_answer_text)),
citations=db_citations,
error=None,
tool_calls=[
Expand Down
9 changes: 9 additions & 0 deletions backend/danswer/db/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1003,6 +1003,15 @@ class Persona(Base):
# Enables using LLM to extract time and source type filters
# Can also be admin disabled globally
llm_filter_extraction: Mapped[bool] = mapped_column(Boolean)
# When true, non-English queries on this persona are translated to
# English before retrieval and the LLM is instructed to answer in
# the user's original language. Off by default since most traffic
# is English and turning it on incurs an extra LLM call per query.
# Behaves as an override of the global MULTILINGUAL_QUERY_EXPANSION
# env var: persona flag wins; if false, falls back to env var.
multilingual_query_expansion: Mapped[bool] = mapped_column(
Boolean, nullable=False, default=False, server_default="false"
)
recency_bias: Mapped[RecencyBiasSetting] = mapped_column(
Enum(RecencyBiasSetting, native_enum=False)
)
Expand Down
4 changes: 4 additions & 0 deletions backend/danswer/db/persona.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ def create_update_persona(
llm_model_version_override=create_persona_request.llm_model_version_override,
starter_messages=create_persona_request.starter_messages,
is_public=create_persona_request.is_public,
multilingual_query_expansion=create_persona_request.multilingual_query_expansion,
db_session=db_session,
)

Expand Down Expand Up @@ -327,6 +328,7 @@ def upsert_persona(
tool_ids: list[int] | None = None,
persona_id: int | None = None,
default_persona: bool = False,
multilingual_query_expansion: bool = False,
commit: bool = True,
) -> Persona:
if persona_id is not None:
Expand Down Expand Up @@ -379,6 +381,7 @@ def upsert_persona(
persona.starter_messages = starter_messages
persona.deleted = False # Un-delete if previously deleted
persona.is_public = is_public
persona.multilingual_query_expansion = multilingual_query_expansion

# Do not delete any associations manually added unless
# a new updated list is provided
Expand Down Expand Up @@ -411,6 +414,7 @@ def upsert_persona(
llm_model_version_override=llm_model_version_override,
starter_messages=starter_messages,
tools=tools or [],
multilingual_query_expansion=multilingual_query_expansion,
)
db_session.add(persona)

Expand Down
10 changes: 9 additions & 1 deletion backend/danswer/llm/answering/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,10 +121,17 @@ class PromptConfig(BaseModel):
task_prompt: str
datetime_aware: bool
include_citations: bool
# When true, the answer-side prompts add the LANGUAGE_HINT directive
# so the LLM responds in the user's original language. Sourced from
# the persona's multilingual_query_expansion flag at construction.
multilingual_query_expansion: bool = False

@classmethod
def from_model(
cls, model: "Prompt", prompt_override: PromptOverride | None = None
cls,
model: "Prompt",
prompt_override: PromptOverride | None = None,
multilingual_query_expansion: bool = False,
) -> "PromptConfig":
override_system_prompt = (
prompt_override.system_prompt if prompt_override else None
Expand All @@ -136,6 +143,7 @@ def from_model(
task_prompt=override_task_prompt or model.task_prompt,
datetime_aware=model.datetime_aware,
include_citations=model.include_citations,
multilingual_query_expansion=multilingual_query_expansion,
)

# needed so that this can be passed into lru_cache funcs
Expand Down
Loading
Loading