From 28eaf4e36558c8a1ac3803f630db590adeb46a54 Mon Sep 17 00:00:00 2001 From: ulleo Date: Thu, 25 Sep 2025 17:30:53 +0800 Subject: [PATCH 1/2] feat: terminology settings add datasource #127 --- backend/apps/chat/task/llm.py | 8 ++-- backend/apps/terminology/curd/terminology.py | 42 ++++++++++++++++---- 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/backend/apps/chat/task/llm.py b/backend/apps/chat/task/llm.py index 89fb52a13..f36bca316 100644 --- a/backend/apps/chat/task/llm.py +++ b/backend/apps/chat/task/llm.py @@ -241,8 +241,9 @@ def generate_analysis(self): self.chat_question.data = orjson.dumps(data.get('data')).decode() analysis_msg: List[Union[BaseMessage, dict[str, Any]]] = [] + ds_id = self.ds.id if isinstance(self.ds, CoreDatasource) else None self.chat_question.terminologies = get_terminology_template(self.session, self.chat_question.question, - self.current_user.oid) + self.current_user.oid, ds_id) analysis_msg.append(SystemMessage(content=self.chat_question.analysis_sys_question())) analysis_msg.append(HumanMessage(content=self.chat_question.analysis_user_question())) @@ -504,7 +505,8 @@ def select_datasource(self): oid = self.ds.oid if isinstance(self.ds, CoreDatasource) else 1 ds_id = self.ds.id if isinstance(self.ds, CoreDatasource) else None - self.chat_question.terminologies = get_terminology_template(self.session, self.chat_question.question, oid) + self.chat_question.terminologies = get_terminology_template(self.session, self.chat_question.question, oid, + ds_id) self.chat_question.data_training = get_training_template(self.session, self.chat_question.question, ds_id, oid) @@ -897,7 +899,7 @@ def run_task(self, in_chat: bool = True, stream: bool = True, oid = self.ds.oid if isinstance(self.ds, CoreDatasource) else 1 ds_id = self.ds.id if isinstance(self.ds, CoreDatasource) else None self.chat_question.terminologies = get_terminology_template(self.session, self.chat_question.question, - oid) + oid, ds_id) self.chat_question.data_training = get_training_template(self.session, self.chat_question.question, ds_id, oid) diff --git a/backend/apps/terminology/curd/terminology.py b/backend/apps/terminology/curd/terminology.py index 7b8d0a044..f10f34818 100644 --- a/backend/apps/terminology/curd/terminology.py +++ b/backend/apps/terminology/curd/terminology.py @@ -1,7 +1,7 @@ import datetime import logging import traceback -from typing import List, Optional +from typing import List, Optional, Any from xml.dom.minidom import parseString import dicttoxml @@ -367,17 +367,22 @@ def save_embeddings(session: Session, ids: List[int]): embedding_sql = f""" SELECT id, pid, word, similarity FROM -(SELECT id, pid, word, oid, +(SELECT id, pid, word, oid, specific_ds, datasource_ids, ( 1 - (embedding <=> :embedding_array) ) AS similarity FROM terminology AS child ) TEMP -WHERE similarity > {settings.EMBEDDING_TERMINOLOGY_SIMILARITY} and oid = :oid +WHERE similarity > {settings.EMBEDDING_TERMINOLOGY_SIMILARITY} AND oid = :oid +AND ( + (:datasource IS NULL AND (specific_ds = false OR specific_ds IS NULL)) + OR + (:datasource IS NOT NULL AND ((specific_ds = false OR specific_ds IS NULL) OR (specific_ds = true AND datasource_ids IS NOT NULL AND datasource_ids @> jsonb_build_array(:datasource)))) +) ORDER BY similarity DESC LIMIT {settings.EMBEDDING_TERMINOLOGY_TOP_COUNT} """ -def select_terminology_by_word(session: SessionDep, word: str, oid: int): +def select_terminology_by_word(session: SessionDep, word: str, oid: int, datasource: int = None): if word.strip() == "": return [] @@ -394,7 +399,26 @@ def select_terminology_by_word(session: SessionDep, word: str, oid: int): ) ) - results = session.execute(stmt, {'sentence': word}).fetchall() + if datasource is not None: + stmt = stmt.where( + or_( + or_(Terminology.specific_ds == False, Terminology.specific_ds.is_(None)), + and_( + Terminology.specific_ds == True, + Terminology.datasource_ids.isnot(None), + text("datasource_ids @> jsonb_build_array(:datasource)") + ) + ) + ) + else: + stmt = stmt.where(or_(Terminology.specific_ds == False, Terminology.specific_ds.is_(None))) + + # 执行查询 + params: dict[str, Any] = {'sentence': word} + if datasource is not None: + params['datasource'] = datasource + + results = session.execute(stmt, params).fetchall() for row in results: _list.append(Terminology(id=row.id, word=row.word, pid=row.pid)) @@ -405,7 +429,8 @@ def select_terminology_by_word(session: SessionDep, word: str, oid: int): embedding = model.embed_query(word) - results = session.execute(text(embedding_sql), {'embedding_array': str(embedding), 'oid': oid}) + results = session.execute(text(embedding_sql), {'embedding_array': str(embedding), 'oid': oid, + 'datasource': datasource}).fetchall() for row in results: _list.append(Terminology(id=row.id, word=row.word, pid=row.pid)) @@ -481,10 +506,11 @@ def to_xml_string(_dict: list[dict] | dict, root: str = 'terminologies') -> str: return pretty_xml -def get_terminology_template(session: SessionDep, question: str, oid: Optional[int] = 1) -> str: +def get_terminology_template(session: SessionDep, question: str, oid: Optional[int] = 1, + datasource: Optional[int] = None) -> str: if not oid: oid = 1 - _results = select_terminology_by_word(session, question, oid) + _results = select_terminology_by_word(session, question, oid, datasource) if _results and len(_results) > 0: terminology = to_xml_string(_results) template = get_base_terminology_template().format(terminologies=terminology) From 6499730b26ec68dbe070e1ca11e9c5d7a0eb433d Mon Sep 17 00:00:00 2001 From: ulleo Date: Thu, 25 Sep 2025 17:42:46 +0800 Subject: [PATCH 2/2] feat: modify analysis template --- backend/template.yaml | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/backend/template.yaml b/backend/template.yaml index 90914147c..a9b8d7e4b 100644 --- a/backend/template.yaml +++ b/backend/template.yaml @@ -10,8 +10,8 @@ template: 你是"SQLBOT",智能问数小助手,可以根据用户提问,专业生成SQL与可视化图表。 你当前的任务是根据给定的表结构和用户问题生成SQL语句、可能适合展示的图表类型以及该SQL中所用到的表名。 - 我们会在块内提供给你信息,帮助你生成SQL: - 内有等信息; + 我们会在块内提供给你信息,帮助你生成SQL: + 内有等信息; 其中,:提供数据库引擎及版本信息; :以 M-Schema 格式提供数据库表结构信息; :提供一组术语,块内每一个就是术语,其中同一个内的多个代表术语的多种叫法,也就是术语与它的同义词,即该术语对应的描述,其中也可能是能够用来参考的计算公式,或者是一些其他的查询条件 @@ -389,12 +389,25 @@ template: {old_questions} analysis: system: | - ### 请使用语言:{lang} 回答,若有深度思考过程,则思考过程也需要使用 {lang} 输出 + + 你是"SQLBOT",智能问数小助手,可以根据用户提问,专业生成SQL与可视化图表。 + 你当前的任务是根据给定的数据分析数据,并给出你的分析结果。 + 我们会在块内提供给你信息,帮助你进行分析: + 内有等信息; + :提供一组术语,块内每一个就是术语,其中同一个内的多个代表术语的多种叫法,也就是术语与它的同义词,即该术语对应的描述,其中也可能是能够用来参考的计算公式,或者是一些其他的查询条件 + - ### 说明: - 你是一个数据分析师,你的任务是根据给定的数据分析数据,并给出你的分析结果。 + 你必须遵守以下规则: + + + 请使用语言:{lang} 回答,若有深度思考过程,则思考过程也需要使用 {lang} 输出 + + + ### 下面是提供的信息 + {terminologies} + user: | ### 字段(字段别名): {fields}