feat: Match datasource via vector retrieval

XiaJunjie2020 · XiaJunjie2020 · commit 144c67a3e3f7 · 2025-09-18T17:39:23.000+08:00
diff --git a/backend/apps/chat/task/llm.py b/backend/apps/chat/task/llm.py
@@ -32,7 +32,6 @@
 from apps.data_training.curd.data_training import get_training_template
 from apps.datasource.crud.datasource import get_table_schema
 from apps.datasource.crud.permission import get_row_permission_filters, is_normal_user
-from apps.datasource.embedding.ds_embedding import get_ds_embedding
 from apps.datasource.models.datasource import CoreDatasource
 from apps.db.db import exec_sql, get_version, check_connection
 from apps.system.crud.assistant import AssistantOutDs, AssistantOutDsFactory, get_assistant_ds
@@ -426,64 +425,58 @@ def select_datasource(self):
         full_thinking_text = ''
         full_text = ''
 
-        ds = None
         if not ignore_auto_select:
-            if settings.EMBEDDING_ENABLED:
-                ds = get_ds_embedding(self.session, self.current_user, _ds_list, self.chat_question.question)
-                yield {'content': '{"id":' + ds.id + '}'}
-            else:
-                _ds_list_dict = []
-                for _ds in _ds_list:
-                    _ds_list_dict.append(_ds)
-                datasource_msg.append(
-                    HumanMessage(self.chat_question.datasource_user_question(orjson.dumps(_ds_list_dict).decode())))
-
-                self.current_logs[OperationEnum.CHOOSE_DATASOURCE] = start_log(session=self.session,
-                                                                               ai_modal_id=self.chat_question.ai_modal_id,
-                                                                               ai_modal_name=self.chat_question.ai_modal_name,
-                                                                               operate=OperationEnum.CHOOSE_DATASOURCE,
-                                                                               record_id=self.record.id,
-                                                                               full_message=[{'type': msg.type,
-                                                                                              'content': msg.content}
-                                                                                             for
-                                                                                             msg in datasource_msg])
-
-                token_usage = {}
-                res = self.llm.stream(datasource_msg)
-                for chunk in res:
-                    SQLBotLogUtil.info(chunk)
+            _ds_list_dict = []
+            for _ds in _ds_list:
+                _ds_list_dict.append(_ds)
+            datasource_msg.append(
+                HumanMessage(self.chat_question.datasource_user_question(orjson.dumps(_ds_list_dict).decode())))
+
+            self.current_logs[OperationEnum.CHOOSE_DATASOURCE] = start_log(session=self.session,
+                                                                           ai_modal_id=self.chat_question.ai_modal_id,
+                                                                           ai_modal_name=self.chat_question.ai_modal_name,
+                                                                           operate=OperationEnum.CHOOSE_DATASOURCE,
+                                                                           record_id=self.record.id,
+                                                                           full_message=[{'type': msg.type,
+                                                                                          'content': msg.content}
+                                                                                         for
+                                                                                         msg in datasource_msg])
+
+            token_usage = {}
+            res = self.llm.stream(datasource_msg)
+            for chunk in res:
+                SQLBotLogUtil.info(chunk)
+                reasoning_content_chunk = ''
+                if 'reasoning_content' in chunk.additional_kwargs:
+                    reasoning_content_chunk = chunk.additional_kwargs.get('reasoning_content', '')
+                # else:
+                #     reasoning_content_chunk = chunk.get('reasoning_content')
+                if reasoning_content_chunk is None:
                     reasoning_content_chunk = ''
-                    if 'reasoning_content' in chunk.additional_kwargs:
-                        reasoning_content_chunk = chunk.additional_kwargs.get('reasoning_content', '')
-                    # else:
-                    #     reasoning_content_chunk = chunk.get('reasoning_content')
-                    if reasoning_content_chunk is None:
-                        reasoning_content_chunk = ''
-                    full_thinking_text += reasoning_content_chunk
-
-                    full_text += chunk.content
-                    yield {'content': chunk.content, 'reasoning_content': reasoning_content_chunk}
-                    get_token_usage(chunk, token_usage)
-                datasource_msg.append(AIMessage(full_text))
-
-                self.current_logs[OperationEnum.CHOOSE_DATASOURCE] = end_log(session=self.session,
-                                                                             log=self.current_logs[
-                                                                                 OperationEnum.CHOOSE_DATASOURCE],
-                                                                             full_message=[
-                                                                                 {'type': msg.type,
-                                                                                  'content': msg.content}
-                                                                                 for msg in datasource_msg],
-                                                                             reasoning_content=full_thinking_text,
-                                                                             token_usage=token_usage)
-
-                json_str = extract_nested_json(full_text)
-                ds = orjson.loads(json_str)
+                full_thinking_text += reasoning_content_chunk
+
+                full_text += chunk.content
+                yield {'content': chunk.content, 'reasoning_content': reasoning_content_chunk}
+                get_token_usage(chunk, token_usage)
+            datasource_msg.append(AIMessage(full_text))
+
+            self.current_logs[OperationEnum.CHOOSE_DATASOURCE] = end_log(session=self.session,
+                                                                         log=self.current_logs[
+                                                                             OperationEnum.CHOOSE_DATASOURCE],
+                                                                         full_message=[
+                                                                             {'type': msg.type,
+                                                                              'content': msg.content}
+                                                                             for msg in datasource_msg],
+                                                                         reasoning_content=full_thinking_text,
+                                                                         token_usage=token_usage)
+
+            json_str = extract_nested_json(full_text)
 
         _error: Exception | None = None
         _datasource: int | None = None
         _engine_type: str | None = None
         try:
-            data: dict = _ds_list[0] if ignore_auto_select else ds
+            data: dict = _ds_list[0] if ignore_auto_select else orjson.loads(json_str)
 
             if data.get('id') and data.get('id') != 0:
                 _datasource = data['id']
@@ -522,7 +515,7 @@ def select_datasource(self):
         except Exception as e:
             _error = e
 
-        if not ignore_auto_select and not settings.EMBEDDING_ENABLED:
+        if not ignore_auto_select:
             self.record = save_select_datasource_answer(session=self.session, record_id=self.record.id,
                                                         answer=orjson.dumps({'content': full_text}).decode(),
                                                         datasource=_datasource,