From f81b927922b91d00e206516f733ffde0f9bcf46b Mon Sep 17 00:00:00 2001
From: DShomin
Date: Thu, 27 Mar 2025 16:30:54 +0900
Subject: [PATCH 1/3] feat: Add detect_language_regex function to detect text language using regex

---
 llm_utils/graph.py | 85 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)

diff --git a/llm_utils/graph.py b/llm_utils/graph.py
index 0aef51d..ce3c28c 100644
--- a/llm_utils/graph.py
+++ b/llm_utils/graph.py
@@ -1,5 +1,6 @@
 import os
 import json
+import re
 
 from typing_extensions import TypedDict, Annotated
 from langgraph.graph import END, StateGraph
@@ -16,6 +17,7 @@ from llm_utils.tools import get_info_from_db
 
 
 # 노드 식별자 정의
+DETECT_LANGUAGE = "detect_language"
 QUERY_REFINER = "query_refiner"
 GET_TABLE_INFO = "get_table_info"
 TOOL = "tool"
@@ -33,6 +35,89 @@ class QueryMakerState(TypedDict):
     generated_query: str
 
 
+# Node function: language detection
+def detect_language_regex(state: QueryMakerState):
+    """
+    Detect the language of the latest message using regular expressions.
+
+    Args:
+        state (QueryMakerState): graph state; the text to inspect is read
+            from state["messages"][-1].content
+
+    Returns:
+        dict: "language" (language code, or None when nothing matched),
+            "confidence" (0.0 to 1.0) and "method" (always "regex")
+    """
+    # Characters that are characteristic of each language
+    patterns = {
+        "ko": r"[\u3131-\u3163\uAC00-\uD7A3]",  # Hangul
+        "ja": r"[\u3040-\u309F\u30A0-\u30FF]",  # Hiragana, Katakana
+        "zh": r"[\u4E00-\u9FFF]",  # Han characters
+        "ru": r"[\u0400-\u04FF]",  # Cyrillic
+        "fr": r"[àâçéèêëîïôûùüÿ]",  # accented letters used in French
+        "es": r"[áéíóúñ¿¡]",  # accented letters and marks used in Spanish
+        "en": r"[a-zA-Z]",  # basic Latin alphabet
+    }
+    undetected = {"language": None, "confidence": 0.0, "method": "regex"}
+
+    messages = state.get("messages") or []
+    text = messages[-1].content if messages else ""
+    if not isinstance(text, str):
+        # Non-text content (e.g. a list of content parts) cannot be matched
+        return undetected
+
+    # Keep letters only: punctuation, digits and ALL whitespace are dropped so
+    # that they do not dilute the confidence ratio
+    cleaned_text = re.sub(r"[\W\d_]+", "", text)
+    total_chars = len(cleaned_text)
+    if not total_chars:
+        return undetected
+
+    # Number of matching characters per language; matched against the original
+    # text because the cleanup above also removes the Spanish marks
+    raw_counts = {}
+    for lang, pattern in patterns.items():
+        found = len(re.findall(pattern, text))
+        if found:
+            raw_counts[lang] = found
+    if not raw_counts:
+        return undetected
+
+    # Kana is not used in Chinese, so Han characters next to kana are kanji
+    if "ja" in raw_counts and "zh" in raw_counts:
+        raw_counts["ja"] += raw_counts.pop("zh")
+
+    # Per-language weighting
+    scores = {}
+    for lang, found in raw_counts.items():
+        if lang in ["fr", "es"]:
+            # A language-specific letter is strong evidence for that language
+            scores[lang] = total_chars
+        elif lang == "en":
+            # Many languages share the Latin alphabet, so weigh it down
+            scores[lang] = found * 0.8
+        else:
+            scores[lang] = found
+
+    # Highest score wins; ties (e.g. "é" is both French and Spanish) go to the
+    # language with more matching characters rather than to dict order
+    detected_lang = max(scores, key=lambda lang: (scores[lang], raw_counts[lang]))
+    confidence = min(scores[detected_lang] / total_chars, 1.0)
+
+    # Confidence adjustment
+    if detected_lang in ["fr", "es"]:
+        confidence = 0.95  # a language-specific letter was found
+    elif detected_lang == "en" and len(scores) == 1:
+        # Only raise the English confidence when no other language matched
+        confidence = 0.95
+
+    return {
+        "language": detected_lang,
+        "confidence": round(confidence, 4),
+        "method": "regex",
+    }
+
+
 # 노드 함수: QUERY_REFINER 노드
 def query_refiner_node(state: QueryMakerState):
     res = query_refiner_chain.invoke(

From 89d819a440029407ae32e57f249ae26827d2c728 Mon Sep 17 00:00:00 2001
From: DShomin
Date: Thu, 27 Mar 2025 16:32:23 +0900
Subject: [PATCH 2/3] feat: Update Graph entry point to detect_language and add corresponding edge

---
 llm_utils/graph.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llm_utils/graph.py b/llm_utils/graph.py
index ce3c28c..7d4549c 100644
--- a/llm_utils/graph.py
+++ b/llm_utils/graph.py
@@ -219,9 +219,10 @@ def query_maker_node_with_db_guide(state: QueryMakerState):
 
 # StateGraph 생성 및 구성
 builder = StateGraph(QueryMakerState)
-builder.set_entry_point(QUERY_REFINER)
+builder.set_entry_point(DETECT_LANGUAGE)
 
 # 노드 추가
+builder.add_node(DETECT_LANGUAGE, detect_language_regex)
 builder.add_node(QUERY_REFINER, query_refiner_node)
 builder.add_node(GET_TABLE_INFO, get_table_info_node)
 # builder.add_node(QUERY_MAKER, query_maker_node)  # query_maker_node_with_db_guide
@@ -230,6 +231,7 @@ def query_maker_node_with_db_guide(state: QueryMakerState):
 )  # query_maker_node_with_db_guide
 
 # 기본 엣지 설정
+builder.add_edge(DETECT_LANGUAGE, QUERY_REFINER)
 builder.add_edge(QUERY_REFINER, GET_TABLE_INFO)
 builder.add_edge(GET_TABLE_INFO, QUERY_MAKER)
 

From 1884f141c1d148f896d318ec936e21a7aa7cef5c Mon Sep 17 00:00:00 2001
From: DShomin
Date: Thu, 27 Mar 2025 16:33:05 +0900
Subject: [PATCH 3/3] =?UTF-8?q?feat:=20QueryMakerState=EC=97=90=20?= =?UTF-8?q?=EC=96=B8=EC=96=B4,=20=EC=8B=A0=EB=A2=B0=EB=8F=84=20=EB=B0=8F?= =?UTF-8?q?=20=EB=B0=A9=EB=B2=95=20=ED=95=84=EB=93=9C=20=EC=B6=94=EA=B0=80?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 llm_utils/graph.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llm_utils/graph.py b/llm_utils/graph.py
index 7d4549c..31e3d96 100644
--- a/llm_utils/graph.py
+++ b/llm_utils/graph.py
@@ -33,6 +33,9 @@ class QueryMakerState(TypedDict):
     best_practice_query: str
     refined_input: str
     generated_query: str
+    language: str
+    confidence: float
+    method: str
 
 
 # Node function: language detection