From f81b927922b91d00e206516f733ffde0f9bcf46b Mon Sep 17 00:00:00 2001
From: DShomin <lhm53779354@gmail.com>
Date: Thu, 27 Mar 2025 16:30:54 +0900
Subject: [PATCH 1/3] feat: Add detect_language_regex function to detect text
 language using regex

---
 llm_utils/graph.py | 77 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/llm_utils/graph.py b/llm_utils/graph.py
index 0aef51d..ce3c28c 100644
--- a/llm_utils/graph.py
+++ b/llm_utils/graph.py
@@ -1,5 +1,6 @@
 import os
 import json
+import re
 
 from typing_extensions import TypedDict, Annotated
 from langgraph.graph import END, StateGraph
@@ -16,6 +17,7 @@
 from llm_utils.tools import get_info_from_db
 
 # 노드 식별자 정의
+DETECT_LANGUAGE = "detect_language"
 QUERY_REFINER = "query_refiner"
 GET_TABLE_INFO = "get_table_info"
 TOOL = "tool"
@@ -33,6 +35,81 @@ class QueryMakerState(TypedDict):
     generated_query: str
 
 
+# 노드 함수: 언어 감지
+def detect_language_regex(state: QueryMakerState):
+    """
+    Detect the language of the latest message using regex character classes.
+
+    Args:
+        state (QueryMakerState): graph state; reads state["messages"][-1].content
+
+    Returns:
+        dict: keys "language" (code or None), "confidence" (float), "method" ("regex")
+    """
+    # Character classes used as a per-language signal
+    patterns = {
+        "ko": r"[\u3131-\u3163\uAC00-\uD7A3]",  # Korean (Hangul)
+        "ja": r"[\u3040-\u309F\u30A0-\u30FF]",  # Japanese (Hiragana, Katakana)
+        "zh": r"[\u4E00-\u9FFF]",  # Chinese (Han characters)
+        "ru": r"[\u0400-\u04FF]",  # Russian (Cyrillic)
+        "fr": r"[àâçéèêëîïôûùüÿ]",  # French accented characters
+        "es": r"[áéíóúñ¿¡]",  # Spanish accented characters and punctuation
+        "en": r"[a-zA-Z]",  # English (basic Latin alphabet)
+    }
+    text = state["messages"][-1].content
+
+    # Drop listed punctuation, then trim outer whitespace (inner spaces/digits remain)
+    cleaned_text = re.sub(r"[!@#$%^&*(),.?\"':{}|<>]", "", text)
+    cleaned_text = cleaned_text.strip()
+
+    if not cleaned_text:
+        return {"language": None, "confidence": 0.0, "method": "regex"}
+
+    # Count matching characters for each language
+    char_counts = {}
+    total_chars = len(cleaned_text)
+
+    for lang, pattern in patterns.items():
+        matches = re.findall(pattern, cleaned_text)
+        char_count = len(matches)
+
+        # Apply per-language weighting
+        if lang in ["fr", "es"]:
+            # Any French/Spanish marker character is scored as the whole text length
+            if char_count > 0:
+                char_count = total_chars
+        elif lang == "en":
+            # Many languages share the Latin alphabet, so English is down-weighted
+            char_count *= 0.8
+
+        if char_count > 0:
+            char_counts[lang] = char_count
+
+    if not char_counts:
+        return {"language": None, "confidence": 0.0, "method": "regex"}
+
+    # Pick the highest-scoring language; max() keeps the first key on ties
+    detected_lang = max(char_counts, key=char_counts.get)
+    confidence = char_counts[detected_lang] / total_chars
+
+    # Adjust confidence
+    if detected_lang in ["fr", "es"] and confidence > 0.1:
+        confidence = 0.95  # high confidence when marker characters are present
+    elif detected_lang == "en":
+        # Raise English confidence only when no other language matched
+        other_chars = sum(
+            char_counts.get(lang, 0) for lang in char_counts if lang != "en"
+        )
+        if other_chars == 0:
+            confidence = 0.95
+
+    return {
+        "language": detected_lang,
+        "confidence": round(confidence, 4),
+        "method": "regex",
+    }
+
+
 # 노드 함수: QUERY_REFINER 노드
 def query_refiner_node(state: QueryMakerState):
     res = query_refiner_chain.invoke(

From 89d819a440029407ae32e57f249ae26827d2c728 Mon Sep 17 00:00:00 2001
From: DShomin <lhm53779354@gmail.com>
Date: Thu, 27 Mar 2025 16:32:23 +0900
Subject: [PATCH 2/3] feat: Update Graph entry point to detect_language and add
 corresponding edge

---
 llm_utils/graph.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llm_utils/graph.py b/llm_utils/graph.py
index ce3c28c..7d4549c 100644
--- a/llm_utils/graph.py
+++ b/llm_utils/graph.py
@@ -211,9 +211,10 @@ def query_maker_node_with_db_guide(state: QueryMakerState):
 
 # StateGraph 생성 및 구성
 builder = StateGraph(QueryMakerState)
-builder.set_entry_point(QUERY_REFINER)
+builder.set_entry_point(DETECT_LANGUAGE)
 
 # 노드 추가
+builder.add_node(DETECT_LANGUAGE, detect_language_regex)
 builder.add_node(QUERY_REFINER, query_refiner_node)
 builder.add_node(GET_TABLE_INFO, get_table_info_node)
 # builder.add_node(QUERY_MAKER, query_maker_node)  #  query_maker_node_with_db_guide
@@ -222,6 +223,7 @@ def query_maker_node_with_db_guide(state: QueryMakerState):
 )  #  query_maker_node_with_db_guide
 
 # 기본 엣지 설정
+builder.add_edge(DETECT_LANGUAGE, QUERY_REFINER)
 builder.add_edge(QUERY_REFINER, GET_TABLE_INFO)
 builder.add_edge(GET_TABLE_INFO, QUERY_MAKER)
 

From 1884f141c1d148f896d318ec936e21a7aa7cef5c Mon Sep 17 00:00:00 2001
From: DShomin <lhm53779354@gmail.com>
Date: Thu, 27 Mar 2025 16:33:05 +0900
Subject: [PATCH 3/3] =?UTF-8?q?feat:=20QueryMakerState=EC=97=90=20?=
 =?UTF-8?q?=EC=96=B8=EC=96=B4,=20=EC=8B=A0=EB=A2=B0=EB=8F=84=20=EB=B0=8F?=
 =?UTF-8?q?=20=EB=B0=A9=EB=B2=95=20=ED=95=84=EB=93=9C=20=EC=B6=94=EA=B0=80?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 llm_utils/graph.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llm_utils/graph.py b/llm_utils/graph.py
index 7d4549c..31e3d96 100644
--- a/llm_utils/graph.py
+++ b/llm_utils/graph.py
@@ -33,6 +33,9 @@ class QueryMakerState(TypedDict):
     best_practice_query: str
     refined_input: str
     generated_query: str
+    language: str
+    confidence: float
+    method: str
 
 
 # 노드 함수: 언어 감지