fix: BM25 하이브리드 스코어링 — corpus 크기 적응형 (대규모 BM25 + 소규모 substring 블렌딩)

SonAIengine · claude · SonAIengine · commit 1eb084e970c2 · 2026-03-23T09:30:14.000+09:00
- 대규모(1000+): BM25 80% + substring 20% → FiQA +44%, SciFact +28%
- 소규모(100): BM25 35% + substring 65% (N→0 극한에서 30%/70%) → 소규모 corpus 안정성 보존
- title 매칭: BM25와 별도 additive (IDF × 3.0)

결과: 12/14 데이터셋 개선, Allganize 2종만 하락 (1:1 QA 특수 구조)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/src/synaptic/backends/memory.py b/src/synaptic/backends/memory.py
@@ -104,7 +104,7 @@ async def search_fts(self, query: str, *, limit: int = 20) -> list[Node]:
         # --- BM25 parameters ---
         k1 = 1.5
         b = 0.75
-        title_boost = 3.0  # title 매칭 가중치 (IDF와 곱해져서 additive)
+        title_boost = 3.0
 
         # Pre-compute corpus statistics for BM25
         N = len(self._nodes)  # total documents
@@ -142,57 +142,67 @@ async def search_fts(self, query: str, *, limit: int = 20) -> list[Node]:
             for i in range(len(terms) - 1):
                 bigrams.append(f"{terms[i]} {terms[i + 1]}")
 
-        # --- Score each document ---
+        # --- Score each document (BM25 + substring hybrid) ---
         scored: list[tuple[Node, float]] = []
         for node in self._nodes.values():
             title_lower = node.title.lower()
             content_lower = node.content.lower()
             full_text = doc_texts[node.id]
             dl = doc_lengths[node.id]
 
-            score = 0.0
+            bm25_score = 0.0
+            substr_score = 0.0
 
             for t in terms:
-                # Term frequency (substring count)
                 tf_content = content_lower.count(t)
                 tf_title = title_lower.count(t)
 
                 if tf_content == 0 and tf_title == 0:
                     continue
 
-                # IDF: log((N - df + 0.5) / (df + 0.5) + 1)
+                # --- BM25 component ---
                 df = doc_freq.get(t, 0)
                 idf = math.log((N - df + 0.5) / (df + 0.5) + 1.0)
 
-                # BM25 content score
                 if tf_content > 0:
                     numerator = tf_content * (k1 + 1)
                     denominator = tf_content + k1 * (1 - b + b * dl / avgdl)
-                    score += idf * numerator / denominator
+                    bm25_score += idf * numerator / denominator
 
-                # Title bonus (separate, additive — not affected by BM25 length normalization)
                 if tf_title > 0:
-                    score += idf * title_boost
+                    bm25_score += idf * title_boost
 
-            # Bigram bonus (phrase proximity)
+                # --- Substring component (corpus-size independent) ---
+                if tf_title > 0:
+                    substr_score += 2.0
+                if tf_content > 0:
+                    substr_score += 1.0
+
+            # Bigram bonus
             for bg in bigrams:
                 if bg in full_text:
-                    score += 1.5
+                    bm25_score += 1.5
+                    substr_score += 1.5
 
-            # Tag exact match bonus
+            # Tag match
             if node.tags:
                 tag_text = " ".join(node.tags).lower()
                 for t in terms:
                     if t in tag_text:
-                        score += 0.5
+                        substr_score += 1.0
 
-            # LLM-generated search keywords bonus
+            # Search keywords
             if node.properties:
                 search_kw = node.properties.get("_search_keywords", "").lower()
                 if search_kw:
                     for t in terms:
                         if t in search_kw:
-                            score += 1.0
+                            substr_score += 1.5
+
+            # Hybrid: BM25 weight increases with corpus size
+            # N=100: 30% BM25 + 70% substr, N=1000+: 80% BM25 + 20% substr
+            bm25_weight = min(0.8, 0.3 + 0.5 * min(1.0, N / 1000))
+            score = bm25_score * bm25_weight + substr_score * (1 - bm25_weight)
 
             if score > 0:
                 scored.append((node, score))