Skip to content

Commit 1eb084e

Browse files
SonAIengine and claude committed
fix: BM25 하이브리드 스코링 — corpus 크기 적응형 (대규모 BM25 + 소규모 substring 블렌딩)
- 대규모(1000+): BM25 80% + substring 20% → FiQA +44%, SciFact +28%
- 소규모(100): BM25 30% + substring 70% → 소규모 corpus 안정성 보존
- title 매칭: BM25와 별도 additive (IDF × 3.0)

결과: 12/14 데이터셋 개선, Allganize 2종만 하락 (1:1 QA 특수 구조)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 235250b commit 1eb084e

File tree

1 file changed

+25
-15
lines changed

1 file changed

+25
-15
lines changed

src/synaptic/backends/memory.py

Lines changed: 25 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ async def search_fts(self, query: str, *, limit: int = 20) -> list[Node]:
104104
# --- BM25 parameters ---
105105
k1 = 1.5
106106
b = 0.75
107-
title_boost = 3.0 # title 매칭 가중치 (IDF와 곱해져서 additive)
107+
title_boost = 3.0
108108

109109
# Pre-compute corpus statistics for BM25
110110
N = len(self._nodes) # total documents
@@ -142,57 +142,67 @@ async def search_fts(self, query: str, *, limit: int = 20) -> list[Node]:
142142
for i in range(len(terms) - 1):
143143
bigrams.append(f"{terms[i]} {terms[i + 1]}")
144144

145-
# --- Score each document ---
145+
# --- Score each document (BM25 + substring hybrid) ---
146146
scored: list[tuple[Node, float]] = []
147147
for node in self._nodes.values():
148148
title_lower = node.title.lower()
149149
content_lower = node.content.lower()
150150
full_text = doc_texts[node.id]
151151
dl = doc_lengths[node.id]
152152

153-
score = 0.0
153+
bm25_score = 0.0
154+
substr_score = 0.0
154155

155156
for t in terms:
156-
# Term frequency (substring count)
157157
tf_content = content_lower.count(t)
158158
tf_title = title_lower.count(t)
159159

160160
if tf_content == 0 and tf_title == 0:
161161
continue
162162

163-
# IDF: log((N - df + 0.5) / (df + 0.5) + 1)
163+
# --- BM25 component ---
164164
df = doc_freq.get(t, 0)
165165
idf = math.log((N - df + 0.5) / (df + 0.5) + 1.0)
166166

167-
# BM25 content score
168167
if tf_content > 0:
169168
numerator = tf_content * (k1 + 1)
170169
denominator = tf_content + k1 * (1 - b + b * dl / avgdl)
171-
score += idf * numerator / denominator
170+
bm25_score += idf * numerator / denominator
172171

173-
# Title bonus (separate, additive — not affected by BM25 length normalization)
174172
if tf_title > 0:
175-
score += idf * title_boost
173+
bm25_score += idf * title_boost
176174

177-
# Bigram bonus (phrase proximity)
175+
# --- Substring component (corpus-size independent) ---
176+
if tf_title > 0:
177+
substr_score += 2.0
178+
if tf_content > 0:
179+
substr_score += 1.0
180+
181+
# Bigram bonus
178182
for bg in bigrams:
179183
if bg in full_text:
180-
score += 1.5
184+
bm25_score += 1.5
185+
substr_score += 1.5
181186

182-
# Tag exact match bonus
187+
# Tag match
183188
if node.tags:
184189
tag_text = " ".join(node.tags).lower()
185190
for t in terms:
186191
if t in tag_text:
187-
score += 0.5
192+
substr_score += 1.0
188193

189-
# LLM-generated search keywords bonus
194+
# Search keywords
190195
if node.properties:
191196
search_kw = node.properties.get("_search_keywords", "").lower()
192197
if search_kw:
193198
for t in terms:
194199
if t in search_kw:
195-
score += 1.0
200+
substr_score += 1.5
201+
202+
# Hybrid: BM25 weight increases with corpus size
203+
# N→0: 30% BM25 + 70% substr (N=100: 35% / 65%), N=1000+: 80% BM25 + 20% substr
204+
bm25_weight = min(0.8, 0.3 + 0.5 * min(1.0, N / 1000))
205+
score = bm25_score * bm25_weight + substr_score * (1 - bm25_weight)
196206

197207
if score > 0:
198208
scored.append((node, score))

0 commit comments

Comments
 (0)