@@ -104,7 +104,7 @@ async def search_fts(self, query: str, *, limit: int = 20) -> list[Node]:
104104 # --- BM25 parameters ---
105105 k1 = 1.5
106106 b = 0.75
107- title_boost = 3.0 # title 매칭 가중치 (IDF와 곱해져서 additive)
107+ title_boost = 3.0
108108
109109 # Pre-compute corpus statistics for BM25
110110 N = len (self ._nodes ) # total documents
@@ -142,57 +142,67 @@ async def search_fts(self, query: str, *, limit: int = 20) -> list[Node]:
142142 for i in range (len (terms ) - 1 ):
143143 bigrams .append (f"{ terms [i ]} { terms [i + 1 ]} " )
144144
145- # --- Score each document ---
145+ # --- Score each document (BM25 + substring hybrid) ---
146146 scored : list [tuple [Node , float ]] = []
147147 for node in self ._nodes .values ():
148148 title_lower = node .title .lower ()
149149 content_lower = node .content .lower ()
150150 full_text = doc_texts [node .id ]
151151 dl = doc_lengths [node .id ]
152152
153- score = 0.0
153+ bm25_score = 0.0
154+ substr_score = 0.0
154155
155156 for t in terms :
156- # Term frequency (substring count)
157157 tf_content = content_lower .count (t )
158158 tf_title = title_lower .count (t )
159159
160160 if tf_content == 0 and tf_title == 0 :
161161 continue
162162
163- # IDF: log((N - df + 0.5) / (df + 0.5) + 1)
163+ # --- BM25 component ---
164164 df = doc_freq .get (t , 0 )
165165 idf = math .log ((N - df + 0.5 ) / (df + 0.5 ) + 1.0 )
166166
167- # BM25 content score
168167 if tf_content > 0 :
169168 numerator = tf_content * (k1 + 1 )
170169 denominator = tf_content + k1 * (1 - b + b * dl / avgdl )
171- score += idf * numerator / denominator
170+ bm25_score += idf * numerator / denominator
172171
173- # Title bonus (separate, additive — not affected by BM25 length normalization)
174172 if tf_title > 0 :
175- score += idf * title_boost
173+ bm25_score += idf * title_boost
176174
177- # Bigram bonus (phrase proximity)
175+ # --- Substring component (corpus-size independent) ---
176+ if tf_title > 0 :
177+ substr_score += 2.0
178+ if tf_content > 0 :
179+ substr_score += 1.0
180+
181+ # Bigram bonus
178182 for bg in bigrams :
179183 if bg in full_text :
180- score += 1.5
184+ bm25_score += 1.5
185+ substr_score += 1.5
181186
182- # Tag exact match bonus
187+ # Tag match
183188 if node .tags :
184189 tag_text = " " .join (node .tags ).lower ()
185190 for t in terms :
186191 if t in tag_text :
187- score += 0.5
192+ substr_score += 1.0
188193
189- # LLM-generated search keywords bonus
194+ # Search keywords
190195 if node .properties :
191196 search_kw = node .properties .get ("_search_keywords" , "" ).lower ()
192197 if search_kw :
193198 for t in terms :
194199 if t in search_kw :
195- score += 1.0
200+ substr_score += 1.5
201+
202+ # Hybrid: BM25 weight increases with corpus size
203+ # N=100: 35% BM25 + 65% substr, N=1000+: 80% BM25 + 20% substr
204+ bm25_weight = min (0.8 , 0.3 + 0.5 * min (1.0 , N / 1000 ))
205+ score = bm25_score * bm25_weight + substr_score * (1 - bm25_weight )
196206
197207 if score > 0 :
198208 scored .append ((node , score ))
0 commit comments