Skip to content

Commit c03ff51

Browse files
author
Your Name
committed
Use ngram to rerank partial_ratio results for more human-like matching
1 parent c1c8e3e commit c03ff51

4 files changed

Lines changed: 23 additions & 3 deletions

File tree

cecli/helpers/file_system/ngram_index.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
internally stored list when no trie reference is available.
77
"""
88

9+
from ngram import NGram
910
from rapidfuzz import fuzz, process
1011

1112
# Threshold presets for different use cases (0-1 scale, converted to 0-100 internally)
@@ -66,8 +67,11 @@ def search(
6667
"""
6768
Fuzzy search for items matching a query string.
6869
69-
Uses rapidfuzz.process.extract() with WRatio scorer for
70+
Uses rapidfuzz.process.extract() with partial_ratio scorer for
7071
fast C++-backed fuzzy matching against the trie's path list.
72+
When fewer than 100 results are returned, re-ranks them using
73+
the ngram library's trigram similarity to better match human
74+
expectations for ordering.
7175
7276
Args:
7377
query: Search string
@@ -85,11 +89,20 @@ def search(
8589
results = process.extract(
8690
query,
8791
items,
88-
scorer=fuzz.WRatio,
92+
scorer=fuzz.partial_ratio,
8993
limit=max_results,
9094
score_cutoff=score_cutoff,
9195
)
92-
return [match for match, score, _ in results]
96+
match_names = [match for match, score, _ in results]
97+
98+
# Re-rank with ngram trigram similarity when result set is small
99+
# enough to benefit from the more human-like ordering.
100+
if len(match_names) < 100:
101+
ng = NGram(match_names, N=3)
102+
reranked = ng.search(query, threshold=0.0)
103+
match_names = [item for item, score in reranked]
104+
105+
return match_names
93106

94107
def search_with_scores(
95108
self,

requirements.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,10 @@ multidict==6.7.0
222222
# -c requirements/common-constraints.txt
223223
# aiohttp
224224
# yarl
225+
ngram==4.0.3
226+
# via
227+
# -c requirements/common-constraints.txt
228+
# -r requirements/requirements.in
225229
numpy==2.3.5
226230
# via
227231
# -c requirements/common-constraints.txt

requirements/common-constraints.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,8 @@ networkx==3.6
239239
# via
240240
# llama-index-core
241241
# torch
242+
ngram==4.0.3
243+
# via -r requirements/requirements.in
242244
nltk==3.9.2
243245
# via llama-index-core
244246
nodeenv==1.9.1

requirements/requirements.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ cryptography>=42.0.0
3737
# File system lookup aids
3838
marisa-trie>=1.0
3939
rapidfuzz>=3.0
40+
ngram>=4.0.3
4041

4142
# Replaced networkx with rustworkx for better performance in repomap
4243
rustworkx>=0.15.0

0 commit comments

Comments
 (0)