Skip to content

Commit 1293ac1

Browse files
committed
feat(store): replace PostgreSQL FTS with BM25 in HybridStore
Replace ts_rank_cd (PostgreSQL Full-Text Search) with the BM25 algorithm for better keyword search ranking in hybrid search.

Changes:
- Add bm25Language parameter (configurable via YAML)
- Replace FTS CTEs with bm25topk() function calls
- Add DISTINCT ON fixes to prevent duplicate results
- Add fuzzy matching with word_similarity (pg_trgm)
- Add score normalization (0-100 range)
- Add searchable attributes with field-specific boosting
- Bundle configuration in options.php and AiBundle.php

Tests:
- Update 6 existing tests for BM25 compatibility
- Add 3 new tests for fuzzy matching and searchable attributes
- All 19 tests passing (132 assertions)

Breaking changes:
- Requires plpgsql_bm25 extension instead of native FTS
- BM25 uses short language codes ('en', 'fr') vs FTS full names
1 parent 2c7b49a commit 1293ac1

File tree

4 files changed

+564
-82
lines changed

4 files changed

+564
-82
lines changed

src/ai-bundle/config/options.php

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -765,6 +765,10 @@
765765
->info('PostgreSQL text search configuration (e.g., "simple", "english", "french"). Default: "simple" (multilingual)')
766766
->defaultValue('simple')
767767
->end()
768+
->stringNode('bm25_language')
769+
->info('BM25 language code for stemming (e.g., "en", "fr", "es", "de", "it", "pt", "nl", "ru", "ar", "zh"). Default: "en"')
770+
->defaultValue('en')
771+
->end()
768772
->integerNode('rrf_k')
769773
->info('RRF (Reciprocal Rank Fusion) constant. Higher = more equal weighting. Default: 60 (Supabase)')
770774
->defaultValue(60)
@@ -774,6 +778,56 @@
774778
->info('Default maximum distance threshold for filtering results (optional)')
775779
->defaultNull()
776780
->end()
781+
->floatNode('default_min_score')
782+
->info('Default minimum RRF score threshold for filtering results (optional)')
783+
->defaultNull()
784+
->end()
785+
->booleanNode('normalize_scores')
786+
->info('Normalize scores to 0-100 range for better readability')
787+
->defaultTrue()
788+
->end()
789+
->floatNode('fuzzy_primary_threshold')
790+
->info('Primary threshold for fuzzy matching (pg_trgm word_similarity). Higher = stricter. Default: 0.25')
791+
->defaultValue(0.25)
792+
->min(0.0)
793+
->max(1.0)
794+
->end()
795+
->floatNode('fuzzy_secondary_threshold')
796+
->info('Secondary threshold for fuzzy matching with double validation. Catches more typos. Default: 0.2')
797+
->defaultValue(0.2)
798+
->min(0.0)
799+
->max(1.0)
800+
->end()
801+
->floatNode('fuzzy_strict_threshold')
802+
->info('Strict similarity threshold for double validation to eliminate false positives. Default: 0.15')
803+
->defaultValue(0.15)
804+
->min(0.0)
805+
->max(1.0)
806+
->end()
807+
->floatNode('fuzzy_weight')
808+
->info('Weight of fuzzy matching vs FTS in hybrid search. 0.0 = disabled, 0.5 = equal (recommended), 1.0 = fuzzy only')
809+
->defaultValue(0.5)
810+
->min(0.0)
811+
->max(1.0)
812+
->end()
813+
->arrayNode('searchable_attributes')
814+
->info('Searchable attributes with field-specific boosting (similar to Meilisearch). Each attribute creates a separate tsvector column.')
815+
->useAttributeAsKey('name')
816+
->arrayPrototype()
817+
->children()
818+
->floatNode('boost')
819+
->info('Boost multiplier for this field (e.g., 2.0 = twice as important). Default: 1.0')
820+
->defaultValue(1.0)
821+
->min(0.0)
822+
->end()
823+
->scalarNode('metadata_key')
824+
->info('JSON path to extract value from metadata (e.g., "title", "description")')
825+
->isRequired()
826+
->cannotBeEmpty()
827+
->end()
828+
->end()
829+
->end()
830+
->end()
777831
->stringNode('dbal_connection')->cannotBeEmpty()->end()
778832
->end()
779833
->validate()

src/ai-bundle/src/AiBundle.php

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1432,6 +1432,38 @@ private function processStoreConfig(string $type, array $stores, ContainerBuilde
14321432
$arguments[8] = $store['default_max_score'];
14331433
}
14341434

1435+
if (\array_key_exists('default_min_score', $store)) {
1436+
$arguments[9] = $store['default_min_score'];
1437+
}
1438+
1439+
if (\array_key_exists('normalize_scores', $store)) {
1440+
$arguments[10] = $store['normalize_scores'];
1441+
}
1442+
1443+
if (\array_key_exists('fuzzy_primary_threshold', $store)) {
1444+
$arguments[11] = $store['fuzzy_primary_threshold'];
1445+
}
1446+
1447+
if (\array_key_exists('fuzzy_secondary_threshold', $store)) {
1448+
$arguments[12] = $store['fuzzy_secondary_threshold'];
1449+
}
1450+
1451+
if (\array_key_exists('fuzzy_strict_threshold', $store)) {
1452+
$arguments[13] = $store['fuzzy_strict_threshold'];
1453+
}
1454+
1455+
if (\array_key_exists('fuzzy_weight', $store)) {
1456+
$arguments[14] = $store['fuzzy_weight'];
1457+
}
1458+
1459+
if (\array_key_exists('searchable_attributes', $store)) {
1460+
$arguments[15] = $store['searchable_attributes'];
1461+
}
1462+
1463+
if (\array_key_exists('bm25_language', $store)) {
1464+
$arguments[16] = $store['bm25_language'];
1465+
}
1466+
14351467
$definition
14361468
->addTag('ai.store')
14371469
->setArguments($arguments);

0 commit comments

Comments
 (0)