Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions src/c2/search/reranker/browser/configure.zcml
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,23 @@
permission="cmf.ManagePortal"
/>

<!-- Override classic @@search and @@ajax-search with reranking.
Use plone.app.layout INavigationRoot which works on both
Plone 5.2 and Plone 6 (deprecated but functional on 6). -->
<browser:page
name="search"
for="plone.app.layout.navigation.interfaces.INavigationRoot"
class=".search_override.RerankedSearch"
permission="zope2.View"
layer="c2.search.reranker.interfaces.IBrowserLayer"
/>

<browser:page
name="ajax-search"
for="plone.app.layout.navigation.interfaces.INavigationRoot"
class=".search_override.RerankedAjaxSearch"
permission="zope2.View"
layer="c2.search.reranker.interfaces.IBrowserLayer"
/>

</configure>
130 changes: 14 additions & 116 deletions src/c2/search/reranker/browser/hybrid_search.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,19 @@
"""Browser view for hybrid search combining keyword and vector search."""

from c2.search.reranker import logger
from c2.search.reranker.interfaces import IRerankerSettings
from c2.search.reranker.reranker import (
RerankerSettings,
calculate_time_decay,
get_content_age_days,
)
from c2.search.reranker.search import (
compute_rrf_scores,
find_vector_index,
get_brain_by_rid,
is_vectorsearch_available,
keyword_search,
vector_search,
)
from DateTime import DateTime
from plone.registry.interfaces import IRegistry
from Products.CMFCore.utils import getToolByName
Expand All @@ -23,9 +30,6 @@ class HybridSearchView(BrowserView):
Access via: @@hybrid-search?SearchableText=keyword
"""

# RRF constant: prevents top-ranked items from dominating too much.
RRF_K = 60

def __call__(self):
self.search_text = self.request.form.get("SearchableText", "")
self.results = []
Expand All @@ -50,90 +54,13 @@ def __call__(self):

return self.index()

def _is_vectorsearch_available(self):
"""Check if collective.vectorsearch is importable."""
try:
from collective.vectorsearch.vector_index import VectorIndex # noqa: F401

return True
except ImportError:
return False

def _find_vector_index(self, catalog):
"""Find the first VectorIndex in catalog.

Returns the index object or None.
"""
for idx_name in catalog.Indexes:
idx = catalog.Indexes[idx_name]
if getattr(idx, "meta_type", "") == "VectorIndex":
return idx
return None

def _keyword_search(self, catalog):
"""Execute keyword search via catalog.

Returns dict: {rid: (brain, normalized_score)}
"""
query = {"SearchableText": self.search_text, "sort_limit": 200}
brains = list(catalog.searchResults(**query))
result = {}
max_score = 0.0
for brain in brains:
rid = brain.getRID()
score = getattr(brain, "data_record_normalized_score_", None)
score = 1.0 if score is None or score == 0 else float(score)
if score > max_score:
max_score = score
result[rid] = (brain, score)

# Normalize to 0.0-1.0 range
if max_score > 0:
result = {
rid: (brain, score / max_score)
for rid, (brain, score) in result.items()
}
return result

def _vector_search(self, vector_index):
"""Execute vector search via VectorIndex.

Returns dict: {rid: vector_score_normalized}
"""

class QueryRecord:
def __init__(self, text):
self.keys = [text]

record = QueryRecord(self.search_text)
bucket = vector_index.query_index(record)

if bucket is None:
return {}

result = {}
for rid, int_score in bucket.items():
result[rid] = float(int_score) / 100_000_000.0
return result

def _get_brain_by_rid(self, catalog, rid):
"""Get a catalog brain for a given RID."""
try:
path = catalog.getpath(rid)
results = catalog.searchResults(path={"query": path, "depth": 0})
if results:
return results[0]
except Exception:
logger.debug("Could not resolve brain for RID %s", rid)
return None

def _prepare_vector_search(self, catalog):
"""Prepare and execute vector search if possible.

Returns (vector_results, keyword_ratio) tuple.
Sets self.vector_message and self.vector_index_name as side effects.
"""
if not self._is_vectorsearch_available():
if not is_vectorsearch_available():
self.vector_message = (
"collective.vectorsearch is not installed. "
"Showing keyword-only results."
Expand All @@ -147,7 +74,7 @@ def _prepare_vector_search(self, catalog):
)
return {}, 100

vector_index = self._find_vector_index(catalog)
vector_index = find_vector_index(catalog)
if vector_index is None:
self.vector_message = (
"No VectorIndex found in catalog. Please add a VectorIndex."
Expand All @@ -156,48 +83,19 @@ def _prepare_vector_search(self, catalog):

self.vector_index_name = vector_index.id
try:
return self._vector_search(vector_index), self.keyword_ratio
return vector_search(vector_index, self.search_text), self.keyword_ratio
except Exception as e:
self.vector_message = f"Vector search error: {e}"
return {}, 100

def _compute_rrf_scores(self, keyword_results, vector_results):
"""Compute RRF (Reciprocal Rank Fusion) scores from rank positions.

Returns dict: {rid: (keyword_rrf, vector_rrf)}
"""
k = self.RRF_K

# Sort keyword results by score descending → assign ranks
kw_ranked = sorted(
keyword_results.keys(),
key=lambda rid: keyword_results[rid][1],
reverse=True,
)
kw_rrf = {}
for rank, rid in enumerate(kw_ranked, start=1):
kw_rrf[rid] = 1.0 / (k + rank)

# Sort vector results by score descending → assign ranks
vec_ranked = sorted(
vector_results.keys(),
key=lambda rid: vector_results[rid],
reverse=True,
)
vec_rrf = {}
for rank, rid in enumerate(vec_ranked, start=1):
vec_rrf[rid] = 1.0 / (k + rank)

return kw_rrf, vec_rrf

def _search_hybrid(self):
"""Execute hybrid search combining keyword and vector results."""
catalog = getToolByName(self.context, "portal_catalog")
now = DateTime()
reranker_settings = RerankerSettings()

# Step 1: Keyword search (always)
keyword_results = self._keyword_search(catalog)
keyword_results = keyword_search(catalog, self.search_text)

# Step 2: Vector search (if available and enabled)
vector_results, keyword_ratio = self._prepare_vector_search(catalog)
Expand All @@ -213,7 +111,7 @@ def _search_hybrid(self):
kw_rrf = {}
vec_rrf = {}
if use_rrf and vector_results:
kw_rrf, vec_rrf = self._compute_rrf_scores(keyword_results, vector_results)
kw_rrf, vec_rrf = compute_rrf_scores(keyword_results, vector_results)

all_rids = set(keyword_results.keys()) | set(vector_results.keys())
results = []
Expand All @@ -226,7 +124,7 @@ def _search_hybrid(self):
brain, ks = kw_data
else:
ks = 0.0
brain = self._get_brain_by_rid(catalog, rid)
brain = get_brain_by_rid(catalog, rid)
if brain is None:
continue

Expand Down
Loading
Loading