|
2 | 2 |
|
3 | 3 | from __future__ import annotations |
4 | 4 |
|
| 5 | +import math |
5 | 6 | from time import time |
6 | 7 |
|
7 | 8 | from synaptic.models import ActivatedNode, Node, NodeKind, SearchResult |
|
35 | 36 | _KIND_BOOST = 0.05 # amount added to search_score when the kind matches (conservative) |
36 | 37 |
|
37 | 38 |
|
| 39 | +def _cosine_sim(a: list[float], b: list[float]) -> float: |
| 40 | + """두 벡터의 코사인 유사도.""" |
| 41 | + dot = sum(x * y for x, y in zip(a, b)) |
| 42 | + na = math.sqrt(sum(x * x for x in a)) |
| 43 | + nb = math.sqrt(sum(x * x for x in b)) |
| 44 | + if na == 0 or nb == 0: |
| 45 | + return 0.0 |
| 46 | + return dot / (na * nb) |
| 47 | + |
| 48 | + |
38 | 49 | class HybridSearch: |
39 | 50 | """3-stage fallback search: FTS+vector → synonym expansion → query rewrite.""" |
40 | 51 |
|
@@ -66,24 +77,43 @@ async def search( |
66 | 77 | stages_used: list[str] = [] |
67 | 78 | all_nodes: dict[str, tuple[Node, float]] = {} |
68 | 79 |
|
69 | | - # Stage 1: FTS + vector |
| 80 | + # Stage 1: FTS + vector hybrid scoring |
| 81 | + fts_scores: dict[str, float] = {} |
70 | 82 | fts_nodes = await backend.search_fts(query, limit=limit * 2) |
71 | 83 | stages_used.append("fts") |
72 | 84 | for rank, node in enumerate(fts_nodes): |
73 | | - # FTS 순위 기반 점수: 1위=0.95, 2위=0.90, ... |
74 | | - score = max(0.5, 0.95 - rank * 0.05) |
75 | | - if node.id not in all_nodes: |
76 | | - all_nodes[node.id] = (node, score) |
| 85 | + # FTS 순위 기반 점수: 1위=0.95, 감소율 0.05 |
| 86 | + score = max(0.3, 0.95 - rank * 0.05) |
| 87 | + fts_scores[node.id] = score |
| 88 | + all_nodes[node.id] = (node, score) |
77 | 89 |
|
| 90 | + vec_scores: dict[str, float] = {} |
78 | 91 | if embedding: |
79 | 92 | vec_nodes = await backend.search_vector(embedding, limit=limit * 2) |
80 | 93 | stages_used.append("vector") |
81 | | - for node in vec_nodes: |
82 | | - if node.id not in all_nodes: |
83 | | - all_nodes[node.id] = (node, 0.7) |
| 94 | + for rank, node in enumerate(vec_nodes): |
| 95 | + # Vector 순위 기반 점수 + 실제 cosine similarity 반영 |
| 96 | + rank_score = max(0.3, 0.95 - rank * 0.05) |
| 97 | + # cosine similarity 직접 계산 (가능한 경우) |
| 98 | + if node.embedding and embedding: |
| 99 | + sim = _cosine_sim(embedding, node.embedding) |
| 100 | + vec_score = sim * 0.7 + rank_score * 0.3 # sim 우선 |
| 101 | + else: |
| 102 | + vec_score = rank_score |
| 103 | + vec_scores[node.id] = vec_score |
| 104 | + |
| 105 | + # FTS + vector 하이브리드 점수 합산 |
| 106 | + alpha = 0.5 # FTS vs vector 가중치 (0.5 = 동등) |
| 107 | + for nid, node in {n.id: n for n in vec_nodes}.items(): |
| 108 | + fts_s = fts_scores.get(nid, 0.0) |
| 109 | + vec_s = vec_scores.get(nid, 0.0) |
| 110 | + if nid in all_nodes: |
| 111 | + # 양쪽 다 있으면 하이브리드 점수 |
| 112 | + hybrid = alpha * fts_s + (1 - alpha) * vec_s + 0.1 # 양쪽 매칭 보너스 |
| 113 | + all_nodes[nid] = (all_nodes[nid][0], min(1.0, hybrid)) |
84 | 114 | else: |
85 | | - existing = all_nodes[node.id] |
86 | | - all_nodes[node.id] = (existing[0], min(1.0, existing[1] + 0.2)) |
| 115 | + # vector only |
| 116 | + all_nodes[nid] = (node, vec_s * 0.9) # FTS 매칭 없으면 약간 감쇠 |
87 | 117 |
|
88 | 118 | # Stage 2: Synonym expansion (if insufficient results) |
89 | 119 | if len(all_nodes) < limit: |
|
0 commit comments