Skip to content

Commit f808f2c

Browse files
SonAIengine and claude committed
feat: Evidence Chain Assembly — 소형 LLM 증강, HotPotQA Correctness 0.856 (+9.2%)
## Evidence Chain (evidence.py) - BFS shortest path: seed 노드 간 bridge 노드 자동 발견 (max_depth=3) - 위상 정렬: Kahn's algorithm으로 논리적 순서 배치 - Context compression: query term overlap 기반 문장 선택 (zero-dep) - Fact extraction: 정규식으로 숫자/날짜/고유명사 문장 추출 - 최종 포맷팅: [SEED/BRIDGE] 역할 + 압축 content + key facts ## graph.build_evidence() - search → evidence chain assembly → compressed context 반환 - 기존 search() 하위 호환 유지 ## E2E 벤치마크 결과 (qwen3.5:4b, HotPotQA 24문항) - Correctness: 0.784 → 0.856 (+9.2%) - 정답: 10/24 → 13/24 (+3건) - Cognee(GPT-4o) 0.925 대비 92.5% 달성 (Gap 7.4%) - 4B 소형 모델에서도 multi-hop QA 가능해짐 ## 새로 해결된 문항 - 날짜 정보 추출 (fact extraction) - 핵심 정보 집중 (context compression) - 두 문서 연결 (bridge node discovery) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent b8a0ae7 commit f808f2c

File tree

5 files changed

+438
-4
lines changed

5 files changed

+438
-4
lines changed

src/synaptic/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,15 @@
99
from synaptic.extensions.embedder import EmbeddingProvider, MockEmbeddingProvider
1010
from synaptic.extensions.relation_detector import RuleBasedRelationDetector
1111
from synaptic.graph import SynapticGraph
12+
from synaptic.evidence import EvidenceAssembler
1213
from synaptic.models import (
1314
ActivatedNode,
1415
ConsolidationLevel,
1516
DigestResult,
1617
Edge,
1718
EdgeKind,
19+
EvidenceChain,
20+
EvidenceStep,
1821
Node,
1922
NodeKind,
2023
SearchResult,
@@ -48,6 +51,9 @@
4851
"Digester",
4952
"Edge",
5053
"EdgeKind",
54+
"EvidenceAssembler",
55+
"EvidenceChain",
56+
"EvidenceStep",
5157
"EmbeddingProvider",
5258
"GraphTraversal",
5359
"KindClassifier",

src/synaptic/evidence.py

Lines changed: 371 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,371 @@
1+
"""EvidenceAssembler — SearchResult를 LLM-optimized evidence chain으로 변환."""
2+
3+
from __future__ import annotations
4+
5+
import re
6+
from collections import deque
7+
from time import time
8+
from typing import TYPE_CHECKING
9+
10+
if TYPE_CHECKING:
11+
from synaptic.protocols import StorageBackend
12+
13+
from synaptic.models import (
14+
Edge,
15+
EdgeKind,
16+
EvidenceChain,
17+
EvidenceStep,
18+
Node,
19+
SearchResult,
20+
)
21+
22+
23+
# Directed edge kinds that impose an ordering constraint during
# topological sorting; all other kinds are treated as undirected.
_DIRECTED_KINDS = frozenset({
    EdgeKind.CAUSED,
    EdgeKind.RESULTED_IN,
    EdgeKind.DEPENDS_ON,
    EdgeKind.FOLLOWED_BY,
    EdgeKind.LEARNED_FROM,
})

# Stopwords excluded from query/sentence term-overlap scoring.
_STOPWORDS = frozenset({
    # English
    "the", "a", "an", "is", "are", "was", "were", "in", "on", "at",
    "to", "for", "of", "and", "or", "but", "not", "with", "by", "from",
    "that", "this", "it", "its", "be", "been", "being", "have", "has",
    "had", "do", "does", "did", "will", "would", "could", "should",
    "what", "which", "who", "when", "where", "how", "why",
    # Korean particles / function words
    "은", "는", "이", "가", "을", "를", "에", "의", "와", "과", "도",
    "에서", "로", "으로", "하는", "있는", "하고", "하면", "에게",
})

# Patterns that mark a sentence as carrying an extractable "fact".
_FACT_PATTERNS = [
    # Number followed by a unit (Korean or English, e.g. "30%", "5 km").
    re.compile(
        r'\d[\d,.]*\s*(%|만|억|원|달러|km|kg|GB|MB|TB|명|건|개|년|월|일|시간|분|초|percent|million|billion|thousand)',
        re.IGNORECASE,
    ),
    # Dates: 2024-01-01, 2024년 1월, 2024/1/1 etc.
    re.compile(r'\b\d{4}[-/년.]\d{1,2}[-/월.]?\d{0,2}일?\b'),
    # "January 15, 2024" style dates.
    re.compile(
        r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)'
        r'\s+\d{1,2},?\s*\d{4}\b',
        re.IGNORECASE,
    ),
    # "15 March 1990" style dates.
    re.compile(
        r'\b\d{1,2}\s+'
        r'(?:January|February|March|April|May|June|July|August|September|October|November|December)'
        r'\s+\d{4}\b',
        re.IGNORECASE,
    ),
    # Bare numbers with 4+ digits (years, populations, ...).
    re.compile(r'\b\d{4,}\b'),
]
68+
69+
70+
class EvidenceAssembler:
    """Convert a ``SearchResult`` into an LLM-optimized evidence chain.

    Pipeline:
      1. take the top-activated search hits as *seed* nodes,
      2. discover *bridge* nodes connecting consecutive seeds via BFS,
      3. order all nodes topologically along directed edges,
      4. compress each node's content to query-relevant sentences and
         extract regex-detectable facts (numbers, dates),
      5. format everything into one compact context string.
    """

    __slots__ = ("_max_sentences", "_relevance_threshold", "_max_tokens")

    def __init__(
        self,
        *,
        max_sentences_per_node: int = 5,
        relevance_threshold: float = 0.2,
        max_tokens: int = 2048,
    ) -> None:
        """Configure compression limits.

        Args:
            max_sentences_per_node: cap on sentences kept per node.
            relevance_threshold: minimum query-term overlap ratio for a
                sentence to be selected during compression.
            max_tokens: approximate (whitespace-token) budget for the
                final context string.
        """
        self._max_sentences = max_sentences_per_node
        self._relevance_threshold = relevance_threshold
        self._max_tokens = max_tokens

    async def assemble(
        self,
        backend: StorageBackend,
        query: str,
        search_result: SearchResult,
        *,
        max_steps: int = 8,
    ) -> EvidenceChain:
        """Assemble *search_result* into an ``EvidenceChain``.

        Args:
            backend: storage backend used to resolve nodes and edges.
            query: user query that drives relevance scoring.
            search_result: activation-ordered search output.
            max_steps: maximum number of steps in the resulting chain.

        Returns:
            An ``EvidenceChain`` with compressed context and de-duplicated
            facts; an empty chain when the search returned no nodes.
        """
        t0 = time()

        if not search_result.nodes:
            return EvidenceChain(query=query, assembly_time_ms=(time() - t0) * 1000)

        # 1. Seed nodes: the top max_steps activated nodes.
        seed_nodes = search_result.nodes[:max_steps]
        seed_ids = [a.node.id for a in seed_nodes]
        node_map: dict[str, Node] = {a.node.id: a.node for a in seed_nodes}

        # 2. BFS bridge discovery between consecutive seeds.
        bridge_paths = await self._find_bridge_paths(backend, seed_ids)

        # Resolve newly discovered bridge nodes.
        all_ids: list[str] = list(seed_ids)
        for path in bridge_paths:
            for nid in path:
                if nid not in node_map:
                    node = await backend.get_node(nid)
                    if node:
                        node_map[nid] = node
                if nid not in all_ids:
                    all_ids.append(nid)

        # 3. Collect edges among chain candidates.  Fix: de-duplicate —
        # each in-set edge is otherwise appended twice, once from the
        # adjacency list of each of its endpoints.
        id_set = set(all_ids)
        all_edges: list[Edge] = []
        seen_edges: set[tuple[str, str, EdgeKind]] = set()
        for nid in all_ids:
            for e in await backend.get_edges(nid):
                other = e.target_id if e.source_id == nid else e.source_id
                if other not in id_set:
                    continue
                key = (e.source_id, e.target_id, e.kind)
                if key not in seen_edges:
                    seen_edges.add(key)
                    all_edges.append(e)

        # 4. Topological ordering (falls back to activation order).
        sorted_ids = self._topological_sort(all_ids, all_edges, seed_ids)

        # Undirected pair -> edge-kind label, first occurrence wins.
        # Fix: hoisted out of the step loop (was an O(steps x edges) scan).
        pair_kind: dict[frozenset[str], str] = {}
        for e in all_edges:
            pair_kind.setdefault(frozenset((e.source_id, e.target_id)), e.kind.value)

        # 5. Build the steps; the chain is capped at max_steps entries.
        chain_ids = sorted_ids[:max_steps]
        seed_id_set = set(seed_ids)
        steps: list[EvidenceStep] = []
        all_facts: list[str] = []

        for i, nid in enumerate(chain_ids):
            node = node_map.get(nid)
            if not node:
                # Bridge id whose node could not be resolved earlier.
                continue

            compressed = self._compress_content(node.content, query)
            facts = self._extract_facts(node.content)
            all_facts.extend(facts)

            # Edge kind linking this node to the next chain entry.  Fix:
            # look within chain_ids (not the full sorted_ids) so the final
            # step never claims a connection to a node outside the chain.
            conn = ""
            if i < len(chain_ids) - 1:
                conn = pair_kind.get(frozenset((nid, chain_ids[i + 1])), "")

            steps.append(EvidenceStep(
                node=node,
                role="seed" if nid in seed_id_set else "bridge",
                connection_to_next=conn,
                compressed_content=compressed,
                facts=facts,
            ))

        # 6. Final context string for the LLM.
        context = self._format_context(steps)

        return EvidenceChain(
            query=query,
            steps=steps,
            compressed_context=context,
            facts=list(dict.fromkeys(all_facts)),  # dedupe, preserve order
            total_tokens_approx=len(context.split()),  # whitespace approximation
            assembly_time_ms=(time() - t0) * 1000,
        )

    async def _find_bridge_paths(
        self,
        backend: StorageBackend,
        seed_ids: list[str],
    ) -> list[list[str]]:
        """Find BFS shortest paths between consecutive seed nodes.

        Only the top 5 seeds are considered (keeps the number of BFS runs
        linear instead of quadratic), and only paths that contain at least
        one intermediate "bridge" node (length > 2) are returned.
        """
        paths: list[list[str]] = []
        max_depth = 3  # bounds BFS expansion per seed pair

        seeds = seed_ids[:5]
        for i in range(len(seeds) - 1):
            src, dst = seeds[i], seeds[i + 1]
            path = await self._bfs_shortest(backend, src, dst, max_depth)
            if path and len(path) > 2:  # keep only paths with a bridge node
                paths.append(path)

        return paths

    async def _bfs_shortest(
        self,
        backend: StorageBackend,
        src: str,
        dst: str,
        max_depth: int,
    ) -> list[str] | None:
        """BFS shortest path ``src -> dst`` treating edges as undirected.

        Returns the node-id path including both endpoints, or ``None`` when
        *dst* is unreachable within *max_depth* hops.
        """
        if src == dst:
            return [src]

        queue: deque[tuple[str, list[str]]] = deque([(src, [src])])
        visited: set[str] = {src}

        while queue:
            current, path = queue.popleft()
            # BFS explores in non-decreasing depth, so once one path exceeds
            # the budget every remaining queue entry does too — stop entirely.
            if len(path) > max_depth + 1:
                break

            for edge in await backend.get_edges(current):
                neighbor = edge.target_id if edge.source_id == current else edge.source_id
                if neighbor == dst:
                    return path + [neighbor]
                if neighbor not in visited:
                    visited.add(neighbor)
                    queue.append((neighbor, path + [neighbor]))

        return None

    def _topological_sort(
        self,
        node_ids: list[str],
        edges: list[Edge],
        seed_ids: list[str],
    ) -> list[str]:
        """Order *node_ids* topologically along directed edges (Kahn).

        Only edge kinds in ``_DIRECTED_KINDS`` impose ordering.  When there
        are no directed edges, the original (activation) order is returned;
        nodes left unplaced by a cycle are appended in original order.

        Note: *seed_ids* is currently unused; kept for interface stability.
        """
        id_set = set(node_ids)

        # Only directed kinds with both endpoints in scope impose order.
        directed = [
            e for e in edges
            if e.kind in _DIRECTED_KINDS
            and e.source_id in id_set
            and e.target_id in id_set
        ]

        if not directed:
            return list(node_ids)  # activation order

        # Kahn's algorithm.
        in_degree: dict[str, int] = {nid: 0 for nid in node_ids}
        adj: dict[str, list[str]] = {nid: [] for nid in node_ids}
        for e in directed:
            adj[e.source_id].append(e.target_id)
            in_degree[e.target_id] += 1

        queue: deque[str] = deque(nid for nid in node_ids if in_degree[nid] == 0)
        result: list[str] = []

        while queue:
            nid = queue.popleft()
            result.append(nid)
            for neighbor in adj[nid]:
                in_degree[neighbor] -= 1
                if in_degree[neighbor] == 0:
                    queue.append(neighbor)

        # Fix: hoist set(result) out of the membership test — it was
        # rebuilt on every iteration of the comprehension (O(n^2)).
        placed = set(result)
        result.extend(nid for nid in node_ids if nid not in placed)
        return result

    def _compress_content(self, content: str, query: str) -> str:
        """Keep only the sentences most relevant to *query*.

        Relevance = |query terms ∩ sentence terms| / |query terms|.
        Sentences scoring at least ``self._relevance_threshold`` are kept
        in document order (capped at ``self._max_sentences``); when none
        qualify, the top-scoring sentences are used as a fallback.
        """
        if not content:
            return ""

        # Split on terminal punctuation followed by whitespace.
        sentences = re.split(r'(?<=[.!?。])\s+', content.strip())
        if not sentences:
            return content[:500]  # defensive; re.split never yields [] here

        token_split = r'[\s,;:!?()\[\]]+'

        # Query terms: lowercase, non-stopword, at least 2 characters.
        query_terms = {
            t.lower() for t in re.split(token_split, query)
            if t.lower() not in _STOPWORDS and len(t) >= 2
        }

        if not query_terms:
            # No usable query terms: return the first N sentences.
            return " ".join(sentences[:self._max_sentences])

        # Score every sentence by query-term overlap.
        scored: list[tuple[int, str, float]] = []
        for i, sent in enumerate(sentences):
            sent_terms = set(re.split(token_split, sent.lower()))
            relevance = len(query_terms & sent_terms) / len(query_terms)
            scored.append((i, sent, relevance))

        # Keep everything above the threshold.
        selected = [(i, s) for i, s, r in scored if r >= self._relevance_threshold]

        if not selected:
            # Nothing passed: fall back to the best-scoring sentences.
            scored.sort(key=lambda x: x[2], reverse=True)
            selected = [(i, s) for i, s, _ in scored[:self._max_sentences]]

        selected.sort(key=lambda x: x[0])  # restore document order
        selected = selected[:self._max_sentences]

        return " ".join(s for _, s in selected)

    def _extract_facts(self, content: str) -> list[str]:
        """Extract sentences containing regex-detectable facts.

        A sentence qualifies when any pattern in ``_FACT_PATTERNS``
        (numbers with units, dates, 4+-digit numbers) matches.  Duplicates
        are dropped while preserving first-seen order.
        """
        if not content:
            return []

        sentences = re.split(r'(?<=[.!?。])\s+', content.strip())
        facts: list[str] = []
        seen: set[str] = set()

        for sent in sentences:
            if any(pattern.search(sent) for pattern in _FACT_PATTERNS):
                normalized = sent.strip()
                if normalized and normalized not in seen:
                    facts.append(normalized)
                    seen.add(normalized)

        return facts

    def _format_context(self, steps: list[EvidenceStep]) -> str:
        """Render *steps* into the final context string for the LLM.

        Per step: "[ROLE] title", the compressed content, up to 3 key
        facts, and an arrow line naming the edge kind to the next step.
        The result is truncated to ``self._max_tokens`` whitespace tokens.
        """
        parts: list[str] = []

        for i, step in enumerate(steps):
            # Role tag + title (placeholder when the node has no title).
            title = step.node.title or "Untitled"
            parts.append(f"[{step.role.upper()}] {title}")

            if step.compressed_content:
                parts.append(step.compressed_content)

            # At most 3 key facts per step.
            if step.facts:
                facts_text = " | ".join(step.facts[:3])
                parts.append(f"Key facts: {facts_text}")

            # Connection arrow only between steps, never after the last one.
            if step.connection_to_next and i < len(steps) - 1:
                parts.append(f"→ {step.connection_to_next}")

            parts.append("")  # blank separator between steps

        context = "\n".join(parts).strip()

        # Approximate token cap via whitespace-token count.
        words = context.split()
        if len(words) > self._max_tokens:
            context = " ".join(words[:self._max_tokens])

        return context

0 commit comments

Comments
 (0)