Skip to content

Commit b0ccf2b

Browse files
author
StackMemory Bot (CLI)
committed
feat(dspy): scaffold prompt optimization pipeline for retrieval
Adds DSPy-based offline prompt optimization for StackMemory:
- signatures.py: FrameRetrieval, QueryComplexity, FrameScoring, ContextCompression
- data.py: loads from retrieval_audit + synthetic cold-start examples
- optimize.py: MIPROv2/BootstrapFewShot optimization, exports state JSON
- eval.py: CI-friendly eval with regression detection
- setup.sh: venv + deps installer

Does not add DSPy to the TS runtime — it runs offline and exports optimized prompts back to llm-context-retrieval.ts.
1 parent 93d303d commit b0ccf2b

6 files changed

Lines changed: 676 additions & 0 deletions

File tree

scripts/dspy/data.py

Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
"""
2+
Data loading from StackMemory SQLite databases.
3+
4+
Loads training examples from:
5+
- retrieval_audit: past retrieval queries with confidence scores
6+
- frames: available frame metadata
7+
- anchors: decision/constraint anchors
8+
- events: frame events
9+
10+
Falls back to synthetic examples when audit data is sparse.
11+
"""
12+
13+
import json
14+
import sqlite3
15+
from pathlib import Path
16+
from typing import Optional
17+
18+
import dspy
19+
20+
21+
def find_db(repo_root: Optional[str] = None) -> Path:
    """Locate the StackMemory ``context.db`` SQLite file.

    Candidates are checked in this order and the first one that exists wins:

    1. ``<repo_root>/.stackmemory/context.db`` (the current directory is used
       when *repo_root* is not given),
    2. ``~/.stackmemory/context.db``,
    3. ``~/.stackmemory/symphony/context.db``.

    Args:
        repo_root: Optional directory to treat as the repository root.

    Returns:
        The first candidate path that exists.

    Raises:
        FileNotFoundError: If none of the candidate paths exist.
    """
    project_dir = Path(repo_root or ".")
    home_store = Path.home() / ".stackmemory"
    candidates = [
        project_dir / ".stackmemory" / "context.db",
        home_store / "context.db",
        home_store / "symphony" / "context.db",
    ]

    found = next((path for path in candidates if path.exists()), None)
    if found is None:
        raise FileNotFoundError(
            f"No context.db found. Searched: {[str(c) for c in candidates]}"
        )
    return found
34+
35+
36+
def load_audit_examples(db_path: Path, min_confidence: float = 0.5) -> list[dspy.Example]:
    """Load training examples from the ``retrieval_audit`` table.

    Reads at most 200 audit rows whose ``confidence_score`` is at least
    *min_confidence*, highest confidence first, and wraps each row in a
    ``dspy.Example``.

    Args:
        db_path: Path to the SQLite database file.
        min_confidence: Inclusive lower bound on ``confidence_score``.

    Returns:
        One ``dspy.Example`` per row with ``query``, ``reasoning``,
        ``frames_to_retrieve``, ``confidence_score`` and ``token_budget`` set.

    Raises:
        sqlite3.Error: If the query fails; the connection is closed first.
    """
    db = sqlite3.connect(str(db_path))
    try:
        db.row_factory = sqlite3.Row
        rows = db.execute(
            """
            SELECT query, reasoning, frames_retrieved, confidence_score,
                   tokens_used, token_budget, query_complexity
            FROM retrieval_audit
            WHERE confidence_score >= ?
            ORDER BY confidence_score DESC
            LIMIT 200
            """,
            (min_confidence,),
        ).fetchall()
    finally:
        # Release the connection even when the query raises.
        db.close()

    # NOTE(review): ``session_summary``, ``available_frames`` and
    # ``key_decisions`` are declared as inputs but never set on the example
    # here — presumably a caller fills them in; verify.
    # NOTE(review): ``frames_retrieved`` is passed through exactly as stored;
    # if the column holds serialized JSON it may need decoding — confirm.
    return [
        dspy.Example(
            query=r["query"],
            reasoning=r["reasoning"],
            frames_to_retrieve=r["frames_retrieved"],
            confidence_score=r["confidence_score"],
            token_budget=r["token_budget"],
        ).with_inputs("query", "token_budget", "session_summary", "available_frames", "key_decisions")
        for r in rows
    ]
65+
66+
67+
def load_frames(db_path: Path, limit: int = 50) -> list[dict]:
    """Load frame metadata for building training context.

    Args:
        db_path: Path to the SQLite database file.
        limit: Maximum number of frames to return.

    Returns:
        Up to *limit* rows from ``frames`` as plain dicts, ordered by
        ``last_accessed`` descending. Each dict has the keys ``frame_id``,
        ``name``, ``type``, ``importance_score``, ``access_count``,
        ``created_at`` and ``closed_at``.

    Raises:
        sqlite3.Error: If the query fails; the connection is closed first.
    """
    db = sqlite3.connect(str(db_path))
    try:
        db.row_factory = sqlite3.Row
        rows = db.execute(
            """
            SELECT frame_id, name, type, importance_score, access_count,
                   created_at, closed_at
            FROM frames
            ORDER BY last_accessed DESC
            LIMIT ?
            """,
            (limit,),
        ).fetchall()
    finally:
        # Release the connection even when the query raises.
        db.close()
    return [dict(r) for r in rows]
83+
84+
85+
def load_anchors(db_path: Path, limit: int = 30) -> list[dict]:
    """Load decision/constraint anchors.

    Args:
        db_path: Path to the SQLite database file.
        limit: Maximum number of anchors to return.

    Returns:
        Up to *limit* rows from ``anchors`` as plain dicts, newest
        ``created_at`` first. Each dict has the keys ``anchor_id``,
        ``frame_id``, ``type``, ``text``, ``priority`` and ``created_at``.

    Raises:
        sqlite3.Error: If the query fails; the connection is closed first.
    """
    db = sqlite3.connect(str(db_path))
    try:
        db.row_factory = sqlite3.Row
        rows = db.execute(
            """
            SELECT anchor_id, frame_id, type, text, priority, created_at
            FROM anchors
            ORDER BY created_at DESC
            LIMIT ?
            """,
            (limit,),
        ).fetchall()
    finally:
        # Release the connection even when the query raises.
        db.close()
    return [dict(r) for r in rows]
100+
101+
102+
def build_frame_summary(frames: list[dict]) -> str:
    """Build the ``available_frames`` field from frame metadata.

    Only the first 15 frames are rendered, one bullet line per frame.

    Args:
        frames: Frame dicts; each must contain ``frame_id``, ``name`` and
            ``type``. ``importance_score`` and ``access_count`` are optional.

    Returns:
        Newline-joined bullet lines, or an empty string when *frames* is empty.

    Raises:
        KeyError: If a frame lacks ``frame_id``, ``name`` or ``type``.
    """
    lines = []
    for f in frames[:15]:
        score = f.get("importance_score", 0) or 0
        # A missing or None count renders as 0, mirroring the score handling
        # above, instead of leaking the literal text "None" into the summary.
        events = f.get("access_count", 0) or 0
        lines.append(
            f"- {f['frame_id']}: \"{f['name']}\" ({f['type']}, score: {score:.2f}, events: {events})"
        )
    return "\n".join(lines)
111+
112+
113+
def build_decisions_summary(anchors: list[dict]) -> str:
    """Build the ``key_decisions`` field from anchors.

    Anchors whose ``type`` is ``"decision"`` are preferred; when there are
    none, the leading anchors of any type are used instead. At most five
    entries are considered, each rendered as a bullet with its text cut to
    80 characters.

    Args:
        anchors: Anchor dicts with optional ``type`` and ``text`` keys.

    Returns:
        Newline-joined bullet lines, or ``"No decisions recorded yet."`` when
        nothing can be rendered.
    """
    decisions = [a for a in anchors if a.get("type") == "decision"]
    if not decisions:
        decisions = anchors

    lines = []
    for d in decisions[:5]:
        text = d.get("text") or ""
        if not text:
            # An anchor without text would only add an empty bullet.
            continue
        if len(text) > 80:
            # Mark truncation only when something was actually cut off.
            text = text[:80] + "..."
        lines.append(f"- {text}")
    return "\n".join(lines) or "No decisions recorded yet."
123+
124+
125+
# --- Synthetic examples for cold-start ---
126+
127+
# Hand-written query classifications used when no audit data is available.
# Every entry carries the same five keys: query, complexity, use_llm,
# strategy and reasoning.
SYNTHETIC_QUERIES = [
    dict(
        query="What errors happened in the last hour?",
        complexity="simple",
        use_llm=False,
        strategy="recent",
        reasoning="Time-scoped error lookup — heuristic recency filter suffices",
    ),
    dict(
        query="How does the authentication flow work end to end?",
        complexity="complex",
        use_llm=True,
        strategy="semantic",
        reasoning="Cross-cutting architectural query needs semantic understanding of auth-related frames",
    ),
    dict(
        query="What did I work on yesterday?",
        complexity="simple",
        use_llm=False,
        strategy="recent",
        reasoning="Simple time-scoped standup query — filter by date, sort by activity",
    ),
    dict(
        query="Why is the API returning 500 on the /users endpoint?",
        complexity="complex",
        use_llm=True,
        strategy="hybrid",
        reasoning="Debugging requires correlating error events, recent changes to user routes, and related decisions",
    ),
    dict(
        query="Show me the database schema changes this week",
        complexity="moderate",
        use_llm=False,
        strategy="keyword",
        reasoning="File-type filter (migrations) + time constraint — keyword search on .sql files",
    ),
    dict(
        query="What's the current state of the billing integration?",
        complexity="moderate",
        use_llm=True,
        strategy="semantic",
        reasoning="Feature-scoped query across multiple frames — needs semantic matching on billing-related work",
    ),
    dict(
        query="List all TODO items and unfinished tasks",
        complexity="simple",
        use_llm=False,
        strategy="keyword",
        reasoning="Keyword match on TODO/task anchors — no semantic analysis needed",
    ),
    dict(
        query="What architectural decisions were made about the caching layer and why?",
        complexity="complex",
        use_llm=True,
        strategy="semantic",
        reasoning="Decision retrieval across time requires understanding context of caching-related anchors and their rationale",
    ),
]
185+
186+
187+
def build_synthetic_examples() -> list[dspy.Example]:
    """Turn ``SYNTHETIC_QUERIES`` into DSPy examples for cold-start optimization.

    Returns:
        One ``dspy.Example`` per synthetic query, carrying its ``query``,
        ``complexity``, ``use_llm``, ``strategy`` and ``reasoning`` values.
    """
    # NOTE(review): ``frame_count``, ``has_time_constraint`` and
    # ``has_file_constraint`` are declared as inputs but are not set on the
    # examples built here — presumably supplied elsewhere; verify.
    input_keys = ("query", "frame_count", "has_time_constraint", "has_file_constraint")
    return [
        dspy.Example(
            query=spec["query"],
            complexity=spec["complexity"],
            use_llm=spec["use_llm"],
            strategy=spec["strategy"],
            reasoning=spec["reasoning"],
        ).with_inputs(*input_keys)
        for spec in SYNTHETIC_QUERIES
    ]

scripts/dspy/eval.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Evaluate StackMemory retrieval quality against baseline.
4+
5+
Runs the current (or optimized) retrieval prompt against test queries
6+
and reports metrics. Use in CI to detect prompt regression.
7+
8+
Usage:
9+
python scripts/dspy/eval.py [--optimized PATH] [--db PATH]
10+
"""
11+
12+
import argparse
13+
import json
14+
import os
15+
import sys
16+
from pathlib import Path
17+
18+
import dspy
19+
20+
from signatures import FrameRetrieval, QueryComplexity
21+
from data import find_db, load_frames, load_anchors, build_frame_summary, build_decisions_summary
22+
from optimize import retrieval_metric, complexity_metric
23+
24+
25+
# Fixed evaluation queries. The list is kept constant so that scores stay
# comparable from one run to the next.
EVAL_QUERIES: list[str] = [
    "What errors happened in the last hour?",
    "How does the authentication flow work?",
    "What did I work on yesterday?",
    "Why is the API returning 500?",
    "Show me recent database schema changes",
    "What's the current state of the billing integration?",
    "What architectural decisions were made about caching?",
    "List all unfinished tasks",
    "What files were changed in the last commit?",
    "How is the deployment pipeline configured?",
]
38+
39+
40+
def run_eval(db_path: Path, model: str, optimized_path: Path | None):
    """Run the fixed evaluation queries against the retrieval program.

    Configures DSPy with *model*, builds one example per entry in
    ``EVAL_QUERIES`` from the frames and anchors stored in *db_path*, and
    scores a baseline ``ChainOfThought(FrameRetrieval)`` program with
    ``retrieval_metric``. When *optimized_path* exists, its saved state is
    loaded into a second program, scored on the same examples and compared
    with the baseline.

    Args:
        db_path: Path to the StackMemory SQLite database.
        model: Model identifier passed to ``dspy.LM``.
        optimized_path: Optional path to an exported optimized-state JSON file.

    Returns:
        The baseline score as a float.

    Side effects:
        Prints the scores and verdict, and exits the process with status 1
        when the optimized score is more than 0.05 below the baseline.
    """

    def as_score(result) -> float:
        # NOTE(review): newer DSPy releases are documented to return a result
        # object exposing ``.score`` from Evaluate instead of a bare number —
        # confirm against the pinned version. Both shapes are accepted here so
        # the ``:.3f`` formatting and the subtraction below keep working.
        return float(getattr(result, "score", result))

    lm = dspy.LM(model, api_key=os.environ.get("ANTHROPIC_API_KEY"))
    dspy.configure(lm=lm)

    frames = load_frames(db_path)
    anchors = load_anchors(db_path)
    frame_summary = build_frame_summary(frames)
    decisions_summary = build_decisions_summary(anchors)

    # Every query is evaluated against the same context snapshot.
    eval_set = [
        dspy.Example(
            query=q,
            token_budget=4096,
            session_summary=f"Frames: {len(frames)}, recent activity",
            available_frames=frame_summary,
            key_decisions=decisions_summary,
        ).with_inputs("query", "token_budget", "session_summary", "available_frames", "key_decisions")
        for q in EVAL_QUERIES
    ]

    # Baseline
    baseline = dspy.ChainOfThought(FrameRetrieval)
    evaluate = dspy.Evaluate(devset=eval_set, metric=retrieval_metric, num_threads=2)
    baseline_score = as_score(evaluate(baseline))
    print(f"Baseline score: {baseline_score:.3f}")

    # Optimized (if available)
    if optimized_path and optimized_path.exists():
        state = json.loads(optimized_path.read_text())
        optimized = dspy.ChainOfThought(FrameRetrieval)
        optimized.load_state(state["retrieval"]["state"])
        optimized_score = as_score(evaluate(optimized))
        print(f"Optimized score: {optimized_score:.3f}")
        delta = optimized_score - baseline_score
        print(f"Delta: {delta:+.3f}")

        # NOTE(review): these thresholds assume scores on roughly a 0-1 scale;
        # if Evaluate reports percentages they are far stricter than they
        # look — confirm the scale.
        if delta < -0.05:
            print("REGRESSION DETECTED — optimized prompt is worse than baseline")
            sys.exit(1)
        elif delta > 0.02:
            print("IMPROVEMENT — consider updating the production prompt")
        else:
            print("NO SIGNIFICANT CHANGE")
    else:
        print("No optimized state found — baseline only")

    return baseline_score
90+
91+
92+
def main():
    """Command-line entry point for the retrieval evaluation.

    Parses ``--db``, ``--model`` and ``--optimized``, resolves the database
    path (falling back to ``find_db()`` when ``--db`` is not given), checks
    that ``ANTHROPIC_API_KEY`` is set, and then runs the evaluation. Exits
    with status 1 when the API key is missing.
    """
    cli = argparse.ArgumentParser(description="Evaluate StackMemory retrieval")
    cli.add_argument("--db", type=str, help="Path to context.db")
    cli.add_argument("--model", type=str, default="anthropic/claude-haiku-4-5-20251001")
    cli.add_argument("--optimized", type=str, default="scripts/dspy/optimized_state.json")
    opts = cli.parse_args()

    if opts.db:
        db_path = Path(opts.db)
    else:
        db_path = find_db()

    optimized_path = None
    if opts.optimized:
        optimized_path = Path(opts.optimized)

    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print("ERROR: ANTHROPIC_API_KEY not set", file=sys.stderr)
        sys.exit(1)

    run_eval(db_path, opts.model, optimized_path)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)