stackmemoryai
diff --git a/‎scripts/dspy/data.py‎
Lines changed: 200 additions & 0 deletions b/‎scripts/dspy/data.py‎
Lines changed: 200 additions & 0 deletions
diff --git a/‎scripts/dspy/eval.py‎
Lines changed: 110 additions & 0 deletions b/‎scripts/dspy/eval.py‎
Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,200 @@
+"""
+Data loading from StackMemory SQLite databases.
+
+Loads training examples from:
+- retrieval_audit: past retrieval queries with confidence scores
+- frames: available frame metadata
+- anchors: decision/constraint anchors
+- events: frame events
+
+Falls back to synthetic examples when audit data is sparse.
+"""
+
+import json
+import sqlite3
+from pathlib import Path
+from typing import Optional
+
+import dspy
+
+
+def find_db(repo_root: Optional[str] = None) -> Path:
+    """Locate an existing StackMemory ``context.db`` file.
+
+    Probes, in order:
+
+    1. ``<repo_root or ".">/.stackmemory/context.db``
+    2. ``~/.stackmemory/context.db``
+    3. ``~/.stackmemory/symphony/context.db``
+
+    Args:
+        repo_root: Directory to probe first; the current directory is used
+            when it is ``None`` or empty.
+
+    Returns:
+        The first probed path that exists.
+
+    Raises:
+        FileNotFoundError: If none of the probed paths exists. The message
+            lists every path that was tried.
+    """
+    project_dir = Path(repo_root) if repo_root else Path(".")
+    user_dir = Path.home() / ".stackmemory"
+    candidates = [
+        project_dir / ".stackmemory" / "context.db",
+        user_dir / "context.db",
+        user_dir / "symphony" / "context.db",
+    ]
+    found = next((c for c in candidates if c.exists()), None)
+    if found is None:
+        raise FileNotFoundError(
+            f"No context.db found. Searched: {[str(c) for c in candidates]}"
+        )
+    return found
+
+
+def load_audit_examples(db_path: Path, min_confidence: float = 0.5) -> list[dspy.Example]:
+    """Load training examples from the ``retrieval_audit`` table.
+
+    Reads at most 200 rows whose ``confidence_score`` is at least
+    ``min_confidence``, highest confidence first, and wraps each one in a
+    ``dspy.Example``.
+
+    Args:
+        db_path: Path to the SQLite database file.
+        min_confidence: Inclusive lower bound on ``confidence_score``.
+
+    Returns:
+        One ``dspy.Example`` per row carrying ``query``, ``reasoning``,
+        ``frames_to_retrieve`` (from the ``frames_retrieved`` column),
+        ``confidence_score`` and ``token_budget``.
+
+    Raises:
+        sqlite3.Error: If the query fails (for example, the table is missing).
+    """
+    db = sqlite3.connect(str(db_path))
+    try:
+        db.row_factory = sqlite3.Row
+        rows = db.execute(
+            """
+            SELECT query, reasoning, frames_retrieved, confidence_score,
+                   tokens_used, token_budget, query_complexity
+            FROM retrieval_audit
+            WHERE confidence_score >= ?
+            ORDER BY confidence_score DESC
+            LIMIT 200
+            """,
+            (min_confidence,),
+        ).fetchall()
+    finally:
+        # Close on the error path too; otherwise a failing query leaks the
+        # connection until garbage collection.
+        db.close()
+
+    # NOTE(review): session_summary, available_frames and key_decisions are
+    # declared as inputs but never set on these examples -- confirm that the
+    # consumer fills them in before the examples are used for training.
+    return [
+        dspy.Example(
+            query=row["query"],
+            reasoning=row["reasoning"],
+            frames_to_retrieve=row["frames_retrieved"],
+            confidence_score=row["confidence_score"],
+            token_budget=row["token_budget"],
+        ).with_inputs("query", "token_budget", "session_summary", "available_frames", "key_decisions")
+        for row in rows
+    ]
+
+
+def load_frames(db_path: Path, limit: int = 50) -> list[dict]:
+    """Load frame metadata for building training context.
+
+    Args:
+        db_path: Path to the SQLite database file.
+        limit: Maximum number of rows to return.
+
+    Returns:
+        Up to ``limit`` rows from ``frames`` as plain dicts with the keys
+        ``frame_id``, ``name``, ``type``, ``importance_score``,
+        ``access_count``, ``created_at`` and ``closed_at``, ordered by
+        ``last_accessed`` descending. NULL columns come back as ``None``.
+
+    Raises:
+        sqlite3.Error: If the query fails (for example, the table is missing).
+    """
+    db = sqlite3.connect(str(db_path))
+    try:
+        db.row_factory = sqlite3.Row
+        rows = db.execute(
+            """
+            SELECT frame_id, name, type, importance_score, access_count,
+                   created_at, closed_at
+            FROM frames
+            ORDER BY last_accessed DESC
+            LIMIT ?
+            """,
+            (limit,),
+        ).fetchall()
+    finally:
+        # Close on the error path too; otherwise a failing query leaks the
+        # connection until garbage collection.
+        db.close()
+    return [dict(row) for row in rows]
+
+
+def load_anchors(db_path: Path, limit: int = 30) -> list[dict]:
+    """Load the most recently created anchors.
+
+    Args:
+        db_path: Path to the SQLite database file.
+        limit: Maximum number of rows to return.
+
+    Returns:
+        Up to ``limit`` rows from ``anchors`` as plain dicts with the keys
+        ``anchor_id``, ``frame_id``, ``type``, ``text``, ``priority`` and
+        ``created_at``, ordered by ``created_at`` descending. Rows of every
+        ``type`` are returned; no type filter is applied here.
+
+    Raises:
+        sqlite3.Error: If the query fails (for example, the table is missing).
+    """
+    db = sqlite3.connect(str(db_path))
+    try:
+        db.row_factory = sqlite3.Row
+        rows = db.execute(
+            """
+            SELECT anchor_id, frame_id, type, text, priority, created_at
+            FROM anchors
+            ORDER BY created_at DESC
+            LIMIT ?
+            """,
+            (limit,),
+        ).fetchall()
+    finally:
+        # Close on the error path too; otherwise a failing query leaks the
+        # connection until garbage collection.
+        db.close()
+    return [dict(row) for row in rows]
+
+
+def build_frame_summary(frames: list[dict]) -> str:
+    """Build the ``available_frames`` field from frame metadata.
+
+    Only the first 15 frames are rendered, one line each, in the form
+    ``- <frame_id>: "<name>" (<type>, score: <x.xx>, events: <n>)``.
+
+    Args:
+        frames: Frame dicts with ``frame_id``, ``name`` and ``type`` keys
+            (required) and optional ``importance_score`` / ``access_count``.
+
+    Returns:
+        The rendered lines joined with newlines; an empty string when
+        ``frames`` is empty.
+
+    Raises:
+        KeyError: If a frame lacks ``frame_id``, ``name`` or ``type``.
+    """
+    lines = []
+    for frame in frames[:15]:
+        # A present key can still hold None (a NULL column), so coalesce the
+        # value itself instead of relying on the .get() default.
+        score = frame.get("importance_score") or 0
+        # NOTE(review): the "events" label is filled from access_count --
+        # confirm that this is the intended figure.
+        accesses = frame.get("access_count") or 0
+        lines.append(
+            f"- {frame['frame_id']}: \"{frame['name']}\" ({frame['type']}, "
+            f"score: {score:.2f}, events: {accesses})"
+        )
+    return "\n".join(lines)
+
+
+def build_decisions_summary(anchors: list[dict]) -> str:
+    """Build the ``key_decisions`` field from anchors.
+
+    Anchors whose ``type`` is ``"decision"`` are preferred; when there are
+    none, the anchors are used as given. At most five entries are rendered,
+    one ``- <text>`` line each.
+
+    Args:
+        anchors: Anchor dicts; ``type`` and ``text`` are read with ``.get``
+            so either may be missing or ``None``.
+
+    Returns:
+        The rendered lines joined with newlines, or
+        ``"No decisions recorded yet."`` when ``anchors`` is empty.
+    """
+    decisions = [a for a in anchors if a.get("type") == "decision"] or anchors
+    lines = []
+    for anchor in decisions[:5]:
+        text = anchor.get("text") or ""
+        # Add the ellipsis only when text was actually cut, so short entries
+        # do not look truncated.
+        if len(text) > 80:
+            text = text[:80] + "..."
+        lines.append(f"- {text}")
+    return "\n".join(lines) or "No decisions recorded yet."
+
+
+# --- Synthetic examples for cold-start ---
+#
+# Hand-written query specs that build_synthetic_examples() turns into
+# dspy.Example objects. Every entry has the same five keys:
+#   query      - the natural-language question
+#   complexity - "simple", "moderate" or "complex"
+#   use_llm    - bool label paired with the query
+#   strategy   - "recent", "keyword", "semantic" or "hybrid"
+#   reasoning  - one-sentence rationale for the labels above
+
+SYNTHETIC_QUERIES = [
+    {
+        "query": "What errors happened in the last hour?",
+        "complexity": "simple",
+        "use_llm": False,
+        "strategy": "recent",
+        "reasoning": "Time-scoped error lookup — heuristic recency filter suffices",
+    },
+    {
+        "query": "How does the authentication flow work end to end?",
+        "complexity": "complex",
+        "use_llm": True,
+        "strategy": "semantic",
+        "reasoning": "Cross-cutting architectural query needs semantic understanding of auth-related frames",
+    },
+    {
+        "query": "What did I work on yesterday?",
+        "complexity": "simple",
+        "use_llm": False,
+        "strategy": "recent",
+        "reasoning": "Simple time-scoped standup query — filter by date, sort by activity",
+    },
+    {
+        "query": "Why is the API returning 500 on the /users endpoint?",
+        "complexity": "complex",
+        "use_llm": True,
+        "strategy": "hybrid",
+        "reasoning": "Debugging requires correlating error events, recent changes to user routes, and related decisions",
+    },
+    {
+        "query": "Show me the database schema changes this week",
+        "complexity": "moderate",
+        "use_llm": False,
+        "strategy": "keyword",
+        "reasoning": "File-type filter (migrations) + time constraint — keyword search on .sql files",
+    },
+    {
+        "query": "What's the current state of the billing integration?",
+        "complexity": "moderate",
+        "use_llm": True,
+        "strategy": "semantic",
+        "reasoning": "Feature-scoped query across multiple frames — needs semantic matching on billing-related work",
+    },
+    {
+        "query": "List all TODO items and unfinished tasks",
+        "complexity": "simple",
+        "use_llm": False,
+        "strategy": "keyword",
+        "reasoning": "Keyword match on TODO/task anchors — no semantic analysis needed",
+    },
+    {
+        "query": "What architectural decisions were made about the caching layer and why?",
+        "complexity": "complex",
+        "use_llm": True,
+        "strategy": "semantic",
+        "reasoning": "Decision retrieval across time requires understanding context of caching-related anchors and their rationale",
+    },
+]
+
+
+def build_synthetic_examples() -> list[dspy.Example]:
+    """Turn every ``SYNTHETIC_QUERIES`` entry into a ``dspy.Example``.
+
+    Each example carries the entry's ``query``, ``complexity``, ``use_llm``,
+    ``strategy`` and ``reasoning`` values. ``query``, ``frame_count``,
+    ``has_time_constraint`` and ``has_file_constraint`` are declared as the
+    input fields; of those, only ``query`` is set here.
+
+    Returns:
+        One example per entry, in the order the entries are listed.
+    """
+    stored_fields = ("query", "complexity", "use_llm", "strategy", "reasoning")
+    input_fields = ("query", "frame_count", "has_time_constraint", "has_file_constraint")
+    return [
+        dspy.Example(**{name: spec[name] for name in stored_fields}).with_inputs(*input_fields)
+        for spec in SYNTHETIC_QUERIES
+    ]
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+"""
+Evaluate StackMemory retrieval quality against baseline.
+
+Runs the current (or optimized) retrieval prompt against test queries
+and reports metrics. Use in CI to detect prompt regression.
+
+Usage:
+    python scripts/dspy/eval.py [--optimized PATH] [--db PATH]
+"""
+
+import argparse
+import json
+import os
+import sys
+from pathlib import Path
+
+import dspy
+
+from signatures import FrameRetrieval, QueryComplexity
+from data import find_db, load_frames, load_anchors, build_frame_summary, build_decisions_summary
+from optimize import retrieval_metric, complexity_metric
+
+
+# Fixed eval queries — stable across runs
+#
+# run_eval() wraps each string in a dspy.Example (token_budget 4096, plus the
+# frame and decision summaries built from the database) and scores every
+# program against this same set. Changing the text or the order changes what
+# is evaluated, so scores from before and after an edit are not comparable.
+EVAL_QUERIES = [
+    "What errors happened in the last hour?",
+    "How does the authentication flow work?",
+    "What did I work on yesterday?",
+    "Why is the API returning 500?",
+    "Show me recent database schema changes",
+    "What's the current state of the billing integration?",
+    "What architectural decisions were made about caching?",
+    "List all unfinished tasks",
+    "What files were changed in the last commit?",
+    "How is the deployment pipeline configured?",
+]
+
+
+def _score_value(result) -> float:
+    """Coerce the value returned by a ``dspy.Evaluate`` call to a float.
+
+    Accepts either a bare number or an object exposing a numeric ``score``
+    attribute, so the ``:.3f`` formatting and the subtraction in
+    ``run_eval`` work in both cases.
+    """
+    return float(getattr(result, "score", result))
+
+
+def run_eval(db_path: Path, model: str, optimized_path: Path | None):
+    """Score the baseline (and, if present, optimized) retrieval program.
+
+    Builds one ``dspy.Example`` per ``EVAL_QUERIES`` entry from the frames
+    and anchors in ``db_path``, evaluates a fresh
+    ``dspy.ChainOfThought(FrameRetrieval)`` with ``retrieval_metric``, and
+    prints the score. When ``optimized_path`` exists, its saved state is
+    loaded into a second program, scored on the same set and compared.
+
+    Args:
+        db_path: Path to the SQLite ``context.db``.
+        model: Model identifier passed to ``dspy.LM``.
+        optimized_path: JSON file whose ``["retrieval"]["state"]`` entry
+            holds the optimized program state, or ``None`` to skip it.
+
+    Returns:
+        The baseline score as a float.
+
+    Side effects:
+        Configures the global DSPy LM, prints results to stdout, and calls
+        ``sys.exit(1)`` when the optimized score trails the baseline by
+        more than 0.05.
+    """
+    lm = dspy.LM(model, api_key=os.environ.get("ANTHROPIC_API_KEY"))
+    dspy.configure(lm=lm)
+
+    frames = load_frames(db_path)
+    anchors = load_anchors(db_path)
+    frame_summary = build_frame_summary(frames)
+    decisions_summary = build_decisions_summary(anchors)
+
+    # Every query shares the same database-derived context; only the query
+    # text differs between examples.
+    eval_set = [
+        dspy.Example(
+            query=q,
+            token_budget=4096,
+            session_summary=f"Frames: {len(frames)}, recent activity",
+            available_frames=frame_summary,
+            key_decisions=decisions_summary,
+        ).with_inputs("query", "token_budget", "session_summary", "available_frames", "key_decisions")
+        for q in EVAL_QUERIES
+    ]
+
+    # Baseline
+    baseline = dspy.ChainOfThought(FrameRetrieval)
+    evaluate = dspy.Evaluate(devset=eval_set, metric=retrieval_metric, num_threads=2)
+    # The evaluator may hand back a result object rather than a bare number;
+    # normalise it before formatting or doing arithmetic.
+    baseline_score = _score_value(evaluate(baseline))
+    print(f"Baseline score: {baseline_score:.3f}")
+
+    # Optimized (if available)
+    if optimized_path and optimized_path.exists():
+        state = json.loads(optimized_path.read_text())
+        optimized = dspy.ChainOfThought(FrameRetrieval)
+        optimized.load_state(state["retrieval"]["state"])
+        optimized_score = _score_value(evaluate(optimized))
+        print(f"Optimized score: {optimized_score:.3f}")
+        delta = optimized_score - baseline_score
+        print(f"Delta: {delta:+.3f}")
+
+        # NOTE(review): confirm whether the evaluator reports a 0-1 fraction
+        # or a 0-100 percentage; the thresholds below are only meaningful
+        # relative to that scale.
+        if delta < -0.05:
+            print("REGRESSION DETECTED — optimized prompt is worse than baseline")
+            sys.exit(1)
+        elif delta > 0.02:
+            print("IMPROVEMENT — consider updating the production prompt")
+        else:
+            print("NO SIGNIFICANT CHANGE")
+    else:
+        print("No optimized state found — baseline only")
+
+    return baseline_score
+
+
+def main():
+    """Parse the command line and run the evaluation.
+
+    Options: ``--db`` (path to ``context.db``; located with ``find_db()``
+    when omitted), ``--model`` and ``--optimized`` (path to the saved
+    optimized state). The database is resolved first; the process then
+    exits with status 1 if ``ANTHROPIC_API_KEY`` is unset or empty.
+    """
+    cli = argparse.ArgumentParser(description="Evaluate StackMemory retrieval")
+    cli.add_argument("--db", type=str, help="Path to context.db")
+    cli.add_argument("--model", type=str, default="anthropic/claude-haiku-4-5-20251001")
+    cli.add_argument("--optimized", type=str, default="scripts/dspy/optimized_state.json")
+    opts = cli.parse_args()
+
+    if opts.db:
+        db_path = Path(opts.db)
+    else:
+        db_path = find_db()
+
+    # An explicitly empty --optimized value disables the comparison run.
+    optimized_path = None
+    if opts.optimized:
+        optimized_path = Path(opts.optimized)
+
+    api_key = os.environ.get("ANTHROPIC_API_KEY")
+    if not api_key:
+        print("ERROR: ANTHROPIC_API_KEY not set", file=sys.stderr)
+        sys.exit(1)
+
+    run_eval(db_path, opts.model, optimized_path)
+
+
+if __name__ == "__main__":
+    main()