From 22e65aeeb4c9c2efcb48ea873e44223957750f00 Mon Sep 17 00:00:00 2001 From: Lior Cohen Date: Thu, 9 Apr 2026 03:03:19 +0300 Subject: [PATCH 1/4] KS76: Universal reader prompt across all benchmark files - Replace per-file prompt variants with canonical READER_SYSTEM_PROMPT and READER_USER_TEMPLATE module-level constants - Add context fence (-----) separating memories from question - Add "not prior knowledge" constraint for grounded extraction - Add "Answer:" completion suffix for consistent output format - Remove hedge clause ("say you don't have that information") from v1 - Standardize v1 to temperature=0.0, num_predict=64 (matching v2) - All 3 files now use identical prompt text Files: run_longmemeval.py, run_longmemeval_v2.py, cross_model_smoke.py Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/cross_model_smoke.py | 29 ++++++++++++++++------ benchmarks/run_longmemeval.py | 41 ++++++++++++++++++-------------- benchmarks/run_longmemeval_v2.py | 36 +++++++++++++++++----------- 3 files changed, 67 insertions(+), 39 deletions(-) diff --git a/benchmarks/cross_model_smoke.py b/benchmarks/cross_model_smoke.py index 8d5c9e3..99ecf98 100644 --- a/benchmarks/cross_model_smoke.py +++ b/benchmarks/cross_model_smoke.py @@ -93,18 +93,33 @@ def format_context(echo_results): return "\n\n".join(kept) if kept else "No relevant memories found." +READER_SYSTEM_PROMPT = ( + "You are extracting facts from conversation memories. " + "The answer is contained in the memories below. " + "Focus on what the USER said — user statements contain personal facts. " + "Extract the specific answer. Respond in one short sentence." +) + +READER_USER_TEMPLATE = ( + "Context:\n" + "-----\n" + "{context}\n" + "-----\n" + "\n" + "Given only the context above and not prior knowledge, extract the answer.\n" + "Question: {question}\n" + "Answer:" +) + + def ask_ollama(question, context, model): - system = ("You are extracting facts from conversation memories. 
" - "The answer is contained in the memories below. " - "Focus on what the USER said -- user statements contain personal facts. " - "Extract the specific answer. Respond in one short sentence.") - user = f"{context}\n\nQuestion: {question}\nBased on the memories above, the answer is:" t0 = time.time() r = requests.post(f"{OLLAMA}/api/chat", json={ "model": model, "messages": [ - {"role": "system", "content": system}, - {"role": "user", "content": user}, + {"role": "system", "content": READER_SYSTEM_PROMPT}, + {"role": "user", "content": READER_USER_TEMPLATE.format( + context=context, question=question)}, ], "stream": False, "options": {"temperature": 0.0, "num_predict": 64}, diff --git a/benchmarks/run_longmemeval.py b/benchmarks/run_longmemeval.py index 0746af4..a5eb9cc 100644 --- a/benchmarks/run_longmemeval.py +++ b/benchmarks/run_longmemeval.py @@ -146,33 +146,38 @@ def truncate_context(context_parts, max_total=16000, max_per_item=3000): return truncated -def ask_ollama(question, context, model="gemma3:1b"): - """Ask Ollama to answer based on retrieved context.""" - system_prompt = ( - "You are answering questions about past conversations. " - "Use ONLY the retrieved conversation memories below to answer. " - "If the information is not in the memories, say you don't have that information. " - "Be concise and direct. Give short factual answers, ideally one sentence." - ) - - user_prompt = f"""Retrieved memories: +READER_SYSTEM_PROMPT = ( + "You are extracting facts from conversation memories. " + "The answer is contained in the memories below. " + "Focus on what the USER said — user statements contain personal facts. " + "Extract the specific answer. Respond in one short sentence." 
+) + +READER_USER_TEMPLATE = ( + "Context:\n" + "-----\n" + "{context}\n" + "-----\n" + "\n" + "Given only the context above and not prior knowledge, extract the answer.\n" + "Question: {question}\n" + "Answer:" +) -{context} - -Question: {question} - -Answer in one short sentence.""" +def ask_ollama(question, context, model="gemma3:1b"): + """Ask Ollama to answer based on retrieved context.""" r = requests.post( f"{OLLAMA_URL}/api/chat", json={ "model": model, "messages": [ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": user_prompt}, + {"role": "system", "content": READER_SYSTEM_PROMPT}, + {"role": "user", "content": READER_USER_TEMPLATE.format( + context=context, question=question)}, ], "stream": False, - "options": {"temperature": 0.1, "num_predict": 128}, + "options": {"temperature": 0.0, "num_predict": 64}, }, timeout=300, ) diff --git a/benchmarks/run_longmemeval_v2.py b/benchmarks/run_longmemeval_v2.py index 3facd24..a297723 100644 --- a/benchmarks/run_longmemeval_v2.py +++ b/benchmarks/run_longmemeval_v2.py @@ -191,27 +191,35 @@ def format_context(echo_results): # FIX 2: Extraction-focused prompt — no refusal, positive framing # --------------------------------------------------------------------------- -def ask_ollama(question, context, model="gemma3:1b"): - """Ask Ollama with extraction-focused prompt.""" - system_prompt = ( - "You are extracting facts from conversation memories. " - "The answer is contained in the memories below. " - "Focus on what the USER said — user statements contain personal facts. " - "Extract the specific answer. Respond in one short sentence." - ) +READER_SYSTEM_PROMPT = ( + "You are extracting facts from conversation memories. " + "The answer is contained in the memories below. " + "Focus on what the USER said — user statements contain personal facts. " + "Extract the specific answer. Respond in one short sentence." 
+) + +READER_USER_TEMPLATE = ( + "Context:\n" + "-----\n" + "{context}\n" + "-----\n" + "\n" + "Given only the context above and not prior knowledge, extract the answer.\n" + "Question: {question}\n" + "Answer:" +) - user_prompt = f"""{context} - -Question: {question} -Based on the memories above, the answer is:""" +def ask_ollama(question, context, model="gemma3:1b"): + """Ask Ollama with extraction-focused prompt.""" r = requests.post( f"{OLLAMA_URL}/api/chat", json={ "model": model, "messages": [ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": user_prompt}, + {"role": "system", "content": READER_SYSTEM_PROMPT}, + {"role": "user", "content": READER_USER_TEMPLATE.format( + context=context, question=question)}, ], "stream": False, "options": {"temperature": 0.0, "num_predict": 64}, From 7167496f072c67d51e36f5e2d60e37511cd92646 Mon Sep 17 00:00:00 2001 From: Lior Cohen Date: Thu, 9 Apr 2026 03:06:31 +0300 Subject: [PATCH 2/4] KS76: fix temporal boost keywords + increase magnitude to 0.08 + raise score cap - Replace inline TEMPORAL_KEYWORDS with shared TEMPORAL_QUERY_KEYWORDS from shrimpk_core (adds: recently, today, yesterday, last week/month/year, just now, this morning/week/month, days/weeks/months ago) - Increase temporal boost from +0.015 to +0.08 for meaningful ranking impact - Raise score inflation cap from 0.35 to 0.50 for temporal + importance headroom - Update existing test assertion to match new boost magnitude - Add temporal_boost_uses_shared_keywords test covering "recently" keyword Co-Authored-By: Claude Opus 4.6 (1M context) --- crates/shrimpk-memory/src/echo.rs | 53 +++++++++++++++++++------------ 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/crates/shrimpk-memory/src/echo.rs b/crates/shrimpk-memory/src/echo.rs index 9bafce4..3164c77 100644 --- a/crates/shrimpk-memory/src/echo.rs +++ b/crates/shrimpk-memory/src/echo.rs @@ -11,7 +11,7 @@ use shrimpk_core::{ EchoConfig, EchoResult, GraphCluster, GraphEdge, 
GraphInterEdge, GraphNeighbor, GraphNeighborsResult, GraphNode, GraphNodePreview, GraphOverviewResult, GraphSubgraphResult, LabelConnection, MemoryEntry, MemoryEntrySummary, MemoryGraphResult, MemoryId, MemoryStats, - Modality, QueryMode, Result, SensitivityLevel, ShrimPKError, + Modality, QueryMode, Result, SensitivityLevel, ShrimPKError, TEMPORAL_QUERY_KEYWORDS, }; use std::path::PathBuf; use std::sync::{Arc, Mutex}; @@ -1615,9 +1615,10 @@ impl EchoEngine { // 7c5. Career/intro adjustment (KS68 IE-1) career_intro_adjustment(&all_query_labels, &mut results); - // 7c6. Score inflation cap (KS69): prevent unbounded boost stacking + // 7c6. Score inflation cap (KS69, KS76 Track 3): prevent unbounded boost stacking + // Raised from 0.35 to 0.50 to give temporal + importance boosts headroom. for result in &mut results { - let max_allowed = result.similarity as f64 + 0.35; + let max_allowed = result.similarity as f64 + 0.50; if result.final_score > max_allowed { result.final_score = max_allowed; } @@ -3238,27 +3239,19 @@ fn co_occurrence_boost(content: &str) -> f64 { } } -/// Temporal query boost (KS68 TR-3): if the query contains temporal keywords, -/// boost results that have `temporal:*` labels by +0.015. +/// Temporal query boost (KS68 TR-3, KS76 Track 2): if the query contains temporal +/// keywords, boost results that have `temporal:*` labels by +0.08. +/// Uses the shared `TEMPORAL_QUERY_KEYWORDS` constant from `shrimpk_core`. 
fn apply_temporal_boost(query: &str, results: &mut [EchoResult]) { - const TEMPORAL_KEYWORDS: &[&str] = &[ - "deadline", - "upcoming", - "when", - "scheduled", - "date", - "due", - "plan", - "next week", - "next month", - ]; let query_lower = query.to_lowercase(); - let is_temporal_query = TEMPORAL_KEYWORDS.iter().any(|kw| query_lower.contains(kw)); + let is_temporal_query = TEMPORAL_QUERY_KEYWORDS + .iter() + .any(|kw| query_lower.contains(kw)); if is_temporal_query { for result in results.iter_mut() { let has_temporal_label = result.labels.iter().any(|l| l.starts_with("temporal:")); if has_temporal_label { - result.final_score += 0.015; + result.final_score += 0.08; } } } @@ -4291,8 +4284,8 @@ mod tests { ]; super::apply_temporal_boost("What upcoming deadlines does Sam have?", &mut results); assert!( - (results[0].final_score - 0.515).abs() < f64::EPSILON, - "Temporal result should be boosted to 0.515, got {}", + (results[0].final_score - 0.58).abs() < f64::EPSILON, + "Temporal result should be boosted to 0.58, got {}", results[0].final_score ); assert!( @@ -4858,4 +4851,24 @@ mod tests { super::deduplicate_parent_child(&mut results, &parent_map); assert_eq!(results.len(), 2, "No pairs — nothing removed"); } + + #[test] + fn temporal_boost_uses_shared_keywords() { + // Verify that apply_temporal_boost detects keywords from the shared constant + let mut results = vec![ + make_echo_result("Tokyo trip last week", 0.5, vec!["temporal:past".into()]), + make_echo_result("Favorite food is sushi", 0.5, vec!["topic:food".into()]), + ]; + // "recently" is in the shared constant but was NOT in the old inline list + super::apply_temporal_boost("Where did I travel recently?", &mut results); + assert!( + results[0].final_score > 0.5, + "Temporal result should be boosted for 'recently' query, got {}", + results[0].final_score + ); + assert!( + (results[1].final_score - 0.5).abs() < f64::EPSILON, + "Non-temporal result should not be boosted", + ); + } } From 
e5bfd4ac35972709045e401f1cae17f818f18063 Mon Sep 17 00:00:00 2001 From: Lior Cohen Date: Thu, 9 Apr 2026 03:39:18 +0300 Subject: [PATCH 3/4] Add viz & UI polish backlog items - Graph polish: view transitions, Louvain viz, edge labels, temporal slider, custom node shapes, entity super-nodes, echo-frequency sizing - Memory curation: inline edit, merge, manual links, retag, bulk ops - Export formats: per-memory JSON, GraphML/GEXF graph export --- BACKLOG.md | 147 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 BACKLOG.md diff --git a/BACKLOG.md b/BACKLOG.md new file mode 100644 index 0000000..8837bda --- /dev/null +++ b/BACKLOG.md @@ -0,0 +1,147 @@ +# ShrimPK Backlog + +Tracked items for the ShrimPK kernel project. Updated after each sprint. +Source of truth for the ShrimPK kernel project. + +## Status Legend +- **DONE** — shipped and tested +- **PLANNED** — scheduled for a specific sprint +- **BACKLOG** — accepted, not yet scheduled +- **RESEARCH** — needs investigation before scheduling + +--- + +## Sprint Roadmap (KS73-KS80) + +- [x] KS73: Entity unification — EntityFrame, UUID v5, alias store, entity supersession (PR #10) +- [ ] KS74: v0.8.0-beta — recall gap fixes (NR demotion, abstention threshold), TUI dashboard, README rewrite, installer testing +- [ ] KS75: Store-time contradiction detection +- [ ] KS76: Memory import — cold start solver, 4+ parsers (Claude Code, ChatGPT, Obsidian, Mem0) +- [ ] KS77: KU-3 fix + remaining recall fixes (90% gate) +- [ ] KS78: Public launch preparation +- [ ] KS79: Context compression — LLMLingua-2 ONNX at store time +- [ ] KS80: Retroactive link re-scoring + sleep replay + +--- + +## HIGH — Retrieval Quality + +*Components exist, need wiring. Validated by academic research.* + +- [ ] PPR-weighted Hebbian traversal — Personalized PageRank seeded on echo hits, weighted by edge strength x ACT-R. 
+20% multi-hop QA (HippoRAG, NeurIPS 2024) +- [ ] Multi-resolution retrieval fallback — memory → label cluster → community summary cascade. All three layers exist, not connected as fallback chain (RAPTOR, ICLR 2024) +- [ ] Retrieval mode parameter — expose naive/local/global/hybrid on `echo` API (LightRAG, EMNLP 2025) +- [ ] Citation-weighted memory scoring — track which injected memories LLM actually cites in response, upweight high-utility memories. Proxy already intercepts responses (RMM, ACL 2025) + +## HIGH — Memory Lifecycle + +- [ ] Merge operation — explicit ADD/UPDATE/DELETE/NOOP diff during consolidation. All production systems converge on merge as required (Mem0, RMM, Think-in-Memory) +- [ ] Multi-granularity storage — tag memories by scale: utterance/turn/session. +10% LME accuracy (RMM paper) +- [ ] Write-path learned filtering — decide what NOT to store before embedding. Most underresearched area per 2026 survey (arXiv 2603.07670) +- [ ] Soft-deletion compaction — GC when FSRS strength drops below threshold. Currently decay only de-ranks, never removes (MemoryBank pattern) + +## HIGH — Cortex Prerequisites (blocking v0.10.0) + +- [ ] Inter-layer protocol design — Soul ↔ Brain ↔ Memory API surface. Direct Rust calls vs Tokio channels vs message types +- [ ] Security model for agentic stack — data safety layer, poisoned memory detection. Distinct from command-level Brainstem +- [ ] Alpha/Beta ARC competition model — formal design doc. Async parallelism, leader election, Adaptive Resonance Theory mapping + +## MEDIUM — Model & Format Upgrades + +- [ ] Nomic Embed Vision v1.5 — CLIP ViT-B/32 → Nomic, +7.8pp ImageNet zero-shot, 6x smaller ONNX. Breaking: 512→768 dim migration +- [ ] f16 quantization for vision/speech — SHRM v3, ~50% disk/RAM savings, f32 promotion at query time +- [ ] Band-limited resampling — replace resample_linear() with rubato crate. 
Correctness bug: aliasing at 48→16kHz +- [ ] BuiltinConsolidator — bundled extraction model, zero Ollama dependency for consolidation quality +- [x] Configurable embedding provider — EmbeddingProvider trait, 10 fastembed models + OpenAI API (KS75 — DONE) + +## MEDIUM — Graph & Entity + +- [ ] Retroactive link invalidation — when A supersedes B, downweight ALL B-anchored Hebbian links, not just B itself (A-MEM/Zettelkasten pattern) +- [ ] Episodic anchoring — bidirectional indices linking Hebbian edges back to source episodes (Graphiti/Zep pattern) +- [ ] Entity-cluster summaries — entity-level community nodes, not just label-level (Graphiti temporal KG) + +## MEDIUM — Viz & UI Polish + +*Current state: Tauri 2 + Sigma.js 3.0 + ForceAtlas2, 3-level zoom (KS65-66). Functional but early MVP.* + +**Graph Polish:** +- [ ] Smooth view transitions — animated node repositioning between galaxy/cluster/neighborhood (currently hard-resets layout) +- [ ] Louvain community visualization — color nodes by community, show boundaries (graphology-communities-louvain installed, unused) +- [ ] Edge labels on hover — show typed relationship (CoActivation, WorksAt, PrefersTool, etc.) 
+- [ ] Temporal slider — filter graph by time range, animate memory formation over time +- [ ] Custom node shapes per category — distinct shapes for Identity/Fact/Preference/ActiveProject/Conversation +- [ ] Entity super-nodes — render EntityFrame nodes at graph level, not just label clusters +- [ ] Node size by echo frequency — proportional to retrieval count, not just importance score + +**Memory Curation:** +- [ ] Inline memory edit — edit content/labels from detail panel, PATCH endpoint on daemon +- [ ] Memory merge — select 2+ nodes, merge into one (new daemon endpoint) +- [ ] Manual link creation — create Hebbian edges from graph view (new daemon endpoint) +- [ ] Retag from graph — drag-drop between clusters or multi-select retag +- [ ] Bulk operations — multi-select for delete/retag/export + +**Export Formats:** +- [ ] JSON export per memory — full metadata + embeddings + graph edges +- [ ] Graph export — GraphML/GEXF for external visualization tools + +## MEDIUM — Quantization (v0.8.0) + +- [ ] Int8 scalar quantization (4x compression, simsimd ready) +- [ ] TurboQuant integration (turbo-quant crate, 8-10x) +- [ ] Binary + float32 rescore pipeline + +## MEDIUM — Intelligence Tuning + +- [ ] Full ACT-R retrieval history (Vec ring buffer) +- [ ] ACT-R activation ON by default (after benchmarking) +- [ ] Three-tier store (hot/warm/cold) +- [ ] Importance retrieval boost (A/B test, then enable) + +## MEDIUM — Product & Distribution + +- [ ] Memory file export as .md sidecars — per-memory files with YAML frontmatter (distinct from bulk `shrimpk dump`) +- [ ] Cloud sync — encrypted cross-device memory, E2E encrypted, server sees only ciphertext +- [ ] Managed API planning +- [ ] Revenue model implementation + +## MEDIUM — Benchmarks Not Yet Running + +- [ ] LoCoMo benchmark +- [ ] MemoryAgentBench (ICLR 2026) — contradiction/conflict resolution focus +- [ ] EverMemBench (2025) — entity disambiguation focus + +## LOW — Backlog + +- [ ] Memory as weights prototype 
(PyTorch via shrimpk-python) +- [ ] Cluster summary tree (MemTree pattern) +- [ ] Custom fine-tuned embedding model +- [ ] crates.io publish (after API stabilizes) +- [ ] Code signing certificate +- [ ] PostToolUse async hook +- [ ] Predictive coding layer — surprise/prediction error signal (~300 lines Rust) +- [ ] Session-level dynamics tracking (COMEDY pattern — user-bot relationship) +- [ ] Emotion channel — Apache 2.0 ONNX model needed (slot reserved in SHRM) +- [ ] CAM++ speaker model upgrade — needs Apache 2.0 ONNX verification +- [ ] SigLIP 2 vision model — needs upstream ONNX availability + +## RESEARCH — Long-horizon + +- [ ] Causal retrieval — retrieve by causal relevance, not just similarity (2026 survey frontier) +- [ ] Model weight printing — cross-model knowledge transfer via externalized Hebbian weights +- [ ] PyTorch cross-attention memory module — ShrimPK as transformer memory (v1.0+ ML stage) +- [ ] GAAMA paper (arXiv 2603.27910) — concept-mediated KG with 4 node types, very close to ShrimPK architecture +- [ ] Reflexion pattern — self-improvement via failure memories (Shinn et al. 
2023) +- [ ] Interleaved replay during sleep consolidation — novel-familiar mixing (neuroscience pattern) +- [ ] EWC (Elastic Weight Consolidation) — prevent catastrophic forgetting in Hebbian updates (Nature Comms 2025) + +--- + +## Sync Issues (fix before next release) + +- [ ] `docs/ROADMAP.md` stale at v0.5.0 — update to reflect v0.7.5 state +- [ ] `CHANGELOG.md` stops at v0.7.0 — missing v0.7.1 through v0.7.5 +- [ ] MCP tool count inconsistent across docs (12 vs 14) + +--- + +*Last updated: 2026-04-09* From 88ff25efb108558d6717819056d9f43b92b66f19 Mon Sep 17 00:00:00 2001 From: Lior Cohen Date: Thu, 9 Apr 2026 03:44:56 +0300 Subject: [PATCH 4/4] KS76: add shared TEMPORAL_QUERY_KEYWORDS constant + importance_weight 0.25 - Add TEMPORAL_QUERY_KEYWORDS (22 keywords) to shrimpk-core/config.rs as single source of truth for echo scoring and reformulation - Re-export from shrimpk_core lib.rs - Change importance_weight default from 0.0 to 0.25 so importance scoring contributes to final ranking out of the box Co-Authored-By: Claude Opus 4.6 (1M context) --- crates/shrimpk-core/src/config.rs | 31 ++++++++++++++++++++++++++++++- crates/shrimpk-core/src/lib.rs | 5 +++-- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/crates/shrimpk-core/src/config.rs b/crates/shrimpk-core/src/config.rs index 80b1521..a84db27 100644 --- a/crates/shrimpk-core/src/config.rs +++ b/crates/shrimpk-core/src/config.rs @@ -9,6 +9,35 @@ use std::path::PathBuf; /// Default maximum disk usage: 2 GB. const DEFAULT_MAX_DISK_BYTES: u64 = 2_147_483_648; +/// Temporal query keywords shared between echo scoring (`apply_temporal_boost`) +/// and memory reformulation (`detect_temporal_keyword`). +/// +/// Single source of truth to prevent keyword vocabulary mismatch (KS76 Track 2). 
+pub const TEMPORAL_QUERY_KEYWORDS: &[&str] = &[ + "yesterday", + "today", + "last week", + "last month", + "last year", + "recently", + "just now", + "this morning", + "this week", + "this month", + "days ago", + "weeks ago", + "months ago", + "deadline", + "upcoming", + "when", + "scheduled", + "date", + "due", + "plan", + "next week", + "next month", +]; + /// Reranker backend for echo result reranking. #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] pub enum RerankerBackend { @@ -458,7 +487,7 @@ impl Default for EchoConfig { use_power_law_decay: default_true(), use_importance: default_true(), activation_weight: default_activation_weight(), - importance_weight: 0.0, + importance_weight: 0.25, use_full_actr_history: false, community_summaries_enabled: default_true(), community_summary_threshold: default_community_summary_threshold(), diff --git a/crates/shrimpk-core/src/lib.rs b/crates/shrimpk-core/src/lib.rs index 0dd9f8c..7794335 100644 --- a/crates/shrimpk-core/src/lib.rs +++ b/crates/shrimpk-core/src/lib.rs @@ -13,8 +13,9 @@ pub mod traits; // Re-export commonly used types at crate root pub use config::{ - EchoConfig, EmbeddingBackend, FileConfig, QuantizationMode, RerankerBackend, config_dir, - config_path, disk_usage, load_config_file, resolve_config, save_config_file, + EchoConfig, EmbeddingBackend, FileConfig, QuantizationMode, RerankerBackend, + TEMPORAL_QUERY_KEYWORDS, config_dir, config_path, disk_usage, load_config_file, resolve_config, + save_config_file, }; pub use entity::{EntityFrame, EntityId, EntityKind}; pub use error::{Result, ShrimPKError};