pilot-protocol · TeoSlayer · May 28, 2026 · May 28, 2026
diff --git a/internal/server/http.go b/internal/server/http.go
@@ -21,7 +21,6 @@ import (
 	"sort"
 	"strconv"
 	"strings"
-	"sync"
 	"time"
 
 	"github.com/pilot-protocol/cosift/internal/embed"
@@ -232,41 +231,39 @@ func (s *Server) WithFetcher(f FetchFn) *Server {
 // post-restart server) skip the LLM call on popular queries.
 //
 // Lookup order: in-memory L1 → store L2 → LLM. Save order: LLM → store + L1.
+// paraphraserL1Max is the hard cap on the in-memory L1 paraphrase cache.
+// Beyond this, LRU eviction kicks in to prevent unbounded memory growth.
+const paraphraserL1Max = 10_000
+
 type paraphraser struct {
 	chat    embed.ChatClient
 	n       int
-	store   *store.Store // optional L2; nil = L1-only
-	metrics *Metrics     // optional; nil disables cache observability
-	mu      sync.Mutex
-	cache   map[string][]string // L1 key: model + "\x00" + query
+	store   *store.Store          // optional L2; nil = L1-only
+	metrics *Metrics              // optional; nil disables cache observability
+	cache   *lruCache[[]string]   // L1 key: model + "\x00" + query
 }
 
 func newParaphraser(chat embed.ChatClient, n int, s *store.Store, m *Metrics) *paraphraser {
 	if n <= 0 {
 		n = 2
 	}
-	return &paraphraser{chat: chat, n: n, store: s, metrics: m, cache: make(map[string][]string)}
+	return &paraphraser{chat: chat, n: n, store: s, metrics: m, cache: newLRU[[]string](paraphraserL1Max, 0)}
 }
 
 // generate returns N paraphrases, with the L1+L2 cache pattern above.
 func (p *paraphraser) generate(ctx context.Context, q string) []string {
 	key := p.chat.Model() + "\x00" + q
-	p.mu.Lock()
-	if cached, ok := p.cache[key]; ok {
-		p.mu.Unlock()
+	if cached, ok := p.cache.get(key); ok {
 		if p.metrics != nil {
 			p.metrics.RecordParaphraseL1Hit()
 		}
 		return cached
 	}
-	p.mu.Unlock()
 
 	// L2: SQLite-persisted cache.
 	if p.store != nil {
 		if cached, err := p.store.GetParaphrases(ctx, p.chat.Model(), q); err == nil && len(cached) > 0 {
-			p.mu.Lock()
-			p.cache[key] = cached
-			p.mu.Unlock()
+			p.cache.set(key, cached)
 			if p.metrics != nil {
 				p.metrics.RecordParaphraseL2Hit()
 			}
@@ -307,9 +304,7 @@ Example output for "go programming language": ["golang concurrent compiled langu
 	if len(arr) > p.n {
 		arr = arr[:p.n]
 	}
-	p.mu.Lock()
-	p.cache[key] = arr
-	p.mu.Unlock()
+	p.cache.set(key, arr)
 	if p.store != nil {
 		_ = p.store.SaveParaphrases(ctx, p.chat.Model(), q, arr) // best-effort
 	}

diff --git a/internal/server/hyde.go b/internal/server/hyde.go
@@ -3,7 +3,6 @@ package server
 import (
 	"context"
 	"strings"
-	"sync"
 
 	"github.com/pilot-protocol/cosift/internal/embed"
 	"github.com/pilot-protocol/cosift/internal/store"
@@ -25,17 +24,20 @@ import (
 
 const hydeSystemPrompt = `Write a brief, factual passage (2-4 sentences) that would directly answer the user's question. Output ONLY the passage — no preamble, no commentary, no apology if you're uncertain. If the question is ambiguous, pick the most plausible interpretation and answer that. The passage doesn't need to be true; it needs to be the SHAPE of what a relevant document would say. Embedding this passage and searching by its vector will find documents that look like real answers, even if the user's original query was just a few keywords.`
 
+// hydeL1Max is the hard cap on the in-memory L1 HyDE passage cache.
+// Beyond this, LRU eviction kicks in to prevent unbounded memory growth.
+const hydeL1Max = 10_000
+
 // hydePassager owns the chat client + 2-level cache.
 type hydePassager struct {
 	chat    embed.ChatClient
-	store   *store.Store // optional L2; nil = L1-only
-	metrics *Metrics     // optional cache observability
-	mu      sync.Mutex
-	cache   map[string]string // L1 key: model + "\x00" + query
+	store   *store.Store       // optional L2; nil = L1-only
+	metrics *Metrics           // optional cache observability
+	cache   *lruCache[string]  // L1 key: model + "\x00" + query
 }
 
 func newHydePassager(chat embed.ChatClient, s *store.Store, m *Metrics) *hydePassager {
-	return &hydePassager{chat: chat, store: s, metrics: m, cache: make(map[string]string)}
+	return &hydePassager{chat: chat, store: s, metrics: m, cache: newLRU[string](hydeL1Max, 0)}
 }
 
 // Passage returns the hypothetical-answer text for the query, threading the
@@ -47,23 +49,18 @@ func (p *hydePassager) Passage(ctx context.Context, q string) string {
 	}
 	key := p.chat.Model() + "\x00" + q
 
-	// L1: in-memory.
-	p.mu.Lock()
-	if cached, ok := p.cache[key]; ok {
-		p.mu.Unlock()
+	// L1: in-memory LRU.
+	if cached, ok := p.cache.get(key); ok {
 		if p.metrics != nil {
 			p.metrics.RecordHyDEL1Hit()
 		}
 		return cached
 	}
-	p.mu.Unlock()
 
 	// L2: SQLite.
 	if p.store != nil {
 		if cached, err := p.store.GetHyDE(ctx, p.chat.Model(), q); err == nil && cached != "" {
-			p.mu.Lock()
-			p.cache[key] = cached
-			p.mu.Unlock()
+			p.cache.set(key, cached)
 			if p.metrics != nil {
 				p.metrics.RecordHyDEL2Hit()
 			}
@@ -88,9 +85,7 @@ func (p *hydePassager) Passage(ctx context.Context, q string) string {
 	}
 
 	// Save both levels.
-	p.mu.Lock()
-	p.cache[key] = passage
-	p.mu.Unlock()
+	p.cache.set(key, passage)
 	if p.store != nil {
 		_ = p.store.SaveHyDE(ctx, p.chat.Model(), q, passage) // best-effort
 	}

diff --git a/internal/server/lru.go b/internal/server/lru.go
@@ -0,0 +1,95 @@
+package server
+
+import (
+	"container/list"
+	"sync"
+	"time"
+)
+
+// lruEntry is one slot in the LRU eviction list.
+type lruEntry[V any] struct {
+	key     string
+	value   V
+	expires time.Time // zero = no TTL
+}
+
+// lruCache is a thread-safe, bounded LRU cache with optional per-entry TTL.
+// When the cache exceeds maxSize, the least-recently-used entry is evicted.
+// A zero TTL disables time-based expiry.
+type lruCache[V any] struct {
+	mu        sync.Mutex
+	maxSize   int
+	ttl       time.Duration
+	items     map[string]*list.Element
+	evictList *list.List
+}
+
+func newLRU[V any](maxSize int, ttl time.Duration) *lruCache[V] {
+	if maxSize <= 0 {
+		maxSize = 1
+	}
+	return &lruCache[V]{
+		maxSize:   maxSize,
+		ttl:       ttl,
+		items:     make(map[string]*list.Element, maxSize),
+		evictList: list.New(),
+	}
+}
+
+// get returns the value for key and true on hit. On miss or expired entry,
+// returns the zero value and false. A hit promotes the entry to MRU.
+func (c *lruCache[V]) get(key string) (V, bool) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	elem, ok := c.items[key]
+	if !ok {
+		var zero V
+		return zero, false
+	}
+	entry := elem.Value.(*lruEntry[V])
+	if c.ttl > 0 && !entry.expires.IsZero() && time.Now().After(entry.expires) {
+		c.evictList.Remove(elem)
+		delete(c.items, key)
+		var zero V
+		return zero, false
+	}
+	c.evictList.MoveToFront(elem)
+	return entry.value, true
+}
+
+// set inserts or updates key→value and promotes the entry to MRU. If the
+// cache is at capacity, the LRU entry is evicted.
+func (c *lruCache[V]) set(key string, value V) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	if elem, ok := c.items[key]; ok {
+		c.evictList.MoveToFront(elem)
+		entry := elem.Value.(*lruEntry[V])
+		entry.value = value
+		if c.ttl > 0 {
+			entry.expires = time.Now().Add(c.ttl)
+		}
+		return
+	}
+	entry := &lruEntry[V]{key: key, value: value}
+	if c.ttl > 0 {
+		entry.expires = time.Now().Add(c.ttl)
+	}
+	elem := c.evictList.PushFront(entry)
+	c.items[key] = elem
+	for c.evictList.Len() > c.maxSize {
+		oldest := c.evictList.Back()
+		if oldest == nil {
+			break
+		}
+		c.evictList.Remove(oldest)
+		delete(c.items, oldest.Value.(*lruEntry[V]).key)
+	}
+}
+
+// len returns the current number of entries.
+func (c *lruCache[V]) len() int {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	return c.evictList.Len()
+}