From 44202eca386cc8df66e11158304589c47eecde32 Mon Sep 17 00:00:00 2001 From: matthew-pilot Date: Thu, 28 May 2026 07:18:39 +0000 Subject: [PATCH] fix: wrap L1 caches in bounded LRU to prevent unbounded memory growth (PILOT-106) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace unbounded map[string]... caches in paraphraser and hydePassager with a generic, thread-safe lruCache that enforces a hard 10,000-entry cap. LRU eviction prevents monotonic memory growth from unique queries. - New file: internal/server/lru.go — generic LRU with optional TTL - paraphraser: map[string][]string → lruCache[[]string], cap 10k - hydePassager: map[string]string → lruCache[string], cap 10k - Removes sync.Mutex from both structs (LRU handles its own locking) Fixes PILOT-106. --- internal/server/http.go | 27 +++++------- internal/server/hyde.go | 29 ++++++------- internal/server/lru.go | 95 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 118 insertions(+), 33 deletions(-) create mode 100644 internal/server/lru.go diff --git a/internal/server/http.go b/internal/server/http.go index dd46ad9..35e041f 100644 --- a/internal/server/http.go +++ b/internal/server/http.go @@ -21,7 +21,6 @@ import ( "sort" "strconv" "strings" - "sync" "time" "github.com/pilot-protocol/cosift/internal/embed" @@ -232,41 +231,39 @@ func (s *Server) WithFetcher(f FetchFn) *Server { // post-restart server) skip the LLM call on popular queries. // // Lookup order: in-memory L1 → store L2 → LLM. Save order: LLM → store + L1. +// paraphraserL1Max is the hard cap on the in-memory L1 paraphrase cache. +// Beyond this, LRU eviction kicks in to prevent unbounded memory growth. +const paraphraserL1Max = 10_000 + type paraphraser struct { chat embed.ChatClient n int - store *store.Store // optional L2; nil = L1-only - metrics *Metrics // optional; nil disables cache observability - mu sync.Mutex - cache map[string][]string // L1 key: model + "\x00" + query + store *store.Store // optional L2; nil = L1-only + metrics *Metrics // optional; nil disables cache observability + cache *lruCache[[]string] // L1 key: model + "\x00" + query } func newParaphraser(chat embed.ChatClient, n int, s *store.Store, m *Metrics) *paraphraser { if n <= 0 { n = 2 } - return ¶phraser{chat: chat, n: n, store: s, metrics: m, cache: make(map[string][]string)} + return ¶phraser{chat: chat, n: n, store: s, metrics: m, cache: newLRU[[]string](paraphraserL1Max, 0)} } // generate returns N paraphrases, with the L1+L2 cache pattern above. func (p *paraphraser) generate(ctx context.Context, q string) []string { key := p.chat.Model() + "\x00" + q - p.mu.Lock() - if cached, ok := p.cache[key]; ok { - p.mu.Unlock() + if cached, ok := p.cache.get(key); ok { if p.metrics != nil { p.metrics.RecordParaphraseL1Hit() } return cached } - p.mu.Unlock() // L2: SQLite-persisted cache. if p.store != nil { if cached, err := p.store.GetParaphrases(ctx, p.chat.Model(), q); err == nil && len(cached) > 0 { - p.mu.Lock() - p.cache[key] = cached - p.mu.Unlock() + p.cache.set(key, cached) if p.metrics != nil { p.metrics.RecordParaphraseL2Hit() } @@ -307,9 +304,7 @@ Example output for "go programming language": ["golang concurrent compiled langu if len(arr) > p.n { arr = arr[:p.n] } - p.mu.Lock() - p.cache[key] = arr - p.mu.Unlock() + p.cache.set(key, arr) if p.store != nil { _ = p.store.SaveParaphrases(ctx, p.chat.Model(), q, arr) // best-effort } diff --git a/internal/server/hyde.go b/internal/server/hyde.go index 7dbbe12..7492b92 100644 --- a/internal/server/hyde.go +++ b/internal/server/hyde.go @@ -3,7 +3,6 @@ package server import ( "context" "strings" - "sync" "github.com/pilot-protocol/cosift/internal/embed" "github.com/pilot-protocol/cosift/internal/store" @@ -25,17 +24,20 @@ import ( const hydeSystemPrompt = `Write a brief, factual passage (2-4 sentences) that would directly answer the user's question. Output ONLY the passage — no preamble, no commentary, no apology if you're uncertain. If the question is ambiguous, pick the most plausible interpretation and answer that. The passage doesn't need to be true; it needs to be the SHAPE of what a relevant document would say. Embedding this passage and searching by its vector will find documents that look like real answers, even if the user's original query was just a few keywords.` +// hydeL1Max is the hard cap on the in-memory L1 HyDE passage cache. +// Beyond this, LRU eviction kicks in to prevent unbounded memory growth. +const hydeL1Max = 10_000 + // hydePassager owns the chat client + 2-level cache. type hydePassager struct { chat embed.ChatClient - store *store.Store // optional L2; nil = L1-only - metrics *Metrics // optional cache observability - mu sync.Mutex - cache map[string]string // L1 key: model + "\x00" + query + store *store.Store // optional L2; nil = L1-only + metrics *Metrics // optional cache observability + cache *lruCache[string] // L1 key: model + "\x00" + query } func newHydePassager(chat embed.ChatClient, s *store.Store, m *Metrics) *hydePassager { - return &hydePassager{chat: chat, store: s, metrics: m, cache: make(map[string]string)} + return &hydePassager{chat: chat, store: s, metrics: m, cache: newLRU[string](hydeL1Max, 0)} } // Passage returns the hypothetical-answer text for the query, threading the @@ -47,23 +49,18 @@ func (p *hydePassager) Passage(ctx context.Context, q string) string { } key := p.chat.Model() + "\x00" + q - // L1: in-memory. - p.mu.Lock() - if cached, ok := p.cache[key]; ok { - p.mu.Unlock() + // L1: in-memory LRU. + if cached, ok := p.cache.get(key); ok { if p.metrics != nil { p.metrics.RecordHyDEL1Hit() } return cached } - p.mu.Unlock() // L2: SQLite. if p.store != nil { if cached, err := p.store.GetHyDE(ctx, p.chat.Model(), q); err == nil && cached != "" { - p.mu.Lock() - p.cache[key] = cached - p.mu.Unlock() + p.cache.set(key, cached) if p.metrics != nil { p.metrics.RecordHyDEL2Hit() } @@ -88,9 +85,7 @@ func (p *hydePassager) Passage(ctx context.Context, q string) string { } // Save both levels. - p.mu.Lock() - p.cache[key] = passage - p.mu.Unlock() + p.cache.set(key, passage) if p.store != nil { _ = p.store.SaveHyDE(ctx, p.chat.Model(), q, passage) // best-effort } diff --git a/internal/server/lru.go b/internal/server/lru.go new file mode 100644 index 0000000..56beda7 --- /dev/null +++ b/internal/server/lru.go @@ -0,0 +1,95 @@ +package server + +import ( + "container/list" + "sync" + "time" +) + +// lruEntry is one slot in the LRU eviction list. +type lruEntry[V any] struct { + key string + value V + expires time.Time // zero = no TTL +} + +// lruCache is a thread-safe, bounded LRU cache with optional per-entry TTL. +// When the cache exceeds maxSize, the least-recently-used entry is evicted. +// A zero TTL disables time-based expiry. +type lruCache[V any] struct { + mu sync.Mutex + maxSize int + ttl time.Duration + items map[string]*list.Element + evictList *list.List +} + +func newLRU[V any](maxSize int, ttl time.Duration) *lruCache[V] { + if maxSize <= 0 { + maxSize = 1 + } + return &lruCache[V]{ + maxSize: maxSize, + ttl: ttl, + items: make(map[string]*list.Element, maxSize), + evictList: list.New(), + } +} + +// get returns the value for key and true on hit. On miss or expired entry, +// returns the zero value and false. A hit promotes the entry to MRU. +func (c *lruCache[V]) get(key string) (V, bool) { + c.mu.Lock() + defer c.mu.Unlock() + elem, ok := c.items[key] + if !ok { + var zero V + return zero, false + } + entry := elem.Value.(*lruEntry[V]) + if c.ttl > 0 && !entry.expires.IsZero() && time.Now().After(entry.expires) { + c.evictList.Remove(elem) + delete(c.items, key) + var zero V + return zero, false + } + c.evictList.MoveToFront(elem) + return entry.value, true +} + +// set inserts or updates key→value and promotes the entry to MRU. If the +// cache is at capacity, the LRU entry is evicted. +func (c *lruCache[V]) set(key string, value V) { + c.mu.Lock() + defer c.mu.Unlock() + if elem, ok := c.items[key]; ok { + c.evictList.MoveToFront(elem) + entry := elem.Value.(*lruEntry[V]) + entry.value = value + if c.ttl > 0 { + entry.expires = time.Now().Add(c.ttl) + } + return + } + entry := &lruEntry[V]{key: key, value: value} + if c.ttl > 0 { + entry.expires = time.Now().Add(c.ttl) + } + elem := c.evictList.PushFront(entry) + c.items[key] = elem + for c.evictList.Len() > c.maxSize { + oldest := c.evictList.Back() + if oldest == nil { + break + } + c.evictList.Remove(oldest) + delete(c.items, oldest.Value.(*lruEntry[V]).key) + } +} + +// len returns the current number of entries. +func (c *lruCache[V]) len() int { + c.mu.Lock() + defer c.mu.Unlock() + return c.evictList.Len() +}