Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 11 additions & 16 deletions internal/server/http.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ import (
"sort"
"strconv"
"strings"
"sync"
"time"

"github.com/pilot-protocol/cosift/internal/embed"
Expand Down Expand Up @@ -232,41 +231,39 @@ func (s *Server) WithFetcher(f FetchFn) *Server {
// post-restart server) skip the LLM call on popular queries.
//
// Lookup order: in-memory L1 → store L2 → LLM. Save order: LLM → store + L1.
// paraphraserL1Max is the hard cap on the in-memory L1 paraphrase cache.
// Beyond this, LRU eviction kicks in to prevent unbounded memory growth.
const paraphraserL1Max = 10_000

type paraphraser struct {
chat embed.ChatClient
n int
store *store.Store // optional L2; nil = L1-only
metrics *Metrics // optional; nil disables cache observability
mu sync.Mutex
cache map[string][]string // L1 key: model + "\x00" + query
store *store.Store // optional L2; nil = L1-only
metrics *Metrics // optional; nil disables cache observability
cache *lruCache[[]string] // L1 key: model + "\x00" + query
}

func newParaphraser(chat embed.ChatClient, n int, s *store.Store, m *Metrics) *paraphraser {
if n <= 0 {
n = 2
}
return &paraphraser{chat: chat, n: n, store: s, metrics: m, cache: make(map[string][]string)}
return &paraphraser{chat: chat, n: n, store: s, metrics: m, cache: newLRU[[]string](paraphraserL1Max, 0)}
}

// generate returns N paraphrases, with the L1+L2 cache pattern above.
func (p *paraphraser) generate(ctx context.Context, q string) []string {
key := p.chat.Model() + "\x00" + q
p.mu.Lock()
if cached, ok := p.cache[key]; ok {
p.mu.Unlock()
if cached, ok := p.cache.get(key); ok {
if p.metrics != nil {
p.metrics.RecordParaphraseL1Hit()
}
return cached
}
p.mu.Unlock()

// L2: SQLite-persisted cache.
if p.store != nil {
if cached, err := p.store.GetParaphrases(ctx, p.chat.Model(), q); err == nil && len(cached) > 0 {
p.mu.Lock()
p.cache[key] = cached
p.mu.Unlock()
p.cache.set(key, cached)
if p.metrics != nil {
p.metrics.RecordParaphraseL2Hit()
}
Expand Down Expand Up @@ -307,9 +304,7 @@ Example output for "go programming language": ["golang concurrent compiled langu
if len(arr) > p.n {
arr = arr[:p.n]
}
p.mu.Lock()
p.cache[key] = arr
p.mu.Unlock()
p.cache.set(key, arr)
if p.store != nil {
_ = p.store.SaveParaphrases(ctx, p.chat.Model(), q, arr) // best-effort
}
Expand Down
29 changes: 12 additions & 17 deletions internal/server/hyde.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package server
import (
"context"
"strings"
"sync"

"github.com/pilot-protocol/cosift/internal/embed"
"github.com/pilot-protocol/cosift/internal/store"
Expand All @@ -25,17 +24,20 @@ import (

const hydeSystemPrompt = `Write a brief, factual passage (2-4 sentences) that would directly answer the user's question. Output ONLY the passage — no preamble, no commentary, no apology if you're uncertain. If the question is ambiguous, pick the most plausible interpretation and answer that. The passage doesn't need to be true; it needs to be the SHAPE of what a relevant document would say. Embedding this passage and searching by its vector will find documents that look like real answers, even if the user's original query was just a few keywords.`

// hydeL1Max is the hard cap on the in-memory L1 HyDE passage cache.
// Beyond this, LRU eviction kicks in to prevent unbounded memory growth.
const hydeL1Max = 10_000

// hydePassager owns the chat client + 2-level cache.
type hydePassager struct {
chat embed.ChatClient
store *store.Store // optional L2; nil = L1-only
metrics *Metrics // optional cache observability
mu sync.Mutex
cache map[string]string // L1 key: model + "\x00" + query
store *store.Store // optional L2; nil = L1-only
metrics *Metrics // optional cache observability
cache *lruCache[string] // L1 key: model + "\x00" + query
}

func newHydePassager(chat embed.ChatClient, s *store.Store, m *Metrics) *hydePassager {
return &hydePassager{chat: chat, store: s, metrics: m, cache: make(map[string]string)}
return &hydePassager{chat: chat, store: s, metrics: m, cache: newLRU[string](hydeL1Max, 0)}
}

// Passage returns the hypothetical-answer text for the query, threading the
Expand All @@ -47,23 +49,18 @@ func (p *hydePassager) Passage(ctx context.Context, q string) string {
}
key := p.chat.Model() + "\x00" + q

// L1: in-memory.
p.mu.Lock()
if cached, ok := p.cache[key]; ok {
p.mu.Unlock()
// L1: in-memory LRU.
if cached, ok := p.cache.get(key); ok {
if p.metrics != nil {
p.metrics.RecordHyDEL1Hit()
}
return cached
}
p.mu.Unlock()

// L2: SQLite.
if p.store != nil {
if cached, err := p.store.GetHyDE(ctx, p.chat.Model(), q); err == nil && cached != "" {
p.mu.Lock()
p.cache[key] = cached
p.mu.Unlock()
p.cache.set(key, cached)
if p.metrics != nil {
p.metrics.RecordHyDEL2Hit()
}
Expand All @@ -88,9 +85,7 @@ func (p *hydePassager) Passage(ctx context.Context, q string) string {
}

// Save both levels.
p.mu.Lock()
p.cache[key] = passage
p.mu.Unlock()
p.cache.set(key, passage)
if p.store != nil {
_ = p.store.SaveHyDE(ctx, p.chat.Model(), q, passage) // best-effort
}
Expand Down
95 changes: 95 additions & 0 deletions internal/server/lru.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package server

import (
"container/list"
"sync"
"time"
)

// lruEntry is one slot in the LRU eviction list.
type lruEntry[V any] struct {
key string
value V
expires time.Time // zero = no TTL
}

// lruCache is a thread-safe, bounded LRU cache with optional per-entry TTL.
// When the cache exceeds maxSize, the least-recently-used entry is evicted.
// A zero TTL disables time-based expiry.
type lruCache[V any] struct {
mu sync.Mutex
maxSize int
ttl time.Duration
items map[string]*list.Element
evictList *list.List
}

func newLRU[V any](maxSize int, ttl time.Duration) *lruCache[V] {
if maxSize <= 0 {
maxSize = 1
}
return &lruCache[V]{
maxSize: maxSize,
ttl: ttl,
items: make(map[string]*list.Element, maxSize),
evictList: list.New(),
}
}

// get returns the value for key and true on hit. On miss or expired entry,
// returns the zero value and false. A hit promotes the entry to MRU.
func (c *lruCache[V]) get(key string) (V, bool) {
c.mu.Lock()
defer c.mu.Unlock()
elem, ok := c.items[key]
if !ok {
var zero V
return zero, false
}
entry := elem.Value.(*lruEntry[V])
if c.ttl > 0 && !entry.expires.IsZero() && time.Now().After(entry.expires) {
c.evictList.Remove(elem)
delete(c.items, key)
var zero V
return zero, false
}
c.evictList.MoveToFront(elem)
return entry.value, true
}

// set inserts or updates key→value and promotes the entry to MRU. If the
// cache is at capacity, the LRU entry is evicted.
func (c *lruCache[V]) set(key string, value V) {
c.mu.Lock()
defer c.mu.Unlock()
if elem, ok := c.items[key]; ok {
c.evictList.MoveToFront(elem)
entry := elem.Value.(*lruEntry[V])
entry.value = value
if c.ttl > 0 {
entry.expires = time.Now().Add(c.ttl)
}
return
}
entry := &lruEntry[V]{key: key, value: value}
if c.ttl > 0 {
entry.expires = time.Now().Add(c.ttl)
}
elem := c.evictList.PushFront(entry)
c.items[key] = elem
for c.evictList.Len() > c.maxSize {
oldest := c.evictList.Back()
if oldest == nil {
break
}
c.evictList.Remove(oldest)
delete(c.items, oldest.Value.(*lruEntry[V]).key)
}
}

// len returns the current number of entries.
func (c *lruCache[V]) len() int {
c.mu.Lock()
defer c.mu.Unlock()
return c.evictList.Len()
}
Loading