From 058622e2dafb38569cddebb58d7f575d0ac2f279 Mon Sep 17 00:00:00 2001 From: Catalin Lupuleti Date: Tue, 19 May 2026 22:19:35 +0100 Subject: [PATCH 1/2] embedder: upgrade to nomic v1.5-Q + jina-reranker-tiny, 8K chunks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two stale models with a context-window mismatch were the real ceiling on recall, not the +0.3 MTEB delta between same-class peers: - Bi-encoder: BAAI/bge-small-en-v1.5 (Sep 2023, 384d, 512 tok limit) -> nomic-ai/nomic-embed-text-v1.5-Q (130MB, 768d, 8K tokens) - Cross-encoder: Xenova/ms-marco-MiniLM-L-6-v2 (2021, ~512 tok) -> jinaai/jina-reranker-v1-tiny-en (130MB, 8K tokens) - MAX_CHUNK_CHARS: 2000 -> 8000 (chunks were truncating mid-conversation at the old 512-token ceiling; an entire 5-turn window now fits). The new embedder uses asymmetric prefixes, so embed()/embed_single() now go through passage_embed()/query_embed() rather than the bare embed(). Dim is sourced from Embedder.DIM and chunks_vec auto-migrates: if an existing table has the wrong dim, it's dropped and the indexer re-embeds all chunks on the next run. Subtle bug fixed along the way: nomic returns float64 while sqlite-vec expects float32, so raw embeddings tripped "Dimension mismatch (received 1536)" — cast to float32 at the Embedder boundary. 331 tests pass. Smoke eval on 7 synthetic vague-memory queries: 6/7, including the two queries the README highlights. --- src/code_recall/db.py | 19 ++++++++++++++++--- src/code_recall/embedder.py | 22 +++++++++++----------- src/code_recall/utils.py | 2 +- 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/src/code_recall/db.py b/src/code_recall/db.py index 2c4febb..fbd7a0d 100644 --- a/src/code_recall/db.py +++ b/src/code_recall/db.py @@ -448,13 +448,26 @@ def setup_vec_table(conn: sqlite3.Connection) -> None: """Create the vector table for semantic search. 
Requires sqlite-vec.""" if not load_vec_extension(conn): return + + # Dim is sourced from the active embedder so the table and embeddings + # stay in lockstep when the model changes. If an existing chunks_vec + # table has the wrong dim, drop it — the indexer will re-embed all + # chunks on the next run. + from code_recall.embedder import Embedder + + dim = Embedder.DIM + row = conn.execute( + "SELECT sql FROM sqlite_master WHERE type='table' AND name='chunks_vec'" + ).fetchone() + if row and row[0] and f"float[{dim}]" not in row[0]: + conn.execute("DROP TABLE chunks_vec") + conn.execute( - """CREATE VIRTUAL TABLE IF NOT EXISTS chunks_vec USING vec0( + f"""CREATE VIRTUAL TABLE IF NOT EXISTS chunks_vec USING vec0( chunk_rowid INTEGER PRIMARY KEY, - embedding float[384] distance_metric=cosine + embedding float[{dim}] distance_metric=cosine )""" ) - # Drop old sessions_vec if migrating from v1 try: conn.execute("DROP TABLE IF EXISTS sessions_vec") except Exception: diff --git a/src/code_recall/embedder.py b/src/code_recall/embedder.py index 421f0de..503be92 100644 --- a/src/code_recall/embedder.py +++ b/src/code_recall/embedder.py @@ -14,7 +14,8 @@ class Embedder: """Wrapper around FastEmbed for generating text embeddings.""" - MODEL = "BAAI/bge-small-en-v1.5" # 33MB, 384 dimensions, ONNX + MODEL = "nomic-ai/nomic-embed-text-v1.5-Q" # 130MB, 768d, 8K context, ONNX + DIM = 768 def __init__(self): from fastembed import TextEmbedding @@ -22,12 +23,14 @@ def __init__(self): self._model = TextEmbedding(model_name=self.MODEL) def embed(self, texts: list[str]) -> list["np.ndarray"]: - """Embed a batch of texts. Returns list of numpy arrays.""" - return list(self._model.embed(texts)) + """Embed a batch of documents (uses model's passage prefix).""" + # Cast to float32 — nomic returns float64, sqlite-vec stores float32. 
+ return [arr.astype("float32", copy=False) for arr in self._model.passage_embed(texts)] def embed_single(self, text: str) -> "np.ndarray": - """Embed a single text string.""" - return list(self._model.embed([text]))[0] + """Embed a single query (uses model's query prefix).""" + arr = list(self._model.query_embed([text]))[0] + return arr.astype("float32", copy=False) class Reranker: @@ -37,7 +40,7 @@ class Reranker: cross-attention — much more accurate than bi-encoder similarity. """ - MODEL = "Xenova/ms-marco-MiniLM-L-6-v2" # 80MB, 18ms for 20 docs + MODEL = "jinaai/jina-reranker-v1-tiny-en" # 130MB, 8K context, ONNX def __init__(self): from fastembed.rerank.cross_encoder import TextCrossEncoder @@ -52,7 +55,6 @@ def rerank( Returns list of (original_index, score) sorted by score descending. """ scores = list(self._model.rerank(query, documents)) - # scores is a list of floats, one per document in original order indexed_scores = list(enumerate(scores)) indexed_scores.sort(key=lambda x: x[1], reverse=True) return indexed_scores @@ -85,15 +87,13 @@ def get_reranker(allow_download: bool = False) -> Reranker | None: return _reranker_instance if not allow_download: - # Check if model is already cached before loading try: from fastembed.common.utils import define_cache_dir cache = define_cache_dir() - # Look for the model in cache - model_dirs = list(cache.glob("*ms-marco*MiniLM*")) + model_dirs = list(cache.glob("*jina-reranker*tiny*")) if not model_dirs: - return None # Not downloaded yet — skip reranking + return None except Exception: pass diff --git a/src/code_recall/utils.py b/src/code_recall/utils.py index ad4b463..ac150c7 100644 --- a/src/code_recall/utils.py +++ b/src/code_recall/utils.py @@ -805,7 +805,7 @@ def _dedupe(values: list[str]) -> list[str]: # Chunk configuration CHUNK_SIZE = 5 # messages per chunk CHUNK_OVERLAP = 1 # overlapping messages between chunks -MAX_CHUNK_CHARS = 2000 # max chars per chunk text +MAX_CHUNK_CHARS = 8000 # max chars per chunk 
(~2K tokens, well under embedder's 8K limit) def _build_fts_text( From a2c91ea786e03fdba279fe8c3ee3f9c71c1b6007 Mon Sep 17 00:00:00 2001 From: Catalin Lupuleti Date: Tue, 19 May 2026 23:48:12 +0100 Subject: [PATCH 2/2] tui+indexer: crash-proof markup rendering, tqdm progress bar MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two follow-on UX fixes surfaced when testing on a real 1695-session index: 1. TUI crashed with MarkupError on session data containing unbalanced square-bracket sequences (e.g. a prompt with literal "[/dim]" in it). Even with escape() applied at every visible interpolation site, a missed call anywhere in the render path takes down the whole TUI. Added _safe_markup() — validates Rich markup with Text.from_markup, falls back to fully-escaped on MarkupError so the worst case is "tags render as literal text" rather than "TUI dies". Wrapped the three composite render sites that mix user data with markup. 2. _generate_embeddings was one giant batched embedder.embed(texts) call followed by a per-50-chunks counter inside the insert loop. The actual embedding compute (where time is spent) was silent. Now embeds in batches of 64 with a tqdm progress bar showing count / rate / ETA, so users see steady feedback instead of a frozen "Generating embeddings for N chunks..." line. Falls back to the old line-print when stderr isn't a tty or tqdm isn't installed. 331 tests still pass. 
--- src/code_recall/indexer.py | 65 ++++++++++++++++++++++++++------------ src/code_recall/tui.py | 34 +++++++++++++++----- 2 files changed, 71 insertions(+), 28 deletions(-) diff --git a/src/code_recall/indexer.py b/src/code_recall/indexer.py index dfba8bb..ab291ef 100644 --- a/src/code_recall/indexer.py +++ b/src/code_recall/indexer.py @@ -445,9 +445,6 @@ def _generate_embeddings( if not rows: return 0 - if verbose: - print(f"\n Generating embeddings for {len(rows)} chunks...", file=sys.stderr) - # Prepare texts and IDs texts = [row["chunk_text"] for row in rows if row["chunk_text"].strip()] chunk_ids = [row["chunk_id"] for row in rows if row["chunk_text"].strip()] @@ -455,23 +452,51 @@ def _generate_embeddings( if not texts: return 0 - # Batch embed - embeddings = embedder.embed(texts) + # Embed in mini-batches so the user sees steady progress; one giant + # batched call produces a long silent stretch on heavier models. + batch_size = 64 + use_tqdm = verbose and sys.stderr.isatty() + progress = None + if use_tqdm: + try: + from tqdm import tqdm - # Store in vec table with periodic commits - for i, (chunk_id, embedding) in enumerate(zip(chunk_ids, embeddings)): - conn.execute( - "INSERT OR REPLACE INTO chunks_vec (chunk_rowid, embedding) VALUES (?, ?)", - (chunk_id, embedding.tobytes()), - ) - if (i + 1) % 50 == 0: - conn.commit() - if verbose: - print( - f"\r Embedded {i + 1}/{len(chunk_ids)} chunks...", - end="", - file=sys.stderr, - ) + progress = tqdm( + total=len(texts), + unit="chunk", + desc=" Embedding", + file=sys.stderr, + leave=True, + ) + except ImportError: + progress = None + elif verbose: + print(f"\n Generating embeddings for {len(texts)} chunks...", file=sys.stderr) + + embedded = 0 + for start in range(0, len(texts), batch_size): + batch_texts = texts[start : start + batch_size] + batch_ids = chunk_ids[start : start + batch_size] + batch_embeddings = embedder.embed(batch_texts) + for chunk_id, embedding in zip(batch_ids, batch_embeddings): + 
conn.execute( + "INSERT OR REPLACE INTO chunks_vec (chunk_rowid, embedding) VALUES (?, ?)", + (chunk_id, embedding.tobytes()), + ) + embedded += 1 + conn.commit() + if progress is not None: + progress.update(len(batch_texts)) + elif verbose: + print( + f"\r Embedded {embedded}/{len(texts)} chunks...", + end="", + file=sys.stderr, + ) + + if progress is not None: + progress.close() + elif verbose: + print(file=sys.stderr) - conn.commit() return len(chunk_ids) diff --git a/src/code_recall/tui.py b/src/code_recall/tui.py index d68274d..5c52644 100644 --- a/src/code_recall/tui.py +++ b/src/code_recall/tui.py @@ -12,7 +12,9 @@ from pathlib import Path from typing import Iterable +from rich.errors import MarkupError from rich.markup import escape +from rich.text import Text from textual import events, on, work from textual.app import App, ComposeResult, SystemCommand from textual.binding import Binding @@ -37,6 +39,20 @@ from code_recall.utils import clean_display_text, format_date, format_size +def _safe_markup(content: str) -> str: + """Validate Rich markup; if it fails to parse, fall back to plain text. + + Defends the TUI against unbalanced bracket sequences in user data + (e.g. a session prompt containing literal ``[/dim]``) that would + otherwise crash the render pipeline. 
+ """ + try: + Text.from_markup(content) + return content + except MarkupError: + return escape(content) + + @dataclass(frozen=True) class ProviderDisplay: """Display metadata for an indexed coding-agent provider.""" @@ -242,12 +258,14 @@ def compose(self) -> ComposeResult: telemetry = f"[dim]{file_count} files · {cmd_count} cmds[/dim]" if file_count or cmd_count else "[dim]no files/cmds[/dim]" yield Static( - "\n".join( - [ - f"[bold]{self.rank:>2}[/bold] {_provider_badge(session.provider)} [bold]{title}[/bold]", - f" [{display.style}]{_score_label(self.result.score)} {score}[/{display.style}] {meta}", - f" [green]why:[/green] {reason} {telemetry}", - ] + _safe_markup( + "\n".join( + [ + f"[bold]{self.rank:>2}[/bold] {_provider_badge(session.provider)} [bold]{title}[/bold]", + f" [{display.style}]{_score_label(self.result.score)} {score}[/{display.style}] {meta}", + f" [green]why:[/green] {reason} {telemetry}", + ] + ) ), markup=True, ) @@ -311,7 +329,7 @@ def set_ai_chat( def render_empty(self, message: str) -> None: self.remove_children() - self.mount(Static(message, markup=True)) + self.mount(Static(_safe_markup(message), markup=True)) self.scroll_home(animate=False) def refresh_content(self) -> None: @@ -327,7 +345,7 @@ def refresh_content(self) -> None: return self.remove_children() - self.mount(Static(self._content(), markup=True)) + self.mount(Static(_safe_markup(self._content()), markup=True)) self.scroll_home(animate=False) def _tab_line(self) -> str: