From cb7da66d06150688e0328d738b25dd870a7995a3 Mon Sep 17 00:00:00 2001
From: Sumit Sahoo <sumitsahoo1988@gmail.com>
Date: Sun, 17 May 2026 18:13:16 +0530
Subject: [PATCH] fix(ai): correct model bundle size labels to match actual HF
 downloads
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Registry `approxSizeBytes` values had drifted from the real on-disk
weights — the Quality tier was showing "≈ 1.8 GB total" in the gate
when the actual download is ~1.88 GB, and the Compact tier was overstated
by ~390 MB on the chat model alone. Verified against HuggingFace file
listings for each pinned dtype:

  - LFM2.5-1.2B (q4):  1.2 GB → 810 MB (model_q4.onnx_data is 850 MB decimal ≈ 810 MiB)
  - LFM2-2.6B (q4f16): 1.5 GB → 1.55 GB
  - EmbeddingGemma q8: 309 MB → 320 MB (was missing the 26 MB Gemma
                                       SentencePiece tokenizer)
  - Embed peak RAM:    400 MB → 500 MB (int8 dequant overhead)

Aggregate now reads "≈ 1.1 GB total" on Compact and "≈ 1.9 GB total"
on Quality. Prose updated in README, docs/local-ai.md, tool-registry,
ChatModelPicker, AiModelDetailsModal, ai-runtime, useRagModels, and
the AskPdf timing-weight comment.
---
 README.md                              | 10 +++---
 docs/local-ai.md                       | 26 +++++++-------
 src/components/AiModelDetailsModal.tsx |  5 +--
 src/components/ChatModelPicker.tsx     |  4 +--
 src/config/tool-registry.ts            |  7 ++--
 src/hooks/useRagModels.ts              |  5 +--
 src/tools/AskPdf.tsx                   |  2 +-
 src/utils/ai-models.ts                 | 49 ++++++++++++++------------
 src/utils/ai-runtime.ts                |  5 +--
 9 files changed, 63 insertions(+), 50 deletions(-)

diff --git a/README.md b/README.md
index 16b1058..9a0bf88 100644
--- a/README.md
+++ b/README.md
@@ -33,9 +33,9 @@ CloakPDF offers **36 powerful PDF tools**, all running 100% client-side. Feature
 
 _Chat with your PDF using a small AI model running entirely in your browser — no API keys, no server round-trips_
 
-| Tool                      | Description                                                                                                                                                                                                                              |
-| ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| **Ask your PDF** _(beta)_ | Ask natural-language questions about any PDF and get grounded answers extracted from the document text. Powered by a downloadable small chat model (1.2 GB or 2.6 GB tier, your choice). See [Local AI](#-local-ai-on-device-rag) below. |
+| Tool                      | Description                                                                                                                                                                                                                                    |
+| ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| **Ask your PDF** _(beta)_ | Ask natural-language questions about any PDF and get grounded answers extracted from the document text. Powered by a downloadable small chat model (1.2B or 2.6B parameter tier, your choice). See [Local AI](#-local-ai-on-device-rag) below. |
 
 ### 🗂️ Organise & Edit
 
@@ -207,8 +207,8 @@ pipeline **entirely in your browser** — no API key, no inference
 server, no usage quota. The model weights download once from Hugging
 Face's CDN, get cached, and work offline forever after.
 
-Three small models work together on first use (~1.55 GB total on the
-default Compact chat tier):
+Three small models work together on first use (~1.15 GB total on the
+default Compact chat tier; ~1.9 GB on Quality):
 
 - **Chat** — [LFM2.5-1.2B-Instruct](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct) _(Compact, default)_ or [LFM2-2.6B](https://huggingface.co/LiquidAI/LFM2-2.6B) _(Quality)_
 - **Retrieval** — [EmbeddingGemma-300M](https://huggingface.co/onnx-community/embeddinggemma-300m-ONNX)
diff --git a/docs/local-ai.md b/docs/local-ai.md
index 7ea18ee..1e67dc8 100644
--- a/docs/local-ai.md
+++ b/docs/local-ai.md
@@ -18,17 +18,19 @@ the pipeline or debugging an unexpected answer.
 
 Three pipelines load together on first use:
 
-| Role             | Model                                                                                 | On disk | Peak RAM | License           |
-| ---------------- | ------------------------------------------------------------------------------------- | ------- | -------- | ----------------- |
-| Chat _(Compact)_ | [LFM2.5-1.2B-Instruct](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct)          | ~1.2 GB | ~2.0 GB  | LFM Open Lic v1.0 |
-| Chat _(Quality)_ | [LFM2-2.6B](https://huggingface.co/LiquidAI/LFM2-2.6B) _(picker upgrade)_             | ~1.5 GB | ~3.5 GB  | LFM Open Lic v1.0 |
-| Retrieval        | [EmbeddingGemma-300M](https://huggingface.co/onnx-community/embeddinggemma-300m-ONNX) | ~309 MB | ~400 MB  | Gemma Terms       |
-| Reranking        | [MS MARCO MiniLM-L-6-v2](https://huggingface.co/Xenova/ms-marco-MiniLM-L-6-v2)        | ~23 MB  | ~90 MB   | Apache 2.0        |
-
-A fresh visitor on the **Compact** tier downloads **~1.55 GB** total.
-The chat tier is user-selectable in the consent modal; both LFM tiers
-share the embedder and reranker, so swapping tiers only re-downloads
-the chat slot.
+| Role             | Model                                                                                 | On disk  | Peak RAM | License           |
+| ---------------- | ------------------------------------------------------------------------------------- | -------- | -------- | ----------------- |
+| Chat _(Compact)_ | [LFM2.5-1.2B-Instruct](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct)          | ~810 MB  | ~2.0 GB  | LFM Open Lic v1.0 |
+| Chat _(Quality)_ | [LFM2-2.6B](https://huggingface.co/LiquidAI/LFM2-2.6B) _(picker upgrade)_             | ~1.55 GB | ~3.5 GB  | LFM Open Lic v1.0 |
+| Retrieval        | [EmbeddingGemma-300M](https://huggingface.co/onnx-community/embeddinggemma-300m-ONNX) | ~320 MB  | ~500 MB  | Gemma Terms       |
+| Reranking        | [MS MARCO MiniLM-L-6-v2](https://huggingface.co/Xenova/ms-marco-MiniLM-L-6-v2)        | ~23 MB   | ~90 MB   | Apache 2.0        |
+
+A fresh visitor on the **Compact** tier downloads **~1.15 GB** total
+(810 MB chat + 320 MB embed + 23 MB rerank); picking **Quality** instead
+swaps in the ~1.55 GB 2.6B chat weights (~750 MB larger), for a fresh
+download of **~1.9 GB**. The chat tier is user-selectable in the consent modal;
+both LFM tiers share the embedder and reranker, so swapping tiers
+only re-downloads the chat slot.
 
 The registry of these models lives in
 [`src/utils/ai-models.ts`](../src/utils/ai-models.ts) — every entry
@@ -224,7 +226,7 @@ caches:
 ```mermaid
 flowchart LR
     subgraph Cold["Cold visit"]
-        CDN[Hugging Face CDN] --> Cache[CacheStorage<br/><i>~1.55 GB</i>]
+        CDN[Hugging Face CDN] --> Cache[CacheStorage<br/><i>~1.15 GB Compact / ~1.9 GB Quality</i>]
         PDF1([upload PDF]) --> Embed1[chunk + embed] --> IDB1[(IndexedDB)]
     end
 
diff --git a/src/components/AiModelDetailsModal.tsx b/src/components/AiModelDetailsModal.tsx
index 087f476..544aafa 100644
--- a/src/components/AiModelDetailsModal.tsx
+++ b/src/components/AiModelDetailsModal.tsx
@@ -51,7 +51,7 @@ interface AiModelDetailsModalProps {
    * and clear the consent flags so the user re-experiences the
    * download dialog on next use. Wire from `useRagModels.evict`.
    * Hidden when omitted; rendered with an inline two-step confirm
-   * when present so a stray click can't nuke a 1.5 GB download.
+   * when present so a stray click can't nuke a 1+ GB download.
    */
   onDelete?: () => void | Promise<unknown>;
   /**
@@ -290,7 +290,8 @@ function RequirementsLine({ totalBytes }: { totalBytes: number }) {
  * Footer panel offering the two storage knobs: a soft "Free memory"
  * (release RAM, keep the downloaded weights cached on disk so the
  * next use warm-loads in seconds) and a destructive "Delete cached
- * models" (also evict the CacheStorage bytes, ~1.5 GB).
+ * models" (also evict the CacheStorage bytes — roughly 1.15 GB on the
+ * Compact tier, 1.9 GB on Quality).
  *
  * The destructive action goes through a two-step confirm: the first
  * click swaps the button into an "armed" state with a red warning
diff --git a/src/components/ChatModelPicker.tsx b/src/components/ChatModelPicker.tsx
index 1f7be5b..064a4b4 100644
--- a/src/components/ChatModelPicker.tsx
+++ b/src/components/ChatModelPicker.tsx
@@ -3,8 +3,8 @@
  *
  * Two tiers, each backed by an entry in `src/utils/ai-models.ts`:
  *
- *   - Compact  → LFM2.5-1.2B-Instruct (~1.2 GB / ~2 GB peak)
- *   - Quality  → LFM2-2.6B  (~1.5 GB / ~3.5 GB peak)
+ *   - Compact  → LFM2.5-1.2B-Instruct (~810 MB / ~2 GB peak)
+ *   - Quality  → LFM2-2.6B  (~1.55 GB / ~3.5 GB peak)
  *
  * The picker shows download size and peak RAM so users can see what
  * they're committing to. We deliberately do **not** auto-recommend a
diff --git a/src/config/tool-registry.ts b/src/config/tool-registry.ts
index ed3cd09..4fffb65 100644
--- a/src/config/tool-registry.ts
+++ b/src/config/tool-registry.ts
@@ -361,11 +361,14 @@ export const tools: Tool[] = [
     beta: true,
     // Two chat tiers ship today (see `CHAT_VARIANT_IDS` in
     // `src/utils/ai-models.ts`): LFM2.5-1.2B-Instruct Compact
-    // (~1.2 GB / 2 GB peak) and LFM2-2.6B Quality (~1.5 GB / 3.5 GB
+    // (~810 MB / 2 GB peak) and LFM2-2.6B Quality (~1.55 GB / 3.5 GB
     // peak). Both are Liquid AI's LFM family — the cross-tier e2e
     // showed they dominate SmolLM2-1.7B on speed AND extraction
     // discipline, so we dropped SmolLM2 entirely. The embedder is
-    // shared — EmbeddingGemma 300M (~197 MB / ~300 MB peak RAM).
+    // shared — EmbeddingGemma 300M (~320 MB / ~500 MB peak RAM,
+    // tokenizer included). A 23 MB MS MARCO MiniLM reranker rides
+    // alongside. Aggregate first-time download: ~1.15 GB on Compact,
+    // ~1.9 GB on Quality.
     // With OS + browser + tab overhead the practical floor for the
     // Quality tier is ~16 GB; users on lower-RAM machines should
     // pick Compact in the gate's tier picker. The `desktopOnly` flag below hides the tool on mobile
diff --git a/src/hooks/useRagModels.ts b/src/hooks/useRagModels.ts
index a6bbf06..30695f5 100644
--- a/src/hooks/useRagModels.ts
+++ b/src/hooks/useRagModels.ts
@@ -98,8 +98,9 @@ export interface UseRagModelsReturn {
    * Full evict — releases RAM **and** deletes the model weights
    * from the browser's CacheStorage, then clears every
    * `cloakpdf:ai-model-ready:*` flag so the consent dialog re-
-   * appears on next use. Frees ~1.5 GB of disk for the current AI
-   * bundle; the user pays a full re-download next time they touch
+   * appears on next use. Frees roughly 1.15 GB (Compact) / 1.9 GB
+   * (Quality) of disk for the current AI bundle; the user pays a
+   * full re-download next time they touch
    * the AI feature. Returns the cache-evict result so the caller
    * can tell the user how much was actually deleted (0 caches
    * means there was nothing cached to begin with).
diff --git a/src/tools/AskPdf.tsx b/src/tools/AskPdf.tsx
index 848ed70..1f4e72d 100644
--- a/src/tools/AskPdf.tsx
+++ b/src/tools/AskPdf.tsx
@@ -566,7 +566,7 @@ function IndexingCard({ progress }: { progress: IndexingProgress | null }) {
  * **Weight choice** (extract = 30 %, embed = 70 %): on a typical
  * text-layer PDF, extraction is fast (a few ms per page) while
  * embedding is the long pole — each batch is a WASM forward pass
- * against a 309 MB int8 model. Roughly matches the wall-clock split
+ * against a ~320 MB int8 embedder. Roughly matches the wall-clock split
  * we see on the résumé fixture. OCR-heavy PDFs invert this, but we
  * accept the small lie there because OCR users see the dedicated
  * "Running OCR on scanned pages…" label and know it's slow.
diff --git a/src/utils/ai-models.ts b/src/utils/ai-models.ts
index abeb3ea..004f0be 100644
--- a/src/utils/ai-models.ts
+++ b/src/utils/ai-models.ts
@@ -14,10 +14,10 @@
  * The chat slot ships **two tiers** (see {@link CHAT_VARIANT_IDS}),
  * both from Liquid AI's LFM family:
  *
- *   - `lfm2.5-1.2b` — Compact: ~1.2 GB / ~2 GB peak. Liquid AI's
+ *   - `lfm2.5-1.2b` — Compact: ~810 MB / ~2 GB peak. Liquid AI's
  *     latest 1.2B hybrid (LFM2.5 = LFM2 base + extended pretraining
  *     + RL post-training). The static default for fresh visitors.
- *   - `lfm2-2.6b` — Quality: ~1.5 GB / ~3.5 GB peak. Liquid AI's
+ *   - `lfm2-2.6b` — Quality: ~1.55 GB / ~3.5 GB peak. Liquid AI's
  *     larger hybrid; purpose-built for on-device structured extraction
  *     and RAG. Liquid hasn't shipped a 2.6 B variant of LFM2.5 yet, so
  *     this tier stays on the LFM2 build. Recommended on ≥ 8 GB free RAM.
@@ -179,21 +179,23 @@ const CHAT_LFM2_5_1_2B: AiModelInfo = {
   displayName: "LFM2.5 (1.2B, instruct, Liquid AI)",
   repo: "LiquidAI/LFM2.5-1.2B-Instruct-ONNX",
   task: "text-generation",
-  // ~1.2 GB on disk at q4, ~2 GB peak RAM (Liquid AI's published
-  // q4 size; q4f16 isn't shipped for this repo so we use plain q4
-  // which is their documented WebGPU-recommended quant). Same
-  // hybrid architecture as LFM2-1.2B (10-conv + 6-attention) but
-  // newer training recipe (extended pretraining + RL post-training)
-  // — Liquid markets LFM2.5 as the latest of the family.
+  // ~810 MiB on disk at q4 (HF lists `model_q4.onnx_data` as 850 MB decimal
+  // ≈ 810 MiB; tokenizer/configs add ~3 MB; the q4 weights count toward both
+  // disk and RAM), ~2 GB peak RAM. Same hybrid architecture as LFM2-1.2B
+  // (10-conv + 6-attention) but newer training recipe (extended
+  // pretraining + RL post-training) — Liquid markets LFM2.5 as the
+  // latest of the family. We pin `dtype: "q4"` (plain int4 with fp32
+  // activations) because it's the WebGPU-validated quant on this repo;
+  // q4f16 *is* shipped now (760 MB) but introduces fp16 LayerNorms
+  // that have historically broken onnxruntime-web's WebGPU shader on
+  // some Chrome builds — sticking with q4 keeps the pipeline robust.
   //
   // **Why this slot is LFM2.5-1.2B-Instruct and not LFM2-1.2B**:
   // straight version-superset. Same parameter count, same family,
-  // newer training. The q4-vs-q4f16 swap is forced by Liquid's
-  // ONNX export (they don't ship q4f16 for LFM2.5-1.2B) — q4 with
-  // fp32 activations is slightly heavier on disk but works on the
-  // same WebGPU path. Validated against the résumé probe before
-  // shipping; passes phone/email/address extraction the same way
-  // the LFM2-1.2B q4f16 build did.
+  // newer training. We pin `dtype: "q4"` (not q4f16) because the
+  // q4 build is the one we validated end-to-end on WebGPU — passes
+  // phone/email/address extraction on the résumé probe the same
+  // way the prior LFM2-1.2B q4f16 build did.
   //
   // **Why not LFM2.5-350M**: tried it on paper but the chat slot
   // has burned every model at ≤ 500M params (SmolLM2-360M, Qwen
@@ -201,7 +203,7 @@ const CHAT_LFM2_5_1_2B: AiModelInfo = {
   // verbatim extraction — they confabulate plausible-looking
   // digits/emails instead of copying from the retrieved chunk.
   // Sticking to 1.2B keeps the discipline guarantee.
-  approxSizeBytes: Math.round(1.2 * 1024 * 1024 * 1024),
+  approxSizeBytes: Math.round(810 * 1024 * 1024),
   approxPeakRamBytes: Math.round(2 * 1024 * 1024 * 1024),
   description:
     "Liquid AI's latest 1.2B hybrid (extended pretraining + RL post-training over the LFM2 base). Designed for on-device structured extraction and RAG. The smaller of the two LFM2-family tiers we ship.",
@@ -239,12 +241,13 @@ const CHAT_LFM2_2_6B: AiModelInfo = {
   displayName: "LFM2 (2.6B, Liquid AI)",
   repo: "onnx-community/LFM2-2.6B-ONNX",
   task: "text-generation",
-  // ~1.5 GB on disk at q4f16, ~3.5 GB peak RAM. The largest of the
-  // three tiers — recommended on ≥ 8 GB free RAM. Same hybrid
+  // ~1.55 GB on disk at q4f16 (`model_q4f16.onnx_data` 1.54 GB on HF
+  // + tokenizer 3.3 MB + configs), ~3.5 GB peak RAM. The larger of
+  // the two tiers — recommended on ≥ 8 GB free RAM. Same hybrid
   // architecture and training discipline as LFM2-1.2B but with the
   // extra capacity that lets it handle longer, more nuanced
   // extraction questions.
-  approxSizeBytes: Math.round(1.5 * 1024 * 1024 * 1024),
+  approxSizeBytes: Math.round(1.55 * 1024 * 1024 * 1024),
   approxPeakRamBytes: Math.round(3.5 * 1024 * 1024 * 1024),
   description:
     "Liquid AI's larger hybrid model. Same on-device extraction discipline as LFM2-1.2B with more capacity for longer answers and harder questions.",
@@ -270,8 +273,10 @@ const EMBED: AiModelInfo = {
   displayName: "EmbeddingGemma (300M)",
   repo: "onnx-community/embeddinggemma-300m-ONNX",
   task: "feature-extraction",
-  // ~309 MB on disk (int8 quantized weights), ~400 MB peak RAM.
-  // 2× the prior bge-base-en-v1.5 (~140 MB) on disk but the
+  // ~320 MB on disk (~295 MB int8 weights via `model_quantized.onnx`
+  // + ~26 MB Gemma SentencePiece tokenizer — the latter is non-trivial
+  // and used to be missed in the registry estimate), ~500 MB peak RAM.
+  // Bigger than the prior bge-base-en-v1.5 (~140 MB) on disk but the
   // retrieval quality jump from EmbeddingGemma's asymmetric
   // task-prefix training is meaningful, and runtime RAM is
   // comparable thanks to int8 weights vs bge's fp16. 308M params
@@ -315,8 +320,8 @@ const EMBED: AiModelInfo = {
   // Prefix handling lives in `src/rag/embeddings.ts` — swapping
   // back to a symmetric embedder (e.g. bge) means dropping that
   // prefix layer.
-  approxSizeBytes: 309 * 1024 * 1024,
-  approxPeakRamBytes: 400 * 1024 * 1024,
+  approxSizeBytes: 320 * 1024 * 1024,
+  approxPeakRamBytes: 500 * 1024 * 1024,
   description:
     "Google's on-device embedding model from the Gemma family. Trained for asymmetric retrieval — applies task-specific prompts to PDF chunks vs your question, then matches them in a 768-dim vector space so the chat model gets the right pages. Multilingual (100+ langs).",
   bestFor: "Semantic retrieval over PDFs in any of 100+ languages.",
diff --git a/src/utils/ai-runtime.ts b/src/utils/ai-runtime.ts
index 02ab2db..1a0f69e 100644
--- a/src/utils/ai-runtime.ts
+++ b/src/utils/ai-runtime.ts
@@ -254,8 +254,9 @@ export interface ModelCacheEvictResult {
 
 /**
  * Evict the Transformers.js model bytes from the browser's
- * CacheStorage. Frees ~1.5 GB of disk for the current AI bundle
- * (chat + embed + rerank) and forces a fresh download on next use.
+ * CacheStorage. Frees roughly 1.15 GB on the Compact tier / 1.9 GB on
+ * Quality for the current AI bundle (chat + embed + rerank) and
+ * forces a fresh download on next use.
  *
  * Does **not** unload the in-memory pipelines — call
  * {@link disposeAllModels} alongside this when you actually want the